cdxml-toolkit 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdxml_toolkit/__init__.py +18 -0
- cdxml_toolkit/_jre/__init__.py +2 -0
- cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- cdxml_toolkit/analysis/__init__.py +35 -0
- cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
- cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
- cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
- cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
- cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
- cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
- cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
- cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
- cdxml_toolkit/analysis/extract_nmr.py +47 -0
- cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
- cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
- cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
- cdxml_toolkit/cdxml_builder.py +920 -0
- cdxml_toolkit/cdxml_utils.py +342 -0
- cdxml_toolkit/chemdraw/__init__.py +5 -0
- cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
- cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
- cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
- cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
- cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
- cdxml_toolkit/constants.py +304 -0
- cdxml_toolkit/coord_normalizer.py +438 -0
- cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
- cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
- cdxml_toolkit/image/__init__.py +15 -0
- cdxml_toolkit/image/reaction_from_image.py +2103 -0
- cdxml_toolkit/image/structure_from_image.py +1711 -0
- cdxml_toolkit/layout/__init__.py +5 -0
- cdxml_toolkit/layout/alignment.py +1642 -0
- cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
- cdxml_toolkit/layout/scheme_merger.py +2260 -0
- cdxml_toolkit/mcp_server/__init__.py +0 -0
- cdxml_toolkit/mcp_server/__main__.py +5 -0
- cdxml_toolkit/mcp_server/server.py +1567 -0
- cdxml_toolkit/naming/__init__.py +6 -0
- cdxml_toolkit/naming/aligned_namer.py +2342 -0
- cdxml_toolkit/naming/mol_builder.py +3722 -0
- cdxml_toolkit/naming/name_decomposer.py +2843 -0
- cdxml_toolkit/naming/reactions_datamol.json +2414 -0
- cdxml_toolkit/office/__init__.py +5 -0
- cdxml_toolkit/office/doc_from_template.py +722 -0
- cdxml_toolkit/office/ole_embedder.py +808 -0
- cdxml_toolkit/office/ole_extractor.py +272 -0
- cdxml_toolkit/perception/__init__.py +10 -0
- cdxml_toolkit/perception/compound_search.py +229 -0
- cdxml_toolkit/perception/eln_csv_parser.py +240 -0
- cdxml_toolkit/perception/rdf_parser.py +664 -0
- cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
- cdxml_toolkit/perception/reaction_parser.py +2150 -0
- cdxml_toolkit/perception/scheme_reader.py +2948 -0
- cdxml_toolkit/perception/scheme_refine.py +1404 -0
- cdxml_toolkit/perception/scheme_segmenter.py +619 -0
- cdxml_toolkit/perception/spatial_assignment.py +1013 -0
- cdxml_toolkit/rdkit_utils.py +605 -0
- cdxml_toolkit/render/__init__.py +17 -0
- cdxml_toolkit/render/auto_layout.py +229 -0
- cdxml_toolkit/render/compact_parser.py +632 -0
- cdxml_toolkit/render/parser.py +706 -0
- cdxml_toolkit/render/render_scheme.py +267 -0
- cdxml_toolkit/render/renderer.py +2387 -0
- cdxml_toolkit/render/schema.py +90 -0
- cdxml_toolkit/render/scheme_maker.py +1043 -0
- cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
- cdxml_toolkit/resolve/__init__.py +13 -0
- cdxml_toolkit/resolve/cas_resolver.py +430 -0
- cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
- cdxml_toolkit/resolve/condensed_formula.py +493 -0
- cdxml_toolkit/resolve/jre_manager.py +195 -0
- cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
- cdxml_toolkit/resolve/reagent_db.py +285 -0
- cdxml_toolkit/resolve/superatom_data.json +2856 -0
- cdxml_toolkit/resolve/superatom_table.py +146 -0
- cdxml_toolkit/text_formatting.py +298 -0
- cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
- cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
- cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
- cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
- cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""cdxml-toolkit: Python toolkit for ChemDraw CDXML reaction scheme processing.
|
|
2
|
+
|
|
3
|
+
Provides tools for reading, writing, manipulating, and rendering ChemDraw CDXML
|
|
4
|
+
files. Includes reaction scheme layout, reagent classification, structure
|
|
5
|
+
alignment, and a declarative DSL for building schemes from YAML or text.
|
|
6
|
+
|
|
7
|
+
Core utilities are available without optional dependencies. RDKit, ChemDraw COM,
|
|
8
|
+
and other heavy dependencies are lazy-imported and only required when their
|
|
9
|
+
specific features are used.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
__version__ = "0.5.0"
|
|
13
|
+
|
|
14
|
+
# Core utilities — always available (stdlib + lxml only)
|
|
15
|
+
from .constants import ACS_BOND_LENGTH, ACS_CHAIN_ANGLE, ACS_STYLE
|
|
16
|
+
from .cdxml_utils import parse_cdxml, write_cdxml, fragment_bbox
|
|
17
|
+
from .text_formatting import build_formatted_s_xml
|
|
18
|
+
from .resolve.reagent_db import get_reagent_db
|
|
Binary file
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""Analysis — LCMS parsing, species identification, and lab book generation.
|
|
2
|
+
|
|
3
|
+
Parses Waters MassLynx LCMS PDF reports (standard and manually integrated),
|
|
4
|
+
matches peaks across files, identifies compounds by expected mass, and
|
|
5
|
+
assembles lab book entries. Two workflows:
|
|
6
|
+
|
|
7
|
+
1. **Agent-driven** (recommended): LLM parses individual files via
|
|
8
|
+
``parse_report`` / ``parse_manual_report``, reasons about peaks, and
|
|
9
|
+
calls ``process_entries`` with a JSON entry list to produce a lab book
|
|
10
|
+
entry where all numbers are deterministically sourced.
|
|
11
|
+
|
|
12
|
+
2. **Deterministic batch**: ``deterministic.procedure_writer`` orchestrates
|
|
13
|
+
mass resolution, multi-file LCMS collation, species identification, and
|
|
14
|
+
output formatting in a single pipeline.
|
|
15
|
+
|
|
16
|
+
Optional dependency: ``pdfplumber`` (install via ``pip install cdxml-toolkit[analysis]``).
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
# Agent-driven tools (top-level in analysis/)
|
|
20
|
+
from .lcms_analyzer import (
|
|
21
|
+
parse_report, parse_manual_report, format_table, format_manual_table,
|
|
22
|
+
LCMSReport, ChromPeak, MassSpectrum,
|
|
23
|
+
ManualLCMSReport, ManualLCMSSample, ManualPeak,
|
|
24
|
+
is_waters_report, is_manual_integration,
|
|
25
|
+
)
|
|
26
|
+
from .format_procedure_entry import process_entries
|
|
27
|
+
|
|
28
|
+
# Deterministic pipeline re-exports (from analysis/deterministic/)
|
|
29
|
+
from .deterministic import (
|
|
30
|
+
multi_analyze, AnalysisResult,
|
|
31
|
+
categorize_lcms_file, categorize_lcms_files_batch,
|
|
32
|
+
extract_expected_masses, ExpectedSpecies,
|
|
33
|
+
run_tracking_analysis, run_purified_analysis,
|
|
34
|
+
discover_experiment_files, DiscoveryResult,
|
|
35
|
+
)
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""Deterministic pipeline — the original fully-deterministic procedure writer,
|
|
2
|
+
multi-LCMS analyzer, and supporting modules.
|
|
3
|
+
|
|
4
|
+
These tools are superseded by the agent-driven workflow
|
|
5
|
+
(``format_procedure_entry``) but remain available for the batch pipeline.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from .multi_lcms_analyzer import analyze as multi_analyze, AnalysisResult
|
|
9
|
+
from .lcms_file_categorizer import categorize_lcms_file, categorize_lcms_files_batch
|
|
10
|
+
from .mass_resolver import extract_expected_masses, ExpectedSpecies
|
|
11
|
+
from .lcms_identifier import run_tracking_analysis, run_purified_analysis
|
|
12
|
+
from .discover_experiment_files import discover_experiment_files, DiscoveryResult
|
|
@@ -0,0 +1,413 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Experiment File Discovery Tool
|
|
4
|
+
|
|
5
|
+
Discovers and classifies all files for a chemistry experiment: ELN CSV,
|
|
6
|
+
CDX/RXN structure files, LCMS PDFs (with category/sort_key), and NMR PDFs.
|
|
7
|
+
|
|
8
|
+
Handles two directory layouts:
|
|
9
|
+
1. input_dir IS the experiment directory (contains .csv directly)
|
|
10
|
+
2. input_dir is a parent directory containing experiment subdirectories
|
|
11
|
+
|
|
12
|
+
Usage:
|
|
13
|
+
python discover_experiment_files.py --input-dir path/to/experiment/ --experiment KL-7001-004
|
|
14
|
+
python discover_experiment_files.py --input-dir path/to/experiment/ --experiment KL-7001-004 --json
|
|
15
|
+
python discover_experiment_files.py --input-dir path/to/experiment/ --experiment KL-7001-004 --json -o files.json
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
import argparse
|
|
19
|
+
import json
|
|
20
|
+
import os
|
|
21
|
+
import re
|
|
22
|
+
import sys
|
|
23
|
+
from dataclasses import dataclass, field
|
|
24
|
+
from typing import List, Optional, Dict, Tuple
|
|
25
|
+
|
|
26
|
+
from ..lcms_analyzer import extract_all_text, is_waters_report
|
|
27
|
+
from .lcms_file_categorizer import (
|
|
28
|
+
categorize_lcms_file, categorize_lcms_files_batch,
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
# ---------------------------------------------------------------------------
|
|
32
|
+
# Data structures
|
|
33
|
+
# ---------------------------------------------------------------------------
|
|
34
|
+
|
|
35
|
+
@dataclass
class LCMSFileRecord:
    """One LCMS PDF together with the classification assigned to it."""

    path: str
    # One of "tracking", "workup", "purification", "final".
    category: str
    sort_key: float
    # Tracking group prefix (set when the batch categorizer supplies one).
    group_prefix: Optional[str] = None
    # Filename-derived method hint (AmB, AmF, etc.).
    method_variant: Optional[str] = None

    @property
    def filename(self) -> str:
        """Return the last component of ``path`` (the bare file name)."""
        _, tail = os.path.split(self.path)
        return tail
|
|
48
|
+
|
|
49
|
+
@dataclass
class DiscoveryResult:
    """Everything that was discovered for a single experiment.

    Each ``*_files`` attribute holds the paths found for that file type;
    ``lcms_files`` holds classified records rather than bare paths, and
    ``warnings`` collects human-readable notes about missing inputs.
    """

    experiment: str
    input_dir: str
    csv_files: List[str] = field(default_factory=list)
    cdx_files: List[str] = field(default_factory=list)
    rxn_files: List[str] = field(default_factory=list)
    lcms_files: List["LCMSFileRecord"] = field(default_factory=list)
    nmr_files: List[str] = field(default_factory=list)
    warnings: List[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return a JSON-serializable dict view of this result.

        LCMS records are flattened to plain dicts; the other lists are
        placed in the output as-is (not copied).
        """
        lcms_entries = []
        for record in self.lcms_files:
            lcms_entries.append({
                "path": record.path,
                "category": record.category,
                "sort_key": record.sort_key,
                "group_prefix": record.group_prefix,
                "method_variant": record.method_variant,
            })

        files_section = {
            "csv": self.csv_files,
            "cdx": self.cdx_files,
            "rxn": self.rxn_files,
            "lcms": lcms_entries,
            "nmr": self.nmr_files,
        }

        return {
            "experiment": self.experiment,
            "input_dir": self.input_dir,
            "files": files_section,
            "warnings": self.warnings,
        }
|
|
81
|
+
|
|
82
|
+
# ---------------------------------------------------------------------------
|
|
83
|
+
# Core helpers (extracted from procedure_writer.py)
|
|
84
|
+
# ---------------------------------------------------------------------------
|
|
85
|
+
|
|
86
|
+
def _find_files_matching(directory: str, experiment_name: str,
|
|
87
|
+
extensions: tuple) -> List[str]:
|
|
88
|
+
"""Find files matching experiment name prefix in a directory."""
|
|
89
|
+
if not os.path.isdir(directory):
|
|
90
|
+
return []
|
|
91
|
+
prefix = experiment_name.lower()
|
|
92
|
+
matches = []
|
|
93
|
+
for f in os.listdir(directory):
|
|
94
|
+
fl = f.lower()
|
|
95
|
+
if fl.startswith(prefix) and fl.endswith(extensions):
|
|
96
|
+
# Ensure it's not a different experiment (e.g., KL-7001-0040)
|
|
97
|
+
remainder = f[len(experiment_name):]
|
|
98
|
+
if not remainder or remainder[0] in ('-', '.', ' '):
|
|
99
|
+
matches.append(os.path.join(directory, f))
|
|
100
|
+
return sorted(matches)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _pdf_contains_nmr_data(pdf_path: str) -> bool:
|
|
104
|
+
"""Check if a PDF contains NMR data strings (1H NMR, 13C NMR, etc.)."""
|
|
105
|
+
try:
|
|
106
|
+
text = extract_all_text(pdf_path)
|
|
107
|
+
return bool(re.search(r'\d+[A-Z]\s+NMR', text))
|
|
108
|
+
except Exception:
|
|
109
|
+
return False
|
|
110
|
+
|
|
111
|
+
# ---------------------------------------------------------------------------
|
|
112
|
+
# Main discovery logic (extracted from procedure_writer.discover_files)
|
|
113
|
+
# ---------------------------------------------------------------------------
|
|
114
|
+
|
|
115
|
+
def discover_experiment_files(
    input_dir: str,
    experiment_name: Optional[str] = None,
) -> DiscoveryResult:
    """
    Discover all files for an experiment.

    Handles two layouts:
      1. input_dir IS the experiment dir (contains .csv directly)
      2. input_dir is the parent dir (contains experiment subdirs)

    When ``experiment_name`` is omitted and a CSV sits directly in
    ``input_dir``, the name is read from that CSV (falling back to the
    directory's basename) and the parent of ``input_dir`` is used as the
    parent directory. When ``experiment_name`` is given, ``input_dir`` itself
    is used as the parent directory.

    Args:
        input_dir: Path to experiment directory or parent directory.
        experiment_name: Experiment name (e.g. "KL-7001-004"). Required
                         if input_dir is the parent directory.

    Returns:
        DiscoveryResult with all found files classified by type. LCMS
        records are sorted by ``sort_key``; missing CSV / LCMS / structure
        files are reported in ``warnings`` rather than raised.

    Raises:
        SystemExit: (code 1, after printing to stderr) if no CSV is found
            directly in input_dir and no experiment name is provided.
        OSError: propagated from ``os.listdir`` if input_dir cannot be listed.
    """
    input_dir = os.path.abspath(input_dir)

    # Try to find CSV directly in input_dir. os.listdir() gives no ordering
    # guarantee, so sort to make the "first CSV" choice below deterministic.
    csv_in_dir = sorted(f for f in os.listdir(input_dir)
                        if f.lower().endswith('.csv'))

    if csv_in_dir and not experiment_name:
        # input_dir IS the experiment dir — infer experiment name from CSV
        csv_path = os.path.join(input_dir, csv_in_dir[0])
        experiment_name = _infer_experiment_from_csv(csv_path)
        if not experiment_name:
            experiment_name = os.path.basename(input_dir)
        parent_dir = os.path.dirname(input_dir)
    elif experiment_name:
        # input_dir is parent, look in experiment subdir
        parent_dir = input_dir
    else:
        # No CSV, no experiment name — list subdirs as candidates
        print("Error: No CSV found and no --experiment specified.",
              file=sys.stderr)
        subdirs = [d for d in os.listdir(input_dir)
                   if os.path.isdir(os.path.join(input_dir, d))
                   and not d.startswith('.')
                   and d not in ('DATA', 'LCMS files')]
        if subdirs:
            print(f"Available experiments: {', '.join(sorted(subdirs))}",
                  file=sys.stderr)
        sys.exit(1)

    result = DiscoveryResult(
        experiment=experiment_name,
        input_dir=input_dir,
    )

    # --- CSV ---
    exp_dir_path = os.path.join(parent_dir, experiment_name)
    csv_path = os.path.join(exp_dir_path, f"{experiment_name}.csv")
    if os.path.isfile(csv_path):
        result.csv_files.append(csv_path)
    elif csv_in_dir:
        # Flat layout: CSV was found directly in input_dir. Prefer the CSV
        # named after the experiment so that a sibling experiment's CSV in
        # the same directory is not picked by accident; otherwise fall back
        # to the first CSV.
        wanted_csv = f"{experiment_name}.csv".lower()
        chosen_csv = next(
            (c for c in csv_in_dir if c.lower() == wanted_csv),
            csv_in_dir[0],
        )
        result.csv_files.append(os.path.join(input_dir, chosen_csv))

    # --- CDX / RXN ---
    # Check experiment subdir first, then input_dir itself (flat layout).
    # _find_files_matching returns [] for a directory that does not exist.
    for extension, collected in (('.cdx', result.cdx_files),
                                 ('.rxn', result.rxn_files)):
        for structure_dir in (exp_dir_path, input_dir):
            for f in _find_files_matching(structure_dir, experiment_name,
                                          (extension,)):
                if f not in collected:
                    collected.append(f)

    # --- LCMS PDFs ---
    # Search the "LCMS files" dirs, input_dir and parent_dir
    # NOT DATA directory (DATA is for NMR)
    # parent_dir and input_dir can be the same path; dict.fromkeys drops the
    # duplicates (keeping order) so no PDF is content-checked twice.
    lcms_dirs = list(dict.fromkeys([
        os.path.join(parent_dir, 'LCMS files'),
        os.path.join(input_dir, 'LCMS files'),
        input_dir,
        parent_dir,
    ]))
    seen_lcms = set()
    lcms_candidates = []  # (path, filename) — collect all first, then batch

    for d in lcms_dirs:
        for f in _find_files_matching(d, experiment_name, ('.pdf',)):
            fname = os.path.basename(f).lower()
            if fname in seen_lcms:
                continue
            if 'nmr' in fname or 'mnova' in fname:
                continue
            # Content-based check: skip non-standard PDFs (e.g. manually
            # integrated chromatograms) that aren't Waters MassLynx reports
            if not is_waters_report(f):
                continue
            seen_lcms.add(fname)
            lcms_candidates.append((f, os.path.basename(f)))

    # Batch-categorize using context-aware categorizer (resolves ambiguities
    # like tNN purification fractions vs tracking timepoints).
    if lcms_candidates:
        filenames = [fn for _, fn in lcms_candidates]
        path_map = {fn: path for path, fn in lcms_candidates}
        batch = categorize_lcms_files_batch(filenames, experiment_name)

        for fn in filenames:
            if fn in batch.filtered_files:
                continue  # skip special files (-MS, -LC, -UV, etc.)
            fc = batch.files.get(fn)
            if fc is not None:
                result.lcms_files.append(LCMSFileRecord(
                    path=path_map[fn],
                    category=fc.category,
                    sort_key=fc.sort_key,
                    group_prefix=fc.group_prefix,
                    method_variant=(fc.modifiers.method_variant
                                    if fc.modifiers else None),
                ))
            else:
                # Fallback to simple categorizer (shouldn't happen)
                category, sort_key = categorize_lcms_file(fn)
                result.lcms_files.append(LCMSFileRecord(
                    path=path_map[fn],
                    category=category,
                    sort_key=sort_key,
                ))

    # Sort LCMS files by their sort key
    result.lcms_files.sort(key=lambda x: x.sort_key)

    # --- NMR PDFs ---
    # Scan DATA directories for PDFs matching experiment name that
    # contain NMR data strings (content-based detection)
    data_dirs = list(dict.fromkeys([
        os.path.join(parent_dir, 'DATA'),
        os.path.join(input_dir, 'DATA'),
    ]))
    seen_nmr = set()

    for d in data_dirs:
        for f in _find_files_matching(d, experiment_name, ('.pdf',)):
            fname = os.path.basename(f).lower()
            if fname in seen_nmr:
                continue
            if _pdf_contains_nmr_data(f):
                seen_nmr.add(fname)
                result.nmr_files.append(f)

    # --- Warnings ---
    if not result.csv_files:
        result.warnings.append("No CSV file found")
    if not result.lcms_files:
        result.warnings.append("No LCMS PDF files found")
    if not result.cdx_files and not result.rxn_files:
        result.warnings.append("No CDX or RXN structure files found")

    return result
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
def _infer_experiment_from_csv(csv_path: str) -> Optional[str]:
|
|
287
|
+
"""Read the EXPERIENCE_NAME field from a Findmolecule CSV."""
|
|
288
|
+
try:
|
|
289
|
+
import csv as csv_mod
|
|
290
|
+
with open(csv_path, 'r', encoding='utf-8-sig') as f:
|
|
291
|
+
reader = csv_mod.reader(f, delimiter=';', quotechar='"')
|
|
292
|
+
rows = list(reader)
|
|
293
|
+
if len(rows) >= 2:
|
|
294
|
+
headers = rows[0]
|
|
295
|
+
values = rows[1]
|
|
296
|
+
meta = dict(zip(headers, values))
|
|
297
|
+
name = meta.get('EXPERIENCE_NAME', '').strip()
|
|
298
|
+
if name:
|
|
299
|
+
return name
|
|
300
|
+
except Exception:
|
|
301
|
+
pass
|
|
302
|
+
return None
|
|
303
|
+
|
|
304
|
+
# ---------------------------------------------------------------------------
|
|
305
|
+
# Output formatting
|
|
306
|
+
# ---------------------------------------------------------------------------
|
|
307
|
+
|
|
308
|
+
def format_text_report(result: "DiscoveryResult") -> str:
    """Format discovery result as human-readable text.

    The report lists, in order: experiment name and input dir, then one
    section each for CSV, CDX, RXN, LCMS and NMR files (basenames only, or
    "(none)" when a section is empty), followed by any warnings.

    LCMS files are grouped by category. The four standard categories are
    printed first in workflow order; any other category present in the data
    is printed after them (in first-seen order) so that every file counted
    in the section header is also listed.

    Args:
        result: The discovery result to render.

    Returns:
        The report as a single newline-joined string (no trailing newline).
    """
    lines = [
        f"Experiment: {result.experiment}",
        f"Input dir: {result.input_dir}",
        "",
    ]

    def _add_path_section(title: str, paths: List[str]) -> None:
        # Shared layout for the sections that hold plain path lists.
        lines.append(f"{title} files ({len(paths)}):")
        for path in paths:
            lines.append(f" {os.path.basename(path)}")
        if not paths:
            lines.append(" (none)")

    _add_path_section("CSV", result.csv_files)
    _add_path_section("CDX", result.cdx_files)
    _add_path_section("RXN", result.rxn_files)

    # LCMS
    lines.append(f"LCMS files ({len(result.lcms_files)}):")
    categories: Dict[str, List[LCMSFileRecord]] = {}
    for lf in result.lcms_files:
        categories.setdefault(lf.category, []).append(lf)

    known_order = ("tracking", "workup", "purification", "final")
    # Categories outside the standard set would otherwise be counted in the
    # header above but silently dropped from the listing.
    extra_categories = tuple(c for c in categories if c not in known_order)
    for cat in known_order + extra_categories:
        cat_files = categories.get(cat, [])
        if cat_files:
            lines.append(f" {cat} ({len(cat_files)}):")
            for lf in cat_files:
                lines.append(f" {os.path.basename(lf.path)} "
                             f"[sort_key={lf.sort_key}]")
    if not result.lcms_files:
        lines.append(" (none)")

    _add_path_section("NMR", result.nmr_files)

    # Warnings
    if result.warnings:
        lines.append("")
        lines.append("Warnings:")
        for w in result.warnings:
            lines.append(f" - {w}")

    return "\n".join(lines)
|
|
366
|
+
|
|
367
|
+
# ---------------------------------------------------------------------------
|
|
368
|
+
# CLI
|
|
369
|
+
# ---------------------------------------------------------------------------
|
|
370
|
+
|
|
371
|
+
def _build_arg_parser() -> argparse.ArgumentParser:
|
|
372
|
+
p = argparse.ArgumentParser(
|
|
373
|
+
description="Experiment File Discovery Tool",
|
|
374
|
+
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
375
|
+
epilog=__doc__,
|
|
376
|
+
)
|
|
377
|
+
p.add_argument("--input-dir", "-i", required=True,
|
|
378
|
+
help="Directory containing experiment files "
|
|
379
|
+
"(experiment dir or parent dir)")
|
|
380
|
+
p.add_argument("--experiment", "-e", default=None,
|
|
381
|
+
help="Experiment name (e.g., KL-7001-004). "
|
|
382
|
+
"Required if input-dir is the parent directory.")
|
|
383
|
+
p.add_argument("--json", "-j", action="store_true",
|
|
384
|
+
help="Output in JSON format")
|
|
385
|
+
p.add_argument("--output", "-o", default=None,
|
|
386
|
+
help="Output file path (default: stdout)")
|
|
387
|
+
return p
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
def main(argv=None) -> int:
    """Run the discovery CLI and return the process exit code.

    Parses *argv* (``None`` lets argparse read ``sys.argv``), runs the
    discovery, renders the result as JSON or as a text report, and either
    prints it to stdout or writes it (plus a trailing newline) to the file
    given by ``--output``. Returns 0 on completion.
    """
    args = _build_arg_parser().parse_args(argv)
    result = discover_experiment_files(args.input_dir, args.experiment)

    if args.json:
        rendered = json.dumps(result.to_dict(), indent=2)
    else:
        rendered = format_text_report(result)

    if not args.output:
        print(rendered)
        return 0

    with open(args.output, 'w', encoding='utf-8') as out_file:
        out_file.write(rendered + '\n')
    # Status goes to stderr so it never mixes with report output.
    print(f"Output written to {args.output}", file=sys.stderr)
    return 0


if __name__ == '__main__':
    sys.exit(main())
|