cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,18 @@
1
+ """cdxml-toolkit: Python toolkit for ChemDraw CDXML reaction scheme processing.
2
+
3
+ Provides tools for reading, writing, manipulating, and rendering ChemDraw CDXML
4
+ files. Includes reaction scheme layout, reagent classification, structure
5
+ alignment, and a declarative DSL for building schemes from YAML or text.
6
+
7
+ Core utilities are available without optional dependencies. RDKit, ChemDraw COM,
8
+ and other heavy dependencies are lazy-imported and only required when their
9
+ specific features are used.
10
+ """
11
+
12
+ __version__ = "0.5.0"
13
+
14
+ # Core utilities — always available (stdlib + lxml only)
15
+ from .constants import ACS_BOND_LENGTH, ACS_CHAIN_ANGLE, ACS_STYLE
16
+ from .cdxml_utils import parse_cdxml, write_cdxml, fragment_bbox
17
+ from .text_formatting import build_formatted_s_xml
18
+ from .resolve.reagent_db import get_reagent_db
@@ -0,0 +1,2 @@
1
+ # Bundled Eclipse Temurin JRE 21 (Windows x64) for OPSIN.
2
+ # License: GPL v2 with Classpath Exception (allows bundling).
@@ -0,0 +1,35 @@
1
+ """Analysis — LCMS parsing, species identification, and lab book generation.
2
+
3
+ Parses Waters MassLynx LCMS PDF reports (standard and manually integrated),
4
+ matches peaks across files, identifies compounds by expected mass, and
5
+ assembles lab book entries. Two workflows:
6
+
7
+ 1. **Agent-driven** (recommended): LLM parses individual files via
8
+ ``parse_report`` / ``parse_manual_report``, reasons about peaks, and
9
+ calls ``process_entries`` with a JSON entry list to produce a lab book
10
+ entry where all numbers are deterministically sourced.
11
+
12
+ 2. **Deterministic batch**: ``deterministic.procedure_writer`` orchestrates
13
+ mass resolution, multi-file LCMS collation, species identification, and
14
+ output formatting in a single pipeline.
15
+
16
+ Optional dependency: ``pdfplumber`` (install via ``pip install cdxml-toolkit[analysis]``).
17
+ """
18
+
19
+ # Agent-driven tools (top-level in analysis/)
20
+ from .lcms_analyzer import (
21
+ parse_report, parse_manual_report, format_table, format_manual_table,
22
+ LCMSReport, ChromPeak, MassSpectrum,
23
+ ManualLCMSReport, ManualLCMSSample, ManualPeak,
24
+ is_waters_report, is_manual_integration,
25
+ )
26
+ from .format_procedure_entry import process_entries
27
+
28
+ # Deterministic pipeline re-exports (from analysis/deterministic/)
29
+ from .deterministic import (
30
+ multi_analyze, AnalysisResult,
31
+ categorize_lcms_file, categorize_lcms_files_batch,
32
+ extract_expected_masses, ExpectedSpecies,
33
+ run_tracking_analysis, run_purified_analysis,
34
+ discover_experiment_files, DiscoveryResult,
35
+ )
@@ -0,0 +1,12 @@
1
+ """Deterministic pipeline — the original fully-deterministic procedure writer,
2
+ multi-LCMS analyzer, and supporting modules.
3
+
4
+ These tools are superseded by the agent-driven workflow
5
+ (``format_procedure_entry``) but remain available for the batch pipeline.
6
+ """
7
+
8
+ from .multi_lcms_analyzer import analyze as multi_analyze, AnalysisResult
9
+ from .lcms_file_categorizer import categorize_lcms_file, categorize_lcms_files_batch
10
+ from .mass_resolver import extract_expected_masses, ExpectedSpecies
11
+ from .lcms_identifier import run_tracking_analysis, run_purified_analysis
12
+ from .discover_experiment_files import discover_experiment_files, DiscoveryResult
@@ -0,0 +1,413 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Experiment File Discovery Tool
4
+
5
+ Discovers and classifies all files for a chemistry experiment: ELN CSV,
6
+ CDX/RXN structure files, LCMS PDFs (with category/sort_key), and NMR PDFs.
7
+
8
+ Handles two directory layouts:
9
+ 1. input_dir IS the experiment directory (contains .csv directly)
10
+ 2. input_dir is a parent directory containing experiment subdirectories
11
+
12
+ Usage:
13
+ python discover_experiment_files.py --input-dir path/to/experiment/ --experiment KL-7001-004
14
+ python discover_experiment_files.py --input-dir path/to/experiment/ --experiment KL-7001-004 --json
15
+ python discover_experiment_files.py --input-dir path/to/experiment/ --experiment KL-7001-004 --json -o files.json
16
+ """
17
+
18
+ import argparse
19
+ import json
20
+ import os
21
+ import re
22
+ import sys
23
+ from dataclasses import dataclass, field
24
+ from typing import List, Optional, Dict, Tuple
25
+
26
+ from ..lcms_analyzer import extract_all_text, is_waters_report
27
+ from .lcms_file_categorizer import (
28
+ categorize_lcms_file, categorize_lcms_files_batch,
29
+ )
30
+
31
+ # ---------------------------------------------------------------------------
32
+ # Data structures
33
+ # ---------------------------------------------------------------------------
34
+
35
@dataclass
class LCMSFileRecord:
    """One LCMS PDF together with the classification assigned to it."""

    # Path of the PDF as it was discovered on disk.
    path: str
    # One of "tracking", "workup", "purification", "final".
    category: str
    # Ordering key; discovered records are sorted ascending on this value.
    sort_key: float
    # Tracking group prefix (filled in by the batch categorizer).
    group_prefix: Optional[str] = None
    # Filename-derived method hint (AmB, AmF, etc.).
    method_variant: Optional[str] = None

    @property
    def filename(self) -> str:
        """Return the final component of ``path`` (the file's basename)."""
        _parent, leaf = os.path.split(self.path)
        return leaf
48
+
49
@dataclass
class DiscoveryResult:
    """Everything that was discovered for a single experiment.

    Each ``*_files`` attribute holds paths of one file type; ``lcms_files``
    holds classified records rather than bare paths. ``warnings`` collects
    human-readable notes about missing file types.
    """

    experiment: str
    input_dir: str
    csv_files: List[str] = field(default_factory=list)
    cdx_files: List[str] = field(default_factory=list)
    rxn_files: List[str] = field(default_factory=list)
    lcms_files: List[LCMSFileRecord] = field(default_factory=list)
    nmr_files: List[str] = field(default_factory=list)
    warnings: List[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return a plain dict/list/scalar view that ``json.dumps`` accepts."""
        # Flatten each LCMS record into a dict of its five attributes.
        lcms_entries = []
        for record in self.lcms_files:
            lcms_entries.append({
                "path": record.path,
                "category": record.category,
                "sort_key": record.sort_key,
                "group_prefix": record.group_prefix,
                "method_variant": record.method_variant,
            })

        files_section = {
            "csv": self.csv_files,
            "cdx": self.cdx_files,
            "rxn": self.rxn_files,
            "lcms": lcms_entries,
            "nmr": self.nmr_files,
        }
        return {
            "experiment": self.experiment,
            "input_dir": self.input_dir,
            "files": files_section,
            "warnings": self.warnings,
        }
81
+
82
+ # ---------------------------------------------------------------------------
83
+ # Core helpers (extracted from procedure_writer.py)
84
+ # ---------------------------------------------------------------------------
85
+
86
+ def _find_files_matching(directory: str, experiment_name: str,
87
+ extensions: tuple) -> List[str]:
88
+ """Find files matching experiment name prefix in a directory."""
89
+ if not os.path.isdir(directory):
90
+ return []
91
+ prefix = experiment_name.lower()
92
+ matches = []
93
+ for f in os.listdir(directory):
94
+ fl = f.lower()
95
+ if fl.startswith(prefix) and fl.endswith(extensions):
96
+ # Ensure it's not a different experiment (e.g., KL-7001-0040)
97
+ remainder = f[len(experiment_name):]
98
+ if not remainder or remainder[0] in ('-', '.', ' '):
99
+ matches.append(os.path.join(directory, f))
100
+ return sorted(matches)
101
+
102
+
103
def _pdf_contains_nmr_data(pdf_path: str) -> bool:
    """Report whether the text of a PDF mentions an NMR nucleus label.

    The text returned by ``extract_all_text`` is searched for one or more
    digits, a single uppercase letter, whitespace and the word ``NMR``
    (e.g. "1H NMR", "13C NMR"). Any exception raised while extracting or
    searching is treated as "no NMR data".
    """
    try:
        pdf_text = extract_all_text(pdf_path)
        nmr_hit = re.search(r'\d+[A-Z]\s+NMR', pdf_text)
    except Exception:
        # Best-effort detection: an unreadable PDF simply does not count.
        return False
    return nmr_hit is not None
110
+
111
+ # ---------------------------------------------------------------------------
112
+ # Main discovery logic (extracted from procedure_writer.discover_files)
113
+ # ---------------------------------------------------------------------------
114
+
115
def discover_experiment_files(
    input_dir: str,
    experiment_name: Optional[str] = None,
) -> DiscoveryResult:
    """
    Discover all files for an experiment.

    Handles two layouts:
    1. input_dir IS the experiment dir (contains .csv directly)
    2. input_dir is the parent dir (contains experiment subdirs)

    Args:
        input_dir: Path to experiment directory or parent directory.
        experiment_name: Experiment name (e.g. "KL-7001-004"). Required
            if input_dir is the parent directory.

    Returns:
        DiscoveryResult with all found files classified by type. LCMS
        records are sorted by ``sort_key``; missing file types are noted in
        ``warnings``.

    Raises:
        SystemExit: if no CSV is found and no experiment name is provided
            (an error message is printed to stderr first).
        OSError: if input_dir cannot be listed (``os.listdir`` is called
            on it unguarded).
    """
    input_dir = os.path.abspath(input_dir)

    # Try to find CSV directly in input_dir. os.listdir returns entries in
    # arbitrary order, so sort to make the CSV chosen below deterministic.
    csv_in_dir = sorted(f for f in os.listdir(input_dir)
                        if f.lower().endswith('.csv'))

    if csv_in_dir and not experiment_name:
        # input_dir IS the experiment dir — infer experiment name from CSV,
        # falling back to the directory's own name.
        csv_path = os.path.join(input_dir, csv_in_dir[0])
        experiment_name = _infer_experiment_from_csv(csv_path)
        if not experiment_name:
            experiment_name = os.path.basename(input_dir)
        parent_dir = os.path.dirname(input_dir)
    elif experiment_name:
        # input_dir is parent, look in experiment subdir
        parent_dir = input_dir
    else:
        # No CSV, no experiment name — list subdirs as candidates
        print("Error: No CSV found and no --experiment specified.",
              file=sys.stderr)
        subdirs = [d for d in os.listdir(input_dir)
                   if os.path.isdir(os.path.join(input_dir, d))
                   and not d.startswith('.')
                   and d not in ('DATA', 'LCMS files')]
        if subdirs:
            print(f"Available experiments: {', '.join(sorted(subdirs))}",
                  file=sys.stderr)
        sys.exit(1)

    result = DiscoveryResult(
        experiment=experiment_name,
        input_dir=input_dir,
    )

    # --- CSV ---
    exp_dir_path = os.path.join(parent_dir, experiment_name)
    csv_path = os.path.join(exp_dir_path, f"{experiment_name}.csv")
    if os.path.isfile(csv_path):
        result.csv_files.append(csv_path)
    elif csv_in_dir:
        # Flat layout: CSV was found directly in input_dir
        result.csv_files.append(os.path.join(input_dir, csv_in_dir[0]))

    # --- CDX / RXN ---
    # Check experiment subdir first, then input_dir itself (flat layout)
    if os.path.isdir(exp_dir_path):
        result.cdx_files.extend(
            _find_files_matching(exp_dir_path, experiment_name, ('.cdx',)))
        result.rxn_files.extend(
            _find_files_matching(exp_dir_path, experiment_name, ('.rxn',)))

    # exp_dir_path may be input_dir itself, so skip paths already recorded.
    for f in _find_files_matching(input_dir, experiment_name, ('.cdx',)):
        if f not in result.cdx_files:
            result.cdx_files.append(f)
    for f in _find_files_matching(input_dir, experiment_name, ('.rxn',)):
        if f not in result.rxn_files:
            result.rxn_files.append(f)

    # --- LCMS PDFs ---
    # Search LCMS files dir, experiment dir, input_dir, parent dir
    # NOT DATA directory (DATA is for NMR).
    # parent_dir can equal input_dir; dict.fromkeys drops the repeated
    # directories (keeping order) so rejected PDFs are not parsed twice.
    lcms_dirs = list(dict.fromkeys([
        os.path.join(parent_dir, 'LCMS files'),
        os.path.join(input_dir, 'LCMS files'),
        input_dir,
        parent_dir,
    ]))
    seen_lcms = set()
    lcms_candidates = []  # (path, filename) — collect all first, then batch

    for d in lcms_dirs:
        for f in _find_files_matching(d, experiment_name, ('.pdf',)):
            fname = os.path.basename(f).lower()
            if fname in seen_lcms:
                continue
            if 'nmr' in fname or 'mnova' in fname:
                continue
            # Content-based check: skip non-standard PDFs (e.g. manually
            # integrated chromatograms) that aren't Waters MassLynx reports
            if not is_waters_report(f):
                continue
            seen_lcms.add(fname)
            lcms_candidates.append((f, os.path.basename(f)))

    # Batch-categorize using context-aware categorizer (resolves ambiguities
    # like tNN purification fractions vs tracking timepoints).
    if lcms_candidates:
        filenames = [fn for _, fn in lcms_candidates]
        path_map = {fn: path for path, fn in lcms_candidates}
        batch = categorize_lcms_files_batch(filenames, experiment_name)

        for fn in filenames:
            if fn in batch.filtered_files:
                continue  # skip special files (-MS, -LC, -UV, etc.)
            fc = batch.files.get(fn)
            if fc is not None:
                result.lcms_files.append(LCMSFileRecord(
                    path=path_map[fn],
                    category=fc.category,
                    sort_key=fc.sort_key,
                    group_prefix=fc.group_prefix,
                    method_variant=(fc.modifiers.method_variant
                                    if fc.modifiers else None),
                ))
            else:
                # Fallback to simple categorizer (shouldn't happen)
                category, sort_key = categorize_lcms_file(fn)
                result.lcms_files.append(LCMSFileRecord(
                    path=path_map[fn],
                    category=category,
                    sort_key=sort_key,
                ))

    # Sort LCMS files chronologically
    result.lcms_files.sort(key=lambda x: x.sort_key)

    # --- NMR PDFs ---
    # Scan DATA directories for PDFs matching experiment name that
    # contain NMR data strings (content-based detection). Deduplicated for
    # the same reason as lcms_dirs.
    data_dirs = list(dict.fromkeys([
        os.path.join(parent_dir, 'DATA'),
        os.path.join(input_dir, 'DATA'),
    ]))
    seen_nmr = set()

    for d in data_dirs:
        for f in _find_files_matching(d, experiment_name, ('.pdf',)):
            fname = os.path.basename(f).lower()
            if fname in seen_nmr:
                continue
            if _pdf_contains_nmr_data(f):
                seen_nmr.add(fname)
                result.nmr_files.append(f)

    # --- Warnings ---
    if not result.csv_files:
        result.warnings.append("No CSV file found")
    if not result.lcms_files:
        result.warnings.append("No LCMS PDF files found")
    if not result.cdx_files and not result.rxn_files:
        result.warnings.append("No CDX or RXN structure files found")

    return result
284
+
285
+
286
+ def _infer_experiment_from_csv(csv_path: str) -> Optional[str]:
287
+ """Read the EXPERIENCE_NAME field from a Findmolecule CSV."""
288
+ try:
289
+ import csv as csv_mod
290
+ with open(csv_path, 'r', encoding='utf-8-sig') as f:
291
+ reader = csv_mod.reader(f, delimiter=';', quotechar='"')
292
+ rows = list(reader)
293
+ if len(rows) >= 2:
294
+ headers = rows[0]
295
+ values = rows[1]
296
+ meta = dict(zip(headers, values))
297
+ name = meta.get('EXPERIENCE_NAME', '').strip()
298
+ if name:
299
+ return name
300
+ except Exception:
301
+ pass
302
+ return None
303
+
304
+ # ---------------------------------------------------------------------------
305
+ # Output formatting
306
+ # ---------------------------------------------------------------------------
307
+
308
def _format_path_section(title: str, paths: List[str]) -> List[str]:
    """Return the report lines for one flat list of file paths."""
    section = [f"{title} ({len(paths)}):"]
    section.extend(f" {os.path.basename(p)}" for p in paths)
    if not paths:
        section.append(" (none)")
    return section


def format_text_report(result: DiscoveryResult) -> str:
    """Format discovery result as human-readable text.

    The report lists, in order: experiment name and input directory, CSV,
    CDX and RXN files, LCMS files grouped by category, NMR files, and
    finally any warnings. Files are shown by basename; an empty section
    shows "(none)".

    Args:
        result: Discovery result to render.

    Returns:
        The report as a single newline-joined string (no trailing newline).
    """
    lines = [
        f"Experiment: {result.experiment}",
        f"Input dir: {result.input_dir}",
        "",
    ]

    lines.extend(_format_path_section("CSV files", result.csv_files))
    lines.extend(_format_path_section("CDX files", result.cdx_files))
    lines.extend(_format_path_section("RXN files", result.rxn_files))

    # LCMS: group records by category, preserving their existing order.
    lines.append(f"LCMS files ({len(result.lcms_files)}):")
    categories: Dict[str, List[LCMSFileRecord]] = {}
    for lf in result.lcms_files:
        categories.setdefault(lf.category, []).append(lf)

    known_order = ("tracking", "workup", "purification", "final")
    # Known categories come first in workflow order. Any other category is
    # appended afterwards so that every record counted in the header is
    # also listed instead of being dropped silently.
    # NOTE(review): whether the categorizers can emit other categories is
    # not visible from this module — confirm against lcms_file_categorizer.
    display_order = list(known_order)
    display_order.extend(cat for cat in categories if cat not in known_order)

    for cat in display_order:
        cat_files = categories.get(cat, [])
        if cat_files:
            lines.append(f" {cat} ({len(cat_files)}):")
            for lf in cat_files:
                lines.append(f" {os.path.basename(lf.path)} "
                             f"[sort_key={lf.sort_key}]")
    if not result.lcms_files:
        lines.append(" (none)")

    lines.extend(_format_path_section("NMR files", result.nmr_files))

    # Warnings are separated from the file sections by a blank line.
    if result.warnings:
        lines.append("")
        lines.append("Warnings:")
        lines.extend(f" - {w}" for w in result.warnings)

    return "\n".join(lines)
366
+
367
+ # ---------------------------------------------------------------------------
368
+ # CLI
369
+ # ---------------------------------------------------------------------------
370
+
371
+ def _build_arg_parser() -> argparse.ArgumentParser:
372
+ p = argparse.ArgumentParser(
373
+ description="Experiment File Discovery Tool",
374
+ formatter_class=argparse.RawDescriptionHelpFormatter,
375
+ epilog=__doc__,
376
+ )
377
+ p.add_argument("--input-dir", "-i", required=True,
378
+ help="Directory containing experiment files "
379
+ "(experiment dir or parent dir)")
380
+ p.add_argument("--experiment", "-e", default=None,
381
+ help="Experiment name (e.g., KL-7001-004). "
382
+ "Required if input-dir is the parent directory.")
383
+ p.add_argument("--json", "-j", action="store_true",
384
+ help="Output in JSON format")
385
+ p.add_argument("--output", "-o", default=None,
386
+ help="Output file path (default: stdout)")
387
+ return p
388
+
389
+
390
def main(argv=None) -> int:
    """Run the discovery tool from the command line.

    Parses *argv* (``None`` lets argparse fall back to ``sys.argv``), runs
    the discovery, renders the result as JSON or text, and either prints
    it to stdout or writes it to the requested output file.

    Returns:
        0 on completion.
    """
    args = _build_arg_parser().parse_args(argv)
    result = discover_experiment_files(args.input_dir, args.experiment)

    output = (json.dumps(result.to_dict(), indent=2)
              if args.json
              else format_text_report(result))

    if not args.output:
        print(output)
        return 0

    with open(args.output, 'w', encoding='utf-8') as f:
        f.write(output)
        f.write('\n')
    # Confirmation goes to stderr so stdout stays free of extra chatter.
    print(f"Output written to {args.output}", file=sys.stderr)
    return 0
410
+
411
+
412
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == '__main__':
    raise SystemExit(main())