structflo-cser 0.2.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97) hide show
  1. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/.gitignore +3 -0
  2. structflo_cser-0.3.0/CLAUDE.md +210 -0
  3. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/PKG-INFO +1 -1
  4. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/annotate/storage.py +16 -12
  5. structflo_cser-0.3.0/docs/fine-tune.md +115 -0
  6. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/notebooks/02-LPS.ipynb +18 -291
  7. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/pyproject.toml +5 -2
  8. structflo_cser-0.3.0/scripts/finetune/lps/diag_e2e_decompose.py +168 -0
  9. structflo_cser-0.3.0/scripts/finetune/lps/eval_compare.py +205 -0
  10. structflo_cser-0.3.0/scripts/finetune/lps/eval_end2end.py +167 -0
  11. structflo_cser-0.3.0/scripts/finetune/lps/eval_rejection.py +168 -0
  12. structflo_cser-0.3.0/scripts/finetune/lps/mine_fp_negatives.py +127 -0
  13. structflo_cser-0.3.0/scripts/finetune/lps/prepare_data.py +171 -0
  14. structflo_cser-0.3.0/scripts/finetune/lps/train.sh +40 -0
  15. structflo_cser-0.3.0/scripts/finetune/yolo/eval_compare.py +133 -0
  16. structflo_cser-0.3.0/scripts/finetune/yolo/prepare_data.py +191 -0
  17. structflo_cser-0.3.0/scripts/finetune/yolo/train.sh +91 -0
  18. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/lps/dataset.py +35 -0
  19. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/lps/train.py +30 -1
  20. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/pipeline/pipeline.py +5 -2
  21. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/weights.py +16 -2
  22. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/uv.lock +0 -1
  23. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/.github/workflows/ci.yml +0 -0
  24. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/.github/workflows/publish.yml +0 -0
  25. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/.python-version +0 -0
  26. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/Makefile +0 -0
  27. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/README.md +0 -0
  28. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/annotate/__main__.py +0 -0
  29. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/annotate/pdf.py +0 -0
  30. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/annotate/server.py +0 -0
  31. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/annotate/templates/index.html +0 -0
  32. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/asset_scripts/download_chembl.sh +0 -0
  33. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/config/data.yaml +0 -0
  34. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/config/pipeline.yaml +0 -0
  35. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/docs/images/example-1.png +0 -0
  36. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/docs/images/example-2.png +0 -0
  37. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/docs/learned_matcher_plan.md +0 -0
  38. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/docs/lps.md +0 -0
  39. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/docs/publishing-weights.md +0 -0
  40. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/main.py +0 -0
  41. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/notebooks/01-quickstart.ipynb +0 -0
  42. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/notebooks/03-PDF.ipynb +0 -0
  43. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/notebooks/notebook-data/bio-arcgive-1.png +0 -0
  44. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/notebooks/notebook-data/example-annotated.pdf +0 -0
  45. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/notebooks/notebook-data/example.pdf +0 -0
  46. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/notebooks/notebook-data/example.pptx +0 -0
  47. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/notebooks/notebook-data/screen-1.png +0 -0
  48. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/notebooks/notebook-data/syn-1.jpg +0 -0
  49. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/quick.md +0 -0
  50. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/scripts/publish_weights.py +0 -0
  51. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/__init__.py +0 -0
  52. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/_geometry.py +0 -0
  53. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/config.py +0 -0
  54. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/data/__init__.py +0 -0
  55. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/data/distractor_images.py +0 -0
  56. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/data/smiles.py +0 -0
  57. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/distractors/__init__.py +0 -0
  58. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/distractors/charts.py +0 -0
  59. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/distractors/shapes.py +0 -0
  60. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/distractors/text_elements.py +0 -0
  61. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/generation/__init__.py +0 -0
  62. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/generation/dataset.py +0 -0
  63. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/generation/page.py +0 -0
  64. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/generation/specialty.py +0 -0
  65. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/generation/tabular.py +0 -0
  66. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/inference/__init__.py +0 -0
  67. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/inference/detector.py +0 -0
  68. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/inference/nms.py +0 -0
  69. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/inference/pairing.py +0 -0
  70. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/inference/tiling.py +0 -0
  71. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/lps/__init__.py +0 -0
  72. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/lps/evaluate.py +0 -0
  73. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/lps/features.py +0 -0
  74. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/lps/matcher.py +0 -0
  75. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/lps/scorer.py +0 -0
  76. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/pipeline/__init__.py +0 -0
  77. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/pipeline/cli.py +0 -0
  78. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/pipeline/matcher.py +0 -0
  79. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/pipeline/models.py +0 -0
  80. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/pipeline/ocr.py +0 -0
  81. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/pipeline/smiles_extractor.py +0 -0
  82. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/rendering/__init__.py +0 -0
  83. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/rendering/chemistry.py +0 -0
  84. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/rendering/text.py +0 -0
  85. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/training/__init__.py +0 -0
  86. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/training/trainer.py +0 -0
  87. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/viz/__init__.py +0 -0
  88. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/viz/detections.py +0 -0
  89. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/structflo/cser/viz/labels.py +0 -0
  90. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/tests/__init__.py +0 -0
  91. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/tests/test_config.py +0 -0
  92. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/tests/test_generation.py +0 -0
  93. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/tests/test_geometry.py +0 -0
  94. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/tests/test_imports.py +0 -0
  95. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/tests/test_inference.py +0 -0
  96. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/tests/test_models.py +0 -0
  97. {structflo_cser-0.2.0 → structflo_cser-0.3.0}/tests/test_viz.py +0 -0
@@ -44,6 +44,9 @@ archive/
44
44
  # ── Project: datasets & generated data ───────────────────────────────────────
45
45
  /data/
46
46
 
47
+ # ── Unpublished paper drafts (confidential — keep off git, code only) ────────
48
+ docs/*draft*.md
49
+
47
50
  # ── Model weights ─────────────────────────────────────────────────────────────
48
51
  # Weights are published to HF Hub — never commit .pt files directly.
49
52
  # Exception: commit weights.py (the registry) but not the binaries.
@@ -0,0 +1,210 @@
1
+ # CLAUDE.md — structflo-cser
2
+
3
+ ## What this project does
4
+
5
+ Chemical Structure-Label pair Extraction and Recognition (CSER) from scientific document pages.
6
+ Given a PDF or image of a chemistry paper/patent, the pipeline detects chemical structure drawings
7
+ and their compound labels (e.g. "CHEMBL12345", "Compound 1a"), pairs them, then extracts SMILES
8
+ strings and OCR text.
9
+
10
+ ## Package name & layout
11
+
12
+ - **PyPI package**: `structflo-cser`
13
+ - **Top-level packages** (wheel): `structflo`, `annotate`
14
+ - **Source root**: `structflo/cser/` — all library code lives here
15
+ - **Annotate tool**: `annotate/` — Flask web app for manual bbox annotation
16
+
17
+ ### Module map
18
+
19
+ ```
20
+ structflo/cser/
21
+ config.py PageConfig dataclass (A4@300DPI defaults, slide layouts)
22
+ _geometry.py Pure bbox utilities (clamp, intersect, placement)
23
+ weights.py HF Hub weight registry + auto-download (resolve_weights)
24
+ __init__.py Package version
25
+
26
+ data/
27
+ smiles.py Fetch/load SMILES from ChEMBL CSV
28
+ distractor_images.py Download/load distractor images for training data
29
+
30
+ rendering/
31
+ chemistry.py RDKit 2D structure rendering to PIL images
32
+ text.py Label text rendering (random compound IDs, fonts, rotation)
33
+
34
+ distractors/
35
+ charts.py Synthetic chart/figure distractors
36
+ shapes.py Geometric shapes for hard negatives
37
+ text_elements.py Prose blocks, captions, stray text
38
+
39
+ generation/
40
+ page.py Core page compositor (place structures + labels + distractors)
41
+ dataset.py Dataset generation orchestrator (multiprocessing, YOLO label export)
42
+ specialty.py Specialty layouts (SAR tables, MMP sheets, data cards)
43
+ tabular.py Excel-style and grid compound layouts
44
+
45
+ training/
46
+ trainer.py YOLO11l training wrapper (AdamW, cosine LR, grayscale augmentation)
47
+
48
+ inference/
49
+ detector.py YOLO inference (tiled + full-image), visualisation
50
+ tiling.py Sliding-window tile generation
51
+ nms.py Greedy NMS for merging tiled detections
52
+ pairing.py Hungarian matching on centroid distance
53
+
54
+ lps/ Learned Pair Scorer (replaces Euclidean matching)
55
+ features.py 14-dim geometric features + visual crop extraction
56
+ scorer.py PairScorer CNN (~557K params): struct_crop + label_crop + geom → logit
57
+ matcher.py LearnedMatcher (BaseMatcher impl using PairScorer + Hungarian)
58
+ dataset.py LPS training dataset (positive/negative pair sampling from GT)
59
+ train.py LPS training loop
60
+ evaluate.py LPS evaluation script
61
+
62
+ pipeline/
63
+ models.py Core dataclasses: BBox, Detection, CompoundPair
64
+ matcher.py BaseMatcher ABC + HungarianMatcher
65
+ ocr.py BaseOCR ABC + EasyOCRExtractor + NullOCR
66
+ smiles_extractor.py BaseSmilesExtractor ABC + DecimerExtractor + NullSmilesExtractor
67
+ pipeline.py ChemPipeline: detect → match → enrich (main public API)
68
+ cli.py sf-extract CLI entry point
69
+
70
+ viz/
71
+ labels.py Visualise YOLO label files on synthetic pages
72
+ detections.py Matplotlib plots for Detection/CompoundPair objects
73
+
74
+ annotate/
75
+ __main__.py Flask annotation tool entry point
76
+ server.py Flask routes
77
+ pdf.py PDF page rendering for annotation
78
+ storage.py Annotation JSON storage
79
+ templates/ HTML templates
80
+ ```
81
+
82
+ ## CLI entry points (registered in pyproject.toml)
83
+
84
+ | Command | Module | Purpose |
85
+ |---------------------------|-------------------------------------------|--------------------------------------|
86
+ | `sf-generate` | `structflo.cser.generation.dataset:main` | Generate synthetic training data |
87
+ | `sf-train` | `structflo.cser.training.trainer:main` | Train YOLO11l detector |
88
+ | `sf-detect` | `structflo.cser.inference.detector:main` | Run detection on images |
89
+ | `sf-extract` | `structflo.cser.pipeline.cli:main` | Full pipeline: detect+match+extract |
90
+ | `sf-viz` | `structflo.cser.viz.labels:main` | Visualise YOLO labels on images |
91
+ | `sf-fetch-smiles` | `structflo.cser.data.smiles:main` | Download SMILES from ChEMBL |
92
+ | `sf-download-distractors` | `structflo.cser.data.distractor_images:main` | Download distractor images |
93
+ | `sf-annotate` | `annotate.__main__:main` | Manual annotation web tool |
94
+ | `sf-train-lps` | `structflo.cser.lps.train:main` | Train Learned Pair Scorer |
95
+ | `sf-eval-lps` | `structflo.cser.lps.evaluate:main` | Evaluate LPS model |
96
+
97
+ ## Key public API
98
+
99
+ ```python
100
+ from structflo.cser.pipeline import ChemPipeline
101
+
102
+ pipeline = ChemPipeline() # auto-downloads weights
103
+ pairs = pipeline.process("page.png") # detect → match → enrich
104
+ pairs = pipeline.process_pdf("paper.pdf") # per-page processing
105
+
106
+ # Low-level access
107
+ detections = pipeline.detect(image)
108
+ pairs = pipeline.match(detections, image=image)
109
+ pairs = pipeline.enrich(pairs, image)
110
+
111
+ # Output
112
+ ChemPipeline.to_json(pairs)
113
+ ChemPipeline.to_dataframe(pairs)
114
+ ChemPipeline.to_records(pairs)
115
+ ```
116
+
117
+ ## Detection model
118
+
119
+ - **Architecture**: YOLO11l (ultralytics)
120
+ - **Classes**: 2 — `chemical_structure` (0), `compound_label` (1)
121
+ - **Training image size**: 1280px
122
+ - **Inference**: sliding-window tiling (1536px tiles, 20% overlap) + per-class NMS
123
+ - **Training config**: AdamW, cosine LR, grayscale images, no colour augmentation
124
+ - **Runs directory**: `runs/labels_detect/`
125
+ - **YOLO data config**: `config/data.yaml`
126
+
127
+ ## Matching strategies
128
+
129
+ 1. **HungarianMatcher** — centroid Euclidean distance + `scipy.optimize.linear_sum_assignment`
130
+ 2. **LearnedMatcher** (LPS) — CNN scorer produces association probability per (struct, label) pair,
131
+ then Hungarian on `1 - score`. Default in ChemPipeline.
132
+
133
+ ## Weights system
134
+
135
+ Weights are versioned independently of the package and stored on HuggingFace Hub.
136
+ `structflo.cser.weights.resolve_weights(model, version)` handles auto-download + caching.
137
+
138
+ | Model | HF Repo | Latest |
139
+ |----------------|----------------------------------|--------|
140
+ | cser-detector | sidxz/structflo-cser-detector | v0.2 |
141
+ | cser-lps | sidxz/structflo-cser-lps | v0.1 |
142
+
143
+ Publish script: `scripts/publish_weights.py`
144
+
145
+ ## Fine-tuning on real data
146
+
147
+ Scripts live in `scripts/finetune/{yolo,lps}/`, each with `prepare_data.py`, `train.sh`, `eval_compare.py`.
148
+
149
+ ### Data layout
150
+ - **Real annotations**: produced by `sf-annotate`, stored externally (symlinked in)
151
+ - **Combined data**: `data/finetune/{yolo,lps}/` — symlinks mixing subsampled synthetic + oversampled real
152
+ - Knobs at top of each `prepare_data.py`: `N_SYNTH_TRAIN`, `N_SYNTH_VAL`, `REAL_OVERSAMPLE`, `N_REAL_VAL`
153
+
154
+ ### YOLO fine-tune
155
+ - Starts from `runs/labels_detect/yolo11l_panels/weights/best.pt`
156
+ - Output: `runs/labels_detect/finetune_trial/weights/best.pt`
157
+ - Lower LR (1e-4), short warmup (1 epoch), 10 epochs default
158
+
159
+ ### LPS fine-tune
160
+ - Uses `sf-train-lps --finetune <checkpoint>` (loads weights only, fresh optimizer/scheduler)
161
+ - Distinct from `--resume` which restores full training state (optimizer, scheduler, epoch)
162
+ - Starts from `runs/lps/best.pt`, output: `runs/lps_finetune/best.pt`
163
+
164
+ ### Eval
165
+ - `eval_compare.py` runs both the baseline and the fine-tuned model on two val sets (finetune val + original synthetic val)
166
+ - Prints summary table with deltas and a verdict (improvement vs regression)
167
+
168
+ ### Publishing fine-tuned weights
169
+ ```bash
170
+ python scripts/publish_weights.py --model cser-detector --version vX.Y \
171
+ --weights-file runs/labels_detect/finetune_trial/weights/best.pt
172
+ python scripts/publish_weights.py --model cser-lps --version vX.Y \
173
+ --weights-file runs/lps_finetune/best.pt
174
+ ```
175
+
176
+ ## Synthetic data generation
177
+
178
+ - Pages: A4@300DPI (2480x3508) as JPEG, also slide layouts (16:9)
179
+ - Layout types: free-form (~30%), Excel tables (~14%), grids (~12%), SAR tables (~8%),
180
+ MMP sheets (~7%), data cards (~8%), slides (~13%), hard negatives (~8%)
181
+ - Structures rendered via RDKit from ChEMBL SMILES
182
+ - Labels: random compound IDs in various styles (CHEMBL, ZINC, Roman numerals, etc.)
183
+ - Noise augmentation: JPEG artifacts, blur, brightness, Gaussian noise
184
+ - Output: images + YOLO .txt labels + ground truth JSON (per-compound struct/label bboxes + SMILES)
185
+ - Default: 2000 train / 200 val pages, multiprocessing with all CPUs
186
+
187
+ ## Build & dev
188
+
189
+ ```bash
190
+ uv sync --dev # install all deps
191
+ uv run ruff check structflo/ tests/ # lint
192
+ uv run ruff format structflo/ tests/ # format
193
+ uv run pytest -q # tests
194
+ uv build # build wheel
195
+ ```
196
+
197
+ - **Python**: >=3.11 (project uses 3.12)
198
+ - **Build system**: hatchling + hatch-vcs (version from git tags)
199
+ - **Linting**: ruff
200
+ - **Tests**: pytest (tests/ directory)
201
+ - **CI**: GitHub Actions — lint + format check + pytest + coverage on push/PR to main
202
+ - **PyPI publish**: on git tag `v*`
203
+
204
+ ## Conventions
205
+
206
+ - All images converted to grayscale before detection (matches training distribution)
207
+ - Adapters pattern: `BaseMatcher`, `BaseOCR`, `BaseSmilesExtractor` ABCs for swappable components
208
+ - Lazy model loading throughout (YOLO, EasyOCR, DECIMER loaded on first use)
209
+ - Weights never committed to git (*.pt in .gitignore), only on HF Hub
210
+ - `runs/`, `data/`, `detections/`, `archive/` are gitignored
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: structflo-cser
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: Chemical structure-label pair extraction from scientific documents.
5
5
  Requires-Python: >=3.11
6
6
  Requires-Dist: chembl-webresource-client>=0.10.9
@@ -12,7 +12,8 @@ Ground-truth JSON schema (pair format):
12
12
  ]
13
13
 
14
14
  YOLO .txt (written only when pairs are non-empty):
15
- 0 cx cy w h (normalised 0-1; class 0 = compound_panel = union bbox)
15
+ 0 cx cy w h (normalised 0-1; class 0 = chemical_structure)
16
+ 1 cx cy w h (normalised 0-1; class 1 = compound_label)
16
17
 
17
18
  Annotation states:
18
19
  - GT JSON absent → page not yet visited
@@ -47,7 +48,7 @@ def save(page_id: str, pairs: list[dict], img_w: int, img_h: int,
47
48
  GT JSON is *always* written (even for empty pages) so the page is
48
49
  tracked as 'done'. YOLO .txt is only written when pairs are present.
49
50
 
50
- YOLO bounding box = union of struct_bbox and label_bbox (class 0).
51
+ YOLO labels: class 0 = chemical_structure, class 1 = compound_label.
51
52
  """
52
53
  gt_dir = output_dir / "ground_truth"
53
54
  gt_dir.mkdir(parents=True, exist_ok=True)
@@ -75,14 +76,17 @@ def save(page_id: str, pairs: list[dict], img_w: int, img_h: int,
75
76
  s = pair["struct_bbox"] # [x1, y1, x2, y2]
76
77
  l = pair.get("label_bbox") # [x1, y1, x2, y2] or None
77
78
 
79
+ # class 0 = chemical_structure
80
+ sx1, sy1, sx2, sy2 = s
81
+ f.write(
82
+ f"0 {(sx1 + sx2) / 2 / img_w:.6f} {(sy1 + sy2) / 2 / img_h:.6f} "
83
+ f"{(sx2 - sx1) / img_w:.6f} {(sy2 - sy1) / img_h:.6f}\n"
84
+ )
85
+
86
+ # class 1 = compound_label
78
87
  if l:
79
- x1 = min(s[0], l[0]); y1 = min(s[1], l[1])
80
- x2 = max(s[2], l[2]); y2 = max(s[3], l[3])
81
- else:
82
- x1, y1, x2, y2 = s
83
-
84
- cx = (x1 + x2) / 2 / img_w
85
- cy = (y1 + y2) / 2 / img_h
86
- w = (x2 - x1) / img_w
87
- h = (y2 - y1) / img_h
88
- f.write(f"0 {cx:.6f} {cy:.6f} {w:.6f} {h:.6f}\n")
88
+ lx1, ly1, lx2, ly2 = l
89
+ f.write(
90
+ f"1 {(lx1 + lx2) / 2 / img_w:.6f} {(ly1 + ly2) / 2 / img_h:.6f} "
91
+ f"{(lx2 - lx1) / img_w:.6f} {(ly2 - ly1) / img_h:.6f}\n"
92
+ )
@@ -0,0 +1,115 @@
1
+
2
+ # YOLO
3
+ uv run python scripts/finetune/yolo/prepare_data.py
4
+ bash scripts/finetune/yolo/train.sh
5
+ uv run python scripts/finetune/yolo/eval_compare.py
6
+
7
+ # LPS
8
+ uv run python scripts/finetune/lps/prepare_data.py
9
+ bash scripts/finetune/lps/train.sh
10
+ uv run python scripts/finetune/lps/eval_compare.py
11
+
12
+ Eval now checks two things per model:
13
+ 1. Finetune val (50 synth + 2 real) — did real data help?
14
+ 2. Original synthetic val (2000 pages) — did fine-tuning regress?
15
+
16
+ To publish if results look good:
17
+ python scripts/publish_weights.py --model cser-detector --version v0.3 \
18
+ --weights-file runs/labels_detect/finetune_trial/weights/best.pt
19
+
20
+ python scripts/publish_weights.py --model cser-lps --version v0.2 \
21
+ --weights-file runs/lps_finetune/best.pt
22
+
23
+
24
+
25
+
26
+ Based on the current defaults in scripts/finetune/*/prepare_data.py, here's what to scale up:
27
+
28
+ ┌─────────────────┬─────────────┬────────────────────────────┬────────────────────────────────────────────────────┐
29
+ │ Param │ Trial (now) │ Full dataset (~100+ pages) │ Why │
30
+ ├─────────────────┼─────────────┼────────────────────────────┼────────────────────────────────────────────────────┤
31
+ │ N_SYNTH_TRAIN │ 200 │ 2000–5000 │ More synthetic to prevent forgetting │
32
+ ├─────────────────┼─────────────┼────────────────────────────┼────────────────────────────────────────────────────┤
33
+ │ N_SYNTH_VAL │ 50 │ 200–500 │ More reliable val metrics │
34
+ ├─────────────────┼─────────────┼────────────────────────────┼────────────────────────────────────────────────────┤
35
+ │ REAL_OVERSAMPLE │ 10 │ 3–5 │ Less oversampling needed since you have more pages │
36
+ ├─────────────────┼─────────────┼────────────────────────────┼────────────────────────────────────────────────────┤
37
+ │ N_REAL_VAL │ 2 │ 10–20% of total │ Meaningful real val set │
38
+ └─────────────────┴─────────────┴────────────────────────────┴────────────────────────────────────────────────────┘
39
+
40
+ The goal is to keep real data at roughly 30–50% of training. With 100 real pages at 3x oversample + 2000 synthetic, that's 300/(300+2000) = ~13% — bump oversample to 5 for ~20%, or lower N_SYNTH_TRAIN
41
+ to 1000 for ~23% (300/1300), or do both for ~33% (500/1500).
42
+
43
+ For training hyperparams in train.sh:
44
+ - Epochs: trial uses 10, full run can stay at 10–15 (more data per epoch means less risk of overfitting)
45
+ - LR: keep as-is (1e-4 YOLO, 3e-4 LPS) — these are already conservative
46
+
47
+
48
+ With 500 annotated pages, here are the recommended values and what each param does:
49
+
50
+ prepare_data.py params
51
+
52
+ N_SYNTH_TRAIN (trial: 200 → recommended: 3000–5000)
53
+
54
+ Number of synthetic images randomly sampled into the training set. Synthetic data acts as a regularizer — it prevents the model from overfitting to quirks of your specific papers (particular fonts, DPI,
55
+ layout style). Too few and the model forgets synthetic-learned features; too many and real data gets drowned out. With 500 real pages, 3000–5000 synthetic keeps the ratio healthy.
56
+
57
+ N_SYNTH_VAL (trial: 50 → recommended: 300–500)
58
+
59
+ Synthetic images in the validation set. More gives you stable metrics — with only 50, a single noisy page can swing mAP by several points. 300+ makes the regression check trustworthy.
60
+
61
+ REAL_OVERSAMPLE (trial: 10 → recommended: 2–3)
62
+
63
+ Each real image gets this many symlink copies with unique names, so YOLO/LPS treats them as separate training examples. With only 14 pages you needed 10x to make real data visible. With 500 pages, 2–3x
64
+ is enough. At 3x: 1500 real / (1500 + 4000 synth) = 27% of training. Too high and the model memorizes your annotation set instead of generalizing.
65
+
66
+ N_REAL_VAL (trial: 2 → recommended: 50–75)
67
+
68
+ Real pages held out for validation (never seen during training). This is your ground truth for "did fine-tuning actually help on real documents?". At 50 pages you get reliable per-class metrics. The
69
+ remaining 425–450 go to training.
70
+
71
+ train.sh params (YOLO)
72
+
73
+ epochs=10 → recommended: 10–15
74
+
75
+ Number of full passes over the training data. More epochs = more chances to learn, but with 500 real pages you have enough data per epoch that 10–15 is sufficient. Early stopping (patience=5) will halt if val
76
+ metrics plateau anyway.
77
+
78
+ lr0=1e-4 → keep as-is
79
+
80
+ Starting learning rate, 10x lower than from-scratch training (1e-3). Low LR is critical for fine-tuning — too high and you destroy the features learned from 20K synthetic pages. Too low and you never
81
+ adapt. 1e-4 is a standard fine-tune rate.
82
+
83
+ warmup_epochs=1 → keep as-is
84
+
85
+ Epochs where LR ramps from ~0 to lr0. Prevents large early gradient updates that could destabilize the pretrained weights. 1 is enough since we're already starting with a well-trained model.
86
+
87
+ patience=5 → keep as-is
88
+
89
+ Stop training if val mAP doesn't improve for this many epochs. Prevents overfitting if the model converges early.
90
+
91
+ train.sh params (LPS)
92
+
93
+ --lr 3e-4 → keep as-is
94
+
95
+ Same logic — lower than from-scratch (1e-3) but the LPS model is tiny (557K params) so it can tolerate a slightly higher fine-tune LR than YOLO.
96
+
97
+ --epochs 10 → recommended: 10–15
98
+
99
+ --batch 512 → keep as-is
100
+
101
+ Batch size. LPS samples are small (crops + 14-dim features), so 512 fits easily in memory and gives stable gradient estimates.
102
+
103
+ Concrete config for 500 pages
104
+
105
+ # prepare_data.py (both yolo and lps)
106
+ N_SYNTH_TRAIN = 4000
107
+ N_SYNTH_VAL = 400
108
+ N_REAL_VAL = 50
109
+ REAL_OVERSAMPLE = 3
110
+
111
+ That gives you:
112
+ - Train: 4000 synth + 450×3 = 5350 images (25% real)
113
+ - Val: 400 synth + 50 real = 450 images
114
+
115
+ Training hyperparams stay the same — they're already tuned for fine-tuning.