structflo-cser 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. structflo_cser-0.1.0/.github/workflows/ci.yml +40 -0
  2. structflo_cser-0.1.0/.github/workflows/publish.yml +36 -0
  3. structflo_cser-0.1.0/.gitignore +69 -0
  4. structflo_cser-0.1.0/.python-version +1 -0
  5. structflo_cser-0.1.0/Makefile +35 -0
  6. structflo_cser-0.1.0/PKG-INFO +282 -0
  7. structflo_cser-0.1.0/README.md +258 -0
  8. structflo_cser-0.1.0/annotate/__main__.py +40 -0
  9. structflo_cser-0.1.0/annotate/pdf.py +41 -0
  10. structflo_cser-0.1.0/annotate/server.py +146 -0
  11. structflo_cser-0.1.0/annotate/storage.py +86 -0
  12. structflo_cser-0.1.0/annotate/templates/index.html +463 -0
  13. structflo_cser-0.1.0/asset_scripts/download_chembl.sh +72 -0
  14. structflo_cser-0.1.0/config/data.yaml +8 -0
  15. structflo_cser-0.1.0/config/pipeline.yaml +0 -0
  16. structflo_cser-0.1.0/docs/learned_matcher_plan.md +619 -0
  17. structflo_cser-0.1.0/docs/lps.md +363 -0
  18. structflo_cser-0.1.0/docs/publishing-weights.md +179 -0
  19. structflo_cser-0.1.0/main.py +6 -0
  20. structflo_cser-0.1.0/notebooks/01-quickstart.ipynb +959 -0
  21. structflo_cser-0.1.0/notebooks/02-LPS.ipynb +895 -0
  22. structflo_cser-0.1.0/notebooks/notebook-data/syn-1.jpg +0 -0
  23. structflo_cser-0.1.0/pyproject.toml +52 -0
  24. structflo_cser-0.1.0/quick.md +6 -0
  25. structflo_cser-0.1.0/scripts/publish_weights.py +313 -0
  26. structflo_cser-0.1.0/structflo/cser/__init__.py +3 -0
  27. structflo_cser-0.1.0/structflo/cser/_geometry.py +47 -0
  28. structflo_cser-0.1.0/structflo/cser/config.py +101 -0
  29. structflo_cser-0.1.0/structflo/cser/data/__init__.py +0 -0
  30. structflo_cser-0.1.0/structflo/cser/data/distractor_images.py +185 -0
  31. structflo_cser-0.1.0/structflo/cser/data/smiles.py +155 -0
  32. structflo_cser-0.1.0/structflo/cser/distractors/__init__.py +76 -0
  33. structflo_cser-0.1.0/structflo/cser/distractors/charts.py +108 -0
  34. structflo_cser-0.1.0/structflo/cser/distractors/shapes.py +71 -0
  35. structflo_cser-0.1.0/structflo/cser/distractors/text_elements.py +1155 -0
  36. structflo_cser-0.1.0/structflo/cser/generation/__init__.py +0 -0
  37. structflo_cser-0.1.0/structflo/cser/generation/dataset.py +378 -0
  38. structflo_cser-0.1.0/structflo/cser/generation/page.py +326 -0
  39. structflo_cser-0.1.0/structflo/cser/generation/specialty.py +698 -0
  40. structflo_cser-0.1.0/structflo/cser/generation/tabular.py +470 -0
  41. structflo_cser-0.1.0/structflo/cser/inference/__init__.py +0 -0
  42. structflo_cser-0.1.0/structflo/cser/inference/detector.py +279 -0
  43. structflo_cser-0.1.0/structflo/cser/inference/nms.py +24 -0
  44. structflo_cser-0.1.0/structflo/cser/inference/pairing.py +49 -0
  45. structflo_cser-0.1.0/structflo/cser/inference/tiling.py +28 -0
  46. structflo_cser-0.1.0/structflo/cser/lps/__init__.py +22 -0
  47. structflo_cser-0.1.0/structflo/cser/lps/dataset.py +396 -0
  48. structflo_cser-0.1.0/structflo/cser/lps/evaluate.py +215 -0
  49. structflo_cser-0.1.0/structflo/cser/lps/features.py +151 -0
  50. structflo_cser-0.1.0/structflo/cser/lps/matcher.py +200 -0
  51. structflo_cser-0.1.0/structflo/cser/lps/scorer.py +220 -0
  52. structflo_cser-0.1.0/structflo/cser/lps/train.py +348 -0
  53. structflo_cser-0.1.0/structflo/cser/pipeline/__init__.py +50 -0
  54. structflo_cser-0.1.0/structflo/cser/pipeline/cli.py +110 -0
  55. structflo_cser-0.1.0/structflo/cser/pipeline/matcher.py +73 -0
  56. structflo_cser-0.1.0/structflo/cser/pipeline/models.py +87 -0
  57. structflo_cser-0.1.0/structflo/cser/pipeline/ocr.py +61 -0
  58. structflo_cser-0.1.0/structflo/cser/pipeline/pipeline.py +215 -0
  59. structflo_cser-0.1.0/structflo/cser/pipeline/smiles_extractor.py +54 -0
  60. structflo_cser-0.1.0/structflo/cser/rendering/__init__.py +0 -0
  61. structflo_cser-0.1.0/structflo/cser/rendering/chemistry.py +147 -0
  62. structflo_cser-0.1.0/structflo/cser/rendering/text.py +346 -0
  63. structflo_cser-0.1.0/structflo/cser/training/__init__.py +0 -0
  64. structflo_cser-0.1.0/structflo/cser/training/trainer.py +100 -0
  65. structflo_cser-0.1.0/structflo/cser/viz/__init__.py +20 -0
  66. structflo_cser-0.1.0/structflo/cser/viz/detections.py +328 -0
  67. structflo_cser-0.1.0/structflo/cser/viz/labels.py +163 -0
  68. structflo_cser-0.1.0/structflo/cser/weights.py +243 -0
  69. structflo_cser-0.1.0/tests/__init__.py +1 -0
  70. structflo_cser-0.1.0/tests/test_config.py +46 -0
  71. structflo_cser-0.1.0/tests/test_generation.py +33 -0
  72. structflo_cser-0.1.0/tests/test_geometry.py +73 -0
  73. structflo_cser-0.1.0/tests/test_imports.py +55 -0
  74. structflo_cser-0.1.0/tests/test_inference.py +138 -0
  75. structflo_cser-0.1.0/tests/test_models.py +96 -0
  76. structflo_cser-0.1.0/tests/test_viz.py +255 -0
  77. structflo_cser-0.1.0/uv.lock +4068 -0
@@ -0,0 +1,40 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ check:
11
+ runs-on: ubuntu-latest
12
+ permissions:
13
+ contents: read
14
+ steps:
15
+ - uses: actions/checkout@v4
16
+
17
+ - name: Install uv
18
+ uses: astral-sh/setup-uv@v4
19
+
20
+ - name: Set up Python
21
+ run: uv python install 3.12
22
+
23
+ - name: Install dependencies
24
+ run: uv sync --dev
25
+
26
+ - name: Lint
27
+ run: uv run ruff check structflo/ tests/
28
+
29
+ - name: Format
30
+ run: uv run ruff format --check structflo/ tests/
31
+
32
+ - name: Test with coverage
33
+ run: uv run pytest -q --cov=structflo --cov-report=xml
34
+
35
+ - name: Upload coverage to Codecov
36
+ if: github.event_name == 'push' && github.ref == 'refs/heads/main'
37
+ uses: codecov/codecov-action@v4
38
+ with:
39
+ files: coverage.xml
40
+ token: ${{ secrets.CODECOV_TOKEN }}
@@ -0,0 +1,36 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*"
7
+
8
+ jobs:
9
+ publish:
10
+ runs-on: ubuntu-latest
11
+ permissions:
12
+ contents: read
13
+ id-token: write # trusted publishing
14
+ steps:
15
+ - uses: actions/checkout@v4
16
+
17
+ - name: Install uv
18
+ uses: astral-sh/setup-uv@v4
19
+
20
+ - name: Set up Python
21
+ run: uv python install 3.12
22
+
23
+ - name: Install dependencies
24
+ run: uv sync --dev
25
+
26
+ - name: Lint
27
+ run: uv run ruff check structflo/ tests/
28
+
29
+ - name: Test
30
+ run: uv run pytest -q
31
+
32
+ - name: Build
33
+ run: uv build
34
+
35
+ - name: Publish to PyPI
36
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,69 @@
1
+ # ── Python ────────────────────────────────────────────────────────────────────
2
+ __pycache__/
3
+ *.py[oc]
4
+ *.pyd
5
+ build/
6
+ dist/
7
+ wheels/
8
+ *.egg-info/
9
+ .eggs/
10
+ pip-wheel-metadata/
11
+ MANIFEST
12
+
13
+ # ── Virtual environments ───────────────────────────────────────────────────────
14
+ .venv/
15
+ venv/
16
+ env/
17
+
18
+ # ── Testing & linting ─────────────────────────────────────────────────────────
19
+ .pytest_cache/
20
+ .ruff_cache/
21
+ .mypy_cache/
22
+ .coverage
23
+ coverage.xml
24
+ htmlcov/
25
+
26
+ # ── Jupyter ───────────────────────────────────────────────────────────────────
27
+ .ipynb_checkpoints/
28
+ *.nbconvert/
29
+
30
+ # ── Editors & IDEs ────────────────────────────────────────────────────────────
31
+ .vscode/
32
+ .idea/
33
+ *.swp
34
+ *.swo
35
+ *~
36
+ .DS_Store
37
+ Thumbs.db
38
+
39
+ # ── Project: training & inference outputs ────────────────────────────────────
40
+ runs/
41
+ detections/
42
+ archive/
43
+
44
+ # ── Project: datasets & generated data ───────────────────────────────────────
45
+ /data/
46
+
47
+ # ── Model weights ─────────────────────────────────────────────────────────────
48
+ # Weights are published to HF Hub — never commit weight binaries (.pt, .pth, .ckpt, .safetensors) directly.
49
+ # Exception: commit weights.py (the registry) but not the binaries.
50
+ *.pt
51
+ *.pth
52
+ *.ckpt
53
+ *.safetensors
54
+
55
+ # ── Secrets & credentials ─────────────────────────────────────────────────────
56
+ .env
57
+ .env.*
58
+ *.pem
59
+ *.key
60
+ secrets/
61
+
62
+ # ── Logs ──────────────────────────────────────────────────────────────────────
63
+ *.log
64
+ logs/
65
+
66
+ # ── Misc ──────────────────────────────────────────────────────────────────────
67
+ .claude/
68
+ tmp/
69
+ scratch/
@@ -0,0 +1 @@
1
+ 3.12
@@ -0,0 +1,35 @@
1
+ .PHONY: install lint format check test clean build
2
+
3
+ ## Install all dependencies (including dev)
4
+ install:
5
+ uv sync --dev
6
+
7
+ ## Run ruff linter
8
+ lint:
9
+ uv run ruff check structflo/ tests/
10
+
11
+ ## Auto-fix lint issues
12
+ fix:
13
+ uv run ruff check --fix structflo/ tests/
14
+
15
+ ## Format code with ruff
16
+ format:
17
+ uv run ruff format structflo/ tests/
18
+
19
+ ## Check formatting + lint (CI-friendly, no changes)
20
+ check:
21
+ uv run ruff format --check structflo/ tests/
22
+ uv run ruff check structflo/ tests/
23
+
24
+ ## Run tests
25
+ test:
26
+ uv run pytest -q
27
+
28
+ ## Remove build artifacts
29
+ clean:
30
+ rm -rf dist/ build/ *.egg-info .pytest_cache
31
+ find . -type d -name __pycache__ -exec rm -rf {} +
32
+
33
+ ## Build package
34
+ build:
35
+ uv build
@@ -0,0 +1,282 @@
1
+ Metadata-Version: 2.4
2
+ Name: structflo-cser
3
+ Version: 0.1.0
4
+ Summary: YOLO-based chemical structure + label detector with synthetic data generation
5
+ Requires-Python: >=3.11
6
+ Requires-Dist: chembl-webresource-client>=0.10.9
7
+ Requires-Dist: decimer>=2.8.0
8
+ Requires-Dist: easyocr>=1.7
9
+ Requires-Dist: flask>=3.1.3
10
+ Requires-Dist: huggingface-hub>=0.24.0
11
+ Requires-Dist: matplotlib>=3.10.8
12
+ Requires-Dist: numpy>=2.4.2
13
+ Requires-Dist: opencv-python-headless>=4.13.0.92
14
+ Requires-Dist: packaging>=23.0
15
+ Requires-Dist: pandas>=2.0.0
16
+ Requires-Dist: pillow>=12.1.1
17
+ Requires-Dist: pymupdf>=1.27.1
18
+ Requires-Dist: rdkit>=2025.9.5
19
+ Requires-Dist: requests>=2.31.0
20
+ Requires-Dist: scipy>=1.17.0
21
+ Requires-Dist: tqdm>=4.67.3
22
+ Requires-Dist: ultralytics>=8.4.14
23
+ Description-Content-Type: text/markdown
24
+
25
+ # structflo-cser
26
+
27
+ YOLO11l-based detector for chemical structures and their compound label IDs in scientific documents.
28
+
29
+ Part of the **structflo** library. Import as:
30
+ ```python
31
+ from structflo.cser.pipeline import ChemPipeline
32
+ ```
33
+
34
+ **Detection target:** A single bounding box (`compound_panel`) enclosing the union of a rendered chemical structure and its nearby label ID (e.g. `CHEMBL12345`).
35
+
36
+ ---
37
+
38
+ ## Installation
39
+
40
+ ```bash
41
+ uv pip install -e .
42
+ ```
43
+
44
+ This installs all dependencies and registers the `sf-*` CLI commands on your PATH.
45
+
46
+ ---
47
+
48
+ ## Pipeline
49
+
50
+ ```
51
+ 1. Fetch SMILES → sf-fetch-smiles
52
+ 2. Download distractors → sf-download-distractors (optional but recommended)
53
+ 3. Generate dataset → sf-generate
54
+ 4. Visualize labels → sf-viz (optional QA check)
55
+ 5. Train YOLO → sf-train
56
+ 6. Run inference → sf-detect
57
+ 7. Annotate real PDFs → sf-annotate (optional)
58
+ ```
59
+
60
+ ---
61
+
62
+ ## Commands
63
+
64
+ ### 1. Fetch SMILES from ChEMBL
65
+
66
+ Extracts ~20 k small-molecule SMILES from a local [ChEMBL SQLite database](https://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/).
67
+
68
+ ```bash
69
+ sf-fetch-smiles \
70
+ --db chembl_35/chembl_35_sqlite/chembl_35.db \
71
+ --output data/smiles/chembl_smiles.csv \
72
+ --n 20000
73
+ ```
74
+
75
+ Output: `data/smiles/chembl_smiles.csv`
76
+
77
+ ---
78
+
79
+ ### 2. Download distractor images
80
+
81
+ Downloads real photographs from [Lorem Picsum](https://picsum.photos/) to use as hard-negative distractors during page generation.
82
+
83
+ ```bash
84
+ sf-download-distractors --out data/distractors --count 1000
85
+ ```
86
+
87
+ ---
88
+
89
+ ### 3. Generate synthetic dataset
90
+
91
+ Generates document-like pages (A4 @ 300 DPI or slide format) containing chemical structures, compound labels, and distractor elements.
92
+
93
+ ```bash
94
+ sf-generate \
95
+ --smiles data/smiles/chembl_smiles.csv \
96
+ --out data/generated \
97
+ --num-train 2000 --num-val 400 \
98
+ --fonts-dir data/fonts \
99
+ --distractors-dir data/distractors \
100
+ --dpi 96,144,200,300 \
101
+ --workers 0
102
+ ```
103
+
104
+ Key options:
105
+
106
+ | Flag | Default | Description |
107
+ |------|---------|-------------|
108
+ | `--num-train` | 2000 | Number of training pages |
109
+ | `--num-val` | 200 | Number of validation pages |
110
+ | `--dpi` | `96,144,200,300` | DPI values randomly sampled per page |
111
+ | `--grayscale` / `--no-grayscale` | on | Convert pages to grayscale |
112
+ | `--workers` | 0 (all CPUs) | Parallel workers; use `1` to disable multiprocessing |
113
+
114
+ **Output structure:**
115
+ ```
116
+ data/generated/
117
+ ├── train/
118
+ │ ├── images/ (JPEG pages)
119
+ │ ├── labels/ (YOLO .txt — union bbox per compound panel)
120
+ │ └── ground_truth/ (JSON with split struct_bbox / label_bbox / smiles)
121
+ └── val/
122
+ ├── images/
123
+ ├── labels/
124
+ └── ground_truth/
125
+ ```
126
+
127
+ ---
128
+
129
+ ### 4. Visualize labels (QA)
130
+
131
+ Overlays YOLO bounding boxes on a random sample of generated pages.
132
+
133
+ ```bash
134
+ sf-viz --split both --n 30 --out data/viz
135
+ ```
136
+
137
+ Green boxes = `chemical_structure`, blue boxes = `compound_label`.
138
+
139
+ ---
140
+
141
+ ### 5. Train
142
+
143
+ Fine-tunes YOLO11l on the generated dataset.
144
+
145
+ ```bash
146
+ sf-train --epochs 50 --imgsz 1280 --batch 8
147
+ ```
148
+
149
+ Key options:
150
+
151
+ | Flag | Default | Description |
152
+ |------|---------|-------------|
153
+ | `--weights` | `yolo11l.pt` | Pretrained backbone |
154
+ | `--imgsz` | 1280 | Training resolution |
155
+ | `--batch` | 8 | Batch size (safe for A6000 48 GB) |
156
+ | `--resume` | — | Path to `last.pt` to resume an interrupted run |
157
+
158
+ **Output:** `runs/labels_detect/yolo11l_panels/weights/best.pt`
159
+
160
+ ---
161
+
162
+ ### 6. Detect
163
+
164
+ Runs the trained detector on images using sliding-window tiling (1536 px tiles, 20 % overlap).
165
+
166
+ ```bash
167
+ # Single image
168
+ sf-detect --image page.png
169
+
170
+ # Directory of images
171
+ sf-detect --image_dir data/real/images/ --out detections/
172
+
173
+ # With Hungarian pairing of structures → labels
174
+ sf-detect --image page.png --pair --max_dist 300
175
+ ```
176
+
177
+ Key options:
178
+
179
+ | Flag | Default | Description |
180
+ |------|---------|-------------|
181
+ | `--weights` | `runs/.../best.pt` | Model weights |
182
+ | `--conf` | 0.3 | Confidence threshold |
183
+ | `--tile_size` | 1536 | Tile size in pixels |
184
+ | `--no_tile` | off | Run on full image (skips tiling) |
185
+ | `--grayscale` | off | Convert to grayscale before detection |
186
+ | `--pair` | off | Hungarian match structures → labels |
187
+
188
+ ---
189
+
190
+ ### 7. Annotate real PDFs (optional)
191
+
192
+ Web-based annotation tool for creating ground truth from real PDF documents.
193
+
194
+ ```bash
195
+ sf-annotate --out data/real --port 8000
196
+ # then open http://127.0.0.1:8000 in a browser
197
+ ```
198
+
199
+ ---
200
+
201
+ ## Package layout
202
+
203
+ ```
204
+ structflo/cser/ # importable package (from structflo.cser import ...)
205
+ ├── _geometry.py # shared bbox utilities (boxes_intersect, try_place_box)
206
+ ├── config.py # PageConfig dataclass + make_page_config()
207
+ ├── data/
208
+ │ ├── smiles.py # load_smiles(), fetch_smiles_from_chembl_sqlite()
209
+ │ └── distractor_images.py # load_distractor_images(), download_picsum()
210
+ ├── rendering/
211
+ │ ├── chemistry.py # render_structure(), place_structure()
212
+ │ └── text.py # draw_rotated_text(), add_label_near_structure(), load_font()
213
+ ├── distractors/
214
+ │ ├── charts.py # bar / scatter / line / pie chart generators
215
+ │ ├── shapes.py # geometric shapes, noise patches, gradients
216
+ │ └── text_elements.py # prose blocks, captions, footnotes, arrows, tables
217
+ ├── generation/
218
+ │ ├── page.py # make_page(), make_negative_page(), apply_noise()
219
+ │ └── dataset.py # generate_dataset(), save_sample(), CLI entry point
220
+ ├── training/
221
+ │ └── trainer.py # train(), CLI entry point
222
+ ├── inference/
223
+ │ ├── tiling.py # generate_tiles()
224
+ │ ├── nms.py # nms()
225
+ │ ├── pairing.py # pair_detections() via Hungarian matching
226
+ │ └── detector.py # detect_tiled(), detect_full(), draw_boxes(), CLI
227
+ └── viz/
228
+ └── labels.py # visualize_split(), draw_boxes(), CLI entry point
229
+
230
+ annotate/ # Flask-based annotation tool for real PDFs (sf-annotate)
231
+ config/
232
+ ├── data.yaml # YOLO dataset paths
233
+ └── pipeline.yaml
234
+ data/ # data files (gitignored)
235
+ runs/ # training checkpoints (gitignored)
236
+ ```
237
+
238
+ ---
239
+
240
+ ## Data directory layout
241
+
242
+ ```
243
+ data/
244
+ ├── smiles/
245
+ │ └── chembl_smiles.csv # ~20 k SMILES from ChEMBL
246
+ ├── fonts/ # TTF/OTF fonts for label rendering
247
+ ├── distractors/ # ~1 k real photos (sf-download-distractors output)
248
+ ├── generated/ # synthetic dataset (sf-generate output)
249
+ │ ├── train/
250
+ │ └── val/
251
+ └── real/ # manually annotated real pages (sf-annotate output)
252
+ ├── images/
253
+ ├── labels/
254
+ └── ground_truth/
255
+ ```
256
+
257
+ ---
258
+
259
+ ## YOLO label format
260
+
261
+ Each `.txt` label file contains one line per annotated object:
262
+
263
+ ```
264
+ <class_id> <cx> <cy> <w> <h> (all normalised to [0, 1])
265
+ ```
266
+
267
+ | class_id | name |
268
+ |----------|------|
269
+ | 0 | chemical_structure |
270
+ | 1 | compound_label |
271
+
272
+ Ground-truth JSON files in `ground_truth/` contain raw pixel coordinates plus `smiles` and `label_text` for downstream analysis.
273
+
274
+ ---
275
+
276
+ ## Key design decisions
277
+
278
+ - **Union bounding box** — each compound panel is annotated as the union of structure + label (1 class for YOLO). The GT JSON preserves the individual boxes.
279
+ - **No horizontal flips** — chemical handedness matters; `fliplr=0` is enforced during training.
280
+ - **15 % negative pages** — pages with no structures teach the model to output nothing for non-chemistry content.
281
+ - **Multi-DPI generation** — pages at {96, 144, 200, 300} DPI create scale variance, improving robustness to different scanning resolutions.
282
+ - **Tiled inference** — A4 pages at 300 DPI (2480 × 3508 px) are tiled into 1536 px chunks with 20 % overlap to stay within GPU memory.