structflo-cser 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- structflo_cser-0.1.0/.github/workflows/ci.yml +40 -0
- structflo_cser-0.1.0/.github/workflows/publish.yml +36 -0
- structflo_cser-0.1.0/.gitignore +69 -0
- structflo_cser-0.1.0/.python-version +1 -0
- structflo_cser-0.1.0/Makefile +35 -0
- structflo_cser-0.1.0/PKG-INFO +282 -0
- structflo_cser-0.1.0/README.md +258 -0
- structflo_cser-0.1.0/annotate/__main__.py +40 -0
- structflo_cser-0.1.0/annotate/pdf.py +41 -0
- structflo_cser-0.1.0/annotate/server.py +146 -0
- structflo_cser-0.1.0/annotate/storage.py +86 -0
- structflo_cser-0.1.0/annotate/templates/index.html +463 -0
- structflo_cser-0.1.0/asset_scripts/download_chembl.sh +72 -0
- structflo_cser-0.1.0/config/data.yaml +8 -0
- structflo_cser-0.1.0/config/pipeline.yaml +0 -0
- structflo_cser-0.1.0/docs/learned_matcher_plan.md +619 -0
- structflo_cser-0.1.0/docs/lps.md +363 -0
- structflo_cser-0.1.0/docs/publishing-weights.md +179 -0
- structflo_cser-0.1.0/main.py +6 -0
- structflo_cser-0.1.0/notebooks/01-quickstart.ipynb +959 -0
- structflo_cser-0.1.0/notebooks/02-LPS.ipynb +895 -0
- structflo_cser-0.1.0/notebooks/notebook-data/syn-1.jpg +0 -0
- structflo_cser-0.1.0/pyproject.toml +52 -0
- structflo_cser-0.1.0/quick.md +6 -0
- structflo_cser-0.1.0/scripts/publish_weights.py +313 -0
- structflo_cser-0.1.0/structflo/cser/__init__.py +3 -0
- structflo_cser-0.1.0/structflo/cser/_geometry.py +47 -0
- structflo_cser-0.1.0/structflo/cser/config.py +101 -0
- structflo_cser-0.1.0/structflo/cser/data/__init__.py +0 -0
- structflo_cser-0.1.0/structflo/cser/data/distractor_images.py +185 -0
- structflo_cser-0.1.0/structflo/cser/data/smiles.py +155 -0
- structflo_cser-0.1.0/structflo/cser/distractors/__init__.py +76 -0
- structflo_cser-0.1.0/structflo/cser/distractors/charts.py +108 -0
- structflo_cser-0.1.0/structflo/cser/distractors/shapes.py +71 -0
- structflo_cser-0.1.0/structflo/cser/distractors/text_elements.py +1155 -0
- structflo_cser-0.1.0/structflo/cser/generation/__init__.py +0 -0
- structflo_cser-0.1.0/structflo/cser/generation/dataset.py +378 -0
- structflo_cser-0.1.0/structflo/cser/generation/page.py +326 -0
- structflo_cser-0.1.0/structflo/cser/generation/specialty.py +698 -0
- structflo_cser-0.1.0/structflo/cser/generation/tabular.py +470 -0
- structflo_cser-0.1.0/structflo/cser/inference/__init__.py +0 -0
- structflo_cser-0.1.0/structflo/cser/inference/detector.py +279 -0
- structflo_cser-0.1.0/structflo/cser/inference/nms.py +24 -0
- structflo_cser-0.1.0/structflo/cser/inference/pairing.py +49 -0
- structflo_cser-0.1.0/structflo/cser/inference/tiling.py +28 -0
- structflo_cser-0.1.0/structflo/cser/lps/__init__.py +22 -0
- structflo_cser-0.1.0/structflo/cser/lps/dataset.py +396 -0
- structflo_cser-0.1.0/structflo/cser/lps/evaluate.py +215 -0
- structflo_cser-0.1.0/structflo/cser/lps/features.py +151 -0
- structflo_cser-0.1.0/structflo/cser/lps/matcher.py +200 -0
- structflo_cser-0.1.0/structflo/cser/lps/scorer.py +220 -0
- structflo_cser-0.1.0/structflo/cser/lps/train.py +348 -0
- structflo_cser-0.1.0/structflo/cser/pipeline/__init__.py +50 -0
- structflo_cser-0.1.0/structflo/cser/pipeline/cli.py +110 -0
- structflo_cser-0.1.0/structflo/cser/pipeline/matcher.py +73 -0
- structflo_cser-0.1.0/structflo/cser/pipeline/models.py +87 -0
- structflo_cser-0.1.0/structflo/cser/pipeline/ocr.py +61 -0
- structflo_cser-0.1.0/structflo/cser/pipeline/pipeline.py +215 -0
- structflo_cser-0.1.0/structflo/cser/pipeline/smiles_extractor.py +54 -0
- structflo_cser-0.1.0/structflo/cser/rendering/__init__.py +0 -0
- structflo_cser-0.1.0/structflo/cser/rendering/chemistry.py +147 -0
- structflo_cser-0.1.0/structflo/cser/rendering/text.py +346 -0
- structflo_cser-0.1.0/structflo/cser/training/__init__.py +0 -0
- structflo_cser-0.1.0/structflo/cser/training/trainer.py +100 -0
- structflo_cser-0.1.0/structflo/cser/viz/__init__.py +20 -0
- structflo_cser-0.1.0/structflo/cser/viz/detections.py +328 -0
- structflo_cser-0.1.0/structflo/cser/viz/labels.py +163 -0
- structflo_cser-0.1.0/structflo/cser/weights.py +243 -0
- structflo_cser-0.1.0/tests/__init__.py +1 -0
- structflo_cser-0.1.0/tests/test_config.py +46 -0
- structflo_cser-0.1.0/tests/test_generation.py +33 -0
- structflo_cser-0.1.0/tests/test_geometry.py +73 -0
- structflo_cser-0.1.0/tests/test_imports.py +55 -0
- structflo_cser-0.1.0/tests/test_inference.py +138 -0
- structflo_cser-0.1.0/tests/test_models.py +96 -0
- structflo_cser-0.1.0/tests/test_viz.py +255 -0
- structflo_cser-0.1.0/uv.lock +4068 -0
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
check:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
permissions:
|
|
13
|
+
contents: read
|
|
14
|
+
steps:
|
|
15
|
+
- uses: actions/checkout@v4
|
|
16
|
+
|
|
17
|
+
- name: Install uv
|
|
18
|
+
uses: astral-sh/setup-uv@v4
|
|
19
|
+
|
|
20
|
+
- name: Set up Python
|
|
21
|
+
run: uv python install 3.12
|
|
22
|
+
|
|
23
|
+
- name: Install dependencies
|
|
24
|
+
run: uv sync --dev
|
|
25
|
+
|
|
26
|
+
- name: Lint
|
|
27
|
+
run: uv run ruff check structflo/ tests/
|
|
28
|
+
|
|
29
|
+
- name: Format
|
|
30
|
+
run: uv run ruff format --check structflo/ tests/
|
|
31
|
+
|
|
32
|
+
- name: Test with coverage
|
|
33
|
+
run: uv run pytest -q --cov=structflo --cov-report=xml
|
|
34
|
+
|
|
35
|
+
- name: Upload coverage to Codecov
|
|
36
|
+
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
|
|
37
|
+
uses: codecov/codecov-action@v4
|
|
38
|
+
with:
|
|
39
|
+
files: coverage.xml
|
|
40
|
+
token: ${{ secrets.CODECOV_TOKEN }}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- "v*"
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
publish:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
permissions:
|
|
12
|
+
contents: read
|
|
13
|
+
id-token: write # trusted publishing
|
|
14
|
+
steps:
|
|
15
|
+
- uses: actions/checkout@v4
|
|
16
|
+
|
|
17
|
+
- name: Install uv
|
|
18
|
+
uses: astral-sh/setup-uv@v4
|
|
19
|
+
|
|
20
|
+
- name: Set up Python
|
|
21
|
+
run: uv python install 3.12
|
|
22
|
+
|
|
23
|
+
- name: Install dependencies
|
|
24
|
+
run: uv sync --dev
|
|
25
|
+
|
|
26
|
+
- name: Lint
|
|
27
|
+
run: uv run ruff check structflo/ tests/
|
|
28
|
+
|
|
29
|
+
- name: Test
|
|
30
|
+
run: uv run pytest -q
|
|
31
|
+
|
|
32
|
+
- name: Build
|
|
33
|
+
run: uv build
|
|
34
|
+
|
|
35
|
+
- name: Publish to PyPI
|
|
36
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# ── Python ────────────────────────────────────────────────────────────────────
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[oc]
|
|
4
|
+
*.pyd
|
|
5
|
+
build/
|
|
6
|
+
dist/
|
|
7
|
+
wheels/
|
|
8
|
+
*.egg-info/
|
|
9
|
+
.eggs/
|
|
10
|
+
pip-wheel-metadata/
|
|
11
|
+
MANIFEST
|
|
12
|
+
|
|
13
|
+
# ── Virtual environments ───────────────────────────────────────────────────────
|
|
14
|
+
.venv/
|
|
15
|
+
venv/
|
|
16
|
+
env/
|
|
17
|
+
|
|
18
|
+
# ── Testing & linting ─────────────────────────────────────────────────────────
|
|
19
|
+
.pytest_cache/
|
|
20
|
+
.ruff_cache/
|
|
21
|
+
.mypy_cache/
|
|
22
|
+
.coverage
|
|
23
|
+
coverage.xml
|
|
24
|
+
htmlcov/
|
|
25
|
+
|
|
26
|
+
# ── Jupyter ───────────────────────────────────────────────────────────────────
|
|
27
|
+
.ipynb_checkpoints/
|
|
28
|
+
*.nbconvert/
|
|
29
|
+
|
|
30
|
+
# ── Editors & IDEs ────────────────────────────────────────────────────────────
|
|
31
|
+
.vscode/
|
|
32
|
+
.idea/
|
|
33
|
+
*.swp
|
|
34
|
+
*.swo
|
|
35
|
+
*~
|
|
36
|
+
.DS_Store
|
|
37
|
+
Thumbs.db
|
|
38
|
+
|
|
39
|
+
# ── Project: training & inference outputs ────────────────────────────────────
|
|
40
|
+
runs/
|
|
41
|
+
detections/
|
|
42
|
+
archive/
|
|
43
|
+
|
|
44
|
+
# ── Project: datasets & generated data ───────────────────────────────────────
|
|
45
|
+
/data/
|
|
46
|
+
|
|
47
|
+
# ── Model weights ─────────────────────────────────────────────────────────────
|
|
48
|
+
# Weights are published to HF Hub — never commit .pt files directly.
|
|
49
|
+
# Note: weights.py (the registry) is tracked in git; only the weight binaries below are ignored.
|
|
50
|
+
*.pt
|
|
51
|
+
*.pth
|
|
52
|
+
*.ckpt
|
|
53
|
+
*.safetensors
|
|
54
|
+
|
|
55
|
+
# ── Secrets & credentials ─────────────────────────────────────────────────────
|
|
56
|
+
.env
|
|
57
|
+
.env.*
|
|
58
|
+
*.pem
|
|
59
|
+
*.key
|
|
60
|
+
secrets/
|
|
61
|
+
|
|
62
|
+
# ── Logs ──────────────────────────────────────────────────────────────────────
|
|
63
|
+
*.log
|
|
64
|
+
logs/
|
|
65
|
+
|
|
66
|
+
# ── Misc ──────────────────────────────────────────────────────────────────────
|
|
67
|
+
.claude/
|
|
68
|
+
tmp/
|
|
69
|
+
scratch/
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.12
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
.PHONY: install lint format check test clean build
|
|
2
|
+
|
|
3
|
+
## Install all dependencies (including dev)
|
|
4
|
+
install:
|
|
5
|
+
uv sync --dev
|
|
6
|
+
|
|
7
|
+
## Run ruff linter
|
|
8
|
+
lint:
|
|
9
|
+
uv run ruff check structflo/ tests/
|
|
10
|
+
|
|
11
|
+
## Auto-fix lint issues
|
|
12
|
+
fix:
|
|
13
|
+
uv run ruff check --fix structflo/ tests/
|
|
14
|
+
|
|
15
|
+
## Format code with ruff
|
|
16
|
+
format:
|
|
17
|
+
uv run ruff format structflo/ tests/
|
|
18
|
+
|
|
19
|
+
## Check formatting + lint (CI-friendly, no changes)
|
|
20
|
+
check:
|
|
21
|
+
uv run ruff format --check structflo/ tests/
|
|
22
|
+
uv run ruff check structflo/ tests/
|
|
23
|
+
|
|
24
|
+
## Run tests
|
|
25
|
+
test:
|
|
26
|
+
uv run pytest -q
|
|
27
|
+
|
|
28
|
+
## Remove build artifacts
|
|
29
|
+
clean:
|
|
30
|
+
rm -rf dist/ build/ *.egg-info .pytest_cache
|
|
31
|
+
find . -type d -name __pycache__ -exec rm -rf {} +
|
|
32
|
+
|
|
33
|
+
## Build package
|
|
34
|
+
build:
|
|
35
|
+
uv build
|
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: structflo-cser
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: YOLO-based chemical structure + label detector with synthetic data generation
|
|
5
|
+
Requires-Python: >=3.11
|
|
6
|
+
Requires-Dist: chembl-webresource-client>=0.10.9
|
|
7
|
+
Requires-Dist: decimer>=2.8.0
|
|
8
|
+
Requires-Dist: easyocr>=1.7
|
|
9
|
+
Requires-Dist: flask>=3.1.3
|
|
10
|
+
Requires-Dist: huggingface-hub>=0.24.0
|
|
11
|
+
Requires-Dist: matplotlib>=3.10.8
|
|
12
|
+
Requires-Dist: numpy>=2.4.2
|
|
13
|
+
Requires-Dist: opencv-python-headless>=4.13.0.92
|
|
14
|
+
Requires-Dist: packaging>=23.0
|
|
15
|
+
Requires-Dist: pandas>=2.0.0
|
|
16
|
+
Requires-Dist: pillow>=12.1.1
|
|
17
|
+
Requires-Dist: pymupdf>=1.27.1
|
|
18
|
+
Requires-Dist: rdkit>=2025.9.5
|
|
19
|
+
Requires-Dist: requests>=2.31.0
|
|
20
|
+
Requires-Dist: scipy>=1.17.0
|
|
21
|
+
Requires-Dist: tqdm>=4.67.3
|
|
22
|
+
Requires-Dist: ultralytics>=8.4.14
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
|
|
25
|
+
# structflo-cser
|
|
26
|
+
|
|
27
|
+
YOLO11l-based detector for chemical structures and their compound label IDs in scientific documents.
|
|
28
|
+
|
|
29
|
+
Part of the **structflo** library. Import as:
|
|
30
|
+
```python
|
|
31
|
+
from structflo.cser.pipeline import ChemPipeline
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
**Detection target:** A single bounding box (`compound_panel`) enclosing the union of a rendered chemical structure and its nearby label ID (e.g. `CHEMBL12345`).
|
|
35
|
+
|
|
36
|
+
---
|
|
37
|
+
|
|
38
|
+
## Installation
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
uv pip install -e .
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
This installs all dependencies and registers the `sf-*` CLI commands on your PATH.
|
|
45
|
+
|
|
46
|
+
---
|
|
47
|
+
|
|
48
|
+
## Pipeline
|
|
49
|
+
|
|
50
|
+
```
|
|
51
|
+
1. Fetch SMILES → sf-fetch-smiles
|
|
52
|
+
2. Download distractors → sf-download-distractors (optional but recommended)
|
|
53
|
+
3. Generate dataset → sf-generate
|
|
54
|
+
4. Visualize labels → sf-viz (optional QA check)
|
|
55
|
+
5. Train YOLO → sf-train
|
|
56
|
+
6. Run inference → sf-detect
|
|
57
|
+
7. Annotate real PDFs → sf-annotate (optional)
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
## Commands
|
|
63
|
+
|
|
64
|
+
### 1. Fetch SMILES from ChEMBL
|
|
65
|
+
|
|
66
|
+
Extracts ~20 k small-molecule SMILES from a local [ChEMBL SQLite database](https://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/).
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
sf-fetch-smiles \
|
|
70
|
+
--db chembl_35/chembl_35_sqlite/chembl_35.db \
|
|
71
|
+
--output data/smiles/chembl_smiles.csv \
|
|
72
|
+
--n 20000
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Output: `data/smiles/chembl_smiles.csv`
|
|
76
|
+
|
|
77
|
+
---
|
|
78
|
+
|
|
79
|
+
### 2. Download distractor images
|
|
80
|
+
|
|
81
|
+
Downloads real photographs from [Lorem Picsum](https://picsum.photos/) to use as hard-negative distractors during page generation.
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
sf-download-distractors --out data/distractors --count 1000
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
---
|
|
88
|
+
|
|
89
|
+
### 3. Generate synthetic dataset
|
|
90
|
+
|
|
91
|
+
Generates document-like pages (A4 @ 300 DPI or slide format) containing chemical structures, compound labels, and distractor elements.
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
sf-generate \
|
|
95
|
+
--smiles data/smiles/chembl_smiles.csv \
|
|
96
|
+
--out data/generated \
|
|
97
|
+
--num-train 2000 --num-val 400 \
|
|
98
|
+
--fonts-dir data/fonts \
|
|
99
|
+
--distractors-dir data/distractors \
|
|
100
|
+
--dpi 96,144,200,300 \
|
|
101
|
+
--workers 0
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
Key options:
|
|
105
|
+
|
|
106
|
+
| Flag | Default | Description |
|
|
107
|
+
|------|---------|-------------|
|
|
108
|
+
| `--num-train` | 2000 | Number of training pages |
|
|
109
|
+
| `--num-val` | 200 | Number of validation pages |
|
|
110
|
+
| `--dpi` | `96,144,200,300` | DPI values randomly sampled per page |
|
|
111
|
+
| `--grayscale` / `--no-grayscale` | on | Convert pages to grayscale |
|
|
112
|
+
| `--workers` | 0 (all CPUs) | Parallel workers; use `1` to disable multiprocessing |
|
|
113
|
+
|
|
114
|
+
**Output structure:**
|
|
115
|
+
```
|
|
116
|
+
data/generated/
|
|
117
|
+
├── train/
|
|
118
|
+
│ ├── images/ (JPEG pages)
|
|
119
|
+
│ ├── labels/ (YOLO .txt — union bbox per compound panel)
|
|
120
|
+
│ └── ground_truth/ (JSON with split struct_bbox / label_bbox / smiles)
|
|
121
|
+
└── val/
|
|
122
|
+
├── images/
|
|
123
|
+
├── labels/
|
|
124
|
+
└── ground_truth/
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
---
|
|
128
|
+
|
|
129
|
+
### 4. Visualize labels (QA)
|
|
130
|
+
|
|
131
|
+
Overlays YOLO bounding boxes on a random sample of generated pages.
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
sf-viz --split both --n 30 --out data/viz
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
Green boxes = `chemical_structure`, blue boxes = `compound_label`.
|
|
138
|
+
|
|
139
|
+
---
|
|
140
|
+
|
|
141
|
+
### 5. Train
|
|
142
|
+
|
|
143
|
+
Fine-tunes YOLO11l on the generated dataset.
|
|
144
|
+
|
|
145
|
+
```bash
|
|
146
|
+
sf-train --epochs 50 --imgsz 1280 --batch 8
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
Key options:
|
|
150
|
+
|
|
151
|
+
| Flag | Default | Description |
|
|
152
|
+
|------|---------|-------------|
|
|
153
|
+
| `--weights` | `yolo11l.pt` | Pretrained backbone |
|
|
154
|
+
| `--imgsz` | 1280 | Training resolution |
|
|
155
|
+
| `--batch` | 8 | Batch size (safe for A6000 48 GB) |
|
|
156
|
+
| `--resume` | — | Path to `last.pt` to resume an interrupted run |
|
|
157
|
+
|
|
158
|
+
**Output:** `runs/labels_detect/yolo11l_panels/weights/best.pt`
|
|
159
|
+
|
|
160
|
+
---
|
|
161
|
+
|
|
162
|
+
### 6. Detect
|
|
163
|
+
|
|
164
|
+
Runs the trained detector on images using sliding-window tiling (1536 px tiles, 20 % overlap).
|
|
165
|
+
|
|
166
|
+
```bash
|
|
167
|
+
# Single image
|
|
168
|
+
sf-detect --image page.png
|
|
169
|
+
|
|
170
|
+
# Directory of images
|
|
171
|
+
sf-detect --image_dir data/real/images/ --out detections/
|
|
172
|
+
|
|
173
|
+
# With Hungarian pairing of structures → labels
|
|
174
|
+
sf-detect --image page.png --pair --max_dist 300
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
Key options:
|
|
178
|
+
|
|
179
|
+
| Flag | Default | Description |
|
|
180
|
+
|------|---------|-------------|
|
|
181
|
+
| `--weights` | `runs/.../best.pt` | Model weights |
|
|
182
|
+
| `--conf` | 0.3 | Confidence threshold |
|
|
183
|
+
| `--tile_size` | 1536 | Tile size in pixels |
|
|
184
|
+
| `--no_tile` | off | Run on full image (skips tiling) |
|
|
185
|
+
| `--grayscale` | off | Convert to grayscale before detection |
|
|
186
|
+
| `--pair` | off | Pair structures with labels via Hungarian matching |
|
|
187
|
+
|
|
188
|
+
---
|
|
189
|
+
|
|
190
|
+
### 7. Annotate real PDFs (optional)
|
|
191
|
+
|
|
192
|
+
Web-based annotation tool for creating ground truth from real PDF documents.
|
|
193
|
+
|
|
194
|
+
```bash
|
|
195
|
+
sf-annotate --out data/real --port 8000
|
|
196
|
+
# then open http://127.0.0.1:8000 in a browser
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
---
|
|
200
|
+
|
|
201
|
+
## Package layout
|
|
202
|
+
|
|
203
|
+
```
|
|
204
|
+
structflo/cser/ # importable package (from structflo.cser import ...)
|
|
205
|
+
├── _geometry.py # shared bbox utilities (boxes_intersect, try_place_box)
|
|
206
|
+
├── config.py # PageConfig dataclass + make_page_config()
|
|
207
|
+
├── data/
|
|
208
|
+
│ ├── smiles.py # load_smiles(), fetch_smiles_from_chembl_sqlite()
|
|
209
|
+
│ └── distractor_images.py # load_distractor_images(), download_picsum()
|
|
210
|
+
├── rendering/
|
|
211
|
+
│ ├── chemistry.py # render_structure(), place_structure()
|
|
212
|
+
│ └── text.py # draw_rotated_text(), add_label_near_structure(), load_font()
|
|
213
|
+
├── distractors/
|
|
214
|
+
│ ├── charts.py # bar / scatter / line / pie chart generators
|
|
215
|
+
│ ├── shapes.py # geometric shapes, noise patches, gradients
|
|
216
|
+
│ └── text_elements.py # prose blocks, captions, footnotes, arrows, tables
|
|
217
|
+
├── generation/
|
|
218
|
+
│ ├── page.py # make_page(), make_negative_page(), apply_noise()
|
|
219
|
+
│ └── dataset.py # generate_dataset(), save_sample(), CLI entry point
|
|
220
|
+
├── training/
|
|
221
|
+
│ └── trainer.py # train(), CLI entry point
|
|
222
|
+
├── inference/
|
|
223
|
+
│ ├── tiling.py # generate_tiles()
|
|
224
|
+
│ ├── nms.py # nms()
|
|
225
|
+
│ ├── pairing.py # pair_detections() via Hungarian matching
|
|
226
|
+
│ └── detector.py # detect_tiled(), detect_full(), draw_boxes(), CLI
|
|
227
|
+
└── viz/
|
|
228
|
+
└── labels.py # visualize_split(), draw_boxes(), CLI entry point
|
|
229
|
+
|
|
230
|
+
annotate/ # Flask annotation tool
|
|
231
|
+
config/
|
|
232
|
+
├── data.yaml # YOLO dataset paths
|
|
233
|
+
└── pipeline.yaml
|
|
234
|
+
data/ # data files (gitignored)
|
|
235
|
+
runs/ # training checkpoints (gitignored)
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
---
|
|
239
|
+
|
|
240
|
+
## Data directory layout
|
|
241
|
+
|
|
242
|
+
```
|
|
243
|
+
data/
|
|
244
|
+
├── smiles/
|
|
245
|
+
│ └── chembl_smiles.csv # ~20 k SMILES from ChEMBL
|
|
246
|
+
├── fonts/ # TTF/OTF fonts for label rendering
|
|
247
|
+
├── distractors/ # ~1 k real photos (sf-download-distractors output)
|
|
248
|
+
├── generated/ # synthetic dataset (sf-generate output)
|
|
249
|
+
│ ├── train/
|
|
250
|
+
│ └── val/
|
|
251
|
+
└── real/ # manually annotated real pages (sf-annotate output)
|
|
252
|
+
├── images/
|
|
253
|
+
├── labels/
|
|
254
|
+
└── ground_truth/
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
---
|
|
258
|
+
|
|
259
|
+
## YOLO label format
|
|
260
|
+
|
|
261
|
+
Each `.txt` label file contains one line per annotated object:
|
|
262
|
+
|
|
263
|
+
```
|
|
264
|
+
<class_id> <cx> <cy> <w> <h> (cx, cy, w, h normalised to [0, 1])
|
|
265
|
+
```
|
|
266
|
+
|
|
267
|
+
| class_id | name |
|
|
268
|
+
|----------|------|
|
|
269
|
+
| 0 | chemical_structure |
|
|
270
|
+
| 1 | compound_label |
|
|
271
|
+
|
|
272
|
+
Ground-truth JSON files in `ground_truth/` contain raw pixel coordinates plus `smiles` and `label_text` for downstream analysis.
|
|
273
|
+
|
|
274
|
+
---
|
|
275
|
+
|
|
276
|
+
## Key design decisions
|
|
277
|
+
|
|
278
|
+
- **Union bounding box** — each compound panel is annotated as the union of structure + label (1 class for YOLO). The GT JSON preserves the individual boxes.
|
|
279
|
+
- **No horizontal flips** — chemical handedness matters; `fliplr=0` is enforced during training.
|
|
280
|
+
- **15 % negative pages** — pages with no structures teach the model to output nothing for non-chemistry content.
|
|
281
|
+
- **Multi-DPI generation** — pages at {96, 144, 200, 300} DPI create scale variance, improving robustness to different scanning resolutions.
|
|
282
|
+
- **Tiled inference** — A4 pages at 300 DPI (2480 × 3508 px) are tiled into 1536 px chunks with 20 % overlap to stay within GPU memory.
|