trace-digitiser 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- trace_digitiser-0.1.0/PKG-INFO +176 -0
- trace_digitiser-0.1.0/README.md +157 -0
- trace_digitiser-0.1.0/pyproject.toml +42 -0
- trace_digitiser-0.1.0/setup.cfg +4 -0
- trace_digitiser-0.1.0/src/trace_digitiser/__init__.py +222 -0
- trace_digitiser-0.1.0/src/trace_digitiser/calibration.py +283 -0
- trace_digitiser-0.1.0/src/trace_digitiser/cli.py +123 -0
- trace_digitiser-0.1.0/src/trace_digitiser/diagnostics.py +144 -0
- trace_digitiser-0.1.0/src/trace_digitiser/digitise.py +85 -0
- trace_digitiser-0.1.0/src/trace_digitiser/geometry.py +61 -0
- trace_digitiser-0.1.0/src/trace_digitiser/io.py +74 -0
- trace_digitiser-0.1.0/src/trace_digitiser/line_detection.py +182 -0
- trace_digitiser-0.1.0/src/trace_digitiser/models.py +148 -0
- trace_digitiser-0.1.0/src/trace_digitiser/ocr.py +240 -0
- trace_digitiser-0.1.0/src/trace_digitiser/panel_detection.py +816 -0
- trace_digitiser-0.1.0/src/trace_digitiser/summarise.py +68 -0
- trace_digitiser-0.1.0/src/trace_digitiser/synthetic.py +206 -0
- trace_digitiser-0.1.0/src/trace_digitiser/trace_detection.py +337 -0
- trace_digitiser-0.1.0/src/trace_digitiser/x_calibration.py +228 -0
- trace_digitiser-0.1.0/src/trace_digitiser.egg-info/PKG-INFO +176 -0
- trace_digitiser-0.1.0/src/trace_digitiser.egg-info/SOURCES.txt +27 -0
- trace_digitiser-0.1.0/src/trace_digitiser.egg-info/dependency_links.txt +1 -0
- trace_digitiser-0.1.0/src/trace_digitiser.egg-info/entry_points.txt +2 -0
- trace_digitiser-0.1.0/src/trace_digitiser.egg-info/requires.txt +11 -0
- trace_digitiser-0.1.0/src/trace_digitiser.egg-info/top_level.txt +1 -0
- trace_digitiser-0.1.0/tests/test_geometry.py +45 -0
- trace_digitiser-0.1.0/tests/test_integration.py +250 -0
- trace_digitiser-0.1.0/tests/test_panel_detection.py +74 -0
- trace_digitiser-0.1.0/tests/test_trace_detection.py +35 -0
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: trace-digitiser
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Template-free computer-vision digitisation pipeline for raster scientific line plots.
|
|
5
|
+
Author: Trace Digitiser Contributors
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: opencv-python-headless>=4.8
|
|
10
|
+
Requires-Dist: pytesseract>=0.3.10
|
|
11
|
+
Requires-Dist: Pillow>=10.0
|
|
12
|
+
Requires-Dist: pandas>=2.0
|
|
13
|
+
Requires-Dist: numpy>=1.24
|
|
14
|
+
Requires-Dist: matplotlib>=3.7
|
|
15
|
+
Provides-Extra: dev
|
|
16
|
+
Requires-Dist: pytest>=7.4; extra == "dev"
|
|
17
|
+
Requires-Dist: pytest-cov>=4.1; extra == "dev"
|
|
18
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
19
|
+
|
|
20
|
+
# trace-digitiser
|
|
21
|
+
|
|
22
|
+
Template-free computer-vision digitisation pipeline for raster scientific line plots.
|
|
23
|
+
|
|
24
|
+
Given a raster image of a scientific figure (PNG, JPEG, etc.), this tool detects plot panels, reads y-axis tick labels via OCR, segments coloured traces, and exports digitised data as CSV files — all without requiring manual calibration points, known trace colours, or panel coordinates. The user supplies only a high-level layout hint (e.g. "stacked", "horizontal", "grid").
|
|
25
|
+
|
|
26
|
+
## Installation
|
|
27
|
+
|
|
28
|
+
Requires Python 3.10+ and [Tesseract OCR](https://github.com/tesseract-ocr/tesseract).
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
# Install Tesseract (Ubuntu/Debian)
|
|
32
|
+
sudo apt-get install tesseract-ocr
|
|
33
|
+
|
|
34
|
+
# Install Tesseract (macOS)
|
|
35
|
+
brew install tesseract
|
|
36
|
+
|
|
37
|
+
# Install the package
|
|
38
|
+
pip install -e .
|
|
39
|
+
|
|
40
|
+
# With development dependencies
|
|
41
|
+
pip install -e ".[dev]"
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Quick start
|
|
45
|
+
|
|
46
|
+
### Python API
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
from trace_digitiser import digitise
|
|
50
|
+
|
|
51
|
+
result = digitise(
|
|
52
|
+
"figure.jpg",
|
|
53
|
+
layout_mode="stacked",
|
|
54
|
+
expected_rows=2,
|
|
55
|
+
expected_cols=1,
|
|
56
|
+
output_dir="outputs",
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
# Digitised traces as a DataFrame
|
|
60
|
+
print(result.trace_data.head())
|
|
61
|
+
|
|
62
|
+
# Panel metadata
|
|
63
|
+
for panel in result.panels:
|
|
64
|
+
print(f"Panel {panel.panel_id}: {panel.width}×{panel.height}px, "
|
|
65
|
+
f"calibration={panel.calibration.scale_type}")
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### Command line
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
# Single image
|
|
72
|
+
trace-digitiser figure.jpg --layout stacked --rows 2 --cols 1 -o outputs/
|
|
73
|
+
|
|
74
|
+
# Batch processing
|
|
75
|
+
trace-digitiser figures/*.jpg --layout auto -o results/
|
|
76
|
+
|
|
77
|
+
# Generate synthetic test figures
|
|
78
|
+
trace-digitiser --generate-test-figures -o test_figures/
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### Layout modes
|
|
82
|
+
|
|
83
|
+
| Mode | Use case | Example |
|
|
84
|
+
|------|----------|---------|
|
|
85
|
+
| `single` | One plot panel | `--layout single` |
|
|
86
|
+
| `stacked` | Vertically stacked panels | `--layout stacked --rows 2 --cols 1` |
|
|
87
|
+
| `horizontal` | Side-by-side panels | `--layout horizontal --rows 1 --cols 3` |
|
|
88
|
+
| `grid` | Row×column subplot grid | `--layout grid --rows 2 --cols 2` |
|
|
89
|
+
| `auto` | Unconstrained detection | `--layout auto` |
|
|
90
|
+
|
|
91
|
+
## Output files
|
|
92
|
+
|
|
93
|
+
For input `figure.jpg` with default settings:
|
|
94
|
+
|
|
95
|
+
| File | Description |
|
|
96
|
+
|------|-------------|
|
|
97
|
+
| `figure_automated_digitised_trace_by_column.csv` | One row per x-pixel column per trace, with `center_estimate`, `upper_envelope`, `lower_envelope` |
|
|
98
|
+
| `figure_automated_panel_metadata.csv` | Panel coordinates, calibration parameters, detected x-labels |
|
|
99
|
+
| `figure_automated_digitised_summary_by_label.csv` | Summary statistics per detected x-label interval (written only if labels are found) |
|
|
100
|
+
|
|
101
|
+
## Project structure
|
|
102
|
+
|
|
103
|
+
```
|
|
104
|
+
trace_digitiser/
|
|
105
|
+
├── pyproject.toml
|
|
106
|
+
├── src/trace_digitiser/
|
|
107
|
+
│ ├── __init__.py # Public API: digitise()
|
|
108
|
+
│ ├── models.py # Dataclasses: Panel, Calibration, Trace, DigitiserResult
|
|
109
|
+
│ ├── io.py # Image loading, CSV output
|
|
110
|
+
│ ├── geometry.py # Box area/IoU/containment helpers
|
|
111
|
+
│ ├── line_detection.py # Horizontal and vertical line detectors
|
|
112
|
+
│ ├── panel_detection.py # Candidate generation, deduplication, layout selection
|
|
113
|
+
│ ├── calibration.py # Y-axis OCR calibration (linear + log) and cross-row propagation
|
|
114
|
+
│ ├── x_calibration.py # X-axis OCR calibration (numeric ticks)
|
|
115
|
+
│ ├── ocr.py # Tesseract OCR wrappers for tick and x-label reading
|
|
116
|
+
│ ├── trace_detection.py # HSV/CIELAB colour segmentation + achromatic trace detection
|
|
117
|
+
│ ├── digitise.py # Column-by-column trace digitisation
|
|
118
|
+
│ ├── summarise.py # Interval-label summarisation
|
|
119
|
+
│ ├── diagnostics.py # Debug overlays and diagnostic file output
|
|
120
|
+
│ ├── synthetic.py # Synthetic test-figure generation
|
|
121
|
+
│ └── cli.py # Command-line interface
|
|
122
|
+
└── tests/
|
|
123
|
+
├── conftest.py # Shared fixtures (synthetic images)
|
|
124
|
+
├── test_geometry.py
|
|
125
|
+
├── test_panel_detection.py
|
|
126
|
+
├── test_trace_detection.py
|
|
127
|
+
└── test_integration.py # Panel count, calibration quality, trace RMSE
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
## Development
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
# Run tests
|
|
134
|
+
pytest
|
|
135
|
+
|
|
136
|
+
# Run tests with coverage
|
|
137
|
+
pytest --cov=trace_digitiser
|
|
138
|
+
|
|
139
|
+
# Lint
|
|
140
|
+
ruff check src/ tests/
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
## Limitations
|
|
144
|
+
|
|
145
|
+
This tool digitises visible pixels from raster images. It does not recover original raw data. Key limitations:
|
|
146
|
+
|
|
147
|
+
- **Only linear and log y-axes are supported** — broken, dual, and other nonlinear axes are not supported yet.
|
|
148
|
+
- **X-axis calibration is best-effort** — numeric x-ticks are OCR'd when possible; otherwise x-values are normalised 0–1 or labelled by interval.
|
|
149
|
+
- **Black/grey traces partially supported** — achromatic trace detection is available but works best when traces have enough contrast against the background.
|
|
150
|
+
- **Requires visible structure** — axes, borders, or gridlines must be present for panel detection.
|
|
151
|
+
- **OCR fragility** — small, rotated, or low-contrast tick labels may fail.
|
|
152
|
+
|
|
153
|
+
## Changelog
|
|
154
|
+
|
|
155
|
+
### v0.1.0
|
|
156
|
+
|
|
157
|
+
**Refactored from notebook to installable package** with 15 modules, dataclasses, CLI, and test suite.
|
|
158
|
+
|
|
159
|
+
**Panel detection improvements:**
|
|
160
|
+
- Hint-aware `split_y_clusters` — uses the user's `expected_rows` to split evenly-spaced gridlines at the largest gaps, even when the inter-panel gap is only marginally wider than intra-panel gaps.
|
|
161
|
+
- Post-hoc panel subdivision — `apply_layout_hint` can split oversized candidates at their gridlines when initial detection yields fewer panels than expected.
|
|
162
|
+
- Improved grid detection — properly cross-matches row/column structure.
|
|
163
|
+
|
|
164
|
+
**Trace detection improvements:**
|
|
165
|
+
- CIELAB clustering mode (`use_lab=True`) — clusters non-background pixels in perceptually uniform colour space using mini-batch k-means.
|
|
166
|
+
- Achromatic trace detection — a separate pass detects black/grey traces by looking for low-saturation, horizontally continuous structures distinct from grid/axis lines.
|
|
167
|
+
|
|
168
|
+
**Calibration improvements:**
|
|
169
|
+
- Log-scale y-axis detection — when OCR'd tick values are better explained by `log10(y_value) = a * y_pixel + b`, the calibration uses log scale automatically.
|
|
170
|
+
- X-axis calibration — OCRs numeric x-axis ticks, fits `x_value = a * x_pixel + b`, and adds an `x_value` column to the trace CSV.
|
|
171
|
+
|
|
172
|
+
**Infrastructure:**
|
|
173
|
+
- Diagnostic overlays are saved to files when `save_diagnostics=True`.
|
|
174
|
+
- Tesseract error handling — graceful recovery from OCR crashes on degenerate crops.
|
|
175
|
+
- No Colab dependency.
|
|
176
|
+
- Quantitative test suite with panel count, calibration quality, and trace RMSE metrics against synthetic ground truth.
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
# trace-digitiser
|
|
2
|
+
|
|
3
|
+
Template-free computer-vision digitisation pipeline for raster scientific line plots.
|
|
4
|
+
|
|
5
|
+
Given a raster image of a scientific figure (PNG, JPEG, etc.), this tool detects plot panels, reads y-axis tick labels via OCR, segments coloured traces, and exports digitised data as CSV files — all without requiring manual calibration points, known trace colours, or panel coordinates. The user supplies only a high-level layout hint (e.g. "stacked", "horizontal", "grid").
|
|
6
|
+
|
|
7
|
+
## Installation
|
|
8
|
+
|
|
9
|
+
Requires Python 3.10+ and [Tesseract OCR](https://github.com/tesseract-ocr/tesseract).
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
# Install Tesseract (Ubuntu/Debian)
|
|
13
|
+
sudo apt-get install tesseract-ocr
|
|
14
|
+
|
|
15
|
+
# Install Tesseract (macOS)
|
|
16
|
+
brew install tesseract
|
|
17
|
+
|
|
18
|
+
# Install the package
|
|
19
|
+
pip install -e .
|
|
20
|
+
|
|
21
|
+
# With development dependencies
|
|
22
|
+
pip install -e ".[dev]"
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Quick start
|
|
26
|
+
|
|
27
|
+
### Python API
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
from trace_digitiser import digitise
|
|
31
|
+
|
|
32
|
+
result = digitise(
|
|
33
|
+
"figure.jpg",
|
|
34
|
+
layout_mode="stacked",
|
|
35
|
+
expected_rows=2,
|
|
36
|
+
expected_cols=1,
|
|
37
|
+
output_dir="outputs",
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
# Digitised traces as a DataFrame
|
|
41
|
+
print(result.trace_data.head())
|
|
42
|
+
|
|
43
|
+
# Panel metadata
|
|
44
|
+
for panel in result.panels:
|
|
45
|
+
print(f"Panel {panel.panel_id}: {panel.width}×{panel.height}px, "
|
|
46
|
+
f"calibration={panel.calibration.scale_type}")
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### Command line
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
# Single image
|
|
53
|
+
trace-digitiser figure.jpg --layout stacked --rows 2 --cols 1 -o outputs/
|
|
54
|
+
|
|
55
|
+
# Batch processing
|
|
56
|
+
trace-digitiser figures/*.jpg --layout auto -o results/
|
|
57
|
+
|
|
58
|
+
# Generate synthetic test figures
|
|
59
|
+
trace-digitiser --generate-test-figures -o test_figures/
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
### Layout modes
|
|
63
|
+
|
|
64
|
+
| Mode | Use case | Example |
|
|
65
|
+
|------|----------|---------|
|
|
66
|
+
| `single` | One plot panel | `--layout single` |
|
|
67
|
+
| `stacked` | Vertically stacked panels | `--layout stacked --rows 2 --cols 1` |
|
|
68
|
+
| `horizontal` | Side-by-side panels | `--layout horizontal --rows 1 --cols 3` |
|
|
69
|
+
| `grid` | Row×column subplot grid | `--layout grid --rows 2 --cols 2` |
|
|
70
|
+
| `auto` | Unconstrained detection | `--layout auto` |
|
|
71
|
+
|
|
72
|
+
## Output files
|
|
73
|
+
|
|
74
|
+
For input `figure.jpg` with default settings:
|
|
75
|
+
|
|
76
|
+
| File | Description |
|
|
77
|
+
|------|-------------|
|
|
78
|
+
| `figure_automated_digitised_trace_by_column.csv` | One row per x-pixel column per trace, with `center_estimate`, `upper_envelope`, `lower_envelope` |
|
|
79
|
+
| `figure_automated_panel_metadata.csv` | Panel coordinates, calibration parameters, detected x-labels |
|
|
80
|
+
| `figure_automated_digitised_summary_by_label.csv` | Summary statistics per detected x-label interval (written only if labels are found) |
|
|
81
|
+
|
|
82
|
+
## Project structure
|
|
83
|
+
|
|
84
|
+
```
|
|
85
|
+
trace_digitiser/
|
|
86
|
+
├── pyproject.toml
|
|
87
|
+
├── src/trace_digitiser/
|
|
88
|
+
│ ├── __init__.py # Public API: digitise()
|
|
89
|
+
│ ├── models.py # Dataclasses: Panel, Calibration, Trace, DigitiserResult
|
|
90
|
+
│ ├── io.py # Image loading, CSV output
|
|
91
|
+
│ ├── geometry.py # Box area/IoU/containment helpers
|
|
92
|
+
│ ├── line_detection.py # Horizontal and vertical line detectors
|
|
93
|
+
│ ├── panel_detection.py # Candidate generation, deduplication, layout selection
|
|
94
|
+
│ ├── calibration.py # Y-axis OCR calibration (linear + log) and cross-row propagation
|
|
95
|
+
│ ├── x_calibration.py # X-axis OCR calibration (numeric ticks)
|
|
96
|
+
│ ├── ocr.py # Tesseract OCR wrappers for tick and x-label reading
|
|
97
|
+
│ ├── trace_detection.py # HSV/CIELAB colour segmentation + achromatic trace detection
|
|
98
|
+
│ ├── digitise.py # Column-by-column trace digitisation
|
|
99
|
+
│ ├── summarise.py # Interval-label summarisation
|
|
100
|
+
│ ├── diagnostics.py # Debug overlays and diagnostic file output
|
|
101
|
+
│ ├── synthetic.py # Synthetic test-figure generation
|
|
102
|
+
│ └── cli.py # Command-line interface
|
|
103
|
+
└── tests/
|
|
104
|
+
├── conftest.py # Shared fixtures (synthetic images)
|
|
105
|
+
├── test_geometry.py
|
|
106
|
+
├── test_panel_detection.py
|
|
107
|
+
├── test_trace_detection.py
|
|
108
|
+
└── test_integration.py # Panel count, calibration quality, trace RMSE
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## Development
|
|
112
|
+
|
|
113
|
+
```bash
|
|
114
|
+
# Run tests
|
|
115
|
+
pytest
|
|
116
|
+
|
|
117
|
+
# Run tests with coverage
|
|
118
|
+
pytest --cov=trace_digitiser
|
|
119
|
+
|
|
120
|
+
# Lint
|
|
121
|
+
ruff check src/ tests/
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
## Limitations
|
|
125
|
+
|
|
126
|
+
This tool digitises visible pixels from raster images. It does not recover original raw data. Key limitations:
|
|
127
|
+
|
|
128
|
+
- **Only linear and log y-axes are supported** — broken, dual, and other nonlinear axes are not supported yet.
|
|
129
|
+
- **X-axis calibration is best-effort** — numeric x-ticks are OCR'd when possible; otherwise x-values are normalised 0–1 or labelled by interval.
|
|
130
|
+
- **Black/grey traces partially supported** — achromatic trace detection is available but works best when traces have enough contrast against the background.
|
|
131
|
+
- **Requires visible structure** — axes, borders, or gridlines must be present for panel detection.
|
|
132
|
+
- **OCR fragility** — small, rotated, or low-contrast tick labels may fail.
|
|
133
|
+
|
|
134
|
+
## Changelog
|
|
135
|
+
|
|
136
|
+
### v0.1.0
|
|
137
|
+
|
|
138
|
+
**Refactored from notebook to installable package** with 15 modules, dataclasses, CLI, and test suite.
|
|
139
|
+
|
|
140
|
+
**Panel detection improvements:**
|
|
141
|
+
- Hint-aware `split_y_clusters` — uses the user's `expected_rows` to split evenly-spaced gridlines at the largest gaps, even when the inter-panel gap is only marginally wider than intra-panel gaps.
|
|
142
|
+
- Post-hoc panel subdivision — `apply_layout_hint` can split oversized candidates at their gridlines when initial detection yields fewer panels than expected.
|
|
143
|
+
- Improved grid detection — properly cross-matches row/column structure.
|
|
144
|
+
|
|
145
|
+
**Trace detection improvements:**
|
|
146
|
+
- CIELAB clustering mode (`use_lab=True`) — clusters non-background pixels in perceptually uniform colour space using mini-batch k-means.
|
|
147
|
+
- Achromatic trace detection — a separate pass detects black/grey traces by looking for low-saturation, horizontally continuous structures distinct from grid/axis lines.
|
|
148
|
+
|
|
149
|
+
**Calibration improvements:**
|
|
150
|
+
- Log-scale y-axis detection — when OCR'd tick values are better explained by `log10(y_value) = a * y_pixel + b`, the calibration uses log scale automatically.
|
|
151
|
+
- X-axis calibration — OCRs numeric x-axis ticks, fits `x_value = a * x_pixel + b`, and adds an `x_value` column to the trace CSV.
|
|
152
|
+
|
|
153
|
+
**Infrastructure:**
|
|
154
|
+
- Diagnostic overlays are saved to files when `save_diagnostics=True`.
|
|
155
|
+
- Tesseract error handling — graceful recovery from OCR crashes on degenerate crops.
|
|
156
|
+
- No Colab dependency.
|
|
157
|
+
- Quantitative test suite with panel count, calibration quality, and trace RMSE metrics against synthetic ground truth.
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "trace-digitiser"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Template-free computer-vision digitisation pipeline for raster scientific line plots."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [{ name = "Trace Digitiser Contributors" }]
|
|
13
|
+
|
|
14
|
+
dependencies = [
|
|
15
|
+
"opencv-python-headless>=4.8",
|
|
16
|
+
"pytesseract>=0.3.10",
|
|
17
|
+
"Pillow>=10.0",
|
|
18
|
+
"pandas>=2.0",
|
|
19
|
+
"numpy>=1.24",
|
|
20
|
+
"matplotlib>=3.7",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
[project.optional-dependencies]
|
|
24
|
+
dev = [
|
|
25
|
+
"pytest>=7.4",
|
|
26
|
+
"pytest-cov>=4.1",
|
|
27
|
+
"ruff>=0.4",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
[project.scripts]
|
|
31
|
+
trace-digitiser = "trace_digitiser.cli:main"
|
|
32
|
+
|
|
33
|
+
[tool.setuptools.packages.find]
|
|
34
|
+
where = ["src"]
|
|
35
|
+
|
|
36
|
+
[tool.pytest.ini_options]
|
|
37
|
+
testpaths = ["tests"]
|
|
38
|
+
addopts = "-v --tb=short"
|
|
39
|
+
|
|
40
|
+
[tool.ruff]
|
|
41
|
+
line-length = 120
|
|
42
|
+
target-version = "py310"
|
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
"""trace_digitiser — template-free digitisation of raster scientific line plots.
|
|
2
|
+
|
|
3
|
+
Quick start::
|
|
4
|
+
|
|
5
|
+
from trace_digitiser import digitise
|
|
6
|
+
|
|
7
|
+
result = digitise(
|
|
8
|
+
"figure.jpg",
|
|
9
|
+
layout_mode="stacked",
|
|
10
|
+
expected_rows=2,
|
|
11
|
+
expected_cols=1,
|
|
12
|
+
output_dir="outputs",
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
print(result.trace_data.head())
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
from typing import Optional
|
|
22
|
+
|
|
23
|
+
import pandas as pd
|
|
24
|
+
|
|
25
|
+
from .calibration import propagate_y_calibration_across_rows, robust_y_calibration
|
|
26
|
+
from .diagnostics import draw_digitised_trace, draw_panel_overlay, draw_trace_mask, print_panel_summary
|
|
27
|
+
from .digitise import digitise_trace_mask
|
|
28
|
+
from .io import build_panel_metadata, load_image, save_outputs
|
|
29
|
+
from .models import Calibration, DigitiserResult, Panel, Trace, XLabel
|
|
30
|
+
from .ocr import detect_x_labels
|
|
31
|
+
from .panel_detection import find_plot_panels
|
|
32
|
+
from .summarise import summarise_by_detected_labels
|
|
33
|
+
from .trace_detection import detect_trace_masks
|
|
34
|
+
from .x_calibration import calibrate_x_axis
|
|
35
|
+
|
|
36
|
+
__all__ = [
|
|
37
|
+
"digitise",
|
|
38
|
+
"Calibration",
|
|
39
|
+
"DigitiserResult",
|
|
40
|
+
"Panel",
|
|
41
|
+
"Trace",
|
|
42
|
+
"XLabel",
|
|
43
|
+
]
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def digitise(
    image_path: str | Path,
    *,
    layout_mode: str = "auto",
    expected_rows: Optional[int] = None,
    expected_cols: Optional[int] = None,
    expected_panels: Optional[int] = None,
    output_dir: Optional[str | Path] = None,
    output_prefix: Optional[str] = None,
    show_debug: bool = False,
    save_diagnostics: bool = False,
) -> DigitiserResult:
    """Run the complete digitisation pipeline on a single raster figure.

    The stages run in a fixed order: panel detection, y-axis calibration
    (followed by cross-row propagation), x-axis calibration, trace
    detection and digitisation, x-label OCR, per-label summarisation,
    optional diagnostics, and finally writing the outputs.

    Parameters
    ----------
    image_path : str or Path
        Location of the input raster image.
    layout_mode : str
        One of ``"auto"``, ``"single"``, ``"stacked"``, ``"horizontal"`` or
        ``"grid"``; forwarded unchanged to the panel detector.
    expected_rows, expected_cols, expected_panels : int, optional
        Layout hints forwarded to the panel detector.
    output_dir : str or Path, optional
        Destination handed to ``save_outputs``.  When diagnostics are saved
        they go into a ``diagnostics`` sub-directory of this location, or of
        ``"."`` if no directory was given.
    output_prefix : str, optional
        Filename prefix handed to ``save_outputs``; the image stem is used
        when omitted.
    show_debug : bool
        Print progress information and ask the drawing helpers to display
        their figures (``show=True``).
    save_diagnostics : bool
        Create the diagnostics directory and pass it to the drawing helpers.

    Returns
    -------
    DigitiserResult
        The loaded image, the detected panels as ``Panel`` objects, the
        trace / summary / metadata tables, and the paths returned by
        ``save_outputs``.
    """
    image_path = Path(image_path)
    rgb = load_image(image_path)

    prefix = image_path.stem if output_prefix is None else output_prefix

    # Either flag enables the diagnostic helpers; only ``save_diagnostics``
    # gives them a directory to write into.
    want_diagnostics = show_debug or save_diagnostics
    diag_dir: Optional[Path] = None
    if save_diagnostics:
        diag_dir = Path(output_dir or ".").joinpath("diagnostics")
        diag_dir.mkdir(parents=True, exist_ok=True)

    if show_debug:
        print("Processing:", image_path)
        print("Image size:", rgb.shape[1], "×", rgb.shape[0])

    # --- Stage 1: panel detection ------------------------------------
    panels, h_lines, v_lines = find_plot_panels(
        rgb,
        layout_mode=layout_mode,
        expected_rows=expected_rows,
        expected_cols=expected_cols,
        expected_panels=expected_panels,
    )

    if want_diagnostics:
        print_panel_summary(panels, h_lines, v_lines, layout_mode)
        draw_panel_overlay(rgb, panels, h_lines, v_lines, output_dir=diag_dir, show=show_debug)

    # --- Stage 2: y-axis calibration ---------------------------------
    for panel in panels:
        panel["y_calibration"] = robust_y_calibration(rgb, panel, verbose=show_debug).to_dict()

    panels = propagate_y_calibration_across_rows(panels, verbose=show_debug)

    # --- Stage 2b: x-axis calibration (numeric x ticks) --------------
    for panel in panels:
        # The key is only stored when a calibration was actually produced.
        if (x_cal := calibrate_x_axis(rgb, panel, verbose=show_debug)) is not None:
            panel["x_calibration"] = x_cal

    # --- Stage 3: trace detection and digitisation -------------------
    frames: list[pd.DataFrame] = []
    mask_pairs: list[tuple[dict, dict]] = []

    for panel in panels:
        found = detect_trace_masks(rgb, panel)
        if show_debug:
            print(f"Panel {panel['panel_id']}: detected {len(found)} coloured trace(s)")

        for trace in found:
            if show_debug:
                # Leave the mask out of the dump; print the remaining entries.
                print(" ", {k: v for k, v in trace.items() if k != "mask"})
            frames.append(digitise_trace_mask(panel, trace))
            mask_pairs.append((panel, trace))

    if frames:
        trace_data = pd.concat(frames, ignore_index=True)
    else:
        trace_data = pd.DataFrame()

    # --- Stage 4: x-label OCR ----------------------------------------
    for panel in panels:
        panel["x_labels"] = detect_x_labels(rgb, panel)
        if not show_debug:
            continue
        print(f"Panel {panel['panel_id']} x labels:")
        for lab in panel["x_labels"]:
            print(f" {lab['text']:>8s} x={lab['x']:.1f} conf={lab['conf']:.1f}")

    # --- Stage 5: interval summaries ---------------------------------
    summary_by_label = summarise_by_detected_labels(trace_data, panels)

    # --- Stage 6: diagnostics ----------------------------------------
    if want_diagnostics:
        for panel, trace in mask_pairs:
            draw_trace_mask(rgb, panel, trace, output_dir=diag_dir, show=show_debug)

        if not trace_data.empty:
            grouped = trace_data.groupby(["panel_id", "trace_id"])
            for (panel_id, trace_id), _rows in grouped:
                draw_digitised_trace(trace_data, panel_id, trace_id, output_dir=diag_dir, show=show_debug)

    # --- Stage 7: write outputs --------------------------------------
    panel_metadata = build_panel_metadata(panels)
    trace_csv, meta_csv, summary_csv = save_outputs(
        trace_data, panel_metadata, summary_by_label, prefix, output_dir
    )

    if show_debug:
        print("Wrote:")
        print(" -", trace_csv)
        print(" -", meta_csv)
        if not summary_csv:
            print(" - no label summary; fewer than two labels detected")
        else:
            print(" -", summary_csv)

    # --- Stage 8: structured result ----------------------------------
    panel_models = []
    for panel in panels:
        calibration = None
        if "y_calibration" in panel:
            calibration = Calibration(**panel["y_calibration"])
        x_labels = [XLabel(**lab) for lab in panel.get("x_labels", [])]
        panel_models.append(
            Panel(
                panel_id=panel["panel_id"],
                x0=panel["x0"],
                x1=panel["x1"],
                y_top=panel["y_top"],
                y_bottom=panel["y_bottom"],
                gridline_y=panel["gridline_y"],
                source=panel["source"],
                score=panel["score"],
                layout_mode=panel.get("layout_mode", layout_mode),
                calibration=calibration,
                x_labels=x_labels,
            )
        )

    return DigitiserResult(
        image_path=image_path,
        rgb=rgb,
        panels=panel_models,
        trace_data=trace_data,
        summary_by_label=summary_by_label,
        panel_metadata=panel_metadata,
        trace_csv_path=trace_csv,
        summary_csv_path=summary_csv,
        metadata_csv_path=meta_csv,
    )
|