unplot 0.0.0__tar.gz → 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- unplot-0.0.1/.github/workflows/pages.yml +42 -0
- unplot-0.0.1/.gitignore +20 -0
- unplot-0.0.1/PKG-INFO +163 -0
- unplot-0.0.1/README.md +125 -0
- unplot-0.0.1/bench/benchmark.py +116 -0
- unplot-0.0.1/compare/INSTRUCTIONS.md +52 -0
- unplot-0.0.1/compare/ground_truth.json +6117 -0
- unplot-0.0.1/compare/make_plots.py +88 -0
- unplot-0.0.1/compare/score.py +111 -0
- unplot-0.0.1/js/.oxlintrc.json +36 -0
- unplot-0.0.1/js/LICENSE +21 -0
- unplot-0.0.1/js/README.md +52 -0
- unplot-0.0.1/js/build-demo.ts +18 -0
- unplot-0.0.1/js/bun.lock +240 -0
- unplot-0.0.1/js/demo/index.html +313 -0
- unplot-0.0.1/js/demo/main.ts +600 -0
- unplot-0.0.1/js/demo/make_sample.py +121 -0
- unplot-0.0.1/js/demo/sample.pdf +0 -0
- unplot-0.0.1/js/demo/serve.py +20 -0
- unplot-0.0.1/js/demo/tsconfig.json +7 -0
- unplot-0.0.1/js/demo/verify.report.json +20 -0
- unplot-0.0.1/js/demo/verify.ts +134 -0
- unplot-0.0.1/js/knip.config.ts +15 -0
- unplot-0.0.1/js/package.json +40 -0
- unplot-0.0.1/js/src/axes/calibrate.ts +127 -0
- unplot-0.0.1/js/src/curves/vectorpaths.ts +97 -0
- unplot-0.0.1/js/src/curveset.ts +88 -0
- unplot-0.0.1/js/src/extract.ts +126 -0
- unplot-0.0.1/js/src/index.ts +11 -0
- unplot-0.0.1/js/src/io/vector.ts +269 -0
- unplot-0.0.1/js/src/num.ts +79 -0
- unplot-0.0.1/js/src/priors.ts +99 -0
- unplot-0.0.1/js/src/qa/confidence.ts +71 -0
- unplot-0.0.1/js/src/qa/shape.ts +93 -0
- unplot-0.0.1/js/src/separate.ts +71 -0
- unplot-0.0.1/js/test/calibrate.test.ts +23 -0
- unplot-0.0.1/js/test/curves.test.ts +27 -0
- unplot-0.0.1/js/test/extract.test.ts +37 -0
- unplot-0.0.1/js/test/fixtures/gridded.pdf +0 -0
- unplot-0.0.1/js/test/fixtures/three_curves.pdf +0 -0
- unplot-0.0.1/js/test/gridlines.test.ts +28 -0
- unplot-0.0.1/js/test/shape.test.ts +26 -0
- unplot-0.0.1/js/test/vector.test.ts +22 -0
- unplot-0.0.1/js/tsconfig.json +22 -0
- unplot-0.0.1/pyproject.toml +30 -0
- unplot-0.0.1/tests/make_synthetic.py +282 -0
- unplot-0.0.1/tests/test_curve_selection.py +51 -0
- unplot-0.0.1/tests/test_folded_axis.py +40 -0
- unplot-0.0.1/tests/test_gridlines.py +44 -0
- unplot-0.0.1/tests/test_prior_separation.py +40 -0
- unplot-0.0.1/tests/test_raster_color.py +32 -0
- unplot-0.0.1/tests/test_raster_monotone.py +44 -0
- unplot-0.0.1/tests/test_roughness.py +30 -0
- unplot-0.0.1/tests/test_vector_roundtrip.py +66 -0
- unplot-0.0.1/unplot/__init__.py +27 -0
- unplot-0.0.1/unplot/axes/__init__.py +0 -0
- unplot-0.0.1/unplot/axes/calibrate.py +134 -0
- unplot-0.0.1/unplot/curves/__init__.py +0 -0
- unplot-0.0.1/unplot/curves/rasterpaths.py +214 -0
- unplot-0.0.1/unplot/curves/vectorpaths.py +108 -0
- unplot-0.0.1/unplot/curveset.py +95 -0
- unplot-0.0.1/unplot/extract.py +181 -0
- unplot-0.0.1/unplot/io/__init__.py +0 -0
- unplot-0.0.1/unplot/io/raster.py +18 -0
- unplot-0.0.1/unplot/io/vector.py +157 -0
- unplot-0.0.1/unplot/priors/__init__.py +135 -0
- unplot-0.0.1/unplot/qa/__init__.py +0 -0
- unplot-0.0.1/unplot/qa/confidence.py +67 -0
- unplot-0.0.1/unplot/qa/roundtrip.py +31 -0
- unplot-0.0.1/unplot/qa/shape.py +116 -0
- unplot-0.0.1/unplot/separate/__init__.py +0 -0
- unplot-0.0.1/unplot/separate/separate.py +76 -0
- unplot-0.0.0/PKG-INFO +0 -20
- unplot-0.0.0/README.md +0 -9
- unplot-0.0.0/pyproject.toml +0 -16
- unplot-0.0.0/unplot/__init__.py +0 -4
- {unplot-0.0.0 → unplot-0.0.1}/LICENSE +0 -0
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
name: Deploy demo to Pages
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
workflow_dispatch:
|
|
7
|
+
|
|
8
|
+
permissions:
|
|
9
|
+
contents: read
|
|
10
|
+
pages: write
|
|
11
|
+
id-token: write
|
|
12
|
+
|
|
13
|
+
concurrency:
|
|
14
|
+
group: pages
|
|
15
|
+
cancel-in-progress: true
|
|
16
|
+
|
|
17
|
+
jobs:
|
|
18
|
+
build:
|
|
19
|
+
runs-on: ubuntu-latest
|
|
20
|
+
steps:
|
|
21
|
+
- uses: actions/checkout@v4
|
|
22
|
+
- uses: oven-sh/setup-bun@v2
|
|
23
|
+
- name: Install
|
|
24
|
+
working-directory: js
|
|
25
|
+
run: bun install --frozen-lockfile
|
|
26
|
+
- name: Build demo
|
|
27
|
+
working-directory: js
|
|
28
|
+
run: bun run build:demo
|
|
29
|
+
- uses: actions/configure-pages@v5
|
|
30
|
+
- uses: actions/upload-pages-artifact@v3
|
|
31
|
+
with:
|
|
32
|
+
path: js/demo/dist
|
|
33
|
+
|
|
34
|
+
deploy:
|
|
35
|
+
needs: build
|
|
36
|
+
runs-on: ubuntu-latest
|
|
37
|
+
environment:
|
|
38
|
+
name: github-pages
|
|
39
|
+
url: ${{ steps.deployment.outputs.page_url }}
|
|
40
|
+
steps:
|
|
41
|
+
- id: deployment
|
|
42
|
+
uses: actions/deploy-pages@v4
|
unplot-0.0.1/.gitignore
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
__pycache__/
|
|
2
|
+
*.py[cod]
|
|
3
|
+
*.egg-info/
|
|
4
|
+
.eggs/
|
|
5
|
+
build/
|
|
6
|
+
dist/
|
|
7
|
+
.venv/
|
|
8
|
+
venv/
|
|
9
|
+
.pytest_cache/
|
|
10
|
+
.ruff_cache/
|
|
11
|
+
.mypy_cache/
|
|
12
|
+
.DS_Store
|
|
13
|
+
*.so
|
|
14
|
+
|
|
15
|
+
# manual-comparison artifacts (regenerable / user data)
|
|
16
|
+
compare/plots/
|
|
17
|
+
compare/submissions/
|
|
18
|
+
|
|
19
|
+
# node / TS port
|
|
20
|
+
node_modules/
|
unplot-0.0.1/PKG-INFO
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: unplot
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Vector-PDF-native plot extraction with automatic multi-curve crossing separation and a first-class QA report.
|
|
5
|
+
Author: Max Ingham
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2026 Max Ingham
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
License-File: LICENSE
|
|
28
|
+
Keywords: curve,datasheet,digitizer,extraction,pdf,plot,vector
|
|
29
|
+
Requires-Python: >=3.10
|
|
30
|
+
Requires-Dist: numpy>=1.24
|
|
31
|
+
Requires-Dist: pymupdf>=1.23
|
|
32
|
+
Provides-Extra: raster
|
|
33
|
+
Requires-Dist: opencv-python-headless>=4.8; extra == 'raster'
|
|
34
|
+
Requires-Dist: pillow>=10.0; extra == 'raster'
|
|
35
|
+
Provides-Extra: test
|
|
36
|
+
Requires-Dist: pytest>=7.0; extra == 'test'
|
|
37
|
+
Description-Content-Type: text/markdown
|
|
38
|
+
|
|
39
|
+
# unplot
|
|
40
|
+
|
|
41
|
+
**Read curves off a plot, straight from the vector PDF, with overlapping curves pulled apart and a confidence score on each.**
|
|
42
|
+
|
|
43
|
+
**[Try the live demo →](https://somnai-dreams.github.io/unplot/)** — drop a vector PDF in your browser; nothing is uploaded.
|
|
44
|
+
|
|
45
|
+
unplot is a headless library (Python and TypeScript) for digitizing line and curve plots. Interactive tools like WebPlotDigitizer, Engauge, and PlotDigitizer want you to load an image and click to calibrate the axes. unplot runs unattended instead, and does two things those tools don't:
|
|
46
|
+
|
|
47
|
+
- **Vector-PDF-native.** When a PDF's curves are real drawn paths, unplot reads the exact geometry out of the file. No rasterizing, no pixel re-detection, no clicking. The numbers you get back are the numbers that were drawn.
|
|
48
|
+
- **Automatic crossing separation.** Where curves overlap and cross, it keeps them apart: by stroke style or colour for vector paths, by hue for colour-coded raster images, or by de-fanning and re-chaining a single pen stroke that traces several lobes at once.
|
|
49
|
+
|
|
50
|
+
Every result also carries a **confidence report**, per curve and for the set: shape-prior violations, calibration fit, coverage gaps, crossings. If a curve is suspect, the report says so.
|
|
51
|
+
|
|
52
|
+
## Quick start
|
|
53
|
+
|
|
54
|
+
Python:
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
from unplot import extract, Lobe
|
|
58
|
+
|
|
59
|
+
cs = extract("plot.pdf", frame=(60, 40, 360, 240),
|
|
60
|
+
prior=Lobe(0.08), expected_curves=3, order_by="peak-x")
|
|
61
|
+
|
|
62
|
+
for c in cs.curves:
|
|
63
|
+
print(c.id, c.points.shape, c.qa.confidence, c.qa.passed)
|
|
64
|
+
|
|
65
|
+
named = cs.labeled({0: "blue", 1: "green", 2: "red"}) # you map order to meaning
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
TypeScript, in the browser, with no Python and no server (see [`js/`](js/)):
|
|
69
|
+
|
|
70
|
+
```ts
|
|
71
|
+
import { extract, lobe } from "unplot"; // js/src/index.ts
|
|
72
|
+
|
|
73
|
+
const data = new Uint8Array(await (await fetch("plot.pdf")).arrayBuffer());
|
|
74
|
+
const cs = await extract(data, { expectedCurves: 3, prior: lobe(0.08), orderBy: "peak-x" });
|
|
75
|
+
cs.curves.forEach(c => console.log(c.id, c.style.color, c.qa.confidence));
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
[`js/`](js/) also has a drop-a-PDF demo: it renders the page, overlays the recovered curves, shows the QA, and exports CSV. The bytes never leave the page.
|
|
79
|
+
|
|
80
|
+
## What you get back
|
|
81
|
+
|
|
82
|
+
`extract(plot) -> CurveSet`. One plot region in, one CurveSet out:
|
|
83
|
+
|
|
84
|
+
- **calibrated arrays** per curve (`points`, in your axis units), plus source-space points for overlay
|
|
85
|
+
- **neutral identity.** Curves come back as `c0, c1, …`, ordered deterministically by peak-x, mean-y, or first-x. unplot knows nothing about your domain; you map order to meaning.
|
|
86
|
+
- **a QA report**: `CurveQA` per curve, `CurveSetQA` for the set.
|
|
87
|
+
|
|
88
|
+
## Shape priors
|
|
89
|
+
|
|
90
|
+
The one thing you supply is the expected shape of your curves, as a `ShapePrior`. It scores confidence, and it breaks ties at crossings by picking the continuation that keeps each curve on its own flank. Shipped priors:
|
|
91
|
+
|
|
92
|
+
- `Free`: no assumption, raw geometry.
|
|
93
|
+
- `Monotone(direction)`: the curve only goes one way.
|
|
94
|
+
- `Lobe(tolerance)`: approximately unimodal (rise to a peak, then fall). `tolerance` is how much flank reversal is allowed before a curve is flagged, not a demand for strict unimodality, so real shoulders and double-humps survive.
|
|
95
|
+
- `Smooth`: penalise spikes.
|
|
96
|
+
|
|
97
|
+
Write your own by implementing `qa_violation` and `separation_bias`.
|
|
98
|
+
|
|
99
|
+
## Scope
|
|
100
|
+
|
|
101
|
+
Supported: vector-PDF ingest (exact geometry) and raster images (pixel fallback); linear axis calibration from numeric tick labels, robust to stray labels leaking in from a neighbouring plot; crossing separation under a shape prior; the QA report.
|
|
102
|
+
|
|
103
|
+
Not yet, and PRs welcome:
|
|
104
|
+
|
|
105
|
+
- **Log-spaced axes.** Calibration is linear pt-to-value. Axes whose *values* are log quantities work fine when they're drawn linearly; genuinely log-*spaced* axes are not supported.
|
|
106
|
+
- **Many identically-styled curves on one plot.** With no colour or dash to tell them apart (e.g. a five-curve grey density plot), the separator can merge overlapping ones. The merge is detected — a `roughness` check, total y-variation over y-range — and the tangle is dropped with a warning rather than returned as a curve; cleanly-separated curves on the same plot still come back.
|
|
107
|
+
- **Heavily occluded mono scans.** Colour curves separate by hue; single-colour curves separate by continuity and the shape prior, which handles clean crossings. A low-resolution mono *scan* with a weak curve buried under a stronger one can mis-assign, and the QA reports it as low confidence. Use vector ingest or a colour-coded source there.
|
|
108
|
+
- Broken axis boxes, legends drawn over curves, exotic scales, automatic frame detection for raster scans.
|
|
109
|
+
|
|
110
|
+
## Accuracy
|
|
111
|
+
|
|
112
|
+
Peak error on synthetic ground truth, in x-axis units (the plots span x 400 to 700). From `bench/benchmark.py`, seed-fixed, 20 randomized plots per tier:
|
|
113
|
+
|
|
114
|
+
| tier | median | p90 |
|
|
115
|
+
|-|-|-|
|
|
116
|
+
| vector, 3–4 curves, separated or crossing | 0.5–0.9 | 1.3–1.5 |
|
|
117
|
+
| raster colour, 3 crossing (with or without noise) | 1.1 | 1.5 |
|
|
118
|
+
| raster mono, 3 crossing | 1.0 | 1.6 |
|
|
119
|
+
| raster mono, 3 occluded | 1.3 | 6.3 |
|
|
120
|
+
|
|
121
|
+
Vector is essentially exact: it reads drawn geometry, so crossings and curve count don't move the number. Raster is tight on synthetic plots, around 1 unit median, with an occluded lobe (one weak curve buried under its neighbours) the harder synthetic case. The real-world hard case, a low-resolution mono scan with heavy occlusion, still degrades, and there the QA flags it rather than returning a confident wrong answer.
|
|
122
|
+
|
|
123
|
+
The vector path also reproduces reference extractions of real PDFs numerically: a dash/colour-keyed plot of three overlapping curves (every peak matched exactly, peak-normalised shape RMS 0.000), and a continuous-stroke polyline carrying a real double-hump (peaks within 8 units, the substructure preserved rather than flattened).
|
|
124
|
+
|
|
125
|
+
## How it compares
|
|
126
|
+
|
|
127
|
+
unplot is narrower than the established tools on purpose: line and curve plots, headless, vector-native, self-reporting its own confidence, built for batch and automation rather than interactive correction. WebPlotDigitizer and Engauge are mature GUIs that cover many chart types and let a human fix mistakes live. To digitize one chart by hand, use them.
|
|
128
|
+
|
|
129
|
+
| capability | unplot | WebPlotDigitizer | Engauge | PlotDigitizer |
|
|
130
|
+
|-|-|-|-|-|
|
|
131
|
+
| reads vector-PDF geometry | yes | no (image) | no (image) | no (image) |
|
|
132
|
+
| automatic axis calibration | yes (vector) | click axes | click axes | click axes |
|
|
133
|
+
| crossing separation | hue or shape prior | colour / layers | guided | manual / auto |
|
|
134
|
+
| QA / confidence output | yes | no | no | no |
|
|
135
|
+
| headless, scriptable batch | yes | no (web app) | no (GUI) | no |
|
|
136
|
+
| other chart types (polar, bar…) | no | yes | yes | yes |
|
|
137
|
+
| license | MIT | AGPL-3.0 | GPL | proprietary |
|
|
138
|
+
|
|
139
|
+
On clean colour plots a head-to-head against the real WebPlotDigitizer 4.8 (its own auto-extraction, with an exact calibration set through its API) is a dead heat: both land around 0.5 to 1 unit, because both do a colour mask plus per-column averaging. unplot pulls ahead on vector PDFs, which WPD can't read at all; on noise, where its denoising rode out a case WPD's fixed threshold tripped on; and on frame removal, since WPD grabbed the black plot frame until a region was masked. WPD is the only incumbent benchmarked here. Engauge is a desktop GUI and PlotDigitizer is login-gated, so neither can be batch-run.
|
|
140
|
+
|
|
141
|
+
## Install
|
|
142
|
+
|
|
143
|
+
Python:
|
|
144
|
+
|
|
145
|
+
```bash
|
|
146
|
+
pip install unplot # vector-PDF core + numpy
|
|
147
|
+
pip install "unplot[raster]" # adds opencv/pillow for the raster fallback
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
TypeScript: the port lives in [`js/`](js/). Run `bun install` there; the only runtime dependency is pdf.js (`pdfjs-dist`, Apache-2.0).
|
|
151
|
+
|
|
152
|
+
## Tests
|
|
153
|
+
|
|
154
|
+
```bash
|
|
155
|
+
pip install "unplot[test]" && pytest # Python
|
|
156
|
+
cd js && bun test # TypeScript
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
Both suites build synthetic plots with known ground truth and assert the recovered curves and calibration match. No copyrighted source documents are redistributed.
|
|
160
|
+
|
|
161
|
+
## License
|
|
162
|
+
|
|
163
|
+
MIT, see [LICENSE](LICENSE).
|
unplot-0.0.1/README.md
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
# unplot
|
|
2
|
+
|
|
3
|
+
**Read curves off a plot, straight from the vector PDF, with overlapping curves pulled apart and a confidence score on each.**
|
|
4
|
+
|
|
5
|
+
**[Try the live demo →](https://somnai-dreams.github.io/unplot/)** — drop a vector PDF in your browser; nothing is uploaded.
|
|
6
|
+
|
|
7
|
+
unplot is a headless library (Python and TypeScript) for digitizing line and curve plots. Interactive tools like WebPlotDigitizer, Engauge, and PlotDigitizer want you to load an image and click to calibrate the axes. unplot runs unattended instead, and does two things those tools don't:
|
|
8
|
+
|
|
9
|
+
- **Vector-PDF-native.** When a PDF's curves are real drawn paths, unplot reads the exact geometry out of the file. No rasterizing, no pixel re-detection, no clicking. The numbers you get back are the numbers that were drawn.
|
|
10
|
+
- **Automatic crossing separation.** Where curves overlap and cross, it keeps them apart: by stroke style or colour for vector paths, by hue for colour-coded raster images, or by de-fanning and re-chaining a single pen stroke that traces several lobes at once.
|
|
11
|
+
|
|
12
|
+
Every result also carries a **confidence report**, per curve and for the set: shape-prior violations, calibration fit, coverage gaps, crossings. If a curve is suspect, the report says so.
|
|
13
|
+
|
|
14
|
+
## Quick start
|
|
15
|
+
|
|
16
|
+
Python:
|
|
17
|
+
|
|
18
|
+
```python
|
|
19
|
+
from unplot import extract, Lobe
|
|
20
|
+
|
|
21
|
+
cs = extract("plot.pdf", frame=(60, 40, 360, 240),
|
|
22
|
+
prior=Lobe(0.08), expected_curves=3, order_by="peak-x")
|
|
23
|
+
|
|
24
|
+
for c in cs.curves:
|
|
25
|
+
print(c.id, c.points.shape, c.qa.confidence, c.qa.passed)
|
|
26
|
+
|
|
27
|
+
named = cs.labeled({0: "blue", 1: "green", 2: "red"}) # you map order to meaning
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
TypeScript, in the browser, with no Python and no server (see [`js/`](js/)):
|
|
31
|
+
|
|
32
|
+
```ts
|
|
33
|
+
import { extract, lobe } from "unplot"; // js/src/index.ts
|
|
34
|
+
|
|
35
|
+
const data = new Uint8Array(await (await fetch("plot.pdf")).arrayBuffer());
|
|
36
|
+
const cs = await extract(data, { expectedCurves: 3, prior: lobe(0.08), orderBy: "peak-x" });
|
|
37
|
+
cs.curves.forEach(c => console.log(c.id, c.style.color, c.qa.confidence));
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
[`js/`](js/) also has a drop-a-PDF demo: it renders the page, overlays the recovered curves, shows the QA, and exports CSV. The bytes never leave the page.
|
|
41
|
+
|
|
42
|
+
## What you get back
|
|
43
|
+
|
|
44
|
+
`extract(plot) -> CurveSet`. One plot region in, one CurveSet out:
|
|
45
|
+
|
|
46
|
+
- **calibrated arrays** per curve (`points`, in your axis units), plus source-space points for overlay
|
|
47
|
+
- **neutral identity.** Curves come back as `c0, c1, …`, ordered deterministically by peak-x, mean-y, or first-x. unplot knows nothing about your domain; you map order to meaning.
|
|
48
|
+
- **a QA report**: `CurveQA` per curve, `CurveSetQA` for the set.
|
|
49
|
+
|
|
50
|
+
## Shape priors
|
|
51
|
+
|
|
52
|
+
The one thing you supply is the expected shape of your curves, as a `ShapePrior`. It scores confidence, and it breaks ties at crossings by picking the continuation that keeps each curve on its own flank. Shipped priors:
|
|
53
|
+
|
|
54
|
+
- `Free`: no assumption, raw geometry.
|
|
55
|
+
- `Monotone(direction)`: the curve only goes one way.
|
|
56
|
+
- `Lobe(tolerance)`: approximately unimodal (rise to a peak, then fall). `tolerance` is how much flank reversal is allowed before a curve is flagged, not a demand for strict unimodality, so real shoulders and double-humps survive.
|
|
57
|
+
- `Smooth`: penalise spikes.
|
|
58
|
+
|
|
59
|
+
Write your own by implementing `qa_violation` and `separation_bias`.
|
|
60
|
+
|
|
61
|
+
## Scope
|
|
62
|
+
|
|
63
|
+
Supported: vector-PDF ingest (exact geometry) and raster images (pixel fallback); linear axis calibration from numeric tick labels, robust to stray labels leaking in from a neighbouring plot; crossing separation under a shape prior; the QA report.
|
|
64
|
+
|
|
65
|
+
Not yet, and PRs welcome:
|
|
66
|
+
|
|
67
|
+
- **Log-spaced axes.** Calibration is linear pt-to-value. Axes whose *values* are log quantities work fine when they're drawn linearly; genuinely log-*spaced* axes are not supported.
|
|
68
|
+
- **Many identically-styled curves on one plot.** With no colour or dash to tell them apart (e.g. a five-curve grey density plot), the separator can merge overlapping ones. The merge is detected — a `roughness` check, total y-variation over y-range — and the tangle is dropped with a warning rather than returned as a curve; cleanly-separated curves on the same plot still come back.
|
|
69
|
+
- **Heavily occluded mono scans.** Colour curves separate by hue; single-colour curves separate by continuity and the shape prior, which handles clean crossings. A low-resolution mono *scan* with a weak curve buried under a stronger one can mis-assign, and the QA reports it as low confidence. Use vector ingest or a colour-coded source there.
|
|
70
|
+
- Broken axis boxes, legends drawn over curves, exotic scales, automatic frame detection for raster scans.
|
|
71
|
+
|
|
72
|
+
## Accuracy
|
|
73
|
+
|
|
74
|
+
Peak error on synthetic ground truth, in x-axis units (the plots span x 400 to 700). From `bench/benchmark.py`, seed-fixed, 20 randomized plots per tier:
|
|
75
|
+
|
|
76
|
+
| tier | median | p90 |
|
|
77
|
+
|-|-|-|
|
|
78
|
+
| vector, 3–4 curves, separated or crossing | 0.5–0.9 | 1.3–1.5 |
|
|
79
|
+
| raster colour, 3 crossing (with or without noise) | 1.1 | 1.5 |
|
|
80
|
+
| raster mono, 3 crossing | 1.0 | 1.6 |
|
|
81
|
+
| raster mono, 3 occluded | 1.3 | 6.3 |
|
|
82
|
+
|
|
83
|
+
Vector is essentially exact: it reads drawn geometry, so crossings and curve count don't move the number. Raster is tight on synthetic plots, around 1 unit median, with an occluded lobe (one weak curve buried under its neighbours) the harder synthetic case. The real-world hard case, a low-resolution mono scan with heavy occlusion, still degrades, and there the QA flags it rather than returning a confident wrong answer.
|
|
84
|
+
|
|
85
|
+
The vector path also reproduces reference extractions of real PDFs numerically: a dash/colour-keyed plot of three overlapping curves (every peak matched exactly, peak-normalised shape RMS 0.000), and a continuous-stroke polyline carrying a real double-hump (peaks within 8 units, the substructure preserved rather than flattened).
|
|
86
|
+
|
|
87
|
+
## How it compares
|
|
88
|
+
|
|
89
|
+
unplot is narrower than the established tools on purpose: line and curve plots, headless, vector-native, self-reporting its own confidence, built for batch and automation rather than interactive correction. WebPlotDigitizer and Engauge are mature GUIs that cover many chart types and let a human fix mistakes live. To digitize one chart by hand, use them.
|
|
90
|
+
|
|
91
|
+
| capability | unplot | WebPlotDigitizer | Engauge | PlotDigitizer |
|
|
92
|
+
|-|-|-|-|-|
|
|
93
|
+
| reads vector-PDF geometry | yes | no (image) | no (image) | no (image) |
|
|
94
|
+
| automatic axis calibration | yes (vector) | click axes | click axes | click axes |
|
|
95
|
+
| crossing separation | hue or shape prior | colour / layers | guided | manual / auto |
|
|
96
|
+
| QA / confidence output | yes | no | no | no |
|
|
97
|
+
| headless, scriptable batch | yes | no (web app) | no (GUI) | no |
|
|
98
|
+
| other chart types (polar, bar…) | no | yes | yes | yes |
|
|
99
|
+
| license | MIT | AGPL-3.0 | GPL | proprietary |
|
|
100
|
+
|
|
101
|
+
On clean colour plots a head-to-head against the real WebPlotDigitizer 4.8 (its own auto-extraction, with an exact calibration set through its API) is a dead heat: both land around 0.5 to 1 unit, because both do a colour mask plus per-column averaging. unplot pulls ahead on vector PDFs, which WPD can't read at all; on noise, where its denoising rode out a case WPD's fixed threshold tripped on; and on frame removal, since WPD grabbed the black plot frame until a region was masked. WPD is the only incumbent benchmarked here. Engauge is a desktop GUI and PlotDigitizer is login-gated, so neither can be batch-run.
|
|
102
|
+
|
|
103
|
+
## Install
|
|
104
|
+
|
|
105
|
+
Python:
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
pip install unplot # vector-PDF core + numpy
|
|
109
|
+
pip install "unplot[raster]" # adds opencv/pillow for the raster fallback
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
TypeScript: the port lives in [`js/`](js/). Run `bun install` there; the only runtime dependency is pdf.js (`pdfjs-dist`, Apache-2.0).
|
|
113
|
+
|
|
114
|
+
## Tests
|
|
115
|
+
|
|
116
|
+
```bash
|
|
117
|
+
pip install "unplot[test]" && pytest # Python
|
|
118
|
+
cd js && bun test # TypeScript
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
Both suites build synthetic plots with known ground truth and assert the recovered curves and calibration match. No copyrighted source documents are redistributed.
|
|
122
|
+
|
|
123
|
+
## License
|
|
124
|
+
|
|
125
|
+
MIT, see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""Accuracy benchmark: unplot extraction error across difficulty tiers on synthetic ground truth.
|
|
2
|
+
|
|
3
|
+
No competitor here — these are absolute numbers. Each tier generates K randomized plots with known peaks
|
|
4
|
+
(via PyMuPDF), extracts them with the expected curve count supplied, and reports the median and 90th-percentile
|
|
5
|
+
per-curve peak error. Deterministic (fixed seed). Run: python bench/benchmark.py
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
import tempfile
|
|
11
|
+
|
|
12
|
+
import fitz
|
|
13
|
+
import numpy as np
|
|
14
|
+
|
|
15
|
+
from unplot import Lobe, extract
|
|
16
|
+
|
|
17
|
+
# Plot frame on the page; x2px interpolates between elements 0 and 2, v2py between elements 1 and 3.
FRAME = (60.0, 40.0, 360.0, 240.0)
# Data-space range of the x axis (low, high).
XR = (400.0, 700.0)
# Data-space range of the y axis (low, high).
YR = (0.0, 2.0)
# Stroke colours cycled through by make_pdf when the plot is not mono.
COLORS = [
    (0, 0, 1),
    (0, 0.6, 0),
    (0.9, 0, 0),
    (0.6, 0, 0.6),
]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def x2px(x: float) -> float:
    """Map a data-space x value to a page x coordinate.

    Linear interpolation: XR[0] lands on FRAME[0] and XR[1] lands on FRAME[2].
    """
    left, right = FRAME[0], FRAME[2]
    x_lo, x_hi = XR
    return left + (x - x_lo) / (x_hi - x_lo) * (right - left)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def v2py(v: float) -> float:
    """Map a data-space y value to a page y coordinate.

    Linear interpolation with the axis flipped: YR[0] lands on FRAME[3] and
    YR[1] lands on FRAME[1], so larger values give smaller page y.
    """
    bottom, top = FRAME[3], FRAME[1]
    v_lo, v_hi = YR
    return bottom - (v - v_lo) / (v_hi - v_lo) * (bottom - top)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def make_pdf(peaks, sigma, amp, path, mono=False):
    """Draw one Gaussian lobe per curve, plus axis tick labels, and save the page to *path*.

    Args:
        peaks: lobe centres in x-axis units, one per curve.
        sigma: lobe widths, paired with ``peaks`` by position.
        amp: lobe amplitudes, paired with ``peaks`` by position.
        path: output PDF path.
        mono: when true every curve is stroked with (0, 0, 0); otherwise
            colours cycle through ``COLORS``.
    """
    xs = np.arange(400.0, 700.01, 2.0)
    # The context manager closes the document even if drawing or saving raises.
    with fitz.open() as doc:
        page = doc.new_page(width=420, height=300)
        for k, (pk, a, s) in enumerate(zip(peaks, amp, sigma)):
            vals = a * np.exp(-0.5 * ((xs - pk) / s) ** 2)
            pts = [fitz.Point(x2px(x), v2py(val)) for x, val in zip(xs, vals)]
            col = (0, 0, 0) if mono else COLORS[k % len(COLORS)]
            shape = page.new_shape()
            shape.draw_polyline(pts)
            shape.finish(color=col, width=1.4, closePath=False)
            shape.commit()
        # Numeric tick labels along both axes.
        for lbl in (400, 500, 600, 700):
            page.insert_text(fitz.Point(x2px(lbl) - 6, FRAME[3] + 12), str(lbl), fontsize=8)
        for lbl in (0, 1, 2):
            page.insert_text(fitz.Point(FRAME[0] - 16, v2py(lbl) + 3), str(lbl), fontsize=8)
        doc.save(path)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def render(path, dpi, noise, rng):
    """Rasterize the first page of the PDF at *path* into a uint8 image array.

    Args:
        path: PDF file to render.
        dpi: resolution passed to ``get_pixmap``.
        noise: standard deviation of additive Gaussian noise; no noise is added
            when it is not greater than 0.
        rng: generator providing ``normal`` (only used when ``noise > 0``).

    Returns:
        A copied array of shape (height, width, 3) holding the first three
        pixmap channels.
    """
    # Open in a context manager so the document handle is released right away
    # rather than whenever the garbage collector gets to it.
    with fitz.open(path) as doc:
        pix = doc[0].get_pixmap(dpi=dpi)
        # .copy() detaches the array from the pixmap's sample buffer.
        img = np.frombuffer(pix.samples, np.uint8).reshape(pix.height, pix.width, pix.n)[..., :3].copy()
    if noise > 0:
        img = np.clip(img.astype(float) + rng.normal(0, noise, img.shape), 0, 255).astype(np.uint8)
    return img
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def gen_peaks(n, mode, rng):
    """Draw randomized lobe parameters for one synthetic plot.

    Returns ``(peaks, sigma, amp)``: sorted peak positions capped at 686.0,
    a constant width of 26.0 per lobe, and amplitudes drawn from [1.5, 1.9).
    In "occluded" mode one randomly chosen amplitude is scaled by 0.4.
    """
    # Width is held constant across tiers so only spacing and amplitude vary.
    widths = [26.0] * n
    # "separated" spreads the lobes out; every other mode packs them closer.
    gap = 68.0
    if mode == "separated":
        gap = 95.0
    origin = rng.uniform(440, 460)
    centres = []
    for i in range(n):
        centres.append(min(origin + i * rng.uniform(gap - 8, gap + 8), 686.0))
    centres.sort()
    heights = [rng.uniform(1.5, 1.9) for _ in range(n)]
    if mode == "occluded":
        # One weak lobe buried under its taller neighbours.
        weak = int(rng.integers(0, n))
        heights[weak] *= 0.4
    return centres, widths, heights
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def peak_errors(cs, truth):
    """Return the absolute peak-position error for each recovered curve.

    A curve's recovered peak is the x value (column 0 of ``points``) at the row
    where column 1 is largest. Recovered and true peaks are each sorted and
    paired in order; pairing stops at the shorter of the two lists.
    """
    found = []
    for curve in cs.curves:
        top = np.argmax(curve.points[:, 1])
        found.append(float(curve.points[top, 0]))
    found.sort()
    expected = sorted(truth)
    return [abs(got - want) for got, want in zip(found, expected)]
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
# Benchmark tiers as (display name, config) pairs; run_tier reads the config keys.
TIERS = [
    ("vector · 3 separated", {"ingest": "vector", "n": 3, "mode": "separated"}),
    ("vector · 3 crossing", {"ingest": "vector", "n": 3, "mode": "crossing"}),
    ("vector · 4 crossing", {"ingest": "vector", "n": 4, "mode": "crossing"}),
    (
        "raster colour · 3 crossing",
        {"ingest": "raster", "n": 3, "mode": "crossing", "dpi": 200, "noise": 0, "mono": False},
    ),
    (
        "raster colour · 3 crossing · noisy",
        {"ingest": "raster", "n": 3, "mode": "crossing", "dpi": 200, "noise": 18, "mono": False},
    ),
    (
        "raster mono · 3 crossing",
        {"ingest": "raster", "n": 3, "mode": "crossing", "dpi": 200, "noise": 0, "mono": True},
    ),
    (
        "raster mono · 3 occluded",
        {"ingest": "raster", "n": 3, "mode": "occluded", "dpi": 200, "noise": 0, "mono": True},
    ),
]
# Number of randomized plots generated per tier.
K = 20
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def run_tier(cfg, rng, tmp):
    """Generate, extract and score K synthetic plots for one tier.

    Args:
        cfg: tier config; "n", "mode" and "ingest" are always read, "dpi" and
            "noise" only on the raster path, "mono" is optional.
        rng: random generator shared by peak generation and raster noise.
        tmp: directory that receives the generated PDFs.

    Returns:
        A flat list of per-curve peak errors across all K plots.
    """
    collected = []
    for trial in range(K):
        peaks, sigma, amp = gen_peaks(cfg["n"], cfg["mode"], rng)
        pdf_path = os.path.join(tmp, f"p{trial}.pdf")
        make_pdf(peaks, sigma, amp, pdf_path, cfg.get("mono", False))
        if cfg["ingest"] == "vector":
            result = extract(
                pdf_path,
                frame=FRAME,
                prior=Lobe(0.08),
                expected_curves=cfg["n"],
                order_by="peak-x",
            )
        else:
            # PDF points are 1/72 inch, so this converts page coordinates to pixels.
            scale = cfg["dpi"] / 72.0
            image = render(pdf_path, cfg["dpi"], cfg["noise"], rng)
            result = extract(
                image,
                ingest="raster",
                frame=tuple(v * scale for v in FRAME),
                x_axis=[(x2px(400) * scale, 400.0), (x2px(700) * scale, 700.0)],
                y_axis=[(v2py(0) * scale, 0.0), (v2py(2) * scale, 2.0)],
                prior=Lobe(0.08),
                expected_curves=cfg["n"],
                order_by="peak-x",
            )
        collected.extend(peak_errors(result, peaks))
    return collected
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def main():
    """Run every tier with a fixed seed and print a median / p90 peak-error table."""
    rng = np.random.default_rng(7)
    print(f"unplot accuracy benchmark (K={K} plots/tier, synthetic ground truth, seed 7)")
    print("peak error per curve, nm (vector = exact geometry; raster = pixel floor)\n")
    print(f"{'tier':32}{'median':>10}{'p90':>10}")
    print("-" * 52)
    # TemporaryDirectory removes the generated PDFs on exit, including on error;
    # a bare mkdtemp() directory would be left behind after every run.
    with tempfile.TemporaryDirectory() as tmp:
        for name, cfg in TIERS:
            errs = np.array(run_tier(cfg, rng, tmp))
            print(f"{name:32}{np.median(errs):>8.1f}  {np.percentile(errs, 90):>8.1f}")
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
if __name__ == "__main__":
|
|
116
|
+
main()
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# Manual head-to-head: digitize these plots in WebPlotDigitizer / Engauge
|
|
2
|
+
|
|
3
|
+
unplot runs automatically in the scorer. The interactive tools (WPD, Engauge) can't be scripted, so this
|
|
4
|
+
is the one part that needs a human. Do as many plots/tools as you have patience for — even one is useful.
|
|
5
|
+
|
|
6
|
+
## The plots (`compare/plots/`)
|
|
7
|
+
|
|
8
|
+
Each plot is provided as a **PNG** (load this into the GUI tools) and a **PDF** (unplot reads this for its
|
|
9
|
+
vector path). Same axes on all: **x = 400…700, y = 0…2**, with tick labels drawn for calibration.
|
|
10
|
+
|
|
11
|
+
|file|curves|colours (left→right by peak)|
|
|
12
|
+
|-|-|-|
|
|
13
|
+
|`p1_single`|1|black|
|
|
14
|
+
|`p2_three_separated`|3|blue, green, red|
|
|
15
|
+
|`p3_three_crossing`|3|blue, green, red|
|
|
16
|
+
|`p4_three_noisy`|3|blue, green, red (with image noise)|
|
|
17
|
+
|
|
18
|
+
## Steps (per plot, per tool)
|
|
19
|
+
|
|
20
|
+
1. Load the **PNG** into WebPlotDigitizer ([apps.automeris.io](https://apps.automeris.io)) or Engauge.
|
|
21
|
+
2. **Calibrate the axes** by clicking known points: on the x-axis click the **400** and **700** ticks; on
|
|
22
|
+
the y-axis click the **0** and **2** ticks. Enter those values when prompted.
|
|
23
|
+
3. **Digitize each curve.** For the 3-curve plots, do each colour separately (WPD: add a dataset per colour
|
|
24
|
+
and use *Automatic Extraction → Color*; Engauge: a curve per colour with *Segment Fill*). Use the
|
|
25
|
+
automatic tracer where it works — that's the tool's strength.
|
|
26
|
+
4. **Export** each curve as CSV (two columns: x then y, in data units).
|
|
27
|
+
|
|
28
|
+
## Where to put the exports
|
|
29
|
+
|
|
30
|
+
Save one CSV per curve, named `<plot>__c<index>.csv`, indexed **left→right by peak** (so c0 = blue, c1 =
|
|
31
|
+
green, c2 = red), under the tool's folder:
|
|
32
|
+
|
|
33
|
+
```
|
|
34
|
+
compare/submissions/wpd/p2_three_separated__c0.csv
|
|
35
|
+
compare/submissions/wpd/p2_three_separated__c1.csv
|
|
36
|
+
compare/submissions/engauge/p3_three_crossing__c0.csv
|
|
37
|
+
...
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
CSV format is forgiving — comma or whitespace separated, header lines are skipped.
|
|
41
|
+
|
|
42
|
+
## Score it
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
python compare/score.py
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
It runs unplot (vector + raster) on every plot and folds in whatever competitor CSVs you've added,
|
|
49
|
+
reporting mean peak error and mean y-RMS per curve against ground truth (in x/y axis units). Re-run
|
|
50
|
+
anytime you add more.
|
|
51
|
+
|
|
52
|
+
(Regenerate the plots with `python compare/make_plots.py` if needed — deterministic, no randomness.)
|