unplot 0.0.0__tar.gz → 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. unplot-0.0.1/.github/workflows/pages.yml +42 -0
  2. unplot-0.0.1/.gitignore +20 -0
  3. unplot-0.0.1/PKG-INFO +163 -0
  4. unplot-0.0.1/README.md +125 -0
  5. unplot-0.0.1/bench/benchmark.py +116 -0
  6. unplot-0.0.1/compare/INSTRUCTIONS.md +52 -0
  7. unplot-0.0.1/compare/ground_truth.json +6117 -0
  8. unplot-0.0.1/compare/make_plots.py +88 -0
  9. unplot-0.0.1/compare/score.py +111 -0
  10. unplot-0.0.1/js/.oxlintrc.json +36 -0
  11. unplot-0.0.1/js/LICENSE +21 -0
  12. unplot-0.0.1/js/README.md +52 -0
  13. unplot-0.0.1/js/build-demo.ts +18 -0
  14. unplot-0.0.1/js/bun.lock +240 -0
  15. unplot-0.0.1/js/demo/index.html +313 -0
  16. unplot-0.0.1/js/demo/main.ts +600 -0
  17. unplot-0.0.1/js/demo/make_sample.py +121 -0
  18. unplot-0.0.1/js/demo/sample.pdf +0 -0
  19. unplot-0.0.1/js/demo/serve.py +20 -0
  20. unplot-0.0.1/js/demo/tsconfig.json +7 -0
  21. unplot-0.0.1/js/demo/verify.report.json +20 -0
  22. unplot-0.0.1/js/demo/verify.ts +134 -0
  23. unplot-0.0.1/js/knip.config.ts +15 -0
  24. unplot-0.0.1/js/package.json +40 -0
  25. unplot-0.0.1/js/src/axes/calibrate.ts +127 -0
  26. unplot-0.0.1/js/src/curves/vectorpaths.ts +97 -0
  27. unplot-0.0.1/js/src/curveset.ts +88 -0
  28. unplot-0.0.1/js/src/extract.ts +126 -0
  29. unplot-0.0.1/js/src/index.ts +11 -0
  30. unplot-0.0.1/js/src/io/vector.ts +269 -0
  31. unplot-0.0.1/js/src/num.ts +79 -0
  32. unplot-0.0.1/js/src/priors.ts +99 -0
  33. unplot-0.0.1/js/src/qa/confidence.ts +71 -0
  34. unplot-0.0.1/js/src/qa/shape.ts +93 -0
  35. unplot-0.0.1/js/src/separate.ts +71 -0
  36. unplot-0.0.1/js/test/calibrate.test.ts +23 -0
  37. unplot-0.0.1/js/test/curves.test.ts +27 -0
  38. unplot-0.0.1/js/test/extract.test.ts +37 -0
  39. unplot-0.0.1/js/test/fixtures/gridded.pdf +0 -0
  40. unplot-0.0.1/js/test/fixtures/three_curves.pdf +0 -0
  41. unplot-0.0.1/js/test/gridlines.test.ts +28 -0
  42. unplot-0.0.1/js/test/shape.test.ts +26 -0
  43. unplot-0.0.1/js/test/vector.test.ts +22 -0
  44. unplot-0.0.1/js/tsconfig.json +22 -0
  45. unplot-0.0.1/pyproject.toml +30 -0
  46. unplot-0.0.1/tests/make_synthetic.py +282 -0
  47. unplot-0.0.1/tests/test_curve_selection.py +51 -0
  48. unplot-0.0.1/tests/test_folded_axis.py +40 -0
  49. unplot-0.0.1/tests/test_gridlines.py +44 -0
  50. unplot-0.0.1/tests/test_prior_separation.py +40 -0
  51. unplot-0.0.1/tests/test_raster_color.py +32 -0
  52. unplot-0.0.1/tests/test_raster_monotone.py +44 -0
  53. unplot-0.0.1/tests/test_roughness.py +30 -0
  54. unplot-0.0.1/tests/test_vector_roundtrip.py +66 -0
  55. unplot-0.0.1/unplot/__init__.py +27 -0
  56. unplot-0.0.1/unplot/axes/__init__.py +0 -0
  57. unplot-0.0.1/unplot/axes/calibrate.py +134 -0
  58. unplot-0.0.1/unplot/curves/__init__.py +0 -0
  59. unplot-0.0.1/unplot/curves/rasterpaths.py +214 -0
  60. unplot-0.0.1/unplot/curves/vectorpaths.py +108 -0
  61. unplot-0.0.1/unplot/curveset.py +95 -0
  62. unplot-0.0.1/unplot/extract.py +181 -0
  63. unplot-0.0.1/unplot/io/__init__.py +0 -0
  64. unplot-0.0.1/unplot/io/raster.py +18 -0
  65. unplot-0.0.1/unplot/io/vector.py +157 -0
  66. unplot-0.0.1/unplot/priors/__init__.py +135 -0
  67. unplot-0.0.1/unplot/qa/__init__.py +0 -0
  68. unplot-0.0.1/unplot/qa/confidence.py +67 -0
  69. unplot-0.0.1/unplot/qa/roundtrip.py +31 -0
  70. unplot-0.0.1/unplot/qa/shape.py +116 -0
  71. unplot-0.0.1/unplot/separate/__init__.py +0 -0
  72. unplot-0.0.1/unplot/separate/separate.py +76 -0
  73. unplot-0.0.0/PKG-INFO +0 -20
  74. unplot-0.0.0/README.md +0 -9
  75. unplot-0.0.0/pyproject.toml +0 -16
  76. unplot-0.0.0/unplot/__init__.py +0 -4
  77. {unplot-0.0.0 → unplot-0.0.1}/LICENSE +0 -0
@@ -0,0 +1,42 @@
1
+ name: Deploy demo to Pages
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ workflow_dispatch:
7
+
8
+ permissions:
9
+ contents: read
10
+ pages: write
11
+ id-token: write
12
+
13
+ concurrency:
14
+ group: pages
15
+ cancel-in-progress: true
16
+
17
+ jobs:
18
+ build:
19
+ runs-on: ubuntu-latest
20
+ steps:
21
+ - uses: actions/checkout@v4
22
+ - uses: oven-sh/setup-bun@v2
23
+ - name: Install
24
+ working-directory: js
25
+ run: bun install --frozen-lockfile
26
+ - name: Build demo
27
+ working-directory: js
28
+ run: bun run build:demo
29
+ - uses: actions/configure-pages@v5
30
+ - uses: actions/upload-pages-artifact@v3
31
+ with:
32
+ path: js/demo/dist
33
+
34
+ deploy:
35
+ needs: build
36
+ runs-on: ubuntu-latest
37
+ environment:
38
+ name: github-pages
39
+ url: ${{ steps.deployment.outputs.page_url }}
40
+ steps:
41
+ - id: deployment
42
+ uses: actions/deploy-pages@v4
@@ -0,0 +1,20 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *.egg-info/
4
+ .eggs/
5
+ build/
6
+ dist/
7
+ .venv/
8
+ venv/
9
+ .pytest_cache/
10
+ .ruff_cache/
11
+ .mypy_cache/
12
+ .DS_Store
13
+ *.so
14
+
15
+ # manual-comparison artifacts (regenerable / user data)
16
+ compare/plots/
17
+ compare/submissions/
18
+
19
+ # node / TS port
20
+ node_modules/
unplot-0.0.1/PKG-INFO ADDED
@@ -0,0 +1,163 @@
1
+ Metadata-Version: 2.4
2
+ Name: unplot
3
+ Version: 0.0.1
4
+ Summary: Vector-PDF-native plot extraction with automatic multi-curve crossing separation and a first-class QA report.
5
+ Author: Max Ingham
6
+ License: MIT License
7
+
8
+ Copyright (c) 2026 Max Ingham
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+ License-File: LICENSE
28
+ Keywords: curve,datasheet,digitizer,extraction,pdf,plot,vector
29
+ Requires-Python: >=3.10
30
+ Requires-Dist: numpy>=1.24
31
+ Requires-Dist: pymupdf>=1.23
32
+ Provides-Extra: raster
33
+ Requires-Dist: opencv-python-headless>=4.8; extra == 'raster'
34
+ Requires-Dist: pillow>=10.0; extra == 'raster'
35
+ Provides-Extra: test
36
+ Requires-Dist: pytest>=7.0; extra == 'test'
37
+ Description-Content-Type: text/markdown
38
+
39
+ # unplot
40
+
41
+ **Read curves off a plot, straight from the vector PDF, with overlapping curves pulled apart and a confidence score on each.**
42
+
43
+ **[Try the live demo →](https://somnai-dreams.github.io/unplot/)** — drop a vector PDF in your browser; nothing is uploaded.
44
+
45
+ unplot is a headless library (Python and TypeScript) for digitizing line and curve plots. Interactive tools like WebPlotDigitizer, Engauge, and PlotDigitizer want you to load an image and click to calibrate the axes. unplot runs unattended instead, and does two things those tools don't:
46
+
47
+ - **Vector-PDF-native.** When a PDF's curves are real drawn paths, unplot reads the exact geometry out of the file. No rasterizing, no pixel re-detection, no clicking. The numbers you get back are the numbers that were drawn.
48
+ - **Automatic crossing separation.** Where curves overlap and cross, it keeps them apart: by stroke style or colour for vector paths, by hue for colour-coded raster images, or by de-fanning and re-chaining a single pen stroke that traces several lobes at once.
49
+
50
+ Every result also carries a **confidence report**, per curve and for the set: shape-prior violations, calibration fit, coverage gaps, crossings. If a curve is suspect, the report says so.
51
+
52
+ ## Quick start
53
+
54
+ Python:
55
+
56
+ ```python
57
+ from unplot import extract, Lobe
58
+
59
+ cs = extract("plot.pdf", frame=(60, 40, 360, 240),
60
+ prior=Lobe(0.08), expected_curves=3, order_by="peak-x")
61
+
62
+ for c in cs.curves:
63
+ print(c.id, c.points.shape, c.qa.confidence, c.qa.passed)
64
+
65
+ named = cs.labeled({0: "blue", 1: "green", 2: "red"}) # you map order to meaning
66
+ ```
67
+
68
+ TypeScript, in the browser, with no Python and no server (see [`js/`](js/)):
69
+
70
+ ```ts
71
+ import { extract, lobe } from "unplot"; // js/src/index.ts
72
+
73
+ const data = new Uint8Array(await (await fetch("plot.pdf")).arrayBuffer());
74
+ const cs = await extract(data, { expectedCurves: 3, prior: lobe(0.08), orderBy: "peak-x" });
75
+ cs.curves.forEach(c => console.log(c.id, c.style.color, c.qa.confidence));
76
+ ```
77
+
78
+ [`js/`](js/) also has a drop-a-PDF demo: it renders the page, overlays the recovered curves, shows the QA, and exports CSV. The bytes never leave the page.
79
+
80
+ ## What you get back
81
+
82
+ `extract(plot) -> CurveSet`. One plot region in, one CurveSet out:
83
+
84
+ - **calibrated arrays** per curve (`points`, in your axis units), plus source-space points for overlay
85
+ - **neutral identity.** Curves come back as `c0, c1, …`, ordered deterministically by peak-x, mean-y, or first-x. unplot knows nothing about your domain; you map order to meaning.
86
+ - **a QA report**: `CurveQA` per curve, `CurveSetQA` for the set.
87
+
88
+ ## Shape priors
89
+
90
+ The one thing you supply is the expected shape of your curves, as a `ShapePrior`. It scores confidence, and it breaks ties at crossings by picking the continuation that keeps each curve on its own flank. Shipped priors:
91
+
92
+ - `Free`: no assumption, raw geometry.
93
+ - `Monotone(direction)`: the curve only goes one way.
94
+ - `Lobe(tolerance)`: approximately unimodal (rise to a peak, then fall). `tolerance` is how much flank reversal is allowed before a curve is flagged, not a demand for strict unimodality, so real shoulders and double-humps survive.
95
+ - `Smooth`: penalise spikes.
96
+
97
+ Write your own by implementing `qa_violation` and `separation_bias`.
98
+
99
+ ## Scope
100
+
101
+ Supported: vector-PDF ingest (exact geometry) and raster images (pixel fallback); linear axis calibration from numeric tick labels, robust to stray labels leaking in from a neighbouring plot; crossing separation under a shape prior; the QA report.
102
+
103
+ Not yet, and PRs welcome:
104
+
105
+ - **Log-spaced axes.** Calibration is linear pt-to-value. Axes whose *values* are log quantities work fine when they're drawn linearly; genuinely log-*spaced* axes are not supported.
106
+ - **Many identically-styled curves on one plot.** With no colour or dash to tell them apart (e.g. a five-curve grey density plot), the separator can merge overlapping ones. The merge is detected — a `roughness` check, total y-variation over y-range — and the tangle is dropped with a warning rather than returned as a curve; cleanly-separated curves on the same plot still come back.
107
+ - **Heavily occluded mono scans.** Colour curves separate by hue; single-colour curves separate by continuity and the shape prior, which handles clean crossings. A low-resolution mono *scan* with a weak curve buried under a stronger one can mis-assign, and the QA reports it as low confidence. Use vector ingest or a colour-coded source there.
108
+ - Broken axis boxes, legends drawn over curves, exotic scales, automatic frame detection for raster scans.
109
+
110
+ ## Accuracy
111
+
112
+ Peak error on synthetic ground truth, in x-axis units (the plots span x 400 to 700). From `bench/benchmark.py`, seed-fixed, 20 randomized plots per tier:
113
+
114
+ | tier | median | p90 |
115
+ |-|-|-|
116
+ | vector, 3–4 curves, separated or crossing | 0.5–0.9 | 1.3–1.5 |
117
+ | raster colour, 3 crossing (with or without noise) | 1.1 | 1.5 |
118
+ | raster mono, 3 crossing | 1.0 | 1.6 |
119
+ | raster mono, 3 occluded | 1.3 | 6.3 |
120
+
121
+ Vector is essentially exact: it reads drawn geometry, so crossings and curve count don't move the number. Raster is tight on synthetic plots, around 1 unit median, with an occluded lobe (one weak curve buried under its neighbours) the harder synthetic case. The real-world hard case, a low-resolution mono scan with heavy occlusion, still degrades, and there the QA flags it rather than returning a confident wrong answer.
122
+
123
+ The vector path also reproduces reference extractions of real PDFs numerically: a dash/colour-keyed plot of three overlapping curves (every peak matched exactly, peak-normalised shape RMS 0.000), and a continuous-stroke polyline carrying a real double-hump (peaks within 8 units, the substructure preserved rather than flattened).
124
+
125
+ ## How it compares
126
+
127
+ unplot is narrower than the established tools on purpose: line and curve plots, headless, vector-native, self-reporting its own confidence, built for batch and automation rather than interactive correction. WebPlotDigitizer and Engauge are mature GUIs that cover many chart types and let a human fix mistakes live. To digitize one chart by hand, use them.
128
+
129
+ | capability | unplot | WebPlotDigitizer | Engauge | PlotDigitizer |
130
+ |-|-|-|-|-|
131
+ | reads vector-PDF geometry | yes | no (image) | no (image) | no (image) |
132
+ | automatic axis calibration | yes (vector) | click axes | click axes | click axes |
133
+ | crossing separation | hue or shape prior | colour / layers | guided | manual / auto |
134
+ | QA / confidence output | yes | no | no | no |
135
+ | headless, scriptable batch | yes | no (web app) | no (GUI) | no |
136
+ | other chart types (polar, bar…) | no | yes | yes | yes |
137
+ | license | MIT | AGPL-3.0 | GPL | proprietary |
138
+
139
+ On clean colour plots a head-to-head against the real WebPlotDigitizer 4.8 (its own auto-extraction, with an exact calibration set through its API) is a dead heat: both land around 0.5 to 1 unit, because both do a colour mask plus per-column averaging. unplot pulls ahead on vector PDFs, which WPD can't read at all; on noise, where unplot's denoising rode out a case that WPD's fixed threshold tripped on; and on frame removal, since WPD grabbed the black plot frame until a region was masked. WPD is the only incumbent benchmarked here. Engauge is a desktop GUI and PlotDigitizer is login-gated, so neither can be batch-run.
140
+
141
+ ## Install
142
+
143
+ Python:
144
+
145
+ ```bash
146
+ pip install unplot # vector-PDF core + numpy
147
+ pip install "unplot[raster]" # adds opencv/pillow for the raster fallback
148
+ ```
149
+
150
+ TypeScript: the port lives in [`js/`](js/). Run `bun install` there; the only runtime dependency is pdf.js (`pdfjs-dist`, Apache-2.0).
151
+
152
+ ## Tests
153
+
154
+ ```bash
155
+ pip install "unplot[test]" && pytest # Python
156
+ cd js && bun test # TypeScript
157
+ ```
158
+
159
+ Both suites build synthetic plots with known ground truth and assert the recovered curves and calibration match. No copyrighted source documents are redistributed.
160
+
161
+ ## License
162
+
163
+ MIT, see [LICENSE](LICENSE).
unplot-0.0.1/README.md ADDED
@@ -0,0 +1,125 @@
1
+ # unplot
2
+
3
+ **Read curves off a plot, straight from the vector PDF, with overlapping curves pulled apart and a confidence score on each.**
4
+
5
+ **[Try the live demo →](https://somnai-dreams.github.io/unplot/)** — drop a vector PDF in your browser; nothing is uploaded.
6
+
7
+ unplot is a headless library (Python and TypeScript) for digitizing line and curve plots. Interactive tools like WebPlotDigitizer, Engauge, and PlotDigitizer want you to load an image and click to calibrate the axes. unplot runs unattended instead, and does two things those tools don't:
8
+
9
+ - **Vector-PDF-native.** When a PDF's curves are real drawn paths, unplot reads the exact geometry out of the file. No rasterizing, no pixel re-detection, no clicking. The numbers you get back are the numbers that were drawn.
10
+ - **Automatic crossing separation.** Where curves overlap and cross, it keeps them apart: by stroke style or colour for vector paths, by hue for colour-coded raster images, or by de-fanning and re-chaining a single pen stroke that traces several lobes at once.
11
+
12
+ Every result also carries a **confidence report**, per curve and for the set: shape-prior violations, calibration fit, coverage gaps, crossings. If a curve is suspect, the report says so.
13
+
14
+ ## Quick start
15
+
16
+ Python:
17
+
18
+ ```python
19
+ from unplot import extract, Lobe
20
+
21
+ cs = extract("plot.pdf", frame=(60, 40, 360, 240),
22
+ prior=Lobe(0.08), expected_curves=3, order_by="peak-x")
23
+
24
+ for c in cs.curves:
25
+ print(c.id, c.points.shape, c.qa.confidence, c.qa.passed)
26
+
27
+ named = cs.labeled({0: "blue", 1: "green", 2: "red"}) # you map order to meaning
28
+ ```
29
+
30
+ TypeScript, in the browser, with no Python and no server (see [`js/`](js/)):
31
+
32
+ ```ts
33
+ import { extract, lobe } from "unplot"; // js/src/index.ts
34
+
35
+ const data = new Uint8Array(await (await fetch("plot.pdf")).arrayBuffer());
36
+ const cs = await extract(data, { expectedCurves: 3, prior: lobe(0.08), orderBy: "peak-x" });
37
+ cs.curves.forEach(c => console.log(c.id, c.style.color, c.qa.confidence));
38
+ ```
39
+
40
+ [`js/`](js/) also has a drop-a-PDF demo: it renders the page, overlays the recovered curves, shows the QA, and exports CSV. The bytes never leave the page.
41
+
42
+ ## What you get back
43
+
44
+ `extract(plot) -> CurveSet`. One plot region in, one CurveSet out:
45
+
46
+ - **calibrated arrays** per curve (`points`, in your axis units), plus source-space points for overlay
47
+ - **neutral identity.** Curves come back as `c0, c1, …`, ordered deterministically by peak-x, mean-y, or first-x. unplot knows nothing about your domain; you map order to meaning.
48
+ - **a QA report**: `CurveQA` per curve, `CurveSetQA` for the set.
49
+
50
+ ## Shape priors
51
+
52
+ The one thing you supply is the expected shape of your curves, as a `ShapePrior`. It scores confidence, and it breaks ties at crossings by picking the continuation that keeps each curve on its own flank. Shipped priors:
53
+
54
+ - `Free`: no assumption, raw geometry.
55
+ - `Monotone(direction)`: the curve only goes one way.
56
+ - `Lobe(tolerance)`: approximately unimodal (rise to a peak, then fall). `tolerance` is how much flank reversal is allowed before a curve is flagged, not a demand for strict unimodality, so real shoulders and double-humps survive.
57
+ - `Smooth`: penalise spikes.
58
+
59
+ Write your own by implementing `qa_violation` and `separation_bias`.
60
+
61
+ ## Scope
62
+
63
+ Supported: vector-PDF ingest (exact geometry) and raster images (pixel fallback); linear axis calibration from numeric tick labels, robust to stray labels leaking in from a neighbouring plot; crossing separation under a shape prior; the QA report.
64
+
65
+ Not yet, and PRs welcome:
66
+
67
+ - **Log-spaced axes.** Calibration is linear pt-to-value. Axes whose *values* are log quantities work fine when they're drawn linearly; genuinely log-*spaced* axes are not supported.
68
+ - **Many identically-styled curves on one plot.** With no colour or dash to tell them apart (e.g. a five-curve grey density plot), the separator can merge overlapping ones. The merge is detected — a `roughness` check, total y-variation over y-range — and the tangle is dropped with a warning rather than returned as a curve; cleanly-separated curves on the same plot still come back.
69
+ - **Heavily occluded mono scans.** Colour curves separate by hue; single-colour curves separate by continuity and the shape prior, which handles clean crossings. A low-resolution mono *scan* with a weak curve buried under a stronger one can mis-assign, and the QA reports it as low confidence. Use vector ingest or a colour-coded source there.
70
+ - Broken axis boxes, legends drawn over curves, exotic scales, automatic frame detection for raster scans.
71
+
72
+ ## Accuracy
73
+
74
+ Peak error on synthetic ground truth, in x-axis units (the plots span x 400 to 700). From `bench/benchmark.py`, seed-fixed, 20 randomized plots per tier:
75
+
76
+ | tier | median | p90 |
77
+ |-|-|-|
78
+ | vector, 3–4 curves, separated or crossing | 0.5–0.9 | 1.3–1.5 |
79
+ | raster colour, 3 crossing (with or without noise) | 1.1 | 1.5 |
80
+ | raster mono, 3 crossing | 1.0 | 1.6 |
81
+ | raster mono, 3 occluded | 1.3 | 6.3 |
82
+
83
+ Vector is essentially exact: it reads drawn geometry, so crossings and curve count don't move the number. Raster is tight on synthetic plots, around 1 unit median, with an occluded lobe (one weak curve buried under its neighbours) the harder synthetic case. The real-world hard case, a low-resolution mono scan with heavy occlusion, still degrades, and there the QA flags it rather than returning a confident wrong answer.
84
+
85
+ The vector path also reproduces reference extractions of real PDFs numerically: a dash/colour-keyed plot of three overlapping curves (every peak matched exactly, peak-normalised shape RMS 0.000), and a continuous-stroke polyline carrying a real double-hump (peaks within 8 units, the substructure preserved rather than flattened).
86
+
87
+ ## How it compares
88
+
89
+ unplot is narrower than the established tools on purpose: line and curve plots, headless, vector-native, self-reporting its own confidence, built for batch and automation rather than interactive correction. WebPlotDigitizer and Engauge are mature GUIs that cover many chart types and let a human fix mistakes live. To digitize one chart by hand, use them.
90
+
91
+ | capability | unplot | WebPlotDigitizer | Engauge | PlotDigitizer |
92
+ |-|-|-|-|-|
93
+ | reads vector-PDF geometry | yes | no (image) | no (image) | no (image) |
94
+ | automatic axis calibration | yes (vector) | click axes | click axes | click axes |
95
+ | crossing separation | hue or shape prior | colour / layers | guided | manual / auto |
96
+ | QA / confidence output | yes | no | no | no |
97
+ | headless, scriptable batch | yes | no (web app) | no (GUI) | no |
98
+ | other chart types (polar, bar…) | no | yes | yes | yes |
99
+ | license | MIT | AGPL-3.0 | GPL | proprietary |
100
+
101
+ On clean colour plots a head-to-head against the real WebPlotDigitizer 4.8 (its own auto-extraction, with an exact calibration set through its API) is a dead heat: both land around 0.5 to 1 unit, because both do a colour mask plus per-column averaging. unplot pulls ahead on vector PDFs, which WPD can't read at all; on noise, where unplot's denoising rode out a case that WPD's fixed threshold tripped on; and on frame removal, since WPD grabbed the black plot frame until a region was masked. WPD is the only incumbent benchmarked here. Engauge is a desktop GUI and PlotDigitizer is login-gated, so neither can be batch-run.
102
+
103
+ ## Install
104
+
105
+ Python:
106
+
107
+ ```bash
108
+ pip install unplot # vector-PDF core + numpy
109
+ pip install "unplot[raster]" # adds opencv/pillow for the raster fallback
110
+ ```
111
+
112
+ TypeScript: the port lives in [`js/`](js/). Run `bun install` there; the only runtime dependency is pdf.js (`pdfjs-dist`, Apache-2.0).
113
+
114
+ ## Tests
115
+
116
+ ```bash
117
+ pip install "unplot[test]" && pytest # Python
118
+ cd js && bun test # TypeScript
119
+ ```
120
+
121
+ Both suites build synthetic plots with known ground truth and assert the recovered curves and calibration match. No copyrighted source documents are redistributed.
122
+
123
+ ## License
124
+
125
+ MIT, see [LICENSE](LICENSE).
@@ -0,0 +1,116 @@
1
+ """Accuracy benchmark: unplot extraction error across difficulty tiers on synthetic ground truth.
2
+
3
+ No competitor here — these are absolute numbers. Each tier generates K randomized plots with known peaks
4
+ (via PyMuPDF), extracts them, and reports the median and 90th-percentile per-curve peak error
5
+ for the tier. Deterministic (fixed seed). Run: python bench/benchmark.py
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import os
10
+ import tempfile
11
+
12
+ import fitz
13
+ import numpy as np
14
+
15
+ from unplot import Lobe, extract
16
+
17
+ FRAME = (60.0, 40.0, 360.0, 240.0)
18
+ XR, YR = (400.0, 700.0), (0.0, 2.0)
19
+ COLORS = [(0, 0, 1), (0, 0.6, 0), (0.9, 0, 0), (0.6, 0, 0.6)]
20
+
21
+
22
+ def x2px(x: float) -> float:
23
+ return FRAME[0] + (x - XR[0]) / (XR[1] - XR[0]) * (FRAME[2] - FRAME[0])
24
+
25
+
26
+ def v2py(v: float) -> float:
27
+ return FRAME[3] - (v - YR[0]) / (YR[1] - YR[0]) * (FRAME[3] - FRAME[1])
28
+
29
+
30
+ def make_pdf(peaks, sigma, amp, path, mono=False):
31
+ doc = fitz.open()
32
+ pg = doc.new_page(width=420, height=300)
33
+ nm = np.arange(400.0, 700.01, 2.0)
34
+ for k, (pk, a, s) in enumerate(zip(peaks, amp, sigma)):
35
+ v = a * np.exp(-0.5 * ((nm - pk) / s) ** 2)
36
+ pts = [fitz.Point(x2px(x), v2py(val)) for x, val in zip(nm, v)]
37
+ col = (0, 0, 0) if mono else COLORS[k % 4]
38
+ sh = pg.new_shape(); sh.draw_polyline(pts); sh.finish(color=col, width=1.4, closePath=False); sh.commit()
39
+ for lbl in (400, 500, 600, 700):
40
+ pg.insert_text(fitz.Point(x2px(lbl) - 6, FRAME[3] + 12), str(lbl), fontsize=8)
41
+ for lbl in (0, 1, 2):
42
+ pg.insert_text(fitz.Point(FRAME[0] - 16, v2py(lbl) + 3), str(lbl), fontsize=8)
43
+ doc.save(path); doc.close()
44
+
45
+
46
+ def render(path, dpi, noise, rng):
47
+ pix = fitz.open(path)[0].get_pixmap(dpi=dpi)
48
+ img = np.frombuffer(pix.samples, np.uint8).reshape(pix.height, pix.width, pix.n)[..., :3].copy()
49
+ if noise > 0:
50
+ img = np.clip(img.astype(float) + rng.normal(0, noise, img.shape), 0, 255).astype(np.uint8)
51
+ return img
52
+
53
+
54
+ def gen_peaks(n, mode, rng):
55
+ # fixed lobe width across tiers (so only spacing/amplitude vary, not the argmax-jitter from width)
56
+ sigma = [26.0] * n
57
+ spacing = 95.0 if mode == "separated" else 68.0 # crossing/occluded pack the lobes closer
58
+ start = rng.uniform(440, 460)
59
+ peaks = sorted(min(start + i * rng.uniform(spacing - 8, spacing + 8), 686.0) for i in range(n))
60
+ amp = [rng.uniform(1.5, 1.9) for _ in range(n)]
61
+ if mode == "occluded": # one weak lobe buried under its taller neighbours
62
+ amp[int(rng.integers(0, n))] *= 0.4
63
+ return peaks, sigma, amp
64
+
65
+
66
+ def peak_errors(cs, truth):
67
+ """Per-curve |recovered peak - true peak| (nm), pairing both sorted by peak. expected_curves is set so
68
+ the count always matches; the error distribution is the real quality signal."""
69
+ rec = sorted(float(c.points[np.argmax(c.points[:, 1]), 0]) for c in cs.curves)
70
+ return [abs(a - b) for a, b in zip(rec, sorted(truth))]
71
+
72
+
73
+ TIERS = [
74
+ ("vector · 3 separated", dict(ingest="vector", n=3, mode="separated")),
75
+ ("vector · 3 crossing", dict(ingest="vector", n=3, mode="crossing")),
76
+ ("vector · 4 crossing", dict(ingest="vector", n=4, mode="crossing")),
77
+ ("raster colour · 3 crossing", dict(ingest="raster", n=3, mode="crossing", dpi=200, noise=0, mono=False)),
78
+ ("raster colour · 3 crossing · noisy", dict(ingest="raster", n=3, mode="crossing", dpi=200, noise=18, mono=False)),
79
+ ("raster mono · 3 crossing", dict(ingest="raster", n=3, mode="crossing", dpi=200, noise=0, mono=True)),
80
+ ("raster mono · 3 occluded", dict(ingest="raster", n=3, mode="occluded", dpi=200, noise=0, mono=True)),
81
+ ]
82
+ K = 20
83
+
84
+
85
+ def run_tier(cfg, rng, tmp):
86
+ errs = []
87
+ for t in range(K):
88
+ peaks, sigma, amp = gen_peaks(cfg["n"], cfg["mode"], rng)
89
+ p = os.path.join(tmp, f"p{t}.pdf"); make_pdf(peaks, sigma, amp, p, cfg.get("mono", False))
90
+ if cfg["ingest"] == "vector":
91
+ cs = extract(p, frame=FRAME, prior=Lobe(0.08), expected_curves=cfg["n"], order_by="peak-x")
92
+ else:
93
+ s = cfg["dpi"] / 72.0
94
+ img = render(p, cfg["dpi"], cfg["noise"], rng)
95
+ cs = extract(img, ingest="raster", frame=tuple(v * s for v in FRAME),
96
+ x_axis=[(x2px(400) * s, 400.0), (x2px(700) * s, 700.0)],
97
+ y_axis=[(v2py(0) * s, 0.0), (v2py(2) * s, 2.0)],
98
+ prior=Lobe(0.08), expected_curves=cfg["n"], order_by="peak-x")
99
+ errs += peak_errors(cs, peaks)
100
+ return errs
101
+
102
+
103
+ def main():
104
+ rng = np.random.default_rng(7)
105
+ tmp = tempfile.mkdtemp()
106
+ print(f"unplot accuracy benchmark (K={K} plots/tier, synthetic ground truth, seed 7)")
107
+ print("peak error per curve, nm (vector = exact geometry; raster = pixel floor)\n")
108
+ print(f"{'tier':32}{'median':>10}{'p90':>10}")
109
+ print("-" * 52)
110
+ for name, cfg in TIERS:
111
+ errs = np.array(run_tier(cfg, rng, tmp))
112
+ print(f"{name:32}{np.median(errs):>8.1f} {np.percentile(errs, 90):>8.1f}")
113
+
114
+
115
+ if __name__ == "__main__":
116
+ main()
@@ -0,0 +1,52 @@
1
+ # Manual head-to-head: digitize these plots in WebPlotDigitizer / Engauge
2
+
3
+ unplot runs automatically in the scorer. The interactive tools (WPD, Engauge) can't be scripted, so this
4
+ is the one part that needs a human. Do as many plots/tools as you have patience for — even one is useful.
5
+
6
+ ## The plots (`compare/plots/`)
7
+
8
+ Each plot is provided as a **PNG** (load this into the GUI tools) and a **PDF** (unplot reads this for its
9
+ vector path). Same axes on all: **x = 400…700, y = 0…2**, with tick labels drawn for calibration.
10
+
11
+ |file|curves|colours (left→right by peak)|
12
+ |-|-|-|
13
+ |`p1_single`|1|black|
14
+ |`p2_three_separated`|3|blue, green, red|
15
+ |`p3_three_crossing`|3|blue, green, red|
16
+ |`p4_three_noisy`|3|blue, green, red (with image noise)|
17
+
18
+ ## Steps (per plot, per tool)
19
+
20
+ 1. Load the **PNG** into WebPlotDigitizer ([apps.automeris.io](https://apps.automeris.io)) or Engauge.
21
+ 2. **Calibrate the axes** by clicking known points: on the x-axis click the **400** and **700** ticks; on
22
+ the y-axis click the **0** and **2** ticks. Enter those values when prompted.
23
+ 3. **Digitize each curve.** For the 3-curve plots, do each colour separately (WPD: add a dataset per colour
24
+ and use *Automatic Extraction → Color*; Engauge: a curve per colour with *Segment Fill*). Use the
25
+ automatic tracer where it works — that's the tool's strength.
26
+ 4. **Export** each curve as CSV (two columns: x then y, in data units).
27
+
28
+ ## Where to put the exports
29
+
30
+ Save one CSV per curve, named `<plot>__c<index>.csv`, indexed **left→right by peak** (so c0 = blue, c1 =
31
+ green, c2 = red), under the tool's folder:
32
+
33
+ ```
34
+ compare/submissions/wpd/p2_three_separated__c0.csv
35
+ compare/submissions/wpd/p2_three_separated__c1.csv
36
+ compare/submissions/engauge/p3_three_crossing__c0.csv
37
+ ...
38
+ ```
39
+
40
+ CSV format is forgiving — comma or whitespace separated, header lines are skipped.
41
+
42
+ ## Score it
43
+
44
+ ```bash
45
+ python compare/score.py
46
+ ```
47
+
48
+ It runs unplot (vector + raster) on every plot and folds in whatever competitor CSVs you've added,
49
+ reporting mean peak error and mean y-RMS per curve against ground truth (in x/y axis units). Re-run
50
+ anytime you add more.
51
+
52
+ (Regenerate the plots with `python compare/make_plots.py` if needed — deterministic, no randomness.)