table-stitcher 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- table_stitcher-0.3.0/.claude/settings.json +76 -0
- table_stitcher-0.3.0/.github/ISSUE_TEMPLATE/bug_report.yml +90 -0
- table_stitcher-0.3.0/.github/ISSUE_TEMPLATE/config.yml +8 -0
- table_stitcher-0.3.0/.github/ISSUE_TEMPLATE/feature_request.yml +49 -0
- table_stitcher-0.3.0/.github/dependabot.yml +27 -0
- table_stitcher-0.3.0/.github/pull_request_template.md +15 -0
- table_stitcher-0.3.0/.github/workflows/ci.yml +119 -0
- table_stitcher-0.3.0/.github/workflows/release.yml +52 -0
- table_stitcher-0.3.0/.github/workflows/upstream-smoke.yml +46 -0
- table_stitcher-0.3.0/.gitignore +45 -0
- table_stitcher-0.3.0/.pre-commit-config.yaml +31 -0
- table_stitcher-0.3.0/CHANGELOG.md +128 -0
- table_stitcher-0.3.0/CONTRIBUTING.md +243 -0
- table_stitcher-0.3.0/LICENSE +21 -0
- table_stitcher-0.3.0/PKG-INFO +392 -0
- table_stitcher-0.3.0/README.md +354 -0
- table_stitcher-0.3.0/SECURITY.md +31 -0
- table_stitcher-0.3.0/examples/basic_pipeline.py +72 -0
- table_stitcher-0.3.0/examples/system_controller.py +86 -0
- table_stitcher-0.3.0/pyproject.toml +101 -0
- table_stitcher-0.3.0/scripts/regenerate_docling_snapshots.py +119 -0
- table_stitcher-0.3.0/scripts/release_gate.sh +86 -0
- table_stitcher-0.3.0/src/table_stitcher/__init__.py +340 -0
- table_stitcher-0.3.0/src/table_stitcher/adapters/README.md +173 -0
- table_stitcher-0.3.0/src/table_stitcher/adapters/__init__.py +11 -0
- table_stitcher-0.3.0/src/table_stitcher/adapters/base.py +42 -0
- table_stitcher-0.3.0/src/table_stitcher/adapters/docling.py +797 -0
- table_stitcher-0.3.0/src/table_stitcher/merger.py +979 -0
- table_stitcher-0.3.0/src/table_stitcher/models.py +145 -0
- table_stitcher-0.3.0/src/table_stitcher/py.typed +0 -0
- table_stitcher-0.3.0/tests/README.md +135 -0
- table_stitcher-0.3.0/tests/__init__.py +0 -0
- table_stitcher-0.3.0/tests/fixtures/tablemeta/headerless-width-drift.yaml +26 -0
- table_stitcher-0.3.0/tests/integration/__init__.py +0 -0
- table_stitcher-0.3.0/tests/integration/_tools/__init__.py +0 -0
- table_stitcher-0.3.0/tests/integration/_tools/regenerate_expected.py +173 -0
- table_stitcher-0.3.0/tests/integration/conftest.py +362 -0
- table_stitcher-0.3.0/tests/integration/fixtures/_synth/__init__.py +0 -0
- table_stitcher-0.3.0/tests/integration/fixtures/_synth/generate.py +172 -0
- table_stitcher-0.3.0/tests/integration/fixtures/distinct-tables-no-merge/.gitkeep +0 -0
- table_stitcher-0.3.0/tests/integration/fixtures/distinct-tables-no-merge/kaop-study-mixed-3pg.pt2.docling.json +1 -0
- table_stitcher-0.3.0/tests/integration/fixtures/distinct-tables-no-merge/kaop-study-mixed-3pg.pt2.expected.yaml +82 -0
- table_stitcher-0.3.0/tests/integration/fixtures/distinct-tables-no-merge/kaop-study-mixed-3pg.pt2.pdf +0 -0
- table_stitcher-0.3.0/tests/integration/fixtures/distinct-tables-no-merge/lab-panels-3pg.corp.docling.json +4180 -0
- table_stitcher-0.3.0/tests/integration/fixtures/distinct-tables-no-merge/lab-panels-3pg.corp.expected.yaml +74 -0
- table_stitcher-0.3.0/tests/integration/fixtures/distinct-tables-no-merge/lab-panels-3pg.corp.pdf +130 -0
- table_stitcher-0.3.0/tests/integration/fixtures/false-merge/category-rows-thematic-2pg.corp.docling.json +2413 -0
- table_stitcher-0.3.0/tests/integration/fixtures/false-merge/category-rows-thematic-2pg.corp.expected.yaml +30 -0
- table_stitcher-0.3.0/tests/integration/fixtures/false-merge/category-rows-thematic-2pg.corp.pdf +105 -0
- table_stitcher-0.3.0/tests/integration/fixtures/headerless-continuation/15-page-druglist.corp.docling.json +1 -0
- table_stitcher-0.3.0/tests/integration/fixtures/headerless-continuation/15-page-druglist.corp.expected.yaml +51 -0
- table_stitcher-0.3.0/tests/integration/fixtures/headerless-continuation/15-page-druglist.corp.pdf +0 -0
- table_stitcher-0.3.0/tests/integration/fixtures/headerless-continuation/gene-symbols-6pg.pt2.docling.json +1 -0
- table_stitcher-0.3.0/tests/integration/fixtures/headerless-continuation/gene-symbols-6pg.pt2.expected.yaml +43 -0
- table_stitcher-0.3.0/tests/integration/fixtures/headerless-continuation/gene-symbols-6pg.pt2.pdf +0 -0
- table_stitcher-0.3.0/tests/integration/fixtures/headerless-continuation/rct-study-table-5pg.pt2.docling.json +1 -0
- table_stitcher-0.3.0/tests/integration/fixtures/headerless-continuation/rct-study-table-5pg.pt2.expected.yaml +47 -0
- table_stitcher-0.3.0/tests/integration/fixtures/headerless-continuation/rct-study-table-5pg.pt2.pdf +0 -0
- table_stitcher-0.3.0/tests/integration/fixtures/headerless-continuation/rrt-outcomes-2pg.pt2.docling.json +1 -0
- table_stitcher-0.3.0/tests/integration/fixtures/headerless-continuation/rrt-outcomes-2pg.pt2.expected.yaml +38 -0
- table_stitcher-0.3.0/tests/integration/fixtures/headerless-continuation/rrt-outcomes-2pg.pt2.pdf +0 -0
- table_stitcher-0.3.0/tests/integration/fixtures/headerless-continuation/uveitis-case-series-5pg.pt2.docling.json +1 -0
- table_stitcher-0.3.0/tests/integration/fixtures/headerless-continuation/uveitis-case-series-5pg.pt2.expected.yaml +48 -0
- table_stitcher-0.3.0/tests/integration/fixtures/headerless-continuation/uveitis-case-series-5pg.pt2.pdf +0 -0
- table_stitcher-0.3.0/tests/integration/fixtures/inconsistent-header-detection/covid-misc-labs-4pg.pt2.docling.json +1 -0
- table_stitcher-0.3.0/tests/integration/fixtures/inconsistent-header-detection/covid-misc-labs-4pg.pt2.expected.yaml +96 -0
- table_stitcher-0.3.0/tests/integration/fixtures/inconsistent-header-detection/covid-misc-labs-4pg.pt2.pdf +0 -0
- table_stitcher-0.3.0/tests/integration/fixtures/inconsistent-header-detection/lit-review-3pg.pt2.docling.json +1 -0
- table_stitcher-0.3.0/tests/integration/fixtures/inconsistent-header-detection/lit-review-3pg.pt2.expected.yaml +61 -0
- table_stitcher-0.3.0/tests/integration/fixtures/inconsistent-header-detection/lit-review-3pg.pt2.pdf +0 -0
- table_stitcher-0.3.0/tests/integration/fixtures/inconsistent-header-detection/retirement-portfolio.corp.docling.json +1 -0
- table_stitcher-0.3.0/tests/integration/fixtures/inconsistent-header-detection/retirement-portfolio.corp.expected.yaml +37 -0
- table_stitcher-0.3.0/tests/integration/fixtures/inconsistent-header-detection/retirement-portfolio.corp.pdf +0 -0
- table_stitcher-0.3.0/tests/integration/fixtures/loose-header-layout/.gitkeep +0 -0
- table_stitcher-0.3.0/tests/integration/fixtures/loose-header-layout/biological-process-2pg.pt2.docling.json +1 -0
- table_stitcher-0.3.0/tests/integration/fixtures/loose-header-layout/biological-process-2pg.pt2.expected.yaml +32 -0
- table_stitcher-0.3.0/tests/integration/fixtures/loose-header-layout/biological-process-2pg.pt2.pdf +0 -0
- table_stitcher-0.3.0/tests/integration/fixtures/loose-header-layout/symptoms-mediators-4pg.pt2.docling.json +1 -0
- table_stitcher-0.3.0/tests/integration/fixtures/loose-header-layout/symptoms-mediators-4pg.pt2.expected.yaml +27 -0
- table_stitcher-0.3.0/tests/integration/fixtures/loose-header-layout/symptoms-mediators-4pg.pt2.pdf +0 -0
- table_stitcher-0.3.0/tests/integration/fixtures/multilingual/corporate-history-2pg.edinet.docling.json +1 -0
- table_stitcher-0.3.0/tests/integration/fixtures/multilingual/corporate-history-2pg.edinet.expected.yaml +23 -0
- table_stitcher-0.3.0/tests/integration/fixtures/multilingual/corporate-history-2pg.edinet.pdf +0 -0
- table_stitcher-0.3.0/tests/integration/fixtures/multilingual/subsidiaries-4pg.edinet.docling.json +1 -0
- table_stitcher-0.3.0/tests/integration/fixtures/multilingual/subsidiaries-4pg.edinet.expected.yaml +41 -0
- table_stitcher-0.3.0/tests/integration/fixtures/multilingual/subsidiaries-4pg.edinet.pdf +0 -0
- table_stitcher-0.3.0/tests/integration/fixtures/orphan-pair/.gitkeep +0 -0
- table_stitcher-0.3.0/tests/integration/fixtures/orphan-pair/varicose-veins-new-table-header-7pg.pt2.docling.json +1 -0
- table_stitcher-0.3.0/tests/integration/fixtures/orphan-pair/varicose-veins-new-table-header-7pg.pt2.expected.yaml +63 -0
- table_stitcher-0.3.0/tests/integration/fixtures/orphan-pair/varicose-veins-new-table-header-7pg.pt2.pdf +0 -0
- table_stitcher-0.3.0/tests/integration/fixtures/page-gap-too-large/unrelated-tables-gap4.synth.docling.json +1 -0
- table_stitcher-0.3.0/tests/integration/fixtures/page-gap-too-large/unrelated-tables-gap4.synth.expected.yaml +25 -0
- table_stitcher-0.3.0/tests/integration/fixtures/page-gap-too-large/unrelated-tables-gap4.synth.pdf +131 -0
- table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/4-page-substance-list.corp.docling.json +1 -0
- table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/4-page-substance-list.corp.expected.yaml +61 -0
- table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/4-page-substance-list.corp.pdf +0 -0
- table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/cell-markers-4pg.pt2.docling.json +1 -0
- table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/cell-markers-4pg.pt2.expected.yaml +31 -0
- table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/cell-markers-4pg.pt2.pdf +0 -0
- table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/challenge-categories-2pg.pt2.docling.json +1 -0
- table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/challenge-categories-2pg.pt2.expected.yaml +36 -0
- table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/challenge-categories-2pg.pt2.pdf +0 -0
- table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/fungal-taxonomy-4pg.pt2.docling.json +1 -0
- table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/fungal-taxonomy-4pg.pt2.expected.yaml +42 -0
- table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/fungal-taxonomy-4pg.pt2.pdf +0 -0
- table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/rowspan-insurance-payout.corp.docling.json +1 -0
- table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/rowspan-insurance-payout.corp.expected.yaml +35 -0
- table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/rowspan-insurance-payout.corp.pdf +0 -0
- table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/search-strategies-2pg.pt2.docling.json +1 -0
- table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/search-strategies-2pg.pt2.expected.yaml +37 -0
- table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/search-strategies-2pg.pt2.pdf +0 -0
- table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/study-sample-7pg.pt2.docling.json +1 -0
- table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/study-sample-7pg.pt2.expected.yaml +48 -0
- table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/study-sample-7pg.pt2.pdf +0 -0
- table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/themes-exemplars-3pg.pt2.docling.json +1 -0
- table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/themes-exemplars-3pg.pt2.expected.yaml +35 -0
- table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/themes-exemplars-3pg.pt2.pdf +0 -0
- table_stitcher-0.3.0/tests/integration/fixtures/simple-continuation/sample-table.corp.docling.json +1 -0
- table_stitcher-0.3.0/tests/integration/fixtures/simple-continuation/sample-table.corp.expected.yaml +15 -0
- table_stitcher-0.3.0/tests/integration/fixtures/simple-continuation/sample-table.corp.pdf +0 -0
- table_stitcher-0.3.0/tests/integration/fixtures/spillover/note-overflow.synth.docling.json +1 -0
- table_stitcher-0.3.0/tests/integration/fixtures/spillover/note-overflow.synth.expected.yaml +47 -0
- table_stitcher-0.3.0/tests/integration/fixtures/spillover/note-overflow.synth.pdf +93 -0
- table_stitcher-0.3.0/tests/integration/fixtures/width-drift/.gitkeep +0 -0
- table_stitcher-0.3.0/tests/integration/fixtures/width-drift/abx-literature-review-7pg.pt2.docling.json +1 -0
- table_stitcher-0.3.0/tests/integration/fixtures/width-drift/abx-literature-review-7pg.pt2.expected.yaml +50 -0
- table_stitcher-0.3.0/tests/integration/fixtures/width-drift/abx-literature-review-7pg.pt2.pdf +0 -0
- table_stitcher-0.3.0/tests/integration/test_fixtures.py +54 -0
- table_stitcher-0.3.0/tests/test_docling_adapter.py +757 -0
- table_stitcher-0.3.0/tests/test_merger.py +791 -0
- table_stitcher-0.3.0/tests/test_public_api.py +83 -0
- table_stitcher-0.3.0/tests/test_tablemeta_fixtures.py +82 -0
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
{
|
|
2
|
+
"permissions": {
|
|
3
|
+
"allow": [
|
|
4
|
+
"Bash(python3 -c \"import docling, table_stitcher; print\\('ok'\\)\")",
|
|
5
|
+
"Bash(python3 -m pip show docling)",
|
|
6
|
+
"Bash(python3 -m pip show table_stitcher)",
|
|
7
|
+
"Bash(python3 -c \"import sys; print\\(sys.executable\\)\")",
|
|
8
|
+
"Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 -c \"import table_stitcher; print\\(table_stitcher.__file__\\); import docling; print\\('docling ok'\\)\")",
|
|
9
|
+
"Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 /tmp/inspect_testdocs.py)",
|
|
10
|
+
"Bash(tee /tmp/inspect_run.log)",
|
|
11
|
+
"Bash(python3 -c \"import yaml; print\\(yaml.__version__\\)\")",
|
|
12
|
+
"Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 -m pytest tests/integration/ -v)",
|
|
13
|
+
"Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 -m pytest tests/ -v)",
|
|
14
|
+
"Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 /tmp/gen_expected.py)",
|
|
15
|
+
"Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 -m pytest tests/ -v --no-header)",
|
|
16
|
+
"Bash(curl -L -o images.tar.gz --progress-bar \"https://huggingface.co/datasets/kensho/PubTables-v2/resolve/main/PubTables-v2_Full-Documents_test_images.tar.gz\")",
|
|
17
|
+
"Bash(curl -sL -o cross-page-pairs.tar.gz \"https://huggingface.co/datasets/kensho/PubTables-v2/resolve/main/PubTables-v2_Full-Documents_test_cross-page-table-pairs.tar.gz\")",
|
|
18
|
+
"Bash(curl -sL -o xml-annotations.tar.gz \"https://huggingface.co/datasets/kensho/PubTables-v2/resolve/main/PubTables-v2_Full-Documents_test_xml-annotations.tar.gz\")",
|
|
19
|
+
"Bash(curl -sL -o tables.tar.gz \"https://huggingface.co/datasets/kensho/PubTables-v2/resolve/main/PubTables-v2_Full-Documents_test_tables.tar.gz\")",
|
|
20
|
+
"Bash(curl -sL -o README.md \"https://huggingface.co/datasets/kensho/PubTables-v2/resolve/main/README.md\")",
|
|
21
|
+
"Bash(mkdir -p extracted)",
|
|
22
|
+
"Bash(tar -xzf ../cross-page-pairs.tar.gz)",
|
|
23
|
+
"Bash(tar -xzf ../xml-annotations.tar.gz)",
|
|
24
|
+
"Bash(tar -xzf ../tables.tar.gz)",
|
|
25
|
+
"Bash(awk -F/ '{print $4}')",
|
|
26
|
+
"Bash(python3 -c \"import json; d=json.load\\(open\\('Full Documents/test/tables/PMC10157558_tables.json'\\)\\); print\\(type\\(d\\)\\); print\\(json.dumps\\(d, indent=2\\)[:2000]\\)\")",
|
|
27
|
+
"Bash(python3 /tmp/select_multipage.py)",
|
|
28
|
+
"Bash(tar -tzf /Users/pebbleroad/Documents/table-stitcher/.temp/images.tar.gz)",
|
|
29
|
+
"Bash(python3 -c \"import img2pdf; print\\('img2pdf', img2pdf.__version__\\)\")",
|
|
30
|
+
"Bash(tar -xzf ../images.tar.gz)",
|
|
31
|
+
"Bash(python3 /tmp/bundle_pdfs.py)",
|
|
32
|
+
"Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 /tmp/inspect_pt2.py)",
|
|
33
|
+
"Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 /tmp/install_pt2_fixtures.py)",
|
|
34
|
+
"Bash(tee /tmp/install_output.log)",
|
|
35
|
+
"Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 -m pytest tests/)",
|
|
36
|
+
"Bash(python3 -c ' *)",
|
|
37
|
+
"Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 -m pytest tests/test_docling_adapter.py -v)",
|
|
38
|
+
"Bash(python3 /tmp/select_second_slice.py)",
|
|
39
|
+
"Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 /tmp/inspect_pt2_v2.py)",
|
|
40
|
+
"Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 /tmp/install_pt2_v2_fixtures.py)",
|
|
41
|
+
"Bash(tee /tmp/install_v2.log)",
|
|
42
|
+
"Bash(python3 -c \"import reportlab; print\\(reportlab.Version\\)\")",
|
|
43
|
+
"Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 -m tests.integration.fixtures._synth.generate)",
|
|
44
|
+
"Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 -m pytest tests/test_merger.py -v)",
|
|
45
|
+
"Bash(git rm *)",
|
|
46
|
+
"Bash(python3 -m pip show docling docling-core)",
|
|
47
|
+
"Bash(python3 -c \"import pypdf; print\\(pypdf.__version__\\)\")",
|
|
48
|
+
"Bash(python3 -c \"import pikepdf; print\\(pikepdf.__version__\\)\")",
|
|
49
|
+
"Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 -m pytest tests/integration/ -v -k \"edinet\")",
|
|
50
|
+
"Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 -m pytest tests/test_merger.py -v -k \"tokenize\")",
|
|
51
|
+
"Bash(python3 -m tests.integration._tools.regenerate_expected tests/integration/fixtures/multilingual/corporate-history-2pg.edinet.pdf --clear-xfail --description 'Two-page corporate history table from a Japanese EDINET filing. Both pages carry identical 2-column Japanese headers \\(年⽉ / 概要\\). Exercises the repeated-header Jaccard path against pure CJK text — merges because tokenize\\(\\) emits one token per CJK character, giving Jaccard 1.0 on identical headers.')",
|
|
52
|
+
"Bash(python3 -m tests.integration._tools.regenerate_expected tests/integration/fixtures/multilingual/subsidiaries-4pg.edinet.pdf --clear-xfail --description 'Four-page subsidiaries list from a Japanese EDINET filing. Three fragments \\(widths 6/6/7\\), all with identical 6-column Japanese headers \\(名称 / 住所 / 資本⾦⼜は出資⾦ / 主要な事業の内容 / 議決権の所有割合 / 関係内容\\). The width-7 final fragment is the summary-rows continuation of the same table. All three merge on CJK-aware header Jaccard.')",
|
|
53
|
+
"Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 -m pytest -m integration tests/integration/test_fixtures.py::test_fixture_stitches_as_expected -k \"varicose\")",
|
|
54
|
+
"Bash(python3 -m tests.integration._tools.regenerate_expected tests/integration/fixtures/orphan-pair/varicose-veins-new-table-header-7pg.pt2.pdf)",
|
|
55
|
+
"Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 -m pytest tests/ tests/ -m \"integration\")",
|
|
56
|
+
"Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 -m pytest -m integration tests/)",
|
|
57
|
+
"Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 -m pytest -m integration tests/integration/test_fixtures.py::test_fixture_stitches_as_expected -k \"substance-list or note-overflow\")",
|
|
58
|
+
"Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 -m pytest tests/test_docling_adapter.py::TestInjection::test_injection_failure_restores_partial_mutations -v)",
|
|
59
|
+
"Bash(scripts/release_gate.sh)",
|
|
60
|
+
"Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 -m pytest -m integration tests/integration/test_fixtures.py -k \"substance-list or note-overflow\")",
|
|
61
|
+
"Bash(python -c \"import sys; print\\(sys.version\\)\")",
|
|
62
|
+
"Bash(python3 -m pip show ruff)",
|
|
63
|
+
"Bash(ruff format *)",
|
|
64
|
+
"Bash(ruff check *)",
|
|
65
|
+
"Bash(python3 -m pytest -q)",
|
|
66
|
+
"Bash(python3 -m build --wheel --outdir /tmp/wheel-check)",
|
|
67
|
+
"Bash(unzip -l /tmp/wheel-check/*.whl)",
|
|
68
|
+
"Bash(python3 -m pytest tests/test_public_api.py -v)",
|
|
69
|
+
"Bash(python3 -m pytest tests/:*)",
|
|
70
|
+
"Bash(python3 -m pytest tests:*)",
|
|
71
|
+
"Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 -m pytest tests/:*)",
|
|
72
|
+
"Bash(python3 -m pytest -q:*)",
|
|
73
|
+
"Bash(python3 -m pytest -m:*)"
|
|
74
|
+
]
|
|
75
|
+
}
|
|
76
|
+
}
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
name: Bug report
|
|
2
|
+
description: A merge produced wrong output, threw an unexpected error, or behaved differently than documented.
|
|
3
|
+
labels: ["bug"]
|
|
4
|
+
body:
|
|
5
|
+
- type: markdown
|
|
6
|
+
attributes:
|
|
7
|
+
value: |
|
|
8
|
+
Thanks for filing a bug. The fields below are what we need to reproduce
|
|
9
|
+
the issue. The biggest unknown for a table-stitching bug is usually
|
|
10
|
+
*what the upstream parser produced* — please attach a minimal fixture
|
|
11
|
+
if you can.
|
|
12
|
+
|
|
13
|
+
- type: input
|
|
14
|
+
id: version
|
|
15
|
+
attributes:
|
|
16
|
+
label: table-stitcher version
|
|
17
|
+
description: Output of `python -c "import table_stitcher; print(table_stitcher.__version__)"`
|
|
18
|
+
placeholder: "0.2.0"
|
|
19
|
+
validations:
|
|
20
|
+
required: true
|
|
21
|
+
|
|
22
|
+
- type: input
|
|
23
|
+
id: python
|
|
24
|
+
attributes:
|
|
25
|
+
label: Python version
|
|
26
|
+
placeholder: "3.11.7"
|
|
27
|
+
validations:
|
|
28
|
+
required: true
|
|
29
|
+
|
|
30
|
+
- type: dropdown
|
|
31
|
+
id: parser
|
|
32
|
+
attributes:
|
|
33
|
+
label: Upstream parser
|
|
34
|
+
description: Which extractor produced the input tables?
|
|
35
|
+
options:
|
|
36
|
+
- Docling (built-in adapter)
|
|
37
|
+
- Custom adapter
|
|
38
|
+
- Other
|
|
39
|
+
validations:
|
|
40
|
+
required: true
|
|
41
|
+
|
|
42
|
+
- type: textarea
|
|
43
|
+
id: what-happened
|
|
44
|
+
attributes:
|
|
45
|
+
label: What happened
|
|
46
|
+
description: What did the merger produce, and what did you expect instead?
|
|
47
|
+
placeholder: |
|
|
48
|
+
Pages 4–5 of report.pdf contain one logical table split by a page break.
|
|
49
|
+
I expected `stitch_tables(doc)` to return one merged table; instead I
|
|
50
|
+
got two separate tables with the headers duplicated.
|
|
51
|
+
validations:
|
|
52
|
+
required: true
|
|
53
|
+
|
|
54
|
+
- type: textarea
|
|
55
|
+
id: repro
|
|
56
|
+
attributes:
|
|
57
|
+
label: Minimal reproduction
|
|
58
|
+
description: |
|
|
59
|
+
Smallest input + code that demonstrates the bug. If you can attach a
|
|
60
|
+
PDF fragment or a synthetic `TableMeta` list, that's ideal. See
|
|
61
|
+
[CONTRIBUTING.md](https://github.com/pebbleroad/table-stitcher/blob/main/CONTRIBUTING.md#adding-a-new-fixture)
|
|
62
|
+
for the fixture format we use internally.
|
|
63
|
+
render: python
|
|
64
|
+
placeholder: |
|
|
65
|
+
from docling.document_converter import DocumentConverter
|
|
66
|
+
from table_stitcher import stitch_tables
|
|
67
|
+
|
|
68
|
+
doc = DocumentConverter().convert("attached.pdf").document
|
|
69
|
+
result = stitch_tables(doc)
|
|
70
|
+
# observed: 2 tables; expected: 1
|
|
71
|
+
validations:
|
|
72
|
+
required: true
|
|
73
|
+
|
|
74
|
+
- type: textarea
|
|
75
|
+
id: config
|
|
76
|
+
attributes:
|
|
77
|
+
label: MultiPageConfig (if non-default)
|
|
78
|
+
description: Any custom thresholds you passed via `MultiPageConfig(...)`.
|
|
79
|
+
placeholder: "MultiPageConfig(max_page_gap=2, headerless_width_tolerance=3)"
|
|
80
|
+
validations:
|
|
81
|
+
required: false
|
|
82
|
+
|
|
83
|
+
- type: textarea
|
|
84
|
+
id: logs
|
|
85
|
+
attributes:
|
|
86
|
+
label: Logs / traceback
|
|
87
|
+
description: Output of the merger at INFO level if relevant. Paste it as plain text; this field is rendered as a shell code block automatically, so do not add backticks.
|
|
88
|
+
render: shell
|
|
89
|
+
validations:
|
|
90
|
+
required: false
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
blank_issues_enabled: false
|
|
2
|
+
contact_links:
|
|
3
|
+
- name: Security vulnerability
|
|
4
|
+
url: https://github.com/pebbleroad/table-stitcher/security/policy
|
|
5
|
+
about: Please report security issues privately — see SECURITY.md, do not file a public issue.
|
|
6
|
+
- name: Question or discussion
|
|
7
|
+
url: https://github.com/pebbleroad/table-stitcher/discussions
|
|
8
|
+
about: For usage questions or open-ended discussion, use Discussions instead of Issues.
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
name: Feature request
|
|
2
|
+
description: Propose a new merge signal, config knob, adapter, or API surface.
|
|
3
|
+
labels: ["enhancement"]
|
|
4
|
+
body:
|
|
5
|
+
- type: markdown
|
|
6
|
+
attributes:
|
|
7
|
+
value: |
|
|
8
|
+
The library deliberately keeps merge signals **structural** (column
|
|
9
|
+
counts, header similarity, layout proximity) and avoids
|
|
10
|
+
language-specific or vocabulary-based rules. Proposals that fit that
|
|
11
|
+
model land faster — see [CONTRIBUTING.md](https://github.com/pebbleroad/table-stitcher/blob/main/CONTRIBUTING.md)
|
|
12
|
+
for context.
|
|
13
|
+
|
|
14
|
+
- type: textarea
|
|
15
|
+
id: problem
|
|
16
|
+
attributes:
|
|
17
|
+
label: What problem does this solve?
|
|
18
|
+
description: A real document or pipeline where the current behavior falls short.
|
|
19
|
+
validations:
|
|
20
|
+
required: true
|
|
21
|
+
|
|
22
|
+
- type: textarea
|
|
23
|
+
id: proposal
|
|
24
|
+
attributes:
|
|
25
|
+
label: Proposed change
|
|
26
|
+
description: |
|
|
27
|
+
Sketch the API or behavior change. New `MultiPageConfig` field?
|
|
28
|
+
New adapter? New merge phase? Pseudocode is welcome.
|
|
29
|
+
validations:
|
|
30
|
+
required: true
|
|
31
|
+
|
|
32
|
+
- type: textarea
|
|
33
|
+
id: alternatives
|
|
34
|
+
attributes:
|
|
35
|
+
label: Alternatives considered
|
|
36
|
+
description: |
|
|
37
|
+
What you tried already (different config, custom adapter, post-processing) and why it wasn't enough.
|
|
38
|
+
validations:
|
|
39
|
+
required: false
|
|
40
|
+
|
|
41
|
+
- type: checkboxes
|
|
42
|
+
id: structural
|
|
43
|
+
attributes:
|
|
44
|
+
label: Compatibility with the structural-only constraint
|
|
45
|
+
options:
|
|
46
|
+
- label: This proposal does **not** rely on language-specific dictionaries, vocabulary lists, or trained models.
|
|
47
|
+
required: false
|
|
48
|
+
- label: I understand the maintainers may decline language-aware proposals to keep the core multilingual.
|
|
49
|
+
required: false
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
version: 2
|
|
2
|
+
updates:
|
|
3
|
+
# Python deps in pyproject.toml — bumps `pandas`, `docling`, dev tools.
|
|
4
|
+
# Weekly cadence avoids PR spam; security advisories are picked up out-of-band.
|
|
5
|
+
- package-ecosystem: pip
|
|
6
|
+
directory: "/"
|
|
7
|
+
schedule:
|
|
8
|
+
interval: weekly
|
|
9
|
+
open-pull-requests-limit: 5
|
|
10
|
+
groups:
|
|
11
|
+
# Group dev-only deps so a routine ruff/pytest bump doesn't
|
|
12
|
+
# produce three separate PRs in one week.
|
|
13
|
+
dev-tooling:
|
|
14
|
+
patterns:
|
|
15
|
+
- "ruff"
|
|
16
|
+
- "pytest*"
|
|
17
|
+
- "build"
|
|
18
|
+
- "twine"
|
|
19
|
+
- "pre-commit"
|
|
20
|
+
|
|
21
|
+
# GitHub Actions versions in .github/workflows/*.yml — keeps action
|
|
22
|
+
# references current (e.g. actions/checkout@v4 → @v5 when stable).
|
|
23
|
+
- package-ecosystem: github-actions
|
|
24
|
+
directory: "/"
|
|
25
|
+
schedule:
|
|
26
|
+
interval: weekly
|
|
27
|
+
open-pull-requests-limit: 5
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
## What changed
|
|
2
|
+
|
|
3
|
+
<!-- One or two sentences. The "why" matters more than the "what" — the diff shows the what. -->
|
|
4
|
+
|
|
5
|
+
## Why
|
|
6
|
+
|
|
7
|
+
<!-- The motivation. A linked issue, a fixture that was failing, a downstream user request. -->
|
|
8
|
+
|
|
9
|
+
## Checklist
|
|
10
|
+
|
|
11
|
+
- [ ] `pytest tests/` is green locally (or xfails are intentional, with reasons in the YAML)
|
|
12
|
+
- [ ] `ruff check .` and `ruff format --check .` pass (pre-commit handles this for you)
|
|
13
|
+
- [ ] New behavior has a test (unit or fixture)
|
|
14
|
+
- [ ] User-visible changes are noted in [CHANGELOG.md](https://github.com/pebbleroad/table-stitcher/blob/main/CHANGELOG.md)
|
|
15
|
+
- [ ] No language-specific or vocabulary assumptions added — merge signals stay structural
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
name: ci
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
lint:
|
|
11
|
+
# Style + static-analysis gate. Single Python — ruff produces identical
|
|
12
|
+
# results across versions, so a matrix here just burns minutes.
|
|
13
|
+
name: lint
|
|
14
|
+
runs-on: ubuntu-latest
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
- uses: actions/setup-python@v5
|
|
18
|
+
with:
|
|
19
|
+
python-version: "3.12"
|
|
20
|
+
cache: pip
|
|
21
|
+
- name: install ruff
|
|
22
|
+
# Install ruff on its own; keep this specifier in sync with pyproject's dev extra so local + CI run the same version.
|
|
23
|
+
# No need to install the full dev tree (docling, reportlab) just to lint.
|
|
24
|
+
run: |
|
|
25
|
+
python -m pip install --upgrade pip
|
|
26
|
+
pip install "ruff>=0.6"
|
|
27
|
+
- name: ruff check
|
|
28
|
+
run: ruff check .
|
|
29
|
+
- name: ruff format --check
|
|
30
|
+
run: ruff format --check .
|
|
31
|
+
|
|
32
|
+
test:
|
|
33
|
+
# Behavior gate. Matrix across all supported Python versions.
|
|
34
|
+
name: test (py${{ matrix.python }})
|
|
35
|
+
runs-on: ubuntu-latest
|
|
36
|
+
strategy:
|
|
37
|
+
fail-fast: false
|
|
38
|
+
matrix:
|
|
39
|
+
python: ["3.9", "3.10", "3.11", "3.12"]
|
|
40
|
+
steps:
|
|
41
|
+
- uses: actions/checkout@v4
|
|
42
|
+
- uses: actions/setup-python@v5
|
|
43
|
+
with:
|
|
44
|
+
python-version: ${{ matrix.python }}
|
|
45
|
+
cache: pip
|
|
46
|
+
- name: install
|
|
47
|
+
run: |
|
|
48
|
+
python -m pip install --upgrade pip
|
|
49
|
+
pip install -e ".[dev]"
|
|
50
|
+
- name: unit tests
|
|
51
|
+
# Default addopts in pyproject excludes the integration marker, so
|
|
52
|
+
# this run is already unit-only. -v adds per-case outcomes.
|
|
53
|
+
run: pytest tests/ -v
|
|
54
|
+
|
|
55
|
+
integration:
|
|
56
|
+
# Snapshot lane: tests load committed *.docling.json fixtures and run
|
|
57
|
+
# table-stitcher against them. No PDF parsing, no OCR, no model downloads
|
|
58
|
+
# — deterministic across platforms regardless of which OCR engine docling
|
|
59
|
+
# would auto-select. The full live-parse pipeline is exercised separately
|
|
60
|
+
# by .github/workflows/upstream-smoke.yml on a nightly schedule.
|
|
61
|
+
name: integration
|
|
62
|
+
runs-on: ubuntu-latest
|
|
63
|
+
steps:
|
|
64
|
+
- uses: actions/checkout@v4
|
|
65
|
+
- uses: actions/setup-python@v5
|
|
66
|
+
with:
|
|
67
|
+
python-version: "3.11"
|
|
68
|
+
cache: pip
|
|
69
|
+
- name: install
|
|
70
|
+
run: |
|
|
71
|
+
python -m pip install --upgrade pip
|
|
72
|
+
pip install -e ".[dev]"
|
|
73
|
+
- name: integration tests (snapshot lane)
|
|
74
|
+
# Integration tests are gated behind the `integration` marker in
|
|
75
|
+
# pyproject; opt in explicitly here. The snapshot lane is the default
|
|
76
|
+
# — no --live-parse flag.
|
|
77
|
+
run: pytest -m integration tests/ -v
|
|
78
|
+
|
|
79
|
+
build:
|
|
80
|
+
# Packaging gate. Builds sdist + wheel, validates PyPI metadata, and
|
|
81
|
+
# smoke-tests the *installed wheel* in a clean venv outside the checkout.
|
|
82
|
+
# That last step is the one that catches "passes tests, breaks on
|
|
83
|
+
# `pip install`" — the most common post-release failure.
|
|
84
|
+
name: build
|
|
85
|
+
runs-on: ubuntu-latest
|
|
86
|
+
needs: [lint, test]
|
|
87
|
+
steps:
|
|
88
|
+
- uses: actions/checkout@v4
|
|
89
|
+
- uses: actions/setup-python@v5
|
|
90
|
+
with:
|
|
91
|
+
python-version: "3.12"
|
|
92
|
+
cache: pip
|
|
93
|
+
- name: install build tooling
|
|
94
|
+
run: |
|
|
95
|
+
python -m pip install --upgrade pip
|
|
96
|
+
pip install build twine
|
|
97
|
+
- name: build sdist + wheel
|
|
98
|
+
run: python -m build
|
|
99
|
+
- name: twine check
|
|
100
|
+
run: twine check dist/*
|
|
101
|
+
- name: smoke-test installed wheel
|
|
102
|
+
# Install the built wheel into a fresh venv *outside* the checkout,
|
|
103
|
+
# then import + call a public symbol. Proves the wheel is functional,
|
|
104
|
+
# not just buildable.
|
|
105
|
+
run: |
|
|
106
|
+
python -m venv /tmp/smoke-venv
|
|
107
|
+
/tmp/smoke-venv/bin/pip install --upgrade pip
|
|
108
|
+
/tmp/smoke-venv/bin/pip install dist/*.whl
|
|
109
|
+
cd /tmp # leave the checkout so we can't accidentally import from src/
|
|
110
|
+
/tmp/smoke-venv/bin/python - <<'PY'
|
|
111
|
+
import pathlib
|
|
112
|
+
import table_stitcher
|
|
113
|
+
|
|
114
|
+
pkg = pathlib.Path(table_stitcher.__file__).resolve()
|
|
115
|
+
assert "site-packages" in str(pkg), pkg
|
|
116
|
+
assert table_stitcher.__version__, "missing __version__"
|
|
117
|
+
assert callable(table_stitcher.stitch_tables)
|
|
118
|
+
print(f"smoke ok: {table_stitcher.__version__} from {pkg}")
|
|
119
|
+
PY
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
name: release
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- "v*"
|
|
7
|
+
|
|
8
|
+
# Trusted Publishing requires `id-token: write`. No PyPI API token needed —
|
|
9
|
+
# PyPI verifies the OIDC claim against the project's trusted-publisher config.
|
|
10
|
+
# One-time setup on PyPI: Account → Publishing → Add a pending publisher with
|
|
11
|
+
# repo `pebbleroad/table-stitcher`, workflow `release.yml`, environment `pypi`.
|
|
12
|
+
permissions:
|
|
13
|
+
contents: read
|
|
14
|
+
id-token: write
|
|
15
|
+
|
|
16
|
+
jobs:
|
|
17
|
+
publish:
|
|
18
|
+
name: publish to PyPI
|
|
19
|
+
runs-on: ubuntu-latest
|
|
20
|
+
environment:
|
|
21
|
+
name: pypi
|
|
22
|
+
url: https://pypi.org/p/table-stitcher
|
|
23
|
+
steps:
|
|
24
|
+
- uses: actions/checkout@v4
|
|
25
|
+
- uses: actions/setup-python@v5
|
|
26
|
+
with:
|
|
27
|
+
python-version: "3.12"
|
|
28
|
+
cache: pip
|
|
29
|
+
- name: install build tooling
|
|
30
|
+
run: |
|
|
31
|
+
python -m pip install --upgrade pip
|
|
32
|
+
pip install -e ".[dev]"
|
|
33
|
+
- name: tag matches package version
|
|
34
|
+
# Refuse to publish if the git tag and pyproject version disagree —
|
|
35
|
+
# mismatch here is the classic source of "v0.2.0 on PyPI is actually 0.2.1".
|
|
36
|
+
run: |
|
|
37
|
+
tag="${GITHUB_REF##*/}"
|
|
38
|
+
pkg_version=$(python -c "import tomllib, pathlib; print(tomllib.loads(pathlib.Path('pyproject.toml').read_text())['project']['version'])")
|
|
39
|
+
if [ "$tag" != "v${pkg_version}" ]; then
|
|
40
|
+
echo "tag $tag does not match pyproject version v$pkg_version" >&2
|
|
41
|
+
exit 1
|
|
42
|
+
fi
|
|
43
|
+
- name: release gate (build + wheel smoke + tests)
|
|
44
|
+
# Re-runs the same gate that contributors use locally. Keeps "what
|
|
45
|
+
# gets published" and "what was validated" identical.
|
|
46
|
+
env:
|
|
47
|
+
RELEASE_GATE_ONLINE: "1"
|
|
48
|
+
run: ./scripts/release_gate.sh
|
|
49
|
+
- name: publish
|
|
50
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
51
|
+
with:
|
|
52
|
+
packages-dir: dist/
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
name: upstream-smoke
|
|
2
|
+
|
|
3
|
+
# Live-parse lane: re-runs docling against the fixture PDFs to catch upstream
|
|
4
|
+
# regressions (docling, OCR engines, model updates) without blocking PRs.
|
|
5
|
+
#
|
|
6
|
+
# Comparisons run in lenient mode — only structural fields (members, pages,
|
|
7
|
+
# shape) are checked, since cell text varies by OCR engine. The job is
|
|
8
|
+
# allowed to fail; treat persistent red here as a signal to investigate, not
|
|
9
|
+
# a merge blocker.
|
|
10
|
+
|
|
11
|
+
on:
|
|
12
|
+
schedule:
|
|
13
|
+
# Daily at 06:17 UTC. Off-hour to avoid contention with weekday CI bursts.
|
|
14
|
+
- cron: "17 6 * * *"
|
|
15
|
+
workflow_dispatch:
|
|
16
|
+
|
|
17
|
+
jobs:
|
|
18
|
+
live-parse:
|
|
19
|
+
name: live-parse (${{ matrix.os }})
|
|
20
|
+
# macos-latest matches the OCR engine (ocrmac / Apple Vision) used to
|
|
21
|
+
# generate the committed snapshots, so structural drift here points at a
|
|
22
|
+
# genuine docling change rather than at OCR-engine divergence.
|
|
23
|
+
runs-on: ${{ matrix.os }}
|
|
24
|
+
strategy:
|
|
25
|
+
fail-fast: false
|
|
26
|
+
matrix:
|
|
27
|
+
os: [macos-latest]
|
|
28
|
+
steps:
|
|
29
|
+
- uses: actions/checkout@v4
|
|
30
|
+
- uses: actions/setup-python@v5
|
|
31
|
+
with:
|
|
32
|
+
python-version: "3.11"
|
|
33
|
+
cache: pip
|
|
34
|
+
- name: cache docling models
|
|
35
|
+
uses: actions/cache@v4
|
|
36
|
+
with:
|
|
37
|
+
path: ~/.cache/huggingface
|
|
38
|
+
key: docling-models-${{ runner.os }}-v1
|
|
39
|
+
- name: install
|
|
40
|
+
run: |
|
|
41
|
+
python -m pip install --upgrade pip
|
|
42
|
+
pip install -e ".[dev]"
|
|
43
|
+
- name: live-parse integration tests (lenient)
|
|
44
|
+
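# Allowed to fail — we never want this to block a PR. Note: continue-on-error keeps the job green, so check this step's result rather than the job status.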
|
|
45
|
+
continue-on-error: true
|
|
46
|
+
run: pytest -m integration --live-parse tests/ -v
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.egg-info/
|
|
6
|
+
*.egg
|
|
7
|
+
dist/
|
|
8
|
+
build/
|
|
9
|
+
.eggs/
|
|
10
|
+
|
|
11
|
+
# Virtual environments
|
|
12
|
+
.venv/
|
|
13
|
+
venv/
|
|
14
|
+
env/
|
|
15
|
+
|
|
16
|
+
# IDE
|
|
17
|
+
.idea/
|
|
18
|
+
.vscode/
|
|
19
|
+
*.swp
|
|
20
|
+
*.swo
|
|
21
|
+
|
|
22
|
+
# OS
|
|
23
|
+
.DS_Store
|
|
24
|
+
Thumbs.db
|
|
25
|
+
|
|
26
|
+
# Testing
|
|
27
|
+
.pytest_cache/
|
|
28
|
+
.coverage
|
|
29
|
+
htmlcov/
|
|
30
|
+
|
|
31
|
+
# Build
|
|
32
|
+
*.whl
|
|
33
|
+
*.tar.gz
|
|
34
|
+
|
|
35
|
+
# Test output
|
|
36
|
+
*.enriched.*
|
|
37
|
+
*.stitched.*
|
|
38
|
+
|
|
39
|
+
# Dataset cache (PubTables-v2 etc.)
|
|
40
|
+
.temp/
|
|
41
|
+
|
|
42
|
+
# Claude Code per-user overrides (settings.json itself is shared)
|
|
43
|
+
.claude/settings.local.json
|
|
44
|
+
.claude/scheduled_tasks.lock
|
|
45
|
+
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# Mirror of the CI lint job. Contributors run `pre-commit install` once,
|
|
2
|
+
# then every commit auto-lints + auto-formats. CI catches anything they
|
|
3
|
+
# bypassed (--no-verify) or skipped.
|
|
4
|
+
#
|
|
5
|
+
# We use `language: system` so pre-commit invokes the ruff that
|
|
6
|
+
# `pip install -e ".[dev]"` already installed. Pinning a separate ruff
|
|
7
|
+
# version in this file would drift from the one CI installs and silently
|
|
8
|
+
# disagree on rules and formatter output. Single source of truth: pyproject.
|
|
9
|
+
repos:
|
|
10
|
+
- repo: local
|
|
11
|
+
hooks:
|
|
12
|
+
- id: ruff-check
|
|
13
|
+
name: ruff check
|
|
14
|
+
entry: ruff check --fix
|
|
15
|
+
language: system
|
|
16
|
+
types: [python]
|
|
17
|
+
- id: ruff-format
|
|
18
|
+
name: ruff format
|
|
19
|
+
entry: ruff format
|
|
20
|
+
language: system
|
|
21
|
+
types: [python]
|
|
22
|
+
|
|
23
|
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
|
24
|
+
rev: v5.0.0
|
|
25
|
+
hooks:
|
|
26
|
+
- id: trailing-whitespace
|
|
27
|
+
- id: end-of-file-fixer
|
|
28
|
+
- id: check-yaml
|
|
29
|
+
- id: check-toml
|
|
30
|
+
- id: check-added-large-files
|
|
31
|
+
args: [--maxkb=2000] # PDF fixtures are legitimately large
|