table-stitcher 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. table_stitcher-0.3.0/.claude/settings.json +76 -0
  2. table_stitcher-0.3.0/.github/ISSUE_TEMPLATE/bug_report.yml +90 -0
  3. table_stitcher-0.3.0/.github/ISSUE_TEMPLATE/config.yml +8 -0
  4. table_stitcher-0.3.0/.github/ISSUE_TEMPLATE/feature_request.yml +49 -0
  5. table_stitcher-0.3.0/.github/dependabot.yml +27 -0
  6. table_stitcher-0.3.0/.github/pull_request_template.md +15 -0
  7. table_stitcher-0.3.0/.github/workflows/ci.yml +119 -0
  8. table_stitcher-0.3.0/.github/workflows/release.yml +52 -0
  9. table_stitcher-0.3.0/.github/workflows/upstream-smoke.yml +46 -0
  10. table_stitcher-0.3.0/.gitignore +45 -0
  11. table_stitcher-0.3.0/.pre-commit-config.yaml +31 -0
  12. table_stitcher-0.3.0/CHANGELOG.md +128 -0
  13. table_stitcher-0.3.0/CONTRIBUTING.md +243 -0
  14. table_stitcher-0.3.0/LICENSE +21 -0
  15. table_stitcher-0.3.0/PKG-INFO +392 -0
  16. table_stitcher-0.3.0/README.md +354 -0
  17. table_stitcher-0.3.0/SECURITY.md +31 -0
  18. table_stitcher-0.3.0/examples/basic_pipeline.py +72 -0
  19. table_stitcher-0.3.0/examples/system_controller.py +86 -0
  20. table_stitcher-0.3.0/pyproject.toml +101 -0
  21. table_stitcher-0.3.0/scripts/regenerate_docling_snapshots.py +119 -0
  22. table_stitcher-0.3.0/scripts/release_gate.sh +86 -0
  23. table_stitcher-0.3.0/src/table_stitcher/__init__.py +340 -0
  24. table_stitcher-0.3.0/src/table_stitcher/adapters/README.md +173 -0
  25. table_stitcher-0.3.0/src/table_stitcher/adapters/__init__.py +11 -0
  26. table_stitcher-0.3.0/src/table_stitcher/adapters/base.py +42 -0
  27. table_stitcher-0.3.0/src/table_stitcher/adapters/docling.py +797 -0
  28. table_stitcher-0.3.0/src/table_stitcher/merger.py +979 -0
  29. table_stitcher-0.3.0/src/table_stitcher/models.py +145 -0
  30. table_stitcher-0.3.0/src/table_stitcher/py.typed +0 -0
  31. table_stitcher-0.3.0/tests/README.md +135 -0
  32. table_stitcher-0.3.0/tests/__init__.py +0 -0
  33. table_stitcher-0.3.0/tests/fixtures/tablemeta/headerless-width-drift.yaml +26 -0
  34. table_stitcher-0.3.0/tests/integration/__init__.py +0 -0
  35. table_stitcher-0.3.0/tests/integration/_tools/__init__.py +0 -0
  36. table_stitcher-0.3.0/tests/integration/_tools/regenerate_expected.py +173 -0
  37. table_stitcher-0.3.0/tests/integration/conftest.py +362 -0
  38. table_stitcher-0.3.0/tests/integration/fixtures/_synth/__init__.py +0 -0
  39. table_stitcher-0.3.0/tests/integration/fixtures/_synth/generate.py +172 -0
  40. table_stitcher-0.3.0/tests/integration/fixtures/distinct-tables-no-merge/.gitkeep +0 -0
  41. table_stitcher-0.3.0/tests/integration/fixtures/distinct-tables-no-merge/kaop-study-mixed-3pg.pt2.docling.json +1 -0
  42. table_stitcher-0.3.0/tests/integration/fixtures/distinct-tables-no-merge/kaop-study-mixed-3pg.pt2.expected.yaml +82 -0
  43. table_stitcher-0.3.0/tests/integration/fixtures/distinct-tables-no-merge/kaop-study-mixed-3pg.pt2.pdf +0 -0
  44. table_stitcher-0.3.0/tests/integration/fixtures/distinct-tables-no-merge/lab-panels-3pg.corp.docling.json +4180 -0
  45. table_stitcher-0.3.0/tests/integration/fixtures/distinct-tables-no-merge/lab-panels-3pg.corp.expected.yaml +74 -0
  46. table_stitcher-0.3.0/tests/integration/fixtures/distinct-tables-no-merge/lab-panels-3pg.corp.pdf +130 -0
  47. table_stitcher-0.3.0/tests/integration/fixtures/false-merge/category-rows-thematic-2pg.corp.docling.json +2413 -0
  48. table_stitcher-0.3.0/tests/integration/fixtures/false-merge/category-rows-thematic-2pg.corp.expected.yaml +30 -0
  49. table_stitcher-0.3.0/tests/integration/fixtures/false-merge/category-rows-thematic-2pg.corp.pdf +105 -0
  50. table_stitcher-0.3.0/tests/integration/fixtures/headerless-continuation/15-page-druglist.corp.docling.json +1 -0
  51. table_stitcher-0.3.0/tests/integration/fixtures/headerless-continuation/15-page-druglist.corp.expected.yaml +51 -0
  52. table_stitcher-0.3.0/tests/integration/fixtures/headerless-continuation/15-page-druglist.corp.pdf +0 -0
  53. table_stitcher-0.3.0/tests/integration/fixtures/headerless-continuation/gene-symbols-6pg.pt2.docling.json +1 -0
  54. table_stitcher-0.3.0/tests/integration/fixtures/headerless-continuation/gene-symbols-6pg.pt2.expected.yaml +43 -0
  55. table_stitcher-0.3.0/tests/integration/fixtures/headerless-continuation/gene-symbols-6pg.pt2.pdf +0 -0
  56. table_stitcher-0.3.0/tests/integration/fixtures/headerless-continuation/rct-study-table-5pg.pt2.docling.json +1 -0
  57. table_stitcher-0.3.0/tests/integration/fixtures/headerless-continuation/rct-study-table-5pg.pt2.expected.yaml +47 -0
  58. table_stitcher-0.3.0/tests/integration/fixtures/headerless-continuation/rct-study-table-5pg.pt2.pdf +0 -0
  59. table_stitcher-0.3.0/tests/integration/fixtures/headerless-continuation/rrt-outcomes-2pg.pt2.docling.json +1 -0
  60. table_stitcher-0.3.0/tests/integration/fixtures/headerless-continuation/rrt-outcomes-2pg.pt2.expected.yaml +38 -0
  61. table_stitcher-0.3.0/tests/integration/fixtures/headerless-continuation/rrt-outcomes-2pg.pt2.pdf +0 -0
  62. table_stitcher-0.3.0/tests/integration/fixtures/headerless-continuation/uveitis-case-series-5pg.pt2.docling.json +1 -0
  63. table_stitcher-0.3.0/tests/integration/fixtures/headerless-continuation/uveitis-case-series-5pg.pt2.expected.yaml +48 -0
  64. table_stitcher-0.3.0/tests/integration/fixtures/headerless-continuation/uveitis-case-series-5pg.pt2.pdf +0 -0
  65. table_stitcher-0.3.0/tests/integration/fixtures/inconsistent-header-detection/covid-misc-labs-4pg.pt2.docling.json +1 -0
  66. table_stitcher-0.3.0/tests/integration/fixtures/inconsistent-header-detection/covid-misc-labs-4pg.pt2.expected.yaml +96 -0
  67. table_stitcher-0.3.0/tests/integration/fixtures/inconsistent-header-detection/covid-misc-labs-4pg.pt2.pdf +0 -0
  68. table_stitcher-0.3.0/tests/integration/fixtures/inconsistent-header-detection/lit-review-3pg.pt2.docling.json +1 -0
  69. table_stitcher-0.3.0/tests/integration/fixtures/inconsistent-header-detection/lit-review-3pg.pt2.expected.yaml +61 -0
  70. table_stitcher-0.3.0/tests/integration/fixtures/inconsistent-header-detection/lit-review-3pg.pt2.pdf +0 -0
  71. table_stitcher-0.3.0/tests/integration/fixtures/inconsistent-header-detection/retirement-portfolio.corp.docling.json +1 -0
  72. table_stitcher-0.3.0/tests/integration/fixtures/inconsistent-header-detection/retirement-portfolio.corp.expected.yaml +37 -0
  73. table_stitcher-0.3.0/tests/integration/fixtures/inconsistent-header-detection/retirement-portfolio.corp.pdf +0 -0
  74. table_stitcher-0.3.0/tests/integration/fixtures/loose-header-layout/.gitkeep +0 -0
  75. table_stitcher-0.3.0/tests/integration/fixtures/loose-header-layout/biological-process-2pg.pt2.docling.json +1 -0
  76. table_stitcher-0.3.0/tests/integration/fixtures/loose-header-layout/biological-process-2pg.pt2.expected.yaml +32 -0
  77. table_stitcher-0.3.0/tests/integration/fixtures/loose-header-layout/biological-process-2pg.pt2.pdf +0 -0
  78. table_stitcher-0.3.0/tests/integration/fixtures/loose-header-layout/symptoms-mediators-4pg.pt2.docling.json +1 -0
  79. table_stitcher-0.3.0/tests/integration/fixtures/loose-header-layout/symptoms-mediators-4pg.pt2.expected.yaml +27 -0
  80. table_stitcher-0.3.0/tests/integration/fixtures/loose-header-layout/symptoms-mediators-4pg.pt2.pdf +0 -0
  81. table_stitcher-0.3.0/tests/integration/fixtures/multilingual/corporate-history-2pg.edinet.docling.json +1 -0
  82. table_stitcher-0.3.0/tests/integration/fixtures/multilingual/corporate-history-2pg.edinet.expected.yaml +23 -0
  83. table_stitcher-0.3.0/tests/integration/fixtures/multilingual/corporate-history-2pg.edinet.pdf +0 -0
  84. table_stitcher-0.3.0/tests/integration/fixtures/multilingual/subsidiaries-4pg.edinet.docling.json +1 -0
  85. table_stitcher-0.3.0/tests/integration/fixtures/multilingual/subsidiaries-4pg.edinet.expected.yaml +41 -0
  86. table_stitcher-0.3.0/tests/integration/fixtures/multilingual/subsidiaries-4pg.edinet.pdf +0 -0
  87. table_stitcher-0.3.0/tests/integration/fixtures/orphan-pair/.gitkeep +0 -0
  88. table_stitcher-0.3.0/tests/integration/fixtures/orphan-pair/varicose-veins-new-table-header-7pg.pt2.docling.json +1 -0
  89. table_stitcher-0.3.0/tests/integration/fixtures/orphan-pair/varicose-veins-new-table-header-7pg.pt2.expected.yaml +63 -0
  90. table_stitcher-0.3.0/tests/integration/fixtures/orphan-pair/varicose-veins-new-table-header-7pg.pt2.pdf +0 -0
  91. table_stitcher-0.3.0/tests/integration/fixtures/page-gap-too-large/unrelated-tables-gap4.synth.docling.json +1 -0
  92. table_stitcher-0.3.0/tests/integration/fixtures/page-gap-too-large/unrelated-tables-gap4.synth.expected.yaml +25 -0
  93. table_stitcher-0.3.0/tests/integration/fixtures/page-gap-too-large/unrelated-tables-gap4.synth.pdf +131 -0
  94. table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/4-page-substance-list.corp.docling.json +1 -0
  95. table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/4-page-substance-list.corp.expected.yaml +61 -0
  96. table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/4-page-substance-list.corp.pdf +0 -0
  97. table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/cell-markers-4pg.pt2.docling.json +1 -0
  98. table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/cell-markers-4pg.pt2.expected.yaml +31 -0
  99. table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/cell-markers-4pg.pt2.pdf +0 -0
  100. table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/challenge-categories-2pg.pt2.docling.json +1 -0
  101. table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/challenge-categories-2pg.pt2.expected.yaml +36 -0
  102. table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/challenge-categories-2pg.pt2.pdf +0 -0
  103. table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/fungal-taxonomy-4pg.pt2.docling.json +1 -0
  104. table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/fungal-taxonomy-4pg.pt2.expected.yaml +42 -0
  105. table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/fungal-taxonomy-4pg.pt2.pdf +0 -0
  106. table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/rowspan-insurance-payout.corp.docling.json +1 -0
  107. table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/rowspan-insurance-payout.corp.expected.yaml +35 -0
  108. table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/rowspan-insurance-payout.corp.pdf +0 -0
  109. table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/search-strategies-2pg.pt2.docling.json +1 -0
  110. table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/search-strategies-2pg.pt2.expected.yaml +37 -0
  111. table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/search-strategies-2pg.pt2.pdf +0 -0
  112. table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/study-sample-7pg.pt2.docling.json +1 -0
  113. table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/study-sample-7pg.pt2.expected.yaml +48 -0
  114. table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/study-sample-7pg.pt2.pdf +0 -0
  115. table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/themes-exemplars-3pg.pt2.docling.json +1 -0
  116. table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/themes-exemplars-3pg.pt2.expected.yaml +35 -0
  117. table_stitcher-0.3.0/tests/integration/fixtures/repeated-header/themes-exemplars-3pg.pt2.pdf +0 -0
  118. table_stitcher-0.3.0/tests/integration/fixtures/simple-continuation/sample-table.corp.docling.json +1 -0
  119. table_stitcher-0.3.0/tests/integration/fixtures/simple-continuation/sample-table.corp.expected.yaml +15 -0
  120. table_stitcher-0.3.0/tests/integration/fixtures/simple-continuation/sample-table.corp.pdf +0 -0
  121. table_stitcher-0.3.0/tests/integration/fixtures/spillover/note-overflow.synth.docling.json +1 -0
  122. table_stitcher-0.3.0/tests/integration/fixtures/spillover/note-overflow.synth.expected.yaml +47 -0
  123. table_stitcher-0.3.0/tests/integration/fixtures/spillover/note-overflow.synth.pdf +93 -0
  124. table_stitcher-0.3.0/tests/integration/fixtures/width-drift/.gitkeep +0 -0
  125. table_stitcher-0.3.0/tests/integration/fixtures/width-drift/abx-literature-review-7pg.pt2.docling.json +1 -0
  126. table_stitcher-0.3.0/tests/integration/fixtures/width-drift/abx-literature-review-7pg.pt2.expected.yaml +50 -0
  127. table_stitcher-0.3.0/tests/integration/fixtures/width-drift/abx-literature-review-7pg.pt2.pdf +0 -0
  128. table_stitcher-0.3.0/tests/integration/test_fixtures.py +54 -0
  129. table_stitcher-0.3.0/tests/test_docling_adapter.py +757 -0
  130. table_stitcher-0.3.0/tests/test_merger.py +791 -0
  131. table_stitcher-0.3.0/tests/test_public_api.py +83 -0
  132. table_stitcher-0.3.0/tests/test_tablemeta_fixtures.py +82 -0
@@ -0,0 +1,76 @@
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "Bash(python3 -c \"import docling, table_stitcher; print\\('ok'\\)\")",
5
+ "Bash(python3 -m pip show docling)",
6
+ "Bash(python3 -m pip show table_stitcher)",
7
+ "Bash(python3 -c \"import sys; print\\(sys.executable\\)\")",
8
+ "Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 -c \"import table_stitcher; print\\(table_stitcher.__file__\\); import docling; print\\('docling ok'\\)\")",
9
+ "Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 /tmp/inspect_testdocs.py)",
10
+ "Bash(tee /tmp/inspect_run.log)",
11
+ "Bash(python3 -c \"import yaml; print\\(yaml.__version__\\)\")",
12
+ "Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 -m pytest tests/integration/ -v)",
13
+ "Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 -m pytest tests/ -v)",
14
+ "Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 /tmp/gen_expected.py)",
15
+ "Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 -m pytest tests/ -v --no-header)",
16
+ "Bash(curl -L -o images.tar.gz --progress-bar \"https://huggingface.co/datasets/kensho/PubTables-v2/resolve/main/PubTables-v2_Full-Documents_test_images.tar.gz\")",
17
+ "Bash(curl -sL -o cross-page-pairs.tar.gz \"https://huggingface.co/datasets/kensho/PubTables-v2/resolve/main/PubTables-v2_Full-Documents_test_cross-page-table-pairs.tar.gz\")",
18
+ "Bash(curl -sL -o xml-annotations.tar.gz \"https://huggingface.co/datasets/kensho/PubTables-v2/resolve/main/PubTables-v2_Full-Documents_test_xml-annotations.tar.gz\")",
19
+ "Bash(curl -sL -o tables.tar.gz \"https://huggingface.co/datasets/kensho/PubTables-v2/resolve/main/PubTables-v2_Full-Documents_test_tables.tar.gz\")",
20
+ "Bash(curl -sL -o README.md \"https://huggingface.co/datasets/kensho/PubTables-v2/resolve/main/README.md\")",
21
+ "Bash(mkdir -p extracted)",
22
+ "Bash(tar -xzf ../cross-page-pairs.tar.gz)",
23
+ "Bash(tar -xzf ../xml-annotations.tar.gz)",
24
+ "Bash(tar -xzf ../tables.tar.gz)",
25
+ "Bash(awk -F/ '{print $4}')",
26
+ "Bash(python3 -c \"import json; d=json.load\\(open\\('Full Documents/test/tables/PMC10157558_tables.json'\\)\\); print\\(type\\(d\\)\\); print\\(json.dumps\\(d, indent=2\\)[:2000]\\)\")",
27
+ "Bash(python3 /tmp/select_multipage.py)",
28
+ "Bash(tar -tzf /Users/pebbleroad/Documents/table-stitcher/.temp/images.tar.gz)",
29
+ "Bash(python3 -c \"import img2pdf; print\\('img2pdf', img2pdf.__version__\\)\")",
30
+ "Bash(tar -xzf ../images.tar.gz)",
31
+ "Bash(python3 /tmp/bundle_pdfs.py)",
32
+ "Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 /tmp/inspect_pt2.py)",
33
+ "Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 /tmp/install_pt2_fixtures.py)",
34
+ "Bash(tee /tmp/install_output.log)",
35
+ "Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 -m pytest tests/)",
36
+ "Bash(python3 -c ' *)",
37
+ "Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 -m pytest tests/test_docling_adapter.py -v)",
38
+ "Bash(python3 /tmp/select_second_slice.py)",
39
+ "Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 /tmp/inspect_pt2_v2.py)",
40
+ "Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 /tmp/install_pt2_v2_fixtures.py)",
41
+ "Bash(tee /tmp/install_v2.log)",
42
+ "Bash(python3 -c \"import reportlab; print\\(reportlab.Version\\)\")",
43
+ "Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 -m tests.integration.fixtures._synth.generate)",
44
+ "Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 -m pytest tests/test_merger.py -v)",
45
+ "Bash(git rm *)",
46
+ "Bash(python3 -m pip show docling docling-core)",
47
+ "Bash(python3 -c \"import pypdf; print\\(pypdf.__version__\\)\")",
48
+ "Bash(python3 -c \"import pikepdf; print\\(pikepdf.__version__\\)\")",
49
+ "Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 -m pytest tests/integration/ -v -k \"edinet\")",
50
+ "Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 -m pytest tests/test_merger.py -v -k \"tokenize\")",
51
+ "Bash(python3 -m tests.integration._tools.regenerate_expected tests/integration/fixtures/multilingual/corporate-history-2pg.edinet.pdf --clear-xfail --description 'Two-page corporate history table from a Japanese EDINET filing. Both pages carry identical 2-column Japanese headers \\(年⽉ / 概要\\). Exercises the repeated-header Jaccard path against pure CJK text — merges because tokenize\\(\\) emits one token per CJK character, giving Jaccard 1.0 on identical headers.')",
52
+ "Bash(python3 -m tests.integration._tools.regenerate_expected tests/integration/fixtures/multilingual/subsidiaries-4pg.edinet.pdf --clear-xfail --description 'Four-page subsidiaries list from a Japanese EDINET filing. Three fragments \\(widths 6/6/7\\), all with identical 6-column Japanese headers \\(名称 / 住所 / 資本⾦⼜は出資⾦ / 主要な事業の内容 / 議決権の所有割合 / 関係内容\\). The width-7 final fragment is the summary-rows continuation of the same table. All three merge on CJK-aware header Jaccard.')",
53
+ "Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 -m pytest -m integration tests/integration/test_fixtures.py::test_fixture_stitches_as_expected -k \"varicose\")",
54
+ "Bash(python3 -m tests.integration._tools.regenerate_expected tests/integration/fixtures/orphan-pair/varicose-veins-new-table-header-7pg.pt2.pdf)",
55
+ "Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 -m pytest tests/ tests/ -m \"integration\")",
56
+ "Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 -m pytest -m integration tests/)",
57
+ "Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 -m pytest -m integration tests/integration/test_fixtures.py::test_fixture_stitches_as_expected -k \"substance-list or note-overflow\")",
58
+ "Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 -m pytest tests/test_docling_adapter.py::TestInjection::test_injection_failure_restores_partial_mutations -v)",
59
+ "Bash(scripts/release_gate.sh)",
60
+ "Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 -m pytest -m integration tests/integration/test_fixtures.py -k \"substance-list or note-overflow\")",
61
+ "Bash(python -c \"import sys; print\\(sys.version\\)\")",
62
+ "Bash(python3 -m pip show ruff)",
63
+ "Bash(ruff format *)",
64
+ "Bash(ruff check *)",
65
+ "Bash(python3 -m pytest -q)",
66
+ "Bash(python3 -m build --wheel --outdir /tmp/wheel-check)",
67
+ "Bash(unzip -l /tmp/wheel-check/*.whl)",
68
+ "Bash(python3 -m pytest tests/test_public_api.py -v)",
69
+ "Bash(python3 -m pytest tests/:*)",
70
+ "Bash(python3 -m pytest tests:*)",
71
+ "Bash(PYTHONPATH=/Users/pebbleroad/Documents/table-stitcher/src python3 -m pytest tests/:*)",
72
+ "Bash(python3 -m pytest -q:*)",
73
+ "Bash(python3 -m pytest -m:*)"
74
+ ]
75
+ }
76
+ }
@@ -0,0 +1,90 @@
1
+ name: Bug report
2
+ description: A merge produced wrong output, threw an unexpected error, or behaved differently than documented.
3
+ labels: ["bug"]
4
+ body:
5
+ - type: markdown
6
+ attributes:
7
+ value: |
8
+ Thanks for filing a bug. The fields below are what we need to reproduce
9
+ the issue. The biggest unknown for a table-stitching bug is usually
10
+ *what the upstream parser produced* — please attach a minimal fixture
11
+ if you can.
12
+
13
+ - type: input
14
+ id: version
15
+ attributes:
16
+ label: table-stitcher version
17
+ description: Output of `python -c "import table_stitcher; print(table_stitcher.__version__)"`
18
+ placeholder: "0.3.0"
19
+ validations:
20
+ required: true
21
+
22
+ - type: input
23
+ id: python
24
+ attributes:
25
+ label: Python version
26
+ placeholder: "3.11.7"
27
+ validations:
28
+ required: true
29
+
30
+ - type: dropdown
31
+ id: parser
32
+ attributes:
33
+ label: Upstream parser
34
+ description: Which extractor produced the input tables?
35
+ options:
36
+ - Docling (built-in adapter)
37
+ - Custom adapter
38
+ - Other
39
+ validations:
40
+ required: true
41
+
42
+ - type: textarea
43
+ id: what-happened
44
+ attributes:
45
+ label: What happened
46
+ description: What did the merger produce, and what did you expect instead?
47
+ placeholder: |
48
+ Pages 4–5 of report.pdf contain one logical table split by a page break.
49
+ I expected `stitch_tables(doc)` to return one merged table; instead I
50
+ got two separate tables with the headers duplicated.
51
+ validations:
52
+ required: true
53
+
54
+ - type: textarea
55
+ id: repro
56
+ attributes:
57
+ label: Minimal reproduction
58
+ description: |
59
+ Smallest input + code that demonstrates the bug. If you can attach a
60
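+ PDF fragment or a synthetic `TableMeta` list, that's ideal (this field is rendered as code and cannot take file uploads, so add attachments in a follow-up comment). See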
61
+ [CONTRIBUTING.md](https://github.com/pebbleroad/table-stitcher/blob/main/CONTRIBUTING.md#adding-a-new-fixture)
62
+ for the fixture format we use internally.
63
+ render: python
64
+ placeholder: |
65
+ from docling.document_converter import DocumentConverter
66
+ from table_stitcher import stitch_tables
67
+
68
+ doc = DocumentConverter().convert("attached.pdf").document
69
+ result = stitch_tables(doc)
70
+ # observed: 2 tables; expected: 1
71
+ validations:
72
+ required: true
73
+
74
+ - type: textarea
75
+ id: config
76
+ attributes:
77
+ label: MultiPageConfig (if non-default)
78
+ description: Any custom thresholds you passed via `MultiPageConfig(...)`.
79
+ placeholder: "MultiPageConfig(max_page_gap=2, headerless_width_tolerance=3)"
80
+ validations:
81
+ required: false
82
+
83
+ - type: textarea
84
+ id: logs
85
+ attributes:
86
+ label: Logs / traceback
87
+ description: Output of the merger at INFO level if relevant. This field is rendered as a code block automatically, so no backticks are needed.
88
+ render: shell
89
+ validations:
90
+ required: false
@@ -0,0 +1,8 @@
1
+ blank_issues_enabled: false
2
+ contact_links:
3
+ - name: Security vulnerability
4
+ url: https://github.com/pebbleroad/table-stitcher/security/policy
5
+ about: Please report security issues privately — see SECURITY.md, do not file a public issue.
6
+ - name: Question or discussion
7
+ url: https://github.com/pebbleroad/table-stitcher/discussions
8
+ about: For usage questions or open-ended discussion, use Discussions instead of Issues.
@@ -0,0 +1,49 @@
1
+ name: Feature request
2
+ description: Propose a new merge signal, config knob, adapter, or API surface.
3
+ labels: ["enhancement"]
4
+ body:
5
+ - type: markdown
6
+ attributes:
7
+ value: |
8
+ The library deliberately keeps merge signals **structural** (column
9
+ counts, header similarity, layout proximity) and avoids
10
+ language-specific or vocabulary-based rules. Proposals that fit that
11
+ model land faster — see [CONTRIBUTING.md](https://github.com/pebbleroad/table-stitcher/blob/main/CONTRIBUTING.md)
12
+ for context.
13
+
14
+ - type: textarea
15
+ id: problem
16
+ attributes:
17
+ label: What problem does this solve?
18
+ description: A real document or pipeline where the current behavior falls short.
19
+ validations:
20
+ required: true
21
+
22
+ - type: textarea
23
+ id: proposal
24
+ attributes:
25
+ label: Proposed change
26
+ description: |
27
+ Sketch the API or behavior change. New `MultiPageConfig` field?
28
+ New adapter? New merge phase? Pseudocode is welcome.
29
+ validations:
30
+ required: true
31
+
32
+ - type: textarea
33
+ id: alternatives
34
+ attributes:
35
+ label: Alternatives considered
36
+ description: |
37
+ What you tried already (different config, custom adapter, post-processing) and why it wasn't enough.
38
+ validations:
39
+ required: false
40
+
41
+ - type: checkboxes
42
+ id: structural
43
+ attributes:
44
+ label: Compatibility with the structural-only constraint
45
+ options:
46
+ - label: This proposal does **not** rely on language-specific dictionaries, vocabulary lists, or trained models.
47
+ required: false
48
+ - label: I understand the maintainers may decline language-aware proposals to keep the core multilingual.
49
+ required: false
@@ -0,0 +1,27 @@
1
+ version: 2
2
+ updates:
3
+ # Python deps in pyproject.toml — bumps `pandas`, `docling`, dev tools.
4
+ # Weekly cadence avoids PR spam; security advisories are picked up out-of-band.
5
+ - package-ecosystem: pip
6
+ directory: "/"
7
+ schedule:
8
+ interval: weekly
9
+ open-pull-requests-limit: 5
10
+ groups:
11
+ # Group dev-only deps so a routine ruff/pytest bump doesn't
12
+ # produce three separate PRs in one week.
13
+ dev-tooling:
14
+ patterns:
15
+ - "ruff"
16
+ - "pytest*"
17
+ - "build"
18
+ - "twine"
19
+ - "pre-commit"
20
+
21
+ # GitHub Actions versions in .github/workflows/*.yml — keeps action
22
+ # references current (e.g. actions/checkout@v4 → @v5 when stable).
23
+ - package-ecosystem: github-actions
24
+ directory: "/"
25
+ schedule:
26
+ interval: weekly
27
+ open-pull-requests-limit: 5
@@ -0,0 +1,15 @@
1
+ ## What changed
2
+
3
+ <!-- One or two sentences. The "why" matters more than the "what" — the diff shows the what. -->
4
+
5
+ ## Why
6
+
7
+ <!-- The motivation. A linked issue, a fixture that was failing, a downstream user request. -->
8
+
9
+ ## Checklist
10
+
11
+ - [ ] `pytest tests/` is green locally (or xfails are intentional, with reasons in the YAML)
12
+ - [ ] `ruff check .` and `ruff format --check .` pass (pre-commit handles this for you)
13
+ - [ ] New behavior has a test (unit or fixture)
14
+ - [ ] User-visible changes are noted in [CHANGELOG.md](https://github.com/pebbleroad/table-stitcher/blob/main/CHANGELOG.md)
15
+ - [ ] No language-specific or vocabulary assumptions added — merge signals stay structural
@@ -0,0 +1,119 @@
1
+ name: ci
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ lint:
11
+ # Style + static-analysis gate. Single Python — ruff produces identical
12
+ # results across versions, so a matrix here just burns minutes.
13
+ name: lint
14
+ runs-on: ubuntu-latest
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+ - uses: actions/setup-python@v5
18
+ with:
19
+ python-version: "3.12"
20
+ cache: pip
21
+ - name: install ruff
22
+ # Installed directly rather than via the dev extra; keep this specifier in sync with pyproject so local + CI run compatible versions.
23
+ # No need to install the full dev tree (docling, reportlab) just to lint.
24
+ run: |
25
+ python -m pip install --upgrade pip
26
+ pip install "ruff>=0.6"
27
+ - name: ruff check
28
+ run: ruff check .
29
+ - name: ruff format --check
30
+ run: ruff format --check .
31
+
32
+ test:
33
+ # Behavior gate. Matrix across all supported Python versions.
34
+ name: test (py${{ matrix.python }})
35
+ runs-on: ubuntu-latest
36
+ strategy:
37
+ fail-fast: false
38
+ matrix:
39
+ python: ["3.9", "3.10", "3.11", "3.12"]
40
+ steps:
41
+ - uses: actions/checkout@v4
42
+ - uses: actions/setup-python@v5
43
+ with:
44
+ python-version: ${{ matrix.python }}
45
+ cache: pip
46
+ - name: install
47
+ run: |
48
+ python -m pip install --upgrade pip
49
+ pip install -e ".[dev]"
50
+ - name: unit tests
51
+ # Default addopts in pyproject excludes the integration marker, so
52
+ # this run is already unit-only. -v adds per-case outcomes.
53
+ run: pytest tests/ -v
54
+
55
+ integration:
56
+ # Snapshot lane: tests load committed *.docling.json fixtures and run
57
+ # table-stitcher against them. No PDF parsing, no OCR, no model downloads
58
+ # — deterministic across platforms regardless of which OCR engine docling
59
+ # would auto-select. The full live-parse pipeline is exercised separately
60
+ # by .github/workflows/upstream-smoke.yml on a nightly schedule.
61
+ name: integration
62
+ runs-on: ubuntu-latest
63
+ steps:
64
+ - uses: actions/checkout@v4
65
+ - uses: actions/setup-python@v5
66
+ with:
67
+ python-version: "3.11"
68
+ cache: pip
69
+ - name: install
70
+ run: |
71
+ python -m pip install --upgrade pip
72
+ pip install -e ".[dev]"
73
+ - name: integration tests (snapshot lane)
74
+ # Integration tests are gated behind the `integration` marker in
75
+ # pyproject; opt in explicitly here. The snapshot lane is the default
76
+ # — no --live-parse flag.
77
+ run: pytest -m integration tests/ -v
78
+
79
+ build:
80
+ # Packaging gate. Builds sdist + wheel, validates PyPI metadata, and
81
+ # smoke-tests the *installed wheel* in a clean venv outside the checkout.
82
+ # That last step is the one that catches "passes tests, breaks on
83
+ # `pip install`" — the most common post-release failure.
84
+ name: build
85
+ runs-on: ubuntu-latest
86
+ needs: [lint, test]
87
+ steps:
88
+ - uses: actions/checkout@v4
89
+ - uses: actions/setup-python@v5
90
+ with:
91
+ python-version: "3.12"
92
+ cache: pip
93
+ - name: install build tooling
94
+ run: |
95
+ python -m pip install --upgrade pip
96
+ pip install build twine
97
+ - name: build sdist + wheel
98
+ run: python -m build
99
+ - name: twine check
100
+ run: twine check dist/*
101
+ - name: smoke-test installed wheel
102
+ # Install the built wheel into a fresh venv *outside* the checkout,
103
+ # then import + call a public symbol. Proves the wheel is functional,
104
+ # not just buildable.
105
+ run: |
106
+ python -m venv /tmp/smoke-venv
107
+ /tmp/smoke-venv/bin/pip install --upgrade pip
108
+ /tmp/smoke-venv/bin/pip install dist/*.whl
109
+ cd /tmp # leave the checkout so we can't accidentally import from src/
110
+ /tmp/smoke-venv/bin/python - <<'PY'
111
+ import pathlib
112
+ import table_stitcher
113
+
114
+ pkg = pathlib.Path(table_stitcher.__file__).resolve()
115
+ assert "site-packages" in str(pkg), pkg
116
+ assert table_stitcher.__version__, "missing __version__"
117
+ assert callable(table_stitcher.stitch_tables)
118
+ print(f"smoke ok: {table_stitcher.__version__} from {pkg}")
119
+ PY
@@ -0,0 +1,52 @@
1
+ name: release
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*"
7
+
8
+ # Trusted Publishing requires `id-token: write`. No PyPI API token needed —
9
+ # PyPI verifies the OIDC claim against the project's trusted-publisher config.
10
+ # One-time setup on PyPI: Account → Publishing → Add a pending publisher with
11
+ # repo `pebbleroad/table-stitcher`, workflow `release.yml`, environment `pypi`.
12
+ permissions:
13
+ contents: read
14
+ id-token: write
15
+
16
+ jobs:
17
+ publish:
18
+ name: publish to PyPI
19
+ runs-on: ubuntu-latest
20
+ environment:
21
+ name: pypi
22
+ url: https://pypi.org/p/table-stitcher
23
+ steps:
24
+ - uses: actions/checkout@v4
25
+ - uses: actions/setup-python@v5
26
+ with:
27
+ python-version: "3.12"
28
+ cache: pip
29
+ - name: install build tooling
30
+ run: |
31
+ python -m pip install --upgrade pip
32
+ pip install -e ".[dev]"
33
+ - name: tag matches package version
34
+ # Refuse to publish if the git tag and pyproject version disagree —
35
+ # mismatch here is the classic source of "v0.2.0 on PyPI is actually 0.2.1".
36
+ run: |
37
+ tag="${GITHUB_REF##*/}"
38
+ pkg_version=$(python -c "import tomllib, pathlib; print(tomllib.loads(pathlib.Path('pyproject.toml').read_text())['project']['version'])")
39
+ if [ "$tag" != "v${pkg_version}" ]; then
40
+ echo "tag $tag does not match pyproject version v$pkg_version" >&2
41
+ exit 1
42
+ fi
43
+ - name: release gate (build + wheel smoke + tests)
44
+ # Re-runs the same gate that contributors use locally. Keeps "what
45
+ # gets published" and "what was validated" identical.
46
+ env:
47
+ RELEASE_GATE_ONLINE: "1"
48
+ run: ./scripts/release_gate.sh
49
+ - name: publish
50
+ uses: pypa/gh-action-pypi-publish@release/v1
51
+ with:
52
+ packages-dir: dist/
@@ -0,0 +1,46 @@
1
+ name: upstream-smoke
2
+
3
+ # Live-parse lane: re-runs docling against the fixture PDFs to catch upstream
4
+ # regressions (docling, OCR engines, model updates) without blocking PRs.
5
+ #
6
+ # Comparisons run in lenient mode — only structural fields (members, pages,
7
+ # shape) are checked, since cell text varies by OCR engine. The job is
8
+ # allowed to fail; treat persistent red here as a signal to investigate, not
9
+ # a merge blocker.
10
+
11
+ on:
12
+ schedule:
13
+ # Runs daily at 06:17 UTC, an off-peak time chosen to avoid contention with weekday CI bursts.
14
+ - cron: "17 6 * * *"
15
+ workflow_dispatch:
16
+
17
+ jobs:
18
+ live-parse:
19
+ name: live-parse (${{ matrix.os }})
20
+ # macos-latest matches the OCR engine (ocrmac / Apple Vision) used to
21
+ # generate the committed snapshots, so structural drift here points at a
22
+ # genuine docling change rather than at OCR-engine divergence.
23
+ runs-on: ${{ matrix.os }}
24
+ strategy:
25
+ fail-fast: false
26
+ matrix:
27
+ os: [macos-latest]
28
+ steps:
29
+ - uses: actions/checkout@v4
30
+ - uses: actions/setup-python@v5
31
+ with:
32
+ python-version: "3.11"
33
+ cache: pip
34
+ - name: cache docling models
35
+ uses: actions/cache@v4
36
+ with:
37
+ path: ~/.cache/huggingface
38
+ key: docling-models-${{ runner.os }}-v1
39
+ - name: install
40
+ run: |
41
+ python -m pip install --upgrade pip
42
+ pip install -e ".[dev]"
43
+ - name: live-parse integration tests (lenient)
44
+ # No `continue-on-error`: it would report failures as green and hide drift.
45
+ # Scheduled/manual runs are not PR checks, so a red run never blocks a PR.
46
+ run: pytest -m integration --live-parse tests/ -v
@@ -0,0 +1,45 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.egg-info/
6
+ *.egg
7
+ dist/
8
+ build/
9
+ .eggs/
10
+
11
+ # Virtual environments
12
+ .venv/
13
+ venv/
14
+ env/
15
+
16
+ # IDE
17
+ .idea/
18
+ .vscode/
19
+ *.swp
20
+ *.swo
21
+
22
+ # OS
23
+ .DS_Store
24
+ Thumbs.db
25
+
26
+ # Testing
27
+ .pytest_cache/
28
+ .coverage
29
+ htmlcov/
30
+
31
+ # Build
32
+ *.whl
33
+ *.tar.gz
34
+
35
+ # Test output
36
+ *.enriched.*
37
+ *.stitched.*
38
+
39
+ # Dataset cache (PubTables-v2 etc.)
40
+ .temp/
41
+
42
+ # Claude Code per-user overrides (settings.json itself is shared)
43
+ .claude/settings.local.json
44
+ .claude/scheduled_tasks.lock
45
+
@@ -0,0 +1,31 @@
1
+ # Mirror of the CI lint job. Contributors run `pre-commit install` once,
2
+ # then every commit auto-lints + auto-formats. CI catches anything they
3
+ # bypassed (--no-verify) or skipped.
4
+ #
5
+ # We use `language: system` so pre-commit invokes the ruff that
6
+ # `pip install -e ".[dev]"` already installed. Pinning a separate ruff
7
+ # version in this file would drift from the one CI installs and silently
8
+ # disagree on rules and formatter output. Single source of truth: pyproject.toml.
9
+ repos:
10
+ - repo: local
11
+ hooks:
12
+ - id: ruff-check
13
+ name: ruff check
14
+ entry: ruff check --fix
15
+ language: system
16
+ types: [python]
17
+ - id: ruff-format
18
+ name: ruff format
19
+ entry: ruff format
20
+ language: system
21
+ types: [python]
22
+
23
+ - repo: https://github.com/pre-commit/pre-commit-hooks
24
+ rev: v5.0.0
25
+ hooks:
26
+ - id: trailing-whitespace
27
+ - id: end-of-file-fixer
28
+ - id: check-yaml
29
+ - id: check-toml
30
+ - id: check-added-large-files
31
+ args: [--maxkb=2000] # PDF fixtures are legitimately large