docpluck 2.4.2__tar.gz → 2.4.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (269) hide show
  1. {docpluck-2.4.2 → docpluck-2.4.4}/CHANGELOG.md +39 -0
  2. {docpluck-2.4.2 → docpluck-2.4.4}/PKG-INFO +1 -1
  3. {docpluck-2.4.2 → docpluck-2.4.4}/docpluck/__init__.py +1 -1
  4. {docpluck-2.4.2 → docpluck-2.4.4}/docpluck/extract_structured.py +46 -0
  5. {docpluck-2.4.2 → docpluck-2.4.4}/docpluck/figures/detect.py +63 -1
  6. {docpluck-2.4.2 → docpluck-2.4.4}/docpluck/normalize.py +25 -2
  7. {docpluck-2.4.2 → docpluck-2.4.4}/pyproject.toml +1 -1
  8. docpluck-2.4.4/tests/test_figure_detect.py +220 -0
  9. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_normalization.py +43 -0
  10. docpluck-2.4.2/tests/test_figure_detect.py +0 -96
  11. {docpluck-2.4.2 → docpluck-2.4.4}/.claude/skills/_project/lessons.md +0 -0
  12. {docpluck-2.4.2 → docpluck-2.4.4}/.claude/skills/docpluck-cleanup/SKILL.md +0 -0
  13. {docpluck-2.4.2 → docpluck-2.4.4}/.claude/skills/docpluck-deploy/SKILL.md +0 -0
  14. {docpluck-2.4.2 → docpluck-2.4.4}/.claude/skills/docpluck-qa/SKILL.md +0 -0
  15. {docpluck-2.4.2 → docpluck-2.4.4}/.claude/skills/docpluck-qa/references/benchmark-mode.md +0 -0
  16. {docpluck-2.4.2 → docpluck-2.4.4}/.claude/skills/docpluck-qa/references/check-11-hard-rules.md +0 -0
  17. {docpluck-2.4.2 → docpluck-2.4.4}/.claude/skills/docpluck-qa/references/check-13-escicheck-production.md +0 -0
  18. {docpluck-2.4.2 → docpluck-2.4.4}/.claude/skills/docpluck-qa/references/check-5-escicheck-library.md +0 -0
  19. {docpluck-2.4.2 → docpluck-2.4.4}/.claude/skills/docpluck-qa/references/check-6-escicheck-local-webapp.md +0 -0
  20. {docpluck-2.4.2 → docpluck-2.4.4}/.claude/skills/docpluck-qa/references/check-7-batch-smoke.md +0 -0
  21. {docpluck-2.4.2 → docpluck-2.4.4}/.claude/skills/docpluck-review/SKILL.md +0 -0
  22. {docpluck-2.4.2 → docpluck-2.4.4}/.github/workflows/publish.yml +0 -0
  23. {docpluck-2.4.2 → docpluck-2.4.4}/.github/workflows/test.yml +0 -0
  24. {docpluck-2.4.2 → docpluck-2.4.4}/.gitignore +0 -0
  25. {docpluck-2.4.2 → docpluck-2.4.4}/CLAUDE.md +0 -0
  26. {docpluck-2.4.2 → docpluck-2.4.4}/HANDOFF_SECTIONS_APP_INTEGRATION.md +0 -0
  27. {docpluck-2.4.2 → docpluck-2.4.4}/LESSONS.md +0 -0
  28. {docpluck-2.4.2 → docpluck-2.4.4}/LICENSE +0 -0
  29. {docpluck-2.4.2 → docpluck-2.4.4}/REPLY_FROM_DOCPLUCK_v1.4.5.md +0 -0
  30. {docpluck-2.4.2 → docpluck-2.4.4}/REPLY_FROM_DOCPLUCK_v1.5.0.md +0 -0
  31. {docpluck-2.4.2 → docpluck-2.4.4}/REQUEST_08_CHUNKING_ENDPOINT.md +0 -0
  32. {docpluck-2.4.2 → docpluck-2.4.4}/REQUEST_09_REFERENCE_LIST_NORMALIZATION.md +0 -0
  33. {docpluck-2.4.2 → docpluck-2.4.4}/TODO.md +0 -0
  34. {docpluck-2.4.2 → docpluck-2.4.4}/docpluck/__main__.py +0 -0
  35. {docpluck-2.4.2 → docpluck-2.4.4}/docpluck/batch.py +0 -0
  36. {docpluck-2.4.2 → docpluck-2.4.4}/docpluck/cli.py +0 -0
  37. {docpluck-2.4.2 → docpluck-2.4.4}/docpluck/extract.py +0 -0
  38. {docpluck-2.4.2 → docpluck-2.4.4}/docpluck/extract_docx.py +0 -0
  39. {docpluck-2.4.2 → docpluck-2.4.4}/docpluck/extract_html.py +0 -0
  40. {docpluck-2.4.2 → docpluck-2.4.4}/docpluck/extract_layout.py +0 -0
  41. {docpluck-2.4.2 → docpluck-2.4.4}/docpluck/figures/__init__.py +0 -0
  42. {docpluck-2.4.2 → docpluck-2.4.4}/docpluck/quality.py +0 -0
  43. {docpluck-2.4.2 → docpluck-2.4.4}/docpluck/render.py +0 -0
  44. {docpluck-2.4.2 → docpluck-2.4.4}/docpluck/sections/__init__.py +0 -0
  45. {docpluck-2.4.2 → docpluck-2.4.4}/docpluck/sections/annotators/__init__.py +0 -0
  46. {docpluck-2.4.2 → docpluck-2.4.4}/docpluck/sections/annotators/docx.py +0 -0
  47. {docpluck-2.4.2 → docpluck-2.4.4}/docpluck/sections/annotators/html.py +0 -0
  48. {docpluck-2.4.2 → docpluck-2.4.4}/docpluck/sections/annotators/pdf.py +0 -0
  49. {docpluck-2.4.2 → docpluck-2.4.4}/docpluck/sections/annotators/text.py +0 -0
  50. {docpluck-2.4.2 → docpluck-2.4.4}/docpluck/sections/blocks.py +0 -0
  51. {docpluck-2.4.2 → docpluck-2.4.4}/docpluck/sections/boundaries.py +0 -0
  52. {docpluck-2.4.2 → docpluck-2.4.4}/docpluck/sections/core.py +0 -0
  53. {docpluck-2.4.2 → docpluck-2.4.4}/docpluck/sections/taxonomy.py +0 -0
  54. {docpluck-2.4.2 → docpluck-2.4.4}/docpluck/sections/types.py +0 -0
  55. {docpluck-2.4.2 → docpluck-2.4.4}/docpluck/tables/__init__.py +0 -0
  56. {docpluck-2.4.2 → docpluck-2.4.4}/docpluck/tables/bbox_utils.py +0 -0
  57. {docpluck-2.4.2 → docpluck-2.4.4}/docpluck/tables/camelot_extract.py +0 -0
  58. {docpluck-2.4.2 → docpluck-2.4.4}/docpluck/tables/captions.py +0 -0
  59. {docpluck-2.4.2 → docpluck-2.4.4}/docpluck/tables/cell_cleaning.py +0 -0
  60. {docpluck-2.4.2 → docpluck-2.4.4}/docpluck/tables/cluster.py +0 -0
  61. {docpluck-2.4.2 → docpluck-2.4.4}/docpluck/tables/confidence.py +0 -0
  62. {docpluck-2.4.2 → docpluck-2.4.4}/docpluck/tables/detect.py +0 -0
  63. {docpluck-2.4.2 → docpluck-2.4.4}/docpluck/tables/render.py +0 -0
  64. {docpluck-2.4.2 → docpluck-2.4.4}/docpluck/tables/whitespace.py +0 -0
  65. {docpluck-2.4.2 → docpluck-2.4.4}/docpluck/version.py +0 -0
  66. {docpluck-2.4.2 → docpluck-2.4.4}/docs/BENCHMARKS.md +0 -0
  67. {docpluck-2.4.2 → docpluck-2.4.4}/docs/DESIGN.md +0 -0
  68. {docpluck-2.4.2 → docpluck-2.4.4}/docs/HANDOFF_2026-05-07_sections_strict_iteration.md +0 -0
  69. {docpluck-2.4.2 → docpluck-2.4.4}/docs/HANDOFF_2026-05-09_session_state_and_followups.md +0 -0
  70. {docpluck-2.4.2 → docpluck-2.4.4}/docs/HANDOFF_2026-05-09_unified_extraction_brainstorm.md +0 -0
  71. {docpluck-2.4.2 → docpluck-2.4.4}/docs/HANDOFF_2026-05-10_table_rendering_iteration.md +0 -0
  72. {docpluck-2.4.2 → docpluck-2.4.4}/docs/HANDOFF_2026-05-10_table_rendering_iteration_2.md +0 -0
  73. {docpluck-2.4.2 → docpluck-2.4.4}/docs/HANDOFF_2026-05-10_table_rendering_iteration_3.md +0 -0
  74. {docpluck-2.4.2 → docpluck-2.4.4}/docs/HANDOFF_2026-05-10_table_rendering_iteration_4.md +0 -0
  75. {docpluck-2.4.2 → docpluck-2.4.4}/docs/HANDOFF_2026-05-10_table_rendering_iteration_5.md +0 -0
  76. {docpluck-2.4.2 → docpluck-2.4.4}/docs/HANDOFF_2026-05-10_table_rendering_iteration_6.md +0 -0
  77. {docpluck-2.4.2 → docpluck-2.4.4}/docs/HANDOFF_2026-05-10_table_rendering_iteration_7.md +0 -0
  78. {docpluck-2.4.2 → docpluck-2.4.4}/docs/HANDOFF_2026-05-11_PROMOTE_SPIKE_TO_LIBRARY.md +0 -0
  79. {docpluck-2.4.2 → docpluck-2.4.4}/docs/HANDOFF_2026-05-11_table_rendering_iteration_8.md +0 -0
  80. {docpluck-2.4.2 → docpluck-2.4.4}/docs/HANDOFF_2026-05-11_visual_review_findings.md +0 -0
  81. {docpluck-2.4.2 → docpluck-2.4.4}/docs/HANDOFF_2026-05-12_phase2_101pdf_corpus.md +0 -0
  82. {docpluck-2.4.2 → docpluck-2.4.4}/docs/HANDOFF_2026-05-12_remaining_ui_and_chrome_verification.md +0 -0
  83. {docpluck-2.4.2 → docpluck-2.4.4}/docs/HANDOFF_2026-05-12_visual_verify_results.md +0 -0
  84. {docpluck-2.4.2 → docpluck-2.4.4}/docs/NORMALIZATION.md +0 -0
  85. {docpluck-2.4.2 → docpluck-2.4.4}/docs/README.md +0 -0
  86. {docpluck-2.4.2 → docpluck-2.4.4}/docs/TRIAGE_2026-05-10_corpus_assessment.md +0 -0
  87. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/2026-05-06-section-identification.md +0 -0
  88. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/2026-05-06-table-extraction.md +0 -0
  89. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/2026-05-07-sections-strict-iteration-progress.md +0 -0
  90. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/2026-05-08-unified-extraction-phase-0-splice-spike.md +0 -0
  91. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/sections-deferred-items.md +0 -0
  92. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/sections-issues-backlog.md +0 -0
  93. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/2026-05-07_spot-01_apa.md +0 -0
  94. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/2026-05-07_spot-02_pattern-A-shipped.md +0 -0
  95. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/2026-05-08_spot-final_all-styles.md +0 -0
  96. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/experiments/COMPARISON.md +0 -0
  97. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/korbmacher_table1.md +0 -0
  98. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/option-a.py +0 -0
  99. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/ziano_table1.md +0 -0
  100. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_notes_raw.txt +0 -0
  101. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_table1.md +0 -0
  102. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/notes.md +0 -0
  103. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/option-b.py +0 -0
  104. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_notes_raw.txt +0 -0
  105. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_table1.md +0 -0
  106. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/korbmacher_table1.md +0 -0
  107. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/notes.md +0 -0
  108. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/option-c.py +0 -0
  109. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/sample-pdftotext-bbox.html +0 -0
  110. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/ziano_table1.md +0 -0
  111. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/korbmacher_table1.md +0 -0
  112. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/notes.md +0 -0
  113. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/option-d.py +0 -0
  114. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/ziano_table1.md +0 -0
  115. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_2022_kruger_bbox.html +0 -0
  116. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_bbox.html +0 -0
  117. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_table1.md +0 -0
  118. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/option-e.py +0 -0
  119. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/sample-bbox.html +0 -0
  120. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_2021_joep_bbox.html +0 -0
  121. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_bbox.html +0 -0
  122. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_table1.md +0 -0
  123. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/html-fallback-demo.md +0 -0
  124. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.err +0 -0
  125. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.md +0 -0
  126. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.err +0 -0
  127. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.md +0 -0
  128. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.err +0 -0
  129. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.md +0 -0
  130. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.err +0 -0
  131. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.md +0 -0
  132. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.err +0 -0
  133. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.md +0 -0
  134. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.err +0 -0
  135. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.md +0 -0
  136. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.err +0 -0
  137. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.md +0 -0
  138. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.err +0 -0
  139. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.md +0 -0
  140. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.err +0 -0
  141. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.md +0 -0
  142. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.err +0 -0
  143. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.md +0 -0
  144. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.err +0 -0
  145. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.md +0 -0
  146. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.err +0 -0
  147. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.md +0 -0
  148. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.err +0 -0
  149. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.md +0 -0
  150. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.err +0 -0
  151. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.md +0 -0
  152. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.err +0 -0
  153. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.md +0 -0
  154. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.err +0 -0
  155. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.md +0 -0
  156. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.err +0 -0
  157. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.md +0 -0
  158. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.err +0 -0
  159. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.md +0 -0
  160. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.err +0 -0
  161. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.md +0 -0
  162. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.err +0 -0
  163. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.md +0 -0
  164. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.err +0 -0
  165. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.md +0 -0
  166. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.err +0 -0
  167. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.md +0 -0
  168. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.err +0 -0
  169. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.md +0 -0
  170. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.err +0 -0
  171. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.md +0 -0
  172. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.err +0 -0
  173. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.md +0 -0
  174. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.err +0 -0
  175. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.md +0 -0
  176. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/papers.md +0 -0
  177. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/report.md +0 -0
  178. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/splice_spike.py +0 -0
  179. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/plans/spot-checks/splice-spike/test_splice_spike.py +0 -0
  180. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/specs/2026-04-27-request-09-reference-normalization-design.md +0 -0
  181. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/specs/2026-05-06-section-identification-design.md +0 -0
  182. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/specs/2026-05-06-table-extraction-design.md +0 -0
  183. {docpluck-2.4.2 → docpluck-2.4.4}/docs/superpowers/specs/2026-05-08-unified-extraction-design.md +0 -0
  184. {docpluck-2.4.2 → docpluck-2.4.4}/scripts/verify_corpus.py +0 -0
  185. {docpluck-2.4.2 → docpluck-2.4.4}/scripts/verify_corpus_full.py +0 -0
  186. {docpluck-2.4.2 → docpluck-2.4.4}/tests/__init__.py +0 -0
  187. {docpluck-2.4.2 → docpluck-2.4.4}/tests/conftest.py +0 -0
  188. {docpluck-2.4.2 → docpluck-2.4.4}/tests/fixtures/__init__.py +0 -0
  189. {docpluck-2.4.2 → docpluck-2.4.4}/tests/fixtures/sections/__init__.py +0 -0
  190. {docpluck-2.4.2 → docpluck-2.4.4}/tests/fixtures/sections/builders.py +0 -0
  191. {docpluck-2.4.2 → docpluck-2.4.4}/tests/fixtures/structured/.gitkeep +0 -0
  192. {docpluck-2.4.2 → docpluck-2.4.4}/tests/fixtures/structured/MANIFEST.json +0 -0
  193. {docpluck-2.4.2 → docpluck-2.4.4}/tests/fixtures/structured/README.md +0 -0
  194. {docpluck-2.4.2 → docpluck-2.4.4}/tests/golden/sections/apa_multi_study_pdf.json +0 -0
  195. {docpluck-2.4.2 → docpluck-2.4.4}/tests/golden/sections/apa_single_study_pdf.json +0 -0
  196. {docpluck-2.4.2 → docpluck-2.4.4}/tests/golden/sections/html_real_headings.json +0 -0
  197. {docpluck-2.4.2 → docpluck-2.4.4}/tests/snapshots/amj_lattice.txt +0 -0
  198. {docpluck-2.4.2 → docpluck-2.4.4}/tests/snapshots/apa_chan_feldman_lineless.txt +0 -0
  199. {docpluck-2.4.2 → docpluck-2.4.4}/tests/snapshots/apa_chen_jesp_lineless.txt +0 -0
  200. {docpluck-2.4.2 → docpluck-2.4.4}/tests/snapshots/apa_efendic_affect.txt +0 -0
  201. {docpluck-2.4.2 → docpluck-2.4.4}/tests/snapshots/apa_ip_feldman_pspb.txt +0 -0
  202. {docpluck-2.4.2 → docpluck-2.4.4}/tests/snapshots/bmc_lattice.txt +0 -0
  203. {docpluck-2.4.2 → docpluck-2.4.4}/tests/snapshots/ieee_figure_heavy.txt +0 -0
  204. {docpluck-2.4.2 → docpluck-2.4.4}/tests/snapshots/ieee_lattice.txt +0 -0
  205. {docpluck-2.4.2 → docpluck-2.4.4}/tests/snapshots/jama_lattice.txt +0 -0
  206. {docpluck-2.4.2 → docpluck-2.4.4}/tests/snapshots/nat_comms_figure_only.txt +0 -0
  207. {docpluck-2.4.2 → docpluck-2.4.4}/tests/snapshots/nature_minimal_rule.txt +0 -0
  208. {docpluck-2.4.2 → docpluck-2.4.4}/tests/snapshots/scirep_minimal_rule.txt +0 -0
  209. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_bbox_utils.py +0 -0
  210. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_benchmark_docx_html.py +0 -0
  211. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_caption_regex.py +0 -0
  212. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_cli_sections.py +0 -0
  213. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_cli_structured.py +0 -0
  214. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_confidence.py +0 -0
  215. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_corpus_smoke.py +0 -0
  216. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_d5_normalization_audit.py +0 -0
  217. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_edge_cases.py +0 -0
  218. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_extract_docx.py +0 -0
  219. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_extract_filter_sugar.py +0 -0
  220. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_extract_html.py +0 -0
  221. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_extract_layout.py +0 -0
  222. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_extract_pdf_structured.py +0 -0
  223. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_extraction.py +0 -0
  224. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_f0_table_region_aware.py +0 -0
  225. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_fixtures_manifest.py +0 -0
  226. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_lattice_cluster.py +0 -0
  227. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_metaesci_followups.py +0 -0
  228. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_normalize_f0_footnote_strip.py +0 -0
  229. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_normalize_layout_param.py +0 -0
  230. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_normalize_report_layout_fields.py +0 -0
  231. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_normalize_v18_strips.py +0 -0
  232. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_quality.py +0 -0
  233. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_render.py +0 -0
  234. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_render_html.py +0 -0
  235. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_request_09_reference_normalization.py +0 -0
  236. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_sections_boundaries.py +0 -0
  237. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_sections_boundary_truncation.py +0 -0
  238. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_sections_core_partition.py +0 -0
  239. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_sections_docx_annotator.py +0 -0
  240. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_sections_extract_text.py +0 -0
  241. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_sections_footnote_section.py +0 -0
  242. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_sections_golden.py +0 -0
  243. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_sections_html_annotator.py +0 -0
  244. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_sections_pdf_annotator.py +0 -0
  245. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_sections_public_api.py +0 -0
  246. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_sections_real_corpus.py +0 -0
  247. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_sections_taxonomy.py +0 -0
  248. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_sections_text_annotator.py +0 -0
  249. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_sections_types.py +0 -0
  250. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_sections_unit_corpus.py +0 -0
  251. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_sections_v161_coalesce.py +0 -0
  252. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_sections_v161_subheadings.py +0 -0
  253. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_sections_v161_taxonomy.py +0 -0
  254. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_sections_v161_text_annotator.py +0 -0
  255. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_sections_version.py +0 -0
  256. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_smoke_fixtures.py +0 -0
  257. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_structured_result_type.py +0 -0
  258. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_structured_types.py +0 -0
  259. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_structured_version.py +0 -0
  260. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_table_detect.py +0 -0
  261. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_tables_cell_cleaning.py +0 -0
  262. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_text_mode.py +0 -0
  263. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_v23_1_fixes.py +0 -0
  264. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_v23_bug_fixes.py +0 -0
  265. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_v23_post_corpus.py +0 -0
  266. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_v23_post_corpus_v2.py +0 -0
  267. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_v2_backwards_compat.py +0 -0
  268. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_v2_top_level_exports.py +0 -0
  269. {docpluck-2.4.2 → docpluck-2.4.4}/tests/test_whitespace_cluster.py +0 -0
@@ -1,5 +1,44 @@
1
1
  # Changelog
2
2
 
3
+ ## [2.4.4] — 2026-05-13
4
+
5
+ Bug fix for v2.4.3's caption-trim feature, plus an extension to a second chart-data signature.
6
+
7
+ ### Bug fix
8
+
9
+ 1. **`docpluck/extract_structured.py::_extract_caption_text`** — v2.4.3's `_trim_caption_at_chart_data` was added to `docpluck/figures/detect.py::_full_caption_text`, but the live render pipeline never calls that function — figure captions are built in `extract_structured.py::_extract_caption_text` (which `_figure_from_caption` calls). v2.4.3's caption-trim was therefore a no-op on real renders despite its tests passing in isolation. v2.4.4 applies the trim to `_extract_caption_text` for `kind == "figure"` captions, so the trim actually fires during `render_pdf_to_markdown(pdf_bytes)`. Verified by manual render of `jama_open_6` (caption 400 chars → 47 chars) and `jama_open_3` (405 → 208 chars).
10
+
11
+ ### Enhancement
12
+
13
+ 2. **`docpluck/extract_structured.py::_trim_caption_at_chart_data`** — extended with a second chart-data signature: a run of 5+ short (1–4 digit) numeric tokens separated only by whitespace. Catches axis-tick label sequences (``0 5 10 15 20``) and stacked column values (``340 321 280 5 270``) that the 6+-digit-run rule missed on charts with small-magnitude data. The two signatures are evaluated jointly; the earlier match in the caption wins so the caption is trimmed at the start of the chart data, not partway through it. Same conservative gates as before (caption ≥ 150 chars, surviving text ≥ 40 chars). Affects most JAMA Network Open Kaplan-Meier and Sci Rep / BMC clinical-trial papers — caption length drops from the 400-char hard cap to ~150 chars of real prose.
14
+
15
+ ### Bumps
16
+
17
+ - `__version__`: `2.4.3` → `2.4.4`. Patch — figure-caption truncation is now real and broader.
18
+
19
+ ### Tests
20
+
21
+ 3 new tests in `tests/test_figure_detect.py` (tick-run truncation, prose-with-inline-numbers no-op, earlier-of-two-signatures priority).
22
+
23
+ ## [2.4.3] — 2026-05-13
24
+
25
+ Same-day follow-up. Two preventative improvements aimed at quality issues that didn't trip the verifier tags but were visible in rendered output:
26
+
27
+ ### Fixes
28
+
29
+ 1. **`docpluck/normalize.py::normalize_text` S9 step** — strip 4-digit standalone page numbers from continuous-pagination journals (e.g. PSPB and Psychological Science, where a volume's page numbers run into the 1000s). Previously S9 only handled 1–3 digit page numbers; a bare `1174` line leaked into rendered output (e.g. `efendic_2022_affect.md` line 24). The new rule strips a 4-digit standalone number when both (a) its value is in 1000–9999 and (b) the same value recurs ≥ 3 times in the document. The recurrence floor protects table-cell values that happen to land on their own line in single-value-per-line column layouts. `NORMALIZATION_VERSION`: `1.8.1` → `1.8.2`.
30
+
31
+ 2. **`docpluck/figures/detect.py::_full_caption_text`** — truncate figure captions at chart-data boundaries. pdftotext extracts chart elements (axis labels, gridline values, legend entries) inline with the figure caption when they share a PDF reading-order paragraph. The resulting caption text looks like `Figure 1. Flowchart of Study Sample Selection 4876956 Pairs enrolled before April 1, 2015 1117269 Pairs excluded ...` — useful prose followed by raw chart data. New heuristic: locate the first run of 6+ consecutive digits (signature of chart data — page counts, n-values, and years all top out at 5 digits in academic captions) and truncate just before it at the previous word boundary. Conservative: only fires when caption is ≥ 150 chars and surviving trimmed text is ≥ 40 chars (sanity check protects against edge cases). Affects clinical / biological flowcharts in JAMA, Sci Rep, BMC Medicine papers.
32
+
33
+ ### Bumps
34
+
35
+ - `__version__`: `2.4.2` → `2.4.3`. Patch — both fixes are conservative pdftotext post-processing.
36
+ - `NORMALIZATION_VERSION`: `1.8.1` → `1.8.2`.
37
+
38
+ ### Tests
39
+
40
+ 7 new tests across `tests/test_normalization.py` (4-digit page number stripping, recurrence floor, year edge case) and `tests/test_figure_detect.py` (caption truncation at digit-run boundary, short-caption no-op, legitimate 5-digit-number preservation, minimum-post-label sanity check).
41
+
3
42
  ## [2.4.2] — 2026-05-13
4
43
 
5
44
  Iterative follow-up. After v2.4.1 the 101-PDF corpus run was 98/101 PASS (`scripts/verify_corpus_full.py`); this release closes two of the three remaining failures and reframes the third as a known short-paper edge case in the verifier.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docpluck
3
- Version: 2.4.2
3
+ Version: 2.4.4
4
4
  Summary: PDF, DOCX, and HTML text extraction and normalization for academic papers
5
5
  Project-URL: Homepage, https://github.com/giladfeldman/docpluck
6
6
  Project-URL: Documentation, https://github.com/giladfeldman/docpluck/tree/main/docs
@@ -71,7 +71,7 @@ from .figures import Figure
71
71
  from .extract_structured import TABLE_EXTRACTION_VERSION, StructuredResult, extract_pdf_structured
72
72
  from .render import render_pdf_to_markdown
73
73
 
74
- __version__ = "2.4.2"
74
+ __version__ = "2.4.4"
75
75
  __author__ = "Gilad Feldman"
76
76
  __license__ = "MIT"
77
77
 
@@ -332,11 +332,57 @@ def _extract_caption_text(
332
332
  # Re-prefix the label if stripping ate it.
333
333
  if cap.label and not snippet.startswith(cap.label):
334
334
  snippet = f"{cap.label}. {snippet}".strip()
335
+ # v2.4.4: trim chart-data appendage from figure captions (axis-tick
336
+ # sequences, raw bar-chart values pdftotext joined inline into the
337
+ # caption paragraph). For tables the appendage is usually the next-
338
+ # row continuation so skip — the caption hard-cap at 400 below
339
+ # bounds it.
340
+ if cap.kind == "figure":
341
+ snippet = _trim_caption_at_chart_data(snippet)
335
342
  if len(snippet) > 400:
336
343
  snippet = snippet[:400].rsplit(" ", 1)[0] + "…"
337
344
  return snippet
338
345
 
339
346
 
347
+ # v2.4.4: chart-data trim; logic intentionally duplicated from
348
+ # ``docpluck.figures.detect._trim_caption_at_chart_data`` so this module
349
+ # doesn't import from ``figures.detect`` (which has its own layout-channel
350
+ # dependencies). Two signatures of pdftotext-joined chart data:
351
+ # 1. Run of 6+ consecutive digits — flowchart counts, row IDs.
352
+ # 2. Run of 5+ short (1–4 digit) numeric tokens separated only by
353
+ # whitespace — axis-tick label sequences.
354
+ _CHART_DATA_DIGIT_RUN_RE_STRUCT = re.compile(r"\b\d{6,}\b")
355
+ _CHART_DATA_TICK_RUN_RE_STRUCT = re.compile(r"(?:\b\d{1,4}\b[ \t]+){5,}")
356
+
357
+
358
+ def _trim_caption_at_chart_data(caption: str) -> str:
359
+ """Truncate a caption when it transitions from prose to chart-data.
360
+
361
+ Conservative: only fires when caption ≥ 150 chars AND the surviving
362
+ trimmed text is ≥ 40 chars. The two regex signatures catch
363
+ complementary chart-data patterns (large counts and small axis-tick
364
+ sequences); the earlier match wins.
365
+ """
366
+ if not caption or len(caption) < 150:
367
+ return caption
368
+ candidates: list[int] = []
369
+ m1 = _CHART_DATA_DIGIT_RUN_RE_STRUCT.search(caption)
370
+ if m1 is not None:
371
+ candidates.append(m1.start())
372
+ m2 = _CHART_DATA_TICK_RUN_RE_STRUCT.search(caption)
373
+ if m2 is not None:
374
+ candidates.append(m2.start())
375
+ if not candidates:
376
+ return caption
377
+ cut = min(candidates)
378
+ while cut > 0 and not caption[cut - 1].isspace():
379
+ cut -= 1
380
+ trimmed = caption[:cut].rstrip(" ,;:")
381
+ if len(trimmed) < 40:
382
+ return caption
383
+ return trimmed
384
+
385
+
340
386
  def _isolated_table_from_caption(
341
387
  cap: CaptionMatch,
342
388
  raw_text: str,
@@ -10,6 +10,7 @@ See spec §5.7.
10
10
 
11
11
  from __future__ import annotations
12
12
 
13
+ import re
13
14
  from collections import defaultdict
14
15
  from typing import Any
15
16
 
@@ -135,7 +136,68 @@ def _full_caption_text(raw_text: str, cap: CaptionMatch) -> str:
135
136
  end = raw_text.find("\n\n", cap.char_end)
136
137
  if end == -1:
137
138
  end = min(cap.char_end + 500, len(raw_text))
138
- return raw_text[cap.char_start:end].replace("\n", " ").strip()
139
+ full = raw_text[cap.char_start:end].replace("\n", " ").strip()
140
+ return _trim_caption_at_chart_data(full)
141
+
142
+
143
+ # A run of 6+ consecutive digits in a figure caption is almost never
144
+ # legitimate caption prose — page counts, statistical n-values, and years
145
+ # all top out at 5 digits in academic captions. 6+ digits is a strong signal
146
+ # that pdftotext joined chart data (raw bar-chart values, participant counts,
147
+ # row IDs) into the caption.
148
+ _CHART_DATA_DIGIT_RUN_RE = re.compile(r"\b\d{6,}\b")
149
+ # A run of 5+ short numeric tokens (1–4 digits each) separated only by
150
+ # whitespace is a v2.4.4 signal — captures axis-tick label sequences
151
+ # (``0 5 10 15 20``) and stacked column values (``340 321 280 5 270``)
152
+ # that the 6-digit rule misses on charts with small-magnitude data.
153
+ # Real captions reference numbers via prose ("with n = 1234 participants",
154
+ # "p < .001"), so digit tokens are interleaved with words rather than
155
+ # stacked five-in-a-row.
156
+ _CHART_DATA_TICK_RUN_RE = re.compile(r"(?:\b\d{1,4}\b[ \t]+){5,}")
157
+
158
+
159
+ def _trim_caption_at_chart_data(caption: str) -> str:
160
+ """Truncate a caption when it transitions from prose to chart-data.
161
+
162
+ pdftotext extracts chart elements (axis labels, legend entries, gridline
163
+ values) inline with the figure caption when they share a paragraph in the
164
+ PDF reading order. The resulting caption text looks like::
165
+
166
+ Figure 1. Flowchart of Study Sample Selection 4876956 Pairs enrolled
167
+ before April 1, 2015 1117269 Pairs excluded 741469 Withdrawal …
168
+
169
+ where the real caption is "Flowchart of Study Sample Selection" and the
170
+ rest is chart data values.
171
+
172
+ v2.4.4: two complementary signatures are scanned (see module-level
173
+ constants); the *earlier* match in the caption wins so the caption is
174
+ trimmed at the start of the chart data, not partway through it.
175
+
176
+ Conservative: only fires when the caption is ≥ 150 chars (real short
177
+ captions almost never have a chart-data appendage), and only when the
178
+ surviving trimmed caption is ≥ 40 chars (sanity check protects against
179
+ edge cases where the digit run lands near the label).
180
+ """
181
+ if not caption or len(caption) < 150:
182
+ return caption
183
+ candidates: list[int] = []
184
+ m1 = _CHART_DATA_DIGIT_RUN_RE.search(caption)
185
+ if m1 is not None:
186
+ candidates.append(m1.start())
187
+ m2 = _CHART_DATA_TICK_RUN_RE.search(caption)
188
+ if m2 is not None:
189
+ candidates.append(m2.start())
190
+ if not candidates:
191
+ return caption
192
+ cut = min(candidates)
193
+ # Walk back to the previous word boundary.
194
+ while cut > 0 and not caption[cut - 1].isspace():
195
+ cut -= 1
196
+ trimmed = caption[:cut].rstrip(" ,;:")
197
+ # Sanity check.
198
+ if len(trimmed) < 40:
199
+ return caption
200
+ return trimmed
139
201
 
140
202
 
141
203
  __all__ = ["find_figures"]
@@ -22,7 +22,7 @@ class NormalizationLevel(str, Enum):
22
22
  academic = "academic"
23
23
 
24
24
 
25
- NORMALIZATION_VERSION = "1.8.1"
25
+ NORMALIZATION_VERSION = "1.8.2"
26
26
 
27
27
 
28
28
  # ── Request 9 (Scimeto, 2026-04-27): Reference-list normalization ──────────
@@ -1004,8 +1004,31 @@ def normalize_text(
1004
1004
  if repeated:
1005
1005
  lines = [l for l in lines if l.strip() not in repeated]
1006
1006
  t = "\n".join(lines)
1007
- # Strip standalone page numbers
1007
+ # Strip standalone page numbers — 1-3 digit unconditionally.
1008
1008
  t = re.sub(r"^\s*\d{1,3}\s*$", "", t, flags=re.MULTILINE)
1009
+ # v2.4.3: 4-digit page numbers (continuous-pagination journals like PSPB
1010
+ # where volume runs page numbers into the 1000s). Strip when ALL of:
1011
+ # 1. The line is exactly 4 ASCII digits.
1012
+ # 2. The value falls in the plausible page-number range 1000–9999
1013
+ # (excludes zero-padded values such as "0042"; note that years fall inside this range, so a lone year-on-its-own-line is protected only by rule 3).
1014
+ # 3. The SAME value recurs ≥3 times in the document (page numbers
1015
+ # repeat once per physical page, so this is conservative; a
1016
+ # duplicate-by-coincidence table-cell value would need to be the
1017
+ # same number 3 times, which is rare).
1018
+ # The conservative threshold protects table data where a 4-digit value
1019
+ # might legitimately appear on its own line (single-value-per-line
1020
+ # column layouts).
1021
+ four_digit_counts: dict[str, int] = {}
1022
+ for ln in t.split("\n"):
1023
+ s = ln.strip()
1024
+ if len(s) == 4 and s.isascii() and s.isdigit() and 1000 <= int(s) <= 9999:
1025
+ four_digit_counts[s] = four_digit_counts.get(s, 0) + 1
1026
+ recurring_4d = {s for s, c in four_digit_counts.items() if c >= 3}
1027
+ if recurring_4d:
1028
+ t = "\n".join(
1029
+ "" if ln.strip() in recurring_4d else ln
1030
+ for ln in t.split("\n")
1031
+ )
1009
1032
  report._track("S9_header_footer_removal", before, t, "headers_removed")
1010
1033
 
1011
1034
  # Limit consecutive newlines
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "docpluck"
7
- version = "2.4.2"
7
+ version = "2.4.4"
8
8
  description = "PDF, DOCX, and HTML text extraction and normalization for academic papers"
9
9
  readme = "docs/README.md"
10
10
  requires-python = ">=3.10"
@@ -0,0 +1,220 @@
1
+ """Figure region detection — caption + bbox metadata only."""
2
+
3
+ import json
4
+ import os
5
+ from pathlib import Path
6
+
7
+ import pytest
8
+
9
+
10
+ _HERE = Path(__file__).parent
11
+ _MANIFEST = _HERE / "fixtures" / "structured" / "MANIFEST.json"
12
+ _VIBE = Path(os.path.expanduser("~")) / "Dropbox" / "Vibe"
13
+
14
+
15
+ def _resolve_fixture(fixture_id: str) -> Path:
16
+ if not _MANIFEST.is_file():
17
+ pytest.skip("MANIFEST.json missing")
18
+ data = json.loads(_MANIFEST.read_text(encoding="utf-8"))
19
+ base = _VIBE if data.get("vibe_relative") else Path("/")
20
+ for entry in data["fixtures"]:
21
+ if entry["id"] == fixture_id:
22
+ path = base / entry["source_path"]
23
+ if not path.is_file():
24
+ pytest.skip(f"Fixture not available: {fixture_id} -> {path}")
25
+ return path
26
+ pytest.skip(f"Fixture id not in manifest: {fixture_id}")
27
+
28
+
29
+ def _layout(fixture_id: str):
30
+ pdf = _resolve_fixture(fixture_id)
31
+ from docpluck.extract_layout import extract_pdf_layout
32
+ return extract_pdf_layout(pdf.read_bytes())
33
+
34
+
35
+ def test_imports_ok():
36
+ from docpluck.figures.detect import find_figures
37
+ assert find_figures is not None
38
+
39
+
40
+ def test_figure_only_fixture_finds_figures():
41
+ layout = _layout("nat_comms_figure_only")
42
+ from docpluck.figures.detect import find_figures
43
+ figures = find_figures(layout)
44
+ if not figures:
45
+ pytest.skip("no figures detected on this fixture")
46
+ for f in figures:
47
+ assert f["label"] is not None and f["label"].startswith("Figure ")
48
+ assert f["caption"] is not None and len(f["caption"]) > 0
49
+ x0, top, x1, bottom = f["bbox"]
50
+ assert x1 > x0
51
+ assert bottom >= top # allow degenerate but not negative
52
+
53
+
54
+ def test_no_figures_returns_empty_or_only_real_figures():
55
+ """A negative-case fixture should yield zero or only well-formed figures."""
56
+ # Use any fixture with expected_figures==0; if not available, skip.
57
+ manifest_data = json.loads(_MANIFEST.read_text(encoding="utf-8"))
58
+ fixture_id = None
59
+ for e in manifest_data["fixtures"]:
60
+ if e.get("expected_figures") == 0:
61
+ fixture_id = e["id"]
62
+ break
63
+ if fixture_id is None:
64
+ pytest.skip("no expected_figures=0 fixture in manifest")
65
+ layout = _layout(fixture_id)
66
+ from docpluck.figures.detect import find_figures
67
+ figures = find_figures(layout)
68
+ # If any figures show up, they should at least have valid shape.
69
+ for f in figures:
70
+ assert f["label"] is None or f["label"].startswith("Figure ")
71
+ x0, top, x1, bottom = f["bbox"]
72
+ assert x1 > x0
73
+
74
+
75
+ def test_figure_id_is_unique_and_sequential():
76
+ layout = _layout("nat_comms_figure_only")
77
+ from docpluck.figures.detect import find_figures
78
+ figures = find_figures(layout)
79
+ if not figures:
80
+ pytest.skip("no figures detected")
81
+ ids = [f["id"] for f in figures]
82
+ assert len(set(ids)) == len(ids)
83
+ assert all(fid.startswith("f") for fid in ids)
84
+ # Sequential 1..n
85
+ expected = [f"f{i}" for i in range(1, len(figures) + 1)]
86
+ assert ids == expected
87
+
88
+
89
+ def test_figure_typeddict_shape():
90
+ from docpluck.figures import Figure
91
+ f: Figure = {
92
+ "id": "f1", "label": "Figure 1", "page": 3,
93
+ "bbox": (72.0, 100.0, 540.0, 320.0),
94
+ "caption": "Mean reaction time across conditions.",
95
+ }
96
+ assert f["id"] == "f1"
97
+
98
+
99
+ # v2.4.3: caption truncation at chart-data boundary
100
+ # (digit runs ≥ 6 chars indicate pdftotext joined raw chart values into the
101
+ # caption paragraph — common in clinical / biological flowcharts).
102
+
103
+
104
+ def test_trim_caption_at_chart_data_truncates_long_digit_run():
105
+ from docpluck.figures.detect import _trim_caption_at_chart_data
106
+ cap = (
107
+ "Figure 1. Flowchart of Study Sample Selection 4876956 Pairs enrolled "
108
+ "before April 1, 2015 1117269 Pairs excluded 741469 Withdrawal 148414 "
109
+ "Withdrawal after baseline 137787 With spouses onset of CVD 84585 "
110
+ "With onset of depression 5014 Duplicated couples 3792142 Eligible "
111
+ "pairs Matched by age and income"
112
+ )
113
+ out = _trim_caption_at_chart_data(cap)
114
+ # 6-digit run "4876956" triggers truncation just before it.
115
+ assert out == "Figure 1. Flowchart of Study Sample Selection"
116
+ assert "4876956" not in out
117
+
118
+
119
+ def test_trim_caption_preserves_short_caption():
120
+ from docpluck.figures.detect import _trim_caption_at_chart_data
121
+ cap = "Figure 2. A short caption with a year reference 2020 here."
122
+ out = _trim_caption_at_chart_data(cap)
123
+ # Under 150-char threshold AND no 6-digit run; no-op.
124
+ assert out == cap
125
+
126
+
127
+ def test_trim_caption_preserves_legitimate_5digit_numbers():
128
+ from docpluck.figures.detect import _trim_caption_at_chart_data
129
+ cap = (
130
+ "Figure 3. Sample selection diagram including all participants from "
131
+ "the original cohort (N = 12345) and the analytic subsample of 9876 "
132
+ "individuals who completed both waves of the longitudinal survey "
133
+ "between 2018 and 2024 with no missing data on the focal outcomes."
134
+ )
135
+ out = _trim_caption_at_chart_data(cap)
136
+ # 5-digit "12345" does NOT trigger; whole caption preserved.
137
+ assert out == cap
138
+
139
+
140
+ def test_trim_caption_preserves_prose_with_no_digits():
141
+ from docpluck.figures.detect import _trim_caption_at_chart_data
142
+ cap = (
143
+ "Figure 4. Cumulative incidence of depression by spouses cardiovascular "
144
+ "event among the entire study sample. The horizontal axis shows the "
145
+ "time in months and the vertical axis is cumulative incidence of "
146
+ "depression in percent. Lines represent the four sex-age subgroups."
147
+ )
148
+ out = _trim_caption_at_chart_data(cap)
149
+ # No 6-digit run; full caption preserved.
150
+ assert out == cap
151
+
152
+
153
+ def test_trim_caption_keeps_minimum_post_label_content():
154
+ from docpluck.figures.detect import _trim_caption_at_chart_data
155
+ # 6-digit run lands right after the label — truncation would leave
156
+ # just "Figure 5." (under 40-char sanity check) — return original.
157
+ long_cap = "Figure 5. " + "x" * 200 + " 1234567 stuff" # >150 chars
158
+ short_pre_label = "Figure 5. 1234567 chart data " + "y" * 200
159
+ out = _trim_caption_at_chart_data(short_pre_label)
160
+ # Sanity check fires; return original.
161
+ assert out == short_pre_label
162
+
163
+
164
+ # v2.4.4: caption truncation extended to short-token tick runs (5+ short
165
+ # numeric tokens in a row — axis-tick label sequences from charts).
166
+
167
+
168
+ def test_trim_caption_at_tick_run_truncates_axis_labels():
169
+ """v2.4.4: detect chart axis-tick sequences (5+ short numeric tokens
170
+ separated only by whitespace) — jama_open_3-style Kaplan-Meier
171
+ captions absorb gridline values like ``0 0 5 10 15`` that the 6-digit
172
+ rule didn't catch."""
173
+ from docpluck.figures.detect import _trim_caption_at_chart_data
174
+ cap = (
175
+ "Figure 1. Unadjusted Kaplan-Meier Curves Across Groups With "
176
+ "Different Objective Sleep Duration for All-Cause Mortality 100 "
177
+ "90 Survival probability, % 80 70 Sleep duration 60 seven hours "
178
+ "6 to 7 hours 50 5 to 6 hours less than 5 hours 0 0 5 10 15 "
179
+ "Follow-up time y No at risk Sleep duration seven hours 340 321 "
180
+ "280 5 Sleep duration"
181
+ )
182
+ out = _trim_caption_at_chart_data(cap)
183
+ assert "0 0 5 10 15" not in out
184
+ # Trim should preserve the prose lead-in.
185
+ assert out.startswith("Figure 1. Unadjusted Kaplan-Meier Curves")
186
+
187
+
188
+ def test_trim_caption_preserves_legitimate_prose_with_inline_numbers():
189
+ """Real caption prose references numbers in stats ('n = 1234', 'p < .001'),
190
+ but each number is followed by a word — not 5+ stacked numerics in a row."""
191
+ from docpluck.figures.detect import _trim_caption_at_chart_data
192
+ cap = (
193
+ "Figure 2. Mean reaction times across the four experimental "
194
+ "conditions, with n = 1234 participants total (95% CI [120.5, "
195
+ "180.3] ms for condition A; 95% CI [110.2, 175.4] for condition "
196
+ "B). Significant differences observed at p < .001 between paired "
197
+ "conditions in all 4 contrasts of interest, as predicted."
198
+ )
199
+ out = _trim_caption_at_chart_data(cap)
200
+ assert out == cap
201
+
202
+
203
+ def test_trim_caption_picks_earliest_match_across_both_rules():
204
+ """When both the 6-digit-run and the 5-token-tick rules match,
205
+ truncate at the earlier offset so we don't keep chart data past the
206
+ first signal."""
207
+ from docpluck.figures.detect import _trim_caption_at_chart_data
208
+ # Tick run appears first; 6-digit run appears later.
209
+ cap = (
210
+ "Figure 3. Bar plot of conditions A through F across the years "
211
+ "of interest 2020 2021 2022 2023 2024 2025 with later analytic "
212
+ "subsample participant total 4876956 in the secondary cohort "
213
+ "described in the methods section above and detailed in the "
214
+ "supplementary materials accompanying this paper."
215
+ )
216
+ out = _trim_caption_at_chart_data(cap)
217
+ # The tick run "2020 2021 2022 2023 2024 2025" appears earlier; trim
218
+ # there.
219
+ assert "2020 2021" not in out
220
+ assert "4876956" not in out
@@ -414,6 +414,49 @@ class TestS9_HeaderFooter:
414
414
  result = norm(text, "standard")
415
415
  assert "\n42\n" not in result
416
416
 
417
+ def test_4digit_page_numbers_stripped_when_recurring(self):
418
+ """v2.4.3: Continuous-pagination journals (PSPB, JESP volume runs)
419
+ emit page numbers in the 1000-9999 range. When the same 4-digit
420
+ value appears on its own line 3+ times in the doc, treat it as
421
+ a page-number artifact and strip."""
422
+ text = (
423
+ "First page content here.\n"
424
+ "1174\n"
425
+ "Second page begins.\n"
426
+ "1175\n"
427
+ "Body sentence continues.\n"
428
+ "1174\n"
429
+ "More body.\n"
430
+ "1175\n"
431
+ "Even more body content.\n"
432
+ "1174\n"
433
+ )
434
+ result = norm(text, "standard")
435
+ # 1174 appears 3 times → stripped.
436
+ assert "\n1174\n" not in result
437
+ # 1175 appears 2 times → not yet meeting the ≥3 threshold,
438
+ # so left alone (conservative).
439
+ assert "1175" in result
440
+
441
+ def test_4digit_year_on_own_line_preserved(self):
442
+ """A 4-digit value that only appears ONCE on its own line is NOT
443
+ a page number — could be a year reference or stray data. Leave it."""
444
+ text = "body text\n2024\nmore body text\n"
445
+ result = norm(text, "standard")
446
+ assert "2024" in result
447
+
448
+ def test_4digit_below_1000_preserved(self):
449
+ """A 4-digit value in the 1000-9999 range that recurs 3+ times on its
450
+ own line is stripped even when it is a year (e.g. 2020), not a page number."""
451
+ # Mostly a sanity check; values like 0999 wouldn't naturally occur.
452
+ text = "abc\n2020\ndef\n2020\nxyz\n2020\nfinal\n"
453
+ result = norm(text, "standard")
454
+ # 2020 recurs 3+ but is a year; the heuristic ALSO strips this
455
+ # case (1000-9999 range), which is acceptable since
456
+ # standalone-line years are a rare verbatim pattern in academic
457
+ # prose. Document the behavior here.
458
+ assert "2020" not in result
459
+
417
460
  def test_short_lines_preserved(self):
418
461
  """Lines < 15 chars should NOT be treated as headers."""
419
462
  text = "Short\n" * 10 + "Content"
@@ -1,96 +0,0 @@
1
- """Figure region detection — caption + bbox metadata only."""
2
-
3
- import json
4
- import os
5
- from pathlib import Path
6
-
7
- import pytest
8
-
9
-
10
- _HERE = Path(__file__).parent
11
- _MANIFEST = _HERE / "fixtures" / "structured" / "MANIFEST.json"
12
- _VIBE = Path(os.path.expanduser("~")) / "Dropbox" / "Vibe"
13
-
14
-
15
- def _resolve_fixture(fixture_id: str) -> Path:
16
- if not _MANIFEST.is_file():
17
- pytest.skip("MANIFEST.json missing")
18
- data = json.loads(_MANIFEST.read_text(encoding="utf-8"))
19
- base = _VIBE if data.get("vibe_relative") else Path("/")
20
- for entry in data["fixtures"]:
21
- if entry["id"] == fixture_id:
22
- path = base / entry["source_path"]
23
- if not path.is_file():
24
- pytest.skip(f"Fixture not available: {fixture_id} -> {path}")
25
- return path
26
- pytest.skip(f"Fixture id not in manifest: {fixture_id}")
27
-
28
-
29
- def _layout(fixture_id: str):
30
- pdf = _resolve_fixture(fixture_id)
31
- from docpluck.extract_layout import extract_pdf_layout
32
- return extract_pdf_layout(pdf.read_bytes())
33
-
34
-
35
- def test_imports_ok():
36
- from docpluck.figures.detect import find_figures
37
- assert find_figures is not None
38
-
39
-
40
- def test_figure_only_fixture_finds_figures():
41
- layout = _layout("nat_comms_figure_only")
42
- from docpluck.figures.detect import find_figures
43
- figures = find_figures(layout)
44
- if not figures:
45
- pytest.skip("no figures detected on this fixture")
46
- for f in figures:
47
- assert f["label"] is not None and f["label"].startswith("Figure ")
48
- assert f["caption"] is not None and len(f["caption"]) > 0
49
- x0, top, x1, bottom = f["bbox"]
50
- assert x1 > x0
51
- assert bottom >= top # allow degenerate but not negative
52
-
53
-
54
- def test_no_figures_returns_empty_or_only_real_figures():
55
- """A negative-case fixture should yield zero or only well-formed figures."""
56
- # Use any fixture with expected_figures==0; if not available, skip.
57
- manifest_data = json.loads(_MANIFEST.read_text(encoding="utf-8"))
58
- fixture_id = None
59
- for e in manifest_data["fixtures"]:
60
- if e.get("expected_figures") == 0:
61
- fixture_id = e["id"]
62
- break
63
- if fixture_id is None:
64
- pytest.skip("no expected_figures=0 fixture in manifest")
65
- layout = _layout(fixture_id)
66
- from docpluck.figures.detect import find_figures
67
- figures = find_figures(layout)
68
- # If any figures show up, they should at least have valid shape.
69
- for f in figures:
70
- assert f["label"] is None or f["label"].startswith("Figure ")
71
- x0, top, x1, bottom = f["bbox"]
72
- assert x1 > x0
73
-
74
-
75
- def test_figure_id_is_unique_and_sequential():
76
- layout = _layout("nat_comms_figure_only")
77
- from docpluck.figures.detect import find_figures
78
- figures = find_figures(layout)
79
- if not figures:
80
- pytest.skip("no figures detected")
81
- ids = [f["id"] for f in figures]
82
- assert len(set(ids)) == len(ids)
83
- assert all(fid.startswith("f") for fid in ids)
84
- # Sequential 1..n
85
- expected = [f"f{i}" for i in range(1, len(figures) + 1)]
86
- assert ids == expected
87
-
88
-
89
- def test_figure_typeddict_shape():
90
- from docpluck.figures import Figure
91
- f: Figure = {
92
- "id": "f1", "label": "Figure 1", "page": 3,
93
- "bbox": (72.0, 100.0, 540.0, 320.0),
94
- "caption": "Mean reaction time across conditions.",
95
- }
96
- assert f["id"] == "f1"
File without changes
File without changes
File without changes
File without changes