docpluck 2.4.0__tar.gz → 2.4.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (268) hide show
  1. {docpluck-2.4.0 → docpluck-2.4.2}/CHANGELOG.md +38 -0
  2. {docpluck-2.4.0 → docpluck-2.4.2}/PKG-INFO +1 -1
  3. {docpluck-2.4.0 → docpluck-2.4.2}/docpluck/__init__.py +1 -1
  4. {docpluck-2.4.0 → docpluck-2.4.2}/docpluck/render.py +74 -18
  5. docpluck-2.4.2/docs/HANDOFF_2026-05-12_phase2_101pdf_corpus.md +57 -0
  6. {docpluck-2.4.0 → docpluck-2.4.2}/pyproject.toml +1 -1
  7. docpluck-2.4.2/scripts/verify_corpus_full.py +288 -0
  8. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_render.py +152 -0
  9. {docpluck-2.4.0 → docpluck-2.4.2}/.claude/skills/_project/lessons.md +0 -0
  10. {docpluck-2.4.0 → docpluck-2.4.2}/.claude/skills/docpluck-cleanup/SKILL.md +0 -0
  11. {docpluck-2.4.0 → docpluck-2.4.2}/.claude/skills/docpluck-deploy/SKILL.md +0 -0
  12. {docpluck-2.4.0 → docpluck-2.4.2}/.claude/skills/docpluck-qa/SKILL.md +0 -0
  13. {docpluck-2.4.0 → docpluck-2.4.2}/.claude/skills/docpluck-qa/references/benchmark-mode.md +0 -0
  14. {docpluck-2.4.0 → docpluck-2.4.2}/.claude/skills/docpluck-qa/references/check-11-hard-rules.md +0 -0
  15. {docpluck-2.4.0 → docpluck-2.4.2}/.claude/skills/docpluck-qa/references/check-13-escicheck-production.md +0 -0
  16. {docpluck-2.4.0 → docpluck-2.4.2}/.claude/skills/docpluck-qa/references/check-5-escicheck-library.md +0 -0
  17. {docpluck-2.4.0 → docpluck-2.4.2}/.claude/skills/docpluck-qa/references/check-6-escicheck-local-webapp.md +0 -0
  18. {docpluck-2.4.0 → docpluck-2.4.2}/.claude/skills/docpluck-qa/references/check-7-batch-smoke.md +0 -0
  19. {docpluck-2.4.0 → docpluck-2.4.2}/.claude/skills/docpluck-review/SKILL.md +0 -0
  20. {docpluck-2.4.0 → docpluck-2.4.2}/.github/workflows/publish.yml +0 -0
  21. {docpluck-2.4.0 → docpluck-2.4.2}/.github/workflows/test.yml +0 -0
  22. {docpluck-2.4.0 → docpluck-2.4.2}/.gitignore +0 -0
  23. {docpluck-2.4.0 → docpluck-2.4.2}/CLAUDE.md +0 -0
  24. {docpluck-2.4.0 → docpluck-2.4.2}/HANDOFF_SECTIONS_APP_INTEGRATION.md +0 -0
  25. {docpluck-2.4.0 → docpluck-2.4.2}/LESSONS.md +0 -0
  26. {docpluck-2.4.0 → docpluck-2.4.2}/LICENSE +0 -0
  27. {docpluck-2.4.0 → docpluck-2.4.2}/REPLY_FROM_DOCPLUCK_v1.4.5.md +0 -0
  28. {docpluck-2.4.0 → docpluck-2.4.2}/REPLY_FROM_DOCPLUCK_v1.5.0.md +0 -0
  29. {docpluck-2.4.0 → docpluck-2.4.2}/REQUEST_08_CHUNKING_ENDPOINT.md +0 -0
  30. {docpluck-2.4.0 → docpluck-2.4.2}/REQUEST_09_REFERENCE_LIST_NORMALIZATION.md +0 -0
  31. {docpluck-2.4.0 → docpluck-2.4.2}/TODO.md +0 -0
  32. {docpluck-2.4.0 → docpluck-2.4.2}/docpluck/__main__.py +0 -0
  33. {docpluck-2.4.0 → docpluck-2.4.2}/docpluck/batch.py +0 -0
  34. {docpluck-2.4.0 → docpluck-2.4.2}/docpluck/cli.py +0 -0
  35. {docpluck-2.4.0 → docpluck-2.4.2}/docpluck/extract.py +0 -0
  36. {docpluck-2.4.0 → docpluck-2.4.2}/docpluck/extract_docx.py +0 -0
  37. {docpluck-2.4.0 → docpluck-2.4.2}/docpluck/extract_html.py +0 -0
  38. {docpluck-2.4.0 → docpluck-2.4.2}/docpluck/extract_layout.py +0 -0
  39. {docpluck-2.4.0 → docpluck-2.4.2}/docpluck/extract_structured.py +0 -0
  40. {docpluck-2.4.0 → docpluck-2.4.2}/docpluck/figures/__init__.py +0 -0
  41. {docpluck-2.4.0 → docpluck-2.4.2}/docpluck/figures/detect.py +0 -0
  42. {docpluck-2.4.0 → docpluck-2.4.2}/docpluck/normalize.py +0 -0
  43. {docpluck-2.4.0 → docpluck-2.4.2}/docpluck/quality.py +0 -0
  44. {docpluck-2.4.0 → docpluck-2.4.2}/docpluck/sections/__init__.py +0 -0
  45. {docpluck-2.4.0 → docpluck-2.4.2}/docpluck/sections/annotators/__init__.py +0 -0
  46. {docpluck-2.4.0 → docpluck-2.4.2}/docpluck/sections/annotators/docx.py +0 -0
  47. {docpluck-2.4.0 → docpluck-2.4.2}/docpluck/sections/annotators/html.py +0 -0
  48. {docpluck-2.4.0 → docpluck-2.4.2}/docpluck/sections/annotators/pdf.py +0 -0
  49. {docpluck-2.4.0 → docpluck-2.4.2}/docpluck/sections/annotators/text.py +0 -0
  50. {docpluck-2.4.0 → docpluck-2.4.2}/docpluck/sections/blocks.py +0 -0
  51. {docpluck-2.4.0 → docpluck-2.4.2}/docpluck/sections/boundaries.py +0 -0
  52. {docpluck-2.4.0 → docpluck-2.4.2}/docpluck/sections/core.py +0 -0
  53. {docpluck-2.4.0 → docpluck-2.4.2}/docpluck/sections/taxonomy.py +0 -0
  54. {docpluck-2.4.0 → docpluck-2.4.2}/docpluck/sections/types.py +0 -0
  55. {docpluck-2.4.0 → docpluck-2.4.2}/docpluck/tables/__init__.py +0 -0
  56. {docpluck-2.4.0 → docpluck-2.4.2}/docpluck/tables/bbox_utils.py +0 -0
  57. {docpluck-2.4.0 → docpluck-2.4.2}/docpluck/tables/camelot_extract.py +0 -0
  58. {docpluck-2.4.0 → docpluck-2.4.2}/docpluck/tables/captions.py +0 -0
  59. {docpluck-2.4.0 → docpluck-2.4.2}/docpluck/tables/cell_cleaning.py +0 -0
  60. {docpluck-2.4.0 → docpluck-2.4.2}/docpluck/tables/cluster.py +0 -0
  61. {docpluck-2.4.0 → docpluck-2.4.2}/docpluck/tables/confidence.py +0 -0
  62. {docpluck-2.4.0 → docpluck-2.4.2}/docpluck/tables/detect.py +0 -0
  63. {docpluck-2.4.0 → docpluck-2.4.2}/docpluck/tables/render.py +0 -0
  64. {docpluck-2.4.0 → docpluck-2.4.2}/docpluck/tables/whitespace.py +0 -0
  65. {docpluck-2.4.0 → docpluck-2.4.2}/docpluck/version.py +0 -0
  66. {docpluck-2.4.0 → docpluck-2.4.2}/docs/BENCHMARKS.md +0 -0
  67. {docpluck-2.4.0 → docpluck-2.4.2}/docs/DESIGN.md +0 -0
  68. {docpluck-2.4.0 → docpluck-2.4.2}/docs/HANDOFF_2026-05-07_sections_strict_iteration.md +0 -0
  69. {docpluck-2.4.0 → docpluck-2.4.2}/docs/HANDOFF_2026-05-09_session_state_and_followups.md +0 -0
  70. {docpluck-2.4.0 → docpluck-2.4.2}/docs/HANDOFF_2026-05-09_unified_extraction_brainstorm.md +0 -0
  71. {docpluck-2.4.0 → docpluck-2.4.2}/docs/HANDOFF_2026-05-10_table_rendering_iteration.md +0 -0
  72. {docpluck-2.4.0 → docpluck-2.4.2}/docs/HANDOFF_2026-05-10_table_rendering_iteration_2.md +0 -0
  73. {docpluck-2.4.0 → docpluck-2.4.2}/docs/HANDOFF_2026-05-10_table_rendering_iteration_3.md +0 -0
  74. {docpluck-2.4.0 → docpluck-2.4.2}/docs/HANDOFF_2026-05-10_table_rendering_iteration_4.md +0 -0
  75. {docpluck-2.4.0 → docpluck-2.4.2}/docs/HANDOFF_2026-05-10_table_rendering_iteration_5.md +0 -0
  76. {docpluck-2.4.0 → docpluck-2.4.2}/docs/HANDOFF_2026-05-10_table_rendering_iteration_6.md +0 -0
  77. {docpluck-2.4.0 → docpluck-2.4.2}/docs/HANDOFF_2026-05-10_table_rendering_iteration_7.md +0 -0
  78. {docpluck-2.4.0 → docpluck-2.4.2}/docs/HANDOFF_2026-05-11_PROMOTE_SPIKE_TO_LIBRARY.md +0 -0
  79. {docpluck-2.4.0 → docpluck-2.4.2}/docs/HANDOFF_2026-05-11_table_rendering_iteration_8.md +0 -0
  80. {docpluck-2.4.0 → docpluck-2.4.2}/docs/HANDOFF_2026-05-11_visual_review_findings.md +0 -0
  81. {docpluck-2.4.0 → docpluck-2.4.2}/docs/HANDOFF_2026-05-12_remaining_ui_and_chrome_verification.md +0 -0
  82. {docpluck-2.4.0 → docpluck-2.4.2}/docs/HANDOFF_2026-05-12_visual_verify_results.md +0 -0
  83. {docpluck-2.4.0 → docpluck-2.4.2}/docs/NORMALIZATION.md +0 -0
  84. {docpluck-2.4.0 → docpluck-2.4.2}/docs/README.md +0 -0
  85. {docpluck-2.4.0 → docpluck-2.4.2}/docs/TRIAGE_2026-05-10_corpus_assessment.md +0 -0
  86. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/2026-05-06-section-identification.md +0 -0
  87. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/2026-05-06-table-extraction.md +0 -0
  88. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/2026-05-07-sections-strict-iteration-progress.md +0 -0
  89. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/2026-05-08-unified-extraction-phase-0-splice-spike.md +0 -0
  90. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/sections-deferred-items.md +0 -0
  91. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/sections-issues-backlog.md +0 -0
  92. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/2026-05-07_spot-01_apa.md +0 -0
  93. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/2026-05-07_spot-02_pattern-A-shipped.md +0 -0
  94. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/2026-05-08_spot-final_all-styles.md +0 -0
  95. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/experiments/COMPARISON.md +0 -0
  96. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/korbmacher_table1.md +0 -0
  97. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/option-a.py +0 -0
  98. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/ziano_table1.md +0 -0
  99. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_notes_raw.txt +0 -0
  100. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_table1.md +0 -0
  101. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/notes.md +0 -0
  102. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/option-b.py +0 -0
  103. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_notes_raw.txt +0 -0
  104. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_table1.md +0 -0
  105. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/korbmacher_table1.md +0 -0
  106. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/notes.md +0 -0
  107. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/option-c.py +0 -0
  108. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/sample-pdftotext-bbox.html +0 -0
  109. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/ziano_table1.md +0 -0
  110. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/korbmacher_table1.md +0 -0
  111. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/notes.md +0 -0
  112. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/option-d.py +0 -0
  113. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/ziano_table1.md +0 -0
  114. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_2022_kruger_bbox.html +0 -0
  115. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_bbox.html +0 -0
  116. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_table1.md +0 -0
  117. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/option-e.py +0 -0
  118. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/sample-bbox.html +0 -0
  119. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_2021_joep_bbox.html +0 -0
  120. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_bbox.html +0 -0
  121. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_table1.md +0 -0
  122. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/html-fallback-demo.md +0 -0
  123. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.err +0 -0
  124. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.md +0 -0
  125. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.err +0 -0
  126. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.md +0 -0
  127. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.err +0 -0
  128. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.md +0 -0
  129. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.err +0 -0
  130. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.md +0 -0
  131. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.err +0 -0
  132. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.md +0 -0
  133. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.err +0 -0
  134. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.md +0 -0
  135. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.err +0 -0
  136. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.md +0 -0
  137. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.err +0 -0
  138. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.md +0 -0
  139. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.err +0 -0
  140. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.md +0 -0
  141. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.err +0 -0
  142. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.md +0 -0
  143. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.err +0 -0
  144. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.md +0 -0
  145. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.err +0 -0
  146. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.md +0 -0
  147. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.err +0 -0
  148. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.md +0 -0
  149. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.err +0 -0
  150. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.md +0 -0
  151. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.err +0 -0
  152. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.md +0 -0
  153. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.err +0 -0
  154. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.md +0 -0
  155. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.err +0 -0
  156. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.md +0 -0
  157. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.err +0 -0
  158. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.md +0 -0
  159. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.err +0 -0
  160. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.md +0 -0
  161. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.err +0 -0
  162. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.md +0 -0
  163. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.err +0 -0
  164. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.md +0 -0
  165. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.err +0 -0
  166. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.md +0 -0
  167. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.err +0 -0
  168. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.md +0 -0
  169. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.err +0 -0
  170. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.md +0 -0
  171. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.err +0 -0
  172. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.md +0 -0
  173. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.err +0 -0
  174. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.md +0 -0
  175. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/papers.md +0 -0
  176. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/report.md +0 -0
  177. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/splice_spike.py +0 -0
  178. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/plans/spot-checks/splice-spike/test_splice_spike.py +0 -0
  179. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/specs/2026-04-27-request-09-reference-normalization-design.md +0 -0
  180. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/specs/2026-05-06-section-identification-design.md +0 -0
  181. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/specs/2026-05-06-table-extraction-design.md +0 -0
  182. {docpluck-2.4.0 → docpluck-2.4.2}/docs/superpowers/specs/2026-05-08-unified-extraction-design.md +0 -0
  183. {docpluck-2.4.0 → docpluck-2.4.2}/scripts/verify_corpus.py +0 -0
  184. {docpluck-2.4.0 → docpluck-2.4.2}/tests/__init__.py +0 -0
  185. {docpluck-2.4.0 → docpluck-2.4.2}/tests/conftest.py +0 -0
  186. {docpluck-2.4.0 → docpluck-2.4.2}/tests/fixtures/__init__.py +0 -0
  187. {docpluck-2.4.0 → docpluck-2.4.2}/tests/fixtures/sections/__init__.py +0 -0
  188. {docpluck-2.4.0 → docpluck-2.4.2}/tests/fixtures/sections/builders.py +0 -0
  189. {docpluck-2.4.0 → docpluck-2.4.2}/tests/fixtures/structured/.gitkeep +0 -0
  190. {docpluck-2.4.0 → docpluck-2.4.2}/tests/fixtures/structured/MANIFEST.json +0 -0
  191. {docpluck-2.4.0 → docpluck-2.4.2}/tests/fixtures/structured/README.md +0 -0
  192. {docpluck-2.4.0 → docpluck-2.4.2}/tests/golden/sections/apa_multi_study_pdf.json +0 -0
  193. {docpluck-2.4.0 → docpluck-2.4.2}/tests/golden/sections/apa_single_study_pdf.json +0 -0
  194. {docpluck-2.4.0 → docpluck-2.4.2}/tests/golden/sections/html_real_headings.json +0 -0
  195. {docpluck-2.4.0 → docpluck-2.4.2}/tests/snapshots/amj_lattice.txt +0 -0
  196. {docpluck-2.4.0 → docpluck-2.4.2}/tests/snapshots/apa_chan_feldman_lineless.txt +0 -0
  197. {docpluck-2.4.0 → docpluck-2.4.2}/tests/snapshots/apa_chen_jesp_lineless.txt +0 -0
  198. {docpluck-2.4.0 → docpluck-2.4.2}/tests/snapshots/apa_efendic_affect.txt +0 -0
  199. {docpluck-2.4.0 → docpluck-2.4.2}/tests/snapshots/apa_ip_feldman_pspb.txt +0 -0
  200. {docpluck-2.4.0 → docpluck-2.4.2}/tests/snapshots/bmc_lattice.txt +0 -0
  201. {docpluck-2.4.0 → docpluck-2.4.2}/tests/snapshots/ieee_figure_heavy.txt +0 -0
  202. {docpluck-2.4.0 → docpluck-2.4.2}/tests/snapshots/ieee_lattice.txt +0 -0
  203. {docpluck-2.4.0 → docpluck-2.4.2}/tests/snapshots/jama_lattice.txt +0 -0
  204. {docpluck-2.4.0 → docpluck-2.4.2}/tests/snapshots/nat_comms_figure_only.txt +0 -0
  205. {docpluck-2.4.0 → docpluck-2.4.2}/tests/snapshots/nature_minimal_rule.txt +0 -0
  206. {docpluck-2.4.0 → docpluck-2.4.2}/tests/snapshots/scirep_minimal_rule.txt +0 -0
  207. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_bbox_utils.py +0 -0
  208. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_benchmark_docx_html.py +0 -0
  209. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_caption_regex.py +0 -0
  210. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_cli_sections.py +0 -0
  211. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_cli_structured.py +0 -0
  212. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_confidence.py +0 -0
  213. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_corpus_smoke.py +0 -0
  214. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_d5_normalization_audit.py +0 -0
  215. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_edge_cases.py +0 -0
  216. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_extract_docx.py +0 -0
  217. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_extract_filter_sugar.py +0 -0
  218. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_extract_html.py +0 -0
  219. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_extract_layout.py +0 -0
  220. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_extract_pdf_structured.py +0 -0
  221. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_extraction.py +0 -0
  222. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_f0_table_region_aware.py +0 -0
  223. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_figure_detect.py +0 -0
  224. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_fixtures_manifest.py +0 -0
  225. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_lattice_cluster.py +0 -0
  226. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_metaesci_followups.py +0 -0
  227. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_normalization.py +0 -0
  228. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_normalize_f0_footnote_strip.py +0 -0
  229. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_normalize_layout_param.py +0 -0
  230. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_normalize_report_layout_fields.py +0 -0
  231. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_normalize_v18_strips.py +0 -0
  232. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_quality.py +0 -0
  233. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_render_html.py +0 -0
  234. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_request_09_reference_normalization.py +0 -0
  235. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_sections_boundaries.py +0 -0
  236. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_sections_boundary_truncation.py +0 -0
  237. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_sections_core_partition.py +0 -0
  238. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_sections_docx_annotator.py +0 -0
  239. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_sections_extract_text.py +0 -0
  240. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_sections_footnote_section.py +0 -0
  241. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_sections_golden.py +0 -0
  242. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_sections_html_annotator.py +0 -0
  243. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_sections_pdf_annotator.py +0 -0
  244. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_sections_public_api.py +0 -0
  245. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_sections_real_corpus.py +0 -0
  246. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_sections_taxonomy.py +0 -0
  247. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_sections_text_annotator.py +0 -0
  248. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_sections_types.py +0 -0
  249. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_sections_unit_corpus.py +0 -0
  250. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_sections_v161_coalesce.py +0 -0
  251. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_sections_v161_subheadings.py +0 -0
  252. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_sections_v161_taxonomy.py +0 -0
  253. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_sections_v161_text_annotator.py +0 -0
  254. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_sections_version.py +0 -0
  255. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_smoke_fixtures.py +0 -0
  256. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_structured_result_type.py +0 -0
  257. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_structured_types.py +0 -0
  258. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_structured_version.py +0 -0
  259. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_table_detect.py +0 -0
  260. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_tables_cell_cleaning.py +0 -0
  261. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_text_mode.py +0 -0
  262. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_v23_1_fixes.py +0 -0
  263. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_v23_bug_fixes.py +0 -0
  264. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_v23_post_corpus.py +0 -0
  265. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_v23_post_corpus_v2.py +0 -0
  266. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_v2_backwards_compat.py +0 -0
  267. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_v2_top_level_exports.py +0 -0
  268. {docpluck-2.4.0 → docpluck-2.4.2}/tests/test_whitespace_cluster.py +0 -0
@@ -1,5 +1,43 @@
1
1
  # Changelog
2
2
 
3
+ ## [2.4.2] — 2026-05-13
4
+
5
+ Iterative follow-up. After v2.4.1 the 101-PDF corpus run was 98/101 PASS (`scripts/verify_corpus_full.py`); this release closes two of the three remaining failures and reframes the third as a known short-paper edge case in the verifier.
6
+
7
+ ### Fixes
8
+
9
+ 1. **`docpluck/render.py::_render_sections_to_markdown`** — table emission when Camelot returned no cells. Previously, a located table with a caption but no structured cells produced ``### Table N\n*caption*\n`` in body markdown — promising structured content that wasn't there. Verifier flagged this with the `H` tag (missing_html). Two papers affected: `bjps_4`, `ar_apa_j_jesp_2009_12_011`. New behavior: when `html` is empty for a body-located table, skip the `### Table N` heading and emit only the caption as a plain italic paragraph (`*Table N. caption text*`). The table reference is still surfaced in body flow, but without the false promise of structured HTML. Same treatment for the unlocated-tables appendix — tables with neither caption nor cells are dropped (a bare `### Table N` stub is information-free).
10
+
11
+ 2. **`docpluck/render.py::_render_sections_to_markdown`** — title-case canonical section headings when pdftotext flattens Elsevier letter-spaced typography. JESP / Cognition / JEP papers render their section headings with letter-spacing (``a b s t r a c t``), which pdftotext extracts as a lone lowercase word. Without this fix the rendered output mixes ``## abstract`` with ``## Methods`` / ``## Results`` — a stylistic blemish on every Elsevier-style paper. New rule: when the captured `heading_text` is entirely lowercase ASCII AND the section has a recognized canonical label, replace the heading with the pretty Title-Case form (`Abstract`, `Keywords`, etc.). All-caps publisher headings (JAMA ``RESULTS``) are preserved verbatim — only lowercase is rewritten.
12
+
13
+ ### Verifier upgrade
14
+
15
+ 3. **`scripts/verify_corpus_full.py::_classify`** — short-paper exemption. The `S` (section_count < 4) and `X` (output < 5 KB) tags are now suppressed when the rendered title contains `ADDENDUM` / `CORRIGENDUM` / `CORRECTION` / `ERRATUM` / `RETRACTION`. The canonical example is `jdm_.2023.10`, a 1-page archival correction notice that legitimately has 1 section and ~1 KB of body content; flagging it as a render failure was a verifier false positive.
16
+
17
+ ### Bumps
18
+
19
+ - `__version__`: `2.4.1` → `2.4.2`. Patch — render behavior changes affect only the 2 H-tagged papers + lowercase-abstract heading on Elsevier-style papers; no API change.
20
+
21
+ ### Tests
22
+
23
+ 6 new tests in `tests/test_render.py` covering the H-tag emission rules (body-located + appendix), the lowercase-canonical heading Title-Case rule, and the happy-path no-op cases.
24
+
25
+ ## [2.4.1] — 2026-05-12
26
+
27
+ Same-day follow-up to v2.4.0. Expanded testing to all 101 PDFs in the wider corpus (vs the 26 spike-baseline papers) and fixed the most common new failure: missing-title on AMA/AOM single-line title layouts.
28
+
29
+ ### Fixes
30
+
31
+ 1. **`docpluck/render.py::_compute_layout_title`** — title-size selection in two passes:
32
+ - Pass 1 (unchanged): largest font with count ≥ 2 (multi-line titles).
33
+ - Pass 2 (new): largest font in the TOP region (y0 ≥ 70% of page height) with count ≥ 1 and combined span text ≥ 10 chars.
34
+
35
+ Without the top-region restriction + text-length floor, a stray same-font glyph elsewhere on the page (a "+" decoration at font 16.0, a "GUIDEPOST" feature-label at font 30.0) would outrank a real single-line title at a smaller-but-still-large font. Affects: `jama_open_3`, `jama_open_4`, `jama_open_6`, `jama_open_10`, `annals_4`, `amd_1` and similar AMA/AOM-style papers.
36
+
37
+ ### Bumps
38
+
39
+ - `__version__`: `2.4.0` → `2.4.1`. Patch-level — internal heuristic improvement, no API change.
40
+
3
41
  ## [2.4.0] — 2026-05-12
4
42
 
5
43
  Same-day follow-up. Closes the three real library bugs surfaced by the AI-Chrome visual verification pass on all 26 corpus papers documented in `docs/HANDOFF_2026-05-12_visual_verify_results.md`. The API-level `verify_corpus.py` was passing 26/26 throughout but couldn't see these — visual inspection in the workspace was needed.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docpluck
3
- Version: 2.4.0
3
+ Version: 2.4.2
4
4
  Summary: PDF, DOCX, and HTML text extraction and normalization for academic papers
5
5
  Project-URL: Homepage, https://github.com/giladfeldman/docpluck
6
6
  Project-URL: Documentation, https://github.com/giladfeldman/docpluck/tree/main/docs
@@ -71,7 +71,7 @@ from .figures import Figure
71
71
  from .extract_structured import TABLE_EXTRACTION_VERSION, StructuredResult, extract_pdf_structured
72
72
  from .render import render_pdf_to_markdown
73
73
 
74
- __version__ = "2.4.0"
74
+ __version__ = "2.4.2"
75
75
  __author__ = "Gilad Feldman"
76
76
  __license__ = "MIT"
77
77
 
@@ -610,13 +610,34 @@ def _compute_layout_title(layout_doc: LayoutDoc) -> Optional[str]:
610
610
  round(float(s.font_size) * 2) / 2 for s in upper_spans
611
611
  )
612
612
  title_size: Optional[float] = None
613
+ # Pass 1: largest font with count >= 2 (the title typically spans
614
+ # 2-3 lines).
613
615
  for sz, count in sorted(size_counts.items(), reverse=True):
614
616
  if sz >= 12.0 and count >= 2:
615
617
  title_size = sz
616
618
  break
617
- if sz >= 14.0 and count >= 1:
618
- title_size = sz
619
- break
619
+ if title_size is None:
620
+ # Pass 2: fall back to the largest font in the TOP region with
621
+ # count >= 1 AND >= 10 chars of combined span text. The top-
622
+ # region filter (y0 >= 70% of page height) rejects mid-page
623
+ # decorations like a "+" badge or section-heading numerals.
624
+ # The text-length filter rejects short feature-labels (e.g. AOM
625
+ # papers' "GUIDEPOST" header at font 30) in favor of the longer
626
+ # title block immediately below.
627
+ top_region_threshold = height * 0.70
628
+ top_spans = [s for s in upper_spans if s.y0 >= top_region_threshold]
629
+ candidate_sizes = sorted(
630
+ {round(float(s.font_size) * 2) / 2 for s in top_spans},
631
+ reverse=True,
632
+ )
633
+ for sz in candidate_sizes:
634
+ if sz < 14.0:
635
+ break
636
+ matching = [s for s in top_spans if abs(float(s.font_size) - sz) < 0.3]
637
+ combined_text_len = sum(len((s.text or "").strip()) for s in matching)
638
+ if combined_text_len >= 10:
639
+ title_size = sz
640
+ break
620
641
  if title_size is None:
621
642
  return None
622
643
 
@@ -1140,6 +1161,23 @@ def _render_sections_to_markdown(
1140
1161
  )
1141
1162
  if not skip_heading:
1142
1163
  heading = sec.heading_text or _pretty_label(sec.label)
1164
+ # v2.4.2: when the heading_text the section detector captured is
1165
+ # entirely lowercase (Elsevier "a b s t r a c t" letter-spaced
1166
+ # typography → pdftotext flattens to "abstract") AND the section
1167
+ # has a recognized canonical label, prefer the pretty Title-Case
1168
+ # form. Without this fix the rendered output reads ``## abstract``
1169
+ # alongside ``## Methods``/``## Results`` — a stylistic blemish
1170
+ # that surfaces on every Elsevier (JESP, Cognition, JEP) paper.
1171
+ if (
1172
+ heading
1173
+ and heading == heading.lower()
1174
+ and heading.isascii()
1175
+ and any(c.isalpha() for c in heading)
1176
+ and canonical != "unknown"
1177
+ ):
1178
+ pretty = _pretty_label(sec.label)
1179
+ if pretty and pretty != heading:
1180
+ heading = pretty
1143
1181
  # \n\n (not \n) separates heading from body so downstream
1144
1182
  # markdown renderers treat them as a heading block + paragraph,
1145
1183
  # not as one mashed paragraph starting with "## Abstract ...".
@@ -1170,11 +1208,19 @@ def _render_sections_to_markdown(
1170
1208
  if kind == "table":
1171
1209
  cells = item.get("cells") or []
1172
1210
  html = item.get("html") or (cells_to_html(cells) if cells else "")
1173
- body_chunks.append(f"\n### {label}\n")
1174
- if cap:
1175
- body_chunks.append(f"*{cap}*\n")
1176
1211
  if html:
1212
+ body_chunks.append(f"\n### {label}\n")
1213
+ if cap:
1214
+ body_chunks.append(f"*{cap}*\n")
1177
1215
  body_chunks.append(html)
1216
+ elif cap:
1217
+ # v2.4.2: Camelot returned no cells for this caption.
1218
+ # Skip the `### Table N` heading (which would falsely
1219
+ # promise structured content) and emit the caption as a
1220
+ # plain italicized paragraph so the table reference is
1221
+ # preserved in body flow. Affected papers in the
1222
+ # 101-PDF corpus: bjps_4, ar_apa_j_jesp_2009_12_011.
1223
+ body_chunks.append(f"\n*{cap}*\n")
1178
1224
  else:
1179
1225
  body_chunks.append(f"\n### {label}\n")
1180
1226
  if cap:
@@ -1192,18 +1238,28 @@ def _render_sections_to_markdown(
1192
1238
  leftover_figures.extend(unlocated_figures)
1193
1239
 
1194
1240
  if leftover_tables:
1195
- out_chunks.append("## Tables (unlocated in body)\n\n")
1196
- for t in leftover_tables:
1197
- label = t.get("label") or "Table"
1198
- cap = t.get("caption") or ""
1199
- cells = t.get("cells") or []
1200
- html = t.get("html") or (cells_to_html(cells) if cells else "")
1201
- out_chunks.append(f"### {label}\n")
1202
- if cap:
1203
- out_chunks.append(f"*{cap}*\n")
1204
- if html:
1205
- out_chunks.append(html + "\n")
1206
- out_chunks.append("\n")
1241
+ # v2.4.2: drop tables that have neither a caption nor structured
1242
+ # HTML — emitting a bare ``### Table N`` header in the appendix
1243
+ # adds no information and clutters the output.
1244
+ renderable_tables = [
1245
+ t for t in leftover_tables
1246
+ if (t.get("caption") or "").strip()
1247
+ or t.get("html")
1248
+ or t.get("cells")
1249
+ ]
1250
+ if renderable_tables:
1251
+ out_chunks.append("## Tables (unlocated in body)\n\n")
1252
+ for t in renderable_tables:
1253
+ label = t.get("label") or "Table"
1254
+ cap = t.get("caption") or ""
1255
+ cells = t.get("cells") or []
1256
+ html = t.get("html") or (cells_to_html(cells) if cells else "")
1257
+ out_chunks.append(f"### {label}\n")
1258
+ if cap:
1259
+ out_chunks.append(f"*{cap}*\n")
1260
+ if html:
1261
+ out_chunks.append(html + "\n")
1262
+ out_chunks.append("\n")
1207
1263
 
1208
1264
  if leftover_figures:
1209
1265
  out_chunks.append("## Figures\n\n")
@@ -0,0 +1,57 @@
1
+ # Handoff — Phase 2 (101-PDF corpus expansion)
2
+
3
+ **Session date:** 2026-05-12 (continuation of the v2.3.1 → v2.4.0 → v2.4.1 release chain)
4
+
5
+ ## State at handoff
6
+
7
+ - **Library:** v2.4.1 tagged + pushed. PyPI not published.
8
+ - **App pin:** `docpluck v2.4.0` in `PDFextractor/service/requirements.txt`. Needs bump to v2.4.1 next session.
9
+ - **26-paper corpus verifier (`scripts/verify_corpus.py`):** 26/26 PASS at v2.4.1.
10
+ - **101-paper corpus verifier (`scripts/verify_corpus_full.py`, new this session):** partial-run result before v2.4.1 was applied — 7 failures observed in 25 papers processed (run cancelled to ship v2.4.1). Of those, 5 were the M-tag (missing title) on AMA/AOM single-span-title layouts that v2.4.1 specifically targets. **Next session must re-run with `python scripts/verify_corpus_full.py` to enumerate the actual v2.4.1 failure set.**
11
+
12
+ ## What's in v2.4.1
13
+
14
+ A single fix to `_compute_layout_title` in `docpluck/render.py`:
15
+
16
+ - Pass 2 of the title-size selector (single-span fallback) now requires the span to be in the TOP region of the page (y0 ≥ 70% of page height) AND have ≥ 10 chars of combined text.
17
+ - Catches AMA/AOM cases where a mid-page big-font decoration (a "+" glyph at font 16.0, a "GUIDEPOST" feature-label at font 30.0) was outranking the actual title at a smaller font (e.g. font 15.0 on the JAMA Open layout).
18
+
19
+ Affects: `jama_open_3/4/6/10`, `amd_1`, `annals_4`, and likely several more AMA-format papers in the wider 101-PDF corpus.
20
+
21
+ ## Known issues remaining (from partial 101-run)
22
+
23
+ | Paper | Tag | Cause |
24
+ |---|---|---|
25
+ | `ar_apa_j_jesp_2009_12_011` | H | Camelot couldn't extract any tables despite body referencing them (`### Table N` headings present but no `<table>` HTML). Known Camelot limitation; banner already warns user. |
26
+
27
+ Other papers' status under v2.4.1 is **unknown** — the partial run was on the v2.4.0 code path and is now stale.
28
+
29
+ ## Recommended next-session workflow
30
+
31
+ 1. **Bump app pin** in `PDFextractor/service/requirements.txt`: `v2.4.0` → `v2.4.1`. Commit + push.
32
+ 2. **Run full 101-PDF verifier:** `python scripts/verify_corpus_full.py --save-renders` (15-30 min).
33
+ 3. **Triage failures** by tag frequency: M / D / R / S / H / C / X / L / J. Probably 2-5 distinct root-cause patterns.
34
+ 4. **Pick top 1-2 patterns** with highest paper-count, root-cause, fix in `render.py` (or wherever it lives), add unit tests.
35
+ 5. **Re-run 26-paper verifier** to guard against regressions.
36
+ 6. **Tag + push** as v2.4.2.
37
+ 7. **Visual spot-check** of representative fixed papers through the workspace via Chrome MCP.
38
+ 8. Repeat from step 2 until weekly quota exhausted or all 101 papers pass.
39
+
40
+ ## Renders directory
41
+
42
+ `tmp/renders_v2.4.0/` contains rendered `.md` files for the ~25 papers processed in the partial run. Useful for grepping for "## Heading word" patterns and other regressions before re-running. **Stale at v2.4.1** — re-render is needed to update them.
43
+
44
+ ## Tagging legend (for the new verifier)
45
+
46
+ | Tag | Meaning |
47
+ |---|---|
48
+ | M | missing `# Title` line |
49
+ | T | title ends in connector word ("of", "the", "and", ...) — almost certainly truncated |
50
+ | D | title is missing distinct words ≥ 4 letters that the spike baseline has (middle truncation; needs spike baseline to fire) |
51
+ | R | title text appears as body prose immediately after `# Title` (Nature-style duplication) |
52
+ | S | section count < 4 |
53
+ | H | `### Table N` headings present in body but no `<table>` HTML element |
54
+ | C | longest `*Figure N. ...*` caption > 800 chars (boundary leak) |
55
+ | X | output < 5 KB (extremely short — likely PDF extract failure) |
56
+ | L | output much shorter than spike baseline (requires baseline) |
57
+ | J | Jaccard vs spike < 0.6 (requires baseline) |
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "docpluck"
7
- version = "2.4.0"
7
+ version = "2.4.2"
8
8
  description = "PDF, DOCX, and HTML text extraction and normalization for academic papers"
9
9
  readme = "docs/README.md"
10
10
  requires-python = ">=3.10"
@@ -0,0 +1,288 @@
1
+ """Full-corpus verifier: run v2.4.0 render across all 101 PDFs in
2
+ PDFextractor/test-pdfs/ and flag papers with structural issues, even those
3
+ without a spike baseline.
4
+
5
+ For papers WITH a spike baseline, full metrics (char-ratio, Jaccard, D-tag)
6
+ apply just like in verify_corpus.py.
7
+
8
+ For papers WITHOUT a spike baseline (75 of the 101), we apply baseline-free
9
+ heuristics:
10
+ - title present? non-trivial? not trailing-truncated?
11
+ - section count >= 4 (most academic papers have at least Abstract +
12
+ Introduction + Methods/Results + Discussion + References)
13
+ - rendered length plausible (>5 KB)
14
+ - title block not duplicated immediately in body (Nature-style)
15
+
16
+ Output: one line per paper with status + tags, then a triage section
17
+ listing the top issues for follow-up.
18
+
19
+ Usage:
20
+ python scripts/verify_corpus_full.py
21
+ python scripts/verify_corpus_full.py --only-fails
22
+ python scripts/verify_corpus_full.py --paper jama_open_5
23
+ """
24
+ from __future__ import annotations
25
+
26
+ import argparse
27
+ import re
28
+ import sys
29
+ import time
30
+ from pathlib import Path
31
+ from typing import Optional
32
+
33
+
34
+ REPO_ROOT = Path(__file__).resolve().parent.parent
35
+ APP_PDFS = REPO_ROOT.parent / "PDFextractor" / "test-pdfs"
36
+ SPIKE_OUT_DIRS = [
37
+ REPO_ROOT / "docs/superpowers/plans/spot-checks/splice-spike/outputs",
38
+ REPO_ROOT / "docs/superpowers/plans/spot-checks/splice-spike/outputs-new",
39
+ ]
40
+ RENDERS_DIR = REPO_ROOT / "tmp" / "renders_v2.4.0"
41
+
42
+
43
+ _CONNECTOR_TAIL = {
44
+ "of", "from", "for", "the", "and", "or", "to", "with", "on", "at",
45
+ "by", "in", "as", "is", "a", "an", "but", "into", "onto", "upon",
46
+ "than", "that", "which", "who", "when", "where", "while", "during",
47
+ "after", "before", "because", "since", "though", "although",
48
+ }
49
+
50
+ _TITLE_RE = re.compile(r"^\s*#\s+([^\n]+)$", re.MULTILINE)
51
+ _H2_RE = re.compile(r"^\s*##\s+([^\n]+)$", re.MULTILINE)
52
+ _TABLE_HTML_RE = re.compile(r"<table>")
53
+ _FIG_CAPTION_RE = re.compile(r"^\*Figure\s+\d+\.?\s+[^\n]*?\*\s*$", re.MULTILINE)
54
+
55
+
56
def _all_pdfs() -> list[Path]:
    """Return every ``*.pdf`` found recursively under ``APP_PDFS``, sorted by path."""
    pdf_paths = list(APP_PDFS.rglob("*.pdf"))
    pdf_paths.sort()
    return pdf_paths
58
+
59
+
60
def _find_spike_md(name: str) -> Optional[Path]:
    """Return the first existing ``<name>.md`` under ``SPIKE_OUT_DIRS``.

    Directories are probed in the order they are listed; ``None`` is
    returned when no directory contains a file with that name.
    """
    candidates = (out_dir / f"{name}.md" for out_dir in SPIKE_OUT_DIRS)
    return next((path for path in candidates if path.exists()), None)
66
+
67
+
68
+ def _word_set(text: str) -> set[str]:
69
+ return set(re.findall(r"[A-Za-z]{4,}", text.lower()))
70
+
71
+
72
+ def _title_word_delta(rendered_title: Optional[str], spike_title: Optional[str]) -> int:
73
+ if not rendered_title or not spike_title:
74
+ return 0
75
+ rw = set(re.findall(r"[A-Za-z]{4,}", rendered_title.lower()))
76
+ sw = set(re.findall(r"[A-Za-z]{4,}", spike_title.lower()))
77
+ return len(sw - rw)
78
+
79
+
80
+ def _has_immediate_title_repeat(md: str, title: str) -> bool:
81
+ """True if the first few body paragraphs contain a span whose token
82
+ content matches the title (the symptom my Nature-style sweep targets).
83
+ Conservative — should never fire after v2.4.0 unless a regression."""
84
+ if not title:
85
+ return False
86
+ title_tokens = re.findall(r"\w+", title.lower())
87
+ if len(title_tokens) < 4:
88
+ return False
89
+ title_set = set(title_tokens)
90
+ # Skip the title line itself; scan the next ~30 non-blank body lines.
91
+ lines = md.split("\n")
92
+ after_title = False
93
+ accumulated: list[str] = []
94
+ n_scanned = 0
95
+ for ln in lines:
96
+ line = ln.strip()
97
+ if not after_title:
98
+ if line.startswith("# "):
99
+ after_title = True
100
+ continue
101
+ if not line or line.startswith("#"):
102
+ if accumulated:
103
+ # check whole accumulated span
104
+ covered = sum(1 for t in title_tokens if t in accumulated)
105
+ in_title = sum(1 for t in accumulated if t in title_set)
106
+ if covered >= 0.8 * len(title_tokens) and in_title >= 0.7 * len(accumulated):
107
+ return True
108
+ accumulated = []
109
+ continue
110
+ accumulated.extend(re.findall(r"\w+", line.lower()))
111
+ n_scanned += 1
112
+ if n_scanned > 30:
113
+ break
114
+ return False
115
+
116
+
117
def _metrics(md: str) -> dict:
    """Collect structural metrics from a rendered markdown document.

    Returns a dict with the first ``# `` title (or ``None``), whether that
    title ends in a connector word, the ``## `` section names and their
    count, the number of ``<table>`` tags, the total character count,
    whether the title is repeated early in the body, and the length of
    the longest ``*Figure N. ...*`` caption line (0 when there is none).
    """
    title_match = _TITLE_RE.search(md)
    title = None if title_match is None else title_match.group(1).strip()

    # A title whose last word (ignoring trailing punctuation) is a
    # connector such as "of" or "the" was almost certainly cut short.
    title_truncated = False
    if title:
        core = re.sub(r"[\s\.,;:!?\-—–]+$", "", title).lower()
        final_word = core.rsplit(None, 1)[-1] if " " in core else core
        title_truncated = final_word in _CONNECTOR_TAIL

    headings = _H2_RE.findall(md)
    table_count = len(_TABLE_HTML_RE.findall(md))
    repeated = _has_immediate_title_repeat(md, title) if title else False
    caption_lengths = [len(cap.group(0)) for cap in _FIG_CAPTION_RE.finditer(md)]

    return {
        "title": title,
        "title_truncated": title_truncated,
        "section_count": len(headings),
        "section_names": headings,
        "table_html_count": table_count,
        "total_chars": len(md),
        "title_repeat_in_body": repeated,
        "longest_fig_caption_chars": max(caption_lengths, default=0),
    }
138
+
139
+
140
+ _CORRECTION_TITLE_RE = re.compile(
141
+ r"\b(?:addendum|corrigendum|correction|erratum|retraction)\b",
142
+ re.IGNORECASE,
143
+ )
144
+
145
+
146
def _classify(name: str, md: str, spike_md: Optional[str]) -> tuple[str, dict, list[str]]:
    """Classify one rendered paper as PASS / WARN / FAIL.

    Args:
        name: Paper identifier (not used by the checks themselves).
        md: Rendered markdown for the paper.
        spike_md: Spike-baseline markdown, or ``None``/empty when the
            paper has no baseline.

    Returns:
        ``(status, metrics, tags)``. ``metrics`` is the ``_metrics`` dict
        extended with ``char_ratio_vs_spike``, ``jaccard_vs_spike`` and
        ``title_missing_words``. ``status`` is ``"PASS"`` with no tags,
        ``"WARN"`` when the only tag is ``L``, and ``"FAIL"`` otherwise.
    """
    m = _metrics(md)
    title_text = m["title"] or ""
    is_correction_paper = _CORRECTION_TITLE_RE.search(title_text) is not None

    # Only "### Table N" headings that precede the unlocated-tables
    # appendix count as body table headings.
    body_section, _, _ = md.partition("## Tables (unlocated in body)")
    body_table_count = sum(
        1 for _ in re.finditer(r"^\s*###\s+Table\s+\d+", body_section, re.MULTILINE)
    )

    # Baseline-free checks, in the order their tags are reported.
    # S and X are suppressed for addendum / corrigendum / correction /
    # erratum / retraction notices, which are legitimately short.
    baseline_free_checks = [
        ("M", m["title"] is None),
        ("T", m["title_truncated"]),
        ("S", m["section_count"] < 4 and not is_correction_paper),
        ("R", m["title_repeat_in_body"]),
        ("H", body_table_count > 0 and m["table_html_count"] == 0),
        ("C", m["longest_fig_caption_chars"] > 800),
        ("X", m["total_chars"] < 5000 and not is_correction_paper),
    ]
    tags: list[str] = [tag for tag, fired in baseline_free_checks if fired]

    # Baseline-dependent checks.
    spike_title = None
    if spike_md:
        spike_match = _TITLE_RE.search(spike_md)
        if spike_match:
            spike_title = spike_match.group(1).strip()
        rendered_words = _word_set(md)
        spike_words = _word_set(spike_md)
        all_words = rendered_words | spike_words
        char_ratio = m["total_chars"] / max(1, len(spike_md))
        jaccard = len(rendered_words & spike_words) / len(all_words) if all_words else None
        m["char_ratio_vs_spike"] = char_ratio
        m["jaccard_vs_spike"] = jaccard
        if char_ratio < 0.7:
            tags.append("L")
        if jaccard is not None and jaccard < 0.6:
            tags.append("J")
    else:
        m["char_ratio_vs_spike"] = None
        m["jaccard_vs_spike"] = None

    missing_words = _title_word_delta(m["title"], spike_title) if spike_title else 0
    if missing_words > 0:
        tags.append("D")
    m["title_missing_words"] = missing_words

    if not tags:
        status = "PASS"
    elif all(tag == "L" for tag in tags):
        status = "WARN"
    else:
        status = "FAIL"
    return status, m, tags
208
+
209
+
210
def _run_render(pdf_path: Path) -> tuple[str, float]:
    """Render one PDF to markdown and time it.

    Returns ``(markdown, elapsed_seconds)``; the elapsed time covers both
    reading the file and the ``render_pdf_to_markdown`` call.
    """
    # Imported lazily, inside the function, as in the original script.
    from docpluck import render_pdf_to_markdown

    started = time.time()
    markdown = render_pdf_to_markdown(pdf_path.read_bytes())
    elapsed = time.time() - started
    return markdown, elapsed
216
+
217
+
218
def main() -> int:
    """Run the full-corpus verification and print a report.

    Command-line options:
        --paper NAME     only process the PDF whose stem equals NAME
        --only-fails     hide PASS rows in the per-paper listing
        --save-renders   write each rendered .md into ``RENDERS_DIR``

    Returns:
        Process exit code: 0 when no paper is FAIL or ERROR, otherwise 1
        (also 1 when no PDFs are found).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--paper")
    ap.add_argument("--only-fails", action="store_true")
    ap.add_argument("--save-renders", action="store_true",
                    help="dump each rendered .md to tmp/renders_v2.4.0/")
    args = ap.parse_args()

    pdfs = _all_pdfs()
    if args.paper:
        pdfs = [p for p in pdfs if p.stem == args.paper]
    if not pdfs:
        print("ERROR: no PDFs found", file=sys.stderr)
        return 1
    if args.save_renders:
        RENDERS_DIR.mkdir(parents=True, exist_ok=True)

    # NOTE(review): the "v2.4.0" label below and in RENDERS_DIR is
    # hard-coded and does not track the installed docpluck version.
    print(f"# Full-corpus verification — {len(pdfs)} PDFs (v2.4.0)")
    print(f"# legend: M=missing_title T=title_trunc D=title_words_dropped R=title_repeat_in_body S=few_sections H=missing_html C=cap_too_long X=output_too_short L=much_shorter J=low_jaccard")
    print()
    print(f"{'STATUS':6} {'PAPER':40} {'TAGS':15} {'CHARS':>8} {'SECT':>5} {'TABS':>5} TIME")
    print("-" * 100)

    summary = {"PASS": 0, "WARN": 0, "FAIL": 0, "ERROR": 0}
    failures: list[tuple[str, str, dict, list[str]]] = []

    for pdf in pdfs:
        name = pdf.stem
        spike_path = _find_spike_md(name)
        spike_md = spike_path.read_text(encoding="utf-8", errors="ignore") if spike_path else None
        try:
            md, elapsed = _run_render(pdf)
        except Exception as e:
            # Deliberate best-effort boundary: one broken PDF is reported
            # as ERROR and must not abort the rest of the corpus run.
            print(f"{'ERROR':6} {name:40} {type(e).__name__}: {e}")
            summary["ERROR"] += 1
            continue
        # Save before the --only-fails display filter so that
        # --save-renders always dumps every successfully rendered paper,
        # not only the ones that end up being listed.
        if args.save_renders:
            (RENDERS_DIR / f"{name}.md").write_text(md, encoding="utf-8", errors="replace")
        status, m, tags = _classify(name, md, spike_md)
        summary[status] += 1
        if status != "PASS":
            failures.append((name, status, m, tags))
        if args.only_fails and status == "PASS":
            continue
        tag_str = ",".join(tags) or "—"
        print(f"{status:6} {name:40} {tag_str:15} {m['total_chars']:>8} {m['section_count']:>5} {m['table_html_count']:>5} {elapsed:.1f}s")

    print()
    print("# Summary")
    total = sum(summary.values())
    for k in ("PASS", "WARN", "FAIL", "ERROR"):
        if summary[k]:
            print(f" {k:8} {summary[k]:3} / {total}")

    if failures:
        print()
        print("# Failure details")
        for name, status, m, tags in failures:
            tag_str = ",".join(tags)
            print(f"\n {status} {name} [{tag_str}]")
            print(f" title: {repr(m['title'])[:120]}")
            print(f" sections={m['section_count']} tables={m['table_html_count']} chars={m['total_chars']}")
            if m.get("char_ratio_vs_spike") is not None:
                print(f" vs_spike: char_ratio={m['char_ratio_vs_spike']:.2f} jaccard={m['jaccard_vs_spike']:.2f} title_missing_words={m.get('title_missing_words', 0)}")

    return 0 if summary["FAIL"] == 0 and summary["ERROR"] == 0 else 1
285
+
286
+
287
+ if __name__ == "__main__":
288
+ sys.exit(main())