docpluck 2.4.4__tar.gz → 2.4.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (273) hide show
  1. {docpluck-2.4.4 → docpluck-2.4.6}/.claude/skills/docpluck-qa/SKILL.md +75 -0
  2. {docpluck-2.4.4 → docpluck-2.4.6}/CHANGELOG.md +63 -0
  3. {docpluck-2.4.4 → docpluck-2.4.6}/PKG-INFO +1 -1
  4. {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/__init__.py +1 -1
  5. {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/normalize.py +66 -16
  6. {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/render.py +125 -0
  7. docpluck-2.4.6/docs/HANDOFF_2026-05-13_apa_50_expansion.md +360 -0
  8. docpluck-2.4.6/docs/HANDOFF_2026-05-13_apa_50_expansion_iter_1.md +106 -0
  9. docpluck-2.4.6/docs/HANDOFF_2026-05-13_iterative_1.md +112 -0
  10. docpluck-2.4.6/docs/HANDOFF_2026-05-13_iterative_library_improvement.md +235 -0
  11. {docpluck-2.4.4 → docpluck-2.4.6}/pyproject.toml +1 -1
  12. docpluck-2.4.6/scripts/lint_rendered_corpus.py +115 -0
  13. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_normalization.py +131 -0
  14. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_render.py +114 -0
  15. {docpluck-2.4.4 → docpluck-2.4.6}/.claude/skills/_project/lessons.md +0 -0
  16. {docpluck-2.4.4 → docpluck-2.4.6}/.claude/skills/docpluck-cleanup/SKILL.md +0 -0
  17. {docpluck-2.4.4 → docpluck-2.4.6}/.claude/skills/docpluck-deploy/SKILL.md +0 -0
  18. {docpluck-2.4.4 → docpluck-2.4.6}/.claude/skills/docpluck-qa/references/benchmark-mode.md +0 -0
  19. {docpluck-2.4.4 → docpluck-2.4.6}/.claude/skills/docpluck-qa/references/check-11-hard-rules.md +0 -0
  20. {docpluck-2.4.4 → docpluck-2.4.6}/.claude/skills/docpluck-qa/references/check-13-escicheck-production.md +0 -0
  21. {docpluck-2.4.4 → docpluck-2.4.6}/.claude/skills/docpluck-qa/references/check-5-escicheck-library.md +0 -0
  22. {docpluck-2.4.4 → docpluck-2.4.6}/.claude/skills/docpluck-qa/references/check-6-escicheck-local-webapp.md +0 -0
  23. {docpluck-2.4.4 → docpluck-2.4.6}/.claude/skills/docpluck-qa/references/check-7-batch-smoke.md +0 -0
  24. {docpluck-2.4.4 → docpluck-2.4.6}/.claude/skills/docpluck-review/SKILL.md +0 -0
  25. {docpluck-2.4.4 → docpluck-2.4.6}/.github/workflows/publish.yml +0 -0
  26. {docpluck-2.4.4 → docpluck-2.4.6}/.github/workflows/test.yml +0 -0
  27. {docpluck-2.4.4 → docpluck-2.4.6}/.gitignore +0 -0
  28. {docpluck-2.4.4 → docpluck-2.4.6}/CLAUDE.md +0 -0
  29. {docpluck-2.4.4 → docpluck-2.4.6}/HANDOFF_SECTIONS_APP_INTEGRATION.md +0 -0
  30. {docpluck-2.4.4 → docpluck-2.4.6}/LESSONS.md +0 -0
  31. {docpluck-2.4.4 → docpluck-2.4.6}/LICENSE +0 -0
  32. {docpluck-2.4.4 → docpluck-2.4.6}/REPLY_FROM_DOCPLUCK_v1.4.5.md +0 -0
  33. {docpluck-2.4.4 → docpluck-2.4.6}/REPLY_FROM_DOCPLUCK_v1.5.0.md +0 -0
  34. {docpluck-2.4.4 → docpluck-2.4.6}/REQUEST_08_CHUNKING_ENDPOINT.md +0 -0
  35. {docpluck-2.4.4 → docpluck-2.4.6}/REQUEST_09_REFERENCE_LIST_NORMALIZATION.md +0 -0
  36. {docpluck-2.4.4 → docpluck-2.4.6}/TODO.md +0 -0
  37. {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/__main__.py +0 -0
  38. {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/batch.py +0 -0
  39. {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/cli.py +0 -0
  40. {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/extract.py +0 -0
  41. {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/extract_docx.py +0 -0
  42. {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/extract_html.py +0 -0
  43. {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/extract_layout.py +0 -0
  44. {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/extract_structured.py +0 -0
  45. {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/figures/__init__.py +0 -0
  46. {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/figures/detect.py +0 -0
  47. {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/quality.py +0 -0
  48. {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/sections/__init__.py +0 -0
  49. {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/sections/annotators/__init__.py +0 -0
  50. {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/sections/annotators/docx.py +0 -0
  51. {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/sections/annotators/html.py +0 -0
  52. {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/sections/annotators/pdf.py +0 -0
  53. {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/sections/annotators/text.py +0 -0
  54. {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/sections/blocks.py +0 -0
  55. {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/sections/boundaries.py +0 -0
  56. {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/sections/core.py +0 -0
  57. {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/sections/taxonomy.py +0 -0
  58. {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/sections/types.py +0 -0
  59. {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/tables/__init__.py +0 -0
  60. {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/tables/bbox_utils.py +0 -0
  61. {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/tables/camelot_extract.py +0 -0
  62. {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/tables/captions.py +0 -0
  63. {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/tables/cell_cleaning.py +0 -0
  64. {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/tables/cluster.py +0 -0
  65. {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/tables/confidence.py +0 -0
  66. {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/tables/detect.py +0 -0
  67. {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/tables/render.py +0 -0
  68. {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/tables/whitespace.py +0 -0
  69. {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/version.py +0 -0
  70. {docpluck-2.4.4 → docpluck-2.4.6}/docs/BENCHMARKS.md +0 -0
  71. {docpluck-2.4.4 → docpluck-2.4.6}/docs/DESIGN.md +0 -0
  72. {docpluck-2.4.4 → docpluck-2.4.6}/docs/HANDOFF_2026-05-07_sections_strict_iteration.md +0 -0
  73. {docpluck-2.4.4 → docpluck-2.4.6}/docs/HANDOFF_2026-05-09_session_state_and_followups.md +0 -0
  74. {docpluck-2.4.4 → docpluck-2.4.6}/docs/HANDOFF_2026-05-09_unified_extraction_brainstorm.md +0 -0
  75. {docpluck-2.4.4 → docpluck-2.4.6}/docs/HANDOFF_2026-05-10_table_rendering_iteration.md +0 -0
  76. {docpluck-2.4.4 → docpluck-2.4.6}/docs/HANDOFF_2026-05-10_table_rendering_iteration_2.md +0 -0
  77. {docpluck-2.4.4 → docpluck-2.4.6}/docs/HANDOFF_2026-05-10_table_rendering_iteration_3.md +0 -0
  78. {docpluck-2.4.4 → docpluck-2.4.6}/docs/HANDOFF_2026-05-10_table_rendering_iteration_4.md +0 -0
  79. {docpluck-2.4.4 → docpluck-2.4.6}/docs/HANDOFF_2026-05-10_table_rendering_iteration_5.md +0 -0
  80. {docpluck-2.4.4 → docpluck-2.4.6}/docs/HANDOFF_2026-05-10_table_rendering_iteration_6.md +0 -0
  81. {docpluck-2.4.4 → docpluck-2.4.6}/docs/HANDOFF_2026-05-10_table_rendering_iteration_7.md +0 -0
  82. {docpluck-2.4.4 → docpluck-2.4.6}/docs/HANDOFF_2026-05-11_PROMOTE_SPIKE_TO_LIBRARY.md +0 -0
  83. {docpluck-2.4.4 → docpluck-2.4.6}/docs/HANDOFF_2026-05-11_table_rendering_iteration_8.md +0 -0
  84. {docpluck-2.4.4 → docpluck-2.4.6}/docs/HANDOFF_2026-05-11_visual_review_findings.md +0 -0
  85. {docpluck-2.4.4 → docpluck-2.4.6}/docs/HANDOFF_2026-05-12_phase2_101pdf_corpus.md +0 -0
  86. {docpluck-2.4.4 → docpluck-2.4.6}/docs/HANDOFF_2026-05-12_remaining_ui_and_chrome_verification.md +0 -0
  87. {docpluck-2.4.4 → docpluck-2.4.6}/docs/HANDOFF_2026-05-12_visual_verify_results.md +0 -0
  88. {docpluck-2.4.4 → docpluck-2.4.6}/docs/NORMALIZATION.md +0 -0
  89. {docpluck-2.4.4 → docpluck-2.4.6}/docs/README.md +0 -0
  90. {docpluck-2.4.4 → docpluck-2.4.6}/docs/TRIAGE_2026-05-10_corpus_assessment.md +0 -0
  91. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/2026-05-06-section-identification.md +0 -0
  92. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/2026-05-06-table-extraction.md +0 -0
  93. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/2026-05-07-sections-strict-iteration-progress.md +0 -0
  94. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/2026-05-08-unified-extraction-phase-0-splice-spike.md +0 -0
  95. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/sections-deferred-items.md +0 -0
  96. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/sections-issues-backlog.md +0 -0
  97. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/2026-05-07_spot-01_apa.md +0 -0
  98. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/2026-05-07_spot-02_pattern-A-shipped.md +0 -0
  99. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/2026-05-08_spot-final_all-styles.md +0 -0
  100. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/COMPARISON.md +0 -0
  101. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/korbmacher_table1.md +0 -0
  102. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/option-a.py +0 -0
  103. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/ziano_table1.md +0 -0
  104. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_notes_raw.txt +0 -0
  105. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_table1.md +0 -0
  106. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/notes.md +0 -0
  107. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/option-b.py +0 -0
  108. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_notes_raw.txt +0 -0
  109. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_table1.md +0 -0
  110. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/korbmacher_table1.md +0 -0
  111. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/notes.md +0 -0
  112. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/option-c.py +0 -0
  113. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/sample-pdftotext-bbox.html +0 -0
  114. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/ziano_table1.md +0 -0
  115. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/korbmacher_table1.md +0 -0
  116. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/notes.md +0 -0
  117. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/option-d.py +0 -0
  118. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/ziano_table1.md +0 -0
  119. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_2022_kruger_bbox.html +0 -0
  120. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_bbox.html +0 -0
  121. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_table1.md +0 -0
  122. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/option-e.py +0 -0
  123. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/sample-bbox.html +0 -0
  124. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_2021_joep_bbox.html +0 -0
  125. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_bbox.html +0 -0
  126. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_table1.md +0 -0
  127. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/html-fallback-demo.md +0 -0
  128. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.err +0 -0
  129. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.md +0 -0
  130. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.err +0 -0
  131. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.md +0 -0
  132. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.err +0 -0
  133. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.md +0 -0
  134. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.err +0 -0
  135. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.md +0 -0
  136. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.err +0 -0
  137. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.md +0 -0
  138. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.err +0 -0
  139. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.md +0 -0
  140. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.err +0 -0
  141. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.md +0 -0
  142. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.err +0 -0
  143. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.md +0 -0
  144. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.err +0 -0
  145. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.md +0 -0
  146. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.err +0 -0
  147. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.md +0 -0
  148. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.err +0 -0
  149. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.md +0 -0
  150. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.err +0 -0
  151. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.md +0 -0
  152. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.err +0 -0
  153. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.md +0 -0
  154. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.err +0 -0
  155. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.md +0 -0
  156. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.err +0 -0
  157. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.md +0 -0
  158. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.err +0 -0
  159. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.md +0 -0
  160. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.err +0 -0
  161. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.md +0 -0
  162. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.err +0 -0
  163. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.md +0 -0
  164. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.err +0 -0
  165. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.md +0 -0
  166. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.err +0 -0
  167. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.md +0 -0
  168. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.err +0 -0
  169. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.md +0 -0
  170. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.err +0 -0
  171. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.md +0 -0
  172. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.err +0 -0
  173. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.md +0 -0
  174. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.err +0 -0
  175. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.md +0 -0
  176. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.err +0 -0
  177. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.md +0 -0
  178. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.err +0 -0
  179. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.md +0 -0
  180. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/papers.md +0 -0
  181. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/report.md +0 -0
  182. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/splice_spike.py +0 -0
  183. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/test_splice_spike.py +0 -0
  184. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/specs/2026-04-27-request-09-reference-normalization-design.md +0 -0
  185. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/specs/2026-05-06-section-identification-design.md +0 -0
  186. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/specs/2026-05-06-table-extraction-design.md +0 -0
  187. {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/specs/2026-05-08-unified-extraction-design.md +0 -0
  188. {docpluck-2.4.4 → docpluck-2.4.6}/scripts/verify_corpus.py +0 -0
  189. {docpluck-2.4.4 → docpluck-2.4.6}/scripts/verify_corpus_full.py +0 -0
  190. {docpluck-2.4.4 → docpluck-2.4.6}/tests/__init__.py +0 -0
  191. {docpluck-2.4.4 → docpluck-2.4.6}/tests/conftest.py +0 -0
  192. {docpluck-2.4.4 → docpluck-2.4.6}/tests/fixtures/__init__.py +0 -0
  193. {docpluck-2.4.4 → docpluck-2.4.6}/tests/fixtures/sections/__init__.py +0 -0
  194. {docpluck-2.4.4 → docpluck-2.4.6}/tests/fixtures/sections/builders.py +0 -0
  195. {docpluck-2.4.4 → docpluck-2.4.6}/tests/fixtures/structured/.gitkeep +0 -0
  196. {docpluck-2.4.4 → docpluck-2.4.6}/tests/fixtures/structured/MANIFEST.json +0 -0
  197. {docpluck-2.4.4 → docpluck-2.4.6}/tests/fixtures/structured/README.md +0 -0
  198. {docpluck-2.4.4 → docpluck-2.4.6}/tests/golden/sections/apa_multi_study_pdf.json +0 -0
  199. {docpluck-2.4.4 → docpluck-2.4.6}/tests/golden/sections/apa_single_study_pdf.json +0 -0
  200. {docpluck-2.4.4 → docpluck-2.4.6}/tests/golden/sections/html_real_headings.json +0 -0
  201. {docpluck-2.4.4 → docpluck-2.4.6}/tests/snapshots/amj_lattice.txt +0 -0
  202. {docpluck-2.4.4 → docpluck-2.4.6}/tests/snapshots/apa_chan_feldman_lineless.txt +0 -0
  203. {docpluck-2.4.4 → docpluck-2.4.6}/tests/snapshots/apa_chen_jesp_lineless.txt +0 -0
  204. {docpluck-2.4.4 → docpluck-2.4.6}/tests/snapshots/apa_efendic_affect.txt +0 -0
  205. {docpluck-2.4.4 → docpluck-2.4.6}/tests/snapshots/apa_ip_feldman_pspb.txt +0 -0
  206. {docpluck-2.4.4 → docpluck-2.4.6}/tests/snapshots/bmc_lattice.txt +0 -0
  207. {docpluck-2.4.4 → docpluck-2.4.6}/tests/snapshots/ieee_figure_heavy.txt +0 -0
  208. {docpluck-2.4.4 → docpluck-2.4.6}/tests/snapshots/ieee_lattice.txt +0 -0
  209. {docpluck-2.4.4 → docpluck-2.4.6}/tests/snapshots/jama_lattice.txt +0 -0
  210. {docpluck-2.4.4 → docpluck-2.4.6}/tests/snapshots/nat_comms_figure_only.txt +0 -0
  211. {docpluck-2.4.4 → docpluck-2.4.6}/tests/snapshots/nature_minimal_rule.txt +0 -0
  212. {docpluck-2.4.4 → docpluck-2.4.6}/tests/snapshots/scirep_minimal_rule.txt +0 -0
  213. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_bbox_utils.py +0 -0
  214. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_benchmark_docx_html.py +0 -0
  215. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_caption_regex.py +0 -0
  216. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_cli_sections.py +0 -0
  217. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_cli_structured.py +0 -0
  218. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_confidence.py +0 -0
  219. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_corpus_smoke.py +0 -0
  220. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_d5_normalization_audit.py +0 -0
  221. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_edge_cases.py +0 -0
  222. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_extract_docx.py +0 -0
  223. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_extract_filter_sugar.py +0 -0
  224. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_extract_html.py +0 -0
  225. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_extract_layout.py +0 -0
  226. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_extract_pdf_structured.py +0 -0
  227. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_extraction.py +0 -0
  228. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_f0_table_region_aware.py +0 -0
  229. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_figure_detect.py +0 -0
  230. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_fixtures_manifest.py +0 -0
  231. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_lattice_cluster.py +0 -0
  232. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_metaesci_followups.py +0 -0
  233. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_normalize_f0_footnote_strip.py +0 -0
  234. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_normalize_layout_param.py +0 -0
  235. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_normalize_report_layout_fields.py +0 -0
  236. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_normalize_v18_strips.py +0 -0
  237. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_quality.py +0 -0
  238. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_render_html.py +0 -0
  239. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_request_09_reference_normalization.py +0 -0
  240. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_boundaries.py +0 -0
  241. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_boundary_truncation.py +0 -0
  242. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_core_partition.py +0 -0
  243. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_docx_annotator.py +0 -0
  244. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_extract_text.py +0 -0
  245. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_footnote_section.py +0 -0
  246. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_golden.py +0 -0
  247. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_html_annotator.py +0 -0
  248. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_pdf_annotator.py +0 -0
  249. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_public_api.py +0 -0
  250. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_real_corpus.py +0 -0
  251. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_taxonomy.py +0 -0
  252. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_text_annotator.py +0 -0
  253. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_types.py +0 -0
  254. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_unit_corpus.py +0 -0
  255. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_v161_coalesce.py +0 -0
  256. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_v161_subheadings.py +0 -0
  257. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_v161_taxonomy.py +0 -0
  258. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_v161_text_annotator.py +0 -0
  259. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_version.py +0 -0
  260. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_smoke_fixtures.py +0 -0
  261. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_structured_result_type.py +0 -0
  262. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_structured_types.py +0 -0
  263. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_structured_version.py +0 -0
  264. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_table_detect.py +0 -0
  265. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_tables_cell_cleaning.py +0 -0
  266. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_text_mode.py +0 -0
  267. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_v23_1_fixes.py +0 -0
  268. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_v23_bug_fixes.py +0 -0
  269. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_v23_post_corpus.py +0 -0
  270. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_v23_post_corpus_v2.py +0 -0
  271. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_v2_backwards_compat.py +0 -0
  272. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_v2_top_level_exports.py +0 -0
  273. {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_whitespace_cluster.py +0 -0
@@ -250,6 +250,81 @@ python scripts/verify_corpus.py --paper efendic_2022_affect --diff
250
250
  Skips cleanly when the spike `outputs[-new]/` directories aren't on
251
251
  disk (fresh checkouts).
252
252
 
253
+ ### 7c. Visible-Defect Heuristic Linter (v2.4.6+, CRITICAL)
254
+
255
+ `verify_corpus.py` measures char-ratio + Jaccard against a baseline — it is
256
+ **blind to visible defects** that the baseline itself contains. After the
257
+ 2026-05-13 audit (xiao_2021_crsp, maier_2023_collabra) the user identified
258
+ five visible defect classes that the corpus verifier missed entirely:
259
+
260
+ | Defect | Signature regex (on rendered .md, per-line) | Tag |
261
+ |---|---|---|
262
+ | Running header `Q. XIAO ET AL.` style | `^[A-Z]\.(?:\s*[A-Z]\.?)?\s+[A-Z]{2,}\s+ET\s+AL\.?$` | RH |
263
+ | Contact / corresponding-author footer | `^CONTACT\s+[A-Z]\w+(?:\s+[A-Z]\w+)+\s+\S+@` | CT |
264
+ | Prefixed contribution / corresponding footnote | `^[a-c]\s+(?:Contributed\s+equally\|Corresponding\s+Author)\b` | CB |
265
+ | Standalone Dept/University affiliation | `^Department\s+of\s+[A-Z]\w+,\s+University\s+of\s+\w+` | AF |
266
+ | Inline footnote leaked into prose | `^\d+\s+(?:Though\|Note\|See\|We)\s+\w` (per-line, ≤ 200 chars) | FN |
267
+
268
+ Run:
269
+ ```bash
270
+ python scripts/lint_rendered_corpus.py tmp/renders_*/*.md
271
+ ```
272
+
273
+ Any match is a FAIL — the rendered .md contains a defect class that should
274
+ have been stripped upstream. Cite the file + line + tag in the QA report.
275
+
276
+ Note: these patterns target the **rendered output**, not pdftotext. They
277
+ backstop normalize.py + render.py — if a defect leaks past upstream
278
+ filters and into the .md, the linter catches it.
279
+
280
+ ### 7d. AI Inspection of Rendered Output (v2.4.6+, RECOMMENDED)
281
+
282
+ For 2-5 representative papers per render change, dispatch a Claude subagent
283
+ (via Task or Agent tool) that:
284
+
285
+ 1. Reads `tmp/<paper>.md` (the rendered output).
286
+ 2. Reads the source PDF (via Read tool with `pages=1-5` for the first 5 pages).
287
+ 3. Scores each .md section for fidelity:
288
+ - **Text coverage**: any PDF paragraph missing from the .md?
289
+ - **Section boundaries**: does the heading match the content below it?
290
+ - **Mid-prose leaks**: any running-header / footer / footnote text leaked into the body prose?
291
+ - **False headings**: any `## ...` / `### ...` that isn't actually a section?
292
+
293
+ Output a per-paper defect list. **Default papers:** `xiao_2021_crsp`,
294
+ `maier_2023_collabra`, `chan_feldman_2025_cogemo`, `efendic_2022_affect`,
295
+ `ip_feldman_2025_pspb` — these collectively exercise APA stats tables,
296
+ Collabra footnotes, T&F contact-line footers, sequential page numbers,
297
+ and replication-report subsections.
298
+
299
+ This check exists because **char-ratio + Jaccard are blind to "right words
300
+ in wrong order under wrong heading"** (see CLAUDE.md "Iteration discipline").
301
+ Run it after every render change before declaring the iteration done.
302
+
303
+ ### 7e. Text-Coverage Baseline (v2.4.6+, CRITICAL)
304
+
305
+ Catches the silent-text-loss defect class (rendered .md drops a body
306
+ paragraph that was in the PDF):
307
+
308
+ ```bash
309
+ python -c "
310
+ from pathlib import Path
311
+ from docpluck.extract import extract_pdf
312
+ from docpluck.render import render_pdf_to_markdown
313
+ for pdf_path in Path('../PDFextractor/test-pdfs').glob('**/*.pdf'):
314
+     pdf = pdf_path.read_bytes()
315
+     raw, _ = extract_pdf(pdf)
316
+     md = render_pdf_to_markdown(pdf)
317
+     ratio = len(md) / max(len(raw), 1)
318
+     if ratio < 0.85:
319
+         print(f'COVERAGE FAIL {pdf_path.name}: {ratio:.2f}')
320
+     elif ratio > 2.0:
321
+         print(f'COVERAGE WARN {pdf_path.name}: {ratio:.2f} (suspicious bloat)')
322
+ "
323
+ ```
324
+
325
+ **Threshold:** rendered .md length ≥ 0.85 × pdftotext raw length. Below
326
+ that, body content has been dropped somewhere in the pipeline.
327
+
253
328
  ### 8. Service Health Endpoint
254
329
  ```bash
255
330
  curl -s http://localhost:6117/health
@@ -1,5 +1,68 @@
1
1
  # Changelog
2
2
 
3
+ ## [2.4.6] — 2026-05-13
4
+
5
+ Two fixes addressing visible-defect classes the corpus verifier (char-ratio + Jaccard) was blind to. User visual inspection of `xiao_2021_crsp.pdf` and `maier_2023_collabra.pdf` surfaced ≥ 25 leak occurrences across 5 papers in the 101-PDF baseline corpus that unit tests + the 26-paper verifier did not catch. New heuristic linter (`scripts/lint_rendered_corpus.py`) quantifies remaining defects: baseline 25 → 1 after v2.4.6 on the targeted set.
6
+
7
+ ### Fix 1 — Orphan table cell-text suppression
8
+
9
+ 1. **`docpluck/render.py::_suppress_orphan_table_cell_text`** — new post-processor inserted between `_join_multiline_caption_paragraphs` and `_merge_compound_heading_tails`. Detects single-line `Table N. <caption>` paragraphs (plain, not already italicized — the italic `*Table N. ...*` is the v2.4.2 caption-only emission and never has orphan rows) followed by ≥ 3 consecutive paragraphs matching `_is_orphan_cell_paragraph` (≤ 200 chars, no markdown/HTML/list markers, low stopword density, not multi-sentence prose). When detected: italicizes the caption and drops the orphan paragraphs. Conservative: stops at the first non-orphan paragraph.
10
+
11
+ On `chan_feldman_2025_cogemo`: 5 of 9 captions (Tables 3, 4, 5, 6, 7) were plain `Table N.` lines followed by 3–50 lines of orphan cell rows; all now italicized with zero orphan rows.
12
+
13
+ ### Fix 2 — Running-header / contact-block / affiliation line patterns
14
+
15
+ 2. **`docpluck/normalize.py::_PAGE_FOOTER_LINE_PATTERNS`** — four new patterns:
16
+ - `^[A-Z]\.(?:\s*[A-Z]\.?)?\s+[A-Z]{2,}\s+ET\s+AL\.?\s*$` — `Q. XIAO ET AL.` / `Q.M. SMITH ET AL` running headers (all-caps surname required to avoid stripping legitimate `Q. Xiao et al.` references in prose).
17
+ - `^CONTACT\s+[A-Z][\w'’-]+(?:\s+[A-Z][\w'’-]+)+\s+\S+@\S+.*$` — Taylor & Francis (CRSP, etc.) `CONTACT <Name> <email>` page-footer (name tokens may contain apostrophes and hyphens).
18
+ - `^[a-z]\s+(?:Contributed\s+equally|Corresponding\s+Author)\b.*$` — Collabra-style contribution / corresponding-author footnotes prefixed with a single lowercase footnote letter.
19
+ - `^Department\s+of\s+[A-Z][A-Za-z]+(?:\s+and\s+[A-Z][A-Za-z]+)?,\s+University\s+of\s+[A-Z][A-Za-z]+(?:\s+Kong)?,\s+.{2,80}$` — standalone Dept/University affiliation lines (must be standalone — prose mentioning the affiliation mid-sentence stays).
20
+
21
+ On `xiao_2021_crsp`: 18 `Q. XIAO ET AL.` standalone leaks → 0 (one residual is folded inside a figure caption, not at line start). On `maier_2023_collabra`: 3 contact/corresponding leaks → 0.
22
+
23
+ ### New: heuristic linter
24
+
25
+ 3. **`scripts/lint_rendered_corpus.py`** — greps rendered `.md` for 5 leak signatures (RH, CT, CB, AF, FN). Run `python scripts/lint_rendered_corpus.py tmp/renders_v2.4.0/` against the 101-PDF corpus to surface visible defects char-ratio/Jaccard miss. Wired into `docpluck-qa` skill as Check 7c.
26
+
27
+ ### New: QA skill spec updates
28
+
29
+ 4. **`.claude/skills/docpluck-qa/SKILL.md`** — three new checks documented:
30
+ - 7c: Visible-Defect Heuristic Linter (the `lint_rendered_corpus.py` script).
31
+ - 7d: AI Inspection of Rendered Output (Claude subagent compares `.md` paragraph-by-paragraph against source PDF).
32
+ - 7e: Text-Coverage Baseline (asserts `len(rendered.md) ≥ 0.85 × len(pdftotext_raw)` to catch silent text-loss).
33
+
34
+ ### Bumps
35
+
36
+ - `__version__`: `2.4.5` → `2.4.6`. Patch (additive normalize patterns + new render post-processor; no API surface change).
37
+
38
+ ### Tests
39
+
40
+ - 7 new tests in `tests/test_render.py` for `_suppress_orphan_table_cell_text` (drops leaked rows, preserves prose, requires ≥ 3 orphans, skips already-italic caption, stops at next caption, idempotent, no-op when no caption).
41
+ - 7 new tests in `tests/test_normalization.py::TestP0_RunningHeaderFooterPatterns_v246` for the new footer patterns (Q. XIAO ET AL. stripping, two-initials variant, mixed-case preservation, CONTACT footer, prefixed Contributed equally, Dept/University standalone, Dept/University prose preserved).
42
+
43
+ ### Known remaining defects (deferred to next iteration)
44
+
45
+ - `xiao_2021_crsp`: section detector treats mid-paragraph "Experiment" as a heading. Requires context-aware suppression in `sections/taxonomy.py`.
46
+ - `xiao_2021_crsp`: KEYWORDS section boundary not visually separated from Introduction body in render output.
47
+ - `maier_2023_collabra`: subsection headings like "Study 1 Design and Findings" / "Study 3 Design and Findings" remain plain paragraphs — need a subsection-pattern detector in `sections/`.
48
+ - `maier_2023_collabra`: inline footnote leak (`1 Though we note ...`) — F1 footnote post-processing pass needed.
49
+
50
+ ## [2.4.5] — 2026-05-13
51
+
52
+ Continuation of v2.4.3's 4-digit page-number strip. v2.4.3 required the same 4-digit value to recur ≥ 3 times to strip — but continuous-pagination journals (PSPB, Psychological Science) use *sequential* page numbers per page (1174, 1175, 1177, 1179, ...) where each value is different. The v2.4.3 rule missed them entirely.
53
+
54
+ ### Fix
55
+
56
+ 1. **`docpluck/normalize.py::normalize_text` S9** — widened 4-digit page-number strip with a second pattern: when ≥ 3 distinct standalone 4-digit values cluster within a 50-page range AND have mean inter-value gap ≤ 3, treat them all as continuous-pagination page numbers and strip. The conservative gates (max-min spread, mean diff) protect against table-cell values which would have larger spreads and irregular gaps. Verified end-to-end on `efendic_2022_affect.md` — page numbers 1174, 1175, 1177, 1179, 1181, 1183, 1184 now all stripped. `NORMALIZATION_VERSION`: `1.8.2` → `1.8.3`.
57
+
58
+ ### Bumps
59
+
60
+ - `__version__`: `2.4.4` → `2.4.5`. Patch.
61
+
62
+ ### Tests
63
+
64
+ 2 new tests in `tests/test_normalization.py` (sequential page-number stripping, unrelated 4-digit value preservation).
65
+
3
66
  ## [2.4.4] — 2026-05-13
4
67
 
5
68
  Bug fix on v2.4.3's caption-trim feature + extension to a second chart-data signature.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docpluck
3
- Version: 2.4.4
3
+ Version: 2.4.6
4
4
  Summary: PDF, DOCX, and HTML text extraction and normalization for academic papers
5
5
  Project-URL: Homepage, https://github.com/giladfeldman/docpluck
6
6
  Project-URL: Documentation, https://github.com/giladfeldman/docpluck/tree/main/docs
@@ -71,7 +71,7 @@ from .figures import Figure
71
71
  from .extract_structured import TABLE_EXTRACTION_VERSION, StructuredResult, extract_pdf_structured
72
72
  from .render import render_pdf_to_markdown
73
73
 
74
- __version__ = "2.4.4"
74
+ __version__ = "2.4.6"
75
75
  __author__ = "Gilad Feldman"
76
76
  __license__ = "MIT"
77
77
 
@@ -22,7 +22,7 @@ class NormalizationLevel(str, Enum):
22
22
  academic = "academic"
23
23
 
24
24
 
25
- NORMALIZATION_VERSION = "1.8.2"
25
+ NORMALIZATION_VERSION = "1.8.3"
26
26
 
27
27
 
28
28
  # ── Request 9 (Scimeto, 2026-04-27): Reference-list normalization ──────────
@@ -617,6 +617,38 @@ _PAGE_FOOTER_LINE_PATTERNS: list[re.Pattern[str]] = [
617
617
  # Running-header lines with "| <page>" or "<page> Author et al.".
618
618
  re.compile(r"^\S(?:[^|\n]{2,80})\|\s*\d{1,4}\s*$"),
619
619
  re.compile(r"^\d{1,4}\s+[A-ZÀ-ÿ][^\n]{1,60}\s+et al\.?\s*$"),
620
+ # v2.4.6: "Q. XIAO ET AL." style running header (initials + all-caps surname)
621
+ # used by CRSP, JESP, and many other 2-column journals. Accepts:
622
+ # "Q. XIAO ET AL." single initial + surname
623
+ # "Q.M. XIAO ET AL." two initials with internal period
624
+ # "Q. M. XIAO ET AL" two initials with space (no trailing dot)
625
+ # All-caps surname required (lowercase letters appear in regular prose
626
+ # such as a "Q. Xiao et al." citation, which must not be stripped).
627
+ re.compile(
628
+ r"^[A-Z]\.(?:\s*[A-Z]\.?)?\s+[A-Z]{2,}\s+ET\s+AL\.?\s*$"
629
+ ),
630
+ # v2.4.6: contact-line footer used by Taylor & Francis (CRSP, etc.):
631
+ # "CONTACT Gilad Feldman gfeldman@hku.hk; giladfel@gmail.com …"
632
+ # The `CONTACT` keyword + name + email is distinctive enough to anchor
633
+ # safely. Optional trailing affiliation / region tokens.
634
+ re.compile(
635
+ r"^CONTACT\s+[A-Z][\w'’-]+(?:\s+[A-Z][\w'’-]+)+\s+\S+@\S+.*$"
636
+ ),
637
+ # v2.4.6: prefixed author-contribution / corresponding-author footnotes
638
+ # used by Collabra, eLife, PLOS, etc.:
639
+ # "a Contributed equally, joint first author"
640
+ # "b Contributed equally, joint first author"
641
+ # "c Corresponding Author: <name>, <affiliation>"
642
+ re.compile(
643
+ r"^[a-z]\s+(?:Contributed\s+equally|Corresponding\s+Author)\b.*$"
644
+ ),
645
+ # v2.4.6: standalone affiliation lines that recur at the bottom of every
646
+ # page in 2-column journals — "Department of <field>, University of
647
+ # <place>, <region>".
648
+ re.compile(
649
+ r"^Department\s+of\s+[A-Z][A-Za-z]+(?:\s+and\s+[A-Z][A-Za-z]+)?,\s+"
650
+ r"University\s+of\s+[A-Z][A-Za-z]+(?:\s+Kong)?,\s+.{2,80}$"
651
+ ),
620
652
  ]
621
653
 
622
654
 
@@ -1006,27 +1038,45 @@ def normalize_text(
1006
1038
  t = "\n".join(lines)
1007
1039
  # Strip standalone page numbers — 1-3 digit unconditionally.
1008
1040
  t = re.sub(r"^\s*\d{1,3}\s*$", "", t, flags=re.MULTILINE)
1009
- # v2.4.3: 4-digit page numbers (continuous-pagination journals like PSPB
1010
- # where volume runs page numbers into the 1000s). Strip when ALL of:
1011
- # 1. The line is exactly 4 ASCII digits.
1012
- # 2. The value falls in the plausible page-number range 1000–9999
1013
- # (avoids stripping a stray 4-digit year-on-its-own-line).
1014
- # 3. The SAME value recurs ≥3 times in the document (page numbers
1015
- # repeat once per physical page, so this is conservative; a
1016
- # duplicate-by-coincidence table-cell value would need to be the
1017
- # same number 3 times, which is rare).
1018
- # The conservative threshold protects table data where a 4-digit value
1019
- # might legitimately appear on its own line (single-value-per-line
1020
- # column layouts).
1041
+ # v2.4.3/v2.4.5: 4-digit page numbers (continuous-pagination journals like
1042
+ # PSPB where volume runs page numbers into the 1000s, e.g.
1043
+ # ``efendic_2022_affect`` with pages 1174-1185). Two patterns fire:
1044
+ #
1045
+ # (A) RECURRING (v2.4.3) — same value appears ≥3 times. Catches PDFs
1046
+ # where every page repeats the same volume number on its own line
1047
+ # (rare for true page numbers, but happens for volume markers).
1048
+ #
1049
+ # (B) SEQUENTIAL (v2.4.5) — ≥3 distinct standalone 4-digit values in
1050
+ # the doc AND they cluster within a 50-page range (max - min ≤ 50)
1051
+ # AND the average per-page gap is small (mean diff ≤ 3). This is
1052
+ # the canonical continuous-pagination signature: page numbers
1053
+ # monotonically increasing across the article. The conservative
1054
+ # gates protect table cells (where 4-digit values would have
1055
+ # larger spreads and irregular gaps).
1021
1056
  four_digit_counts: dict[str, int] = {}
1022
1057
  for ln in t.split("\n"):
1023
1058
  s = ln.strip()
1024
1059
  if len(s) == 4 and s.isascii() and s.isdigit() and 1000 <= int(s) <= 9999:
1025
1060
  four_digit_counts[s] = four_digit_counts.get(s, 0) + 1
1026
- recurring_4d = {s for s, c in four_digit_counts.items() if c >= 3}
1027
- if recurring_4d:
1061
+
1062
+ # Pattern A: same value recurs ≥3 times.
1063
+ strip_set: set[str] = {s for s, c in four_digit_counts.items() if c >= 3}
1064
+
1065
+ # Pattern B: ≥3 distinct values clustered tightly together.
1066
+ if len(four_digit_counts) >= 3:
1067
+ values = sorted(int(s) for s in four_digit_counts.keys())
1068
+ spread = values[-1] - values[0]
1069
+ if spread <= 50:
1070
+ # Compute mean of consecutive diffs.
1071
+ diffs = [values[i + 1] - values[i] for i in range(len(values) - 1)]
1072
+ mean_diff = sum(diffs) / len(diffs)
1073
+ if mean_diff <= 3.0:
1074
+ # All values in the cluster are page numbers.
1075
+ strip_set.update(str(v) for v in values)
1076
+
1077
+ if strip_set:
1028
1078
  t = "\n".join(
1029
- "" if ln.strip() in recurring_4d else ln
1079
+ "" if ln.strip() in strip_set else ln
1030
1080
  for ln in t.split("\n")
1031
1081
  )
1032
1082
  report._track("S9_header_footer_removal", before, t, "headers_removed")
@@ -379,6 +379,130 @@ def _join_multiline_caption_paragraphs(text: str) -> str:
379
379
  return "".join(paragraphs)
380
380
 
381
381
 
382
+ # ── Section C2: orphan table cell-text suppression ──────────────────────────
383
+
384
+
385
+ _ORPHAN_TABLE_CAPTION_RE = re.compile(
386
+ r"^Table\s+(\d+)[.:]\s+(.{3,}?)$"
387
+ )
388
+ _ORPHAN_CELL_STOPWORDS = (
389
+ " the ", " of ", " and ", " in ", " to ", " for ", " with ", " that ",
390
+ " this ", " was ", " were ", " are ", " is ", " have ", " has ",
391
+ " from ", " on ", " by ", " an ", " a ",
392
+ )
393
+
394
+
395
+ def _is_orphan_cell_paragraph(p: str) -> bool:
396
+ """Return True iff ``p`` looks like a leaked table cell row, not prose.
397
+
398
+ Conservative heuristic, used only inside the table-cell-text suppressor:
399
+ - Total length ≤ 200 chars (cell content with quoted instruction text or
400
+ concatenated column headers can run 100-200 chars on a single pdftotext
401
+ line; longer than that is almost certainly prose).
402
+ - Not a heading, caption, HTML block, or list marker.
403
+ - Stopword-density and sentence-structure check rule out short prose.
404
+ """
405
+ if not p:
406
+ return False
407
+ if len(p) > 200:
408
+ return False
409
+ if p.startswith(("#", "*Table", "*Figure", "<table", "</table", "<thead", "<tbody", "<tr", "<td", "<th", ">")):
410
+ return False
411
+ if re.match(r"^(?:Table|Figure)\s+\d", p):
412
+ return False
413
+ if re.match(r"^[*+\-]\s", p) or re.match(r"^\d+\.\s+\w+", p):
414
+ # Markdown list / numbered list — not a cell row.
415
+ # (Numbered ranks like "1. Degree of apology" inside cells can match,
416
+ # but those are typically inside <td> tags, not standalone paragraphs.)
417
+ return False
418
+ if p.startswith("Note") and (":" in p[:8] or "." in p[:8]):
419
+ return False
420
+ lower = " " + p.lower() + " "
421
+ stopword_hits = sum(lower.count(sw) for sw in _ORPHAN_CELL_STOPWORDS)
422
+ # Above 90 chars, prose density must be very low (cells with quoted
423
+ # instruction text or column-header concatenations have ≤ 3 stopwords).
424
+ if len(p) > 90 and stopword_hits >= 4:
425
+ return False
426
+ if len(p) <= 90 and stopword_hits >= 3:
427
+ return False
428
+ # Multi-sentence content is prose, not a cell row.
429
+ if p.count(". ") >= 2:
430
+ return False
431
+ # Single long sentence ending in `.` (not `."` — cells often end in `"`)
432
+ # is prose.
433
+ if p.endswith(".") and not p.endswith(('."', '.")')) and len(p) > 70 and " " in p:
434
+ return False
435
+ return True
436
+
437
+
438
def _suppress_orphan_table_cell_text(text: str) -> str:
    """Suppress orphan cell-row text leaks after a plain-text Table caption.

    When a table is not registered on a page but its cell content was
    linearized into the section body, the rendered markdown contains::

        Table 5. Comparison of target article versus replication.

        Target article

        Replication

        Study design

        Sample characteristics

    This pass:
      1. Detects single-line ``Table N. <caption>`` paragraphs (plain, not
         already italicized — paragraphs starting with ``*`` are skipped).
      2. Scans forward; if 3+ consecutive non-empty paragraphs match
         :func:`_is_orphan_cell_paragraph`, italicizes the caption and drops
         the orphan paragraphs (and any whitespace-only paragraphs between
         them).

    Conservative: only fires after a ``Table N.`` caption and only when the
    orphan run is at least 3 paragraphs long. The scan stops at the first
    non-orphan paragraph (normal prose, another caption, or a heading).
    Everything outside a suppressed run — including the exact blank-line
    separators — is returned unchanged, so the pass is a byte-for-byte no-op
    when nothing is suppressed.

    Args:
        text: Rendered markdown.

    Returns:
        The markdown with orphan cell rows removed and their captions
        italicized.
    """
    if not text or "Table" not in text:
        return text
    # The capturing group makes re.split keep the separators: even indices
    # hold paragraphs, odd indices the blank-line run that follows each one.
    parts = re.split(r"(\n\n+)", text)
    paragraphs = parts[0::2]
    separators = parts[1::2]  # separators[k] sits between paragraphs k, k+1
    total = len(paragraphs)
    out: list[str] = []
    i = 0
    while i < total:
        para = paragraphs[i]
        para_stripped = para.strip()
        emitted = para
        next_i = i + 1
        # Caption must be a single line (no embedded newlines after strip).
        if (
            para_stripped
            and "\n" not in para_stripped
            and not para_stripped.startswith("*")
            and _ORPHAN_TABLE_CAPTION_RE.match(para_stripped)
        ):
            j = i + 1
            orphan_count = 0
            while j < total:
                candidate = paragraphs[j].strip()
                if candidate:
                    if not _is_orphan_cell_paragraph(candidate):
                        break
                    orphan_count += 1
                # Whitespace-only paragraphs are skipped without counting.
                j += 1
            if orphan_count >= 3:
                # Italicize the caption (the caption-only emission style) and
                # resume at the first paragraph after the orphan run.
                emitted = f"*{para_stripped}*"
                next_i = j
        out.append(emitted)
        if next_i < total:
            # Re-emit the original separator that precedes the next kept
            # paragraph, preserving its exact number of newlines.
            out.append(separators[next_i - 1])
        i = next_i
    return "".join(out)
504
+
505
+
382
506
  # ── Section D: JAMA Key Points sidebar reformat ─────────────────────────────
383
507
 
384
508
 
@@ -1352,6 +1476,7 @@ def render_pdf_to_markdown(
1352
1476
  md = _dedupe_h2_sections(md)
1353
1477
  md = _fix_hyphenated_line_breaks(md)
1354
1478
  md = _join_multiline_caption_paragraphs(md)
1479
+ md = _suppress_orphan_table_cell_text(md)
1355
1480
  md = _merge_compound_heading_tails(md)
1356
1481
  md = _reformat_jama_key_points_box(md)
1357
1482
  md = _promote_numbered_subsection_headings(md)