docpluck 2.4.90__tar.gz → 2.4.91__tar.gz

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Files changed (413)
  1. {docpluck-2.4.90 → docpluck-2.4.91}/.claude/skills/_project/lessons.md +30 -0
  2. {docpluck-2.4.90 → docpluck-2.4.91}/.claude/skills/docpluck-iterate/LEARNINGS.md +14 -0
  3. {docpluck-2.4.90 → docpluck-2.4.91}/.github/workflows/test.yml +3 -0
  4. {docpluck-2.4.90 → docpluck-2.4.91}/CHANGELOG.md +10 -0
  5. {docpluck-2.4.90 → docpluck-2.4.91}/PKG-INFO +6 -6
  6. docpluck-2.4.91/README.md +35 -0
  7. {docpluck-2.4.90 → docpluck-2.4.91}/TODO.md +6 -0
  8. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/__init__.py +1 -1
  9. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/extract.py +33 -8
  10. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/extract_columns.py +0 -290
  11. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/extract_docx.py +13 -1
  12. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/extract_html.py +13 -1
  13. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/extract_structured.py +24 -5
  14. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/render.py +136 -4
  15. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/tables/camelot_extract.py +11 -5
  16. docpluck-2.4.91/docpluck/telemetry.py +30 -0
  17. {docpluck-2.4.90 → docpluck-2.4.91}/docs/BENCHMARKS.md +3 -4
  18. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-06-15_docpluck-iterate-resume.md +24 -7
  19. docpluck-2.4.91/docs/HANDOFF_2026-06-15_rc1-step2-continue.md +82 -0
  20. docpluck-2.4.91/docs/HANDOFF_2026-06-16_docpluck-iterate-resume.md +103 -0
  21. {docpluck-2.4.90 → docpluck-2.4.91}/docs/NORMALIZATION.md +4 -2
  22. {docpluck-2.4.90 → docpluck-2.4.91}/docs/README.md +5 -5
  23. {docpluck-2.4.90 → docpluck-2.4.91}/pyproject.toml +1 -1
  24. docpluck-2.4.91/scripts/check_docs_consistency.py +71 -0
  25. docpluck-2.4.91/tests/test_single_column_subsection_promote_real_pdf.py +164 -0
  26. {docpluck-2.4.90 → docpluck-2.4.91}/.claude/skills/_project/canary.json +0 -0
  27. {docpluck-2.4.90 → docpluck-2.4.91}/.claude/skills/docpluck-cleanup/SKILL.md +0 -0
  28. {docpluck-2.4.90 → docpluck-2.4.91}/.claude/skills/docpluck-deploy/SKILL.md +0 -0
  29. {docpluck-2.4.90 → docpluck-2.4.91}/.claude/skills/docpluck-iterate/SKILL.md +0 -0
  30. {docpluck-2.4.90 → docpluck-2.4.91}/.claude/skills/docpluck-iterate/references/ai-full-doc-verify.md +0 -0
  31. {docpluck-2.4.90 → docpluck-2.4.91}/.claude/skills/docpluck-iterate/references/cycle-report-template.md +0 -0
  32. {docpluck-2.4.90 → docpluck-2.4.91}/.claude/skills/docpluck-iterate/references/local-verification.md +0 -0
  33. {docpluck-2.4.90 → docpluck-2.4.91}/.claude/skills/docpluck-iterate/references/rationalizations.md +0 -0
  34. {docpluck-2.4.90 → docpluck-2.4.91}/.claude/skills/docpluck-iterate/references/real-library-real-pdf.md +0 -0
  35. {docpluck-2.4.90 → docpluck-2.4.91}/.claude/skills/docpluck-iterate/references/release-flow.md +0 -0
  36. {docpluck-2.4.90 → docpluck-2.4.91}/.claude/skills/docpluck-iterate/references/self-improvement.md +0 -0
  37. {docpluck-2.4.90 → docpluck-2.4.91}/.claude/skills/docpluck-iterate/references/subagent-parallelization.md +0 -0
  38. {docpluck-2.4.90 → docpluck-2.4.91}/.claude/skills/docpluck-iterate/references/three-tier-parity.md +0 -0
  39. {docpluck-2.4.90 → docpluck-2.4.91}/.claude/skills/docpluck-qa/SKILL.md +0 -0
  40. {docpluck-2.4.90 → docpluck-2.4.91}/.claude/skills/docpluck-qa/references/benchmark-mode.md +0 -0
  41. {docpluck-2.4.90 → docpluck-2.4.91}/.claude/skills/docpluck-qa/references/check-11-hard-rules.md +0 -0
  42. {docpluck-2.4.90 → docpluck-2.4.91}/.claude/skills/docpluck-qa/references/check-13-escicheck-production.md +0 -0
  43. {docpluck-2.4.90 → docpluck-2.4.91}/.claude/skills/docpluck-qa/references/check-5-escicheck-library.md +0 -0
  44. {docpluck-2.4.90 → docpluck-2.4.91}/.claude/skills/docpluck-qa/references/check-6-escicheck-local-webapp.md +0 -0
  45. {docpluck-2.4.90 → docpluck-2.4.91}/.claude/skills/docpluck-qa/references/check-7-batch-smoke.md +0 -0
  46. {docpluck-2.4.90 → docpluck-2.4.91}/.claude/skills/docpluck-review/SKILL.md +0 -0
  47. {docpluck-2.4.90 → docpluck-2.4.91}/.github/workflows/bump-app-pin.yml +0 -0
  48. {docpluck-2.4.90 → docpluck-2.4.91}/.github/workflows/publish.yml +0 -0
  49. {docpluck-2.4.90 → docpluck-2.4.91}/.gitignore +0 -0
  50. {docpluck-2.4.90 → docpluck-2.4.91}/CLAUDE.md +0 -0
  51. {docpluck-2.4.90 → docpluck-2.4.91}/HANDOFF_SECTIONS_APP_INTEGRATION.md +0 -0
  52. {docpluck-2.4.90 → docpluck-2.4.91}/LESSONS.md +0 -0
  53. {docpluck-2.4.90 → docpluck-2.4.91}/LICENSE +0 -0
  54. {docpluck-2.4.90 → docpluck-2.4.91}/REPLY_FROM_DOCPLUCK_v1.4.5.md +0 -0
  55. {docpluck-2.4.90 → docpluck-2.4.91}/REPLY_FROM_DOCPLUCK_v1.5.0.md +0 -0
  56. {docpluck-2.4.90 → docpluck-2.4.91}/REQUEST_08_CHUNKING_ENDPOINT.md +0 -0
  57. {docpluck-2.4.90 → docpluck-2.4.91}/REQUEST_09_REFERENCE_LIST_NORMALIZATION.md +0 -0
  58. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/__main__.py +0 -0
  59. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/batch.py +0 -0
  60. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/cli.py +0 -0
  61. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/extract_layout.py +0 -0
  62. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/figures/__init__.py +0 -0
  63. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/figures/detect.py +0 -0
  64. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/normalize.py +0 -0
  65. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/quality.py +0 -0
  66. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/sections/__init__.py +0 -0
  67. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/sections/annotators/__init__.py +0 -0
  68. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/sections/annotators/docx.py +0 -0
  69. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/sections/annotators/html.py +0 -0
  70. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/sections/annotators/pdf.py +0 -0
  71. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/sections/annotators/text.py +0 -0
  72. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/sections/blocks.py +0 -0
  73. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/sections/boundaries.py +0 -0
  74. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/sections/core.py +0 -0
  75. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/sections/taxonomy.py +0 -0
  76. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/sections/types.py +0 -0
  77. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/tables/__init__.py +0 -0
  78. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/tables/bbox_utils.py +0 -0
  79. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/tables/captions.py +0 -0
  80. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/tables/cell_cleaning.py +0 -0
  81. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/tables/cluster.py +0 -0
  82. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/tables/confidence.py +0 -0
  83. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/tables/detect.py +0 -0
  84. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/tables/flatten.py +0 -0
  85. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/tables/render.py +0 -0
  86. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/tables/whitespace.py +0 -0
  87. {docpluck-2.4.90 → docpluck-2.4.91}/docpluck/version.py +0 -0
  88. {docpluck-2.4.90 → docpluck-2.4.91}/docs/BENCHMARKS_liteparse_2026-06.md +0 -0
  89. {docpluck-2.4.90 → docpluck-2.4.91}/docs/DESIGN.md +0 -0
  90. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-07_sections_strict_iteration.md +0 -0
  91. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-09_session_state_and_followups.md +0 -0
  92. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-09_unified_extraction_brainstorm.md +0 -0
  93. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-10_table_rendering_iteration.md +0 -0
  94. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-10_table_rendering_iteration_2.md +0 -0
  95. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-10_table_rendering_iteration_3.md +0 -0
  96. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-10_table_rendering_iteration_4.md +0 -0
  97. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-10_table_rendering_iteration_5.md +0 -0
  98. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-10_table_rendering_iteration_6.md +0 -0
  99. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-10_table_rendering_iteration_7.md +0 -0
  100. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-11_PROMOTE_SPIKE_TO_LIBRARY.md +0 -0
  101. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-11_table_rendering_iteration_8.md +0 -0
  102. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-11_visual_review_findings.md +0 -0
  103. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-12_phase2_101pdf_corpus.md +0 -0
  104. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-12_remaining_ui_and_chrome_verification.md +0 -0
  105. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-12_visual_verify_results.md +0 -0
  106. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-13_apa_50_expansion.md +0 -0
  107. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-13_apa_50_expansion_iter_1.md +0 -0
  108. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-13_apa_50_expansion_iter_2.md +0 -0
  109. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-13_iterate_skill_first_use.md +0 -0
  110. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-13_iterative_1.md +0 -0
  111. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-13_iterative_library_improvement.md +0 -0
  112. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-13_table_extraction_next_iteration.md +0 -0
  113. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-14_continue_iterations_v2_4_30_to_15n.md +0 -0
  114. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-14_full_corpus_iteration_v2_4_30.md +0 -0
  115. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-14_iterate_6_cycles_complete.md +0 -0
  116. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-14_iterate_9_cycle_run.md +0 -0
  117. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-14_iterate_resume_4_cycles.md +0 -0
  118. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-14_iterate_v2_4_31_cycle_15n.md +0 -0
  119. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-14_phase_5d_gold_audit_v2_4_29.md +0 -0
  120. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-15_autonomous_apa_first_10h.md +0 -0
  121. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-15_iterate_apa_run_1.md +0 -0
  122. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-16_ai-gold-instructions.md +0 -0
  123. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-16_iterate_apa_run_2.md +0 -0
  124. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-16_iterate_apa_run_3.md +0 -0
  125. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-16_iterate_run_4_final.md +0 -0
  126. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-16_iterate_run_4_fix_and_continue.md +0 -0
  127. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-16_iterate_run_5.md +0 -0
  128. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-16_iterate_run_6.md +0 -0
  129. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-17_iterate_run_7.md +0 -0
  130. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-17_iterate_run_8.md +0 -0
  131. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-17_iterate_run_9.md +0 -0
  132. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-18_iterate_run_9_cont.md +0 -0
  133. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-18_iterate_run_9_cont2.md +0 -0
  134. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-20_iterate_run_9_cont3.md +0 -0
  135. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-22_iterate_run_9_session4_final.md +0 -0
  136. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-22_iterate_run_9_session5_close.md +0 -0
  137. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-25_haiku-orchestration-pretest.md +0 -0
  138. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-05-25_pretest-followups.md +0 -0
  139. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-06-08_iterate_splice-wordintegrity-runningheader.md +0 -0
  140. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-06-08_untested_sweep_v2.4.81.md +0 -0
  141. {docpluck-2.4.90 → docpluck-2.4.91}/docs/HANDOFF_2026-06-13_sciencearena_grobid_liteparse.md +0 -0
  142. {docpluck-2.4.90 → docpluck-2.4.91}/docs/ITERATION_VERIFICATION_LESSONS.md +0 -0
  143. {docpluck-2.4.90 → docpluck-2.4.91}/docs/LIBRARY_APP_SYNC.md +0 -0
  144. {docpluck-2.4.90 → docpluck-2.4.91}/docs/TRIAGE_2026-05-10_corpus_assessment.md +0 -0
  145. {docpluck-2.4.90 → docpluck-2.4.91}/docs/TRIAGE_2026-05-14_phase_5d_gold_audit.md +0 -0
  146. {docpluck-2.4.90 → docpluck-2.4.91}/docs/TRIAGE_2026-06-08_untested_corpus_sweep.md +0 -0
  147. {docpluck-2.4.90 → docpluck-2.4.91}/docs/TRIAGE_2026-06-15_head_v2.4.88_assessment.md +0 -0
  148. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/handoffs/2026-05-22-b1-next-iteration.md +0 -0
  149. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/handoffs/2026-05-22-b2-remaining-halluc-head.md +0 -0
  150. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/handoffs/2026-05-22-b3-b7-structural-defects.md +0 -0
  151. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/handoffs/2026-05-22-residual-after-locally-doable-pass.md +0 -0
  152. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/handoffs/2026-05-23-bundled-residual-cycle-CLOSED.md +0 -0
  153. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/handoffs/2026-05-23-residual-after-iterate-spine-cycles-1-3.md +0 -0
  154. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/handoffs/2026-05-25-canary-audit-architecture-and-cluster-A-B-C-landed.md +0 -0
  155. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/handoffs/2026-05-25-wrapup-r4-cycle.md +0 -0
  156. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/handoffs/2026-05-26-run-11-cluster-A-ter-and-C-bis-landed.md +0 -0
  157. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/handoffs/2026-05-26-text-extraction-defects-from-citationguard-audit.md +0 -0
  158. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/handoffs/2026-06-07-text-extraction-defects-from-citationguard-iterate.md +0 -0
  159. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/handoffs/2026-06-07-v2.4.78-landed-canary-iterate.md +0 -0
  160. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/handoffs/2026-06-07-v2.4.79-findings-1-2-cleared.md +0 -0
  161. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/2026-05-06-section-identification.md +0 -0
  162. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/2026-05-06-table-extraction.md +0 -0
  163. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/2026-05-07-sections-strict-iteration-progress.md +0 -0
  164. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/2026-05-08-unified-extraction-phase-0-splice-spike.md +0 -0
  165. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/2026-05-23-haiku-orchestration-pretest.md +0 -0
  166. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/sections-deferred-items.md +0 -0
  167. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/sections-issues-backlog.md +0 -0
  168. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/2026-05-07_spot-01_apa.md +0 -0
  169. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/2026-05-07_spot-02_pattern-A-shipped.md +0 -0
  170. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/2026-05-08_spot-final_all-styles.md +0 -0
  171. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/experiments/COMPARISON.md +0 -0
  172. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/korbmacher_table1.md +0 -0
  173. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/option-a.py +0 -0
  174. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/ziano_table1.md +0 -0
  175. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_notes_raw.txt +0 -0
  176. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_table1.md +0 -0
  177. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/notes.md +0 -0
  178. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/option-b.py +0 -0
  179. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_notes_raw.txt +0 -0
  180. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_table1.md +0 -0
  181. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/korbmacher_table1.md +0 -0
  182. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/notes.md +0 -0
  183. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/option-c.py +0 -0
  184. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/sample-pdftotext-bbox.html +0 -0
  185. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/ziano_table1.md +0 -0
  186. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/korbmacher_table1.md +0 -0
  187. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/notes.md +0 -0
  188. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/option-d.py +0 -0
  189. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/ziano_table1.md +0 -0
  190. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_2022_kruger_bbox.html +0 -0
  191. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_bbox.html +0 -0
  192. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_table1.md +0 -0
  193. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/option-e.py +0 -0
  194. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/sample-bbox.html +0 -0
  195. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_2021_joep_bbox.html +0 -0
  196. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_bbox.html +0 -0
  197. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_table1.md +0 -0
  198. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/html-fallback-demo.md +0 -0
  199. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.err +0 -0
  200. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.md +0 -0
  201. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.err +0 -0
  202. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.md +0 -0
  203. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.err +0 -0
  204. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.md +0 -0
  205. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.err +0 -0
  206. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.md +0 -0
  207. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.err +0 -0
  208. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.md +0 -0
  209. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.err +0 -0
  210. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.md +0 -0
  211. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.err +0 -0
  212. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.md +0 -0
  213. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.err +0 -0
  214. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.md +0 -0
  215. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.err +0 -0
  216. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.md +0 -0
  217. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.err +0 -0
  218. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.md +0 -0
  219. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.err +0 -0
  220. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.md +0 -0
  221. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.err +0 -0
  222. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.md +0 -0
  223. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.err +0 -0
  224. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.md +0 -0
  225. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.err +0 -0
  226. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.md +0 -0
  227. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.err +0 -0
  228. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.md +0 -0
  229. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.err +0 -0
  230. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.md +0 -0
  231. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.err +0 -0
  232. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.md +0 -0
  233. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.err +0 -0
  234. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.md +0 -0
  235. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.err +0 -0
  236. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.md +0 -0
  237. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.err +0 -0
  238. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.md +0 -0
  239. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.err +0 -0
  240. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.md +0 -0
  241. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.err +0 -0
  242. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.md +0 -0
  243. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.err +0 -0
  244. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.md +0 -0
  245. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.err +0 -0
  246. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.md +0 -0
  247. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.err +0 -0
  248. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.md +0 -0
  249. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.err +0 -0
  250. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.md +0 -0
  251. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/papers.md +0 -0
  252. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/report.md +0 -0
  253. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/splice_spike.py +0 -0
  254. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/plans/spot-checks/splice-spike/test_splice_spike.py +0 -0
  255. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/specs/2026-04-27-request-09-reference-normalization-design.md +0 -0
  256. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/specs/2026-05-06-section-identification-design.md +0 -0
  257. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/specs/2026-05-06-table-extraction-design.md +0 -0
  258. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/specs/2026-05-08-unified-extraction-design.md +0 -0
  259. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/specs/2026-05-23-haiku-orchestration-pretest-design.md +0 -0
  260. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/specs/2026-06-07-ip_feldman-B4-R4-column-interleave-diagnosis.md +0 -0
  261. {docpluck-2.4.90 → docpluck-2.4.91}/docs/superpowers/specs/2026-06-08-rc1-region-aware-column-architecture.md +0 -0
  262. {docpluck-2.4.90 → docpluck-2.4.91}/scripts/__init__.py +0 -0
  263. {docpluck-2.4.90 → docpluck-2.4.91}/scripts/harness/README.md +0 -0
  264. {docpluck-2.4.90 → docpluck-2.4.91}/scripts/harness/VERIFIER_PROMPT.md +0 -0
  265. {docpluck-2.4.90 → docpluck-2.4.91}/scripts/harness/__init__.py +0 -0
  266. {docpluck-2.4.90 → docpluck-2.4.91}/scripts/harness/baseline_matrix.json +0 -0
  267. {docpluck-2.4.90 → docpluck-2.4.91}/scripts/harness/checks.py +0 -0
  268. {docpluck-2.4.90 → docpluck-2.4.91}/scripts/harness/corpus.py +0 -0
  269. {docpluck-2.4.90 → docpluck-2.4.91}/scripts/harness/corpus_manifest.json +0 -0
  270. {docpluck-2.4.90 → docpluck-2.4.91}/scripts/harness/extract.py +0 -0
  271. {docpluck-2.4.90 → docpluck-2.4.91}/scripts/harness/gold_keys.json +0 -0
  272. {docpluck-2.4.90 → docpluck-2.4.91}/scripts/harness/inspect.py +0 -0
  273. {docpluck-2.4.90 → docpluck-2.4.91}/scripts/lint_rendered_corpus.py +0 -0
  274. {docpluck-2.4.90 → docpluck-2.4.91}/scripts/pretest_capture_tokens.py +0 -0
  275. {docpluck-2.4.90 → docpluck-2.4.91}/scripts/verify_corpus.py +0 -0
  276. {docpluck-2.4.90 → docpluck-2.4.91}/scripts/verify_corpus_full.py +0 -0
  277. {docpluck-2.4.90 → docpluck-2.4.91}/tests/__init__.py +0 -0
  278. {docpluck-2.4.90 → docpluck-2.4.91}/tests/conftest.py +0 -0
  279. {docpluck-2.4.90 → docpluck-2.4.91}/tests/fixtures/__init__.py +0 -0
  280. {docpluck-2.4.90 → docpluck-2.4.91}/tests/fixtures/sections/__init__.py +0 -0
  281. {docpluck-2.4.90 → docpluck-2.4.91}/tests/fixtures/sections/builders.py +0 -0
  282. {docpluck-2.4.90 → docpluck-2.4.91}/tests/fixtures/structured/.gitkeep +0 -0
  283. {docpluck-2.4.90 → docpluck-2.4.91}/tests/fixtures/structured/MANIFEST.json +0 -0
  284. {docpluck-2.4.90 → docpluck-2.4.91}/tests/fixtures/structured/README.md +0 -0
  285. {docpluck-2.4.90 → docpluck-2.4.91}/tests/golden/sections/apa_multi_study_pdf.json +0 -0
  286. {docpluck-2.4.90 → docpluck-2.4.91}/tests/golden/sections/apa_single_study_pdf.json +0 -0
  287. {docpluck-2.4.90 → docpluck-2.4.91}/tests/golden/sections/html_real_headings.json +0 -0
  288. {docpluck-2.4.90 → docpluck-2.4.91}/tests/snapshots/amj_lattice.txt +0 -0
  289. {docpluck-2.4.90 → docpluck-2.4.91}/tests/snapshots/apa_chan_feldman_lineless.txt +0 -0
  290. {docpluck-2.4.90 → docpluck-2.4.91}/tests/snapshots/apa_chen_jesp_lineless.txt +0 -0
  291. {docpluck-2.4.90 → docpluck-2.4.91}/tests/snapshots/apa_efendic_affect.txt +0 -0
  292. {docpluck-2.4.90 → docpluck-2.4.91}/tests/snapshots/apa_ip_feldman_pspb.txt +0 -0
  293. {docpluck-2.4.90 → docpluck-2.4.91}/tests/snapshots/bmc_lattice.txt +0 -0
  294. {docpluck-2.4.90 → docpluck-2.4.91}/tests/snapshots/ieee_figure_heavy.txt +0 -0
  295. {docpluck-2.4.90 → docpluck-2.4.91}/tests/snapshots/ieee_lattice.txt +0 -0
  296. {docpluck-2.4.90 → docpluck-2.4.91}/tests/snapshots/jama_lattice.txt +0 -0
  297. {docpluck-2.4.90 → docpluck-2.4.91}/tests/snapshots/nat_comms_figure_only.txt +0 -0
  298. {docpluck-2.4.90 → docpluck-2.4.91}/tests/snapshots/nature_minimal_rule.txt +0 -0
  299. {docpluck-2.4.90 → docpluck-2.4.91}/tests/snapshots/scirep_minimal_rule.txt +0 -0
  300. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_a3c_leading_zero_decimal_real_pdf.py +0 -0
  301. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_a4_ci_period_to_comma.py +0 -0
  302. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_all_caps_section_promote_real_pdf.py +0 -0
  303. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_bbox_utils.py +0 -0
  304. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_benchmark_docx_html.py +0 -0
  305. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_cambridge_footer_strip_real_pdf.py +0 -0
  306. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_camelot_temp_cleanup.py +0 -0
  307. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_canary_provenance.py +0 -0
  308. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_caption_only_table_heading_real_pdf.py +0 -0
  309. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_caption_regex.py +0 -0
  310. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_chart_data_trim_real_pdf.py +0 -0
  311. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_cid_minus_recovery_real_pdf.py +0 -0
  312. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_cli_sections.py +0 -0
  313. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_cli_structured.py +0 -0
  314. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_confidence.py +0 -0
  315. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_corpus_smoke.py +0 -0
  316. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_d5_normalization_audit.py +0 -0
  317. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_dropped_minus_layout_recovery_real_pdf.py +0 -0
  318. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_edge_cases.py +0 -0
  319. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_elsevier_footer_strip_real_pdf.py +0 -0
  320. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_equation_page_header_strip_real_pdf.py +0 -0
  321. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_extract_columns.py +0 -0
  322. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_extract_docx.py +0 -0
  323. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_extract_filter_sugar.py +0 -0
  324. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_extract_html.py +0 -0
  325. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_extract_layout.py +0 -0
  326. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_extract_pdf_structured.py +0 -0
  327. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_extraction.py +0 -0
  328. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_f0_table_region_aware.py +0 -0
  329. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_fffd_comparison_recovery_real_pdf.py +0 -0
  330. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_figure_caption_trim_real_pdf.py +0 -0
  331. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_figure_detect.py +0 -0
  332. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_fixtures_manifest.py +0 -0
  333. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_hallucinated_heading_continuation_guard.py +0 -0
  334. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_harness_text_loss_reflow.py +0 -0
  335. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_harvard_refs_pagebreak_stitch.py +0 -0
  336. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_jama_open_cluster_real_pdf.py +0 -0
  337. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_lattice_cluster.py +0 -0
  338. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_letterspaced_label_real_pdf.py +0 -0
  339. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_ligature_decomposition_real_pdf.py +0 -0
  340. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_lt_operator_recovery_real_pdf.py +0 -0
  341. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_mathitalic_greek_real_pdf.py +0 -0
  342. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_metaesci_followups.py +0 -0
  343. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_minus_sign_recovery_real_pdf.py +0 -0
  344. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_normalization.py +0 -0
  345. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_normalize_a3_r2_body_integer_real_pdf.py +0 -0
  346. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_normalize_f0_footnote_strip.py +0 -0
  347. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_normalize_idempotent_real_pdf.py +0 -0
  348. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_normalize_layout_param.py +0 -0
  349. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_normalize_metadata_leak_real_pdf.py +0 -0
  350. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_normalize_report_layout_fields.py +0 -0
  351. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_normalize_soft_hyphen_dehyphenation.py +0 -0
  352. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_normalize_v18_strips.py +0 -0
  353. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_numbered_heading_promotion_real_pdf.py +0 -0
  354. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_numbered_section_promotion_real_pdf.py +0 -0
  355. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_o5_reference_inversion_real_pdf.py +0 -0
  356. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_orphan_multilevel_number_real_pdf.py +0 -0
  357. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_orphan_section_number_real_pdf.py +0 -0
  358. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_p0r_recurring_running_header_strip.py +0 -0
  359. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_preserve_math_glyphs_real_pdf.py +0 -0
  360. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_pretest_capture_tokens.py +0 -0
  361. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_pua_glyph_recovery_real_pdf.py +0 -0
  362. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_quality.py +0 -0
  363. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_r1_whitespace_cells_wiring_real_pdf.py +0 -0
  364. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_r4_column_correction_real_pdf.py +0 -0
  365. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_rc1_banded_column_real_pdf.py +0 -0
  366. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_rc1_general_column_correction_real_pdf.py +0 -0
  367. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_render.py +0 -0
  368. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_render_frontmatter_masthead.py +0 -0
  369. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_render_html.py +0 -0
  370. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_render_subsection_chain_promotion.py +0 -0
  371. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_request_09_reference_normalization.py +0 -0
  372. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_residual_2026_05_23_bundled.py +0 -0
  373. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_roman_numeral_section_promote_real_pdf.py +0 -0
  374. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_section_row_label_no_merge_real_pdf.py +0 -0
  375. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_sections_boundaries.py +0 -0
  376. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_sections_boundary_truncation.py +0 -0
  377. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_sections_core_partition.py +0 -0
  378. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_sections_docx_annotator.py +0 -0
  379. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_sections_extract_text.py +0 -0
  380. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_sections_footnote_section.py +0 -0
  381. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_sections_golden.py +0 -0
  382. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_sections_html_annotator.py +0 -0
  383. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_sections_pdf_annotator.py +0 -0
  384. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_sections_public_api.py +0 -0
  385. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_sections_real_corpus.py +0 -0
  386. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_sections_taxonomy.py +0 -0
  387. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_sections_text_annotator.py +0 -0
  388. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_sections_types.py +0 -0
  389. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_sections_unit_corpus.py +0 -0
  390. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_sections_v161_coalesce.py +0 -0
  391. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_sections_v161_subheadings.py +0 -0
  392. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_sections_v161_taxonomy.py +0 -0
  393. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_sections_v161_text_annotator.py +0 -0
  394. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_sections_version.py +0 -0
  395. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_smoke_fixtures.py +0 -0
  396. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_structured_result_type.py +0 -0
  397. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_structured_types.py +0 -0
  398. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_structured_version.py +0 -0
  399. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_table_caption_cell_region_real_pdf.py +0 -0
  400. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_table_detect.py +0 -0
  401. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_tables_cell_cleaning.py +0 -0
  402. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_tables_flatten.py +0 -0
  403. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_text_mode.py +0 -0
  404. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_v23_1_fixes.py +0 -0
  405. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_v23_bug_fixes.py +0 -0
  406. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_v23_post_corpus.py +0 -0
  407. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_v23_post_corpus_v2.py +0 -0
  408. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_v2_backwards_compat.py +0 -0
  409. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_v2_top_level_exports.py +0 -0
  410. {docpluck-2.4.90 → docpluck-2.4.91}/tests/test_whitespace_cluster.py +0 -0
  411. {docpluck-2.4.90 → docpluck-2.4.91}/tools/canary_provenance.py +0 -0
  412. {docpluck-2.4.90 → docpluck-2.4.91}/tools/fix_python_env.ps1 +0 -0
  413. {docpluck-2.4.90 → docpluck-2.4.91}/tools/render_for_audit.py +0 -0
@@ -576,6 +576,12 @@ Rotation picks `pool[(N mod L) : (N mod L) + rotation_size]` wrapping. Over `cei
576
576
 
577
577
  **How to diagnose:** 3-way check — pdftotext vs pdfplumber vs a visual/AI read of the PDF. When both deterministic extractors agree on the wrong value and only the visual disagrees, the codepoint is baked wrong and NO text-channel logic can fix it (recovery needs OCR / multimodal-glyph-consensus, a new subsystem). User decision 2026-06-08: **document as a known limitation, do not scope an OCR subsystem.** Consumer guidance: downstream stat-checkers (CitationGuard) must cross-verify digits against CrossRef/visual — docpluck cannot guarantee a digit matches the visual glyph when the publisher baked the wrong codepoint.
578
578
 
579
+ ## RC-1 Step 2 — per-band column re-extraction; the word-preservation guard is the safety (2026-06-15, v2.4.90)
580
+
581
+ **What:** the dominant two-column-interleave defect (Method/Results/Discussion scrambled) on table-bearing pages that Step 1 (`extract_page_text_columns`, whole-page) cannot reach — its bilateral y-row gate + full-height gutter strip reject any page with a full-width band crossing the centre (confirmed: `DOCPLUCK_COLUMN_CORRECT_GENERAL=1` was byte-near-identical on the failing papers). **Step 2** (`extract_page_text_banded`, `docpluck/extract_columns.py`) segments a page into horizontal y-bands, column-corrects prose bands, keeps full-width (table/banner/title) bands intact; applied as a **fallback inside `splice_column_corrected_pages`** only when whole-page returns "", under the SAME word-preservation guard. Ship-dark behind `DOCPLUCK_COLUMN_CORRECT_BANDED` (default OFF → flag-OFF byte-identical, 26/26 baseline unchanged). AI-verified ON_BETTER on chan_feldman + chandrashekar (0 text-loss/halluc/regression).
582
+
583
+ **The load-bearing lesson:** the word-preservation guard (substantial-word multiset of the re-extraction == original page) makes ANY segmentation heuristic SAFE — a bad reorder is rejected, the page kept as-is — so optimize segmentation for COVERAGE, not for never-being-wrong. Validate with a corpus word-multiset scan (flag-OFF vs flag-ON whole-doc multiset MUST be identical, `lost=0 gained=0`) BEFORE AI-verify. Three hazards the guard caught: (1) full-width title lines column-split mid-word → a row is 2-col only if the strip `[gx±4pt]` is glyph-free, not merely "no word spans gx"; (2) band cuts bisecting tall title glyphs → merge vertically-overlapping bands to full-width; (3) per-row both-sides is conservative but halves guard-rejections vs gutter-clear-only on hard pages (6 vs 12 of 71) — keep it, layer banded as a fallback so clean 2-col pages use the proven whole-page path. Remaining before default-flip: band-cut clips (6/71, guard-rejected), title+sidebar pages — see `docs/superpowers/specs/2026-06-08-rc1-region-aware-column-architecture.md` "Step 2 — remaining work". Shared card: `band-reextraction-lean-on-word-preservation-guard`.
584
+
579
585
  ## Dropped-glyph recovery splits into layout-recoverable vs pixel-only — probe per-instance before designing (2026-06-15, v2.4.89 W0h)
580
586
 
581
587
  **What:** A glyph that **pdftotext drops entirely** (emits nothing) is NOT one class but two, and a per-*instance* 3-way diff tells them apart:
@@ -585,3 +591,27 @@ Rotation picks `pool[(N mod L) : (N mod L) + rotation_size]` wrapping. Over `cei
585
591
  **Trap:** "the layout channel recovers what pdftotext drops" is true only for sub-case 1. Probe the SPECIFIC failing instances (geometry: is there a char/line/rect immediately left of the number?) before assuming a layout fix is complete — feasibility here was 3/4, and a recovery that silently fixes 3 of 4 sign-flips is a product call (false-confidence), surfaced to the user.
586
592
 
587
593
  **Plumbing gotcha (cost a detour):** the section/render path calls `normalize_text` WITHOUT `layout=` (`sections/__init__.py`), so F0 and every layout-gated pass is OFF there by design (text-channel-only contract). A layout-aware fix must thread a **dedicated** param (`dropped_minus_layout`, `render → extract_sections → normalize_text`) — reusing the `layout=` gate would also switch F0 on and risk broad regressions. The detector must cluster chars into lines by **y-overlap**, never `round(top)` (a minus sits ~0.4pt off its digits' baseline and rounds into a different bucket, orphaning it).
594
+
595
+ ## On resume, `git status` BEFORE editing source — a concurrent session can co-edit the same files (2026-06-16)
596
+
597
+ **What:** A `/docpluck-iterate` resume opened on the RC-1 Step 2 handoff. The system-prompt git snapshot said "(clean)", so I went straight to implementing Step 2 (`extract_page_text_bands` + helpers in `extract_columns.py`). But a SECOND Claude session was concurrently implementing the SAME feature (`extract_page_text_banded` + splice/extract.py wiring + `DOCPLUCK_COLUMN_CORRECT_BANDED` flag). Two `Edit`/`Write` "File has been modified since read" events fired on files I hadn't changed; `extract.py` got an mtime I never wrote; 18 `claude.exe` procs were live. The other session committed first (`git add` swept MY uncommitted duplicate into the **v2.4.90 release commit `1325d14`** → orphaned dead code shipped to the tag + prod, inert/uncalled but wrong). Resolution: confirmed the duplicate was uncalled (distinct names, no shadowing), removed my whole block, 69 column-path tests green, committed the removal.
598
+
599
+ **Lessons:** (1) **The system-prompt git snapshot is from conversation START and goes stale within the session** — on any resume, run a fresh `git status` + `git log --oneline -5` BEFORE the first source edit. (2) **"File has been modified since read" on a file YOU didn't touch = STOP and check for a concurrent editor** (`git diff HEAD`, file mtimes, `tasklist | grep claude`), don't just re-read-and-retry. (3) When two agents share a working tree, whoever commits first sweeps the other's uncommitted changes into THEIR commit — so a concurrent edit is not just a merge risk, it can silently ship your half-done work in someone else's release. (4) When you discover the collision, **surface it to the user** (they know if two sessions are intentional) rather than racing or unilaterally reverting (reverting is itself a write that can clobber the other session's in-flight work).
600
+
601
+ ## Text-channel heading promotion can't use line-WIDTH to gate — earlier render steps join wrapped lines (2026-06-17)
602
+
603
+ **What:** Resuming docpluck-iterate on the cycle-2 canary FAILs, user chose "headings first." Root-caused the JESP/Elsevier single-column subsection demotion: labels like "Overview", "Practice instructions", "Self-control assessment" are emitted by pdftotext on their own line with NO blank padding on EITHER side (glued between the prior subsection's body and their own body), so every existing promoter — all of which require `blank_before AND blank_after` — skips them. A minimal relaxation of the `blank_before` gate in `_promote_isolated_titlecase_subsection_headings` (admit no-blank-before when the prior line is a sentence-terminated PROSE line, i.e. a clean paragraph boundary, not a mid-sentence column-wrap) **fixes ar_apa perfectly** (+`### Overview` / `### Practice instructions` / `### Self-control assessment`, 0 removed). BUT it over-promotes **5 two-column table-cell / measures-list labels on ip_feldman** (`### Others ratings`, `### Address order effects`, `### Prevalence Estimation Error: …`) — the G5d hallucinated-heading blocker. Reverted; no release.
604
+
605
+ **The trap (durable):** The obvious discriminator — "real single-column heading is followed by full-page-width body (~60-90 char lines); a narrow two-column table cell is surrounded by ~30-char lines" — **WORKS on RAW pdftotext text but is USELESS at the promoter**, because earlier render-pipeline steps JOIN wrapped lines into long paragraphs before `_promote_isolated_titlecase_subsection_headings` runs. Measured: "Others ratings" body max line width is **36 chars raw → 112 chars in-pipeline**; "Address order effects" 32 → 80. A body-width gate computed inside the promoter therefore admits everything and does nothing.
606
+
607
+ **Lessons:** (1) Any heading promote/demote heuristic that needs LINE-WIDTH or LINE-WRAP structure must be computed from the **raw pdftotext output (before line-joining)** and threaded into the promoter as a precomputed signal (e.g. a document-level `is_single_column` flag from median raw body-line width ≥ ~62, or per-region width), NOT recomputed inside the render-pipeline promoter where the signal is already destroyed. (2) The safe general fix for this defect class is to **scope the no-blank-padding relaxation to single-column documents** (computed from raw text or the layout channel) — two-column subsection headings are already handled by the blank-isolation / chain paths, so single-column-gating both fixes JESP/Elsevier AND prevents the two-column table-cell over-promotion. (3) Always verify a promotion change against the **G5d canary (ip_feldman) with a deterministic heading-count delta** before trusting it — `diff <(grep -E '^#{1,4} ' before) <(grep -E '^#{1,4} ' after)` instantly shows added/removed headings per paper; ar_apa gaining exactly 3 and ip_feldman gaining 5 was the whole story in one command.
608
+
609
+ ## Single-column gate via raw-text wide-line fraction RESOLVES the JESP glued-heading blocker — and AI-verify catches a new single-column false promotion (2026-06-17 v2.4.91)
610
+
611
+ **What:** Implemented the precisely-scoped next step from the (same-day) blocked entry above. `_raw_text_is_single_column(raw_text)` = fraction of non-blank RAW-pdftotext lines wider than 65 chars ≥ 0.25, threaded into `_promote_isolated_titlecase_subsection_headings(text, *, is_single_column)`; the hard `blank_before` reject is relaxed ONLY when single-column AND `_prev_paragraph_is_sentence_terminated`. Result: `ar_apa_j_jesp_2009_12_011` gains exactly `### Overview` / `### Practice instructions` / `### Self-control assessment` (AI-verified vs the article-finder reading gold: `new_headings_are_real=true`), two-column `ip_feldman_2025_pspb` byte-identical (G5d trap avoided), 26-baseline 26/26.
612
+
613
+ **Why `frac>65` and NOT median or interleave-pages:** The corpus measurement (all 26 baseline papers) showed median misclassifies single-column table-heavy papers (plos_med median 48 but genuinely single-column), and the column-interleave page count is useless — the genuine single-column target ar_apa has 4/5 pages flagged by `_detect_column_interleave_pages` (short reference/table lines trip it). `frac>65` is the physical invariant: a two-column layout *cannot* emit many >65-char lines (each column wraps at ~30-48), so it stays 0.06–0.24; single-column body prose wraps full-width → 0.28–0.58. The gap (0.235→0.280) is clean and corpus-wide.
614
+
615
+ **The AI-verify catch (why Phase-5d is non-negotiable):** the single-column relaxation ALSO promoted `### Anesthesiologists; CI, confidence interval; DSMB,` on plos_med_1 — an abbreviation-glossary line, a NEW hallucinated heading (absent at HEAD, confirmed by a stash+render heading-delta). A deterministic heading-delta scan of the 15 single-column papers had NOT flagged it as wrong (it showed up only as a +1 heading-count delta, which looks the same as a genuine new heading); only the Sonnet AI-verify against the gold identified it as a HALLUCINATION. Fix: extend `_is_single_col_relaxation_fragment` to reject any candidate containing an internal `;` or ending in `,` (a heading never does; an abbreviation/clause list always does). After the guard, plos_med net +0, all legitimate headings retained.
616
+
617
+ **How to detect next time:** (1) For any layout-dependent render heuristic, measure the discriminator across the WHOLE baseline corpus before fixing the threshold — don't tune to make one paper pass (median would have). (2) A heading-count *delta* is necessary but NOT sufficient evidence a promotion is correct — a +1 delta can be a hallucination; the AI-gold verify is what distinguishes a real subsection from a promoted glossary/table-cell line. Always AI-verify every single-column paper whose heading count changes, not just the target. (3) Heading false-positive shapes recur per call-site: bracket furniture, leading-preposition wraps, dangling-connector tails, and `;`/trailing-`,` list fragments are the standing reject set for any *relaxed* promoter.
@@ -1198,3 +1198,17 @@ aren't skipped.
1198
1198
 
1199
1199
  ### Process notes
1200
1200
  - One AI-verify cycle surfaced THREE pre-existing defect classes beyond the target: RC-1 two-column interleave (ip_feldman/chandrashekar/chan_feldman/ar_apa-table), B1 table-completeness (plos Tables 2/3/4/5 lose rows/cols/bodies), and metadata-leak (plos affiliations/abbrev/running-headers — an RC-2 residual). Per 0e-bis the run's standing verdict stays FAIL; cycle 1 is an incremental ship, not a clean PASS. Surfaced to user as the run punch-list.
1201
+
1202
+ ---
1203
+
1204
+ ## Run: 2026-06-15 · cycle 2 (resume) · RC-1 Step 2 shipped v2.4.90 (ship-dark)
1205
+
1206
+ ### Outcome
1207
+ - Resumed the open run (B7 v2.4.89 already committed by a concurrent session-instance; independently re-verified it: test 5/5, suite 547 passed, baseline 26/26 — no work lost). Implemented **RC-1 Step 2** (`extract_page_text_banded`): per-band region-aware two-column re-extraction, the architectural fix for THE dominant defect. Ship-dark behind `DOCPLUCK_COLUMN_CORRECT_BANDED` (default OFF; flag-OFF byte-identical, 26/26 baseline unchanged). AI-verified ON_BETTER on chan_feldman + chandrashekar vs article-finder golds (0 text-loss/halluc/regression). Committed `1325d14` + local tag v2.4.90, deploy HELD.
1208
+
1209
+ ### Blind spots / learnings
1210
+ - **The word-preservation guard is the load-bearing safety, not the heuristic.** It rejects any non-pure-reorder, so band segmentation can be optimized for COVERAGE not correctness. Corpus word-multiset scan (flag-OFF vs flag-ON identical) is the fast pre-AI-verify gate. Three hazards it caught: full-width-title column-split (fix: gutter-strip-clear row test, not "no word spans gx"), band-cut glyph bisection (fix: merge overlapping bands to full-width), per-row-both-sides vs gutter-clear coverage/rejection trade. Durable: shared card `band-reextraction-lean-on-word-preservation-guard` + project lessons.md.
1211
+ - **Concurrency hazard observed:** a second session-instance committed B7 to the SAME working tree mid-run (foreign pytest procs + a commit at 14:59 UTC during my session). Reconciled via git log + run-meta (it had run its postflight and then paused as PARTIAL-PAUSED). Lesson: on resume, check `git log origin/main..HEAD` + run-meta `completed_at`/`run_closeout` BEFORE assuming the working tree is yours; a clean `git add` that stages nothing means someone already committed it.
1212
+
1213
+ ### Process notes
1214
+ - cycle-2 iterate-gate = FAIL (I2 partial canary coverage: only chan+chandra AI-verified of the canary/target set; I3 tables remain; I10 rendered_sha is the flag-ON render not the gate's flag-OFF canary artifact). Honest incremental-cycle FAIL per 0e-bis — run stays OPEN/PARTIAL, NOT closed. Remaining punch-list in the handoff + spec "Step 2 — remaining work".
@@ -36,3 +36,6 @@ jobs:
36
36
 
37
37
  - name: Run tests
38
38
  run: pytest tests/ -v --tb=short
39
+
40
+ - name: Check docs and metadata consistency
41
+ run: python scripts/check_docs_consistency.py
@@ -1,5 +1,15 @@
1
1
  # Changelog
2
2
 
3
+ ## [2.4.91] — 2026-06-17
4
+
5
+ **Single-column subsection-heading promotion — recover glued JESP/Elsevier subsection headings without re-opening the two-column G5d trap.** Render-layer only; no `NORMALIZATION_VERSION` / `SECTIONING_VERSION` change.
6
+
7
+ Surfaced by `/docpluck-iterate` Phase-5d (canary `ar_apa_j_jesp_2009_12_011`). Single-column Elsevier/JESP papers emit subsection headings ("Overview", "Practice instructions", "Self-control assessment") on their own line with **no blank padding on either side** — glued directly between the prior subsection's sentence-terminated body and their own body. Every existing promoter requires `blank_before AND blank_after` (or the PSPB no-blank-after relaxation, which still requires `blank_before`), so these stayed demoted to body text.
8
+
9
+ `_promote_isolated_titlecase_subsection_headings` now admits a no-blank-before candidate **only when the document is single-column** AND the immediately-preceding line is a sentence-terminated prose line. The single-column signal (`_raw_text_is_single_column`) is the fraction of raw-pdftotext non-blank lines wider than 65 chars (≥ 0.25) — a structural typographic invariant computed from the **raw** text *before* the render pipeline joins column-wrapped lines and destroys the line-width signal (corpus separation: two-column 0.06–0.24, single-column 0.28–0.58; threshold sits in the natural gap). Two-column layouts keep the hard `blank_before` reject, where the identical shape is a narrow table-cell / measures-list label (the G5d hallucinated-heading trap). A fragment guard (`_is_single_col_relaxation_fragment`) additionally rejects bracket furniture, leading-preposition sentence-wraps, dangling-connector tails, and **abbreviation-glossary / clause lists** (internal `;` or trailing `,`).
10
+
11
+ **Validation (2026-06-17).** Deterministic heading-delta across the corpus: `ar_apa_j_jesp_2009_12_011` **+3** genuine headings (AI-verified vs the article-finder reading gold: `new_headings_are_real=true`, zero hallucination), `ar_apa_…_010` +9, `jmf_1` +11, `demography_1` +4, `bjps_1` +3, `chen_2021_jesp` +3; two-column papers (`ip_feldman_2025_pspb` byte-identical, `chan_feldman`, `chandrashekar`) **+0**; `plos_med_1` net +0 (one abbreviation-glossary false promotion caught by the fragment guard). 26-paper baseline 26/26 (one transient pdftotext timeout, PASS in isolation); full suite 1733 passed. New regression `tests/test_single_column_subsection_promote_real_pdf.py` (23 cases). The canary corpus still FAILs on **pre-existing** table row/column loss and RC-1 column-interleave (separate root causes, queued) — this release is an incremental heading-fidelity improvement, not a corpus-clean claim.
12
+
3
13
  ## [2.4.90] — 2026-06-15
4
14
 
5
15
  **RC-1 Step 2 — per-band region-aware two-column re-extraction (ship-dark behind `DOCPLUCK_COLUMN_CORRECT_BANDED`, default OFF).** No `NORMALIZATION_VERSION` change — the default path is byte-identical; the flag only adds reading-order corrections upstream of normalize.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docpluck
3
- Version: 2.4.90
3
+ Version: 2.4.91
4
4
  Summary: PDF, DOCX, and HTML text extraction and normalization for academic papers
5
5
  Project-URL: Homepage, https://docpluck.app
6
6
  Project-URL: Documentation, https://docpluck.app/api-docs
@@ -57,7 +57,7 @@ Supports three input formats:
57
57
  - **DOCX** via `mammoth` (DOCX → HTML → text, preserving Shift+Enter soft breaks)
58
58
  - **HTML** via `beautifulsoup4` + `lxml` (block/inline-aware tree-walk)
59
59
 
60
- All three formats feed into the same 15-step normalization pipeline and quality scoring.
60
+ All three formats feed into the same normalization pipeline and quality scoring.
61
61
 
62
62
  ---
63
63
 
@@ -298,8 +298,8 @@ Apply the normalization pipeline at the specified level.
298
298
  | Level | Steps | Use when |
299
299
  |-------|-------|----------|
300
300
  | `none` | — | You want raw text, no modifications |
301
- | `standard` | S0-S9 | General text processing (NLP, search indexing) |
302
- | `academic` | S0-S9 + A1-A6 | Statistical pattern matching, meta-analysis |
301
+ | `standard` | Core cleanup (`S*`) + document-shape cleanup (`F0/H0/T0/P0/P1/W0`) + recovery/ref joins (`R2/R3/A7`) | General text processing (NLP, search indexing) |
302
+ | `academic` | `standard` + statistical repairs (`A*` and `W0*`) | Statistical pattern matching, meta-analysis |
303
303
 
304
304
  ```python
305
305
  from docpluck import normalize_text, NormalizationLevel
@@ -313,7 +313,7 @@ text, report = normalize_text(raw, NormalizationLevel.standard)
313
313
  # Full statistical repair (recommended for academic PDFs)
314
314
  text, report = normalize_text(raw, NormalizationLevel.academic)
315
315
 
316
- print(report.version) # "1.1.0"
316
+ print(report.version) # e.g., "1.9.35"
317
317
  print(report.steps_applied) # ["S0_smp_to_ascii", "S1_encoding_validation", ...]
318
318
  print(report.changes_made) # {"ligatures_expanded": 27, "dashes_normalized": 3, ...}
319
319
  ```
@@ -323,7 +323,7 @@ print(report.changes_made) # {"ligatures_expanded": 27, "dashes_normalized":
323
323
  | Field | Type | Description |
324
324
  |-------|------|-------------|
325
325
  | `level` | `str` | Level used: `"none"`, `"standard"`, or `"academic"` |
326
- | `version` | `str` | Pipeline version (e.g. `"1.1.0"`) |
326
+ | `version` | `str` | Pipeline version (e.g. `"1.9.35"`) |
327
327
  | `steps_applied` | `list[str]` | Step codes in order (e.g. `["S1_encoding_validation", "S3_ligature_expansion"]`) |
328
328
  | `changes_made` | `dict[str, int]` | Character-level change counts per step |
329
329
 
@@ -0,0 +1,35 @@
1
+ # docpluck
2
+
3
+ PDF, DOCX, and HTML text extraction plus normalization for academic papers.
4
+
5
+ The full documentation lives in `docs/README.md`.
6
+
7
+ ## Quick install
8
+
9
+ ```bash
10
+ pip install docpluck
11
+ ```
12
+
13
+ Optional extras:
14
+
15
+ ```bash
16
+ pip install docpluck[docx]
17
+ pip install docpluck[html]
18
+ pip install docpluck[all]
19
+ ```
20
+
21
+ ## System requirement for PDF extraction
22
+
23
+ `extract_pdf()` requires the `pdftotext` binary from Poppler.
24
+
25
+ - Linux/WSL: `apt-get install poppler-utils`
26
+ - macOS: `brew install poppler`
27
+ - Windows: install Poppler and add its `bin` folder to `PATH`
28
+
29
+ ## Links
30
+
31
+ - Full usage and API reference: `docs/README.md`
32
+ - Normalization pipeline details: `docs/NORMALIZATION.md`
33
+ - Benchmarks: `docs/BENCHMARKS.md`
34
+ - Design notes: `docs/DESIGN.md`
35
+
@@ -2,6 +2,11 @@
2
2
 
3
3
  This file tracks future-aim items that are scoped out of the current milestone but should not be lost. See `docs/superpowers/specs/` for active specs.
4
4
 
5
+ ## 2026-06-16 — deferred for investigation before code changes
6
+
7
+ - [ ] **Investigate `sections=` extraction de-dup (no behavior change yet).** `extract_pdf(..., sections=...)`, `extract_docx(..., sections=...)`, and `extract_html(..., sections=...)` currently do one extraction pass and then call `extract_sections(...)`, which can re-run extraction/annotation internally by design. Before optimizing, document invariants proving parity with direct `extract_sections(...)` outputs, then run corpus/harness verification to confirm zero regressions. No implementation change until those proofs are in place.
8
+ - [ ] **Investigate adding first-class diagnostics to structured outputs.** Current fallback visibility is via `method` suffixes and telemetry counters. Evaluate a stable `diagnostics` field on structured/text outputs (schema, backwards-compat guarantees, and consumer impact) before landing any API contract change.
9
+
5
10
  ## 2026-06-13 — v2.4.86/87/88 landed (ScienceArena GROBID/liteparse re-audit; PUSHED to origin/main e8b275d, NOT tagged)
6
11
 
7
12
  > ✅ **Push status:** committed AND pushed to `origin/main` as `e8b275d` (`8bfcdba..e8b275d`). The earlier "push hangs" were NOT a network issue — they were the **pre-push canary hook** running the slow full 5-paper audit, which kept getting killed by short timeouts. Pushed with `SKIP_CANARY=1` (justified: the canary's render subprocess was broken by the Python-env issue below, so its verdict was invalid; the changes were independently verified — 1896-test baseline green, deterministic ip_feldman render-diff identical on headings, camelot table fix confirmed on efendic/xiao/maier). NOT tagged.
@@ -183,6 +188,7 @@ Add only when a real downstream consumer asks for one. YAGNI until then.
183
188
  ### Deferred from this session (surfaced, not hacked)
184
189
 
185
190
  - [ ] **`### Reasons for change`** (ip_feldman) — Table 5 column header promoted to heading; needs table-region awareness (the body-coherence guard doesn't catch it because its body starts capitalized). RCA: rank-3 in the 2026-06-06 run-11 RCA.
191
+ - [ ] **Canary finding-key case-norm false-positive** (tooling) — the strict tag-push canary re-flags pre-existing backlog findings as "NEW" when only the leading case differs (`we`→`We`, `extensions`→`Extensions`), forcing `SKIP_CANARY=1` on every release while the deferred backlog is open (it did so for v2.4.90, 2026-06-15). Lowercase-normalize the finding key (TODO ~line 165 in `~/.claude/skills/_shared/iterate-loop/canary-audit.sh` per memory `feedback_canary_gate_nondeterministic`) so release tags pass cleanly when there are no real regressions.
186
192
  - [ ] **`## Data Availability` end-matter absent** — RCA CORRECTED the run-11 "demoter over-strip" premise: the section never enters the text channel (pdftotext drops the title-page box). Needs cross-channel (pdfplumber) recovery, same architecture class as B7. NOT a demoter exception.
187
193
  - [ ] **Glyph `Västfjäll`→`Vastfall`** (ar_apa/collabra, citationguard Defect 2) — baked pdftotext CID-font mis-map; needs a same-document surname-consensus normalizer (new subsystem). **Product/architecture decision on scope.**
188
194
  - [x] **Baked-glyph DIGIT misread `M_age 59.3`→`39.3`** (collabra.77859, surfaced 2026-06-08 RC-1 AI-verify) — **DIAGNOSED + DECISION MADE (2026-06-08): document as known limitation, no code change.** Same class as `Västfjäll` but a DIGIT in a statistic (silent stat corruption, the most dangerous form for meta-science): the PDF *visually* shows `59.3` but the embedded text codepoint is baked as `3`, and **both pdftotext AND pdfplumber faithfully extract `39.3`** (confirmed by visual PDF read + dual-extractor diff). No text-channel logic can recover it; the only fixes are OCR/multimodal-glyph-consensus (a new subsystem the user explicitly **declined** to scope this session). **Consumer note: CitationGuard / downstream stat-checkers must assume baked digit/letter misreads exist in source PDFs and apply their own cross-source (CrossRef/visual) verification — docpluck cannot guarantee a digit matches the visual glyph when the publisher baked the wrong codepoint.**
@@ -78,7 +78,7 @@ from .figures import Figure
78
78
  from .extract_structured import TABLE_EXTRACTION_VERSION, StructuredResult, extract_pdf_structured
79
79
  from .render import render_pdf_to_markdown
80
80
 
81
- __version__ = "2.4.90"
81
+ __version__ = "2.4.91"
82
82
  __author__ = "Gilad Feldman"
83
83
  __license__ = "MIT"
84
84
 
@@ -22,8 +22,15 @@ import tempfile
22
22
  from pathlib import Path
23
23
  from typing import Optional, Union
24
24
 
25
-
26
- def extract_pdf(pdf_bytes: bytes, *, sections: list[str] | None = None) -> tuple[str, str]:
25
+ from .telemetry import record_fallback
26
+
27
+ def extract_pdf(
28
+ pdf_bytes: bytes,
29
+ *,
30
+ sections: list[str] | None = None,
31
+ max_input_bytes: int | None = None,
32
+ pdftotext_timeout_seconds: int = 120,
33
+ ) -> tuple[str, str]:
27
34
  """Extract text from PDF bytes.
28
35
 
29
36
  Uses pdftotext as the primary engine. Automatically falls back to
@@ -48,6 +55,12 @@ def extract_pdf(pdf_bytes: bytes, *, sections: list[str] | None = None) -> tuple
48
55
  "pdftotext_default" — normal extraction
49
56
  "pdftotext_default+pdfplumber_recovery" — SMP fallback triggered
50
57
 
58
+ Guardrails:
59
+ max_input_bytes: Optional hard cap for input size. When set and
60
+ ``len(pdf_bytes)`` exceeds it, a ValueError is raised.
61
+ pdftotext_timeout_seconds: Timeout for the pdftotext subprocess.
62
+ Defaults to 120 seconds, the timeout that was previously hard-coded.
63
+
51
64
  Requires:
52
65
  pdftotext binary (from poppler-utils) on PATH.
53
66
 
@@ -60,6 +73,13 @@ def extract_pdf(pdf_bytes: bytes, *, sections: list[str] | None = None) -> tuple
60
73
  with open("paper.pdf", "rb") as f:
61
74
  text, method = extract_pdf(f.read(), sections=["abstract", "methods"])
62
75
  """
76
+ if max_input_bytes is not None and len(pdf_bytes) > max_input_bytes:
77
+ raise ValueError(
78
+ f"PDF input exceeds max_input_bytes: {len(pdf_bytes)} > {max_input_bytes}"
79
+ )
80
+ if pdftotext_timeout_seconds <= 0:
81
+ raise ValueError("pdftotext_timeout_seconds must be > 0")
82
+
63
83
  with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
64
84
  tmp.write(pdf_bytes)
65
85
  tmp_path = tmp.name
@@ -69,7 +89,7 @@ def extract_pdf(pdf_bytes: bytes, *, sections: list[str] | None = None) -> tuple
69
89
  result = subprocess.run(
70
90
  ["pdftotext", "-enc", "UTF-8", tmp_path, "-"],
71
91
  capture_output=True,
72
- timeout=120,
92
+ timeout=pdftotext_timeout_seconds,
73
93
  encoding="utf-8",
74
94
  errors="replace",
75
95
  )
@@ -212,8 +232,10 @@ def extract_pdf(pdf_bytes: bytes, *, sections: list[str] | None = None) -> tuple
212
232
  if corrected and corrected != text and changed:
213
233
  text = corrected
214
234
  method = f"{method}+column_corrected:{','.join(map(str, changed))}"
215
- except Exception:
216
- pass
235
+ except Exception as exc:
236
+ exc_name = type(exc).__name__
237
+ record_fallback("column_correction_exception", detail=exc_name)
238
+ method = f"{method}+column_correction_failed:{exc_name}"
217
239
 
218
240
  if sections is not None:
219
241
  from .sections import extract_sections
@@ -293,11 +315,13 @@ def count_pages(pdf_bytes: bytes) -> int:
293
315
  import pdfplumber # type: ignore[import-not-found]
294
316
  with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
295
317
  return max(len(pdf.pages), 1)
296
- except Exception:
318
+ except Exception as exc:
319
+ record_fallback("count_pages_pdfplumber_fallback_failed", detail=type(exc).__name__)
297
320
  # pdfplumber failed (corrupt PDF, password-protected, etc.) —
298
321
  # fall back to the heuristic's value.
299
322
  return max(count, 1)
300
- except Exception:
323
+ except Exception as exc:
324
+ record_fallback("count_pages_exception", detail=type(exc).__name__)
301
325
  return 0
302
326
 
303
327
 
@@ -458,5 +482,6 @@ def _recover_with_pdfplumber(pdf_path: str) -> Optional[str]:
458
482
 
459
483
  return full_text
460
484
 
461
- except Exception:
485
+ except Exception as exc:
486
+ record_fallback("pdfplumber_recovery_exception", detail=type(exc).__name__)
462
487
  return None
@@ -356,296 +356,6 @@ def _word_multiset(text: str) -> "Counter":
356
356
  return Counter(toks)
357
357
 
358
358
 
359
- # ── RC-1 Step 2: per-band region-aware column de-interleave ──
360
- #
361
- # `extract_page_text_columns` corrects a WHOLE page and is (correctly) refused
362
- # by the bilateral / full-height-gutter gate whenever the page carries an
363
- # embedded full-width table or banner — a whole-page left-then-right crop would
364
- # slice straight through the table. That leaves the two-column PROSE bands
365
- # above/below the table interleaved — the dominant residual on two-column APA
366
- # papers (Collabra / JESP / chandrashekar / ip_feldman).
367
- #
368
- # Step 2 segments the page into horizontal y-bands and column-corrects only the
369
- # bands that are genuinely two-column across THEIR OWN y-range, leaving
370
- # full-width bands (table rows, banners, spanning headings) untouched. The
371
- # gutter is located by the MIN-CROSSING central x — it tolerates table/banner
372
- # rows that cross it (those rows simply classify as full-width) — rather than
373
- # the whole-page full-height clean strip a table would destroy.
374
- #
375
- # Safety is layered so no corruption can ship (rules 0a / 0b):
376
- # 1. a band is column-cropped only when both sides carry substantial text AND
377
- # no word straddles the cut x;
378
- # 2. per-band word-preservation — the two-column crop is accepted only when
379
- # its substantial-word multiset equals the band's full-width crop (a pure
380
- # reorder). A straddle-split (`donation` → `dona` + `tion`) or a
381
- # cross-column glue difference (`betweenoriginal` → `between` + `subject`)
382
- # makes them differ → that band alone falls back to the word-correct
383
- # full-width crop, preserving the page's other good bands;
384
- # 3. the caller's UNCONDITIONAL page-level word-preservation guard in
385
- # `splice_column_corrected_pages` is the final backstop — any page whose
386
- # reassembly changes the page word multiset is rejected wholesale and the
387
- # original (interleaved but word-correct) text kept. Worst case is an
388
- # unimproved page, never a corrupted one.
389
- #
390
- # Keyed on a structural signature (gutter geometry + per-row column occupancy),
391
- # never on paper identity (CLAUDE.md general-fix rule). Algorithm validated as a
392
- # tmp/ prototype on 71 flagged pages across 5 two-column papers before promotion
393
- # (65/71 word-safe before per-band fallback; the 6 straddle/glue violations are
394
- # what safety #2 localizes). pdftotext crop-mode is used for spacing fidelity,
395
- # consistent with `extract_page_text_columns` (CLAUDE.md hard rule 3: conditional
396
- # per-page re-extraction, not a default tool swap).
397
-
398
- # Half-width (PDF points) of the central strip that must be glyph-free for a row
399
- # to read as two-column. A full-width line's inter-word space (~3-4pt) is
400
- # narrower than 2*_GUTTER_STRIP_HALF, so a justified title/abstract line is
401
- # correctly classed full-width; a real column gutter (10-30pt) clears it.
402
- _GUTTER_STRIP_HALF = 4.0
403
-
404
-
405
- def _min_crossing_gutter(words: list[dict], page_width: float) -> float | None:
406
- """Find the central-band x crossed by the FEWEST distinct text rows.
407
-
408
- Unlike `_detect_2col_midline_gutter` (which REQUIRES a near-zero-crossing
409
- full-height strip and so returns None on any page with an embedded
410
- full-width table row), this returns the *least-crossed* central x even when
411
- table/banner rows cross it — those rows become full-width bands downstream.
412
- Confined to [0.35W, 0.65W] and tie-broken toward page center. Returns None
413
- when the page has too few text rows to trust a gutter.
414
- """
415
- if not words or page_width <= 0:
416
- return None
417
- lo_i, hi_i = int(page_width * 0.35), int(page_width * 0.65)
418
- if hi_i - lo_i < _MIN_GUTTER_STRIP_WIDTH:
419
- return None
420
- all_rows = {int(round(w["top"] / _LINE_Y_TOLERANCE)) for w in words}
421
- if len(all_rows) < 10:
422
- return None
423
- crossings: dict[int, set] = defaultdict(set)
424
- for w in words:
425
- x0 = max(lo_i, int(w["x0"]))
426
- x1 = min(hi_i, int(w["x1"]))
427
- if x1 < x0:
428
- continue
429
- rk = int(round(w["top"] / _LINE_Y_TOLERANCE))
430
- for x in range(x0, x1 + 1):
431
- crossings[x].add(rk)
432
- center = page_width / 2.0
433
- best_x: int | None = None
434
- best_n: int | None = None
435
- for x in range(lo_i, hi_i + 1):
436
- n = len(crossings.get(x, ()))
437
- if (best_n is None or n < best_n
438
- or (n == best_n and abs(x - center) < abs(best_x - center))):
439
- best_x, best_n = x, n
440
- return float(best_x) if best_x is not None else None
441
-
442
-
443
- def _row_is_two_column(row_words: list[dict], gutter_x: float) -> bool:
444
- """A row is column-compatible (NOT full-width) iff NO word's horizontal
445
- extent enters the central strip [gx-Δ, gx+Δ] — i.e. nothing crosses the
446
- column gutter / crop line.
447
-
448
- Crucially this does NOT require text on both sides: in real staggered
449
- two-column prose most rows carry text in only ONE column at a given y
450
- (paragraph ends, ragged column bottoms), and those one-sided rows belong in
451
- the two-column band — they crop cleanly to their own side and the empty
452
- side contributes nothing. Requiring both sides per-row (an earlier
453
- prototype's rule) fragments a genuine two-column region into noise and
454
- misses it entirely (collabra_77859). The "is this band actually
455
- two-column" judgement is made once at the BAND level (both sides ≥ 25% of
456
- the band's words) in `extract_page_text_bands`, which is the correct scale
457
- for it. A full-width line (table row, banner, spanning heading, or any line
458
- with a word straddling the cut) has a word in the strip and so is False."""
459
- return not any(
460
- w["x0"] <= gutter_x + _GUTTER_STRIP_HALF
461
- and w["x1"] >= gutter_x - _GUTTER_STRIP_HALF
462
- for w in row_words
463
- )
464
-
465
-
466
- def _segment_into_bands(words: list[dict], gutter_x: float, page_height: float):
467
- """Group a page's text rows into contiguous full-width / two-column y-bands.
468
-
469
- Returns a list of ``(is_full_width, y_top, y_bottom, band_words)`` in
470
- y-order. Up to one isolated opposite-class row is tolerated inside a run (a
471
- stray descender crossing the gutter, a one-line full-width subhead).
472
- Adjacent bands whose y-extents OVERLAP are merged and forced full-width —
473
- overlapping mixed-size content (a tall title line abutting the next row)
474
- can't be cleanly column-separated, and a full-width crop is the safe,
475
- word-preserving fallback there.
476
- """
477
- rows: dict[int, list[dict]] = defaultdict(list)
478
- for w in words:
479
- rows[int(round(w["top"] / _LINE_Y_TOLERANCE))].append(w)
480
- classified = [] # (y_top, y_bottom, is_full_width, words)
481
- for rk in sorted(rows):
482
- ws = rows[rk]
483
- classified.append((
484
- min(w["top"] for w in ws),
485
- max(w["bottom"] for w in ws),
486
- not _row_is_two_column(ws, gutter_x),
487
- ws,
488
- ))
489
- if not classified:
490
- return []
491
-
492
- # Group contiguous same-class rows (tol = 1 isolated opposite row).
493
- grouped: list[tuple[bool, list]] = []
494
- cur_class = classified[0][2]
495
- run = [classified[0]]
496
- opp = 0
497
- for row in classified[1:]:
498
- if row[2] == cur_class:
499
- run.append(row)
500
- opp = 0
501
- else:
502
- opp += 1
503
- run.append(row)
504
- if opp > 1:
505
- keep = run[:-opp]
506
- if keep:
507
- grouped.append((cur_class, keep))
508
- run = run[-opp:]
509
- cur_class = row[2]
510
- opp = 0
511
- if run:
512
- grouped.append((cur_class, run))
513
-
514
- bands = []
515
- for fw, rws in grouped:
516
- bands.append([
517
- fw,
518
- min(r[0] for r in rws),
519
- max(r[1] for r in rws),
520
- [w for r in rws for w in r[3]],
521
- ])
522
-
523
- # Overlap-merge adjacent bands (force full-width on merge).
524
- merged: list[list] = [list(bands[0])]
525
- for b in bands[1:]:
526
- prev = merged[-1]
527
- if b[1] <= prev[2]: # b.y_top <= prev.y_bottom → overlap
528
- prev[0] = True
529
- prev[2] = max(prev[2], b[2])
530
- prev[3] = prev[3] + b[3]
531
- else:
532
- merged.append(list(b))
533
- return [(b[0], b[1], b[2], b[3]) for b in merged]
534
-
535
-
536
- def extract_page_text_bands(layout_doc, page_index: int,
537
- pdf_bytes: bytes | None = None) -> str:
538
- """Per-band region-aware column de-interleave for a single page.
539
-
540
- The Step-2 complement to `extract_page_text_columns`: when a page carries an
541
- embedded full-width table/banner (so the whole-page corrector's bilateral /
542
- full-height-gutter gate skips it), segment the page into y-bands and reorder
543
- only the genuinely-two-column bands left-then-right, leaving full-width bands
544
- as-is, then reassemble in y-order at glyph-free cut lines.
545
-
546
- Args:
547
- layout_doc: LayoutDoc from `docpluck.extract_layout.extract_pdf_layout`.
548
- page_index: 0-based page index.
549
- pdf_bytes: raw PDF bytes (required — pdftotext crop-mode gives the
550
- word spacing pdfplumber drops on tight-kerned PDFs).
551
-
552
- Returns:
553
- The reassembled page text, or "" when the page has no clean gutter or no
554
- confidently-correctable two-column band (caller keeps the original
555
- text). Every two-column band emitted is a per-band word-preserving
556
- reorder; the caller's page-level guard is the final backstop.
557
- """
558
- if pdf_bytes is None:
559
- return ""
560
- if page_index < 0 or page_index >= len(layout_doc.pages):
561
- return ""
562
- page = layout_doc.pages[page_index]
563
- page_width = float(page.width or 0.0)
564
- page_height = float(page.height or 0.0)
565
- if page_width <= 0 or page_height <= 0:
566
- return ""
567
- words = list(page.words or ())
568
- if len(words) < _MIN_WORDS_FOR_COLUMN_MODE:
569
- return ""
570
-
571
- gutter_x = _min_crossing_gutter(words, page_width)
572
- if gutter_x is None:
573
- return ""
574
- bands = _segment_into_bands(words, gutter_x, page_height)
575
- if not bands:
576
- return ""
577
-
578
- # Clean cut lines: after overlap-merge the bands are non-overlapping, so each
579
- # inter-band midpoint sits in a glyph-free gap. Crop [cut[i], cut[i+1]] so the
580
- # page is covered top-to-bottom with no gaps/overlaps and no horizontal cut
581
- # bisects a glyph.
582
- cuts = [0.0]
583
- for i in range(len(bands) - 1):
584
- cuts.append((bands[i][2] + bands[i + 1][1]) / 2.0)
585
- cuts.append(page_height)
586
-
587
- import os
588
- import subprocess
589
- import tempfile
590
- with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
591
- tmp.write(pdf_bytes)
592
- tmp_path = tmp.name
593
-
594
- def _crop(x: float, y: float, w: float, h: float) -> str:
595
- if w <= 1 or h <= 1:
596
- return ""
597
- proc = subprocess.run(
598
- ["pdftotext", "-enc", "UTF-8",
599
- "-f", str(page_index + 1), "-l", str(page_index + 1),
600
- "-x", str(int(x)), "-y", str(int(y)),
601
- "-W", str(int(w)), "-H", str(int(h)), tmp_path, "-"],
602
- capture_output=True, timeout=30, encoding="utf-8", errors="replace",
603
- )
604
- if proc.returncode != 0:
605
- return ""
606
- return (proc.stdout or "").rstrip("\f").strip()
607
-
608
- try:
609
- parts: list[str] = []
610
- n_two_col = 0
611
- for i, (is_full_width, _y_top, _y_bottom, band_words) in enumerate(bands):
612
- top, bot = cuts[i], cuts[i + 1]
613
- height = bot - top
614
- did_two_col = False
615
- if not is_full_width and band_words:
616
- left = [w for w in band_words if (w["x0"] + w["x1"]) / 2 < gutter_x]
617
- right = [w for w in band_words if (w["x0"] + w["x1"]) / 2 >= gutter_x]
618
- straddles = any(w["x0"] < gutter_x < w["x1"] for w in band_words)
619
- if (len(left) >= 0.25 * len(band_words)
620
- and len(right) >= 0.25 * len(band_words)
621
- and not straddles):
622
- lt = _crop(0, top, gutter_x, height)
623
- rt = _crop(gutter_x, top, page_width - gutter_x, height)
624
- # Per-band word-preservation (safety #2): accept the
625
- # two-column reorder only when it neither splits nor re-glues
626
- # a word vs the band's own full-width crop. Otherwise that
627
- # band falls back to full-width — word-correct, still
628
- # interleaved — without sacrificing the page's other bands.
629
- if lt.strip() and rt.strip():
630
- fw = _crop(0, top, page_width, height)
631
- if _word_multiset(lt + "\n" + rt) == _word_multiset(fw):
632
- parts.append((lt + "\n" + rt).strip())
633
- n_two_col += 1
634
- did_two_col = True
635
- if not did_two_col:
636
- parts.append(_crop(0, top, page_width, height))
637
- if n_two_col == 0:
638
- # No band was confidently column-corrected → no improvement over the
639
- # original; signal the caller to keep the original page text.
640
- return ""
641
- return "\n".join(p for p in parts if p.strip())
642
- finally:
643
- try:
644
- os.unlink(tmp_path)
645
- except Exception:
646
- pass
647
-
648
-
649
359
  def splice_column_corrected_pages(
650
360
  raw_text: str,
651
361
  layout_doc,
@@ -29,7 +29,12 @@ import io
29
29
  from .extract_html import html_to_text
30
30
 
31
31
 
32
- def extract_docx(docx_bytes: bytes, *, sections: list[str] | None = None) -> tuple[str, str]:
32
+ def extract_docx(
33
+ docx_bytes: bytes,
34
+ *,
35
+ sections: list[str] | None = None,
36
+ max_input_bytes: int | None = None,
37
+ ) -> tuple[str, str]:
33
38
  """Extract text from DOCX file bytes.
34
39
 
35
40
  Converts the DOCX to HTML via mammoth (preserving soft breaks and block
@@ -42,6 +47,8 @@ def extract_docx(docx_bytes: bytes, *, sections: list[str] | None = None) -> tup
42
47
  "methods"]``) to filter the output. When provided, ``extract_sections``
43
48
  is called and only the requested sections are returned concatenated
44
49
  in document order. Pass ``None`` (default) to return the full text.
50
+ max_input_bytes: Optional hard cap for input size. When set and
51
+ ``len(docx_bytes)`` exceeds it, a ValueError is raised.
45
52
 
46
53
  Returns:
47
54
  A tuple of (text, method) where:
@@ -68,6 +75,11 @@ def extract_docx(docx_bytes: bytes, *, sections: list[str] | None = None) -> tup
68
75
  # Lazy import so the core library works without mammoth installed
69
76
  import mammoth
70
77
 
78
+ if max_input_bytes is not None and len(docx_bytes) > max_input_bytes:
79
+ raise ValueError(
80
+ f"DOCX input exceeds max_input_bytes: {len(docx_bytes)} > {max_input_bytes}"
81
+ )
82
+
71
83
  result = mammoth.convert_to_html(io.BytesIO(docx_bytes))
72
84
  html = result.value
73
85
  text = html_to_text(html)