docpluck 2.4.83__tar.gz → 2.4.85__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (396)
  1. {docpluck-2.4.83 → docpluck-2.4.85}/.claude/skills/docpluck-deploy/SKILL.md +30 -13
  2. {docpluck-2.4.83 → docpluck-2.4.85}/.claude/skills/docpluck-iterate/LEARNINGS.md +51 -0
  3. {docpluck-2.4.83 → docpluck-2.4.85}/.github/workflows/test.yml +8 -1
  4. {docpluck-2.4.83 → docpluck-2.4.85}/CHANGELOG.md +35 -0
  5. {docpluck-2.4.83 → docpluck-2.4.85}/PKG-INFO +1 -1
  6. {docpluck-2.4.83 → docpluck-2.4.85}/TODO.md +15 -0
  7. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/__init__.py +1 -1
  8. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/normalize.py +226 -14
  9. {docpluck-2.4.83 → docpluck-2.4.85}/pyproject.toml +1 -1
  10. docpluck-2.4.85/tests/test_harvard_refs_pagebreak_stitch.py +233 -0
  11. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_normalize_a3_r2_body_integer_real_pdf.py +41 -0
  12. {docpluck-2.4.83 → docpluck-2.4.85}/.claude/skills/_project/canary.json +0 -0
  13. {docpluck-2.4.83 → docpluck-2.4.85}/.claude/skills/_project/lessons.md +0 -0
  14. {docpluck-2.4.83 → docpluck-2.4.85}/.claude/skills/docpluck-cleanup/SKILL.md +0 -0
  15. {docpluck-2.4.83 → docpluck-2.4.85}/.claude/skills/docpluck-iterate/SKILL.md +0 -0
  16. {docpluck-2.4.83 → docpluck-2.4.85}/.claude/skills/docpluck-iterate/references/ai-full-doc-verify.md +0 -0
  17. {docpluck-2.4.83 → docpluck-2.4.85}/.claude/skills/docpluck-iterate/references/cycle-report-template.md +0 -0
  18. {docpluck-2.4.83 → docpluck-2.4.85}/.claude/skills/docpluck-iterate/references/local-verification.md +0 -0
  19. {docpluck-2.4.83 → docpluck-2.4.85}/.claude/skills/docpluck-iterate/references/rationalizations.md +0 -0
  20. {docpluck-2.4.83 → docpluck-2.4.85}/.claude/skills/docpluck-iterate/references/real-library-real-pdf.md +0 -0
  21. {docpluck-2.4.83 → docpluck-2.4.85}/.claude/skills/docpluck-iterate/references/release-flow.md +0 -0
  22. {docpluck-2.4.83 → docpluck-2.4.85}/.claude/skills/docpluck-iterate/references/self-improvement.md +0 -0
  23. {docpluck-2.4.83 → docpluck-2.4.85}/.claude/skills/docpluck-iterate/references/three-tier-parity.md +0 -0
  24. {docpluck-2.4.83 → docpluck-2.4.85}/.claude/skills/docpluck-qa/SKILL.md +0 -0
  25. {docpluck-2.4.83 → docpluck-2.4.85}/.claude/skills/docpluck-qa/references/benchmark-mode.md +0 -0
  26. {docpluck-2.4.83 → docpluck-2.4.85}/.claude/skills/docpluck-qa/references/check-11-hard-rules.md +0 -0
  27. {docpluck-2.4.83 → docpluck-2.4.85}/.claude/skills/docpluck-qa/references/check-13-escicheck-production.md +0 -0
  28. {docpluck-2.4.83 → docpluck-2.4.85}/.claude/skills/docpluck-qa/references/check-5-escicheck-library.md +0 -0
  29. {docpluck-2.4.83 → docpluck-2.4.85}/.claude/skills/docpluck-qa/references/check-6-escicheck-local-webapp.md +0 -0
  30. {docpluck-2.4.83 → docpluck-2.4.85}/.claude/skills/docpluck-qa/references/check-7-batch-smoke.md +0 -0
  31. {docpluck-2.4.83 → docpluck-2.4.85}/.claude/skills/docpluck-review/SKILL.md +0 -0
  32. {docpluck-2.4.83 → docpluck-2.4.85}/.github/workflows/bump-app-pin.yml +0 -0
  33. {docpluck-2.4.83 → docpluck-2.4.85}/.github/workflows/publish.yml +0 -0
  34. {docpluck-2.4.83 → docpluck-2.4.85}/.gitignore +0 -0
  35. {docpluck-2.4.83 → docpluck-2.4.85}/CLAUDE.md +0 -0
  36. {docpluck-2.4.83 → docpluck-2.4.85}/HANDOFF_SECTIONS_APP_INTEGRATION.md +0 -0
  37. {docpluck-2.4.83 → docpluck-2.4.85}/LESSONS.md +0 -0
  38. {docpluck-2.4.83 → docpluck-2.4.85}/LICENSE +0 -0
  39. {docpluck-2.4.83 → docpluck-2.4.85}/REPLY_FROM_DOCPLUCK_v1.4.5.md +0 -0
  40. {docpluck-2.4.83 → docpluck-2.4.85}/REPLY_FROM_DOCPLUCK_v1.5.0.md +0 -0
  41. {docpluck-2.4.83 → docpluck-2.4.85}/REQUEST_08_CHUNKING_ENDPOINT.md +0 -0
  42. {docpluck-2.4.83 → docpluck-2.4.85}/REQUEST_09_REFERENCE_LIST_NORMALIZATION.md +0 -0
  43. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/__main__.py +0 -0
  44. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/batch.py +0 -0
  45. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/cli.py +0 -0
  46. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/extract.py +0 -0
  47. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/extract_columns.py +0 -0
  48. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/extract_docx.py +0 -0
  49. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/extract_html.py +0 -0
  50. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/extract_layout.py +0 -0
  51. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/extract_structured.py +0 -0
  52. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/figures/__init__.py +0 -0
  53. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/figures/detect.py +0 -0
  54. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/quality.py +0 -0
  55. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/render.py +0 -0
  56. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/sections/__init__.py +0 -0
  57. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/sections/annotators/__init__.py +0 -0
  58. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/sections/annotators/docx.py +0 -0
  59. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/sections/annotators/html.py +0 -0
  60. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/sections/annotators/pdf.py +0 -0
  61. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/sections/annotators/text.py +0 -0
  62. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/sections/blocks.py +0 -0
  63. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/sections/boundaries.py +0 -0
  64. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/sections/core.py +0 -0
  65. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/sections/taxonomy.py +0 -0
  66. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/sections/types.py +0 -0
  67. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/tables/__init__.py +0 -0
  68. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/tables/bbox_utils.py +0 -0
  69. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/tables/camelot_extract.py +0 -0
  70. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/tables/captions.py +0 -0
  71. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/tables/cell_cleaning.py +0 -0
  72. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/tables/cluster.py +0 -0
  73. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/tables/confidence.py +0 -0
  74. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/tables/detect.py +0 -0
  75. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/tables/flatten.py +0 -0
  76. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/tables/render.py +0 -0
  77. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/tables/whitespace.py +0 -0
  78. {docpluck-2.4.83 → docpluck-2.4.85}/docpluck/version.py +0 -0
  79. {docpluck-2.4.83 → docpluck-2.4.85}/docs/BENCHMARKS.md +0 -0
  80. {docpluck-2.4.83 → docpluck-2.4.85}/docs/DESIGN.md +0 -0
  81. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-07_sections_strict_iteration.md +0 -0
  82. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-09_session_state_and_followups.md +0 -0
  83. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-09_unified_extraction_brainstorm.md +0 -0
  84. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-10_table_rendering_iteration.md +0 -0
  85. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-10_table_rendering_iteration_2.md +0 -0
  86. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-10_table_rendering_iteration_3.md +0 -0
  87. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-10_table_rendering_iteration_4.md +0 -0
  88. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-10_table_rendering_iteration_5.md +0 -0
  89. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-10_table_rendering_iteration_6.md +0 -0
  90. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-10_table_rendering_iteration_7.md +0 -0
  91. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-11_PROMOTE_SPIKE_TO_LIBRARY.md +0 -0
  92. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-11_table_rendering_iteration_8.md +0 -0
  93. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-11_visual_review_findings.md +0 -0
  94. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-12_phase2_101pdf_corpus.md +0 -0
  95. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-12_remaining_ui_and_chrome_verification.md +0 -0
  96. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-12_visual_verify_results.md +0 -0
  97. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-13_apa_50_expansion.md +0 -0
  98. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-13_apa_50_expansion_iter_1.md +0 -0
  99. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-13_apa_50_expansion_iter_2.md +0 -0
  100. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-13_iterate_skill_first_use.md +0 -0
  101. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-13_iterative_1.md +0 -0
  102. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-13_iterative_library_improvement.md +0 -0
  103. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-13_table_extraction_next_iteration.md +0 -0
  104. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-14_continue_iterations_v2_4_30_to_15n.md +0 -0
  105. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-14_full_corpus_iteration_v2_4_30.md +0 -0
  106. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-14_iterate_6_cycles_complete.md +0 -0
  107. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-14_iterate_9_cycle_run.md +0 -0
  108. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-14_iterate_resume_4_cycles.md +0 -0
  109. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-14_iterate_v2_4_31_cycle_15n.md +0 -0
  110. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-14_phase_5d_gold_audit_v2_4_29.md +0 -0
  111. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-15_autonomous_apa_first_10h.md +0 -0
  112. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-15_iterate_apa_run_1.md +0 -0
  113. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-16_ai-gold-instructions.md +0 -0
  114. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-16_iterate_apa_run_2.md +0 -0
  115. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-16_iterate_apa_run_3.md +0 -0
  116. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-16_iterate_run_4_final.md +0 -0
  117. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-16_iterate_run_4_fix_and_continue.md +0 -0
  118. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-16_iterate_run_5.md +0 -0
  119. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-16_iterate_run_6.md +0 -0
  120. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-17_iterate_run_7.md +0 -0
  121. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-17_iterate_run_8.md +0 -0
  122. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-17_iterate_run_9.md +0 -0
  123. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-18_iterate_run_9_cont.md +0 -0
  124. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-18_iterate_run_9_cont2.md +0 -0
  125. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-20_iterate_run_9_cont3.md +0 -0
  126. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-22_iterate_run_9_session4_final.md +0 -0
  127. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-22_iterate_run_9_session5_close.md +0 -0
  128. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-25_haiku-orchestration-pretest.md +0 -0
  129. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-05-25_pretest-followups.md +0 -0
  130. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-06-08_iterate_splice-wordintegrity-runningheader.md +0 -0
  131. {docpluck-2.4.83 → docpluck-2.4.85}/docs/HANDOFF_2026-06-08_untested_sweep_v2.4.81.md +0 -0
  132. {docpluck-2.4.83 → docpluck-2.4.85}/docs/ITERATION_VERIFICATION_LESSONS.md +0 -0
  133. {docpluck-2.4.83 → docpluck-2.4.85}/docs/LIBRARY_APP_SYNC.md +0 -0
  134. {docpluck-2.4.83 → docpluck-2.4.85}/docs/NORMALIZATION.md +0 -0
  135. {docpluck-2.4.83 → docpluck-2.4.85}/docs/README.md +0 -0
  136. {docpluck-2.4.83 → docpluck-2.4.85}/docs/TRIAGE_2026-05-10_corpus_assessment.md +0 -0
  137. {docpluck-2.4.83 → docpluck-2.4.85}/docs/TRIAGE_2026-05-14_phase_5d_gold_audit.md +0 -0
  138. {docpluck-2.4.83 → docpluck-2.4.85}/docs/TRIAGE_2026-06-08_untested_corpus_sweep.md +0 -0
  139. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/handoffs/2026-05-22-b1-next-iteration.md +0 -0
  140. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/handoffs/2026-05-22-b2-remaining-halluc-head.md +0 -0
  141. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/handoffs/2026-05-22-b3-b7-structural-defects.md +0 -0
  142. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/handoffs/2026-05-22-residual-after-locally-doable-pass.md +0 -0
  143. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/handoffs/2026-05-23-bundled-residual-cycle-CLOSED.md +0 -0
  144. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/handoffs/2026-05-23-residual-after-iterate-spine-cycles-1-3.md +0 -0
  145. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/handoffs/2026-05-25-canary-audit-architecture-and-cluster-A-B-C-landed.md +0 -0
  146. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/handoffs/2026-05-25-wrapup-r4-cycle.md +0 -0
  147. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/handoffs/2026-05-26-run-11-cluster-A-ter-and-C-bis-landed.md +0 -0
  148. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/handoffs/2026-05-26-text-extraction-defects-from-citationguard-audit.md +0 -0
  149. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/handoffs/2026-06-07-text-extraction-defects-from-citationguard-iterate.md +0 -0
  150. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/handoffs/2026-06-07-v2.4.78-landed-canary-iterate.md +0 -0
  151. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/handoffs/2026-06-07-v2.4.79-findings-1-2-cleared.md +0 -0
  152. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/2026-05-06-section-identification.md +0 -0
  153. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/2026-05-06-table-extraction.md +0 -0
  154. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/2026-05-07-sections-strict-iteration-progress.md +0 -0
  155. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/2026-05-08-unified-extraction-phase-0-splice-spike.md +0 -0
  156. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/2026-05-23-haiku-orchestration-pretest.md +0 -0
  157. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/sections-deferred-items.md +0 -0
  158. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/sections-issues-backlog.md +0 -0
  159. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/2026-05-07_spot-01_apa.md +0 -0
  160. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/2026-05-07_spot-02_pattern-A-shipped.md +0 -0
  161. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/2026-05-08_spot-final_all-styles.md +0 -0
  162. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/experiments/COMPARISON.md +0 -0
  163. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/korbmacher_table1.md +0 -0
  164. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/option-a.py +0 -0
  165. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/ziano_table1.md +0 -0
  166. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_notes_raw.txt +0 -0
  167. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_table1.md +0 -0
  168. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/notes.md +0 -0
  169. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/option-b.py +0 -0
  170. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_notes_raw.txt +0 -0
  171. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_table1.md +0 -0
  172. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/korbmacher_table1.md +0 -0
  173. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/notes.md +0 -0
  174. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/option-c.py +0 -0
  175. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/sample-pdftotext-bbox.html +0 -0
  176. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/ziano_table1.md +0 -0
  177. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/korbmacher_table1.md +0 -0
  178. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/notes.md +0 -0
  179. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/option-d.py +0 -0
  180. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/ziano_table1.md +0 -0
  181. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_2022_kruger_bbox.html +0 -0
  182. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_bbox.html +0 -0
  183. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_table1.md +0 -0
  184. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/option-e.py +0 -0
  185. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/sample-bbox.html +0 -0
  186. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_2021_joep_bbox.html +0 -0
  187. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_bbox.html +0 -0
  188. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_table1.md +0 -0
  189. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/html-fallback-demo.md +0 -0
  190. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.err +0 -0
  191. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.md +0 -0
  192. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.err +0 -0
  193. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.md +0 -0
  194. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.err +0 -0
  195. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.md +0 -0
  196. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.err +0 -0
  197. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.md +0 -0
  198. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.err +0 -0
  199. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.md +0 -0
  200. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.err +0 -0
  201. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.md +0 -0
  202. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.err +0 -0
  203. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.md +0 -0
  204. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.err +0 -0
  205. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.md +0 -0
  206. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.err +0 -0
  207. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.md +0 -0
  208. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.err +0 -0
  209. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.md +0 -0
  210. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.err +0 -0
  211. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.md +0 -0
  212. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.err +0 -0
  213. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.md +0 -0
  214. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.err +0 -0
  215. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.md +0 -0
  216. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.err +0 -0
  217. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.md +0 -0
  218. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.err +0 -0
  219. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.md +0 -0
  220. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.err +0 -0
  221. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.md +0 -0
  222. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.err +0 -0
  223. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.md +0 -0
  224. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.err +0 -0
  225. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.md +0 -0
  226. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.err +0 -0
  227. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.md +0 -0
  228. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.err +0 -0
  229. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.md +0 -0
  230. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.err +0 -0
  231. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.md +0 -0
  232. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.err +0 -0
  233. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.md +0 -0
  234. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.err +0 -0
  235. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.md +0 -0
  236. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.err +0 -0
  237. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.md +0 -0
  238. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.err +0 -0
  239. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.md +0 -0
  240. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.err +0 -0
  241. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.md +0 -0
  242. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/papers.md +0 -0
  243. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/report.md +0 -0
  244. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/splice_spike.py +0 -0
  245. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/plans/spot-checks/splice-spike/test_splice_spike.py +0 -0
  246. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/specs/2026-04-27-request-09-reference-normalization-design.md +0 -0
  247. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/specs/2026-05-06-section-identification-design.md +0 -0
  248. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/specs/2026-05-06-table-extraction-design.md +0 -0
  249. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/specs/2026-05-08-unified-extraction-design.md +0 -0
  250. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/specs/2026-05-23-haiku-orchestration-pretest-design.md +0 -0
  251. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/specs/2026-06-07-ip_feldman-B4-R4-column-interleave-diagnosis.md +0 -0
  252. {docpluck-2.4.83 → docpluck-2.4.85}/docs/superpowers/specs/2026-06-08-rc1-region-aware-column-architecture.md +0 -0
  253. {docpluck-2.4.83 → docpluck-2.4.85}/scripts/__init__.py +0 -0
  254. {docpluck-2.4.83 → docpluck-2.4.85}/scripts/harness/README.md +0 -0
  255. {docpluck-2.4.83 → docpluck-2.4.85}/scripts/harness/VERIFIER_PROMPT.md +0 -0
  256. {docpluck-2.4.83 → docpluck-2.4.85}/scripts/harness/__init__.py +0 -0
  257. {docpluck-2.4.83 → docpluck-2.4.85}/scripts/harness/baseline_matrix.json +0 -0
  258. {docpluck-2.4.83 → docpluck-2.4.85}/scripts/harness/checks.py +0 -0
  259. {docpluck-2.4.83 → docpluck-2.4.85}/scripts/harness/corpus.py +0 -0
  260. {docpluck-2.4.83 → docpluck-2.4.85}/scripts/harness/corpus_manifest.json +0 -0
  261. {docpluck-2.4.83 → docpluck-2.4.85}/scripts/harness/extract.py +0 -0
  262. {docpluck-2.4.83 → docpluck-2.4.85}/scripts/harness/gold_keys.json +0 -0
  263. {docpluck-2.4.83 → docpluck-2.4.85}/scripts/harness/inspect.py +0 -0
  264. {docpluck-2.4.83 → docpluck-2.4.85}/scripts/lint_rendered_corpus.py +0 -0
  265. {docpluck-2.4.83 → docpluck-2.4.85}/scripts/pretest_capture_tokens.py +0 -0
  266. {docpluck-2.4.83 → docpluck-2.4.85}/scripts/verify_corpus.py +0 -0
  267. {docpluck-2.4.83 → docpluck-2.4.85}/scripts/verify_corpus_full.py +0 -0
  268. {docpluck-2.4.83 → docpluck-2.4.85}/tests/__init__.py +0 -0
  269. {docpluck-2.4.83 → docpluck-2.4.85}/tests/conftest.py +0 -0
  270. {docpluck-2.4.83 → docpluck-2.4.85}/tests/fixtures/__init__.py +0 -0
  271. {docpluck-2.4.83 → docpluck-2.4.85}/tests/fixtures/sections/__init__.py +0 -0
  272. {docpluck-2.4.83 → docpluck-2.4.85}/tests/fixtures/sections/builders.py +0 -0
  273. {docpluck-2.4.83 → docpluck-2.4.85}/tests/fixtures/structured/.gitkeep +0 -0
  274. {docpluck-2.4.83 → docpluck-2.4.85}/tests/fixtures/structured/MANIFEST.json +0 -0
  275. {docpluck-2.4.83 → docpluck-2.4.85}/tests/fixtures/structured/README.md +0 -0
  276. {docpluck-2.4.83 → docpluck-2.4.85}/tests/golden/sections/apa_multi_study_pdf.json +0 -0
  277. {docpluck-2.4.83 → docpluck-2.4.85}/tests/golden/sections/apa_single_study_pdf.json +0 -0
  278. {docpluck-2.4.83 → docpluck-2.4.85}/tests/golden/sections/html_real_headings.json +0 -0
  279. {docpluck-2.4.83 → docpluck-2.4.85}/tests/snapshots/amj_lattice.txt +0 -0
  280. {docpluck-2.4.83 → docpluck-2.4.85}/tests/snapshots/apa_chan_feldman_lineless.txt +0 -0
  281. {docpluck-2.4.83 → docpluck-2.4.85}/tests/snapshots/apa_chen_jesp_lineless.txt +0 -0
  282. {docpluck-2.4.83 → docpluck-2.4.85}/tests/snapshots/apa_efendic_affect.txt +0 -0
  283. {docpluck-2.4.83 → docpluck-2.4.85}/tests/snapshots/apa_ip_feldman_pspb.txt +0 -0
  284. {docpluck-2.4.83 → docpluck-2.4.85}/tests/snapshots/bmc_lattice.txt +0 -0
  285. {docpluck-2.4.83 → docpluck-2.4.85}/tests/snapshots/ieee_figure_heavy.txt +0 -0
  286. {docpluck-2.4.83 → docpluck-2.4.85}/tests/snapshots/ieee_lattice.txt +0 -0
  287. {docpluck-2.4.83 → docpluck-2.4.85}/tests/snapshots/jama_lattice.txt +0 -0
  288. {docpluck-2.4.83 → docpluck-2.4.85}/tests/snapshots/nat_comms_figure_only.txt +0 -0
  289. {docpluck-2.4.83 → docpluck-2.4.85}/tests/snapshots/nature_minimal_rule.txt +0 -0
  290. {docpluck-2.4.83 → docpluck-2.4.85}/tests/snapshots/scirep_minimal_rule.txt +0 -0
  291. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_a3c_leading_zero_decimal_real_pdf.py +0 -0
  292. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_a4_ci_period_to_comma.py +0 -0
  293. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_all_caps_section_promote_real_pdf.py +0 -0
  294. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_bbox_utils.py +0 -0
  295. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_benchmark_docx_html.py +0 -0
  296. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_cambridge_footer_strip_real_pdf.py +0 -0
  297. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_caption_only_table_heading_real_pdf.py +0 -0
  298. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_caption_regex.py +0 -0
  299. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_chart_data_trim_real_pdf.py +0 -0
  300. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_cid_minus_recovery_real_pdf.py +0 -0
  301. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_cli_sections.py +0 -0
  302. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_cli_structured.py +0 -0
  303. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_confidence.py +0 -0
  304. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_corpus_smoke.py +0 -0
  305. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_d5_normalization_audit.py +0 -0
  306. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_edge_cases.py +0 -0
  307. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_elsevier_footer_strip_real_pdf.py +0 -0
  308. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_equation_page_header_strip_real_pdf.py +0 -0
  309. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_extract_columns.py +0 -0
  310. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_extract_docx.py +0 -0
  311. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_extract_filter_sugar.py +0 -0
  312. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_extract_html.py +0 -0
  313. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_extract_layout.py +0 -0
  314. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_extract_pdf_structured.py +0 -0
  315. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_extraction.py +0 -0
  316. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_f0_table_region_aware.py +0 -0
  317. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_fffd_comparison_recovery_real_pdf.py +0 -0
  318. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_figure_caption_trim_real_pdf.py +0 -0
  319. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_figure_detect.py +0 -0
  320. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_fixtures_manifest.py +0 -0
  321. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_hallucinated_heading_continuation_guard.py +0 -0
  322. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_harness_text_loss_reflow.py +0 -0
  323. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_jama_open_cluster_real_pdf.py +0 -0
  324. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_lattice_cluster.py +0 -0
  325. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_letterspaced_label_real_pdf.py +0 -0
  326. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_ligature_decomposition_real_pdf.py +0 -0
  327. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_lt_operator_recovery_real_pdf.py +0 -0
  328. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_mathitalic_greek_real_pdf.py +0 -0
  329. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_metaesci_followups.py +0 -0
  330. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_minus_sign_recovery_real_pdf.py +0 -0
  331. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_normalization.py +0 -0
  332. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_normalize_f0_footnote_strip.py +0 -0
  333. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_normalize_idempotent_real_pdf.py +0 -0
  334. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_normalize_layout_param.py +0 -0
  335. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_normalize_metadata_leak_real_pdf.py +0 -0
  336. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_normalize_report_layout_fields.py +0 -0
  337. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_normalize_soft_hyphen_dehyphenation.py +0 -0
  338. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_normalize_v18_strips.py +0 -0
  339. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_numbered_heading_promotion_real_pdf.py +0 -0
  340. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_numbered_section_promotion_real_pdf.py +0 -0
  341. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_o5_reference_inversion_real_pdf.py +0 -0
  342. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_orphan_multilevel_number_real_pdf.py +0 -0
  343. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_orphan_section_number_real_pdf.py +0 -0
  344. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_p0r_recurring_running_header_strip.py +0 -0
  345. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_preserve_math_glyphs_real_pdf.py +0 -0
  346. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_pretest_capture_tokens.py +0 -0
  347. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_pua_glyph_recovery_real_pdf.py +0 -0
  348. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_quality.py +0 -0
  349. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_r1_whitespace_cells_wiring_real_pdf.py +0 -0
  350. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_r4_column_correction_real_pdf.py +0 -0
  351. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_rc1_general_column_correction_real_pdf.py +0 -0
  352. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_render.py +0 -0
  353. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_render_frontmatter_masthead.py +0 -0
  354. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_render_html.py +0 -0
  355. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_render_subsection_chain_promotion.py +0 -0
  356. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_request_09_reference_normalization.py +0 -0
  357. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_residual_2026_05_23_bundled.py +0 -0
  358. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_roman_numeral_section_promote_real_pdf.py +0 -0
  359. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_section_row_label_no_merge_real_pdf.py +0 -0
  360. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_sections_boundaries.py +0 -0
  361. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_sections_boundary_truncation.py +0 -0
  362. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_sections_core_partition.py +0 -0
  363. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_sections_docx_annotator.py +0 -0
  364. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_sections_extract_text.py +0 -0
  365. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_sections_footnote_section.py +0 -0
  366. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_sections_golden.py +0 -0
  367. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_sections_html_annotator.py +0 -0
  368. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_sections_pdf_annotator.py +0 -0
  369. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_sections_public_api.py +0 -0
  370. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_sections_real_corpus.py +0 -0
  371. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_sections_taxonomy.py +0 -0
  372. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_sections_text_annotator.py +0 -0
  373. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_sections_types.py +0 -0
  374. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_sections_unit_corpus.py +0 -0
  375. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_sections_v161_coalesce.py +0 -0
  376. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_sections_v161_subheadings.py +0 -0
  377. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_sections_v161_taxonomy.py +0 -0
  378. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_sections_v161_text_annotator.py +0 -0
  379. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_sections_version.py +0 -0
  380. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_smoke_fixtures.py +0 -0
  381. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_structured_result_type.py +0 -0
  382. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_structured_types.py +0 -0
  383. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_structured_version.py +0 -0
  384. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_table_caption_cell_region_real_pdf.py +0 -0
  385. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_table_detect.py +0 -0
  386. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_tables_cell_cleaning.py +0 -0
  387. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_tables_flatten.py +0 -0
  388. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_text_mode.py +0 -0
  389. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_v23_1_fixes.py +0 -0
  390. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_v23_bug_fixes.py +0 -0
  391. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_v23_post_corpus.py +0 -0
  392. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_v23_post_corpus_v2.py +0 -0
  393. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_v2_backwards_compat.py +0 -0
  394. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_v2_top_level_exports.py +0 -0
  395. {docpluck-2.4.83 → docpluck-2.4.85}/tests/test_whitespace_cluster.py +0 -0
  396. {docpluck-2.4.83 → docpluck-2.4.85}/tools/render_for_audit.py +0 -0
@@ -267,26 +267,43 @@ curl -s -o /dev/null -w "%{http_code}" https://docpluck.app/login
267
267
  ```
268
268
  Must return 200.
269
269
 
270
- ### 3. Railway Service Health
270
+ ### 3. Railway Service Health + deployed-version gate (AUTHORITATIVE)
271
+
272
+ `/health` is public and reports the docpluck version the live service actually imported — this is the authoritative "the new library is deployed" gate (the auth-gated `/extract` smoke in check 4 cannot run without a `dp_` API key). A clean boot here also proves docpluck imports without error at the new version.
273
+
271
274
  ```bash
272
- curl -s https://extraction-service-production-d0e5.up.railway.app/health
275
+ LIB_VERSION=$(grep '^__version__' C:/Users/filin/Dropbox/Vibe/MetaScienceTools/docpluck/docpluck/__init__.py | grep -oE '[0-9]+\.[0-9]+\.[0-9]+')
276
+ curl -s --max-time 25 https://extraction-service-production-d0e5.up.railway.app/health | python -c "
277
+ import sys, json
278
+ d = json.load(sys.stdin)
279
+ print('health:', d.get('status'), '| docpluck:', d.get('docpluck_version'), '| db:', d.get('database'))
280
+ assert d.get('status') == 'ok', 'service not ok'
281
+ assert d.get('docpluck_version') == '$LIB_VERSION', f\"deployed docpluck {d.get('docpluck_version')} != released $LIB_VERSION\"
282
+ print('Health + version gate: PASS')
283
+ "
273
284
  ```
274
- Must return `{"status":"ok",...}`.
285
+ **Gate:** `status == ok` AND `docpluck_version == <released version>` (both asserted by the script above) AND `database == connected` (printed by the script but not asserted — confirm it in the output). A version mismatch means Railway has not finished redeploying on the new pin — wait and re-check; do not declare the deploy done.
286
+
287
+ ### 4. Authenticated extraction smoke (OPTIONAL — requires a `dp_` API key)
288
+
289
+ `/extract` is auth-gated: an unauthenticated POST returns `401 {"detail":"Missing or invalid API key. Use: Authorization: Bearer dp_..."}`. So this is a real end-to-end extraction check ONLY when a key is available (e.g. `DP_SMOKE_API_KEY` in the env). The check-3 health/version gate is the authoritative deployed-version proof; this is extra confidence on the extraction path. Skip (mark INCONCLUSIVE, not FAIL) when no key is present.
275
290
 
276
- ### 4. Smoke Test (if service is live)
277
291
  ```bash
278
- # Test extraction endpoint directly
279
- curl -s -X POST https://extraction-service-production-d0e5.up.railway.app/extract \
280
- -F "file=@test-pdfs/apa/chan_feldman_2025_cogemo.pdf" | python -c "
292
+ if [ -n "$DP_SMOKE_API_KEY" ]; then
293
+ curl -s --max-time 90 -X POST https://extraction-service-production-d0e5.up.railway.app/extract \
294
+ -H "Authorization: Bearer $DP_SMOKE_API_KEY" \
295
+ -F "file=@test-pdfs/apa/chan_feldman_2025_cogemo.pdf" | python -c "
281
296
  import sys, json
282
297
  data = json.load(sys.stdin)
283
- print(f'Engine: {data[\"metadata\"][\"engine\"]}')
284
- print(f'Chars: {data[\"metadata\"][\"chars\"]}')
285
- print(f'Quality: {data[\"quality\"][\"score\"]}')
286
- assert data['metadata']['chars'] > 10000, 'Too few chars'
287
- assert data['quality']['score'] >= 80, 'Quality too low'
288
- print('Smoke test: PASS')
298
+ m, q = data.get('metadata', {}), data.get('quality', {})
299
+ print('engine:', m.get('engine'), '| chars:', m.get('chars'), '| quality:', q.get('score'))
300
+ assert (m.get('chars') or 0) > 10000, 'Too few chars'
301
+ assert (q.get('score') or 0) >= 80, 'Quality too low'
302
+ print('Authenticated smoke test: PASS')
289
303
  "
304
+ else
305
+ echo 'DP_SMOKE_API_KEY not set — authenticated /extract smoke SKIPPED (INCONCLUSIVE). Health+version gate (check 3) is authoritative.'
306
+ fi
290
307
  ```
291
308
 
292
309
  ### 5. Daily-Digest Dry-Run Smoke (post-deploy, CRITICAL)
@@ -1123,3 +1123,54 @@ Add a permanent corpus test `tests/test_column_splice_preserves_raw_multiset.py`
1123
1123
 
1124
1124
  ### Edge cases
1125
1125
  - Hyphenated initials (`M.-J. O’Brien et al.`) need `[A-Z]\.[-\s]*` (not `[A-Z]\.\s*`) in the initial group, else they slip the shape. Curly apostrophe `’` must be in the surname char-class.
1126
+
1127
+ ---
1128
+
1129
+ ## 2026-06-12 — CitationGuard handoff 2026-06-10 (4 items): 1 fix + 3 won't-fix
1130
+
1131
+ **Target:** verify+fix the 4 text-extraction defects filed in
1132
+ `CitationGuard/docs/DOCPLUCK_HANDOFF_2026-06-10.md`. Shipped v2.4.84
1133
+ (NORMALIZATION_VERSION 1.9.31).
1134
+
1135
+ **Methodology lesson (the load-bearing one): VERIFY THE LAYER before
1136
+ dispositioning a consumer-filed "extraction defect".** A CitationGuard handoff
1137
+ calls everything a "docpluck text-extraction defect", but the layer differs and
1138
+ determines whether it's ours:
1139
+ - Reproduce each defect at HEAD against the *exact view the consumer reads*
1140
+ (CitationGuard consumes `/extract?normalize=academic` → `normalize_text(raw,
1141
+ academic)`), AND against *raw pdftotext*.
1142
+ - If the token is **present in raw pdftotext but absent in academic-normalized →
1143
+ it's a DOCPLUCK normalize bug** (item 4: "of 3 instruments" → R2 deleted "3").
1144
+ - If the token is **already corrupted in raw pdftotext → check pdfplumber too**.
1145
+ Corrupted identically in BOTH MIT extractors ⇒ PDF-embedded-text-layer loss,
1146
+ no in-text signal, no general fix, tool-swap forbidden ⇒ won't-fix (items 1–3).
1147
+ Do NOT assume "filed by citationguard ⇒ pdftotext-layer won't-fix" — that
1148
+ generalization (memory `project_citationguard_extraction_defects_wontfix`) is
1149
+ true for SOME items, but item 4 was squarely ours. The 5-min reproduce-at-HEAD
1150
+ step (top preflight lesson) is what separated them.
1151
+
1152
+ **The fix (general, not whack-a-mole):** R2 (references page-number scrub) was
1153
+ deleting any digit whose value appears as a standalone page-number line when it
1154
+ sat between two lowercase words — so the quantifier "3 instruments" (page 3 of
1155
+ the PDF) was stripped. The pre-existing guard was an *open-ended noun allowlist*
1156
+ (`years|participants|…`) that v2.4.17 had been extending one noun at a time.
1157
+ Replaced the strategy with a **closed-class signal**: a page leak follows a
1158
+ CONTENT word ("psychological 41 science"); a quantifier follows a FUNCTION word
1159
+ ("of 3 instruments"). Closed classes generalize where open allowlists can't.
1160
+ Kept the guard **purely additive** (only preserves, never strips more) so it
1161
+ cannot regress the must-strip cases — the safe direction given rule 0a.
1162
+
1163
+ **Verification without the harness:** camelot + `claude` CLI were both absent on
1164
+ this machine, so the full Phase-5H harness / canary-audit couldn't run. The
1165
+ substitute that actually proves a normalize-only, additive change is a
1166
+ **deterministic 101-PDF old-vs-new academic-normalized diff**: stash the change,
1167
+ regenerate all outputs, `diff` per paper, and READ every changed line. Result: 5
1168
+ papers changed, all correct restorations, 0 regressions — and it surfaced 4
1169
+ papers with *pre-existing* silent digit-drops the fix also repaired (amp_1, bmc_med_3,
1170
+ ieee_access_5 ×2, maier_2023). This diff IS the canary evidence; recorded it as
1171
+ the `SKIP_CANARY=1` justification when the `claude`-less pre-commit hook hard-failed.
1172
+
1173
+ **Process miss to fix next time:** no working docpluck venv on this machine
1174
+ (pdfplumber/camelot/pytest all had to be pip-installed mid-run into C:\Python314).
1175
+ Stand up a proper env before an iterate run so the harness + camelot table tests
1176
+ aren't skipped.
@@ -22,7 +22,14 @@ jobs:
22
22
  python-version: ${{ matrix.python-version }}
23
23
 
24
24
  - name: Install poppler-utils (for pdftotext)
25
- run: sudo apt-get install -y poppler-utils
25
+ # `apt-get update` first: the runner image ships a stale package list
26
+ # that pins a poppler point-release (e.g. 24.02.0-1ubuntu9.8) which the
27
+ # mirror has since superseded, so a bare `install` 404s on the old .deb.
28
+ # Refreshing the lists pulls the current version. See the v2.4.84 CI
29
+ # flake (2026-06-12): "Failed to fetch libpoppler134 … 404 Not Found".
30
+ run: |
31
+ sudo apt-get update
32
+ sudo apt-get install -y poppler-utils
26
33
 
27
34
  - name: Install dependencies
28
35
  run: pip install -e ".[dev]"
@@ -1,5 +1,40 @@
1
1
  # Changelog
2
2
 
3
+ ## [2.4.85] — 2026-06-12
4
+
5
+ **Harvard name-year reference splitting (D1) + page-break reference stitch & category-label running-header strip (D2).** `NORMALIZATION_VERSION` 1.9.31 → 1.9.32.
6
+
7
+ Surfaced by `CitationGuard/docs/DOCPLUCK_HANDOFF_2026-06-12.md`.
8
+
9
+ **D1 — HIGH IMPACT — Harvard / Cambridge bibliographies collapsed into one paragraph.** R3 (the references-span continuation join) keeps each reference on its own logical line by breaking whenever a line *looks like a reference start*. That detection (`_looks_like_ref_start`) recognised Vancouver (`12. Surname`), IEEE (`[12] Surname`), and APA (`Surname, A.` — note the comma after the surname) — but **not** the Harvard / Cambridge name-year form `Surname A and Surname B (2020) …`, which has *no* comma between surname and initials. So on `bjps_1` (British Journal of Political Science, 109 entries) R3 treated every entry boundary as a mid-entry wrap and joined the **entire** reference section onto a single line. Downstream this made the consumer (citelink) parse 9 of 109 references (refs.f1 0.051).
10
+
11
+ The fix adds `_REF_START_HARVARD`, keyed on the structural signature *author-block + parenthesised 4-digit year*: a Title-case surname (Latin-Extended, so `Häusermann` / `Tuğal` qualify; hyphenated and compound surnames and `van der` / `Van der` particles handled) + 1–4 initials (spaced `R J`, glued `DH`, hyphenated `H-G`, or period `R.`), optionally chained with ` and `/`&`/comma or `et al.` (incl. `Surname K, et al.`), an optional `(eds)` editor marker, terminated by `(YYYY)`. The parenthesised year is the strong anchor that keeps mid-entry wrap lines (`American Journal of\nPolitical Science 64, 904-20.`) from matching. The same signature is added to `_find_references_spans` so a *pure*-Harvard bibliography (no numbered/IEEE/APA entries at all) is still detected as a references span and R3 runs. Result on `bjps_1`: 109 entries split one-per-line, **zero** entries collapsed; across the 8-paper BJPS Harvard corpus + 9 Nature papers, ≥95% of entries split correctly (was: every Harvard bibliography fully collapsed).
12
+
13
+ **Same-class fix surfaced by the D1 broad-read — APA ref-start missed accented / particle / compound surnames.** `_REF_START_APA` used an ASCII-only `[A-Z][a-z]+` surname, so `Yücel, M.`, `de Kovel, C.`, and `Karlsson Linnér, R.` were not recognised as entry starts and merged into the preceding reference (nat_comms_5: 10 collapsed pairs; nathumbeh_2: several). It now reuses the shared Latin-Extended surname block (with particles and an optional compound second word), keeping the comma-after-surname APA discriminator. Result: nat_comms_5 collapsed-pairs 10 → 1, nathumbeh_2 6 → 3 (the 3 remaining are supplementary-section prose, not reference merges). The change is a strict widening of a tight `Surname, Initial.` signature — journal-tail continuation lines (`Developmental psychology, 50(12), …`) still do not match.
14
+
15
+ **D2 — running-header injected mid-reference + page-break-orphaned year.** On `nat_comms_2` (Nature Communications) ref 34 straddles a page break; pdftotext emits `…EAE based on␊␊␌Article␊histology, … (2008).`. The category label `Article` (Nature prints it atop every page) survived — `_looks_like_running_header_or_footer` recognised multi-word banners and author-pair headers but not a bare single-word category label — and welded into the entry, while the page-break blank line left the entry's `(2008)` year in a detached paragraph (citelink parsed ref 34 with an empty year). Two coordinated fixes:
16
+
17
+ 1. `_CATEGORY_LABEL_HEADER` (new shape in `_looks_like_running_header_or_footer`) recognises the publisher article-type labels H0 already curates (`Article`, `Review`, `Letter`, `Matters Arising`, `Original Investigation`, …). Combined with P0r's existing ≥3-standalone-repetition guard this strips the recurring label wherever it appears (including mid-references), so a one-off body occurrence is never touched. Bare common words (`Research`, `Comment`) are excluded to avoid colliding with section headings.
18
+ 2. R3 now **bridges a page-break blank line** when the current entry is syntactically *incomplete* (does not end in sentence-terminal punctuation) — a page-break split leaves the head mid-clause (`…EAE based on`), so the tail rejoins; a *completed* entry (`…46, 215-39.`) followed by a blank is the end of the list, so a post-reference trailer (`Cite this article: …`) is **not** absorbed. A form-feed stitch (`\f` → newline inside the span) handles the case where the page header survives upstream.
19
+
20
+ Result on `nat_comms_2`: ref 34 is a single logical line carrying its `(2008)` year, with no `Article` injection.
21
+
22
+ New regression tests in `tests/test_harvard_refs_pagebreak_stitch.py`: Harvard ref-start positives (glued/hyphenated/accented/compound/particle surnames) and continuation-line negatives; the bjps_1 one-per-line and nat_comms_2 year-recovery defects at the text level; the page-break stitch + header strip; the trailer-not-absorbed guard; and real-PDF manifest-with-skip assertions on both fixtures.
23
+
24
+ ## [2.4.84] — 2026-06-12
25
+
26
+ **R2 page-number scrub — general quantifier-head guard (stop deleting digits from reference titles).** `NORMALIZATION_VERSION` 1.9.30 → 1.9.31.
27
+
28
+ Surfaced by `CitationGuard/docs/DOCPLUCK_HANDOFF_2026-06-10.md` (item 4): the `plos_med_1` reference "Clinimetric properties of **3** instruments measuring postoperative recovery…" rendered as "…properties of instruments measuring…" — the standalone "3" was silently deleted. Same class as the earlier Mayiwar "3"-drop. Root cause: R2 (the references-span page-number scrub) strips any digit whose value also appears as a standalone page-number line, when it sits between two lowercase words. "3" is page 3 of the PDF, so the legitimate quantifier "3 instruments" was stripped. The v2.4.17 body-noun allowlist (`years|participants|…`) is necessarily incomplete and never enumerated "instruments" — adding nouns one at a time is whack-a-mole.
29
+
30
+ The fix keys on a **closed class** instead: a genuine page-number leak ("psychological **41** science", "recovery **12** in a population") follows a *content* word, whereas a legitimate quantifier ("of **3** instruments", "the **5** factors", "and **3** in Vastfall") follows a *function* word (article / preposition / determiner — a finite closed set). `_r2_is_body_phrase` now also preserves a digit when the immediately-preceding token is in `_R2_QUANTIFIER_HEAD_WORDS`. The guard is purely **additive** — it only ever *preserves* a digit, never strips one — so it cannot make R2 delete anything it didn't already, honoring the correctness asymmetry (silently dropping a digit from a reference title, rule 0a, is far worse than leaving a stray page number). The body-noun list also gained the obvious research-countable nouns (`instruments|measures|scales|factors|experiments|datasets|tasks|…`) as belt-and-suspenders.
31
+
32
+ Corpus audit (101 PDFs, academic-normalized, old vs new): **5 papers changed, every change a correct restoration, zero regressions** — `plos_med_1` ("of 3 instruments"), `amp_1` ("sizes of 12 academic search engines"), `bmc_med_3` ("on 5 types of"), `ieee_access_5` ("kernel size to 3 x 3" + "for 30 iterations"), `maier_2023_collabra` ("Experiments 1a and 3 in Vastfall"). The fix repaired the filed defect **plus 4 pre-existing silent corruptions**. The 11 remaining R2 strips (all preceded by content words) are genuine page-number leaks, correctly still stripped.
33
+
34
+ New regression tests in `tests/test_normalize_a3_r2_body_integer_real_pdf.py`: quantifier-head helper positives (`of 3 instruments`, `the 5 factors`), content-word-leak negative (`psychological 41 science` still strips), and a real-PDF assertion on `plos_med_1` (`of 3 instruments` present, `of instruments measuring` absent).
35
+
36
+ > **Handoff items 1–3 (chen_2021_jesp) — won't-fix, PDF-text-layer corruption.** The same handoff filed three other defects on `chen_2021_jesp`: the lost "Registered reports: " title prefix (Nosek & Lakens), the "TurkPrime.com" → "TurkPrime. Com" space injection (Litman et al.), and the "Open Science Collaboration" → "Open, S. C." + stray "Psychology." mangle (OSC 2015). Verified that **both** MIT extractors — pdftotext (text channel) and pdfplumber (layout channel) — produce byte-identical corruption on all three. When two independent extractors read the same bytes, the corruption lives in the PDF's embedded text layer, not in any tool's reading order, and there is no in-text signal (the references read as clean and complete) to trigger a conditional fallback. No general structural signature exists to detect or repair them, and a tool swap is both forbidden (CLAUDE.md L-001 / AGPL ban) and would not help. These remain consumer-side / unrecoverable without OCR or AI re-read — the established `project_citationguard_extraction_defects_wontfix` class.
37
+
3
38
  ## [2.4.83] — 2026-06-08
4
39
 
5
40
  **P0r — bare "`<Initials> <Surname> et al.`" running-header strip.** `NORMALIZATION_VERSION` 1.9.29 → 1.9.30. Keyed on the `Initial. Surname et al.` line shape + the existing ≥3-standalone-repetition guard, never paper identity.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docpluck
3
- Version: 2.4.83
3
+ Version: 2.4.85
4
4
  Summary: PDF, DOCX, and HTML text extraction and normalization for academic papers
5
5
  Project-URL: Homepage, https://docpluck.app
6
6
  Project-URL: Documentation, https://docpluck.app/api-docs
@@ -146,6 +146,7 @@ Add only when a real downstream consumer asks for one. YAGNI until then.
146
146
  - [ ] **`### Reasons for change`** (ip_feldman) — Table 5 column header promoted to heading; needs table-region awareness (the body-coherence guard doesn't catch it because its body starts capitalized). RCA: rank-3 in the 2026-06-06 run-11 RCA.
147
147
  - [ ] **`## Data Availability` end-matter absent** — RCA CORRECTED the run-11 "demoter over-strip" premise: the section never enters the text channel (pdftotext drops the title-page box). Needs cross-channel (pdfplumber) recovery, same architecture class as B7. NOT a demoter exception.
148
148
  - [ ] **Glyph `Västfjäll`→`Vastfall`** (ar_apa/collabra, citationguard Defect 2) — baked pdftotext CID-font mis-map; needs a same-document surname-consensus normalizer (new subsystem). **Product/architecture decision on scope.**
149
+ - [x] **Baked-glyph DIGIT misread `M_age 59.3`→`39.3`** (collabra.77859, surfaced 2026-06-08 RC-1 AI-verify) — **DIAGNOSED + DECISION MADE (2026-06-08): document as known limitation, no code change.** Same class as `Västfjäll` but a DIGIT in a statistic (silent stat corruption, the most dangerous form for meta-science): the PDF *visually* shows `59.3` but the embedded text codepoint is baked as `3`, and **both pdftotext AND pdfplumber faithfully extract `39.3`** (confirmed by visual PDF read + dual-extractor diff). No text-channel logic can recover it; the only fixes are OCR/multimodal-glyph-consensus (a new subsystem the user explicitly **declined** to scope this session). **Consumer note: CitationGuard / downstream stat-checkers must assume baked digit/letter misreads exist in source PDFs and apply their own cross-source (CrossRef/visual) verification — docpluck cannot guarantee a digit matches the visual glyph when the publisher baked the wrong codepoint.**
149
150
  - [ ] **org-author `Open Science Collaboration`→`Open, S. C.`** — baked into the PDF's embedded text by the publisher (identical in pdftotext AND pdfplumber); no safe general docpluck fix. **Routed to CitationGuard** (DOI/CrossRef author reconciliation).
150
151
  - [ ] **Tag v2.4.78** once the full canary set clears (currently 5 open). Then bump `PDFextractor/service/requirements.txt` pin + run `/docpluck-deploy`.
151
152
 
@@ -165,3 +166,17 @@ Add only when a real downstream consumer asks for one. YAGNI until then.
165
166
  - [ ] **Authenticated prod functional smoke** on chen via Railway /extract (needs a dp_ API key) — confirm O5 ordered-refs end-to-end in prod. Version (2.4.80) + health verified; the authenticated extract was not run.
166
167
  - [ ] **CitationGuard follow-ups** (their repo, documented in docs/DOCPLUCK_HANDOFF_2026-06-07.md): regenerate chen+jamison fixtures from docpluck v2.4.80 academic + re-score; extend citelink's number-ending-host special-case to single COVID-1928. Superscript recovery: WON'T-FIX in docpluck (would regress citelink — tested).
167
168
  - [ ] **ip_feldman interwoven table+prose case** (B4/#3/#4, R4/#5): still open; needs the per-y-band region-aware architecture (tracked in docs/superpowers/specs/2026-06-07-ip_feldman-B4-R4-column-interleave-diagnosis.md). O5 only handled the separable banded case (chen/jamison).
169
+
170
+ ## 2026-06-08 — v2.4.83 landed (footnote_texts API + extract_pdf_layout export + render label de-dup; committed fb0d595, PUSHED to origin/main, NOT tagged)
171
+
172
+ > **Source:** ScienceArena arena-report verification session. The 2026-06-07 arena report (`ScienceArena/docs/reports/2026-06-07-docpluck-arena-issues.md`) blamed docpluck, but verification found the top findings (footnotes, body fidelity) were **ScienceArena adapter bugs**, not docpluck defects — full verdict in `ScienceArena/docs/reports/2026-06-08-docpluck-arena-issues-verified-response.md`. docpluck got 3 additive fixes (`report.footnote_texts`, top-level `extract_pdf_layout` export, render `### Table N` + `*Table N. …*` de-dup) + a load-aware fix to the DOCX/HTML perf tests (flaked the -n10 gate). Full suite **1906 passed**; the canary block on ip_feldman was a **proven false positive** (deterministic render diff: change touched only the 10 formal `### Table N` caption lines) — the 6 "new" findings were the case-normalization ledger bug (line ~165) re-flagging pre-existing B3/B4/column-interleave backlog → landed with `SKIP_CANARY=1` + proof in the commit message.
173
+
174
+ ### Release follow-ups (only when v2.4.83 is tagged)
175
+
176
+ - [ ] **Bump `PDFextractor/service/requirements.txt`** git pin to `@v2.4.83` + update `PDFextractor/API.md` frozen-version examples. Production silently runs the old library until this lands (the `/docpluck-deploy` pre-flight check 4 enforces it).
177
+ - [ ] **Run `/docpluck-deploy`** to ship 2.4.83 to prod after the pin bump + tag.
178
+ - [ ] **Tag v2.4.83** (with the unreleased v2.4.81/82 on this RC branch) once the canary set clears. The strict no-exceptions canary gate fires on **tag** pushes, so the finding-key case-normalization bug (line ~165) must be fixed first or the tag canary will false-block.
179
+
180
+ ### ScienceArena adapter (their repo — fixed this session, verify there)
181
+
182
+ - [ ] **Re-run the ScienceArena benchmark** with docpluck ≥ 2.4.83 installed in ITS venv. The adapter fixes (commit `de35f4a` on sciencearena `main`: pass `layout=`, read `report.footnote_texts`, strip the caption label) are logic-verified against local docpluck but NOT run end-to-end there (docpluck isn't installed in that repo). Also recommend ranking `docpluck-standard` as the primary real-document variant (Greek preserved).
@@ -78,7 +78,7 @@ from .figures import Figure
78
78
  from .extract_structured import TABLE_EXTRACTION_VERSION, StructuredResult, extract_pdf_structured
79
79
  from .render import render_pdf_to_markdown
80
80
 
81
- __version__ = "2.4.83"
81
+ __version__ = "2.4.85"
82
82
  __author__ = "Gilad Feldman"
83
83
  __license__ = "MIT"
84
84
 
@@ -23,7 +23,7 @@ class NormalizationLevel(str, Enum):
23
23
  academic = "academic"
24
24
 
25
25
 
26
- NORMALIZATION_VERSION = "1.9.30"
26
+ NORMALIZATION_VERSION = "1.9.32"
27
27
 
28
28
 
29
29
  # ── Mathematical Alphanumeric Symbols de-styling (shared, v2.4.34) ──────────
@@ -238,7 +238,59 @@ _REFS_END = re.compile(
238
238
  )
239
239
  _REF_START_VANCOUVER = re.compile(r"^\d{1,3}\.\s+[A-Z]")
240
240
  _REF_START_IEEE = re.compile(r"^\[\d+\]\s+[A-Z]")
241
- _REF_START_APA = re.compile(r"^[A-Z][a-z]+(?:-[A-Z][a-z]+)?,\s+[A-Z]\.")
241
+ # Harvard / Cambridge name-year reference entry start (D1, citationguard-iterate
242
+ # 2026-06-12): "Surname A and Surname B (2020) …", "Surname A et al. (2020) …",
243
+ # "Surname A, Surname B and Surname C (2019) …". Distinct from APA, which puts a
244
+ # comma immediately after surname-1 ("Surname, A.") — the Harvard form has NO
245
+ # comma between surname and initials, so _REF_START_APA never matched it and R3
246
+ # collapsed the whole bibliography onto one line (British Journal of Political
247
+ # Science bjps_1: 109 entries joined into a single paragraph). The structural
248
+ # signature is an author block — Surname + 1–4 groups of bare initials, optionally chained
249
+ # with " and "/"&"/comma — terminated by a parenthesised 4-digit year. The
250
+ # parenthesised year is the strong anchor that keeps mid-entry wrap lines
251
+ # ("American Journal of\nPolitical Science 64, 904-20.") from matching.
252
+ # Initials: 1-4 groups, each 1-3 capitals, optionally hyphenated ("H-G", "Z-C")
253
+ # and optionally period-terminated; groups may be spaced ("R J") or glued ("DH").
254
+ _HARVARD_INITIALS = r"(?:\s+[A-Z]{1,3}(?:-[A-Z]{1,3})?\.?){1,4}"
255
+ # Surname: Title-case word (Latin-Extended so "Häusermann", "Öhman" qualify),
256
+ # optionally hyphenated ("Huntington-Klein") and optionally a second word for
257
+ # compound surnames ("Santos Silva", "El Soufi"), with leading lowercase
258
+ # particles ("van der", "de la") permitted.
259
+ # Letter ranges span ASCII + Latin-1 Supplement + Latin Extended-A/B (U+0100-
260
+ # U+024F) so Eastern-European / Turkish surnames ("Häusermann", "Tuğal", "Öhman")
261
+ # qualify. First letter is an uppercase-ish Latin letter; the rest may be any
262
+ # Latin letter or apostrophe.
263
+ _HARVARD_NAME_WORD = (
264
+ r"[A-ZÀ-ÞĀ-ɏ][A-Za-zÀ-ÿĀ-ɏ'’]+(?:-[A-ZÀ-ÞĀ-ɏ][A-Za-zÀ-ÿĀ-ɏ'’]+)?"
265
+ )
266
+ _HARVARD_SURNAME = (
267
+ # Leading particles, capitalised or not ("van der", "de la", "Van der Brug",
268
+ # "De Vries") — the first particle is often title-cased at an entry start.
269
+ r"(?:(?:[Vv]an|[Vv]on|[Dd]e|[Dd]er|[Dd]en|[Dd]i|[Dd]el|[Dd]ella|[Dd]u"
270
+ r"|[Ll]a|[Ll]e|[Ee]l|[Dd]os|[Dd]a)\s+){0,2}"
271
+ + _HARVARD_NAME_WORD
272
+ + r"(?:\s+" + _HARVARD_NAME_WORD + r")?"
273
+ )
274
+ _HARVARD_AUTHOR = _HARVARD_SURNAME + _HARVARD_INITIALS
275
+ _REF_START_HARVARD = re.compile(
276
+ r"^" + _HARVARD_AUTHOR +
277
+ r"(?:"
278
+ r",?\s+et\s+al\.?" # "et al." / "Surname K, et al."
279
+ r"|,?\s+(?:and|&)\s+" + _HARVARD_AUTHOR + # " and Surname I"
280
+ r"|,\s+" + _HARVARD_AUTHOR + # ", Surname I"
281
+ r")*"
282
+ r"(?:\s+\((?:eds?|editors?)\.?\))?" # optional "(eds)" / "(ed.)" marker
283
+ r"\s+\((?:1[89]|20)\d{2}[a-z]?\)", # (Year) optional letter suffix
284
+ re.UNICODE,
285
+ )
286
+ # APA reference entry start: "Surname, A.", "Surname, A. B.". Reuses the shared
287
+ # surname block so accented ("Yücel, M."), particle ("de Kovel, C."), and
288
+ # compound ("Karlsson Linnér, R.") surnames are recognised — the previous
289
+ # ASCII-only `[A-Z][a-z]+` form silently merged those entries into the preceding
290
+ # reference (surfaced on nat_comms_5 / nathumbeh_2 during the D1 broad-read;
291
+ # same root-cause class as D1). The comma-then-initial is the APA discriminator
292
+ # (Harvard has no comma after the surname).
293
+ _REF_START_APA = re.compile(r"^" + _HARVARD_SURNAME + r",\s+[A-Z]\.", re.UNICODE)
242
294
 
243
295
 
244
296
  def _find_references_spans(text: str) -> list[tuple[int, int]]:
@@ -258,7 +310,17 @@ def _find_references_spans(text: str) -> list[tuple[int, int]]:
258
310
  ref_starts = (
259
311
  len(re.findall(r"\b\d{1,3}\.\s+[A-Z]", window))
260
312
  + len(re.findall(r"\n\[\d+\]\s+[A-Z]", window))
261
- + len(re.findall(r"\n[A-Z][a-z]+(?:-[A-Z][a-z]+)?,\s+[A-Z]\.", window))
313
+ + len(re.findall(r"\n" + _HARVARD_SURNAME + r",\s+[A-Z]\.", window))
314
+ # Harvard name-year entries (D1): "\nSurname A(?: and …| et al.)? (YYYY)".
315
+ # Without this a pure-Harvard bibliography (no comma after surname,
316
+ # no numbered/IEEE entries) would score 0 ref-starts and the span
317
+ # would go undetected, so R3 would never run to keep entries split.
318
+ + len(re.findall(
319
+ r"\n" + _HARVARD_AUTHOR +
320
+ r"(?:\s+et\s+al\.?|,?\s+(?:and|&)\s+" + _HARVARD_AUTHOR +
321
+ r")*\s+\((?:1[89]|20)\d{2}[a-z]?\)",
322
+ window,
323
+ ))
262
324
  )
263
325
  if ref_starts >= 3:
264
326
  end_m = _REFS_END.search(text, start)
@@ -469,23 +531,94 @@ _R2_BODY_NOUN_PATTERN = re.compile(
469
531
  r"people|persons?|adults?|children|students?|patients?|workers?|"
470
532
  r"employees?|managers?|leaders?|followers?|users?|members?|"
471
533
  r"votes?|comments?|ratings?|reviews?|posts?|tweets?|messages?|"
472
- r"items?|conditions?|variables?|categories?|topics?|themes?)\b",
534
+ r"items?|conditions?|variables?|categories?|topics?|themes?|"
535
+ r"instruments?|measures?|scales?|factors?|dimensions?|domains?|"
536
+ r"experiments?|datasets?|samples?|tasks?|stimuli|questions?)\b",
473
537
  re.IGNORECASE,
474
538
  )
475
539
 
540
+ # v2.4.84 (NORMALIZATION_VERSION 1.9.31): R2 quantifier-head pre-context guard.
541
+ #
542
+ # The body-noun allowlist above is necessarily incomplete — it can never
543
+ # enumerate every countable noun a reference title might quantify ("3
544
+ # instruments", "5 trajectories", "12 heuristics", …). The amle_1 fix
545
+ # (v2.4.17) added nouns one at a time; the plos_med_1 "Clinimetric properties
546
+ # of 3 instruments" → "… of instruments" drop (filed by citationguard-iterate
547
+ # 2026-06-10, same class as the earlier Mayiwar case) is the same whack-a-mole
548
+ # recurring.
549
+ #
550
+ # A genuine page-number leak and a legitimate quantifier are distinguished by
551
+ # the word IMMEDIATELY PRECEDING the digit, not the noun after it:
552
+ # * quantifier — the digit heads a noun phrase, so it follows a CLOSED-CLASS
553
+ # function word (article / preposition / determiner): "of 3 instruments",
554
+ # "the 5 factors", "first 20 years", "only 3 studies".
555
+ # * page leak — the digit interrupts a content phrase, so it follows a
556
+ # CONTENT word (adjective / noun): "psychological 41 science",
557
+ # "recovery 12 in a population".
558
+ # Function words are a finite closed class, so keying on them generalizes where
559
+ # the open-ended noun list cannot. This guard is purely ADDITIVE — it only
560
+ # ever PRESERVES a digit (returns True), never strips one — so it cannot make
561
+ # R2 strip anything it did not already strip (the only new error it can introduce
562
+ # is retaining a stray page number — the safe direction). Per docpluck's correctness
563
+ # asymmetry, silently deleting a digit from a scientific reference title (rule
564
+ # 0a, NO TEXT MAY DISAPPEAR) is far worse than leaving a stray page number, so
565
+ # biasing toward preserve at a quantifier head is the correct trade.
566
+ _R2_QUANTIFIER_HEAD_WORDS = frozenset(
567
+ {
568
+ # articles
569
+ "a", "an", "the",
570
+ # prepositions
571
+ "of", "in", "on", "at", "to", "for", "with", "by", "from", "as",
572
+ "into", "than", "between", "among", "amongst", "through", "during",
573
+ "after", "before", "over", "under", "about", "across", "per",
574
+ "within", "upon", "against", "toward", "towards", "around",
575
+ # conjunctions
576
+ "and", "or", "nor", "but",
577
+ # determiners / quantifier-context heads
578
+ "all", "both", "these", "those", "some", "any", "each", "every",
579
+ "first", "last", "next", "only", "total", "following", "remaining",
580
+ "top", "approximately", "nearly", "least", "most", "up", "least",
581
+ "another", "additional", "further", "respective", "successive",
582
+ }
583
+ )
584
+ # Trailing-word extractor: the ASCII-alphabetic word ending right before ``match_pos``,
585
+ # with only whitespace allowed between the word and the digit. A hyphen directly
586
+ # before the digit ("well-3") yields no match, so the function-word lookup is skipped.
587
+ _R2_PRECEDING_WORD = re.compile(r"([A-Za-z]+)[\s ]*$")
588
+
476
589
 
477
590
  def _r2_is_body_phrase(digit_str: str, refs_text: str, match_pos: int) -> bool:
478
591
  """Return True if the digit at ``match_pos`` is part of a body phrase
479
- (e.g. "20 years", "1,675 participants") and should NOT be stripped by R2.
480
-
481
- Heuristic: check the 30-char window AFTER the matched digit for a
482
- body-noun keyword (years, participants, etc.). If found, the digit is
483
- almost certainly part of legitimate body prose, not a page-number leak.
592
+ (e.g. "20 years", "1,675 participants", "of 3 instruments") and should
593
+ NOT be stripped by R2 (the page-number scrub).
594
+
595
+ Two complementary, conservative signals; either one alone preserves the digit:
596
+
597
+ 1. **Following body-noun** — the 60-char window AFTER the digit contains a
598
+ known body-noun keyword (years, participants, instruments, …). Handles
599
+ "20 years", "1,675 participants".
600
+ 2. **Preceding quantifier head** — the word IMMEDIATELY BEFORE the digit is
601
+ a closed-class function word (article / preposition / determiner), so
602
+ the digit heads a quantified noun phrase ("of 3 instruments", "the 5
603
+ factors") rather than interrupting a content phrase. Generalizes beyond
604
+ the finite noun list. See ``_R2_QUANTIFIER_HEAD_WORDS``.
605
+
606
+ A genuine page-number leak ("psychological 41 science", "recovery 12 in a
607
+ population") fails BOTH checks — the noun after isn't in the list and the
608
+ word before is a content word — so it is still stripped.
484
609
  """
485
- # Window starts after the digit + at least one space.
610
+ # (1) Following body-noun. Window starts after the digit + at least one space.
486
611
  window_start = match_pos + len(digit_str)
487
612
  window = refs_text[window_start:window_start + 60]
488
- return bool(_R2_BODY_NOUN_PATTERN.search(window))
613
+ if _R2_BODY_NOUN_PATTERN.search(window):
614
+ return True
615
+
616
+ # (2) Preceding quantifier head (closed-class function word).
617
+ m = _R2_PRECEDING_WORD.search(refs_text[:match_pos])
618
+ if m and m.group(1).lower() in _R2_QUANTIFIER_HEAD_WORDS:
619
+ return True
620
+
621
+ return False
489
622
 
490
623
 
491
624
  def _looks_like_ref_start(line: str) -> bool:
@@ -493,6 +626,7 @@ def _looks_like_ref_start(line: str) -> bool:
493
626
  _REF_START_VANCOUVER.match(line)
494
627
  or _REF_START_IEEE.match(line)
495
628
  or _REF_START_APA.match(line)
629
+ or _REF_START_HARVARD.match(line)
496
630
  )
497
631
 
498
632
 
@@ -1458,6 +1592,32 @@ _AUTHOR_ETAL_INITIAL = re.compile(
1458
1592
  )
1459
1593
 
1460
1594
 
1595
+ # D2 (citationguard-iterate 2026-06-12): single-word / short category-label
1596
+ # running headers. Nature-family and many journals print the article-type label
1597
+ # ("Article", "Review", "Letter", "Matters Arising", …) at the top of every page.
1598
+ # H0 already curates these in _HEADER_BANNER_PATTERNS, but H0 only fires in the
1599
+ # document-header zone (first 30 lines); when the label recurs mid-document — e.g.
1600
+ # inside the References section at a page break — it survives and gets welded into
1601
+ # an entry ("…EAE based on\n\n\x0cArticle histology…" orphaned ref 34's year on
1602
+ # nat_comms_2). Stripping it here is gated by the ≥3-standalone-repetition guard in
1603
+ # _detect_recurring_running_headers, so a one-off body occurrence is never touched.
1604
+ # Scoped to genuine publisher article-type furniture; bare common words ("Research",
1605
+ # "Comment") are excluded to avoid colliding with section-heading body lines.
1606
+ _CATEGORY_LABEL_HEADER = re.compile(
1607
+ r"^(?:"
1608
+ r"Article|ARTICLE|Articles"
1609
+ r"|Review|Reviews|REVIEW"
1610
+ r"|Letter|Letters|LETTER"
1611
+ r"|Resource|Resources"
1612
+ r"|Analysis|Perspective|Perspectives"
1613
+ r"|Correspondence|Editorial"
1614
+ r"|Brief\s+Communication|Matters\s+Arising"
1615
+ r"|Original\s+(?:Investigation|Article|Research)"
1616
+ r"|Research\s+(?:Article|Paper|Letter|Report)"
1617
+ r")$"
1618
+ )
1619
+
1620
+
1461
1621
  def _is_all_caps_journal_banner(line: str) -> bool:
1462
1622
  """All-caps multi-word journal banner: e.g. ``PLOS MEDICINE``,
1463
1623
  ``COGNITION AND EMOTION``, ``JAMA NETWORK OPEN``.
@@ -1501,6 +1661,8 @@ def _looks_like_running_header_or_footer(line: str) -> bool:
1501
1661
  """
1502
1662
  if not line or len(line) > 100:
1503
1663
  return False
1664
+ if _CATEGORY_LABEL_HEADER.match(line):
1665
+ return True
1504
1666
  if _is_all_caps_journal_banner(line):
1505
1667
  return True
1506
1668
  if _AUTHOR_PAIR_ALL_CAPS_AND.match(line):
@@ -3555,6 +3717,22 @@ def normalize_text(
3555
3717
  for r_start, r_end in reversed(_refs_spans):
3556
3718
  refs_text = t[r_start:r_end]
3557
3719
 
3720
+ # R3 page-break stitch (D2, citationguard-iterate 2026-06-12): inside
3721
+ # a bibliography a form-feed (page break) NEVER coincides with a
3722
+ # paragraph boundary — entries are delimited by ref-starts, not blank
3723
+ # lines. When an entry straddles a page break, pdftotext emits
3724
+ # "…based on\n\n\x0cArticle\nhistology…(2008)." — the blank line +
3725
+ # form feed split the entry, so R3's normal continuation join (which
3726
+ # resets on a blank line) leaves the tail (and its year) detached,
3727
+ # orphaning nat_comms_2 ref 34's "(2008)". Collapse each form-feed
3728
+ # junction (and the blank line(s) around it) to a single newline so
3729
+ # the tail rejoins the head as an ordinary continuation. The running-
3730
+ # header label on the new page ("Article") is stripped upstream by
3731
+ # P0r (_CATEGORY_LABEL_HEADER); any that survives is handled by the
3732
+ # continuation join, but the year is recovered regardless.
3733
+ refs_text = re.sub(r"[ \t]*\n[ \t\n]*\f[ \t]*", "\n", refs_text)
3734
+ refs_text = refs_text.replace("\f", "\n")
3735
+
3558
3736
  # R3 pre-pass (Cycle 15 v2.4.67): two-column bibliography
3559
3737
  # pairing. pdftotext renders some 2-column bibliographies by
3560
3738
  # streaming the entire NUMBER column first, then the entire
@@ -3582,15 +3760,49 @@ def normalize_text(
3582
3760
  before_r3 = refs_text
3583
3761
  lines = refs_text.split("\n")
3584
3762
  joined: list[str] = []
3763
+ # Index in ``joined`` of the entry currently being built. Unlike the
3764
+ # previous ``joined[-1]`` check, this survives blank lines: within a
3765
+ # bibliography a blank line is a page-break artifact (entries are
3766
+ # single-newline separated), so a continuation line that follows a
3767
+ # blank still belongs to the entry above it. This rejoins entries
3768
+ # split across a page break — e.g. nat_comms_2 ref 34, whose
3769
+ # "(2008)" year sat past the page-break blank after the running
3770
+ # header was stripped (D2). Entry boundaries are still established
3771
+ # solely by _looks_like_ref_start, so genuinely separate entries do
3772
+ # not merge.
3773
+ cur_entry = -1
3774
+ saw_blank = False
3585
3775
  for line in lines:
3586
3776
  stripped = line.strip()
3587
3777
  if not stripped:
3588
3778
  joined.append("")
3779
+ saw_blank = True
3589
3780
  continue
3590
- if joined and joined[-1] and not _looks_like_ref_start(stripped):
3591
- joined[-1] = joined[-1].rstrip() + " " + stripped
3592
- else:
3781
+ is_start = _looks_like_ref_start(stripped)
3782
+ bridged = False
3783
+ if cur_entry >= 0 and not is_start:
3784
+ if saw_blank:
3785
+ # Crossing a blank line: only bridge when the current
3786
+ # entry is syntactically INCOMPLETE (does not end with
3787
+ # sentence-terminal punctuation). A page-break split
3788
+ # leaves the head mid-clause ("…EAE based on"), so the
3789
+ # tail rejoins; a COMPLETED entry ("…46, 215-39.")
3790
+ # followed by a blank is the end of the list, and the
3791
+ # next block ("Cite this article: …") is post-reference
3792
+ # trailer that must NOT be absorbed.
3793
+ prev = joined[cur_entry].rstrip()
3794
+ if prev and prev[-1] not in ".?!":
3795
+ joined[cur_entry] = prev + " " + stripped
3796
+ bridged = True
3797
+ else:
3798
+ joined[cur_entry] = (
3799
+ joined[cur_entry].rstrip() + " " + stripped
3800
+ )
3801
+ bridged = True
3802
+ if not bridged:
3593
3803
  joined.append(stripped)
3804
+ cur_entry = len(joined) - 1
3805
+ saw_blank = False
3594
3806
  refs_text = "\n".join(joined)
3595
3807
  r3_joins_total += before_r3.count("\n") - refs_text.count("\n")
3596
3808
 
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "docpluck"
7
- version = "2.4.83"
7
+ version = "2.4.85"
8
8
  description = "PDF, DOCX, and HTML text extraction and normalization for academic papers"
9
9
  readme = "docs/README.md"
10
10
  requires-python = ">=3.10"