docpluck 2.4.63__tar.gz → 2.4.64__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (350) hide show
  1. {docpluck-2.4.63 → docpluck-2.4.64}/CHANGELOG.md +32 -0
  2. {docpluck-2.4.63 → docpluck-2.4.64}/PKG-INFO +1 -1
  3. {docpluck-2.4.63 → docpluck-2.4.64}/docpluck/__init__.py +1 -1
  4. {docpluck-2.4.63 → docpluck-2.4.64}/docpluck/normalize.py +91 -35
  5. {docpluck-2.4.63 → docpluck-2.4.64}/pyproject.toml +1 -1
  6. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_normalize_idempotent_real_pdf.py +39 -0
  7. {docpluck-2.4.63 → docpluck-2.4.64}/.claude/skills/_project/lessons.md +0 -0
  8. {docpluck-2.4.63 → docpluck-2.4.64}/.claude/skills/docpluck-cleanup/SKILL.md +0 -0
  9. {docpluck-2.4.63 → docpluck-2.4.64}/.claude/skills/docpluck-deploy/SKILL.md +0 -0
  10. {docpluck-2.4.63 → docpluck-2.4.64}/.claude/skills/docpluck-iterate/LEARNINGS.md +0 -0
  11. {docpluck-2.4.63 → docpluck-2.4.64}/.claude/skills/docpluck-iterate/SKILL.md +0 -0
  12. {docpluck-2.4.63 → docpluck-2.4.64}/.claude/skills/docpluck-iterate/references/ai-full-doc-verify.md +0 -0
  13. {docpluck-2.4.63 → docpluck-2.4.64}/.claude/skills/docpluck-iterate/references/cycle-report-template.md +0 -0
  14. {docpluck-2.4.63 → docpluck-2.4.64}/.claude/skills/docpluck-iterate/references/local-verification.md +0 -0
  15. {docpluck-2.4.63 → docpluck-2.4.64}/.claude/skills/docpluck-iterate/references/rationalizations.md +0 -0
  16. {docpluck-2.4.63 → docpluck-2.4.64}/.claude/skills/docpluck-iterate/references/real-library-real-pdf.md +0 -0
  17. {docpluck-2.4.63 → docpluck-2.4.64}/.claude/skills/docpluck-iterate/references/release-flow.md +0 -0
  18. {docpluck-2.4.63 → docpluck-2.4.64}/.claude/skills/docpluck-iterate/references/self-improvement.md +0 -0
  19. {docpluck-2.4.63 → docpluck-2.4.64}/.claude/skills/docpluck-iterate/references/three-tier-parity.md +0 -0
  20. {docpluck-2.4.63 → docpluck-2.4.64}/.claude/skills/docpluck-qa/SKILL.md +0 -0
  21. {docpluck-2.4.63 → docpluck-2.4.64}/.claude/skills/docpluck-qa/references/benchmark-mode.md +0 -0
  22. {docpluck-2.4.63 → docpluck-2.4.64}/.claude/skills/docpluck-qa/references/check-11-hard-rules.md +0 -0
  23. {docpluck-2.4.63 → docpluck-2.4.64}/.claude/skills/docpluck-qa/references/check-13-escicheck-production.md +0 -0
  24. {docpluck-2.4.63 → docpluck-2.4.64}/.claude/skills/docpluck-qa/references/check-5-escicheck-library.md +0 -0
  25. {docpluck-2.4.63 → docpluck-2.4.64}/.claude/skills/docpluck-qa/references/check-6-escicheck-local-webapp.md +0 -0
  26. {docpluck-2.4.63 → docpluck-2.4.64}/.claude/skills/docpluck-qa/references/check-7-batch-smoke.md +0 -0
  27. {docpluck-2.4.63 → docpluck-2.4.64}/.claude/skills/docpluck-review/SKILL.md +0 -0
  28. {docpluck-2.4.63 → docpluck-2.4.64}/.github/workflows/bump-app-pin.yml +0 -0
  29. {docpluck-2.4.63 → docpluck-2.4.64}/.github/workflows/publish.yml +0 -0
  30. {docpluck-2.4.63 → docpluck-2.4.64}/.github/workflows/test.yml +0 -0
  31. {docpluck-2.4.63 → docpluck-2.4.64}/.gitignore +0 -0
  32. {docpluck-2.4.63 → docpluck-2.4.64}/CLAUDE.md +0 -0
  33. {docpluck-2.4.63 → docpluck-2.4.64}/HANDOFF_SECTIONS_APP_INTEGRATION.md +0 -0
  34. {docpluck-2.4.63 → docpluck-2.4.64}/LESSONS.md +0 -0
  35. {docpluck-2.4.63 → docpluck-2.4.64}/LICENSE +0 -0
  36. {docpluck-2.4.63 → docpluck-2.4.64}/REPLY_FROM_DOCPLUCK_v1.4.5.md +0 -0
  37. {docpluck-2.4.63 → docpluck-2.4.64}/REPLY_FROM_DOCPLUCK_v1.5.0.md +0 -0
  38. {docpluck-2.4.63 → docpluck-2.4.64}/REQUEST_08_CHUNKING_ENDPOINT.md +0 -0
  39. {docpluck-2.4.63 → docpluck-2.4.64}/REQUEST_09_REFERENCE_LIST_NORMALIZATION.md +0 -0
  40. {docpluck-2.4.63 → docpluck-2.4.64}/TODO.md +0 -0
  41. {docpluck-2.4.63 → docpluck-2.4.64}/docpluck/__main__.py +0 -0
  42. {docpluck-2.4.63 → docpluck-2.4.64}/docpluck/batch.py +0 -0
  43. {docpluck-2.4.63 → docpluck-2.4.64}/docpluck/cli.py +0 -0
  44. {docpluck-2.4.63 → docpluck-2.4.64}/docpluck/extract.py +0 -0
  45. {docpluck-2.4.63 → docpluck-2.4.64}/docpluck/extract_docx.py +0 -0
  46. {docpluck-2.4.63 → docpluck-2.4.64}/docpluck/extract_html.py +0 -0
  47. {docpluck-2.4.63 → docpluck-2.4.64}/docpluck/extract_layout.py +0 -0
  48. {docpluck-2.4.63 → docpluck-2.4.64}/docpluck/extract_structured.py +0 -0
  49. {docpluck-2.4.63 → docpluck-2.4.64}/docpluck/figures/__init__.py +0 -0
  50. {docpluck-2.4.63 → docpluck-2.4.64}/docpluck/figures/detect.py +0 -0
  51. {docpluck-2.4.63 → docpluck-2.4.64}/docpluck/quality.py +0 -0
  52. {docpluck-2.4.63 → docpluck-2.4.64}/docpluck/render.py +0 -0
  53. {docpluck-2.4.63 → docpluck-2.4.64}/docpluck/sections/__init__.py +0 -0
  54. {docpluck-2.4.63 → docpluck-2.4.64}/docpluck/sections/annotators/__init__.py +0 -0
  55. {docpluck-2.4.63 → docpluck-2.4.64}/docpluck/sections/annotators/docx.py +0 -0
  56. {docpluck-2.4.63 → docpluck-2.4.64}/docpluck/sections/annotators/html.py +0 -0
  57. {docpluck-2.4.63 → docpluck-2.4.64}/docpluck/sections/annotators/pdf.py +0 -0
  58. {docpluck-2.4.63 → docpluck-2.4.64}/docpluck/sections/annotators/text.py +0 -0
  59. {docpluck-2.4.63 → docpluck-2.4.64}/docpluck/sections/blocks.py +0 -0
  60. {docpluck-2.4.63 → docpluck-2.4.64}/docpluck/sections/boundaries.py +0 -0
  61. {docpluck-2.4.63 → docpluck-2.4.64}/docpluck/sections/core.py +0 -0
  62. {docpluck-2.4.63 → docpluck-2.4.64}/docpluck/sections/taxonomy.py +0 -0
  63. {docpluck-2.4.63 → docpluck-2.4.64}/docpluck/sections/types.py +0 -0
  64. {docpluck-2.4.63 → docpluck-2.4.64}/docpluck/tables/__init__.py +0 -0
  65. {docpluck-2.4.63 → docpluck-2.4.64}/docpluck/tables/bbox_utils.py +0 -0
  66. {docpluck-2.4.63 → docpluck-2.4.64}/docpluck/tables/camelot_extract.py +0 -0
  67. {docpluck-2.4.63 → docpluck-2.4.64}/docpluck/tables/captions.py +0 -0
  68. {docpluck-2.4.63 → docpluck-2.4.64}/docpluck/tables/cell_cleaning.py +0 -0
  69. {docpluck-2.4.63 → docpluck-2.4.64}/docpluck/tables/cluster.py +0 -0
  70. {docpluck-2.4.63 → docpluck-2.4.64}/docpluck/tables/confidence.py +0 -0
  71. {docpluck-2.4.63 → docpluck-2.4.64}/docpluck/tables/detect.py +0 -0
  72. {docpluck-2.4.63 → docpluck-2.4.64}/docpluck/tables/render.py +0 -0
  73. {docpluck-2.4.63 → docpluck-2.4.64}/docpluck/tables/whitespace.py +0 -0
  74. {docpluck-2.4.63 → docpluck-2.4.64}/docpluck/version.py +0 -0
  75. {docpluck-2.4.63 → docpluck-2.4.64}/docs/BENCHMARKS.md +0 -0
  76. {docpluck-2.4.63 → docpluck-2.4.64}/docs/DESIGN.md +0 -0
  77. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-07_sections_strict_iteration.md +0 -0
  78. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-09_session_state_and_followups.md +0 -0
  79. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-09_unified_extraction_brainstorm.md +0 -0
  80. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-10_table_rendering_iteration.md +0 -0
  81. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-10_table_rendering_iteration_2.md +0 -0
  82. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-10_table_rendering_iteration_3.md +0 -0
  83. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-10_table_rendering_iteration_4.md +0 -0
  84. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-10_table_rendering_iteration_5.md +0 -0
  85. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-10_table_rendering_iteration_6.md +0 -0
  86. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-10_table_rendering_iteration_7.md +0 -0
  87. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-11_PROMOTE_SPIKE_TO_LIBRARY.md +0 -0
  88. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-11_table_rendering_iteration_8.md +0 -0
  89. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-11_visual_review_findings.md +0 -0
  90. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-12_phase2_101pdf_corpus.md +0 -0
  91. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-12_remaining_ui_and_chrome_verification.md +0 -0
  92. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-12_visual_verify_results.md +0 -0
  93. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-13_apa_50_expansion.md +0 -0
  94. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-13_apa_50_expansion_iter_1.md +0 -0
  95. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-13_apa_50_expansion_iter_2.md +0 -0
  96. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-13_iterate_skill_first_use.md +0 -0
  97. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-13_iterative_1.md +0 -0
  98. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-13_iterative_library_improvement.md +0 -0
  99. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-13_table_extraction_next_iteration.md +0 -0
  100. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-14_continue_iterations_v2_4_30_to_15n.md +0 -0
  101. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-14_full_corpus_iteration_v2_4_30.md +0 -0
  102. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-14_iterate_6_cycles_complete.md +0 -0
  103. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-14_iterate_9_cycle_run.md +0 -0
  104. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-14_iterate_resume_4_cycles.md +0 -0
  105. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-14_iterate_v2_4_31_cycle_15n.md +0 -0
  106. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-14_phase_5d_gold_audit_v2_4_29.md +0 -0
  107. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-15_autonomous_apa_first_10h.md +0 -0
  108. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-15_iterate_apa_run_1.md +0 -0
  109. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-16_ai-gold-instructions.md +0 -0
  110. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-16_iterate_apa_run_2.md +0 -0
  111. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-16_iterate_apa_run_3.md +0 -0
  112. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-16_iterate_run_4_final.md +0 -0
  113. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-16_iterate_run_4_fix_and_continue.md +0 -0
  114. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-16_iterate_run_5.md +0 -0
  115. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-16_iterate_run_6.md +0 -0
  116. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-17_iterate_run_7.md +0 -0
  117. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-17_iterate_run_8.md +0 -0
  118. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-17_iterate_run_9.md +0 -0
  119. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-18_iterate_run_9_cont.md +0 -0
  120. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-18_iterate_run_9_cont2.md +0 -0
  121. {docpluck-2.4.63 → docpluck-2.4.64}/docs/HANDOFF_2026-05-20_iterate_run_9_cont3.md +0 -0
  122. {docpluck-2.4.63 → docpluck-2.4.64}/docs/ITERATION_VERIFICATION_LESSONS.md +0 -0
  123. {docpluck-2.4.63 → docpluck-2.4.64}/docs/LIBRARY_APP_SYNC.md +0 -0
  124. {docpluck-2.4.63 → docpluck-2.4.64}/docs/NORMALIZATION.md +0 -0
  125. {docpluck-2.4.63 → docpluck-2.4.64}/docs/README.md +0 -0
  126. {docpluck-2.4.63 → docpluck-2.4.64}/docs/TRIAGE_2026-05-10_corpus_assessment.md +0 -0
  127. {docpluck-2.4.63 → docpluck-2.4.64}/docs/TRIAGE_2026-05-14_phase_5d_gold_audit.md +0 -0
  128. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/2026-05-06-section-identification.md +0 -0
  129. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/2026-05-06-table-extraction.md +0 -0
  130. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/2026-05-07-sections-strict-iteration-progress.md +0 -0
  131. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/2026-05-08-unified-extraction-phase-0-splice-spike.md +0 -0
  132. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/sections-deferred-items.md +0 -0
  133. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/sections-issues-backlog.md +0 -0
  134. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/2026-05-07_spot-01_apa.md +0 -0
  135. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/2026-05-07_spot-02_pattern-A-shipped.md +0 -0
  136. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/2026-05-08_spot-final_all-styles.md +0 -0
  137. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/experiments/COMPARISON.md +0 -0
  138. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/korbmacher_table1.md +0 -0
  139. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/option-a.py +0 -0
  140. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/ziano_table1.md +0 -0
  141. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_notes_raw.txt +0 -0
  142. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_table1.md +0 -0
  143. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/notes.md +0 -0
  144. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/option-b.py +0 -0
  145. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_notes_raw.txt +0 -0
  146. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_table1.md +0 -0
  147. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/korbmacher_table1.md +0 -0
  148. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/notes.md +0 -0
  149. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/option-c.py +0 -0
  150. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/sample-pdftotext-bbox.html +0 -0
  151. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/ziano_table1.md +0 -0
  152. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/korbmacher_table1.md +0 -0
  153. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/notes.md +0 -0
  154. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/option-d.py +0 -0
  155. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/ziano_table1.md +0 -0
  156. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_2022_kruger_bbox.html +0 -0
  157. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_bbox.html +0 -0
  158. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_table1.md +0 -0
  159. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/option-e.py +0 -0
  160. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/sample-bbox.html +0 -0
  161. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_2021_joep_bbox.html +0 -0
  162. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_bbox.html +0 -0
  163. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_table1.md +0 -0
  164. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/html-fallback-demo.md +0 -0
  165. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.err +0 -0
  166. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.md +0 -0
  167. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.err +0 -0
  168. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.md +0 -0
  169. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.err +0 -0
  170. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.md +0 -0
  171. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.err +0 -0
  172. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.md +0 -0
  173. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.err +0 -0
  174. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.md +0 -0
  175. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.err +0 -0
  176. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.md +0 -0
  177. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.err +0 -0
  178. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.md +0 -0
  179. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.err +0 -0
  180. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.md +0 -0
  181. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.err +0 -0
  182. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.md +0 -0
  183. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.err +0 -0
  184. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.md +0 -0
  185. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.err +0 -0
  186. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.md +0 -0
  187. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.err +0 -0
  188. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.md +0 -0
  189. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.err +0 -0
  190. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.md +0 -0
  191. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.err +0 -0
  192. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.md +0 -0
  193. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.err +0 -0
  194. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.md +0 -0
  195. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.err +0 -0
  196. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.md +0 -0
  197. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.err +0 -0
  198. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.md +0 -0
  199. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.err +0 -0
  200. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.md +0 -0
  201. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.err +0 -0
  202. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.md +0 -0
  203. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.err +0 -0
  204. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.md +0 -0
  205. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.err +0 -0
  206. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.md +0 -0
  207. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.err +0 -0
  208. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.md +0 -0
  209. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.err +0 -0
  210. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.md +0 -0
  211. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.err +0 -0
  212. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.md +0 -0
  213. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.err +0 -0
  214. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.md +0 -0
  215. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.err +0 -0
  216. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.md +0 -0
  217. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/papers.md +0 -0
  218. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/report.md +0 -0
  219. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/splice_spike.py +0 -0
  220. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/plans/spot-checks/splice-spike/test_splice_spike.py +0 -0
  221. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/specs/2026-04-27-request-09-reference-normalization-design.md +0 -0
  222. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/specs/2026-05-06-section-identification-design.md +0 -0
  223. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/specs/2026-05-06-table-extraction-design.md +0 -0
  224. {docpluck-2.4.63 → docpluck-2.4.64}/docs/superpowers/specs/2026-05-08-unified-extraction-design.md +0 -0
  225. {docpluck-2.4.63 → docpluck-2.4.64}/scripts/harness/README.md +0 -0
  226. {docpluck-2.4.63 → docpluck-2.4.64}/scripts/harness/VERIFIER_PROMPT.md +0 -0
  227. {docpluck-2.4.63 → docpluck-2.4.64}/scripts/harness/__init__.py +0 -0
  228. {docpluck-2.4.63 → docpluck-2.4.64}/scripts/harness/baseline_matrix.json +0 -0
  229. {docpluck-2.4.63 → docpluck-2.4.64}/scripts/harness/checks.py +0 -0
  230. {docpluck-2.4.63 → docpluck-2.4.64}/scripts/harness/corpus.py +0 -0
  231. {docpluck-2.4.63 → docpluck-2.4.64}/scripts/harness/corpus_manifest.json +0 -0
  232. {docpluck-2.4.63 → docpluck-2.4.64}/scripts/harness/extract.py +0 -0
  233. {docpluck-2.4.63 → docpluck-2.4.64}/scripts/harness/gold_keys.json +0 -0
  234. {docpluck-2.4.63 → docpluck-2.4.64}/scripts/harness/inspect.py +0 -0
  235. {docpluck-2.4.63 → docpluck-2.4.64}/scripts/lint_rendered_corpus.py +0 -0
  236. {docpluck-2.4.63 → docpluck-2.4.64}/scripts/verify_corpus.py +0 -0
  237. {docpluck-2.4.63 → docpluck-2.4.64}/scripts/verify_corpus_full.py +0 -0
  238. {docpluck-2.4.63 → docpluck-2.4.64}/tests/__init__.py +0 -0
  239. {docpluck-2.4.63 → docpluck-2.4.64}/tests/conftest.py +0 -0
  240. {docpluck-2.4.63 → docpluck-2.4.64}/tests/fixtures/__init__.py +0 -0
  241. {docpluck-2.4.63 → docpluck-2.4.64}/tests/fixtures/sections/__init__.py +0 -0
  242. {docpluck-2.4.63 → docpluck-2.4.64}/tests/fixtures/sections/builders.py +0 -0
  243. {docpluck-2.4.63 → docpluck-2.4.64}/tests/fixtures/structured/.gitkeep +0 -0
  244. {docpluck-2.4.63 → docpluck-2.4.64}/tests/fixtures/structured/MANIFEST.json +0 -0
  245. {docpluck-2.4.63 → docpluck-2.4.64}/tests/fixtures/structured/README.md +0 -0
  246. {docpluck-2.4.63 → docpluck-2.4.64}/tests/golden/sections/apa_multi_study_pdf.json +0 -0
  247. {docpluck-2.4.63 → docpluck-2.4.64}/tests/golden/sections/apa_single_study_pdf.json +0 -0
  248. {docpluck-2.4.63 → docpluck-2.4.64}/tests/golden/sections/html_real_headings.json +0 -0
  249. {docpluck-2.4.63 → docpluck-2.4.64}/tests/snapshots/amj_lattice.txt +0 -0
  250. {docpluck-2.4.63 → docpluck-2.4.64}/tests/snapshots/apa_chan_feldman_lineless.txt +0 -0
  251. {docpluck-2.4.63 → docpluck-2.4.64}/tests/snapshots/apa_chen_jesp_lineless.txt +0 -0
  252. {docpluck-2.4.63 → docpluck-2.4.64}/tests/snapshots/apa_efendic_affect.txt +0 -0
  253. {docpluck-2.4.63 → docpluck-2.4.64}/tests/snapshots/apa_ip_feldman_pspb.txt +0 -0
  254. {docpluck-2.4.63 → docpluck-2.4.64}/tests/snapshots/bmc_lattice.txt +0 -0
  255. {docpluck-2.4.63 → docpluck-2.4.64}/tests/snapshots/ieee_figure_heavy.txt +0 -0
  256. {docpluck-2.4.63 → docpluck-2.4.64}/tests/snapshots/ieee_lattice.txt +0 -0
  257. {docpluck-2.4.63 → docpluck-2.4.64}/tests/snapshots/jama_lattice.txt +0 -0
  258. {docpluck-2.4.63 → docpluck-2.4.64}/tests/snapshots/nat_comms_figure_only.txt +0 -0
  259. {docpluck-2.4.63 → docpluck-2.4.64}/tests/snapshots/nature_minimal_rule.txt +0 -0
  260. {docpluck-2.4.63 → docpluck-2.4.64}/tests/snapshots/scirep_minimal_rule.txt +0 -0
  261. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_a3c_leading_zero_decimal_real_pdf.py +0 -0
  262. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_all_caps_section_promote_real_pdf.py +0 -0
  263. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_bbox_utils.py +0 -0
  264. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_benchmark_docx_html.py +0 -0
  265. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_cambridge_footer_strip_real_pdf.py +0 -0
  266. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_caption_only_table_heading_real_pdf.py +0 -0
  267. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_caption_regex.py +0 -0
  268. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_chart_data_trim_real_pdf.py +0 -0
  269. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_cid_minus_recovery_real_pdf.py +0 -0
  270. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_cli_sections.py +0 -0
  271. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_cli_structured.py +0 -0
  272. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_confidence.py +0 -0
  273. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_corpus_smoke.py +0 -0
  274. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_d5_normalization_audit.py +0 -0
  275. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_edge_cases.py +0 -0
  276. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_elsevier_footer_strip_real_pdf.py +0 -0
  277. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_equation_page_header_strip_real_pdf.py +0 -0
  278. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_extract_docx.py +0 -0
  279. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_extract_filter_sugar.py +0 -0
  280. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_extract_html.py +0 -0
  281. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_extract_layout.py +0 -0
  282. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_extract_pdf_structured.py +0 -0
  283. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_extraction.py +0 -0
  284. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_f0_table_region_aware.py +0 -0
  285. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_fffd_comparison_recovery_real_pdf.py +0 -0
  286. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_figure_caption_trim_real_pdf.py +0 -0
  287. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_figure_detect.py +0 -0
  288. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_fixtures_manifest.py +0 -0
  289. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_harness_text_loss_reflow.py +0 -0
  290. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_lattice_cluster.py +0 -0
  291. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_letterspaced_label_real_pdf.py +0 -0
  292. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_ligature_decomposition_real_pdf.py +0 -0
  293. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_lt_operator_recovery_real_pdf.py +0 -0
  294. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_mathitalic_greek_real_pdf.py +0 -0
  295. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_metaesci_followups.py +0 -0
  296. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_minus_sign_recovery_real_pdf.py +0 -0
  297. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_normalization.py +0 -0
  298. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_normalize_a3_r2_body_integer_real_pdf.py +0 -0
  299. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_normalize_f0_footnote_strip.py +0 -0
  300. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_normalize_layout_param.py +0 -0
  301. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_normalize_metadata_leak_real_pdf.py +0 -0
  302. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_normalize_report_layout_fields.py +0 -0
  303. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_normalize_v18_strips.py +0 -0
  304. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_numbered_heading_promotion_real_pdf.py +0 -0
  305. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_numbered_section_promotion_real_pdf.py +0 -0
  306. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_orphan_multilevel_number_real_pdf.py +0 -0
  307. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_orphan_section_number_real_pdf.py +0 -0
  308. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_preserve_math_glyphs_real_pdf.py +0 -0
  309. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_pua_glyph_recovery_real_pdf.py +0 -0
  310. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_quality.py +0 -0
  311. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_render.py +0 -0
  312. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_render_html.py +0 -0
  313. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_request_09_reference_normalization.py +0 -0
  314. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_roman_numeral_section_promote_real_pdf.py +0 -0
  315. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_section_row_label_no_merge_real_pdf.py +0 -0
  316. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_sections_boundaries.py +0 -0
  317. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_sections_boundary_truncation.py +0 -0
  318. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_sections_core_partition.py +0 -0
  319. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_sections_docx_annotator.py +0 -0
  320. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_sections_extract_text.py +0 -0
  321. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_sections_footnote_section.py +0 -0
  322. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_sections_golden.py +0 -0
  323. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_sections_html_annotator.py +0 -0
  324. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_sections_pdf_annotator.py +0 -0
  325. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_sections_public_api.py +0 -0
  326. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_sections_real_corpus.py +0 -0
  327. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_sections_taxonomy.py +0 -0
  328. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_sections_text_annotator.py +0 -0
  329. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_sections_types.py +0 -0
  330. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_sections_unit_corpus.py +0 -0
  331. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_sections_v161_coalesce.py +0 -0
  332. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_sections_v161_subheadings.py +0 -0
  333. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_sections_v161_taxonomy.py +0 -0
  334. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_sections_v161_text_annotator.py +0 -0
  335. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_sections_version.py +0 -0
  336. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_smoke_fixtures.py +0 -0
  337. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_structured_result_type.py +0 -0
  338. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_structured_types.py +0 -0
  339. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_structured_version.py +0 -0
  340. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_table_caption_cell_region_real_pdf.py +0 -0
  341. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_table_detect.py +0 -0
  342. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_tables_cell_cleaning.py +0 -0
  343. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_text_mode.py +0 -0
  344. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_v23_1_fixes.py +0 -0
  345. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_v23_bug_fixes.py +0 -0
  346. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_v23_post_corpus.py +0 -0
  347. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_v23_post_corpus_v2.py +0 -0
  348. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_v2_backwards_compat.py +0 -0
  349. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_v2_top_level_exports.py +0 -0
  350. {docpluck-2.4.63 → docpluck-2.4.64}/tests/test_whitespace_cluster.py +0 -0
@@ -1,5 +1,37 @@
1
1
  # Changelog
2
2
 
3
+ ## [2.4.64] — 2026-05-22
4
+
5
+ **Cycle 12 (run 9) — three independent normalize_text idempotence fixes.** A 180-doc scan post-cycle-11 found 17 papers still non-idempotent. This cycle packages three independent fixes that together clear 6 of them:
6
+
7
+ ### 1. Final blank-line collapse (5 papers — chan-etal, horsham, lee-feldman, li-feldman-mental-acct, kassambara)
8
+
9
+ Raw pdftotext output contains form-feed `\x0c` characters at page boundaries. S9's `re.sub(r"\n{3,}", "\n\n", t)` collapses consecutive blank lines, but the form-feed survives upstream stripping into the references region, where R3 (continuation join) processes the span line by line — `"\x0c".strip() == ""`, so the form-feed line becomes an empty entry, surrounded by other empty entries. R3 then outputs `"\n".join(["...", "", "", "", "..."])` = `...\n\n\n\n...` (three blank entries, i.e. a run of 4 newlines). S9's collapse already ran upstream, and nothing later collapses the run. Pass 2 sees the `\n{4}` run and S9 collapses it — non-idempotence.
10
+
11
+ Fix: add a final `re.sub(r"\n{3,}", "\n\n", t)` between the H0r and P0r re-strip blocks (immediately after H0r, before P0r). Any earlier late step that empties a line (R3, H0r, ...) is now safely followed by the collapse, regardless of which of those steps produced the gap.
12
+
13
+ ### 2. Cross-paragraph stat-continuation join (2 papers — korbmacher×2)
14
+
15
+ A1 (the early stat-line-repair step using `\s*`) crosses paragraph breaks but runs BEFORE S9 strips header/footer noise. A row like
16
+
17
+ `r(1798) = -0.27,\n\n472\n\nJournal of Decision Making, Vol. 17...\n\n95% CI [-0.31, ...]`
18
+
19
+ has so much intervening junk that A1's lookahead fails on pass 1. S9 then strips `472` (page num) and the journal-masthead/page-header (repeated ≥5 times), leaving `-0.27,\n\n95% CI`. A1 is over; LateJoin's A1r uses strict `[ \t]*\n[ \t]*` (single-newline only) and so doesn't fire. Pass 2's A1 sees the now-clean `,\n\n95% CI` and joins — non-idempotence.
20
+
21
+ Fix: add two paragraph-crossing variants to the LateJoin A1r block, restricted to high-confidence prefixes — `\d+% CI` and `p [<=>]`. No real paragraph STARTS with `95% CI` or `p < .001`, so joining across `\n\n` is safe. The `test_column_bleed_too_many_fragments_ignored` contract is unaffected — its input has no leading `,`/`;`.
22
+
23
+ ### 3. LABELED vs BARE CI bracket discriminator (refines cycle 11)
24
+
25
+ Cycle 11's proximity gate broke 2 pre-existing tests:
26
+ - `test_ci_pairing_recovers_body_line`: `Mposterior = 20.54, SD=0.04, CI = [-0.61, -0.47]` — `, SD=` falsely tripped the "new stat label" sentence-break check, blocking the legitimate recovery of `20.54` → `-0.54`.
27
+ - `test_efendic_table_point_estimates_recovered_via_ci`: efendic's body-line CI recoveries no longer fired.
28
+
29
+ Fix: discriminate LABELED brackets (`CI = [...]` / `95% CI [...]` / `CI: [...]`) from BARE brackets (`[lo, hi]` alone). LABELED brackets can pair with any candidate token in the row (the chain `M = X, SD = Y, CI = [...]` is all describing the same estimate). BARE brackets retain the strict 30-char + period/semicolon-break proximity gate (catches the majumder false-positive — bare bracket ~50 chars after `2.01`, attached to a different stat). The `_CI_LABEL_PREFIX_RE` looks back ≤8 chars from the `[` for `CI` / `\d+% CI` (with optional `=`/`:`).
30
+
31
+ **Impact:** corpus-wide non-idempotency 17 → 11 (cycle 12 cleared 6: 5 bibliography-shift + 2 korbmacher; 3 new bibliography cases of the same shape are now caught by the final collapse). Broad pytest 1356 pass + 1 known pre-existing B6 fail. Harness Tier-D academic: 0 regressions, 0 new fails (1 still failing — plos-med-1 / B1).
32
+
33
+ NORMALIZATION_VERSION 1.9.18. New tests: `test_normalize_collapses_late_blank_line_runs` + `test_late_join_crosses_paragraph_for_stat_continuation`. Cycle 11's tests (`*_proximity_gate_*`) still pass under the LABELED/BARE refinement.
34
+
3
35
  ## [2.4.63] — 2026-05-21
4
36
 
5
37
  **Cycle 11 (run 9) — `recover_minus_via_ci_pairing` proximity gate.** A 180-doc scan post-cycle-10 found 19 papers still non-idempotent. Among them, 8 (majumder, korbmacher×2, van-boven, chan-feldman-baron, ziano, xiao-poc, amp-1, annals-2) shared a structural defect that ALSO ships in single-pass production: the `_recover_minus_in_record` helper paired every candidate `2X.XX` token with EVERY CI bracket in the same record. A record like `M = 5.37, SD = 2.01), t(1827) = 1.83, p tukey = .067, d = 0.09 [-1.86, 0.04]` contains:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docpluck
3
- Version: 2.4.63
3
+ Version: 2.4.64
4
4
  Summary: PDF, DOCX, and HTML text extraction and normalization for academic papers
5
5
  Project-URL: Homepage, https://github.com/giladfeldman/docpluck
6
6
  Project-URL: Documentation, https://github.com/giladfeldman/docpluck/tree/main/docs
@@ -71,7 +71,7 @@ from .figures import Figure
71
71
  from .extract_structured import TABLE_EXTRACTION_VERSION, StructuredResult, extract_pdf_structured
72
72
  from .render import render_pdf_to_markdown
73
73
 
74
- __version__ = "2.4.63"
74
+ __version__ = "2.4.64"
75
75
  __author__ = "Gilad Feldman"
76
76
  __license__ = "MIT"
77
77
 
@@ -23,7 +23,7 @@ class NormalizationLevel(str, Enum):
23
23
  academic = "academic"
24
24
 
25
25
 
26
- NORMALIZATION_VERSION = "1.9.17"
26
+ NORMALIZATION_VERSION = "1.9.18"
27
27
 
28
28
 
29
29
  # ── Mathematical Alphanumeric Symbols de-styling (shared, v2.4.34) ──────────
@@ -1474,35 +1474,45 @@ _CORRUPT_NEG_TOKEN_RE = re.compile(r"(?<![\d.\-])2(\d?\.\d+)\b")
1474
1474
  _TABLE_ROW_RE = re.compile(r"<tr\b.*?</tr>", re.DOTALL | re.IGNORECASE)
1475
1475
 
1476
1476
 
1477
- # Cycle 11 (v2.4.63) — proximity gate for the CI-pairing recovery.
1477
+ # Cycle 11 (v2.4.63) / 12 (v2.4.64) — proximity gate for the CI-pairing recovery.
1478
1478
  #
1479
- # In stat reporting the point estimate is IMMEDIATELY followed by its CI:
1480
- # `B = -2.68 [-4.65, -0.68]`. The previous "pair with any bracket in the
1481
- # record" rule false-positives when a record contains an unrelated SD value
1482
- # and a separately-reported CI:
1483
- # `M = 5.37, SD = 2.01, t(1827) = 1.83, d = 0.09 [-1.86, 0.04]`
1484
- # ^^^^ ^^^^^^^^^^^^^^
1485
- # The `2.01` is a valid SD; the `[-1.86, 0.04]` is the CI for `d = 0.09`.
1486
- # `-0.01` happens to fall inside `[-1.86, 0.04]`, so the old logic
1487
- # recovered `2.01` → `-.01`, corrupting the SD. 8 papers in the corpus
1488
- # (majumder, korbmacher, van-boven, ...) had this defect.
1479
+ # In stat reporting a BARE bracket `[lo, hi]` attaches to the IMMEDIATELY-
1480
+ # preceding point estimate; a LABELED bracket `CI = [lo, hi]` or
1481
+ # `95% CI [lo, hi]` can attach to ANY earlier point estimate on the same
1482
+ # row (the SD/SE/df-pair tokens in between are descriptive of the same
1483
+ # estimate). The cycle 11 proximity gate treated both as needing strict
1484
+ # adjacency, which broke efendic's body-line recovery
1485
+ # `Mposterior = 20.54, SD=0.04, CI = [-0.61, -0.47]`
1486
+ # where `, SD=` falsely tripped the "new stat label" sentence-break check.
1489
1487
  #
1490
- # Fix: require the CI bracket to follow the candidate token closely. The
1491
- # bracket's start must be within _CI_PAIR_MAX_GAP chars of the token's
1492
- # end, AND the intervening text must not contain a sentence break (period
1493
- # followed by space, or semicolon, or a comma followed by a new statistic
1494
- # label like ` SD =`/` t(`/` p =`/` d =`).
1488
+ # Cycle 12 fix: discriminate LABELED vs BARE brackets.
1489
+ # - LABELED bracket (`CI =`/`95% CI`/`CI:` immediately precedes `[`):
1490
+ # pairs with any candidate token in its record (the old wide rule).
1491
+ # - BARE bracket: pairs ONLY with candidates within 30 chars + no
1492
+ # sentence break (period/semicolon + space, NOT comma + new label,
1493
+ # because stat-row labels are comma-separated by convention).
1494
+ #
1495
+ # This keeps the majumder fix (bare bracket far from `2.01`) AND
1496
+ # preserves efendic-style labeled CIs that pair across SD/SE annotations.
1495
1497
  _CI_PAIR_MAX_GAP = 30
1496
- _SENTENCE_BREAK_RE = re.compile(
1497
- r"[.;]\s|,\s+(?:SD|SE|t|F|p|d|g|η|χ|r|R²|β|B|γ|R|N|M|Q|Z)\s*[=(]",
1498
- re.IGNORECASE,
1499
- )
1498
+ # Bare-bracket sentence break: only period/semicolon + space. A comma is
1499
+ # NOT a break because stat rows are comma-separated. The majumder false-
1500
+ # positive is now caught by the per-bracket proximity check (the bare
1501
+ # bracket sits ~50 chars after `2.01` — beyond _CI_PAIR_MAX_GAP).
1502
+ _SENTENCE_BREAK_RE = re.compile(r"[.;]\s")
1503
+ # A bracket is "labeled" when prefixed by `CI`, `95 % CI`, or similar
1504
+ # directly before the opening `[`. Allow optional whitespace and an `=` /
1505
+ # `:` between the label and the bracket.
1506
+ _CI_LABEL_PREFIX_RE = re.compile(r"(?:\bCI|\b\d+\s*%\s*CI)\s*[=:]?\s*$", re.IGNORECASE)
1500
1507
 
1501
1508
 
1502
1509
  def _recover_minus_in_record(record: str) -> str:
1503
1510
  """Recover '2X.XX' tokens in a single record (a table row or a text line)
1504
1511
  by pairing each with a CI bracket present in the same record."""
1505
- brackets: list[tuple[float, float, tuple[int, int]]] = []
1512
+ # Each entry: (lo, hi, (bs, be), is_labeled). `is_labeled` is True when
1513
+ # the bracket is prefixed by `CI`/`95% CI`/etc. — see cycle 12 notes
1514
+ # at _CI_LABEL_PREFIX_RE.
1515
+ brackets: list[tuple[float, float, tuple[int, int], bool]] = []
1506
1516
  for m in _CI_PAIR_BRACKET_RE.finditer(record):
1507
1517
  try:
1508
1518
  lo, hi = float(m.group(1)), float(m.group(2))
@@ -1510,13 +1520,17 @@ def _recover_minus_in_record(record: str) -> str:
1510
1520
  continue
1511
1521
  if lo > hi:
1512
1522
  continue # not a well-formed interval
1513
- brackets.append((lo, hi, m.span()))
1523
+ # Look back ≤8 chars for a `CI` / `95 % CI` label.
1524
+ bs, be = m.span()
1525
+ prefix = record[max(0, bs - 8): bs]
1526
+ is_labeled = bool(_CI_LABEL_PREFIX_RE.search(prefix))
1527
+ brackets.append((lo, hi, (bs, be), is_labeled))
1514
1528
  if not brackets:
1515
1529
  return record
1516
1530
 
1517
1531
  def _sub(m: "re.Match[str]") -> str:
1518
1532
  # Never touch a token that lies inside a bracket span (a CI bound).
1519
- for _lo, _hi, (bs, be) in brackets:
1533
+ for _lo, _hi, (bs, be), _lab in brackets:
1520
1534
  if bs <= m.start() < be:
1521
1535
  return m.group(0)
1522
1536
  frac = m.group(1)
@@ -1525,22 +1539,29 @@ def _recover_minus_in_record(record: str) -> str:
1525
1539
  recovered = float("-" + frac)
1526
1540
  except ValueError:
1527
1541
  return m.group(0)
1528
- # Cycle 11 proximity gate: only consider brackets that closely
1529
- # follow the token, with no sentence break or new stat label in
1530
- # between. Pick the NEAREST eligible bracket (the one that would
1531
- # canonically pair with the point estimate).
1542
+ # Cycle 12: pick the NEAREST bracket whose pairing rules accept this
1543
+ # token. LABELED brackets accept any candidate in the record (legacy
1544
+ # wide rule; the efendic body line `Mposterior = 20.54, SD=0.04,
1545
+ # CI = [-0.61, -0.47]` is the canonical case). BARE brackets only
1546
+ # accept the immediately-preceding stat (within 30 chars, no
1547
+ # sentence break) — this is what blocks the majumder false-positive
1548
+ # `M = 5.37, SD = 2.01, t = ..., d = 0.09 [-1.86, 0.04]`.
1532
1549
  token_end = m.end()
1533
1550
  nearest = None
1534
1551
  nearest_dist = None
1535
- for lo, hi, (bs, be) in brackets:
1552
+ for lo, hi, (bs, be), is_labeled in brackets:
1536
1553
  if bs < token_end:
1537
- continue # bracket precedes the token — not its CI
1538
- gap = bs - token_end
1539
- if gap > _CI_PAIR_MAX_GAP:
1540
- continue
1541
- intervening = record[token_end:bs]
1542
- if _SENTENCE_BREAK_RE.search(intervening):
1543
1554
  continue
1555
+ gap = bs - token_end
1556
+ if is_labeled:
1557
+ # Labeled bracket: only constraint is "comes after the token".
1558
+ pass
1559
+ else:
1560
+ if gap > _CI_PAIR_MAX_GAP:
1561
+ continue
1562
+ intervening = record[token_end:bs]
1563
+ if _SENTENCE_BREAK_RE.search(intervening):
1564
+ continue
1544
1565
  if nearest_dist is None or gap < nearest_dist:
1545
1566
  nearest = (lo, hi)
1546
1567
  nearest_dist = gap
@@ -2649,6 +2670,21 @@ def normalize_text(
2649
2670
  t = re.sub(r"([=<>])[ \t]*\n[ \t]*(?=[-\d.])", r"\1 ", t)
2650
2671
  t = re.sub(r"([,;])[ \t]*\n[ \t]*(?=p\s*[<=>])", r"\1 ", t)
2651
2672
  t = re.sub(r"([,;])[ \t]*\n[ \t]*(?=\d+%\s*CI)", r"\1 ", t)
2673
+ # Cycle 12 (v2.4.64) — cross-paragraph stat-continuation join.
2674
+ # A1 (which uses `\s*` and so crosses paragraph breaks) runs BEFORE
2675
+ # S9 strips header/footer lines. So a stat row like
2676
+ # `r(1798) = -0.27,\n\n472\n\nJournal of Decision Making, ...\n\n95% CI [-0.31, ...]`
2677
+ # has so much intervening junk that A1's lookahead fails on pass 1;
2678
+ # only after S9 strips the junk (producing `,\n\n95% CI`) can the
2679
+ # join happen, and that's pass 2. The two patterns below are the
2680
+ # paragraph-crossing variants of the comma-to-stat-continuation
2681
+ # patterns above — restricted to the high-confidence prefixes
2682
+ # `\d+% CI` and `p [<=>]` because no real paragraph STARTS with
2683
+ # those tokens (test_column_bleed_too_many_fragments_ignored is
2684
+ # unaffected — its input has no leading `,`/`;`).
2685
+ # Clears korbmacher (2 papers) from the non-idempotent set.
2686
+ t = re.sub(r"([,;])\s*\n\s*\n\s*(?=\d+%\s*CI)", r"\1 ", t)
2687
+ t = re.sub(r"([,;])\s*\n\s*\n\s*(?=p\s*[<=>])", r"\1 ", t)
2652
2688
  report._track("LateJoin_line_break_rejoin", before, t, "late_line_joins")
2653
2689
 
2654
2690
  # ── H0r: header-banner re-strip on stabilized line positions ─────────
@@ -2669,6 +2705,26 @@ def normalize_text(
2669
2705
  t = _restripped
2670
2706
  report._track("H0r_header_banner_restrip", before, t, "header_banners_restripped")
2671
2707
 
2708
+ # ── Final blank-line collapse ────────────────────────────────────────
2709
+ # S9 enforces `re.sub(r"\n{3,}", "\n\n", t)` once near the top of the
2710
+ # pipeline. Later steps that REMOVE non-blank content can leave blank
2711
+ # gaps that S9's earlier collapse no longer reaches:
2712
+ #
2713
+ # - R3 (refs-section continuation join) walks the refs span line by
2714
+ # line. A bare form-feed `\x0c` (pdftotext page-break) between two
2715
+ # blank lines satisfies `"\x0c".strip() == ""` and is preserved as a blank
2716
+ # entry; R3 outputs three consecutive blank entries surrounded by
2717
+ # `"\n".join(...)` — `\n\n\n\n`. Pass 1 leaves this; pass 2's S9
2718
+ # collapses it, producing the bibliography-shift non-idempotence
2719
+ # (cycle 12 — 5 papers: chan-etal, horsham, lee-feldman,
2720
+ # li-feldman-mental, + 1 incidental).
2721
+ # - Same pattern for any late strip step that empties a line without
2722
+ # re-collapsing.
2723
+ #
2724
+ # Add the collapse here so the function is idempotent regardless of
2725
+ # which late step produced the blank-line run.
2726
+ t = re.sub(r"\n{3,}", "\n\n", t)
2727
+
2672
2728
  # ── P0r: page-footer-line re-strip on stabilized line positions ──────
2673
2729
  # Same shape as H0r, applied to P0's anchored ^...$ patterns. P0 runs
2674
2730
  # near the top of the pipeline, where some P0-targeted lines are still
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "docpluck"
7
- version = "2.4.63"
7
+ version = "2.4.64"
8
8
  description = "PDF, DOCX, and HTML text extraction and normalization for academic papers"
9
9
  readme = "docs/README.md"
10
10
  requires-python = ">=3.10"
@@ -234,6 +234,45 @@ def test_s9_4digit_pattern_a_still_strips_isolated_page_numbers():
234
234
  )
235
235
 
236
236
 
237
+ def test_normalize_collapses_late_blank_line_runs():
238
+ """Cycle 12: a late strip step that empties a line (e.g. R3 stripping a
239
+ form-feed `\\x0c` between two blank lines) leaves a `\\n{3+}` run that
240
+ S9's earlier collapse no longer reaches. The final collapse late in the body
241
+ of normalize_text catches it."""
242
+ # Simulate: paragraph + form-feed + paragraph (pdftotext page-break case)
243
+ text = "First paragraph ending here.\n\n\x0c\n\nSecond paragraph begins."
244
+ out, _ = normalize_text(text, NormalizationLevel.academic)
245
+ # Should produce one paragraph break, not two
246
+ assert "\n\n\n" not in out, f"normalize_text left a \\n{{3+}} run: {out!r}"
247
+ assert "First paragraph" in out
248
+ assert "Second paragraph" in out
249
+
250
+
251
+ def test_late_join_crosses_paragraph_for_stat_continuation():
252
+ """Cycle 12: a comma/semicolon followed by a paragraph break and a
253
+ high-confidence stat-continuation token (95% CI / p [<=>]) is a
254
+ serializer artifact — joined on pass 1. Pre-cycle-12, only pass 2
255
+ joined it (after S9 stripped the intervening header/footer noise).
256
+
257
+ Defends against the corpus-wide korbmacher pattern where a regression-
258
+ coefficient row was broken by a per-page header insertion."""
259
+ # The full korbmacher pre-S9 pattern is a couple of headers between;
260
+ # post-S9 the input to LateJoin is just `,\n\n95% CI`.
261
+ text = "r(1798) = -0.27,\n\n95% CI [-0.31, -0.22]"
262
+ out, _ = normalize_text(text, NormalizationLevel.academic)
263
+ assert "-0.27, 95% CI" in out, f"cross-paragraph stat join failed: {out!r}"
264
+
265
+ # Same for p-value continuation
266
+ text2 = "t(23) = 2.34,\n\np < .001, d = 0.45"
267
+ out2, _ = normalize_text(text2, NormalizationLevel.academic)
268
+ assert "2.34, p < .001" in out2 or "p < .001" in out2.replace("\n\n", " ")
269
+
270
+ # The column-bleed contract is NOT broken — its input has no leading `,;`.
271
+ cb = "p\n01\n02\n03\n04\n05\n= .05"
272
+ out_cb, _ = normalize_text(cb, NormalizationLevel.academic)
273
+ assert "p = .05" not in out_cb, "column-bleed test contract broken by cycle 12"
274
+
275
+
237
276
  def test_recover_minus_proximity_gate_rejects_distant_unrelated_brackets():
238
277
  """Cycle 11: a stat-table row that mixes an unrelated SD value with a
239
278
  separately-reported CI bracket must NOT have the SD recovered as a
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes