docpluck 2.4.43__tar.gz → 2.4.45__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (335) hide show
  1. {docpluck-2.4.43 → docpluck-2.4.45}/.claude/skills/_project/lessons.md +16 -0
  2. {docpluck-2.4.43 → docpluck-2.4.45}/.claude/skills/docpluck-iterate/LEARNINGS.md +44 -0
  3. {docpluck-2.4.43 → docpluck-2.4.45}/CHANGELOG.md +22 -0
  4. {docpluck-2.4.43 → docpluck-2.4.45}/PKG-INFO +1 -1
  5. {docpluck-2.4.43 → docpluck-2.4.45}/docpluck/__init__.py +1 -1
  6. {docpluck-2.4.43 → docpluck-2.4.45}/docpluck/normalize.py +29 -7
  7. {docpluck-2.4.43 → docpluck-2.4.45}/docpluck/quality.py +1 -1
  8. {docpluck-2.4.43 → docpluck-2.4.45}/docpluck/render.py +14 -14
  9. {docpluck-2.4.43 → docpluck-2.4.45}/docpluck/tables/cell_cleaning.py +5 -0
  10. docpluck-2.4.45/docs/HANDOFF_2026-05-16_ai-gold-instructions.md +173 -0
  11. {docpluck-2.4.43 → docpluck-2.4.45}/docs/TRIAGE_2026-05-14_phase_5d_gold_audit.md +11 -1
  12. {docpluck-2.4.43 → docpluck-2.4.45}/pyproject.toml +1 -1
  13. {docpluck-2.4.43 → docpluck-2.4.45}/tests/golden/sections/apa_multi_study_pdf.json +15 -15
  14. {docpluck-2.4.43 → docpluck-2.4.45}/tests/golden/sections/apa_single_study_pdf.json +11 -11
  15. docpluck-2.4.45/tests/snapshots/amj_lattice.txt +3022 -0
  16. docpluck-2.4.45/tests/snapshots/apa_chan_feldman_lineless.txt +2390 -0
  17. docpluck-2.4.45/tests/snapshots/apa_chen_jesp_lineless.txt +4054 -0
  18. docpluck-2.4.45/tests/snapshots/apa_efendic_affect.txt +1164 -0
  19. docpluck-2.4.45/tests/snapshots/apa_ip_feldman_pspb.txt +2683 -0
  20. docpluck-2.4.45/tests/snapshots/bmc_lattice.txt +1140 -0
  21. docpluck-2.4.45/tests/snapshots/ieee_figure_heavy.txt +1687 -0
  22. docpluck-2.4.45/tests/snapshots/ieee_lattice.txt +1757 -0
  23. docpluck-2.4.45/tests/snapshots/jama_lattice.txt +1881 -0
  24. docpluck-2.4.45/tests/snapshots/nat_comms_figure_only.txt +2092 -0
  25. docpluck-2.4.45/tests/snapshots/nature_minimal_rule.txt +1293 -0
  26. docpluck-2.4.45/tests/snapshots/scirep_minimal_rule.txt +1481 -0
  27. docpluck-2.4.45/tests/test_ligature_decomposition_real_pdf.py +122 -0
  28. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_numbered_heading_promotion_real_pdf.py +35 -2
  29. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_render.py +7 -3
  30. docpluck-2.4.43/tests/snapshots/amj_lattice.txt +0 -1165
  31. docpluck-2.4.43/tests/snapshots/apa_chan_feldman_lineless.txt +0 -1200
  32. docpluck-2.4.43/tests/snapshots/apa_chen_jesp_lineless.txt +0 -2122
  33. docpluck-2.4.43/tests/snapshots/apa_efendic_affect.txt +0 -584
  34. docpluck-2.4.43/tests/snapshots/apa_ip_feldman_pspb.txt +0 -1405
  35. docpluck-2.4.43/tests/snapshots/bmc_lattice.txt +0 -318
  36. docpluck-2.4.43/tests/snapshots/ieee_figure_heavy.txt +0 -543
  37. docpluck-2.4.43/tests/snapshots/ieee_lattice.txt +0 -1395
  38. docpluck-2.4.43/tests/snapshots/jama_lattice.txt +0 -1345
  39. docpluck-2.4.43/tests/snapshots/nat_comms_figure_only.txt +0 -913
  40. docpluck-2.4.43/tests/snapshots/nature_minimal_rule.txt +0 -366
  41. docpluck-2.4.43/tests/snapshots/scirep_minimal_rule.txt +0 -307
  42. {docpluck-2.4.43 → docpluck-2.4.45}/.claude/skills/docpluck-cleanup/SKILL.md +0 -0
  43. {docpluck-2.4.43 → docpluck-2.4.45}/.claude/skills/docpluck-deploy/SKILL.md +0 -0
  44. {docpluck-2.4.43 → docpluck-2.4.45}/.claude/skills/docpluck-iterate/SKILL.md +0 -0
  45. {docpluck-2.4.43 → docpluck-2.4.45}/.claude/skills/docpluck-iterate/references/ai-full-doc-verify.md +0 -0
  46. {docpluck-2.4.43 → docpluck-2.4.45}/.claude/skills/docpluck-iterate/references/cycle-report-template.md +0 -0
  47. {docpluck-2.4.43 → docpluck-2.4.45}/.claude/skills/docpluck-iterate/references/local-verification.md +0 -0
  48. {docpluck-2.4.43 → docpluck-2.4.45}/.claude/skills/docpluck-iterate/references/rationalizations.md +0 -0
  49. {docpluck-2.4.43 → docpluck-2.4.45}/.claude/skills/docpluck-iterate/references/real-library-real-pdf.md +0 -0
  50. {docpluck-2.4.43 → docpluck-2.4.45}/.claude/skills/docpluck-iterate/references/release-flow.md +0 -0
  51. {docpluck-2.4.43 → docpluck-2.4.45}/.claude/skills/docpluck-iterate/references/self-improvement.md +0 -0
  52. {docpluck-2.4.43 → docpluck-2.4.45}/.claude/skills/docpluck-iterate/references/three-tier-parity.md +0 -0
  53. {docpluck-2.4.43 → docpluck-2.4.45}/.claude/skills/docpluck-qa/SKILL.md +0 -0
  54. {docpluck-2.4.43 → docpluck-2.4.45}/.claude/skills/docpluck-qa/references/benchmark-mode.md +0 -0
  55. {docpluck-2.4.43 → docpluck-2.4.45}/.claude/skills/docpluck-qa/references/check-11-hard-rules.md +0 -0
  56. {docpluck-2.4.43 → docpluck-2.4.45}/.claude/skills/docpluck-qa/references/check-13-escicheck-production.md +0 -0
  57. {docpluck-2.4.43 → docpluck-2.4.45}/.claude/skills/docpluck-qa/references/check-5-escicheck-library.md +0 -0
  58. {docpluck-2.4.43 → docpluck-2.4.45}/.claude/skills/docpluck-qa/references/check-6-escicheck-local-webapp.md +0 -0
  59. {docpluck-2.4.43 → docpluck-2.4.45}/.claude/skills/docpluck-qa/references/check-7-batch-smoke.md +0 -0
  60. {docpluck-2.4.43 → docpluck-2.4.45}/.claude/skills/docpluck-review/SKILL.md +0 -0
  61. {docpluck-2.4.43 → docpluck-2.4.45}/.github/workflows/bump-app-pin.yml +0 -0
  62. {docpluck-2.4.43 → docpluck-2.4.45}/.github/workflows/publish.yml +0 -0
  63. {docpluck-2.4.43 → docpluck-2.4.45}/.github/workflows/test.yml +0 -0
  64. {docpluck-2.4.43 → docpluck-2.4.45}/.gitignore +0 -0
  65. {docpluck-2.4.43 → docpluck-2.4.45}/CLAUDE.md +0 -0
  66. {docpluck-2.4.43 → docpluck-2.4.45}/HANDOFF_SECTIONS_APP_INTEGRATION.md +0 -0
  67. {docpluck-2.4.43 → docpluck-2.4.45}/LESSONS.md +0 -0
  68. {docpluck-2.4.43 → docpluck-2.4.45}/LICENSE +0 -0
  69. {docpluck-2.4.43 → docpluck-2.4.45}/REPLY_FROM_DOCPLUCK_v1.4.5.md +0 -0
  70. {docpluck-2.4.43 → docpluck-2.4.45}/REPLY_FROM_DOCPLUCK_v1.5.0.md +0 -0
  71. {docpluck-2.4.43 → docpluck-2.4.45}/REQUEST_08_CHUNKING_ENDPOINT.md +0 -0
  72. {docpluck-2.4.43 → docpluck-2.4.45}/REQUEST_09_REFERENCE_LIST_NORMALIZATION.md +0 -0
  73. {docpluck-2.4.43 → docpluck-2.4.45}/TODO.md +0 -0
  74. {docpluck-2.4.43 → docpluck-2.4.45}/docpluck/__main__.py +0 -0
  75. {docpluck-2.4.43 → docpluck-2.4.45}/docpluck/batch.py +0 -0
  76. {docpluck-2.4.43 → docpluck-2.4.45}/docpluck/cli.py +0 -0
  77. {docpluck-2.4.43 → docpluck-2.4.45}/docpluck/extract.py +0 -0
  78. {docpluck-2.4.43 → docpluck-2.4.45}/docpluck/extract_docx.py +0 -0
  79. {docpluck-2.4.43 → docpluck-2.4.45}/docpluck/extract_html.py +0 -0
  80. {docpluck-2.4.43 → docpluck-2.4.45}/docpluck/extract_layout.py +0 -0
  81. {docpluck-2.4.43 → docpluck-2.4.45}/docpluck/extract_structured.py +0 -0
  82. {docpluck-2.4.43 → docpluck-2.4.45}/docpluck/figures/__init__.py +0 -0
  83. {docpluck-2.4.43 → docpluck-2.4.45}/docpluck/figures/detect.py +0 -0
  84. {docpluck-2.4.43 → docpluck-2.4.45}/docpluck/sections/__init__.py +0 -0
  85. {docpluck-2.4.43 → docpluck-2.4.45}/docpluck/sections/annotators/__init__.py +0 -0
  86. {docpluck-2.4.43 → docpluck-2.4.45}/docpluck/sections/annotators/docx.py +0 -0
  87. {docpluck-2.4.43 → docpluck-2.4.45}/docpluck/sections/annotators/html.py +0 -0
  88. {docpluck-2.4.43 → docpluck-2.4.45}/docpluck/sections/annotators/pdf.py +0 -0
  89. {docpluck-2.4.43 → docpluck-2.4.45}/docpluck/sections/annotators/text.py +0 -0
  90. {docpluck-2.4.43 → docpluck-2.4.45}/docpluck/sections/blocks.py +0 -0
  91. {docpluck-2.4.43 → docpluck-2.4.45}/docpluck/sections/boundaries.py +0 -0
  92. {docpluck-2.4.43 → docpluck-2.4.45}/docpluck/sections/core.py +0 -0
  93. {docpluck-2.4.43 → docpluck-2.4.45}/docpluck/sections/taxonomy.py +0 -0
  94. {docpluck-2.4.43 → docpluck-2.4.45}/docpluck/sections/types.py +0 -0
  95. {docpluck-2.4.43 → docpluck-2.4.45}/docpluck/tables/__init__.py +0 -0
  96. {docpluck-2.4.43 → docpluck-2.4.45}/docpluck/tables/bbox_utils.py +0 -0
  97. {docpluck-2.4.43 → docpluck-2.4.45}/docpluck/tables/camelot_extract.py +0 -0
  98. {docpluck-2.4.43 → docpluck-2.4.45}/docpluck/tables/captions.py +0 -0
  99. {docpluck-2.4.43 → docpluck-2.4.45}/docpluck/tables/cluster.py +0 -0
  100. {docpluck-2.4.43 → docpluck-2.4.45}/docpluck/tables/confidence.py +0 -0
  101. {docpluck-2.4.43 → docpluck-2.4.45}/docpluck/tables/detect.py +0 -0
  102. {docpluck-2.4.43 → docpluck-2.4.45}/docpluck/tables/render.py +0 -0
  103. {docpluck-2.4.43 → docpluck-2.4.45}/docpluck/tables/whitespace.py +0 -0
  104. {docpluck-2.4.43 → docpluck-2.4.45}/docpluck/version.py +0 -0
  105. {docpluck-2.4.43 → docpluck-2.4.45}/docs/BENCHMARKS.md +0 -0
  106. {docpluck-2.4.43 → docpluck-2.4.45}/docs/DESIGN.md +0 -0
  107. {docpluck-2.4.43 → docpluck-2.4.45}/docs/HANDOFF_2026-05-07_sections_strict_iteration.md +0 -0
  108. {docpluck-2.4.43 → docpluck-2.4.45}/docs/HANDOFF_2026-05-09_session_state_and_followups.md +0 -0
  109. {docpluck-2.4.43 → docpluck-2.4.45}/docs/HANDOFF_2026-05-09_unified_extraction_brainstorm.md +0 -0
  110. {docpluck-2.4.43 → docpluck-2.4.45}/docs/HANDOFF_2026-05-10_table_rendering_iteration.md +0 -0
  111. {docpluck-2.4.43 → docpluck-2.4.45}/docs/HANDOFF_2026-05-10_table_rendering_iteration_2.md +0 -0
  112. {docpluck-2.4.43 → docpluck-2.4.45}/docs/HANDOFF_2026-05-10_table_rendering_iteration_3.md +0 -0
  113. {docpluck-2.4.43 → docpluck-2.4.45}/docs/HANDOFF_2026-05-10_table_rendering_iteration_4.md +0 -0
  114. {docpluck-2.4.43 → docpluck-2.4.45}/docs/HANDOFF_2026-05-10_table_rendering_iteration_5.md +0 -0
  115. {docpluck-2.4.43 → docpluck-2.4.45}/docs/HANDOFF_2026-05-10_table_rendering_iteration_6.md +0 -0
  116. {docpluck-2.4.43 → docpluck-2.4.45}/docs/HANDOFF_2026-05-10_table_rendering_iteration_7.md +0 -0
  117. {docpluck-2.4.43 → docpluck-2.4.45}/docs/HANDOFF_2026-05-11_PROMOTE_SPIKE_TO_LIBRARY.md +0 -0
  118. {docpluck-2.4.43 → docpluck-2.4.45}/docs/HANDOFF_2026-05-11_table_rendering_iteration_8.md +0 -0
  119. {docpluck-2.4.43 → docpluck-2.4.45}/docs/HANDOFF_2026-05-11_visual_review_findings.md +0 -0
  120. {docpluck-2.4.43 → docpluck-2.4.45}/docs/HANDOFF_2026-05-12_phase2_101pdf_corpus.md +0 -0
  121. {docpluck-2.4.43 → docpluck-2.4.45}/docs/HANDOFF_2026-05-12_remaining_ui_and_chrome_verification.md +0 -0
  122. {docpluck-2.4.43 → docpluck-2.4.45}/docs/HANDOFF_2026-05-12_visual_verify_results.md +0 -0
  123. {docpluck-2.4.43 → docpluck-2.4.45}/docs/HANDOFF_2026-05-13_apa_50_expansion.md +0 -0
  124. {docpluck-2.4.43 → docpluck-2.4.45}/docs/HANDOFF_2026-05-13_apa_50_expansion_iter_1.md +0 -0
  125. {docpluck-2.4.43 → docpluck-2.4.45}/docs/HANDOFF_2026-05-13_apa_50_expansion_iter_2.md +0 -0
  126. {docpluck-2.4.43 → docpluck-2.4.45}/docs/HANDOFF_2026-05-13_iterate_skill_first_use.md +0 -0
  127. {docpluck-2.4.43 → docpluck-2.4.45}/docs/HANDOFF_2026-05-13_iterative_1.md +0 -0
  128. {docpluck-2.4.43 → docpluck-2.4.45}/docs/HANDOFF_2026-05-13_iterative_library_improvement.md +0 -0
  129. {docpluck-2.4.43 → docpluck-2.4.45}/docs/HANDOFF_2026-05-13_table_extraction_next_iteration.md +0 -0
  130. {docpluck-2.4.43 → docpluck-2.4.45}/docs/HANDOFF_2026-05-14_continue_iterations_v2_4_30_to_15n.md +0 -0
  131. {docpluck-2.4.43 → docpluck-2.4.45}/docs/HANDOFF_2026-05-14_full_corpus_iteration_v2_4_30.md +0 -0
  132. {docpluck-2.4.43 → docpluck-2.4.45}/docs/HANDOFF_2026-05-14_iterate_6_cycles_complete.md +0 -0
  133. {docpluck-2.4.43 → docpluck-2.4.45}/docs/HANDOFF_2026-05-14_iterate_9_cycle_run.md +0 -0
  134. {docpluck-2.4.43 → docpluck-2.4.45}/docs/HANDOFF_2026-05-14_iterate_resume_4_cycles.md +0 -0
  135. {docpluck-2.4.43 → docpluck-2.4.45}/docs/HANDOFF_2026-05-14_iterate_v2_4_31_cycle_15n.md +0 -0
  136. {docpluck-2.4.43 → docpluck-2.4.45}/docs/HANDOFF_2026-05-14_phase_5d_gold_audit_v2_4_29.md +0 -0
  137. {docpluck-2.4.43 → docpluck-2.4.45}/docs/HANDOFF_2026-05-15_autonomous_apa_first_10h.md +0 -0
  138. {docpluck-2.4.43 → docpluck-2.4.45}/docs/HANDOFF_2026-05-15_iterate_apa_run_1.md +0 -0
  139. {docpluck-2.4.43 → docpluck-2.4.45}/docs/HANDOFF_2026-05-16_iterate_apa_run_2.md +0 -0
  140. {docpluck-2.4.43 → docpluck-2.4.45}/docs/HANDOFF_2026-05-16_iterate_apa_run_3.md +0 -0
  141. {docpluck-2.4.43 → docpluck-2.4.45}/docs/LIBRARY_APP_SYNC.md +0 -0
  142. {docpluck-2.4.43 → docpluck-2.4.45}/docs/NORMALIZATION.md +0 -0
  143. {docpluck-2.4.43 → docpluck-2.4.45}/docs/README.md +0 -0
  144. {docpluck-2.4.43 → docpluck-2.4.45}/docs/TRIAGE_2026-05-10_corpus_assessment.md +0 -0
  145. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/2026-05-06-section-identification.md +0 -0
  146. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/2026-05-06-table-extraction.md +0 -0
  147. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/2026-05-07-sections-strict-iteration-progress.md +0 -0
  148. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/2026-05-08-unified-extraction-phase-0-splice-spike.md +0 -0
  149. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/sections-deferred-items.md +0 -0
  150. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/sections-issues-backlog.md +0 -0
  151. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/2026-05-07_spot-01_apa.md +0 -0
  152. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/2026-05-07_spot-02_pattern-A-shipped.md +0 -0
  153. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/2026-05-08_spot-final_all-styles.md +0 -0
  154. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/COMPARISON.md +0 -0
  155. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/korbmacher_table1.md +0 -0
  156. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/option-a.py +0 -0
  157. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/ziano_table1.md +0 -0
  158. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_notes_raw.txt +0 -0
  159. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_table1.md +0 -0
  160. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/notes.md +0 -0
  161. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/option-b.py +0 -0
  162. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_notes_raw.txt +0 -0
  163. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_table1.md +0 -0
  164. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/korbmacher_table1.md +0 -0
  165. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/notes.md +0 -0
  166. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/option-c.py +0 -0
  167. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/sample-pdftotext-bbox.html +0 -0
  168. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/ziano_table1.md +0 -0
  169. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/korbmacher_table1.md +0 -0
  170. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/notes.md +0 -0
  171. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/option-d.py +0 -0
  172. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/ziano_table1.md +0 -0
  173. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_2022_kruger_bbox.html +0 -0
  174. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_bbox.html +0 -0
  175. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_table1.md +0 -0
  176. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/option-e.py +0 -0
  177. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/sample-bbox.html +0 -0
  178. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_2021_joep_bbox.html +0 -0
  179. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_bbox.html +0 -0
  180. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_table1.md +0 -0
  181. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/html-fallback-demo.md +0 -0
  182. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.err +0 -0
  183. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.md +0 -0
  184. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.err +0 -0
  185. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.md +0 -0
  186. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.err +0 -0
  187. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.md +0 -0
  188. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.err +0 -0
  189. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.md +0 -0
  190. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.err +0 -0
  191. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.md +0 -0
  192. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.err +0 -0
  193. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.md +0 -0
  194. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.err +0 -0
  195. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.md +0 -0
  196. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.err +0 -0
  197. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.md +0 -0
  198. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.err +0 -0
  199. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.md +0 -0
  200. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.err +0 -0
  201. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.md +0 -0
  202. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.err +0 -0
  203. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.md +0 -0
  204. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.err +0 -0
  205. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.md +0 -0
  206. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.err +0 -0
  207. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.md +0 -0
  208. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.err +0 -0
  209. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.md +0 -0
  210. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.err +0 -0
  211. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.md +0 -0
  212. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.err +0 -0
  213. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.md +0 -0
  214. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.err +0 -0
  215. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.md +0 -0
  216. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.err +0 -0
  217. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.md +0 -0
  218. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.err +0 -0
  219. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.md +0 -0
  220. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.err +0 -0
  221. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.md +0 -0
  222. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.err +0 -0
  223. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.md +0 -0
  224. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.err +0 -0
  225. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.md +0 -0
  226. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.err +0 -0
  227. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.md +0 -0
  228. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.err +0 -0
  229. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.md +0 -0
  230. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.err +0 -0
  231. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.md +0 -0
  232. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.err +0 -0
  233. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.md +0 -0
  234. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/papers.md +0 -0
  235. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/report.md +0 -0
  236. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/splice_spike.py +0 -0
  237. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/test_splice_spike.py +0 -0
  238. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/specs/2026-04-27-request-09-reference-normalization-design.md +0 -0
  239. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/specs/2026-05-06-section-identification-design.md +0 -0
  240. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/specs/2026-05-06-table-extraction-design.md +0 -0
  241. {docpluck-2.4.43 → docpluck-2.4.45}/docs/superpowers/specs/2026-05-08-unified-extraction-design.md +0 -0
  242. {docpluck-2.4.43 → docpluck-2.4.45}/scripts/lint_rendered_corpus.py +0 -0
  243. {docpluck-2.4.43 → docpluck-2.4.45}/scripts/verify_corpus.py +0 -0
  244. {docpluck-2.4.43 → docpluck-2.4.45}/scripts/verify_corpus_full.py +0 -0
  245. {docpluck-2.4.43 → docpluck-2.4.45}/tests/__init__.py +0 -0
  246. {docpluck-2.4.43 → docpluck-2.4.45}/tests/conftest.py +0 -0
  247. {docpluck-2.4.43 → docpluck-2.4.45}/tests/fixtures/__init__.py +0 -0
  248. {docpluck-2.4.43 → docpluck-2.4.45}/tests/fixtures/sections/__init__.py +0 -0
  249. {docpluck-2.4.43 → docpluck-2.4.45}/tests/fixtures/sections/builders.py +0 -0
  250. {docpluck-2.4.43 → docpluck-2.4.45}/tests/fixtures/structured/.gitkeep +0 -0
  251. {docpluck-2.4.43 → docpluck-2.4.45}/tests/fixtures/structured/MANIFEST.json +0 -0
  252. {docpluck-2.4.43 → docpluck-2.4.45}/tests/fixtures/structured/README.md +0 -0
  253. {docpluck-2.4.43 → docpluck-2.4.45}/tests/golden/sections/html_real_headings.json +0 -0
  254. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_a3c_leading_zero_decimal_real_pdf.py +0 -0
  255. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_all_caps_section_promote_real_pdf.py +0 -0
  256. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_bbox_utils.py +0 -0
  257. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_benchmark_docx_html.py +0 -0
  258. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_cambridge_footer_strip_real_pdf.py +0 -0
  259. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_caption_regex.py +0 -0
  260. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_chart_data_trim_real_pdf.py +0 -0
  261. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_cid_minus_recovery_real_pdf.py +0 -0
  262. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_cli_sections.py +0 -0
  263. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_cli_structured.py +0 -0
  264. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_confidence.py +0 -0
  265. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_corpus_smoke.py +0 -0
  266. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_d5_normalization_audit.py +0 -0
  267. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_edge_cases.py +0 -0
  268. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_elsevier_footer_strip_real_pdf.py +0 -0
  269. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_equation_page_header_strip_real_pdf.py +0 -0
  270. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_extract_docx.py +0 -0
  271. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_extract_filter_sugar.py +0 -0
  272. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_extract_html.py +0 -0
  273. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_extract_layout.py +0 -0
  274. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_extract_pdf_structured.py +0 -0
  275. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_extraction.py +0 -0
  276. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_f0_table_region_aware.py +0 -0
  277. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_figure_caption_trim_real_pdf.py +0 -0
  278. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_figure_detect.py +0 -0
  279. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_fixtures_manifest.py +0 -0
  280. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_lattice_cluster.py +0 -0
  281. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_letterspaced_label_real_pdf.py +0 -0
  282. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_lt_operator_recovery_real_pdf.py +0 -0
  283. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_mathitalic_greek_real_pdf.py +0 -0
  284. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_metaesci_followups.py +0 -0
  285. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_minus_sign_recovery_real_pdf.py +0 -0
  286. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_normalization.py +0 -0
  287. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_normalize_a3_r2_body_integer_real_pdf.py +0 -0
  288. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_normalize_f0_footnote_strip.py +0 -0
  289. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_normalize_layout_param.py +0 -0
  290. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_normalize_metadata_leak_real_pdf.py +0 -0
  291. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_normalize_report_layout_fields.py +0 -0
  292. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_normalize_v18_strips.py +0 -0
  293. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_numbered_section_promotion_real_pdf.py +0 -0
  294. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_orphan_section_number_real_pdf.py +0 -0
  295. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_preserve_math_glyphs_real_pdf.py +0 -0
  296. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_quality.py +0 -0
  297. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_render_html.py +0 -0
  298. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_request_09_reference_normalization.py +0 -0
  299. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_roman_numeral_section_promote_real_pdf.py +0 -0
  300. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_section_row_label_no_merge_real_pdf.py +0 -0
  301. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_sections_boundaries.py +0 -0
  302. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_sections_boundary_truncation.py +0 -0
  303. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_sections_core_partition.py +0 -0
  304. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_sections_docx_annotator.py +0 -0
  305. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_sections_extract_text.py +0 -0
  306. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_sections_footnote_section.py +0 -0
  307. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_sections_golden.py +0 -0
  308. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_sections_html_annotator.py +0 -0
  309. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_sections_pdf_annotator.py +0 -0
  310. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_sections_public_api.py +0 -0
  311. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_sections_real_corpus.py +0 -0
  312. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_sections_taxonomy.py +0 -0
  313. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_sections_text_annotator.py +0 -0
  314. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_sections_types.py +0 -0
  315. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_sections_unit_corpus.py +0 -0
  316. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_sections_v161_coalesce.py +0 -0
  317. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_sections_v161_subheadings.py +0 -0
  318. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_sections_v161_taxonomy.py +0 -0
  319. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_sections_v161_text_annotator.py +0 -0
  320. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_sections_version.py +0 -0
  321. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_smoke_fixtures.py +0 -0
  322. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_structured_result_type.py +0 -0
  323. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_structured_types.py +0 -0
  324. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_structured_version.py +0 -0
  325. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_table_caption_cell_region_real_pdf.py +0 -0
  326. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_table_detect.py +0 -0
  327. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_tables_cell_cleaning.py +0 -0
  328. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_text_mode.py +0 -0
  329. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_v23_1_fixes.py +0 -0
  330. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_v23_bug_fixes.py +0 -0
  331. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_v23_post_corpus.py +0 -0
  332. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_v23_post_corpus_v2.py +0 -0
  333. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_v2_backwards_compat.py +0 -0
  334. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_v2_top_level_exports.py +0 -0
  335. {docpluck-2.4.43 → docpluck-2.4.45}/tests/test_whitespace_cluster.py +0 -0
@@ -182,3 +182,19 @@ Plus three golden snapshot files (`tests/golden/sections/*.json`) had the versio
182
182
  **Why:** When the section partitioner fails to recognise a heading, it leaves the heading text on its own line but does NOT blank-separate it from the surrounding section body. So a demoted heading looks like: `<last line of prev section>` / `2. Omission neglect` / `<first line of this section>`.
183
183
 
184
184
  **How to detect (next time):** Never gate heading-detection/promotion on blank-line isolation — demoted headings are wedged in prose. Use a different discriminator: for a numbered heading, "not adjacent to a sibling `N.` line" distinguishes a heading-before-section-body from a list-item-in-a-list. For single-level numbered promotion specifically, layer multiple independent gates (document-numbering-range, number-uniqueness, list-adjacency, terminal-punctuation, lowercase-run) so an enumerated list is rejected by several of them at once — defense in depth keeps a wide-false-positive-surface fix safe.
185
+
186
+ ## 2026-05-16 · Cycle 12 — don't add a normalize step that duplicates an existing one; localize the defect channel first (v2.4.44)
187
+
188
+ **What:** Latin typographic ligatures (`ﬀ ﬁ ﬂ ﬃ ﬄ ﬅ ﬆ`, U+FB00-FB06) rendered verbatim — `conﬁdent`, `inﬂuence` — in 35 corpus `.md` files. The first cycle-12 attempt added a new `decompose_ligatures` helper and called it EARLY in `normalize_text`, not noticing `normalize_text` already had an `S3_ligature_expansion` step (FB00-FB04). The early call consumed every ligature before S3 ran, so S3 tracked `ligatures_expanded = 0` and `test_report_tracks_changes` broke. Worse, the body channel was never the problem: a channel check showed all 35 papers' ligatures sat in table cells / figure-table captions / `unstructured-table` fences — channels that bypass `normalize_text` entirely. The rework removed the duplicate call, unified S3 to call the shared helper (full FB00-FB06 block via an explicit ASCII table — NFKC of `ﬅ` yields a non-ASCII long-s), and kept the genuinely-new `cell_cleaning` + render-post-process calls.
189
+
190
+ **Why:** Two failure modes compounded. (1) A new normalize helper added without grepping the existing `normalize_text` S-steps duplicated S3 and, placed before it, starved it. (2) The cycle was scoped from a symptom ("35 papers show ligatures") without localizing WHICH channel was at fault — the body channel was already correct.
191
+
192
+ **How to detect (next time):** Before adding any glyph/encoding helper to `normalize.py`, grep the existing `S0`-`S9` / `W0*` steps for one already handling that character class — extend/unify it rather than adding a parallel path, and never insert a new step *before* an existing one that consumes the same input. Before scoping a glyph cycle, localize the defect: grep the offending glyph's lines in a recent render and confirm whether they sit in `<td>`/`<th>`/`*Table N*`/```unstructured-table``` (table/caption/fence channels — bypass `normalize_text`) or in body prose (the S-step channel).
193
+
194
+ ## 2026-05-16 · Cycle 13 — a heuristic guard's value depends on the false-positive surface, which differs per call site (v2.4.45)
195
+
196
+ **What:** `render.py`'s two numbered-heading promoters shared a `max_lc_run >= 5` "long lowercase-word run" prose guard. It demoted legitimate descriptive headings — jdm_.2023.16 had 19 multi-level numbered subsection headings rendered as body text, with lowercase-runs up to 12 (`3.3.2.1. The quality of planning on the previous trial moderates the effect of reflection`). The fix removed the guard ENTIRELY from `_promote_numbered_subsection_headings` but KEPT it (raised 5→8) in `_promote_numbered_section_headings`.
197
+
198
+ **Why:** A lowercase-word-run count genuinely cannot distinguish a descriptive section heading from prose — both have many lowercase words. What makes a line a heading is the *number shape* + capital-start + no-terminal-punctuation + single short line. For **multi-level** dotted numbering (`N.N[.N…]`) that signature is decisive — a prose line almost never begins with a multi-level dotted number — so the lc-run guard was pure harm. For **single-level** `N.` numbering the signature is weak (a `2.` line collides with an enumerated-list item), so a prose guard there still adds value as defense-in-depth. Same guard, opposite verdicts, because the false-positive surface differs between the two call sites.
199
+
200
+ **How to detect (next time):** When a heuristic guard rejects legitimate inputs, do not just retune its threshold — ask whether the guard discriminates at all at that call site. Reproduce at HEAD and measure the metric's spread on real positives (here: heading lowercase-runs ran 0-12, overlapping prose entirely → no threshold works). If a guard can't separate the classes, remove it where the *other* gates already suffice and keep it only where they don't. When a guard is removed, grep its tests — a contract test (`test_render.py::test_promote_rejects_prose_with_long_lowercase_run`) was asserting the removed behavior and had to be updated in the same cycle.
@@ -503,3 +503,47 @@ Mid-run, ArticleFinder flagged (and the user confirmed as a directive) that docp
503
503
 
504
504
  ### SPINE-SKIPs
505
505
  - R3 (`/docpluck-cleanup` + `/docpluck-review`) — SKIPPED. Cycle 11 is one new render post-processor, gated by 5 conjunctive safety checks; 26/26 baseline + AI-gold verifier confirm 0 false positives. Same shape as cycles 1-10.
506
+
507
+ ---
508
+
509
+ ## Run: 2026-05-16 (autonomous APA-first run, session 3) · Cycle 12 · v2.4.44
510
+
511
+ > **Reworked in run 4 (2026-05-16).** The session-3 cycle-12 attempt was broken — it duplicated the pre-existing S3 step and was never committed. The entry below describes the *reworked, shipped* cycle 12.
512
+
513
+ ### Outcome
514
+ - **Cycle 12 shipped v2.4.44** — Latin typographic ligatures (U+FB00-FB06: ﬀ ﬁ ﬂ ﬃ ﬄ ﬅ ﬆ) leaked verbatim in the **table-cell, figure/table-caption, and `unstructured-table`-fence channels**. The body channel's `normalize.py` S3 step already expanded ligatures correctly; those three channels bypass `normalize_text`. `normalize.py::decompose_ligatures` is now the single shared helper for the full U+FB00-FB06 block (explicit ASCII table), called from all three channels (S3 body / `cell_cleaning._html_escape` / `render_pdf_to_markdown` post-process). jdm_m2/korbmacher/jdm16 → 0 residual ligatures. 11 tests.
515
+
516
+ ### Blind spots / process notes
517
+ - **The session-3 cycle-12 attempt duplicated an existing step.** It added a NEW `decompose_ligatures` call EARLY in `normalize_text` — before the pre-existing `S3_ligature_expansion` step (FB00-FB04). The early call consumed every ligature, so S3 tracked `ligatures_expanded = 0` and `test_report_tracks_changes` broke. **Lesson: before adding a glyph-normalization helper, grep the existing `normalize_text` S-steps for one already handling that glyph class — extend/unify it, never add a parallel path. The rework removed the duplicate call and unified S3 to call the shared helper.**
518
+ - **Verify the body channel is actually broken before "fixing" it.** The cycle was triggered by 35 rendered papers showing raw ligatures — but the body channel was fine; the 35 papers' ligatures were in table cells / captions / fences. A 2-minute check (grep the ligature lines in a recent render, look at whether they sit in `<td>`/`<th>`/`*Table N*`/```unstructured-table``` vs body prose) localizes the defect to the right channel before any code is written.
519
+ - **Explicit ASCII table, not scoped NFKC.** NFKC of `ﬅ` (U+FB05) yields `ſt` with a non-ASCII LONG S — so a per-char NFKC pass does not actually guarantee the ASCII output the docstring promises. An explicit 7-entry table (`ﬅ/ﬆ→st`) does, and matches the existing S3 code style.
520
+ - **The 3-channel glyph pattern, 5th application.** Cycles 2/4/6/7/12 all needed the shared-helper-at-3-chokepoints treatment. The cycle-6 PROPOSED AMENDMENT (still pending user review) is now backed by 5 cycles of evidence — it should be promoted into SKILL.md Phase 4.
521
+
522
+ ### SPINE-SKIPs
523
+ - R3 (`/docpluck-cleanup` + `/docpluck-review`) — SKIPPED. Cycle 12 is one normalize helper (explicit table over a 7-codepoint block) + S3 unified to call it + 2 bypass-channel call sites; 26/26 baseline + AI verifier confirm no regression. Same shape as cycles 2/4/6/7.
524
+
525
+ ---
526
+
527
+ ## Run: 2026-05-16 (run 4, fix-and-continue) · Cycles: cycle-12 rework, tests-regen, cycle 13
528
+
529
+ This run executed `docs/HANDOFF_2026-05-16_iterate_run_4_fix_and_continue.md`'s three jobs. Cycle-12 rework + tests-regen + cycle 13 below; the article-finder AI-gold integration (JOB 2) is tracked in the run-meta.
530
+
531
+ ### tests-regen (commit `c831e28`, no version bump)
532
+ - 15 pre-existing pytest failures triaged. 12 `test_extract_pdf_byte_identical` snapshots + 2 `test_sections_golden` goldens = environmental drift (local pdftotext re-wraps lines differently than the build that captured the snapshots; `extract_pdf` is a pure pdftotext passthrough). Regenerated; the 26-paper baseline is the real extraction-quality gate and stays green.
533
+ - **The 15th, `test_request_09`, is NOT snapshot drift** — it is a real COL-class column-interleave defect: the numbered RSOS bibliography renders as `References\n1. 2. 3. ... 16.\n\nThaler RH...` (the number column split from the entry text). Left red and documented as the escalated COL defect class. Lesson: when a handoff lumps failures as "all snapshot drift," still inspect each — a real-defect-detecting test must never be "regenerated" away.
534
+
535
+ ### Cycle 13 (v2.4.45) — G5b long-descriptive numbered headings demoted
536
+
537
+ ### Outcome
538
+ - **Cycle 13 shipped v2.4.45** — `render.py`'s numbered-heading promoters carried a `max_lc_run >= 5` prose guard that demoted legitimate long descriptive headings. Removed the guard entirely from `_promote_numbered_subsection_headings`; raised it `5→8` in `_promote_numbered_section_headings`. jdm_.2023.16: 19 multi-level subsection headings recovered.
539
+
540
+ ### Blind spots / process notes
541
+ - **The TRIAGE estimate ("raise 5→8") was a partial fix.** Reproducing at HEAD showed jdm16 headings with `max_lc` up to 12 — a `5→8` raise would have left 7 of 19 still demoted. The lesson card `reproduce-triage-defect-at-head-before-trusting-cost-estimate` paid off again: always reproduce and measure before trusting a queue item's prescribed fix. The lc-run count genuinely cannot distinguish a 12-lowercase-word descriptive heading from prose — for multi-level dotted numbering the *number shape* is the discriminator, so the guard had to go, not just move.
542
+ - **A guard worth keeping for one promoter, not the other.** Single-level `N.` numbers collide with enumerated lists (real false-positive risk) → keep a prose guard (raised to 8) as defense-in-depth. Multi-level `N.N[.N…]` numbers do not → the guard was pure harm. Same-named guard, opposite verdicts, because the false-positive surface differs.
543
+ - **A contract test encoded the removed guard.** `test_render.py::test_promote_rejects_prose_with_long_lowercase_run` asserted the old behavior; updated it to assert the new contract (long descriptive titles ARE promoted) in the same cycle — per the cycle-2 `a test can encode the bug` lesson.
544
+
545
+ ### SPINE-SKIPs
546
+ - R3 (`/docpluck-cleanup` + `/docpluck-review`) — SKIPPED. Cycle 13 is a guard removal + one threshold bump in two render post-processors; 26/26 baseline + heading-promotion-only diff confirm no regression. Same shape as cycles 9/11.
547
+
548
+ ### Process note — Codex cross-model verification has a Windows UTF-8 bug
549
+ The `gold-generation.md` Step-4 Codex audit misreads UTF-8 gold files as mojibake on this Windows machine (`Västfjäll`→`VA<SI>stfjA<SI>ll`, `–`→`ƒ?"`), producing ~10-24 false "discrepancies" per paper. The gold files are confirmed clean UTF-8. Worked around by re-running Codex with an explicit "files are UTF-8; mojibake is your decode error, not a discrepancy" preamble. **This is article-finder's protocol to fix** — `gold-generation.md` Step 4 needs a UTF-8 read instruction for Windows. Flagged for coordination with the article-finder owner.
@@ -1,5 +1,27 @@
1
1
  # Changelog
2
2
 
3
+ ## [2.4.45] — 2026-05-16
4
+
5
+ **Cycle 13 (autonomous APA-first run) — long descriptive numbered headings demoted to body text (G5b, S1).** `render.py`'s numbered-heading promoters carried a "long lowercase-word run" prose guard (`max_lc_run >= 5`) that rejected legitimate descriptive headings — e.g. `2.4.2.2. Inference of planning strategies and strategy types`, `3.3.2.1. The quality of planning on the previous trial moderates the effect of reflection`. jdm_.2023.16 alone had 19 multi-level numbered subsection headings demoted to body text.
6
+
7
+ Fix (v2.4.45) — the lowercase-run guard is **removed from `_promote_numbered_subsection_headings`**: multi-level dotted numbering at line-start is itself a strong section-heading signal (combined with capital-started title + no terminal sentence punctuation + single ≤80-char line), and descriptive subsection titles legitimately run to many lowercase words, so the guard could not distinguish a real heading from prose and only mis-rejected headings. For `_promote_numbered_section_headings` (single-level `N.`, which genuinely collides with enumerated lists) the guard is **kept but raised `5 → 8`** — single-level promotion still has its document-numbering-range / uniqueness / list-adjacency gates as defense in depth.
8
+
9
+ jdm_.2023.16: 19 previously-demoted multi-level headings now render as `###`; the v2.4.44→v2.4.45 diff is heading-promotion only (0 text loss, 0 hallucination). 26/26 baseline PASS. New real-PDF + contract tests in `tests/test_numbered_heading_promotion_real_pdf.py` and `tests/test_render.py`.
10
+
11
+ ~11 APA papers still FAIL Phase-5d verification; the autonomous run continues.
12
+
13
+ ## [2.4.44] — 2026-05-16
14
+
15
+ **Cycle 12 (autonomous APA-first run) — Latin typographic ligatures not decomposed in the table/caption channels (GLYPH, S2).** pdftotext preserves presentation-form ligature glyphs (`ﬀ ﬁ ﬂ ﬃ ﬄ ﬅ ﬆ`, U+FB00-FB06) verbatim, so words rendered as `conﬁdent` / `inﬂuence` / `eﬃcient` — broken for search, word matching, and any downstream NLP. A corpus scan found the glyphs in 35 rendered papers (korbmacher 82×, jdm_.2023.16 34×, jdm_m.2022.2 8×). The body channel's `normalize.py` S3 step already expanded ligatures correctly; the leak was confined to **table cells, figure/table captions, and `unstructured-table` fenced blocks**, which bypass `normalize_text` entirely.
16
+
17
+ Fix (v2.4.44) — `normalize.py::decompose_ligatures` is now the single shared helper for the full U+FB00-FB06 block, mapping each glyph to ASCII via an explicit table (`ﬁ→fi`, `ﬂ→fl`, `ﬃ→ffi`, `ﬄ→ffl`, `ﬀ→ff`, `ﬅ/ﬆ→st`). An explicit table is used rather than a scoped NFKC pass because NFKC of `ﬅ` (U+FB05) yields `ſt` with a non-ASCII LONG S. The body channel's S3 step calls the helper (and so gains `ﬅ/ﬆ` coverage); `cell_cleaning._html_escape` (table cells) and the `render_pdf_to_markdown` post-process (captions, `unstructured-table` fences, raw_text fallbacks) call it too — the established three-channel glyph-fix pattern.
18
+
19
+ Verified across 3 papers: jdm_m.2022.2, korbmacher, jdm_.2023.16 — all now render 0 residual ligature glyphs (was 8 / 82 / 34); `conﬁdent`→`confident`. Superscripts and plain text untouched; the S3 body step still tracks `ligatures_expanded`. 26/26 baseline PASS. 11 tests in `tests/test_ligature_decomposition_real_pdf.py`.
20
+
21
+ `NORMALIZATION_VERSION` 1.9.7 → 1.9.8.
22
+
23
+ ~12 APA papers still FAIL Phase-5d verification; the autonomous run continues.
24
+
3
25
  ## [2.4.43] — 2026-05-16
4
26
 
5
27
  **Cycle 11 (autonomous APA-first run) — single-level numbered section headings demoted to body text (G5a, S1).** Cycle 9 (v2.4.41) promoted multi-level numbered subsection headings (`5.1.`, `6.1.1.`); single-level top-level numbered headings — `2. Omission neglect`, `3. Choice deferral`, `1. Hindsight bias` — were still rendered as plain body text when the title is not a canonical section word.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docpluck
3
- Version: 2.4.43
3
+ Version: 2.4.45
4
4
  Summary: PDF, DOCX, and HTML text extraction and normalization for academic papers
5
5
  Project-URL: Homepage, https://github.com/giladfeldman/docpluck
6
6
  Project-URL: Documentation, https://github.com/giladfeldman/docpluck/tree/main/docs
@@ -71,7 +71,7 @@ from .figures import Figure
71
71
  from .extract_structured import TABLE_EXTRACTION_VERSION, StructuredResult, extract_pdf_structured
72
72
  from .render import render_pdf_to_markdown
73
73
 
74
- __version__ = "2.4.43"
74
+ __version__ = "2.4.45"
75
75
  __author__ = "Gilad Feldman"
76
76
  __license__ = "MIT"
77
77
 
@@ -23,7 +23,7 @@ class NormalizationLevel(str, Enum):
23
23
  academic = "academic"
24
24
 
25
25
 
26
- NORMALIZATION_VERSION = "1.9.7"
26
+ NORMALIZATION_VERSION = "1.9.8"
27
27
 
28
28
 
29
29
  # ── Mathematical Alphanumeric Symbols de-styling (shared, v2.4.34) ──────────
@@ -1485,6 +1485,30 @@ def recover_minus_via_ci_pairing(text: str) -> str:
1485
1485
  return "\n".join(out)
1486
1486
 
1487
1487
 
1488
+ # v2.4.44 (NORMALIZATION_VERSION 1.9.8): decompose Latin typographic
1489
+ # ligatures (ﬀ ﬁ ﬂ ﬃ ﬄ ﬅ ﬆ, U+FB00-FB06). pdftotext preserves these
1490
+ # presentation-form glyphs verbatim, so words render as "conﬁdent" /
1491
+ # "inﬂuence" — broken for search, word matching, and any downstream NLP.
1492
+ # An explicit ASCII table is used (not a scoped NFKC pass): NFKC of U+FB05
1493
+ # yields "ſt" with a non-ASCII LONG S, and meta-science output must stay
1494
+ # ASCII. This is the SINGLE shared helper for all THREE text channels — the
1495
+ # S3 body step (normalize_text, below), table-cell cleaning, and the render
1496
+ # post-process. Table cells and figure/table captions bypass normalize_text
1497
+ # entirely, so a body-only fix leaves them showing raw ligature glyphs.
1498
+ _LIGATURE_MAP = {
1499
+ "ﬀ": "ff", "ﬁ": "fi", "ﬂ": "fl",
1500
+ "ﬃ": "ffi", "ﬄ": "ffl", "ﬅ": "st", "ﬆ": "st",
1501
+ }
1502
+ _LIGATURE_RE = re.compile("[ﬀ-ﬆ]")
1503
+
1504
+
1505
+ def decompose_ligatures(text: str) -> str:
1506
+ """Decompose Latin typographic ligatures (U+FB00-FB06) to ASCII."""
1507
+ if not text:
1508
+ return text
1509
+ return _LIGATURE_RE.sub(lambda m: _LIGATURE_MAP[m.group(0)], text)
1510
+
1511
+
1488
1512
  def normalize_text(
1489
1513
  text: str,
1490
1514
  level: NormalizationLevel,
@@ -1658,13 +1682,11 @@ def normalize_text(
1658
1682
  t = t.replace(accent + vowel, combined)
1659
1683
  report._track("S2_accent_recombination", before, t, "accents_recombined")
1660
1684
 
1661
- # S3: Ligature expansion
1685
+ # S3: Ligature expansion \u2014 body channel. Calls the shared
1686
+ # decompose_ligatures helper (full U+FB00-FB06 block, incl. \ufb05/\ufb06\u2192st) so the
1687
+ # body, table-cell, and render-post-process channels stay in lockstep.
1662
1688
  before = t
1663
- t = t.replace("\ufb00", "ff")
1664
- t = t.replace("\ufb01", "fi")
1665
- t = t.replace("\ufb02", "fl")
1666
- t = t.replace("\ufb03", "ffi")
1667
- t = t.replace("\ufb04", "ffl")
1689
+ t = decompose_ligatures(t)
1668
1690
  report._track("S3_ligature_expansion", before, t, "ligatures_expanded")
1669
1691
 
1670
1692
  # S4: Quote normalization
@@ -16,7 +16,7 @@ COMMON_WORDS = {
16
16
  "each", "after", "both", "most", "only", "over", "may", "into",
17
17
  }
18
18
 
19
- LIGATURE_CHARS = set("\ufb00\ufb01\ufb02\ufb03\ufb04")
19
+ LIGATURE_CHARS = set("\ufb00\ufb01\ufb02\ufb03\ufb04\ufb05\ufb06")
20
20
 
21
21
 
22
22
  def compute_quality_score(text: str) -> dict:
@@ -34,6 +34,7 @@ from .extract_structured import extract_pdf_structured
34
34
  from .normalize import (
35
35
  NormalizationLevel,
36
36
  _rejoin_garbled_ocr_headers,
37
+ decompose_ligatures,
37
38
  destyle_math_alphanumeric,
38
39
  recover_corrupted_lt_operator,
39
40
  recover_corrupted_minus_signs,
@@ -232,8 +233,12 @@ def _promote_numbered_subsection_headings(text: str) -> str:
232
233
  """Promote ``1.2 Foo``-style lines to ``### 1.2 Foo`` h3 headings.
233
234
 
234
235
  Conservative: only multi-level numbering (``N.N`` or deeper), title must
235
- start with a capital letter, must not end in sentence-terminator
236
- punctuation, and must not look like prose (no long lowercase-word runs).
236
+ start with a capital letter and must not end in sentence-terminator
237
+ punctuation. Multi-level dotted numbering at line-start is itself a strong
238
+ section-heading signal — descriptive subsection titles legitimately run to
239
+ many lowercase words ("3.3.2.1 The quality of planning on the previous
240
+ trial moderates the effect of reflection"), so a lowercase-run prose guard
241
+ mis-rejects real headings and is not applied here (cycle 13, G5b).
237
242
  Idempotent: re-running the pass is a no-op.
238
243
  """
239
244
  if not text:
@@ -249,17 +254,6 @@ def _promote_numbered_subsection_headings(text: str) -> str:
249
254
  if title.endswith((".", "?", "!", ":", ",", ";")):
250
255
  out.append(line)
251
256
  continue
252
- tokens = title.split()
253
- lc_run = max_lc_run = 0
254
- for tok in tokens:
255
- if tok and tok[0].islower():
256
- lc_run += 1
257
- max_lc_run = max(max_lc_run, lc_run)
258
- else:
259
- lc_run = 0
260
- if max_lc_run >= 5:
261
- out.append(line)
262
- continue
263
257
  if out and out[-1].startswith(f"### {m.group('num')} "):
264
258
  out.append(line)
265
259
  continue
@@ -356,7 +350,7 @@ def _promote_numbered_section_headings(text: str) -> str:
356
350
  max_lc = max(max_lc, lc_run)
357
351
  else:
358
352
  lc_run = 0
359
- if max_lc >= 5: # prose-like run — not a heading
353
+ if max_lc >= 8: # long prose-like run — not a heading (cycle 13, G5b)
360
354
  continue
361
355
  candidates.setdefault(int(m.group("num")), []).append((i, title))
362
356
  if not candidates:
@@ -2151,6 +2145,12 @@ def render_pdf_to_markdown(
2151
2145
  # B-coefficient table cell, the Mposterior mediation estimates — that
2152
2146
  # the descending-bracket rule structurally cannot see.
2153
2147
  md = recover_minus_via_ci_pairing(md)
2148
+ # v2.4.44: final guarantee — decompose Latin typographic ligatures
2149
+ # (ﬁ->fi, ﬂ->fl, …) from the assembled markdown. normalize (body) and
2150
+ # cell_cleaning (table cells) cover their channels; this catches the
2151
+ # remaining surfaces — figure/table captions, unstructured-table fences,
2152
+ # raw_text fallbacks — so no presentation-form ligature reaches the .md.
2153
+ md = decompose_ligatures(md)
2154
2154
  md = _merge_compound_heading_tails(md)
2155
2155
  md = _reformat_jama_key_points_box(md)
2156
2156
  md = _promote_numbered_subsection_headings(md)
@@ -37,6 +37,7 @@ import re
37
37
  from typing import Sequence
38
38
 
39
39
  from docpluck.normalize import (
40
+ decompose_ligatures,
40
41
  destyle_math_alphanumeric,
41
42
  recover_corrupted_lt_operator,
42
43
  recover_corrupted_minus_signs,
@@ -58,6 +59,10 @@ def _html_escape(s: str | None) -> str:
58
59
  # from the Camelot layout channel and bypass normalize_text's S0 step, so
59
60
  # math-italic Greek would otherwise leak raw into rendered table HTML.
60
61
  s = destyle_math_alphanumeric(s)
62
+ # Decompose Latin typographic ligatures (ﬁ->fi, ﬂ->fl, …) — table cells
63
+ # bypass normalize_text, so a cell "conﬁdent" would otherwise leak the
64
+ # raw presentation-form glyph into the rendered HTML (v2.4.44).
65
+ s = decompose_ligatures(s)
61
66
  # Recover corrupted minus signs. pdfminer (Camelot's text layer) emits
62
67
  # "(cid:0)" for a font glyph it cannot map to Unicode; in academic stat
63
68
  # tables that unmapped glyph is the U+2212 minus, always printed directly
@@ -0,0 +1,173 @@
1
+ # Instruction — docpluck: AI-gold via article-finder (canonical keys, shared protocol)
2
+
3
+ **Date:** 2026-05-16
4
+ **From:** article-finder / cross-project AI-gold coordination
5
+ **To:** docpluck-iterate maintainer / next docpluck session
6
+ **Status:** Action required. article-finder's side is fixed and committed; the items
7
+ below are docpluck's to do in its next iteration.
8
+
9
+ ---
10
+
11
+ ## TL;DR
12
+
13
+ docpluck must change three things:
14
+
15
+ 1. **Stop using docpluck's private extraction prompt.** Generate all AI gold through
16
+ the shared protocol `gold-generation.md` only.
17
+ 2. **Key every gold under the paper's canonical DOI.** Bare local stems
18
+ (`chen_2021_jesp`, `efendic_2022_affect`) are now *rejected* by the cache.
19
+ 3. **Regenerate docpluck's existing `reading` golds.** They were produced by the old
20
+ private prompt and diverge from the shared protocol's output (the "981-line vs
21
+ 617-line" divergence). They are not trustworthy shared ground truth as-is.
22
+
23
+ **What article-finder already shipped for you** (skill repo, committed):
24
+
25
+ - `generate-gold <pdf>` is now a *routed* invocation — it skips the download cascade
26
+ and runs `gold-generation.md` directly. Previously it was advertised but unrouted.
27
+ - `gold-generation.md`'s `reading` prompt now transcribes every table **in full,
28
+ cell-by-cell**, as a markdown grid — not just the caption. This closes the gap
29
+ that originally justified docpluck's private prompt: the shared `reading` view is
30
+ now rich enough for docpluck's TABLE verifier.
31
+ - `register-view` and `migrate` now **reject a non-canonical key** with an
32
+ actionable error.
33
+ - `gold-generation.md` now enforces a **100%-accuracy, zero-hallucination policy**
34
+ and an **independent Codex / GPT-5.5 cross-model verification**: a second-vendor
35
+ model re-reads the PDF and audits every gold before it is stored. **Ensure the
36
+ `codex` CLI is installed and authenticated** in docpluck's environment
37
+ (`codex --version`; `codex login` if needed) — `generate-gold` blocks without it
38
+ rather than shipping unverified gold.
39
+
40
+ ---
41
+
42
+ ## Why this matters
43
+
44
+ The same paper was landing in the cache under two keys — docpluck's short stem
45
+ `chen_2021_jesp` and ESCIcheckapp's PDF-stem `Chen_et_al-2021-JESP-...`. A paper
46
+ split across two keys fragments its record: every project that reads it sees only
47
+ its own slice, `ai-gold.py gaps` reports phantom gaps, and nobody can reuse anyone
48
+ else's work. The cause was every project keying papers its own way. The fix is one
49
+ canonical key per paper, enforced.
50
+
51
+ Separately, docpluck's `reading` golds were generated by a docpluck-private prompt
52
+ (`references/ai-full-doc-verify.md`, Step 1b), not the shared `gold-generation.md`.
53
+ Two prompts produce two different "ground truths" for the same PDF. Ground truth
54
+ must be single-source.
55
+
56
+ ---
57
+
58
+ ## Rule 1 — Ground truth ONLY through article-finder
59
+
60
+ docpluck never re-implements PDF extraction and never carries its own extraction
61
+ prompt. To obtain AI gold for a paper:
62
+
63
+ - **Consume first.** Before extracting anything, check the cache:
64
+ ```
65
+ python ~/.claude/skills/article-finder/ai-gold.py check <key> --view reading
66
+ python ~/.claude/skills/article-finder/ai-gold.py get <key> --view reading
67
+ ```
68
+ On a hit, use the cached gold. Zero tokens.
69
+ - **Generate on a miss.** Invoke the `article-finder` skill as `generate-gold <pdf>`.
70
+ It runs `gold-generation.md` (dual stats extraction → reading+citations carrier
71
+ pass → cross-check → schema-validated registration) and registers the views under
72
+ the canonical key. You do not write the extraction logic.
73
+ - **Retire `references/ai-full-doc-verify.md` Step 1b** as a gold *producer*. If
74
+ docpluck needs an extra verification pass for its own pipeline, that is fine — but
75
+ the *ground truth* it verifies against comes from the cache, not from that prompt.
76
+
77
+ The shared `reading` prompt now captures full cell-by-cell tables, so docpluck's
78
+ table verifier has the detail it needs. If you find it still insufficient, raise it
79
+ with article-finder — do **not** fork the prompt. `gold-generation.md` is owned by
80
+ article-finder and is the single source of extraction rigor.
81
+
82
+ ## Rule 2 — Canonical keys only
83
+
84
+ A cache key must be a **DOI-stem** (`10.1016__j.jesp.2021.104154`) or a
85
+ `fixture__<producer>__<name>` key. Nothing else.
86
+
87
+ - Pass the paper's **DOI** as the key to `register-view` / `generate-gold`; the CLI
88
+ canonicalizes it (`10.1016/j.jesp.2021.104154` → `10.1016__j.jesp.2021.104154`).
89
+ The DOI is in every gold's `article_metadata.doi` and the `reading` gold's
90
+ `**DOI:**` line.
91
+ - If a paper genuinely has no DOI, use `fixture__docpluck__<pdf-stem>`.
92
+ - `register-view` and `migrate` now HALT on a bare stem. If docpluck-iterate's
93
+ autonomous run keys by a local stem, it will fail loudly — fix the iterate skill
94
+ to resolve and pass the DOI.
95
+
96
+ `docpluck.yaml` (the producer manifest) is reflexive — it globs `ai_gold/*/reading.md`
97
+ and keys by the parent directory. Once golds live in canonically-named directories,
98
+ the manifest follows automatically. No manifest edit is needed; the fix is in the
99
+ docpluck-iterate skill that *writes* the golds.
100
+
101
+ ## Rule 3 — Regenerate the stale `reading` golds
102
+
103
+ docpluck's existing `reading` golds in the cache were produced by the old private
104
+ prompt. Regenerate them through `generate-gold` so the cache holds one
105
+ protocol-consistent `reading` view per paper. Priority order: papers other projects
106
+ consume (the fragmented three below) first, then the rest of docpluck's set.
107
+
108
+ When you regenerate, the new gold registers under the canonical DOI key. A second
109
+ `reading` gold at the same key supersedes the old one (the cache archives the old
110
+ copy automatically).
111
+
112
+ ---
113
+
114
+ ## The three fragmented papers — fix these first
115
+
116
+ Each currently has docpluck's `reading` under a short stem AND ESCIcheckapp's
117
+ `reading`+`stats` under a PDF-stem. Regenerate each via `generate-gold` and register
118
+ under the canonical DOI key; the duplicate keys then collapse to one record.
119
+
120
+ | Paper | docpluck's current key | Canonical DOI key |
121
+ |---|---|---|
122
+ | Chen et al. 2021, JESP — hindsight bias | `chen_2021_jesp` | `10.1016__j.jesp.2021.104154` |
123
+ | Xiao, Zeng & Feldman 2021, CRSP — decoy effect | `xiao_2021_crsp` | `10.1080__23743603.2021.1878340` |
124
+ | Efendic et al. 2022, SPPS — affect heuristic | `efendic_2022_affect` | `10.1177__19485506211056761` |
125
+
126
+ After regeneration, the old short-stem directories (`chen_2021_jesp/` etc.) can be
127
+ removed — coordinate the cleanup with article-finder so `index.json` stays consistent
128
+ (`ai-gold.py audit` must report 0 issues).
129
+
130
+ ---
131
+
132
+ ## docpluck's next iteration — step by step
133
+
134
+ 1. Update docpluck-iterate so its gold step is "invoke `article-finder
135
+ generate-gold <pdf>`", not the private prompt.
136
+ 2. Make docpluck-iterate resolve the paper's DOI and pass it as the key (or let
137
+ `generate-gold` do it — `gold-generation.md` reads `article_metadata.doi`).
138
+ 3. Regenerate `reading` for Chen / Xiao / Efendic; verify each lands under the DOI
139
+ key with `ai-gold.py views <doi>`.
140
+ 4. Regenerate the remaining docpluck `reading` golds through the shared protocol.
141
+ 5. Run `ai-gold.py audit` — expect 0 issues. Run `ai-gold.py gaps` to confirm no
142
+ phantom fragmentation remains.
143
+ 6. Commit docpluck's skill change in the docpluck repo; do not commit cache data
144
+ from docpluck (article-finder owns the cache repo's commits).
145
+
146
+ ## Command cheat-sheet
147
+
148
+ ```
149
+ # Resolve a paper to its canonical key
150
+ python ~/.claude/skills/article-finder/ai-gold.py resolve "10.1016/j.jesp.2021.104154"
151
+
152
+ # Is the view already cached?
153
+ python ~/.claude/skills/article-finder/ai-gold.py check <key> --view reading
154
+ python ~/.claude/skills/article-finder/ai-gold.py get <key> --view reading
155
+
156
+ # What views does a paper have?
157
+ python ~/.claude/skills/article-finder/ai-gold.py views <key>
158
+
159
+ # Generate gold for an uncovered PDF — invoke the article-finder skill:
160
+ # article-finder generate-gold <absolute-pdf-path>
161
+
162
+ # Consistency check (run after any cache change)
163
+ python ~/.claude/skills/article-finder/ai-gold.py audit
164
+ ```
165
+
166
+ ## Definition of done (docpluck)
167
+
168
+ - [ ] docpluck-iterate generates gold only via `article-finder generate-gold` /
169
+ `gold-generation.md`; the private prompt is no longer a gold producer.
170
+ - [ ] Every docpluck gold is registered under a canonical DOI key.
171
+ - [ ] Chen / Xiao / Efendic regenerated and co-located under their DOI keys.
172
+ - [ ] docpluck's other `reading` golds regenerated through the shared protocol.
173
+ - [ ] `ai-gold.py audit` clean.
@@ -265,6 +265,16 @@ New `render.py::_promote_numbered_section_headings` promotes `N. Title` → `##
265
265
 
266
266
  **G5a RESIDUALS (queued):** the ≥5-lowercase-word prose guard rejects long descriptive headings (`4. Knowledge acquisition, decision delay, and choice outcomes`) — same G5b guard issue; list-number collision under-promotes a section heading whose number a body list reuses (chen 1/2/3/5 — conservative, not a false positive).
267
267
 
268
+ ### Cycle 12 (v2.4.44) — GLYPH ligature decomposition — SHIPPED
269
+
270
+ `normalize.py::decompose_ligatures` is the single shared helper for the U+FB00-FB06 ligature block — an explicit ASCII table (`ﬁ→fi`, `ﬂ→fl`, …, `ﬅ/ﬆ→st`; NFKC is avoided because `ﬅ`→`ſt` carries a non-ASCII long s). **The body channel's S3 step already expanded ligatures** — the real gap was the table-cell, figure/table-caption, and `unstructured-table`-fence channels that bypass `normalize_text`. The helper is now called from all three channels (S3 body / `cell_cleaning._html_escape` / `render_pdf_to_markdown` post-process); the S3 step also gains `ﬅ/ﬆ` reach. Corpus scan found ligatures in 35 rendered papers (korbmacher 82×, jdm16 34×); jdm_m2/korbmacher/jdm16 verified → 0 residual. The `GLYPH ligature` row below is now RESOLVED.
271
+
272
+ > **Cycle-12 rework note (run 4, 2026-05-16):** the first cycle-12 attempt added a SECOND, parallel `decompose_ligatures` call *before* the pre-existing S3 step inside `normalize_text` — it consumed every ligature before S3 ran, so S3 tracked `ligatures_expanded = 0` and broke `test_normalization.py::test_report_tracks_changes`. The rework removed the duplicate call and unified S3 to use the shared helper. Lesson: before adding a glyph-normalization helper, grep the existing `normalize_text` S-steps for one already handling that glyph class — extend/unify it, do not add a parallel path.
273
+
274
+ ### Cycle 13 (v2.4.45) — G5b long-descriptive-title prose guard — SHIPPED
275
+
276
+ `render.py`'s numbered-heading promoters carried a `max_lc_run >= 5` "long lowercase-word run" prose guard that mis-rejected legitimate descriptive headings. Reproduced at HEAD: jdm_.2023.16 alone had **19** multi-level numbered subsection headings demoted to body text, with `max_lc` up to **12** (`3.3.2.1. The quality of planning on the previous trial moderates the effect of reflection`) — far deeper than the TRIAGE's "raise 5→8" estimate. Re-scoped: the lc-run guard is **removed entirely from `_promote_numbered_subsection_headings`** (multi-level dotted numbering + capital-start + no-terminal-punctuation + single ≤80-char line is itself a sufficient heading signature; the lc-run guard cannot distinguish a descriptive heading from prose). For `_promote_numbered_section_headings` (single-level `N.`, real list-collision risk) the guard is kept but raised `5→8`, alongside its existing numbering-range/uniqueness/list-adjacency gates. jdm16: 19 headings recovered; v2.4.44→v2.4.45 diff is heading-promotion only (0 text loss/hallucination); 26/26 baseline.
277
+
268
278
  ### SESSION-3 STANDING VERDICT (rule 0e-bis)
269
279
 
270
280
  The APA corpus is **NOT clean**. Cycles 8-11 shipped 4 verified incremental fixes (v2.4.40-43), each AI-gold-verified OVERALL PASS with 0 regressions. But ~12 APA papers still FAIL Phase-5d on PRE-EXISTING defects the cycles did not reach. Verifier-confirmed open punch-list:
@@ -274,7 +284,7 @@ The APA corpus is **NOT clean**. Cycles 8-11 shipped 4 verified incremental fixe
274
284
  | **TABLE structure destruction** | S0/S1 | efendic, ar_apa_011, xiao, jdm15/16, chen, maier, ip_feldman (~11) | grid lost → caption-bleed; flat number-dump; empty `<table>` shells; two tables merged; rows dropped. C3 — needs a render/structured coordination design. The single largest blocker. |
275
285
  | **G5c split-line numbered headings** | S1 | jdm_m.2022.2 (`5.3.`/`6.3.`/`7.3.` etc.) | number alone on a line, title on the next; renders as orphan bare-number + a MISLABELED generic `## Results`. cycle-3 orphan-folder multi-level analogue. |
276
286
  | **G5d named (unnumbered) heading demotion** | S1 | ar_apa_011 (`Participants`, `Overview`), efendic, chandrashekar, ip_feldman (~7) | section-partitioner work; largest false-positive surface. |
277
- | **G5b long-descriptive-title prose guard** | S1 | jdm16, jdm_m2, chen | `≥5-lowercase-word` guard over-rejects legit long numbered headings. |
287
+ | ~~**G5b long-descriptive-title prose guard**~~ ✓ FIXED v2.4.45 (cycle 13) | S1 | jdm16, jdm_m2, chen | ~~`≥5-lowercase-word` guard over-rejects legit long numbered headings.~~ Subsection promoter's lc-run guard removed; single-level raised 5→8. |
278
288
  | **FIG caption double-emission + truncation** | S2 | jdm_m2, efendic, chan_feldman, ziano, jdm15/16 (~8) | caption inline + in `## Figures` block; truncated mid-word; figure data-labels as orphan body lines. |
279
289
  | **GLYPH ligature** `ﬁ`/`ﬂ` not decomposed | S2 | jdm_m2 (and likely many) | `conﬁdent`, `inﬂuence` — NFKC would fix; check why current NFC pass misses U+FB01/FB02. |
280
290
  | **D4 metadata residuals** | S2 | ar_apa_011 (`doi:` line), chen, efendic masthead | see D4 RESIDUALS above. |
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "docpluck"
7
- version = "2.4.43"
7
+ version = "2.4.45"
8
8
  description = "PDF, DOCX, and HTML text extraction and normalization for academic papers"
9
9
  readme = "docs/README.md"
10
10
  requires-python = ">=3.10"
@@ -6,7 +6,7 @@
6
6
  "label": "abstract",
7
7
  "canonical_label": "abstract",
8
8
  "char_start": 0,
9
- "char_end": 28,
9
+ "char_end": 29,
10
10
  "pages": [],
11
11
  "confidence": "high",
12
12
  "detected_via": "heading_match",
@@ -15,8 +15,8 @@
15
15
  {
16
16
  "label": "introduction",
17
17
  "canonical_label": "introduction",
18
- "char_start": 28,
19
- "char_end": 53,
18
+ "char_start": 29,
19
+ "char_end": 55,
20
20
  "pages": [],
21
21
  "confidence": "high",
22
22
  "detected_via": "heading_match",
@@ -25,8 +25,8 @@
25
25
  {
26
26
  "label": "methods",
27
27
  "canonical_label": "methods",
28
- "char_start": 53,
29
- "char_end": 78,
28
+ "char_start": 55,
29
+ "char_end": 81,
30
30
  "pages": [],
31
31
  "confidence": "high",
32
32
  "detected_via": "heading_match",
@@ -35,8 +35,8 @@
35
35
  {
36
36
  "label": "results",
37
37
  "canonical_label": "results",
38
- "char_start": 78,
39
- "char_end": 103,
38
+ "char_start": 81,
39
+ "char_end": 107,
40
40
  "pages": [],
41
41
  "confidence": "high",
42
42
  "detected_via": "heading_match",
@@ -45,8 +45,8 @@
45
45
  {
46
46
  "label": "methods_2",
47
47
  "canonical_label": "methods",
48
- "char_start": 103,
49
- "char_end": 128,
48
+ "char_start": 107,
49
+ "char_end": 133,
50
50
  "pages": [],
51
51
  "confidence": "high",
52
52
  "detected_via": "heading_match",
@@ -55,8 +55,8 @@
55
55
  {
56
56
  "label": "results_2",
57
57
  "canonical_label": "results",
58
- "char_start": 128,
59
- "char_end": 153,
58
+ "char_start": 133,
59
+ "char_end": 159,
60
60
  "pages": [],
61
61
  "confidence": "high",
62
62
  "detected_via": "heading_match",
@@ -65,8 +65,8 @@
65
65
  {
66
66
  "label": "general_discussion",
67
67
  "canonical_label": "general_discussion",
68
- "char_start": 153,
69
- "char_end": 183,
68
+ "char_start": 159,
69
+ "char_end": 190,
70
70
  "pages": [],
71
71
  "confidence": "high",
72
72
  "detected_via": "heading_match",
@@ -75,8 +75,8 @@
75
75
  {
76
76
  "label": "references",
77
77
  "canonical_label": "references",
78
- "char_start": 183,
79
- "char_end": 213,
78
+ "char_start": 190,
79
+ "char_end": 220,
80
80
  "pages": [],
81
81
  "confidence": "high",
82
82
  "detected_via": "heading_match",
@@ -6,7 +6,7 @@
6
6
  "label": "abstract",
7
7
  "canonical_label": "abstract",
8
8
  "char_start": 0,
9
- "char_end": 36,
9
+ "char_end": 37,
10
10
  "pages": [],
11
11
  "confidence": "high",
12
12
  "detected_via": "heading_match",
@@ -15,8 +15,8 @@
15
15
  {
16
16
  "label": "introduction",
17
17
  "canonical_label": "introduction",
18
- "char_start": 36,
19
- "char_end": 61,
18
+ "char_start": 37,
19
+ "char_end": 63,
20
20
  "pages": [],
21
21
  "confidence": "high",
22
22
  "detected_via": "heading_match",
@@ -25,8 +25,8 @@
25
25
  {
26
26
  "label": "methods",
27
27
  "canonical_label": "methods",
28
- "char_start": 61,
29
- "char_end": 84,
28
+ "char_start": 63,
29
+ "char_end": 87,
30
30
  "pages": [],
31
31
  "confidence": "high",
32
32
  "detected_via": "heading_match",
@@ -35,8 +35,8 @@
35
35
  {
36
36
  "label": "results",
37
37
  "canonical_label": "results",
38
- "char_start": 84,
39
- "char_end": 108,
38
+ "char_start": 87,
39
+ "char_end": 112,
40
40
  "pages": [],
41
41
  "confidence": "high",
42
42
  "detected_via": "heading_match",
@@ -45,8 +45,8 @@
45
45
  {
46
46
  "label": "discussion",
47
47
  "canonical_label": "discussion",
48
- "char_start": 108,
49
- "char_end": 133,
48
+ "char_start": 112,
49
+ "char_end": 138,
50
50
  "pages": [],
51
51
  "confidence": "high",
52
52
  "detected_via": "heading_match",
@@ -55,8 +55,8 @@
55
55
  {
56
56
  "label": "references",
57
57
  "canonical_label": "references",
58
- "char_start": 133,
59
- "char_end": 163,
58
+ "char_start": 138,
59
+ "char_end": 168,
60
60
  "pages": [],
61
61
  "confidence": "high",
62
62
  "detected_via": "heading_match",