docpluck 2.4.76__tar.gz → 2.4.80__tar.gz

This diff shows the changes between two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (390)
  1. {docpluck-2.4.76 → docpluck-2.4.80}/.claude/skills/_project/canary.json +6 -0
  2. {docpluck-2.4.76 → docpluck-2.4.80}/.claude/skills/_project/lessons.md +41 -0
  3. {docpluck-2.4.76 → docpluck-2.4.80}/.claude/skills/docpluck-iterate/LEARNINGS.md +37 -0
  4. {docpluck-2.4.76 → docpluck-2.4.80}/.claude/skills/docpluck-iterate/references/ai-full-doc-verify.md +5 -2
  5. {docpluck-2.4.76 → docpluck-2.4.80}/.gitignore +6 -0
  6. {docpluck-2.4.76 → docpluck-2.4.80}/CHANGELOG.md +52 -0
  7. {docpluck-2.4.76 → docpluck-2.4.80}/PKG-INFO +2 -1
  8. {docpluck-2.4.76 → docpluck-2.4.80}/TODO.md +23 -0
  9. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/__init__.py +1 -1
  10. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/extract.py +20 -5
  11. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/extract_columns.py +274 -21
  12. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/normalize.py +71 -1
  13. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/render.py +587 -4
  14. docpluck-2.4.80/docs/superpowers/handoffs/2026-05-26-run-11-cluster-A-ter-and-C-bis-landed.md +198 -0
  15. docpluck-2.4.80/docs/superpowers/handoffs/2026-05-26-text-extraction-defects-from-citationguard-audit.md +247 -0
  16. docpluck-2.4.80/docs/superpowers/handoffs/2026-06-07-text-extraction-defects-from-citationguard-iterate.md +286 -0
  17. docpluck-2.4.80/docs/superpowers/handoffs/2026-06-07-v2.4.78-landed-canary-iterate.md +94 -0
  18. docpluck-2.4.80/docs/superpowers/handoffs/2026-06-07-v2.4.79-findings-1-2-cleared.md +81 -0
  19. docpluck-2.4.80/docs/superpowers/specs/2026-06-07-ip_feldman-B4-R4-column-interleave-diagnosis.md +270 -0
  20. {docpluck-2.4.76 → docpluck-2.4.80}/pyproject.toml +2 -1
  21. {docpluck-2.4.76 → docpluck-2.4.80}/tests/snapshots/apa_chen_jesp_lineless.txt +90 -92
  22. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_hallucinated_heading_continuation_guard.py +56 -0
  23. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_normalize_metadata_leak_real_pdf.py +214 -0
  24. docpluck-2.4.80/tests/test_normalize_soft_hyphen_dehyphenation.py +93 -0
  25. docpluck-2.4.80/tests/test_o5_reference_inversion_real_pdf.py +120 -0
  26. docpluck-2.4.80/tests/test_render_frontmatter_masthead.py +431 -0
  27. {docpluck-2.4.76 → docpluck-2.4.80}/.claude/skills/docpluck-cleanup/SKILL.md +0 -0
  28. {docpluck-2.4.76 → docpluck-2.4.80}/.claude/skills/docpluck-deploy/SKILL.md +0 -0
  29. {docpluck-2.4.76 → docpluck-2.4.80}/.claude/skills/docpluck-iterate/SKILL.md +0 -0
  30. {docpluck-2.4.76 → docpluck-2.4.80}/.claude/skills/docpluck-iterate/references/cycle-report-template.md +0 -0
  31. {docpluck-2.4.76 → docpluck-2.4.80}/.claude/skills/docpluck-iterate/references/local-verification.md +0 -0
  32. {docpluck-2.4.76 → docpluck-2.4.80}/.claude/skills/docpluck-iterate/references/rationalizations.md +0 -0
  33. {docpluck-2.4.76 → docpluck-2.4.80}/.claude/skills/docpluck-iterate/references/real-library-real-pdf.md +0 -0
  34. {docpluck-2.4.76 → docpluck-2.4.80}/.claude/skills/docpluck-iterate/references/release-flow.md +0 -0
  35. {docpluck-2.4.76 → docpluck-2.4.80}/.claude/skills/docpluck-iterate/references/self-improvement.md +0 -0
  36. {docpluck-2.4.76 → docpluck-2.4.80}/.claude/skills/docpluck-iterate/references/three-tier-parity.md +0 -0
  37. {docpluck-2.4.76 → docpluck-2.4.80}/.claude/skills/docpluck-qa/SKILL.md +0 -0
  38. {docpluck-2.4.76 → docpluck-2.4.80}/.claude/skills/docpluck-qa/references/benchmark-mode.md +0 -0
  39. {docpluck-2.4.76 → docpluck-2.4.80}/.claude/skills/docpluck-qa/references/check-11-hard-rules.md +0 -0
  40. {docpluck-2.4.76 → docpluck-2.4.80}/.claude/skills/docpluck-qa/references/check-13-escicheck-production.md +0 -0
  41. {docpluck-2.4.76 → docpluck-2.4.80}/.claude/skills/docpluck-qa/references/check-5-escicheck-library.md +0 -0
  42. {docpluck-2.4.76 → docpluck-2.4.80}/.claude/skills/docpluck-qa/references/check-6-escicheck-local-webapp.md +0 -0
  43. {docpluck-2.4.76 → docpluck-2.4.80}/.claude/skills/docpluck-qa/references/check-7-batch-smoke.md +0 -0
  44. {docpluck-2.4.76 → docpluck-2.4.80}/.claude/skills/docpluck-review/SKILL.md +0 -0
  45. {docpluck-2.4.76 → docpluck-2.4.80}/.github/workflows/bump-app-pin.yml +0 -0
  46. {docpluck-2.4.76 → docpluck-2.4.80}/.github/workflows/publish.yml +0 -0
  47. {docpluck-2.4.76 → docpluck-2.4.80}/.github/workflows/test.yml +0 -0
  48. {docpluck-2.4.76 → docpluck-2.4.80}/CLAUDE.md +0 -0
  49. {docpluck-2.4.76 → docpluck-2.4.80}/HANDOFF_SECTIONS_APP_INTEGRATION.md +0 -0
  50. {docpluck-2.4.76 → docpluck-2.4.80}/LESSONS.md +0 -0
  51. {docpluck-2.4.76 → docpluck-2.4.80}/LICENSE +0 -0
  52. {docpluck-2.4.76 → docpluck-2.4.80}/REPLY_FROM_DOCPLUCK_v1.4.5.md +0 -0
  53. {docpluck-2.4.76 → docpluck-2.4.80}/REPLY_FROM_DOCPLUCK_v1.5.0.md +0 -0
  54. {docpluck-2.4.76 → docpluck-2.4.80}/REQUEST_08_CHUNKING_ENDPOINT.md +0 -0
  55. {docpluck-2.4.76 → docpluck-2.4.80}/REQUEST_09_REFERENCE_LIST_NORMALIZATION.md +0 -0
  56. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/__main__.py +0 -0
  57. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/batch.py +0 -0
  58. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/cli.py +0 -0
  59. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/extract_docx.py +0 -0
  60. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/extract_html.py +0 -0
  61. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/extract_layout.py +0 -0
  62. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/extract_structured.py +0 -0
  63. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/figures/__init__.py +0 -0
  64. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/figures/detect.py +0 -0
  65. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/quality.py +0 -0
  66. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/sections/__init__.py +0 -0
  67. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/sections/annotators/__init__.py +0 -0
  68. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/sections/annotators/docx.py +0 -0
  69. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/sections/annotators/html.py +0 -0
  70. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/sections/annotators/pdf.py +0 -0
  71. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/sections/annotators/text.py +0 -0
  72. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/sections/blocks.py +0 -0
  73. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/sections/boundaries.py +0 -0
  74. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/sections/core.py +0 -0
  75. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/sections/taxonomy.py +0 -0
  76. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/sections/types.py +0 -0
  77. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/tables/__init__.py +0 -0
  78. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/tables/bbox_utils.py +0 -0
  79. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/tables/camelot_extract.py +0 -0
  80. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/tables/captions.py +0 -0
  81. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/tables/cell_cleaning.py +0 -0
  82. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/tables/cluster.py +0 -0
  83. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/tables/confidence.py +0 -0
  84. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/tables/detect.py +0 -0
  85. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/tables/flatten.py +0 -0
  86. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/tables/render.py +0 -0
  87. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/tables/whitespace.py +0 -0
  88. {docpluck-2.4.76 → docpluck-2.4.80}/docpluck/version.py +0 -0
  89. {docpluck-2.4.76 → docpluck-2.4.80}/docs/BENCHMARKS.md +0 -0
  90. {docpluck-2.4.76 → docpluck-2.4.80}/docs/DESIGN.md +0 -0
  91. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-07_sections_strict_iteration.md +0 -0
  92. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-09_session_state_and_followups.md +0 -0
  93. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-09_unified_extraction_brainstorm.md +0 -0
  94. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-10_table_rendering_iteration.md +0 -0
  95. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-10_table_rendering_iteration_2.md +0 -0
  96. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-10_table_rendering_iteration_3.md +0 -0
  97. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-10_table_rendering_iteration_4.md +0 -0
  98. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-10_table_rendering_iteration_5.md +0 -0
  99. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-10_table_rendering_iteration_6.md +0 -0
  100. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-10_table_rendering_iteration_7.md +0 -0
  101. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-11_PROMOTE_SPIKE_TO_LIBRARY.md +0 -0
  102. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-11_table_rendering_iteration_8.md +0 -0
  103. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-11_visual_review_findings.md +0 -0
  104. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-12_phase2_101pdf_corpus.md +0 -0
  105. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-12_remaining_ui_and_chrome_verification.md +0 -0
  106. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-12_visual_verify_results.md +0 -0
  107. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-13_apa_50_expansion.md +0 -0
  108. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-13_apa_50_expansion_iter_1.md +0 -0
  109. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-13_apa_50_expansion_iter_2.md +0 -0
  110. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-13_iterate_skill_first_use.md +0 -0
  111. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-13_iterative_1.md +0 -0
  112. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-13_iterative_library_improvement.md +0 -0
  113. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-13_table_extraction_next_iteration.md +0 -0
  114. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-14_continue_iterations_v2_4_30_to_15n.md +0 -0
  115. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-14_full_corpus_iteration_v2_4_30.md +0 -0
  116. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-14_iterate_6_cycles_complete.md +0 -0
  117. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-14_iterate_9_cycle_run.md +0 -0
  118. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-14_iterate_resume_4_cycles.md +0 -0
  119. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-14_iterate_v2_4_31_cycle_15n.md +0 -0
  120. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-14_phase_5d_gold_audit_v2_4_29.md +0 -0
  121. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-15_autonomous_apa_first_10h.md +0 -0
  122. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-15_iterate_apa_run_1.md +0 -0
  123. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-16_ai-gold-instructions.md +0 -0
  124. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-16_iterate_apa_run_2.md +0 -0
  125. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-16_iterate_apa_run_3.md +0 -0
  126. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-16_iterate_run_4_final.md +0 -0
  127. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-16_iterate_run_4_fix_and_continue.md +0 -0
  128. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-16_iterate_run_5.md +0 -0
  129. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-16_iterate_run_6.md +0 -0
  130. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-17_iterate_run_7.md +0 -0
  131. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-17_iterate_run_8.md +0 -0
  132. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-17_iterate_run_9.md +0 -0
  133. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-18_iterate_run_9_cont.md +0 -0
  134. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-18_iterate_run_9_cont2.md +0 -0
  135. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-20_iterate_run_9_cont3.md +0 -0
  136. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-22_iterate_run_9_session4_final.md +0 -0
  137. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-22_iterate_run_9_session5_close.md +0 -0
  138. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-25_haiku-orchestration-pretest.md +0 -0
  139. {docpluck-2.4.76 → docpluck-2.4.80}/docs/HANDOFF_2026-05-25_pretest-followups.md +0 -0
  140. {docpluck-2.4.76 → docpluck-2.4.80}/docs/ITERATION_VERIFICATION_LESSONS.md +0 -0
  141. {docpluck-2.4.76 → docpluck-2.4.80}/docs/LIBRARY_APP_SYNC.md +0 -0
  142. {docpluck-2.4.76 → docpluck-2.4.80}/docs/NORMALIZATION.md +0 -0
  143. {docpluck-2.4.76 → docpluck-2.4.80}/docs/README.md +0 -0
  144. {docpluck-2.4.76 → docpluck-2.4.80}/docs/TRIAGE_2026-05-10_corpus_assessment.md +0 -0
  145. {docpluck-2.4.76 → docpluck-2.4.80}/docs/TRIAGE_2026-05-14_phase_5d_gold_audit.md +0 -0
  146. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/handoffs/2026-05-22-b1-next-iteration.md +0 -0
  147. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/handoffs/2026-05-22-b2-remaining-halluc-head.md +0 -0
  148. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/handoffs/2026-05-22-b3-b7-structural-defects.md +0 -0
  149. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/handoffs/2026-05-22-residual-after-locally-doable-pass.md +0 -0
  150. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/handoffs/2026-05-23-bundled-residual-cycle-CLOSED.md +0 -0
  151. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/handoffs/2026-05-23-residual-after-iterate-spine-cycles-1-3.md +0 -0
  152. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/handoffs/2026-05-25-canary-audit-architecture-and-cluster-A-B-C-landed.md +0 -0
  153. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/handoffs/2026-05-25-wrapup-r4-cycle.md +0 -0
  154. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/2026-05-06-section-identification.md +0 -0
  155. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/2026-05-06-table-extraction.md +0 -0
  156. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/2026-05-07-sections-strict-iteration-progress.md +0 -0
  157. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/2026-05-08-unified-extraction-phase-0-splice-spike.md +0 -0
  158. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/2026-05-23-haiku-orchestration-pretest.md +0 -0
  159. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/sections-deferred-items.md +0 -0
  160. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/sections-issues-backlog.md +0 -0
  161. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/2026-05-07_spot-01_apa.md +0 -0
  162. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/2026-05-07_spot-02_pattern-A-shipped.md +0 -0
  163. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/2026-05-08_spot-final_all-styles.md +0 -0
  164. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/experiments/COMPARISON.md +0 -0
  165. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/korbmacher_table1.md +0 -0
  166. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/option-a.py +0 -0
  167. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/ziano_table1.md +0 -0
  168. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_notes_raw.txt +0 -0
  169. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_table1.md +0 -0
  170. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/notes.md +0 -0
  171. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/option-b.py +0 -0
  172. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_notes_raw.txt +0 -0
  173. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_table1.md +0 -0
  174. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/korbmacher_table1.md +0 -0
  175. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/notes.md +0 -0
  176. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/option-c.py +0 -0
  177. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/sample-pdftotext-bbox.html +0 -0
  178. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/ziano_table1.md +0 -0
  179. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/korbmacher_table1.md +0 -0
  180. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/notes.md +0 -0
  181. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/option-d.py +0 -0
  182. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/ziano_table1.md +0 -0
  183. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_2022_kruger_bbox.html +0 -0
  184. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_bbox.html +0 -0
  185. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_table1.md +0 -0
  186. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/option-e.py +0 -0
  187. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/sample-bbox.html +0 -0
  188. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_2021_joep_bbox.html +0 -0
  189. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_bbox.html +0 -0
  190. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_table1.md +0 -0
  191. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/html-fallback-demo.md +0 -0
  192. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.err +0 -0
  193. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.md +0 -0
  194. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.err +0 -0
  195. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.md +0 -0
  196. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.err +0 -0
  197. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.md +0 -0
  198. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.err +0 -0
  199. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.md +0 -0
  200. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.err +0 -0
  201. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.md +0 -0
  202. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.err +0 -0
  203. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.md +0 -0
  204. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.err +0 -0
  205. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.md +0 -0
  206. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.err +0 -0
  207. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.md +0 -0
  208. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.err +0 -0
  209. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.md +0 -0
  210. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.err +0 -0
  211. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.md +0 -0
  212. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.err +0 -0
  213. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.md +0 -0
  214. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.err +0 -0
  215. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.md +0 -0
  216. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.err +0 -0
  217. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.md +0 -0
  218. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.err +0 -0
  219. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.md +0 -0
  220. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.err +0 -0
  221. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.md +0 -0
  222. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.err +0 -0
  223. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.md +0 -0
  224. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.err +0 -0
  225. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.md +0 -0
  226. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.err +0 -0
  227. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.md +0 -0
  228. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.err +0 -0
  229. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.md +0 -0
  230. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.err +0 -0
  231. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.md +0 -0
  232. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.err +0 -0
  233. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.md +0 -0
  234. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.err +0 -0
  235. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.md +0 -0
  236. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.err +0 -0
  237. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.md +0 -0
  238. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.err +0 -0
  239. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.md +0 -0
  240. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.err +0 -0
  241. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.md +0 -0
  242. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.err +0 -0
  243. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.md +0 -0
  244. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/papers.md +0 -0
  245. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/report.md +0 -0
  246. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/splice_spike.py +0 -0
  247. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/plans/spot-checks/splice-spike/test_splice_spike.py +0 -0
  248. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/specs/2026-04-27-request-09-reference-normalization-design.md +0 -0
  249. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/specs/2026-05-06-section-identification-design.md +0 -0
  250. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/specs/2026-05-06-table-extraction-design.md +0 -0
  251. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/specs/2026-05-08-unified-extraction-design.md +0 -0
  252. {docpluck-2.4.76 → docpluck-2.4.80}/docs/superpowers/specs/2026-05-23-haiku-orchestration-pretest-design.md +0 -0
  253. {docpluck-2.4.76 → docpluck-2.4.80}/scripts/__init__.py +0 -0
  254. {docpluck-2.4.76 → docpluck-2.4.80}/scripts/harness/README.md +0 -0
  255. {docpluck-2.4.76 → docpluck-2.4.80}/scripts/harness/VERIFIER_PROMPT.md +0 -0
  256. {docpluck-2.4.76 → docpluck-2.4.80}/scripts/harness/__init__.py +0 -0
  257. {docpluck-2.4.76 → docpluck-2.4.80}/scripts/harness/baseline_matrix.json +0 -0
  258. {docpluck-2.4.76 → docpluck-2.4.80}/scripts/harness/checks.py +0 -0
  259. {docpluck-2.4.76 → docpluck-2.4.80}/scripts/harness/corpus.py +0 -0
  260. {docpluck-2.4.76 → docpluck-2.4.80}/scripts/harness/corpus_manifest.json +0 -0
  261. {docpluck-2.4.76 → docpluck-2.4.80}/scripts/harness/extract.py +0 -0
  262. {docpluck-2.4.76 → docpluck-2.4.80}/scripts/harness/gold_keys.json +0 -0
  263. {docpluck-2.4.76 → docpluck-2.4.80}/scripts/harness/inspect.py +0 -0
  264. {docpluck-2.4.76 → docpluck-2.4.80}/scripts/lint_rendered_corpus.py +0 -0
  265. {docpluck-2.4.76 → docpluck-2.4.80}/scripts/pretest_capture_tokens.py +0 -0
  266. {docpluck-2.4.76 → docpluck-2.4.80}/scripts/verify_corpus.py +0 -0
  267. {docpluck-2.4.76 → docpluck-2.4.80}/scripts/verify_corpus_full.py +0 -0
  268. {docpluck-2.4.76 → docpluck-2.4.80}/tests/__init__.py +0 -0
  269. {docpluck-2.4.76 → docpluck-2.4.80}/tests/conftest.py +0 -0
  270. {docpluck-2.4.76 → docpluck-2.4.80}/tests/fixtures/__init__.py +0 -0
  271. {docpluck-2.4.76 → docpluck-2.4.80}/tests/fixtures/sections/__init__.py +0 -0
  272. {docpluck-2.4.76 → docpluck-2.4.80}/tests/fixtures/sections/builders.py +0 -0
  273. {docpluck-2.4.76 → docpluck-2.4.80}/tests/fixtures/structured/.gitkeep +0 -0
  274. {docpluck-2.4.76 → docpluck-2.4.80}/tests/fixtures/structured/MANIFEST.json +0 -0
  275. {docpluck-2.4.76 → docpluck-2.4.80}/tests/fixtures/structured/README.md +0 -0
  276. {docpluck-2.4.76 → docpluck-2.4.80}/tests/golden/sections/apa_multi_study_pdf.json +0 -0
  277. {docpluck-2.4.76 → docpluck-2.4.80}/tests/golden/sections/apa_single_study_pdf.json +0 -0
  278. {docpluck-2.4.76 → docpluck-2.4.80}/tests/golden/sections/html_real_headings.json +0 -0
  279. {docpluck-2.4.76 → docpluck-2.4.80}/tests/snapshots/amj_lattice.txt +0 -0
  280. {docpluck-2.4.76 → docpluck-2.4.80}/tests/snapshots/apa_chan_feldman_lineless.txt +0 -0
  281. {docpluck-2.4.76 → docpluck-2.4.80}/tests/snapshots/apa_efendic_affect.txt +0 -0
  282. {docpluck-2.4.76 → docpluck-2.4.80}/tests/snapshots/apa_ip_feldman_pspb.txt +0 -0
  283. {docpluck-2.4.76 → docpluck-2.4.80}/tests/snapshots/bmc_lattice.txt +0 -0
  284. {docpluck-2.4.76 → docpluck-2.4.80}/tests/snapshots/ieee_figure_heavy.txt +0 -0
  285. {docpluck-2.4.76 → docpluck-2.4.80}/tests/snapshots/ieee_lattice.txt +0 -0
  286. {docpluck-2.4.76 → docpluck-2.4.80}/tests/snapshots/jama_lattice.txt +0 -0
  287. {docpluck-2.4.76 → docpluck-2.4.80}/tests/snapshots/nat_comms_figure_only.txt +0 -0
  288. {docpluck-2.4.76 → docpluck-2.4.80}/tests/snapshots/nature_minimal_rule.txt +0 -0
  289. {docpluck-2.4.76 → docpluck-2.4.80}/tests/snapshots/scirep_minimal_rule.txt +0 -0
  290. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_a3c_leading_zero_decimal_real_pdf.py +0 -0
  291. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_a4_ci_period_to_comma.py +0 -0
  292. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_all_caps_section_promote_real_pdf.py +0 -0
  293. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_bbox_utils.py +0 -0
  294. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_benchmark_docx_html.py +0 -0
  295. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_cambridge_footer_strip_real_pdf.py +0 -0
  296. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_caption_only_table_heading_real_pdf.py +0 -0
  297. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_caption_regex.py +0 -0
  298. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_chart_data_trim_real_pdf.py +0 -0
  299. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_cid_minus_recovery_real_pdf.py +0 -0
  300. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_cli_sections.py +0 -0
  301. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_cli_structured.py +0 -0
  302. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_confidence.py +0 -0
  303. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_corpus_smoke.py +0 -0
  304. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_d5_normalization_audit.py +0 -0
  305. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_edge_cases.py +0 -0
  306. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_elsevier_footer_strip_real_pdf.py +0 -0
  307. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_equation_page_header_strip_real_pdf.py +0 -0
  308. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_extract_columns.py +0 -0
  309. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_extract_docx.py +0 -0
  310. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_extract_filter_sugar.py +0 -0
  311. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_extract_html.py +0 -0
  312. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_extract_layout.py +0 -0
  313. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_extract_pdf_structured.py +0 -0
  314. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_extraction.py +0 -0
  315. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_f0_table_region_aware.py +0 -0
  316. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_fffd_comparison_recovery_real_pdf.py +0 -0
  317. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_figure_caption_trim_real_pdf.py +0 -0
  318. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_figure_detect.py +0 -0
  319. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_fixtures_manifest.py +0 -0
  320. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_harness_text_loss_reflow.py +0 -0
  321. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_jama_open_cluster_real_pdf.py +0 -0
  322. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_lattice_cluster.py +0 -0
  323. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_letterspaced_label_real_pdf.py +0 -0
  324. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_ligature_decomposition_real_pdf.py +0 -0
  325. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_lt_operator_recovery_real_pdf.py +0 -0
  326. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_mathitalic_greek_real_pdf.py +0 -0
  327. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_metaesci_followups.py +0 -0
  328. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_minus_sign_recovery_real_pdf.py +0 -0
  329. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_normalization.py +0 -0
  330. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_normalize_a3_r2_body_integer_real_pdf.py +0 -0
  331. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_normalize_f0_footnote_strip.py +0 -0
  332. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_normalize_idempotent_real_pdf.py +0 -0
  333. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_normalize_layout_param.py +0 -0
  334. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_normalize_report_layout_fields.py +0 -0
  335. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_normalize_v18_strips.py +0 -0
  336. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_numbered_heading_promotion_real_pdf.py +0 -0
  337. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_numbered_section_promotion_real_pdf.py +0 -0
  338. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_orphan_multilevel_number_real_pdf.py +0 -0
  339. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_orphan_section_number_real_pdf.py +0 -0
  340. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_p0r_recurring_running_header_strip.py +0 -0
  341. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_preserve_math_glyphs_real_pdf.py +0 -0
  342. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_pretest_capture_tokens.py +0 -0
  343. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_pua_glyph_recovery_real_pdf.py +0 -0
  344. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_quality.py +0 -0
  345. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_r1_whitespace_cells_wiring_real_pdf.py +0 -0
  346. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_r4_column_correction_real_pdf.py +0 -0
  347. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_render.py +0 -0
  348. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_render_html.py +0 -0
  349. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_render_subsection_chain_promotion.py +0 -0
  350. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_request_09_reference_normalization.py +0 -0
  351. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_residual_2026_05_23_bundled.py +0 -0
  352. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_roman_numeral_section_promote_real_pdf.py +0 -0
  353. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_section_row_label_no_merge_real_pdf.py +0 -0
  354. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_sections_boundaries.py +0 -0
  355. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_sections_boundary_truncation.py +0 -0
  356. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_sections_core_partition.py +0 -0
  357. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_sections_docx_annotator.py +0 -0
  358. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_sections_extract_text.py +0 -0
  359. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_sections_footnote_section.py +0 -0
  360. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_sections_golden.py +0 -0
  361. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_sections_html_annotator.py +0 -0
  362. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_sections_pdf_annotator.py +0 -0
  363. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_sections_public_api.py +0 -0
  364. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_sections_real_corpus.py +0 -0
  365. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_sections_taxonomy.py +0 -0
  366. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_sections_text_annotator.py +0 -0
  367. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_sections_types.py +0 -0
  368. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_sections_unit_corpus.py +0 -0
  369. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_sections_v161_coalesce.py +0 -0
  370. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_sections_v161_subheadings.py +0 -0
  371. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_sections_v161_taxonomy.py +0 -0
  372. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_sections_v161_text_annotator.py +0 -0
  373. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_sections_version.py +0 -0
  374. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_smoke_fixtures.py +0 -0
  375. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_structured_result_type.py +0 -0
  376. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_structured_types.py +0 -0
  377. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_structured_version.py +0 -0
  378. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_table_caption_cell_region_real_pdf.py +0 -0
  379. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_table_detect.py +0 -0
  380. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_tables_cell_cleaning.py +0 -0
  381. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_tables_flatten.py +0 -0
  382. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_text_mode.py +0 -0
  383. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_v23_1_fixes.py +0 -0
  384. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_v23_bug_fixes.py +0 -0
  385. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_v23_post_corpus.py +0 -0
  386. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_v23_post_corpus_v2.py +0 -0
  387. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_v2_backwards_compat.py +0 -0
  388. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_v2_top_level_exports.py +0 -0
  389. {docpluck-2.4.76 → docpluck-2.4.80}/tests/test_whitespace_cluster.py +0 -0
  390. {docpluck-2.4.76 → docpluck-2.4.80}/tools/render_for_audit.py +0 -0
@@ -56,6 +56,12 @@
56
56
  "locator_via_article_finder": "python ~/.claude/skills/article-finder/cache-check.py <doi>",
57
57
  "gold_via_article_finder": "python ~/.claude/skills/article-finder/ai-gold.py get <key> --view reading",
58
58
  "render_command": "python tools/render_for_audit.py --key {key} --out {out}",
59
+ "allowed_omissions": [
60
+ "Front-matter masthead block between the title (H1) and the Abstract: author name line(s), affiliation line(s), corresponding-author contact line, journal-name banner, volume/issue/page range, 'Article type', publisher/copyright line, 'Article reuse guidelines', journal-home URL. docpluck strips this masthead by design (render.py _strip_frontmatter_masthead_block); consumers needing structured author metadata use CrossRef/DOI lookup. Its absence from the rendered .md is NOT TEXT-LOSS or METADATA-LEAK.",
61
+ "Publication-history date lines such as 'Received <date>; revision accepted <date>' or 'Received/Revised/Accepted' mastheads — stripped as journal furniture (normalize.py _PAGE_FOOTER_LINE_PATTERNS).",
62
+ "Running headers / running feet (e.g. 'Ip and Feldman', 'PERSONALITY AND SOCIAL PSYCHOLOGY BULLETIN'), bare page numbers, standalone DOI/ORCID banner lines, and copyright lines — stripped as page furniture.",
63
+ "ORCID iD lines and 'Data Availability'/'Open Practices' badge furniture that pdftotext renders as standalone fragments."
64
+ ],
59
65
  "audit_subagent_prompt": "~/.claude/skills/_shared/iterate-loop/audit-subagent-prompt.md",
60
66
  "audit_subagent_model": "sonnet",
61
67
  "audit_invocation": {
@@ -1,4 +1,34 @@
1
1
 
2
+ ## Stripping load-bearing front-matter metadata exposes pre-existing wrapped-title duplicates (2026-05-26 Cluster E revert)
3
+
4
+ **What:** Cycle 4 of run 11 added line patterns to strip bare article ID (`1327169`) + article-type code (`research-article2025`) at top of PSPB-layout docs. Patterns smoke-tested clean (zero false positives across 20 cases). Render showed top-of-doc metadata correctly gone — but introduced `### Title duplicate` as wrapped multi-line text immediately under the H1. Root cause: pdftotext emits the title TWICE on PSPB layouts (main + running-header copy in column 2). The metadata lines were absorbing/separating the duplicate so it never reached `_promote_isolated_titlecase_subsection_headings`. Without them, the wrap candidate is now isolated and gets promoted.
5
+
6
+ **How to detect:** any metadata-strip cycle where the BEFORE render had multi-line text just under the H1 (before the author byline) needs a wrapped-title-duplicate check AFTER the strips. Compare H1 token set vs the next 5-10 non-blank lines' token sets — if there's high overlap, the lines under the H1 are likely a duplicate that the metadata block was hiding.
7
+
8
+ **Fix:** strip metadata + install a wrapped-title-duplicate detector in the same change. Don't ship the strip alone. The duplicate detector should match a paragraph-block under the H1 whose concatenated text equals the H1 modulo whitespace (or whose tokens are a high-overlap subset). Run AFTER the strips, BEFORE `_promote_isolated_titlecase_subsection_headings` so the wrap doesn't get promoted to `### `.
9
+
10
+ **File:** `docpluck/normalize.py::_FRONTMATTER_LEAK_LINE_PATTERNS` had the `_ARTICLE_TYPE_CODE` + `_BARE_ARTICLE_ID` additions reverted; the safer `Article reuse guidelines:` leaf-node P0 pattern was kept (it's not load-bearing).
11
+
12
+ ## Subsection-chain promotion needs (a) parent-section blacklist AND (b) strict-adjacent backward walk (2026-05-26 Cluster A-ter)
13
+
14
+ **What:** Stacked Method subsections (e.g., `## Method` immediately followed by `Design and Procedure` + blank + `Power Analysis and Sensitivity Test` + blank + body) were not being promoted to `### ` headings because the existing `_promote_isolated_titlecase_subsection_headings` cell-region reject + sibling-label reject correctly reject each candidate individually but can't see across the chain to confirm "this is a real stacked subsection set." The chain detection helper (`_is_subsection_chain_member`) closes that gap.
15
+
16
+ **Two safety guards are mandatory:**
17
+ 1. **`_CHAIN_REJECT_PARENTS` blacklist** — when the chain's parent is `## Author Contributions` / `## CRediT` / `## Funding` / `## Acknowledgments` / etc., the candidates underneath are list items (CRediT roles, ORCID names, funding agencies), NOT subsection headings. Walking back to find the parent label and rejecting these is essential — otherwise chan_feldman's "Methodology" CRediT role gets promoted to `### Methodology` (the existing `test_chan_feldman_no_credit_role_methodology_heading` test catches this regression).
18
+ 2. **Strict-adjacent backward walk** (don't traverse through body) — a through-body backward walk over-promotes Table 4 row labels on ip_feldman ("Exploratory open-ended" / "Well-being measures and traits" / "IV1: estimation of negative emotional events" — these look like chain members under `## Method` if you walk through body). Strict-adjacent (only blank-separated candidates count) avoids this trap.
19
+
20
+ **How to detect:** after any chain-promotion change, render and grep both: (a) `^### Methodology` on chan_feldman (must be 0) and (b) `^### Exploratory open-ended` / `^### IV1:` on ip_feldman (must be 0).
21
+
22
+ **File:** `docpluck/render.py::_is_subsection_chain_member` (helper) + `_CHAIN_REJECT_PARENTS` frozenset (blacklist) + integration in `_promote_isolated_titlecase_subsection_headings` (bypass cell-region + sibling-label rejects when chain confirmed).
23
+
24
+ ## Orphan affiliation wrap-tail needs a tight line-level pattern with 60-char length lookahead (2026-05-26 Cluster C-bis)
25
+
26
+ **What:** Cluster C's name-led-affiliation pattern in `_FRONTMATTER_LEAK_PARA_PATTERNS` matches the first line of a 2-line wrapped corresponding-author paragraph (`"Gilad Feldman, Department of Psychology, University of Hong Kong, Pok"`), but the wrap-tail (`"Fu Lam, Hong Kong SAR."`) survives because line-by-line iteration in `_strip_frontmatter_metadata_leaks` can't see across the boundary. The 2026-05-25 Cluster C run cleared finding #1 mostly but left this orphan.
27
+
28
+ **Fix shape:** `^(?=.{1,60}$) <1-3 title-case place tokens>, <region: title-case+all-caps OR all-caps+optional-zip OR title-case>\.\s*$`. The 60-char lookahead bounds the line length so legitimate body sentences ending with a "Place, Region" phrase (typically much longer) aren't absorbed. Position-gated to front-matter zone via the outer strip's 8000-char cutoff.
29
+
30
+ **File:** `docpluck/normalize.py::_ORPHAN_AFFIL_WRAP_TAIL`. Regression tests in `tests/test_normalize_metadata_leak_real_pdf.py` covering positive variants ("Berkeley, CA.", "Cambridge, MA 02138.", etc.) and negative shapes ("(Miller & Prentice, 1994).", citations, body sentences containing place names).
31
+
2
32
  ## CHANGELOG-documented public-API names must be in `__all__` (caught 2026-05-07, v2.0.0 release)
3
33
 
4
34
  **What:** v2.0.0 CHANGELOG line "`Cell, Table, Figure, StructuredResult` TypedDicts and `TABLE_EXTRACTION_VERSION` re-exported from top-level `docpluck`" was inaccurate — `Cell` was importable via `docpluck.tables.Cell` but not from top-level `docpluck`. Caught by /ship Phase 3 cleanup against `docpluck.__all__`.
@@ -502,3 +532,14 @@ Rotation picks `pool[(N mod L) : (N mod L) + rotation_size]` wrapping. Over `cei
502
532
 
503
533
  **Bundling N distinct mechanisms in one cycle is OK when each is independently revertible** — the discipline rule "one class of defect per cycle" exists to prevent un-revertible co-fixes. When each fix is a contiguous block touching a different code path with its own test, bundling halves the release/deploy cost vs. N separate cycles. Cycle 14 packaged 3 fixes; cycle 15 packaged 5. Both shipped clean.
504
534
 
535
+
536
+ ## 2026-06-07 — finding-key identity must be line-number-invariant; deferred half-fixes resurface
537
+ - A canary-audit `finding_key` that embeds a line number is brittle: any normalize/render change that adds or removes a line shifts every downstream finding's line number, so `--gate-new-only` re-opens KNOWN deferred findings as NEW and falsely blocks the commit. Identity must be severity + verbatim excerpt (the document text is render-stable). Fixed in `_shared/iterate-loop/canary_findings.py::_norm_location`.
538
+ - The Sonnet canary auditor needs an `allowed_omissions` list (front-matter masthead, publication-history dates, running headers) or it flags docpluck's by-design strips as TEXT-LOSS and forces SKIP_CANARY. Mirror the in-session verifier's allowed-omissions list. Wired via `canary.json::verification_protocol.allowed_omissions` → payload → `audit-subagent-prompt.md`.
539
+ - A docstring that says "a separate pass can rejoin/fix this later" is a latent finding, not a note. `_demote_continuation_promoted_headings` deferred its rejoin in cycle 3; it surfaced as canary finding #2 in cycle 5. Finish the fix when you write it.
540
+ - The per-commit canary hook on a paper with a LARGE open deferred surface + single-audit non-determinism will keep requiring SKIP_CANARY until that surface is fully baselined or fixed. Don't read SKIP_CANARY as "gate is broken" — read it as "this canary has known multi-session debt." Surface it; use `--full` (double-audit union) to baseline more completely.
541
+
542
+ ## 2026-06-07 — O5 reference reading-order inversion + a broken WIP that silently disabled column-correction
543
+ - **A handoff item lumped into a "won't-fix" class can be a real, different-class bug.** scimeto's O5 ("chen refs stranded before the References header") was tagged "docpluck won't-fix" by association with the A–D pdftotext-glyph classes. It was actually a *reading-order* defect docpluck owns (pdftotext inverts a banded page's two reference columns). Always re-derive layer-of-origin per item; don't inherit a sibling's disposition.
544
+ - **`except Exception: pass` around an optional pipeline stage hides a hard syntax error.** A prior session left `extract_columns.py` with an IndentationError + a call to an undefined `_detect_2col_midline_gutter` in the working tree. Because `extract.py` wraps the column-correction call in a bare `except Exception: pass` ("signal-only, never block extraction"), the module simply failed to import on every flagged page → column-correction was silently OFF corpus-wide, with no error surfaced. When you see a broad `except: pass` around a feature, test that the feature actually still runs — don't assume "no error" means "working." (Companion to `feedback_no_silent_optional_deps`.)
545
+ - **The safe way to ship a reading-order change (which char-ratio/Jaccard gates are BLIND to): a word-preservation guard.** Accept a geometric re-extraction only if it preserves the page's substantial-word multiset (alphabetic tokens len≥2) — a pure reorder can't drop or fabricate text (rules 0a/0b), and the corpus char-ratio gate (blind to order) still confirms no content change. Confine the new path behind an explicit flag so the legacy path stays byte-identical, and verify the *trigger* fires on a tiny, hand-checked subset (2 of 101 papers here) before trusting it.
@@ -1013,3 +1013,40 @@ Mid-run (between cycle 6 and cycle 7) the user re-stated the LEAVE NOTHING BEHIN
1013
1013
  - Cycle 11 — diagnose remaining individual non-idempotent cases (verify_out scan post-cycle-10 expected ~10-15 from initial 40). Many are likely whitespace-only or single-paper quirks.
1014
1014
  - Group B / B1 — plos-med-1 TABLE-builder cluster. ARCHITECTURAL — needs user direction before coding.
1015
1015
  - Phase 8 Tier-3 verify — confirm v2.4.62 on Railway after this cycle's auto-bump deploys.
1016
+
1017
+ ---
1018
+
1019
+ ## Cycle 5 (2026-06-07) — canary findings #1 + #2 cleared (v2.4.79) + canary-gate substrate hardening
1020
+
1021
+ **Target (from 2026-06-07 handoff):** clear ip_feldman canary finding #1 (METADATA-LEAK, residual `Received …; revision accepted …` line) and #2 (HALLUCINATION, `Supplemental Materials` fragment). Both DONE + verified.
1022
+
1023
+ **Finding #1** — US-format publication-history line. Root cause: every existing `Received…` strip pattern is European-order (`Received DD Month YYYY`); ip_feldman is US `Received Month DD, YYYY; revision accepted …`. Fix: one `_PAGE_FOOTER_LINE_PATTERNS` entry whose date sub-pattern accepts BOTH orders, anchored on the distinctive `revision accepted` + trailing year. The handoff suggested `^Received .*; revision accepted .*$` — I tightened it (verified it rejects body prose like "We received … revision accepted by reviewers in 2025").
1024
+
1025
+ **Finding #2** — confirmed **audit FALSE-POSITIVE** (the sentence is real, gold line 86). The real defect was a demote-miss: `_demote_continuation_promoted_headings` stripped a `## Supplemental Materials` false-heading but left it as an orphan bare line (period stripped by the promotion, blank-line padding retained) → read as a hallucinated fragment. Fix: rejoin the demoted continuation to the prior line it grammatically continues + restore the terminal period. The function's own docstring had *deferred* this rejoin ("for now we just stop the false-heading") — a deferred half-fix that surfaced as a finding two sessions later. **Lesson: a docstring that says "a separate pass can do X later" is a latent finding; finish it.**
1026
+
1027
+ **Substrate defects found + fixed while gating (LEAVE NOTHING BEHIND):**
1028
+ 1. **`finding_key` was line-number-sensitive.** `_norm_location` folded a location to `"line N"` keeping the number. My fix shifted ip_feldman's line numbers (removed 1 line, rejoined 1) → deferred Table-10 finding moved `line 1351`→`line 1348` → the `--gate-new-only` ledger saw it as NEW and BLOCKED the commit. This defeats the entire regression-only gate: ANY render change trips it. Fixed `_norm_location` to drop the line number (identity = severity + verbatim excerpt) and added an invariance self-test. **This is the #1 reason the per-commit canary hook was unusable.**
1029
+ 2. **Audit had no `allowed_omissions` concept.** The Sonnet canary auditor flagged docpluck's by-design front-matter masthead strip (authors/affiliations/journal/DOI) as TEXT-LOSS — a recurring false-positive that forces SKIP_CANARY. Added an optional `allowed_omissions` payload field (wired through canary-audit.sh + audit-subagent-prompt.md + canary.json) so by-design omissions aren't findings. Mirrors the in-session verifier's allowed-omissions list.
1030
+ 3. **Single-audit non-determinism vs a large deferred surface.** ip_feldman has a big KNOWN-deferred defect set (Camelot tables + R4 reading-order). The `--quick` single audit samples a DIFFERENT ~7 of them each run, so `--gate-new-only` keeps finding "new" ones until the whole surface is baselined. The `--full` double-audit+union mitigates but can't eliminate. **Practical consequence: per-commit canary on a paper with a large open deferred surface will keep requiring SKIP_CANARY until the deferred work lands — surface this, don't pretend the gate is clean.**
1031
+
1032
+ **Gate honesty:** iterate-gate --cycle 1 = FAIL (I2: only audited the litmus; I3: ip_feldman still FAIL on deferred findings). Correct per rule 0e-bis — the cycle made progress but the canary isn't clean. NOT marked PASS, NOT tagged. The 10 deferred findings (Camelot B4 + R4 reading-order) are each their own multi-session architecture effort — surfaced to the user, not bundled.
1033
+
1034
+ **Verification:** full pytest 1870 passed; 26/26 corpus baseline; substrate self-tests 17/17; canary audit confirmed zero METADATA-LEAK/HALLUCINATION remain on ip_feldman.
1035
+
1036
+ ---
1037
+
1038
+ ## Run: 2026-06-07 (resume) · O5 reference reading-order inversion · v2.4.79 → v2.4.80
1039
+
1040
+ ### Outcome
1041
+ - SHIPPED (uncommitted, user to review) — O5 from the citationguard-iterate handoff: chen + jamison reference lists were reading-order-inverted by pdftotext (a banded page stacks a CRediT table above a 2-column reference list; pdftotext emits the right reference column before the left column carrying the `References` heading → a block of entries stranded above the heading). Fixed via region-aware column re-extraction. chen: 0 stranded / 101 ordered (was 36+ stranded); jamison: 0 / 37; 99 other corpus papers unchanged.
1042
+
1043
+ ### What the handoff started as vs what it became
1044
+ - Began as "address the A–D text-extraction handoff" → user decided **won't-fix, alert scimeto** (A/B superscript flatten+glue, C semantic hyphen, D ref mangling are all pdftotext text-channel / source-PDF defects — verified at the raw level; Strömwall was already resolved, a pymupdf-fixture artifact). Then the user surfaced **O5** (in `TRIAGE_iterate_2026-06-07.md`, not the A–D handoff) and reversed to "review/verify/fix it all + build O5."
1045
+ - **Lesson: don't inherit a sibling item's disposition.** scimeto had tagged O5 "docpluck won't-fix" by association with A–D. Re-deriving layer-of-origin showed O5 is a *reading-order* bug docpluck owns — a different class entirely.
1046
+
1047
+ ### Blind spots / process
1048
+ - **A broad `except Exception: pass` hid a hard import error.** The working tree had a broken in-flight WIP of this exact feature (`extract_columns.py` IndentationError + undefined `_detect_2col_midline_gutter`); the `except: pass` in `extract.py`'s column-correction call meant the module silently failed to import on every flagged page → column-correction was OFF corpus-wide with no surfaced error. Restored to clean HEAD before building. When a feature is wrapped in `except: pass`, test that it actually runs.
1049
+ - **The corpus char-ratio/Jaccard gate is blind to reordering** (known: `feedback_ai_verification_mandatory`). The safe shipping pattern for a reading-order change: a **word-preservation guard** (accept the re-extraction only if it preserves the substantial-word multiset — a pure reorder, rules 0a/0b) + confine the new path behind an explicit flag (legacy byte-identical) + verify the trigger fires on a tiny hand-checked subset (2 of 101) before trusting it.
1050
+
1051
+ ### Tests
1052
+ - `tests/test_o5_reference_inversion_real_pdf.py` — real-PDF, parametrized over chen + jamison: detection fires, refs corrected (0 before / ≥20 after header), text preserved (anchor surnames present), detector selective (does NOT fire on a normally-ordered paper).
@@ -144,8 +144,11 @@ CHECKS (in order; each must pass):
144
144
  the library may apply: U+2212→hyphen, soft-hyphen rejoin, line-wrap reflow,
145
145
  NFKC composition, smart-quote→straight-quote. Allowed omissions ONLY:
146
146
  running headers, page numbers, copyright lines, Crossref boilerplate,
147
- watermark strips, ORCID lines, DOI banner lines. Anything else missing is a
148
- TEXT-LOSS finding.
147
+ watermark strips, ORCID lines, DOI banner lines, publication-history date
148
+ lines (e.g. "Received <date>; revision accepted <date>", "Received/Revised/
149
+ Accepted" mastheads — docpluck strips these as journal furniture by long-
150
+ established design; not sentence-shaped body prose). Anything else missing is
151
+ a TEXT-LOSS finding.
149
152
 
150
153
  2. HALLUCINATION check.
151
154
  For every substantive paragraph in the rendered .md (≥60 chars,
@@ -47,3 +47,9 @@ tmp_*.py
47
47
 
48
48
  # Verification harness outputs (scripts/harness) — extraction artifacts, not committed.
49
49
  verify_out/
50
+
51
+ # Canary-audit persistent open-finding ledger (per-machine cross-run audit
52
+ # STATE written by ~/.claude/skills/_shared/iterate-loop/canary-audit.sh). It is
53
+ # git-SHA-keyed but reflects local audit history, so it's machine-local state —
54
+ # not committed (canary.json, the sibling CONFIG, IS tracked).
55
+ .claude/skills/_project/canary-findings-ledger.json
@@ -1,5 +1,57 @@
1
1
  # Changelog
2
2
 
3
+ ## [2.4.80] — 2026-06-07
4
+
5
+ **O5 — reference reading-order inversion fixed (citationguard-iterate handoff).** Region-aware column re-extraction (`extract.py` + `extract_columns.py`); no `NORMALIZATION_VERSION` change (text channel only). Keyed on a STRUCTURAL signature — reference-list entries serialized ABOVE their own `References` heading on a page — never paper identity. Gated against the 26-paper corpus baseline + the column-test suite (32 existing pass) + 5 new real-PDF tests.
6
+
7
+ - **Root cause:** on two-column papers whose final page stacks a full-width contributor (CRediT) table ABOVE a two-column reference list, pdftotext serializes the page's columns out of order — emitting the RIGHT reference column before the LEFT column that carries the `References` heading. A block of reference entries is thereby stranded ABOVE the heading, invisible to any consumer scanning for references after it (citationguard-iterate O5: "36 chen refs stranded before the References header"). This is the R4/reading-order half of the same column-interleave root cause diagnosed for `ip_feldman` (finding #5 in 2.4.79).
8
+ - **`_detect_reference_inversion_pages`** (`extract_columns.py`): a cheap, text-only detector that flags a page when ≥3 reference-list-entry lines (`Surname, F. M.` shape, anchored at line start so in-text citations don't match) appear before that page's `References`/`Bibliography` heading. Reference entries cannot precede their own section heading in correct reading order, so this is an unambiguous inversion signature. It fires on exactly 2 papers in the 101-paper corpus (`chen_2021_jesp` p19, `jamison_2020_jesp` p9) — both genuine inversions.
9
+ - **`_detect_2col_midline_gutter`** (`extract_columns.py`): a midline detector that looks for a full-height, empty central GUTTER STRIP. It is stronger than the word-center histogram for narrow (~4pt) reference-column gutters the histogram can't resolve, and — because a clean full-height strip cannot coexist with a full-width table row crossing the center — it lets the prose columns be corrected while BYPASSING the y-row bilateral gate that (correctly) protects genuine table pages. It is confined to the inversion path (`allow_gutter_fallback`), so the legacy column-interleave path stays byte-identical.
10
+ - **Word-preservation guard** (`splice_column_corrected_pages` `word_preserve_pages`): a flagged page's geometric re-extraction is accepted ONLY if it preserves the page's substantial-word multiset (every alphabetic token of length ≥2) — a pure reorder that can never drop or fabricate reference text (rules 0a / 0b). Trivial digit/single-char churn at crop boundaries is tolerated.
11
+ - **Result:** `chen_2021_jesp` now has 0 reference entries stranded before the heading (was 36+); 101 entries now follow it in alphabetical order. `jamison_2020_jesp` is fixed likewise (0 stranded; 37 entries). The other 99 corpus papers are unchanged (the detector does not fire, so the legacy path is byte-identical). Classes A–D from the same handoff are pdftotext text-channel / source-PDF artifacts and remain consumer-side (citelink) — see `docs/superpowers/handoffs/2026-06-07-text-extraction-defects-from-citationguard-iterate.md`.
12
+
13
+ New regression tests: `tests/test_o5_reference_inversion_real_pdf.py` (detection + correction + text-preservation + detector-selectivity; real-PDF, parametrized over chen + jamison).
14
+
15
+ ## [2.4.79] — 2026-06-07
16
+
17
+ **Cycle 5 — canary findings #1 (METADATA-LEAK) + #2 (HALLUCINATION false-positive) cleared on `ip_feldman_2025_pspb`.** `NORMALIZATION_VERSION` 1.9.27 → 1.9.28. Both fixes keyed on a STRUCTURAL signature, never paper identity; gated against the 26-paper corpus baseline + full pytest.
18
+
19
+ - **US-format publication-history line strip** (`normalize.py` `_PAGE_FOOTER_LINE_PATTERNS`): a new pattern strips the `Received Month DD, YYYY; revision accepted Month DD, YYYY` line (Sage / APA journals — PSPB) that leaked between `## Keywords` and `## Introduction`. The existing publication-history patterns are all European-order (`Received DD Month YYYY; Accepted …`); this adds the US `Month DD, YYYY` order with the distinctive `revision accepted` phrase. The date sub-pattern accepts either order; the `revision accepted` token plus a trailing year makes the line unmistakably journal boilerplate, never body prose. Clears canary finding #1. (The verifier's allowed-omissions list — `references/ai-full-doc-verify.md` — now records publication-history dates as journal furniture docpluck strips by long-established design, so the finding can't churn METADATA-LEAK→TEXT-LOSS.)
20
+ - **Continuation-demoter rejoin** (`render.py` `_demote_continuation_promoted_headings`): when a `## <Title>` is demoted because the prior line ends in a continuation word (soft-wrap split false-promotion), the demoted phrase is now **rejoined to the prior line it grammatically continues**, with the trailing sentence terminator restored when the phrase completes the sentence — instead of being left as an orphan bare line. An orphaned `Supplemental Materials` fragment (period stripped by the promotion, blank-line padding retained) read as a hallucination to the AI verifier; `The calculated effect sizes are summarized in the` + `Supplemental Materials` now renders as the single continuous sentence `The calculated effect sizes are summarized in the Supplemental Materials.` matching the gold. Clears canary finding #2 (audit false-positive — the text was always real; the defect was the spurious mid-sentence split). This completes the deferred rejoin the function's own docstring had flagged ("a separate normalize pass can rejoin it … for now we just stop the false-heading").
21
+
22
+ New regression tests: `tests/test_normalize_metadata_leak_real_pdf.py` (US-format Received-line contract + variant + real-PDF cases), `tests/test_hallucinated_heading_continuation_guard.py` (`TestContinuationRejoin` — rejoin/no-orphan, period restoration, no-double-period, lowercase-continuation guard + real-PDF rejoined-sentence assertion).
23
+
24
+ **Still NOT tagged** — canary retains the deferred multi-session findings: #3 Table 2 mid-Introduction (Camelot/B4), #4 Table 10 mid-External-Analysis (Camelot/B4), #5 `## Discussion` displaced Table-9 footnote + External-Analysis fragment (R4 column-aware reading-order). See the 2026-06-07 handoff.
25
+
26
+ ## [2.4.78] — 2026-06-06
27
+
28
+ **Cycle 4 redux — canary findings #1/#3/#4 cleared on `ip_feldman_2025_pspb` + citationguard text-extraction handoff Defect 1.** `NORMALIZATION_VERSION` 1.9.26 → 1.9.27. Driven by the Sonnet-via-Claude-Max canary audit (headless gate now operational after `claude setup-token`). Five render/normalize fixes, each keyed on a STRUCTURAL signature (never paper identity), each gated against the 26-paper corpus baseline + full pytest:
29
+
30
+ - **Cluster E re-land** (`normalize.py` `_FRONTMATTER_LEAK_LINE_PATTERNS`): the `_ARTICLE_TYPE_CODE` (`research-article2025` etc.) + `_BARE_ARTICLE_ID` (6–8 digit standalone) patterns drafted in v2.4.77 were **reverted there** (they exposed a wrapped-title duplicate) and shipped as dead code. Now genuinely wired — the revert's blocker is resolved by the new wrapped-title demoter below. Clears the bare article-ID + article-type-code half of canary finding #1.
31
+ - **Wrapped-title-duplicate demoter** (`render.py` `_demote_wrapped_title_duplicate`): strips a `### {prefix-of-H1}` + continuation block immediately under the H1 (pdftotext emits the title twice on PSPB/Sage column layouts). Token-prefix match vs the H1 with a ≥75%-coverage gate; runs after title rescue. Resolves the side-effect that forced the v2.4.77 Cluster E revert.
32
+ - **Cohesive masthead-block strip** (`render.py` `_strip_frontmatter_masthead_block`): strips the residual publisher masthead (author+superscript lines, journal-name wraps, page range, copyright tail, `DOI:` label, bare DOI) between the H1 and the first `## ` heading. Self-limiting ≥2-hard-marker gate; a prose-break guard preserves an undetected abstract. Clears the remainder of canary finding #1 — the doc now flows `# Title` → `## Abstract` cleanly.
33
+ - **Column-wrapped heading repair** (`render.py` `_repair_column_wrapped_headings`): Rule A promotes a body `{Title} et al.` + bare `(YYYY)` citation-wrap to `### {Title} et al. (YYYY)` (finding #3 — "Choice of Study for Replication"); Rule B reattaches a short colon-/paren-led orphan tail onto a heading split mid-title (finding #4 — "Original Hypotheses…Target Article: Jordan et al. (2011)"). Both gated on followed-by-prose so two real sibling headings are never merged.
34
+ - **Soft-hyphen line-break dehyphenation** (`normalize.py` S6): a U+00AD immediately before a line break is always a discretionary extraction hyphen splitting one word across the wrap; the join now drops the U+00AD *and* the newline (gated on a following letter) BEFORE the existing bare strip, so `relation­\nship` → `relationship` instead of the previously-surviving space-broken `relation ship`. Closes citationguard text-extraction handoff Defect 1 (recovered com/mitment, pro/motion, altru/ism, relation/ship on chan_feldman_2025_cogemo; U+00AD count 0). Defect 3 (dropped reference line) verified CLEAN in docpluck (pymupdf-only); Defect 2 (position-dependent glyph mis-map `Västfjäll`→`Vastfall`) reproduced but deferred — needs a same-document surname-consensus normalizer (architecture decision).
35
+
36
+ **Run-11 hallucinated-heading guards** (`render.py` `_promote_isolated_titlecase_subsection_headings` + new `_strip_pre_title_heading_noise` / `_nearest_h2_parent_label`): three general structural guards that clear four mis-promoted-heading findings across the canary set:
37
+
38
+ - **Pre-H1 heading strip** — a heading sitting ABOVE the document title is a journal-section label promoted in the masthead zone; stripped after title rescue. Clears `### FlashReport` (ar_apa); generalizes to `### RESEARCH ARTICLE`-shape front matter.
39
+ - **Regular-path blacklisted-parent reject** — the regular promotion path now consults `_CHAIN_REJECT_PARENTS` (walking back to the nearest `## ` parent, tolerating interleaved running-header lines), matching the chain path. Clears `### Methodology` (a CRediT role label under `## Author Contributions` on plos_med, promoted when a running-header broke chain adjacency).
40
+ - **Lowercase-body reject** — a candidate whose following body's first alphabetic char is lowercase is a fragment torn from a running sentence, not a title. Clears `### Close replication` (chan_feldman) and `### Proced` (plos_med truncation fragment).
41
+
42
+ Deferred (surfaced, not hacked): `### Reasons for change` (ip_feldman — a Table 5 column header; needs table-region awareness), the `## Data Availability` end-matter absence (RCA correction: it never enters the text channel — pdftotext drops the box; cross-channel/layout recovery, NOT a demoter over-strip as the run-11 handoff assumed), and the citationguard `Open Science Collaboration`→`Open, S. C.` org-author collapse (RCA: baked into the PDF's embedded text by the publisher; identical in pdftotext AND pdfplumber; no safe general docpluck fix — routed to CitationGuard for DOI/CrossRef author reconciliation).
43
+
44
+ New regression tests: `tests/test_render_frontmatter_masthead.py` (27 cases — wrapped-title demoter, masthead-block gate, column-wrapped heading repair, pre-H1/blacklist-parent/lowercase-body guards + real-PDF), `tests/test_normalize_soft_hyphen_dehyphenation.py` (5 cases incl. real-PDF), plus front-matter cases added to `tests/test_normalize_metadata_leak_real_pdf.py`. **NOT yet tagged** — canary still has open table findings (#2/#6/#7 Camelot, multi-session) + reading-order findings (#5/#8, R4 column-aware) + the deferred items above. See the 2026-06-06 handoff.
45
+
46
+ ## [2.4.77] — 2026-05-26
47
+
48
+ **Cluster E front-matter cleanup follow-up to v2.4.76.** `NORMALIZATION_VERSION` 1.9.25 → 1.9.26. Three additional publisher-metadata strip patterns observed after v2.4.76 shipped (Stream A continuation work for ip_feldman_2025_pspb + ar_apa front-matter):
49
+
50
+ - **`_PAGE_FOOTER_LINE_PATTERNS`** (`normalize.py`): new `^Article reuse guidelines:?$` pattern. Sage / PSPB publisher boilerplate that pdftotext emits as a standalone front-matter line. Tight-anchored so it can't match body prose.
51
+ - **`_FRONTMATTER_LEAK_LINE_PATTERNS`** (`normalize.py`): new `_ARTICLE_TYPE_CODE` and `_BARE_ARTICLE_ID` patterns. The article-type code pattern matches `research-article2025`, `meta-analysis2024`, etc. (publisher-internal article-type slug + year). The bare-article-ID pattern matches a standalone 6–8 digit line (the DOI's last segment repeated alone at top-of-doc). Both are position-gated to the front-matter zone (first 8000 chars or 1/6 of doc) via the existing `_strip_frontmatter_metadata_leaks` infrastructure — body false positives impossible.
52
+
53
+ Verification: `test_ip_feldman_top_of_doc_cleaned_real_pdf` passes in isolation and in a 64-test batch. No regression on the v2.4.76 corpus.
54
+
3
55
  ## [2.4.76] — 2026-05-25
4
56
 
5
57
  **§A R4 column-aware re-extraction LANDED — closes jama-open-1 D4 (Key Points sidebar missing).** `NORMALIZATION_VERSION` 1.9.24 → 1.9.25 (concurrent with EC-T1's bump). Closes the final defect of the 2026-05-25 Haiku-orchestration pretest jama-open-1 cluster (HANDOFF_2026-05-25_pretest-followups.md Issue 1 — 5 of 5 defects now closed).
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docpluck
3
- Version: 2.4.76
3
+ Version: 2.4.80
4
4
  Summary: PDF, DOCX, and HTML text extraction and normalization for academic papers
5
5
  Project-URL: Homepage, https://docpluck.app
6
6
  Project-URL: Documentation, https://docpluck.app/api-docs
@@ -35,6 +35,7 @@ Requires-Dist: httpx>=0.27; extra == 'dev'
35
35
  Requires-Dist: lxml>=5.0.0; extra == 'dev'
36
36
  Requires-Dist: mammoth>=1.8.0; extra == 'dev'
37
37
  Requires-Dist: pytest-anyio>=0.0.0; extra == 'dev'
38
+ Requires-Dist: pytest-xdist>=3; extra == 'dev'
38
39
  Requires-Dist: pytest>=8; extra == 'dev'
39
40
  Requires-Dist: python-docx>=1.0.0; extra == 'dev'
40
41
  Requires-Dist: rapidfuzz>=3; extra == 'dev'
@@ -131,6 +131,29 @@ Add only when a real downstream consumer asks for one. YAGNI until then.
131
131
  - [ ] **Task #8**: `ai-gold.py resolve` should accept stem names + source-PDF paths (or docs redirect to `check`).
132
132
  - [ ] **Task #9**: `ai-gold.py onboard` needs `--skip-legacy` / `--ignore-unresolvable` flag (citationguard onboard halted on 3,018 legacy bare-stem keys).
133
133
 
134
+ ## 2026-06-07 — v2.4.78 landed (committed 73462e3, unpushed→pushed, NOT tagged)
135
+
136
+ > **Source:** Cycle-4-redux session. See `docs/superpowers/handoffs/2026-06-07-v2.4.78-landed-canary-iterate.md` for the full state + next-session opener. v2.4.78 cleared canary findings #1/#3/#4 + 4 run-11 hallucinated-heading findings + citationguard soft-hyphen Defect 1. Corpus 26/26, full pytest 1861 passed, canary re-audit 8→5.
137
+
138
+ ### Remaining canary findings on ip_feldman_2025_pspb (from 2026-06-07 re-audit @ 73462e3)
139
+
140
+ - [x] **METADATA-LEAK** — ✅ CLEARED in **v2.4.79** (cycle 5). New `_PAGE_FOOTER_LINE_PATTERNS` entry strips US-format `Received Month DD, YYYY; revision accepted Month DD, YYYY` (date sub-pattern accepts either order; gated on `revision accepted` + trailing year). The handoff-suggested `^Received .*; revision accepted .*$` was tightened to avoid matching body prose. Real-PDF + contract tests added.
141
+ - [x] **HALLUCINATION** — ✅ CLEARED in **v2.4.79** (cycle 5). Confirmed **audit false-positive** (the sentence is real — gold line 86); the actual defect was a spurious mid-sentence split + dropped period left by `_demote_continuation_promoted_headings`. The demoter now rejoins the demoted continuation to the prior line it continues + restores the terminal period. Verifier allowed-omissions doc updated so publication-history dates can't churn METADATA-LEAK→TEXT-LOSS.
142
+ - [ ] **TABLE #3 + #4 (B4) AND SECTION-BOUNDARY #5 (R4) share ONE root cause** — DIAGNOSED 2026-06-07 (cycle 5), see [`docs/superpowers/specs/2026-06-07-ip_feldman-B4-R4-column-interleave-diagnosis.md`](docs/superpowers/specs/2026-06-07-ip_feldman-B4-R4-column-interleave-diagnosis.md). pdftotext column-interleaves table-bearing two-column pages, interleaving table caption/headers/cells AND real body prose into one stream. **Empirically NOT single-cycle** (corrects optimistic subagent estimates): the existing R4 infra's bilateral gate (protects table pages corpus-wide) rejects the target pages (measured bilateral 0.37–0.44 > 0.30), and the B4 leak can't be stripped render-side because real prose is interwoven with table content (would risk TEXT-LOSS, rule 0a). **Real fix = region-aware column detection** (segregate full-width table band from 2-col prose band before column-correcting) — multi-session architecture, **needs user go-ahead**.
143
+
144
+ ### Deferred from this session (surfaced, not hacked)
145
+
146
+ - [ ] **`### Reasons for change`** (ip_feldman) — Table 5 column header promoted to heading; needs table-region awareness (the body-coherence guard doesn't catch it because its body starts capitalized). RCA: rank-3 in the 2026-06-06 run-11 RCA.
147
+ - [ ] **`## Data Availability` end-matter absent** — RCA CORRECTED the run-11 "demoter over-strip" premise: the section never enters the text channel (pdftotext drops the title-page box). Needs cross-channel (pdfplumber) recovery, same architecture class as B7. NOT a demoter exception.
148
+ - [ ] **Glyph `Västfjäll`→`Vastfall`** (ar_apa/collabra, citationguard Defect 2) — baked pdftotext CID-font mis-map; needs a same-document surname-consensus normalizer (new subsystem). **Product/architecture decision on scope.**
149
+ - [ ] **org-author `Open Science Collaboration`→`Open, S. C.`** — baked into the PDF's embedded text by the publisher (identical in pdftotext AND pdfplumber); no safe general docpluck fix. **Routed to CitationGuard** (DOI/CrossRef author reconciliation).
150
+ - [ ] **Tag v2.4.78** once the full canary set clears (currently 5 open). Then bump `PDFextractor/service/requirements.txt` pin + run `/docpluck-deploy`.
151
+
152
+ ### Substrate / infra follow-ups (2026-06-07)
153
+
154
+ - [x] **Tune the canary git-hooks to be ledger-aware (regression-only gate).** ✅ DONE (2026-06-07 cycle 5). `canary-audit.sh --gate-new-only` mode had landed but the hooks weren't passing it. Now wired: `.git/hooks/pre-commit` runs `--quick … --gate-new-only` (routine commits block only on NEW/regressed findings, not the deferred baseline); `.git/hooks/pre-push` uses `--gate-new-only` for **main** pushes but keeps the **strict** no-exceptions gate for **tag** pushes (a release tag must ship a fully-clean canary). Substrate self-tests 16/16 pass.
155
+ - [ ] **Modern Standby permanent disable** (optional, user/admin): `reg add HKLM\SYSTEM\CurrentControlSet\Control\Power /v PlatformAoAcOverride /t REG_DWORD /d 0 /f` + reboot. Current `powercfg /change standby-timeout-ac 0` works but can revert on power-plan change. See memory `feedback_long_runs_die_on_this_machine`.
156
+
134
157
  ### Replicate canary-audit pattern to other iterate skills (after docpluck proven)
135
158
 
136
159
  - [ ] **escicheck-iterate** — easiest pilot (46 successful phase_5d_runs already, well-defined stats-family defect taxonomy). Update `verification_protocol` in its `canary.json`.
@@ -77,7 +77,7 @@ from .figures import Figure
77
77
  from .extract_structured import TABLE_EXTRACTION_VERSION, StructuredResult, extract_pdf_structured
78
78
  from .render import render_pdf_to_markdown
79
79
 
80
- __version__ = "2.4.76"
80
+ __version__ = "2.4.80"
81
81
  __author__ = "Gilad Feldman"
82
82
  __license__ = "MIT"
83
83
 
@@ -134,22 +134,37 @@ def extract_pdf(pdf_bytes: bytes, *, sections: list[str] | None = None) -> tuple
134
134
  # conditional fallback, NOT a default tool swap.
135
135
  try:
136
136
  from .normalize import _detect_column_interleave_pages
137
- from .extract_columns import splice_column_corrected_pages
137
+ from .extract_columns import (
138
+ splice_column_corrected_pages,
139
+ _detect_reference_inversion_pages,
140
+ )
138
141
  ff_offsets: list[int] = [0]
139
142
  for idx, ch in enumerate(text):
140
143
  if ch == "\f":
141
144
  ff_offsets.append(idx + 1)
145
+ # Two cheap text-only detectors decide whether the (more expensive)
146
+ # layout extraction + geometric re-order is worth running:
147
+ # - legacy column-INTERLEAVE (sentences woven between columns), and
148
+ # - O5 reading-order INVERSION (a page's reference entries serialized
149
+ # ABOVE their own References heading — chen page 19). The inversion
150
+ # pages are corrected under the word-preservation guard so the
151
+ # reorder can never lose or fabricate text (rule 0a / 0b).
142
152
  flagged_pages = _detect_column_interleave_pages(text, tuple(ff_offsets))
143
- if flagged_pages:
153
+ inversion_pages = _detect_reference_inversion_pages(text, tuple(ff_offsets))
154
+ all_pages = sorted(set(flagged_pages) | set(inversion_pages))
155
+ if all_pages:
144
156
  from .extract_layout import extract_pdf_layout
145
157
  layout_doc = extract_pdf_layout(pdf_bytes)
158
+ changed: list[int] = []
146
159
  corrected = splice_column_corrected_pages(
147
- text, layout_doc, ff_offsets, flagged_pages,
160
+ text, layout_doc, ff_offsets, all_pages,
148
161
  pdf_bytes=pdf_bytes,
162
+ word_preserve_pages=inversion_pages,
163
+ changed_out=changed,
149
164
  )
150
- if corrected and corrected != text:
165
+ if corrected and corrected != text and changed:
151
166
  text = corrected
152
- method = f"{method}+column_corrected:{','.join(map(str, flagged_pages))}"
167
+ method = f"{method}+column_corrected:{','.join(map(str, changed))}"
153
168
  except Exception:
154
169
  pass
155
170