docpluck 2.4.96__tar.gz → 2.4.98__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (441)
  1. {docpluck-2.4.96 → docpluck-2.4.98}/.claude/skills/_project/lessons.md +12 -0
  2. {docpluck-2.4.96 → docpluck-2.4.98}/.claude/skills/docpluck-iterate/LEARNINGS.md +41 -0
  3. {docpluck-2.4.96 → docpluck-2.4.98}/CHANGELOG.md +41 -0
  4. {docpluck-2.4.96 → docpluck-2.4.98}/LESSONS.md +68 -0
  5. {docpluck-2.4.96 → docpluck-2.4.98}/PKG-INFO +1 -1
  6. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/__init__.py +1 -1
  7. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/extract_structured.py +105 -2
  8. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/tables/cell_cleaning.py +26 -1
  9. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/tables/flatten.py +255 -4
  10. docpluck-2.4.98/docpluck/tables/whitespace.py +360 -0
  11. docpluck-2.4.98/docs/HANDOFF_2026-06-25_iterate_branch-reconcile_rc1-banded-flip-rejected.md +40 -0
  12. docpluck-2.4.98/docs/TRIAGE_2026-06-25_escicheck_handoff_defects.md +120 -0
  13. docpluck-2.4.98/docs/superpowers/handoffs/2026-06-22-dp2-dp5-flatten-fixes-commit.md +71 -0
  14. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/specs/2026-06-08-rc1-region-aware-column-architecture.md +22 -0
  15. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/specs/2026-06-21-rc-t-table-region-prose-contamination.md +47 -0
  16. {docpluck-2.4.96 → docpluck-2.4.98}/pyproject.toml +1 -1
  17. {docpluck-2.4.96 → docpluck-2.4.98}/scripts/check_app_pin_sync.py +37 -5
  18. docpluck-2.4.98/tests/test_check_app_pin_sync.py +99 -0
  19. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_extract_pdf_structured.py +13 -5
  20. docpluck-2.4.98/tests/test_rc_t_layer2_raw_text_real_pdf.py +163 -0
  21. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_smoke_fixtures.py +12 -0
  22. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_tables_flatten.py +85 -0
  23. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_tables_flatten_blank_header_recovery.py +46 -2
  24. docpluck-2.4.98/tests/test_tables_superheader_alignment_real_pdf.py +168 -0
  25. docpluck-2.4.98/tests/test_whitespace_char_fallback.py +232 -0
  26. docpluck-2.4.96/docpluck/tables/whitespace.py +0 -177
  27. {docpluck-2.4.96 → docpluck-2.4.98}/.claude/skills/_project/canary.json +0 -0
  28. {docpluck-2.4.96 → docpluck-2.4.98}/.claude/skills/docpluck-cleanup/SKILL.md +0 -0
  29. {docpluck-2.4.96 → docpluck-2.4.98}/.claude/skills/docpluck-deploy/SKILL.md +0 -0
  30. {docpluck-2.4.96 → docpluck-2.4.98}/.claude/skills/docpluck-iterate/SKILL.md +0 -0
  31. {docpluck-2.4.96 → docpluck-2.4.98}/.claude/skills/docpluck-iterate/references/ai-full-doc-verify.md +0 -0
  32. {docpluck-2.4.96 → docpluck-2.4.98}/.claude/skills/docpluck-iterate/references/cycle-report-template.md +0 -0
  33. {docpluck-2.4.96 → docpluck-2.4.98}/.claude/skills/docpluck-iterate/references/local-verification.md +0 -0
  34. {docpluck-2.4.96 → docpluck-2.4.98}/.claude/skills/docpluck-iterate/references/rationalizations.md +0 -0
  35. {docpluck-2.4.96 → docpluck-2.4.98}/.claude/skills/docpluck-iterate/references/real-library-real-pdf.md +0 -0
  36. {docpluck-2.4.96 → docpluck-2.4.98}/.claude/skills/docpluck-iterate/references/release-flow.md +0 -0
  37. {docpluck-2.4.96 → docpluck-2.4.98}/.claude/skills/docpluck-iterate/references/self-improvement.md +0 -0
  38. {docpluck-2.4.96 → docpluck-2.4.98}/.claude/skills/docpluck-iterate/references/subagent-parallelization.md +0 -0
  39. {docpluck-2.4.96 → docpluck-2.4.98}/.claude/skills/docpluck-iterate/references/three-tier-parity.md +0 -0
  40. {docpluck-2.4.96 → docpluck-2.4.98}/.claude/skills/docpluck-qa/SKILL.md +0 -0
  41. {docpluck-2.4.96 → docpluck-2.4.98}/.claude/skills/docpluck-qa/references/benchmark-mode.md +0 -0
  42. {docpluck-2.4.96 → docpluck-2.4.98}/.claude/skills/docpluck-qa/references/check-11-hard-rules.md +0 -0
  43. {docpluck-2.4.96 → docpluck-2.4.98}/.claude/skills/docpluck-qa/references/check-13-escicheck-production.md +0 -0
  44. {docpluck-2.4.96 → docpluck-2.4.98}/.claude/skills/docpluck-qa/references/check-5-escicheck-library.md +0 -0
  45. {docpluck-2.4.96 → docpluck-2.4.98}/.claude/skills/docpluck-qa/references/check-6-escicheck-local-webapp.md +0 -0
  46. {docpluck-2.4.96 → docpluck-2.4.98}/.claude/skills/docpluck-qa/references/check-7-batch-smoke.md +0 -0
  47. {docpluck-2.4.96 → docpluck-2.4.98}/.claude/skills/docpluck-review/SKILL.md +0 -0
  48. {docpluck-2.4.96 → docpluck-2.4.98}/.github/workflows/bump-app-pin.yml +0 -0
  49. {docpluck-2.4.96 → docpluck-2.4.98}/.github/workflows/publish.yml +0 -0
  50. {docpluck-2.4.96 → docpluck-2.4.98}/.github/workflows/test.yml +0 -0
  51. {docpluck-2.4.96 → docpluck-2.4.98}/.gitignore +0 -0
  52. {docpluck-2.4.96 → docpluck-2.4.98}/CLAUDE.md +0 -0
  53. {docpluck-2.4.96 → docpluck-2.4.98}/CUSTOMER_UPDATE_2026-06-19_tables_sections_api.md +0 -0
  54. {docpluck-2.4.96 → docpluck-2.4.98}/HANDOFF_SECTIONS_APP_INTEGRATION.md +0 -0
  55. {docpluck-2.4.96 → docpluck-2.4.98}/LICENSE +0 -0
  56. {docpluck-2.4.96 → docpluck-2.4.98}/README.md +0 -0
  57. {docpluck-2.4.96 → docpluck-2.4.98}/REPLY_FROM_DOCPLUCK_v1.4.5.md +0 -0
  58. {docpluck-2.4.96 → docpluck-2.4.98}/REPLY_FROM_DOCPLUCK_v1.5.0.md +0 -0
  59. {docpluck-2.4.96 → docpluck-2.4.98}/REPLY_FROM_DOCPLUCK_v2.4.93.md +0 -0
  60. {docpluck-2.4.96 → docpluck-2.4.98}/REPLY_FROM_DOCPLUCK_v2.4.94.md +0 -0
  61. {docpluck-2.4.96 → docpluck-2.4.98}/REPLY_FROM_DOCPLUCK_v2.4.95.md +0 -0
  62. {docpluck-2.4.96 → docpluck-2.4.98}/REQUEST_08_CHUNKING_ENDPOINT.md +0 -0
  63. {docpluck-2.4.96 → docpluck-2.4.98}/REQUEST_09_REFERENCE_LIST_NORMALIZATION.md +0 -0
  64. {docpluck-2.4.96 → docpluck-2.4.98}/REQUEST_10_TABLE_FLATTEN_HTTP_EXPOSURE.md +0 -0
  65. {docpluck-2.4.96 → docpluck-2.4.98}/REQUEST_10_TIER2_ORPHANED_LABEL_ROW_RECOVERY.md +0 -0
  66. {docpluck-2.4.96 → docpluck-2.4.98}/REQUEST_11_FLATTEN_FIELDS_NONCLINICAL_TABLES.md +0 -0
  67. {docpluck-2.4.96 → docpluck-2.4.98}/TODO.md +0 -0
  68. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/__main__.py +0 -0
  69. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/batch.py +0 -0
  70. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/cli.py +0 -0
  71. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/extract.py +0 -0
  72. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/extract_columns.py +0 -0
  73. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/extract_docx.py +0 -0
  74. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/extract_html.py +0 -0
  75. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/extract_layout.py +0 -0
  76. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/figures/__init__.py +0 -0
  77. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/figures/detect.py +0 -0
  78. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/normalize.py +0 -0
  79. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/quality.py +0 -0
  80. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/render.py +0 -0
  81. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/sections/__init__.py +0 -0
  82. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/sections/annotators/__init__.py +0 -0
  83. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/sections/annotators/docx.py +0 -0
  84. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/sections/annotators/html.py +0 -0
  85. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/sections/annotators/pdf.py +0 -0
  86. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/sections/annotators/text.py +0 -0
  87. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/sections/blocks.py +0 -0
  88. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/sections/boundaries.py +0 -0
  89. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/sections/core.py +0 -0
  90. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/sections/taxonomy.py +0 -0
  91. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/sections/types.py +0 -0
  92. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/tables/__init__.py +0 -0
  93. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/tables/bbox_utils.py +0 -0
  94. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/tables/camelot_extract.py +0 -0
  95. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/tables/captions.py +0 -0
  96. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/tables/cluster.py +0 -0
  97. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/tables/confidence.py +0 -0
  98. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/tables/detect.py +0 -0
  99. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/tables/render.py +0 -0
  100. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/telemetry.py +0 -0
  101. {docpluck-2.4.96 → docpluck-2.4.98}/docpluck/version.py +0 -0
  102. {docpluck-2.4.96 → docpluck-2.4.98}/docs/BENCHMARKS.md +0 -0
  103. {docpluck-2.4.96 → docpluck-2.4.98}/docs/BENCHMARKS_liteparse_2026-06.md +0 -0
  104. {docpluck-2.4.96 → docpluck-2.4.98}/docs/DESIGN.md +0 -0
  105. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-07_sections_strict_iteration.md +0 -0
  106. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-09_session_state_and_followups.md +0 -0
  107. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-09_unified_extraction_brainstorm.md +0 -0
  108. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-10_table_rendering_iteration.md +0 -0
  109. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-10_table_rendering_iteration_2.md +0 -0
  110. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-10_table_rendering_iteration_3.md +0 -0
  111. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-10_table_rendering_iteration_4.md +0 -0
  112. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-10_table_rendering_iteration_5.md +0 -0
  113. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-10_table_rendering_iteration_6.md +0 -0
  114. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-10_table_rendering_iteration_7.md +0 -0
  115. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-11_PROMOTE_SPIKE_TO_LIBRARY.md +0 -0
  116. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-11_table_rendering_iteration_8.md +0 -0
  117. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-11_visual_review_findings.md +0 -0
  118. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-12_phase2_101pdf_corpus.md +0 -0
  119. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-12_remaining_ui_and_chrome_verification.md +0 -0
  120. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-12_visual_verify_results.md +0 -0
  121. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-13_apa_50_expansion.md +0 -0
  122. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-13_apa_50_expansion_iter_1.md +0 -0
  123. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-13_apa_50_expansion_iter_2.md +0 -0
  124. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-13_iterate_skill_first_use.md +0 -0
  125. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-13_iterative_1.md +0 -0
  126. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-13_iterative_library_improvement.md +0 -0
  127. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-13_table_extraction_next_iteration.md +0 -0
  128. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-14_continue_iterations_v2_4_30_to_15n.md +0 -0
  129. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-14_full_corpus_iteration_v2_4_30.md +0 -0
  130. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-14_iterate_6_cycles_complete.md +0 -0
  131. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-14_iterate_9_cycle_run.md +0 -0
  132. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-14_iterate_resume_4_cycles.md +0 -0
  133. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-14_iterate_v2_4_31_cycle_15n.md +0 -0
  134. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-14_phase_5d_gold_audit_v2_4_29.md +0 -0
  135. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-15_autonomous_apa_first_10h.md +0 -0
  136. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-15_iterate_apa_run_1.md +0 -0
  137. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-16_ai-gold-instructions.md +0 -0
  138. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-16_iterate_apa_run_2.md +0 -0
  139. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-16_iterate_apa_run_3.md +0 -0
  140. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-16_iterate_run_4_final.md +0 -0
  141. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-16_iterate_run_4_fix_and_continue.md +0 -0
  142. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-16_iterate_run_5.md +0 -0
  143. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-16_iterate_run_6.md +0 -0
  144. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-17_iterate_run_7.md +0 -0
  145. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-17_iterate_run_8.md +0 -0
  146. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-17_iterate_run_9.md +0 -0
  147. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-18_iterate_run_9_cont.md +0 -0
  148. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-18_iterate_run_9_cont2.md +0 -0
  149. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-20_iterate_run_9_cont3.md +0 -0
  150. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-22_iterate_run_9_session4_final.md +0 -0
  151. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-22_iterate_run_9_session5_close.md +0 -0
  152. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-25_haiku-orchestration-pretest.md +0 -0
  153. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-05-25_pretest-followups.md +0 -0
  154. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-06-08_iterate_splice-wordintegrity-runningheader.md +0 -0
  155. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-06-08_untested_sweep_v2.4.81.md +0 -0
  156. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-06-13_sciencearena_grobid_liteparse.md +0 -0
  157. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-06-15_docpluck-iterate-resume.md +0 -0
  158. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-06-15_rc1-step2-continue.md +0 -0
  159. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-06-16_docpluck-iterate-resume.md +0 -0
  160. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-06-17_iterate_resume-cycle1.md +0 -0
  161. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-06-17_iterate_v2491_shipped.md +0 -0
  162. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-06-18_iterate_v2492_affiliation_caption-revert.md +0 -0
  163. {docpluck-2.4.96 → docpluck-2.4.98}/docs/HANDOFF_2026-06-20_request11_flatten_nonclinical_tables.md +0 -0
  164. {docpluck-2.4.96 → docpluck-2.4.98}/docs/ITERATION_VERIFICATION_LESSONS.md +0 -0
  165. {docpluck-2.4.96 → docpluck-2.4.98}/docs/LIBRARY_APP_SYNC.md +0 -0
  166. {docpluck-2.4.96 → docpluck-2.4.98}/docs/NORMALIZATION.md +0 -0
  167. {docpluck-2.4.96 → docpluck-2.4.98}/docs/README.md +0 -0
  168. {docpluck-2.4.96 → docpluck-2.4.98}/docs/TRIAGE_2026-05-10_corpus_assessment.md +0 -0
  169. {docpluck-2.4.96 → docpluck-2.4.98}/docs/TRIAGE_2026-05-14_phase_5d_gold_audit.md +0 -0
  170. {docpluck-2.4.96 → docpluck-2.4.98}/docs/TRIAGE_2026-06-08_untested_corpus_sweep.md +0 -0
  171. {docpluck-2.4.96 → docpluck-2.4.98}/docs/TRIAGE_2026-06-15_head_v2.4.88_assessment.md +0 -0
  172. {docpluck-2.4.96 → docpluck-2.4.98}/docs/TRIAGE_2026-06-21_head_v2.4.95_assessment.md +0 -0
  173. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/handoffs/2026-05-22-b1-next-iteration.md +0 -0
  174. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/handoffs/2026-05-22-b2-remaining-halluc-head.md +0 -0
  175. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/handoffs/2026-05-22-b3-b7-structural-defects.md +0 -0
  176. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/handoffs/2026-05-22-residual-after-locally-doable-pass.md +0 -0
  177. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/handoffs/2026-05-23-bundled-residual-cycle-CLOSED.md +0 -0
  178. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/handoffs/2026-05-23-residual-after-iterate-spine-cycles-1-3.md +0 -0
  179. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/handoffs/2026-05-25-canary-audit-architecture-and-cluster-A-B-C-landed.md +0 -0
  180. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/handoffs/2026-05-25-wrapup-r4-cycle.md +0 -0
  181. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/handoffs/2026-05-26-run-11-cluster-A-ter-and-C-bis-landed.md +0 -0
  182. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/handoffs/2026-05-26-text-extraction-defects-from-citationguard-audit.md +0 -0
  183. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/handoffs/2026-06-07-text-extraction-defects-from-citationguard-iterate.md +0 -0
  184. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/handoffs/2026-06-07-v2.4.78-landed-canary-iterate.md +0 -0
  185. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/handoffs/2026-06-07-v2.4.79-findings-1-2-cleared.md +0 -0
  186. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/handoffs/2026-06-20_docpluck-skill-file-edits-from-app-cron-fix.md +0 -0
  187. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/handoffs/2026-06-21-rc-t-table-region-implementation.md +0 -0
  188. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/2026-05-06-section-identification.md +0 -0
  189. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/2026-05-06-table-extraction.md +0 -0
  190. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/2026-05-07-sections-strict-iteration-progress.md +0 -0
  191. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/2026-05-08-unified-extraction-phase-0-splice-spike.md +0 -0
  192. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/2026-05-23-haiku-orchestration-pretest.md +0 -0
  193. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/sections-deferred-items.md +0 -0
  194. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/sections-issues-backlog.md +0 -0
  195. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/2026-05-07_spot-01_apa.md +0 -0
  196. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/2026-05-07_spot-02_pattern-A-shipped.md +0 -0
  197. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/2026-05-08_spot-final_all-styles.md +0 -0
  198. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/experiments/COMPARISON.md +0 -0
  199. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/korbmacher_table1.md +0 -0
  200. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/option-a.py +0 -0
  201. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/ziano_table1.md +0 -0
  202. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_notes_raw.txt +0 -0
  203. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_table1.md +0 -0
  204. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/notes.md +0 -0
  205. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/option-b.py +0 -0
  206. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_notes_raw.txt +0 -0
  207. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_table1.md +0 -0
  208. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/korbmacher_table1.md +0 -0
  209. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/notes.md +0 -0
  210. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/option-c.py +0 -0
  211. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/sample-pdftotext-bbox.html +0 -0
  212. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/ziano_table1.md +0 -0
  213. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/korbmacher_table1.md +0 -0
  214. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/notes.md +0 -0
  215. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/option-d.py +0 -0
  216. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/ziano_table1.md +0 -0
  217. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_2022_kruger_bbox.html +0 -0
  218. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_bbox.html +0 -0
  219. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_table1.md +0 -0
  220. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/option-e.py +0 -0
  221. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/sample-bbox.html +0 -0
  222. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_2021_joep_bbox.html +0 -0
  223. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_bbox.html +0 -0
  224. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_table1.md +0 -0
  225. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/html-fallback-demo.md +0 -0
  226. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.err +0 -0
  227. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.md +0 -0
  228. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.err +0 -0
  229. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.md +0 -0
  230. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.err +0 -0
  231. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.md +0 -0
  232. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.err +0 -0
  233. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.md +0 -0
  234. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.err +0 -0
  235. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.md +0 -0
  236. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.err +0 -0
  237. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.md +0 -0
  238. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.err +0 -0
  239. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.md +0 -0
  240. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.err +0 -0
  241. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.md +0 -0
  242. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.err +0 -0
  243. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.md +0 -0
  244. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.err +0 -0
  245. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.md +0 -0
  246. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.err +0 -0
  247. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.md +0 -0
  248. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.err +0 -0
  249. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.md +0 -0
  250. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.err +0 -0
  251. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.md +0 -0
  252. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.err +0 -0
  253. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.md +0 -0
  254. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.err +0 -0
  255. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.md +0 -0
  256. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.err +0 -0
  257. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.md +0 -0
  258. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.err +0 -0
  259. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.md +0 -0
  260. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.err +0 -0
  261. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.md +0 -0
  262. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.err +0 -0
  263. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.md +0 -0
  264. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.err +0 -0
  265. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.md +0 -0
  266. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.err +0 -0
  267. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.md +0 -0
  268. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.err +0 -0
  269. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.md +0 -0
  270. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.err +0 -0
  271. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.md +0 -0
  272. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.err +0 -0
  273. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.md +0 -0
  274. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.err +0 -0
  275. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.md +0 -0
  276. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.err +0 -0
  277. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.md +0 -0
  278. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/papers.md +0 -0
  279. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/report.md +0 -0
  280. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/splice_spike.py +0 -0
  281. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/plans/spot-checks/splice-spike/test_splice_spike.py +0 -0
  282. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/specs/2026-04-27-request-09-reference-normalization-design.md +0 -0
  283. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/specs/2026-05-06-section-identification-design.md +0 -0
  284. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/specs/2026-05-06-table-extraction-design.md +0 -0
  285. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/specs/2026-05-08-unified-extraction-design.md +0 -0
  286. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/specs/2026-05-23-haiku-orchestration-pretest-design.md +0 -0
  287. {docpluck-2.4.96 → docpluck-2.4.98}/docs/superpowers/specs/2026-06-07-ip_feldman-B4-R4-column-interleave-diagnosis.md +0 -0
  288. {docpluck-2.4.96 → docpluck-2.4.98}/scripts/__init__.py +0 -0
  289. {docpluck-2.4.96 → docpluck-2.4.98}/scripts/check_docs_consistency.py +0 -0
  290. {docpluck-2.4.96 → docpluck-2.4.98}/scripts/harness/README.md +0 -0
  291. {docpluck-2.4.96 → docpluck-2.4.98}/scripts/harness/VERIFIER_PROMPT.md +0 -0
  292. {docpluck-2.4.96 → docpluck-2.4.98}/scripts/harness/__init__.py +0 -0
  293. {docpluck-2.4.96 → docpluck-2.4.98}/scripts/harness/baseline_matrix.json +0 -0
  294. {docpluck-2.4.96 → docpluck-2.4.98}/scripts/harness/checks.py +0 -0
  295. {docpluck-2.4.96 → docpluck-2.4.98}/scripts/harness/corpus.py +0 -0
  296. {docpluck-2.4.96 → docpluck-2.4.98}/scripts/harness/corpus_manifest.json +0 -0
  297. {docpluck-2.4.96 → docpluck-2.4.98}/scripts/harness/extract.py +0 -0
  298. {docpluck-2.4.96 → docpluck-2.4.98}/scripts/harness/gold_keys.json +0 -0
  299. {docpluck-2.4.96 → docpluck-2.4.98}/scripts/harness/inspect.py +0 -0
  300. {docpluck-2.4.96 → docpluck-2.4.98}/scripts/lint_rendered_corpus.py +0 -0
  301. {docpluck-2.4.96 → docpluck-2.4.98}/scripts/pretest_capture_tokens.py +0 -0
  302. {docpluck-2.4.96 → docpluck-2.4.98}/scripts/verify_corpus.py +0 -0
  303. {docpluck-2.4.96 → docpluck-2.4.98}/scripts/verify_corpus_full.py +0 -0
  304. {docpluck-2.4.96 → docpluck-2.4.98}/tests/__init__.py +0 -0
  305. {docpluck-2.4.96 → docpluck-2.4.98}/tests/conftest.py +0 -0
  306. {docpluck-2.4.96 → docpluck-2.4.98}/tests/fixtures/__init__.py +0 -0
  307. {docpluck-2.4.96 → docpluck-2.4.98}/tests/fixtures/sections/__init__.py +0 -0
  308. {docpluck-2.4.96 → docpluck-2.4.98}/tests/fixtures/sections/builders.py +0 -0
  309. {docpluck-2.4.96 → docpluck-2.4.98}/tests/fixtures/structured/.gitkeep +0 -0
  310. {docpluck-2.4.96 → docpluck-2.4.98}/tests/fixtures/structured/MANIFEST.json +0 -0
  311. {docpluck-2.4.96 → docpluck-2.4.98}/tests/fixtures/structured/README.md +0 -0
  312. {docpluck-2.4.96 → docpluck-2.4.98}/tests/golden/sections/apa_multi_study_pdf.json +0 -0
  313. {docpluck-2.4.96 → docpluck-2.4.98}/tests/golden/sections/apa_single_study_pdf.json +0 -0
  314. {docpluck-2.4.96 → docpluck-2.4.98}/tests/golden/sections/html_real_headings.json +0 -0
  315. {docpluck-2.4.96 → docpluck-2.4.98}/tests/snapshots/amj_lattice.txt +0 -0
  316. {docpluck-2.4.96 → docpluck-2.4.98}/tests/snapshots/apa_chan_feldman_lineless.txt +0 -0
  317. {docpluck-2.4.96 → docpluck-2.4.98}/tests/snapshots/apa_chen_jesp_lineless.txt +0 -0
  318. {docpluck-2.4.96 → docpluck-2.4.98}/tests/snapshots/apa_efendic_affect.txt +0 -0
  319. {docpluck-2.4.96 → docpluck-2.4.98}/tests/snapshots/apa_ip_feldman_pspb.txt +0 -0
  320. {docpluck-2.4.96 → docpluck-2.4.98}/tests/snapshots/bmc_lattice.txt +0 -0
  321. {docpluck-2.4.96 → docpluck-2.4.98}/tests/snapshots/ieee_figure_heavy.txt +0 -0
  322. {docpluck-2.4.96 → docpluck-2.4.98}/tests/snapshots/ieee_lattice.txt +0 -0
  323. {docpluck-2.4.96 → docpluck-2.4.98}/tests/snapshots/jama_lattice.txt +0 -0
  324. {docpluck-2.4.96 → docpluck-2.4.98}/tests/snapshots/nat_comms_figure_only.txt +0 -0
  325. {docpluck-2.4.96 → docpluck-2.4.98}/tests/snapshots/nature_minimal_rule.txt +0 -0
  326. {docpluck-2.4.96 → docpluck-2.4.98}/tests/snapshots/scirep_minimal_rule.txt +0 -0
  327. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_a3c_leading_zero_decimal_real_pdf.py +0 -0
  328. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_a4_ci_period_to_comma.py +0 -0
  329. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_affiliation_heading_promote_guard_real_pdf.py +0 -0
  330. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_all_caps_section_promote_real_pdf.py +0 -0
  331. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_bbox_utils.py +0 -0
  332. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_benchmark_docx_html.py +0 -0
  333. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_cambridge_footer_strip_real_pdf.py +0 -0
  334. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_camelot_lattice_augment.py +0 -0
  335. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_camelot_temp_cleanup.py +0 -0
  336. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_canary_provenance.py +0 -0
  337. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_caption_only_table_heading_real_pdf.py +0 -0
  338. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_caption_regex.py +0 -0
  339. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_chart_data_trim_real_pdf.py +0 -0
  340. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_cid_minus_recovery_real_pdf.py +0 -0
  341. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_cli_sections.py +0 -0
  342. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_cli_structured.py +0 -0
  343. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_confidence.py +0 -0
  344. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_corpus_smoke.py +0 -0
  345. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_d5_normalization_audit.py +0 -0
  346. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_dropped_minus_layout_recovery_real_pdf.py +0 -0
  347. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_edge_cases.py +0 -0
  348. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_elsevier_footer_strip_real_pdf.py +0 -0
  349. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_equation_page_header_strip_real_pdf.py +0 -0
  350. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_extract_columns.py +0 -0
  351. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_extract_docx.py +0 -0
  352. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_extract_filter_sugar.py +0 -0
  353. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_extract_html.py +0 -0
  354. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_extract_layout.py +0 -0
  355. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_extraction.py +0 -0
  356. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_f0_table_region_aware.py +0 -0
  357. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_fffd_comparison_recovery_real_pdf.py +0 -0
  358. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_figure_caption_trim_real_pdf.py +0 -0
  359. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_figure_detect.py +0 -0
  360. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_fixtures_manifest.py +0 -0
  361. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_hallucinated_heading_continuation_guard.py +0 -0
  362. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_harness_text_loss_reflow.py +0 -0
  363. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_harvard_refs_pagebreak_stitch.py +0 -0
  364. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_jama_open_cluster_real_pdf.py +0 -0
  365. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_lattice_cluster.py +0 -0
  366. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_letterspaced_label_real_pdf.py +0 -0
  367. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_ligature_decomposition_real_pdf.py +0 -0
  368. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_lt_operator_recovery_real_pdf.py +0 -0
  369. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_mathitalic_greek_real_pdf.py +0 -0
  370. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_metaesci_followups.py +0 -0
  371. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_minus_sign_recovery_real_pdf.py +0 -0
  372. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_normalization.py +0 -0
  373. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_normalize_a3_r2_body_integer_real_pdf.py +0 -0
  374. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_normalize_f0_footnote_strip.py +0 -0
  375. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_normalize_idempotent_real_pdf.py +0 -0
  376. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_normalize_layout_param.py +0 -0
  377. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_normalize_metadata_leak_real_pdf.py +0 -0
  378. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_normalize_report_layout_fields.py +0 -0
  379. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_normalize_soft_hyphen_dehyphenation.py +0 -0
  380. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_normalize_v18_strips.py +0 -0
  381. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_numbered_heading_promotion_real_pdf.py +0 -0
  382. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_numbered_section_promotion_real_pdf.py +0 -0
  383. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_o5_reference_inversion_real_pdf.py +0 -0
  384. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_orphan_multilevel_number_real_pdf.py +0 -0
  385. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_orphan_section_number_real_pdf.py +0 -0
  386. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_p0r_recurring_running_header_strip.py +0 -0
  387. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_preserve_math_glyphs_real_pdf.py +0 -0
  388. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_pretest_capture_tokens.py +0 -0
  389. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_pua_glyph_recovery_real_pdf.py +0 -0
  390. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_quality.py +0 -0
  391. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_r1_whitespace_cells_wiring_real_pdf.py +0 -0
  392. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_r4_column_correction_real_pdf.py +0 -0
  393. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_rc1_banded_column_real_pdf.py +0 -0
  394. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_rc1_general_column_correction_real_pdf.py +0 -0
  395. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_rc_t_degenerate_table_real_pdf.py +0 -0
  396. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_render.py +0 -0
  397. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_render_frontmatter_masthead.py +0 -0
  398. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_render_html.py +0 -0
  399. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_render_subsection_chain_promotion.py +0 -0
  400. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_request_09_reference_normalization.py +0 -0
  401. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_residual_2026_05_23_bundled.py +0 -0
  402. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_roman_numeral_section_promote_real_pdf.py +0 -0
  403. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_section_row_label_no_merge_real_pdf.py +0 -0
  404. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_sections_boundaries.py +0 -0
  405. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_sections_boundary_truncation.py +0 -0
  406. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_sections_core_partition.py +0 -0
  407. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_sections_docx_annotator.py +0 -0
  408. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_sections_extract_text.py +0 -0
  409. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_sections_footnote_section.py +0 -0
  410. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_sections_golden.py +0 -0
  411. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_sections_html_annotator.py +0 -0
  412. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_sections_pdf_annotator.py +0 -0
  413. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_sections_public_api.py +0 -0
  414. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_sections_real_corpus.py +0 -0
  415. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_sections_taxonomy.py +0 -0
  416. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_sections_text_annotator.py +0 -0
  417. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_sections_types.py +0 -0
  418. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_sections_unit_corpus.py +0 -0
  419. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_sections_v161_coalesce.py +0 -0
  420. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_sections_v161_subheadings.py +0 -0
  421. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_sections_v161_taxonomy.py +0 -0
  422. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_sections_v161_text_annotator.py +0 -0
  423. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_sections_version.py +0 -0
  424. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_single_column_subsection_promote_real_pdf.py +0 -0
  425. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_structured_result_type.py +0 -0
  426. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_structured_types.py +0 -0
  427. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_structured_version.py +0 -0
  428. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_table_caption_cell_region_real_pdf.py +0 -0
  429. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_table_detect.py +0 -0
  430. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_tables_cell_cleaning.py +0 -0
  431. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_text_mode.py +0 -0
  432. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_v23_1_fixes.py +0 -0
  433. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_v23_bug_fixes.py +0 -0
  434. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_v23_post_corpus.py +0 -0
  435. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_v23_post_corpus_v2.py +0 -0
  436. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_v2_backwards_compat.py +0 -0
  437. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_v2_top_level_exports.py +0 -0
  438. {docpluck-2.4.96 → docpluck-2.4.98}/tests/test_whitespace_cluster.py +0 -0
  439. {docpluck-2.4.96 → docpluck-2.4.98}/tools/canary_provenance.py +0 -0
  440. {docpluck-2.4.96 → docpluck-2.4.98}/tools/fix_python_env.ps1 +0 -0
  441. {docpluck-2.4.96 → docpluck-2.4.98}/tools/render_for_audit.py +0 -0
@@ -641,3 +641,15 @@ Rotation picks `pool[(N mod L) : (N mod L) + rotation_size]` wrapping. Over `cei
641
641
  **What (fix):** `api/cron/daily-digest` intermittently logged "Daily dispatch failed" (one-error admin digest email). Ground truth came from prod `system_logs.context.errorStack` (the logger persists `errorName`/`errorMessage`/`errorStack`; queried read-only via the Neon `DATABASE_URL` in `PDFextractor/frontend/.env.local`): two failed runs (06-17, 06-18) were transient **Neon serverless** errors — `terminating connection due to administrator command` (connection recycled on scale-to-zero) and a control-plane HTTP 500 tagged `"neon:retryable":true` — thrown at DIFFERENT queries, with adjacent days succeeding. NOT a logic bug: both unattended crons fire ~15 sequential neon-http calls at 03:00/09:00 UTC against likely-cold compute and the driver does no retries, so one blip aborts the whole run (and Vercel Cron does not re-run failed crons). Fix: new `frontend/src/lib/db-retry.ts` (`withDbRetry`/`isRetryableDbError` — 3 attempts, 250/500ms backoff, retries ONLY transient/connection-class signatures, never app errors) wrapping every DB call in `notifications/dispatch.ts` + `admin/blob-cleanup/route.ts` + the in-band `sendEmail` (safe to replay — `email_events` ON CONFLICT de-dupes). Regression test `frontend/src/lib/db-retry.test.ts` via `node --test` (Node 26 strips TS natively → zero new deps; first frontend unit test + `npm test` script; excluded from `next build` via tsconfig). Shipped commit `ea86e85`; post-deploy `?dryRun=1` smoke = HTTP 200 + clean DispatchResult.
642
642
 
643
643
  **How to detect next time:** (1) The admin digest aggregates the PREVIOUS 24h, so a "1 error" email naming `api/cron/daily-digest` is usually the prior day's transient blip, not a live outage — before theorizing, query `system_logs` for the real `errorName`/`errorStack` (it's `NeonDbError`, often `neon:retryable:true`). (2) ANY unattended Vercel cron doing a burst of neon-http queries needs `withDbRetry`: Neon scale-to-zero WILL recycle a connection mid-run and there is no driver-level retry. (3) `next build` here does NOT run eslint (Next 16 — see card `nextjs-16-build-does-not-run-eslint`), so a red `npm run lint` does not block deploy; keep the lint gate green separately for `/ship`.
644
+
645
+ ## 2026-06-25 — Branch can lag its own pushed tags; `check_app_pin_sync.py` PASS does NOT prove the branch is current
646
+
647
+ **Symptom.** Session opened "clean", `check_app_pin_sync.py` PASS — yet `main` (local AND `origin/main`) was at the v2.4.95 commit while tags `v2.4.96`/`v2.4.97` were already pushed 4 commits ahead, the app pin had auto-bumped to `@v2.4.97`, and the working tree `__version__` read 2.4.95. So **production ran v2.4.97 while the library mainline + working tree claimed v2.4.95.** A prior session tagged+pushed without fast-forwarding the branch, and the tree was checked out backwards.
648
+
649
+ **Why the gate didn't catch it.** `check_app_pin_sync.py` compares the app pin to the *latest tag* — both were v2.4.97, so PASS. It does not compare the *branch HEAD* to the latest tag. And its working-tree note had a direction bug: it told the BEHIND tree it was "ahead — tag+push v2.4.95" (would have tagged an OLDER version). Fixed the note (ordered `_vtuple()`, commit `59cff5b`); the branch-vs-tag check is still a manual step.
650
+
651
+ **Rule.** At session start (and before any release), run `git tag --sort=-v:refname | head` and `git rev-parse main origin/main` and compare to the latest `v*` tag. If `main` is an ANCESTOR of the latest tag → the branch lags its own release; fast-forward (`git merge --ff-only v<latest>`) before iterating, or you'll work on a stale base and risk re-doing / colliding with shipped work. A green `check_app_pin_sync.py` is necessary but NOT sufficient evidence the branch is current. Companion: card `release-version-collision-with-parallel-uncommitted-stream`.
652
+
653
+ ## 2026-06-25 — Word-multiset preservation is BLIND to reading-order regressions; AI-verify is mandatory before any reorder flip
654
+
655
+ The RC-1 banded column-correction flag passed a full-corpus word-preservation scan (26/26 baseline papers, 0 multiset violations) — looked safe to flip default-ON. The 8-canary AI-verify then found **3 ON_REGRESSION** the scan was structurally blind to: running-header furniture injected into prose, Abstract/Intro section order inverted, prose fragmented on a *single-column* paper (false-positive gutter). A pure reorder/furniture-injection preserves the word multiset exactly. **Never flip a reading-order-affecting default on word-preservation (or char-ratio / Jaccard) evidence alone — those gates cannot see "right words, wrong order / wrong place." AI-verify against the gold is required, and the bar is zero ON_REGRESSION across the full canary set.** Re-validates the project ground-truth hard rule.
@@ -1327,3 +1327,44 @@ aren't skipped.
1327
1327
  ### Verification Gaps / Deferred (queued, not dropped)
1328
1328
  - **RC-T Layer-1 recovery** — actually RECOVERING lost table data via tight `table_areas` (plos_med T5's 13 SAE rows; chan_feldman T2 column-squish) — out of Option-A scope, deferred by the user.
1329
1329
  - **Audit of the ~37 `_strip_phantom` th-stripped tables** — some title-shaped `<th>` strips may be wrongly-stripped REAL tables (pre-existing, predates this cycle). Needs its own verification cycle (render each, judge real-vs-phantom). Surfaced explicitly; NOT silently shipped around.
1330
+
1331
+ ---
1332
+
1333
+ ## Run: 2026-06-25 · cycle 1 (investigation + verification) · @ v2.4.97 · verdict PARTIAL
1334
+
1335
+ **Goal:** user invoked `/docpluck-iterate`; established stop-condition implicitly via the two scope decisions below. Targeted RC-T table-bbox (then, on evidence, the RC-1 banded flip).
1336
+
1337
+ ### What landed (clean, shipped, verified)
1338
+ 1. **Branch reconciliation (the big one).** At session start `main` (local AND `origin/main`) was at the v2.4.95-era commit `2dbdd98`, but tags `v2.4.96`/`v2.4.97` were already pushed 4 commits AHEAD (real, test-backed RC-T + DP-2/DP-5 work) and the app pin had auto-bumped to `@v2.4.97` → **production ran v2.4.97 while the library mainline claimed v2.4.95.** Fast-forwarded `main` (local + origin) to `813aa4c` (v2.4.97); now mainline == latest tag == app pin. **Lesson: at session start, ALWAYS check `git tag` vs `git rev-parse main origin/main` — a prior session can push tags without advancing the branch, and the working tree can be checked out backwards. The `check_app_pin_sync.py` PASS hid it (pin==tag is true even when the *branch* lags the tag).**
1339
+ 2. **`check_app_pin_sync.py` direction bug + test** (commit `59cff5b`). `compare()` said "working-tree X is ahead of tag Y — tag+push X" for ANY `X!=Y`; a tree BEHIND its tag (2.4.95 < 2.4.97, exactly this session) got told to tag an OLDER version. Fixed with ordered `_vtuple()`; `tests/test_check_app_pin_sync.py` (10 cases, fails-before/passes-after).
1340
+
1341
+ ### What was investigated and correctly NOT shipped
1342
+ 3. **RC-T table-data recovery = the RC-1 banded layout-channel work (architectural).** ip_feldman T10 (and 3 other canary orphans) lose real table data; gold confirms a 7-row table, render drops all 7. Probed FOUR recovery angles (Camelot free-form, whitespace_cells full region, banded text-channel flag, band-clipped whitespace_cells) — each bottoms out in: full-page Camelot bbox / region-overshoot-into-prose / RC-1 interleave INSIDE the table region (table rows + Discussion prose share y-positions) / tight-kerned glued stat cells that defeat x-gap column detection (`pdfplumber_extract_words_unreliable`). The orphan `### Table N` heading is the DELIBERATE v2.4.55 clean-fail (keeps caption for `table_parity`), NOT the bug. **Lesson: reproduce + probe before trusting a "small fix" hypothesis — my B2-is-easy guess was disproven in ~15 min of probing, saving a mis-scoped cycle.** (card `reproduce-triage-defect-at-head`)
1343
+ 4. **RC-1 banded default-flip ATTEMPTED, REJECTED by 8-canary AI-verify.** Word-preservation corpus scan = 26/26 PRESERVED, 0 violations (looked safe!). But the 8-canary AI-verify (8 parallel Sonnet, Claude Max) found **3 ON_REGRESSION** (deterministically confirmed): `ar_apa` flag-ON injects `M. Muraven / Journal` running-header furniture ×2 + inverts Abstract/Intro order; `chan_feldman` injects Power-Analysis prose into Extension; `maier` (SINGLE-COLUMN) fragments prose via a false-positive gutter. **THE LESSON (re-validates the project hard rule): the word-multiset gate is BLIND to reordering + furniture-injection — it passed 26/26 while the flag corrupted 3 canaries. AI-verify is necessary, not optional, before any reading-order default flip.** Flag stays default-OFF. Precise gap-5/6/7 blockers + a stricter zero-regression re-verify gate written to the RC-1 spec.
1344
+
1345
+ ### Verification-infra defect re-confirmed (surfaced, not fixed — needs portfolio decision)
1346
+ 5. **The pre-push canary-audit.sh hook clobbered run-meta `phase_5d_runs` + `cycle_status` on BOTH pushes this session** — overwriting my honest FAIL verdicts with `AUDIT_DEFERRED_TO_AGENT → union PASS` placeholders (`cycle_status` flipped to PASS, a broken corpus reading green). Restored truthful verdicts after each push per memory `feedback_canary_audit_clobbers_phase5d`. **This bug actively manufactures false-green on a FAIL corpus; it bit twice in one session.** The fix (make the hook not write deferred-PASS, or write BLOCKED/AUDIT_DEFERRED instead of PASS) is a substrate change in `~/.claude/skills/_shared/iterate-loop/canary-audit.sh` — surfaced for a dedicated substrate cycle.
1347
+
1348
+ ### Open (queued, NOT dropped — rule 0e-bis)
1349
+ - 4 canary FAILs remain (ip_feldman T10, chan_feldman T6/T9, chandrashekar T2 table-data loss; plos_med sidebar interleave) → the RC-1 banded layout-channel + sidebar-interleave architectural cycle (spec `2026-06-08-rc1-region-aware-column-architecture.md`, sharpened this run).
1350
+ - Run verdict: **PARTIAL** — 2 clean fixes shipped; the canary corpus is NOT clean (honest punch-list above), so the standing corpus verdict is FAIL and the iterate-gate `--close` correctly cannot green-close.
1351
+
1352
+ ---
1353
+
1354
+ ## Run: 2026-06-25 (cont.) · RC-T char-level column recovery — FOUNDATION landed (v2.4.98), T10 wiring deferred
1355
+
1356
+ ### Outcome
1357
+ - **char_whitespace_cells (the char-level absolute-x-gap column detector) IMPLEMENTED + tested** in `docpluck/tables/whitespace.py`, wired as an automatic fallback inside `whitespace_cells` when the word path finds < 2 columns. This is the layout-channel piece that finally makes the "RC-T table-data recovery" tractable.
1358
+
1359
+ ### Blind spot closed (the key technical finding)
1360
+ - **Earlier this session I assessed T10 recovery as "architectural / multi-session, char detection too hard."** That was WRONG — disproven by probing char-level x-positions: ip_feldman Table 10's 3 stat columns are cleanly separable at the CHAR level (gaps 23-78pt) even though pdfplumber's WORD grouper glues the whole numeric row (`.29***−.21***.07`) into one token so the word-gap detector finds nothing. This is exactly the `pdfplumber_extract_words_unreliable` lesson ("always carry a char-level absolute-x-gap fallback") — I should have reached for it before declaring the wall.
1361
+ - **Right-aligned numeric tables need column-START-edge voting, not gap-MIDPOINT voting.** The label column is variable-width (`Loneliness` vs `Social orientation scale`), so gap midpoints scatter and never reach the stability threshold; but the DATA columns are left-aligned to fixed x, so their left edges are stable. `_find_stable_column_boundaries(bucket_pt>0)` votes on column starts + buckets within 8pt. With this, char_whitespace_cells recovers all 7 T10 data rows matching the gold exactly.
1362
+
1363
+ ### Edge case
1364
+ - **The word path MUST stay byte-identical when adding a char path.** First cut refactored `_find_stable_column_boundaries` to count only multi-column rows in the denominator — that silently changed the WORD path's stability threshold and broke 2 existing table tests (caught at ~65% of the broad run). Fix: branch the function — `bucket_pt<=0` runs the verbatim original word logic, `bucket_pt>0` runs the new char logic. Lesson: when extending a shared helper, keep the existing caller's path provably unchanged (literal copy), don't "improve" it in the same edit.
1365
+
1366
+ ### Deferred (queued, NOT dropped — rule 0e-bis)
1367
+ - **char_whitespace_cells alone does NOT close the T10 canary in production.** T10's caption is *matched* by the garbage full-page Camelot table, so it's added directly to `tables` and the whitespace fallback (where the char path lives) is never reached for it. Closing T10 needs the **degenerate-Camelot-table replacement** in `extract_structured.py`: when a Camelot-matched table has a full-page-ish bbox + furniture/prose cells, discard it and re-extract via `whitespace_cells` on the gutter-band-clipped caption region, PLUS a region prose-trim (the `(extension)` wrapped-caption + Discussion-prose tail still contaminate the region). That change touches the core extraction path for ALL papers → requires the full ~48-paper guard-live-vs-bypassed diff + 7-canary AI-verify before it ships (cycle-3 caption-follows revert is the precedent). Foundation is in place; this is the next focused cycle.
1368
+
1369
+ ### Verification
1370
+ - `tests/test_whitespace_char_fallback.py` (5 cases incl. real-PDF ip_feldman T10 = all 7 gold rows recovered; synthetic tight-kerned recovery; word-space reinsertion; delegation wiring; single-column-no-fabrication guard). 15/15 pass on the whitespace/char/caption-table suite; 0 failures across the broad table/render run; word path proven byte-identical (restored verbatim).
@@ -1,5 +1,46 @@
1
1
  # Changelog
2
2
 
3
+ ## [2.4.98] — 2026-06-25
4
+
5
+ **ESCIcheck handoff defects (DOCPLUCK_HANDOFF_2026-06-25): η²p effect-typing, inline correlation r-typing, and bracketed-CI continuation merge — flatten/cell-cleaning only, AI-gold-verified.** `TABLE_EXTRACTION_VERSION` → `2.4.4`. Render-visible in the inline flattened-table blocks + `.tables.jsonl` sidecar `fields`. **No capture-path change** — caption→table pairing is byte-identical to `TABLE_EXTRACTION_VERSION` `2.4.3`. Grounded against the AI `reading`/`stats` golds (article-finder), not pdftotext, and confirmed by a 6-paper AI-gold canary verify. Triage: `docs/TRIAGE_2026-06-25_escicheck_handoff_defects.md`.
6
+
7
+ - **DP-3 (table side) — type a font-dropped partial-η² effect column.** Many APA ANOVA tables report effect size as `η²p`, but the glyph lives in a font with no ToUnicode mapping (`pdffonts`: `uni:no`), so pdftotext AND pdfplumber both decode it as a space — the effect-size column header is blank and no "eta"/"η" token survives anywhere, so `tables.flatten._effect_key` fell back to the generic `est` key and the consumer got a nameless number. `flatten._infer_anova_eta2_hint` now types an unlabeled estimate column as `eta2` when the table is structurally an F-test/ANOVA results table (carries an `F` column + a `BF01` and/or CI column) and names NO competing effect (`d`/`dz`/`r`/`OR`/`g`); the value is range-guarded to η²'s domain `[0, 1]`. Keyed on the structural signature, never paper identity. `collabra.90203` Tables 8 & 9 now emit `eta2` matching the gold's `η²p` column (`.06, .00, .01, .04, …`). **The text-channel η²p (in body prose) is unrecoverable — OCR-tier won't-fix** (glyph identity absent from the PDF), as is DP-6 figure-inset text.
8
+
9
+ - **DP-5 (b) — type inline self-labeled correlation cells.** A cell that states its own statistic — `r = .67`, `r = -.73`, `d = 0.32` — is now typed by that token even when the column header is generic ("Effect size") and the column-role recovery produced no estimate for it (`flatten._inline_stat_field`). `cog_emo` (`10.1080/02699931.2024.2434156`) Table 8's intercorrelation rows now carry a typed `r` with their CI/p, matching the gold. The display value is the number only, so the sentence renders `r = 0.67` (not `r = r = .67`).
10
+
11
+ - **DP-5 (a) — merge a CI split across two rows by its bracket tail.** Camelot stream splits a wrapped CI cell — `[0.59,` on one row, `0.73]` on the next. `cell_cleaning._merge_continuation_rows` already folded the *parenthetical* tail (`8.34)`) but not the *bracketed* CI tail (`0.73]`); `_is_fragment_cell` now recognizes a close-bracket tail, so the bound rejoins (`[0.59, 0.73]`) and the bare-fragment junk row no longer survives (cog_emo Table 8: 14 → 10 rows, CIs complete).
12
+
13
+ **Deferred (prototyped + REVERTED this cycle — surfaced, not dropped): DP-1 / DP-2 capture recovery.** A caption page-attribution fix (`find_caption_matches` advancing `char_start` past the leading `\f` its `^\s*` ate) does correct the off-by-one that mis-pages a table starting a page, and in isolation it unblocked `collabra.77859` Table 4's replication `t/df/d` (DP-1). But the **mandatory AI-gold canary verify showed it is net-harmful corpus-wide**: populating the previously-empty `line_text` re-scores `_find_caption_for_table`'s same-page token overlap and surfaces low-quality whitespace tables, mis-pairing tables whose captions share a page (efendic T4/T5, cog_emo T8/T9 swapped) and only half-fixing plos_med (Table 4 still wrong, Table 5 degraded). Reverted to keep the corpus regression-free. It needs same-page-caption disambiguation + whitespace-region quality gating, queued as its own gated cycle (full 48-paper guard-diff + canary AI-verify). DP-2's Table 1 (an 8-column grouped table) is beyond the lineless fallback regardless. See the triage doc.
14
+
15
+ **Also fixed (pre-existing, leave-nothing-behind):** `test_table_html_renders_when_structured` and `test_table_kinds_are_valid` asserted a non-structured table must be `kind == "isolated"`, but the layout-channel `whitespace` fallback (a valid kind since the v2.4.3 foundation) can surface — including when Camelot returns "no tables" under cumulative test-suite load (memory `feedback_camelot_flake_cumulative_load`). The tests now accept `whitespace` (asserting cells + HTML present, no Camelot confidence), so a Camelot load-flake on the jama/ieee/chen fixtures no longer mis-fails them.
16
+
17
+ Verification: shipped flatten/cell-cleaning changes covered by `tests/test_tables_flatten.py` (TestInlineSelfLabeledStat ×3, TestBracketTailCIContinuation) + `tests/test_tables_flatten_blank_header_recovery.py::TestT90203Table8BlankHeaders` (eta2 typed, est gone). Full library suite green under `pytest -n10` (real-PDF Camelot tests flake only under serial cumulative load — each passes per-file / under xdist). 6-paper AI-gold canary verify confirms DP-3 (collabra.90203 η²p) and DP-5 (cog_emo r + CI) are correct; it is what caught the DP-1/DP-2 page-fix regression and drove its revert.
18
+
19
+ ---
20
+
21
+ **RC-T foundation (earlier same-day commit): char-level column-detection fallback for tight-kerned tables.** `TABLE_EXTRACTION_VERSION` → `2.4.3` (now → `2.4.4` per above). Library-internal capability addition — a foundation for the still-open RC-T Layer-1 table-data recovery.
22
+
23
+ - **`docpluck/tables/whitespace.py` gains `char_whitespace_cells`** — a char-level absolute-x-gap column detector, wired as an automatic fallback inside `whitespace_cells` that fires ONLY when the word-gap detector finds < 2 columns. On tight-kerned PDFs pdfplumber's word grouper glues a whole numeric row into one token (e.g. ip_feldman Table 10's `.29***−.21***.07`), so the word path finds no column gaps; the chars are still cleanly separated by large inter-column gaps. The fallback recovers the grid from char x-gaps, **voting on column-START edges** (in a right-aligned numeric table the label column is variable-width so gap midpoints scatter, but the data columns are left-aligned to fixed x), with an 8pt boundary bucket, and reinserts intra-cell word spacing from glyph geometry.
24
+ - **Strictly additive — currently-correct tables are byte-unchanged.** The word path (`_find_stable_column_boundaries(bucket_pt=0)`) is restored verbatim to its pre-change behavior; only a region that previously yielded NO whitespace grid (`whitespace_cells` returned `[]`) can now gain one, and only when char-columns clear a 60%-row stability bar. In isolation this recovers all 7 of ip_feldman Table 10's regression-coefficient rows matching the AI gold.
25
+
26
+ **Not yet closed (next gated cycle):** the char fallback alone does NOT fix Table 10 in the live pipeline — T10's caption is *matched* by a degenerate full-page Camelot table, which is used directly, so the whitespace fallback (where the char path lives) is never reached. Closing it needs an `extract_structured.py` change — detect a degenerate matched Camelot table (full-page-ish bbox + furniture/prose cells), discard it, and re-extract via `whitespace_cells` on the gutter-band-clipped caption region, plus a region prose-trim — which touches the core extraction path for all papers and so requires the full ~48-paper guard-diff + 7-canary AI-verify before it ships. See `LEARNINGS.md` 2026-06-25 and `docs/superpowers/specs/2026-06-21-rc-t-table-region-prose-contamination.md`.
27
+
28
+ Verification: `tests/test_whitespace_char_fallback.py` (5 cases incl. real-PDF ip_feldman Table 10 = all 7 gold rows recovered, synthetic tight-kerned recovery, intra-cell word-space reinsertion, delegation wiring, single-column-no-fabrication guard); 15/15 pass on the whitespace/char/caption-table suite across two independent runs; 0 failures across the broad table/render suite; word path proven byte-identical.
29
+
30
+ ## [2.4.97] — 2026-06-22
31
+
32
+ **Three table fixes shipped together (combined from two concurrent sessions): type the skipped p+df columns (DP-2), stop dropping / mis-binding two-header-row tables (DP-5), and stop the table raw_text fallback swallowing body prose (RC-T Layer-2).** `TABLE_EXTRACTION_VERSION` → `2.4.2`; no `NORMALIZATION_VERSION` / `SECTIONING_VERSION` change. DP-2/DP-5 are render-visible in the inline flattened-table blocks + the `.tables.jsonl` sidecar `fields` (the `<table>` HTML gains the previously-dropped data rows); RC-T Layer-2 is render-visible in the `unstructured-table` fallback blocks. DP-2/DP-5 filed in `ESCIcheckapp/docs/DOCPLUCK_HANDOFF_2026-06-21.md`; RC-T Layer-2 per `docs/superpowers/specs/2026-06-21-rc-t-table-region-prose-contamination.md`.
33
+
34
+ - **DP-2 — type the unlabeled p and df columns.** `tables.flatten._recover_blank_roles` recovered the leading test statistic and the `d [CI]` column of a header-stripped result table but left the bare p-value and df columns between them untyped, so `collabra.77859` Table 3 emitted `fields: {group, t, d, CI}` and dropped the `p` (`.551`) and `df` (`260.54`). A new Pass 4.5 types a still-blank column that is a bare `.XXX` with no comparison op as `p`, and a bare integer / Welch-decimal sitting between the test statistic and its `est/CI` column as `df` — keyed on data shape + position relative to the already-recovered roles, never bare position. The four Table-3 rows now carry `p` and `df`.
35
+
36
+ - **DP-5 — two-row-header parallel-arm tables: recover the first data row and align centered super-headers.** `collabra.90203` Table 10 delivered only 5 of its 6 correlation rows (the Identifiable/Explicit-learning row was silently dropped), and the Original/Replication arms of `xiao_2021` Table 4 were swapped. Three coupled root-cause fixes: (a) `cell_cleaning._is_header_like_row` now counts APA value shapes (leading-dot decimal, bracketed CI, operator-prefixed p, `N/A`) as data via `_DATA_VALUE_CELL_RE`, so a real first data row is no longer mis-read as a third header row (the bracket branch requires a digit and no letters inside, so a genuine `[95% CI]` header stays a header); (b) `tables.flatten._detect_column_groups` re-derives arm boundaries from equal-width blocks of the data region — each must contain exactly one super-label — so a *centered* super-label (camelot stream loses colspan and folds it mid-span) no longer swaps arm values or pushes a stat column into the label region; left-aligned super-headers stay byte-identical; (c) `tables.flatten._classify_column` reads a folded super-header cell's role from its sub-part so a folded `…<sep>95% CI` column is still typed `CI`. Table 10 now emits all 6 conditions split into Target-article / Replication arms with correct `r` / `n` / `CI` / `p`; xiao Table 4 arms are no longer swapped; incidentally recovers `chan_feldman` Table 8 arm labels and `jama_open_2` Table 3 HR estimates + CIs.
37
+
38
+ - **RC-T Layer-2 — stop the table raw_text fallback swallowing body prose.** When Camelot recovers no cells, `extract_structured._extract_table_body_text` linearises the text after a caption as the `unstructured-table` fallback; its per-line prose gate (`_line_is_body_prose`, len ≥ 80) misses prose that pdftotext WRAPPED into short (~48-char) lines, so a short table's caption-anchored region overshot the table end and swallowed Results/Discussion prose. Two FP-safe structural fixes: **(a) Note-anchor** — a table's `Note:` footnote is, by convention, its last element, so trim everything after the note paragraph (`chan_feldman` T1/T3 + `efendic_2022` T5 trailing Discussion prose removed; the stat rows + the note are kept); **(b) degenerate-prose guard** — suppress a fallback block that STARTS mid-sentence with a lowercase multi-letter word AND is majority sentence-shaped prose, so the renderer emits a clean caption-only table (`chan_feldman` T9 was an entire verbatim duplicate of `## Discussion` — now caption-only, no duplication). FP-safe by construction: real table cells start with a header / label / number / single-letter item marker, never a wrapped mid-sentence continuation — hypotheses ("a There is a positive association…"), descriptive rows ("Median age"), and instrument fragments are preserved. Keyed on the structural overshoot signature, never paper identity.
39
+
40
+ Verification: new real-PDF + contract regression tests (`tests/test_tables_superheader_alignment_real_pdf.py`) — collabra.90203 T10 six-conditions/correct-arms + xiao T4 not-swapped (each FAILS at HEAD, PASSES after), plus `_is_header_like_row` / `_detect_column_groups` contract cases; `tests/test_tables_flatten_blank_header_recovery.py` extended for DP-2. A full-corpus (101-PDF) cached-table flatten diff confirms no clean-table regression — every changed table is a recovered row, a correct arm split, a recovered field, or a removed stat-less spurious row; already-garbage tables shuffle without a clean table regressing. Broad pytest green (real-PDF Camelot tests run serially per file — non-deterministic under cumulative load). RC-T Layer-2 adds `tests/test_rc_t_layer2_raw_text_real_pdf.py` (6 contract + 4 real-PDF: chan T1 Note-anchor, T9 suppress-no-duplication, T3 preserved) and an independent full-corpus 101-PDF guard-live-vs-bypassed raw_text diff (`grew=0 changed=0`; 4 trims + 8 prose-suppressions only). A 7-canary Sonnet AI-gold verify confirms every table this release touched is correct (chan T1/T3/T9, maier T10 six-conditions, xiao T4 arms) with no new TEXT-LOSS / HALLUCINATION.
41
+
42
+ **Deferred (pre-existing, user decision 2026-06-22):** the remaining canary AI-verify FAILs are the architectural backlog, NOT regressions from this release — RC-T **Layer-1** table-data recovery (`table_areas`; e.g. plos_med Table 5's SAE rows, chan_feldman / chandrashekar under-extraction) and RC-1 two-column / sidebar column-interleave. Tracked in `docs/TRIAGE_2026-06-21_head_v2.4.95_assessment.md`; intentionally not addressed here.
43
+
3
44
  ## [2.4.96] — 2026-06-21
4
45
 
5
46
  **RC-T (Option A): strip Camelot "tables" that are absorbed body prose, not data.** Render-only — `render.py::_strip_phantom_camelot_tables`; no `TABLE_EXTRACTION_VERSION` / `NORMALIZATION_VERSION` / `SECTIONING_VERSION` change.
@@ -274,6 +274,74 @@ Cite: `docpluck/tables/camelot_extract.py::_augment_lattice_with_stream_rows` +
274
274
 
275
275
  ---
276
276
 
277
+ ## L-010 — A caption that starts a page is mis-paged because the caption regex's leading `^\s*` swallows the form-feed; and a font with no ToUnicode makes its glyph unrecoverable (recover the column ROLE, not the glyph)
278
+
279
+ ### Two findings, both surfaced 2026-06-25 by the ESCIcheck handoff
280
+
281
+ 1. **The `^\s*` caption regex eats the `\f` and mis-pages a page-starting table.**
282
+ `TABLE_CAPTION_RE` / `FIGURE_CAPTION_RE` begin `^\s*`. When a table starts a new
283
+ page, pdftotext emits `…results\n\fTable 4. …`, and `\s*` consumes the `\f`, so
284
+ `m.start()` lands *before* the form-feed. `_page_for_offset` then counts the
285
+ caption on the page BEFORE the break (off-by-one), and `_line_at` returns the
286
+ empty pre-`\f` segment (so `line_text == ''`). With the wrong page,
287
+ `_bbox_of_caption_line` can't find the caption in the layout channel → the whole
288
+ layout-region lookup returns None → the whitespace/char fallback never fires and
289
+ the table degrades to a caption-only stub. On `collabra.77859` this hit **all 5
290
+ tables**. The seemingly-obvious fix — advance `char_start` past the leading `\f`
291
+ to the actual "Table"/"Figure" token (`captions.find_caption_matches`) — DOES
292
+ correct the page AND unblock `collabra.77859` Table 4's replication stats (DP-1) in
293
+ isolation. **It was tried 2026-06-25 and REVERTED.** Populating the
294
+ previously-empty `line_text` re-scores `_find_caption_for_table`'s same-page
295
+ token-overlap and surfaces low-quality whitespace tables, so the mandatory AI-gold
296
+ canary verify caught it mis-pairing tables whose captions share a page (efendic
297
+ T4/T5, cog_emo T8/T9 swapped) and only half-fixing plos_med. **The lesson is the
298
+ process, not the patch:** a capture-path change that helps one paper in isolation
299
+ can silently mis-pair others — only a corpus-wide AI-gold verify (NOT the unit
300
+ suite, which stayed green) reveals it. The real fix needs same-page-caption
301
+ disambiguation in `_find_caption_for_table` + whitespace-region quality gating
302
+ FIRST; queued as its own gated cycle. Symptom to watch: `line_text == ''` on a
303
+ caption, or a caption whose `page` is one less than where the table visibly is.
304
+
305
+ 2. **A glyph with no ToUnicode mapping is gone from BOTH channels — recover its
306
+ ROLE, not the glyph.** `collabra.90203` reports `η²p`, but `pdffonts` shows the
307
+ symbol font as `uni: no` (no ToUnicode CMap), so pdftotext AND pdfplumber both
308
+ decode the glyph as U+0020 — the text reads `( = .000, …)` and the table's
309
+ effect-column header is blank. This is the same class as the residual deleted-minus
310
+ (memory `project_docpluck_rc_b7_done_w0h`): the *character identity* is absent from
311
+ the PDF, so text-channel recovery is **OCR-tier won't-fix**. But in a TABLE the
312
+ column's *role* is recoverable from structure — an F-test/ANOVA results table that
313
+ reports a Bayes factor + CI and names no competing effect reports η²p by APA
314
+ convention — so type the value `eta2` from the structural signature (range-guarded
315
+ to η²'s `[0,1]` domain) even though the glyph is unrecoverable
316
+ (`flatten._infer_anova_eta2_hint`). Don't chase the glyph; recover the meaning.
317
+
318
+ ### The rules
319
+
320
+ 1. **A caption-page / capture-path change MUST be AI-gold-verified across the corpus
321
+ before shipping — the unit suite will not catch a mis-pairing.** The page-fix kept
322
+ all 1852 unit tests green yet swapped table↔caption pairings on 3 papers. Per the
323
+ project ground-truth rule, render the canary set and compare TABLES against the AI
324
+ `reading` golds; revert if any paper regresses. (`^\s*`-anchored scans that skip
325
+ `\f` ARE the right idea for page attribution, but the downstream `line_text` /
326
+ same-page-caption scoring must be made robust in the same change, not after.)
327
+ 2. **Before "the symbol got stripped", run `pdffonts`.** If the glyph's font is
328
+ `uni:no`, nothing in the byte stream carries its identity — stop trying to recover
329
+ the glyph; recover the column's role/meaning from structure, or mark it OCR-tier.
330
+ (Shipped: `flatten._infer_anova_eta2_hint` types the value `eta2` from the F-test
331
+ table structure even though the η²p glyph is gone.)
332
+ 3. **A self-labeled cell beats its column header.** `r = .67` / `d = 0.32` states its
333
+ own type; type by the cell token even under a generic "Effect size" header
334
+ (`flatten._inline_stat_field`, shipped) — store only the numeric part so the
335
+ sentence assembler doesn't double the prefix (`r = r = .67`).
336
+
337
+ Cite (SHIPPED as `TABLE_EXTRACTION_VERSION` 2.4.4, package v2.4.98): `docpluck/tables/flatten.py::_infer_anova_eta2_hint` +
338
+ `_inline_stat_field`, `docpluck/tables/cell_cleaning.py::_is_fragment_cell` (bracket-CI
339
+ tail). REVERTED (queued): the `captions.find_caption_matches` char_start advance +
340
+ `whitespace._whitespace_grid_is_clean` / `_trim_trailing_prose_rows` gates.
341
+ See `docs/TRIAGE_2026-06-25_escicheck_handoff_defects.md`, CHANGELOG v2.4.98.
342
+
343
+ ---
344
+
277
345
  ## When to add a new lesson here
278
346
 
279
347
  Add a lesson when:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docpluck
3
- Version: 2.4.96
3
+ Version: 2.4.98
4
4
  Summary: PDF, DOCX, and HTML text extraction and normalization for academic papers
5
5
  Project-URL: Homepage, https://docpluck.app
6
6
  Project-URL: Documentation, https://docpluck.app/api-docs
@@ -78,7 +78,7 @@ from .figures import Figure
78
78
  from .extract_structured import TABLE_EXTRACTION_VERSION, StructuredResult, extract_pdf_structured
79
79
  from .render import render_pdf_to_markdown
80
80
 
81
- __version__ = "2.4.96"
81
+ __version__ = "2.4.98"
82
82
  __author__ = "Gilad Feldman"
83
83
  __license__ = "MIT"
84
84
 
@@ -37,7 +37,7 @@ from .tables.render import cells_to_html
37
37
  from .telemetry import record_fallback
38
38
 
39
39
 
40
- TABLE_EXTRACTION_VERSION = "2.4.0" # v2.4.0 (REQUEST_11): flatten now populates fields for NON-clinical result tables — (a) blank-header column-role recovery (tables.flatten._recover_blank_roles): assign a stat role to a header-stripped column from its data-token SHAPE (CI brackets, df1/df2 pair, estimate-adjacent-CI, p-with-operator) AND caption/footnote/all-header-rows vocabulary, never bare position; recovers collabra.77859 T5 (t/df/d/CI) + collabra.90203 T8/T9 (F/df/p/BF01/eta²p-as-est/CI). (b) packed parallel-arm split (tables.flatten._detect_packed_arms/_flatten_packed_arms): tables packing k≥2 arms into single cells ("Separate Joint" + space-joined values) emit one typed record per arm (group=arm) — collabra.77859 T3 Separate/Joint, xiao_2021 T7 Regret/Justifiability. (c) new BF01 role; validity guards drop r∉[-1,1] / non-monotone CI / non-int n / p∉[0,1]. (d) GENERAL L-004 fixes: _parse_number + _parse_ci_cell fold U+2212 MINUS (negative t/d/CI bounds in Camelot cells were dropped/sign-lost); _VALUE_GROUP_RE handles bracket-led CI groups. Default render + PROSECCO output byte-identical. # v2.3.0 (Tier-2, REQUEST_10): cross-flavor lattice-augmentation — recover data rows a lattice extraction vertically TRUNCATED by appending the rows a same-page, same-column-count stream table captured below the lattice bbox (camelot_extract._augment_lattice_with_stream_rows), gated on equal-col-count + bbox overlap + extends-below; PLUS numeric/parenthetical continuation merge (cell_cleaning._merge_continuation_rows) rejoining stream's stacked value/parenthetical cells. Fixes PROSECCO Table 2 R2-R6. v2.2.0: EC-T1 docpluck.tables.flatten — per-row FlattenedRow records (sentence + structured fields) for downstream stat-verification consumers (effectcheck/escimate/scimeto) + opt-in inline "rendered as text" block below each <table> via render_pdf_to_markdown(flatten_tables_inline=True). v2.1.5: cell-cleaning recovers CMEX10 extensible-bracket PUA glyphs (U+F8EE-F8FB). 
v2.1.4: cell-cleaning recovers Adobe-Symbol-font PUA glyphs (beta/chi/bullet as U+F0xx). v2.1.3: cell-cleaning recovers '<'-as-backslash glyph corruption. v2.1.2: cell-cleaning recovers descending-CI '2'-for-minus corruption. v2.1.1: cell-cleaning recovers (cid:0) corrupted minus signs + strips math-alphanumeric styling. v2.1.0: cell-cleaning pipeline ported from splice spike (multi-row header detection, continuation merging, leader-dot strip, mash-split, group separators, sig-marker attach)
40
+ TABLE_EXTRACTION_VERSION = "2.4.4" # v2.4.4 (ESCIcheck handoff 2026-06-25 — flatten-only, AI-gold-verified): (DP-3) flatten._infer_anova_eta2_hint types a font-dropped η²p effect column by STRUCTURE — an unlabeled estimate column in an F-test/ANOVA results table (F + BF01/CI, no competing d/r/OR) is keyed `eta2`, range-guarded to [0,1]; the η²p glyph itself is unrecoverable (NotoSerif uni:no, OCR-tier). (DP-5b) flatten._inline_stat_field types a self-labeled cell (`r = .67`, `d = 0.32`) by its token even under a generic "Effect size" header. (DP-5a) cell_cleaning._is_fragment_cell recognizes a bracketed-CI close tail (`0.73]`) so a CI split across rows rejoins (`[0.59, 0.73]`) and the junk fragment row is dropped (cog_emo T8 14→10 rows). These three are flatten/cell-cleaning ONLY — no capture-path change, so caption→table pairing is byte-identical to v2.4.3. (DP-1/DP-2 capture recovery — the caption page-attribution fix — was prototyped but REVERTED: AI-gold canary verify showed it mis-pairs same-page-caption tables (efendic T4/T5, cog_emo T8/T9) and only half-fixes plos_med; it is queued as its own gated cycle with same-page disambiguation + region-quality gating. See docs/TRIAGE_2026-06-25_escicheck_handoff_defects.md.) # v2.4.3 (RC-T foundation): tables.whitespace gains a CHAR-LEVEL column-detection fallback (char_whitespace_cells) — when pdfplumber's word grouper glues a tight-kerned numeric row into one token so the word-gap detector finds < 2 columns, recover the grid from char x-gaps, voting on column-START edges (right-aligned data columns are left-edge-stable even when the label column is variable-width) and reinserting intra-cell word spacing from geometry. Fires ONLY as a fallback when the word path returns < 2 columns, so currently-correct tables are byte-unchanged (word path restored verbatim). Recovers ip_feldman Table 10's 7 regression rows in isolation. 
NOT yet wired to replace a degenerate MATCHED Camelot table (that extract_structured change + region prose-trim is the next gated cycle — see LEARNINGS 2026-06-25). # v2.4.2 (RC-T Layer-2): _extract_table_body_text now (a) Note-anchor — a table's "Note:" footnote is its last element, so trim body prose bled past it (chan_feldman T1/T3, efendic_2022 T5); and (b) degenerate-prose guard — suppress a raw_text fallback that STARTS mid-sentence with a lowercase multi-letter word AND is majority sentence-shaped prose, so render emits a clean caption-only table instead of an unstructured-table dump duplicating Results/Discussion prose (chan_feldman T9 was a verbatim ## Discussion duplicate). FP-safe (real cells start with header/label/number/single-letter marker, never a wrapped continuation); full-corpus 101-PDF guard-diff only trims+suppresses (grew=0 changed=0). # v2.4.1 (DP-2/DP-5): (DP-2) blank-header role recovery now types the unlabeled p-value (a bare `.XXX` after the test stat, no comparison op) and df (a bare integer/Welch-decimal between the stat and the d[CI] column) columns it previously skipped — collabra.77859 T3 fields gain p+df (tables.flatten._recover_blank_roles Pass 4.5). 
(DP-5) parallel-arm tables with a TWO-ROW header no longer drop their first data row, and a CENTERED super-header is aligned to its arm block instead of its visual-center column: (a) cell_cleaning._is_header_like_row counts APA value shapes (leading-dot decimal, bracketed CI, operator-prefixed p, N/A) as data via _DATA_VALUE_CELL_RE so a real first data row isn't read as a 3rd header row (collabra.90203 T10 recovered the Identifiable/Explicit-learning correlation); (b) tables.flatten._detect_column_groups re-derives arm boundaries from equal-width blocks of the data region (each must hold one super-label) so a centered super-label folded mid-span no longer swaps arm values (xiao_2021 T4 Original/Replication F) or pushes a stat column into the label region; (c) tables.flatten._classify_column reads a folded super-header cell's role from its sub-part (collabra.90203 T10 CI). Full-corpus cached-table flatten diff: no clean-table regression. # v2.4.0 (REQUEST_11): flatten now populates fields for NON-clinical result tables — (a) blank-header column-role recovery (tables.flatten._recover_blank_roles): assign a stat role to a header-stripped column from its data-token SHAPE (CI brackets, df1/df2 pair, estimate-adjacent-CI, p-with-operator) AND caption/footnote/all-header-rows vocabulary, never bare position; recovers collabra.77859 T5 (t/df/d/CI) + collabra.90203 T8/T9 (F/df/p/BF01/eta²p-as-est/CI). (b) packed parallel-arm split (tables.flatten._detect_packed_arms/_flatten_packed_arms): tables packing k≥2 arms into single cells ("Separate Joint" + space-joined values) emit one typed record per arm (group=arm) — collabra.77859 T3 Separate/Joint, xiao_2021 T7 Regret/Justifiability. (c) new BF01 role; validity guards drop r∉[-1,1] / non-monotone CI / non-int n / p∉[0,1]. (d) GENERAL L-004 fixes: _parse_number + _parse_ci_cell fold U+2212 MINUS (negative t/d/CI bounds in Camelot cells were dropped/sign-lost); _VALUE_GROUP_RE handles bracket-led CI groups. 
Default render + PROSECCO output byte-identical. # v2.3.0 (Tier-2, REQUEST_10): cross-flavor lattice-augmentation — recover data rows a lattice extraction vertically TRUNCATED by appending the rows a same-page, same-column-count stream table captured below the lattice bbox (camelot_extract._augment_lattice_with_stream_rows), gated on equal-col-count + bbox overlap + extends-below; PLUS numeric/parenthetical continuation merge (cell_cleaning._merge_continuation_rows) rejoining stream's stacked value/parenthetical cells. Fixes PROSECCO Table 2 R2-R6. v2.2.0: EC-T1 docpluck.tables.flatten — per-row FlattenedRow records (sentence + structured fields) for downstream stat-verification consumers (effectcheck/escimate/scimeto) + opt-in inline "rendered as text" block below each <table> via render_pdf_to_markdown(flatten_tables_inline=True). v2.1.5: cell-cleaning recovers CMEX10 extensible-bracket PUA glyphs (U+F8EE-F8FB). v2.1.4: cell-cleaning recovers Adobe-Symbol-font PUA glyphs (beta/chi/bullet as U+F0xx). v2.1.3: cell-cleaning recovers '<'-as-backslash glyph corruption. v2.1.2: cell-cleaning recovers descending-CI '2'-for-minus corruption. v2.1.1: cell-cleaning recovers (cid:0) corrupted minus signs + strips math-alphanumeric styling. v2.1.0: cell-cleaning pipeline ported from splice spike (multi-row header detection, continuation merging, leader-dot strip, mash-split, group separators, sig-marker attach)
41
41
 
42
42
  TableTextMode = Literal["raw", "placeholder"]
43
43
 
@@ -1306,6 +1306,74 @@ def _line_is_body_prose(line: str) -> bool:
1306
1306
  return stopwords_hit >= 4
1307
1307
 
1308
1308
 
1309
+ def _join_wrapped_lines(lines: list[str]) -> list[str]:
1310
+ """Merge pdftotext-wrapped lines into logical paragraphs.
1311
+
1312
+ pdftotext linearizes a flowing prose paragraph into several short
1313
+ (~45-60 char) lines; the per-line ``_line_is_body_prose`` gate
1314
+ (len >= 80) cannot see prose in that wrapped form. Joining a line with
1315
+ the next whenever it does not end on sentence-terminal punctuation
1316
+ reconstructs the paragraph so prose can be measured at paragraph scale.
1317
+ """
1318
+ paras: list[str] = []
1319
+ cur = ""
1320
+ for ln in lines:
1321
+ s = ln.strip()
1322
+ if not s:
1323
+ continue
1324
+ cur = (cur + " " + s).strip() if cur else s
1325
+ if s.endswith((".", "!", "?", ":")):
1326
+ paras.append(cur)
1327
+ cur = ""
1328
+ if cur:
1329
+ paras.append(cur)
1330
+ return paras
1331
+
1332
+
1333
+ def _raw_text_is_degenerate_prose(text: str) -> bool:
1334
+ """True if a table raw_text fallback is dominated by flowing body prose.
1335
+
1336
+ RC-T Layer-2 (v2.4.97). When Camelot recovers no cells AND the
1337
+ caption-anchored region has no extractable table text near the caption,
1338
+ the body_start walk lands INSIDE a prose paragraph and the fallback
1339
+ swallows Results/Discussion prose (which is then duplicated under its
1340
+ real section heading). Such a block must be suppressed (render then
1341
+ emits a clean caption-only table) rather than dumped verbatim.
1342
+
1343
+ FP-safe by construction — fires only when BOTH hold:
1344
+ (a) the block STARTS mid-sentence: its first line begins with a
1345
+ lowercase multi-letter continuation word. A real table's
1346
+ linearized cells start with a column header, label, number, or a
1347
+ single-letter item marker (``a``/``b``/``c``) — never a wrapped
1348
+ mid-paragraph continuation like "than empathy. We provided ...".
1349
+ (b) the joined block is majority (>= 60% of chars) sentence-shaped
1350
+ body prose.
1351
+
1352
+ Legitimate degraded tables are preserved: hypotheses ("a There is a
1353
+ positive association ..."), descriptive rows ("Median age (years)"),
1354
+ instrument items ("h et al., 1997)") all fail (a). Keyed purely on the
1355
+ structural overshoot signature, never on paper identity.
1356
+ """
1357
+ lines = [ln for ln in text.split("\n") if ln.strip()]
1358
+ if len(lines) < 4:
1359
+ return False
1360
+ first_tokens = lines[0].split()
1361
+ first_word = first_tokens[0] if first_tokens else ""
1362
+ starts_midsentence = (
1363
+ len(first_word) >= 2
1364
+ and first_word[0].islower()
1365
+ and first_word[0].isalpha()
1366
+ )
1367
+ if not starts_midsentence:
1368
+ return False
1369
+ paragraphs = _join_wrapped_lines(lines)
1370
+ total = sum(len(p) for p in paragraphs)
1371
+ if total == 0:
1372
+ return False
1373
+ prose = sum(len(p) for p in paragraphs if _line_is_body_prose(p))
1374
+ return prose >= 0.6 * total
1375
+
1376
+
1309
1377
  def _extract_table_body_text(
1310
1378
  raw_text: str,
1311
1379
  cap: CaptionMatch,
@@ -1379,6 +1447,31 @@ def _extract_table_body_text(
1379
1447
  break
1380
1448
  kept.append(ln)
1381
1449
 
1450
+ # Note-anchor table-end (RC-T Layer-2, v2.4.97). A table's "Note:" /
1451
+ # "Notes:" footnote is, by academic-table convention, its LAST element.
1452
+ # Any text after the note paragraph is body prose that bled past the
1453
+ # table boundary — the caption-anchored region overshot the table end
1454
+ # and the per-line `_line_is_body_prose` gate (len >= 80) misses prose
1455
+ # that pdftotext WRAPPED into short (~48-char) lines, so it accumulates
1456
+ # here. Trim everything after the note's (possibly wrapped) paragraph.
1457
+ # This is FP-safe: legitimate table cells (hypotheses a/b/c, instrument
1458
+ # items) appear BEFORE the note; nothing legitimate follows it. Keyed on
1459
+ # the structural "Note: ... <sentence end>" signature, never paper
1460
+ # identity. `^Notes?[.:]` requires punctuation so body prose that merely
1461
+ # starts with the word "Note that ..." does not false-trigger.
1462
+ note_idx = next(
1463
+ (i for i, ln in enumerate(kept)
1464
+ if re.match(r"^\s*Notes?[.:]", ln.strip())),
1465
+ None,
1466
+ )
1467
+ if note_idx is not None and not os.environ.get("DOCPLUCK_RCT_L2_BYPASS"):
1468
+ note_end = note_idx
1469
+ for k in range(note_idx, len(kept)):
1470
+ note_end = k
1471
+ if kept[k].strip().endswith((".", "!", "?")):
1472
+ break
1473
+ kept = kept[: note_end + 1]
1474
+
1382
1475
  # Trim trailing heading-like short lines that don't belong to this table
1383
1476
  # (the start of the next section). Two patterns are trimmed:
1384
1477
  # * Title-Case headings without a sentence terminator
@@ -1414,7 +1507,17 @@ def _extract_table_body_text(
1414
1507
  s = re.sub(r"[ \t]+", " ", ln).strip()
1415
1508
  if s:
1416
1509
  cleaned_lines.append(s)
1417
- return "\n".join(cleaned_lines).strip()
1510
+ result = "\n".join(cleaned_lines).strip()
1511
+ # Degenerate-prose guard (RC-T Layer-2, v2.4.97): drop a raw_text
1512
+ # fallback that is really body prose the region overshot into, so the
1513
+ # renderer emits a clean caption-only table instead of an
1514
+ # ``unstructured-table`` dump that duplicates Results/Discussion prose.
1515
+ # ``DOCPLUCK_RCT_L2_BYPASS`` reverts both Layer-2 additions (Note-anchor
1516
+ # + this guard) to HEAD behavior — used only by the FP-scan harness to
1517
+ # diff guard-live vs guard-bypassed over the full corpus.
1518
+ if not os.environ.get("DOCPLUCK_RCT_L2_BYPASS") and _raw_text_is_degenerate_prose(result):
1519
+ return ""
1520
+ return result
1418
1521
 
1419
1522
 
1420
1523
  def _figure_from_caption(
@@ -212,6 +212,12 @@ def _merge_continuation_rows(rows: list[list[str]]) -> list[list[str]]:
212
212
  return True
213
213
  if ")" in s and "(" not in s: # close-paren tail: "8.34)", "10.28)"
214
214
  return True
215
+ # Close-bracket tail of a CI split across two rows: "[0.59," wraps and
216
+ # its upper bound "0.73]" lands on the next row (cog_emo Table 8 — the
217
+ # bracketed CI form, not the parenthetical one above). The parent cell
218
+ # ends with "," so the fragment joins after a space → "[0.59, 0.73]".
219
+ if "]" in s and "[" not in s: # close-bracket tail: "0.73]", "−0.66]"
220
+ return True
215
221
  return False
216
222
 
217
223
  def _is_fragment_continuation(row: list[str], parent: list[str]) -> bool:
@@ -393,13 +399,32 @@ _NUMERIC_CELL_RE = re.compile(
393
399
  r"^[-−–]?\d+(?:[.,]\d+)*(?:[%∗*]+)?(?:\s*\([^)]*\))?$"
394
400
  )
395
401
 
402
+ # A cell carrying a statistic VALUE (vs a header label). Broader than
403
+ # _NUMERIC_CELL_RE: also matches APA leading-dot decimals (".34"), operator-
404
+ # prefixed p-values ("< .001"), bracketed numeric intervals ("[0.53, 0.72]"),
405
+ # and the "N/A" filler — all DATA, not header text. The interval branch requires
406
+ # a digit and NO letters inside the brackets so a genuine header cell like
407
+ # "[95% CI]" (letters present) is NOT counted as data and stays a header. Used by
408
+ # `_is_header_like_row` so a real data row whose APA-formatted values the bare
409
+ # numeric pattern under-counted is not mistaken for an extra header row — the
410
+ # bug that silently dropped the FIRST data row of two-header-row correlation
411
+ # tables (collabra.90203 Table 10, DP-5).
412
+ _DATA_VALUE_CELL_RE = re.compile(
413
+ r"^(?:"
414
+ r"[<>=]?\s*[-−–]?\d*[.,]?\d+(?:[.,]\d+)*(?:[%∗*]+)?(?:\s*\([^)]*\))?"
415
+ r"|\[[^\]A-Za-z]*\d[^\]A-Za-z]*\]"
416
+ r"|n\s*/?\s*a"
417
+ r")$",
418
+ re.I,
419
+ )
420
+
396
421
 
397
422
  def _is_header_like_row(row: list[str]) -> bool:
398
423
  """Heuristic: a row that looks like part of a header rather than data."""
399
424
  nonempty = [c.strip() for c in row if (c or "").strip()]
400
425
  if not nonempty:
401
426
  return False
402
- numeric = sum(1 for c in nonempty if _NUMERIC_CELL_RE.match(c))
427
+ numeric = sum(1 for c in nonempty if _DATA_VALUE_CELL_RE.match(c))
403
428
  if numeric / len(nonempty) > 0.3:
404
429
  return False
405
430
  avg_len = sum(len(c) for c in nonempty) / len(nonempty)