docpluck 2.4.96__tar.gz → 2.4.97__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (436) hide show
  1. {docpluck-2.4.96 → docpluck-2.4.97}/CHANGELOG.md +14 -0
  2. {docpluck-2.4.96 → docpluck-2.4.97}/PKG-INFO +1 -1
  3. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/__init__.py +1 -1
  4. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/extract_structured.py +105 -2
  5. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/tables/cell_cleaning.py +20 -1
  6. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/tables/flatten.py +118 -2
  7. docpluck-2.4.97/docs/superpowers/handoffs/2026-06-22-dp2-dp5-flatten-fixes-commit.md +71 -0
  8. {docpluck-2.4.96 → docpluck-2.4.97}/pyproject.toml +1 -1
  9. docpluck-2.4.97/tests/test_rc_t_layer2_raw_text_real_pdf.py +163 -0
  10. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_tables_flatten_blank_header_recovery.py +31 -1
  11. docpluck-2.4.97/tests/test_tables_superheader_alignment_real_pdf.py +168 -0
  12. {docpluck-2.4.96 → docpluck-2.4.97}/.claude/skills/_project/canary.json +0 -0
  13. {docpluck-2.4.96 → docpluck-2.4.97}/.claude/skills/_project/lessons.md +0 -0
  14. {docpluck-2.4.96 → docpluck-2.4.97}/.claude/skills/docpluck-cleanup/SKILL.md +0 -0
  15. {docpluck-2.4.96 → docpluck-2.4.97}/.claude/skills/docpluck-deploy/SKILL.md +0 -0
  16. {docpluck-2.4.96 → docpluck-2.4.97}/.claude/skills/docpluck-iterate/LEARNINGS.md +0 -0
  17. {docpluck-2.4.96 → docpluck-2.4.97}/.claude/skills/docpluck-iterate/SKILL.md +0 -0
  18. {docpluck-2.4.96 → docpluck-2.4.97}/.claude/skills/docpluck-iterate/references/ai-full-doc-verify.md +0 -0
  19. {docpluck-2.4.96 → docpluck-2.4.97}/.claude/skills/docpluck-iterate/references/cycle-report-template.md +0 -0
  20. {docpluck-2.4.96 → docpluck-2.4.97}/.claude/skills/docpluck-iterate/references/local-verification.md +0 -0
  21. {docpluck-2.4.96 → docpluck-2.4.97}/.claude/skills/docpluck-iterate/references/rationalizations.md +0 -0
  22. {docpluck-2.4.96 → docpluck-2.4.97}/.claude/skills/docpluck-iterate/references/real-library-real-pdf.md +0 -0
  23. {docpluck-2.4.96 → docpluck-2.4.97}/.claude/skills/docpluck-iterate/references/release-flow.md +0 -0
  24. {docpluck-2.4.96 → docpluck-2.4.97}/.claude/skills/docpluck-iterate/references/self-improvement.md +0 -0
  25. {docpluck-2.4.96 → docpluck-2.4.97}/.claude/skills/docpluck-iterate/references/subagent-parallelization.md +0 -0
  26. {docpluck-2.4.96 → docpluck-2.4.97}/.claude/skills/docpluck-iterate/references/three-tier-parity.md +0 -0
  27. {docpluck-2.4.96 → docpluck-2.4.97}/.claude/skills/docpluck-qa/SKILL.md +0 -0
  28. {docpluck-2.4.96 → docpluck-2.4.97}/.claude/skills/docpluck-qa/references/benchmark-mode.md +0 -0
  29. {docpluck-2.4.96 → docpluck-2.4.97}/.claude/skills/docpluck-qa/references/check-11-hard-rules.md +0 -0
  30. {docpluck-2.4.96 → docpluck-2.4.97}/.claude/skills/docpluck-qa/references/check-13-escicheck-production.md +0 -0
  31. {docpluck-2.4.96 → docpluck-2.4.97}/.claude/skills/docpluck-qa/references/check-5-escicheck-library.md +0 -0
  32. {docpluck-2.4.96 → docpluck-2.4.97}/.claude/skills/docpluck-qa/references/check-6-escicheck-local-webapp.md +0 -0
  33. {docpluck-2.4.96 → docpluck-2.4.97}/.claude/skills/docpluck-qa/references/check-7-batch-smoke.md +0 -0
  34. {docpluck-2.4.96 → docpluck-2.4.97}/.claude/skills/docpluck-review/SKILL.md +0 -0
  35. {docpluck-2.4.96 → docpluck-2.4.97}/.github/workflows/bump-app-pin.yml +0 -0
  36. {docpluck-2.4.96 → docpluck-2.4.97}/.github/workflows/publish.yml +0 -0
  37. {docpluck-2.4.96 → docpluck-2.4.97}/.github/workflows/test.yml +0 -0
  38. {docpluck-2.4.96 → docpluck-2.4.97}/.gitignore +0 -0
  39. {docpluck-2.4.96 → docpluck-2.4.97}/CLAUDE.md +0 -0
  40. {docpluck-2.4.96 → docpluck-2.4.97}/CUSTOMER_UPDATE_2026-06-19_tables_sections_api.md +0 -0
  41. {docpluck-2.4.96 → docpluck-2.4.97}/HANDOFF_SECTIONS_APP_INTEGRATION.md +0 -0
  42. {docpluck-2.4.96 → docpluck-2.4.97}/LESSONS.md +0 -0
  43. {docpluck-2.4.96 → docpluck-2.4.97}/LICENSE +0 -0
  44. {docpluck-2.4.96 → docpluck-2.4.97}/README.md +0 -0
  45. {docpluck-2.4.96 → docpluck-2.4.97}/REPLY_FROM_DOCPLUCK_v1.4.5.md +0 -0
  46. {docpluck-2.4.96 → docpluck-2.4.97}/REPLY_FROM_DOCPLUCK_v1.5.0.md +0 -0
  47. {docpluck-2.4.96 → docpluck-2.4.97}/REPLY_FROM_DOCPLUCK_v2.4.93.md +0 -0
  48. {docpluck-2.4.96 → docpluck-2.4.97}/REPLY_FROM_DOCPLUCK_v2.4.94.md +0 -0
  49. {docpluck-2.4.96 → docpluck-2.4.97}/REPLY_FROM_DOCPLUCK_v2.4.95.md +0 -0
  50. {docpluck-2.4.96 → docpluck-2.4.97}/REQUEST_08_CHUNKING_ENDPOINT.md +0 -0
  51. {docpluck-2.4.96 → docpluck-2.4.97}/REQUEST_09_REFERENCE_LIST_NORMALIZATION.md +0 -0
  52. {docpluck-2.4.96 → docpluck-2.4.97}/REQUEST_10_TABLE_FLATTEN_HTTP_EXPOSURE.md +0 -0
  53. {docpluck-2.4.96 → docpluck-2.4.97}/REQUEST_10_TIER2_ORPHANED_LABEL_ROW_RECOVERY.md +0 -0
  54. {docpluck-2.4.96 → docpluck-2.4.97}/REQUEST_11_FLATTEN_FIELDS_NONCLINICAL_TABLES.md +0 -0
  55. {docpluck-2.4.96 → docpluck-2.4.97}/TODO.md +0 -0
  56. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/__main__.py +0 -0
  57. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/batch.py +0 -0
  58. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/cli.py +0 -0
  59. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/extract.py +0 -0
  60. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/extract_columns.py +0 -0
  61. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/extract_docx.py +0 -0
  62. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/extract_html.py +0 -0
  63. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/extract_layout.py +0 -0
  64. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/figures/__init__.py +0 -0
  65. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/figures/detect.py +0 -0
  66. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/normalize.py +0 -0
  67. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/quality.py +0 -0
  68. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/render.py +0 -0
  69. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/sections/__init__.py +0 -0
  70. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/sections/annotators/__init__.py +0 -0
  71. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/sections/annotators/docx.py +0 -0
  72. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/sections/annotators/html.py +0 -0
  73. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/sections/annotators/pdf.py +0 -0
  74. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/sections/annotators/text.py +0 -0
  75. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/sections/blocks.py +0 -0
  76. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/sections/boundaries.py +0 -0
  77. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/sections/core.py +0 -0
  78. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/sections/taxonomy.py +0 -0
  79. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/sections/types.py +0 -0
  80. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/tables/__init__.py +0 -0
  81. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/tables/bbox_utils.py +0 -0
  82. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/tables/camelot_extract.py +0 -0
  83. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/tables/captions.py +0 -0
  84. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/tables/cluster.py +0 -0
  85. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/tables/confidence.py +0 -0
  86. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/tables/detect.py +0 -0
  87. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/tables/render.py +0 -0
  88. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/tables/whitespace.py +0 -0
  89. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/telemetry.py +0 -0
  90. {docpluck-2.4.96 → docpluck-2.4.97}/docpluck/version.py +0 -0
  91. {docpluck-2.4.96 → docpluck-2.4.97}/docs/BENCHMARKS.md +0 -0
  92. {docpluck-2.4.96 → docpluck-2.4.97}/docs/BENCHMARKS_liteparse_2026-06.md +0 -0
  93. {docpluck-2.4.96 → docpluck-2.4.97}/docs/DESIGN.md +0 -0
  94. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-07_sections_strict_iteration.md +0 -0
  95. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-09_session_state_and_followups.md +0 -0
  96. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-09_unified_extraction_brainstorm.md +0 -0
  97. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-10_table_rendering_iteration.md +0 -0
  98. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-10_table_rendering_iteration_2.md +0 -0
  99. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-10_table_rendering_iteration_3.md +0 -0
  100. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-10_table_rendering_iteration_4.md +0 -0
  101. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-10_table_rendering_iteration_5.md +0 -0
  102. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-10_table_rendering_iteration_6.md +0 -0
  103. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-10_table_rendering_iteration_7.md +0 -0
  104. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-11_PROMOTE_SPIKE_TO_LIBRARY.md +0 -0
  105. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-11_table_rendering_iteration_8.md +0 -0
  106. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-11_visual_review_findings.md +0 -0
  107. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-12_phase2_101pdf_corpus.md +0 -0
  108. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-12_remaining_ui_and_chrome_verification.md +0 -0
  109. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-12_visual_verify_results.md +0 -0
  110. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-13_apa_50_expansion.md +0 -0
  111. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-13_apa_50_expansion_iter_1.md +0 -0
  112. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-13_apa_50_expansion_iter_2.md +0 -0
  113. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-13_iterate_skill_first_use.md +0 -0
  114. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-13_iterative_1.md +0 -0
  115. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-13_iterative_library_improvement.md +0 -0
  116. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-13_table_extraction_next_iteration.md +0 -0
  117. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-14_continue_iterations_v2_4_30_to_15n.md +0 -0
  118. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-14_full_corpus_iteration_v2_4_30.md +0 -0
  119. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-14_iterate_6_cycles_complete.md +0 -0
  120. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-14_iterate_9_cycle_run.md +0 -0
  121. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-14_iterate_resume_4_cycles.md +0 -0
  122. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-14_iterate_v2_4_31_cycle_15n.md +0 -0
  123. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-14_phase_5d_gold_audit_v2_4_29.md +0 -0
  124. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-15_autonomous_apa_first_10h.md +0 -0
  125. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-15_iterate_apa_run_1.md +0 -0
  126. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-16_ai-gold-instructions.md +0 -0
  127. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-16_iterate_apa_run_2.md +0 -0
  128. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-16_iterate_apa_run_3.md +0 -0
  129. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-16_iterate_run_4_final.md +0 -0
  130. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-16_iterate_run_4_fix_and_continue.md +0 -0
  131. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-16_iterate_run_5.md +0 -0
  132. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-16_iterate_run_6.md +0 -0
  133. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-17_iterate_run_7.md +0 -0
  134. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-17_iterate_run_8.md +0 -0
  135. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-17_iterate_run_9.md +0 -0
  136. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-18_iterate_run_9_cont.md +0 -0
  137. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-18_iterate_run_9_cont2.md +0 -0
  138. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-20_iterate_run_9_cont3.md +0 -0
  139. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-22_iterate_run_9_session4_final.md +0 -0
  140. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-22_iterate_run_9_session5_close.md +0 -0
  141. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-25_haiku-orchestration-pretest.md +0 -0
  142. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-05-25_pretest-followups.md +0 -0
  143. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-06-08_iterate_splice-wordintegrity-runningheader.md +0 -0
  144. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-06-08_untested_sweep_v2.4.81.md +0 -0
  145. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-06-13_sciencearena_grobid_liteparse.md +0 -0
  146. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-06-15_docpluck-iterate-resume.md +0 -0
  147. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-06-15_rc1-step2-continue.md +0 -0
  148. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-06-16_docpluck-iterate-resume.md +0 -0
  149. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-06-17_iterate_resume-cycle1.md +0 -0
  150. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-06-17_iterate_v2491_shipped.md +0 -0
  151. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-06-18_iterate_v2492_affiliation_caption-revert.md +0 -0
  152. {docpluck-2.4.96 → docpluck-2.4.97}/docs/HANDOFF_2026-06-20_request11_flatten_nonclinical_tables.md +0 -0
  153. {docpluck-2.4.96 → docpluck-2.4.97}/docs/ITERATION_VERIFICATION_LESSONS.md +0 -0
  154. {docpluck-2.4.96 → docpluck-2.4.97}/docs/LIBRARY_APP_SYNC.md +0 -0
  155. {docpluck-2.4.96 → docpluck-2.4.97}/docs/NORMALIZATION.md +0 -0
  156. {docpluck-2.4.96 → docpluck-2.4.97}/docs/README.md +0 -0
  157. {docpluck-2.4.96 → docpluck-2.4.97}/docs/TRIAGE_2026-05-10_corpus_assessment.md +0 -0
  158. {docpluck-2.4.96 → docpluck-2.4.97}/docs/TRIAGE_2026-05-14_phase_5d_gold_audit.md +0 -0
  159. {docpluck-2.4.96 → docpluck-2.4.97}/docs/TRIAGE_2026-06-08_untested_corpus_sweep.md +0 -0
  160. {docpluck-2.4.96 → docpluck-2.4.97}/docs/TRIAGE_2026-06-15_head_v2.4.88_assessment.md +0 -0
  161. {docpluck-2.4.96 → docpluck-2.4.97}/docs/TRIAGE_2026-06-21_head_v2.4.95_assessment.md +0 -0
  162. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/handoffs/2026-05-22-b1-next-iteration.md +0 -0
  163. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/handoffs/2026-05-22-b2-remaining-halluc-head.md +0 -0
  164. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/handoffs/2026-05-22-b3-b7-structural-defects.md +0 -0
  165. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/handoffs/2026-05-22-residual-after-locally-doable-pass.md +0 -0
  166. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/handoffs/2026-05-23-bundled-residual-cycle-CLOSED.md +0 -0
  167. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/handoffs/2026-05-23-residual-after-iterate-spine-cycles-1-3.md +0 -0
  168. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/handoffs/2026-05-25-canary-audit-architecture-and-cluster-A-B-C-landed.md +0 -0
  169. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/handoffs/2026-05-25-wrapup-r4-cycle.md +0 -0
  170. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/handoffs/2026-05-26-run-11-cluster-A-ter-and-C-bis-landed.md +0 -0
  171. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/handoffs/2026-05-26-text-extraction-defects-from-citationguard-audit.md +0 -0
  172. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/handoffs/2026-06-07-text-extraction-defects-from-citationguard-iterate.md +0 -0
  173. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/handoffs/2026-06-07-v2.4.78-landed-canary-iterate.md +0 -0
  174. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/handoffs/2026-06-07-v2.4.79-findings-1-2-cleared.md +0 -0
  175. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/handoffs/2026-06-20_docpluck-skill-file-edits-from-app-cron-fix.md +0 -0
  176. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/handoffs/2026-06-21-rc-t-table-region-implementation.md +0 -0
  177. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/2026-05-06-section-identification.md +0 -0
  178. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/2026-05-06-table-extraction.md +0 -0
  179. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/2026-05-07-sections-strict-iteration-progress.md +0 -0
  180. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/2026-05-08-unified-extraction-phase-0-splice-spike.md +0 -0
  181. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/2026-05-23-haiku-orchestration-pretest.md +0 -0
  182. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/sections-deferred-items.md +0 -0
  183. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/sections-issues-backlog.md +0 -0
  184. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/2026-05-07_spot-01_apa.md +0 -0
  185. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/2026-05-07_spot-02_pattern-A-shipped.md +0 -0
  186. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/2026-05-08_spot-final_all-styles.md +0 -0
  187. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/experiments/COMPARISON.md +0 -0
  188. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/korbmacher_table1.md +0 -0
  189. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/option-a.py +0 -0
  190. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/ziano_table1.md +0 -0
  191. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_notes_raw.txt +0 -0
  192. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_table1.md +0 -0
  193. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/notes.md +0 -0
  194. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/option-b.py +0 -0
  195. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_notes_raw.txt +0 -0
  196. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_table1.md +0 -0
  197. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/korbmacher_table1.md +0 -0
  198. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/notes.md +0 -0
  199. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/option-c.py +0 -0
  200. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/sample-pdftotext-bbox.html +0 -0
  201. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/ziano_table1.md +0 -0
  202. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/korbmacher_table1.md +0 -0
  203. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/notes.md +0 -0
  204. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/option-d.py +0 -0
  205. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/ziano_table1.md +0 -0
  206. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_2022_kruger_bbox.html +0 -0
  207. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_bbox.html +0 -0
  208. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_table1.md +0 -0
  209. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/option-e.py +0 -0
  210. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/sample-bbox.html +0 -0
  211. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_2021_joep_bbox.html +0 -0
  212. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_bbox.html +0 -0
  213. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_table1.md +0 -0
  214. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/html-fallback-demo.md +0 -0
  215. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.err +0 -0
  216. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.md +0 -0
  217. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.err +0 -0
  218. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.md +0 -0
  219. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.err +0 -0
  220. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.md +0 -0
  221. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.err +0 -0
  222. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.md +0 -0
  223. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.err +0 -0
  224. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.md +0 -0
  225. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.err +0 -0
  226. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.md +0 -0
  227. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.err +0 -0
  228. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.md +0 -0
  229. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.err +0 -0
  230. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.md +0 -0
  231. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.err +0 -0
  232. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.md +0 -0
  233. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.err +0 -0
  234. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.md +0 -0
  235. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.err +0 -0
  236. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.md +0 -0
  237. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.err +0 -0
  238. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.md +0 -0
  239. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.err +0 -0
  240. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.md +0 -0
  241. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.err +0 -0
  242. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.md +0 -0
  243. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.err +0 -0
  244. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.md +0 -0
  245. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.err +0 -0
  246. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.md +0 -0
  247. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.err +0 -0
  248. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.md +0 -0
  249. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.err +0 -0
  250. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.md +0 -0
  251. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.err +0 -0
  252. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.md +0 -0
  253. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.err +0 -0
  254. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.md +0 -0
  255. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.err +0 -0
  256. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.md +0 -0
  257. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.err +0 -0
  258. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.md +0 -0
  259. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.err +0 -0
  260. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.md +0 -0
  261. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.err +0 -0
  262. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.md +0 -0
  263. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.err +0 -0
  264. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.md +0 -0
  265. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.err +0 -0
  266. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.md +0 -0
  267. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/papers.md +0 -0
  268. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/report.md +0 -0
  269. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/splice_spike.py +0 -0
  270. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/plans/spot-checks/splice-spike/test_splice_spike.py +0 -0
  271. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/specs/2026-04-27-request-09-reference-normalization-design.md +0 -0
  272. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/specs/2026-05-06-section-identification-design.md +0 -0
  273. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/specs/2026-05-06-table-extraction-design.md +0 -0
  274. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/specs/2026-05-08-unified-extraction-design.md +0 -0
  275. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/specs/2026-05-23-haiku-orchestration-pretest-design.md +0 -0
  276. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/specs/2026-06-07-ip_feldman-B4-R4-column-interleave-diagnosis.md +0 -0
  277. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/specs/2026-06-08-rc1-region-aware-column-architecture.md +0 -0
  278. {docpluck-2.4.96 → docpluck-2.4.97}/docs/superpowers/specs/2026-06-21-rc-t-table-region-prose-contamination.md +0 -0
  279. {docpluck-2.4.96 → docpluck-2.4.97}/scripts/__init__.py +0 -0
  280. {docpluck-2.4.96 → docpluck-2.4.97}/scripts/check_app_pin_sync.py +0 -0
  281. {docpluck-2.4.96 → docpluck-2.4.97}/scripts/check_docs_consistency.py +0 -0
  282. {docpluck-2.4.96 → docpluck-2.4.97}/scripts/harness/README.md +0 -0
  283. {docpluck-2.4.96 → docpluck-2.4.97}/scripts/harness/VERIFIER_PROMPT.md +0 -0
  284. {docpluck-2.4.96 → docpluck-2.4.97}/scripts/harness/__init__.py +0 -0
  285. {docpluck-2.4.96 → docpluck-2.4.97}/scripts/harness/baseline_matrix.json +0 -0
  286. {docpluck-2.4.96 → docpluck-2.4.97}/scripts/harness/checks.py +0 -0
  287. {docpluck-2.4.96 → docpluck-2.4.97}/scripts/harness/corpus.py +0 -0
  288. {docpluck-2.4.96 → docpluck-2.4.97}/scripts/harness/corpus_manifest.json +0 -0
  289. {docpluck-2.4.96 → docpluck-2.4.97}/scripts/harness/extract.py +0 -0
  290. {docpluck-2.4.96 → docpluck-2.4.97}/scripts/harness/gold_keys.json +0 -0
  291. {docpluck-2.4.96 → docpluck-2.4.97}/scripts/harness/inspect.py +0 -0
  292. {docpluck-2.4.96 → docpluck-2.4.97}/scripts/lint_rendered_corpus.py +0 -0
  293. {docpluck-2.4.96 → docpluck-2.4.97}/scripts/pretest_capture_tokens.py +0 -0
  294. {docpluck-2.4.96 → docpluck-2.4.97}/scripts/verify_corpus.py +0 -0
  295. {docpluck-2.4.96 → docpluck-2.4.97}/scripts/verify_corpus_full.py +0 -0
  296. {docpluck-2.4.96 → docpluck-2.4.97}/tests/__init__.py +0 -0
  297. {docpluck-2.4.96 → docpluck-2.4.97}/tests/conftest.py +0 -0
  298. {docpluck-2.4.96 → docpluck-2.4.97}/tests/fixtures/__init__.py +0 -0
  299. {docpluck-2.4.96 → docpluck-2.4.97}/tests/fixtures/sections/__init__.py +0 -0
  300. {docpluck-2.4.96 → docpluck-2.4.97}/tests/fixtures/sections/builders.py +0 -0
  301. {docpluck-2.4.96 → docpluck-2.4.97}/tests/fixtures/structured/.gitkeep +0 -0
  302. {docpluck-2.4.96 → docpluck-2.4.97}/tests/fixtures/structured/MANIFEST.json +0 -0
  303. {docpluck-2.4.96 → docpluck-2.4.97}/tests/fixtures/structured/README.md +0 -0
  304. {docpluck-2.4.96 → docpluck-2.4.97}/tests/golden/sections/apa_multi_study_pdf.json +0 -0
  305. {docpluck-2.4.96 → docpluck-2.4.97}/tests/golden/sections/apa_single_study_pdf.json +0 -0
  306. {docpluck-2.4.96 → docpluck-2.4.97}/tests/golden/sections/html_real_headings.json +0 -0
  307. {docpluck-2.4.96 → docpluck-2.4.97}/tests/snapshots/amj_lattice.txt +0 -0
  308. {docpluck-2.4.96 → docpluck-2.4.97}/tests/snapshots/apa_chan_feldman_lineless.txt +0 -0
  309. {docpluck-2.4.96 → docpluck-2.4.97}/tests/snapshots/apa_chen_jesp_lineless.txt +0 -0
  310. {docpluck-2.4.96 → docpluck-2.4.97}/tests/snapshots/apa_efendic_affect.txt +0 -0
  311. {docpluck-2.4.96 → docpluck-2.4.97}/tests/snapshots/apa_ip_feldman_pspb.txt +0 -0
  312. {docpluck-2.4.96 → docpluck-2.4.97}/tests/snapshots/bmc_lattice.txt +0 -0
  313. {docpluck-2.4.96 → docpluck-2.4.97}/tests/snapshots/ieee_figure_heavy.txt +0 -0
  314. {docpluck-2.4.96 → docpluck-2.4.97}/tests/snapshots/ieee_lattice.txt +0 -0
  315. {docpluck-2.4.96 → docpluck-2.4.97}/tests/snapshots/jama_lattice.txt +0 -0
  316. {docpluck-2.4.96 → docpluck-2.4.97}/tests/snapshots/nat_comms_figure_only.txt +0 -0
  317. {docpluck-2.4.96 → docpluck-2.4.97}/tests/snapshots/nature_minimal_rule.txt +0 -0
  318. {docpluck-2.4.96 → docpluck-2.4.97}/tests/snapshots/scirep_minimal_rule.txt +0 -0
  319. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_a3c_leading_zero_decimal_real_pdf.py +0 -0
  320. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_a4_ci_period_to_comma.py +0 -0
  321. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_affiliation_heading_promote_guard_real_pdf.py +0 -0
  322. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_all_caps_section_promote_real_pdf.py +0 -0
  323. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_bbox_utils.py +0 -0
  324. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_benchmark_docx_html.py +0 -0
  325. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_cambridge_footer_strip_real_pdf.py +0 -0
  326. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_camelot_lattice_augment.py +0 -0
  327. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_camelot_temp_cleanup.py +0 -0
  328. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_canary_provenance.py +0 -0
  329. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_caption_only_table_heading_real_pdf.py +0 -0
  330. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_caption_regex.py +0 -0
  331. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_chart_data_trim_real_pdf.py +0 -0
  332. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_cid_minus_recovery_real_pdf.py +0 -0
  333. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_cli_sections.py +0 -0
  334. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_cli_structured.py +0 -0
  335. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_confidence.py +0 -0
  336. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_corpus_smoke.py +0 -0
  337. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_d5_normalization_audit.py +0 -0
  338. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_dropped_minus_layout_recovery_real_pdf.py +0 -0
  339. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_edge_cases.py +0 -0
  340. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_elsevier_footer_strip_real_pdf.py +0 -0
  341. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_equation_page_header_strip_real_pdf.py +0 -0
  342. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_extract_columns.py +0 -0
  343. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_extract_docx.py +0 -0
  344. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_extract_filter_sugar.py +0 -0
  345. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_extract_html.py +0 -0
  346. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_extract_layout.py +0 -0
  347. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_extract_pdf_structured.py +0 -0
  348. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_extraction.py +0 -0
  349. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_f0_table_region_aware.py +0 -0
  350. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_fffd_comparison_recovery_real_pdf.py +0 -0
  351. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_figure_caption_trim_real_pdf.py +0 -0
  352. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_figure_detect.py +0 -0
  353. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_fixtures_manifest.py +0 -0
  354. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_hallucinated_heading_continuation_guard.py +0 -0
  355. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_harness_text_loss_reflow.py +0 -0
  356. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_harvard_refs_pagebreak_stitch.py +0 -0
  357. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_jama_open_cluster_real_pdf.py +0 -0
  358. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_lattice_cluster.py +0 -0
  359. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_letterspaced_label_real_pdf.py +0 -0
  360. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_ligature_decomposition_real_pdf.py +0 -0
  361. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_lt_operator_recovery_real_pdf.py +0 -0
  362. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_mathitalic_greek_real_pdf.py +0 -0
  363. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_metaesci_followups.py +0 -0
  364. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_minus_sign_recovery_real_pdf.py +0 -0
  365. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_normalization.py +0 -0
  366. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_normalize_a3_r2_body_integer_real_pdf.py +0 -0
  367. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_normalize_f0_footnote_strip.py +0 -0
  368. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_normalize_idempotent_real_pdf.py +0 -0
  369. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_normalize_layout_param.py +0 -0
  370. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_normalize_metadata_leak_real_pdf.py +0 -0
  371. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_normalize_report_layout_fields.py +0 -0
  372. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_normalize_soft_hyphen_dehyphenation.py +0 -0
  373. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_normalize_v18_strips.py +0 -0
  374. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_numbered_heading_promotion_real_pdf.py +0 -0
  375. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_numbered_section_promotion_real_pdf.py +0 -0
  376. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_o5_reference_inversion_real_pdf.py +0 -0
  377. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_orphan_multilevel_number_real_pdf.py +0 -0
  378. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_orphan_section_number_real_pdf.py +0 -0
  379. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_p0r_recurring_running_header_strip.py +0 -0
  380. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_preserve_math_glyphs_real_pdf.py +0 -0
  381. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_pretest_capture_tokens.py +0 -0
  382. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_pua_glyph_recovery_real_pdf.py +0 -0
  383. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_quality.py +0 -0
  384. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_r1_whitespace_cells_wiring_real_pdf.py +0 -0
  385. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_r4_column_correction_real_pdf.py +0 -0
  386. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_rc1_banded_column_real_pdf.py +0 -0
  387. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_rc1_general_column_correction_real_pdf.py +0 -0
  388. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_rc_t_degenerate_table_real_pdf.py +0 -0
  389. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_render.py +0 -0
  390. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_render_frontmatter_masthead.py +0 -0
  391. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_render_html.py +0 -0
  392. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_render_subsection_chain_promotion.py +0 -0
  393. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_request_09_reference_normalization.py +0 -0
  394. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_residual_2026_05_23_bundled.py +0 -0
  395. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_roman_numeral_section_promote_real_pdf.py +0 -0
  396. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_section_row_label_no_merge_real_pdf.py +0 -0
  397. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_sections_boundaries.py +0 -0
  398. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_sections_boundary_truncation.py +0 -0
  399. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_sections_core_partition.py +0 -0
  400. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_sections_docx_annotator.py +0 -0
  401. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_sections_extract_text.py +0 -0
  402. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_sections_footnote_section.py +0 -0
  403. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_sections_golden.py +0 -0
  404. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_sections_html_annotator.py +0 -0
  405. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_sections_pdf_annotator.py +0 -0
  406. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_sections_public_api.py +0 -0
  407. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_sections_real_corpus.py +0 -0
  408. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_sections_taxonomy.py +0 -0
  409. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_sections_text_annotator.py +0 -0
  410. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_sections_types.py +0 -0
  411. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_sections_unit_corpus.py +0 -0
  412. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_sections_v161_coalesce.py +0 -0
  413. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_sections_v161_subheadings.py +0 -0
  414. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_sections_v161_taxonomy.py +0 -0
  415. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_sections_v161_text_annotator.py +0 -0
  416. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_sections_version.py +0 -0
  417. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_single_column_subsection_promote_real_pdf.py +0 -0
  418. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_smoke_fixtures.py +0 -0
  419. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_structured_result_type.py +0 -0
  420. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_structured_types.py +0 -0
  421. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_structured_version.py +0 -0
  422. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_table_caption_cell_region_real_pdf.py +0 -0
  423. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_table_detect.py +0 -0
  424. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_tables_cell_cleaning.py +0 -0
  425. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_tables_flatten.py +0 -0
  426. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_text_mode.py +0 -0
  427. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_v23_1_fixes.py +0 -0
  428. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_v23_bug_fixes.py +0 -0
  429. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_v23_post_corpus.py +0 -0
  430. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_v23_post_corpus_v2.py +0 -0
  431. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_v2_backwards_compat.py +0 -0
  432. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_v2_top_level_exports.py +0 -0
  433. {docpluck-2.4.96 → docpluck-2.4.97}/tests/test_whitespace_cluster.py +0 -0
  434. {docpluck-2.4.96 → docpluck-2.4.97}/tools/canary_provenance.py +0 -0
  435. {docpluck-2.4.96 → docpluck-2.4.97}/tools/fix_python_env.ps1 +0 -0
  436. {docpluck-2.4.96 → docpluck-2.4.97}/tools/render_for_audit.py +0 -0
@@ -1,5 +1,19 @@
1
1
  # Changelog
2
2
 
3
+ ## [2.4.97] — 2026-06-22
4
+
5
+ **Three table fixes shipped together (combined from two concurrent sessions): type the skipped p+df columns (DP-2), stop dropping / mis-binding two-header-row tables (DP-5), and stop the table raw_text fallback swallowing body prose (RC-T Layer-2).** `TABLE_EXTRACTION_VERSION` → `2.4.2`; no `NORMALIZATION_VERSION` / `SECTIONING_VERSION` change. DP-2/DP-5 are render-visible in the inline flattened-table blocks + the `.tables.jsonl` sidecar `fields` (the `<table>` HTML gains the previously-dropped data rows); RC-T Layer-2 is render-visible in the `unstructured-table` fallback blocks. DP-2/DP-5 filed in `ESCIcheckapp/docs/DOCPLUCK_HANDOFF_2026-06-21.md`; RC-T Layer-2 per `docs/superpowers/specs/2026-06-21-rc-t-table-region-prose-contamination.md`.
6
+
7
+ - **DP-2 — type the unlabeled p and df columns.** `tables.flatten._recover_blank_roles` recovered the leading test statistic and the `d [CI]` column of a header-stripped result table but left the bare p-value and df columns between them untyped, so `collabra.77859` Table 3 emitted `fields: {group, t, d, CI}` and dropped the `p` (`.551`) and `df` (`260.54`). A new Pass 4.5 types a still-blank column that is a bare `.XXX` with no comparison op as `p`, and a bare integer / Welch-decimal sitting between the test statistic and its `est/CI` column as `df` — keyed on data shape + position relative to the already-recovered roles, never bare position. The four Table-3 rows now carry `p` and `df`.
8
+
9
+ - **DP-5 — two-row-header parallel-arm tables: recover the first data row and align centered super-headers.** `collabra.90203` Table 10 delivered only 5 of its 6 correlation rows (the Identifiable/Explicit-learning row was silently dropped), and the Original/Replication arms of `xiao_2021` Table 4 were swapped. Three coupled root-cause fixes: (a) `cell_cleaning._is_header_like_row` now counts APA value shapes (leading-dot decimal, bracketed CI, operator-prefixed p, `N/A`) as data via `_DATA_VALUE_CELL_RE`, so a real first data row is no longer mis-read as a third header row (the bracket branch requires a digit and no letters inside, so a genuine `[95% CI]` header stays a header); (b) `tables.flatten._detect_column_groups` re-derives arm boundaries from equal-width blocks of the data region — each must contain exactly one super-label — so a *centered* super-label (camelot stream loses colspan and folds it mid-span) no longer swaps arm values or pushes a stat column into the label region; left-aligned super-headers stay byte-identical; (c) `tables.flatten._classify_column` reads a folded super-header cell's role from its sub-part so a folded `…<sep>95% CI` column is still typed `CI`. Table 10 now emits all 6 conditions split into Target-article / Replication arms with correct `r` / `n` / `CI` / `p`; xiao Table 4 arms are no longer swapped; incidentally recovers `chan_feldman` Table 8 arm labels and `jama_open_2` Table 3 HR estimates + CIs.
10
+
11
+ - **RC-T Layer-2 — stop the table raw_text fallback swallowing body prose.** When Camelot recovers no cells, `extract_structured._extract_table_body_text` linearises the text after a caption as the `unstructured-table` fallback; its per-line prose gate (`_line_is_body_prose`, len ≥ 80) misses prose that pdftotext WRAPPED into short (~48-char) lines, so a short table's caption-anchored region overshot the table end and swallowed Results/Discussion prose. Two FP-safe structural fixes: **(a) Note-anchor** — a table's `Note:` footnote is, by convention, its last element, so trim everything after the note paragraph (`chan_feldman` T1/T3 + `efendic_2022` T5 trailing Discussion prose removed; the stat rows + the note are kept); **(b) degenerate-prose guard** — suppress a fallback block that STARTS mid-sentence with a lowercase multi-letter word AND is majority sentence-shaped prose, so the renderer emits a clean caption-only table (`chan_feldman` T9 was an entire verbatim duplicate of `## Discussion` — now caption-only, no duplication). FP-safe by construction: real table cells start with a header / label / number / single-letter item marker, never a wrapped mid-sentence continuation — hypotheses ("a There is a positive association…"), descriptive rows ("Median age"), and instrument fragments are preserved. Keyed on the structural overshoot signature, never paper identity.
12
+
13
+ Verification: new real-PDF + contract regression tests (`tests/test_tables_superheader_alignment_real_pdf.py`) — collabra.90203 T10 six-conditions/correct-arms + xiao T4 not-swapped (each FAILS at HEAD, PASSES after), plus `_is_header_like_row` / `_detect_column_groups` contract cases; `tests/test_tables_flatten_blank_header_recovery.py` extended for DP-2. A full-corpus (101-PDF) cached-table flatten diff confirms no clean-table regression — every changed table is a recovered row, a correct arm split, a recovered field, or a removed stat-less spurious row; already-garbage tables shuffle without a clean table regressing. Broad pytest green (real-PDF Camelot tests run serially per file — non-deterministic under cumulative load). RC-T Layer-2 adds `tests/test_rc_t_layer2_raw_text_real_pdf.py` (6 contract + 4 real-PDF: chan T1 Note-anchor, T9 suppress-no-duplication, T3 preserved) and an independent full-corpus 101-PDF guard-live-vs-bypassed raw_text diff (`grew=0 changed=0`; 4 trims + 8 prose-suppressions only). A 7-canary Sonnet AI-gold verify confirms every table this release touched is correct (chan T1/T3/T9, maier T10 six-conditions, xiao T4 arms) with no new TEXT-LOSS / HALLUCINATION.
14
+
15
+ **Deferred (pre-existing, user decision 2026-06-22):** the remaining canary AI-verify FAILs are the architectural backlog, NOT regressions from this release — RC-T **Layer-1** table-data recovery (`table_areas`; e.g. plos_med Table 5's SAE rows, chan_feldman / chandrashekar under-extraction) and RC-1 two-column / sidebar column-interleave. Tracked in `docs/TRIAGE_2026-06-21_head_v2.4.95_assessment.md`; intentionally not addressed here.
16
+
3
17
  ## [2.4.96] — 2026-06-21
4
18
 
5
19
  **RC-T (Option A): strip Camelot "tables" that are absorbed body prose, not data.** Render-only — `render.py::_strip_phantom_camelot_tables`; no `TABLE_EXTRACTION_VERSION` / `NORMALIZATION_VERSION` / `SECTIONING_VERSION` change.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docpluck
3
- Version: 2.4.96
3
+ Version: 2.4.97
4
4
  Summary: PDF, DOCX, and HTML text extraction and normalization for academic papers
5
5
  Project-URL: Homepage, https://docpluck.app
6
6
  Project-URL: Documentation, https://docpluck.app/api-docs
@@ -78,7 +78,7 @@ from .figures import Figure
78
78
  from .extract_structured import TABLE_EXTRACTION_VERSION, StructuredResult, extract_pdf_structured
79
79
  from .render import render_pdf_to_markdown
80
80
 
81
- __version__ = "2.4.96"
81
+ __version__ = "2.4.97"
82
82
  __author__ = "Gilad Feldman"
83
83
  __license__ = "MIT"
84
84
 
@@ -37,7 +37,7 @@ from .tables.render import cells_to_html
37
37
  from .telemetry import record_fallback
38
38
 
39
39
 
40
- TABLE_EXTRACTION_VERSION = "2.4.0" # v2.4.0 (REQUEST_11): flatten now populates fields for NON-clinical result tables — (a) blank-header column-role recovery (tables.flatten._recover_blank_roles): assign a stat role to a header-stripped column from its data-token SHAPE (CI brackets, df1/df2 pair, estimate-adjacent-CI, p-with-operator) AND caption/footnote/all-header-rows vocabulary, never bare position; recovers collabra.77859 T5 (t/df/d/CI) + collabra.90203 T8/T9 (F/df/p/BF01/eta²p-as-est/CI). (b) packed parallel-arm split (tables.flatten._detect_packed_arms/_flatten_packed_arms): tables packing k≥2 arms into single cells ("Separate Joint" + space-joined values) emit one typed record per arm (group=arm) — collabra.77859 T3 Separate/Joint, xiao_2021 T7 Regret/Justifiability. (c) new BF01 role; validity guards drop r∉[-1,1] / non-monotone CI / non-int n / p∉[0,1]. (d) GENERAL L-004 fixes: _parse_number + _parse_ci_cell fold U+2212 MINUS (negative t/d/CI bounds in Camelot cells were dropped/sign-lost); _VALUE_GROUP_RE handles bracket-led CI groups. Default render + PROSECCO output byte-identical. # v2.3.0 (Tier-2, REQUEST_10): cross-flavor lattice-augmentation — recover data rows a lattice extraction vertically TRUNCATED by appending the rows a same-page, same-column-count stream table captured below the lattice bbox (camelot_extract._augment_lattice_with_stream_rows), gated on equal-col-count + bbox overlap + extends-below; PLUS numeric/parenthetical continuation merge (cell_cleaning._merge_continuation_rows) rejoining stream's stacked value/parenthetical cells. Fixes PROSECCO Table 2 R2-R6. v2.2.0: EC-T1 docpluck.tables.flatten — per-row FlattenedRow records (sentence + structured fields) for downstream stat-verification consumers (effectcheck/escimate/scimeto) + opt-in inline "rendered as text" block below each <table> via render_pdf_to_markdown(flatten_tables_inline=True). v2.1.5: cell-cleaning recovers CMEX10 extensible-bracket PUA glyphs (U+F8EE-F8FB). 
v2.1.4: cell-cleaning recovers Adobe-Symbol-font PUA glyphs (beta/chi/bullet as U+F0xx). v2.1.3: cell-cleaning recovers '<'-as-backslash glyph corruption. v2.1.2: cell-cleaning recovers descending-CI '2'-for-minus corruption. v2.1.1: cell-cleaning recovers (cid:0) corrupted minus signs + strips math-alphanumeric styling. v2.1.0: cell-cleaning pipeline ported from splice spike (multi-row header detection, continuation merging, leader-dot strip, mash-split, group separators, sig-marker attach)
40
+ TABLE_EXTRACTION_VERSION = "2.4.2" # v2.4.2 (RC-T Layer-2): _extract_table_body_text now (a) Note-anchor — a table's "Note:" footnote is its last element, so trim body prose bled past it (chan_feldman T1/T3, efendic_2022 T5); and (b) degenerate-prose guard — suppress a raw_text fallback that STARTS mid-sentence with a lowercase multi-letter word AND is majority sentence-shaped prose, so render emits a clean caption-only table instead of an unstructured-table dump duplicating Results/Discussion prose (chan_feldman T9 was a verbatim ## Discussion duplicate). FP-safe (real cells start with header/label/number/single-letter marker, never a wrapped continuation); full-corpus 101-PDF guard-diff only trims+suppresses (grew=0 changed=0). # v2.4.1 (DP-2/DP-5): (DP-2) blank-header role recovery now types the unlabeled p-value (a bare `.XXX` after the test stat, no comparison op) and df (a bare integer/Welch-decimal between the stat and the d[CI] column) columns it previously skipped — collabra.77859 T3 fields gain p+df (tables.flatten._recover_blank_roles Pass 4.5). (DP-5) parallel-arm tables with a TWO-ROW header no longer drop their first data row, and a CENTERED super-header is aligned to its arm block instead of its visual-center column: (a) cell_cleaning._is_header_like_row counts APA value shapes (leading-dot decimal, bracketed CI, operator-prefixed p, N/A) as data via _DATA_VALUE_CELL_RE so a real first data row isn't read as a 3rd header row (collabra.90203 T10 recovered the Identifiable/Explicit-learning correlation); (b) tables.flatten._detect_column_groups re-derives arm boundaries from equal-width blocks of the data region (each must hold one super-label) so a centered super-label folded mid-span no longer swaps arm values (xiao_2021 T4 Original/Replication F) or pushes a stat column into the label region; (c) tables.flatten._classify_column reads a folded super-header cell's role from its sub-part (collabra.90203 T10 CI). 
Full-corpus cached-table flatten diff: no clean-table regression. # v2.4.0 (REQUEST_11): flatten now populates fields for NON-clinical result tables — (a) blank-header column-role recovery (tables.flatten._recover_blank_roles): assign a stat role to a header-stripped column from its data-token SHAPE (CI brackets, df1/df2 pair, estimate-adjacent-CI, p-with-operator) AND caption/footnote/all-header-rows vocabulary, never bare position; recovers collabra.77859 T5 (t/df/d/CI) + collabra.90203 T8/T9 (F/df/p/BF01/eta²p-as-est/CI). (b) packed parallel-arm split (tables.flatten._detect_packed_arms/_flatten_packed_arms): tables packing k≥2 arms into single cells ("Separate Joint" + space-joined values) emit one typed record per arm (group=arm) — collabra.77859 T3 Separate/Joint, xiao_2021 T7 Regret/Justifiability. (c) new BF01 role; validity guards drop r∉[-1,1] / non-monotone CI / non-int n / p∉[0,1]. (d) GENERAL L-004 fixes: _parse_number + _parse_ci_cell fold U+2212 MINUS (negative t/d/CI bounds in Camelot cells were dropped/sign-lost); _VALUE_GROUP_RE handles bracket-led CI groups. Default render + PROSECCO output byte-identical. # v2.3.0 (Tier-2, REQUEST_10): cross-flavor lattice-augmentation — recover data rows a lattice extraction vertically TRUNCATED by appending the rows a same-page, same-column-count stream table captured below the lattice bbox (camelot_extract._augment_lattice_with_stream_rows), gated on equal-col-count + bbox overlap + extends-below; PLUS numeric/parenthetical continuation merge (cell_cleaning._merge_continuation_rows) rejoining stream's stacked value/parenthetical cells. Fixes PROSECCO Table 2 R2-R6. v2.2.0: EC-T1 docpluck.tables.flatten — per-row FlattenedRow records (sentence + structured fields) for downstream stat-verification consumers (effectcheck/escimate/scimeto) + opt-in inline "rendered as text" block below each <table> via render_pdf_to_markdown(flatten_tables_inline=True). 
v2.1.5: cell-cleaning recovers CMEX10 extensible-bracket PUA glyphs (U+F8EE-F8FB). v2.1.4: cell-cleaning recovers Adobe-Symbol-font PUA glyphs (beta/chi/bullet as U+F0xx). v2.1.3: cell-cleaning recovers '<'-as-backslash glyph corruption. v2.1.2: cell-cleaning recovers descending-CI '2'-for-minus corruption. v2.1.1: cell-cleaning recovers (cid:0) corrupted minus signs + strips math-alphanumeric styling. v2.1.0: cell-cleaning pipeline ported from splice spike (multi-row header detection, continuation merging, leader-dot strip, mash-split, group separators, sig-marker attach)
41
41
 
42
42
  TableTextMode = Literal["raw", "placeholder"]
43
43
 
@@ -1306,6 +1306,74 @@ def _line_is_body_prose(line: str) -> bool:
1306
1306
  return stopwords_hit >= 4
1307
1307
 
1308
1308
 
1309
+ def _join_wrapped_lines(lines: list[str]) -> list[str]:
1310
+ """Merge pdftotext-wrapped lines into logical paragraphs.
1311
+
1312
+ pdftotext linearizes a flowing prose paragraph into several short
1313
+ (~45-60 char) lines; the per-line ``_line_is_body_prose`` gate
1314
+ (len >= 80) cannot see prose in that wrapped form. Joining a line with
1315
+ the next whenever it does not end on sentence-terminal punctuation
1316
+ reconstructs the paragraph so prose can be measured at paragraph scale.
1317
+ """
1318
+ paras: list[str] = []
1319
+ cur = ""
1320
+ for ln in lines:
1321
+ s = ln.strip()
1322
+ if not s:
1323
+ continue
1324
+ cur = (cur + " " + s).strip() if cur else s
1325
+ if s.endswith((".", "!", "?", ":")):
1326
+ paras.append(cur)
1327
+ cur = ""
1328
+ if cur:
1329
+ paras.append(cur)
1330
+ return paras
1331
+
1332
+
1333
+ def _raw_text_is_degenerate_prose(text: str) -> bool:
1334
+ """True if a table raw_text fallback is dominated by flowing body prose.
1335
+
1336
+ RC-T Layer-2 (v2.4.97). When Camelot recovers no cells AND the
1337
+ caption-anchored region has no extractable table text near the caption,
1338
+ the body_start walk lands INSIDE a prose paragraph and the fallback
1339
+ swallows Results/Discussion prose (which is then duplicated under its
1340
+ real section heading). Such a block must be suppressed (render then
1341
+ emits a clean caption-only table) rather than dumped verbatim.
1342
+
1343
+ FP-safe by construction — fires only when BOTH hold:
1344
+ (a) the block STARTS mid-sentence: its first line begins with a
1345
+ lowercase multi-letter continuation word. A real table's
1346
+ linearized cells start with a column header, label, number, or a
1347
+ single-letter item marker (``a``/``b``/``c``) — never a wrapped
1348
+ mid-paragraph continuation like "than empathy. We provided ...".
1349
+ (b) the joined block is majority (>= 60% of chars) sentence-shaped
1350
+ body prose.
1351
+
1352
+ Legitimate degraded tables are preserved: hypotheses ("a There is a
1353
+ positive association ..."), descriptive rows ("Median age (years)"),
1354
+ instrument items ("h et al., 1997)") all fail (a). Keyed purely on the
1355
+ structural overshoot signature, never on paper identity.
1356
+ """
1357
+ lines = [ln for ln in text.split("\n") if ln.strip()]
1358
+ if len(lines) < 4:
1359
+ return False
1360
+ first_tokens = lines[0].split()
1361
+ first_word = first_tokens[0] if first_tokens else ""
1362
+ starts_midsentence = (
1363
+ len(first_word) >= 2
1364
+ and first_word[0].islower()
1365
+ and first_word[0].isalpha()
1366
+ )
1367
+ if not starts_midsentence:
1368
+ return False
1369
+ paragraphs = _join_wrapped_lines(lines)
1370
+ total = sum(len(p) for p in paragraphs)
1371
+ if total == 0:
1372
+ return False
1373
+ prose = sum(len(p) for p in paragraphs if _line_is_body_prose(p))
1374
+ return prose >= 0.6 * total
1375
+
1376
+
1309
1377
  def _extract_table_body_text(
1310
1378
  raw_text: str,
1311
1379
  cap: CaptionMatch,
@@ -1379,6 +1447,31 @@ def _extract_table_body_text(
1379
1447
  break
1380
1448
  kept.append(ln)
1381
1449
 
1450
+ # Note-anchor table-end (RC-T Layer-2, v2.4.97). A table's "Note:" /
1451
+ # "Notes:" footnote is, by academic-table convention, its LAST element.
1452
+ # Any text after the note paragraph is body prose that bled past the
1453
+ # table boundary — the caption-anchored region overshot the table end
1454
+ # and the per-line `_line_is_body_prose` gate (len >= 80) misses prose
1455
+ # that pdftotext WRAPPED into short (~48-char) lines, so it accumulates
1456
+ # here. Trim everything after the note's (possibly wrapped) paragraph.
1457
+ # This is FP-safe: legitimate table cells (hypotheses a/b/c, instrument
1458
+ # items) appear BEFORE the note; nothing legitimate follows it. Keyed on
1459
+ # the structural "Note: ... <sentence end>" signature, never paper
1460
+ # identity. `^Notes?[.:]` requires punctuation so body prose that merely
1461
+ # starts with the word "Note that ..." does not false-trigger.
1462
+ note_idx = next(
1463
+ (i for i, ln in enumerate(kept)
1464
+ if re.match(r"^\s*Notes?[.:]", ln.strip())),
1465
+ None,
1466
+ )
1467
+ if note_idx is not None and not os.environ.get("DOCPLUCK_RCT_L2_BYPASS"):
1468
+ note_end = note_idx
1469
+ for k in range(note_idx, len(kept)):
1470
+ note_end = k
1471
+ if kept[k].strip().endswith((".", "!", "?")):
1472
+ break
1473
+ kept = kept[: note_end + 1]
1474
+
1382
1475
  # Trim trailing heading-like short lines that don't belong to this table
1383
1476
  # (the start of the next section). Two patterns are trimmed:
1384
1477
  # * Title-Case headings without a sentence terminator
@@ -1414,7 +1507,17 @@ def _extract_table_body_text(
1414
1507
  s = re.sub(r"[ \t]+", " ", ln).strip()
1415
1508
  if s:
1416
1509
  cleaned_lines.append(s)
1417
- return "\n".join(cleaned_lines).strip()
1510
+ result = "\n".join(cleaned_lines).strip()
1511
+ # Degenerate-prose guard (RC-T Layer-2, v2.4.97): drop a raw_text
1512
+ # fallback that is really body prose the region overshot into, so the
1513
+ # renderer emits a clean caption-only table instead of an
1514
+ # ``unstructured-table`` dump that duplicates Results/Discussion prose.
1515
+ # ``DOCPLUCK_RCT_L2_BYPASS`` reverts both Layer-2 additions (Note-anchor
1516
+ # + this guard) to HEAD behavior — used only by the FP-scan harness to
1517
+ # diff guard-live vs guard-bypassed over the full corpus.
1518
+ if not os.environ.get("DOCPLUCK_RCT_L2_BYPASS") and _raw_text_is_degenerate_prose(result):
1519
+ return ""
1520
+ return result
1418
1521
 
1419
1522
 
1420
1523
  def _figure_from_caption(
@@ -393,13 +393,32 @@ _NUMERIC_CELL_RE = re.compile(
393
393
  r"^[-−–]?\d+(?:[.,]\d+)*(?:[%∗*]+)?(?:\s*\([^)]*\))?$"
394
394
  )
395
395
 
396
+ # A cell carrying a statistic VALUE (vs a header label). Broader than
397
+ # _NUMERIC_CELL_RE: also matches APA leading-dot decimals (".34"), operator-
398
+ # prefixed p-values ("< .001"), bracketed numeric intervals ("[0.53, 0.72]"),
399
+ # and the "N/A" filler — all DATA, not header text. The interval branch requires
400
+ # a digit and NO letters inside the brackets so a genuine header cell like
401
+ # "[95% CI]" (letters present) is NOT counted as data and stays a header. Used by
402
+ # `_is_header_like_row` so a real data row whose APA-formatted values the bare
403
+ # numeric pattern under-counted is not mistaken for an extra header row — the
404
+ # bug that silently dropped the FIRST data row of two-header-row correlation
405
+ # tables (collabra.90203 Table 10, DP-5).
406
+ _DATA_VALUE_CELL_RE = re.compile(
407
+ r"^(?:"
408
+ r"[<>=]?\s*[-−–]?\d*[.,]?\d+(?:[.,]\d+)*(?:[%∗*]+)?(?:\s*\([^)]*\))?"
409
+ r"|\[[^\]A-Za-z]*\d[^\]A-Za-z]*\]"
410
+ r"|n\s*/?\s*a"
411
+ r")$",
412
+ re.I,
413
+ )
414
+
396
415
 
397
416
  def _is_header_like_row(row: list[str]) -> bool:
398
417
  """Heuristic: a row that looks like part of a header rather than data."""
399
418
  nonempty = [c.strip() for c in row if (c or "").strip()]
400
419
  if not nonempty:
401
420
  return False
402
- numeric = sum(1 for c in nonempty if _NUMERIC_CELL_RE.match(c))
421
+ numeric = sum(1 for c in nonempty if _DATA_VALUE_CELL_RE.match(c))
403
422
  if numeric / len(nonempty) > 0.3:
404
423
  return False
405
424
  avg_len = sum(len(c) for c in nonempty) / len(nonempty)
@@ -261,6 +261,17 @@ def _classify_column(header: str) -> Optional[str]:
261
261
  h = (header or "").strip()
262
262
  if not h:
263
263
  return None
264
+ # A folded super-header cell ("Replication\x00BR\x0095% CI") carries the GROUP
265
+ # label in the super-part and the column's OWN role in the sub-part. Classify
266
+ # on the sub-part first (then the super-part) so a folded CI / p / stat column
267
+ # is still recognized — otherwise the whole "Replication…95% CI" string never
268
+ # matches and the column's role is lost (collabra.90203 T10 CI, DP-5).
269
+ if _MERGE_SEPARATOR in h:
270
+ for part in reversed([p.strip() for p in h.split(_MERGE_SEPARATOR) if p.strip()]):
271
+ role = _classify_column(part)
272
+ if role:
273
+ return role
274
+ return None
264
275
  # Strip a single trailing punct (`,`, `:`, `.`) that some PDFs include.
265
276
  h = h.rstrip(",:.")
266
277
  for role, pat in _ROLE_PATTERNS:
@@ -477,6 +488,18 @@ def _looks_like_p(v: str) -> bool:
477
488
  return bool(_P_SHAPE_RE.match(v or "") or _NA_RE.match(v or ""))
478
489
 
479
490
 
491
+ # A sub-one decimal: a value in [0, 1) written APA-style (".551", "0.03") with an
492
+ # optional comparison op ("<.001"). Unlike `_P_SHAPE_RE` it rejects an integer
493
+ # part ≥ 1 (so a test statistic like "1.31" is NOT mistaken for a p-value). Used
494
+ # to separate a still-blank p column (sub-one) from a still-blank df / n column
495
+ # (values ≥ 1) when both are unlabeled and adjacent. (Pass 4.5, DP-2.)
496
+ _SUB_ONE_DEC_RE = re.compile(r"^\s*[<>=]?\s*0?\.\d+\s*$")
497
+
498
+
499
+ def _looks_like_sub_one(v: str) -> bool:
500
+ return bool(_SUB_ONE_DEC_RE.match(v or ""))
501
+
502
+
480
503
  def _has_comparison_op(v: str) -> bool:
481
504
  return "<" in (v or "") or ">" in (v or "")
482
505
 
@@ -800,6 +823,54 @@ def _recover_blank_roles(
800
823
  if _frac_match(vals, _is_num_or_na) and (ci + 1) in ci_cols:
801
824
  override[ci] = "est"
802
825
 
826
+ # Pass 4.5 — p / df (or n) recovery for an established t/F/r results table.
827
+ # Once the statistic column is typed (Pass 3 / grid) and the table carries a
828
+ # recognized effect/CI column, the still-blank bare-numeric columns BETWEEN
829
+ # the statistic and that interval are the p-value and the df/n: p is a sub-one
830
+ # decimal (".551", "0.03", "<.001") with no integer part; df/n is a bare
831
+ # number ≥ 1 (Welch "260.54", integer "131"). This types the operator-less p
832
+ # that Pass 1 defers and the mixed-integer/decimal df that Pass 2 (all-integer
833
+ # only) skips — both unambiguous HERE because position (after the statistic,
834
+ # before the interval) pins them. Runs AFTER the caption-run pass so a leaked
835
+ # header always wins, and BEFORE Pass 5 so a real df is not stolen as an
836
+ # est-adjacent point estimate. Keyed on structure, never paper identity.
837
+ # (DP-2: collabra.77859 Separate/Joint t-tests dropped p + Welch df.)
838
+ if family in ("t", "F", "r"):
839
+ stat_col = next(
840
+ (ci for ci in cols if (override.get(ci) or grid_role[ci]) == family),
841
+ None,
842
+ )
843
+ if stat_col is not None:
844
+ right_bound = min(
845
+ (
846
+ ci
847
+ for ci in cols
848
+ if ci > stat_col
849
+ and grid_role[ci] in ("est_ci", "CI", "CI_lo", "CI_hi", "est")
850
+ ),
851
+ default=n,
852
+ )
853
+ present_roles = {grid_role[ci] for ci in cols if grid_role[ci]} | set(
854
+ override.values()
855
+ )
856
+ has_p = "p" in present_roles
857
+ for ci in cols:
858
+ if grid_role[ci] or ci in override:
859
+ continue
860
+ if not (stat_col < ci < right_bound):
861
+ continue
862
+ vals = _column_values(body, ci)
863
+ if not _frac_match(vals, _is_num_or_na):
864
+ continue
865
+ if not has_p and _frac_match(vals, _looks_like_sub_one):
866
+ override[ci] = "p"
867
+ has_p = True
868
+ elif _frac_match(
869
+ vals,
870
+ lambda v: bool(_BARE_NUM_RE.match(v)) and (_parse_number(v) or 0) >= 1,
871
+ ):
872
+ override[ci] = "n" if family == "r" else "df"
873
+
803
874
  # Pass 5 — final est-adjacency sweep for tables with no caption run: a
804
875
  # still-blank bare-number column immediately left of a CI column is the
805
876
  # interval's point estimate.
@@ -1164,10 +1235,55 @@ def _detect_column_groups(
1164
1235
  starts = [i for i, h in enumerate(header) if _MERGE_SEPARATOR in (h or "")]
1165
1236
  if len(starts) < 2:
1166
1237
  return None
1238
+ n = len(header)
1239
+
1240
+ # The sentinel marks where camelot PLACED each super-label — but a *centered*
1241
+ # spanning label (colspan is lost in stream extraction) lands mid-span, not at
1242
+ # its arm's first column, so trusting the sentinel as the arm boundary
1243
+ # mis-bins columns: collabra.90203 T10 puts the Target-article "r" into the
1244
+ # label region, and xiao_2021 T4 splits Original/Replication with the F values
1245
+ # swapped. Re-derive arm boundaries from EQUAL-WIDTH blocks of the data region
1246
+ # (the columns between the leading + trailing non-stat label columns), each of
1247
+ # which must contain exactly one super-label. Falls back to the literal
1248
+ # sentinel boundaries when the region does not divide evenly — so every
1249
+ # previously-grouped table stays byte-identical unless this strictly corrects
1250
+ # its alignment (a left-aligned super-header already at the block start yields
1251
+ # the identical grouping). General, keyed on structure, not paper id. (DP-5.)
1252
+ def _is_label_col(i: int) -> bool:
1253
+ return i not in starts and not _classify_column(header[i])
1254
+
1255
+ lead = 0
1256
+ while lead < n and _is_label_col(lead):
1257
+ lead += 1
1258
+ trail = n - 1
1259
+ while trail >= 0 and _is_label_col(trail):
1260
+ trail -= 1
1261
+ width = trail - lead + 1
1262
+ k = len(starts)
1263
+ if (
1264
+ width >= k
1265
+ and width % k == 0
1266
+ and lead <= starts[0]
1267
+ and starts[-1] <= trail
1268
+ ):
1269
+ block = width // k
1270
+ blocks = [(lead + j * block, lead + (j + 1) * block - 1) for j in range(k)]
1271
+ if all(b_lo <= s <= b_hi for (b_lo, b_hi), s in zip(blocks, starts)):
1272
+ label_cols = [i for i in range(n) if i < blocks[0][0] or i > blocks[-1][1]]
1273
+ groups = [
1274
+ (
1275
+ (header[s].split(_MERGE_SEPARATOR, 1)[0]).strip(),
1276
+ list(range(b_lo, b_hi + 1)),
1277
+ )
1278
+ for (b_lo, b_hi), s in zip(blocks, starts)
1279
+ ]
1280
+ return label_cols, groups
1281
+
1282
+ # Fallback: literal sentinel-boundary grouping (pre-existing behavior).
1167
1283
  label_cols = list(range(0, starts[0]))
1168
- groups: list[tuple[str, list[int]]] = []
1284
+ groups = []
1169
1285
  for gi, start in enumerate(starts):
1170
- end = starts[gi + 1] if gi + 1 < len(starts) else len(header)
1286
+ end = starts[gi + 1] if gi + 1 < len(starts) else n
1171
1287
  glabel = (header[start].split(_MERGE_SEPARATOR, 1)[0]).strip()
1172
1288
  groups.append((glabel, list(range(start, end))))
1173
1289
  return label_cols, groups
@@ -0,0 +1,71 @@
1
+ # DP-2 + DP-5 flatten fixes — commit/coordination handoff (2026-06-22)
2
+
3
+ ## 1. Goal
4
+ Commit the **DP-2 + DP-5 table-flatten fixes** (already complete + verified, working tree, v2.4.97) as a clean commit on top of the concurrent session's RC-T render-guard commit `84a4d42` (v2.4.96) — staging **only the 8 listed files** — without colliding with the parallel session that shares this working tree.
5
+
6
+ ## 2. Why it matters
7
+ docpluck is a meta-science tool; a dropped or mis-bound table row silently corrupts downstream stat verification (effectcheck/ESCImate). DP-2 and DP-5 came from a real consumer handoff (`ESCIcheckapp/docs/DOCPLUCK_HANDOFF_2026-06-21.md`). **Two Claude sessions are editing this one working tree concurrently** (the other committed `84a4d42` mid-work), so the commit must stage explicit paths — a `git add -A`/`git add .` from either session would sweep the other's unfinished work into the wrong commit (memory `release-version-collision-with-parallel-uncommitted-stream`).
8
+
9
+ ## 3. State at handoff
10
+ - Branch: `feat/rc-t-table-region-guard`
11
+ - HEAD commit: `84a4d42` (`fix(render): RC-T — strip Camelot tables that are absorbed body prose (v2.4.96)`) — committed by the **concurrent session** (Gilad Feldman, 2026-06-22 07:56), not this one.
12
+ - Committed in this session: **none** (this session did not commit, per the "commit only when asked" rule).
13
+ - Uncommitted (this session's DP-2/DP-5 work, all on top of `84a4d42`):
14
+ - `docpluck/tables/flatten.py` — DP-2 Pass 4.5 (type blank `p`/`df`); DP-5 `_classify_column` sentinel-aware + `_detect_column_groups` equal-width-block arm alignment
15
+ - `docpluck/tables/cell_cleaning.py` — DP-5 `_DATA_VALUE_CELL_RE` + `_is_header_like_row` data-value recognition
16
+ - `docpluck/extract_structured.py` — `TABLE_EXTRACTION_VERSION` `2.4.0` → `2.4.1` (DP-2/DP-5 note)
17
+ - `docpluck/__init__.py` — `__version__` `2.4.96` → `2.4.97`
18
+ - `pyproject.toml` — `version` `2.4.96` → `2.4.97`
19
+ - `CHANGELOG.md` — new `[2.4.97]` entry
20
+ - `tests/test_tables_flatten_blank_header_recovery.py` — DP-2 tests added (`test_separate_arm_p_and_df`, `test_joint_arm_p_and_integer_df`, packed-arms p/df assertions)
21
+ - `tests/test_tables_superheader_alignment_real_pdf.py` — **new** (DP-5 real-PDF + contract tests)
22
+ - Working artifacts (gitignored, safe to ignore/delete): `tmp/repro_dp.py`, `tmp/dbg_dp2.py`, `tmp/cache_and_flatten.py`, `tmp/flat_mine.json`, `tmp/flat_head.json`, `tmp/tblcache/*.json`.
23
+
24
+ ## 4. What's done (verified)
25
+ - **DP-2 — 77859 Table 3 `fields` now include `p` + `df`.** `_recover_blank_roles` Pass 4.5 types the operator-less `.XXX` p column and the integer/Welch-decimal df column it previously skipped. Verified: the DP-2 tests **fail at HEAD** (`KeyError: 'p'`) and **pass** after; `flatten_table` on the live PDF yields `p=.551, df=260.54` for the Separate arm.
26
+ - **DP-5 — 90203 Table 10 emits all 6 conditions, correctly arm-split.** Handoff blamed Camelot, but Camelot extracted all 6 rows — `flatten` dropped the first data row (mistook it for a 3rd header row) then mis-bound the centered super-header. Three coupled fixes (header-detection, block-alignment, folded-header classify). Verified: Table 10 → 12 rows (6 conditions × Target/Replication) with the **exact** handoff values (`r=.63, n=170, CI [0.53,0.72]` for Identifiable/Explicit); rendered `.md` `<table>` shows all 6 rows; real-PDF tests fail-at-HEAD/pass-after.
27
+ - **Incidental correct improvements** (same root cause): `xiao_2021` T4 Original/Replication F **un-swapped** (was wrong at HEAD — a canary), `chan_feldman` T8 arm labels recovered (canary), `jama_open_2` T3 HR estimates+CIs recovered.
28
+ - **No clean-table regression**: full-corpus (101-PDF) **deterministic cached-table flatten diff** (mine vs HEAD) — every change is a recovered row, a correct arm split, a recovered field, or a removed stat-less spurious row; already-garbage tables (chen T9, aom amd_2, ieee T10) shuffle but no clean table regressed. 285 contract tests pass; both touched test files pass per-file (superheader 7/7, flatten_blank_header 27/27).
29
+
30
+ ## 5. What's next (numbered, concrete)
31
+ 1. **Coordinate with the concurrent session first.** Confirm the other session has finished writing to the working tree (or have it commit/stash its own files) so the two change-sets don't interleave. Both sets are on `84a4d42`; they touch disjoint files (theirs: `render.py`; mine: `flatten.py`/`cell_cleaning.py`), so they compose cleanly.
32
+ 2. **Commit DP-2 + DP-5 as v2.4.97, staging only these 8 files** (never `git add -A`):
33
+ ```bash
34
+ git add docpluck/__init__.py pyproject.toml docpluck/extract_structured.py \
35
+ docpluck/tables/cell_cleaning.py docpluck/tables/flatten.py \
36
+ tests/test_tables_flatten_blank_header_recovery.py \
37
+ tests/test_tables_superheader_alignment_real_pdf.py CHANGELOG.md
38
+ git commit -m "fix(tables): DP-2 type blank p/df + DP-5 two-header-row recovery & super-header alignment (v2.4.97)"
39
+ ```
40
+ (Optional: split into two commits — DP-2 = `flatten.py` Pass 4.5 + the `test_tables_flatten_blank_header_recovery.py` additions; DP-5 = `cell_cleaning.py` + the rest of `flatten.py` + the new test file — if independent revertability is wanted. The version-bump files go with whichever commit ships last.)
41
+ 3. **Before any `git tag v2.4.97` / release**: run the formal Sonnet canary AI-verify (the project's keystone gate — `references/ai-full-doc-verify.md`) on the touched canaries (`90203` maier, `xiao`, `chan_feldman`) against the article-finder golds, and the 26-paper baseline. Tagging fires `bump-app-pin.yml`, so do not tag until that gate is green (run `python scripts/check_app_pin_sync.py` after).
42
+ 4. **Architectural backlog — leave as documented backlog** (user decision 2026-06-22). Do NOT start RC-T Layer-1 recovery or RC-1 default-flip in this stream. The four remaining handoff defects map to existing tracked work (see the backlog table in §8).
43
+
44
+ ## 6. Open decisions
45
+ - **Tag/release v2.4.97 now, or batch with later table work?** Options: (A) tag now — ships the fixes to the app via the pin bump, but each tag is a Railway redeploy; (B) leave committed-but-untagged and batch with the next table cycle. **Recommendation: (B)** — the concurrent session's v2.4.96 is also untagged on this branch; tag once when the branch's table work is consolidated, after the formal canary AI-verify. Confirm with the user.
46
+ - **Commit granularity (1 vs 2 commits).** Recommendation: **1 commit** — DP-2 and DP-5 are both from the same handoff, ship together as v2.4.97, and were verified together; the CHANGELOG documents both. Split only if you specifically want per-defect revertability.
47
+
48
+ ## 7. Watchouts
49
+ - **Shared working tree (live collision risk).** The other session committed `84a4d42` while this session worked. NEVER `git add -A`/`git add .` — stage the 8 explicit paths only. Verify `git status` shows nothing unexpected staged before committing.
50
+ - **Real-PDF Camelot tests flake under *cumulative* load — even serially.** Running 13 Camelot-heavy test files in one `pytest` process flaked 9 ("no tables extracted"); each passes per-file. So the canonical `pytest tests/ -q` whole-suite run is unreliable for the table real-PDF tests — run them **per file** (or in small batches) to gate. This is a pre-existing infra issue (`test_rc_t_degenerate_table_real_pdf.py` docstring notes the xdist variant; this extends it to serial cumulative load). Not fixed here.
51
+ - **DP-5 was misdiagnosed in the source handoff** ("Camelot drops a row" → actually a `flatten` header-miscount). Reproducing at HEAD before coding is what caught it (memory `reproduce-triage-defect-at-head-before-trusting-cost-estimate`). Apply the same to DP-1/DP-6 before assuming they're Layer-1.
52
+ - **Version already bumped in the working tree** (`__version__`/`pyproject` → 2.4.97, `TABLE_EXTRACTION_VERSION` → 2.4.1). Don't double-bump.
53
+ - **No formal Sonnet AI-gold verify run yet** (only deterministic checks plus the consumer-handoff's AI-derived expected values, which my output matches exactly). The project rule is that AI-gold is the verdict — run it before tagging (§5 step 3).
54
+ - **`NORMALIZATION_VERSION` / `SECTIONING_VERSION` intentionally NOT bumped** — this change is table-flatten only.
55
+
56
+ ## 8. Context pointers
57
+ - Source defect list (the 6 DP defects): `../../../../ESCIcheckapp/docs/DOCPLUCK_HANDOFF_2026-06-21.md`
58
+ - Living work queue (RC-T / RC-1 architectural backlog = DP-1/3/4/6): `docs/TRIAGE_2026-06-21_head_v2.4.95_assessment.md`
59
+ - RC-T spec (DP-1/DP-6 Layer-1 recovery is the out-of-scope follow-on): `docs/superpowers/specs/2026-06-21-rc-t-table-region-prose-contamination.md`
60
+ - RC-1 spec (DP-3/DP-4 interleave; banded flag exists, default-OFF pending Step-2 polish): `docs/superpowers/specs/2026-06-08-rc1-region-aware-column-architecture.md`
61
+ - This session's tests: `tests/test_tables_superheader_alignment_real_pdf.py`, `tests/test_tables_flatten_blank_header_recovery.py`
62
+ - Release/AI-verify gate: `.claude/skills/docpluck-iterate/references/ai-full-doc-verify.md`; app-pin gate `scripts/check_app_pin_sync.py`
63
+ - Memories: `feedback_docpluck_app_pin_sync` (verify origin/master before/after tag), `feedback_canary_audit_clobbers_phase5d` (don't trust AUDIT_DEFERRED PASS), `feedback_general_fixes_not_pdf_specific`, `release-version-collision-with-parallel-uncommitted-stream`
64
+
65
+ ### Architectural backlog (DP-1/3/4/6 — left as documented backlog per user decision 2026-06-22)
66
+ | Defect | Paper | Maps to | Status |
67
+ |---|---|---|---|
68
+ | DP-1 | 77859 Table 1/2 not extracted (Camelot 0 cells) | RC-T **Layer-1** recovery (`table_areas`) | deferred — out-of-scope in RC-T spec |
69
+ | DP-3 | 37122 figure-caption interleaved between stat & CI | RC-1 column interleave | deferred — banded flag exists, default-OFF |
70
+ | DP-4 | cog_emo under-extraction (22 vs 47) | RC-T + RC-1 | partial (T6 prose-strip in `84a4d42`; T8 arm labels in v2.4.97); rest deferred |
71
+ | DP-6 | 37122 results-summary table mashed into prose | RC-T Layer-1 / RC-1 | deferred |
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "docpluck"
7
- version = "2.4.96"
7
+ version = "2.4.97"
8
8
  description = "PDF, DOCX, and HTML text extraction and normalization for academic papers"
9
9
  readme = "docs/README.md"
10
10
  requires-python = ">=3.10"
@@ -0,0 +1,163 @@
1
+ """RC-T Layer-2 — raw_text-fallback prose contamination (v2.4.97, 2026-06-22).
2
+
3
+ When Camelot recovers no cells, ``_extract_table_body_text`` linearizes the
4
+ text following a table caption as the ``unstructured-table`` fallback. Its
5
+ per-line prose gate (``_line_is_body_prose``, len>=80) cannot see body prose
6
+ that pdftotext WRAPPED into short (~48-char) lines, so the region overshoot
7
+ swallowed Results/Discussion prose into the block:
8
+
9
+ * chan_feldman Table 1 — Discussion prose ("Our main focus was the
10
+ replication …") accumulated AFTER the table's ``Note:`` footnote.
11
+ * chan_feldman Table 9 — the block was ENTIRELY flowing prose ("than
12
+ empathy. We provided full analyses …") duplicating the real ``##
13
+ Discussion`` section verbatim.
14
+
15
+ Two structural-signature fixes (rule 16), both FP-safe by construction:
16
+ 1. Note-anchor: a table's ``Note:`` is its last element — trim everything
17
+ after the note paragraph (T1).
18
+ 2. Degenerate-prose guard: suppress a block that STARTS mid-sentence with a
19
+ lowercase multi-letter word AND is majority prose; render then emits a
20
+ clean caption-only table (T9).
21
+
22
+ Contract tests pin the FP-safe predicate deterministically; real-PDF tests
23
+ (rule 0d) confirm on chan_feldman. PDFs are closed-access
24
+ (``feedback_no_pdfs_in_repo``); real-PDF tests skip when the fixture is absent.
25
+ """
26
+ from __future__ import annotations
27
+
28
+ import os
29
+ import re
30
+ from pathlib import Path
31
+
32
+ import pytest
33
+
34
+ from docpluck.extract_structured import (
35
+ _join_wrapped_lines,
36
+ _raw_text_is_degenerate_prose,
37
+ )
38
+ from docpluck.render import render_pdf_to_markdown
39
+
40
+ from .conftest import pdf_available, pdf_path, requires_pdftotext
41
+
42
+ _skip_under_xdist = pytest.mark.skipif(
43
+ bool(os.environ.get("PYTEST_XDIST_WORKER")),
44
+ reason="real-PDF Camelot extraction is non-deterministic under parallel "
45
+ "xdist load; runs serially (isolation/serial run is the real gate)",
46
+ )
47
+
48
+
49
+ # ── contract tests: the FP-safe degenerate-prose predicate (deterministic) ────
50
+
51
+ # All-prose block that STARTS mid-sentence (lowercase multi-letter word) — the
52
+ # region-overshoot signature. Must be flagged degenerate.
53
+ _DEGENERATE = (
54
+ "than empathy. We provided full analyses and results\n"
55
+ "for the comparisons in the supplementary materials\n"
56
+ "section of this paper across all of the conditions.\n"
57
+ "We replicated all of the supported findings of the\n"
58
+ "target article and summarised the results below here."
59
+ )
60
+ # Hypotheses table (legit, degraded): starts with a single-letter item marker.
61
+ _HYPOTHESES = (
62
+ "a There is a positive association between a wronged\n"
63
+ "person's empathy for an offender and reported\n"
64
+ "forgiveness for the offender.\n"
65
+ "b Apology increases the likelihood of forgiving."
66
+ )
67
+ # Descriptive rows (legit): starts with a Capitalized label.
68
+ _DESCRIPTIVE = "Median age (years)\n24.0\nAverage age\n28.8\n(years)\nStandard deviation"
69
+ # Instrument-table fragment (legit): starts with a single-letter token "h".
70
+ _INSTRUMENT = "h et al., 1997)\nPerceived apology\nEmpathy\nThe offender has apologised?"
71
+
72
+
73
+ def test_degenerate_prose_flagged():
74
+ assert _raw_text_is_degenerate_prose(_DEGENERATE) is True
75
+
76
+
77
+ def test_hypotheses_not_flagged():
78
+ """Single-letter item marker ('a ...') => not a mid-sentence continuation."""
79
+ assert _raw_text_is_degenerate_prose(_HYPOTHESES) is False
80
+
81
+
82
+ def test_descriptive_rows_not_flagged():
83
+ assert _raw_text_is_degenerate_prose(_DESCRIPTIVE) is False
84
+
85
+
86
+ def test_instrument_fragment_not_flagged():
87
+ assert _raw_text_is_degenerate_prose(_INSTRUMENT) is False
88
+
89
+
90
+ def test_short_block_not_flagged():
91
+ assert _raw_text_is_degenerate_prose("than empathy.\nWe provided.") is False
92
+
93
+
94
+ def test_join_wrapped_lines_merges_to_sentence():
95
+ assert _join_wrapped_lines(["a foo", "bar baz.", "next one."]) == [
96
+ "a foo bar baz.",
97
+ "next one.",
98
+ ]
99
+
100
+
101
+ # ── real-PDF tests (chan_feldman) ─────────────────────────────────────────────
102
+
103
+
104
+ def _unstructured_blocks(md: str) -> str:
105
+ """Whitespace-normalized concatenation of every ```unstructured-table``` block."""
106
+ blocks = re.findall(r"```unstructured-table\n(.*?)```", md, re.DOTALL)
107
+ return re.sub(r"\s+", " ", "\n".join(blocks))
108
+
109
+
110
+ @pytest.fixture(scope="module")
111
+ def chan_md() -> str:
112
+ key = "10.1080__02699931.2024.2434156"
113
+ if not pdf_available("articlerepo", f"{key}.pdf"):
114
+ pytest.skip(f"closed-access fixture missing: {key}.pdf")
115
+ return render_pdf_to_markdown(Path(pdf_path("articlerepo", f"{key}.pdf")).read_bytes())
116
+
117
+
118
+ @requires_pdftotext
119
+ @_skip_under_xdist
120
+ def test_t1_note_anchor_trims_trailing_prose(chan_md: str):
121
+ """Table 1: body prose after the ``Note:`` footnote must be trimmed from the
122
+ fallback block (FAIL at HEAD — it was swallowed)."""
123
+ blocks = _unstructured_blocks(chan_md)
124
+ assert "Our main focus was the replication" not in blocks, (
125
+ "chan_feldman T1 still swallows post-Note Discussion prose — the "
126
+ "Note-anchor trim in _extract_table_body_text did not fire."
127
+ )
128
+
129
+
130
+ @requires_pdftotext
131
+ @_skip_under_xdist
132
+ def test_t1_table_content_and_note_retained(chan_md: str):
133
+ """FP guard: the Note-anchor must KEEP the table content + the note itself
134
+ (hypotheses come before the note; trimming starts after it)."""
135
+ blocks = _unstructured_blocks(chan_md)
136
+ assert "There is a positive association" in blocks, "T1 hypothesis content lost (over-trim)"
137
+ assert "Hypothesis 3 is not included in the replication" in blocks, "T1 Note paragraph lost (over-trim)"
138
+
139
+
140
+ @requires_pdftotext
141
+ @_skip_under_xdist
142
+ def test_t9_degenerate_block_suppressed_no_duplication(chan_md: str):
143
+ """Table 9: the all-prose fallback (a verbatim duplicate of ## Discussion)
144
+ must be suppressed — the Discussion opener appears exactly once, never inside
145
+ an unstructured-table block."""
146
+ opener = "We conducted a replication and extensions Registered Report"
147
+ assert opener not in _unstructured_blocks(chan_md), (
148
+ "chan_feldman T9 still dumps Discussion prose into an unstructured-table "
149
+ "block — the degenerate-prose guard did not fire."
150
+ )
151
+ assert "### Table 9" in chan_md, "T9 heading lost (table_parity broken)"
152
+ n = len(re.findall(re.escape(opener), chan_md))
153
+ assert n == 1, f"Discussion opener appears {n}x (expected 1 — T9 duplication not resolved)"
154
+
155
+
156
+ @requires_pdftotext
157
+ @_skip_under_xdist
158
+ def test_t3_legit_fallback_table_survives(chan_md: str):
159
+ """FP guard: Table 3 (a real descriptive table starting with a Capitalized
160
+ label) must keep its fallback block + its Note — never suppressed/over-trimmed."""
161
+ blocks = _unstructured_blocks(chan_md)
162
+ assert "Median age" in blocks, "chan_feldman T3 descriptive fallback wrongly suppressed (FP)"
163
+ assert "Origin was not explicitly mentioned" in blocks, "T3 Note over-trimmed (FP)"