docpluck 2.4.98__tar.gz → 2.4.99__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (440)
  1. {docpluck-2.4.98 → docpluck-2.4.99}/.claude/skills/docpluck-qa/references/benchmark-mode.md +4 -4
  2. {docpluck-2.4.98 → docpluck-2.4.99}/.claude/skills/docpluck-qa/references/check-6-escicheck-local-webapp.md +1 -1
  3. {docpluck-2.4.98 → docpluck-2.4.99}/.claude/skills/docpluck-qa/references/check-7-batch-smoke.md +1 -1
  4. {docpluck-2.4.98 → docpluck-2.4.99}/CHANGELOG.md +12 -0
  5. {docpluck-2.4.98 → docpluck-2.4.99}/PKG-INFO +1 -1
  6. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/__init__.py +1 -1
  7. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/extract_structured.py +231 -9
  8. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/tables/camelot_extract.py +261 -97
  9. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/tables/captions.py +20 -0
  10. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/tables/whitespace.py +245 -2
  11. {docpluck-2.4.98 → docpluck-2.4.99}/pyproject.toml +1 -1
  12. {docpluck-2.4.98 → docpluck-2.4.99}/.claude/skills/_project/canary.json +0 -0
  13. {docpluck-2.4.98 → docpluck-2.4.99}/.claude/skills/_project/lessons.md +0 -0
  14. {docpluck-2.4.98 → docpluck-2.4.99}/.claude/skills/docpluck-cleanup/SKILL.md +0 -0
  15. {docpluck-2.4.98 → docpluck-2.4.99}/.claude/skills/docpluck-deploy/SKILL.md +0 -0
  16. {docpluck-2.4.98 → docpluck-2.4.99}/.claude/skills/docpluck-iterate/LEARNINGS.md +0 -0
  17. {docpluck-2.4.98 → docpluck-2.4.99}/.claude/skills/docpluck-iterate/SKILL.md +0 -0
  18. {docpluck-2.4.98 → docpluck-2.4.99}/.claude/skills/docpluck-iterate/references/ai-full-doc-verify.md +0 -0
  19. {docpluck-2.4.98 → docpluck-2.4.99}/.claude/skills/docpluck-iterate/references/cycle-report-template.md +0 -0
  20. {docpluck-2.4.98 → docpluck-2.4.99}/.claude/skills/docpluck-iterate/references/local-verification.md +0 -0
  21. {docpluck-2.4.98 → docpluck-2.4.99}/.claude/skills/docpluck-iterate/references/rationalizations.md +0 -0
  22. {docpluck-2.4.98 → docpluck-2.4.99}/.claude/skills/docpluck-iterate/references/real-library-real-pdf.md +0 -0
  23. {docpluck-2.4.98 → docpluck-2.4.99}/.claude/skills/docpluck-iterate/references/release-flow.md +0 -0
  24. {docpluck-2.4.98 → docpluck-2.4.99}/.claude/skills/docpluck-iterate/references/self-improvement.md +0 -0
  25. {docpluck-2.4.98 → docpluck-2.4.99}/.claude/skills/docpluck-iterate/references/subagent-parallelization.md +0 -0
  26. {docpluck-2.4.98 → docpluck-2.4.99}/.claude/skills/docpluck-iterate/references/three-tier-parity.md +0 -0
  27. {docpluck-2.4.98 → docpluck-2.4.99}/.claude/skills/docpluck-qa/SKILL.md +0 -0
  28. {docpluck-2.4.98 → docpluck-2.4.99}/.claude/skills/docpluck-qa/references/check-11-hard-rules.md +0 -0
  29. {docpluck-2.4.98 → docpluck-2.4.99}/.claude/skills/docpluck-qa/references/check-13-escicheck-production.md +0 -0
  30. {docpluck-2.4.98 → docpluck-2.4.99}/.claude/skills/docpluck-qa/references/check-5-escicheck-library.md +0 -0
  31. {docpluck-2.4.98 → docpluck-2.4.99}/.claude/skills/docpluck-review/SKILL.md +0 -0
  32. {docpluck-2.4.98 → docpluck-2.4.99}/.github/workflows/bump-app-pin.yml +0 -0
  33. {docpluck-2.4.98 → docpluck-2.4.99}/.github/workflows/publish.yml +0 -0
  34. {docpluck-2.4.98 → docpluck-2.4.99}/.github/workflows/test.yml +0 -0
  35. {docpluck-2.4.98 → docpluck-2.4.99}/.gitignore +0 -0
  36. {docpluck-2.4.98 → docpluck-2.4.99}/CLAUDE.md +0 -0
  37. {docpluck-2.4.98 → docpluck-2.4.99}/CUSTOMER_UPDATE_2026-06-19_tables_sections_api.md +0 -0
  38. {docpluck-2.4.98 → docpluck-2.4.99}/HANDOFF_SECTIONS_APP_INTEGRATION.md +0 -0
  39. {docpluck-2.4.98 → docpluck-2.4.99}/LESSONS.md +0 -0
  40. {docpluck-2.4.98 → docpluck-2.4.99}/LICENSE +0 -0
  41. {docpluck-2.4.98 → docpluck-2.4.99}/README.md +0 -0
  42. {docpluck-2.4.98 → docpluck-2.4.99}/REPLY_FROM_DOCPLUCK_v1.4.5.md +0 -0
  43. {docpluck-2.4.98 → docpluck-2.4.99}/REPLY_FROM_DOCPLUCK_v1.5.0.md +0 -0
  44. {docpluck-2.4.98 → docpluck-2.4.99}/REPLY_FROM_DOCPLUCK_v2.4.93.md +0 -0
  45. {docpluck-2.4.98 → docpluck-2.4.99}/REPLY_FROM_DOCPLUCK_v2.4.94.md +0 -0
  46. {docpluck-2.4.98 → docpluck-2.4.99}/REPLY_FROM_DOCPLUCK_v2.4.95.md +0 -0
  47. {docpluck-2.4.98 → docpluck-2.4.99}/REQUEST_08_CHUNKING_ENDPOINT.md +0 -0
  48. {docpluck-2.4.98 → docpluck-2.4.99}/REQUEST_09_REFERENCE_LIST_NORMALIZATION.md +0 -0
  49. {docpluck-2.4.98 → docpluck-2.4.99}/REQUEST_10_TABLE_FLATTEN_HTTP_EXPOSURE.md +0 -0
  50. {docpluck-2.4.98 → docpluck-2.4.99}/REQUEST_10_TIER2_ORPHANED_LABEL_ROW_RECOVERY.md +0 -0
  51. {docpluck-2.4.98 → docpluck-2.4.99}/REQUEST_11_FLATTEN_FIELDS_NONCLINICAL_TABLES.md +0 -0
  52. {docpluck-2.4.98 → docpluck-2.4.99}/TODO.md +0 -0
  53. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/__main__.py +0 -0
  54. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/batch.py +0 -0
  55. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/cli.py +0 -0
  56. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/extract.py +0 -0
  57. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/extract_columns.py +0 -0
  58. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/extract_docx.py +0 -0
  59. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/extract_html.py +0 -0
  60. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/extract_layout.py +0 -0
  61. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/figures/__init__.py +0 -0
  62. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/figures/detect.py +0 -0
  63. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/normalize.py +0 -0
  64. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/quality.py +0 -0
  65. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/render.py +0 -0
  66. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/sections/__init__.py +0 -0
  67. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/sections/annotators/__init__.py +0 -0
  68. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/sections/annotators/docx.py +0 -0
  69. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/sections/annotators/html.py +0 -0
  70. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/sections/annotators/pdf.py +0 -0
  71. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/sections/annotators/text.py +0 -0
  72. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/sections/blocks.py +0 -0
  73. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/sections/boundaries.py +0 -0
  74. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/sections/core.py +0 -0
  75. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/sections/taxonomy.py +0 -0
  76. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/sections/types.py +0 -0
  77. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/tables/__init__.py +0 -0
  78. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/tables/bbox_utils.py +0 -0
  79. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/tables/cell_cleaning.py +0 -0
  80. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/tables/cluster.py +0 -0
  81. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/tables/confidence.py +0 -0
  82. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/tables/detect.py +0 -0
  83. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/tables/flatten.py +0 -0
  84. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/tables/render.py +0 -0
  85. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/telemetry.py +0 -0
  86. {docpluck-2.4.98 → docpluck-2.4.99}/docpluck/version.py +0 -0
  87. {docpluck-2.4.98 → docpluck-2.4.99}/docs/BENCHMARKS.md +0 -0
  88. {docpluck-2.4.98 → docpluck-2.4.99}/docs/BENCHMARKS_liteparse_2026-06.md +0 -0
  89. {docpluck-2.4.98 → docpluck-2.4.99}/docs/DESIGN.md +0 -0
  90. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-07_sections_strict_iteration.md +0 -0
  91. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-09_session_state_and_followups.md +0 -0
  92. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-09_unified_extraction_brainstorm.md +0 -0
  93. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-10_table_rendering_iteration.md +0 -0
  94. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-10_table_rendering_iteration_2.md +0 -0
  95. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-10_table_rendering_iteration_3.md +0 -0
  96. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-10_table_rendering_iteration_4.md +0 -0
  97. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-10_table_rendering_iteration_5.md +0 -0
  98. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-10_table_rendering_iteration_6.md +0 -0
  99. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-10_table_rendering_iteration_7.md +0 -0
  100. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-11_PROMOTE_SPIKE_TO_LIBRARY.md +0 -0
  101. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-11_table_rendering_iteration_8.md +0 -0
  102. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-11_visual_review_findings.md +0 -0
  103. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-12_phase2_101pdf_corpus.md +0 -0
  104. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-12_remaining_ui_and_chrome_verification.md +0 -0
  105. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-12_visual_verify_results.md +0 -0
  106. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-13_apa_50_expansion.md +0 -0
  107. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-13_apa_50_expansion_iter_1.md +0 -0
  108. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-13_apa_50_expansion_iter_2.md +0 -0
  109. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-13_iterate_skill_first_use.md +0 -0
  110. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-13_iterative_1.md +0 -0
  111. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-13_iterative_library_improvement.md +0 -0
  112. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-13_table_extraction_next_iteration.md +0 -0
  113. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-14_continue_iterations_v2_4_30_to_15n.md +0 -0
  114. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-14_full_corpus_iteration_v2_4_30.md +0 -0
  115. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-14_iterate_6_cycles_complete.md +0 -0
  116. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-14_iterate_9_cycle_run.md +0 -0
  117. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-14_iterate_resume_4_cycles.md +0 -0
  118. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-14_iterate_v2_4_31_cycle_15n.md +0 -0
  119. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-14_phase_5d_gold_audit_v2_4_29.md +0 -0
  120. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-15_autonomous_apa_first_10h.md +0 -0
  121. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-15_iterate_apa_run_1.md +0 -0
  122. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-16_ai-gold-instructions.md +0 -0
  123. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-16_iterate_apa_run_2.md +0 -0
  124. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-16_iterate_apa_run_3.md +0 -0
  125. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-16_iterate_run_4_final.md +0 -0
  126. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-16_iterate_run_4_fix_and_continue.md +0 -0
  127. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-16_iterate_run_5.md +0 -0
  128. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-16_iterate_run_6.md +0 -0
  129. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-17_iterate_run_7.md +0 -0
  130. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-17_iterate_run_8.md +0 -0
  131. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-17_iterate_run_9.md +0 -0
  132. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-18_iterate_run_9_cont.md +0 -0
  133. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-18_iterate_run_9_cont2.md +0 -0
  134. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-20_iterate_run_9_cont3.md +0 -0
  135. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-22_iterate_run_9_session4_final.md +0 -0
  136. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-22_iterate_run_9_session5_close.md +0 -0
  137. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-25_haiku-orchestration-pretest.md +0 -0
  138. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-05-25_pretest-followups.md +0 -0
  139. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-06-08_iterate_splice-wordintegrity-runningheader.md +0 -0
  140. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-06-08_untested_sweep_v2.4.81.md +0 -0
  141. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-06-13_sciencearena_grobid_liteparse.md +0 -0
  142. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-06-15_docpluck-iterate-resume.md +0 -0
  143. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-06-15_rc1-step2-continue.md +0 -0
  144. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-06-16_docpluck-iterate-resume.md +0 -0
  145. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-06-17_iterate_resume-cycle1.md +0 -0
  146. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-06-17_iterate_v2491_shipped.md +0 -0
  147. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-06-18_iterate_v2492_affiliation_caption-revert.md +0 -0
  148. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-06-20_request11_flatten_nonclinical_tables.md +0 -0
  149. {docpluck-2.4.98 → docpluck-2.4.99}/docs/HANDOFF_2026-06-25_iterate_branch-reconcile_rc1-banded-flip-rejected.md +0 -0
  150. {docpluck-2.4.98 → docpluck-2.4.99}/docs/ITERATION_VERIFICATION_LESSONS.md +0 -0
  151. {docpluck-2.4.98 → docpluck-2.4.99}/docs/LIBRARY_APP_SYNC.md +0 -0
  152. {docpluck-2.4.98 → docpluck-2.4.99}/docs/NORMALIZATION.md +0 -0
  153. {docpluck-2.4.98 → docpluck-2.4.99}/docs/README.md +0 -0
  154. {docpluck-2.4.98 → docpluck-2.4.99}/docs/TRIAGE_2026-05-10_corpus_assessment.md +0 -0
  155. {docpluck-2.4.98 → docpluck-2.4.99}/docs/TRIAGE_2026-05-14_phase_5d_gold_audit.md +0 -0
  156. {docpluck-2.4.98 → docpluck-2.4.99}/docs/TRIAGE_2026-06-08_untested_corpus_sweep.md +0 -0
  157. {docpluck-2.4.98 → docpluck-2.4.99}/docs/TRIAGE_2026-06-15_head_v2.4.88_assessment.md +0 -0
  158. {docpluck-2.4.98 → docpluck-2.4.99}/docs/TRIAGE_2026-06-21_head_v2.4.95_assessment.md +0 -0
  159. {docpluck-2.4.98 → docpluck-2.4.99}/docs/TRIAGE_2026-06-25_escicheck_handoff_defects.md +0 -0
  160. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/handoffs/2026-05-22-b1-next-iteration.md +0 -0
  161. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/handoffs/2026-05-22-b2-remaining-halluc-head.md +0 -0
  162. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/handoffs/2026-05-22-b3-b7-structural-defects.md +0 -0
  163. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/handoffs/2026-05-22-residual-after-locally-doable-pass.md +0 -0
  164. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/handoffs/2026-05-23-bundled-residual-cycle-CLOSED.md +0 -0
  165. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/handoffs/2026-05-23-residual-after-iterate-spine-cycles-1-3.md +0 -0
  166. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/handoffs/2026-05-25-canary-audit-architecture-and-cluster-A-B-C-landed.md +0 -0
  167. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/handoffs/2026-05-25-wrapup-r4-cycle.md +0 -0
  168. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/handoffs/2026-05-26-run-11-cluster-A-ter-and-C-bis-landed.md +0 -0
  169. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/handoffs/2026-05-26-text-extraction-defects-from-citationguard-audit.md +0 -0
  170. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/handoffs/2026-06-07-text-extraction-defects-from-citationguard-iterate.md +0 -0
  171. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/handoffs/2026-06-07-v2.4.78-landed-canary-iterate.md +0 -0
  172. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/handoffs/2026-06-07-v2.4.79-findings-1-2-cleared.md +0 -0
  173. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/handoffs/2026-06-20_docpluck-skill-file-edits-from-app-cron-fix.md +0 -0
  174. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/handoffs/2026-06-21-rc-t-table-region-implementation.md +0 -0
  175. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/handoffs/2026-06-22-dp2-dp5-flatten-fixes-commit.md +0 -0
  176. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/2026-05-06-section-identification.md +0 -0
  177. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/2026-05-06-table-extraction.md +0 -0
  178. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/2026-05-07-sections-strict-iteration-progress.md +0 -0
  179. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/2026-05-08-unified-extraction-phase-0-splice-spike.md +0 -0
  180. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/2026-05-23-haiku-orchestration-pretest.md +0 -0
  181. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/sections-deferred-items.md +0 -0
  182. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/sections-issues-backlog.md +0 -0
  183. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/2026-05-07_spot-01_apa.md +0 -0
  184. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/2026-05-07_spot-02_pattern-A-shipped.md +0 -0
  185. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/2026-05-08_spot-final_all-styles.md +0 -0
  186. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/experiments/COMPARISON.md +0 -0
  187. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/korbmacher_table1.md +0 -0
  188. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/option-a.py +0 -0
  189. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/ziano_table1.md +0 -0
  190. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_notes_raw.txt +0 -0
  191. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_table1.md +0 -0
  192. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/notes.md +0 -0
  193. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/option-b.py +0 -0
  194. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_notes_raw.txt +0 -0
  195. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_table1.md +0 -0
  196. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/korbmacher_table1.md +0 -0
  197. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/notes.md +0 -0
  198. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/option-c.py +0 -0
  199. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/sample-pdftotext-bbox.html +0 -0
  200. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/ziano_table1.md +0 -0
  201. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/korbmacher_table1.md +0 -0
  202. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/notes.md +0 -0
  203. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/option-d.py +0 -0
  204. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/ziano_table1.md +0 -0
  205. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_2022_kruger_bbox.html +0 -0
  206. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_bbox.html +0 -0
  207. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_table1.md +0 -0
  208. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/option-e.py +0 -0
  209. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/sample-bbox.html +0 -0
  210. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_2021_joep_bbox.html +0 -0
  211. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_bbox.html +0 -0
  212. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_table1.md +0 -0
  213. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/html-fallback-demo.md +0 -0
  214. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.err +0 -0
  215. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.md +0 -0
  216. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.err +0 -0
  217. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.md +0 -0
  218. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.err +0 -0
  219. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.md +0 -0
  220. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.err +0 -0
  221. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.md +0 -0
  222. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.err +0 -0
  223. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.md +0 -0
  224. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.err +0 -0
  225. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.md +0 -0
  226. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.err +0 -0
  227. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.md +0 -0
  228. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.err +0 -0
  229. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.md +0 -0
  230. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.err +0 -0
  231. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.md +0 -0
  232. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.err +0 -0
  233. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.md +0 -0
  234. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.err +0 -0
  235. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.md +0 -0
  236. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.err +0 -0
  237. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.md +0 -0
  238. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.err +0 -0
  239. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.md +0 -0
  240. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.err +0 -0
  241. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.md +0 -0
  242. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.err +0 -0
  243. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.md +0 -0
  244. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.err +0 -0
  245. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.md +0 -0
  246. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.err +0 -0
  247. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.md +0 -0
  248. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.err +0 -0
  249. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.md +0 -0
  250. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.err +0 -0
  251. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.md +0 -0
  252. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.err +0 -0
  253. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.md +0 -0
  254. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.err +0 -0
  255. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.md +0 -0
  256. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.err +0 -0
  257. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.md +0 -0
  258. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.err +0 -0
  259. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.md +0 -0
  260. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.err +0 -0
  261. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.md +0 -0
  262. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.err +0 -0
  263. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.md +0 -0
  264. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.err +0 -0
  265. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.md +0 -0
  266. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/papers.md +0 -0
  267. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/report.md +0 -0
  268. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/splice_spike.py +0 -0
  269. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/plans/spot-checks/splice-spike/test_splice_spike.py +0 -0
  270. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/specs/2026-04-27-request-09-reference-normalization-design.md +0 -0
  271. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/specs/2026-05-06-section-identification-design.md +0 -0
  272. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/specs/2026-05-06-table-extraction-design.md +0 -0
  273. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/specs/2026-05-08-unified-extraction-design.md +0 -0
  274. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/specs/2026-05-23-haiku-orchestration-pretest-design.md +0 -0
  275. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/specs/2026-06-07-ip_feldman-B4-R4-column-interleave-diagnosis.md +0 -0
  276. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/specs/2026-06-08-rc1-region-aware-column-architecture.md +0 -0
  277. {docpluck-2.4.98 → docpluck-2.4.99}/docs/superpowers/specs/2026-06-21-rc-t-table-region-prose-contamination.md +0 -0
  278. {docpluck-2.4.98 → docpluck-2.4.99}/scripts/__init__.py +0 -0
  279. {docpluck-2.4.98 → docpluck-2.4.99}/scripts/check_app_pin_sync.py +0 -0
  280. {docpluck-2.4.98 → docpluck-2.4.99}/scripts/check_docs_consistency.py +0 -0
  281. {docpluck-2.4.98 → docpluck-2.4.99}/scripts/harness/README.md +0 -0
  282. {docpluck-2.4.98 → docpluck-2.4.99}/scripts/harness/VERIFIER_PROMPT.md +0 -0
  283. {docpluck-2.4.98 → docpluck-2.4.99}/scripts/harness/__init__.py +0 -0
  284. {docpluck-2.4.98 → docpluck-2.4.99}/scripts/harness/baseline_matrix.json +0 -0
  285. {docpluck-2.4.98 → docpluck-2.4.99}/scripts/harness/checks.py +0 -0
  286. {docpluck-2.4.98 → docpluck-2.4.99}/scripts/harness/corpus.py +0 -0
  287. {docpluck-2.4.98 → docpluck-2.4.99}/scripts/harness/corpus_manifest.json +0 -0
  288. {docpluck-2.4.98 → docpluck-2.4.99}/scripts/harness/extract.py +0 -0
  289. {docpluck-2.4.98 → docpluck-2.4.99}/scripts/harness/gold_keys.json +0 -0
  290. {docpluck-2.4.98 → docpluck-2.4.99}/scripts/harness/inspect.py +0 -0
  291. {docpluck-2.4.98 → docpluck-2.4.99}/scripts/lint_rendered_corpus.py +0 -0
  292. {docpluck-2.4.98 → docpluck-2.4.99}/scripts/pretest_capture_tokens.py +0 -0
  293. {docpluck-2.4.98 → docpluck-2.4.99}/scripts/verify_corpus.py +0 -0
  294. {docpluck-2.4.98 → docpluck-2.4.99}/scripts/verify_corpus_full.py +0 -0
  295. {docpluck-2.4.98 → docpluck-2.4.99}/tests/__init__.py +0 -0
  296. {docpluck-2.4.98 → docpluck-2.4.99}/tests/conftest.py +0 -0
  297. {docpluck-2.4.98 → docpluck-2.4.99}/tests/fixtures/__init__.py +0 -0
  298. {docpluck-2.4.98 → docpluck-2.4.99}/tests/fixtures/sections/__init__.py +0 -0
  299. {docpluck-2.4.98 → docpluck-2.4.99}/tests/fixtures/sections/builders.py +0 -0
  300. {docpluck-2.4.98 → docpluck-2.4.99}/tests/fixtures/structured/.gitkeep +0 -0
  301. {docpluck-2.4.98 → docpluck-2.4.99}/tests/fixtures/structured/MANIFEST.json +0 -0
  302. {docpluck-2.4.98 → docpluck-2.4.99}/tests/fixtures/structured/README.md +0 -0
  303. {docpluck-2.4.98 → docpluck-2.4.99}/tests/golden/sections/apa_multi_study_pdf.json +0 -0
  304. {docpluck-2.4.98 → docpluck-2.4.99}/tests/golden/sections/apa_single_study_pdf.json +0 -0
  305. {docpluck-2.4.98 → docpluck-2.4.99}/tests/golden/sections/html_real_headings.json +0 -0
  306. {docpluck-2.4.98 → docpluck-2.4.99}/tests/snapshots/amj_lattice.txt +0 -0
  307. {docpluck-2.4.98 → docpluck-2.4.99}/tests/snapshots/apa_chan_feldman_lineless.txt +0 -0
  308. {docpluck-2.4.98 → docpluck-2.4.99}/tests/snapshots/apa_chen_jesp_lineless.txt +0 -0
  309. {docpluck-2.4.98 → docpluck-2.4.99}/tests/snapshots/apa_efendic_affect.txt +0 -0
  310. {docpluck-2.4.98 → docpluck-2.4.99}/tests/snapshots/apa_ip_feldman_pspb.txt +0 -0
  311. {docpluck-2.4.98 → docpluck-2.4.99}/tests/snapshots/bmc_lattice.txt +0 -0
  312. {docpluck-2.4.98 → docpluck-2.4.99}/tests/snapshots/ieee_figure_heavy.txt +0 -0
  313. {docpluck-2.4.98 → docpluck-2.4.99}/tests/snapshots/ieee_lattice.txt +0 -0
  314. {docpluck-2.4.98 → docpluck-2.4.99}/tests/snapshots/jama_lattice.txt +0 -0
  315. {docpluck-2.4.98 → docpluck-2.4.99}/tests/snapshots/nat_comms_figure_only.txt +0 -0
  316. {docpluck-2.4.98 → docpluck-2.4.99}/tests/snapshots/nature_minimal_rule.txt +0 -0
  317. {docpluck-2.4.98 → docpluck-2.4.99}/tests/snapshots/scirep_minimal_rule.txt +0 -0
  318. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_a3c_leading_zero_decimal_real_pdf.py +0 -0
  319. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_a4_ci_period_to_comma.py +0 -0
  320. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_affiliation_heading_promote_guard_real_pdf.py +0 -0
  321. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_all_caps_section_promote_real_pdf.py +0 -0
  322. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_bbox_utils.py +0 -0
  323. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_benchmark_docx_html.py +0 -0
  324. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_cambridge_footer_strip_real_pdf.py +0 -0
  325. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_camelot_lattice_augment.py +0 -0
  326. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_camelot_temp_cleanup.py +0 -0
  327. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_canary_provenance.py +0 -0
  328. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_caption_only_table_heading_real_pdf.py +0 -0
  329. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_caption_regex.py +0 -0
  330. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_chart_data_trim_real_pdf.py +0 -0
  331. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_check_app_pin_sync.py +0 -0
  332. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_cid_minus_recovery_real_pdf.py +0 -0
  333. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_cli_sections.py +0 -0
  334. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_cli_structured.py +0 -0
  335. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_confidence.py +0 -0
  336. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_corpus_smoke.py +0 -0
  337. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_d5_normalization_audit.py +0 -0
  338. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_dropped_minus_layout_recovery_real_pdf.py +0 -0
  339. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_edge_cases.py +0 -0
  340. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_elsevier_footer_strip_real_pdf.py +0 -0
  341. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_equation_page_header_strip_real_pdf.py +0 -0
  342. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_extract_columns.py +0 -0
  343. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_extract_docx.py +0 -0
  344. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_extract_filter_sugar.py +0 -0
  345. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_extract_html.py +0 -0
  346. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_extract_layout.py +0 -0
  347. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_extract_pdf_structured.py +0 -0
  348. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_extraction.py +0 -0
  349. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_f0_table_region_aware.py +0 -0
  350. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_fffd_comparison_recovery_real_pdf.py +0 -0
  351. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_figure_caption_trim_real_pdf.py +0 -0
  352. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_figure_detect.py +0 -0
  353. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_fixtures_manifest.py +0 -0
  354. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_hallucinated_heading_continuation_guard.py +0 -0
  355. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_harness_text_loss_reflow.py +0 -0
  356. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_harvard_refs_pagebreak_stitch.py +0 -0
  357. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_jama_open_cluster_real_pdf.py +0 -0
  358. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_lattice_cluster.py +0 -0
  359. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_letterspaced_label_real_pdf.py +0 -0
  360. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_ligature_decomposition_real_pdf.py +0 -0
  361. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_lt_operator_recovery_real_pdf.py +0 -0
  362. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_mathitalic_greek_real_pdf.py +0 -0
  363. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_metaesci_followups.py +0 -0
  364. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_minus_sign_recovery_real_pdf.py +0 -0
  365. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_normalization.py +0 -0
  366. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_normalize_a3_r2_body_integer_real_pdf.py +0 -0
  367. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_normalize_f0_footnote_strip.py +0 -0
  368. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_normalize_idempotent_real_pdf.py +0 -0
  369. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_normalize_layout_param.py +0 -0
  370. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_normalize_metadata_leak_real_pdf.py +0 -0
  371. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_normalize_report_layout_fields.py +0 -0
  372. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_normalize_soft_hyphen_dehyphenation.py +0 -0
  373. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_normalize_v18_strips.py +0 -0
  374. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_numbered_heading_promotion_real_pdf.py +0 -0
  375. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_numbered_section_promotion_real_pdf.py +0 -0
  376. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_o5_reference_inversion_real_pdf.py +0 -0
  377. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_orphan_multilevel_number_real_pdf.py +0 -0
  378. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_orphan_section_number_real_pdf.py +0 -0
  379. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_p0r_recurring_running_header_strip.py +0 -0
  380. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_preserve_math_glyphs_real_pdf.py +0 -0
  381. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_pretest_capture_tokens.py +0 -0
  382. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_pua_glyph_recovery_real_pdf.py +0 -0
  383. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_quality.py +0 -0
  384. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_r1_whitespace_cells_wiring_real_pdf.py +0 -0
  385. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_r4_column_correction_real_pdf.py +0 -0
  386. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_rc1_banded_column_real_pdf.py +0 -0
  387. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_rc1_general_column_correction_real_pdf.py +0 -0
  388. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_rc_t_degenerate_table_real_pdf.py +0 -0
  389. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_rc_t_layer2_raw_text_real_pdf.py +0 -0
  390. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_render.py +0 -0
  391. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_render_frontmatter_masthead.py +0 -0
  392. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_render_html.py +0 -0
  393. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_render_subsection_chain_promotion.py +0 -0
  394. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_request_09_reference_normalization.py +0 -0
  395. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_residual_2026_05_23_bundled.py +0 -0
  396. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_roman_numeral_section_promote_real_pdf.py +0 -0
  397. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_section_row_label_no_merge_real_pdf.py +0 -0
  398. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_sections_boundaries.py +0 -0
  399. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_sections_boundary_truncation.py +0 -0
  400. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_sections_core_partition.py +0 -0
  401. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_sections_docx_annotator.py +0 -0
  402. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_sections_extract_text.py +0 -0
  403. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_sections_footnote_section.py +0 -0
  404. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_sections_golden.py +0 -0
  405. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_sections_html_annotator.py +0 -0
  406. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_sections_pdf_annotator.py +0 -0
  407. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_sections_public_api.py +0 -0
  408. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_sections_real_corpus.py +0 -0
  409. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_sections_taxonomy.py +0 -0
  410. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_sections_text_annotator.py +0 -0
  411. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_sections_types.py +0 -0
  412. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_sections_unit_corpus.py +0 -0
  413. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_sections_v161_coalesce.py +0 -0
  414. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_sections_v161_subheadings.py +0 -0
  415. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_sections_v161_taxonomy.py +0 -0
  416. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_sections_v161_text_annotator.py +0 -0
  417. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_sections_version.py +0 -0
  418. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_single_column_subsection_promote_real_pdf.py +0 -0
  419. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_smoke_fixtures.py +0 -0
  420. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_structured_result_type.py +0 -0
  421. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_structured_types.py +0 -0
  422. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_structured_version.py +0 -0
  423. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_table_caption_cell_region_real_pdf.py +0 -0
  424. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_table_detect.py +0 -0
  425. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_tables_cell_cleaning.py +0 -0
  426. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_tables_flatten.py +0 -0
  427. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_tables_flatten_blank_header_recovery.py +0 -0
  428. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_tables_superheader_alignment_real_pdf.py +0 -0
  429. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_text_mode.py +0 -0
  430. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_v23_1_fixes.py +0 -0
  431. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_v23_bug_fixes.py +0 -0
  432. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_v23_post_corpus.py +0 -0
  433. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_v23_post_corpus_v2.py +0 -0
  434. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_v2_backwards_compat.py +0 -0
  435. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_v2_top_level_exports.py +0 -0
  436. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_whitespace_char_fallback.py +0 -0
  437. {docpluck-2.4.98 → docpluck-2.4.99}/tests/test_whitespace_cluster.py +0 -0
  438. {docpluck-2.4.98 → docpluck-2.4.99}/tools/canary_provenance.py +0 -0
  439. {docpluck-2.4.98 → docpluck-2.4.99}/tools/fix_python_env.ps1 +0 -0
  440. {docpluck-2.4.98 → docpluck-2.4.99}/tools/render_for_audit.py +0 -0
@@ -12,23 +12,23 @@ Opt-in cross-format benchmark suite (DOCX corpus, DOCX↔PDF parity via Word COM
12
12
  - Microsoft Word installed (for `docx2pdf`, accessed via COM)
13
13
  - Python packages: `mammoth`, `beautifulsoup4`, `lxml`, `rapidfuzz`, `pdf2docx`, `docx2pdf`
14
14
  - CitationGuard corpus present at `C:\Users\filin\Dropbox\Vibe\CitationGuard\apps\worker\testpdfs\validation\docx\`
15
- - PDF test corpus at `C:\Users\filin\Dropbox\Vibe\PDFextractor\test-pdfs\`
15
+ - PDF test corpus at `C:\Users\filin\Dropbox\Vibe\MetaScienceTools\PDFextractor\test-pdfs\`
16
16
 
17
17
  ### Running
18
18
 
19
19
  Full benchmark (5–15 minutes, launches Word):
20
20
  ```bash
21
- cd C:\Users\filin\Dropbox\Vibe\docpluck && python benchmarks/run_all.py
21
+ cd C:\Users\filin\Dropbox\Vibe\MetaScienceTools\docpluck && python benchmarks/run_all.py
22
22
  ```
23
23
 
24
24
  Quick mode (3 files per benchmark, 2–4 minutes):
25
25
  ```bash
26
- cd C:\Users\filin\Dropbox\Vibe\docpluck && python benchmarks/run_all.py --quick
26
+ cd C:\Users\filin\Dropbox\Vibe\MetaScienceTools\docpluck && python benchmarks/run_all.py --quick
27
27
  ```
28
28
 
29
29
  Skip Word-based conversion (if Word unavailable):
30
30
  ```bash
31
- cd C:\Users\filin\Dropbox\Vibe\docpluck && python benchmarks/run_all.py --skip docx2pdf
31
+ cd C:\Users\filin\Dropbox\Vibe\MetaScienceTools\docpluck && python benchmarks/run_all.py --skip docx2pdf
32
32
  ```
33
33
 
34
34
  Individual benchmarks:
@@ -7,7 +7,7 @@ Requires local service running on port 6117 AND frontend on port 6116. Test via
7
7
 
8
8
  First start the service (if not running):
9
9
  ```bash
10
- cd C:\Users\filin\Dropbox\Vibe\PDFextractor\service && uvicorn app.main:app --port 6117 --reload &
10
+ cd C:\Users\filin\Dropbox\Vibe\MetaScienceTools\PDFextractor\service && uvicorn app.main:app --port 6117 --reload &
11
11
  ```
12
12
 
13
13
  Then run:
@@ -3,7 +3,7 @@
3
3
  _Extracted from [../SKILL.md](../SKILL.md). Full procedure lives here._
4
4
 
5
5
  ```bash
6
- cd C:\Users\filin\Dropbox\Vibe\PDFextractor\service && python -c "
6
+ cd C:\Users\filin\Dropbox\Vibe\MetaScienceTools\PDFextractor\service && python -c "
7
7
  import os
8
8
  from docpluck import extract_pdf
9
9
  base = '../test-pdfs'
@@ -1,5 +1,17 @@
1
1
  # Changelog
2
2
 
3
+ ## [2.4.99] — 2026-06-29
4
+
5
+ **Region-driven table capture (DP-1/DP-2): drive Camelot with docpluck's own caption-anchored region as `table_areas` — AI-gold-verified, the first CAPTURE-PATH change in the v2.4.x table series.** `TABLE_EXTRACTION_VERSION` → `2.4.5`. Instead of running Camelot blind (`pages="all"`) and pairing tables to captions after the fact, each table caption now drives a `stream` extraction constrained to its OWN region (`extract_tables_camelot_by_region` + `_region_driven_capture` in `extract_structured.py`), so a caption gets exactly its table by construction. This recovers the **stacked / side-by-side multi-table-per-page** class that blind detection + token-overlap pairing could not separate — **efendic Tables 4+5 now split correctly** (was a 30×2 merge + empty stub), and **12 papers' previously-empty `0×0` stubs become real tables** (chandrashekar, jdm, ieee×4, bmc_med_3 T4, bmj_open_1, …).
6
+
7
+ Region-driven tables are **candidates, not automatic winners**: `_pick_better_table` keeps the structurally-richer of {region, auto-detect} per caption (more columns + a cell-retention guard), because `_region_for_caption`'s bbox is reliable vertically but often too *narrow* horizontally — without this, a 7-column demographics table would collapse to 2 columns (jama_open_1 correctly stays 43×7). A *starved* caption (the case region-driving exists for) has no auto-detect candidate, so the region table wins automatically.
8
+
9
+ **Content-plausibility guards** (region path only — applying them to auto-detect wholesale rejected legitimate-but-imperfect tables, a regression): `_cell_is_prose` dominance reject (a caption-anchored region that over-captured 2-column body text → cog_emo Table 3 was a 27×4 grid of the procedure paragraph) and `_CAPTION_LABEL_RE` reject (a cell containing `Table N.`/`Figure N.` proves the region absorbed an adjacent caption → cog_emo Table 9). Both now emit honest `0×0` stubs. The caption **page-fix** (advance `char_start` past a leading `\n\n`/`\f` so captions after a paragraph break / at a page top get a non-empty `line_text` and correct page) is the foundation the layout-channel region lookup needs.
10
+
11
+ Verification (ground truth = AI multimodal `reading`/`stats` golds via article-finder, **never** pdftotext): full **101-PDF structured diff** vs v2.4.98 baseline = 27 papers changed, **0 errors**; **6-paper AI-gold canary** (chandrashekar / ieee_access_4 / efendic page-8 split / bmj_open_1 / ieee_access_7 / cog_emo) all returned FIX / PASS / NET-BETTER with no new regressions. Full table unit + real-PDF test suite green.
12
+
13
+ **Known documented hard cases** (present at baseline, NOT regressions — a robust fix needs more than this cycle): cog_emo Table 8 (Camelot under-segments two tables into one — the correct content exists in the region candidate but cannot be reliably selected); chandrashekar Table 3/4 caption (2-column caption interleaving in the text channel); efendic Table 2 (Camelot detects zero tables on the page — vector/image grid); efendic Table 1 (tiny glued predictions table, header mis-mark); bmc_med_3 duplicate-fragment pairing (both a same-page raw-text dedupe and a bbox-proximity pairing were tried and **reverted** — each regressed another paper; Camelot bboxes are too unreliable to pair on and dropping a duplicate perturbs the order-dependent pairing — see `LEARNINGS`).
14
+
3
15
  ## [2.4.98] — 2026-06-25
4
16
 
5
17
  **ESCIcheck handoff defects (DOCPLUCK_HANDOFF_2026-06-25): η²p effect-typing, inline correlation r-typing, and bracketed-CI continuation merge — flatten/cell-cleaning only, AI-gold-verified.** `TABLE_EXTRACTION_VERSION` → `2.4.4`. Render-visible in the inline flattened-table blocks + `.tables.jsonl` sidecar `fields`. **No capture-path change** — caption→table pairing is byte-identical to v2.4.3. Grounded against the AI `reading`/`stats` golds (article-finder), not pdftotext, and confirmed by a 6-paper AI-gold canary verify. Triage: `docs/TRIAGE_2026-06-25_escicheck_handoff_defects.md`.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docpluck
3
- Version: 2.4.98
3
+ Version: 2.4.99
4
4
  Summary: PDF, DOCX, and HTML text extraction and normalization for academic papers
5
5
  Project-URL: Homepage, https://docpluck.app
6
6
  Project-URL: Documentation, https://docpluck.app/api-docs
@@ -78,7 +78,7 @@ from .figures import Figure
78
78
  from .extract_structured import TABLE_EXTRACTION_VERSION, StructuredResult, extract_pdf_structured
79
79
  from .render import render_pdf_to_markdown
80
80
 
81
- __version__ = "2.4.98"
81
+ __version__ = "2.4.99"
82
82
  __author__ = "Gilad Feldman"
83
83
  __license__ = "MIT"
84
84
 
@@ -37,7 +37,7 @@ from .tables.render import cells_to_html
37
37
  from .telemetry import record_fallback
38
38
 
39
39
 
40
- TABLE_EXTRACTION_VERSION = "2.4.4" # v2.4.4 (ESCIcheck handoff 2026-06-25 — flatten-only, AI-gold-verified): (DP-3) flatten._infer_anova_eta2_hint types a font-dropped η²p effect column by STRUCTURE — an unlabeled estimate column in an F-test/ANOVA results table (F + BF01/CI, no competing d/r/OR) is keyed `eta2`, range-guarded to [0,1]; the η²p glyph itself is unrecoverable (NotoSerif uni:no, OCR-tier). (DP-5b) flatten._inline_stat_field types a self-labeled cell (`r = .67`, `d = 0.32`) by its token even under a generic "Effect size" header. (DP-5a) cell_cleaning._is_fragment_cell recognizes a bracketed-CI close tail (`0.73]`) so a CI split across rows rejoins (`[0.59, 0.73]`) and the junk fragment row is dropped (cog_emo T8 14→10 rows). These three are flatten/cell-cleaning ONLY — no capture-path change, so caption→table pairing is byte-identical to v2.4.3. (DP-1/DP-2 capture recovery — the caption page-attribution fix — was prototyped but REVERTED: AI-gold canary verify showed it mis-pairs same-page-caption tables (efendic T4/T5, cog_emo T8/T9) and only half-fixes plos_med; it is queued as its own gated cycle with same-page disambiguation + region-quality gating. See docs/TRIAGE_2026-06-25_escicheck_handoff_defects.md.) # v2.4.3 (RC-T foundation): tables.whitespace gains a CHAR-LEVEL column-detection fallback (char_whitespace_cells) — when pdfplumber's word grouper glues a tight-kerned numeric row into one token so the word-gap detector finds < 2 columns, recover the grid from char x-gaps, voting on column-START edges (right-aligned data columns are left-edge-stable even when the label column is variable-width) and reinserting intra-cell word spacing from geometry. Fires ONLY as a fallback when the word path returns < 2 columns, so currently-correct tables are byte-unchanged (word path restored verbatim). Recovers ip_feldman Table 10's 7 regression rows in isolation. 
NOT yet wired to replace a degenerate MATCHED Camelot table (that extract_structured change + region prose-trim is the next gated cycle — see LEARNINGS 2026-06-25). # v2.4.2 (RC-T Layer-2): _extract_table_body_text now (a) Note-anchor — a table's "Note:" footnote is its last element, so trim body prose bled past it (chan_feldman T1/T3, efendic_2022 T5); and (b) degenerate-prose guard — suppress a raw_text fallback that STARTS mid-sentence with a lowercase multi-letter word AND is majority sentence-shaped prose, so render emits a clean caption-only table instead of an unstructured-table dump duplicating Results/Discussion prose (chan_feldman T9 was a verbatim ## Discussion duplicate). FP-safe (real cells start with header/label/number/single-letter marker, never a wrapped continuation); full-corpus 101-PDF guard-diff only trims+suppresses (grew=0 changed=0). # v2.4.1 (DP-2/DP-5): (DP-2) blank-header role recovery now types the unlabeled p-value (a bare `.XXX` after the test stat, no comparison op) and df (a bare integer/Welch-decimal between the stat and the d[CI] column) columns it previously skipped — collabra.77859 T3 fields gain p+df (tables.flatten._recover_blank_roles Pass 4.5). 
(DP-5) parallel-arm tables with a TWO-ROW header no longer drop their first data row, and a CENTERED super-header is aligned to its arm block instead of its visual-center column: (a) cell_cleaning._is_header_like_row counts APA value shapes (leading-dot decimal, bracketed CI, operator-prefixed p, N/A) as data via _DATA_VALUE_CELL_RE so a real first data row isn't read as a 3rd header row (collabra.90203 T10 recovered the Identifiable/Explicit-learning correlation); (b) tables.flatten._detect_column_groups re-derives arm boundaries from equal-width blocks of the data region (each must hold one super-label) so a centered super-label folded mid-span no longer swaps arm values (xiao_2021 T4 Original/Replication F) or pushes a stat column into the label region; (c) tables.flatten._classify_column reads a folded super-header cell's role from its sub-part (collabra.90203 T10 CI). Full-corpus cached-table flatten diff: no clean-table regression. # v2.4.0 (REQUEST_11): flatten now populates fields for NON-clinical result tables — (a) blank-header column-role recovery (tables.flatten._recover_blank_roles): assign a stat role to a header-stripped column from its data-token SHAPE (CI brackets, df1/df2 pair, estimate-adjacent-CI, p-with-operator) AND caption/footnote/all-header-rows vocabulary, never bare position; recovers collabra.77859 T5 (t/df/d/CI) + collabra.90203 T8/T9 (F/df/p/BF01/eta²p-as-est/CI). (b) packed parallel-arm split (tables.flatten._detect_packed_arms/_flatten_packed_arms): tables packing k≥2 arms into single cells ("Separate Joint" + space-joined values) emit one typed record per arm (group=arm) — collabra.77859 T3 Separate/Joint, xiao_2021 T7 Regret/Justifiability. (c) new BF01 role; validity guards drop r∉[-1,1] / non-monotone CI / non-int n / p∉[0,1]. (d) GENERAL L-004 fixes: _parse_number + _parse_ci_cell fold U+2212 MINUS (negative t/d/CI bounds in Camelot cells were dropped/sign-lost); _VALUE_GROUP_RE handles bracket-led CI groups. 
Default render + PROSECCO output byte-identical. # v2.3.0 (Tier-2, REQUEST_10): cross-flavor lattice-augmentation — recover data rows a lattice extraction vertically TRUNCATED by appending the rows a same-page, same-column-count stream table captured below the lattice bbox (camelot_extract._augment_lattice_with_stream_rows), gated on equal-col-count + bbox overlap + extends-below; PLUS numeric/parenthetical continuation merge (cell_cleaning._merge_continuation_rows) rejoining stream's stacked value/parenthetical cells. Fixes PROSECCO Table 2 R2-R6. v2.2.0: EC-T1 docpluck.tables.flatten — per-row FlattenedRow records (sentence + structured fields) for downstream stat-verification consumers (effectcheck/escimate/scimeto) + opt-in inline "rendered as text" block below each <table> via render_pdf_to_markdown(flatten_tables_inline=True). v2.1.5: cell-cleaning recovers CMEX10 extensible-bracket PUA glyphs (U+F8EE-F8FB). v2.1.4: cell-cleaning recovers Adobe-Symbol-font PUA glyphs (beta/chi/bullet as U+F0xx). v2.1.3: cell-cleaning recovers '<'-as-backslash glyph corruption. v2.1.2: cell-cleaning recovers descending-CI '2'-for-minus corruption. v2.1.1: cell-cleaning recovers (cid:0) corrupted minus signs + strips math-alphanumeric styling. v2.1.0: cell-cleaning pipeline ported from splice spike (multi-row header detection, continuation merging, leader-dot strip, mash-split, group separators, sig-marker attach)
40
+ TABLE_EXTRACTION_VERSION = "2.4.5" # v2.4.5 (region-driven capture, 2026-06-29 — AI-gold-verified, CAPTURE-PATH change): each table caption now drives Camelot stream with its OWN caption-anchored region as table_areas (extract_tables_camelot_by_region + _region_driven_capture), so a caption gets exactly its table by construction — recovers stacked / side-by-side multi-table-per-page cases blind pages="all" + token-overlap pairing could not separate (efendic Table 4+5 split, 12 papers' 0×0 stubs → real). Region tables are CANDIDATES: _pick_better_table keeps the structurally-richer of {region, auto-detect} per caption (region too-narrow bbox can't collapse a wide table; jama_open_1 stays 43×7). Content-plausibility guards on the region path (_cell_is_prose dominance + _CAPTION_LABEL_RE absorption) reject prose-as-grid / caption-absorbed fragments → honest 0×0 stubs (cog_emo Table 3/9). Caption page-fix (char_start past leading \n\n/\f) is the foundation. Full 101-PDF structured diff (27 changed/0 err) + 6-paper AI-gold canary (all FIX/PASS/NET-BETTER, no new regressions). Known documented hard cases (not regressions vs prior): cog_emo T8 Camelot under-segmentation, chandrashekar 2-col caption interleave, efendic T2 Camelot-invisible, bmc_med_3 duplicate-fragment pairing (dedupe + bbox-proximity pairing both tried + reverted — net-harmful, see LEARNINGS). # v2.4.4 (ESCIcheck handoff 2026-06-25 — flatten-only, AI-gold-verified): (DP-3) flatten._infer_anova_eta2_hint types a font-dropped η²p effect column by STRUCTURE — an unlabeled estimate column in an F-test/ANOVA results table (F + BF01/CI, no competing d/r/OR) is keyed `eta2`, range-guarded to [0,1]; the η²p glyph itself is unrecoverable (NotoSerif uni:no, OCR-tier). (DP-5b) flatten._inline_stat_field types a self-labeled cell (`r = .67`, `d = 0.32`) by its token even under a generic "Effect size" header. 
(DP-5a) cell_cleaning._is_fragment_cell recognizes a bracketed-CI close tail (`0.73]`) so a CI split across rows rejoins (`[0.59, 0.73]`) and the junk fragment row is dropped (cog_emo T8 14→10 rows). These three are flatten/cell-cleaning ONLY — no capture-path change, so caption→table pairing is byte-identical to v2.4.3. (DP-1/DP-2 capture recovery — the caption page-attribution fix — was prototyped but REVERTED: AI-gold canary verify showed it mis-pairs same-page-caption tables (efendic T4/T5, cog_emo T8/T9) and only half-fixes plos_med; it is queued as its own gated cycle with same-page disambiguation + region-quality gating. See docs/TRIAGE_2026-06-25_escicheck_handoff_defects.md.) # v2.4.3 (RC-T foundation): tables.whitespace gains a CHAR-LEVEL column-detection fallback (char_whitespace_cells) — when pdfplumber's word grouper glues a tight-kerned numeric row into one token so the word-gap detector finds < 2 columns, recover the grid from char x-gaps, voting on column-START edges (right-aligned data columns are left-edge-stable even when the label column is variable-width) and reinserting intra-cell word spacing from geometry. Fires ONLY as a fallback when the word path returns < 2 columns, so currently-correct tables are byte-unchanged (word path restored verbatim). Recovers ip_feldman Table 10's 7 regression rows in isolation. NOT yet wired to replace a degenerate MATCHED Camelot table (that extract_structured change + region prose-trim is the next gated cycle — see LEARNINGS 2026-06-25). 
# v2.4.2 (RC-T Layer-2): _extract_table_body_text now (a) Note-anchor — a table's "Note:" footnote is its last element, so trim body prose bled past it (chan_feldman T1/T3, efendic_2022 T5); and (b) degenerate-prose guard — suppress a raw_text fallback that STARTS mid-sentence with a lowercase multi-letter word AND is majority sentence-shaped prose, so render emits a clean caption-only table instead of an unstructured-table dump duplicating Results/Discussion prose (chan_feldman T9 was a verbatim ## Discussion duplicate). FP-safe (real cells start with header/label/number/single-letter marker, never a wrapped continuation); full-corpus 101-PDF guard-diff only trims+suppresses (grew=0 changed=0). # v2.4.1 (DP-2/DP-5): (DP-2) blank-header role recovery now types the unlabeled p-value (a bare `.XXX` after the test stat, no comparison op) and df (a bare integer/Welch-decimal between the stat and the d[CI] column) columns it previously skipped — collabra.77859 T3 fields gain p+df (tables.flatten._recover_blank_roles Pass 4.5). (DP-5) parallel-arm tables with a TWO-ROW header no longer drop their first data row, and a CENTERED super-header is aligned to its arm block instead of its visual-center column: (a) cell_cleaning._is_header_like_row counts APA value shapes (leading-dot decimal, bracketed CI, operator-prefixed p, N/A) as data via _DATA_VALUE_CELL_RE so a real first data row isn't read as a 3rd header row (collabra.90203 T10 recovered the Identifiable/Explicit-learning correlation); (b) tables.flatten._detect_column_groups re-derives arm boundaries from equal-width blocks of the data region (each must hold one super-label) so a centered super-label folded mid-span no longer swaps arm values (xiao_2021 T4 Original/Replication F) or pushes a stat column into the label region; (c) tables.flatten._classify_column reads a folded super-header cell's role from its sub-part (collabra.90203 T10 CI). Full-corpus cached-table flatten diff: no clean-table regression. 
# v2.4.0 (REQUEST_11): flatten now populates fields for NON-clinical result tables — (a) blank-header column-role recovery (tables.flatten._recover_blank_roles): assign a stat role to a header-stripped column from its data-token SHAPE (CI brackets, df1/df2 pair, estimate-adjacent-CI, p-with-operator) AND caption/footnote/all-header-rows vocabulary, never bare position; recovers collabra.77859 T5 (t/df/d/CI) + collabra.90203 T8/T9 (F/df/p/BF01/eta²p-as-est/CI). (b) packed parallel-arm split (tables.flatten._detect_packed_arms/_flatten_packed_arms): tables packing k≥2 arms into single cells ("Separate Joint" + space-joined values) emit one typed record per arm (group=arm) — collabra.77859 T3 Separate/Joint, xiao_2021 T7 Regret/Justifiability. (c) new BF01 role; validity guards drop r∉[-1,1] / non-monotone CI / non-int n / p∉[0,1]. (d) GENERAL L-004 fixes: _parse_number + _parse_ci_cell fold U+2212 MINUS (negative t/d/CI bounds in Camelot cells were dropped/sign-lost); _VALUE_GROUP_RE handles bracket-led CI groups. Default render + PROSECCO output byte-identical. # v2.3.0 (Tier-2, REQUEST_10): cross-flavor lattice-augmentation — recover data rows a lattice extraction vertically TRUNCATED by appending the rows a same-page, same-column-count stream table captured below the lattice bbox (camelot_extract._augment_lattice_with_stream_rows), gated on equal-col-count + bbox overlap + extends-below; PLUS numeric/parenthetical continuation merge (cell_cleaning._merge_continuation_rows) rejoining stream's stacked value/parenthetical cells. Fixes PROSECCO Table 2 R2-R6. v2.2.0: EC-T1 docpluck.tables.flatten — per-row FlattenedRow records (sentence + structured fields) for downstream stat-verification consumers (effectcheck/escimate/scimeto) + opt-in inline "rendered as text" block below each <table> via render_pdf_to_markdown(flatten_tables_inline=True). v2.1.5: cell-cleaning recovers CMEX10 extensible-bracket PUA glyphs (U+F8EE-F8FB). 
v2.1.4: cell-cleaning recovers Adobe-Symbol-font PUA glyphs (beta/chi/bullet as U+F0xx). v2.1.3: cell-cleaning recovers '<'-as-backslash glyph corruption. v2.1.2: cell-cleaning recovers descending-CI '2'-for-minus corruption. v2.1.1: cell-cleaning recovers (cid:0) corrupted minus signs + strips math-alphanumeric styling. v2.1.0: cell-cleaning pipeline ported from splice spike (multi-row header detection, continuation merging, leader-dot strip, mash-split, group separators, sig-marker attach)
41
41
 
42
42
  TableTextMode = Literal["raw", "placeholder"]
43
43
 
@@ -180,12 +180,34 @@ def extract_pdf_structured(
180
180
  used_caption_ids: set[int] = set()
181
181
  table_captions = [c for c in captions if c.kind == "table"]
182
182
 
183
- # Filter Camelot's output to tables that have a same-page caption.
184
- # This anchors detection to caption signal (matching the pre-pdfplumber-removal
185
- # behavior of docpluck) and drops false-positive Camelot detections like
186
- # bibliographies or address blocks. Tables without captions are rare in APA
187
- # corpus and the existing tests are calibrated against caption-anchored counts.
183
+ # ---- Region-driven capture (candidate generator) ----
184
+ # For each caption, hand Camelot the caption-anchored region as table_areas so
185
+ # it extracts exactly that caption's table, sidestepping the header-absorption,
186
+ # two-tables-merged-into-one, and caption-pairing failures of blind
187
+ # auto-detection. CRUCIAL: region-driven tables are CANDIDATES, not winners.
188
+ # ``_region_for_caption``'s bbox is reliable for VERTICAL anchoring but often
189
+ # too NARROW horizontally for a wide table (its width tracks the caption /
190
+ # partial content, not the full grid), so a naive "region wins" would collapse
191
+ # a 7-column demographics table to 2 columns. We therefore pick, per caption,
192
+ # the BETTER of the region-driven candidate and the auto-detected candidate
193
+ # (``_pick_better_table``: more columns wins; tie-break on cell count) — so
194
+ # region-driving can only ADD coverage (0×0 stub → real table) or replace an
195
+ # auto-detect table it strictly beats, never degrade one.
196
+ layout_doc = _layout_doc
197
+ region_tables: dict[int, Table] = {}
198
+ if not camelot_disabled and table_captions:
199
+ region_tables, layout_doc = _region_driven_capture(
200
+ pdf_bytes, rejoined, table_captions, layout_doc,
201
+ next_boundary_by_id, method_pieces,
202
+ )
203
+
204
+ # Auto-detect candidate per caption (legacy pairing). Filter Camelot's output
205
+ # to tables that have a same-page caption — this anchors detection to caption
206
+ # signal (matching the pre-pdfplumber-removal behavior) and drops false
207
+ # positives like bibliographies. Uncaptioned auto-detect tables are dropped as
208
+ # before (rare in the APA corpus; existing tests are caption-anchored).
188
209
  pages_with_table_caption = {c.page for c in table_captions}
210
+ auto_by_cap: dict[int, Table] = {}
189
211
  for ct in camelot_tables:
190
212
  if (ct.get("page") or 0) not in pages_with_table_caption:
191
213
  continue
@@ -196,7 +218,37 @@ def extract_pdf_structured(
196
218
  ct["caption"] = _extract_caption_text(
197
219
  rejoined, match, next_boundary_by_id.get(id(match))
198
220
  )
199
- tables.append(ct)
221
+ auto_by_cap[id(match)] = ct
222
+
223
+ # Pick the candidate per caption. ``used_caption_ids`` already holds the
224
+ # auto-detect-matched caps; add any caption the region pass alone satisfied.
225
+ #
226
+ # ``_pick_better_table`` is the single arbiter: region-driving's value is that
227
+ # it gives a STARVED caption its own table — and a starved caption has NO
228
+ # auto-detect candidate (auto-detect captured the page as one table and
229
+ # token-paired it to the OTHER caption), so ``auto_td is None`` and region
230
+ # wins automatically. Where auto-detect DID give this caption a real grid, the
231
+ # column-collapse guard keeps the richer one. This handles both stacked-table
232
+ # separation (cog_emo Table 9: auto=None → region 5×3 vs baseline 0×0) and
233
+ # wide single tables (jama_open_1 Table 1: region too narrow → keep auto 43×7)
234
+ # without a page-multiplicity special case (which over-forced region on
235
+ # multi-caption pages whose tables auto-detect already separated cleanly —
236
+ # bmc_med_3, ieee_access_8 — collapsing their columns).
237
+ region_win = region_only = 0
238
+ for cap in table_captions:
239
+ auto_td = auto_by_cap.get(id(cap))
240
+ region_td = region_tables.get(id(cap))
241
+ if auto_td is None and region_td is None:
242
+ continue
243
+ best = _pick_better_table(region_td, auto_td)
244
+ if best is region_td and auto_td is not None:
245
+ region_win += 1
246
+ elif best is region_td:
247
+ region_only += 1
248
+ tables.append(best)
249
+ used_caption_ids.add(id(cap))
250
+ if region_win or region_only:
251
+ method_pieces.append(f"region_pick:{region_win}+{region_only}")
200
252
 
201
253
  # §A R1 / B1 (NORMALIZATION_VERSION 1.9.23, 2026-05-23): whitespace_cells
202
254
  # fallback for caption-detected tables that Camelot could not recover.
@@ -209,8 +261,8 @@ def extract_pdf_structured(
209
261
  #
210
262
  # Lazy-import the layout extraction + whitespace helper so the rest of
211
263
  # the pipeline works in environments without pdfplumber-layout deps; any
212
- # failure here falls back transparently to the isolated path.
213
- layout_doc = _layout_doc # reuse caller-supplied doc when available
264
+ # failure here falls back transparently to the isolated path. `layout_doc`
265
+ # may already be materialized by the region-driven pass above; reuse it.
214
266
  _whitespace_cells = None
215
267
  _region_for_caption_fn = None
216
268
  unmatched_caps = [
@@ -346,6 +398,176 @@ def _join_split_captions(text: str) -> str:
346
398
  return text
347
399
 
348
400
 
401
+ # A region's bottom is clipped this many points ABOVE the next same-page
402
+ # caption's top, so a short table's region does not extend down into the NEXT
403
+ # table (cog_emo p13: Table 8's matrix region would otherwise absorb Table 9).
404
+ _REGION_NEXT_CAPTION_GAP_PT: float = 2.0
405
+
406
+ # A region's top is clamped to the caption's top minus this margin (enough to
407
+ # include the caption line's own glyph ascent, but not the content above it), so
408
+ # a region whose SEARCH_ABOVE fallback reached up into a preceding table is
409
+ # pulled back to start at its own caption.
410
+ _REGION_TOP_MARGIN_PT: float = 4.0
411
+
412
+
413
+ def _table_cell_count(td: Optional[Table]) -> int:
414
+ if not td:
415
+ return 0
416
+ return sum(1 for c in (td.get("cells") or []) if (c.get("text") or "").strip())
417
+
418
+
419
+ def _pick_better_table(
420
+ region_td: Optional[Table], auto_td: Optional[Table]
421
+ ) -> Optional[Table]:
422
+ """Choose between a caption's region-driven candidate and its auto-detected
423
+ candidate. Region-driving anchors the RIGHT table to the caption but its
424
+ bbox can be too narrow (collapsing columns), so it only wins when it is at
425
+ least as wide as the auto-detect table AND carries comparable content.
426
+
427
+ Rules (auto-detect is the incumbent; region must clearly not regress it):
428
+ - exactly one present → that one.
429
+ - region has FEWER columns than auto → auto (guards the column-collapse
430
+ regression: a 7-col demographics table must not become 2-col).
431
+ - region has MORE columns than auto → region, UNLESS it lost a large share
432
+ of cells (a wider-but-emptier grid is a mis-detection, keep auto).
433
+ - equal columns → whichever has more populated cells (ties → auto, the
434
+ stable incumbent).
435
+ """
436
+ if region_td is None:
437
+ return auto_td
438
+ if auto_td is None:
439
+ return region_td
440
+ rc = int(region_td.get("n_cols") or 0)
441
+ ac = int(auto_td.get("n_cols") or 0)
442
+ r_cells = _table_cell_count(region_td)
443
+ a_cells = _table_cell_count(auto_td)
444
+ if rc < ac:
445
+ return auto_td
446
+ if rc > ac:
447
+ # Wider grid must still retain most of the content to be believed.
448
+ if a_cells > 0 and r_cells < 0.6 * a_cells:
449
+ return auto_td
450
+ return region_td
451
+ # Equal column count: prefer the fuller grid; tie goes to the incumbent.
452
+ return region_td if r_cells > a_cells else auto_td
453
+
454
+
455
def _region_driven_capture(
    pdf_bytes: bytes,
    rejoined: str,
    table_captions: list[CaptionMatch],
    layout_doc,
    next_boundary_by_id: dict[int, Optional[int]],
    method_pieces: list[str],
) -> tuple[dict[int, Table], object]:
    """Drive Camelot once per caption with the caption-anchored region as
    ``table_areas`` and return ``{id(cap): Table}`` for every caption that
    yielded a usable table, plus the (possibly newly-materialized) layout doc.

    For each caption we (1) locate its layout region via ``_region_for_caption``,
    (2) clamp the region top to the caption line and clip the region bottom at
    the next same-page caption so it can't reach into a neighbouring table,
    (3) convert the pdfplumber TOP-DOWN bbox to the Camelot PDF BOTTOM-UP
    ``table_areas`` string, and (4) attach the extracted caption text.

    Args:
        pdf_bytes: raw PDF content, passed to the layout and region extractors.
        rejoined: document text handed to ``_extract_caption_text``.
        table_captions: captions to drive; grouped by their ``page`` and
            ordered within a page by ``char_start``.
        layout_doc: caller-supplied layout document, or ``None`` to have one
            built here from ``pdf_bytes``.
        next_boundary_by_id: ``id(caption) -> boundary`` lookup forwarded to
            ``_extract_caption_text``.
        method_pieces: list of method tags; ``"region_driven"`` is appended
            in place when at least one table is returned.

    Returns:
        ``(tables_by_caption_id, layout_doc)``. The mapping is empty when setup
        or the batch extraction fails, so the legacy auto-detect + whitespace
        path still runs. A failure while locating one caption's bbox or region
        is recorded via ``record_fallback`` and degrades only that caption.
    """
    empty: dict[int, Table] = {}
    try:
        from .tables.detect import _region_for_caption, _bbox_of_caption_line
        from .tables.camelot_extract import extract_tables_camelot_by_region
        if layout_doc is None:
            from .extract_layout import extract_pdf_layout
            layout_doc = extract_pdf_layout(pdf_bytes)
    except Exception as exc:
        record_fallback("region_driven_setup_exception", detail=type(exc).__name__)
        return empty, layout_doc

    pages = getattr(layout_doc, "pages", None) or ()

    # Group captions per page so the next-caption bottom-clip can look at the
    # following caption on the same page.
    by_page: dict[int, list[CaptionMatch]] = {}
    for c in table_captions:
        by_page.setdefault(c.page, []).append(c)

    specs: list[dict] = []
    cap_by_key: dict[str, CaptionMatch] = {}
    for page, caps in by_page.items():
        # Caption pages are 1-based indexes into ``pages``.
        if not (1 <= page <= len(pages)):
            continue
        page_obj = pages[page - 1]
        page_height = float(getattr(page_obj, "height", 0.0) or 0.0)
        if page_height <= 0.0:
            # Without a page height the top-down -> bottom-up flip is undefined.
            continue
        caps_sorted = sorted(caps, key=lambda c: c.char_start)

        # Top-down y of each caption line (None when it cannot be located).
        cap_top_td: dict[int, Optional[float]] = {}
        for c in caps_sorted:
            try:
                cb = _bbox_of_caption_line(page_obj, c)
            except Exception as exc:
                # Guarded like the region lookup below: one unlocatable caption
                # must not abort the whole extraction. The existing reason
                # string is reused so no new fallback key is introduced.
                record_fallback("region_driven_region_exception", detail=type(exc).__name__)
                cb = None
            cap_top_td[id(c)] = cb[1] if cb else None

        for i, c in enumerate(caps_sorted):
            try:
                region = _region_for_caption(layout_doc, c)
            except Exception as exc:
                record_fallback("region_driven_region_exception", detail=type(exc).__name__)
                region = None
            if region is None:
                continue
            x0, top, x1, bottom = region.bbox

            # Clip the region TOP at the caption's own top. A table never
            # extends ABOVE its caption line; ``_region_for_caption``'s
            # SEARCH_ABOVE fallback can pull the region up into a PRECEDING
            # table when no geometry is found below (cog_emo Table 9, whose
            # caption sits below Table 8's matrix, otherwise captures Table 8's
            # rows). Clamp so the region starts at the caption line (minus a
            # small margin for the caption glyphs' own ascent), never higher.
            this_cap_top = cap_top_td.get(id(c))
            if this_cap_top is not None and top < this_cap_top - _REGION_TOP_MARGIN_PT:
                top = this_cap_top - _REGION_TOP_MARGIN_PT

            # Clip bottom at the next same-page caption that sits below this one.
            for later in caps_sorted[i + 1:]:
                nxt_top = cap_top_td.get(id(later))
                if nxt_top is not None and nxt_top > top + 5.0:
                    bottom = min(bottom, nxt_top - _REGION_NEXT_CAPTION_GAP_PT)
                    break

            # Skip degenerate boxes (no height after clipping, or no width):
            # they cannot contain a table.
            if bottom <= top or x1 <= x0:
                continue

            # Flip y from top-down to bottom-up page coordinates.
            top_bu = page_height - top
            bottom_bu = page_height - bottom
            key = str(id(c))
            specs.append({
                "key": key,
                "page": page,
                "area": f"{x0:.2f},{top_bu:.2f},{x1:.2f},{bottom_bu:.2f}",
                "area_bu": (x0, top_bu, x1, bottom_bu),
                "label": c.label,
            })
            cap_by_key[key] = c

    if not specs:
        return empty, layout_doc

    try:
        by_key = extract_tables_camelot_by_region(pdf_bytes, specs)
    except Exception as exc:
        record_fallback("region_driven_extract_exception", detail=type(exc).__name__)
        return empty, layout_doc

    result: dict[int, Table] = {}
    for key, td in by_key.items():
        cap = cap_by_key.get(key)
        if cap is None:
            continue
        # Use the caption-number id scheme (``t{number}``) — the same scheme the
        # isolated/whitespace path uses — so ids stay deterministic and match
        # the recognized-prefix invariant (test_table_ids_unique_and_sequential).
        td["id"] = f"t{cap.number}"
        td["caption"] = _extract_caption_text(
            rejoined, cap, next_boundary_by_id.get(id(cap))
        )
        result[id(cap)] = td
    if result:
        method_pieces.append("region_driven")
    return result, layout_doc
569
+
570
+
349
571
  def _find_caption_for_table(
350
572
  camelot_table: Table,
351
573
  captions: list[CaptionMatch],