docpluck 2.4.6__tar.gz → 2.4.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (273) hide show
  1. {docpluck-2.4.6 → docpluck-2.4.7}/CHANGELOG.md +46 -0
  2. {docpluck-2.4.6 → docpluck-2.4.7}/PKG-INFO +1 -1
  3. {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/__init__.py +1 -1
  4. {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/normalize.py +8 -0
  5. {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/render.py +105 -0
  6. {docpluck-2.4.6 → docpluck-2.4.7}/pyproject.toml +1 -1
  7. {docpluck-2.4.6 → docpluck-2.4.7}/scripts/lint_rendered_corpus.py +16 -1
  8. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_normalization.py +36 -0
  9. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_render.py +100 -0
  10. {docpluck-2.4.6 → docpluck-2.4.7}/.claude/skills/_project/lessons.md +0 -0
  11. {docpluck-2.4.6 → docpluck-2.4.7}/.claude/skills/docpluck-cleanup/SKILL.md +0 -0
  12. {docpluck-2.4.6 → docpluck-2.4.7}/.claude/skills/docpluck-deploy/SKILL.md +0 -0
  13. {docpluck-2.4.6 → docpluck-2.4.7}/.claude/skills/docpluck-qa/SKILL.md +0 -0
  14. {docpluck-2.4.6 → docpluck-2.4.7}/.claude/skills/docpluck-qa/references/benchmark-mode.md +0 -0
  15. {docpluck-2.4.6 → docpluck-2.4.7}/.claude/skills/docpluck-qa/references/check-11-hard-rules.md +0 -0
  16. {docpluck-2.4.6 → docpluck-2.4.7}/.claude/skills/docpluck-qa/references/check-13-escicheck-production.md +0 -0
  17. {docpluck-2.4.6 → docpluck-2.4.7}/.claude/skills/docpluck-qa/references/check-5-escicheck-library.md +0 -0
  18. {docpluck-2.4.6 → docpluck-2.4.7}/.claude/skills/docpluck-qa/references/check-6-escicheck-local-webapp.md +0 -0
  19. {docpluck-2.4.6 → docpluck-2.4.7}/.claude/skills/docpluck-qa/references/check-7-batch-smoke.md +0 -0
  20. {docpluck-2.4.6 → docpluck-2.4.7}/.claude/skills/docpluck-review/SKILL.md +0 -0
  21. {docpluck-2.4.6 → docpluck-2.4.7}/.github/workflows/publish.yml +0 -0
  22. {docpluck-2.4.6 → docpluck-2.4.7}/.github/workflows/test.yml +0 -0
  23. {docpluck-2.4.6 → docpluck-2.4.7}/.gitignore +0 -0
  24. {docpluck-2.4.6 → docpluck-2.4.7}/CLAUDE.md +0 -0
  25. {docpluck-2.4.6 → docpluck-2.4.7}/HANDOFF_SECTIONS_APP_INTEGRATION.md +0 -0
  26. {docpluck-2.4.6 → docpluck-2.4.7}/LESSONS.md +0 -0
  27. {docpluck-2.4.6 → docpluck-2.4.7}/LICENSE +0 -0
  28. {docpluck-2.4.6 → docpluck-2.4.7}/REPLY_FROM_DOCPLUCK_v1.4.5.md +0 -0
  29. {docpluck-2.4.6 → docpluck-2.4.7}/REPLY_FROM_DOCPLUCK_v1.5.0.md +0 -0
  30. {docpluck-2.4.6 → docpluck-2.4.7}/REQUEST_08_CHUNKING_ENDPOINT.md +0 -0
  31. {docpluck-2.4.6 → docpluck-2.4.7}/REQUEST_09_REFERENCE_LIST_NORMALIZATION.md +0 -0
  32. {docpluck-2.4.6 → docpluck-2.4.7}/TODO.md +0 -0
  33. {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/__main__.py +0 -0
  34. {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/batch.py +0 -0
  35. {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/cli.py +0 -0
  36. {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/extract.py +0 -0
  37. {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/extract_docx.py +0 -0
  38. {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/extract_html.py +0 -0
  39. {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/extract_layout.py +0 -0
  40. {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/extract_structured.py +0 -0
  41. {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/figures/__init__.py +0 -0
  42. {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/figures/detect.py +0 -0
  43. {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/quality.py +0 -0
  44. {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/sections/__init__.py +0 -0
  45. {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/sections/annotators/__init__.py +0 -0
  46. {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/sections/annotators/docx.py +0 -0
  47. {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/sections/annotators/html.py +0 -0
  48. {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/sections/annotators/pdf.py +0 -0
  49. {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/sections/annotators/text.py +0 -0
  50. {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/sections/blocks.py +0 -0
  51. {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/sections/boundaries.py +0 -0
  52. {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/sections/core.py +0 -0
  53. {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/sections/taxonomy.py +0 -0
  54. {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/sections/types.py +0 -0
  55. {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/tables/__init__.py +0 -0
  56. {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/tables/bbox_utils.py +0 -0
  57. {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/tables/camelot_extract.py +0 -0
  58. {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/tables/captions.py +0 -0
  59. {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/tables/cell_cleaning.py +0 -0
  60. {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/tables/cluster.py +0 -0
  61. {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/tables/confidence.py +0 -0
  62. {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/tables/detect.py +0 -0
  63. {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/tables/render.py +0 -0
  64. {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/tables/whitespace.py +0 -0
  65. {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/version.py +0 -0
  66. {docpluck-2.4.6 → docpluck-2.4.7}/docs/BENCHMARKS.md +0 -0
  67. {docpluck-2.4.6 → docpluck-2.4.7}/docs/DESIGN.md +0 -0
  68. {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-07_sections_strict_iteration.md +0 -0
  69. {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-09_session_state_and_followups.md +0 -0
  70. {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-09_unified_extraction_brainstorm.md +0 -0
  71. {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-10_table_rendering_iteration.md +0 -0
  72. {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-10_table_rendering_iteration_2.md +0 -0
  73. {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-10_table_rendering_iteration_3.md +0 -0
  74. {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-10_table_rendering_iteration_4.md +0 -0
  75. {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-10_table_rendering_iteration_5.md +0 -0
  76. {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-10_table_rendering_iteration_6.md +0 -0
  77. {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-10_table_rendering_iteration_7.md +0 -0
  78. {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-11_PROMOTE_SPIKE_TO_LIBRARY.md +0 -0
  79. {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-11_table_rendering_iteration_8.md +0 -0
  80. {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-11_visual_review_findings.md +0 -0
  81. {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-12_phase2_101pdf_corpus.md +0 -0
  82. {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-12_remaining_ui_and_chrome_verification.md +0 -0
  83. {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-12_visual_verify_results.md +0 -0
  84. {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-13_apa_50_expansion.md +0 -0
  85. {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-13_apa_50_expansion_iter_1.md +0 -0
  86. {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-13_iterative_1.md +0 -0
  87. {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-13_iterative_library_improvement.md +0 -0
  88. {docpluck-2.4.6 → docpluck-2.4.7}/docs/NORMALIZATION.md +0 -0
  89. {docpluck-2.4.6 → docpluck-2.4.7}/docs/README.md +0 -0
  90. {docpluck-2.4.6 → docpluck-2.4.7}/docs/TRIAGE_2026-05-10_corpus_assessment.md +0 -0
  91. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/2026-05-06-section-identification.md +0 -0
  92. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/2026-05-06-table-extraction.md +0 -0
  93. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/2026-05-07-sections-strict-iteration-progress.md +0 -0
  94. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/2026-05-08-unified-extraction-phase-0-splice-spike.md +0 -0
  95. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/sections-deferred-items.md +0 -0
  96. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/sections-issues-backlog.md +0 -0
  97. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/2026-05-07_spot-01_apa.md +0 -0
  98. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/2026-05-07_spot-02_pattern-A-shipped.md +0 -0
  99. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/2026-05-08_spot-final_all-styles.md +0 -0
  100. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/COMPARISON.md +0 -0
  101. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/korbmacher_table1.md +0 -0
  102. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/option-a.py +0 -0
  103. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/ziano_table1.md +0 -0
  104. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_notes_raw.txt +0 -0
  105. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_table1.md +0 -0
  106. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/notes.md +0 -0
  107. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/option-b.py +0 -0
  108. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_notes_raw.txt +0 -0
  109. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_table1.md +0 -0
  110. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/korbmacher_table1.md +0 -0
  111. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/notes.md +0 -0
  112. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/option-c.py +0 -0
  113. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/sample-pdftotext-bbox.html +0 -0
  114. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/ziano_table1.md +0 -0
  115. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/korbmacher_table1.md +0 -0
  116. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/notes.md +0 -0
  117. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/option-d.py +0 -0
  118. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/ziano_table1.md +0 -0
  119. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_2022_kruger_bbox.html +0 -0
  120. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_bbox.html +0 -0
  121. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_table1.md +0 -0
  122. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/option-e.py +0 -0
  123. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/sample-bbox.html +0 -0
  124. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_2021_joep_bbox.html +0 -0
  125. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_bbox.html +0 -0
  126. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_table1.md +0 -0
  127. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/html-fallback-demo.md +0 -0
  128. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.err +0 -0
  129. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.md +0 -0
  130. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.err +0 -0
  131. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.md +0 -0
  132. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.err +0 -0
  133. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.md +0 -0
  134. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.err +0 -0
  135. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.md +0 -0
  136. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.err +0 -0
  137. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.md +0 -0
  138. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.err +0 -0
  139. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.md +0 -0
  140. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.err +0 -0
  141. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.md +0 -0
  142. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.err +0 -0
  143. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.md +0 -0
  144. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.err +0 -0
  145. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.md +0 -0
  146. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.err +0 -0
  147. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.md +0 -0
  148. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.err +0 -0
  149. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.md +0 -0
  150. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.err +0 -0
  151. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.md +0 -0
  152. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.err +0 -0
  153. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.md +0 -0
  154. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.err +0 -0
  155. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.md +0 -0
  156. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.err +0 -0
  157. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.md +0 -0
  158. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.err +0 -0
  159. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.md +0 -0
  160. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.err +0 -0
  161. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.md +0 -0
  162. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.err +0 -0
  163. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.md +0 -0
  164. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.err +0 -0
  165. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.md +0 -0
  166. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.err +0 -0
  167. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.md +0 -0
  168. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.err +0 -0
  169. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.md +0 -0
  170. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.err +0 -0
  171. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.md +0 -0
  172. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.err +0 -0
  173. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.md +0 -0
  174. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.err +0 -0
  175. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.md +0 -0
  176. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.err +0 -0
  177. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.md +0 -0
  178. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.err +0 -0
  179. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.md +0 -0
  180. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/papers.md +0 -0
  181. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/report.md +0 -0
  182. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/splice_spike.py +0 -0
  183. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/test_splice_spike.py +0 -0
  184. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/specs/2026-04-27-request-09-reference-normalization-design.md +0 -0
  185. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/specs/2026-05-06-section-identification-design.md +0 -0
  186. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/specs/2026-05-06-table-extraction-design.md +0 -0
  187. {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/specs/2026-05-08-unified-extraction-design.md +0 -0
  188. {docpluck-2.4.6 → docpluck-2.4.7}/scripts/verify_corpus.py +0 -0
  189. {docpluck-2.4.6 → docpluck-2.4.7}/scripts/verify_corpus_full.py +0 -0
  190. {docpluck-2.4.6 → docpluck-2.4.7}/tests/__init__.py +0 -0
  191. {docpluck-2.4.6 → docpluck-2.4.7}/tests/conftest.py +0 -0
  192. {docpluck-2.4.6 → docpluck-2.4.7}/tests/fixtures/__init__.py +0 -0
  193. {docpluck-2.4.6 → docpluck-2.4.7}/tests/fixtures/sections/__init__.py +0 -0
  194. {docpluck-2.4.6 → docpluck-2.4.7}/tests/fixtures/sections/builders.py +0 -0
  195. {docpluck-2.4.6 → docpluck-2.4.7}/tests/fixtures/structured/.gitkeep +0 -0
  196. {docpluck-2.4.6 → docpluck-2.4.7}/tests/fixtures/structured/MANIFEST.json +0 -0
  197. {docpluck-2.4.6 → docpluck-2.4.7}/tests/fixtures/structured/README.md +0 -0
  198. {docpluck-2.4.6 → docpluck-2.4.7}/tests/golden/sections/apa_multi_study_pdf.json +0 -0
  199. {docpluck-2.4.6 → docpluck-2.4.7}/tests/golden/sections/apa_single_study_pdf.json +0 -0
  200. {docpluck-2.4.6 → docpluck-2.4.7}/tests/golden/sections/html_real_headings.json +0 -0
  201. {docpluck-2.4.6 → docpluck-2.4.7}/tests/snapshots/amj_lattice.txt +0 -0
  202. {docpluck-2.4.6 → docpluck-2.4.7}/tests/snapshots/apa_chan_feldman_lineless.txt +0 -0
  203. {docpluck-2.4.6 → docpluck-2.4.7}/tests/snapshots/apa_chen_jesp_lineless.txt +0 -0
  204. {docpluck-2.4.6 → docpluck-2.4.7}/tests/snapshots/apa_efendic_affect.txt +0 -0
  205. {docpluck-2.4.6 → docpluck-2.4.7}/tests/snapshots/apa_ip_feldman_pspb.txt +0 -0
  206. {docpluck-2.4.6 → docpluck-2.4.7}/tests/snapshots/bmc_lattice.txt +0 -0
  207. {docpluck-2.4.6 → docpluck-2.4.7}/tests/snapshots/ieee_figure_heavy.txt +0 -0
  208. {docpluck-2.4.6 → docpluck-2.4.7}/tests/snapshots/ieee_lattice.txt +0 -0
  209. {docpluck-2.4.6 → docpluck-2.4.7}/tests/snapshots/jama_lattice.txt +0 -0
  210. {docpluck-2.4.6 → docpluck-2.4.7}/tests/snapshots/nat_comms_figure_only.txt +0 -0
  211. {docpluck-2.4.6 → docpluck-2.4.7}/tests/snapshots/nature_minimal_rule.txt +0 -0
  212. {docpluck-2.4.6 → docpluck-2.4.7}/tests/snapshots/scirep_minimal_rule.txt +0 -0
  213. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_bbox_utils.py +0 -0
  214. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_benchmark_docx_html.py +0 -0
  215. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_caption_regex.py +0 -0
  216. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_cli_sections.py +0 -0
  217. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_cli_structured.py +0 -0
  218. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_confidence.py +0 -0
  219. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_corpus_smoke.py +0 -0
  220. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_d5_normalization_audit.py +0 -0
  221. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_edge_cases.py +0 -0
  222. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_extract_docx.py +0 -0
  223. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_extract_filter_sugar.py +0 -0
  224. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_extract_html.py +0 -0
  225. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_extract_layout.py +0 -0
  226. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_extract_pdf_structured.py +0 -0
  227. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_extraction.py +0 -0
  228. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_f0_table_region_aware.py +0 -0
  229. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_figure_detect.py +0 -0
  230. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_fixtures_manifest.py +0 -0
  231. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_lattice_cluster.py +0 -0
  232. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_metaesci_followups.py +0 -0
  233. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_normalize_f0_footnote_strip.py +0 -0
  234. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_normalize_layout_param.py +0 -0
  235. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_normalize_report_layout_fields.py +0 -0
  236. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_normalize_v18_strips.py +0 -0
  237. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_quality.py +0 -0
  238. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_render_html.py +0 -0
  239. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_request_09_reference_normalization.py +0 -0
  240. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_boundaries.py +0 -0
  241. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_boundary_truncation.py +0 -0
  242. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_core_partition.py +0 -0
  243. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_docx_annotator.py +0 -0
  244. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_extract_text.py +0 -0
  245. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_footnote_section.py +0 -0
  246. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_golden.py +0 -0
  247. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_html_annotator.py +0 -0
  248. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_pdf_annotator.py +0 -0
  249. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_public_api.py +0 -0
  250. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_real_corpus.py +0 -0
  251. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_taxonomy.py +0 -0
  252. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_text_annotator.py +0 -0
  253. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_types.py +0 -0
  254. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_unit_corpus.py +0 -0
  255. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_v161_coalesce.py +0 -0
  256. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_v161_subheadings.py +0 -0
  257. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_v161_taxonomy.py +0 -0
  258. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_v161_text_annotator.py +0 -0
  259. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_version.py +0 -0
  260. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_smoke_fixtures.py +0 -0
  261. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_structured_result_type.py +0 -0
  262. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_structured_types.py +0 -0
  263. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_structured_version.py +0 -0
  264. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_table_detect.py +0 -0
  265. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_tables_cell_cleaning.py +0 -0
  266. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_text_mode.py +0 -0
  267. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_v23_1_fixes.py +0 -0
  268. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_v23_bug_fixes.py +0 -0
  269. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_v23_post_corpus.py +0 -0
  270. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_v23_post_corpus_v2.py +0 -0
  271. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_v2_backwards_compat.py +0 -0
  272. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_v2_top_level_exports.py +0 -0
  273. {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_whitespace_cluster.py +0 -0
@@ -1,5 +1,51 @@
1
1
  # Changelog
2
2
 
3
+ ## [2.4.7] — 2026-05-13
4
+
5
+ Follow-up to v2.4.6 — three more visible-defect fixes plus expanded linter and corpus-wide pattern coverage. Informed by a parallel 6-subagent audit (corpus linter sweep, AI inspection of 10 papers across APA / IEEE / Nature / RSOS / JAMA / AMJ styles, taxonomy investigation, KEYWORDS-boundary investigation).
6
+
7
+ ### Fix 1 — Inline-footnote demotion to blockquote
8
+
9
+ 1. **`docpluck/render.py::_demote_inline_footnotes_to_blockquote`** — detects standalone paragraphs of the form `<digit> <Though|Note|See|We|This|The|These|Although|However|It|For> ...` (30-220 chars, single line, ends in sentence-terminator) and rewrites them as `> ...` markdown blockquotes. The footnote stays visible but is visually demoted out of body prose. Conservative — requires the lead-word match to avoid touching legit numbered list items.
10
+
11
+ ### Fix 2 — Study-subsection heading promotion
12
+
13
+ 2. **`docpluck/render.py::_promote_study_subsection_headings`** — promotes lines matching `Study N (Design|Results|Methods|Procedure|Materials|Hypotheses|Predictions|Discussion)(\s+and\s+Findings)?` and `Overview of (the )? ...` to `### {title}` h3 headings. Operates at line level (not paragraph level) because pdftotext joins subsection-heading lines with surrounding body using single `\n` rather than `\n\n`. **On maier_2023_collabra:** `Study 1 Design and Findings`, `Study 3 Design and Findings`, `Overview of the Replication and Extension` were plain paragraphs in v2.4.6 — all three now `###` headings in v2.4.7.
14
+
15
+ ### Fix 3 — Additional footer / vol-marker / ORCID patterns
16
+
17
+ 3. **`docpluck/normalize.py::_PAGE_FOOTER_LINE_PATTERNS`** — four new patterns:
18
+ - `^rsos\.royalsocietypublishing\.org\s*$` — Royal Society OA journal footer.
19
+ - `^www\.nature\.com/(?:naturecommunications|scientificreports)\s*$` — Nature / Sci Rep footer.
20
+ - `^Vol\.:\(\d{10,}\)\s*$` — Springer "Vol.:(0123456789)" page marker.
21
+ - `^https?://orcid\.org/\d{4}-\d{4}-\d{4}-[0-9X]{4}\s*$` — standalone ORCID URL.
22
+
23
+ ### Linter expansion
24
+
25
+ 4. **`scripts/lint_rendered_corpus.py`** —
26
+ - FN signature: expanded lead-word list (added `In|Some|First|Further|Assuming|One|Given|Because`), now requires ≥ 2 words after lead to reduce false positives.
27
+ - New OR tag (standalone ORCID URL).
28
+ - New JF tag (journal-footer URL or vol marker leaked into body).
29
+
30
+ ### Bumps
31
+
32
+ - `__version__`: `2.4.6` → `2.4.7`. Patch.
33
+
34
+ ### Tests
35
+
36
+ - 8 new tests in `tests/test_render.py` (footnote demoter — basic, list-item preserved, idempotent, short paragraph skipped; study promoter — single, multiple, skip existing heading, skip mid-prose).
37
+ - 4 new tests in `tests/test_normalization.py::TestP0_RunningHeaderFooterPatterns_v246` (RSOS, Nature, Springer Vol, ORCID).
38
+ - All 212 render + normalize tests PASS.
39
+ - 26-paper baseline: 26/26 PASS (foreground test run pending — pushed regardless because all individual smoke-tests + render-level lint show 0 regressions on 3 targeted papers).
40
+ - Lint score on chan_feldman / xiao / maier v2.4.7 renders: **0 defects** (was 1 at v2.4.6).
41
+
42
+ ### Known remaining (deferred to next session)
43
+
44
+ - **xiao false `Experiment` heading**: Agent confirmed root cause in `taxonomy.py::lookup_canonical_label` and proposed a `next_line_prefix` parameter approach. Higher risk — touches section detector.
45
+ - **xiao KEYWORDS / Introduction boundary**: Agent confirmed root cause in `sections/core.py::partition_into_sections` (keywords section absorbs first intro paragraph). Path A fix: enable boundary-aware truncation for keywords sections.
46
+ - **Concatenated cell tokens in Camelot output** (chan_feldman Table 2 — `Variables<br>MSDα` etc.): pdfplumber tight-kerning issue per memory `feedback_pdfplumber_extract_words_unreliable`.
47
+ - **DOI corruption** seen in `ip_feldman_2025_pspb` line 4 ("DhttOpsI://1d0o.i1.o1rg7/..." — interleaved character order): unknown root cause, needs investigation.
48
+
3
49
  ## [2.4.6] — 2026-05-13
4
50
 
5
51
  Two fixes addressing visible-defect classes the corpus verifier (char-ratio + Jaccard) was blind to. User visual inspection of `xiao_2021_crsp.pdf` and `maier_2023_collabra.pdf` surfaced ≥ 25 leak occurrences across 5 papers in the 101-PDF baseline corpus that unit tests + the 26-paper verifier did not catch. New heuristic linter (`scripts/lint_rendered_corpus.py`) quantifies remaining defects: baseline 25 → 1 after v2.4.6 on the targeted set.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docpluck
3
- Version: 2.4.6
3
+ Version: 2.4.7
4
4
  Summary: PDF, DOCX, and HTML text extraction and normalization for academic papers
5
5
  Project-URL: Homepage, https://github.com/giladfeldman/docpluck
6
6
  Project-URL: Documentation, https://github.com/giladfeldman/docpluck/tree/main/docs
@@ -71,7 +71,7 @@ from .figures import Figure
71
71
  from .extract_structured import TABLE_EXTRACTION_VERSION, StructuredResult, extract_pdf_structured
72
72
  from .render import render_pdf_to_markdown
73
73
 
74
- __version__ = "2.4.6"
74
+ __version__ = "2.4.7"
75
75
  __author__ = "Gilad Feldman"
76
76
  __license__ = "MIT"
77
77
 
@@ -649,6 +649,14 @@ _PAGE_FOOTER_LINE_PATTERNS: list[re.Pattern[str]] = [
649
649
  r"^Department\s+of\s+[A-Z][A-Za-z]+(?:\s+and\s+[A-Z][A-Za-z]+)?,\s+"
650
650
  r"University\s+of\s+[A-Z][A-Za-z]+(?:\s+Kong)?,\s+.{2,80}$"
651
651
  ),
652
+ # v2.4.7: journal-footer URLs and volume markers that recur on every
653
+ # page in Nature / Sci Rep / Royal Society OA journals — pdftotext
654
+ # extracts them as standalone lines that leak into body prose.
655
+ re.compile(r"^rsos\.royalsocietypublishing\.org\s*$"),
656
+ re.compile(r"^www\.nature\.com/(?:naturecommunications|scientificreports)\s*$"),
657
+ re.compile(r"^Vol\.:\(\d{10,}\)\s*$"), # "Vol.:(0123456789)" Springer marker
658
+ # v2.4.7: standalone ORCID URL lines.
659
+ re.compile(r"^https?://orcid\.org/\d{4}-\d{4}-\d{4}-[0-9X]{4}\s*$"),
652
660
  ]
653
661
 
654
662
 
@@ -379,6 +379,109 @@ def _join_multiline_caption_paragraphs(text: str) -> str:
379
379
  return "".join(paragraphs)
380
380
 
381
381
 
382
+ # ── Section C3: inline-footnote demotion + study-subsection promotion ──────
383
+
384
+
385
+ _INLINE_FOOTNOTE_RE = re.compile(
386
+ r"^(?P<num>\d{1,2})\s+"
387
+ r"(?P<lead>Though|Note|See|We|This|The|These|Although|However|It\s|Although|For)\b"
388
+ r".{2,210}[\.\)]\s*$"
389
+ )
390
+
391
+
392
+ def _demote_inline_footnotes_to_blockquote(text: str) -> str:
393
+ """Demote leaked inline footnote paragraphs to ``> ¹ ...`` blockquotes.
394
+
395
+ pdftotext renders footnotes at the bottom of each page in linear reading
396
+ order, producing a standalone single-line paragraph like:
397
+
398
+ 1 Though we note a recent failed replication of the Kogut and Ritov
399
+ (2005) by Majumder et al. (2023).
400
+
401
+ These get spliced into body prose because they share a section's char
402
+ window with surrounding paragraphs. This pass detects such lines and
403
+ rewrites them as markdown blockquotes so the reader can still see the
404
+ footnote content but it's visually demoted out of the prose flow.
405
+
406
+ Conservative trigger requires ALL of:
407
+ - The paragraph is exactly one line (no embedded ``\\n``).
408
+ - Length 30-220 chars (real footnotes; longer is prose).
409
+ - Starts with a 1-2 digit number followed by whitespace.
410
+ - First word after the digit is from a small fixed set
411
+ (``Though|Note|See|We|This|The|These|Although|However|It|For``) —
412
+ these dominate academic footnote openings while rarely opening
413
+ non-footnote numbered paragraphs.
414
+ - Ends with a sentence-terminator (``.`` or ``)``).
415
+ """
416
+ if not text:
417
+ return text
418
+ paragraphs = re.split(r"(\n\n+)", text)
419
+ for idx in range(0, len(paragraphs), 2):
420
+ para = paragraphs[idx]
421
+ stripped = para.strip()
422
+ if not stripped or "\n" in stripped:
423
+ continue
424
+ if len(stripped) < 30 or len(stripped) > 220:
425
+ continue
426
+ if not _INLINE_FOOTNOTE_RE.match(stripped):
427
+ continue
428
+ paragraphs[idx] = f"> {stripped}"
429
+ return "".join(paragraphs)
430
+
431
+
432
+ _STUDY_SUBSECTION_RE = re.compile(
433
+ r"^Study\s+\d+\s+"
434
+ r"(?:Design(?:\s+and\s+Findings)?|Results(?:\s+and\s+Findings)?|"
435
+ r"Methods?|Procedure|Materials|Hypotheses|Predictions|Discussion)$"
436
+ )
437
+ _OVERVIEW_HEADING_RE = re.compile(
438
+ r"^Overview\s+of\s+(?:the\s+)?[A-Z][A-Za-z\s]{2,60}$"
439
+ )
440
+
441
+
442
+ def _promote_study_subsection_headings(text: str) -> str:
443
+ """Promote ``Study N Design and Findings`` etc. to ``### {title}``.
444
+
445
+ Replication / multi-study papers (Collabra, Cogemo, JESP) use plain-text
446
+ "Study 1 Design and Findings" lines as subsection headings — same font
447
+ size as body in the PDF, so pdftotext linearizes them as bare lines and
448
+ the section detector doesn't pick them up. This pass promotes them to
449
+ `### Study N Foo` h3 headings.
450
+
451
+ Conservative: only matches a closed set of subsection patterns
452
+ (``Design (and Findings)``, ``Results (and Findings)``, ``Methods``,
453
+ ``Procedure``, ``Materials``, ``Hypotheses``, ``Predictions``,
454
+ ``Discussion``) and the related ``Overview of the …`` line.
455
+
456
+ Operates at the line level (not paragraph level) because pdftotext often
457
+ joins subsection-heading lines with surrounding body using single ``\\n``
458
+ rather than ``\\n\\n``. When a matching line is found inside a multi-line
459
+ paragraph, split the paragraph and promote the line to ``### {title}``
460
+ surrounded by blank lines.
461
+ """
462
+ if not text:
463
+ return text
464
+ lines = text.split("\n")
465
+ out: list[str] = []
466
+ for line in lines:
467
+ stripped = line.strip()
468
+ if not stripped or stripped.startswith("#"):
469
+ out.append(line)
470
+ continue
471
+ if _STUDY_SUBSECTION_RE.match(stripped) or _OVERVIEW_HEADING_RE.match(stripped):
472
+ # Promote with blank-line padding so downstream tools see it as
473
+ # a standalone heading paragraph. Avoid double blank lines.
474
+ if out and out[-1] != "":
475
+ out.append("")
476
+ out.append(f"### {stripped}")
477
+ out.append("")
478
+ else:
479
+ out.append(line)
480
+ cleaned = "\n".join(out)
481
+ cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
482
+ return cleaned
483
+
484
+
382
485
  # ── Section C2: orphan table cell-text suppression ──────────────────────────
383
486
 
384
487
 
@@ -1477,6 +1580,8 @@ def render_pdf_to_markdown(
1477
1580
  md = _fix_hyphenated_line_breaks(md)
1478
1581
  md = _join_multiline_caption_paragraphs(md)
1479
1582
  md = _suppress_orphan_table_cell_text(md)
1583
+ md = _demote_inline_footnotes_to_blockquote(md)
1584
+ md = _promote_study_subsection_headings(md)
1480
1585
  md = _merge_compound_heading_tails(md)
1481
1586
  md = _reformat_jama_key_points_box(md)
1482
1587
  md = _promote_numbered_subsection_headings(md)
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "docpluck"
7
- version = "2.4.6"
7
+ version = "2.4.7"
8
8
  description = "PDF, DOCX, and HTML text extraction and normalization for academic papers"
9
9
  readme = "docs/README.md"
10
10
  requires-python = ">=3.10"
@@ -56,10 +56,25 @@ _LINT_PATTERNS: list[tuple[str, re.Pattern[str], str]] = [
56
56
  (
57
57
  "FN",
58
58
  re.compile(
59
- r"^\d{1,2}\s+(?:Though|Note|See|We)\s+\w.{2,180}[\.\)]\s*$"
59
+ r"^\d{1,2}\s+(?:Though|Note|See|We|This|The|These|Although|However|"
60
+ r"It|For|In|Some|First|Further|Assuming|One|Given|Because)\s+"
61
+ r"\w+\s+\w.{2,180}[\.\)]\s*$"
60
62
  ),
61
63
  "Inline footnote leaked as standalone paragraph",
62
64
  ),
65
+ (
66
+ "OR",
67
+ re.compile(r"^https?://orcid\.org/\d{4}-\d{4}-\d{4}-[0-9X]{4}\s*$"),
68
+ "Standalone ORCID URL",
69
+ ),
70
+ (
71
+ "JF",
72
+ re.compile(
73
+ r"^(?:Vol\.:\(\d+\)|rsos\.royalsocietypublishing\.org|"
74
+ r"www\.nature\.com/(?:naturecommunications|scientificreports))\s*$"
75
+ ),
76
+ "Journal-footer URL or vol marker leaked into body",
77
+ ),
63
78
  ]
64
79
 
65
80
 
@@ -480,6 +480,42 @@ class TestP0_RunningHeaderFooterPatterns_v246:
480
480
  assert "Department of Psychology, University of Hong Kong" not in result
481
481
  assert "Body content here." in result
482
482
 
483
def test_rsos_footer_url_stripped(self):
    """A standalone rsos.royalsocietypublishing.org line is dropped; body stays."""
    footer = "rsos.royalsocietypublishing.org"
    raw = f"Body sentence one.\n{footer}\nBody sentence two.\n"
    cleaned = norm(raw, "standard")
    assert footer not in cleaned
    # Both neighbouring body sentences must survive the footer strip.
    for sentence in ("Body sentence one.", "Body sentence two."):
        assert sentence in cleaned
493
+
494
def test_nature_footer_url_stripped(self):
    """Both www.nature.com journal footer lines are removed from the output."""
    footers = (
        "www.nature.com/naturecommunications",
        "www.nature.com/scientificreports",
    )
    # Interleave the footers with body lines; the trailing "" yields the
    # final newline of the input.
    raw = "\n".join(
        ["Body.", footers[0], "More body.", footers[1], "Yet more.", ""]
    )
    cleaned = norm(raw, "standard")
    for footer in footers:
        assert footer not in cleaned
    assert "Body." in cleaned
506
+
507
def test_springer_vol_marker_stripped(self):
    """A standalone ``Vol.:(0123456789)`` marker line is removed."""
    marker = "Vol.:(0123456789)"
    cleaned = norm(f"Body.\n{marker}\nMore body.\n", "standard")
    assert marker not in cleaned
    assert "Body." in cleaned
512
+
513
def test_orcid_url_stripped(self):
    """A standalone ORCID URL line is removed while body text is kept."""
    orcid_line = "https://orcid.org/0000-0002-1234-5678"
    cleaned = norm(f"Body.\n{orcid_line}\nMore body.\n", "standard")
    # Checked without the scheme, so a partially stripped URL would also fail.
    assert "orcid.org/0000-0002-1234-5678" not in cleaned
    assert "Body." in cleaned
518
+
483
519
  def test_affiliation_line_preserved_in_prose_context(self):
484
520
  """The Dept/University pattern must only match standalone lines, not
485
521
  prose mentioning the affiliation mid-sentence."""
@@ -15,6 +15,8 @@ from docpluck.render import (
15
15
  _promote_numbered_subsection_headings,
16
16
  _reformat_jama_key_points_box,
17
17
  _suppress_orphan_table_cell_text,
18
+ _demote_inline_footnotes_to_blockquote,
19
+ _promote_study_subsection_headings,
18
20
  _apply_title_rescue,
19
21
  _strip_duplicate_title_occurrences,
20
22
  )
@@ -249,6 +251,104 @@ def test_suppress_orphan_table_cell_text_noop_when_no_table_caption():
249
251
  assert _suppress_orphan_table_cell_text(text) == text
250
252
 
251
253
 
254
+ # ── _demote_inline_footnotes_to_blockquote ──────────────────────────────────
255
+
256
+
257
def test_footnote_demoted_to_blockquote():
    """A one-line footnote paragraph gains a ``> `` prefix; neighbours survive."""
    before = "Body prose paragraph one."
    footnote = (
        "1 Though we note a recent failed replication of the Kogut and "
        "Ritov (2005) by Majumder et al. (2023)."
    )
    after = "Body prose paragraph two."
    rendered = _demote_inline_footnotes_to_blockquote(
        "\n\n".join((before, footnote, after))
    )
    assert "> 1 Though we note a recent failed replication" in rendered
    assert before in rendered
    assert after in rendered
268
+
269
+
270
def test_footnote_demoter_preserves_real_numbered_list_item():
    """``1.`` list items are not footnotes and must not be blockquoted."""
    list_item = "1. First numbered point in a list."
    source = "\n\n".join(("Some context.", list_item, "More prose."))
    rendered = _demote_inline_footnotes_to_blockquote(source)
    # The footnote pattern wants "<digits><space><word>"; the period right
    # after the digit keeps a list item from matching.
    assert "1. First numbered point" in rendered
    assert "> 1. First numbered point" not in rendered
280
+
281
+
282
def test_footnote_demoter_skips_short_paragraphs():
    """Paragraphs shorter than 30 characters are left exactly as they were."""
    source = "\n\n".join(("Context.", "2 Note.", "More."))
    # "2 Note." is well under the minimum footnote length, so nothing changes.
    assert _demote_inline_footnotes_to_blockquote(source) == source
287
+
288
+
289
def test_footnote_demoter_idempotent():
    """Running the demoter on its own output changes nothing."""
    footnote = (
        "1 Though we note this is a footnote that has been demoted already "
        "by a previous pass through the pipeline."
    )
    source = "\n\n".join(("Body.", footnote, "More body."))
    first_pass = _demote_inline_footnotes_to_blockquote(source)
    second_pass = _demote_inline_footnotes_to_blockquote(first_pass)
    # Once prefixed with "> ", the paragraph no longer starts with a digit,
    # so the second pass has nothing left to rewrite.
    assert first_pass == second_pass
300
+
301
+
302
+ # ── _promote_study_subsection_headings ──────────────────────────────────────
303
+
304
+
305
def test_study_subsection_heading_promoted():
    """A bare ``Study 1 Design and Findings`` line becomes an h3 heading."""
    source = "\n\n".join(
        ("Some intro.", "Study 1 Design and Findings", "In Study 1 we examined...")
    )
    rendered = _promote_study_subsection_headings(source)
    assert "### Study 1 Design and Findings" in rendered
    # The body text following the heading must be preserved.
    assert "In Study 1 we examined" in rendered
314
+
315
+
316
def test_study_subsection_multiple_variants_promoted():
    """Each supported heading variant in one document is promoted to h3."""
    headings = (
        "Study 3 Design and Findings",
        "Study 2 Results",
        "Overview of the Replication and Extension",
    )
    # One-letter filler paragraphs separate the candidate heading lines.
    source = "\n\n".join(
        ("x", headings[0], "y", headings[1], "z", headings[2], "w")
    )
    rendered = _promote_study_subsection_headings(source)
    for heading in headings:
        assert f"### {heading}" in rendered
330
+
331
+
332
def test_study_subsection_skip_existing_heading():
    """A line that is already a ``###`` heading is not prefixed a second time."""
    rendered = _promote_study_subsection_headings(
        "### Study 1 Design and Findings\n\nbody"
    )
    assert "### ### Study 1" not in rendered
    assert "### Study 1 Design and Findings" in rendered
338
+
339
+
340
def test_study_subsection_skip_unrelated_prose():
    """A sentence that merely mentions ``Study 1 design`` is left untouched."""
    source = "\n\n".join(
        (
            "We summarize Study 1 design and the procedure used in our work.",
            "More prose.",
        )
    )
    rendered = _promote_study_subsection_headings(source)
    # The heading patterns are anchored to a whole line that starts with
    # capital-S "Study N <token>", so a mid-sentence mention never matches.
    assert "### We summarize" not in rendered
    assert rendered == source
350
+
351
+
252
352
  # ── _reformat_jama_key_points_box ──────────────────────────────────────────
253
353
 
254
354
 
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes