docpluck 2.4.8__tar.gz → 2.4.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (275) hide show
  1. {docpluck-2.4.8 → docpluck-2.4.9}/CHANGELOG.md +20 -0
  2. {docpluck-2.4.8 → docpluck-2.4.9}/PKG-INFO +1 -1
  3. {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/__init__.py +1 -1
  4. {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/render.py +32 -0
  5. {docpluck-2.4.8 → docpluck-2.4.9}/pyproject.toml +1 -1
  6. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_render.py +38 -10
  7. docpluck-2.4.8/docpluck/__init__.py.tmp.54476.1778653086029 +0 -114
  8. {docpluck-2.4.8 → docpluck-2.4.9}/.claude/skills/_project/lessons.md +0 -0
  9. {docpluck-2.4.8 → docpluck-2.4.9}/.claude/skills/docpluck-cleanup/SKILL.md +0 -0
  10. {docpluck-2.4.8 → docpluck-2.4.9}/.claude/skills/docpluck-deploy/SKILL.md +0 -0
  11. {docpluck-2.4.8 → docpluck-2.4.9}/.claude/skills/docpluck-qa/SKILL.md +0 -0
  12. {docpluck-2.4.8 → docpluck-2.4.9}/.claude/skills/docpluck-qa/references/benchmark-mode.md +0 -0
  13. {docpluck-2.4.8 → docpluck-2.4.9}/.claude/skills/docpluck-qa/references/check-11-hard-rules.md +0 -0
  14. {docpluck-2.4.8 → docpluck-2.4.9}/.claude/skills/docpluck-qa/references/check-13-escicheck-production.md +0 -0
  15. {docpluck-2.4.8 → docpluck-2.4.9}/.claude/skills/docpluck-qa/references/check-5-escicheck-library.md +0 -0
  16. {docpluck-2.4.8 → docpluck-2.4.9}/.claude/skills/docpluck-qa/references/check-6-escicheck-local-webapp.md +0 -0
  17. {docpluck-2.4.8 → docpluck-2.4.9}/.claude/skills/docpluck-qa/references/check-7-batch-smoke.md +0 -0
  18. {docpluck-2.4.8 → docpluck-2.4.9}/.claude/skills/docpluck-review/SKILL.md +0 -0
  19. {docpluck-2.4.8 → docpluck-2.4.9}/.github/workflows/publish.yml +0 -0
  20. {docpluck-2.4.8 → docpluck-2.4.9}/.github/workflows/test.yml +0 -0
  21. {docpluck-2.4.8 → docpluck-2.4.9}/.gitignore +0 -0
  22. {docpluck-2.4.8 → docpluck-2.4.9}/CLAUDE.md +0 -0
  23. {docpluck-2.4.8 → docpluck-2.4.9}/HANDOFF_SECTIONS_APP_INTEGRATION.md +0 -0
  24. {docpluck-2.4.8 → docpluck-2.4.9}/LESSONS.md +0 -0
  25. {docpluck-2.4.8 → docpluck-2.4.9}/LICENSE +0 -0
  26. {docpluck-2.4.8 → docpluck-2.4.9}/REPLY_FROM_DOCPLUCK_v1.4.5.md +0 -0
  27. {docpluck-2.4.8 → docpluck-2.4.9}/REPLY_FROM_DOCPLUCK_v1.5.0.md +0 -0
  28. {docpluck-2.4.8 → docpluck-2.4.9}/REQUEST_08_CHUNKING_ENDPOINT.md +0 -0
  29. {docpluck-2.4.8 → docpluck-2.4.9}/REQUEST_09_REFERENCE_LIST_NORMALIZATION.md +0 -0
  30. {docpluck-2.4.8 → docpluck-2.4.9}/TODO.md +0 -0
  31. {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/__main__.py +0 -0
  32. {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/batch.py +0 -0
  33. {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/cli.py +0 -0
  34. {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/extract.py +0 -0
  35. {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/extract_docx.py +0 -0
  36. {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/extract_html.py +0 -0
  37. {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/extract_layout.py +0 -0
  38. {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/extract_structured.py +0 -0
  39. {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/figures/__init__.py +0 -0
  40. {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/figures/detect.py +0 -0
  41. {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/normalize.py +0 -0
  42. {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/quality.py +0 -0
  43. {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/sections/__init__.py +0 -0
  44. {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/sections/annotators/__init__.py +0 -0
  45. {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/sections/annotators/docx.py +0 -0
  46. {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/sections/annotators/html.py +0 -0
  47. {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/sections/annotators/pdf.py +0 -0
  48. {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/sections/annotators/text.py +0 -0
  49. {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/sections/blocks.py +0 -0
  50. {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/sections/boundaries.py +0 -0
  51. {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/sections/core.py +0 -0
  52. {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/sections/taxonomy.py +0 -0
  53. {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/sections/types.py +0 -0
  54. {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/tables/__init__.py +0 -0
  55. {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/tables/bbox_utils.py +0 -0
  56. {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/tables/camelot_extract.py +0 -0
  57. {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/tables/captions.py +0 -0
  58. {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/tables/cell_cleaning.py +0 -0
  59. {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/tables/cluster.py +0 -0
  60. {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/tables/confidence.py +0 -0
  61. {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/tables/detect.py +0 -0
  62. {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/tables/render.py +0 -0
  63. {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/tables/whitespace.py +0 -0
  64. {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/version.py +0 -0
  65. {docpluck-2.4.8 → docpluck-2.4.9}/docs/BENCHMARKS.md +0 -0
  66. {docpluck-2.4.8 → docpluck-2.4.9}/docs/DESIGN.md +0 -0
  67. {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-07_sections_strict_iteration.md +0 -0
  68. {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-09_session_state_and_followups.md +0 -0
  69. {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-09_unified_extraction_brainstorm.md +0 -0
  70. {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-10_table_rendering_iteration.md +0 -0
  71. {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-10_table_rendering_iteration_2.md +0 -0
  72. {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-10_table_rendering_iteration_3.md +0 -0
  73. {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-10_table_rendering_iteration_4.md +0 -0
  74. {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-10_table_rendering_iteration_5.md +0 -0
  75. {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-10_table_rendering_iteration_6.md +0 -0
  76. {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-10_table_rendering_iteration_7.md +0 -0
  77. {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-11_PROMOTE_SPIKE_TO_LIBRARY.md +0 -0
  78. {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-11_table_rendering_iteration_8.md +0 -0
  79. {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-11_visual_review_findings.md +0 -0
  80. {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-12_phase2_101pdf_corpus.md +0 -0
  81. {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-12_remaining_ui_and_chrome_verification.md +0 -0
  82. {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-12_visual_verify_results.md +0 -0
  83. {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-13_apa_50_expansion.md +0 -0
  84. {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-13_apa_50_expansion_iter_1.md +0 -0
  85. {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-13_apa_50_expansion_iter_2.md +0 -0
  86. {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-13_iterative_1.md +0 -0
  87. {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-13_iterative_library_improvement.md +0 -0
  88. {docpluck-2.4.8 → docpluck-2.4.9}/docs/NORMALIZATION.md +0 -0
  89. {docpluck-2.4.8 → docpluck-2.4.9}/docs/README.md +0 -0
  90. {docpluck-2.4.8 → docpluck-2.4.9}/docs/TRIAGE_2026-05-10_corpus_assessment.md +0 -0
  91. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/2026-05-06-section-identification.md +0 -0
  92. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/2026-05-06-table-extraction.md +0 -0
  93. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/2026-05-07-sections-strict-iteration-progress.md +0 -0
  94. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/2026-05-08-unified-extraction-phase-0-splice-spike.md +0 -0
  95. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/sections-deferred-items.md +0 -0
  96. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/sections-issues-backlog.md +0 -0
  97. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/2026-05-07_spot-01_apa.md +0 -0
  98. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/2026-05-07_spot-02_pattern-A-shipped.md +0 -0
  99. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/2026-05-08_spot-final_all-styles.md +0 -0
  100. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/COMPARISON.md +0 -0
  101. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/korbmacher_table1.md +0 -0
  102. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/option-a.py +0 -0
  103. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/ziano_table1.md +0 -0
  104. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_notes_raw.txt +0 -0
  105. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_table1.md +0 -0
  106. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/notes.md +0 -0
  107. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/option-b.py +0 -0
  108. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_notes_raw.txt +0 -0
  109. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_table1.md +0 -0
  110. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/korbmacher_table1.md +0 -0
  111. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/notes.md +0 -0
  112. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/option-c.py +0 -0
  113. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/sample-pdftotext-bbox.html +0 -0
  114. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/ziano_table1.md +0 -0
  115. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/korbmacher_table1.md +0 -0
  116. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/notes.md +0 -0
  117. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/option-d.py +0 -0
  118. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/ziano_table1.md +0 -0
  119. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_2022_kruger_bbox.html +0 -0
  120. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_bbox.html +0 -0
  121. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_table1.md +0 -0
  122. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/option-e.py +0 -0
  123. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/sample-bbox.html +0 -0
  124. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_2021_joep_bbox.html +0 -0
  125. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_bbox.html +0 -0
  126. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_table1.md +0 -0
  127. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/html-fallback-demo.md +0 -0
  128. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.err +0 -0
  129. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.md +0 -0
  130. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.err +0 -0
  131. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.md +0 -0
  132. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.err +0 -0
  133. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.md +0 -0
  134. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.err +0 -0
  135. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.md +0 -0
  136. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.err +0 -0
  137. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.md +0 -0
  138. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.err +0 -0
  139. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.md +0 -0
  140. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.err +0 -0
  141. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.md +0 -0
  142. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.err +0 -0
  143. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.md +0 -0
  144. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.err +0 -0
  145. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.md +0 -0
  146. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.err +0 -0
  147. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.md +0 -0
  148. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.err +0 -0
  149. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.md +0 -0
  150. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.err +0 -0
  151. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.md +0 -0
  152. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.err +0 -0
  153. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.md +0 -0
  154. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.err +0 -0
  155. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.md +0 -0
  156. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.err +0 -0
  157. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.md +0 -0
  158. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.err +0 -0
  159. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.md +0 -0
  160. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.err +0 -0
  161. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.md +0 -0
  162. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.err +0 -0
  163. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.md +0 -0
  164. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.err +0 -0
  165. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.md +0 -0
  166. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.err +0 -0
  167. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.md +0 -0
  168. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.err +0 -0
  169. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.md +0 -0
  170. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.err +0 -0
  171. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.md +0 -0
  172. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.err +0 -0
  173. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.md +0 -0
  174. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.err +0 -0
  175. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.md +0 -0
  176. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.err +0 -0
  177. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.md +0 -0
  178. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.err +0 -0
  179. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.md +0 -0
  180. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/papers.md +0 -0
  181. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/report.md +0 -0
  182. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/splice_spike.py +0 -0
  183. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/test_splice_spike.py +0 -0
  184. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/specs/2026-04-27-request-09-reference-normalization-design.md +0 -0
  185. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/specs/2026-05-06-section-identification-design.md +0 -0
  186. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/specs/2026-05-06-table-extraction-design.md +0 -0
  187. {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/specs/2026-05-08-unified-extraction-design.md +0 -0
  188. {docpluck-2.4.8 → docpluck-2.4.9}/scripts/lint_rendered_corpus.py +0 -0
  189. {docpluck-2.4.8 → docpluck-2.4.9}/scripts/verify_corpus.py +0 -0
  190. {docpluck-2.4.8 → docpluck-2.4.9}/scripts/verify_corpus_full.py +0 -0
  191. {docpluck-2.4.8 → docpluck-2.4.9}/tests/__init__.py +0 -0
  192. {docpluck-2.4.8 → docpluck-2.4.9}/tests/conftest.py +0 -0
  193. {docpluck-2.4.8 → docpluck-2.4.9}/tests/fixtures/__init__.py +0 -0
  194. {docpluck-2.4.8 → docpluck-2.4.9}/tests/fixtures/sections/__init__.py +0 -0
  195. {docpluck-2.4.8 → docpluck-2.4.9}/tests/fixtures/sections/builders.py +0 -0
  196. {docpluck-2.4.8 → docpluck-2.4.9}/tests/fixtures/structured/.gitkeep +0 -0
  197. {docpluck-2.4.8 → docpluck-2.4.9}/tests/fixtures/structured/MANIFEST.json +0 -0
  198. {docpluck-2.4.8 → docpluck-2.4.9}/tests/fixtures/structured/README.md +0 -0
  199. {docpluck-2.4.8 → docpluck-2.4.9}/tests/golden/sections/apa_multi_study_pdf.json +0 -0
  200. {docpluck-2.4.8 → docpluck-2.4.9}/tests/golden/sections/apa_single_study_pdf.json +0 -0
  201. {docpluck-2.4.8 → docpluck-2.4.9}/tests/golden/sections/html_real_headings.json +0 -0
  202. {docpluck-2.4.8 → docpluck-2.4.9}/tests/snapshots/amj_lattice.txt +0 -0
  203. {docpluck-2.4.8 → docpluck-2.4.9}/tests/snapshots/apa_chan_feldman_lineless.txt +0 -0
  204. {docpluck-2.4.8 → docpluck-2.4.9}/tests/snapshots/apa_chen_jesp_lineless.txt +0 -0
  205. {docpluck-2.4.8 → docpluck-2.4.9}/tests/snapshots/apa_efendic_affect.txt +0 -0
  206. {docpluck-2.4.8 → docpluck-2.4.9}/tests/snapshots/apa_ip_feldman_pspb.txt +0 -0
  207. {docpluck-2.4.8 → docpluck-2.4.9}/tests/snapshots/bmc_lattice.txt +0 -0
  208. {docpluck-2.4.8 → docpluck-2.4.9}/tests/snapshots/ieee_figure_heavy.txt +0 -0
  209. {docpluck-2.4.8 → docpluck-2.4.9}/tests/snapshots/ieee_lattice.txt +0 -0
  210. {docpluck-2.4.8 → docpluck-2.4.9}/tests/snapshots/jama_lattice.txt +0 -0
  211. {docpluck-2.4.8 → docpluck-2.4.9}/tests/snapshots/nat_comms_figure_only.txt +0 -0
  212. {docpluck-2.4.8 → docpluck-2.4.9}/tests/snapshots/nature_minimal_rule.txt +0 -0
  213. {docpluck-2.4.8 → docpluck-2.4.9}/tests/snapshots/scirep_minimal_rule.txt +0 -0
  214. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_bbox_utils.py +0 -0
  215. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_benchmark_docx_html.py +0 -0
  216. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_caption_regex.py +0 -0
  217. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_cli_sections.py +0 -0
  218. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_cli_structured.py +0 -0
  219. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_confidence.py +0 -0
  220. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_corpus_smoke.py +0 -0
  221. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_d5_normalization_audit.py +0 -0
  222. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_edge_cases.py +0 -0
  223. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_extract_docx.py +0 -0
  224. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_extract_filter_sugar.py +0 -0
  225. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_extract_html.py +0 -0
  226. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_extract_layout.py +0 -0
  227. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_extract_pdf_structured.py +0 -0
  228. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_extraction.py +0 -0
  229. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_f0_table_region_aware.py +0 -0
  230. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_figure_detect.py +0 -0
  231. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_fixtures_manifest.py +0 -0
  232. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_lattice_cluster.py +0 -0
  233. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_metaesci_followups.py +0 -0
  234. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_normalization.py +0 -0
  235. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_normalize_f0_footnote_strip.py +0 -0
  236. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_normalize_layout_param.py +0 -0
  237. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_normalize_report_layout_fields.py +0 -0
  238. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_normalize_v18_strips.py +0 -0
  239. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_quality.py +0 -0
  240. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_render_html.py +0 -0
  241. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_request_09_reference_normalization.py +0 -0
  242. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_boundaries.py +0 -0
  243. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_boundary_truncation.py +0 -0
  244. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_core_partition.py +0 -0
  245. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_docx_annotator.py +0 -0
  246. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_extract_text.py +0 -0
  247. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_footnote_section.py +0 -0
  248. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_golden.py +0 -0
  249. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_html_annotator.py +0 -0
  250. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_pdf_annotator.py +0 -0
  251. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_public_api.py +0 -0
  252. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_real_corpus.py +0 -0
  253. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_taxonomy.py +0 -0
  254. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_text_annotator.py +0 -0
  255. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_types.py +0 -0
  256. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_unit_corpus.py +0 -0
  257. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_v161_coalesce.py +0 -0
  258. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_v161_subheadings.py +0 -0
  259. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_v161_taxonomy.py +0 -0
  260. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_v161_text_annotator.py +0 -0
  261. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_version.py +0 -0
  262. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_smoke_fixtures.py +0 -0
  263. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_structured_result_type.py +0 -0
  264. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_structured_types.py +0 -0
  265. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_structured_version.py +0 -0
  266. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_table_detect.py +0 -0
  267. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_tables_cell_cleaning.py +0 -0
  268. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_text_mode.py +0 -0
  269. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_v23_1_fixes.py +0 -0
  270. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_v23_bug_fixes.py +0 -0
  271. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_v23_post_corpus.py +0 -0
  272. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_v23_post_corpus_v2.py +0 -0
  273. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_v2_backwards_compat.py +0 -0
  274. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_v2_top_level_exports.py +0 -0
  275. {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_whitespace_cluster.py +0 -0
@@ -1,5 +1,25 @@
1
1
  # Changelog
2
2
 
3
+ ## [2.4.9] — 2026-05-13
4
+
5
+ Regression hotfix for v2.4.8's `_demote_false_single_word_headings`. The 26-paper baseline gate caught it: ar_royal_society_rsos_140066 + ar_royal_society_rsos_140072 dropped from 4 → 2 sections because `## Discussion`/`## References` got demoted (the next non-blank line started with a lowercase letter, `of this study...`, or with a digit, `1. Öhman A...`, and both were treated as mid-sentence continuations).
6
+
7
+ ### Fix
8
+
9
+ 1. **`docpluck/render.py::_demote_false_single_word_headings`** —
10
+ - Added `_STRONG_SECTION_NAMES` allowlist: abstract / introduction / background / method / methods / materials / results / discussion / discussions / conclusion / conclusions / references / bibliography / acknowledgments / acknowledgements / funding / limitations / supplementary / appendix / keywords. A heading whose single word matches one of these (compared case-insensitively) is NEVER demoted — it is an authoritative section marker.
11
+ - Added numbered-subsection guard: if next line matches `^\d+(?:\.\d+){1,3}\.?\s+\w` (e.g., `3.1. Subjects`, `3.1.2. Foo`), the heading stays — the numbered subsection is legitimate body content.
12
+
13
+ ### Tests
14
+
15
+ - 4 new tests in `tests/test_render.py` (strong-section preservation for Results / Discussion / References, non-canonical word like ``Theory`` still demoted, numbered-subsection guard).
16
+ - 55 render tests PASS.
17
+ - **26-paper baseline: 26/26 PASS** (vs v2.4.8: 24/26).
18
+
19
+ ### Bumps
20
+
21
+ - `__version__`: `2.4.8` → `2.4.9`. Patch.
22
+
3
23
  ## [2.4.8] — 2026-05-13
4
24
 
5
25
  Massive defect-class sweep informed by 8 parallel subagent audits. Highest-impact item: a render-level false-heading demoter that addresses 197 false `## Word` / `### Word` headings (24% of all single-word headings in the v2.4.0 101-paper corpus) where pdftotext split a single line ("Results of Study 1") across a column wrap.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docpluck
3
- Version: 2.4.8
3
+ Version: 2.4.9
4
4
  Summary: PDF, DOCX, and HTML text extraction and normalization for academic papers
5
5
  Project-URL: Homepage, https://github.com/giladfeldman/docpluck
6
6
  Project-URL: Documentation, https://github.com/giladfeldman/docpluck/tree/main/docs
@@ -71,7 +71,7 @@ from .figures import Figure
71
71
  from .extract_structured import TABLE_EXTRACTION_VERSION, StructuredResult, extract_pdf_structured
72
72
  from .render import render_pdf_to_markdown
73
73
 
74
- __version__ = "2.4.8"
74
+ __version__ = "2.4.9"
75
75
  __author__ = "Gilad Feldman"
76
76
  __license__ = "MIT"
77
77
 
@@ -384,6 +384,20 @@ def _join_multiline_caption_paragraphs(text: str) -> str:
384
384
 
385
385
  _FALSE_HEADING_RE = re.compile(r"^(#{2,3})\s+(?P<word>[A-Z][A-Za-z]{2,12})\s*$")
386
386
 
387
+ # Strong canonical section names — never demote even when followed by a
388
+ # lowercase or digit continuation. These are unambiguous section markers
389
+ # whose authoritative source is the document structure, not the surrounding
390
+ # prose. The RSOS-family regression (introduced in v2.4.8, fixed in v2.4.9) showed that ``## Discussion``
391
+ # followed by body prose starting with ``of this study...`` got demoted —
392
+ # losing the section. Same for ``## References\n\n1. Öhman A...``.
393
+ _STRONG_SECTION_NAMES = frozenset({
394
+ "abstract", "introduction", "background", "methods", "method",
395
+ "materials", "results", "discussion", "discussions", "conclusion",
396
+ "conclusions", "references", "bibliography", "acknowledgments",
397
+ "acknowledgements", "funding", "limitations", "supplementary",
398
+ "appendix", "keywords",
399
+ })
400
+
387
401
 
388
402
  def _demote_false_single_word_headings(text: str) -> str:
389
403
  """Demote ``## Word`` / ``### Word`` lines that are mid-prose continuations.
@@ -421,6 +435,14 @@ def _demote_false_single_word_headings(text: str) -> str:
421
435
  out.append(line)
422
436
  i += 1
423
437
  continue
438
+ # v2.4.9: never demote strong canonical section names. The body
439
+ # text following `## Discussion` or `## References` can start with
440
+ # lowercase prose / numbered list ("of this study...", "1. Öhman A..."),
441
+ # but the heading itself is authoritative.
442
+ if m.group("word").lower() in _STRONG_SECTION_NAMES:
443
+ out.append(line)
444
+ i += 1
445
+ continue
424
446
  # Find the next non-blank line.
425
447
  j = i + 1
426
448
  while j < len(lines) and not lines[j].strip():
@@ -435,6 +457,16 @@ def _demote_false_single_word_headings(text: str) -> str:
435
457
  # original heading line (``Results of Study 1`` → ``## Results`` +
436
458
  # ``of Study 1``). Skip the lookahead for proper-sentence starts.
437
459
  first_char = next_line[:1]
460
+ # v2.4.9: don't demote when the next line is a numbered subsection
461
+ # (``3.1. Subjects``, ``3.1 Subjects``, ``4.1. Do seasonal``).
462
+ # Royal Society RSOS papers use ``## Methods\n\n3.1. Subjects`` as
463
+ # a legitimate section + numbered-subsection structure. The
464
+ # `_promote_numbered_subsection_headings` post-processor will lift
465
+ # those into ``### 3.1 Subjects`` headings.
466
+ if re.match(r"^\d+(?:\.\d+){1,3}\.?\s+\w", next_line):
467
+ out.append(line)
468
+ i += 1
469
+ continue
438
470
  is_continuation = bool(
439
471
  first_char and (first_char.islower() or first_char.isdigit())
440
472
  )
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "docpluck"
7
- version = "2.4.8"
7
+ version = "2.4.9"
8
8
  description = "PDF, DOCX, and HTML text extraction and normalization for academic papers"
9
9
  readme = "docs/README.md"
10
10
  requires-python = ">=3.10"
@@ -353,24 +353,35 @@ def test_study_subsection_skip_unrelated_prose():
353
353
  # ── _demote_false_single_word_headings ──────────────────────────────────────
354
354
 
355
355
 
356
- def test_false_heading_demoted_when_next_line_is_continuation_of():
356
+ def test_strong_section_heading_results_preserved_with_continuation_text():
357
+ """v2.4.9 regression fix: ``## Results`` is a strong canonical section;
358
+ even if pdftotext rendered the body starting with lowercase ``of Study 1``,
359
+ the heading stays — the body keeps its (slightly weird) opening, but the
360
+ section structure survives."""
357
361
  text = "## Results\n\nof Study 1 showed significant effects."
358
362
  out = _demote_false_single_word_headings(text)
359
- assert "## Results" not in out
360
- assert "Results of Study 1 showed significant effects." in out
363
+ assert "## Results" in out
364
+
365
+
366
+ def test_strong_section_heading_discussion_preserved():
367
+ text = "## Discussion\n\nof this study apparently present evidence against."
368
+ out = _demote_false_single_word_headings(text)
369
+ assert "## Discussion" in out
361
370
 
362
371
 
363
- def test_false_heading_demoted_when_next_line_starts_lowercase():
364
- text = "## Discussion\n\nsection of the article was extensive."
372
+ def test_strong_section_heading_references_preserved_with_numbered_list():
373
+ text = "## References\n\n1. Öhman A, Lundqvist D, Esteves F. 2001 The face in the crowd."
365
374
  out = _demote_false_single_word_headings(text)
366
- assert "## Discussion" not in out
367
- assert "Discussion section of the article" in out
375
+ assert "## References" in out
368
376
 
369
377
 
370
- def test_false_heading_demoted_when_next_line_starts_digit():
371
- text = "## References\n\n1. Author, A. (2023). Title."
378
+ def test_false_heading_demoted_for_non_canonical_word():
379
+ """A non-canonical single-word heading (``## Theory``) followed by
380
+ lowercase continuation IS demoted (v2.4.8 behavior preserved)."""
381
+ text = "### Theory\n\nof the firm: managerial implications follow."
372
382
  out = _demote_false_single_word_headings(text)
373
- assert "## References\n\n1." not in out
383
+ assert "### Theory" not in out
384
+ assert "Theory of the firm" in out
374
385
 
375
386
 
376
387
  def test_legit_heading_preserved_when_next_line_capitalized_sentence():
@@ -400,6 +411,23 @@ def test_false_heading_demoter_idempotent():
400
411
  assert once == twice
401
412
 
402
413
 
414
+ def test_false_heading_preserved_when_next_line_is_numbered_subsection():
415
+ """v2.4.9 regression fix: RSOS-style ``## Methods\\n\\n3.1. Subjects``
416
+ must keep the heading + numbered subsection intact. Demoting here
417
+ would destroy the section structure."""
418
+ text = "## Methods\n\n3.1. Subjects and study site\n\nWe sampled..."
419
+ out = _demote_false_single_word_headings(text)
420
+ assert "## Methods" in out
421
+ assert "3.1. Subjects and study site" in out
422
+
423
+
424
+ def test_false_heading_preserved_with_4digit_numbered_subsection():
425
+ text = "## Results\n\n4.1. Do seasonal challenges affect...\n\nResults follow."
426
+ out = _demote_false_single_word_headings(text)
427
+ assert "## Results" in out
428
+ assert "4.1. Do seasonal challenges affect..." in out
429
+
430
+
403
431
  # ── _reformat_jama_key_points_box ──────────────────────────────────────────
404
432
 
405
433
 
@@ -1,114 +0,0 @@
1
- """
2
- docpluck — PDF, DOCX, and HTML text extraction and normalization for academic papers
3
- ====================================================================================
4
-
5
- A Python library for extracting and normalizing text from academic documents.
6
- Built from cross-project lessons across 8,000+ PDFs from psychology, medicine,
7
- economics, physics, and biology.
8
-
9
- Supports:
10
- - **PDF** via pdftotext (default mode, with pdfplumber SMP fallback)
11
- - **DOCX** via mammoth (DOCX → HTML → text, preserves soft breaks)
12
- - **HTML** via beautifulsoup4 + lxml (custom block/inline-aware tree-walk)
13
-
14
- Quick start::
15
-
16
- from docpluck import extract_pdf, extract_docx, extract_html
17
- from docpluck import normalize_text, NormalizationLevel, compute_quality_score
18
-
19
- # PDF
20
- with open("paper.pdf", "rb") as f:
21
- text, method = extract_pdf(f.read())
22
-
23
- # DOCX (requires: pip install docpluck[docx])
24
- with open("paper.docx", "rb") as f:
25
- text, method = extract_docx(f.read())
26
-
27
- # HTML (requires: pip install docpluck[html])
28
- with open("paper.html", "rb") as f:
29
- text, method = extract_html(f.read())
30
-
31
- # Normalization and quality scoring work on text from any source
32
- normalized, report = normalize_text(text, NormalizationLevel.academic)
33
- quality = compute_quality_score(normalized)
34
-
35
- print(f"Method: {method}")
36
- print(f"Quality: {quality['score']}/100 ({quality['confidence']})")
37
- print(f"Steps applied: {report.steps_applied}")
38
-
39
- Installation::
40
-
41
- pip install docpluck # PDF only (pdfplumber)
42
- pip install docpluck[docx] # + mammoth
43
- pip install docpluck[html] # + beautifulsoup4 + lxml
44
- pip install docpluck[all] # everything
45
-
46
- # extract_pdf() also requires poppler-utils:
47
- # Linux/WSL: apt-get install poppler-utils
48
- # macOS: brew install poppler
49
- # Windows: https://github.com/oschwartz10612/poppler-windows/releases
50
-
51
- See Also:
52
- - docs/README.md — Full usage guide and API reference
53
- - docs/DESIGN.md — Implementation decisions and rationale
54
- - docs/BENCHMARKS.md — Benchmark results across all supported formats
55
- - docs/NORMALIZATION.md — All 15 pipeline steps documented
56
- """
57
-
58
- from .extract import extract_pdf, extract_pdf_file, count_pages
59
- from .extract_docx import extract_docx
60
- from .extract_html import extract_html, html_to_text
61
- from .normalize import normalize_text, NormalizationLevel, NormalizationReport
62
- from .quality import compute_quality_score
63
- from .batch import ExtractionReport, extract_to_dir
64
- from .version import get_version_info
65
- from .sections import (
66
- extract_sections, SectionedDocument, Section,
67
- SectionLabel, Confidence, DetectedVia, SECTIONING_VERSION,
68
- )
69
- from .tables import Cell, Table
70
- from .figures import Figure
71
- from .extract_structured import TABLE_EXTRACTION_VERSION, StructuredResult, extract_pdf_structured
72
- from .render import render_pdf_to_markdown
73
-
74
- __version__ = "2.4.8"
75
- __author__ = "Gilad Feldman"
76
- __license__ = "MIT"
77
-
78
- __all__ = [
79
- # Extraction
80
- "extract_pdf",
81
- "extract_pdf_file",
82
- "extract_docx",
83
- "extract_html",
84
- "html_to_text",
85
- "count_pages",
86
- # Normalization
87
- "normalize_text",
88
- "NormalizationLevel",
89
- "NormalizationReport",
90
- # Quality
91
- "compute_quality_score",
92
- # Batch
93
- "ExtractionReport",
94
- "extract_to_dir",
95
- # Version
96
- "get_version_info",
97
- # Sections
98
- "extract_sections",
99
- "SectionedDocument",
100
- "Section",
101
- "SectionLabel",
102
- "Confidence",
103
- "DetectedVia",
104
- "SECTIONING_VERSION",
105
- # Structured extraction (v2.0)
106
- "Cell",
107
- "Table",
108
- "Figure",
109
- "TABLE_EXTRACTION_VERSION",
110
- "StructuredResult",
111
- "extract_pdf_structured",
112
- # Markdown rendering (v2.2)
113
- "render_pdf_to_markdown",
114
- ]
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes