docpluck 2.4.8__tar.gz → 2.4.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docpluck-2.4.8 → docpluck-2.4.9}/CHANGELOG.md +20 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/PKG-INFO +1 -1
- {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/__init__.py +1 -1
- {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/render.py +32 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/pyproject.toml +1 -1
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_render.py +38 -10
- docpluck-2.4.8/docpluck/__init__.py.tmp.54476.1778653086029 +0 -114
- {docpluck-2.4.8 → docpluck-2.4.9}/.claude/skills/_project/lessons.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/.claude/skills/docpluck-cleanup/SKILL.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/.claude/skills/docpluck-deploy/SKILL.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/.claude/skills/docpluck-qa/SKILL.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/.claude/skills/docpluck-qa/references/benchmark-mode.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/.claude/skills/docpluck-qa/references/check-11-hard-rules.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/.claude/skills/docpluck-qa/references/check-13-escicheck-production.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/.claude/skills/docpluck-qa/references/check-5-escicheck-library.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/.claude/skills/docpluck-qa/references/check-6-escicheck-local-webapp.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/.claude/skills/docpluck-qa/references/check-7-batch-smoke.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/.claude/skills/docpluck-review/SKILL.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/.github/workflows/publish.yml +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/.github/workflows/test.yml +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/.gitignore +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/CLAUDE.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/HANDOFF_SECTIONS_APP_INTEGRATION.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/LESSONS.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/LICENSE +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/REPLY_FROM_DOCPLUCK_v1.4.5.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/REPLY_FROM_DOCPLUCK_v1.5.0.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/REQUEST_08_CHUNKING_ENDPOINT.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/REQUEST_09_REFERENCE_LIST_NORMALIZATION.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/TODO.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/__main__.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/batch.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/cli.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/extract.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/extract_docx.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/extract_html.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/extract_layout.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/extract_structured.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/figures/__init__.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/figures/detect.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/normalize.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/quality.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/sections/__init__.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/sections/annotators/__init__.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/sections/annotators/docx.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/sections/annotators/html.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/sections/annotators/pdf.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/sections/annotators/text.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/sections/blocks.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/sections/boundaries.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/sections/core.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/sections/taxonomy.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/sections/types.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/tables/__init__.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/tables/bbox_utils.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/tables/camelot_extract.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/tables/captions.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/tables/cell_cleaning.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/tables/cluster.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/tables/confidence.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/tables/detect.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/tables/render.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/tables/whitespace.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docpluck/version.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/BENCHMARKS.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/DESIGN.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-07_sections_strict_iteration.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-09_session_state_and_followups.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-09_unified_extraction_brainstorm.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-10_table_rendering_iteration.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-10_table_rendering_iteration_2.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-10_table_rendering_iteration_3.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-10_table_rendering_iteration_4.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-10_table_rendering_iteration_5.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-10_table_rendering_iteration_6.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-10_table_rendering_iteration_7.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-11_PROMOTE_SPIKE_TO_LIBRARY.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-11_table_rendering_iteration_8.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-11_visual_review_findings.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-12_phase2_101pdf_corpus.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-12_remaining_ui_and_chrome_verification.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-12_visual_verify_results.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-13_apa_50_expansion.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-13_apa_50_expansion_iter_1.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-13_apa_50_expansion_iter_2.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-13_iterative_1.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-13_iterative_library_improvement.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/NORMALIZATION.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/README.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/TRIAGE_2026-05-10_corpus_assessment.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/2026-05-06-section-identification.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/2026-05-06-table-extraction.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/2026-05-07-sections-strict-iteration-progress.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/2026-05-08-unified-extraction-phase-0-splice-spike.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/sections-deferred-items.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/sections-issues-backlog.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/2026-05-07_spot-01_apa.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/2026-05-07_spot-02_pattern-A-shipped.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/2026-05-08_spot-final_all-styles.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/COMPARISON.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/korbmacher_table1.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/option-a.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/ziano_table1.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_notes_raw.txt +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_table1.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/notes.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/option-b.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_notes_raw.txt +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_table1.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/korbmacher_table1.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/notes.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/option-c.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/sample-pdftotext-bbox.html +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/ziano_table1.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/korbmacher_table1.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/notes.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/option-d.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/ziano_table1.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_2022_kruger_bbox.html +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_bbox.html +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_table1.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/option-e.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/sample-bbox.html +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_2021_joep_bbox.html +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_bbox.html +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_table1.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/html-fallback-demo.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.err +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.err +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.err +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.err +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.err +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.err +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.err +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.err +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.err +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.err +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.err +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.err +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.err +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.err +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.err +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.err +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.err +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.err +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.err +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.err +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.err +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.err +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.err +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.err +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.err +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.err +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/papers.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/report.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/splice_spike.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/splice-spike/test_splice_spike.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/specs/2026-04-27-request-09-reference-normalization-design.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/specs/2026-05-06-section-identification-design.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/specs/2026-05-06-table-extraction-design.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/specs/2026-05-08-unified-extraction-design.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/scripts/lint_rendered_corpus.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/scripts/verify_corpus.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/scripts/verify_corpus_full.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/__init__.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/conftest.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/fixtures/__init__.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/fixtures/sections/__init__.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/fixtures/sections/builders.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/fixtures/structured/.gitkeep +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/fixtures/structured/MANIFEST.json +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/fixtures/structured/README.md +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/golden/sections/apa_multi_study_pdf.json +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/golden/sections/apa_single_study_pdf.json +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/golden/sections/html_real_headings.json +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/snapshots/amj_lattice.txt +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/snapshots/apa_chan_feldman_lineless.txt +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/snapshots/apa_chen_jesp_lineless.txt +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/snapshots/apa_efendic_affect.txt +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/snapshots/apa_ip_feldman_pspb.txt +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/snapshots/bmc_lattice.txt +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/snapshots/ieee_figure_heavy.txt +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/snapshots/ieee_lattice.txt +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/snapshots/jama_lattice.txt +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/snapshots/nat_comms_figure_only.txt +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/snapshots/nature_minimal_rule.txt +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/snapshots/scirep_minimal_rule.txt +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_bbox_utils.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_benchmark_docx_html.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_caption_regex.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_cli_sections.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_cli_structured.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_confidence.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_corpus_smoke.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_d5_normalization_audit.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_edge_cases.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_extract_docx.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_extract_filter_sugar.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_extract_html.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_extract_layout.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_extract_pdf_structured.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_extraction.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_f0_table_region_aware.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_figure_detect.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_fixtures_manifest.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_lattice_cluster.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_metaesci_followups.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_normalization.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_normalize_f0_footnote_strip.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_normalize_layout_param.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_normalize_report_layout_fields.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_normalize_v18_strips.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_quality.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_render_html.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_request_09_reference_normalization.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_boundaries.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_boundary_truncation.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_core_partition.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_docx_annotator.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_extract_text.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_footnote_section.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_golden.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_html_annotator.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_pdf_annotator.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_public_api.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_real_corpus.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_taxonomy.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_text_annotator.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_types.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_unit_corpus.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_v161_coalesce.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_v161_subheadings.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_v161_taxonomy.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_v161_text_annotator.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_sections_version.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_smoke_fixtures.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_structured_result_type.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_structured_types.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_structured_version.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_table_detect.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_tables_cell_cleaning.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_text_mode.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_v23_1_fixes.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_v23_bug_fixes.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_v23_post_corpus.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_v23_post_corpus_v2.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_v2_backwards_compat.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_v2_top_level_exports.py +0 -0
- {docpluck-2.4.8 → docpluck-2.4.9}/tests/test_whitespace_cluster.py +0 -0
|
@@ -1,5 +1,25 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [2.4.9] — 2026-05-13
|
|
4
|
+
|
|
5
|
+
Regression hotfix for v2.4.8's `_demote_false_single_word_headings`. The 26-paper baseline gate caught it: ar_royal_society_rsos_140066 + ar_royal_society_rsos_140072 dropped from 4 → 2 sections because `## Discussion`/`## References` got demoted (next line started with lowercase `of this study...` or `1. Öhman A...`).
|
|
6
|
+
|
|
7
|
+
### Fix
|
|
8
|
+
|
|
9
|
+
1. **`docpluck/render.py::_demote_false_single_word_headings`** —
|
|
10
|
+
- Added `_STRONG_SECTION_NAMES` allowlist: abstract / introduction / background / methods / materials / results / discussion / conclusion / references / bibliography / acknowledgments / funding / limitations / appendix / keywords. Headings with these words are NEVER demoted — they are authoritative section markers.
|
|
11
|
+
- Added numbered-subsection guard: if next line matches `^\d+(?:\.\d+){1,3}\.?\s+\w` (e.g., `3.1. Subjects`, `3.1.2. Foo`), the heading stays — the numbered subsection is legitimate body content.
|
|
12
|
+
|
|
13
|
+
### Tests
|
|
14
|
+
|
|
15
|
+
- 4 new tests in `tests/test_render.py` (strong-section preservation for Results / Discussion / References, non-canonical word like ``Theory`` still demoted, numbered-subsection guard).
|
|
16
|
+
- 55 render tests PASS.
|
|
17
|
+
- **26-paper baseline: 26/26 PASS** (vs v2.4.8: 24/26).
|
|
18
|
+
|
|
19
|
+
### Bumps
|
|
20
|
+
|
|
21
|
+
- `__version__`: `2.4.8` → `2.4.9`. Patch.
|
|
22
|
+
|
|
3
23
|
## [2.4.8] — 2026-05-13
|
|
4
24
|
|
|
5
25
|
Massive defect-class sweep informed by 8 parallel subagent audits. Highest-impact item: a render-level false-heading demoter that addresses 197 false `## Word` / `### Word` headings (24% of all single-word headings in the v2.4.0 101-paper corpus) where pdftotext split a single line ("Results of Study 1") across a column wrap.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docpluck
|
|
3
|
-
Version: 2.4.8
|
|
3
|
+
Version: 2.4.9
|
|
4
4
|
Summary: PDF, DOCX, and HTML text extraction and normalization for academic papers
|
|
5
5
|
Project-URL: Homepage, https://github.com/giladfeldman/docpluck
|
|
6
6
|
Project-URL: Documentation, https://github.com/giladfeldman/docpluck/tree/main/docs
|
|
@@ -71,7 +71,7 @@ from .figures import Figure
|
|
|
71
71
|
from .extract_structured import TABLE_EXTRACTION_VERSION, StructuredResult, extract_pdf_structured
|
|
72
72
|
from .render import render_pdf_to_markdown
|
|
73
73
|
|
|
74
|
-
__version__ = "2.4.8"
|
|
74
|
+
__version__ = "2.4.9"
|
|
75
75
|
__author__ = "Gilad Feldman"
|
|
76
76
|
__license__ = "MIT"
|
|
77
77
|
|
|
@@ -384,6 +384,20 @@ def _join_multiline_caption_paragraphs(text: str) -> str:
|
|
|
384
384
|
|
|
385
385
|
_FALSE_HEADING_RE = re.compile(r"^(#{2,3})\s+(?P<word>[A-Z][A-Za-z]{2,12})\s*$")
|
|
386
386
|
|
|
387
|
+
# Strong canonical section names — never demote even when followed by a
|
|
388
|
+
# lowercase or digit continuation. These are unambiguous section markers
|
|
389
|
+
# whose authoritative source is the document structure, not the surrounding
|
|
390
|
+
# prose. The RSOS-family regression (v2.4.9) showed that ``## Discussion``
|
|
391
|
+
# followed by body prose starting with ``of this study...`` got demoted —
|
|
392
|
+
# losing the section. Same for ``## References\n\n1. Öhman A...``.
|
|
393
|
+
_STRONG_SECTION_NAMES = frozenset({
|
|
394
|
+
"abstract", "introduction", "background", "methods", "method",
|
|
395
|
+
"materials", "results", "discussion", "discussions", "conclusion",
|
|
396
|
+
"conclusions", "references", "bibliography", "acknowledgments",
|
|
397
|
+
"acknowledgements", "funding", "limitations", "supplementary",
|
|
398
|
+
"appendix", "keywords",
|
|
399
|
+
})
|
|
400
|
+
|
|
387
401
|
|
|
388
402
|
def _demote_false_single_word_headings(text: str) -> str:
|
|
389
403
|
"""Demote ``## Word`` / ``### Word`` lines that are mid-prose continuations.
|
|
@@ -421,6 +435,14 @@ def _demote_false_single_word_headings(text: str) -> str:
|
|
|
421
435
|
out.append(line)
|
|
422
436
|
i += 1
|
|
423
437
|
continue
|
|
438
|
+
# v2.4.9: never demote strong canonical section names. The body
|
|
439
|
+
# text following `## Discussion` or `## References` can start with
|
|
440
|
+
# lowercase prose / numbered list ("of this study...", "1. Öhman A..."),
|
|
441
|
+
# but the heading itself is authoritative.
|
|
442
|
+
if m.group("word").lower() in _STRONG_SECTION_NAMES:
|
|
443
|
+
out.append(line)
|
|
444
|
+
i += 1
|
|
445
|
+
continue
|
|
424
446
|
# Find the next non-blank line.
|
|
425
447
|
j = i + 1
|
|
426
448
|
while j < len(lines) and not lines[j].strip():
|
|
@@ -435,6 +457,16 @@ def _demote_false_single_word_headings(text: str) -> str:
|
|
|
435
457
|
# original heading line (``Results of Study 1`` → ``## Results`` +
|
|
436
458
|
# ``of Study 1``). Skip the lookahead for proper-sentence starts.
|
|
437
459
|
first_char = next_line[:1]
|
|
460
|
+
# v2.4.9: don't demote when the next line is a numbered subsection
|
|
461
|
+
# (``3.1. Subjects``, ``3.1 Subjects``, ``4.1. Do seasonal``).
|
|
462
|
+
# Royal Society RSOS papers use ``## Methods\n\n3.1. Subjects`` as
|
|
463
|
+
# a legitimate section + numbered-subsection structure. The
|
|
464
|
+
# `_promote_numbered_subsection_headings` post-processor will lift
|
|
465
|
+
# those into ``### 3.1 Subjects`` headings.
|
|
466
|
+
if re.match(r"^\d+(?:\.\d+){1,3}\.?\s+\w", next_line):
|
|
467
|
+
out.append(line)
|
|
468
|
+
i += 1
|
|
469
|
+
continue
|
|
438
470
|
is_continuation = bool(
|
|
439
471
|
first_char and (first_char.islower() or first_char.isdigit())
|
|
440
472
|
)
|
|
@@ -353,24 +353,35 @@ def test_study_subsection_skip_unrelated_prose():
|
|
|
353
353
|
# ── _demote_false_single_word_headings ──────────────────────────────────────
|
|
354
354
|
|
|
355
355
|
|
|
356
|
-
def
|
|
356
|
+
def test_strong_section_heading_results_preserved_with_continuation_text():
|
|
357
|
+
"""v2.4.9 regression fix: ``## Results`` is a strong canonical section;
|
|
358
|
+
even if pdftotext rendered the body starting with lowercase ``of Study 1``,
|
|
359
|
+
the heading stays — the body keeps its (slightly weird) opening, but the
|
|
360
|
+
section structure survives."""
|
|
357
361
|
text = "## Results\n\nof Study 1 showed significant effects."
|
|
358
362
|
out = _demote_false_single_word_headings(text)
|
|
359
|
-
assert "## Results"
|
|
360
|
-
|
|
363
|
+
assert "## Results" in out
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
def test_strong_section_heading_discussion_preserved():
|
|
367
|
+
text = "## Discussion\n\nof this study apparently present evidence against."
|
|
368
|
+
out = _demote_false_single_word_headings(text)
|
|
369
|
+
assert "## Discussion" in out
|
|
361
370
|
|
|
362
371
|
|
|
363
|
-
def
|
|
364
|
-
text = "##
|
|
372
|
+
def test_strong_section_heading_references_preserved_with_numbered_list():
|
|
373
|
+
text = "## References\n\n1. Öhman A, Lundqvist D, Esteves F. 2001 The face in the crowd."
|
|
365
374
|
out = _demote_false_single_word_headings(text)
|
|
366
|
-
assert "##
|
|
367
|
-
assert "Discussion section of the article" in out
|
|
375
|
+
assert "## References" in out
|
|
368
376
|
|
|
369
377
|
|
|
370
|
-
def
|
|
371
|
-
|
|
378
|
+
def test_false_heading_demoted_for_non_canonical_word():
|
|
379
|
+
"""A non-canonical single-word heading (``## Theory``) followed by
|
|
380
|
+
lowercase continuation IS demoted (v2.4.8 behavior preserved)."""
|
|
381
|
+
text = "### Theory\n\nof the firm: managerial implications follow."
|
|
372
382
|
out = _demote_false_single_word_headings(text)
|
|
373
|
-
assert "
|
|
383
|
+
assert "### Theory" not in out
|
|
384
|
+
assert "Theory of the firm" in out
|
|
374
385
|
|
|
375
386
|
|
|
376
387
|
def test_legit_heading_preserved_when_next_line_capitalized_sentence():
|
|
@@ -400,6 +411,23 @@ def test_false_heading_demoter_idempotent():
|
|
|
400
411
|
assert once == twice
|
|
401
412
|
|
|
402
413
|
|
|
414
|
+
def test_false_heading_preserved_when_next_line_is_numbered_subsection():
|
|
415
|
+
"""v2.4.9 regression fix: RSOS-style ``## Methods\\n\\n3.1. Subjects``
|
|
416
|
+
must keep the heading + numbered subsection intact. Demoting here
|
|
417
|
+
would destroy the section structure."""
|
|
418
|
+
text = "## Methods\n\n3.1. Subjects and study site\n\nWe sampled..."
|
|
419
|
+
out = _demote_false_single_word_headings(text)
|
|
420
|
+
assert "## Methods" in out
|
|
421
|
+
assert "3.1. Subjects and study site" in out
|
|
422
|
+
|
|
423
|
+
|
|
424
|
+
def test_false_heading_preserved_with_4digit_numbered_subsection():
|
|
425
|
+
text = "## Results\n\n4.1. Do seasonal challenges affect...\n\nResults follow."
|
|
426
|
+
out = _demote_false_single_word_headings(text)
|
|
427
|
+
assert "## Results" in out
|
|
428
|
+
assert "4.1. Do seasonal challenges affect..." in out
|
|
429
|
+
|
|
430
|
+
|
|
403
431
|
# ── _reformat_jama_key_points_box ──────────────────────────────────────────
|
|
404
432
|
|
|
405
433
|
|
|
@@ -1,114 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
docpluck — PDF, DOCX, and HTML text extraction and normalization for academic papers
|
|
3
|
-
====================================================================================
|
|
4
|
-
|
|
5
|
-
A Python library for extracting and normalizing text from academic documents.
|
|
6
|
-
Built from cross-project lessons across 8,000+ PDFs from psychology, medicine,
|
|
7
|
-
economics, physics, and biology.
|
|
8
|
-
|
|
9
|
-
Supports:
|
|
10
|
-
- **PDF** via pdftotext (default mode, with pdfplumber SMP fallback)
|
|
11
|
-
- **DOCX** via mammoth (DOCX → HTML → text, preserves soft breaks)
|
|
12
|
-
- **HTML** via beautifulsoup4 + lxml (custom block/inline-aware tree-walk)
|
|
13
|
-
|
|
14
|
-
Quick start::
|
|
15
|
-
|
|
16
|
-
from docpluck import extract_pdf, extract_docx, extract_html
|
|
17
|
-
from docpluck import normalize_text, NormalizationLevel, compute_quality_score
|
|
18
|
-
|
|
19
|
-
# PDF
|
|
20
|
-
with open("paper.pdf", "rb") as f:
|
|
21
|
-
text, method = extract_pdf(f.read())
|
|
22
|
-
|
|
23
|
-
# DOCX (requires: pip install docpluck[docx])
|
|
24
|
-
with open("paper.docx", "rb") as f:
|
|
25
|
-
text, method = extract_docx(f.read())
|
|
26
|
-
|
|
27
|
-
# HTML (requires: pip install docpluck[html])
|
|
28
|
-
with open("paper.html", "rb") as f:
|
|
29
|
-
text, method = extract_html(f.read())
|
|
30
|
-
|
|
31
|
-
# Normalization and quality scoring work on text from any source
|
|
32
|
-
normalized, report = normalize_text(text, NormalizationLevel.academic)
|
|
33
|
-
quality = compute_quality_score(normalized)
|
|
34
|
-
|
|
35
|
-
print(f"Method: {method}")
|
|
36
|
-
print(f"Quality: {quality['score']}/100 ({quality['confidence']})")
|
|
37
|
-
print(f"Steps applied: {report.steps_applied}")
|
|
38
|
-
|
|
39
|
-
Installation::
|
|
40
|
-
|
|
41
|
-
pip install docpluck # PDF only (pdfplumber)
|
|
42
|
-
pip install docpluck[docx] # + mammoth
|
|
43
|
-
pip install docpluck[html] # + beautifulsoup4 + lxml
|
|
44
|
-
pip install docpluck[all] # everything
|
|
45
|
-
|
|
46
|
-
# extract_pdf() also requires poppler-utils:
|
|
47
|
-
# Linux/WSL: apt-get install poppler-utils
|
|
48
|
-
# macOS: brew install poppler
|
|
49
|
-
# Windows: https://github.com/oschwartz10612/poppler-windows/releases
|
|
50
|
-
|
|
51
|
-
See Also:
|
|
52
|
-
- docs/README.md — Full usage guide and API reference
|
|
53
|
-
- docs/DESIGN.md — Implementation decisions and rationale
|
|
54
|
-
- docs/BENCHMARKS.md — Benchmark results across all supported formats
|
|
55
|
-
- docs/NORMALIZATION.md — All 15 pipeline steps documented
|
|
56
|
-
"""
|
|
57
|
-
|
|
58
|
-
from .extract import extract_pdf, extract_pdf_file, count_pages
|
|
59
|
-
from .extract_docx import extract_docx
|
|
60
|
-
from .extract_html import extract_html, html_to_text
|
|
61
|
-
from .normalize import normalize_text, NormalizationLevel, NormalizationReport
|
|
62
|
-
from .quality import compute_quality_score
|
|
63
|
-
from .batch import ExtractionReport, extract_to_dir
|
|
64
|
-
from .version import get_version_info
|
|
65
|
-
from .sections import (
|
|
66
|
-
extract_sections, SectionedDocument, Section,
|
|
67
|
-
SectionLabel, Confidence, DetectedVia, SECTIONING_VERSION,
|
|
68
|
-
)
|
|
69
|
-
from .tables import Cell, Table
|
|
70
|
-
from .figures import Figure
|
|
71
|
-
from .extract_structured import TABLE_EXTRACTION_VERSION, StructuredResult, extract_pdf_structured
|
|
72
|
-
from .render import render_pdf_to_markdown
|
|
73
|
-
|
|
74
|
-
__version__ = "2.4.8"
|
|
75
|
-
__author__ = "Gilad Feldman"
|
|
76
|
-
__license__ = "MIT"
|
|
77
|
-
|
|
78
|
-
__all__ = [
|
|
79
|
-
# Extraction
|
|
80
|
-
"extract_pdf",
|
|
81
|
-
"extract_pdf_file",
|
|
82
|
-
"extract_docx",
|
|
83
|
-
"extract_html",
|
|
84
|
-
"html_to_text",
|
|
85
|
-
"count_pages",
|
|
86
|
-
# Normalization
|
|
87
|
-
"normalize_text",
|
|
88
|
-
"NormalizationLevel",
|
|
89
|
-
"NormalizationReport",
|
|
90
|
-
# Quality
|
|
91
|
-
"compute_quality_score",
|
|
92
|
-
# Batch
|
|
93
|
-
"ExtractionReport",
|
|
94
|
-
"extract_to_dir",
|
|
95
|
-
# Version
|
|
96
|
-
"get_version_info",
|
|
97
|
-
# Sections
|
|
98
|
-
"extract_sections",
|
|
99
|
-
"SectionedDocument",
|
|
100
|
-
"Section",
|
|
101
|
-
"SectionLabel",
|
|
102
|
-
"Confidence",
|
|
103
|
-
"DetectedVia",
|
|
104
|
-
"SECTIONING_VERSION",
|
|
105
|
-
# Structured extraction (v2.0)
|
|
106
|
-
"Cell",
|
|
107
|
-
"Table",
|
|
108
|
-
"Figure",
|
|
109
|
-
"TABLE_EXTRACTION_VERSION",
|
|
110
|
-
"StructuredResult",
|
|
111
|
-
"extract_pdf_structured",
|
|
112
|
-
# Markdown rendering (v2.2)
|
|
113
|
-
"render_pdf_to_markdown",
|
|
114
|
-
]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docpluck-2.4.8 → docpluck-2.4.9}/.claude/skills/docpluck-qa/references/check-11-hard-rules.md
RENAMED
|
File without changes
|
|
File without changes
|
{docpluck-2.4.8 → docpluck-2.4.9}/.claude/skills/docpluck-qa/references/check-5-escicheck-library.md
RENAMED
|
File without changes
|
|
File without changes
|
{docpluck-2.4.8 → docpluck-2.4.9}/.claude/skills/docpluck-qa/references/check-7-batch-smoke.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docpluck-2.4.8 → docpluck-2.4.9}/docs/HANDOFF_2026-05-12_remaining_ui_and_chrome_verification.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/2026-05-06-section-identification.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docpluck-2.4.8 → docpluck-2.4.9}/docs/superpowers/plans/spot-checks/2026-05-07_spot-01_apa.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|