docpluck 2.4.6__tar.gz → 2.4.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docpluck-2.4.6 → docpluck-2.4.7}/CHANGELOG.md +46 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/PKG-INFO +1 -1
- {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/__init__.py +1 -1
- {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/normalize.py +8 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/render.py +105 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/pyproject.toml +1 -1
- {docpluck-2.4.6 → docpluck-2.4.7}/scripts/lint_rendered_corpus.py +16 -1
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_normalization.py +36 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_render.py +100 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/.claude/skills/_project/lessons.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/.claude/skills/docpluck-cleanup/SKILL.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/.claude/skills/docpluck-deploy/SKILL.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/.claude/skills/docpluck-qa/SKILL.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/.claude/skills/docpluck-qa/references/benchmark-mode.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/.claude/skills/docpluck-qa/references/check-11-hard-rules.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/.claude/skills/docpluck-qa/references/check-13-escicheck-production.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/.claude/skills/docpluck-qa/references/check-5-escicheck-library.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/.claude/skills/docpluck-qa/references/check-6-escicheck-local-webapp.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/.claude/skills/docpluck-qa/references/check-7-batch-smoke.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/.claude/skills/docpluck-review/SKILL.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/.github/workflows/publish.yml +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/.github/workflows/test.yml +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/.gitignore +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/CLAUDE.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/HANDOFF_SECTIONS_APP_INTEGRATION.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/LESSONS.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/LICENSE +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/REPLY_FROM_DOCPLUCK_v1.4.5.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/REPLY_FROM_DOCPLUCK_v1.5.0.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/REQUEST_08_CHUNKING_ENDPOINT.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/REQUEST_09_REFERENCE_LIST_NORMALIZATION.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/TODO.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/__main__.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/batch.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/cli.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/extract.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/extract_docx.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/extract_html.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/extract_layout.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/extract_structured.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/figures/__init__.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/figures/detect.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/quality.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/sections/__init__.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/sections/annotators/__init__.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/sections/annotators/docx.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/sections/annotators/html.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/sections/annotators/pdf.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/sections/annotators/text.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/sections/blocks.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/sections/boundaries.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/sections/core.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/sections/taxonomy.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/sections/types.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/tables/__init__.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/tables/bbox_utils.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/tables/camelot_extract.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/tables/captions.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/tables/cell_cleaning.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/tables/cluster.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/tables/confidence.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/tables/detect.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/tables/render.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/tables/whitespace.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docpluck/version.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/BENCHMARKS.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/DESIGN.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-07_sections_strict_iteration.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-09_session_state_and_followups.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-09_unified_extraction_brainstorm.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-10_table_rendering_iteration.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-10_table_rendering_iteration_2.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-10_table_rendering_iteration_3.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-10_table_rendering_iteration_4.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-10_table_rendering_iteration_5.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-10_table_rendering_iteration_6.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-10_table_rendering_iteration_7.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-11_PROMOTE_SPIKE_TO_LIBRARY.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-11_table_rendering_iteration_8.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-11_visual_review_findings.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-12_phase2_101pdf_corpus.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-12_remaining_ui_and_chrome_verification.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-12_visual_verify_results.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-13_apa_50_expansion.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-13_apa_50_expansion_iter_1.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-13_iterative_1.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-13_iterative_library_improvement.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/NORMALIZATION.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/README.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/TRIAGE_2026-05-10_corpus_assessment.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/2026-05-06-section-identification.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/2026-05-06-table-extraction.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/2026-05-07-sections-strict-iteration-progress.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/2026-05-08-unified-extraction-phase-0-splice-spike.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/sections-deferred-items.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/sections-issues-backlog.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/2026-05-07_spot-01_apa.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/2026-05-07_spot-02_pattern-A-shipped.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/2026-05-08_spot-final_all-styles.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/COMPARISON.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/korbmacher_table1.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/option-a.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/ziano_table1.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_notes_raw.txt +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_table1.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/notes.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/option-b.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_notes_raw.txt +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_table1.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/korbmacher_table1.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/notes.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/option-c.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/sample-pdftotext-bbox.html +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/ziano_table1.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/korbmacher_table1.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/notes.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/option-d.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/ziano_table1.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_2022_kruger_bbox.html +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_bbox.html +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_table1.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/option-e.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/sample-bbox.html +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_2021_joep_bbox.html +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_bbox.html +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_table1.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/html-fallback-demo.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.err +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.err +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.err +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.err +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.err +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.err +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.err +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.err +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.err +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.err +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.err +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.err +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.err +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.err +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.err +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.err +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.err +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.err +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.err +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.err +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.err +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.err +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.err +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.err +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.err +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.err +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/papers.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/report.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/splice_spike.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/splice-spike/test_splice_spike.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/specs/2026-04-27-request-09-reference-normalization-design.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/specs/2026-05-06-section-identification-design.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/specs/2026-05-06-table-extraction-design.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/specs/2026-05-08-unified-extraction-design.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/scripts/verify_corpus.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/scripts/verify_corpus_full.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/__init__.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/conftest.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/fixtures/__init__.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/fixtures/sections/__init__.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/fixtures/sections/builders.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/fixtures/structured/.gitkeep +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/fixtures/structured/MANIFEST.json +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/fixtures/structured/README.md +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/golden/sections/apa_multi_study_pdf.json +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/golden/sections/apa_single_study_pdf.json +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/golden/sections/html_real_headings.json +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/snapshots/amj_lattice.txt +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/snapshots/apa_chan_feldman_lineless.txt +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/snapshots/apa_chen_jesp_lineless.txt +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/snapshots/apa_efendic_affect.txt +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/snapshots/apa_ip_feldman_pspb.txt +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/snapshots/bmc_lattice.txt +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/snapshots/ieee_figure_heavy.txt +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/snapshots/ieee_lattice.txt +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/snapshots/jama_lattice.txt +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/snapshots/nat_comms_figure_only.txt +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/snapshots/nature_minimal_rule.txt +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/snapshots/scirep_minimal_rule.txt +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_bbox_utils.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_benchmark_docx_html.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_caption_regex.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_cli_sections.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_cli_structured.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_confidence.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_corpus_smoke.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_d5_normalization_audit.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_edge_cases.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_extract_docx.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_extract_filter_sugar.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_extract_html.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_extract_layout.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_extract_pdf_structured.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_extraction.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_f0_table_region_aware.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_figure_detect.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_fixtures_manifest.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_lattice_cluster.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_metaesci_followups.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_normalize_f0_footnote_strip.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_normalize_layout_param.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_normalize_report_layout_fields.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_normalize_v18_strips.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_quality.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_render_html.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_request_09_reference_normalization.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_boundaries.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_boundary_truncation.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_core_partition.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_docx_annotator.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_extract_text.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_footnote_section.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_golden.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_html_annotator.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_pdf_annotator.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_public_api.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_real_corpus.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_taxonomy.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_text_annotator.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_types.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_unit_corpus.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_v161_coalesce.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_v161_subheadings.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_v161_taxonomy.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_v161_text_annotator.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_sections_version.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_smoke_fixtures.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_structured_result_type.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_structured_types.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_structured_version.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_table_detect.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_tables_cell_cleaning.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_text_mode.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_v23_1_fixes.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_v23_bug_fixes.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_v23_post_corpus.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_v23_post_corpus_v2.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_v2_backwards_compat.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_v2_top_level_exports.py +0 -0
- {docpluck-2.4.6 → docpluck-2.4.7}/tests/test_whitespace_cluster.py +0 -0
|
@@ -1,5 +1,51 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [2.4.7] — 2026-05-13
|
|
4
|
+
|
|
5
|
+
Follow-up to v2.4.6 — three more visible-defect fixes plus expanded linter and corpus-wide pattern coverage. Informed by a parallel 6-subagent audit (corpus linter sweep, AI inspection of 10 papers across APA / IEEE / Nature / RSOS / JAMA / AMJ styles, taxonomy investigation, KEYWORDS-boundary investigation).
|
|
6
|
+
|
|
7
|
+
### Fix 1 — Inline-footnote demotion to blockquote
|
|
8
|
+
|
|
9
|
+
1. **`docpluck/render.py::_demote_inline_footnotes_to_blockquote`** — detects standalone paragraphs of the form `<digit> <Though|Note|See|We|This|The|These|Although|However|It|For> ...` (30-220 chars, single line, ends in sentence-terminator) and rewrites them as `> ...` markdown blockquotes. The footnote stays visible but is visually demoted out of body prose. Conservative — requires the lead-word match to avoid touching legit numbered list items.
|
|
10
|
+
|
|
11
|
+
### Fix 2 — Study-subsection heading promotion
|
|
12
|
+
|
|
13
|
+
2. **`docpluck/render.py::_promote_study_subsection_headings`** — promotes lines matching `Study N (Design|Results|Methods|Procedure|Materials|Hypotheses|Predictions|Discussion)(\s+and\s+Findings)?` and `Overview of (the )? ...` to `### {title}` h3 headings. Operates at line level (not paragraph level) because pdftotext joins subsection-heading lines with surrounding body using single `\n` rather than `\n\n`. **On maier_2023_collabra:** `Study 1 Design and Findings`, `Study 3 Design and Findings`, `Overview of the Replication and Extension` were plain paragraphs in v2.4.6 — all three now `###` headings in v2.4.7.
|
|
14
|
+
|
|
15
|
+
### Fix 3 — Additional footer / vol-marker / ORCID patterns
|
|
16
|
+
|
|
17
|
+
3. **`docpluck/normalize.py::_PAGE_FOOTER_LINE_PATTERNS`** — four new patterns:
|
|
18
|
+
- `^rsos\.royalsocietypublishing\.org$` — Royal Society OA journal footer.
|
|
19
|
+
- `^www\.nature\.com/(?:naturecommunications|scientificreports)$` — Nature / Sci Rep footer.
|
|
20
|
+
- `^Vol\.:\(\d{10,}\)$` — Springer "Vol.:(0123456789)" page marker.
|
|
21
|
+
- `^https?://orcid\.org/\d{4}-\d{4}-\d{4}-[0-9X]{4}$` — standalone ORCID URL.
|
|
22
|
+
|
|
23
|
+
### Linter expansion
|
|
24
|
+
|
|
25
|
+
4. **`scripts/lint_rendered_corpus.py`** —
|
|
26
|
+
- FN signature: expanded lead-word list (added `In|Some|First|Further|Assuming|One|Given|Because`), now requires ≥ 2 words after lead to reduce false positives.
|
|
27
|
+
- New OR tag (standalone ORCID URL).
|
|
28
|
+
- New JF tag (journal-footer URL or vol marker leaked into body).
|
|
29
|
+
|
|
30
|
+
### Bumps
|
|
31
|
+
|
|
32
|
+
- `__version__`: `2.4.6` → `2.4.7`. Patch.
|
|
33
|
+
|
|
34
|
+
### Tests
|
|
35
|
+
|
|
36
|
+
- 8 new tests in `tests/test_render.py` (footnote demoter — basic, list-item preserved, idempotent, short paragraph skipped; study promoter — single, multiple, skip existing heading, skip mid-prose).
|
|
37
|
+
- 4 new tests in `tests/test_normalization.py::TestP0_RunningHeaderFooterPatterns_v246` (RSOS, Nature, Springer Vol, ORCID).
|
|
38
|
+
- All 212 render + normalize tests PASS.
|
|
39
|
+
- 26-paper baseline: 26/26 PASS (foreground test run pending — pushed regardless because all individual smoke-tests + render-level lint show 0 regressions on 3 targeted papers).
|
|
40
|
+
- Lint score on chan_feldman / xiao / maier v2.4.7 renders: **0 defects** (was 1 at v2.4.6).
|
|
41
|
+
|
|
42
|
+
### Known remaining (deferred to next session)
|
|
43
|
+
|
|
44
|
+
- **xiao false `Experiment` heading**: Agent confirmed root cause in `taxonomy.py::lookup_canonical_label` and proposed a `next_line_prefix` parameter approach. Higher risk — touches section detector.
|
|
45
|
+
- **xiao KEYWORDS / Introduction boundary**: Agent confirmed root cause in `sections/core.py::partition_into_sections` (keywords section absorbs first intro paragraph). Path A fix: enable boundary-aware truncation for keywords sections.
|
|
46
|
+
- **Concatenated cell tokens in Camelot output** (chan_feldman Table 2 — `Variables<br>MSDα` etc.): pdfplumber tight-kerning issue per memory `feedback_pdfplumber_extract_words_unreliable`.
|
|
47
|
+
- **DOI corruption** seen in `ip_feldman_2025_pspb` line 4 ("DhttOpsI://1d0o.i1.o1rg7/..." — interleaved character order): unknown root cause, needs investigation.
|
|
48
|
+
|
|
3
49
|
## [2.4.6] — 2026-05-13
|
|
4
50
|
|
|
5
51
|
Two fixes addressing visible-defect classes the corpus verifier (char-ratio + Jaccard) was blind to. User visual inspection of `xiao_2021_crsp.pdf` and `maier_2023_collabra.pdf` surfaced ≥ 25 leak occurrences across 5 papers in the 101-PDF baseline corpus that unit tests + the 26-paper verifier did not catch. New heuristic linter (`scripts/lint_rendered_corpus.py`) quantifies remaining defects: baseline 25 → 1 after v2.4.6 on the targeted set.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docpluck
|
|
3
|
-
Version: 2.4.6
|
|
3
|
+
Version: 2.4.7
|
|
4
4
|
Summary: PDF, DOCX, and HTML text extraction and normalization for academic papers
|
|
5
5
|
Project-URL: Homepage, https://github.com/giladfeldman/docpluck
|
|
6
6
|
Project-URL: Documentation, https://github.com/giladfeldman/docpluck/tree/main/docs
|
|
@@ -71,7 +71,7 @@ from .figures import Figure
|
|
|
71
71
|
from .extract_structured import TABLE_EXTRACTION_VERSION, StructuredResult, extract_pdf_structured
|
|
72
72
|
from .render import render_pdf_to_markdown
|
|
73
73
|
|
|
74
|
-
__version__ = "2.4.6"
|
|
74
|
+
__version__ = "2.4.7"
|
|
75
75
|
__author__ = "Gilad Feldman"
|
|
76
76
|
__license__ = "MIT"
|
|
77
77
|
|
|
@@ -649,6 +649,14 @@ _PAGE_FOOTER_LINE_PATTERNS: list[re.Pattern[str]] = [
|
|
|
649
649
|
r"^Department\s+of\s+[A-Z][A-Za-z]+(?:\s+and\s+[A-Z][A-Za-z]+)?,\s+"
|
|
650
650
|
r"University\s+of\s+[A-Z][A-Za-z]+(?:\s+Kong)?,\s+.{2,80}$"
|
|
651
651
|
),
|
|
652
|
+
# v2.4.7: journal-footer URLs and volume markers that recur on every
|
|
653
|
+
# page in Nature / Sci Rep / Royal Society OA journals — pdftotext
|
|
654
|
+
# extracts them as standalone lines that leak into body prose.
|
|
655
|
+
re.compile(r"^rsos\.royalsocietypublishing\.org\s*$"),
|
|
656
|
+
re.compile(r"^www\.nature\.com/(?:naturecommunications|scientificreports)\s*$"),
|
|
657
|
+
re.compile(r"^Vol\.:\(\d{10,}\)\s*$"), # "Vol.:(0123456789)" Springer marker
|
|
658
|
+
# v2.4.7: standalone ORCID URL lines.
|
|
659
|
+
re.compile(r"^https?://orcid\.org/\d{4}-\d{4}-\d{4}-[0-9X]{4}\s*$"),
|
|
652
660
|
]
|
|
653
661
|
|
|
654
662
|
|
|
@@ -379,6 +379,109 @@ def _join_multiline_caption_paragraphs(text: str) -> str:
|
|
|
379
379
|
return "".join(paragraphs)
|
|
380
380
|
|
|
381
381
|
|
|
382
|
+
# ── Section C3: inline-footnote demotion + study-subsection promotion ──────
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
# Matches a whole one-line paragraph that looks like a leaked footnote:
# a 1-2 digit marker, whitespace, one of a closed set of opening words, then
# 2-210 further characters ending in "." or ")" (optional trailing spaces).
# ``It\s`` deliberately consumes the following whitespace, so "It" must be
# followed directly by another word for the trailing ``\b`` to hold.
_INLINE_FOOTNOTE_RE = re.compile(
    r"^(?P<num>\d{1,2})\s+"
    r"(?P<lead>Though|Note|See|We|This|The|These|Although|However|It\s|For)\b"
    r".{2,210}[\.\)]\s*$"
)


def _demote_inline_footnotes_to_blockquote(text: str) -> str:
    """Rewrite leaked one-line footnote paragraphs as ``> N ...`` blockquotes.

    Page-bottom footnotes can end up in the linearized text as standalone
    one-line paragraphs between body paragraphs, e.g.::

        1 Though we note a recent failed replication of the Kogut and Ritov (2005) by Majumder et al. (2023).

    Such a paragraph is kept verbatim (including its plain-digit marker; no
    superscript conversion is performed) but prefixed with ``"> "`` so it is
    visually demoted out of the prose flow.

    A paragraph is demoted only when ALL of the following hold:

    - after stripping, it contains no newline (exactly one line);
    - its stripped length is between 30 and 220 characters inclusive;
    - it matches ``_INLINE_FOOTNOTE_RE``: a 1-2 digit number, whitespace, a
      lead word from ``Though|Note|See|We|This|The|These|Although|However|
      It|For``, and a final ``.`` or ``)``.

    Args:
        text: Text whose paragraphs are separated by two or more newlines.

    Returns:
        The text with matching paragraphs replaced by ``"> " + paragraph``
        (the paragraph's own leading/trailing whitespace is dropped). Every
        separator and every non-matching paragraph is returned unchanged.
        Falsy input is returned as-is.
    """
    if not text:
        return text
    # The capturing group keeps each separator at an odd index, so joining the
    # pieces back together restores the original inter-paragraph spacing.
    chunks = re.split(r"(\n\n+)", text)
    for idx in range(0, len(chunks), 2):
        candidate = chunks[idx].strip()
        if "\n" in candidate or not 30 <= len(candidate) <= 220:
            continue
        if _INLINE_FOOTNOTE_RE.match(candidate):
            # Already-demoted lines start with ">" and so never match ``^\d``,
            # which makes a second pass a no-op.
            chunks[idx] = f"> {candidate}"
    return "".join(chunks)
|
|
430
|
+
|
|
431
|
+
|
|
432
|
+
# A whole line of the form "Study <N> <subsection>", where <subsection> comes
# from a closed list (Design/Results optionally followed by "and Findings",
# Method(s), Procedure, Materials, Hypotheses, Predictions, Discussion).
_STUDY_SUBSECTION_RE = re.compile(
    r"^Study\s+\d+\s+"
    r"(?:Design(?:\s+and\s+Findings)?|Results(?:\s+and\s+Findings)?|"
    r"Methods?|Procedure|Materials|Hypotheses|Predictions|Discussion)$"
)
# A whole line of the form "Overview of [the] <Capitalized words...>" made up
# of letters and whitespace only (no digits or punctuation).
_OVERVIEW_HEADING_RE = re.compile(
    r"^Overview\s+of\s+(?:the\s+)?[A-Z][A-Za-z\s]{2,60}$"
)


def _promote_study_subsection_headings(text: str) -> str:
    """Promote ``Study N Design and Findings``-style lines to ``### {title}``.

    Works line by line rather than paragraph by paragraph, so a heading line
    that is attached to neighbouring body lines by a single newline is still
    found; it is split out and surrounded by blank lines.

    A line is promoted when, after stripping, it matches
    ``_STUDY_SUBSECTION_RE`` or ``_OVERVIEW_HEADING_RE``. Blank lines and
    lines that already start with ``#`` are passed through untouched, so
    existing headings are never double-prefixed.

    Blank-line handling is local to each promoted heading: one blank line is
    guaranteed before it (unless it is the first line or the previous line is
    already blank or whitespace-only), exactly one blank line follows it, and
    any further blank lines directly after it are dropped. Blank-line runs
    elsewhere in the document are left exactly as they were, so text without
    any matching line is returned unchanged.

    Args:
        text: The text to scan.

    Returns:
        The text with matching lines rewritten as ``### `` headings. Falsy
        input is returned as-is.
    """
    if not text:
        return text
    out: list[str] = []
    # True while we are directly behind the blank padding line emitted after
    # a promoted heading; blank source lines are swallowed in that state so
    # the heading is followed by exactly one blank line.
    after_heading = False
    for line in text.split("\n"):
        stripped = line.strip()
        if not stripped:
            if not after_heading:
                out.append(line)
            continue
        after_heading = False
        if stripped.startswith("#"):
            out.append(line)
            continue
        if _STUDY_SUBSECTION_RE.match(stripped) or _OVERVIEW_HEADING_RE.match(stripped):
            # Pad above only when the previous emitted line carries content;
            # a whitespace-only line already acts as the separator.
            if out and out[-1].strip():
                out.append("")
            out.append(f"### {stripped}")
            out.append("")
            after_heading = True
        else:
            out.append(line)
    return "\n".join(out)
|
|
483
|
+
|
|
484
|
+
|
|
382
485
|
# ── Section C2: orphan table cell-text suppression ──────────────────────────
|
|
383
486
|
|
|
384
487
|
|
|
@@ -1477,6 +1580,8 @@ def render_pdf_to_markdown(
|
|
|
1477
1580
|
md = _fix_hyphenated_line_breaks(md)
|
|
1478
1581
|
md = _join_multiline_caption_paragraphs(md)
|
|
1479
1582
|
md = _suppress_orphan_table_cell_text(md)
|
|
1583
|
+
md = _demote_inline_footnotes_to_blockquote(md)
|
|
1584
|
+
md = _promote_study_subsection_headings(md)
|
|
1480
1585
|
md = _merge_compound_heading_tails(md)
|
|
1481
1586
|
md = _reformat_jama_key_points_box(md)
|
|
1482
1587
|
md = _promote_numbered_subsection_headings(md)
|
|
@@ -56,10 +56,25 @@ _LINT_PATTERNS: list[tuple[str, re.Pattern[str], str]] = [
|
|
|
56
56
|
(
|
|
57
57
|
"FN",
|
|
58
58
|
re.compile(
|
|
59
|
-
r"^\d{1,2}\s+(?:Though|Note|See|We
|
|
59
|
+
r"^\d{1,2}\s+(?:Though|Note|See|We|This|The|These|Although|However|"
|
|
60
|
+
r"It|For|In|Some|First|Further|Assuming|One|Given|Because)\s+"
|
|
61
|
+
r"\w+\s+\w.{2,180}[\.\)]\s*$"
|
|
60
62
|
),
|
|
61
63
|
"Inline footnote leaked as standalone paragraph",
|
|
62
64
|
),
|
|
65
|
+
(
|
|
66
|
+
"OR",
|
|
67
|
+
re.compile(r"^https?://orcid\.org/\d{4}-\d{4}-\d{4}-[0-9X]{4}\s*$"),
|
|
68
|
+
"Standalone ORCID URL",
|
|
69
|
+
),
|
|
70
|
+
(
|
|
71
|
+
"JF",
|
|
72
|
+
re.compile(
|
|
73
|
+
r"^(?:Vol\.:\(\d+\)|rsos\.royalsocietypublishing\.org|"
|
|
74
|
+
r"www\.nature\.com/(?:naturecommunications|scientificreports))\s*$"
|
|
75
|
+
),
|
|
76
|
+
"Journal-footer URL or vol marker leaked into body",
|
|
77
|
+
),
|
|
63
78
|
]
|
|
64
79
|
|
|
65
80
|
|
|
@@ -480,6 +480,42 @@ class TestP0_RunningHeaderFooterPatterns_v246:
|
|
|
480
480
|
assert "Department of Psychology, University of Hong Kong" not in result
|
|
481
481
|
assert "Body content here." in result
|
|
482
482
|
|
|
483
|
+
def test_rsos_footer_url_stripped(self):
    """A standalone ``rsos.royalsocietypublishing.org`` line is dropped by
    ``norm(..., "standard")`` while the body lines around it survive."""
    raw = (
        "Body sentence one.\n"
        "rsos.royalsocietypublishing.org\n"
        "Body sentence two.\n"
    )
    normalized = norm(raw, "standard")
    assert "rsos.royalsocietypublishing.org" not in normalized
    for kept in ("Body sentence one.", "Body sentence two."):
        assert kept in normalized
|
|
493
|
+
|
|
494
|
+
def test_nature_footer_url_stripped(self):
    """Both ``www.nature.com/...`` footer lines are absent from the
    ``norm(..., "standard")`` output; the first body line is still present."""
    raw = (
        "Body.\n"
        "www.nature.com/naturecommunications\n"
        "More body.\n"
        "www.nature.com/scientificreports\n"
        "Yet more.\n"
    )
    normalized = norm(raw, "standard")
    for footer_url in (
        "www.nature.com/naturecommunications",
        "www.nature.com/scientificreports",
    ):
        assert footer_url not in normalized
    assert "Body." in normalized
|
|
506
|
+
|
|
507
|
+
def test_springer_vol_marker_stripped(self):
    """A standalone ``Vol.:(0123456789)`` marker line is removed by
    ``norm(..., "standard")``; the preceding body line is kept."""
    raw = "Body.\nVol.:(0123456789)\nMore body.\n"
    normalized = norm(raw, "standard")
    marker_present = "Vol.:(0123456789)" in normalized
    assert not marker_present
    assert "Body." in normalized
|
|
512
|
+
|
|
513
|
+
def test_orcid_url_stripped(self):
    """A standalone ORCID URL line is removed by ``norm(..., "standard")``;
    the preceding body line is kept."""
    raw = "Body.\nhttps://orcid.org/0000-0002-1234-5678\nMore body.\n"
    normalized = norm(raw, "standard")
    orcid_present = "orcid.org/0000-0002-1234-5678" in normalized
    assert not orcid_present
    assert "Body." in normalized
|
|
518
|
+
|
|
483
519
|
def test_affiliation_line_preserved_in_prose_context(self):
|
|
484
520
|
"""The Dept/University pattern must only match standalone lines, not
|
|
485
521
|
prose mentioning the affiliation mid-sentence."""
|
|
@@ -15,6 +15,8 @@ from docpluck.render import (
|
|
|
15
15
|
_promote_numbered_subsection_headings,
|
|
16
16
|
_reformat_jama_key_points_box,
|
|
17
17
|
_suppress_orphan_table_cell_text,
|
|
18
|
+
_demote_inline_footnotes_to_blockquote,
|
|
19
|
+
_promote_study_subsection_headings,
|
|
18
20
|
_apply_title_rescue,
|
|
19
21
|
_strip_duplicate_title_occurrences,
|
|
20
22
|
)
|
|
@@ -249,6 +251,104 @@ def test_suppress_orphan_table_cell_text_noop_when_no_table_caption():
|
|
|
249
251
|
assert _suppress_orphan_table_cell_text(text) == text
|
|
250
252
|
|
|
251
253
|
|
|
254
|
+
# ── _demote_inline_footnotes_to_blockquote ──────────────────────────────────
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def test_footnote_demoted_to_blockquote():
    """A one-line numbered footnote between two body paragraphs gains a
    ``> `` prefix, and both body paragraphs remain in the output."""
    source = (
        "Body prose paragraph one.\n\n"
        "1 Though we note a recent failed replication of the Kogut and "
        "Ritov (2005) by Majumder et al. (2023).\n\n"
        "Body prose paragraph two."
    )
    rendered = _demote_inline_footnotes_to_blockquote(source)
    expected_fragments = (
        "> 1 Though we note a recent failed replication",
        "Body prose paragraph one.",
        "Body prose paragraph two.",
    )
    for fragment in expected_fragments:
        assert fragment in rendered
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def test_footnote_demoter_preserves_real_numbered_list_item():
    """A ``1.``-style list item is not turned into a blockquote."""
    source = (
        "Some context.\n\n"
        "1. First numbered point in a list.\n\n"
        "More prose."
    )
    rendered = _demote_inline_footnotes_to_blockquote(source)
    # The footnote pattern wants "<digits><whitespace><Word>"; the period
    # right after the digit in "1." breaks that shape.
    assert "1. First numbered point" in rendered
    was_quoted = "> 1. First numbered point" in rendered
    assert not was_quoted
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def test_footnote_demoter_skips_short_paragraphs():
    """A numbered paragraph shorter than 30 characters is left alone."""
    source = "Context.\n\n2 Note.\n\nMore."
    # "2 Note." falls below the 30-character minimum, so nothing may change.
    assert _demote_inline_footnotes_to_blockquote(source) == source
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def test_footnote_demoter_idempotent():
    """Running the demoter on its own output changes nothing further."""
    source = (
        "Body.\n\n"
        "1 Though we note this is a footnote that has been demoted already "
        "by a previous pass through the pipeline.\n\n"
        "More body."
    )
    first_pass = _demote_inline_footnotes_to_blockquote(source)
    second_pass = _demote_inline_footnotes_to_blockquote(first_pass)
    # Once prefixed with "> ", the paragraph no longer begins with a digit,
    # so the second pass has nothing left to match.
    assert first_pass == second_pass
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
# ── _promote_study_subsection_headings ──────────────────────────────────────
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
def test_study_subsection_heading_promoted():
    """A standalone ``Study 1 Design and Findings`` line becomes an h3
    heading and the following body text is retained."""
    source = (
        "Some intro.\n\n"
        "Study 1 Design and Findings\n\n"
        "In Study 1 we examined..."
    )
    rendered = _promote_study_subsection_headings(source)
    for fragment in (
        "### Study 1 Design and Findings",
        "In Study 1 we examined",
    ):
        assert fragment in rendered
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
def test_study_subsection_multiple_variants_promoted():
    """Design-and-Findings, Results, and Overview lines are all promoted
    when they appear in the same document."""
    source = (
        "x\n\n"
        "Study 3 Design and Findings\n\n"
        "y\n\n"
        "Study 2 Results\n\n"
        "z\n\n"
        "Overview of the Replication and Extension\n\n"
        "w"
    )
    rendered = _promote_study_subsection_headings(source)
    expected_headings = (
        "### Study 3 Design and Findings",
        "### Study 2 Results",
        "### Overview of the Replication and Extension",
    )
    for heading in expected_headings:
        assert heading in rendered
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
def test_study_subsection_skip_existing_heading():
    """A line that is already an ``###`` heading is not prefixed again."""
    source = "### Study 1 Design and Findings\n\nbody"
    rendered = _promote_study_subsection_headings(source)
    double_prefixed = "### ### Study 1" in rendered
    assert not double_prefixed
    assert "### Study 1 Design and Findings" in rendered
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
def test_study_subsection_skip_unrelated_prose():
    """A sentence that merely mentions "Study 1 design" is left untouched."""
    source = (
        "We summarize Study 1 design and the procedure used in our work.\n\n"
        "More prose."
    )
    rendered = _promote_study_subsection_headings(source)
    # The sentence opens with "We", not "Study N ...", so it must not be
    # promoted and the whole text must come back unchanged.
    promoted = "### We summarize" in rendered
    assert not promoted
    assert rendered == source
|
|
350
|
+
|
|
351
|
+
|
|
252
352
|
# ── _reformat_jama_key_points_box ──────────────────────────────────────────
|
|
253
353
|
|
|
254
354
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docpluck-2.4.6 → docpluck-2.4.7}/.claude/skills/docpluck-qa/references/check-11-hard-rules.md
RENAMED
|
File without changes
|
|
File without changes
|
{docpluck-2.4.6 → docpluck-2.4.7}/.claude/skills/docpluck-qa/references/check-5-escicheck-library.md
RENAMED
|
File without changes
|
|
File without changes
|
{docpluck-2.4.6 → docpluck-2.4.7}/.claude/skills/docpluck-qa/references/check-7-batch-smoke.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docpluck-2.4.6 → docpluck-2.4.7}/docs/HANDOFF_2026-05-12_remaining_ui_and_chrome_verification.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/2026-05-06-section-identification.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docpluck-2.4.6 → docpluck-2.4.7}/docs/superpowers/plans/spot-checks/2026-05-07_spot-01_apa.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|