docpluck 2.4.4__tar.gz → 2.4.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docpluck-2.4.4 → docpluck-2.4.6}/.claude/skills/docpluck-qa/SKILL.md +75 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/CHANGELOG.md +63 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/PKG-INFO +1 -1
- {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/__init__.py +1 -1
- {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/normalize.py +66 -16
- {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/render.py +125 -0
- docpluck-2.4.6/docs/HANDOFF_2026-05-13_apa_50_expansion.md +360 -0
- docpluck-2.4.6/docs/HANDOFF_2026-05-13_apa_50_expansion_iter_1.md +106 -0
- docpluck-2.4.6/docs/HANDOFF_2026-05-13_iterative_1.md +112 -0
- docpluck-2.4.6/docs/HANDOFF_2026-05-13_iterative_library_improvement.md +235 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/pyproject.toml +1 -1
- docpluck-2.4.6/scripts/lint_rendered_corpus.py +115 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_normalization.py +131 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_render.py +114 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/.claude/skills/_project/lessons.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/.claude/skills/docpluck-cleanup/SKILL.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/.claude/skills/docpluck-deploy/SKILL.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/.claude/skills/docpluck-qa/references/benchmark-mode.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/.claude/skills/docpluck-qa/references/check-11-hard-rules.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/.claude/skills/docpluck-qa/references/check-13-escicheck-production.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/.claude/skills/docpluck-qa/references/check-5-escicheck-library.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/.claude/skills/docpluck-qa/references/check-6-escicheck-local-webapp.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/.claude/skills/docpluck-qa/references/check-7-batch-smoke.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/.claude/skills/docpluck-review/SKILL.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/.github/workflows/publish.yml +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/.github/workflows/test.yml +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/.gitignore +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/CLAUDE.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/HANDOFF_SECTIONS_APP_INTEGRATION.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/LESSONS.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/LICENSE +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/REPLY_FROM_DOCPLUCK_v1.4.5.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/REPLY_FROM_DOCPLUCK_v1.5.0.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/REQUEST_08_CHUNKING_ENDPOINT.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/REQUEST_09_REFERENCE_LIST_NORMALIZATION.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/TODO.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/__main__.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/batch.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/cli.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/extract.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/extract_docx.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/extract_html.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/extract_layout.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/extract_structured.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/figures/__init__.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/figures/detect.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/quality.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/sections/__init__.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/sections/annotators/__init__.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/sections/annotators/docx.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/sections/annotators/html.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/sections/annotators/pdf.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/sections/annotators/text.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/sections/blocks.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/sections/boundaries.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/sections/core.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/sections/taxonomy.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/sections/types.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/tables/__init__.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/tables/bbox_utils.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/tables/camelot_extract.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/tables/captions.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/tables/cell_cleaning.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/tables/cluster.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/tables/confidence.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/tables/detect.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/tables/render.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/tables/whitespace.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docpluck/version.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/BENCHMARKS.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/DESIGN.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/HANDOFF_2026-05-07_sections_strict_iteration.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/HANDOFF_2026-05-09_session_state_and_followups.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/HANDOFF_2026-05-09_unified_extraction_brainstorm.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/HANDOFF_2026-05-10_table_rendering_iteration.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/HANDOFF_2026-05-10_table_rendering_iteration_2.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/HANDOFF_2026-05-10_table_rendering_iteration_3.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/HANDOFF_2026-05-10_table_rendering_iteration_4.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/HANDOFF_2026-05-10_table_rendering_iteration_5.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/HANDOFF_2026-05-10_table_rendering_iteration_6.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/HANDOFF_2026-05-10_table_rendering_iteration_7.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/HANDOFF_2026-05-11_PROMOTE_SPIKE_TO_LIBRARY.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/HANDOFF_2026-05-11_table_rendering_iteration_8.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/HANDOFF_2026-05-11_visual_review_findings.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/HANDOFF_2026-05-12_phase2_101pdf_corpus.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/HANDOFF_2026-05-12_remaining_ui_and_chrome_verification.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/HANDOFF_2026-05-12_visual_verify_results.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/NORMALIZATION.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/README.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/TRIAGE_2026-05-10_corpus_assessment.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/2026-05-06-section-identification.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/2026-05-06-table-extraction.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/2026-05-07-sections-strict-iteration-progress.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/2026-05-08-unified-extraction-phase-0-splice-spike.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/sections-deferred-items.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/sections-issues-backlog.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/2026-05-07_spot-01_apa.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/2026-05-07_spot-02_pattern-A-shipped.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/2026-05-08_spot-final_all-styles.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/COMPARISON.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/korbmacher_table1.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/option-a.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/ziano_table1.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_notes_raw.txt +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_table1.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/notes.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/option-b.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_notes_raw.txt +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_table1.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/korbmacher_table1.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/notes.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/option-c.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/sample-pdftotext-bbox.html +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/ziano_table1.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/korbmacher_table1.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/notes.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/option-d.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/ziano_table1.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_2022_kruger_bbox.html +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_bbox.html +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_table1.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/option-e.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/sample-bbox.html +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_2021_joep_bbox.html +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_bbox.html +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_table1.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/html-fallback-demo.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.err +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.err +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.err +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.err +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.err +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.err +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.err +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.err +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.err +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.err +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.err +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.err +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.err +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.err +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.err +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.err +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.err +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.err +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.err +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.err +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.err +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.err +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.err +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.err +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.err +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.err +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/papers.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/report.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/splice_spike.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/plans/spot-checks/splice-spike/test_splice_spike.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/specs/2026-04-27-request-09-reference-normalization-design.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/specs/2026-05-06-section-identification-design.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/specs/2026-05-06-table-extraction-design.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/docs/superpowers/specs/2026-05-08-unified-extraction-design.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/scripts/verify_corpus.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/scripts/verify_corpus_full.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/__init__.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/conftest.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/fixtures/__init__.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/fixtures/sections/__init__.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/fixtures/sections/builders.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/fixtures/structured/.gitkeep +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/fixtures/structured/MANIFEST.json +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/fixtures/structured/README.md +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/golden/sections/apa_multi_study_pdf.json +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/golden/sections/apa_single_study_pdf.json +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/golden/sections/html_real_headings.json +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/snapshots/amj_lattice.txt +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/snapshots/apa_chan_feldman_lineless.txt +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/snapshots/apa_chen_jesp_lineless.txt +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/snapshots/apa_efendic_affect.txt +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/snapshots/apa_ip_feldman_pspb.txt +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/snapshots/bmc_lattice.txt +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/snapshots/ieee_figure_heavy.txt +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/snapshots/ieee_lattice.txt +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/snapshots/jama_lattice.txt +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/snapshots/nat_comms_figure_only.txt +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/snapshots/nature_minimal_rule.txt +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/snapshots/scirep_minimal_rule.txt +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_bbox_utils.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_benchmark_docx_html.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_caption_regex.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_cli_sections.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_cli_structured.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_confidence.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_corpus_smoke.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_d5_normalization_audit.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_edge_cases.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_extract_docx.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_extract_filter_sugar.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_extract_html.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_extract_layout.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_extract_pdf_structured.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_extraction.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_f0_table_region_aware.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_figure_detect.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_fixtures_manifest.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_lattice_cluster.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_metaesci_followups.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_normalize_f0_footnote_strip.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_normalize_layout_param.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_normalize_report_layout_fields.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_normalize_v18_strips.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_quality.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_render_html.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_request_09_reference_normalization.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_boundaries.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_boundary_truncation.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_core_partition.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_docx_annotator.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_extract_text.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_footnote_section.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_golden.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_html_annotator.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_pdf_annotator.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_public_api.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_real_corpus.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_taxonomy.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_text_annotator.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_types.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_unit_corpus.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_v161_coalesce.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_v161_subheadings.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_v161_taxonomy.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_v161_text_annotator.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_sections_version.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_smoke_fixtures.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_structured_result_type.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_structured_types.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_structured_version.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_table_detect.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_tables_cell_cleaning.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_text_mode.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_v23_1_fixes.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_v23_bug_fixes.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_v23_post_corpus.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_v23_post_corpus_v2.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_v2_backwards_compat.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_v2_top_level_exports.py +0 -0
- {docpluck-2.4.4 → docpluck-2.4.6}/tests/test_whitespace_cluster.py +0 -0
|
@@ -250,6 +250,81 @@ python scripts/verify_corpus.py --paper efendic_2022_affect --diff
|
|
|
250
250
|
Skips cleanly when the spike `outputs[-new]/` directories aren't on
|
|
251
251
|
disk (fresh checkouts).
|
|
252
252
|
|
|
253
|
+
### 7c. Visible-Defect Heuristic Linter (v2.4.6+, CRITICAL)
|
|
254
|
+
|
|
255
|
+
`verify_corpus.py` measures char-ratio + Jaccard against a baseline — it is
|
|
256
|
+
**blind to visible defects** that the baseline itself contains. After the
|
|
257
|
+
2026-05-13 audit (xiao_2021_crsp, maier_2023_collabra) the user identified
|
|
258
|
+
five visible defect classes that the corpus verifier missed entirely:
|
|
259
|
+
|
|
260
|
+
| Defect | Signature regex (on rendered .md, per-line) | Tag |
|
|
261
|
+
|---|---|---|
|
|
262
|
+
| Running header `Q. XIAO ET AL.` style | `^[A-Z]\.(?:\s*[A-Z]\.?)?\s+[A-Z]{2,}\s+ET\s+AL\.?$` | RH |
|
|
263
|
+
| Contact / corresponding-author footer | `^CONTACT\s+[A-Z]\w+(?:\s+[A-Z]\w+)+\s+\S+@` | CT |
|
|
264
|
+
| Prefixed contribution / corresponding footnote | `^[a-c]\s+(?:Contributed\s+equally\|Corresponding\s+Author)\b` | CB |
|
|
265
|
+
| Standalone Dept/University affiliation | `^Department\s+of\s+[A-Z]\w+,\s+University\s+of\s+\w+` | AF |
|
|
266
|
+
| Inline footnote leaked into prose | `^\d+\s+(?:Though\|Note\|See\|We)\s+\w` (per-line, ≤ 200 chars) | FN |
|
|
267
|
+
|
|
268
|
+
Run:
|
|
269
|
+
```bash
|
|
270
|
+
python scripts/lint_rendered_corpus.py tmp/renders_*/*.md
|
|
271
|
+
```
|
|
272
|
+
|
|
273
|
+
Any match is a FAIL — the rendered .md contains a defect class that should
|
|
274
|
+
have been stripped upstream. Cite the file + line + tag in the QA report.
|
|
275
|
+
|
|
276
|
+
Note: these patterns target the **rendered output**, not pdftotext. They
|
|
277
|
+
backstop normalize.py + render.py — if a defect leaks past upstream
|
|
278
|
+
filters and into the .md, the linter catches it.
|
|
279
|
+
|
|
280
|
+
### 7d. AI Inspection of Rendered Output (v2.4.6+, RECOMMENDED)
|
|
281
|
+
|
|
282
|
+
For 2-5 representative papers per render change, dispatch a Claude subagent
|
|
283
|
+
(via Task or Agent tool) that:
|
|
284
|
+
|
|
285
|
+
1. Reads `tmp/<paper>.md` (the rendered output).
|
|
286
|
+
2. Reads the source PDF (via Read tool with `pages=1-5` for the first 5 pages).
|
|
287
|
+
3. Scores each .md section for fidelity:
|
|
288
|
+
- **Text coverage**: any PDF paragraph missing from the .md?
|
|
289
|
+
- **Section boundaries**: does the heading match the content below it?
|
|
290
|
+
- **Mid-prose leaks**: any running-header / footer / footnote text spliced into body prose?
|
|
291
|
+
- **False headings**: any `## ...` / `### ...` that isn't actually a section?
|
|
292
|
+
|
|
293
|
+
Output a per-paper defect list. **Default papers:** `xiao_2021_crsp`,
|
|
294
|
+
`maier_2023_collabra`, `chan_feldman_2025_cogemo`, `efendic_2022_affect`,
|
|
295
|
+
`ip_feldman_2025_pspb` — these collectively exercise APA stats tables,
|
|
296
|
+
Collabra footnotes, T&F contact-line footers, sequential page numbers,
|
|
297
|
+
and replication-report subsections.
|
|
298
|
+
|
|
299
|
+
This check exists because **char-ratio + Jaccard are blind to "right words
|
|
300
|
+
in wrong order under wrong heading"** (see CLAUDE.md "Iteration discipline").
|
|
301
|
+
Run it after every render change before declaring the iteration done.
|
|
302
|
+
|
|
303
|
+
### 7e. Text-Coverage Baseline (v2.4.6+, CRITICAL)
|
|
304
|
+
|
|
305
|
+
Catches the silent-text-loss defect class (rendered .md drops a body
|
|
306
|
+
paragraph that was in the PDF):
|
|
307
|
+
|
|
308
|
+
```bash
|
|
309
|
+
python -c "
|
|
310
|
+
from pathlib import Path
|
|
311
|
+
from docpluck.extract import extract_pdf
|
|
312
|
+
from docpluck.render import render_pdf_to_markdown
|
|
313
|
+
for pdf_path in Path('../PDFextractor/test-pdfs').glob('**/*.pdf'):
|
|
314
|
+
    pdf = pdf_path.read_bytes()
|
|
315
|
+
    raw, _ = extract_pdf(pdf)
|
|
316
|
+
    md = render_pdf_to_markdown(pdf)
|
|
317
|
+
    ratio = len(md) / max(len(raw), 1)
|
|
318
|
+
    if ratio < 0.85:
|
|
319
|
+
        print(f'COVERAGE FAIL {pdf_path.name}: {ratio:.2f}')
|
|
320
|
+
    elif ratio > 2.0:
|
|
321
|
+
        print(f'COVERAGE WARN {pdf_path.name}: {ratio:.2f} (suspicious bloat)')
|
|
322
|
+
"
|
|
323
|
+
```
|
|
324
|
+
|
|
325
|
+
**Threshold:** rendered .md length ≥ 0.85 × pdftotext raw length. Below
|
|
326
|
+
that, body content has been dropped somewhere in the pipeline.
|
|
327
|
+
|
|
253
328
|
### 8. Service Health Endpoint
|
|
254
329
|
```bash
|
|
255
330
|
curl -s http://localhost:6117/health
|
|
@@ -1,5 +1,68 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [2.4.6] — 2026-05-13
|
|
4
|
+
|
|
5
|
+
Two fixes addressing visible-defect classes the corpus verifier (char-ratio + Jaccard) was blind to. User visual inspection of `xiao_2021_crsp.pdf` and `maier_2023_collabra.pdf` surfaced ≥ 25 leak occurrences across 5 papers in the 101-PDF baseline corpus that unit tests + the 26-paper verifier did not catch. New heuristic linter (`scripts/lint_rendered_corpus.py`) quantifies remaining defects: baseline 25 → 1 after v2.4.6 on the targeted set.
|
|
6
|
+
|
|
7
|
+
### Fix 1 — Orphan table cell-text suppression
|
|
8
|
+
|
|
9
|
+
1. **`docpluck/render.py::_suppress_orphan_table_cell_text`** — new post-processor inserted between `_join_multiline_caption_paragraphs` and `_merge_compound_heading_tails`. Detects single-line `Table N. <caption>` paragraphs (plain, not already italicized — the italic `*Table N. ...*` is the v2.4.2 caption-only emission and never has orphan rows) followed by ≥ 3 consecutive paragraphs matching `_is_orphan_cell_paragraph` (≤ 200 chars, no markdown/HTML/list markers, low stopword density, not multi-sentence prose). When detected: italicizes the caption and drops the orphan paragraphs. Conservative: stops at the first non-orphan paragraph.
|
|
10
|
+
|
|
11
|
+
On `chan_feldman_2025_cogemo`: 5 of 9 captions (Tables 3, 4, 5, 6, 7) were plain `Table N.` lines followed by 3–50 lines of orphan cell rows; all now italicized with zero orphan rows.
|
|
12
|
+
|
|
13
|
+
### Fix 2 — Running-header / contact-block / affiliation line patterns
|
|
14
|
+
|
|
15
|
+
2. **`docpluck/normalize.py::_PAGE_FOOTER_LINE_PATTERNS`** — four new patterns:
|
|
16
|
+
- `^[A-Z]\.(?:\s*[A-Z]\.?)?\s+[A-Z]{2,}\s+ET\s+AL\.?\s*$` — `Q. XIAO ET AL.` / `Q.M. SMITH ET AL` running headers (all-caps surname required to avoid stripping legit `Q. Xiao et al.` references in prose; trailing whitespace tolerated).
|
|
17
|
+
- `^CONTACT\s+[A-Z][\w'’-]+(?:\s+[A-Z][\w'’-]+)+\s+\S+@\S+.*$` — Taylor & Francis (CRSP, etc.) `CONTACT <Name> <email>` page-footer (name tokens may contain apostrophes and hyphens).
|
|
18
|
+
- `^[a-z]\s+(?:Contributed\s+equally|Corresponding\s+Author)\b.*$` — Collabra-style prefixed contribution / corresponding-author footnotes (any single lowercase footnote letter, e.g. `a`, `b`, `c`).
|
|
19
|
+
- `^Department\s+of\s+[A-Z][A-Za-z]+(?:\s+and\s+[A-Z][A-Za-z]+)?,\s+University\s+of\s+[A-Z][A-Za-z]+(?:\s+Kong)?,\s+.{2,80}$` — standalone Dept/University affiliation lines (must be standalone — prose mentioning the affiliation mid-sentence stays).
|
|
20
|
+
|
|
21
|
+
On `xiao_2021_crsp`: 18 `Q. XIAO ET AL.` standalone leaks → 0 (one residual is folded inside a figure caption, not at line start). On `maier_2023_collabra`: 3 contact/corresponding leaks → 0.
|
|
22
|
+
|
|
23
|
+
### New: heuristic linter
|
|
24
|
+
|
|
25
|
+
3. **`scripts/lint_rendered_corpus.py`** — greps rendered `.md` for 5 leak signatures (RH, CT, CB, AF, FN). Run `python scripts/lint_rendered_corpus.py tmp/renders_v2.4.0/` against the 101-PDF corpus to surface visible defects char-ratio/Jaccard miss. Wired into `docpluck-qa` skill as Check 7c.
|
|
26
|
+
|
|
27
|
+
### New: QA skill spec updates
|
|
28
|
+
|
|
29
|
+
4. **`.claude/skills/docpluck-qa/SKILL.md`** — three new checks documented:
|
|
30
|
+
- 7c: Visible-Defect Heuristic Linter (the `lint_rendered_corpus.py` script).
|
|
31
|
+
- 7d: AI Inspection of Rendered Output (Claude subagent compares `.md` paragraph-by-paragraph against source PDF).
|
|
32
|
+
- 7e: Text-Coverage Baseline (asserts `len(rendered.md) ≥ 0.85 × len(pdftotext_raw)` to catch silent text-loss).
|
|
33
|
+
|
|
34
|
+
### Bumps
|
|
35
|
+
|
|
36
|
+
- `__version__`: `2.4.5` → `2.4.6`. Patch (additive normalize patterns + new render post-processor; no API surface change).
|
|
37
|
+
|
|
38
|
+
### Tests
|
|
39
|
+
|
|
40
|
+
- 7 new tests in `tests/test_render.py` for `_suppress_orphan_table_cell_text` (drops leaked rows, preserves prose, requires ≥ 3 orphans, skips already-italic caption, stops at next caption, idempotent, no-op when no caption).
|
|
41
|
+
- 7 new tests in `tests/test_normalization.py::TestP0_RunningHeaderFooterPatterns_v246` for the new footer patterns (Q. XIAO ET AL. stripping, two-initials variant, mixed-case preservation, CONTACT footer, prefixed Contributed equally, Dept/University standalone, Dept/University prose preserved).
|
|
42
|
+
|
|
43
|
+
### Known remaining defects (deferred to next iteration)
|
|
44
|
+
|
|
45
|
+
- `xiao_2021_crsp`: section detector treats mid-paragraph "Experiment" as a heading. Requires context-aware suppression in `sections/taxonomy.py`.
|
|
46
|
+
- `xiao_2021_crsp`: KEYWORDS section boundary not visually separated from Introduction body in render output.
|
|
47
|
+
- `maier_2023_collabra`: subsection headings like "Study 1 Design and Findings" / "Study 3 Design and Findings" remain plain paragraphs — need a subsection-pattern detector in `sections/`.
|
|
48
|
+
- `maier_2023_collabra`: inline footnote leak (`1 Though we note ...`) — F1 footnote post-processing pass needed.
|
|
49
|
+
|
|
50
|
+
## [2.4.5] — 2026-05-13
|
|
51
|
+
|
|
52
|
+
Continuation of v2.4.3's 4-digit page-number strip. v2.4.3 required the same 4-digit value to recur ≥ 3 times to strip — but continuous-pagination journals (PSPB, Psychological Science) use *sequential* page numbers per page (1174, 1175, 1177, 1179, ...) where each value is different. The v2.4.3 rule missed them entirely.
|
|
53
|
+
|
|
54
|
+
### Fix
|
|
55
|
+
|
|
56
|
+
1. **`docpluck/normalize.py::normalize_text` S9** — widened 4-digit page-number strip with a second pattern: when ≥ 3 distinct standalone 4-digit values cluster within a 50-page range AND have mean inter-value gap ≤ 3, treat them all as continuous-pagination page numbers and strip. The conservative gates (max-min spread, mean diff) protect against table-cell values which would have larger spreads and irregular gaps. Verified end-to-end on `efendic_2022_affect.md` — page numbers 1174, 1175, 1177, 1179, 1181, 1183, 1184 now all stripped. `NORMALIZATION_VERSION`: `1.8.2` → `1.8.3`.
|
|
57
|
+
|
|
58
|
+
### Bumps
|
|
59
|
+
|
|
60
|
+
- `__version__`: `2.4.4` → `2.4.5`. Patch.
|
|
61
|
+
|
|
62
|
+
### Tests
|
|
63
|
+
|
|
64
|
+
2 new tests in `tests/test_normalization.py` (sequential page-number stripping, unrelated 4-digit value preservation).
|
|
65
|
+
|
|
3
66
|
## [2.4.4] — 2026-05-13
|
|
4
67
|
|
|
5
68
|
Bug fix on v2.4.3's caption-trim feature + extension to a second chart-data signature.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docpluck
|
|
3
|
-
Version: 2.4.4
|
|
3
|
+
Version: 2.4.6
|
|
4
4
|
Summary: PDF, DOCX, and HTML text extraction and normalization for academic papers
|
|
5
5
|
Project-URL: Homepage, https://github.com/giladfeldman/docpluck
|
|
6
6
|
Project-URL: Documentation, https://github.com/giladfeldman/docpluck/tree/main/docs
|
|
@@ -71,7 +71,7 @@ from .figures import Figure
|
|
|
71
71
|
from .extract_structured import TABLE_EXTRACTION_VERSION, StructuredResult, extract_pdf_structured
|
|
72
72
|
from .render import render_pdf_to_markdown
|
|
73
73
|
|
|
74
|
-
__version__ = "2.4.
|
|
74
|
+
__version__ = "2.4.6"
|
|
75
75
|
__author__ = "Gilad Feldman"
|
|
76
76
|
__license__ = "MIT"
|
|
77
77
|
|
|
@@ -22,7 +22,7 @@ class NormalizationLevel(str, Enum):
|
|
|
22
22
|
academic = "academic"
|
|
23
23
|
|
|
24
24
|
|
|
25
|
-
NORMALIZATION_VERSION = "1.8.
|
|
25
|
+
NORMALIZATION_VERSION = "1.8.3"
|
|
26
26
|
|
|
27
27
|
|
|
28
28
|
# ── Request 9 (Scimeto, 2026-04-27): Reference-list normalization ──────────
|
|
@@ -617,6 +617,38 @@ _PAGE_FOOTER_LINE_PATTERNS: list[re.Pattern[str]] = [
|
|
|
617
617
|
# Running-header lines with "| <page>" or "<page> Author et al.".
|
|
618
618
|
re.compile(r"^\S(?:[^|\n]{2,80})\|\s*\d{1,4}\s*$"),
|
|
619
619
|
re.compile(r"^\d{1,4}\s+[A-ZÀ-ÿ][^\n]{1,60}\s+et al\.?\s*$"),
|
|
620
|
+
# v2.4.6: "Q. XIAO ET AL." style running header — surname journal abbrev
|
|
621
|
+
# used by CRSP, JESP, and many other 2-column journals. Accepts:
|
|
622
|
+
# "Q. XIAO ET AL." single initial + surname
|
|
623
|
+
# "Q.M. XIAO ET AL." two initials with internal period
|
|
624
|
+
# "Q. M. XIAO ET AL" two initials with space (no trailing dot)
|
|
625
|
+
# All-caps surname required (lowercase letters appear in regular prose
|
|
626
|
+
# like "Most participants in the experimental condition were …").
|
|
627
|
+
re.compile(
|
|
628
|
+
r"^[A-Z]\.(?:\s*[A-Z]\.?)?\s+[A-Z]{2,}\s+ET\s+AL\.?\s*$"
|
|
629
|
+
),
|
|
630
|
+
# v2.4.6: contact-line footer used by Taylor & Francis (CRSP, etc.):
|
|
631
|
+
# "CONTACT Gilad Feldman gfeldman@hku.hk; giladfel@gmail.com …"
|
|
632
|
+
# The `CONTACT` keyword + name + email is distinctive enough to anchor
|
|
633
|
+
# safely. Optional trailing affiliation / region tokens.
|
|
634
|
+
re.compile(
|
|
635
|
+
r"^CONTACT\s+[A-Z][\w'’-]+(?:\s+[A-Z][\w'’-]+)+\s+\S+@\S+.*$"
|
|
636
|
+
),
|
|
637
|
+
# v2.4.6: prefixed author-contribution / corresponding-author footnotes
|
|
638
|
+
# used by Collabra, eLife, PLOS, etc.:
|
|
639
|
+
# "a Contributed equally, joint first author"
|
|
640
|
+
# "b Contributed equally, joint first author"
|
|
641
|
+
# "c Corresponding Author: <name>, <affiliation>"
|
|
642
|
+
re.compile(
|
|
643
|
+
r"^[a-z]\s+(?:Contributed\s+equally|Corresponding\s+Author)\b.*$"
|
|
644
|
+
),
|
|
645
|
+
# v2.4.6: standalone affiliation lines that recur on bottom of every
|
|
646
|
+
# page in 2-column journals — "Department of <field>, University of
|
|
647
|
+
# <place>, <region>".
|
|
648
|
+
re.compile(
|
|
649
|
+
r"^Department\s+of\s+[A-Z][A-Za-z]+(?:\s+and\s+[A-Z][A-Za-z]+)?,\s+"
|
|
650
|
+
r"University\s+of\s+[A-Z][A-Za-z]+(?:\s+Kong)?,\s+.{2,80}$"
|
|
651
|
+
),
|
|
620
652
|
]
|
|
621
653
|
|
|
622
654
|
|
|
@@ -1006,27 +1038,45 @@ def normalize_text(
|
|
|
1006
1038
|
t = "\n".join(lines)
|
|
1007
1039
|
# Strip standalone page numbers — 1-3 digit unconditionally.
|
|
1008
1040
|
t = re.sub(r"^\s*\d{1,3}\s*$", "", t, flags=re.MULTILINE)
|
|
1009
|
-
# v2.4.3: 4-digit page numbers (continuous-pagination journals like
|
|
1010
|
-
# where volume runs page numbers into the 1000s
|
|
1011
|
-
#
|
|
1012
|
-
#
|
|
1013
|
-
#
|
|
1014
|
-
#
|
|
1015
|
-
#
|
|
1016
|
-
#
|
|
1017
|
-
#
|
|
1018
|
-
#
|
|
1019
|
-
#
|
|
1020
|
-
#
|
|
1041
|
+
# v2.4.3/v2.4.5: 4-digit page numbers (continuous-pagination journals like
|
|
1042
|
+
# PSPB where volume runs page numbers into the 1000s, e.g.
|
|
1043
|
+
# ``efendic_2022_affect`` with pages 1174-1185). Two patterns fire:
|
|
1044
|
+
#
|
|
1045
|
+
# (A) RECURRING (v2.4.3) — same value appears ≥3 times. Catches PDFs
|
|
1046
|
+
# where every page repeats the same volume number on its own line
|
|
1047
|
+
# (rare for true page numbers, but happens for volume markers).
|
|
1048
|
+
#
|
|
1049
|
+
# (B) SEQUENTIAL (v2.4.5) — ≥3 distinct standalone 4-digit values in
|
|
1050
|
+
# the doc AND they cluster within a 50-page range (max - min ≤ 50)
|
|
1051
|
+
# AND the average per-page gap is small (mean diff ≤ 3). This is
|
|
1052
|
+
# the canonical continuous-pagination signature: page numbers
|
|
1053
|
+
# monotonically increasing across the article. The conservative
|
|
1054
|
+
# gates protect table cells (where 4-digit values would have
|
|
1055
|
+
# larger spreads and irregular gaps).
|
|
1021
1056
|
four_digit_counts: dict[str, int] = {}
|
|
1022
1057
|
for ln in t.split("\n"):
|
|
1023
1058
|
s = ln.strip()
|
|
1024
1059
|
if len(s) == 4 and s.isascii() and s.isdigit() and 1000 <= int(s) <= 9999:
|
|
1025
1060
|
four_digit_counts[s] = four_digit_counts.get(s, 0) + 1
|
|
1026
|
-
|
|
1027
|
-
|
|
1061
|
+
|
|
1062
|
+
# Pattern A: same value recurs ≥3 times.
|
|
1063
|
+
strip_set: set[str] = {s for s, c in four_digit_counts.items() if c >= 3}
|
|
1064
|
+
|
|
1065
|
+
# Pattern B: ≥3 distinct values clustered tightly together.
|
|
1066
|
+
if len(four_digit_counts) >= 3:
|
|
1067
|
+
values = sorted(int(s) for s in four_digit_counts.keys())
|
|
1068
|
+
spread = values[-1] - values[0]
|
|
1069
|
+
if spread <= 50:
|
|
1070
|
+
# Compute mean of consecutive diffs.
|
|
1071
|
+
diffs = [values[i + 1] - values[i] for i in range(len(values) - 1)]
|
|
1072
|
+
mean_diff = sum(diffs) / len(diffs)
|
|
1073
|
+
if mean_diff <= 3.0:
|
|
1074
|
+
# All values in the cluster are page numbers.
|
|
1075
|
+
strip_set.update(str(v) for v in values)
|
|
1076
|
+
|
|
1077
|
+
if strip_set:
|
|
1028
1078
|
t = "\n".join(
|
|
1029
|
-
"" if ln.strip() in
|
|
1079
|
+
"" if ln.strip() in strip_set else ln
|
|
1030
1080
|
for ln in t.split("\n")
|
|
1031
1081
|
)
|
|
1032
1082
|
report._track("S9_header_footer_removal", before, t, "headers_removed")
|
|
@@ -379,6 +379,130 @@ def _join_multiline_caption_paragraphs(text: str) -> str:
|
|
|
379
379
|
return "".join(paragraphs)
|
|
380
380
|
|
|
381
381
|
|
|
382
|
+
# ── Section C2: orphan table cell-text suppression ──────────────────────────
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
_ORPHAN_TABLE_CAPTION_RE = re.compile(
|
|
386
|
+
r"^Table\s+(\d+)[.:]\s+(.{3,}?)$"
|
|
387
|
+
)
|
|
388
|
+
_ORPHAN_CELL_STOPWORDS = (
|
|
389
|
+
" the ", " of ", " and ", " in ", " to ", " for ", " with ", " that ",
|
|
390
|
+
" this ", " was ", " were ", " are ", " is ", " have ", " has ",
|
|
391
|
+
" from ", " on ", " by ", " an ", " a ",
|
|
392
|
+
)
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
def _is_orphan_cell_paragraph(p: str) -> bool:
|
|
396
|
+
"""Return True iff ``p`` looks like a leaked table cell row, not prose.
|
|
397
|
+
|
|
398
|
+
Conservative heuristic, used only inside the table-cell-text suppressor:
|
|
399
|
+
- Total length ≤ 200 chars (cell content with quoted instruction text or
|
|
400
|
+
concatenated column headers can run 100-200 chars on a single pdftotext
|
|
401
|
+
line; longer than that is almost certainly prose).
|
|
402
|
+
- Not a heading, caption, HTML block, or list marker.
|
|
403
|
+
- Stopword-density and sentence-structure check rule out short prose.
|
|
404
|
+
"""
|
|
405
|
+
if not p:
|
|
406
|
+
return False
|
|
407
|
+
if len(p) > 200:
|
|
408
|
+
return False
|
|
409
|
+
if p.startswith(("#", "*Table", "*Figure", "<table", "</table", "<thead", "<tbody", "<tr", "<td", "<th", ">")):
|
|
410
|
+
return False
|
|
411
|
+
if re.match(r"^(?:Table|Figure)\s+\d", p):
|
|
412
|
+
return False
|
|
413
|
+
if re.match(r"^[*+\-]\s", p) or re.match(r"^\d+\.\s+\w+", p):
|
|
414
|
+
# Markdown list / numbered list — not a cell row.
|
|
415
|
+
# (Numbered ranks like "1. Degree of apology" inside cells can match,
|
|
416
|
+
# but those are typically inside <td> tags, not standalone paragraphs.)
|
|
417
|
+
return False
|
|
418
|
+
if p.startswith("Note") and (":" in p[:8] or "." in p[:8]):
|
|
419
|
+
return False
|
|
420
|
+
lower = " " + p.lower() + " "
|
|
421
|
+
stopword_hits = sum(lower.count(sw) for sw in _ORPHAN_CELL_STOPWORDS)
|
|
422
|
+
# Above 90 chars, prose density must be very low (cells with quoted
|
|
423
|
+
# instruction text or column-header concatenations have ≤ 3 stopwords).
|
|
424
|
+
if len(p) > 90 and stopword_hits >= 4:
|
|
425
|
+
return False
|
|
426
|
+
if len(p) <= 90 and stopword_hits >= 3:
|
|
427
|
+
return False
|
|
428
|
+
# Multi-sentence content is prose, not a cell row.
|
|
429
|
+
if p.count(". ") >= 2:
|
|
430
|
+
return False
|
|
431
|
+
# Single long sentence ending in `.` (not `."` — cells often end in `"`)
|
|
432
|
+
# is prose.
|
|
433
|
+
if p.endswith(".") and not p.endswith(('."', '.")')) and len(p) > 70 and " " in p:
|
|
434
|
+
return False
|
|
435
|
+
return True
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
def _suppress_orphan_table_cell_text(text: str) -> str:
    """Suppress orphan cell-row text leaks after a plain-text Table caption.

    When Camelot does not register a table on a page but pdftotext linearized
    the cell content into the section body, the rendered markdown contains::

        Table 5. Comparison of target article versus replication.

        Target article

        Replication

        Study design

        Sample characteristics

    These short orphan paragraphs are leaked cell content with no structural
    value in the rendered view (the user is told to consult the Raw view).
    This pass:

    1. Detects single-line ``Table N. <caption>`` paragraphs (plain, not
       already italicized — the italic ``*Table N. ...*`` form is the
       v2.4.2 caption-only emission and never has orphan rows).
    2. Scans forward; if 3+ consecutive paragraphs match
       :func:`_is_orphan_cell_paragraph`, italicizes the caption and drops
       the orphan paragraphs.

    Conservative: only fires after a ``Table N.`` caption and only when the
    orphan run is at least 3 paragraphs long. Stops at the first non-orphan
    paragraph (normal prose, another caption, or a heading).

    Args:
        text: Rendered markdown with paragraphs separated by blank lines.

    Returns:
        ``text`` itself, unchanged, when no orphan run was suppressed.
        Otherwise the rewritten markdown with paragraphs re-joined by a
        single blank line.
    """
    if not text or "Table" not in text:
        return text

    paragraphs = re.split(r"\n\n+", text)
    out: list[str] = []
    suppressed_any = False
    i = 0
    while i < len(paragraphs):
        para = paragraphs[i]
        caption = para.strip()
        # Caption must be a single line (no embedded newlines after strip).
        is_plain_caption = bool(
            caption
            and "\n" not in caption
            and not caption.startswith("*")
            and _ORPHAN_TABLE_CAPTION_RE.match(caption)
        )
        if is_plain_caption:
            j = i + 1
            orphan_count = 0
            while j < len(paragraphs):
                candidate = paragraphs[j].strip()
                if candidate:
                    if not _is_orphan_cell_paragraph(candidate):
                        break
                    orphan_count += 1
                # Empty paragraphs are skipped without ending the run.
                j += 1
            if orphan_count >= 3:
                # Italicize the caption (matches v2.4.2 no-cells caption style)
                # and drop the orphan paragraphs.
                out.append(f"*{caption}*")
                suppressed_any = True
                i = j
                continue
        out.append(para)
        i += 1

    if not suppressed_any:
        # Nothing was dropped: hand back the input untouched. Splitting and
        # re-joining would otherwise collapse runs of 3+ newlines anywhere in
        # the document, for any text that merely contains "Table".
        return text
    return "\n\n".join(out)
|
|
504
|
+
|
|
505
|
+
|
|
382
506
|
# ── Section D: JAMA Key Points sidebar reformat ─────────────────────────────
|
|
383
507
|
|
|
384
508
|
|
|
@@ -1352,6 +1476,7 @@ def render_pdf_to_markdown(
|
|
|
1352
1476
|
md = _dedupe_h2_sections(md)
|
|
1353
1477
|
md = _fix_hyphenated_line_breaks(md)
|
|
1354
1478
|
md = _join_multiline_caption_paragraphs(md)
|
|
1479
|
+
md = _suppress_orphan_table_cell_text(md)
|
|
1355
1480
|
md = _merge_compound_heading_tails(md)
|
|
1356
1481
|
md = _reformat_jama_key_points_box(md)
|
|
1357
1482
|
md = _promote_numbered_subsection_headings(md)
|