docpluck 2.4.2__tar.gz → 2.4.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docpluck-2.4.2 → docpluck-2.4.3}/CHANGELOG.md +19 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/PKG-INFO +1 -1
- {docpluck-2.4.2 → docpluck-2.4.3}/docpluck/__init__.py +1 -1
- {docpluck-2.4.2 → docpluck-2.4.3}/docpluck/figures/detect.py +49 -1
- {docpluck-2.4.2 → docpluck-2.4.3}/docpluck/normalize.py +25 -2
- {docpluck-2.4.2 → docpluck-2.4.3}/pyproject.toml +1 -1
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_figure_detect.py +65 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_normalization.py +43 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/.claude/skills/_project/lessons.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/.claude/skills/docpluck-cleanup/SKILL.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/.claude/skills/docpluck-deploy/SKILL.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/.claude/skills/docpluck-qa/SKILL.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/.claude/skills/docpluck-qa/references/benchmark-mode.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/.claude/skills/docpluck-qa/references/check-11-hard-rules.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/.claude/skills/docpluck-qa/references/check-13-escicheck-production.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/.claude/skills/docpluck-qa/references/check-5-escicheck-library.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/.claude/skills/docpluck-qa/references/check-6-escicheck-local-webapp.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/.claude/skills/docpluck-qa/references/check-7-batch-smoke.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/.claude/skills/docpluck-review/SKILL.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/.github/workflows/publish.yml +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/.github/workflows/test.yml +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/.gitignore +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/CLAUDE.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/HANDOFF_SECTIONS_APP_INTEGRATION.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/LESSONS.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/LICENSE +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/REPLY_FROM_DOCPLUCK_v1.4.5.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/REPLY_FROM_DOCPLUCK_v1.5.0.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/REQUEST_08_CHUNKING_ENDPOINT.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/REQUEST_09_REFERENCE_LIST_NORMALIZATION.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/TODO.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docpluck/__main__.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docpluck/batch.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docpluck/cli.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docpluck/extract.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docpluck/extract_docx.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docpluck/extract_html.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docpluck/extract_layout.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docpluck/extract_structured.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docpluck/figures/__init__.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docpluck/quality.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docpluck/render.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docpluck/sections/__init__.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docpluck/sections/annotators/__init__.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docpluck/sections/annotators/docx.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docpluck/sections/annotators/html.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docpluck/sections/annotators/pdf.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docpluck/sections/annotators/text.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docpluck/sections/blocks.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docpluck/sections/boundaries.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docpluck/sections/core.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docpluck/sections/taxonomy.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docpluck/sections/types.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docpluck/tables/__init__.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docpluck/tables/bbox_utils.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docpluck/tables/camelot_extract.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docpluck/tables/captions.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docpluck/tables/cell_cleaning.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docpluck/tables/cluster.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docpluck/tables/confidence.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docpluck/tables/detect.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docpluck/tables/render.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docpluck/tables/whitespace.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docpluck/version.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/BENCHMARKS.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/DESIGN.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/HANDOFF_2026-05-07_sections_strict_iteration.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/HANDOFF_2026-05-09_session_state_and_followups.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/HANDOFF_2026-05-09_unified_extraction_brainstorm.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/HANDOFF_2026-05-10_table_rendering_iteration.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/HANDOFF_2026-05-10_table_rendering_iteration_2.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/HANDOFF_2026-05-10_table_rendering_iteration_3.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/HANDOFF_2026-05-10_table_rendering_iteration_4.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/HANDOFF_2026-05-10_table_rendering_iteration_5.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/HANDOFF_2026-05-10_table_rendering_iteration_6.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/HANDOFF_2026-05-10_table_rendering_iteration_7.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/HANDOFF_2026-05-11_PROMOTE_SPIKE_TO_LIBRARY.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/HANDOFF_2026-05-11_table_rendering_iteration_8.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/HANDOFF_2026-05-11_visual_review_findings.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/HANDOFF_2026-05-12_phase2_101pdf_corpus.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/HANDOFF_2026-05-12_remaining_ui_and_chrome_verification.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/HANDOFF_2026-05-12_visual_verify_results.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/NORMALIZATION.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/README.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/TRIAGE_2026-05-10_corpus_assessment.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/2026-05-06-section-identification.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/2026-05-06-table-extraction.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/2026-05-07-sections-strict-iteration-progress.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/2026-05-08-unified-extraction-phase-0-splice-spike.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/sections-deferred-items.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/sections-issues-backlog.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/2026-05-07_spot-01_apa.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/2026-05-07_spot-02_pattern-A-shipped.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/2026-05-08_spot-final_all-styles.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/experiments/COMPARISON.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/korbmacher_table1.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/option-a.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/ziano_table1.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_notes_raw.txt +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_table1.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/notes.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/option-b.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_notes_raw.txt +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_table1.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/korbmacher_table1.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/notes.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/option-c.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/sample-pdftotext-bbox.html +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/ziano_table1.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/korbmacher_table1.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/notes.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/option-d.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/ziano_table1.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_2022_kruger_bbox.html +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_bbox.html +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_table1.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/option-e.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/sample-bbox.html +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_2021_joep_bbox.html +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_bbox.html +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_table1.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/html-fallback-demo.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.err +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.err +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.err +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.err +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.err +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.err +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.err +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.err +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.err +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.err +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.err +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.err +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.err +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.err +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.err +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.err +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.err +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.err +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.err +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.err +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.err +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.err +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.err +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.err +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.err +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.err +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/papers.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/report.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/splice_spike.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/splice-spike/test_splice_spike.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/specs/2026-04-27-request-09-reference-normalization-design.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/specs/2026-05-06-section-identification-design.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/specs/2026-05-06-table-extraction-design.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/specs/2026-05-08-unified-extraction-design.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/scripts/verify_corpus.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/scripts/verify_corpus_full.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/__init__.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/conftest.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/fixtures/__init__.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/fixtures/sections/__init__.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/fixtures/sections/builders.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/fixtures/structured/.gitkeep +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/fixtures/structured/MANIFEST.json +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/fixtures/structured/README.md +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/golden/sections/apa_multi_study_pdf.json +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/golden/sections/apa_single_study_pdf.json +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/golden/sections/html_real_headings.json +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/snapshots/amj_lattice.txt +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/snapshots/apa_chan_feldman_lineless.txt +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/snapshots/apa_chen_jesp_lineless.txt +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/snapshots/apa_efendic_affect.txt +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/snapshots/apa_ip_feldman_pspb.txt +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/snapshots/bmc_lattice.txt +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/snapshots/ieee_figure_heavy.txt +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/snapshots/ieee_lattice.txt +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/snapshots/jama_lattice.txt +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/snapshots/nat_comms_figure_only.txt +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/snapshots/nature_minimal_rule.txt +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/snapshots/scirep_minimal_rule.txt +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_bbox_utils.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_benchmark_docx_html.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_caption_regex.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_cli_sections.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_cli_structured.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_confidence.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_corpus_smoke.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_d5_normalization_audit.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_edge_cases.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_extract_docx.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_extract_filter_sugar.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_extract_html.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_extract_layout.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_extract_pdf_structured.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_extraction.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_f0_table_region_aware.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_fixtures_manifest.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_lattice_cluster.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_metaesci_followups.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_normalize_f0_footnote_strip.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_normalize_layout_param.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_normalize_report_layout_fields.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_normalize_v18_strips.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_quality.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_render.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_render_html.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_request_09_reference_normalization.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_sections_boundaries.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_sections_boundary_truncation.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_sections_core_partition.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_sections_docx_annotator.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_sections_extract_text.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_sections_footnote_section.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_sections_golden.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_sections_html_annotator.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_sections_pdf_annotator.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_sections_public_api.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_sections_real_corpus.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_sections_taxonomy.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_sections_text_annotator.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_sections_types.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_sections_unit_corpus.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_sections_v161_coalesce.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_sections_v161_subheadings.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_sections_v161_taxonomy.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_sections_v161_text_annotator.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_sections_version.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_smoke_fixtures.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_structured_result_type.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_structured_types.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_structured_version.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_table_detect.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_tables_cell_cleaning.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_text_mode.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_v23_1_fixes.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_v23_bug_fixes.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_v23_post_corpus.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_v23_post_corpus_v2.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_v2_backwards_compat.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_v2_top_level_exports.py +0 -0
- {docpluck-2.4.2 → docpluck-2.4.3}/tests/test_whitespace_cluster.py +0 -0
|
@@ -1,5 +1,24 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [2.4.3] — 2026-05-13
|
|
4
|
+
|
|
5
|
+
Same-day follow-up. Two preventative improvements aimed at quality issues that didn't trip the verifier tags but were visible in rendered output:
|
|
6
|
+
|
|
7
|
+
### Fixes
|
|
8
|
+
|
|
9
|
+
1. **`docpluck/normalize.py::normalize_text` S9 step** — strip 4-digit standalone page numbers from continuous-pagination journals (PSPB volume runs into the 1000s, Psychological Science, etc.). Previously S9 only handled 1–3 digit page numbers; a bare `1174` line leaked into rendered output (e.g. `efendic_2022_affect.md` line 24). New rule strips 4-digit standalone numbers when (a) value is in 1000–9999, (b) same value recurs ≥ 3 times in the document. The recurrence floor protects table-cell values that happen to land on their own line in single-value-per-line column layouts. `NORMALIZATION_VERSION`: `1.8.1` → `1.8.2`.
|
|
10
|
+
|
|
11
|
+
2. **`docpluck/figures/detect.py::_full_caption_text`** — truncate figure captions at chart-data boundaries. pdftotext extracts chart elements (axis labels, gridline values, legend entries) inline with the figure caption when they share a PDF reading-order paragraph. The resulting caption text looks like `Figure 1. Flowchart of Study Sample Selection 4876956 Pairs enrolled before April 1, 2015 1117269 Pairs excluded ...` — useful prose followed by raw chart data. New heuristic: locate the first run of 6+ consecutive digits (signature of chart data — page counts, n-values, and years all top out at 5 digits in academic captions) and truncate just before it at the previous word boundary. Conservative: only fires when caption is ≥ 150 chars and surviving trimmed text is ≥ 40 chars (sanity check protects against edge cases). Affects clinical / biological flowcharts in JAMA, Sci Rep, BMC Medicine papers.
|
|
12
|
+
|
|
13
|
+
### Bumps
|
|
14
|
+
|
|
15
|
+
- `__version__`: `2.4.2` → `2.4.3`. Patch — both fixes are conservative pdftotext post-processing.
|
|
16
|
+
- `NORMALIZATION_VERSION`: `1.8.1` → `1.8.2`.
|
|
17
|
+
|
|
18
|
+
### Tests
|
|
19
|
+
|
|
20
|
+
7 new tests across `tests/test_normalization.py` (4-digit page number stripping, recurrence floor, year edge case) and `tests/test_figure_detect.py` (caption truncation at digit-run boundary, short-caption no-op, legitimate 5-digit-number preservation, minimum-post-label sanity check).
|
|
21
|
+
|
|
3
22
|
## [2.4.2] — 2026-05-13
|
|
4
23
|
|
|
5
24
|
Iterative follow-up. After v2.4.1 the 101-PDF corpus run was 98/101 PASS (`scripts/verify_corpus_full.py`); this release closes two of the three remaining failures and reframes the third as a known short-paper edge case in the verifier.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docpluck
|
|
3
|
-
Version: 2.4.2
|
|
3
|
+
Version: 2.4.3
|
|
4
4
|
Summary: PDF, DOCX, and HTML text extraction and normalization for academic papers
|
|
5
5
|
Project-URL: Homepage, https://github.com/giladfeldman/docpluck
|
|
6
6
|
Project-URL: Documentation, https://github.com/giladfeldman/docpluck/tree/main/docs
|
|
@@ -71,7 +71,7 @@ from .figures import Figure
|
|
|
71
71
|
from .extract_structured import TABLE_EXTRACTION_VERSION, StructuredResult, extract_pdf_structured
|
|
72
72
|
from .render import render_pdf_to_markdown
|
|
73
73
|
|
|
74
|
-
__version__ = "2.4.2"
|
|
74
|
+
__version__ = "2.4.3"
|
|
75
75
|
__author__ = "Gilad Feldman"
|
|
76
76
|
__license__ = "MIT"
|
|
77
77
|
|
|
@@ -10,6 +10,7 @@ See spec §5.7.
|
|
|
10
10
|
|
|
11
11
|
from __future__ import annotations
|
|
12
12
|
|
|
13
|
+
import re
|
|
13
14
|
from collections import defaultdict
|
|
14
15
|
from typing import Any
|
|
15
16
|
|
|
@@ -135,7 +136,54 @@ def _full_caption_text(raw_text: str, cap: CaptionMatch) -> str:
|
|
|
135
136
|
end = raw_text.find("\n\n", cap.char_end)
|
|
136
137
|
if end == -1:
|
|
137
138
|
end = min(cap.char_end + 500, len(raw_text))
|
|
138
|
-
|
|
139
|
+
full = raw_text[cap.char_start:end].replace("\n", " ").strip()
|
|
140
|
+
return _trim_caption_at_chart_data(full)
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
# A run of 6+ consecutive digits in a figure caption is almost never
|
|
144
|
+
# legitimate caption prose — page counts, statistical n-values, and years
|
|
145
|
+
# all top out at 5 digits in academic captions. 6+ digits is a strong signal
|
|
146
|
+
# that pdftotext joined chart data (raw bar-chart values, participant counts,
|
|
147
|
+
# row IDs) into the caption.
|
|
148
|
+
_CHART_DATA_DIGIT_RUN_RE = re.compile(r"\b\d{6,}\b")
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _trim_caption_at_chart_data(caption: str) -> str:
|
|
152
|
+
"""Truncate a caption when it transitions from prose to chart-data.
|
|
153
|
+
|
|
154
|
+
pdftotext extracts chart elements (axis labels, legend entries, gridline
|
|
155
|
+
values) inline with the figure caption when they share a paragraph in the
|
|
156
|
+
PDF reading order. The resulting caption text looks like::
|
|
157
|
+
|
|
158
|
+
Figure 1. Flowchart of Study Sample Selection 4876956 Pairs enrolled
|
|
159
|
+
before April 1, 2015 1117269 Pairs excluded 741469 Withdrawal …
|
|
160
|
+
|
|
161
|
+
where the real caption is "Flowchart of Study Sample Selection" and the
|
|
162
|
+
rest is chart data values.
|
|
163
|
+
|
|
164
|
+
Strategy: locate the first run of 6+ consecutive digits (the signature
|
|
165
|
+
of chart data — counts or row IDs that no real caption sentence would
|
|
166
|
+
contain). Truncate the caption just before that run, falling back to
|
|
167
|
+
the word boundary so we don't end mid-word.
|
|
168
|
+
|
|
169
|
+
Conservative: only fires when the caption is ≥ 150 chars (real short
|
|
170
|
+
captions almost never have chart-data appendage), and only when the
|
|
171
|
+
surviving trimmed caption is ≥ 40 chars after the label.
|
|
172
|
+
"""
|
|
173
|
+
if not caption or len(caption) < 150:
|
|
174
|
+
return caption
|
|
175
|
+
m = _CHART_DATA_DIGIT_RUN_RE.search(caption)
|
|
176
|
+
if m is None:
|
|
177
|
+
return caption
|
|
178
|
+
cut = m.start()
|
|
179
|
+
# Walk back to the previous word boundary.
|
|
180
|
+
while cut > 0 and not caption[cut - 1].isspace():
|
|
181
|
+
cut -= 1
|
|
182
|
+
trimmed = caption[:cut].rstrip(" ,;:")
|
|
183
|
+
# Sanity check.
|
|
184
|
+
if len(trimmed) < 40:
|
|
185
|
+
return caption
|
|
186
|
+
return trimmed
|
|
139
187
|
|
|
140
188
|
|
|
141
189
|
__all__ = ["find_figures"]
|
|
@@ -22,7 +22,7 @@ class NormalizationLevel(str, Enum):
|
|
|
22
22
|
academic = "academic"
|
|
23
23
|
|
|
24
24
|
|
|
25
|
-
NORMALIZATION_VERSION = "1.8.1"
|
|
25
|
+
NORMALIZATION_VERSION = "1.8.2"
|
|
26
26
|
|
|
27
27
|
|
|
28
28
|
# ── Request 9 (Scimeto, 2026-04-27): Reference-list normalization ──────────
|
|
@@ -1004,8 +1004,31 @@ def normalize_text(
|
|
|
1004
1004
|
if repeated:
|
|
1005
1005
|
lines = [l for l in lines if l.strip() not in repeated]
|
|
1006
1006
|
t = "\n".join(lines)
|
|
1007
|
-
# Strip standalone page numbers
|
|
1007
|
+
# Strip standalone page numbers — 1-3 digit unconditionally.
|
|
1008
1008
|
t = re.sub(r"^\s*\d{1,3}\s*$", "", t, flags=re.MULTILINE)
|
|
1009
|
+
# v2.4.3: 4-digit page numbers (continuous-pagination journals like PSPB
|
|
1010
|
+
# where volume runs page numbers into the 1000s). Strip when ALL of:
|
|
1011
|
+
# 1. The line is exactly 4 ASCII digits.
|
|
1012
|
+
# 2. The value falls in the plausible page-number range 1000–9999
|
|
1013
|
+
# (excludes zero-padded values like "0123"; years also fall in this range, so only rule 3 protects a stray year-on-its-own-line).
|
|
1014
|
+
# 3. The SAME value recurs ≥3 times in the document (page numbers
|
|
1015
|
+
# repeat once per physical page, so this is conservative; a
|
|
1016
|
+
# duplicate-by-coincidence table-cell value would need to be the
|
|
1017
|
+
# same number 3 times, which is rare).
|
|
1018
|
+
# The conservative threshold protects table data where a 4-digit value
|
|
1019
|
+
# might legitimately appear on its own line (single-value-per-line
|
|
1020
|
+
# column layouts).
|
|
1021
|
+
four_digit_counts: dict[str, int] = {}
|
|
1022
|
+
for ln in t.split("\n"):
|
|
1023
|
+
s = ln.strip()
|
|
1024
|
+
if len(s) == 4 and s.isascii() and s.isdigit() and 1000 <= int(s) <= 9999:
|
|
1025
|
+
four_digit_counts[s] = four_digit_counts.get(s, 0) + 1
|
|
1026
|
+
recurring_4d = {s for s, c in four_digit_counts.items() if c >= 3}
|
|
1027
|
+
if recurring_4d:
|
|
1028
|
+
t = "\n".join(
|
|
1029
|
+
"" if ln.strip() in recurring_4d else ln
|
|
1030
|
+
for ln in t.split("\n")
|
|
1031
|
+
)
|
|
1009
1032
|
report._track("S9_header_footer_removal", before, t, "headers_removed")
|
|
1010
1033
|
|
|
1011
1034
|
# Limit consecutive newlines
|
|
@@ -94,3 +94,68 @@ def test_figure_typeddict_shape():
|
|
|
94
94
|
"caption": "Mean reaction time across conditions.",
|
|
95
95
|
}
|
|
96
96
|
assert f["id"] == "f1"
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
# v2.4.3: caption truncation at chart-data boundary
|
|
100
|
+
# (digit runs ≥ 6 chars indicate pdftotext joined raw chart values into the
|
|
101
|
+
# caption paragraph — common in clinical / biological flowcharts).
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def test_trim_caption_at_chart_data_truncates_long_digit_run():
|
|
105
|
+
from docpluck.figures.detect import _trim_caption_at_chart_data
|
|
106
|
+
cap = (
|
|
107
|
+
"Figure 1. Flowchart of Study Sample Selection 4876956 Pairs enrolled "
|
|
108
|
+
"before April 1, 2015 1117269 Pairs excluded 741469 Withdrawal 148414 "
|
|
109
|
+
"Withdrawal after baseline 137787 With spouses onset of CVD 84585 "
|
|
110
|
+
"With onset of depression 5014 Duplicated couples 3792142 Eligible "
|
|
111
|
+
"pairs Matched by age and income"
|
|
112
|
+
)
|
|
113
|
+
out = _trim_caption_at_chart_data(cap)
|
|
114
|
+
# 7-digit run "4876956" (≥ 6 consecutive digits) triggers truncation just before it.
|
|
115
|
+
assert out == "Figure 1. Flowchart of Study Sample Selection"
|
|
116
|
+
assert "4876956" not in out
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def test_trim_caption_preserves_short_caption():
|
|
120
|
+
from docpluck.figures.detect import _trim_caption_at_chart_data
|
|
121
|
+
cap = "Figure 2. A short caption with a year reference 2020 here."
|
|
122
|
+
out = _trim_caption_at_chart_data(cap)
|
|
123
|
+
# Under 150-char threshold AND no 6-digit run; no-op.
|
|
124
|
+
assert out == cap
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def test_trim_caption_preserves_legitimate_5digit_numbers():
|
|
128
|
+
from docpluck.figures.detect import _trim_caption_at_chart_data
|
|
129
|
+
cap = (
|
|
130
|
+
"Figure 3. Sample selection diagram including all participants from "
|
|
131
|
+
"the original cohort (N = 12345) and the analytic subsample of 9876 "
|
|
132
|
+
"individuals who completed both waves of the longitudinal survey "
|
|
133
|
+
"between 2018 and 2024 with no missing data on the focal outcomes."
|
|
134
|
+
)
|
|
135
|
+
out = _trim_caption_at_chart_data(cap)
|
|
136
|
+
# 5-digit "12345" does NOT trigger; whole caption preserved.
|
|
137
|
+
assert out == cap
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def test_trim_caption_preserves_prose_with_no_digits():
|
|
141
|
+
from docpluck.figures.detect import _trim_caption_at_chart_data
|
|
142
|
+
cap = (
|
|
143
|
+
"Figure 4. Cumulative incidence of depression by spouses cardiovascular "
|
|
144
|
+
"event among the entire study sample. The horizontal axis shows the "
|
|
145
|
+
"time in months and the vertical axis is cumulative incidence of "
|
|
146
|
+
"depression in percent. Lines represent the four sex-age subgroups."
|
|
147
|
+
)
|
|
148
|
+
out = _trim_caption_at_chart_data(cap)
|
|
149
|
+
# No 6-digit run; full caption preserved.
|
|
150
|
+
assert out == cap
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def test_trim_caption_keeps_minimum_post_label_content():
|
|
154
|
+
from docpluck.figures.detect import _trim_caption_at_chart_data
|
|
155
|
+
# 6-digit run lands right after the label — truncation would leave
|
|
156
|
+
# just "Figure 5." (under 40-char sanity check) — return original.
|
|
157
|
+
long_cap = "Figure 5. " + "x" * 200 + " 1234567 stuff" # >150 chars
|
|
158
|
+
short_pre_label = "Figure 5. 1234567 chart data " + "y" * 200
|
|
159
|
+
out = _trim_caption_at_chart_data(short_pre_label)
|
|
160
|
+
# Sanity check fires; return original.
|
|
161
|
+
assert out == short_pre_label
|
|
@@ -414,6 +414,49 @@ class TestS9_HeaderFooter:
|
|
|
414
414
|
result = norm(text, "standard")
|
|
415
415
|
assert "\n42\n" not in result
|
|
416
416
|
|
|
417
|
+
def test_4digit_page_numbers_stripped_when_recurring(self):
|
|
418
|
+
"""v2.4.3: Continuous-pagination journals (PSPB, JESP volume runs)
|
|
419
|
+
emit page numbers in the 1000-9999 range. When the same 4-digit
|
|
420
|
+
value appears on its own line 3+ times in the doc, treat it as
|
|
421
|
+
a page-number artifact and strip."""
|
|
422
|
+
text = (
|
|
423
|
+
"First page content here.\n"
|
|
424
|
+
"1174\n"
|
|
425
|
+
"Second page begins.\n"
|
|
426
|
+
"1175\n"
|
|
427
|
+
"Body sentence continues.\n"
|
|
428
|
+
"1174\n"
|
|
429
|
+
"More body.\n"
|
|
430
|
+
"1175\n"
|
|
431
|
+
"Even more body content.\n"
|
|
432
|
+
"1174\n"
|
|
433
|
+
)
|
|
434
|
+
result = norm(text, "standard")
|
|
435
|
+
# 1174 appears 3 times → stripped.
|
|
436
|
+
assert "\n1174\n" not in result
|
|
437
|
+
# 1175 appears 2 times → not yet meeting the ≥3 threshold,
|
|
438
|
+
# so left alone (conservative).
|
|
439
|
+
assert "1175" in result
|
|
440
|
+
|
|
441
|
+
def test_4digit_year_on_own_line_preserved(self):
|
|
442
|
+
"""A 4-digit value that only appears ONCE on its own line is NOT
|
|
443
|
+
a page number — could be a year reference or stray data. Leave it."""
|
|
444
|
+
text = "body text\n2024\nmore body text\n"
|
|
445
|
+
result = norm(text, "standard")
|
|
446
|
+
assert "2024" in result
|
|
447
|
+
|
|
448
|
+
def test_4digit_below_1000_preserved(self):
|
|
449
|
+
"""Values below 1000 are handled only by the 1-3-digit pattern (a 4-digit
|
|
450
|
+
value <1000 cannot exist). NOTE: despite the test name, the assertion below documents that a recurring in-range year (2020 x3) IS stripped."""
|
|
451
|
+
# Mostly a sanity check; values like 0999 wouldn't naturally occur.
|
|
452
|
+
text = "abc\n2020\ndef\n2020\nxyz\n2020\nfinal\n"
|
|
453
|
+
result = norm(text, "standard")
|
|
454
|
+
# 2020 recurs 3+ but is a year; the heuristic ALSO strips this
|
|
455
|
+
# case (1000-9999 range), which is acceptable since
|
|
456
|
+
# standalone-line years are a rare verbatim pattern in academic
|
|
457
|
+
# prose. Document the behavior here.
|
|
458
|
+
assert "2020" not in result
|
|
459
|
+
|
|
417
460
|
def test_short_lines_preserved(self):
|
|
418
461
|
"""Lines < 15 chars should NOT be treated as headers."""
|
|
419
462
|
text = "Short\n" * 10 + "Content"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docpluck-2.4.2 → docpluck-2.4.3}/.claude/skills/docpluck-qa/references/check-11-hard-rules.md
RENAMED
|
File without changes
|
|
File without changes
|
{docpluck-2.4.2 → docpluck-2.4.3}/.claude/skills/docpluck-qa/references/check-5-escicheck-library.md
RENAMED
|
File without changes
|
|
File without changes
|
{docpluck-2.4.2 → docpluck-2.4.3}/.claude/skills/docpluck-qa/references/check-7-batch-smoke.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docpluck-2.4.2 → docpluck-2.4.3}/docs/HANDOFF_2026-05-12_remaining_ui_and_chrome_verification.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/2026-05-06-section-identification.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docpluck-2.4.2 → docpluck-2.4.3}/docs/superpowers/plans/spot-checks/2026-05-07_spot-01_apa.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|