docpluck 2.4.63__tar.gz → 2.4.65__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docpluck-2.4.63 → docpluck-2.4.65}/CHANGELOG.md +58 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/PKG-INFO +1 -1
- {docpluck-2.4.63 → docpluck-2.4.65}/docpluck/__init__.py +1 -1
- {docpluck-2.4.63 → docpluck-2.4.65}/docpluck/normalize.py +150 -33
- {docpluck-2.4.63 → docpluck-2.4.65}/pyproject.toml +1 -1
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_normalize_idempotent_real_pdf.py +39 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/.claude/skills/_project/lessons.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/.claude/skills/docpluck-cleanup/SKILL.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/.claude/skills/docpluck-deploy/SKILL.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/.claude/skills/docpluck-iterate/LEARNINGS.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/.claude/skills/docpluck-iterate/SKILL.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/.claude/skills/docpluck-iterate/references/ai-full-doc-verify.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/.claude/skills/docpluck-iterate/references/cycle-report-template.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/.claude/skills/docpluck-iterate/references/local-verification.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/.claude/skills/docpluck-iterate/references/rationalizations.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/.claude/skills/docpluck-iterate/references/real-library-real-pdf.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/.claude/skills/docpluck-iterate/references/release-flow.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/.claude/skills/docpluck-iterate/references/self-improvement.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/.claude/skills/docpluck-iterate/references/three-tier-parity.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/.claude/skills/docpluck-qa/SKILL.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/.claude/skills/docpluck-qa/references/benchmark-mode.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/.claude/skills/docpluck-qa/references/check-11-hard-rules.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/.claude/skills/docpluck-qa/references/check-13-escicheck-production.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/.claude/skills/docpluck-qa/references/check-5-escicheck-library.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/.claude/skills/docpluck-qa/references/check-6-escicheck-local-webapp.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/.claude/skills/docpluck-qa/references/check-7-batch-smoke.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/.claude/skills/docpluck-review/SKILL.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/.github/workflows/bump-app-pin.yml +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/.github/workflows/publish.yml +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/.github/workflows/test.yml +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/.gitignore +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/CLAUDE.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/HANDOFF_SECTIONS_APP_INTEGRATION.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/LESSONS.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/LICENSE +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/REPLY_FROM_DOCPLUCK_v1.4.5.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/REPLY_FROM_DOCPLUCK_v1.5.0.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/REQUEST_08_CHUNKING_ENDPOINT.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/REQUEST_09_REFERENCE_LIST_NORMALIZATION.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/TODO.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docpluck/__main__.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docpluck/batch.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docpluck/cli.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docpluck/extract.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docpluck/extract_docx.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docpluck/extract_html.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docpluck/extract_layout.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docpluck/extract_structured.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docpluck/figures/__init__.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docpluck/figures/detect.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docpluck/quality.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docpluck/render.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docpluck/sections/__init__.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docpluck/sections/annotators/__init__.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docpluck/sections/annotators/docx.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docpluck/sections/annotators/html.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docpluck/sections/annotators/pdf.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docpluck/sections/annotators/text.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docpluck/sections/blocks.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docpluck/sections/boundaries.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docpluck/sections/core.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docpluck/sections/taxonomy.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docpluck/sections/types.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docpluck/tables/__init__.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docpluck/tables/bbox_utils.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docpluck/tables/camelot_extract.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docpluck/tables/captions.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docpluck/tables/cell_cleaning.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docpluck/tables/cluster.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docpluck/tables/confidence.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docpluck/tables/detect.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docpluck/tables/render.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docpluck/tables/whitespace.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docpluck/version.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/BENCHMARKS.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/DESIGN.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-07_sections_strict_iteration.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-09_session_state_and_followups.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-09_unified_extraction_brainstorm.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-10_table_rendering_iteration.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-10_table_rendering_iteration_2.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-10_table_rendering_iteration_3.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-10_table_rendering_iteration_4.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-10_table_rendering_iteration_5.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-10_table_rendering_iteration_6.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-10_table_rendering_iteration_7.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-11_PROMOTE_SPIKE_TO_LIBRARY.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-11_table_rendering_iteration_8.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-11_visual_review_findings.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-12_phase2_101pdf_corpus.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-12_remaining_ui_and_chrome_verification.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-12_visual_verify_results.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-13_apa_50_expansion.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-13_apa_50_expansion_iter_1.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-13_apa_50_expansion_iter_2.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-13_iterate_skill_first_use.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-13_iterative_1.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-13_iterative_library_improvement.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-13_table_extraction_next_iteration.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-14_continue_iterations_v2_4_30_to_15n.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-14_full_corpus_iteration_v2_4_30.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-14_iterate_6_cycles_complete.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-14_iterate_9_cycle_run.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-14_iterate_resume_4_cycles.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-14_iterate_v2_4_31_cycle_15n.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-14_phase_5d_gold_audit_v2_4_29.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-15_autonomous_apa_first_10h.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-15_iterate_apa_run_1.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-16_ai-gold-instructions.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-16_iterate_apa_run_2.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-16_iterate_apa_run_3.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-16_iterate_run_4_final.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-16_iterate_run_4_fix_and_continue.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-16_iterate_run_5.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-16_iterate_run_6.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-17_iterate_run_7.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-17_iterate_run_8.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-17_iterate_run_9.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-18_iterate_run_9_cont.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-18_iterate_run_9_cont2.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-20_iterate_run_9_cont3.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/ITERATION_VERIFICATION_LESSONS.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/LIBRARY_APP_SYNC.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/NORMALIZATION.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/README.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/TRIAGE_2026-05-10_corpus_assessment.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/TRIAGE_2026-05-14_phase_5d_gold_audit.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/2026-05-06-section-identification.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/2026-05-06-table-extraction.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/2026-05-07-sections-strict-iteration-progress.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/2026-05-08-unified-extraction-phase-0-splice-spike.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/sections-deferred-items.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/sections-issues-backlog.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/2026-05-07_spot-01_apa.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/2026-05-07_spot-02_pattern-A-shipped.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/2026-05-08_spot-final_all-styles.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/experiments/COMPARISON.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/korbmacher_table1.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/option-a.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/ziano_table1.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_notes_raw.txt +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_table1.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/notes.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/option-b.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_notes_raw.txt +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_table1.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/korbmacher_table1.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/notes.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/option-c.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/sample-pdftotext-bbox.html +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/ziano_table1.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/korbmacher_table1.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/notes.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/option-d.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/ziano_table1.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_2022_kruger_bbox.html +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_bbox.html +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_table1.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/option-e.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/sample-bbox.html +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_2021_joep_bbox.html +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_bbox.html +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_table1.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/html-fallback-demo.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.err +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.err +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.err +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.err +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.err +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.err +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.err +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.err +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.err +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.err +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.err +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.err +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.err +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.err +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.err +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.err +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.err +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.err +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.err +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.err +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.err +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.err +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.err +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.err +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.err +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.err +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/papers.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/report.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/splice_spike.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/plans/spot-checks/splice-spike/test_splice_spike.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/specs/2026-04-27-request-09-reference-normalization-design.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/specs/2026-05-06-section-identification-design.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/specs/2026-05-06-table-extraction-design.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/docs/superpowers/specs/2026-05-08-unified-extraction-design.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/scripts/harness/README.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/scripts/harness/VERIFIER_PROMPT.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/scripts/harness/__init__.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/scripts/harness/baseline_matrix.json +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/scripts/harness/checks.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/scripts/harness/corpus.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/scripts/harness/corpus_manifest.json +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/scripts/harness/extract.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/scripts/harness/gold_keys.json +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/scripts/harness/inspect.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/scripts/lint_rendered_corpus.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/scripts/verify_corpus.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/scripts/verify_corpus_full.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/__init__.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/conftest.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/fixtures/__init__.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/fixtures/sections/__init__.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/fixtures/sections/builders.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/fixtures/structured/.gitkeep +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/fixtures/structured/MANIFEST.json +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/fixtures/structured/README.md +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/golden/sections/apa_multi_study_pdf.json +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/golden/sections/apa_single_study_pdf.json +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/golden/sections/html_real_headings.json +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/snapshots/amj_lattice.txt +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/snapshots/apa_chan_feldman_lineless.txt +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/snapshots/apa_chen_jesp_lineless.txt +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/snapshots/apa_efendic_affect.txt +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/snapshots/apa_ip_feldman_pspb.txt +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/snapshots/bmc_lattice.txt +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/snapshots/ieee_figure_heavy.txt +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/snapshots/ieee_lattice.txt +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/snapshots/jama_lattice.txt +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/snapshots/nat_comms_figure_only.txt +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/snapshots/nature_minimal_rule.txt +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/snapshots/scirep_minimal_rule.txt +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_a3c_leading_zero_decimal_real_pdf.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_all_caps_section_promote_real_pdf.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_bbox_utils.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_benchmark_docx_html.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_cambridge_footer_strip_real_pdf.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_caption_only_table_heading_real_pdf.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_caption_regex.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_chart_data_trim_real_pdf.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_cid_minus_recovery_real_pdf.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_cli_sections.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_cli_structured.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_confidence.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_corpus_smoke.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_d5_normalization_audit.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_edge_cases.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_elsevier_footer_strip_real_pdf.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_equation_page_header_strip_real_pdf.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_extract_docx.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_extract_filter_sugar.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_extract_html.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_extract_layout.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_extract_pdf_structured.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_extraction.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_f0_table_region_aware.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_fffd_comparison_recovery_real_pdf.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_figure_caption_trim_real_pdf.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_figure_detect.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_fixtures_manifest.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_harness_text_loss_reflow.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_lattice_cluster.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_letterspaced_label_real_pdf.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_ligature_decomposition_real_pdf.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_lt_operator_recovery_real_pdf.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_mathitalic_greek_real_pdf.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_metaesci_followups.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_minus_sign_recovery_real_pdf.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_normalization.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_normalize_a3_r2_body_integer_real_pdf.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_normalize_f0_footnote_strip.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_normalize_layout_param.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_normalize_metadata_leak_real_pdf.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_normalize_report_layout_fields.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_normalize_v18_strips.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_numbered_heading_promotion_real_pdf.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_numbered_section_promotion_real_pdf.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_orphan_multilevel_number_real_pdf.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_orphan_section_number_real_pdf.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_preserve_math_glyphs_real_pdf.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_pua_glyph_recovery_real_pdf.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_quality.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_render.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_render_html.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_request_09_reference_normalization.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_roman_numeral_section_promote_real_pdf.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_section_row_label_no_merge_real_pdf.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_sections_boundaries.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_sections_boundary_truncation.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_sections_core_partition.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_sections_docx_annotator.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_sections_extract_text.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_sections_footnote_section.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_sections_golden.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_sections_html_annotator.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_sections_pdf_annotator.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_sections_public_api.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_sections_real_corpus.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_sections_taxonomy.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_sections_text_annotator.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_sections_types.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_sections_unit_corpus.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_sections_v161_coalesce.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_sections_v161_subheadings.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_sections_v161_taxonomy.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_sections_v161_text_annotator.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_sections_version.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_smoke_fixtures.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_structured_result_type.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_structured_types.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_structured_version.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_table_caption_cell_region_real_pdf.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_table_detect.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_tables_cell_cleaning.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_text_mode.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_v23_1_fixes.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_v23_bug_fixes.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_v23_post_corpus.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_v23_post_corpus_v2.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_v2_backwards_compat.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_v2_top_level_exports.py +0 -0
- {docpluck-2.4.63 → docpluck-2.4.65}/tests/test_whitespace_cluster.py +0 -0
|
@@ -1,5 +1,63 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [2.4.65] — 2026-05-22
|
|
4
|
+
|
|
5
|
+
**Cycle 13 (run 9) — three further normalize_text idempotence fixes.** Post-cycle-12: 11 papers non-idempotent. Cycle 13 packages three independent fixes that together clear 4.
|
|
6
|
+
|
|
7
|
+
### 1. P1r late re-strip — front-matter metadata leak (4 papers: li-feldman-fox, amp-1, annals-2, xiao-poc-epley)
|
|
8
|
+
|
|
9
|
+
`_strip_frontmatter_metadata_leaks` matches acknowledgment lines by anchored prefix + keyword guard (`reviewers|editor|feedback|comments|suggestions|insights|helpful`) within 300 chars. pdftotext often line-wraps the acknowledgment BEFORE the guard keyword fires: `We thank the target article's authors - Prof. Craig Fox and Prof. Rebecca Ratner, for being very` (96 chars; no keyword yet). S7/S8 join the continuation; the joined line now contains `helpful in providing us with materials...` — but P1 has already run by then. Pass 2's P1 catches the joined form — non-idempotence + a real missed production strip.
|
|
10
|
+
|
|
11
|
+
Fix: P1r block at end of `normalize_text`, after H0r and before P0r. Same shape as cycle 7's H0r and cycle 9's P0r — fixed-point re-application of an idempotent line-strip on stabilized line positions.
|
|
12
|
+
|
|
13
|
+
### 2. Cross-paragraph `=`/`<`/`>` → digit join — same shape as cycle 12 (1 paper: li-feldman-fox additional defect)
|
|
14
|
+
|
|
15
|
+
A1's `re.sub(r"([=<>])\s*\n\s*([-\d.])", r"\1 \2", t)` uses `\s*` (crosses paragraphs) but runs BEFORE S9 strips header/footer junk. `p =\n\n\x0cFox et al. (2005)...\n\n38\n\n.25, OR = .96, 95%CI [.90, 1.03]` fails on pass 1 (the header text isn't `\s`); S9 strips, leaves `p =\n\n.25`; A1 is over.
|
|
16
|
+
|
|
17
|
+
Fix: add `re.sub(r"([=<>])\s*\n\s*\n\s*(?=[-.]?\d)", r"\1 ", t)` to the LateJoin block. The lookahead `(?=[-.]?\d)` is the load-bearing constraint — real paragraphs rarely START with a leading dot or `-digit`. Same shape as cycle 12's cross-paragraph `,/;` → `CI/p` joins.
|
|
18
|
+
|
|
19
|
+
### 3. LABELED CI bracket — intervening-stat-label gate (refines cycle 12; 1 paper: majumder)
|
|
20
|
+
|
|
21
|
+
Cycle 12's LABELED-bracket discriminator was too permissive. `M = 5.37, SD = 2.01), t(1827) = 1.83, p tukey = .067, d = 0.09, 95% CI [-0.006, 0.18]` has a LABELED `95% CI [...]` that incorrectly paired with `SD = 2.01` (across `t(`, `p tukey =`, `d =` — three INDEPENDENT-statistic labels). The CI is for `d = 0.09`, not for the SD.
|
|
22
|
+
|
|
23
|
+
Fix: even a LABELED bracket cannot reach back across an independent-stat label. `_INDEPENDENT_STAT_BETWEEN_RE` rejects pairings whose intervening text contains a NEW estimate label (`t`, `F`, `d`, `g`, `OR`, `RR`, `β`, `R²`, `Z`, …) — only variance-family labels (`SD`, `SE`, `M`, `CI`, `%`) are allowed between the candidate token and the labeled CI. efendic's `Mposterior = 20.54, SD=0.04, CI = [-0.61, -0.47]` (only `SD` between) still pairs correctly.
|
|
24
|
+
|
|
25
|
+
**Impact:** corpus-wide non-idempotency 11 → 7. Broad pytest 1356 pass + 1 known pre-existing B6 fail. Harness Tier-D academic: 0 regressions, 0 new fails (1 still failing — plos-med-1 / B1).
|
|
26
|
+
|
|
27
|
+
NORMALIZATION_VERSION 1.9.19. Cycle 11/12 contract tests still pass under the refined LABELED-bracket gate.
|
|
28
|
+
|
|
29
|
+
## [2.4.64] — 2026-05-22
|
|
30
|
+
|
|
31
|
+
**Cycle 12 (run 9) — three independent normalize_text idempotence fixes.** A 180-doc scan post-cycle-11 found 17 papers still non-idempotent. This cycle packages three independent fixes that together clear 6 of them:
|
|
32
|
+
|
|
33
|
+
### 1. Final blank-line collapse (5 papers — chan-etal, horsham, lee-feldman, li-feldman-mental-acct, kassambara)
|
|
34
|
+
|
|
35
|
+
Raw pdftotext output contains form-feed `\x0c` characters at page boundaries. S9's `re.sub(r"\n{3,}", "\n\n", t)` collapses consecutive blank lines, but the form-feed survives upstream stripping into the references region, where R3 (continuation join) processes line-by-line — `"\x0c".strip() == ""` so the form-feed line becomes an empty entry, surrounded by other empty entries. R3 outputs `"\n".join(["...", "", "", "...", ""])` = `\n\n\n\n` (4 newlines). S9's collapse already ran upstream; nothing else collapses. Pass 2 sees the `\n{4}` run and S9 collapses it — non-idempotence.
|
|
36
|
+
|
|
37
|
+
Fix: add a final `re.sub(r"\n{3,}", "\n\n", t)` right before the H0r/P0r blocks. Any late strip step that empties a line is now safely followed by the collapse, regardless of which step produced the gap.
|
|
38
|
+
|
|
39
|
+
### 2. Cross-paragraph stat-continuation join (2 papers — korbmacher×2)
|
|
40
|
+
|
|
41
|
+
A1 (the early stat-line-repair step using `\s*`) crosses paragraph breaks but runs BEFORE S9 strips header/footer noise. A row like
|
|
42
|
+
|
|
43
|
+
`r(1798) = -0.27,\n\n472\n\nJournal of Decision Making, Vol. 17...\n\n95% CI [-0.31, ...]`
|
|
44
|
+
|
|
45
|
+
has so much intervening junk that A1's lookahead fails on pass 1. S9 then strips `472` (page num) and the journal-masthead/page-header (repeated ≥5 times), leaving `-0.27,\n\n95% CI`. A1 is over; LateJoin's A1r uses strict `[ \t]*\n[ \t]*` (single-newline only) and so doesn't fire. Pass 2's A1 sees the now-clean `,\n\n95% CI` and joins — non-idempotence.
|
|
46
|
+
|
|
47
|
+
Fix: add two paragraph-crossing variants to the LateJoin A1r block, restricted to high-confidence prefixes — `\d+% CI` and `p [<=>]`. No real paragraph STARTS with `95% CI` or `p < .001`, so joining across `\n\n` is safe. The `test_column_bleed_too_many_fragments_ignored` contract is unaffected — its input has no leading `,`/`;`.
|
|
48
|
+
|
|
49
|
+
### 3. LABELED vs BARE CI bracket discriminator (refines cycle 11)
|
|
50
|
+
|
|
51
|
+
Cycle 11's proximity gate broke 2 pre-existing tests:
|
|
52
|
+
- `test_ci_pairing_recovers_body_line`: `Mposterior = 20.54, SD=0.04, CI = [-0.61, -0.47]` — `, SD=` falsely tripped the "new stat label" sentence-break check, blocking the legitimate recovery of `20.54` → `-0.54`.
|
|
53
|
+
- `test_efendic_table_point_estimates_recovered_via_ci`: efendic's body-line CI recoveries no longer fired.
|
|
54
|
+
|
|
55
|
+
Fix: discriminate LABELED brackets (`CI = [...]` / `95% CI [...]` / `CI: [...]`) from BARE brackets (`[lo, hi]` alone). LABELED brackets can pair with any candidate token in the row (the chain `M = X, SD = Y, CI = [...]` is all describing the same estimate). BARE brackets retain the strict 30-char + period/semicolon-break proximity gate (catches the majumder false-positive — bare bracket ~50 chars after `2.01`, attached to a different stat). The `_CI_LABEL_PREFIX_RE` looks back ≤8 chars from the `[` for `CI` / `\d+% CI` (with optional `=`/`:`).
|
|
56
|
+
|
|
57
|
+
**Impact:** corpus-wide non-idempotency 17 → 11 (cycle 12 cleared 6: 5 bibliography-shift + 2 korbmacher; 3 new bibliography cases of the same shape are now caught by the final collapse). Broad pytest 1356 pass + 1 known pre-existing B6 fail. Harness Tier-D academic: 0 regressions, 0 new fails (1 still failing — plos-med-1 / B1).
|
|
58
|
+
|
|
59
|
+
NORMALIZATION_VERSION 1.9.18. New tests: `test_normalize_collapses_late_blank_line_runs` + `test_late_join_crosses_paragraph_for_stat_continuation`. Cycle 11's tests (`*_proximity_gate_*`) still pass under the LABELED/BARE refinement.
|
|
60
|
+
|
|
3
61
|
## [2.4.63] — 2026-05-21
|
|
4
62
|
|
|
5
63
|
**Cycle 11 (run 9) — `recover_minus_via_ci_pairing` proximity gate.** A 180-doc scan post-cycle-10 found 19 papers still non-idempotent. Among them, 8 (majumder, korbmacher×2, van-boven, chan-feldman-baron, ziano, xiao-poc, amp-1, annals-2) shared a structural defect that ALSO ships in single-pass production: the `_recover_minus_in_record` helper paired every candidate `2X.XX` token with EVERY CI bracket in the same record. A record like `M = 5.37, SD = 2.01), t(1827) = 1.83, p tukey = .067, d = 0.09 [-1.86, 0.04]` contains:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docpluck
|
|
3
|
-
Version: 2.4.
|
|
3
|
+
Version: 2.4.65
|
|
4
4
|
Summary: PDF, DOCX, and HTML text extraction and normalization for academic papers
|
|
5
5
|
Project-URL: Homepage, https://github.com/giladfeldman/docpluck
|
|
6
6
|
Project-URL: Documentation, https://github.com/giladfeldman/docpluck/tree/main/docs
|
|
@@ -71,7 +71,7 @@ from .figures import Figure
|
|
|
71
71
|
from .extract_structured import TABLE_EXTRACTION_VERSION, StructuredResult, extract_pdf_structured
|
|
72
72
|
from .render import render_pdf_to_markdown
|
|
73
73
|
|
|
74
|
-
__version__ = "2.4.
|
|
74
|
+
__version__ = "2.4.65"
|
|
75
75
|
__author__ = "Gilad Feldman"
|
|
76
76
|
__license__ = "MIT"
|
|
77
77
|
|
|
@@ -23,7 +23,7 @@ class NormalizationLevel(str, Enum):
|
|
|
23
23
|
academic = "academic"
|
|
24
24
|
|
|
25
25
|
|
|
26
|
-
NORMALIZATION_VERSION = "1.9.
|
|
26
|
+
NORMALIZATION_VERSION = "1.9.19"
|
|
27
27
|
|
|
28
28
|
|
|
29
29
|
# ── Mathematical Alphanumeric Symbols de-styling (shared, v2.4.34) ──────────
|
|
@@ -1474,35 +1474,64 @@ _CORRUPT_NEG_TOKEN_RE = re.compile(r"(?<![\d.\-])2(\d?\.\d+)\b")
|
|
|
1474
1474
|
_TABLE_ROW_RE = re.compile(r"<tr\b.*?</tr>", re.DOTALL | re.IGNORECASE)
|
|
1475
1475
|
|
|
1476
1476
|
|
|
1477
|
-
# Cycle 11 (v2.4.63) — proximity gate for the CI-pairing recovery.
|
|
1477
|
+
# Cycle 11 (v2.4.63) / 12 (v2.4.64) — proximity gate for the CI-pairing recovery.
|
|
1478
1478
|
#
|
|
1479
|
-
# In stat reporting
|
|
1480
|
-
#
|
|
1481
|
-
#
|
|
1482
|
-
#
|
|
1483
|
-
#
|
|
1484
|
-
#
|
|
1485
|
-
#
|
|
1486
|
-
#
|
|
1487
|
-
# recovered `2.01` → `-.01`, corrupting the SD. 8 papers in the corpus
|
|
1488
|
-
# (majumder, korbmacher, van-boven, ...) had this defect.
|
|
1479
|
+
# In stat reporting a BARE bracket `[lo, hi]` attaches to the IMMEDIATELY-
|
|
1480
|
+
# preceding point estimate; a LABELED bracket `CI = [lo, hi]` or
|
|
1481
|
+
# `95% CI [lo, hi]` can attach to ANY earlier point estimate on the same
|
|
1482
|
+
# row (the SD/SE/df-pair tokens in between are descriptive of the same
|
|
1483
|
+
# estimate). The cycle 11 proximity gate treated both as needing strict
|
|
1484
|
+
# adjacency, which broke efendic's body-line recovery
|
|
1485
|
+
# `Mposterior = 20.54, SD=0.04, CI = [-0.61, -0.47]`
|
|
1486
|
+
# where `, SD=` falsely tripped the "new stat label" sentence-break check.
|
|
1489
1487
|
#
|
|
1490
|
-
#
|
|
1491
|
-
# bracket
|
|
1492
|
-
#
|
|
1493
|
-
#
|
|
1494
|
-
#
|
|
1488
|
+
# Cycle 12 fix: discriminate LABELED vs BARE brackets.
|
|
1489
|
+
# - LABELED bracket (`CI =`/`95% CI`/`CI:` immediately precedes `[`):
|
|
1490
|
+
# pairs with any candidate token in its record (the old wide rule).
|
|
1491
|
+
# - BARE bracket: pairs ONLY with candidates within 30 chars + no
|
|
1492
|
+
# sentence break (period/semicolon + space — NOT comma + new label,
|
|
1493
|
+
# because stat-row labels are comma-separated by convention).
|
|
1494
|
+
#
|
|
1495
|
+
# This keeps the majumder fix (bare bracket far from `2.01`) AND
|
|
1496
|
+
# preserves efendic-style labeled CIs that pair across SD/SE annotations.
|
|
1495
1497
|
_CI_PAIR_MAX_GAP = 30
|
|
1496
|
-
|
|
1497
|
-
|
|
1498
|
-
|
|
1498
|
+
# Bare-bracket sentence break: only period/semicolon + space. A comma is
|
|
1499
|
+
# NOT a break because stat rows are comma-separated. The majumder false-
|
|
1500
|
+
# positive is now caught by the per-bracket proximity check (the bare
|
|
1501
|
+
# bracket sits ~50 chars after `2.01` — beyond _CI_PAIR_MAX_GAP).
|
|
1502
|
+
_SENTENCE_BREAK_RE = re.compile(r"[.;]\s")
|
|
1503
|
+
# A bracket is "labeled" when prefixed by `CI`, `95 % CI`, or similar
|
|
1504
|
+
# directly before the opening `[`. Allow optional whitespace and an `=` /
|
|
1505
|
+
# `:` between the label and the bracket.
|
|
1506
|
+
_CI_LABEL_PREFIX_RE = re.compile(r"(?:\bCI|\b\d+\s*%\s*CI)\s*[=:]?\s*$", re.IGNORECASE)
|
|
1507
|
+
# Cycle 13 (v2.4.65) — even a LABELED CI cannot reach back ACROSS an
|
|
1508
|
+
# independent-test-statistic label. The discriminator: between the
|
|
1509
|
+
# candidate token and the labeled bracket, allow ONLY variance-family
|
|
1510
|
+
# labels (SD, SE, M, Mdn, Var, CI, 95% CI itself, %), reject anything
|
|
1511
|
+
# that introduces a NEW estimate (t, F, p, d, g, η, χ, r, R², β, OR, RR,
|
|
1512
|
+
# HR, B, Z, Q).
|
|
1513
|
+
#
|
|
1514
|
+
# Why: `Mposterior = 20.54, SD=0.04, CI = [-0.61, -0.47]` (efendic) has
|
|
1515
|
+
# only SD between the candidate and the CI — same estimate, paired OK.
|
|
1516
|
+
# `M = 5.37, SD = 2.01, t(1827) = 1.83, p tukey = .067, d = 0.09, 95% CI
|
|
1517
|
+
# [-0.006, 0.18]` (majumder) has t, p, d — three independent estimates —
|
|
1518
|
+
# between `2.01` and the CI; the CI is for `d`, not `2.01`. Reject.
|
|
1519
|
+
_INDEPENDENT_STAT_BETWEEN_RE = re.compile(
|
|
1520
|
+
r"(?:^|[,;\s])\s*"
|
|
1521
|
+
r"(?:t|F|d|g|R|R²|β|γ|B|OR|RR|HR|H|Q|Z|f|n|η|χ|η²|χ²|r|"
|
|
1522
|
+
r"p\s+tukey|p\s+holm|p\s+bonf(?:erroni)?|p\s+adj|"
|
|
1523
|
+
r"\bp(?:\s*[=<>]))"
|
|
1524
|
+
r"\s*[=(\(]",
|
|
1499
1525
|
)
|
|
1500
1526
|
|
|
1501
1527
|
|
|
1502
1528
|
def _recover_minus_in_record(record: str) -> str:
|
|
1503
1529
|
"""Recover '2X.XX' tokens in a single record (a table row or a text line)
|
|
1504
1530
|
by pairing each with a CI bracket present in the same record."""
|
|
1505
|
-
|
|
1531
|
+
# Each entry: (lo, hi, (bs, be), is_labeled). `is_labeled` is True when
|
|
1532
|
+
# the bracket is prefixed by `CI`/`95% CI`/etc. — see cycle 12 notes
|
|
1533
|
+
# at _CI_LABEL_PREFIX_RE.
|
|
1534
|
+
brackets: list[tuple[float, float, tuple[int, int], bool]] = []
|
|
1506
1535
|
for m in _CI_PAIR_BRACKET_RE.finditer(record):
|
|
1507
1536
|
try:
|
|
1508
1537
|
lo, hi = float(m.group(1)), float(m.group(2))
|
|
@@ -1510,13 +1539,17 @@ def _recover_minus_in_record(record: str) -> str:
|
|
|
1510
1539
|
continue
|
|
1511
1540
|
if lo > hi:
|
|
1512
1541
|
continue # not a well-formed interval
|
|
1513
|
-
|
|
1542
|
+
# Look back ≤8 chars for a `CI` / `95 % CI` label.
|
|
1543
|
+
bs, be = m.span()
|
|
1544
|
+
prefix = record[max(0, bs - 8): bs]
|
|
1545
|
+
is_labeled = bool(_CI_LABEL_PREFIX_RE.search(prefix))
|
|
1546
|
+
brackets.append((lo, hi, (bs, be), is_labeled))
|
|
1514
1547
|
if not brackets:
|
|
1515
1548
|
return record
|
|
1516
1549
|
|
|
1517
1550
|
def _sub(m: "re.Match[str]") -> str:
|
|
1518
1551
|
# Never touch a token that lies inside a bracket span (a CI bound).
|
|
1519
|
-
for _lo, _hi, (bs, be) in brackets:
|
|
1552
|
+
for _lo, _hi, (bs, be), _lab in brackets:
|
|
1520
1553
|
if bs <= m.start() < be:
|
|
1521
1554
|
return m.group(0)
|
|
1522
1555
|
frac = m.group(1)
|
|
@@ -1525,22 +1558,33 @@ def _recover_minus_in_record(record: str) -> str:
|
|
|
1525
1558
|
recovered = float("-" + frac)
|
|
1526
1559
|
except ValueError:
|
|
1527
1560
|
return m.group(0)
|
|
1528
|
-
# Cycle
|
|
1529
|
-
#
|
|
1530
|
-
#
|
|
1531
|
-
#
|
|
1561
|
+
# Cycle 12: pick the NEAREST bracket whose pairing rules accept this
|
|
1562
|
+
# token. LABELED brackets accept any candidate in the record (legacy
|
|
1563
|
+
# wide rule — efendic body line `Mposterior = 20.54, SD=0.04,
|
|
1564
|
+
# CI = [-0.61, -0.47]` is the canonical case). BARE brackets only
|
|
1565
|
+
# accept the immediately-preceding stat (within 30 chars, no
|
|
1566
|
+
# sentence break) — this is what blocks the majumder false-positive
|
|
1567
|
+
# `M = 5.37, SD = 2.01, t = ..., d = 0.09 [-1.86, 0.04]`.
|
|
1532
1568
|
token_end = m.end()
|
|
1533
1569
|
nearest = None
|
|
1534
1570
|
nearest_dist = None
|
|
1535
|
-
for lo, hi, (bs, be) in brackets:
|
|
1571
|
+
for lo, hi, (bs, be), is_labeled in brackets:
|
|
1536
1572
|
if bs < token_end:
|
|
1537
|
-
continue # bracket precedes the token — not its CI
|
|
1538
|
-
gap = bs - token_end
|
|
1539
|
-
if gap > _CI_PAIR_MAX_GAP:
|
|
1540
1573
|
continue
|
|
1574
|
+
gap = bs - token_end
|
|
1541
1575
|
intervening = record[token_end:bs]
|
|
1542
|
-
if
|
|
1543
|
-
|
|
1576
|
+
if is_labeled:
|
|
1577
|
+
# Labeled bracket: relaxed proximity, but still reject if
|
|
1578
|
+
# an independent-stat label intervenes. The label gates the
|
|
1579
|
+
# pairing to the variance-family (SD/SE/M/CI/%) of the
|
|
1580
|
+
# SAME estimate. See _INDEPENDENT_STAT_BETWEEN_RE notes.
|
|
1581
|
+
if _INDEPENDENT_STAT_BETWEEN_RE.search(intervening):
|
|
1582
|
+
continue
|
|
1583
|
+
else:
|
|
1584
|
+
if gap > _CI_PAIR_MAX_GAP:
|
|
1585
|
+
continue
|
|
1586
|
+
if _SENTENCE_BREAK_RE.search(intervening):
|
|
1587
|
+
continue
|
|
1544
1588
|
if nearest_dist is None or gap < nearest_dist:
|
|
1545
1589
|
nearest = (lo, hi)
|
|
1546
1590
|
nearest_dist = gap
|
|
@@ -2649,6 +2693,31 @@ def normalize_text(
|
|
|
2649
2693
|
t = re.sub(r"([=<>])[ \t]*\n[ \t]*(?=[-\d.])", r"\1 ", t)
|
|
2650
2694
|
t = re.sub(r"([,;])[ \t]*\n[ \t]*(?=p\s*[<=>])", r"\1 ", t)
|
|
2651
2695
|
t = re.sub(r"([,;])[ \t]*\n[ \t]*(?=\d+%\s*CI)", r"\1 ", t)
|
|
2696
|
+
# Cycle 12 (v2.4.64) — cross-paragraph stat-continuation join.
|
|
2697
|
+
# A1 (which uses `\s*` and so crosses paragraph breaks) runs BEFORE
|
|
2698
|
+
# S9 strips header/footer lines. So a stat row like
|
|
2699
|
+
# `r(1798) = -0.27,\n\n472\n\nJournal of Decision Making, ...\n\n95% CI [-0.31, ...]`
|
|
2700
|
+
# has so much intervening junk that A1's lookahead fails on pass 1;
|
|
2701
|
+
# only after S9 strips the junk (producing `,\n\n95% CI`) can the
|
|
2702
|
+
# join happen, and that's pass 2. The two patterns below are the
|
|
2703
|
+
# paragraph-crossing variants of the comma-to-stat-continuation
|
|
2704
|
+
# patterns above — restricted to the high-confidence prefixes
|
|
2705
|
+
# `\d+% CI` and `p [<=>]` because no real paragraph STARTS with
|
|
2706
|
+
# those tokens (test_column_bleed_too_many_fragments_ignored is
|
|
2707
|
+
# unaffected — its input has no leading `,`/`;`).
|
|
2708
|
+
# Clears korbmacher (2 papers) from the non-idempotent set.
|
|
2709
|
+
t = re.sub(r"([,;])\s*\n\s*\n\s*(?=\d+%\s*CI)", r"\1 ", t)
|
|
2710
|
+
t = re.sub(r"([,;])\s*\n\s*\n\s*(?=p\s*[<=>])", r"\1 ", t)
|
|
2711
|
+
# Cycle 13 (v2.4.65) — same shape, applied to `=/<>` → digit/dot
|
|
2712
|
+
# continuations. li-feldman-fox has `p =\n\n\x0cFox et al. (2005)...
|
|
2713
|
+
# \n\n38\n\n.25, OR = .96, 95%CI [.90, 1.03])` where A1's
|
|
2714
|
+
# `([=<>])\s*\n\s*([-\d.])` pattern fails on pass 1 (the journal-
|
|
2715
|
+
# header text isn't `\s`); S9 strips the header + page number,
|
|
2716
|
+
# leaving `p =\n\n.25` — but A1 is over. Pass 2 joins on the
|
|
2717
|
+
# cleaned form. The lookahead `(?=[-.]?\d)` is the load-bearing
|
|
2718
|
+
# constraint — real paragraphs rarely START with a leading dot
|
|
2719
|
+
# or `-digit`.
|
|
2720
|
+
t = re.sub(r"([=<>])\s*\n\s*\n\s*(?=[-.]?\d)", r"\1 ", t)
|
|
2652
2721
|
report._track("LateJoin_line_break_rejoin", before, t, "late_line_joins")
|
|
2653
2722
|
|
|
2654
2723
|
# ── H0r: header-banner re-strip on stabilized line positions ─────────
|
|
@@ -2669,6 +2738,54 @@ def normalize_text(
|
|
|
2669
2738
|
t = _restripped
|
|
2670
2739
|
report._track("H0r_header_banner_restrip", before, t, "header_banners_restripped")
|
|
2671
2740
|
|
|
2741
|
+
# ── Final blank-line collapse ────────────────────────────────────────
|
|
2742
|
+
# S9 enforces `re.sub(r"\n{3,}", "\n\n", t)` once near the top of the
|
|
2743
|
+
# pipeline. Later steps that REMOVE non-blank content can leave blank
|
|
2744
|
+
# gaps that S9's earlier collapse no longer reaches:
|
|
2745
|
+
#
|
|
2746
|
+
# - R3 (refs-section continuation join) walks the refs span line by
|
|
2747
|
+
# line. A bare form-feed `\x0c` (pdftotext page-break) between two
|
|
2748
|
+
# blank lines is treated as empty (`"\x0c".strip() == ""`) and is preserved as a blank
|
|
2749
|
+
# entry; R3 outputs three consecutive blank entries surrounded by
|
|
2750
|
+
# `"\n".join(...)` — `\n\n\n\n`. Pass 1 leaves this; pass 2's S9
|
|
2751
|
+
# collapses it, producing the bibliography-shift non-idempotence
|
|
2752
|
+
# (cycle 12 — 5 papers: chan-etal, horsham, lee-feldman,
|
|
2753
|
+
# li-feldman-mental, + 1 incidental).
|
|
2754
|
+
# - Same pattern for any late strip step that empties a line without
|
|
2755
|
+
# re-collapsing.
|
|
2756
|
+
#
|
|
2757
|
+
# Add the collapse here so the function is idempotent regardless of
|
|
2758
|
+
# which late step produced the blank-line run.
|
|
2759
|
+
t = re.sub(r"\n{3,}", "\n\n", t)
|
|
2760
|
+
|
|
2761
|
+
# ── P1r: front-matter metadata-leak re-strip on stabilized lines ─────
|
|
2762
|
+
# Same shape as H0r and P0r. P1's `_strip_frontmatter_metadata_leaks`
|
|
2763
|
+
# matches an acknowledgment-style line by ANCHORED prefix + a keyword
|
|
2764
|
+
# check within the first 300 chars (e.g. `^We\s+thank...reviewers|
|
|
2765
|
+
# editor|feedback|comments|suggestions|insights|helpful`). pdftotext
|
|
2766
|
+
# often line-wraps the acknowledgment before the keyword fires (e.g.
|
|
2767
|
+
# `We thank the target article's authors - Prof. Craig Fox and Prof.
|
|
2768
|
+
# Rebecca Ratner, for being very` — the raw line stops before
|
|
2769
|
+
# `helpful`). S7/S8 join the continuation; the joined line now contains
|
|
2770
|
+
# the keyword, but P1 has already run by then. Pass 2's P1 catches the
|
|
2771
|
+
# joined form and strips — non-idempotence + a missed production strip.
|
|
2772
|
+
#
|
|
2773
|
+
# Re-running here on the post-LateJoin line positions catches every
|
|
2774
|
+
# form (the original short line where the keyword was already in
|
|
2775
|
+
# window, AND the post-join long line where it's only in window after
|
|
2776
|
+
# the join).
|
|
2777
|
+
#
|
|
2778
|
+
# Cycle 13 (v2.4.65) — clears li-feldman-fox + amp-1 + annals-2 +
|
|
2779
|
+
# xiao-poc-epley (4 acknowledgment-block papers) from the
|
|
2780
|
+
# non-idempotent set.
|
|
2781
|
+
before = t
|
|
2782
|
+
while True:
|
|
2783
|
+
_restripped = _strip_frontmatter_metadata_leaks(t)
|
|
2784
|
+
if _restripped == t:
|
|
2785
|
+
break
|
|
2786
|
+
t = _restripped
|
|
2787
|
+
report._track("P1r_frontmatter_leak_restrip", before, t, "frontmatter_leaks_restripped")
|
|
2788
|
+
|
|
2672
2789
|
# ── P0r: page-footer-line re-strip on stabilized line positions ──────
|
|
2673
2790
|
# Same shape as H0r, applied to P0's anchored ^...$ patterns. P0 runs
|
|
2674
2791
|
# near the top of the pipeline, where some P0-targeted lines are still
|
|
@@ -234,6 +234,45 @@ def test_s9_4digit_pattern_a_still_strips_isolated_page_numbers():
|
|
|
234
234
|
)
|
|
235
235
|
|
|
236
236
|
|
|
237
|
+
def test_normalize_collapses_late_blank_line_runs():
|
|
238
|
+
"""Cycle 12: a late strip step that empties a line (e.g. R3 stripping a
|
|
239
|
+
form-feed `\\x0c` between two blank lines) leaves a `\\n{3+}` run that
|
|
240
|
+
S9's earlier collapse no longer reaches. The final collapse at the end
|
|
241
|
+
of normalize_text catches it."""
|
|
242
|
+
# Simulate: paragraph + form-feed + paragraph (pdftotext page-break case)
|
|
243
|
+
text = "First paragraph ending here.\n\n\x0c\n\nSecond paragraph begins."
|
|
244
|
+
out, _ = normalize_text(text, NormalizationLevel.academic)
|
|
245
|
+
# Should produce one paragraph break, not two
|
|
246
|
+
assert "\n\n\n" not in out, f"normalize_text left a \\n{{3+}} run: {out!r}"
|
|
247
|
+
assert "First paragraph" in out
|
|
248
|
+
assert "Second paragraph" in out
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def test_late_join_crosses_paragraph_for_stat_continuation():
|
|
252
|
+
"""Cycle 12: a comma/semicolon followed by a paragraph break and a
|
|
253
|
+
high-confidence stat-continuation token (95% CI / p [<=>]) is a
|
|
254
|
+
serializer artifact — joined on pass 1. Pre-cycle-12, only pass 2
|
|
255
|
+
joined it (after S9 stripped the intervening header/footer noise).
|
|
256
|
+
|
|
257
|
+
Defends against the corpus-wide korbmacher pattern where a regression-
|
|
258
|
+
coefficient row was broken by a per-page header insertion."""
|
|
259
|
+
# The full korbmacher pre-S9 pattern is a couple of headers between;
|
|
260
|
+
# post-S9 the input to LateJoin is just `,\n\n95% CI`.
|
|
261
|
+
text = "r(1798) = -0.27,\n\n95% CI [-0.31, -0.22]"
|
|
262
|
+
out, _ = normalize_text(text, NormalizationLevel.academic)
|
|
263
|
+
assert "-0.27, 95% CI" in out, f"cross-paragraph stat join failed: {out!r}"
|
|
264
|
+
|
|
265
|
+
# Same for p-value continuation
|
|
266
|
+
text2 = "t(23) = 2.34,\n\np < .001, d = 0.45"
|
|
267
|
+
out2, _ = normalize_text(text2, NormalizationLevel.academic)
|
|
268
|
+
assert "2.34, p < .001" in out2 or "p < .001" in out2.replace("\n\n", " ")
|
|
269
|
+
|
|
270
|
+
# The column-bleed contract is NOT broken — its input has no leading `,;`.
|
|
271
|
+
cb = "p\n01\n02\n03\n04\n05\n= .05"
|
|
272
|
+
out_cb, _ = normalize_text(cb, NormalizationLevel.academic)
|
|
273
|
+
assert "p = .05" not in out_cb, "column-bleed test contract broken by cycle 12"
|
|
274
|
+
|
|
275
|
+
|
|
237
276
|
def test_recover_minus_proximity_gate_rejects_distant_unrelated_brackets():
|
|
238
277
|
"""Cycle 11: a stat-table row that mixes an unrelated SD value with a
|
|
239
278
|
separately-reported CI bracket must NOT have the SD recovered as a
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docpluck-2.4.63 → docpluck-2.4.65}/.claude/skills/docpluck-iterate/references/ai-full-doc-verify.md
RENAMED
|
File without changes
|
|
File without changes
|
{docpluck-2.4.63 → docpluck-2.4.65}/.claude/skills/docpluck-iterate/references/local-verification.md
RENAMED
|
File without changes
|
{docpluck-2.4.63 → docpluck-2.4.65}/.claude/skills/docpluck-iterate/references/rationalizations.md
RENAMED
|
File without changes
|
|
File without changes
|
{docpluck-2.4.63 → docpluck-2.4.65}/.claude/skills/docpluck-iterate/references/release-flow.md
RENAMED
|
File without changes
|
{docpluck-2.4.63 → docpluck-2.4.65}/.claude/skills/docpluck-iterate/references/self-improvement.md
RENAMED
|
File without changes
|
{docpluck-2.4.63 → docpluck-2.4.65}/.claude/skills/docpluck-iterate/references/three-tier-parity.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docpluck-2.4.63 → docpluck-2.4.65}/.claude/skills/docpluck-qa/references/check-11-hard-rules.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docpluck-2.4.63 → docpluck-2.4.65}/.claude/skills/docpluck-qa/references/check-7-batch-smoke.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-09_unified_extraction_brainstorm.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docpluck-2.4.63 → docpluck-2.4.65}/docs/HANDOFF_2026-05-12_remaining_ui_and_chrome_verification.md
RENAMED
|
File without changes
|
|
File without changes
|