docpluck 2.4.44__tar.gz → 2.4.45__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docpluck-2.4.44 → docpluck-2.4.45}/.claude/skills/_project/lessons.md +8 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/.claude/skills/docpluck-iterate/LEARNINGS.md +26 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/CHANGELOG.md +10 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/PKG-INFO +1 -1
- {docpluck-2.4.44 → docpluck-2.4.45}/docpluck/__init__.py +1 -1
- {docpluck-2.4.44 → docpluck-2.4.45}/docpluck/render.py +7 -14
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/TRIAGE_2026-05-14_phase_5d_gold_audit.md +5 -1
- {docpluck-2.4.44 → docpluck-2.4.45}/pyproject.toml +1 -1
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/golden/sections/apa_multi_study_pdf.json +15 -15
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/golden/sections/apa_single_study_pdf.json +11 -11
- docpluck-2.4.45/tests/snapshots/amj_lattice.txt +3022 -0
- docpluck-2.4.45/tests/snapshots/apa_chan_feldman_lineless.txt +2390 -0
- docpluck-2.4.45/tests/snapshots/apa_chen_jesp_lineless.txt +4054 -0
- docpluck-2.4.45/tests/snapshots/apa_efendic_affect.txt +1164 -0
- docpluck-2.4.45/tests/snapshots/apa_ip_feldman_pspb.txt +2683 -0
- docpluck-2.4.45/tests/snapshots/bmc_lattice.txt +1140 -0
- docpluck-2.4.45/tests/snapshots/ieee_figure_heavy.txt +1687 -0
- docpluck-2.4.45/tests/snapshots/ieee_lattice.txt +1757 -0
- docpluck-2.4.45/tests/snapshots/jama_lattice.txt +1881 -0
- docpluck-2.4.45/tests/snapshots/nat_comms_figure_only.txt +2092 -0
- docpluck-2.4.45/tests/snapshots/nature_minimal_rule.txt +1293 -0
- docpluck-2.4.45/tests/snapshots/scirep_minimal_rule.txt +1481 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_numbered_heading_promotion_real_pdf.py +35 -2
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_render.py +7 -3
- docpluck-2.4.44/tests/snapshots/amj_lattice.txt +0 -1165
- docpluck-2.4.44/tests/snapshots/apa_chan_feldman_lineless.txt +0 -1200
- docpluck-2.4.44/tests/snapshots/apa_chen_jesp_lineless.txt +0 -2122
- docpluck-2.4.44/tests/snapshots/apa_efendic_affect.txt +0 -584
- docpluck-2.4.44/tests/snapshots/apa_ip_feldman_pspb.txt +0 -1405
- docpluck-2.4.44/tests/snapshots/bmc_lattice.txt +0 -318
- docpluck-2.4.44/tests/snapshots/ieee_figure_heavy.txt +0 -543
- docpluck-2.4.44/tests/snapshots/ieee_lattice.txt +0 -1395
- docpluck-2.4.44/tests/snapshots/jama_lattice.txt +0 -1345
- docpluck-2.4.44/tests/snapshots/nat_comms_figure_only.txt +0 -913
- docpluck-2.4.44/tests/snapshots/nature_minimal_rule.txt +0 -366
- docpluck-2.4.44/tests/snapshots/scirep_minimal_rule.txt +0 -307
- {docpluck-2.4.44 → docpluck-2.4.45}/.claude/skills/docpluck-cleanup/SKILL.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/.claude/skills/docpluck-deploy/SKILL.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/.claude/skills/docpluck-iterate/SKILL.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/.claude/skills/docpluck-iterate/references/ai-full-doc-verify.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/.claude/skills/docpluck-iterate/references/cycle-report-template.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/.claude/skills/docpluck-iterate/references/local-verification.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/.claude/skills/docpluck-iterate/references/rationalizations.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/.claude/skills/docpluck-iterate/references/real-library-real-pdf.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/.claude/skills/docpluck-iterate/references/release-flow.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/.claude/skills/docpluck-iterate/references/self-improvement.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/.claude/skills/docpluck-iterate/references/three-tier-parity.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/.claude/skills/docpluck-qa/SKILL.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/.claude/skills/docpluck-qa/references/benchmark-mode.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/.claude/skills/docpluck-qa/references/check-11-hard-rules.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/.claude/skills/docpluck-qa/references/check-13-escicheck-production.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/.claude/skills/docpluck-qa/references/check-5-escicheck-library.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/.claude/skills/docpluck-qa/references/check-6-escicheck-local-webapp.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/.claude/skills/docpluck-qa/references/check-7-batch-smoke.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/.claude/skills/docpluck-review/SKILL.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/.github/workflows/bump-app-pin.yml +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/.github/workflows/publish.yml +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/.github/workflows/test.yml +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/.gitignore +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/CLAUDE.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/HANDOFF_SECTIONS_APP_INTEGRATION.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/LESSONS.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/LICENSE +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/REPLY_FROM_DOCPLUCK_v1.4.5.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/REPLY_FROM_DOCPLUCK_v1.5.0.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/REQUEST_08_CHUNKING_ENDPOINT.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/REQUEST_09_REFERENCE_LIST_NORMALIZATION.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/TODO.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docpluck/__main__.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docpluck/batch.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docpluck/cli.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docpluck/extract.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docpluck/extract_docx.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docpluck/extract_html.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docpluck/extract_layout.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docpluck/extract_structured.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docpluck/figures/__init__.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docpluck/figures/detect.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docpluck/normalize.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docpluck/quality.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docpluck/sections/__init__.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docpluck/sections/annotators/__init__.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docpluck/sections/annotators/docx.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docpluck/sections/annotators/html.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docpluck/sections/annotators/pdf.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docpluck/sections/annotators/text.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docpluck/sections/blocks.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docpluck/sections/boundaries.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docpluck/sections/core.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docpluck/sections/taxonomy.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docpluck/sections/types.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docpluck/tables/__init__.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docpluck/tables/bbox_utils.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docpluck/tables/camelot_extract.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docpluck/tables/captions.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docpluck/tables/cell_cleaning.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docpluck/tables/cluster.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docpluck/tables/confidence.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docpluck/tables/detect.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docpluck/tables/render.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docpluck/tables/whitespace.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docpluck/version.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/BENCHMARKS.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/DESIGN.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/HANDOFF_2026-05-07_sections_strict_iteration.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/HANDOFF_2026-05-09_session_state_and_followups.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/HANDOFF_2026-05-09_unified_extraction_brainstorm.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/HANDOFF_2026-05-10_table_rendering_iteration.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/HANDOFF_2026-05-10_table_rendering_iteration_2.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/HANDOFF_2026-05-10_table_rendering_iteration_3.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/HANDOFF_2026-05-10_table_rendering_iteration_4.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/HANDOFF_2026-05-10_table_rendering_iteration_5.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/HANDOFF_2026-05-10_table_rendering_iteration_6.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/HANDOFF_2026-05-10_table_rendering_iteration_7.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/HANDOFF_2026-05-11_PROMOTE_SPIKE_TO_LIBRARY.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/HANDOFF_2026-05-11_table_rendering_iteration_8.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/HANDOFF_2026-05-11_visual_review_findings.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/HANDOFF_2026-05-12_phase2_101pdf_corpus.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/HANDOFF_2026-05-12_remaining_ui_and_chrome_verification.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/HANDOFF_2026-05-12_visual_verify_results.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/HANDOFF_2026-05-13_apa_50_expansion.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/HANDOFF_2026-05-13_apa_50_expansion_iter_1.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/HANDOFF_2026-05-13_apa_50_expansion_iter_2.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/HANDOFF_2026-05-13_iterate_skill_first_use.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/HANDOFF_2026-05-13_iterative_1.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/HANDOFF_2026-05-13_iterative_library_improvement.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/HANDOFF_2026-05-13_table_extraction_next_iteration.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/HANDOFF_2026-05-14_continue_iterations_v2_4_30_to_15n.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/HANDOFF_2026-05-14_full_corpus_iteration_v2_4_30.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/HANDOFF_2026-05-14_iterate_6_cycles_complete.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/HANDOFF_2026-05-14_iterate_9_cycle_run.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/HANDOFF_2026-05-14_iterate_resume_4_cycles.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/HANDOFF_2026-05-14_iterate_v2_4_31_cycle_15n.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/HANDOFF_2026-05-14_phase_5d_gold_audit_v2_4_29.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/HANDOFF_2026-05-15_autonomous_apa_first_10h.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/HANDOFF_2026-05-15_iterate_apa_run_1.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/HANDOFF_2026-05-16_ai-gold-instructions.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/HANDOFF_2026-05-16_iterate_apa_run_2.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/HANDOFF_2026-05-16_iterate_apa_run_3.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/LIBRARY_APP_SYNC.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/NORMALIZATION.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/README.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/TRIAGE_2026-05-10_corpus_assessment.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/2026-05-06-section-identification.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/2026-05-06-table-extraction.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/2026-05-07-sections-strict-iteration-progress.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/2026-05-08-unified-extraction-phase-0-splice-spike.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/sections-deferred-items.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/sections-issues-backlog.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/2026-05-07_spot-01_apa.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/2026-05-07_spot-02_pattern-A-shipped.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/2026-05-08_spot-final_all-styles.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/COMPARISON.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/korbmacher_table1.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/option-a.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-a/ziano_table1.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_notes_raw.txt +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/korbmacher_table1.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/notes.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/option-b.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_notes_raw.txt +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-b/ziano_table1.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/korbmacher_table1.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/notes.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/option-c.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/sample-pdftotext-bbox.html +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-c/ziano_table1.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/korbmacher_table1.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/notes.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/option-d.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-d/ziano_table1.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_2022_kruger_bbox.html +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_bbox.html +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/korbmacher_table1.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/option-e.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/sample-bbox.html +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_2021_joep_bbox.html +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_bbox.html +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/experiments/option-e/ziano_table1.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/html-fallback-demo.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.err +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs/chandrashekar_2023_mp.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.err +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs/efendic_2022_affect.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.err +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ieee_access_2.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.err +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ip_feldman_2025_pspb.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.err +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs/korbmacher_2022_kruger.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.err +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs/nat_comms_1.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.err +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs/ziano_2021_joep.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.err +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/am_sociol_rev_3.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.err +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amc_1.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.err +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amj_1.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.err +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/amle_1.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.err +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_apa_j_jesp_2009_12_010.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.err +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140066.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.err +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ar_royal_society_rsos_140072.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.err +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/bjps_1.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.err +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chan_feldman_2025_cogemo.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.err +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/chen_2021_jesp.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.err +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/demography_1.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.err +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_3.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.err +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/ieee_access_4.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.err +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_1.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.err +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jama_open_2.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.err +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/jmf_1.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.err +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/nat_comms_2.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.err +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/sci_rep_1.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.err +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/outputs-new/social_forces_1.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/papers.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/report.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/splice_spike.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/plans/spot-checks/splice-spike/test_splice_spike.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/specs/2026-04-27-request-09-reference-normalization-design.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/specs/2026-05-06-section-identification-design.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/specs/2026-05-06-table-extraction-design.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/docs/superpowers/specs/2026-05-08-unified-extraction-design.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/scripts/lint_rendered_corpus.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/scripts/verify_corpus.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/scripts/verify_corpus_full.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/__init__.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/conftest.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/fixtures/__init__.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/fixtures/sections/__init__.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/fixtures/sections/builders.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/fixtures/structured/.gitkeep +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/fixtures/structured/MANIFEST.json +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/fixtures/structured/README.md +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/golden/sections/html_real_headings.json +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_a3c_leading_zero_decimal_real_pdf.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_all_caps_section_promote_real_pdf.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_bbox_utils.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_benchmark_docx_html.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_cambridge_footer_strip_real_pdf.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_caption_regex.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_chart_data_trim_real_pdf.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_cid_minus_recovery_real_pdf.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_cli_sections.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_cli_structured.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_confidence.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_corpus_smoke.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_d5_normalization_audit.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_edge_cases.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_elsevier_footer_strip_real_pdf.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_equation_page_header_strip_real_pdf.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_extract_docx.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_extract_filter_sugar.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_extract_html.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_extract_layout.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_extract_pdf_structured.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_extraction.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_f0_table_region_aware.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_figure_caption_trim_real_pdf.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_figure_detect.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_fixtures_manifest.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_lattice_cluster.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_letterspaced_label_real_pdf.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_ligature_decomposition_real_pdf.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_lt_operator_recovery_real_pdf.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_mathitalic_greek_real_pdf.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_metaesci_followups.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_minus_sign_recovery_real_pdf.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_normalization.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_normalize_a3_r2_body_integer_real_pdf.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_normalize_f0_footnote_strip.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_normalize_layout_param.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_normalize_metadata_leak_real_pdf.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_normalize_report_layout_fields.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_normalize_v18_strips.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_numbered_section_promotion_real_pdf.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_orphan_section_number_real_pdf.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_preserve_math_glyphs_real_pdf.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_quality.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_render_html.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_request_09_reference_normalization.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_roman_numeral_section_promote_real_pdf.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_section_row_label_no_merge_real_pdf.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_sections_boundaries.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_sections_boundary_truncation.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_sections_core_partition.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_sections_docx_annotator.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_sections_extract_text.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_sections_footnote_section.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_sections_golden.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_sections_html_annotator.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_sections_pdf_annotator.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_sections_public_api.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_sections_real_corpus.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_sections_taxonomy.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_sections_text_annotator.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_sections_types.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_sections_unit_corpus.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_sections_v161_coalesce.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_sections_v161_subheadings.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_sections_v161_taxonomy.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_sections_v161_text_annotator.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_sections_version.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_smoke_fixtures.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_structured_result_type.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_structured_types.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_structured_version.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_table_caption_cell_region_real_pdf.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_table_detect.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_tables_cell_cleaning.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_text_mode.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_v23_1_fixes.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_v23_bug_fixes.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_v23_post_corpus.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_v23_post_corpus_v2.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_v2_backwards_compat.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_v2_top_level_exports.py +0 -0
- {docpluck-2.4.44 → docpluck-2.4.45}/tests/test_whitespace_cluster.py +0 -0
|
@@ -190,3 +190,11 @@ Plus three golden snapshot files (`tests/golden/sections/*.json`) had the versio
|
|
|
190
190
|
**Why:** Two failure modes compounded. (1) A new normalize helper added without grepping the existing `normalize_text` S-steps duplicated S3 and, placed before it, starved it. (2) The cycle was scoped from a symptom ("35 papers show ligatures") without localizing WHICH channel was at fault — the body channel was already correct.
|
|
191
191
|
|
|
192
192
|
**How to detect (next time):** Before adding any glyph/encoding helper to `normalize.py`, grep the existing `S0`-`S9` / `W0*` steps for one already handling that character class — extend/unify it rather than adding a parallel path, and never insert a new step *before* an existing one that consumes the same input. Before scoping a glyph cycle, localize the defect: grep the offending glyph's lines in a recent render and confirm whether they sit in `<td>`/`<th>`/`*Table N*`/```unstructured-table``` (table/caption/fence channels — bypass `normalize_text`) or in body prose (the S-step channel).
|
|
193
|
+
|
|
194
|
+
## 2026-05-16 · Cycle 13 — a heuristic guard's value depends on the false-positive surface, which differs per call site (v2.4.45)
|
|
195
|
+
|
|
196
|
+
**What:** `render.py`'s two numbered-heading promoters shared a `max_lc_run >= 5` "long lowercase-word run" prose guard. It demoted legitimate descriptive headings — jdm_.2023.16 had 19 multi-level numbered subsection headings rendered as body text, with lowercase-runs up to 12 (`3.3.2.1. The quality of planning on the previous trial moderates the effect of reflection`). The fix removed the guard ENTIRELY from `_promote_numbered_subsection_headings` but KEPT it (raised 5→8) in `_promote_numbered_section_headings`.
|
|
197
|
+
|
|
198
|
+
**Why:** A lowercase-word-run count genuinely cannot distinguish a descriptive section heading from prose — both have many lowercase words. What makes a line a heading is the *number shape* + capital-start + no-terminal-punctuation + single short line. For **multi-level** dotted numbering (`N.N[.N…]`) that signature is decisive — a prose line almost never begins with a multi-level dotted number — so the lc-run guard was pure harm. For **single-level** `N.` numbering the signature is weak (a `2.` line collides with an enumerated-list item), so a prose guard there still adds value as defense-in-depth. Same guard, opposite verdicts, because the false-positive surface differs between the two call sites.
|
|
199
|
+
|
|
200
|
+
**How to detect (next time):** When a heuristic guard rejects legitimate inputs, do not just retune its threshold — ask whether the guard discriminates at all at that call site. Reproduce at HEAD and measure the metric's spread on real positives (here: heading lowercase-runs ran 0-12, overlapping prose entirely → no threshold works). If a guard can't separate the classes, remove it where the *other* gates already suffice and keep it only where they don't. When a guard is removed, grep its tests — a contract test (`test_render.py::test_promote_rejects_prose_with_long_lowercase_run`) was asserting the removed behavior and had to be updated in the same cycle.
|
|
@@ -521,3 +521,29 @@ Mid-run, ArticleFinder flagged (and the user confirmed as a directive) that docp
|
|
|
521
521
|
|
|
522
522
|
### SPINE-SKIPs
|
|
523
523
|
- R3 (`/docpluck-cleanup` + `/docpluck-review`) — SKIPPED. Cycle 12 is one normalize helper (explicit table over a 7-codepoint block) + S3 unified to call it + 2 bypass-channel call sites; 26/26 baseline + AI verifier confirm no regression. Same shape as cycles 2/4/6/7.
|
|
524
|
+
|
|
525
|
+
---
|
|
526
|
+
|
|
527
|
+
## Run: 2026-05-16 (run 4, fix-and-continue) · Cycles: cycle-12 rework, tests-regen, cycle 13
|
|
528
|
+
|
|
529
|
+
This run executed `docs/HANDOFF_2026-05-16_iterate_run_4_fix_and_continue.md`'s three jobs. Cycle-12 rework + tests-regen + cycle 13 below; the article-finder AI-gold integration (JOB 2) is tracked in the run-meta.
|
|
530
|
+
|
|
531
|
+
### tests-regen (commit `c831e28`, no version bump)
|
|
532
|
+
- 15 pre-existing pytest failures triaged. 12 `test_extract_pdf_byte_identical` snapshots + 2 `test_sections_golden` goldens = environmental drift (local pdftotext re-wraps lines differently than the build that captured the snapshots; `extract_pdf` is a pure pdftotext passthrough). Regenerated; the 26-paper baseline is the real extraction-quality gate and stays green.
|
|
533
|
+
- **The 15th, `test_request_09`, is NOT snapshot drift** — it is a real COL-class column-interleave defect: the numbered RSOS bibliography renders as `References\n1. 2. 3. ... 16.\n\nThaler RH...` (the number column split from the entry text). Left red and documented as the escalated COL defect class. Lesson: when a handoff lumps failures as "all snapshot drift," still inspect each — a real-defect-detecting test must never be "regenerated" away.
|
|
534
|
+
|
|
535
|
+
### Cycle 13 (v2.4.45) — G5b long-descriptive numbered headings demoted
|
|
536
|
+
|
|
537
|
+
### Outcome
|
|
538
|
+
- **Cycle 13 shipped v2.4.45** — `render.py`'s numbered-heading promoters carried a `max_lc_run >= 5` prose guard that demoted legitimate long descriptive headings. Removed the guard entirely from `_promote_numbered_subsection_headings`; raised it `5→8` in `_promote_numbered_section_headings`. jdm_.2023.16: 19 multi-level subsection headings recovered.
|
|
539
|
+
|
|
540
|
+
### Blind spots / process notes
|
|
541
|
+
- **The TRIAGE estimate ("raise 5→8") was a partial fix.** Reproducing at HEAD showed jdm16 headings with `max_lc` up to 12 — a `5→8` raise would have left 7 of 19 still demoted. The lesson card `reproduce-triage-defect-at-head-before-trusting-cost-estimate` paid off again: always reproduce and measure before trusting a queue item's prescribed fix. The lc-run count genuinely cannot distinguish a 12-lowercase-word descriptive heading from prose — for multi-level dotted numbering the *number shape* is the discriminator, so the guard had to go, not just move.
|
|
542
|
+
- **A guard worth keeping for one promoter, not the other.** Single-level `N.` numbers collide with enumerated lists (real false-positive risk) → keep a prose guard (raised to 8) as defense-in-depth. Multi-level `N.N[.N…]` numbers do not → the guard was pure harm. Same-named guard, opposite verdicts, because the false-positive surface differs.
|
|
543
|
+
- **A contract test encoded the removed guard.** `test_render.py::test_promote_rejects_prose_with_long_lowercase_run` asserted the old behavior; updated it to assert the new contract (long descriptive titles ARE promoted) in the same cycle — per the cycle-2 `a test can encode the bug` lesson.
|
|
544
|
+
|
|
545
|
+
### SPINE-SKIPs
|
|
546
|
+
- R3 (`/docpluck-cleanup` + `/docpluck-review`) — SKIPPED. Cycle 13 is a guard removal + one threshold bump in two render post-processors; 26/26 baseline + heading-promotion-only diff confirm no regression. Same shape as cycles 9/11.
|
|
547
|
+
|
|
548
|
+
### Process note — Codex cross-model verification has a Windows UTF-8 bug
|
|
549
|
+
The `gold-generation.md` Step-4 Codex audit misreads UTF-8 gold files as mojibake on this Windows machine (`Västfjäll`→`VA<SI>stfjA<SI>ll`, `–`→`ƒ?"`), producing ~10-24 false "discrepancies" per paper. The gold files are confirmed clean UTF-8. Worked around by re-running Codex with an explicit "files are UTF-8; mojibake is your decode error, not a discrepancy" preamble. **This is article-finder's protocol to fix** — `gold-generation.md` Step 4 needs a UTF-8 read instruction for Windows. Flagged for coordination with the article-finder owner.
|
|
@@ -1,5 +1,15 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [2.4.45] — 2026-05-16
|
|
4
|
+
|
|
5
|
+
**Cycle 13 (autonomous APA-first run) — long descriptive numbered headings demoted to body text (G5b, S1).** `render.py`'s numbered-heading promoters carried a "long lowercase-word run" prose guard (`max_lc_run >= 5`) that rejected legitimate descriptive headings — e.g. `2.4.2.2. Inference of planning strategies and strategy types`, `3.3.2.1. The quality of planning on the previous trial moderates the effect of reflection`. jdm_.2023.16 alone had 19 multi-level numbered subsection headings demoted to body text.
|
|
6
|
+
|
|
7
|
+
Fix (v2.4.45) — the lowercase-run guard is **removed from `_promote_numbered_subsection_headings`**: multi-level dotted numbering at line-start is itself a strong section-heading signal (combined with capital-started title + no terminal sentence punctuation + single ≤80-char line), and descriptive subsection titles legitimately run to many lowercase words, so the guard could not distinguish a real heading from prose and only mis-rejected headings. For `_promote_numbered_section_headings` (single-level `N.`, which genuinely collides with enumerated lists) the guard is **kept but raised `5 → 8`** — single-level promotion still has its document-numbering-range / uniqueness / list-adjacency gates as defense in depth.
|
|
8
|
+
|
|
9
|
+
jdm_.2023.16: 19 previously-demoted multi-level headings now render as `###`; the v2.4.44→v2.4.45 diff is heading-promotion only (0 text loss, 0 hallucination). 26/26 baseline PASS. New real-PDF + contract tests in `tests/test_numbered_heading_promotion_real_pdf.py` and `tests/test_render.py`.
|
|
10
|
+
|
|
11
|
+
~11 APA papers still FAIL Phase-5d verification; the autonomous run continues.
|
|
12
|
+
|
|
3
13
|
## [2.4.44] — 2026-05-16
|
|
4
14
|
|
|
5
15
|
**Cycle 12 (autonomous APA-first run) — Latin typographic ligatures not decomposed in the table/caption channels (GLYPH, S2).** pdftotext preserves presentation-form ligature glyphs (`ﬀ ﬁ ﬂ ﬃ ﬄ ﬅ ﬆ`, U+FB00-FB06) verbatim, so words rendered as `conﬁdent` / `inﬂuence` / `eﬃcient` — broken for search, word matching, and any downstream NLP. A corpus scan found the glyphs in 35 rendered papers (korbmacher 82×, jdm_.2023.16 34×, jdm_m.2022.2 8×). The body channel's `normalize.py` S3 step already expanded ligatures correctly; the leak was confined to **table cells, figure/table captions, and `unstructured-table` fenced blocks**, which bypass `normalize_text` entirely.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docpluck
|
|
3
|
-
Version: 2.4.44
|
|
3
|
+
Version: 2.4.45
|
|
4
4
|
Summary: PDF, DOCX, and HTML text extraction and normalization for academic papers
|
|
5
5
|
Project-URL: Homepage, https://github.com/giladfeldman/docpluck
|
|
6
6
|
Project-URL: Documentation, https://github.com/giladfeldman/docpluck/tree/main/docs
|
|
@@ -71,7 +71,7 @@ from .figures import Figure
|
|
|
71
71
|
from .extract_structured import TABLE_EXTRACTION_VERSION, StructuredResult, extract_pdf_structured
|
|
72
72
|
from .render import render_pdf_to_markdown
|
|
73
73
|
|
|
74
|
-
__version__ = "2.4.
|
|
74
|
+
__version__ = "2.4.45"
|
|
75
75
|
__author__ = "Gilad Feldman"
|
|
76
76
|
__license__ = "MIT"
|
|
77
77
|
|
|
@@ -233,8 +233,12 @@ def _promote_numbered_subsection_headings(text: str) -> str:
|
|
|
233
233
|
"""Promote ``1.2 Foo``-style lines to ``### 1.2 Foo`` h3 headings.
|
|
234
234
|
|
|
235
235
|
Conservative: only multi-level numbering (``N.N`` or deeper), title must
|
|
236
|
-
start with a capital letter
|
|
237
|
-
punctuation
|
|
236
|
+
start with a capital letter and must not end in sentence-terminator
|
|
237
|
+
punctuation. Multi-level dotted numbering at line-start is itself a strong
|
|
238
|
+
section-heading signal — descriptive subsection titles legitimately run to
|
|
239
|
+
many lowercase words ("3.3.2.1 The quality of planning on the previous
|
|
240
|
+
trial moderates the effect of reflection"), so a lowercase-run prose guard
|
|
241
|
+
mis-rejects real headings and is not applied here (cycle 13, G5b).
|
|
238
242
|
Idempotent: re-running the pass is a no-op.
|
|
239
243
|
"""
|
|
240
244
|
if not text:
|
|
@@ -250,17 +254,6 @@ def _promote_numbered_subsection_headings(text: str) -> str:
|
|
|
250
254
|
if title.endswith((".", "?", "!", ":", ",", ";")):
|
|
251
255
|
out.append(line)
|
|
252
256
|
continue
|
|
253
|
-
tokens = title.split()
|
|
254
|
-
lc_run = max_lc_run = 0
|
|
255
|
-
for tok in tokens:
|
|
256
|
-
if tok and tok[0].islower():
|
|
257
|
-
lc_run += 1
|
|
258
|
-
max_lc_run = max(max_lc_run, lc_run)
|
|
259
|
-
else:
|
|
260
|
-
lc_run = 0
|
|
261
|
-
if max_lc_run >= 5:
|
|
262
|
-
out.append(line)
|
|
263
|
-
continue
|
|
264
257
|
if out and out[-1].startswith(f"### {m.group('num')} "):
|
|
265
258
|
out.append(line)
|
|
266
259
|
continue
|
|
@@ -357,7 +350,7 @@ def _promote_numbered_section_headings(text: str) -> str:
|
|
|
357
350
|
max_lc = max(max_lc, lc_run)
|
|
358
351
|
else:
|
|
359
352
|
lc_run = 0
|
|
360
|
-
if max_lc >=
|
|
353
|
+
if max_lc >= 8: # long prose-like run — not a heading (cycle 13, G5b)
|
|
361
354
|
continue
|
|
362
355
|
candidates.setdefault(int(m.group("num")), []).append((i, title))
|
|
363
356
|
if not candidates:
|
|
@@ -271,6 +271,10 @@ New `render.py::_promote_numbered_section_headings` promotes `N. Title` → `##
|
|
|
271
271
|
|
|
272
272
|
> **Cycle-12 rework note (run 4, 2026-05-16):** the first cycle-12 attempt added a SECOND, parallel `decompose_ligatures` call *before* the pre-existing S3 step inside `normalize_text` — it consumed every ligature before S3 ran, so S3 tracked `ligatures_expanded = 0` and broke `test_normalization.py::test_report_tracks_changes`. The rework removed the duplicate call and unified S3 to use the shared helper. Lesson: before adding a glyph-normalization helper, grep the existing `normalize_text` S-steps for one already handling that glyph class — extend/unify it, do not add a parallel path.
|
|
273
273
|
|
|
274
|
+
### Cycle 13 (v2.4.45) — G5b long-descriptive-title prose guard — SHIPPED
|
|
275
|
+
|
|
276
|
+
`render.py`'s numbered-heading promoters carried a `max_lc_run >= 5` "long lowercase-word run" prose guard that mis-rejected legitimate descriptive headings. Reproduced at HEAD: jdm_.2023.16 alone had **19** multi-level numbered subsection headings demoted to body text, with `max_lc` up to **12** (`3.3.2.1. The quality of planning on the previous trial moderates the effect of reflection`) — far deeper than the TRIAGE's "raise 5→8" estimate. Re-scoped: the lc-run guard is **removed entirely from `_promote_numbered_subsection_headings`** (multi-level dotted numbering + capital-start + no-terminal-punctuation + single ≤80-char line is itself a sufficient heading signature; the lc-run guard cannot distinguish a descriptive heading from prose). For `_promote_numbered_section_headings` (single-level `N.`, real list-collision risk) the guard is kept but raised `5→8`, alongside its existing numbering-range/uniqueness/list-adjacency gates. jdm16: 19 headings recovered; v2.4.44→v2.4.45 diff is heading-promotion only (0 text loss/hallucination); 26/26 baseline.
|
|
277
|
+
|
|
274
278
|
### SESSION-3 STANDING VERDICT (rule 0e-bis)
|
|
275
279
|
|
|
276
280
|
The APA corpus is **NOT clean**. Cycles 8-11 shipped 4 verified incremental fixes (v2.4.40-43), each AI-gold-verified OVERALL PASS with 0 regressions. But ~12 APA papers still FAIL Phase-5d on PRE-EXISTING defects the cycles did not reach. Verifier-confirmed open punch-list:
|
|
@@ -280,7 +284,7 @@ The APA corpus is **NOT clean**. Cycles 8-11 shipped 4 verified incremental fixe
|
|
|
280
284
|
| **TABLE structure destruction** | S0/S1 | efendic, ar_apa_011, xiao, jdm15/16, chen, maier, ip_feldman (~11) | grid lost → caption-bleed; flat number-dump; empty `<table>` shells; two tables merged; rows dropped. C3 — needs a render/structured coordination design. The single largest blocker. |
|
|
281
285
|
| **G5c split-line numbered headings** | S1 | jdm_m.2022.2 (`5.3.`/`6.3.`/`7.3.` etc.) | number alone on a line, title on the next; renders as orphan bare-number + a MISLABELED generic `## Results`. cycle-3 orphan-folder multi-level analogue. |
|
|
282
286
|
| **G5d named (unnumbered) heading demotion** | S1 | ar_apa_011 (`Participants`, `Overview`), efendic, chandrashekar, ip_feldman (~7) | section-partitioner work; largest false-positive surface. |
|
|
283
|
-
|
|
|
287
|
+
| ~~**G5b long-descriptive-title prose guard**~~ ✓ FIXED v2.4.45 (cycle 13) | S1 | jdm16, jdm_m2, chen | ~~`≥5-lowercase-word` guard over-rejects legit long numbered headings.~~ Subsection promoter's lc-run guard removed; single-level raised 5→8. |
|
|
284
288
|
| **FIG caption double-emission + truncation** | S2 | jdm_m2, efendic, chan_feldman, ziano, jdm15/16 (~8) | caption inline + in `## Figures` block; truncated mid-word; figure data-labels as orphan body lines. |
|
|
285
289
|
| **GLYPH ligature** `ﬁ`/`ﬂ` not decomposed | S2 | jdm_m2 (and likely many) | `conﬁdent`, `inﬂuence` — NFKC would fix; check why current NFC pass misses U+FB01/FB02. |
|
|
286
290
|
| **D4 metadata residuals** | S2 | ar_apa_011 (`doi:` line), chen, efendic masthead | see D4 RESIDUALS above. |
|
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
"label": "abstract",
|
|
7
7
|
"canonical_label": "abstract",
|
|
8
8
|
"char_start": 0,
|
|
9
|
-
"char_end":
|
|
9
|
+
"char_end": 29,
|
|
10
10
|
"pages": [],
|
|
11
11
|
"confidence": "high",
|
|
12
12
|
"detected_via": "heading_match",
|
|
@@ -15,8 +15,8 @@
|
|
|
15
15
|
{
|
|
16
16
|
"label": "introduction",
|
|
17
17
|
"canonical_label": "introduction",
|
|
18
|
-
"char_start":
|
|
19
|
-
"char_end":
|
|
18
|
+
"char_start": 29,
|
|
19
|
+
"char_end": 55,
|
|
20
20
|
"pages": [],
|
|
21
21
|
"confidence": "high",
|
|
22
22
|
"detected_via": "heading_match",
|
|
@@ -25,8 +25,8 @@
|
|
|
25
25
|
{
|
|
26
26
|
"label": "methods",
|
|
27
27
|
"canonical_label": "methods",
|
|
28
|
-
"char_start":
|
|
29
|
-
"char_end":
|
|
28
|
+
"char_start": 55,
|
|
29
|
+
"char_end": 81,
|
|
30
30
|
"pages": [],
|
|
31
31
|
"confidence": "high",
|
|
32
32
|
"detected_via": "heading_match",
|
|
@@ -35,8 +35,8 @@
|
|
|
35
35
|
{
|
|
36
36
|
"label": "results",
|
|
37
37
|
"canonical_label": "results",
|
|
38
|
-
"char_start":
|
|
39
|
-
"char_end":
|
|
38
|
+
"char_start": 81,
|
|
39
|
+
"char_end": 107,
|
|
40
40
|
"pages": [],
|
|
41
41
|
"confidence": "high",
|
|
42
42
|
"detected_via": "heading_match",
|
|
@@ -45,8 +45,8 @@
|
|
|
45
45
|
{
|
|
46
46
|
"label": "methods_2",
|
|
47
47
|
"canonical_label": "methods",
|
|
48
|
-
"char_start":
|
|
49
|
-
"char_end":
|
|
48
|
+
"char_start": 107,
|
|
49
|
+
"char_end": 133,
|
|
50
50
|
"pages": [],
|
|
51
51
|
"confidence": "high",
|
|
52
52
|
"detected_via": "heading_match",
|
|
@@ -55,8 +55,8 @@
|
|
|
55
55
|
{
|
|
56
56
|
"label": "results_2",
|
|
57
57
|
"canonical_label": "results",
|
|
58
|
-
"char_start":
|
|
59
|
-
"char_end":
|
|
58
|
+
"char_start": 133,
|
|
59
|
+
"char_end": 159,
|
|
60
60
|
"pages": [],
|
|
61
61
|
"confidence": "high",
|
|
62
62
|
"detected_via": "heading_match",
|
|
@@ -65,8 +65,8 @@
|
|
|
65
65
|
{
|
|
66
66
|
"label": "general_discussion",
|
|
67
67
|
"canonical_label": "general_discussion",
|
|
68
|
-
"char_start":
|
|
69
|
-
"char_end":
|
|
68
|
+
"char_start": 159,
|
|
69
|
+
"char_end": 190,
|
|
70
70
|
"pages": [],
|
|
71
71
|
"confidence": "high",
|
|
72
72
|
"detected_via": "heading_match",
|
|
@@ -75,8 +75,8 @@
|
|
|
75
75
|
{
|
|
76
76
|
"label": "references",
|
|
77
77
|
"canonical_label": "references",
|
|
78
|
-
"char_start":
|
|
79
|
-
"char_end":
|
|
78
|
+
"char_start": 190,
|
|
79
|
+
"char_end": 220,
|
|
80
80
|
"pages": [],
|
|
81
81
|
"confidence": "high",
|
|
82
82
|
"detected_via": "heading_match",
|
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
"label": "abstract",
|
|
7
7
|
"canonical_label": "abstract",
|
|
8
8
|
"char_start": 0,
|
|
9
|
-
"char_end":
|
|
9
|
+
"char_end": 37,
|
|
10
10
|
"pages": [],
|
|
11
11
|
"confidence": "high",
|
|
12
12
|
"detected_via": "heading_match",
|
|
@@ -15,8 +15,8 @@
|
|
|
15
15
|
{
|
|
16
16
|
"label": "introduction",
|
|
17
17
|
"canonical_label": "introduction",
|
|
18
|
-
"char_start":
|
|
19
|
-
"char_end":
|
|
18
|
+
"char_start": 37,
|
|
19
|
+
"char_end": 63,
|
|
20
20
|
"pages": [],
|
|
21
21
|
"confidence": "high",
|
|
22
22
|
"detected_via": "heading_match",
|
|
@@ -25,8 +25,8 @@
|
|
|
25
25
|
{
|
|
26
26
|
"label": "methods",
|
|
27
27
|
"canonical_label": "methods",
|
|
28
|
-
"char_start":
|
|
29
|
-
"char_end":
|
|
28
|
+
"char_start": 63,
|
|
29
|
+
"char_end": 87,
|
|
30
30
|
"pages": [],
|
|
31
31
|
"confidence": "high",
|
|
32
32
|
"detected_via": "heading_match",
|
|
@@ -35,8 +35,8 @@
|
|
|
35
35
|
{
|
|
36
36
|
"label": "results",
|
|
37
37
|
"canonical_label": "results",
|
|
38
|
-
"char_start":
|
|
39
|
-
"char_end":
|
|
38
|
+
"char_start": 87,
|
|
39
|
+
"char_end": 112,
|
|
40
40
|
"pages": [],
|
|
41
41
|
"confidence": "high",
|
|
42
42
|
"detected_via": "heading_match",
|
|
@@ -45,8 +45,8 @@
|
|
|
45
45
|
{
|
|
46
46
|
"label": "discussion",
|
|
47
47
|
"canonical_label": "discussion",
|
|
48
|
-
"char_start":
|
|
49
|
-
"char_end":
|
|
48
|
+
"char_start": 112,
|
|
49
|
+
"char_end": 138,
|
|
50
50
|
"pages": [],
|
|
51
51
|
"confidence": "high",
|
|
52
52
|
"detected_via": "heading_match",
|
|
@@ -55,8 +55,8 @@
|
|
|
55
55
|
{
|
|
56
56
|
"label": "references",
|
|
57
57
|
"canonical_label": "references",
|
|
58
|
-
"char_start":
|
|
59
|
-
"char_end":
|
|
58
|
+
"char_start": 138,
|
|
59
|
+
"char_end": 168,
|
|
60
60
|
"pages": [],
|
|
61
61
|
"confidence": "high",
|
|
62
62
|
"detected_via": "heading_match",
|