docpluck 1.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. docpluck-1.5.0/.claude/skills/docpluck-cleanup/SKILL.md +70 -0
  2. docpluck-1.5.0/.claude/skills/docpluck-deploy/SKILL.md +204 -0
  3. docpluck-1.5.0/.claude/skills/docpluck-qa/SKILL.md +251 -0
  4. docpluck-1.5.0/.claude/skills/docpluck-qa/references/benchmark-mode.md +106 -0
  5. docpluck-1.5.0/.claude/skills/docpluck-qa/references/check-11-hard-rules.md +44 -0
  6. docpluck-1.5.0/.claude/skills/docpluck-qa/references/check-13-escicheck-production.md +67 -0
  7. docpluck-1.5.0/.claude/skills/docpluck-qa/references/check-5-escicheck-library.md +69 -0
  8. docpluck-1.5.0/.claude/skills/docpluck-qa/references/check-6-escicheck-local-webapp.md +67 -0
  9. docpluck-1.5.0/.claude/skills/docpluck-qa/references/check-7-batch-smoke.md +36 -0
  10. docpluck-1.5.0/.claude/skills/docpluck-review/SKILL.md +110 -0
  11. docpluck-1.5.0/.github/workflows/publish.yml +33 -0
  12. docpluck-1.5.0/.github/workflows/test.yml +31 -0
  13. docpluck-1.5.0/.gitignore +19 -0
  14. docpluck-1.5.0/CHANGELOG.md +382 -0
  15. docpluck-1.5.0/CLAUDE.md +62 -0
  16. docpluck-1.5.0/LICENSE +21 -0
  17. docpluck-1.5.0/PKG-INFO +451 -0
  18. docpluck-1.5.0/REPLY_FROM_DOCPLUCK_v1.4.5.md +164 -0
  19. docpluck-1.5.0/REPLY_FROM_DOCPLUCK_v1.5.0.md +77 -0
  20. docpluck-1.5.0/REQUEST_08_CHUNKING_ENDPOINT.md +348 -0
  21. docpluck-1.5.0/REQUEST_09_REFERENCE_LIST_NORMALIZATION.md +168 -0
  22. docpluck-1.5.0/docpluck/__init__.py +89 -0
  23. docpluck-1.5.0/docpluck/__main__.py +3 -0
  24. docpluck-1.5.0/docpluck/batch.py +183 -0
  25. docpluck-1.5.0/docpluck/cli.py +35 -0
  26. docpluck-1.5.0/docpluck/extract.py +191 -0
  27. docpluck-1.5.0/docpluck/extract_docx.py +64 -0
  28. docpluck-1.5.0/docpluck/extract_html.py +149 -0
  29. docpluck-1.5.0/docpluck/normalize.py +637 -0
  30. docpluck-1.5.0/docpluck/quality.py +92 -0
  31. docpluck-1.5.0/docpluck/version.py +58 -0
  32. docpluck-1.5.0/docs/BENCHMARKS.md +405 -0
  33. docpluck-1.5.0/docs/DESIGN.md +277 -0
  34. docpluck-1.5.0/docs/NORMALIZATION.md +428 -0
  35. docpluck-1.5.0/docs/README.md +405 -0
  36. docpluck-1.5.0/docs/superpowers/specs/2026-04-27-request-09-reference-normalization-design.md +75 -0
  37. docpluck-1.5.0/pyproject.toml +62 -0
  38. docpluck-1.5.0/tests/__init__.py +0 -0
  39. docpluck-1.5.0/tests/conftest.py +47 -0
  40. docpluck-1.5.0/tests/test_benchmark_docx_html.py +260 -0
  41. docpluck-1.5.0/tests/test_d5_normalization_audit.py +832 -0
  42. docpluck-1.5.0/tests/test_edge_cases.py +247 -0
  43. docpluck-1.5.0/tests/test_extract_docx.py +187 -0
  44. docpluck-1.5.0/tests/test_extract_html.py +315 -0
  45. docpluck-1.5.0/tests/test_extraction.py +120 -0
  46. docpluck-1.5.0/tests/test_metaesci_followups.py +169 -0
  47. docpluck-1.5.0/tests/test_normalization.py +802 -0
  48. docpluck-1.5.0/tests/test_quality.py +123 -0
  49. docpluck-1.5.0/tests/test_request_09_reference_normalization.py +89 -0
@@ -0,0 +1,70 @@
1
+ ---
2
+ name: docpluck-cleanup
3
+ description: Clean up Docpluck codebase. Sync CLAUDE.md/README.md/ARCHITECTURE.md against actual code, remove dead benchmark scripts, verify LESSONS.md is current, clean temp/ directory, check for stale environment variables, verify .gitignore covers sensitive files, update TODO.md progress. Use /docpluck-cleanup periodically or before releases.
4
+ tags: [python, pdf, docx, html, fastapi, nextjs, docs, cleanup]
5
+ ---
6
+
7
+ ## [MANDATORY FIRST ACTION] preflight (do NOT skip, even if orchestrated by /ship)
8
+
9
+ **Your very first action in this skill, BEFORE reading anything else, is:**
10
+
11
+ 1. Run: `bash ~/.claude/skills/_shared/bin/preflight-filter.sh <this-skill-name>` and print its `🔧 skill-optimize pre-check · ...` heartbeat as your first visible output line.
12
+ 2. Initialize `~/.claude/skills/_shared/run-meta/<this-skill-name>.json` per `~/.claude/skills/_shared/preflight.md` step 6 (include `phase_start_sha` from `git rev-parse HEAD`).
13
+ 3. Load `~/.claude/skills/_shared/quality-loop/core.md` into working memory (MUST-level rules gated by /ship).
14
+
15
+ If you skip these steps, /ship will detect the missing heartbeat and FAIL this phase. Do not proceed to the skill body until preflight has run.
16
+
17
+ # Docpluck Cleanup
18
+
19
+ You are a codebase janitor for Docpluck. Your job is to keep documentation accurate, remove dead code, and ensure the repo is in a clean state.
20
+
21
+ ## Project Location
22
+ `C:\Users\filin\Dropbox\Vibe\MetaScienceTools\PDFextractor`
23
+
24
+ ## Cleanup Checklist
25
+
26
+ ### 1. Documentation Sync
27
+ Read the actual code and verify each doc file is accurate:
28
+
29
+ - **CLAUDE.md** — Does the project structure match? Are the commands correct? Do the critical rules still apply?
30
+ - **README.md** — Does the architecture diagram match reality? Are the tech versions correct?
31
+ - **ARCHITECTURE.md** — Do the system diagrams match actual deployment? Are all tables listed?
32
+ - **API.md** — Do endpoint signatures match `service/app/main.py`? Are normalization steps current with `service/app/normalize.py`?
33
+ - **SETUP_GUIDE.md** — Are all URLs, project IDs, and domain names current?
34
+ - **TODO.md** — Mark completed items. Remove items that are no longer relevant.
35
+
36
+ ### 2. Dead Code Removal
37
+ Check `scripts/` for benchmark scripts that were intermediate steps and no longer needed:
38
+ - `scripts/investigate_stat_loss.py` — diagnostic, can be removed
39
+ - `scripts/investigate_stat_loss2.py` — diagnostic, can be removed
40
+ - `scripts/trace_pipeline.py` — diagnostic, can be removed
41
+ - `scripts/trace_pipeline2.py` — diagnostic, can be removed
42
+ - `scripts/verify_ieee_fp.py` — diagnostic, can be removed
43
+ - `scripts/quick_apa_verify.py` — diagnostic, can be removed
44
+ - `scripts/diagnose_mismatches.py` — diagnostic, can be removed
45
+ - `scripts/benchmark_docling_optimized.py` — Docling was dropped
46
+
47
+ Keep: `benchmark.py`, `ground_truth_verify.py`, `optimized_extractors.py`, `pdftotext_enhanced.py`, `setup_test_pdfs.py`, `ai_verify.py`, `final_showdown.py`
48
+
49
+ ### 3. Temp Directory
50
+ ```bash
51
+ rm -rf temp/
52
+ ```
53
+ Contains rendered PNG pages from verification — not needed in repo.
54
+
55
+ ### 4. Environment Variable Audit
56
+ - Check `.env.local` has no real secrets committed
57
+ - Verify `.gitignore` includes: `.env`, `.env.local`, `.env.production`, `.vercel/`, `test-pdfs/`, `temp/`, `__pycache__/`, `node_modules/`, `.next/`
58
+
59
+ ### 5. Stale Dependencies
60
+ - Check `frontend/package.json` for unused deps
61
+ - Check `service/requirements.txt` — should only have fastapi, uvicorn, pdfplumber, python-multipart
62
+ - Verify no pymupdf or pymupdf4llm in requirements (AGPL dropped)
63
+
64
+ ### 6. Memory Files
65
+ Check `C:\Users\filin\.claude\projects\c--Users-filin-Dropbox-Vibe-MetaScienceTools-PDFextractor\memory\` for stale memories that no longer reflect current state.
66
+
67
+ ## Output Format
68
+ Report what was cleaned, what was updated, and what needs manual attention.
69
+
70
+ ## Final step: read ~/.claude/skills/_shared/postflight.md and follow it.
@@ -0,0 +1,204 @@
1
+ ---
2
+ name: docpluck-deploy
3
+ description: Deploy Docpluck to production. Pre-flight checks (Next.js build, Python service health, git status), verify Vercel env vars (DATABASE_URL, AUTH_SECRET, AUTH_GITHUB_ID, AUTH_GOOGLE_ID, EXTRACTION_SERVICE_URL), push to GitHub for auto-deploy, verify Vercel deployment status, check Railway extraction service health, run post-deploy smoke test. Use /docpluck-deploy to deploy or verify deployment.
4
+ tags: [docpluck, nextjs, python, vercel, railway, neon, auth, deploy]
5
+ ---
6
+
7
+ ## [MANDATORY FIRST ACTION] preflight (do NOT skip, even if orchestrated by /ship)
8
+
9
+ **Your very first action in this skill, BEFORE reading anything else, is:**
10
+
11
+ 1. Run: `bash ~/.claude/skills/_shared/bin/preflight-filter.sh <this-skill-name>` and print its `🔧 skill-optimize pre-check · ...` heartbeat as your first visible output line.
12
+ 2. Initialize `~/.claude/skills/_shared/run-meta/<this-skill-name>.json` per `~/.claude/skills/_shared/preflight.md` step 6 (include `phase_start_sha` from `git rev-parse HEAD`).
13
+ 3. Load `~/.claude/skills/_shared/quality-loop/core.md` into working memory (MUST-level rules gated by /ship).
14
+
15
+ If you skip these steps, /ship will detect the missing heartbeat and FAIL this phase. Do not proceed to the skill body until preflight has run.
16
+
17
+ # Docpluck Deploy
18
+
19
+ Deploy Docpluck to production on Vercel (frontend) and Railway (extraction service).
20
+
21
+ ## Two-Repo Architecture (CRITICAL — read before deploying)
22
+
23
+ Docpluck is split across **two repos** under `C:\Users\filin\Dropbox\Vibe\MetaScienceTools\`:
24
+
25
+ | Path | Repo | Visibility | Contains |
26
+ |------|------|------------|----------|
27
+ | `docpluck/` | `giladfeldman/docpluck` | **public** | The `docpluck` Python library only (extraction + normalization + quality + DOCX/HTML). Published to PyPI. |
28
+ | `PDFextractor/` | `giladfeldman/docpluckapp` | **private** | The SaaS app only (Next.js frontend, FastAPI service `service/app/main.py`, Drizzle schema, Auth.js). **No library code duplication** — the service imports `docpluck` via a git pin in `service/requirements.txt`. |
29
+
30
+ Library changes therefore reach production via TWO steps:
31
+ 1. Tag + push the library repo (this updates PyPI, but the app pins by git tag).
32
+ 2. Bump the git pin in `PDFextractor/service/requirements.txt` (`docpluck @ git+https://...@v<NEW>`) and redeploy the app.
33
+
34
+ **Skipping step 2 silently keeps production on the old library.** Deploy pre-flight check 4 below catches this.
35
+
36
+ ## Pre-Flight Checks
37
+
38
+ Run ALL checks before deploying. Any failure is a blocker.
39
+
40
+ ### 1. Git Status (both repos)
41
+ ```bash
42
+ cd C:/Users/filin/Dropbox/Vibe/MetaScienceTools/docpluck && git status --short && git log --oneline -3
43
+ echo "---"
44
+ cd C:/Users/filin/Dropbox/Vibe/MetaScienceTools/PDFextractor && git status --short && git log --oneline -3
45
+ ```
46
+ Both working trees must be clean. Library tagged at `v<X.Y.Z>` matching `__version__`.
47
+
48
+ ### 2. Frontend Build
49
+ ```bash
50
+ cd C:/Users/filin/Dropbox/Vibe/MetaScienceTools/PDFextractor/frontend && npm run build
51
+ ```
52
+ Must pass with 0 errors.
53
+
54
+ ### 3. Python Service Module Check
55
+ ```bash
56
+ cd C:/Users/filin/Dropbox/Vibe/MetaScienceTools/PDFextractor/service && python -c "
57
+ from app.main import app
58
+ # Library modules (NOT app.normalize / app.quality — those moved to the docpluck library)
59
+ from docpluck import normalize_text, NormalizationLevel, compute_quality_score, get_version_info
60
+ info = get_version_info()
61
+ print(f'All imports OK; docpluck=={info[\"version\"]} normalize={info[\"normalize_version\"]}')
62
+ "
63
+ ```
64
+
65
+ ### 4. Cross-Repo Library Version Sync (CRITICAL)
66
+
67
+ Verify the app's `service/requirements.txt` git pin matches the library's latest tag. Mismatches mean the deploy will silently ship the OLD library to prod.
68
+
69
+ ```bash
70
+ LIB_VERSION=$(grep '^__version__' C:/Users/filin/Dropbox/Vibe/MetaScienceTools/docpluck/docpluck/__init__.py | grep -oE '[0-9]+\.[0-9]+\.[0-9]+')
71
+ APP_PIN=$(grep -oE 'docpluck.*@v[0-9]+\.[0-9]+\.[0-9]+' C:/Users/filin/Dropbox/Vibe/MetaScienceTools/PDFextractor/service/requirements.txt | grep -oE '[0-9]+\.[0-9]+\.[0-9]+')
72
+ echo "Library __version__: $LIB_VERSION"
73
+ echo "App requirements.txt pin: v$APP_PIN"
74
+ if [ "$LIB_VERSION" != "$APP_PIN" ]; then
75
+ echo "❌ MISMATCH — bump PDFextractor/service/requirements.txt to docpluck @ git+https://github.com/giladfeldman/docpluck.git@v$LIB_VERSION before deploying"
76
+ exit 1
77
+ fi
78
+
79
+ # Also verify the API.md examples are not stale beyond a major version
80
+ API_DOC_VERSION=$(grep -oE 'docpluck_version["\s:]+[0-9]+\.[0-9]+\.[0-9]+' C:/Users/filin/Dropbox/Vibe/MetaScienceTools/PDFextractor/API.md | head -1 | grep -oE '[0-9]+\.[0-9]+\.[0-9]+')
81
+ LIB_MAJOR_MINOR=$(echo "$LIB_VERSION" | cut -d. -f1,2)
82
+ DOC_MAJOR_MINOR=$(echo "$API_DOC_VERSION" | cut -d. -f1,2)
83
+ if [ "$LIB_MAJOR_MINOR" != "$DOC_MAJOR_MINOR" ]; then
84
+ echo "⚠️ API.md examples reference docpluck_version $API_DOC_VERSION; library is at $LIB_VERSION. Update PDFextractor/API.md."
85
+ fi
86
+ echo "✅ Library version sync OK"
87
+ ```
88
+
89
+ ### 5. Verify Vercel Environment Variables
90
+ ```bash
91
+ cd frontend && vercel env ls
92
+ ```
93
+
94
+ Required variables (all must show as "Encrypted"):
95
+ - `DATABASE_URL` — Neon connection string
96
+ - `AUTH_SECRET` — Auth.js session key
97
+ - `AUTH_TRUST_HOST` — must be `true`
98
+ - `AUTH_GITHUB_ID` — GitHub OAuth client ID
99
+ - `AUTH_GITHUB_SECRET` — GitHub OAuth client secret
100
+ - `AUTH_GOOGLE_ID` — Google OAuth client ID
101
+ - `AUTH_GOOGLE_SECRET` — Google OAuth client secret
102
+ - `EXTRACTION_SERVICE_URL` — Railway service URL
103
+
104
+ If any are missing, refer to SETUP_GUIDE.md.
105
+
106
+ ## Deploy
107
+
108
+ ### Frontend (Vercel)
109
+ Push to GitHub triggers auto-deploy:
110
+ ```bash
111
+ git push origin master
112
+ ```
113
+
114
+ Or manual deploy:
115
+ ```bash
116
+ cd frontend && vercel --prod
117
+ ```
118
+
119
+ ### Extraction Service (Railway)
120
+ If connected to GitHub, push triggers auto-deploy.
121
+
122
+ If not connected, deploy via CLI:
123
+ ```bash
124
+ cd service && railway up --detach
125
+ ```
126
+
127
+ Note: Railway CLI upload may timeout. If so, connect GitHub repo in Railway dashboard (root dir `/service`).
128
+
129
+ ## Post-Deploy Verification
130
+
131
+ ### 1. Vercel Deployment Status
132
+ ```bash
133
+ cd frontend && vercel ls | head -5
134
+ ```
135
+ Latest deployment must show `Ready`.
136
+
137
+ ### 2. Frontend Health
138
+ ```bash
139
+ curl -s -o /dev/null -w "%{http_code}" https://docpluck.vercel.app/login
140
+ ```
141
+ Must return 200.
142
+
143
+ ### 3. Railway Service Health
144
+ ```bash
145
+ curl -s https://extraction-service-production-d0e5.up.railway.app/health
146
+ ```
147
+ Must return `{"status":"ok",...}`.
148
+
149
+ ### 4. Smoke Test (if service is live)
150
+ ```bash
151
+ # Test extraction endpoint directly
152
+ curl -s -X POST https://extraction-service-production-d0e5.up.railway.app/extract \
153
+ -F "file=@test-pdfs/apa/chan_feldman_2025_cogemo.pdf" | python -c "
154
+ import sys, json
155
+ data = json.load(sys.stdin)
156
+ print(f'Engine: {data[\"metadata\"][\"engine\"]}')
157
+ print(f'Chars: {data[\"metadata\"][\"chars\"]}')
158
+ print(f'Quality: {data[\"quality\"][\"score\"]}')
159
+ assert data['metadata']['chars'] > 10000, 'Too few chars'
160
+ assert data['quality']['score'] >= 80, 'Quality too low'
161
+ print('Smoke test: PASS')
162
+ "
163
+ ```
164
+
165
+ ## Rollback
166
+
167
+ If deployment fails:
168
+ ```bash
169
+ # Vercel: rollback to previous deployment
170
+ cd frontend && vercel rollback
171
+
172
+ # Railway: redeploy from last working commit
173
+ railway service extraction-service
174
+ railway redeploy
175
+ ```
176
+
177
+ ## Report Format
178
+
179
+ ```
180
+ ## Docpluck Deploy Report
181
+
182
+ ### Pre-Flight
183
+ | Check | Status |
184
+ |-------|--------|
185
+ | Git clean | PASS/FAIL |
186
+ | Frontend build | PASS/FAIL |
187
+ | Service modules | PASS/FAIL |
+ | Library version sync | PASS/FAIL |
188
+ | Env vars | X/Y present |
189
+
190
+ ### Deployment
191
+ | Target | Status | URL |
192
+ |--------|--------|-----|
193
+ | Vercel | DEPLOYED/FAILED | https://docpluck.vercel.app |
194
+ | Railway | DEPLOYED/FAILED | https://extraction-service-production-d0e5.up.railway.app |
195
+
196
+ ### Post-Deploy
197
+ | Check | Status |
198
+ |-------|--------|
199
+ | Frontend 200 | PASS/FAIL |
200
+ | Service health | PASS/FAIL |
201
+ | Smoke test | PASS/FAIL/SKIP |
202
+ ```
203
+
204
+ ## Final step: read ~/.claude/skills/_shared/postflight.md and follow it.
@@ -0,0 +1,251 @@
1
+ ---
2
+ name: docpluck-qa
3
+ description: Comprehensive QA engineer for Docpluck App (PDF + DOCX + HTML extraction SaaS). Runs 364+-test Python suite, ESCIcheck 10-PDF AI verification (library + local webapp + production), normalization spot-check, batch extraction, service health, DB, admin API, and deployment checks. When asked for a "DOCX benchmark" or "format parity benchmark" or "--benchmark-docx" or similar, runs the special cross-format benchmark suite (CitationGuard DOCX corpus + DOCX\u2192PDF + PDF\u2192DOCX). Use /docpluck-qa whenever testing, after changes, or before deployment.
4
+ tags: [python, nextjs, pdf, docx, fastapi, drizzle, neon, qa]
5
+ ---
6
+
7
+ # Docpluck QA
8
+
9
+ ## [MANDATORY FIRST ACTION] preflight (do NOT skip, even if orchestrated by /ship)
10
+
11
+ **Your very first action in this skill, BEFORE reading anything else, is:**
12
+
13
+ 1. Run: `bash ~/.claude/skills/_shared/bin/preflight-filter.sh <this-skill-name>` and print its `🔧 skill-optimize pre-check · ...` heartbeat as your first visible output line.
14
+ 2. Initialize `~/.claude/skills/_shared/run-meta/<this-skill-name>.json` per `~/.claude/skills/_shared/preflight.md` step 6 (include `phase_start_sha` from `git rev-parse HEAD`).
15
+ 3. Load `~/.claude/skills/_shared/quality-loop/core.md` into working memory (MUST-level rules gated by /ship).
16
+
17
+ If you skip these steps, /ship will detect the missing heartbeat and FAIL this phase. Do not proceed to the skill body until preflight has run.
18
+
19
+ You are a QA engineer for Docpluck App, a universal academic PDF text extraction SaaS.
20
+
21
+ ## Project Context
22
+
23
+ - **App repo (private):** `C:\Users\filin\Dropbox\Vibe\MetaScienceTools\PDFextractor` (GitHub: giladfeldman/docpluckapp)
24
+ - **Library repo (public):** `C:\Users\filin\Dropbox\Vibe\MetaScienceTools\docpluck` (GitHub: giladfeldman/docpluck, PyPI: docpluck)
25
+ - **Frontend:** Next.js 16 + Auth.js + Drizzle (in `frontend/`), port 6116
26
+ - **Service:** Python FastAPI importing `docpluck` library (in `service/`), port 6117
27
+ - **Database:** Neon Postgres (docpluck project)
28
+ - **ESCIcheck PDFs:** `C:\Users\filin\Dropbox\Vibe\ESCIcheck\testpdfs\Coded already\` (56 PDFs, APA psychology papers)
29
+ - **Test PDFs:** `test-pdfs/` (47 PDFs, 8 citation styles)
30
+ - **Test suites:** library `tests/` (docpluck repo) + service `service/tests/` (364+ tests total; see check 2)
31
+
32
+ ## QA Checklist
33
+
34
+ Run ALL checks sequentially. Report results in a structured table at the end.
35
+
36
+ ---
37
+
38
+ ### 1. Frontend Build
39
+ ```bash
40
+ cd C:\Users\filin\Dropbox\Vibe\MetaScienceTools\PDFextractor\frontend && npm run build 2>&1 | tail -20
41
+ ```
42
+ Must compile with **0 errors**. Warnings about middleware/turbopack are expected (Next.js 16).
43
+
44
+ ---
45
+
46
+ ### 2. Python Test Suite (CRITICAL — 364+ tests)
47
+
48
+ Run both the library repo and the service repo suites:
49
+
50
+ ```bash
51
+ cd C:\Users\filin\Dropbox\Vibe\MetaScienceTools\docpluck && python -m pytest tests/ -q --tb=short 2>&1 | tail -10
52
+ cd C:\Users\filin\Dropbox\Vibe\MetaScienceTools\PDFextractor\service && python -m pytest tests/ -q --tb=short 2>&1 | tail -10
53
+ ```
54
+ **All tests must pass.** Any failure indicates a regression.
55
+
56
+ Library test coverage (`docpluck/tests/`):
57
+ - `test_normalization.py` — All 15 pipeline steps (S0-S9, A1-A6)
58
+ - `test_d5_normalization_audit.py` — **153 tests** (D5 audit, 2026-04-12): comprehensive regression suite for every normalization regex. Covers D5 bug fix, safe regex guard isolation, all A1 sub-rules, A1/S9 interaction, S7/S8/S9 stat protection, A2-A6 edge cases, all 13 stat types near section boundaries, extreme edge cases (section numbers, page numbers, value formats, sequences, Unicode), moderate-risk regex coverage. **This file is the primary defense against silent data corruption — run it on every normalization change.**
59
+ - `test_quality.py` — Scoring, garbled detection, confidence levels
60
+ - `test_extraction.py` — Real PDFs, SMP recovery, 8 citation styles
61
+ - `test_edge_cases.py` — Cross-project lessons (dropped decimals, Unicode soup, column merges)
62
+ - `test_extract_html.py` — 46 tests, block/inline tree-walk, ChanORCID regression
63
+ - `test_extract_docx.py` — 18 tests, mammoth integration, soft breaks, smart quotes
64
+ - `test_benchmark_docx_html.py` — 12 tests, ground-truth passage survival for DOCX/HTML
65
+ - `test_metaesci_followups.py` — D3/D5/D6/D7 regression tests
66
+
67
+ Service test coverage (`PDFextractor/service/tests/`):
68
+ - `test_api_integration.py` — FastAPI /health and /extract endpoints
69
+ - `test_benchmark.py` — Ground truth regression, idempotency
70
+
71
+ ---
72
+
73
+ ### 2b. D5 Normalization Regression (CRITICAL — 153 tests)
74
+
75
+ This dedicated check runs the D5 audit test suite that guards against silent data
76
+ corruption in the normalization pipeline. Added 2026-04-12 after MetaESCI found
77
+ that a single regex destroyed ~800-1,200 stat lines across ~1,590 PDFs with zero
78
+ warnings. **This check is mandatory after ANY change to normalize.py.**
79
+
80
+ ```bash
81
+ cd C:\Users\filin\Dropbox\Vibe\MetaScienceTools\docpluck && python -m pytest tests/test_d5_normalization_audit.py -v --tb=short 2>&1 | tail -30
82
+ ```
83
+
84
+ **All 153 tests must pass.** Key coverage areas:
85
+ - D5 bug regression (12 tests): all MetaESCI corruption cases must NOT recur
86
+ - Safe regex guards (11 tests): both letter-start and p-value-format guards work
87
+ - A1 sub-rule isolation (17 tests): every stat linebreak repair rule independently
88
+ - A1/S9 interaction (6 tests): stat values protected from page-number stripping
89
+ - All 13 stat types near section boundaries: p, d, g, r, F, t, chi2, eta2, omega2, beta, OR, CI, RR
90
+ - Extreme edge cases (32 tests): section numbers, page numbers, value formats, Unicode
91
+
92
+ **Regex safety rules** (from D5 lesson):
93
+ 1. NEVER use `[^\n]` or `.` as catch-all in `re.sub` patterns
94
+ 2. ALWAYS constrain BOTH skipped content AND replacement target (two independent guards)
95
+ 3. Test every regex against `stat-value\nsection-number` patterns (18.5% of PDFs)
96
+ 4. Prefer narrow character classes (`[a-zA-Z]`) over broad exclusions (`[^\n]`)
97
+
98
+ ---
99
+
100
+ ### 3. Normalization Spot-Check
101
+ ```bash
102
+ cd C:\Users\filin\Dropbox\Vibe\MetaScienceTools\PDFextractor\service && python -c "
103
+ from docpluck import normalize_text, NormalizationLevel
104
+
105
+ raw = 'The signi\ufb01cant result was r(261) = \u22120.73, 95%\nCI [\u22120.78; \u22120.67], p\n< .001, d = 484'
106
+ result, report = normalize_text(raw, NormalizationLevel.academic)
107
+ assert 'significant' in result, 'S3 ligature failed'
108
+ assert '-0.73' in result, 'S5 unicode minus failed'
109
+ assert '95% CI' in result, 'A1 stat linebreak failed'
110
+ assert '[-0.78, -0.67]' in result, 'A4 CI delimiter failed'
111
+ assert 'p < .001' in result, 'A1 p-value linebreak failed'
112
+ assert '.484' in result, 'A2 dropped decimal failed'
113
+ print(f'Pipeline: PASS ({len(report.steps_applied)} steps, {len(report.changes_made)} changes)')
114
+ print(f'Version: {report.version}')
115
+ "
116
+ ```
117
+
118
+ ---
119
+
120
+ ### 4. SMP Recovery Test
121
+ ```bash
122
+ cd C:\Users\filin\Dropbox\Vibe\MetaScienceTools\PDFextractor\service && python -c "
123
+ import os
124
+ pdf_path = '../test-pdfs/nature/nathumbeh_2.pdf'
125
+ if os.path.exists(pdf_path):
126
+ from docpluck import extract_pdf
127
+ with open(pdf_path, 'rb') as f:
128
+ content = f.read()
129
+ text, method = extract_pdf(content)
130
+ assert text.count('\ufffd') == 0, f'SMP recovery failed: {text.count(chr(0xFFFD))} garbled'
131
+ assert 'pdfplumber' in method, f'SMP recovery not triggered: {method}'
132
+ print(f'SMP Recovery: PASS (method={method})')
133
+ else:
134
+ print('SMP Recovery: SKIP (no test PDF)')
135
+ "
136
+ ```
137
+
138
+ ---
139
+
140
+ ### 5. ESCIcheck 10-PDF Verification — Library (CRITICAL)
141
+
142
+ CRITICAL check: runs 10 ESCIcheck PDFs through the library extract+normalize pipeline, verifies chars, quality score, p-values, method, and sample coherence.
143
+
144
+ **Full procedure:** [references/check-5-escicheck-library.md](references/check-5-escicheck-library.md)
145
+
146
+ ### 6. ESCIcheck 10-PDF Verification — Local Webapp (CRITICAL)
147
+
148
+ CRITICAL check: same 10 PDFs through the local webapp /extract endpoint. Verifies HTTP status, engine, quality, and timing.
149
+
150
+ **Full procedure:** [references/check-6-escicheck-local-webapp.md](references/check-6-escicheck-local-webapp.md)
151
+
152
+ ### 7. Batch Extraction Smoke Test (test-pdfs/)
153
+
154
+ Walks the test-pdfs/ tree, runs extract_pdf on each, reports failures. Default corpus ~47 PDFs.
155
+
156
+ **Full procedure:** [references/check-7-batch-smoke.md](references/check-7-batch-smoke.md)
157
+
158
+ ### 8. Service Health Endpoint
159
+ ```bash
160
+ curl -s http://localhost:6117/health
161
+ ```
162
+ Must return `{"status":"ok","pdftotext":"...","engines":["pdftotext_default"]}`.
163
+
164
+ ---
165
+
166
+ ### 9. Database Connectivity
167
+ ```bash
168
+ cd C:\Users\filin\Dropbox\Vibe\MetaScienceTools\PDFextractor\frontend && node -e "
169
+ const { neon } = require('@neondatabase/serverless');
170
+ require('dotenv').config({ path: '.env.local' });
171
+ const sql = neon(process.env.DATABASE_URL);
172
+ sql\`SELECT table_name FROM information_schema.tables WHERE table_schema = 'public' ORDER BY table_name\`.then(r => {
173
+ const tables = r.map(t => t.table_name);
174
+ const expected = ['user', 'account', 'session', 'verificationToken', 'api_key', 'extraction_cache', 'usage_log'];
175
+ const missing = expected.filter(t => !tables.includes(t));
176
+ if (missing.length) console.log('MISSING:', missing.join(', '));
177
+ else console.log('Database: PASS (7/7 tables)', tables.join(', '));
178
+ }).catch(e => console.log('Database: FAIL', e.message));
179
+ "
180
+ ```
181
+
182
+ ---
183
+
184
+ ### 10. Admin API Smoke Test
185
+ ```bash
186
+ curl -s http://localhost:6116/api/admin/health | python -c "import sys,json; d=json.load(sys.stdin); print('Admin health:', d.get('service',{}).get('status','?'))"
187
+ curl -s http://localhost:6116/api/admin/stats | python -c "import sys,json; d=json.load(sys.stdin); print('Admin stats:', 'users' in d and 'keys' in d)"
188
+ ```
189
+
190
+ ---
191
+
192
+ ### 11. Hard Rules Verification
193
+
194
+ Asserts 4 hard rules: no -layout flag in pdftotext calls, no AGPL imports, U+2212 normalization present, library version consistency.
195
+
196
+ **Full procedure:** [references/check-11-hard-rules.md](references/check-11-hard-rules.md)
197
+
198
+ ### 12. Production Deployment (Vercel + Railway)
199
+ ```bash
200
+ # Vercel frontend
201
+ curl -s -o /dev/null -w "Vercel: HTTP %{http_code}\n" https://docpluck.vercel.app/login
202
+
203
+ # Railway extraction service
204
+ curl -s https://extraction-service-production-d0e5.up.railway.app/health | python -c "import sys,json; d=json.load(sys.stdin); print('Railway:', d.get('status','error'), d.get('pdftotext','unknown'))"
205
+ ```
206
+
207
+ ---
208
+
209
+ ### 13. ESCIcheck 10-PDF Verification — Production Webapp (CRITICAL)
210
+
211
+ CRITICAL check: same 10 PDFs through the production Vercel endpoint. Verifies auth, cache behavior, and parity with local.
212
+
213
+ **Full procedure:** [references/check-13-escicheck-production.md](references/check-13-escicheck-production.md)
214
+
215
+ ## Special Benchmark Mode: DOCX/PDF Parity + MetaESCI Regression
216
+
217
+ Opt-in cross-format benchmark suite — DOCX corpus integrity, DOCX↔PDF parity via Word COM, PDF↔DOCX parity via pdf2docx, and the 200-DOI MetaESCI regression. Runtime 5-15 min (launches Word). Trigger only when user explicitly asks for "benchmark-docx", "format parity benchmark", or `/docpluck-qa benchmark-docx`.
218
+
219
+ **Full procedure** (prerequisites, per-benchmark running instructions, pass criteria, interpretation, output format): [references/benchmark-mode.md](references/benchmark-mode.md)
220
+
221
+ ## Report Format
222
+
223
+ ```
224
+ ## Docpluck QA Report — [date]
225
+
226
+ | # | Check | Status | Details |
227
+ |---|-------|--------|---------|
228
+ | 1 | Frontend build | PASS/FAIL | 0 errors |
229
+ | 2 | Python test suite (364+) | PASS/FAIL | library + service, all passed |
230
+ | 3 | Normalization spot-check | PASS/FAIL | 15 steps, N changes |
231
+ | 4 | SMP recovery | PASS/FAIL/SKIP | method used |
232
+ | 5 | ESCIcheck 10-PDF (library) | PASS/FAIL | X/10 passed |
233
+ | 6 | ESCIcheck 10-PDF (local webapp) | PASS/FAIL | X/10 passed |
234
+ | 7 | Batch extraction (test-pdfs/) | PASS/FAIL | X/47 succeeded |
235
+ | 8 | Service health | PASS/FAIL | pdftotext version |
236
+ | 9 | Database connectivity | PASS/FAIL | 7/7 tables |
237
+ | 10 | Admin API | PASS/FAIL | health + stats |
238
+ | 11 | Hard rules (4 checks) | PASS/FAIL | no -layout, no AGPL, U+2212, version |
239
+ | 12 | Production health | PASS/FAIL | HTTP codes |
240
+ | 13 | ESCIcheck 10-PDF (production) | PASS/FAIL/SKIP | X/10 passed |
241
+
242
+ **Overall: X/13 checks passed**
243
+
244
+ ### Issues Found
245
+ - [list any failures with exact error messages and file:line]
246
+
247
+ ### AI Verification Notes (Checks 5, 6, 13)
248
+ For each PDF: file name, chars, quality, p-values found, sample text judgment (coherent/garbled/column-interleaved)
249
+ ```
250
+
251
+ ## Final step: read ~/.claude/skills/_shared/postflight.md and follow it.
@@ -0,0 +1,106 @@
1
+ # Special Benchmark Mode: DOCX/PDF Parity + MetaESCI Regression
2
+
3
+ _Extracted from [../SKILL.md](../SKILL.md). Opt-in; 5-15 min runtime._
4
+
5
+
6
+ Opt-in cross-format benchmark suite (DOCX corpus, DOCX↔PDF parity via Word COM, PDF↔DOCX via pdf2docx, MetaESCI 200-DOI regression). Takes 5-15 minutes; only run when explicitly requested.
7
+
8
+ **Full procedure:** this document is the full procedure (summarized in, and linked from, [../SKILL.md](../SKILL.md)).
9
+
10
+ ### Prerequisites
11
+
12
+ - Microsoft Word installed (for `docx2pdf`, accessed via COM)
13
+ - Python packages: `mammoth`, `beautifulsoup4`, `lxml`, `rapidfuzz`, `pdf2docx`, `docx2pdf`
14
+ - CitationGuard corpus present at `C:\Users\filin\Dropbox\Vibe\CitationGuard\apps\worker\testpdfs\validation\docx\`
15
+ - PDF test corpus at `C:\Users\filin\Dropbox\Vibe\PDFextractor\test-pdfs\`
16
+
17
+ ### Running
18
+
19
+ Full benchmark (5–15 minutes, launches Word):
20
+ ```bash
21
+ cd C:\Users\filin\Dropbox\Vibe\docpluck && python benchmarks/run_all.py
22
+ ```
23
+
24
+ Quick mode (3 files per benchmark, 2–4 minutes):
25
+ ```bash
26
+ cd C:\Users\filin\Dropbox\Vibe\docpluck && python benchmarks/run_all.py --quick
27
+ ```
28
+
29
+ Skip Word-based conversion (if Word unavailable):
30
+ ```bash
31
+ cd C:\Users\filin\Dropbox\Vibe\docpluck && python benchmarks/run_all.py --skip docx2pdf
32
+ ```
33
+
34
+ Individual benchmarks:
35
+ ```bash
36
+ # 1. DOCX corpus (24 files, ~45s)
37
+ python benchmarks/bench_docx_corpus.py --json benchmarks/results/docx_corpus.json
38
+
39
+ # 2. DOCX → PDF cross-format (20 files via Word COM, ~5-10 min)
40
+ python benchmarks/bench_docx_vs_pdf.py --json benchmarks/results/docx_vs_pdf.json
41
+
42
+ # 3. PDF → DOCX reverse cross-format (8 files via pdf2docx, ~2-4 min)
43
+ python benchmarks/bench_pdf_vs_docx.py --json benchmarks/results/pdf_vs_docx.json
44
+
45
+ # 4. MetaESCI regression baseline (200 frozen DOIs, ~70s)
46
+ python benchmarks/bench_metaesci_regression.py --json benchmarks/results/metaesci_regression.json
47
+ ```
48
+
49
+ ### What Each Benchmark Validates
50
+
51
+ **1. DOCX corpus benchmark** (`bench_docx_corpus.py`)
52
+ - Runs `extract_docx()` on all 24 CitationGuard DOCX files
53
+ - Validates: all 20 real papers extract successfully; all 4 corrupted files fail correctly
54
+ - Checks quality score ≥80 and "high" confidence for every real paper
55
+ - Detects garbled extractions (should be 0)
56
+ - Reports total chars, per-file times, aggregate stats
57
+
58
+ **2. DOCX → PDF cross-format benchmark** (`bench_docx_vs_pdf.py`)
59
+ - For each DOCX: extract text, convert DOCX→PDF via Word (`docx2pdf`), extract PDF text, compare
60
+ - Uses `rapidfuzz.token_set_ratio` to measure similarity (should be ≥ 80%)
61
+ - Uses char-count ratio to detect content loss (should be 0.7–1.3)
62
+ - Both extractors should produce "high" confidence quality scores
63
+ - This is the A/B gold-standard: same content, two extraction paths — asymmetry reveals bugs in either extractor
64
+
65
+ **3. PDF → DOCX reverse cross-format benchmark** (`bench_pdf_vs_docx.py`)
66
+ - For each PDF: extract text, convert PDF→DOCX via `pdf2docx` (pure Python), extract DOCX text, compare
67
+ - Default selection: 8 PDFs across APA, Vancouver, AMA, Harvard, IEEE, Nature styles
68
+ - Same pass criteria as #2
69
+
+ **4. MetaESCI regression benchmark** (`bench_metaesci_regression.py`)
+ - Re-extracts the 200 frozen DOIs and compares against the stored baseline: extraction success, confidence distribution, average quality, U+FFFD count, stat-linebreak and dropped-decimal survivors (see Pass Criteria below)
+
70
+ ### Pass Criteria
71
+
72
+ Benchmark | Threshold
73
+ --- | ---
74
+ DOCX corpus extraction success | 20/20 real papers
75
+ DOCX corpus quality score | avg ≥ 80, all high confidence
76
+ DOCX corpus garbled count | 0
77
+ DOCX→PDF similarity (token_set_ratio) | avg ≥ 80%, min ≥ 80%
78
+ DOCX→PDF char ratio | 0.7–1.3
79
+ PDF→DOCX similarity | avg ≥ 80%, min ≥ 80%
80
+ PDF→DOCX char ratio | 0.7–1.3
81
+ MetaESCI extraction success | 200/200 (zero crashes, zero extraction errors)
82
+ MetaESCI high confidence | ≥ 99% (198/200+)
83
+ MetaESCI avg quality | ≥ 80 / 100
84
+ MetaESCI total U+FFFD | 0
85
+ MetaESCI stat linebreak survivors | 0
86
+ MetaESCI dropped decimal survivors | 0 (watch — current baseline has 21 in 3 files, real gaps pending A1/A2 enhancement)
87
+
88
+ ### Interpreting Results
89
+
90
+ - **Similarity < 80%**: the two extractors are producing substantially different text for the same content. Investigate word-order or reading-order differences, soft-break handling, or table linearization.
91
+ - **Char ratio far from 1.0**: one extractor is missing content (images, equations, headers, tracked changes). Investigate which format is losing data.
92
+ - **DOCX quality score < PDF quality score**: something is wrong — DOCX should be cleaner than PDF (no column interleaving, no ligatures, no SMP Unicode). This is a red flag for mammoth configuration or the tree-walk.
93
+ - **OMML equations**: known limitation — mammoth drops Office Math objects. Papers with stats inside equations will show lower char counts in DOCX vs PDF.
94
+
95
+ ### Output
96
+
97
+ The benchmark suite writes the following files to `benchmarks/results/`:
98
+ - `docx_corpus.json` — per-file metrics
99
+ - `docx_vs_pdf.json` — cross-format comparison
100
+ - `pdf_vs_docx.json` — reverse comparison
101
+ - `metaesci_regression.json` — MetaESCI regression baseline comparison
+ - `REPORT.md` — consolidated markdown report
102
+
103
+ After running, read `benchmarks/results/REPORT.md` and report the summary to the user.
104
+
105
+ ---
106
+
@@ -0,0 +1,44 @@
1
+ # 11. Hard Rules Verification
2
+
3
+ _Extracted from [../SKILL.md](../SKILL.md). Full procedure lives here._
4
+
5
+ ```bash
6
+ cd C:\Users\filin\Dropbox\Vibe\PDFextractor\service && python -c "
7
+ import re
8
+
9
+ # Rule 1: No -layout flag in pdftotext calls (check library)
10
+ import docpluck.extract as ext_mod
11
+ import inspect
12
+ source = inspect.getsource(ext_mod)
13
+ calls = re.findall(r'subprocess\.run\(\s*\[.*?\]', source, re.DOTALL)
14
+ for call in calls:
15
+ assert '-layout' not in call, f'BLOCKER: -layout in {call}'
16
+ print('Rule 1 (no -layout): PASS')
17
+
18
+ # Rule 2: No AGPL imports in library or service
19
+ import docpluck.normalize as norm_mod, docpluck.quality as qual_mod
20
+ for name, mod in [('normalize', norm_mod), ('quality', qual_mod), ('extract', ext_mod)]:
21
+ src = inspect.getsource(mod)
22
+ assert 'pymupdf4llm' not in src, f'AGPL import in {name}'
23
+ assert 'column_boxes' not in src, f'AGPL method in {name}'
24
+ with open('app/main.py') as f:
25
+ main_src = f.read()
26
+ assert 'pymupdf4llm' not in main_src, 'AGPL import in main.py'
27
+ print('Rule 2 (no AGPL): PASS')
28
+
29
+ # Rule 3: U+2212 normalization exists in library (check file bytes to avoid encoding issues)
30
+ import docpluck.normalize as _nm_mod
31
+ with open(_nm_mod.__file__, 'rb') as _f:
32
+ _norm_bytes = _f.read()
33
+ assert b'\\u2212' in _norm_bytes or b'\xe2\x88\x92' in _norm_bytes, 'U+2212 normalization missing'
34
+ print('Rule 3 (U+2212 norm): PASS')
35
+
36
+ # Rule 4: Library version is consistent
37
+ import docpluck
38
+ assert docpluck.__version__ == '1.5.0', f'Version mismatch: {docpluck.__version__}'
39
+ print(f'Rule 4 (version=1.5.0): PASS')
40
+ "
41
+ ```
42
+
43
+ ---
44
+