docpluck 1.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docpluck-1.5.0/.claude/skills/docpluck-cleanup/SKILL.md +70 -0
- docpluck-1.5.0/.claude/skills/docpluck-deploy/SKILL.md +204 -0
- docpluck-1.5.0/.claude/skills/docpluck-qa/SKILL.md +251 -0
- docpluck-1.5.0/.claude/skills/docpluck-qa/references/benchmark-mode.md +106 -0
- docpluck-1.5.0/.claude/skills/docpluck-qa/references/check-11-hard-rules.md +44 -0
- docpluck-1.5.0/.claude/skills/docpluck-qa/references/check-13-escicheck-production.md +67 -0
- docpluck-1.5.0/.claude/skills/docpluck-qa/references/check-5-escicheck-library.md +69 -0
- docpluck-1.5.0/.claude/skills/docpluck-qa/references/check-6-escicheck-local-webapp.md +67 -0
- docpluck-1.5.0/.claude/skills/docpluck-qa/references/check-7-batch-smoke.md +36 -0
- docpluck-1.5.0/.claude/skills/docpluck-review/SKILL.md +110 -0
- docpluck-1.5.0/.github/workflows/publish.yml +33 -0
- docpluck-1.5.0/.github/workflows/test.yml +31 -0
- docpluck-1.5.0/.gitignore +19 -0
- docpluck-1.5.0/CHANGELOG.md +382 -0
- docpluck-1.5.0/CLAUDE.md +62 -0
- docpluck-1.5.0/LICENSE +21 -0
- docpluck-1.5.0/PKG-INFO +451 -0
- docpluck-1.5.0/REPLY_FROM_DOCPLUCK_v1.4.5.md +164 -0
- docpluck-1.5.0/REPLY_FROM_DOCPLUCK_v1.5.0.md +77 -0
- docpluck-1.5.0/REQUEST_08_CHUNKING_ENDPOINT.md +348 -0
- docpluck-1.5.0/REQUEST_09_REFERENCE_LIST_NORMALIZATION.md +168 -0
- docpluck-1.5.0/docpluck/__init__.py +89 -0
- docpluck-1.5.0/docpluck/__main__.py +3 -0
- docpluck-1.5.0/docpluck/batch.py +183 -0
- docpluck-1.5.0/docpluck/cli.py +35 -0
- docpluck-1.5.0/docpluck/extract.py +191 -0
- docpluck-1.5.0/docpluck/extract_docx.py +64 -0
- docpluck-1.5.0/docpluck/extract_html.py +149 -0
- docpluck-1.5.0/docpluck/normalize.py +637 -0
- docpluck-1.5.0/docpluck/quality.py +92 -0
- docpluck-1.5.0/docpluck/version.py +58 -0
- docpluck-1.5.0/docs/BENCHMARKS.md +405 -0
- docpluck-1.5.0/docs/DESIGN.md +277 -0
- docpluck-1.5.0/docs/NORMALIZATION.md +428 -0
- docpluck-1.5.0/docs/README.md +405 -0
- docpluck-1.5.0/docs/superpowers/specs/2026-04-27-request-09-reference-normalization-design.md +75 -0
- docpluck-1.5.0/pyproject.toml +62 -0
- docpluck-1.5.0/tests/__init__.py +0 -0
- docpluck-1.5.0/tests/conftest.py +47 -0
- docpluck-1.5.0/tests/test_benchmark_docx_html.py +260 -0
- docpluck-1.5.0/tests/test_d5_normalization_audit.py +832 -0
- docpluck-1.5.0/tests/test_edge_cases.py +247 -0
- docpluck-1.5.0/tests/test_extract_docx.py +187 -0
- docpluck-1.5.0/tests/test_extract_html.py +315 -0
- docpluck-1.5.0/tests/test_extraction.py +120 -0
- docpluck-1.5.0/tests/test_metaesci_followups.py +169 -0
- docpluck-1.5.0/tests/test_normalization.py +802 -0
- docpluck-1.5.0/tests/test_quality.py +123 -0
- docpluck-1.5.0/tests/test_request_09_reference_normalization.py +89 -0
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: docpluck-cleanup
|
|
3
|
+
description: Clean up Docpluck codebase. Sync CLAUDE.md/README.md/ARCHITECTURE.md against actual code, remove dead benchmark scripts, verify LESSONS.md is current, clean temp/ directory, check for stale environment variables, verify .gitignore covers sensitive files, update TODO.md progress. Use /docpluck-cleanup periodically or before releases.
|
|
4
|
+
tags: [python, pdf, docx, html, fastapi, nextjs, docs, cleanup]
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## [MANDATORY FIRST ACTION] preflight (do NOT skip, even if orchestrated by /ship)
|
|
8
|
+
|
|
9
|
+
**Your very first action in this skill, BEFORE reading anything else, is:**
|
|
10
|
+
|
|
11
|
+
1. Run: `bash ~/.claude/skills/_shared/bin/preflight-filter.sh <this-skill-name>` and print its `🔧 skill-optimize pre-check · ...` heartbeat as your first visible output line.
|
|
12
|
+
2. Initialize `~/.claude/skills/_shared/run-meta/<this-skill-name>.json` per `~/.claude/skills/_shared/preflight.md` step 6 (include `phase_start_sha` from `git rev-parse HEAD`).
|
|
13
|
+
3. Load `~/.claude/skills/_shared/quality-loop/core.md` into working memory (MUST-level rules gated by /ship).
|
|
14
|
+
|
|
15
|
+
If you skip these steps, /ship will detect the missing heartbeat and FAIL this phase. Do not proceed to the skill body until preflight has run.
|
|
16
|
+
|
|
17
|
+
# Docpluck Cleanup
|
|
18
|
+
|
|
19
|
+
You are a codebase janitor for Docpluck. Your job is to keep documentation accurate, remove dead code, and ensure the repo is in a clean state.
|
|
20
|
+
|
|
21
|
+
## Project Location
|
|
22
|
+
`C:\Users\filin\Dropbox\Vibe\MetaScienceTools\PDFextractor`
|
|
23
|
+
|
|
24
|
+
## Cleanup Checklist
|
|
25
|
+
|
|
26
|
+
### 1. Documentation Sync
|
|
27
|
+
Read the actual code and verify each doc file is accurate:
|
|
28
|
+
|
|
29
|
+
- **CLAUDE.md** — Does the project structure match? Are the commands correct? Do the critical rules still apply?
|
|
30
|
+
- **README.md** — Does the architecture diagram match reality? Are the tech versions correct?
|
|
31
|
+
- **ARCHITECTURE.md** — Do the system diagrams match actual deployment? Are all tables listed?
|
|
32
|
+
- **API.md** — Do endpoint signatures match `service/app/main.py`? Are normalization steps current with `service/app/normalize.py`?
|
|
33
|
+
- **SETUP_GUIDE.md** — Are all URLs, project IDs, and domain names current?
|
|
34
|
+
- **TODO.md** — Mark completed items. Remove items that are no longer relevant.
|
|
35
|
+
|
|
36
|
+
### 2. Dead Code Removal
|
|
37
|
+
Check `scripts/` for benchmark scripts that were intermediate steps and no longer needed:
|
|
38
|
+
- `scripts/investigate_stat_loss.py` — diagnostic, can be removed
|
|
39
|
+
- `scripts/investigate_stat_loss2.py` — diagnostic, can be removed
|
|
40
|
+
- `scripts/trace_pipeline.py` — diagnostic, can be removed
|
|
41
|
+
- `scripts/trace_pipeline2.py` — diagnostic, can be removed
|
|
42
|
+
- `scripts/verify_ieee_fp.py` — diagnostic, can be removed
|
|
43
|
+
- `scripts/quick_apa_verify.py` — diagnostic, can be removed
|
|
44
|
+
- `scripts/diagnose_mismatches.py` — diagnostic, can be removed
|
|
45
|
+
- `scripts/benchmark_docling_optimized.py` — Docling was dropped
|
|
46
|
+
|
|
47
|
+
Keep: `benchmark.py`, `ground_truth_verify.py`, `optimized_extractors.py`, `pdftotext_enhanced.py`, `setup_test_pdfs.py`, `ai_verify.py`, `final_showdown.py`
|
|
48
|
+
|
|
49
|
+
### 3. Temp Directory
|
|
50
|
+
```bash
|
|
51
|
+
rm -rf temp/
|
|
52
|
+
```
|
|
53
|
+
Contains rendered PNG pages from verification — not needed in repo.
|
|
54
|
+
|
|
55
|
+
### 4. Environment Variable Audit
|
|
56
|
+
- Check `.env.local` has no real secrets committed
|
|
57
|
+
- Verify `.gitignore` includes: `.env`, `.env.local`, `.env.production`, `.vercel/`, `test-pdfs/`, `temp/`, `__pycache__/`, `node_modules/`, `.next/`
|
|
58
|
+
|
|
59
|
+
### 5. Stale Dependencies
|
|
60
|
+
- Check `frontend/package.json` for unused deps
|
|
61
|
+
- Check `service/requirements.txt` — should only have fastapi, uvicorn, pdfplumber, python-multipart
|
|
62
|
+
- Verify no pymupdf or pymupdf4llm in requirements (AGPL dropped)
|
|
63
|
+
|
|
64
|
+
### 6. Memory Files
|
|
65
|
+
Check `C:\Users\filin\.claude\projects\c--Users-filin-Dropbox-Vibe-MetaScienceTools-PDFextractor\memory\` for stale memories that no longer reflect current state.
|
|
66
|
+
|
|
67
|
+
## Output Format
|
|
68
|
+
Report what was cleaned, what was updated, and what needs manual attention.
|
|
69
|
+
|
|
70
|
+
## Final step: read ~/.claude/skills/_shared/postflight.md and follow it.
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: docpluck-deploy
|
|
3
|
+
description: Deploy Docpluck to production. Pre-flight checks (Next.js build, Python service health, git status), verify Vercel env vars (DATABASE_URL, AUTH_SECRET, AUTH_GITHUB_ID, AUTH_GOOGLE_ID, EXTRACTION_SERVICE_URL), push to GitHub for auto-deploy, verify Vercel deployment status, check Railway extraction service health, run post-deploy smoke test. Use /docpluck-deploy to deploy or verify deployment.
|
|
4
|
+
tags: [docpluck, nextjs, python, vercel, railway, neon, auth, deploy]
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## [MANDATORY FIRST ACTION] preflight (do NOT skip, even if orchestrated by /ship)
|
|
8
|
+
|
|
9
|
+
**Your very first action in this skill, BEFORE reading anything else, is:**
|
|
10
|
+
|
|
11
|
+
1. Run: `bash ~/.claude/skills/_shared/bin/preflight-filter.sh <this-skill-name>` and print its `🔧 skill-optimize pre-check · ...` heartbeat as your first visible output line.
|
|
12
|
+
2. Initialize `~/.claude/skills/_shared/run-meta/<this-skill-name>.json` per `~/.claude/skills/_shared/preflight.md` step 6 (include `phase_start_sha` from `git rev-parse HEAD`).
|
|
13
|
+
3. Load `~/.claude/skills/_shared/quality-loop/core.md` into working memory (MUST-level rules gated by /ship).
|
|
14
|
+
|
|
15
|
+
If you skip these steps, /ship will detect the missing heartbeat and FAIL this phase. Do not proceed to the skill body until preflight has run.
|
|
16
|
+
|
|
17
|
+
# Docpluck Deploy
|
|
18
|
+
|
|
19
|
+
Deploy Docpluck to production on Vercel (frontend) and Railway (extraction service).
|
|
20
|
+
|
|
21
|
+
## Two-Repo Architecture (CRITICAL — read before deploying)
|
|
22
|
+
|
|
23
|
+
Docpluck is split across **two repos** under `C:\Users\filin\Dropbox\Vibe\MetaScienceTools\`:
|
|
24
|
+
|
|
25
|
+
| Path | Repo | Visibility | Contains |
|
|
26
|
+
|------|------|------------|----------|
|
|
27
|
+
| `docpluck/` | `giladfeldman/docpluck` | **public** | The `docpluck` Python library only (extraction + normalization + quality + DOCX/HTML). Published to PyPI. |
|
|
28
|
+
| `PDFextractor/` | `giladfeldman/docpluckapp` | **private** | The SaaS app only (Next.js frontend, FastAPI service `service/app/main.py`, Drizzle schema, Auth.js). **No library code duplication** — the service imports `docpluck` via a git pin in `service/requirements.txt`. |
|
|
29
|
+
|
|
30
|
+
Library changes therefore reach production via TWO steps:
|
|
31
|
+
1. Tag + push the library repo (this updates PyPI, but the app pins by git tag).
|
|
32
|
+
2. Bump the git pin in `PDFextractor/service/requirements.txt` (`docpluck @ git+https://...@v<NEW>`) and redeploy the app.
|
|
33
|
+
|
|
34
|
+
**Skipping step 2 silently keeps production on the old library.** Deploy pre-flight check 4 below catches this.
|
|
35
|
+
|
|
36
|
+
## Pre-Flight Checks
|
|
37
|
+
|
|
38
|
+
Run ALL checks before deploying. Any failure is a blocker.
|
|
39
|
+
|
|
40
|
+
### 1. Git Status (both repos)
|
|
41
|
+
```bash
|
|
42
|
+
cd C:/Users/filin/Dropbox/Vibe/MetaScienceTools/docpluck && git status --short && git log --oneline -3
|
|
43
|
+
echo "---"
|
|
44
|
+
cd C:/Users/filin/Dropbox/Vibe/MetaScienceTools/PDFextractor && git status --short && git log --oneline -3
|
|
45
|
+
```
|
|
46
|
+
Both working trees must be clean. Library tagged at `v<X.Y.Z>` matching `__version__`.
|
|
47
|
+
|
|
48
|
+
### 2. Frontend Build
|
|
49
|
+
```bash
|
|
50
|
+
cd C:/Users/filin/Dropbox/Vibe/MetaScienceTools/PDFextractor/frontend && npm run build
|
|
51
|
+
```
|
|
52
|
+
Must pass with 0 errors.
|
|
53
|
+
|
|
54
|
+
### 3. Python Service Module Check
|
|
55
|
+
```bash
|
|
56
|
+
cd C:/Users/filin/Dropbox/Vibe/MetaScienceTools/PDFextractor/service && python -c "
|
|
57
|
+
from app.main import app
|
|
58
|
+
# Library modules (NOT app.normalize / app.quality — those moved to the docpluck library)
|
|
59
|
+
from docpluck import normalize_text, NormalizationLevel, compute_quality_score, get_version_info
|
|
60
|
+
info = get_version_info()
|
|
61
|
+
print(f'All imports OK; docpluck=={info[\"version\"]} normalize={info[\"normalize_version\"]}')
|
|
62
|
+
"
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### 4. Cross-Repo Library Version Sync (CRITICAL)
|
|
66
|
+
|
|
67
|
+
Verify the app's `service/requirements.txt` git pin matches the library's latest tag. Mismatches mean the deploy will silently ship the OLD library to prod.
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
LIB_VERSION=$(grep '^__version__' C:/Users/filin/Dropbox/Vibe/MetaScienceTools/docpluck/docpluck/__init__.py | grep -oE '[0-9]+\.[0-9]+\.[0-9]+')
|
|
71
|
+
APP_PIN=$(grep -oE 'docpluck.*@v[0-9]+\.[0-9]+\.[0-9]+' C:/Users/filin/Dropbox/Vibe/MetaScienceTools/PDFextractor/service/requirements.txt | grep -oE '[0-9]+\.[0-9]+\.[0-9]+')
|
|
72
|
+
echo "Library __version__: $LIB_VERSION"
|
|
73
|
+
echo "App requirements.txt pin: v$APP_PIN"
|
|
74
|
+
if [ "$LIB_VERSION" != "$APP_PIN" ]; then
|
|
75
|
+
echo "❌ MISMATCH — bump PDFextractor/service/requirements.txt to docpluck @ git+https://github.com/giladfeldman/docpluck.git@v$LIB_VERSION before deploying"
|
|
76
|
+
exit 1
|
|
77
|
+
fi
|
|
78
|
+
|
|
79
|
+
# Also verify the API.md examples are not stale beyond a major version
|
|
80
|
+
API_DOC_VERSION=$(grep -oE 'docpluck_version["\s:]+[0-9]+\.[0-9]+\.[0-9]+' C:/Users/filin/Dropbox/Vibe/MetaScienceTools/PDFextractor/API.md | head -1 | grep -oE '[0-9]+\.[0-9]+\.[0-9]+')
|
|
81
|
+
LIB_MAJOR_MINOR=$(echo "$LIB_VERSION" | cut -d. -f1,2)
|
|
82
|
+
DOC_MAJOR_MINOR=$(echo "$API_DOC_VERSION" | cut -d. -f1,2)
|
|
83
|
+
if [ "$LIB_MAJOR_MINOR" != "$DOC_MAJOR_MINOR" ]; then
|
|
84
|
+
echo "⚠️ API.md examples reference docpluck_version $API_DOC_VERSION; library is at $LIB_VERSION. Update PDFextractor/API.md."
|
|
85
|
+
fi
|
|
86
|
+
echo "✅ Library version sync OK"
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### 5. Verify Vercel Environment Variables
|
|
90
|
+
```bash
|
|
91
|
+
cd frontend && vercel env ls
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
Required variables (all must show as "Encrypted"):
|
|
95
|
+
- `DATABASE_URL` — Neon connection string
|
|
96
|
+
- `AUTH_SECRET` — Auth.js session key
|
|
97
|
+
- `AUTH_TRUST_HOST` — must be `true`
|
|
98
|
+
- `AUTH_GITHUB_ID` — GitHub OAuth client ID
|
|
99
|
+
- `AUTH_GITHUB_SECRET` — GitHub OAuth client secret
|
|
100
|
+
- `AUTH_GOOGLE_ID` — Google OAuth client ID
|
|
101
|
+
- `AUTH_GOOGLE_SECRET` — Google OAuth client secret
|
|
102
|
+
- `EXTRACTION_SERVICE_URL` — Railway service URL
|
|
103
|
+
|
|
104
|
+
If any are missing, refer to SETUP_GUIDE.md.
|
|
105
|
+
|
|
106
|
+
## Deploy
|
|
107
|
+
|
|
108
|
+
### Frontend (Vercel)
|
|
109
|
+
Push to GitHub triggers auto-deploy:
|
|
110
|
+
```bash
|
|
111
|
+
git push origin master
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
Or manual deploy:
|
|
115
|
+
```bash
|
|
116
|
+
cd frontend && vercel --prod
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
### Extraction Service (Railway)
|
|
120
|
+
If connected to GitHub, push triggers auto-deploy.
|
|
121
|
+
|
|
122
|
+
If not connected, deploy via CLI:
|
|
123
|
+
```bash
|
|
124
|
+
cd service && railway up --detach
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
Note: Railway CLI upload may timeout. If so, connect GitHub repo in Railway dashboard (root dir `/service`).
|
|
128
|
+
|
|
129
|
+
## Post-Deploy Verification
|
|
130
|
+
|
|
131
|
+
### 1. Vercel Deployment Status
|
|
132
|
+
```bash
|
|
133
|
+
cd frontend && vercel ls | head -5
|
|
134
|
+
```
|
|
135
|
+
Latest deployment must show `Ready`.
|
|
136
|
+
|
|
137
|
+
### 2. Frontend Health
|
|
138
|
+
```bash
|
|
139
|
+
curl -s -o /dev/null -w "%{http_code}" https://docpluck.vercel.app/login
|
|
140
|
+
```
|
|
141
|
+
Must return 200.
|
|
142
|
+
|
|
143
|
+
### 3. Railway Service Health
|
|
144
|
+
```bash
|
|
145
|
+
curl -s https://extraction-service-production-d0e5.up.railway.app/health
|
|
146
|
+
```
|
|
147
|
+
Must return `{"status":"ok",...}`.
|
|
148
|
+
|
|
149
|
+
### 4. Smoke Test (if service is live)
|
|
150
|
+
```bash
|
|
151
|
+
# Test extraction endpoint directly
|
|
152
|
+
curl -s -X POST https://extraction-service-production-d0e5.up.railway.app/extract \
|
|
153
|
+
-F "file=@test-pdfs/apa/chan_feldman_2025_cogemo.pdf" | python -c "
|
|
154
|
+
import sys, json
|
|
155
|
+
data = json.load(sys.stdin)
|
|
156
|
+
print(f'Engine: {data[\"metadata\"][\"engine\"]}')
|
|
157
|
+
print(f'Chars: {data[\"metadata\"][\"chars\"]}')
|
|
158
|
+
print(f'Quality: {data[\"quality\"][\"score\"]}')
|
|
159
|
+
assert data['metadata']['chars'] > 10000, 'Too few chars'
|
|
160
|
+
assert data['quality']['score'] >= 80, 'Quality too low'
|
|
161
|
+
print('Smoke test: PASS')
|
|
162
|
+
"
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
## Rollback
|
|
166
|
+
|
|
167
|
+
If deployment fails:
|
|
168
|
+
```bash
|
|
169
|
+
# Vercel: rollback to previous deployment
|
|
170
|
+
cd frontend && vercel rollback
|
|
171
|
+
|
|
172
|
+
# Railway: redeploy from last working commit
|
|
173
|
+
railway service extraction-service
|
|
174
|
+
railway redeploy
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
## Report Format
|
|
178
|
+
|
|
179
|
+
```
|
|
180
|
+
## Docpluck Deploy Report
|
|
181
|
+
|
|
182
|
+
### Pre-Flight
|
|
183
|
+
| Check | Status |
|
|
184
|
+
|-------|--------|
|
|
185
|
+
| Git clean | PASS/FAIL |
|
|
186
|
+
| Frontend build | PASS/FAIL |
|
|
187
|
+
| Service modules | PASS/FAIL |
|
|
188
|
+
| Env vars | X/Y present |
|
|
189
|
+
|
|
190
|
+
### Deployment
|
|
191
|
+
| Target | Status | URL |
|
|
192
|
+
|--------|--------|-----|
|
|
193
|
+
| Vercel | DEPLOYED/FAILED | https://docpluck.vercel.app |
|
|
194
|
+
| Railway | DEPLOYED/FAILED | https://extraction-service-production-d0e5.up.railway.app |
|
|
195
|
+
|
|
196
|
+
### Post-Deploy
|
|
197
|
+
| Check | Status |
|
|
198
|
+
|-------|--------|
|
|
199
|
+
| Frontend 200 | PASS/FAIL |
|
|
200
|
+
| Service health | PASS/FAIL |
|
|
201
|
+
| Smoke test | PASS/FAIL/SKIP |
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
## Final step: read ~/.claude/skills/_shared/postflight.md and follow it.
|
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: docpluck-qa
|
|
3
|
+
description: Comprehensive QA engineer for Docpluck App (PDF + DOCX + HTML extraction SaaS). Runs the full Python test suite (364+ tests), ESCIcheck 10-PDF AI verification (library + local webapp + production), normalization spot-check, batch extraction, service health, DB, admin API, and deployment checks. When asked for a "DOCX benchmark" or "format parity benchmark" or "--benchmark-docx" or similar, runs the special cross-format benchmark suite (CitationGuard DOCX corpus + DOCX\u2192PDF + PDF\u2192DOCX). Use /docpluck-qa whenever testing, after changes, or before deployment.
|
|
4
|
+
tags: [python, nextjs, pdf, docx, fastapi, drizzle, neon, qa]
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# Docpluck QA
|
|
8
|
+
|
|
9
|
+
## [MANDATORY FIRST ACTION] preflight (do NOT skip, even if orchestrated by /ship)
|
|
10
|
+
|
|
11
|
+
**Your very first action in this skill, BEFORE reading anything else, is:**
|
|
12
|
+
|
|
13
|
+
1. Run: `bash ~/.claude/skills/_shared/bin/preflight-filter.sh <this-skill-name>` and print its `🔧 skill-optimize pre-check · ...` heartbeat as your first visible output line.
|
|
14
|
+
2. Initialize `~/.claude/skills/_shared/run-meta/<this-skill-name>.json` per `~/.claude/skills/_shared/preflight.md` step 6 (include `phase_start_sha` from `git rev-parse HEAD`).
|
|
15
|
+
3. Load `~/.claude/skills/_shared/quality-loop/core.md` into working memory (MUST-level rules gated by /ship).
|
|
16
|
+
|
|
17
|
+
If you skip these steps, /ship will detect the missing heartbeat and FAIL this phase. Do not proceed to the skill body until preflight has run.
|
|
18
|
+
|
|
19
|
+
You are a QA engineer for Docpluck App, a universal academic document (PDF, DOCX, HTML) text extraction SaaS.
|
|
20
|
+
|
|
21
|
+
## Project Context
|
|
22
|
+
|
|
23
|
+
- **App repo (private):** `C:\Users\filin\Dropbox\Vibe\MetaScienceTools\PDFextractor` (GitHub: giladfeldman/docpluckapp)
|
|
24
|
+
- **Library repo (public):** `C:\Users\filin\Dropbox\Vibe\MetaScienceTools\docpluck` (GitHub: giladfeldman/docpluck, PyPI: docpluck)
|
|
25
|
+
- **Frontend:** Next.js 16 + Auth.js + Drizzle (in `frontend/`), port 6116
|
|
26
|
+
- **Service:** Python FastAPI importing `docpluck` library (in `service/`), port 6117
|
|
27
|
+
- **Database:** Neon Postgres (docpluck project)
|
|
28
|
+
- **ESCIcheck PDFs:** `C:\Users\filin\Dropbox\Vibe\ESCIcheck\testpdfs\Coded already\` (56 PDFs, APA psychology papers)
|
|
29
|
+
- **Test PDFs:** `test-pdfs/` (47 PDFs, 8 citation styles)
|
|
30
|
+
- **Test suites:** library `tests/` (9 files) + `service/tests/` (2 files) — 364+ tests total (see check 2)
|
|
31
|
+
|
|
32
|
+
## QA Checklist
|
|
33
|
+
|
|
34
|
+
Run ALL checks sequentially. Report results in a structured table at the end.
|
|
35
|
+
|
|
36
|
+
---
|
|
37
|
+
|
|
38
|
+
### 1. Frontend Build
|
|
39
|
+
```bash
|
|
40
|
+
cd C:\Users\filin\Dropbox\Vibe\MetaScienceTools\PDFextractor\frontend && npm run build 2>&1 | tail -20
|
|
41
|
+
```
|
|
42
|
+
Must compile with **0 errors**. Warnings about middleware/turbopack are expected (Next.js 16).
|
|
43
|
+
|
|
44
|
+
---
|
|
45
|
+
|
|
46
|
+
### 2. Python Test Suite (CRITICAL — 364+ tests)
|
|
47
|
+
|
|
48
|
+
Run both the library repo and the service repo suites:
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
cd C:\Users\filin\Dropbox\Vibe\MetaScienceTools\docpluck && python -m pytest tests/ -q --tb=short 2>&1 | tail -10
|
|
52
|
+
cd C:\Users\filin\Dropbox\Vibe\MetaScienceTools\PDFextractor\service && python -m pytest tests/ -q --tb=short 2>&1 | tail -10
|
|
53
|
+
```
|
|
54
|
+
**All tests must pass.** Any failure indicates a regression.
|
|
55
|
+
|
|
56
|
+
Library test coverage (`docpluck/tests/`):
|
|
57
|
+
- `test_normalization.py` — All 15 pipeline steps (S0-S9, A1-A6)
|
|
58
|
+
- `test_d5_normalization_audit.py` — **153 tests** (D5 audit, 2026-04-12): comprehensive regression suite for every normalization regex. Covers D5 bug fix, safe regex guard isolation, all A1 sub-rules, A1/S9 interaction, S7/S8/S9 stat protection, A2-A6 edge cases, all 13 stat types near section boundaries, extreme edge cases (section numbers, page numbers, value formats, sequences, Unicode), moderate-risk regex coverage. **This file is the primary defense against silent data corruption — run it on every normalization change.**
|
|
59
|
+
- `test_quality.py` — Scoring, garbled detection, confidence levels
|
|
60
|
+
- `test_extraction.py` — Real PDFs, SMP recovery, 8 citation styles
|
|
61
|
+
- `test_edge_cases.py` — Cross-project lessons (dropped decimals, Unicode soup, column merges)
|
|
62
|
+
- `test_extract_html.py` — 46 tests, block/inline tree-walk, ChanORCID regression
|
|
63
|
+
- `test_extract_docx.py` — 18 tests, mammoth integration, soft breaks, smart quotes
|
|
64
|
+
- `test_benchmark_docx_html.py` — 12 tests, ground-truth passage survival for DOCX/HTML
|
|
65
|
+
- `test_metaesci_followups.py` — D3/D5/D6/D7 regression tests
|
|
66
|
+
|
|
67
|
+
Service test coverage (`PDFextractor/service/tests/`):
|
|
68
|
+
- `test_api_integration.py` — FastAPI /health and /extract endpoints
|
|
69
|
+
- `test_benchmark.py` — Ground truth regression, idempotency
|
|
70
|
+
|
|
71
|
+
---
|
|
72
|
+
|
|
73
|
+
### 2b. D5 Normalization Regression (CRITICAL — 153 tests)
|
|
74
|
+
|
|
75
|
+
This dedicated check runs the D5 audit test suite that guards against silent data
|
|
76
|
+
corruption in the normalization pipeline. Added 2026-04-12 after MetaESCI found
|
|
77
|
+
that a single regex destroyed ~800-1,200 stat lines across ~1,590 PDFs with zero
|
|
78
|
+
warnings. **This check is mandatory after ANY change to normalize.py.**
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
cd C:\Users\filin\Dropbox\Vibe\MetaScienceTools\docpluck && python -m pytest tests/test_d5_normalization_audit.py -v --tb=short 2>&1 | tail -30
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
**All 153 tests must pass.** Key coverage areas:
|
|
85
|
+
- D5 bug regression (12 tests): all MetaESCI corruption cases must NOT recur
|
|
86
|
+
- Safe regex guards (11 tests): both letter-start and p-value-format guards work
|
|
87
|
+
- A1 sub-rule isolation (17 tests): every stat linebreak repair rule independently
|
|
88
|
+
- A1/S9 interaction (6 tests): stat values protected from page-number stripping
|
|
89
|
+
- All 13 stat types near section boundaries: p, d, g, r, F, t, chi2, eta2, omega2, beta, OR, CI, RR
|
|
90
|
+
- Extreme edge cases (32 tests): section numbers, page numbers, value formats, Unicode
|
|
91
|
+
|
|
92
|
+
**Regex safety rules** (from D5 lesson):
|
|
93
|
+
1. NEVER use `[^\n]` or `.` as catch-all in `re.sub` patterns
|
|
94
|
+
2. ALWAYS constrain BOTH skipped content AND replacement target (two independent guards)
|
|
95
|
+
3. Test every regex against `stat-value\nsection-number` patterns (18.5% of PDFs)
|
|
96
|
+
4. Prefer narrow character classes (`[a-zA-Z]`) over broad exclusions (`[^\n]`)
|
|
97
|
+
|
|
98
|
+
---
|
|
99
|
+
|
|
100
|
+
### 3. Normalization Spot-Check
|
|
101
|
+
```bash
|
|
102
|
+
cd C:\Users\filin\Dropbox\Vibe\MetaScienceTools\PDFextractor\service && python -c "
|
|
103
|
+
from docpluck import normalize_text, NormalizationLevel
|
|
104
|
+
|
|
105
|
+
raw = 'The signi\ufb01cant result was r(261) = \u22120.73, 95%\nCI [\u22120.78; \u22120.67], p\n< .001, d = 484'
|
|
106
|
+
result, report = normalize_text(raw, NormalizationLevel.academic)
|
|
107
|
+
assert 'significant' in result, 'S3 ligature failed'
|
|
108
|
+
assert '-0.73' in result, 'S5 unicode minus failed'
|
|
109
|
+
assert '95% CI' in result, 'A1 stat linebreak failed'
|
|
110
|
+
assert '[-0.78, -0.67]' in result, 'A4 CI delimiter failed'
|
|
111
|
+
assert 'p < .001' in result, 'A1 p-value linebreak failed'
|
|
112
|
+
assert '.484' in result, 'A2 dropped decimal failed'
|
|
113
|
+
print(f'Pipeline: PASS ({len(report.steps_applied)} steps, {len(report.changes_made)} changes)')
|
|
114
|
+
print(f'Version: {report.version}')
|
|
115
|
+
"
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
---
|
|
119
|
+
|
|
120
|
+
### 4. SMP Recovery Test
|
|
121
|
+
```bash
|
|
122
|
+
cd C:\Users\filin\Dropbox\Vibe\MetaScienceTools\PDFextractor\service && python -c "
|
|
123
|
+
import os
|
|
124
|
+
pdf_path = '../test-pdfs/nature/nathumbeh_2.pdf'
|
|
125
|
+
if os.path.exists(pdf_path):
|
|
126
|
+
from docpluck import extract_pdf
|
|
127
|
+
with open(pdf_path, 'rb') as f:
|
|
128
|
+
content = f.read()
|
|
129
|
+
text, method = extract_pdf(content)
|
|
130
|
+
assert text.count('\ufffd') == 0, f'SMP recovery failed: {text.count(chr(0xFFFD))} garbled'
|
|
131
|
+
assert 'pdfplumber' in method, f'SMP recovery not triggered: {method}'
|
|
132
|
+
print(f'SMP Recovery: PASS (method={method})')
|
|
133
|
+
else:
|
|
134
|
+
print('SMP Recovery: SKIP (no test PDF)')
|
|
135
|
+
"
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
---
|
|
139
|
+
|
|
140
|
+
### 5. ESCIcheck 10-PDF Verification — Library (CRITICAL)
|
|
141
|
+
|
|
142
|
+
CRITICAL check: runs 10 ESCIcheck PDFs through the library extract+normalize pipeline, verifies chars, quality score, p-values, method, and sample coherence.
|
|
143
|
+
|
|
144
|
+
**Full procedure:** [references/check-5-escicheck-library.md](references/check-5-escicheck-library.md)
|
|
145
|
+
|
|
146
|
+
### 6. ESCIcheck 10-PDF Verification — Local Webapp (CRITICAL)
|
|
147
|
+
|
|
148
|
+
CRITICAL check: same 10 PDFs through the local webapp /extract endpoint. Verifies HTTP status, engine, quality, and timing.
|
|
149
|
+
|
|
150
|
+
**Full procedure:** [references/check-6-escicheck-local-webapp.md](references/check-6-escicheck-local-webapp.md)
|
|
151
|
+
|
|
152
|
+
### 7. Batch Extraction Smoke Test (test-pdfs/)
|
|
153
|
+
|
|
154
|
+
Walks the test-pdfs/ tree, runs extract_pdf on each, reports failures. Default corpus ~47 PDFs.
|
|
155
|
+
|
|
156
|
+
**Full procedure:** [references/check-7-batch-smoke.md](references/check-7-batch-smoke.md)
|
|
157
|
+
|
|
158
|
+
### 8. Service Health Endpoint
|
|
159
|
+
```bash
|
|
160
|
+
curl -s http://localhost:6117/health
|
|
161
|
+
```
|
|
162
|
+
Must return `{"status":"ok","pdftotext":"...","engines":["pdftotext_default"]}`.
|
|
163
|
+
|
|
164
|
+
---
|
|
165
|
+
|
|
166
|
+
### 9. Database Connectivity
|
|
167
|
+
```bash
|
|
168
|
+
cd C:\Users\filin\Dropbox\Vibe\MetaScienceTools\PDFextractor\frontend && node -e "
|
|
169
|
+
const { neon } = require('@neondatabase/serverless');
|
|
170
|
+
require('dotenv').config({ path: '.env.local' });
|
|
171
|
+
const sql = neon(process.env.DATABASE_URL);
|
|
172
|
+
sql\`SELECT table_name FROM information_schema.tables WHERE table_schema = 'public' ORDER BY table_name\`.then(r => {
|
|
173
|
+
const tables = r.map(t => t.table_name);
|
|
174
|
+
const expected = ['user', 'account', 'session', 'verificationToken', 'api_key', 'extraction_cache', 'usage_log'];
|
|
175
|
+
const missing = expected.filter(t => !tables.includes(t));
|
|
176
|
+
if (missing.length) console.log('MISSING:', missing.join(', '));
|
|
177
|
+
else console.log('Database: PASS (7/7 tables)', tables.join(', '));
|
|
178
|
+
}).catch(e => console.log('Database: FAIL', e.message));
|
|
179
|
+
"
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
---
|
|
183
|
+
|
|
184
|
+
### 10. Admin API Smoke Test
|
|
185
|
+
```bash
|
|
186
|
+
curl -s http://localhost:6116/api/admin/health | python -c "import sys,json; d=json.load(sys.stdin); print('Admin health:', d.get('service',{}).get('status','?'))"
|
|
187
|
+
curl -s http://localhost:6116/api/admin/stats | python -c "import sys,json; d=json.load(sys.stdin); print('Admin stats:', 'users' in d and 'keys' in d)"
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
---
|
|
191
|
+
|
|
192
|
+
### 11. Hard Rules Verification
|
|
193
|
+
|
|
194
|
+
Asserts 4 hard rules: no -layout flag in pdftotext calls, no AGPL imports, U+2212 normalization present, library version consistency.
|
|
195
|
+
|
|
196
|
+
**Full procedure:** [references/check-11-hard-rules.md](references/check-11-hard-rules.md)
|
|
197
|
+
|
|
198
|
+
### 12. Production Deployment (Vercel + Railway)
|
|
199
|
+
```bash
|
|
200
|
+
# Vercel frontend
|
|
201
|
+
curl -s -o /dev/null -w "Vercel: HTTP %{http_code}\n" https://docpluck.vercel.app/login
|
|
202
|
+
|
|
203
|
+
# Railway extraction service
|
|
204
|
+
curl -s https://extraction-service-production-d0e5.up.railway.app/health | python -c "import sys,json; d=json.load(sys.stdin); print('Railway:', d.get('status','error'), d.get('pdftotext','unknown'))"
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
---
|
|
208
|
+
|
|
209
|
+
### 13. ESCIcheck 10-PDF Verification — Production Webapp (CRITICAL)
|
|
210
|
+
|
|
211
|
+
CRITICAL check: same 10 PDFs through the production Vercel endpoint. Verifies auth, cache behavior, and parity with local.
|
|
212
|
+
|
|
213
|
+
**Full procedure:** [references/check-13-escicheck-production.md](references/check-13-escicheck-production.md)
|
|
214
|
+
|
|
215
|
+
## Special Benchmark Mode: DOCX/PDF Parity + MetaESCI Regression
|
|
216
|
+
|
|
217
|
+
Opt-in cross-format benchmark suite — DOCX corpus integrity, DOCX↔PDF parity via Word COM, PDF↔DOCX parity via pdf2docx, and the 200-DOI MetaESCI regression. Runtime 5-15 min (launches Word). Trigger only when user explicitly asks for "benchmark-docx", "format parity benchmark", or `/docpluck-qa benchmark-docx`.
|
|
218
|
+
|
|
219
|
+
**Full procedure** (prerequisites, per-benchmark running instructions, pass criteria, interpretation, output format): [references/benchmark-mode.md](references/benchmark-mode.md)
|
|
220
|
+
|
|
221
|
+
## Report Format
|
|
222
|
+
|
|
223
|
+
```
|
|
224
|
+
## Docpluck QA Report — [date]
|
|
225
|
+
|
|
226
|
+
| # | Check | Status | Details |
|
|
227
|
+
|---|-------|--------|---------|
|
|
228
|
+
| 1 | Frontend build | PASS/FAIL | 0 errors |
|
|
229
|
+
| 2 | Python test suite (364+) | PASS/FAIL | X/X passed |
|
|
230
|
+
| 3 | Normalization spot-check | PASS/FAIL | 15 steps, N changes |
|
|
231
|
+
| 4 | SMP recovery | PASS/FAIL/SKIP | method used |
|
|
232
|
+
| 5 | ESCIcheck 10-PDF (library) | PASS/FAIL | X/10 passed |
|
|
233
|
+
| 6 | ESCIcheck 10-PDF (local webapp) | PASS/FAIL | X/10 passed |
|
|
234
|
+
| 7 | Batch extraction (test-pdfs/) | PASS/FAIL | X/47 succeeded |
|
|
235
|
+
| 8 | Service health | PASS/FAIL | pdftotext version |
|
|
236
|
+
| 9 | Database connectivity | PASS/FAIL | 7/7 tables |
|
|
237
|
+
| 10 | Admin API | PASS/FAIL | health + stats |
|
|
238
|
+
| 11 | Hard rules (4 checks) | PASS/FAIL | no -layout, no AGPL, U+2212, version |
|
|
239
|
+
| 12 | Production health | PASS/FAIL | HTTP codes |
|
|
240
|
+
| 13 | ESCIcheck 10-PDF (production) | PASS/FAIL/SKIP | X/10 passed |
|
|
241
|
+
|
|
242
|
+
**Overall: X/13 checks passed**
|
|
243
|
+
|
|
244
|
+
### Issues Found
|
|
245
|
+
- [list any failures with exact error messages and file:line]
|
|
246
|
+
|
|
247
|
+
### AI Verification Notes (Checks 5, 6, 13)
|
|
248
|
+
For each PDF: file name, chars, quality, p-values found, sample text judgment (coherent/garbled/column-interleaved)
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
## Final step: read ~/.claude/skills/_shared/postflight.md and follow it.
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# Special Benchmark Mode: DOCX/PDF Parity + MetaESCI Regression
|
|
2
|
+
|
|
3
|
+
_Extracted from [../SKILL.md](../SKILL.md). Opt-in; 5-15 min runtime._
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
Opt-in cross-format benchmark suite (DOCX corpus, DOCX↔PDF parity via Word COM, PDF↔DOCX via pdf2docx, MetaESCI 200-DOI regression). Takes 5-15 minutes; only run when explicitly requested.
|
|
7
|
+
|
|
8
|
+
**Full procedure:** this document — see the sections below.
|
|
9
|
+
|
|
10
|
+
### Prerequisites
|
|
11
|
+
|
|
12
|
+
- Microsoft Word installed (for `docx2pdf`, accessed via COM)
|
|
13
|
+
- Python packages: `mammoth`, `beautifulsoup4`, `lxml`, `rapidfuzz`, `pdf2docx`, `docx2pdf`
|
|
14
|
+
- CitationGuard corpus present at `C:\Users\filin\Dropbox\Vibe\CitationGuard\apps\worker\testpdfs\validation\docx\`
|
|
15
|
+
- PDF test corpus at `C:\Users\filin\Dropbox\Vibe\PDFextractor\test-pdfs\`
|
|
16
|
+
|
|
17
|
+
### Running
|
|
18
|
+
|
|
19
|
+
Full benchmark (5–15 minutes, launches Word):
|
|
20
|
+
```bash
|
|
21
|
+
cd C:\Users\filin\Dropbox\Vibe\docpluck && python benchmarks/run_all.py
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
Quick mode (3 files per benchmark, 2–4 minutes):
|
|
25
|
+
```bash
|
|
26
|
+
cd C:\Users\filin\Dropbox\Vibe\docpluck && python benchmarks/run_all.py --quick
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
Skip Word-based conversion (if Word unavailable):
|
|
30
|
+
```bash
|
|
31
|
+
cd C:\Users\filin\Dropbox\Vibe\docpluck && python benchmarks/run_all.py --skip docx2pdf
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
Individual benchmarks:
|
|
35
|
+
```bash
|
|
36
|
+
# 1. DOCX corpus (24 files, ~45s)
|
|
37
|
+
python benchmarks/bench_docx_corpus.py --json benchmarks/results/docx_corpus.json
|
|
38
|
+
|
|
39
|
+
# 2. DOCX → PDF cross-format (20 files via Word COM, ~5-10 min)
|
|
40
|
+
python benchmarks/bench_docx_vs_pdf.py --json benchmarks/results/docx_vs_pdf.json
|
|
41
|
+
|
|
42
|
+
# 3. PDF → DOCX reverse cross-format (8 files via pdf2docx, ~2-4 min)
|
|
43
|
+
python benchmarks/bench_pdf_vs_docx.py --json benchmarks/results/pdf_vs_docx.json
|
|
44
|
+
|
|
45
|
+
# 4. MetaESCI regression baseline (200 frozen DOIs, ~70s)
|
|
46
|
+
python benchmarks/bench_metaesci_regression.py --json benchmarks/results/metaesci_regression.json
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### What Each Benchmark Validates
|
|
50
|
+
|
|
51
|
+
**1. DOCX corpus benchmark** (`bench_docx_corpus.py`)
|
|
52
|
+
- Runs `extract_docx()` on all 24 CitationGuard DOCX files
|
|
53
|
+
- Validates: all 20 real papers extract successfully; all 4 corrupted files fail correctly
|
|
54
|
+
- Checks quality score ≥80 and "high" confidence for every real paper
|
|
55
|
+
- Detects garbled extractions (should be 0)
|
|
56
|
+
- Reports total chars, per-file times, aggregate stats
|
|
57
|
+
|
|
58
|
+
**2. DOCX → PDF cross-format benchmark** (`bench_docx_vs_pdf.py`)
|
|
59
|
+
- For each DOCX: extract text, convert DOCX→PDF via Word (`docx2pdf`), extract PDF text, compare
|
|
60
|
+
- Uses `rapidfuzz.token_set_ratio` to measure similarity (should be ≥ 80%)
|
|
61
|
+
- Uses char-count ratio to detect content loss (should be 0.7–1.3)
|
|
62
|
+
- Both extractors should produce "high" confidence quality scores
|
|
63
|
+
- This is the A/B gold-standard: same content, two extraction paths — asymmetry reveals bugs in either extractor
|
|
64
|
+
|
|
65
|
+
**3. PDF → DOCX reverse cross-format benchmark** (`bench_pdf_vs_docx.py`)
|
|
66
|
+
- For each PDF: extract text, convert PDF→DOCX via `pdf2docx` (pure Python), extract DOCX text, compare
|
|
67
|
+
- Default selection: 8 PDFs across APA, Vancouver, AMA, Harvard, IEEE, Nature styles
|
|
68
|
+
- Same pass criteria as #2
|
|
69
|
+
|
|
70
|
+
### Pass Criteria
|
|
71
|
+
|
|
72
|
+
Benchmark | Threshold
|
|
73
|
+
--- | ---
|
|
74
|
+
DOCX corpus extraction success | 20/20 real papers
|
|
75
|
+
DOCX corpus quality score | avg ≥ 80, all high confidence
|
|
76
|
+
DOCX corpus garbled count | 0
|
|
77
|
+
DOCX→PDF similarity (token_set_ratio) | avg ≥ 80%, min ≥ 80%
|
|
78
|
+
DOCX→PDF char ratio | 0.7–1.3
|
|
79
|
+
PDF→DOCX similarity | avg ≥ 80%, min ≥ 80%
|
|
80
|
+
PDF→DOCX char ratio | 0.7–1.3
|
|
81
|
+
MetaESCI extraction success | 200/200 (zero crashes, zero extraction errors)
|
|
82
|
+
MetaESCI high confidence | ≥ 99% (198/200+)
|
|
83
|
+
MetaESCI avg quality | ≥ 80 / 100
|
|
84
|
+
MetaESCI total U+FFFD | 0
|
|
85
|
+
MetaESCI stat linebreak survivors | 0
|
|
86
|
+
MetaESCI dropped decimal survivors | 0 (watch — current baseline has 21 in 3 files, real gaps pending A1/A2 enhancement)
|
|
87
|
+
|
|
88
|
+
### Interpreting Results
|
|
89
|
+
|
|
90
|
+
- **Similarity < 80%**: the two extractors are producing substantially different text for the same content. Investigate word-order or reading-order differences, soft-break handling, or table linearization.
|
|
91
|
+
- **Char ratio far from 1.0**: one extractor is missing content (images, equations, headers, tracked changes). Investigate which format is losing data.
|
|
92
|
+
- **DOCX quality score < PDF quality score**: something is wrong — DOCX should be cleaner than PDF (no column interleaving, no ligatures, no SMP Unicode). This is a red flag for mammoth configuration or the tree-walk.
|
|
93
|
+
- **OMML equations**: known limitation — mammoth drops Office Math objects. Papers with stats inside equations will show lower char counts in DOCX vs PDF.
|
|
94
|
+
|
|
95
|
+
### Output
|
|
96
|
+
|
|
97
|
+
The benchmark suite writes the following files to `benchmarks/results/`:
|
|
98
|
+
- `docx_corpus.json` — per-file metrics
|
|
99
|
+
- `docx_vs_pdf.json` — cross-format comparison
|
|
100
|
+
- `pdf_vs_docx.json` — reverse comparison
|
|
101
|
+
- `REPORT.md` — consolidated markdown report
|
|
102
|
+
|
|
103
|
+
After running, read `benchmarks/results/REPORT.md` and report the summary to the user.
|
|
104
|
+
|
|
105
|
+
---
|
|
106
|
+
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# 11. Hard Rules Verification
|
|
2
|
+
|
|
3
|
+
_Extracted from [../SKILL.md](../SKILL.md). Full procedure lives here._
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
cd C:\Users\filin\Dropbox\Vibe\PDFextractor\service && python -c "
|
|
7
|
+
import re
|
|
8
|
+
|
|
9
|
+
# Rule 1: No -layout flag in pdftotext calls (check library)
|
|
10
|
+
import docpluck.extract as ext_mod
|
|
11
|
+
import inspect
|
|
12
|
+
source = inspect.getsource(ext_mod)
|
|
13
|
+
calls = re.findall(r'subprocess\.run\(\s*\[.*?\]', source, re.DOTALL)
|
|
14
|
+
for call in calls:
|
|
15
|
+
assert '-layout' not in call, f'BLOCKER: -layout in {call}'
|
|
16
|
+
print('Rule 1 (no -layout): PASS')
|
|
17
|
+
|
|
18
|
+
# Rule 2: No AGPL imports in library or service
|
|
19
|
+
import docpluck.normalize as norm_mod, docpluck.quality as qual_mod
|
|
20
|
+
for name, mod in [('normalize', norm_mod), ('quality', qual_mod), ('extract', ext_mod)]:
|
|
21
|
+
src = inspect.getsource(mod)
|
|
22
|
+
assert 'pymupdf4llm' not in src, f'AGPL import in {name}'
|
|
23
|
+
assert 'column_boxes' not in src, f'AGPL method in {name}'
|
|
24
|
+
with open('app/main.py') as f:
|
|
25
|
+
main_src = f.read()
|
|
26
|
+
assert 'pymupdf4llm' not in main_src, 'AGPL import in main.py'
|
|
27
|
+
print('Rule 2 (no AGPL): PASS')
|
|
28
|
+
|
|
29
|
+
# Rule 3: U+2212 normalization exists in library (check file bytes to avoid encoding issues)
|
|
30
|
+
import docpluck.normalize as _nm_mod
|
|
31
|
+
with open(_nm_mod.__file__, 'rb') as _f:
|
|
32
|
+
_norm_bytes = _f.read()
|
|
33
|
+
assert b'\\u2212' in _norm_bytes or b'\xe2\x88\x92' in _norm_bytes, 'U+2212 normalization missing'
|
|
34
|
+
print('Rule 3 (U+2212 norm): PASS')
|
|
35
|
+
|
|
36
|
+
# Rule 4: Library version is consistent
|
|
37
|
+
import docpluck
|
|
38
|
+
assert docpluck.__version__ == '1.5.0', f'Version mismatch: {docpluck.__version__}'
|
|
39
|
+
print('Rule 4 (version=1.5.0): PASS')
|
|
40
|
+
"
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|