docpluck 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docpluck/__init__.py +89 -0
- docpluck/__main__.py +3 -0
- docpluck/batch.py +183 -0
- docpluck/cli.py +35 -0
- docpluck/extract.py +191 -0
- docpluck/extract_docx.py +64 -0
- docpluck/extract_html.py +149 -0
- docpluck/normalize.py +637 -0
- docpluck/quality.py +92 -0
- docpluck/version.py +58 -0
- docpluck-1.5.0.dist-info/METADATA +451 -0
- docpluck-1.5.0.dist-info/RECORD +15 -0
- docpluck-1.5.0.dist-info/WHEEL +4 -0
- docpluck-1.5.0.dist-info/entry_points.txt +2 -0
- docpluck-1.5.0.dist-info/licenses/LICENSE +21 -0
docpluck/__init__.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
"""
|
|
2
|
+
docpluck — PDF, DOCX, and HTML text extraction and normalization for academic papers
|
|
3
|
+
====================================================================================
|
|
4
|
+
|
|
5
|
+
A Python library for extracting and normalizing text from academic documents.
|
|
6
|
+
Built from cross-project lessons across 8,000+ PDFs from psychology, medicine,
|
|
7
|
+
economics, physics, and biology.
|
|
8
|
+
|
|
9
|
+
Supports:
|
|
10
|
+
- **PDF** via pdftotext (default mode, with pdfplumber SMP fallback)
|
|
11
|
+
- **DOCX** via mammoth (DOCX → HTML → text, preserves soft breaks)
|
|
12
|
+
- **HTML** via beautifulsoup4 + lxml (custom block/inline-aware tree-walk)
|
|
13
|
+
|
|
14
|
+
Quick start::
|
|
15
|
+
|
|
16
|
+
from docpluck import extract_pdf, extract_docx, extract_html
|
|
17
|
+
from docpluck import normalize_text, NormalizationLevel, compute_quality_score
|
|
18
|
+
|
|
19
|
+
# PDF
|
|
20
|
+
with open("paper.pdf", "rb") as f:
|
|
21
|
+
text, method = extract_pdf(f.read())
|
|
22
|
+
|
|
23
|
+
# DOCX (requires: pip install docpluck[docx])
|
|
24
|
+
with open("paper.docx", "rb") as f:
|
|
25
|
+
text, method = extract_docx(f.read())
|
|
26
|
+
|
|
27
|
+
# HTML (requires: pip install docpluck[html])
|
|
28
|
+
with open("paper.html", "rb") as f:
|
|
29
|
+
text, method = extract_html(f.read())
|
|
30
|
+
|
|
31
|
+
# Normalization and quality scoring work on text from any source
|
|
32
|
+
normalized, report = normalize_text(text, NormalizationLevel.academic)
|
|
33
|
+
quality = compute_quality_score(normalized)
|
|
34
|
+
|
|
35
|
+
print(f"Method: {method}")
|
|
36
|
+
print(f"Quality: {quality['score']}/100 ({quality['confidence']})")
|
|
37
|
+
print(f"Steps applied: {report.steps_applied}")
|
|
38
|
+
|
|
39
|
+
Installation::
|
|
40
|
+
|
|
41
|
+
pip install docpluck # PDF only (pdfplumber)
|
|
42
|
+
pip install docpluck[docx] # + mammoth
|
|
43
|
+
pip install docpluck[html] # + beautifulsoup4 + lxml
|
|
44
|
+
pip install docpluck[all] # everything
|
|
45
|
+
|
|
46
|
+
# extract_pdf() also requires poppler-utils:
|
|
47
|
+
# Linux/WSL: apt-get install poppler-utils
|
|
48
|
+
# macOS: brew install poppler
|
|
49
|
+
# Windows: https://github.com/oschwartz10612/poppler-windows/releases
|
|
50
|
+
|
|
51
|
+
See Also:
|
|
52
|
+
- docs/README.md — Full usage guide and API reference
|
|
53
|
+
- docs/DESIGN.md — Implementation decisions and rationale
|
|
54
|
+
- docs/BENCHMARKS.md — Benchmark results across all supported formats
|
|
55
|
+
- docs/NORMALIZATION.md — All 15 pipeline steps documented
|
|
56
|
+
"""
|
|
57
|
+
|
|
58
|
+
from .extract import extract_pdf, extract_pdf_file, count_pages
|
|
59
|
+
from .extract_docx import extract_docx
|
|
60
|
+
from .extract_html import extract_html, html_to_text
|
|
61
|
+
from .normalize import normalize_text, NormalizationLevel, NormalizationReport
|
|
62
|
+
from .quality import compute_quality_score
|
|
63
|
+
from .batch import ExtractionReport, extract_to_dir
|
|
64
|
+
from .version import get_version_info
|
|
65
|
+
|
|
66
|
+
__version__ = "1.5.0"
|
|
67
|
+
__author__ = "Gilad Feldman"
|
|
68
|
+
__license__ = "MIT"
|
|
69
|
+
|
|
70
|
+
__all__ = [
|
|
71
|
+
# Extraction
|
|
72
|
+
"extract_pdf",
|
|
73
|
+
"extract_pdf_file",
|
|
74
|
+
"extract_docx",
|
|
75
|
+
"extract_html",
|
|
76
|
+
"html_to_text",
|
|
77
|
+
"count_pages",
|
|
78
|
+
# Normalization
|
|
79
|
+
"normalize_text",
|
|
80
|
+
"NormalizationLevel",
|
|
81
|
+
"NormalizationReport",
|
|
82
|
+
# Quality
|
|
83
|
+
"compute_quality_score",
|
|
84
|
+
# Batch
|
|
85
|
+
"ExtractionReport",
|
|
86
|
+
"extract_to_dir",
|
|
87
|
+
# Version
|
|
88
|
+
"get_version_info",
|
|
89
|
+
]
|
docpluck/__main__.py
ADDED
docpluck/batch.py
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Batch extraction helper for directory-level runs.
|
|
3
|
+
|
|
4
|
+
MetaESCI, Scimeto, and ESCImate all want the same "walk a list of PDFs,
|
|
5
|
+
normalize them, drop a sidecar, and give me a receipt" pattern. Instead of
|
|
6
|
+
each downstream re-implementing it, :func:`extract_to_dir` lives here and
|
|
7
|
+
returns an :class:`ExtractionReport` that doubles as a reproducibility
|
|
8
|
+
receipt (``docpluck_version``, ``normalize_version``, ``git_sha``, per-file
|
|
9
|
+
status).
|
|
10
|
+
|
|
11
|
+
Example::
|
|
12
|
+
|
|
13
|
+
from docpluck import extract_to_dir, NormalizationLevel
|
|
14
|
+
|
|
15
|
+
report = extract_to_dir(
|
|
16
|
+
pdf_paths=list(Path("pdfs").glob("*.pdf")),
|
|
17
|
+
out_dir="normalized_text",
|
|
18
|
+
level=NormalizationLevel.academic,
|
|
19
|
+
)
|
|
20
|
+
print(f"{report.n_ok}/{report.n_total} ok, {report.elapsed_seconds:.1f}s")
|
|
21
|
+
report.write_receipt("normalized_text/_docpluck_receipt.json")
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
import json
|
|
27
|
+
import time
|
|
28
|
+
from dataclasses import dataclass, field, asdict
|
|
29
|
+
from pathlib import Path
|
|
30
|
+
from typing import Iterable, Optional, Union
|
|
31
|
+
|
|
32
|
+
from .extract import extract_pdf_file
|
|
33
|
+
from .normalize import NormalizationLevel, normalize_text
|
|
34
|
+
from .version import get_version_info
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass
class ExtractionFileResult:
    """Per-file outcome record for a batch run (see :func:`extract_to_dir`)."""

    path: str                       # input PDF path, as given to extract_to_dir
    ok: bool                        # True only when extraction + normalization + write all succeeded
    method: Optional[str] = None    # extraction engine reported by extract_pdf_file
    n_chars_raw: int = 0            # length of the raw extracted text
    n_chars_normalized: int = 0     # length after normalize_text
    normalize_steps_changed: list[str] = field(default_factory=list)  # pipeline steps that modified the text
    error: Optional[str] = None     # "ExcType: message" or the extractor's "ERROR: ..." string on failure
    elapsed_seconds: float = 0.0    # wall-clock time for this file, rounded to milliseconds
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@dataclass
class ExtractionReport:
    """Machine-readable receipt for one batch extraction run.

    Bundles the docpluck version metadata, aggregate counts, and the
    per-file results. :meth:`to_dict` / :meth:`write_receipt` serialize it
    to JSON so downstream pipelines can pin reproducibility to a fixed run.
    """

    docpluck_version: str
    normalize_version: str
    git_sha: str
    level: str
    out_dir: str
    n_total: int = 0
    n_ok: int = 0
    n_failed: int = 0
    elapsed_seconds: float = 0.0
    results: list[ExtractionFileResult] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return a JSON-serializable dict view of this report."""
        # Scalar fields copy straight through, in stable key order; the
        # elapsed time is rounded and per-file results flattened to dicts.
        scalar_fields = (
            "docpluck_version",
            "normalize_version",
            "git_sha",
            "level",
            "out_dir",
            "n_total",
            "n_ok",
            "n_failed",
        )
        payload: dict = {name: getattr(self, name) for name in scalar_fields}
        payload["elapsed_seconds"] = round(self.elapsed_seconds, 3)
        payload["results"] = [asdict(item) for item in self.results]
        return payload

    def write_receipt(self, path: Union[str, Path]) -> Path:
        """Write the report as pretty-printed JSON to ``path`` and return it."""
        receipt_path = Path(path)
        receipt_path.parent.mkdir(parents=True, exist_ok=True)
        receipt_path.write_text(
            json.dumps(self.to_dict(), indent=2), encoding="utf-8"
        )
        return receipt_path
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def extract_to_dir(
    pdf_paths: Iterable[Union[str, Path]],
    out_dir: Union[str, Path],
    level: NormalizationLevel = NormalizationLevel.academic,
    write_sidecar: bool = True,
) -> ExtractionReport:
    """Extract and normalize a collection of PDFs into a directory.

    For each input PDF, writes ``<stem>.txt`` containing normalized text.
    When ``write_sidecar`` is true (default), also writes ``<stem>.json``
    with per-file metadata (method, normalize steps, timings, errors).

    Missing files are recorded as failures on the report — this function
    does not raise on individual file errors, only on argument errors.

    Args:
        pdf_paths: Iterable of PDF paths. Each path must point to a file.
        out_dir: Directory that will receive ``<stem>.txt`` (and sidecars).
            Created if it does not exist.
        level: Normalization level. Defaults to ``academic``.
        write_sidecar: Whether to emit the per-file ``.json`` sidecar.

    Returns:
        :class:`ExtractionReport` with aggregate counts and per-file results.
    """
    info = get_version_info()
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)

    report = ExtractionReport(
        docpluck_version=info["version"],
        normalize_version=info["normalize_version"],
        git_sha=info["git_sha"],
        # Accept either the enum or a plain string for level.
        level=level.value if isinstance(level, NormalizationLevel) else str(level),
        out_dir=str(out),
    )

    batch_start = time.monotonic()
    for p in pdf_paths:
        p = Path(p)
        report.n_total += 1
        file_start = time.monotonic()
        # Start pessimistic; ok flips to True only after all writes succeed.
        result = ExtractionFileResult(path=str(p), ok=False)

        try:
            raw_text, method = extract_pdf_file(p)
            result.method = method
            result.n_chars_raw = len(raw_text)

            # extract_pdf reports failures as "ERROR: ..." text rather than
            # raising; surface those as per-file errors, not exceptions.
            if raw_text.startswith("ERROR:"):
                result.error = raw_text
            else:
                normalized, norm_report = normalize_text(raw_text, level)
                result.n_chars_normalized = len(normalized)
                # steps_changed is preferred; older report objects only
                # expose steps_applied, so fall back to that.
                result.normalize_steps_changed = list(
                    getattr(norm_report, "steps_changed", norm_report.steps_applied)
                )

                text_path = out / f"{p.stem}.txt"
                text_path.write_text(normalized, encoding="utf-8")

                if write_sidecar:
                    sidecar = {
                        "source": str(p),
                        "method": method,
                        "level": level.value if isinstance(level, NormalizationLevel) else str(level),
                        "normalize_version": info["normalize_version"],
                        "docpluck_version": info["version"],
                        "git_sha": info["git_sha"],
                        "n_chars_raw": result.n_chars_raw,
                        "n_chars_normalized": result.n_chars_normalized,
                        "steps_changed": result.normalize_steps_changed,
                        "changes_made": dict(norm_report.changes_made),
                    }
                    sidecar_path = out / f"{p.stem}.json"
                    sidecar_path.write_text(
                        json.dumps(sidecar, indent=2), encoding="utf-8"
                    )

                result.ok = True
        except FileNotFoundError as e:
            result.error = f"FileNotFoundError: {e}"
        except Exception as e:  # noqa: BLE001 — batch runner must never raise
            result.error = f"{type(e).__name__}: {e}"

        result.elapsed_seconds = round(time.monotonic() - file_start, 3)
        report.results.append(result)
        if result.ok:
            report.n_ok += 1
        else:
            report.n_failed += 1

    report.elapsed_seconds = time.monotonic() - batch_start
    return report
|
docpluck/cli.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Minimal docpluck CLI.
|
|
3
|
+
|
|
4
|
+
Currently supports a single flag::
|
|
5
|
+
|
|
6
|
+
docpluck --version
|
|
7
|
+
|
|
8
|
+
which prints ``{version, normalize_version, git_sha}`` as JSON. Downstream
|
|
9
|
+
batch runners call this once per run and write the output next to their
|
|
10
|
+
results as a reproducibility receipt (see MetaESCI request D3).
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import json
|
|
16
|
+
import sys
|
|
17
|
+
|
|
18
|
+
from .version import get_version_info
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def main(argv: list[str] | None = None) -> int:
    """CLI entry point; returns a process exit code.

    ``--version`` (also ``-V``, ``version``, or no arguments at all) prints
    the version receipt as JSON; ``--help`` prints usage. Anything else is
    an error (exit code 2).
    """
    arguments = list(sys.argv[1:]) if argv is None else list(argv)

    # No arguments behaves like --version: emit the JSON receipt.
    if not arguments:
        print(json.dumps(get_version_info()))
        return 0

    first = arguments[0]
    if first in ("-V", "--version", "version"):
        print(json.dumps(get_version_info()))
        return 0
    if first in ("-h", "--help", "help"):
        print("usage: docpluck [--version]")
        return 0

    print(f"docpluck: unknown argument: {first}", file=sys.stderr)
    print("usage: docpluck [--version]", file=sys.stderr)
    return 2
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
if __name__ == "__main__":
|
|
35
|
+
raise SystemExit(main())
|
docpluck/extract.py
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
"""
|
|
2
|
+
PDF Text Extraction
|
|
3
|
+
====================
|
|
4
|
+
Primary engine: pdftotext default mode (no -layout flag)
|
|
5
|
+
Fallback: pdfplumber for PDFs with SMP Unicode (Mathematical Italic fonts)
|
|
6
|
+
|
|
7
|
+
Requires poppler-utils installed on the system:
|
|
8
|
+
- Linux/WSL: apt-get install poppler-utils
|
|
9
|
+
- macOS: brew install poppler
|
|
10
|
+
- Windows: https://github.com/oschwartz10612/poppler-windows/releases
|
|
11
|
+
|
|
12
|
+
Key design decision: pdftotext default mode (NO -layout flag).
|
|
13
|
+
The -layout flag preserves physical column layout, causing column interleaving
|
|
14
|
+
that breaks statistical pattern matching. Default mode correctly reconstructs
|
|
15
|
+
reading order. Verified on 50 PDFs across 8 citation styles — see BENCHMARKS.md.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
import os
|
|
19
|
+
import subprocess
|
|
20
|
+
import tempfile
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
from typing import Optional, Union
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def extract_pdf(pdf_bytes: bytes) -> tuple[str, str]:
    """Extract text from PDF bytes.

    Uses pdftotext default mode (deliberately NO ``-layout`` flag, which
    would interleave multi-column text) as the primary engine. Automatically
    falls back to pdfplumber if the output contains U+FFFD replacement
    characters, which indicates SMP Unicode (e.g. Mathematical Italic fonts
    used by Nature/Cell journals) that Xpdf cannot handle.

    Args:
        pdf_bytes: Raw PDF file content as bytes.

    Returns:
        A tuple of (text, method) where:
        - text: Extracted plain text. May start with "ERROR: ..." if extraction
          failed — check with text.startswith("ERROR:").
        - method: Engine used. One of:
            "pdftotext_default" — normal extraction
            "pdftotext_default+pdfplumber_recovery" — SMP fallback triggered
            "error" — extraction failed (text carries the "ERROR: ..." detail)

    Requires:
        pdftotext binary (from poppler-utils) on PATH.

    Example:
        with open("paper.pdf", "rb") as f:
            text, method = extract_pdf(f.read())
            print(f"Extracted {len(text)} chars via {method}")
    """
    # pdftotext needs a real file path; write the bytes to a temp file that
    # we unlink ourselves (delete=False so the subprocess can open it).
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(pdf_bytes)
        tmp_path = tmp.name

    try:
        try:
            # Primary: pdftotext default mode (no -layout flag — critical)
            result = subprocess.run(
                ["pdftotext", "-enc", "UTF-8", tmp_path, "-"],
                capture_output=True,
                timeout=120,
                encoding="utf-8",
                errors="replace",
            )
        except FileNotFoundError:
            # Fix: a missing binary previously escaped as an exception, even
            # though the documented contract reports failures as "ERROR:" text.
            return "ERROR: pdftotext binary not found (install poppler-utils)", "error"
        except subprocess.TimeoutExpired:
            # Fix: same contract for the 120 s timeout, which also used to raise.
            return "ERROR: pdftotext timed out after 120s", "error"

        if result.returncode != 0:
            return f"ERROR: pdftotext failed with code {result.returncode}", "error"

        text = result.stdout
        method = "pdftotext_default"

        # SMP recovery: Xpdf emits U+FFFD for characters above the BMP.
        # pdfplumber handles these correctly and remaps them to ASCII equivalents.
        if text.count("\ufffd") > 0:
            recovered = _recover_with_pdfplumber(tmp_path)
            if recovered:
                text = recovered
                method = "pdftotext_default+pdfplumber_recovery"

        return text, method

    finally:
        os.unlink(tmp_path)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def extract_pdf_file(path: Union[str, Path]) -> tuple[str, str]:
    """Extract text from a PDF file on disk.

    Convenience wrapper over ``extract_pdf``: reads ``path`` into memory,
    but first validates it so callers get a clean ``FileNotFoundError``
    rather than whatever pdftotext emits for a missing input. Batch runners
    that walk directories rely on this for actionable error messages.

    Args:
        path: Path to the PDF file on disk (str or pathlib.Path).

    Returns:
        Same tuple as ``extract_pdf``: ``(text, method)``.

    Raises:
        FileNotFoundError: If ``path`` does not exist or is not a regular file.

    Example:
        text, method = extract_pdf_file("paper.pdf")
    """
    pdf_path = Path(path)
    if pdf_path.is_file():
        return extract_pdf(pdf_path.read_bytes())
    # is_file() is False for both missing paths and non-file entries
    # (directories, broken symlinks); pick the clearer message for each.
    if pdf_path.exists():
        raise FileNotFoundError(f"Path is not a regular file: {pdf_path}")
    raise FileNotFoundError(f"PDF file not found: {pdf_path}")
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def count_pages(pdf_bytes: bytes) -> int:
    """Count the number of pages in a PDF.

    Uses byte pattern matching (no external binary required). Matches both
    ``/Type /Page`` and the equally valid ``/Type/Page`` spelling — PDF
    syntax does not require whitespace before a name token — and excludes
    ``/Type /Pages`` tree nodes. Fast and reliable for well-formed PDFs.

    Args:
        pdf_bytes: Raw PDF file content as bytes.

    Returns:
        Page count (integer, minimum 1), or 0 if counting failed entirely
        (e.g. non-bytes input).

    Example:
        with open("paper.pdf", "rb") as f:
            n = count_pages(f.read())
            print(f"{n} pages")
    """
    import re  # local import keeps the module's top-level deps unchanged

    try:
        # /Type, optional whitespace, /Page not followed by a letter.
        # The lookahead excludes /Pages (page-tree nodes) and longer names
        # such as /PageLabels, which the old plain substring count
        # miscounted; the \s* also fixes undercounting for producers that
        # write /Type/Page with no space.
        pages = re.findall(rb"/Type\s*/Page(?![A-Za-z])", pdf_bytes)
        return max(len(pages), 1)
    except Exception:
        # Defensive: 0 signals "could not count" (distinct from the 1-page floor).
        return 0
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _recover_with_pdfplumber(pdf_path: str) -> Optional[str]:
|
|
140
|
+
"""Recover text using pdfplumber when pdftotext produces garbled output.
|
|
141
|
+
|
|
142
|
+
Triggered when U+FFFD (replacement character) appears in pdftotext output,
|
|
143
|
+
which indicates SMP Mathematical Italic fonts (U+1D434-U+1D467) that
|
|
144
|
+
Xpdf/poppler cannot decode. pdfplumber (using pdfminer) handles these
|
|
145
|
+
correctly. Maps the recovered SMP characters to ASCII equivalents so
|
|
146
|
+
downstream regex patterns work normally.
|
|
147
|
+
|
|
148
|
+
Args:
|
|
149
|
+
pdf_path: Path to the PDF file on disk.
|
|
150
|
+
|
|
151
|
+
Returns:
|
|
152
|
+
Recovered text string, or None if recovery failed.
|
|
153
|
+
"""
|
|
154
|
+
try:
|
|
155
|
+
import pdfplumber
|
|
156
|
+
|
|
157
|
+
smp_to_ascii: dict[str, str] = {}
|
|
158
|
+
# Math italic capitals A-Z: U+1D434–U+1D44D
|
|
159
|
+
for i, letter in enumerate("ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
|
|
160
|
+
smp_to_ascii[chr(0x1D434 + i)] = letter
|
|
161
|
+
# Math italic small a-z: U+1D44E–U+1D467
|
|
162
|
+
for i, letter in enumerate("abcdefghijklmnopqrstuvwxyz"):
|
|
163
|
+
smp_to_ascii[chr(0x1D44E + i)] = letter
|
|
164
|
+
# Math italic Greek (common in physics/biology papers)
|
|
165
|
+
greek = {
|
|
166
|
+
0x1D6E2: "A", 0x1D6E4: "G", 0x1D6E5: "D", 0x1D6F4: "S",
|
|
167
|
+
0x1D6F7: "Ph", 0x1D6F8: "Ch", 0x1D6F9: "Ps", 0x1D6FA: "O",
|
|
168
|
+
0x1D6FC: "a", 0x1D6FD: "b", 0x1D6FE: "g", 0x1D6FF: "d",
|
|
169
|
+
0x1D700: "e", 0x1D701: "z", 0x1D702: "n", 0x1D703: "th",
|
|
170
|
+
0x1D707: "m", 0x1D70B: "pi", 0x1D70C: "r", 0x1D70E: "s",
|
|
171
|
+
0x1D711: "ph", 0x1D712: "ch", 0x1D713: "ps",
|
|
172
|
+
}
|
|
173
|
+
for cp, repl in greek.items():
|
|
174
|
+
smp_to_ascii[chr(cp)] = repl
|
|
175
|
+
|
|
176
|
+
pages_text = []
|
|
177
|
+
with pdfplumber.open(pdf_path) as pdf:
|
|
178
|
+
for page in pdf.pages:
|
|
179
|
+
page_text = page.extract_text()
|
|
180
|
+
if page_text:
|
|
181
|
+
pages_text.append(page_text)
|
|
182
|
+
|
|
183
|
+
full_text = "\n\n".join(pages_text)
|
|
184
|
+
|
|
185
|
+
for smp_char, ascii_equiv in smp_to_ascii.items():
|
|
186
|
+
full_text = full_text.replace(smp_char, ascii_equiv)
|
|
187
|
+
|
|
188
|
+
return full_text
|
|
189
|
+
|
|
190
|
+
except Exception:
|
|
191
|
+
return None
|
docpluck/extract_docx.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""
|
|
2
|
+
DOCX Text Extraction
|
|
3
|
+
=====================
|
|
4
|
+
Primary engine: mammoth (DOCX → HTML → text via html_to_text)
|
|
5
|
+
|
|
6
|
+
Why mammoth:
|
|
7
|
+
- `mammoth.convert_to_html()` preserves Shift+Enter soft breaks as <br> tags
|
|
8
|
+
(critical for academic documents with poetry, equations, addresses, etc.)
|
|
9
|
+
- `mammoth.extract_raw_text()` loses intra-paragraph line breaks — do NOT use it
|
|
10
|
+
- python-docx only provides paragraph-level access, not enough structure
|
|
11
|
+
- docx2txt is effectively abandoned
|
|
12
|
+
- pypandoc requires a binary (pandoc) that's hard to deploy
|
|
13
|
+
- BSD-2 license, available in both Python (mammoth) and Node.js (mammoth.js)
|
|
14
|
+
- Battle-tested in Scimeto production since Dec 2025
|
|
15
|
+
|
|
16
|
+
Known limitations:
|
|
17
|
+
- OMML equations (Office Math) are silently dropped. Papers with inline
|
|
18
|
+
stats inside equation objects will lose those values. In practice this is
|
|
19
|
+
rare in social science papers where stats are written as plain text.
|
|
20
|
+
- Tracked changes: only deleted paragraphs are handled minimally.
|
|
21
|
+
- Memory: peak usage is ~3-5x file size. Not a concern for single-file
|
|
22
|
+
processing but worth noting for very large documents.
|
|
23
|
+
|
|
24
|
+
Requires the `docx` optional dependency:
|
|
25
|
+
pip install docpluck[docx]
|
|
26
|
+
"""
|
|
27
|
+
import io
|
|
28
|
+
|
|
29
|
+
from .extract_html import html_to_text
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def extract_docx(docx_bytes: bytes) -> tuple[str, str]:
    """Extract text from DOCX file bytes.

    Runs a two-step pipeline: mammoth converts the DOCX to HTML (keeping
    soft line breaks and block structure), then html_to_text() flattens
    that HTML into plain text. The intermediate HTML step is what makes
    soft-break (Shift+Enter) preservation work.

    Args:
        docx_bytes: Raw DOCX file content as bytes.

    Returns:
        A tuple of (text, method) where:
        - text: Extracted plain text with block/inline-aware formatting.
        - method: Always "mammoth".

    Raises:
        ValueError: If the DOCX is malformed (mammoth raises — we re-raise).
        ImportError: If mammoth is not installed.

    Requires:
        mammoth (install with `pip install docpluck[docx]`).

    Example:
        with open("paper.docx", "rb") as f:
            text, method = extract_docx(f.read())
    """
    # Imported lazily so the core library works without mammoth installed.
    import mammoth

    conversion = mammoth.convert_to_html(io.BytesIO(docx_bytes))
    return html_to_text(conversion.value), "mammoth"
|