docpluck 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docpluck/__init__.py ADDED
@@ -0,0 +1,89 @@
1
+ """
2
+ docpluck — PDF, DOCX, and HTML text extraction and normalization for academic papers
3
+ ====================================================================================
4
+
5
+ A Python library for extracting and normalizing text from academic documents.
6
+ Built from cross-project lessons across 8,000+ PDFs from psychology, medicine,
7
+ economics, physics, and biology.
8
+
9
+ Supports:
10
+ - **PDF** via pdftotext (default mode, with pdfplumber SMP fallback)
11
+ - **DOCX** via mammoth (DOCX → HTML → text, preserves soft breaks)
12
+ - **HTML** via beautifulsoup4 + lxml (custom block/inline-aware tree-walk)
13
+
14
+ Quick start::
15
+
16
+ from docpluck import extract_pdf, extract_docx, extract_html
17
+ from docpluck import normalize_text, NormalizationLevel, compute_quality_score
18
+
19
+ # PDF
20
+ with open("paper.pdf", "rb") as f:
21
+ text, method = extract_pdf(f.read())
22
+
23
+ # DOCX (requires: pip install docpluck[docx])
24
+ with open("paper.docx", "rb") as f:
25
+ text, method = extract_docx(f.read())
26
+
27
+ # HTML (requires: pip install docpluck[html])
28
+ with open("paper.html", "rb") as f:
29
+ text, method = extract_html(f.read())
30
+
31
+ # Normalization and quality scoring work on text from any source
32
+ normalized, report = normalize_text(text, NormalizationLevel.academic)
33
+ quality = compute_quality_score(normalized)
34
+
35
+ print(f"Method: {method}")
36
+ print(f"Quality: {quality['score']}/100 ({quality['confidence']})")
37
+ print(f"Steps applied: {report.steps_applied}")
38
+
39
+ Installation::
40
+
41
+ pip install docpluck # PDF only (pdfplumber)
42
+ pip install docpluck[docx] # + mammoth
43
+ pip install docpluck[html] # + beautifulsoup4 + lxml
44
+ pip install docpluck[all] # everything
45
+
46
+ # extract_pdf() also requires poppler-utils:
47
+ # Linux/WSL: apt-get install poppler-utils
48
+ # macOS: brew install poppler
49
+ # Windows: https://github.com/oschwartz10612/poppler-windows/releases
50
+
51
+ See Also:
52
+ - docs/README.md — Full usage guide and API reference
53
+ - docs/DESIGN.md — Implementation decisions and rationale
54
+ - docs/BENCHMARKS.md — Benchmark results across all supported formats
55
+ - docs/NORMALIZATION.md — All 15 pipeline steps documented
56
+ """
57
+
58
+ from .extract import extract_pdf, extract_pdf_file, count_pages
59
+ from .extract_docx import extract_docx
60
+ from .extract_html import extract_html, html_to_text
61
+ from .normalize import normalize_text, NormalizationLevel, NormalizationReport
62
+ from .quality import compute_quality_score
63
+ from .batch import ExtractionReport, extract_to_dir
64
+ from .version import get_version_info
65
+
66
# Package metadata.
# NOTE(review): presumably duplicated in version.py / pyproject.toml — keep in sync
# (get_version_info() is re-exported below; verify both report the same value).
__version__ = "1.5.0"
__author__ = "Gilad Feldman"
__license__ = "MIT"

# Public API surface — mirrors the re-exports above; `from docpluck import *`
# and documentation tools use this list.
__all__ = [
    # Extraction
    "extract_pdf",
    "extract_pdf_file",
    "extract_docx",
    "extract_html",
    "html_to_text",
    "count_pages",
    # Normalization
    "normalize_text",
    "NormalizationLevel",
    "NormalizationReport",
    # Quality
    "compute_quality_score",
    # Batch
    "ExtractionReport",
    "extract_to_dir",
    # Version
    "get_version_info",
]
docpluck/__main__.py ADDED
@@ -0,0 +1,3 @@
1
from .cli import main

# Entry point for `python -m docpluck`: delegate to the CLI and propagate its
# integer return code as the process exit status.
raise SystemExit(main())
docpluck/batch.py ADDED
@@ -0,0 +1,183 @@
1
+ """
2
+ Batch extraction helper for directory-level runs.
3
+
4
+ MetaESCI, Scimeto, and ESCImate all want the same "walk a list of PDFs,
5
+ normalize them, drop a sidecar, and give me a receipt" pattern. Instead of
6
+ each downstream re-implementing it, :func:`extract_to_dir` lives here and
7
+ returns an :class:`ExtractionReport` that doubles as a reproducibility
8
+ receipt (``docpluck_version``, ``normalize_version``, ``git_sha``, per-file
9
+ status).
10
+
11
+ Example::
12
+
13
+ from docpluck import extract_to_dir, NormalizationLevel
14
+
15
+ report = extract_to_dir(
16
+ pdf_paths=list(Path("pdfs").glob("*.pdf")),
17
+ out_dir="normalized_text",
18
+ level=NormalizationLevel.academic,
19
+ )
20
+ print(f"{report.n_ok}/{report.n_total} ok, {report.elapsed_seconds:.1f}s")
21
+ report.write_receipt("normalized_text/_docpluck_receipt.json")
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import json
27
+ import time
28
+ from dataclasses import dataclass, field, asdict
29
+ from pathlib import Path
30
+ from typing import Iterable, Optional, Union
31
+
32
+ from .extract import extract_pdf_file
33
+ from .normalize import NormalizationLevel, normalize_text
34
+ from .version import get_version_info
35
+
36
+
37
@dataclass
class ExtractionFileResult:
    """Per-file outcome row collected in :attr:`ExtractionReport.results`."""

    # Input path as given to extract_to_dir, stringified.
    path: str
    # True only when extraction, normalization, and output writes all succeeded.
    ok: bool
    # Extraction engine tag reported by extract_pdf_file (None until extraction ran).
    method: Optional[str] = None
    n_chars_raw: int = 0
    n_chars_normalized: int = 0
    # Names of normalization steps that actually changed the text.
    normalize_steps_changed: list[str] = field(default_factory=list)
    # Either "ExcType: message" from a caught exception, or the in-band
    # "ERROR: ..." string returned by the extractor. None on success.
    error: Optional[str] = None
    # Wall-clock seconds for this file, rounded to milliseconds.
    elapsed_seconds: float = 0.0
47
+
48
+
49
@dataclass
class ExtractionReport:
    """Machine-readable receipt for a batch extraction run.

    Carries the docpluck version metadata, the per-file results, and the
    aggregate counts for one run. :meth:`to_dict` / :meth:`write_receipt`
    serialize it to JSON so downstream pipelines can pin reproducibility
    against a fixed run.
    """

    docpluck_version: str
    normalize_version: str
    git_sha: str
    level: str
    out_dir: str
    n_total: int = 0
    n_ok: int = 0
    n_failed: int = 0
    elapsed_seconds: float = 0.0
    results: list[ExtractionFileResult] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return a JSON-serializable dict (keys in field-declaration order)."""
        # asdict() recurses into the nested ExtractionFileResult dataclasses,
        # so only the rounded elapsed time needs patching afterwards.
        payload = asdict(self)
        payload["elapsed_seconds"] = round(self.elapsed_seconds, 3)
        return payload

    def write_receipt(self, path: Union[str, Path]) -> Path:
        """Write the receipt as pretty-printed JSON; creates parent dirs."""
        target = Path(path)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(json.dumps(self.to_dict(), indent=2), encoding="utf-8")
        return target
88
+
89
+
90
def extract_to_dir(
    pdf_paths: Iterable[Union[str, Path]],
    out_dir: Union[str, Path],
    level: NormalizationLevel = NormalizationLevel.academic,
    write_sidecar: bool = True,
) -> ExtractionReport:
    """Extract and normalize a collection of PDFs into a directory.

    For each input PDF, writes ``<stem>.txt`` containing normalized text.
    When ``write_sidecar`` is true (default), also writes ``<stem>.json``
    with per-file metadata (method, normalize steps, timings, errors).

    Missing files are recorded as failures on the report — this function
    does not raise on individual file errors, only on argument errors.

    Args:
        pdf_paths: Iterable of PDF paths. Each path must point to a file.
        out_dir: Directory that will receive ``<stem>.txt`` (and sidecars).
            Created if it does not exist.
        level: Normalization level. Defaults to ``academic``.
        write_sidecar: Whether to emit the per-file ``.json`` sidecar.

    Returns:
        :class:`ExtractionReport` with aggregate counts and per-file results.
    """
    info = get_version_info()
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)

    # Resolve the level's string name once; reused in the report and in every
    # sidecar (previously computed twice per file).
    level_name = level.value if isinstance(level, NormalizationLevel) else str(level)

    report = ExtractionReport(
        docpluck_version=info["version"],
        normalize_version=info["normalize_version"],
        git_sha=info["git_sha"],
        level=level_name,
        out_dir=str(out),
    )

    batch_start = time.monotonic()
    for p in pdf_paths:
        p = Path(p)
        report.n_total += 1
        file_start = time.monotonic()
        result = ExtractionFileResult(path=str(p), ok=False)

        try:
            raw_text, method = extract_pdf_file(p)
            result.method = method
            result.n_chars_raw = len(raw_text)

            if raw_text.startswith("ERROR:"):
                # In-band extraction failure: record the extractor's message
                # verbatim and skip normalization/output for this file.
                # BUGFIX: the previous revision fell through to the write
                # section here, hit a NameError on the undefined `normalized`,
                # and the broad handler below then OVERWROTE this useful
                # message with "NameError: ...".
                result.error = raw_text
            else:
                normalized, norm_report = normalize_text(raw_text, level)
                result.n_chars_normalized = len(normalized)
                result.normalize_steps_changed = list(
                    getattr(norm_report, "steps_changed", norm_report.steps_applied)
                )

                text_path = out / f"{p.stem}.txt"
                text_path.write_text(normalized, encoding="utf-8")

                if write_sidecar:
                    sidecar = {
                        "source": str(p),
                        "method": method,
                        "level": level_name,
                        "normalize_version": info["normalize_version"],
                        "docpluck_version": info["version"],
                        "git_sha": info["git_sha"],
                        "n_chars_raw": result.n_chars_raw,
                        "n_chars_normalized": result.n_chars_normalized,
                        "steps_changed": result.normalize_steps_changed,
                        "changes_made": dict(norm_report.changes_made),
                    }
                    sidecar_path = out / f"{p.stem}.json"
                    sidecar_path.write_text(
                        json.dumps(sidecar, indent=2), encoding="utf-8"
                    )

                # Only reached when extraction, normalization, and all
                # output writes succeeded.
                result.ok = True
        except FileNotFoundError as e:
            result.error = f"FileNotFoundError: {e}"
        except Exception as e:  # noqa: BLE001 — batch runner must never raise
            result.error = f"{type(e).__name__}: {e}"

        result.elapsed_seconds = round(time.monotonic() - file_start, 3)
        report.results.append(result)
        if result.ok:
            report.n_ok += 1
        else:
            report.n_failed += 1

    report.elapsed_seconds = time.monotonic() - batch_start
    return report
docpluck/cli.py ADDED
@@ -0,0 +1,35 @@
1
+ """
2
+ Minimal docpluck CLI.
3
+
4
+ Currently supports a single flag::
5
+
6
+ docpluck --version
7
+
8
+ which prints ``{version, normalize_version, git_sha}`` as JSON. Downstream
9
+ batch runners call this once per run and write the output next to their
10
+ results as a reproducibility receipt (see MetaESCI request D3).
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import json
16
+ import sys
17
+
18
+ from .version import get_version_info
19
+
20
+
21
def main(argv: list[str] | None = None) -> int:
    """Run the docpluck CLI; returns the process exit code.

    Recognizes a version request (``-V``/``--version``/``version``, or no
    arguments at all) which prints the version-info dict as JSON, and a help
    request (``-h``/``--help``/``help``). Anything else is reported on stderr
    with exit code 2.

    Args:
        argv: Argument list to parse; defaults to ``sys.argv[1:]``.
    """
    cli_args = list(sys.argv[1:]) if argv is None else list(argv)
    first = cli_args[0] if cli_args else None

    if first is None or first in {"-V", "--version", "version"}:
        print(json.dumps(get_version_info()))
        return 0
    if first in {"-h", "--help", "help"}:
        print("usage: docpluck [--version]")
        return 0

    sys.stderr.write(f"docpluck: unknown argument: {first}\n")
    sys.stderr.write("usage: docpluck [--version]\n")
    return 2
32
+
33
+
34
+ if __name__ == "__main__":
35
+ raise SystemExit(main())
docpluck/extract.py ADDED
@@ -0,0 +1,191 @@
1
+ """
2
+ PDF Text Extraction
3
+ ====================
4
+ Primary engine: pdftotext default mode (no -layout flag)
5
+ Fallback: pdfplumber for PDFs with SMP Unicode (Mathematical Italic fonts)
6
+
7
+ Requires poppler-utils installed on the system:
8
+ - Linux/WSL: apt-get install poppler-utils
9
+ - macOS: brew install poppler
10
+ - Windows: https://github.com/oschwartz10612/poppler-windows/releases
11
+
12
+ Key design decision: pdftotext default mode (NO -layout flag).
13
+ The -layout flag preserves physical column layout, causing column interleaving
14
+ that breaks statistical pattern matching. Default mode correctly reconstructs
15
+ reading order. Verified on 50 PDFs across 8 citation styles — see BENCHMARKS.md.
16
+ """
17
+
18
+ import os
19
+ import subprocess
20
+ import tempfile
21
+ from pathlib import Path
22
+ from typing import Optional, Union
23
+
24
+
25
def extract_pdf(pdf_bytes: bytes) -> tuple[str, str]:
    """Extract text from PDF bytes.

    Uses pdftotext as the primary engine. Automatically falls back to
    pdfplumber if the PDF contains SMP Unicode characters (e.g. Mathematical
    Italic fonts used by Nature/Cell journals) that Xpdf cannot handle.

    Args:
        pdf_bytes: Raw PDF file content as bytes.

    Returns:
        A tuple of (text, method) where:
        - text: Extracted plain text. May start with "ERROR: ..." if extraction
          failed — check with text.startswith("ERROR:").
        - method: Engine used. One of:
          "pdftotext_default" — normal extraction
          "pdftotext_default+pdfplumber_recovery" — SMP fallback triggered
          "error" — pdftotext exited non-zero; text carries the "ERROR: ..."
          message

    Raises:
        FileNotFoundError: If the pdftotext binary is not on PATH.
        subprocess.TimeoutExpired: If pdftotext runs longer than 120 seconds.

    Requires:
        pdftotext binary (from poppler-utils) on PATH.

    Example:
        with open("paper.pdf", "rb") as f:
            text, method = extract_pdf(f.read())
        print(f"Extracted {len(text)} chars via {method}")
    """
    # pdftotext needs a real file path, so spool the bytes to a named temp
    # file. delete=False because the subprocess must reopen the path (an open
    # NamedTemporaryFile cannot be reopened on Windows); cleanup happens in
    # the finally below.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(pdf_bytes)
        tmp_path = tmp.name

    try:
        # Primary: pdftotext default mode (no -layout flag — critical).
        # Default mode reconstructs reading order; -layout preserves physical
        # columns and interleaves multi-column text (see module docstring).
        result = subprocess.run(
            ["pdftotext", "-enc", "UTF-8", tmp_path, "-"],
            capture_output=True,
            timeout=120,
            encoding="utf-8",
            errors="replace",
        )

        if result.returncode != 0:
            return f"ERROR: pdftotext failed with code {result.returncode}", "error"

        text = result.stdout
        method = "pdftotext_default"

        # SMP recovery: Xpdf emits U+FFFD (replacement char) for characters it
        # cannot decode — notably SMP codepoints (>= U+10000) such as the
        # Mathematical Italic letters. pdfplumber handles these correctly and
        # _recover_with_pdfplumber remaps them to ASCII equivalents.
        # NOTE(review): errors="replace" above can also introduce U+FFFD for
        # undecodable stdout bytes, so this fallback may occasionally trigger
        # for non-SMP reasons — harmless, since recovery is best-effort and
        # only replaces the text when it succeeds.
        if text.count("\ufffd") > 0:
            recovered = _recover_with_pdfplumber(tmp_path)
            if recovered:
                text = recovered
                method = "pdftotext_default+pdfplumber_recovery"

        return text, method

    finally:
        # Always remove the spooled temp file, even on timeout or exception.
        os.unlink(tmp_path)
83
+
84
+
85
def extract_pdf_file(path: Union[str, Path]) -> tuple[str, str]:
    """Extract text from a PDF file on disk.

    Thin convenience wrapper around ``extract_pdf`` that reads ``path`` and
    raises a clean ``FileNotFoundError`` when the file does not exist, instead
    of the generic failure pdftotext produces on a missing input. Intended for
    batch runners that walk directories and want actionable errors.

    Args:
        path: Path to the PDF file on disk (str or pathlib.Path).

    Returns:
        Same tuple as ``extract_pdf``: ``(text, method)``.

    Raises:
        FileNotFoundError: If ``path`` does not exist or is not a regular file.

    Example:
        text, method = extract_pdf_file("paper.pdf")
    """
    pdf_path = Path(path)
    if pdf_path.is_file():
        return extract_pdf(pdf_path.read_bytes())

    # is_file() is False for both missing paths and non-file entries
    # (directories, broken symlinks) — distinguish the two for a clearer error.
    if pdf_path.exists():
        raise FileNotFoundError(f"Path is not a regular file: {pdf_path}")
    raise FileNotFoundError(f"PDF file not found: {pdf_path}")
113
+
114
+
115
def count_pages(pdf_bytes: bytes) -> int:
    """Count the number of pages in a PDF.

    Scans for ``/Type /Page`` page-object markers with a byte-level regex
    (no external binary required). Fast and reliable for well-formed PDFs.
    Returns 1 as a minimum.

    PDF name tokens may be separated by optional whitespace, so both
    ``/Type /Page`` and ``/Type/Page`` are legal spellings; the previous
    fixed-byte match only caught the single-space form. The ``\\b`` boundary
    keeps intermediate ``/Pages`` tree nodes from being counted as pages.

    Args:
        pdf_bytes: Raw PDF file content as bytes.

    Returns:
        Page count (integer, minimum 1).

    Example:
        with open("paper.pdf", "rb") as f:
            n = count_pages(f.read())
        print(f"{n} pages")
    """
    # Local import, consistent with this file's lazy-import style for
    # optional/occasional dependencies.
    import re

    try:
        # \s* — whitespace between name tokens is optional in PDF syntax.
        # \b  — rejects "/Pages" (next char 's' is a word char), so tree
        #       nodes are excluded without a separate subtraction pass.
        n_pages = len(re.findall(rb"/Type\s*/Page\b", pdf_bytes))
        return max(n_pages, 1)
    except Exception:
        # Defensive only (findall on bytes should not raise). Return the
        # documented minimum of 1 — the previous revision returned 0 here,
        # contradicting its own docstring.
        return 1
137
+
138
+
139
+ def _recover_with_pdfplumber(pdf_path: str) -> Optional[str]:
140
+ """Recover text using pdfplumber when pdftotext produces garbled output.
141
+
142
+ Triggered when U+FFFD (replacement character) appears in pdftotext output,
143
+ which indicates SMP Mathematical Italic fonts (U+1D434-U+1D467) that
144
+ Xpdf/poppler cannot decode. pdfplumber (using pdfminer) handles these
145
+ correctly. Maps the recovered SMP characters to ASCII equivalents so
146
+ downstream regex patterns work normally.
147
+
148
+ Args:
149
+ pdf_path: Path to the PDF file on disk.
150
+
151
+ Returns:
152
+ Recovered text string, or None if recovery failed.
153
+ """
154
+ try:
155
+ import pdfplumber
156
+
157
+ smp_to_ascii: dict[str, str] = {}
158
+ # Math italic capitals A-Z: U+1D434–U+1D44D
159
+ for i, letter in enumerate("ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
160
+ smp_to_ascii[chr(0x1D434 + i)] = letter
161
+ # Math italic small a-z: U+1D44E–U+1D467
162
+ for i, letter in enumerate("abcdefghijklmnopqrstuvwxyz"):
163
+ smp_to_ascii[chr(0x1D44E + i)] = letter
164
+ # Math italic Greek (common in physics/biology papers)
165
+ greek = {
166
+ 0x1D6E2: "A", 0x1D6E4: "G", 0x1D6E5: "D", 0x1D6F4: "S",
167
+ 0x1D6F7: "Ph", 0x1D6F8: "Ch", 0x1D6F9: "Ps", 0x1D6FA: "O",
168
+ 0x1D6FC: "a", 0x1D6FD: "b", 0x1D6FE: "g", 0x1D6FF: "d",
169
+ 0x1D700: "e", 0x1D701: "z", 0x1D702: "n", 0x1D703: "th",
170
+ 0x1D707: "m", 0x1D70B: "pi", 0x1D70C: "r", 0x1D70E: "s",
171
+ 0x1D711: "ph", 0x1D712: "ch", 0x1D713: "ps",
172
+ }
173
+ for cp, repl in greek.items():
174
+ smp_to_ascii[chr(cp)] = repl
175
+
176
+ pages_text = []
177
+ with pdfplumber.open(pdf_path) as pdf:
178
+ for page in pdf.pages:
179
+ page_text = page.extract_text()
180
+ if page_text:
181
+ pages_text.append(page_text)
182
+
183
+ full_text = "\n\n".join(pages_text)
184
+
185
+ for smp_char, ascii_equiv in smp_to_ascii.items():
186
+ full_text = full_text.replace(smp_char, ascii_equiv)
187
+
188
+ return full_text
189
+
190
+ except Exception:
191
+ return None
@@ -0,0 +1,64 @@
1
+ """
2
+ DOCX Text Extraction
3
+ =====================
4
+ Primary engine: mammoth (DOCX → HTML → text via html_to_text)
5
+
6
+ Why mammoth:
7
+ - `mammoth.convert_to_html()` preserves Shift+Enter soft breaks as <br> tags
8
+ (critical for academic documents with poetry, equations, addresses, etc.)
9
+ - `mammoth.extract_raw_text()` loses intra-paragraph line breaks — do NOT use it
10
+ - python-docx only provides paragraph-level access, not enough structure
11
+ - docx2txt is effectively abandoned
12
+ - pypandoc requires a binary (pandoc) that's hard to deploy
13
+ - BSD-2 license, available in both Python (mammoth) and Node.js (mammoth.js)
14
+ - Battle-tested in Scimeto production since Dec 2025
15
+
16
+ Known limitations:
17
+ - OMML equations (Office Math) are silently dropped. Papers with inline
18
+ stats inside equation objects will lose those values. In practice this is
19
+ rare in social science papers where stats are written as plain text.
20
+ - Tracked changes: only deleted paragraphs are handled minimally.
21
+ - Memory: peak usage is ~3-5x file size. Not a concern for single-file
22
+ processing but worth noting for very large documents.
23
+
24
+ Requires the `docx` optional dependency:
25
+ pip install docpluck[docx]
26
+ """
27
+ import io
28
+
29
+ from .extract_html import html_to_text
30
+
31
+
32
def extract_docx(docx_bytes: bytes) -> tuple[str, str]:
    """Extract plain text from DOCX file bytes.

    Pipeline: DOCX --(mammoth)--> HTML --(html_to_text)--> plain text. Going
    through HTML rather than mammoth's raw-text mode is what preserves soft
    line breaks and block structure in the final output.

    Args:
        docx_bytes: Raw DOCX file content as bytes.

    Returns:
        A tuple of (text, method) where:
        - text: Extracted plain text with block/inline-aware formatting.
        - method: Always "mammoth".

    Raises:
        ValueError: If the DOCX is malformed (mammoth raises — we re-raise).
        ImportError: If mammoth is not installed.

    Requires:
        mammoth (install with `pip install docpluck[docx]`).

    Example:
        with open("paper.docx", "rb") as f:
            text, method = extract_docx(f.read())
    """
    # Deferred import: the core library must stay usable without the
    # `docpluck[docx]` extra installed.
    import mammoth

    conversion = mammoth.convert_to_html(io.BytesIO(docx_bytes))
    return html_to_text(conversion.value), "mammoth"
+ return text, "mammoth"