pdfblah 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,24 @@
1
# Continuous integration: install the package with its test extra, run the
# test suite, and do a trial build for every OS / Python pair in the matrix.
name: ci

on:
  push:
    branches:
      - main
  pull_request:
  workflow_dispatch: {}

jobs:
  test:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os:
          - ubuntu-latest
          - macos-latest
        # Quoted so the versions stay strings rather than floats.
        python:
          - "3.9"
          - "3.12"
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python }}
      - run: pip install -e ".[test]"
      - run: pytest -q
      - name: Build wheel
        run: pip install build && python -m build
@@ -0,0 +1,23 @@
1
name: pypi

# Publishes to PyPI when a GitHub Release is published. Uses PyPI Trusted
# Publishing (OIDC), so no API token is stored. One-time setup: on PyPI, add a
# pending publisher for project "pdfblah" -> repo KuvopLLC/pdfblah, workflow
# pypi.yml.
on:
  release:
    types: [published]
  workflow_dispatch: {}

jobs:
  pypi:
    runs-on: ubuntu-latest
    permissions:
      # Listing any permission sets every unlisted one to "none", so the read
      # access used by actions/checkout is granted explicitly.
      contents: read
      # Needed to request the OIDC token that Trusted Publishing exchanges.
      id-token: write
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"
      - run: pip install build && python -m build
      - uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,8 @@
1
+ __pycache__/
2
+ *.pyc
3
+ *.egg-info/
4
+ build/
5
+ dist/
6
+ .pytest_cache/
7
+ .venv/
8
+ venv/
pdfblah-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Kuvop LLC
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
pdfblah-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,108 @@
1
+ Metadata-Version: 2.4
2
+ Name: pdfblah
3
+ Version: 0.1.0
4
+ Summary: Real find and replace on the actual text in a PDF. No overlay, metadata preserved.
5
+ Project-URL: Homepage, https://pdfblah.com
6
+ Project-URL: Source, https://github.com/KuvopLLC/pdfblah
7
+ Project-URL: Issues, https://github.com/KuvopLLC/pdfblah/issues
8
+ Author: Kuvop LLC
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Keywords: acrobat-alternative,cli,edit,find,pdf,pdf-editor,redact,replace,text
12
+ Classifier: Environment :: Console
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: End Users/Desktop
15
+ Classifier: Operating System :: MacOS
16
+ Classifier: Operating System :: POSIX
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Topic :: Text Processing
19
+ Classifier: Topic :: Utilities
20
+ Requires-Python: >=3.9
21
+ Requires-Dist: pdfplumber>=0.11
22
+ Requires-Dist: pikepdf>=8
23
+ Provides-Extra: test
24
+ Requires-Dist: pytest; extra == 'test'
25
+ Requires-Dist: reportlab; extra == 'test'
26
+ Description-Content-Type: text/markdown
27
+
28
+ # pdfblah
29
+
30
+ Real find and replace on the actual text in a PDF, from the command line.
31
+
32
+ Most tools "edit" a PDF by painting a box over the old text and drawing new text
33
+ on top, which leaves the original underneath (copy and paste still reveals it) and
34
+ often adds a watermark. `pdfblah` rewrites the real text in the content stream, so:
35
+
36
+ - the old text is genuinely gone (`pdftotext`, Ctrl-F, and copy show only the new value)
37
+ - no overlay, no watermark
38
+ - the original metadata (dates, Producer, XMP) is preserved byte for byte
39
+ - alignment is auto-detected and kept, so right-aligned numbers stay flush
40
+ - fonts it cannot reproduce are refused instead of garbled
41
+
42
+ Pure Python (pdfplumber + pikepdf). No system dependencies.
43
+
44
+ ## Install
45
+
46
+ ```sh
47
+ pipx install pdfblah # or: pip install pdfblah
48
+ ```
49
+
50
+ ## Use
51
+
52
+ Replace the first match:
53
+
54
+ ```sh
55
+ pdfblah in.pdf out.pdf --find "Old Name" --replace "New Name"
56
+ ```
57
+
58
+ Options:
59
+
60
+ ```sh
61
+ --scope all change every match (default: first)
62
+ --scope 3 change the 3rd match
63
+ --ci ignore case
64
+ --word whole word only ("cat" will not match "category")
65
+ --page 2 only page 2
66
+ --replace "" delete the text
67
+ ```
68
+
69
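+ Many rules from a file, one `FIND | REPLACE | FLAGS` rule per line. Blank lines and lines starting with `#` are skipped; FLAGS may combine `all`, `first`, a match number `N`, `ci`, `word`, and `pN` (limit to page N):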
70
+
71
+ ```sh
72
+ pdfblah in.pdf out.pdf --rules rules.txt
73
+ ```
74
+
75
+ ```
76
+ # rules.txt
77
+ Old Company Name | New Company Name | all
78
+ CONFIDENTIAL DRAFT | FINAL | ci
79
+ Jane Doe | John Smith | all word
80
+ Total | Sum | 2
81
+ delete this phrase |
82
+ ```
83
+
84
+ ## Library
85
+
86
+ ```python
87
+ from pdfblah import process, apply_rules, parse_rules_file
88
+
89
+ process("in.pdf", "out.pdf", "999.00", "42.00", scope="all", ci=True)
90
+ ```
91
+
92
+ Each call returns a report dict (`ok`, `count`, `refused`, `reason`, ...).
93
+
94
+ ## What it does not do
95
+
96
+ Scanned PDFs (image only, no text layer) cannot be edited. Fonts that are not
97
+ embedded and not standard, or use a custom encoding, are refused rather than
98
+ rendered wrong. This is by design: a wrong-looking edit is worse than a clear "no".
99
+
100
+ ## Hosted version
101
+
102
+ Want it without installing anything, or for a non-technical colleague? The hosted
103
+ version at **[pdfblah.com](https://pdfblah.com)** does the same edit in the browser:
104
+ upload, preview for free, download.
105
+
106
+ ## License
107
+
108
+ MIT, (c) 2026 Kuvop LLC.
@@ -0,0 +1,81 @@
1
+ # pdfblah
2
+
3
+ Real find and replace on the actual text in a PDF, from the command line.
4
+
5
+ Most tools "edit" a PDF by painting a box over the old text and drawing new text
6
+ on top, which leaves the original underneath (copy and paste still reveals it) and
7
+ often adds a watermark. `pdfblah` rewrites the real text in the content stream, so:
8
+
9
+ - the old text is genuinely gone (`pdftotext`, Ctrl-F, and copy show only the new value)
10
+ - no overlay, no watermark
11
+ - the original metadata (dates, Producer, XMP) is preserved byte for byte
12
+ - alignment is auto-detected and kept, so right-aligned numbers stay flush
13
+ - fonts it cannot reproduce are refused instead of garbled
14
+
15
+ Pure Python (pdfplumber + pikepdf). No system dependencies.
16
+
17
+ ## Install
18
+
19
+ ```sh
20
+ pipx install pdfblah # or: pip install pdfblah
21
+ ```
22
+
23
+ ## Use
24
+
25
+ Replace the first match:
26
+
27
+ ```sh
28
+ pdfblah in.pdf out.pdf --find "Old Name" --replace "New Name"
29
+ ```
30
+
31
+ Options:
32
+
33
+ ```sh
34
+ --scope all change every match (default: first)
35
+ --scope 3 change the 3rd match
36
+ --ci ignore case
37
+ --word whole word only ("cat" will not match "category")
38
+ --page 2 only page 2
39
+ --replace "" delete the text
40
+ ```
41
+
42
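+ Many rules from a file, one `FIND | REPLACE | FLAGS` rule per line. Blank lines and lines starting with `#` are skipped; FLAGS may combine `all`, `first`, a match number `N`, `ci`, `word`, and `pN` (limit to page N):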
43
+
44
+ ```sh
45
+ pdfblah in.pdf out.pdf --rules rules.txt
46
+ ```
47
+
48
+ ```
49
+ # rules.txt
50
+ Old Company Name | New Company Name | all
51
+ CONFIDENTIAL DRAFT | FINAL | ci
52
+ Jane Doe | John Smith | all word
53
+ Total | Sum | 2
54
+ delete this phrase |
55
+ ```
56
+
57
+ ## Library
58
+
59
+ ```python
60
+ from pdfblah import process, apply_rules, parse_rules_file
61
+
62
+ process("in.pdf", "out.pdf", "999.00", "42.00", scope="all", ci=True)
63
+ ```
64
+
65
+ Each call returns a report dict (`ok`, `count`, `refused`, `reason`, ...).
66
+
67
+ ## What it does not do
68
+
69
+ Scanned PDFs (image only, no text layer) cannot be edited. Fonts that are not
70
+ embedded and not standard, or use a custom encoding, are refused rather than
71
+ rendered wrong. This is by design: a wrong-looking edit is worse than a clear "no".
72
+
73
+ ## Hosted version
74
+
75
+ Want it without installing anything, or for a non-technical colleague? The hosted
76
+ version at **[pdfblah.com](https://pdfblah.com)** does the same edit in the browser:
77
+ upload, preview for free, download.
78
+
79
+ ## License
80
+
81
+ MIT, (c) 2026 Kuvop LLC.
@@ -0,0 +1,20 @@
1
+ """pdfblah: real find and replace on the actual text in a PDF.
2
+
3
+ Rewrites the real text in the content stream (no overlay, no watermark), preserves
4
+ all original metadata, auto-detects alignment, and refuses fonts it cannot
5
+ reproduce instead of garbling them.
6
+ """
7
+ from .engine import (
8
+ process,
9
+ apply_rules,
10
+ parse_rules_file,
11
+ parse_flags,
12
+ font_safe,
13
+ detect_alignment,
14
+ )
15
+
16
__version__ = "0.1.0"

# Names exported by `from pdfblah import *`; all of them come from the
# engine module imported above.
__all__ = [
    "process",
    "apply_rules",
    "parse_rules_file",
    "parse_flags",
    "font_safe",
    "detect_alignment",
]
@@ -0,0 +1,62 @@
1
+ """Command-line interface for pdfblah."""
2
+ import argparse
3
+ import json
4
+ import sys
5
+
6
+ from .engine import process, apply_rules, parse_rules_file
7
+
8
+
9
def main(argv=None):
    """Run the pdfblah command line and return a process exit code.

    Args:
        argv: argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        Rules mode (``--rules FILE``): 0 when at least one rule was applied,
        1 when none was, 2 when the rules file is unreadable or has no rules.
        Single mode (``--find``/``--replace``): 0 on success, 3 when the edit
        was refused, 1 on any other failure.

    Raises:
        SystemExit: raised by argparse (status 2) on bad usage, including a
            missing ``--find``, an invalid ``--scope`` or a ``--page`` below 1.
    """
    ap = argparse.ArgumentParser(
        prog="pdfblah",
        description="Real find and replace on the actual text in a PDF. "
                    "No overlay, metadata preserved, alignment auto-detected.")
    ap.add_argument("input", help="input PDF")
    ap.add_argument("output", help="output PDF (never overwrites the input)")
    ap.add_argument("--find", help="text to find")
    ap.add_argument("--replace", default="", help="replacement text (empty deletes the text)")
    ap.add_argument("--rules", metavar="FILE",
                    help="apply many rules from a file, one 'FIND | REPLACE | FLAGS' per line")
    ap.add_argument("--scope", default="first", metavar="WHICH",
                    help="first (default), all, or a number for the Nth match")
    ap.add_argument("--ci", action="store_true", help="ignore case")
    ap.add_argument("--word", action="store_true", help="whole word only")
    ap.add_argument("--page", type=int, help="limit to this page number")
    ap.add_argument("--json", action="store_true", help="print the full JSON report")
    a = ap.parse_args(argv)

    if a.rules:
        # A missing or undecodable rules file is a usage problem: report it on
        # stderr with the same exit code as "no rules" instead of a traceback.
        try:
            with open(a.rules, encoding="utf-8") as fh:
                rules_text = fh.read()
        except (OSError, UnicodeDecodeError) as exc:
            print(f"cannot read rules file {a.rules}: {exc}", file=sys.stderr)
            return 2
        rules = parse_rules_file(rules_text)
        if not rules:
            print(f"no rules found in {a.rules}", file=sys.stderr)
            return 2
        rep = apply_rules(a.input, a.output, rules)
        if a.json:
            print(json.dumps(rep, indent=2))
        else:
            print(f"{rep['applied']}/{rep['total']} rules applied -> {a.output}")
            for r in rep["rules"]:
                mark = "ok  " if r["applied"] else "skip"
                extra = f" (x{r['count']})" if r.get("count") else ""
                why = "" if r["applied"] else "  " + (r.get("reason") or r.get("error") or "not applied")
                print(f"  [{mark}] {r['find']!r} -> {r['replace']!r}{extra}{why}")
        return 0 if rep["applied"] else 1

    if not a.find:
        ap.error("give --find (with --replace), or --rules FILE")

    # Reject anything that is not first / all / a positive number. Passing an
    # unknown word through would silently behave like "first", and 0 would
    # index the match list from the end.
    scope_arg = a.scope.strip().lower()
    if scope_arg in ("first", "all"):
        scope = scope_arg
    elif scope_arg.isdecimal() and int(scope_arg) >= 1:
        scope = int(scope_arg)
    else:
        ap.error(f"--scope must be 'first', 'all', or a positive match number, got {a.scope!r}")
    if a.page is not None and a.page < 1:
        ap.error("--page must be 1 or greater")

    r = process(a.input, a.output, a.find, a.replace, a.page, scope, a.ci, a.word)
    if a.json:
        print(json.dumps(r, indent=2))
    elif r.get("ok"):
        print(f"replaced {r['count']} match(es) of {a.find!r} -> {a.output}")
    elif r.get("refused"):
        print(f"refused: {r.get('reason')}", file=sys.stderr)
    else:
        print(f"failed: {r.get('error')}", file=sys.stderr)
    return 0 if r.get("ok") else (3 if r.get("refused") else 1)
59
+
60
+
61
+ if __name__ == "__main__":
62
+ sys.exit(main())
@@ -0,0 +1,387 @@
1
+ """pdfblah engine: real in-content-stream PDF text replacement.
2
+
3
+ Rewrites the actual Tj/TJ string operands in the page content stream, so the old
4
+ text is gone (pdftotext, Ctrl-F, and copy show only the new value). No overlay, no
5
+ watermark. Alignment is auto-detected and preserved. All original metadata
6
+ (DocInfo + XMP, including dates and Producer) is kept verbatim. Fonts we cannot
7
+ reproduce (non-embedded exotic or custom-encoded) are refused rather than garbled.
8
+
9
+ Library use:
10
+ from pdfblah import process, apply_rules, parse_rules_file
11
+ process("in.pdf", "out.pdf", "OLD", "NEW", scope="all", ci=True)
12
+ """
13
+ import os, re, json, tempfile, shutil
14
+ import pikepdf
15
+ from pikepdf import Operator, String, Array
16
+ import pdfplumber
17
+
18
+
19
+ # ---- alignment auto-detection (proven on the trade-confirmation PDF) ----------
20
def detect_alignment(target, words, tol=1.2, vband=300):
    """Guess how `target` is aligned by counting words that share one of its edges.

    Only words whose `top` lies within `vband` of the target's `top` vote. A
    word votes "left" / "right" / "center" when its x0 / x1 / midpoint is
    within `tol` of the target's.

    Returns (alignment, votes), where votes maps "left", "right" and "center"
    to their counts. Fewer than two votes for the winner falls back to "left";
    a left/right tie on a target containing a digit resolves to "right".
    """
    left_edge, right_edge, row_top = target["x0"], target["x1"], target["top"]
    middle = (left_edge + right_edge) / 2

    votes = {"left": 0, "right": 0, "center": 0}
    for w in words:
        if abs(w["top"] - row_top) > vband:
            continue
        if abs(w["x0"] - left_edge) <= tol:
            votes["left"] += 1
        if abs(w["x1"] - right_edge) <= tol:
            votes["right"] += 1
        if abs((w["x0"] + w["x1"]) / 2 - middle) <= tol:
            votes["center"] += 1

    # max() keeps the first key on a tie, so ties resolve left > right > center.
    winner = max(votes, key=votes.get)
    if votes[winner] < 2:
        return "left", votes
    if votes["right"] == votes["left"] and any(ch.isdigit() for ch in target["text"]):
        return "right", votes
    return winner, votes
34
+
35
+
36
+ # ---- per-character advance widths, taken from the REAL font as rendered --------
37
def unit_widths(page, fontname):
    """Build a char -> advance-width table (per 1pt of font size) for one font.

    Each sample is the x0 distance from a character to the next character of
    the same font on the same line, divided by the character's size. The value
    kept per character is the median sample (the upper one for even counts).
    """
    same_font = [c for c in page.chars if c.get("fontname") == fontname]
    ordered = sorted(same_font, key=lambda c: (round(c["top"]), c["x0"]))

    samples = {}
    for i in range(len(ordered) - 1):
        cur, nxt = ordered[i], ordered[i + 1]
        if abs(cur["top"] - nxt["top"]) > 1:
            # the neighbour sits on a different line
            continue
        advance = nxt["x0"] - cur["x0"]
        size = cur.get("size") or 0
        # keep only plausible advances: positive and at most 3x the font size
        if size <= 0 or advance <= 0 or advance > size * 3:
            continue
        samples.setdefault(cur["text"], []).append(advance / size)

    table = {}
    for ch, values in samples.items():
        ranked = sorted(values)
        table[ch] = ranked[len(ranked) // 2]
    return table
55
+
56
+
57
def string_width(s, tbl, size, fallback):
    """Width of `s` at font size `size`: per-char unit widths from `tbl`
    (or `fallback` for characters not in it), summed and scaled by `size`."""
    units = 0
    for ch in s:
        units += tbl.get(ch, fallback)
    return units * size
59
+
60
+
61
def matcher(find, ci=False, word=False):
    """Compile a regex that matches `find` literally.

    `ci` turns on case-insensitive matching. `word` requires that the match is
    not directly preceded or followed by an ASCII letter or digit.
    """
    literal = re.escape(find)
    if word:
        literal = "".join((r"(?<![0-9A-Za-z])", literal, r"(?![0-9A-Za-z])"))
    flags = re.IGNORECASE if ci else 0
    return re.compile(literal, flags)
68
+
69
+
70
def locate_boxes(words, find, ci=False, word=False):
    """Find where `find` occurs among `words`, in order.

    First pass: the first regex match inside each single word. If that yields
    anything it is returned as is. Otherwise a second pass looks for runs of
    consecutive words on one line (tops within 3 of the run's first word)
    whose space-joined text equals `find`.

    Returns a list of (x0, top, x1, bottom) boxes.
    """
    pattern = matcher(find, ci, word)
    found = []
    for w in words:
        hit = pattern.search(w["text"])
        if hit is None:
            continue
        # Estimate the matched substring's box from its share of the word's
        # characters, so the box covers the match rather than the whole word.
        n_chars = len(w["text"]) or 1
        width = w["x1"] - w["x0"]
        found.append((w["x0"] + width * (hit.start() / n_chars), w["top"],
                      w["x0"] + width * (hit.end() / n_chars), w["bottom"]))
    if found:
        return found

    wanted = find.lower() if ci else find
    total = len(words)
    for start in range(total):
        phrase = ""
        for stop in range(start, total):
            if stop != start:
                if abs(words[stop]["top"] - words[start]["top"]) > 3:
                    break  # ran onto another line
                phrase = phrase + " " + words[stop]["text"]
            else:
                phrase = words[stop]["text"]
            candidate = phrase.lower() if ci else phrase
            if candidate == wanted:
                run = words[start:stop + 1]
                found.append((min(w["x0"] for w in run), min(w["top"] for w in run),
                              max(w["x1"] for w in run), max(w["bottom"] for w in run)))
                break
            if not wanted.startswith(candidate):
                break  # the run can no longer grow into `find`
    return found
104
+
105
+
106
+ # ---- content-stream rewrite (position-matched to the located instance) --------
107
+ def _mul(m, n):
108
+ a, b, c, d, e, f = m; A, B, C, D, E, F = n
109
+ return (a*A + b*C, a*B + b*D, c*A + d*C, c*B + d*D,
110
+ e*A + f*C + E, e*B + f*D + F)
111
+
112
+ def _op_text(operands, so):
113
+ if so == "TJ":
114
+ return "".join(bytes(e).decode("latin-1") for e in operands[0]
115
+ if isinstance(e, (String, bytes)))
116
+ if so == "\"":
117
+ return bytes(operands[2]).decode("latin-1")
118
+ return bytes(operands[0]).decode("latin-1") # Tj and '
119
+
120
+
121
def rewrite_stream(pdf, page, find, replace, target_xy, align, size, wtbl, fb,
                   ci=False, word=False, tol=14.0):
    """Rewrite the one `find` match in `page` that lies nearest to `target_xy`.

    The content stream is walked while tracking the CTM (q / Q / cm) and the
    text and line matrices (BT, Td, TD, Tm, T*, TL and the quote operators).
    Each regex match inside a Tj / TJ / ' / " string gets an estimated
    position: the string's origin plus the measured width of the text before
    the match. The closest match is rewritten if it is within `tol`.

    Returns (True, distance) after replacing page.Contents, or
    (False, distance) when the nearest match is too far away or cannot be
    patched, or (False, None) when nothing matched at all.
    """
    pattern = matcher(find, ci, word)
    identity = (1, 0, 0, 1, 0, 0)
    ctm = identity
    saved_ctms = []
    text_matrix = line_matrix = identity
    leading = 0.0
    want_x, want_y = target_xy
    nearest = None  # (distance, op index, operator name, match start, match end)
    instructions = list(pikepdf.parse_content_stream(page))

    for position, (operands, op) in enumerate(instructions):
        name = str(op)
        if name == "q":
            saved_ctms.append(ctm)
        elif name == "Q":
            if saved_ctms:
                ctm = saved_ctms.pop()
        elif name == "cm":
            ctm = _mul(tuple(float(v) for v in operands), ctm)
        elif name == "BT":
            text_matrix = line_matrix = identity
        elif name == "TL":
            leading = float(operands[0])
        elif name in ("Td", "TD"):
            dx, dy = float(operands[0]), float(operands[1])
            if name == "TD":
                leading = -dy
            line_matrix = _mul((1, 0, 0, 1, dx, dy), line_matrix)
            text_matrix = line_matrix
        elif name == "Tm":
            text_matrix = line_matrix = tuple(float(v) for v in operands)
        elif name == "T*":
            line_matrix = _mul((1, 0, 0, 1, 0, -leading), line_matrix)
            text_matrix = line_matrix
        elif name in ("Tj", "TJ", "'", "\""):
            if name in ("'", "\""):
                # both quote operators move to the next line before showing
                line_matrix = _mul((1, 0, 0, 1, 0, -leading), line_matrix)
                text_matrix = line_matrix
            device = _mul(text_matrix, ctm)
            origin_x, origin_y = device[4], device[5]
            shown = _op_text(operands, name)
            for hit in pattern.finditer(shown):
                hit_x = origin_x + string_width(shown[:hit.start()], wtbl, size, fb)
                gap = ((hit_x - want_x) ** 2 + (origin_y - want_y) ** 2) ** 0.5
                if nearest is None or gap < nearest[0]:
                    nearest = (gap, position, name, hit.start(), hit.end())

    if nearest is None:
        return (False, None)
    if nearest[0] > tol:
        return (False, nearest[0])

    gap, position, name, start, end = nearest
    operands = instructions[position][0]
    patched = _rewrite_one(operands, name, start, end, replace, align, size, wtbl, fb)
    if patched is None:
        # the match spans more than one TJ string element -> unsupported
        return (False, gap)
    instructions[position] = patched
    page.Contents = pdf.make_stream(pikepdf.unparse_content_stream(instructions))
    return (True, gap)
171
+
172
+
173
def _rewrite_one(operands, so, ms, me, replace, align, size, wtbl, fb):
    """Build the replacement (operands, operator) for one show operator.

    [ms:me] is the match span within the operator's joined text. Tj is handed
    to _patch_span, which also handles alignment; the two quote operators are
    spliced in place. For TJ the span must fall inside a single string
    element, otherwise None is returned.
    """
    def spliced(text, lo, hi):
        return String((text[:lo] + replace + text[hi:]).encode("latin-1"))

    if so == "Tj":
        shown = bytes(operands[0]).decode("latin-1")
        return _patch_span(shown, ms, me, replace, align, size, wtbl, fb)
    if so == "'":
        shown = bytes(operands[0]).decode("latin-1")
        return ([spliced(shown, ms, me)], Operator("'"))
    if so == "\"":
        shown = bytes(operands[2]).decode("latin-1")
        return ([operands[0], operands[1], spliced(shown, ms, me)], Operator("\""))

    # TJ: walk the array, tracking the offset of each string element within
    # the joined text, and patch the first element that contains the span.
    rebuilt = []
    offset = 0
    replaced = False
    for item in operands[0]:
        if not isinstance(item, (String, bytes)):
            rebuilt.append(item)
            continue
        piece = bytes(item).decode("latin-1")
        piece_end = offset + len(piece)
        if not replaced and offset <= ms and me <= piece_end:
            rebuilt.append(spliced(piece, ms - offset, me - offset))
            replaced = True
        else:
            rebuilt.append(item)
        offset = piece_end
    if not replaced:
        return None
    return ([Array(rebuilt)], Operator("TJ"))
200
+
201
+
202
def _patch_span(s, ms, me, replace, align, size, wtbl, fb):
    """Return (operands, operator) showing `s` with s[ms:me] swapped for `replace`.

    Left alignment keeps a plain Tj. For right or center alignment the result
    is a TJ whose leading number offsets the string by the full (right) or
    half (center) difference between the old and new span widths.
    """
    new_text = s[:ms] + replace + s[me:]
    if align == "left":
        return [String(new_text.encode("latin-1"))], Operator("Tj")

    old_width = string_width(s[ms:me], wtbl, size, fb)
    new_width = string_width(replace, wtbl, size, fb)
    shift = old_width - new_width
    if align != "right":
        shift = shift / 2.0
    # TJ adjustments are in thousandths of the font size, with inverted sign.
    adjustment = -shift * 1000.0 / size
    return [Array([adjustment, String(new_text.encode("latin-1"))])], Operator("TJ")
213
+
214
+
215
# Lower-cased family names accepted as standard fonts when not embedded.
STD14 = {"helvetica", "times", "courier", "symbol", "zapfdingbats"}


def font_safe(page, fontname, replace, wtbl):
    """Decide whether `replace` can be shown in `fontname` without garbling.

    The font dictionary is looked up in the page's /Resources /Font by base
    name (subset prefix ignored) or by family.
    - Embedded font (any /FontFile* in its descriptor): safe only when every
      non-space character of `replace` is already a key of `wtbl`.
    - Not embedded: safe only when the family is in STD14 and /Encoding is not
      a dictionary.

    Returns (bool, reason).
    """
    family = base_family_lc(fontname)
    try:
        page_fonts = page.get("/Resources", {}).get("/Font", {})
    except Exception:
        page_fonts = {}

    wanted_name = strip_subset(fontname or "").lower()
    font_dict = None
    for candidate in dict(page_fonts).values():
        base = strip_subset(str(candidate.get("/BaseFont", "")).lstrip("/"))
        if not base:
            continue
        if base.lower() == wanted_name or base_family_lc(base) == family:
            font_dict = candidate
            break

    is_embedded = False
    custom_encoding = False
    if font_dict is not None:
        descriptor = font_dict.get("/FontDescriptor", {})
        for key in ("/FontFile", "/FontFile2", "/FontFile3"):
            if key in descriptor:
                is_embedded = True
                break
        # a dictionary-valued /Encoding is treated as a custom encoding
        custom_encoding = isinstance(font_dict.get("/Encoding", None), pikepdf.Dictionary)

    if is_embedded:
        missing = [c for c in replace if c not in wtbl and c != " "]
        if missing:
            return False, f"embedded subset font missing glyph(s) {missing!r}"
        return True, "embedded; all replacement glyphs already present"

    if family in STD14 and not custom_encoding:
        return True, "non-embedded standard-14 font with standard encoding"

    suffix = " and uses a custom encoding" if custom_encoding else ""
    return False, (f"non-embedded font {fontname!r} (family {family!r}) is not a "
                   f"standard-14 font{suffix}")
251
+
252
+
253
def base_family_lc(n):
    """Family name of font `n`, lower-cased and with spaces removed."""
    family = base_family(strip_subset(n or ""))
    return family.lower().replace(" ", "")


def strip_subset(n):
    """Drop a leading six-capital-letter subset tag ("ABCDEF+") from a font name."""
    name = n or ""
    tag = re.match(r"[A-Z]{6}\+", name)
    return name[tag.end():] if tag else name


def base_family(n):
    """Family part of a font name: the text before the first '-' or ',', with
    trailing style words (bold, italic, oblique, regular, mt, ps) removed.
    If that removes everything, the untrimmed first part is returned."""
    stem = re.split(r"[-,]", strip_subset(n), maxsplit=1)[0]
    trimmed = re.sub(r"(?i)(bold|italic|oblique|regular|mt|ps)+$", "", stem).strip()
    return trimmed if trimmed else strip_subset(stem)
261
+
262
+
263
def process(input_path, output_path, find, replace, page=None, scope="first",
            ci=False, word=False, align="auto"):
    """Replace `find` with `replace` inside the PDF's content streams.

    Args:
        input_path: PDF to read; never written to.
        output_path: where the edited PDF is saved; must differ from the input.
        find: literal text to look for (must not be empty).
        replace: new text; an empty string deletes the match.
        page: 1-based page number to restrict the search to, or None for all.
        scope: "first", "all", or an int N selecting the Nth match (N >= 1).
        ci: ignore case when matching.
        word: match whole words only.
        align: "auto" to detect alignment per match, or "left" / "right" /
            "center" to force it.

    Returns:
        A status dict. Success: ``ok`` True plus ``count``, ``page``, ``font``,
        ``size_pt``, ``align``, ``align_votes``, ``scope``, ``replaced`` and
        ``output``. Refusal: ``ok`` False, ``refused`` True and a ``reason``.
        Any other failure: ``ok`` False and an ``error`` message.
    """
    if os.path.abspath(input_path) == os.path.abspath(output_path):
        return {"ok": False, "error": "refusing to overwrite the input"}
    if not find:
        return {"ok": False, "error": "empty search text"}

    # Every rewrite path encodes the new string as Latin-1, so text outside
    # that range can never be written. Refuse up front instead of raising
    # UnicodeEncodeError halfway through the edit.
    try:
        replace.encode("latin-1")
    except UnicodeEncodeError:
        return {"ok": False, "refused": True,
                "reason": f"replacement {replace!r} contains characters outside "
                          "Latin-1, which the single-byte rewrite cannot encode"}

    nth = scope if isinstance(scope, int) else None
    if nth is not None and nth < 1:
        # 0 or a negative number would index the match list from the end
        return {"ok": False,
                "error": f"scope must be 'first', 'all', or a positive match "
                         f"number; got {scope!r}"}

    # 1) locate all matches (across pages), measure font/alignment per match
    targets = []  # {pi, xy, align, amap, size, wtbl, fb, fontname}
    with pdfplumber.open(input_path) as plumbed:
        page_count = len(plumbed.pages)
        if page is not None and not 1 <= page <= page_count:
            return {"ok": False,
                    "error": f"page {page} is out of range (document has "
                             f"{page_count} page(s))"}
        pages = range(page_count) if page is None else [page - 1]
        for pi in pages:
            pg = plumbed.pages[pi]
            words = pg.extract_words(use_text_flow=True)
            for (x0, top, x1, bottom) in locate_boxes(words, find, ci, word):
                al, amap = (align, {})
                if align == "auto":
                    al, amap = detect_alignment(
                        {"x0": x0, "top": top, "x1": x1, "bottom": bottom, "text": find}, words)
                # characters inside the match box give the font name and size
                chars = [c for c in pg.chars if x0 - 1 <= c["x0"] <= x1 + 1
                         and top - 1 <= c["top"] <= bottom + 1]
                fontname = chars[0].get("fontname") if chars else None
                size = chars[0].get("size") if chars else 10.0
                wtbl = unit_widths(pg, fontname) if fontname else {}
                # fallback width for unmeasured characters: mean of the measured ones
                fb = (sum(wtbl.values()) / len(wtbl)) if wtbl else 0.5
                targets.append({"pi": pi, "xy": (x0, pg.height - bottom), "align": al,
                                "amap": amap, "size": size or 10.0, "wtbl": wtbl,
                                "fb": fb, "fontname": fontname})
    if not targets:
        return {"ok": False, "error": f"text {find!r} not found"}
    if nth is not None:
        if nth > len(targets):
            return {"ok": False, "error": f"{find!r} has only {len(targets)} match(es); "
                                          f"asked for #{nth}"}
        targets = [targets[nth - 1]]
    elif scope != "all":
        targets = [targets[0]]  # "first"

    # The context manager closes the document on every exit path, including
    # the refusal and failure returns below.
    with pikepdf.open(input_path) as pdf:
        # 2) FONT-SAFETY GATE: refuse if any selected match uses a font we can't reproduce
        for t in targets:
            safe, reason = font_safe(pdf.pages[t["pi"]], t["fontname"], replace, t["wtbl"])
            if not safe:
                return {"ok": False, "refused": True, "font": t["fontname"], "reason": reason,
                        "hint": "non-embedded/exotic or custom-encoded font; new text can "
                                "garble, so this is the detect-and-refuse path"}

        # 3) rewrite each selected match, one after another
        count = 0
        for t in targets:
            ok, _ = rewrite_stream(pdf, pdf.pages[t["pi"]], find, replace, t["xy"],
                                   t["align"], t["size"], t["wtbl"], t["fb"], ci, word)
            if ok:
                count += 1
        if count == 0:
            return {"ok": False,
                    "error": f"{find!r} located visually but not found as an editable "
                             "Tj/TJ string (split/encoded run)"}
        pdf.save(output_path, fix_metadata_version=False, deterministic_id=False)

    first = targets[0]
    return {"ok": True, "page": first["pi"] + 1, "font": first["fontname"],
            "size_pt": round(first["size"], 2), "align": first["align"],
            "align_votes": first["amap"], "scope": scope, "count": count,
            "replaced": f"{find!r} -> {replace!r}", "output": output_path}
329
+
330
+
331
def apply_rules(input_path, output_path, rules):
    """Apply many rules in order, chaining each successful result into the next.

    Args:
        input_path: PDF to read; never written to.
        output_path: where the final PDF is copied. If it is the same file as
            the input, nothing is written and every rule is reported with the
            error "refusing to overwrite the input".
        rules: list of {"find","replace","scope"?,"ci"?,"word"?,"page"?,"align"?}.

    Returns:
        {"applied": int, "total": int, "all_applied": bool, "rules": [...]},
        with one report entry per rule. Rules that cannot be applied are
        reported and skipped. When no rule applies, the output is a copy of
        the input.
    """
    same_file = os.path.abspath(input_path) == os.path.abspath(output_path)
    report = []
    applied = 0
    cur = input_path
    # Intermediate PDFs live in a private temporary directory. That avoids the
    # race inherent in tempfile.mktemp() and guarantees cleanup even when a
    # rule raises.
    with tempfile.TemporaryDirectory(prefix="pdfblah-") as workdir:
        for step, rule in enumerate(rules):
            find = rule["find"]
            replace = rule.get("replace", "")
            scope = rule.get("scope", "first")
            # isdecimal(), unlike isdigit(), only accepts what int() can parse
            if isinstance(scope, str) and scope.isdecimal():
                scope = int(scope)
            if same_file:
                r = {"ok": False, "error": "refusing to overwrite the input"}
            else:
                nxt = os.path.join(workdir, f"step-{step}.pdf")
                r = process(cur, nxt, find, replace, rule.get("page"), scope,
                            bool(rule.get("ci")), bool(rule.get("word")),
                            rule.get("align", "auto"))
            entry = {"find": find, "replace": replace, "applied": bool(r.get("ok")),
                     "scope": scope}
            for k in ("count", "font", "reason", "error", "refused", "page"):
                if k in r:
                    entry[k] = r[k]
            report.append(entry)
            if r.get("ok"):
                cur = nxt
                applied += 1
        if not same_file:
            shutil.copyfile(cur, output_path)
    return {"applied": applied, "total": len(rules),
            "all_applied": applied == len(rules), "rules": report}
358
+
359
+
360
def parse_flags(s):
    """Parse the whitespace-separated FLAGS field of a rule.

    Recognised flags (case-insensitive): ``all``, ``first``, a decimal number
    N (Nth match), ``ci``, ``word`` and ``pN`` (page N). Unknown flags are
    ignored. `s` may be None or empty.

    Returns a dict with "scope", "ci" and "word", plus "page" when a ``pN``
    flag is present.
    """
    out = {"scope": "first", "ci": False, "word": False}
    for flag in (s or "").strip().lower().split():
        if flag == "all":
            out["scope"] = "all"
        elif flag == "first":
            out["scope"] = "first"
        elif flag.isdecimal():
            # isdecimal(), not isdigit(): isdigit() also accepts characters
            # such as superscript digits, which int() rejects with ValueError.
            out["scope"] = int(flag)
        elif flag == "ci":
            out["ci"] = True
        elif flag == "word":
            out["word"] = True
        elif re.fullmatch(r"p\d+", flag):
            out["page"] = int(flag[1:])
    return out
370
+
371
+
372
def parse_rules_file(text):
    """Turn rules-file text into a list of rule dicts.

    Each line is 'FIND | REPLACE | FLAGS'. Blank lines, lines starting with
    '#', and lines whose FIND part is empty are skipped. REPLACE defaults to
    "" and FLAGS is handed to parse_flags (all/first/N/ci/word/pN).
    """
    parsed = []
    for line in text.splitlines():
        stripped = line.strip()
        if stripped == "" or stripped.startswith("#"):
            continue
        # pad so a line with fewer than three fields still unpacks
        find_part, replace_part, flags_part = (stripped.split("|") + ["", ""])[:3]
        needle = find_part.strip()
        if needle == "":
            continue
        entry = {"find": needle, "replace": replace_part.strip()}
        entry.update(parse_flags(flags_part))
        parsed.append(entry)
    return parsed
@@ -0,0 +1,39 @@
1
[build-system]
# `license-files` below is written as a list of glob patterns (the final
# PEP 639 form), so declare the Hatchling release that supports it.
requires = ["hatchling>=1.26"]
build-backend = "hatchling.build"

[project]
name = "pdfblah"
version = "0.1.0"
description = "Real find and replace on the actual text in a PDF. No overlay, metadata preserved."
readme = "README.md"
requires-python = ">=3.9"
license = "MIT"
license-files = ["LICENSE"]
authors = [{ name = "Kuvop LLC" }]
keywords = ["pdf", "find", "replace", "edit", "text", "cli", "redact", "pdf-editor", "acrobat-alternative"]
classifiers = [
    "Programming Language :: Python :: 3",
    "Operating System :: POSIX",
    "Operating System :: MacOS",
    "Environment :: Console",
    "Topic :: Text Processing",
    "Topic :: Utilities",
    "Intended Audience :: Developers",
    "Intended Audience :: End Users/Desktop",
]
dependencies = ["pdfplumber>=0.11", "pikepdf>=8"]

[project.optional-dependencies]
# Installed by CI with `pip install -e ".[test]"`.
test = ["pytest", "reportlab"]

[project.urls]
Homepage = "https://pdfblah.com"
Source = "https://github.com/KuvopLLC/pdfblah"
Issues = "https://github.com/KuvopLLC/pdfblah/issues"

[project.scripts]
pdfblah = "pdfblah.cli:main"

[tool.hatch.build.targets.wheel]
packages = ["pdfblah"]
@@ -0,0 +1,116 @@
1
+ import pdfplumber
2
+ import pikepdf
3
+ import pytest
4
+ from reportlab.pdfgen import canvas
5
+
6
+ from pdfblah import process, apply_rules, parse_rules_file, parse_flags, font_safe
7
+
8
+
9
def make_pdf(path, lines, font="Helvetica", size=16):
    """Write a one-page 400x300 PDF with one string per line, starting at
    y=250 and stepping down 40 per line at x=40. Returns the path as a str."""
    target = str(path)
    sheet = canvas.Canvas(target, pagesize=(400, 300))
    sheet.setFont(font, size)
    y = 250
    for text in lines:
        sheet.drawString(40, y, text)
        y -= 40
    sheet.save()
    return target
16
+
17
+
18
def alltext(path):
    """Extracted text of every page, joined with newlines (empty pages give "")."""
    with pdfplumber.open(str(path)) as doc:
        page_texts = [page.extract_text() or "" for page in doc.pages]
    return "\n".join(page_texts)
21
+
22
+
23
@pytest.fixture
def reps(tmp_path):
    """Path to a PDF in which "apple" appears in several forms: standalone,
    capitalised, and inside "pineapple"."""
    lines = [
        "apple one",
        "apple two",
        "Apple three",
        "pineapple four",
        "apple five",
    ]
    return make_pdf(tmp_path / "in.pdf", lines)
27
+
28
+
29
def test_scope_first(reps, tmp_path):
    """scope="first" edits exactly one occurrence."""
    out_pdf = str(tmp_path / "o.pdf")
    result = process(reps, out_pdf, "apple", "APRICOT", scope="first")
    assert result["ok"] and result["count"] == 1
    assert alltext(out_pdf).count("APRICOT") == 1
34
+
35
+
36
def test_scope_all(reps, tmp_path):
    """scope="all" edits every case-sensitive occurrence."""
    out_pdf = str(tmp_path / "o.pdf")
    result = process(reps, out_pdf, "apple", "APRICOT", scope="all")
    assert result["count"] == 4
    assert alltext(out_pdf).count("APRICOT") == 4
41
+
42
+
43
def test_scope_nth(reps, tmp_path):
    """A numeric scope edits that occurrence only and leaves the others alone."""
    o = str(tmp_path / "o.pdf")
    r = process(reps, o, "apple", "APRICOT", scope=2)
    assert r["ok"] and r["count"] == 1
    text = alltext(o)
    assert text.count("APRICOT") == 1
    # The 2nd case-sensitive match is on the second line; checking the text
    # catches an edit applied to the wrong occurrence, which count alone cannot.
    assert "APRICOT two" in text
    assert "apple one" in text
47
+
48
+
49
def test_ignore_case(reps, tmp_path):
    """ci=True also matches the capitalised "Apple three" line."""
    out_pdf = str(tmp_path / "o.pdf")
    result = process(reps, out_pdf, "apple", "APRICOT", scope="all", ci=True)
    assert result["count"] == 5
53
+
54
+
55
def test_whole_word(reps, tmp_path):
    """word=True leaves "pineapple" untouched."""
    out_pdf = str(tmp_path / "o.pdf")
    result = process(reps, out_pdf, "apple", "APRICOT", scope="all", word=True)
    assert result["count"] == 3
    assert "pineapple" in alltext(out_pdf)
60
+
61
+
62
def test_delete(reps, tmp_path):
    """An empty replacement removes the matched text."""
    out_pdf = str(tmp_path / "o.pdf")
    result = process(reps, out_pdf, "apple", "", scope="all", word=True)
    assert result["ok"]
    assert "apple one" not in alltext(out_pdf)
67
+
68
+
69
def test_not_found(reps, tmp_path):
    """Searching for absent text fails with a "not found" error."""
    result = process(reps, str(tmp_path / "o.pdf"), "banana", "x")
    assert not result["ok"]
    assert "not found" in result["error"]
72
+
73
+
74
def test_no_overwrite(reps):
    """Using the input path as the output path is rejected."""
    result = process(reps, reps, "apple", "x")
    assert not result["ok"]
77
+
78
+
79
def test_metadata_preserved(reps, tmp_path):
    """/Producer and /CreationDate in DocInfo are unchanged after an edit."""
    def docinfo_of(pdf_path):
        with pikepdf.open(pdf_path) as doc:
            return {str(k): str(v) for k, v in (doc.docinfo or {}).items()}

    out_pdf = str(tmp_path / "o.pdf")
    before = docinfo_of(reps)
    process(reps, out_pdf, "apple", "APRICOT", scope="all")
    after = docinfo_of(out_pdf)
    for key in ("/Producer", "/CreationDate"):
        if key not in before:
            continue
        assert before[key] == after[key]
89
+
90
+
91
def test_font_safe_refuses_exotic():
    """A non-embedded font outside the standard-14 families is refused."""
    page = {"/Resources": {"/Font": {"/F1": {"/BaseFont": "/C0A075B0.afm"}}}}
    verdict, reason = font_safe(page, "C0A075B0.afm", "NEW", {})
    assert verdict is False
    assert "standard-14" in reason
95
+
96
+
97
def test_font_safe_accepts_standard14():
    """A non-embedded Helvetica with no custom encoding is accepted."""
    page = {"/Resources": {"/Font": {"/F1": {"/BaseFont": "/Helvetica"}}}}
    verdict, _reason = font_safe(page, "Helvetica", "NEW", {})
    assert verdict is True
101
+
102
+
103
def test_parse_flags():
    """Flags map to scope / ci / word / page, with defaults when empty."""
    defaults = {"scope": "first", "ci": False, "word": False}
    assert parse_flags("all ci") == {"scope": "all", "ci": True, "word": False}
    assert parse_flags("3 word") == {"scope": 3, "ci": False, "word": True}
    assert parse_flags("p2")["page"] == 2
    assert parse_flags("") == defaults
108
+
109
+
110
def test_rules_file(reps, tmp_path):
    """Rules are parsed from text and applied in order; a rule that finds
    nothing is skipped rather than failing the batch."""
    source = "\n".join([
        "# comment",
        "apple | APRICOT | all",
        "Apple | X | ci",
        "banana | y",
    ])
    rules = parse_rules_file(source)
    assert len(rules) == 3
    assert rules[0]["scope"] == "all"

    out_pdf = str(tmp_path / "o.pdf")
    report = apply_rules(reps, out_pdf, rules)
    # "banana" is not in the document, so only two of the three rules apply
    assert report["applied"] == 2
    assert report["total"] == 3