pdfblah 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdfblah/__init__.py +20 -0
- pdfblah/cli.py +62 -0
- pdfblah/engine.py +387 -0
- pdfblah-0.1.0.dist-info/METADATA +108 -0
- pdfblah-0.1.0.dist-info/RECORD +8 -0
- pdfblah-0.1.0.dist-info/WHEEL +4 -0
- pdfblah-0.1.0.dist-info/entry_points.txt +2 -0
- pdfblah-0.1.0.dist-info/licenses/LICENSE +21 -0
pdfblah/__init__.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""pdfblah: real find and replace on the actual text in a PDF.
|
|
2
|
+
|
|
3
|
+
Rewrites the real text in the content stream (no overlay, no watermark), preserves
|
|
4
|
+
all original metadata, auto-detects alignment, and refuses fonts it cannot
|
|
5
|
+
reproduce instead of garbling them.
|
|
6
|
+
"""
|
|
7
|
+
from .engine import (
|
|
8
|
+
process,
|
|
9
|
+
apply_rules,
|
|
10
|
+
parse_rules_file,
|
|
11
|
+
parse_flags,
|
|
12
|
+
font_safe,
|
|
13
|
+
detect_alignment,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
__version__ = "0.1.0"
|
|
17
|
+
__all__ = [
|
|
18
|
+
"process", "apply_rules", "parse_rules_file", "parse_flags",
|
|
19
|
+
"font_safe", "detect_alignment",
|
|
20
|
+
]
|
pdfblah/cli.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""Command-line interface for pdfblah."""
|
|
2
|
+
import argparse
|
|
3
|
+
import json
|
|
4
|
+
import sys
|
|
5
|
+
|
|
6
|
+
from .engine import process, apply_rules, parse_rules_file
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def main(argv=None):
    """Run the pdfblah command line and return the process exit code.

    Args:
        argv: Argument list to parse. ``None`` lets argparse read
            ``sys.argv[1:]``.

    Returns:
        Rules mode (``--rules``): 0 when at least one rule was applied, 1 when
        none was, 2 when the rules file cannot be read or holds no rules.
        Single mode (``--find``): 0 on success, 3 when the engine refused the
        edit, 1 on any other failure.

    Raises:
        SystemExit: via ``argparse`` (status 2) for an invalid command line,
            including a missing ``--find`` or an invalid ``--scope``.
    """
    ap = argparse.ArgumentParser(
        prog="pdfblah",
        description="Real find and replace on the actual text in a PDF. "
                    "No overlay, metadata preserved, alignment auto-detected.")
    ap.add_argument("input", help="input PDF")
    ap.add_argument("output", help="output PDF (never overwrites the input)")
    ap.add_argument("--find", help="text to find")
    ap.add_argument("--replace", default="", help="replacement text (empty deletes the text)")
    ap.add_argument("--rules", metavar="FILE",
                    help="apply many rules from a file, one 'FIND | REPLACE | FLAGS' per line")
    ap.add_argument("--scope", default="first", metavar="WHICH",
                    help="first (default), all, or a number for the Nth match")
    ap.add_argument("--ci", action="store_true", help="ignore case")
    ap.add_argument("--word", action="store_true", help="whole word only")
    ap.add_argument("--page", type=int, help="limit to this page number")
    ap.add_argument("--json", action="store_true", help="print the full JSON report")
    a = ap.parse_args(argv)

    if a.rules:
        # Rules mode takes precedence; --find/--replace/--scope are not used here.
        try:
            with open(a.rules, encoding="utf-8") as fh:
                rules_text = fh.read()
        except (OSError, UnicodeDecodeError) as exc:
            # Report an unreadable rules file as a usage-level failure instead
            # of letting the traceback escape.
            print(f"cannot read rules file {a.rules}: {exc}", file=sys.stderr)
            return 2
        rules = parse_rules_file(rules_text)
        if not rules:
            print(f"no rules found in {a.rules}", file=sys.stderr)
            return 2
        rep = apply_rules(a.input, a.output, rules)
        if a.json:
            print(json.dumps(rep, indent=2))
        else:
            print(f"{rep['applied']}/{rep['total']} rules applied -> {a.output}")
            for r in rep["rules"]:
                mark = "ok  " if r["applied"] else "skip"
                extra = f" (x{r['count']})" if r.get("count") else ""
                why = "" if r["applied"] else "  " + (r.get("reason") or r.get("error") or "not applied")
                print(f"  [{mark}] {r['find']!r} -> {r['replace']!r}{extra}{why}")
        return 0 if rep["applied"] else 1

    if not a.find:
        ap.error("give --find (with --replace), or --rules FILE")

    # Validate --scope up front. str.isdecimal() (unlike isdigit()) only accepts
    # characters int() can parse, so input such as "²" is rejected cleanly.
    scope = a.scope.strip().lower()
    if scope.isdecimal():
        scope = int(scope)
        if scope < 1:
            ap.error("--scope number must be 1 or greater")
    elif scope not in ("first", "all"):
        ap.error(f"--scope must be 'first', 'all', or a positive number, not {a.scope!r}")

    r = process(a.input, a.output, a.find, a.replace, a.page, scope, a.ci, a.word)
    if a.json:
        print(json.dumps(r, indent=2))
    elif r.get("ok"):
        print(f"replaced {r['count']} match(es) of {a.find!r} -> {a.output}")
    elif r.get("refused"):
        print(f"refused: {r.get('reason')}", file=sys.stderr)
    else:
        print(f"failed: {r.get('error')}", file=sys.stderr)
    return 0 if r.get("ok") else (3 if r.get("refused") else 1)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
if __name__ == "__main__":
|
|
62
|
+
sys.exit(main())
|
pdfblah/engine.py
ADDED
|
@@ -0,0 +1,387 @@
|
|
|
1
|
+
"""pdfblah engine: real in-content-stream PDF text replacement.
|
|
2
|
+
|
|
3
|
+
Rewrites the actual Tj/TJ string operands in the page content stream, so the old
|
|
4
|
+
text is gone (pdftotext, Ctrl-F, and copy show only the new value). No overlay, no
|
|
5
|
+
watermark. Alignment is auto-detected and preserved. All original metadata
|
|
6
|
+
(DocInfo + XMP, including dates and Producer) is kept verbatim. Fonts we cannot
|
|
7
|
+
reproduce (non-embedded exotic or custom-encoded) are refused rather than garbled.
|
|
8
|
+
|
|
9
|
+
Library use:
|
|
10
|
+
from pdfblah import process, apply_rules, parse_rules_file
|
|
11
|
+
process("in.pdf", "out.pdf", "OLD", "NEW", scope="all", ci=True)
|
|
12
|
+
"""
|
|
13
|
+
import os, re, json, tempfile, shutil
|
|
14
|
+
import pikepdf
|
|
15
|
+
from pikepdf import Operator, String, Array
|
|
16
|
+
import pdfplumber
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# ---- alignment auto-detection (proven on the trade-confirmation PDF) ----------
|
|
20
|
+
def detect_alignment(target, words, tol=1.2, vband=300):
    """Guess the horizontal alignment of `target` from nearby words.

    Every word whose ``top`` lies within `vband` of the target's ``top`` votes
    for an alignment when its left edge, right edge, or horizontal centre is
    within `tol` of the target's.

    Args:
        target: dict with ``x0``, ``x1``, ``top`` and ``text``.
        words: iterable of dicts with ``x0``, ``x1`` and ``top``.
        tol: maximum edge/centre distance for a vote.
        vband: maximum vertical distance for a word to take part.

    Returns:
        ``(align, votes)`` where `align` is ``"left"``, ``"right"`` or
        ``"center"`` and `votes` maps each of those names to its count.
    """
    tx0, tx1, ttop = target["x0"], target["x1"], target["top"]
    tcx = (tx0 + tx1) / 2
    band = [w for w in words if abs(w["top"] - ttop) <= vband]
    m = {
        "left": sum(1 for w in band if abs(w["x0"] - tx0) <= tol),
        "right": sum(1 for w in band if abs(w["x1"] - tx1) <= tol),
        "center": sum(1 for w in band if abs((w["x0"] + w["x1"]) / 2 - tcx) <= tol),
    }
    # max() keeps the first key on ties, so the priority is left, right, center.
    align = max(m, key=m.get)
    if m[align] < 2:
        # Fewer than two votes is no evidence at all; fall back to left.
        align = "left"
    elif (align == "left" and m["right"] == m["left"]
          and any(c.isdigit() for c in target["text"])):
        # Left and right tie for the lead: text containing digits goes right.
        # Requiring align == "left" keeps a winning "center" vote from being
        # overridden just because left and right happen to be equal.
        align = "right"
    return align, m
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# ---- per-character advance widths, taken from the REAL font as rendered --------
|
|
37
|
+
def unit_widths(page, fontname):
    """Build a ``char -> advance per 1pt of font size`` table for one font.

    Only entries of ``page.chars`` whose ``fontname`` equals `fontname` are
    used. They are ordered by rounded ``top`` and then ``x0``; for each
    neighbouring pair on the same line the ``x0`` difference divided by the
    first character's ``size`` is one sample for that character. The table
    stores the (upper) median sample per character.
    """
    same_font = [ch for ch in page.chars if ch.get("fontname") == fontname]
    same_font.sort(key=lambda ch: (round(ch["top"]), ch["x0"]))

    ratios = {}
    for cur, nxt in zip(same_font, same_font[1:]):
        if abs(cur["top"] - nxt["top"]) > 1:
            # Neighbours sit on different lines: no advance to measure.
            continue
        step = nxt["x0"] - cur["x0"]
        pt = cur.get("size") or 0
        implausible = step <= 0 or step > pt * 3
        if pt <= 0 or implausible:
            continue
        ratios.setdefault(cur["text"], []).append(step / pt)

    return {glyph: sorted(seen)[len(seen) // 2] for glyph, seen in ratios.items()}
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def string_width(s, tbl, size, fallback):
    """Return the width of `s` at font size `size`.

    Each character contributes its per-point advance from `tbl`, or `fallback`
    when the character is not in the table; the total is scaled by `size`.
    """
    units = 0
    for glyph in s:
        units += tbl.get(glyph, fallback)
    return units * size
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def matcher(find, ci=False, word=False):
    """Compile a regex that matches `find` literally.

    Args:
        find: text to match; it is passed through ``re.escape`` so regex
            metacharacters have no special meaning.
        ci: match case-insensitively.
        word: only match when `find` is not directly preceded or followed by a
            letter or digit.

    Returns:
        The compiled pattern.
    """
    pat = re.escape(find)
    if word:
        # [^\W_] is "a word character other than underscore", i.e. any Unicode
        # letter or digit. An ASCII-only class would let "Caf" match inside
        # "Café"; underscore still counts as a boundary, as before.
        pat = r"(?<![^\W_])" + pat + r"(?![^\W_])"
    return re.compile(pat, re.IGNORECASE if ci else 0)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def locate_boxes(words, find, ci=False, word=False):
    """Return the boxes where `find` occurs among `words`, in word order.

    Two strategies are tried:

    1. A match inside a single word (first match per word). The box is the
       slice of the word's box proportional to the matched character range,
       so the rewrite targets the match rather than the start of a longer word.
    2. Only if (1) finds nothing: a run of consecutive words on one line
       (``top`` within 3 of the run's first word) whose texts, joined by single
       spaces, equal `find`. This phrase comparison honours `ci` but does not
       apply the `word` lookarounds.

    Args:
        words: sequence of dicts with ``text``, ``x0``, ``x1``, ``top``,
            ``bottom``.
        find: literal text to look for. An empty value yields no boxes.
        ci: ignore case.
        word: whole-word matching for strategy 1.

    Returns:
        ``[(x0, top, x1, bottom), ...]``.
    """
    if not find:
        # An empty pattern would match at offset 0 of every word and produce a
        # zero-width box per word, which is never what the caller wants.
        return []

    rx = matcher(find, ci, word)
    boxes = []
    for w in words:
        m = rx.search(w["text"])
        if not m:
            continue
        n = len(w["text"]) or 1
        span = w["x1"] - w["x0"]
        sx0 = w["x0"] + span * (m.start() / n)
        sx1 = w["x0"] + span * (m.end() / n)
        boxes.append((sx0, w["top"], sx1, w["bottom"]))
    if boxes:
        return boxes

    # Multi-word phrase: grow a run from each start word until it equals the
    # phrase, leaves the line, or stops being a prefix of the phrase.
    fl = find.lower() if ci else find
    n = len(words)
    for i in range(n):
        acc = ""
        for j in range(i, n):
            if j > i and abs(words[j]["top"] - words[i]["top"]) > 3:
                break
            acc = words[j]["text"] if j == i else acc + " " + words[j]["text"]
            a = acc.lower() if ci else acc
            if a == fl:
                grp = words[i:j + 1]
                boxes.append((min(w["x0"] for w in grp), min(w["top"] for w in grp),
                              max(w["x1"] for w in grp), max(w["bottom"] for w in grp)))
                break
            if not fl.startswith(a):
                break
    return boxes
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
# ---- content-stream rewrite (position-matched to the located instance) --------
|
|
107
|
+
def _mul(m, n):
    """Multiply two 6-number affine matrices ``(a, b, c, d, e, f)``: ``m`` x ``n``.

    The first four numbers are the 2x2 linear part, the last two the
    translation; the result is returned in the same 6-tuple layout.
    """
    m_a, m_b, m_c, m_d, m_e, m_f = m
    n_a, n_b, n_c, n_d, n_e, n_f = n
    linear = (
        m_a * n_a + m_b * n_c,
        m_a * n_b + m_b * n_d,
        m_c * n_a + m_d * n_c,
        m_c * n_b + m_d * n_d,
    )
    shift = (
        m_e * n_a + m_f * n_c + n_e,
        m_e * n_b + m_f * n_d + n_f,
    )
    return linear + shift
|
|
111
|
+
|
|
112
|
+
def _op_text(operands, so):
    """Return the text shown by a text-showing operator, decoded as latin-1.

    For ``TJ`` the string elements of the array operand are concatenated and
    everything else in the array is ignored. For ``"`` the string is the third
    operand; for ``Tj`` and ``'`` it is the first.
    """
    if so == "TJ":
        pieces = []
        for item in operands[0]:
            if isinstance(item, (String, bytes)):
                pieces.append(bytes(item).decode("latin-1"))
        return "".join(pieces)
    raw = operands[2] if so == "\"" else operands[0]
    return bytes(raw).decode("latin-1")
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def rewrite_stream(pdf, page, find, replace, target_xy, align, size, wtbl, fb,
                   ci=False, word=False, tol=14.0):
    """Replace the one `find` match in the page content nearest to `target_xy`.

    The content stream is walked while tracking the current transformation
    matrix (``q``/``Q``/``cm``) and the text and text-line matrices
    (``BT``/``Td``/``TD``/``Tm``/``T*``/``TL``). For every match inside a
    ``Tj``/``TJ``/``'``/``"`` operand, its position is taken as the operator's
    origin plus the measured width of the text before the match. The match
    closest to `target_xy` (``(x, y_from_bottom)``) wins, provided it is within
    `tol`.

    Case-insensitive and whole-word matching follow `ci` and `word`.

    Returns:
        ``(True, dist)`` when the page's ``Contents`` was replaced, otherwise
        ``(False, dist)`` or ``(False, None)`` when nothing matched at all.
    """
    pattern = matcher(find, ci, word)
    identity = (1, 0, 0, 1, 0, 0)
    ctm = identity
    saved_ctms = []
    tm = identity
    tlm = identity
    leading = 0.0
    want_x, want_y = target_xy
    winner = None  # (dist, instruction index, operator, match start, match end)
    instructions = list(pikepdf.parse_content_stream(page))

    for pos, (operands, op) in enumerate(instructions):
        name = str(op)
        if name == "q":
            saved_ctms.append(ctm)
        elif name == "Q":
            if saved_ctms:
                ctm = saved_ctms.pop()
        elif name == "cm":
            ctm = _mul(tuple(float(v) for v in operands), ctm)
        elif name == "BT":
            tm = tlm = identity
        elif name == "TL":
            leading = float(operands[0])
        elif name in ("Td", "TD"):
            dx, dy = float(operands[0]), float(operands[1])
            if name == "TD":
                leading = -dy
            tlm = _mul((1, 0, 0, 1, dx, dy), tlm)
            tm = tlm
        elif name == "Tm":
            tm = tlm = tuple(float(v) for v in operands)
        elif name == "T*":
            tlm = _mul((1, 0, 0, 1, 0, -leading), tlm)
            tm = tlm
        elif name in ("Tj", "TJ", "'", "\""):
            if name in ("'", "\""):
                # These operators move to the next line before showing text.
                tlm = _mul((1, 0, 0, 1, 0, -leading), tlm)
                tm = tlm
            placed = _mul(tm, ctm)
            x, y = placed[4], placed[5]
            shown = _op_text(operands, name)
            for hit in pattern.finditer(shown):
                hit_x = x + string_width(shown[:hit.start()], wtbl, size, fb)
                gap = ((hit_x - want_x) ** 2 + (y - want_y) ** 2) ** 0.5
                if winner is None or gap < winner[0]:
                    winner = (gap, pos, name, hit.start(), hit.end())

    if winner is None:
        return (False, None)
    gap, pos, name, start, end = winner
    if gap > tol:
        return (False, gap)

    operands, _ = instructions[pos]
    patched = _rewrite_one(operands, name, start, end, replace, align, size, wtbl, fb)
    if patched is None:  # span crosses TJ elements -> unsupported
        return (False, gap)
    instructions[pos] = patched
    page.Contents = pdf.make_stream(pikepdf.unparse_content_stream(instructions))
    return (True, gap)
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def _rewrite_one(operands, so, ms, me, replace, align, size, wtbl, fb):
    """Rewrite the ``[ms:me]`` span of one operator's shown text with `replace`.

    Args:
        operands: the operator's operands.
        so: operator name: ``"Tj"``, ``"TJ"``, ``"'"`` or ``'"'``.
        ms, me: start/end offsets of the match in the operator's shown text
            (for ``TJ``: in the concatenation of its string elements).
        replace: replacement text.
        align, size, wtbl, fb: passed to ``_patch_span`` for ``Tj``; the other
            operators are patched in place without repositioning.

    Returns:
        ``(new_operands, operator)``, or ``None`` when the edit cannot be
        expressed: the span crosses ``TJ`` string elements, or the new text
        cannot be encoded as latin-1 bytes.
    """
    try:
        if so == "Tj":
            s = bytes(operands[0]).decode("latin-1")
            return _patch_span(s, ms, me, replace, align, size, wtbl, fb)
        if so == "'":
            s = bytes(operands[0]).decode("latin-1")
            payload = (s[:ms] + replace + s[me:]).encode("latin-1")
            return ([String(payload)], Operator("'"))
        if so == "\"":
            # The shown string is the third operand; the first two are kept.
            s = bytes(operands[2]).decode("latin-1")
            payload = (s[:ms] + replace + s[me:]).encode("latin-1")
            return ([operands[0], operands[1], String(payload)], Operator("\""))

        # TJ: map the joined offset back to one string element (skip if it crosses)
        pos = 0
        new_arr = []
        done = False
        for e in operands[0]:
            if not isinstance(e, (String, bytes)):
                new_arr.append(e)
                continue
            es = bytes(e).decode("latin-1")
            if not done and pos <= ms and me <= pos + len(es):
                ls, le = ms - pos, me - pos
                payload = (es[:ls] + replace + es[le:]).encode("latin-1")
                new_arr.append(String(payload))
                done = True
            else:
                new_arr.append(e)
            pos += len(es)
        return ([Array(new_arr)], Operator("TJ")) if done else None
    except UnicodeEncodeError:
        # The replacement contains characters with no latin-1 byte; report the
        # edit as not possible instead of crashing halfway through a rewrite.
        return None
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def _patch_span(s, ms, me, replace, align, size, wtbl, fb):
    """Build the operator that shows `s` with ``[ms:me]`` swapped for `replace`.

    Left alignment keeps the text where it is and returns a plain ``Tj``.
    For right alignment the full width difference between the old span and
    the replacement is compensated, for any other alignment half of it: the
    result is a ``TJ`` whose array starts with that adjustment (in thousandths
    of `size`) followed by the new string.
    """
    patched = String((s[:ms] + replace + s[me:]).encode("latin-1"))
    if align == "left":
        return [patched], Operator("Tj")

    old_w = string_width(s[ms:me], wtbl, size, fb)
    new_w = string_width(replace, wtbl, size, fb)
    slack = old_w - new_w
    if align != "right":
        slack = slack / 2.0
    kern = -slack * 1000.0 / size
    return [Array([kern, patched])], Operator("TJ")
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
STD14 = {"helvetica", "times", "courier", "symbol", "zapfdingbats"}


def font_safe(page, fontname, replace, wtbl):
    """Decide whether NEW text in `fontname` will render faithfully.

    - replacement text that cannot be encoded as latin-1 is refused outright,
      because the rewrite writes string operands as latin-1 bytes.
    - embedded font: safe only if every replace char was already observed
      (a key of `wtbl`); a brand-new glyph may be missing from the subset.
    - non-embedded: safe only for the families listed in ``STD14`` whose
      ``/Encoding`` is not a dictionary. Everything else is refused.

    Args:
        page: page object exposing ``get("/Resources")``.
        fontname: font name as reported for the matched characters.
        replace: replacement text.
        wtbl: char -> width table built for this font; its keys are the
            characters already seen in the font.

    Returns:
        ``(bool, reason)``.
    """
    try:
        replace.encode("latin-1")
    except UnicodeEncodeError:
        return False, ("replacement text contains character(s) that cannot be "
                       "encoded as single-byte (latin-1) string operands")

    fam = base_family_lc(fontname)
    try:
        fonts = page.get("/Resources", {}).get("/Font", {})
    except Exception:
        # Deliberate best effort: without a readable font dictionary the font
        # is treated as not found and the checks below decide.
        fonts = {}

    # Prefer the font whose BaseFont equals the requested name; fall back to
    # the first font of the same family only when no exact match exists.
    # Taking whichever comes first could pick e.g. a bold sibling with a
    # different embedding status.
    wanted = strip_subset(fontname or "").lower()
    exact = None
    same_family = None
    for f in dict(fonts).values():
        base = strip_subset(str(f.get("/BaseFont", "")).lstrip("/"))
        if not base:
            continue
        if base.lower() == wanted:
            exact = f
            break
        if same_family is None and base_family_lc(base) == fam:
            same_family = f
    fd = exact if exact is not None else same_family

    embedded, enc_custom = False, False
    if fd is not None:
        desc = fd.get("/FontDescriptor", {})
        embedded = any(k in desc for k in ("/FontFile", "/FontFile2", "/FontFile3"))
        enc = fd.get("/Encoding", None)
        enc_custom = isinstance(enc, pikepdf.Dictionary)  # has /Differences
    if embedded:
        missing = [c for c in replace if c not in wtbl and c != " "]
        if missing:
            return False, f"embedded subset font missing glyph(s) {missing!r}"
        return True, "embedded; all replacement glyphs already present"
    if fam in STD14 and not enc_custom:
        return True, "non-embedded standard-14 font with standard encoding"
    return False, (f"non-embedded font {fontname!r} (family {fam!r}) is not a "
                   f"standard-14 font{' and uses a custom encoding' if enc_custom else ''}")
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def base_family_lc(n):
    """Return the family of font name `n`, lower-cased and without spaces.

    ``None`` or an empty name is treated as ``""``.
    """
    family = base_family(strip_subset(n or ""))
    return family.lower().replace(" ", "")
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def strip_subset(n):
    """Drop a leading six-capital-letter ``ABCDEF+`` prefix from a font name.

    ``None`` or an empty name yields ``""``.
    """
    name = n or ""
    return re.sub(r"^[A-Z]{6}\+", "", name)
|
|
258
|
+
def base_family(n):
    """Reduce a font name to its family name.

    The subset prefix is removed, everything from the first ``-`` or ``,`` on
    is dropped, and trailing style/vendor suffixes (bold, italic, oblique,
    regular, mt, ps; any case, repeated) are trimmed. If trimming leaves
    nothing, the untrimmed head is returned instead.
    """
    head = re.split(r"[-,]", strip_subset(n))[0]
    trimmed = re.sub(r"(?i)(bold|italic|oblique|regular|mt|ps)+$", "", head).strip()
    if trimmed:
        return trimmed
    return strip_subset(head)
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def process(input_path, output_path, find, replace, page=None, scope="first",
            ci=False, word=False, align="auto"):
    """Replace `find` with `replace` in the content stream and save a new PDF.

    Args:
        input_path: PDF to read. It is never written to.
        output_path: where the edited PDF is saved; must differ from the input.
        find: literal text to look for (must not be empty).
        replace: replacement text; ``""`` deletes the match.
        page: 1-based page number to restrict the search to, or ``None`` for
            every page.
        scope: ``"all"``, an int N >= 1 for the Nth match, or anything else
            (normally ``"first"``) for the first match.
        ci: ignore case.
        word: whole-word matches only.
        align: ``"auto"`` to detect alignment per match, otherwise the
            alignment to use as given.

    Returns:
        A status dict. Success: ``ok=True`` plus ``page``, ``font``,
        ``size_pt``, ``align``, ``align_votes``, ``scope``, ``count``,
        ``replaced`` and ``output`` (details describe the first selected
        match). Refusal by the font-safety gate: ``ok=False``,
        ``refused=True``, ``font``, ``reason``, ``hint``. Any other failure:
        ``ok=False`` and ``error``.
    """
    if os.path.abspath(input_path) == os.path.abspath(output_path):
        return {"ok": False, "error": "refusing to overwrite the input"}
    if not find:
        # An empty pattern would "match" at the start of every word.
        return {"ok": False, "error": "nothing to find: the search text is empty"}
    nth = scope if isinstance(scope, int) else None
    if nth is not None and nth < 1:
        # 0 or a negative N would silently index from the end of the matches.
        return {"ok": False, "error": f"scope must be 1 or greater; got {nth}"}
    if page is not None and page < 1:
        # Same problem for pages: page 0 would select the last page.
        return {"ok": False, "error": f"page must be 1 or greater; got {page}"}

    # 1) locate all matches (across pages), measure font/alignment per match
    targets = []   # {pi, xy, align, amap, size, wtbl, fb, fontname}
    width_tables = {}   # (page index, fontname) -> table, measured once
    with pdfplumber.open(input_path) as plumbed:
        page_count = len(plumbed.pages)
        if page is not None and page > page_count:
            return {"ok": False, "error": f"page {page} is out of range; the document "
                                          f"has {page_count} page(s)"}
        pages = range(page_count) if page is None else [page - 1]
        for pi in pages:
            pg = plumbed.pages[pi]
            words = pg.extract_words(use_text_flow=True)
            for (x0, top, x1, bottom) in locate_boxes(words, find, ci, word):
                al, amap = (align, {})
                if align == "auto":
                    al, amap = detect_alignment(
                        {"x0": x0, "top": top, "x1": x1, "bottom": bottom, "text": find}, words)
                # Font name and size are taken from the first character found
                # inside the box (with 1 unit of slack on each side).
                chars = [c for c in pg.chars if x0 - 1 <= c["x0"] <= x1 + 1
                         and top - 1 <= c["top"] <= bottom + 1]
                fontname = chars[0].get("fontname") if chars else None
                size = chars[0].get("size") if chars else 10.0
                if fontname:
                    key = (pi, fontname)
                    if key not in width_tables:
                        width_tables[key] = unit_widths(pg, fontname)
                    wtbl = width_tables[key]
                else:
                    wtbl = {}
                # Unknown characters fall back to the table's mean width.
                fb = (sum(wtbl.values()) / len(wtbl)) if wtbl else 0.5
                targets.append({"pi": pi, "xy": (x0, pg.height - bottom), "align": al,
                                "amap": amap, "size": size or 10.0, "wtbl": wtbl,
                                "fb": fb, "fontname": fontname})
    if not targets:
        return {"ok": False, "error": f"text {find!r} not found"}
    if nth is not None:
        if nth > len(targets):
            return {"ok": False, "error": f"{find!r} has only {len(targets)} match(es); "
                                          f"asked for #{nth}"}
        targets = [targets[nth - 1]]
    elif scope != "all":
        targets = [targets[0]]   # "first"

    # The context manager closes the PDF on every exit path, including the
    # early returns below.
    with pikepdf.open(input_path) as pdf:
        # 2) FONT-SAFETY GATE — refuse if any selected match uses a font we can't reproduce
        for t in targets:
            safe, reason = font_safe(pdf.pages[t["pi"]], t["fontname"], replace, t["wtbl"])
            if not safe:
                return {"ok": False, "refused": True, "font": t["fontname"], "reason": reason,
                        "hint": "non-embedded/exotic or custom-encoded font; new text can "
                                "garble, so this is the detect-and-refuse path"}

        # 3) rewrite each selected match (sequential rewrites don't move other runs)
        count = 0
        for t in targets:
            ok, _ = rewrite_stream(pdf, pdf.pages[t["pi"]], find, replace, t["xy"],
                                   t["align"], t["size"], t["wtbl"], t["fb"], ci, word)
            if ok:
                count += 1
        if count == 0:
            return {"ok": False,
                    "error": f"{find!r} located visually but not found as an editable "
                             "Tj/TJ string (split/encoded run)"}
        pdf.save(output_path, fix_metadata_version=False, deterministic_id=False)

    first = targets[0]
    return {"ok": True, "page": first["pi"] + 1, "font": first["fontname"],
            "size_pt": round(first["size"], 2), "align": first["align"],
            "align_votes": first["amap"], "scope": scope, "count": count,
            "replaced": f"{find!r} -> {replace!r}", "output": output_path}
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
def apply_rules(input_path, output_path, rules):
    """Apply many rules in order. Unapplicable rules are reported and skipped.

    Each rule runs ``process`` on the result of the last rule that succeeded,
    writing to a private temporary directory; the final result is then copied
    to `output_path`. If no rule applies, the output is a copy of the input.

    Args:
        input_path: PDF to read. It is never written to.
        output_path: destination file; must differ from `input_path`.
        rules: list of {"find","replace","scope"?,"ci"?,"word"?,"page"?,"align"?}.

    Returns:
        ``{"applied": int, "total": int, "all_applied": bool, "rules": [...]}``
        with one entry per rule (``find``, ``replace``, ``applied``, ``scope``
        and whichever of ``count``/``font``/``reason``/``error``/``refused``/
        ``page`` the engine reported). When input and output are the same file
        nothing is done and the dict additionally carries ``"error"``.
    """
    if os.path.abspath(input_path) == os.path.abspath(output_path):
        # Copying the final result onto the input would overwrite it (or fail
        # with SameFileError when no rule applied).
        return {"applied": 0, "total": len(rules), "all_applied": False,
                "rules": [], "error": "refusing to overwrite the input"}

    report = []
    cur = input_path
    applied = 0
    # A private directory instead of tempfile.mktemp(): mktemp only returns a
    # name, which another process can claim before the file is created.
    workdir = tempfile.mkdtemp(prefix="pdfblah-")
    try:
        for step, rule in enumerate(rules):
            find = rule["find"]
            replace = rule.get("replace", "")
            scope = rule.get("scope", "first")
            # isdecimal(), not isdigit(): the latter accepts characters such
            # as "²" that int() cannot parse.
            if isinstance(scope, str) and scope.isdecimal():
                scope = int(scope)
            nxt = os.path.join(workdir, f"step{step}.pdf")
            r = process(cur, nxt, find, replace, rule.get("page"), scope,
                        bool(rule.get("ci")), bool(rule.get("word")), rule.get("align", "auto"))
            entry = {"find": find, "replace": replace, "applied": bool(r.get("ok")), "scope": scope}
            for k in ("count", "font", "reason", "error", "refused", "page"):
                if k in r:
                    entry[k] = r[k]
            report.append(entry)
            if r.get("ok"):
                cur = nxt
                applied += 1
        shutil.copyfile(cur, output_path)
    finally:
        # Runs on errors too, so intermediate files never outlive the call.
        shutil.rmtree(workdir, ignore_errors=True)
    return {"applied": applied, "total": len(rules),
            "all_applied": applied == len(rules), "rules": report}
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def parse_flags(s):
    """Parse the whitespace-separated FLAGS field of a rule.

    Recognised flags (case-insensitive): ``all``, ``first``, a number N (Nth
    match), ``ci``, ``word`` and ``pN`` (page N). Unknown flags are ignored;
    when a flag is repeated the last one wins.

    Args:
        s: the flags text; ``None`` or blank means no flags.

    Returns:
        dict with ``scope`` (``"first"``, ``"all"`` or an int), ``ci`` and
        ``word`` (bools), plus ``page`` (int) only when a ``pN`` flag was given.
    """
    out = {"scope": "first", "ci": False, "word": False}
    for f in (s or "").strip().lower().split():
        if f in ("all", "first"):
            out["scope"] = f
        elif f.isdecimal():
            # isdecimal(), not isdigit(): isdigit() is also true for characters
            # such as "²", on which int() raises ValueError.
            out["scope"] = int(f)
        elif f == "ci":
            out["ci"] = True
        elif f == "word":
            out["word"] = True
        elif re.fullmatch(r"p\d+", f):
            out["page"] = int(f[1:])
    return out
|
|
370
|
+
|
|
371
|
+
|
|
372
|
+
def parse_rules_file(text):
    """Turn the text of a rules file into a list of rule dicts.

    Every line has the form 'FIND | REPLACE | FLAGS'. Blank lines, lines that
    start with '#', and lines whose FIND part is empty are skipped. REPLACE
    defaults to "" and FLAGS (all/first/N/ci/word/pN) to none; fields beyond
    the third are ignored.
    """
    rules = []
    for raw in text.splitlines():
        line = raw.strip()
        if line == "" or line[0] == "#":
            continue
        fields = line.split("|")
        needle = fields[0].strip()
        if needle == "":
            continue
        replacement = fields[1].strip() if len(fields) > 1 else ""
        flags = fields[2] if len(fields) > 2 else ""
        rule = {"find": needle, "replace": replacement}
        rule.update(parse_flags(flags))
        rules.append(rule)
    return rules
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pdfblah
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Real find and replace on the actual text in a PDF. No overlay, metadata preserved.
|
|
5
|
+
Project-URL: Homepage, https://pdfblah.com
|
|
6
|
+
Project-URL: Source, https://github.com/KuvopLLC/pdfblah
|
|
7
|
+
Project-URL: Issues, https://github.com/KuvopLLC/pdfblah/issues
|
|
8
|
+
Author: Kuvop LLC
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: acrobat-alternative,cli,edit,find,pdf,pdf-editor,redact,replace,text
|
|
12
|
+
Classifier: Environment :: Console
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: End Users/Desktop
|
|
15
|
+
Classifier: Operating System :: MacOS
|
|
16
|
+
Classifier: Operating System :: POSIX
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Topic :: Text Processing
|
|
19
|
+
Classifier: Topic :: Utilities
|
|
20
|
+
Requires-Python: >=3.9
|
|
21
|
+
Requires-Dist: pdfplumber>=0.11
|
|
22
|
+
Requires-Dist: pikepdf>=8
|
|
23
|
+
Provides-Extra: test
|
|
24
|
+
Requires-Dist: pytest; extra == 'test'
|
|
25
|
+
Requires-Dist: reportlab; extra == 'test'
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
|
|
28
|
+
# pdfblah
|
|
29
|
+
|
|
30
|
+
Real find and replace on the actual text in a PDF, from the command line.
|
|
31
|
+
|
|
32
|
+
Most tools "edit" a PDF by painting a box over the old text and drawing new text
|
|
33
|
+
on top, which leaves the original underneath (copy and paste still reveals it) and
|
|
34
|
+
often adds a watermark. `pdfblah` rewrites the real text in the content stream, so:
|
|
35
|
+
|
|
36
|
+
- the old text is genuinely gone (`pdftotext`, Ctrl-F, and copy show only the new value)
|
|
37
|
+
- no overlay, no watermark
|
|
38
|
+
- the original metadata (dates, Producer, XMP) is preserved byte for byte
|
|
39
|
+
- alignment is auto-detected and kept, so right-aligned numbers stay flush
|
|
40
|
+
- fonts it cannot reproduce are refused instead of garbled
|
|
41
|
+
|
|
42
|
+
Python only: built on pdfplumber and pikepdf, both installed from PyPI. No separate system packages are required.
|
|
43
|
+
|
|
44
|
+
## Install
|
|
45
|
+
|
|
46
|
+
```sh
|
|
47
|
+
pipx install pdfblah # or: pip install pdfblah
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Use
|
|
51
|
+
|
|
52
|
+
Replace the first match:
|
|
53
|
+
|
|
54
|
+
```sh
|
|
55
|
+
pdfblah in.pdf out.pdf --find "Old Name" --replace "New Name"
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
Options:
|
|
59
|
+
|
|
60
|
+
```sh
|
|
61
|
+
--scope all change every match (default: first)
|
|
62
|
+
--scope 3 change the 3rd match
|
|
63
|
+
--ci ignore case
|
|
64
|
+
--word whole word only ("cat" will not match "category")
|
|
65
|
+
--page 2 only page 2
|
|
66
|
+
--replace "" delete the text
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
Many rules from a file (`FIND | REPLACE | FLAGS` per line):
|
|
70
|
+
|
|
71
|
+
```sh
|
|
72
|
+
pdfblah in.pdf out.pdf --rules rules.txt
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
```
|
|
76
|
+
# rules.txt
|
|
77
|
+
Old Company Name | New Company Name | all
|
|
78
|
+
CONFIDENTIAL DRAFT | FINAL | ci
|
|
79
|
+
Jane Doe | John Smith | all word
|
|
80
|
+
Total | Sum | 2
|
|
81
|
+
delete this phrase |
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## Library
|
|
85
|
+
|
|
86
|
+
```python
|
|
87
|
+
from pdfblah import process, apply_rules, parse_rules_file
|
|
88
|
+
|
|
89
|
+
process("in.pdf", "out.pdf", "999.00", "42.00", scope="all", ci=True)
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
Each call returns a report dict (`ok`, `count`, `refused`, `reason`, ...).
|
|
93
|
+
|
|
94
|
+
## What it does not do
|
|
95
|
+
|
|
96
|
+
Scanned PDFs (image only, no text layer) cannot be edited. Fonts that are not
|
|
97
|
+
embedded and not standard, or use a custom encoding, are refused rather than
|
|
98
|
+
rendered wrong. This is by design: a wrong-looking edit is worse than a clear "no".
|
|
99
|
+
|
|
100
|
+
## Hosted version
|
|
101
|
+
|
|
102
|
+
Want it without installing anything, or for a non-technical colleague? The hosted
|
|
103
|
+
version at **[pdfblah.com](https://pdfblah.com)** does the same edit in the browser:
|
|
104
|
+
upload, preview for free, download.
|
|
105
|
+
|
|
106
|
+
## License
|
|
107
|
+
|
|
108
|
+
MIT, (c) 2026 Kuvop LLC.
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
pdfblah/__init__.py,sha256=cMXBVWRgliwl1vgXTgcjP0ll0iaOcl9oiZi053maKqE,531
|
|
2
|
+
pdfblah/cli.py,sha256=G1P2ew8bF99RNo81Tf-_32N6maDhv92Pt8uoSByUmo4,2759
|
|
3
|
+
pdfblah/engine.py,sha256=IwVQPp4RwDpPkvTr0U64iqbHAteutVanMs9i0fJ-ezg,17366
|
|
4
|
+
pdfblah-0.1.0.dist-info/METADATA,sha256=tstWvKG0vLW-FLJBFiLAsUeJpUodjxyfHZy5ep8RFtk,3233
|
|
5
|
+
pdfblah-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
6
|
+
pdfblah-0.1.0.dist-info/entry_points.txt,sha256=MRAGFGRvUX_YSAbER9rOoX3nl0zG2wgKRefiibyY_OY,45
|
|
7
|
+
pdfblah-0.1.0.dist-info/licenses/LICENSE,sha256=PwjdCjOX6bX5j_SaqjZm4UC9JLSXsIDx4pEtkVEDnjA,1066
|
|
8
|
+
pdfblah-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Kuvop LLC
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|