pychd-pyobf 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,28 @@
1
+ """pychd_pyobf — anonymise identifiers / constants / metadata in a .pyc.
2
+
3
+ Public API:
4
+
5
+ * :func:`obfuscate` — main entry point: ``obfuscate(in_path, out_path)``
6
+ reads the .pyc at ``in_path``, writes the anonymised copy to ``out_path`` and returns an :class:`ObfuscationReport`.
7
+ * :class:`ObfuscationReport` — the report dataclass (paths, writer
8
+ version, identifier mapping, native vs subprocess flag).
9
+ * :class:`ObfuscationMapping` — the original→anonymised name table
10
+ the report carries.
11
+
12
+ The CLI entry point is :func:`pychd_pyobf.cli.main` (registered as the
13
+ ``pychd-pyobf`` console script).
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ from .dispatch import ObfuscationReport, obfuscate
19
+ from .rewrite_native import ObfuscationMapping
20
+
21
+ __version__ = "0.1.0"
22
+
23
+ __all__ = [
24
+ "ObfuscationMapping",
25
+ "ObfuscationReport",
26
+ "__version__",
27
+ "obfuscate",
28
+ ]
pychd_pyobf/cli.py ADDED
@@ -0,0 +1,67 @@
1
+ """``pychd-pyobf`` command-line entry point.
2
+
3
+ Usage::
4
+
5
+ pychd-pyobf rewrite IN.pyc OUT.pyc [--mapping mapping.json]
6
+ [--force-subprocess]
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import argparse
12
+ import json
13
+ import sys
14
+ from pathlib import Path
15
+
16
+ from .dispatch import obfuscate
17
+
18
+
19
def _cmd_rewrite(args: argparse.Namespace) -> int:
    """Handle the ``rewrite`` sub-command.

    Runs :func:`obfuscate` on the parsed paths, prints a one-line summary
    to stderr and, when ``--mapping`` was supplied, dumps the rename table
    as indented JSON to that path. Always returns exit status ``0``.
    """
    report = obfuscate(
        args.in_pyc,
        args.out_pyc,
        force_subprocess=args.force_subprocess,
    )

    writer = report.version.version
    route = "native" if report.used_native else "subprocess"
    summary = (
        f"pychd-pyobf: wrote {report.out_path} "
        f"(writer Py {writer[0]}.{writer[1]}, "
        f"{route} path, {report.total_renames()} renames)"
    )
    print(summary, file=sys.stderr)

    mapping_target = args.mapping
    if mapping_target is not None:
        payload = json.dumps(report.mapping.to_dict(), indent=2)
        mapping_target.write_text(payload)
        print(f"pychd-pyobf: mapping → {mapping_target}", file=sys.stderr)
    return 0
36
+
37
+
38
def main(argv: list[str] | None = None) -> int:
    """Parse *argv* and run the selected sub-command.

    *argv* defaults to ``None``, in which case :mod:`argparse` reads the
    process arguments. The return value is whatever the sub-command
    handler returns (an exit status).
    """
    parser = argparse.ArgumentParser(prog="pychd-pyobf", description=__doc__)
    commands = parser.add_subparsers(dest="cmd", required=True)

    rewrite = commands.add_parser("rewrite", help="anonymise IN.pyc → OUT.pyc")
    # Positional input / output paths.
    for positional in ("in_pyc", "out_pyc"):
        rewrite.add_argument(positional, type=Path)

    mapping_help = "optional path to dump the original→anonymised JSON map"
    rewrite.add_argument("--mapping", type=Path, default=None, help=mapping_help)

    force_help = (
        "always take the subprocess path even when the writer minor"
        " matches the current interpreter (useful for testing the"
        " cross-version code)"
    )
    rewrite.add_argument("--force-subprocess", action="store_true", help=force_help)
    rewrite.set_defaults(func=_cmd_rewrite)

    parsed = parser.parse_args(argv)
    handler = parsed.func
    return handler(parsed)
64
+
65
+
66
+ if __name__ == "__main__":
67
+ raise SystemExit(main())
@@ -0,0 +1,96 @@
1
+ """Top-level obfuscation entry point.
2
+
3
+ Dispatches between the native rewriter (when the .pyc was written by
4
+ the *currently-running* interpreter) and the subprocess rewriter
5
+ (everything else). Returns an :class:`ObfuscationReport` carrying the
6
+ output path, the writer version, and the identifier mapping.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import sys
12
+ from dataclasses import dataclass
13
+ from pathlib import Path
14
+
15
+ from pychd.versions import VersionInfo
16
+
17
+ from .header import header_length_for, merge_pyc, split_pyc
18
+ from .rewrite_native import ObfuscationMapping, anonymise
19
+ from .rewrite_subprocess import run_subprocess_rewrite, uv_run_command
20
+
21
+
22
@dataclass
class ObfuscationReport:
    """Outcome of one obfuscation run.

    Carries the input and output paths, the detected writer version,
    whether the in-process (native) rewriter was used, and the rename
    table produced by the rewrite.
    """

    in_path: Path
    out_path: Path
    version: VersionInfo
    used_native: bool
    mapping: ObfuscationMapping

    def total_renames(self) -> int:
        """Return the number of entries summed over every rename table."""
        table = self.mapping
        buckets = (
            table.names,
            table.varnames,
            table.freevars,
            table.cellvars,
            table.consts,
            table.co_names,
        )
        return sum(len(bucket) for bucket in buckets)
42
+
43
+
44
def _current_minor() -> tuple[int, int]:
    """Return ``(major, minor)`` of the interpreter running this code."""
    info = sys.version_info
    return (info.major, info.minor)
46
+
47
+
48
def obfuscate(
    in_pyc: Path,
    out_pyc: Path,
    *,
    force_subprocess: bool = False,
) -> ObfuscationReport:
    """Anonymise *in_pyc* into *out_pyc* and report what was done.

    The writer version is read from the input file. When it equals the
    running interpreter's ``(major, minor)`` and *force_subprocess* is
    false, the body is unmarshalled and rewritten in-process; otherwise
    the rewrite is delegated to ``run_subprocess_rewrite`` using the
    command produced by ``uv_run_command``.

    ``force_subprocess=True`` selects the subprocess route even when the
    versions match, which lets tests exercise the cross-version code.
    """
    source = Path(in_pyc)
    target = Path(out_pyc)
    version, header, body = split_pyc(source)
    hlen = header_length_for(version)

    if force_subprocess or version.version != _current_minor():
        # Cross-version (or forced) route: the target interpreter does
        # the marshal round-trip and writes the output file itself.
        command = uv_run_command(version.version)
        mapping = run_subprocess_rewrite(command, source, target, hlen)
        went_native = False
    else:
        import marshal

        rewritten, mapping = anonymise(marshal.loads(body))
        new_body = marshal.dumps(rewritten)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_bytes(merge_pyc(header, new_body))
        went_native = True

    return ObfuscationReport(
        in_path=source,
        out_path=target,
        version=version,
        used_native=went_native,
        mapping=mapping,
    )
94
+
95
+
96
+ __all__ = ["ObfuscationReport", "obfuscate"]
pychd_pyobf/header.py ADDED
@@ -0,0 +1,72 @@
1
+ """CPython ``.pyc`` header parsing + reconstruction.
2
+
3
+ CPython has used two header layouts across the 3.x line:
4
+
5
+ * **3.0 – 3.6** (12-byte header): ``magic (4) | timestamp (4) | source_size (4)``.
6
+ * **3.7+** (PEP 552, 16-byte header):
7
+ ``magic (4) | bit_field (4) | timestamp (4) + source_size (4), or source hash (8)``.
8
+ Concretely the layout is still 16 bytes total — the ``bit_field``
9
+ decides whether the next 8 bytes are timestamp-based (timestamp(4) +
10
+ source_size(4)) or hash-based (8-byte hash).
11
+
12
+ We reuse :func:`pychd.versions.read_magic` / :func:`pychd.versions.detect_version`
13
+ to identify the writer. ``header_length_for(version)`` then tells us
14
+ where the marshalled code object begins; ``split_pyc(pyc)`` returns
15
+ ``(version, header_bytes, body_bytes)``.
16
+
17
+ We deliberately do not parse the bit_field — the obfuscator preserves
18
+ the original header verbatim, so re-serialising the rewritten code
19
+ object just needs to concatenate the original bytes with the new body.
20
+ The only field that could sensibly be rewritten is ``source_size``
21
+ (zeroed, since no source exists on disk for an anonymised .pyc); the
22
+ code in this module does not do so and copies every header byte through unchanged.
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ from pathlib import Path
28
+
29
+ from pychd.versions import VersionInfo, detect_version
30
+
31
+
32
def header_length_for(version: VersionInfo) -> int:
    """Return how many header bytes precede the marshalled body.

    Writers at 3.7 or newer use the 16-byte PEP 552 header; anything
    older uses the 12-byte layout (magic, timestamp, source size).
    """
    pep552_length, legacy_length = 16, 12
    return pep552_length if version.version >= (3, 7) else legacy_length
42
+
43
+
44
def split_pyc(pyc_path: Path) -> tuple[VersionInfo, bytes, bytes]:
    """Split the file at *pyc_path* into ``(version, header, body)``.

    The header length is chosen from the detected writer version; the
    body is everything after it (the marshalled top-level code object).

    Raises:
        ValueError: the file is shorter than the expected header.
    """
    raw = pyc_path.read_bytes()
    version = detect_version(pyc_path)
    cut = header_length_for(version)
    if len(raw) >= cut:
        return version, raw[:cut], raw[cut:]

    writer = version.version
    raise ValueError(
        f"{pyc_path}: truncated — only {len(raw)} bytes but expected"
        f" at least {cut} for Python {writer[0]}."
        f"{writer[1]}",
    )
60
+
61
+
62
def merge_pyc(header: bytes, body: bytes) -> bytes:
    """Join *header* and *body* back into the bytes of a .pyc file.

    The inverse of the header/body split done by :func:`split_pyc`;
    it is plain concatenation with a descriptive name.
    """
    reassembled = header + body
    return reassembled
70
+
71
+
72
+ __all__ = ["header_length_for", "split_pyc", "merge_pyc"]
@@ -0,0 +1,210 @@
1
+ """Native (3.14 / running-interpreter) .pyc anonymiser.
2
+
3
+ Uses :func:`marshal.loads` + :meth:`types.CodeType.replace` to rewrite
4
+ identifiers, constants, and metadata recursively, then re-marshals the
5
+ top-level code object. The opcode stream (``co_code``) is preserved
6
+ byte-for-byte so :mod:`dis` still walks the result and pychd's rule
7
+ pass still sees the same instruction structure.
8
+
9
+ The cross-version path (``rewrite_subprocess``) reuses the same
10
+ algorithm, just executed inside a subprocess running the target
11
+ interpreter.
12
+
13
+ Anonymisation rules (kept in sync with the package docstring):
14
+
15
+ * ``co_names`` → ``_n0, _n1, …``
16
+ * ``co_varnames`` → ``_v0, _v1, …``
17
+ * ``co_freevars`` → ``_f0, _f1, …``
18
+ * ``co_cellvars`` → ``_c0, _c1, …``
19
+ * ``co_consts`` → string literals → ``_s0, _s1, …``; other
20
+ primitives left alone; tuples / frozensets
21
+ mapped recursively; nested code objects
22
+ recursively anonymised
23
+ * ``co_name`` → per-depth ``_fn<depth>_<index>`` (``_fn0_0``, ``_fn1_0``, ``_fn1_1``, …)
24
+ * ``co_qualname`` → same per-depth scheme (3.11+ only)
25
+ * ``co_filename`` → fixed literal ``"<anonymised>"``
26
+ * ``co_lnotab`` /
27
+ ``co_linetable`` /
28
+ ``co_positions``→ replaced with empty bytes — pychd's rule pass
29
+ does not depend on line info
30
+ * ``co_firstlineno`` → 1
31
+ * docstring (the leading ``co_consts[0]`` when it is a ``str``) →
32
+ retained as a string but rewritten via the same ``co_consts``
33
+ mapping (so it ends up as ``_sN`` rather than its original text)
34
+
35
+ The function returns an :class:`ObfuscationMapping` so callers can
36
+ audit the rewriting (and so the unit tests can assert that every
37
+ emitted identifier starts with the expected prefix).
38
+ """
39
+
40
+ from __future__ import annotations
41
+
42
+ from dataclasses import dataclass, field
43
+ from types import CodeType
44
+
45
+
46
@dataclass
class ObfuscationMapping:
    """Rename tables (original text → anonymised text) built by a rewrite."""

    names: dict[str, str] = field(default_factory=dict)
    varnames: dict[str, str] = field(default_factory=dict)
    freevars: dict[str, str] = field(default_factory=dict)
    cellvars: dict[str, str] = field(default_factory=dict)
    consts: dict[str, str] = field(default_factory=dict)
    # Keyed by ``co_name`` (the function / code-object name).
    co_names: dict[str, str] = field(default_factory=dict)

    def to_dict(self) -> dict[str, dict[str, str]]:
        """Return a fresh ``{table_name: copy_of_table}`` dictionary."""
        labels = ("names", "varnames", "freevars", "cellvars", "consts", "co_names")
        return {label: dict(getattr(self, label)) for label in labels}
66
+
67
+
68
+ _ANON_FILENAME = "<anonymised>"
69
+
70
+
71
def _anonymise_tuple(
    original: tuple[str, ...],
    prefix: str,
    mapping: dict[str, str],
) -> tuple[str, ...]:
    """Translate each name in *original* to its ``_<prefix>N`` alias.

    Names already present in *mapping* reuse their alias. A new name is
    numbered with the current size of *mapping* (not its position in
    the tuple), so aliases stay unique across every code object that
    shares the same table. *mapping* is extended in place.
    """
    renamed: list[str] = []
    for ident in original:
        if ident not in mapping:
            mapping[ident] = f"_{prefix}{len(mapping)}"
        renamed.append(mapping[ident])
    return tuple(renamed)
94
+
95
+
96
def _anonymise_const(
    const: object,
    mapping: ObfuscationMapping,
    depth: int,
    depth_counter: dict[int, int],
) -> object:
    """Return the anonymised form of one ``co_consts`` entry.

    * ``str`` → ``_sN``, shared through ``mapping.consts`` so equal
      strings always receive the same alias.
    * ``tuple`` / ``frozenset`` → rebuilt with every member anonymised.
    * nested code objects → anonymised one level deeper.
    * anything else (numbers, bytes, ``None``, booleans, ``Ellipsis``)
      → returned unchanged.
    """
    if isinstance(const, str):
        table = mapping.consts
        if const not in table:
            table[const] = f"_s{len(table)}"
        return table[const]

    if isinstance(const, CodeType):
        return _anonymise_code(const, mapping, depth + 1, depth_counter)

    for container in (tuple, frozenset):
        if isinstance(const, container):
            return container(
                _anonymise_const(member, mapping, depth, depth_counter)
                for member in const
            )

    # Plain primitive: nothing to hide.
    return const
131
+
132
+
133
def _empty_lineinfo() -> bytes:
    """Return the empty byte string used to blank out line tables."""
    return bytes()
135
+
136
+
137
def _anonymise_code(
    code: CodeType,
    mapping: ObfuscationMapping,
    depth: int,
    depth_counter: dict[int, int],
) -> CodeType:
    """Return a copy of *code* with identifiers and metadata anonymised.

    Args:
        code: the code object to rewrite (never mutated).
        mapping: rename tables shared across the whole code-object
            tree; extended in place.
        depth: nesting depth of *code* (0 for the top-level object).
        depth_counter: per-depth counter used to number code objects;
            extended in place.

    Returns:
        A new code object whose names, constants, filename, first line
        number and line table are replaced. ``co_code`` is not touched.
    """
    # Identifier tuples.
    new_names = _anonymise_tuple(code.co_names, "n", mapping.names)
    new_varnames = _anonymise_tuple(code.co_varnames, "v", mapping.varnames)
    new_freevars = _anonymise_tuple(code.co_freevars, "f", mapping.freevars)

    # An argument captured by a closure is listed in BOTH co_varnames and
    # co_cellvars, and CPython pairs the two entries up by name when it
    # builds the code object. Such a cell must therefore reuse the alias
    # its local already received; giving it a separate ``_cN`` would
    # break that pairing and change the fast-locals layout that the
    # untouched bytecode indexes into.
    local_alias = dict(zip(code.co_varnames, new_varnames))
    new_cellvars = tuple(
        local_alias[name]
        if name in local_alias
        else _anonymise_tuple((name,), "c", mapping.cellvars)[0]
        for name in code.co_cellvars
    )

    # Constants (recursive — nested code objects are handled here).
    new_consts = tuple(
        _anonymise_const(c, mapping, depth, depth_counter) for c in code.co_consts
    )

    # Code-object name: ``_fn<depth>_<index>``, where <index> counts the
    # code objects already named at this depth (``_fn0_0`` for the
    # top-level object, ``_fn1_0``, ``_fn1_1``, … one level down).
    n_at_depth = depth_counter.setdefault(depth, 0)
    new_co_name = f"_fn{depth}_{n_at_depth}"
    depth_counter[depth] = n_at_depth + 1
    # NOTE: keyed by the original co_name, so code objects sharing a name
    # (e.g. several ``<lambda>``) keep only the most recent alias here.
    mapping.co_names[code.co_name] = new_co_name

    # First the kwargs every supported ``replace`` accepts; the
    # version-dependent ones follow as separate calls so this call keeps
    # a fixed signature. Each ``replace`` returns a fresh CodeType.
    new_code = code.replace(
        co_names=new_names,
        co_varnames=new_varnames,
        co_freevars=new_freevars,
        co_cellvars=new_cellvars,
        co_consts=new_consts,
        co_name=new_co_name,
        co_filename=_ANON_FILENAME,
        co_firstlineno=1,
    )
    if hasattr(new_code, "co_qualname"):
        new_code = new_code.replace(co_qualname=new_co_name)
    if hasattr(new_code, "co_linetable"):
        # Interpreters that expose ``co_linetable`` take the line table
        # under that keyword and reject ``co_lnotab`` in ``replace``.
        new_code = new_code.replace(co_linetable=_empty_lineinfo())
    else:
        # Older interpreters only know ``co_lnotab``. Reaching this
        # branch also means the deprecated ``co_lnotab`` alias of newer
        # releases is never read, so no DeprecationWarning is raised.
        new_code = new_code.replace(co_lnotab=_empty_lineinfo())
    # ``co_exceptiontable`` is left alone: the opcode stream still needs
    # valid handler offsets.
    return new_code
201
+
202
+
203
def anonymise(code: CodeType) -> tuple[CodeType, ObfuscationMapping]:
    """Anonymise a top-level code object.

    Returns the rewritten code object together with the rename tables
    that were filled in while rewriting it and everything nested in it.
    """
    table = ObfuscationMapping()
    rewritten = _anonymise_code(code, table, depth=0, depth_counter={})
    return rewritten, table
208
+
209
+
210
+ __all__ = ["ObfuscationMapping", "anonymise"]
@@ -0,0 +1,243 @@
1
+ """Cross-version .pyc anonymiser via subprocess into the target Python.
2
+
3
+ The native rewriter (``rewrite_native``) only works for .pyc files
4
+ produced by the currently-running interpreter — ``types.CodeType``
5
+ internals (e.g. ``co_qualname`` availability, exception-table layout
6
+ on 3.11+, the older ``co_lnotab`` shape on 3.10-) differ across
7
+ minors and the ``replace`` kwarg surface must match.
8
+
9
+ To stay version-agnostic without re-implementing every layout, this
10
+ module spawns the *writer*'s Python interpreter under ``uv run
11
+ --python 3.X --no-project python -c "<snippet>"`` and runs the same
12
+ ``marshal.loads → recursive replace → marshal.dumps`` dance inside
13
+ that subprocess. The snippet is the multi-line string at the bottom
14
+ of this file, kept as a plain ``str`` so it is easy to read and
15
+ review.
16
+
17
+ Constraints:
18
+
19
+ * Communication is via three file-paths passed on argv (input .pyc,
20
+ output .pyc, mapping JSON). No piping marshalled bytes through
21
+ stdin/stdout — keeps the snippet trivial and avoids encoding
22
+ issues across 3.x.
23
+ * ``uv`` is the only required tooling — it manages downloading the
24
+ target Python on first run via python-build-standalone. Hosts
25
+ without uv get a clear ``FileNotFoundError`` instead of a
26
+ baffling ``subprocess`` traceback.
27
+ * 30-second wall-clock timeout per call, matching the cross-version
28
+ fixture builder (``tools/build_multiversion_fixtures.py``).
29
+ """
30
+
31
+ from __future__ import annotations
32
+
33
+ import json
34
+ import subprocess
35
+ from pathlib import Path
36
+
37
+ from .rewrite_native import ObfuscationMapping
38
+
39
+
40
def _snippet() -> str:
    """Return the script that is run via ``python -c`` in the target interpreter.

    The script imports only ``json``, ``marshal``, ``sys`` and
    ``pathlib``, so nothing has to be installed in the target
    environment. It relies on ``code.replace``, which exists from
    Python 3.8 onwards; older writers are not handled by this script.

    argv layout expected by the script: input .pyc, output .pyc,
    mapping JSON path, header length in bytes.

    The naming rules mirror the native rewriter:

    * depth is passed explicitly through the constant recursion, so
      sibling code objects share the same depth (``_fn1_0``,
      ``_fn1_1``, …);
    * a cell variable that is also a local (a captured argument) reuses
      the alias of that local so CPython can still pair the two up;
    * only one line-table keyword is passed to ``replace`` —
      ``co_linetable`` where the attribute exists, ``co_lnotab``
      otherwise.
    """
    return r"""
import json
import marshal
import sys
from pathlib import Path

# argv layout: in_pyc, out_pyc, mapping_json, header_len
in_pyc = Path(sys.argv[1])
out_pyc = Path(sys.argv[2])
mapping_path = Path(sys.argv[3])
header_len = int(sys.argv[4])

data = in_pyc.read_bytes()
header = data[:header_len]
body = data[header_len:]
code = marshal.loads(body)

mapping = {
    "names": {},
    "varnames": {},
    "freevars": {},
    "cellvars": {},
    "consts": {},
    "co_names": {},
}

_depth_counters = {}


def _anon_tuple(seq, prefix, table):
    # Use the global size of *table* as the suffix counter so two
    # distinct code objects with the same per-tuple index 0 do not
    # both claim "_<prefix>0" — that produces duplicate-arg bugs in
    # the anonymised source.
    out = []
    for name in seq:
        if name in table:
            out.append(table[name])
            continue
        new = "_" + prefix + str(len(table))
        table[name] = new
        out.append(new)
    return tuple(out)


def _anon_const(c, depth):
    # *depth* is the depth of the code object that owns this constant.
    if isinstance(c, str):
        if c in mapping["consts"]:
            return mapping["consts"][c]
        new = "_s" + str(len(mapping["consts"]))
        mapping["consts"][c] = new
        return new
    if isinstance(c, tuple):
        return tuple(_anon_const(item, depth) for item in c)
    if isinstance(c, frozenset):
        return frozenset(_anon_const(item, depth) for item in c)
    if type(c).__name__ == "code":  # CodeType
        return _anon_code(c, depth + 1)
    return c


def _anon_code(code, depth):
    new_names = _anon_tuple(code.co_names, "n", mapping["names"])
    new_varnames = _anon_tuple(code.co_varnames, "v", mapping["varnames"])
    new_freevars = _anon_tuple(code.co_freevars, "f", mapping["freevars"])

    # A captured argument is listed in both co_varnames and co_cellvars
    # and CPython pairs the entries by name, so such a cell must reuse
    # the alias of its local.
    local_alias = dict(zip(code.co_varnames, new_varnames))
    new_cellvars = tuple(
        local_alias[name]
        if name in local_alias
        else _anon_tuple((name,), "c", mapping["cellvars"])[0]
        for name in code.co_cellvars
    )

    new_consts = tuple(_anon_const(c, depth) for c in code.co_consts)

    n_at_depth = _depth_counters.get(depth, 0)
    new_co_name = "_fn" + str(depth) + "_" + str(n_at_depth)
    _depth_counters[depth] = n_at_depth + 1
    mapping["co_names"][code.co_name] = new_co_name

    kwargs = dict(
        co_names=new_names,
        co_varnames=new_varnames,
        co_freevars=new_freevars,
        co_cellvars=new_cellvars,
        co_consts=new_consts,
        co_name=new_co_name,
        co_filename="<anonymised>",
        co_firstlineno=1,
    )
    if hasattr(code, "co_qualname"):
        kwargs["co_qualname"] = new_co_name
    # Interpreters that expose co_linetable take the line table under
    # that keyword and reject co_lnotab in code.replace; only older
    # ones accept co_lnotab.
    if hasattr(code, "co_linetable"):
        kwargs["co_linetable"] = b""
    else:
        kwargs["co_lnotab"] = b""
    return code.replace(**kwargs)


new_code = _anon_code(code, 0)
new_body = marshal.dumps(new_code)
out_pyc.write_bytes(header + new_body)
mapping_path.write_text(json.dumps(mapping))
"""
170
+
171
+
172
def run_subprocess_rewrite(
    target_python: str,
    in_pyc: Path,
    out_pyc: Path,
    header_len: int,
    *,
    timeout: float = 30.0,
) -> ObfuscationMapping:
    """Run the rewrite script under *target_python* and collect the mapping.

    Args:
        target_python: command line that starts the wanted interpreter,
            e.g. ``uv run --python 3.X --no-project python``. It is
            split into argv with :func:`shlex.split`.
        in_pyc: .pyc file to read.
        out_pyc: .pyc file the subprocess writes; its parent directory
            is created if missing.
        header_len: number of header bytes the script copies verbatim.
        timeout: wall-clock limit in seconds for the subprocess.

    Returns:
        An :class:`ObfuscationMapping` filled from the JSON the script
        wrote.

    Raises:
        RuntimeError: the subprocess exited with a non-zero status.
        subprocess.TimeoutExpired: the subprocess exceeded *timeout*.
        FileNotFoundError: the first word of *target_python* is not an
            executable on this host.

    The mapping JSON travels through a temporary file created in the
    default temp directory; it is removed on every exit path, including
    timeouts and launch failures.
    """
    import shlex
    import tempfile

    out_pyc.parent.mkdir(parents=True, exist_ok=True)
    # delete=False: only the name is needed here; the subprocess reopens
    # the file by path, and cleanup happens in the ``finally`` below.
    with tempfile.NamedTemporaryFile(
        prefix="pyobf-map-", suffix=".json", delete=False
    ) as fh:
        mapping_path = Path(fh.name)

    try:
        cmd = shlex.split(target_python) + [
            "-c",
            _snippet(),
            str(in_pyc),
            str(out_pyc),
            str(mapping_path),
            str(header_len),
        ]
        proc = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
        if proc.returncode != 0:
            # Only the leading words of the command are shown; the full
            # argv would include the whole script text.
            raise RuntimeError(
                f"pyobf subprocess (cmd={cmd[:4]!r}) failed: "
                f"rc={proc.returncode}, stderr={proc.stderr!r}"
            )
        raw = json.loads(mapping_path.read_text())
    finally:
        mapping_path.unlink(missing_ok=True)

    om = ObfuscationMapping()
    om.names.update(raw["names"])
    om.varnames.update(raw["varnames"])
    om.freevars.update(raw["freevars"])
    om.cellvars.update(raw["cellvars"])
    om.consts.update(raw["consts"])
    om.co_names.update(raw["co_names"])
    return om
230
+
231
+
232
def uv_run_command(version: tuple[int, int]) -> str:
    """Build the ``uv run`` command line for a given ``(major, minor)``.

    The command selects that Python version and passes ``--no-project``.
    Kept in one place so every caller builds the same string and tests
    can replace it.
    """
    major, minor = version[0], version[1]
    return "uv run --python {}.{} --no-project python".format(major, minor)
241
+
242
+
243
+ __all__ = ["run_subprocess_rewrite", "uv_run_command"]
@@ -0,0 +1,51 @@
1
+ Metadata-Version: 2.4
2
+ Name: pychd-pyobf
3
+ Version: 0.1.0
4
+ Summary: Anonymise identifiers / constants / metadata inside a CPython .pyc while preserving the opcode stream — for contamination-free decompiler benchmarking
5
+ Author-email: 卍diohabara卍 <diohabara@users.noreply.github.com>
6
+ Requires-Python: >=3.14
7
+ Requires-Dist: pychd>=1.2.0
8
+ Description-Content-Type: text/markdown
9
+
10
+ # pychd-pyobf
11
+
12
+ Anonymise identifiers, string constants, docstrings, and metadata
13
+ inside a CPython `.pyc` while preserving the opcode stream exactly.
14
+
15
+ Built to neutralise LLM training-data memorisation when benchmarking
16
+ Python decompilers: even if an LLM has seen the original source on
17
+ the internet, the anonymised `.pyc` does not contain the surface
18
+ tokens (variable names, comments, docstrings) it would use to
19
+ recognise the source.
20
+
21
+ Covers every CPython release pychd recognises: 3.0–3.14.
22
+ - 3.14 (the running interpreter) is rewritten natively via
23
+ `types.CodeType.replace()`.
24
+ - 3.0–3.13 are rewritten via a subprocess into a uv-managed Python of
25
+ that minor version, so the obfuscator stays a tiny dependency.
26
+
27
+ Pair with `pychd-pyfuzz` (random valid-Python source generator) for
28
+ the strongest available contamination guarantee.
29
+
30
+ See the main [pychd README](https://github.com/diohabara/pychd) for
31
+ the broader story.
32
+
33
+ ## Install
34
+
35
+ ```bash
36
+ pip install pychd-pyobf
37
+ ```
38
+
39
+ ## Use
40
+
41
+ ```bash
42
+ pychd-pyobf rewrite IN.pyc OUT.pyc --mapping mapping.json
43
+ ```
44
+
45
+ The `--mapping` flag (optional) writes the original-to-anonymised
46
+ identifier dict to JSON for audit / debugging. Without it, the
47
+ mapping is discarded after rewriting.
48
+
49
+ ## Status
50
+
51
+ Pre-release. API and CLI are still evolving with the parent project.
@@ -0,0 +1,10 @@
1
+ pychd_pyobf/__init__.py,sha256=MoQAgjFZDzTuIAUvLJ1UkOJ5DHN0JDmOP44dqR5j2r4,824
2
+ pychd_pyobf/cli.py,sha256=8VTtzNogKK-regImOjxKxhvnHfdC_PcuWZRSRy6IIaA,1929
3
+ pychd_pyobf/dispatch.py,sha256=41J3W2c28t30IO7MKCEidGmktbagdR2yMfRg_NGj_-c,2859
4
+ pychd_pyobf/header.py,sha256=oylBmZUJAVuxcRzVjtE-CZg1_34I77eDtd7ZvoGEu7M,2676
5
+ pychd_pyobf/rewrite_native.py,sha256=lwWhyVuUIq443VRhchrLrRCqkMgiYYg5OwatfectFss,8086
6
+ pychd_pyobf/rewrite_subprocess.py,sha256=aGqomAwCzIyUCqpQu1H_9zi-qrnAXhkkK0WjWxyThy8,8059
7
+ pychd_pyobf-0.1.0.dist-info/METADATA,sha256=NlBAQ1QwZCkmayKDCBeXDyhS9Wy8ZvWydyQl7ZRyxcE,1681
8
+ pychd_pyobf-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
9
+ pychd_pyobf-0.1.0.dist-info/entry_points.txt,sha256=RFPUYvWbP8OCb_ZDR15t32xFXKV4Sn3usPRQZCV8s9M,53
10
+ pychd_pyobf-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ pychd-pyobf = pychd_pyobf.cli:main