pychd-pyobf 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pychd_pyobf-0.1.0/.gitignore +172 -0
- pychd_pyobf-0.1.0/PKG-INFO +51 -0
- pychd_pyobf-0.1.0/README.md +42 -0
- pychd_pyobf-0.1.0/pychd_pyobf/__init__.py +28 -0
- pychd_pyobf-0.1.0/pychd_pyobf/cli.py +67 -0
- pychd_pyobf-0.1.0/pychd_pyobf/dispatch.py +96 -0
- pychd_pyobf-0.1.0/pychd_pyobf/header.py +72 -0
- pychd_pyobf-0.1.0/pychd_pyobf/rewrite_native.py +210 -0
- pychd_pyobf-0.1.0/pychd_pyobf/rewrite_subprocess.py +243 -0
- pychd_pyobf-0.1.0/pyproject.toml +26 -0
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
share/python-wheels/
|
|
24
|
+
*.egg-info/
|
|
25
|
+
.installed.cfg
|
|
26
|
+
*.egg
|
|
27
|
+
MANIFEST
|
|
28
|
+
|
|
29
|
+
# PyInstaller
|
|
30
|
+
# Usually these files are written by a python script from a template
|
|
31
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
32
|
+
*.manifest
|
|
33
|
+
*.spec
|
|
34
|
+
|
|
35
|
+
# Installer logs
|
|
36
|
+
pip-log.txt
|
|
37
|
+
pip-delete-this-directory.txt
|
|
38
|
+
|
|
39
|
+
# Unit test / coverage reports
|
|
40
|
+
htmlcov/
|
|
41
|
+
.tox/
|
|
42
|
+
.nox/
|
|
43
|
+
.coverage
|
|
44
|
+
.coverage.*
|
|
45
|
+
.cache
|
|
46
|
+
nosetests.xml
|
|
47
|
+
coverage.xml
|
|
48
|
+
*.cover
|
|
49
|
+
*.py,cover
|
|
50
|
+
.hypothesis/
|
|
51
|
+
.pytest_cache/
|
|
52
|
+
cover/
|
|
53
|
+
|
|
54
|
+
# Translations
|
|
55
|
+
*.mo
|
|
56
|
+
*.pot
|
|
57
|
+
|
|
58
|
+
# Django stuff:
|
|
59
|
+
*.log
|
|
60
|
+
local_settings.py
|
|
61
|
+
db.sqlite3
|
|
62
|
+
db.sqlite3-journal
|
|
63
|
+
|
|
64
|
+
# Flask stuff:
|
|
65
|
+
instance/
|
|
66
|
+
.webassets-cache
|
|
67
|
+
|
|
68
|
+
# Scrapy stuff:
|
|
69
|
+
.scrapy
|
|
70
|
+
|
|
71
|
+
# Sphinx documentation
|
|
72
|
+
docs/_build/
|
|
73
|
+
|
|
74
|
+
# PyBuilder
|
|
75
|
+
.pybuilder/
|
|
76
|
+
target/
|
|
77
|
+
|
|
78
|
+
# Jupyter Notebook
|
|
79
|
+
.ipynb_checkpoints
|
|
80
|
+
|
|
81
|
+
# IPython
|
|
82
|
+
profile_default/
|
|
83
|
+
ipython_config.py
|
|
84
|
+
|
|
85
|
+
# pyenv
|
|
86
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
87
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
88
|
+
# .python-version
|
|
89
|
+
|
|
90
|
+
# pipenv
|
|
91
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
92
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
93
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
94
|
+
# install all needed dependencies.
|
|
95
|
+
#Pipfile.lock
|
|
96
|
+
|
|
97
|
+
# poetry
|
|
98
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
99
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
100
|
+
# commonly ignored for libraries.
|
|
101
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
102
|
+
#poetry.lock
|
|
103
|
+
|
|
104
|
+
# pdm
|
|
105
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
106
|
+
#pdm.lock
|
|
107
|
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
|
108
|
+
# in version control.
|
|
109
|
+
# https://pdm.fming.dev/#use-with-ide
|
|
110
|
+
.pdm.toml
|
|
111
|
+
|
|
112
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
113
|
+
__pypackages__/
|
|
114
|
+
|
|
115
|
+
# Celery stuff
|
|
116
|
+
celerybeat-schedule
|
|
117
|
+
celerybeat.pid
|
|
118
|
+
|
|
119
|
+
# SageMath parsed files
|
|
120
|
+
*.sage.py
|
|
121
|
+
|
|
122
|
+
# Environments
|
|
123
|
+
.env
|
|
124
|
+
.venv
|
|
125
|
+
env/
|
|
126
|
+
venv/
|
|
127
|
+
ENV/
|
|
128
|
+
env.bak/
|
|
129
|
+
venv.bak/
|
|
130
|
+
|
|
131
|
+
# Spyder project settings
|
|
132
|
+
.spyderproject
|
|
133
|
+
.spyproject
|
|
134
|
+
|
|
135
|
+
# Rope project settings
|
|
136
|
+
.ropeproject
|
|
137
|
+
|
|
138
|
+
# mkdocs documentation
|
|
139
|
+
/site
|
|
140
|
+
|
|
141
|
+
# mypy
|
|
142
|
+
.mypy_cache/
|
|
143
|
+
.dmypy.json
|
|
144
|
+
dmypy.json
|
|
145
|
+
|
|
146
|
+
# Pyre type checker
|
|
147
|
+
.pyre/
|
|
148
|
+
|
|
149
|
+
# pytype static type analyzer
|
|
150
|
+
.pytype/
|
|
151
|
+
|
|
152
|
+
# Cython debug symbols
|
|
153
|
+
cython_debug/
|
|
154
|
+
|
|
155
|
+
# PyCharm
|
|
156
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
157
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
158
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
159
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
160
|
+
#.idea/
|
|
161
|
+
|
|
162
|
+
**/logging.conf
|
|
163
|
+
.envrc
|
|
164
|
+
|
|
165
|
+
# Local Claude Code session state
|
|
166
|
+
.claude/
|
|
167
|
+
|
|
168
|
+
# Codex review scratch (transient, do not commit)
|
|
169
|
+
.codex
|
|
170
|
+
|
|
171
|
+
# Local PNG previews (SVGs are the committed source of truth)
|
|
172
|
+
assets/*.png
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pychd-pyobf
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Anonymise identifiers / constants / metadata inside a CPython .pyc while preserving the opcode stream — for contamination-free decompiler benchmarking
|
|
5
|
+
Author-email: 卍diohabara卍 <diohabara@users.noreply.github.com>
|
|
6
|
+
Requires-Python: >=3.14
|
|
7
|
+
Requires-Dist: pychd>=1.2.0
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
|
|
10
|
+
# pychd-pyobf
|
|
11
|
+
|
|
12
|
+
Anonymise identifiers, string constants, docstrings, and metadata
|
|
13
|
+
inside a CPython `.pyc` while preserving the opcode stream exactly.
|
|
14
|
+
|
|
15
|
+
Built to neutralise LLM training-data memorisation when benchmarking
|
|
16
|
+
Python decompilers: even if an LLM has seen the original source on
|
|
17
|
+
the internet, the anonymised `.pyc` does not contain the surface
|
|
18
|
+
tokens (variable names, comments, docstrings) it would use to
|
|
19
|
+
recognise the source.
|
|
20
|
+
|
|
21
|
+
Covers every CPython release pychd recognises: 3.0–3.14.
|
|
22
|
+
- 3.14 (the running interpreter) is rewritten natively via
|
|
23
|
+
`types.CodeType.replace()`.
|
|
24
|
+
- 3.0–3.13 are rewritten via a subprocess into a uv-managed Python of
|
|
25
|
+
that minor version, so the obfuscator stays a tiny dependency.
|
|
26
|
+
|
|
27
|
+
Pair with `pychd-pyfuzz` (random valid-Python source generator) for
|
|
28
|
+
the strongest available contamination guarantee.
|
|
29
|
+
|
|
30
|
+
See the main [pychd README](https://github.com/diohabara/pychd) for
|
|
31
|
+
the broader story.
|
|
32
|
+
|
|
33
|
+
## Install
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install pychd-pyobf
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Use
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pychd-pyobf rewrite IN.pyc OUT.pyc --mapping mapping.json
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
The `--mapping` flag (optional) writes the original-to-anonymised
|
|
46
|
+
identifier dict to JSON for audit / debugging. Without it, the
|
|
47
|
+
mapping is discarded after rewriting.
|
|
48
|
+
|
|
49
|
+
## Status
|
|
50
|
+
|
|
51
|
+
Pre-release. API and CLI are still evolving with the parent project.
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# pychd-pyobf
|
|
2
|
+
|
|
3
|
+
Anonymise identifiers, string constants, docstrings, and metadata
|
|
4
|
+
inside a CPython `.pyc` while preserving the opcode stream exactly.
|
|
5
|
+
|
|
6
|
+
Built to neutralise LLM training-data memorisation when benchmarking
|
|
7
|
+
Python decompilers: even if an LLM has seen the original source on
|
|
8
|
+
the internet, the anonymised `.pyc` does not contain the surface
|
|
9
|
+
tokens (variable names, comments, docstrings) it would use to
|
|
10
|
+
recognise the source.
|
|
11
|
+
|
|
12
|
+
Covers every CPython release pychd recognises: 3.0–3.14.
|
|
13
|
+
- 3.14 (the running interpreter) is rewritten natively via
|
|
14
|
+
`types.CodeType.replace()`.
|
|
15
|
+
- 3.0–3.13 are rewritten via a subprocess into a uv-managed Python of
|
|
16
|
+
that minor version, so the obfuscator stays a tiny dependency.
|
|
17
|
+
|
|
18
|
+
Pair with `pychd-pyfuzz` (random valid-Python source generator) for
|
|
19
|
+
the strongest available contamination guarantee.
|
|
20
|
+
|
|
21
|
+
See the main [pychd README](https://github.com/diohabara/pychd) for
|
|
22
|
+
the broader story.
|
|
23
|
+
|
|
24
|
+
## Install
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
pip install pychd-pyobf
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Use
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
pychd-pyobf rewrite IN.pyc OUT.pyc --mapping mapping.json
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
The `--mapping` flag (optional) writes the original-to-anonymised
|
|
37
|
+
identifier dict to JSON for audit / debugging. Without it, the
|
|
38
|
+
mapping is discarded after rewriting.
|
|
39
|
+
|
|
40
|
+
## Status
|
|
41
|
+
|
|
42
|
+
Pre-release. API and CLI are still evolving with the parent project.
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""pychd_pyobf — anonymise identifiers / constants / metadata in a .pyc.
|
|
2
|
+
|
|
3
|
+
Public API:
|
|
4
|
+
|
|
5
|
+
* :func:`obfuscate` — main entry point: ``obfuscate(in_path, out_path)``
|
|
6
|
+
writes an anonymised copy of the .pyc to *out_path* and returns an :class:`ObfuscationReport`.
|
|
7
|
+
* :class:`ObfuscationReport` — the report dataclass (paths, writer
|
|
8
|
+
version, identifier mapping, native vs subprocess flag).
|
|
9
|
+
* :class:`ObfuscationMapping` — the original→anonymised name table
|
|
10
|
+
the report carries.
|
|
11
|
+
|
|
12
|
+
The CLI entry point is :func:`pychd_pyobf.cli.main` (registered as the
|
|
13
|
+
``pychd-pyobf`` console script).
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
from .dispatch import ObfuscationReport, obfuscate
|
|
19
|
+
from .rewrite_native import ObfuscationMapping
|
|
20
|
+
|
|
21
|
+
__version__ = "0.1.0"
|
|
22
|
+
|
|
23
|
+
__all__ = [
|
|
24
|
+
"ObfuscationMapping",
|
|
25
|
+
"ObfuscationReport",
|
|
26
|
+
"__version__",
|
|
27
|
+
"obfuscate",
|
|
28
|
+
]
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""``pychd-pyobf`` command-line entry point.
|
|
2
|
+
|
|
3
|
+
Usage::
|
|
4
|
+
|
|
5
|
+
pychd-pyobf rewrite IN.pyc OUT.pyc [--mapping mapping.json]
|
|
6
|
+
[--force-subprocess]
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import argparse
|
|
12
|
+
import json
|
|
13
|
+
import sys
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
from .dispatch import obfuscate
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _cmd_rewrite(args: argparse.Namespace) -> int:
    """Run the ``rewrite`` sub-command and return the process exit code.

    Anonymises ``args.in_pyc`` into ``args.out_pyc`` through
    :func:`obfuscate`, prints a one-line summary to stderr and, when
    ``args.mapping`` is set, dumps the original→anonymised tables to that
    path as JSON.

    Always returns ``0``; failures surface as exceptions raised by the
    calls below.
    """
    report = obfuscate(
        args.in_pyc,
        args.out_pyc,
        force_subprocess=args.force_subprocess,
    )
    path = "native" if report.used_native else "subprocess"
    print(
        f"pychd-pyobf: wrote {report.out_path} "
        f"(writer Py {report.version.version[0]}.{report.version.version[1]}, "
        f"{path} path, {report.total_renames()} renames)",
        file=sys.stderr,
    )
    if args.mapping is not None:
        # Create the destination directory the same way the native rewrite
        # does for the output .pyc, so ``--mapping out/dir/map.json`` works
        # on a fresh tree.
        args.mapping.parent.mkdir(parents=True, exist_ok=True)
        # Pin the encoding so the file does not depend on the host locale.
        args.mapping.write_text(
            json.dumps(report.mapping.to_dict(), indent=2),
            encoding="utf-8",
        )
        print(f"pychd-pyobf: mapping → {args.mapping}", file=sys.stderr)
    return 0
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def main(argv: list[str] | None = None) -> int:
    """Build the ``pychd-pyobf`` argument parser, parse *argv* and dispatch.

    The only sub-command is ``rewrite``; its handler is stored on the parsed
    namespace as ``func`` and its return value becomes the exit code.
    """
    top_level = argparse.ArgumentParser(prog="pychd-pyobf", description=__doc__)
    commands = top_level.add_subparsers(dest="cmd", required=True)

    rewrite = commands.add_parser("rewrite", help="anonymise IN.pyc → OUT.pyc")
    # Two positional paths: input first, output second.
    for positional in ("in_pyc", "out_pyc"):
        rewrite.add_argument(positional, type=Path)

    mapping_help = "optional path to dump the original→anonymised JSON map"
    rewrite.add_argument("--mapping", type=Path, default=None, help=mapping_help)

    force_help = (
        "always take the subprocess path even when the writer minor"
        " matches the current interpreter (useful for testing the"
        " cross-version code)"
    )
    rewrite.add_argument("--force-subprocess", action="store_true", help=force_help)
    rewrite.set_defaults(func=_cmd_rewrite)

    namespace = top_level.parse_args(argv)
    handler = namespace.func
    return handler(namespace)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
if __name__ == "__main__":
|
|
67
|
+
raise SystemExit(main())
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""Top-level obfuscation entry point.
|
|
2
|
+
|
|
3
|
+
Dispatches between the native rewriter (when the .pyc was written by
|
|
4
|
+
the *currently-running* interpreter) and the subprocess rewriter
|
|
5
|
+
(everything else). Returns an :class:`ObfuscationReport` carrying the
|
|
6
|
+
output path, the writer version, and the identifier mapping.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import sys
|
|
12
|
+
from dataclasses import dataclass
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
from pychd.versions import VersionInfo
|
|
16
|
+
|
|
17
|
+
from .header import header_length_for, merge_pyc, split_pyc
|
|
18
|
+
from .rewrite_native import ObfuscationMapping, anonymise
|
|
19
|
+
from .rewrite_subprocess import run_subprocess_rewrite, uv_run_command
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class ObfuscationReport:
    """Summary of a single :func:`obfuscate` run."""

    in_path: Path  # the .pyc that was read
    out_path: Path  # where the anonymised .pyc was written
    version: VersionInfo  # writer version detected from the input
    used_native: bool  # True for the in-process rewrite, False for the subprocess one
    mapping: ObfuscationMapping  # original → anonymised tables

    def total_renames(self) -> int:
        """Return the combined number of entries in all six mapping tables."""
        tables = (
            self.mapping.names,
            self.mapping.varnames,
            self.mapping.freevars,
            self.mapping.cellvars,
            self.mapping.consts,
            self.mapping.co_names,
        )
        return sum(len(table) for table in tables)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _current_minor() -> tuple[int, int]:
|
|
45
|
+
return (sys.version_info.major, sys.version_info.minor)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def obfuscate(
    in_pyc: Path,
    out_pyc: Path,
    *,
    force_subprocess: bool = False,
) -> ObfuscationReport:
    """Anonymise *in_pyc* into *out_pyc* and describe what was done.

    When the writer version of the input equals the running interpreter's
    ``(major, minor)`` and *force_subprocess* is false, the body is
    unmarshalled, rewritten with :func:`anonymise` and re-marshalled
    in-process, then written behind the original header bytes.

    Otherwise the rewrite is delegated to :func:`run_subprocess_rewrite`
    using the command built by :func:`uv_run_command` for the writer
    version (see ``rewrite_subprocess.py``).

    ``force_subprocess=True`` selects the second path unconditionally, which
    lets tests cover the cross-version code on the current interpreter.
    """
    source = Path(in_pyc)
    target = Path(out_pyc)
    version, header, body = split_pyc(source)
    hlen = header_length_for(version)

    if force_subprocess or not (version.version == _current_minor()):
        # Cross-version (or forced) path: another interpreter does the work.
        command = uv_run_command(version.version)
        mapping = run_subprocess_rewrite(command, source, target, hlen)
        went_native = False
    else:
        import marshal

        rewritten, mapping = anonymise(marshal.loads(body))
        payload = marshal.dumps(rewritten)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_bytes(merge_pyc(header, payload))
        went_native = True

    return ObfuscationReport(
        in_path=source,
        out_path=target,
        version=version,
        used_native=went_native,
        mapping=mapping,
    )
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
__all__ = ["ObfuscationReport", "obfuscate"]
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""CPython ``.pyc`` header parsing + reconstruction.
|
|
2
|
+
|
|
3
|
+
CPython has used two header layouts across the 3.x line:
|
|
4
|
+
|
|
5
|
+
* **3.0 – 3.6** (12-byte header): ``magic (4) | timestamp (4) | source_size (4)``.
|
|
6
|
+
* **3.7+** (PEP 552, 16-byte header):
|
|
7
|
+
``magic (4) | bit_field (4) | timestamp (4) + source_size (4), or an 8-byte source hash``
|
|
8
|
+
Concretely the layout is still 16 bytes total — the ``bit_field``
|
|
9
|
+
decides whether the next 8 bytes are timestamp-based (timestamp(4) +
|
|
10
|
+
source_size(4)) or hash-based (8-byte hash).
|
|
11
|
+
|
|
12
|
+
We reuse :func:`pychd.versions.read_magic` / :func:`pychd.versions.detect_version`
|
|
13
|
+
to identify the writer. ``header_length_for(version)`` then tells us
|
|
14
|
+
where the marshalled code object begins; ``split_pyc(pyc)`` returns
|
|
15
|
+
``(version, header_bytes, body_bytes)``.
|
|
16
|
+
|
|
17
|
+
We deliberately do not parse the bit_field — the obfuscator preserves
|
|
18
|
+
the original header verbatim, so re-serialising the rewritten code
|
|
19
|
+
object just needs to concatenate the original bytes with the new body.
|
|
20
|
+
The only field we would ever consider rewriting is ``source_size``:
|
|
21
|
+
zeroing it (no source on disk for an anonymised .pyc) is only considered for
|
|
22
|
+
3.7+ writers, and :func:`merge_pyc` below currently keeps the header as-is.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
|
|
29
|
+
from pychd.versions import VersionInfo, detect_version
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def header_length_for(version: VersionInfo) -> int:
    """Return how many header bytes precede the marshalled code in a .pyc.

    Writers from 3.7 onwards use the 16-byte PEP 552 header; anything
    older uses the 12-byte layout (magic, timestamp and source size, four
    bytes each).
    """
    uses_pep552_header = version.version >= (3, 7)
    return 16 if uses_pep552_header else 12
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def split_pyc(pyc_path: Path) -> tuple[VersionInfo, bytes, bytes]:
    """Load *pyc_path* and cut it into ``(version, header, body)``.

    ``version`` comes from :func:`detect_version`; ``header`` is the first
    :func:`header_length_for` bytes of the file and ``body`` is everything
    after it (the marshalled top-level code object).

    Raises:
        ValueError: the file is shorter than the header for its version.
    """
    raw = pyc_path.read_bytes()
    version = detect_version(pyc_path)
    header_size = header_length_for(version)

    if len(raw) >= header_size:
        return version, raw[:header_size], raw[header_size:]

    major, minor = version.version[0], version.version[1]
    raise ValueError(
        f"{pyc_path}: truncated — only {len(raw)} bytes but expected"
        f" at least {header_size} for Python {major}.{minor}",
    )
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def merge_pyc(header: bytes, body: bytes) -> bytes:
    """Join a header and a marshalled body back into full .pyc contents.

    Counterpart of :func:`split_pyc`: the result is simply *header*
    followed by *body*.
    """
    pyc_bytes = header + body
    return pyc_bytes
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
__all__ = ["header_length_for", "split_pyc", "merge_pyc"]
|
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
"""Native (3.14 / running-interpreter) .pyc anonymiser.
|
|
2
|
+
|
|
3
|
+
Uses :func:`marshal.loads` + :meth:`types.CodeType.replace` to rewrite
|
|
4
|
+
identifiers, constants, and metadata recursively, then re-marshals the
|
|
5
|
+
top-level code object. The opcode stream (``co_code``) is preserved
|
|
6
|
+
byte-for-byte so :mod:`dis` still walks the result and pychd's rule
|
|
7
|
+
pass still sees the same instruction structure.
|
|
8
|
+
|
|
9
|
+
The cross-version path (``rewrite_subprocess``) reuses the same
|
|
10
|
+
algorithm, just executed inside a subprocess running the target
|
|
11
|
+
interpreter.
|
|
12
|
+
|
|
13
|
+
Anonymisation rules (kept in sync with the package docstring):
|
|
14
|
+
|
|
15
|
+
* ``co_names`` → ``_n0, _n1, …``
|
|
16
|
+
* ``co_varnames`` → ``_v0, _v1, …``
|
|
17
|
+
* ``co_freevars`` → ``_f0, _f1, …``
|
|
18
|
+
* ``co_cellvars`` → ``_c0, _c1, …``
|
|
19
|
+
* ``co_consts`` → string literals → ``_s0, _s1, …``; other
|
|
20
|
+
primitives left alone; tuples / frozensets
|
|
21
|
+
mapped recursively; nested code objects
|
|
22
|
+
recursively anonymised
|
|
23
|
+
* ``co_name`` → ``_fn<depth>_<index>`` (``_fn0_0``, ``_fn1_0``, ``_fn1_1``, …)
|
|
24
|
+
* ``co_qualname`` → same per-depth scheme (3.11+ only)
|
|
25
|
+
* ``co_filename`` → fixed literal ``"<anonymised>"``
|
|
26
|
+
* ``co_lnotab`` /
|
|
27
|
+
``co_linetable`` /
|
|
28
|
+
``co_positions``→ replaced with empty bytes — pychd's rule pass
|
|
29
|
+
does not depend on line info
|
|
30
|
+
* ``co_firstlineno`` → 1
|
|
31
|
+
* docstring (the leading ``co_consts[0]`` when it is a ``str``) →
|
|
32
|
+
retained as a string but rewritten via the same ``co_consts``
|
|
33
|
+
mapping (so it ends up as ``_sN`` rather than its original text)
|
|
34
|
+
|
|
35
|
+
The function returns an :class:`ObfuscationMapping` so callers can
|
|
36
|
+
audit the rewriting (and so the unit tests can assert that every
|
|
37
|
+
emitted identifier starts with the expected prefix).
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
from __future__ import annotations
|
|
41
|
+
|
|
42
|
+
from dataclasses import dataclass, field
|
|
43
|
+
from types import CodeType
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
class ObfuscationMapping:
    """Per-category original → anonymised lookup tables built during a rewrite."""

    names: dict[str, str] = field(default_factory=dict)
    varnames: dict[str, str] = field(default_factory=dict)
    freevars: dict[str, str] = field(default_factory=dict)
    cellvars: dict[str, str] = field(default_factory=dict)
    consts: dict[str, str] = field(default_factory=dict)
    # Keyed by a code object's ``co_name`` (function name), not by ``co_names``.
    co_names: dict[str, str] = field(default_factory=dict)

    def to_dict(self) -> dict[str, dict[str, str]]:
        """Return a JSON-friendly snapshot: one shallow-copied dict per table."""
        table_order = ("names", "varnames", "freevars", "cellvars", "consts", "co_names")
        return {table: dict(getattr(self, table)) for table in table_order}
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
_ANON_FILENAME = "<anonymised>"
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _anonymise_tuple(
|
|
72
|
+
original: tuple[str, ...],
|
|
73
|
+
prefix: str,
|
|
74
|
+
mapping: dict[str, str],
|
|
75
|
+
) -> tuple[str, ...]:
|
|
76
|
+
"""Rewrite *original* (a tuple of strings) into ``_<prefix>N`` form,
|
|
77
|
+
growing *mapping* with the rename pairs.
|
|
78
|
+
|
|
79
|
+
The suffix counter is the *global* size of ``mapping`` rather than
|
|
80
|
+
the per-tuple index — otherwise two different code objects whose
|
|
81
|
+
parameter lists each start at index 0 would both map their first
|
|
82
|
+
fresh name to ``_<prefix>0``, producing duplicate-argument bugs
|
|
83
|
+
when ``apply_mapping_to_source`` writes them out.
|
|
84
|
+
"""
|
|
85
|
+
out: list[str] = []
|
|
86
|
+
for name in original:
|
|
87
|
+
if name in mapping:
|
|
88
|
+
out.append(mapping[name])
|
|
89
|
+
continue
|
|
90
|
+
new_name = f"_{prefix}{len(mapping)}"
|
|
91
|
+
mapping[name] = new_name
|
|
92
|
+
out.append(new_name)
|
|
93
|
+
return tuple(out)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _anonymise_const(
|
|
97
|
+
const: object,
|
|
98
|
+
mapping: ObfuscationMapping,
|
|
99
|
+
depth: int,
|
|
100
|
+
depth_counter: dict[int, int],
|
|
101
|
+
) -> object:
|
|
102
|
+
"""Recursively rewrite a ``co_consts`` entry.
|
|
103
|
+
|
|
104
|
+
* Strings become ``_sN`` (interned across the whole code-object
|
|
105
|
+
tree so equal strings get the same anonymised name).
|
|
106
|
+
* Tuples / frozensets are remapped element-by-element so they
|
|
107
|
+
remain hashable.
|
|
108
|
+
* Nested :class:`CodeType` objects are recursively anonymised.
|
|
109
|
+
* Numbers, bytes, ``None``, ``True``, ``False``, ``Ellipsis`` are
|
|
110
|
+
preserved (the LLM cannot infer source identity from a numeric
|
|
111
|
+
literal that the rule pass also sees verbatim).
|
|
112
|
+
"""
|
|
113
|
+
if isinstance(const, str):
|
|
114
|
+
if const in mapping.consts:
|
|
115
|
+
return mapping.consts[const]
|
|
116
|
+
new = f"_s{len(mapping.consts)}"
|
|
117
|
+
mapping.consts[const] = new
|
|
118
|
+
return new
|
|
119
|
+
if isinstance(const, tuple):
|
|
120
|
+
return tuple(
|
|
121
|
+
_anonymise_const(item, mapping, depth, depth_counter) for item in const
|
|
122
|
+
)
|
|
123
|
+
if isinstance(const, frozenset):
|
|
124
|
+
return frozenset(
|
|
125
|
+
_anonymise_const(item, mapping, depth, depth_counter) for item in const
|
|
126
|
+
)
|
|
127
|
+
if isinstance(const, CodeType):
|
|
128
|
+
return _anonymise_code(const, mapping, depth + 1, depth_counter)
|
|
129
|
+
# int / float / complex / bool / None / bytes / Ellipsis: keep.
|
|
130
|
+
return const
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _empty_lineinfo() -> bytes:
|
|
134
|
+
return b""
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _anonymise_code(
    code: CodeType,
    mapping: ObfuscationMapping,
    depth: int,
    depth_counter: dict[int, int],
) -> CodeType:
    """Return a copy of *code* with identifiers, constants and metadata anonymised.

    ``co_code`` is never passed to ``replace()`` so the opcode stream is
    carried over unchanged. *mapping* is extended in place with every rename.
    *depth* is the nesting level of *code* (0 for the top-level object) and
    *depth_counter* counts how many code objects have been named so far at
    each level.
    """
    # Identifier tuples.
    new_names = _anonymise_tuple(code.co_names, "n", mapping.names)
    new_varnames = _anonymise_tuple(code.co_varnames, "v", mapping.varnames)
    new_freevars = _anonymise_tuple(code.co_freevars, "f", mapping.freevars)

    # An argument captured by a nested function is listed in BOTH
    # ``co_varnames`` and ``co_cellvars``; CPython recognises it as a single
    # slot by the name being equal in the two tuples. Giving it ``_vN`` in
    # one and ``_cN`` in the other would split it into two slots and shift
    # the cell / free-variable indices the unchanged bytecode relies on, so
    # such a cell reuses the alias it already received as a local.
    own_locals = set(code.co_varnames)
    new_cellvars = tuple(
        mapping.varnames[name]
        if name in own_locals
        else _anonymise_tuple((name,), "c", mapping.cellvars)[0]
        for name in code.co_cellvars
    )

    # Constants (recursive) — nested code objects are renamed before this one.
    new_consts = tuple(
        _anonymise_const(c, mapping, depth, depth_counter) for c in code.co_consts
    )

    # Function name: ``_fn<depth>_<index>``, where <index> counts the code
    # objects already named at this depth (``_fn0_0`` for the top level).
    n_at_depth = depth_counter.setdefault(depth, 0)
    new_co_name = f"_fn{depth}_{n_at_depth}"
    depth_counter[depth] = n_at_depth + 1
    # NOTE: keyed by the original ``co_name``, so code objects sharing a name
    # (e.g. several ``<lambda>``) keep only the most recent entry here.
    mapping.co_names[code.co_name] = new_co_name

    # Fields every interpreter with ``CodeType.replace`` accepts.
    new_code = code.replace(
        co_names=new_names,
        co_varnames=new_varnames,
        co_freevars=new_freevars,
        co_cellvars=new_cellvars,
        co_consts=new_consts,
        co_name=new_co_name,
        co_filename=_ANON_FILENAME,
        co_firstlineno=1,
    )
    # Version-dependent fields, applied one ``replace`` at a time so an
    # interpreter that lacks a field simply skips it.
    if hasattr(new_code, "co_qualname"):
        new_code = new_code.replace(co_qualname=new_co_name)
    if hasattr(new_code, "co_linetable"):
        # 3.10+ exposes the line table as ``co_linetable``; from then on
        # ``replace()`` no longer takes ``co_lnotab``, so it must not be
        # attempted as well.
        new_code = new_code.replace(co_linetable=_empty_lineinfo())
    else:
        # 3.9 and earlier: ``co_lnotab`` is the line table.
        new_code = new_code.replace(co_lnotab=_empty_lineinfo())
    # ``co_exceptiontable`` (3.11+) carries try/except metadata; the
    # opcode stream still needs valid handler offsets so we leave it
    # alone. ``co_positions`` is computed lazily from co_linetable so
    # zeroing the table is enough.
    return new_code
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def anonymise(code: CodeType) -> tuple[CodeType, ObfuscationMapping]:
    """Anonymise a top-level code object.

    Returns the rewritten code object together with the
    :class:`ObfuscationMapping` that records every rename made while
    producing it.
    """
    table = ObfuscationMapping()
    per_depth_counts: dict[int, int] = {}
    rewritten = _anonymise_code(code, table, depth=0, depth_counter=per_depth_counts)
    return rewritten, table
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
__all__ = ["ObfuscationMapping", "anonymise"]
|
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
"""Cross-version .pyc anonymiser via subprocess into the target Python.
|
|
2
|
+
|
|
3
|
+
The native rewriter (``rewrite_native``) only works for .pyc files
|
|
4
|
+
produced by the currently-running interpreter — ``types.CodeType``
|
|
5
|
+
internals (e.g. ``co_qualname`` availability, exception-table layout
|
|
6
|
+
on 3.11+, the older ``co_lnotab`` shape on 3.10-) differ across
|
|
7
|
+
minors and the ``replace`` kwarg surface must match.
|
|
8
|
+
|
|
9
|
+
To stay version-agnostic without re-implementing every layout, this
|
|
10
|
+
module spawns the *writer*'s Python interpreter under ``uv run
|
|
11
|
+
--python 3.X --no-project python -c "<snippet>"`` and runs the same
|
|
12
|
+
``marshal.loads → recursive replace → marshal.dumps`` dance inside
|
|
13
|
+
that subprocess. The snippet is the multi-line string at the bottom
|
|
14
|
+
of this file, kept as a plain ``str`` so it is easy to read and
|
|
15
|
+
review.
|
|
16
|
+
|
|
17
|
+
Constraints:
|
|
18
|
+
|
|
19
|
+
* Communication is via three file-paths passed on argv (input .pyc,
|
|
20
|
+
output .pyc, mapping JSON) plus the header length. No piping marshalled bytes through
|
|
21
|
+
stdin/stdout — keeps the snippet trivial and avoids encoding
|
|
22
|
+
issues across 3.x.
|
|
23
|
+
* ``uv`` is the only required tooling — it manages downloading the
|
|
24
|
+
target Python on first run via python-build-standalone. Hosts
|
|
25
|
+
without uv get a clear ``FileNotFoundError`` instead of a
|
|
26
|
+
baffling ``subprocess`` traceback.
|
|
27
|
+
* 30-second wall-clock timeout per call, matching the cross-version
|
|
28
|
+
fixture builder (``tools/build_multiversion_fixtures.py``).
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
from __future__ import annotations
|
|
32
|
+
|
|
33
|
+
import json
|
|
34
|
+
import subprocess
|
|
35
|
+
from pathlib import Path
|
|
36
|
+
|
|
37
|
+
from .rewrite_native import ObfuscationMapping
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _snippet() -> str:
    """Return the subprocess script as a single-string ``-c`` body.

    The script only imports ``json`` / ``marshal`` / ``sys`` / ``pathlib``
    so nothing has to be installed inside the target interpreter.

    It expects four positional arguments after ``-c <script>``:
    input .pyc path, output .pyc path, mapping JSON path, and the length
    in bytes of the .pyc header that is copied through unchanged.

    The script rewrites every code object reachable through ``co_consts``:
    names, varnames, freevars, cellvars, string constants, ``co_name``
    (and ``co_qualname`` when present), ``co_filename``,
    ``co_firstlineno`` and the line table.  The old -> new tables are
    dumped as JSON to the mapping path.
    """
    return r"""
import json
import marshal
import sys
from pathlib import Path

# argv layout: in_pyc, out_pyc, mapping_json, header_len
in_pyc = Path(sys.argv[1])
out_pyc = Path(sys.argv[2])
mapping_path = Path(sys.argv[3])
header_len = int(sys.argv[4])

data = in_pyc.read_bytes()
header = data[:header_len]
body = data[header_len:]
code = marshal.loads(body)

mapping = {
    "names": {},
    "varnames": {},
    "freevars": {},
    "cellvars": {},
    "consts": {},
    "co_names": {},
}

# How many code objects have already been renamed at each nesting depth.
_depth_counters = {}


def _anon_tuple(seq, prefix, table):
    # Use the global size of *table* as the suffix counter so two
    # distinct code objects with the same per-tuple index 0 do not
    # both claim "_<prefix>0" -- that produces duplicate-arg bugs in
    # the anonymised source.
    out = []
    for name in seq:
        if name not in table:
            table[name] = "_" + prefix + str(len(table))
        out.append(table[name])
    return tuple(out)


def _anon_const(c, depth):
    # *depth* is the nesting depth of the code object that owns *c*.
    # It is passed explicitly: rebinding a module-level helper on every
    # _anon_code() call leaks the deeper value back to the caller and
    # mis-numbers every later sibling code object.
    if isinstance(c, str):
        if c not in mapping["consts"]:
            mapping["consts"][c] = "_s" + str(len(mapping["consts"]))
        return mapping["consts"][c]
    if isinstance(c, tuple):
        return tuple(_anon_const(item, depth) for item in c)
    if isinstance(c, frozenset):
        return frozenset(_anon_const(item, depth) for item in c)
    if type(c).__name__ == "code":  # CodeType
        return _anon_code(c, depth + 1)
    return c


def _anon_code(code, depth):
    new_names = _anon_tuple(code.co_names, "n", mapping["names"])
    new_varnames = _anon_tuple(code.co_varnames, "v", mapping["varnames"])
    new_freevars = _anon_tuple(code.co_freevars, "f", mapping["freevars"])
    new_cellvars = _anon_tuple(code.co_cellvars, "c", mapping["cellvars"])

    # Nested code objects are renamed here, i.e. before this one.
    new_consts = tuple(_anon_const(c, depth) for c in code.co_consts)

    n_at_depth = _depth_counters.get(depth, 0)
    new_co_name = "_fn" + str(depth) + "_" + str(n_at_depth)
    _depth_counters[depth] = n_at_depth + 1
    # Keyed by the original co_name, so repeated names (for example
    # several "<lambda>") keep only the last assignment.
    mapping["co_names"][code.co_name] = new_co_name

    kwargs = dict(
        co_names=new_names,
        co_varnames=new_varnames,
        co_freevars=new_freevars,
        co_cellvars=new_cellvars,
        co_consts=new_consts,
        co_name=new_co_name,
        co_filename="<anonymised>",
        co_firstlineno=1,
    )
    if hasattr(code, "co_qualname"):
        kwargs["co_qualname"] = new_co_name
    # Interpreters that expose co_linetable (3.10+) take the line table
    # through that kwarg and code.replace rejects co_lnotab there, even
    # though the attribute may still exist as a read-only alias.  Only
    # fall back to co_lnotab when co_linetable is absent (3.8 / 3.9).
    if hasattr(code, "co_linetable"):
        kwargs["co_linetable"] = b""
    elif hasattr(code, "co_lnotab"):
        kwargs["co_lnotab"] = b""
    return code.replace(**kwargs)


new_code = _anon_code(code, 0)
new_body = marshal.dumps(new_code)
out_pyc.write_bytes(header + new_body)
mapping_path.write_text(json.dumps(mapping))
"""
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def run_subprocess_rewrite(
    target_python: str,
    in_pyc: Path,
    out_pyc: Path,
    header_len: int,
    *,
    timeout: float = 30.0,
) -> ObfuscationMapping:
    """Spawn *target_python* and rewrite *in_pyc* into *out_pyc*.

    *target_python* is a command string that, when executed, runs the
    desired Python minor, e.g. ``uv run --python 3.X --no-project
    python``.  It is always tokenised with :func:`shlex.split`; the
    ``-c`` script and its four arguments are appended to the result.

    The subprocess writes its mapping JSON to a temporary file created
    next to *out_pyc*.  That file is parsed here and removed on every
    exit path, including failures.

    Args:
        target_python: Launcher command for the target interpreter.
        in_pyc: Path of the .pyc to anonymise.
        out_pyc: Destination path; its parent directory is created.
        header_len: Number of leading header bytes copied unchanged.
        timeout: Wall-clock limit in seconds for the subprocess.

    Returns:
        An :class:`ObfuscationMapping` populated from the JSON tables.

    Raises:
        FileNotFoundError: The launcher executable could not be started.
        subprocess.TimeoutExpired: The subprocess exceeded *timeout*.
        RuntimeError: The subprocess exited with a non-zero status.
    """
    import shlex
    import tempfile

    out_pyc.parent.mkdir(parents=True, exist_ok=True)
    # delete=False: only the path is needed; the subprocess reopens and
    # writes the file itself, and cleanup happens in ``finally`` below.
    with tempfile.NamedTemporaryFile(
        prefix="pyobf-map-",
        suffix=".json",
        dir=out_pyc.parent,
        delete=False,
    ) as fh:
        mapping_path = Path(fh.name)

    cmd = shlex.split(target_python) + [
        "-c",
        _snippet(),
        str(in_pyc),
        str(out_pyc),
        str(mapping_path),
        str(header_len),
    ]
    try:
        try:
            proc = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=timeout,
            )
        except FileNotFoundError as exc:
            # Same exception type, but name the launcher that is missing
            # instead of surfacing a bare errno from inside subprocess.
            raise FileNotFoundError(
                exc.errno,
                f"pyobf: cannot launch {cmd[0]!r}; is it installed and on PATH?",
            ) from exc
        if proc.returncode != 0:
            raise RuntimeError(
                f"pyobf subprocess (cmd={cmd[:4]!r}) failed: "
                f"rc={proc.returncode}, stderr={proc.stderr!r}"
            )
        raw = json.loads(mapping_path.read_text(encoding="utf-8"))
    finally:
        # Previously the temp file leaked on timeout, on a missing
        # launcher and on malformed JSON.
        mapping_path.unlink(missing_ok=True)

    om = ObfuscationMapping()
    for table in ("names", "varnames", "freevars", "cellvars", "consts", "co_names"):
        getattr(om, table).update(raw[table])
    return om
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def uv_run_command(version: tuple[int, int]) -> str:
    """Build the ``uv run`` command line for a given Python minor.

    *version* is a ``(major, minor)`` pair; only its first two items are
    read.  The resulting command asks ``uv`` for that interpreter with
    ``--no-project``, so no project dependencies are involved.

    Kept as a single function so every caller builds the command the
    same way, and so tests can monkey-patch it when running offline.
    """
    major = version[0]
    minor = version[1]
    return f"uv run --python {major}.{minor} --no-project python"
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
__all__ = ["run_subprocess_rewrite", "uv_run_command"]
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "pychd-pyobf"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Anonymise identifiers / constants / metadata inside a CPython .pyc while preserving the opcode stream — for contamination-free decompiler benchmarking"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
authors = [
|
|
7
|
+
{ name = "卍diohabara卍", email = "diohabara@users.noreply.github.com" }
|
|
8
|
+
]
|
|
9
|
+
dependencies = [
|
|
10
|
+
# We reuse ``pychd.versions.detect_version`` to identify the .pyc
|
|
11
|
+
# magic number. xdis is a transitive dependency through pychd but we
|
|
12
|
+
# do not import it directly — the cross-version rewrite delegates to
|
|
13
|
+
# a subprocess running the target Python.
|
|
14
|
+
"pychd>=1.2.0",
|
|
15
|
+
]
|
|
16
|
+
requires-python = ">= 3.14"
|
|
17
|
+
|
|
18
|
+
[build-system]
|
|
19
|
+
requires = ["hatchling"]
|
|
20
|
+
build-backend = "hatchling.build"
|
|
21
|
+
|
|
22
|
+
[project.scripts]
|
|
23
|
+
pychd-pyobf = "pychd_pyobf.cli:main"
|
|
24
|
+
|
|
25
|
+
[tool.hatch.build.targets.wheel]
|
|
26
|
+
packages = ["pychd_pyobf"]
|