picorescue 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,90 @@
1
+ Metadata-Version: 2.3
2
+ Name: picorescue
3
+ Version: 0.1.0
4
+ Summary: Inspect and recover LittleFS/FAT partitions from Raspberry Pi Pico flash dumps
5
+ Author: Phil Howard
6
+ Author-email: Phil Howard <github@gadgetoid.com>
7
+ Requires-Dist: click>=8.1
8
+ Requires-Dist: littlefs-python>=0.12
9
+ Requires-Python: >=3.11
10
+ Description-Content-Type: text/markdown
11
+
12
+ # picorescue
13
+
14
+ Find, inspect and recover data from Raspberry Pi Pico (RP2040 / RP2350) flash
15
+ dumps. Built for Pimoroni-style images that pair a MicroPython firmware with a
16
+ **LittleFS** filesystem (and optionally a **FAT** partition), where the rescue
17
+ target is usually a handful of accidentally-deleted Python scripts.
18
+
19
+ It reads the `bi_decl` *binary info* region (the same `BlockDevice` declarations
20
+ parsed by [`py_decl`](https://github.com/gadgetoid/py_decl)) to locate
21
+ partitions, then reads and recovers data from them. It also signature-scans the
22
+ whole dump and merges the results, so it finds filesystems that are missing from
23
+ `bi_decl` or *nested* inside a declared device - e.g. the `dir2uf2 --fs-reserve`
24
+ hybrid where a LittleFS sits in the tail of a FAT block device.
25
+
26
+ ## Install / run
27
+
28
+ Uses [uv](https://docs.astral.sh/uv/):
29
+
30
+ ```bash
31
+ uv sync
32
+ uv run picorescue --help
33
+ ```
34
+
35
+ Accepts a raw flash dump (`.bin`, offset 0 = flash base `0x10000000`) or a `.uf2`
36
+ (reassembled to the correct flash addresses, gaps filled with `0xFF`).
37
+
38
+ A full-flash dump is ideal. You can grab one with `picotool save -a flash.bin`
39
+ or over SWD with OpenOCD.
40
+
41
+ ## Commands
42
+
43
+ ```bash
44
+ picorescue info DUMP # bi_decl info + discovered partitions
45
+ picorescue partitions DUMP # just the partition table
46
+ picorescue ls DUMP [-p NAME] # list live files in each filesystem
47
+ picorescue extract DUMP OUTDIR # extract live (non-deleted) files
48
+ picorescue recover DUMP OUTDIR # recover deleted / orphaned data
49
+ ```
50
+
51
+ `recover` options: `-p/--partition NAME`, `--no-carve`, `--whole-dump` (carve the
52
+ entire image, not just known partitions), `--min-score` (carver threshold).
53
+
54
+ ## How recovery works
55
+
56
+ `recover` writes into `OUTDIR/<partition>/` and a top-level `MANIFEST.json`
57
+ describing every recovered item and the method used.
58
+
59
+ - **LittleFS metadata** (`deleted/`, `metadata/`) - LittleFS is a copy-on-write,
60
+ log-structured filesystem. Deleting or overwriting a file leaves the old
61
+ commit (its name and, for small files, its *inline* content) in the metadata
62
+ block until that block is erased and compacted. picorescue threads the
63
+ XOR-delta tag log of every metadata block and pulls out names + inline data
64
+ across **all** commits, not just the live view. Entries whose name is absent
65
+ from the mounted filesystem are flagged as likely-deleted and sorted into
66
+ `deleted/`.
67
+ - **FAT undelete** (`undelete/`) - deletion only sets the directory entry's
68
+ first name byte to `0xE5` and frees the cluster chain; the starting cluster
69
+ and size remain, so contiguous files undelete cleanly. Verify integrity of
70
+ anything recovered this way - fragmented files may be partial.
71
+ - **Carving** (`carved/`) - filesystem-agnostic. Scans raw blocks for printable
72
+ text runs and scores them for "Python-ness" (`import`/`def`/`class`, indented
73
+ multi-line structure, assignments). Catches scripts whose directory entry or
74
+ metadata is gone but whose content still lingers in flash. Carved content is
75
+ de-duplicated against live files.
76
+
77
+ Recovered scripts are best-effort: always eyeball them. Carved fragments may
78
+ have a stray leading byte or be truncated at a block boundary.
79
+
80
+ ## Layout
81
+
82
+ ```
83
+ src/picorescue/
84
+ bidecl.py vendored py_decl bi_decl parser (BlockDevice discovery)
85
+ dump.py load .bin/.uf2 -> addressed flash image; partition discovery
86
+ lfs.py LittleFS mount/list/extract + metadata-log recovery
87
+ fat.py FAT12/16/32 read + 0xE5 undelete
88
+ carve.py Python-source carver
89
+ cli.py click CLI
90
+ ```
@@ -0,0 +1,79 @@
1
+ # picorescue
2
+
3
+ Find, inspect and recover data from Raspberry Pi Pico (RP2040 / RP2350) flash
4
+ dumps. Built for Pimoroni-style images that pair a MicroPython firmware with a
5
+ **LittleFS** filesystem (and optionally a **FAT** partition), where the rescue
6
+ target is usually a handful of accidentally-deleted Python scripts.
7
+
8
+ It reads the `bi_decl` *binary info* region (the same `BlockDevice` declarations
9
+ parsed by [`py_decl`](https://github.com/gadgetoid/py_decl)) to locate
10
+ partitions, then reads and recovers data from them. It also signature-scans the
11
+ whole dump and merges the results, so it finds filesystems that are missing from
12
+ `bi_decl` or *nested* inside a declared device - e.g. the `dir2uf2 --fs-reserve`
13
+ hybrid where a LittleFS sits in the tail of a FAT block device.
14
+
15
+ ## Install / run
16
+
17
+ Uses [uv](https://docs.astral.sh/uv/):
18
+
19
+ ```bash
20
+ uv sync
21
+ uv run picorescue --help
22
+ ```
23
+
24
+ Accepts a raw flash dump (`.bin`, offset 0 = flash base `0x10000000`) or a `.uf2`
25
+ (reassembled to the correct flash addresses, gaps filled with `0xFF`).
26
+
27
+ A full-flash dump is ideal. You can grab one with `picotool save -a flash.bin`
28
+ or over SWD with OpenOCD.
29
+
30
+ ## Commands
31
+
32
+ ```bash
33
+ picorescue info DUMP # bi_decl info + discovered partitions
34
+ picorescue partitions DUMP # just the partition table
35
+ picorescue ls DUMP [-p NAME] # list live files in each filesystem
36
+ picorescue extract DUMP OUTDIR # extract live (non-deleted) files
37
+ picorescue recover DUMP OUTDIR # recover deleted / orphaned data
38
+ ```
39
+
40
+ `recover` options: `-p/--partition NAME`, `--no-carve`, `--whole-dump` (carve the
41
+ entire image, not just known partitions), `--min-score` (carver threshold).
42
+
43
+ ## How recovery works
44
+
45
+ `recover` writes into `OUTDIR/<partition>/` and a top-level `MANIFEST.json`
46
+ describing every recovered item and the method used.
47
+
48
+ - **LittleFS metadata** (`deleted/`, `metadata/`) - LittleFS is a copy-on-write,
49
+ log-structured filesystem. Deleting or overwriting a file leaves the old
50
+ commit (its name and, for small files, its *inline* content) in the metadata
51
+ block until that block is erased and compacted. picorescue threads the
52
+ XOR-delta tag log of every metadata block and pulls out names + inline data
53
+ across **all** commits, not just the live view. Entries whose name is absent
54
+ from the mounted filesystem are flagged as likely-deleted and sorted into
55
+ `deleted/`.
56
+ - **FAT undelete** (`undelete/`) - deletion only sets the directory entry's
57
+ first name byte to `0xE5` and frees the cluster chain; the starting cluster
58
+ and size remain, so contiguous files undelete cleanly. Verify integrity of
59
+ anything recovered this way - fragmented files may be partial.
60
+ - **Carving** (`carved/`) - filesystem-agnostic. Scans raw blocks for printable
61
+ text runs and scores them for "Python-ness" (`import`/`def`/`class`, indented
62
+ multi-line structure, assignments). Catches scripts whose directory entry or
63
+ metadata is gone but whose content still lingers in flash. Carved content is
64
+ de-duplicated against live files.
65
+
66
+ Recovered scripts are best-effort: always eyeball them. Carved fragments may
67
+ have a stray leading byte or be truncated at a block boundary.
68
+
69
+ ## Layout
70
+
71
+ ```
72
+ src/picorescue/
73
+ bidecl.py vendored py_decl bi_decl parser (BlockDevice discovery)
74
+ dump.py load .bin/.uf2 -> addressed flash image; partition discovery
75
+ lfs.py LittleFS mount/list/extract + metadata-log recovery
76
+ fat.py FAT12/16/32 read + 0xE5 undelete
77
+ carve.py Python-source carver
78
+ cli.py click CLI
79
+ ```
@@ -0,0 +1,20 @@
1
+ [project]
2
+ name = "picorescue"
3
+ version = "0.1.0"
4
+ description = "Inspect and recover LittleFS/FAT partitions from Raspberry Pi Pico flash dumps"
5
+ readme = "README.md"
6
+ authors = [
7
+ { name = "Phil Howard", email = "github@gadgetoid.com" }
8
+ ]
9
+ requires-python = ">=3.11"
10
+ dependencies = [
11
+ "click>=8.1",
12
+ "littlefs-python>=0.12",
13
+ ]
14
+
15
+ [project.scripts]
16
+ picorescue = "picorescue.cli:cli"
17
+
18
+ [build-system]
19
+ requires = ["uv_build>=0.11.23,<0.12.0"]
20
+ build-backend = "uv_build"
@@ -0,0 +1,3 @@
1
+ """picorescue: inspect & recover LittleFS/FAT partitions from Pico flash dumps."""
2
+
3
+ __all__ = ["bidecl", "dump", "lfs", "fat", "carve"]
@@ -0,0 +1,289 @@
1
+ """Vendored Binary Info (bi_decl) parser, adapted from py_decl.
2
+
3
+ Originally from https://github.com/gadgetoid/py_decl (MIT). Trimmed to the
4
+ parser itself; the CLI/argparse harness is removed. Used here to locate
5
+ ``BlockDevice`` declarations (LittleFS / FAT partitions) inside a Pico binary.
6
+ """
7
+ import io
8
+ import struct
9
+ import sys
10
+
11
# UF2 container magic numbers.
UF2_MAGIC_START0 = 0x0A324655  # "UF2\n"
UF2_MAGIC_START1 = 0x9E5D5157
UF2_MAGIC_END = 0x0AB16F30

# UF2 family IDs. UF2Reader keeps only the first section whose family is
# RP2040 or RP2350.
FAMILY_ID_RP2040 = 0xE48BFF56
FAMILY_ID_PAD = 0xE48BFF57
FAMILY_ID_RP2350 = 0xE48BFF59

# Addresses stored in the binary info table are absolute; PyDecl subtracts
# this base to turn them into offsets within the image.
FLASH_START_ADDR = 0x10000000

# UF2 block geometry in bytes: UF2Reader reads BLOCK_SIZE at a time, unpacks
# the first HEADER_SIZE bytes as eight little-endian uint32 values and takes
# the DATA_SIZE bytes that follow as payload.
BLOCK_SIZE = 512
DATA_SIZE = 256
HEADER_SIZE = 32
FOOTER_SIZE = 4

# Markers that bracket the binary info header PyDecl.parse() searches for.
BI_MAGIC = b"\xf2\xeb\x88\x71"
BI_END = b"\x90\xa3\x1a\xe7"

# Names for the 4-bit function field decoded from "pins with func" entries.
GPIO_FUNCS = {
    0: "XIP", 1: "SPI", 2: "UART", 3: "I2C", 4: "PWM", 5: "SIO",
    6: "PIO0", 7: "PIO1", 8: "GPCK", 9: "USB", 0xF: "NULL",
}

# Entry type codes, read from the first 16 bits of every entry. Only the types
# registered in PyDecl.entry_parsers are decoded; the rest are skipped.
TYPE_RAW_DATA = 1
TYPE_SIZED_DATA = 2
TYPE_LIST_ZERO_TERMINATED = 3
TYPE_BSON = 4
TYPE_ID_AND_INT = 5
TYPE_ID_AND_STRING = 6
TYPE_BLOCK_DEVICE = 7
TYPE_PINS_WITH_FUNC = 8
TYPE_PINS_WITH_NAME = 9
TYPE_NAMED_GROUP = 10

# IDs carried by "ID & Int" / "ID & Str" entries.
ID_PROGRAM_NAME = 0x02031C86
ID_PROGRAM_VERSION_STRING = 0x11A9BC3A
ID_PROGRAM_BUILD_DATE_STRING = 0x9DA22254
ID_BINARY_END = 0x68F465DE
ID_PROGRAM_URL = 0x1856239A
ID_PROGRAM_DESCRIPTION = 0xB6A07C19
ID_PROGRAM_FEATURE = 0xA1F4B453
ID_PROGRAM_BUILD_ATTRIBUTE = 0x4275F0D3
ID_SDK_VERSION = 0x5360B3AB
ID_PICO_BOARD = 0xB63CFFBB
ID_BOOT2_NAME = 0x7F8882E1
ID_FILESYSTEM = 0x1009BE7E

# Human-readable labels for known IDs. With spaces removed these become the
# keys of the dict returned by PyDecl.parse(). ID_FILESYSTEM has no label, so
# entries carrying it are keyed by the raw integer ID instead.
IDS = {
    ID_PROGRAM_NAME: "Program Name",
    ID_PROGRAM_VERSION_STRING: "Program Version",
    ID_PROGRAM_BUILD_DATE_STRING: "Build Date",
    ID_BINARY_END: "Binary End Address",
    ID_PROGRAM_URL: "Program URL",
    ID_PROGRAM_DESCRIPTION: "Program Description",
    ID_PROGRAM_FEATURE: "Program Feature",
    ID_PROGRAM_BUILD_ATTRIBUTE: "Program Build Attribute",
    ID_SDK_VERSION: "SDK Version",
    ID_PICO_BOARD: "Pico Board",
    ID_BOOT2_NAME: "Boot Stage 2 Name",
}

# Labels for entry type codes; used only in debug output.
TYPES = {
    TYPE_RAW_DATA: "Raw Data",
    TYPE_SIZED_DATA: "Sized Data",
    TYPE_LIST_ZERO_TERMINATED: "Zero Terminated List",
    TYPE_BSON: "BSON",
    TYPE_ID_AND_INT: "ID & Int",
    TYPE_ID_AND_STRING: "ID & Str",
    TYPE_BLOCK_DEVICE: "Block Device",
    TYPE_PINS_WITH_FUNC: "Pins With Func",
    TYPE_PINS_WITH_NAME: "Pins With Name",
    TYPE_NAMED_GROUP: "Named Group",
}

# Block device permission / partition-table flags.
BLOCK_DEV_FLAG_READ = 1 << 0
BLOCK_DEV_FLAG_WRITE = 1 << 1
BLOCK_DEV_FLAG_REFORMAT = 1 << 2

# Result keys whose value is always a list, even when only one entry exists.
ALWAYS_A_LIST = ("NamedGroup", "BlockDevice", "ProgramFeature")
91
+
92
+
93
class UF2Reader(io.BytesIO):
    """Flatten the first RP2040/RP2350 family section of a UF2 into a BytesIO.

    NOTE: this concatenates block data and is only suitable for the contiguous
    firmware region (which is what bi_decl parsing needs). For correctly
    *addressed* flash images use :func:`picorescue.dump.load_image`.

    The buffer is empty when the file contains no RP2040/RP2350 section.
    """

    def __init__(self, filepath):
        """Load ``filepath`` and expose the first matching section's payload."""
        bin_data = b""
        sections = self.uf2_to_bin(filepath)
        try:
            for _, _, family_id, _, _, block_data in sections:
                if family_id in (FAMILY_ID_RP2040, FAMILY_ID_RP2350):
                    bin_data = block_data
                    break
        finally:
            # Finalise the generator now so the file it holds open is closed
            # deterministically, even though we stop at the first match.
            sections.close()
        super().__init__(bin_data)

    def uf2_to_bin(self, filepath):
        """Yield one tuple per UF2 section found in ``filepath``.

        A section starts at a block whose block number is 0; blocks seen
        before the first such block are skipped. Each tuple is
        ``(section_index, addr, family_id, payload_size, num_blocks, data)``
        where the header values come from the section's first block
        (``payload_size`` is the fifth header word, the payload-size field of
        the UF2 block header) and ``data`` is the concatenated payload of
        every block in the section.

        Reading stops at the first block too short to hold a header, so a
        truncated file yields the sections read so far instead of raising.
        """
        with open(filepath, "rb") as file:
            section_index = 0
            while True:
                block_start = file.tell()
                data = file.read(BLOCK_SIZE)
                if len(data) < HEADER_SIZE:
                    break
                (_, _, _, addr, payload_size, block_no, num_blocks,
                 family_id) = struct.unpack("<IIIIIIII", data[:HEADER_SIZE])
                if block_no != 0:
                    continue
                # Rewind to the start of this block so the section reader
                # sees it again as the first block of the section.
                file.seek(block_start)
                yield (
                    section_index, addr, family_id, payload_size, num_blocks,
                    b"".join(self.uf2_section_data(file)),
                )
                section_index += 1

    def uf2_section_data(self, file):
        """Yield the payload of each block of the section at ``file``'s position.

        Stops at end of file, at a block too short to hold a header, or at
        the next block numbered 0 (the start of the following section), in
        which case the file is rewound to the start of that block.
        """
        count = 0
        while True:
            block_start = file.tell()
            data = file.read(BLOCK_SIZE)
            if len(data) < HEADER_SIZE:
                break
            _, _, _, _, _, block_no, _, _ = struct.unpack(
                "<IIIIIIII", data[:HEADER_SIZE]
            )
            if block_no == 0 and count > 0:
                file.seek(block_start)
                break
            yield data[HEADER_SIZE:HEADER_SIZE + DATA_SIZE]
            count += 1
136
+
137
+
138
class PyDecl:
    """Parser for the binary info (``bi_decl``) table embedded in a Pico binary.

    ``file`` must be a seekable binary file-like object in which offset 0
    corresponds to ``FLASH_START_ADDR`` (see :meth:`addr_to_bin_offset`).
    """

    def __init__(self, file, debug=False):
        """Store ``file`` and build the entry-type -> parser dispatch table.

        With ``debug`` set, entries that cannot be decoded are reported on
        stderr instead of being skipped silently.
        """
        self.entry_parsers = {
            TYPE_ID_AND_INT: self._parse_type_id_and_int,
            TYPE_ID_AND_STRING: self._parse_type_id_and_str,
            TYPE_BLOCK_DEVICE: self._parse_block_device,
            TYPE_NAMED_GROUP: self._parse_named_group,
            TYPE_PINS_WITH_FUNC: self._parse_pins_with_func,
            TYPE_PINS_WITH_NAME: self._parse_pins_with_name,
        }
        self.file = file
        self.debug = debug

    def parse(self):
        """Locate and decode the binary info table.

        Returns a dict of decoded entries, or ``None`` when the header markers
        are missing or the header/entry table is malformed or truncated.
        Repeated keys are collected into lists, keys named in
        ``ALWAYS_A_LIST`` are always lists, and all ``"Pins"`` entries are
        merged into one dict keyed by pin number. Entries whose key matches a
        named group's ``id`` are moved into that group's ``"data"``.
        """
        self.file.seek(0)
        if self.read_until(BI_MAGIC) is None:
            return None
        header = self.read_until(BI_END)
        # read_until() returns None when BI_END is never found.
        if header is None or len(header) != 12:
            return None
        # The table is little-endian, like every other structure decoded
        # here; be explicit rather than relying on the host byte order.
        entries_start, entries_end, _ = struct.unpack("<III", header)
        entries_start = self.addr_to_bin_offset(entries_start)
        entries_end = self.addr_to_bin_offset(entries_end)
        entries_bytes_len = entries_end - entries_start
        if entries_start < 0 or entries_bytes_len < 0 or entries_bytes_len % 4:
            return None

        self.file.seek(entries_start)
        table = self.file.read(entries_bytes_len)
        if len(table) != entries_bytes_len:
            return None
        entries = struct.unpack(f"<{entries_bytes_len // 4}I", table)

        parsed = {}
        for entry in entries:
            offset = self.addr_to_bin_offset(entry)
            if offset < 0:
                # Pointer below the flash base: nothing in the image to read.
                continue
            self.file.seek(offset)
            parsed_entry = self.parse_entry()
            if parsed_entry is None:
                continue
            k, v = parsed_entry
            if k not in parsed:
                parsed[k] = [v] if k in ALWAYS_A_LIST else v
            elif k == "Pins":
                parsed[k].update(v)
            elif isinstance(parsed[k], list):
                parsed[k] += [v]
            else:
                parsed[k] = [parsed[k], v]

        if "NamedGroup" in parsed:
            for group in parsed["NamedGroup"]:
                if group["id"] in parsed:
                    group["data"] = parsed[group["id"]]
                    del parsed[group["id"]]
        return parsed

    def addr_to_bin_offset(self, addr):
        """Convert an absolute flash address to an offset within ``file``."""
        return addr - FLASH_START_ADDR

    def data_type_to_str(self, data_type):
        """Return the label for an entry type code, or ``"Unknown"``."""
        return TYPES.get(data_type, "Unknown")

    def data_id_to_str(self, data_id):
        """Return the label for a data ID, or ``"Unknown"``."""
        return IDS.get(data_id, "Unknown")

    def is_valid_data_id(self, data_id):
        """Return True when ``data_id`` has a label in ``IDS``."""
        return data_id in IDS

    def data_id_to_typename(self, data_id):
        """Return the ID's label with spaces removed (used as a result key)."""
        return self.data_id_to_str(data_id).replace(" ", "")

    def _read_until(self, delimiter=b"\x00"):
        """Yield ``len(delimiter)``-sized chunks until one equals ``delimiter``.

        Chunks are read back to back from the current position, so the
        delimiter is only recognised at multiples of its own length from
        there. Raises ``EOFError`` if the file ends first.
        """
        while (chunk := self.file.read(len(delimiter))) != delimiter:
            if len(chunk) == 0:
                raise EOFError
            yield chunk

    def read_until(self, delimiter=b"\x00"):
        """Return the bytes before ``delimiter``, or ``None`` if it is not found."""
        try:
            return b"".join(self._read_until(delimiter))
        except EOFError:
            return None

    def lookup_string(self, address):
        """Return the NUL-terminated string stored at flash ``address``.

        Returns ``""`` when the address lies below the flash base or the
        string runs off the end of the image without a terminator.
        """
        offset = self.addr_to_bin_offset(address)
        if offset < 0:
            return ""
        self.file.seek(offset)
        raw = self.read_until(delimiter=b"\x00")
        if raw is None:
            return ""
        return raw.decode("utf-8", "replace")

    def _parse_type_id_and_int(self, tag):
        """Decode an (ID, integer) entry; unknown IDs keep the raw ID as key."""
        data_id, data_value = struct.unpack("<II", self.file.read(8))
        if self.is_valid_data_id(data_id):
            return self.data_id_to_typename(data_id), data_value
        return data_id, data_value

    def _parse_type_id_and_str(self, tag):
        """Decode an (ID, string pointer) entry; unknown IDs keep the raw ID."""
        data_id, str_addr = struct.unpack("<II", self.file.read(8))
        data_value = self.lookup_string(str_addr)
        if self.is_valid_data_id(data_id):
            return self.data_id_to_typename(data_id), data_value
        return data_id, data_value

    def _parse_block_device(self, tag):
        """Decode a block device entry into name, address, size and flags."""
        name_addr, start_addr, size, _more_info_addr, flags = struct.unpack(
            "<IIIIH", self.file.read(18)
        )
        name = self.lookup_string(name_addr)
        return "BlockDevice", {
            "name": name, "address": start_addr, "size": size, "flags": flags,
        }

    def _parse_named_group(self, tag):
        """Decode a named group entry (label, parent, flags, tag, id)."""
        parent_id, flags, group_tag, group_id, label_addr = struct.unpack(
            "<IHHII", self.file.read(16)
        )
        label = self.lookup_string(label_addr)
        return "NamedGroup", {
            "label": label, "parent": parent_id, "flags": flags,
            "tag": group_tag, "id": group_id,
        }

    def _parse_pins_with_func(self, tag):
        """Decode a packed "pins with function" word.

        Bits 0-2 select the encoding and bits 3-6 the function (named via
        ``GPIO_FUNCS``, ``None`` when unknown); the remaining bits hold 5-bit
        pin numbers. Encoding ``0b001`` reads five pin fields; ``0b010`` reads
        the first field as the last pin and the second as the first pin of an
        inclusive range. Any other encoding yields no pins.
        """
        pin_encoding = struct.unpack("<I", self.file.read(4))[0]
        encoding_type = pin_encoding & 0b111
        func = (pin_encoding & 0b1111000) >> 3
        func_name = GPIO_FUNCS.get(func)
        pin_encoding >>= 7
        pins = []
        if encoding_type == 0b001:
            for _ in range(5):
                pins.append(pin_encoding & 0b11111)
                pin_encoding >>= 5
        elif encoding_type == 0b010:
            pin_end = pin_encoding & 0b11111
            pin_start = (pin_encoding >> 5) & 0b11111
            pins = list(range(pin_start, pin_end + 1))
        return "Pins", {pin: {"function": func_name} for pin in pins}

    def _parse_pins_with_name(self, tag):
        """Decode a (pin mask, name pointer) entry for the lowest set pin.

        An empty mask names no pin and yields an empty mapping.
        """
        pin_mask, name_addr = struct.unpack("<II", self.file.read(8))
        if pin_mask == 0:
            return "Pins", {}
        name = self.lookup_string(name_addr)
        # Isolate the lowest set bit; its position is the pin number.
        pin_no = (pin_mask & -pin_mask).bit_length() - 1
        return "Pins", {pin_no: {"name": name}}

    def parse_entry(self, include_tags=("RP", "MP")):
        """Decode the entry at the current file position.

        Returns a ``(key, value)`` pair, or ``None`` when the entry's tag is
        not in ``include_tags``, its type has no parser, or the entry is
        truncated.
        """
        header = self.file.read(4)
        if len(header) != 4:
            return None
        data_type, tag = struct.unpack("<H2s", header)
        if tag.decode("utf-8", "replace") not in include_tags:
            return None
        parser = self.entry_parsers.get(data_type)
        if parser is None:
            if self.debug:
                sys.stderr.write(
                    f"ERROR: No parser for: {self.data_type_to_str(data_type)}\n"
                )
            return None
        try:
            return parser(tag)
        except struct.error:
            # The entry body runs past the end of the image.
            if self.debug:
                sys.stderr.write(
                    f"ERROR: Truncated entry: {self.data_type_to_str(data_type)}\n"
                )
            return None
@@ -0,0 +1,115 @@
1
+ """Carve Python source out of raw flash regions.
2
+
3
+ Filesystem-agnostic recovery: most rescue targets are small UTF-8 text files
4
+ (MicroPython scripts). When a file is deleted or its directory entry is gone,
5
+ the *content* usually still sits in flash until that block is erased and
6
+ rewritten. We scan for printable text runs and score them for "Python-ness".
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import hashlib
11
+ import re
12
+ from dataclasses import dataclass
13
+
14
+ # Bytes we treat as "text": printable ASCII + tab/newline/carriage-return.
15
+ _TEXT = bytes(range(0x20, 0x7F)) + b"\t\n\r"
16
+ _TEXT_SET = set(_TEXT)
17
+
18
+ # Strong Python/MicroPython signals.
19
+ _KEYWORDS = re.compile(
20
+ rb"\b(?:import|from|def|class|return|self|print|lambda|async|await|"
21
+ rb"machine|micropython|const|while|for|if|elif|else|try|except|with|"
22
+ rb"yield|raise|global|nonlocal)\b"
23
+ )
24
+ _ASSIGN = re.compile(rb"^\s*[A-Za-z_][A-Za-z0-9_]*\s*=", re.MULTILINE)
25
+ _DEF_OR_IMPORT = re.compile(rb"^\s*(?:def |class |import |from )", re.MULTILINE)
26
+
27
+
28
+ @dataclass
29
+ class Candidate:
30
+ offset: int # offset within the scanned region
31
+ data: bytes
32
+ score: float
33
+ sha1: str
34
+
35
+ @property
36
+ def text(self) -> str:
37
+ return self.data.decode("utf-8", "replace")
38
+
39
+ @property
40
+ def line_count(self) -> int:
41
+ return self.data.count(b"\n") + 1
42
+
43
+ def suggested_name(self) -> str:
44
+ """Guess a filename from a shebang, module docstring, or first def/class."""
45
+ head = self.data[:512]
46
+ m = re.search(rb"^#\s*([\w.\-/]+\.py)\b", head, re.MULTILINE)
47
+ if m:
48
+ return m.group(1).decode("ascii", "replace").replace("/", "_")
49
+ m = re.search(rb"^\s*class\s+([A-Za-z_]\w*)", head, re.MULTILINE)
50
+ if m:
51
+ return m.group(1).decode("ascii", "replace") + ".py"
52
+ m = re.search(rb"^\s*def\s+([A-Za-z_]\w*)", head, re.MULTILINE)
53
+ if m:
54
+ return m.group(1).decode("ascii", "replace") + ".py"
55
+ return "unknown.py"
56
+
57
+
58
+ def _text_runs(data: bytes, min_len: int):
59
+ """Yield (offset, bytes) for maximal runs of text bytes >= min_len."""
60
+ start = None
61
+ for i, b in enumerate(data):
62
+ if b in _TEXT_SET:
63
+ if start is None:
64
+ start = i
65
+ else:
66
+ if start is not None and i - start >= min_len:
67
+ yield start, data[start:i]
68
+ start = None
69
+ if start is not None and len(data) - start >= min_len:
70
+ yield start, data[start:]
71
+
72
+
73
def score(run: bytes) -> float:
    """Heuristic score for how much a text run looks like Python source.

    Empty input scores 0.0. Otherwise the score is the sum of:

    * 2.0 per keyword match, 4.0 per line starting with def/class/import/from
      and 1.0 per assignment-looking line;
    * a 2.0 bonus for three or more lines and another 2.0 if any line starts
      with a space or tab;
    * a 3.0 penalty for a run longer than 200 bytes with no newline at all.

    The result is not normalised by length and can be negative.
    """
    if not run:
        return 0.0

    keyword_hits = len(_KEYWORDS.findall(run))
    structure_hits = len(_DEF_OR_IMPORT.findall(run))
    assignment_hits = len(_ASSIGN.findall(run))
    total = keyword_hits * 2.0 + structure_hits * 4.0 + assignment_hits * 1.0

    # Reward multi-line, indented structure.
    newline_count = run.count(b"\n")
    if newline_count >= 2:
        total += 2.0
    if any(line[:1] in (b" ", b"\t") for line in run.split(b"\n")):
        total += 2.0

    # Penalise runs that look like a single long blob (no newlines).
    if newline_count == 0 and len(run) > 200:
        total -= 3.0
    return total
93
+
94
+
95
def carve(data: bytes, min_len: int = 40, min_score: float = 6.0,
          known_hashes: set[str] | None = None) -> list[Candidate]:
    """Return scored Python-source candidates found in ``data``.

    Text runs of at least ``min_len`` bytes are scored with :func:`score`;
    runs scoring below ``min_score`` are dropped. Each surviving run is
    trimmed of surrounding NULs/whitespace and identified by the SHA-1 hex
    digest of the trimmed bytes. Content whose digest is in ``known_hashes``,
    or that was already seen earlier in ``data``, is skipped.

    Each candidate's ``offset`` is the position within ``data`` of the first
    byte of its (trimmed) ``data``. Candidates are returned highest score
    first; equal scores keep their order of appearance.
    """
    known_hashes = known_hashes or set()
    out = []
    seen: set[str] = set()
    for offset, run in _text_runs(data, min_len):
        sc = score(run)
        if sc < min_score:
            continue
        # Trim surrounding NULs and whitespace.
        trimmed = run.strip(b"\x00").strip()
        if not trimmed:
            continue
        sha1 = hashlib.sha1(trimmed).hexdigest()
        if sha1 in known_hashes or sha1 in seen:
            continue
        seen.add(sha1)
        # Account for the bytes trimmed from the front so that the offset
        # points at the candidate's data, not at the start of the raw run.
        leading = len(run) - len(run.lstrip(b"\x00").lstrip())
        out.append(
            Candidate(offset=offset + leading, data=trimmed, score=sc, sha1=sha1)
        )
    out.sort(key=lambda c: c.score, reverse=True)
    return out