linear-mcp-fast 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ccl_chromium_reader/__init__.py +2 -0
- ccl_chromium_reader/ccl_chromium_cache.py +1335 -0
- ccl_chromium_reader/ccl_chromium_filesystem.py +302 -0
- ccl_chromium_reader/ccl_chromium_history.py +357 -0
- ccl_chromium_reader/ccl_chromium_indexeddb.py +1060 -0
- ccl_chromium_reader/ccl_chromium_localstorage.py +454 -0
- ccl_chromium_reader/ccl_chromium_notifications.py +268 -0
- ccl_chromium_reader/ccl_chromium_profile_folder.py +568 -0
- ccl_chromium_reader/ccl_chromium_sessionstorage.py +368 -0
- ccl_chromium_reader/ccl_chromium_snss2.py +332 -0
- ccl_chromium_reader/ccl_shared_proto_db_downloads.py +189 -0
- ccl_chromium_reader/common.py +19 -0
- ccl_chromium_reader/download_common.py +78 -0
- ccl_chromium_reader/profile_folder_protocols.py +276 -0
- ccl_chromium_reader/serialization_formats/__init__.py +0 -0
- ccl_chromium_reader/serialization_formats/ccl_blink_value_deserializer.py +401 -0
- ccl_chromium_reader/serialization_formats/ccl_easy_chromium_pickle.py +133 -0
- ccl_chromium_reader/serialization_formats/ccl_protobuff.py +276 -0
- ccl_chromium_reader/serialization_formats/ccl_v8_value_deserializer.py +627 -0
- ccl_chromium_reader/storage_formats/__init__.py +0 -0
- ccl_chromium_reader/storage_formats/ccl_leveldb.py +582 -0
- ccl_simplesnappy/__init__.py +1 -0
- ccl_simplesnappy/ccl_simplesnappy.py +306 -0
- linear_mcp_fast/__init__.py +8 -0
- linear_mcp_fast/__main__.py +6 -0
- linear_mcp_fast/reader.py +433 -0
- linear_mcp_fast/server.py +367 -0
- linear_mcp_fast/store_detector.py +117 -0
- linear_mcp_fast-0.1.0.dist-info/METADATA +160 -0
- linear_mcp_fast-0.1.0.dist-info/RECORD +39 -0
- linear_mcp_fast-0.1.0.dist-info/WHEEL +5 -0
- linear_mcp_fast-0.1.0.dist-info/entry_points.txt +2 -0
- linear_mcp_fast-0.1.0.dist-info/top_level.txt +4 -0
- tools_and_utilities/Chromium_dump_local_storage.py +111 -0
- tools_and_utilities/Chromium_dump_session_storage.py +92 -0
- tools_and_utilities/benchmark.py +35 -0
- tools_and_utilities/ccl_chrome_audit.py +651 -0
- tools_and_utilities/dump_indexeddb_details.py +59 -0
- tools_and_utilities/dump_leveldb.py +53 -0
@@ -0,0 +1,1335 @@
"""
Copyright 2022-2025, CCL Forensics

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""


import abc
import dataclasses
import io
import os
import re
import sys
import types
import typing
import pathlib
import datetime
import struct
import enum
import zlib

__version__ = "0.22"
__description__ = "Library for reading Chrome/Chromium Cache (both blockfile and simple format)"
__contact__ = "Alex Caithness"


_CHROME_EPOCH = datetime.datetime(1601, 1, 1)
EIGHT_BYTE_PICKLE_ALIGNMENT = True  # switch this if you get errors about the EOF magic when doing a Simple Cache
SIMPLE_EOF_SIZE = 24 if EIGHT_BYTE_PICKLE_ALIGNMENT else 20

def decode_chrome_time(us: int) -> datetime.datetime:
    return _CHROME_EPOCH + datetime.timedelta(microseconds=us)

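For orientation when reading the rest of the file: Chromium stores most of its timestamps as a count of microseconds since 1601-01-01 00:00 UTC (the Windows FILETIME epoch), so `decode_chrome_time` is a straight offset from `_CHROME_EPOCH`. A minimal check of the conversion, using an arbitrary sample value:

```python
import datetime

# 13,320,000,000,000,000 microseconds after 1601-01-01 00:00 UTC
# is 2023-02-04 16:00:00 UTC (1601->1970 offset is 11,644,473,600 seconds)
assert decode_chrome_time(13_320_000_000_000_000) == datetime.datetime(2023, 2, 4, 16, 0)
```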
class BinaryReader:
    """
    Utility class which wraps a BinaryIO and provides readers for the data types needed by the cache formats
    """
    def __init__(self, stream: typing.BinaryIO):
        self._stream = stream
        self._closed = False

    @classmethod
    def from_bytes(cls, buffer: bytes):
        return cls(io.BytesIO(buffer))

    def close(self):
        self._stream.close()
        self._closed = True

    def __enter__(self) -> "BinaryReader":
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def tell(self) -> int:
        return self._stream.tell()

    def seek(self, offset: int, whence: int) -> int:
        return self._stream.seek(offset, whence)

    def read_raw(self, count: int) -> bytes:
        start_offset = self._stream.tell()
        result = self._stream.read(count)
        if len(result) != count:
            raise ValueError(
                f"Could not read all of the data starting at {start_offset}. Expected: {count}; got {len(result)}")
        return result

    def read_utf8(self, count: int) -> str:
        return self.read_raw(count).decode("utf-8")

    def read_int16(self) -> int:
        raw = self.read_raw(2)
        return struct.unpack("<h", raw)[0]

    def read_int32(self) -> int:
        raw = self.read_raw(4)
        return struct.unpack("<i", raw)[0]

    def read_int64(self) -> int:
        raw = self.read_raw(8)
        return struct.unpack("<q", raw)[0]

    def read_uint16(self) -> int:
        raw = self.read_raw(2)
        return struct.unpack("<H", raw)[0]

    def read_uint32(self) -> int:
        raw = self.read_raw(4)
        return struct.unpack("<I", raw)[0]

    def read_uint64(self) -> int:
        raw = self.read_raw(8)
        return struct.unpack("<Q", raw)[0]

    def read_addr(self) -> "Addr":
        return Addr.from_int(self.read_uint32())

    def read_datetime(self) -> datetime.datetime:
        return decode_chrome_time(self.read_uint64())

    @property
    def is_closed(self) -> bool:
        return self._closed

    @property
    def is_eof(self) -> bool:
        test = self._stream.read(1)
        if len(test) == 0:
            return True
        self._stream.seek(-1, os.SEEK_CUR)
        return False

class FileType(enum.IntEnum):
    # net/disk_cache/blockfile/disk_format.h
    EXTERNAL = 0
    RANKINGS = 1
    BLOCK_256 = 2
    BLOCK_1K = 3
    BLOCK_4K = 4
    BLOCK_FILES = 5
    BLOCK_ENTRIES = 6
    BLOCK_EVICTED = 7


_BLOCKSIZE_FOR_FILETYPE = {
    FileType.RANKINGS: 36,
    FileType.BLOCK_256: 256,
    FileType.BLOCK_1K: 1024,
    FileType.BLOCK_4K: 4096,
    FileType.BLOCK_FILES: 8,
    FileType.BLOCK_ENTRIES: 104,
    FileType.BLOCK_EVICTED: 48,
    FileType.EXTERNAL: 0
}


_BLOCK_FILE_FILETYPE = {FileType.BLOCK_256, FileType.BLOCK_1K, FileType.BLOCK_4K}

class CacheKey:
    """
    Class representing a parsed Chromium Cache Key.
    """
    # net/http/http_cache.cc GenerateCacheKey
    CRED_UPLOAD_KEY_PREFIX_PATTERN = re.compile(r"^\d+/\d+/")  # 'current' (since Sept '21)
    UPLOAD_ONLY_KEY_PREFIX_PATTERN = re.compile(r"^\d+/")  # prior to Sept '21
    # if neither of the above we assume we only have a URL

    def __init__(self, raw_key: str):
        self._raw_key = raw_key

        # We have to account for a few different versions of keys, we can do this based on a prefix
        if CacheKey.UPLOAD_ONLY_KEY_PREFIX_PATTERN.match(self._raw_key):
            if CacheKey.CRED_UPLOAD_KEY_PREFIX_PATTERN.match(self._raw_key):
                split_key = self._raw_key.split("/", 2)
                self._credential_key = split_key[0]
                self._upload_data_identifier = int(split_key[1])
            else:
                split_key = self._raw_key.split("/", 1)
                self._credential_key = ""
                self._upload_data_identifier = int(split_key[0])

            if split_key[-1].startswith("_dk_"):
                # consume two kDoubleKeySeparator (a space), the url is after that
                (self._isolation_key_top_frame_site,
                 self._isolation_key_variable_part,
                 self._url) = split_key[-1][4:].split(" ", 3)
                if self._isolation_key_top_frame_site.startswith("s_"):
                    self._isolation_key_top_frame_site = self._isolation_key_top_frame_site[2:]
            else:
                self._url = split_key[-1]
                self._isolation_key_top_frame_site = None
                self._isolation_key_variable_part = None
        else:
            # if the prefixes don't hit, this should just be a URL
            self._url = self._raw_key
            self._isolation_key_top_frame_site = None
            self._isolation_key_variable_part = None

    @property
    def raw_key(self) -> str:
        return self._raw_key

    @property
    def url(self) -> str:
        return self._url

    @property
    def credential_key(self) -> str:
        return self._credential_key

    @property
    def upload_data_identifier(self) -> int:
        return self._upload_data_identifier

    @property
    def isolation_key_top_frame_site(self) -> str:
        return self._isolation_key_top_frame_site

    @property
    def isolation_key_variable_part(self):
        return self._isolation_key_variable_part

    def __str__(self):
        return self._raw_key

    def __repr__(self):
        return (f"<CacheKey url: {self._url}; credential_key: {self._credential_key}; "
                f"upload_data_identifier: {self._upload_data_identifier}; "
                f"isolation_key_top_frame_site: {self._isolation_key_top_frame_site}; "
                f"isolation_key_variable_part: {self._isolation_key_variable_part}>")

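A worked illustration of the key parsing above, using a made-up key string in the post-September-2021 "credential/upload" shape with a `_dk_` isolation prefix (the URLs are placeholders, not taken from a real cache):

```python
key = CacheKey("1/0/_dk_s_https://example.org https://example.org https://example.org/app.js")
assert key.credential_key == "1"
assert key.upload_data_identifier == 0
# the "s_" marker on the top frame site is stripped
assert key.isolation_key_top_frame_site == "https://example.org"
assert key.isolation_key_variable_part == "https://example.org"
assert key.url == "https://example.org/app.js"
```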
class Addr:
    # net/disk_cache/blockfile/addr.h
    def __init__(
            self, is_initialized: bool, file_type: FileType, file_number: typing.Optional[int],
            contiguous_blocks: typing.Optional[int], file_selector: typing.Optional[int], block_number: int,
            reserved_bits: typing.Optional[int]):
        self._is_initialized = is_initialized
        self._file_type = file_type
        self._file_number = file_number
        self._contiguous_blocks = contiguous_blocks
        self._file_selector = file_selector
        self._block_number = block_number
        self._reserved_bits = reserved_bits

    def __repr__(self):
        return (f"<Addr: is_initialized: {self._is_initialized}; file_type: {self._file_type.name}; "
                f"file_number: {self._file_number}; contiguous_blocks: {self._contiguous_blocks}; "
                f"file_selector: {self._file_selector}; block_number: {self._block_number}>")

    @classmethod
    def from_int(cls, i: int):
        is_initialized = (i & 0x80000000) > 0
        file_type = FileType((i & 0x70000000) >> 28)

        if file_type == FileType.EXTERNAL:
            file_number = i & 0x0fffffff
            contiguous_blocks = None
            file_selector = None
            block_number = None
            reserved_bits = None
        else:
            file_number = None
            contiguous_blocks = 1 + ((i & 0x03000000) >> 24)
            file_selector = (i & 0x00ff0000) >> 16
            block_number = i & 0x0000ffff
            reserved_bits = i & 0x0c000000

        return Addr(
            is_initialized,
            file_type,
            file_number,
            contiguous_blocks,
            file_selector,
            block_number,
            reserved_bits)

    def sanity_check(self) -> bool:
        # implementation from addr.cc - will hopefully identify invalid data and skip it rather than raising exceptions
        # we omit the initialized check from that version, as that's to identify a totally empty entry (which is sane
        # but of no use to us in any context we use it).
        if self._file_type > FileType.BLOCK_4K:
            return False
        if self._file_type != FileType.EXTERNAL and self._reserved_bits != 0:
            return False

        return True

    def sanity_check_for_entry(self) -> bool:
        return self.sanity_check() and self._file_type == FileType.BLOCK_256

    @property
    def is_initialized(self) -> bool:
        return self._is_initialized

    @property
    def file_type(self) -> FileType:
        return self._file_type

    @property
    def contiguous_blocks(self) -> int:
        return self._contiguous_blocks

    @property
    def file_selector(self) -> int:
        return self._file_selector

    @property
    def block_number(self) -> int:
        return self._block_number

    @property
    def external_file_number(self) -> int:
        return self._file_number

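An `Addr` packs everything into one little-endian uint32: bit 31 is the initialized flag, bits 28-30 the file type, and (for block files) bits 24-25 the contiguous block count minus one, bits 16-23 the `data_N` file selector and bits 0-15 the block number. A worked decode with an arbitrary example value:

```python
# 0xA0010003: initialized, BLOCK_256, file selector 1, block number 3
addr = Addr.from_int(0xA0010003)
assert addr.is_initialized
assert addr.file_type == FileType.BLOCK_256
assert addr.file_selector == 1
assert addr.block_number == 3
assert addr.contiguous_blocks == 1
```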
@dataclasses.dataclass(frozen=True)
class LruData:
    # net/disk_cache/blockfile/disk_format.h
    filled: int
    sizes: typing.Collection[int]
    heads: typing.Collection[Addr]
    tails: typing.Collection[Addr]
    transactions: Addr
    operation: int
    operation_list: int

    @classmethod
    def from_bytes(cls, buffer: bytes):
        with BinaryReader.from_bytes(buffer) as reader:
            return cls.from_reader(reader)

    @classmethod
    def from_reader(cls, reader: BinaryReader):
        _ = [reader.read_int32() for x in range(2)]
        filled = reader.read_int32()
        sizes = tuple(reader.read_int32() for _ in range(5))
        heads = tuple(reader.read_addr() for _ in range(5))
        tails = tuple(reader.read_addr() for _ in range(5))
        transaction = reader.read_addr()
        operation = reader.read_int32()
        operation_list = reader.read_int32()
        _ = [reader.read_int32() for x in range(7)]

        return cls(filled, sizes, heads, tails, transaction, operation, operation_list)


@dataclasses.dataclass(frozen=True)
class BlockFileIndexHeader:
    # net/disk_cache/blockfile/disk_format.h
    version: int
    num_entries: int
    num_bytes_v2: int
    last_file: int
    this_id: int
    stats_addr: Addr
    table_length: int
    crash: int
    experiment: int
    create_time: datetime.datetime
    num_bytes_v3: int
    lru: LruData

    _MAGIC: typing.ClassVar[int] = 0xC103CAC3

    @classmethod
    def from_bytes(cls, buffer: bytes):
        with BinaryReader.from_bytes(buffer) as reader:
            return cls.from_reader(reader)

    @classmethod
    def from_reader(cls, reader: BinaryReader):
        magic = reader.read_uint32()
        if magic != BlockFileIndexHeader._MAGIC:
            raise ValueError("invalid magic")
        version = reader.read_uint32()
        num_entries = reader.read_int32()
        old_v2_num_bytes = reader.read_uint32()
        last_file = reader.read_int32()
        this_id = reader.read_int32()
        stats_addr = reader.read_addr()
        table_length = reader.read_int32() or 0x10000
        crash = reader.read_int32()
        experiment = reader.read_int32()
        create_time = reader.read_datetime()
        num_bytes = reader.read_int64()
        _ = [reader.read_int32() for x in range(50)]
        lru = LruData.from_reader(reader)

        return cls(
            version, num_entries, old_v2_num_bytes, last_file, this_id, stats_addr,
            table_length, crash, experiment, create_time, num_bytes, lru)

class BlockFileIndexFile:
    # net/disk_cache/blockfile/disk_format.h
    def __init__(self, file_path: typing.Union[os.PathLike, str]):
        self._input_path = pathlib.Path(file_path)
        with BinaryReader(self._input_path.open("rb")) as reader:
            self._header = BlockFileIndexHeader.from_reader(reader)
            self._entries = tuple(reader.read_addr() for _ in range(self._header.table_length))
        self._entries_initialized = tuple(x for x in self._entries if x.is_initialized)

    @property
    def input_path(self):
        return self._input_path

    @property
    def header(self) -> BlockFileIndexHeader:
        return self._header

    @property
    def index(self) -> typing.Collection[Addr]:
        return self._entries

    @property
    def index_initialized_only(self):
        return self._entries_initialized

class EntryState(enum.IntEnum):
    NORMAL = 0
    EVICTED = 1
    DOOMED = 2


class EntryFlags(enum.IntFlag):
    PARENT_ENTRY = 1 << 0
    CHILD_ENTRY = 1 << 1

@dataclasses.dataclass(frozen=True)
class EntryStore:
    # net/disk_cache/blockfile/disk_format.h
    entry_hash: int
    next_entry: Addr
    rankings_node: Addr
    reuse_count: int
    refetch_count: int
    state: EntryState
    creation_time: datetime.datetime
    key_length: int
    long_key_addr: Addr
    data_sizes: tuple[int, int, int, int]
    data_addrs: tuple[Addr, Addr, Addr, Addr]
    flags: EntryFlags
    self_hash: int
    key: typing.Optional[str]

    @property
    def key_is_external(self) -> bool:
        return self.long_key_addr.is_initialized

    @classmethod
    def from_bytes(cls, buffer: bytes):
        with BinaryReader.from_bytes(buffer) as reader:
            return cls.from_reader(reader)

    @classmethod
    def from_reader(cls, reader: BinaryReader):
        start = reader.tell()

        entry_hash = reader.read_uint32()
        next_entry = reader.read_addr()
        rankings_node = reader.read_addr()
        reuse_count = reader.read_int32()
        refetch_count = reader.read_int32()
        state = EntryState(reader.read_int32())
        creation_time = reader.read_datetime()
        key_length = reader.read_int32()
        long_key_addr = reader.read_addr()
        data_sizes = (reader.read_int32(), reader.read_int32(), reader.read_int32(), reader.read_int32())
        data_addrs = (reader.read_addr(), reader.read_addr(), reader.read_addr(), reader.read_addr())
        flags = EntryFlags(reader.read_uint32())
        _ = [reader.read_int32() for x in range(4)]
        self_hash = reader.read_uint32()

        meta_length = reader.tell() - start

        key = None
        key_is_external = long_key_addr.is_initialized
        if not key_is_external:
            key = reader.read_utf8(key_length)

        return cls(
            entry_hash, next_entry, rankings_node, reuse_count, refetch_count, state, creation_time, key_length,
            long_key_addr, data_sizes, data_addrs, flags, self_hash, key)

@dataclasses.dataclass(frozen=True)
class BlockFileHeader:
    # net/disk_cache/blockfile/disk_format_base.h
    version: int
    this_file: int
    next_file: int
    entry_size: int
    num_entries: int
    max_entries: int
    empty_type_counts: tuple[int, int, int, int]
    hints: tuple[int, int, int, int]
    updating: int
    user: tuple[int, int, int, int, int]
    allocation_map: bytes

    _MAGIC: typing.ClassVar[int] = 0xC104CAC3
    _BLOCK_HEADER_SIZE: typing.ClassVar[int] = 8192
    _MAX_BLOCKS: typing.ClassVar[int] = (_BLOCK_HEADER_SIZE - 80) * 8

    def __post_init__(self):
        if len(self.allocation_map) != self._MAX_BLOCKS // 8:
            raise ValueError("invalid allocation map length")

    @classmethod
    def from_bytes(cls, buffer: bytes):
        with BinaryReader.from_bytes(buffer) as reader:
            return cls.from_reader(reader)

    @classmethod
    def from_reader(cls, reader: BinaryReader):
        magic = reader.read_uint32()
        if magic != cls._MAGIC:
            raise ValueError("invalid magic")
        version = reader.read_uint32()
        this_file = reader.read_int16()
        next_file = reader.read_int16()
        entry_size = reader.read_int32()
        num_entries = reader.read_int32()
        max_entries = reader.read_int32()
        empty = (reader.read_int32(), reader.read_int32(), reader.read_int32(), reader.read_int32())
        hints = (reader.read_int32(), reader.read_int32(), reader.read_int32(), reader.read_int32())
        updating = reader.read_int32()
        user = (reader.read_int32(), reader.read_int32(), reader.read_int32(), reader.read_int32(), reader.read_int32())

        allocation_map = reader.read_raw(cls._MAX_BLOCKS // 8)

        return cls(
            version, this_file, next_file, entry_size, num_entries, max_entries,
            empty, hints, updating, user, allocation_map)

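Each blockfile (`data_0` through `data_3`) is an 8192-byte header followed by fixed-size blocks, so a block address maps to a byte offset with simple arithmetic — the same computation `_get_location` performs further down. A small standalone sketch (values illustrative):

```python
BLOCK_HEADER_SIZE = 8192  # BlockFileHeader._BLOCK_HEADER_SIZE

def block_offset(entry_size: int, block_number: int) -> int:
    # block data starts after the 8 KiB header, block_number blocks in
    return BLOCK_HEADER_SIZE + entry_size * block_number

# e.g. block 3 of a 256-byte-block file starts at byte 8960
assert block_offset(256, 3) == 8960
```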
class CachedMetadataFlags(enum.IntFlag):
    # net/http/http_response_info.cc

    RESPONSE_INFO_VERSION = 3
    RESPONSE_INFO_VERSION_MASK = 0xFF

    # This bit is set if the response info has a cert at the end.
    RESPONSE_INFO_HAS_CERT = 1 << 8
    RESPONSE_INFO_HAS_SECURITY_BITS = 1 << 9
    RESPONSE_INFO_HAS_CERT_STATUS = 1 << 10
    RESPONSE_INFO_HAS_VARY_DATA = 1 << 11
    RESPONSE_INFO_TRUNCATED = 1 << 12
    RESPONSE_INFO_WAS_SPDY = 1 << 13
    RESPONSE_INFO_WAS_ALPN = 1 << 14
    RESPONSE_INFO_WAS_PROXY = 1 << 15
    RESPONSE_INFO_HAS_SSL_CONNECTION_STATUS = 1 << 16
    RESPONSE_INFO_HAS_ALPN_NEGOTIATED_PROTOCOL = 1 << 17
    RESPONSE_INFO_HAS_CONNECTION_INFO = 1 << 18
    RESPONSE_INFO_USE_HTTP_AUTHENTICATION = 1 << 19
    RESPONSE_INFO_HAS_SIGNED_CERTIFICATE_TIMESTAMPS = 1 << 20
    RESPONSE_INFO_UNUSED_SINCE_PREFETCH = 1 << 21
    RESPONSE_INFO_HAS_KEY_EXCHANGE_GROUP = 1 << 22
    RESPONSE_INFO_PKP_BYPASSED = 1 << 23
    RESPONSE_INFO_HAS_STALENESS = 1 << 24
    RESPONSE_INFO_HAS_PEER_SIGNATURE_ALGORITHM = 1 << 25
    RESPONSE_INFO_RESTRICTED_PREFETCH = 1 << 26
    RESPONSE_INFO_HAS_DNS_ALIASES = 1 << 27
    RESPONSE_INFO_SINGLE_KEYED_CACHE_ENTRY_UNUSABLE = 1 << 28
    RESPONSE_INFO_ENCRYPTED_CLIENT_HELLO = 1 << 29
    RESPONSE_INFO_BROWSER_RUN_ID = 1 << 30
    RESPONSE_INFO_HAS_EXTRA_FLAGS = 1 << 31  # indicates that we need to read the extra flags after this value


class CachedMetadataExtraFlags(enum.IntFlag):
    RESPONSE_EXTRA_INFO_DID_USE_SHARED_DICTIONARY = 1
    RESPONSE_EXTRA_INFO_HAS_PROXY_CHAIN = 1 << 1
    RESPONSE_EXTRA_INFO_HAS_ORIGINAL_RESPONSE_TIME = 1 << 2

class CachedMetadata:
    # net/http/http_response_info.cc / net/http/http_response_info.h
    def __init__(
            self, header_declarations: set[str], header_attributes: dict[str, list[str]],
            request_time: datetime.datetime, response_time: datetime.datetime, certs: list[bytes],
            host_address: str, host_port: int, other_attributes: dict[str, typing.Any]):
        self._declarations = header_declarations.copy()
        self._attributes = types.MappingProxyType(header_attributes.copy())
        self._request_time = request_time
        self._response_time = response_time
        self._certs = certs.copy()
        self._other_attributes = types.MappingProxyType(other_attributes)
        self._host_address = host_address
        self._host_port = host_port

    @property
    def certs(self) -> typing.Iterable[bytes]:
        yield from self._certs

    @property
    def http_header_declarations(self) -> typing.Iterable[str]:
        yield from self._declarations

    @property
    def request_time(self) -> datetime.datetime:
        return self._request_time

    @property
    def response_time(self) -> datetime.datetime:
        return self._response_time

    @property
    def http_header_attributes(self) -> typing.Iterable[tuple[str, str]]:
        for key, vals in self._attributes.items():
            for val in vals:
                yield key, val

    def has_declaration(self, declaration: str) -> bool:
        return declaration in self._declarations

    def get_attribute(self, attribute: str) -> list[str]:
        return self._attributes.get(attribute.lower()) or []

    @property
    def other_cache_attributes(self):
        return self._other_attributes

    @classmethod
    def from_buffer(cls, buffer: bytes):
        # net/http/http_response_info.cc / net/http/http_response_info.h
        # and for the proxy chain:
        # net/base/proxy_chain.cc / net/base/proxy_server.h / net/base/proxy_server.cc
        # This is a pickle, but it's a very simple one so just align manually rather than use a pickle library
        # TODO: this is increasingly not "very simple", so we should move to using ccl_easy_chromium_pickle to tidy
        #  things up.
        reader = BinaryReader.from_bytes(buffer)
        total_length = reader.read_uint32()
        if total_length != len(buffer) - 4:
            raise ValueError("Metadata buffer is not the declared size")

        def align():
            alignment = reader.tell() % 4
            if alignment != 0:
                reader.read_raw(4 - alignment)

        flags = CachedMetadataFlags(reader.read_uint32())
        if flags & CachedMetadataFlags.RESPONSE_INFO_HAS_EXTRA_FLAGS:
            extra_flags = CachedMetadataExtraFlags(reader.read_uint32())
        else:
            extra_flags = CachedMetadataExtraFlags(0)

        request_time = reader.read_datetime()
        response_time = reader.read_datetime()

        if extra_flags & CachedMetadataExtraFlags.RESPONSE_EXTRA_INFO_HAS_ORIGINAL_RESPONSE_TIME:
            # not currently reported as the meaning is not clear, but needs to be read if present in the pickle
            original_response_time = reader.read_datetime()

        http_header_length = reader.read_uint32()
        http_header_raw = reader.read_raw(http_header_length)

        header_attributes: dict[str, list[str]] = {}
        header_declarations = set()

        for header_entry in http_header_raw.split(b"\00"):
            if not header_entry:
                continue  # skip empty entries
            parsed_entry = header_entry.decode("latin-1").split(":", 1)
            if len(parsed_entry) == 1:
                header_declarations.add(parsed_entry[0])
            elif len(parsed_entry) == 2:
                header_attributes.setdefault(parsed_entry[0].lower(), [])
                header_attributes[parsed_entry[0].lower()].append(parsed_entry[1].strip())

        other_attributes = {}

        certs = []
        if flags & CachedMetadataFlags.RESPONSE_INFO_HAS_CERT:
            # net/cert/x509_certificate.cc CreateFromPickle
            align()
            cert_count = reader.read_uint32()
            for _ in range(cert_count):
                align()
                cert_length = reader.read_uint32()
                certs.append(reader.read_raw(cert_length))

        if flags & CachedMetadataFlags.RESPONSE_INFO_HAS_CERT_STATUS:
            align()
            other_attributes["cert_status"] = reader.read_uint32()

        if flags & CachedMetadataFlags.RESPONSE_INFO_HAS_SECURITY_BITS:
            align()
            other_attributes["security_bits"] = reader.read_int32()

        if flags & CachedMetadataFlags.RESPONSE_INFO_HAS_SSL_CONNECTION_STATUS:
            align()
            other_attributes["ssl_connection_status"] = reader.read_int32()

        if flags & CachedMetadataFlags.RESPONSE_INFO_HAS_SIGNED_CERTIFICATE_TIMESTAMPS:
            align()
            # these are unused, only here for backwards compatibility
            ts_count = reader.read_int32()
            for _ in range(ts_count):
                # net/cert/signed_certificate_timestamp.cc
                ts_version = reader.read_int32()
                str_len = reader.read_int32()
                ts_log_id = reader.read_raw(str_len)
                align()
                ts_timestamp = reader.read_datetime()
                str_len = reader.read_int32()
                ts_extensions = reader.read_raw(str_len)
                align()
                ts_hash_algo = reader.read_int32()
                ts_sig_algo = reader.read_int32()
                str_len = reader.read_int32()
                ts_sig_data = reader.read_raw(str_len)
                align()
                ts_origin = reader.read_int32()
                str_len = reader.read_int32()
                ts_log_desc = reader.read_raw(str_len)
                align()
                ts_status = reader.read_uint16()
                align()

        if flags & CachedMetadataFlags.RESPONSE_INFO_HAS_VARY_DATA:
            # net/http/http_vary_data.cc InitFromPickle
            align()
            other_attributes["vary_data"] = reader.read_raw(16)

        host, port = None, None
        try:
            align()
            host_length = reader.read_uint32()
            host = reader.read_raw(host_length).decode("latin-1")
            align()
            port = reader.read_uint16()
        except ValueError:
            # bail out at this point if we've hit eof
            return cls(
                header_declarations, header_attributes, request_time, response_time, certs, host, port,
                other_attributes)

        # todo: there are other fields, they don't look too relevant for us in many cases,
        #  but I can return here to review.

        return cls(
            header_declarations, header_attributes, request_time, response_time, certs, host, port, other_attributes)

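`from_buffer` walks the metadata pickle by hand: every field read is preceded by padding up to the next 4-byte boundary, which is all the local `align()` helper does. A standalone sketch of that alignment rule:

```python
def aligned(offset: int) -> int:
    # round an offset up to the next 4-byte boundary, as align() above does
    return offset + (-offset % 4)

assert aligned(4) == 4    # already aligned: no padding consumed
assert aligned(5) == 8    # three padding bytes skipped
assert aligned(11) == 12
```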
@dataclasses.dataclass(frozen=True)
class CacheFileLocation:
    file_name: str
    offset: int

    def __repr__(self):
        return f"<CacheFileLocation; file_name: '{self.file_name}'; offset: {self.offset}>"

    def __str__(self):
        return f"{self.file_name} @ {self.offset}"

class ChromiumCache(abc.ABC):
    """
    Abstract base class that both forms of concrete cache types inherit from
    """
    def get_metadata(self, key: typing.Union[str, CacheKey]) -> list[typing.Optional[CachedMetadata]]:
        """
        :param key: the cache key for the entry
        :return: a list of CachedMetadata objects for this key. Most often this list will contain only one entry but
            this library can return old versions of records in some cases. The order of metadata should be the same as
            the records returned by get_cachefile
        """
        raise NotImplementedError()

    def get_cachefile(self, key: typing.Union[str, CacheKey]) -> list[bytes]:
        """
        :param key: the cache key for the entry
        :return: a list of bytes objects for this key containing the cached resource. Most often this list will contain
            only one entry but this library can return old versions of records in some cases. The order of data should
            be the same as the records returned by get_metadata
        """
        raise NotImplementedError()

    def get_location_for_metadata(self, key: typing.Union[str, CacheKey]) -> list[CacheFileLocation]:
        """
        :param key: the cache key for the entry
        :return: a list of CacheFileLocation objects for this key's metadata. Most often this list will contain only
            one entry but this library can return old versions of records in some cases. The order of metadata should
            be the same as the records returned by get_metadata
        """
        raise NotImplementedError()

    def get_location_for_cachefile(self, key: typing.Union[str, CacheKey]) -> list[CacheFileLocation]:
        """
        :param key: the cache key for the entry
        :return: a list of CacheFileLocation objects for this key's data. Most often this list will contain only one
            entry but this library can return old versions of records in some cases. The order of metadata should be
            the same as the records returned by get_metadata
        """
        raise NotImplementedError()

    def __enter__(self) -> "ChromiumCache":
        raise NotImplementedError()

    def __exit__(self, exc_type, exc_val, exc_tb):
        raise NotImplementedError()

    def keys(self) -> typing.Iterable[str]:
        """
        :return: yields the cache keys for this cache instance
        """
        raise NotImplementedError()

    def cache_keys(self) -> typing.Iterable[CacheKey]:
        """
        :return: yields the cache keys (as CacheKey objects) for this cache instance
        """
        raise NotImplementedError()

class ChromiumBlockFileCache(ChromiumCache):
    def __init__(self, cache_dir: typing.Union[os.PathLike, str]):
        self._in_dir = pathlib.Path(cache_dir)
        self._index_file = BlockFileIndexFile(self._in_dir / "index")
        self._block_files: dict[int, tuple[BlockFileHeader, typing.BinaryIO]] = {}
        self._keys = self._build_keys()

    def _get_block_file(self, block_file_number: int) -> tuple[BlockFileHeader, typing.BinaryIO]:
        if cached := self._block_files.get(block_file_number):
            return cached

        block_file_stream = (self._in_dir / f"data_{block_file_number}").open("rb")
        header = BlockFileHeader.from_bytes(block_file_stream.read(BlockFileHeader._BLOCK_HEADER_SIZE))
        self._block_files[block_file_number] = (header, block_file_stream)
        return header, block_file_stream

    def _build_keys(self):
        result = {}
        for addr in self._index_file.index:
            while addr.is_initialized:
                if not addr.sanity_check_for_entry():
                    print(f"Warning: Addr skipped as it is not sane for an entry: {addr}", file=sys.stderr)
                    break
                raw = self.get_data_for_addr(addr)
                try:
                    es = EntryStore.from_bytes(raw)
                except (ValueError, OverflowError):
                    print("Warning: EntryStore could not be read and is being skipped; bad data follows, if you "
                          "believe it to be a valid record, please contact the developer.", file=sys.stderr)
                    print(raw, file=sys.stderr)
                    break
                if es.key is not None:
                    key = es.key
                else:
                    key = self.get_data_for_addr(es.long_key_addr).decode("utf-8")[0:es.key_length]

                result[key] = es
                addr = es.next_entry

        return result

    def _get_location(self, key: str, stream_number: int):
        es = self._keys[key]
        addr = es.data_addrs[stream_number]
        if addr.file_type in _BLOCK_FILE_FILETYPE:
            file_name = f"data_{addr.file_selector}"
            block_header, stream = self._get_block_file(addr.file_selector)
            offset = BlockFileHeader._BLOCK_HEADER_SIZE + (block_header.entry_size * addr.block_number)
            return CacheFileLocation(file_name, offset)
        elif addr.file_type == FileType.EXTERNAL:
            file_name = f"f_{addr.external_file_number:06x}"
            return CacheFileLocation(file_name, 0)

        raise ValueError("unexpected file type")

    def get_location_for_metadata(self, key: typing.Union[str, CacheKey]) -> list[CacheFileLocation]:
        if isinstance(key, CacheKey):
            key = key.raw_key
        return [self._get_location(key, 0)]

    def get_location_for_cachefile(self, key: typing.Union[str, CacheKey]) -> list[CacheFileLocation]:
        if isinstance(key, CacheKey):
            key = key.raw_key
        return [self._get_location(key, 1)]

    def get_stream_for_addr(self, addr: Addr) -> typing.BinaryIO:
        if not addr.is_initialized:
            raise ValueError("Addr is not initialized")
        if addr.file_type in _BLOCK_FILE_FILETYPE:
            block_header, stream = self._get_block_file(addr.file_selector)
            stream.seek(BlockFileHeader._BLOCK_HEADER_SIZE + (block_header.entry_size * addr.block_number))
            return io.BytesIO(stream.read(block_header.entry_size * addr.contiguous_blocks))  # slow probably
        elif addr.file_type == FileType.EXTERNAL:
            return (self._in_dir / f"f_{addr.external_file_number:06x}").open("rb")

        raise ValueError("unexpected file type")

    def get_data_for_addr(self, addr: Addr) -> typing.Optional[bytes]:
        if not addr.is_initialized:
            raise ValueError("Addr is not initialized")
        if addr.file_type in _BLOCK_FILE_FILETYPE:
            block_header, stream = self._get_block_file(addr.file_selector)
            stream.seek(BlockFileHeader._BLOCK_HEADER_SIZE + (block_header.entry_size * addr.block_number))
            return stream.read(block_header.entry_size * addr.contiguous_blocks)
        elif addr.file_type == FileType.EXTERNAL:
            external_file_path = self._in_dir / f"f_{addr.external_file_number:06x}"
            if not external_file_path.exists():
                print(f"Warning: External cache file {external_file_path} is referenced in the data, but "
                      f"does not exist in the cache folder.", file=sys.stderr)
                return None
            with external_file_path.open("rb") as f:
                return f.read()

        raise ValueError("unexpected file type")

    def get_data_buffer(self, key: typing.Union[str, EntryStore, CacheKey], stream_number: int) -> typing.Optional[bytes]:
        if stream_number < 0 or stream_number > 2:
            raise ValueError("invalid stream number")
        if isinstance(key, EntryStore):
            es = key
        elif isinstance(key, CacheKey):
            es = self._keys[key.raw_key]
        else:
            es = self._keys[key]

        addr = es.data_addrs[stream_number]
        if not addr.is_initialized:
            return None

        data = self.get_data_for_addr(addr)
        if data is None:
            return None

        stream_length = es.data_sizes[stream_number]
        if data is not None and len(data) < stream_length:
            print(es, file=sys.stderr)
            raise ValueError(f"Could not get all of the data for stream {stream_number}")
        data = data[0:stream_length]
        return data

    def get_metadata(self, key: typing.Union[str, EntryStore, CacheKey]) -> list[typing.Optional[CachedMetadata]]:
        buffer = self.get_data_buffer(key, 0)
        if not buffer:
            return [None]
        meta = CachedMetadata.from_buffer(buffer)
        return [meta]

    def get_cachefile(self, key: typing.Union[str, EntryStore, CacheKey]) -> list[bytes]:
        return [self.get_data_buffer(key, 1)]

    def __enter__(self) -> "ChromiumBlockFileCache":
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def keys(self) -> typing.Iterable[str]:
        yield from self._keys.keys()

    def cache_keys(self) -> typing.Iterable[CacheKey]:
        for k in self._keys.keys():
            yield CacheKey(k)

    def values(self) -> typing.Iterable[EntryStore]:
        yield from self._keys.values()

    def items(self) -> typing.Iterable[tuple[str, EntryStore]]:
        yield from self._keys.items()

    def __contains__(self, item) -> bool:
        if isinstance(item, CacheKey):
            item = item.raw_key
        return item in self._keys

    def __getitem__(self, item) -> EntryStore:
        if isinstance(item, CacheKey):
            item = item.raw_key
        return self._keys[item]

    def close(self):
        for _, stream in self._block_files.values():
            stream.close()

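A hedged usage sketch of the blockfile reader above — the `"./Cache_Data"` path is a placeholder, not something shipped with the package:

```python
with ChromiumBlockFileCache("./Cache_Data") as cache:
    for cache_key in cache.cache_keys():
        # metadata and data come back as parallel lists, one element per record
        metas = cache.get_metadata(cache_key)
        datas = cache.get_cachefile(cache_key)
        for meta, data in zip(metas, datas):
            content_type = meta.get_attribute("content-type") if meta else []
            print(cache_key.url, content_type, len(data) if data else 0)
```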
@dataclasses.dataclass(frozen=True)
class SimpleCacheEOF:
    # net/disk_cache/simple/simple_entry_format.h
    flags: int
    data_crc: int
    stream_size: int

    _SIMPLE_FINAL_MAGIC: typing.ClassVar[int] = 0xf4fa6f45970d41d8  # is written little-endian in the file

    @classmethod
    def from_reader(cls, reader: BinaryReader):
        magic = reader.read_uint64()
        if magic != SimpleCacheEOF._SIMPLE_FINAL_MAGIC:
            raise ValueError(f"Invalid magic (expected {SimpleCacheEOF._SIMPLE_FINAL_MAGIC}; got {magic})")

        flags = reader.read_uint32()
        data_crc = reader.read_uint32()
        stream_size = reader.read_uint32()

        return cls(flags, data_crc, stream_size)

    @property
    def has_crc(self):
        return self.flags & 1 > 0

    @property
    def has_key_sha256(self):
        return self.flags & 2 > 0


@dataclasses.dataclass(frozen=True)
class SimpleCacheHeader:
    # net/disk_cache/simple/simple_entry_format.h
    version: int
    key_length: int
    key_hash: int

    _SIMPLE_INITIAL_MAGIC: typing.ClassVar[int] = 0xfcfb6d1ba7725c30  # is written little-endian in the file

    @classmethod
    def from_reader(cls, reader: BinaryReader):
        magic = reader.read_uint64()
        if magic != SimpleCacheHeader._SIMPLE_INITIAL_MAGIC:
            raise ValueError(f"Invalid magic (expected {SimpleCacheHeader._SIMPLE_INITIAL_MAGIC}; got {magic})")
        version = reader.read_uint32()
        key_length = reader.read_uint32()
        key_hash = reader.read_uint32()

        if EIGHT_BYTE_PICKLE_ALIGNMENT:
            _ = reader.read_uint32()  # need to align to 8 bytes before we get to the key

        return cls(version, key_length, key_hash)

class SimpleCacheFile:
    # net/disk_cache/simple/simple_entry_format.h

    def __init__(self, cache_file: typing.Union[os.PathLike, str]):
        self._path = pathlib.Path(cache_file)
        self._reader = BinaryReader(self._path.open("rb"))
        self._header = SimpleCacheHeader.from_reader(self._reader)
        self._key = self._reader.read_raw(self._header.key_length).decode("latin-1")

        # Peek forwards - are we at EOF? Sometimes (rarely) you only get a URL
        if self._reader.is_eof:
            self._stream_0_eof = None
            self._stream_1_eof = None
            self._stream_0_start_offset_negative = 0
            self._stream_1_start_offset = 0
            self._stream_1_length = 0

            self._has_data = False
            return
        else:
            self._has_data = True

        # get stream 0 EOF
        self._reader.seek(-SIMPLE_EOF_SIZE, os.SEEK_END)
        self._stream_0_eof = SimpleCacheEOF.from_reader(self._reader)
        self._stream_0_start_offset_negative = -SIMPLE_EOF_SIZE - self._stream_0_eof.stream_size
        if self._stream_0_eof.has_key_sha256:
            self._stream_0_start_offset_negative -= 32

        # get stream 1 EOF
        # the size of the stream 0 eof, the size of the stream 1 eof, the size of stream 0, 32 bytes if there's sha256
        self._reader.seek(-SIMPLE_EOF_SIZE - SIMPLE_EOF_SIZE - self._stream_0_eof.stream_size, os.SEEK_END)
        if self._stream_0_eof.has_key_sha256:
            self._reader.seek(-32, os.SEEK_CUR)
        stream_1_end_offset = self._reader.tell()
        # the eof for stream 1 might contain a stream length, but the comments in simple_entry_format.h say it won't?
        self._stream_1_eof = SimpleCacheEOF.from_reader(self._reader)
        # the header length happens to equal SIMPLE_EOF_SIZE (20 bytes, or 24 with 8-byte alignment)
        self._stream_1_start_offset = SIMPLE_EOF_SIZE + self._header.key_length
        self._stream_1_length = stream_1_end_offset - self._stream_1_start_offset

    def __enter__(self) -> "SimpleCacheFile":
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def get_stream_0(self):
        if self._has_data:
            self._reader.seek(self._stream_0_start_offset_negative, os.SEEK_END)
            return self._reader.read_raw(self._stream_0_eof.stream_size)
        return b""

    def get_stream_1(self):
        if self._has_data:
            self._reader.seek(self._stream_1_start_offset, os.SEEK_SET)
            return self._reader.read_raw(self._stream_1_length)
        return b""

    @property
    def data_start_offset(self):
        return self._stream_1_start_offset

    @property
    def metadata_start_offset_negative(self):
        return self._stream_0_start_offset_negative

    @property
    def path(self) -> pathlib.Path:
        return self._path

    @property
    def key(self) -> str:
        return self._key

    @property
    def key_hash(self) -> int:
        return self._header.key_hash

    def close(self):
        self._reader.close()

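To make the seeking in `SimpleCacheFile.__init__` easier to follow, here is a sketch of the `<hash>_0` file layout it navigates (sizes as defined above for an 8-byte-aligned build; offsets are relative, not from a real file):

```python
# Layout of a simple-cache "<hash>_0" entry file, front to back:
#
#   [SimpleCacheHeader: SIMPLE_EOF_SIZE bytes][key: key_length bytes]
#   [stream 1 data ...]
#   [SimpleCacheEOF for stream 1: SIMPLE_EOF_SIZE bytes]
#   [stream 0 data ...]
#   [optional SHA-256 of the key: 32 bytes, if the stream 0 EOF flags say so]
#   [SimpleCacheEOF for stream 0: SIMPLE_EOF_SIZE bytes]
#
# so stream 1 starts at SIMPLE_EOF_SIZE + key_length, and stream 0 is located
# by seeking backwards from the end of the file - which is what __init__ does.
```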
class ChromiumSimpleFileCache(ChromiumCache):
    # net/disk_cache/simple/simple_entry_format.h
    _STREAM_0_1_FILENAME_PATTERN = re.compile(r"^[0-9a-f]{16}_0$")

    def __init__(self, cache_dir: typing.Union[os.PathLike, str]):
        self._cache_dir = pathlib.Path(cache_dir)
        self._file_lookup = types.MappingProxyType(self._build_keys())

    @property
    def cache_dir(self) -> pathlib.Path:
        return self._cache_dir

    def _build_keys(self) -> dict[str, list[pathlib.Path]]:
        # doing it this way is slow, but saves on having a million file handles open
        lookup: dict[str, list[pathlib.Path]] = {}
        for cache_file in self._cache_dir.iterdir():
            if cache_file.is_file() and ChromiumSimpleFileCache._STREAM_0_1_FILENAME_PATTERN.match(cache_file.name):
                with SimpleCacheFile(cache_file) as cf:
                    # if cf.key in lookup:
                    #     raise ValueError(f"{cf.key} already in lookup (please contact developer)")
                    lookup.setdefault(cf.key, [])
                    lookup[cf.key].append(cache_file)

        return lookup

    def get_location_for_metadata(self, key: typing.Union[str, CacheKey]) -> list[CacheFileLocation]:
        result = []
        if isinstance(key, CacheKey):
            key = key.raw_key
        for file in self._file_lookup[key]:
            file_length = file.stat().st_size
            with SimpleCacheFile(file) as cf:
                offset = file_length + cf.metadata_start_offset_negative
                result.append(CacheFileLocation(file.name, offset))
        return result

    def get_location_for_cachefile(self, key: typing.Union[str, CacheKey]) -> list[CacheFileLocation]:
        result = []
        if isinstance(key, CacheKey):
            key = key.raw_key
        for file in self._file_lookup[key]:
            with SimpleCacheFile(file) as cf:
                offset = cf.data_start_offset
                result.append(CacheFileLocation(file.name, offset))
        return result

    def get_metadata(self, key: typing.Union[str, CacheKey]) -> list[typing.Optional[CachedMetadata]]:
        result = []
        if isinstance(key, CacheKey):
            key = key.raw_key
        for file in self._file_lookup[key]:
            with SimpleCacheFile(file) as cf:
                buffer = cf.get_stream_0()
                if buffer:
                    result.append(CachedMetadata.from_buffer(buffer))
                else:
                    result.append(None)
        return result

    def get_cachefile(self, key: typing.Union[str, CacheKey]) -> list[bytes]:
        result = []
        if isinstance(key, CacheKey):
            key = key.raw_key
        for file in self._file_lookup[key]:
            with SimpleCacheFile(file) as cf:
                result.append(cf.get_stream_1())
        return result

    def __enter__(self) -> "ChromiumCache":
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass

    def close(self):
        pass

    def keys(self) -> typing.Iterable[str]:
        yield from self._file_lookup.keys()

    def cache_keys(self) -> typing.Iterable[CacheKey]:
        for k in self._file_lookup.keys():
            yield CacheKey(k)

    def get_file_for_key(self, key: typing.Union[str, CacheKey]) -> list[str]:
        if isinstance(key, CacheKey):
            key = key.raw_key
        return [x.name for x in self._file_lookup[key]]

def guess_cache_class(
        cache_dir: typing.Optional[typing.Union[pathlib.Path, os.PathLike]]) \
        -> typing.Optional[typing.Type[typing.Union[ChromiumBlockFileCache, ChromiumSimpleFileCache]]]:
    cache_dir = pathlib.Path(cache_dir)
    data_files = {"data_0", "data_1", "data_2", "data_3"}

    for file in cache_dir.iterdir():
        # multiple tests so we can return as soon as possible
        if file.name == "index-dir":
            return ChromiumSimpleFileCache
        elif file.name in data_files:
            return ChromiumBlockFileCache
        elif re.match(r"f_[0-9a-f]{6}", file.name):
            return ChromiumBlockFileCache
        elif re.match(r"^[0-9a-f]{16}_0$", file.name):
            return ChromiumSimpleFileCache

    return None

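`guess_cache_class` lets callers stay format-agnostic: it returns one of the two concrete classes (or `None`) based on the file names it finds, and both classes share the `ChromiumCache` interface. A hedged sketch, again with `"./Cache_Data"` as a placeholder path:

```python
cache_class = guess_cache_class("./Cache_Data")
if cache_class is not None:
    with cache_class("./Cache_Data") as cache:
        print(sum(1 for _ in cache.keys()), "cached resources found")
```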
def main(args):
    import csv
    import hashlib
    import mimetypes
    import brotli
    import gzip

    in_cache_dir = pathlib.Path(args[0])
    out_dir = pathlib.Path(args[1])
    cache_out_dir = out_dir / "cache_files"

    if not in_cache_dir.is_dir():
        raise ValueError("Input directory is not a directory or does not exist")

    if out_dir.exists():
        raise ValueError("Output directory already exists")

    out_dir.mkdir()
    cache_out_dir.mkdir()

    default_row_headers = ["file_hash", "key", "request_time", "response_time", "date"]
    dynamic_row_headers = set()
    rows: list[dict] = []

    cache_type = guess_cache_class(in_cache_dir)
    if cache_type is None:
        raise ValueError("Could not detect Chrome cache type")

    with cache_type(in_cache_dir) as cache:
        for key in cache.keys():
            out_extension = ""
            content_encoding = ""
            row = {"key": key}
            rows.append(row)

            metas = cache.get_metadata(key)
            datas = cache.get_cachefile(key)

            if len(metas) != len(datas):
                raise ValueError("Metadata records count does not match data records count")

            for meta, data in zip(metas, datas):
                if meta is not None:
                    row["request_time"] = meta.request_time
                    row["response_time"] = meta.response_time
                    for attribute, value in meta.http_header_attributes:
                        dynamic_row_headers.add(attribute)
                        if attribute in row:
                            row[attribute] += f"; {value}"
                        else:
                            row[attribute] = value

                    if mime := meta.get_attribute("content-type"):
                        out_extension = mimetypes.guess_extension(mime[0]) or ""

                    content_encoding = (meta.get_attribute("content-encoding") or [""])[0]

                # data = cache.get_cachefile(key)
                if data is not None:
                    if content_encoding.strip() == "gzip":
                        try:
                            data = gzip.decompress(data)
                        except (EOFError, gzip.BadGzipFile) as ex:
                            print(f"Warning: could not decompress data for key: \"{key}\"; reason: {ex}")
                    elif content_encoding.strip() == "br":
                        try:
                            data = brotli.decompress(data)
                        except brotli.error as ex:
                            print(f"Warning: could not decompress data for key: \"{key}\"; reason: {ex}")
                    elif content_encoding.strip() == "deflate":
                        try:
                            data = zlib.decompress(data, -zlib.MAX_WBITS)  # suppress trying to read a header
                        except zlib.error as ex:
                            print(f"Warning: could not decompress data for key: \"{key}\"; reason: {ex}")
                    elif content_encoding.strip() != "":
                        print(f"Warning: unknown content-encoding: {content_encoding}")

                    h = hashlib.sha256()
                    h.update(data)
                    cache_file_hash = h.hexdigest()
                    row["file_hash"] = cache_file_hash
                    with (cache_out_dir / (cache_file_hash + out_extension)).open("wb") as out:
                        out.write(data)
                else:
                    row["file_hash"] = "<No cache file data>"

    csv_out_f = (out_dir / "cache_report.csv").open("wt", encoding="utf-8", newline="")
    csv_out_f.write("\ufeff")
    csv_out = csv.DictWriter(
        csv_out_f, fieldnames=default_row_headers + sorted(dynamic_row_headers), dialect=csv.excel,
        quoting=csv.QUOTE_ALL, quotechar="\"", escapechar="\\")
    csv_out.writeheader()
    for row in rows:
        csv_out.writerow(row)

    csv_out_f.close()


if __name__ == '__main__':
    if len(sys.argv) < 3:
        print(f"USAGE: {pathlib.Path(sys.argv[0]).name} <cache input dir> <out dir>")
        exit(1)
    main(sys.argv[1:])