linear-mcp-fast 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. ccl_chromium_reader/__init__.py +2 -0
  2. ccl_chromium_reader/ccl_chromium_cache.py +1335 -0
  3. ccl_chromium_reader/ccl_chromium_filesystem.py +302 -0
  4. ccl_chromium_reader/ccl_chromium_history.py +357 -0
  5. ccl_chromium_reader/ccl_chromium_indexeddb.py +1060 -0
  6. ccl_chromium_reader/ccl_chromium_localstorage.py +454 -0
  7. ccl_chromium_reader/ccl_chromium_notifications.py +268 -0
  8. ccl_chromium_reader/ccl_chromium_profile_folder.py +568 -0
  9. ccl_chromium_reader/ccl_chromium_sessionstorage.py +368 -0
  10. ccl_chromium_reader/ccl_chromium_snss2.py +332 -0
  11. ccl_chromium_reader/ccl_shared_proto_db_downloads.py +189 -0
  12. ccl_chromium_reader/common.py +19 -0
  13. ccl_chromium_reader/download_common.py +78 -0
  14. ccl_chromium_reader/profile_folder_protocols.py +276 -0
  15. ccl_chromium_reader/serialization_formats/__init__.py +0 -0
  16. ccl_chromium_reader/serialization_formats/ccl_blink_value_deserializer.py +401 -0
  17. ccl_chromium_reader/serialization_formats/ccl_easy_chromium_pickle.py +133 -0
  18. ccl_chromium_reader/serialization_formats/ccl_protobuff.py +276 -0
  19. ccl_chromium_reader/serialization_formats/ccl_v8_value_deserializer.py +627 -0
  20. ccl_chromium_reader/storage_formats/__init__.py +0 -0
  21. ccl_chromium_reader/storage_formats/ccl_leveldb.py +582 -0
  22. ccl_simplesnappy/__init__.py +1 -0
  23. ccl_simplesnappy/ccl_simplesnappy.py +306 -0
  24. linear_mcp_fast/__init__.py +8 -0
  25. linear_mcp_fast/__main__.py +6 -0
  26. linear_mcp_fast/reader.py +433 -0
  27. linear_mcp_fast/server.py +367 -0
  28. linear_mcp_fast/store_detector.py +117 -0
  29. linear_mcp_fast-0.1.0.dist-info/METADATA +160 -0
  30. linear_mcp_fast-0.1.0.dist-info/RECORD +39 -0
  31. linear_mcp_fast-0.1.0.dist-info/WHEEL +5 -0
  32. linear_mcp_fast-0.1.0.dist-info/entry_points.txt +2 -0
  33. linear_mcp_fast-0.1.0.dist-info/top_level.txt +4 -0
  34. tools_and_utilities/Chromium_dump_local_storage.py +111 -0
  35. tools_and_utilities/Chromium_dump_session_storage.py +92 -0
  36. tools_and_utilities/benchmark.py +35 -0
  37. tools_and_utilities/ccl_chrome_audit.py +651 -0
  38. tools_and_utilities/dump_indexeddb_details.py +59 -0
  39. tools_and_utilities/dump_leveldb.py +53 -0
@@ -0,0 +1,1335 @@
+ """
+ Copyright 2022-2025, CCL Forensics
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
+ this software and associated documentation files (the "Software"), to deal in
+ the Software without restriction, including without limitation the rights to
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+ of the Software, and to permit persons to whom the Software is furnished to do
+ so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+ """
+
+
+ import abc
+ import dataclasses
+ import io
+ import os
+ import re
+ import sys
+ import types
+ import typing
+ import pathlib
+ import datetime
+ import struct
+ import enum
+ import zlib
+
+ __version__ = "0.22"
+ __description__ = "Library for reading Chrome/Chromium Cache (both blockfile and simple format)"
+ __contact__ = "Alex Caithness"
+
+
+ _CHROME_EPOCH = datetime.datetime(1601, 1, 1)
+ EIGHT_BYTE_PICKLE_ALIGNMENT = True  # switch this if you get errors about the EOF magic when reading a Simple Cache
+ SIMPLE_EOF_SIZE = 24 if EIGHT_BYTE_PICKLE_ALIGNMENT else 20
+
+
+ def decode_chrome_time(us: int) -> datetime.datetime:
+     return _CHROME_EPOCH + datetime.timedelta(microseconds=us)
+
+
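Chrome/Chromium timestamps are microseconds elapsed since 1601-01-01 (the Windows epoch). A minimal sanity check of decode_chrome_time, using an illustrative value rather than one taken from a real cache:

    import datetime

    # 13,253,932,800,000,000 us after 1601-01-01 00:00 is 2021-01-01 00:00
    assert decode_chrome_time(13253932800000000) == datetime.datetime(2021, 1, 1)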
+ class BinaryReader:
+     """
+     Utility class which wraps a BinaryIO and provides reading of the primitive data types needed for cache parsing
+     """
+     def __init__(self, stream: typing.BinaryIO):
+         self._stream = stream
+         self._closed = False
+
+     @classmethod
+     def from_bytes(cls, buffer: bytes):
+         return cls(io.BytesIO(buffer))
+
+     def close(self):
+         self._stream.close()
+         self._closed = True
+
+     def __enter__(self) -> "BinaryReader":
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         self.close()
+
+     def tell(self) -> int:
+         return self._stream.tell()
+
+     def seek(self, offset: int, whence: int) -> int:
+         return self._stream.seek(offset, whence)
+
+     def read_raw(self, count: int) -> bytes:
+         start_offset = self._stream.tell()
+         result = self._stream.read(count)
+         if len(result) != count:
+             raise ValueError(
+                 f"Could not read all of the data starting at {start_offset}. Expected: {count}; got {len(result)}")
+         return result
+
+     def read_utf8(self, count: int) -> str:
+         return self.read_raw(count).decode("utf-8")
+
+     def read_int16(self) -> int:
+         raw = self.read_raw(2)
+         return struct.unpack("<h", raw)[0]
+
+     def read_int32(self) -> int:
+         raw = self.read_raw(4)
+         return struct.unpack("<i", raw)[0]
+
+     def read_int64(self) -> int:
+         raw = self.read_raw(8)
+         return struct.unpack("<q", raw)[0]
+
+     def read_uint16(self) -> int:
+         raw = self.read_raw(2)
+         return struct.unpack("<H", raw)[0]
+
+     def read_uint32(self) -> int:
+         raw = self.read_raw(4)
+         return struct.unpack("<I", raw)[0]
+
+     def read_uint64(self) -> int:
+         raw = self.read_raw(8)
+         return struct.unpack("<Q", raw)[0]
+
+     def read_addr(self) -> "Addr":
+         return Addr.from_int(self.read_uint32())
+
+     def read_datetime(self) -> datetime.datetime:
+         return decode_chrome_time(self.read_uint64())
+
+     @property
+     def is_closed(self) -> bool:
+         return self._closed
+
+     @property
+     def is_eof(self) -> bool:
+         test = self._stream.read(1)
+         if len(test) == 0:
+             return True
+         self._stream.seek(-1, os.SEEK_CUR)
+         return False
+
+
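All multi-byte reads are little-endian (struct's "<" prefix). A short sketch of using the reader against an in-memory buffer; the byte values are purely illustrative:

    # 0x01020304 stored little-endian, followed by 0xFFFF as an unsigned 16-bit value
    with BinaryReader.from_bytes(b"\x04\x03\x02\x01\xff\xff") as r:
        assert r.read_uint32() == 0x01020304
        assert r.read_uint16() == 0xFFFF
        assert r.is_eof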
+ class FileType(enum.IntEnum):
+     # net/disk_cache/blockfile/disk_format.h
+     EXTERNAL = 0
+     RANKINGS = 1
+     BLOCK_256 = 2
+     BLOCK_1K = 3
+     BLOCK_4K = 4
+     BLOCK_FILES = 5
+     BLOCK_ENTRIES = 6
+     BLOCK_EVICTED = 7
+
+
+ _BLOCKSIZE_FOR_FILETYPE = {
+     FileType.RANKINGS: 36,
+     FileType.BLOCK_256: 256,
+     FileType.BLOCK_1K: 1024,
+     FileType.BLOCK_4K: 4096,
+     FileType.BLOCK_FILES: 8,
+     FileType.BLOCK_ENTRIES: 104,
+     FileType.BLOCK_EVICTED: 48,
+     FileType.EXTERNAL: 0
+ }
+
+
+ _BLOCK_FILE_FILETYPE = {FileType.BLOCK_256, FileType.BLOCK_1K, FileType.BLOCK_4K}
+
+
+ class CacheKey:
+     """
+     Class representing a parsed Chromium Cache Key.
+     """
+     # net/http/http_cache.cc GenerateCacheKey
+     CRED_UPLOAD_KEY_PREFIX_PATTERN = re.compile(r"^\d+/\d+/")  # 'current' (since Sept '21)
+     UPLOAD_ONLY_KEY_PREFIX_PATTERN = re.compile(r"^\d+/")  # prior to Sept '21
+     # if neither of the above matches, we assume we only have a URL
+
+     def __init__(self, raw_key: str):
+         self._raw_key = raw_key
+
+         # We have to account for a few different versions of keys, which we can distinguish by their prefixes
+         if CacheKey.UPLOAD_ONLY_KEY_PREFIX_PATTERN.match(self._raw_key):
+             if CacheKey.CRED_UPLOAD_KEY_PREFIX_PATTERN.match(self._raw_key):
+                 split_key = self._raw_key.split("/", 2)
+                 self._credential_key = split_key[0]
+                 self._upload_data_identifier = int(split_key[1])
+             else:
+                 split_key = self._raw_key.split("/", 1)
+                 self._credential_key = ""
+                 self._upload_data_identifier = int(split_key[0])
+
+             if split_key[-1].startswith("_dk_"):
+                 # consume two kDoubleKeySeparator (a space); the url is after that
+                 (self._isolation_key_top_frame_site,
+                  self._isolation_key_variable_part,
+                  self._url) = split_key[-1][4:].split(" ", 2)
+                 if self._isolation_key_top_frame_site.startswith("s_"):
+                     self._isolation_key_top_frame_site = self._isolation_key_top_frame_site[2:]
+             else:
+                 self._url = split_key[-1]
+                 self._isolation_key_top_frame_site = None
+                 self._isolation_key_variable_part = None
+         else:
+             # if the prefixes don't hit, this should just be a URL
+             self._url = self._raw_key
+             self._credential_key = ""
+             self._upload_data_identifier = None
+             self._isolation_key_top_frame_site = None
+             self._isolation_key_variable_part = None
+
+     @property
+     def raw_key(self) -> str:
+         return self._raw_key
+
+     @property
+     def url(self) -> str:
+         return self._url
+
+     @property
+     def credential_key(self) -> str:
+         return self._credential_key
+
+     @property
+     def upload_data_identifier(self) -> typing.Optional[int]:
+         return self._upload_data_identifier
+
+     @property
+     def isolation_key_top_frame_site(self) -> typing.Optional[str]:
+         return self._isolation_key_top_frame_site
+
+     @property
+     def isolation_key_variable_part(self):
+         return self._isolation_key_variable_part
+
+     def __str__(self):
+         return self._raw_key
+
+     def __repr__(self):
+         return (f"<CacheKey url: {self._url}; credential_key: {self._credential_key}; "
+                 f"upload_data_identifier: {self._upload_data_identifier}; "
+                 f"isolation_key_top_frame_site: {self._isolation_key_top_frame_site}; "
+                 f"isolation_key_variable_part: {self._isolation_key_variable_part}>")
+
+
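To illustrate two of the key shapes the parser accepts, a sketch with made-up keys (the URLs, identifiers, and isolation-key parts below are hypothetical, not taken from a real cache):

    # current form: credential key / upload id / double-keyed isolation + URL
    k = CacheKey("0/0/_dk_s_https://example.com s_https://example.com https://example.com/app.js")
    assert k.url == "https://example.com/app.js"
    assert k.isolation_key_top_frame_site == "https://example.com"  # the "s_" prefix is stripped

    # bare URL (no recognised prefix)
    assert CacheKey("https://example.com/app.js").url == "https://example.com/app.js"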
+ class Addr:
+     # net/disk_cache/blockfile/addr.h
+     def __init__(
+             self, is_initialized: bool, file_type: FileType, file_number: typing.Optional[int],
+             contiguous_blocks: typing.Optional[int], file_selector: typing.Optional[int],
+             block_number: typing.Optional[int], reserved_bits: typing.Optional[int]):
+         self._is_initialized = is_initialized
+         self._file_type = file_type
+         self._file_number = file_number
+         self._contiguous_blocks = contiguous_blocks
+         self._file_selector = file_selector
+         self._block_number = block_number
+         self._reserved_bits = reserved_bits
+
+     def __repr__(self):
+         return (f"<Addr: is_initialized: {self._is_initialized}; file_type: {self._file_type.name}; "
+                 f"file_number: {self._file_number}; contiguous_blocks: {self._contiguous_blocks}; "
+                 f"file_selector: {self._file_selector}; block_number: {self._block_number}>")
+
+     @classmethod
+     def from_int(cls, i: int):
+         is_initialized = (i & 0x80000000) > 0
+         file_type = FileType((i & 0x70000000) >> 28)
+
+         if file_type == FileType.EXTERNAL:
+             file_number = i & 0x0fffffff
+             contiguous_blocks = None
+             file_selector = None
+             block_number = None
+             reserved_bits = None
+         else:
+             file_number = None
+             contiguous_blocks = 1 + ((i & 0x03000000) >> 24)
+             file_selector = (i & 0x00ff0000) >> 16
+             block_number = i & 0x0000ffff
+             reserved_bits = i & 0x0c000000
+
+         return Addr(
+             is_initialized,
+             file_type,
+             file_number,
+             contiguous_blocks,
+             file_selector,
+             block_number,
+             reserved_bits)
+
+     def sanity_check(self) -> bool:
+         # implementation from addr.cc - will hopefully identify invalid data and skip it rather than raising
+         # exceptions. We omit the initialized check from that version, as that's to identify a totally empty entry
+         # (which is sane, but of no use to us in any context we use it).
+         if self._file_type > FileType.BLOCK_4K:
+             return False
+         if self._file_type != FileType.EXTERNAL and self._reserved_bits != 0:
+             return False
+
+         return True
+
+     def sanity_check_for_entry(self) -> bool:
+         return self.sanity_check() and self._file_type == FileType.BLOCK_256
+
+     @property
+     def is_initialized(self) -> bool:
+         return self._is_initialized
+
+     @property
+     def file_type(self) -> FileType:
+         return self._file_type
+
+     @property
+     def contiguous_blocks(self) -> typing.Optional[int]:
+         return self._contiguous_blocks
+
+     @property
+     def file_selector(self) -> typing.Optional[int]:
+         return self._file_selector
+
+     @property
+     def block_number(self) -> typing.Optional[int]:
+         return self._block_number
+
+     @property
+     def external_file_number(self) -> typing.Optional[int]:
+         return self._file_number
+
+
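Following the masks in from_int, a 32-bit block-file addr packs: 1 bit initialized, 3 bits file type, 2 bits (contiguous block count - 1), 8 bits file selector, 16 bits block number. A quick sketch with a hand-built value (not one taken from a real index):

    # 0xA103002A = initialized | BLOCK_256 | 2 contiguous blocks | selector 3 | block 0x2A
    a = Addr.from_int(0xA103002A)
    assert a.is_initialized and a.file_type == FileType.BLOCK_256
    assert a.contiguous_blocks == 2   # bits 24-25 store count - 1
    assert a.file_selector == 3       # i.e. the data_3 block file
    assert a.block_number == 0x2A
    assert a.sanity_check_for_entry()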
+ @dataclasses.dataclass(frozen=True)
+ class LruData:
+     # net/disk_cache/blockfile/disk_format.h
+     filled: int
+     sizes: typing.Collection[int]
+     heads: typing.Collection[Addr]
+     tails: typing.Collection[Addr]
+     transactions: Addr
+     operation: int
+     operation_list: int
+
+     @classmethod
+     def from_bytes(cls, buffer: bytes):
+         with BinaryReader.from_bytes(buffer) as reader:
+             return cls.from_reader(reader)
+
+     @classmethod
+     def from_reader(cls, reader: BinaryReader):
+         _ = [reader.read_int32() for _ in range(2)]  # unused/padding per disk_format.h
+         filled = reader.read_int32()
+         sizes = tuple(reader.read_int32() for _ in range(5))
+         heads = tuple(reader.read_addr() for _ in range(5))
+         tails = tuple(reader.read_addr() for _ in range(5))
+         transaction = reader.read_addr()
+         operation = reader.read_int32()
+         operation_list = reader.read_int32()
+         _ = [reader.read_int32() for _ in range(7)]  # unused/padding per disk_format.h
+
+         return cls(filled, sizes, heads, tails, transaction, operation, operation_list)
+
+
+ @dataclasses.dataclass(frozen=True)
+ class BlockFileIndexHeader:
+     # net/disk_cache/blockfile/disk_format.h
+     version: int
+     num_entries: int
+     num_bytes_v2: int
+     last_file: int
+     this_id: int
+     stats_addr: Addr
+     table_length: int
+     crash: int
+     experiment: int
+     create_time: datetime.datetime
+     num_bytes_v3: int
+     lru: LruData
+
+     _MAGIC: typing.ClassVar[int] = 0xC103CAC3
+
+     @classmethod
+     def from_bytes(cls, buffer: bytes):
+         with BinaryReader.from_bytes(buffer) as reader:
+             return cls.from_reader(reader)
+
+     @classmethod
+     def from_reader(cls, reader: BinaryReader):
+         magic = reader.read_uint32()
+         if magic != BlockFileIndexHeader._MAGIC:
+             raise ValueError("invalid magic")
+         version = reader.read_uint32()
+         num_entries = reader.read_int32()
+         old_v2_num_bytes = reader.read_uint32()
+         last_file = reader.read_int32()
+         this_id = reader.read_int32()
+         stats_addr = reader.read_addr()
+         table_length = reader.read_int32() or 0x10000  # zero means the default table length
+         crash = reader.read_int32()
+         experiment = reader.read_int32()
+         create_time = reader.read_datetime()
+         num_bytes = reader.read_int64()
+         _ = [reader.read_int32() for _ in range(50)]  # unused/padding per disk_format.h
+         lru = LruData.from_reader(reader)
+
+         return cls(
+             version, num_entries, old_v2_num_bytes, last_file, this_id, stats_addr,
+             table_length, crash, experiment, create_time, num_bytes, lru)
+
+
+ class BlockFileIndexFile:
+     # net/disk_cache/blockfile/disk_format.h
+     def __init__(self, file_path: typing.Union[os.PathLike, str]):
+         self._input_path = pathlib.Path(file_path)
+         with BinaryReader(self._input_path.open("rb")) as reader:
+             self._header = BlockFileIndexHeader.from_reader(reader)
+             self._entries = tuple(reader.read_addr() for _ in range(self._header.table_length))
+             self._entries_initialized = tuple(x for x in self._entries if x.is_initialized)
+
+     @property
+     def input_path(self):
+         return self._input_path
+
+     @property
+     def header(self) -> BlockFileIndexHeader:
+         return self._header
+
+     @property
+     def index(self) -> typing.Collection[Addr]:
+         return self._entries
+
+     @property
+     def index_initialized_only(self):
+         return self._entries_initialized
+
+
+ class EntryState(enum.IntEnum):
+     NORMAL = 0
+     EVICTED = 1
+     DOOMED = 2
+
+
+ class EntryFlags(enum.IntFlag):
+     PARENT_ENTRY = 1 << 0
+     CHILD_ENTRY = 1 << 1
+
+
+ @dataclasses.dataclass(frozen=True)
+ class EntryStore:
+     # net/disk_cache/blockfile/disk_format.h
+     entry_hash: int
+     next_entry: Addr
+     rankings_node: Addr
+     reuse_count: int
+     refetch_count: int
+     state: EntryState
+     creation_time: datetime.datetime
+     key_length: int
+     long_key_addr: Addr
+     data_sizes: tuple[int, int, int, int]
+     data_addrs: tuple[Addr, Addr, Addr, Addr]
+     flags: EntryFlags
+     self_hash: int
+     key: typing.Optional[str]
+
+     @property
+     def key_is_external(self) -> bool:
+         return self.long_key_addr.is_initialized
+
+     @classmethod
+     def from_bytes(cls, buffer: bytes):
+         with BinaryReader.from_bytes(buffer) as reader:
+             return cls.from_reader(reader)
+
+     @classmethod
+     def from_reader(cls, reader: BinaryReader):
+         entry_hash = reader.read_uint32()
+         next_entry = reader.read_addr()
+         rankings_node = reader.read_addr()
+         reuse_count = reader.read_int32()
+         refetch_count = reader.read_int32()
+         state = EntryState(reader.read_int32())
+         creation_time = reader.read_datetime()
+         key_length = reader.read_int32()
+         long_key_addr = reader.read_addr()
+         data_sizes = (reader.read_int32(), reader.read_int32(), reader.read_int32(), reader.read_int32())
+         data_addrs = (reader.read_addr(), reader.read_addr(), reader.read_addr(), reader.read_addr())
+         flags = EntryFlags(reader.read_uint32())
+         _ = [reader.read_int32() for _ in range(4)]  # unused/padding per disk_format.h
+         self_hash = reader.read_uint32()
+
+         # short keys are stored inline after the fixed-size fields; long keys live at long_key_addr
+         key = None
+         key_is_external = long_key_addr.is_initialized
+         if not key_is_external:
+             key = reader.read_utf8(key_length)
+
+         return cls(
+             entry_hash, next_entry, rankings_node, reuse_count, refetch_count, state, creation_time, key_length,
+             long_key_addr, data_sizes, data_addrs, flags, self_hash, key)
+
+
+ @dataclasses.dataclass(frozen=True)
+ class BlockFileHeader:
+     # net/disk_cache/blockfile/disk_format_base.h
+     version: int
+     this_file: int
+     next_file: int
+     entry_size: int
+     num_entries: int
+     max_entries: int
+     empty_type_counts: tuple[int, int, int, int]
+     hints: tuple[int, int, int, int]
+     updating: int
+     user: tuple[int, int, int, int, int]
+     allocation_map: bytes
+
+     _MAGIC: typing.ClassVar[int] = 0xC104CAC3
+     _BLOCK_HEADER_SIZE: typing.ClassVar[int] = 8192
+     _MAX_BLOCKS: typing.ClassVar[int] = (_BLOCK_HEADER_SIZE - 80) * 8
+
+     def __post_init__(self):
+         if len(self.allocation_map) != self._MAX_BLOCKS // 8:
+             raise ValueError("invalid allocation map length")
+
+     @classmethod
+     def from_bytes(cls, buffer: bytes):
+         with BinaryReader.from_bytes(buffer) as reader:
+             return cls.from_reader(reader)
+
+     @classmethod
+     def from_reader(cls, reader: BinaryReader):
+         magic = reader.read_uint32()
+         if magic != cls._MAGIC:
+             raise ValueError("invalid magic")
+         version = reader.read_uint32()
+         this_file = reader.read_int16()
+         next_file = reader.read_int16()
+         entry_size = reader.read_int32()
+         num_entries = reader.read_int32()
+         max_entries = reader.read_int32()
+         empty = (reader.read_int32(), reader.read_int32(), reader.read_int32(), reader.read_int32())
+         hints = (reader.read_int32(), reader.read_int32(), reader.read_int32(), reader.read_int32())
+         updating = reader.read_int32()
+         user = (reader.read_int32(), reader.read_int32(), reader.read_int32(), reader.read_int32(),
+                 reader.read_int32())
+
+         allocation_map = reader.read_raw(cls._MAX_BLOCKS // 8)
+
+         return cls(
+             version, this_file, next_file, entry_size, num_entries, max_entries,
+             empty, hints, updating, user, allocation_map)
+
+
+ class CachedMetadataFlags(enum.IntFlag):
+     # net/http/http_response_info.cc
+
+     RESPONSE_INFO_VERSION = 3
+     RESPONSE_INFO_VERSION_MASK = 0xFF
+
+     # This bit is set if the response info has a cert at the end.
+     RESPONSE_INFO_HAS_CERT = 1 << 8
+     RESPONSE_INFO_HAS_SECURITY_BITS = 1 << 9
+     RESPONSE_INFO_HAS_CERT_STATUS = 1 << 10
+     RESPONSE_INFO_HAS_VARY_DATA = 1 << 11
+     RESPONSE_INFO_TRUNCATED = 1 << 12
+     RESPONSE_INFO_WAS_SPDY = 1 << 13
+     RESPONSE_INFO_WAS_ALPN = 1 << 14
+     RESPONSE_INFO_WAS_PROXY = 1 << 15
+     RESPONSE_INFO_HAS_SSL_CONNECTION_STATUS = 1 << 16
+     RESPONSE_INFO_HAS_ALPN_NEGOTIATED_PROTOCOL = 1 << 17
+     RESPONSE_INFO_HAS_CONNECTION_INFO = 1 << 18
+     RESPONSE_INFO_USE_HTTP_AUTHENTICATION = 1 << 19
+     RESPONSE_INFO_HAS_SIGNED_CERTIFICATE_TIMESTAMPS = 1 << 20
+     RESPONSE_INFO_UNUSED_SINCE_PREFETCH = 1 << 21
+     RESPONSE_INFO_HAS_KEY_EXCHANGE_GROUP = 1 << 22
+     RESPONSE_INFO_PKP_BYPASSED = 1 << 23
+     RESPONSE_INFO_HAS_STALENESS = 1 << 24
+     RESPONSE_INFO_HAS_PEER_SIGNATURE_ALGORITHM = 1 << 25
+     RESPONSE_INFO_RESTRICTED_PREFETCH = 1 << 26
+     RESPONSE_INFO_HAS_DNS_ALIASES = 1 << 27
+     RESPONSE_INFO_SINGLE_KEYED_CACHE_ENTRY_UNUSABLE = 1 << 28
+     RESPONSE_INFO_ENCRYPTED_CLIENT_HELLO = 1 << 29
+     RESPONSE_INFO_BROWSER_RUN_ID = 1 << 30
+     RESPONSE_INFO_HAS_EXTRA_FLAGS = 1 << 31  # indicates that we need to read the extra flags after this value
+
+
+ class CachedMetadataExtraFlags(enum.IntFlag):
+     RESPONSE_EXTRA_INFO_DID_USE_SHARED_DICTIONARY = 1
+     RESPONSE_EXTRA_INFO_HAS_PROXY_CHAIN = 1 << 1
+     RESPONSE_EXTRA_INFO_HAS_ORIGINAL_RESPONSE_TIME = 1 << 2
+
+
+ class CachedMetadata:
+     # net/http/http_response_info.cc / net/http/http_response_info.h
+     def __init__(
+             self, header_declarations: set[str], header_attributes: dict[str, list[str]],
+             request_time: datetime.datetime, response_time: datetime.datetime, certs: list[bytes],
+             host_address: str, host_port: int, other_attributes: dict[str, typing.Any]):
+         self._declarations = header_declarations.copy()
+         self._attributes = types.MappingProxyType(header_attributes.copy())
+         self._request_time = request_time
+         self._response_time = response_time
+         self._certs = certs.copy()
+         self._other_attributes = types.MappingProxyType(other_attributes)
+         self._host_address = host_address
+         self._host_port = host_port
+
+     @property
+     def certs(self) -> typing.Iterable[bytes]:
+         yield from self._certs
+
+     @property
+     def http_header_declarations(self) -> typing.Iterable[str]:
+         yield from self._declarations
+
+     @property
+     def request_time(self) -> datetime.datetime:
+         return self._request_time
+
+     @property
+     def response_time(self) -> datetime.datetime:
+         return self._response_time
+
+     @property
+     def http_header_attributes(self) -> typing.Iterable[tuple[str, str]]:
+         for key, vals in self._attributes.items():
+             for val in vals:
+                 yield key, val
+
+     def has_declaration(self, declaration: str) -> bool:
+         return declaration in self._declarations
+
+     def get_attribute(self, attribute: str) -> list[str]:
+         return self._attributes.get(attribute.lower()) or []
+
+     @property
+     def other_cache_attributes(self):
+         return self._other_attributes
+
+     @classmethod
+     def from_buffer(cls, buffer: bytes):
+         # net/http/http_response_info.cc / net/http/http_response_info.h
+         # and for the proxy chain:
+         # net/base/proxy_chain.cc / net/base/proxy_server.h / net/base/proxy_server.cc
+         # This is a pickle, but it's a very simple one, so just align manually rather than use a pickle library.
+         # TODO: this is increasingly not "very simple", so we should move to using ccl_easy_chromium_pickle to tidy
+         #   things up.
+         reader = BinaryReader.from_bytes(buffer)
+         total_length = reader.read_uint32()
+         if total_length != len(buffer) - 4:
+             raise ValueError("Metadata buffer is not the declared size")
+
+         def align():
+             alignment = reader.tell() % 4
+             if alignment != 0:
+                 reader.read_raw(4 - alignment)
+
+         flags = CachedMetadataFlags(reader.read_uint32())
+         if flags & CachedMetadataFlags.RESPONSE_INFO_HAS_EXTRA_FLAGS:
+             extra_flags = CachedMetadataExtraFlags(reader.read_uint32())
+         else:
+             extra_flags = CachedMetadataExtraFlags(0)
+
+         request_time = reader.read_datetime()
+         response_time = reader.read_datetime()
+
+         if extra_flags & CachedMetadataExtraFlags.RESPONSE_EXTRA_INFO_HAS_ORIGINAL_RESPONSE_TIME:
+             # not currently reported as the meaning is not clear, but needs to be read if present in the pickle
+             original_response_time = reader.read_datetime()
+
+         http_header_length = reader.read_uint32()
+         http_header_raw = reader.read_raw(http_header_length)
+
+         header_attributes: dict[str, list[str]] = {}
+         header_declarations = set()
+
+         for header_entry in http_header_raw.split(b"\x00"):
+             if not header_entry:
+                 continue  # skip empty entries
+             parsed_entry = header_entry.decode("latin-1").split(":", 1)
+             if len(parsed_entry) == 1:
+                 header_declarations.add(parsed_entry[0])
+             elif len(parsed_entry) == 2:
+                 header_attributes.setdefault(parsed_entry[0].lower(), [])
+                 header_attributes[parsed_entry[0].lower()].append(parsed_entry[1].strip())
+
+         other_attributes = {}
+
+         certs = []
+         if flags & CachedMetadataFlags.RESPONSE_INFO_HAS_CERT:
+             # net/cert/x509_certificate.cc CreateFromPickle
+             align()
+             cert_count = reader.read_uint32()
+             for _ in range(cert_count):
+                 align()
+                 cert_length = reader.read_uint32()
+                 certs.append(reader.read_raw(cert_length))
+
+         if flags & CachedMetadataFlags.RESPONSE_INFO_HAS_CERT_STATUS:
+             align()
+             other_attributes["cert_status"] = reader.read_uint32()
+
+         if flags & CachedMetadataFlags.RESPONSE_INFO_HAS_SECURITY_BITS:
+             align()
+             other_attributes["security_bits"] = reader.read_int32()
+
+         if flags & CachedMetadataFlags.RESPONSE_INFO_HAS_SSL_CONNECTION_STATUS:
+             align()
+             other_attributes["ssl_connection_status"] = reader.read_int32()
+
+         if flags & CachedMetadataFlags.RESPONSE_INFO_HAS_SIGNED_CERTIFICATE_TIMESTAMPS:
+             align()
+             # these are unused, only here for backwards compatibility
+             ts_count = reader.read_int32()
+             for _ in range(ts_count):
+                 # net/cert/signed_certificate_timestamp.cc
+                 ts_version = reader.read_int32()
+                 str_len = reader.read_int32()
+                 ts_log_id = reader.read_raw(str_len)
+                 align()
+                 ts_timestamp = reader.read_datetime()
+                 str_len = reader.read_int32()
+                 ts_extensions = reader.read_raw(str_len)
+                 align()
+                 ts_hash_algo = reader.read_int32()
+                 ts_sig_algo = reader.read_int32()
+                 str_len = reader.read_int32()
+                 ts_sig_data = reader.read_raw(str_len)
+                 align()
+                 ts_origin = reader.read_int32()
+                 str_len = reader.read_int32()
+                 ts_log_desc = reader.read_raw(str_len)
+                 align()
+                 ts_status = reader.read_uint16()
+                 align()
+
+         if flags & CachedMetadataFlags.RESPONSE_INFO_HAS_VARY_DATA:
+             # net/http/http_vary_data.cc InitFromPickle
+             align()
+             other_attributes["vary_data"] = reader.read_raw(16)
+
+         host, port = None, None
+         try:
+             align()
+             host_length = reader.read_uint32()
+             host = reader.read_raw(host_length).decode("latin-1")
+             align()
+             port = reader.read_uint16()
+         except ValueError:
+             # bail out at this point if we've hit eof
+             return cls(
+                 header_declarations, header_attributes, request_time, response_time, certs, host, port,
+                 other_attributes)
+
+         # todo: there are other fields, but they don't look too relevant for us in many cases;
+         #   return here to review if needed.
+
+         return cls(
+             header_declarations, header_attributes, request_time, response_time, certs, host, port, other_attributes)
+
+
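A short sketch of how the parsed metadata is typically consumed; meta is assumed to be a CachedMetadata obtained from one of the cache classes defined below, and the header names are only examples:

    for name, value in meta.http_header_attributes:
        print(f"{name}: {value}")

    # get_attribute lower-cases the name and returns a list, since headers can repeat
    content_type = (meta.get_attribute("Content-Type") or ["application/octet-stream"])[0]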
+ @dataclasses.dataclass(frozen=True)
+ class CacheFileLocation:
+     file_name: str
+     offset: int
+
+     def __repr__(self):
+         return f"<CacheFileLocation; file_name: '{self.file_name}'; offset: {self.offset}>"
+
+     def __str__(self):
+         return f"{self.file_name} @ {self.offset}"
+
+
+ class ChromiumCache(abc.ABC):
+     """
+     Abstract base class that both forms of concrete cache types inherit from
+     """
+     def get_metadata(self, key: typing.Union[str, CacheKey]) -> list[typing.Optional[CachedMetadata]]:
+         """
+         :param key: the cache key for the entry
+         :return: a list of CachedMetadata objects for this key. Most often this list will contain only one entry,
+             but this library can return old versions of records in some cases. The order of metadata should be the
+             same as the records returned by get_cachefile
+         """
+         raise NotImplementedError()
+
+     def get_cachefile(self, key: typing.Union[str, CacheKey]) -> list[bytes]:
+         """
+         :param key: the cache key for the entry
+         :return: a list of bytes objects for this key containing the cached resource. Most often this list will
+             contain only one entry, but this library can return old versions of records in some cases. The order of
+             data should be the same as the records returned by get_metadata
+         """
+         raise NotImplementedError()
+
+     def get_location_for_metadata(self, key: typing.Union[str, CacheKey]) -> list[CacheFileLocation]:
+         """
+         :param key: the cache key for the entry
+         :return: a list of CacheFileLocation objects for this key's metadata. Most often this list will contain only
+             one entry, but this library can return old versions of records in some cases. The order should be the
+             same as the records returned by get_metadata
+         """
+         raise NotImplementedError()
+
+     def get_location_for_cachefile(self, key: typing.Union[str, CacheKey]) -> list[CacheFileLocation]:
+         """
+         :param key: the cache key for the entry
+         :return: a list of CacheFileLocation objects for this key's data. Most often this list will contain only one
+             entry, but this library can return old versions of records in some cases. The order should be the same
+             as the records returned by get_metadata
+         """
+         raise NotImplementedError()
+
+     def __enter__(self) -> "ChromiumCache":
+         raise NotImplementedError()
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         raise NotImplementedError()
+
+     def keys(self) -> typing.Iterable[str]:
+         """
+         :return: yields the cache keys for this cache instance
+         """
+         raise NotImplementedError()
+
+     def cache_keys(self) -> typing.Iterable[CacheKey]:
+         """
+         :return: yields the cache keys (as CacheKey objects) for this cache instance
+         """
+         raise NotImplementedError()
+
+
+ class ChromiumBlockFileCache(ChromiumCache):
+     def __init__(self, cache_dir: typing.Union[os.PathLike, str]):
+         self._in_dir = pathlib.Path(cache_dir)
+         self._index_file = BlockFileIndexFile(self._in_dir / "index")
+         self._block_files: dict[int, tuple[BlockFileHeader, typing.BinaryIO]] = {}
+         self._keys = self._build_keys()
+
+     def _get_block_file(self, block_file_number: int) -> tuple[BlockFileHeader, typing.BinaryIO]:
+         if cached := self._block_files.get(block_file_number):
+             return cached
+
+         block_file_stream = (self._in_dir / f"data_{block_file_number}").open("rb")
+         header = BlockFileHeader.from_bytes(block_file_stream.read(BlockFileHeader._BLOCK_HEADER_SIZE))
+         self._block_files[block_file_number] = (header, block_file_stream)
+         return header, block_file_stream
+
+     def _build_keys(self):
+         result = {}
+         for addr in self._index_file.index:
+             while addr.is_initialized:
+                 if not addr.sanity_check_for_entry():
+                     print(f"Warning: Addr skipped as it is not sane for an entry: {addr}", file=sys.stderr)
+                     break
+                 raw = self.get_data_for_addr(addr)
+                 try:
+                     es = EntryStore.from_bytes(raw)
+                 except (ValueError, OverflowError):
+                     print("Warning: EntryStore could not be read and is being skipped; the bad data follows. If you "
+                           "believe it to be a valid record, please contact the developer.", file=sys.stderr)
+                     print(raw, file=sys.stderr)
+                     break
+                 if es.key is not None:
+                     key = es.key
+                 else:
+                     key = self.get_data_for_addr(es.long_key_addr).decode("utf-8")[0:es.key_length]
+
+                 result[key] = es
+                 addr = es.next_entry
+
+         return result
+
+     def _get_location(self, key: str, stream_number: int):
+         es = self._keys[key]
+         addr = es.data_addrs[stream_number]
+         if addr.file_type in _BLOCK_FILE_FILETYPE:
+             file_name = f"data_{addr.file_selector}"
+             block_header, stream = self._get_block_file(addr.file_selector)
+             offset = BlockFileHeader._BLOCK_HEADER_SIZE + (block_header.entry_size * addr.block_number)
+             return CacheFileLocation(file_name, offset)
+         elif addr.file_type == FileType.EXTERNAL:
+             file_name = f"f_{addr.external_file_number:06x}"
+             return CacheFileLocation(file_name, 0)
+
+         raise ValueError("unexpected file type")
+
+     def get_location_for_metadata(self, key: typing.Union[str, CacheKey]) -> list[CacheFileLocation]:
+         if isinstance(key, CacheKey):
+             key = key.raw_key
+         return [self._get_location(key, 0)]
+
+     def get_location_for_cachefile(self, key: typing.Union[str, CacheKey]) -> list[CacheFileLocation]:
+         if isinstance(key, CacheKey):
+             key = key.raw_key
+         return [self._get_location(key, 1)]
+
+     def get_stream_for_addr(self, addr: Addr) -> typing.BinaryIO:
+         if not addr.is_initialized:
+             raise ValueError("Addr is not initialized")
+         if addr.file_type in _BLOCK_FILE_FILETYPE:
+             block_header, stream = self._get_block_file(addr.file_selector)
+             stream.seek(BlockFileHeader._BLOCK_HEADER_SIZE + (block_header.entry_size * addr.block_number))
+             return io.BytesIO(stream.read(block_header.entry_size * addr.contiguous_blocks))  # probably slow
+         elif addr.file_type == FileType.EXTERNAL:
+             return (self._in_dir / f"f_{addr.external_file_number:06x}").open("rb")
+
+         raise ValueError("unexpected file type")
+
+     def get_data_for_addr(self, addr: Addr) -> typing.Optional[bytes]:
+         if not addr.is_initialized:
+             raise ValueError("Addr is not initialized")
+         if addr.file_type in _BLOCK_FILE_FILETYPE:
+             block_header, stream = self._get_block_file(addr.file_selector)
+             stream.seek(BlockFileHeader._BLOCK_HEADER_SIZE + (block_header.entry_size * addr.block_number))
+             return stream.read(block_header.entry_size * addr.contiguous_blocks)
+         elif addr.file_type == FileType.EXTERNAL:
+             external_file_path = self._in_dir / f"f_{addr.external_file_number:06x}"
+             if not external_file_path.exists():
+                 print(f"Warning: External cache file {external_file_path} is referenced in the data, but "
+                       f"does not exist in the cache folder.", file=sys.stderr)
+                 return None
+             with external_file_path.open("rb") as f:
+                 return f.read()
+
+         raise ValueError("unexpected file type")
+
+     def get_data_buffer(
+             self, key: typing.Union[str, EntryStore, CacheKey], stream_number: int) -> typing.Optional[bytes]:
+         if stream_number < 0 or stream_number > 2:
+             raise ValueError("invalid stream number")
+         if isinstance(key, EntryStore):
+             es = key
+         elif isinstance(key, CacheKey):
+             es = self._keys[key.raw_key]
+         else:
+             es = self._keys[key]
+
+         addr = es.data_addrs[stream_number]
+         if not addr.is_initialized:
+             return None
+
+         data = self.get_data_for_addr(addr)
+         if data is None:
+             return None
+
+         stream_length = es.data_sizes[stream_number]
+         if len(data) < stream_length:
+             print(es, file=sys.stderr)
+             raise ValueError(f"Could not get all of the data for stream {stream_number}")
+         return data[0:stream_length]
+
+     def get_metadata(self, key: typing.Union[str, EntryStore, CacheKey]) -> list[typing.Optional[CachedMetadata]]:
+         buffer = self.get_data_buffer(key, 0)
+         if not buffer:
+             return [None]
+         meta = CachedMetadata.from_buffer(buffer)
+         return [meta]
+
+     def get_cachefile(self, key: typing.Union[str, EntryStore, CacheKey]) -> list[bytes]:
+         return [self.get_data_buffer(key, 1)]
+
+     def __enter__(self) -> "ChromiumBlockFileCache":
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         self.close()
+
+     def keys(self) -> typing.Iterable[str]:
+         yield from self._keys.keys()
+
+     def cache_keys(self) -> typing.Iterable[CacheKey]:
+         for k in self._keys.keys():
+             yield CacheKey(k)
+
+     def values(self) -> typing.Iterable[EntryStore]:
+         yield from self._keys.values()
+
+     def items(self) -> typing.Iterable[tuple[str, EntryStore]]:
+         yield from self._keys.items()
+
+     def __contains__(self, item) -> bool:
+         if isinstance(item, CacheKey):
+             item = item.raw_key
+         return item in self._keys
+
+     def __getitem__(self, item) -> EntryStore:
+         if isinstance(item, CacheKey):
+             item = item.raw_key
+         return self._keys[item]
+
+     def close(self):
+         for _, stream in self._block_files.values():
+             stream.close()
+
+
+ @dataclasses.dataclass(frozen=True)
+ class SimpleCacheEOF:
+     # net/disk_cache/simple/simple_entry_format.h
+     flags: int
+     data_crc: int
+     stream_size: int
+
+     _SIMPLE_FINAL_MAGIC: typing.ClassVar[int] = 0xf4fa6f45970d41d8  # is written little-endian in the file
+
+     @classmethod
+     def from_reader(cls, reader: BinaryReader):
+         magic = reader.read_uint64()
+         if magic != SimpleCacheEOF._SIMPLE_FINAL_MAGIC:
+             raise ValueError(f"Invalid magic (expected {SimpleCacheEOF._SIMPLE_FINAL_MAGIC}; got {magic})")
+
+         flags = reader.read_uint32()
+         data_crc = reader.read_uint32()
+         stream_size = reader.read_uint32()
+
+         return cls(flags, data_crc, stream_size)
+
+     @property
+     def has_crc(self):
+         return (self.flags & 1) > 0
+
+     @property
+     def has_key_sha256(self):
+         return (self.flags & 2) > 0
+
+
+ @dataclasses.dataclass(frozen=True)
+ class SimpleCacheHeader:
+     # net/disk_cache/simple/simple_entry_format.h
+     version: int
+     key_length: int
+     key_hash: int
+
+     _SIMPLE_INITIAL_MAGIC: typing.ClassVar[int] = 0xfcfb6d1ba7725c30  # is written little-endian in the file
+
+     @classmethod
+     def from_reader(cls, reader: BinaryReader):
+         magic = reader.read_uint64()
+         if magic != SimpleCacheHeader._SIMPLE_INITIAL_MAGIC:
+             raise ValueError(f"Invalid magic (expected {SimpleCacheHeader._SIMPLE_INITIAL_MAGIC}; got {magic})")
+         version = reader.read_uint32()
+         key_length = reader.read_uint32()
+         key_hash = reader.read_uint32()
+
+         if EIGHT_BYTE_PICKLE_ALIGNMENT:
+             _ = reader.read_uint32()  # need to align to 8 bytes before we get to the key
+
+         return cls(version, key_length, key_hash)
+
+
+ class SimpleCacheFile:
+     # net/disk_cache/simple/simple_entry_format.h
+
+     def __init__(self, cache_file: typing.Union[os.PathLike, str]):
+         self._path = pathlib.Path(cache_file)
+         self._reader = BinaryReader(self._path.open("rb"))
+         self._header = SimpleCacheHeader.from_reader(self._reader)
+         self._key = self._reader.read_raw(self._header.key_length).decode("latin-1")
+
+         # Peek forwards - are we at EOF? Sometimes (rarely) you only get a URL
+         if self._reader.is_eof:
+             self._stream_0_eof = None
+             self._stream_1_eof = None
+             self._stream_0_start_offset_negative = 0
+             self._stream_1_start_offset = 0
+             self._stream_1_length = 0
+
+             self._has_data = False
+             return
+         else:
+             self._has_data = True
+
+         # get stream 0 EOF
+         self._reader.seek(-SIMPLE_EOF_SIZE, os.SEEK_END)
+         self._stream_0_eof = SimpleCacheEOF.from_reader(self._reader)
+         self._stream_0_start_offset_negative = -SIMPLE_EOF_SIZE - self._stream_0_eof.stream_size
+         if self._stream_0_eof.has_key_sha256:
+             self._stream_0_start_offset_negative -= 32
+
+         # get stream 1 EOF
+         # seek back past: the stream 0 EOF record, the stream 1 EOF record, the stream 0 data itself, and 32 bytes
+         # for the key's SHA-256 if present
+         self._reader.seek(-SIMPLE_EOF_SIZE - SIMPLE_EOF_SIZE - self._stream_0_eof.stream_size, os.SEEK_END)
+         if self._stream_0_eof.has_key_sha256:
+             self._reader.seek(-32, os.SEEK_CUR)
+         stream_1_end_offset = self._reader.tell()
+         # the eof for stream 1 might contain a stream length, but the comments in simple_entry_format.h say it won't?
+         self._stream_1_eof = SimpleCacheEOF.from_reader(self._reader)
+         # the header happens to be the same size as an EOF record (20 bytes, or 24 with 8-byte alignment)
+         self._stream_1_start_offset = SIMPLE_EOF_SIZE + self._header.key_length
+         self._stream_1_length = stream_1_end_offset - self._stream_1_start_offset
+
+     def __enter__(self) -> "SimpleCacheFile":
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         self.close()
+
+     def get_stream_0(self):
+         if self._has_data:
+             self._reader.seek(self._stream_0_start_offset_negative, os.SEEK_END)
+             return self._reader.read_raw(self._stream_0_eof.stream_size)
+         return b""
+
+     def get_stream_1(self):
+         if self._has_data:
+             self._reader.seek(self._stream_1_start_offset, os.SEEK_SET)
+             return self._reader.read_raw(self._stream_1_length)
+         return b""
+
+     @property
+     def data_start_offset(self):
+         return self._stream_1_start_offset
+
+     @property
+     def metadata_start_offset_negative(self):
+         return self._stream_0_start_offset_negative
+
+     @property
+     def path(self) -> pathlib.Path:
+         return self._path
+
+     @property
+     def key(self) -> str:
+         return self._key
+
+     @property
+     def key_hash(self) -> int:
+         return self._header.key_hash
+
+     def close(self):
+         self._reader.close()
+
+
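For orientation, the simple cache stores each entry in its own file named after a hash of the key. From the reading logic above, the on-disk order within such a file is (sizes assume the 8-byte pickle alignment configured at the top of the module):

    SimpleCacheHeader (24 bytes) + key bytes
    stream 1: the cached resource body
    stream 1 EOF record (24 bytes)
    stream 0: the HTTP response metadata pickle
    optional 32-byte SHA-256 of the key
    stream 0 EOF record (24 bytes)

This is why stream 0 is located by seeking backwards from the end of the file while stream 1 is read forwards from a fixed offset after the key.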
+ class ChromiumSimpleFileCache(ChromiumCache):
+     # net/disk_cache/simple/simple_entry_format.h
+     _STREAM_0_1_FILENAME_PATTERN = re.compile(r"^[0-9a-f]{16}_0$")
+
+     def __init__(self, cache_dir: typing.Union[os.PathLike, str]):
+         self._cache_dir = pathlib.Path(cache_dir)
+         self._file_lookup = types.MappingProxyType(self._build_keys())
+
+     @property
+     def cache_dir(self) -> pathlib.Path:
+         return self._cache_dir
+
+     def _build_keys(self) -> dict[str, list[pathlib.Path]]:
+         # doing it this way is slow, but saves on having a million file handles open
+         lookup: dict[str, list[pathlib.Path]] = {}
+         for cache_file in self._cache_dir.iterdir():
+             if cache_file.is_file() and ChromiumSimpleFileCache._STREAM_0_1_FILENAME_PATTERN.match(cache_file.name):
+                 with SimpleCacheFile(cache_file) as cf:
+                     # if cf.key in lookup:
+                     #     raise ValueError(f"{cf.key} already in lookup (please contact developer)")
+                     lookup.setdefault(cf.key, [])
+                     lookup[cf.key].append(cache_file)
+
+         return lookup
+
+     def get_location_for_metadata(self, key: typing.Union[str, CacheKey]) -> list[CacheFileLocation]:
+         result = []
+         if isinstance(key, CacheKey):
+             key = key.raw_key
+         for file in self._file_lookup[key]:
+             file_length = file.stat().st_size
+             with SimpleCacheFile(file) as cf:
+                 offset = file_length + cf.metadata_start_offset_negative
+                 result.append(CacheFileLocation(file.name, offset))
+         return result
+
+     def get_location_for_cachefile(self, key: typing.Union[str, CacheKey]) -> list[CacheFileLocation]:
+         result = []
+         if isinstance(key, CacheKey):
+             key = key.raw_key
+         for file in self._file_lookup[key]:
+             with SimpleCacheFile(file) as cf:
+                 offset = cf.data_start_offset
+                 result.append(CacheFileLocation(file.name, offset))
+         return result
+
+     def get_metadata(self, key: typing.Union[str, CacheKey]) -> list[typing.Optional[CachedMetadata]]:
+         result = []
+         if isinstance(key, CacheKey):
+             key = key.raw_key
+         for file in self._file_lookup[key]:
+             with SimpleCacheFile(file) as cf:
+                 buffer = cf.get_stream_0()
+                 if buffer:
+                     result.append(CachedMetadata.from_buffer(buffer))
+                 else:
+                     result.append(None)
+         return result
+
+     def get_cachefile(self, key: typing.Union[str, CacheKey]) -> list[bytes]:
+         result = []
+         if isinstance(key, CacheKey):
+             key = key.raw_key
+         for file in self._file_lookup[key]:
+             with SimpleCacheFile(file) as cf:
+                 result.append(cf.get_stream_1())
+         return result
+
+     def __enter__(self) -> "ChromiumCache":
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         pass
+
+     def close(self):
+         pass
+
+     def keys(self) -> typing.Iterable[str]:
+         yield from self._file_lookup.keys()
+
+     def cache_keys(self) -> typing.Iterable[CacheKey]:
+         for k in self._file_lookup.keys():
+             yield CacheKey(k)
+
+     def get_file_for_key(self, key: typing.Union[str, CacheKey]) -> list[str]:
+         if isinstance(key, CacheKey):
+             key = key.raw_key
+         return [x.name for x in self._file_lookup[key]]
+
+
+ def guess_cache_class(
+         cache_dir: typing.Union[pathlib.Path, os.PathLike]) \
+         -> typing.Optional[typing.Type[typing.Union[ChromiumBlockFileCache, ChromiumSimpleFileCache]]]:
+     cache_dir = pathlib.Path(cache_dir)
+     data_files = {"data_0", "data_1", "data_2", "data_3"}
+
+     for file in cache_dir.iterdir():
+         # multiple tests so we can return as soon as possible
+         if file.name == "index-dir":
+             return ChromiumSimpleFileCache
+         elif file.name in data_files:
+             return ChromiumBlockFileCache
+         elif re.match(r"f_[0-9a-f]{6}", file.name):
+             return ChromiumBlockFileCache
+         elif re.match(r"^[0-9a-f]{16}_0$", file.name):
+             return ChromiumSimpleFileCache
+
+     return None
+
+
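Putting the pieces together, a minimal sketch of walking a cache folder with this module; the path below is hypothetical:

    cache_dir = pathlib.Path("Profile/Default/Cache/Cache_Data")  # hypothetical location
    cache_class = guess_cache_class(cache_dir)
    if cache_class is None:
        raise ValueError("not a recognised Chromium cache folder")

    with cache_class(cache_dir) as cache:
        for key in cache.cache_keys():
            # metadata and data lists are index-aligned; entries can be None
            for meta, data in zip(cache.get_metadata(key), cache.get_cachefile(key)):
                if meta is not None:
                    print(key.url, meta.request_time, len(data or b""))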
+ def main(args):
+     import csv
+     import hashlib
+     import mimetypes
+     import gzip
+     import brotli  # third-party dependency
+
+     in_cache_dir = pathlib.Path(args[0])
+     out_dir = pathlib.Path(args[1])
+     cache_out_dir = out_dir / "cache_files"
+
+     if not in_cache_dir.is_dir():
+         raise ValueError("Input directory is not a directory or does not exist")
+
+     if out_dir.exists():
+         raise ValueError("Output directory already exists")
+
+     out_dir.mkdir()
+     cache_out_dir.mkdir()
+
+     default_row_headers = ["file_hash", "key", "request_time", "response_time", "date"]
+     dynamic_row_headers = set()
+     rows: list[dict] = []
+
+     cache_type = guess_cache_class(in_cache_dir)
+     if cache_type is None:
+         raise ValueError("Could not detect Chrome cache type")
+
+     with cache_type(in_cache_dir) as cache:
+         for key in cache.keys():
+             out_extension = ""
+             content_encoding = ""
+             row = {"key": key}
+             rows.append(row)
+
+             metas = cache.get_metadata(key)
+             datas = cache.get_cachefile(key)
+
+             if len(metas) != len(datas):
+                 raise ValueError("Metadata records count does not match data records count")
+
+             for meta, data in zip(metas, datas):
+                 if meta is not None:
+                     row["request_time"] = meta.request_time
+                     row["response_time"] = meta.response_time
+                     for attribute, value in meta.http_header_attributes:
+                         dynamic_row_headers.add(attribute)
+                         if attribute in row:
+                             row[attribute] += f"; {value}"
+                         else:
+                             row[attribute] = value
+
+                     if mime := meta.get_attribute("content-type"):
+                         out_extension = mimetypes.guess_extension(mime[0]) or ""
+
+                     content_encoding = (meta.get_attribute("content-encoding") or [""])[0]
+
+                 if data is not None:
+                     if content_encoding.strip() == "gzip":
+                         try:
+                             data = gzip.decompress(data)
+                         except (EOFError, gzip.BadGzipFile) as ex:
+                             print(f"Warning: could not decompress data for key: \"{key}\"; reason: {ex}")
+                     elif content_encoding.strip() == "br":
+                         try:
+                             data = brotli.decompress(data)
+                         except brotli.error as ex:
+                             print(f"Warning: could not decompress data for key: \"{key}\"; reason: {ex}")
+                     elif content_encoding.strip() == "deflate":
+                         try:
+                             data = zlib.decompress(data, -zlib.MAX_WBITS)  # suppress trying to read a header
+                         except zlib.error as ex:
+                             print(f"Warning: could not decompress data for key: \"{key}\"; reason: {ex}")
+                     elif content_encoding.strip() != "":
+                         print(f"Warning: unknown content-encoding: {content_encoding}")
+
+                     h = hashlib.sha256()
+                     h.update(data)
+                     cache_file_hash = h.hexdigest()
+                     row["file_hash"] = cache_file_hash
+                     with (cache_out_dir / (cache_file_hash + out_extension)).open("wb") as out:
+                         out.write(data)
+                 else:
+                     row["file_hash"] = "<No cache file data>"
+
+     csv_out_f = (out_dir / "cache_report.csv").open("wt", encoding="utf-8", newline="")
+     csv_out_f.write("\ufeff")  # BOM so spreadsheet software detects UTF-8
+     csv_out = csv.DictWriter(
+         csv_out_f, fieldnames=default_row_headers + sorted(dynamic_row_headers - set(default_row_headers)),
+         dialect=csv.excel, quoting=csv.QUOTE_ALL, quotechar="\"", escapechar="\\")
+     csv_out.writeheader()
+     for row in rows:
+         csv_out.writerow(row)
+
+     csv_out_f.close()
+
+
+ if __name__ == '__main__':
+     if len(sys.argv) < 3:
+         print(f"USAGE: {pathlib.Path(sys.argv[0]).name} <cache input dir> <out dir>")
+         sys.exit(1)
+     main(sys.argv[1:])