har2tree 1.36.2__tar.gz → 1.36.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {har2tree-1.36.2 → har2tree-1.36.4}/PKG-INFO +5 -5
- {har2tree-1.36.2 → har2tree-1.36.4}/har2tree/har2tree.py +2 -2
- {har2tree-1.36.2 → har2tree-1.36.4}/har2tree/helper.py +14 -12
- {har2tree-1.36.2 → har2tree-1.36.4}/har2tree/nodes.py +245 -166
- {har2tree-1.36.2 → har2tree-1.36.4}/pyproject.toml +7 -7
- {har2tree-1.36.2 → har2tree-1.36.4}/LICENSE +0 -0
- {har2tree-1.36.2 → har2tree-1.36.4}/README.md +0 -0
- {har2tree-1.36.2 → har2tree-1.36.4}/har2tree/__init__.py +0 -0
- {har2tree-1.36.2 → har2tree-1.36.4}/har2tree/parser.py +0 -0
- {har2tree-1.36.2 → har2tree-1.36.4}/har2tree/py.typed +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: har2tree
|
|
3
|
-
Version: 1.36.2
|
|
3
|
+
Version: 1.36.4
|
|
4
4
|
Summary: HTTP Archive (HAR) to ETE Toolkit generator
|
|
5
5
|
License-Expression: BSD-3-Clause
|
|
6
6
|
License-File: LICENSE
|
|
@@ -23,17 +23,17 @@ Provides-Extra: docs
|
|
|
23
23
|
Requires-Dist: Sphinx (>=9.1.0) ; (python_version >= "3.12") and (extra == "docs")
|
|
24
24
|
Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.14.3)
|
|
25
25
|
Requires-Dist: ete3 (>=3.1.3)
|
|
26
|
-
Requires-Dist: filetype (>=1.2.0)
|
|
27
26
|
Requires-Dist: json-stream (>=2.3.3,<3.0.0)
|
|
28
27
|
Requires-Dist: legacy-cgi (>=2.6.4) ; python_version >= "3.13" and python_version < "4.0"
|
|
29
28
|
Requires-Dist: multipart (>=1.3.0,<2.0.0)
|
|
30
29
|
Requires-Dist: numpy (>=2.2,<2.3) ; python_version < "3.11"
|
|
31
|
-
Requires-Dist: numpy (>=2.4.
|
|
32
|
-
Requires-Dist: publicsuffixlist (>=1.0.2.20260126)
|
|
30
|
+
Requires-Dist: numpy (>=2.4.2) ; python_version >= "3.11" and python_version < "4.0"
|
|
31
|
+
Requires-Dist: pure-magic-rs (>=0.3.1,<0.4.0)
|
|
32
|
+
Requires-Dist: pyfaup-rs (>=0.3,<0.4.0)
|
|
33
33
|
Requires-Dist: requests-toolbelt (>=1.0.0,<2.0.0)
|
|
34
34
|
Requires-Dist: six (>=1.17.0) ; extra == "docs"
|
|
35
35
|
Requires-Dist: tinycss2 (>=1.5.1)
|
|
36
|
-
Requires-Dist: w3lib (>=2.
|
|
36
|
+
Requires-Dist: w3lib (>=2.4.0)
|
|
37
37
|
Project-URL: Documentation, https://har2tree.readthedocs.io/en/latest/
|
|
38
38
|
Project-URL: Repository, https://github.com/Lookyloo/har2tree
|
|
39
39
|
Project-URL: issues, https://github.com/Lookyloo/har2tree/issues
|
|
@@ -250,7 +250,7 @@ class HarFile():
|
|
|
250
250
|
@property
|
|
251
251
|
def root_url(self) -> str:
|
|
252
252
|
"""First URL of the capture"""
|
|
253
|
-
return self.entries[0]['request']['url']
|
|
253
|
+
return self.entries[0]['request']['url'].strip()
|
|
254
254
|
|
|
255
255
|
def __find_header_value(self, har_entry: dict[str, Any], header_name: str) -> str | None:
|
|
256
256
|
"""Get the value of a specific header"""
|
|
@@ -600,7 +600,7 @@ class Har2Tree:
|
|
|
600
600
|
if i in ignore:
|
|
601
601
|
continue
|
|
602
602
|
|
|
603
|
-
n = URLNode(capture_uuid=self.har.capture_uuid
|
|
603
|
+
n = URLNode(capture_uuid=self.har.capture_uuid)
|
|
604
604
|
n.load_har_entry(url_entry, list(self.all_url_requests.keys()))
|
|
605
605
|
if hasattr(n, 'redirect_url'):
|
|
606
606
|
self.all_redirects.append(n.redirect_url)
|
|
@@ -13,13 +13,12 @@ from base64 import b64decode
|
|
|
13
13
|
from collections import defaultdict
|
|
14
14
|
from io import BytesIO
|
|
15
15
|
from logging import LoggerAdapter
|
|
16
|
+
from pure_magic_rs import MagicDb
|
|
16
17
|
from typing import Any
|
|
17
18
|
from collections.abc import Iterable
|
|
18
19
|
from collections.abc import Mapping, MutableMapping
|
|
19
20
|
from urllib.parse import urlparse, unquote_plus, unquote_to_bytes, urljoin
|
|
20
21
|
|
|
21
|
-
import filetype # type: ignore
|
|
22
|
-
|
|
23
22
|
from bs4 import BeautifulSoup, Tag, MarkupResemblesLocatorWarning
|
|
24
23
|
from charset_normalizer import from_bytes
|
|
25
24
|
import tinycss2 # type: ignore[import-untyped]
|
|
@@ -233,11 +232,7 @@ def _unpack_data_uri(data: str) -> tuple[str, str, BytesIO] | None:
|
|
|
233
232
|
mime, mimeparams, unpacked_data = parsed_uri
|
|
234
233
|
if '/' not in mime:
|
|
235
234
|
# Turns out, it happens. The mimetype can be null for example.
|
|
236
|
-
|
|
237
|
-
if kind:
|
|
238
|
-
mime = kind.mime
|
|
239
|
-
else:
|
|
240
|
-
mime = ''
|
|
235
|
+
mime = guess_magic_type(unpacked_data)
|
|
241
236
|
|
|
242
237
|
blob = BytesIO(unpacked_data)
|
|
243
238
|
b_hash = hashlib.sha512(blob.getvalue()).hexdigest()
|
|
@@ -450,11 +445,7 @@ def find_external_ressources(mimetype: str, data: bytes, base_url: str, all_requ
|
|
|
450
445
|
# Just in case, there is sometimes an unescape call in JS code
|
|
451
446
|
for to_unescape in re.findall(r'unescape\(\'(.*)\'\)', string_soup):
|
|
452
447
|
unescaped = unquote_to_bytes(to_unescape)
|
|
453
|
-
|
|
454
|
-
if kind:
|
|
455
|
-
mimetype = kind.mime
|
|
456
|
-
else:
|
|
457
|
-
mimetype = ''
|
|
448
|
+
mimetype = guess_magic_type(unescaped)
|
|
458
449
|
blob = BytesIO(unescaped)
|
|
459
450
|
b_hash = hashlib.sha512(blob.getvalue()).hexdigest()
|
|
460
451
|
embedded_ressources[mimetype].append((b_hash, blob))
|
|
@@ -473,3 +464,14 @@ class Har2TreeLogAdapter(LoggerAdapter): # type: ignore[type-arg]
|
|
|
473
464
|
"""
|
|
474
465
|
def process(self, msg: str, kwargs: MutableMapping[str, Any]) -> tuple[str, MutableMapping[str, Any]]:
|
|
475
466
|
return '[{}] {}'.format(self.extra['uuid'], msg), kwargs # type: ignore[index]
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
magic_db = None
|
|
470
|
+
|
|
471
|
+
|
|
472
|
+
def guess_magic_type(data: bytes) -> str:
|
|
473
|
+
global magic_db
|
|
474
|
+
if magic_db is None:
|
|
475
|
+
magic_db = MagicDb()
|
|
476
|
+
m = magic_db.best_magic_buffer(data)
|
|
477
|
+
return m.mime_type
|
|
@@ -13,40 +13,32 @@ import re
|
|
|
13
13
|
|
|
14
14
|
from base64 import b64decode
|
|
15
15
|
from datetime import datetime, timedelta
|
|
16
|
-
from functools import lru_cache
|
|
16
|
+
from functools import cached_property
|
|
17
17
|
from hashlib import sha256
|
|
18
18
|
from io import BytesIO, StringIO
|
|
19
|
-
from pathlib import Path
|
|
19
|
+
from pathlib import PurePath
|
|
20
20
|
from typing import Any
|
|
21
21
|
from collections.abc import MutableMapping
|
|
22
22
|
from urllib.parse import unquote_plus, urlparse, urljoin, parse_qs
|
|
23
23
|
|
|
24
|
-
import filetype # type: ignore
|
|
25
24
|
import json_stream # type: ignore
|
|
26
25
|
|
|
27
26
|
from bs4 import BeautifulSoup
|
|
28
27
|
from ete3 import TreeNode # type: ignore
|
|
29
|
-
from
|
|
28
|
+
from pyfaup import Url, Host
|
|
30
29
|
from requests_toolbelt.multipart import decoder # type: ignore
|
|
31
30
|
from w3lib.html import strip_html5_whitespace
|
|
32
31
|
from w3lib.url import canonicalize_url, safe_url_string
|
|
33
32
|
|
|
34
|
-
from .helper import find_external_ressources, rebuild_url, find_identifiers, make_soup
|
|
33
|
+
from .helper import find_external_ressources, rebuild_url, find_identifiers, make_soup, guess_magic_type
|
|
35
34
|
from .helper import Har2TreeError, Har2TreeLogAdapter, make_hhhash, HHHashError, HHHashNote
|
|
36
35
|
|
|
37
36
|
|
|
38
|
-
@lru_cache(64)
|
|
39
|
-
def get_public_suffix_list() -> PublicSuffixList:
|
|
40
|
-
# Initialize Public Suffix List
|
|
41
|
-
# TODO (?): fetch the list
|
|
42
|
-
return PublicSuffixList()
|
|
43
|
-
|
|
44
|
-
|
|
45
37
|
class HarTreeNode(TreeNode): # type: ignore[misc]
|
|
46
38
|
|
|
47
|
-
def __init__(self, capture_uuid: str
|
|
39
|
+
def __init__(self, capture_uuid: str):
|
|
48
40
|
"""Node dumpable in json to display with d3js"""
|
|
49
|
-
super().__init__(
|
|
41
|
+
super().__init__()
|
|
50
42
|
logger = logging.getLogger(f'{__name__}.{self.__class__.__name__}')
|
|
51
43
|
self.logger = Har2TreeLogAdapter(logger, {'uuid': capture_uuid})
|
|
52
44
|
self.add_feature('uuid', str(uuid.uuid4()))
|
|
@@ -75,12 +67,11 @@ class URLNode(HarTreeNode):
|
|
|
75
67
|
|
|
76
68
|
start_time: datetime
|
|
77
69
|
|
|
78
|
-
def __init__(self, capture_uuid: str
|
|
70
|
+
def __init__(self, capture_uuid: str):
|
|
79
71
|
"""Node of the URL Tree"""
|
|
80
|
-
super().__init__(capture_uuid=capture_uuid
|
|
72
|
+
super().__init__(capture_uuid=capture_uuid)
|
|
81
73
|
# Do not add the body in the json dump
|
|
82
74
|
self.features_to_skip.add('body')
|
|
83
|
-
self.features_to_skip.add('url_split')
|
|
84
75
|
self.features_to_skip.add('start_time')
|
|
85
76
|
self.features_to_skip.add('time')
|
|
86
77
|
self.features_to_skip.add('time_content_received')
|
|
@@ -134,23 +125,86 @@ class URLNode(HarTreeNode):
|
|
|
134
125
|
return b64decode(_to_decode, altchars=b'-_', validate=True)
|
|
135
126
|
return b64decode(_to_decode, validate=True)
|
|
136
127
|
|
|
128
|
+
@cached_property
|
|
129
|
+
def tld(self) -> str | None:
|
|
130
|
+
if not hasattr(self, 'original_url'):
|
|
131
|
+
return None
|
|
132
|
+
try:
|
|
133
|
+
faup_url = Url(self.original_url)
|
|
134
|
+
if faup_url.suffix:
|
|
135
|
+
return str(faup_url.suffix)
|
|
136
|
+
|
|
137
|
+
self.logger.warning(f'No TLD: "{self.name}"')
|
|
138
|
+
return None
|
|
139
|
+
except Exception as e:
|
|
140
|
+
self.logger.warning(f'Unable to parse URI "{self.name}": {e}')
|
|
141
|
+
return None
|
|
142
|
+
|
|
143
|
+
@cached_property
|
|
144
|
+
def known_tld(self) -> str | None:
|
|
145
|
+
# An alias, to avoid breaking things.
|
|
146
|
+
return self.tld
|
|
147
|
+
|
|
148
|
+
@cached_property
|
|
149
|
+
def domain(self) -> str | None:
|
|
150
|
+
if not hasattr(self, 'original_url'):
|
|
151
|
+
return None
|
|
152
|
+
try:
|
|
153
|
+
faup_url = Url(self.original_url)
|
|
154
|
+
if faup_url.domain:
|
|
155
|
+
return str(faup_url.domain)
|
|
156
|
+
|
|
157
|
+
self.logger.warning(f'No domain: "{self.name}"')
|
|
158
|
+
return None
|
|
159
|
+
except Exception as e:
|
|
160
|
+
self.logger.warning(f'Unable to parse URI "{self.name}": {e}')
|
|
161
|
+
return None
|
|
162
|
+
|
|
137
163
|
def load_har_entry(self, har_entry: MutableMapping[str, Any], all_requests: list[str]) -> None:
|
|
138
164
|
"""Load one entry of the HAR file, initialize most of the features of the node"""
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
165
|
+
|
|
166
|
+
# NOTE: by the HAR specs: "Absolute URL of the request (fragments are not included)."
|
|
167
|
+
self.add_feature('name', unquote_plus(har_entry['request']['url']))
|
|
168
|
+
|
|
169
|
+
# 2026-30-01: Keep original URL so it can be parsed by faup
|
|
170
|
+
if (har_entry['request']['url'].startswith('blob:')
|
|
171
|
+
and har_entry['request']['url'][5:].startswith('http')):
|
|
172
|
+
self.add_feature('original_url', har_entry['request']['url'][5:])
|
|
173
|
+
|
|
174
|
+
elif har_entry['request']['url'].startswith('http'):
|
|
175
|
+
self.add_feature('original_url', har_entry['request']['url'])
|
|
143
176
|
|
|
144
177
|
splitted_url = urlparse(self.name)
|
|
145
|
-
if splitted_url.scheme == '
|
|
146
|
-
# this is a new weird feature, but it seems to be usable as a URL, so let's do that
|
|
147
|
-
self.add_feature('url_split', urlparse(splitted_url.path))
|
|
148
|
-
elif splitted_url.scheme == 'file':
|
|
178
|
+
if splitted_url.scheme == 'file':
|
|
149
179
|
# file on disk, we do not have a proper URL
|
|
150
180
|
self.add_feature('file_on_disk', True)
|
|
151
|
-
|
|
181
|
+
# TODO: Do something better? hostname is the feature name used for the aggregated tree
|
|
182
|
+
# so we need that unless we want to change the JS
|
|
183
|
+
path = PurePath(splitted_url.path)
|
|
184
|
+
self.add_feature('hostname', str(path.parent))
|
|
185
|
+
if path.name:
|
|
186
|
+
self.add_feature('filename', path.name)
|
|
187
|
+
else:
|
|
188
|
+
self.add_feature('filename', 'file.bin')
|
|
152
189
|
else:
|
|
153
|
-
|
|
190
|
+
# We have a URL
|
|
191
|
+
if splitted_url.scheme == 'blob':
|
|
192
|
+
# this is a new weird feature, but it seems to be usable as a URL, so let's do that
|
|
193
|
+
splitted_url = urlparse(splitted_url.path)
|
|
194
|
+
|
|
195
|
+
if splitted_url.hostname:
|
|
196
|
+
self.add_feature('hostname', splitted_url.hostname)
|
|
197
|
+
else:
|
|
198
|
+
self.logger.warning(f'Weird URI with no hostname (?): "{self.name}"')
|
|
199
|
+
self.add_feature('hostname', self.name)
|
|
200
|
+
|
|
201
|
+
if filename := PurePath(splitted_url.path).name:
|
|
202
|
+
self.add_feature('filename', filename)
|
|
203
|
+
else:
|
|
204
|
+
self.add_feature('filename', 'file.bin')
|
|
205
|
+
|
|
206
|
+
if not self.hostname:
|
|
207
|
+
self.logger.warning(f'Missing hostname, something is broken in that node: {har_entry}')
|
|
154
208
|
|
|
155
209
|
# If the URL contains a fragment (i.e. something after a #), it is stripped in the referer.
|
|
156
210
|
# So we need an alternative URL to do a lookup against
|
|
@@ -167,19 +221,6 @@ class URLNode(HarTreeNode):
|
|
|
167
221
|
self.add_feature('time', timedelta(milliseconds=har_entry['time']))
|
|
168
222
|
self.add_feature('time_content_received', self.start_time + self.time) # Instant the response is fully received (and the processing of the content by the browser can start)
|
|
169
223
|
|
|
170
|
-
if hasattr(self, 'file_on_disk'):
|
|
171
|
-
# TODO: Do something better? hostname is the feature name used for the aggregated tree
|
|
172
|
-
# so we need that unless we want to change the JS
|
|
173
|
-
self.add_feature('hostname', str(Path(self.url_split.path).parent))
|
|
174
|
-
else:
|
|
175
|
-
if self.url_split.hostname:
|
|
176
|
-
self.add_feature('hostname', self.url_split.hostname)
|
|
177
|
-
else:
|
|
178
|
-
self.add_feature('hostname', self.name)
|
|
179
|
-
|
|
180
|
-
if not self.hostname:
|
|
181
|
-
self.logger.warning(f'Something is broken in that node: {har_entry}')
|
|
182
|
-
|
|
183
224
|
try:
|
|
184
225
|
ipaddress.ip_address(self.hostname)
|
|
185
226
|
self.add_feature('hostname_is_ip', True)
|
|
@@ -196,13 +237,6 @@ class URLNode(HarTreeNode):
|
|
|
196
237
|
except UnicodeError:
|
|
197
238
|
pass
|
|
198
239
|
|
|
199
|
-
if not hasattr(self, 'hostname_is_ip') and not hasattr(self, 'file_on_disk'):
|
|
200
|
-
tld = get_public_suffix_list().publicsuffix(self.hostname)
|
|
201
|
-
if tld:
|
|
202
|
-
self.add_feature('known_tld', tld)
|
|
203
|
-
else:
|
|
204
|
-
self.logger.debug(f'No TLD/domain broken {self.name}')
|
|
205
|
-
|
|
206
240
|
self.add_feature('request', har_entry['request'])
|
|
207
241
|
# Try to get a referer from the headers
|
|
208
242
|
for h in self.request['headers']:
|
|
@@ -249,128 +283,134 @@ class URLNode(HarTreeNode):
|
|
|
249
283
|
decoded_posted_data = self._dirty_safe_b64decode(self.request['postData']['text'])
|
|
250
284
|
except binascii.Error:
|
|
251
285
|
decoded_posted_data = self.request['postData']['text']
|
|
286
|
+
|
|
252
287
|
if 'mimeType' in self.request['postData']:
|
|
253
288
|
# make it easier to compare.
|
|
254
289
|
mimetype_lower = self.request['postData']['mimeType'].lower()
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
#
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
else:
|
|
291
|
-
self.logger.warning(f"Expected json, got garbage: {mimetype_lower} - {decoded_posted_data}")
|
|
292
|
-
elif mimetype_lower.startswith('application/x-json-stream'):
|
|
290
|
+
else:
|
|
291
|
+
if isinstance(decoded_posted_data, bytes):
|
|
292
|
+
# if b64 decode worked, we may have a useful type there.
|
|
293
|
+
mimetype_lower = guess_magic_type(decoded_posted_data)
|
|
294
|
+
else:
|
|
295
|
+
mimetype_lower = 'text/plain'
|
|
296
|
+
self.logger.warning(f'Missing mimetype in POST, guessed it: {mimetype_lower}')
|
|
297
|
+
|
|
298
|
+
if mimetype_lower.startswith('application/x-www-form-urlencoded'):
|
|
299
|
+
# NOTE: this should never happen as there should
|
|
300
|
+
# be something in self.request['postData']['params']
|
|
301
|
+
# and we already processed it before but just in case...
|
|
302
|
+
self.logger.debug('Got a application/x-www-form-urlencoded without params key')
|
|
303
|
+
# 100% sure there will be websites where decode will fail
|
|
304
|
+
try:
|
|
305
|
+
if isinstance(decoded_posted_data, bytes):
|
|
306
|
+
decoded_posted_data = decoded_posted_data.decode()
|
|
307
|
+
if isinstance(decoded_posted_data, str):
|
|
308
|
+
decoded_posted_data = unquote_plus(decoded_posted_data)
|
|
309
|
+
if isinstance(decoded_posted_data, str):
|
|
310
|
+
decoded_posted_data = parse_qs(decoded_posted_data)
|
|
311
|
+
self.add_feature('posted_data_info', "Successfully decoded POST request.")
|
|
312
|
+
except Exception as e:
|
|
313
|
+
self.logger.warning(f'Unable to unquote or parse form data "{decoded_posted_data!r}": {e}')
|
|
314
|
+
self.add_feature('posted_data_info', "Unable to decode POST request.")
|
|
315
|
+
elif (mimetype_lower.startswith('application/json')
|
|
316
|
+
or mimetype_lower.startswith('application/csp-report')
|
|
317
|
+
or mimetype_lower.startswith('application/x-amz-json-1.1')
|
|
318
|
+
or mimetype_lower.startswith('application/reports+json')
|
|
319
|
+
or mimetype_lower.startswith('application/vnd.adobe.dc+json')
|
|
320
|
+
or mimetype_lower.startswith('application/ion+json')
|
|
321
|
+
or mimetype_lower.endswith('json')
|
|
322
|
+
):
|
|
323
|
+
if isinstance(decoded_posted_data, (str, bytes)):
|
|
324
|
+
# at this stage, it will always be bytes or str
|
|
293
325
|
try:
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
to_stream = StringIO(decoded_posted_data)
|
|
297
|
-
elif isinstance(decoded_posted_data, bytes):
|
|
298
|
-
to_stream = BytesIO(decoded_posted_data)
|
|
299
|
-
else:
|
|
300
|
-
raise ValueError(f'Invalid type: {type(decoded_posted_data)}')
|
|
301
|
-
streamed_data = json_stream.load(to_stream)
|
|
302
|
-
decoded_posted_data = json_stream.to_standard_types(streamed_data)
|
|
326
|
+
# NOTE 2023-08-22: loads here may give us a int, float or a bool.
|
|
327
|
+
decoded_posted_data = json.loads(decoded_posted_data)
|
|
303
328
|
self.add_feature('posted_data_info', "Successfully decoded POST request.")
|
|
304
329
|
except Exception:
|
|
330
|
+
self.add_feature('posted_data_info', "Unable to decode POST request.")
|
|
305
331
|
if isinstance(decoded_posted_data, (str, bytes)):
|
|
306
|
-
self.logger.warning(f"Expected json
|
|
332
|
+
self.logger.warning(f"Expected json, got garbage: {mimetype_lower} - {decoded_posted_data[:20]!r}[...]")
|
|
307
333
|
else:
|
|
308
|
-
self.logger.warning(f"Expected json
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
334
|
+
self.logger.warning(f"Expected json, got garbage: {mimetype_lower} - {decoded_posted_data}")
|
|
335
|
+
elif mimetype_lower.startswith('application/x-json-stream'):
|
|
336
|
+
try:
|
|
337
|
+
to_stream: StringIO | BytesIO
|
|
312
338
|
if isinstance(decoded_posted_data, str):
|
|
313
|
-
|
|
314
|
-
multipart_to_decode = decoded_posted_data.encode()
|
|
339
|
+
to_stream = StringIO(decoded_posted_data)
|
|
315
340
|
elif isinstance(decoded_posted_data, bytes):
|
|
316
|
-
|
|
341
|
+
to_stream = BytesIO(decoded_posted_data)
|
|
317
342
|
else:
|
|
318
|
-
raise ValueError(f'Invalid type
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
elif
|
|
334
|
-
|
|
335
|
-
self.logger.debug(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
|
|
336
|
-
self.add_feature('posted_data_info', f"Decoding {mimetype_lower} is not supported yet.")
|
|
337
|
-
elif mimetype_lower.startswith('text') and isinstance(decoded_posted_data, (str, bytes)):
|
|
338
|
-
try:
|
|
339
|
-
# NOTE 2023-08-22: Quite a few text entries are in fact json, give it a shot.
|
|
340
|
-
# loads here may give us a int, float or a bool.
|
|
341
|
-
decoded_posted_data = json.loads(decoded_posted_data)
|
|
342
|
-
self.add_feature('posted_data_info', "Decoded JSON out of POST request.")
|
|
343
|
-
except Exception:
|
|
344
|
-
# keep it as it is otherwise.
|
|
345
|
-
pass
|
|
346
|
-
elif mimetype_lower.endswith('javascript'):
|
|
347
|
-
# keep it as it is
|
|
348
|
-
self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
|
|
349
|
-
self.add_feature('posted_data_info', f"Pretty rendering of {mimetype_lower} is not supported yet.")
|
|
350
|
-
elif mimetype_lower in ['?', '*/*']:
|
|
351
|
-
self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
|
|
352
|
-
self.add_feature('posted_data_info', f"Weird MimeType ({mimetype_lower}) is not supported yet.")
|
|
353
|
-
elif mimetype_lower == 'application/binary':
|
|
354
|
-
self.logger.warning(f'Got a POST {mimetype_lower}, not a broken gziped blob: {decoded_posted_data!r}')
|
|
355
|
-
self.add_feature('posted_data_info', f"MimeType ({mimetype_lower}) is not supported yet.")
|
|
356
|
-
elif mimetype_lower in ['application/octet-stream']:
|
|
357
|
-
# Should flag it, maybe?
|
|
358
|
-
self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
|
|
359
|
-
self.add_feature('posted_data_info', f"MimeType ({mimetype_lower}) is not supported yet.")
|
|
360
|
-
elif mimetype_lower in ['application/grpc-web+proto']:
|
|
361
|
-
# Can be decoded?
|
|
362
|
-
self.logger.warning(f'Got a POST {mimetype_lower} - can be decoded: {decoded_posted_data!r}')
|
|
363
|
-
self.add_feature('posted_data_info', f"MimeType ({mimetype_lower}) is not supported yet.")
|
|
364
|
-
elif mimetype_lower in ['application/unknown']:
|
|
365
|
-
# Weird but already seen stuff
|
|
366
|
-
self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
|
|
367
|
-
self.add_feature('posted_data_info', f"MimeType ({mimetype_lower}) is not supported yet.")
|
|
343
|
+
raise ValueError(f'Invalid type: {type(decoded_posted_data)}')
|
|
344
|
+
streamed_data = json_stream.load(to_stream)
|
|
345
|
+
decoded_posted_data = json_stream.to_standard_types(streamed_data)
|
|
346
|
+
self.add_feature('posted_data_info', "Successfully decoded POST request.")
|
|
347
|
+
except Exception:
|
|
348
|
+
if isinstance(decoded_posted_data, (str, bytes)):
|
|
349
|
+
self.logger.warning(f"Expected json stream, got garbage: {mimetype_lower} - {decoded_posted_data[:20]!r}[...]")
|
|
350
|
+
else:
|
|
351
|
+
self.logger.warning(f"Expected json stream, got garbage: {mimetype_lower} - {decoded_posted_data}")
|
|
352
|
+
self.add_feature('posted_data_info', "Unable to decode POST request.")
|
|
353
|
+
elif mimetype_lower.startswith('multipart'):
|
|
354
|
+
self.add_feature('posted_data_info', f"Decoding {mimetype_lower} is partially supported.")
|
|
355
|
+
if isinstance(decoded_posted_data, str):
|
|
356
|
+
# must be encoded for decoding
|
|
357
|
+
multipart_to_decode = decoded_posted_data.encode()
|
|
358
|
+
elif isinstance(decoded_posted_data, bytes):
|
|
359
|
+
multipart_to_decode = decoded_posted_data
|
|
368
360
|
else:
|
|
369
|
-
|
|
370
|
-
|
|
361
|
+
raise ValueError(f'Invalid type for multipart POST: {type(decoded_posted_data)}')
|
|
362
|
+
if b"\r\n" not in multipart_to_decode:
|
|
363
|
+
# the decoder wants that
|
|
364
|
+
multipart_to_decode = multipart_to_decode.replace(b"\n", b"\r\n")
|
|
365
|
+
try:
|
|
366
|
+
multipart_data = decoder.MultipartDecoder(multipart_to_decode, mimetype_lower)
|
|
367
|
+
decoded_posted_data = []
|
|
368
|
+
for part in multipart_data.parts:
|
|
369
|
+
headers = {k.decode(): v.decode() for k, v in part.headers.items()}
|
|
370
|
+
content = part.text
|
|
371
|
+
decoded_posted_data.append({'headers': headers, 'content': content})
|
|
372
|
+
except Exception as e:
|
|
373
|
+
self.logger.warning(f'Unable to decode multipart POST: {e}')
|
|
374
|
+
self.add_feature('posted_data_info', "Unable to decode multipart in POST request.")
|
|
375
|
+
|
|
376
|
+
elif mimetype_lower.startswith('application/x-protobuf'):
|
|
377
|
+
# FIXME If possible, decode?
|
|
378
|
+
self.logger.debug(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
|
|
379
|
+
self.add_feature('posted_data_info', f"Decoding {mimetype_lower} is not supported yet.")
|
|
380
|
+
elif mimetype_lower.startswith('text') and isinstance(decoded_posted_data, (str, bytes)):
|
|
381
|
+
try:
|
|
382
|
+
# NOTE 2023-08-22: Quite a few text entries are in fact json, give it a shot.
|
|
383
|
+
# loads here may give us a int, float or a bool.
|
|
384
|
+
decoded_posted_data = json.loads(decoded_posted_data)
|
|
385
|
+
self.add_feature('posted_data_info', "Decoded JSON out of POST request.")
|
|
386
|
+
except Exception:
|
|
387
|
+
# keep it as it is otherwise.
|
|
388
|
+
pass
|
|
389
|
+
elif mimetype_lower.endswith('javascript'):
|
|
390
|
+
# keep it as it is
|
|
391
|
+
self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
|
|
392
|
+
self.add_feature('posted_data_info', f"Pretty rendering of {mimetype_lower} is not supported yet.")
|
|
393
|
+
elif mimetype_lower in ['?', '*/*']:
|
|
394
|
+
self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
|
|
395
|
+
self.add_feature('posted_data_info', f"Weird MimeType ({mimetype_lower}) is not supported yet.")
|
|
396
|
+
elif mimetype_lower == 'application/binary':
|
|
397
|
+
self.logger.warning(f'Got a POST {mimetype_lower}, not a broken gziped blob: {decoded_posted_data!r}')
|
|
398
|
+
self.add_feature('posted_data_info', f"MimeType ({mimetype_lower}) is not supported yet.")
|
|
399
|
+
elif mimetype_lower in ['application/octet-stream']:
|
|
400
|
+
# Should flag it, maybe?
|
|
401
|
+
self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
|
|
402
|
+
self.add_feature('posted_data_info', f"MimeType ({mimetype_lower}) is not supported yet.")
|
|
403
|
+
elif mimetype_lower in ['application/grpc-web+proto']:
|
|
404
|
+
# Can be decoded?
|
|
405
|
+
self.logger.warning(f'Got a POST {mimetype_lower} - can be decoded: {decoded_posted_data!r}')
|
|
406
|
+
self.add_feature('posted_data_info', f"MimeType ({mimetype_lower}) is not supported yet.")
|
|
407
|
+
elif mimetype_lower in ['application/unknown']:
|
|
408
|
+
# Weird but already seen stuff
|
|
409
|
+
self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
|
|
410
|
+
self.add_feature('posted_data_info', f"MimeType ({mimetype_lower}) is not supported yet.")
|
|
371
411
|
else:
|
|
372
|
-
self.logger.warning(f'
|
|
373
|
-
self.add_feature('posted_data_info', "
|
|
412
|
+
self.logger.warning(f'Unexpected mime type: {mimetype_lower} - {decoded_posted_data!r}')
|
|
413
|
+
self.add_feature('posted_data_info', f"Unexpected MimeType ({mimetype_lower}) is not supported yet.")
|
|
374
414
|
|
|
375
415
|
# NOTE 2023-08-22: Blind attempt to process the data as json
|
|
376
416
|
if decoded_posted_data and isinstance(decoded_posted_data, (str, bytes)):
|
|
@@ -467,8 +507,8 @@ class URLNode(HarTreeNode):
|
|
|
467
507
|
|
|
468
508
|
if not hasattr(self, 'mimetype'):
|
|
469
509
|
# try to guess something better
|
|
470
|
-
if
|
|
471
|
-
self.add_feature('mimetype',
|
|
510
|
+
if mime := guess_magic_type(self.body.getvalue()):
|
|
511
|
+
self.add_feature('mimetype', mime)
|
|
472
512
|
|
|
473
513
|
if not hasattr(self, 'mimetype'):
|
|
474
514
|
self.add_feature('mimetype', '')
|
|
@@ -477,12 +517,6 @@ class URLNode(HarTreeNode):
|
|
|
477
517
|
self.add_feature('external_ressources', external_ressources)
|
|
478
518
|
self.add_feature('embedded_ressources', embedded_ressources)
|
|
479
519
|
|
|
480
|
-
filename = Path(self.url_split.path).name
|
|
481
|
-
if filename:
|
|
482
|
-
self.add_feature('filename', filename)
|
|
483
|
-
else:
|
|
484
|
-
self.add_feature('filename', 'file.bin')
|
|
485
|
-
|
|
486
520
|
# Common JS redirect we can catch easily
|
|
487
521
|
# NOTE: it is extremely fragile and doesn't work very often but is kinda better than nothing.
|
|
488
522
|
# NOTE 2025-08-30: Also, finding that doesn't mean it is in a part of the code that is executed without user interaction. It can be triggered after a user fills a form for example.
|
|
@@ -622,9 +656,9 @@ class URLNode(HarTreeNode):
|
|
|
622
656
|
|
|
623
657
|
class HostNode(HarTreeNode):
|
|
624
658
|
|
|
625
|
-
def __init__(self, capture_uuid: str
|
|
659
|
+
def __init__(self, capture_uuid: str):
|
|
626
660
|
"""Node of the Hostname Tree"""
|
|
627
|
-
super().__init__(capture_uuid=capture_uuid
|
|
661
|
+
super().__init__(capture_uuid=capture_uuid)
|
|
628
662
|
# Do not add the URLs in the json dump
|
|
629
663
|
self.features_to_skip.add('urls')
|
|
630
664
|
|
|
@@ -672,6 +706,48 @@ class HostNode(HarTreeNode):
|
|
|
672
706
|
"""Number of unique 3rd party cookies received in the responses of all the URL nodes"""
|
|
673
707
|
return sum(third for _, _, third in self.cookies_received if third)
|
|
674
708
|
|
|
709
|
+
@cached_property
|
|
710
|
+
def domain(self) -> str | None:
|
|
711
|
+
if hasattr(self, 'hostname_is_ip') or hasattr(self, 'file_on_disk'):
|
|
712
|
+
return None
|
|
713
|
+
try:
|
|
714
|
+
faup_host = Host(self.name)
|
|
715
|
+
if not faup_host.is_hostname():
|
|
716
|
+
return None
|
|
717
|
+
faup_hostname = faup_host.try_into_hostname()
|
|
718
|
+
if faup_hostname.domain:
|
|
719
|
+
return str(faup_hostname.domain)
|
|
720
|
+
|
|
721
|
+
self.logger.warning(f'No domain: "{self.name}"')
|
|
722
|
+
return None
|
|
723
|
+
except ValueError as e:
|
|
724
|
+
self.logger.warning(f'Not a Host "{self.name}": {e}')
|
|
725
|
+
return None
|
|
726
|
+
except Exception as e:
|
|
727
|
+
self.logger.warning(f'Unable to parse Hostname "{self.name}": {e}')
|
|
728
|
+
return None
|
|
729
|
+
|
|
730
|
+
@cached_property
|
|
731
|
+
def tld(self) -> str | None:
|
|
732
|
+
if hasattr(self, 'hostname_is_ip') or hasattr(self, 'file_on_disk'):
|
|
733
|
+
return None
|
|
734
|
+
try:
|
|
735
|
+
faup_host = Host(self.name)
|
|
736
|
+
if not faup_host.is_hostname():
|
|
737
|
+
return None
|
|
738
|
+
faup_hostname = faup_host.try_into_hostname()
|
|
739
|
+
if faup_hostname.suffix:
|
|
740
|
+
return str(faup_hostname.suffix)
|
|
741
|
+
|
|
742
|
+
self.logger.warning(f'No domain: "{self.name}"')
|
|
743
|
+
return None
|
|
744
|
+
except ValueError as e:
|
|
745
|
+
self.logger.warning(f'Not a Host "{self.name}": {e}')
|
|
746
|
+
return None
|
|
747
|
+
except Exception as e:
|
|
748
|
+
self.logger.warning(f'Unable to parse Hostname "{self.name}": {e}')
|
|
749
|
+
return None
|
|
750
|
+
|
|
675
751
|
def add_url(self, url: URLNode) -> None:
|
|
676
752
|
"""Add a URL node to the Host node, initialize/update the features"""
|
|
677
753
|
if not self.name:
|
|
@@ -682,6 +758,9 @@ class HostNode(HarTreeNode):
|
|
|
682
758
|
if hasattr(url, 'hostname_is_ip') and url.hostname_is_ip:
|
|
683
759
|
self.add_feature('hostname_is_ip', True)
|
|
684
760
|
|
|
761
|
+
if hasattr(url, 'file_on_disk') and url.file_on_disk:
|
|
762
|
+
self.add_feature('file_on_disk', True)
|
|
763
|
+
|
|
685
764
|
self.urls.append(url)
|
|
686
765
|
|
|
687
766
|
# Add to URLNode a reference to the HostNode UUID
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "har2tree"
|
|
3
|
-
version = "1.36.2"
|
|
3
|
+
version = "1.36.4"
|
|
4
4
|
description = "HTTP Archive (HAR) to ETE Toolkit generator"
|
|
5
5
|
authors = [
|
|
6
6
|
{name="Raphaël Vinot", email="raphael.vinot@circl.lu"}
|
|
@@ -14,18 +14,18 @@ dynamic = [ "classifiers" ]
|
|
|
14
14
|
dependencies = [
|
|
15
15
|
"ete3 (>=3.1.3)",
|
|
16
16
|
"beautifulsoup4[charset-normalizer,lxml] (>=4.14.3)",
|
|
17
|
-
"publicsuffixlist (>=1.0.2.20260126)",
|
|
18
|
-
"filetype (>=1.2.0)",
|
|
19
17
|
# poetry up fails with the version of numpy forced for python < 3.11.
|
|
20
18
|
# The work around is to comment it, run poetry up, uncomment it. and run poetry update.
|
|
21
19
|
"numpy (>=2.2,<2.3) ; python_version < '3.11'",
|
|
22
|
-
"numpy (>=2.4.
|
|
23
|
-
"w3lib (>=2.
|
|
20
|
+
"numpy (>=2.4.2) ; python_version >= \"3.11\" and python_version < \"4.0\"",
|
|
21
|
+
"w3lib (>=2.4.0)",
|
|
24
22
|
"tinycss2 (>=1.5.1)",
|
|
25
23
|
"legacy-cgi (>=2.6.4) ; python_version >= \"3.13\" and python_version < \"4.0\"",
|
|
26
24
|
"multipart (>=1.3.0,<2.0.0)",
|
|
27
25
|
"json-stream (>=2.3.3,<3.0.0)",
|
|
28
|
-
"requests-toolbelt (>=1.0.0,<2.0.0)"
|
|
26
|
+
"requests-toolbelt (>=1.0.0,<2.0.0)",
|
|
27
|
+
"pyfaup-rs (>=0.3,<0.4.0)",
|
|
28
|
+
"pure-magic-rs (>=0.3.1,<0.4.0)"
|
|
29
29
|
]
|
|
30
30
|
|
|
31
31
|
[project.urls]
|
|
@@ -49,7 +49,7 @@ docs = ["Sphinx (>=9.1.0) ; python_version >= \"3.12\"", "six (>=1.17.0)"]
|
|
|
49
49
|
[tool.poetry.group.dev.dependencies]
|
|
50
50
|
mypy = "^1.19.1"
|
|
51
51
|
pytest-cov = "^7.0.0"
|
|
52
|
-
coverage = "^7.13.
|
|
52
|
+
coverage = "^7.13.3"
|
|
53
53
|
types-beautifulsoup4 = "^4.12.0.20250516"
|
|
54
54
|
|
|
55
55
|
[build-system]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|