har2tree 1.36.1__tar.gz → 1.36.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: har2tree
3
- Version: 1.36.1
3
+ Version: 1.36.3
4
4
  Summary: HTTP Archive (HAR) to ETE Toolkit generator
5
5
  License-Expression: BSD-3-Clause
6
6
  License-File: LICENSE
@@ -20,20 +20,20 @@ Classifier: Programming Language :: Python :: 3.14
20
20
  Classifier: Topic :: Internet
21
21
  Classifier: Topic :: Security
22
22
  Provides-Extra: docs
23
- Requires-Dist: Sphinx (>=9.0.4) ; (python_version >= "3.11") and (extra == "docs")
23
+ Requires-Dist: Sphinx (>=9.1.0) ; (python_version >= "3.12") and (extra == "docs")
24
24
  Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.14.3)
25
25
  Requires-Dist: ete3 (>=3.1.3)
26
- Requires-Dist: filetype (>=1.2.0)
27
26
  Requires-Dist: json-stream (>=2.3.3,<3.0.0)
28
27
  Requires-Dist: legacy-cgi (>=2.6.4) ; python_version >= "3.13" and python_version < "4.0"
29
28
  Requires-Dist: multipart (>=1.3.0,<2.0.0)
30
29
  Requires-Dist: numpy (>=2.2,<2.3) ; python_version < "3.11"
31
- Requires-Dist: numpy (>=2.3.5) ; python_version >= "3.11" and python_version < "4.0"
32
- Requires-Dist: publicsuffixlist (>=1.0.2.20251217)
30
+ Requires-Dist: numpy (>=2.4.2) ; python_version >= "3.11" and python_version < "4.0"
31
+ Requires-Dist: pure-magic-rs (>=0.3.1,<0.4.0)
32
+ Requires-Dist: pyfaup-rs (>=0.2.1,<0.3.0)
33
33
  Requires-Dist: requests-toolbelt (>=1.0.0,<2.0.0)
34
34
  Requires-Dist: six (>=1.17.0) ; extra == "docs"
35
35
  Requires-Dist: tinycss2 (>=1.5.1)
36
- Requires-Dist: w3lib (>=2.3.1)
36
+ Requires-Dist: w3lib (>=2.4.0)
37
37
  Project-URL: Documentation, https://har2tree.readthedocs.io/en/latest/
38
38
  Project-URL: Repository, https://github.com/Lookyloo/har2tree
39
39
  Project-URL: issues, https://github.com/Lookyloo/har2tree/issues
@@ -600,7 +600,7 @@ class Har2Tree:
600
600
  if i in ignore:
601
601
  continue
602
602
 
603
- n = URLNode(capture_uuid=self.har.capture_uuid, name=unquote_plus(url_entry['request']['url']))
603
+ n = URLNode(capture_uuid=self.har.capture_uuid)
604
604
  n.load_har_entry(url_entry, list(self.all_url_requests.keys()))
605
605
  if hasattr(n, 'redirect_url'):
606
606
  self.all_redirects.append(n.redirect_url)
@@ -798,7 +798,7 @@ class Har2Tree:
798
798
  # AND we already have a node in the tree with this pageref
799
799
  # => attach to that node.
800
800
  if ('pages' in self.har.har['log'] and len(self.har.har['log']['pages']) > 1
801
- and node.pageref != self.har.har['log']['pages'][0]
801
+ and hasattr(node, 'pageref') and node.pageref != self.har.har['log']['pages'][0]
802
802
  and self.pages_root[node.pageref] != node.uuid):
803
803
  # In that case, we check if there is already a page with the pageref of the orphan node,
804
804
  # and attach the node to that.
@@ -824,7 +824,7 @@ class Har2Tree:
824
824
  # No luck, the node is root for this pageref, let's attach it to the prior page in the list, or the very first node (tree root)
825
825
  page_before = self.har.har['log']['pages'][0]
826
826
  for page in self.har.har['log']['pages'][1:]:
827
- if page['id'] == node.pageref:
827
+ if hasattr(node, 'pageref') and page['id'] == node.pageref:
828
828
  break
829
829
  # Sometimes, the page listed in the list of pages is not related to
830
830
  # any of the entries. Go figure what happened.
@@ -13,13 +13,12 @@ from base64 import b64decode
13
13
  from collections import defaultdict
14
14
  from io import BytesIO
15
15
  from logging import LoggerAdapter
16
+ from pure_magic_rs import MagicDb
16
17
  from typing import Any
17
18
  from collections.abc import Iterable
18
19
  from collections.abc import Mapping, MutableMapping
19
20
  from urllib.parse import urlparse, unquote_plus, unquote_to_bytes, urljoin
20
21
 
21
- import filetype # type: ignore
22
-
23
22
  from bs4 import BeautifulSoup, Tag, MarkupResemblesLocatorWarning
24
23
  from charset_normalizer import from_bytes
25
24
  import tinycss2 # type: ignore[import-untyped]
@@ -233,11 +232,7 @@ def _unpack_data_uri(data: str) -> tuple[str, str, BytesIO] | None:
233
232
  mime, mimeparams, unpacked_data = parsed_uri
234
233
  if '/' not in mime:
235
234
  # Turns out, it happens. The mimetype can be null for example.
236
- kind = filetype.guess(unpacked_data)
237
- if kind:
238
- mime = kind.mime
239
- else:
240
- mime = ''
235
+ mime = guess_magic_type(unpacked_data)
241
236
 
242
237
  blob = BytesIO(unpacked_data)
243
238
  b_hash = hashlib.sha512(blob.getvalue()).hexdigest()
@@ -268,7 +263,7 @@ def find_identifiers(soup: BeautifulSoup) -> dict[str, list[str]] | None:
268
263
  # This is beta and kinda fragile, but it's going to find (most) of the google tag IDs
269
264
  # https://support.google.com/google-ads/answer/12326985?hl=en_us_us
270
265
  # NOTE: the doc says 9 X, but all the examples I found have 10 X so we cannot trust it
271
- if google_tag_ids := set(re.findall(r"(?:G-|AW-|GA-|UA-)\w{9,13}", str(soup))):
266
+ if google_tag_ids := set(re.findall(r"(?:G-|AW-|GA-|UA-|GTM-)\w{9,15}", str(soup))):
272
267
  blocklist = {'UA-Compatible'}
273
268
  google_tag_ids -= blocklist
274
269
  if google_tag_ids:
@@ -450,11 +445,7 @@ def find_external_ressources(mimetype: str, data: bytes, base_url: str, all_requ
450
445
  # Just in case, there is sometimes an unescape call in JS code
451
446
  for to_unescape in re.findall(r'unescape\(\'(.*)\'\)', string_soup):
452
447
  unescaped = unquote_to_bytes(to_unescape)
453
- kind = filetype.guess(unescaped)
454
- if kind:
455
- mimetype = kind.mime
456
- else:
457
- mimetype = ''
448
+ mimetype = guess_magic_type(unescaped)
458
449
  blob = BytesIO(unescaped)
459
450
  b_hash = hashlib.sha512(blob.getvalue()).hexdigest()
460
451
  embedded_ressources[mimetype].append((b_hash, blob))
@@ -473,3 +464,14 @@ class Har2TreeLogAdapter(LoggerAdapter): # type: ignore[type-arg]
473
464
  """
474
465
  def process(self, msg: str, kwargs: MutableMapping[str, Any]) -> tuple[str, MutableMapping[str, Any]]:
475
466
  return '[{}] {}'.format(self.extra['uuid'], msg), kwargs # type: ignore[index]
467
+
468
+
469
+ magic_db = None
470
+
471
+
472
+ def guess_magic_type(data: bytes) -> str:
473
+ global magic_db
474
+ if magic_db is None:
475
+ magic_db = MagicDb()
476
+ m = magic_db.best_magic_buffer(data)
477
+ return m.mime_type
@@ -13,40 +13,32 @@ import re
13
13
 
14
14
  from base64 import b64decode
15
15
  from datetime import datetime, timedelta
16
- from functools import lru_cache, cached_property
16
+ from functools import cached_property
17
17
  from hashlib import sha256
18
18
  from io import BytesIO, StringIO
19
- from pathlib import Path
19
+ from pathlib import PurePath
20
20
  from typing import Any
21
21
  from collections.abc import MutableMapping
22
22
  from urllib.parse import unquote_plus, urlparse, urljoin, parse_qs
23
23
 
24
- import filetype # type: ignore
25
24
  import json_stream # type: ignore
26
25
 
27
26
  from bs4 import BeautifulSoup
28
27
  from ete3 import TreeNode # type: ignore
29
- from publicsuffixlist import PublicSuffixList # type: ignore
28
+ from pyfaup import Url, Hostname
30
29
  from requests_toolbelt.multipart import decoder # type: ignore
31
30
  from w3lib.html import strip_html5_whitespace
32
31
  from w3lib.url import canonicalize_url, safe_url_string
33
32
 
34
- from .helper import find_external_ressources, rebuild_url, find_identifiers, make_soup
33
+ from .helper import find_external_ressources, rebuild_url, find_identifiers, make_soup, guess_magic_type
35
34
  from .helper import Har2TreeError, Har2TreeLogAdapter, make_hhhash, HHHashError, HHHashNote
36
35
 
37
36
 
38
- @lru_cache(64)
39
- def get_public_suffix_list() -> PublicSuffixList:
40
- # Initialize Public Suffix List
41
- # TODO (?): fetch the list
42
- return PublicSuffixList()
43
-
44
-
45
37
  class HarTreeNode(TreeNode): # type: ignore[misc]
46
38
 
47
- def __init__(self, capture_uuid: str, **kwargs: Any):
39
+ def __init__(self, capture_uuid: str):
48
40
  """Node dumpable in json to display with d3js"""
49
- super().__init__(**kwargs)
41
+ super().__init__()
50
42
  logger = logging.getLogger(f'{__name__}.{self.__class__.__name__}')
51
43
  self.logger = Har2TreeLogAdapter(logger, {'uuid': capture_uuid})
52
44
  self.add_feature('uuid', str(uuid.uuid4()))
@@ -75,12 +67,11 @@ class URLNode(HarTreeNode):
75
67
 
76
68
  start_time: datetime
77
69
 
78
- def __init__(self, capture_uuid: str, **kwargs: Any):
70
+ def __init__(self, capture_uuid: str):
79
71
  """Node of the URL Tree"""
80
- super().__init__(capture_uuid=capture_uuid, **kwargs)
72
+ super().__init__(capture_uuid=capture_uuid)
81
73
  # Do not add the body in the json dump
82
74
  self.features_to_skip.add('body')
83
- self.features_to_skip.add('url_split')
84
75
  self.features_to_skip.add('start_time')
85
76
  self.features_to_skip.add('time')
86
77
  self.features_to_skip.add('time_content_received')
@@ -134,23 +125,86 @@ class URLNode(HarTreeNode):
134
125
  return b64decode(_to_decode, altchars=b'-_', validate=True)
135
126
  return b64decode(_to_decode, validate=True)
136
127
 
128
+ @cached_property
129
+ def tld(self) -> str | None:
130
+ if not hasattr(self, 'original_url'):
131
+ return None
132
+ try:
133
+ faup_url = Url(self.original_url)
134
+ if faup_url.suffix:
135
+ return str(faup_url.suffix)
136
+
137
+ self.logger.warning(f'No TLD: "{self.name}"')
138
+ return None
139
+ except Exception as e:
140
+ self.logger.warning(f'Unable to parse URI "{self.name}": {e}')
141
+ return None
142
+
143
+ @cached_property
144
+ def known_tld(self) -> str | None:
145
+ # An alias, to avoid breaking things.
146
+ return self.tld
147
+
148
+ @cached_property
149
+ def domain(self) -> str | None:
150
+ if not hasattr(self, 'original_url'):
151
+ return None
152
+ try:
153
+ faup_url = Url(self.original_url)
154
+ if faup_url.domain:
155
+ return str(faup_url.domain)
156
+
157
+ self.logger.warning(f'No domain: "{self.name}"')
158
+ return None
159
+ except Exception as e:
160
+ self.logger.warning(f'Unable to parse URI "{self.name}": {e}')
161
+ return None
162
+
137
163
  def load_har_entry(self, har_entry: MutableMapping[str, Any], all_requests: list[str]) -> None:
138
164
  """Load one entry of the HAR file, initialize most of the features of the node"""
139
- if not self.name:
140
- # We're in the actual root node
141
- # NOTE: by the HAR specs: "Absolute URL of the request (fragments are not included)."
142
- self.add_feature('name', unquote_plus(har_entry['request']['url']))
165
+
166
+ # NOTE: by the HAR specs: "Absolute URL of the request (fragments are not included)."
167
+ self.add_feature('name', unquote_plus(har_entry['request']['url']))
168
+
169
+ # 2026-30-01: Keep original URL so it can be parsed by faup
170
+ if (har_entry['request']['url'].startswith('blob:')
171
+ and har_entry['request']['url'][5:].startswith('http')):
172
+ self.add_feature('original_url', har_entry['request']['url'][5:])
173
+
174
+ elif har_entry['request']['url'].startswith('http'):
175
+ self.add_feature('original_url', har_entry['request']['url'])
143
176
 
144
177
  splitted_url = urlparse(self.name)
145
- if splitted_url.scheme == 'blob':
146
- # this is a new weird feature, but it seems to be usable as a URL, so let's do that
147
- self.add_feature('url_split', urlparse(splitted_url.path))
148
- elif splitted_url.scheme == 'file':
178
+ if splitted_url.scheme == 'file':
149
179
  # file on disk, we do not have a proper URL
150
180
  self.add_feature('file_on_disk', True)
151
- self.add_feature('url_split', urlparse(splitted_url.path))
181
+ # TODO: Do something better? hostname is the feature name used for the aggregated tree
182
+ # so we need that unless we want to change the JS
183
+ path = PurePath(splitted_url.path)
184
+ self.add_feature('hostname', str(path.parent))
185
+ if path.name:
186
+ self.add_feature('filename', path.name)
187
+ else:
188
+ self.add_feature('filename', 'file.bin')
152
189
  else:
153
- self.add_feature('url_split', splitted_url)
190
+ # We have a URL
191
+ if splitted_url.scheme == 'blob':
192
+ # this is a new weird feature, but it seems to be usable as a URL, so let's do that
193
+ splitted_url = urlparse(splitted_url.path)
194
+
195
+ if splitted_url.hostname:
196
+ self.add_feature('hostname', splitted_url.hostname)
197
+ else:
198
+ self.logger.warning(f'Weird URI with no hostname (?): "{self.name}"')
199
+ self.add_feature('hostname', self.name)
200
+
201
+ if filename := PurePath(splitted_url.path).name:
202
+ self.add_feature('filename', filename)
203
+ else:
204
+ self.add_feature('filename', 'file.bin')
205
+
206
+ if not self.hostname:
207
+ self.logger.warning(f'Missing hostname, something is broken in that node: {har_entry}')
154
208
 
155
209
  # If the URL contains a fragment (i.e. something after a #), it is stripped in the referer.
156
210
  # So we need an alternative URL to do a lookup against
@@ -167,19 +221,6 @@ class URLNode(HarTreeNode):
167
221
  self.add_feature('time', timedelta(milliseconds=har_entry['time']))
168
222
  self.add_feature('time_content_received', self.start_time + self.time) # Instant the response is fully received (and the processing of the content by the browser can start)
169
223
 
170
- if hasattr(self, 'file_on_disk'):
171
- # TODO: Do something better? hostname is the feature name used for the aggregated tree
172
- # so we need that unless we want to change the JS
173
- self.add_feature('hostname', str(Path(self.url_split.path).parent))
174
- else:
175
- if self.url_split.hostname:
176
- self.add_feature('hostname', self.url_split.hostname)
177
- else:
178
- self.add_feature('hostname', self.name)
179
-
180
- if not self.hostname:
181
- self.logger.warning(f'Something is broken in that node: {har_entry}')
182
-
183
224
  try:
184
225
  ipaddress.ip_address(self.hostname)
185
226
  self.add_feature('hostname_is_ip', True)
@@ -196,13 +237,6 @@ class URLNode(HarTreeNode):
196
237
  except UnicodeError:
197
238
  pass
198
239
 
199
- if not hasattr(self, 'hostname_is_ip') and not hasattr(self, 'file_on_disk'):
200
- tld = get_public_suffix_list().publicsuffix(self.hostname)
201
- if tld:
202
- self.add_feature('known_tld', tld)
203
- else:
204
- self.logger.debug(f'No TLD/domain broken {self.name}')
205
-
206
240
  self.add_feature('request', har_entry['request'])
207
241
  # Try to get a referer from the headers
208
242
  for h in self.request['headers']:
@@ -249,128 +283,134 @@ class URLNode(HarTreeNode):
249
283
  decoded_posted_data = self._dirty_safe_b64decode(self.request['postData']['text'])
250
284
  except binascii.Error:
251
285
  decoded_posted_data = self.request['postData']['text']
286
+
252
287
  if 'mimeType' in self.request['postData']:
253
288
  # make it easier to compare.
254
289
  mimetype_lower = self.request['postData']['mimeType'].lower()
255
- if mimetype_lower.startswith('application/x-www-form-urlencoded'):
256
- # NOTE: this should never happen as there should
257
- # be something in self.request['postData']['params']
258
- # and we already processed it before but just in case...
259
- self.logger.debug('Got a application/x-www-form-urlencoded without params key')
260
- # 100% sure there will be websites where decode will fail
261
- try:
262
- if isinstance(decoded_posted_data, bytes):
263
- decoded_posted_data = decoded_posted_data.decode()
264
- if isinstance(decoded_posted_data, str):
265
- decoded_posted_data = unquote_plus(decoded_posted_data)
266
- if isinstance(decoded_posted_data, str):
267
- decoded_posted_data = parse_qs(decoded_posted_data)
268
- self.add_feature('posted_data_info', "Successfully decoded POST request.")
269
- except Exception as e:
270
- self.logger.warning(f'Unable to unquote or parse form data "{decoded_posted_data!r}": {e}')
271
- self.add_feature('posted_data_info', "Unable to decode POST request.")
272
- elif (mimetype_lower.startswith('application/json')
273
- or mimetype_lower.startswith('application/csp-report')
274
- or mimetype_lower.startswith('application/x-amz-json-1.1')
275
- or mimetype_lower.startswith('application/reports+json')
276
- or mimetype_lower.startswith('application/vnd.adobe.dc+json')
277
- or mimetype_lower.startswith('application/ion+json')
278
- or mimetype_lower.endswith('json')
279
- ):
280
- if isinstance(decoded_posted_data, (str, bytes)):
281
- # at this stage, it will always be bytes or str
282
- try:
283
- # NOTE 2023-08-22: loads here may give us a int, float or a bool.
284
- decoded_posted_data = json.loads(decoded_posted_data)
285
- self.add_feature('posted_data_info', "Successfully decoded POST request.")
286
- except Exception:
287
- self.add_feature('posted_data_info', "Unable to decode POST request.")
288
- if isinstance(decoded_posted_data, (str, bytes)):
289
- self.logger.warning(f"Expected json, got garbage: {mimetype_lower} - {decoded_posted_data[:20]!r}[...]")
290
- else:
291
- self.logger.warning(f"Expected json, got garbage: {mimetype_lower} - {decoded_posted_data}")
292
- elif mimetype_lower.startswith('application/x-json-stream'):
290
+ else:
291
+ if isinstance(decoded_posted_data, bytes):
292
+ # if b64 decode worked, we may have a useful type there.
293
+ mimetype_lower = guess_magic_type(decoded_posted_data)
294
+ else:
295
+ mimetype_lower = 'text/plain'
296
+ self.logger.warning(f'Missing mimetype in POST, guessed it: {mimetype_lower}')
297
+
298
+ if mimetype_lower.startswith('application/x-www-form-urlencoded'):
299
+ # NOTE: this should never happen as there should
300
+ # be something in self.request['postData']['params']
301
+ # and we already processed it before but just in case...
302
+ self.logger.debug('Got a application/x-www-form-urlencoded without params key')
303
+ # 100% sure there will be websites where decode will fail
304
+ try:
305
+ if isinstance(decoded_posted_data, bytes):
306
+ decoded_posted_data = decoded_posted_data.decode()
307
+ if isinstance(decoded_posted_data, str):
308
+ decoded_posted_data = unquote_plus(decoded_posted_data)
309
+ if isinstance(decoded_posted_data, str):
310
+ decoded_posted_data = parse_qs(decoded_posted_data)
311
+ self.add_feature('posted_data_info', "Successfully decoded POST request.")
312
+ except Exception as e:
313
+ self.logger.warning(f'Unable to unquote or parse form data "{decoded_posted_data!r}": {e}')
314
+ self.add_feature('posted_data_info', "Unable to decode POST request.")
315
+ elif (mimetype_lower.startswith('application/json')
316
+ or mimetype_lower.startswith('application/csp-report')
317
+ or mimetype_lower.startswith('application/x-amz-json-1.1')
318
+ or mimetype_lower.startswith('application/reports+json')
319
+ or mimetype_lower.startswith('application/vnd.adobe.dc+json')
320
+ or mimetype_lower.startswith('application/ion+json')
321
+ or mimetype_lower.endswith('json')
322
+ ):
323
+ if isinstance(decoded_posted_data, (str, bytes)):
324
+ # at this stage, it will always be bytes or str
293
325
  try:
294
- to_stream: StringIO | BytesIO
295
- if isinstance(decoded_posted_data, str):
296
- to_stream = StringIO(decoded_posted_data)
297
- elif isinstance(decoded_posted_data, bytes):
298
- to_stream = BytesIO(decoded_posted_data)
299
- else:
300
- raise ValueError(f'Invalid type: {type(decoded_posted_data)}')
301
- streamed_data = json_stream.load(to_stream)
302
- decoded_posted_data = json_stream.to_standard_types(streamed_data)
326
+ # NOTE 2023-08-22: loads here may give us a int, float or a bool.
327
+ decoded_posted_data = json.loads(decoded_posted_data)
303
328
  self.add_feature('posted_data_info', "Successfully decoded POST request.")
304
329
  except Exception:
330
+ self.add_feature('posted_data_info', "Unable to decode POST request.")
305
331
  if isinstance(decoded_posted_data, (str, bytes)):
306
- self.logger.warning(f"Expected json stream, got garbage: {mimetype_lower} - {decoded_posted_data[:20]!r}[...]")
332
+ self.logger.warning(f"Expected json, got garbage: {mimetype_lower} - {decoded_posted_data[:20]!r}[...]")
307
333
  else:
308
- self.logger.warning(f"Expected json stream, got garbage: {mimetype_lower} - {decoded_posted_data}")
309
- self.add_feature('posted_data_info', "Unable to decode POST request.")
310
- elif mimetype_lower.startswith('multipart'):
311
- self.add_feature('posted_data_info', f"Decoding {mimetype_lower} is partially supported.")
334
+ self.logger.warning(f"Expected json, got garbage: {mimetype_lower} - {decoded_posted_data}")
335
+ elif mimetype_lower.startswith('application/x-json-stream'):
336
+ try:
337
+ to_stream: StringIO | BytesIO
312
338
  if isinstance(decoded_posted_data, str):
313
- # must be encoded for decoding
314
- multipart_to_decode = decoded_posted_data.encode()
339
+ to_stream = StringIO(decoded_posted_data)
315
340
  elif isinstance(decoded_posted_data, bytes):
316
- multipart_to_decode = decoded_posted_data
341
+ to_stream = BytesIO(decoded_posted_data)
317
342
  else:
318
- raise ValueError(f'Invalid type for multipart POST: {type(decoded_posted_data)}')
319
- if b"\r\n" not in multipart_to_decode:
320
- # the decoder wants that
321
- multipart_to_decode = multipart_to_decode.replace(b"\n", b"\r\n")
322
- try:
323
- multipart_data = decoder.MultipartDecoder(multipart_to_decode, mimetype_lower)
324
- decoded_posted_data = []
325
- for part in multipart_data.parts:
326
- headers = {k.decode(): v.decode() for k, v in part.headers.items()}
327
- content = part.text
328
- decoded_posted_data.append({'headers': headers, 'content': content})
329
- except Exception as e:
330
- self.logger.warning(f'Unable to decode multipart POST: {e}')
331
- self.add_feature('posted_data_info', "Unable to decode multipart in POST request.")
332
-
333
- elif mimetype_lower.startswith('application/x-protobuf'):
334
- # FIXME If possible, decode?
335
- self.logger.debug(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
336
- self.add_feature('posted_data_info', f"Decoding {mimetype_lower} is not supported yet.")
337
- elif mimetype_lower.startswith('text') and isinstance(decoded_posted_data, (str, bytes)):
338
- try:
339
- # NOTE 2023-08-22: Quite a few text entries are in fact json, give it a shot.
340
- # loads here may give us a int, float or a bool.
341
- decoded_posted_data = json.loads(decoded_posted_data)
342
- self.add_feature('posted_data_info', "Decoded JSON out of POST request.")
343
- except Exception:
344
- # keep it as it is otherwise.
345
- pass
346
- elif mimetype_lower.endswith('javascript'):
347
- # keep it as it is
348
- self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
349
- self.add_feature('posted_data_info', f"Pretty rendering of {mimetype_lower} is not supported yet.")
350
- elif mimetype_lower in ['?', '*/*']:
351
- self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
352
- self.add_feature('posted_data_info', f"Weird MimeType ({mimetype_lower}) is not supported yet.")
353
- elif mimetype_lower == 'application/binary':
354
- self.logger.warning(f'Got a POST {mimetype_lower}, not a broken gziped blob: {decoded_posted_data!r}')
355
- self.add_feature('posted_data_info', f"MimeType ({mimetype_lower}) is not supported yet.")
356
- elif mimetype_lower in ['application/octet-stream']:
357
- # Should flag it, maybe?
358
- self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
359
- self.add_feature('posted_data_info', f"MimeType ({mimetype_lower}) is not supported yet.")
360
- elif mimetype_lower in ['application/grpc-web+proto']:
361
- # Can be decoded?
362
- self.logger.warning(f'Got a POST {mimetype_lower} - can be decoded: {decoded_posted_data!r}')
363
- self.add_feature('posted_data_info', f"MimeType ({mimetype_lower}) is not supported yet.")
364
- elif mimetype_lower in ['application/unknown']:
365
- # Weird but already seen stuff
366
- self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
367
- self.add_feature('posted_data_info', f"MimeType ({mimetype_lower}) is not supported yet.")
343
+ raise ValueError(f'Invalid type: {type(decoded_posted_data)}')
344
+ streamed_data = json_stream.load(to_stream)
345
+ decoded_posted_data = json_stream.to_standard_types(streamed_data)
346
+ self.add_feature('posted_data_info', "Successfully decoded POST request.")
347
+ except Exception:
348
+ if isinstance(decoded_posted_data, (str, bytes)):
349
+ self.logger.warning(f"Expected json stream, got garbage: {mimetype_lower} - {decoded_posted_data[:20]!r}[...]")
350
+ else:
351
+ self.logger.warning(f"Expected json stream, got garbage: {mimetype_lower} - {decoded_posted_data}")
352
+ self.add_feature('posted_data_info', "Unable to decode POST request.")
353
+ elif mimetype_lower.startswith('multipart'):
354
+ self.add_feature('posted_data_info', f"Decoding {mimetype_lower} is partially supported.")
355
+ if isinstance(decoded_posted_data, str):
356
+ # must be encoded for decoding
357
+ multipart_to_decode = decoded_posted_data.encode()
358
+ elif isinstance(decoded_posted_data, bytes):
359
+ multipart_to_decode = decoded_posted_data
368
360
  else:
369
- self.logger.warning(f'Unexpected mime type: {mimetype_lower} - {decoded_posted_data!r}')
370
- self.add_feature('posted_data_info', f"Unexpected MimeType ({mimetype_lower}) is not supported yet.")
361
+ raise ValueError(f'Invalid type for multipart POST: {type(decoded_posted_data)}')
362
+ if b"\r\n" not in multipart_to_decode:
363
+ # the decoder wants that
364
+ multipart_to_decode = multipart_to_decode.replace(b"\n", b"\r\n")
365
+ try:
366
+ multipart_data = decoder.MultipartDecoder(multipart_to_decode, mimetype_lower)
367
+ decoded_posted_data = []
368
+ for part in multipart_data.parts:
369
+ headers = {k.decode(): v.decode() for k, v in part.headers.items()}
370
+ content = part.text
371
+ decoded_posted_data.append({'headers': headers, 'content': content})
372
+ except Exception as e:
373
+ self.logger.warning(f'Unable to decode multipart POST: {e}')
374
+ self.add_feature('posted_data_info', "Unable to decode multipart in POST request.")
375
+
376
+ elif mimetype_lower.startswith('application/x-protobuf'):
377
+ # FIXME If possible, decode?
378
+ self.logger.debug(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
379
+ self.add_feature('posted_data_info', f"Decoding {mimetype_lower} is not supported yet.")
380
+ elif mimetype_lower.startswith('text') and isinstance(decoded_posted_data, (str, bytes)):
381
+ try:
382
+ # NOTE 2023-08-22: Quite a few text entries are in fact json, give it a shot.
383
+ # loads here may give us a int, float or a bool.
384
+ decoded_posted_data = json.loads(decoded_posted_data)
385
+ self.add_feature('posted_data_info', "Decoded JSON out of POST request.")
386
+ except Exception:
387
+ # keep it as it is otherwise.
388
+ pass
389
+ elif mimetype_lower.endswith('javascript'):
390
+ # keep it as it is
391
+ self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
392
+ self.add_feature('posted_data_info', f"Pretty rendering of {mimetype_lower} is not supported yet.")
393
+ elif mimetype_lower in ['?', '*/*']:
394
+ self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
395
+ self.add_feature('posted_data_info', f"Weird MimeType ({mimetype_lower}) is not supported yet.")
396
+ elif mimetype_lower == 'application/binary':
397
+ self.logger.warning(f'Got a POST {mimetype_lower}, not a broken gziped blob: {decoded_posted_data!r}')
398
+ self.add_feature('posted_data_info', f"MimeType ({mimetype_lower}) is not supported yet.")
399
+ elif mimetype_lower in ['application/octet-stream']:
400
+ # Should flag it, maybe?
401
+ self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
402
+ self.add_feature('posted_data_info', f"MimeType ({mimetype_lower}) is not supported yet.")
403
+ elif mimetype_lower in ['application/grpc-web+proto']:
404
+ # Can be decoded?
405
+ self.logger.warning(f'Got a POST {mimetype_lower} - can be decoded: {decoded_posted_data!r}')
406
+ self.add_feature('posted_data_info', f"MimeType ({mimetype_lower}) is not supported yet.")
407
+ elif mimetype_lower in ['application/unknown']:
408
+ # Weird but already seen stuff
409
+ self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
410
+ self.add_feature('posted_data_info', f"MimeType ({mimetype_lower}) is not supported yet.")
371
411
  else:
372
- self.logger.warning(f'Missing mimetype in POST: {self.request["postData"]}')
373
- self.add_feature('posted_data_info', "Missing MimeType, not sure what to do.")
412
+ self.logger.warning(f'Unexpected mime type: {mimetype_lower} - {decoded_posted_data!r}')
413
+ self.add_feature('posted_data_info', f"Unexpected MimeType ({mimetype_lower}) is not supported yet.")
374
414
 
375
415
  # NOTE 2023-08-22: Blind attempt to process the data as json
376
416
  if decoded_posted_data and isinstance(decoded_posted_data, (str, bytes)):
@@ -467,8 +507,8 @@ class URLNode(HarTreeNode):
467
507
 
468
508
  if not hasattr(self, 'mimetype'):
469
509
  # try to guess something better
470
- if kind := filetype.guess(self.body.getvalue()):
471
- self.add_feature('mimetype', kind.mime)
510
+ if mime := guess_magic_type(self.body.getvalue()):
511
+ self.add_feature('mimetype', mime)
472
512
 
473
513
  if not hasattr(self, 'mimetype'):
474
514
  self.add_feature('mimetype', '')
@@ -477,12 +517,6 @@ class URLNode(HarTreeNode):
477
517
  self.add_feature('external_ressources', external_ressources)
478
518
  self.add_feature('embedded_ressources', embedded_ressources)
479
519
 
480
- filename = Path(self.url_split.path).name
481
- if filename:
482
- self.add_feature('filename', filename)
483
- else:
484
- self.add_feature('filename', 'file.bin')
485
-
486
520
  # Common JS redirect we can catch easily
487
521
  # NOTE: it is extremely fragile and doesn't work very often but is kinda better than nothing.
488
522
  # NOTE 2025-08-30: Also, finding that doesn't mean it is in a part of the code that is executed without user interaction. It can be triggered after a user fills a form for example.
@@ -622,9 +656,9 @@ class URLNode(HarTreeNode):
622
656
 
623
657
  class HostNode(HarTreeNode):
624
658
 
625
- def __init__(self, capture_uuid: str, **kwargs: Any):
659
+ def __init__(self, capture_uuid: str):
626
660
  """Node of the Hostname Tree"""
627
- super().__init__(capture_uuid=capture_uuid, **kwargs)
661
+ super().__init__(capture_uuid=capture_uuid)
628
662
  # Do not add the URLs in the json dump
629
663
  self.features_to_skip.add('urls')
630
664
 
@@ -672,6 +706,36 @@ class HostNode(HarTreeNode):
672
706
  """Number of unique 3rd party cookies received in the responses of all the URL nodes"""
673
707
  return sum(third for _, _, third in self.cookies_received if third)
674
708
 
709
+ @cached_property
710
+ def domain(self) -> str | None:
711
+ if hasattr(self, 'hostname_is_ip') or hasattr(self, 'file_on_disk'):
712
+ return None
713
+ try:
714
+ faup_hostname = Hostname(self.name)
715
+ if faup_hostname.domain:
716
+ return str(faup_hostname.domain)
717
+
718
+ self.logger.warning(f'No domain: "{self.name}"')
719
+ return None
720
+ except Exception as e:
721
+ self.logger.warning(f'Unable to parse Hostname "{self.name}": {e}')
722
+ return None
723
+
724
+ @cached_property
725
+ def tld(self) -> str | None:
726
+ if hasattr(self, 'hostname_is_ip') or hasattr(self, 'file_on_disk'):
727
+ return None
728
+ try:
729
+ faup_hostname = Hostname(self.name)
730
+ if faup_hostname.suffix:
731
+ return str(faup_hostname.suffix)
732
+
733
+ self.logger.warning(f'No domain: "{self.name}"')
734
+ return None
735
+ except Exception as e:
736
+ self.logger.warning(f'Unable to parse Hostname "{self.name}": {e}')
737
+ return None
738
+
675
739
  def add_url(self, url: URLNode) -> None:
676
740
  """Add a URL node to the Host node, initialize/update the features"""
677
741
  if not self.name:
@@ -682,6 +746,9 @@ class HostNode(HarTreeNode):
682
746
  if hasattr(url, 'hostname_is_ip') and url.hostname_is_ip:
683
747
  self.add_feature('hostname_is_ip', True)
684
748
 
749
+ if hasattr(url, 'file_on_disk') and url.file_on_disk:
750
+ self.add_feature('file_on_disk', True)
751
+
685
752
  self.urls.append(url)
686
753
 
687
754
  # Add to URLNode a reference to the HostNode UUID
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "har2tree"
3
- version = "1.36.1"
3
+ version = "1.36.3"
4
4
  description = "HTTP Archive (HAR) to ETE Toolkit generator"
5
5
  authors = [
6
6
  {name="Raphaël Vinot", email="raphael.vinot@circl.lu"}
@@ -14,18 +14,18 @@ dynamic = [ "classifiers" ]
14
14
  dependencies = [
15
15
  "ete3 (>=3.1.3)",
16
16
  "beautifulsoup4[charset-normalizer,lxml] (>=4.14.3)",
17
- "publicsuffixlist (>=1.0.2.20251217)",
18
- "filetype (>=1.2.0)",
19
17
  # poetry up fails with the version of numpy forced for python < 3.11.
20
18
  # The work around is to comment it, run poetry up, uncomment it. and run poetry update.
21
19
  "numpy (>=2.2,<2.3) ; python_version < '3.11'",
22
- "numpy (>=2.3.5) ; python_version >= \"3.11\" and python_version < \"4.0\"",
23
- "w3lib (>=2.3.1)",
20
+ "numpy (>=2.4.2) ; python_version >= \"3.11\" and python_version < \"4.0\"",
21
+ "w3lib (>=2.4.0)",
24
22
  "tinycss2 (>=1.5.1)",
25
23
  "legacy-cgi (>=2.6.4) ; python_version >= \"3.13\" and python_version < \"4.0\"",
26
24
  "multipart (>=1.3.0,<2.0.0)",
27
25
  "json-stream (>=2.3.3,<3.0.0)",
28
- "requests-toolbelt (>=1.0.0,<2.0.0)"
26
+ "requests-toolbelt (>=1.0.0,<2.0.0)",
27
+ "pyfaup-rs (>=0.2.1,<0.3.0)",
28
+ "pure-magic-rs (>=0.3.1,<0.4.0)"
29
29
  ]
30
30
 
31
31
  [project.urls]
@@ -44,12 +44,12 @@ classifiers = [
44
44
  ]
45
45
 
46
46
  [project.optional-dependencies]
47
- docs = ["Sphinx (>=9.0.4) ; python_version >= \"3.11\"", "six (>=1.17.0)"]
47
+ docs = ["Sphinx (>=9.1.0) ; python_version >= \"3.12\"", "six (>=1.17.0)"]
48
48
 
49
49
  [tool.poetry.group.dev.dependencies]
50
50
  mypy = "^1.19.1"
51
51
  pytest-cov = "^7.0.0"
52
- coverage = "^7.13.0"
52
+ coverage = "^7.13.2"
53
53
  types-beautifulsoup4 = "^4.12.0.20250516"
54
54
 
55
55
  [build-system]
File without changes
File without changes
File without changes
File without changes