PyPI - har2tree - Versions diffs - 1.36.1__tar.gz → 1.36.3__tar.gz - Mend

har2tree 1.36.1.tar.gz → 1.36.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

{har2tree-1.36.1 → har2tree-1.36.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: har2tree
-Version: 1.36.1
+Version: 1.36.3
 Summary: HTTP Archive (HAR) to ETE Toolkit generator
 License-Expression: BSD-3-Clause
 License-File: LICENSE
@@ -20,20 +20,20 @@ Classifier: Programming Language :: Python :: 3.14
 Classifier: Topic :: Internet
 Classifier: Topic :: Security
 Provides-Extra: docs
-Requires-Dist: Sphinx (>=9.0.4) ; (python_version >= "3.11") and (extra == "docs")
+Requires-Dist: Sphinx (>=9.1.0) ; (python_version >= "3.12") and (extra == "docs")
 Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.14.3)
 Requires-Dist: ete3 (>=3.1.3)
-Requires-Dist: filetype (>=1.2.0)
 Requires-Dist: json-stream (>=2.3.3,<3.0.0)
 Requires-Dist: legacy-cgi (>=2.6.4) ; python_version >= "3.13" and python_version < "4.0"
 Requires-Dist: multipart (>=1.3.0,<2.0.0)
 Requires-Dist: numpy (>=2.2,<2.3) ; python_version < "3.11"
-Requires-Dist: numpy (>=2.3.5) ; python_version >= "3.11" and python_version < "4.0"
-Requires-Dist: publicsuffixlist (>=1.0.2.20251217)
+Requires-Dist: numpy (>=2.4.2) ; python_version >= "3.11" and python_version < "4.0"
+Requires-Dist: pure-magic-rs (>=0.3.1,<0.4.0)
+Requires-Dist: pyfaup-rs (>=0.2.1,<0.3.0)
 Requires-Dist: requests-toolbelt (>=1.0.0,<2.0.0)
 Requires-Dist: six (>=1.17.0) ; extra == "docs"
 Requires-Dist: tinycss2 (>=1.5.1)
-Requires-Dist: w3lib (>=2.3.1)
+Requires-Dist: w3lib (>=2.4.0)
 Project-URL: Documentation, https://har2tree.readthedocs.io/en/latest/
 Project-URL: Repository, https://github.com/Lookyloo/har2tree
 Project-URL: issues, https://github.com/Lookyloo/har2tree/issues

{har2tree-1.36.1 → har2tree-1.36.3}/har2tree/har2tree.py RENAMED Viewed

@@ -600,7 +600,7 @@ class Har2Tree:
             if i in ignore:
                 continue
-            n = URLNode(capture_uuid=self.har.capture_uuid, name=unquote_plus(url_entry['request']['url']))
+            n = URLNode(capture_uuid=self.har.capture_uuid)
             n.load_har_entry(url_entry, list(self.all_url_requests.keys()))
             if hasattr(n, 'redirect_url'):
                 self.all_redirects.append(n.redirect_url)
@@ -798,7 +798,7 @@ class Har2Tree:
         # AND we already have a node in the tree with this pageref
         # => attach to that node.
         if ('pages' in self.har.har['log'] and len(self.har.har['log']['pages']) > 1
-                and node.pageref != self.har.har['log']['pages'][0]
+                and hasattr(node, 'pageref') and node.pageref != self.har.har['log']['pages'][0]
                 and self.pages_root[node.pageref] != node.uuid):
             # In that case, we check if there is already a page with the pageref of the orphan node,
             # and attach the node to that.
@@ -824,7 +824,7 @@ class Har2Tree:
             # No luck, the node is root for this pageref, let's attach it to the prior page in the list, or the very first node (tree root)
             page_before = self.har.har['log']['pages'][0]
             for page in self.har.har['log']['pages'][1:]:
-                if page['id'] == node.pageref:
+                if hasattr(node, 'pageref') and page['id'] == node.pageref:
                     break
                 # Sometimes, the page listed in the list of pages is not related to
                 # any of the entries. Go figure what happened.

{har2tree-1.36.1 → har2tree-1.36.3}/har2tree/helper.py RENAMED Viewed

@@ -13,13 +13,12 @@ from base64 import b64decode
 from collections import defaultdict
 from io import BytesIO
 from logging import LoggerAdapter
+from pure_magic_rs import MagicDb
 from typing import Any
 from collections.abc import Iterable
 from collections.abc import Mapping, MutableMapping
 from urllib.parse import urlparse, unquote_plus, unquote_to_bytes, urljoin
-import filetype  # type: ignore
 from bs4 import BeautifulSoup, Tag, MarkupResemblesLocatorWarning
 from charset_normalizer import from_bytes
 import tinycss2  # type: ignore[import-untyped]
@@ -233,11 +232,7 @@ def _unpack_data_uri(data: str) -> tuple[str, str, BytesIO] | None:
             mime, mimeparams, unpacked_data = parsed_uri
             if '/' not in mime:
                 # Turns out, it happens. The mimetype can be null for example.
-                kind = filetype.guess(unpacked_data)
-                if kind:
-                    mime = kind.mime
-                else:
-                    mime = ''
+                mime = guess_magic_type(unpacked_data)
             blob = BytesIO(unpacked_data)
             b_hash = hashlib.sha512(blob.getvalue()).hexdigest()
@@ -268,7 +263,7 @@ def find_identifiers(soup: BeautifulSoup) -> dict[str, list[str]] | None:
     # This is beta and kinda fragile, but it's going to find (most) of the google tag IDs
     # https://support.google.com/google-ads/answer/12326985?hl=en_us_us
     # NOTE: the doc says 9 X, but all the examples I found have 10 X so we cannot trust it
-    if google_tag_ids := set(re.findall(r"(?:G-|AW-|GA-|UA-)\w{9,13}", str(soup))):
+    if google_tag_ids := set(re.findall(r"(?:G-|AW-|GA-|UA-|GTM-)\w{9,15}", str(soup))):
         blocklist = {'UA-Compatible'}
         google_tag_ids -= blocklist
         if google_tag_ids:
@@ -450,11 +445,7 @@ def find_external_ressources(mimetype: str, data: bytes, base_url: str, all_requ
         # Just in case, there is sometimes an unescape call in JS code
         for to_unescape in re.findall(r'unescape\(\'(.*)\'\)', string_soup):
             unescaped = unquote_to_bytes(to_unescape)
-            kind = filetype.guess(unescaped)
-            if kind:
-                mimetype = kind.mime
-            else:
-                mimetype = ''
+            mimetype = guess_magic_type(unescaped)
             blob = BytesIO(unescaped)
             b_hash = hashlib.sha512(blob.getvalue()).hexdigest()
             embedded_ressources[mimetype].append((b_hash, blob))
@@ -473,3 +464,14 @@ class Har2TreeLogAdapter(LoggerAdapter):  # type: ignore[type-arg]
     """
     def process(self, msg: str, kwargs: MutableMapping[str, Any]) -> tuple[str, MutableMapping[str, Any]]:
         return '[{}] {}'.format(self.extra['uuid'], msg), kwargs  # type: ignore[index]
+magic_db = None
+def guess_magic_type(data: bytes) -> str:
+    global magic_db
+    if magic_db is None:
+        magic_db = MagicDb()
+    m = magic_db.best_magic_buffer(data)
+    return m.mime_type

{har2tree-1.36.1 → har2tree-1.36.3}/har2tree/nodes.py RENAMED Viewed

@@ -13,40 +13,32 @@ import re
 from base64 import b64decode
 from datetime import datetime, timedelta
-from functools import lru_cache, cached_property
+from functools import cached_property
 from hashlib import sha256
 from io import BytesIO, StringIO
-from pathlib import Path
+from pathlib import PurePath
 from typing import Any
 from collections.abc import MutableMapping
 from urllib.parse import unquote_plus, urlparse, urljoin, parse_qs
-import filetype  # type: ignore
 import json_stream  # type: ignore
 from bs4 import BeautifulSoup
 from ete3 import TreeNode  # type: ignore
-from publicsuffixlist import PublicSuffixList  # type: ignore
+from pyfaup import Url, Hostname
 from requests_toolbelt.multipart import decoder  # type: ignore
 from w3lib.html import strip_html5_whitespace
 from w3lib.url import canonicalize_url, safe_url_string
-from .helper import find_external_ressources, rebuild_url, find_identifiers, make_soup
+from .helper import find_external_ressources, rebuild_url, find_identifiers, make_soup, guess_magic_type
 from .helper import Har2TreeError, Har2TreeLogAdapter, make_hhhash, HHHashError, HHHashNote
-@lru_cache(64)
-def get_public_suffix_list() -> PublicSuffixList:
-    # Initialize Public Suffix List
-    # TODO (?): fetch the list
-    return PublicSuffixList()
 class HarTreeNode(TreeNode):  # type: ignore[misc]
-    def __init__(self, capture_uuid: str, **kwargs: Any):
+    def __init__(self, capture_uuid: str):
         """Node dumpable in json to display with d3js"""
-        super().__init__(**kwargs)
+        super().__init__()
         logger = logging.getLogger(f'{__name__}.{self.__class__.__name__}')
         self.logger = Har2TreeLogAdapter(logger, {'uuid': capture_uuid})
         self.add_feature('uuid', str(uuid.uuid4()))
@@ -75,12 +67,11 @@ class URLNode(HarTreeNode):
     start_time: datetime
-    def __init__(self, capture_uuid: str, **kwargs: Any):
+    def __init__(self, capture_uuid: str):
         """Node of the URL Tree"""
-        super().__init__(capture_uuid=capture_uuid, **kwargs)
+        super().__init__(capture_uuid=capture_uuid)
         # Do not add the body in the json dump
         self.features_to_skip.add('body')
-        self.features_to_skip.add('url_split')
         self.features_to_skip.add('start_time')
         self.features_to_skip.add('time')
         self.features_to_skip.add('time_content_received')
@@ -134,23 +125,86 @@ class URLNode(HarTreeNode):
             return b64decode(_to_decode, altchars=b'-_', validate=True)
         return b64decode(_to_decode, validate=True)
+    @cached_property
+    def tld(self) -> str | None:
+        if not hasattr(self, 'original_url'):
+            return None
+        try:
+            faup_url = Url(self.original_url)
+            if faup_url.suffix:
+                return str(faup_url.suffix)
+            self.logger.warning(f'No TLD: "{self.name}"')
+            return None
+        except Exception as e:
+            self.logger.warning(f'Unable to parse URI "{self.name}": {e}')
+            return None
+    @cached_property
+    def known_tld(self) -> str | None:
+        # An alias, to avoid breaking things.
+        return self.tld
+    @cached_property
+    def domain(self) -> str | None:
+        if not hasattr(self, 'original_url'):
+            return None
+        try:
+            faup_url = Url(self.original_url)
+            if faup_url.domain:
+                return str(faup_url.domain)
+            self.logger.warning(f'No domain: "{self.name}"')
+            return None
+        except Exception as e:
+            self.logger.warning(f'Unable to parse URI "{self.name}": {e}')
+            return None
     def load_har_entry(self, har_entry: MutableMapping[str, Any], all_requests: list[str]) -> None:
         """Load one entry of the HAR file, initialize most of the features of the node"""
-        if not self.name:
-            # We're in the actual root node
-            # NOTE: by the HAR specs: "Absolute URL of the request (fragments are not included)."
-            self.add_feature('name', unquote_plus(har_entry['request']['url']))
+        # NOTE: by the HAR specs: "Absolute URL of the request (fragments are not included)."
+        self.add_feature('name', unquote_plus(har_entry['request']['url']))
+        # 2026-30-01: Keep original URL so it can be parsed by faup
+        if (har_entry['request']['url'].startswith('blob:')
+                and har_entry['request']['url'][5:].startswith('http')):
+            self.add_feature('original_url', har_entry['request']['url'][5:])
+        elif har_entry['request']['url'].startswith('http'):
+            self.add_feature('original_url', har_entry['request']['url'])
         splitted_url = urlparse(self.name)
-        if splitted_url.scheme == 'blob':
-            # this is a new weird feature, but it seems to be usable as a URL, so let's do that
-            self.add_feature('url_split', urlparse(splitted_url.path))
-        elif splitted_url.scheme == 'file':
+        if splitted_url.scheme == 'file':
             # file on disk, we do not have a proper URL
             self.add_feature('file_on_disk', True)
-            self.add_feature('url_split', urlparse(splitted_url.path))
+            # TODO: Do something better? hostname is the feature name used for the aggregated tree
+            # so we need that unless we want to change the JS
+            path = PurePath(splitted_url.path)
+            self.add_feature('hostname', str(path.parent))
+            if path.name:
+                self.add_feature('filename', path.name)
+            else:
+                self.add_feature('filename', 'file.bin')
         else:
-            self.add_feature('url_split', splitted_url)
+            # We have a URL
+            if splitted_url.scheme == 'blob':
+                # this is a new weird feature, but it seems to be usable as a URL, so let's do that
+                splitted_url = urlparse(splitted_url.path)
+            if splitted_url.hostname:
+                self.add_feature('hostname', splitted_url.hostname)
+            else:
+                self.logger.warning(f'Weird URI with no hostname (?): "{self.name}"')
+                self.add_feature('hostname', self.name)
+            if filename := PurePath(splitted_url.path).name:
+                self.add_feature('filename', filename)
+            else:
+                self.add_feature('filename', 'file.bin')
+        if not self.hostname:
+            self.logger.warning(f'Missing hostname, something is broken in that node: {har_entry}')
         # If the URL contains a fragment (i.e. something after a #), it is stripped in the referer.
         # So we need an alternative URL to do a lookup against
@@ -167,19 +221,6 @@ class URLNode(HarTreeNode):
         self.add_feature('time', timedelta(milliseconds=har_entry['time']))
         self.add_feature('time_content_received', self.start_time + self.time)  # Instant the response is fully received (and the processing of the content by the browser can start)
-        if hasattr(self, 'file_on_disk'):
-            # TODO: Do something better? hostname is the feature name used for the aggregated tree
-            # so we need that unless we want to change the JS
-            self.add_feature('hostname', str(Path(self.url_split.path).parent))
-        else:
-            if self.url_split.hostname:
-                self.add_feature('hostname', self.url_split.hostname)
-            else:
-                self.add_feature('hostname', self.name)
-        if not self.hostname:
-            self.logger.warning(f'Something is broken in that node: {har_entry}')
         try:
             ipaddress.ip_address(self.hostname)
             self.add_feature('hostname_is_ip', True)
@@ -196,13 +237,6 @@ class URLNode(HarTreeNode):
             except UnicodeError:
                 pass
-        if not hasattr(self, 'hostname_is_ip') and not hasattr(self, 'file_on_disk'):
-            tld = get_public_suffix_list().publicsuffix(self.hostname)
-            if tld:
-                self.add_feature('known_tld', tld)
-            else:
-                self.logger.debug(f'No TLD/domain broken {self.name}')
         self.add_feature('request', har_entry['request'])
         # Try to get a referer from the headers
         for h in self.request['headers']:
@@ -249,128 +283,134 @@ class URLNode(HarTreeNode):
                     decoded_posted_data = self._dirty_safe_b64decode(self.request['postData']['text'])
                 except binascii.Error:
                     decoded_posted_data = self.request['postData']['text']
                 if 'mimeType' in self.request['postData']:
                     # make it easier to compare.
                     mimetype_lower = self.request['postData']['mimeType'].lower()
-                    if mimetype_lower.startswith('application/x-www-form-urlencoded'):
-                        # NOTE: this should never happen as there should
-                        # be something in self.request['postData']['params']
-                        # and we already processed it before but just in case...
-                        self.logger.debug('Got a application/x-www-form-urlencoded without params key')
-                        # 100% sure there will be websites where decode will fail
-                        try:
-                            if isinstance(decoded_posted_data, bytes):
-                                decoded_posted_data = decoded_posted_data.decode()
-                            if isinstance(decoded_posted_data, str):
-                                decoded_posted_data = unquote_plus(decoded_posted_data)
-                            if isinstance(decoded_posted_data, str):
-                                decoded_posted_data = parse_qs(decoded_posted_data)
-                            self.add_feature('posted_data_info', "Successfully decoded POST request.")
-                        except Exception as e:
-                            self.logger.warning(f'Unable to unquote or parse form data "{decoded_posted_data!r}": {e}')
-                            self.add_feature('posted_data_info', "Unable to decode POST request.")
-                    elif (mimetype_lower.startswith('application/json')
-                          or mimetype_lower.startswith('application/csp-report')
-                          or mimetype_lower.startswith('application/x-amz-json-1.1')
-                          or mimetype_lower.startswith('application/reports+json')
-                          or mimetype_lower.startswith('application/vnd.adobe.dc+json')
-                          or mimetype_lower.startswith('application/ion+json')
-                          or mimetype_lower.endswith('json')
-                          ):
-                        if isinstance(decoded_posted_data, (str, bytes)):
-                            # at this stage, it will always be bytes or str
-                            try:
-                                # NOTE 2023-08-22: loads here may give us a int, float or a bool.
-                                decoded_posted_data = json.loads(decoded_posted_data)
-                                self.add_feature('posted_data_info', "Successfully decoded POST request.")
-                            except Exception:
-                                self.add_feature('posted_data_info', "Unable to decode POST request.")
-                                if isinstance(decoded_posted_data, (str, bytes)):
-                                    self.logger.warning(f"Expected json, got garbage: {mimetype_lower} - {decoded_posted_data[:20]!r}[...]")
-                                else:
-                                    self.logger.warning(f"Expected json, got garbage: {mimetype_lower} - {decoded_posted_data}")
-                    elif mimetype_lower.startswith('application/x-json-stream'):
+                else:
+                    if isinstance(decoded_posted_data, bytes):
+                        # if b64 decode worked, we may have a useful type there.
+                        mimetype_lower = guess_magic_type(decoded_posted_data)
+                    else:
+                        mimetype_lower = 'text/plain'
+                    self.logger.warning(f'Missing mimetype in POST, guessed it: {mimetype_lower}')
+                if mimetype_lower.startswith('application/x-www-form-urlencoded'):
+                    # NOTE: this should never happen as there should
+                    # be something in self.request['postData']['params']
+                    # and we already processed it before but just in case...
+                    self.logger.debug('Got a application/x-www-form-urlencoded without params key')
+                    # 100% sure there will be websites where decode will fail
+                    try:
+                        if isinstance(decoded_posted_data, bytes):
+                            decoded_posted_data = decoded_posted_data.decode()
+                        if isinstance(decoded_posted_data, str):
+                            decoded_posted_data = unquote_plus(decoded_posted_data)
+                        if isinstance(decoded_posted_data, str):
+                            decoded_posted_data = parse_qs(decoded_posted_data)
+                        self.add_feature('posted_data_info', "Successfully decoded POST request.")
+                    except Exception as e:
+                        self.logger.warning(f'Unable to unquote or parse form data "{decoded_posted_data!r}": {e}')
+                        self.add_feature('posted_data_info', "Unable to decode POST request.")
+                elif (mimetype_lower.startswith('application/json')
+                      or mimetype_lower.startswith('application/csp-report')
+                      or mimetype_lower.startswith('application/x-amz-json-1.1')
+                      or mimetype_lower.startswith('application/reports+json')
+                      or mimetype_lower.startswith('application/vnd.adobe.dc+json')
+                      or mimetype_lower.startswith('application/ion+json')
+                      or mimetype_lower.endswith('json')
+                      ):
+                    if isinstance(decoded_posted_data, (str, bytes)):
+                        # at this stage, it will always be bytes or str
                         try:
-                            to_stream: StringIO | BytesIO
-                            if isinstance(decoded_posted_data, str):
-                                to_stream = StringIO(decoded_posted_data)
-                            elif isinstance(decoded_posted_data, bytes):
-                                to_stream = BytesIO(decoded_posted_data)
-                            else:
-                                raise ValueError(f'Invalid type: {type(decoded_posted_data)}')
-                            streamed_data = json_stream.load(to_stream)
-                            decoded_posted_data = json_stream.to_standard_types(streamed_data)
+                            # NOTE 2023-08-22: loads here may give us a int, float or a bool.
+                            decoded_posted_data = json.loads(decoded_posted_data)
                             self.add_feature('posted_data_info', "Successfully decoded POST request.")
                         except Exception:
+                            self.add_feature('posted_data_info', "Unable to decode POST request.")
                             if isinstance(decoded_posted_data, (str, bytes)):
-                                self.logger.warning(f"Expected json stream, got garbage: {mimetype_lower} - {decoded_posted_data[:20]!r}[...]")
+                                self.logger.warning(f"Expected json, got garbage: {mimetype_lower} - {decoded_posted_data[:20]!r}[...]")
                             else:
-                                self.logger.warning(f"Expected json stream, got garbage: {mimetype_lower} - {decoded_posted_data}")
-                            self.add_feature('posted_data_info', "Unable to decode POST request.")
-                    elif mimetype_lower.startswith('multipart'):
-                        self.add_feature('posted_data_info', f"Decoding {mimetype_lower} is partially supported.")
+                                self.logger.warning(f"Expected json, got garbage: {mimetype_lower} - {decoded_posted_data}")
+                elif mimetype_lower.startswith('application/x-json-stream'):
+                    try:
+                        to_stream: StringIO | BytesIO
                         if isinstance(decoded_posted_data, str):
-                            # must be encoded for decoding
-                            multipart_to_decode = decoded_posted_data.encode()
+                            to_stream = StringIO(decoded_posted_data)
                         elif isinstance(decoded_posted_data, bytes):
-                            multipart_to_decode = decoded_posted_data
+                            to_stream = BytesIO(decoded_posted_data)
                         else:
-                            raise ValueError(f'Invalid type for multipart POST: {type(decoded_posted_data)}')
-                        if b"\r\n" not in multipart_to_decode:
-                            # the decoder wants that
-                            multipart_to_decode = multipart_to_decode.replace(b"\n", b"\r\n")
-                        try:
-                            multipart_data = decoder.MultipartDecoder(multipart_to_decode, mimetype_lower)
-                            decoded_posted_data = []
-                            for part in multipart_data.parts:
-                                headers = {k.decode(): v.decode() for k, v in part.headers.items()}
-                                content = part.text
-                                decoded_posted_data.append({'headers': headers, 'content': content})
-                        except Exception as e:
-                            self.logger.warning(f'Unable to decode multipart POST: {e}')
-                            self.add_feature('posted_data_info', "Unable to decode multipart in POST request.")
-                    elif mimetype_lower.startswith('application/x-protobuf'):
-                        # FIXME If possible, decode?
-                        self.logger.debug(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
-                        self.add_feature('posted_data_info', f"Decoding {mimetype_lower} is not supported yet.")
-                    elif mimetype_lower.startswith('text') and isinstance(decoded_posted_data, (str, bytes)):
-                        try:
-                            # NOTE 2023-08-22: Quite a few text entries are in fact json, give it a shot.
-                            # loads here may give us a int, float or a bool.
-                            decoded_posted_data = json.loads(decoded_posted_data)
-                            self.add_feature('posted_data_info', "Decoded JSON out of POST request.")
-                        except Exception:
-                            # keep it as it is otherwise.
-                            pass
-                    elif mimetype_lower.endswith('javascript'):
-                        # keep it as it is
-                        self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
-                        self.add_feature('posted_data_info', f"Pretty rendering of {mimetype_lower} is not supported yet.")
-                    elif mimetype_lower in ['?', '*/*']:
-                        self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
-                        self.add_feature('posted_data_info', f"Weird MimeType ({mimetype_lower}) is not supported yet.")
-                    elif mimetype_lower == 'application/binary':
-                        self.logger.warning(f'Got a POST {mimetype_lower}, not a broken gziped blob: {decoded_posted_data!r}')
-                        self.add_feature('posted_data_info', f"MimeType ({mimetype_lower}) is not supported yet.")
-                    elif mimetype_lower in ['application/octet-stream']:
-                        # Should flag it, maybe?
-                        self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
-                        self.add_feature('posted_data_info', f"MimeType ({mimetype_lower}) is not supported yet.")
-                    elif mimetype_lower in ['application/grpc-web+proto']:
-                        # Can be decoded?
-                        self.logger.warning(f'Got a POST {mimetype_lower} - can be decoded: {decoded_posted_data!r}')
-                        self.add_feature('posted_data_info', f"MimeType ({mimetype_lower}) is not supported yet.")
-                    elif mimetype_lower in ['application/unknown']:
-                        # Weird but already seen stuff
-                        self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
-                        self.add_feature('posted_data_info', f"MimeType ({mimetype_lower}) is not supported yet.")
+                            raise ValueError(f'Invalid type: {type(decoded_posted_data)}')
+                        streamed_data = json_stream.load(to_stream)
+                        decoded_posted_data = json_stream.to_standard_types(streamed_data)
+                        self.add_feature('posted_data_info', "Successfully decoded POST request.")
+                    except Exception:
+                        if isinstance(decoded_posted_data, (str, bytes)):
+                            self.logger.warning(f"Expected json stream, got garbage: {mimetype_lower} - {decoded_posted_data[:20]!r}[...]")
+                        else:
+                            self.logger.warning(f"Expected json stream, got garbage: {mimetype_lower} - {decoded_posted_data}")
+                        self.add_feature('posted_data_info', "Unable to decode POST request.")
+                elif mimetype_lower.startswith('multipart'):
+                    self.add_feature('posted_data_info', f"Decoding {mimetype_lower} is partially supported.")
+                    if isinstance(decoded_posted_data, str):
+                        # must be encoded for decoding
+                        multipart_to_decode = decoded_posted_data.encode()
+                    elif isinstance(decoded_posted_data, bytes):
+                        multipart_to_decode = decoded_posted_data
                     else:
-                        self.logger.warning(f'Unexpected mime type: {mimetype_lower} - {decoded_posted_data!r}')
-                        self.add_feature('posted_data_info', f"Unexpected MimeType ({mimetype_lower}) is not supported yet.")
+                        raise ValueError(f'Invalid type for multipart POST: {type(decoded_posted_data)}')
+                    if b"\r\n" not in multipart_to_decode:
+                        # the decoder wants that
+                        multipart_to_decode = multipart_to_decode.replace(b"\n", b"\r\n")
+                    try:
+                        multipart_data = decoder.MultipartDecoder(multipart_to_decode, mimetype_lower)
+                        decoded_posted_data = []
+                        for part in multipart_data.parts:
+                            headers = {k.decode(): v.decode() for k, v in part.headers.items()}
+                            content = part.text
+                            decoded_posted_data.append({'headers': headers, 'content': content})
+                    except Exception as e:
+                        self.logger.warning(f'Unable to decode multipart POST: {e}')
+                        self.add_feature('posted_data_info', "Unable to decode multipart in POST request.")
+                elif mimetype_lower.startswith('application/x-protobuf'):
+                    # FIXME If possible, decode?
+                    self.logger.debug(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
+                    self.add_feature('posted_data_info', f"Decoding {mimetype_lower} is not supported yet.")
+                elif mimetype_lower.startswith('text') and isinstance(decoded_posted_data, (str, bytes)):
+                    try:
+                        # NOTE 2023-08-22: Quite a few text entries are in fact json, give it a shot.
+                        # loads here may give us a int, float or a bool.
+                        decoded_posted_data = json.loads(decoded_posted_data)
+                        self.add_feature('posted_data_info', "Decoded JSON out of POST request.")
+                    except Exception:
+                        # keep it as it is otherwise.
+                        pass
+                elif mimetype_lower.endswith('javascript'):
+                    # keep it as it is
+                    self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
+                    self.add_feature('posted_data_info', f"Pretty rendering of {mimetype_lower} is not supported yet.")
+                elif mimetype_lower in ['?', '*/*']:
+                    self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
+                    self.add_feature('posted_data_info', f"Weird MimeType ({mimetype_lower}) is not supported yet.")
+                elif mimetype_lower == 'application/binary':
+                    self.logger.warning(f'Got a POST {mimetype_lower}, not a broken gziped blob: {decoded_posted_data!r}')
+                    self.add_feature('posted_data_info', f"MimeType ({mimetype_lower}) is not supported yet.")
+                elif mimetype_lower in ['application/octet-stream']:
+                    # Should flag it, maybe?
+                    self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
+                    self.add_feature('posted_data_info', f"MimeType ({mimetype_lower}) is not supported yet.")
+                elif mimetype_lower in ['application/grpc-web+proto']:
+                    # Can be decoded?
+                    self.logger.warning(f'Got a POST {mimetype_lower} - can be decoded: {decoded_posted_data!r}')
+                    self.add_feature('posted_data_info', f"MimeType ({mimetype_lower}) is not supported yet.")
+                elif mimetype_lower in ['application/unknown']:
+                    # Weird but already seen stuff
+                    self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
+                    self.add_feature('posted_data_info', f"MimeType ({mimetype_lower}) is not supported yet.")
                 else:
-                    self.logger.warning(f'Missing mimetype in POST: {self.request["postData"]}')
-                    self.add_feature('posted_data_info', "Missing MimeType, not sure what to do.")
+                    self.logger.warning(f'Unexpected mime type: {mimetype_lower} - {decoded_posted_data!r}')
+                    self.add_feature('posted_data_info', f"Unexpected MimeType ({mimetype_lower}) is not supported yet.")
             # NOTE 2023-08-22: Blind attempt to process the data as json
             if decoded_posted_data and isinstance(decoded_posted_data, (str, bytes)):
@@ -467,8 +507,8 @@ class URLNode(HarTreeNode):
             if not hasattr(self, 'mimetype'):
                 # try to guess something better
-                if kind := filetype.guess(self.body.getvalue()):
-                    self.add_feature('mimetype', kind.mime)
+                if mime := guess_magic_type(self.body.getvalue()):
+                    self.add_feature('mimetype', mime)
             if not hasattr(self, 'mimetype'):
                 self.add_feature('mimetype', '')
@@ -477,12 +517,6 @@ class URLNode(HarTreeNode):
             self.add_feature('external_ressources', external_ressources)
             self.add_feature('embedded_ressources', embedded_ressources)
-            filename = Path(self.url_split.path).name
-            if filename:
-                self.add_feature('filename', filename)
-            else:
-                self.add_feature('filename', 'file.bin')
             # Common JS redirect we can catch easily
             # NOTE: it is extremely fragile and doesn't work very often but is kinda better than nothing.
             # NOTE 2025-08-30: Also, finding that doesn't mean it is in a part of the code that is executed without user interaction. It can be triggered after a user fills a form for example.
@@ -622,9 +656,9 @@ class URLNode(HarTreeNode):
 class HostNode(HarTreeNode):
-    def __init__(self, capture_uuid: str, **kwargs: Any):
+    def __init__(self, capture_uuid: str):
         """Node of the Hostname Tree"""
-        super().__init__(capture_uuid=capture_uuid, **kwargs)
+        super().__init__(capture_uuid=capture_uuid)
         # Do not add the URLs in the json dump
         self.features_to_skip.add('urls')
@@ -672,6 +706,36 @@ class HostNode(HarTreeNode):
         """Number of unique 3rd party cookies received in the responses of all the URL nodes"""
         return sum(third for _, _, third in self.cookies_received if third)
+    @cached_property
+    def domain(self) -> str | None:
+        if hasattr(self, 'hostname_is_ip') or hasattr(self, 'file_on_disk'):
+            return None
+        try:
+            faup_hostname = Hostname(self.name)
+            if faup_hostname.domain:
+                return str(faup_hostname.domain)
+            self.logger.warning(f'No domain: "{self.name}"')
+            return None
+        except Exception as e:
+            self.logger.warning(f'Unable to parse Hostname "{self.name}": {e}')
+            return None
+    @cached_property
+    def tld(self) -> str | None:
+        if hasattr(self, 'hostname_is_ip') or hasattr(self, 'file_on_disk'):
+            return None
+        try:
+            faup_hostname = Hostname(self.name)
+            if faup_hostname.suffix:
+                return str(faup_hostname.suffix)
+            self.logger.warning(f'No domain: "{self.name}"')
+            return None
+        except Exception as e:
+            self.logger.warning(f'Unable to parse Hostname "{self.name}": {e}')
+            return None
     def add_url(self, url: URLNode) -> None:
         """Add a URL node to the Host node, initialize/update the features"""
         if not self.name:
@@ -682,6 +746,9 @@ class HostNode(HarTreeNode):
         if hasattr(url, 'hostname_is_ip') and url.hostname_is_ip:
             self.add_feature('hostname_is_ip', True)
+        if hasattr(url, 'file_on_disk') and url.file_on_disk:
+            self.add_feature('file_on_disk', True)
         self.urls.append(url)
         # Add to URLNode a reference to the HostNode UUID

{har2tree-1.36.1 → har2tree-1.36.3}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "har2tree"
-version = "1.36.1"
+version = "1.36.3"
 description = "HTTP Archive (HAR) to ETE Toolkit generator"
 authors = [
     {name="Raphaël Vinot", email="raphael.vinot@circl.lu"}
@@ -14,18 +14,18 @@ dynamic = [ "classifiers" ]
 dependencies = [
     "ete3 (>=3.1.3)",
     "beautifulsoup4[charset-normalizer,lxml] (>=4.14.3)",
-    "publicsuffixlist (>=1.0.2.20251217)",
-    "filetype (>=1.2.0)",
     # poetry up fails with the version of numpy forced for python < 3.11.
     # The work around is to comment it, run poetry up, uncomment it. and run poetry update.
     "numpy (>=2.2,<2.3) ; python_version < '3.11'",
-    "numpy (>=2.3.5) ; python_version >= \"3.11\" and python_version < \"4.0\"",
-    "w3lib (>=2.3.1)",
+    "numpy (>=2.4.2) ; python_version >= \"3.11\" and python_version < \"4.0\"",
+    "w3lib (>=2.4.0)",
     "tinycss2 (>=1.5.1)",
     "legacy-cgi (>=2.6.4) ; python_version >= \"3.13\" and python_version < \"4.0\"",
     "multipart (>=1.3.0,<2.0.0)",
     "json-stream (>=2.3.3,<3.0.0)",
-    "requests-toolbelt (>=1.0.0,<2.0.0)"
+    "requests-toolbelt (>=1.0.0,<2.0.0)",
+    "pyfaup-rs (>=0.2.1,<0.3.0)",
+    "pure-magic-rs (>=0.3.1,<0.4.0)"
 ]
 [project.urls]
@@ -44,12 +44,12 @@ classifiers = [
 ]
 [project.optional-dependencies]
-docs = ["Sphinx (>=9.0.4) ; python_version >= \"3.11\"", "six (>=1.17.0)"]
+docs = ["Sphinx (>=9.1.0) ; python_version >= \"3.12\"", "six (>=1.17.0)"]
 [tool.poetry.group.dev.dependencies]
 mypy = "^1.19.1"
 pytest-cov = "^7.0.0"
-coverage = "^7.13.0"
+coverage = "^7.13.2"
 types-beautifulsoup4 = "^4.12.0.20250516"
 [build-system]

{har2tree-1.36.1 → har2tree-1.36.3}/LICENSE RENAMED Viewed

File without changes

{har2tree-1.36.1 → har2tree-1.36.3}/README.md RENAMED Viewed

File without changes

{har2tree-1.36.1 → har2tree-1.36.3}/har2tree/__init__.py RENAMED Viewed

File without changes

{har2tree-1.36.1 → har2tree-1.36.3}/har2tree/parser.py RENAMED Viewed

File without changes

{har2tree-1.36.1 → har2tree-1.36.3}/har2tree/py.typed RENAMED Viewed

File without changes

har2tree 1.36.1__tar.gz → 1.36.3__tar.gz

har2tree 1.36.1__tar.gz → 1.36.3__tar.gz