har2tree 1.36.0__tar.gz → 1.36.2__tar.gz

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in the public registry.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: har2tree
-Version: 1.36.0
+Version: 1.36.2
 Summary: HTTP Archive (HAR) to ETE Toolkit generator
 License-Expression: BSD-3-Clause
 License-File: LICENSE
@@ -20,7 +20,7 @@ Classifier: Programming Language :: Python :: 3.14
 Classifier: Topic :: Internet
 Classifier: Topic :: Security
 Provides-Extra: docs
-Requires-Dist: Sphinx (>=9.0.4) ; (python_version >= "3.11") and (extra == "docs")
+Requires-Dist: Sphinx (>=9.1.0) ; (python_version >= "3.12") and (extra == "docs")
 Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.14.3)
 Requires-Dist: ete3 (>=3.1.3)
 Requires-Dist: filetype (>=1.2.0)
@@ -28,8 +28,8 @@ Requires-Dist: json-stream (>=2.3.3,<3.0.0)
 Requires-Dist: legacy-cgi (>=2.6.4) ; python_version >= "3.13" and python_version < "4.0"
 Requires-Dist: multipart (>=1.3.0,<2.0.0)
 Requires-Dist: numpy (>=2.2,<2.3) ; python_version < "3.11"
-Requires-Dist: numpy (>=2.3.5) ; python_version >= "3.11" and python_version < "3.15"
-Requires-Dist: publicsuffixlist (>=1.0.2.20251209)
+Requires-Dist: numpy (>=2.4.1) ; python_version >= "3.11" and python_version < "4.0"
+Requires-Dist: publicsuffixlist (>=1.0.2.20260126)
 Requires-Dist: requests-toolbelt (>=1.0.0,<2.0.0)
 Requires-Dist: six (>=1.17.0) ; extra == "docs"
 Requires-Dist: tinycss2 (>=1.5.1)
@@ -750,23 +750,47 @@ class Har2Tree:
             self.logger.warning(f'Wrong format for the frames ({type(self.har.frames)}), very old capture.')
         return self.url_tree
 
+    def _guess_best_node_for_partial_referer(self, node: URLNode, potential_parents: list[URLNode]) -> URLNode:
+        # we have more than one node with the hostname of the referer *and* content.
+        # 2025-12-17:
+        # 1. find the deepest HTML node in the list
+        for pp in reversed(potential_parents):
+            if 'html' in pp.mimetype:
+                return pp
+        else:
+            # 2. if there are no HTML node anywhere in the list, attach to the deepest node
+            return potential_parents[-1]
+
     @trace_make_subtree_fallback
     def _make_subtree_fallback(self, node: URLNode, dev_debug: bool=False) -> None:
         if hasattr(node, 'referer'):
             # 2022-04-28: the node has a referer, but for some reason, it could't be attached to the tree
             # Probable reason: the referer is a part of the URL (hostname)
-            # FIXME: this is a very dirty fix, but I'm not sure we can do it any better
             if (referer_hostname := urlparse(node.referer).hostname):
-                # the referer has a hostname
                 if (nodes_with_hostname := self.url_tree.search_nodes(hostname=referer_hostname)):
-                    # the hostname has at least a node in the tree
-                    for node_with_hostname in nodes_with_hostname:
-                        if not node_with_hostname.empty_response:
-                            # we got an non-empty response, breaking
-                            break
-                    # attach to the the first response with something, or to whatever we get.
-                    self._make_subtree(node_with_hostname, [node], fallback=True)
-                    return
+                    attach_to: URLNode
+                    # 2025-12-17: we have at least one node with that hostname.
+                    if len(nodes_with_hostname) == 1:
+                        # That's the only one, use it
+                        attach_to = nodes_with_hostname[0]
+                    else:
+                        # check if there are empty nodes
+                        if (nodes_with_hostname_and_response := [n for n in nodes_with_hostname if not n.empty_response]):
+                            if len(nodes_with_hostname_and_response) == 1:
+                                attach_to = nodes_with_hostname_and_response[0]
+                            else:
+                                # multiple non-empty nodes with that hostname, this is the more difficult one
+                                attach_to = self._guess_best_node_for_partial_referer(node, nodes_with_hostname_and_response)
+                        else:
+                            # more than one node with that hostname, but they're all empty, attach to the first one
+                            attach_to = nodes_with_hostname[0]
+                    return self._make_subtree(attach_to, [node], fallback=True)
+                else:
+                    # no node with that hostname at all, this should not happen
+                    self.logger.warning(f'Unable to find any node with the hostname {referer_hostname}, despites it being set as referer.')
+            else:
+                # the referer has no hostname and it is fascinating
+                self.logger.warning(f'Unable to get hostname out of referer: {node.referer}')
 
         # Sometimes, the har has a list of pages, generally when we have HTTP redirects.
         # IF we have more than one page in the list
@@ -774,7 +798,7 @@ class Har2Tree:
         # AND we already have a node in the tree with this pageref
         # => attach to that node.
         if ('pages' in self.har.har['log'] and len(self.har.har['log']['pages']) > 1
-                and node.pageref != self.har.har['log']['pages'][0]
+                and hasattr(node, 'pageref') and node.pageref != self.har.har['log']['pages'][0]
                 and self.pages_root[node.pageref] != node.uuid):
             # In that case, we check if there is already a page with the pageref of the orphan node,
             # and attach the node to that.
@@ -800,7 +824,7 @@ class Har2Tree:
             # No luck, the node is root for this pageref, let's attach it to the prior page in the list, or the very first node (tree root)
             page_before = self.har.har['log']['pages'][0]
             for page in self.har.har['log']['pages'][1:]:
-                if page['id'] == node.pageref:
+                if hasattr(node, 'pageref') and page['id'] == node.pageref:
                     break
             # Sometimes, the page listed in the list of pages is not related to
             # any of the entries. Go figure what happened.
@@ -929,7 +953,6 @@ class Har2Tree:
             if dev_debug:
                 self.logger.warning(f'Found via initiator from {unode.name} to {matching_urls}.')
             self._make_subtree(unode, matching_urls)
-
         # The node can have a redirect, but also trigger ressources refering to themselves, we need to trigger this code on each node.
         if self.all_initiator_url.get(unode.name):
            # The URL (unode.name) is in the list of known urls initiating calls
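
The `_make_subtree_fallback` refactor above replaces the old "first non-empty node wins" loop with an explicit selection order, delegating the hardest case to the new `_guess_best_node_for_partial_referer` helper. A condensed sketch of that order follows; `Candidate` and `pick_parent` are illustrative stand-ins for this note, not har2tree API:

    from dataclasses import dataclass

    @dataclass
    class Candidate:
        # minimal stand-in for a URLNode: only the two attributes the fallback looks at
        empty_response: bool
        mimetype: str

    def pick_parent(candidates: list[Candidate]) -> Candidate:
        # candidates = nodes sharing the referer's hostname, shallowest first
        if len(candidates) == 1:
            return candidates[0]               # only one match: use it
        non_empty = [c for c in candidates if not c.empty_response]
        if not non_empty:
            return candidates[0]               # every match is empty: take the first
        if len(non_empty) == 1:
            return non_empty[0]                # exactly one match with content: use it
        for c in reversed(non_empty):          # several matches with content:
            if 'html' in c.mimetype:           # prefer the deepest HTML node,
                return c
        return non_empty[-1]                   # otherwise the deepest node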
@@ -268,7 +268,7 @@ def find_identifiers(soup: BeautifulSoup) -> dict[str, list[str]] | None:
     # This is beta and kinda fragile, but it's going to find (most) of the google tag IDs
     # https://support.google.com/google-ads/answer/12326985?hl=en_us_us
     # NOTE: the doc says 9 X, but all the examples I found have 10 X so we cannot trust it
-    if google_tag_ids := set(re.findall(r"(?:G-|AW-|GA-|UA-)\w{9,13}", str(soup))):
+    if google_tag_ids := set(re.findall(r"(?:G-|AW-|GA-|UA-|GTM-)\w{9,15}", str(soup))):
         blocklist = {'UA-Compatible'}
         google_tag_ids -= blocklist
         if google_tag_ids:
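
The widened identifier pattern above adds the GTM- prefix and allows up to 15 trailing word characters. A quick check of what it now catches; the sample markup and IDs are made up for illustration:

    import re

    # same pattern as the added line above; the HTML snippet and IDs are invented
    pattern = re.compile(r"(?:G-|AW-|GA-|UA-|GTM-)\w{9,15}")
    sample = '<script>gtag("config", "G-ABCDE12345");</script><div data-tag="GTM-ABCDEFGH12"></div>'
    print(set(pattern.findall(sample)))  # {'G-ABCDE12345', 'GTM-ABCDEFGH12'}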
@@ -1,6 +1,6 @@
 [project]
 name = "har2tree"
-version = "1.36.0"
+version = "1.36.2"
 description = "HTTP Archive (HAR) to ETE Toolkit generator"
 authors = [
     {name="Raphaël Vinot", email="raphael.vinot@circl.lu"}
@@ -14,12 +14,12 @@ dynamic = [ "classifiers" ]
 dependencies = [
     "ete3 (>=3.1.3)",
     "beautifulsoup4[charset-normalizer,lxml] (>=4.14.3)",
-    "publicsuffixlist (>=1.0.2.20251209)",
+    "publicsuffixlist (>=1.0.2.20260126)",
     "filetype (>=1.2.0)",
     # poetry up fails with the version of numpy forced for python < 3.11.
     # The work around is to comment it, run poetry up, uncomment it. and run poetry update.
     "numpy (>=2.2,<2.3) ; python_version < '3.11'",
-    "numpy (>=2.3.5) ; python_version >= \"3.11\" and python_version < \"3.15\"",
+    "numpy (>=2.4.1) ; python_version >= \"3.11\" and python_version < \"4.0\"",
     "w3lib (>=2.3.1)",
     "tinycss2 (>=1.5.1)",
     "legacy-cgi (>=2.6.4) ; python_version >= \"3.13\" and python_version < \"4.0\"",
@@ -44,12 +44,12 @@ classifiers = [
 ]
 
 [project.optional-dependencies]
-docs = ["Sphinx (>=9.0.4) ; python_version >= \"3.11\"", "six (>=1.17.0)"]
+docs = ["Sphinx (>=9.1.0) ; python_version >= \"3.12\"", "six (>=1.17.0)"]
 
 [tool.poetry.group.dev.dependencies]
-mypy = "^1.19.0"
+mypy = "^1.19.1"
 pytest-cov = "^7.0.0"
-coverage = "^7.13.0"
+coverage = "^7.13.2"
 types-beautifulsoup4 = "^4.12.0.20250516"
 
 [build-system]
The remaining five files in the archive are unchanged between the two versions.