har2tree 1.32.0__tar.gz → 1.34.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {har2tree-1.32.0 → har2tree-1.34.2}/PKG-INFO +11 -12
- {har2tree-1.32.0 → har2tree-1.34.2}/har2tree/har2tree.py +167 -14
- {har2tree-1.32.0 → har2tree-1.34.2}/har2tree/helper.py +2 -1
- {har2tree-1.32.0 → har2tree-1.34.2}/har2tree/nodes.py +2 -1
- {har2tree-1.32.0 → har2tree-1.34.2}/pyproject.toml +12 -13
- {har2tree-1.32.0 → har2tree-1.34.2}/LICENSE +0 -0
- {har2tree-1.32.0 → har2tree-1.34.2}/README.md +0 -0
- {har2tree-1.32.0 → har2tree-1.34.2}/har2tree/__init__.py +0 -0
- {har2tree-1.32.0 → har2tree-1.34.2}/har2tree/parser.py +0 -0
- {har2tree-1.32.0 → har2tree-1.34.2}/har2tree/py.typed +0 -0
{har2tree-1.32.0 → har2tree-1.34.2}/PKG-INFO

@@ -1,36 +1,35 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: har2tree
-Version: 1.32.0
+Version: 1.34.2
 Summary: HTTP Archive (HAR) to ETE Toolkit generator
-License: BSD-3-Clause
+License-Expression: BSD-3-Clause
+License-File: LICENSE
 Author: Raphaël Vinot
 Author-email: raphael.vinot@circl.lu
-Requires-Python: >=3.
+Requires-Python: >=3.10,<3.15
 Classifier: Intended Audience :: Information Technology
 Classifier: Intended Audience :: Science/Research
 Classifier: Intended Audience :: Telecommunications Industry
-Classifier: License :: OSI Approved :: BSD License
 Classifier: Operating System :: POSIX :: Linux
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
 Classifier: Topic :: Internet
 Classifier: Topic :: Security
 Provides-Extra: docs
 Requires-Dist: Sphinx (>=8.2.3) ; (python_version >= "3.11") and (extra == "docs")
-Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.
+Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.14.2)
 Requires-Dist: ete3 (>=3.1.3)
 Requires-Dist: filetype (>=1.2.0)
 Requires-Dist: json-stream (>=2.3.3,<3.0.0)
-Requires-Dist: legacy-cgi (>=2.6.
+Requires-Dist: legacy-cgi (>=2.6.4) ; python_version >= "3.13" and python_version < "4.0"
 Requires-Dist: multipart (>=1.3.0,<2.0.0)
-Requires-Dist: numpy (
-Requires-Dist: numpy (
-Requires-Dist:
-Requires-Dist: publicsuffixlist (>=1.0.2.20250824)
+Requires-Dist: numpy (>=2.2,<2.3) ; python_version < "3.11"
+Requires-Dist: numpy (>=2.3.4) ; python_version >= "3.11" and python_version < "4.0"
+Requires-Dist: publicsuffixlist (>=1.0.2.20251115)
 Requires-Dist: requests-toolbelt (>=1.0.0,<2.0.0)
 Requires-Dist: six (>=1.17.0) ; extra == "docs"
 Requires-Dist: tinycss2 (>=1.4.0)
{har2tree-1.32.0 → har2tree-1.34.2}/har2tree/har2tree.py

@@ -13,7 +13,9 @@ from functools import wraps, lru_cache
 from io import BytesIO
 from operator import itemgetter
 from pathlib import Path
-from typing import Any,
+from typing import Any, TypedDict
+from collections.abc import Iterator
+from collections.abc import Callable
 from urllib.parse import unquote_plus, urlparse

 from .helper import rebuild_url, Har2TreeError, Har2TreeLogAdapter
@@ -58,7 +60,8 @@ def trace_make_subtree_fallback(method: Callable[..., None]) -> Callable[..., None]:

 def trace_make_subtree(method: Callable[..., None]) -> Callable[..., None]:
     @wraps(method)
-    def _impl(self: Any, root: URLNode, nodes_to_attach: list[URLNode] | None=None,
+    def _impl(self: Any, root: URLNode, nodes_to_attach: list[URLNode] | None=None,
+              dev_debug: bool=False, fallback: bool=False) -> None:
         if dev_debug_mode:
             __load_debug_files()
         if dev_debug_url and root.name == dev_debug_url or nodes_to_attach is not None and any(True for u in nodes_to_attach if u.name == dev_debug_url):
@@ -67,7 +70,7 @@ def trace_make_subtree(method: Callable[..., None]) -> Callable[..., None]:
         elif dev_debug_hostname and root.hostname == dev_debug_hostname or nodes_to_attach is not None and any(True for u in nodes_to_attach if u.hostname == dev_debug_hostname):
             root.logger.warning(f'Debugging Hostname: {dev_debug_hostname}.')
             dev_debug = True
-        return method(self, root, nodes_to_attach, dev_debug)
+        return method(self, root, nodes_to_attach, dev_debug, fallback)
     return _impl

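The wrapper above keeps an explicit signature, which is why adding the `fallback` flag touches both `_impl` and the final `method(...)` call. As a minimal sketch of the same tracing-decorator pattern (not har2tree's code; the `Demo` class and printed output are invented), a `*args`/`**kwargs` wrapper forwards a new keyword without any signature change:

```python
from collections.abc import Callable
from functools import wraps
from typing import Any


def trace_calls(method: Callable[..., None]) -> Callable[..., None]:
    # Forward everything: a new keyword like `fallback` passes straight through.
    @wraps(method)
    def _impl(self: Any, *args: Any, **kwargs: Any) -> None:
        print(f'calling {method.__name__} args={args} kwargs={kwargs}')
        return method(self, *args, **kwargs)
    return _impl


class Demo:
    @trace_calls
    def _make_subtree(self, root: str, nodes_to_attach: list[str] | None = None,
                      dev_debug: bool = False, fallback: bool = False) -> None:
        pass


Demo()._make_subtree('https://example.com/', fallback=True)
# calling _make_subtree args=('https://example.com/',) kwargs={'fallback': True}
```

har2tree spells the parameters out instead, so the debug hooks can inspect `root` and `nodes_to_attach` by name before forwarding them.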
@@ -84,6 +87,15 @@ def __load_debug_files() -> None:
         dev_debug_hostname = f.read().strip()


+# NOTE: Copy from PlaywrightCapture to avoid extra dep
+class FramesResponse(TypedDict, total=False):
+
+    name: str
+    url: str
+    content: str | None
+    children: list[FramesResponse] | None
+
+
 class HarFile():

     def __init__(self, harfile: Path, capture_uuid: str):
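For orientation, here is a minimal sketch (not part of the package) of the nested shape `FramesResponse` describes; the sample frame tree and the `frame_urls` helper are invented:

```python
from __future__ import annotations

from typing import TypedDict


class FramesResponse(TypedDict, total=False):
    # Same shape as the TypedDict added above: every key is optional.
    name: str
    url: str
    content: str | None
    children: list[FramesResponse] | None


# Invented example of what a <root_name>.frames.json payload could look like:
# a main frame with one nested iframe.
example: FramesResponse = {
    "name": "main",
    "url": "https://example.com/",
    "content": "<html>main document</html>",
    "children": [
        {
            "name": "widget",
            "url": "https://widgets.example.net/embed",
            "content": "<html>widget</html>",
            "children": None,
        }
    ],
}


def frame_urls(frame: FramesResponse) -> list[str]:
    """Collect every 'url' in the frame tree, depth-first."""
    urls = [frame["url"]] if frame.get("url") else []
    for child in frame.get("children") or []:
        urls.extend(frame_urls(child))
    return urls


print(frame_urls(example))
# ['https://example.com/', 'https://widgets.example.net/embed']
```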
@@ -118,7 +130,11 @@ class HarFile():
                 last_redirect = unquote_plus(_lr.read())
             self.final_redirect: str = last_redirect
             if not self._search_final_redirect():
-
+                if last_redirect.startswith('chrome') or last_redirect.startswith('about'):
+                    # the capture failed.
+                    pass
+                else:
+                    self.logger.info(f'Final redirect URL from address bar not in tree: {last_redirect}')
         else:
             self.logger.debug('No last_redirect file available.')
             self.final_redirect = ''
@@ -131,6 +147,14 @@ class HarFile():
             self.logger.debug('No cookies file available.')
             self.cookies = []

+        framesfile = self.path.parent / f'{root_name}.frames.json'
+        if framesfile.is_file():
+            with framesfile.open() as c:
+                self.frames: FramesResponse = json.load(c)
+        else:
+            self.logger.debug('No frames file available.')
+            self.frames = {}
+
         dlfile = self.path.parent / f'{root_name}.data'
         dlfilename = self.path.parent / f'{root_name}.data.filename'
         self.downloaded_file: BytesIO | None
@@ -309,8 +333,9 @@ class Har2Tree:
         self.pages_root: dict[str, str] = {}

         self.all_redirects: list[str] = []
-
-        self.
+        # 2025-11-16: make values of referers and initiators sets because there will be duplicates
+        self.all_referer: dict[str, set[str]] = defaultdict(set)
+        self.all_initiator_url: dict[str, set[str]] = defaultdict(set)
         self._load_url_entries()

         # Generate cookies lookup tables
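A quick illustration of why the switch from list to set values matters: with `defaultdict(list)` a URL requested twice with the same referer shows up twice, while `defaultdict(set)` deduplicates. The sample URLs are made up:

```python
from collections import defaultdict

# Old shape: duplicates accumulate when the same URL is requested
# several times with the same referer.
all_referer_list: dict[str, list[str]] = defaultdict(list)
# New shape (as in 1.34.x): duplicates collapse automatically.
all_referer_set: dict[str, set[str]] = defaultdict(set)

requests = [
    ("https://example.com/", "https://example.com/app.js"),
    ("https://example.com/", "https://example.com/app.js"),  # duplicate entry
]

for referer, url in requests:
    all_referer_list[referer].append(url)
    all_referer_set[referer].add(url)

print(all_referer_list["https://example.com/"])  # two identical entries
print(all_referer_set["https://example.com/"])   # one entry
```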
@@ -401,6 +426,62 @@ class Har2Tree:

         self.url_tree = self._nodes_list.pop(0)

+    def _load_iframes(self, current: URLNode, frames: FramesResponse) -> None:
+        if not frames.get('content') or frames['content'] is None:
+            # NOTE: debug stuff, no content makes it pretty useless.
+            if frames.get('url'):
+                if frames['url'] == "about:blank":
+                    self.logger.info('Got a frame to about:blank with no content.')
+                else:
+                    u = unquote_plus(frames['url'])
+                    self.logger.warning(f'Got a url ({u}) for the frame, but no content')
+            else:
+                self.logger.info('Got a frame, but no content.')
+            return
+
+        if (frames.get('url')
+                and not (frames['url'] in ['about:blank']  # not loading anything, same as empty
+                         or frames['url'].startswith('data')  # base64 encoded content
+                         or frames['url'].startswith('chrome-error')  # not in the HAR/tree
+                         or frames['url'].startswith('blob')  # blobs aren't URLs
+                         )):
+            u = unquote_plus(frames['url'])
+            possible_child_name = {u, u.split('#', 1)[0]}
+            # this url should be in a node directly attached to that one
+            # we need to find that node
+            for child in current.traverse():
+                if child.name in possible_child_name:
+                    self.logger.debug(f'Found URL "{u}".')
+                    # Found the node, adding the content
+                    if not hasattr(child, 'rendered_frame'):
+                        child.rendered_frame = []
+                    child.rendered_frame.append(BytesIO(frames['content'].encode()))
+                    # and mark the node as iframe
+                    child.add_feature('iframe', True)
+                    # if there are children, use that node as parent and call the current method recursvely
+                    if f_children := frames.get('children'):
+                        for f_child in f_children:
+                            self._load_iframes(child, f_child)
+                    break
+            else:
+                # Couldn'd find the node Oo
+                to_print = ', '.join(possible_child_name)
+                children_to_print = ', '.join([child.name for child in current.traverse()])
+                self.logger.warning(f'Unable to find "{to_print}" in the children of "{current.name}" - {children_to_print}')
+        else:
+            self.logger.debug(f'"{current.name}" contains an iFrame.')
+            # No URL, this frame is directly in the parent frame.
+            if not hasattr(current, 'rendered_frame'):
+                current.rendered_frame = []
+            current.rendered_frame.append(BytesIO(frames['content'].encode()))
+            self.logger.debug(f'"{current.name}" has {len(current.rendered_frame)} iFrames.')
+            # and mark the node as iframe
+            current.add_feature('iframe', True)
+            # if there are children, use that node as parent and call the current method recursvely
+            if f_children := frames.get('children'):
+                for f_child in f_children:
+                    self._load_iframes(current, f_child)
+
     @property
     def initial_referer(self) -> str | None:
         '''The referer passed to the first URL in the tree'''
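The recursion in `_load_iframes` boils down to: match the frame URL (with and without its fragment) against node names, attach the rendered content, then recurse into `children`. A self-contained sketch of that matching logic, using plain dicts instead of ete3 `URLNode` objects (node names and frame data invented):

```python
from io import BytesIO

# Flat stand-in for the URL tree: node name -> list of rendered frame bodies.
rendered_frames: dict[str, list[BytesIO]] = {
    "https://example.com/": [],
    "https://example.com/widget": [],
}


def attach_frame(frame: dict, nodes: dict[str, list[BytesIO]]) -> None:
    url = frame.get("url")
    content = frame.get("content")
    if content and url:
        # Same trick as _load_iframes: match with and without the fragment.
        candidates = {url, url.split("#", 1)[0]}
        for name in nodes:
            if name in candidates:
                nodes[name].append(BytesIO(content.encode()))
                break
    for child in frame.get("children") or []:
        attach_frame(child, nodes)


attach_frame(
    {
        "url": "https://example.com/",
        "content": "<html>main</html>",
        "children": [{"url": "https://example.com/widget#frag", "content": "<html>w</html>"}],
    },
    rendered_frames,
)
print({name: len(bodies) for name, bodies in rendered_frames.items()})
# {'https://example.com/': 1, 'https://example.com/widget': 1}
```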
@@ -523,7 +604,7 @@ class Har2Tree:

             if hasattr(n, 'initiator_url'):
                 # The HAR file was created by chrome/chromium and we got the _initiator key
-                self.all_initiator_url[n.initiator_url].
+                self.all_initiator_url[n.initiator_url].add(n.name)

             if url_entry['startedDateTime'] in self.har.pages_start_times:
                 for page in self.har.pages_start_times[url_entry['startedDateTime']]:
@@ -536,7 +617,7 @@ class Har2Tree:
             if hasattr(n, 'referer') and i > 0:
                 # NOTE 2021-05-14: referer to self are a real thing: url -> POST to self
                 if n.name != n.referer or ('method' in n.request and n.request['method'] == 'POST'):
-                    self.all_referer[n.referer].
+                    self.all_referer[n.referer].add(n.name)

             self._nodes_list.append(n)
             self.all_url_requests[n.name].append(n)
@@ -617,6 +698,14 @@ class Har2Tree:
         for child_node_hostname, child_nodes_url in sub_roots.items():
             self.make_hostname_tree(child_nodes_url, child_node_hostname)

+    def _all_urlnodes_in_host_tree(self) -> None:
+        # debug: check if all the nodes in the URL tree are in the hostnode tree (they must have an UUID)
+        self.logger.warning('Validating host tree....')
+        for urlnode in self.url_tree.traverse():
+            if not hasattr(urlnode, 'hostnode_uuid'):
+                self.logger.error(f'URL Node not un host tree: {urlnode}')
+        self.logger.warning('host tree validated.')
+
     def make_tree(self) -> URLNode:
         """Build URL and Host trees"""
         self._make_subtree(self.url_tree)
@@ -647,6 +736,15 @@ class Har2Tree:
         # Initialize the hostname tree root
         self.hostname_tree.add_url(self.url_tree)
         self.make_hostname_tree(self.url_tree, self.hostname_tree)
+        if dev_debug_mode:
+            self._all_urlnodes_in_host_tree()
+        if isinstance(self.har.frames, dict):
+            if self.har.frames.get('children') and self.har.frames['children'] is not None:
+                # we have frames in the main one
+                for f_child in self.har.frames['children']:
+                    self._load_iframes(self.rendered_node, f_child)
+        else:
+            self.logger.warning(f'Wrong format for the frames ({type(self.har.frames)}), very old capture.')
         return self.url_tree

     @trace_make_subtree_fallback
@@ -664,7 +762,7 @@ class Har2Tree:
                     # we got an non-empty response, breaking
                     break
             # attach to the the first response with something, or to whatever we get.
-            self._make_subtree(node_with_hostname, [node])
+            self._make_subtree(node_with_hostname, [node], fallback=True)
             return

         # Sometimes, the har has a list of pages, generally when we have HTTP redirects.
@@ -682,7 +780,7 @@ class Har2Tree:
             page_root_node = self.get_url_node_by_uuid(self.pages_root[node.pageref])
             if dev_debug:
                 self.logger.warning(f'Failed to attach URLNode in the normal process, attaching node to page {node.pageref} - Node: {page_root_node.uuid} - {page_root_node.name}.')
-            self._make_subtree(page_root_node, [node])
+            self._make_subtree(page_root_node, [node], fallback=True)
         elif self.rendered_node != self.url_tree:
             # Generally, when we have a bunch of redirects, they (generally) do not branch out
             # before the final landing page *but* it is not always the case: some intermediary
@@ -694,7 +792,7 @@ class Har2Tree:
             # end of this method anyway
             if dev_debug:
                 self.logger.warning(f'Failed to attach URLNode in the normal process, attaching node to final redirect: {self.har.final_redirect}.')
-            self._make_subtree(self.rendered_node, [node])
+            self._make_subtree(self.rendered_node, [node], fallback=True)
         elif 'pages' in self.har.har['log']:
             # No luck, the node is root for this pageref, let's attach it to the prior page in the list, or the very first node (tree root)
             page_before = self.har.har['log']['pages'][0]
@@ -716,13 +814,42 @@ class Har2Tree:
                     # node to the root node
                     page_root_node = self.url_tree
                     self.logger.warning('The pages in the HAR are in in the wrong order, this should not happen but here we are')
-            self._make_subtree(page_root_node, [node])
+            self._make_subtree(page_root_node, [node], fallback=True)
         else:
             # no way to attach it to anything else, attach to the root node
-            self._make_subtree(self.url_tree, [node])
+            self._make_subtree(self.url_tree, [node], fallback=True)
+
+    def all_real_urls_in_children(self, frame: FramesResponse) -> Iterator[str]:
+        # from a frame, search all the real urls in each of the children, stop at the first one
+        if (frame.get('url') and frame['url'] is not None
+                and not (frame['url'] in ['about:blank', 'about:srcdoc']  # not loading anything, same as empty
+                         or frame['url'].startswith('data')  # base64 encoded content
+                         or frame['url'].startswith('chrome-error')  # not in the HAR/tree
+                         or frame['url'].startswith('blob'))):  # blobs aren't URLs
+            yield frame['url']
+        else:
+            # got no real URL, try the children
+            if frame.get('children') and frame['children'] is not None:
+                for c in frame['children']:
+                    yield from self.all_real_urls_in_children(c)
+
+    def search_in_frames(self, urls: set[str], frame: FramesResponse) -> Iterator[str]:
+        # If the frame doesn't have children, there are no potential URLs to attach
+        if not frame.get('children') or frame['children'] is None:
+            return None
+
+        if frame.get('url'):
+            u = unquote_plus(frame['url'])
+            if urls & {u, u.split('#', 1)[0]}:
+                # got a matching URL, get list of potential iframes urls
+                for c in frame['children']:
+                    yield from self.all_real_urls_in_children(c)
+        for c in frame['children']:
+            yield from self.search_in_frames(urls, c)

     @trace_make_subtree
-    def _make_subtree(self, root: URLNode, nodes_to_attach: list[URLNode] | None=None,
+    def _make_subtree(self, root: URLNode, nodes_to_attach: list[URLNode] | None=None,
+                      dev_debug: bool=False, fallback: bool=False) -> None:
         """Recursive method building each level of the tree"""
         matching_urls: list[URLNode]
         if nodes_to_attach is None:
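The two generators added above pair up: `search_in_frames` locates the frame whose URL matches the node being attached, and `all_real_urls_in_children` then yields the first "real" URL down each child branch, skipping `about:`, `data:`, `blob:` and `chrome-error:` frames. A plain-dict sketch of the same traversal (frame layout and URLs invented):

```python
from collections.abc import Iterator

SKIP_PREFIXES = ("data", "chrome-error", "blob")
SKIP_URLS = {"about:blank", "about:srcdoc"}


def real_urls(frame: dict) -> Iterator[str]:
    # Yield the frame's URL if it is a "real" one, otherwise descend.
    url = frame.get("url")
    if url and url not in SKIP_URLS and not url.startswith(SKIP_PREFIXES):
        yield url
    else:
        for child in frame.get("children") or []:
            yield from real_urls(child)


def search_in_frames(urls: set[str], frame: dict) -> Iterator[str]:
    children = frame.get("children") or []
    if not children:
        return
    u = frame.get("url", "")
    if urls & {u, u.split("#", 1)[0]}:
        # The current node hosts these children: surface their real URLs.
        for child in children:
            yield from real_urls(child)
    for child in children:
        yield from search_in_frames(urls, child)


tree = {
    "url": "https://example.com/",
    "children": [
        {"url": "about:blank", "children": [{"url": "https://cdn.example.net/frame.html"}]},
    ],
}
print(list(search_in_frames({"https://example.com/"}, tree)))
# ['https://cdn.example.net/frame.html']
```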
@@ -784,6 +911,26 @@ class Har2Tree:
             if unode.empty_response:
                 continue

+            # 2025-11-14
+            # the referer of an iframe is the hostname of the parent, even if the parent
+            # is a URL with a full path. Before using the referer, we need to check if we have
+            # the current url in the frame tree. If we do, find nodes (in the remaining list)
+            # with the URLs of the children - any fragment will be missing - and attach that node
+            possible_iframe_urls = {unode.name, unode.name.split('#', 1)[0]}
+            for possible_url in self.search_in_frames(possible_iframe_urls, self.har.frames):
+                cu = unquote_plus(possible_url)
+                for u in {cu, cu.split('#', 1)[0]}:
+                    if u not in self.all_url_requests:
+                        if '#' not in u:
+                            self.logger.info(f'"{u}" in the frames URLs, but not in the HAR.')
+                        continue
+                    matching_urls = [url_node for url_node in self.all_url_requests[u]
+                                     if url_node in self._nodes_list]
+                    self._nodes_list = [node for node in self._nodes_list if node not in matching_urls]
+                    if dev_debug:
+                        self.logger.warning(f'Found via initiator from {unode.name} to {matching_urls}.')
+                    self._make_subtree(unode, matching_urls)
+
             # The node can have a redirect, but also trigger ressources refering to themselves, we need to trigger this code on each node.
             if self.all_initiator_url.get(unode.name):
                 # The URL (unode.name) is in the list of known urls initiating calls
@@ -815,6 +962,12 @@ class Har2Tree:
             if hasattr(unode, 'external_ressources'):
                 # the url loads external things, and some of them have no referer....
                 for external_tag, links in unode.external_ressources.items():
+                    # 2025-11-06: skip full regex until we're calling this method in the fallback
+                    # the iframes will often (not always) have a referer set and the URL
+                    # might be found by the regex and it will not be attached at the
+                    # right place
+                    if external_tag == 'full_regex' and not fallback:
+                        continue
                     for link in links:
                         if link not in self.all_url_requests or link == self.har.final_redirect:
                             # We have a lot of false positives
{har2tree-1.32.0 → har2tree-1.34.2}/har2tree/helper.py

@@ -364,7 +364,8 @@ def find_external_ressources(mimetype: str, data: bytes, base_url: str, all_requ
     # link: https://www.w3schools.com/TAGs/tag_link.asp -> href
     # object: https://www.w3schools.com/TAGs/tag_object.asp -> data
     external_ressources: dict[str, list[str]] = {'img': [], 'script': [], 'video': [], 'audio': [],
-                                                 'iframe': [],
+                                                 'iframe': [],
+                                                 'embed': [], 'source': [],
                                                  'link': [],
                                                  'object': [],
                                                  'css': [],
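The two new keys cover `<embed>` and `<source>` tags, which both reference their target through a `src` attribute. A small illustration of the tags involved (not the package's extraction code; the HTML is made up, and it assumes beautifulsoup4 with lxml, both already dependencies):

```python
from bs4 import BeautifulSoup

html = """
<embed src="https://example.com/plugin.swf">
<video><source src="https://example.com/clip.mp4" type="video/mp4"></video>
"""
soup = BeautifulSoup(html, 'lxml')
external_ressources: dict[str, list[str]] = {'embed': [], 'source': []}
for tag_name in external_ressources:
    # Collect the src attribute of every matching tag.
    for tag in soup.find_all(tag_name, src=True):
        external_ressources[tag_name].append(tag['src'])
print(external_ressources)
# {'embed': ['https://example.com/plugin.swf'], 'source': ['https://example.com/clip.mp4']}
```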
{har2tree-1.32.0 → har2tree-1.34.2}/har2tree/nodes.py

@@ -485,6 +485,7 @@ class URLNode(HarTreeNode):

         # Common JS redirect we can catch easily
         # NOTE: it is extremely fragile and doesn't work very often but is kinda better than nothing.
+        # NOTE 2025-08-30: Also, finding that doesn't mean it is in a part of the code that is executed without user interaction. It can be triggered after a user fills a form for example.
         # Source: https://stackoverflow.com/questions/13363174/regular-expression-to-catch-as-many-javascript-redirections-as-possible
         regex = re.compile(br"""((location.href)|(window.location)|(location.replace)|(location.assign))(( ?= ?)|( ?\( ?))("|')([^'"]*)("|')( ?\) ?)?;""", re.I)
         matches = re.findall(regex, self.body.getvalue())
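The regex shown in the context above can be exercised directly; in each `findall()` tuple the quoted URL is the tenth group (index 9). The response body below is invented:

```python
import re

# Same pattern as in URLNode: catches common JS redirect assignments.
regex = re.compile(br"""((location.href)|(window.location)|(location.replace)|(location.assign))(( ?= ?)|( ?\( ?))("|')([^'"]*)("|')( ?\) ?)?;""", re.I)

body = b'<script>window.location = "https://example.com/landing"; doStuff();</script>'
matches = re.findall(regex, body)
print([m[9].decode() for m in matches])  # ['https://example.com/landing']
```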
@@ -586,7 +587,7 @@ class URLNode(HarTreeNode):
             return href

         if not hasattr(self, 'rendered_html') or not self.rendered_html:
-            raise Har2TreeError('Not the node of a page rendered ({self.uuid}), invalid request.')
+            raise Har2TreeError(f'Not the node of a page rendered ({self.uuid}), invalid request.')
         urls: set[str] = set()

         # The simple ones: the links.
{har2tree-1.32.0 → har2tree-1.34.2}/pyproject.toml

@@ -1,32 +1,31 @@
 [project]
 name = "har2tree"
-version = "1.32.0"
+version = "1.34.2"
 description = "HTTP Archive (HAR) to ETE Toolkit generator"
 authors = [
     {name="Raphaël Vinot", email="raphael.vinot@circl.lu"}
 ]
 license = "BSD-3-Clause"
 readme = "README.md"
-requires-python = ">=3.
+requires-python = ">=3.10,<3.15"

 dynamic = [ "classifiers" ]

 dependencies = [
     "ete3 (>=3.1.3)",
-    "beautifulsoup4[charset-normalizer,lxml] (>=4.
-    "publicsuffixlist (>=1.0.2.
+    "beautifulsoup4[charset-normalizer,lxml] (>=4.14.2)",
+    "publicsuffixlist (>=1.0.2.20251115)",
     "filetype (>=1.2.0)",
-    # poetry up fails with the version of numpy forced for python < 3.
+    # poetry up fails with the version of numpy forced for python < 3.11.
     # The work around is to comment it, run poetry up, uncomment it. and run poetry update.
-    "numpy (
-    "numpy (
-    "numpy (>=2.3.2) ; python_version >= \"3.11\"",
+    "numpy (>=2.2,<2.3) ; python_version < '3.11'",
+    "numpy (>=2.3.4) ; python_version >= \"3.11\" and python_version < \"4.0\"",
     "w3lib (>=2.3.1)",
     "tinycss2 (>=1.4.0)",
-    "legacy-cgi (>=2.6.
+    "legacy-cgi (>=2.6.4) ; python_version >= \"3.13\" and python_version < \"4.0\"",
     "multipart (>=1.3.0,<2.0.0)",
     "json-stream (>=2.3.3,<3.0.0)",
-    "requests-toolbelt (>=1.0.0,<2.0.0)"
+    "requests-toolbelt (>=1.0.0,<2.0.0)"
 ]

 [project.urls]
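The numpy and legacy-cgi pins rely on PEP 508 environment markers; to check how a marker resolves on a given interpreter, the third-party `packaging` library evaluates them (marker strings copied from the pins above):

```python
from packaging.markers import Marker

# Evaluate against the running interpreter's environment.
print(Marker('python_version < "3.11"').evaluate())
print(Marker('python_version >= "3.11" and python_version < "4.0"').evaluate())
# On a Python 3.12 interpreter this prints: False, then True.
```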
@@ -48,9 +47,9 @@ classifiers = [
 docs = ["Sphinx (>=8.2.3) ; python_version >= \"3.11\"", "six (>=1.17.0)"]

 [tool.poetry.group.dev.dependencies]
-mypy = "^1.
-pytest-cov = "^
-coverage = "^7.
+mypy = "^1.18.2"
+pytest-cov = "^7.0.0"
+coverage = "^7.11.3"
 types-beautifulsoup4 = "^4.12.0.20250516"

 [build-system]
Files without changes: LICENSE, README.md, har2tree/__init__.py, har2tree/parser.py, har2tree/py.typed