har2tree 1.36.0__tar.gz → 1.36.2__tar.gz

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in the public registry.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: har2tree
-Version: 1.36.0
+Version: 1.36.2
 Summary: HTTP Archive (HAR) to ETE Toolkit generator
 License-Expression: BSD-3-Clause
 License-File: LICENSE
@@ -20,7 +20,7 @@ Classifier: Programming Language :: Python :: 3.14
 Classifier: Topic :: Internet
 Classifier: Topic :: Security
 Provides-Extra: docs
-Requires-Dist: Sphinx (>=9.0.4) ; (python_version >= "3.11") and (extra == "docs")
+Requires-Dist: Sphinx (>=9.1.0) ; (python_version >= "3.12") and (extra == "docs")
 Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.14.3)
 Requires-Dist: ete3 (>=3.1.3)
 Requires-Dist: filetype (>=1.2.0)
@@ -28,8 +28,8 @@ Requires-Dist: json-stream (>=2.3.3,<3.0.0)
 Requires-Dist: legacy-cgi (>=2.6.4) ; python_version >= "3.13" and python_version < "4.0"
 Requires-Dist: multipart (>=1.3.0,<2.0.0)
 Requires-Dist: numpy (>=2.2,<2.3) ; python_version < "3.11"
-Requires-Dist: numpy (>=2.3.5) ; python_version >= "3.11" and python_version < "3.15"
-Requires-Dist: publicsuffixlist (>=1.0.2.20251209)
+Requires-Dist: numpy (>=2.4.1) ; python_version >= "3.11" and python_version < "4.0"
+Requires-Dist: publicsuffixlist (>=1.0.2.20260126)
 Requires-Dist: requests-toolbelt (>=1.0.0,<2.0.0)
 Requires-Dist: six (>=1.17.0) ; extra == "docs"
 Requires-Dist: tinycss2 (>=1.5.1)
@@ -750,23 +750,47 @@ class Har2Tree:
             self.logger.warning(f'Wrong format for the frames ({type(self.har.frames)}), very old capture.')
         return self.url_tree
 
+    def _guess_best_node_for_partial_referer(self, node: URLNode, potential_parents: list[URLNode]) -> URLNode:
+        # we have more than one node with the hostname of the referer *and* content.
+        # 2025-12-17:
+        # 1. find the deepest HTML node in the list
+        for pp in reversed(potential_parents):
+            if 'html' in pp.mimetype:
+                return pp
+        else:
+            # 2. if there are no HTML node anywhere in the list, attach to the deepest node
+            return potential_parents[-1]
+
     @trace_make_subtree_fallback
     def _make_subtree_fallback(self, node: URLNode, dev_debug: bool=False) -> None:
         if hasattr(node, 'referer'):
             # 2022-04-28: the node has a referer, but for some reason, it could't be attached to the tree
             # Probable reason: the referer is a part of the URL (hostname)
-            # FIXME: this is a very dirty fix, but I'm not sure we can do it any better
             if (referer_hostname := urlparse(node.referer).hostname):
-                # the referer has a hostname
                 if (nodes_with_hostname := self.url_tree.search_nodes(hostname=referer_hostname)):
-                    # the hostname has at least a node in the tree
-                    for node_with_hostname in nodes_with_hostname:
-                        if not node_with_hostname.empty_response:
-                            # we got an non-empty response, breaking
-                            break
-                    # attach to the the first response with something, or to whatever we get.
-                    self._make_subtree(node_with_hostname, [node], fallback=True)
-                    return
+                    attach_to: URLNode
+                    # 2025-12-17: we have at least one node with that hostname.
+                    if len(nodes_with_hostname) == 1:
+                        # That's the only one, use it
+                        attach_to = nodes_with_hostname[0]
+                    else:
+                        # check if there are empty nodes
+                        if (nodes_with_hostname_and_response := [n for n in nodes_with_hostname if not n.empty_response]):
+                            if len(nodes_with_hostname_and_response) == 1:
+                                attach_to = nodes_with_hostname_and_response[0]
+                            else:
+                                # multiple non-empty nodes with that hostname, this is the more difficult one
+                                attach_to = self._guess_best_node_for_partial_referer(node, nodes_with_hostname_and_response)
+                        else:
+                            # more than one node with that hostname, but they're all empty, attach to the first one
+                            attach_to = nodes_with_hostname[0]
+                    return self._make_subtree(attach_to, [node], fallback=True)
+                else:
+                    # no node with that hostname at all, this should not happen
+                    self.logger.warning(f'Unable to find any node with the hostname {referer_hostname}, despites it being set as referer.')
+            else:
+                # the referer has no hostname and it is fascinating
+                self.logger.warning(f'Unable to get hostname out of referer: {node.referer}')
 
         # Sometimes, the har has a list of pages, generally when we have HTTP redirects.
         # IF we have more than one page in the list
@@ -774,7 +798,7 @@ class Har2Tree:
         # AND we already have a node in the tree with this pageref
         # => attach to that node.
         if ('pages' in self.har.har['log'] and len(self.har.har['log']['pages']) > 1
-                and node.pageref != self.har.har['log']['pages'][0]
+                and hasattr(node, 'pageref') and node.pageref != self.har.har['log']['pages'][0]
                 and self.pages_root[node.pageref] != node.uuid):
             # In that case, we check if there is already a page with the pageref of the orphan node,
             # and attach the node to that.
@@ -800,7 +824,7 @@ class Har2Tree:
             # No luck, the node is root for this pageref, let's attach it to the prior page in the list, or the very first node (tree root)
             page_before = self.har.har['log']['pages'][0]
             for page in self.har.har['log']['pages'][1:]:
-                if page['id'] == node.pageref:
+                if hasattr(node, 'pageref') and page['id'] == node.pageref:
                     break
             # Sometimes, the page listed in the list of pages is not related to
             # any of the entries. Go figure what happened.
@@ -929,7 +953,6 @@ class Har2Tree:
             if dev_debug:
                 self.logger.warning(f'Found via initiator from {unode.name} to {matching_urls}.')
             self._make_subtree(unode, matching_urls)
-
         # The node can have a redirect, but also trigger ressources refering to themselves, we need to trigger this code on each node.
         if self.all_initiator_url.get(unode.name):
            # The URL (unode.name) is in the list of known urls initiating calls
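
The `_make_subtree_fallback` refactor above replaces the old "first non-empty node wins" loop with an explicit selection order, delegating the hardest case to the new `_guess_best_node_for_partial_referer` helper. A condensed sketch of that order follows; `Candidate` and `pick_parent` are illustrative stand-ins for this note, not har2tree API:

    from dataclasses import dataclass

    @dataclass
    class Candidate:
        # minimal stand-in for a URLNode: only the two attributes the fallback looks at
        empty_response: bool
        mimetype: str

    def pick_parent(candidates: list[Candidate]) -> Candidate:
        # candidates = nodes sharing the referer's hostname, shallowest first
        if len(candidates) == 1:
            return candidates[0]               # only one match: use it
        non_empty = [c for c in candidates if not c.empty_response]
        if not non_empty:
            return candidates[0]               # every match is empty: take the first
        if len(non_empty) == 1:
            return non_empty[0]                # exactly one match with content: use it
        for c in reversed(non_empty):          # several matches with content:
            if 'html' in c.mimetype:           # prefer the deepest HTML node,
                return c
        return non_empty[-1]                   # otherwise the deepest node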
@@ -268,7 +268,7 @@ def find_identifiers(soup: BeautifulSoup) -> dict[str, list[str]] | None:
     # This is beta and kinda fragile, but it's going to find (most) of the google tag IDs
     # https://support.google.com/google-ads/answer/12326985?hl=en_us_us
     # NOTE: the doc says 9 X, but all the examples I found have 10 X so we cannot trust it
-    if google_tag_ids := set(re.findall(r"(?:G-|AW-|GA-|UA-)\w{9,13}", str(soup))):
+    if google_tag_ids := set(re.findall(r"(?:G-|AW-|GA-|UA-|GTM-)\w{9,15}", str(soup))):
         blocklist = {'UA-Compatible'}
         google_tag_ids -= blocklist
         if google_tag_ids:
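
The widened identifier pattern above adds the GTM- prefix and allows up to 15 trailing word characters. A quick check of what it now catches; the sample markup and IDs are made up for illustration:

    import re

    # same pattern as the added line above; the HTML snippet and IDs are invented
    pattern = re.compile(r"(?:G-|AW-|GA-|UA-|GTM-)\w{9,15}")
    sample = '<script>gtag("config", "G-ABCDE12345");</script><div data-tag="GTM-ABCDEFGH12"></div>'
    print(set(pattern.findall(sample)))  # {'G-ABCDE12345', 'GTM-ABCDEFGH12'}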
@@ -1,6 +1,6 @@
 [project]
 name = "har2tree"
-version = "1.36.0"
+version = "1.36.2"
 description = "HTTP Archive (HAR) to ETE Toolkit generator"
 authors = [
     {name="Raphaël Vinot", email="raphael.vinot@circl.lu"}
@@ -14,12 +14,12 @@ dynamic = [ "classifiers" ]
 dependencies = [
     "ete3 (>=3.1.3)",
     "beautifulsoup4[charset-normalizer,lxml] (>=4.14.3)",
-    "publicsuffixlist (>=1.0.2.20251209)",
+    "publicsuffixlist (>=1.0.2.20260126)",
     "filetype (>=1.2.0)",
     # poetry up fails with the version of numpy forced for python < 3.11.
     # The work around is to comment it, run poetry up, uncomment it. and run poetry update.
     "numpy (>=2.2,<2.3) ; python_version < '3.11'",
-    "numpy (>=2.3.5) ; python_version >= \"3.11\" and python_version < \"3.15\"",
+    "numpy (>=2.4.1) ; python_version >= \"3.11\" and python_version < \"4.0\"",
     "w3lib (>=2.3.1)",
     "tinycss2 (>=1.5.1)",
     "legacy-cgi (>=2.6.4) ; python_version >= \"3.13\" and python_version < \"4.0\"",
@@ -44,12 +44,12 @@ classifiers = [
 ]
 
 [project.optional-dependencies]
-docs = ["Sphinx (>=9.0.4) ; python_version >= \"3.11\"", "six (>=1.17.0)"]
+docs = ["Sphinx (>=9.1.0) ; python_version >= \"3.12\"", "six (>=1.17.0)"]
 
 [tool.poetry.group.dev.dependencies]
-mypy = "^1.19.0"
+mypy = "^1.19.1"
 pytest-cov = "^7.0.0"
-coverage = "^7.13.0"
+coverage = "^7.13.2"
 types-beautifulsoup4 = "^4.12.0.20250516"
 
 [build-system]
The remaining five files in the archive are unchanged between the two versions.