har2tree 1.36.0__tar.gz → 1.36.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {har2tree-1.36.0 → har2tree-1.36.2}/PKG-INFO +4 -4
- {har2tree-1.36.0 → har2tree-1.36.2}/har2tree/har2tree.py +36 -13
- {har2tree-1.36.0 → har2tree-1.36.2}/har2tree/helper.py +1 -1
- {har2tree-1.36.0 → har2tree-1.36.2}/pyproject.toml +6 -6
- {har2tree-1.36.0 → har2tree-1.36.2}/LICENSE +0 -0
- {har2tree-1.36.0 → har2tree-1.36.2}/README.md +0 -0
- {har2tree-1.36.0 → har2tree-1.36.2}/har2tree/__init__.py +0 -0
- {har2tree-1.36.0 → har2tree-1.36.2}/har2tree/nodes.py +0 -0
- {har2tree-1.36.0 → har2tree-1.36.2}/har2tree/parser.py +0 -0
- {har2tree-1.36.0 → har2tree-1.36.2}/har2tree/py.typed +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: har2tree
|
|
3
|
-
Version: 1.36.0
|
|
3
|
+
Version: 1.36.2
|
|
4
4
|
Summary: HTTP Archive (HAR) to ETE Toolkit generator
|
|
5
5
|
License-Expression: BSD-3-Clause
|
|
6
6
|
License-File: LICENSE
|
|
@@ -20,7 +20,7 @@ Classifier: Programming Language :: Python :: 3.14
|
|
|
20
20
|
Classifier: Topic :: Internet
|
|
21
21
|
Classifier: Topic :: Security
|
|
22
22
|
Provides-Extra: docs
|
|
23
|
-
Requires-Dist: Sphinx (>=9.0
|
|
23
|
+
Requires-Dist: Sphinx (>=9.1.0) ; (python_version >= "3.12") and (extra == "docs")
|
|
24
24
|
Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.14.3)
|
|
25
25
|
Requires-Dist: ete3 (>=3.1.3)
|
|
26
26
|
Requires-Dist: filetype (>=1.2.0)
|
|
@@ -28,8 +28,8 @@ Requires-Dist: json-stream (>=2.3.3,<3.0.0)
|
|
|
28
28
|
Requires-Dist: legacy-cgi (>=2.6.4) ; python_version >= "3.13" and python_version < "4.0"
|
|
29
29
|
Requires-Dist: multipart (>=1.3.0,<2.0.0)
|
|
30
30
|
Requires-Dist: numpy (>=2.2,<2.3) ; python_version < "3.11"
|
|
31
|
-
Requires-Dist: numpy (>=2.
|
|
32
|
-
Requires-Dist: publicsuffixlist (>=1.0.2.
|
|
31
|
+
Requires-Dist: numpy (>=2.4.1) ; python_version >= "3.11" and python_version < "4.0"
|
|
32
|
+
Requires-Dist: publicsuffixlist (>=1.0.2.20260126)
|
|
33
33
|
Requires-Dist: requests-toolbelt (>=1.0.0,<2.0.0)
|
|
34
34
|
Requires-Dist: six (>=1.17.0) ; extra == "docs"
|
|
35
35
|
Requires-Dist: tinycss2 (>=1.5.1)
|
|
@@ -750,23 +750,47 @@ class Har2Tree:
|
|
|
750
750
|
self.logger.warning(f'Wrong format for the frames ({type(self.har.frames)}), very old capture.')
|
|
751
751
|
return self.url_tree
|
|
752
752
|
|
|
753
|
+
def _guess_best_node_for_partial_referer(self, node: URLNode, potential_parents: list[URLNode]) -> URLNode:
|
|
754
|
+
# we have more than one node with the hostname of the referer *and* content.
|
|
755
|
+
# 2025-12-17:
|
|
756
|
+
# 1. find the deepest HTML node in the list
|
|
757
|
+
for pp in reversed(potential_parents):
|
|
758
|
+
if 'html' in pp.mimetype:
|
|
759
|
+
return pp
|
|
760
|
+
else:
|
|
761
|
+
# 2. if there are no HTML node anywhere in the list, attach to the deepest node
|
|
762
|
+
return potential_parents[-1]
|
|
763
|
+
|
|
753
764
|
@trace_make_subtree_fallback
|
|
754
765
|
def _make_subtree_fallback(self, node: URLNode, dev_debug: bool=False) -> None:
|
|
755
766
|
if hasattr(node, 'referer'):
|
|
756
767
|
# 2022-04-28: the node has a referer, but for some reason, it could't be attached to the tree
|
|
757
768
|
# Probable reason: the referer is a part of the URL (hostname)
|
|
758
|
-
# FIXME: this is a very dirty fix, but I'm not sure we can do it any better
|
|
759
769
|
if (referer_hostname := urlparse(node.referer).hostname):
|
|
760
|
-
# the referer has a hostname
|
|
761
770
|
if (nodes_with_hostname := self.url_tree.search_nodes(hostname=referer_hostname)):
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
771
|
+
attach_to: URLNode
|
|
772
|
+
# 2025-12-17: we have at least one node with that hostname.
|
|
773
|
+
if len(nodes_with_hostname) == 1:
|
|
774
|
+
# That's the only one, use it
|
|
775
|
+
attach_to = nodes_with_hostname[0]
|
|
776
|
+
else:
|
|
777
|
+
# check if there are empty nodes
|
|
778
|
+
if (nodes_with_hostname_and_response := [n for n in nodes_with_hostname if not n.empty_response]):
|
|
779
|
+
if len(nodes_with_hostname_and_response) == 1:
|
|
780
|
+
attach_to = nodes_with_hostname_and_response[0]
|
|
781
|
+
else:
|
|
782
|
+
# multiple non-empty nodes with that hostname, this is the more difficult one
|
|
783
|
+
attach_to = self._guess_best_node_for_partial_referer(node, nodes_with_hostname_and_response)
|
|
784
|
+
else:
|
|
785
|
+
# more than one node with that hostname, but they're all empty, attach to the first one
|
|
786
|
+
attach_to = nodes_with_hostname[0]
|
|
787
|
+
return self._make_subtree(attach_to, [node], fallback=True)
|
|
788
|
+
else:
|
|
789
|
+
# no node with that hostname at all, this should not happen
|
|
790
|
+
self.logger.warning(f'Unable to find any node with the hostname {referer_hostname}, despites it being set as referer.')
|
|
791
|
+
else:
|
|
792
|
+
# the referer has no hostname and it is fascinating
|
|
793
|
+
self.logger.warning(f'Unable to get hostname out of referer: {node.referer}')
|
|
770
794
|
|
|
771
795
|
# Sometimes, the har has a list of pages, generally when we have HTTP redirects.
|
|
772
796
|
# IF we have more than one page in the list
|
|
@@ -774,7 +798,7 @@ class Har2Tree:
|
|
|
774
798
|
# AND we already have a node in the tree with this pageref
|
|
775
799
|
# => attach to that node.
|
|
776
800
|
if ('pages' in self.har.har['log'] and len(self.har.har['log']['pages']) > 1
|
|
777
|
-
and node.pageref != self.har.har['log']['pages'][0]
|
|
801
|
+
and hasattr(node, 'pageref') and node.pageref != self.har.har['log']['pages'][0]
|
|
778
802
|
and self.pages_root[node.pageref] != node.uuid):
|
|
779
803
|
# In that case, we check if there is already a page with the pageref of the orphan node,
|
|
780
804
|
# and attach the node to that.
|
|
@@ -800,7 +824,7 @@ class Har2Tree:
|
|
|
800
824
|
# No luck, the node is root for this pageref, let's attach it to the prior page in the list, or the very first node (tree root)
|
|
801
825
|
page_before = self.har.har['log']['pages'][0]
|
|
802
826
|
for page in self.har.har['log']['pages'][1:]:
|
|
803
|
-
if page['id'] == node.pageref:
|
|
827
|
+
if hasattr(node, 'pageref') and page['id'] == node.pageref:
|
|
804
828
|
break
|
|
805
829
|
# Sometimes, the page listed in the list of pages is not related to
|
|
806
830
|
# any of the entries. Go figure what happened.
|
|
@@ -929,7 +953,6 @@ class Har2Tree:
|
|
|
929
953
|
if dev_debug:
|
|
930
954
|
self.logger.warning(f'Found via initiator from {unode.name} to {matching_urls}.')
|
|
931
955
|
self._make_subtree(unode, matching_urls)
|
|
932
|
-
|
|
933
956
|
# The node can have a redirect, but also trigger ressources refering to themselves, we need to trigger this code on each node.
|
|
934
957
|
if self.all_initiator_url.get(unode.name):
|
|
935
958
|
# The URL (unode.name) is in the list of known urls initiating calls
|
|
@@ -268,7 +268,7 @@ def find_identifiers(soup: BeautifulSoup) -> dict[str, list[str]] | None:
|
|
|
268
268
|
# This is beta and kinda fragile, but it's going to find (most) of the google tag IDs
|
|
269
269
|
# https://support.google.com/google-ads/answer/12326985?hl=en_us_us
|
|
270
270
|
# NOTE: the doc says 9 X, but all the examples I found have 10 X so we cannot trust it
|
|
271
|
-
if google_tag_ids := set(re.findall(r"(?:G-|AW-|GA-|UA-)\w{9,
|
|
271
|
+
if google_tag_ids := set(re.findall(r"(?:G-|AW-|GA-|UA-|GTM-)\w{9,15}", str(soup))):
|
|
272
272
|
blocklist = {'UA-Compatible'}
|
|
273
273
|
google_tag_ids -= blocklist
|
|
274
274
|
if google_tag_ids:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "har2tree"
|
|
3
|
-
version = "1.36.0"
|
|
3
|
+
version = "1.36.2"
|
|
4
4
|
description = "HTTP Archive (HAR) to ETE Toolkit generator"
|
|
5
5
|
authors = [
|
|
6
6
|
{name="Raphaël Vinot", email="raphael.vinot@circl.lu"}
|
|
@@ -14,12 +14,12 @@ dynamic = [ "classifiers" ]
|
|
|
14
14
|
dependencies = [
|
|
15
15
|
"ete3 (>=3.1.3)",
|
|
16
16
|
"beautifulsoup4[charset-normalizer,lxml] (>=4.14.3)",
|
|
17
|
-
"publicsuffixlist (>=1.0.2.
|
|
17
|
+
"publicsuffixlist (>=1.0.2.20260126)",
|
|
18
18
|
"filetype (>=1.2.0)",
|
|
19
19
|
# poetry up fails with the version of numpy forced for python < 3.11.
|
|
20
20
|
# The work around is to comment it, run poetry up, uncomment it. and run poetry update.
|
|
21
21
|
"numpy (>=2.2,<2.3) ; python_version < '3.11'",
|
|
22
|
-
"numpy (>=2.
|
|
22
|
+
"numpy (>=2.4.1) ; python_version >= \"3.11\" and python_version < \"4.0\"",
|
|
23
23
|
"w3lib (>=2.3.1)",
|
|
24
24
|
"tinycss2 (>=1.5.1)",
|
|
25
25
|
"legacy-cgi (>=2.6.4) ; python_version >= \"3.13\" and python_version < \"4.0\"",
|
|
@@ -44,12 +44,12 @@ classifiers = [
|
|
|
44
44
|
]
|
|
45
45
|
|
|
46
46
|
[project.optional-dependencies]
|
|
47
|
-
docs = ["Sphinx (>=9.0
|
|
47
|
+
docs = ["Sphinx (>=9.1.0) ; python_version >= \"3.12\"", "six (>=1.17.0)"]
|
|
48
48
|
|
|
49
49
|
[tool.poetry.group.dev.dependencies]
|
|
50
|
-
mypy = "^1.19.
|
|
50
|
+
mypy = "^1.19.1"
|
|
51
51
|
pytest-cov = "^7.0.0"
|
|
52
|
-
coverage = "^7.13.
|
|
52
|
+
coverage = "^7.13.2"
|
|
53
53
|
types-beautifulsoup4 = "^4.12.0.20250516"
|
|
54
54
|
|
|
55
55
|
[build-system]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|