har2tree 1.33.0__tar.gz → 1.34.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,12 +1,12 @@
  Metadata-Version: 2.4
  Name: har2tree
- Version: 1.33.0
+ Version: 1.34.1
  Summary: HTTP Archive (HAR) to ETE Toolkit generator
  License-Expression: BSD-3-Clause
  License-File: LICENSE
  Author: Raphaël Vinot
  Author-email: raphael.vinot@circl.lu
- Requires-Python: >=3.9.2,<4.0
+ Requires-Python: >=3.10,<3.15
  Classifier: Intended Audience :: Information Technology
  Classifier: Intended Audience :: Science/Research
  Classifier: Intended Audience :: Telecommunications Industry
@@ -21,16 +21,15 @@ Classifier: Topic :: Internet
  Classifier: Topic :: Security
  Provides-Extra: docs
  Requires-Dist: Sphinx (>=8.2.3) ; (python_version >= "3.11") and (extra == "docs")
- Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.13.5)
+ Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.14.2)
  Requires-Dist: ete3 (>=3.1.3)
  Requires-Dist: filetype (>=1.2.0)
  Requires-Dist: json-stream (>=2.3.3,<3.0.0)
- Requires-Dist: legacy-cgi (>=2.6.3) ; python_version >= "3.13" and python_version < "4.0"
+ Requires-Dist: legacy-cgi (>=2.6.4) ; python_version >= "3.13" and python_version < "4.0"
  Requires-Dist: multipart (>=1.3.0,<2.0.0)
- Requires-Dist: numpy (<2.1) ; python_version < "3.10"
- Requires-Dist: numpy (>=2.2,<2.3) ; python_version == "3.10"
- Requires-Dist: numpy (>=2.3.3) ; python_version >= "3.11" and python_version < "4.0"
- Requires-Dist: publicsuffixlist (>=1.0.2.20250919)
+ Requires-Dist: numpy (>=2.2,<2.3) ; python_version < "3.11"
+ Requires-Dist: numpy (>=2.3.4) ; python_version >= "3.11" and python_version < "4.0"
+ Requires-Dist: publicsuffixlist (>=1.0.2.20251107)
  Requires-Dist: requests-toolbelt (>=1.0.0,<2.0.0)
  Requires-Dist: six (>=1.17.0) ; extra == "docs"
  Requires-Dist: tinycss2 (>=1.4.0)
@@ -13,7 +13,8 @@ from functools import wraps, lru_cache
  from io import BytesIO
  from operator import itemgetter
  from pathlib import Path
- from typing import Any, Callable
+ from typing import Any, TypedDict
+ from collections.abc import Callable
  from urllib.parse import unquote_plus, urlparse

  from .helper import rebuild_url, Har2TreeError, Har2TreeLogAdapter
@@ -58,7 +59,8 @@ def trace_make_subtree_fallback(method: Callable[..., None]) -> Callable[..., None]:

  def trace_make_subtree(method: Callable[..., None]) -> Callable[..., None]:
      @wraps(method)
-     def _impl(self: Any, root: URLNode, nodes_to_attach: list[URLNode] | None=None, dev_debug: bool=False) -> None:
+     def _impl(self: Any, root: URLNode, nodes_to_attach: list[URLNode] | None=None,
+               dev_debug: bool=False, fallback: bool=False) -> None:
          if dev_debug_mode:
              __load_debug_files()
          if dev_debug_url and root.name == dev_debug_url or nodes_to_attach is not None and any(True for u in nodes_to_attach if u.name == dev_debug_url):
@@ -67,7 +69,7 @@ def trace_make_subtree(method: Callable[..., None]) -> Callable[..., None]:
          elif dev_debug_hostname and root.hostname == dev_debug_hostname or nodes_to_attach is not None and any(True for u in nodes_to_attach if u.hostname == dev_debug_hostname):
              root.logger.warning(f'Debugging Hostname: {dev_debug_hostname}.')
              dev_debug = True
-         return method(self, root, nodes_to_attach, dev_debug)
+         return method(self, root, nodes_to_attach, dev_debug, fallback)
      return _impl

@@ -84,6 +86,15 @@ def __load_debug_files() -> None:
              dev_debug_hostname = f.read().strip()


+ # NOTE: Copy from PlaywrightCapture to avoid extra dep
+ class FramesResponse(TypedDict, total=False):
+
+     name: str
+     url: str
+     content: str | None
+     children: list[FramesResponse] | None
+
+
  class HarFile():

      def __init__(self, harfile: Path, capture_uuid: str):
@@ -135,6 +146,14 @@ class HarFile():
              self.logger.debug('No cookies file available.')
              self.cookies = []

+         framesfile = self.path.parent / f'{root_name}.frames.json'
+         if framesfile.is_file():
+             with framesfile.open() as c:
+                 self.frames: FramesResponse = json.load(c)
+         else:
+             self.logger.debug('No frames file available.')
+             self.frames = {}
+
          dlfile = self.path.parent / f'{root_name}.data'
          dlfilename = self.path.parent / f'{root_name}.data.filename'
          self.downloaded_file: BytesIO | None
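
For context: the new self.frames attribute is read from a <root_name>.frames.json file sitting next to the HAR, presumably written by PlaywrightCapture (per the NOTE above). A minimal, made-up example of the structure it holds once loaded, following the FramesResponse TypedDict keys from this diff (the values are invented for illustration):

# Hypothetical structure loaded into HarFile.frames (illustrative only).
frames_example: dict = {
    "name": "main",
    "url": "https://example.com/",
    "content": "<html>...</html>",
    "children": [
        {
            "name": "ad-frame",
            "url": "https://ads.example.net/slot.html",
            "content": "<html><body>ad</body></html>",
            "children": None,
        },
    ],
}
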
@@ -405,6 +424,54 @@ class Har2Tree:

          self.url_tree = self._nodes_list.pop(0)

+     def _load_iframes(self, current: URLNode, frames: FramesResponse) -> None:
+         if not frames.get('content') or frames['content'] is None:
+             # NOTE: debug stuff, no content makes it pretty useless.
+             if frames.get('url'):
+                 if frames['url'] == "about:blank":
+                     self.logger.info('Got a frame to about:blank with no content.')
+                 else:
+                     u = unquote_plus(frames['url'])
+                     self.logger.warning(f'Got a url ({u}) for the frame, but no content')
+             else:
+                 self.logger.info('Got a frame, but no content.')
+             return
+
+         if frames.get('url') and not (frames['url'] in ['about:blank']):
+             u = unquote_plus(frames['url'])
+             # this url should be in a node directly attached to that one
+             # we need to find that node
+             for child in current.traverse():
+                 if child.name in [u, u.split('#', 1)[0]]:
+                     self.logger.debug(f'Found URL "{u}".')
+                     # Found the node, adding the content
+                     if not hasattr(child, 'rendered_frame'):
+                         child.rendered_frame = []
+                     child.rendered_frame.append(BytesIO(frames['content'].encode()))
+                     # and mark the node as iframe
+                     child.add_feature('iframe', True)
+                     # if there are children, use that node as parent and call the current method recursively
+                     if f_children := frames.get('children'):
+                         for f_child in f_children:
+                             self._load_iframes(child, f_child)
+                     break
+             else:
+                 # Couldn't find the node Oo
+                 self.logger.warning(f'Unable to find "{u}" in the children of "{current.name}"')
+         else:
+             self.logger.debug(f'"{current.name}" contains an iFrame.')
+             # No URL, this frame is directly in the parent frame.
+             if not hasattr(current, 'rendered_frame'):
+                 current.rendered_frame = []
+             current.rendered_frame.append(BytesIO(frames['content'].encode()))
+             self.logger.debug(f'"{current.name}" has {len(current.rendered_frame)} iFrames.')
+             # and mark the node as iframe
+             current.add_feature('iframe', True)
+             # if there are children, use that node as parent and call the current method recursively
+             if f_children := frames.get('children'):
+                 for f_child in f_children:
+                     self._load_iframes(current, f_child)
+
      @property
      def initial_referer(self) -> str | None:
          '''The referer passed to the first URL in the tree'''
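
As a rough sketch of the traversal _load_iframes performs (illustrative only, operating on plain dicts rather than URLNode objects): frames without content are skipped, and the method recurses into 'children' to attach nested frame bodies. The same walk can be mimicked like this:

# Minimal stand-in for the recursive walk over a FramesResponse-shaped dict.
def count_frames_with_content(frame: dict) -> int:
    total = 1 if frame.get('content') else 0
    for child in frame.get('children') or []:
        total += count_frames_with_content(child)
    return total

# With the frames_example above, this returns 2 (the main frame and the ad frame).
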
@@ -621,6 +688,14 @@ class Har2Tree:
          for child_node_hostname, child_nodes_url in sub_roots.items():
              self.make_hostname_tree(child_nodes_url, child_node_hostname)

+     def _all_urlnodes_in_host_tree(self) -> None:
+         # debug: check if all the nodes in the URL tree are in the hostnode tree (they must have a UUID)
+         self.logger.warning('Validating host tree....')
+         for urlnode in self.url_tree.traverse():
+             if not hasattr(urlnode, 'hostnode_uuid'):
+                 self.logger.error(f'URL Node not in host tree: {urlnode}')
+         self.logger.warning('host tree validated.')
+
      def make_tree(self) -> URLNode:
          """Build URL and Host trees"""
          self._make_subtree(self.url_tree)
@@ -651,6 +726,12 @@
          # Initialize the hostname tree root
          self.hostname_tree.add_url(self.url_tree)
          self.make_hostname_tree(self.url_tree, self.hostname_tree)
+         if dev_debug_mode:
+             self._all_urlnodes_in_host_tree()
+         if self.har.frames.get('children') and self.har.frames['children'] is not None:
+             # we have frames in the main one
+             for f_child in self.har.frames['children']:
+                 self._load_iframes(self.rendered_node, f_child)
          return self.url_tree

      @trace_make_subtree_fallback
@@ -668,7 +749,7 @@
                      # we got a non-empty response, breaking
                      break
              # attach to the first response with something, or to whatever we get.
-             self._make_subtree(node_with_hostname, [node])
+             self._make_subtree(node_with_hostname, [node], fallback=True)
              return

          # Sometimes, the har has a list of pages, generally when we have HTTP redirects.
@@ -686,7 +767,7 @@
              page_root_node = self.get_url_node_by_uuid(self.pages_root[node.pageref])
              if dev_debug:
                  self.logger.warning(f'Failed to attach URLNode in the normal process, attaching node to page {node.pageref} - Node: {page_root_node.uuid} - {page_root_node.name}.')
-             self._make_subtree(page_root_node, [node])
+             self._make_subtree(page_root_node, [node], fallback=True)
          elif self.rendered_node != self.url_tree:
              # Generally, when we have a bunch of redirects, they (generally) do not branch out
              # before the final landing page *but* it is not always the case: some intermediary
@@ -698,7 +779,7 @@
              # end of this method anyway
              if dev_debug:
                  self.logger.warning(f'Failed to attach URLNode in the normal process, attaching node to final redirect: {self.har.final_redirect}.')
-             self._make_subtree(self.rendered_node, [node])
+             self._make_subtree(self.rendered_node, [node], fallback=True)
          elif 'pages' in self.har.har['log']:
              # No luck, the node is root for this pageref, let's attach it to the prior page in the list, or the very first node (tree root)
              page_before = self.har.har['log']['pages'][0]
@@ -720,13 +801,14 @@
                  # node to the root node
                  page_root_node = self.url_tree
                  self.logger.warning('The pages in the HAR are in the wrong order, this should not happen but here we are')
-             self._make_subtree(page_root_node, [node])
+             self._make_subtree(page_root_node, [node], fallback=True)
          else:
              # no way to attach it to anything else, attach to the root node
-             self._make_subtree(self.url_tree, [node])
+             self._make_subtree(self.url_tree, [node], fallback=True)

      @trace_make_subtree
-     def _make_subtree(self, root: URLNode, nodes_to_attach: list[URLNode] | None=None, dev_debug: bool=False) -> None:
+     def _make_subtree(self, root: URLNode, nodes_to_attach: list[URLNode] | None=None,
+                       dev_debug: bool=False, fallback: bool=False) -> None:
          """Recursive method building each level of the tree"""
          matching_urls: list[URLNode]
          if nodes_to_attach is None:
@@ -819,6 +901,12 @@ class Har2Tree:
              if hasattr(unode, 'external_ressources'):
                  # the url loads external things, and some of them have no referer....
                  for external_tag, links in unode.external_ressources.items():
+                     # 2025-11-06: skip full regex until we're calling this method in the fallback
+                     # the iframes will often (not always) have a referer set and the URL
+                     # might be found by the regex and it will not be attached at the
+                     # right place
+                     if external_tag == 'full_regex' and not fallback:
+                         continue
                      for link in links:
                          if link not in self.all_url_requests or link == self.har.final_redirect:
                              # We have a lot of false positives
@@ -364,7 +364,8 @@ def find_external_ressources(mimetype: str, data: bytes, base_url: str, all_requ
      # link: https://www.w3schools.com/TAGs/tag_link.asp -> href
      # object: https://www.w3schools.com/TAGs/tag_object.asp -> data
      external_ressources: dict[str, list[str]] = {'img': [], 'script': [], 'video': [], 'audio': [],
-                                                   'iframe': [], 'embed': [], 'source': [],
+                                                   'iframe': [],
+                                                   'embed': [], 'source': [],
                                                    'link': [],
                                                    'object': [],
                                                    'css': [],
@@ -1,32 +1,31 @@
  [project]
  name = "har2tree"
- version = "1.33.0"
+ version = "1.34.1"
  description = "HTTP Archive (HAR) to ETE Toolkit generator"
  authors = [
      {name="Raphaël Vinot", email="raphael.vinot@circl.lu"}
  ]
  license = "BSD-3-Clause"
  readme = "README.md"
- requires-python = ">=3.9.2,<4.0"
+ requires-python = ">=3.10,<3.15"

  dynamic = [ "classifiers" ]

  dependencies = [
      "ete3 (>=3.1.3)",
-     "beautifulsoup4[charset-normalizer,lxml] (>=4.13.5)",
-     "publicsuffixlist (>=1.0.2.20250919)",
+     "beautifulsoup4[charset-normalizer,lxml] (>=4.14.2)",
+     "publicsuffixlist (>=1.0.2.20251107)",
      "filetype (>=1.2.0)",
-     # poetry up fails with the version of numpy forced for python < 3.10.
+     # poetry up fails with the version of numpy forced for python < 3.11.
      # The workaround is to comment it, run poetry up, uncomment it, and run poetry update.
-     "numpy (<2.1) ; python_version < \"3.10\"",
-     "numpy (>=2.2,<2.3) ; python_version >= '3.10' and python_version < '3.11'",
-     "numpy (>=2.3.3) ; python_version >= '3.11' and python_version < '4.0'",
+     "numpy (>=2.2,<2.3) ; python_version < '3.11'",
+     "numpy (>=2.3.4) ; python_version >= \"3.11\" and python_version < \"4.0\"",
      "w3lib (>=2.3.1)",
      "tinycss2 (>=1.4.0)",
-     "legacy-cgi (>=2.6.3) ; python_version >= '3.13' and python_version < '4.0'",
+     "legacy-cgi (>=2.6.4) ; python_version >= \"3.13\" and python_version < \"4.0\"",
      "multipart (>=1.3.0,<2.0.0)",
      "json-stream (>=2.3.3,<3.0.0)",
-     "requests-toolbelt (>=1.0.0,<2.0.0)",
+     "requests-toolbelt (>=1.0.0,<2.0.0)"
  ]

  [project.urls]
@@ -50,7 +49,7 @@ docs = ["Sphinx (>=8.2.3) ; python_version >= \"3.11\"", "six (>=1.17.0)"]
  [tool.poetry.group.dev.dependencies]
  mypy = "^1.18.2"
  pytest-cov = "^7.0.0"
- coverage = "^7.10.6"
+ coverage = "^7.11.1"
  types-beautifulsoup4 = "^4.12.0.20250516"

  [build-system]
5 files without changes