har2tree 1.34.0__py3-none-any.whl → 1.34.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
har2tree/har2tree.py CHANGED
@@ -13,7 +13,9 @@ from functools import wraps, lru_cache
13
13
  from io import BytesIO
14
14
  from operator import itemgetter
15
15
  from pathlib import Path
16
- from typing import Any, Callable
16
+ from typing import Any, TypedDict
17
+ from collections.abc import Iterator
18
+ from collections.abc import Callable
17
19
  from urllib.parse import unquote_plus, urlparse
18
20
 
19
21
  from .helper import rebuild_url, Har2TreeError, Har2TreeLogAdapter
@@ -58,7 +60,8 @@ def trace_make_subtree_fallback(method: Callable[..., None]) -> Callable[..., No
58
60
 
59
61
  def trace_make_subtree(method: Callable[..., None]) -> Callable[..., None]:
60
62
  @wraps(method)
61
- def _impl(self: Any, root: URLNode, nodes_to_attach: list[URLNode] | None=None, dev_debug: bool=False) -> None:
63
+ def _impl(self: Any, root: URLNode, nodes_to_attach: list[URLNode] | None=None,
64
+ dev_debug: bool=False, fallback: bool=False) -> None:
62
65
  if dev_debug_mode:
63
66
  __load_debug_files()
64
67
  if dev_debug_url and root.name == dev_debug_url or nodes_to_attach is not None and any(True for u in nodes_to_attach if u.name == dev_debug_url):
@@ -67,7 +70,7 @@ def trace_make_subtree(method: Callable[..., None]) -> Callable[..., None]:
67
70
  elif dev_debug_hostname and root.hostname == dev_debug_hostname or nodes_to_attach is not None and any(True for u in nodes_to_attach if u.hostname == dev_debug_hostname):
68
71
  root.logger.warning(f'Debugging Hostname: {dev_debug_hostname}.')
69
72
  dev_debug = True
70
- return method(self, root, nodes_to_attach, dev_debug)
73
+ return method(self, root, nodes_to_attach, dev_debug, fallback)
71
74
  return _impl
72
75
 
73
76
 
@@ -84,6 +87,15 @@ def __load_debug_files() -> None:
84
87
  dev_debug_hostname = f.read().strip()
85
88
 
86
89
 
90
+ # NOTE: Copy from PlaywrightCapture to avoid extra dep
91
+ class FramesResponse(TypedDict, total=False):
92
+
93
+ name: str
94
+ url: str
95
+ content: str | None
96
+ children: list[FramesResponse] | None
97
+
98
+
87
99
  class HarFile():
88
100
 
89
101
  def __init__(self, harfile: Path, capture_uuid: str):
@@ -135,6 +147,14 @@ class HarFile():
135
147
  self.logger.debug('No cookies file available.')
136
148
  self.cookies = []
137
149
 
150
+ framesfile = self.path.parent / f'{root_name}.frames.json'
151
+ if framesfile.is_file():
152
+ with framesfile.open() as c:
153
+ self.frames: FramesResponse = json.load(c)
154
+ else:
155
+ self.logger.debug('No frames file available.')
156
+ self.frames = {}
157
+
138
158
  dlfile = self.path.parent / f'{root_name}.data'
139
159
  dlfilename = self.path.parent / f'{root_name}.data.filename'
140
160
  self.downloaded_file: BytesIO | None
@@ -313,8 +333,9 @@ class Har2Tree:
313
333
  self.pages_root: dict[str, str] = {}
314
334
 
315
335
  self.all_redirects: list[str] = []
316
- self.all_referer: dict[str, list[str]] = defaultdict(list)
317
- self.all_initiator_url: dict[str, list[str]] = defaultdict(list)
336
+ # 2025-11-16: make values of referers and initiators sets because there will be duplicates
337
+ self.all_referer: dict[str, set[str]] = defaultdict(set)
338
+ self.all_initiator_url: dict[str, set[str]] = defaultdict(set)
318
339
  self._load_url_entries()
319
340
 
320
341
  # Generate cookies lookup tables
@@ -405,6 +426,62 @@ class Har2Tree:
405
426
 
406
427
  self.url_tree = self._nodes_list.pop(0)
407
428
 
429
+ def _load_iframes(self, current: URLNode, frames: FramesResponse) -> None:
430
+ if not frames.get('content') or frames['content'] is None:
431
+ # NOTE: debug stuff, no content makes it pretty useless.
432
+ if frames.get('url'):
433
+ if frames['url'] == "about:blank":
434
+ self.logger.info('Got a frame to about:blank with no content.')
435
+ else:
436
+ u = unquote_plus(frames['url'])
437
+ self.logger.warning(f'Got a url ({u}) for the frame, but no content')
438
+ else:
439
+ self.logger.info('Got a frame, but no content.')
440
+ return
441
+
442
+ if (frames.get('url')
443
+ and not (frames['url'] in ['about:blank'] # not loading anything, same as empty
444
+ or frames['url'].startswith('data') # base64 encoded content
445
+ or frames['url'].startswith('chrome-error') # not in the HAR/tree
446
+ or frames['url'].startswith('blob') # blobs aren't URLs
447
+ )):
448
+ u = unquote_plus(frames['url'])
449
+ possible_child_name = {u, u.split('#', 1)[0]}
450
+ # this url should be in a node directly attached to that one
451
+ # we need to find that node
452
+ for child in current.traverse():
453
+ if child.name in possible_child_name:
454
+ self.logger.debug(f'Found URL "{u}".')
455
+ # Found the node, adding the content
456
+ if not hasattr(child, 'rendered_frame'):
457
+ child.rendered_frame = []
458
+ child.rendered_frame.append(BytesIO(frames['content'].encode()))
459
+ # and mark the node as iframe
460
+ child.add_feature('iframe', True)
461
+ # if there are children, use that node as parent and call the current method recursively
462
+ if f_children := frames.get('children'):
463
+ for f_child in f_children:
464
+ self._load_iframes(child, f_child)
465
+ break
466
+ else:
467
+ # Couldn't find the node Oo
468
+ to_print = ', '.join(possible_child_name)
469
+ children_to_print = ', '.join([child.name for child in current.traverse()])
470
+ self.logger.warning(f'Unable to find "{to_print}" in the children of "{current.name}" - {children_to_print}')
471
+ else:
472
+ self.logger.debug(f'"{current.name}" contains an iFrame.')
473
+ # No URL, this frame is directly in the parent frame.
474
+ if not hasattr(current, 'rendered_frame'):
475
+ current.rendered_frame = []
476
+ current.rendered_frame.append(BytesIO(frames['content'].encode()))
477
+ self.logger.debug(f'"{current.name}" has {len(current.rendered_frame)} iFrames.')
478
+ # and mark the node as iframe
479
+ current.add_feature('iframe', True)
480
+ # if there are children, use that node as parent and call the current method recursively
481
+ if f_children := frames.get('children'):
482
+ for f_child in f_children:
483
+ self._load_iframes(current, f_child)
484
+
408
485
  @property
409
486
  def initial_referer(self) -> str | None:
410
487
  '''The referer passed to the first URL in the tree'''
@@ -527,7 +604,7 @@ class Har2Tree:
527
604
 
528
605
  if hasattr(n, 'initiator_url'):
529
606
  # The HAR file was created by chrome/chromium and we got the _initiator key
530
- self.all_initiator_url[n.initiator_url].append(n.name)
607
+ self.all_initiator_url[n.initiator_url].add(n.name)
531
608
 
532
609
  if url_entry['startedDateTime'] in self.har.pages_start_times:
533
610
  for page in self.har.pages_start_times[url_entry['startedDateTime']]:
@@ -540,7 +617,7 @@ class Har2Tree:
540
617
  if hasattr(n, 'referer') and i > 0:
541
618
  # NOTE 2021-05-14: referer to self are a real thing: url -> POST to self
542
619
  if n.name != n.referer or ('method' in n.request and n.request['method'] == 'POST'):
543
- self.all_referer[n.referer].append(n.name)
620
+ self.all_referer[n.referer].add(n.name)
544
621
 
545
622
  self._nodes_list.append(n)
546
623
  self.all_url_requests[n.name].append(n)
@@ -621,6 +698,14 @@ class Har2Tree:
621
698
  for child_node_hostname, child_nodes_url in sub_roots.items():
622
699
  self.make_hostname_tree(child_nodes_url, child_node_hostname)
623
700
 
701
+ def _all_urlnodes_in_host_tree(self) -> None:
702
+ # debug: check if all the nodes in the URL tree are in the hostnode tree (they must have an UUID)
703
+ self.logger.warning('Validating host tree....')
704
+ for urlnode in self.url_tree.traverse():
705
+ if not hasattr(urlnode, 'hostnode_uuid'):
706
+ self.logger.error(f'URL Node not un host tree: {urlnode}')
707
+ self.logger.warning('host tree validated.')
708
+
624
709
  def make_tree(self) -> URLNode:
625
710
  """Build URL and Host trees"""
626
711
  self._make_subtree(self.url_tree)
@@ -651,6 +736,15 @@ class Har2Tree:
651
736
  # Initialize the hostname tree root
652
737
  self.hostname_tree.add_url(self.url_tree)
653
738
  self.make_hostname_tree(self.url_tree, self.hostname_tree)
739
+ if dev_debug_mode:
740
+ self._all_urlnodes_in_host_tree()
741
+ if isinstance(self.har.frames, dict):
742
+ if self.har.frames.get('children') and self.har.frames['children'] is not None:
743
+ # we have frames in the main one
744
+ for f_child in self.har.frames['children']:
745
+ self._load_iframes(self.rendered_node, f_child)
746
+ else:
747
+ self.logger.warning(f'Wrong format for the frames ({type(self.har.frames)}), very old capture.')
654
748
  return self.url_tree
655
749
 
656
750
  @trace_make_subtree_fallback
@@ -668,7 +762,7 @@ class Har2Tree:
668
762
  # we got an non-empty response, breaking
669
763
  break
670
764
  # attach to the first response with something, or to whatever we get.
671
- self._make_subtree(node_with_hostname, [node])
765
+ self._make_subtree(node_with_hostname, [node], fallback=True)
672
766
  return
673
767
 
674
768
  # Sometimes, the har has a list of pages, generally when we have HTTP redirects.
@@ -686,7 +780,7 @@ class Har2Tree:
686
780
  page_root_node = self.get_url_node_by_uuid(self.pages_root[node.pageref])
687
781
  if dev_debug:
688
782
  self.logger.warning(f'Failed to attach URLNode in the normal process, attaching node to page {node.pageref} - Node: {page_root_node.uuid} - {page_root_node.name}.')
689
- self._make_subtree(page_root_node, [node])
783
+ self._make_subtree(page_root_node, [node], fallback=True)
690
784
  elif self.rendered_node != self.url_tree:
691
785
  # Generally, when we have a bunch of redirects, they (generally) do not branch out
692
786
  # before the final landing page *but* it is not always the case: some intermediary
@@ -698,7 +792,7 @@ class Har2Tree:
698
792
  # end of this method anyway
699
793
  if dev_debug:
700
794
  self.logger.warning(f'Failed to attach URLNode in the normal process, attaching node to final redirect: {self.har.final_redirect}.')
701
- self._make_subtree(self.rendered_node, [node])
795
+ self._make_subtree(self.rendered_node, [node], fallback=True)
702
796
  elif 'pages' in self.har.har['log']:
703
797
  # No luck, the node is root for this pageref, let's attach it to the prior page in the list, or the very first node (tree root)
704
798
  page_before = self.har.har['log']['pages'][0]
@@ -720,13 +814,42 @@ class Har2Tree:
720
814
  # node to the root node
721
815
  page_root_node = self.url_tree
722
816
  self.logger.warning('The pages in the HAR are in in the wrong order, this should not happen but here we are')
723
- self._make_subtree(page_root_node, [node])
817
+ self._make_subtree(page_root_node, [node], fallback=True)
724
818
  else:
725
819
  # no way to attach it to anything else, attach to the root node
726
- self._make_subtree(self.url_tree, [node])
820
+ self._make_subtree(self.url_tree, [node], fallback=True)
821
+
822
+ def all_real_urls_in_children(self, frame: FramesResponse) -> Iterator[str]:
823
+ # from a frame, search all the real urls in each of the children, stop at the first one
824
+ if (frame.get('url') and frame['url'] is not None
825
+ and not (frame['url'] in ['about:blank', 'about:srcdoc'] # not loading anything, same as empty
826
+ or frame['url'].startswith('data') # base64 encoded content
827
+ or frame['url'].startswith('chrome-error') # not in the HAR/tree
828
+ or frame['url'].startswith('blob'))): # blobs aren't URLs
829
+ yield frame['url']
830
+ else:
831
+ # got no real URL, try the children
832
+ if frame.get('children') and frame['children'] is not None:
833
+ for c in frame['children']:
834
+ yield from self.all_real_urls_in_children(c)
835
+
836
+ def search_in_frames(self, urls: set[str], frame: FramesResponse) -> Iterator[str]:
837
+ # If the frame doesn't have children, there are no potential URLs to attach
838
+ if not frame.get('children') or frame['children'] is None:
839
+ return None
840
+
841
+ if frame.get('url'):
842
+ u = unquote_plus(frame['url'])
843
+ if urls & {u, u.split('#', 1)[0]}:
844
+ # got a matching URL, get list of potential iframes urls
845
+ for c in frame['children']:
846
+ yield from self.all_real_urls_in_children(c)
847
+ for c in frame['children']:
848
+ yield from self.search_in_frames(urls, c)
727
849
 
728
850
  @trace_make_subtree
729
- def _make_subtree(self, root: URLNode, nodes_to_attach: list[URLNode] | None=None, dev_debug: bool=False) -> None:
851
+ def _make_subtree(self, root: URLNode, nodes_to_attach: list[URLNode] | None=None,
852
+ dev_debug: bool=False, fallback: bool=False) -> None:
730
853
  """Recursive method building each level of the tree"""
731
854
  matching_urls: list[URLNode]
732
855
  if nodes_to_attach is None:
@@ -788,6 +911,26 @@ class Har2Tree:
788
911
  if unode.empty_response:
789
912
  continue
790
913
 
914
+ # 2025-11-14
915
+ # the referer of an iframe is the hostname of the parent, even if the parent
916
+ # is a URL with a full path. Before using the referer, we need to check if we have
917
+ # the current url in the frame tree. If we do, find nodes (in the remaining list)
918
+ # with the URLs of the children - any fragment will be missing - and attach that node
919
+ possible_iframe_urls = {unode.name, unode.name.split('#', 1)[0]}
920
+ for possible_url in self.search_in_frames(possible_iframe_urls, self.har.frames):
921
+ cu = unquote_plus(possible_url)
922
+ for u in {cu, cu.split('#', 1)[0]}:
923
+ if u not in self.all_url_requests:
924
+ if '#' not in u:
925
+ self.logger.info(f'"{u}" in the frames URLs, but not in the HAR.')
926
+ continue
927
+ matching_urls = [url_node for url_node in self.all_url_requests[u]
928
+ if url_node in self._nodes_list]
929
+ self._nodes_list = [node for node in self._nodes_list if node not in matching_urls]
930
+ if dev_debug:
931
+ self.logger.warning(f'Found via initiator from {unode.name} to {matching_urls}.')
932
+ self._make_subtree(unode, matching_urls)
933
+
791
934
  # The node can have a redirect, but also trigger ressources referring to themselves, we need to trigger this code on each node.
792
935
  if self.all_initiator_url.get(unode.name):
793
936
  # The URL (unode.name) is in the list of known urls initiating calls
@@ -819,6 +962,12 @@ class Har2Tree:
819
962
  if hasattr(unode, 'external_ressources'):
820
963
  # the url loads external things, and some of them have no referer....
821
964
  for external_tag, links in unode.external_ressources.items():
965
+ # 2025-11-06: skip full regex until we're calling this method in the fallback
966
+ # the iframes will often (not always) have a referer set and the URL
967
+ # might be found by the regex and it will not be attached at the
968
+ # right place
969
+ if external_tag == 'full_regex' and not fallback:
970
+ continue
822
971
  for link in links:
823
972
  if link not in self.all_url_requests or link == self.har.final_redirect:
824
973
  # We have a lot of false positives
har2tree/helper.py CHANGED
@@ -364,7 +364,8 @@ def find_external_ressources(mimetype: str, data: bytes, base_url: str, all_requ
364
364
  # link: https://www.w3schools.com/TAGs/tag_link.asp -> href
365
365
  # object: https://www.w3schools.com/TAGs/tag_object.asp -> data
366
366
  external_ressources: dict[str, list[str]] = {'img': [], 'script': [], 'video': [], 'audio': [],
367
- 'iframe': [], 'embed': [], 'source': [],
367
+ 'iframe': [],
368
+ 'embed': [], 'source': [],
368
369
  'link': [],
369
370
  'object': [],
370
371
  'css': [],
@@ -1,12 +1,12 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: har2tree
3
- Version: 1.34.0
3
+ Version: 1.34.2
4
4
  Summary: HTTP Archive (HAR) to ETE Toolkit generator
5
5
  License-Expression: BSD-3-Clause
6
6
  License-File: LICENSE
7
7
  Author: Raphaël Vinot
8
8
  Author-email: raphael.vinot@circl.lu
9
- Requires-Python: >=3.10,<4.0
9
+ Requires-Python: >=3.10,<3.15
10
10
  Classifier: Intended Audience :: Information Technology
11
11
  Classifier: Intended Audience :: Science/Research
12
12
  Classifier: Intended Audience :: Telecommunications Industry
@@ -25,11 +25,11 @@ Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.14.2)
25
25
  Requires-Dist: ete3 (>=3.1.3)
26
26
  Requires-Dist: filetype (>=1.2.0)
27
27
  Requires-Dist: json-stream (>=2.3.3,<3.0.0)
28
- Requires-Dist: legacy-cgi (>=2.6.3) ; python_version >= "3.13" and python_version < "4.0"
28
+ Requires-Dist: legacy-cgi (>=2.6.4) ; python_version >= "3.13" and python_version < "4.0"
29
29
  Requires-Dist: multipart (>=1.3.0,<2.0.0)
30
30
  Requires-Dist: numpy (>=2.2,<2.3) ; python_version < "3.11"
31
31
  Requires-Dist: numpy (>=2.3.4) ; python_version >= "3.11" and python_version < "4.0"
32
- Requires-Dist: publicsuffixlist (>=1.0.2.20251015)
32
+ Requires-Dist: publicsuffixlist (>=1.0.2.20251115)
33
33
  Requires-Dist: requests-toolbelt (>=1.0.0,<2.0.0)
34
34
  Requires-Dist: six (>=1.17.0) ; extra == "docs"
35
35
  Requires-Dist: tinycss2 (>=1.4.0)
@@ -0,0 +1,10 @@
1
+ har2tree/__init__.py,sha256=Na3mxHkUBq3rzYbxiLNJF37DxH5mcghSorjzXw5Teug,422
2
+ har2tree/har2tree.py,sha256=1HvGNfUMccR21mnuBnqYr3vG-28wjz0vp2CVcyMuzg8,52958
3
+ har2tree/helper.py,sha256=psMpYWs5w0CONLfEo33yFgz6VwVY13xYbNDejOZ_EDw,20730
4
+ har2tree/nodes.py,sha256=QWKqEUnuW7J6pASVvzwWAQNqL-_KDzSs2ld6uJl3qbw,37710
5
+ har2tree/parser.py,sha256=4yej1OcVYAIiLfzYZsO9WCw3WyM_ykDTuvpW7UO1ROE,3645
6
+ har2tree/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ har2tree-1.34.2.dist-info/METADATA,sha256=7WlrwtcMWJ4YefcbzBwAf9l5d1DjlxJk3oKTGLomzJI,2239
8
+ har2tree-1.34.2.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
9
+ har2tree-1.34.2.dist-info/licenses/LICENSE,sha256=Xa4EVROgJsEo10CW-ISCRiw0TtqdKz1JuM3BBLBM55c,1803
10
+ har2tree-1.34.2.dist-info/RECORD,,
@@ -1,10 +0,0 @@
1
- har2tree/__init__.py,sha256=Na3mxHkUBq3rzYbxiLNJF37DxH5mcghSorjzXw5Teug,422
2
- har2tree/har2tree.py,sha256=Ss8CLjspHs0hA4am9ODX6vNP0J2Gzg3w1D3265vf4kw,44620
3
- har2tree/helper.py,sha256=CgeXqfBeHs8SbkW7TRNKqJBTZLAu63KggQjbGHCZAGI,20681
4
- har2tree/nodes.py,sha256=QWKqEUnuW7J6pASVvzwWAQNqL-_KDzSs2ld6uJl3qbw,37710
5
- har2tree/parser.py,sha256=4yej1OcVYAIiLfzYZsO9WCw3WyM_ykDTuvpW7UO1ROE,3645
6
- har2tree/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
- har2tree-1.34.0.dist-info/METADATA,sha256=TFV-a-T0QPVyC654xta3n0LWQnFNE_XKYZOWEayM0DA,2238
8
- har2tree-1.34.0.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
9
- har2tree-1.34.0.dist-info/licenses/LICENSE,sha256=Xa4EVROgJsEo10CW-ISCRiw0TtqdKz1JuM3BBLBM55c,1803
10
- har2tree-1.34.0.dist-info/RECORD,,