har2tree 1.34.1__py3-none-any.whl → 1.34.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
har2tree/har2tree.py CHANGED
@@ -14,6 +14,7 @@ from io import BytesIO
14
14
  from operator import itemgetter
15
15
  from pathlib import Path
16
16
  from typing import Any, TypedDict
17
+ from collections.abc import Iterator
17
18
  from collections.abc import Callable
18
19
  from urllib.parse import unquote_plus, urlparse
19
20
 
@@ -332,8 +333,9 @@ class Har2Tree:
332
333
  self.pages_root: dict[str, str] = {}
333
334
 
334
335
  self.all_redirects: list[str] = []
335
- self.all_referer: dict[str, list[str]] = defaultdict(list)
336
- self.all_initiator_url: dict[str, list[str]] = defaultdict(list)
336
+ # 2025-11-16: make values of referers and initiators sets because there will be duplicates
337
+ self.all_referer: dict[str, set[str]] = defaultdict(set)
338
+ self.all_initiator_url: dict[str, set[str]] = defaultdict(set)
337
339
  self._load_url_entries()
338
340
 
339
341
  # Generate cookies lookup tables
@@ -437,12 +439,18 @@ class Har2Tree:
437
439
  self.logger.info('Got a frame, but no content.')
438
440
  return
439
441
 
440
- if frames.get('url') and not (frames['url'] in ['about:blank']):
442
+ if (frames.get('url')
443
+ and not (frames['url'] in ['about:blank'] # not loading anything, same as empty
444
+ or frames['url'].startswith('data') # base64 encoded content
445
+ or frames['url'].startswith('chrome-error') # not in the HAR/tree
446
+ or frames['url'].startswith('blob') # blobs aren't URLs
447
+ )):
441
448
  u = unquote_plus(frames['url'])
449
+ possible_child_name = {u, u.split('#', 1)[0]}
442
450
  # this url should be in a node directly attached to that one
443
451
  # we need to find that node
444
452
  for child in current.traverse():
445
- if child.name in [u, u.split('#', 1)[0]]:
453
+ if child.name in possible_child_name:
446
454
  self.logger.debug(f'Found URL "{u}".')
447
455
  # Found the node, adding the content
448
456
  if not hasattr(child, 'rendered_frame'):
@@ -457,7 +465,9 @@ class Har2Tree:
457
465
  break
458
466
  else:
459
467
  # Couldn't find the node Oo
460
- self.logger.warning(f'Unable to find "{u}" in the children of "{current.name}"')
468
+ to_print = ', '.join(possible_child_name)
469
+ children_to_print = ', '.join([child.name for child in current.traverse()])
470
+ self.logger.warning(f'Unable to find "{to_print}" in the children of "{current.name}" - {children_to_print}')
461
471
  else:
462
472
  self.logger.debug(f'"{current.name}" contains an iFrame.')
463
473
  # No URL, this frame is directly in the parent frame.
@@ -594,7 +604,7 @@ class Har2Tree:
594
604
 
595
605
  if hasattr(n, 'initiator_url'):
596
606
  # The HAR file was created by chrome/chromium and we got the _initiator key
597
- self.all_initiator_url[n.initiator_url].append(n.name)
607
+ self.all_initiator_url[n.initiator_url].add(n.name)
598
608
 
599
609
  if url_entry['startedDateTime'] in self.har.pages_start_times:
600
610
  for page in self.har.pages_start_times[url_entry['startedDateTime']]:
@@ -607,7 +617,7 @@ class Har2Tree:
607
617
  if hasattr(n, 'referer') and i > 0:
608
618
  # NOTE 2021-05-14: referer to self are a real thing: url -> POST to self
609
619
  if n.name != n.referer or ('method' in n.request and n.request['method'] == 'POST'):
610
- self.all_referer[n.referer].append(n.name)
620
+ self.all_referer[n.referer].add(n.name)
611
621
 
612
622
  self._nodes_list.append(n)
613
623
  self.all_url_requests[n.name].append(n)
@@ -728,10 +738,13 @@ class Har2Tree:
728
738
  self.make_hostname_tree(self.url_tree, self.hostname_tree)
729
739
  if dev_debug_mode:
730
740
  self._all_urlnodes_in_host_tree()
731
- if self.har.frames.get('children') and self.har.frames['children'] is not None:
732
- # we have frames in the main one
733
- for f_child in self.har.frames['children']:
734
- self._load_iframes(self.rendered_node, f_child)
741
+ if isinstance(self.har.frames, dict):
742
+ if self.har.frames.get('children') and self.har.frames['children'] is not None:
743
+ # we have frames in the main one
744
+ for f_child in self.har.frames['children']:
745
+ self._load_iframes(self.rendered_node, f_child)
746
+ else:
747
+ self.logger.warning(f'Wrong format for the frames ({type(self.har.frames)}), very old capture.')
735
748
  return self.url_tree
736
749
 
737
750
  @trace_make_subtree_fallback
@@ -806,6 +819,34 @@ class Har2Tree:
806
819
  # no way to attach it to anything else, attach to the root node
807
820
  self._make_subtree(self.url_tree, [node], fallback=True)
808
821
 
822
def all_real_urls_in_children(self, frame: FramesResponse) -> Iterator[str]:
    """Yield the first "real" URL found along each branch of a frame tree.

    A URL is treated as real when it is non-empty, is neither
    ``about:blank`` nor ``about:srcdoc`` (nothing is loaded, same as empty),
    and does not start with ``data`` (base64 encoded content),
    ``chrome-error`` (not in the HAR/tree) or ``blob`` (blobs aren't URLs).

    If ``frame`` itself carries a real URL, only that URL is yielded and its
    children are not visited. Otherwise every child is searched recursively,
    in order, with the same rule.
    """
    url = frame.get('url')
    if (url
            and url not in ('about:blank', 'about:srcdoc')
            and not url.startswith(('data', 'chrome-error', 'blob'))):
        # This frame has a usable URL: the search stops here for this branch.
        yield url
        return

    # No real URL on this frame, fall back to whatever the children provide.
    for child in frame.get('children') or ():
        yield from self.all_real_urls_in_children(child)
835
+
836
def search_in_frames(self, urls: set[str], frame: FramesResponse) -> Iterator[str]:
    """Yield the candidate iframe URLs loaded by any frame matching ``urls``.

    The frame tree is walked recursively. Whenever a frame's URL (after
    ``unquote_plus``, with or without its fragment) is in ``urls``, the real
    URLs of its children are yielded via ``all_real_urls_in_children``.
    The same URL may be yielded more than once if several frames match.

    :param urls: URLs to look for, as they appear in the tree.
    :param frame: Frame to inspect; anything that is not a dict yields nothing.
    """
    # make_tree in this file tolerates non-dict frame data from very old
    # captures; do the same here instead of failing on ``.get``.
    if not isinstance(frame, dict):
        return

    children = frame.get('children')
    if not children:
        # If the frame doesn't have children, there are no potential URLs to attach
        return

    frame_url = frame.get('url')
    if frame_url:
        decoded = unquote_plus(frame_url)
        if urls & {decoded, decoded.split('#', 1)[0]}:
            # got a matching URL, get list of potential iframes urls
            for child in children:
                yield from self.all_real_urls_in_children(child)

    # NOTE(review): indentation is lost in the reviewed source; this descent is
    # assumed to run for every frame with children, matched or not - confirm.
    for child in children:
        yield from self.search_in_frames(urls, child)
849
+
809
850
  @trace_make_subtree
810
851
  def _make_subtree(self, root: URLNode, nodes_to_attach: list[URLNode] | None=None,
811
852
  dev_debug: bool=False, fallback: bool=False) -> None:
@@ -870,6 +911,26 @@ class Har2Tree:
870
911
  if unode.empty_response:
871
912
  continue
872
913
 
914
+ # 2025-11-14
915
+ # the referer of an iframe is the hostname of the parent, even if the parent
916
+ # is a URL with a full path. Before using the referer, we need to check if we have
917
+ # the current url in the frame tree. If we do, find nodes (in the remaining list)
918
+ # with the URLs of the children - any fragment will be missing - and attach that node
919
+ possible_iframe_urls = {unode.name, unode.name.split('#', 1)[0]}
920
+ for possible_url in self.search_in_frames(possible_iframe_urls, self.har.frames):
921
+ cu = unquote_plus(possible_url)
922
+ for u in {cu, cu.split('#', 1)[0]}:
923
+ if u not in self.all_url_requests:
924
+ if '#' not in u:
925
+ self.logger.info(f'"{u}" in the frames URLs, but not in the HAR.')
926
+ continue
927
+ matching_urls = [url_node for url_node in self.all_url_requests[u]
928
+ if url_node in self._nodes_list]
929
+ self._nodes_list = [node for node in self._nodes_list if node not in matching_urls]
930
+ if dev_debug:
931
+ self.logger.warning(f'Found via initiator from {unode.name} to {matching_urls}.')
932
+ self._make_subtree(unode, matching_urls)
933
+
873
934
  # The node can have a redirect, but also trigger resources referring to themselves, so we need to run this code on each node.
874
935
  if self.all_initiator_url.get(unode.name):
875
936
  # The URL (unode.name) is in the list of known urls initiating calls
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: har2tree
3
- Version: 1.34.1
3
+ Version: 1.34.2
4
4
  Summary: HTTP Archive (HAR) to ETE Toolkit generator
5
5
  License-Expression: BSD-3-Clause
6
6
  License-File: LICENSE
@@ -29,7 +29,7 @@ Requires-Dist: legacy-cgi (>=2.6.4) ; python_version >= "3.13" and python_versio
29
29
  Requires-Dist: multipart (>=1.3.0,<2.0.0)
30
30
  Requires-Dist: numpy (>=2.2,<2.3) ; python_version < "3.11"
31
31
  Requires-Dist: numpy (>=2.3.4) ; python_version >= "3.11" and python_version < "4.0"
32
- Requires-Dist: publicsuffixlist (>=1.0.2.20251107)
32
+ Requires-Dist: publicsuffixlist (>=1.0.2.20251115)
33
33
  Requires-Dist: requests-toolbelt (>=1.0.0,<2.0.0)
34
34
  Requires-Dist: six (>=1.17.0) ; extra == "docs"
35
35
  Requires-Dist: tinycss2 (>=1.4.0)
@@ -1,10 +1,10 @@
1
1
  har2tree/__init__.py,sha256=Na3mxHkUBq3rzYbxiLNJF37DxH5mcghSorjzXw5Teug,422
2
- har2tree/har2tree.py,sha256=JdHGoyXNmu3ZCLRDJ-QC253Q2Lfi4SLHHBDS6Gt4Ez0,49172
2
+ har2tree/har2tree.py,sha256=1HvGNfUMccR21mnuBnqYr3vG-28wjz0vp2CVcyMuzg8,52958
3
3
  har2tree/helper.py,sha256=psMpYWs5w0CONLfEo33yFgz6VwVY13xYbNDejOZ_EDw,20730
4
4
  har2tree/nodes.py,sha256=QWKqEUnuW7J6pASVvzwWAQNqL-_KDzSs2ld6uJl3qbw,37710
5
5
  har2tree/parser.py,sha256=4yej1OcVYAIiLfzYZsO9WCw3WyM_ykDTuvpW7UO1ROE,3645
6
6
  har2tree/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
- har2tree-1.34.1.dist-info/METADATA,sha256=SfEJxkKbVnEDpXtwYiLtYhRi3vZQBIYn4HotdWavTl0,2239
8
- har2tree-1.34.1.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
9
- har2tree-1.34.1.dist-info/licenses/LICENSE,sha256=Xa4EVROgJsEo10CW-ISCRiw0TtqdKz1JuM3BBLBM55c,1803
10
- har2tree-1.34.1.dist-info/RECORD,,
7
+ har2tree-1.34.2.dist-info/METADATA,sha256=7WlrwtcMWJ4YefcbzBwAf9l5d1DjlxJk3oKTGLomzJI,2239
8
+ har2tree-1.34.2.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
9
+ har2tree-1.34.2.dist-info/licenses/LICENSE,sha256=Xa4EVROgJsEo10CW-ISCRiw0TtqdKz1JuM3BBLBM55c,1803
10
+ har2tree-1.34.2.dist-info/RECORD,,