har2tree 1.34.1.tar.gz → 1.35.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: har2tree
-Version: 1.34.1
+Version: 1.35.0
 Summary: HTTP Archive (HAR) to ETE Toolkit generator
 License-Expression: BSD-3-Clause
 License-File: LICENSE
@@ -28,8 +28,8 @@ Requires-Dist: json-stream (>=2.3.3,<3.0.0)
 Requires-Dist: legacy-cgi (>=2.6.4) ; python_version >= "3.13" and python_version < "4.0"
 Requires-Dist: multipart (>=1.3.0,<2.0.0)
 Requires-Dist: numpy (>=2.2,<2.3) ; python_version < "3.11"
-Requires-Dist: numpy (>=2.3.4) ; python_version >= "3.11" and python_version < "4.0"
-Requires-Dist: publicsuffixlist (>=1.0.2.20251107)
+Requires-Dist: numpy (>=2.3.5) ; python_version >= "3.11" and python_version < "4.0"
+Requires-Dist: publicsuffixlist (>=1.0.2.20251115)
 Requires-Dist: requests-toolbelt (>=1.0.0,<2.0.0)
 Requires-Dist: six (>=1.17.0) ; extra == "docs"
 Requires-Dist: tinycss2 (>=1.4.0)
@@ -14,6 +14,7 @@ from io import BytesIO
 from operator import itemgetter
 from pathlib import Path
 from typing import Any, TypedDict
+from collections.abc import Iterator
 from collections.abc import Callable
 from urllib.parse import unquote_plus, urlparse

@@ -332,8 +333,9 @@ class Har2Tree:
         self.pages_root: dict[str, str] = {}

         self.all_redirects: list[str] = []
-        self.all_referer: dict[str, list[str]] = defaultdict(list)
-        self.all_initiator_url: dict[str, list[str]] = defaultdict(list)
+        # 2025-11-16: make values of referers and initiators sets because there will be duplicates
+        self.all_referer: dict[str, set[str]] = defaultdict(set)
+        self.all_initiator_url: dict[str, set[str]] = defaultdict(set)
         self._load_url_entries()

         # Generate cookies lookup tables
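Note: the list-to-set change above is explained by the new comment in the diff: the same child URL can show up many times under one referer or initiator. A minimal standalone sketch (the URLs are made up) of why defaultdict(set) collapses those duplicates while defaultdict(list) keeps them:

from collections import defaultdict

all_referer_as_list: dict[str, list[str]] = defaultdict(list)
all_referer_as_set: dict[str, set[str]] = defaultdict(set)

for child in ('https://a.example/x', 'https://a.example/x', 'https://a.example/y'):
    all_referer_as_list['https://parent.example/'].append(child)
    all_referer_as_set['https://parent.example/'].add(child)

print(len(all_referer_as_list['https://parent.example/']))  # 3 -> duplicates kept
print(len(all_referer_as_set['https://parent.example/']))   # 2 -> duplicates collapsed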
@@ -424,12 +426,20 @@ class Har2Tree:

         self.url_tree = self._nodes_list.pop(0)

+    def _url_to_local_only_content(self, url: str | None) -> bool:
+        return (url is None
+                or url in ['about:blank', 'about:srcdoc', '']  # not loading anything remotely
+                or url.startswith('data')  # base64 encoded content
+                or url.startswith('chrome-error')  # not in the HAR/tree
+                or url.startswith('blob')  # blobs aren't URLs
+                )
+
     def _load_iframes(self, current: URLNode, frames: FramesResponse) -> None:
         if not frames.get('content') or frames['content'] is None:
             # NOTE: debug stuff, no content makes it pretty useless.
             if frames.get('url'):
-                if frames['url'] == "about:blank":
-                    self.logger.info('Got a frame to about:blank with no content.')
+                if self._url_to_local_only_content(frames['url']):
+                    self.logger.info('Got an empty frame to local content.')
                 else:
                     u = unquote_plus(frames['url'])
                     self.logger.warning(f'Got a url ({u}) for the frame, but no content')
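Note: the new _url_to_local_only_content() helper groups together every URL value that cannot correspond to a remote request in the HAR (None, about:blank/about:srcdoc, empty strings, data:, chrome-error: and blob: URLs). A minimal standalone sketch of the same predicate, with made-up example URLs, showing what it accepts and rejects:

def url_to_local_only_content(url: str | None) -> bool:
    # mirrors the helper added above, outside the Har2Tree class
    return (url is None
            or url in ('about:blank', 'about:srcdoc', '')
            or url.startswith(('data', 'chrome-error', 'blob')))

assert url_to_local_only_content(None)
assert url_to_local_only_content('about:blank')
assert url_to_local_only_content('data:text/html;base64,PGI+aGk8L2I+')
assert url_to_local_only_content('blob:https://example.com/123e4567')
assert not url_to_local_only_content('https://example.com/frame.html')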
@@ -437,12 +447,13 @@ class Har2Tree:
                 self.logger.info('Got a frame, but no content.')
             return

-        if frames.get('url') and not (frames['url'] in ['about:blank']):
+        if frames.get('url') and not self._url_to_local_only_content(frames['url']):
             u = unquote_plus(frames['url'])
+            possible_child_name = {u, u.split('#', 1)[0]}
             # this url should be in a node directly attached to that one
             # we need to find that node
             for child in current.traverse():
-                if child.name in [u, u.split('#', 1)[0]]:
+                if child.name in possible_child_name:
                     self.logger.debug(f'Found URL "{u}".')
                     # Found the node, adding the content
                     if not hasattr(child, 'rendered_frame'):
@@ -457,7 +468,9 @@ class Har2Tree:
                     break
             else:
                 # Couldn'd find the node Oo
-                self.logger.warning(f'Unable to find "{u}" in the children of "{current.name}"')
+                to_print = ', '.join(possible_child_name)
+                children_to_print = ', '.join([child.name for child in current.traverse()])
+                self.logger.warning(f'Unable to find "{to_print}" in the children of "{current.name}" - {children_to_print}')
         else:
             self.logger.debug(f'"{current.name}" contains an iFrame.')
             # No URL, this frame is directly in the parent frame.
@@ -594,7 +607,7 @@ class Har2Tree:

            if hasattr(n, 'initiator_url'):
                # The HAR file was created by chrome/chromium and we got the _initiator key
-                self.all_initiator_url[n.initiator_url].append(n.name)
+                self.all_initiator_url[n.initiator_url].add(n.name)

            if url_entry['startedDateTime'] in self.har.pages_start_times:
                for page in self.har.pages_start_times[url_entry['startedDateTime']]:
@@ -607,7 +620,7 @@ class Har2Tree:
            if hasattr(n, 'referer') and i > 0:
                # NOTE 2021-05-14: referer to self are a real thing: url -> POST to self
                if n.name != n.referer or ('method' in n.request and n.request['method'] == 'POST'):
-                    self.all_referer[n.referer].append(n.name)
+                    self.all_referer[n.referer].add(n.name)

            self._nodes_list.append(n)
            self.all_url_requests[n.name].append(n)
@@ -728,10 +741,13 @@ class Har2Tree:
        self.make_hostname_tree(self.url_tree, self.hostname_tree)
        if dev_debug_mode:
            self._all_urlnodes_in_host_tree()
-        if self.har.frames.get('children') and self.har.frames['children'] is not None:
-            # we have frames in the main one
-            for f_child in self.har.frames['children']:
-                self._load_iframes(self.rendered_node, f_child)
+        if isinstance(self.har.frames, dict):
+            if self.har.frames.get('children') and self.har.frames['children'] is not None:
+                # we have frames in the main one
+                for f_child in self.har.frames['children']:
+                    self._load_iframes(self.rendered_node, f_child)
+        else:
+            self.logger.warning(f'Wrong format for the frames ({type(self.har.frames)}), very old capture.')
        return self.url_tree

    @trace_make_subtree_fallback
@@ -806,6 +822,30 @@ class Har2Tree:
            # no way to attach it to anything else, attach to the root node
            self._make_subtree(self.url_tree, [node], fallback=True)

+    def all_real_urls_in_children(self, frame: FramesResponse) -> Iterator[str]:
+        # from a frame, search all the real urls in each of the children, stop at the first one
+        if (frame.get('url') and frame['url'] is not None and not self._url_to_local_only_content(frame['url'])):
+            yield frame['url']
+        else:
+            # got no real URL, try the children
+            if frame.get('children') and frame['children'] is not None:
+                for c in frame['children']:
+                    yield from self.all_real_urls_in_children(c)
+
+    def search_in_frames(self, urls: set[str], frame: FramesResponse) -> Iterator[str]:
+        # If the frame doesn't have children, there are no potential URLs to attach
+        if not frame.get('children') or frame['children'] is None:
+            return None
+
+        if frame.get('url'):
+            u = unquote_plus(frame['url'])
+            if urls & {u, u.split('#', 1)[0]}:
+                # got a matching URL, get list of potential iframes urls
+                for c in frame['children']:
+                    yield from self.all_real_urls_in_children(c)
+        for c in frame['children']:
+            yield from self.search_in_frames(urls, c)
+
    @trace_make_subtree
    def _make_subtree(self, root: URLNode, nodes_to_attach: list[URLNode] | None=None,
                      dev_debug: bool=False, fallback: bool=False) -> None:
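Note: the two generators added above walk the browser's frame tree: search_in_frames() looks for a frame whose URL matches one of the candidate names of the current node (with or without fragment), and all_real_urls_in_children() then yields, for each child frame, a URL that actually points at remote content. A simplified standalone sketch, assuming frames is a plain nested dict with 'url' and 'children' keys and skipping the unquoting and fragment handling done in the real methods:

from collections.abc import Iterator

def real_urls_in_children(frame: dict) -> Iterator[str]:
    # yield the frame's own URL if it points at remote content, otherwise recurse
    url = frame.get('url')
    if url and not url.startswith(('about:', 'data', 'chrome-error', 'blob')):
        yield url
    else:
        for child in frame.get('children') or []:
            yield from real_urls_in_children(child)

def search_in_frames(urls: set[str], frame: dict) -> Iterator[str]:
    # find the frame matching one of the candidate URLs, then yield its children's real URLs
    if not frame.get('children'):
        return
    if frame.get('url') in urls:
        for child in frame['children']:
            yield from real_urls_in_children(child)
    for child in frame['children']:
        yield from search_in_frames(urls, child)

frames = {'url': 'https://site.example/',
          'children': [{'url': 'https://site.example/page',
                        'children': [{'url': 'about:blank',
                                      'children': [{'url': 'https://widget.example/frame'}]}]}]}

print(list(search_in_frames({'https://site.example/page'}, frames)))
# ['https://widget.example/frame'] -- the about:blank wrapper is skipped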
@@ -870,6 +910,26 @@ class Har2Tree:
            if unode.empty_response:
                continue

+            # 2025-11-14
+            # the referer of an iframe is the hostname of the parent, even if the parent
+            # is a URL with a full path. Before using the referer, we need to check if we have
+            # the current url in the frame tree. If we do, find nodes (in the remaining list)
+            # with the URLs of the children - any fragment will be missing - and attach that node
+            possible_iframe_urls = {unode.name, unode.name.split('#', 1)[0]}
+            for possible_url in self.search_in_frames(possible_iframe_urls, self.har.frames):
+                cu = unquote_plus(possible_url)
+                for u in {cu, cu.split('#', 1)[0]}:
+                    if u not in self.all_url_requests:
+                        if '#' not in u:
+                            self.logger.info(f'"{u}" in the frames URLs, but not in the HAR.')
+                        continue
+                    matching_urls = [url_node for url_node in self.all_url_requests[u]
+                                     if url_node in self._nodes_list]
+                    self._nodes_list = [node for node in self._nodes_list if node not in matching_urls]
+                    if dev_debug:
+                        self.logger.warning(f'Found via initiator from {unode.name} to {matching_urls}.')
+                    self._make_subtree(unode, matching_urls)
+
            # The node can have a redirect, but also trigger ressources refering to themselves, we need to trigger this code on each node.
            if self.all_initiator_url.get(unode.name):
                # The URL (unode.name) is in the list of known urls initiating calls
@@ -72,7 +72,7 @@ def make_hhhash(entry: dict[str, Any]) -> str:
    # We need the HTTP version used for the query:
    # * The HTTP Header names in HTTP 1.1 can have uppercase characters
    # * The HTTP Header names in HTTP 2 *must* be lowercase: https://www.rfc-editor.org/rfc/rfc7540#section-8.1.2
-    if entry['httpVersion'].lower() in ["http/1.1", "http/1.0"]:
+    if entry['httpVersion'].lower() in ["http/1.1", "http/1.0", "1.1"]:
        return f'hhh:1:{sha256}'
    if entry['httpVersion'].lower() == "http/2.0":
        return f'hhh:2:{sha256}'
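Note: the only change to make_hhhash() is an extra accepted spelling of the HTTP version: "1.1" is now treated like "http/1.1" and "http/1.0", presumably because some HAR generators report the protocol without the "http/" prefix. A minimal sketch of just the prefix selection (hhhash_prefix is a made-up helper name; the real function also computes the sha256 of the header names):

def hhhash_prefix(http_version: str) -> str | None:
    # hypothetical helper showing only the version handling from the diff above
    version = http_version.lower()
    if version in ("http/1.1", "http/1.0", "1.1"):
        return 'hhh:1:'
    if version == "http/2.0":
        return 'hhh:2:'
    return None

assert hhhash_prefix("HTTP/1.1") == 'hhh:1:'
assert hhhash_prefix("1.1") == 'hhh:1:'      # newly accepted form
assert hhhash_prefix("http/2.0") == 'hhh:2:'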
@@ -1,6 +1,6 @@
 [project]
 name = "har2tree"
-version = "1.34.1"
+version = "1.35.0"
 description = "HTTP Archive (HAR) to ETE Toolkit generator"
 authors = [
     {name="Raphaël Vinot", email="raphael.vinot@circl.lu"}
@@ -14,12 +14,12 @@ dynamic = [ "classifiers" ]
 dependencies = [
     "ete3 (>=3.1.3)",
     "beautifulsoup4[charset-normalizer,lxml] (>=4.14.2)",
-    "publicsuffixlist (>=1.0.2.20251107)",
+    "publicsuffixlist (>=1.0.2.20251115)",
     "filetype (>=1.2.0)",
     # poetry up fails with the version of numpy forced for python < 3.11.
     # The work around is to comment it, run poetry up, uncomment it. and run poetry update.
     "numpy (>=2.2,<2.3) ; python_version < '3.11'",
-    "numpy (>=2.3.4) ; python_version >= \"3.11\" and python_version < \"4.0\"",
+    "numpy (>=2.3.5) ; python_version >= \"3.11\" and python_version < \"4.0\"",
     "w3lib (>=2.3.1)",
     "tinycss2 (>=1.4.0)",
     "legacy-cgi (>=2.6.4) ; python_version >= \"3.13\" and python_version < \"4.0\"",
@@ -49,7 +49,7 @@ docs = ["Sphinx (>=8.2.3) ; python_version >= \"3.11\"", "six (>=1.17.0)"]
 [tool.poetry.group.dev.dependencies]
 mypy = "^1.18.2"
 pytest-cov = "^7.0.0"
-coverage = "^7.11.1"
+coverage = "^7.11.3"
 types-beautifulsoup4 = "^4.12.0.20250516"

 [build-system]