har2tree 1.34.1__py3-none-any.whl → 1.35.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- har2tree/har2tree.py +73 -13
- har2tree/helper.py +1 -1
- {har2tree-1.34.1.dist-info → har2tree-1.35.0.dist-info}/METADATA +3 -3
- har2tree-1.35.0.dist-info/RECORD +10 -0
- har2tree-1.34.1.dist-info/RECORD +0 -10
- {har2tree-1.34.1.dist-info → har2tree-1.35.0.dist-info}/WHEEL +0 -0
- {har2tree-1.34.1.dist-info → har2tree-1.35.0.dist-info}/licenses/LICENSE +0 -0
har2tree/har2tree.py
CHANGED
|
@@ -14,6 +14,7 @@ from io import BytesIO
|
|
|
14
14
|
from operator import itemgetter
|
|
15
15
|
from pathlib import Path
|
|
16
16
|
from typing import Any, TypedDict
|
|
17
|
+
from collections.abc import Iterator
|
|
17
18
|
from collections.abc import Callable
|
|
18
19
|
from urllib.parse import unquote_plus, urlparse
|
|
19
20
|
|
|
@@ -332,8 +333,9 @@ class Har2Tree:
|
|
|
332
333
|
self.pages_root: dict[str, str] = {}
|
|
333
334
|
|
|
334
335
|
self.all_redirects: list[str] = []
|
|
335
|
-
|
|
336
|
-
self.
|
|
336
|
+
# 2025-11-16: make values of referers and initiators sets because there will be duplicates
|
|
337
|
+
self.all_referer: dict[str, set[str]] = defaultdict(set)
|
|
338
|
+
self.all_initiator_url: dict[str, set[str]] = defaultdict(set)
|
|
337
339
|
self._load_url_entries()
|
|
338
340
|
|
|
339
341
|
# Generate cookies lookup tables
|
|
@@ -424,12 +426,20 @@ class Har2Tree:
|
|
|
424
426
|
|
|
425
427
|
self.url_tree = self._nodes_list.pop(0)
|
|
426
428
|
|
|
429
|
+
def _url_to_local_only_content(self, url: str | None) -> bool:
|
|
430
|
+
return (url is None
|
|
431
|
+
or url in ['about:blank', 'about:srcdoc', ''] # not loading anything remotely
|
|
432
|
+
or url.startswith('data') # base64 encoded content
|
|
433
|
+
or url.startswith('chrome-error') # not in the HAR/tree
|
|
434
|
+
or url.startswith('blob') # blobs aren't URLs
|
|
435
|
+
)
|
|
436
|
+
|
|
427
437
|
def _load_iframes(self, current: URLNode, frames: FramesResponse) -> None:
|
|
428
438
|
if not frames.get('content') or frames['content'] is None:
|
|
429
439
|
# NOTE: debug stuff, no content makes it pretty useless.
|
|
430
440
|
if frames.get('url'):
|
|
431
|
-
if frames['url']
|
|
432
|
-
self.logger.info('Got
|
|
441
|
+
if self._url_to_local_only_content(frames['url']):
|
|
442
|
+
self.logger.info('Got an empty frame to local content.')
|
|
433
443
|
else:
|
|
434
444
|
u = unquote_plus(frames['url'])
|
|
435
445
|
self.logger.warning(f'Got a url ({u}) for the frame, but no content')
|
|
@@ -437,12 +447,13 @@ class Har2Tree:
|
|
|
437
447
|
self.logger.info('Got a frame, but no content.')
|
|
438
448
|
return
|
|
439
449
|
|
|
440
|
-
if frames.get('url') and not (frames['url']
|
|
450
|
+
if frames.get('url') and not self._url_to_local_only_content(frames['url']):
|
|
441
451
|
u = unquote_plus(frames['url'])
|
|
452
|
+
possible_child_name = {u, u.split('#', 1)[0]}
|
|
442
453
|
# this url should be in a node directly attached to that one
|
|
443
454
|
# we need to find that node
|
|
444
455
|
for child in current.traverse():
|
|
445
|
-
if child.name in
|
|
456
|
+
if child.name in possible_child_name:
|
|
446
457
|
self.logger.debug(f'Found URL "{u}".')
|
|
447
458
|
# Found the node, adding the content
|
|
448
459
|
if not hasattr(child, 'rendered_frame'):
|
|
@@ -457,7 +468,9 @@ class Har2Tree:
|
|
|
457
468
|
break
|
|
458
469
|
else:
|
|
459
470
|
# Couldn'd find the node Oo
|
|
460
|
-
|
|
471
|
+
to_print = ', '.join(possible_child_name)
|
|
472
|
+
children_to_print = ', '.join([child.name for child in current.traverse()])
|
|
473
|
+
self.logger.warning(f'Unable to find "{to_print}" in the children of "{current.name}" - {children_to_print}')
|
|
461
474
|
else:
|
|
462
475
|
self.logger.debug(f'"{current.name}" contains an iFrame.')
|
|
463
476
|
# No URL, this frame is directly in the parent frame.
|
|
@@ -594,7 +607,7 @@ class Har2Tree:
|
|
|
594
607
|
|
|
595
608
|
if hasattr(n, 'initiator_url'):
|
|
596
609
|
# The HAR file was created by chrome/chromium and we got the _initiator key
|
|
597
|
-
self.all_initiator_url[n.initiator_url].
|
|
610
|
+
self.all_initiator_url[n.initiator_url].add(n.name)
|
|
598
611
|
|
|
599
612
|
if url_entry['startedDateTime'] in self.har.pages_start_times:
|
|
600
613
|
for page in self.har.pages_start_times[url_entry['startedDateTime']]:
|
|
@@ -607,7 +620,7 @@ class Har2Tree:
|
|
|
607
620
|
if hasattr(n, 'referer') and i > 0:
|
|
608
621
|
# NOTE 2021-05-14: referer to self are a real thing: url -> POST to self
|
|
609
622
|
if n.name != n.referer or ('method' in n.request and n.request['method'] == 'POST'):
|
|
610
|
-
self.all_referer[n.referer].
|
|
623
|
+
self.all_referer[n.referer].add(n.name)
|
|
611
624
|
|
|
612
625
|
self._nodes_list.append(n)
|
|
613
626
|
self.all_url_requests[n.name].append(n)
|
|
@@ -728,10 +741,13 @@ class Har2Tree:
|
|
|
728
741
|
self.make_hostname_tree(self.url_tree, self.hostname_tree)
|
|
729
742
|
if dev_debug_mode:
|
|
730
743
|
self._all_urlnodes_in_host_tree()
|
|
731
|
-
if
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
self.
|
|
744
|
+
if isinstance(self.har.frames, dict):
|
|
745
|
+
if self.har.frames.get('children') and self.har.frames['children'] is not None:
|
|
746
|
+
# we have frames in the main one
|
|
747
|
+
for f_child in self.har.frames['children']:
|
|
748
|
+
self._load_iframes(self.rendered_node, f_child)
|
|
749
|
+
else:
|
|
750
|
+
self.logger.warning(f'Wrong format for the frames ({type(self.har.frames)}), very old capture.')
|
|
735
751
|
return self.url_tree
|
|
736
752
|
|
|
737
753
|
@trace_make_subtree_fallback
|
|
@@ -806,6 +822,30 @@ class Har2Tree:
|
|
|
806
822
|
# no way to attach it to anything else, attach to the root node
|
|
807
823
|
self._make_subtree(self.url_tree, [node], fallback=True)
|
|
808
824
|
|
|
825
|
+
def all_real_urls_in_children(self, frame: FramesResponse) -> Iterator[str]:
|
|
826
|
+
# from a frame, search all the real urls in each of the children, stop at the first one
|
|
827
|
+
if (frame.get('url') and frame['url'] is not None and not self._url_to_local_only_content(frame['url'])):
|
|
828
|
+
yield frame['url']
|
|
829
|
+
else:
|
|
830
|
+
# got no real URL, try the children
|
|
831
|
+
if frame.get('children') and frame['children'] is not None:
|
|
832
|
+
for c in frame['children']:
|
|
833
|
+
yield from self.all_real_urls_in_children(c)
|
|
834
|
+
|
|
835
|
+
def search_in_frames(self, urls: set[str], frame: FramesResponse) -> Iterator[str]:
|
|
836
|
+
# If the frame doesn't have children, there are no potential URLs to attach
|
|
837
|
+
if not frame.get('children') or frame['children'] is None:
|
|
838
|
+
return None
|
|
839
|
+
|
|
840
|
+
if frame.get('url'):
|
|
841
|
+
u = unquote_plus(frame['url'])
|
|
842
|
+
if urls & {u, u.split('#', 1)[0]}:
|
|
843
|
+
# got a matching URL, get list of potential iframes urls
|
|
844
|
+
for c in frame['children']:
|
|
845
|
+
yield from self.all_real_urls_in_children(c)
|
|
846
|
+
for c in frame['children']:
|
|
847
|
+
yield from self.search_in_frames(urls, c)
|
|
848
|
+
|
|
809
849
|
@trace_make_subtree
|
|
810
850
|
def _make_subtree(self, root: URLNode, nodes_to_attach: list[URLNode] | None=None,
|
|
811
851
|
dev_debug: bool=False, fallback: bool=False) -> None:
|
|
@@ -870,6 +910,26 @@ class Har2Tree:
|
|
|
870
910
|
if unode.empty_response:
|
|
871
911
|
continue
|
|
872
912
|
|
|
913
|
+
# 2025-11-14
|
|
914
|
+
# the referer of an iframe is the hostname of the parent, even if the parent
|
|
915
|
+
# is a URL with a full path. Before using the referer, we need to check if we have
|
|
916
|
+
# the current url in the frame tree. If we do, find nodes (in the remaining list)
|
|
917
|
+
# with the URLs of the children - any fragment will be missing - and attach that node
|
|
918
|
+
possible_iframe_urls = {unode.name, unode.name.split('#', 1)[0]}
|
|
919
|
+
for possible_url in self.search_in_frames(possible_iframe_urls, self.har.frames):
|
|
920
|
+
cu = unquote_plus(possible_url)
|
|
921
|
+
for u in {cu, cu.split('#', 1)[0]}:
|
|
922
|
+
if u not in self.all_url_requests:
|
|
923
|
+
if '#' not in u:
|
|
924
|
+
self.logger.info(f'"{u}" in the frames URLs, but not in the HAR.')
|
|
925
|
+
continue
|
|
926
|
+
matching_urls = [url_node for url_node in self.all_url_requests[u]
|
|
927
|
+
if url_node in self._nodes_list]
|
|
928
|
+
self._nodes_list = [node for node in self._nodes_list if node not in matching_urls]
|
|
929
|
+
if dev_debug:
|
|
930
|
+
self.logger.warning(f'Found via initiator from {unode.name} to {matching_urls}.')
|
|
931
|
+
self._make_subtree(unode, matching_urls)
|
|
932
|
+
|
|
873
933
|
# The node can have a redirect, but also trigger ressources refering to themselves, we need to trigger this code on each node.
|
|
874
934
|
if self.all_initiator_url.get(unode.name):
|
|
875
935
|
# The URL (unode.name) is in the list of known urls initiating calls
|
har2tree/helper.py
CHANGED
|
@@ -72,7 +72,7 @@ def make_hhhash(entry: dict[str, Any]) -> str:
|
|
|
72
72
|
# We need the HTTP version used for the query:
|
|
73
73
|
# * The HTTP Header names in HTTP 1.1 can have uppercase characters
|
|
74
74
|
# * The HTTP Header names in HTTP 2 *must* be lowercase: https://www.rfc-editor.org/rfc/rfc7540#section-8.1.2
|
|
75
|
-
if entry['httpVersion'].lower() in ["http/1.1", "http/1.0"]:
|
|
75
|
+
if entry['httpVersion'].lower() in ["http/1.1", "http/1.0", "1.1"]:
|
|
76
76
|
return f'hhh:1:{sha256}'
|
|
77
77
|
if entry['httpVersion'].lower() == "http/2.0":
|
|
78
78
|
return f'hhh:2:{sha256}'
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: har2tree
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.35.0
|
|
4
4
|
Summary: HTTP Archive (HAR) to ETE Toolkit generator
|
|
5
5
|
License-Expression: BSD-3-Clause
|
|
6
6
|
License-File: LICENSE
|
|
@@ -28,8 +28,8 @@ Requires-Dist: json-stream (>=2.3.3,<3.0.0)
|
|
|
28
28
|
Requires-Dist: legacy-cgi (>=2.6.4) ; python_version >= "3.13" and python_version < "4.0"
|
|
29
29
|
Requires-Dist: multipart (>=1.3.0,<2.0.0)
|
|
30
30
|
Requires-Dist: numpy (>=2.2,<2.3) ; python_version < "3.11"
|
|
31
|
-
Requires-Dist: numpy (>=2.3.
|
|
32
|
-
Requires-Dist: publicsuffixlist (>=1.0.2.
|
|
31
|
+
Requires-Dist: numpy (>=2.3.5) ; python_version >= "3.11" and python_version < "4.0"
|
|
32
|
+
Requires-Dist: publicsuffixlist (>=1.0.2.20251115)
|
|
33
33
|
Requires-Dist: requests-toolbelt (>=1.0.0,<2.0.0)
|
|
34
34
|
Requires-Dist: six (>=1.17.0) ; extra == "docs"
|
|
35
35
|
Requires-Dist: tinycss2 (>=1.4.0)
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
har2tree/__init__.py,sha256=Na3mxHkUBq3rzYbxiLNJF37DxH5mcghSorjzXw5Teug,422
|
|
2
|
+
har2tree/har2tree.py,sha256=m-Ge_ihbPMVtjNvzSxi8p664a4Jr_EtmFCNL80xGrHY,52731
|
|
3
|
+
har2tree/helper.py,sha256=ktX5Fq-K_t4r0VVAXIH4uy7xc-qCjtSaiUvkX_PYxhw,20737
|
|
4
|
+
har2tree/nodes.py,sha256=QWKqEUnuW7J6pASVvzwWAQNqL-_KDzSs2ld6uJl3qbw,37710
|
|
5
|
+
har2tree/parser.py,sha256=4yej1OcVYAIiLfzYZsO9WCw3WyM_ykDTuvpW7UO1ROE,3645
|
|
6
|
+
har2tree/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
|
+
har2tree-1.35.0.dist-info/METADATA,sha256=nfxaJZ7GjmAyCiN1hWZ5K0ZLt6qR9MrLRSqJsoi4EXQ,2239
|
|
8
|
+
har2tree-1.35.0.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
|
|
9
|
+
har2tree-1.35.0.dist-info/licenses/LICENSE,sha256=Xa4EVROgJsEo10CW-ISCRiw0TtqdKz1JuM3BBLBM55c,1803
|
|
10
|
+
har2tree-1.35.0.dist-info/RECORD,,
|
har2tree-1.34.1.dist-info/RECORD
DELETED
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
har2tree/__init__.py,sha256=Na3mxHkUBq3rzYbxiLNJF37DxH5mcghSorjzXw5Teug,422
|
|
2
|
-
har2tree/har2tree.py,sha256=JdHGoyXNmu3ZCLRDJ-QC253Q2Lfi4SLHHBDS6Gt4Ez0,49172
|
|
3
|
-
har2tree/helper.py,sha256=psMpYWs5w0CONLfEo33yFgz6VwVY13xYbNDejOZ_EDw,20730
|
|
4
|
-
har2tree/nodes.py,sha256=QWKqEUnuW7J6pASVvzwWAQNqL-_KDzSs2ld6uJl3qbw,37710
|
|
5
|
-
har2tree/parser.py,sha256=4yej1OcVYAIiLfzYZsO9WCw3WyM_ykDTuvpW7UO1ROE,3645
|
|
6
|
-
har2tree/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
|
-
har2tree-1.34.1.dist-info/METADATA,sha256=SfEJxkKbVnEDpXtwYiLtYhRi3vZQBIYn4HotdWavTl0,2239
|
|
8
|
-
har2tree-1.34.1.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
|
|
9
|
-
har2tree-1.34.1.dist-info/licenses/LICENSE,sha256=Xa4EVROgJsEo10CW-ISCRiw0TtqdKz1JuM3BBLBM55c,1803
|
|
10
|
-
har2tree-1.34.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|