har2tree 1.34.1__py3-none-any.whl → 1.34.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- har2tree/har2tree.py +72 -11
- {har2tree-1.34.1.dist-info → har2tree-1.34.2.dist-info}/METADATA +2 -2
- {har2tree-1.34.1.dist-info → har2tree-1.34.2.dist-info}/RECORD +5 -5
- {har2tree-1.34.1.dist-info → har2tree-1.34.2.dist-info}/WHEEL +0 -0
- {har2tree-1.34.1.dist-info → har2tree-1.34.2.dist-info}/licenses/LICENSE +0 -0
har2tree/har2tree.py
CHANGED
|
@@ -14,6 +14,7 @@ from io import BytesIO
|
|
|
14
14
|
from operator import itemgetter
|
|
15
15
|
from pathlib import Path
|
|
16
16
|
from typing import Any, TypedDict
|
|
17
|
+
from collections.abc import Iterator
|
|
17
18
|
from collections.abc import Callable
|
|
18
19
|
from urllib.parse import unquote_plus, urlparse
|
|
19
20
|
|
|
@@ -332,8 +333,9 @@ class Har2Tree:
|
|
|
332
333
|
self.pages_root: dict[str, str] = {}
|
|
333
334
|
|
|
334
335
|
self.all_redirects: list[str] = []
|
|
335
|
-
|
|
336
|
-
self.
|
|
336
|
+
# 2025-11-16: make values of referers and initiators sets because there will be duplicates
|
|
337
|
+
self.all_referer: dict[str, set[str]] = defaultdict(set)
|
|
338
|
+
self.all_initiator_url: dict[str, set[str]] = defaultdict(set)
|
|
337
339
|
self._load_url_entries()
|
|
338
340
|
|
|
339
341
|
# Generate cookies lookup tables
|
|
@@ -437,12 +439,18 @@ class Har2Tree:
|
|
|
437
439
|
self.logger.info('Got a frame, but no content.')
|
|
438
440
|
return
|
|
439
441
|
|
|
440
|
-
if frames.get('url')
|
|
442
|
+
if (frames.get('url')
|
|
443
|
+
and not (frames['url'] in ['about:blank'] # not loading anything, same as empty
|
|
444
|
+
or frames['url'].startswith('data') # base64 encoded content
|
|
445
|
+
or frames['url'].startswith('chrome-error') # not in the HAR/tree
|
|
446
|
+
or frames['url'].startswith('blob') # blobs aren't URLs
|
|
447
|
+
)):
|
|
441
448
|
u = unquote_plus(frames['url'])
|
|
449
|
+
possible_child_name = {u, u.split('#', 1)[0]}
|
|
442
450
|
# this url should be in a node directly attached to that one
|
|
443
451
|
# we need to find that node
|
|
444
452
|
for child in current.traverse():
|
|
445
|
-
if child.name in
|
|
453
|
+
if child.name in possible_child_name:
|
|
446
454
|
self.logger.debug(f'Found URL "{u}".')
|
|
447
455
|
# Found the node, adding the content
|
|
448
456
|
if not hasattr(child, 'rendered_frame'):
|
|
@@ -457,7 +465,9 @@ class Har2Tree:
|
|
|
457
465
|
break
|
|
458
466
|
else:
|
|
459
467
|
# Couldn'd find the node Oo
|
|
460
|
-
|
|
468
|
+
to_print = ', '.join(possible_child_name)
|
|
469
|
+
children_to_print = ', '.join([child.name for child in current.traverse()])
|
|
470
|
+
self.logger.warning(f'Unable to find "{to_print}" in the children of "{current.name}" - {children_to_print}')
|
|
461
471
|
else:
|
|
462
472
|
self.logger.debug(f'"{current.name}" contains an iFrame.')
|
|
463
473
|
# No URL, this frame is directly in the parent frame.
|
|
@@ -594,7 +604,7 @@ class Har2Tree:
|
|
|
594
604
|
|
|
595
605
|
if hasattr(n, 'initiator_url'):
|
|
596
606
|
# The HAR file was created by chrome/chromium and we got the _initiator key
|
|
597
|
-
self.all_initiator_url[n.initiator_url].append(n.name)
|
|
607
|
+
self.all_initiator_url[n.initiator_url].add(n.name)
|
|
598
608
|
|
|
599
609
|
if url_entry['startedDateTime'] in self.har.pages_start_times:
|
|
600
610
|
for page in self.har.pages_start_times[url_entry['startedDateTime']]:
|
|
@@ -607,7 +617,7 @@ class Har2Tree:
|
|
|
607
617
|
if hasattr(n, 'referer') and i > 0:
|
|
608
618
|
# NOTE 2021-05-14: referer to self are a real thing: url -> POST to self
|
|
609
619
|
if n.name != n.referer or ('method' in n.request and n.request['method'] == 'POST'):
|
|
610
|
-
self.all_referer[n.referer].append(n.name)
|
|
620
|
+
self.all_referer[n.referer].add(n.name)
|
|
611
621
|
|
|
612
622
|
self._nodes_list.append(n)
|
|
613
623
|
self.all_url_requests[n.name].append(n)
|
|
@@ -728,10 +738,13 @@ class Har2Tree:
|
|
|
728
738
|
self.make_hostname_tree(self.url_tree, self.hostname_tree)
|
|
729
739
|
if dev_debug_mode:
|
|
730
740
|
self._all_urlnodes_in_host_tree()
|
|
731
|
-
if
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
self.
|
|
741
|
+
if isinstance(self.har.frames, dict):
|
|
742
|
+
if self.har.frames.get('children') and self.har.frames['children'] is not None:
|
|
743
|
+
# we have frames in the main one
|
|
744
|
+
for f_child in self.har.frames['children']:
|
|
745
|
+
self._load_iframes(self.rendered_node, f_child)
|
|
746
|
+
else:
|
|
747
|
+
self.logger.warning(f'Wrong format for the frames ({type(self.har.frames)}), very old capture.')
|
|
735
748
|
return self.url_tree
|
|
736
749
|
|
|
737
750
|
@trace_make_subtree_fallback
|
|
@@ -806,6 +819,34 @@ class Har2Tree:
|
|
|
806
819
|
# no way to attach it to anything else, attach to the root node
|
|
807
820
|
self._make_subtree(self.url_tree, [node], fallback=True)
|
|
808
821
|
|
|
822
|
+
def all_real_urls_in_children(self, frame: FramesResponse) -> Iterator[str]:
|
|
823
|
+
# from a frame, search all the real urls in each of the children, stop at the first one
|
|
824
|
+
if (frame.get('url') and frame['url'] is not None
|
|
825
|
+
and not (frame['url'] in ['about:blank', 'about:srcdoc'] # not loading anything, same as empty
|
|
826
|
+
or frame['url'].startswith('data') # base64 encoded content
|
|
827
|
+
or frame['url'].startswith('chrome-error') # not in the HAR/tree
|
|
828
|
+
or frame['url'].startswith('blob'))): # blobs aren't URLs
|
|
829
|
+
yield frame['url']
|
|
830
|
+
else:
|
|
831
|
+
# got no real URL, try the children
|
|
832
|
+
if frame.get('children') and frame['children'] is not None:
|
|
833
|
+
for c in frame['children']:
|
|
834
|
+
yield from self.all_real_urls_in_children(c)
|
|
835
|
+
|
|
836
|
+
def search_in_frames(self, urls: set[str], frame: FramesResponse) -> Iterator[str]:
|
|
837
|
+
# If the frame doesn't have children, there are no potential URLs to attach
|
|
838
|
+
if not frame.get('children') or frame['children'] is None:
|
|
839
|
+
return None
|
|
840
|
+
|
|
841
|
+
if frame.get('url'):
|
|
842
|
+
u = unquote_plus(frame['url'])
|
|
843
|
+
if urls & {u, u.split('#', 1)[0]}:
|
|
844
|
+
# got a matching URL, get list of potential iframes urls
|
|
845
|
+
for c in frame['children']:
|
|
846
|
+
yield from self.all_real_urls_in_children(c)
|
|
847
|
+
for c in frame['children']:
|
|
848
|
+
yield from self.search_in_frames(urls, c)
|
|
849
|
+
|
|
809
850
|
@trace_make_subtree
|
|
810
851
|
def _make_subtree(self, root: URLNode, nodes_to_attach: list[URLNode] | None=None,
|
|
811
852
|
dev_debug: bool=False, fallback: bool=False) -> None:
|
|
@@ -870,6 +911,26 @@ class Har2Tree:
|
|
|
870
911
|
if unode.empty_response:
|
|
871
912
|
continue
|
|
872
913
|
|
|
914
|
+
# 2025-11-14
|
|
915
|
+
# the referer of an iframe is the hostname of the parent, even if the parent
|
|
916
|
+
# is a URL with a full path. Before using the referer, we need to check if we have
|
|
917
|
+
# the current url in the frame tree. If we do, find nodes (in the remaining list)
|
|
918
|
+
# with the URLs of the children - any fragment will be missing - and attach that node
|
|
919
|
+
possible_iframe_urls = {unode.name, unode.name.split('#', 1)[0]}
|
|
920
|
+
for possible_url in self.search_in_frames(possible_iframe_urls, self.har.frames):
|
|
921
|
+
cu = unquote_plus(possible_url)
|
|
922
|
+
for u in {cu, cu.split('#', 1)[0]}:
|
|
923
|
+
if u not in self.all_url_requests:
|
|
924
|
+
if '#' not in u:
|
|
925
|
+
self.logger.info(f'"{u}" in the frames URLs, but not in the HAR.')
|
|
926
|
+
continue
|
|
927
|
+
matching_urls = [url_node for url_node in self.all_url_requests[u]
|
|
928
|
+
if url_node in self._nodes_list]
|
|
929
|
+
self._nodes_list = [node for node in self._nodes_list if node not in matching_urls]
|
|
930
|
+
if dev_debug:
|
|
931
|
+
self.logger.warning(f'Found via initiator from {unode.name} to {matching_urls}.')
|
|
932
|
+
self._make_subtree(unode, matching_urls)
|
|
933
|
+
|
|
873
934
|
# The node can have a redirect, but also trigger ressources refering to themselves, we need to trigger this code on each node.
|
|
874
935
|
if self.all_initiator_url.get(unode.name):
|
|
875
936
|
# The URL (unode.name) is in the list of known urls initiating calls
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: har2tree
|
|
3
|
-
Version: 1.34.1
|
|
3
|
+
Version: 1.34.2
|
|
4
4
|
Summary: HTTP Archive (HAR) to ETE Toolkit generator
|
|
5
5
|
License-Expression: BSD-3-Clause
|
|
6
6
|
License-File: LICENSE
|
|
@@ -29,7 +29,7 @@ Requires-Dist: legacy-cgi (>=2.6.4) ; python_version >= "3.13" and python_versio
|
|
|
29
29
|
Requires-Dist: multipart (>=1.3.0,<2.0.0)
|
|
30
30
|
Requires-Dist: numpy (>=2.2,<2.3) ; python_version < "3.11"
|
|
31
31
|
Requires-Dist: numpy (>=2.3.4) ; python_version >= "3.11" and python_version < "4.0"
|
|
32
|
-
Requires-Dist: publicsuffixlist (>=1.0.2.
|
|
32
|
+
Requires-Dist: publicsuffixlist (>=1.0.2.20251115)
|
|
33
33
|
Requires-Dist: requests-toolbelt (>=1.0.0,<2.0.0)
|
|
34
34
|
Requires-Dist: six (>=1.17.0) ; extra == "docs"
|
|
35
35
|
Requires-Dist: tinycss2 (>=1.4.0)
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
har2tree/__init__.py,sha256=Na3mxHkUBq3rzYbxiLNJF37DxH5mcghSorjzXw5Teug,422
|
|
2
|
-
har2tree/har2tree.py,sha256=
|
|
2
|
+
har2tree/har2tree.py,sha256=1HvGNfUMccR21mnuBnqYr3vG-28wjz0vp2CVcyMuzg8,52958
|
|
3
3
|
har2tree/helper.py,sha256=psMpYWs5w0CONLfEo33yFgz6VwVY13xYbNDejOZ_EDw,20730
|
|
4
4
|
har2tree/nodes.py,sha256=QWKqEUnuW7J6pASVvzwWAQNqL-_KDzSs2ld6uJl3qbw,37710
|
|
5
5
|
har2tree/parser.py,sha256=4yej1OcVYAIiLfzYZsO9WCw3WyM_ykDTuvpW7UO1ROE,3645
|
|
6
6
|
har2tree/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
|
-
har2tree-1.34.
|
|
8
|
-
har2tree-1.34.1.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
|
|
9
|
-
har2tree-1.34.1.dist-info/licenses/LICENSE,sha256=Xa4EVROgJsEo10CW-ISCRiw0TtqdKz1JuM3BBLBM55c,1803
|
|
10
|
-
har2tree-1.34.1.dist-info/RECORD,,
|
|
7
|
+
har2tree-1.34.2.dist-info/METADATA,sha256=7WlrwtcMWJ4YefcbzBwAf9l5d1DjlxJk3oKTGLomzJI,2239
|
|
8
|
+
har2tree-1.34.2.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
|
|
9
|
+
har2tree-1.34.2.dist-info/licenses/LICENSE,sha256=Xa4EVROgJsEo10CW-ISCRiw0TtqdKz1JuM3BBLBM55c,1803
|
|
10
|
+
har2tree-1.34.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|