har2tree 1.33.0__py3-none-any.whl → 1.34.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- har2tree/har2tree.py +97 -9
- har2tree/helper.py +2 -1
- {har2tree-1.33.0.dist-info → har2tree-1.34.1.dist-info}/METADATA +7 -8
- har2tree-1.34.1.dist-info/RECORD +10 -0
- {har2tree-1.33.0.dist-info → har2tree-1.34.1.dist-info}/WHEEL +1 -1
- har2tree-1.33.0.dist-info/RECORD +0 -10
- {har2tree-1.33.0.dist-info → har2tree-1.34.1.dist-info}/licenses/LICENSE +0 -0
har2tree/har2tree.py
CHANGED
|
@@ -13,7 +13,8 @@ from functools import wraps, lru_cache
|
|
|
13
13
|
from io import BytesIO
|
|
14
14
|
from operator import itemgetter
|
|
15
15
|
from pathlib import Path
|
|
16
|
-
from typing import Any, Callable
|
|
16
|
+
from typing import Any, TypedDict
|
|
17
|
+
from collections.abc import Callable
|
|
17
18
|
from urllib.parse import unquote_plus, urlparse
|
|
18
19
|
|
|
19
20
|
from .helper import rebuild_url, Har2TreeError, Har2TreeLogAdapter
|
|
@@ -58,7 +59,8 @@ def trace_make_subtree_fallback(method: Callable[..., None]) -> Callable[..., No
|
|
|
58
59
|
|
|
59
60
|
def trace_make_subtree(method: Callable[..., None]) -> Callable[..., None]:
|
|
60
61
|
@wraps(method)
|
|
61
|
-
def _impl(self: Any, root: URLNode, nodes_to_attach: list[URLNode] | None=None, dev_debug: bool=False) -> None:
|
|
62
|
+
def _impl(self: Any, root: URLNode, nodes_to_attach: list[URLNode] | None=None,
|
|
63
|
+
dev_debug: bool=False, fallback: bool=False) -> None:
|
|
62
64
|
if dev_debug_mode:
|
|
63
65
|
__load_debug_files()
|
|
64
66
|
if dev_debug_url and root.name == dev_debug_url or nodes_to_attach is not None and any(True for u in nodes_to_attach if u.name == dev_debug_url):
|
|
@@ -67,7 +69,7 @@ def trace_make_subtree(method: Callable[..., None]) -> Callable[..., None]:
|
|
|
67
69
|
elif dev_debug_hostname and root.hostname == dev_debug_hostname or nodes_to_attach is not None and any(True for u in nodes_to_attach if u.hostname == dev_debug_hostname):
|
|
68
70
|
root.logger.warning(f'Debugging Hostname: {dev_debug_hostname}.')
|
|
69
71
|
dev_debug = True
|
|
70
|
-
return method(self, root, nodes_to_attach, dev_debug)
|
|
72
|
+
return method(self, root, nodes_to_attach, dev_debug, fallback)
|
|
71
73
|
return _impl
|
|
72
74
|
|
|
73
75
|
|
|
@@ -84,6 +86,15 @@ def __load_debug_files() -> None:
|
|
|
84
86
|
dev_debug_hostname = f.read().strip()
|
|
85
87
|
|
|
86
88
|
|
|
89
|
+
# NOTE: Copy from PlaywrightCapture to avoid extra dep
|
|
90
|
+
class FramesResponse(TypedDict, total=False):
|
|
91
|
+
|
|
92
|
+
name: str
|
|
93
|
+
url: str
|
|
94
|
+
content: str | None
|
|
95
|
+
children: list[FramesResponse] | None
|
|
96
|
+
|
|
97
|
+
|
|
87
98
|
class HarFile():
|
|
88
99
|
|
|
89
100
|
def __init__(self, harfile: Path, capture_uuid: str):
|
|
@@ -135,6 +146,14 @@ class HarFile():
|
|
|
135
146
|
self.logger.debug('No cookies file available.')
|
|
136
147
|
self.cookies = []
|
|
137
148
|
|
|
149
|
+
framesfile = self.path.parent / f'{root_name}.frames.json'
|
|
150
|
+
if framesfile.is_file():
|
|
151
|
+
with framesfile.open() as c:
|
|
152
|
+
self.frames: FramesResponse = json.load(c)
|
|
153
|
+
else:
|
|
154
|
+
self.logger.debug('No frames file available.')
|
|
155
|
+
self.frames = {}
|
|
156
|
+
|
|
138
157
|
dlfile = self.path.parent / f'{root_name}.data'
|
|
139
158
|
dlfilename = self.path.parent / f'{root_name}.data.filename'
|
|
140
159
|
self.downloaded_file: BytesIO | None
|
|
@@ -405,6 +424,54 @@ class Har2Tree:
|
|
|
405
424
|
|
|
406
425
|
self.url_tree = self._nodes_list.pop(0)
|
|
407
426
|
|
|
427
|
+
def _load_iframes(self, current: URLNode, frames: FramesResponse) -> None:
|
|
428
|
+
if not frames.get('content') or frames['content'] is None:
|
|
429
|
+
# NOTE: debug stuff, no content makes it pretty useless.
|
|
430
|
+
if frames.get('url'):
|
|
431
|
+
if frames['url'] == "about:blank":
|
|
432
|
+
self.logger.info('Got a frame to about:blank with no content.')
|
|
433
|
+
else:
|
|
434
|
+
u = unquote_plus(frames['url'])
|
|
435
|
+
self.logger.warning(f'Got a url ({u}) for the frame, but no content')
|
|
436
|
+
else:
|
|
437
|
+
self.logger.info('Got a frame, but no content.')
|
|
438
|
+
return
|
|
439
|
+
|
|
440
|
+
if frames.get('url') and not (frames['url'] in ['about:blank']):
|
|
441
|
+
u = unquote_plus(frames['url'])
|
|
442
|
+
# this url should be in a node directly attached to that one
|
|
443
|
+
# we need to find that node
|
|
444
|
+
for child in current.traverse():
|
|
445
|
+
if child.name in [u, u.split('#', 1)[0]]:
|
|
446
|
+
self.logger.debug(f'Found URL "{u}".')
|
|
447
|
+
# Found the node, adding the content
|
|
448
|
+
if not hasattr(child, 'rendered_frame'):
|
|
449
|
+
child.rendered_frame = []
|
|
450
|
+
child.rendered_frame.append(BytesIO(frames['content'].encode()))
|
|
451
|
+
# and mark the node as iframe
|
|
452
|
+
child.add_feature('iframe', True)
|
|
453
|
+
# if there are children, use that node as parent and call the current method recursvely
|
|
454
|
+
if f_children := frames.get('children'):
|
|
455
|
+
for f_child in f_children:
|
|
456
|
+
self._load_iframes(child, f_child)
|
|
457
|
+
break
|
|
458
|
+
else:
|
|
459
|
+
# Couldn'd find the node Oo
|
|
460
|
+
self.logger.warning(f'Unable to find "{u}" in the children of "{current.name}"')
|
|
461
|
+
else:
|
|
462
|
+
self.logger.debug(f'"{current.name}" contains an iFrame.')
|
|
463
|
+
# No URL, this frame is directly in the parent frame.
|
|
464
|
+
if not hasattr(current, 'rendered_frame'):
|
|
465
|
+
current.rendered_frame = []
|
|
466
|
+
current.rendered_frame.append(BytesIO(frames['content'].encode()))
|
|
467
|
+
self.logger.debug(f'"{current.name}" has {len(current.rendered_frame)} iFrames.')
|
|
468
|
+
# and mark the node as iframe
|
|
469
|
+
current.add_feature('iframe', True)
|
|
470
|
+
# if there are children, use that node as parent and call the current method recursvely
|
|
471
|
+
if f_children := frames.get('children'):
|
|
472
|
+
for f_child in f_children:
|
|
473
|
+
self._load_iframes(current, f_child)
|
|
474
|
+
|
|
408
475
|
@property
|
|
409
476
|
def initial_referer(self) -> str | None:
|
|
410
477
|
'''The referer passed to the first URL in the tree'''
|
|
@@ -621,6 +688,14 @@ class Har2Tree:
|
|
|
621
688
|
for child_node_hostname, child_nodes_url in sub_roots.items():
|
|
622
689
|
self.make_hostname_tree(child_nodes_url, child_node_hostname)
|
|
623
690
|
|
|
691
|
+
def _all_urlnodes_in_host_tree(self) -> None:
|
|
692
|
+
# debug: check if all the nodes in the URL tree are in the hostnode tree (they must have an UUID)
|
|
693
|
+
self.logger.warning('Validating host tree....')
|
|
694
|
+
for urlnode in self.url_tree.traverse():
|
|
695
|
+
if not hasattr(urlnode, 'hostnode_uuid'):
|
|
696
|
+
self.logger.error(f'URL Node not un host tree: {urlnode}')
|
|
697
|
+
self.logger.warning('host tree validated.')
|
|
698
|
+
|
|
624
699
|
def make_tree(self) -> URLNode:
|
|
625
700
|
"""Build URL and Host trees"""
|
|
626
701
|
self._make_subtree(self.url_tree)
|
|
@@ -651,6 +726,12 @@ class Har2Tree:
|
|
|
651
726
|
# Initialize the hostname tree root
|
|
652
727
|
self.hostname_tree.add_url(self.url_tree)
|
|
653
728
|
self.make_hostname_tree(self.url_tree, self.hostname_tree)
|
|
729
|
+
if dev_debug_mode:
|
|
730
|
+
self._all_urlnodes_in_host_tree()
|
|
731
|
+
if self.har.frames.get('children') and self.har.frames['children'] is not None:
|
|
732
|
+
# we have frames in the main one
|
|
733
|
+
for f_child in self.har.frames['children']:
|
|
734
|
+
self._load_iframes(self.rendered_node, f_child)
|
|
654
735
|
return self.url_tree
|
|
655
736
|
|
|
656
737
|
@trace_make_subtree_fallback
|
|
@@ -668,7 +749,7 @@ class Har2Tree:
|
|
|
668
749
|
# we got an non-empty response, breaking
|
|
669
750
|
break
|
|
670
751
|
# attach to the the first response with something, or to whatever we get.
|
|
671
|
-
self._make_subtree(node_with_hostname, [node])
|
|
752
|
+
self._make_subtree(node_with_hostname, [node], fallback=True)
|
|
672
753
|
return
|
|
673
754
|
|
|
674
755
|
# Sometimes, the har has a list of pages, generally when we have HTTP redirects.
|
|
@@ -686,7 +767,7 @@ class Har2Tree:
|
|
|
686
767
|
page_root_node = self.get_url_node_by_uuid(self.pages_root[node.pageref])
|
|
687
768
|
if dev_debug:
|
|
688
769
|
self.logger.warning(f'Failed to attach URLNode in the normal process, attaching node to page {node.pageref} - Node: {page_root_node.uuid} - {page_root_node.name}.')
|
|
689
|
-
self._make_subtree(page_root_node, [node])
|
|
770
|
+
self._make_subtree(page_root_node, [node], fallback=True)
|
|
690
771
|
elif self.rendered_node != self.url_tree:
|
|
691
772
|
# Generally, when we have a bunch of redirects, they (generally) do not branch out
|
|
692
773
|
# before the final landing page *but* it is not always the case: some intermediary
|
|
@@ -698,7 +779,7 @@ class Har2Tree:
|
|
|
698
779
|
# end of this method anyway
|
|
699
780
|
if dev_debug:
|
|
700
781
|
self.logger.warning(f'Failed to attach URLNode in the normal process, attaching node to final redirect: {self.har.final_redirect}.')
|
|
701
|
-
self._make_subtree(self.rendered_node, [node])
|
|
782
|
+
self._make_subtree(self.rendered_node, [node], fallback=True)
|
|
702
783
|
elif 'pages' in self.har.har['log']:
|
|
703
784
|
# No luck, the node is root for this pageref, let's attach it to the prior page in the list, or the very first node (tree root)
|
|
704
785
|
page_before = self.har.har['log']['pages'][0]
|
|
@@ -720,13 +801,14 @@ class Har2Tree:
|
|
|
720
801
|
# node to the root node
|
|
721
802
|
page_root_node = self.url_tree
|
|
722
803
|
self.logger.warning('The pages in the HAR are in in the wrong order, this should not happen but here we are')
|
|
723
|
-
self._make_subtree(page_root_node, [node])
|
|
804
|
+
self._make_subtree(page_root_node, [node], fallback=True)
|
|
724
805
|
else:
|
|
725
806
|
# no way to attach it to anything else, attach to the root node
|
|
726
|
-
self._make_subtree(self.url_tree, [node])
|
|
807
|
+
self._make_subtree(self.url_tree, [node], fallback=True)
|
|
727
808
|
|
|
728
809
|
@trace_make_subtree
|
|
729
|
-
def _make_subtree(self, root: URLNode, nodes_to_attach: list[URLNode] | None=None, dev_debug: bool=False) -> None:
|
|
810
|
+
def _make_subtree(self, root: URLNode, nodes_to_attach: list[URLNode] | None=None,
|
|
811
|
+
dev_debug: bool=False, fallback: bool=False) -> None:
|
|
730
812
|
"""Recursive method building each level of the tree"""
|
|
731
813
|
matching_urls: list[URLNode]
|
|
732
814
|
if nodes_to_attach is None:
|
|
@@ -819,6 +901,12 @@ class Har2Tree:
|
|
|
819
901
|
if hasattr(unode, 'external_ressources'):
|
|
820
902
|
# the url loads external things, and some of them have no referer....
|
|
821
903
|
for external_tag, links in unode.external_ressources.items():
|
|
904
|
+
# 2025-11-06: skip full regex until we're calling this method in the fallback
|
|
905
|
+
# the iframes will often (not always) have a referer set and the URL
|
|
906
|
+
# might be found by the regex and it will not be attached at the
|
|
907
|
+
# right place
|
|
908
|
+
if external_tag == 'full_regex' and not fallback:
|
|
909
|
+
continue
|
|
822
910
|
for link in links:
|
|
823
911
|
if link not in self.all_url_requests or link == self.har.final_redirect:
|
|
824
912
|
# We have a lot of false positives
|
har2tree/helper.py
CHANGED
|
@@ -364,7 +364,8 @@ def find_external_ressources(mimetype: str, data: bytes, base_url: str, all_requ
|
|
|
364
364
|
# link: https://www.w3schools.com/TAGs/tag_link.asp -> href
|
|
365
365
|
# object: https://www.w3schools.com/TAGs/tag_object.asp -> data
|
|
366
366
|
external_ressources: dict[str, list[str]] = {'img': [], 'script': [], 'video': [], 'audio': [],
|
|
367
|
-
'iframe': [],
|
|
367
|
+
'iframe': [],
|
|
368
|
+
'embed': [], 'source': [],
|
|
368
369
|
'link': [],
|
|
369
370
|
'object': [],
|
|
370
371
|
'css': [],
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: har2tree
|
|
3
|
-
Version: 1.33.0
|
|
3
|
+
Version: 1.34.1
|
|
4
4
|
Summary: HTTP Archive (HAR) to ETE Toolkit generator
|
|
5
5
|
License-Expression: BSD-3-Clause
|
|
6
6
|
License-File: LICENSE
|
|
7
7
|
Author: Raphaël Vinot
|
|
8
8
|
Author-email: raphael.vinot@circl.lu
|
|
9
|
-
Requires-Python: >=3.
|
|
9
|
+
Requires-Python: >=3.10,<3.15
|
|
10
10
|
Classifier: Intended Audience :: Information Technology
|
|
11
11
|
Classifier: Intended Audience :: Science/Research
|
|
12
12
|
Classifier: Intended Audience :: Telecommunications Industry
|
|
@@ -21,16 +21,15 @@ Classifier: Topic :: Internet
|
|
|
21
21
|
Classifier: Topic :: Security
|
|
22
22
|
Provides-Extra: docs
|
|
23
23
|
Requires-Dist: Sphinx (>=8.2.3) ; (python_version >= "3.11") and (extra == "docs")
|
|
24
|
-
Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.
|
|
24
|
+
Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.14.2)
|
|
25
25
|
Requires-Dist: ete3 (>=3.1.3)
|
|
26
26
|
Requires-Dist: filetype (>=1.2.0)
|
|
27
27
|
Requires-Dist: json-stream (>=2.3.3,<3.0.0)
|
|
28
|
-
Requires-Dist: legacy-cgi (>=2.6.
|
|
28
|
+
Requires-Dist: legacy-cgi (>=2.6.4) ; python_version >= "3.13" and python_version < "4.0"
|
|
29
29
|
Requires-Dist: multipart (>=1.3.0,<2.0.0)
|
|
30
|
-
Requires-Dist: numpy (
|
|
31
|
-
Requires-Dist: numpy (>=2.
|
|
32
|
-
Requires-Dist:
|
|
33
|
-
Requires-Dist: publicsuffixlist (>=1.0.2.20250919)
|
|
30
|
+
Requires-Dist: numpy (>=2.2,<2.3) ; python_version < "3.11"
|
|
31
|
+
Requires-Dist: numpy (>=2.3.4) ; python_version >= "3.11" and python_version < "4.0"
|
|
32
|
+
Requires-Dist: publicsuffixlist (>=1.0.2.20251107)
|
|
34
33
|
Requires-Dist: requests-toolbelt (>=1.0.0,<2.0.0)
|
|
35
34
|
Requires-Dist: six (>=1.17.0) ; extra == "docs"
|
|
36
35
|
Requires-Dist: tinycss2 (>=1.4.0)
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
har2tree/__init__.py,sha256=Na3mxHkUBq3rzYbxiLNJF37DxH5mcghSorjzXw5Teug,422
|
|
2
|
+
har2tree/har2tree.py,sha256=JdHGoyXNmu3ZCLRDJ-QC253Q2Lfi4SLHHBDS6Gt4Ez0,49172
|
|
3
|
+
har2tree/helper.py,sha256=psMpYWs5w0CONLfEo33yFgz6VwVY13xYbNDejOZ_EDw,20730
|
|
4
|
+
har2tree/nodes.py,sha256=QWKqEUnuW7J6pASVvzwWAQNqL-_KDzSs2ld6uJl3qbw,37710
|
|
5
|
+
har2tree/parser.py,sha256=4yej1OcVYAIiLfzYZsO9WCw3WyM_ykDTuvpW7UO1ROE,3645
|
|
6
|
+
har2tree/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
|
+
har2tree-1.34.1.dist-info/METADATA,sha256=SfEJxkKbVnEDpXtwYiLtYhRi3vZQBIYn4HotdWavTl0,2239
|
|
8
|
+
har2tree-1.34.1.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
|
|
9
|
+
har2tree-1.34.1.dist-info/licenses/LICENSE,sha256=Xa4EVROgJsEo10CW-ISCRiw0TtqdKz1JuM3BBLBM55c,1803
|
|
10
|
+
har2tree-1.34.1.dist-info/RECORD,,
|
har2tree-1.33.0.dist-info/RECORD
DELETED
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
har2tree/__init__.py,sha256=Na3mxHkUBq3rzYbxiLNJF37DxH5mcghSorjzXw5Teug,422
|
|
2
|
-
har2tree/har2tree.py,sha256=Ss8CLjspHs0hA4am9ODX6vNP0J2Gzg3w1D3265vf4kw,44620
|
|
3
|
-
har2tree/helper.py,sha256=CgeXqfBeHs8SbkW7TRNKqJBTZLAu63KggQjbGHCZAGI,20681
|
|
4
|
-
har2tree/nodes.py,sha256=QWKqEUnuW7J6pASVvzwWAQNqL-_KDzSs2ld6uJl3qbw,37710
|
|
5
|
-
har2tree/parser.py,sha256=4yej1OcVYAIiLfzYZsO9WCw3WyM_ykDTuvpW7UO1ROE,3645
|
|
6
|
-
har2tree/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
|
-
har2tree-1.33.0.dist-info/METADATA,sha256=zsM-qSqQdlwGOR_miEr4xvqbimIXulCKKn4fU0B3_Ak,2294
|
|
8
|
-
har2tree-1.33.0.dist-info/WHEEL,sha256=M5asmiAlL6HEcOq52Yi5mmk9KmTVjY2RDPtO4p9DMrc,88
|
|
9
|
-
har2tree-1.33.0.dist-info/licenses/LICENSE,sha256=Xa4EVROgJsEo10CW-ISCRiw0TtqdKz1JuM3BBLBM55c,1803
|
|
10
|
-
har2tree-1.33.0.dist-info/RECORD,,
|
|
File without changes
|