har2tree 1.32.0__tar.gz → 1.34.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {har2tree-1.32.0 → har2tree-1.34.2}/PKG-INFO +11 -12
- {har2tree-1.32.0 → har2tree-1.34.2}/har2tree/har2tree.py +167 -14
- {har2tree-1.32.0 → har2tree-1.34.2}/har2tree/helper.py +2 -1
- {har2tree-1.32.0 → har2tree-1.34.2}/har2tree/nodes.py +2 -1
- {har2tree-1.32.0 → har2tree-1.34.2}/pyproject.toml +12 -13
- {har2tree-1.32.0 → har2tree-1.34.2}/LICENSE +0 -0
- {har2tree-1.32.0 → har2tree-1.34.2}/README.md +0 -0
- {har2tree-1.32.0 → har2tree-1.34.2}/har2tree/__init__.py +0 -0
- {har2tree-1.32.0 → har2tree-1.34.2}/har2tree/parser.py +0 -0
- {har2tree-1.32.0 → har2tree-1.34.2}/har2tree/py.typed +0 -0
{har2tree-1.32.0 → har2tree-1.34.2}/PKG-INFO

@@ -1,36 +1,35 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: har2tree
-Version: 1.32.0
+Version: 1.34.2
 Summary: HTTP Archive (HAR) to ETE Toolkit generator
-License: BSD-3-Clause
+License-Expression: BSD-3-Clause
+License-File: LICENSE
 Author: Raphaël Vinot
 Author-email: raphael.vinot@circl.lu
-Requires-Python: >=3.
+Requires-Python: >=3.10,<3.15
 Classifier: Intended Audience :: Information Technology
 Classifier: Intended Audience :: Science/Research
 Classifier: Intended Audience :: Telecommunications Industry
-Classifier: License :: OSI Approved :: BSD License
 Classifier: Operating System :: POSIX :: Linux
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
 Classifier: Topic :: Internet
 Classifier: Topic :: Security
 Provides-Extra: docs
 Requires-Dist: Sphinx (>=8.2.3) ; (python_version >= "3.11") and (extra == "docs")
-Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.
+Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.14.2)
 Requires-Dist: ete3 (>=3.1.3)
 Requires-Dist: filetype (>=1.2.0)
 Requires-Dist: json-stream (>=2.3.3,<3.0.0)
-Requires-Dist: legacy-cgi (>=2.6.
+Requires-Dist: legacy-cgi (>=2.6.4) ; python_version >= "3.13" and python_version < "4.0"
 Requires-Dist: multipart (>=1.3.0,<2.0.0)
-Requires-Dist: numpy (
-Requires-Dist: numpy (
-Requires-Dist:
-Requires-Dist: publicsuffixlist (>=1.0.2.20250824)
+Requires-Dist: numpy (>=2.2,<2.3) ; python_version < "3.11"
+Requires-Dist: numpy (>=2.3.4) ; python_version >= "3.11" and python_version < "4.0"
+Requires-Dist: publicsuffixlist (>=1.0.2.20251115)
 Requires-Dist: requests-toolbelt (>=1.0.0,<2.0.0)
 Requires-Dist: six (>=1.17.0) ; extra == "docs"
 Requires-Dist: tinycss2 (>=1.4.0)
{har2tree-1.32.0 → har2tree-1.34.2}/har2tree/har2tree.py

@@ -13,7 +13,9 @@ from functools import wraps, lru_cache
 from io import BytesIO
 from operator import itemgetter
 from pathlib import Path
-from typing import Any,
+from typing import Any, TypedDict
+from collections.abc import Iterator
+from collections.abc import Callable
 from urllib.parse import unquote_plus, urlparse

 from .helper import rebuild_url, Har2TreeError, Har2TreeLogAdapter
@@ -58,7 +60,8 @@ def trace_make_subtree_fallback(method: Callable[..., None]) -> Callable[..., None]:

 def trace_make_subtree(method: Callable[..., None]) -> Callable[..., None]:
     @wraps(method)
-    def _impl(self: Any, root: URLNode, nodes_to_attach: list[URLNode] | None=None,
+    def _impl(self: Any, root: URLNode, nodes_to_attach: list[URLNode] | None=None,
+              dev_debug: bool=False, fallback: bool=False) -> None:
         if dev_debug_mode:
             __load_debug_files()
         if dev_debug_url and root.name == dev_debug_url or nodes_to_attach is not None and any(True for u in nodes_to_attach if u.name == dev_debug_url):
@@ -67,7 +70,7 @@ def trace_make_subtree(method: Callable[..., None]) -> Callable[..., None]:
         elif dev_debug_hostname and root.hostname == dev_debug_hostname or nodes_to_attach is not None and any(True for u in nodes_to_attach if u.hostname == dev_debug_hostname):
             root.logger.warning(f'Debugging Hostname: {dev_debug_hostname}.')
             dev_debug = True
-        return method(self, root, nodes_to_attach, dev_debug)
+        return method(self, root, nodes_to_attach, dev_debug, fallback)
     return _impl

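The wrapper above keeps an explicit signature, which is why adding the `fallback` flag touches both `_impl` and the final `method(...)` call. As a minimal sketch of the same tracing-decorator pattern (not har2tree's code; the `Demo` class and printed output are invented), a `*args`/`**kwargs` wrapper forwards a new keyword without any signature change:

```python
from collections.abc import Callable
from functools import wraps
from typing import Any


def trace_calls(method: Callable[..., None]) -> Callable[..., None]:
    # Forward everything: a new keyword like `fallback` passes straight through.
    @wraps(method)
    def _impl(self: Any, *args: Any, **kwargs: Any) -> None:
        print(f'calling {method.__name__} args={args} kwargs={kwargs}')
        return method(self, *args, **kwargs)
    return _impl


class Demo:
    @trace_calls
    def _make_subtree(self, root: str, nodes_to_attach: list[str] | None = None,
                      dev_debug: bool = False, fallback: bool = False) -> None:
        pass


Demo()._make_subtree('https://example.com/', fallback=True)
# calling _make_subtree args=('https://example.com/',) kwargs={'fallback': True}
```

har2tree spells the parameters out instead, so the debug hooks can inspect `root` and `nodes_to_attach` by name before forwarding them.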
@@ -84,6 +87,15 @@ def __load_debug_files() -> None:
         dev_debug_hostname = f.read().strip()


+# NOTE: Copy from PlaywrightCapture to avoid extra dep
+class FramesResponse(TypedDict, total=False):
+
+    name: str
+    url: str
+    content: str | None
+    children: list[FramesResponse] | None
+
+
 class HarFile():

     def __init__(self, harfile: Path, capture_uuid: str):
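For orientation, here is a minimal sketch (not part of the package) of the nested shape `FramesResponse` describes; the sample frame tree and the `frame_urls` helper are invented:

```python
from __future__ import annotations

from typing import TypedDict


class FramesResponse(TypedDict, total=False):
    # Same shape as the TypedDict added above: every key is optional.
    name: str
    url: str
    content: str | None
    children: list[FramesResponse] | None


# Invented example of what a <root_name>.frames.json payload could look like:
# a main frame with one nested iframe.
example: FramesResponse = {
    "name": "main",
    "url": "https://example.com/",
    "content": "<html>main document</html>",
    "children": [
        {
            "name": "widget",
            "url": "https://widgets.example.net/embed",
            "content": "<html>widget</html>",
            "children": None,
        }
    ],
}


def frame_urls(frame: FramesResponse) -> list[str]:
    """Collect every 'url' in the frame tree, depth-first."""
    urls = [frame["url"]] if frame.get("url") else []
    for child in frame.get("children") or []:
        urls.extend(frame_urls(child))
    return urls


print(frame_urls(example))
# ['https://example.com/', 'https://widgets.example.net/embed']
```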
@@ -118,7 +130,11 @@ class HarFile():
                 last_redirect = unquote_plus(_lr.read())
             self.final_redirect: str = last_redirect
             if not self._search_final_redirect():
-
+                if last_redirect.startswith('chrome') or last_redirect.startswith('about'):
+                    # the capture failed.
+                    pass
+                else:
+                    self.logger.info(f'Final redirect URL from address bar not in tree: {last_redirect}')
         else:
             self.logger.debug('No last_redirect file available.')
             self.final_redirect = ''
@@ -131,6 +147,14 @@ class HarFile():
             self.logger.debug('No cookies file available.')
             self.cookies = []

+        framesfile = self.path.parent / f'{root_name}.frames.json'
+        if framesfile.is_file():
+            with framesfile.open() as c:
+                self.frames: FramesResponse = json.load(c)
+        else:
+            self.logger.debug('No frames file available.')
+            self.frames = {}
+
         dlfile = self.path.parent / f'{root_name}.data'
         dlfilename = self.path.parent / f'{root_name}.data.filename'
         self.downloaded_file: BytesIO | None
@@ -309,8 +333,9 @@ class Har2Tree:
         self.pages_root: dict[str, str] = {}

         self.all_redirects: list[str] = []
-
-        self.
+        # 2025-11-16: make values of referers and initiators sets because there will be duplicates
+        self.all_referer: dict[str, set[str]] = defaultdict(set)
+        self.all_initiator_url: dict[str, set[str]] = defaultdict(set)
         self._load_url_entries()

         # Generate cookies lookup tables
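A quick illustration of why the switch from list to set values matters: with `defaultdict(list)` a URL requested twice with the same referer shows up twice, while `defaultdict(set)` deduplicates. The sample URLs are made up:

```python
from collections import defaultdict

# Old shape: duplicates accumulate when the same URL is requested
# several times with the same referer.
all_referer_list: dict[str, list[str]] = defaultdict(list)
# New shape (as in 1.34.x): duplicates collapse automatically.
all_referer_set: dict[str, set[str]] = defaultdict(set)

requests = [
    ("https://example.com/", "https://example.com/app.js"),
    ("https://example.com/", "https://example.com/app.js"),  # duplicate entry
]

for referer, url in requests:
    all_referer_list[referer].append(url)
    all_referer_set[referer].add(url)

print(all_referer_list["https://example.com/"])  # two identical entries
print(all_referer_set["https://example.com/"])   # one entry
```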
@@ -401,6 +426,62 @@ class Har2Tree:

         self.url_tree = self._nodes_list.pop(0)

+    def _load_iframes(self, current: URLNode, frames: FramesResponse) -> None:
+        if not frames.get('content') or frames['content'] is None:
+            # NOTE: debug stuff, no content makes it pretty useless.
+            if frames.get('url'):
+                if frames['url'] == "about:blank":
+                    self.logger.info('Got a frame to about:blank with no content.')
+                else:
+                    u = unquote_plus(frames['url'])
+                    self.logger.warning(f'Got a url ({u}) for the frame, but no content')
+            else:
+                self.logger.info('Got a frame, but no content.')
+            return
+
+        if (frames.get('url')
+                and not (frames['url'] in ['about:blank']  # not loading anything, same as empty
+                         or frames['url'].startswith('data')  # base64 encoded content
+                         or frames['url'].startswith('chrome-error')  # not in the HAR/tree
+                         or frames['url'].startswith('blob')  # blobs aren't URLs
+                         )):
+            u = unquote_plus(frames['url'])
+            possible_child_name = {u, u.split('#', 1)[0]}
+            # this url should be in a node directly attached to that one
+            # we need to find that node
+            for child in current.traverse():
+                if child.name in possible_child_name:
+                    self.logger.debug(f'Found URL "{u}".')
+                    # Found the node, adding the content
+                    if not hasattr(child, 'rendered_frame'):
+                        child.rendered_frame = []
+                    child.rendered_frame.append(BytesIO(frames['content'].encode()))
+                    # and mark the node as iframe
+                    child.add_feature('iframe', True)
+                    # if there are children, use that node as parent and call the current method recursvely
+                    if f_children := frames.get('children'):
+                        for f_child in f_children:
+                            self._load_iframes(child, f_child)
+                    break
+            else:
+                # Couldn'd find the node Oo
+                to_print = ', '.join(possible_child_name)
+                children_to_print = ', '.join([child.name for child in current.traverse()])
+                self.logger.warning(f'Unable to find "{to_print}" in the children of "{current.name}" - {children_to_print}')
+        else:
+            self.logger.debug(f'"{current.name}" contains an iFrame.')
+            # No URL, this frame is directly in the parent frame.
+            if not hasattr(current, 'rendered_frame'):
+                current.rendered_frame = []
+            current.rendered_frame.append(BytesIO(frames['content'].encode()))
+            self.logger.debug(f'"{current.name}" has {len(current.rendered_frame)} iFrames.')
+            # and mark the node as iframe
+            current.add_feature('iframe', True)
+            # if there are children, use that node as parent and call the current method recursvely
+            if f_children := frames.get('children'):
+                for f_child in f_children:
+                    self._load_iframes(current, f_child)
+
     @property
     def initial_referer(self) -> str | None:
         '''The referer passed to the first URL in the tree'''
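The recursion in `_load_iframes` boils down to: match the frame URL (with and without its fragment) against node names, attach the rendered content, then recurse into `children`. A self-contained sketch of that matching logic, using plain dicts instead of ete3 `URLNode` objects (node names and frame data invented):

```python
from io import BytesIO

# Flat stand-in for the URL tree: node name -> list of rendered frame bodies.
rendered_frames: dict[str, list[BytesIO]] = {
    "https://example.com/": [],
    "https://example.com/widget": [],
}


def attach_frame(frame: dict, nodes: dict[str, list[BytesIO]]) -> None:
    url = frame.get("url")
    content = frame.get("content")
    if content and url:
        # Same trick as _load_iframes: match with and without the fragment.
        candidates = {url, url.split("#", 1)[0]}
        for name in nodes:
            if name in candidates:
                nodes[name].append(BytesIO(content.encode()))
                break
    for child in frame.get("children") or []:
        attach_frame(child, nodes)


attach_frame(
    {
        "url": "https://example.com/",
        "content": "<html>main</html>",
        "children": [{"url": "https://example.com/widget#frag", "content": "<html>w</html>"}],
    },
    rendered_frames,
)
print({name: len(bodies) for name, bodies in rendered_frames.items()})
# {'https://example.com/': 1, 'https://example.com/widget': 1}
```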
@@ -523,7 +604,7 @@ class Har2Tree:

             if hasattr(n, 'initiator_url'):
                 # The HAR file was created by chrome/chromium and we got the _initiator key
-                self.all_initiator_url[n.initiator_url].
+                self.all_initiator_url[n.initiator_url].add(n.name)

             if url_entry['startedDateTime'] in self.har.pages_start_times:
                 for page in self.har.pages_start_times[url_entry['startedDateTime']]:
@@ -536,7 +617,7 @@ class Har2Tree:
             if hasattr(n, 'referer') and i > 0:
                 # NOTE 2021-05-14: referer to self are a real thing: url -> POST to self
                 if n.name != n.referer or ('method' in n.request and n.request['method'] == 'POST'):
-                    self.all_referer[n.referer].
+                    self.all_referer[n.referer].add(n.name)

             self._nodes_list.append(n)
             self.all_url_requests[n.name].append(n)
@@ -617,6 +698,14 @@ class Har2Tree:
         for child_node_hostname, child_nodes_url in sub_roots.items():
             self.make_hostname_tree(child_nodes_url, child_node_hostname)

+    def _all_urlnodes_in_host_tree(self) -> None:
+        # debug: check if all the nodes in the URL tree are in the hostnode tree (they must have an UUID)
+        self.logger.warning('Validating host tree....')
+        for urlnode in self.url_tree.traverse():
+            if not hasattr(urlnode, 'hostnode_uuid'):
+                self.logger.error(f'URL Node not un host tree: {urlnode}')
+        self.logger.warning('host tree validated.')
+
     def make_tree(self) -> URLNode:
         """Build URL and Host trees"""
         self._make_subtree(self.url_tree)
@@ -647,6 +736,15 @@ class Har2Tree:
         # Initialize the hostname tree root
         self.hostname_tree.add_url(self.url_tree)
         self.make_hostname_tree(self.url_tree, self.hostname_tree)
+        if dev_debug_mode:
+            self._all_urlnodes_in_host_tree()
+        if isinstance(self.har.frames, dict):
+            if self.har.frames.get('children') and self.har.frames['children'] is not None:
+                # we have frames in the main one
+                for f_child in self.har.frames['children']:
+                    self._load_iframes(self.rendered_node, f_child)
+        else:
+            self.logger.warning(f'Wrong format for the frames ({type(self.har.frames)}), very old capture.')
         return self.url_tree

     @trace_make_subtree_fallback
@@ -664,7 +762,7 @@ class Har2Tree:
                     # we got an non-empty response, breaking
                     break
             # attach to the the first response with something, or to whatever we get.
-            self._make_subtree(node_with_hostname, [node])
+            self._make_subtree(node_with_hostname, [node], fallback=True)
             return

         # Sometimes, the har has a list of pages, generally when we have HTTP redirects.
@@ -682,7 +780,7 @@ class Har2Tree:
             page_root_node = self.get_url_node_by_uuid(self.pages_root[node.pageref])
             if dev_debug:
                 self.logger.warning(f'Failed to attach URLNode in the normal process, attaching node to page {node.pageref} - Node: {page_root_node.uuid} - {page_root_node.name}.')
-            self._make_subtree(page_root_node, [node])
+            self._make_subtree(page_root_node, [node], fallback=True)
         elif self.rendered_node != self.url_tree:
             # Generally, when we have a bunch of redirects, they (generally) do not branch out
             # before the final landing page *but* it is not always the case: some intermediary
@@ -694,7 +792,7 @@ class Har2Tree:
             # end of this method anyway
             if dev_debug:
                 self.logger.warning(f'Failed to attach URLNode in the normal process, attaching node to final redirect: {self.har.final_redirect}.')
-            self._make_subtree(self.rendered_node, [node])
+            self._make_subtree(self.rendered_node, [node], fallback=True)
         elif 'pages' in self.har.har['log']:
             # No luck, the node is root for this pageref, let's attach it to the prior page in the list, or the very first node (tree root)
             page_before = self.har.har['log']['pages'][0]
@@ -716,13 +814,42 @@ class Har2Tree:
                     # node to the root node
                     page_root_node = self.url_tree
                     self.logger.warning('The pages in the HAR are in in the wrong order, this should not happen but here we are')
-            self._make_subtree(page_root_node, [node])
+            self._make_subtree(page_root_node, [node], fallback=True)
         else:
             # no way to attach it to anything else, attach to the root node
-            self._make_subtree(self.url_tree, [node])
+            self._make_subtree(self.url_tree, [node], fallback=True)
+
+    def all_real_urls_in_children(self, frame: FramesResponse) -> Iterator[str]:
+        # from a frame, search all the real urls in each of the children, stop at the first one
+        if (frame.get('url') and frame['url'] is not None
+                and not (frame['url'] in ['about:blank', 'about:srcdoc']  # not loading anything, same as empty
+                         or frame['url'].startswith('data')  # base64 encoded content
+                         or frame['url'].startswith('chrome-error')  # not in the HAR/tree
+                         or frame['url'].startswith('blob'))):  # blobs aren't URLs
+            yield frame['url']
+        else:
+            # got no real URL, try the children
+            if frame.get('children') and frame['children'] is not None:
+                for c in frame['children']:
+                    yield from self.all_real_urls_in_children(c)
+
+    def search_in_frames(self, urls: set[str], frame: FramesResponse) -> Iterator[str]:
+        # If the frame doesn't have children, there are no potential URLs to attach
+        if not frame.get('children') or frame['children'] is None:
+            return None
+
+        if frame.get('url'):
+            u = unquote_plus(frame['url'])
+            if urls & {u, u.split('#', 1)[0]}:
+                # got a matching URL, get list of potential iframes urls
+                for c in frame['children']:
+                    yield from self.all_real_urls_in_children(c)
+        for c in frame['children']:
+            yield from self.search_in_frames(urls, c)

     @trace_make_subtree
-    def _make_subtree(self, root: URLNode, nodes_to_attach: list[URLNode] | None=None,
+    def _make_subtree(self, root: URLNode, nodes_to_attach: list[URLNode] | None=None,
+                      dev_debug: bool=False, fallback: bool=False) -> None:
         """Recursive method building each level of the tree"""
         matching_urls: list[URLNode]
         if nodes_to_attach is None:
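The two generators added above pair up: `search_in_frames` locates the frame whose URL matches the node being attached, and `all_real_urls_in_children` then yields the first "real" URL down each child branch, skipping `about:`, `data:`, `blob:` and `chrome-error:` frames. A plain-dict sketch of the same traversal (frame layout and URLs invented):

```python
from collections.abc import Iterator

SKIP_PREFIXES = ("data", "chrome-error", "blob")
SKIP_URLS = {"about:blank", "about:srcdoc"}


def real_urls(frame: dict) -> Iterator[str]:
    # Yield the frame's URL if it is a "real" one, otherwise descend.
    url = frame.get("url")
    if url and url not in SKIP_URLS and not url.startswith(SKIP_PREFIXES):
        yield url
    else:
        for child in frame.get("children") or []:
            yield from real_urls(child)


def search_in_frames(urls: set[str], frame: dict) -> Iterator[str]:
    children = frame.get("children") or []
    if not children:
        return
    u = frame.get("url", "")
    if urls & {u, u.split("#", 1)[0]}:
        # The current node hosts these children: surface their real URLs.
        for child in children:
            yield from real_urls(child)
    for child in children:
        yield from search_in_frames(urls, child)


tree = {
    "url": "https://example.com/",
    "children": [
        {"url": "about:blank", "children": [{"url": "https://cdn.example.net/frame.html"}]},
    ],
}
print(list(search_in_frames({"https://example.com/"}, tree)))
# ['https://cdn.example.net/frame.html']
```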
@@ -784,6 +911,26 @@ class Har2Tree:
             if unode.empty_response:
                 continue

+            # 2025-11-14
+            # the referer of an iframe is the hostname of the parent, even if the parent
+            # is a URL with a full path. Before using the referer, we need to check if we have
+            # the current url in the frame tree. If we do, find nodes (in the remaining list)
+            # with the URLs of the children - any fragment will be missing - and attach that node
+            possible_iframe_urls = {unode.name, unode.name.split('#', 1)[0]}
+            for possible_url in self.search_in_frames(possible_iframe_urls, self.har.frames):
+                cu = unquote_plus(possible_url)
+                for u in {cu, cu.split('#', 1)[0]}:
+                    if u not in self.all_url_requests:
+                        if '#' not in u:
+                            self.logger.info(f'"{u}" in the frames URLs, but not in the HAR.')
+                        continue
+                    matching_urls = [url_node for url_node in self.all_url_requests[u]
+                                     if url_node in self._nodes_list]
+                    self._nodes_list = [node for node in self._nodes_list if node not in matching_urls]
+                    if dev_debug:
+                        self.logger.warning(f'Found via initiator from {unode.name} to {matching_urls}.')
+                    self._make_subtree(unode, matching_urls)
+
             # The node can have a redirect, but also trigger ressources refering to themselves, we need to trigger this code on each node.
             if self.all_initiator_url.get(unode.name):
                 # The URL (unode.name) is in the list of known urls initiating calls
@@ -815,6 +962,12 @@ class Har2Tree:
             if hasattr(unode, 'external_ressources'):
                 # the url loads external things, and some of them have no referer....
                 for external_tag, links in unode.external_ressources.items():
+                    # 2025-11-06: skip full regex until we're calling this method in the fallback
+                    # the iframes will often (not always) have a referer set and the URL
+                    # might be found by the regex and it will not be attached at the
+                    # right place
+                    if external_tag == 'full_regex' and not fallback:
+                        continue
                     for link in links:
                         if link not in self.all_url_requests or link == self.har.final_redirect:
                             # We have a lot of false positives
{har2tree-1.32.0 → har2tree-1.34.2}/har2tree/helper.py

@@ -364,7 +364,8 @@ def find_external_ressources(mimetype: str, data: bytes, base_url: str, all_requ
     # link: https://www.w3schools.com/TAGs/tag_link.asp -> href
     # object: https://www.w3schools.com/TAGs/tag_object.asp -> data
     external_ressources: dict[str, list[str]] = {'img': [], 'script': [], 'video': [], 'audio': [],
-                                                 'iframe': [],
+                                                 'iframe': [],
+                                                 'embed': [], 'source': [],
                                                  'link': [],
                                                  'object': [],
                                                  'css': [],
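The two new keys cover `<embed>` and `<source>` tags, which both reference their target through a `src` attribute. A small illustration of the tags involved (not the package's extraction code; the HTML is made up, and it assumes beautifulsoup4 with lxml, both already dependencies):

```python
from bs4 import BeautifulSoup

html = """
<embed src="https://example.com/plugin.swf">
<video><source src="https://example.com/clip.mp4" type="video/mp4"></video>
"""
soup = BeautifulSoup(html, 'lxml')
external_ressources: dict[str, list[str]] = {'embed': [], 'source': []}
for tag_name in external_ressources:
    # Collect the src attribute of every matching tag.
    for tag in soup.find_all(tag_name, src=True):
        external_ressources[tag_name].append(tag['src'])
print(external_ressources)
# {'embed': ['https://example.com/plugin.swf'], 'source': ['https://example.com/clip.mp4']}
```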
{har2tree-1.32.0 → har2tree-1.34.2}/har2tree/nodes.py

@@ -485,6 +485,7 @@ class URLNode(HarTreeNode):

         # Common JS redirect we can catch easily
         # NOTE: it is extremely fragile and doesn't work very often but is kinda better than nothing.
+        # NOTE 2025-08-30: Also, finding that doesn't mean it is in a part of the code that is executed without user interaction. It can be triggered after a user fills a form for example.
         # Source: https://stackoverflow.com/questions/13363174/regular-expression-to-catch-as-many-javascript-redirections-as-possible
         regex = re.compile(br"""((location.href)|(window.location)|(location.replace)|(location.assign))(( ?= ?)|( ?\( ?))("|')([^'"]*)("|')( ?\) ?)?;""", re.I)
         matches = re.findall(regex, self.body.getvalue())
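The regex shown in the context above can be exercised directly; in each `findall()` tuple the quoted URL is the tenth group (index 9). The response body below is invented:

```python
import re

# Same pattern as in URLNode: catches common JS redirect assignments.
regex = re.compile(br"""((location.href)|(window.location)|(location.replace)|(location.assign))(( ?= ?)|( ?\( ?))("|')([^'"]*)("|')( ?\) ?)?;""", re.I)

body = b'<script>window.location = "https://example.com/landing"; doStuff();</script>'
matches = re.findall(regex, body)
print([m[9].decode() for m in matches])  # ['https://example.com/landing']
```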
@@ -586,7 +587,7 @@ class URLNode(HarTreeNode):
             return href

         if not hasattr(self, 'rendered_html') or not self.rendered_html:
-            raise Har2TreeError('Not the node of a page rendered ({self.uuid}), invalid request.')
+            raise Har2TreeError(f'Not the node of a page rendered ({self.uuid}), invalid request.')
         urls: set[str] = set()

         # The simple ones: the links.
{har2tree-1.32.0 → har2tree-1.34.2}/pyproject.toml

@@ -1,32 +1,31 @@
 [project]
 name = "har2tree"
-version = "1.32.0"
+version = "1.34.2"
 description = "HTTP Archive (HAR) to ETE Toolkit generator"
 authors = [
     {name="Raphaël Vinot", email="raphael.vinot@circl.lu"}
 ]
 license = "BSD-3-Clause"
 readme = "README.md"
-requires-python = ">=3.
+requires-python = ">=3.10,<3.15"

 dynamic = [ "classifiers" ]

 dependencies = [
     "ete3 (>=3.1.3)",
-    "beautifulsoup4[charset-normalizer,lxml] (>=4.
-    "publicsuffixlist (>=1.0.2.
+    "beautifulsoup4[charset-normalizer,lxml] (>=4.14.2)",
+    "publicsuffixlist (>=1.0.2.20251115)",
     "filetype (>=1.2.0)",
-    # poetry up fails with the version of numpy forced for python < 3.
+    # poetry up fails with the version of numpy forced for python < 3.11.
     # The work around is to comment it, run poetry up, uncomment it. and run poetry update.
-    "numpy (
-    "numpy (
-    "numpy (>=2.3.2) ; python_version >= \"3.11\"",
+    "numpy (>=2.2,<2.3) ; python_version < '3.11'",
+    "numpy (>=2.3.4) ; python_version >= \"3.11\" and python_version < \"4.0\"",
     "w3lib (>=2.3.1)",
     "tinycss2 (>=1.4.0)",
-    "legacy-cgi (>=2.6.
+    "legacy-cgi (>=2.6.4) ; python_version >= \"3.13\" and python_version < \"4.0\"",
     "multipart (>=1.3.0,<2.0.0)",
     "json-stream (>=2.3.3,<3.0.0)",
-    "requests-toolbelt (>=1.0.0,<2.0.0)"
+    "requests-toolbelt (>=1.0.0,<2.0.0)"
 ]

 [project.urls]
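The numpy and legacy-cgi pins rely on PEP 508 environment markers; to check how a marker resolves on a given interpreter, the third-party `packaging` library evaluates them (marker strings copied from the pins above):

```python
from packaging.markers import Marker

# Evaluate against the running interpreter's environment.
print(Marker('python_version < "3.11"').evaluate())
print(Marker('python_version >= "3.11" and python_version < "4.0"').evaluate())
# On a Python 3.12 interpreter this prints: False, then True.
```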
@@ -48,9 +47,9 @@ classifiers = [
 docs = ["Sphinx (>=8.2.3) ; python_version >= \"3.11\"", "six (>=1.17.0)"]

 [tool.poetry.group.dev.dependencies]
-mypy = "^1.
-pytest-cov = "^
-coverage = "^7.
+mypy = "^1.18.2"
+pytest-cov = "^7.0.0"
+coverage = "^7.11.3"
 types-beautifulsoup4 = "^4.12.0.20250516"

 [build-system]
Files without changes: LICENSE, README.md, har2tree/__init__.py, har2tree/parser.py, har2tree/py.typed