har2tree 1.31.6__py3-none-any.whl → 1.32.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- har2tree/har2tree.py +21 -19
- har2tree/nodes.py +28 -7
- {har2tree-1.31.6.dist-info → har2tree-1.32.1.dist-info}/METADATA +4 -3
- har2tree-1.32.1.dist-info/RECORD +10 -0
- har2tree-1.31.6.dist-info/RECORD +0 -10
- {har2tree-1.31.6.dist-info → har2tree-1.32.1.dist-info}/LICENSE +0 -0
- {har2tree-1.31.6.dist-info → har2tree-1.32.1.dist-info}/WHEEL +0 -0
har2tree/har2tree.py CHANGED

@@ -115,8 +115,10 @@ class HarFile():
         last_redirect_file = self.path.parent / f'{root_name}.last_redirect.txt'
         if last_redirect_file.is_file():
             with last_redirect_file.open('r') as _lr:
-
-                self.
+                last_redirect = unquote_plus(_lr.read())
+                self.final_redirect: str = last_redirect
+                if not self._search_final_redirect():
+                    self.logger.warning(f'Final redirect URL from address bar not in tree: {last_redirect}')
         else:
             self.logger.debug('No last_redirect file available.')
             self.final_redirect = ''
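Note: the new branch reads the `<root_name>.last_redirect.txt` file and runs it through `unquote_plus` before comparing it against the request URLs in the HAR. A minimal sketch of that decoding step, using an illustrative URL that is not taken from the package:

from urllib.parse import unquote_plus

# Illustrative value only: a percent/plus-encoded URL as it could appear
# in a <root_name>.last_redirect.txt file written by the capture tool.
raw = "https://example.com/landing?q=hello+world&next=%2Fpath"

# unquote_plus turns '+' into spaces and decodes %-escapes, producing the
# form the new code compares against unquote_plus(e['request']['url']).
print(unquote_plus(raw))  # https://example.com/landing?q=hello world&next=/path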
@@ -169,29 +171,30 @@ class HarFile():
         # Set to false if initial_redirects fails to find the chain.
         self.need_tree_redirects = False

-    def _search_final_redirect(self) ->
+    def _search_final_redirect(self) -> bool:
         """Try to find the final path to the final redirect without building the tree"""
         for e in self.entries:
             unquoted_url = unquote_plus(e['request']['url'])
             if unquoted_url == self.final_redirect:
-
+                return True
             elif unquoted_url.startswith(f'{self.final_redirect}?'):
                 # WARNING: the URL in that file may not be present in the HAR: the query part is stripped by splash
                 self.final_redirect = unquoted_url
-
+                return True
             else:
                 # Update 2020-04-01: .. but the fragment is not striped so self.final_redirect may not be found
                 # Unless we find the entry in the har, we need to search again without the fragment
                 if '#' in self.final_redirect:
                     self.final_redirect = self.final_redirect.split('#', 1)[0]
-                    self._search_final_redirect()
+                    return self._search_final_redirect()
                 elif '?' in self.final_redirect:
                     # At this point, we're trying things. The final URL returned by splash may have been changed
                     # in JavaScript and never appear in the HAR. Let's try to find the closest one with the same path
                     self.final_redirect = self.final_redirect.split('?', 1)[0]
-                    self._search_final_redirect()
+                    return self._search_final_redirect()
                 else:
                     self.logger.info(f'Unable to find the final redirect: {self.final_redirect}')
+                    return False

     @property
     def number_entries(self) -> int:
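Note: `_search_final_redirect` now returns a bool, so the caller in `HarFile` (first hunk above) can warn when the address-bar URL never appears in the HAR. A rough, self-contained sketch of the same idea as a standalone function (illustrative names, simplified control flow, not the package's API):

from urllib.parse import unquote_plus


def find_final_redirect(entry_urls: list[str], final_redirect: str) -> tuple[bool, str]:
    """Return (found, possibly-adjusted URL) for an address-bar URL against HAR request URLs."""
    for url in (unquote_plus(u) for u in entry_urls):
        # Exact match, or same URL with a query string appended.
        if url == final_redirect or url.startswith(f'{final_redirect}?'):
            return True, url
    # Not found: retry with the fragment stripped, then with the query string stripped.
    if '#' in final_redirect:
        return find_final_redirect(entry_urls, final_redirect.split('#', 1)[0])
    if '?' in final_redirect:
        return find_final_redirect(entry_urls, final_redirect.split('?', 1)[0])
    return False, final_redirect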
@@ -566,12 +569,6 @@ class Har2Tree:
         if node:
             return node[0]

-        browser_errors = ['chrome-error', 'about:blank']
-        if self.har.final_redirect and not any(self.har.final_redirect.startswith(r) for r in browser_errors):
-            self.logger.warning(f'Final redirect URL from adress bar not in tree: {self.har.final_redirect}')
-        else:
-            # No final redirect, already logged earlier.
-            pass
         # Just try to get the best guess: first node after JS/HTTP redirects
         curnode = self.url_tree
         while hasattr(curnode, 'redirect') and curnode.redirect:
@@ -679,20 +676,25 @@ class Har2Tree:
                 and node.pageref != self.har.har['log']['pages'][0]
                 and self.pages_root[node.pageref] != node.uuid):
             # In that case, we check if there is already a page with the pageref of the orphan node,
-            # and attach the node to that.
+            # and attach the node to that.
+            # NOTE: we can only do that if there is already a node with this pageref in the tree.
             # This node is not a page root, we can attach it \o/
             page_root_node = self.get_url_node_by_uuid(self.pages_root[node.pageref])
             if dev_debug:
                 self.logger.warning(f'Failed to attach URLNode in the normal process, attaching node to page {node.pageref} - Node: {page_root_node.uuid} - {page_root_node.name}.')
             self._make_subtree(page_root_node, [node])
-        elif self.
-            # Generally, when we have a bunch of redirects, they do not branch out
-            # *but* it is not always the case: some intermediary
+        elif self.rendered_node != self.url_tree:
+            # Generally, when we have a bunch of redirects, they (generally) do not branch out
+            # before the final landing page *but* it is not always the case: some intermediary
+            # redirects will have calls to 3rd party pages.
             # Hopefully, this last case was taken care of in the branch above.
-            # In this branch, we get the landing page after the redirects
+            # In this branch, we get the landing page after the redirects, and attach the node to it.
+
+            # We skip this call if there are no redirects as it is the very last fallback at the
+            # end of this method anyway
             if dev_debug:
                 self.logger.warning(f'Failed to attach URLNode in the normal process, attaching node to final redirect: {self.har.final_redirect}.')
-            self._make_subtree(self.
+            self._make_subtree(self.rendered_node, [node])
         elif 'pages' in self.har.har['log']:
             # No luck, the node is root for this pageref, let's attach it to the prior page in the list, or the very first node (tree root)
             page_before = self.har.har['log']['pages'][0]
har2tree/nodes.py CHANGED

@@ -27,6 +27,7 @@ import json_stream  # type: ignore
 from bs4 import BeautifulSoup
 from ete3 import TreeNode  # type: ignore
 from publicsuffixlist import PublicSuffixList  # type: ignore
+from requests_toolbelt.multipart import decoder  # type: ignore
 from w3lib.html import strip_html5_whitespace
 from w3lib.url import canonicalize_url, safe_url_string

@@ -272,6 +273,8 @@ class URLNode(HarTreeNode):
                 or mimetype_lower.startswith('application/csp-report')
                 or mimetype_lower.startswith('application/x-amz-json-1.1')
                 or mimetype_lower.startswith('application/reports+json')
+                or mimetype_lower.startswith('application/vnd.adobe.dc+json')
+                or mimetype_lower.startswith('application/ion+json')
                 or mimetype_lower.endswith('json')
             ):
                 if isinstance(decoded_posted_data, (str, bytes)):
@@ -304,10 +307,29 @@ class URLNode(HarTreeNode):
                 else:
                     self.logger.warning(f"Expected json stream, got garbage: {mimetype_lower} - {decoded_posted_data}")
                     self.add_feature('posted_data_info', "Unable to decode POST request.")
-            elif mimetype_lower.startswith('multipart
-
-
-
+            elif mimetype_lower.startswith('multipart'):
+                self.add_feature('posted_data_info', f"Decoding {mimetype_lower} is partially supported.")
+                if isinstance(decoded_posted_data, str):
+                    # must be encoded for decoding
+                    multipart_to_decode = decoded_posted_data.encode()
+                elif isinstance(decoded_posted_data, bytes):
+                    multipart_to_decode = decoded_posted_data
+                else:
+                    raise ValueError(f'Invalid type for multipart POST: {type(decoded_posted_data)}')
+                if b"\r\n" not in multipart_to_decode:
+                    # the decoder wants that
+                    multipart_to_decode = multipart_to_decode.replace(b"\n", b"\r\n")
+                try:
+                    multipart_data = decoder.MultipartDecoder(multipart_to_decode, mimetype_lower)
+                    decoded_posted_data = []
+                    for part in multipart_data.parts:
+                        headers = {k.decode(): v.decode() for k, v in part.headers.items()}
+                        content = part.text
+                        decoded_posted_data.append({'headers': headers, 'content': content})
+                except Exception as e:
+                    self.logger.warning(f'Unable to decode multipart POST: {e}')
+                    self.add_feature('posted_data_info', "Unable to decode multipart in POST request.")
+
             elif mimetype_lower.startswith('application/x-protobuf'):
                 # FIXME If possible, decode?
                 self.logger.debug(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
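Note: the new multipart branch is what pulls in `requests-toolbelt`; its `MultipartDecoder` expects the raw body as bytes with CRLF line endings plus the original `Content-Type` value (including the boundary). A hedged, standalone sketch of that decoding path, with a made-up body and boundary:

from requests_toolbelt.multipart import decoder

# Made-up multipart body and content type, for illustration only.
content_type = 'multipart/form-data; boundary=XBOUNDARY'
body = (
    b'--XBOUNDARY\r\n'
    b'Content-Disposition: form-data; name="field1"\r\n'
    b'\r\n'
    b'value1\r\n'
    b'--XBOUNDARY--\r\n'
)

multipart_data = decoder.MultipartDecoder(body, content_type)
for part in multipart_data.parts:
    # part.headers uses bytes keys/values; part.text decodes the part body to str.
    headers = {k.decode(): v.decode() for k, v in part.headers.items()}
    print(headers, part.text)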
@@ -325,8 +347,7 @@ class URLNode(HarTreeNode):
                 # keep it as it is
                 self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
                 self.add_feature('posted_data_info', f"Pretty rendering of {mimetype_lower} is not supported yet.")
-            elif mimetype_lower
-                # Just skip it, no need to go in the warnings
+            elif mimetype_lower in ['?', '*/*']:
                 self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
                 self.add_feature('posted_data_info', f"Weird MimeType ({mimetype_lower}) is not supported yet.")
             elif mimetype_lower == 'application/binary':
@@ -565,7 +586,7 @@ class URLNode(HarTreeNode):
             return href

         if not hasattr(self, 'rendered_html') or not self.rendered_html:
-            raise Har2TreeError('Not the node of a page rendered, invalid request.')
+            raise Har2TreeError(f'Not the node of a page rendered ({self.uuid}), invalid request.')
         urls: set[str] = set()

         # The simple ones: the links.
{har2tree-1.31.6.dist-info → har2tree-1.32.1.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: har2tree
-Version: 1.31.6
+Version: 1.32.1
 Summary: HTTP Archive (HAR) to ETE Toolkit generator
 License: BSD-3-Clause
 Author: Raphaël Vinot

@@ -21,7 +21,7 @@ Classifier: Topic :: Internet
 Classifier: Topic :: Security
 Provides-Extra: docs
 Requires-Dist: Sphinx (>=8.2.3) ; (python_version >= "3.11") and (extra == "docs")
-Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.13.
+Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.13.5)
 Requires-Dist: ete3 (>=3.1.3)
 Requires-Dist: filetype (>=1.2.0)
 Requires-Dist: json-stream (>=2.3.3,<3.0.0)

@@ -30,7 +30,8 @@ Requires-Dist: multipart (>=1.3.0,<2.0.0)
 Requires-Dist: numpy (<2.1) ; python_version < "3.10"
 Requires-Dist: numpy (<2.3) ; python_version < "3.11"
 Requires-Dist: numpy (>=2.3.2) ; python_version >= "3.11"
-Requires-Dist: publicsuffixlist (>=1.0.2.
+Requires-Dist: publicsuffixlist (>=1.0.2.20250824)
+Requires-Dist: requests-toolbelt (>=1.0.0,<2.0.0)
 Requires-Dist: six (>=1.17.0) ; extra == "docs"
 Requires-Dist: tinycss2 (>=1.4.0)
 Requires-Dist: w3lib (>=2.3.1)
har2tree-1.32.1.dist-info/RECORD ADDED

@@ -0,0 +1,10 @@
+har2tree/__init__.py,sha256=Na3mxHkUBq3rzYbxiLNJF37DxH5mcghSorjzXw5Teug,422
+har2tree/har2tree.py,sha256=PBRJZk-cqIOctbrIav4v5z2wKUFApayl4SQmLTKdF6E,44438
+har2tree/helper.py,sha256=CgeXqfBeHs8SbkW7TRNKqJBTZLAu63KggQjbGHCZAGI,20681
+har2tree/nodes.py,sha256=OtsQnXs8cmBGDJ6MUDWmqKZVOwxKhFJCR_sY0-nYP20,37517
+har2tree/parser.py,sha256=4yej1OcVYAIiLfzYZsO9WCw3WyM_ykDTuvpW7UO1ROE,3645
+har2tree/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+har2tree-1.32.1.dist-info/LICENSE,sha256=Xa4EVROgJsEo10CW-ISCRiw0TtqdKz1JuM3BBLBM55c,1803
+har2tree-1.32.1.dist-info/METADATA,sha256=Ja_ikI4U9yVZ5paqPJ4AKm4G2ZmVDsUfOaYABZump88,2253
+har2tree-1.32.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+har2tree-1.32.1.dist-info/RECORD,,
har2tree-1.31.6.dist-info/RECORD DELETED

@@ -1,10 +0,0 @@
-har2tree/__init__.py,sha256=Na3mxHkUBq3rzYbxiLNJF37DxH5mcghSorjzXw5Teug,422
-har2tree/har2tree.py,sha256=47x9X5tY69f9SXkYJgJsnAaX2kxgXHgzFThGz6M86Zw,44495
-har2tree/helper.py,sha256=CgeXqfBeHs8SbkW7TRNKqJBTZLAu63KggQjbGHCZAGI,20681
-har2tree/nodes.py,sha256=Z-NKlcrDcBbEDwpPMFcQzqbCr3bOnb8xzPAxFI5GNSs,36111
-har2tree/parser.py,sha256=4yej1OcVYAIiLfzYZsO9WCw3WyM_ykDTuvpW7UO1ROE,3645
-har2tree/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-har2tree-1.31.6.dist-info/LICENSE,sha256=Xa4EVROgJsEo10CW-ISCRiw0TtqdKz1JuM3BBLBM55c,1803
-har2tree-1.31.6.dist-info/METADATA,sha256=r4-PI8eNiVboZ7B4NSarQhqxBhOsy7C1gi4e6q2G99Y,2203
-har2tree-1.31.6.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
-har2tree-1.31.6.dist-info/RECORD,,
File without changes
|
|
File without changes
|