har2tree 1.31.5__py3-none-any.whl → 1.32.0__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
har2tree/har2tree.py CHANGED
@@ -115,8 +115,10 @@ class HarFile():
115
115
  last_redirect_file = self.path.parent / f'{root_name}.last_redirect.txt'
116
116
  if last_redirect_file.is_file():
117
117
  with last_redirect_file.open('r') as _lr:
118
- self.final_redirect: str = unquote_plus(_lr.read())
119
- self._search_final_redirect()
118
+ last_redirect = unquote_plus(_lr.read())
119
+ self.final_redirect: str = last_redirect
120
+ if not self._search_final_redirect():
121
+ self.logger.warning(f'Final redirect URL from address bar not in tree: {last_redirect}')
120
122
  else:
121
123
  self.logger.debug('No last_redirect file available.')
122
124
  self.final_redirect = ''
@@ -169,29 +171,30 @@ class HarFile():
169
171
  # Set to false if initial_redirects fails to find the chain.
170
172
  self.need_tree_redirects = False
171
173
 
172
- def _search_final_redirect(self) -> None:
174
+ def _search_final_redirect(self) -> bool:
173
175
  """Try to find the final path to the final redirect without building the tree"""
174
176
  for e in self.entries:
175
177
  unquoted_url = unquote_plus(e['request']['url'])
176
178
  if unquoted_url == self.final_redirect:
177
- break
179
+ return True
178
180
  elif unquoted_url.startswith(f'{self.final_redirect}?'):
179
181
  # WARNING: the URL in that file may not be present in the HAR: the query part is stripped by splash
180
182
  self.final_redirect = unquoted_url
181
- break
183
+ return True
182
184
  else:
183
185
  # Update 2020-04-01: .. but the fragment is not striped so self.final_redirect may not be found
184
186
  # Unless we find the entry in the har, we need to search again without the fragment
185
187
  if '#' in self.final_redirect:
186
188
  self.final_redirect = self.final_redirect.split('#', 1)[0]
187
- self._search_final_redirect()
189
+ return self._search_final_redirect()
188
190
  elif '?' in self.final_redirect:
189
191
  # At this point, we're trying things. The final URL returned by splash may have been changed
190
192
  # in JavaScript and never appear in the HAR. Let's try to find the closest one with the same path
191
193
  self.final_redirect = self.final_redirect.split('?', 1)[0]
192
- self._search_final_redirect()
194
+ return self._search_final_redirect()
193
195
  else:
194
196
  self.logger.info(f'Unable to find the final redirect: {self.final_redirect}')
197
+ return False
195
198
 
196
199
  @property
197
200
  def number_entries(self) -> int:
@@ -566,12 +569,6 @@ class Har2Tree:
566
569
  if node:
567
570
  return node[0]
568
571
 
569
- browser_errors = ['chrome-error', 'about:blank']
570
- if self.har.final_redirect and not any(self.har.final_redirect.startswith(r) for r in browser_errors):
571
- self.logger.warning(f'Final redirect URL from adress bar not in tree: {self.har.final_redirect}')
572
- else:
573
- # No final redirect, already logged earlier.
574
- pass
575
572
  # Just try to get the best guess: first node after JS/HTTP redirects
576
573
  curnode = self.url_tree
577
574
  while hasattr(curnode, 'redirect') and curnode.redirect:
@@ -679,20 +676,25 @@ class Har2Tree:
679
676
  and node.pageref != self.har.har['log']['pages'][0]
680
677
  and self.pages_root[node.pageref] != node.uuid):
681
678
  # In that case, we check if there is already a page with the pageref of the orphan node,
682
- # and attach the node to that. NOTE: we can only do that if there is already a node with this pageref in the tree.
679
+ # and attach the node to that.
680
+ # NOTE: we can only do that if there is already a node with this pageref in the tree.
683
681
  # This node is not a page root, we can attach it \o/
684
682
  page_root_node = self.get_url_node_by_uuid(self.pages_root[node.pageref])
685
683
  if dev_debug:
686
684
  self.logger.warning(f'Failed to attach URLNode in the normal process, attaching node to page {node.pageref} - Node: {page_root_node.uuid} - {page_root_node.name}.')
687
685
  self._make_subtree(page_root_node, [node])
688
- elif self.url_tree.search_nodes(name=self.har.final_redirect):
689
- # Generally, when we have a bunch of redirects, they do not branch out before the final landing page
690
- # *but* it is not always the case: some intermediary redirects will have calls to 3rd party pages.
686
+ elif self.rendered_node != self.url_tree:
687
+ # Generally, when we have a bunch of redirects, they (generally) do not branch out
688
+ # before the final landing page *but* it is not always the case: some intermediary
689
+ # redirects will have calls to 3rd party pages.
691
690
  # Hopefully, this last case was taken care of in the branch above.
692
- # In this branch, we get the landing page after the redirects (if any), and attach the node to it.
691
+ # In this branch, we get the landing page after the redirects, and attach the node to it.
692
+
693
+ # We skip this call if there are no redirects as it is the very last fallback at the
694
+ # end of this method anyway
693
695
  if dev_debug:
694
696
  self.logger.warning(f'Failed to attach URLNode in the normal process, attaching node to final redirect: {self.har.final_redirect}.')
695
- self._make_subtree(self.url_tree.search_nodes(name=self.har.final_redirect)[0], [node])
697
+ self._make_subtree(self.rendered_node, [node])
696
698
  elif 'pages' in self.har.har['log']:
697
699
  # No luck, the node is root for this pageref, let's attach it to the prior page in the list, or the very first node (tree root)
698
700
  page_before = self.har.har['log']['pages'][0]
har2tree/nodes.py CHANGED
@@ -27,6 +27,7 @@ import json_stream # type: ignore
27
27
  from bs4 import BeautifulSoup
28
28
  from ete3 import TreeNode # type: ignore
29
29
  from publicsuffixlist import PublicSuffixList # type: ignore
30
+ from requests_toolbelt.multipart import decoder # type: ignore
30
31
  from w3lib.html import strip_html5_whitespace
31
32
  from w3lib.url import canonicalize_url, safe_url_string
32
33
 
@@ -217,10 +218,19 @@ class URLNode(HarTreeNode):
217
218
  decoded_posted_data: list[Any] | str | bytes | int | float | bool | dict[str, str] | dict[str, list[str]] | None = None
218
219
  if 'postData' not in self.request or 'text' not in self.request['postData']:
219
220
  self.logger.debug('POST request with no content.')
221
+ self.add_feature('posted_data_info', "No content.")
220
222
  elif not self.request['postData']['text']:
221
223
  # If the POST content is empty
222
224
  self.logger.debug('Empty POST request.')
223
225
  decoded_posted_data = ''
226
+ self.add_feature('posted_data_info', "Empty request.")
227
+ elif self.request['postData']['text'].startswith('\x1f\uFFFD\x08'):
228
+ # b'\x1f\xef\xbf\xbd\x08', decoded to UTF-8
229
+ # => the replacement character
230
+ # https://www.cogsci.ed.ac.uk/~richard/utf-8.cgi?input=%EF%BF%BD&mode=char
231
+ self.logger.debug('Got a garbled gzipped POST blob.')
232
+ self.add_feature('posted_data_info', "It was a POSTed gzipped blob, but the data has been garbled.")
233
+ decoded_posted_data = self.request['postData']['text']
224
234
  elif self.request['postData'].get('params'):
225
235
  # NOTE 2025-08-08
226
236
  # if the posted data mimetype is "application/x-www-form-urlencoded"
@@ -231,10 +241,11 @@ class URLNode(HarTreeNode):
231
241
  # TODO: some processing on the data part (it's often a json blob)
232
242
  self.logger.debug('Got a params POST.')
233
243
  decoded_posted_data = {entry['name']: entry['value'] for entry in self.request['postData']['params']}
244
+ self.add_feature('posted_data_info', "POST request as URL params.")
234
245
  else:
235
- # NOTE 2023-08-22: Blind attempt to base64 decode the data
236
246
  self.logger.debug('Got a normal POST')
237
247
  try:
248
+ # NOTE 2023-08-22: Blind attempt to base64 decode the data
238
249
  decoded_posted_data = self._dirty_safe_b64decode(self.request['postData']['text'])
239
250
  except binascii.Error:
240
251
  decoded_posted_data = self.request['postData']['text']
@@ -254,12 +265,16 @@ class URLNode(HarTreeNode):
254
265
  decoded_posted_data = unquote_plus(decoded_posted_data)
255
266
  if isinstance(decoded_posted_data, str):
256
267
  decoded_posted_data = parse_qs(decoded_posted_data)
268
+ self.add_feature('posted_data_info', "Successfully decoded POST request.")
257
269
  except Exception as e:
258
270
  self.logger.warning(f'Unable to unquote or parse form data "{decoded_posted_data!r}": {e}')
271
+ self.add_feature('posted_data_info', "Unable to decode POST request.")
259
272
  elif (mimetype_lower.startswith('application/json')
260
273
  or mimetype_lower.startswith('application/csp-report')
261
274
  or mimetype_lower.startswith('application/x-amz-json-1.1')
262
275
  or mimetype_lower.startswith('application/reports+json')
276
+ or mimetype_lower.startswith('application/vnd.adobe.dc+json')
277
+ or mimetype_lower.startswith('application/ion+json')
263
278
  or mimetype_lower.endswith('json')
264
279
  ):
265
280
  if isinstance(decoded_posted_data, (str, bytes)):
@@ -267,7 +282,9 @@ class URLNode(HarTreeNode):
267
282
  try:
268
283
  # NOTE 2023-08-22: loads here may give us a int, float or a bool.
269
284
  decoded_posted_data = json.loads(decoded_posted_data)
285
+ self.add_feature('posted_data_info', "Successfully decoded POST request.")
270
286
  except Exception:
287
+ self.add_feature('posted_data_info', "Unable to decode POST request.")
271
288
  if isinstance(decoded_posted_data, (str, bytes)):
272
289
  self.logger.warning(f"Expected json, got garbage: {mimetype_lower} - {decoded_posted_data[:20]!r}[...]")
273
290
  else:
@@ -283,71 +300,109 @@ class URLNode(HarTreeNode):
283
300
  raise ValueError(f'Invalid type: {type(decoded_posted_data)}')
284
301
  streamed_data = json_stream.load(to_stream)
285
302
  decoded_posted_data = json_stream.to_standard_types(streamed_data)
303
+ self.add_feature('posted_data_info', "Successfully decoded POST request.")
286
304
  except Exception:
287
305
  if isinstance(decoded_posted_data, (str, bytes)):
288
306
  self.logger.warning(f"Expected json stream, got garbage: {mimetype_lower} - {decoded_posted_data[:20]!r}[...]")
289
307
  else:
290
308
  self.logger.warning(f"Expected json stream, got garbage: {mimetype_lower} - {decoded_posted_data}")
291
- elif mimetype_lower.startswith('multipart/form-data'):
292
- # FIXME multipart content (similar to email). Not totally sure what do do with it tight now.
293
- self.logger.debug(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
294
- pass
309
+ self.add_feature('posted_data_info', "Unable to decode POST request.")
310
+ elif mimetype_lower.startswith('multipart'):
311
+ self.add_feature('posted_data_info', f"Decoding {mimetype_lower} is partially supported.")
312
+ if isinstance(decoded_posted_data, str):
313
+ # must be encoded for decoding
314
+ multipart_to_decode = decoded_posted_data.encode()
315
+ elif isinstance(decoded_posted_data, bytes):
316
+ multipart_to_decode = decoded_posted_data
317
+ else:
318
+ raise ValueError(f'Invalid type for multipart POST: {type(decoded_posted_data)}')
319
+ if b"\r\n" not in multipart_to_decode:
320
+ # the decoder wants that
321
+ multipart_to_decode = multipart_to_decode.replace(b"\n", b"\r\n")
322
+ try:
323
+ multipart_data = decoder.MultipartDecoder(multipart_to_decode, mimetype_lower)
324
+ decoded_posted_data = []
325
+ for part in multipart_data.parts:
326
+ headers = {k.decode(): v.decode() for k, v in part.headers.items()}
327
+ content = part.text
328
+ decoded_posted_data.append({'headers': headers, 'content': content})
329
+ except Exception as e:
330
+ self.logger.warning(f'Unable to decode multipart POST: {e}')
331
+ self.add_feature('posted_data_info', "Unable to decode multipart in POST request.")
332
+
295
333
  elif mimetype_lower.startswith('application/x-protobuf'):
296
334
  # FIXME If possible, decode?
297
335
  self.logger.debug(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
298
- pass
336
+ self.add_feature('posted_data_info', f"Decoding {mimetype_lower} is not supported yet.")
299
337
  elif mimetype_lower.startswith('text') and isinstance(decoded_posted_data, (str, bytes)):
300
338
  try:
301
339
  # NOTE 2023-08-22: Quite a few text entries are in fact json, give it a shot.
302
340
  # loads here may give us a int, float or a bool.
303
341
  decoded_posted_data = json.loads(decoded_posted_data)
342
+ self.add_feature('posted_data_info', "Decoded JSON out of POST request.")
304
343
  except Exception:
305
344
  # keep it as it is otherwise.
306
345
  pass
307
346
  elif mimetype_lower.endswith('javascript'):
308
347
  # keep it as it is
309
348
  self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
310
- pass
311
- elif mimetype_lower == '?':
312
- # Just skip it, no need to go in the warnings
349
+ self.add_feature('posted_data_info', f"Pretty rendering of {mimetype_lower} is not supported yet.")
350
+ elif mimetype_lower in ['?', '*/*']:
313
351
  self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
314
- pass
352
+ self.add_feature('posted_data_info', f"Weird MimeType ({mimetype_lower}) is not supported yet.")
315
353
  elif mimetype_lower == 'application/binary':
316
- # generally a broken gzipped blob
317
- self.logger.debug(f'Got a POST {mimetype_lower}, most probably a broken gziped blob: {decoded_posted_data!r}')
354
+ self.logger.warning(f'Got a POST {mimetype_lower}, not a broken gziped blob: {decoded_posted_data!r}')
355
+ self.add_feature('posted_data_info', f"MimeType ({mimetype_lower}) is not supported yet.")
318
356
  elif mimetype_lower in ['application/octet-stream']:
319
357
  # Should flag it, maybe?
320
358
  self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
321
- pass
359
+ self.add_feature('posted_data_info', f"MimeType ({mimetype_lower}) is not supported yet.")
322
360
  elif mimetype_lower in ['application/grpc-web+proto']:
323
361
  # Can be decoded?
324
362
  self.logger.warning(f'Got a POST {mimetype_lower} - can be decoded: {decoded_posted_data!r}')
363
+ self.add_feature('posted_data_info', f"MimeType ({mimetype_lower}) is not supported yet.")
325
364
  elif mimetype_lower in ['application/unknown']:
326
365
  # Weird but already seen stuff
327
366
  self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
328
- pass
367
+ self.add_feature('posted_data_info', f"MimeType ({mimetype_lower}) is not supported yet.")
329
368
  else:
330
- self.logger.warning(f'Unexpected mime type: {mimetype_lower}')
369
+ self.logger.warning(f'Unexpected mime type: {mimetype_lower} - {decoded_posted_data!r}')
370
+ self.add_feature('posted_data_info', f"Unexpected MimeType ({mimetype_lower}) is not supported yet.")
331
371
  else:
332
372
  self.logger.warning(f'Missing mimetype in POST: {self.request["postData"]}')
373
+ self.add_feature('posted_data_info', "Missing MimeType, not sure what to do.")
333
374
 
334
375
  # NOTE 2023-08-22: Blind attempt to process the data as json
335
- if isinstance(decoded_posted_data, (str, bytes)):
376
+ if decoded_posted_data and isinstance(decoded_posted_data, (str, bytes)):
336
377
  try:
337
378
  decoded_posted_data = json.loads(decoded_posted_data)
338
379
  except Exception:
339
380
  pass
340
381
 
341
- if isinstance(decoded_posted_data, bytes):
382
+ if decoded_posted_data and isinstance(decoded_posted_data, bytes):
342
383
  # NOTE 2023-08-22: Blind attempt to decode the bytes
343
384
  # Try to decode it as utf-8
344
385
  try:
345
386
  decoded_posted_data = decoded_posted_data.decode('utf-8')
346
387
  except Exception:
347
388
  pass
389
+
348
390
  self.add_feature('posted_data', decoded_posted_data)
349
391
  if 'postData' in self.request and self.request['postData'].get('mimeType'):
350
392
  self.add_feature('posted_data_mimetype', self.request['postData']['mimeType'])
393
+ # Get size, post decode.
394
+ if not decoded_posted_data:
395
+ # empty or None, set to 0
396
+ self.add_feature('posted_data_size', 0)
397
+ elif isinstance(decoded_posted_data, (list, dict)):
398
+ # set size to the json dump
399
+ self.add_feature('posted_data_size', len(json.dumps(decoded_posted_data)))
400
+ elif isinstance(decoded_posted_data, (str, bytes)):
401
+ # length
402
+ self.add_feature('posted_data_size', len(decoded_posted_data))
403
+ else:
404
+ # Stringify and len
405
+ self.add_feature('posted_data_size', len(str(decoded_posted_data)))
351
406
 
352
407
  self.add_feature('response', har_entry['response'])
353
408
  try:
@@ -531,7 +586,7 @@ class URLNode(HarTreeNode):
531
586
  return href
532
587
 
533
588
  if not hasattr(self, 'rendered_html') or not self.rendered_html:
534
- raise Har2TreeError('Not the node of a page rendered, invalid request.')
589
+ raise Har2TreeError('Not the node of a page rendered ({self.uuid}), invalid request.')
535
590
  urls: set[str] = set()
536
591
 
537
592
  # The simple ones: the links.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: har2tree
3
- Version: 1.31.5
3
+ Version: 1.32.0
4
4
  Summary: HTTP Archive (HAR) to ETE Toolkit generator
5
5
  License: BSD-3-Clause
6
6
  Author: Raphaël Vinot
@@ -21,7 +21,7 @@ Classifier: Topic :: Internet
21
21
  Classifier: Topic :: Security
22
22
  Provides-Extra: docs
23
23
  Requires-Dist: Sphinx (>=8.2.3) ; (python_version >= "3.11") and (extra == "docs")
24
- Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.13.4)
24
+ Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.13.5)
25
25
  Requires-Dist: ete3 (>=3.1.3)
26
26
  Requires-Dist: filetype (>=1.2.0)
27
27
  Requires-Dist: json-stream (>=2.3.3,<3.0.0)
@@ -30,7 +30,8 @@ Requires-Dist: multipart (>=1.3.0,<2.0.0)
30
30
  Requires-Dist: numpy (<2.1) ; python_version < "3.10"
31
31
  Requires-Dist: numpy (<2.3) ; python_version < "3.11"
32
32
  Requires-Dist: numpy (>=2.3.2) ; python_version >= "3.11"
33
- Requires-Dist: publicsuffixlist (>=1.0.2.20250809)
33
+ Requires-Dist: publicsuffixlist (>=1.0.2.20250824)
34
+ Requires-Dist: requests-toolbelt (>=1.0.0,<2.0.0)
34
35
  Requires-Dist: six (>=1.17.0) ; extra == "docs"
35
36
  Requires-Dist: tinycss2 (>=1.4.0)
36
37
  Requires-Dist: w3lib (>=2.3.1)
@@ -0,0 +1,10 @@
1
+ har2tree/__init__.py,sha256=Na3mxHkUBq3rzYbxiLNJF37DxH5mcghSorjzXw5Teug,422
2
+ har2tree/har2tree.py,sha256=PBRJZk-cqIOctbrIav4v5z2wKUFApayl4SQmLTKdF6E,44438
3
+ har2tree/helper.py,sha256=CgeXqfBeHs8SbkW7TRNKqJBTZLAu63KggQjbGHCZAGI,20681
4
+ har2tree/nodes.py,sha256=8Z1CTCQvLF6TQIYK7UO0BzXsJMIwwZ6uVGPwa1Kv7zo,37516
5
+ har2tree/parser.py,sha256=4yej1OcVYAIiLfzYZsO9WCw3WyM_ykDTuvpW7UO1ROE,3645
6
+ har2tree/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ har2tree-1.32.0.dist-info/LICENSE,sha256=Xa4EVROgJsEo10CW-ISCRiw0TtqdKz1JuM3BBLBM55c,1803
8
+ har2tree-1.32.0.dist-info/METADATA,sha256=gV4DHjc7BnpGnEQlxqLl0bYHK3xPgjsVNUqeqkgRE58,2253
9
+ har2tree-1.32.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
10
+ har2tree-1.32.0.dist-info/RECORD,,
@@ -1,10 +0,0 @@
1
- har2tree/__init__.py,sha256=Na3mxHkUBq3rzYbxiLNJF37DxH5mcghSorjzXw5Teug,422
2
- har2tree/har2tree.py,sha256=47x9X5tY69f9SXkYJgJsnAaX2kxgXHgzFThGz6M86Zw,44495
3
- har2tree/helper.py,sha256=CgeXqfBeHs8SbkW7TRNKqJBTZLAu63KggQjbGHCZAGI,20681
4
- har2tree/nodes.py,sha256=a-5tk_AbnIklbdujlesb_1E0KGnSyK0OsTnbnd5i0D4,32961
5
- har2tree/parser.py,sha256=4yej1OcVYAIiLfzYZsO9WCw3WyM_ykDTuvpW7UO1ROE,3645
6
- har2tree/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
- har2tree-1.31.5.dist-info/LICENSE,sha256=Xa4EVROgJsEo10CW-ISCRiw0TtqdKz1JuM3BBLBM55c,1803
8
- har2tree-1.31.5.dist-info/METADATA,sha256=5QfFL4ESUuWJn7JuxcnLSgw70q3MGZoDfe9PJFS5JkA,2203
9
- har2tree-1.31.5.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
10
- har2tree-1.31.5.dist-info/RECORD,,