har2tree 1.31.3__py3-none-any.whl → 1.36.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
har2tree/har2tree.py CHANGED
@@ -13,7 +13,9 @@ from functools import wraps, lru_cache
13
13
  from io import BytesIO
14
14
  from operator import itemgetter
15
15
  from pathlib import Path
16
- from typing import Any, Callable
16
+ from typing import Any, TypedDict
17
+ from collections.abc import Iterator
18
+ from collections.abc import Callable
17
19
  from urllib.parse import unquote_plus, urlparse
18
20
 
19
21
  from .helper import rebuild_url, Har2TreeError, Har2TreeLogAdapter
@@ -58,7 +60,8 @@ def trace_make_subtree_fallback(method: Callable[..., None]) -> Callable[..., No
58
60
 
59
61
  def trace_make_subtree(method: Callable[..., None]) -> Callable[..., None]:
60
62
  @wraps(method)
61
- def _impl(self: Any, root: URLNode, nodes_to_attach: list[URLNode] | None=None, dev_debug: bool=False) -> None:
63
+ def _impl(self: Any, root: URLNode, nodes_to_attach: list[URLNode] | None=None,
64
+ dev_debug: bool=False, fallback: bool=False) -> None:
62
65
  if dev_debug_mode:
63
66
  __load_debug_files()
64
67
  if dev_debug_url and root.name == dev_debug_url or nodes_to_attach is not None and any(True for u in nodes_to_attach if u.name == dev_debug_url):
@@ -67,7 +70,7 @@ def trace_make_subtree(method: Callable[..., None]) -> Callable[..., None]:
67
70
  elif dev_debug_hostname and root.hostname == dev_debug_hostname or nodes_to_attach is not None and any(True for u in nodes_to_attach if u.hostname == dev_debug_hostname):
68
71
  root.logger.warning(f'Debugging Hostname: {dev_debug_hostname}.')
69
72
  dev_debug = True
70
- return method(self, root, nodes_to_attach, dev_debug)
73
+ return method(self, root, nodes_to_attach, dev_debug, fallback)
71
74
  return _impl
72
75
 
73
76
 
@@ -84,6 +87,15 @@ def __load_debug_files() -> None:
84
87
  dev_debug_hostname = f.read().strip()
85
88
 
86
89
 
90
+ # NOTE: Copy from PlaywrightCapture to avoid extra dep
91
+ class FramesResponse(TypedDict, total=False):
92
+
93
+ name: str
94
+ url: str
95
+ content: str | None
96
+ children: list[FramesResponse] | None
97
+
98
+
87
99
  class HarFile():
88
100
 
89
101
  def __init__(self, harfile: Path, capture_uuid: str):
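For reference, the frames tree this TypedDict describes nests one entry per (i)frame, and every key is optional because of total=False. A hypothetical frames.json payload (illustrative data only; the real file is produced on the capture side, per the PlaywrightCapture note above) could look like this:

    example_frames = {  # matches the FramesResponse shape defined above
        'name': 'main',
        'url': 'https://example.com/',
        'content': '<html>top level document</html>',
        'children': [
            {'url': 'https://example.com/widget', 'content': '<html>iframe</html>', 'children': None},
        ],
    }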
@@ -115,8 +127,14 @@ class HarFile():
115
127
  last_redirect_file = self.path.parent / f'{root_name}.last_redirect.txt'
116
128
  if last_redirect_file.is_file():
117
129
  with last_redirect_file.open('r') as _lr:
118
- self.final_redirect: str = unquote_plus(_lr.read())
119
- self._search_final_redirect()
130
+ last_redirect = unquote_plus(_lr.read())
131
+ self.final_redirect: str = last_redirect
132
+ if not self._search_final_redirect():
133
+ if last_redirect.startswith('chrome') or last_redirect.startswith('about'):
134
+ # the capture failed.
135
+ pass
136
+ else:
137
+ self.logger.info(f'Final redirect URL from address bar not in tree: {last_redirect}')
120
138
  else:
121
139
  self.logger.debug('No last_redirect file available.')
122
140
  self.final_redirect = ''
@@ -129,6 +147,14 @@ class HarFile():
129
147
  self.logger.debug('No cookies file available.')
130
148
  self.cookies = []
131
149
 
150
+ framesfile = self.path.parent / f'{root_name}.frames.json'
151
+ if framesfile.is_file():
152
+ with framesfile.open() as c:
153
+ self.frames: FramesResponse = json.load(c)
154
+ else:
155
+ self.logger.debug('No frames file available.')
156
+ self.frames = {}
157
+
132
158
  dlfile = self.path.parent / f'{root_name}.data'
133
159
  dlfilename = self.path.parent / f'{root_name}.data.filename'
134
160
  self.downloaded_file: BytesIO | None
@@ -169,29 +195,30 @@ class HarFile():
169
195
  # Set to false if initial_redirects fails to find the chain.
170
196
  self.need_tree_redirects = False
171
197
 
172
- def _search_final_redirect(self) -> None:
198
+ def _search_final_redirect(self) -> bool:
173
199
  """Try to find the final path to the final redirect without building the tree"""
174
200
  for e in self.entries:
175
201
  unquoted_url = unquote_plus(e['request']['url'])
176
202
  if unquoted_url == self.final_redirect:
177
- break
203
+ return True
178
204
  elif unquoted_url.startswith(f'{self.final_redirect}?'):
179
205
  # WARNING: the URL in that file may not be present in the HAR: the query part is stripped by splash
180
206
  self.final_redirect = unquoted_url
181
- break
207
+ return True
182
208
  else:
183
209
  # Update 2020-04-01: .. but the fragment is not striped so self.final_redirect may not be found
184
210
  # Unless we find the entry in the har, we need to search again without the fragment
185
211
  if '#' in self.final_redirect:
186
212
  self.final_redirect = self.final_redirect.split('#', 1)[0]
187
- self._search_final_redirect()
213
+ return self._search_final_redirect()
188
214
  elif '?' in self.final_redirect:
189
215
  # At this point, we're trying things. The final URL returned by splash may have been changed
190
216
  # in JavaScript and never appear in the HAR. Let's try to find the closest one with the same path
191
217
  self.final_redirect = self.final_redirect.split('?', 1)[0]
192
- self._search_final_redirect()
218
+ return self._search_final_redirect()
193
219
  else:
194
220
  self.logger.info(f'Unable to find the final redirect: {self.final_redirect}')
221
+ return False
195
222
 
196
223
  @property
197
224
  def number_entries(self) -> int:
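A worked illustration (hypothetical URLs, not from the library) of the relaxation order `_search_final_redirect()` now applies before giving up and returning False:

    # The address bar reported a URL that never appears verbatim in the HAR.
    final_redirect = 'https://example.com/landing?utm=x#top'
    har_urls = {'https://example.com/landing'}   # what the HAR entries actually contain

    # pass 1: no exact match, no '<final_redirect>?' prefix match, '#' present -> strip the fragment
    final_redirect = final_redirect.split('#', 1)[0]   # 'https://example.com/landing?utm=x'
    # pass 2: still no match, '?' present -> strip the query
    final_redirect = final_redirect.split('?', 1)[0]   # 'https://example.com/landing'
    # pass 3: exact match in the entries -> the method returns True
    assert final_redirect in har_urls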
@@ -306,8 +333,9 @@ class Har2Tree:
306
333
  self.pages_root: dict[str, str] = {}
307
334
 
308
335
  self.all_redirects: list[str] = []
309
- self.all_referer: dict[str, list[str]] = defaultdict(list)
310
- self.all_initiator_url: dict[str, list[str]] = defaultdict(list)
336
+ # 2025-11-16: make values of referers and initiators sets because there will be duplicates
337
+ self.all_referer: dict[str, set[str]] = defaultdict(set)
338
+ self.all_initiator_url: dict[str, set[str]] = defaultdict(set)
311
339
  self._load_url_entries()
312
340
 
313
341
  # Generate cookies lookup tables
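The switch from defaultdict(list) to defaultdict(set) simply deduplicates repeated referer/initiator pairs; a quick sketch of the difference:

    from collections import defaultdict

    all_referer: dict[str, set[str]] = defaultdict(set)
    # the same (referer, url) pair showing up twice in the HAR is now stored once
    for referer, url in [('https://a.example/', 'https://b.example/x'),
                         ('https://a.example/', 'https://b.example/x')]:
        all_referer[referer].add(url)
    assert all_referer['https://a.example/'] == {'https://b.example/x'}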
@@ -398,6 +426,65 @@ class Har2Tree:
398
426
 
399
427
  self.url_tree = self._nodes_list.pop(0)
400
428
 
429
+ def _url_to_local_only_content(self, url: str | None) -> bool:
430
+ return (url is None
431
+ or url in ['about:blank', 'about:srcdoc', ''] # not loading anything remotely
432
+ or url.startswith('data') # base64 encoded content
433
+ or url.startswith('chrome-error') # not in the HAR/tree
434
+ or url.startswith('blob') # blobs aren't URLs
435
+ )
436
+
437
+ def _load_iframes(self, current: URLNode, frames: FramesResponse) -> None:
438
+ if not frames.get('content') or frames['content'] is None:
439
+ # NOTE: debug stuff, no content makes it pretty useless.
440
+ if frames.get('url'):
441
+ if self._url_to_local_only_content(frames['url']):
442
+ self.logger.info('Got an empty frame to local content.')
443
+ else:
444
+ u = unquote_plus(frames['url'])
445
+ self.logger.warning(f'Got a url ({u}) for the frame, but no content')
446
+ else:
447
+ self.logger.info('Got a frame, but no content.')
448
+ return
449
+
450
+ if frames.get('url') and not self._url_to_local_only_content(frames['url']):
451
+ u = unquote_plus(frames['url'])
452
+ possible_child_name = {u, u.split('#', 1)[0]}
453
+ # this url should be in a node directly attached to that one
454
+ # we need to find that node
455
+ for child in current.traverse():
456
+ if child.name in possible_child_name:
457
+ self.logger.debug(f'Found URL "{u}".')
458
+ # Found the node, adding the content
459
+ if not hasattr(child, 'rendered_frame'):
460
+ child.rendered_frame = []
461
+ child.rendered_frame.append(BytesIO(frames['content'].encode()))
462
+ # and mark the node as iframe
463
+ child.add_feature('iframe', True)
464
+ # if there are children, use that node as parent and call the current method recursively
465
+ if f_children := frames.get('children'):
466
+ for f_child in f_children:
467
+ self._load_iframes(child, f_child)
468
+ break
469
+ else:
470
+ # Couldn't find the node.
471
+ to_print = ', '.join(possible_child_name)
472
+ children_to_print = ', '.join([child.name for child in current.traverse()])
473
+ self.logger.warning(f'Unable to find "{to_print}" in the children of "{current.name}" - {children_to_print}')
474
+ else:
475
+ self.logger.debug(f'"{current.name}" contains an iFrame.')
476
+ # No URL, this frame is directly in the parent frame.
477
+ if not hasattr(current, 'rendered_frame'):
478
+ current.rendered_frame = []
479
+ current.rendered_frame.append(BytesIO(frames['content'].encode()))
480
+ self.logger.debug(f'"{current.name}" has {len(current.rendered_frame)} iFrames.')
481
+ # and mark the node as iframe
482
+ current.add_feature('iframe', True)
483
+ # if there are children, use that node as parent and call the current method recursively
484
+ if f_children := frames.get('children'):
485
+ for f_child in f_children:
486
+ self._load_iframes(current, f_child)
487
+
401
488
  @property
402
489
  def initial_referer(self) -> str | None:
403
490
  '''The referer passed to the first URL in the tree'''
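`_load_iframes()` above relies on the ete3 TreeNode API that URLNode builds on (traverse(), add_feature()). A minimal sketch with a bare TreeNode standing in for a URLNode, to show how the frame content ends up on the matching child (names and content are made up):

    from io import BytesIO
    from ete3 import TreeNode

    root = TreeNode(name='https://example.com/')
    child = root.add_child(name='https://example.com/frame')

    rendered = '<html>frame content</html>'
    for node in root.traverse():
        if node.name == 'https://example.com/frame':
            if not hasattr(node, 'rendered_frame'):
                node.rendered_frame = []        # one BytesIO per rendered frame
            node.rendered_frame.append(BytesIO(rendered.encode()))
            node.add_feature('iframe', True)    # later queryable as node.iframe
            break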
@@ -520,7 +607,7 @@ class Har2Tree:
520
607
 
521
608
  if hasattr(n, 'initiator_url'):
522
609
  # The HAR file was created by chrome/chromium and we got the _initiator key
523
- self.all_initiator_url[n.initiator_url].append(n.name)
610
+ self.all_initiator_url[n.initiator_url].add(n.name)
524
611
 
525
612
  if url_entry['startedDateTime'] in self.har.pages_start_times:
526
613
  for page in self.har.pages_start_times[url_entry['startedDateTime']]:
@@ -533,7 +620,7 @@ class Har2Tree:
533
620
  if hasattr(n, 'referer') and i > 0:
534
621
  # NOTE 2021-05-14: referer to self are a real thing: url -> POST to self
535
622
  if n.name != n.referer or ('method' in n.request and n.request['method'] == 'POST'):
536
- self.all_referer[n.referer].append(n.name)
623
+ self.all_referer[n.referer].add(n.name)
537
624
 
538
625
  self._nodes_list.append(n)
539
626
  self.all_url_requests[n.name].append(n)
@@ -566,12 +653,6 @@ class Har2Tree:
566
653
  if node:
567
654
  return node[0]
568
655
 
569
- browser_errors = ['chrome-error', 'about:blank']
570
- if self.har.final_redirect and not any(self.har.final_redirect.startswith(r) for r in browser_errors):
571
- self.logger.warning(f'Final redirect URL from adress bar not in tree: {self.har.final_redirect}')
572
- else:
573
- # No final redirect, already logged earlier.
574
- pass
575
656
  # Just try to get the best guess: first node after JS/HTTP redirects
576
657
  curnode = self.url_tree
577
658
  while hasattr(curnode, 'redirect') and curnode.redirect:
@@ -620,6 +701,14 @@ class Har2Tree:
620
701
  for child_node_hostname, child_nodes_url in sub_roots.items():
621
702
  self.make_hostname_tree(child_nodes_url, child_node_hostname)
622
703
 
704
+ def _all_urlnodes_in_host_tree(self) -> None:
705
+ # debug: check if all the nodes in the URL tree are in the hostnode tree (they must have a UUID)
706
+ self.logger.warning('Validating host tree....')
707
+ for urlnode in self.url_tree.traverse():
708
+ if not hasattr(urlnode, 'hostnode_uuid'):
709
+ self.logger.error(f'URL Node not in host tree: {urlnode}')
710
+ self.logger.warning('host tree validated.')
711
+
623
712
  def make_tree(self) -> URLNode:
624
713
  """Build URL and Host trees"""
625
714
  self._make_subtree(self.url_tree)
@@ -650,6 +739,15 @@ class Har2Tree:
650
739
  # Initialize the hostname tree root
651
740
  self.hostname_tree.add_url(self.url_tree)
652
741
  self.make_hostname_tree(self.url_tree, self.hostname_tree)
742
+ if dev_debug_mode:
743
+ self._all_urlnodes_in_host_tree()
744
+ if isinstance(self.har.frames, dict):
745
+ if self.har.frames.get('children') and self.har.frames['children'] is not None:
746
+ # we have frames in the main one
747
+ for f_child in self.har.frames['children']:
748
+ self._load_iframes(self.rendered_node, f_child)
749
+ else:
750
+ self.logger.warning(f'Wrong format for the frames ({type(self.har.frames)}), very old capture.')
653
751
  return self.url_tree
654
752
 
655
753
  @trace_make_subtree_fallback
@@ -667,7 +765,7 @@ class Har2Tree:
667
765
  # we got an non-empty response, breaking
668
766
  break
669
767
  # attach to the the first response with something, or to whatever we get.
670
- self._make_subtree(node_with_hostname, [node])
768
+ self._make_subtree(node_with_hostname, [node], fallback=True)
671
769
  return
672
770
 
673
771
  # Sometimes, the har has a list of pages, generally when we have HTTP redirects.
@@ -679,20 +777,25 @@ class Har2Tree:
679
777
  and node.pageref != self.har.har['log']['pages'][0]
680
778
  and self.pages_root[node.pageref] != node.uuid):
681
779
  # In that case, we check if there is already a page with the pageref of the orphan node,
682
- # and attach the node to that. NOTE: we can only do that if there is already a node with this pageref in the tree.
780
+ # and attach the node to that.
781
+ # NOTE: we can only do that if there is already a node with this pageref in the tree.
683
782
  # This node is not a page root, we can attach it \o/
684
783
  page_root_node = self.get_url_node_by_uuid(self.pages_root[node.pageref])
685
784
  if dev_debug:
686
785
  self.logger.warning(f'Failed to attach URLNode in the normal process, attaching node to page {node.pageref} - Node: {page_root_node.uuid} - {page_root_node.name}.')
687
- self._make_subtree(page_root_node, [node])
688
- elif self.url_tree.search_nodes(name=self.har.final_redirect):
689
- # Generally, when we have a bunch of redirects, they do not branch out before the final landing page
690
- # *but* it is not always the case: some intermediary redirects will have calls to 3rd party pages.
786
+ self._make_subtree(page_root_node, [node], fallback=True)
787
+ elif self.rendered_node != self.url_tree:
788
+ # Generally, when we have a bunch of redirects, they (generally) do not branch out
789
+ # before the final landing page *but* it is not always the case: some intermediary
790
+ # redirects will have calls to 3rd party pages.
691
791
  # Hopefully, this last case was taken care of in the branch above.
692
- # In this branch, we get the landing page after the redirects (if any), and attach the node to it.
792
+ # In this branch, we get the landing page after the redirects, and attach the node to it.
793
+
794
+ # We skip this call if there are no redirects as it is the very last fallback at the
795
+ # end of this method anyway
693
796
  if dev_debug:
694
797
  self.logger.warning(f'Failed to attach URLNode in the normal process, attaching node to final redirect: {self.har.final_redirect}.')
695
- self._make_subtree(self.url_tree.search_nodes(name=self.har.final_redirect)[0], [node])
798
+ self._make_subtree(self.rendered_node, [node], fallback=True)
696
799
  elif 'pages' in self.har.har['log']:
697
800
  # No luck, the node is root for this pageref, let's attach it to the prior page in the list, or the very first node (tree root)
698
801
  page_before = self.har.har['log']['pages'][0]
@@ -714,13 +817,38 @@ class Har2Tree:
714
817
  # node to the root node
715
818
  page_root_node = self.url_tree
716
819
  self.logger.warning('The pages in the HAR are in in the wrong order, this should not happen but here we are')
717
- self._make_subtree(page_root_node, [node])
820
+ self._make_subtree(page_root_node, [node], fallback=True)
718
821
  else:
719
822
  # no way to attach it to anything else, attach to the root node
720
- self._make_subtree(self.url_tree, [node])
823
+ self._make_subtree(self.url_tree, [node], fallback=True)
824
+
825
+ def all_real_urls_in_children(self, frame: FramesResponse) -> Iterator[str]:
826
+ # from a frame, search all the real urls in each of the children, stop at the first one
827
+ if (frame.get('url') and frame['url'] is not None and not self._url_to_local_only_content(frame['url'])):
828
+ yield frame['url']
829
+ else:
830
+ # got no real URL, try the children
831
+ if frame.get('children') and frame['children'] is not None:
832
+ for c in frame['children']:
833
+ yield from self.all_real_urls_in_children(c)
834
+
835
+ def search_in_frames(self, urls: set[str], frame: FramesResponse) -> Iterator[str]:
836
+ # If the frame doesn't have children, there are no potential URLs to attach
837
+ if not isinstance(frame, dict) or not frame.get('children') or frame['children'] is None:
838
+ return None
839
+
840
+ if frame.get('url'):
841
+ u = unquote_plus(frame['url'])
842
+ if urls & {u, u.split('#', 1)[0]}:
843
+ # got a matching URL, get list of potential iframes urls
844
+ for c in frame['children']:
845
+ yield from self.all_real_urls_in_children(c)
846
+ for c in frame['children']:
847
+ yield from self.search_in_frames(urls, c)
721
848
 
722
849
  @trace_make_subtree
723
- def _make_subtree(self, root: URLNode, nodes_to_attach: list[URLNode] | None=None, dev_debug: bool=False) -> None:
850
+ def _make_subtree(self, root: URLNode, nodes_to_attach: list[URLNode] | None=None,
851
+ dev_debug: bool=False, fallback: bool=False) -> None:
724
852
  """Recursive method building each level of the tree"""
725
853
  matching_urls: list[URLNode]
726
854
  if nodes_to_attach is None:
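To make the two generators above concrete, here is a hypothetical frames tree and what search_in_frames() would yield for it (illustrative data only):

    frames = {   # matches the FramesResponse shape
        'url': 'https://example.com/',
        'content': '<html>...</html>',
        'children': [
            {'url': 'https://example.com/embed', 'content': '<html>...</html>',
             'children': [
                 # an about:blank wrapper frame, itself embedding a remote frame
                 {'url': 'about:blank', 'content': '<html>...</html>',
                  'children': [
                      {'url': 'https://ads.example.net/frame', 'content': '<html>...</html>',
                       'children': None},
                  ]},
             ]},
        ],
    }
    # search_in_frames({'https://example.com/embed'}, frames) yields
    # 'https://ads.example.net/frame': the about:blank wrapper has no "real" URL,
    # so all_real_urls_in_children() recurses into its children instead.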
@@ -782,6 +910,26 @@ class Har2Tree:
782
910
  if unode.empty_response:
783
911
  continue
784
912
 
913
+ # 2025-11-14
914
+ # the referer of an iframe is the hostname of the parent, even if the parent
915
+ # is a URL with a full path. Before using the referer, we need to check if we have
916
+ # the current url in the frame tree. If we do, find nodes (in the remaining list)
917
+ # with the URLs of the children - any fragment will be missing - and attach them
918
+ possible_iframe_urls = {unode.name, unode.name.split('#', 1)[0]}
919
+ for possible_url in self.search_in_frames(possible_iframe_urls, self.har.frames):
920
+ cu = unquote_plus(possible_url)
921
+ for u in {cu, cu.split('#', 1)[0]}:
922
+ if u not in self.all_url_requests:
923
+ if '#' not in u:
924
+ self.logger.info(f'"{u}" in the frames URLs, but not in the HAR.')
925
+ continue
926
+ matching_urls = [url_node for url_node in self.all_url_requests[u]
927
+ if url_node in self._nodes_list]
928
+ self._nodes_list = [node for node in self._nodes_list if node not in matching_urls]
929
+ if dev_debug:
930
+ self.logger.warning(f'Found via initiator from {unode.name} to {matching_urls}.')
931
+ self._make_subtree(unode, matching_urls)
932
+
785
933
  # The node can have a redirect, but also trigger ressources refering to themselves, we need to trigger this code on each node.
786
934
  if self.all_initiator_url.get(unode.name):
787
935
  # The URL (unode.name) is in the list of known urls initiating calls
@@ -813,6 +961,12 @@ class Har2Tree:
813
961
  if hasattr(unode, 'external_ressources'):
814
962
  # the url loads external things, and some of them have no referer....
815
963
  for external_tag, links in unode.external_ressources.items():
964
+ # 2025-11-06: skip full_regex unless this method is called as a fallback:
965
+ # the iframes will often (not always) have a referer set and the URL
966
+ # might be found by the regex and it will not be attached at the
967
+ # right place
968
+ if external_tag == 'full_regex' and not fallback:
969
+ continue
816
970
  for link in links:
817
971
  if link not in self.all_url_requests or link == self.har.final_redirect:
818
972
  # We have a lot of false positives
har2tree/helper.py CHANGED
@@ -72,7 +72,7 @@ def make_hhhash(entry: dict[str, Any]) -> str:
72
72
  # We need the HTTP version used for the query:
73
73
  # * The HTTP Header names in HTTP 1.1 can have uppercase characters
74
74
  # * The HTTP Header names in HTTP 2 *must* be lowercase: https://www.rfc-editor.org/rfc/rfc7540#section-8.1.2
75
- if entry['httpVersion'].lower() in ["http/1.1", "http/1.0"]:
75
+ if entry['httpVersion'].lower() in ["http/1.1", "http/1.0", "1.1"]:
76
76
  return f'hhh:1:{sha256}'
77
77
  if entry['httpVersion'].lower() == "http/2.0":
78
78
  return f'hhh:2:{sha256}'
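The extra "1.1" entry covers HAR producers that report the bare version number instead of "HTTP/1.1". A quick check of the normalization as a standalone sketch (the sha256 placeholder stands in for the value make_hhhash computes from the header names earlier in the function, not shown in this hunk):

    def _hhh_prefix(http_version: str) -> str | None:
        # simplified sketch of the version mapping shown in this hunk
        v = http_version.lower()
        if v in ("http/1.1", "http/1.0", "1.1"):
            return 'hhh:1:'
        if v == "http/2.0":
            return 'hhh:2:'
        return None

    assert _hhh_prefix('1.1') == _hhh_prefix('HTTP/1.1') == 'hhh:1:'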
@@ -364,7 +364,8 @@ def find_external_ressources(mimetype: str, data: bytes, base_url: str, all_requ
364
364
  # link: https://www.w3schools.com/TAGs/tag_link.asp -> href
365
365
  # object: https://www.w3schools.com/TAGs/tag_object.asp -> data
366
366
  external_ressources: dict[str, list[str]] = {'img': [], 'script': [], 'video': [], 'audio': [],
367
- 'iframe': [], 'embed': [], 'source': [],
367
+ 'iframe': [],
368
+ 'embed': [], 'source': [],
368
369
  'link': [],
369
370
  'object': [],
370
371
  'css': [],
har2tree/nodes.py CHANGED
@@ -15,16 +15,19 @@ from base64 import b64decode
15
15
  from datetime import datetime, timedelta
16
16
  from functools import lru_cache, cached_property
17
17
  from hashlib import sha256
18
- from io import BytesIO
18
+ from io import BytesIO, StringIO
19
19
  from pathlib import Path
20
20
  from typing import Any
21
21
  from collections.abc import MutableMapping
22
- from urllib.parse import unquote_plus, urlparse, urljoin
22
+ from urllib.parse import unquote_plus, urlparse, urljoin, parse_qs
23
23
 
24
24
  import filetype # type: ignore
25
+ import json_stream # type: ignore
26
+
25
27
  from bs4 import BeautifulSoup
26
28
  from ete3 import TreeNode # type: ignore
27
29
  from publicsuffixlist import PublicSuffixList # type: ignore
30
+ from requests_toolbelt.multipart import decoder # type: ignore
28
31
  from w3lib.html import strip_html5_whitespace
29
32
  from w3lib.url import canonicalize_url, safe_url_string
30
33
 
@@ -211,33 +214,67 @@ class URLNode(HarTreeNode):
211
214
  if 'user_agent' not in self.features:
212
215
  self.add_feature('user_agent', '')
213
216
 
214
- if 'method' in self.request and self.request['method'] == 'POST' and 'postData' in self.request:
215
- # If the content is empty, we don't care
216
- if self.request['postData']['text']:
217
- _posted_data: str = self.request['postData']['text']
218
- decoded_posted_data: str | bytes | int | float | bool
219
- # NOTE 2023-08-22: Blind attempt to base64 decode the data
217
+ if 'method' in self.request and self.request['method'] == 'POST':
218
+ decoded_posted_data: list[Any] | str | bytes | int | float | bool | dict[str, str] | dict[str, list[str]] | None = None
219
+ if 'postData' not in self.request or 'text' not in self.request['postData']:
220
+ self.logger.debug('POST request with no content.')
221
+ self.add_feature('posted_data_info', "No content.")
222
+ elif not self.request['postData']['text']:
223
+ # If the POST content is empty
224
+ self.logger.debug('Empty POST request.')
225
+ decoded_posted_data = ''
226
+ self.add_feature('posted_data_info', "Empty request.")
227
+ elif self.request['postData']['text'].startswith('\x1f\uFFFD\x08'):
228
+ # b'\x1f\xef\xbf\xbd\x08', decoded to UTF-8
229
+ # => the replacement character
230
+ # https://www.cogsci.ed.ac.uk/~richard/utf-8.cgi?input=%EF%BF%BD&mode=char
231
+ self.logger.debug('Got a garbled gzipped POST blob.')
232
+ self.add_feature('posted_data_info', "It was a POSTed gzipped blob, but the data has been garbled.")
233
+ decoded_posted_data = self.request['postData']['text']
234
+ elif self.request['postData'].get('params'):
235
+ # NOTE 2025-08-08
236
+ # if the posted data mimetype is "application/x-www-form-urlencoded"
237
+ # the HAR contains the decoded entry in the params key
238
+ # The params key is a list of dicts with a key and a value
239
+ # {"name": <key>, "value": <data>}
240
+ # I'd rather have it as {<key>: <data>}
241
+ # TODO: some processing on the data part (it's often a json blob)
242
+ self.logger.debug('Got a params POST.')
243
+ decoded_posted_data = {entry['name']: entry['value'] for entry in self.request['postData']['params']}
244
+ self.add_feature('posted_data_info', "POST request as URL params.")
245
+ else:
246
+ self.logger.debug('Got a normal POST')
220
247
  try:
221
- decoded_posted_data = self._dirty_safe_b64decode(_posted_data)
248
+ # NOTE 2023-08-22: Blind attempt to base64 decode the data
249
+ decoded_posted_data = self._dirty_safe_b64decode(self.request['postData']['text'])
222
250
  except binascii.Error:
223
- decoded_posted_data = _posted_data
251
+ decoded_posted_data = self.request['postData']['text']
224
252
  if 'mimeType' in self.request['postData']:
225
253
  # make it easier to compare.
226
254
  mimetype_lower = self.request['postData']['mimeType'].lower()
227
255
  if mimetype_lower.startswith('application/x-www-form-urlencoded'):
256
+ # NOTE: this should never happen as there should
257
+ # be something in self.request['postData']['params']
258
+ # and we already processed it before but just in case...
259
+ self.logger.debug('Got an application/x-www-form-urlencoded POST without a params key')
228
260
  # 100% sure there will be websites where decode will fail
229
261
  try:
230
262
  if isinstance(decoded_posted_data, bytes):
231
263
  decoded_posted_data = decoded_posted_data.decode()
232
264
  if isinstance(decoded_posted_data, str):
233
265
  decoded_posted_data = unquote_plus(decoded_posted_data)
266
+ if isinstance(decoded_posted_data, str):
267
+ decoded_posted_data = parse_qs(decoded_posted_data)
268
+ self.add_feature('posted_data_info', "Successfully decoded POST request.")
234
269
  except Exception as e:
235
- self.logger.warning(f'Unable to unquote form data "{decoded_posted_data!r}": {e}')
270
+ self.logger.warning(f'Unable to unquote or parse form data "{decoded_posted_data!r}": {e}')
271
+ self.add_feature('posted_data_info', "Unable to decode POST request.")
236
272
  elif (mimetype_lower.startswith('application/json')
237
273
  or mimetype_lower.startswith('application/csp-report')
238
274
  or mimetype_lower.startswith('application/x-amz-json-1.1')
239
- or mimetype_lower.startswith('application/x-json-stream')
240
275
  or mimetype_lower.startswith('application/reports+json')
276
+ or mimetype_lower.startswith('application/vnd.adobe.dc+json')
277
+ or mimetype_lower.startswith('application/ion+json')
241
278
  or mimetype_lower.endswith('json')
242
279
  ):
243
280
  if isinstance(decoded_posted_data, (str, bytes)):
@@ -245,56 +282,127 @@ class URLNode(HarTreeNode):
245
282
  try:
246
283
  # NOTE 2023-08-22: loads here may give us a int, float or a bool.
247
284
  decoded_posted_data = json.loads(decoded_posted_data)
285
+ self.add_feature('posted_data_info', "Successfully decoded POST request.")
248
286
  except Exception:
287
+ self.add_feature('posted_data_info', "Unable to decode POST request.")
249
288
  if isinstance(decoded_posted_data, (str, bytes)):
250
- self.logger.debug(f"Expected json, got garbage: {mimetype_lower} - {decoded_posted_data[:20]!r}[...]")
289
+ self.logger.warning(f"Expected json, got garbage: {mimetype_lower} - {decoded_posted_data[:20]!r}[...]")
251
290
  else:
252
- self.logger.debug(f"Expected json, got garbage: {mimetype_lower} - {decoded_posted_data}")
291
+ self.logger.warning(f"Expected json, got garbage: {mimetype_lower} - {decoded_posted_data}")
292
+ elif mimetype_lower.startswith('application/x-json-stream'):
293
+ try:
294
+ to_stream: StringIO | BytesIO
295
+ if isinstance(decoded_posted_data, str):
296
+ to_stream = StringIO(decoded_posted_data)
297
+ elif isinstance(decoded_posted_data, bytes):
298
+ to_stream = BytesIO(decoded_posted_data)
299
+ else:
300
+ raise ValueError(f'Invalid type: {type(decoded_posted_data)}')
301
+ streamed_data = json_stream.load(to_stream)
302
+ decoded_posted_data = json_stream.to_standard_types(streamed_data)
303
+ self.add_feature('posted_data_info', "Successfully decoded POST request.")
304
+ except Exception:
305
+ if isinstance(decoded_posted_data, (str, bytes)):
306
+ self.logger.warning(f"Expected json stream, got garbage: {mimetype_lower} - {decoded_posted_data[:20]!r}[...]")
307
+ else:
308
+ self.logger.warning(f"Expected json stream, got garbage: {mimetype_lower} - {decoded_posted_data}")
309
+ self.add_feature('posted_data_info', "Unable to decode POST request.")
310
+ elif mimetype_lower.startswith('multipart'):
311
+ self.add_feature('posted_data_info', f"Decoding {mimetype_lower} is partially supported.")
312
+ if isinstance(decoded_posted_data, str):
313
+ # must be encoded for decoding
314
+ multipart_to_decode = decoded_posted_data.encode()
315
+ elif isinstance(decoded_posted_data, bytes):
316
+ multipart_to_decode = decoded_posted_data
317
+ else:
318
+ raise ValueError(f'Invalid type for multipart POST: {type(decoded_posted_data)}')
319
+ if b"\r\n" not in multipart_to_decode:
320
+ # the decoder wants that
321
+ multipart_to_decode = multipart_to_decode.replace(b"\n", b"\r\n")
322
+ try:
323
+ multipart_data = decoder.MultipartDecoder(multipart_to_decode, mimetype_lower)
324
+ decoded_posted_data = []
325
+ for part in multipart_data.parts:
326
+ headers = {k.decode(): v.decode() for k, v in part.headers.items()}
327
+ content = part.text
328
+ decoded_posted_data.append({'headers': headers, 'content': content})
329
+ except Exception as e:
330
+ self.logger.warning(f'Unable to decode multipart POST: {e}')
331
+ self.add_feature('posted_data_info', "Unable to decode multipart in POST request.")
253
332
 
254
- elif mimetype_lower.startswith('multipart/form-data'):
255
- # FIXME multipart content (similar to email). Not totally sure what do do with it tight now.
256
- pass
257
333
  elif mimetype_lower.startswith('application/x-protobuf'):
258
334
  # FIXME If possible, decode?
259
- pass
260
- elif mimetype_lower.startswith('text'):
335
+ self.logger.debug(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
336
+ self.add_feature('posted_data_info', f"Decoding {mimetype_lower} is not supported yet.")
337
+ elif mimetype_lower.startswith('text') and isinstance(decoded_posted_data, (str, bytes)):
261
338
  try:
262
339
  # NOTE 2023-08-22: Quite a few text entries are in fact json, give it a shot.
263
340
  # loads here may give us a int, float or a bool.
264
341
  decoded_posted_data = json.loads(decoded_posted_data)
342
+ self.add_feature('posted_data_info', "Decoded JSON out of POST request.")
265
343
  except Exception:
266
344
  # keep it as it is otherwise.
267
345
  pass
268
346
  elif mimetype_lower.endswith('javascript'):
269
347
  # keep it as it is
270
- pass
271
- elif mimetype_lower == '?':
272
- # Just skip it, no need to go in the warnings
273
- pass
274
- elif mimetype_lower in ['application/octet-stream', 'application/binary']:
348
+ self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
349
+ self.add_feature('posted_data_info', f"Pretty rendering of {mimetype_lower} is not supported yet.")
350
+ elif mimetype_lower in ['?', '*/*']:
351
+ self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
352
+ self.add_feature('posted_data_info', f"Weird MimeType ({mimetype_lower}) is not supported yet.")
353
+ elif mimetype_lower == 'application/binary':
354
+ self.logger.warning(f'Got a POST {mimetype_lower}, not a broken gzipped blob: {decoded_posted_data!r}')
355
+ self.add_feature('posted_data_info', f"MimeType ({mimetype_lower}) is not supported yet.")
356
+ elif mimetype_lower in ['application/octet-stream']:
275
357
  # Should flag it, maybe?
276
- pass
277
- elif mimetype_lower in ['application/unknown', 'application/grpc-web+proto']:
358
+ self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
359
+ self.add_feature('posted_data_info', f"MimeType ({mimetype_lower}) is not supported yet.")
360
+ elif mimetype_lower in ['application/grpc-web+proto']:
361
+ # Can be decoded?
362
+ self.logger.warning(f'Got a POST {mimetype_lower} - can be decoded: {decoded_posted_data!r}')
363
+ self.add_feature('posted_data_info', f"MimeType ({mimetype_lower}) is not supported yet.")
364
+ elif mimetype_lower in ['application/unknown']:
278
365
  # Weird but already seen stuff
279
- pass
366
+ self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
367
+ self.add_feature('posted_data_info', f"MimeType ({mimetype_lower}) is not supported yet.")
280
368
  else:
281
- self.logger.warning(f'Unexpected mime type: {mimetype_lower}')
282
-
283
- # NOTE 2023-08-22: Blind attempt to process the data as json
284
- if isinstance(decoded_posted_data, (str, bytes)):
285
- try:
286
- decoded_posted_data = json.loads(decoded_posted_data)
287
- except Exception:
288
- pass
289
-
290
- if isinstance(decoded_posted_data, bytes):
291
- # NOTE 2023-08-22: Blind attempt to decode the bytes
292
- # Try to decode it as utf-8
293
- try:
294
- decoded_posted_data = decoded_posted_data.decode('utf-8')
295
- except Exception:
296
- pass
297
- self.add_feature('posted_data', decoded_posted_data)
369
+ self.logger.warning(f'Unexpected mime type: {mimetype_lower} - {decoded_posted_data!r}')
370
+ self.add_feature('posted_data_info', f"Unexpected MimeType ({mimetype_lower}) is not supported yet.")
371
+ else:
372
+ self.logger.warning(f'Missing mimetype in POST: {self.request["postData"]}')
373
+ self.add_feature('posted_data_info', "Missing MimeType, not sure what to do.")
374
+
375
+ # NOTE 2023-08-22: Blind attempt to process the data as json
376
+ if decoded_posted_data and isinstance(decoded_posted_data, (str, bytes)):
377
+ try:
378
+ decoded_posted_data = json.loads(decoded_posted_data)
379
+ except Exception:
380
+ pass
381
+
382
+ if decoded_posted_data and isinstance(decoded_posted_data, bytes):
383
+ # NOTE 2023-08-22: Blind attempt to decode the bytes
384
+ # Try to decode it as utf-8
385
+ try:
386
+ decoded_posted_data = decoded_posted_data.decode('utf-8')
387
+ except Exception:
388
+ pass
389
+
390
+ self.add_feature('posted_data', decoded_posted_data)
391
+ if 'postData' in self.request and self.request['postData'].get('mimeType'):
392
+ self.add_feature('posted_data_mimetype', self.request['postData']['mimeType'])
393
+ # Get size, post decode.
394
+ if not decoded_posted_data:
395
+ # empty or None, set to 0
396
+ self.add_feature('posted_data_size', 0)
397
+ elif isinstance(decoded_posted_data, (list, dict)):
398
+ # set size to the json dump
399
+ self.add_feature('posted_data_size', len(json.dumps(decoded_posted_data)))
400
+ elif isinstance(decoded_posted_data, (str, bytes)):
401
+ # length
402
+ self.add_feature('posted_data_size', len(decoded_posted_data))
403
+ else:
404
+ # Stringify and len
405
+ self.add_feature('posted_data_size', len(str(decoded_posted_data)))
298
406
 
299
407
  self.add_feature('response', har_entry['response'])
300
408
  try:
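The multipart branch above leans on requests_toolbelt's MultipartDecoder; a minimal sketch of how a made-up multipart/form-data body is split into header/content pairs, mirroring the loop in the diff:

    from requests_toolbelt.multipart import decoder

    # hypothetical body as a browser might POST it; the boundary is arbitrary
    body = (b'--boundary123\r\n'
            b'Content-Disposition: form-data; name="field1"\r\n'
            b'\r\n'
            b'value1\r\n'
            b'--boundary123--\r\n')
    multipart_data = decoder.MultipartDecoder(body, 'multipart/form-data; boundary=boundary123')
    for part in multipart_data.parts:
        headers = {k.decode(): v.decode() for k, v in part.headers.items()}
        print(headers, part.text)   # {'Content-Disposition': 'form-data; name="field1"'} value1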
@@ -377,6 +485,7 @@ class URLNode(HarTreeNode):
377
485
 
378
486
  # Common JS redirect we can catch easily
379
487
  # NOTE: it is extremely fragile and doesn't work very often but is kinda better than nothing.
488
+ # NOTE 2025-08-30: Also, finding a match doesn't mean the redirect sits in code that runs without user interaction: it may only be triggered after a user fills a form, for example.
380
489
  # Source: https://stackoverflow.com/questions/13363174/regular-expression-to-catch-as-many-javascript-redirections-as-possible
381
490
  regex = re.compile(br"""((location.href)|(window.location)|(location.replace)|(location.assign))(( ?= ?)|( ?\( ?))("|')([^'"]*)("|')( ?\) ?)?;""", re.I)
382
491
  matches = re.findall(regex, self.body.getvalue())
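For a sense of what this regex does (and does not) catch, a quick demo on made-up markup:

    import re

    regex = re.compile(br"""((location.href)|(window.location)|(location.replace)|(location.assign))(( ?= ?)|( ?\( ?))("|')([^'"]*)("|')( ?\) ?)?;""", re.I)
    body = b'<script>window.location = "https://example.com/next";</script>'
    matches = re.findall(regex, body)
    # findall returns one tuple of capture groups per hit; the redirect target
    # is the tenth group. A redirect built by string concatenation would be missed.
    assert matches and matches[0][9] == b'https://example.com/next'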
@@ -478,7 +587,7 @@ class URLNode(HarTreeNode):
478
587
  return href
479
588
 
480
589
  if not hasattr(self, 'rendered_html') or not self.rendered_html:
481
- raise Har2TreeError('Not the node of a page rendered, invalid request.')
590
+ raise Har2TreeError(f'Not the node of a page rendered ({self.uuid}), invalid request.')
482
591
  urls: set[str] = set()
483
592
 
484
593
  # The simple ones: the links.
@@ -1,36 +1,38 @@
1
- Metadata-Version: 2.3
1
+ Metadata-Version: 2.4
2
2
  Name: har2tree
3
- Version: 1.31.3
3
+ Version: 1.36.0
4
4
  Summary: HTTP Archive (HAR) to ETE Toolkit generator
5
- License: BSD-3-Clause
5
+ License-Expression: BSD-3-Clause
6
+ License-File: LICENSE
6
7
  Author: Raphaël Vinot
7
8
  Author-email: raphael.vinot@circl.lu
8
- Requires-Python: >=3.9
9
+ Requires-Python: >=3.10,<3.15
9
10
  Classifier: Intended Audience :: Information Technology
10
11
  Classifier: Intended Audience :: Science/Research
11
12
  Classifier: Intended Audience :: Telecommunications Industry
12
- Classifier: License :: OSI Approved :: BSD License
13
13
  Classifier: Operating System :: POSIX :: Linux
14
14
  Classifier: Programming Language :: Python :: 3
15
- Classifier: Programming Language :: Python :: 3.9
16
15
  Classifier: Programming Language :: Python :: 3.10
17
16
  Classifier: Programming Language :: Python :: 3.11
18
17
  Classifier: Programming Language :: Python :: 3.12
19
18
  Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Programming Language :: Python :: 3.14
20
20
  Classifier: Topic :: Internet
21
21
  Classifier: Topic :: Security
22
22
  Provides-Extra: docs
23
- Requires-Dist: Sphinx (>=8.2.3) ; (python_version >= "3.11") and (extra == "docs")
24
- Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.13.4)
23
+ Requires-Dist: Sphinx (>=9.0.4) ; (python_version >= "3.11") and (extra == "docs")
24
+ Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.14.3)
25
25
  Requires-Dist: ete3 (>=3.1.3)
26
26
  Requires-Dist: filetype (>=1.2.0)
27
- Requires-Dist: legacy-cgi (>=2.6.3) ; python_version >= "3.13,<4.0"
28
- Requires-Dist: numpy (<2.1) ; python_version < "3.10"
29
- Requires-Dist: numpy (<2.3) ; python_version < "3.11"
30
- Requires-Dist: numpy (>=2.3.1) ; python_version >= "3.11"
31
- Requires-Dist: publicsuffixlist (>=1.0.2.20250719)
27
+ Requires-Dist: json-stream (>=2.3.3,<3.0.0)
28
+ Requires-Dist: legacy-cgi (>=2.6.4) ; python_version >= "3.13" and python_version < "4.0"
29
+ Requires-Dist: multipart (>=1.3.0,<2.0.0)
30
+ Requires-Dist: numpy (>=2.2,<2.3) ; python_version < "3.11"
31
+ Requires-Dist: numpy (>=2.3.5) ; python_version >= "3.11" and python_version < "3.15"
32
+ Requires-Dist: publicsuffixlist (>=1.0.2.20251209)
33
+ Requires-Dist: requests-toolbelt (>=1.0.0,<2.0.0)
32
34
  Requires-Dist: six (>=1.17.0) ; extra == "docs"
33
- Requires-Dist: tinycss2 (>=1.4.0)
35
+ Requires-Dist: tinycss2 (>=1.5.1)
34
36
  Requires-Dist: w3lib (>=2.3.1)
35
37
  Project-URL: Documentation, https://har2tree.readthedocs.io/en/latest/
36
38
  Project-URL: Repository, https://github.com/Lookyloo/har2tree
@@ -0,0 +1,10 @@
1
+ har2tree/__init__.py,sha256=Na3mxHkUBq3rzYbxiLNJF37DxH5mcghSorjzXw5Teug,422
2
+ har2tree/har2tree.py,sha256=24Puk4dlDXWOVIAPV7SIXNoP-oP-_7ERH2mZPxXiwn8,52762
3
+ har2tree/helper.py,sha256=ktX5Fq-K_t4r0VVAXIH4uy7xc-qCjtSaiUvkX_PYxhw,20737
4
+ har2tree/nodes.py,sha256=QWKqEUnuW7J6pASVvzwWAQNqL-_KDzSs2ld6uJl3qbw,37710
5
+ har2tree/parser.py,sha256=4yej1OcVYAIiLfzYZsO9WCw3WyM_ykDTuvpW7UO1ROE,3645
6
+ har2tree/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ har2tree-1.36.0.dist-info/METADATA,sha256=jjZ2lxWFYv19ZpFL_1ehSNbUNLqEePYGNmLOABeUdnM,2240
8
+ har2tree-1.36.0.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
9
+ har2tree-1.36.0.dist-info/licenses/LICENSE,sha256=Xa4EVROgJsEo10CW-ISCRiw0TtqdKz1JuM3BBLBM55c,1803
10
+ har2tree-1.36.0.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: poetry-core 2.1.3
2
+ Generator: poetry-core 2.2.1
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
@@ -1,10 +0,0 @@
1
- har2tree/__init__.py,sha256=Na3mxHkUBq3rzYbxiLNJF37DxH5mcghSorjzXw5Teug,422
2
- har2tree/har2tree.py,sha256=47x9X5tY69f9SXkYJgJsnAaX2kxgXHgzFThGz6M86Zw,44495
3
- har2tree/helper.py,sha256=CgeXqfBeHs8SbkW7TRNKqJBTZLAu63KggQjbGHCZAGI,20681
4
- har2tree/nodes.py,sha256=CC3NseEaM455JOpPqjfTAQ-dwWiGWmzlceGSSeTwoRo,28951
5
- har2tree/parser.py,sha256=4yej1OcVYAIiLfzYZsO9WCw3WyM_ykDTuvpW7UO1ROE,3645
6
- har2tree/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
- har2tree-1.31.3.dist-info/LICENSE,sha256=Xa4EVROgJsEo10CW-ISCRiw0TtqdKz1JuM3BBLBM55c,1803
8
- har2tree-1.31.3.dist-info/METADATA,sha256=PSDu0bnPUYje8It-uyZfcDVbo4TTTC7RCzH-2CRAc0U,2112
9
- har2tree-1.31.3.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
10
- har2tree-1.31.3.dist-info/RECORD,,