har2tree 1.31.4__py3-none-any.whl → 1.31.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
har2tree/nodes.py CHANGED
@@ -15,13 +15,15 @@ from base64 import b64decode
15
15
  from datetime import datetime, timedelta
16
16
  from functools import lru_cache, cached_property
17
17
  from hashlib import sha256
18
- from io import BytesIO
18
+ from io import BytesIO, StringIO
19
19
  from pathlib import Path
20
20
  from typing import Any
21
21
  from collections.abc import MutableMapping
22
- from urllib.parse import unquote_plus, urlparse, urljoin
22
+ from urllib.parse import unquote_plus, urlparse, urljoin, parse_qs
23
23
 
24
24
  import filetype # type: ignore
25
+ import json_stream # type: ignore
26
+
25
27
  from bs4 import BeautifulSoup
26
28
  from ete3 import TreeNode # type: ignore
27
29
  from publicsuffixlist import PublicSuffixList # type: ignore
@@ -212,13 +214,22 @@ class URLNode(HarTreeNode):
212
214
  self.add_feature('user_agent', '')
213
215
 
214
216
  if 'method' in self.request and self.request['method'] == 'POST':
215
- decoded_posted_data: str | bytes | int | float | bool | dict[str, str] | None = None
217
+ decoded_posted_data: list[Any] | str | bytes | int | float | bool | dict[str, str] | dict[str, list[str]] | None = None
216
218
  if 'postData' not in self.request or 'text' not in self.request['postData']:
217
219
  self.logger.debug('POST request with no content.')
220
+ self.add_feature('posted_data_info', "No content.")
218
221
  elif not self.request['postData']['text']:
219
222
  # If the POST content is empty
220
223
  self.logger.debug('Empty POST request.')
221
224
  decoded_posted_data = ''
225
+ self.add_feature('posted_data_info', "Empty request.")
226
+ elif self.request['postData']['text'].startswith('\x1f\uFFFD\x08'):
227
+ # b'\x1f\xef\xbf\xbd\x08', decoded to UTF-8
228
+ # => the replacement character
229
+ # https://www.cogsci.ed.ac.uk/~richard/utf-8.cgi?input=%EF%BF%BD&mode=char
230
+ self.logger.debug('Got a garbled gzipped POST blob.')
231
+ self.add_feature('posted_data_info', "It was a POSTed gzipped blob, but the data has been garbled.")
232
+ decoded_posted_data = self.request['postData']['text']
222
233
  elif self.request['postData'].get('params'):
223
234
  # NOTE 2025-08-08
224
235
  # if the posted data mimetype is "application/x-www-form-urlencoded"
@@ -229,10 +240,11 @@ class URLNode(HarTreeNode):
229
240
  # TODO: some processing on the data part (it's often a json blob)
230
241
  self.logger.debug('Got a params POST.')
231
242
  decoded_posted_data = {entry['name']: entry['value'] for entry in self.request['postData']['params']}
243
+ self.add_feature('posted_data_info', "POST request as URL params.")
232
244
  else:
233
- # NOTE 2023-08-22: Blind attempt to base64 decode the data
234
245
  self.logger.debug('Got a normal POST')
235
246
  try:
247
+ # NOTE 2023-08-22: Blind attempt to base64 decode the data
236
248
  decoded_posted_data = self._dirty_safe_b64decode(self.request['postData']['text'])
237
249
  except binascii.Error:
238
250
  decoded_posted_data = self.request['postData']['text']
@@ -243,19 +255,22 @@ class URLNode(HarTreeNode):
243
255
  # NOTE: this should never happen as there should
244
256
  # be something in self.request['postData']['params']
245
257
  # and we already processed it before but just in case...
246
- self.logger.warning(f'Got a application/x-www-form-urlencoded without params key: {self.request}')
258
+ self.logger.debug('Got a application/x-www-form-urlencoded without params key')
247
259
  # 100% sure there will be websites where decode will fail
248
260
  try:
249
261
  if isinstance(decoded_posted_data, bytes):
250
262
  decoded_posted_data = decoded_posted_data.decode()
251
263
  if isinstance(decoded_posted_data, str):
252
264
  decoded_posted_data = unquote_plus(decoded_posted_data)
265
+ if isinstance(decoded_posted_data, str):
266
+ decoded_posted_data = parse_qs(decoded_posted_data)
267
+ self.add_feature('posted_data_info', "Successfully decoded POST request.")
253
268
  except Exception as e:
254
- self.logger.warning(f'Unable to unquote form data "{decoded_posted_data!r}": {e}')
269
+ self.logger.warning(f'Unable to unquote or parse form data "{decoded_posted_data!r}": {e}')
270
+ self.add_feature('posted_data_info', "Unable to decode POST request.")
255
271
  elif (mimetype_lower.startswith('application/json')
256
272
  or mimetype_lower.startswith('application/csp-report')
257
273
  or mimetype_lower.startswith('application/x-amz-json-1.1')
258
- or mimetype_lower.startswith('application/x-json-stream')
259
274
  or mimetype_lower.startswith('application/reports+json')
260
275
  or mimetype_lower.endswith('json')
261
276
  ):
@@ -264,64 +279,109 @@ class URLNode(HarTreeNode):
264
279
  try:
265
280
  # NOTE 2023-08-22: loads here may give us a int, float or a bool.
266
281
  decoded_posted_data = json.loads(decoded_posted_data)
282
+ self.add_feature('posted_data_info', "Successfully decoded POST request.")
267
283
  except Exception:
284
+ self.add_feature('posted_data_info', "Unable to decode POST request.")
268
285
  if isinstance(decoded_posted_data, (str, bytes)):
269
286
  self.logger.warning(f"Expected json, got garbage: {mimetype_lower} - {decoded_posted_data[:20]!r}[...]")
270
287
  else:
271
288
  self.logger.warning(f"Expected json, got garbage: {mimetype_lower} - {decoded_posted_data}")
272
-
289
+ elif mimetype_lower.startswith('application/x-json-stream'):
290
+ try:
291
+ to_stream: StringIO | BytesIO
292
+ if isinstance(decoded_posted_data, str):
293
+ to_stream = StringIO(decoded_posted_data)
294
+ elif isinstance(decoded_posted_data, bytes):
295
+ to_stream = BytesIO(decoded_posted_data)
296
+ else:
297
+ raise ValueError(f'Invalid type: {type(decoded_posted_data)}')
298
+ streamed_data = json_stream.load(to_stream)
299
+ decoded_posted_data = json_stream.to_standard_types(streamed_data)
300
+ self.add_feature('posted_data_info', "Successfully decoded POST request.")
301
+ except Exception:
302
+ if isinstance(decoded_posted_data, (str, bytes)):
303
+ self.logger.warning(f"Expected json stream, got garbage: {mimetype_lower} - {decoded_posted_data[:20]!r}[...]")
304
+ else:
305
+ self.logger.warning(f"Expected json stream, got garbage: {mimetype_lower} - {decoded_posted_data}")
306
+ self.add_feature('posted_data_info', "Unable to decode POST request.")
273
307
  elif mimetype_lower.startswith('multipart/form-data'):
274
308
  # FIXME multipart content (similar to email). Not totally sure what to do with it right now.
275
309
  self.logger.debug(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
276
- pass
310
+ self.add_feature('posted_data_info', f"Decoding {mimetype_lower} is not supported yet.")
277
311
  elif mimetype_lower.startswith('application/x-protobuf'):
278
312
  # FIXME If possible, decode?
279
313
  self.logger.debug(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
280
- pass
314
+ self.add_feature('posted_data_info', f"Decoding {mimetype_lower} is not supported yet.")
281
315
  elif mimetype_lower.startswith('text') and isinstance(decoded_posted_data, (str, bytes)):
282
316
  try:
283
317
  # NOTE 2023-08-22: Quite a few text entries are in fact json, give it a shot.
284
318
  # loads here may give us a int, float or a bool.
285
319
  decoded_posted_data = json.loads(decoded_posted_data)
320
+ self.add_feature('posted_data_info', "Decoded JSON out of POST request.")
286
321
  except Exception:
287
322
  # keep it as it is otherwise.
288
323
  pass
289
324
  elif mimetype_lower.endswith('javascript'):
290
325
  # keep it as it is
291
326
  self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
292
- pass
327
+ self.add_feature('posted_data_info', f"Pretty rendering of {mimetype_lower} is not supported yet.")
293
328
  elif mimetype_lower == '?':
294
329
  # Just skip it, no need to go in the warnings
295
330
  self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
296
- pass
297
- elif mimetype_lower in ['application/octet-stream', 'application/binary']:
331
+ self.add_feature('posted_data_info', f"Weird MimeType ({mimetype_lower}) is not supported yet.")
332
+ elif mimetype_lower == 'application/binary':
333
+ self.logger.warning(f'Got a POST {mimetype_lower}, not a broken gziped blob: {decoded_posted_data!r}')
334
+ self.add_feature('posted_data_info', f"MimeType ({mimetype_lower}) is not supported yet.")
335
+ elif mimetype_lower in ['application/octet-stream']:
298
336
  # Should flag it, maybe?
299
337
  self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
300
- pass
301
- elif mimetype_lower in ['application/unknown', 'application/grpc-web+proto']:
338
+ self.add_feature('posted_data_info', f"MimeType ({mimetype_lower}) is not supported yet.")
339
+ elif mimetype_lower in ['application/grpc-web+proto']:
340
+ # Can be decoded?
341
+ self.logger.warning(f'Got a POST {mimetype_lower} - can be decoded: {decoded_posted_data!r}')
342
+ self.add_feature('posted_data_info', f"MimeType ({mimetype_lower}) is not supported yet.")
343
+ elif mimetype_lower in ['application/unknown']:
302
344
  # Weird but already seen stuff
303
345
  self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
304
- pass
346
+ self.add_feature('posted_data_info', f"MimeType ({mimetype_lower}) is not supported yet.")
305
347
  else:
306
- self.logger.warning(f'Unexpected mime type: {mimetype_lower}')
348
+ self.logger.warning(f'Unexpected mime type: {mimetype_lower} - {decoded_posted_data!r}')
349
+ self.add_feature('posted_data_info', f"Unexpected MimeType ({mimetype_lower}) is not supported yet.")
307
350
  else:
308
351
  self.logger.warning(f'Missing mimetype in POST: {self.request["postData"]}')
352
+ self.add_feature('posted_data_info', "Missing MimeType, not sure what to do.")
309
353
 
310
354
  # NOTE 2023-08-22: Blind attempt to process the data as json
311
- if isinstance(decoded_posted_data, (str, bytes)):
355
+ if decoded_posted_data and isinstance(decoded_posted_data, (str, bytes)):
312
356
  try:
313
357
  decoded_posted_data = json.loads(decoded_posted_data)
314
358
  except Exception:
315
359
  pass
316
360
 
317
- if isinstance(decoded_posted_data, bytes):
361
+ if decoded_posted_data and isinstance(decoded_posted_data, bytes):
318
362
  # NOTE 2023-08-22: Blind attempt to decode the bytes
319
363
  # Try to decode it as utf-8
320
364
  try:
321
365
  decoded_posted_data = decoded_posted_data.decode('utf-8')
322
366
  except Exception:
323
367
  pass
368
+
324
369
  self.add_feature('posted_data', decoded_posted_data)
370
+ if 'postData' in self.request and self.request['postData'].get('mimeType'):
371
+ self.add_feature('posted_data_mimetype', self.request['postData']['mimeType'])
372
+ # Get size, post decode.
373
+ if not decoded_posted_data:
374
+ # empty or None, set to 0
375
+ self.add_feature('posted_data_size', 0)
376
+ elif isinstance(decoded_posted_data, (list, dict)):
377
+ # set size to the json dump
378
+ self.add_feature('posted_data_size', len(json.dumps(decoded_posted_data)))
379
+ elif isinstance(decoded_posted_data, (str, bytes)):
380
+ # length
381
+ self.add_feature('posted_data_size', len(decoded_posted_data))
382
+ else:
383
+ # Stringify and len
384
+ self.add_feature('posted_data_size', len(str(decoded_posted_data)))
325
385
 
326
386
  self.add_feature('response', har_entry['response'])
327
387
  try:
@@ -1,11 +1,11 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: har2tree
3
- Version: 1.31.4
3
+ Version: 1.31.6
4
4
  Summary: HTTP Archive (HAR) to ETE Toolkit generator
5
5
  License: BSD-3-Clause
6
6
  Author: Raphaël Vinot
7
7
  Author-email: raphael.vinot@circl.lu
8
- Requires-Python: >=3.9
8
+ Requires-Python: >=3.9,<4.0
9
9
  Classifier: Intended Audience :: Information Technology
10
10
  Classifier: Intended Audience :: Science/Research
11
11
  Classifier: Intended Audience :: Telecommunications Industry
@@ -24,12 +24,13 @@ Requires-Dist: Sphinx (>=8.2.3) ; (python_version >= "3.11") and (extra == "docs
24
24
  Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.13.4)
25
25
  Requires-Dist: ete3 (>=3.1.3)
26
26
  Requires-Dist: filetype (>=1.2.0)
27
+ Requires-Dist: json-stream (>=2.3.3,<3.0.0)
27
28
  Requires-Dist: legacy-cgi (>=2.6.3) ; python_version >= "3.13,<4.0"
28
29
  Requires-Dist: multipart (>=1.3.0,<2.0.0)
29
30
  Requires-Dist: numpy (<2.1) ; python_version < "3.10"
30
31
  Requires-Dist: numpy (<2.3) ; python_version < "3.11"
31
32
  Requires-Dist: numpy (>=2.3.2) ; python_version >= "3.11"
32
- Requires-Dist: publicsuffixlist (>=1.0.2.20250802)
33
+ Requires-Dist: publicsuffixlist (>=1.0.2.20250812)
33
34
  Requires-Dist: six (>=1.17.0) ; extra == "docs"
34
35
  Requires-Dist: tinycss2 (>=1.4.0)
35
36
  Requires-Dist: w3lib (>=2.3.1)
@@ -1,10 +1,10 @@
1
1
  har2tree/__init__.py,sha256=Na3mxHkUBq3rzYbxiLNJF37DxH5mcghSorjzXw5Teug,422
2
2
  har2tree/har2tree.py,sha256=47x9X5tY69f9SXkYJgJsnAaX2kxgXHgzFThGz6M86Zw,44495
3
3
  har2tree/helper.py,sha256=CgeXqfBeHs8SbkW7TRNKqJBTZLAu63KggQjbGHCZAGI,20681
4
- har2tree/nodes.py,sha256=LfnVP66dhGNcbcoaao32WzzYGSkjOPqrwHE9Swh3sug,31005
4
+ har2tree/nodes.py,sha256=Z-NKlcrDcBbEDwpPMFcQzqbCr3bOnb8xzPAxFI5GNSs,36111
5
5
  har2tree/parser.py,sha256=4yej1OcVYAIiLfzYZsO9WCw3WyM_ykDTuvpW7UO1ROE,3645
6
6
  har2tree/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
- har2tree-1.31.4.dist-info/LICENSE,sha256=Xa4EVROgJsEo10CW-ISCRiw0TtqdKz1JuM3BBLBM55c,1803
8
- har2tree-1.31.4.dist-info/METADATA,sha256=axJorvFoHABeSnKQhWEn2oVRBtD2ghtaY7mSSiRM1Q8,2154
9
- har2tree-1.31.4.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
10
- har2tree-1.31.4.dist-info/RECORD,,
7
+ har2tree-1.31.6.dist-info/LICENSE,sha256=Xa4EVROgJsEo10CW-ISCRiw0TtqdKz1JuM3BBLBM55c,1803
8
+ har2tree-1.31.6.dist-info/METADATA,sha256=r4-PI8eNiVboZ7B4NSarQhqxBhOsy7C1gi4e6q2G99Y,2203
9
+ har2tree-1.31.6.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
10
+ har2tree-1.31.6.dist-info/RECORD,,