har2tree 1.31.3__py3-none-any.whl → 1.31.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
har2tree/nodes.py CHANGED
@@ -15,13 +15,15 @@ from base64 import b64decode
15
15
  from datetime import datetime, timedelta
16
16
  from functools import lru_cache, cached_property
17
17
  from hashlib import sha256
18
- from io import BytesIO
18
+ from io import BytesIO, StringIO
19
19
  from pathlib import Path
20
20
  from typing import Any
21
21
  from collections.abc import MutableMapping
22
- from urllib.parse import unquote_plus, urlparse, urljoin
22
+ from urllib.parse import unquote_plus, urlparse, urljoin, parse_qs
23
23
 
24
24
  import filetype # type: ignore
25
+ import json_stream # type: ignore
26
+
25
27
  from bs4 import BeautifulSoup
26
28
  from ete3 import TreeNode # type: ignore
27
29
  from publicsuffixlist import PublicSuffixList # type: ignore
@@ -211,32 +213,52 @@ class URLNode(HarTreeNode):
211
213
  if 'user_agent' not in self.features:
212
214
  self.add_feature('user_agent', '')
213
215
 
214
- if 'method' in self.request and self.request['method'] == 'POST' and 'postData' in self.request:
215
- # If the content is empty, we don't care
216
- if self.request['postData']['text']:
217
- _posted_data: str = self.request['postData']['text']
218
- decoded_posted_data: str | bytes | int | float | bool
216
+ if 'method' in self.request and self.request['method'] == 'POST':
217
+ decoded_posted_data: list[Any] | str | bytes | int | float | bool | dict[str, str] | dict[str, list[str]] | None = None
218
+ if 'postData' not in self.request or 'text' not in self.request['postData']:
219
+ self.logger.debug('POST request with no content.')
220
+ elif not self.request['postData']['text']:
221
+ # If the POST content is empty
222
+ self.logger.debug('Empty POST request.')
223
+ decoded_posted_data = ''
224
+ elif self.request['postData'].get('params'):
225
+ # NOTE 2025-08-08
226
+ # if the posted data mimetype is "application/x-www-form-urlencoded"
227
+ # the HAR contains the decoded entry in the params key
228
+ # The params key is a list of dicts with a key and a value
229
+ # {"name": <key>, "value": <data>}
230
+ # I'd rather have it as {<key>: <data>}
231
+ # TODO: some processing on the data part (it's often a json blob)
232
+ self.logger.debug('Got a params POST.')
233
+ decoded_posted_data = {entry['name']: entry['value'] for entry in self.request['postData']['params']}
234
+ else:
219
235
  # NOTE 2023-08-22: Blind attempt to base64 decode the data
236
+ self.logger.debug('Got a normal POST')
220
237
  try:
221
- decoded_posted_data = self._dirty_safe_b64decode(_posted_data)
238
+ decoded_posted_data = self._dirty_safe_b64decode(self.request['postData']['text'])
222
239
  except binascii.Error:
223
- decoded_posted_data = _posted_data
240
+ decoded_posted_data = self.request['postData']['text']
224
241
  if 'mimeType' in self.request['postData']:
225
242
  # make it easier to compare.
226
243
  mimetype_lower = self.request['postData']['mimeType'].lower()
227
244
  if mimetype_lower.startswith('application/x-www-form-urlencoded'):
245
+ # NOTE: this should never happen as there should
246
+ # be something in self.request['postData']['params']
247
+ # and we already processed it before but just in case...
248
+ self.logger.debug('Got a application/x-www-form-urlencoded without params key')
228
249
  # 100% sure there will be websites where decode will fail
229
250
  try:
230
251
  if isinstance(decoded_posted_data, bytes):
231
252
  decoded_posted_data = decoded_posted_data.decode()
232
253
  if isinstance(decoded_posted_data, str):
233
254
  decoded_posted_data = unquote_plus(decoded_posted_data)
255
+ if isinstance(decoded_posted_data, str):
256
+ decoded_posted_data = parse_qs(decoded_posted_data)
234
257
  except Exception as e:
235
- self.logger.warning(f'Unable to unquote form data "{decoded_posted_data!r}": {e}')
258
+ self.logger.warning(f'Unable to unquote or parse form data "{decoded_posted_data!r}": {e}')
236
259
  elif (mimetype_lower.startswith('application/json')
237
260
  or mimetype_lower.startswith('application/csp-report')
238
261
  or mimetype_lower.startswith('application/x-amz-json-1.1')
239
- or mimetype_lower.startswith('application/x-json-stream')
240
262
  or mimetype_lower.startswith('application/reports+json')
241
263
  or mimetype_lower.endswith('json')
242
264
  ):
@@ -247,17 +269,34 @@ class URLNode(HarTreeNode):
247
269
  decoded_posted_data = json.loads(decoded_posted_data)
248
270
  except Exception:
249
271
  if isinstance(decoded_posted_data, (str, bytes)):
250
- self.logger.debug(f"Expected json, got garbage: {mimetype_lower} - {decoded_posted_data[:20]!r}[...]")
272
+ self.logger.warning(f"Expected json, got garbage: {mimetype_lower} - {decoded_posted_data[:20]!r}[...]")
251
273
  else:
252
- self.logger.debug(f"Expected json, got garbage: {mimetype_lower} - {decoded_posted_data}")
253
-
274
+ self.logger.warning(f"Expected json, got garbage: {mimetype_lower} - {decoded_posted_data}")
275
+ elif mimetype_lower.startswith('application/x-json-stream'):
276
+ try:
277
+ to_stream: StringIO | BytesIO
278
+ if isinstance(decoded_posted_data, str):
279
+ to_stream = StringIO(decoded_posted_data)
280
+ elif isinstance(decoded_posted_data, bytes):
281
+ to_stream = BytesIO(decoded_posted_data)
282
+ else:
283
+ raise ValueError(f'Invalid type: {type(decoded_posted_data)}')
284
+ streamed_data = json_stream.load(to_stream)
285
+ decoded_posted_data = json_stream.to_standard_types(streamed_data)
286
+ except Exception:
287
+ if isinstance(decoded_posted_data, (str, bytes)):
288
+ self.logger.warning(f"Expected json stream, got garbage: {mimetype_lower} - {decoded_posted_data[:20]!r}[...]")
289
+ else:
290
+ self.logger.warning(f"Expected json stream, got garbage: {mimetype_lower} - {decoded_posted_data}")
254
291
  elif mimetype_lower.startswith('multipart/form-data'):
255
292
  # FIXME multipart content (similar to email). Not totally sure what to do with it right now.
293
+ self.logger.debug(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
256
294
  pass
257
295
  elif mimetype_lower.startswith('application/x-protobuf'):
258
296
  # FIXME If possible, decode?
297
+ self.logger.debug(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
259
298
  pass
260
- elif mimetype_lower.startswith('text'):
299
+ elif mimetype_lower.startswith('text') and isinstance(decoded_posted_data, (str, bytes)):
261
300
  try:
262
301
  # NOTE 2023-08-22: Quite a few text entries are in fact json, give it a shot.
263
302
  # loads here may give us a int, float or a bool.
@@ -267,34 +306,48 @@ class URLNode(HarTreeNode):
267
306
  pass
268
307
  elif mimetype_lower.endswith('javascript'):
269
308
  # keep it as it is
309
+ self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
270
310
  pass
271
311
  elif mimetype_lower == '?':
272
312
  # Just skip it, no need to go in the warnings
313
+ self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
273
314
  pass
274
- elif mimetype_lower in ['application/octet-stream', 'application/binary']:
315
+ elif mimetype_lower == 'application/binary':
316
+ # generally a broken gzipped blob
317
+ self.logger.debug(f'Got a POST {mimetype_lower}, most probably a broken gziped blob: {decoded_posted_data!r}')
318
+ elif mimetype_lower in ['application/octet-stream']:
275
319
  # Should flag it, maybe?
320
+ self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
276
321
  pass
277
- elif mimetype_lower in ['application/unknown', 'application/grpc-web+proto']:
322
+ elif mimetype_lower in ['application/grpc-web+proto']:
323
+ # Can be decoded?
324
+ self.logger.warning(f'Got a POST {mimetype_lower} - can be decoded: {decoded_posted_data!r}')
325
+ elif mimetype_lower in ['application/unknown']:
278
326
  # Weird but already seen stuff
327
+ self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
279
328
  pass
280
329
  else:
281
330
  self.logger.warning(f'Unexpected mime type: {mimetype_lower}')
331
+ else:
332
+ self.logger.warning(f'Missing mimetype in POST: {self.request["postData"]}')
282
333
 
283
- # NOTE 2023-08-22: Blind attempt to process the data as json
284
- if isinstance(decoded_posted_data, (str, bytes)):
285
- try:
286
- decoded_posted_data = json.loads(decoded_posted_data)
287
- except Exception:
288
- pass
334
+ # NOTE 2023-08-22: Blind attempt to process the data as json
335
+ if isinstance(decoded_posted_data, (str, bytes)):
336
+ try:
337
+ decoded_posted_data = json.loads(decoded_posted_data)
338
+ except Exception:
339
+ pass
289
340
 
290
- if isinstance(decoded_posted_data, bytes):
291
- # NOTE 2023-08-22: Blind attempt to decode the bytes
292
- # Try to decode it as utf-8
293
- try:
294
- decoded_posted_data = decoded_posted_data.decode('utf-8')
295
- except Exception:
296
- pass
297
- self.add_feature('posted_data', decoded_posted_data)
341
+ if isinstance(decoded_posted_data, bytes):
342
+ # NOTE 2023-08-22: Blind attempt to decode the bytes
343
+ # Try to decode it as utf-8
344
+ try:
345
+ decoded_posted_data = decoded_posted_data.decode('utf-8')
346
+ except Exception:
347
+ pass
348
+ self.add_feature('posted_data', decoded_posted_data)
349
+ if 'postData' in self.request and self.request['postData'].get('mimeType'):
350
+ self.add_feature('posted_data_mimetype', self.request['postData']['mimeType'])
298
351
 
299
352
  self.add_feature('response', har_entry['response'])
300
353
  try:
@@ -1,11 +1,11 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: har2tree
3
- Version: 1.31.3
3
+ Version: 1.31.5
4
4
  Summary: HTTP Archive (HAR) to ETE Toolkit generator
5
5
  License: BSD-3-Clause
6
6
  Author: Raphaël Vinot
7
7
  Author-email: raphael.vinot@circl.lu
8
- Requires-Python: >=3.9
8
+ Requires-Python: >=3.9,<4.0
9
9
  Classifier: Intended Audience :: Information Technology
10
10
  Classifier: Intended Audience :: Science/Research
11
11
  Classifier: Intended Audience :: Telecommunications Industry
@@ -24,11 +24,13 @@ Requires-Dist: Sphinx (>=8.2.3) ; (python_version >= "3.11") and (extra == "docs
24
24
  Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.13.4)
25
25
  Requires-Dist: ete3 (>=3.1.3)
26
26
  Requires-Dist: filetype (>=1.2.0)
27
+ Requires-Dist: json-stream (>=2.3.3,<3.0.0)
27
28
  Requires-Dist: legacy-cgi (>=2.6.3) ; python_version >= "3.13,<4.0"
29
+ Requires-Dist: multipart (>=1.3.0,<2.0.0)
28
30
  Requires-Dist: numpy (<2.1) ; python_version < "3.10"
29
31
  Requires-Dist: numpy (<2.3) ; python_version < "3.11"
30
- Requires-Dist: numpy (>=2.3.1) ; python_version >= "3.11"
31
- Requires-Dist: publicsuffixlist (>=1.0.2.20250719)
32
+ Requires-Dist: numpy (>=2.3.2) ; python_version >= "3.11"
33
+ Requires-Dist: publicsuffixlist (>=1.0.2.20250809)
32
34
  Requires-Dist: six (>=1.17.0) ; extra == "docs"
33
35
  Requires-Dist: tinycss2 (>=1.4.0)
34
36
  Requires-Dist: w3lib (>=2.3.1)
@@ -1,10 +1,10 @@
1
1
  har2tree/__init__.py,sha256=Na3mxHkUBq3rzYbxiLNJF37DxH5mcghSorjzXw5Teug,422
2
2
  har2tree/har2tree.py,sha256=47x9X5tY69f9SXkYJgJsnAaX2kxgXHgzFThGz6M86Zw,44495
3
3
  har2tree/helper.py,sha256=CgeXqfBeHs8SbkW7TRNKqJBTZLAu63KggQjbGHCZAGI,20681
4
- har2tree/nodes.py,sha256=CC3NseEaM455JOpPqjfTAQ-dwWiGWmzlceGSSeTwoRo,28951
4
+ har2tree/nodes.py,sha256=a-5tk_AbnIklbdujlesb_1E0KGnSyK0OsTnbnd5i0D4,32961
5
5
  har2tree/parser.py,sha256=4yej1OcVYAIiLfzYZsO9WCw3WyM_ykDTuvpW7UO1ROE,3645
6
6
  har2tree/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
- har2tree-1.31.3.dist-info/LICENSE,sha256=Xa4EVROgJsEo10CW-ISCRiw0TtqdKz1JuM3BBLBM55c,1803
8
- har2tree-1.31.3.dist-info/METADATA,sha256=PSDu0bnPUYje8It-uyZfcDVbo4TTTC7RCzH-2CRAc0U,2112
9
- har2tree-1.31.3.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
10
- har2tree-1.31.3.dist-info/RECORD,,
7
+ har2tree-1.31.5.dist-info/LICENSE,sha256=Xa4EVROgJsEo10CW-ISCRiw0TtqdKz1JuM3BBLBM55c,1803
8
+ har2tree-1.31.5.dist-info/METADATA,sha256=5QfFL4ESUuWJn7JuxcnLSgw70q3MGZoDfe9PJFS5JkA,2203
9
+ har2tree-1.31.5.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
10
+ har2tree-1.31.5.dist-info/RECORD,,