har2tree 1.31.3__tar.gz → 1.31.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {har2tree-1.31.3 → har2tree-1.31.5}/PKG-INFO +6 -4
- {har2tree-1.31.3 → har2tree-1.31.5}/har2tree/nodes.py +84 -31
- {har2tree-1.31.3 → har2tree-1.31.5}/pyproject.toml +8 -6
- {har2tree-1.31.3 → har2tree-1.31.5}/LICENSE +0 -0
- {har2tree-1.31.3 → har2tree-1.31.5}/README.md +0 -0
- {har2tree-1.31.3 → har2tree-1.31.5}/har2tree/__init__.py +0 -0
- {har2tree-1.31.3 → har2tree-1.31.5}/har2tree/har2tree.py +0 -0
- {har2tree-1.31.3 → har2tree-1.31.5}/har2tree/helper.py +0 -0
- {har2tree-1.31.3 → har2tree-1.31.5}/har2tree/parser.py +0 -0
- {har2tree-1.31.3 → har2tree-1.31.5}/har2tree/py.typed +0 -0
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: har2tree
|
|
3
|
-
Version: 1.31.3
|
|
3
|
+
Version: 1.31.5
|
|
4
4
|
Summary: HTTP Archive (HAR) to ETE Toolkit generator
|
|
5
5
|
License: BSD-3-Clause
|
|
6
6
|
Author: Raphaël Vinot
|
|
7
7
|
Author-email: raphael.vinot@circl.lu
|
|
8
|
-
Requires-Python: >=3.9
|
|
8
|
+
Requires-Python: >=3.9,<4.0
|
|
9
9
|
Classifier: Intended Audience :: Information Technology
|
|
10
10
|
Classifier: Intended Audience :: Science/Research
|
|
11
11
|
Classifier: Intended Audience :: Telecommunications Industry
|
|
@@ -24,11 +24,13 @@ Requires-Dist: Sphinx (>=8.2.3) ; (python_version >= "3.11") and (extra == "docs
|
|
|
24
24
|
Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.13.4)
|
|
25
25
|
Requires-Dist: ete3 (>=3.1.3)
|
|
26
26
|
Requires-Dist: filetype (>=1.2.0)
|
|
27
|
+
Requires-Dist: json-stream (>=2.3.3,<3.0.0)
|
|
27
28
|
Requires-Dist: legacy-cgi (>=2.6.3) ; python_version >= "3.13,<4.0"
|
|
29
|
+
Requires-Dist: multipart (>=1.3.0,<2.0.0)
|
|
28
30
|
Requires-Dist: numpy (<2.1) ; python_version < "3.10"
|
|
29
31
|
Requires-Dist: numpy (<2.3) ; python_version < "3.11"
|
|
30
|
-
Requires-Dist: numpy (>=2.3.
|
|
31
|
-
Requires-Dist: publicsuffixlist (>=1.0.2.
|
|
32
|
+
Requires-Dist: numpy (>=2.3.2) ; python_version >= "3.11"
|
|
33
|
+
Requires-Dist: publicsuffixlist (>=1.0.2.20250809)
|
|
32
34
|
Requires-Dist: six (>=1.17.0) ; extra == "docs"
|
|
33
35
|
Requires-Dist: tinycss2 (>=1.4.0)
|
|
34
36
|
Requires-Dist: w3lib (>=2.3.1)
|
|
@@ -15,13 +15,15 @@ from base64 import b64decode
|
|
|
15
15
|
from datetime import datetime, timedelta
|
|
16
16
|
from functools import lru_cache, cached_property
|
|
17
17
|
from hashlib import sha256
|
|
18
|
-
from io import BytesIO
|
|
18
|
+
from io import BytesIO, StringIO
|
|
19
19
|
from pathlib import Path
|
|
20
20
|
from typing import Any
|
|
21
21
|
from collections.abc import MutableMapping
|
|
22
|
-
from urllib.parse import unquote_plus, urlparse, urljoin
|
|
22
|
+
from urllib.parse import unquote_plus, urlparse, urljoin, parse_qs
|
|
23
23
|
|
|
24
24
|
import filetype # type: ignore
|
|
25
|
+
import json_stream # type: ignore
|
|
26
|
+
|
|
25
27
|
from bs4 import BeautifulSoup
|
|
26
28
|
from ete3 import TreeNode # type: ignore
|
|
27
29
|
from publicsuffixlist import PublicSuffixList # type: ignore
|
|
@@ -211,32 +213,52 @@ class URLNode(HarTreeNode):
|
|
|
211
213
|
if 'user_agent' not in self.features:
|
|
212
214
|
self.add_feature('user_agent', '')
|
|
213
215
|
|
|
214
|
-
if 'method' in self.request and self.request['method'] == 'POST'
|
|
215
|
-
|
|
216
|
-
if self.request
|
|
217
|
-
|
|
218
|
-
|
|
216
|
+
if 'method' in self.request and self.request['method'] == 'POST':
|
|
217
|
+
decoded_posted_data: list[Any] | str | bytes | int | float | bool | dict[str, str] | dict[str, list[str]] | None = None
|
|
218
|
+
if 'postData' not in self.request or 'text' not in self.request['postData']:
|
|
219
|
+
self.logger.debug('POST request with no content.')
|
|
220
|
+
elif not self.request['postData']['text']:
|
|
221
|
+
# If the POST content is empty
|
|
222
|
+
self.logger.debug('Empty POST request.')
|
|
223
|
+
decoded_posted_data = ''
|
|
224
|
+
elif self.request['postData'].get('params'):
|
|
225
|
+
# NOTE 2025-08-08
|
|
226
|
+
# if the posted data mimetype is "application/x-www-form-urlencoded"
|
|
227
|
+
# the HAR contains the decoded entry in the params key
|
|
228
|
+
# The params key is a list of dicts with a key and a value
|
|
229
|
+
# {"name": <key>, "value": <data>}
|
|
230
|
+
# I'd rather have it as {<key>: <data>}
|
|
231
|
+
# TODO: some processing on the data part (it's often a json blob)
|
|
232
|
+
self.logger.debug('Got a params POST.')
|
|
233
|
+
decoded_posted_data = {entry['name']: entry['value'] for entry in self.request['postData']['params']}
|
|
234
|
+
else:
|
|
219
235
|
# NOTE 2023-08-22: Blind attempt to base64 decode the data
|
|
236
|
+
self.logger.debug('Got a normal POST')
|
|
220
237
|
try:
|
|
221
|
-
decoded_posted_data = self._dirty_safe_b64decode(
|
|
238
|
+
decoded_posted_data = self._dirty_safe_b64decode(self.request['postData']['text'])
|
|
222
239
|
except binascii.Error:
|
|
223
|
-
decoded_posted_data =
|
|
240
|
+
decoded_posted_data = self.request['postData']['text']
|
|
224
241
|
if 'mimeType' in self.request['postData']:
|
|
225
242
|
# make it easier to compare.
|
|
226
243
|
mimetype_lower = self.request['postData']['mimeType'].lower()
|
|
227
244
|
if mimetype_lower.startswith('application/x-www-form-urlencoded'):
|
|
245
|
+
# NOTE: this should never happen as there should
|
|
246
|
+
# be something in self.request['postData']['params']
|
|
247
|
+
# and we already processed it before but just in case...
|
|
248
|
+
self.logger.debug('Got a application/x-www-form-urlencoded without params key')
|
|
228
249
|
# 100% sure there will be websites where decode will fail
|
|
229
250
|
try:
|
|
230
251
|
if isinstance(decoded_posted_data, bytes):
|
|
231
252
|
decoded_posted_data = decoded_posted_data.decode()
|
|
232
253
|
if isinstance(decoded_posted_data, str):
|
|
233
254
|
decoded_posted_data = unquote_plus(decoded_posted_data)
|
|
255
|
+
if isinstance(decoded_posted_data, str):
|
|
256
|
+
decoded_posted_data = parse_qs(decoded_posted_data)
|
|
234
257
|
except Exception as e:
|
|
235
|
-
self.logger.warning(f'Unable to unquote form data "{decoded_posted_data!r}": {e}')
|
|
258
|
+
self.logger.warning(f'Unable to unquote or parse form data "{decoded_posted_data!r}": {e}')
|
|
236
259
|
elif (mimetype_lower.startswith('application/json')
|
|
237
260
|
or mimetype_lower.startswith('application/csp-report')
|
|
238
261
|
or mimetype_lower.startswith('application/x-amz-json-1.1')
|
|
239
|
-
or mimetype_lower.startswith('application/x-json-stream')
|
|
240
262
|
or mimetype_lower.startswith('application/reports+json')
|
|
241
263
|
or mimetype_lower.endswith('json')
|
|
242
264
|
):
|
|
@@ -247,17 +269,34 @@ class URLNode(HarTreeNode):
|
|
|
247
269
|
decoded_posted_data = json.loads(decoded_posted_data)
|
|
248
270
|
except Exception:
|
|
249
271
|
if isinstance(decoded_posted_data, (str, bytes)):
|
|
250
|
-
self.logger.
|
|
272
|
+
self.logger.warning(f"Expected json, got garbage: {mimetype_lower} - {decoded_posted_data[:20]!r}[...]")
|
|
251
273
|
else:
|
|
252
|
-
self.logger.
|
|
253
|
-
|
|
274
|
+
self.logger.warning(f"Expected json, got garbage: {mimetype_lower} - {decoded_posted_data}")
|
|
275
|
+
elif mimetype_lower.startswith('application/x-json-stream'):
|
|
276
|
+
try:
|
|
277
|
+
to_stream: StringIO | BytesIO
|
|
278
|
+
if isinstance(decoded_posted_data, str):
|
|
279
|
+
to_stream = StringIO(decoded_posted_data)
|
|
280
|
+
elif isinstance(decoded_posted_data, bytes):
|
|
281
|
+
to_stream = BytesIO(decoded_posted_data)
|
|
282
|
+
else:
|
|
283
|
+
raise ValueError(f'Invalid type: {type(decoded_posted_data)}')
|
|
284
|
+
streamed_data = json_stream.load(to_stream)
|
|
285
|
+
decoded_posted_data = json_stream.to_standard_types(streamed_data)
|
|
286
|
+
except Exception:
|
|
287
|
+
if isinstance(decoded_posted_data, (str, bytes)):
|
|
288
|
+
self.logger.warning(f"Expected json stream, got garbage: {mimetype_lower} - {decoded_posted_data[:20]!r}[...]")
|
|
289
|
+
else:
|
|
290
|
+
self.logger.warning(f"Expected json stream, got garbage: {mimetype_lower} - {decoded_posted_data}")
|
|
254
291
|
elif mimetype_lower.startswith('multipart/form-data'):
|
|
255
292
|
# FIXME multipart content (similar to email). Not totally sure what do do with it tight now.
|
|
293
|
+
self.logger.debug(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
|
|
256
294
|
pass
|
|
257
295
|
elif mimetype_lower.startswith('application/x-protobuf'):
|
|
258
296
|
# FIXME If possible, decode?
|
|
297
|
+
self.logger.debug(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
|
|
259
298
|
pass
|
|
260
|
-
elif mimetype_lower.startswith('text'):
|
|
299
|
+
elif mimetype_lower.startswith('text') and isinstance(decoded_posted_data, (str, bytes)):
|
|
261
300
|
try:
|
|
262
301
|
# NOTE 2023-08-22: Quite a few text entries are in fact json, give it a shot.
|
|
263
302
|
# loads here may give us a int, float or a bool.
|
|
@@ -267,34 +306,48 @@ class URLNode(HarTreeNode):
|
|
|
267
306
|
pass
|
|
268
307
|
elif mimetype_lower.endswith('javascript'):
|
|
269
308
|
# keep it as it is
|
|
309
|
+
self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
|
|
270
310
|
pass
|
|
271
311
|
elif mimetype_lower == '?':
|
|
272
312
|
# Just skip it, no need to go in the warnings
|
|
313
|
+
self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
|
|
273
314
|
pass
|
|
274
|
-
elif mimetype_lower
|
|
315
|
+
elif mimetype_lower == 'application/binary':
|
|
316
|
+
# generally a broken gzipped blob
|
|
317
|
+
self.logger.debug(f'Got a POST {mimetype_lower}, most probably a broken gziped blob: {decoded_posted_data!r}')
|
|
318
|
+
elif mimetype_lower in ['application/octet-stream']:
|
|
275
319
|
# Should flag it, maybe?
|
|
320
|
+
self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
|
|
276
321
|
pass
|
|
277
|
-
elif mimetype_lower in ['application/
|
|
322
|
+
elif mimetype_lower in ['application/grpc-web+proto']:
|
|
323
|
+
# Can be decoded?
|
|
324
|
+
self.logger.warning(f'Got a POST {mimetype_lower} - can be decoded: {decoded_posted_data!r}')
|
|
325
|
+
elif mimetype_lower in ['application/unknown']:
|
|
278
326
|
# Weird but already seen stuff
|
|
327
|
+
self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
|
|
279
328
|
pass
|
|
280
329
|
else:
|
|
281
330
|
self.logger.warning(f'Unexpected mime type: {mimetype_lower}')
|
|
331
|
+
else:
|
|
332
|
+
self.logger.warning(f'Missing mimetype in POST: {self.request["postData"]}')
|
|
282
333
|
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
334
|
+
# NOTE 2023-08-22: Blind attempt to process the data as json
|
|
335
|
+
if isinstance(decoded_posted_data, (str, bytes)):
|
|
336
|
+
try:
|
|
337
|
+
decoded_posted_data = json.loads(decoded_posted_data)
|
|
338
|
+
except Exception:
|
|
339
|
+
pass
|
|
289
340
|
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
341
|
+
if isinstance(decoded_posted_data, bytes):
|
|
342
|
+
# NOTE 2023-08-22: Blind attempt to decode the bytes
|
|
343
|
+
# Try to decode it as utf-8
|
|
344
|
+
try:
|
|
345
|
+
decoded_posted_data = decoded_posted_data.decode('utf-8')
|
|
346
|
+
except Exception:
|
|
347
|
+
pass
|
|
348
|
+
self.add_feature('posted_data', decoded_posted_data)
|
|
349
|
+
if 'postData' in self.request and self.request['postData'].get('mimeType'):
|
|
350
|
+
self.add_feature('posted_data_mimetype', self.request['postData']['mimeType'])
|
|
298
351
|
|
|
299
352
|
self.add_feature('response', har_entry['response'])
|
|
300
353
|
try:
|
|
@@ -1,29 +1,31 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "har2tree"
|
|
3
|
-
version = "1.31.3"
|
|
3
|
+
version = "1.31.5"
|
|
4
4
|
description = "HTTP Archive (HAR) to ETE Toolkit generator"
|
|
5
5
|
authors = [
|
|
6
6
|
{name="Raphaël Vinot", email="raphael.vinot@circl.lu"}
|
|
7
7
|
]
|
|
8
8
|
license = "BSD-3-Clause"
|
|
9
9
|
readme = "README.md"
|
|
10
|
-
requires-python = ">=3.9"
|
|
10
|
+
requires-python = ">=3.9,<4.0"
|
|
11
11
|
|
|
12
12
|
dynamic = [ "classifiers" ]
|
|
13
13
|
|
|
14
14
|
dependencies = [
|
|
15
15
|
"ete3 (>=3.1.3)",
|
|
16
16
|
"beautifulsoup4[charset-normalizer,lxml] (>=4.13.4)",
|
|
17
|
-
"publicsuffixlist (>=1.0.2.
|
|
17
|
+
"publicsuffixlist (>=1.0.2.20250809)",
|
|
18
18
|
"filetype (>=1.2.0)",
|
|
19
19
|
# poetry up fails with the version of numpy forced for python < 3.10.
|
|
20
20
|
# The work around is to comment it, run poetry up, uncomment it. and run poetry update.
|
|
21
21
|
"numpy (<2.1) ; python_version < \"3.10\"",
|
|
22
22
|
"numpy (<2.3) ; python_version < \"3.11\"",
|
|
23
|
-
"numpy (>=2.3.
|
|
23
|
+
"numpy (>=2.3.2) ; python_version >= \"3.11\"",
|
|
24
24
|
"w3lib (>=2.3.1)",
|
|
25
25
|
"tinycss2 (>=1.4.0)",
|
|
26
26
|
"legacy-cgi (>=2.6.3) ; python_version >= \"3.13,<4.0\"",
|
|
27
|
+
"multipart (>=1.3.0,<2.0.0)",
|
|
28
|
+
"json-stream (>=2.3.3,<3.0.0)",
|
|
27
29
|
]
|
|
28
30
|
|
|
29
31
|
[project.urls]
|
|
@@ -45,9 +47,9 @@ classifiers = [
|
|
|
45
47
|
docs = ["Sphinx (>=8.2.3) ; python_version >= \"3.11\"", "six (>=1.17.0)"]
|
|
46
48
|
|
|
47
49
|
[tool.poetry.group.dev.dependencies]
|
|
48
|
-
mypy = "^1.17.
|
|
50
|
+
mypy = "^1.17.1"
|
|
49
51
|
pytest-cov = "^6.2.1"
|
|
50
|
-
coverage = "^7.
|
|
52
|
+
coverage = "^7.10.3"
|
|
51
53
|
types-beautifulsoup4 = "^4.12.0.20250516"
|
|
52
54
|
|
|
53
55
|
[build-system]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|