har2tree 1.31.2__tar.gz → 1.31.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: har2tree
3
- Version: 1.31.2
3
+ Version: 1.31.4
4
4
  Summary: HTTP Archive (HAR) to ETE Toolkit generator
5
5
  License: BSD-3-Clause
6
6
  Author: Raphaël Vinot
@@ -25,10 +25,11 @@ Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.13.4)
25
25
  Requires-Dist: ete3 (>=3.1.3)
26
26
  Requires-Dist: filetype (>=1.2.0)
27
27
  Requires-Dist: legacy-cgi (>=2.6.3) ; python_version >= "3.13,<4.0"
28
+ Requires-Dist: multipart (>=1.3.0,<2.0.0)
28
29
  Requires-Dist: numpy (<2.1) ; python_version < "3.10"
29
30
  Requires-Dist: numpy (<2.3) ; python_version < "3.11"
30
- Requires-Dist: numpy (>=2.3.1) ; python_version >= "3.11"
31
- Requires-Dist: publicsuffixlist (>=1.0.2.20250627)
31
+ Requires-Dist: numpy (>=2.3.2) ; python_version >= "3.11"
32
+ Requires-Dist: publicsuffixlist (>=1.0.2.20250802)
32
33
  Requires-Dist: six (>=1.17.0) ; extra == "docs"
33
34
  Requires-Dist: tinycss2 (>=1.4.0)
34
35
  Requires-Dist: w3lib (>=2.3.1)
@@ -286,7 +286,7 @@ def make_soup(html: bytes) -> BeautifulSoup:
286
286
  try:
287
287
  return BeautifulSoup(doc_as_str, 'lxml')
288
288
  except Exception as e:
289
- logger.warning(f'Unable to parse doc with lxml, try again with the default parser: {e}')
289
+ logger.info(f'Unable to parse doc with lxml, try again with the default parser: {e}')
290
290
  # Fallback to the default python parser
291
291
  return BeautifulSoup(doc_as_str, 'html.parser')
292
292
 
@@ -211,20 +211,39 @@ class URLNode(HarTreeNode):
211
211
  if 'user_agent' not in self.features:
212
212
  self.add_feature('user_agent', '')
213
213
 
214
- if 'method' in self.request and self.request['method'] == 'POST' and 'postData' in self.request:
215
- # If the content is empty, we don't care
216
- if self.request['postData']['text']:
217
- _posted_data: str = self.request['postData']['text']
218
- decoded_posted_data: str | bytes | int | float | bool
214
+ if 'method' in self.request and self.request['method'] == 'POST':
215
+ decoded_posted_data: str | bytes | int | float | bool | dict[str, str] | None = None
216
+ if 'postData' not in self.request or 'text' not in self.request['postData']:
217
+ self.logger.debug('POST request with no content.')
218
+ elif not self.request['postData']['text']:
219
+ # If the POST content is empty
220
+ self.logger.debug('Empty POST request.')
221
+ decoded_posted_data = ''
222
+ elif self.request['postData'].get('params'):
223
+ # NOTE 2025-08-08
224
+ # if the posted data mimetype is "application/x-www-form-urlencoded"
225
+ # the HAR contains the decoded entry in the params key
226
+ # The params key is a list of dicts with a key and a value
227
+ # {"name": <key>, "value": <data>}
228
+ # I'd rather have it as {<key>: <data>}
229
+ # TODO: some processing on the data part (it's often a json blob)
230
+ self.logger.debug('Got a params POST.')
231
+ decoded_posted_data = {entry['name']: entry['value'] for entry in self.request['postData']['params']}
232
+ else:
219
233
  # NOTE 2023-08-22: Blind attempt to base64 decode the data
234
+ self.logger.debug('Got a normal POST')
220
235
  try:
221
- decoded_posted_data = self._dirty_safe_b64decode(_posted_data)
236
+ decoded_posted_data = self._dirty_safe_b64decode(self.request['postData']['text'])
222
237
  except binascii.Error:
223
- decoded_posted_data = _posted_data
238
+ decoded_posted_data = self.request['postData']['text']
224
239
  if 'mimeType' in self.request['postData']:
225
240
  # make it easier to compare.
226
241
  mimetype_lower = self.request['postData']['mimeType'].lower()
227
242
  if mimetype_lower.startswith('application/x-www-form-urlencoded'):
243
+ # NOTE: this should never happen as there should
244
+ # be something in self.request['postData']['params']
245
+ # and we already processed it before but just in case...
246
+ self.logger.warning(f'Got a application/x-www-form-urlencoded without params key: {self.request}')
228
247
  # 100% sure there will be websites where decode will fail
229
248
  try:
230
249
  if isinstance(decoded_posted_data, bytes):
@@ -247,17 +266,19 @@ class URLNode(HarTreeNode):
247
266
  decoded_posted_data = json.loads(decoded_posted_data)
248
267
  except Exception:
249
268
  if isinstance(decoded_posted_data, (str, bytes)):
250
- self.logger.debug(f"Expected json, got garbage: {mimetype_lower} - {decoded_posted_data[:20]!r}[...]")
269
+ self.logger.warning(f"Expected json, got garbage: {mimetype_lower} - {decoded_posted_data[:20]!r}[...]")
251
270
  else:
252
- self.logger.debug(f"Expected json, got garbage: {mimetype_lower} - {decoded_posted_data}")
271
+ self.logger.warning(f"Expected json, got garbage: {mimetype_lower} - {decoded_posted_data}")
253
272
 
254
273
  elif mimetype_lower.startswith('multipart/form-data'):
255
274
  # FIXME multipart content (similar to email). Not totally sure what to do with it right now.
275
+ self.logger.debug(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
256
276
  pass
257
277
  elif mimetype_lower.startswith('application/x-protobuf'):
258
278
  # FIXME If possible, decode?
279
+ self.logger.debug(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
259
280
  pass
260
- elif mimetype_lower.startswith('text'):
281
+ elif mimetype_lower.startswith('text') and isinstance(decoded_posted_data, (str, bytes)):
261
282
  try:
262
283
  # NOTE 2023-08-22: Quite a few text entries are in fact json, give it a shot.
263
284
  # loads here may give us a int, float or a bool.
@@ -267,34 +288,40 @@ class URLNode(HarTreeNode):
267
288
  pass
268
289
  elif mimetype_lower.endswith('javascript'):
269
290
  # keep it as it is
291
+ self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
270
292
  pass
271
293
  elif mimetype_lower == '?':
272
294
  # Just skip it, no need to go in the warnings
295
+ self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
273
296
  pass
274
297
  elif mimetype_lower in ['application/octet-stream', 'application/binary']:
275
298
  # Should flag it, maybe?
299
+ self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
276
300
  pass
277
301
  elif mimetype_lower in ['application/unknown', 'application/grpc-web+proto']:
278
302
  # Weird but already seen stuff
303
+ self.logger.warning(f'Got a POST {mimetype_lower}: {decoded_posted_data!r}')
279
304
  pass
280
305
  else:
281
306
  self.logger.warning(f'Unexpected mime type: {mimetype_lower}')
307
+ else:
308
+ self.logger.warning(f'Missing mimetype in POST: {self.request["postData"]}')
282
309
 
283
- # NOTE 2023-08-22: Blind attempt to process the data as json
284
- if isinstance(decoded_posted_data, (str, bytes)):
285
- try:
286
- decoded_posted_data = json.loads(decoded_posted_data)
287
- except Exception:
288
- pass
310
+ # NOTE 2023-08-22: Blind attempt to process the data as json
311
+ if isinstance(decoded_posted_data, (str, bytes)):
312
+ try:
313
+ decoded_posted_data = json.loads(decoded_posted_data)
314
+ except Exception:
315
+ pass
289
316
 
290
- if isinstance(decoded_posted_data, bytes):
291
- # NOTE 2023-08-22: Blind attempt to decode the bytes
292
- # Try to decode it as utf-8
293
- try:
294
- decoded_posted_data = decoded_posted_data.decode('utf-8')
295
- except Exception:
296
- pass
297
- self.add_feature('posted_data', decoded_posted_data)
317
+ if isinstance(decoded_posted_data, bytes):
318
+ # NOTE 2023-08-22: Blind attempt to decode the bytes
319
+ # Try to decode it as utf-8
320
+ try:
321
+ decoded_posted_data = decoded_posted_data.decode('utf-8')
322
+ except Exception:
323
+ pass
324
+ self.add_feature('posted_data', decoded_posted_data)
298
325
 
299
326
  self.add_feature('response', har_entry['response'])
300
327
  try:
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "har2tree"
3
- version = "1.31.2"
3
+ version = "1.31.4"
4
4
  description = "HTTP Archive (HAR) to ETE Toolkit generator"
5
5
  authors = [
6
6
  {name="Raphaël Vinot", email="raphael.vinot@circl.lu"}
@@ -14,16 +14,17 @@ dynamic = [ "classifiers" ]
14
14
  dependencies = [
15
15
  "ete3 (>=3.1.3)",
16
16
  "beautifulsoup4[charset-normalizer,lxml] (>=4.13.4)",
17
- "publicsuffixlist (>=1.0.2.20250627)",
17
+ "publicsuffixlist (>=1.0.2.20250802)",
18
18
  "filetype (>=1.2.0)",
19
19
  # poetry up fails with the version of numpy forced for python < 3.10.
20
20
  # The workaround is to comment it out, run poetry up, uncomment it, and run poetry update.
21
21
  "numpy (<2.1) ; python_version < \"3.10\"",
22
22
  "numpy (<2.3) ; python_version < \"3.11\"",
23
- "numpy (>=2.3.1) ; python_version >= \"3.11\"",
23
+ "numpy (>=2.3.2) ; python_version >= \"3.11\"",
24
24
  "w3lib (>=2.3.1)",
25
25
  "tinycss2 (>=1.4.0)",
26
26
  "legacy-cgi (>=2.6.3) ; python_version >= \"3.13,<4.0\"",
27
+ "multipart (>=1.3.0,<2.0.0)",
27
28
  ]
28
29
 
29
30
  [project.urls]
@@ -45,9 +46,9 @@ classifiers = [
45
46
  docs = ["Sphinx (>=8.2.3) ; python_version >= \"3.11\"", "six (>=1.17.0)"]
46
47
 
47
48
  [tool.poetry.group.dev.dependencies]
48
- mypy = "^1.16.1"
49
+ mypy = "^1.17.1"
49
50
  pytest-cov = "^6.2.1"
50
- coverage = "^7.9.1"
51
+ coverage = "^7.10.2"
51
52
  types-beautifulsoup4 = "^4.12.0.20250516"
52
53
 
53
54
  [build-system]
File without changes
File without changes
File without changes
File without changes