python-multipart 0.0.10__tar.gz → 0.0.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. {python_multipart-0.0.10 → python_multipart-0.0.11}/CHANGELOG.md +5 -0
  2. {python_multipart-0.0.10 → python_multipart-0.0.11}/PKG-INFO +1 -1
  3. {python_multipart-0.0.10 → python_multipart-0.0.11}/multipart/__init__.py +1 -1
  4. {python_multipart-0.0.10 → python_multipart-0.0.11}/multipart/multipart.py +62 -70
  5. python_multipart-0.0.11/tests/test_data/http/CRLF_in_header.http +6 -0
  6. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/CR_in_header.yaml +1 -1
  7. python_multipart-0.0.11/tests/test_data/http/bad_header_char.yaml +3 -0
  8. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_multipart.py +28 -7
  9. {python_multipart-0.0.10 → python_multipart-0.0.11}/.gitignore +0 -0
  10. {python_multipart-0.0.10 → python_multipart-0.0.11}/LICENSE.txt +0 -0
  11. {python_multipart-0.0.10 → python_multipart-0.0.11}/README.md +0 -0
  12. {python_multipart-0.0.10 → python_multipart-0.0.11}/multipart/decoders.py +0 -0
  13. {python_multipart-0.0.10 → python_multipart-0.0.11}/multipart/exceptions.py +0 -0
  14. {python_multipart-0.0.10 → python_multipart-0.0.11}/pyproject.toml +0 -0
  15. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/__init__.py +0 -0
  16. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/compat.py +0 -0
  17. /python_multipart-0.0.10/tests/test_data/http/bad_header_char.yaml → /python_multipart-0.0.11/tests/test_data/http/CRLF_in_header.yaml +0 -0
  18. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/CR_in_header.http +0 -0
  19. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/CR_in_header_value.http +0 -0
  20. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/CR_in_header_value.yaml +0 -0
  21. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/almost_match_boundary.http +0 -0
  22. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/almost_match_boundary.yaml +0 -0
  23. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/almost_match_boundary_without_CR.http +0 -0
  24. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/almost_match_boundary_without_CR.yaml +0 -0
  25. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/almost_match_boundary_without_LF.http +0 -0
  26. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/almost_match_boundary_without_LF.yaml +0 -0
  27. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/almost_match_boundary_without_final_hyphen.http +0 -0
  28. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/almost_match_boundary_without_final_hyphen.yaml +0 -0
  29. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/bad_end_of_headers.http +0 -0
  30. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/bad_end_of_headers.yaml +0 -0
  31. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/bad_header_char.http +0 -0
  32. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/bad_initial_boundary.http +0 -0
  33. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/bad_initial_boundary.yaml +0 -0
  34. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/base64_encoding.http +0 -0
  35. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/base64_encoding.yaml +0 -0
  36. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/empty_header.http +0 -0
  37. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/empty_header.yaml +0 -0
  38. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/header_with_number.http +0 -0
  39. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/header_with_number.yaml +0 -0
  40. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/mixed_plain_and_base64_encoding.http +0 -0
  41. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/mixed_plain_and_base64_encoding.yaml +0 -0
  42. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/multiple_fields.http +0 -0
  43. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/multiple_fields.yaml +0 -0
  44. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/multiple_files.http +0 -0
  45. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/multiple_files.yaml +0 -0
  46. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/quoted_printable_encoding.http +0 -0
  47. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/quoted_printable_encoding.yaml +0 -0
  48. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/single_field.http +0 -0
  49. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/single_field.yaml +0 -0
  50. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/single_field_blocks.http +0 -0
  51. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/single_field_blocks.yaml +0 -0
  52. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/single_field_longer.http +0 -0
  53. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/single_field_longer.yaml +0 -0
  54. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/single_field_single_file.http +0 -0
  55. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/single_field_single_file.yaml +0 -0
  56. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/single_field_with_leading_newlines.http +0 -0
  57. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/single_field_with_leading_newlines.yaml +0 -0
  58. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/single_file.http +0 -0
  59. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/single_file.yaml +0 -0
  60. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/utf8_filename.http +0 -0
  61. {python_multipart-0.0.10 → python_multipart-0.0.11}/tests/test_data/http/utf8_filename.yaml +0 -0
@@ -1,5 +1,10 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.0.11 (2024-09-28)
4
+
5
+ * Improve performance, especially in data with many CR-LF [#137](https://github.com/Kludex/python-multipart/pull/137).
6
+ * Handle invalid CRLF in header name [#141](https://github.com/Kludex/python-multipart/pull/141).
7
+
3
8
  ## 0.0.10 (2024-09-21)
4
9
 
5
10
  * Support `on_header_begin` [#103](https://github.com/Kludex/python-multipart/pull/103).
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: python-multipart
3
- Version: 0.0.10
3
+ Version: 0.0.11
4
4
  Summary: A streaming multipart parser for Python
5
5
  Project-URL: Homepage, https://github.com/Kludex/python-multipart
6
6
  Project-URL: Documentation, https://kludex.github.io/python-multipart/
@@ -2,7 +2,7 @@
2
2
  __author__ = "Andrew Dunham"
3
3
  __license__ = "Apache"
4
4
  __copyright__ = "Copyright (c) 2012-2013, Andrew Dunham"
5
- __version__ = "0.0.10"
5
+ __version__ = "0.0.11"
6
6
 
7
7
  from .multipart import (
8
8
  BaseParser,
@@ -146,10 +146,6 @@ def ord_char(c: int) -> int:
146
146
  return c
147
147
 
148
148
 
149
- def join_bytes(b: bytes) -> bytes:
150
- return bytes(list(b))
151
-
152
-
153
149
  def parse_options_header(value: str | bytes) -> tuple[bytes, dict[bytes, bytes]]:
154
150
  """Parses a Content-Type header into a value in the following format: (content_type, {parameters})."""
155
151
  # Uses email.message.Message to parse the header as described in PEP 594.
@@ -976,29 +972,11 @@ class MultipartParser(BaseParser):
976
972
  # Setup marks. These are used to track the state of data received.
977
973
  self.marks: dict[str, int] = {}
978
974
 
979
- # TODO: Actually use this rather than the dumb version we currently use
980
- # # Precompute the skip table for the Boyer-Moore-Horspool algorithm.
981
- # skip = [len(boundary) for x in range(256)]
982
- # for i in range(len(boundary) - 1):
983
- # skip[ord_char(boundary[i])] = len(boundary) - i - 1
984
- #
985
- # # We use a tuple since it's a constant, and marginally faster.
986
- # self.skip = tuple(skip)
987
-
988
975
  # Save our boundary.
989
976
  if isinstance(boundary, str): # pragma: no cover
990
977
  boundary = boundary.encode("latin-1")
991
978
  self.boundary = b"\r\n--" + boundary
992
979
 
993
- # Get a set of characters that belong to our boundary.
994
- self.boundary_chars = frozenset(self.boundary)
995
-
996
- # We also create a lookbehind list.
997
- # Note: the +8 is since we can have, at maximum, "\r\n--" + boundary +
998
- # "--\r\n" at the final boundary, and the length of '\r\n--' and
999
- # '--\r\n' is 8 bytes.
1000
- self.lookbehind = [NULL for _ in range(len(boundary) + 8)]
1001
-
1002
980
  def write(self, data: bytes) -> int:
1003
981
  """Write some data to the parser, which will perform size verification,
1004
982
  and then parse the data into the appropriate location (e.g. header,
@@ -1061,21 +1039,43 @@ class MultipartParser(BaseParser):
1061
1039
  # end of the buffer, and reset the mark, instead of deleting it. This
1062
1040
  # is used at the end of the function to call our callbacks with any
1063
1041
  # remaining data in this chunk.
1064
- def data_callback(name: str, remaining: bool = False) -> None:
1042
+ def data_callback(name: str, end_i: int, remaining: bool = False) -> None:
1065
1043
  marked_index = self.marks.get(name)
1066
1044
  if marked_index is None:
1067
1045
  return
1068
1046
 
1069
- # If we're getting remaining data, we ignore the current i value
1070
- # and just call with the remaining data.
1071
- if remaining:
1072
- self.callback(name, data, marked_index, length)
1073
- self.marks[name] = 0
1074
-
1075
1047
  # Otherwise, we call it from the mark to the current byte we're
1076
1048
  # processing.
1049
+ if end_i <= marked_index:
1050
+ # There is no additional data to send.
1051
+ pass
1052
+ elif marked_index >= 0:
1053
+ # We are emitting data from the local buffer.
1054
+ self.callback(name, data, marked_index, end_i)
1055
+ else:
1056
+ # Some of the data comes from a partial boundary match
1057
+ # and requires look-behind.
1058
+ # We need to use self.flags (and not flags) because we care about
1059
+ # the state when we entered the loop.
1060
+ lookbehind_len = -marked_index
1061
+ if lookbehind_len <= len(boundary):
1062
+ self.callback(name, boundary, 0, lookbehind_len)
1063
+ elif self.flags & FLAG_PART_BOUNDARY:
1064
+ lookback = boundary + b"\r\n"
1065
+ self.callback(name, lookback, 0, lookbehind_len)
1066
+ elif self.flags & FLAG_LAST_BOUNDARY:
1067
+ lookback = boundary + b"--\r\n"
1068
+ self.callback(name, lookback, 0, lookbehind_len)
1069
+ else: # pragma: no cover (error case)
1070
+ self.logger.warning("Look-back buffer error")
1071
+
1072
+ if end_i > 0:
1073
+ self.callback(name, data, 0, end_i)
1074
+ # If we're getting remaining data, we have got all the data we
1075
+ # can be certain is not a boundary, leaving only a partial boundary match.
1076
+ if remaining:
1077
+ self.marks[name] = end_i - length
1077
1078
  else:
1078
- self.callback(name, data, marked_index, i)
1079
1079
  self.marks.pop(name, None)
1080
1080
 
1081
1081
  # For each byte...
@@ -1163,7 +1163,7 @@ class MultipartParser(BaseParser):
1163
1163
  # If we've reached a CR at the beginning of a header, it means
1164
1164
  # that we've reached the second of 2 newlines, and so there are
1165
1165
  # no more headers to parse.
1166
- if c == CR:
1166
+ if c == CR and index == 0:
1167
1167
  delete_mark("header_field")
1168
1168
  state = MultipartState.HEADERS_ALMOST_DONE
1169
1169
  i += 1
@@ -1183,7 +1183,7 @@ class MultipartParser(BaseParser):
1183
1183
  raise e
1184
1184
 
1185
1185
  # Call our callback with the header field.
1186
- data_callback("header_field")
1186
+ data_callback("header_field", i)
1187
1187
 
1188
1188
  # Move to parsing the header value.
1189
1189
  state = MultipartState.HEADER_VALUE_START
@@ -1212,7 +1212,7 @@ class MultipartParser(BaseParser):
1212
1212
  # If we've got a CR, we're nearly done our headers. Otherwise,
1213
1213
  # we do nothing and just move past this character.
1214
1214
  if c == CR:
1215
- data_callback("header_value")
1215
+ data_callback("header_value", i)
1216
1216
  self.callback("header_end")
1217
1217
  state = MultipartState.HEADER_VALUE_ALMOST_DONE
1218
1218
 
@@ -1256,9 +1256,6 @@ class MultipartParser(BaseParser):
1256
1256
  # We're processing our part data right now. During this, we
1257
1257
  # need to efficiently search for our boundary, since any data
1258
1258
  # on any number of lines can be a part of the current data.
1259
- # We use the Boyer-Moore-Horspool algorithm to efficiently
1260
- # search through the remainder of the buffer looking for our
1261
- # boundary.
1262
1259
 
1263
1260
  # Save the current value of our index. We use this in case we
1264
1261
  # find part of a boundary, but it doesn't match fully.
@@ -1266,24 +1263,32 @@ class MultipartParser(BaseParser):
1266
1263
 
1267
1264
  # Set up variables.
1268
1265
  boundary_length = len(boundary)
1269
- boundary_end = boundary_length - 1
1270
1266
  data_length = length
1271
- boundary_chars = self.boundary_chars
1272
1267
 
1273
1268
  # If our index is 0, we're starting a new part, so start our
1274
1269
  # search.
1275
1270
  if index == 0:
1276
- # Search forward until we either hit the end of our buffer,
1277
- # or reach a character that's in our boundary.
1278
- i += boundary_end
1279
- while i < data_length - 1 and data[i] not in boundary_chars:
1280
- i += boundary_length
1281
-
1282
- # Reset i back the length of our boundary, which is the
1283
- # earliest possible location that could be our match (i.e.
1284
- # if we've just broken out of our loop since we saw the
1285
- # last character in our boundary)
1286
- i -= boundary_end
1271
+ # The most common case is likely to be that the whole
1272
+ # boundary is present in the buffer.
1273
+ # Calling `find` is much faster than iterating here.
1274
+ i0 = data.find(boundary, i, data_length)
1275
+ if i0 >= 0:
1276
+ # We matched the whole boundary string.
1277
+ index = boundary_length - 1
1278
+ i = i0 + boundary_length - 1
1279
+ else:
1280
+ # No match found for whole string.
1281
+ # There may be a partial boundary at the end of the
1282
+ # data, which the find will not match.
1283
+ # Since the length to be searched is limited to
1284
+ # the boundary length, just perform a naive search.
1285
+ i = max(i, data_length - boundary_length)
1286
+
1287
+ # Search forward until we either hit the end of our buffer,
1288
+ # or reach a potential start of the boundary.
1289
+ while i < data_length - 1 and data[i] != boundary[0]:
1290
+ i += 1
1291
+
1287
1292
  c = data[i]
1288
1293
 
1289
1294
  # Now, we have a couple of cases here. If our index is before
@@ -1291,11 +1296,6 @@ class MultipartParser(BaseParser):
1291
1296
  if index < boundary_length:
1292
1297
  # If the character matches...
1293
1298
  if boundary[index] == c:
1294
- # If we found a match for our boundary, we send the
1295
- # existing data.
1296
- if index == 0:
1297
- data_callback("part_data")
1298
-
1299
1299
  # The current character matches, so continue!
1300
1300
  index += 1
1301
1301
  else:
@@ -1332,6 +1332,8 @@ class MultipartParser(BaseParser):
1332
1332
  # Unset the part boundary flag.
1333
1333
  flags &= ~FLAG_PART_BOUNDARY
1334
1334
 
1335
+ # We have identified a boundary, callback for any data before it.
1336
+ data_callback("part_data", i - index)
1335
1337
  # Callback indicating that we've reached the end of
1336
1338
  # a part, and are starting a new one.
1337
1339
  self.callback("part_end")
@@ -1353,6 +1355,8 @@ class MultipartParser(BaseParser):
1353
1355
  elif flags & FLAG_LAST_BOUNDARY:
1354
1356
  # We need a second hyphen here.
1355
1357
  if c == HYPHEN:
1358
+ # We have identified a boundary, callback for any data before it.
1359
+ data_callback("part_data", i - index)
1356
1360
  # Callback to end the current part, and then the
1357
1361
  # message.
1358
1362
  self.callback("part_end")
@@ -1362,26 +1366,14 @@ class MultipartParser(BaseParser):
1362
1366
  # No match, so reset index.
1363
1367
  index = 0
1364
1368
 
1365
- # If we have an index, we need to keep this byte for later, in
1366
- # case we can't match the full boundary.
1367
- if index > 0:
1368
- self.lookbehind[index - 1] = c
1369
-
1370
1369
  # Otherwise, our index is 0. If the previous index is not, it
1371
1370
  # means we reset something, and we need to take the data we
1372
1371
  # thought was part of our boundary and send it along as actual
1373
1372
  # data.
1374
- elif prev_index > 0:
1375
- # Callback to write the saved data.
1376
- lb_data = join_bytes(self.lookbehind)
1377
- self.callback("part_data", lb_data, 0, prev_index)
1378
-
1373
+ if index == 0 and prev_index > 0:
1379
1374
  # Overwrite our previous index.
1380
1375
  prev_index = 0
1381
1376
 
1382
- # Re-set our mark for part data.
1383
- set_mark("part_data")
1384
-
1385
1377
  # Re-consider the current character, since this could be
1386
1378
  # the start of the boundary itself.
1387
1379
  i -= 1
@@ -1410,9 +1402,9 @@ class MultipartParser(BaseParser):
1410
1402
  # that we haven't yet reached the end of this 'thing'. So, by setting
1411
1403
  # the mark to 0, we cause any data callbacks that take place in future
1412
1404
  # calls to this function to start from the beginning of that buffer.
1413
- data_callback("header_field", True)
1414
- data_callback("header_value", True)
1415
- data_callback("part_data", True)
1405
+ data_callback("header_field", length, True)
1406
+ data_callback("header_value", length, True)
1407
+ data_callback("part_data", length - index, True)
1416
1408
 
1417
1409
  # Save values to locals.
1418
1410
  self.state = state
@@ -0,0 +1,6 @@
1
+ ------WebKitFormBoundaryTkr3kCBQlBe1nrhc
2
+ Content-
3
+ isposition: form-data; name="field"
4
+
5
+ This is a test.
6
+ ------WebKitFormBoundaryTkr3kCBQlBe1nrhc--
@@ -1,3 +1,3 @@
1
1
  boundary: ----WebKitFormBoundaryTkr3kCBQlBe1nrhc
2
2
  expected:
3
- error: 51
3
+ error: 50
@@ -0,0 +1,3 @@
1
+ boundary: ----WebKitFormBoundaryTkr3kCBQlBe1nrhc
2
+ expected:
3
+ error: 50
@@ -695,6 +695,14 @@ for f in os.listdir(http_tests_dir):
695
695
 
696
696
  http_tests.append({"name": fname, "test": test_data, "result": yaml_data})
697
697
 
698
+ # Datasets used for single-byte writing test.
699
+ single_byte_tests = [
700
+ "almost_match_boundary",
701
+ "almost_match_boundary_without_CR",
702
+ "almost_match_boundary_without_LF",
703
+ "almost_match_boundary_without_final_hyphen",
704
+ "single_field_single_file",
705
+ ]
698
706
 
699
707
  def split_all(val):
700
708
  """
@@ -843,17 +851,19 @@ class TestFormParser(unittest.TestCase):
843
851
  self.assert_field(b"field", b"test1")
844
852
  self.assert_file(b"file", b"file.txt", b"test2")
845
853
 
846
- def test_feed_single_bytes(self):
854
+ @parametrize("param", [ t for t in http_tests if t["name"] in single_byte_tests])
855
+ def test_feed_single_bytes(self, param):
847
856
  """
848
- This test parses a simple multipart body 1 byte at a time.
857
+ This test parses multipart bodies 1 byte at a time.
849
858
  """
850
859
  # Load test data.
851
- test_file = "single_field_single_file.http"
860
+ test_file = param["name"] + ".http"
861
+ boundary = param["result"]["boundary"]
852
862
  with open(os.path.join(http_tests_dir, test_file), "rb") as f:
853
863
  test_data = f.read()
854
864
 
855
865
  # Create form parser.
856
- self.make("boundary")
866
+ self.make(boundary)
857
867
 
858
868
  # Write all bytes.
859
869
  # NOTE: Can't simply do `for b in test_data`, since that gives
@@ -868,9 +878,20 @@ class TestFormParser(unittest.TestCase):
868
878
  # Assert we processed everything.
869
879
  self.assertEqual(i, len(test_data))
870
880
 
871
- # Assert that our file and field are here.
872
- self.assert_field(b"field", b"test1")
873
- self.assert_file(b"file", b"file.txt", b"test2")
881
+ # Assert that the parser gave us the appropriate fields/files.
882
+ for e in param["result"]["expected"]:
883
+ # Get our type and name.
884
+ type = e["type"]
885
+ name = e["name"].encode("latin-1")
886
+
887
+ if type == "field":
888
+ self.assert_field(name, e["data"])
889
+
890
+ elif type == "file":
891
+ self.assert_file(name, e["file_name"].encode("latin-1"), e["data"])
892
+
893
+ else:
894
+ assert False
874
895
 
875
896
  def test_feed_blocks(self):
876
897
  """