sapiopycommons 2024.9.25a334__py3-none-any.whl → 2024.9.30a335__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sapiopycommons might be problematic. Click here for more details.

@@ -23,8 +23,8 @@ class FileUtil:
23
23
  """
24
24
  @staticmethod
25
25
  def tokenize_csv(file_bytes: bytes, required_headers: list[str] | None = None, header_row_index: int | None = 0,
26
- seperator: str = ",", *, encoding: str | None = None, exception_on_empty: bool = True) \
27
- -> tuple[list[dict[str, str]], list[list[str]]]:
26
+ seperator: str = ",", *, encoding: str | None = None, encoding_error: str | None = "strict",
27
+ exception_on_empty: bool = True) -> tuple[list[dict[str, str]], list[list[str]]]:
28
28
  """
29
29
  Tokenize a CSV file. The provided file must be uniform. That is, if row 1 has 10 cells, all the rows in the file
30
30
  must have 10 cells. Otherwise, the Pandas parser throws a tokenizer exception.
@@ -39,7 +39,11 @@ class FileUtil:
39
39
  :param seperator: The character that separates cells in the table.
40
40
  :param encoding: The encoding used to read the given file bytes. If not provided, uses utf-8. If your file
41
41
  contains a non-utf-8 character, then a UnicodeDecodeError will be thrown. If this happens, consider using
42
- ISO-8859-1 as the encoding.
42
+ ISO-8859-1 as the encoding, or investigate what encoding would handle the characters in your file.
43
+ :param encoding_error: The error handling behavior if an encoding error is encountered. By default, the behavior
44
+ is "strict", meaning that encoding errors raise an exception. Change this to "ignore" to skip over invalid
45
+ characters or "replace" to substitute each undecodable byte sequence with the Unicode replacement character (U+FFFD). For a full list of options, see
46
+ https://docs.python.org/3/library/codecs.html#error-handlers
43
47
  :param exception_on_empty: Throw a user error exception if the provided file bytes result in an empty list in
44
48
  the first element of the returned tuple.
45
49
  :return: The CSV parsed into a list of dicts where each dict is a row, mapping the headers to the cells for
@@ -49,7 +53,7 @@ class FileUtil:
49
53
  # Parse the file bytes into two DataFrames. The first is metadata of the file located above the header row,
50
54
  # while the second is the body of the file below the header row.
51
55
  file_body, file_metadata = FileUtil.csv_to_data_frames(file_bytes, header_row_index, seperator,
52
- encoding=encoding)
56
+ encoding=encoding, encoding_error=encoding_error)
53
57
  # Parse the metadata from above the header row index into a list of lists.
54
58
  metadata: list[list[str]] = FileUtil.data_frame_to_lists(file_metadata)
55
59
  # Parse the data from the file body into a list of dicts.
@@ -90,7 +94,8 @@ class FileUtil:
90
94
 
91
95
  @staticmethod
92
96
  def csv_to_data_frames(file_bytes: bytes, header_row_index: int | None = 0, seperator: str = ",",
93
- *, encoding: str | None = None) -> tuple[DataFrame, DataFrame | None]:
97
+ *, encoding: str | None = None, encoding_error: str | None = "strict") \
98
+ -> tuple[DataFrame, DataFrame | None]:
94
99
  """
95
100
  Parse the file bytes for a CSV into DataFrames. The provided file must be uniform. That is, if row 1 has 10
96
101
  cells, all the rows in the file must have 10 cells. Otherwise, the Pandas parser throws a tokenizer exception.
@@ -103,7 +108,11 @@ class FileUtil:
103
108
  :param seperator: The character that separates cells in the table.
104
109
  :param encoding: The encoding used to read the given file bytes. If not provided, uses utf-8. If your file
105
110
  contains a non-utf-8 character, then a UnicodeDecodeError will be thrown. If this happens, consider using
106
- ISO-8859-1 as the encoding.
111
+ ISO-8859-1 as the encoding, or investigate what encoding would handle the characters in your file.
112
+ :param encoding_error: The error handling behavior if an encoding error is encountered. By default, the behavior
113
+ is "strict", meaning that encoding errors raise an exception. Change this to "ignore" to skip over invalid
114
+ characters or "replace" to substitute each undecodable byte sequence with the Unicode replacement character (U+FFFD). For a full list of options, see
115
+ https://docs.python.org/3/library/codecs.html#error-handlers
107
116
  :return: A tuple of two DataFrames. The first is the frame for the CSV table body, while the second is for the
108
117
  metadata from above the header row, or None if there is no metadata.
109
118
  """
@@ -115,7 +124,8 @@ class FileUtil:
115
124
  # can throw off the header row index.
116
125
  file_metadata = pandas.read_csv(file_io, header=None, dtype=dtype(str),
117
126
  skiprows=lambda x: x >= header_row_index,
118
- skip_blank_lines=False, sep=seperator, encoding=encoding)
127
+ skip_blank_lines=False, sep=seperator, encoding=encoding,
128
+ encoding_errors=encoding_error)
119
129
  with io.BytesIO(file_bytes) as file_io:
120
130
  # The use of the dtype argument is to ensure that everything from the file gets read as a string. Added
121
131
  # because some numerical values would get ".0" appended to them, even when casting the DataFrame cell to a
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: sapiopycommons
3
- Version: 2024.9.25a334
3
+ Version: 2024.9.30a335
4
4
  Summary: Official Sapio Python API Utilities Package
5
5
  Project-URL: Homepage, https://github.com/sapiosciences
6
6
  Author-email: Jonathan Steck <jsteck@sapiosciences.com>, Yechen Qiao <yqiao@sapiosciences.com>
@@ -19,7 +19,7 @@ sapiopycommons/files/complex_data_loader.py,sha256=T39veNhvYl6j_uZjIIJ8Mk5Aa7otR
19
19
  sapiopycommons/files/file_bridge.py,sha256=WwCVegk0OGA8eqho8chsOsLlqg1nXctO75zfh-rHF-g,5950
20
20
  sapiopycommons/files/file_bridge_handler.py,sha256=bt2IfIsxJ4lcJYo_NHvCQ17ZV6C4fSAEa8Zcgixh7B4,14263
21
21
  sapiopycommons/files/file_data_handler.py,sha256=SCsjODMJIPEBSsahzXUeOM7CfSCmYwPPoGAM6aOfelo,36743
22
- sapiopycommons/files/file_util.py,sha256=ZrgoGwHHfPdL5KHkGwlrEHJqGpttmZzRkGQCXdLjra8,28284
22
+ sapiopycommons/files/file_util.py,sha256=wbL3rxcFc-t2mXaPWWkoFWYGopvTcQts9Wf-L5GkhT8,29498
23
23
  sapiopycommons/files/file_validator.py,sha256=4OvY98ueJWPJdpndwnKv2nqVvLP9S2W7Il_dM0Y0ojo,28709
24
24
  sapiopycommons/files/file_writer.py,sha256=96Xl8TTT46Krxe_J8rmmlEMtel4nzZB961f5Yqtl1-I,17616
25
25
  sapiopycommons/general/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -44,7 +44,7 @@ sapiopycommons/rules/on_save_rule_handler.py,sha256=Rkqvph20RbNq6m-RF4fbvCP-YfD2
44
44
  sapiopycommons/webhook/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
45
45
  sapiopycommons/webhook/webhook_handlers.py,sha256=jwc4xu-wwl8haS5k1dENZ1UIYK9GQk74TAo3CGxMW9U,16583
46
46
  sapiopycommons/webhook/webservice_handlers.py,sha256=1J56zFI0pWl5MHoNTznvcZumITXgAHJMluj8-2BqYEw,3315
47
- sapiopycommons-2024.9.25a334.dist-info/METADATA,sha256=V8-SfiaBoaQ29_YOx_hYn1feKD_2V05OSTBRc2s3Z3c,3176
48
- sapiopycommons-2024.9.25a334.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
49
- sapiopycommons-2024.9.25a334.dist-info/licenses/LICENSE,sha256=HyVuytGSiAUQ6ErWBHTqt1iSGHhLmlC8fO7jTCuR8dU,16725
50
- sapiopycommons-2024.9.25a334.dist-info/RECORD,,
47
+ sapiopycommons-2024.9.30a335.dist-info/METADATA,sha256=weAijlonfLBXlTj0y187KCHixvVi2TNNifX1wPAluWg,3176
48
+ sapiopycommons-2024.9.30a335.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
49
+ sapiopycommons-2024.9.30a335.dist-info/licenses/LICENSE,sha256=HyVuytGSiAUQ6ErWBHTqt1iSGHhLmlC8fO7jTCuR8dU,16725
50
+ sapiopycommons-2024.9.30a335.dist-info/RECORD,,