csv-detective 0.10.3.dev1__py3-none-any.whl → 0.10.3.dev3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,8 +14,8 @@ from csv_detective.utils import display_logs_depending_process_time
14
14
 
15
15
 
16
16
  def cast(value: str, _type: str) -> str | int | float | bool | date | datetime | bytes | None:
17
- if not isinstance(value, str) or not value:
18
- # None is the current default value in hydra, should we keep this?
17
+ if not isinstance(value, str) or value in pd._libs.parsers.STR_NA_VALUES:
18
+ # STR_NA_VALUES are directly ingested as NaN by pandas, we avoid trying to cast them (into int for instance)
19
19
  return None
20
20
  match _type:
21
21
  case "string":
@@ -1,3 +1,4 @@
1
+ import codecs
1
2
  from io import BytesIO, StringIO
2
3
 
3
4
  import pandas as pd
@@ -69,12 +70,13 @@ def load_file(
69
70
  binary_file.seek(0)
70
71
  # decoding and reading file
71
72
  if is_url(file_path) or engine in COMPRESSION_ENGINES:
73
+ decoder = codecs.getincrementaldecoder(encoding)()
72
74
  str_file = StringIO()
73
75
  while True:
74
76
  chunk = binary_file.read(1024**2)
75
77
  if not chunk:
76
78
  break
77
- str_file.write(chunk.decode(encoding=encoding))
79
+ str_file.write(decoder.decode(chunk))
78
80
  del binary_file
79
81
  str_file.seek(0)
80
82
  else:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: csv-detective
3
- Version: 0.10.3.dev1
3
+ Version: 0.10.3.dev3
4
4
  Summary: Detect tabular files column content
5
5
  Author-email: "data.gouv.fr" <opendatateam@data.gouv.fr>
6
6
  License: MIT
@@ -74,7 +74,7 @@ csv_detective/formats/data/iso_country_code_alpha2.txt,sha256=mLt_qcQ6D8hfy9zdi7
74
74
  csv_detective/formats/data/iso_country_code_alpha3.txt,sha256=XFPdGBsyZCBg4D8IDn6VgwsycCwYVfuqPbyHfNeqGv0,1003
75
75
  csv_detective/formats/data/iso_country_code_numeric.txt,sha256=sdGpn0PqDMlc59-7prThkihHrf7mwB6j5uEHpxGvLFE,1003
76
76
  csv_detective/output/__init__.py,sha256=ALSq_tgX7rGyh--7rmbKz8wHkmResN0h7mNujndow3w,2103
77
- csv_detective/output/dataframe.py,sha256=QX5vplx0AOKgnwwJ6dKvDHWRX9IGPStax-svXEyweJ8,3584
77
+ csv_detective/output/dataframe.py,sha256=juBMdj0eiL8c3OrJJ3kCf15Qs4-CFQfHqh91FnVbG9E,3656
78
78
  csv_detective/output/example.py,sha256=8LWheSBYCeDFfarbnmzBrdCbTd8Alh1U4pfXMKfabOw,8630
79
79
  csv_detective/output/profile.py,sha256=ADr5DwuvwcBYxugjN38fHm11l6ivfzGHXPd8a87Ht-s,4985
80
80
  csv_detective/output/schema.py,sha256=XoKljXPXP00DfqPCiz1ydwTHYGAFsvNxnaPCNBuuBIo,10443
@@ -84,12 +84,12 @@ csv_detective/parsing/columns.py,sha256=rb5JywbKnYCT3Jb0ZaG1BnyPVtB3gy5mSD-K7qcO
84
84
  csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
85
85
  csv_detective/parsing/csv.py,sha256=5rw6gXZFQC1T4NT9CnW0AumidrYOkF8kjrfWGmk949I,1716
86
86
  csv_detective/parsing/excel.py,sha256=tb65I78tdYlZci_tzvvQt8U6bZSYKjeVdn2CEvsET1o,6972
87
- csv_detective/parsing/load.py,sha256=f-8aKiNpy_47qg4Lq-UZUR4NNrbJ_-KEGvcUQZ8cmb0,4317
87
+ csv_detective/parsing/load.py,sha256=orW6PV5XUsHA093yVSxXkJl33LEUUArr3hP81U9Bzd4,4386
88
88
  csv_detective/parsing/text.py,sha256=yDAcop5xJQc25UtbZcV0guHXAZQfm-H8WuJORTy8Rr8,1734
89
- csv_detective-0.10.3.dev1.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
89
+ csv_detective-0.10.3.dev3.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
90
90
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
91
91
  tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
92
- tests/test_fields.py,sha256=EuD2F1JUR8y88Hm-AYuJ5X7AKkGSyLIQfsGdxYgIWng,5680
92
+ tests/test_fields.py,sha256=DSI-ZXDcRt69iZArYZZAr_3OEb-qvwgOVBZxmYAKIkI,5918
93
93
  tests/test_file.py,sha256=_ftEymft5-1keUVE5AUdF2XkVcChJo6oBjws3ye06FE,14543
94
94
  tests/test_labels.py,sha256=lgxRbLrGV1C-MkASf3KIQ120BG-UHzFQ4pqDWaeBvaw,539
95
95
  tests/test_structure.py,sha256=XDbviuuvk-0Mu9Y9PI6He2e5hry2dXVJ6yBVwEqF_2o,1043
@@ -104,8 +104,8 @@ tests/data/file.ods,sha256=4dR7zWptz5djALIBVeWHQ20GaZNfA63fevIJGFIk1_U,11832
104
104
  tests/data/file.xls,sha256=QYmNX3FF0QfcQSzYQMtaMJaepJf5EZpDa1miKc4wMdQ,21495
105
105
  tests/data/file.xlsx,sha256=naWzL02PK4pdIjMzfEyfSW9GQhkYYd_e7bpJvB8Pb2w,8314
106
106
  tests/data/xlsx_file,sha256=NyOyN_rIe7ryJuHQLqjxVdKCc8V4s5pxyHl6wWFykCM,8305
107
- csv_detective-0.10.3.dev1.dist-info/METADATA,sha256=gJcwR3wni376q4qqk8xOG6uy9W4fwA5enfmfT066990,11082
108
- csv_detective-0.10.3.dev1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
109
- csv_detective-0.10.3.dev1.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
110
- csv_detective-0.10.3.dev1.dist-info/top_level.txt,sha256=KDI4gyOpkmormGgUvSWrE3jen2e0unIsxR2b96DRvcw,25
111
- csv_detective-0.10.3.dev1.dist-info/RECORD,,
107
+ csv_detective-0.10.3.dev3.dist-info/METADATA,sha256=aVDOx1LTRqvJIGf4wu4krkEjPl8HDyCICBbswfn_Wvg,11082
108
+ csv_detective-0.10.3.dev3.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
109
+ csv_detective-0.10.3.dev3.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
110
+ csv_detective-0.10.3.dev3.dist-info/top_level.txt,sha256=KDI4gyOpkmormGgUvSWrE3jen2e0unIsxR2b96DRvcw,25
111
+ csv_detective-0.10.3.dev3.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.9.0)
2
+ Generator: setuptools (80.10.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
tests/test_fields.py CHANGED
@@ -104,11 +104,17 @@ def test_fields_with_values(args):
104
104
  ("2022-08-01", "date", _date),
105
105
  ("2024-09-23 17:32:07", "datetime", _datetime),
106
106
  ("2024-09-23 17:32:07+02:00", "datetime", _datetime),
107
+ ("N/A", "int", None),
108
+ ("nan", "bool", None),
109
+ ("", "date", None), # all NaN-like values should be cast as None for all types
107
110
  ),
108
111
  )
109
112
  def test_cast(args):
110
113
  value, detected_type, cast_type = args
111
- assert isinstance(cast(value, detected_type), cast_type)
114
+ if cast_type is None:
115
+ assert cast(value, detected_type) is None
116
+ else:
117
+ assert isinstance(cast(value, detected_type), cast_type)
112
118
 
113
119
 
114
120
  @pytest.mark.parametrize(