csv-detective 0.10.3.dev1__py3-none-any.whl → 0.10.3.dev3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/output/dataframe.py +2 -2
- csv_detective/parsing/load.py +3 -1
- {csv_detective-0.10.3.dev1.dist-info → csv_detective-0.10.3.dev3.dist-info}/METADATA +1 -1
- {csv_detective-0.10.3.dev1.dist-info → csv_detective-0.10.3.dev3.dist-info}/RECORD +9 -9
- {csv_detective-0.10.3.dev1.dist-info → csv_detective-0.10.3.dev3.dist-info}/WHEEL +1 -1
- tests/test_fields.py +7 -1
- {csv_detective-0.10.3.dev1.dist-info → csv_detective-0.10.3.dev3.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.10.3.dev1.dist-info → csv_detective-0.10.3.dev3.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.10.3.dev1.dist-info → csv_detective-0.10.3.dev3.dist-info}/top_level.txt +0 -0
|
@@ -14,8 +14,8 @@ from csv_detective.utils import display_logs_depending_process_time
|
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
def cast(value: str, _type: str) -> str | int | float | bool | date | datetime | bytes | None:
|
|
17
|
-
if not isinstance(value, str) or
|
|
18
|
-
#
|
|
17
|
+
if not isinstance(value, str) or value in pd._libs.parsers.STR_NA_VALUES:
|
|
18
|
+
# STR_NA_VALUES are directly ingested as NaN by pandas, so we avoid trying to cast them (into int, for instance)
|
|
19
19
|
return None
|
|
20
20
|
match _type:
|
|
21
21
|
case "string":
|
csv_detective/parsing/load.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import codecs
|
|
1
2
|
from io import BytesIO, StringIO
|
|
2
3
|
|
|
3
4
|
import pandas as pd
|
|
@@ -69,12 +70,13 @@ def load_file(
|
|
|
69
70
|
binary_file.seek(0)
|
|
70
71
|
# decoding and reading file
|
|
71
72
|
if is_url(file_path) or engine in COMPRESSION_ENGINES:
|
|
73
|
+
decoder = codecs.getincrementaldecoder(encoding)()
|
|
72
74
|
str_file = StringIO()
|
|
73
75
|
while True:
|
|
74
76
|
chunk = binary_file.read(1024**2)
|
|
75
77
|
if not chunk:
|
|
76
78
|
break
|
|
77
|
-
str_file.write(
|
|
79
|
+
str_file.write(decoder.decode(chunk))
|
|
78
80
|
del binary_file
|
|
79
81
|
str_file.seek(0)
|
|
80
82
|
else:
|
|
@@ -74,7 +74,7 @@ csv_detective/formats/data/iso_country_code_alpha2.txt,sha256=mLt_qcQ6D8hfy9zdi7
|
|
|
74
74
|
csv_detective/formats/data/iso_country_code_alpha3.txt,sha256=XFPdGBsyZCBg4D8IDn6VgwsycCwYVfuqPbyHfNeqGv0,1003
|
|
75
75
|
csv_detective/formats/data/iso_country_code_numeric.txt,sha256=sdGpn0PqDMlc59-7prThkihHrf7mwB6j5uEHpxGvLFE,1003
|
|
76
76
|
csv_detective/output/__init__.py,sha256=ALSq_tgX7rGyh--7rmbKz8wHkmResN0h7mNujndow3w,2103
|
|
77
|
-
csv_detective/output/dataframe.py,sha256=
|
|
77
|
+
csv_detective/output/dataframe.py,sha256=juBMdj0eiL8c3OrJJ3kCf15Qs4-CFQfHqh91FnVbG9E,3656
|
|
78
78
|
csv_detective/output/example.py,sha256=8LWheSBYCeDFfarbnmzBrdCbTd8Alh1U4pfXMKfabOw,8630
|
|
79
79
|
csv_detective/output/profile.py,sha256=ADr5DwuvwcBYxugjN38fHm11l6ivfzGHXPd8a87Ht-s,4985
|
|
80
80
|
csv_detective/output/schema.py,sha256=XoKljXPXP00DfqPCiz1ydwTHYGAFsvNxnaPCNBuuBIo,10443
|
|
@@ -84,12 +84,12 @@ csv_detective/parsing/columns.py,sha256=rb5JywbKnYCT3Jb0ZaG1BnyPVtB3gy5mSD-K7qcO
|
|
|
84
84
|
csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
|
|
85
85
|
csv_detective/parsing/csv.py,sha256=5rw6gXZFQC1T4NT9CnW0AumidrYOkF8kjrfWGmk949I,1716
|
|
86
86
|
csv_detective/parsing/excel.py,sha256=tb65I78tdYlZci_tzvvQt8U6bZSYKjeVdn2CEvsET1o,6972
|
|
87
|
-
csv_detective/parsing/load.py,sha256=
|
|
87
|
+
csv_detective/parsing/load.py,sha256=orW6PV5XUsHA093yVSxXkJl33LEUUArr3hP81U9Bzd4,4386
|
|
88
88
|
csv_detective/parsing/text.py,sha256=yDAcop5xJQc25UtbZcV0guHXAZQfm-H8WuJORTy8Rr8,1734
|
|
89
|
-
csv_detective-0.10.3.
|
|
89
|
+
csv_detective-0.10.3.dev3.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
|
|
90
90
|
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
91
91
|
tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
|
|
92
|
-
tests/test_fields.py,sha256=
|
|
92
|
+
tests/test_fields.py,sha256=DSI-ZXDcRt69iZArYZZAr_3OEb-qvwgOVBZxmYAKIkI,5918
|
|
93
93
|
tests/test_file.py,sha256=_ftEymft5-1keUVE5AUdF2XkVcChJo6oBjws3ye06FE,14543
|
|
94
94
|
tests/test_labels.py,sha256=lgxRbLrGV1C-MkASf3KIQ120BG-UHzFQ4pqDWaeBvaw,539
|
|
95
95
|
tests/test_structure.py,sha256=XDbviuuvk-0Mu9Y9PI6He2e5hry2dXVJ6yBVwEqF_2o,1043
|
|
@@ -104,8 +104,8 @@ tests/data/file.ods,sha256=4dR7zWptz5djALIBVeWHQ20GaZNfA63fevIJGFIk1_U,11832
|
|
|
104
104
|
tests/data/file.xls,sha256=QYmNX3FF0QfcQSzYQMtaMJaepJf5EZpDa1miKc4wMdQ,21495
|
|
105
105
|
tests/data/file.xlsx,sha256=naWzL02PK4pdIjMzfEyfSW9GQhkYYd_e7bpJvB8Pb2w,8314
|
|
106
106
|
tests/data/xlsx_file,sha256=NyOyN_rIe7ryJuHQLqjxVdKCc8V4s5pxyHl6wWFykCM,8305
|
|
107
|
-
csv_detective-0.10.3.
|
|
108
|
-
csv_detective-0.10.3.
|
|
109
|
-
csv_detective-0.10.3.
|
|
110
|
-
csv_detective-0.10.3.
|
|
111
|
-
csv_detective-0.10.3.
|
|
107
|
+
csv_detective-0.10.3.dev3.dist-info/METADATA,sha256=aVDOx1LTRqvJIGf4wu4krkEjPl8HDyCICBbswfn_Wvg,11082
|
|
108
|
+
csv_detective-0.10.3.dev3.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
109
|
+
csv_detective-0.10.3.dev3.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
|
|
110
|
+
csv_detective-0.10.3.dev3.dist-info/top_level.txt,sha256=KDI4gyOpkmormGgUvSWrE3jen2e0unIsxR2b96DRvcw,25
|
|
111
|
+
csv_detective-0.10.3.dev3.dist-info/RECORD,,
|
tests/test_fields.py
CHANGED
|
@@ -104,11 +104,17 @@ def test_fields_with_values(args):
|
|
|
104
104
|
("2022-08-01", "date", _date),
|
|
105
105
|
("2024-09-23 17:32:07", "datetime", _datetime),
|
|
106
106
|
("2024-09-23 17:32:07+02:00", "datetime", _datetime),
|
|
107
|
+
("N/A", "int", None),
|
|
108
|
+
("nan", "bool", None),
|
|
109
|
+
("", "date", None), # all NaN-like values should be cast as None for all types
|
|
107
110
|
),
|
|
108
111
|
)
|
|
109
112
|
def test_cast(args):
|
|
110
113
|
value, detected_type, cast_type = args
|
|
111
|
-
|
|
114
|
+
if cast_type is None:
|
|
115
|
+
assert cast(value, detected_type) is None
|
|
116
|
+
else:
|
|
117
|
+
assert isinstance(cast(value, detected_type), cast_type)
|
|
112
118
|
|
|
113
119
|
|
|
114
120
|
@pytest.mark.parametrize(
|
{csv_detective-0.10.3.dev1.dist-info → csv_detective-0.10.3.dev3.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{csv_detective-0.10.3.dev1.dist-info → csv_detective-0.10.3.dev3.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
|
File without changes
|