csv-detective 0.7.5.dev1078__py3-none-any.whl → 0.7.5.dev1139__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/detect_fields/__init__.py +1 -1
- csv_detective/detect_fields/other/booleen/__init__.py +22 -16
- csv_detective/detect_fields/other/float/__init__.py +2 -2
- csv_detective/detect_fields/temp/date/__init__.py +21 -37
- csv_detective/detect_fields/temp/datetime/__init__.py +19 -0
- csv_detective/detect_labels/temp/date/__init__.py +3 -1
- csv_detective/detection.py +2 -0
- csv_detective/explore_csv.py +17 -2
- csv_detective/utils.py +53 -2
- {csv_detective-0.7.5.dev1078.data → csv_detective-0.7.5.dev1139.data}/data/share/csv_detective/CHANGELOG.md +2 -0
- {csv_detective-0.7.5.dev1078.dist-info → csv_detective-0.7.5.dev1139.dist-info}/METADATA +12 -2
- {csv_detective-0.7.5.dev1078.dist-info → csv_detective-0.7.5.dev1139.dist-info}/RECORD +20 -19
- {csv_detective-0.7.5.dev1078.dist-info → csv_detective-0.7.5.dev1139.dist-info}/WHEEL +1 -1
- tests/test_fields.py +19 -0
- tests/test_file.py +28 -0
- {csv_detective-0.7.5.dev1078.data → csv_detective-0.7.5.dev1139.data}/data/share/csv_detective/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev1078.data → csv_detective-0.7.5.dev1139.data}/data/share/csv_detective/README.md +0 -0
- {csv_detective-0.7.5.dev1078.dist-info → csv_detective-0.7.5.dev1139.dist-info}/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev1078.dist-info → csv_detective-0.7.5.dev1139.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.7.5.dev1078.dist-info → csv_detective-0.7.5.dev1139.dist-info}/top_level.txt +0 -0
|
@@ -1,21 +1,27 @@
|
|
|
1
1
|
PROPORTION = 1
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
2
|
+
bool_mapping = {
|
|
3
|
+
"1": True,
|
|
4
|
+
"0": False,
|
|
5
|
+
"vrai": True,
|
|
6
|
+
"faux": False,
|
|
7
|
+
"true": True,
|
|
8
|
+
"false": False,
|
|
9
|
+
"oui": True,
|
|
10
|
+
"non": False,
|
|
11
|
+
"yes": True,
|
|
12
|
+
"no": False,
|
|
13
|
+
"y": True,
|
|
14
|
+
"n": False,
|
|
15
|
+
"o": True,
|
|
16
16
|
}
|
|
17
17
|
|
|
18
|
+
liste_bool = set(bool_mapping.keys())
|
|
18
19
|
|
|
19
|
-
|
|
20
|
-
|
|
20
|
+
|
|
21
|
+
def bool_casting(val: str) -> bool:
|
|
22
|
+
return bool_mapping.get(val)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _is(val: str) -> bool:
|
|
26
|
+
'''Détecte les booléens'''
|
|
21
27
|
return isinstance(val, str) and val.lower() in liste_bool
|
|
@@ -1,46 +1,30 @@
|
|
|
1
|
-
import
|
|
2
|
-
from
|
|
3
|
-
|
|
4
|
-
from
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
from dateparser import parse as date_parser
|
|
5
|
+
from dateutil.parser import parse as dateutil_parser, ParserError
|
|
5
6
|
|
|
6
7
|
PROPORTION = 1
|
|
7
8
|
# /!\ this is only for dates, not datetimes which are handled by other utils
|
|
8
9
|
|
|
9
10
|
|
|
10
|
-
def
|
|
11
|
-
|
|
12
|
-
# longest date string expected here is DD-septembre-YYYY, so 17 characters
|
|
13
|
-
if len(val) > 17:
|
|
14
|
-
return False
|
|
11
|
+
def date_casting(val: str) -> Optional[datetime]:
|
|
12
|
+
"""For performance reasons, we try first with dateutil and fallback on dateparser"""
|
|
15
13
|
try:
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
return True
|
|
20
|
-
except (ParserError, ValueError, TypeError, OverflowError):
|
|
21
|
-
return False
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
seps = r'[\s/\-\*_\|;.,]'
|
|
25
|
-
# matches JJ-MM-AAAA with any of the listed separators
|
|
26
|
-
pat = r'^(0[1-9]|[12][0-9]|3[01])SEP(0[1-9]|1[0-2])SEP((19|20)\d{2})$'.replace('SEP', seps)
|
|
27
|
-
# matches AAAA-MM-JJ with any of the listed separators OR NO SEPARATOR
|
|
28
|
-
tap = r'^((19|20)\d{2})SEP(0[1-9]|1[0-2])SEP(0[1-9]|[12][0-9]|3[01])$'.replace('SEP', seps + '?')
|
|
29
|
-
# matches JJ-mmm-AAAA and JJ-mmm...mm-AAAA with any of the listed separators OR NO SEPARATOR
|
|
30
|
-
letters = (
|
|
31
|
-
r'^(0[1-9]|[12][0-9]|3[01])SEP(jan|fev|feb|mar|avr|apr'
|
|
32
|
-
r'|mai|may|jun|jui|jul|aou|aug|sep|oct|nov|dec|janvier|fevrier|mars|avril|'
|
|
33
|
-
r'mai|juin|jullet|aout|septembre|octobre|novembre|decembre)SEP'
|
|
34
|
-
r'(\d{2}|\d{4})$'
|
|
35
|
-
).replace('SEP', seps + '?')
|
|
14
|
+
return dateutil_parser(val)
|
|
15
|
+
except ParserError:
|
|
16
|
+
return date_parser(val)
|
|
36
17
|
|
|
37
18
|
|
|
38
19
|
def _is(val):
|
|
39
|
-
'''Renvoie True si val peut être une date, False sinon
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
)
|
|
20
|
+
'''Renvoie True si val peut être une date, False sinon'''
|
|
21
|
+
# early stops, to cut processing time
|
|
22
|
+
if not isinstance(val, str) or len(val) > 20 or len(val) < 8:
|
|
23
|
+
return False
|
|
24
|
+
threshold = 0.3
|
|
25
|
+
if sum([char.isdigit() for char in val]) / len(val) < threshold:
|
|
26
|
+
return False
|
|
27
|
+
res = date_casting(val)
|
|
28
|
+
if not res or res.hour or res.minute or res.second:
|
|
29
|
+
return False
|
|
30
|
+
return True
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from typing import Any, Optional
|
|
2
|
+
|
|
3
|
+
from csv_detective.detect_fields.temp.date import date_casting
|
|
4
|
+
|
|
5
|
+
PROPORTION = 1
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _is(val: Optional[Any]) -> bool:
|
|
9
|
+
'''Renvoie True si val peut être un datetime, False sinon'''
|
|
10
|
+
# early stops, to cut processing time
|
|
11
|
+
if not isinstance(val, str) or len(val) > 30 or len(val) < 15:
|
|
12
|
+
return False
|
|
13
|
+
threshold = 0.7
|
|
14
|
+
if sum([char.isdigit() for char in val]) / len(val) < threshold:
|
|
15
|
+
return False
|
|
16
|
+
res = date_casting(val)
|
|
17
|
+
if res and (res.hour or res.minute or res.second):
|
|
18
|
+
return True
|
|
19
|
+
return False
|
csv_detective/detection.py
CHANGED
|
@@ -198,6 +198,8 @@ def detect_encoding(csv_file_path: str, verbose: bool = False) -> str:
|
|
|
198
198
|
else:
|
|
199
199
|
binary_file = open(csv_file_path, mode="rb")
|
|
200
200
|
encoding_dict = detect(binary_file.read())
|
|
201
|
+
if not encoding_dict["encoding"]:
|
|
202
|
+
raise ValueError("Could not detect the file's encoding. Consider specifying it in the routine call.")
|
|
201
203
|
if verbose:
|
|
202
204
|
message = f'Detected encoding: "{encoding_dict["encoding"]}"'
|
|
203
205
|
message += f' in {round(time() - start, 3)}s (confidence: {round(encoding_dict["confidence"]*100)}%)'
|
csv_detective/explore_csv.py
CHANGED
|
@@ -19,7 +19,13 @@ import pandas as pd
|
|
|
19
19
|
from csv_detective import detect_fields, detect_labels
|
|
20
20
|
from csv_detective.s3_utils import download_from_minio, upload_to_minio
|
|
21
21
|
from csv_detective.schema_generation import generate_table_schema
|
|
22
|
-
from csv_detective.utils import
|
|
22
|
+
from csv_detective.utils import (
|
|
23
|
+
cast_df,
|
|
24
|
+
display_logs_depending_process_time,
|
|
25
|
+
prepare_output_dict,
|
|
26
|
+
test_col,
|
|
27
|
+
test_label,
|
|
28
|
+
)
|
|
23
29
|
from .detection import (
|
|
24
30
|
detect_engine,
|
|
25
31
|
detect_separator,
|
|
@@ -111,6 +117,7 @@ def routine(
|
|
|
111
117
|
output_profile: bool = False,
|
|
112
118
|
output_schema: bool = False,
|
|
113
119
|
output_df: bool = False,
|
|
120
|
+
cast_json: bool = True,
|
|
114
121
|
verbose: bool = False,
|
|
115
122
|
sheet_name: Union[str, int] = None,
|
|
116
123
|
) -> Union[dict, tuple[dict, pd.DataFrame]]:
|
|
@@ -127,6 +134,7 @@ def routine(
|
|
|
127
134
|
output_profile: whether or not to add the 'profile' field to the output
|
|
128
135
|
output_schema: whether or not to add the 'schema' field to the output (tableschema)
|
|
129
136
|
output_df: whether or not to return the loaded DataFrame along with the analysis report
|
|
137
|
+
cast_json: whether or not to cast json columns into objects (otherwise they are returned as strings)
|
|
130
138
|
verbose: whether or not to print process logs in console
|
|
131
139
|
sheet_name: if reading multi-sheet file (xls-like), which sheet to consider
|
|
132
140
|
skipna: whether to keep NaN (empty cells) for tests
|
|
@@ -276,6 +284,8 @@ def routine(
|
|
|
276
284
|
"json": "json",
|
|
277
285
|
"json_geojson": "json",
|
|
278
286
|
"datetime": "datetime",
|
|
287
|
+
"datetime_iso": "datetime",
|
|
288
|
+
"datetime_rfc822": "datetime",
|
|
279
289
|
"date": "date",
|
|
280
290
|
"latitude": "float",
|
|
281
291
|
"latitude_l93": "float",
|
|
@@ -352,7 +362,12 @@ def routine(
|
|
|
352
362
|
time() - start_routine
|
|
353
363
|
)
|
|
354
364
|
if output_df:
|
|
355
|
-
return analysis,
|
|
365
|
+
return analysis, cast_df(
|
|
366
|
+
df=table,
|
|
367
|
+
columns=analysis["columns"],
|
|
368
|
+
cast_json=cast_json,
|
|
369
|
+
verbose=verbose,
|
|
370
|
+
)
|
|
356
371
|
return analysis
|
|
357
372
|
|
|
358
373
|
|
csv_detective/utils.py
CHANGED
|
@@ -1,7 +1,13 @@
|
|
|
1
|
-
from typing import Callable
|
|
1
|
+
from typing import Callable, Optional, Union
|
|
2
|
+
import json
|
|
2
3
|
import pandas as pd
|
|
3
4
|
import logging
|
|
4
5
|
from time import time
|
|
6
|
+
from datetime import date, datetime
|
|
7
|
+
|
|
8
|
+
from csv_detective.detect_fields.other.booleen import bool_casting
|
|
9
|
+
from csv_detective.detect_fields.other.float import float_casting
|
|
10
|
+
from csv_detective.detect_fields.temp.date import date_casting
|
|
5
11
|
|
|
6
12
|
logging.basicConfig(level=logging.INFO)
|
|
7
13
|
|
|
@@ -210,7 +216,52 @@ def prepare_output_dict(return_table: pd.DataFrame, limited_output: bool):
|
|
|
210
216
|
|
|
211
217
|
def full_word_strictly_inside_string(word: str, string: str):
|
|
212
218
|
return (
|
|
213
|
-
|
|
219
|
+
word == string
|
|
220
|
+
or (" " + word + " " in string)
|
|
214
221
|
or (string.startswith(word + " "))
|
|
215
222
|
or (string.endswith(" " + word))
|
|
216
223
|
)
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def cast(value: str, _type: str) -> Optional[Union[str, float, bool, date, datetime]]:
|
|
227
|
+
if not isinstance(value, str) or not value:
|
|
228
|
+
# None is the current default value in hydra, should we keep this?
|
|
229
|
+
return None
|
|
230
|
+
if _type == "float":
|
|
231
|
+
return float_casting(value)
|
|
232
|
+
if _type == "bool":
|
|
233
|
+
return bool_casting(value)
|
|
234
|
+
if _type == "json":
|
|
235
|
+
# in hydra json are given to postgres as strings, conversion is done by postgres
|
|
236
|
+
return json.loads(value)
|
|
237
|
+
if _type == "date":
|
|
238
|
+
_date = date_casting(value)
|
|
239
|
+
return _date.date() if _date else None
|
|
240
|
+
if _type == "datetime":
|
|
241
|
+
return date_casting(value)
|
|
242
|
+
raise ValueError(f"Unknown type `{_type}`")
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def cast_df(df: pd.DataFrame, columns: dict, cast_json: bool = True, verbose: bool = False) -> pd.DataFrame:
|
|
246
|
+
if verbose:
|
|
247
|
+
start = time()
|
|
248
|
+
output_df = pd.DataFrame()
|
|
249
|
+
for col_name, detection in columns.items():
|
|
250
|
+
if detection["python_type"] == "string" or (detection["python_type"] == "json" and not cast_json):
|
|
251
|
+
# no change if detected type is string
|
|
252
|
+
output_df[col_name] = df[col_name].copy()
|
|
253
|
+
elif detection["python_type"] == "int":
|
|
254
|
+
# to allow having ints and NaN in the same column
|
|
255
|
+
output_df[col_name] = df[col_name].copy().astype(pd.Int64Dtype())
|
|
256
|
+
else:
|
|
257
|
+
output_df[col_name] = df[col_name].apply(
|
|
258
|
+
lambda col: cast(col, _type=detection["python_type"])
|
|
259
|
+
)
|
|
260
|
+
# to save RAM
|
|
261
|
+
del df[col_name]
|
|
262
|
+
if verbose:
|
|
263
|
+
display_logs_depending_process_time(
|
|
264
|
+
f'Casting columns completed in {round(time() - start, 3)}s',
|
|
265
|
+
time() - start,
|
|
266
|
+
)
|
|
267
|
+
return output_df
|
|
@@ -5,6 +5,8 @@
|
|
|
5
5
|
- New function that creates a csv from a list of fields and constraints, or from a TableSchema [#101](https://github.com/datagouv/csv-detective/pull/101)
|
|
6
6
|
- Enable outputing loaded dataframe [#102](https://github.com/datagouv/csv-detective/pull/102)
|
|
7
7
|
- Better naming, hint types and minor refactors [#103](https://github.com/datagouv/csv-detective/pull/103)
|
|
8
|
+
- The returned dataframe has its columns properly cast to the detected types [#104](https://github.com/datagouv/csv-detective/pull/104)
|
|
9
|
+
- Raise an error if the encoding could not be guessed [#106](https://github.com/datagouv/csv-detective/pull/106)
|
|
8
10
|
|
|
9
11
|
## 0.7.4 (2024-11-15)
|
|
10
12
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
2
|
Name: csv_detective
|
|
3
|
-
Version: 0.7.5.
|
|
3
|
+
Version: 0.7.5.dev1139
|
|
4
4
|
Summary: Detect CSV column content
|
|
5
5
|
Home-page: https://github.com/etalab/csv_detective
|
|
6
6
|
Author: Etalab
|
|
@@ -15,6 +15,7 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
|
15
15
|
Description-Content-Type: text/markdown
|
|
16
16
|
License-File: LICENSE.AGPL.txt
|
|
17
17
|
Requires-Dist: boto3==1.34.0
|
|
18
|
+
Requires-Dist: dateparser==1.2.0
|
|
18
19
|
Requires-Dist: faust-cchardet==2.1.19
|
|
19
20
|
Requires-Dist: pandas==2.2.0
|
|
20
21
|
Requires-Dist: pytest==8.3.0
|
|
@@ -29,3 +30,12 @@ Requires-Dist: python-magic==0.4.27
|
|
|
29
30
|
Requires-Dist: frformat==0.4.0
|
|
30
31
|
Requires-Dist: faker==33.0.0
|
|
31
32
|
Requires-Dist: rstr==3.2.2
|
|
33
|
+
Dynamic: author
|
|
34
|
+
Dynamic: author-email
|
|
35
|
+
Dynamic: classifier
|
|
36
|
+
Dynamic: description-content-type
|
|
37
|
+
Dynamic: home-page
|
|
38
|
+
Dynamic: keywords
|
|
39
|
+
Dynamic: license
|
|
40
|
+
Dynamic: requires-dist
|
|
41
|
+
Dynamic: summary
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
csv_detective/__init__.py,sha256=Au4bNJ_Gi6P6o0uO4R56nYdshG7M6-7Rg_xX4whLmLI,143
|
|
2
2
|
csv_detective/cli.py,sha256=Ua7SE1wMH2uFUsTmfumh4nJk7O06okpMd2gvjUDO1II,1048
|
|
3
3
|
csv_detective/create_example.py,sha256=358e7Q7RWMrY_eEo3pUteJWmg2smFb5edJ_AzcQPrqA,8646
|
|
4
|
-
csv_detective/detection.py,sha256=
|
|
5
|
-
csv_detective/explore_csv.py,sha256
|
|
4
|
+
csv_detective/detection.py,sha256=zrP8qvLDvhVXTHi7Ty8G_ga4zfZPjBhuyApqFQkPq2Y,22373
|
|
5
|
+
csv_detective/explore_csv.py,sha256=-AxnM0hGlhrbI4w1wdZwC_w-DYgoOCFpMQ94agIaD5U,17380
|
|
6
6
|
csv_detective/process_text.py,sha256=rsfk66BCmdpsCOd0kDJ8tmqMsEWd-OeBkEisWc4Ej9k,1246
|
|
7
7
|
csv_detective/s3_utils.py,sha256=1cIVdQUYY2ovErbMwp72Gqtqx2bkB8nfVhn-QaOFTT0,1451
|
|
8
8
|
csv_detective/schema_generation.py,sha256=D1Cq4QRajsKtY8EJSwbRTIB-T_Cb2ZpcmYtCrJ6DvJQ,13135
|
|
9
|
-
csv_detective/utils.py,sha256=
|
|
10
|
-
csv_detective/detect_fields/__init__.py,sha256=
|
|
9
|
+
csv_detective/utils.py,sha256=yO9INaLh-QX-FFL2A153AlMqftE04wb0hpN6HJvsKGg,10581
|
|
10
|
+
csv_detective/detect_fields/__init__.py,sha256=NVfE3BQVExgXb-BPbhDvlkM5-0naEVLpZ4aM_OGHYfE,931
|
|
11
11
|
csv_detective/detect_fields/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
12
|
csv_detective/detect_fields/FR/geo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
13
13
|
csv_detective/detect_fields/FR/geo/adresse/__init__.py,sha256=e5JqMNOPxx0Ivju3zAHCGMopZroCpR4vr3DJKlQhMz4,1675
|
|
@@ -55,9 +55,9 @@ csv_detective/detect_fields/geo/latitude_wgs/__init__.py,sha256=ArS6PuYEd0atZwSq
|
|
|
55
55
|
csv_detective/detect_fields/geo/latlon_wgs/__init__.py,sha256=3nlBqFYD4kVSVxw4b9DTPcxW59oL0T3Kj0OxPlyP9og,268
|
|
56
56
|
csv_detective/detect_fields/geo/longitude_wgs/__init__.py,sha256=G7afWOKiGh_Tv7gwDNGt1a4B_A8hkCBkIxn3THDCUFk,330
|
|
57
57
|
csv_detective/detect_fields/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
58
|
-
csv_detective/detect_fields/other/booleen/__init__.py,sha256=
|
|
58
|
+
csv_detective/detect_fields/other/booleen/__init__.py,sha256=1qIEI681iEaPVb9XxmH2ewxDdfmYhHe4-s3MZ6L1A9Q,489
|
|
59
59
|
csv_detective/detect_fields/other/email/__init__.py,sha256=O9tgJmq0O8Q-8iin63NqEEDhlsUJjxFZNaNFM4GZaws,178
|
|
60
|
-
csv_detective/detect_fields/other/float/__init__.py,sha256=
|
|
60
|
+
csv_detective/detect_fields/other/float/__init__.py,sha256=dpEd5ZijmjQ7gqcTnYRoRoLGGJae0RyGwVC6MPra9go,549
|
|
61
61
|
csv_detective/detect_fields/other/int/__init__.py,sha256=QN3kQJLYqLRBiubUK7g4Xq03PlA5wqVwx2pPPIO9FdI,320
|
|
62
62
|
csv_detective/detect_fields/other/json/__init__.py,sha256=DhzyvT12kOqgum89silIu3uoSYXmC_s_AaxLtXAD4eU,540
|
|
63
63
|
csv_detective/detect_fields/other/mongo_object_id/__init__.py,sha256=7fcrHsOZAqXp2_N0IjPskYJ_qi4xRlo9iyNNDQVLzsU,156
|
|
@@ -65,7 +65,8 @@ csv_detective/detect_fields/other/twitter/__init__.py,sha256=qbwLKsTBRFQ4PyTNVeE
|
|
|
65
65
|
csv_detective/detect_fields/other/url/__init__.py,sha256=9WaTqCglEsw_lJG_xZsBMdxJXg2yuQ92_fkX6CXWNV0,286
|
|
66
66
|
csv_detective/detect_fields/other/uuid/__init__.py,sha256=3-z0fDax29SJc57zPjNGR6DPICJu6gfuNGC5L3jh4d0,223
|
|
67
67
|
csv_detective/detect_fields/temp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
68
|
-
csv_detective/detect_fields/temp/date/__init__.py,sha256=
|
|
68
|
+
csv_detective/detect_fields/temp/date/__init__.py,sha256=aFP1feFWFhCpR6Q9s_4BBwWxFtwFiMXY1iduSeQIjdA,943
|
|
69
|
+
csv_detective/detect_fields/temp/datetime/__init__.py,sha256=Ykwhk2ospjY9P0KOG0AitgqN0sld6UmhOlbMz_XGQzQ,597
|
|
69
70
|
csv_detective/detect_fields/temp/datetime_iso/__init__.py,sha256=DOfli-A7gPlZmiV2J6Ka5_yDUCaOgxis29LET_tfhA4,444
|
|
70
71
|
csv_detective/detect_fields/temp/datetime_rfc822/__init__.py,sha256=JtUzg3BXYd-XJMLGxQ0P1OAJGOQ7DlYMD4fCU9yndg0,511
|
|
71
72
|
csv_detective/detect_fields/temp/year/__init__.py,sha256=RjsiIHoplnI4Odi5587TzRhKTDT-FTqGOBpdartuShA,194
|
|
@@ -122,22 +123,22 @@ csv_detective/detect_labels/other/twitter/__init__.py,sha256=D8G4vGsFL9a99OJz-03
|
|
|
122
123
|
csv_detective/detect_labels/other/url/__init__.py,sha256=vqUQvn5o6JZU8iRsSG3AYqggjlhzagozVYWwpuSReV8,1202
|
|
123
124
|
csv_detective/detect_labels/other/uuid/__init__.py,sha256=OdMUxqvqMdGaY5nph7CbIF_Q0LSxljxE72kCMT4m-Zk,931
|
|
124
125
|
csv_detective/detect_labels/temp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
125
|
-
csv_detective/detect_labels/temp/date/__init__.py,sha256
|
|
126
|
+
csv_detective/detect_labels/temp/date/__init__.py,sha256=-R7VqlryozelSn4wH_7w9x6ks77DP1kw2XMBYSLrzXE,1322
|
|
126
127
|
csv_detective/detect_labels/temp/datetime_iso/__init__.py,sha256=Ih9l56nBcdmGLyWDavVUWuUUuVZBz9QUDE1hHzADvVg,1157
|
|
127
128
|
csv_detective/detect_labels/temp/datetime_rfc822/__init__.py,sha256=DQ_h4uDW1e6qu2rATEhgGKw6O-vVi7HbDhbEDDCT9uY,1175
|
|
128
129
|
csv_detective/detect_labels/temp/year/__init__.py,sha256=zPF_mvhzhXMAlHPAskS8mhuxjLj2AlKpV4ss8Q4tDms,1150
|
|
129
|
-
csv_detective-0.7.5.
|
|
130
|
-
csv_detective-0.7.5.
|
|
131
|
-
csv_detective-0.7.5.
|
|
130
|
+
csv_detective-0.7.5.dev1139.data/data/share/csv_detective/CHANGELOG.md,sha256=jgcSpxGkuEdOQtQ08tRFflkCNuHstGFHN_29Tv6M1dE,7176
|
|
131
|
+
csv_detective-0.7.5.dev1139.data/data/share/csv_detective/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
|
|
132
|
+
csv_detective-0.7.5.dev1139.data/data/share/csv_detective/README.md,sha256=Qr8xRXc-dxQ-tdXCpCTCKp1Uliqq84r0UOlPRNuGCpI,9506
|
|
132
133
|
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
133
134
|
tests/test_example.py,sha256=0NfChooJQlFxTo2nY5FOQIcsK4zzWA_SBmt2LwVQovY,2014
|
|
134
|
-
tests/test_fields.py,sha256=
|
|
135
|
-
tests/test_file.py,sha256=
|
|
135
|
+
tests/test_fields.py,sha256=_96htvTzvM7u-W57RpOBbsacWirIm4R36PP7JhPEaYQ,11123
|
|
136
|
+
tests/test_file.py,sha256=HO-Zqv0ZDFy3d0ZrpjWQPXBrwgUmzesseoEofy8G2UU,7529
|
|
136
137
|
tests/test_labels.py,sha256=6MOKrGznkwU5fjZ_3oiB6Scmb480Eu-9geBJs0UDLds,159
|
|
137
138
|
tests/test_structure.py,sha256=SVsnluVoIIprYw_67I1_gB3cp9m1wlO8C7SpdsLW8cM,1161
|
|
138
|
-
csv_detective-0.7.5.
|
|
139
|
-
csv_detective-0.7.5.
|
|
140
|
-
csv_detective-0.7.5.
|
|
141
|
-
csv_detective-0.7.5.
|
|
142
|
-
csv_detective-0.7.5.
|
|
143
|
-
csv_detective-0.7.5.
|
|
139
|
+
csv_detective-0.7.5.dev1139.dist-info/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
|
|
140
|
+
csv_detective-0.7.5.dev1139.dist-info/METADATA,sha256=L-WEqkw3fze2JUmOVwfqV1Ttgsf5lhi_WqEULi0AKkA,1364
|
|
141
|
+
csv_detective-0.7.5.dev1139.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
|
142
|
+
csv_detective-0.7.5.dev1139.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
|
|
143
|
+
csv_detective-0.7.5.dev1139.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
|
|
144
|
+
csv_detective-0.7.5.dev1139.dist-info/RECORD,,
|
tests/test_fields.py
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
import pandas as pd
|
|
2
2
|
from numpy import random
|
|
3
|
+
import pytest
|
|
4
|
+
from datetime import date as _date, datetime as _datetime
|
|
3
5
|
|
|
4
6
|
from csv_detective.detect_fields.FR.geo import (
|
|
5
7
|
adresse,
|
|
@@ -46,6 +48,7 @@ from csv_detective.detection import (
|
|
|
46
48
|
detetect_categorical_variable,
|
|
47
49
|
)
|
|
48
50
|
from csv_detective.explore_csv import return_all_tests
|
|
51
|
+
from csv_detective.utils import cast
|
|
49
52
|
|
|
50
53
|
|
|
51
54
|
def test_all_tests_return_bool():
|
|
@@ -504,3 +507,19 @@ def test_match_float():
|
|
|
504
507
|
def test_not_match_float():
|
|
505
508
|
for val in ["01053", "01053.89", "1e3", "123_456", "123_456.78", "+35", "+35.9"]:
|
|
506
509
|
assert not test_float._is(val)
|
|
510
|
+
|
|
511
|
+
|
|
512
|
+
@pytest.mark.parametrize(
|
|
513
|
+
"args",
|
|
514
|
+
(
|
|
515
|
+
("1.9", "float", float),
|
|
516
|
+
("oui", "bool", bool),
|
|
517
|
+
("[1, 2]", "json", list),
|
|
518
|
+
('{"a": 1}', "json", dict),
|
|
519
|
+
("2022-08-01", "date", _date),
|
|
520
|
+
("2024-09-23 17:32:07", "datetime", _datetime),
|
|
521
|
+
),
|
|
522
|
+
)
|
|
523
|
+
def test_cast(args):
|
|
524
|
+
value, detected_type, cast_type = args
|
|
525
|
+
assert isinstance(cast(value, detected_type), cast_type)
|
tests/test_file.py
CHANGED
|
@@ -232,3 +232,31 @@ def test_output_df():
|
|
|
232
232
|
assert isinstance(output, dict)
|
|
233
233
|
assert isinstance(df, pd.DataFrame)
|
|
234
234
|
assert len(df) == 6
|
|
235
|
+
assert df["partly_empty"].dtype == pd.Int64Dtype()
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
@pytest.mark.parametrize(
|
|
239
|
+
"cast_json",
|
|
240
|
+
(
|
|
241
|
+
(True, dict),
|
|
242
|
+
(False, str),
|
|
243
|
+
),
|
|
244
|
+
)
|
|
245
|
+
def test_cast_json(mocked_responses, cast_json):
|
|
246
|
+
cast_json, expected_type = cast_json
|
|
247
|
+
expected_content = 'id,a_simple_dict\n1,{"a": 1}\n2,{"b": 2}\n3,{"c": 3}\n'
|
|
248
|
+
mocked_responses.get(
|
|
249
|
+
'http://example.com/test.csv',
|
|
250
|
+
body=expected_content,
|
|
251
|
+
status=200,
|
|
252
|
+
)
|
|
253
|
+
analysis, df = routine(
|
|
254
|
+
csv_file_path='http://example.com/test.csv',
|
|
255
|
+
num_rows=-1,
|
|
256
|
+
output_profile=False,
|
|
257
|
+
save_results=False,
|
|
258
|
+
output_df=True,
|
|
259
|
+
cast_json=cast_json,
|
|
260
|
+
)
|
|
261
|
+
assert analysis['columns']["a_simple_dict"]["python_type"] == "json"
|
|
262
|
+
assert isinstance(df["a_simple_dict"][0], expected_type)
|
|
File without changes
|
|
File without changes
|
{csv_detective-0.7.5.dev1078.dist-info → csv_detective-0.7.5.dev1139.dist-info}/LICENSE.AGPL.txt
RENAMED
|
File without changes
|
{csv_detective-0.7.5.dev1078.dist-info → csv_detective-0.7.5.dev1139.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{csv_detective-0.7.5.dev1078.dist-info → csv_detective-0.7.5.dev1139.dist-info}/top_level.txt
RENAMED
|
File without changes
|