csv-detective 0.8.1.dev1703__py3-none-any.whl → 0.8.1.dev1729__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/cli.py +6 -9
- csv_detective/detect_fields/FR/geo/adresse/__init__.py +78 -78
- csv_detective/detect_fields/FR/geo/code_departement/__init__.py +2 -2
- csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -1
- csv_detective/detect_fields/FR/geo/code_region/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/commune/__init__.py +2 -2
- csv_detective/detect_fields/FR/geo/departement/__init__.py +2 -2
- csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +2 -2
- csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +1 -2
- csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +1 -2
- csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +1 -1
- csv_detective/detect_fields/FR/geo/pays/__init__.py +6 -6
- csv_detective/detect_fields/FR/geo/region/__init__.py +6 -4
- csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +15 -14
- csv_detective/detect_fields/FR/other/csp_insee/__init__.py +4 -3
- csv_detective/detect_fields/FR/other/date_fr/__init__.py +3 -3
- csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +4 -3
- csv_detective/detect_fields/FR/other/sexe/__init__.py +2 -2
- csv_detective/detect_fields/FR/other/siren/__init__.py +3 -3
- csv_detective/detect_fields/FR/other/siret/__init__.py +3 -3
- csv_detective/detect_fields/FR/other/tel_fr/__init__.py +3 -3
- csv_detective/detect_fields/FR/other/uai/__init__.py +2 -2
- csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +15 -15
- csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +27 -27
- csv_detective/detect_fields/__init__.py +94 -43
- csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +5 -5
- csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +5 -5
- csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +5 -5
- csv_detective/detect_fields/geo/latitude_wgs/__init__.py +1 -1
- csv_detective/detect_fields/geo/longitude_wgs/__init__.py +1 -1
- csv_detective/detect_fields/other/booleen/__init__.py +1 -1
- csv_detective/detect_fields/other/email/__init__.py +4 -2
- csv_detective/detect_fields/other/int/__init__.py +3 -3
- csv_detective/detect_fields/other/mongo_object_id/__init__.py +2 -2
- csv_detective/detect_fields/other/twitter/__init__.py +2 -2
- csv_detective/detect_fields/other/uuid/__init__.py +4 -5
- csv_detective/detect_fields/temp/date/__init__.py +3 -2
- csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +6 -6
- csv_detective/detect_fields/temp/year/__init__.py +1 -1
- csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -1
- csv_detective/detect_labels/__init__.py +51 -1
- csv_detective/detect_labels/geo/lonlat_wgs/__init__.py +1 -0
- csv_detective/detect_labels/other/mongo_object_id/__init__.py +1 -1
- csv_detective/detection/columns.py +9 -9
- csv_detective/detection/encoding.py +6 -4
- csv_detective/detection/engine.py +6 -5
- csv_detective/detection/formats.py +19 -19
- csv_detective/detection/headers.py +3 -5
- csv_detective/detection/rows.py +1 -1
- csv_detective/detection/variables.py +6 -7
- csv_detective/explore_csv.py +7 -8
- csv_detective/load_tests.py +7 -16
- csv_detective/output/__init__.py +3 -7
- csv_detective/output/dataframe.py +9 -5
- csv_detective/output/example.py +13 -13
- csv_detective/output/profile.py +30 -23
- csv_detective/output/schema.py +20 -23
- csv_detective/output/utils.py +15 -15
- csv_detective/parsing/columns.py +23 -12
- csv_detective/parsing/csv.py +1 -1
- csv_detective/parsing/excel.py +10 -11
- csv_detective/parsing/load.py +11 -8
- csv_detective/parsing/text.py +4 -9
- csv_detective/s3_utils.py +3 -7
- csv_detective/utils.py +4 -2
- csv_detective/validate.py +18 -13
- {csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/METADATA +12 -2
- {csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/RECORD +79 -79
- tests/test_example.py +2 -6
- tests/test_fields.py +16 -10
- tests/test_file.py +10 -9
- tests/test_labels.py +3 -2
- tests/test_structure.py +4 -3
- tests/test_validation.py +9 -6
- {csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/WHEEL +0 -0
- {csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/top_level.txt +0 -0
tests/test_fields.py
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
|
-
from datetime import date as _date
|
|
1
|
+
from datetime import date as _date
|
|
2
|
+
from datetime import datetime as _datetime
|
|
2
3
|
|
|
3
|
-
from numpy import random
|
|
4
4
|
import pandas as pd
|
|
5
5
|
import pytest
|
|
6
|
+
from numpy import random
|
|
6
7
|
|
|
7
8
|
from csv_detective.detect_fields.FR.geo import (
|
|
8
9
|
adresse,
|
|
@@ -23,8 +24,8 @@ from csv_detective.detect_fields.FR.geo import (
|
|
|
23
24
|
)
|
|
24
25
|
from csv_detective.detect_fields.FR.other import (
|
|
25
26
|
code_csp_insee,
|
|
26
|
-
code_rna,
|
|
27
27
|
code_import,
|
|
28
|
+
code_rna,
|
|
28
29
|
code_waldec,
|
|
29
30
|
csp_insee,
|
|
30
31
|
date_fr,
|
|
@@ -56,9 +57,13 @@ from csv_detective.detect_fields.other import (
|
|
|
56
57
|
twitter,
|
|
57
58
|
url,
|
|
58
59
|
uuid,
|
|
59
|
-
|
|
60
|
+
)
|
|
61
|
+
from csv_detective.detect_fields.other import (
|
|
60
62
|
float as test_float,
|
|
61
63
|
)
|
|
64
|
+
from csv_detective.detect_fields.other import (
|
|
65
|
+
int as test_int,
|
|
66
|
+
)
|
|
62
67
|
from csv_detective.detect_fields.temp import (
|
|
63
68
|
date,
|
|
64
69
|
datetime_aware,
|
|
@@ -67,8 +72,8 @@ from csv_detective.detect_fields.temp import (
|
|
|
67
72
|
year,
|
|
68
73
|
)
|
|
69
74
|
from csv_detective.detection.variables import (
|
|
70
|
-
detect_continuous_variable,
|
|
71
75
|
detect_categorical_variable,
|
|
76
|
+
detect_continuous_variable,
|
|
72
77
|
)
|
|
73
78
|
from csv_detective.load_tests import return_all_tests
|
|
74
79
|
from csv_detective.output.dataframe import cast
|
|
@@ -225,10 +230,7 @@ fields = {
|
|
|
225
230
|
True: ["13 fevrier 1996"],
|
|
226
231
|
False: ["44 march 2025"],
|
|
227
232
|
},
|
|
228
|
-
insee_ape700: {
|
|
229
|
-
True: ["0116Z"],
|
|
230
|
-
False: ["0116A"]
|
|
231
|
-
},
|
|
233
|
+
insee_ape700: {True: ["0116Z"], False: ["0116A"]},
|
|
232
234
|
tel_fr: {
|
|
233
235
|
True: ["0134643467"],
|
|
234
236
|
False: ["6625388263", "01288398"],
|
|
@@ -360,7 +362,11 @@ fields = {
|
|
|
360
362
|
},
|
|
361
363
|
datetime_naive: {
|
|
362
364
|
True: ["2021-06-22 10:20:10", "2030/06/22 00:00:00.0028"],
|
|
363
|
-
False: [
|
|
365
|
+
False: [
|
|
366
|
+
"2021-06-22T30:20:10",
|
|
367
|
+
"Sun, 06 Nov 1994 08:49:37 GMT",
|
|
368
|
+
"2021-06-44 10:20:10+02:00",
|
|
369
|
+
],
|
|
364
370
|
},
|
|
365
371
|
datetime_rfc822: {
|
|
366
372
|
True: ["Sun, 06 Nov 1994 08:49:37 GMT"],
|
tests/test_file.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
|
+
from unittest.mock import patch
|
|
2
|
+
|
|
1
3
|
import pandas as pd
|
|
2
4
|
import pytest
|
|
3
5
|
import responses
|
|
4
|
-
from unittest.mock import patch
|
|
5
6
|
|
|
6
7
|
from csv_detective import routine
|
|
7
8
|
|
|
@@ -70,10 +71,10 @@ def test_profile_output_on_file():
|
|
|
70
71
|
[
|
|
71
72
|
c in list(output["profile"]["NUMCOM"].keys())
|
|
72
73
|
for c in [
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
74
|
+
"min",
|
|
75
|
+
"max",
|
|
76
|
+
"mean",
|
|
77
|
+
"std",
|
|
77
78
|
]
|
|
78
79
|
]
|
|
79
80
|
)
|
|
@@ -191,7 +192,7 @@ def mocked_responses():
|
|
|
191
192
|
"params",
|
|
192
193
|
# ideally we'd like to do the same with params_others but pandas.read_excel uses urllib
|
|
193
194
|
# which doesn't support the way we mock the response, TBC
|
|
194
|
-
params_csv + [("a_test_file.csv", {"separator": ";", "header_row_idx": 2, "total_lines": 404})]
|
|
195
|
+
params_csv + [("a_test_file.csv", {"separator": ";", "header_row_idx": 2, "total_lines": 404})],
|
|
195
196
|
)
|
|
196
197
|
def test_urls(mocked_responses, params):
|
|
197
198
|
file_name, checks = params
|
|
@@ -261,17 +262,17 @@ def test_cast_json(mocked_responses, cast_json):
|
|
|
261
262
|
cast_json, expected_type = cast_json
|
|
262
263
|
expected_content = 'id,a_simple_dict\n1,{"a": 1}\n2,{"b": 2}\n3,{"c": 3}\n'
|
|
263
264
|
mocked_responses.get(
|
|
264
|
-
|
|
265
|
+
"http://example.com/test.csv",
|
|
265
266
|
body=expected_content,
|
|
266
267
|
status=200,
|
|
267
268
|
)
|
|
268
269
|
analysis, df = routine(
|
|
269
|
-
file_path=
|
|
270
|
+
file_path="http://example.com/test.csv",
|
|
270
271
|
num_rows=-1,
|
|
271
272
|
output_profile=False,
|
|
272
273
|
save_results=False,
|
|
273
274
|
output_df=True,
|
|
274
275
|
cast_json=cast_json,
|
|
275
276
|
)
|
|
276
|
-
assert analysis[
|
|
277
|
+
assert analysis["columns"]["a_simple_dict"]["python_type"] == "json"
|
|
277
278
|
assert isinstance(df["a_simple_dict"][0], expected_type)
|
tests/test_labels.py
CHANGED
|
@@ -10,13 +10,14 @@ def test_money_labels():
|
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
@pytest.mark.parametrize(
|
|
13
|
-
"params",
|
|
13
|
+
"params",
|
|
14
|
+
[
|
|
14
15
|
("latitude", 1.0),
|
|
15
16
|
("lat", 1.0),
|
|
16
17
|
("coord_lat", 0.5),
|
|
17
18
|
("y", 1.0),
|
|
18
19
|
("nb_cycles", 0.0),
|
|
19
|
-
]
|
|
20
|
+
],
|
|
20
21
|
)
|
|
21
22
|
def test_latitude(params):
|
|
22
23
|
header, expected = params
|
tests/test_structure.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import os
|
|
2
|
-
|
|
3
|
-
from csv_detective import detect_fields, detect_labels
|
|
2
|
+
|
|
3
|
+
from csv_detective import detect_fields, detect_labels # noqa
|
|
4
4
|
from csv_detective.load_tests import return_all_tests
|
|
5
5
|
|
|
6
6
|
|
|
@@ -18,7 +18,8 @@ def tests_conformity():
|
|
|
18
18
|
if "__pycache__" not in dirname:
|
|
19
19
|
subfolders.append(os.path.join(dirpath, dirname))
|
|
20
20
|
final_subfolders = [
|
|
21
|
-
sf
|
|
21
|
+
sf
|
|
22
|
+
for sf in subfolders
|
|
22
23
|
if not any(other_sf.startswith(sf) for other_sf in subfolders if sf != other_sf)
|
|
23
24
|
]
|
|
24
25
|
for f_sf in final_subfolders:
|
tests/test_validation.py
CHANGED
|
@@ -30,13 +30,16 @@ def get_nested_value(source_dict: dict, key_chain: list[str]):
|
|
|
30
30
|
((False, None, None), {"separator": "|"}),
|
|
31
31
|
((False, None, None), {"encoding": "unknown"}),
|
|
32
32
|
((False, None, None), {"header": ["a", "b"]}),
|
|
33
|
-
(
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
"
|
|
37
|
-
|
|
33
|
+
(
|
|
34
|
+
(False, pd.DataFrame, dict),
|
|
35
|
+
{
|
|
36
|
+
"columns.NUMCOM": {
|
|
37
|
+
"python_type": "int",
|
|
38
|
+
"format": "int",
|
|
39
|
+
"score": 1.0,
|
|
40
|
+
},
|
|
38
41
|
},
|
|
39
|
-
|
|
42
|
+
),
|
|
40
43
|
),
|
|
41
44
|
)
|
|
42
45
|
def test_validation(_params):
|
|
File without changes
|
{csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
{csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/top_level.txt
RENAMED
|
File without changes
|