csv-detective 0.9.3.dev2258__py3-none-any.whl → 0.9.3.dev2348__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/detection/formats.py +12 -15
- csv_detective/explore_csv.py +28 -9
- csv_detective/format.py +67 -0
- csv_detective/formats/__init__.py +9 -0
- csv_detective/{detect_fields/FR/geo/adresse/__init__.py → formats/adresse.py} +116 -100
- csv_detective/{detect_fields/other/booleen/__init__.py → formats/booleen.py} +35 -27
- csv_detective/formats/code_commune_insee.py +26 -0
- csv_detective/{detect_fields/FR/other/code_csp_insee/__init__.py → formats/code_csp_insee.py} +36 -29
- csv_detective/{detect_fields/FR/geo/code_departement/__init__.py → formats/code_departement.py} +29 -15
- csv_detective/formats/code_fantoir.py +21 -0
- csv_detective/{detect_fields/FR/other/code_import/__init__.py → formats/code_import.py} +17 -9
- csv_detective/formats/code_postal.py +25 -0
- csv_detective/formats/code_region.py +22 -0
- csv_detective/formats/code_rna.py +29 -0
- csv_detective/formats/code_waldec.py +17 -0
- csv_detective/{detect_fields/FR/geo/commune/__init__.py → formats/commune.py} +27 -16
- csv_detective/{detect_fields/FR/other/csp_insee/__init__.py → formats/csp_insee.py} +31 -19
- csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
- csv_detective/{detect_fields/temp/date/__init__.py → formats/date.py} +99 -62
- csv_detective/formats/date_fr.py +22 -0
- csv_detective/{detect_fields/temp/datetime_aware/__init__.py → formats/datetime_aware.py} +18 -7
- csv_detective/{detect_fields/temp/datetime_naive/__init__.py → formats/datetime_naive.py} +21 -2
- csv_detective/{detect_fields/temp/datetime_rfc822/__init__.py → formats/datetime_rfc822.py} +24 -18
- csv_detective/formats/departement.py +37 -0
- csv_detective/formats/email.py +28 -0
- csv_detective/{detect_fields/other/float/__init__.py → formats/float.py} +29 -21
- csv_detective/formats/geojson.py +36 -0
- csv_detective/{detect_fields/FR/other/insee_ape700/__init__.py → formats/insee_ape700.py} +31 -19
- csv_detective/{detect_fields/FR/geo/insee_canton/__init__.py → formats/insee_canton.py} +28 -15
- csv_detective/{detect_fields/other/int/__init__.py → formats/int.py} +23 -16
- csv_detective/formats/iso_country_code_alpha2.py +30 -0
- csv_detective/formats/iso_country_code_alpha3.py +30 -0
- csv_detective/formats/iso_country_code_numeric.py +31 -0
- csv_detective/{detect_fields/FR/temp/jour_de_la_semaine/__init__.py → formats/jour_de_la_semaine.py} +41 -25
- csv_detective/{detect_fields/other/json/__init__.py → formats/json.py} +20 -14
- csv_detective/formats/latitude_l93.py +48 -0
- csv_detective/formats/latitude_wgs.py +42 -0
- csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
- csv_detective/formats/latlon_wgs.py +53 -0
- csv_detective/formats/longitude_l93.py +39 -0
- csv_detective/formats/longitude_wgs.py +32 -0
- csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
- csv_detective/formats/lonlat_wgs.py +36 -0
- csv_detective/{detect_fields/FR/temp/mois_de_annee/__init__.py → formats/mois_de_lannee.py} +48 -39
- csv_detective/formats/money.py +18 -0
- csv_detective/formats/mongo_object_id.py +14 -0
- csv_detective/formats/pays.py +35 -0
- csv_detective/formats/percent.py +16 -0
- csv_detective/{detect_fields/FR/geo/region/__init__.py → formats/region.py} +70 -50
- csv_detective/formats/sexe.py +17 -0
- csv_detective/{detect_fields/FR/other/siren/__init__.py → formats/siren.py} +37 -20
- csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -31
- csv_detective/formats/tel_fr.py +36 -0
- csv_detective/formats/uai.py +36 -0
- csv_detective/formats/url.py +45 -0
- csv_detective/formats/username.py +14 -0
- csv_detective/formats/uuid.py +16 -0
- csv_detective/formats/year.py +28 -0
- csv_detective/output/__init__.py +3 -4
- csv_detective/output/dataframe.py +3 -3
- csv_detective/output/profile.py +2 -3
- csv_detective/output/schema.py +2 -2
- csv_detective/parsing/columns.py +35 -50
- csv_detective/parsing/csv.py +2 -2
- csv_detective/parsing/load.py +4 -5
- csv_detective/validate.py +9 -4
- {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2348.dist-info}/METADATA +6 -5
- csv_detective-0.9.3.dev2348.dist-info/RECORD +102 -0
- tests/test_fields.py +39 -364
- tests/test_file.py +1 -1
- tests/test_labels.py +5 -3
- tests/test_structure.py +40 -36
- csv_detective/detect_fields/FR/__init__.py +0 -0
- csv_detective/detect_fields/FR/geo/__init__.py +0 -0
- csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -9
- csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -9
- csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -9
- csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -10
- csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -16
- csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -19
- csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -19
- csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -16
- csv_detective/detect_fields/FR/other/__init__.py +0 -0
- csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
- csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -9
- csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -9
- csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
- csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -11
- csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -17
- csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
- csv_detective/detect_fields/FR/temp/__init__.py +0 -0
- csv_detective/detect_fields/__init__.py +0 -112
- csv_detective/detect_fields/geo/__init__.py +0 -0
- csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
- csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
- csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
- csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -18
- csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
- csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -16
- csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
- csv_detective/detect_fields/geo/lonlat_wgs/__init__.py +0 -16
- csv_detective/detect_fields/other/__init__.py +0 -0
- csv_detective/detect_fields/other/email/__init__.py +0 -10
- csv_detective/detect_fields/other/money/__init__.py +0 -11
- csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
- csv_detective/detect_fields/other/percent/__init__.py +0 -9
- csv_detective/detect_fields/other/twitter/__init__.py +0 -8
- csv_detective/detect_fields/other/url/__init__.py +0 -14
- csv_detective/detect_fields/other/uuid/__init__.py +0 -10
- csv_detective/detect_fields/temp/__init__.py +0 -0
- csv_detective/detect_fields/temp/year/__init__.py +0 -10
- csv_detective/detect_labels/FR/__init__.py +0 -0
- csv_detective/detect_labels/FR/geo/__init__.py +0 -0
- csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -15
- csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -17
- csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -15
- csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -12
- csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -16
- csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -14
- csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -12
- csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -22
- csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -13
- csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -30
- csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -30
- csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -21
- csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -21
- csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -20
- csv_detective/detect_labels/FR/geo/region/__init__.py +0 -20
- csv_detective/detect_labels/FR/other/__init__.py +0 -0
- csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -8
- csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -13
- csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -8
- csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -13
- csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -9
- csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -15
- csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -8
- csv_detective/detect_labels/FR/other/siren/__init__.py +0 -17
- csv_detective/detect_labels/FR/other/siret/__init__.py +0 -16
- csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -20
- csv_detective/detect_labels/FR/other/uai/__init__.py +0 -25
- csv_detective/detect_labels/FR/temp/__init__.py +0 -0
- csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -16
- csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -8
- csv_detective/detect_labels/__init__.py +0 -94
- csv_detective/detect_labels/geo/__init__.py +0 -0
- csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -16
- csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -16
- csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -16
- csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -17
- csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -30
- csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -39
- csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -21
- csv_detective/detect_labels/geo/lonlat_wgs/__init__.py +0 -23
- csv_detective/detect_labels/other/__init__.py +0 -0
- csv_detective/detect_labels/other/booleen/__init__.py +0 -8
- csv_detective/detect_labels/other/email/__init__.py +0 -20
- csv_detective/detect_labels/other/float/__init__.py +0 -8
- csv_detective/detect_labels/other/int/__init__.py +0 -8
- csv_detective/detect_labels/other/money/__init__.py +0 -8
- csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -8
- csv_detective/detect_labels/other/twitter/__init__.py +0 -8
- csv_detective/detect_labels/other/url/__init__.py +0 -23
- csv_detective/detect_labels/other/uuid/__init__.py +0 -8
- csv_detective/detect_labels/temp/__init__.py +0 -0
- csv_detective/detect_labels/temp/date/__init__.py +0 -28
- csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -19
- csv_detective/detect_labels/temp/year/__init__.py +0 -19
- csv_detective/load_tests.py +0 -59
- csv_detective-0.9.3.dev2258.dist-info/RECORD +0 -166
- /csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0
- {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2348.dist-info}/WHEEL +0 -0
- {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2348.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2348.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2348.dist-info}/top_level.txt +0 -0
csv_detective/output/__init__.py
CHANGED
|
@@ -4,12 +4,11 @@ from typing import Iterator
|
|
|
4
4
|
|
|
5
5
|
import pandas as pd
|
|
6
6
|
|
|
7
|
+
from csv_detective.output.dataframe import cast_df_chunks
|
|
8
|
+
from csv_detective.output.profile import create_profile
|
|
9
|
+
from csv_detective.output.schema import generate_table_schema
|
|
7
10
|
from csv_detective.utils import is_url
|
|
8
11
|
|
|
9
|
-
from .dataframe import cast_df_chunks
|
|
10
|
-
from .profile import create_profile
|
|
11
|
-
from .schema import generate_table_schema
|
|
12
|
-
|
|
13
12
|
|
|
14
13
|
def generate_output(
|
|
15
14
|
table: pd.DataFrame,
|
|
@@ -5,9 +5,9 @@ from typing import Iterator
|
|
|
5
5
|
|
|
6
6
|
import pandas as pd
|
|
7
7
|
|
|
8
|
-
from csv_detective.
|
|
9
|
-
from csv_detective.
|
|
10
|
-
from csv_detective.
|
|
8
|
+
from csv_detective.formats.booleen import bool_casting
|
|
9
|
+
from csv_detective.formats.date import date_casting
|
|
10
|
+
from csv_detective.formats.float import float_casting
|
|
11
11
|
from csv_detective.parsing.csv import CHUNK_SIZE
|
|
12
12
|
from csv_detective.utils import display_logs_depending_process_time
|
|
13
13
|
|
csv_detective/output/profile.py
CHANGED
|
@@ -1,12 +1,11 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from collections import defaultdict
|
|
3
3
|
from time import time
|
|
4
|
-
from typing import Optional
|
|
5
4
|
|
|
6
5
|
import numpy as np
|
|
7
6
|
import pandas as pd
|
|
8
7
|
|
|
9
|
-
from csv_detective.
|
|
8
|
+
from csv_detective.formats.float import float_casting
|
|
10
9
|
from csv_detective.utils import cast_prevent_nan, display_logs_depending_process_time
|
|
11
10
|
|
|
12
11
|
|
|
@@ -17,7 +16,7 @@ def create_profile(
|
|
|
17
16
|
limited_output: bool = True,
|
|
18
17
|
cast_json: bool = True,
|
|
19
18
|
verbose: bool = False,
|
|
20
|
-
_col_values:
|
|
19
|
+
_col_values: dict[str, pd.Series] | None = None,
|
|
21
20
|
) -> dict:
|
|
22
21
|
if verbose:
|
|
23
22
|
start = time()
|
csv_detective/output/schema.py
CHANGED
|
@@ -103,7 +103,7 @@ def get_validata_type(format: str) -> str:
|
|
|
103
103
|
"datetime_aware": "datetime",
|
|
104
104
|
"datetime_naive": "datetime",
|
|
105
105
|
"datetime_rfc822": "datetime",
|
|
106
|
-
"
|
|
106
|
+
"geojson": "geojson",
|
|
107
107
|
"latitude": "number",
|
|
108
108
|
"latitude_l93": "number",
|
|
109
109
|
"latitude_wgs": "number",
|
|
@@ -150,7 +150,7 @@ def get_example(format: str) -> str:
|
|
|
150
150
|
"iso_country_code_alpha3": "FRA",
|
|
151
151
|
"iso_country_code_numeric": 250,
|
|
152
152
|
"jour_de_la_semaine": "lundi",
|
|
153
|
-
"
|
|
153
|
+
"geojson": '{"type": "Point", "coordinates": [0, 0]}',
|
|
154
154
|
"latitude": 42.42,
|
|
155
155
|
"latitude_l93": 6037008,
|
|
156
156
|
"latitude_wgs": 42.42,
|
csv_detective/parsing/columns.py
CHANGED
|
@@ -5,6 +5,7 @@ from typing import Callable
|
|
|
5
5
|
import pandas as pd
|
|
6
6
|
from more_itertools import peekable
|
|
7
7
|
|
|
8
|
+
from csv_detective.format import Format
|
|
8
9
|
from csv_detective.parsing.csv import CHUNK_SIZE
|
|
9
10
|
from csv_detective.utils import display_logs_depending_process_time
|
|
10
11
|
|
|
@@ -14,15 +15,13 @@ MAX_NUMBER_CATEGORICAL_VALUES = 25
|
|
|
14
15
|
|
|
15
16
|
def test_col_val(
|
|
16
17
|
serie: pd.Series,
|
|
17
|
-
|
|
18
|
-
proportion: float = 0.9,
|
|
18
|
+
format: Format,
|
|
19
19
|
skipna: bool = True,
|
|
20
20
|
limited_output: bool = False,
|
|
21
21
|
verbose: bool = False,
|
|
22
22
|
) -> float:
|
|
23
23
|
"""Tests values of the serie using test_func.
|
|
24
|
-
- skipna : if True indicates that NaNs are
|
|
25
|
-
- proportion : indicates the proportion of values that have to pass the test
|
|
24
|
+
- skipna : if True indicates that NaNs are considered True
|
|
26
25
|
for the serie to be detected as a certain format
|
|
27
26
|
"""
|
|
28
27
|
if verbose:
|
|
@@ -34,28 +33,28 @@ def test_col_val(
|
|
|
34
33
|
|
|
35
34
|
try:
|
|
36
35
|
if skipna:
|
|
37
|
-
serie = serie[serie.notnull()]
|
|
36
|
+
serie = serie.loc[serie.notnull()]
|
|
38
37
|
ser_len = len(serie)
|
|
39
38
|
if ser_len == 0:
|
|
40
39
|
# being here means the whole column is NaN, so if skipna it's a pass
|
|
41
40
|
return 1.0 if skipna else 0.0
|
|
42
41
|
if not limited_output:
|
|
43
|
-
result = apply_test_func(serie,
|
|
44
|
-
return result if result >= proportion else 0.0
|
|
42
|
+
result = apply_test_func(serie, format.func, ser_len).sum() / ser_len
|
|
43
|
+
return result if result >= format.proportion else 0.0
|
|
45
44
|
else:
|
|
46
|
-
if proportion == 1:
|
|
45
|
+
if format.proportion == 1:
|
|
47
46
|
# early stops (1 then 5 rows) to not waste time if directly unsuccessful
|
|
48
47
|
for _range in [
|
|
49
48
|
min(1, ser_len),
|
|
50
49
|
min(5, ser_len),
|
|
51
50
|
ser_len,
|
|
52
51
|
]:
|
|
53
|
-
if not all(apply_test_func(serie,
|
|
52
|
+
if not all(apply_test_func(serie, format.func, _range)):
|
|
54
53
|
return 0.0
|
|
55
54
|
return 1.0
|
|
56
55
|
else:
|
|
57
|
-
result = apply_test_func(serie,
|
|
58
|
-
return result if result >= proportion else 0.0
|
|
56
|
+
result = apply_test_func(serie, format.func, ser_len).sum() / ser_len
|
|
57
|
+
return result if result >= format.proportion else 0.0
|
|
59
58
|
finally:
|
|
60
59
|
if verbose and time() - start > 3:
|
|
61
60
|
display_logs_depending_process_time(
|
|
@@ -64,42 +63,27 @@ def test_col_val(
|
|
|
64
63
|
)
|
|
65
64
|
|
|
66
65
|
|
|
67
|
-
def test_col_label(
|
|
68
|
-
label: str, test_func: Callable, proportion: float = 1, limited_output: bool = False
|
|
69
|
-
):
|
|
70
|
-
"""Tests label (from header) using test_func.
|
|
71
|
-
- proportion : indicates the minimum score to pass the test for the serie
|
|
72
|
-
to be detected as a certain format
|
|
73
|
-
"""
|
|
74
|
-
if not limited_output:
|
|
75
|
-
return test_func(label)
|
|
76
|
-
else:
|
|
77
|
-
result = test_func(label)
|
|
78
|
-
return result if result >= proportion else 0
|
|
79
|
-
|
|
80
|
-
|
|
81
66
|
def test_col(
|
|
82
67
|
table: pd.DataFrame,
|
|
83
|
-
|
|
68
|
+
formats: dict[str, Format],
|
|
84
69
|
limited_output: bool,
|
|
85
70
|
skipna: bool = True,
|
|
86
71
|
verbose: bool = False,
|
|
87
72
|
):
|
|
88
73
|
if verbose:
|
|
89
74
|
start = time()
|
|
90
|
-
logging.info("Testing columns to get
|
|
75
|
+
logging.info("Testing columns to get formats")
|
|
91
76
|
return_table = pd.DataFrame(columns=table.columns)
|
|
92
|
-
for idx, (
|
|
77
|
+
for idx, (label, format) in enumerate(formats.items()):
|
|
93
78
|
if verbose:
|
|
94
79
|
start_type = time()
|
|
95
|
-
logging.info(f"\t- Starting with
|
|
80
|
+
logging.info(f"\t- Starting with format '{label}'")
|
|
96
81
|
# improvement lead : put the longest tests behind and make them only if previous tests not satisfactory
|
|
97
82
|
# => the following needs to change, "apply" means all columns are tested for one type at once
|
|
98
|
-
return_table.loc[
|
|
83
|
+
return_table.loc[label] = table.apply(
|
|
99
84
|
lambda serie: test_col_val(
|
|
100
85
|
serie,
|
|
101
|
-
|
|
102
|
-
attributes["prop"],
|
|
86
|
+
format,
|
|
103
87
|
skipna=skipna,
|
|
104
88
|
limited_output=limited_output,
|
|
105
89
|
verbose=verbose,
|
|
@@ -107,7 +91,7 @@ def test_col(
|
|
|
107
91
|
)
|
|
108
92
|
if verbose:
|
|
109
93
|
display_logs_depending_process_time(
|
|
110
|
-
f'\t> Done with type "{
|
|
94
|
+
f'\t> Done with type "{label}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(formats)})',
|
|
111
95
|
time() - start_type,
|
|
112
96
|
)
|
|
113
97
|
if verbose:
|
|
@@ -118,23 +102,20 @@ def test_col(
|
|
|
118
102
|
|
|
119
103
|
|
|
120
104
|
def test_label(
|
|
121
|
-
columns: list[str],
|
|
105
|
+
columns: list[str], formats: dict[str, Format], limited_output: bool, verbose: bool = False
|
|
122
106
|
):
|
|
123
107
|
if verbose:
|
|
124
108
|
start = time()
|
|
125
109
|
logging.info("Testing labels to get types")
|
|
126
110
|
|
|
127
111
|
return_table = pd.DataFrame(columns=columns)
|
|
128
|
-
for idx, (
|
|
112
|
+
for idx, (label, format) in enumerate(formats.items()):
|
|
129
113
|
if verbose:
|
|
130
114
|
start_type = time()
|
|
131
|
-
return_table.loc[
|
|
132
|
-
test_col_label(col_name, value["func"], value["prop"], limited_output=limited_output)
|
|
133
|
-
for col_name in columns
|
|
134
|
-
]
|
|
115
|
+
return_table.loc[label] = [format.is_valid_label(col_name) for col_name in columns]
|
|
135
116
|
if verbose:
|
|
136
117
|
display_logs_depending_process_time(
|
|
137
|
-
f'\t- Done with type "{
|
|
118
|
+
f'\t- Done with type "{label}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(formats)})',
|
|
138
119
|
time() - start_type,
|
|
139
120
|
)
|
|
140
121
|
if verbose:
|
|
@@ -148,23 +129,28 @@ def test_col_chunks(
|
|
|
148
129
|
table: pd.DataFrame,
|
|
149
130
|
file_path: str,
|
|
150
131
|
analysis: dict,
|
|
151
|
-
|
|
132
|
+
formats: dict[str, Format],
|
|
152
133
|
limited_output: bool,
|
|
153
134
|
skipna: bool = True,
|
|
154
135
|
verbose: bool = False,
|
|
155
136
|
) -> tuple[pd.DataFrame, dict, dict[str, pd.Series]]:
|
|
156
137
|
def build_remaining_tests_per_col(return_table: pd.DataFrame) -> dict[str, list[str]]:
|
|
138
|
+
# returns a dict with the table's columns as keys and the list of remaining format labels to apply
|
|
157
139
|
return {
|
|
158
|
-
col: [
|
|
140
|
+
col: [
|
|
141
|
+
fmt_label
|
|
142
|
+
for fmt_label in return_table.index
|
|
143
|
+
if return_table.loc[fmt_label, col] > 0
|
|
144
|
+
]
|
|
159
145
|
for col in return_table.columns
|
|
160
146
|
}
|
|
161
147
|
|
|
162
148
|
if verbose:
|
|
163
149
|
start = time()
|
|
164
|
-
logging.info("Testing columns to get
|
|
150
|
+
logging.info("Testing columns to get formats on chunks")
|
|
165
151
|
|
|
166
152
|
# analysing the sample to get a first guess
|
|
167
|
-
return_table = test_col(table,
|
|
153
|
+
return_table = test_col(table, formats, limited_output, skipna=skipna, verbose=verbose)
|
|
168
154
|
remaining_tests_per_col = build_remaining_tests_per_col(return_table)
|
|
169
155
|
|
|
170
156
|
# hashing rows to get nb_duplicates
|
|
@@ -217,23 +203,22 @@ def test_col_chunks(
|
|
|
217
203
|
if not any(remaining_tests for remaining_tests in remaining_tests_per_col.values()):
|
|
218
204
|
# no more potential tests to do on any column, early stop
|
|
219
205
|
break
|
|
220
|
-
for col,
|
|
206
|
+
for col, fmt_labels in remaining_tests_per_col.items():
|
|
221
207
|
# testing each column with the tests that are still competing
|
|
222
208
|
# after previous batchs analyses
|
|
223
|
-
for
|
|
209
|
+
for label in fmt_labels:
|
|
224
210
|
batch_col_test = test_col_val(
|
|
225
211
|
batch[col],
|
|
226
|
-
|
|
227
|
-
all_tests[test]["prop"],
|
|
212
|
+
formats[label],
|
|
228
213
|
limited_output=limited_output,
|
|
229
214
|
skipna=skipna,
|
|
230
215
|
)
|
|
231
|
-
return_table.loc[
|
|
216
|
+
return_table.loc[label, col] = (
|
|
232
217
|
# if this batch's column tested 0 then test fails overall
|
|
233
218
|
0
|
|
234
219
|
if batch_col_test == 0
|
|
235
220
|
# otherwise updating the score with weighted average
|
|
236
|
-
else ((return_table.loc[
|
|
221
|
+
else ((return_table.loc[label, col] * idx + batch_col_test) / (idx + 1))
|
|
237
222
|
)
|
|
238
223
|
remaining_tests_per_col = build_remaining_tests_per_col(return_table)
|
|
239
224
|
batch, batch_number = [], batch_number + 1
|
csv_detective/parsing/csv.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from time import time
|
|
3
|
-
from typing import
|
|
3
|
+
from typing import TextIO
|
|
4
4
|
|
|
5
5
|
import pandas as pd
|
|
6
6
|
|
|
@@ -18,7 +18,7 @@ def parse_csv(
|
|
|
18
18
|
skiprows: int,
|
|
19
19
|
random_state: int = 42,
|
|
20
20
|
verbose: bool = False,
|
|
21
|
-
) -> tuple[pd.DataFrame,
|
|
21
|
+
) -> tuple[pd.DataFrame, int | None, int | None]:
|
|
22
22
|
if verbose:
|
|
23
23
|
start = time()
|
|
24
24
|
logging.info("Parsing table")
|
csv_detective/parsing/load.py
CHANGED
|
@@ -12,14 +12,13 @@ from csv_detective.detection.engine import (
|
|
|
12
12
|
)
|
|
13
13
|
from csv_detective.detection.headers import detect_headers
|
|
14
14
|
from csv_detective.detection.separator import detect_separator
|
|
15
|
-
from csv_detective.
|
|
16
|
-
|
|
17
|
-
from .
|
|
18
|
-
from .csv import parse_csv
|
|
19
|
-
from .excel import (
|
|
15
|
+
from csv_detective.parsing.compression import unzip
|
|
16
|
+
from csv_detective.parsing.csv import parse_csv
|
|
17
|
+
from csv_detective.parsing.excel import (
|
|
20
18
|
XLS_LIKE_EXT,
|
|
21
19
|
parse_excel,
|
|
22
20
|
)
|
|
21
|
+
from csv_detective.utils import is_url
|
|
23
22
|
|
|
24
23
|
|
|
25
24
|
def load_file(
|
csv_detective/validate.py
CHANGED
|
@@ -2,13 +2,13 @@ import logging
|
|
|
2
2
|
|
|
3
3
|
import pandas as pd
|
|
4
4
|
|
|
5
|
-
from csv_detective.
|
|
5
|
+
from csv_detective.format import FormatsManager
|
|
6
6
|
from csv_detective.parsing.columns import MAX_NUMBER_CATEGORICAL_VALUES, test_col_val
|
|
7
7
|
|
|
8
8
|
VALIDATION_CHUNK_SIZE = int(1e5)
|
|
9
9
|
logging.basicConfig(level=logging.INFO)
|
|
10
10
|
|
|
11
|
-
|
|
11
|
+
formats = FormatsManager().formats
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
def validate(
|
|
@@ -19,6 +19,12 @@ def validate(
|
|
|
19
19
|
) -> tuple[bool, pd.DataFrame | None, dict | None, dict[str, pd.Series] | None]:
|
|
20
20
|
"""
|
|
21
21
|
Verify if the given file has the same fields and types as in the given analysis.
|
|
22
|
+
|
|
23
|
+
Args:
|
|
24
|
+
file_path: the path of the file to validate
|
|
25
|
+
previous_analysis: the previous analysis to validate against (expected in the same structure as the output of the routine)
|
|
26
|
+
verbose: whether the code displays the steps it's going through
|
|
27
|
+
skipna: whether to ignore NaN values in the checks
|
|
22
28
|
"""
|
|
23
29
|
try:
|
|
24
30
|
if previous_analysis.get("separator"):
|
|
@@ -101,8 +107,7 @@ def validate(
|
|
|
101
107
|
continue
|
|
102
108
|
test_result: float = test_col_val(
|
|
103
109
|
serie=chunk[col_name],
|
|
104
|
-
|
|
105
|
-
proportion=tests[args["format"]]["prop"],
|
|
110
|
+
format=formats[args["format"]],
|
|
106
111
|
skipna=skipna,
|
|
107
112
|
)
|
|
108
113
|
if not bool(test_result):
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: csv-detective
|
|
3
|
-
Version: 0.9.3.
|
|
3
|
+
Version: 0.9.3.dev2348
|
|
4
4
|
Summary: Detect tabular files column content
|
|
5
5
|
Author-email: Etalab <opendatateam@data.gouv.fr>
|
|
6
6
|
License: MIT
|
|
@@ -33,7 +33,7 @@ Dynamic: license-file
|
|
|
33
33
|
|
|
34
34
|
This is a package to **automatically detect column content in tabular files**. The script reads either the whole file or the first few rows and performs various checks (regex, casting, comparison with official lists...) to see for each column if it matches with various content types.
|
|
35
35
|
|
|
36
|
-
Currently supported file types: csv, xls, xlsx, ods.
|
|
36
|
+
Currently supported file types: csv(.gz), xls, xlsx, ods.
|
|
37
37
|
|
|
38
38
|
You can also directly feed the URL of a remote file (from data.gouv.fr for instance).
|
|
39
39
|
|
|
@@ -65,7 +65,8 @@ inspection_results = routine(
|
|
|
65
65
|
num_rows=-1, # Value -1 will analyze all lines of your file, you can change with the number of lines you wish to analyze
|
|
66
66
|
save_results=False, # Default False. If True, it will save result output into the same directory as the analyzed file, using the same name as your file and .json extension
|
|
67
67
|
output_profile=True, # Default False. If True, returned dict will contain a property "profile" indicating profile (min, max, mean, tops...) of every column of your csv
|
|
68
|
-
output_schema=True, # Default False. If True, returned dict will contain a property "schema" containing basic [tableschema](https://specs.frictionlessdata.io/table-schema/) of your file. This can be use to validate structure of other csv which should match same structure.
|
|
68
|
+
output_schema=True, # Default False. If True, returned dict will contain a property "schema" containing basic [tableschema](https://specs.frictionlessdata.io/table-schema/) of your file. This can be used to validate the structure of other csv files which should match the same structure.
|
|
69
|
+
tags=["fr"], # Default None. If set as a list of strings, only performs checks related to the specified tags (you can see the available tags with FormatsManager().available_tags())
|
|
69
70
|
)
|
|
70
71
|
```
|
|
71
72
|
|
|
@@ -73,7 +74,7 @@ inspection_results = routine(
|
|
|
73
74
|
|
|
74
75
|
### Output
|
|
75
76
|
|
|
76
|
-
The program creates a `
|
|
77
|
+
The program creates a `python` dictionary with the following information:
|
|
77
78
|
|
|
78
79
|
```
|
|
79
80
|
{
|
|
@@ -216,7 +217,7 @@ Only the format with highest score is present in the output.
|
|
|
216
217
|
## Improvement suggestions
|
|
217
218
|
|
|
218
219
|
- Smarter refactors
|
|
219
|
-
-
|
|
220
|
+
- Performance improvements
|
|
220
221
|
- Test other ways to load and process data (`pandas` alternatives)
|
|
221
222
|
- Add more and more detection modules...
|
|
222
223
|
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
csv_detective/__init__.py,sha256=zlYElTOp_I2_VG7ZdOTuAu0wuCXSc0cr3sH6gtk2bcg,152
|
|
2
|
+
csv_detective/cli.py,sha256=mu5anmBmaDk52_uZGiA4T37wYZCuV43gZAepjs1Cqzc,1389
|
|
3
|
+
csv_detective/explore_csv.py,sha256=-LCHr7vyT0Q0oLtXeOO8pEevJ6-8Ib9JP3D7nVgZM8o,7090
|
|
4
|
+
csv_detective/format.py,sha256=XX_cSTQc0jlsQq3GUqHi7Cz36AiRrpjrwPmeoOTLMvo,2396
|
|
5
|
+
csv_detective/utils.py,sha256=RJ_zFOJ1DRY8HtDrKPiCdNk5gU6-KwOrOKOyfSkBZZY,1118
|
|
6
|
+
csv_detective/validate.py,sha256=XldlbGkUlPaIh0y4z9iaWlmmahwCrD1900s5Cxlq5wI,5430
|
|
7
|
+
csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
+
csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvcaGd0Go,2887
|
|
9
|
+
csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
|
|
10
|
+
csv_detective/detection/engine.py,sha256=wQeDKpp2DKF-HcS1R8H6GgQyaUgQme4szPtEHgAjBII,1552
|
|
11
|
+
csv_detective/detection/formats.py,sha256=uxmWz7J3btAwaOONIACxiL9vTZ8Iv7NdTSUqAOPQy0o,5381
|
|
12
|
+
csv_detective/detection/headers.py,sha256=95pTL524Sy5PGxyQ03ofFUaamvlmkxTJQe8u6HfzOkU,1051
|
|
13
|
+
csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
|
|
14
|
+
csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
|
|
15
|
+
csv_detective/detection/variables.py,sha256=-QtZOB96z3pWbqnZ-c1RU3yzoYqcO61A0JzeS6JbkxY,3576
|
|
16
|
+
csv_detective/formats/__init__.py,sha256=Egiy29kcG3Oz2eE2maYhD3wP29zOSOWyRlOpGD5LGvU,318
|
|
17
|
+
csv_detective/formats/adresse.py,sha256=jALDpEDAWyAcgqEfNVRg_W1r6XaYuJKD_jAaP2l-bxk,1943
|
|
18
|
+
csv_detective/formats/booleen.py,sha256=AnDDKShkSYpWO4POhwY2V7_C4yPWbmqBu8CJPgQ9Gwc,648
|
|
19
|
+
csv_detective/formats/code_commune_insee.py,sha256=MhwCPVAhwWH-MyaNAIVRNbqKfeNe3oiCpzEGfpHkpJY,504
|
|
20
|
+
csv_detective/formats/code_csp_insee.py,sha256=_JQ-YbnHMenNnwIg1xBmNVqgCa1tLD2hbPN1soODhDk,656
|
|
21
|
+
csv_detective/formats/code_departement.py,sha256=odwVbmktgjEhL-dSFHXuCRVwhkF8bL8G7VlpVTnMY2A,628
|
|
22
|
+
csv_detective/formats/code_fantoir.py,sha256=nFVFYJEP2HHE2TyhR_dhGdPCMLfCROBO_B8wxwQn7T8,366
|
|
23
|
+
csv_detective/formats/code_import.py,sha256=N5NVvnHkRwC7ARHoM77R-2cYSeyNmPoRIn6JL3Fbnjs,346
|
|
24
|
+
csv_detective/formats/code_postal.py,sha256=C6XMkiVTxhMFvfyvJmGp3iwvh722EzMwD_UdqQU4aR0,427
|
|
25
|
+
csv_detective/formats/code_region.py,sha256=VFKh1rGYVYTNWBJZ2_m0xS4rhJlrI_Gr8q8RXuZCr-w,366
|
|
26
|
+
csv_detective/formats/code_rna.py,sha256=WExlQtlAUfOFT4N3MKsMBhZVxTdNzgexFjmXhZdRM1w,512
|
|
27
|
+
csv_detective/formats/code_waldec.py,sha256=kJEJfikbhMfVwtA8hBpup0tpeSFoY_rWrEdXQxgNwhg,297
|
|
28
|
+
csv_detective/formats/commune.py,sha256=oVpwINGqpwMOT43KkasozipJ9hBeoQ5FrKV_wIeVJGE,532
|
|
29
|
+
csv_detective/formats/csp_insee.py,sha256=HE6NK6Sw91mLFeAAKwWUXZZfXX6fiA0zK4RI4YdkUFY,656
|
|
30
|
+
csv_detective/formats/date.py,sha256=X4ohXaFO8cXPJktUSumc3bfdlbDIWEYTG8S9ugVRcsE,2730
|
|
31
|
+
csv_detective/formats/date_fr.py,sha256=3hTw5RommrhcgECFRSt9KgyB9zyi1j4W3UygEHmRgoE,502
|
|
32
|
+
csv_detective/formats/datetime_aware.py,sha256=-1ZBix6vYlYXTvhXrijP-98AN7iPB0x_DbbwU1QjMCI,1470
|
|
33
|
+
csv_detective/formats/datetime_naive.py,sha256=nvA8qT1fb2RmpXN5_Cw9YZA6pC4BryX_B0V-E6O2UbU,1521
|
|
34
|
+
csv_detective/formats/datetime_rfc822.py,sha256=l-SLb34hSuHxC2JQ-9SD-nG38JqzoozwUZiGtoybb0A,601
|
|
35
|
+
csv_detective/formats/departement.py,sha256=UP9UF23BFq_-mIS8N10K5XkoCXwPmDeSoa_7lCAkI4w,768
|
|
36
|
+
csv_detective/formats/email.py,sha256=Qen2EBDYY5TtWXwxrrTGWRrbIybz0ySlVpl4ZRk8pzA,517
|
|
37
|
+
csv_detective/formats/float.py,sha256=tWs_tW64OuacNQENu3uk5GOEVQMQls2iiteFOacQRAQ,832
|
|
38
|
+
csv_detective/formats/geojson.py,sha256=udbBxCBRmb0o6TD8z5ryemfqdinBz6njNJU0XcbfMig,757
|
|
39
|
+
csv_detective/formats/insee_ape700.py,sha256=cLs3Eersqm4wX6oqsqp0Vb3WGPJb2xY5Za_vh0uLgKc,780
|
|
40
|
+
csv_detective/formats/insee_canton.py,sha256=Q5jczsOmh1wPP2KtDkcmqZ7Hlv50Zz9YvPIbxy46qs0,531
|
|
41
|
+
csv_detective/formats/int.py,sha256=ZBUOn50luMtlNKWPyOaMIkY3J4f4hA0MqwcoFtksozU,482
|
|
42
|
+
csv_detective/formats/iso_country_code_alpha2.py,sha256=vIep_j0xuqlXKyuvk8c8GaJC73HuJqKfQ4QzQKHsPc0,613
|
|
43
|
+
csv_detective/formats/iso_country_code_alpha3.py,sha256=yOmm91O8ot6KoUBfss5cqykDfeeMNCwafDAvPNvbufA,668
|
|
44
|
+
csv_detective/formats/iso_country_code_numeric.py,sha256=989ypOmjIrNTV9vFnrBlbpRWQ9whd3Rv9gNasdF_O4g,685
|
|
45
|
+
csv_detective/formats/jour_de_la_semaine.py,sha256=c5QBw9eZfwRs_jL_Ckm95UH-TxlExdFmfZNYW7-_iZI,606
|
|
46
|
+
csv_detective/formats/json.py,sha256=E-s7IHW0q5WgAJVK0I-5Rv7W_RdofROB5wnIXbNegZQ,446
|
|
47
|
+
csv_detective/formats/latitude_l93.py,sha256=GteGpxAht-jeOBLr_deCuEXA_LliVYIAmyr_7jFAWgI,986
|
|
48
|
+
csv_detective/formats/latitude_wgs.py,sha256=HPcFlLzJNqynLugDQ07vO04rOCNBuAabVJEP8FQ89Q0,780
|
|
49
|
+
csv_detective/formats/latitude_wgs_fr_metropole.py,sha256=ruGzQLJPiMV2AlnsBneQIhMzstseddzWA0bDg5gfTG4,791
|
|
50
|
+
csv_detective/formats/latlon_wgs.py,sha256=CbNi4Y-ZgBfNyYi54xwcZGLpEusiLAWVpFP1YgHtI1M,1224
|
|
51
|
+
csv_detective/formats/longitude_l93.py,sha256=vJE4k_DyQOjAruqu_Q0E2sJKZB4mXGGN6bS9WCelsbs,768
|
|
52
|
+
csv_detective/formats/longitude_wgs.py,sha256=DUZCUxJQl53HHVQbXlz_lWXoAZhy3MvJWcPNdiK5cCM,552
|
|
53
|
+
csv_detective/formats/longitude_wgs_fr_metropole.py,sha256=wPlJP06K0BVWfrx1wwEAKK93AKIqvsuw705gKAlWAfQ,550
|
|
54
|
+
csv_detective/formats/lonlat_wgs.py,sha256=BgtTl2ReI0hSQB-7mcR4TDxx-QzvA1B9fiZWxTb5xPI,1005
|
|
55
|
+
csv_detective/formats/mois_de_lannee.py,sha256=4_mmdr9S83utVCgPaK_epkeBm2mhwdUWQEoB_Fhdh2o,759
|
|
56
|
+
csv_detective/formats/money.py,sha256=HpjrmfUmbG8sXF557XbYzQ7TLtpNVRgpC991gGokO8I,414
|
|
57
|
+
csv_detective/formats/mongo_object_id.py,sha256=XsiP4iMxfBBIeuL-4g5bm3jgS6yUMJC2X5CmrEJ40oI,296
|
|
58
|
+
csv_detective/formats/pays.py,sha256=FRvoQwIWiKbm0RC62Sus1X0Y_yJ-cfvdB5RYhkY-4NY,693
|
|
59
|
+
csv_detective/formats/percent.py,sha256=s6eQBMwJr2uyTZMUCK1_ifA0c4Rt2iEe9_E_hKKU_mk,308
|
|
60
|
+
csv_detective/formats/region.py,sha256=CkN7JTsZB1X3bH5xohbtMCxL5BX9MSpith36_1mHMd4,1483
|
|
61
|
+
csv_detective/formats/sexe.py,sha256=yioD4W6EkgUgo74rxn6KLZtN_0XYXtmA4mqVyI7e1mU,387
|
|
62
|
+
csv_detective/formats/siren.py,sha256=ieLe50vdSnkXadcUI8VXnnId9GFGHyIBWVTP6bJtyMo,758
|
|
63
|
+
csv_detective/formats/siret.py,sha256=ehkZgOH-HggN6IgxF4G0DMut_6giZ3gc4g9wMdwZFHQ,997
|
|
64
|
+
csv_detective/formats/tel_fr.py,sha256=yKCqIlqKO2yKucCoCjYfSjqNKfTjqFcmNXxg6THG0WE,624
|
|
65
|
+
csv_detective/formats/uai.py,sha256=uT5gjdTmoFH9QPZdTFkJgiyuKLW0B6KmT6yqHQeaeOU,711
|
|
66
|
+
csv_detective/formats/url.py,sha256=GYE9j_i4kpEQueBXa1Fla0wk8_sc0n230GL3KaIRvwY,932
|
|
67
|
+
csv_detective/formats/username.py,sha256=y38OggfWpEQsGi0JnD9QRM30musa29lO6nz-qybR24U,249
|
|
68
|
+
csv_detective/formats/uuid.py,sha256=ekMEFfzQtz0cLudzmu3AoCM0Yf5pu23qAcFNFgHWJ1A,346
|
|
69
|
+
csv_detective/formats/year.py,sha256=pkAfYPKZdy0g1ZoHGgJNpgTS5y5weGEKXCVMGaxIX8k,472
|
|
70
|
+
csv_detective/formats/data/csp_insee.txt,sha256=kgKaKc-5PHu5U4--ugLjpFyMNtTU9CGdZ9ANU3YAsM4,32879
|
|
71
|
+
csv_detective/formats/data/insee_ape700.txt,sha256=nKgslakENwgE7sPkVNHqR23iXuxF02p9-v5MC2_ntx8,4398
|
|
72
|
+
csv_detective/formats/data/iso_country_code_alpha2.txt,sha256=YyPlDqCdz65ecf4Wes_r0P4rDSJG35niXtjc4MmctXM,1740
|
|
73
|
+
csv_detective/formats/data/iso_country_code_alpha3.txt,sha256=aYqKSohgXuBtcIBfF52f8JWYDdxL_HV_Ol1srGnWBp4,1003
|
|
74
|
+
csv_detective/formats/data/iso_country_code_numeric.txt,sha256=2GtEhuporsHYV-pU4q9kfXU5iOtfW5C0GYBTTKQtnnA,1004
|
|
75
|
+
csv_detective/output/__init__.py,sha256=ALSq_tgX7rGyh--7rmbKz8wHkmResN0h7mNujndow3w,2103
|
|
76
|
+
csv_detective/output/dataframe.py,sha256=TyBc2ObaVUns_ydJWOMKmCYvuj7ddxag0QN3z37g3GE,3219
|
|
77
|
+
csv_detective/output/example.py,sha256=8LWheSBYCeDFfarbnmzBrdCbTd8Alh1U4pfXMKfabOw,8630
|
|
78
|
+
csv_detective/output/profile.py,sha256=VUQp0VJ22dfY4R5TybTpuQW_TOX_rLEp98cOzu-Jf44,4876
|
|
79
|
+
csv_detective/output/schema.py,sha256=XoKljXPXP00DfqPCiz1ydwTHYGAFsvNxnaPCNBuuBIo,10443
|
|
80
|
+
csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
|
|
81
|
+
csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
82
|
+
csv_detective/parsing/columns.py,sha256=CqtZRZYMYDNMopxnqs4eZLSABi-ms61wqv5M9vWJ7iU,9343
|
|
83
|
+
csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
|
|
84
|
+
csv_detective/parsing/csv.py,sha256=0T0gpaXzwJo-sq41IoLQD704GiMUYeDVVASVbat-zWg,1726
|
|
85
|
+
csv_detective/parsing/excel.py,sha256=oAVTuoDccJc4-kVjHXiIPLQx3lq3aZRRZQxkG1c06JQ,6992
|
|
86
|
+
csv_detective/parsing/load.py,sha256=f-8aKiNpy_47qg4Lq-UZUR4NNrbJ_-KEGvcUQZ8cmb0,4317
|
|
87
|
+
csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
|
|
88
|
+
csv_detective-0.9.3.dev2348.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
|
|
89
|
+
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
90
|
+
tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
|
|
91
|
+
tests/test_fields.py,sha256=EWHIKwRSdIh74bBSoozYmZBETf7V03JMWpglyxA0ci0,5616
|
|
92
|
+
tests/test_file.py,sha256=MxJOWwhRG2Xm1_m3C9x8CS9FepjUebET-6EsMi3DvmY,13125
|
|
93
|
+
tests/test_labels.py,sha256=kDPerWC3_J3l1p5I3-MHwz7BmhcuxZAws_wSgHCHUuI,536
|
|
94
|
+
tests/test_structure.py,sha256=XDbviuuvk-0Mu9Y9PI6He2e5hry2dXVJ6yBVwEqF_2o,1043
|
|
95
|
+
tests/test_validation.py,sha256=9djBT-PDhu_563OFgWyE20o-wPEWEIQGXp6Pjh0_MQM,3463
|
|
96
|
+
venv/bin/activate_this.py,sha256=wS7qPipy8R-dS_0ICD8PqqUQ8F-PrtcpiJw2DUPngYM,1287
|
|
97
|
+
venv/bin/runxlrd.py,sha256=YlZMuycM_V_hzNt2yt3FyXPuwouMCmMhvj1oZaBeeuw,16092
|
|
98
|
+
csv_detective-0.9.3.dev2348.dist-info/METADATA,sha256=0sy4vWAscpleL8quByGyJX5tw0OGkJfX_2lHsOetvy4,11038
|
|
99
|
+
csv_detective-0.9.3.dev2348.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
100
|
+
csv_detective-0.9.3.dev2348.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
|
|
101
|
+
csv_detective-0.9.3.dev2348.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
|
|
102
|
+
csv_detective-0.9.3.dev2348.dist-info/RECORD,,
|