csv-detective 0.10.2549__py3-none-any.whl → 0.10.12674__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/detection/formats.py +11 -38
- csv_detective/explore_csv.py +3 -2
- csv_detective/format.py +11 -4
- csv_detective/formats/adresse.py +9 -9
- csv_detective/formats/binary.py +2 -1
- csv_detective/formats/booleen.py +3 -2
- csv_detective/formats/code_commune_insee.py +12 -10
- csv_detective/formats/code_csp_insee.py +1 -1
- csv_detective/formats/code_departement.py +8 -7
- csv_detective/formats/code_fantoir.py +6 -5
- csv_detective/formats/code_import.py +1 -1
- csv_detective/formats/code_postal.py +10 -9
- csv_detective/formats/code_region.py +7 -6
- csv_detective/formats/code_rna.py +7 -6
- csv_detective/formats/code_waldec.py +1 -1
- csv_detective/formats/commune.py +5 -5
- csv_detective/formats/csp_insee.py +6 -5
- csv_detective/formats/data/insee_ape700.txt +1 -1
- csv_detective/formats/data/iso_country_code_alpha2.txt +153 -397
- csv_detective/formats/data/iso_country_code_alpha3.txt +132 -132
- csv_detective/formats/data/iso_country_code_numeric.txt +94 -94
- csv_detective/formats/date.py +18 -17
- csv_detective/formats/date_fr.py +1 -1
- csv_detective/formats/datetime_aware.py +7 -2
- csv_detective/formats/datetime_naive.py +3 -0
- csv_detective/formats/datetime_rfc822.py +1 -0
- csv_detective/formats/departement.py +15 -15
- csv_detective/formats/email.py +13 -13
- csv_detective/formats/float.py +2 -1
- csv_detective/formats/geojson.py +10 -10
- csv_detective/formats/insee_ape700.py +10 -8
- csv_detective/formats/insee_canton.py +6 -6
- csv_detective/formats/int.py +2 -1
- csv_detective/formats/iso_country_code_alpha2.py +14 -14
- csv_detective/formats/iso_country_code_alpha3.py +6 -13
- csv_detective/formats/iso_country_code_numeric.py +2 -9
- csv_detective/formats/jour_de_la_semaine.py +11 -12
- csv_detective/formats/json.py +6 -0
- csv_detective/formats/latitude_l93.py +8 -22
- csv_detective/formats/latitude_wgs.py +31 -29
- csv_detective/formats/latitude_wgs_fr_metropole.py +7 -30
- csv_detective/formats/latlon_wgs.py +30 -28
- csv_detective/formats/longitude_l93.py +8 -13
- csv_detective/formats/longitude_wgs.py +34 -19
- csv_detective/formats/longitude_wgs_fr_metropole.py +6 -19
- csv_detective/formats/lonlat_wgs.py +12 -11
- csv_detective/formats/mois_de_lannee.py +1 -1
- csv_detective/formats/money.py +1 -1
- csv_detective/formats/mongo_object_id.py +1 -1
- csv_detective/formats/pays.py +11 -13
- csv_detective/formats/percent.py +1 -1
- csv_detective/formats/region.py +13 -13
- csv_detective/formats/sexe.py +1 -1
- csv_detective/formats/siren.py +9 -10
- csv_detective/formats/siret.py +9 -9
- csv_detective/formats/tel_fr.py +7 -13
- csv_detective/formats/uai.py +17 -18
- csv_detective/formats/url.py +16 -16
- csv_detective/formats/username.py +1 -1
- csv_detective/formats/uuid.py +1 -1
- csv_detective/formats/year.py +7 -12
- csv_detective/output/dataframe.py +6 -1
- csv_detective/output/profile.py +5 -1
- csv_detective/parsing/text.py +13 -12
- {csv_detective-0.10.2549.dist-info → csv_detective-0.10.12674.dist-info}/METADATA +2 -2
- csv_detective-0.10.12674.dist-info/RECORD +92 -0
- {csv_detective-0.10.2549.dist-info → csv_detective-0.10.12674.dist-info}/WHEEL +1 -1
- csv_detective-0.10.2549.dist-info/RECORD +0 -92
- {csv_detective-0.10.2549.dist-info → csv_detective-0.10.12674.dist-info}/entry_points.txt +0 -0
|
@@ -82,22 +82,7 @@ def detect_formats(
|
|
|
82
82
|
# To reduce false positives: ensure these formats are detected only if the label yields
|
|
83
83
|
# a detection (skipping the ones that have been excluded by the users).
|
|
84
84
|
formats_with_mandatory_label = [
|
|
85
|
-
f
|
|
86
|
-
for f in [
|
|
87
|
-
"code_departement",
|
|
88
|
-
"code_commune_insee",
|
|
89
|
-
"code_postal",
|
|
90
|
-
"code_fantoir",
|
|
91
|
-
"latitude_wgs",
|
|
92
|
-
"longitude_wgs",
|
|
93
|
-
"latitude_wgs_fr_metropole",
|
|
94
|
-
"longitude_wgs_fr_metropole",
|
|
95
|
-
"latitude_l93",
|
|
96
|
-
"longitude_l93",
|
|
97
|
-
"siren",
|
|
98
|
-
"siret",
|
|
99
|
-
]
|
|
100
|
-
if f in scores_table.index
|
|
85
|
+
f for f in fmtm.get_formats_with_mandatory_label() if f in scores_table.index
|
|
101
86
|
]
|
|
102
87
|
scores_table.loc[formats_with_mandatory_label, :] = np.where(
|
|
103
88
|
scores_table_labels.loc[formats_with_mandatory_label, :],
|
|
@@ -106,32 +91,16 @@ def detect_formats(
|
|
|
106
91
|
)
|
|
107
92
|
analysis["columns"] = prepare_output_dict(scores_table, limited_output)
|
|
108
93
|
|
|
109
|
-
metier_to_python_type = {
|
|
110
|
-
"booleen": "bool",
|
|
111
|
-
"int": "int",
|
|
112
|
-
"float": "float",
|
|
113
|
-
"string": "string",
|
|
114
|
-
"json": "json",
|
|
115
|
-
"geojson": "json",
|
|
116
|
-
"datetime_aware": "datetime",
|
|
117
|
-
"datetime_naive": "datetime",
|
|
118
|
-
"datetime_rfc822": "datetime",
|
|
119
|
-
"date": "date",
|
|
120
|
-
"latitude_l93": "float",
|
|
121
|
-
"latitude_wgs": "float",
|
|
122
|
-
"latitude_wgs_fr_metropole": "float",
|
|
123
|
-
"longitude_l93": "float",
|
|
124
|
-
"longitude_wgs": "float",
|
|
125
|
-
"longitude_wgs_fr_metropole": "float",
|
|
126
|
-
"binary": "binary",
|
|
127
|
-
}
|
|
128
|
-
|
|
129
94
|
if not limited_output:
|
|
130
95
|
for detection_method in ["columns_fields", "columns_labels", "columns"]:
|
|
131
96
|
analysis[detection_method] = {
|
|
132
97
|
col_name: [
|
|
133
98
|
{
|
|
134
|
-
"python_type":
|
|
99
|
+
"python_type": (
|
|
100
|
+
"string"
|
|
101
|
+
if detection["format"] == "string"
|
|
102
|
+
else fmtm.formats[detection["format"]].python_type
|
|
103
|
+
),
|
|
135
104
|
**detection,
|
|
136
105
|
}
|
|
137
106
|
for detection in detections
|
|
@@ -142,7 +111,11 @@ def detect_formats(
|
|
|
142
111
|
for detection_method in ["columns_fields", "columns_labels", "columns"]:
|
|
143
112
|
analysis[detection_method] = {
|
|
144
113
|
col_name: {
|
|
145
|
-
"python_type":
|
|
114
|
+
"python_type": (
|
|
115
|
+
"string"
|
|
116
|
+
if detection["format"] == "string"
|
|
117
|
+
else fmtm.formats[detection["format"]].python_type
|
|
118
|
+
),
|
|
146
119
|
**detection,
|
|
147
120
|
}
|
|
148
121
|
for col_name, detection in analysis[detection_method].items()
|
csv_detective/explore_csv.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from time import time
|
|
3
|
+
from typing import Iterator
|
|
3
4
|
|
|
4
5
|
import pandas as pd
|
|
5
6
|
|
|
@@ -27,7 +28,7 @@ def routine(
|
|
|
27
28
|
cast_json: bool = True,
|
|
28
29
|
verbose: bool = False,
|
|
29
30
|
sheet_name: str | int | None = None,
|
|
30
|
-
) -> dict | tuple[dict, pd.DataFrame]:
|
|
31
|
+
) -> dict | tuple[dict, Iterator[pd.DataFrame]]:
|
|
31
32
|
"""
|
|
32
33
|
Returns a dict with information about the table and possible column contents, and if requested the DataFrame with columns cast according to analysis.
|
|
33
34
|
|
|
@@ -115,7 +116,7 @@ def validate_then_detect(
|
|
|
115
116
|
output_df: bool = False,
|
|
116
117
|
cast_json: bool = True,
|
|
117
118
|
verbose: bool = False,
|
|
118
|
-
):
|
|
119
|
+
) -> dict | tuple[dict, Iterator[pd.DataFrame]]:
|
|
119
120
|
"""
|
|
120
121
|
Performs a validation of the given file against the given analysis.
|
|
121
122
|
If the validation fails, performs a full analysis and return it.
|
csv_detective/format.py
CHANGED
|
@@ -9,9 +9,11 @@ class Format:
|
|
|
9
9
|
name: str,
|
|
10
10
|
func: Callable[[Any], bool],
|
|
11
11
|
_test_values: dict[bool, list[str]],
|
|
12
|
-
labels:
|
|
12
|
+
labels: dict[str, float] = {},
|
|
13
13
|
proportion: float = 1,
|
|
14
14
|
tags: list[str] = [],
|
|
15
|
+
mandatory_label: bool = False,
|
|
16
|
+
python_type: str = "string",
|
|
15
17
|
) -> None:
|
|
16
18
|
"""
|
|
17
19
|
Instanciates a Format object.
|
|
@@ -20,16 +22,18 @@ class Format:
|
|
|
20
22
|
name: the name of the format.
|
|
21
23
|
func: the value test for the format (returns whether a string is valid).
|
|
22
24
|
_test_values: lists of valid and invalid values, used in the tests
|
|
23
|
-
labels: the
|
|
25
|
+
labels: the dict of hint headers and their credibilty for the header score (NB: credibility is relative witin a single format, should be used to rank the valid labels)
|
|
24
26
|
proportion: the tolerance (between 0 and 1) to say a column is valid for a format. (1 => 100% of the column has to pass the func check for the column to be considered valid)
|
|
25
27
|
tags: to allow users to submit a file to only a subset of formats
|
|
26
28
|
"""
|
|
27
29
|
self.name: str = name
|
|
28
30
|
self.func: Callable = func
|
|
29
31
|
self._test_values: dict[bool, list[str]] = _test_values
|
|
30
|
-
self.labels:
|
|
32
|
+
self.labels: dict[str, float] = labels
|
|
31
33
|
self.proportion: float = proportion
|
|
32
34
|
self.tags: list[str] = tags
|
|
35
|
+
self.mandatory_label: bool = mandatory_label
|
|
36
|
+
self.python_type: str = python_type
|
|
33
37
|
|
|
34
38
|
def is_valid_label(self, val: str) -> float:
|
|
35
39
|
return header_score(val, self.labels)
|
|
@@ -49,7 +53,7 @@ class FormatsManager:
|
|
|
49
53
|
_test_values=module._test_values,
|
|
50
54
|
**{
|
|
51
55
|
attr: val
|
|
52
|
-
for attr in ["labels", "proportion", "tags"]
|
|
56
|
+
for attr in ["labels", "proportion", "tags", "mandatory_label", "python_type"]
|
|
53
57
|
if (val := getattr(module, attr, None))
|
|
54
58
|
},
|
|
55
59
|
)
|
|
@@ -63,5 +67,8 @@ class FormatsManager:
|
|
|
63
67
|
if all(tag in fmt.tags for tag in tags)
|
|
64
68
|
}
|
|
65
69
|
|
|
70
|
+
def get_formats_with_mandatory_label(self) -> dict[str, Format]:
|
|
71
|
+
return {label: fmt for label, fmt in self.formats.items() if fmt.mandatory_label}
|
|
72
|
+
|
|
66
73
|
def available_tags(self) -> set[str]:
|
|
67
74
|
return set(tag for format in self.formats.values() for tag in format.tags)
|
csv_detective/formats/adresse.py
CHANGED
|
@@ -2,15 +2,15 @@ from csv_detective.parsing.text import _process_text
|
|
|
2
2
|
|
|
3
3
|
proportion = 0.55
|
|
4
4
|
tags = ["fr", "geo"]
|
|
5
|
-
labels =
|
|
6
|
-
"adresse",
|
|
7
|
-
"localisation",
|
|
8
|
-
"adresse postale",
|
|
9
|
-
"adresse geographique",
|
|
10
|
-
"adr",
|
|
11
|
-
"adresse complete",
|
|
12
|
-
"adresse station",
|
|
13
|
-
|
|
5
|
+
labels = {
|
|
6
|
+
"adresse": 1,
|
|
7
|
+
"localisation": 1,
|
|
8
|
+
"adresse postale": 1,
|
|
9
|
+
"adresse geographique": 1,
|
|
10
|
+
"adr": 0.5,
|
|
11
|
+
"adresse complete": 1,
|
|
12
|
+
"adresse station": 1,
|
|
13
|
+
}
|
|
14
14
|
|
|
15
15
|
voies = {
|
|
16
16
|
"aire ",
|
csv_detective/formats/binary.py
CHANGED
csv_detective/formats/booleen.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
proportion = 1
|
|
2
2
|
tags = ["type"]
|
|
3
|
-
|
|
3
|
+
python_type = "bool"
|
|
4
|
+
labels = {"is ": 1, "has ": 1, "est ": 1}
|
|
4
5
|
|
|
5
6
|
bool_mapping = {
|
|
6
7
|
"1": True,
|
|
@@ -21,7 +22,7 @@ bool_mapping = {
|
|
|
21
22
|
liste_bool = set(bool_mapping.keys())
|
|
22
23
|
|
|
23
24
|
|
|
24
|
-
def bool_casting(val: str) -> bool:
|
|
25
|
+
def bool_casting(val: str) -> bool | None:
|
|
25
26
|
return bool_mapping.get(val.lower())
|
|
26
27
|
|
|
27
28
|
|
|
@@ -2,16 +2,18 @@ from frformat import CodeCommuneInsee, Millesime
|
|
|
2
2
|
|
|
3
3
|
proportion = 0.75
|
|
4
4
|
tags = ["fr", "geo"]
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
"code insee",
|
|
8
|
-
"
|
|
9
|
-
"
|
|
10
|
-
"code
|
|
11
|
-
"insee",
|
|
12
|
-
"
|
|
13
|
-
"com",
|
|
14
|
-
|
|
5
|
+
mandatory_label = True
|
|
6
|
+
labels = {
|
|
7
|
+
"code commune insee": 1,
|
|
8
|
+
"code insee": 1,
|
|
9
|
+
"codes insee": 1,
|
|
10
|
+
"code commune": 1,
|
|
11
|
+
"code insee commune": 1,
|
|
12
|
+
"insee": 0.75,
|
|
13
|
+
"code com": 1,
|
|
14
|
+
"com": 0.5,
|
|
15
|
+
"code": 0.5,
|
|
16
|
+
}
|
|
15
17
|
|
|
16
18
|
_code_commune_insee = CodeCommuneInsee(Millesime.LATEST)
|
|
17
19
|
|
|
@@ -2,13 +2,14 @@ from frformat import Millesime, NumeroDepartement, Options
|
|
|
2
2
|
|
|
3
3
|
proportion = 1
|
|
4
4
|
tags = ["fr", "geo"]
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
"
|
|
8
|
-
"
|
|
9
|
-
"
|
|
10
|
-
"
|
|
11
|
-
|
|
5
|
+
mandatory_label = True
|
|
6
|
+
labels = {
|
|
7
|
+
"code departement": 1,
|
|
8
|
+
"code_departement": 1,
|
|
9
|
+
"dep": 0.5,
|
|
10
|
+
"departement": 1,
|
|
11
|
+
"dept": 0.75,
|
|
12
|
+
}
|
|
12
13
|
|
|
13
14
|
_options = Options(
|
|
14
15
|
ignore_case=True,
|
|
@@ -2,11 +2,12 @@ from frformat import CodeFantoir
|
|
|
2
2
|
|
|
3
3
|
proportion = 1
|
|
4
4
|
tags = ["fr", "geo"]
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
"
|
|
8
|
-
"fantoir",
|
|
9
|
-
|
|
5
|
+
mandatory_label = True
|
|
6
|
+
labels = {
|
|
7
|
+
"cadastre1": 1,
|
|
8
|
+
"code fantoir": 1,
|
|
9
|
+
"fantoir": 1,
|
|
10
|
+
}
|
|
10
11
|
|
|
11
12
|
_code_fantoir = CodeFantoir()
|
|
12
13
|
|
|
@@ -2,15 +2,16 @@ from frformat import CodePostal
|
|
|
2
2
|
|
|
3
3
|
proportion = 0.9
|
|
4
4
|
tags = ["fr", "geo"]
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
"postal
|
|
8
|
-
"
|
|
9
|
-
"
|
|
10
|
-
"
|
|
11
|
-
"
|
|
12
|
-
"
|
|
13
|
-
|
|
5
|
+
mandatory_label = True
|
|
6
|
+
labels = {
|
|
7
|
+
"code postal": 1,
|
|
8
|
+
"postal code": 1,
|
|
9
|
+
"postcode": 1,
|
|
10
|
+
"post code": 1,
|
|
11
|
+
"cp": 0.5,
|
|
12
|
+
"codes postaux": 1,
|
|
13
|
+
"location postcode": 1,
|
|
14
|
+
}
|
|
14
15
|
|
|
15
16
|
_code_postal = CodePostal()
|
|
16
17
|
|
|
@@ -2,12 +2,13 @@ from frformat import CodeRegion, Millesime
|
|
|
2
2
|
|
|
3
3
|
proportion = 1
|
|
4
4
|
tags = ["fr", "geo"]
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
"
|
|
8
|
-
"
|
|
9
|
-
"region",
|
|
10
|
-
|
|
5
|
+
mandatory_label = True
|
|
6
|
+
labels = {
|
|
7
|
+
"code region": 1,
|
|
8
|
+
"reg": 0.5,
|
|
9
|
+
"code insee region": 1,
|
|
10
|
+
"region": 1,
|
|
11
|
+
}
|
|
11
12
|
|
|
12
13
|
_code_region = CodeRegion(Millesime.LATEST)
|
|
13
14
|
|
|
@@ -2,12 +2,13 @@ from frformat import CodeRNA
|
|
|
2
2
|
|
|
3
3
|
proportion = 0.9
|
|
4
4
|
tags = ["fr"]
|
|
5
|
-
labels =
|
|
6
|
-
"code rna",
|
|
7
|
-
"rna",
|
|
8
|
-
"n° inscription association",
|
|
9
|
-
"identifiant association",
|
|
10
|
-
|
|
5
|
+
labels = {
|
|
6
|
+
"code rna": 1,
|
|
7
|
+
"rna": 1,
|
|
8
|
+
"n° inscription association": 1,
|
|
9
|
+
"identifiant association": 1,
|
|
10
|
+
"asso": 0.75,
|
|
11
|
+
}
|
|
11
12
|
|
|
12
13
|
_code_rna = CodeRNA()
|
|
13
14
|
|
csv_detective/formats/commune.py
CHANGED
|
@@ -2,11 +2,11 @@ from frformat import Commune, Millesime, Options
|
|
|
2
2
|
|
|
3
3
|
proportion = 0.8
|
|
4
4
|
tags = ["fr", "geo"]
|
|
5
|
-
labels =
|
|
6
|
-
"commune",
|
|
7
|
-
"ville",
|
|
8
|
-
"libelle commune",
|
|
9
|
-
|
|
5
|
+
labels = {
|
|
6
|
+
"commune": 1,
|
|
7
|
+
"ville": 1,
|
|
8
|
+
"libelle commune": 1,
|
|
9
|
+
}
|
|
10
10
|
|
|
11
11
|
_options = Options(
|
|
12
12
|
ignore_case=True,
|
|
@@ -4,11 +4,12 @@ from csv_detective.parsing.text import _process_text
|
|
|
4
4
|
|
|
5
5
|
proportion = 1
|
|
6
6
|
tags = ["fr"]
|
|
7
|
-
labels =
|
|
8
|
-
"csp insee",
|
|
9
|
-
"csp",
|
|
10
|
-
"categorie socioprofessionnelle",
|
|
11
|
-
|
|
7
|
+
labels = {
|
|
8
|
+
"csp insee": 1,
|
|
9
|
+
"csp": 0.75,
|
|
10
|
+
"categorie socioprofessionnelle": 1,
|
|
11
|
+
"sociopro": 1,
|
|
12
|
+
}
|
|
12
13
|
|
|
13
14
|
f = open(join(dirname(__file__), "data", "csp_insee.txt"), "r")
|
|
14
15
|
codes_insee = f.read().split("\n")
|