csv-detective 0.7.5.dev1286__py3-none-any.whl → 0.7.5.dev1298__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/detect_fields/__init__.py +6 -4
- csv_detective/detect_fields/geo/latlon_wgs/__init__.py +7 -7
- csv_detective/detect_fields/other/money/__init__.py +11 -0
- csv_detective/detect_fields/other/percent/__init__.py +9 -0
- csv_detective/output/example.py +77 -77
- {csv_detective-0.7.5.dev1286.data → csv_detective-0.7.5.dev1298.data}/data/share/csv_detective/CHANGELOG.md +1 -0
- {csv_detective-0.7.5.dev1286.dist-info → csv_detective-0.7.5.dev1298.dist-info}/METADATA +1 -1
- {csv_detective-0.7.5.dev1286.dist-info → csv_detective-0.7.5.dev1298.dist-info}/RECORD +16 -14
- {csv_detective-0.7.5.dev1286.dist-info → csv_detective-0.7.5.dev1298.dist-info}/WHEEL +1 -1
- tests/test_example.py +10 -10
- tests/test_fields.py +269 -414
- {csv_detective-0.7.5.dev1286.data → csv_detective-0.7.5.dev1298.data}/data/share/csv_detective/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev1286.data → csv_detective-0.7.5.dev1298.data}/data/share/csv_detective/README.md +0 -0
- {csv_detective-0.7.5.dev1286.dist-info → csv_detective-0.7.5.dev1298.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.7.5.dev1286.dist-info → csv_detective-0.7.5.dev1298.dist-info}/licenses/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev1286.dist-info → csv_detective-0.7.5.dev1298.dist-info}/top_level.txt +0 -0
|
@@ -10,19 +10,21 @@ from .FR.other import (
|
|
|
10
10
|
insee_ape700,
|
|
11
11
|
date_fr,
|
|
12
12
|
code_waldec,
|
|
13
|
-
code_rna
|
|
13
|
+
code_rna,
|
|
14
14
|
)
|
|
15
15
|
|
|
16
16
|
from .other import (
|
|
17
17
|
email,
|
|
18
18
|
url,
|
|
19
19
|
booleen,
|
|
20
|
+
money,
|
|
20
21
|
mongo_object_id,
|
|
22
|
+
percent,
|
|
21
23
|
twitter,
|
|
22
24
|
float,
|
|
23
25
|
int,
|
|
24
26
|
uuid,
|
|
25
|
-
json
|
|
27
|
+
json,
|
|
26
28
|
)
|
|
27
29
|
|
|
28
30
|
from .FR.geo import (
|
|
@@ -40,7 +42,7 @@ from .FR.geo import (
|
|
|
40
42
|
code_region,
|
|
41
43
|
latitude_l93,
|
|
42
44
|
longitude_l93,
|
|
43
|
-
insee_canton
|
|
45
|
+
insee_canton,
|
|
44
46
|
)
|
|
45
47
|
|
|
46
48
|
from .geo import (
|
|
@@ -50,7 +52,7 @@ from .geo import (
|
|
|
50
52
|
latitude_wgs,
|
|
51
53
|
longitude_wgs,
|
|
52
54
|
latlon_wgs,
|
|
53
|
-
json_geojson
|
|
55
|
+
json_geojson,
|
|
54
56
|
)
|
|
55
57
|
|
|
56
58
|
from .FR.temp import jour_de_la_semaine, mois_de_annee
|
|
@@ -1,13 +1,13 @@
|
|
|
1
|
-
import
|
|
1
|
+
from ..latitude_wgs import _is as is_lat
|
|
2
|
+
from ..longitude_wgs import _is as is_lon
|
|
2
3
|
|
|
3
|
-
PROPORTION =
|
|
4
|
+
PROPORTION = 1
|
|
4
5
|
|
|
5
6
|
|
|
6
7
|
def _is(val):
|
|
7
8
|
'''Renvoie True si val peut etre une latitude,longitude'''
|
|
8
9
|
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
)
|
|
10
|
+
if not isinstance(val, str) or val.count(",") != 1:
|
|
11
|
+
return False
|
|
12
|
+
lat, lon = val.split(",")
|
|
13
|
+
return is_lat(lat) and is_lon(lon.replace(" ", ""))
|
csv_detective/output/example.py
CHANGED
|
@@ -17,12 +17,12 @@ def create_example_csv_file(
|
|
|
17
17
|
fields: Optional[dict] = None,
|
|
18
18
|
schema_path: Optional[str] = None,
|
|
19
19
|
file_length: int = 10,
|
|
20
|
-
output_name: str =
|
|
21
|
-
output_sep: str =
|
|
22
|
-
encoding: str =
|
|
20
|
+
output_name: Optional[str] = "example_file.csv",
|
|
21
|
+
output_sep: str = ";",
|
|
22
|
+
encoding: str = "utf-8",
|
|
23
23
|
ignore_required: bool = False,
|
|
24
24
|
) -> pd.DataFrame:
|
|
25
|
-
|
|
25
|
+
"""
|
|
26
26
|
Create an example file based on a list of dicts like follows:
|
|
27
27
|
fields = [
|
|
28
28
|
{
|
|
@@ -33,7 +33,7 @@ def create_example_csv_file(
|
|
|
33
33
|
...
|
|
34
34
|
]
|
|
35
35
|
Or from a TableSchema
|
|
36
|
-
|
|
36
|
+
"""
|
|
37
37
|
# need to make a CLI command
|
|
38
38
|
|
|
39
39
|
if not (fields or schema_path):
|
|
@@ -53,65 +53,65 @@ def create_example_csv_file(
|
|
|
53
53
|
enum: Optional[str] = None,
|
|
54
54
|
) -> str:
|
|
55
55
|
if potential_skip(required):
|
|
56
|
-
return
|
|
56
|
+
return ""
|
|
57
57
|
if pattern is not None:
|
|
58
58
|
return rstr.xeger(pattern)
|
|
59
59
|
elif enum is not None:
|
|
60
60
|
return random.choice(enum)
|
|
61
61
|
else:
|
|
62
62
|
letters = string.ascii_lowercase
|
|
63
|
-
return
|
|
63
|
+
return "".join(random.choice(letters) for i in range(length))
|
|
64
64
|
|
|
65
65
|
def _id(
|
|
66
66
|
required: bool = True,
|
|
67
67
|
) -> str:
|
|
68
68
|
if potential_skip(required):
|
|
69
|
-
return
|
|
69
|
+
return ""
|
|
70
70
|
return str(uuid.uuid4())
|
|
71
71
|
|
|
72
72
|
def _date(
|
|
73
73
|
date_range: Union[None, list[str]] = None,
|
|
74
|
-
format: str =
|
|
74
|
+
format: str = "%Y-%m-%d",
|
|
75
75
|
required: bool = True,
|
|
76
76
|
) -> str:
|
|
77
77
|
# the bounds specified in date_range are expected in the same format as the desired output format
|
|
78
|
-
assert all([k in format for k in [
|
|
78
|
+
assert all([k in format for k in ["%d", "%m", "%Y"]])
|
|
79
79
|
if potential_skip(required):
|
|
80
|
-
return
|
|
80
|
+
return ""
|
|
81
81
|
if date_range is None:
|
|
82
82
|
return fake.date(format)
|
|
83
83
|
else:
|
|
84
84
|
if len(date_range) != 2:
|
|
85
|
-
raise ValueError('
|
|
85
|
+
raise ValueError("'date_range' must have exactly two elements.")
|
|
86
86
|
return fake.date_between_dates(
|
|
87
87
|
datetime.strptime(date_range[0], format),
|
|
88
88
|
datetime.strptime(date_range[1], format),
|
|
89
89
|
).strftime(format)
|
|
90
90
|
|
|
91
91
|
def _time(
|
|
92
|
-
format: str =
|
|
92
|
+
format: str = "%H:%M:%S",
|
|
93
93
|
required: bool = True,
|
|
94
94
|
) -> str:
|
|
95
|
-
assert all([k in format for k in [
|
|
95
|
+
assert all([k in format for k in ["%H", "%M", "%S"]])
|
|
96
96
|
if potential_skip(required):
|
|
97
|
-
return
|
|
97
|
+
return ""
|
|
98
98
|
# maybe add a time_range argument?
|
|
99
99
|
return fake.time(format)
|
|
100
100
|
|
|
101
101
|
def _datetime(
|
|
102
102
|
datetime_range: Optional[list[str]] = None,
|
|
103
|
-
format: str =
|
|
103
|
+
format: str = "%Y-%m-%d %H-%M-%S",
|
|
104
104
|
required: bool = True,
|
|
105
105
|
) -> str:
|
|
106
106
|
# the bounds specified in datetime_range are expected in the same format as the desired output format
|
|
107
|
-
assert all([k in format for k in [
|
|
107
|
+
assert all([k in format for k in ["%d", "%m", "%Y", "%H", "%M", "%S"]])
|
|
108
108
|
if potential_skip(required):
|
|
109
|
-
return
|
|
109
|
+
return ""
|
|
110
110
|
if datetime_range is None:
|
|
111
111
|
return fake.date_time().strftime(format)
|
|
112
112
|
else:
|
|
113
113
|
if len(datetime_range) != 2:
|
|
114
|
-
raise ValueError('
|
|
114
|
+
raise ValueError("'date_range' must have exactly two elements.")
|
|
115
115
|
return fake.date_time_between(
|
|
116
116
|
datetime.strptime(datetime_range[0], format),
|
|
117
117
|
datetime.strptime(datetime_range[1], format),
|
|
@@ -119,8 +119,8 @@ def create_example_csv_file(
|
|
|
119
119
|
|
|
120
120
|
def _url(required: bool = True) -> str:
|
|
121
121
|
if potential_skip(required):
|
|
122
|
-
return
|
|
123
|
-
return f
|
|
122
|
+
return ""
|
|
123
|
+
return f"http://{rstr.domainsafe()}.{rstr.letters(3)}/{rstr.urlsafe()}"
|
|
124
124
|
|
|
125
125
|
def _number(
|
|
126
126
|
num_type: Type[Union[int, float]] = int,
|
|
@@ -130,7 +130,7 @@ def create_example_csv_file(
|
|
|
130
130
|
) -> Union[int, float]:
|
|
131
131
|
assert num_range is None or len(num_range) == 2
|
|
132
132
|
if potential_skip(required):
|
|
133
|
-
return
|
|
133
|
+
return ""
|
|
134
134
|
if enum:
|
|
135
135
|
return random.choice(enum)
|
|
136
136
|
if num_range is None:
|
|
@@ -142,100 +142,100 @@ def create_example_csv_file(
|
|
|
142
142
|
|
|
143
143
|
def _bool(required: bool = True) -> bool:
|
|
144
144
|
if potential_skip(required):
|
|
145
|
-
return
|
|
145
|
+
return ""
|
|
146
146
|
return random.randint(0, 1) == 0
|
|
147
147
|
|
|
148
148
|
def _array(enum: list[Any], required: bool = True) -> str:
|
|
149
149
|
if potential_skip(required):
|
|
150
|
-
return
|
|
150
|
+
return ""
|
|
151
151
|
return f"[{','.join(random.sample(enum, random.randint(1, len(enum))))}]"
|
|
152
152
|
|
|
153
153
|
def build_args_from_constraints(constraints: dict) -> dict:
|
|
154
154
|
args = {}
|
|
155
|
-
args[
|
|
156
|
-
for _ in [
|
|
155
|
+
args["required"] = constraints.get("required", False)
|
|
156
|
+
for _ in ["pattern", "enum", "format"]:
|
|
157
157
|
if _ in constraints:
|
|
158
158
|
args[_] = constraints[_]
|
|
159
|
-
if
|
|
160
|
-
args[
|
|
159
|
+
if "minimum" in constraints and "maximum" in constraints:
|
|
160
|
+
args["num_range"] = [constraints["minimum"], constraints["maximum"]]
|
|
161
161
|
# maybe there are better values than these?
|
|
162
|
-
elif
|
|
163
|
-
args[
|
|
164
|
-
elif
|
|
165
|
-
args[
|
|
166
|
-
if
|
|
167
|
-
args[
|
|
168
|
-
if
|
|
169
|
-
args[
|
|
162
|
+
elif "minimum" in constraints:
|
|
163
|
+
args["num_range"] = [constraints["minimum"], 10 + constraints["minimum"]]
|
|
164
|
+
elif "maximum" in constraints:
|
|
165
|
+
args["num_range"] = [constraints["maximum"] - 10, constraints["maximum"]]
|
|
166
|
+
if "minLength" in constraints:
|
|
167
|
+
args["length"] = constraints["minLength"]
|
|
168
|
+
if "maxLength" in constraints:
|
|
169
|
+
args["length"] = constraints["maxLength"]
|
|
170
170
|
return args
|
|
171
171
|
|
|
172
172
|
schema_types_to_python = {
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
173
|
+
"number": "float",
|
|
174
|
+
"integer": "int",
|
|
175
|
+
"string": "str",
|
|
176
|
+
"year": "year",
|
|
177
|
+
"boolean": "bool",
|
|
178
|
+
"date": "date",
|
|
179
|
+
"yearmonth": "date",
|
|
180
|
+
"time": "time",
|
|
181
|
+
"datetime": "datetime",
|
|
182
|
+
"array": "array"
|
|
183
183
|
}
|
|
184
184
|
|
|
185
185
|
if schema_path:
|
|
186
|
-
if schema_path.startswith(
|
|
186
|
+
if schema_path.startswith("http"):
|
|
187
187
|
schema = requests.get(schema_path).json()
|
|
188
188
|
else:
|
|
189
189
|
with open(schema_path, encoding=encoding) as jsonfile:
|
|
190
190
|
schema = json.load(jsonfile)
|
|
191
|
-
if not (
|
|
192
|
-
raise ValueError(
|
|
191
|
+
if not ("fields" in schema.keys()):
|
|
192
|
+
raise ValueError("The schema must have a 'fields' key.")
|
|
193
193
|
else:
|
|
194
194
|
fields = [
|
|
195
195
|
{
|
|
196
|
-
|
|
197
|
-
|
|
196
|
+
"name": f["name"],
|
|
197
|
+
"type": schema_types_to_python.get(f["type"], "str"),
|
|
198
198
|
# when frformat is supported in TableSchema, we can build args for French standards
|
|
199
199
|
# linked to https://github.com/datagouv/fr-format/issues/26
|
|
200
|
-
|
|
201
|
-
build_args_from_constraints(f[
|
|
202
|
-
else build_args_from_constraints(f[
|
|
203
|
-
if
|
|
200
|
+
"args": (
|
|
201
|
+
build_args_from_constraints(f["constraints"]) if "constraints" in f.keys()
|
|
202
|
+
else build_args_from_constraints(f["arrayItem"]["constraints"])
|
|
203
|
+
if "arrayItem" in f.keys() and "constraints" in f["arrayItem"].keys()
|
|
204
204
|
else {}
|
|
205
205
|
)
|
|
206
|
-
} for f in schema[
|
|
206
|
+
} for f in schema["fields"]
|
|
207
207
|
]
|
|
208
208
|
|
|
209
209
|
for k in range(len(fields)):
|
|
210
|
-
if
|
|
211
|
-
fields[k][
|
|
212
|
-
if fields[k][
|
|
213
|
-
fields[k][
|
|
214
|
-
elif fields[k][
|
|
215
|
-
fields[k][
|
|
216
|
-
elif fields[k][
|
|
217
|
-
fields[k][
|
|
218
|
-
fields[k][
|
|
210
|
+
if "args" not in fields[k]:
|
|
211
|
+
fields[k]["args"] = {}
|
|
212
|
+
if fields[k]["type"] == "float":
|
|
213
|
+
fields[k]["args"]["num_type"] = float
|
|
214
|
+
elif fields[k]["type"] == "int":
|
|
215
|
+
fields[k]["args"]["num_type"] = int
|
|
216
|
+
elif fields[k]["type"] == "year":
|
|
217
|
+
fields[k]["args"]["num_type"] = int
|
|
218
|
+
fields[k]["args"]["num_range"] = [1990, 2050]
|
|
219
219
|
|
|
220
220
|
types_to_func = {
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
221
|
+
"int": _number,
|
|
222
|
+
"float": _number,
|
|
223
|
+
"date": _date,
|
|
224
|
+
"time": _time,
|
|
225
|
+
"str": _string,
|
|
226
|
+
"url": _url,
|
|
227
|
+
"id": _id,
|
|
228
|
+
"year": _number,
|
|
229
|
+
"bool": _bool,
|
|
230
|
+
"datetime": _datetime,
|
|
231
|
+
"array": _array,
|
|
232
232
|
}
|
|
233
233
|
|
|
234
234
|
# would it be better to create by column or by row (as for now)?
|
|
235
235
|
output = pd.DataFrame(
|
|
236
236
|
[
|
|
237
237
|
[
|
|
238
|
-
types_to_func.get(f[
|
|
238
|
+
types_to_func.get(f["type"], "str")(**f["args"])
|
|
239
239
|
for f in fields
|
|
240
240
|
] for _ in range(file_length)
|
|
241
241
|
],
|
|
@@ -15,6 +15,7 @@
|
|
|
15
15
|
- Restructure repo (breaking changes) [#111](https://github.com/datagouv/csv-detective/pull/111)
|
|
16
16
|
- Add validation function and associated flow [#112](https://github.com/datagouv/csv-detective/pull/112)
|
|
17
17
|
- Better float detection [#113](https://github.com/datagouv/csv-detective/pull/113)
|
|
18
|
+
- Refactor fields tests [#114](https://github.com/datagouv/csv-detective/pull/114)
|
|
18
19
|
|
|
19
20
|
## 0.7.4 (2024-11-15)
|
|
20
21
|
|
|
@@ -5,7 +5,7 @@ csv_detective/load_tests.py,sha256=GILvfkd4OVI-72mA4nzbPlZqgcXZ4wznOhGfZ1ucWkM,2
|
|
|
5
5
|
csv_detective/s3_utils.py,sha256=1cIVdQUYY2ovErbMwp72Gqtqx2bkB8nfVhn-QaOFTT0,1451
|
|
6
6
|
csv_detective/utils.py,sha256=Bx_1k4Sdpd5PCjuAy4AeayCmmw7TMR_zgtKIHNLi5g0,1157
|
|
7
7
|
csv_detective/validate.py,sha256=o4Qulf8E-x1zsWT9OD4Fpw83Gku1WA3JlX83j7bu0DA,2314
|
|
8
|
-
csv_detective/detect_fields/__init__.py,sha256=
|
|
8
|
+
csv_detective/detect_fields/__init__.py,sha256=qkwT_o_S7qvLEsRssICpoGmCc3h5y2MVy1XI56LFcV0,959
|
|
9
9
|
csv_detective/detect_fields/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
10
|
csv_detective/detect_fields/FR/geo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
11
|
csv_detective/detect_fields/FR/geo/adresse/__init__.py,sha256=NqV8ULf9gY9iFnA1deKR-1Yobr96WwCsn5JfbP_MjiY,1675
|
|
@@ -50,7 +50,7 @@ csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py,sha256=wJAy
|
|
|
50
50
|
csv_detective/detect_fields/geo/iso_country_code_numeric/iso_country_code_numeric.txt,sha256=2GtEhuporsHYV-pU4q9kfXU5iOtfW5C0GYBTTKQtnnA,1004
|
|
51
51
|
csv_detective/detect_fields/geo/json_geojson/__init__.py,sha256=FPHOfTrfXJs62-NgeOcNGOvwPd7I1fEVp8lTdMNfj3w,433
|
|
52
52
|
csv_detective/detect_fields/geo/latitude_wgs/__init__.py,sha256=ArS6PuYEd0atZwSqNDZhXZz1TwzdiwdV8ovRYTOacpg,327
|
|
53
|
-
csv_detective/detect_fields/geo/latlon_wgs/__init__.py,sha256=
|
|
53
|
+
csv_detective/detect_fields/geo/latlon_wgs/__init__.py,sha256=7_mnO9uC_kI7e2WR8xIer7Kqw8zi-v-JKaAD4zcoGbE,342
|
|
54
54
|
csv_detective/detect_fields/geo/longitude_wgs/__init__.py,sha256=G7afWOKiGh_Tv7gwDNGt1a4B_A8hkCBkIxn3THDCUFk,330
|
|
55
55
|
csv_detective/detect_fields/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
56
56
|
csv_detective/detect_fields/other/booleen/__init__.py,sha256=wn_yyTAmGxqo0l0b7JRpGb0da_E27iGxES9zWCrnsqc,497
|
|
@@ -58,7 +58,9 @@ csv_detective/detect_fields/other/email/__init__.py,sha256=O9tgJmq0O8Q-8iin63NqE
|
|
|
58
58
|
csv_detective/detect_fields/other/float/__init__.py,sha256=AT4Kpgwoz5PuAoLx00u0SL8DjjXZxsE8zSRbN18uAv4,578
|
|
59
59
|
csv_detective/detect_fields/other/int/__init__.py,sha256=QN3kQJLYqLRBiubUK7g4Xq03PlA5wqVwx2pPPIO9FdI,320
|
|
60
60
|
csv_detective/detect_fields/other/json/__init__.py,sha256=DhzyvT12kOqgum89silIu3uoSYXmC_s_AaxLtXAD4eU,540
|
|
61
|
+
csv_detective/detect_fields/other/money/__init__.py,sha256=g_ZwBZXl9LhldwFYQotC5WqLiE8qQCZHtoI9eJvl_9M,232
|
|
61
62
|
csv_detective/detect_fields/other/mongo_object_id/__init__.py,sha256=7fcrHsOZAqXp2_N0IjPskYJ_qi4xRlo9iyNNDQVLzsU,156
|
|
63
|
+
csv_detective/detect_fields/other/percent/__init__.py,sha256=vgpekNOPBRuunoVBXMi81rwHv4uSOhe78pbVtQ5SBO8,177
|
|
62
64
|
csv_detective/detect_fields/other/twitter/__init__.py,sha256=qbwLKsTBRFQ4PyTNVeEZ5Hkf5Wwi3ZKclLER_V0YO3g,154
|
|
63
65
|
csv_detective/detect_fields/other/url/__init__.py,sha256=9WaTqCglEsw_lJG_xZsBMdxJXg2yuQ92_fkX6CXWNV0,286
|
|
64
66
|
csv_detective/detect_fields/other/uuid/__init__.py,sha256=3-z0fDax29SJc57zPjNGR6DPICJu6gfuNGC5L3jh4d0,223
|
|
@@ -135,7 +137,7 @@ csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_p
|
|
|
135
137
|
csv_detective/detection/variables.py,sha256=3qEMtjZ_zyIFXvTnFgK7ZMDx8C12uQXKfFjEj2moyJc,3558
|
|
136
138
|
csv_detective/output/__init__.py,sha256=XDS4Dgvv6oloIao9JquHa0m1nnlQ_q2gHuEPGlaETic,1890
|
|
137
139
|
csv_detective/output/dataframe.py,sha256=89iQRE59cHQyQQEsujQVIKP2YAUYpPklWkdDOqZE-wE,2183
|
|
138
|
-
csv_detective/output/example.py,sha256=
|
|
140
|
+
csv_detective/output/example.py,sha256=26rY7XNXK47e9xJMl-Js8jJwFIuv7V7B7e256VecKuk,8652
|
|
139
141
|
csv_detective/output/profile.py,sha256=B8YU541T_YPDezJGh4dkHckOShiwHSrZd9GS8jbmz7A,2919
|
|
140
142
|
csv_detective/output/schema.py,sha256=ZDBWDOD8IYp7rcB0_n8l9JXGIhOQ6bTZHFWfTmnNNEQ,13480
|
|
141
143
|
csv_detective/output/utils.py,sha256=HbmvCCCmFo7NJxhD_UsJIveuw-rrfhrvYckv1CJn_10,2301
|
|
@@ -145,19 +147,19 @@ csv_detective/parsing/csv.py,sha256=11mibDnJhIjykXLGZvA5ZEU5U7KgxIrbyO6BNv6jlro,
|
|
|
145
147
|
csv_detective/parsing/excel.py,sha256=AslE2S1e67o8yTIAIhp-lAnJ6-XqeBBRz1-VMFqhZBM,7055
|
|
146
148
|
csv_detective/parsing/load.py,sha256=SpP0pfxswOAPPpwbZfoP1blh0EKV5VMs0TpTgQJKzjs,3621
|
|
147
149
|
csv_detective/parsing/text.py,sha256=rsfk66BCmdpsCOd0kDJ8tmqMsEWd-OeBkEisWc4Ej9k,1246
|
|
148
|
-
csv_detective-0.7.5.
|
|
149
|
-
csv_detective-0.7.5.
|
|
150
|
-
csv_detective-0.7.5.
|
|
151
|
-
csv_detective-0.7.5.
|
|
150
|
+
csv_detective-0.7.5.dev1298.data/data/share/csv_detective/CHANGELOG.md,sha256=Y8aL18x5EGGvA9AqukEi4tn78se_Lzisa2J32kOSer8,7984
|
|
151
|
+
csv_detective-0.7.5.dev1298.data/data/share/csv_detective/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
|
|
152
|
+
csv_detective-0.7.5.dev1298.data/data/share/csv_detective/README.md,sha256=Qr8xRXc-dxQ-tdXCpCTCKp1Uliqq84r0UOlPRNuGCpI,9506
|
|
153
|
+
csv_detective-0.7.5.dev1298.dist-info/licenses/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
|
|
152
154
|
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
153
|
-
tests/test_example.py,sha256=
|
|
154
|
-
tests/test_fields.py,sha256=
|
|
155
|
+
tests/test_example.py,sha256=JeHxSK0IVDcSrOhSZlNGSQv4JAc_r6mzvJM8PfmLTMw,2018
|
|
156
|
+
tests/test_fields.py,sha256=0hce2XtDHY9dTLCYhrm2s4I41OeKsQbbaKmDZ4XctUw,9824
|
|
155
157
|
tests/test_file.py,sha256=9APE1d43lQ8Dk8lwJFNUK_YekYYsQ0ae2_fgpcPE9mk,8116
|
|
156
158
|
tests/test_labels.py,sha256=6MOKrGznkwU5fjZ_3oiB6Scmb480Eu-9geBJs0UDLds,159
|
|
157
159
|
tests/test_structure.py,sha256=bv-tjgXohvQAxwmxzH0BynFpK2TyPjcxvtIAmIRlZmA,1393
|
|
158
160
|
tests/test_validation.py,sha256=VwtBcnGAQ_eSFrBibWnMSTDjuy6y2JLlqvc3Zb667NY,479
|
|
159
|
-
csv_detective-0.7.5.
|
|
160
|
-
csv_detective-0.7.5.
|
|
161
|
-
csv_detective-0.7.5.
|
|
162
|
-
csv_detective-0.7.5.
|
|
163
|
-
csv_detective-0.7.5.
|
|
161
|
+
csv_detective-0.7.5.dev1298.dist-info/METADATA,sha256=cy8kKhsbQVd8DQ2UMJe7z1nyxoGEvmFnQfsTdCTwXXc,1386
|
|
162
|
+
csv_detective-0.7.5.dev1298.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
|
|
163
|
+
csv_detective-0.7.5.dev1298.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
|
|
164
|
+
csv_detective-0.7.5.dev1298.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
|
|
165
|
+
csv_detective-0.7.5.dev1298.dist-info/RECORD,,
|
tests/test_example.py
CHANGED
|
@@ -13,24 +13,24 @@ def test_example_creation():
|
|
|
13
13
|
{
|
|
14
14
|
"name": "nom_modele",
|
|
15
15
|
"type": "str",
|
|
16
|
-
"args": {
|
|
16
|
+
"args": {"length": 20},
|
|
17
17
|
},
|
|
18
18
|
{
|
|
19
19
|
"name": "siret",
|
|
20
20
|
"type": "str",
|
|
21
|
-
"args": {
|
|
21
|
+
"args": {"pattern": "^\\d{14}$"},
|
|
22
22
|
},
|
|
23
23
|
{
|
|
24
24
|
"name": "type_producteur",
|
|
25
25
|
"type": "str",
|
|
26
|
-
"args": {
|
|
26
|
+
"args": {"enum": ["privé", "public", "association"]},
|
|
27
27
|
},
|
|
28
28
|
{
|
|
29
29
|
"name": "date_creation",
|
|
30
30
|
"type": "date",
|
|
31
31
|
"args": {
|
|
32
|
-
|
|
33
|
-
|
|
32
|
+
"date_range": ["1996-02-13", "2000-01-28"],
|
|
33
|
+
"format": "%Y-%m-%d",
|
|
34
34
|
},
|
|
35
35
|
},
|
|
36
36
|
{
|
|
@@ -44,20 +44,20 @@ def test_example_creation():
|
|
|
44
44
|
{
|
|
45
45
|
"name": "note",
|
|
46
46
|
"type": "float",
|
|
47
|
-
"args": {
|
|
47
|
+
"args": {"num_range": [1, 20]}
|
|
48
48
|
},
|
|
49
49
|
]
|
|
50
50
|
df = create_example_csv_file(
|
|
51
51
|
fields=fields,
|
|
52
52
|
file_length=5,
|
|
53
|
-
output_name=
|
|
53
|
+
output_name=None,
|
|
54
54
|
)
|
|
55
55
|
assert len(df) == 5
|
|
56
56
|
assert all(UUID(_) for _ in df["id_unique"])
|
|
57
57
|
assert all(len(_) == 20 for _ in df["nom_modele"])
|
|
58
58
|
assert all(re.match("^\\d{14}$", _) for _ in df["siret"])
|
|
59
|
-
assert all(_ in [
|
|
60
|
-
assert all(_ >=
|
|
59
|
+
assert all(_ in ["privé", "public", "association"] for _ in df["type_producteur"])
|
|
60
|
+
assert all(_ >= "1996-02-13" and _ <= "2000-01-28" for _ in df["date_creation"])
|
|
61
61
|
assert all(_.startswith("http") for _ in df["url_produit"])
|
|
62
62
|
assert all(isinstance(_, int) for _ in df["nb_produits"])
|
|
63
63
|
assert all(_ >= 1 and _ <= 20 for _ in df["note"])
|
|
@@ -66,6 +66,6 @@ def test_example_creation():
|
|
|
66
66
|
def test_example_from_tableschema():
|
|
67
67
|
df = create_example_csv_file(
|
|
68
68
|
schema_path="https://schema.data.gouv.fr/schemas/etalab/schema-irve-statique/2.3.1/schema-statique.json",
|
|
69
|
-
output_name=
|
|
69
|
+
output_name=None,
|
|
70
70
|
)
|
|
71
71
|
assert len(df) == 10
|
tests/test_fields.py
CHANGED
|
@@ -15,7 +15,9 @@ from csv_detective.detect_fields.FR.geo import (
|
|
|
15
15
|
departement,
|
|
16
16
|
insee_canton,
|
|
17
17
|
latitude_l93,
|
|
18
|
+
latitude_wgs_fr_metropole,
|
|
18
19
|
longitude_l93,
|
|
20
|
+
longitude_wgs_fr_metropole,
|
|
19
21
|
pays,
|
|
20
22
|
region,
|
|
21
23
|
)
|
|
@@ -24,26 +26,38 @@ from csv_detective.detect_fields.FR.other import (
|
|
|
24
26
|
code_rna,
|
|
25
27
|
code_waldec,
|
|
26
28
|
csp_insee,
|
|
29
|
+
date_fr,
|
|
30
|
+
insee_ape700,
|
|
27
31
|
sexe,
|
|
28
32
|
siren,
|
|
33
|
+
siret,
|
|
29
34
|
tel_fr,
|
|
35
|
+
uai,
|
|
30
36
|
)
|
|
31
|
-
from csv_detective.detect_fields.FR.temp import jour_de_la_semaine
|
|
37
|
+
from csv_detective.detect_fields.FR.temp import jour_de_la_semaine, mois_de_annee
|
|
32
38
|
from csv_detective.detect_fields.geo import (
|
|
33
39
|
iso_country_code_alpha2,
|
|
34
40
|
iso_country_code_alpha3,
|
|
35
41
|
iso_country_code_numeric,
|
|
42
|
+
json_geojson,
|
|
43
|
+
latitude_wgs,
|
|
44
|
+
latlon_wgs,
|
|
45
|
+
longitude_wgs,
|
|
36
46
|
)
|
|
37
47
|
from csv_detective.detect_fields.other import (
|
|
48
|
+
booleen,
|
|
38
49
|
email,
|
|
39
50
|
json,
|
|
51
|
+
money,
|
|
40
52
|
mongo_object_id,
|
|
53
|
+
percent,
|
|
54
|
+
twitter,
|
|
41
55
|
url,
|
|
42
56
|
uuid,
|
|
43
57
|
int as test_int,
|
|
44
58
|
float as test_float,
|
|
45
59
|
)
|
|
46
|
-
from csv_detective.detect_fields.temp import date, datetime_iso, datetime_rfc822, year
|
|
60
|
+
from csv_detective.detect_fields.temp import date, datetime, datetime_iso, datetime_rfc822, year
|
|
47
61
|
from csv_detective.detection.variables import (
|
|
48
62
|
detect_continuous_variable,
|
|
49
63
|
detect_categorical_variable,
|
|
@@ -94,420 +108,261 @@ def test_detect_continuous_variable():
|
|
|
94
108
|
assert res2.values and res2.values[0] == "cont"
|
|
95
109
|
|
|
96
110
|
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
assert iso_country_code_alpha2._is(val)
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
def test_do_not_match_iso_country_code():
|
|
342
|
-
val = "XX"
|
|
343
|
-
assert not iso_country_code_alpha2._is(val)
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
# iso_country_code alpha-3
|
|
347
|
-
def test_match_iso_country_code_alpha3():
|
|
348
|
-
val = "FRA"
|
|
349
|
-
assert iso_country_code_alpha3._is(val)
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
def test_do_not_match_iso_country_code_alpha3():
|
|
353
|
-
val = "ABC"
|
|
354
|
-
assert not iso_country_code_alpha3._is(val)
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
# iso_country_code numerique
|
|
358
|
-
def test_match_iso_country_code_numeric():
|
|
359
|
-
val = "250"
|
|
360
|
-
assert iso_country_code_numeric._is(val)
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
def test_do_not_match_iso_country_code_numeric():
|
|
364
|
-
val = "003"
|
|
365
|
-
assert not iso_country_code_numeric._is(val)
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
# jour de la semaine
|
|
369
|
-
def test_match_jour_de_la_semaine():
|
|
370
|
-
val = "lundi"
|
|
371
|
-
assert jour_de_la_semaine._is(val)
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
def test_do_not_match_jour_de_la_semaine():
|
|
375
|
-
val = "jour de la biere"
|
|
376
|
-
assert not jour_de_la_semaine._is(val)
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
# year
|
|
380
|
-
def test_match_year():
|
|
381
|
-
val = "2015"
|
|
382
|
-
assert year._is(val)
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
def test_do_not_match_year():
|
|
386
|
-
val = "20166"
|
|
387
|
-
assert not year._is(val)
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
# date
|
|
391
|
-
def test_match_date():
|
|
392
|
-
val = "1960-08-07"
|
|
393
|
-
assert date._is(val)
|
|
394
|
-
val = "12/02/2007"
|
|
395
|
-
assert date._is(val)
|
|
396
|
-
val = "15 jan 1985"
|
|
397
|
-
assert date._is(val)
|
|
398
|
-
val = "15 décembre 1985"
|
|
399
|
-
assert date._is(val)
|
|
400
|
-
val = "02 05 2003"
|
|
401
|
-
assert date._is(val)
|
|
402
|
-
val = "20030502"
|
|
403
|
-
assert date._is(val)
|
|
404
|
-
val = "1993-12/02"
|
|
405
|
-
assert date._is(val)
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
def test_do_not_match_date():
|
|
409
|
-
val = "1993-1993-1993"
|
|
410
|
-
assert not date._is(val)
|
|
411
|
-
val = "39-10-1993"
|
|
412
|
-
assert not date._is(val)
|
|
413
|
-
val = "19-15-1993"
|
|
414
|
-
assert not date._is(val)
|
|
415
|
-
val = "15 tambour 1985"
|
|
416
|
-
assert not date._is(val)
|
|
417
|
-
val = "12152003"
|
|
418
|
-
assert not date._is(val)
|
|
419
|
-
val = "20031512"
|
|
420
|
-
assert not date._is(val)
|
|
421
|
-
val = "02052003"
|
|
422
|
-
assert not date._is(val)
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
# datetime
|
|
426
|
-
def test_match_datetime():
|
|
427
|
-
val = "2021-06-22T10:20:10"
|
|
428
|
-
assert datetime_iso._is(val)
|
|
429
|
-
val = "2021-06-22T30:20:10"
|
|
430
|
-
assert not datetime_iso._is(val)
|
|
431
|
-
|
|
432
|
-
val = "Sun, 06 Nov 1994 08:49:37 GMT"
|
|
433
|
-
assert datetime_rfc822._is(val)
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
# siren
|
|
437
|
-
def test_match_siren():
|
|
438
|
-
val = "552 100 554"
|
|
439
|
-
assert siren._is(val)
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
def test_do_not_match_siren():
|
|
443
|
-
val = "42"
|
|
444
|
-
assert not siren._is(val)
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
# rna
|
|
448
|
-
def test_match_rna():
|
|
449
|
-
val = "W751515517"
|
|
450
|
-
assert code_rna._is(val)
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
def test_do_not_match_rna():
|
|
454
|
-
vals = [
|
|
455
|
-
"W111111111111111111111111111111111111",
|
|
456
|
-
"w143788974",
|
|
457
|
-
"W12",
|
|
458
|
-
"678W23456",
|
|
459
|
-
"165789325",
|
|
460
|
-
"Wa1#89sf&h",
|
|
461
|
-
]
|
|
462
|
-
for val in vals:
|
|
463
|
-
assert not code_rna._is(val)
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
def test_match_waldec():
|
|
467
|
-
val = "751P00188854"
|
|
468
|
-
assert code_waldec._is(val)
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
def test_do_not_match_waldec():
|
|
472
|
-
val = "AA751PEE00188854"
|
|
473
|
-
assert not code_waldec._is(val)
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
# json
|
|
477
|
-
def test_match_json():
|
|
478
|
-
val = '{"pomme": "fruit", "reponse": 42}'
|
|
479
|
-
assert json._is(val)
|
|
480
|
-
val = "[1,2,3,4]"
|
|
481
|
-
assert json._is(val)
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
def test_do_not_match_json():
|
|
485
|
-
val = '{"coordinates": [45.783753, 3.049342], "citycode": "63870"}'
|
|
486
|
-
assert not json._is(val)
|
|
487
|
-
val = "666"
|
|
488
|
-
assert not json._is(val)
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
# int
|
|
492
|
-
def test_match_int():
|
|
493
|
-
for val in ["1", "0", "1764", "-24"]:
|
|
494
|
-
assert test_int._is(val)
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
def test_not_match_int():
|
|
498
|
-
for val in ["01053", "1.2", "123_456", "+35"]:
|
|
499
|
-
assert not test_int._is(val)
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
# float
|
|
503
|
-
def test_match_float():
|
|
504
|
-
for val in ["1", "0", "1764", "-24", "1.2", "1863.23", "-12.7", "0.1"]:
|
|
505
|
-
assert test_float._is(val)
|
|
111
|
+
fields = {
|
|
112
|
+
adresse: {
|
|
113
|
+
True: ["rue du martyr"],
|
|
114
|
+
False: ["un batiment"],
|
|
115
|
+
},
|
|
116
|
+
code_commune_insee: {
|
|
117
|
+
True: ["91471", "01053"],
|
|
118
|
+
False: ["914712", "01000"],
|
|
119
|
+
},
|
|
120
|
+
code_departement: {
|
|
121
|
+
True: ["75", "2A", "2b", "974", "01"],
|
|
122
|
+
False: ["00", "96", "101"],
|
|
123
|
+
},
|
|
124
|
+
code_fantoir: {
|
|
125
|
+
True: ["7755A", "B150B", "ZA04C", "ZB03D"],
|
|
126
|
+
False: ["7755", "ZA99A"],
|
|
127
|
+
},
|
|
128
|
+
code_postal: {
|
|
129
|
+
True: ["75020", "01000"],
|
|
130
|
+
False: ["77777", "018339"],
|
|
131
|
+
},
|
|
132
|
+
code_region: {
|
|
133
|
+
True: ["32"],
|
|
134
|
+
False: ["55"],
|
|
135
|
+
},
|
|
136
|
+
commune: {
|
|
137
|
+
True: ["saint denis"],
|
|
138
|
+
False: ["new york", "lion"],
|
|
139
|
+
},
|
|
140
|
+
departement: {
|
|
141
|
+
True: ["essonne"],
|
|
142
|
+
False: ["alabama", "auvergne"],
|
|
143
|
+
},
|
|
144
|
+
insee_canton: {
|
|
145
|
+
True: ["nantua"],
|
|
146
|
+
False: ["california"],
|
|
147
|
+
},
|
|
148
|
+
latitude_l93: {
|
|
149
|
+
True: ["6037008", "7123528.5", "7124528,5"],
|
|
150
|
+
False: ["0", "-6734529.6", "7245669.8", "3422674,78", "32_34"],
|
|
151
|
+
},
|
|
152
|
+
longitude_l93: {
|
|
153
|
+
True: ["0", "-154", "1265783,45", "34723.4"],
|
|
154
|
+
False: ["1456669.8", "-776225", "346_3214"],
|
|
155
|
+
},
|
|
156
|
+
latitude_wgs_fr_metropole: {
|
|
157
|
+
True: ["42.5"],
|
|
158
|
+
False: ["22.5", "62.5"],
|
|
159
|
+
},
|
|
160
|
+
longitude_wgs_fr_metropole: {
|
|
161
|
+
True: ["-2.5"],
|
|
162
|
+
False: ["12.8"],
|
|
163
|
+
},
|
|
164
|
+
pays: {
|
|
165
|
+
True: ["france", "italie"],
|
|
166
|
+
False: ["amerique", "paris"],
|
|
167
|
+
},
|
|
168
|
+
region: {
|
|
169
|
+
True: ["bretagne", "ile-de-france"],
|
|
170
|
+
False: ["baviere", "overgne"],
|
|
171
|
+
},
|
|
172
|
+
code_csp_insee: {
|
|
173
|
+
True: ["121f"],
|
|
174
|
+
False: ["121x"],
|
|
175
|
+
},
|
|
176
|
+
code_rna: {
|
|
177
|
+
True: ["W751515517"],
|
|
178
|
+
False: [
|
|
179
|
+
"W111111111111111111111111111111111111",
|
|
180
|
+
"w143788974",
|
|
181
|
+
"W12",
|
|
182
|
+
"678W23456",
|
|
183
|
+
"165789325",
|
|
184
|
+
"Wa1#89sf&h",
|
|
185
|
+
],
|
|
186
|
+
},
|
|
187
|
+
code_waldec: {
|
|
188
|
+
True: ["751P00188854"],
|
|
189
|
+
False: ["AA751PEE00188854"],
|
|
190
|
+
},
|
|
191
|
+
csp_insee: {
|
|
192
|
+
True: ["employes de la poste"],
|
|
193
|
+
False: ["super-heros"],
|
|
194
|
+
},
|
|
195
|
+
sexe: {
|
|
196
|
+
True: ["homme"],
|
|
197
|
+
False: ["hermaphrodite"],
|
|
198
|
+
},
|
|
199
|
+
siren: {
|
|
200
|
+
True: ["552 100 554", "552100554"],
|
|
201
|
+
False: ["42"],
|
|
202
|
+
},
|
|
203
|
+
siret: {
|
|
204
|
+
True: ["13002526500013", "130 025 265 00013"],
|
|
205
|
+
False: ["13002526500012"],
|
|
206
|
+
},
|
|
207
|
+
uai: {
|
|
208
|
+
True: ["0422170F"],
|
|
209
|
+
False: ["04292E"],
|
|
210
|
+
},
|
|
211
|
+
date_fr: {
|
|
212
|
+
True: ["13 fevrier 1996"],
|
|
213
|
+
False: ["44 march 2025"],
|
|
214
|
+
},
|
|
215
|
+
insee_ape700: {
|
|
216
|
+
True: ["0116Z"],
|
|
217
|
+
False: ["0116A"]
|
|
218
|
+
},
|
|
219
|
+
tel_fr: {
|
|
220
|
+
True: ["0134643467"],
|
|
221
|
+
False: ["6625388263", "01288398"],
|
|
222
|
+
},
|
|
223
|
+
jour_de_la_semaine: {
|
|
224
|
+
True: ["lundi"],
|
|
225
|
+
False: ["jour de la biere"],
|
|
226
|
+
},
|
|
227
|
+
mois_de_annee: {
|
|
228
|
+
True: ["juin", "décembre"],
|
|
229
|
+
False: ["november"],
|
|
230
|
+
},
|
|
231
|
+
iso_country_code_alpha2: {
|
|
232
|
+
True: ["FR"],
|
|
233
|
+
False: ["XX", "A", "FRA"],
|
|
234
|
+
},
|
|
235
|
+
iso_country_code_alpha3: {
|
|
236
|
+
True: ["FRA"],
|
|
237
|
+
False: ["XXX", "FR", "A"],
|
|
238
|
+
},
|
|
239
|
+
iso_country_code_numeric: {
|
|
240
|
+
True: ["250"],
|
|
241
|
+
False: ["003"],
|
|
242
|
+
},
|
|
243
|
+
json_geojson: {
|
|
244
|
+
True: [
|
|
245
|
+
'{"coordinates": [45.783753, 3.049342], "type": "63870"}',
|
|
246
|
+
'{"geometry": {"coordinates": [45.783753, 3.049342]}}',
|
|
247
|
+
],
|
|
248
|
+
False: ['{"pomme": "fruit", "reponse": 42}'],
|
|
249
|
+
},
|
|
250
|
+
latitude_wgs: {
|
|
251
|
+
True: ["43.2", "-22"],
|
|
252
|
+
False: ["100"],
|
|
253
|
+
},
|
|
254
|
+
latlon_wgs: {
|
|
255
|
+
True: ["43.2,-22.6", "-10.7,140", "-40.7, 10.8"],
|
|
256
|
+
False: ["0.1,192", "-102, 92"],
|
|
257
|
+
},
|
|
258
|
+
longitude_wgs: {
|
|
259
|
+
True: ["120", "-20.2"],
|
|
260
|
+
False: ["-200"],
|
|
261
|
+
},
|
|
262
|
+
booleen: {
|
|
263
|
+
True: ["oui", "0", "1", "yes", "false", "True"],
|
|
264
|
+
False: ["nein", "ja", "2", "-0"],
|
|
265
|
+
},
|
|
266
|
+
email: {
|
|
267
|
+
True: ["cdo_intern@data.gouv.fr"],
|
|
268
|
+
False: ["cdo@@gouv.sfd"],
|
|
269
|
+
},
|
|
270
|
+
json: {
|
|
271
|
+
True: ['{"pomme": "fruit", "reponse": 42}', "[1,2,3,4]"],
|
|
272
|
+
False: ['{"coordinates": [45.783753, 3.049342], "citycode": "63870"}', "{zefib:"],
|
|
273
|
+
},
|
|
274
|
+
money: {
|
|
275
|
+
True: ["120€", "-20.2$"],
|
|
276
|
+
False: ["200", "100 euros"],
|
|
277
|
+
},
|
|
278
|
+
mongo_object_id: {
|
|
279
|
+
True: ["62320e50f981bc2b57bcc044"],
|
|
280
|
+
False: ["884762be-51f3-44c3-b811-1e14c5d89262", "0230240284a66e"],
|
|
281
|
+
},
|
|
282
|
+
percent: {
|
|
283
|
+
True: ["120%", "-20.2%"],
|
|
284
|
+
False: ["200", "100 pourcents"],
|
|
285
|
+
},
|
|
286
|
+
twitter: {
|
|
287
|
+
True: ["@accueil1"],
|
|
288
|
+
False: ["adresse@mail"],
|
|
289
|
+
},
|
|
290
|
+
url: {
|
|
291
|
+
True: ["www.etalab.data.gouv.fr"],
|
|
292
|
+
False: ["une phrase avec un @ dedans"],
|
|
293
|
+
},
|
|
294
|
+
uuid: {
|
|
295
|
+
True: ["884762be-51f3-44c3-b811-1e14c5d89262"],
|
|
296
|
+
False: ["0610928327"],
|
|
297
|
+
},
|
|
298
|
+
test_int: {
|
|
299
|
+
True: ["1", "0", "1764", "-24"],
|
|
300
|
+
False: ["01053", "1.2", "123_456", "+35"],
|
|
301
|
+
},
|
|
302
|
+
test_float: {
|
|
303
|
+
True: ["1", "0", "1764", "-24", "1.2", "1863.23", "-12.7", "0.1"],
|
|
304
|
+
False: ["01053", "01053.89", "1e3", "123_456", "123_456.78", "+35", "+35.9"],
|
|
305
|
+
},
|
|
306
|
+
date: {
|
|
307
|
+
True: [
|
|
308
|
+
"1960-08-07",
|
|
309
|
+
"12/02/2007",
|
|
310
|
+
"15 jan 1985",
|
|
311
|
+
"15 décembre 1985",
|
|
312
|
+
"02 05 2003",
|
|
313
|
+
"20030502",
|
|
314
|
+
"1993-12/02",
|
|
315
|
+
],
|
|
316
|
+
False: [
|
|
317
|
+
"1993-1993-1993",
|
|
318
|
+
"39-10-1993",
|
|
319
|
+
"19-15-1993",
|
|
320
|
+
"15 tambour 1985",
|
|
321
|
+
"12152003",
|
|
322
|
+
"20031512",
|
|
323
|
+
"02052003",
|
|
324
|
+
],
|
|
325
|
+
},
|
|
326
|
+
datetime: {
|
|
327
|
+
True: ["2021-06-22T10:20:10"],
|
|
328
|
+
False: ["2021-06-22T30:20:10", "Sun, 06 Nov 1994 08:49:37 GMT"],
|
|
329
|
+
},
|
|
330
|
+
datetime_iso: {
|
|
331
|
+
True: ["2021-06-22T10:20:10"],
|
|
332
|
+
False: ["2021-06-22T30:20:10", "Sun, 06 Nov 1994 08:49:37 GMT"],
|
|
333
|
+
},
|
|
334
|
+
datetime_rfc822: {
|
|
335
|
+
True: ["Sun, 06 Nov 1994 08:49:37 GMT"],
|
|
336
|
+
False: ["2021-06-22T10:20:10"],
|
|
337
|
+
},
|
|
338
|
+
year: {
|
|
339
|
+
True: ["2015"],
|
|
340
|
+
False: ["20166"],
|
|
341
|
+
},
|
|
342
|
+
}
|
|
343
|
+
|
|
344
|
+
# we could also have a function here to add all True values of (almost)
|
|
345
|
+
# each field to the False values of all others
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
def test_all_fields_have_tests():
|
|
349
|
+
all_tests = return_all_tests("ALL", "detect_fields")
|
|
350
|
+
for test in all_tests:
|
|
351
|
+
assert fields.get(test)
|
|
506
352
|
|
|
507
353
|
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
354
|
+
@pytest.mark.parametrize(
|
|
355
|
+
"args",
|
|
356
|
+
(
|
|
357
|
+
(field, value, valid)
|
|
358
|
+
for field in fields
|
|
359
|
+
for valid in [True, False]
|
|
360
|
+
for value in fields[field][valid]
|
|
361
|
+
),
|
|
362
|
+
)
|
|
363
|
+
def test_fields_with_values(args):
|
|
364
|
+
field, value, valid = args
|
|
365
|
+
assert field._is(value) is valid
|
|
511
366
|
|
|
512
367
|
|
|
513
368
|
@pytest.mark.parametrize(
|
|
File without changes
|
|
File without changes
|
{csv_detective-0.7.5.dev1286.dist-info → csv_detective-0.7.5.dev1298.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|
{csv_detective-0.7.5.dev1286.dist-info → csv_detective-0.7.5.dev1298.dist-info}/top_level.txt
RENAMED
|
File without changes
|