csv-detective 0.7.5.dev980__py3-none-any.whl → 0.7.5.dev1052__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/__init__.py +1 -0
- csv_detective/create_example.py +247 -0
- csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +6 -2
- csv_detective/detect_fields/FR/geo/code_departement/__init__.py +10 -2
- csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +3 -1
- csv_detective/detect_fields/FR/geo/code_postal/__init__.py +6 -1
- csv_detective/detect_fields/FR/geo/code_region/__init__.py +4 -2
- csv_detective/detect_fields/FR/geo/commune/__init__.py +10 -2
- csv_detective/detect_fields/FR/geo/departement/__init__.py +10 -2
- csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +9 -2
- csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +5 -5
- csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +5 -5
- csv_detective/detect_fields/FR/geo/pays/__init__.py +10 -2
- csv_detective/detect_fields/FR/geo/region/__init__.py +42 -2
- csv_detective/detect_fields/FR/other/code_rna/__init__.py +3 -1
- {csv_detective-0.7.5.dev980.data → csv_detective-0.7.5.dev1052.data}/data/share/csv_detective/CHANGELOG.md +1 -1
- {csv_detective-0.7.5.dev980.dist-info → csv_detective-0.7.5.dev1052.dist-info}/METADATA +4 -2
- {csv_detective-0.7.5.dev980.dist-info → csv_detective-0.7.5.dev1052.dist-info}/RECORD +26 -24
- tests/test_example.py +71 -0
- tests/test_fields.py +4 -4
- {csv_detective-0.7.5.dev980.data → csv_detective-0.7.5.dev1052.data}/data/share/csv_detective/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev980.data → csv_detective-0.7.5.dev1052.data}/data/share/csv_detective/README.md +0 -0
- {csv_detective-0.7.5.dev980.dist-info → csv_detective-0.7.5.dev1052.dist-info}/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev980.dist-info → csv_detective-0.7.5.dev1052.dist-info}/WHEEL +0 -0
- {csv_detective-0.7.5.dev980.dist-info → csv_detective-0.7.5.dev1052.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.7.5.dev980.dist-info → csv_detective-0.7.5.dev1052.dist-info}/top_level.txt +0 -0
csv_detective/__init__.py
CHANGED
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
import random
|
|
2
|
+
import uuid
|
|
3
|
+
import string
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from typing import List, Union, Optional, Any, Type
|
|
7
|
+
import json
|
|
8
|
+
import requests
|
|
9
|
+
import rstr
|
|
10
|
+
from faker import Faker
|
|
11
|
+
|
|
12
|
+
fake = Faker()
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def create_example_csv_file(
    fields: Optional[List[dict]] = None,
    schema_path: Optional[str] = None,
    file_length: int = 10,
    output_name: str = 'example_file.csv',
    output_sep: str = ';',
    encoding: str = 'utf-8',
    ignore_required: bool = False,
) -> pd.DataFrame:
    '''
    Create an example file based on a list of dicts like follows:
    fields = [
        {
            "name": "column_name",
            "type": "column_type",
            "args": {dict_of_args}  # optional
        },
        ...
    ]
    Or from a TableSchema

    Args:
        fields: column specifications as described above. The supported
            "type" values are the keys of ``types_to_func`` below; an unknown
            type falls back to the random string generator. The dicts passed
            in are copied, never modified.
        schema_path: path or http(s) URL of a TableSchema JSON document. When
            given, the column specifications are built from its "fields" key
            and replace ``fields``.
        file_length: number of rows to generate.
        output_name: path of the CSV file to write; pass an empty string to
            skip writing and only get the DataFrame back.
        output_sep: separator used when writing the CSV file.
        encoding: encoding used to read a local schema file.
        ignore_required: if True, no value is ever left empty, whatever the
            "required" argument of each column says.

    Returns:
        The generated rows as a DataFrame, one column per field.

    Raises:
        ValueError: if neither ``fields`` nor ``schema_path`` is given, if the
            schema has no "fields" key, or if a date/datetime range does not
            have exactly two bounds.
    '''
    # need to make a CLI command

    if not (fields or schema_path):
        raise ValueError("At least fields or schema_path must be specified.")

    def potential_skip(required: bool) -> bool:
        '''Tell whether the current value should be left empty.'''
        if ignore_required:
            return False
        if not required:
            # for now an optional value is left empty 30% of the time,
            # this could go as an argument
            return random.randint(1, 100) <= 30
        return False

    def _string(
        length: int = 10,
        required: bool = True,
        pattern: Optional[str] = None,
        enum: Optional[List[str]] = None,
    ) -> str:
        '''Random string: from a regex, from an enum, or lowercase letters.'''
        if potential_skip(required):
            return ''
        if pattern is not None:
            return rstr.xeger(pattern)
        elif enum is not None:
            return random.choice(enum)
        else:
            letters = string.ascii_lowercase
            return ''.join(random.choice(letters) for _ in range(length))

    def _id(
        required: bool = True,
    ) -> str:
        '''Random UUID4 as a string.'''
        if potential_skip(required):
            return ''
        return str(uuid.uuid4())

    def _date(
        date_range: Union[None, List[str]] = None,
        format: str = '%Y-%m-%d',
        required: bool = True,
    ) -> str:
        '''Random date rendered with `format`, optionally within `date_range`.'''
        # the bounds specified in date_range are expected in the same format as the desired output format
        assert all([k in format for k in ['%d', '%m', '%Y']])
        if potential_skip(required):
            return ''
        if date_range is None:
            return fake.date(format)
        else:
            if len(date_range) != 2:
                raise ValueError('"date_range" must have exactly two elements.')
            return fake.date_between_dates(
                datetime.strptime(date_range[0], format),
                datetime.strptime(date_range[1], format),
            ).strftime(format)

    def _time(
        format: str = '%H:%M:%S',
        required: bool = True,
    ) -> str:
        '''Random time rendered with `format`.'''
        assert all([k in format for k in ['%H', '%M', '%S']])
        if potential_skip(required):
            return ''
        # maybe add a time_range argument?
        return fake.time(format)

    def _datetime(
        datetime_range: Optional[List[str]] = None,
        format: str = '%Y-%m-%d %H-%M-%S',
        required: bool = True,
    ) -> str:
        '''Random datetime rendered with `format`, optionally within `datetime_range`.'''
        # the bounds specified in datetime_range are expected in the same format as the desired output format
        assert all([k in format for k in ['%d', '%m', '%Y', '%H', '%M', '%S']])
        if potential_skip(required):
            return ''
        if datetime_range is None:
            return fake.date_time().strftime(format)
        else:
            if len(datetime_range) != 2:
                # the message names the parameter that is actually wrong
                raise ValueError('"datetime_range" must have exactly two elements.')
            return fake.date_time_between(
                datetime.strptime(datetime_range[0], format),
                datetime.strptime(datetime_range[1], format),
            ).strftime(format)

    def _url(required: bool = True) -> str:
        '''Random http URL.'''
        if potential_skip(required):
            return ''
        return f'http://{rstr.domainsafe()}.{rstr.letters(3)}/{rstr.urlsafe()}'

    def _number(
        num_type: Type[Union[int, float]] = int,
        num_range: Optional[List[float]] = None,
        enum: Optional[list] = None,
        required: bool = True,
    ) -> Union[int, float, str]:
        '''Random int or float (one decimal), from an enum or within `num_range`.'''
        assert num_range is None or len(num_range) == 2
        if potential_skip(required):
            # a skipped value is an empty cell, hence the str in the return type
            return ''
        if enum:
            return random.choice(enum)
        if num_range is None:
            num_range = [0, 1000]
        if num_type == int:
            return random.randint(num_range[0], num_range[1])
        else:
            return round(random.uniform(num_range[0], num_range[1]), 1)

    def _bool(required: bool = True) -> Union[bool, str]:
        '''Random boolean, or an empty string when the value is skipped.'''
        if potential_skip(required):
            return ''
        return random.randint(0, 1) == 0

    def _array(enum: List[Any], required: bool = True) -> str:
        '''Random non-empty subset of `enum`, rendered as "[a,b,...]".'''
        if potential_skip(required):
            return ''
        picked = random.sample(enum, random.randint(1, len(enum)))
        # str() so that enums of numbers can be joined too
        return f"[{','.join(str(item) for item in picked)}]"

    def build_args_from_constraints(constraints: dict) -> dict:
        '''Translate TableSchema constraints into generator keyword arguments.'''
        args = {}
        args['required'] = constraints.get('required', False)
        for key in ['pattern', 'enum', 'format']:
            if key in constraints:
                args[key] = constraints[key]
        if 'minimum' in constraints and 'maximum' in constraints:
            args['num_range'] = [constraints['minimum'], constraints['maximum']]
        # maybe there are better values than these?
        elif 'minimum' in constraints:
            args['num_range'] = [constraints['minimum'], 10 + constraints['minimum']]
        elif 'maximum' in constraints:
            args['num_range'] = [constraints['maximum'] - 10, constraints['maximum']]
        if 'minLength' in constraints:
            args['length'] = constraints['minLength']
        # when both are given, maxLength wins
        if 'maxLength' in constraints:
            args['length'] = constraints['maxLength']
        return args

    def _args_from_schema_field(schema_field: dict) -> dict:
        '''Use the field constraints, else its arrayItem constraints, else nothing.'''
        if 'constraints' in schema_field:
            return build_args_from_constraints(schema_field['constraints'])
        if 'arrayItem' in schema_field and 'constraints' in schema_field['arrayItem']:
            return build_args_from_constraints(schema_field['arrayItem']['constraints'])
        return {}

    schema_types_to_python = {
        'number': 'float',
        'integer': 'int',
        'string': 'str',
        'year': 'year',
        'boolean': 'bool',
        'date': 'date',
        'yearmonth': 'date',
        'time': 'time',
        'datetime': 'datetime',
        'array': 'array',
    }

    if schema_path:
        if schema_path.startswith('http'):
            schema = requests.get(schema_path).json()
        else:
            with open(schema_path, encoding=encoding) as jsonfile:
                schema = json.load(jsonfile)
        if 'fields' not in schema:
            raise ValueError('The schema must have a "fields" key.')
        fields = [
            {
                'name': f['name'],
                'type': schema_types_to_python.get(f['type'], 'str'),
                # when frformat is supported in TableSchema, we can build args for French standards
                # linked to https://github.com/datagouv/fr-format/issues/26
                'args': _args_from_schema_field(f),
            } for f in schema['fields']
        ]

    # work on copies so that the caller's dicts are left untouched
    specs = []
    for field in fields:
        spec = {**field, 'args': dict(field.get('args') or {})}
        if spec['type'] == 'float':
            spec['args']['num_type'] = float
        elif spec['type'] == 'int':
            spec['args']['num_type'] = int
        elif spec['type'] == 'year':
            spec['args']['num_type'] = int
            spec['args']['num_range'] = [1990, 2050]
        specs.append(spec)

    types_to_func = {
        'int': _number,
        'float': _number,
        'date': _date,
        'time': _time,
        'str': _string,
        'url': _url,
        'id': _id,
        'year': _number,
        'bool': _bool,
        'datetime': _datetime,
        'array': _array,
    }

    # would it be better to create by column or by row (as for now)?
    output = pd.DataFrame(
        [
            [
                # unknown types fall back to the string generator (a callable,
                # not the literal 'str')
                types_to_func.get(spec['type'], _string)(**spec['args'])
                for spec in specs
            ] for _ in range(file_length)
        ],
        columns=[spec["name"] for spec in specs],
    )

    if output_name:
        output.to_csv(output_name, sep=output_sep, index=False)

    return output
|
|
@@ -1,5 +1,9 @@
|
|
|
1
|
-
from frformat import CodeCommuneInsee
|
|
1
|
+
from frformat import CodeCommuneInsee, Millesime
|
|
2
2
|
|
|
3
3
|
PROPORTION = 0.75
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
_code_commune_insee = CodeCommuneInsee(Millesime.LATEST)
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _is(val):
|
|
9
|
+
return _code_commune_insee.is_valid(val)
|
|
@@ -1,7 +1,15 @@
|
|
|
1
|
-
from frformat import NumeroDepartement
|
|
1
|
+
from frformat import NumeroDepartement, Options, Millesime
|
|
2
2
|
|
|
3
3
|
PROPORTION = 1
|
|
4
4
|
|
|
5
|
+
_options = Options(
|
|
6
|
+
ignore_case=True,
|
|
7
|
+
ignore_accents=True,
|
|
8
|
+
replace_non_alphanumeric_with_space=True,
|
|
9
|
+
ignore_extra_whitespace=True
|
|
10
|
+
)
|
|
11
|
+
_numero_departement = NumeroDepartement(Millesime.LATEST, _options)
|
|
12
|
+
|
|
5
13
|
|
|
6
14
|
def _is(val):
|
|
7
|
-
return isinstance(val, str) and
|
|
15
|
+
return isinstance(val, str) and _numero_departement.is_valid(val)
|
|
@@ -1,8 +1,10 @@
|
|
|
1
|
-
from frformat import CodeRegion
|
|
1
|
+
from frformat import CodeRegion, Millesime
|
|
2
2
|
|
|
3
3
|
PROPORTION = 1
|
|
4
4
|
|
|
5
|
+
_code_region = CodeRegion(Millesime.LATEST)
|
|
6
|
+
|
|
5
7
|
|
|
6
8
|
def _is(val):
|
|
7
9
|
'''Renvoie True si val peut être un code_région, False sinon'''
|
|
8
|
-
return isinstance(val, str) and
|
|
10
|
+
return isinstance(val, str) and _code_region.is_valid(val)
|
|
@@ -1,8 +1,16 @@
|
|
|
1
|
-
from frformat import Commune
|
|
1
|
+
from frformat import Commune, Options, Millesime
|
|
2
2
|
|
|
3
3
|
PROPORTION = 0.9
|
|
4
4
|
|
|
5
|
+
_options = Options(
|
|
6
|
+
ignore_case=True,
|
|
7
|
+
ignore_accents=True,
|
|
8
|
+
replace_non_alphanumeric_with_space=True,
|
|
9
|
+
ignore_extra_whitespace=True
|
|
10
|
+
)
|
|
11
|
+
_commune = Commune(Millesime.LATEST, _options)
|
|
12
|
+
|
|
5
13
|
|
|
6
14
|
def _is(val):
|
|
7
15
|
"""Match avec le nom des communes"""
|
|
8
|
-
return isinstance(val, str) and
|
|
16
|
+
return isinstance(val, str) and _commune.is_valid(val)
|
|
@@ -1,8 +1,16 @@
|
|
|
1
|
-
from frformat import Departement
|
|
1
|
+
from frformat import Departement, Options, Millesime
|
|
2
2
|
|
|
3
3
|
PROPORTION = 0.9
|
|
4
4
|
|
|
5
|
+
_options = Options(
|
|
6
|
+
ignore_case=True,
|
|
7
|
+
ignore_accents=True,
|
|
8
|
+
replace_non_alphanumeric_with_space=True,
|
|
9
|
+
ignore_extra_whitespace=True
|
|
10
|
+
)
|
|
11
|
+
_departement = Departement(Millesime.LATEST, _options)
|
|
12
|
+
|
|
5
13
|
|
|
6
14
|
def _is(val):
|
|
7
15
|
"""Match avec le nom des departements"""
|
|
8
|
-
return isinstance(val, str) and
|
|
16
|
+
return isinstance(val, str) and _departement.is_valid(val)
|
|
@@ -1,8 +1,15 @@
|
|
|
1
|
-
from frformat import Canton
|
|
1
|
+
from frformat import Canton, Options, Millesime
|
|
2
2
|
|
|
3
3
|
PROPORTION = 0.9
|
|
4
|
+
_options = Options(
|
|
5
|
+
ignore_case=True,
|
|
6
|
+
ignore_accents=True,
|
|
7
|
+
replace_non_alphanumeric_with_space=True,
|
|
8
|
+
ignore_extra_whitespace=True
|
|
9
|
+
)
|
|
10
|
+
_canton = Canton(Millesime.LATEST, _options)
|
|
4
11
|
|
|
5
12
|
|
|
6
13
|
def _is(val):
|
|
7
14
|
"""Match avec le nom des cantons"""
|
|
8
|
-
return isinstance(val, str) and
|
|
15
|
+
return isinstance(val, str) and _canton.is_valid(val)
|
|
@@ -1,18 +1,18 @@
|
|
|
1
1
|
from frformat import LatitudeL93
|
|
2
2
|
from csv_detective.detect_fields.other.float import _is as is_float
|
|
3
|
+
|
|
3
4
|
from csv_detective.detect_fields.other.float import float_casting
|
|
4
5
|
|
|
5
6
|
|
|
6
7
|
PROPORTION = 0.9
|
|
7
8
|
|
|
9
|
+
_latitudel93 = LatitudeL93()
|
|
10
|
+
|
|
8
11
|
|
|
9
12
|
def _is(val):
|
|
10
13
|
try:
|
|
11
|
-
if isinstance(val, (
|
|
12
|
-
return
|
|
13
|
-
|
|
14
|
-
elif isinstance(val, str) and is_float(val):
|
|
15
|
-
return LatitudeL93.is_valid(float_casting(val))
|
|
14
|
+
if isinstance(val, str) and is_float(val):
|
|
15
|
+
return _latitudel93.is_valid(float_casting(val))
|
|
16
16
|
|
|
17
17
|
return False
|
|
18
18
|
|
|
@@ -1,18 +1,18 @@
|
|
|
1
1
|
from frformat import LongitudeL93
|
|
2
2
|
from csv_detective.detect_fields.other.float import _is as is_float
|
|
3
|
+
|
|
3
4
|
from csv_detective.detect_fields.other.float import float_casting
|
|
4
5
|
|
|
5
6
|
|
|
6
7
|
PROPORTION = 0.9
|
|
7
8
|
|
|
9
|
+
_longitudel93 = LongitudeL93()
|
|
10
|
+
|
|
8
11
|
|
|
9
12
|
def _is(val):
|
|
10
13
|
try:
|
|
11
|
-
if isinstance(val, (
|
|
12
|
-
return
|
|
13
|
-
|
|
14
|
-
elif isinstance(val, str) and is_float(val):
|
|
15
|
-
return LongitudeL93.is_valid(float_casting(val))
|
|
14
|
+
if isinstance(val, str) and is_float(val):
|
|
15
|
+
return _longitudel93.is_valid(float_casting(val))
|
|
16
16
|
|
|
17
17
|
return False
|
|
18
18
|
|
|
@@ -1,8 +1,16 @@
|
|
|
1
|
-
from frformat import Pays
|
|
1
|
+
from frformat import Pays, Options, Millesime
|
|
2
2
|
|
|
3
3
|
PROPORTION = 0.6
|
|
4
4
|
|
|
5
|
+
_options = Options(
|
|
6
|
+
ignore_case=True,
|
|
7
|
+
ignore_accents=True,
|
|
8
|
+
replace_non_alphanumeric_with_space=True,
|
|
9
|
+
ignore_extra_whitespace=True
|
|
10
|
+
)
|
|
11
|
+
_pays = Pays(Millesime.LATEST, _options)
|
|
12
|
+
|
|
5
13
|
|
|
6
14
|
def _is(val):
|
|
7
15
|
"""Match avec le nom des pays"""
|
|
8
|
-
return isinstance(val, str) and
|
|
16
|
+
return isinstance(val, str) and _pays.is_valid(val)
|
|
@@ -1,8 +1,48 @@
|
|
|
1
|
-
from frformat import Region
|
|
1
|
+
from frformat import Region, Options, Millesime
|
|
2
2
|
|
|
3
3
|
PROPORTION = 1
|
|
4
4
|
|
|
5
|
+
_extra_valid_values_set = frozenset({
|
|
6
|
+
"alsace",
|
|
7
|
+
"aquitaine",
|
|
8
|
+
"ara",
|
|
9
|
+
"aura",
|
|
10
|
+
"auvergne",
|
|
11
|
+
"auvergne et rhone alpes",
|
|
12
|
+
"basse normandie",
|
|
13
|
+
"bfc",
|
|
14
|
+
"bourgogne",
|
|
15
|
+
"bourgogne et franche comte",
|
|
16
|
+
"centre",
|
|
17
|
+
"champagne ardenne",
|
|
18
|
+
"franche comte",
|
|
19
|
+
"ge",
|
|
20
|
+
"haute normandie",
|
|
21
|
+
"hdf",
|
|
22
|
+
"languedoc roussillon",
|
|
23
|
+
"limousin",
|
|
24
|
+
"lorraine",
|
|
25
|
+
"midi pyrenees",
|
|
26
|
+
"nord pas de calais",
|
|
27
|
+
"npdc",
|
|
28
|
+
"paca",
|
|
29
|
+
"picardie",
|
|
30
|
+
"poitou charentes",
|
|
31
|
+
"reunion",
|
|
32
|
+
"rhone alpes",
|
|
33
|
+
})
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
_options = Options(
|
|
37
|
+
ignore_case=True,
|
|
38
|
+
ignore_accents=True,
|
|
39
|
+
replace_non_alphanumeric_with_space=True,
|
|
40
|
+
ignore_extra_whitespace=True,
|
|
41
|
+
extra_valid_values=_extra_valid_values_set
|
|
42
|
+
)
|
|
43
|
+
_region = Region(Millesime.LATEST, _options)
|
|
44
|
+
|
|
5
45
|
|
|
6
46
|
def _is(val):
|
|
7
47
|
"""Match avec le nom des regions"""
|
|
8
|
-
return isinstance(val, str) and
|
|
48
|
+
return isinstance(val, str) and _region.is_valid(val)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: csv_detective
|
|
3
|
-
Version: 0.7.5.
|
|
3
|
+
Version: 0.7.5.dev1052
|
|
4
4
|
Summary: Detect CSV column content
|
|
5
5
|
Home-page: https://github.com/etalab/csv_detective
|
|
6
6
|
Author: Etalab
|
|
@@ -26,5 +26,7 @@ Requires-Dist: odfpy==1.4.1
|
|
|
26
26
|
Requires-Dist: requests==2.32.3
|
|
27
27
|
Requires-Dist: responses==0.25.0
|
|
28
28
|
Requires-Dist: python-magic==0.4.27
|
|
29
|
-
Requires-Dist: frformat==0.
|
|
29
|
+
Requires-Dist: frformat==0.4.0
|
|
30
|
+
Requires-Dist: faker==33.0.0
|
|
31
|
+
Requires-Dist: rstr==3.2.2
|
|
30
32
|
|
|
@@ -1,5 +1,6 @@
|
|
|
1
|
-
csv_detective/__init__.py,sha256=
|
|
1
|
+
csv_detective/__init__.py,sha256=Au4bNJ_Gi6P6o0uO4R56nYdshG7M6-7Rg_xX4whLmLI,143
|
|
2
2
|
csv_detective/cli.py,sha256=Ua7SE1wMH2uFUsTmfumh4nJk7O06okpMd2gvjUDO1II,1048
|
|
3
|
+
csv_detective/create_example.py,sha256=358e7Q7RWMrY_eEo3pUteJWmg2smFb5edJ_AzcQPrqA,8646
|
|
3
4
|
csv_detective/detection.py,sha256=AuXlPOZfzqznZY2ybAAgaXIq6qVITYd3MXf2CoigI3I,22097
|
|
4
5
|
csv_detective/explore_csv.py,sha256=X5yZS3WCUsafUMcs5tOnDTeMGzMnfr0iB9vEDx7xiqg,16977
|
|
5
6
|
csv_detective/process_text.py,sha256=rsfk66BCmdpsCOd0kDJ8tmqMsEWd-OeBkEisWc4Ej9k,1246
|
|
@@ -10,24 +11,24 @@ csv_detective/detect_fields/__init__.py,sha256=CchNbi1vrgIGh_uBexXZTzfjBETDY0kQL
|
|
|
10
11
|
csv_detective/detect_fields/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
12
|
csv_detective/detect_fields/FR/geo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
13
|
csv_detective/detect_fields/FR/geo/adresse/__init__.py,sha256=e5JqMNOPxx0Ivju3zAHCGMopZroCpR4vr3DJKlQhMz4,1675
|
|
13
|
-
csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py,sha256=
|
|
14
|
-
csv_detective/detect_fields/FR/geo/code_departement/__init__.py,sha256=
|
|
15
|
-
csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py,sha256=
|
|
16
|
-
csv_detective/detect_fields/FR/geo/code_postal/__init__.py,sha256=
|
|
17
|
-
csv_detective/detect_fields/FR/geo/code_region/__init__.py,sha256=
|
|
18
|
-
csv_detective/detect_fields/FR/geo/commune/__init__.py,sha256=
|
|
19
|
-
csv_detective/detect_fields/FR/geo/departement/__init__.py,sha256=
|
|
20
|
-
csv_detective/detect_fields/FR/geo/insee_canton/__init__.py,sha256=
|
|
21
|
-
csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py,sha256=
|
|
14
|
+
csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py,sha256=tfHdqUnCQ0cv-fBo3Cy--8UNXzgjld4kseI5eQ_sR4E,187
|
|
15
|
+
csv_detective/detect_fields/FR/geo/code_departement/__init__.py,sha256=unr-Y4zquKSM5PVUiQGnOm-zQvaN8qd3v_XHf0W2VH8,378
|
|
16
|
+
csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py,sha256=27bCkZP5w7tpsKUdOIXuiAG90DTdw066CWg3G5HtsKE,160
|
|
17
|
+
csv_detective/detect_fields/FR/geo/code_postal/__init__.py,sha256=e1SdnW8zVSxrRMm-CeK9tlkLzORP2C6KOInTWnB7h3o,134
|
|
18
|
+
csv_detective/detect_fields/FR/geo/code_region/__init__.py,sha256=y-TPljkf-W209tp7V0RnJ34936XxB6FA2-XPYK3DV8I,253
|
|
19
|
+
csv_detective/detect_fields/FR/geo/commune/__init__.py,sha256=tZ4d1BQd9Xow0SWBcmuGlnX-RKHDzCstdY9AsXM6-Nk,379
|
|
20
|
+
csv_detective/detect_fields/FR/geo/departement/__init__.py,sha256=je2zLsPlK_X189bbmKzf4BJSEoFShxMz2eQNXB7hsh0,399
|
|
21
|
+
csv_detective/detect_fields/FR/geo/insee_canton/__init__.py,sha256=3uNN_Iha6dFfm24CluUmkHFg6nj7kRQaXrHDEcLfyjY,373
|
|
22
|
+
csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py,sha256=5v07RJbi12eoPa-e_-q8xlWBew80FPMxsggcMgZQiI8,438
|
|
22
23
|
csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py,sha256=WjPHg8s0ND6bOwS-yo6FP1dnwD-6SWg9oH1K0avHsbI,344
|
|
23
|
-
csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py,sha256=
|
|
24
|
+
csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py,sha256=ZwThPSfbRwNHA_anuplxTPYHK-WMduc_np2Xw9XsApM,442
|
|
24
25
|
csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py,sha256=d4fLK4IndwllDhsddyTbyRiPfc8O9wT0pLIRI_C3QvQ,344
|
|
25
|
-
csv_detective/detect_fields/FR/geo/pays/__init__.py,sha256=
|
|
26
|
-
csv_detective/detect_fields/FR/geo/region/__init__.py,sha256=
|
|
26
|
+
csv_detective/detect_fields/FR/geo/pays/__init__.py,sha256=2q5T4SmCK6ZFF1mrv7d-q9tOIQKBcROI24y_UYIuvz0,383
|
|
27
|
+
csv_detective/detect_fields/FR/geo/region/__init__.py,sha256=JbFKDd4jAnd9yb7YqP36MoLdO1JFPm1cg60fGXt6ZvI,1074
|
|
27
28
|
csv_detective/detect_fields/FR/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
28
29
|
csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py,sha256=X0NT6YbBg9PrxIcBwzUCQuBiv_QdDdqb3CJnrlent28,566
|
|
29
30
|
csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt,sha256=rbcjtMP6qTZ7BTU6ZegkiXKCruqY_m9Ep6ZgRabFS_E,2486
|
|
30
|
-
csv_detective/detect_fields/FR/other/code_rna/__init__.py,sha256=
|
|
31
|
+
csv_detective/detect_fields/FR/other/code_rna/__init__.py,sha256=Z0RjMBt1--ZL7Jd1RsHAQCCbTAQk_BnlnTq8VF1o_VA,146
|
|
31
32
|
csv_detective/detect_fields/FR/other/code_waldec/__init__.py,sha256=g9n5sOjRlk4I9YFZjdaTYrXf8ftXRDunGZOUpYhN4fA,295
|
|
32
33
|
csv_detective/detect_fields/FR/other/csp_insee/__init__.py,sha256=XacU_3rwXqtdbw_ULTSnu0OOtx0w_rKlviCrLmNdHjc,496
|
|
33
34
|
csv_detective/detect_fields/FR/other/csp_insee/csp_insee.txt,sha256=kgKaKc-5PHu5U4--ugLjpFyMNtTU9CGdZ9ANU3YAsM4,32879
|
|
@@ -125,17 +126,18 @@ csv_detective/detect_labels/temp/date/__init__.py,sha256=GrIbo64WVM3hi7ShBRKKyKU
|
|
|
125
126
|
csv_detective/detect_labels/temp/datetime_iso/__init__.py,sha256=Ih9l56nBcdmGLyWDavVUWuUUuVZBz9QUDE1hHzADvVg,1157
|
|
126
127
|
csv_detective/detect_labels/temp/datetime_rfc822/__init__.py,sha256=DQ_h4uDW1e6qu2rATEhgGKw6O-vVi7HbDhbEDDCT9uY,1175
|
|
127
128
|
csv_detective/detect_labels/temp/year/__init__.py,sha256=zPF_mvhzhXMAlHPAskS8mhuxjLj2AlKpV4ss8Q4tDms,1150
|
|
128
|
-
csv_detective-0.7.5.
|
|
129
|
-
csv_detective-0.7.5.
|
|
130
|
-
csv_detective-0.7.5.
|
|
129
|
+
csv_detective-0.7.5.dev1052.data/data/share/csv_detective/CHANGELOG.md,sha256=oDqKO3qTo-cUSJB4fMbsyQY2O4pEQhOwWeHsZwaGkxM,6725
|
|
130
|
+
csv_detective-0.7.5.dev1052.data/data/share/csv_detective/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
|
|
131
|
+
csv_detective-0.7.5.dev1052.data/data/share/csv_detective/README.md,sha256=Qr8xRXc-dxQ-tdXCpCTCKp1Uliqq84r0UOlPRNuGCpI,9506
|
|
131
132
|
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
132
|
-
tests/
|
|
133
|
+
tests/test_example.py,sha256=0NfChooJQlFxTo2nY5FOQIcsK4zzWA_SBmt2LwVQovY,2014
|
|
134
|
+
tests/test_fields.py,sha256=kXel-hiyQYrJ3OLmwUMg1K3DKbbwBLvUplxZWxpp18I,10605
|
|
133
135
|
tests/test_file.py,sha256=1fEOu3bArGBaarRKAoTXAF3cSIGJfFN3UIwOW6esWRs,6399
|
|
134
136
|
tests/test_labels.py,sha256=6MOKrGznkwU5fjZ_3oiB6Scmb480Eu-9geBJs0UDLds,159
|
|
135
137
|
tests/test_structure.py,sha256=SVsnluVoIIprYw_67I1_gB3cp9m1wlO8C7SpdsLW8cM,1161
|
|
136
|
-
csv_detective-0.7.5.
|
|
137
|
-
csv_detective-0.7.5.
|
|
138
|
-
csv_detective-0.7.5.
|
|
139
|
-
csv_detective-0.7.5.
|
|
140
|
-
csv_detective-0.7.5.
|
|
141
|
-
csv_detective-0.7.5.
|
|
138
|
+
csv_detective-0.7.5.dev1052.dist-info/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
|
|
139
|
+
csv_detective-0.7.5.dev1052.dist-info/METADATA,sha256=mgg54BtjqPn-L_G2a4JU0SyorK8uYzUyp64cxOAIe6A,1146
|
|
140
|
+
csv_detective-0.7.5.dev1052.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
|
|
141
|
+
csv_detective-0.7.5.dev1052.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
|
|
142
|
+
csv_detective-0.7.5.dev1052.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
|
|
143
|
+
csv_detective-0.7.5.dev1052.dist-info/RECORD,,
|
tests/test_example.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
|
|
2
|
+
import re
|
|
3
|
+
from uuid import UUID
|
|
4
|
+
from csv_detective import create_example_csv_file
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def test_example_creation():
    """Build a 5-row example from explicit field specs and check each column."""
    producer_types = ['privé', 'public', 'association']
    earliest, latest = '1996-02-13', '2000-01-28'
    n_rows = 5
    specs = [
        {"name": "id_unique", "type": "id"},
        {"name": "nom_modele", "type": "str", "args": {'length': 20}},
        {"name": "siret", "type": "str", "args": {'pattern': '^\\d{14}$'}},
        {"name": "type_producteur", "type": "str", "args": {'enum': ['privé', 'public', 'association']}},
        {
            "name": "date_creation",
            "type": "date",
            "args": {'date_range': [earliest, latest], 'format': '%Y-%m-%d'},
        },
        {"name": "url_produit", "type": "url"},
        {"name": "nb_produits", "type": "int"},
        {"name": "note", "type": "float", "args": {'num_range': [1, 20]}},
    ]
    # an empty output_name means nothing is written to disk
    example = create_example_csv_file(
        fields=specs,
        file_length=n_rows,
        output_name="",
    )
    assert len(example) == n_rows
    for identifier in example["id_unique"]:
        # UUID() raises on anything that is not a valid UUID
        assert UUID(identifier)
    for model_name in example["nom_modele"]:
        assert len(model_name) == 20
    for siret in example["siret"]:
        assert re.match("^\\d{14}$", siret)
    for producer in example["type_producteur"]:
        assert producer in producer_types
    for created in example["date_creation"]:
        # ISO dates compare correctly as plain strings
        assert earliest <= created <= latest
    for url in example["url_produit"]:
        assert url.startswith("http")
    for count in example["nb_produits"]:
        assert isinstance(count, int)
    for grade in example["note"]:
        assert 1 <= grade <= 20
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def test_example_from_tableschema():
    """Build an example from a remote TableSchema and check the default row count."""
    remote_schema = "https://schema.data.gouv.fr/schemas/etalab/schema-irve-statique/2.3.1/schema-statique.json"
    # file_length is left to its default; an empty output_name skips the CSV write
    example = create_example_csv_file(schema_path=remote_schema, output_name="")
    assert len(example) == 10
|
tests/test_fields.py
CHANGED
|
@@ -282,26 +282,26 @@ def test_do_not_match_canton():
|
|
|
282
282
|
|
|
283
283
|
# latitude_l93
|
|
284
284
|
def test_match_latitude_l93():
|
|
285
|
-
vals = [6037008, 7123528.5, "7124528,5"]
|
|
285
|
+
vals = ["6037008", "7123528.5", "7124528,5"]
|
|
286
286
|
for val in vals:
|
|
287
287
|
assert latitude_l93._is(val)
|
|
288
288
|
|
|
289
289
|
|
|
290
290
|
def test_do_not_match_latitude_93():
|
|
291
|
-
vals = [0, -6734529.6, 7245669.8, "3422674,78", "32_34"]
|
|
291
|
+
vals = ["0", "-6734529.6", "7245669.8", "3422674,78", "32_34"]
|
|
292
292
|
for val in vals:
|
|
293
293
|
assert not latitude_l93._is(val)
|
|
294
294
|
|
|
295
295
|
|
|
296
296
|
# longitude_l93
|
|
297
297
|
def test_match_longitude_l93():
|
|
298
|
-
vals = [0, -154, "1265783,45", 34723.4]
|
|
298
|
+
vals = ["0", "-154", "1265783,45", "34723.4"]
|
|
299
299
|
for val in vals:
|
|
300
300
|
assert longitude_l93._is(val)
|
|
301
301
|
|
|
302
302
|
|
|
303
303
|
def test_do_not_match_longitude_93():
|
|
304
|
-
vals = [1456669.8, "-776225", "346_3214"]
|
|
304
|
+
vals = ["1456669.8", "-776225", "346_3214"]
|
|
305
305
|
for val in vals:
|
|
306
306
|
assert not longitude_l93._is(val)
|
|
307
307
|
|
|
File without changes
|
|
File without changes
|
{csv_detective-0.7.5.dev980.dist-info → csv_detective-0.7.5.dev1052.dist-info}/LICENSE.AGPL.txt
RENAMED
|
File without changes
|
|
File without changes
|
{csv_detective-0.7.5.dev980.dist-info → csv_detective-0.7.5.dev1052.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{csv_detective-0.7.5.dev980.dist-info → csv_detective-0.7.5.dev1052.dist-info}/top_level.txt
RENAMED
|
File without changes
|