csv-detective 0.7.5.dev1009__py3-none-any.whl → 0.7.5.dev1052__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/__init__.py +1 -0
- csv_detective/create_example.py +247 -0
- {csv_detective-0.7.5.dev1009.data → csv_detective-0.7.5.dev1052.data}/data/share/csv_detective/CHANGELOG.md +1 -1
- {csv_detective-0.7.5.dev1009.dist-info → csv_detective-0.7.5.dev1052.dist-info}/METADATA +3 -1
- {csv_detective-0.7.5.dev1009.dist-info → csv_detective-0.7.5.dev1052.dist-info}/RECORD +12 -10
- tests/test_example.py +71 -0
- {csv_detective-0.7.5.dev1009.data → csv_detective-0.7.5.dev1052.data}/data/share/csv_detective/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev1009.data → csv_detective-0.7.5.dev1052.data}/data/share/csv_detective/README.md +0 -0
- {csv_detective-0.7.5.dev1009.dist-info → csv_detective-0.7.5.dev1052.dist-info}/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev1009.dist-info → csv_detective-0.7.5.dev1052.dist-info}/WHEEL +0 -0
- {csv_detective-0.7.5.dev1009.dist-info → csv_detective-0.7.5.dev1052.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.7.5.dev1009.dist-info → csv_detective-0.7.5.dev1052.dist-info}/top_level.txt +0 -0
csv_detective/__init__.py
CHANGED
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
import random
|
|
2
|
+
import uuid
|
|
3
|
+
import string
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from typing import List, Union, Optional, Any, Type
|
|
7
|
+
import json
|
|
8
|
+
import requests
|
|
9
|
+
import rstr
|
|
10
|
+
from faker import Faker
|
|
11
|
+
|
|
12
|
+
fake = Faker()
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def create_example_csv_file(
    fields: Optional[List[dict]] = None,
    schema_path: Optional[str] = None,
    file_length: int = 10,
    output_name: str = 'example_file.csv',
    output_sep: str = ';',
    encoding: str = 'utf-8',
    ignore_required: bool = False,
) -> pd.DataFrame:
    '''
    Create an example file based on a list of dicts like follows:
    fields = [
        {
            "name": "column_name",
            "type": "column_type",
            "args": {dict_of_args} # optional
        },
        ...
    ]
    Or from a TableSchema (local file path or http(s) URL) given in `schema_path`.

    Args:
        fields: list of column descriptions (see above); superseded by `schema_path`.
        schema_path: path or URL of a TableSchema JSON file with a "fields" key.
        file_length: number of rows to generate.
        output_name: CSV destination; a falsy value skips writing to disk.
        output_sep: separator used when writing the CSV.
        encoding: encoding used to read a local schema file.
        ignore_required: if True, every cell is filled; otherwise non-required
            columns are randomly left empty.

    Returns:
        The generated rows as a pandas DataFrame.

    Raises:
        ValueError: if neither `fields` nor `schema_path` is given, if the
            schema lacks a "fields" key, or if a *_range argument does not
            have exactly two elements.
    '''
    # need to make a CLI command

    if not (fields or schema_path):
        raise ValueError("At least fields or schema_path must be specified.")

    def potential_skip(required: bool) -> bool:
        # Decide whether an optional cell should be left empty.
        if ignore_required:
            return False
        if not required:
            # for now 30% chance to have an optional value, this could go as an argument
            return random.randint(1, 100) <= 30
        # required fields are never skipped (was an implicit `None` fall-through)
        return False

    def _string(
        length: int = 10,
        required: bool = True,
        pattern: Optional[str] = None,
        enum: Optional[list] = None,
    ) -> str:
        # Random string from a regex pattern, an enum, or plain lowercase letters.
        if potential_skip(required):
            return ''
        if pattern is not None:
            return rstr.xeger(pattern)
        elif enum is not None:
            return random.choice(enum)
        else:
            letters = string.ascii_lowercase
            return ''.join(random.choice(letters) for i in range(length))

    def _id(
        required: bool = True,
    ) -> str:
        # Random UUID4 rendered as a string.
        if potential_skip(required):
            return ''
        return str(uuid.uuid4())

    def _date(
        date_range: Union[None, List[str]] = None,
        format: str = '%Y-%m-%d',
        required: bool = True,
    ) -> str:
        # the bounds specified in date_range are expected in the same format as the desired output format
        assert all([k in format for k in ['%d', '%m', '%Y']])
        if potential_skip(required):
            return ''
        if date_range is None:
            return fake.date(format)
        else:
            if len(date_range) != 2:
                raise ValueError('"date_range" must have exactly two elements.')
            return fake.date_between_dates(
                datetime.strptime(date_range[0], format),
                datetime.strptime(date_range[1], format),
            ).strftime(format)

    def _time(
        format: str = '%H:%M:%S',
        required: bool = True,
    ) -> str:
        assert all([k in format for k in ['%H', '%M', '%S']])
        if potential_skip(required):
            return ''
        # maybe add a time_range argument?
        return fake.time(format)

    def _datetime(
        datetime_range: Optional[List[str]] = None,
        format: str = '%Y-%m-%d %H-%M-%S',
        required: bool = True,
    ) -> str:
        # the bounds specified in datetime_range are expected in the same format as the desired output format
        assert all([k in format for k in ['%d', '%m', '%Y', '%H', '%M', '%S']])
        if potential_skip(required):
            return ''
        if datetime_range is None:
            return fake.date_time().strftime(format)
        else:
            if len(datetime_range) != 2:
                # fixed: the message previously referred to "date_range"
                raise ValueError('"datetime_range" must have exactly two elements.')
            return fake.date_time_between(
                datetime.strptime(datetime_range[0], format),
                datetime.strptime(datetime_range[1], format),
            ).strftime(format)

    def _url(required: bool = True) -> str:
        # Random but syntactically plausible http URL.
        if potential_skip(required):
            return ''
        return f'http://{rstr.domainsafe()}.{rstr.letters(3)}/{rstr.urlsafe()}'

    def _number(
        num_type: Type[Union[int, float]] = int,
        num_range: Optional[List[float]] = None,
        enum: Optional[list] = None,
        required: bool = True,
    ) -> Union[int, float]:
        assert num_range is None or len(num_range) == 2
        if potential_skip(required):
            # empty string marks a skipped cell, same convention as other generators
            return ''
        if enum:
            return random.choice(enum)
        if num_range is None:
            num_range = [0, 1000]
        if num_type == int:
            return random.randint(num_range[0], num_range[1])
        else:
            # floats are rounded to one decimal place
            return round(random.uniform(num_range[0], num_range[1]), 1)

    def _bool(required: bool = True) -> bool:
        if potential_skip(required):
            return ''
        return random.randint(0, 1) == 0

    def _array(enum: List[Any], required: bool = True) -> str:
        # Random non-empty subset of enum, serialized as "[a,b,...]".
        if potential_skip(required):
            return ''
        return f"[{','.join(random.sample(enum, random.randint(1, len(enum))))}]"

    def build_args_from_constraints(constraints: dict) -> dict:
        # Translate TableSchema constraints into generator keyword arguments.
        args = {}
        args['required'] = constraints.get('required', False)
        for _ in ['pattern', 'enum', 'format']:
            if _ in constraints:
                args[_] = constraints[_]
        if 'minimum' in constraints and 'maximum' in constraints:
            args['num_range'] = [constraints['minimum'], constraints['maximum']]
        # maybe there are better values than these?
        elif 'minimum' in constraints:
            args['num_range'] = [constraints['minimum'], 10 + constraints['minimum']]
        elif 'maximum' in constraints:
            args['num_range'] = [constraints['maximum'] - 10, constraints['maximum']]
        if 'minLength' in constraints:
            args['length'] = constraints['minLength']
        # NOTE: when both are present, maxLength takes precedence over minLength
        if 'maxLength' in constraints:
            args['length'] = constraints['maxLength']
        return args

    # map TableSchema field types to the internal type names used by types_to_func
    schema_types_to_python = {
        'number': 'float',
        'integer': 'int',
        'string': 'str',
        'year': 'year',
        'boolean': 'bool',
        'date': 'date',
        'yearmonth': 'date',
        'time': 'time',
        'datetime': 'datetime',
        'array': 'array'
    }

    if schema_path:
        if schema_path.startswith('http'):
            schema = requests.get(schema_path).json()
        else:
            with open(schema_path, encoding=encoding) as jsonfile:
                schema = json.load(jsonfile)
        if not ('fields' in schema.keys()):
            raise ValueError('The schema must have a "fields" key.')
        else:
            fields = [
                {
                    'name': f['name'],
                    'type': schema_types_to_python.get(f['type'], 'str'),
                    # when frformat is supported in TableSchema, we can build args for French standards
                    # linked to https://github.com/datagouv/fr-format/issues/26
                    'args': (
                        build_args_from_constraints(f['constraints']) if 'constraints' in f.keys()
                        else build_args_from_constraints(f['arrayItem']['constraints'])
                        if 'arrayItem' in f.keys() and 'constraints' in f['arrayItem'].keys()
                        else {}
                    )
                } for f in schema['fields']
            ]

    # normalize every field: guarantee an 'args' dict and numeric type hints
    for k in range(len(fields)):
        if 'args' not in fields[k]:
            fields[k]['args'] = {}
        if fields[k]['type'] == 'float':
            fields[k]['args']['num_type'] = float
        elif fields[k]['type'] == 'int':
            fields[k]['args']['num_type'] = int
        elif fields[k]['type'] == 'year':
            fields[k]['args']['num_type'] = int
            fields[k]['args']['num_range'] = [1990, 2050]

    types_to_func = {
        'int': _number,
        'float': _number,
        'date': _date,
        'time': _time,
        'str': _string,
        'url': _url,
        'id': _id,
        'year': _number,
        'bool': _bool,
        'datetime': _datetime,
        'array': _array,
    }

    # would it be better to create by column or by row (as for now)?
    output = pd.DataFrame(
        [
            [
                # fall back to plain string generation for unknown types
                # (fixed: the default used to be the non-callable literal 'str')
                types_to_func.get(f['type'], _string)(**f['args'])
                for f in fields
            ] for _ in range(file_length)
        ],
        columns=[f["name"] for f in fields],
    )

    if output_name:
        output.to_csv(output_name, sep=output_sep, index=False)

    return output
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: csv_detective
|
|
3
|
-
Version: 0.7.5.
|
|
3
|
+
Version: 0.7.5.dev1052
|
|
4
4
|
Summary: Detect CSV column content
|
|
5
5
|
Home-page: https://github.com/etalab/csv_detective
|
|
6
6
|
Author: Etalab
|
|
@@ -27,4 +27,6 @@ Requires-Dist: requests==2.32.3
|
|
|
27
27
|
Requires-Dist: responses==0.25.0
|
|
28
28
|
Requires-Dist: python-magic==0.4.27
|
|
29
29
|
Requires-Dist: frformat==0.4.0
|
|
30
|
+
Requires-Dist: faker==33.0.0
|
|
31
|
+
Requires-Dist: rstr==3.2.2
|
|
30
32
|
|
|
@@ -1,5 +1,6 @@
|
|
|
1
|
-
csv_detective/__init__.py,sha256=
|
|
1
|
+
csv_detective/__init__.py,sha256=Au4bNJ_Gi6P6o0uO4R56nYdshG7M6-7Rg_xX4whLmLI,143
|
|
2
2
|
csv_detective/cli.py,sha256=Ua7SE1wMH2uFUsTmfumh4nJk7O06okpMd2gvjUDO1II,1048
|
|
3
|
+
csv_detective/create_example.py,sha256=358e7Q7RWMrY_eEo3pUteJWmg2smFb5edJ_AzcQPrqA,8646
|
|
3
4
|
csv_detective/detection.py,sha256=AuXlPOZfzqznZY2ybAAgaXIq6qVITYd3MXf2CoigI3I,22097
|
|
4
5
|
csv_detective/explore_csv.py,sha256=X5yZS3WCUsafUMcs5tOnDTeMGzMnfr0iB9vEDx7xiqg,16977
|
|
5
6
|
csv_detective/process_text.py,sha256=rsfk66BCmdpsCOd0kDJ8tmqMsEWd-OeBkEisWc4Ej9k,1246
|
|
@@ -125,17 +126,18 @@ csv_detective/detect_labels/temp/date/__init__.py,sha256=GrIbo64WVM3hi7ShBRKKyKU
|
|
|
125
126
|
csv_detective/detect_labels/temp/datetime_iso/__init__.py,sha256=Ih9l56nBcdmGLyWDavVUWuUUuVZBz9QUDE1hHzADvVg,1157
|
|
126
127
|
csv_detective/detect_labels/temp/datetime_rfc822/__init__.py,sha256=DQ_h4uDW1e6qu2rATEhgGKw6O-vVi7HbDhbEDDCT9uY,1175
|
|
127
128
|
csv_detective/detect_labels/temp/year/__init__.py,sha256=zPF_mvhzhXMAlHPAskS8mhuxjLj2AlKpV4ss8Q4tDms,1150
|
|
128
|
-
csv_detective-0.7.5.
|
|
129
|
-
csv_detective-0.7.5.
|
|
130
|
-
csv_detective-0.7.5.
|
|
129
|
+
csv_detective-0.7.5.dev1052.data/data/share/csv_detective/CHANGELOG.md,sha256=oDqKO3qTo-cUSJB4fMbsyQY2O4pEQhOwWeHsZwaGkxM,6725
|
|
130
|
+
csv_detective-0.7.5.dev1052.data/data/share/csv_detective/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
|
|
131
|
+
csv_detective-0.7.5.dev1052.data/data/share/csv_detective/README.md,sha256=Qr8xRXc-dxQ-tdXCpCTCKp1Uliqq84r0UOlPRNuGCpI,9506
|
|
131
132
|
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
133
|
+
tests/test_example.py,sha256=0NfChooJQlFxTo2nY5FOQIcsK4zzWA_SBmt2LwVQovY,2014
|
|
132
134
|
tests/test_fields.py,sha256=kXel-hiyQYrJ3OLmwUMg1K3DKbbwBLvUplxZWxpp18I,10605
|
|
133
135
|
tests/test_file.py,sha256=1fEOu3bArGBaarRKAoTXAF3cSIGJfFN3UIwOW6esWRs,6399
|
|
134
136
|
tests/test_labels.py,sha256=6MOKrGznkwU5fjZ_3oiB6Scmb480Eu-9geBJs0UDLds,159
|
|
135
137
|
tests/test_structure.py,sha256=SVsnluVoIIprYw_67I1_gB3cp9m1wlO8C7SpdsLW8cM,1161
|
|
136
|
-
csv_detective-0.7.5.
|
|
137
|
-
csv_detective-0.7.5.
|
|
138
|
-
csv_detective-0.7.5.
|
|
139
|
-
csv_detective-0.7.5.
|
|
140
|
-
csv_detective-0.7.5.
|
|
141
|
-
csv_detective-0.7.5.
|
|
138
|
+
csv_detective-0.7.5.dev1052.dist-info/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
|
|
139
|
+
csv_detective-0.7.5.dev1052.dist-info/METADATA,sha256=mgg54BtjqPn-L_G2a4JU0SyorK8uYzUyp64cxOAIe6A,1146
|
|
140
|
+
csv_detective-0.7.5.dev1052.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
|
|
141
|
+
csv_detective-0.7.5.dev1052.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
|
|
142
|
+
csv_detective-0.7.5.dev1052.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
|
|
143
|
+
csv_detective-0.7.5.dev1052.dist-info/RECORD,,
|
tests/test_example.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
|
|
2
|
+
import re
|
|
3
|
+
from uuid import UUID
|
|
4
|
+
from csv_detective import create_example_csv_file
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def test_example_creation():
    # Build an example file from explicit field descriptions and check
    # that every generated column respects its constraints.
    producer_types = ['privé', 'public', 'association']
    date_lo, date_hi = '1996-02-13', '2000-01-28'
    fields = [
        {"name": "id_unique", "type": "id"},
        {"name": "nom_modele", "type": "str", "args": {'length': 20}},
        {"name": "siret", "type": "str", "args": {'pattern': '^\\d{14}$'}},
        {"name": "type_producteur", "type": "str", "args": {'enum': producer_types}},
        {
            "name": "date_creation",
            "type": "date",
            "args": {
                'date_range': [date_lo, date_hi],
                'format': '%Y-%m-%d',
            },
        },
        {"name": "url_produit", "type": "url"},
        {"name": "nb_produits", "type": "int"},
        {"name": "note", "type": "float", "args": {'num_range': [1, 20]}},
    ]
    df = create_example_csv_file(
        fields=fields,
        file_length=5,
        output_name="",
    )
    assert len(df) == 5
    # every id parses as a UUID
    for value in df["id_unique"]:
        assert UUID(value)
    # fixed-length random strings
    for value in df["nom_modele"]:
        assert len(value) == 20
    # pattern-generated values match the same regex
    siret_re = re.compile("^\\d{14}$")
    for value in df["siret"]:
        assert siret_re.match(value)
    # enum values only
    for value in df["type_producteur"]:
        assert value in producer_types
    # ISO dates sort lexicographically, so string comparison checks the range
    for value in df["date_creation"]:
        assert date_lo <= value <= date_hi
    for value in df["url_produit"]:
        assert value.startswith("http")
    for value in df["nb_produits"]:
        assert isinstance(value, int)
    for value in df["note"]:
        assert 1 <= value <= 20
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def test_example_from_tableschema():
    # Generate an example file straight from a published TableSchema (IRVE)
    # and check that the default number of rows is produced.
    irve_schema = "https://schema.data.gouv.fr/schemas/etalab/schema-irve-statique/2.3.1/schema-statique.json"
    df = create_example_csv_file(
        schema_path=irve_schema,
        output_name="",
    )
    assert len(df) == 10
|
|
File without changes
|
|
File without changes
|
{csv_detective-0.7.5.dev1009.dist-info → csv_detective-0.7.5.dev1052.dist-info}/LICENSE.AGPL.txt
RENAMED
|
File without changes
|
|
File without changes
|
{csv_detective-0.7.5.dev1009.dist-info → csv_detective-0.7.5.dev1052.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{csv_detective-0.7.5.dev1009.dist-info → csv_detective-0.7.5.dev1052.dist-info}/top_level.txt
RENAMED
|
File without changes
|