csv-detective 0.7.5.dev1009__py3-none-any.whl → 0.7.5.dev1052__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
csv_detective/__init__.py CHANGED
@@ -1,3 +1,4 @@
1
1
  from .explore_csv import routine, routine_minio # noqa
2
+ from .create_example import create_example_csv_file # noqa
2
3
 
3
4
  __version__ = '0.7.5.dev'
@@ -0,0 +1,247 @@
1
+ import random
2
+ import uuid
3
+ import string
4
+ from datetime import datetime
5
+ import pandas as pd
6
+ from typing import List, Union, Optional, Any, Type
7
+ import json
8
+ import requests
9
+ import rstr
10
+ from faker import Faker
11
+
12
+ fake = Faker()
13
+
14
+
15
def create_example_csv_file(
    fields: Optional[List[dict]] = None,
    schema_path: Optional[str] = None,
    file_length: int = 10,
    output_name: str = 'example_file.csv',
    output_sep: str = ';',
    encoding: str = 'utf-8',
    ignore_required: bool = False,
) -> pd.DataFrame:
    '''
    Create an example CSV file based on a list of dicts like follows:
    fields = [
        {
            "name": "column_name",
            "type": "column_type",
            "args": {dict_of_args} # optional
        },
        ...
    ]
    Or from a TableSchema, given as a local path or http(s) URL in `schema_path`.

    Args:
        fields: column specifications as described above; ignored when
            `schema_path` is given.
        schema_path: local path or URL of a TableSchema JSON file.
        file_length: number of rows to generate.
        output_name: where to write the CSV; pass a falsy value to skip writing.
        output_sep: column separator used when writing the CSV.
        encoding: encoding used to read a local schema file.
        ignore_required: if True, optional columns are always filled.

    Returns:
        The generated content as a pandas DataFrame.

    Raises:
        ValueError: if neither `fields` nor `schema_path` is specified,
            if a *_range argument does not have exactly two elements,
            or if the schema has no "fields" key.
    '''
    # TODO: expose this as a CLI command

    if not (fields or schema_path):
        raise ValueError("At least fields or schema_path must be specified.")

    def potential_skip(required: bool) -> bool:
        # Whether to leave this cell empty (only possible for optional columns).
        if ignore_required:
            return False
        if not required:
            # for now 30% chance to have an optional value, this could go as an argument
            return random.randint(1, 100) <= 30
        # fix: required columns are never skipped (was an implicit None return)
        return False

    def _string(
        length: int = 10,
        required: bool = True,
        pattern: Optional[str] = None,
        enum: Optional[list] = None,
    ) -> str:
        # Random string from a regex pattern, an enum, or plain lowercase letters.
        if potential_skip(required):
            return ''
        if pattern is not None:
            return rstr.xeger(pattern)
        elif enum is not None:
            return random.choice(enum)
        else:
            letters = string.ascii_lowercase
            return ''.join(random.choice(letters) for _ in range(length))

    def _id(
        required: bool = True,
    ) -> str:
        # Random UUID4, rendered as a string.
        if potential_skip(required):
            return ''
        return str(uuid.uuid4())

    def _date(
        date_range: Union[None, List[str]] = None,
        format: str = '%Y-%m-%d',
        required: bool = True,
    ) -> str:
        # the bounds specified in date_range are expected in the same format as the desired output format
        assert all([k in format for k in ['%d', '%m', '%Y']])
        if potential_skip(required):
            return ''
        if date_range is None:
            return fake.date(format)
        else:
            if len(date_range) != 2:
                raise ValueError('"date_range" must have exactly two elements.')
            return fake.date_between_dates(
                datetime.strptime(date_range[0], format),
                datetime.strptime(date_range[1], format),
            ).strftime(format)

    def _time(
        format: str = '%H:%M:%S',
        required: bool = True,
    ) -> str:
        assert all([k in format for k in ['%H', '%M', '%S']])
        if potential_skip(required):
            return ''
        # maybe add a time_range argument?
        return fake.time(format)

    def _datetime(
        datetime_range: Optional[List[str]] = None,
        format: str = '%Y-%m-%d %H-%M-%S',
        required: bool = True,
    ) -> str:
        # the bounds specified in datetime_range are expected in the same format as the desired output format
        assert all([k in format for k in ['%d', '%m', '%Y', '%H', '%M', '%S']])
        if potential_skip(required):
            return ''
        if datetime_range is None:
            return fake.date_time().strftime(format)
        else:
            if len(datetime_range) != 2:
                # fix: the message previously referred to "date_range"
                raise ValueError('"datetime_range" must have exactly two elements.')
            return fake.date_time_between(
                datetime.strptime(datetime_range[0], format),
                datetime.strptime(datetime_range[1], format),
            ).strftime(format)

    def _url(required: bool = True) -> str:
        # Random plausible-looking URL.
        if potential_skip(required):
            return ''
        return f'http://{rstr.domainsafe()}.{rstr.letters(3)}/{rstr.urlsafe()}'

    def _number(
        num_type: Type[Union[int, float]] = int,
        num_range: Optional[List[float]] = None,
        enum: Optional[list] = None,
        required: bool = True,
    ) -> Union[int, float, str]:
        # Random number within num_range (defaults to [0, 1000]) or picked
        # from enum; returns '' when an optional cell is skipped.
        assert num_range is None or len(num_range) == 2
        if potential_skip(required):
            return ''
        if enum:
            return random.choice(enum)
        if num_range is None:
            num_range = [0, 1000]
        if num_type == int:
            return random.randint(num_range[0], num_range[1])
        else:
            return round(random.uniform(num_range[0], num_range[1]), 1)

    def _bool(required: bool = True) -> Union[bool, str]:
        if potential_skip(required):
            return ''
        return random.randint(0, 1) == 0

    def _array(enum: List[Any], required: bool = True) -> str:
        # Random non-empty subset of enum, rendered like "[a,b]".
        if potential_skip(required):
            return ''
        return f"[{','.join(random.sample(enum, random.randint(1, len(enum))))}]"

    def build_args_from_constraints(constraints: dict) -> dict:
        # Map TableSchema constraints onto the generators' keyword arguments.
        args = {}
        args['required'] = constraints.get('required', False)
        for key in ['pattern', 'enum', 'format']:
            if key in constraints:
                args[key] = constraints[key]
        if 'minimum' in constraints and 'maximum' in constraints:
            args['num_range'] = [constraints['minimum'], constraints['maximum']]
        # maybe there are better values than these?
        elif 'minimum' in constraints:
            args['num_range'] = [constraints['minimum'], 10 + constraints['minimum']]
        elif 'maximum' in constraints:
            args['num_range'] = [constraints['maximum'] - 10, constraints['maximum']]
        if 'minLength' in constraints:
            args['length'] = constraints['minLength']
        if 'maxLength' in constraints:
            args['length'] = constraints['maxLength']
        return args

    schema_types_to_python = {
        'number': 'float',
        'integer': 'int',
        'string': 'str',
        'year': 'year',
        'boolean': 'bool',
        'date': 'date',
        'yearmonth': 'date',
        'time': 'time',
        'datetime': 'datetime',
        'array': 'array'
    }

    if schema_path:
        if schema_path.startswith('http'):
            schema = requests.get(schema_path).json()
        else:
            with open(schema_path, encoding=encoding) as jsonfile:
                schema = json.load(jsonfile)
        if not ('fields' in schema.keys()):
            raise ValueError('The schema must have a "fields" key.')
        else:
            fields = [
                {
                    'name': f['name'],
                    'type': schema_types_to_python.get(f['type'], 'str'),
                    # when frformat is supported in TableSchema, we can build args for French standards
                    # linked to https://github.com/datagouv/fr-format/issues/26
                    'args': (
                        build_args_from_constraints(f['constraints']) if 'constraints' in f.keys()
                        else build_args_from_constraints(f['arrayItem']['constraints'])
                        if 'arrayItem' in f.keys() and 'constraints' in f['arrayItem'].keys()
                        else {}
                    )
                } for f in schema['fields']
            ]

    # fix: work on shallow copies so the caller's `fields` dicts are not mutated
    fields = [{**f, 'args': dict(f.get('args', {}))} for f in fields]
    for f in fields:
        if f['type'] == 'float':
            f['args']['num_type'] = float
        elif f['type'] == 'int':
            f['args']['num_type'] = int
        elif f['type'] == 'year':
            f['args']['num_type'] = int
            f['args']['num_range'] = [1990, 2050]

    types_to_func = {
        'int': _number,
        'float': _number,
        'date': _date,
        'time': _time,
        'str': _string,
        'url': _url,
        'id': _id,
        'year': _number,
        'bool': _bool,
        'datetime': _datetime,
        'array': _array,
    }

    # would it be better to create by column or by row (as for now)?
    output = pd.DataFrame(
        [
            [
                # fix: fall back to the _string generator (was the non-callable 'str')
                types_to_func.get(f['type'], _string)(**f['args'])
                for f in fields
            ] for _ in range(file_length)
        ],
        columns=[f["name"] for f in fields],
    )

    if output_name:
        output.to_csv(output_name, sep=output_sep, index=False)

    return output
@@ -2,7 +2,7 @@
2
2
 
3
3
  ## Current (in progress)
4
4
 
5
- - Nothing yet
5
+ - New function that creates a CSV file from a list of fields and constraints, or from a TableSchema [#100](https://github.com/datagouv/csv-detective/pull/100)
6
6
 
7
7
  ## 0.7.4 (2024-11-15)
8
8
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: csv_detective
3
- Version: 0.7.5.dev1009
3
+ Version: 0.7.5.dev1052
4
4
  Summary: Detect CSV column content
5
5
  Home-page: https://github.com/etalab/csv_detective
6
6
  Author: Etalab
@@ -27,4 +27,6 @@ Requires-Dist: requests==2.32.3
27
27
  Requires-Dist: responses==0.25.0
28
28
  Requires-Dist: python-magic==0.4.27
29
29
  Requires-Dist: frformat==0.4.0
30
+ Requires-Dist: faker==33.0.0
31
+ Requires-Dist: rstr==3.2.2
30
32
 
@@ -1,5 +1,6 @@
1
- csv_detective/__init__.py,sha256=giVhs0g13y4U2H0WiVBLcrvytcMxQ1LiCd2i03XITwQ,83
1
+ csv_detective/__init__.py,sha256=Au4bNJ_Gi6P6o0uO4R56nYdshG7M6-7Rg_xX4whLmLI,143
2
2
  csv_detective/cli.py,sha256=Ua7SE1wMH2uFUsTmfumh4nJk7O06okpMd2gvjUDO1II,1048
3
+ csv_detective/create_example.py,sha256=358e7Q7RWMrY_eEo3pUteJWmg2smFb5edJ_AzcQPrqA,8646
3
4
  csv_detective/detection.py,sha256=AuXlPOZfzqznZY2ybAAgaXIq6qVITYd3MXf2CoigI3I,22097
4
5
  csv_detective/explore_csv.py,sha256=X5yZS3WCUsafUMcs5tOnDTeMGzMnfr0iB9vEDx7xiqg,16977
5
6
  csv_detective/process_text.py,sha256=rsfk66BCmdpsCOd0kDJ8tmqMsEWd-OeBkEisWc4Ej9k,1246
@@ -125,17 +126,18 @@ csv_detective/detect_labels/temp/date/__init__.py,sha256=GrIbo64WVM3hi7ShBRKKyKU
125
126
  csv_detective/detect_labels/temp/datetime_iso/__init__.py,sha256=Ih9l56nBcdmGLyWDavVUWuUUuVZBz9QUDE1hHzADvVg,1157
126
127
  csv_detective/detect_labels/temp/datetime_rfc822/__init__.py,sha256=DQ_h4uDW1e6qu2rATEhgGKw6O-vVi7HbDhbEDDCT9uY,1175
127
128
  csv_detective/detect_labels/temp/year/__init__.py,sha256=zPF_mvhzhXMAlHPAskS8mhuxjLj2AlKpV4ss8Q4tDms,1150
128
- csv_detective-0.7.5.dev1009.data/data/share/csv_detective/CHANGELOG.md,sha256=4ABp5UF2L6tPg-eK7Dj6NWgnFnkU74BwhrMzrRGJ2Lw,6585
129
- csv_detective-0.7.5.dev1009.data/data/share/csv_detective/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
130
- csv_detective-0.7.5.dev1009.data/data/share/csv_detective/README.md,sha256=Qr8xRXc-dxQ-tdXCpCTCKp1Uliqq84r0UOlPRNuGCpI,9506
129
+ csv_detective-0.7.5.dev1052.data/data/share/csv_detective/CHANGELOG.md,sha256=oDqKO3qTo-cUSJB4fMbsyQY2O4pEQhOwWeHsZwaGkxM,6725
130
+ csv_detective-0.7.5.dev1052.data/data/share/csv_detective/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
131
+ csv_detective-0.7.5.dev1052.data/data/share/csv_detective/README.md,sha256=Qr8xRXc-dxQ-tdXCpCTCKp1Uliqq84r0UOlPRNuGCpI,9506
131
132
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
133
+ tests/test_example.py,sha256=0NfChooJQlFxTo2nY5FOQIcsK4zzWA_SBmt2LwVQovY,2014
132
134
  tests/test_fields.py,sha256=kXel-hiyQYrJ3OLmwUMg1K3DKbbwBLvUplxZWxpp18I,10605
133
135
  tests/test_file.py,sha256=1fEOu3bArGBaarRKAoTXAF3cSIGJfFN3UIwOW6esWRs,6399
134
136
  tests/test_labels.py,sha256=6MOKrGznkwU5fjZ_3oiB6Scmb480Eu-9geBJs0UDLds,159
135
137
  tests/test_structure.py,sha256=SVsnluVoIIprYw_67I1_gB3cp9m1wlO8C7SpdsLW8cM,1161
136
- csv_detective-0.7.5.dev1009.dist-info/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
137
- csv_detective-0.7.5.dev1009.dist-info/METADATA,sha256=dd1MAjbcCr_c82zoDPcO5zbfB8KLLDd8WdDCMWtchzA,1090
138
- csv_detective-0.7.5.dev1009.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
139
- csv_detective-0.7.5.dev1009.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
140
- csv_detective-0.7.5.dev1009.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
141
- csv_detective-0.7.5.dev1009.dist-info/RECORD,,
138
+ csv_detective-0.7.5.dev1052.dist-info/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
139
+ csv_detective-0.7.5.dev1052.dist-info/METADATA,sha256=mgg54BtjqPn-L_G2a4JU0SyorK8uYzUyp64cxOAIe6A,1146
140
+ csv_detective-0.7.5.dev1052.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
141
+ csv_detective-0.7.5.dev1052.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
142
+ csv_detective-0.7.5.dev1052.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
143
+ csv_detective-0.7.5.dev1052.dist-info/RECORD,,
tests/test_example.py ADDED
@@ -0,0 +1,71 @@
1
+
2
+ import re
3
+ from uuid import UUID
4
+ from csv_detective import create_example_csv_file
5
+
6
+
7
def test_example_creation():
    """Each generated column must honor its length, pattern, enum, range or type."""
    fields = [
        {"name": "id_unique", "type": "id"},
        {"name": "nom_modele", "type": "str", "args": {'length': 20}},
        {"name": "siret", "type": "str", "args": {'pattern': '^\\d{14}$'}},
        {"name": "type_producteur", "type": "str", "args": {'enum': ['privé', 'public', 'association']}},
        {
            "name": "date_creation",
            "type": "date",
            "args": {'date_range': ['1996-02-13', '2000-01-28'], 'format': '%Y-%m-%d'},
        },
        {"name": "url_produit", "type": "url"},
        {"name": "nb_produits", "type": "int"},
        {"name": "note", "type": "float", "args": {'num_range': [1, 20]}},
    ]
    df = create_example_csv_file(fields=fields, file_length=5, output_name="")
    assert len(df) == 5
    assert all(UUID(value) for value in df["id_unique"])
    assert all(len(value) == 20 for value in df["nom_modele"])
    assert all(re.match("^\\d{14}$", value) for value in df["siret"])
    assert all(value in ['privé', 'public', 'association'] for value in df["type_producteur"])
    assert all('1996-02-13' <= value <= '2000-01-28' for value in df["date_creation"])
    assert all(value.startswith("http") for value in df["url_produit"])
    assert all(isinstance(value, int) for value in df["nb_produits"])
    assert all(1 <= value <= 20 for value in df["note"])
64
+
65
+
66
def test_example_from_tableschema():
    """Building from a remote TableSchema yields the default number of rows."""
    schema_url = (
        "https://schema.data.gouv.fr/schemas/etalab/schema-irve-statique/2.3.1/schema-statique.json"
    )
    df = create_example_csv_file(schema_path=schema_url, output_name="")
    assert len(df) == 10