csv-detective 0.9.3.dev1977__py3-none-any.whl → 0.9.3.dev2039__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
csv_detective/__init__.py CHANGED
@@ -1,7 +1,6 @@
1
- from csv_detective.explore_csv import routine, routine_minio, validate_then_detect
1
+ from csv_detective.explore_csv import routine, validate_then_detect
2
2
 
3
3
  __all__ = [
4
4
  "routine",
5
- "routine_minio",
6
5
  "validate_then_detect",
7
6
  ]
@@ -3,7 +3,7 @@ from frformat import LatitudeL93
3
3
  from csv_detective.detect_fields.other.float import _is as is_float
4
4
  from csv_detective.detect_fields.other.float import float_casting
5
5
 
6
- PROPORTION = 0.9
6
+ PROPORTION = 1
7
7
 
8
8
  _latitudel93 = LatitudeL93()
9
9
 
@@ -1,6 +1,6 @@
1
1
  from csv_detective.detect_fields.other.float import _is as is_float
2
2
 
3
- PROPORTION = 0.9
3
+ PROPORTION = 1
4
4
 
5
5
 
6
6
  def _is(val):
@@ -3,7 +3,7 @@ from frformat import LongitudeL93
3
3
  from csv_detective.detect_fields.other.float import _is as is_float
4
4
  from csv_detective.detect_fields.other.float import float_casting
5
5
 
6
- PROPORTION = 0.9
6
+ PROPORTION = 1
7
7
 
8
8
  _longitudel93 = LongitudeL93()
9
9
 
@@ -1,6 +1,6 @@
1
1
  from csv_detective.detect_fields.other.float import _is as is_float
2
2
 
3
- PROPORTION = 0.9
3
+ PROPORTION = 1
4
4
 
5
5
 
6
6
  def _is(val):
@@ -1,6 +1,6 @@
1
1
  from csv_detective.detect_fields.other.float import _is as is_float
2
2
 
3
- PROPORTION = 0.9
3
+ PROPORTION = 1
4
4
 
5
5
 
6
6
  def _is(val):
@@ -1,6 +1,6 @@
1
1
  from csv_detective.detect_fields.other.float import _is as is_float
2
2
 
3
- PROPORTION = 0.9
3
+ PROPORTION = 1
4
4
 
5
5
 
6
6
  def _is(val):
@@ -1,16 +1,12 @@
1
- import json
2
1
  import logging
3
- import os
4
- import tempfile
5
2
  from time import time
6
3
  from typing import Optional, Union
7
4
 
8
5
  import pandas as pd
9
6
 
10
7
  from csv_detective.detection.formats import detect_formats
11
- from csv_detective.output import generate_output, generate_table_schema
8
+ from csv_detective.output import generate_output
12
9
  from csv_detective.parsing.load import load_file
13
- from csv_detective.s3_utils import download_from_minio, upload_to_minio
14
10
  from csv_detective.utils import display_logs_depending_process_time, is_url
15
11
  from csv_detective.validate import validate
16
12
 
@@ -33,11 +29,11 @@ def routine(
33
29
  verbose: bool = False,
34
30
  sheet_name: Optional[Union[str, int]] = None,
35
31
  ) -> Union[dict, tuple[dict, pd.DataFrame]]:
36
- """Returns a dict with information about the csv table and possible
32
+ """Returns a dict with information about the table and possible
37
33
  column contents, and if requested the DataFrame with columns cast according to analysis.
38
34
 
39
35
  Args:
40
- file_path: local path to CSV file if not using Minio
36
+ file_path: local path or URL to file
41
37
  num_rows: number of rows to sample from the file for analysis ; -1 for analysis
42
38
  of the whole file
43
39
  user_input_tests: tests to run on the file
@@ -173,106 +169,3 @@ def validate_then_detect(
173
169
  display_logs_depending_process_time(
174
170
  f"Process completed in {round(time() - start_routine, 3)}s", time() - start_routine
175
171
  )
176
-
177
-
178
- def routine_minio(
179
- csv_minio_location: dict[str, str],
180
- output_minio_location: dict[str, str],
181
- tableschema_minio_location: dict[str, str],
182
- minio_user: str,
183
- minio_pwd: str,
184
- **kwargs,
185
- ):
186
- """Returns a dict with information about the csv table and possible
187
- column contents.
188
-
189
- Args:
190
- csv_minio_location: dict with Minio URL, bucket and key of the CSV file
191
- output_minio_location: Minio URL, bucket and key to store output file. None if
192
- not uploading to Minio.
193
- tableschema_minio_location: Minio URL, bucket and key to store tableschema file.
194
- None if not uploading the tableschema to Minio.
195
- minio_user: user name for the minio instance
196
- minio_pwd: password for the minio instance
197
- kwargs: arguments for routine
198
-
199
- Returns:
200
- dict: a dict with information about the csv and possible types for each column
201
- """
202
-
203
- if (
204
- (
205
- any(
206
- [
207
- location_dict is not None
208
- for location_dict in [
209
- csv_minio_location,
210
- output_minio_location,
211
- tableschema_minio_location,
212
- ]
213
- ]
214
- )
215
- )
216
- and (minio_user is None)
217
- or (minio_pwd is None)
218
- ):
219
- raise ValueError("Minio credentials are required if using Minio")
220
-
221
- for location_dict in [
222
- csv_minio_location,
223
- output_minio_location,
224
- tableschema_minio_location,
225
- ]:
226
- if location_dict is not None:
227
- if any(
228
- [
229
- (location_key not in location_dict) or (location_dict[location_key] is None)
230
- for location_key in ["netloc", "bucket", "key"]
231
- ]
232
- ):
233
- raise ValueError("Minio location dict must contain url, bucket and key")
234
-
235
- file_path = tempfile.NamedTemporaryFile(delete=False).name
236
- download_from_minio(
237
- netloc=csv_minio_location["netloc"],
238
- bucket=csv_minio_location["bucket"],
239
- key=csv_minio_location["key"],
240
- filepath=file_path,
241
- minio_user=minio_user,
242
- minio_pwd=minio_pwd,
243
- )
244
-
245
- analysis = routine(
246
- file_path,
247
- save_results=True,
248
- **kwargs,
249
- )
250
-
251
- # Write report JSON file.
252
- output_path_to_store_minio_file = os.path.splitext(file_path)[0] + ".json"
253
- with open(output_path_to_store_minio_file, "w", encoding="utf8") as fp:
254
- json.dump(analysis, fp, indent=4, separators=(",", ": "))
255
-
256
- upload_to_minio(
257
- netloc=output_minio_location["netloc"],
258
- bucket=output_minio_location["bucket"],
259
- key=output_minio_location["key"],
260
- filepath=output_path_to_store_minio_file,
261
- minio_user=minio_user,
262
- minio_pwd=minio_pwd,
263
- )
264
-
265
- os.remove(output_path_to_store_minio_file)
266
- os.remove(file_path)
267
-
268
- generate_table_schema(
269
- analysis_report=analysis,
270
- save_file=True,
271
- netloc=tableschema_minio_location["netloc"],
272
- bucket=tableschema_minio_location["bucket"],
273
- key=tableschema_minio_location["key"],
274
- minio_user=minio_user,
275
- minio_pwd=minio_pwd,
276
- )
277
-
278
- return analysis
@@ -51,7 +51,7 @@ def generate_output(
51
51
  )
52
52
 
53
53
  if output_schema:
54
- analysis["schema"] = generate_table_schema(analysis, save_file=False, verbose=verbose)
54
+ analysis["schema"] = generate_table_schema(analysis, save_results=False, verbose=verbose)
55
55
 
56
56
  if output_df:
57
57
  return analysis, cast_df(
@@ -1,14 +1,9 @@
1
1
  import json
2
2
  import logging
3
- import os
4
- import tempfile
5
3
  from datetime import datetime
6
4
  from time import time
7
- from typing import Optional
5
+ from typing import Union
8
6
 
9
- from botocore.exceptions import ClientError
10
-
11
- from csv_detective.s3_utils import download_from_minio, get_s3_client, upload_to_minio
12
7
  from csv_detective.utils import display_logs_depending_process_time
13
8
 
14
9
 
@@ -202,25 +197,14 @@ def get_constraints(format: str) -> dict:
202
197
 
203
198
  def generate_table_schema(
204
199
  analysis_report: dict,
205
- save_file: bool,
206
- netloc: Optional[str] = None,
207
- bucket: Optional[str] = None,
208
- key: Optional[str] = None,
209
- minio_user: Optional[str] = None,
210
- minio_pwd: Optional[str] = None,
200
+ save_results: Union[bool, str] = True,
211
201
  verbose: bool = False,
212
202
  ) -> dict:
213
203
  """Generates a table schema from the analysis report
214
204
 
215
205
  Args:
216
206
  analysis_report (dict): The analysis report from csv_detective
217
- save_file (bool): indicate if schema should be saved into minio or just returned
218
- netloc (str): The netloc of the minio instance to upload the tableschema
219
- bucket (str): The bucket to save the schema in
220
- key (str): The key to save the schema in (without extension as we will append
221
- version number and extension)
222
- minio_user (str): The minio user
223
- minio_pwd (str): The minio password
207
+ save_results (bool or str): whether and where to save the results
224
208
 
225
209
  Returns:
226
210
  """
@@ -277,71 +261,9 @@ def generate_table_schema(
277
261
  f"Created schema in {round(time() - start, 3)}s", time() - start
278
262
  )
279
263
 
280
- if not save_file:
281
- return schema
282
-
283
- if save_file:
284
- if not all([netloc, key, bucket, minio_user, minio_pwd]):
285
- raise Exception(
286
- "To save schema into minio, parameters : netloc, key, bucket, "
287
- "minio_user, minio_pwd should be provided"
288
- )
289
-
290
- # Create bucket if does not exist
291
- client = get_s3_client(netloc, minio_user, minio_pwd)
292
- try:
293
- client.head_bucket(Bucket=bucket)
294
- except ClientError:
295
- client.create_bucket(Bucket=bucket)
296
-
297
- tableschema_objects = client.list_objects(Bucket=bucket, Prefix=key, Delimiter="/")
298
- if "Contents" in tableschema_objects:
299
- tableschema_keys = [
300
- tableschema["Key"]
301
- for tableschema in client.list_objects(Bucket=bucket, Prefix=key, Delimiter="/")[
302
- "Contents"
303
- ]
304
- ]
305
- tableschema_versions = [
306
- os.path.splitext(tableschema_key)[0].split("_")[-1]
307
- for tableschema_key in tableschema_keys
308
- ]
309
- latest_version = max(tableschema_versions)
264
+ if save_results:
265
+ output_path = save_results if isinstance(save_results, str) else "schema.json"
266
+ with open(output_path, "w", encoding="utf8") as fp:
267
+ json.dump(schema, fp, indent=4, separators=(",", ": "), ensure_ascii=False, default=str)
310
268
 
311
- with tempfile.NamedTemporaryFile() as latest_schema_file:
312
- with open(latest_schema_file.name, "w") as fp:
313
- download_from_minio(
314
- netloc,
315
- bucket,
316
- f"{key}_{latest_version}.json",
317
- latest_schema_file.name,
318
- minio_user,
319
- minio_pwd,
320
- )
321
- # Check if files are different
322
- with open(latest_schema_file.name, "r") as fp:
323
- latest_schema = json.load(fp)
324
- if latest_schema["fields"] != fields:
325
- latest_version_split = latest_version.split(".")
326
- new_version = (
327
- latest_version_split[0]
328
- + "."
329
- + latest_version_split[1]
330
- + "."
331
- + str(int(latest_version_split[2]) + 1)
332
- )
333
- else:
334
- return None
335
-
336
- schema["version"] = new_version
337
-
338
- tableschema_file = tempfile.NamedTemporaryFile(delete=False)
339
- with open(tableschema_file.name, "w") as fp:
340
- json.dump(schema, fp, indent=4)
341
-
342
- new_version_key = f"{key}_{new_version}.json"
343
- upload_to_minio(
344
- netloc, bucket, new_version_key, tableschema_file.name, minio_user, minio_pwd
345
- )
346
- os.unlink(tableschema_file.name)
347
- return {"netloc": netloc, "bucket": bucket, "key": new_version_key}
269
+ return schema
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: csv-detective
3
- Version: 0.9.3.dev1977
3
+ Version: 0.9.3.dev2039
4
4
  Summary: Detect tabular files column content
5
5
  Author-email: Etalab <opendatateam@data.gouv.fr>
6
6
  License: MIT
@@ -9,7 +9,6 @@ Keywords: CSV,data processing,encoding,guess,parser,tabular
9
9
  Requires-Python: <3.14,>=3.9
10
10
  Description-Content-Type: text/markdown
11
11
  License-File: LICENSE
12
- Requires-Dist: boto3<2,>=1.34.0
13
12
  Requires-Dist: dateparser<2,>=1.2.0
14
13
  Requires-Dist: faust-cchardet==2.1.19
15
14
  Requires-Dist: pandas<3,>=2.2.0
@@ -1,8 +1,7 @@
1
- csv_detective/__init__.py,sha256=FsL6q5F-gKLMnWy05-1CJpa4cz9tquheZ2LS1tjkVgI,162
1
+ csv_detective/__init__.py,sha256=qvjDQBcw1ZIpapIrdGg1IUjBJ1q5KPhQda_05fevleg,126
2
2
  csv_detective/cli.py,sha256=mu5anmBmaDk52_uZGiA4T37wYZCuV43gZAepjs1Cqzc,1389
3
- csv_detective/explore_csv.py,sha256=sEMza4Z27ac88fGq7tUiK1zlfvuftztHhHVoa0c2EVU,9191
3
+ csv_detective/explore_csv.py,sha256=1q9ZGGLZWwDwHRancdWwSypk0b_mQwpc2LNvcXMeiKQ,5806
4
4
  csv_detective/load_tests.py,sha256=uVKweLq3cf-yB5ZZI-m9tBVs_SWNcOw8sDJa97TOJGo,2266
5
- csv_detective/s3_utils.py,sha256=z1KTVVkdurMv21o-rZu7_aluMJnSi-d5uxnQbqT2NoI,1407
6
5
  csv_detective/utils.py,sha256=xiIO7ZDqkTm9Rnhnq6RaDdnrPIfoG0JV9AsmaOG6plA,1162
7
6
  csv_detective/validate.py,sha256=RLHXLrRuynkdcvHUlSEbyglPvdbNYlT1Z4nQI-BdYdA,2898
8
7
  csv_detective/detect_fields/__init__.py,sha256=ZZ7u9zsMtCqPC2xxeLp57UTCbqpKFJi6D_LO1ew15BU,1980
@@ -17,10 +16,10 @@ csv_detective/detect_fields/FR/geo/code_region/__init__.py,sha256=9pR2tVS4J2Kryt
17
16
  csv_detective/detect_fields/FR/geo/commune/__init__.py,sha256=5vw4zjlmWaR2djxuQOUrmwsNIc9HgAE-zdxwerVR3S0,380
18
17
  csv_detective/detect_fields/FR/geo/departement/__init__.py,sha256=UsMEW1EVVgnw-daOc1jBkEaGKvqTONSAGnj1s3QgM8w,400
19
18
  csv_detective/detect_fields/FR/geo/insee_canton/__init__.py,sha256=YsAGiblFexBxvu_E3XaXhy_bordc6c1oKPgDzTsDeXw,374
20
- csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py,sha256=RjkDSZzIbp4nnvDpa5GomDpyIJGvwErX7TgC4dlBJ14,437
21
- csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py,sha256=7xmYpTYoHvFfcuocAhm6dP_j4sMII_hG1PMSrWId4FY,344
22
- csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py,sha256=JbKuGK5UmUGAQKPFpN4RSLf3axJ5D1aCjzRXYHW-iXU,441
23
- csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py,sha256=5VWDaHZvGhJAJu5XQrj6gLx5CVA9dNOE30eTXQ3pSf0,344
19
+ csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py,sha256=7ONo0MxrJY1gPWRwyPCX4ZDbCINmxnKRV85zscADxT8,435
20
+ csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py,sha256=lIgWebNapfrnPt0XeNUMs78Xa_csGNAtTk8VEk9wXXo,342
21
+ csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py,sha256=YXTWSymmcXW9eD2OfiSlmX7N-IUtZkDrNYHd6vTnJTc,439
22
+ csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py,sha256=7tZ8sgIkQ9zuSOZ-vGYBkH04Vv1xlPlJDM78xYfD57Y,342
24
23
  csv_detective/detect_fields/FR/geo/pays/__init__.py,sha256=85y-5qNRAWJrKqL0wh9iPMUBQjvPwc9lv1cYB2m0daQ,364
25
24
  csv_detective/detect_fields/FR/geo/region/__init__.py,sha256=6mJRaGsCPBY5JHHe8EWxEjDpAOIfvBPTaZKJb3_n3gU,1077
26
25
  csv_detective/detect_fields/FR/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -50,9 +49,9 @@ csv_detective/detect_fields/geo/iso_country_code_alpha3/iso_country_code_alpha3.
50
49
  csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py,sha256=AnAridM4C8hcm4PeNdr8969czgrzM4KemGVZWAJSM1U,436
51
50
  csv_detective/detect_fields/geo/iso_country_code_numeric/iso_country_code_numeric.txt,sha256=2GtEhuporsHYV-pU4q9kfXU5iOtfW5C0GYBTTKQtnnA,1004
52
51
  csv_detective/detect_fields/geo/json_geojson/__init__.py,sha256=6wlwlxQmsVIZ21g-THvH3nBj-I8FuoF2sBlZAoEMGiQ,393
53
- csv_detective/detect_fields/geo/latitude_wgs/__init__.py,sha256=sdor-L1WDHv5opg1Le13mru4ImSA-yEbxchlWENuUFE,327
52
+ csv_detective/detect_fields/geo/latitude_wgs/__init__.py,sha256=PI-wlTJmPk6nznzu_Fou_SSCET90wIf78mXwb1W1K70,325
54
53
  csv_detective/detect_fields/geo/latlon_wgs/__init__.py,sha256=IXDTqD4YFUJYI1FYZ5ZfkqXY6KvNY7sgBVFRAvgTHtI,454
55
- csv_detective/detect_fields/geo/longitude_wgs/__init__.py,sha256=gPnNTe-L9xjBVE-30VCJiK6IVZttj6Cy6zu1IL5907Y,330
54
+ csv_detective/detect_fields/geo/longitude_wgs/__init__.py,sha256=B7YFfvEI48DfAn8xbc-vpVERQaKh9_59ERfieo2D6OY,328
56
55
  csv_detective/detect_fields/geo/lonlat_wgs/__init__.py,sha256=CnBMYevfGdhBvureF3oc_zqT-RZjG419iAuUlugQFLc,454
57
56
  csv_detective/detect_fields/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
58
57
  csv_detective/detect_fields/other/booleen/__init__.py,sha256=37ZUJACrZA9FQBYLDeVJGze7_I9x-ZWv5yWuBcqHcwI,497
@@ -137,11 +136,11 @@ csv_detective/detection/headers.py,sha256=y5iR4jWH5fUtAH_Zg0zxWSVG_INCHlXJFMbhPp
137
136
  csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
138
137
  csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
139
138
  csv_detective/detection/variables.py,sha256=wfsA_MOk14TPMOY7gkvpTGpo9-USzMnFaAou3MPHqxc,3536
140
- csv_detective/output/__init__.py,sha256=bMsLp-XCVf4sNymIof_kdMdqFIY7GocOas-lPNekfQg,1930
139
+ csv_detective/output/__init__.py,sha256=Vo7hK5fq6hfK5019K4fEnv-LcfeRuNPQubQFkZAMszs,1933
141
140
  csv_detective/output/dataframe.py,sha256=pjxvpzIWVUW9_xvT3JjoPnOIVUUHnzL7kZo1xQdMDxQ,2139
142
141
  csv_detective/output/example.py,sha256=XrnPS_uC0cICn7tgnLWNctpUbnPzl7fIMzNTzJEWGJc,8655
143
142
  csv_detective/output/profile.py,sha256=thckCcfy9cES5yYNW6TDGV82gP1OFWJuLhInT1g7JpI,2814
144
- csv_detective/output/schema.py,sha256=5Duw5qnsJ-LaVC6JgF7p1zZAkehDzsbXA4iTSJUgLNM,13760
143
+ csv_detective/output/schema.py,sha256=YUt9c33mzP2fHoj-NwW7kBcANyrkU3lIBWvXRbugtyU,10485
145
144
  csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
146
145
  csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
147
146
  csv_detective/parsing/columns.py,sha256=HRHJBJ1gftuheegJHzhQmg-u83pVAXXuQ9GKR34mKgk,5696
@@ -150,19 +149,18 @@ csv_detective/parsing/csv.py,sha256=fJkjKvyk7InkNnYKtmivyi48mmcwvrha7gvZ5J4-86A,
150
149
  csv_detective/parsing/excel.py,sha256=sKD5PRN1TlzPPOKFnZ3VRb0r1yIjPLlpxVWmZQeLYFk,7027
151
150
  csv_detective/parsing/load.py,sha256=C3M8nvgWenOb8aDFi5dpDGCoAw9EBqr4EB63zbz2M14,3699
152
151
  csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
153
- csv_detective-0.9.3.dev1977.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
152
+ csv_detective-0.9.3.dev2039.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
154
153
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
155
154
  tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
156
- tests/test_fields.py,sha256=-6wwuqNmGUIxpNn4u9_OmgqgS95uKWBtahDGy3iw3NI,12566
155
+ tests/test_fields.py,sha256=m5wGyHIl7e8PFmBe5OuWST9J0YVvdqniUEuCvWEj1qU,13558
157
156
  tests/test_file.py,sha256=QEBv69P0bAKWBzhQ3KKOR1Z1RQSf5CVEilqBojwP2Yc,10791
158
157
  tests/test_labels.py,sha256=Y0XlOpztCyV65pk7iAS_nMMfdysoBujlBmz10vHul9A,469
159
158
  tests/test_structure.py,sha256=GRDYKy0UcdqlN4qglzsRC0puFj5cb-SVvONjvcPvtAA,1400
160
159
  tests/test_validation.py,sha256=ie-Xf0vk6-M6GQq-x7kY5yse1EmXfxQkbaV7fR3fvYo,3308
161
160
  venv/bin/activate_this.py,sha256=NRy3waFmwW1pOaNUp33wNN0vD1Kzkd-zXX-Sgl4EiVI,1286
162
- venv/bin/jp.py,sha256=7z7dvRg0M7HzpZG4ssQID7nScjvQx7bcYTxJWDOrS6E,1717
163
161
  venv/bin/runxlrd.py,sha256=YlZMuycM_V_hzNt2yt3FyXPuwouMCmMhvj1oZaBeeuw,16092
164
- csv_detective-0.9.3.dev1977.dist-info/METADATA,sha256=ltt9Ve8vQcPHPIaBd-BTAaGuJ_a2KtjHTvr-6d2eIMk,9767
165
- csv_detective-0.9.3.dev1977.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
166
- csv_detective-0.9.3.dev1977.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
167
- csv_detective-0.9.3.dev1977.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
168
- csv_detective-0.9.3.dev1977.dist-info/RECORD,,
162
+ csv_detective-0.9.3.dev2039.dist-info/METADATA,sha256=tIrO6IwjU0482nXQqYhvwmdPhr4vtuOht6U4CLNCZeQ,9735
163
+ csv_detective-0.9.3.dev2039.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
164
+ csv_detective-0.9.3.dev2039.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
165
+ csv_detective-0.9.3.dev2039.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
166
+ csv_detective-0.9.3.dev2039.dist-info/RECORD,,
tests/test_fields.py CHANGED
@@ -79,6 +79,7 @@ from csv_detective.detection.variables import (
79
79
  from csv_detective.load_tests import return_all_tests
80
80
  from csv_detective.output.dataframe import cast
81
81
  from csv_detective.output.utils import prepare_output_dict
82
+ from csv_detective.parsing.columns import test_col as col_test # to prevent pytest from testing it
82
83
 
83
84
 
84
85
  def test_all_tests_return_bool():
@@ -461,3 +462,27 @@ def test_early_detection(args):
461
462
  res = module._is(value)
462
463
  assert res
463
464
  mock_func.assert_not_called()
465
+
466
+
467
+ def test_all_proportion_1():
468
+ all_tests = return_all_tests("ALL", "detect_fields")
469
+ prop_1 = {
470
+ t.__name__.split(".")[-1]: eval(
471
+ t.__name__.split(".")[-1]
472
+ if t.__name__.split(".")[-1] not in ["int", "float"]
473
+ else "test_" + t.__name__.split(".")[-1]
474
+ )
475
+ for t in all_tests
476
+ if t.PROPORTION == 1
477
+ }
478
+ # building a table that uses only correct values for these formats, except on one row
479
+ table = pd.DataFrame(
480
+ {
481
+ test_name: (fields[test_module][True] * 100)[:100] + ["not_suitable"]
482
+ for test_name, test_module in prop_1.items()
483
+ }
484
+ )
485
+ # testing columns for all formats
486
+ returned_table = col_test(table, all_tests, limited_output=True)
487
+ # the analysis should have found no match on any format
488
+ assert all(returned_table[col].sum() == 0 for col in table.columns)
csv_detective/s3_utils.py DELETED
@@ -1,44 +0,0 @@
1
- import logging
2
-
3
- import boto3
4
- from botocore.client import Config
5
- from botocore.exceptions import ClientError
6
-
7
-
8
- def get_minio_url(netloc: str, bucket: str, key: str) -> str:
9
- """Returns location of given resource in minio once it is saved"""
10
- return netloc + "/" + bucket + "/" + key
11
-
12
-
13
- def get_s3_client(url: str, minio_user: str, minio_pwd: str) -> boto3.client:
14
- return boto3.client(
15
- "s3",
16
- endpoint_url=url,
17
- aws_access_key_id=minio_user,
18
- aws_secret_access_key=minio_pwd,
19
- config=Config(signature_version="s3v4"),
20
- )
21
-
22
-
23
- def download_from_minio(
24
- netloc: str, bucket: str, key: str, filepath: str, minio_user: str, minio_pwd: str
25
- ) -> None:
26
- logging.info("Downloading from minio")
27
- s3 = get_s3_client(netloc, minio_user, minio_pwd)
28
- try:
29
- s3.download_file(bucket, key, filepath)
30
- logging.info(f"Resource downloaded from minio at {get_minio_url(netloc, bucket, key)}")
31
- except ClientError as e:
32
- logging.error(e)
33
-
34
-
35
- def upload_to_minio(
36
- netloc: str, bucket: str, key: str, filepath: str, minio_user: str, minio_pwd: str
37
- ) -> None:
38
- logging.info("Saving to minio")
39
- s3 = get_s3_client(netloc, minio_user, minio_pwd)
40
- try:
41
- s3.upload_file(filepath, bucket, key)
42
- logging.info(f"Resource saved into minio at {get_minio_url(netloc, bucket, key)}")
43
- except ClientError as e:
44
- logging.error(e)
venv/bin/jp.py DELETED
@@ -1,54 +0,0 @@
1
- #!/home/circleci/project/venv/bin/python
2
-
3
- import sys
4
- import json
5
- import argparse
6
- from pprint import pformat
7
-
8
- import jmespath
9
- from jmespath import exceptions
10
-
11
-
12
- def main():
13
- parser = argparse.ArgumentParser()
14
- parser.add_argument('expression')
15
- parser.add_argument('-f', '--filename',
16
- help=('The filename containing the input data. '
17
- 'If a filename is not given then data is '
18
- 'read from stdin.'))
19
- parser.add_argument('--ast', action='store_true',
20
- help=('Pretty print the AST, do not search the data.'))
21
- args = parser.parse_args()
22
- expression = args.expression
23
- if args.ast:
24
- # Only print the AST
25
- expression = jmespath.compile(args.expression)
26
- sys.stdout.write(pformat(expression.parsed))
27
- sys.stdout.write('\n')
28
- return 0
29
- if args.filename:
30
- with open(args.filename, 'r') as f:
31
- data = json.load(f)
32
- else:
33
- data = sys.stdin.read()
34
- data = json.loads(data)
35
- try:
36
- sys.stdout.write(json.dumps(
37
- jmespath.search(expression, data), indent=4, ensure_ascii=False))
38
- sys.stdout.write('\n')
39
- except exceptions.ArityError as e:
40
- sys.stderr.write("invalid-arity: %s\n" % e)
41
- return 1
42
- except exceptions.JMESPathTypeError as e:
43
- sys.stderr.write("invalid-type: %s\n" % e)
44
- return 1
45
- except exceptions.UnknownFunctionError as e:
46
- sys.stderr.write("unknown-function: %s\n" % e)
47
- return 1
48
- except exceptions.ParseError as e:
49
- sys.stderr.write("syntax-error: %s\n" % e)
50
- return 1
51
-
52
-
53
- if __name__ == '__main__':
54
- sys.exit(main())