csv-detective 0.7.5.dev1239__py3-none-any.whl → 0.7.5.dev1277__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
csv_detective/cli.py CHANGED
@@ -8,37 +8,52 @@ from .explore_csv import routine
8
8
 
9
9
 
10
10
  def run():
11
- explorer = argparse.ArgumentParser(description='Get the arguments we want')
11
+ explorer = argparse.ArgumentParser(description="Analyse a tabular file")
12
12
  explorer.add_argument(
13
- 'file_path',
13
+ "file_path",
14
14
  type=str,
15
- help='Enter path of csv file to explore'
15
+ help="Enter path of tabular file to explore"
16
16
  )
17
17
  explorer.add_argument(
18
- '-n',
19
- '--num_rows',
20
- dest='num_rows',
18
+ "-n",
19
+ "--num_rows",
20
+ dest="num_rows",
21
21
  type=int,
22
- nargs='?',
23
- help='Number of rows to use for detection'
22
+ nargs="?",
23
+ help="Number of rows to use for detection (default 500)"
24
24
  )
25
25
  explorer.add_argument(
26
- '-t',
27
- '--select_tests',
28
- dest='city',
26
+ "-s",
27
+ "--sep",
28
+ dest="sep",
29
29
  type=str,
30
- nargs='*',
31
- help='List of tests to be performed (use "" if you want to use the dash option to remove tests)'
30
+ nargs="?",
31
+ help="Columns separator (detected if not specified)"
32
+ )
33
+ explorer.add_argument(
34
+ "--save",
35
+ dest="save_results",
36
+ type=int,
37
+ nargs="?",
38
+ help="Whether to save the resulting analysis to json (1 = save, 0 = don't)"
39
+ )
40
+ explorer.add_argument(
41
+ "-v",
42
+ "--verbose",
43
+ dest="verbose",
44
+ type=int,
45
+ nargs="?",
46
+ help="Verbose (0 = quiet, 1 = details)"
32
47
  )
33
48
 
34
49
  opts = explorer.parse_args()
35
50
 
36
- num_rows = opts.num_rows or 50
37
51
  inspection_results = routine(
38
- opts.file_path,
39
- num_rows=num_rows,
40
- user_input_tests='ALL',
41
- output_mode='ALL'
52
+ csv_file_path=opts.file_path,
53
+ num_rows=opts.num_rows,
54
+ sep=opts.sep,
55
+ save_results=bool(opts.save_results),
56
+ verbose=bool(opts.verbose),
42
57
  )
43
58
 
44
- print(json.dumps(inspection_results, indent=4, sort_keys=True, ensure_ascii=False))
59
+ print(json.dumps(inspection_results, indent=4, ensure_ascii=False))
@@ -12,7 +12,7 @@ def _is(val):
12
12
  if (
13
13
  not isinstance(val, str)
14
14
  or any([k in val for k in ['_', '+', 'e', 'E']])
15
- or (val.startswith('0') and len(val) > 1)
15
+ or (val.startswith("0") and len(val) > 1 and val[1] not in [".", ","])
16
16
  ):
17
17
  return False
18
18
  float_casting(val)
@@ -4,7 +4,7 @@ import logging
4
4
  import os
5
5
  import tempfile
6
6
  from time import time
7
- from typing import Union
7
+ from typing import Optional, Union
8
8
 
9
9
  import numpy as np
10
10
  import pandas as pd
@@ -25,7 +25,7 @@ from .s3_utils import download_from_minio, upload_to_minio
25
25
  from .utils import display_logs_depending_process_time, is_url
26
26
 
27
27
 
28
- def get_all_packages(detect_type) -> list:
28
+ def get_all_packages(detect_type: str) -> list:
29
29
  root_dir = os.path.dirname(os.path.abspath(__file__)) + "/" + detect_type
30
30
  modules = []
31
31
  for dirpath, _, filenames in os.walk(root_dir):
@@ -88,15 +88,15 @@ def routine(
88
88
  user_input_tests: Union[str, list[str]] = "ALL",
89
89
  limited_output: bool = True,
90
90
  save_results: Union[bool, str] = True,
91
- encoding: str = None,
92
- sep: str = None,
91
+ encoding: Optional[str] = None,
92
+ sep: Optional[str] = None,
93
93
  skipna: bool = True,
94
94
  output_profile: bool = False,
95
95
  output_schema: bool = False,
96
96
  output_df: bool = False,
97
97
  cast_json: bool = True,
98
98
  verbose: bool = False,
99
- sheet_name: Union[str, int] = None,
99
+ sheet_name: Optional[Union[str, int]] = None,
100
100
  ) -> Union[dict, tuple[dict, pd.DataFrame]]:
101
101
  """Returns a dict with information about the csv table and possible
102
102
  column contents.
@@ -307,10 +307,7 @@ def routine_minio(
307
307
  tableschema_minio_location: dict[str, str],
308
308
  minio_user: str,
309
309
  minio_pwd: str,
310
- num_rows: int = 500,
311
- user_input_tests: Union[str, list[str]] = "ALL",
312
- encoding: str = None,
313
- sep: str = None,
310
+ **kwargs,
314
311
  ):
315
312
  """Returns a dict with information about the csv table and possible
316
313
  column contents.
@@ -323,11 +320,7 @@ def routine_minio(
323
320
  None if not uploading the tableschema to Minio.
324
321
  minio_user: user name for the minio instance
325
322
  minio_pwd: password for the minio instance
326
- num_rows: number of rows to sample from the file for analysis ; -1 for analysis of
327
- the whole file
328
- user_input_tests: tests to run on the file
329
- output_mode: LIMITED or ALL, whether or not to return all possible types or only
330
- the most likely one for each column
323
+ kwargs: arguments for routine
331
324
 
332
325
  Returns:
333
326
  dict: a dict with information about the csv and possible types for each column
@@ -376,14 +369,10 @@ def routine_minio(
376
369
  minio_pwd=minio_pwd,
377
370
  )
378
371
 
379
- analysis = routine(
380
- file_path,
372
+ analysis = routine(file_path,
381
373
  num_rows,
382
- user_input_tests,
383
- output_mode="LIMITED",
384
374
  save_results=True,
385
- encoding=encoding,
386
- sep=sep,
375
+ **kwargs,
387
376
  )
388
377
 
389
378
  # Write report JSON file.
@@ -404,8 +393,8 @@ def routine_minio(
404
393
  os.remove(file_path)
405
394
 
406
395
  generate_table_schema(
407
- analysis,
408
- True,
396
+ analysis_report=analysis,
397
+ save_file=True,
409
398
  netloc=tableschema_minio_location["netloc"],
410
399
  bucket=tableschema_minio_location["bucket"],
411
400
  key=tableschema_minio_location["key"],
@@ -7,11 +7,13 @@
7
7
  - Better naming, hint types and minor refactors [#103](https://github.com/datagouv/csv-detective/pull/103)
8
8
  - The returned dataframe has its columns properly cast to the detected types [#104](https://github.com/datagouv/csv-detective/pull/104)
9
9
  - Raise an error if the encoding could not be guessed [#106](https://github.com/datagouv/csv-detective/pull/106)
10
+ - Fix CLI and minio routine [#107](https://github.com/datagouv/csv-detective/pull/107)
10
11
  - Allow to only specify tests to skip ("all but...") [#108](https://github.com/datagouv/csv-detective/pull/108)
11
12
  - Fix bool casting [#109](https://github.com/datagouv/csv-detective/pull/109)
12
13
  - Handle csv.gz files [#110](https://github.com/datagouv/csv-detective/pull/110)
13
14
  - Refactor file tests [#110](https://github.com/datagouv/csv-detective/pull/110)
14
15
  - Restructure repo (breaking changes) [#111](https://github.com/datagouv/csv-detective/pull/111)
16
+ - Better float detection [#113](https://github.com/datagouv/csv-detective/pull/113)
15
17
 
16
18
  ## 0.7.4 (2024-11-15)
17
19
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: csv_detective
3
- Version: 0.7.5.dev1239
3
+ Version: 0.7.5.dev1277
4
4
  Summary: Detect CSV column content
5
5
  Home-page: https://github.com/etalab/csv_detective
6
6
  Author: Etalab
@@ -1,6 +1,6 @@
1
1
  csv_detective/__init__.py,sha256=GCHgu0BhH5ACV7cf-1gDr9nRyvSoeQ1vRw9SjEHeMT4,143
2
- csv_detective/cli.py,sha256=Ua7SE1wMH2uFUsTmfumh4nJk7O06okpMd2gvjUDO1II,1048
3
- csv_detective/explore_csv.py,sha256=aJ2pG7lK4sgY9Pv31zEzFVGByxkfw4wwgrQqfgUtBOo,14903
2
+ csv_detective/cli.py,sha256=itooHtpyfC6DUsL_DchPKe1xo7m0MYJIp1L4R8eqoTk,1401
3
+ csv_detective/explore_csv.py,sha256=FmgJ2h1SxV8b_wOWia4xsswyVJTlCCW66e0nhltz-0s,14511
4
4
  csv_detective/s3_utils.py,sha256=1cIVdQUYY2ovErbMwp72Gqtqx2bkB8nfVhn-QaOFTT0,1451
5
5
  csv_detective/utils.py,sha256=KAYfSJXnPuAXnSc38Jm57oQ_JP_0kUkmI1OV6gN5_ys,1116
6
6
  csv_detective/detect_fields/__init__.py,sha256=NVfE3BQVExgXb-BPbhDvlkM5-0naEVLpZ4aM_OGHYfE,931
@@ -53,7 +53,7 @@ csv_detective/detect_fields/geo/longitude_wgs/__init__.py,sha256=G7afWOKiGh_Tv7g
53
53
  csv_detective/detect_fields/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
54
54
  csv_detective/detect_fields/other/booleen/__init__.py,sha256=wn_yyTAmGxqo0l0b7JRpGb0da_E27iGxES9zWCrnsqc,497
55
55
  csv_detective/detect_fields/other/email/__init__.py,sha256=O9tgJmq0O8Q-8iin63NqEEDhlsUJjxFZNaNFM4GZaws,178
56
- csv_detective/detect_fields/other/float/__init__.py,sha256=dpEd5ZijmjQ7gqcTnYRoRoLGGJae0RyGwVC6MPra9go,549
56
+ csv_detective/detect_fields/other/float/__init__.py,sha256=7bXuPAmBuIhKJEhq7d20B60WVol1AUpqRkWhreQpWfU,578
57
57
  csv_detective/detect_fields/other/int/__init__.py,sha256=QN3kQJLYqLRBiubUK7g4Xq03PlA5wqVwx2pPPIO9FdI,320
58
58
  csv_detective/detect_fields/other/json/__init__.py,sha256=DhzyvT12kOqgum89silIu3uoSYXmC_s_AaxLtXAD4eU,540
59
59
  csv_detective/detect_fields/other/mongo_object_id/__init__.py,sha256=7fcrHsOZAqXp2_N0IjPskYJ_qi4xRlo9iyNNDQVLzsU,156
@@ -141,18 +141,18 @@ csv_detective/parsing/csv.py,sha256=11mibDnJhIjykXLGZvA5ZEU5U7KgxIrbyO6BNv6jlro,
141
141
  csv_detective/parsing/excel.py,sha256=AslE2S1e67o8yTIAIhp-lAnJ6-XqeBBRz1-VMFqhZBM,7055
142
142
  csv_detective/parsing/load.py,sha256=SpP0pfxswOAPPpwbZfoP1blh0EKV5VMs0TpTgQJKzjs,3621
143
143
  csv_detective/parsing/text.py,sha256=rsfk66BCmdpsCOd0kDJ8tmqMsEWd-OeBkEisWc4Ej9k,1246
144
- csv_detective-0.7.5.dev1239.data/data/share/csv_detective/CHANGELOG.md,sha256=povo1ufNJvsxJLkzdjYLgkTy9E-MNFWTg6elXe2nyqU,7625
145
- csv_detective-0.7.5.dev1239.data/data/share/csv_detective/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
146
- csv_detective-0.7.5.dev1239.data/data/share/csv_detective/README.md,sha256=Qr8xRXc-dxQ-tdXCpCTCKp1Uliqq84r0UOlPRNuGCpI,9506
147
- csv_detective-0.7.5.dev1239.dist-info/licenses/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
144
+ csv_detective-0.7.5.dev1277.data/data/share/csv_detective/CHANGELOG.md,sha256=tgIIm6s4qoP4RGJK1cmqf-Cm5aHmXmBrwi37NVIYedg,7796
145
+ csv_detective-0.7.5.dev1277.data/data/share/csv_detective/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
146
+ csv_detective-0.7.5.dev1277.data/data/share/csv_detective/README.md,sha256=Qr8xRXc-dxQ-tdXCpCTCKp1Uliqq84r0UOlPRNuGCpI,9506
147
+ csv_detective-0.7.5.dev1277.dist-info/licenses/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
148
148
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
149
149
  tests/test_example.py,sha256=0NfChooJQlFxTo2nY5FOQIcsK4zzWA_SBmt2LwVQovY,2014
150
- tests/test_fields.py,sha256=fcgycaFxacOcN0WdwuUvxef_ejd6tRHNpkD5pxMjMXE,11141
150
+ tests/test_fields.py,sha256=LPLx09cX5u9XHAh65XvTgIqzKylToiHZxXzKhpV0wsk,11148
151
151
  tests/test_file.py,sha256=EleTssys5fCP4N0W1eTZN35uijzoF15e3dIcuIlrMsk,7865
152
152
  tests/test_labels.py,sha256=6MOKrGznkwU5fjZ_3oiB6Scmb480Eu-9geBJs0UDLds,159
153
153
  tests/test_structure.py,sha256=SVsnluVoIIprYw_67I1_gB3cp9m1wlO8C7SpdsLW8cM,1161
154
- csv_detective-0.7.5.dev1239.dist-info/METADATA,sha256=81-Ik3akmjcTO7mTqHRWrMLUP-4uZ4ffPyg9L74pImg,1386
155
- csv_detective-0.7.5.dev1239.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
156
- csv_detective-0.7.5.dev1239.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
157
- csv_detective-0.7.5.dev1239.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
158
- csv_detective-0.7.5.dev1239.dist-info/RECORD,,
154
+ csv_detective-0.7.5.dev1277.dist-info/METADATA,sha256=RgcnqpKqQ1us0lmVf6McKYJs38DC1sqvAh10XgnJOY8,1386
155
+ csv_detective-0.7.5.dev1277.dist-info/WHEEL,sha256=pxyMxgL8-pra_rKaQ4drOZAegBVuX-G_4nRHjjgWbmo,91
156
+ csv_detective-0.7.5.dev1277.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
157
+ csv_detective-0.7.5.dev1277.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
158
+ csv_detective-0.7.5.dev1277.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (78.1.0)
2
+ Generator: setuptools (79.0.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
tests/test_fields.py CHANGED
@@ -501,7 +501,7 @@ def test_not_match_int():
501
501
 
502
502
  # float
503
503
  def test_match_float():
504
- for val in ["1", "0", "1764", "-24", "1.2", "1863.23", "-12.7"]:
504
+ for val in ["1", "0", "1764", "-24", "1.2", "1863.23", "-12.7", "0.1"]:
505
505
  assert test_float._is(val)
506
506
 
507
507