csv-detective 0.7.5.dev1239__py3-none-any.whl → 0.7.5.dev1277__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/cli.py +34 -19
- csv_detective/detect_fields/other/float/__init__.py +1 -1
- csv_detective/explore_csv.py +11 -22
- {csv_detective-0.7.5.dev1239.data → csv_detective-0.7.5.dev1277.data}/data/share/csv_detective/CHANGELOG.md +2 -0
- {csv_detective-0.7.5.dev1239.dist-info → csv_detective-0.7.5.dev1277.dist-info}/METADATA +1 -1
- {csv_detective-0.7.5.dev1239.dist-info → csv_detective-0.7.5.dev1277.dist-info}/RECORD +13 -13
- {csv_detective-0.7.5.dev1239.dist-info → csv_detective-0.7.5.dev1277.dist-info}/WHEEL +1 -1
- tests/test_fields.py +1 -1
- {csv_detective-0.7.5.dev1239.data → csv_detective-0.7.5.dev1277.data}/data/share/csv_detective/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev1239.data → csv_detective-0.7.5.dev1277.data}/data/share/csv_detective/README.md +0 -0
- {csv_detective-0.7.5.dev1239.dist-info → csv_detective-0.7.5.dev1277.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.7.5.dev1239.dist-info → csv_detective-0.7.5.dev1277.dist-info}/licenses/LICENSE.AGPL.txt +0 -0
- {csv_detective-0.7.5.dev1239.dist-info → csv_detective-0.7.5.dev1277.dist-info}/top_level.txt +0 -0
csv_detective/cli.py
CHANGED
|
@@ -8,37 +8,52 @@ from .explore_csv import routine
|
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
def run():
|
|
11
|
-
explorer = argparse.ArgumentParser(description=
|
|
11
|
+
explorer = argparse.ArgumentParser(description="Analyse a tabular file")
|
|
12
12
|
explorer.add_argument(
|
|
13
|
-
|
|
13
|
+
"file_path",
|
|
14
14
|
type=str,
|
|
15
|
-
help=
|
|
15
|
+
help="Enter path of tabular file to explore"
|
|
16
16
|
)
|
|
17
17
|
explorer.add_argument(
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
dest=
|
|
18
|
+
"-n",
|
|
19
|
+
"--num_rows",
|
|
20
|
+
dest="num_rows",
|
|
21
21
|
type=int,
|
|
22
|
-
nargs=
|
|
23
|
-
help=
|
|
22
|
+
nargs="?",
|
|
23
|
+
help="Number of rows to use for detection (default 500)"
|
|
24
24
|
)
|
|
25
25
|
explorer.add_argument(
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
dest=
|
|
26
|
+
"-s",
|
|
27
|
+
"--sep",
|
|
28
|
+
dest="sep",
|
|
29
29
|
type=str,
|
|
30
|
-
nargs=
|
|
31
|
-
help=
|
|
30
|
+
nargs="?",
|
|
31
|
+
help="Columns separator (detected if not specified)"
|
|
32
|
+
)
|
|
33
|
+
explorer.add_argument(
|
|
34
|
+
"--save",
|
|
35
|
+
dest="save_results",
|
|
36
|
+
type=int,
|
|
37
|
+
nargs="?",
|
|
38
|
+
help="Whether to save the resulting analysis to json (1 = save, 0 = don't)"
|
|
39
|
+
)
|
|
40
|
+
explorer.add_argument(
|
|
41
|
+
"-v",
|
|
42
|
+
"--verbose",
|
|
43
|
+
dest="verbose",
|
|
44
|
+
type=int,
|
|
45
|
+
nargs="?",
|
|
46
|
+
help="Verbose (0 = quiet, 1 = details)"
|
|
32
47
|
)
|
|
33
48
|
|
|
34
49
|
opts = explorer.parse_args()
|
|
35
50
|
|
|
36
|
-
num_rows = opts.num_rows or 50
|
|
37
51
|
inspection_results = routine(
|
|
38
|
-
opts.file_path,
|
|
39
|
-
num_rows=num_rows,
|
|
40
|
-
|
|
41
|
-
|
|
52
|
+
csv_file_path=opts.file_path,
|
|
53
|
+
num_rows=opts.num_rows,
|
|
54
|
+
sep=opts.sep,
|
|
55
|
+
save_results=bool(opts.save_results),
|
|
56
|
+
verbose=bool(opts.verbose),
|
|
42
57
|
)
|
|
43
58
|
|
|
44
|
-
print(json.dumps(inspection_results, indent=4,
|
|
59
|
+
print(json.dumps(inspection_results, indent=4, ensure_ascii=False))
|
|
@@ -12,7 +12,7 @@ def _is(val):
|
|
|
12
12
|
if (
|
|
13
13
|
not isinstance(val, str)
|
|
14
14
|
or any([k in val for k in ['_', '+', 'e', 'E']])
|
|
15
|
-
or (val.startswith(
|
|
15
|
+
or (val.startswith("0") and len(val) > 1 and val[1] not in [".", ","])
|
|
16
16
|
):
|
|
17
17
|
return False
|
|
18
18
|
float_casting(val)
|
csv_detective/explore_csv.py
CHANGED
|
@@ -4,7 +4,7 @@ import logging
|
|
|
4
4
|
import os
|
|
5
5
|
import tempfile
|
|
6
6
|
from time import time
|
|
7
|
-
from typing import Union
|
|
7
|
+
from typing import Optional, Union
|
|
8
8
|
|
|
9
9
|
import numpy as np
|
|
10
10
|
import pandas as pd
|
|
@@ -25,7 +25,7 @@ from .s3_utils import download_from_minio, upload_to_minio
|
|
|
25
25
|
from .utils import display_logs_depending_process_time, is_url
|
|
26
26
|
|
|
27
27
|
|
|
28
|
-
def get_all_packages(detect_type) -> list:
|
|
28
|
+
def get_all_packages(detect_type: str) -> list:
|
|
29
29
|
root_dir = os.path.dirname(os.path.abspath(__file__)) + "/" + detect_type
|
|
30
30
|
modules = []
|
|
31
31
|
for dirpath, _, filenames in os.walk(root_dir):
|
|
@@ -88,15 +88,15 @@ def routine(
|
|
|
88
88
|
user_input_tests: Union[str, list[str]] = "ALL",
|
|
89
89
|
limited_output: bool = True,
|
|
90
90
|
save_results: Union[bool, str] = True,
|
|
91
|
-
encoding: str = None,
|
|
92
|
-
sep: str = None,
|
|
91
|
+
encoding: Optional[str] = None,
|
|
92
|
+
sep: Optional[str] = None,
|
|
93
93
|
skipna: bool = True,
|
|
94
94
|
output_profile: bool = False,
|
|
95
95
|
output_schema: bool = False,
|
|
96
96
|
output_df: bool = False,
|
|
97
97
|
cast_json: bool = True,
|
|
98
98
|
verbose: bool = False,
|
|
99
|
-
sheet_name: Union[str, int] = None,
|
|
99
|
+
sheet_name: Optional[Union[str, int]] = None,
|
|
100
100
|
) -> Union[dict, tuple[dict, pd.DataFrame]]:
|
|
101
101
|
"""Returns a dict with information about the csv table and possible
|
|
102
102
|
column contents.
|
|
@@ -307,10 +307,7 @@ def routine_minio(
|
|
|
307
307
|
tableschema_minio_location: dict[str, str],
|
|
308
308
|
minio_user: str,
|
|
309
309
|
minio_pwd: str,
|
|
310
|
-
|
|
311
|
-
user_input_tests: Union[str, list[str]] = "ALL",
|
|
312
|
-
encoding: str = None,
|
|
313
|
-
sep: str = None,
|
|
310
|
+
**kwargs,
|
|
314
311
|
):
|
|
315
312
|
"""Returns a dict with information about the csv table and possible
|
|
316
313
|
column contents.
|
|
@@ -323,11 +320,7 @@ def routine_minio(
|
|
|
323
320
|
None if not uploading the tableschema to Minio.
|
|
324
321
|
minio_user: user name for the minio instance
|
|
325
322
|
minio_pwd: password for the minio instance
|
|
326
|
-
|
|
327
|
-
the whole file
|
|
328
|
-
user_input_tests: tests to run on the file
|
|
329
|
-
output_mode: LIMITED or ALL, whether or not to return all possible types or only
|
|
330
|
-
the most likely one for each column
|
|
323
|
+
kwargs: arguments for routine
|
|
331
324
|
|
|
332
325
|
Returns:
|
|
333
326
|
dict: a dict with information about the csv and possible types for each column
|
|
@@ -376,14 +369,10 @@ def routine_minio(
|
|
|
376
369
|
minio_pwd=minio_pwd,
|
|
377
370
|
)
|
|
378
371
|
|
|
379
|
-
analysis = routine(
|
|
380
|
-
file_path,
|
|
372
|
+
analysis = routine(file_path,
|
|
381
373
|
num_rows,
|
|
382
|
-
user_input_tests,
|
|
383
|
-
output_mode="LIMITED",
|
|
384
374
|
save_results=True,
|
|
385
|
-
|
|
386
|
-
sep=sep,
|
|
375
|
+
**kwargs,
|
|
387
376
|
)
|
|
388
377
|
|
|
389
378
|
# Write report JSON file.
|
|
@@ -404,8 +393,8 @@ def routine_minio(
|
|
|
404
393
|
os.remove(file_path)
|
|
405
394
|
|
|
406
395
|
generate_table_schema(
|
|
407
|
-
analysis,
|
|
408
|
-
True,
|
|
396
|
+
analysis_report=analysis,
|
|
397
|
+
save_file=True,
|
|
409
398
|
netloc=tableschema_minio_location["netloc"],
|
|
410
399
|
bucket=tableschema_minio_location["bucket"],
|
|
411
400
|
key=tableschema_minio_location["key"],
|
|
@@ -7,11 +7,13 @@
|
|
|
7
7
|
- Better naming, hint types and minor refactors [#103](https://github.com/datagouv/csv-detective/pull/103)
|
|
8
8
|
- The returned dataframe has its columns properly cast to the detected types [#104](https://github.com/datagouv/csv-detective/pull/104)
|
|
9
9
|
- Raise an error if the encoding could not be guessed [#106](https://github.com/datagouv/csv-detective/pull/106)
|
|
10
|
+
- Fix CLI and minio routine [#107](https://github.com/datagouv/csv-detective/pull/107)
|
|
10
11
|
- Allow to only specify tests to skip ("all but...") [#108](https://github.com/datagouv/csv-detective/pull/108)
|
|
11
12
|
- Fix bool casting [#109](https://github.com/datagouv/csv-detective/pull/109)
|
|
12
13
|
- Handle csv.gz files [#110](https://github.com/datagouv/csv-detective/pull/110)
|
|
13
14
|
- Refactor file tests [#110](https://github.com/datagouv/csv-detective/pull/110)
|
|
14
15
|
- Restructure repo (breaking changes) [#111](https://github.com/datagouv/csv-detective/pull/111)
|
|
16
|
+
- Better float detection [#113](https://github.com/datagouv/csv-detective/pull/113)
|
|
15
17
|
|
|
16
18
|
## 0.7.4 (2024-11-15)
|
|
17
19
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
csv_detective/__init__.py,sha256=GCHgu0BhH5ACV7cf-1gDr9nRyvSoeQ1vRw9SjEHeMT4,143
|
|
2
|
-
csv_detective/cli.py,sha256=
|
|
3
|
-
csv_detective/explore_csv.py,sha256=
|
|
2
|
+
csv_detective/cli.py,sha256=itooHtpyfC6DUsL_DchPKe1xo7m0MYJIp1L4R8eqoTk,1401
|
|
3
|
+
csv_detective/explore_csv.py,sha256=FmgJ2h1SxV8b_wOWia4xsswyVJTlCCW66e0nhltz-0s,14511
|
|
4
4
|
csv_detective/s3_utils.py,sha256=1cIVdQUYY2ovErbMwp72Gqtqx2bkB8nfVhn-QaOFTT0,1451
|
|
5
5
|
csv_detective/utils.py,sha256=KAYfSJXnPuAXnSc38Jm57oQ_JP_0kUkmI1OV6gN5_ys,1116
|
|
6
6
|
csv_detective/detect_fields/__init__.py,sha256=NVfE3BQVExgXb-BPbhDvlkM5-0naEVLpZ4aM_OGHYfE,931
|
|
@@ -53,7 +53,7 @@ csv_detective/detect_fields/geo/longitude_wgs/__init__.py,sha256=G7afWOKiGh_Tv7g
|
|
|
53
53
|
csv_detective/detect_fields/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
54
54
|
csv_detective/detect_fields/other/booleen/__init__.py,sha256=wn_yyTAmGxqo0l0b7JRpGb0da_E27iGxES9zWCrnsqc,497
|
|
55
55
|
csv_detective/detect_fields/other/email/__init__.py,sha256=O9tgJmq0O8Q-8iin63NqEEDhlsUJjxFZNaNFM4GZaws,178
|
|
56
|
-
csv_detective/detect_fields/other/float/__init__.py,sha256=
|
|
56
|
+
csv_detective/detect_fields/other/float/__init__.py,sha256=7bXuPAmBuIhKJEhq7d20B60WVol1AUpqRkWhreQpWfU,578
|
|
57
57
|
csv_detective/detect_fields/other/int/__init__.py,sha256=QN3kQJLYqLRBiubUK7g4Xq03PlA5wqVwx2pPPIO9FdI,320
|
|
58
58
|
csv_detective/detect_fields/other/json/__init__.py,sha256=DhzyvT12kOqgum89silIu3uoSYXmC_s_AaxLtXAD4eU,540
|
|
59
59
|
csv_detective/detect_fields/other/mongo_object_id/__init__.py,sha256=7fcrHsOZAqXp2_N0IjPskYJ_qi4xRlo9iyNNDQVLzsU,156
|
|
@@ -141,18 +141,18 @@ csv_detective/parsing/csv.py,sha256=11mibDnJhIjykXLGZvA5ZEU5U7KgxIrbyO6BNv6jlro,
|
|
|
141
141
|
csv_detective/parsing/excel.py,sha256=AslE2S1e67o8yTIAIhp-lAnJ6-XqeBBRz1-VMFqhZBM,7055
|
|
142
142
|
csv_detective/parsing/load.py,sha256=SpP0pfxswOAPPpwbZfoP1blh0EKV5VMs0TpTgQJKzjs,3621
|
|
143
143
|
csv_detective/parsing/text.py,sha256=rsfk66BCmdpsCOd0kDJ8tmqMsEWd-OeBkEisWc4Ej9k,1246
|
|
144
|
-
csv_detective-0.7.5.
|
|
145
|
-
csv_detective-0.7.5.
|
|
146
|
-
csv_detective-0.7.5.
|
|
147
|
-
csv_detective-0.7.5.
|
|
144
|
+
csv_detective-0.7.5.dev1277.data/data/share/csv_detective/CHANGELOG.md,sha256=tgIIm6s4qoP4RGJK1cmqf-Cm5aHmXmBrwi37NVIYedg,7796
|
|
145
|
+
csv_detective-0.7.5.dev1277.data/data/share/csv_detective/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
|
|
146
|
+
csv_detective-0.7.5.dev1277.data/data/share/csv_detective/README.md,sha256=Qr8xRXc-dxQ-tdXCpCTCKp1Uliqq84r0UOlPRNuGCpI,9506
|
|
147
|
+
csv_detective-0.7.5.dev1277.dist-info/licenses/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
|
|
148
148
|
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
149
149
|
tests/test_example.py,sha256=0NfChooJQlFxTo2nY5FOQIcsK4zzWA_SBmt2LwVQovY,2014
|
|
150
|
-
tests/test_fields.py,sha256=
|
|
150
|
+
tests/test_fields.py,sha256=LPLx09cX5u9XHAh65XvTgIqzKylToiHZxXzKhpV0wsk,11148
|
|
151
151
|
tests/test_file.py,sha256=EleTssys5fCP4N0W1eTZN35uijzoF15e3dIcuIlrMsk,7865
|
|
152
152
|
tests/test_labels.py,sha256=6MOKrGznkwU5fjZ_3oiB6Scmb480Eu-9geBJs0UDLds,159
|
|
153
153
|
tests/test_structure.py,sha256=SVsnluVoIIprYw_67I1_gB3cp9m1wlO8C7SpdsLW8cM,1161
|
|
154
|
-
csv_detective-0.7.5.
|
|
155
|
-
csv_detective-0.7.5.
|
|
156
|
-
csv_detective-0.7.5.
|
|
157
|
-
csv_detective-0.7.5.
|
|
158
|
-
csv_detective-0.7.5.
|
|
154
|
+
csv_detective-0.7.5.dev1277.dist-info/METADATA,sha256=RgcnqpKqQ1us0lmVf6McKYJs38DC1sqvAh10XgnJOY8,1386
|
|
155
|
+
csv_detective-0.7.5.dev1277.dist-info/WHEEL,sha256=pxyMxgL8-pra_rKaQ4drOZAegBVuX-G_4nRHjjgWbmo,91
|
|
156
|
+
csv_detective-0.7.5.dev1277.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
|
|
157
|
+
csv_detective-0.7.5.dev1277.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
|
|
158
|
+
csv_detective-0.7.5.dev1277.dist-info/RECORD,,
|
tests/test_fields.py
CHANGED
|
@@ -501,7 +501,7 @@ def test_not_match_int():
|
|
|
501
501
|
|
|
502
502
|
# float
|
|
503
503
|
def test_match_float():
|
|
504
|
-
for val in ["1", "0", "1764", "-24", "1.2", "1863.23", "-12.7"]:
|
|
504
|
+
for val in ["1", "0", "1764", "-24", "1.2", "1863.23", "-12.7", "0.1"]:
|
|
505
505
|
assert test_float._is(val)
|
|
506
506
|
|
|
507
507
|
|
|
File without changes
|
|
File without changes
|
{csv_detective-0.7.5.dev1239.dist-info → csv_detective-0.7.5.dev1277.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|
{csv_detective-0.7.5.dev1239.dist-info → csv_detective-0.7.5.dev1277.dist-info}/top_level.txt
RENAMED
|
File without changes
|