csv-detective 0.10.4.dev1__py3-none-any.whl → 0.10.4.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -36,7 +36,7 @@ def routine(
36
36
  file_path: local path or URL to file
37
37
  num_rows: number of rows to sample from the file for analysis ; -1 for analysis of the whole file
38
38
  tags: tags to filter formats (for instance ["geo", "fr"] to run only the checks related to geo and French formats)
39
- limited_output: whether or not to return all possible types or only the most likely one for each column
39
+ limited_output: whether or not to return all possible formats or only the most likely one for each column
40
40
  save_results: whether or not to save the results in a json file, or the path where to dump the output
41
41
  output_profile: whether or not to add the 'profile' field to the output
42
42
  output_schema: whether or not to add the 'schema' field to the output (tableschema)
@@ -47,7 +47,7 @@ def routine(
47
47
  skipna: whether to keep NaN (empty cells) for tests
48
48
 
49
49
  Returns:
50
- dict: a dict with information about the csv and possible types for each column
50
+ dict: a dict with information about the csv and possible formats for each column
51
51
  """
52
52
 
53
53
  if not (
csv_detective/format.py CHANGED
@@ -25,6 +25,8 @@ class Format:
25
25
  labels: the dict of hint headers and their credibility for the header score (NB: credibility is relative within a single format, should be used to rank the valid labels)
26
26
  proportion: the tolerance (between 0 and 1) to say a column is valid for a format. (1 => 100% of the column has to pass the func check for the column to be considered valid)
27
27
  tags: to allow users to submit a file to only a subset of formats
28
+ mandatory_label: whether the format can only be considered if the column passes both field and label tests
29
+ python_type: the python type related to the format (less specific, used for downstream casting)
28
30
  """
29
31
  self.name: str = name
30
32
  self.func: Callable[[Any], bool] = func
@@ -94,7 +94,7 @@ def test_col(
94
94
  )
95
95
  if verbose:
96
96
  display_logs_depending_process_time(
97
- f'\t> Done with type "{label}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(formats)})',
97
+ f'\t> Done with format "{label}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(formats)})',
98
98
  time() - start_type,
99
99
  )
100
100
  if verbose:
@@ -109,7 +109,7 @@ def test_label(
109
109
  ):
110
110
  if verbose:
111
111
  start = time()
112
- logging.info("Testing labels to get types")
112
+ logging.info("Testing labels to get formats")
113
113
 
114
114
  return_table = pd.DataFrame(columns=columns)
115
115
  for idx, (label, format) in enumerate(formats.items()):
@@ -118,7 +118,7 @@ def test_label(
118
118
  return_table.loc[label] = [format.is_valid_label(col_name) for col_name in columns]
119
119
  if verbose:
120
120
  display_logs_depending_process_time(
121
- f'\t- Done with type "{label}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(formats)})',
121
+ f'\t- Done with format "{label}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(formats)})',
122
122
  time() - start_type,
123
123
  )
124
124
  if verbose:
@@ -39,7 +39,7 @@ def parse_csv(
39
39
  # branch between small and big files starts here
40
40
  if total_lines == CHUNK_SIZE:
41
41
  if verbose:
42
- logging.warning(f"File is too long, analysing in chunks of {CHUNK_SIZE} rows")
42
+ logging.warning(f"File is too long, loading in chunks of {CHUNK_SIZE} rows")
43
43
  total_lines, nb_duplicates = None, None
44
44
  else:
45
45
  nb_duplicates = table.duplicated().sum()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: csv-detective
3
- Version: 0.10.4.dev1
3
+ Version: 0.10.4.dev2
4
4
  Summary: Detect tabular files column content
5
5
  Author-email: "data.gouv.fr" <opendatateam@data.gouv.fr>
6
6
  License: MIT
@@ -1,7 +1,7 @@
1
1
  csv_detective/__init__.py,sha256=zlYElTOp_I2_VG7ZdOTuAu0wuCXSc0cr3sH6gtk2bcg,152
2
2
  csv_detective/cli.py,sha256=mu5anmBmaDk52_uZGiA4T37wYZCuV43gZAepjs1Cqzc,1389
3
- csv_detective/explore_csv.py,sha256=M8jabAP08raPY438v5UeBqJy3bBudTeuo-UNe2unWyE,7639
4
- csv_detective/format.py,sha256=VTdwg4gp9pq6WYhbkCxv9X2hXq0fMrzfooFchmIL0as,2911
3
+ csv_detective/explore_csv.py,sha256=YjkQihSm1vgZbEfXHxJ-_bVJrtCUT3Ut8x8FX60ZK3k,7643
4
+ csv_detective/format.py,sha256=vYz4h-WDUZ3pZIfxUc1toXRbUiNnWqHHiUXBbZtnaUw,3140
5
5
  csv_detective/utils.py,sha256=RJ_zFOJ1DRY8HtDrKPiCdNk5gU6-KwOrOKOyfSkBZZY,1118
6
6
  csv_detective/validate.py,sha256=7k0GC5AsTn5BbsRChetZZDmnTGiYLe40qPKiP3GruYs,7495
7
7
  csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -80,13 +80,13 @@ csv_detective/output/profile.py,sha256=R9YMl-dANde69RXkFlZpvMDBsX7e1SyMAnlW8p1XN
80
80
  csv_detective/output/schema.py,sha256=XoKljXPXP00DfqPCiz1ydwTHYGAFsvNxnaPCNBuuBIo,10443
81
81
  csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
82
82
  csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
83
- csv_detective/parsing/columns.py,sha256=MFtEJFLsFdlKdM5AXtgXbf5p6HRW6DuOC4XnxhFMpIY,9344
83
+ csv_detective/parsing/columns.py,sha256=ExabNAzQB-IvQfjmVLClBfj1GMkbAQxUUnournFpdy8,9350
84
84
  csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
85
- csv_detective/parsing/csv.py,sha256=5rw6gXZFQC1T4NT9CnW0AumidrYOkF8kjrfWGmk949I,1716
85
+ csv_detective/parsing/csv.py,sha256=vfAHkpgzLkzeUXWVqrWlttZ1y-Hql0GNFSZZrA366mk,1714
86
86
  csv_detective/parsing/excel.py,sha256=pX6dbhAdAdbRpoGcrGsL1lSaF-fbzEb4WcvwcCGEgFw,6978
87
87
  csv_detective/parsing/load.py,sha256=1Fk43ikIOJwtWJUY-e8oNeNOk4MMtpmZV7s-VbQBS1k,4345
88
88
  csv_detective/parsing/text.py,sha256=yDAcop5xJQc25UtbZcV0guHXAZQfm-H8WuJORTy8Rr8,1734
89
- csv_detective-0.10.4.dev1.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
89
+ csv_detective-0.10.4.dev2.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
90
90
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
91
91
  tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
92
92
  tests/test_fields.py,sha256=DSI-ZXDcRt69iZArYZZAr_3OEb-qvwgOVBZxmYAKIkI,5918
@@ -104,8 +104,8 @@ tests/data/file.ods,sha256=4dR7zWptz5djALIBVeWHQ20GaZNfA63fevIJGFIk1_U,11832
104
104
  tests/data/file.xls,sha256=QYmNX3FF0QfcQSzYQMtaMJaepJf5EZpDa1miKc4wMdQ,21495
105
105
  tests/data/file.xlsx,sha256=naWzL02PK4pdIjMzfEyfSW9GQhkYYd_e7bpJvB8Pb2w,8314
106
106
  tests/data/xlsx_file,sha256=NyOyN_rIe7ryJuHQLqjxVdKCc8V4s5pxyHl6wWFykCM,8305
107
- csv_detective-0.10.4.dev1.dist-info/METADATA,sha256=le1Rn1JIh8MoIf_RTc3Fi9DOOlvug4eR-Mwpw4AK0To,10925
108
- csv_detective-0.10.4.dev1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
109
- csv_detective-0.10.4.dev1.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
110
- csv_detective-0.10.4.dev1.dist-info/top_level.txt,sha256=KDI4gyOpkmormGgUvSWrE3jen2e0unIsxR2b96DRvcw,25
111
- csv_detective-0.10.4.dev1.dist-info/RECORD,,
107
+ csv_detective-0.10.4.dev2.dist-info/METADATA,sha256=52ZirEUs9m49EuivGtEmtr_p2h_wVsSRbyQTs0SfoWE,10925
108
+ csv_detective-0.10.4.dev2.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
109
+ csv_detective-0.10.4.dev2.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
110
+ csv_detective-0.10.4.dev2.dist-info/top_level.txt,sha256=KDI4gyOpkmormGgUvSWrE3jen2e0unIsxR2b96DRvcw,25
111
+ csv_detective-0.10.4.dev2.dist-info/RECORD,,