csv-detective 0.10.3.dev7__py3-none-any.whl → 0.10.4.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,6 +11,7 @@ from csv_detective.format import Format, FormatsManager
11
11
  from csv_detective.output.utils import prepare_output_dict
12
12
  from csv_detective.parsing.columns import (
13
13
  MAX_NUMBER_CATEGORICAL_VALUES,
14
+ handle_empty_columns,
14
15
  test_col,
15
16
  test_col_chunks,
16
17
  test_label,
@@ -49,6 +50,7 @@ def detect_formats(
49
50
  skipna=skipna,
50
51
  verbose=verbose,
51
52
  )
53
+ handle_empty_columns(scores_table_fields)
52
54
  res_categorical, _ = detect_categorical_variable(
53
55
  table,
54
56
  max_number_categorical_values=MAX_NUMBER_CATEGORICAL_VALUES,
@@ -36,7 +36,7 @@ def routine(
36
36
  file_path: local path or URL to file
37
37
  num_rows: number of rows to sample from the file for analysis ; -1 for analysis of the whole file
38
38
  tags: tags to filter formats (for instance ["geo", "fr"] to run only the checks related to geo and French formats)
39
- limited_output: whether or not to return all possible types or only the most likely one for each column
39
+ limited_output: whether or not to return all possible formats or only the most likely one for each column
40
40
  save_results: whether or not to save the results in a json file, or the path where to dump the output
41
41
  output_profile: whether or not to add the 'profile' field to the output
42
42
  output_schema: whether or not to add the 'schema' field to the output (tableschema)
@@ -47,7 +47,7 @@ def routine(
47
47
  skipna: whether to keep NaN (empty cells) for tests
48
48
 
49
49
  Returns:
50
- dict: a dict with information about the csv and possible types for each column
50
+ dict: a dict with information about the csv and possible formats for each column
51
51
  """
52
52
 
53
53
  if not (
csv_detective/format.py CHANGED
@@ -25,6 +25,8 @@ class Format:
25
25
  labels: the dict of hint headers and their credibility for the header score (NB: credibility is relative within a single format, should be used to rank the valid labels)
26
26
  proportion: the tolerance (between 0 and 1) to say a column is valid for a format. (1 => 100% of the column has to pass the func check for the column to be considered valid)
27
27
  tags: to allow users to submit a file to only a subset of formats
28
+ mandatory_label: whether the format can only be considered if the column passes both field and label tests
29
+ python_type: the python type related to the format (less specific, used for downstream casting)
28
30
  """
29
31
  self.name: str = name
30
32
  self.func: Callable[[Any], bool] = func
@@ -13,6 +13,13 @@ from csv_detective.utils import display_logs_depending_process_time
13
13
  MAX_NUMBER_CATEGORICAL_VALUES = 25
14
14
 
15
15
 
16
+ def handle_empty_columns(return_table: pd.DataFrame):
17
+ # handling that empty columns score 1 everywhere
18
+ for col in return_table.columns:
19
+ if sum(return_table[col]) == len(return_table):
20
+ return_table[col] = 0
21
+
22
+
16
23
  def test_col_val(
17
24
  serie: pd.Series,
18
25
  format: Format,
@@ -87,7 +94,7 @@ def test_col(
87
94
  )
88
95
  if verbose:
89
96
  display_logs_depending_process_time(
90
- f'\t> Done with type "{label}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(formats)})',
97
+ f'\t> Done with format "{label}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(formats)})',
91
98
  time() - start_type,
92
99
  )
93
100
  if verbose:
@@ -102,7 +109,7 @@ def test_label(
102
109
  ):
103
110
  if verbose:
104
111
  start = time()
105
- logging.info("Testing labels to get types")
112
+ logging.info("Testing labels to get formats")
106
113
 
107
114
  return_table = pd.DataFrame(columns=columns)
108
115
  for idx, (label, format) in enumerate(formats.items()):
@@ -111,7 +118,7 @@ def test_label(
111
118
  return_table.loc[label] = [format.is_valid_label(col_name) for col_name in columns]
112
119
  if verbose:
113
120
  display_logs_depending_process_time(
114
- f'\t- Done with type "{label}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(formats)})',
121
+ f'\t- Done with format "{label}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(formats)})',
115
122
  time() - start_type,
116
123
  )
117
124
  if verbose:
@@ -222,10 +229,7 @@ def test_col_chunks(
222
229
  analysis["categorical"] = [
223
230
  col for col, values in col_values.items() if len(values) <= MAX_NUMBER_CATEGORICAL_VALUES
224
231
  ]
225
- # handling that empty columns score 1 everywhere
226
- for col in return_table.columns:
227
- if sum(return_table[col]) == len(return_table):
228
- return_table[col] = 0
232
+ handle_empty_columns(return_table)
229
233
  if verbose:
230
234
  display_logs_depending_process_time(
231
235
  f"Done testing chunks in {round(time() - start, 3)}s", time() - start
@@ -39,7 +39,7 @@ def parse_csv(
39
39
  # branch between small and big files starts here
40
40
  if total_lines == CHUNK_SIZE:
41
41
  if verbose:
42
- logging.warning(f"File is too long, analysing in chunks of {CHUNK_SIZE} rows")
42
+ logging.warning(f"File is too long, loading in chunks of {CHUNK_SIZE} rows")
43
43
  total_lines, nb_duplicates = None, None
44
44
  else:
45
45
  nb_duplicates = table.duplicated().sum()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: csv-detective
3
- Version: 0.10.3.dev7
3
+ Version: 0.10.4.dev2
4
4
  Summary: Detect tabular files column content
5
5
  Author-email: "data.gouv.fr" <opendatateam@data.gouv.fr>
6
6
  License: MIT
@@ -1,14 +1,14 @@
1
1
  csv_detective/__init__.py,sha256=zlYElTOp_I2_VG7ZdOTuAu0wuCXSc0cr3sH6gtk2bcg,152
2
2
  csv_detective/cli.py,sha256=mu5anmBmaDk52_uZGiA4T37wYZCuV43gZAepjs1Cqzc,1389
3
- csv_detective/explore_csv.py,sha256=M8jabAP08raPY438v5UeBqJy3bBudTeuo-UNe2unWyE,7639
4
- csv_detective/format.py,sha256=VTdwg4gp9pq6WYhbkCxv9X2hXq0fMrzfooFchmIL0as,2911
3
+ csv_detective/explore_csv.py,sha256=YjkQihSm1vgZbEfXHxJ-_bVJrtCUT3Ut8x8FX60ZK3k,7643
4
+ csv_detective/format.py,sha256=vYz4h-WDUZ3pZIfxUc1toXRbUiNnWqHHiUXBbZtnaUw,3140
5
5
  csv_detective/utils.py,sha256=RJ_zFOJ1DRY8HtDrKPiCdNk5gU6-KwOrOKOyfSkBZZY,1118
6
6
  csv_detective/validate.py,sha256=7k0GC5AsTn5BbsRChetZZDmnTGiYLe40qPKiP3GruYs,7495
7
7
  csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
8
  csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvcaGd0Go,2887
9
9
  csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
10
10
  csv_detective/detection/engine.py,sha256=wQeDKpp2DKF-HcS1R8H6GgQyaUgQme4szPtEHgAjBII,1552
11
- csv_detective/detection/formats.py,sha256=9aIE4gwTN8c8pa-kofeJ7zalo8NqjGZabYD-G79kV5I,4734
11
+ csv_detective/detection/formats.py,sha256=cgECpxRaygwnedPhOteG1P_697qCoceeDrKK9G_O-u8,4812
12
12
  csv_detective/detection/headers.py,sha256=lnbWRxkI6rdyoWGtmxSfsPkqNjS0Nlpgw-pVevtmBP0,899
13
13
  csv_detective/detection/rows.py,sha256=JQsmKP8-i8wzcZIWI_13LUer5mpYRIqaKg6qW01ZO3A,750
14
14
  csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
@@ -80,17 +80,17 @@ csv_detective/output/profile.py,sha256=R9YMl-dANde69RXkFlZpvMDBsX7e1SyMAnlW8p1XN
80
80
  csv_detective/output/schema.py,sha256=XoKljXPXP00DfqPCiz1ydwTHYGAFsvNxnaPCNBuuBIo,10443
81
81
  csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
82
82
  csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
83
- csv_detective/parsing/columns.py,sha256=H_dKHhSgQMIiOfxibnGl6HwTW9bRwGtIeUcYBN13-3A,9245
83
+ csv_detective/parsing/columns.py,sha256=ExabNAzQB-IvQfjmVLClBfj1GMkbAQxUUnournFpdy8,9350
84
84
  csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
85
- csv_detective/parsing/csv.py,sha256=5rw6gXZFQC1T4NT9CnW0AumidrYOkF8kjrfWGmk949I,1716
85
+ csv_detective/parsing/csv.py,sha256=vfAHkpgzLkzeUXWVqrWlttZ1y-Hql0GNFSZZrA366mk,1714
86
86
  csv_detective/parsing/excel.py,sha256=pX6dbhAdAdbRpoGcrGsL1lSaF-fbzEb4WcvwcCGEgFw,6978
87
87
  csv_detective/parsing/load.py,sha256=1Fk43ikIOJwtWJUY-e8oNeNOk4MMtpmZV7s-VbQBS1k,4345
88
88
  csv_detective/parsing/text.py,sha256=yDAcop5xJQc25UtbZcV0guHXAZQfm-H8WuJORTy8Rr8,1734
89
- csv_detective-0.10.3.dev7.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
89
+ csv_detective-0.10.4.dev2.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
90
90
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
91
91
  tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
92
92
  tests/test_fields.py,sha256=DSI-ZXDcRt69iZArYZZAr_3OEb-qvwgOVBZxmYAKIkI,5918
93
- tests/test_file.py,sha256=9Zne9ULDqkr-ajgc03lEMEod4d71Y-UDY4ckT6FFw_I,15205
93
+ tests/test_file.py,sha256=Ov9NGvZQxeoehxTpfcsnwEybebM0tnbmcRsFwe46cjg,15277
94
94
  tests/test_labels.py,sha256=lgxRbLrGV1C-MkASf3KIQ120BG-UHzFQ4pqDWaeBvaw,539
95
95
  tests/test_structure.py,sha256=XDbviuuvk-0Mu9Y9PI6He2e5hry2dXVJ6yBVwEqF_2o,1043
96
96
  tests/test_validation.py,sha256=309k3Axgbp-1Wh6qvCj2BpeMBp3HXzLi5j9UKm1bRQs,5384
@@ -104,8 +104,8 @@ tests/data/file.ods,sha256=4dR7zWptz5djALIBVeWHQ20GaZNfA63fevIJGFIk1_U,11832
104
104
  tests/data/file.xls,sha256=QYmNX3FF0QfcQSzYQMtaMJaepJf5EZpDa1miKc4wMdQ,21495
105
105
  tests/data/file.xlsx,sha256=naWzL02PK4pdIjMzfEyfSW9GQhkYYd_e7bpJvB8Pb2w,8314
106
106
  tests/data/xlsx_file,sha256=NyOyN_rIe7ryJuHQLqjxVdKCc8V4s5pxyHl6wWFykCM,8305
107
- csv_detective-0.10.3.dev7.dist-info/METADATA,sha256=Kj-Nd01oXyqZpNwaVNJ9CwRcYYX80YCKtIhsijoO89A,10925
108
- csv_detective-0.10.3.dev7.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
109
- csv_detective-0.10.3.dev7.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
110
- csv_detective-0.10.3.dev7.dist-info/top_level.txt,sha256=KDI4gyOpkmormGgUvSWrE3jen2e0unIsxR2b96DRvcw,25
111
- csv_detective-0.10.3.dev7.dist-info/RECORD,,
107
+ csv_detective-0.10.4.dev2.dist-info/METADATA,sha256=52ZirEUs9m49EuivGtEmtr_p2h_wVsSRbyQTs0SfoWE,10925
108
+ csv_detective-0.10.4.dev2.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
109
+ csv_detective-0.10.4.dev2.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
110
+ csv_detective-0.10.4.dev2.dist-info/top_level.txt,sha256=KDI4gyOpkmormGgUvSWrE3jen2e0unIsxR2b96DRvcw,25
111
+ csv_detective-0.10.4.dev2.dist-info/RECORD,,
tests/test_file.py CHANGED
@@ -388,10 +388,11 @@ def test_almost_uniform_column(mocked_responses):
388
388
  assert analysis["columns"][col_name]["format"] == "int"
389
389
 
390
390
 
391
- def test_full_nan_column(mocked_responses):
391
+ @pytest.mark.parametrize("nb_rows", (CHUNK_SIZE // 10, CHUNK_SIZE + 1))
392
+ def test_full_nan_column(mocked_responses, nb_rows):
392
393
  # we want a file that needs sampling
393
394
  col_name = "only_nan"
394
- expected_content = f"{col_name},second_col\n" + ",1\n" * (CHUNK_SIZE + 1)
395
+ expected_content = f"{col_name},second_col\n" + ",1\n" * nb_rows
395
396
  mocked_responses.get(
396
397
  "http://example.com/test.csv",
397
398
  body=expected_content,