csv-detective 0.10.3.dev4__py3-none-any.whl → 0.10.3.dev5__py3-none-any.whl

csv_detective/explore_csv.py CHANGED
@@ -142,20 +142,19 @@ def validate_then_detect(
     if is_url(file_path):
         logging.info("Path recognized as a URL")

-    is_valid, table, analysis, col_values = validate(
+    is_valid, analysis, col_values = validate(
         file_path=file_path,
         previous_analysis=previous_analysis,
         verbose=verbose,
         skipna=skipna,
     )
-    if analysis is None:
-        # if loading failed in validate, we load it from scratch
+    if not is_valid:
+        # if loading failed in validate, we load it from scratch and initiate an analysis
         table, analysis = load_file(
             file_path=file_path,
             num_rows=num_rows,
             verbose=verbose,
         )
-    if not is_valid:
         analysis, col_values = detect_formats(
             table=table,
             analysis=analysis,
@@ -165,6 +164,18 @@ def validate_then_detect(
             skipna=skipna,
             verbose=verbose,
         )
+    else:
+        # successful validation means we have a correct analysis and col_values
+        # only need to reload the table, and we already know how
+        table, _ = load_file(
+            file_path=file_path,
+            num_rows=num_rows,
+            verbose=verbose,
+            sep=analysis.get("separator"),
+            encoding=analysis.get("encoding"),
+            engine=analysis.get("engine"),
+            sheet_name=analysis.get("sheet_name"),
+        )
     try:
         return generate_output(
             table=table,
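
For context: `validate()` now returns a 3-tuple without the table, and callers reload the table themselves once validation succeeds. A minimal sketch of the new calling convention (illustrative only; module paths taken from the RECORD below, file paths from the package's test fixtures):

import json

from csv_detective.parsing.load import load_file
from csv_detective.validate import validate

with open("tests/data/a_test_file.json") as f:
    previous_analysis = json.load(f)

# the table is no longer part of validate()'s return value
is_valid, analysis, col_values = validate(
    file_path="tests/data/a_test_file.csv",
    previous_analysis=previous_analysis,
)
if is_valid:
    # validation succeeded: the analysis holds the known parsing parameters,
    # so the table can be reloaded without re-detecting them
    table, _ = load_file(
        file_path="tests/data/a_test_file.csv",
        sep=analysis.get("separator"),
        encoding=analysis.get("encoding"),
        engine=analysis.get("engine"),
        sheet_name=analysis.get("sheet_name"),
    )
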
csv_detective/format.py CHANGED
@@ -27,7 +27,7 @@ class Format:
         tags: to allow users to submit a file to only a subset of formats
         """
         self.name: str = name
-        self.func: Callable = func
+        self.func: Callable[[Any], bool] = func
         self._test_values: dict[bool, list[str]] = _test_values
         self.labels: dict[str, float] = labels
         self.proportion: float = proportion
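
The tightened annotation documents the contract a format checker must satisfy: one value in, a boolean out. A hypothetical checker matching `Callable[[Any], bool]` (illustrative, not taken from the package):

from typing import Any

def is_int(value: Any) -> bool:
    # one value in, True/False out: the shape Format.func now declares
    try:
        int(str(value))
        return True
    except ValueError:
        return False
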
csv_detective/output/profile.py CHANGED
@@ -23,7 +23,7 @@ def create_profile(
     logging.info("Creating profile")

     if num_rows > 0:
-        raise ValueError("To create profiles num_rows has to be set to -1")
+        raise ValueError("To create profile `num_rows` must be set to -1")
     if not limited_output:
         columns = {
             k: v[0] if v else {"python_type": "string", "format": "string", "score": 1.0}
csv_detective/parsing/columns.py CHANGED
@@ -33,7 +33,7 @@ def test_col_val(

     try:
         if skipna:
-            serie = serie.loc[serie.notnull()]
+            serie = serie.dropna()
         ser_len = len(serie)
         if ser_len == 0:
             # being here means the whole column is NaN, so if skipna it's a pass
csv_detective/parsing/excel.py CHANGED
@@ -23,7 +23,7 @@ def parse_excel(
     file_path: str,
     num_rows: int = -1,
     engine: str | None = None,
-    sheet_name: str | None = None,
+    sheet_name: str | int | None = None,
     random_state: int = 42,
     verbose: bool = False,
 ) -> tuple[pd.DataFrame, int, int, str, str, int]:
csv_detective/parsing/load.py CHANGED
@@ -28,12 +28,12 @@ def load_file(
     encoding: str | None = None,
     sep: str | None = None,
     verbose: bool = False,
+    engine: str | None = None,
     sheet_name: str | int | None = None,
 ) -> tuple[pd.DataFrame, dict]:
     file_name = file_path.split("/")[-1]
-    engine = None
-    if "." not in file_name or not file_name.endswith("csv"):
-        # file has no extension, we'll investigate how to read it
+    if ("." not in file_name or not file_name.endswith("csv")) and engine is None and sep is None:
+        # file has no extension and we don't have insights from arguments, we'll investigate how to read it
         engine = detect_engine(file_path, verbose=verbose)

     if engine in EXCEL_ENGINES or any([file_path.endswith(k) for k in XLS_LIKE_EXT]):
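
With the reworked guard, engine detection only runs when the caller gives no hints: passing `engine` or `sep` short-circuits `detect_engine`. A hedged usage sketch (the engine value below is an assumption, not taken from the diff):

from csv_detective.parsing.load import load_file

# extension-less Excel fixture listed in RECORD; the caller already knows
# how to read it, so the new guard skips detect_engine() entirely
table, analysis = load_file(
    file_path="tests/data/xlsx_file",
    engine="openpyxl",  # assumed engine name; any engine accepted by parse_excel
)
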
csv_detective/validate.py CHANGED
@@ -1,10 +1,13 @@
 import logging
+from collections import defaultdict

 import pandas as pd

 from csv_detective.format import FormatsManager
 from csv_detective.parsing.columns import MAX_NUMBER_CATEGORICAL_VALUES, test_col_val

+# VALIDATION_CHUNK_SIZE is bigger than (analysis) CHUNK_SIZE because
+# it's faster to validate so we can afford to load more rows
 VALIDATION_CHUNK_SIZE = int(1e5)
 logging.basicConfig(level=logging.INFO)

@@ -16,9 +19,9 @@ def validate(
     previous_analysis: dict,
     verbose: bool = False,
     skipna: bool = True,
-) -> tuple[bool, pd.DataFrame | None, dict | None, dict[str, pd.Series] | None]:
+) -> tuple[bool, dict | None, dict[str, pd.Series] | None]:
     """
-    Verify is the given file has the same fields and types as in the given analysis.
+    Verify is the given file has the same fields and formats as in the given analysis.

     Args:
         file_path: the path of the file to validate
@@ -26,6 +29,15 @@ def validate(
         verbose: whether the code displays the steps it's going through
         skipna: whether to ignore NaN values in the checks
     """
+    if verbose:
+        logging.info(f"Checking given formats exist")
+    for col_name, detected in previous_analysis["columns"].items():
+        if detected["format"] == "string":
+            continue
+        elif detected["format"] not in formats:
+            if verbose:
+                logging.warning(f"> Unknown format `{detected['format']}` in analysis")
+            return False, None, None
     try:
         if previous_analysis.get("separator"):
             # loading the table in chunks
@@ -58,77 +70,94 @@ def validate(
             ]
         )
         analysis = {k: v for k, v in previous_analysis.items() if k in ["engine", "sheet_name"]}
-        first_chunk = next(chunks)
         analysis.update(
             {k: v for k, v in previous_analysis.items() if k in ["header_row_idx", "header"]}
         )
     except Exception as e:
         if verbose:
             logging.warning(f"> Could not load the file with previous analysis values: {e}")
-        return False, None, None, None
+        return False, None, None
     if verbose:
         logging.info("Comparing table with the previous analysis")
-        logging.info("- Checking if all columns match")
-    if len(first_chunk.columns) != len(previous_analysis["header"]) or any(
-        list(first_chunk.columns)[k] != previous_analysis["header"][k]
-        for k in range(len(previous_analysis["header"]))
-    ):
-        if verbose:
-            logging.warning("> Columns do not match, proceeding with full analysis")
-        return False, None, None, None
-    if verbose:
         logging.info(
             f"Testing previously detected formats on chunks of {VALIDATION_CHUNK_SIZE} rows"
         )

-    # hashing rows to get nb_duplicates
-    row_hashes_count = pd.util.hash_pandas_object(first_chunk, index=False).value_counts()
-    # getting values for profile to read the file only once
-    col_values = {col: first_chunk[col].value_counts(dropna=False) for col in first_chunk.columns}
+    # will contain hashes of each row of the file as index and the number of times
+    # each hash was seen as values; used to compute nb_duplicates
+    row_hashes_count = pd.Series()
+    # will contain the number of times each value of each column is seen in the whole file
+    # used for profile to read the file only once
+    # naming it "count" to be iso with how col_values are made in detect_formats
+    col_values: defaultdict[str, pd.Series] = defaultdict(lambda: pd.Series(name="count"))
     analysis["total_lines"] = 0
-    for idx, chunk in enumerate([first_chunk, *chunks]):
+    checked_values: dict[str, int] = {col_name: 0 for col_name in previous_analysis["columns"]}
+    valid_values: dict[str, int] = {col_name: 0 for col_name in previous_analysis["columns"]}
+    for idx, chunk in enumerate(chunks):
         if verbose:
-            logging.info(f"> Testing chunk number {idx}")
+            logging.info(f"- Testing chunk number {idx}")
+        if idx == 0:
+            if verbose:
+                logging.info("Checking if all columns match")
+            if len(chunk.columns) != len(previous_analysis["header"]) or any(
+                list(chunk.columns)[k] != previous_analysis["header"][k]
+                for k in range(len(previous_analysis["header"]))
+            ):
+                if verbose:
+                    logging.warning("> Columns in the file do not match those of the analysis")
+                return False, None, None
         analysis["total_lines"] += len(chunk)
         row_hashes_count = row_hashes_count.add(
             pd.util.hash_pandas_object(chunk, index=False).value_counts(),
             fill_value=0,
         )
-        for col in chunk.columns:
-            col_values[col] = col_values[col].add(
-                chunk[col].value_counts(dropna=False),
-                fill_value=0,
-            )
         for col_name, detected in previous_analysis["columns"].items():
             if verbose:
                 logging.info(f"- Testing {col_name} for {detected['format']}")
             if detected["format"] == "string":
                 # no test for columns that have not been recognized as a specific format
                 continue
-            if detected["format"] not in formats:
+            to_check = chunk[col_name].dropna() if skipna else chunk[col_name]
+            chunk_valid_values = sum(to_check.apply(formats[detected["format"]].func))
+            if formats[detected["format"]].proportion == 1 and chunk_valid_values < len(to_check):
+                # we can early stop in this case, not all values are valid while we want 100%
                 if verbose:
                     logging.warning(
-                        f"> Unknown format `{detected['format']}`, proceeding with full analysis"
+                        f"> Test failed for column {col_name} with format {detected['format']}"
                     )
-                return False, first_chunk, analysis, None
-            test_result: float = test_col_val(
-                serie=chunk[col_name],
-                format=formats[detected["format"]],
-                skipna=skipna,
-            )
-            if not bool(test_result):
-                if verbose:
-                    logging.warning("> Test failed, proceeding with full analysis")
-                return False, first_chunk, analysis, None
+                return False, None, None
+            checked_values[col_name] += len(to_check)
+            valid_values[col_name] += chunk_valid_values
+            col_values[col_name] = (
+                col_values[col_name]
+                .add(
                    chunk[col_name].value_counts(dropna=False),
+                    fill_value=0,
+                )
+                .rename_axis(col_name)
+            )  # rename_axis because *sometimes* pandas doesn't pass on the column's name ¯\_(ツ)_/¯
+        del chunk
+    # finally we loop through the formats that accept less than 100% valid values to check the proportion
+    for col_name, detected in previous_analysis["columns"].items():
+        if (
+            checked_values[col_name] > 0
+            and valid_values[col_name] / checked_values[col_name]
+            < formats[detected["format"]].proportion
+        ):
+            if verbose:
+                logging.warning(
+                    f"> Test failed for column {col_name} with format {detected['format']}"
+                )
+            return False, None, None
     if verbose:
         logging.info("> All checks successful")
     analysis["nb_duplicates"] = sum(row_hashes_count > 1)
+    del row_hashes_count
     analysis["categorical"] = [
         col for col, values in col_values.items() if len(values) <= MAX_NUMBER_CATEGORICAL_VALUES
     ]
     return (
         True,
-        first_chunk,
         analysis
         | {
             k: previous_analysis[k]
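
The rewritten loop boils down to: accumulate per-column counts of checked and valid values across chunks, fail fast when a format demands 100% validity, and compare the accumulated proportion once at the end otherwise. A standalone sketch of that strategy (illustrative, independent of the package's internals):

import pandas as pd

def validate_column(chunks, is_valid_fn, required_proportion: float) -> bool:
    checked = valid = 0
    for chunk in chunks:
        to_check = chunk.dropna()
        chunk_valid = int(to_check.apply(is_valid_fn).sum())
        if required_proportion == 1 and chunk_valid < len(to_check):
            # early stop: a single invalid value fails a 100%-proportion format
            return False
        checked += len(to_check)
        valid += chunk_valid
    # permissive formats are only judged on the whole-file proportion
    return checked == 0 or valid / checked >= required_proportion

# a siren-like check (9 digits) tolerating 10% invalid values:
chunks = [pd.Series(["130025265"] * 9 + ["oops"]), pd.Series(["130025265"] * 10)]
assert validate_column(chunks, lambda v: str(v).isdigit() and len(str(v)) == 9, 0.9)
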
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: csv-detective
-Version: 0.10.3.dev4
+Version: 0.10.3.dev5
 Summary: Detect tabular files column content
 Author-email: "data.gouv.fr" <opendatateam@data.gouv.fr>
 License: MIT
@@ -1,9 +1,9 @@
 csv_detective/__init__.py,sha256=zlYElTOp_I2_VG7ZdOTuAu0wuCXSc0cr3sH6gtk2bcg,152
 csv_detective/cli.py,sha256=mu5anmBmaDk52_uZGiA4T37wYZCuV43gZAepjs1Cqzc,1389
-csv_detective/explore_csv.py,sha256=qSf6N3tbp43BUMJF5wiXz3aYKaTez6ro-75KL2Arci4,7174
-csv_detective/format.py,sha256=VglcxWBmjTvWNMhwSUZDfMdJcK9lAUum64Jxvm70AJ4,2898
+csv_detective/explore_csv.py,sha256=M8jabAP08raPY438v5UeBqJy3bBudTeuo-UNe2unWyE,7639
+csv_detective/format.py,sha256=VTdwg4gp9pq6WYhbkCxv9X2hXq0fMrzfooFchmIL0as,2911
 csv_detective/utils.py,sha256=RJ_zFOJ1DRY8HtDrKPiCdNk5gU6-KwOrOKOyfSkBZZY,1118
-csv_detective/validate.py,sha256=QBJhwHP0U0Ux7ODGV6foqNGm-DlbECIo6jUsBFOdDr0,5739
+csv_detective/validate.py,sha256=7k0GC5AsTn5BbsRChetZZDmnTGiYLe40qPKiP3GruYs,7495
 csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvcaGd0Go,2887
 csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
@@ -76,24 +76,24 @@ csv_detective/formats/data/iso_country_code_numeric.txt,sha256=sdGpn0PqDMlc59-7p
 csv_detective/output/__init__.py,sha256=ALSq_tgX7rGyh--7rmbKz8wHkmResN0h7mNujndow3w,2103
 csv_detective/output/dataframe.py,sha256=juBMdj0eiL8c3OrJJ3kCf15Qs4-CFQfHqh91FnVbG9E,3656
 csv_detective/output/example.py,sha256=8LWheSBYCeDFfarbnmzBrdCbTd8Alh1U4pfXMKfabOw,8630
-csv_detective/output/profile.py,sha256=ADr5DwuvwcBYxugjN38fHm11l6ivfzGHXPd8a87Ht-s,4985
+csv_detective/output/profile.py,sha256=R9YMl-dANde69RXkFlZpvMDBsX7e1SyMAnlW8p1XNNM,4984
 csv_detective/output/schema.py,sha256=XoKljXPXP00DfqPCiz1ydwTHYGAFsvNxnaPCNBuuBIo,10443
 csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
 csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-csv_detective/parsing/columns.py,sha256=rb5JywbKnYCT3Jb0ZaG1BnyPVtB3gy5mSD-K7qcOl8I,9257
+csv_detective/parsing/columns.py,sha256=H_dKHhSgQMIiOfxibnGl6HwTW9bRwGtIeUcYBN13-3A,9245
 csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
 csv_detective/parsing/csv.py,sha256=5rw6gXZFQC1T4NT9CnW0AumidrYOkF8kjrfWGmk949I,1716
-csv_detective/parsing/excel.py,sha256=tb65I78tdYlZci_tzvvQt8U6bZSYKjeVdn2CEvsET1o,6972
-csv_detective/parsing/load.py,sha256=uWX4r_2K9bf-9qKL6IaiGNkiSPnSSNqng420zdrFkDg,4371
+csv_detective/parsing/excel.py,sha256=pX6dbhAdAdbRpoGcrGsL1lSaF-fbzEb4WcvwcCGEgFw,6978
+csv_detective/parsing/load.py,sha256=pZ9ub47s0GO39F5-0D7KZhWQRAjMg8L8ljqDIRDjWg8,4463
 csv_detective/parsing/text.py,sha256=yDAcop5xJQc25UtbZcV0guHXAZQfm-H8WuJORTy8Rr8,1734
-csv_detective-0.10.3.dev4.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
+csv_detective-0.10.3.dev5.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
 tests/test_fields.py,sha256=DSI-ZXDcRt69iZArYZZAr_3OEb-qvwgOVBZxmYAKIkI,5918
 tests/test_file.py,sha256=9Zne9ULDqkr-ajgc03lEMEod4d71Y-UDY4ckT6FFw_I,15205
 tests/test_labels.py,sha256=lgxRbLrGV1C-MkASf3KIQ120BG-UHzFQ4pqDWaeBvaw,539
 tests/test_structure.py,sha256=XDbviuuvk-0Mu9Y9PI6He2e5hry2dXVJ6yBVwEqF_2o,1043
-tests/test_validation.py,sha256=9djBT-PDhu_563OFgWyE20o-wPEWEIQGXp6Pjh0_MQM,3463
+tests/test_validation.py,sha256=309k3Axgbp-1Wh6qvCj2BpeMBp3HXzLi5j9UKm1bRQs,5384
 tests/data/a_test_file.csv,sha256=SOHjseGYqZer9yu3Bd3oS12Vw8MFsebo0BzrLZ_R4Cc,68871
 tests/data/a_test_file.json,sha256=fB9bCpAMFPxFw8KxHRFlgRqjYG819QVGrCQWxQvwkvo,10542
 tests/data/b_test_file.csv,sha256=wJGX62KhYjZi62De2XjZWClAzeRFEBsg3ET0IPX1BNU,98
@@ -104,8 +104,8 @@ tests/data/file.ods,sha256=4dR7zWptz5djALIBVeWHQ20GaZNfA63fevIJGFIk1_U,11832
 tests/data/file.xls,sha256=QYmNX3FF0QfcQSzYQMtaMJaepJf5EZpDa1miKc4wMdQ,21495
 tests/data/file.xlsx,sha256=naWzL02PK4pdIjMzfEyfSW9GQhkYYd_e7bpJvB8Pb2w,8314
 tests/data/xlsx_file,sha256=NyOyN_rIe7ryJuHQLqjxVdKCc8V4s5pxyHl6wWFykCM,8305
-csv_detective-0.10.3.dev4.dist-info/METADATA,sha256=FRE6DkDtXSuva4aVTV1ws8tj53w03fjesToRlgh_78s,11082
-csv_detective-0.10.3.dev4.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-csv_detective-0.10.3.dev4.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
-csv_detective-0.10.3.dev4.dist-info/top_level.txt,sha256=KDI4gyOpkmormGgUvSWrE3jen2e0unIsxR2b96DRvcw,25
-csv_detective-0.10.3.dev4.dist-info/RECORD,,
+csv_detective-0.10.3.dev5.dist-info/METADATA,sha256=TxQwNe_bPUxntht3aTRh0ct8I8J8NJOLv2ysXjpPLxA,11082
+csv_detective-0.10.3.dev5.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+csv_detective-0.10.3.dev5.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+csv_detective-0.10.3.dev5.dist-info/top_level.txt,sha256=KDI4gyOpkmormGgUvSWrE3jen2e0unIsxR2b96DRvcw,25
+csv_detective-0.10.3.dev5.dist-info/RECORD,,
tests/test_validation.py CHANGED
@@ -1,4 +1,5 @@
 import json
+from unittest.mock import MagicMock, patch

 import pandas as pd
 import pytest
@@ -26,12 +27,12 @@ def get_nested_value(source_dict: dict, key_chain: list[str]):
 @pytest.mark.parametrize(
     "_params",
     (
-        ((True, pd.DataFrame, dict), {}),
-        ((False, None, None), {"separator": "|"}),
-        ((False, None, None), {"encoding": "unknown"}),
-        ((False, None, None), {"header": ["a", "b"]}),
+        ((True, dict), {}),
+        ((False, None), {"separator": "|"}),
+        ((False, None), {"encoding": "unknown"}),
+        ((False, None), {"header": ["a", "b"]}),
         (
-            (False, pd.DataFrame, dict),
+            (False, None),
             {
                 "columns.NUMCOM": {
                     "python_type": "int",
@@ -43,35 +44,89 @@ def get_nested_value(source_dict: dict, key_chain: list[str]):
     ),
 )
 def test_validation(_params):
-    (should_be_valid, table_type, analysis_type), modif_previous_analysis = _params
+    (should_be_valid, analysis_type), modif_previous_analysis = _params
     with open("tests/data/a_test_file.json", "r") as f:
         previous_analysis = json.load(f)
     for dotkey in modif_previous_analysis:
         keys = dotkey.split(".")
         set_nested_value(previous_analysis, keys, modif_previous_analysis[dotkey])
-    is_valid, table, analysis, col_values = validate(
+    is_valid, analysis, col_values = validate(
         "tests/data/a_test_file.csv",
         previous_analysis=previous_analysis,
     )
     assert is_valid == should_be_valid
-    if table_type is None:
-        assert table is None
-    else:
-        assert isinstance(table, table_type)
     if analysis_type is None:
         assert analysis is None
     else:
         assert isinstance(analysis, analysis_type)
     if should_be_valid:
         assert isinstance(col_values, dict)
-        assert all(
-            col in table.columns and isinstance(values, pd.Series)
-            for col, values in col_values.items()
-        )
     else:
         assert col_values is None


+@pytest.mark.parametrize(
+    "_params",
+    (
+        # int: proportion = 1, should fail (early)
+        ("12", "1.2", {"python_type": "int", "format": "int", "score": 1.5}, False),
+        # siren: proportion = 0.9, should fail (later)
+        (
+            "130025265",
+            "A13794BC",
+            {"python_type": "string", "format": "siren", "score": 1.5},
+            False,
+        ),
+        # siret: proportion = 0.8, should succeed
+        (
+            "13002526500013",
+            "A13794BC",
+            {"python_type": "string", "format": "siret", "score": 1.5},
+            True,
+        ),
+    ),
+)
+def test_validation_with_proportions(_params):
+    # testing the behaviour for a file that has 15% invalid values, but all in a single chunk
+    valid_value, invalid_value, detected, should_be_valid = _params
+    url = f"http://example.com/test.csv"
+    expected_content = "col\n"
+    for _ in range(60):
+        # 60 rows of valid values
+        expected_content += f"{valid_value}\n"
+    for _ in range(15):
+        # 15 rows of invalid values
+        expected_content += f"{invalid_value}\n"
+    for _ in range(25):
+        # 25 rows of valid values
+        expected_content += f"{valid_value}\n"
+    previous_analysis = {
+        "encoding": "utf-8",
+        "separator": ",",
+        "header_row_idx": 0,
+        "header": ["col"],
+        "columns": {"col": detected},
+        # just setting these keys when validation is successful, they're not used for the validation itself
+        "categorical": [],
+        "columns_fields": {},
+        "columns_labels": {},
+        "formats": {},
+    }
+    with (
+        patch("urllib.request.urlopen") as mock_urlopen,
+        patch("csv_detective.validate.VALIDATION_CHUNK_SIZE", 10),
+    ):
+        mock_response = MagicMock()
+        mock_response.read.return_value = expected_content.encode("utf-8")
+        mock_response.__enter__.return_value = mock_response
+        mock_urlopen.return_value = mock_response
+        is_valid, *_ = validate(
+            file_path=url,
+            previous_analysis=previous_analysis,
+        )
+    assert is_valid == should_be_valid
+
+
 @pytest.mark.parametrize(
     "modif_previous_analysis",
     (