csv-detective 0.10.3__py3-none-any.whl → 0.10.3.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,22 +5,24 @@ from typing import TextIO
 from csv_detective.utils import display_logs_depending_process_time
 
 
-def detect_header_position(file: TextIO, verbose: bool = False) -> int:
+def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int, list | None]:
     """Tests 10 first rows for possible header (in case header is not 1st row)"""
     if verbose:
         start = time()
-    logging.info("Detecting header position")
+    logging.info("Detecting headers")
     file.seek(0)
     for i in range(10):
         row = file.readline()
         position = file.tell()
-        next_row = file.readline()
-        file.seek(position)
-        if row != next_row:
-            if verbose:
-                display_logs_depending_process_time(
-                    f"Detected header position in {round(time() - start, 3)}s",
-                    time() - start,
-                )
-            return i
-    raise ValueError("Could not accurately retrieve headers position")
+        headers = [c for c in row.replace("\n", "").split(sep) if c]
+        if not any(col == "" for col in headers):
+            next_row = file.readline()
+            file.seek(position)
+            if row != next_row:
+                if verbose:
+                    display_logs_depending_process_time(
+                        f"Detected headers in {round(time() - start, 3)}s",
+                        time() - start,
+                    )
+                return i, headers
+    raise ValueError("Could not retrieve headers")
@@ -2,7 +2,7 @@ import pandas as pd
 
 
 def remove_empty_first_rows(table: pd.DataFrame) -> tuple[pd.DataFrame, int]:
-    """Analog process to detect_header_position for csv files, determines how many rows to skip
+    """Analog process to detect_headers for csv files, determines how many rows to skip
     to end up with the header at the right place"""
     idx = 0
     if all([str(c).startswith("Unnamed:") for c in table.columns]):
@@ -142,19 +142,20 @@ def validate_then_detect(
     if is_url(file_path):
         logging.info("Path recognized as a URL")
 
-    is_valid, analysis, col_values = validate(
+    is_valid, table, analysis, col_values = validate(
         file_path=file_path,
         previous_analysis=previous_analysis,
         verbose=verbose,
         skipna=skipna,
     )
-    if not is_valid:
-        # if loading failed in validate, we load it from scratch and initiate an analysis
+    if analysis is None:
+        # if loading failed in validate, we load it from scratch
         table, analysis = load_file(
             file_path=file_path,
             num_rows=num_rows,
             verbose=verbose,
         )
+    if not is_valid:
         analysis, col_values = detect_formats(
             table=table,
             analysis=analysis,
@@ -164,18 +165,6 @@ def validate_then_detect(
             skipna=skipna,
             verbose=verbose,
         )
-    else:
-        # successful validation means we have a correct analysis and col_values
-        # only need to reload the table, and we already know how
-        table, _ = load_file(
-            file_path=file_path,
-            num_rows=num_rows,
-            verbose=verbose,
-            sep=analysis.get("separator"),
-            encoding=analysis.get("encoding"),
-            engine=analysis.get("engine"),
-            sheet_name=analysis.get("sheet_name"),
-        )
     try:
         return generate_output(
             table=table,
csv_detective/format.py CHANGED
@@ -27,7 +27,7 @@ class Format:
         tags: to allow users to submit a file to only a subset of formats
         """
         self.name: str = name
-        self.func: Callable[[Any], bool] = func
+        self.func: Callable = func
         self._test_values: dict[bool, list[str]] = _test_values
         self.labels: dict[str, float] = labels
         self.proportion: float = proportion
@@ -14,8 +14,8 @@ from csv_detective.utils import display_logs_depending_process_time
 
 
 def cast(value: str, _type: str) -> str | int | float | bool | date | datetime | bytes | None:
-    if not isinstance(value, str) or value in pd._libs.parsers.STR_NA_VALUES:
-        # STR_NA_VALUES are directly ingested as NaN by pandas, we avoid trying to cast them (into int for instance)
+    if not isinstance(value, str) or not value:
+        # None is the current default value in hydra, should we keep this?
         return None
     match _type:
         case "string":
@@ -23,7 +23,7 @@ def create_profile(
     logging.info("Creating profile")
 
     if num_rows > 0:
-        raise ValueError("To create profile `num_rows` must be set to -1")
+        raise ValueError("To create profiles num_rows has to be set to -1")
     if not limited_output:
         columns = {
             k: v[0] if v else {"python_type": "string", "format": "string", "score": 1.0}
@@ -33,7 +33,7 @@ def test_col_val(
 
     try:
         if skipna:
-            serie = serie.dropna()
+            serie = serie.loc[serie.notnull()]
         ser_len = len(serie)
         if ser_len == 0:
             # being here means the whole column is NaN, so if skipna it's a pass
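For this use the replacement is behaviour-preserving: boolean indexing on notnull() keeps exactly the rows that dropna() keeps, original index labels included. A quick illustrative check:

```python
import numpy as np
import pandas as pd

serie = pd.Series([1.0, np.nan, 3.0])
# both forms drop the NaN row and keep the surviving index labels (0 and 2)
assert serie.dropna().equals(serie.loc[serie.notnull()])
```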
@@ -23,7 +23,7 @@ def parse_excel(
     file_path: str,
     num_rows: int = -1,
     engine: str | None = None,
-    sheet_name: str | int | None = None,
+    sheet_name: str | None = None,
     random_state: int = 42,
     verbose: bool = False,
 ) -> tuple[pd.DataFrame, int, int, str, str, int]:
@@ -11,7 +11,7 @@ from csv_detective.detection.engine import (
     EXCEL_ENGINES,
     detect_engine,
 )
-from csv_detective.detection.headers import detect_header_position
+from csv_detective.detection.headers import detect_headers
 from csv_detective.detection.separator import detect_separator
 from csv_detective.parsing.compression import unzip
 from csv_detective.parsing.csv import parse_csv
@@ -28,12 +28,12 @@ def load_file(
     encoding: str | None = None,
     sep: str | None = None,
     verbose: bool = False,
-    engine: str | None = None,
     sheet_name: str | int | None = None,
 ) -> tuple[pd.DataFrame, dict]:
     file_name = file_path.split("/")[-1]
-    if ("." not in file_name or not file_name.endswith("csv")) and engine is None and sep is None:
-        # file has no extension and we don't have insights from arguments, we'll investigate how to read it
+    engine = None
+    if "." not in file_name or not file_name.endswith("csv"):
+        # file has no extension, we'll investigate how to read it
         engine = detect_engine(file_path, verbose=verbose)
 
     if engine in EXCEL_ENGINES or any([file_path.endswith(k) for k in XLS_LIKE_EXT]):
@@ -46,6 +46,9 @@
         )
         if table.empty:
             raise ValueError("Table seems to be empty")
+        header = table.columns.to_list()
+        if any(col.startswith("Unnamed") for col in header):
+            raise ValueError("Could not retrieve headers")
         analysis = {
             "engine": engine,
             "sheet_name": sheet_name,
@@ -80,7 +83,9 @@ def load_file(
         str_file = open(file_path, "r", encoding=encoding)
         if sep is None:
             sep = detect_separator(str_file, verbose=verbose)
-        header_row_idx = detect_header_position(str_file, verbose=verbose)
+        header_row_idx, header = detect_headers(str_file, sep, verbose=verbose)
+        if header is None or (isinstance(header, list) and any([h is None for h in header])):
+            raise ValueError("Could not retrieve headers")
         heading_columns = detect_heading_columns(str_file, sep, verbose=verbose)
         trailing_columns = detect_trailing_columns(str_file, sep, heading_columns, verbose=verbose)
         table, total_lines, nb_duplicates = parse_csv(
@@ -97,11 +102,9 @@ def load_file(
         }
         if engine is not None:
             analysis["compression"] = engine
-    if any(not isinstance(col, str) or col.startswith("Unnamed:") for col in table.columns):
-        raise ValueError("Could not accurately detect the file's columns")
     analysis |= {
         "header_row_idx": header_row_idx,
-        "header": list(table.columns),
+        "header": header,
     }
     if total_lines is not None:
         analysis["total_lines"] = total_lines
csv_detective/validate.py CHANGED
@@ -1,13 +1,10 @@
 import logging
-from collections import defaultdict
 
 import pandas as pd
 
 from csv_detective.format import FormatsManager
 from csv_detective.parsing.columns import MAX_NUMBER_CATEGORICAL_VALUES, test_col_val
 
-# VALIDATION_CHUNK_SIZE is bigger than (analysis) CHUNK_SIZE because
-# it's faster to validate so we can afford to load more rows
 VALIDATION_CHUNK_SIZE = int(1e5)
 logging.basicConfig(level=logging.INFO)
 
@@ -19,9 +16,9 @@ def validate(
     previous_analysis: dict,
     verbose: bool = False,
     skipna: bool = True,
-) -> tuple[bool, dict | None, dict[str, pd.Series] | None]:
+) -> tuple[bool, pd.DataFrame | None, dict | None, dict[str, pd.Series] | None]:
     """
-    Verify is the given file has the same fields and formats as in the given analysis.
+    Verify is the given file has the same fields and types as in the given analysis.
 
     Args:
         file_path: the path of the file to validate
@@ -29,15 +26,6 @@
         verbose: whether the code displays the steps it's going through
         skipna: whether to ignore NaN values in the checks
     """
-    if verbose:
-        logging.info(f"Checking given formats exist")
-    for col_name, detected in previous_analysis["columns"].items():
-        if detected["format"] == "string":
-            continue
-        elif detected["format"] not in formats:
-            if verbose:
-                logging.warning(f"> Unknown format `{detected['format']}` in analysis")
-            return False, None, None
     try:
         if previous_analysis.get("separator"):
             # loading the table in chunks
@@ -70,94 +58,77 @@ def validate(
             ]
         )
         analysis = {k: v for k, v in previous_analysis.items() if k in ["engine", "sheet_name"]}
+        first_chunk = next(chunks)
         analysis.update(
             {k: v for k, v in previous_analysis.items() if k in ["header_row_idx", "header"]}
         )
     except Exception as e:
         if verbose:
             logging.warning(f"> Could not load the file with previous analysis values: {e}")
-        return False, None, None
+        return False, None, None, None
     if verbose:
         logging.info("Comparing table with the previous analysis")
+        logging.info("- Checking if all columns match")
+    if len(first_chunk.columns) != len(previous_analysis["header"]) or any(
+        list(first_chunk.columns)[k] != previous_analysis["header"][k]
+        for k in range(len(previous_analysis["header"]))
+    ):
+        if verbose:
+            logging.warning("> Columns do not match, proceeding with full analysis")
+        return False, None, None, None
+    if verbose:
         logging.info(
             f"Testing previously detected formats on chunks of {VALIDATION_CHUNK_SIZE} rows"
         )
 
-    # will contain hashes of each row of the file as index and the number of times
-    # each hash was seen as values; used to compute nb_duplicates
-    row_hashes_count = pd.Series()
-    # will contain the number of times each value of each column is seen in the whole file
-    # used for profile to read the file only once
-    # naming it "count" to be iso with how col_values are made in detect_formats
-    col_values: defaultdict[str, pd.Series] = defaultdict(lambda: pd.Series(name="count"))
+    # hashing rows to get nb_duplicates
+    row_hashes_count = pd.util.hash_pandas_object(first_chunk, index=False).value_counts()
+    # getting values for profile to read the file only once
+    col_values = {col: first_chunk[col].value_counts(dropna=False) for col in first_chunk.columns}
     analysis["total_lines"] = 0
-    checked_values: dict[str, int] = {col_name: 0 for col_name in previous_analysis["columns"]}
-    valid_values: dict[str, int] = {col_name: 0 for col_name in previous_analysis["columns"]}
-    for idx, chunk in enumerate(chunks):
+    for idx, chunk in enumerate([first_chunk, *chunks]):
         if verbose:
-            logging.info(f"- Testing chunk number {idx}")
-        if idx == 0:
-            if verbose:
-                logging.info("Checking if all columns match")
-            if len(chunk.columns) != len(previous_analysis["header"]) or any(
-                list(chunk.columns)[k] != previous_analysis["header"][k]
-                for k in range(len(previous_analysis["header"]))
-            ):
-                if verbose:
-                    logging.warning("> Columns in the file do not match those of the analysis")
-                return False, None, None
+            logging.info(f"> Testing chunk number {idx}")
         analysis["total_lines"] += len(chunk)
         row_hashes_count = row_hashes_count.add(
             pd.util.hash_pandas_object(chunk, index=False).value_counts(),
             fill_value=0,
         )
+        for col in chunk.columns:
+            col_values[col] = col_values[col].add(
+                chunk[col].value_counts(dropna=False),
+                fill_value=0,
+            )
        for col_name, detected in previous_analysis["columns"].items():
             if verbose:
                 logging.info(f"- Testing {col_name} for {detected['format']}")
             if detected["format"] == "string":
                 # no test for columns that have not been recognized as a specific format
                 continue
-            to_check = chunk[col_name].dropna() if skipna else chunk[col_name]
-            chunk_valid_values = sum(to_check.apply(formats[detected["format"]].func))
-            if formats[detected["format"]].proportion == 1 and chunk_valid_values < len(to_check):
-                # we can early stop in this case, not all values are valid while we want 100%
+            if detected["format"] not in formats:
                 if verbose:
                     logging.warning(
-                        f"> Test failed for column {col_name} with format {detected['format']}"
+                        f"> Unknown format `{detected['format']}`, proceeding with full analysis"
                     )
-                return False, None, None
-            checked_values[col_name] += len(to_check)
-            valid_values[col_name] += chunk_valid_values
-            col_values[col_name] = (
-                col_values[col_name]
-                .add(
-                    chunk[col_name].value_counts(dropna=False),
-                    fill_value=0,
-                )
-                .rename_axis(col_name)
-            ) # rename_axis because *sometimes* pandas doesn't pass on the column's name ¯\_(ツ)_/¯
-        del chunk
-    # finally we loop through the formats that accept less than 100% valid values to check the proportion
-    for col_name, detected in previous_analysis["columns"].items():
-        if (
-            checked_values[col_name] > 0
-            and valid_values[col_name] / checked_values[col_name]
-            < formats[detected["format"]].proportion
-        ):
-            if verbose:
-                logging.warning(
-                    f"> Test failed for column {col_name} with format {detected['format']}"
-                )
-            return False, None, None
+                return False, first_chunk, analysis, None
+            test_result: float = test_col_val(
+                serie=chunk[col_name],
+                format=formats[detected["format"]],
+                skipna=skipna,
+            )
+            if not bool(test_result):
+                if verbose:
+                    logging.warning("> Test failed, proceeding with full analysis")
+                return False, first_chunk, analysis, None
     if verbose:
         logging.info("> All checks successful")
     analysis["nb_duplicates"] = sum(row_hashes_count > 1)
-    del row_hashes_count
     analysis["categorical"] = [
         col for col, values in col_values.items() if len(values) <= MAX_NUMBER_CATEGORICAL_VALUES
     ]
     return (
         True,
+        first_chunk,
         analysis
         | {
             k: previous_analysis[k]
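A standalone sketch of the chunked bookkeeping used above, with our own variable names and toy data: row-hash counts are accumulated across chunks to compute nb_duplicates, and per-column value counts are accumulated so the profile can be built from a single read of the file:

```python
import io

import pandas as pd

csv = io.StringIO("a,b\n1,x\n1,x\n2,y\n3,z\n")
row_hashes_count = None                 # counts of row hashes, feeds nb_duplicates
col_values: dict[str, pd.Series] = {}   # per-column value counts, feeds the profile
total_lines = 0

for chunk in pd.read_csv(csv, chunksize=2):
    total_lines += len(chunk)
    # identical rows hash to the same value, so counts > 1 reveal duplicated rows
    hashes = pd.util.hash_pandas_object(chunk, index=False).value_counts()
    row_hashes_count = hashes if row_hashes_count is None else row_hashes_count.add(hashes, fill_value=0)
    for col in chunk.columns:
        counts = chunk[col].value_counts(dropna=False)
        col_values[col] = counts if col not in col_values else col_values[col].add(counts, fill_value=0)

nb_duplicates = int(sum(row_hashes_count > 1))
print(total_lines, nb_duplicates)  # 4 1 -> the row (1, x) appears twice
```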
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: csv-detective
-Version: 0.10.3
+Version: 0.10.3.dev2
 Summary: Detect tabular files column content
 Author-email: "data.gouv.fr" <opendatateam@data.gouv.fr>
 License: MIT
@@ -23,6 +23,10 @@ Requires-Dist: frformat==0.4.0
 Requires-Dist: Faker>=33.0.0
 Requires-Dist: rstr>=3.2.2
 Requires-Dist: more-itertools>=10.8.0
+Provides-Extra: dev
+Requires-Dist: pytest>=8.3.0; extra == "dev"
+Requires-Dist: responses>=0.25.0; extra == "dev"
+Requires-Dist: ruff>=0.9.3; extra == "dev"
 Dynamic: license-file
 
 # CSV Detective
@@ -1,16 +1,16 @@
 csv_detective/__init__.py,sha256=zlYElTOp_I2_VG7ZdOTuAu0wuCXSc0cr3sH6gtk2bcg,152
 csv_detective/cli.py,sha256=mu5anmBmaDk52_uZGiA4T37wYZCuV43gZAepjs1Cqzc,1389
-csv_detective/explore_csv.py,sha256=M8jabAP08raPY438v5UeBqJy3bBudTeuo-UNe2unWyE,7639
-csv_detective/format.py,sha256=VTdwg4gp9pq6WYhbkCxv9X2hXq0fMrzfooFchmIL0as,2911
+csv_detective/explore_csv.py,sha256=qSf6N3tbp43BUMJF5wiXz3aYKaTez6ro-75KL2Arci4,7174
+csv_detective/format.py,sha256=VglcxWBmjTvWNMhwSUZDfMdJcK9lAUum64Jxvm70AJ4,2898
 csv_detective/utils.py,sha256=RJ_zFOJ1DRY8HtDrKPiCdNk5gU6-KwOrOKOyfSkBZZY,1118
-csv_detective/validate.py,sha256=7k0GC5AsTn5BbsRChetZZDmnTGiYLe40qPKiP3GruYs,7495
+csv_detective/validate.py,sha256=QBJhwHP0U0Ux7ODGV6foqNGm-DlbECIo6jUsBFOdDr0,5739
 csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvcaGd0Go,2887
 csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
 csv_detective/detection/engine.py,sha256=wQeDKpp2DKF-HcS1R8H6GgQyaUgQme4szPtEHgAjBII,1552
 csv_detective/detection/formats.py,sha256=9aIE4gwTN8c8pa-kofeJ7zalo8NqjGZabYD-G79kV5I,4734
-csv_detective/detection/headers.py,sha256=lnbWRxkI6rdyoWGtmxSfsPkqNjS0Nlpgw-pVevtmBP0,899
-csv_detective/detection/rows.py,sha256=JQsmKP8-i8wzcZIWI_13LUer5mpYRIqaKg6qW01ZO3A,750
+csv_detective/detection/headers.py,sha256=95pTL524Sy5PGxyQ03ofFUaamvlmkxTJQe8u6HfzOkU,1051
+csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
 csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
 csv_detective/detection/variables.py,sha256=-QtZOB96z3pWbqnZ-c1RU3yzoYqcO61A0JzeS6JbkxY,3576
 csv_detective/formats/__init__.py,sha256=Egiy29kcG3Oz2eE2maYhD3wP29zOSOWyRlOpGD5LGvU,318
@@ -74,26 +74,26 @@ csv_detective/formats/data/iso_country_code_alpha2.txt,sha256=mLt_qcQ6D8hfy9zdi7
 csv_detective/formats/data/iso_country_code_alpha3.txt,sha256=XFPdGBsyZCBg4D8IDn6VgwsycCwYVfuqPbyHfNeqGv0,1003
 csv_detective/formats/data/iso_country_code_numeric.txt,sha256=sdGpn0PqDMlc59-7prThkihHrf7mwB6j5uEHpxGvLFE,1003
 csv_detective/output/__init__.py,sha256=ALSq_tgX7rGyh--7rmbKz8wHkmResN0h7mNujndow3w,2103
-csv_detective/output/dataframe.py,sha256=juBMdj0eiL8c3OrJJ3kCf15Qs4-CFQfHqh91FnVbG9E,3656
+csv_detective/output/dataframe.py,sha256=QX5vplx0AOKgnwwJ6dKvDHWRX9IGPStax-svXEyweJ8,3584
 csv_detective/output/example.py,sha256=8LWheSBYCeDFfarbnmzBrdCbTd8Alh1U4pfXMKfabOw,8630
-csv_detective/output/profile.py,sha256=R9YMl-dANde69RXkFlZpvMDBsX7e1SyMAnlW8p1XNNM,4984
+csv_detective/output/profile.py,sha256=ADr5DwuvwcBYxugjN38fHm11l6ivfzGHXPd8a87Ht-s,4985
 csv_detective/output/schema.py,sha256=XoKljXPXP00DfqPCiz1ydwTHYGAFsvNxnaPCNBuuBIo,10443
 csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
 csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-csv_detective/parsing/columns.py,sha256=H_dKHhSgQMIiOfxibnGl6HwTW9bRwGtIeUcYBN13-3A,9245
+csv_detective/parsing/columns.py,sha256=rb5JywbKnYCT3Jb0ZaG1BnyPVtB3gy5mSD-K7qcOl8I,9257
 csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
 csv_detective/parsing/csv.py,sha256=5rw6gXZFQC1T4NT9CnW0AumidrYOkF8kjrfWGmk949I,1716
-csv_detective/parsing/excel.py,sha256=pX6dbhAdAdbRpoGcrGsL1lSaF-fbzEb4WcvwcCGEgFw,6978
-csv_detective/parsing/load.py,sha256=1Fk43ikIOJwtWJUY-e8oNeNOk4MMtpmZV7s-VbQBS1k,4345
+csv_detective/parsing/excel.py,sha256=tb65I78tdYlZci_tzvvQt8U6bZSYKjeVdn2CEvsET1o,6972
+csv_detective/parsing/load.py,sha256=orW6PV5XUsHA093yVSxXkJl33LEUUArr3hP81U9Bzd4,4386
 csv_detective/parsing/text.py,sha256=yDAcop5xJQc25UtbZcV0guHXAZQfm-H8WuJORTy8Rr8,1734
-csv_detective-0.10.3.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
+csv_detective-0.10.3.dev2.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
-tests/test_fields.py,sha256=DSI-ZXDcRt69iZArYZZAr_3OEb-qvwgOVBZxmYAKIkI,5918
-tests/test_file.py,sha256=9Zne9ULDqkr-ajgc03lEMEod4d71Y-UDY4ckT6FFw_I,15205
+tests/test_fields.py,sha256=EuD2F1JUR8y88Hm-AYuJ5X7AKkGSyLIQfsGdxYgIWng,5680
+tests/test_file.py,sha256=_ftEymft5-1keUVE5AUdF2XkVcChJo6oBjws3ye06FE,14543
 tests/test_labels.py,sha256=lgxRbLrGV1C-MkASf3KIQ120BG-UHzFQ4pqDWaeBvaw,539
 tests/test_structure.py,sha256=XDbviuuvk-0Mu9Y9PI6He2e5hry2dXVJ6yBVwEqF_2o,1043
-tests/test_validation.py,sha256=309k3Axgbp-1Wh6qvCj2BpeMBp3HXzLi5j9UKm1bRQs,5384
+tests/test_validation.py,sha256=9djBT-PDhu_563OFgWyE20o-wPEWEIQGXp6Pjh0_MQM,3463
 tests/data/a_test_file.csv,sha256=SOHjseGYqZer9yu3Bd3oS12Vw8MFsebo0BzrLZ_R4Cc,68871
 tests/data/a_test_file.json,sha256=fB9bCpAMFPxFw8KxHRFlgRqjYG819QVGrCQWxQvwkvo,10542
 tests/data/b_test_file.csv,sha256=wJGX62KhYjZi62De2XjZWClAzeRFEBsg3ET0IPX1BNU,98
@@ -104,8 +104,8 @@ tests/data/file.ods,sha256=4dR7zWptz5djALIBVeWHQ20GaZNfA63fevIJGFIk1_U,11832
 tests/data/file.xls,sha256=QYmNX3FF0QfcQSzYQMtaMJaepJf5EZpDa1miKc4wMdQ,21495
 tests/data/file.xlsx,sha256=naWzL02PK4pdIjMzfEyfSW9GQhkYYd_e7bpJvB8Pb2w,8314
 tests/data/xlsx_file,sha256=NyOyN_rIe7ryJuHQLqjxVdKCc8V4s5pxyHl6wWFykCM,8305
-csv_detective-0.10.3.dist-info/METADATA,sha256=L638U_kKVd5jFzjTk76y48hTz3nMldJ5PkfMngGHobg,10920
-csv_detective-0.10.3.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-csv_detective-0.10.3.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
-csv_detective-0.10.3.dist-info/top_level.txt,sha256=KDI4gyOpkmormGgUvSWrE3jen2e0unIsxR2b96DRvcw,25
-csv_detective-0.10.3.dist-info/RECORD,,
+csv_detective-0.10.3.dev2.dist-info/METADATA,sha256=QhAD5N5OZx1L_9ajLLuEjhSYSz6q05eAEwVd6_kDPFc,11082
+csv_detective-0.10.3.dev2.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
+csv_detective-0.10.3.dev2.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+csv_detective-0.10.3.dev2.dist-info/top_level.txt,sha256=KDI4gyOpkmormGgUvSWrE3jen2e0unIsxR2b96DRvcw,25
+csv_detective-0.10.3.dev2.dist-info/RECORD,,
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (80.10.2)
+Generator: setuptools (80.10.1)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
tests/test_fields.py CHANGED
@@ -104,17 +104,11 @@ def test_fields_with_values(args):
         ("2022-08-01", "date", _date),
         ("2024-09-23 17:32:07", "datetime", _datetime),
         ("2024-09-23 17:32:07+02:00", "datetime", _datetime),
-        ("N/A", "int", None),
-        ("nan", "bool", None),
-        ("", "date", None), # all NaN-like values should be cast as None for all type
     ),
 )
 def test_cast(args):
     value, detected_type, cast_type = args
-    if cast_type is None:
-        assert cast(value, detected_type) is None
-    else:
-        assert isinstance(cast(value, detected_type), cast_type)
+    assert isinstance(cast(value, detected_type), cast_type)
 
 
 @pytest.mark.parametrize(
tests/test_file.py CHANGED
@@ -9,12 +9,6 @@ from csv_detective.output.profile import create_profile
 from csv_detective.parsing.csv import CHUNK_SIZE
 
 
-@pytest.fixture
-def mocked_responses():
-    with responses.RequestsMock() as rsps:
-        yield rsps
-
-
 @pytest.mark.parametrize(
     "chunk_size",
     (100, 404, int(1e5)),
@@ -171,26 +165,6 @@ def test_exception_different_number_of_columns():
     )
 
 
-def test_exception_malformed_columns(mocked_responses):
-    """
-    A ValueError should be raised if any column is Unnamed
-    """
-    url = f"http://example.com/bad_cols.csv"
-    expected_content = b"col1,col2,\n1,2,\n3,4,"
-    mocked_responses.get(
-        url,
-        body=expected_content,
-        status=200,
-    )
-    with patch("urllib.request.urlopen") as mock_urlopen:
-        mock_response = MagicMock()
-        mock_response.read.return_value = expected_content
-        mock_response.__enter__.return_value = mock_response
-        mock_urlopen.return_value = mock_response
-        with pytest.raises(ValueError):
-            routine(file_path=url)
-
-
 def test_code_dep_reg_on_file():
     output = routine(
         file_path="tests/data/b_test_file.csv",
@@ -263,6 +237,12 @@ def test_non_csv_files(params):
         assert _[k] == v
 
 
+@pytest.fixture
+def mocked_responses():
+    with responses.RequestsMock() as rsps:
+        yield rsps
+
+
 @pytest.mark.parametrize(
     "params",
     # ideally we'd like to do the same with params_others but pandas.read_excel uses urllib
tests/test_validation.py CHANGED
@@ -1,5 +1,4 @@
 import json
-from unittest.mock import MagicMock, patch
 
 import pandas as pd
 import pytest
@@ -27,12 +26,12 @@ def get_nested_value(source_dict: dict, key_chain: list[str]):
 @pytest.mark.parametrize(
     "_params",
     (
-        ((True, dict), {}),
-        ((False, None), {"separator": "|"}),
-        ((False, None), {"encoding": "unknown"}),
-        ((False, None), {"header": ["a", "b"]}),
+        ((True, pd.DataFrame, dict), {}),
+        ((False, None, None), {"separator": "|"}),
+        ((False, None, None), {"encoding": "unknown"}),
+        ((False, None, None), {"header": ["a", "b"]}),
         (
-            (False, None),
+            (False, pd.DataFrame, dict),
             {
                 "columns.NUMCOM": {
                     "python_type": "int",
@@ -44,89 +43,35 @@ def get_nested_value(source_dict: dict, key_chain: list[str]):
     ),
 )
 def test_validation(_params):
-    (should_be_valid, analysis_type), modif_previous_analysis = _params
+    (should_be_valid, table_type, analysis_type), modif_previous_analysis = _params
     with open("tests/data/a_test_file.json", "r") as f:
         previous_analysis = json.load(f)
     for dotkey in modif_previous_analysis:
         keys = dotkey.split(".")
         set_nested_value(previous_analysis, keys, modif_previous_analysis[dotkey])
-    is_valid, analysis, col_values = validate(
+    is_valid, table, analysis, col_values = validate(
         "tests/data/a_test_file.csv",
         previous_analysis=previous_analysis,
     )
     assert is_valid == should_be_valid
+    if table_type is None:
+        assert table is None
+    else:
+        assert isinstance(table, table_type)
     if analysis_type is None:
         assert analysis is None
     else:
         assert isinstance(analysis, analysis_type)
     if should_be_valid:
         assert isinstance(col_values, dict)
+        assert all(
+            col in table.columns and isinstance(values, pd.Series)
+            for col, values in col_values.items()
+        )
     else:
         assert col_values is None
 
 
-@pytest.mark.parametrize(
-    "_params",
-    (
-        # int: proportion = 1, should fail (early)
-        ("12", "1.2", {"python_type": "int", "format": "int", "score": 1.5}, False),
-        # siren: proportion = 0.9, should fail (later)
-        (
-            "130025265",
-            "A13794BC",
-            {"python_type": "string", "format": "siren", "score": 1.5},
-            False,
-        ),
-        # siret: proportion = 0.8, should succeed
-        (
-            "13002526500013",
-            "A13794BC",
-            {"python_type": "string", "format": "siret", "score": 1.5},
-            True,
-        ),
-    ),
-)
-def test_validation_with_proportions(_params):
-    # testing the behaviour for a file that has 15% invalid values, but all in a single chunk
-    valid_value, invalid_value, detected, should_be_valid = _params
-    url = f"http://example.com/test.csv"
-    expected_content = "col\n"
-    for _ in range(60):
-        # 60 rows of valid values
-        expected_content += f"{valid_value}\n"
-    for _ in range(15):
-        # 15 rows of invalid values
-        expected_content += f"{invalid_value}\n"
-    for _ in range(25):
-        # 25 rows of valid values
-        expected_content += f"{valid_value}\n"
-    previous_analysis = {
-        "encoding": "utf-8",
-        "separator": ",",
-        "header_row_idx": 0,
-        "header": ["col"],
-        "columns": {"col": detected},
-        # just setting these keys when validation is successful, they're not used for the validation itself
-        "categorical": [],
-        "columns_fields": {},
-        "columns_labels": {},
-        "formats": {},
-    }
-    with (
-        patch("urllib.request.urlopen") as mock_urlopen,
-        patch("csv_detective.validate.VALIDATION_CHUNK_SIZE", 10),
-    ):
-        mock_response = MagicMock()
-        mock_response.read.return_value = expected_content.encode("utf-8")
-        mock_response.__enter__.return_value = mock_response
-        mock_urlopen.return_value = mock_response
-        is_valid, *_ = validate(
-            file_path=url,
-            previous_analysis=previous_analysis,
-        )
-        assert is_valid == should_be_valid
-
-
 @pytest.mark.parametrize(
     "modif_previous_analysis",
     (