csv-detective 0.9.0__py3-none-any.whl → 0.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
csv_detective/__init__.py CHANGED
@@ -1,2 +1,7 @@
- from csv_detective.explore_csv import routine, routine_minio, validate_then_detect # noqa
- from csv_detective.output.example import create_example_csv_file # noqa
+ from csv_detective.explore_csv import routine, routine_minio, validate_then_detect
+
+ __all__ = [
+ "routine",
+ "routine_minio",
+ "validate_then_detect",
+ ]
csv_detective/detect_fields/temp/date/__init__.py CHANGED
@@ -1,3 +1,4 @@
+ import re
  from datetime import datetime
  from typing import Optional

@@ -19,6 +20,23 @@ def date_casting(val: str) -> Optional[datetime]:
  return None


+ seps = r"[\s/\-\*_\|;.,]"
+ # matches JJ-MM-AAAA with any of the listed separators
+ jjmmaaaa_pattern = r"^(0[1-9]|[12][0-9]|3[01])SEP(0[1-9]|1[0-2])SEP((19|20)\d{2})$".replace(
+ "SEP", seps
+ )
+ # matches AAAA-MM-JJ with any of the listed separators OR NO SEPARATOR
+ aaaammjj_pattern = r"^((19|20)\d{2})SEP(0[1-9]|1[0-2])SEP(0[1-9]|[12][0-9]|3[01])$".replace(
+ "SEP", seps + "?"
+ )
+ # matches JJ-mmm-AAAA and JJ-mmm...mm-AAAA with any of the listed separators OR NO SEPARATOR
+ string_month_pattern = (
+ r"^(0[1-9]|[12][0-9]|3[01])SEP(jan|fev|feb|mar|avr|apr"
+ r"|mai|may|jun|jui|jul|aou|aug|sep|oct|nov|dec|janvier|fevrier|mars|avril|"
+ r"mai|juin|jullet|aout|septembre|octobre|novembre|decembre)SEP"
+ r"([0-9]{2}$|(19|20)[0-9]{2}$)"
+ ).replace("SEP", seps + "?")
+
  threshold = 0.3


@@ -27,6 +45,16 @@ def _is(val):
  # early stops, to cut processing time
  if not isinstance(val, str) or len(val) > 20 or len(val) < 8:
  return False
+ # if it's a usual date pattern
+ if any(
+ # with this syntax, if any of the first value is True, the next ones are not computed
+ [
+ bool(re.match(jjmmaaaa_pattern, val))
+ or bool(re.match(aaaammjj_pattern, val))
+ or bool(re.match(string_month_pattern, val, re.IGNORECASE))
+ ]
+ ):
+ return True
  if sum([char.isdigit() for char in val]) / len(val) < threshold:
  return False
  res = date_casting(val)
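A minimal sketch of the early exit these patterns enable, assuming the module-level pattern names above are importable from the 0.9.1 package (the datetime modules below import them the same way):

import re

from csv_detective.detect_fields.temp.date import (
    aaaammjj_pattern,
    jjmmaaaa_pattern,
    string_month_pattern,
)

# day-first date, any of the listed separators
assert re.match(jjmmaaaa_pattern, "28/01/2000")
# year-first date, separator optional
assert re.match(aaaammjj_pattern, "19960213")
# spelled-out month (French or English abbreviation), 2- or 4-digit year
assert re.match(string_month_pattern, "15-feb-1999", re.IGNORECASE)
# anything else still falls through to the slower date_casting() parser
assert re.match(jjmmaaaa_pattern, "2000/01/28") is None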
csv_detective/detect_fields/temp/datetime_aware/__init__.py CHANGED
@@ -1,8 +1,16 @@
+ import re
  from typing import Any, Optional

- from csv_detective.detect_fields.temp.date import date_casting
+ from csv_detective.detect_fields.temp.date import aaaammjj_pattern, date_casting

  PROPORTION = 1
+ threshold = 0.7
+
+ # matches AAAA-MM-JJTHH:MM:SS(.dddddd)±HH:MM with any of the listed separators for the date OR NO SEPARATOR
+ pat = (
+ aaaammjj_pattern.replace("$", "")
+ + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})?[+-](0\d|1[0-9]|2[0-3]):([0-5][0-9])$"
+ )


@@ -12,7 +20,9 @@ def _is(val: Optional[Any]) -> bool:
  # 32 is the maximal length of an ISO datetime format YYYY-MM-DDTHH:MM:SS.dddddd+HH:MM, keeping some slack
  if not isinstance(val, str) or len(val) > 35 or len(val) < 21:
  return False
- threshold = 0.7
+ # if usual format, no need to parse
+ if bool(re.match(pat, val)):
+ return True
  if sum([char.isdigit() or char in {"-", "/", ":", " "} for char in val]) / len(val) < threshold:
  return False
  res = date_casting(val)
csv_detective/detect_fields/temp/datetime_naive/__init__.py CHANGED
@@ -1,8 +1,16 @@
+ import re
  from typing import Any, Optional

- from csv_detective.detect_fields.temp.date import date_casting
+ from csv_detective.detect_fields.temp.date import aaaammjj_pattern, date_casting

  PROPORTION = 1
+ threshold = 0.7
+
+ # matches AAAA-MM-JJTHH:MM:SS(.dddddd)Z with any of the listed separators for the date OR NO SEPARATOR
+ pat = (
+ aaaammjj_pattern.replace("$", "")
+ + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})?Z$"
+ )


@@ -12,7 +20,9 @@ def _is(val: Optional[Any]) -> bool:
  # 26 is the maximal length of an ISO datetime format YYYY-MM-DDTHH:MM:SS.dddddd, keeping some slack
  if not isinstance(val, str) or len(val) > 30 or len(val) < 15:
  return False
- threshold = 0.7
+ # if usual format, no need to parse
+ if bool(re.match(pat, val)):
+ return True
  if sum([char.isdigit() or char in {"-", "/", ":", " "} for char in val]) / len(val) < threshold:
  return False
  res = date_casting(val)
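A quick sanity check, using the same values as the new test_early_detection test further down in this diff: both modules now short-circuit on these shapes before date_casting is ever called.

from csv_detective.detect_fields.temp import datetime_aware, datetime_naive

# offset-aware values hit the ±HH:MM pattern
assert datetime_aware._is("2025-08-20T14:30:00+02:00")
assert datetime_aware._is("2025/08/20 14:30:00.2763-12:00")
# values with a trailing Z (and any of the listed date separators) hit the Z pattern
assert datetime_naive._is("1925_12_20T14:30:00.2763Z")
assert datetime_naive._is("1925 12 20 14:30:00Z")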
csv_detective/detection/formats.py CHANGED
@@ -14,6 +14,9 @@ from csv_detective.output.utils import prepare_output_dict
  from csv_detective.parsing.columns import MAX_ROWS_ANALYSIS, test_col, test_label
  from csv_detective.validate import validate

+ # above this threshold, a column is not considered categorical
+ MAX_NUMBER_CATEGORICAL_VALUES = 25
+

  def detect_formats(
  table: pd.DataFrame,
@@ -28,14 +31,18 @@ def detect_formats(
  if on_sample:
  if verbose:
  logging.warning(f"File is too long, analysing the {MAX_ROWS_ANALYSIS} first rows")
- table = table.sample(n=MAX_ROWS_ANALYSIS, random_state=1)
+ table = build_sample(table)

  if table.empty:
  res_categorical = []
  # res_continuous = []
  else:
  # Detects columns that are categorical
- res_categorical, categorical_mask = detect_categorical_variable(table, verbose=verbose)
+ res_categorical, categorical_mask = detect_categorical_variable(
+ table,
+ max_number_categorical_values=MAX_NUMBER_CATEGORICAL_VALUES,
+ verbose=verbose,
+ )
  res_categorical = list(res_categorical)
  # Detect columns that are continuous (we already know the categorical) :
  # we don't need this for now, cuts processing time
@@ -166,3 +173,33 @@ def detect_formats(
  raise ValueError("Could not infer detected formats on the whole file")

  return analysis
+
+
+ def build_sample(table: pd.DataFrame) -> pd.DataFrame:
+ """
+ building a sample of MAX_ROWS_ANALYSIS rows that contains at least one representative of
+ the min and max values of each column, and one case of NaN if the column contains any.
+ """
+ samples = pd.concat(
+ [
+ # one row with the minimum of the column
+ table.loc[table[col] == table[col].dropna().min()].iloc[[0]]
+ for col in table.columns
+ ]
+ + [
+ # one row with the maximum of the column
+ table.loc[table[col] == table[col].dropna().max()].iloc[[0]]
+ for col in table.columns
+ ]
+ + [
+ # one row with a NaN value if the column has any
+ table.loc[table[col].isna()].iloc[[0]]
+ for col in table.columns
+ if table[col].isna().any()
+ ],
+ ignore_index=True,
+ )
+ return pd.concat(
+ [samples, table.sample(n=MAX_ROWS_ANALYSIS - len(samples), random_state=1)],
+ ignore_index=True,
+ )
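A rough sketch of what build_sample guarantees compared to the previous plain table.sample call; the import paths are assumed from the RECORD further down and are not a documented public API:

import pandas as pd

from csv_detective.detection.formats import build_sample
from csv_detective.parsing.columns import MAX_ROWS_ANALYSIS

# a toy table twice as long as MAX_ROWS_ANALYSIS, with one rare maximum and one NaN
df = pd.DataFrame({"val": [1.0] * (2 * MAX_ROWS_ANALYSIS)})
df.loc[0, "val"] = 9.0
df.loc[1, "val"] = float("nan")

sample = build_sample(df)
assert len(sample) == MAX_ROWS_ANALYSIS
assert sample["val"].max() == 9.0   # the column maximum is guaranteed to be kept
assert sample["val"].isna().any()   # and so is one missing value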
csv_detective/output/__init__.py CHANGED
@@ -25,12 +25,20 @@ def generate_output(
  verbose: bool = False,
  sheet_name: Optional[Union[str, int]] = None,
  ) -> Union[dict, tuple[dict, pd.DataFrame]]:
- if output_profile:
+ if output_profile or output_df:
+ # to create the profile we have to cast columns, so using the dedicated function
+ table = cast_df(
+ df=table,
+ columns=analysis["columns"],
+ cast_json=cast_json,
+ verbose=verbose,
+ )
  analysis["profile"] = create_profile(
  table=table,
- dict_cols_fields=analysis["columns"],
+ columns=analysis["columns"],
  num_rows=num_rows,
  limited_output=limited_output,
+ cast_json=cast_json,
  verbose=verbose,
  )

@@ -45,16 +53,13 @@
  output_path += "_sheet-" + str(sheet_name)
  output_path += ".json"
  with open(output_path, "w", encoding="utf8") as fp:
- json.dump(analysis, fp, indent=4, separators=(",", ": "), ensure_ascii=False)
+ json.dump(
+ analysis, fp, indent=4, separators=(",", ": "), ensure_ascii=False, default=str
+ )

  if output_schema:
  analysis["schema"] = generate_table_schema(analysis, save_file=False, verbose=verbose)

  if output_df:
- return analysis, cast_df(
- df=table,
- columns=analysis["columns"],
- cast_json=cast_json,
- verbose=verbose,
- )
+ return analysis, table
  return analysis
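One consequence of casting before profiling is that the analysis dict can now contain non-JSON-native values such as dates, which is what the added default=str covers. A small standalone illustration (profile_fragment is a hypothetical excerpt):

import json
from datetime import date

profile_fragment = {"min": date(2000, 1, 28), "max": date(2025, 8, 20)}
# without default=str this raises: TypeError: Object of type date is not JSON serializable
print(json.dumps(profile_fragment, default=str))
# {"min": "2000-01-28", "max": "2025-08-20"}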
csv_detective/output/dataframe.py CHANGED
@@ -33,27 +33,23 @@ def cast(value: str, _type: str) -> Optional[Union[str, float, bool, date, datet
  def cast_df(
  df: pd.DataFrame, columns: dict, cast_json: bool = True, verbose: bool = False
  ) -> pd.DataFrame:
+ # for efficiency this modifies the dataframe in place as we don't need it anymore afterwards
  if verbose:
  start = time()
- output_df = pd.DataFrame()
  for col_name, detection in columns.items():
  if detection["python_type"] == "string" or (
  detection["python_type"] == "json" and not cast_json
  ):
  # no change if detected type is string
- output_df[col_name] = df[col_name].copy()
+ continue
  elif detection["python_type"] == "int":
  # to allow having ints and NaN in the same column
- output_df[col_name] = df[col_name].copy().astype(pd.Int64Dtype())
+ df[col_name] = df[col_name].astype(pd.Int64Dtype())
  else:
- output_df[col_name] = df[col_name].apply(
- lambda col: cast(col, _type=detection["python_type"])
- )
- # to save RAM
- del df[col_name]
+ df[col_name] = df[col_name].apply(lambda col: cast(col, _type=detection["python_type"]))
  if verbose:
  display_logs_depending_process_time(
  f"Casting columns completed in {round(time() - start, 3)}s",
  time() - start,
  )
- return output_df
+ return df
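The practical difference is that cast_df no longer builds a second frame: the caller gets back the very object it passed in. A minimal sketch, where the columns dict is a hypothetical detection result in the shape cast_df expects:

import pandas as pd

from csv_detective.output.dataframe import cast_df

df = pd.DataFrame({"n": ["1", "2", "3"]}, dtype=str)
columns = {"n": {"python_type": "int", "format": "int", "score": 1.0}}

out = cast_df(df, columns)
# 0.9.0 returned a freshly built output_df and deleted the input columns one by one;
# 0.9.1 writes the cast columns back onto the input frame and returns that same frame.
assert out is df
print(out["n"].dtype)  # Int64 (nullable integer), via the astype(pd.Int64Dtype()) branch above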
csv_detective/output/profile.py CHANGED
@@ -4,15 +4,15 @@ from time import time

  import pandas as pd

- from csv_detective.detect_fields.other.float import float_casting
  from csv_detective.utils import display_logs_depending_process_time, prevent_nan


  def create_profile(
  table: pd.DataFrame,
- dict_cols_fields: dict,
+ columns: dict,
  num_rows: int,
  limited_output: bool = True,
+ cast_json: bool = True,
  verbose: bool = False,
  ) -> dict:
  if verbose:
@@ -26,65 +26,44 @@ def create_profile(

  if num_rows > 0:
  raise ValueError("To create profiles num_rows has to be set to -1")
- safe_table = table.copy()
  if not limited_output:
- dict_cols_fields = {
+ columns = {
  k: v[0] if v else {"python_type": "string", "format": "string", "score": 1.0}
- for k, v in dict_cols_fields.items()
+ for k, v in columns.items()
  }
- dtypes = {k: map_python_types.get(v["python_type"], str) for k, v in dict_cols_fields.items()}
- for c in safe_table.columns:
- if dtypes[c] is float:
- safe_table[c] = safe_table[c].apply(
- lambda s: float_casting(s) if isinstance(s, str) else s
- )
  profile = defaultdict(dict)
- for c in safe_table.columns:
- if map_python_types.get(dict_cols_fields[c]["python_type"], str) in [
- float,
- int,
- ]:
+ for c in table.columns:
+ # for numerical formats we want min, max, mean, std
+ if columns[c]["python_type"] in ["float", "int"]:
  profile[c].update(
- min=prevent_nan(
- map_python_types.get(dict_cols_fields[c]["python_type"], str)(
- safe_table[c].min()
- )
- ),
- max=prevent_nan(
- map_python_types.get(dict_cols_fields[c]["python_type"], str)(
- safe_table[c].max()
- )
- ),
- mean=prevent_nan(
- map_python_types.get(dict_cols_fields[c]["python_type"], str)(
- safe_table[c].mean()
- )
- ),
- std=prevent_nan(
- map_python_types.get(dict_cols_fields[c]["python_type"], str)(
- safe_table[c].std()
- )
- ),
+ min=prevent_nan(map_python_types[columns[c]["python_type"]](table[c].min())),
+ max=prevent_nan(map_python_types[columns[c]["python_type"]](table[c].max())),
+ mean=prevent_nan(map_python_types[columns[c]["python_type"]](table[c].mean())),
+ std=prevent_nan(map_python_types[columns[c]["python_type"]](table[c].std())),
  )
+ # for all formats we want most frequent values, nb unique values and nb missing values
  tops_bruts = (
- safe_table[safe_table[c].notna()][c]
- .value_counts(dropna=True)
+ table.loc[table[c].notna(), c]
+ .value_counts()
  .reset_index()
  .iloc[:10]
  .to_dict(orient="records")
  )
- tops = []
- for tb in tops_bruts:
- tops.append(
+ profile[c].update(
+ tops=[
  {
  "count": tb["count"],
  "value": tb[c],
  }
- )
- profile[c].update(
- tops=tops,
- nb_distinct=safe_table[c].nunique(),
- nb_missing_values=len(safe_table[c].loc[safe_table[c].isna()]),
+ for tb in tops_bruts
+ ],
+ nb_distinct=(
+ table[c].nunique()
+ if columns[c]["python_type"] != "json" or not cast_json
+ # a column containing cast json is not serializable
+ else table[c].astype(str).nunique()
+ ),
+ nb_missing_values=len(table[c].loc[table[c].isna()]),
  )
  if verbose:
  display_logs_depending_process_time(
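The nb_distinct special case exists because nunique() hashes the values, and cast JSON cells are Python dicts. A quick standalone illustration of the workaround:

import pandas as pd

s = pd.Series([{"a": 1}, {"a": 1}, {"b": 2}])
# s.nunique() raises TypeError: unhashable type: 'dict'
assert s.astype(str).nunique() == 2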
csv_detective/parsing/columns.py CHANGED
@@ -6,7 +6,7 @@ import pandas as pd

  from csv_detective.utils import display_logs_depending_process_time

- MAX_ROWS_ANALYSIS = int(1e5)
+ MAX_ROWS_ANALYSIS = int(1e4)


  def test_col_val(
csv_detective/parsing/csv.py CHANGED
@@ -32,9 +32,7 @@ def parse_csv(
  if "ISO-8859" in encoding:
  encoding = "ISO-8859-1"
  try:
- table = pd.read_csv(
- the_file, sep=sep, dtype="unicode", encoding=encoding, skiprows=skiprows
- )
+ table = pd.read_csv(the_file, sep=sep, dtype=str, encoding=encoding, skiprows=skiprows)
  total_lines = len(table)
  nb_duplicates = len(table.loc[table.duplicated()])
  if num_rows > 0:
csv_detective/parsing/excel.py CHANGED
@@ -101,7 +101,7 @@ def parse_excel(
  file_path,
  engine="odf",
  sheet_name=None,
- dtype="unicode",
+ dtype=str,
  )
  sizes = {sheet_name: table.size for sheet_name, table in tables.items()}
  sheet_name = max(sizes, key=sizes.get)
@@ -121,7 +121,7 @@
  file_path,
  engine="odf",
  sheet_name=sheet_name,
- dtype="unicode",
+ dtype=str,
  )
  table, header_row_idx = remove_empty_first_rows(table)
  total_lines = len(table)
@@ -152,7 +152,7 @@
  file_path,
  engine=engine,
  sheet_name=sheet_name,
- dtype="unicode",
+ dtype=str,
  )
  table, header_row_idx = remove_empty_first_rows(table)
  total_lines = len(table)
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: csv-detective
- Version: 0.9.0
+ Version: 0.9.1
  Summary: Detect tabular files column content
  Author-email: Etalab <opendatateam@data.gouv.fr>
  License: MIT
@@ -1,4 +1,4 @@
- csv_detective/__init__.py,sha256=XY7pnoNHlocvyUiK8EQpJYPSQt5BRWWJD8KiPlvI9pU,164
+ csv_detective/__init__.py,sha256=FsL6q5F-gKLMnWy05-1CJpa4cz9tquheZ2LS1tjkVgI,162
  csv_detective/cli.py,sha256=mu5anmBmaDk52_uZGiA4T37wYZCuV43gZAepjs1Cqzc,1389
  csv_detective/explore_csv.py,sha256=sEMza4Z27ac88fGq7tUiK1zlfvuftztHhHVoa0c2EVU,9191
  csv_detective/load_tests.py,sha256=uVKweLq3cf-yB5ZZI-m9tBVs_SWNcOw8sDJa97TOJGo,2266
@@ -67,9 +67,9 @@ csv_detective/detect_fields/other/twitter/__init__.py,sha256=Npu6ZbyNfHq1y7xn0Gd
  csv_detective/detect_fields/other/url/__init__.py,sha256=L7h9fZldh1w86XwCx0x3Q1TXSJ_nIId1C-l1yFzZYrA,299
  csv_detective/detect_fields/other/uuid/__init__.py,sha256=XFxbIsdIhRw0dtFxBXQBhicE4yy7P4jmwYXeJhq6FVY,215
  csv_detective/detect_fields/temp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- csv_detective/detect_fields/temp/date/__init__.py,sha256=uVOszufihKqiQmS0wz7nUuQ2Dz-Tq9fSk1nf3S00mg4,1010
- csv_detective/detect_fields/temp/datetime_aware/__init__.py,sha256=bEfWvXx_GNCRUxMGJYqfOK4wRDr3WMaGVAmIa_C2pXE,853
- csv_detective/detect_fields/temp/datetime_naive/__init__.py,sha256=GtQo55SrrXfoT-L7ZXW63jrlAYvNT5m56wMfhuY3pyI,836
+ csv_detective/detect_fields/temp/date/__init__.py,sha256=JtWaK8hkzBaIUc-fu0G7lIFpWqCfraRh6l0Mo65U3b0,2155
+ csv_detective/detect_fields/temp/datetime_aware/__init__.py,sha256=ZDNUcbU0ZJzaxUt0Utc1Y9dRrq4HHW9uCbcnOuz5Sfk,1247
+ csv_detective/detect_fields/temp/datetime_naive/__init__.py,sha256=QoVOA98lT_GVSGO_mQwKtAy2o-REs8C9d6JB9d_L_B4,1189
  csv_detective/detect_fields/temp/datetime_rfc822/__init__.py,sha256=-pFdIIPgaLq2_QbFJ9zwy4YIwZuC73F0A_cNDntTuvQ,512
  csv_detective/detect_fields/temp/year/__init__.py,sha256=gHchVciZExbGZLMBcbBaDXB0IgGptkQc4RhfSOMY0Ww,194
  csv_detective/detect_labels/__init__.py,sha256=93s93DRNeFw9fJiGp0rW3iRWZX3WOeVau2PAaF4QlPE,1777
@@ -132,37 +132,37 @@ csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG
  csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvcaGd0Go,2887
  csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
  csv_detective/detection/engine.py,sha256=1Z4vzjxwPRZ9-vv8nw-zU2sgBZtOsEz0UoKjGaSwVJU,1543
- csv_detective/detection/formats.py,sha256=dzJPdi2rP2jTHZBk9UHpJL3c5N-PSohCymHs-OZt45c,6211
+ csv_detective/detection/formats.py,sha256=YFFEJHhlMw7IMtbotpam1qYt07djnYMHd8j6AvOA3XA,7459
  csv_detective/detection/headers.py,sha256=y5iR4jWH5fUtAH_Zg0zxWSVG_INCHlXJFMbhPpI2YMo,1148
  csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
  csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
  csv_detective/detection/variables.py,sha256=wfsA_MOk14TPMOY7gkvpTGpo9-USzMnFaAou3MPHqxc,3536
- csv_detective/output/__init__.py,sha256=f-UFv_iULpVF_Fy39H4sfACEnrthjK4N3mCAVPkjnKw,1860
- csv_detective/output/dataframe.py,sha256=UpLuSxx_SFbKpem1n-xY7jF16MXGpKQYEWjaSMIiB4s,2215
+ csv_detective/output/__init__.py,sha256=02F5D5TODMiImyZzjnX-vIkMPkUC0ioIryqdBm6xT-w,2056
+ csv_detective/output/dataframe.py,sha256=pjxvpzIWVUW9_xvT3JjoPnOIVUUHnzL7kZo1xQdMDxQ,2139
  csv_detective/output/example.py,sha256=XrnPS_uC0cICn7tgnLWNctpUbnPzl7fIMzNTzJEWGJc,8655
- csv_detective/output/profile.py,sha256=Jeh0mrfH_hAVxV2E5I4XzdCm7ZAGAV_Xj3AXOi77lcA,3130
+ csv_detective/output/profile.py,sha256=k-t--uVHkrt3MRLnRAthiaF069jGc1jsQnfcOoBchrU,2524
  csv_detective/output/schema.py,sha256=5Duw5qnsJ-LaVC6JgF7p1zZAkehDzsbXA4iTSJUgLNM,13760
  csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
  csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- csv_detective/parsing/columns.py,sha256=fbvQMu12gAmz4TnNCL7pLnMFB-mWN_O-zEoj8jEGj0A,5696
+ csv_detective/parsing/columns.py,sha256=HRHJBJ1gftuheegJHzhQmg-u83pVAXXuQ9GKR34mKgk,5696
  csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
- csv_detective/parsing/csv.py,sha256=qZFLOT3YCPoHF0svfVfQBnS8eHtucjDZ7dFITAPgLhc,1626
- csv_detective/parsing/excel.py,sha256=ULUDw76z6hs1Xm2yL9KBM0EOvIsfBLkxwqTZfDEx6aE,7045
+ csv_detective/parsing/csv.py,sha256=fJkjKvyk7InkNnYKtmivyi48mmcwvrha7gvZ5J4-86A,1588
+ csv_detective/parsing/excel.py,sha256=sKD5PRN1TlzPPOKFnZ3VRb0r1yIjPLlpxVWmZQeLYFk,7027
  csv_detective/parsing/load.py,sha256=C3M8nvgWenOb8aDFi5dpDGCoAw9EBqr4EB63zbz2M14,3699
  csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
- csv_detective-0.9.0.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
+ csv_detective-0.9.1.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- tests/test_example.py,sha256=iO4RxMHZxnBAiKm6fsFar5OVg8hYKnqNZCw0SUnEuQQ,1972
- tests/test_fields.py,sha256=Y2mBfV9ZdxTHYwHnkzGbpo1k_qJRLC8nU-zzAUxFmAE,11964
- tests/test_file.py,sha256=YuVbSfeo_ASPiLT8CyxXqJENcDpj4wAFXzLwu_GzsOA,8437
+ tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
+ tests/test_fields.py,sha256=5901OxKDReGMPQm3ZJ36oDjtJ8H3El5jPxf1YNu5wVg,12542
+ tests/test_file.py,sha256=NBLwPCFN2skZHLkckPZ0M0ZvanEdL88KVK1Vi9GhSaU,8925
  tests/test_labels.py,sha256=Y0XlOpztCyV65pk7iAS_nMMfdysoBujlBmz10vHul9A,469
  tests/test_structure.py,sha256=GRDYKy0UcdqlN4qglzsRC0puFj5cb-SVvONjvcPvtAA,1400
  tests/test_validation.py,sha256=ie-Xf0vk6-M6GQq-x7kY5yse1EmXfxQkbaV7fR3fvYo,3308
  venv/bin/activate_this.py,sha256=NRy3waFmwW1pOaNUp33wNN0vD1Kzkd-zXX-Sgl4EiVI,1286
  venv/bin/jp.py,sha256=7z7dvRg0M7HzpZG4ssQID7nScjvQx7bcYTxJWDOrS6E,1717
  venv/bin/runxlrd.py,sha256=YlZMuycM_V_hzNt2yt3FyXPuwouMCmMhvj1oZaBeeuw,16092
- csv_detective-0.9.0.dist-info/METADATA,sha256=rSA0lM-SqevxCztwKsR6K8sRI9jueGZo3yQV2B0-jdU,9759
- csv_detective-0.9.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- csv_detective-0.9.0.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
- csv_detective-0.9.0.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
- csv_detective-0.9.0.dist-info/RECORD,,
+ csv_detective-0.9.1.dist-info/METADATA,sha256=AXtW7yGuAY6Y0XOdIXMTrDmnw9EMDtJbOB3Vl4oai6w,9759
+ csv_detective-0.9.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ csv_detective-0.9.1.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+ csv_detective-0.9.1.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
+ csv_detective-0.9.1.dist-info/RECORD,,
tests/test_example.py CHANGED
@@ -1,7 +1,7 @@
  import re
  from uuid import UUID

- from csv_detective import create_example_csv_file
+ from csv_detective.output.example import create_example_csv_file


  def test_example_creation():
tests/test_fields.py CHANGED
@@ -1,5 +1,6 @@
  from datetime import date as _date
  from datetime import datetime as _datetime
+ from unittest.mock import patch

  import pandas as pd
  import pytest
@@ -98,7 +99,7 @@ def test_detetect_categorical_variable():
  "cat2": categorical_col2,
  "not_cat": not_categorical_col,
  }
- df = pd.DataFrame(df_dict, dtype="unicode")
+ df = pd.DataFrame(df_dict, dtype=str)

  res, _ = detect_categorical_variable(df)
  assert len(res.values) and all(k in res.values for k in ["cat", "cat2"])
@@ -113,8 +114,8 @@ def test_detect_continuous_variable():
  df_dict = {"cont": continuous_col, "not_cont": not_continuous_col}
  df_dict_2 = {"cont": continuous_col_2, "not_cont": not_continuous_col}

- df = pd.DataFrame(df_dict, dtype="unicode")
- df2 = pd.DataFrame(df_dict_2, dtype="unicode")
+ df = pd.DataFrame(df_dict, dtype=str)
+ df2 = pd.DataFrame(df_dict_2, dtype=str)

  res = detect_continuous_variable(df)
  res2 = detect_continuous_variable(df2, continuous_th=0.65)
@@ -441,3 +442,22 @@ def test_priority(args):
  col = "col1"
  output = prepare_output_dict(pd.DataFrame({col: detections}), limited_output=True)
  assert output[col]["format"] == expected
+
+
+ @pytest.mark.parametrize(
+ "args",
+ (
+ ("1996-02-13", date),
+ ("28/01/2000", date),
+ ("2025-08-20T14:30:00+02:00", datetime_aware),
+ ("2025/08/20 14:30:00.2763-12:00", datetime_aware),
+ ("1925_12_20T14:30:00.2763Z", datetime_naive),
+ ("1925 12 20 14:30:00Z", datetime_naive),
+ ),
+ )
+ def test_early_detection(args):
+ value, module = args
+ with patch("csv_detective.detect_fields.temp.date.date_casting") as mock_func:
+ res = module._is(value)
+ assert res
+ mock_func.assert_not_called()
tests/test_file.py CHANGED
@@ -276,3 +276,20 @@ def test_cast_json(mocked_responses, cast_json):
  )
  assert analysis["columns"]["a_simple_dict"]["python_type"] == "json"
  assert isinstance(df["a_simple_dict"][0], expected_type)
+
+
+ def test_almost_uniform_column(mocked_responses):
+ col_name = "int_not_bool"
+ expected_content = f"{col_name}\n" + "9\n" + "1\n" * int(1e7)
+ mocked_responses.get(
+ "http://example.com/test.csv",
+ body=expected_content,
+ status=200,
+ )
+ analysis = routine(
+ file_path="http://example.com/test.csv",
+ num_rows=-1,
+ output_profile=False,
+ save_results=False,
+ )
+ assert analysis["columns"][col_name]["format"] == "int"