csv-detective 0.9.3.dev2215__py3-none-any.whl → 0.9.3.dev2232__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
csv_detective/__init__.py CHANGED
@@ -1,6 +1,7 @@
-from csv_detective.explore_csv import routine, validate_then_detect
+from csv_detective.explore_csv import routine, validate, validate_then_detect
 
 __all__ = [
     "routine",
+    "validate",
     "validate_then_detect",
 ]
@@ -29,7 +29,7 @@ def detect_engine(file_path: str, verbose=False) -> str | None:
     }
     # if none of the above, we move forwards with the csv process
     if is_url(file_path):
-        remote_content = requests.get(file_path).content
+        remote_content = next(requests.get(file_path, stream=True).iter_content(chunk_size=1024))
         engine = mapping.get(magic.from_buffer(remote_content, mime=True))
     else:
         engine = mapping.get(magic.from_file(file_path, mime=True))
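The change above avoids downloading an entire remote file just to sniff its MIME type: only the first streamed 1024-byte chunk is handed to libmagic. A standalone sketch of the same idea (the URL handling, timeout and engine mapping below are illustrative, not the package's code):

```python
import magic
import requests


def sniff_remote_engine(url: str) -> str | None:
    # Stream the response and read only the first chunk before deciding how to parse.
    with requests.get(url, stream=True, timeout=30) as resp:
        first_chunk = next(resp.iter_content(chunk_size=1024))
    mime = magic.from_buffer(first_chunk, mime=True)
    # Placeholder mapping: the real code maps spreadsheet MIME types to pandas engines.
    return {"application/vnd.ms-excel": "xlrd"}.get(mime)
```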
@@ -1,4 +1,3 @@
-import logging
 from collections import defaultdict
 
 import numpy as np
@@ -10,11 +9,12 @@ from csv_detective.detection.variables import (
 )
 from csv_detective.load_tests import return_all_tests
 from csv_detective.output.utils import prepare_output_dict
-from csv_detective.parsing.columns import MAX_ROWS_ANALYSIS, test_col, test_label
-from csv_detective.validate import validate
-
-# above this threshold, a column is not considered categorical
-MAX_NUMBER_CATEGORICAL_VALUES = 25
+from csv_detective.parsing.columns import (
+    MAX_NUMBER_CATEGORICAL_VALUES,
+    test_col,
+    test_col_chunks,
+    test_label,
+)
 
 
 def detect_formats(
@@ -25,36 +25,8 @@ def detect_formats(
     limited_output: bool = True,
     skipna: bool = True,
     verbose: bool = False,
-):
-    on_sample = len(table) > MAX_ROWS_ANALYSIS
-    if on_sample:
-        if verbose:
-            logging.warning(f"File is too long, analysing a sample of {MAX_ROWS_ANALYSIS} rows")
-        table = build_sample(table)
-
-    if table.empty:
-        res_categorical = []
-        # res_continuous = []
-    else:
-        # Detects columns that are categorical
-        res_categorical, categorical_mask = detect_categorical_variable(
-            table,
-            max_number_categorical_values=MAX_NUMBER_CATEGORICAL_VALUES,
-            verbose=verbose,
-        )
-        res_categorical = list(res_categorical)
-        # Detect columns that are continuous (we already know the categorical) :
-        # we don't need this for now, cuts processing time
-        # res_continuous = list(
-        #     detect_continuous_variable(table.iloc[:, ~categorical_mask.values], verbose=verbose)
-        # )
-
-    analysis.update(
-        {
-            "categorical": res_categorical,
-            # "continuous": res_continuous,
-        }
-    )
+) -> tuple[dict, dict[str, pd.Series] | None]:
+    in_chunks = analysis.get("total_lines") is None
 
     # list testing to be performed
     all_tests_fields = return_all_tests(
@@ -66,16 +38,41 @@ def detect_formats(
 
     # if no testing then return
    if not all_tests_fields and not all_tests_labels:
-        return analysis
+        return analysis, None
 
     # Perform testing on fields
-    scores_table_fields = test_col(
-        table, all_tests_fields, limited_output, skipna=skipna, verbose=verbose
-    )
+    if not in_chunks:
+        # table is small enough to be tested in one go
+        scores_table_fields = test_col(
+            table=table,
+            all_tests=all_tests_fields,
+            limited_output=limited_output,
+            skipna=skipna,
+            verbose=verbose,
+        )
+        res_categorical, _ = detect_categorical_variable(
+            table,
+            max_number_categorical_values=MAX_NUMBER_CATEGORICAL_VALUES,
+            verbose=verbose,
+        )
+        analysis["categorical"] = res_categorical
+        col_values = None
+    else:
+        scores_table_fields, analysis, col_values = test_col_chunks(
+            table=table,
+            file_path=file_path,
+            analysis=analysis,
+            all_tests=all_tests_fields,
+            limited_output=limited_output,
+            skipna=skipna,
+            verbose=verbose,
+        )
     analysis["columns_fields"] = prepare_output_dict(scores_table_fields, limited_output)
 
     # Perform testing on labels
-    scores_table_labels = test_label(table, all_tests_labels, limited_output, verbose=verbose)
+    scores_table_labels = test_label(
+        analysis["header"], all_tests_labels, limited_output, verbose=verbose
+    )
     analysis["columns_labels"] = prepare_output_dict(scores_table_labels, limited_output)
 
     # Multiply the results of the fields by 1 + 0.5 * the results of the labels.
@@ -158,57 +155,4 @@ def detect_formats(
     for header, col_metadata in analysis["columns"].items():
         analysis["formats"][col_metadata["format"]].append(header)
 
-    if on_sample:
-        if verbose:
-            logging.warning("Validating that analysis on the sample works on the whole file")
-        is_valid, _, _ = validate(
-            file_path=file_path,
-            previous_analysis=analysis,
-            num_rows=-1,
-            encoding=analysis.get("encoding"),
-            sep=analysis.get("separator"),
-            sheet_name=analysis.get("sheet_name"),
-            verbose=verbose,
-            skipna=skipna,
-        )
-        if not is_valid:
-            raise ValueError("Could not infer detected formats on the whole file")
-
-    return analysis
-
-
-def build_sample(table: pd.DataFrame) -> pd.DataFrame:
-    """
-    building a sample of MAX_ROWS_ANALYSIS rows that contains at least one representative of
-    the min and max values of each column, and one case of NaN if the column contains any.
-    """
-    samples = pd.concat(
-        [
-            # one row with the minimum of the column
-            table.loc[table[col] == val].iloc[[0]]
-            for col in table.columns
-            if not pd.isna(val := table[col].dropna().min())
-        ]
-        + [
-            # one row with the maximum of the column
-            table.loc[table[col] == val].iloc[[0]]
-            for col in table.columns
-            if not pd.isna(val := table[col].dropna().max())
-        ]
-        + [
-            # one row with a NaN value if the column has any
-            table.loc[table[col].isna()].iloc[[0]]
-            for col in table.columns
-            if table[col].isna().any()
-        ],
-        ignore_index=True,
-    )
-    return (
-        pd.concat(
-            [samples, table.sample(n=MAX_ROWS_ANALYSIS - len(samples), random_state=1)],
-            ignore_index=True,
-        )
-        # this is very unlikely but we never know
-        if len(samples) <= MAX_ROWS_ANALYSIS
-        else samples.sample(n=MAX_ROWS_ANALYSIS, random_state=1)
-    )
+    return analysis, col_values
@@ -56,7 +56,7 @@ def detect_categorical_variable(
     threshold_pct_categorical: float = 0.05,
     max_number_categorical_values: int = 25,
     verbose: bool = False,
-):
+) -> tuple[list[str], pd.DataFrame]:
     """
     Heuristically detects whether a table (df) contains categorical values according to
     the number of unique values contained.
@@ -94,4 +94,4 @@ def detect_categorical_variable(
             f"Detected {sum(res)} categorical columns out of {len(table.columns)} in {round(time() - start, 3)}s",
             time() - start,
         )
-    return res.index[res], res
+    return list(res.index[res]), res
@@ -70,7 +70,7 @@ def routine(
         sheet_name=sheet_name,
     )
 
-    analysis = detect_formats(
+    analysis, _col_values = detect_formats(
         table=table,
         analysis=analysis,
         file_path=file_path,
@@ -94,6 +94,7 @@ def routine(
             cast_json=cast_json,
            verbose=verbose,
             sheet_name=sheet_name,
+            _col_values=_col_values,
         )
     finally:
         if verbose:
@@ -121,13 +122,9 @@ def validate_then_detect(
     if is_url(file_path):
         logging.info("Path recognized as a URL")
 
-    is_valid, table, analysis = validate(
+    is_valid, table, analysis, col_values = validate(
         file_path=file_path,
         previous_analysis=previous_analysis,
-        num_rows=num_rows,
-        encoding=previous_analysis.get("encoding"),
-        sep=previous_analysis.get("separator"),
-        sheet_name=previous_analysis.get("sheet_name"),
         verbose=verbose,
         skipna=skipna,
     )
@@ -139,7 +136,7 @@ def validate_then_detect(
         verbose=verbose,
     )
     if not is_valid:
-        analysis = detect_formats(
+        analysis, col_values = detect_formats(
             table=table,
             analysis=analysis,
             file_path=file_path,
@@ -162,6 +159,7 @@ def validate_then_detect(
             cast_json=cast_json,
             verbose=verbose,
             sheet_name=analysis.get("sheet_name"),
+            _col_values=col_values,
         )
     finally:
         if verbose:
@@ -19,7 +19,7 @@ def get_all_packages(detect_type) -> list:
 def return_all_tests(
     user_input_tests: str | list,
     detect_type: str,
-) -> list:
+) -> dict[str, dict]:
     """
     returns all tests that have a method _is and are listed in the user_input_tests
     the function can select a sub_package from csv_detective
@@ -40,6 +40,7 @@ def return_all_tests(
     else:
         tests_to_do = [f"{detect_type}.{x}" for x in user_input_tests if x[0] != "-"]
         tests_skipped = [f"{detect_type}.{x[1:]}" for x in user_input_tests if x[0] == "-"]
+    # removing specified (groups of) tests
     all_tests = [
         # this is why we need to import detect_fields/labels
         eval(x)
@@ -47,6 +48,12 @@ def return_all_tests(
         if any([y == x[: len(y)] for y in tests_to_do])
         and all([y != x[: len(y)] for y in tests_skipped])
     ]
-    # to remove groups of tests
-    all_tests = [test for test in all_tests if "_is" in dir(test)]
-    return all_tests
+    return {
+        test.__name__.split(".")[-1]: {
+            "func": test._is,
+            "prop": test.PROPORTION,
+            "module": test,
+        }
+        for test in all_tests
+        if "_is" in dir(test)
+    }
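return_all_tests() now hands back a registry dict keyed by test name, exposing each test's _is predicate, its PROPORTION and its module, instead of a flat list of modules. A minimal sketch of how a caller might consume such a registry (the helper below is illustrative, not part of the package):

```python
import pandas as pd


def score_column(values: pd.Series, tests: dict[str, dict]) -> dict[str, float]:
    # For each registered test, the share of non-null values its _is predicate accepts.
    non_null = values.dropna().astype(str)
    return {
        name: float(non_null.apply(test["func"]).mean()) if len(non_null) else 0.0
        for name, test in tests.items()
    }
```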
@@ -1,11 +1,12 @@
 import json
 import os
+from typing import Iterator
 
 import pandas as pd
 
 from csv_detective.utils import is_url
 
-from .dataframe import cast_df
+from .dataframe import cast_df_chunks
 from .profile import create_profile
 from .schema import generate_table_schema
 
@@ -23,7 +24,8 @@ def generate_output(
     cast_json: bool = True,
     verbose: bool = False,
     sheet_name: str | int | None = None,
-) -> dict | tuple[dict, pd.DataFrame]:
+    _col_values: dict[str, pd.Series] | None = None,
+) -> dict | tuple[dict, Iterator[pd.DataFrame]]:
     if output_profile:
         analysis["profile"] = create_profile(
             table=table,
@@ -32,6 +34,7 @@ def generate_output(
             limited_output=limited_output,
             cast_json=cast_json,
             verbose=verbose,
+            _col_values=_col_values,
         )
 
     if save_results:
@@ -53,9 +56,10 @@ def generate_output(
     analysis["schema"] = generate_table_schema(analysis, save_results=False, verbose=verbose)
 
     if output_df:
-        return analysis, cast_df(
+        return analysis, cast_df_chunks(
             df=table,
-            columns=analysis["columns"],
+            analysis=analysis,
+            file_path=file_path,
             cast_json=cast_json,
             verbose=verbose,
         )
@@ -1,12 +1,14 @@
 import json
 from datetime import date, datetime
 from time import time
+from typing import Iterator
 
 import pandas as pd
 
 from csv_detective.detect_fields.other.booleen import bool_casting
 from csv_detective.detect_fields.other.float import float_casting
 from csv_detective.detect_fields.temp.date import date_casting
+from csv_detective.parsing.csv import CHUNK_SIZE
 from csv_detective.utils import display_logs_depending_process_time
 
 
@@ -52,3 +54,38 @@ def cast_df(
             time() - start,
         )
     return df
+
+
+def cast_df_chunks(
+    df: pd.DataFrame,
+    analysis: dict,
+    file_path: str,
+    cast_json: bool = True,
+    verbose: bool = False,
+) -> Iterator[pd.DataFrame]:
+    if analysis.get("engine") or analysis["total_lines"] <= CHUNK_SIZE:
+        # the file is loaded in one chunk, so returning the cast df
+        yield cast_df(
+            df=df,
+            columns=analysis["columns"],
+            cast_json=cast_json,
+            verbose=verbose,
+        )
+    else:
+        # loading the csv in chunks using the analysis
+        chunks = pd.read_csv(
+            file_path,
+            dtype=str,
+            sep=analysis["separator"],
+            encoding=analysis["encoding"],
+            skiprows=analysis["header_row_idx"],
+            compression=analysis.get("compression"),
+            chunksize=CHUNK_SIZE,
+        )
+        for chunk in chunks:
+            yield cast_df(
+                df=chunk,
+                columns=analysis["columns"],
+                cast_json=cast_json,
+                verbose=verbose,
+            )
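With output_df=True, generate_output now returns the generator produced by cast_df_chunks instead of a single cast DataFrame, so callers iterate over (or concatenate) the chunks. A caller-side sketch, assuming routine() simply forwards that generator and takes the file path as its first argument:

```python
import pandas as pd

from csv_detective import routine

# Assumption: routine() still exposes output_df and now yields cast chunks lazily.
analysis, chunks = routine("large_file.csv", output_df=True)
# Concatenating defeats the memory saving but shows the new return shape;
# a streaming consumer would instead process each chunk inside a loop.
df = pd.concat(chunks, ignore_index=True)
```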
@@ -10,6 +10,8 @@ import requests
 import rstr
 from faker import Faker
 
+from csv_detective.utils import is_url
+
 fake = Faker()
 
 
@@ -183,7 +185,7 @@ def create_example_csv_file(
     }
 
     if schema_path:
-        if schema_path.startswith("http"):
+        if is_url(schema_path):
            schema = requests.get(schema_path).json()
         else:
             with open(schema_path, encoding=encoding) as jsonfile:
@@ -1,7 +1,9 @@
 import logging
 from collections import defaultdict
 from time import time
+from typing import Optional
 
+import numpy as np
 import pandas as pd
 
 from csv_detective.detect_fields.other.float import float_casting
@@ -15,6 +17,7 @@ def create_profile(
     limited_output: bool = True,
     cast_json: bool = True,
     verbose: bool = False,
+    _col_values: Optional[dict[str, pd.Series]] = None,
 ) -> dict:
     if verbose:
         start = time()
@@ -31,23 +34,51 @@ def create_profile(
     for c in table.columns:
         # for numerical formats we want min, max, mean, std
         if columns[c]["python_type"] in ["float", "int"]:
-            # we locally cast the column to perform the operations, using the same method as in cast_df
-            cast_col = (
-                table[c].astype(pd.Int64Dtype())
-                if columns[c]["python_type"] == "int"
-                else table[c].apply(lambda x: float_casting(x) if isinstance(x, str) else pd.NA)
-            )
-            profile[c].update(
-                min=cast_prevent_nan(cast_col.min(), columns[c]["python_type"]),
-                max=cast_prevent_nan(cast_col.max(), columns[c]["python_type"]),
-                mean=cast_prevent_nan(cast_col.mean(), columns[c]["python_type"]),
-                std=cast_prevent_nan(cast_col.std(), columns[c]["python_type"]),
-            )
+            # if we have read the file in chunks we already have what we need
+            if _col_values is None:
+                # we locally cast the column to perform the operations,
+                # using the same method as in cast_df
+                cast_col = (
+                    table[c].astype(pd.Int64Dtype())
+                    if columns[c]["python_type"] == "int"
+                    else table[c].apply(lambda x: float_casting(x) if isinstance(x, str) else pd.NA)
+                )
+                stats = {
+                    "min": cast_prevent_nan(cast_col.min(), columns[c]["python_type"]),
+                    "mean": cast_prevent_nan(cast_col.mean(), columns[c]["python_type"]),
+                    "max": cast_prevent_nan(cast_col.max(), columns[c]["python_type"]),
+                    "std": cast_prevent_nan(cast_col.std(), columns[c]["python_type"]),
+                }
+            else:
+                cast_col = _col_values[c].reset_index()
+                cast_col = cast_col.loc[cast_col[c].notna()]
+                cast_col[c] = (
+                    cast_col[c].astype(pd.Int64Dtype())
+                    if columns[c]["python_type"] == "int"
+                    else cast_col[c].apply(
+                        lambda x: float_casting(x) if isinstance(x, str) else pd.NA
+                    )
+                )
+                stats = {
+                    "min": cast_prevent_nan(cast_col[c].min(), columns[c]["python_type"]),
+                    "mean": cast_prevent_nan(
+                        (cast_col[c] * cast_col["count"]).sum() / sum(cast_col["count"]),
+                        columns[c]["python_type"],
+                    ),
+                    "max": cast_prevent_nan(cast_col[c].max(), columns[c]["python_type"]),
+                }
+                stats["std"] = cast_prevent_nan(
+                    np.sqrt(
+                        sum(cast_col["count"] * (cast_col[c] - stats["mean"]) ** 2)
+                        / sum(cast_col["count"])
+                    ),
+                    columns[c]["python_type"],
+                )
+            profile[c].update(**stats)
             del cast_col
         # for all formats we want most frequent values, nb unique values and nb missing values
         tops_bruts = (
-            table.loc[table[c].notna(), c]
-            .value_counts()
+            (table[c].value_counts() if _col_values is None else _col_values[c].sort_values())
            .reset_index()
             .iloc[:10]
             .to_dict(orient="records")
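In the chunked branch above, mean and standard deviation are recomputed from the per-value counts accumulated over chunks rather than from the raw column. A small self-contained check of that weighted formula (numbers invented):

```python
import numpy as np
import pandas as pd

# Toy value-count table standing in for one column of _col_values after reset_index().
vc = pd.Series({1.0: 3, 2.0: 5, 4.0: 2}, name="count").rename_axis("x").reset_index()
mean = (vc["x"] * vc["count"]).sum() / vc["count"].sum()
# Std weighted by how often each value occurred (population form, as in the diff above).
std = np.sqrt((vc["count"] * (vc["x"] - mean) ** 2).sum() / vc["count"].sum())

# Expanding the counts back into raw values gives the same statistics.
raw = np.repeat(vc["x"].to_numpy(), vc["count"].to_numpy())
assert np.isclose(mean, raw.mean()) and np.isclose(std, raw.std())
```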
@@ -61,16 +92,25 @@ def create_profile(
                 for tb in tops_bruts
             ],
             nb_distinct=(
-                table[c].nunique()
-                if columns[c]["python_type"] != "json" or not cast_json
-                # a column containing cast json is not serializable
-                else table[c].astype(str).nunique()
+                (
+                    table[c].nunique()
+                    if columns[c]["python_type"] != "json" or not cast_json
+                    # a column containing cast json is not serializable
+                    else table[c].astype(str).nunique()
+                )
+                if _col_values is None
+                else len(_col_values)
+            ),
+            nb_missing_values=(
+                len(table[c].loc[table[c].isna()])
+                if _col_values is None
+                else (_col_values[c].loc[pd.NA] if pd.NA in _col_values[c].index else 0)
             ),
-            nb_missing_values=len(table[c].loc[table[c].isna()]),
         )
     if verbose:
         display_logs_depending_process_time(
             f"Created profile in {round(time() - start, 3)}s",
             time() - start,
         )
+    del _col_values
     return profile