csv-detective 0.7.5.dev1277__py3-none-any.whl → 0.7.5.dev1286__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. csv_detective/__init__.py +1 -1
  2. csv_detective/detect_fields/other/float/__init__.py +4 -4
  3. csv_detective/detection/formats.py +145 -0
  4. csv_detective/explore_csv.py +94 -222
  5. csv_detective/load_tests.py +62 -0
  6. csv_detective/output/__init__.py +64 -0
  7. csv_detective/output/dataframe.py +0 -0
  8. csv_detective/output/example.py +0 -0
  9. csv_detective/output/profile.py +0 -0
  10. csv_detective/output/schema.py +0 -0
  11. csv_detective/output/utils.py +0 -0
  12. csv_detective/utils.py +2 -0
  13. csv_detective/validate.py +70 -0
  14. {csv_detective-0.7.5.dev1277.data → csv_detective-0.7.5.dev1286.data}/data/share/csv_detective/CHANGELOG.md +1 -0
  15. {csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1286.dist-info}/METADATA +1 -1
  16. {csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1286.dist-info}/RECORD +21 -16
  17. tests/test_fields.py +1 -1
  18. tests/test_file.py +19 -9
  19. tests/test_structure.py +6 -0
  20. tests/test_validation.py +18 -0
  21. {csv_detective-0.7.5.dev1277.data → csv_detective-0.7.5.dev1286.data}/data/share/csv_detective/LICENSE.AGPL.txt +0 -0
  22. {csv_detective-0.7.5.dev1277.data → csv_detective-0.7.5.dev1286.data}/data/share/csv_detective/README.md +0 -0
  23. {csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1286.dist-info}/WHEEL +0 -0
  24. {csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1286.dist-info}/entry_points.txt +0 -0
  25. {csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1286.dist-info}/licenses/LICENSE.AGPL.txt +0 -0
  26. {csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1286.dist-info}/top_level.txt +0 -0
csv_detective/__init__.py CHANGED
@@ -1,4 +1,4 @@
- from .explore_csv import routine, routine_minio # noqa
+ from .explore_csv import routine, routine_minio, validate_then_detect # noqa
  from .output.example import create_example_csv_file # noqa

  __version__ = '0.7.5.dev'
csv_detective/detect_fields/other/float/__init__.py CHANGED
@@ -2,16 +2,16 @@ PROPORTION = 1


  def float_casting(val: str) -> float:
-     return float(val.replace(',', '.'))
+     return float(val.replace(",", "."))


  def _is(val):
-     '''Detects floats, assuming that tables will not have scientific
-     notations (3e6) or "+" in the string. "-" is still accepted.'''
+     """Detects floats, assuming that tables will not have scientific
+     notations (3e6) or "+" in the string. "-" is still accepted."""
      try:
          if (
              not isinstance(val, str)
-             or any([k in val for k in ['_', '+', 'e', 'E']])
+             or any([k in val for k in ["_", "+", "e", "E"]])
              or (val.startswith("0") and len(val) > 1 and val[1] not in [".", ","])
          ):
              return False
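A quick illustration of the guard above. Only the rejection branch appears in this hunk, so only negative cases are certain; anything that passes the guard is handled by the rest of _is, which is unchanged here.

from csv_detective.detect_fields.other.float import _is

# Each of these inputs hits the guard shown above, so _is returns False:
for val in ["3e6", "1_000", "+5", "007"]:
    assert _is(val) is False

# Values such as "3,14" or "-2.5" pass this guard and are handled by the rest of _is
# (not part of this hunk), which relies on float_casting to accept the comma separator.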
csv_detective/detection/formats.py ADDED
@@ -0,0 +1,145 @@
+ from collections import defaultdict
+ from typing import Union
+
+ import numpy as np
+ import pandas as pd
+ from csv_detective.detection.variables import (
+     detect_categorical_variable,
+     # detect_continuous_variable,
+ )
+ from csv_detective.load_tests import return_all_tests
+ from csv_detective.output.utils import prepare_output_dict
+ from csv_detective.parsing.columns import test_col, test_label
+
+
+ def detect_formats(
+     table: pd.DataFrame,
+     analysis: dict,
+     user_input_tests: Union[str, list[str]] = "ALL",
+     limited_output: bool = True,
+     skipna: bool = True,
+     verbose: bool = False,
+ ):
+
+     if table.empty:
+         res_categorical = []
+         # res_continuous = []
+     else:
+         # Detects columns that are categorical
+         res_categorical, categorical_mask = detect_categorical_variable(table, verbose=verbose)
+         res_categorical = list(res_categorical)
+         # Detect columns that are continuous (we already know the categorical) :
+         # we don't need this for now, cuts processing time
+         # res_continuous = list(
+         #     detect_continuous_variable(table.iloc[:, ~categorical_mask.values], verbose=verbose)
+         # )
+
+     analysis.update({
+         "categorical": res_categorical,
+         # "continuous": res_continuous,
+     })
+
+     # list testing to be performed
+     all_tests_fields = return_all_tests(
+         user_input_tests, detect_type="detect_fields"
+     ) # list all tests for the fields
+     all_tests_labels = return_all_tests(
+         user_input_tests, detect_type="detect_labels"
+     ) # list all tests for the labels
+
+     # if no testing then return
+     if not all_tests_fields and not all_tests_labels:
+         return analysis
+
+     # Perform testing on fields
+     scores_table_fields = test_col(table, all_tests_fields, limited_output, skipna=skipna, verbose=verbose)
+     analysis["columns_fields"] = prepare_output_dict(scores_table_fields, limited_output)
+
+     # Perform testing on labels
+     scores_table_labels = test_label(table, all_tests_labels, limited_output, verbose=verbose)
+     analysis["columns_labels"] = prepare_output_dict(scores_table_labels, limited_output)
+
+     # Multiply the results of the fields by 1 + 0.5 * the results of the labels.
+     # This is because the fields are more important than the labels and yields a max
+     # of 1.5 for the final score.
+     scores_table = scores_table_fields * (
+         1
+         + scores_table_labels.reindex(
+             index=scores_table_fields.index, fill_value=0
+         ).values / 2
+     )
+
+     # To reduce false positives: ensure these formats are detected only if the label yields
+     # a detection (skipping the ones that have been excluded by the users).
+     formats_with_mandatory_label = [
+         f for f in [
+             "code_departement",
+             "code_commune_insee",
+             "code_postal",
+             "latitude_wgs",
+             "longitude_wgs",
+             "latitude_wgs_fr_metropole",
+             "longitude_wgs_fr_metropole",
+             "latitude_l93",
+             "longitude_l93",
+         ] if f in scores_table.index
+     ]
+     scores_table.loc[formats_with_mandatory_label, :] = np.where(
+         scores_table_labels.loc[formats_with_mandatory_label, :],
+         scores_table.loc[formats_with_mandatory_label, :],
+         0,
+     )
+     analysis["columns"] = prepare_output_dict(scores_table, limited_output)
+
+     metier_to_python_type = {
+         "booleen": "bool",
+         "int": "int",
+         "float": "float",
+         "string": "string",
+         "json": "json",
+         "json_geojson": "json",
+         "datetime": "datetime",
+         "datetime_iso": "datetime",
+         "datetime_rfc822": "datetime",
+         "date": "date",
+         "latitude": "float",
+         "latitude_l93": "float",
+         "latitude_wgs": "float",
+         "latitude_wgs_fr_metropole": "float",
+         "longitude": "float",
+         "longitude_l93": "float",
+         "longitude_wgs": "float",
+         "longitude_wgs_fr_metropole": "float",
+     }
+
+     if not limited_output:
+         for detection_method in ["columns_fields", "columns_labels", "columns"]:
+             analysis[detection_method] = {
+                 col_name: [
+                     {
+                         "python_type": metier_to_python_type.get(
+                             detection["format"], "string"
+                         ),
+                         **detection,
+                     }
+                     for detection in detections
+                 ]
+                 for col_name, detections in analysis[detection_method].items()
+             }
+     else:
+         for detection_method in ["columns_fields", "columns_labels", "columns"]:
+             analysis[detection_method] = {
+                 col_name: {
+                     "python_type": metier_to_python_type.get(
+                         detection["format"], "string"
+                     ),
+                     **detection,
+                 }
+                 for col_name, detection in analysis[detection_method].items()
+             }
+
+     # Add detection with formats as keys
+     analysis["formats"] = defaultdict(list)
+     for header, col_metadata in analysis["columns"].items():
+         analysis["formats"][col_metadata["format"]].append(header)
+     return analysis
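The scoring rule in detect_formats is easiest to see on a toy example. The DataFrames below are made up; only the combination and masking expressions are taken from the code above.

import numpy as np
import pandas as pd

# Toy score matrices: rows are candidate formats, columns are CSV columns (values made up).
fields = pd.DataFrame({"dep": [1.0, 1.0]}, index=["code_departement", "int"])
labels = pd.DataFrame({"dep": [0.0, 0.0]}, index=["code_departement", "int"])

# Field scores are boosted by up to 50% when the label also matches (max final score: 1.5).
scores = fields * (1 + labels.reindex(index=fields.index, fill_value=0).values / 2)

# "code_departement" is in formats_with_mandatory_label: with no label match its score is
# zeroed, so the column falls back to "int" rather than a spurious code format.
scores.loc[["code_departement"], :] = np.where(
    labels.loc[["code_departement"], :], scores.loc[["code_departement"], :], 0
)
print(scores)  # code_departement -> 0.0, int -> 1.0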
csv_detective/explore_csv.py CHANGED
@@ -1,4 +1,3 @@
- from collections import defaultdict
  import json
  import logging
  import os
@@ -6,80 +5,16 @@ import tempfile
  from time import time
  from typing import Optional, Union

- import numpy as np
  import pandas as pd

- # flake8: noqa
- from csv_detective import detect_fields, detect_labels
- from .detection.variables import (
-     detect_categorical_variable,
-     # detect_continuous_variable,
- )
- from .output.dataframe import cast_df
- from .output.profile import create_profile
- from .output.schema import generate_table_schema
- from .output.utils import prepare_output_dict
+ from .detection.formats import detect_formats
+ from .output import generate_output, generate_table_schema
  from .parsing.load import load_file
- from .parsing.columns import test_col, test_label
  from .s3_utils import download_from_minio, upload_to_minio
  from .utils import display_logs_depending_process_time, is_url
+ from .validate import validate

-
- def get_all_packages(detect_type: str) -> list:
-     root_dir = os.path.dirname(os.path.abspath(__file__)) + "/" + detect_type
-     modules = []
-     for dirpath, _, filenames in os.walk(root_dir):
-         for filename in filenames:
-             file = os.path.join(dirpath, filename).replace(root_dir, "")
-             if file.endswith("__init__.py"):
-                 module = (
-                     file.replace("__init__.py", "")
-                     .replace("/", ".").replace("\\", ".")[:-1]
-                 )
-                 if module:
-                     modules.append(detect_type + module)
-     return modules
-
-
- def return_all_tests(
-     user_input_tests: Union[str, list],
-     detect_type: str,
- ) -> list:
-     """
-     returns all tests that have a method _is and are listed in the user_input_tests
-     the function can select a sub_package from csv_detective
-     user_input_tests may look like this:
-     - "ALL": all possible tests are made
-     - "FR.other.siren" (or any other path-like string to one of the tests, or a group of tests, like "FR.geo"):
-       this specifc (group of) test(s) only
-     - ["FR.temp.mois_de_annee", "geo", ...]: only the specified tests will be made ; you may also skip
-       specific (groups of) tests by add "-" at the start (e.g "-temp.date")
-     """
-     assert detect_type in ["detect_fields", "detect_labels"]
-     all_packages = get_all_packages(detect_type=detect_type)
-
-     if isinstance(user_input_tests, str):
-         user_input_tests = [user_input_tests]
-     if "ALL" in user_input_tests or all(x[0] == "-" for x in user_input_tests):
-         tests_to_do = [detect_type]
-     else:
-         tests_to_do = [
-             f"{detect_type}.{x}" for x in user_input_tests if x[0] != "-"
-         ]
-     tests_skipped = [
-         f"{detect_type}.{x[1:]}" for x in user_input_tests if x[0] == "-"
-     ]
-     all_tests = [
-         # this is why we need to import detect_fields/labels
-         eval(x) for x in all_packages
-         if any([y == x[: len(y)] for y in tests_to_do])
-         and all([y != x[: len(y)] for y in tests_skipped])
-     ]
-     # to remove groups of tests
-     all_tests = [
-         test for test in all_tests if "_is" in dir(test)
-     ]
-     return all_tests
+ logging.basicConfig(level=logging.INFO)


  def routine(
@@ -99,7 +34,7 @@ def routine(
      sheet_name: Optional[Union[str, int]] = None,
  ) -> Union[dict, tuple[dict, pd.DataFrame]]:
      """Returns a dict with information about the csv table and possible
-     column contents.
+     column contents, and if requested the DataFrame with columns cast according to analysis.

      Args:
          file_path: local path to CSV file if not using Minio
@@ -112,14 +47,14 @@
          output_schema: whether or not to add the 'schema' field to the output (tableschema)
          output_df: whether or not to return the loaded DataFrame along with the analysis report
          cast_json: whether or not to cast json columns into objects (otherwise they are returned as strings)
-         verbose: whether or not to print process logs in console
+         verbose: whether or not to print process logs in console
          sheet_name: if reading multi-sheet file (xls-like), which sheet to consider
          skipna: whether to keep NaN (empty cells) for tests

      Returns:
          dict: a dict with information about the csv and possible types for each column
      """
-
+
      if not (isinstance(save_results, bool) or (isinstance(save_results, str) and save_results.endswith(".json"))):
          raise ValueError("`save_results` must be a bool or a valid path to a json file.")

@@ -137,168 +72,105 @@
          sheet_name=sheet_name,
      )

-     if table.empty:
-         res_categorical = []
-         # res_continuous = []
-     else:
-         # Detects columns that are categorical
-         res_categorical, categorical_mask = detect_categorical_variable(table, verbose=verbose)
-         res_categorical = list(res_categorical)
-         # Detect columns that are continuous (we already know the categorical) : we don't need this for now, cuts processing time
-         # res_continuous = list(
-         #     detect_continuous_variable(table.iloc[:, ~categorical_mask.values], verbose=verbose)
-         # )
-
-     analysis.update({
-         "categorical": res_categorical,
-         # "continuous": res_continuous,
-     })
-
-     # list testing to be performed
-     all_tests_fields = return_all_tests(
-         user_input_tests, detect_type="detect_fields"
-     ) # list all tests for the fields
-     all_tests_labels = return_all_tests(
-         user_input_tests, detect_type="detect_labels"
-     ) # list all tests for the labels
-
-     # if no testing then return
-     if not all_tests_fields and not all_tests_labels:
-         return analysis
-
-     # Perform testing on fields
-     scores_table_fields = test_col(table, all_tests_fields, limited_output, skipna=skipna, verbose=verbose)
-     analysis["columns_fields"] = prepare_output_dict(scores_table_fields, limited_output)
-
-     # Perform testing on labels
-     scores_table_labels = test_label(table, all_tests_labels, limited_output, verbose=verbose)
-     analysis["columns_labels"] = prepare_output_dict(scores_table_labels, limited_output)
-
-     # Multiply the results of the fields by 1 + 0.5 * the results of the labels.
-     # This is because the fields are more important than the labels and yields a max
-     # of 1.5 for the final score.
-     scores_table = scores_table_fields * (
-         1
-         + scores_table_labels.reindex(
-             index=scores_table_fields.index, fill_value=0
-         ).values / 2
-     )
-
-     # To reduce false positives: ensure these formats are detected only if the label yields
-     # a detection (skipping the ones that have been excluded by the users).
-     formats_with_mandatory_label = [
-         f for f in [
-             "code_departement",
-             "code_commune_insee",
-             "code_postal",
-             "latitude_wgs",
-             "longitude_wgs",
-             "latitude_wgs_fr_metropole",
-             "longitude_wgs_fr_metropole",
-             "latitude_l93",
-             "longitude_l93",
-         ] if f in scores_table.index
-     ]
-     scores_table.loc[formats_with_mandatory_label, :] = np.where(
-         scores_table_labels.loc[formats_with_mandatory_label, :],
-         scores_table.loc[formats_with_mandatory_label, :],
-         0,
+     analysis = detect_formats(
+         table=table,
+         analysis=analysis,
+         user_input_tests=user_input_tests,
+         limited_output=limited_output,
+         skipna=skipna,
+         verbose=verbose,
      )
-     analysis["columns"] = prepare_output_dict(scores_table, limited_output)
-
-     metier_to_python_type = {
-         "booleen": "bool",
-         "int": "int",
-         "float": "float",
-         "string": "string",
-         "json": "json",
-         "json_geojson": "json",
-         "datetime": "datetime",
-         "datetime_iso": "datetime",
-         "datetime_rfc822": "datetime",
-         "date": "date",
-         "latitude": "float",
-         "latitude_l93": "float",
-         "latitude_wgs": "float",
-         "latitude_wgs_fr_metropole": "float",
-         "longitude": "float",
-         "longitude_l93": "float",
-         "longitude_wgs": "float",
-         "longitude_wgs_fr_metropole": "float",
-     }
-
-     if not limited_output:
-         for detection_method in ["columns_fields", "columns_labels", "columns"]:
-             analysis[detection_method] = {
-                 col_name: [
-                     {
-                         "python_type": metier_to_python_type.get(
-                             detection["format"], "string"
-                         ),
-                         **detection,
-                     }
-                     for detection in detections
-                 ]
-                 for col_name, detections in analysis[detection_method].items()
-             }
-     else:
-         for detection_method in ["columns_fields", "columns_labels", "columns"]:
-             analysis[detection_method] = {
-                 col_name: {
-                     "python_type": metier_to_python_type.get(
-                         detection["format"], "string"
-                     ),
-                     **detection,
-                 }
-                 for col_name, detection in analysis[detection_method].items()
-             }

-     # Add detection with formats as keys
-     analysis["formats"] = defaultdict(list)
-     for header, col_metadata in analysis["columns"].items():
-         analysis["formats"][col_metadata["format"]].append(header)
-
-     if output_profile:
-         analysis["profile"] = create_profile(
+     try:
+         return generate_output(
              table=table,
-             dict_cols_fields=analysis["columns"],
+             analysis=analysis,
+             file_path=file_path,
              num_rows=num_rows,
              limited_output=limited_output,
+             save_results=save_results,
+             output_profile=output_profile,
+             output_schema=output_schema,
+             output_df=output_df,
+             cast_json=cast_json,
              verbose=verbose,
+             sheet_name=sheet_name,
          )
+     finally:
+         if verbose:
+             display_logs_depending_process_time(
+                 f"Routine completed in {round(time() - start_routine, 3)}s",
+                 time() - start_routine
+             )

-     if save_results:
-         if isinstance(save_results, str):
-             output_path = save_results
-         else:
-             output_path = os.path.splitext(file_path)[0]
-             if is_url(output_path):
-                 output_path = output_path.split('/')[-1]
-             if analysis.get("sheet_name"):
-                 output_path += "_sheet-" + str(sheet_name)
-             output_path += ".json"
-         with open(output_path, "w", encoding="utf8") as fp:
-             json.dump(analysis, fp, indent=4, separators=(",", ": "), ensure_ascii=False)

-     if output_schema:
-         analysis["schema"] = generate_table_schema(
-             analysis,
-             save_file=False,
-             verbose=verbose
-         )
+ def validate_then_detect(
+     file_path: str,
+     previous_analysis: dict,
+     num_rows: int = 500,
+     user_input_tests: Union[str, list[str]] = "ALL",
+     limited_output: bool = True,
+     save_results: Union[bool, str] = True,
+     encoding: str = None,
+     sep: str = None,
+     skipna: bool = True,
+     output_profile: bool = False,
+     output_schema: bool = False,
+     output_df: bool = False,
+     cast_json: bool = True,
+     verbose: bool = False,
+     sheet_name: Union[str, int] = None,
+ ):
+
      if verbose:
-         display_logs_depending_process_time(
-             f'Routine completed in {round(time() - start_routine, 3)}s',
-             time() - start_routine
+         start_routine = time()
+     if is_url(file_path):
+         logging.info("Path recognized as a URL")
+
+     is_valid, table, analysis = validate(
+         file_path=file_path,
+         previous_analysis=previous_analysis,
+         num_rows=num_rows,
+         encoding=encoding,
+         sep=sep,
+         verbose=verbose,
+         skipna=skipna,
+         sheet_name=sheet_name,
+     )
+     if is_valid:
+         # skipping formats detection as the validation is successful
+         analysis = previous_analysis
+         del analysis["profile"]
+     else:
+         analysis = detect_formats(
+             table=table,
+             analysis=analysis,
+             user_input_tests=user_input_tests,
+             limited_output=limited_output,
+             skipna=skipna,
+             verbose=verbose,
          )
-     if output_df:
-         return analysis, cast_df(
-             df=table,
-             columns=analysis["columns"],
+     try:
+         return generate_output(
+             table=table,
+             analysis=analysis,
+             file_path=file_path,
+             num_rows=num_rows,
+             limited_output=limited_output,
+             save_results=save_results,
+             output_profile=output_profile,
+             output_schema=output_schema,
+             output_df=output_df,
              cast_json=cast_json,
              verbose=verbose,
+             sheet_name=sheet_name,
          )
-     return analysis
+     finally:
+         if verbose:
+             display_logs_depending_process_time(
+                 f"Process completed in {round(time() - start_routine, 3)}s",
+                 time() - start_routine
+             )


  def routine_minio(
@@ -369,8 +241,8 @@ def routine_minio(
          minio_pwd=minio_pwd,
      )

-     analysis = routine(file_path,
-         num_rows,
+     analysis = routine(
+         file_path,
          save_results=True,
          **kwargs,
      )
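A minimal usage sketch of the two entry points after this refactor. The file path is illustrative; the keyword names are taken from the signatures and docstring shown above.

from csv_detective import routine, validate_then_detect

# First pass: full detection on a local CSV, keeping the profile and the cast DataFrame.
analysis, df = routine(
    "data/my_file.csv",
    num_rows=500,
    save_results=False,
    output_profile=True,
    output_df=True,
)

# Later runs: re-check the file against the stored analysis and only fall back to a
# full detection if the columns or their types no longer match.
new_analysis = validate_then_detect(
    "data/my_file.csv",
    previous_analysis=analysis,
    save_results=False,
)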
csv_detective/load_tests.py ADDED
@@ -0,0 +1,62 @@
+ import os
+ from typing import Union
+
+ # flake8: noqa
+ from csv_detective import detect_fields, detect_labels
+
+
+ def get_all_packages(detect_type) -> list:
+     root_dir = os.path.dirname(os.path.abspath(__file__)) + "/" + detect_type
+     modules = []
+     for dirpath, _, filenames in os.walk(root_dir):
+         for filename in filenames:
+             file = os.path.join(dirpath, filename).replace(root_dir, "")
+             if file.endswith("__init__.py"):
+                 module = (
+                     file.replace("__init__.py", "")
+                     .replace("/", ".").replace("\\", ".")[:-1]
+                 )
+                 if module:
+                     modules.append(detect_type + module)
+     return modules
+
+
+ def return_all_tests(
+     user_input_tests: Union[str, list],
+     detect_type: str,
+ ) -> list:
+     """
+     returns all tests that have a method _is and are listed in the user_input_tests
+     the function can select a sub_package from csv_detective
+     user_input_tests may look like this:
+     - "ALL": all possible tests are made
+     - "FR.other.siren" (or any other path-like string to one of the tests, or a group of tests, like "FR.geo"):
+       this specifc (group of) test(s) only
+     - ["FR.temp.mois_de_annee", "geo", ...]: only the specified tests will be made ; you may also skip
+       specific (groups of) tests by add "-" at the start (e.g "-temp.date")
+     """
+     assert detect_type in ["detect_fields", "detect_labels"]
+     all_packages = get_all_packages(detect_type=detect_type)
+
+     if isinstance(user_input_tests, str):
+         user_input_tests = [user_input_tests]
+     if "ALL" in user_input_tests or all(x[0] == "-" for x in user_input_tests):
+         tests_to_do = [detect_type]
+     else:
+         tests_to_do = [
+             f"{detect_type}.{x}" for x in user_input_tests if x[0] != "-"
+         ]
+     tests_skipped = [
+         f"{detect_type}.{x[1:]}" for x in user_input_tests if x[0] == "-"
+     ]
+     all_tests = [
+         # this is why we need to import detect_fields/labels
+         eval(x) for x in all_packages
+         if any([y == x[: len(y)] for y in tests_to_do])
+         and all([y != x[: len(y)] for y in tests_skipped])
+     ]
+     # to remove groups of tests
+     all_tests = [
+         test for test in all_tests if "_is" in dir(test)
+     ]
+     return all_tests
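For example, the selection and skip syntax documented in the docstring can be exercised like this; the chosen groups are the docstring's own examples.

from csv_detective.load_tests import return_all_tests

# All field tests except the date test, skipped with the "-" prefix described above.
field_tests = return_all_tests(["ALL", "-temp.date"], detect_type="detect_fields")

# Only the FR.geo group of field tests.
geo_tests = return_all_tests("FR.geo", detect_type="detect_fields")

print(len(field_tests), [t.__name__ for t in geo_tests][:3])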
csv_detective/output/__init__.py ADDED
@@ -0,0 +1,64 @@
+ import json
+ import os
+ from typing import Union
+
+ import pandas as pd
+
+ from csv_detective.utils import is_url
+ from .dataframe import cast_df
+ from .profile import create_profile
+ from .schema import generate_table_schema
+
+
+ def generate_output(
+     table: pd.DataFrame,
+     analysis: dict,
+     file_path: str,
+     num_rows: int = 500,
+     limited_output: bool = True,
+     save_results: Union[bool, str] = True,
+     output_profile: bool = False,
+     output_schema: bool = False,
+     output_df: bool = False,
+     cast_json: bool = True,
+     verbose: bool = False,
+     sheet_name: Union[str, int] = None,
+ ) -> Union[dict, tuple[dict, pd.DataFrame]]:
+
+     if output_profile:
+         analysis["profile"] = create_profile(
+             table=table,
+             dict_cols_fields=analysis["columns"],
+             num_rows=num_rows,
+             limited_output=limited_output,
+             verbose=verbose,
+         )
+
+     if save_results:
+         if isinstance(save_results, str):
+             output_path = save_results
+         else:
+             output_path = os.path.splitext(file_path)[0]
+             if is_url(output_path):
+                 output_path = output_path.split('/')[-1]
+             if analysis.get("sheet_name"):
+                 output_path += "_sheet-" + str(sheet_name)
+             output_path += ".json"
+         with open(output_path, "w", encoding="utf8") as fp:
+             json.dump(analysis, fp, indent=4, separators=(",", ": "), ensure_ascii=False)
+
+     if output_schema:
+         analysis["schema"] = generate_table_schema(
+             analysis,
+             save_file=False,
+             verbose=verbose
+         )
+
+     if output_df:
+         return analysis, cast_df(
+             df=table,
+             columns=analysis["columns"],
+             cast_json=cast_json,
+             verbose=verbose,
+         )
+     return analysis
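When save_results is True (rather than an explicit path), the report path is derived from file_path as above. A hypothetical trace with an illustrative URL, ignoring the sheet_name branch:

import os

file_path = "https://example.com/data/myfile.csv"  # illustrative URL
output_path = os.path.splitext(file_path)[0]        # "https://example.com/data/myfile"
output_path = output_path.split('/')[-1]            # "myfile", kept local because the path is a URL
output_path += ".json"                              # the report lands in "./myfile.json"
print(output_path)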
csv_detective/output/dataframe.py: File without changes
csv_detective/output/example.py: File without changes
csv_detective/output/profile.py: File without changes
csv_detective/output/schema.py: File without changes
csv_detective/output/utils.py: File without changes
csv_detective/utils.py CHANGED
@@ -2,6 +2,8 @@ import logging
  import math
  from typing import Optional

+ logging.basicConfig(level=logging.INFO)
+

  def display_logs_depending_process_time(prompt: str, duration: float):
      '''
csv_detective/validate.py ADDED
@@ -0,0 +1,70 @@
+ import logging
+ from typing import Union
+
+ import pandas as pd
+
+ from csv_detective.load_tests import return_all_tests
+ from .parsing.load import load_file
+
+ logging.basicConfig(level=logging.INFO)
+
+ tests = {
+     t.__name__.split(".")[-1]: t._is
+     for t in return_all_tests("ALL", "detect_fields")
+ }
+
+
+ def validate(
+     file_path: str,
+     previous_analysis: dict,
+     num_rows: int = 500,
+     encoding: str = None,
+     sep: str = None,
+     verbose: bool = False,
+     skipna: bool = True,
+     sheet_name: Union[str, int] = None,
+ ) -> tuple[bool, pd.DataFrame, dict]:
+     """
+     Verify is the given file has the same fields and types as in the previous analysis.
+     """
+     table, analysis = load_file(
+         file_path=file_path,
+         num_rows=num_rows,
+         encoding=encoding,
+         sep=sep,
+         verbose=verbose,
+         sheet_name=sheet_name,
+     )
+     if verbose:
+         logging.info("Comparing table with the previous analysis")
+         logging.info("- Checking if all columns match")
+     if (
+         any(col_name not in list(table.columns) for col_name in previous_analysis["columns"])
+         or any(col_name not in list(previous_analysis["columns"].keys()) for col_name in table.columns)
+     ):
+         logging.warning("> Columns do not match, proceeding with full analysis")
+         return False, table, analysis
+     for col_name, args in previous_analysis["columns"].items():
+         if verbose:
+             logging.info(f"- Testing {col_name} for {args['format']}")
+         if args["format"] == "string":
+             # no test for columns that have not been recognized as a specific format
+             continue
+         test_func = tests[args["format"]]
+         col_data = table[col_name]
+         if skipna:
+             col_data = col_data.loc[~col_data.isna()]
+         if not col_data.apply(test_func).all():
+             logging.warning("> Test failed, proceeding with full analysis")
+             return False, table, analysis
+     if verbose:
+         logging.info("> All checks successful")
+     return True, table, analysis | {
+         k: previous_analysis[k] for k in [
+             "categorical",
+             "columns",
+             "columns_fields",
+             "columns_labels",
+             "formats",
+         ]
+     }
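A minimal sketch of the validation flow on its own, assuming a report previously saved by routine(save_results=True) sits next to the file; the paths are illustrative.

import json

from csv_detective.validate import validate

# Illustrative paths: a CSV and the JSON report from a previous routine() run.
with open("data/my_file.json") as f:
    previous_analysis = json.load(f)

is_valid, table, analysis = validate("data/my_file.csv", previous_analysis=previous_analysis)
if is_valid:
    # analysis reuses the previously detected columns and formats
    print("Still valid, detected formats:", list(analysis["formats"]))
else:
    print("Columns or types drifted, a full detection is needed")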
{csv_detective-0.7.5.dev1277.data → csv_detective-0.7.5.dev1286.data}/data/share/csv_detective/CHANGELOG.md CHANGED
@@ -13,6 +13,7 @@
  - Handle csv.gz files [#110](https://github.com/datagouv/csv-detective/pull/110)
  - Refactor file tests [#110](https://github.com/datagouv/csv-detective/pull/110)
  - Restructure repo (breaking changes) [#111](https://github.com/datagouv/csv-detective/pull/111)
+ - Add validation function and associated flow [#112](https://github.com/datagouv/csv-detective/pull/112)
  - Better float detection [#113](https://github.com/datagouv/csv-detective/pull/113)

  ## 0.7.4 (2024-11-15)
{csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1286.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: csv_detective
- Version: 0.7.5.dev1277
+ Version: 0.7.5.dev1286
  Summary: Detect CSV column content
  Home-page: https://github.com/etalab/csv_detective
  Author: Etalab
{csv_detective-0.7.5.dev1277.dist-info → csv_detective-0.7.5.dev1286.dist-info}/RECORD CHANGED
@@ -1,8 +1,10 @@
- csv_detective/__init__.py,sha256=GCHgu0BhH5ACV7cf-1gDr9nRyvSoeQ1vRw9SjEHeMT4,143
+ csv_detective/__init__.py,sha256=vpK7WMkIQbcJzu6HKOwcn7PpHsNCCaXZ1YLMS5Wq9tM,165
  csv_detective/cli.py,sha256=itooHtpyfC6DUsL_DchPKe1xo7m0MYJIp1L4R8eqoTk,1401
- csv_detective/explore_csv.py,sha256=FmgJ2h1SxV8b_wOWia4xsswyVJTlCCW66e0nhltz-0s,14511
+ csv_detective/explore_csv.py,sha256=ocWlUEtuwZ-6bjDc6gfhC2-6DljMVhvXhHrfICCXGfQ,8986
+ csv_detective/load_tests.py,sha256=GILvfkd4OVI-72mA4nzbPlZqgcXZ4wznOhGfZ1ucWkM,2385
  csv_detective/s3_utils.py,sha256=1cIVdQUYY2ovErbMwp72Gqtqx2bkB8nfVhn-QaOFTT0,1451
- csv_detective/utils.py,sha256=KAYfSJXnPuAXnSc38Jm57oQ_JP_0kUkmI1OV6gN5_ys,1116
+ csv_detective/utils.py,sha256=Bx_1k4Sdpd5PCjuAy4AeayCmmw7TMR_zgtKIHNLi5g0,1157
+ csv_detective/validate.py,sha256=o4Qulf8E-x1zsWT9OD4Fpw83Gku1WA3JlX83j7bu0DA,2314
  csv_detective/detect_fields/__init__.py,sha256=NVfE3BQVExgXb-BPbhDvlkM5-0naEVLpZ4aM_OGHYfE,931
  csv_detective/detect_fields/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  csv_detective/detect_fields/FR/geo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -53,7 +55,7 @@ csv_detective/detect_fields/geo/longitude_wgs/__init__.py,sha256=G7afWOKiGh_Tv7g
  csv_detective/detect_fields/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  csv_detective/detect_fields/other/booleen/__init__.py,sha256=wn_yyTAmGxqo0l0b7JRpGb0da_E27iGxES9zWCrnsqc,497
  csv_detective/detect_fields/other/email/__init__.py,sha256=O9tgJmq0O8Q-8iin63NqEEDhlsUJjxFZNaNFM4GZaws,178
- csv_detective/detect_fields/other/float/__init__.py,sha256=7bXuPAmBuIhKJEhq7d20B60WVol1AUpqRkWhreQpWfU,578
+ csv_detective/detect_fields/other/float/__init__.py,sha256=AT4Kpgwoz5PuAoLx00u0SL8DjjXZxsE8zSRbN18uAv4,578
  csv_detective/detect_fields/other/int/__init__.py,sha256=QN3kQJLYqLRBiubUK7g4Xq03PlA5wqVwx2pPPIO9FdI,320
  csv_detective/detect_fields/other/json/__init__.py,sha256=DhzyvT12kOqgum89silIu3uoSYXmC_s_AaxLtXAD4eU,540
  csv_detective/detect_fields/other/mongo_object_id/__init__.py,sha256=7fcrHsOZAqXp2_N0IjPskYJ_qi4xRlo9iyNNDQVLzsU,156
@@ -126,10 +128,12 @@ csv_detective/detect_labels/temp/year/__init__.py,sha256=3U9j8Hux432KdGtIyArq_-v
  csv_detective/detection/columns.py,sha256=vfE-DKESA6J9Rfsl-a8tjgZfE21VmzArO5TrbzL0KmE,2905
  csv_detective/detection/encoding.py,sha256=tpjJEMNM_2TcLXDzn1lNQPnSRnsWYjs83tQ8jNwTj4E,973
  csv_detective/detection/engine.py,sha256=HiIrU-l9EO5Fbc2Vh8W_Uy5-dpKcQQzlxCqMuWc09LY,1530
+ csv_detective/detection/formats.py,sha256=VwFazRAFJN6eaYUK7IauVU88vuUBHccESY4UD8EgGUo,5386
  csv_detective/detection/headers.py,sha256=wrVII2RQpsVmHhrO1DHf3dmiu8kbtOjBlskf41cnQmc,1172
  csv_detective/detection/rows.py,sha256=3qvsbsBcMxiqqfSYYkOgsRpX777rk22tnRHDwUA97kU,742
  csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
  csv_detective/detection/variables.py,sha256=3qEMtjZ_zyIFXvTnFgK7ZMDx8C12uQXKfFjEj2moyJc,3558
+ csv_detective/output/__init__.py,sha256=XDS4Dgvv6oloIao9JquHa0m1nnlQ_q2gHuEPGlaETic,1890
  csv_detective/output/dataframe.py,sha256=89iQRE59cHQyQQEsujQVIKP2YAUYpPklWkdDOqZE-wE,2183
  csv_detective/output/example.py,sha256=i8PkdXxidF7qR_9aK8vh12JpZdJQryhBgyrMS8iy5rk,8642
  csv_detective/output/profile.py,sha256=B8YU541T_YPDezJGh4dkHckOShiwHSrZd9GS8jbmz7A,2919
@@ -141,18 +145,19 @@ csv_detective/parsing/csv.py,sha256=11mibDnJhIjykXLGZvA5ZEU5U7KgxIrbyO6BNv6jlro,
  csv_detective/parsing/excel.py,sha256=AslE2S1e67o8yTIAIhp-lAnJ6-XqeBBRz1-VMFqhZBM,7055
  csv_detective/parsing/load.py,sha256=SpP0pfxswOAPPpwbZfoP1blh0EKV5VMs0TpTgQJKzjs,3621
  csv_detective/parsing/text.py,sha256=rsfk66BCmdpsCOd0kDJ8tmqMsEWd-OeBkEisWc4Ej9k,1246
- csv_detective-0.7.5.dev1277.data/data/share/csv_detective/CHANGELOG.md,sha256=tgIIm6s4qoP4RGJK1cmqf-Cm5aHmXmBrwi37NVIYedg,7796
- csv_detective-0.7.5.dev1277.data/data/share/csv_detective/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
- csv_detective-0.7.5.dev1277.data/data/share/csv_detective/README.md,sha256=Qr8xRXc-dxQ-tdXCpCTCKp1Uliqq84r0UOlPRNuGCpI,9506
- csv_detective-0.7.5.dev1277.dist-info/licenses/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
+ csv_detective-0.7.5.dev1286.data/data/share/csv_detective/CHANGELOG.md,sha256=Gqw7W41bXK_JgIYi80vdOPR6JLY5rgABeNsiDStE4XA,7901
+ csv_detective-0.7.5.dev1286.data/data/share/csv_detective/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
+ csv_detective-0.7.5.dev1286.data/data/share/csv_detective/README.md,sha256=Qr8xRXc-dxQ-tdXCpCTCKp1Uliqq84r0UOlPRNuGCpI,9506
+ csv_detective-0.7.5.dev1286.dist-info/licenses/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  tests/test_example.py,sha256=0NfChooJQlFxTo2nY5FOQIcsK4zzWA_SBmt2LwVQovY,2014
- tests/test_fields.py,sha256=LPLx09cX5u9XHAh65XvTgIqzKylToiHZxXzKhpV0wsk,11148
- tests/test_file.py,sha256=EleTssys5fCP4N0W1eTZN35uijzoF15e3dIcuIlrMsk,7865
+ tests/test_fields.py,sha256=53kiUQiqGt4_fnyCoxhNLeEsuN1LRDB-7HGT3p_Ed9I,11147
+ tests/test_file.py,sha256=9APE1d43lQ8Dk8lwJFNUK_YekYYsQ0ae2_fgpcPE9mk,8116
  tests/test_labels.py,sha256=6MOKrGznkwU5fjZ_3oiB6Scmb480Eu-9geBJs0UDLds,159
- tests/test_structure.py,sha256=SVsnluVoIIprYw_67I1_gB3cp9m1wlO8C7SpdsLW8cM,1161
- csv_detective-0.7.5.dev1277.dist-info/METADATA,sha256=RgcnqpKqQ1us0lmVf6McKYJs38DC1sqvAh10XgnJOY8,1386
- csv_detective-0.7.5.dev1277.dist-info/WHEEL,sha256=pxyMxgL8-pra_rKaQ4drOZAegBVuX-G_4nRHjjgWbmo,91
- csv_detective-0.7.5.dev1277.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
- csv_detective-0.7.5.dev1277.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
- csv_detective-0.7.5.dev1277.dist-info/RECORD,,
+ tests/test_structure.py,sha256=bv-tjgXohvQAxwmxzH0BynFpK2TyPjcxvtIAmIRlZmA,1393
+ tests/test_validation.py,sha256=VwtBcnGAQ_eSFrBibWnMSTDjuy6y2JLlqvc3Zb667NY,479
+ csv_detective-0.7.5.dev1286.dist-info/METADATA,sha256=rLptgL-FkLZzfkxPt7_0I-k7EKPKbEHhd3Ei2qt54KI,1386
+ csv_detective-0.7.5.dev1286.dist-info/WHEEL,sha256=pxyMxgL8-pra_rKaQ4drOZAegBVuX-G_4nRHjjgWbmo,91
+ csv_detective-0.7.5.dev1286.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+ csv_detective-0.7.5.dev1286.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
+ csv_detective-0.7.5.dev1286.dist-info/RECORD,,
tests/test_fields.py CHANGED
@@ -48,7 +48,7 @@ from csv_detective.detection.variables import (
      detect_continuous_variable,
      detect_categorical_variable,
  )
- from csv_detective.explore_csv import return_all_tests
+ from csv_detective.load_tests import return_all_tests
  from csv_detective.output.dataframe import cast


tests/test_file.py CHANGED
@@ -28,7 +28,7 @@ def test_columns_output_on_file():
          "STRUCTURED_INFO",
          "GEO_INFO",
      ]
-     assert output["total_lines"] == 414
+     assert output["total_lines"] == 404
      assert output["nb_duplicates"] == 7
      assert output["columns"]["NOMCOM"]["format"] == "commune"
      assert output["columns"]["NOMDEP"]["format"] == "departement"
@@ -48,7 +48,7 @@ def test_profile_output_on_file():
      )
      assert all(
          [
-             c in list(output["profile"]["NUMCOM"].keys())
+             c in list(output["profile"]["TXCOUVGLO_COM_2014"].keys())
              for c in [
                  "min",
                  "max",
@@ -60,12 +60,22 @@
              ]
          ]
      )
-     assert len(output["profile"]["NOMCOM"].keys()) == 3
-     assert output["profile"]["NUMCOM"]["min"] == 1001
-     assert output["profile"]["NUMCOM"]["max"] == 6125
-     assert round(output["profile"]["NUMCOM"]["mean"]) == 1245
-     assert round(output["profile"]["NUMCOM"]["std"]) == 363
-     assert output["profile"]["TXCOUVGLO_COM_2014"]["nb_distinct"] == 296
+     assert not any(
+         [
+             c in list(output["profile"]["NUMCOM"].keys())
+             for c in [
+                 "min",
+                 "max",
+                 "mean",
+                 "std",
+             ]
+         ]
+     )
+     assert output["profile"]["TXCOUVGLO_COM_2014"]["min"] == 0.0
+     assert output["profile"]["TXCOUVGLO_COM_2014"]["max"] == 200.2
+     assert round(output["profile"]["TXCOUVGLO_COM_2014"]["mean"]) == 60
+     assert round(output["profile"]["TXCOUVGLO_COM_2014"]["std"]) == 36
+     assert output["profile"]["TXCOUVGLO_COM_2014"]["nb_distinct"] == 290
      assert output["profile"]["TXCOUVGLO_COM_2014"]["nb_missing_values"] == 3
      assert output["profile"]["GEO_INFO"]["nb_distinct"] == 1

@@ -175,7 +185,7 @@ def mocked_responses():
      "params",
      # ideally we'd like to do the same with params_others but pandas.read_excel uses urllib
      # which doesn't support the way we mock the response, TBC
-     params_csv + [("a_test_file.csv", {"separator": ";", "header_row_idx": 2, "total_lines": 414})]
+     params_csv + [("a_test_file.csv", {"separator": ";", "header_row_idx": 2, "total_lines": 404})]
  )
  def test_urls(mocked_responses, params):
      file_name, checks = params
tests/test_structure.py CHANGED
@@ -1,6 +1,7 @@
  import os
  # flake8: noqa
  from csv_detective import detect_fields, detect_labels
+ from csv_detective.load_tests import return_all_tests


  def tests_conformity():
@@ -29,3 +30,8 @@ def tests_conformity():
                  .replace("/", ".")
              )
          assert "_is" in dir(_package)
+
+
+ def test_all_tests_have_unique_name():
+     names = [t.__name__.split(".")[-1] for t in return_all_tests("ALL", "detect_fields")]
+     assert len(names) == len(set(names))
tests/test_validation.py ADDED
@@ -0,0 +1,18 @@
+ import json
+
+ import pandas as pd
+
+ from csv_detective.validate import validate
+
+
+ def test_validation():
+     with open("tests/data/a_test_file.json", "r") as f:
+         previous_analysis = json.load(f)
+     is_valid, table, analysis = validate(
+         "tests/data/a_test_file.csv",
+         previous_analysis=previous_analysis,
+         num_rows=-1,
+     )
+     assert is_valid is True
+     assert isinstance(table, pd.DataFrame)
+     assert isinstance(analysis, dict)