csv-detective 0.7.5.dev1244__py3-none-any.whl → 0.7.5.dev1286__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. csv_detective/__init__.py +1 -1
  2. csv_detective/cli.py +34 -19
  3. csv_detective/detect_fields/other/float/__init__.py +4 -4
  4. csv_detective/detection/formats.py +145 -0
  5. csv_detective/explore_csv.py +101 -240
  6. csv_detective/load_tests.py +62 -0
  7. csv_detective/output/__init__.py +64 -0
  8. csv_detective/output/dataframe.py +0 -0
  9. csv_detective/output/example.py +0 -0
  10. csv_detective/output/profile.py +0 -0
  11. csv_detective/output/schema.py +0 -0
  12. csv_detective/output/utils.py +0 -0
  13. csv_detective/utils.py +2 -0
  14. csv_detective/validate.py +70 -0
  15. {csv_detective-0.7.5.dev1244.data → csv_detective-0.7.5.dev1286.data}/data/share/csv_detective/CHANGELOG.md +2 -0
  16. {csv_detective-0.7.5.dev1244.dist-info → csv_detective-0.7.5.dev1286.dist-info}/METADATA +1 -1
  17. {csv_detective-0.7.5.dev1244.dist-info → csv_detective-0.7.5.dev1286.dist-info}/RECORD +22 -17
  18. {csv_detective-0.7.5.dev1244.dist-info → csv_detective-0.7.5.dev1286.dist-info}/WHEEL +1 -1
  19. tests/test_fields.py +1 -1
  20. tests/test_file.py +19 -9
  21. tests/test_structure.py +6 -0
  22. tests/test_validation.py +18 -0
  23. {csv_detective-0.7.5.dev1244.data → csv_detective-0.7.5.dev1286.data}/data/share/csv_detective/LICENSE.AGPL.txt +0 -0
  24. {csv_detective-0.7.5.dev1244.data → csv_detective-0.7.5.dev1286.data}/data/share/csv_detective/README.md +0 -0
  25. {csv_detective-0.7.5.dev1244.dist-info → csv_detective-0.7.5.dev1286.dist-info}/entry_points.txt +0 -0
  26. {csv_detective-0.7.5.dev1244.dist-info → csv_detective-0.7.5.dev1286.dist-info}/licenses/LICENSE.AGPL.txt +0 -0
  27. {csv_detective-0.7.5.dev1244.dist-info → csv_detective-0.7.5.dev1286.dist-info}/top_level.txt +0 -0
csv_detective/__init__.py CHANGED
@@ -1,4 +1,4 @@
-from .explore_csv import routine, routine_minio # noqa
+from .explore_csv import routine, routine_minio, validate_then_detect # noqa
 from .output.example import create_example_csv_file # noqa
 
 __version__ = '0.7.5.dev'
csv_detective/cli.py CHANGED
@@ -8,37 +8,52 @@ from .explore_csv import routine
 
 
 def run():
-    explorer = argparse.ArgumentParser(description='Get the arguments we want')
+    explorer = argparse.ArgumentParser(description="Analyse a tabular file")
     explorer.add_argument(
-        'file_path',
+        "file_path",
         type=str,
-        help='Enter path of csv file to explore'
+        help="Enter path of tabular file to explore"
     )
     explorer.add_argument(
-        '-n',
-        '--num_rows',
-        dest='num_rows',
+        "-n",
+        "--num_rows",
+        dest="num_rows",
         type=int,
-        nargs='?',
-        help='Number of rows to use for detection'
+        nargs="?",
+        help="Number of rows to use for detection (default 500)"
     )
     explorer.add_argument(
-        '-t',
-        '--select_tests',
-        dest='city',
+        "-s",
+        "--sep",
+        dest="sep",
         type=str,
-        nargs='*',
-        help='List of tests to be performed (use "" if you want to use the dash option to remove tests)'
+        nargs="?",
+        help="Columns separator (detected if not specified)"
+    )
+    explorer.add_argument(
+        "--save",
+        dest="save_results",
+        type=int,
+        nargs="?",
+        help="Whether to save the resulting analysis to json (1 = save, 0 = don't)"
+    )
+    explorer.add_argument(
+        "-v",
+        "--verbose",
+        dest="verbose",
+        type=int,
+        nargs="?",
+        help="Verbose (0 = quiet, 1 = details)"
     )
 
     opts = explorer.parse_args()
 
-    num_rows = opts.num_rows or 50
     inspection_results = routine(
-        opts.file_path,
-        num_rows=num_rows,
-        user_input_tests='ALL',
-        output_mode='ALL'
+        csv_file_path=opts.file_path,
+        num_rows=opts.num_rows,
+        sep=opts.sep,
+        save_results=bool(opts.save_results),
+        verbose=bool(opts.verbose),
    )
 
-    print(json.dumps(inspection_results, indent=4, sort_keys=True, ensure_ascii=False))
+    print(json.dumps(inspection_results, indent=4, ensure_ascii=False))
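For reference, the new CLI flags map directly onto `routine` keyword arguments, as the hunk above shows. A minimal Python sketch of the equivalent call (the file path is a placeholder; `csv_file_path` is the keyword the CLI now passes):

```python
import json

from csv_detective import routine

# Mirrors what run() now does once argparse has collected the options;
# "my_file.csv" is a hypothetical path.
inspection_results = routine(
    csv_file_path="my_file.csv",
    num_rows=500,
    sep=";",
    save_results=False,
    verbose=True,
)
print(json.dumps(inspection_results, indent=4, ensure_ascii=False))
```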
csv_detective/detect_fields/other/float/__init__.py CHANGED
@@ -2,16 +2,16 @@ PROPORTION = 1
 
 
 def float_casting(val: str) -> float:
-    return float(val.replace(',', '.'))
+    return float(val.replace(",", "."))
 
 
 def _is(val):
-    '''Detects floats, assuming that tables will not have scientific
-    notations (3e6) or "+" in the string. "-" is still accepted.'''
+    """Detects floats, assuming that tables will not have scientific
+    notations (3e6) or "+" in the string. "-" is still accepted."""
     try:
         if (
             not isinstance(val, str)
-            or any([k in val for k in ['_', '+', 'e', 'E']])
+            or any([k in val for k in ["_", "+", "e", "E"]])
             or (val.startswith("0") and len(val) > 1 and val[1] not in [".", ","])
         ):
             return False
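A quick illustration of the tightened float detection, assuming the unchanged tail of `_is` (not shown in this hunk) simply attempts `float_casting` and reports whether it succeeds:

```python
from csv_detective.detect_fields.other.float import _is

# A comma is accepted as decimal separator via float_casting();
# scientific notation, "+", "_" and zero-padded values are rejected upfront.
print(_is("3,14"), _is("-2.5"))              # expected truthy
print(_is("3e6"), _is("1_000"), _is("007"))  # expected falsy
```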
csv_detective/detection/formats.py ADDED
@@ -0,0 +1,145 @@
+from collections import defaultdict
+from typing import Union
+
+import numpy as np
+import pandas as pd
+from csv_detective.detection.variables import (
+    detect_categorical_variable,
+    # detect_continuous_variable,
+)
+from csv_detective.load_tests import return_all_tests
+from csv_detective.output.utils import prepare_output_dict
+from csv_detective.parsing.columns import test_col, test_label
+
+
+def detect_formats(
+    table: pd.DataFrame,
+    analysis: dict,
+    user_input_tests: Union[str, list[str]] = "ALL",
+    limited_output: bool = True,
+    skipna: bool = True,
+    verbose: bool = False,
+):
+
+    if table.empty:
+        res_categorical = []
+        # res_continuous = []
+    else:
+        # Detects columns that are categorical
+        res_categorical, categorical_mask = detect_categorical_variable(table, verbose=verbose)
+        res_categorical = list(res_categorical)
+        # Detect columns that are continuous (we already know the categorical) :
+        # we don't need this for now, cuts processing time
+        # res_continuous = list(
+        #     detect_continuous_variable(table.iloc[:, ~categorical_mask.values], verbose=verbose)
+        # )
+
+    analysis.update({
+        "categorical": res_categorical,
+        # "continuous": res_continuous,
+    })
+
+    # list testing to be performed
+    all_tests_fields = return_all_tests(
+        user_input_tests, detect_type="detect_fields"
+    ) # list all tests for the fields
+    all_tests_labels = return_all_tests(
+        user_input_tests, detect_type="detect_labels"
+    ) # list all tests for the labels
+
+    # if no testing then return
+    if not all_tests_fields and not all_tests_labels:
+        return analysis
+
+    # Perform testing on fields
+    scores_table_fields = test_col(table, all_tests_fields, limited_output, skipna=skipna, verbose=verbose)
+    analysis["columns_fields"] = prepare_output_dict(scores_table_fields, limited_output)
+
+    # Perform testing on labels
+    scores_table_labels = test_label(table, all_tests_labels, limited_output, verbose=verbose)
+    analysis["columns_labels"] = prepare_output_dict(scores_table_labels, limited_output)
+
+    # Multiply the results of the fields by 1 + 0.5 * the results of the labels.
+    # This is because the fields are more important than the labels and yields a max
+    # of 1.5 for the final score.
+    scores_table = scores_table_fields * (
+        1
+        + scores_table_labels.reindex(
+            index=scores_table_fields.index, fill_value=0
+        ).values / 2
+    )
+
+    # To reduce false positives: ensure these formats are detected only if the label yields
+    # a detection (skipping the ones that have been excluded by the users).
+    formats_with_mandatory_label = [
+        f for f in [
+            "code_departement",
+            "code_commune_insee",
+            "code_postal",
+            "latitude_wgs",
+            "longitude_wgs",
+            "latitude_wgs_fr_metropole",
+            "longitude_wgs_fr_metropole",
+            "latitude_l93",
+            "longitude_l93",
+        ] if f in scores_table.index
+    ]
+    scores_table.loc[formats_with_mandatory_label, :] = np.where(
+        scores_table_labels.loc[formats_with_mandatory_label, :],
+        scores_table.loc[formats_with_mandatory_label, :],
+        0,
+    )
+    analysis["columns"] = prepare_output_dict(scores_table, limited_output)
+
+    metier_to_python_type = {
+        "booleen": "bool",
+        "int": "int",
+        "float": "float",
+        "string": "string",
+        "json": "json",
+        "json_geojson": "json",
+        "datetime": "datetime",
+        "datetime_iso": "datetime",
+        "datetime_rfc822": "datetime",
+        "date": "date",
+        "latitude": "float",
+        "latitude_l93": "float",
+        "latitude_wgs": "float",
+        "latitude_wgs_fr_metropole": "float",
+        "longitude": "float",
+        "longitude_l93": "float",
+        "longitude_wgs": "float",
+        "longitude_wgs_fr_metropole": "float",
+    }
+
+    if not limited_output:
+        for detection_method in ["columns_fields", "columns_labels", "columns"]:
+            analysis[detection_method] = {
+                col_name: [
+                    {
+                        "python_type": metier_to_python_type.get(
+                            detection["format"], "string"
+                        ),
+                        **detection,
+                    }
+                    for detection in detections
+                ]
+                for col_name, detections in analysis[detection_method].items()
+            }
+    else:
+        for detection_method in ["columns_fields", "columns_labels", "columns"]:
+            analysis[detection_method] = {
+                col_name: {
+                    "python_type": metier_to_python_type.get(
+                        detection["format"], "string"
+                    ),
+                    **detection,
+                }
+                for col_name, detection in analysis[detection_method].items()
+            }
+
+    # Add detection with formats as keys
+    analysis["formats"] = defaultdict(list)
+    for header, col_metadata in analysis["columns"].items():
+        analysis["formats"][col_metadata["format"]].append(header)
+    return analysis
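A standalone toy example of the scoring rule implemented above (not the library's real score tables): field scores are boosted by up to 50% by label scores, and formats that require a label match are zeroed when the label score is 0.

```python
import numpy as np
import pandas as pd

# Rows are candidate formats, columns are the file's columns (toy values).
fields = pd.DataFrame({"col_a": [1.0, 0.8]}, index=["code_postal", "int"])
labels = pd.DataFrame({"col_a": [0.0, 0.0]}, index=["code_postal", "int"])

# Final score = field score * (1 + label score / 2), so at most 1.5 by construction.
scores = fields * (1 + labels.reindex(index=fields.index, fill_value=0).values / 2)

# "code_postal" is in the mandatory-label list, so a zero label score wipes it out.
mandatory = ["code_postal"]
scores.loc[mandatory, :] = np.where(labels.loc[mandatory, :], scores.loc[mandatory, :], 0)
print(scores)  # code_postal -> 0.0, int keeps its field score of 0.8
```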
csv_detective/explore_csv.py CHANGED
@@ -1,85 +1,20 @@
-from collections import defaultdict
 import json
 import logging
 import os
 import tempfile
 from time import time
-from typing import Union
+from typing import Optional, Union
 
-import numpy as np
 import pandas as pd
 
-# flake8: noqa
-from csv_detective import detect_fields, detect_labels
-from .detection.variables import (
-    detect_categorical_variable,
-    # detect_continuous_variable,
-)
-from .output.dataframe import cast_df
-from .output.profile import create_profile
-from .output.schema import generate_table_schema
-from .output.utils import prepare_output_dict
+from .detection.formats import detect_formats
+from .output import generate_output, generate_table_schema
 from .parsing.load import load_file
-from .parsing.columns import test_col, test_label
 from .s3_utils import download_from_minio, upload_to_minio
 from .utils import display_logs_depending_process_time, is_url
+from .validate import validate
 
-
-def get_all_packages(detect_type) -> list:
-    root_dir = os.path.dirname(os.path.abspath(__file__)) + "/" + detect_type
-    modules = []
-    for dirpath, _, filenames in os.walk(root_dir):
-        for filename in filenames:
-            file = os.path.join(dirpath, filename).replace(root_dir, "")
-            if file.endswith("__init__.py"):
-                module = (
-                    file.replace("__init__.py", "")
-                    .replace("/", ".").replace("\\", ".")[:-1]
-                )
-                if module:
-                    modules.append(detect_type + module)
-    return modules
-
-
-def return_all_tests(
-    user_input_tests: Union[str, list],
-    detect_type: str,
-) -> list:
-    """
-    returns all tests that have a method _is and are listed in the user_input_tests
-    the function can select a sub_package from csv_detective
-    user_input_tests may look like this:
-    - "ALL": all possible tests are made
-    - "FR.other.siren" (or any other path-like string to one of the tests, or a group of tests, like "FR.geo"):
-    this specifc (group of) test(s) only
-    - ["FR.temp.mois_de_annee", "geo", ...]: only the specified tests will be made ; you may also skip
-    specific (groups of) tests by add "-" at the start (e.g "-temp.date")
-    """
-    assert detect_type in ["detect_fields", "detect_labels"]
-    all_packages = get_all_packages(detect_type=detect_type)
-
-    if isinstance(user_input_tests, str):
-        user_input_tests = [user_input_tests]
-    if "ALL" in user_input_tests or all(x[0] == "-" for x in user_input_tests):
-        tests_to_do = [detect_type]
-    else:
-        tests_to_do = [
-            f"{detect_type}.{x}" for x in user_input_tests if x[0] != "-"
-        ]
-    tests_skipped = [
-        f"{detect_type}.{x[1:]}" for x in user_input_tests if x[0] == "-"
-    ]
-    all_tests = [
-        # this is why we need to import detect_fields/labels
-        eval(x) for x in all_packages
-        if any([y == x[: len(y)] for y in tests_to_do])
-        and all([y != x[: len(y)] for y in tests_skipped])
-    ]
-    # to remove groups of tests
-    all_tests = [
-        test for test in all_tests if "_is" in dir(test)
-    ]
-    return all_tests
+logging.basicConfig(level=logging.INFO)
 
 
 def routine(
@@ -88,18 +23,18 @@ def routine(
     user_input_tests: Union[str, list[str]] = "ALL",
     limited_output: bool = True,
     save_results: Union[bool, str] = True,
-    encoding: str = None,
-    sep: str = None,
+    encoding: Optional[str] = None,
+    sep: Optional[str] = None,
     skipna: bool = True,
     output_profile: bool = False,
     output_schema: bool = False,
     output_df: bool = False,
     cast_json: bool = True,
     verbose: bool = False,
-    sheet_name: Union[str, int] = None,
+    sheet_name: Optional[Union[str, int]] = None,
 ) -> Union[dict, tuple[dict, pd.DataFrame]]:
     """Returns a dict with information about the csv table and possible
-    column contents.
+    column contents, and if requested the DataFrame with columns cast according to analysis.
 
     Args:
         file_path: local path to CSV file if not using Minio
@@ -112,14 +47,14 @@
         output_schema: whether or not to add the 'schema' field to the output (tableschema)
         output_df: whether or not to return the loaded DataFrame along with the analysis report
        cast_json: whether or not to cast json columns into objects (otherwise they are returned as strings)
-        verbose: whether or not to print process logs in console
+        verbose: whether or not to print process logs in console
        sheet_name: if reading multi-sheet file (xls-like), which sheet to consider
        skipna: whether to keep NaN (empty cells) for tests
 
     Returns:
        dict: a dict with information about the csv and possible types for each column
    """
-
+
    if not (isinstance(save_results, bool) or (isinstance(save_results, str) and save_results.endswith(".json"))):
        raise ValueError("`save_results` must be a bool or a valid path to a json file.")
 
@@ -137,168 +72,105 @@
         sheet_name=sheet_name,
     )
 
-    if table.empty:
-        res_categorical = []
-        # res_continuous = []
-    else:
-        # Detects columns that are categorical
-        res_categorical, categorical_mask = detect_categorical_variable(table, verbose=verbose)
-        res_categorical = list(res_categorical)
-        # Detect columns that are continuous (we already know the categorical) : we don't need this for now, cuts processing time
-        # res_continuous = list(
-        #     detect_continuous_variable(table.iloc[:, ~categorical_mask.values], verbose=verbose)
-        # )
-
-    analysis.update({
-        "categorical": res_categorical,
-        # "continuous": res_continuous,
-    })
-
-    # list testing to be performed
-    all_tests_fields = return_all_tests(
-        user_input_tests, detect_type="detect_fields"
-    ) # list all tests for the fields
-    all_tests_labels = return_all_tests(
-        user_input_tests, detect_type="detect_labels"
-    ) # list all tests for the labels
-
-    # if no testing then return
-    if not all_tests_fields and not all_tests_labels:
-        return analysis
-
-    # Perform testing on fields
-    scores_table_fields = test_col(table, all_tests_fields, limited_output, skipna=skipna, verbose=verbose)
-    analysis["columns_fields"] = prepare_output_dict(scores_table_fields, limited_output)
-
-    # Perform testing on labels
-    scores_table_labels = test_label(table, all_tests_labels, limited_output, verbose=verbose)
-    analysis["columns_labels"] = prepare_output_dict(scores_table_labels, limited_output)
-
-    # Multiply the results of the fields by 1 + 0.5 * the results of the labels.
-    # This is because the fields are more important than the labels and yields a max
-    # of 1.5 for the final score.
-    scores_table = scores_table_fields * (
-        1
-        + scores_table_labels.reindex(
-            index=scores_table_fields.index, fill_value=0
-        ).values / 2
-    )
-
-    # To reduce false positives: ensure these formats are detected only if the label yields
-    # a detection (skipping the ones that have been excluded by the users).
-    formats_with_mandatory_label = [
-        f for f in [
-            "code_departement",
-            "code_commune_insee",
-            "code_postal",
-            "latitude_wgs",
-            "longitude_wgs",
-            "latitude_wgs_fr_metropole",
-            "longitude_wgs_fr_metropole",
-            "latitude_l93",
-            "longitude_l93",
-        ] if f in scores_table.index
-    ]
-    scores_table.loc[formats_with_mandatory_label, :] = np.where(
-        scores_table_labels.loc[formats_with_mandatory_label, :],
-        scores_table.loc[formats_with_mandatory_label, :],
-        0,
+    analysis = detect_formats(
+        table=table,
+        analysis=analysis,
+        user_input_tests=user_input_tests,
+        limited_output=limited_output,
+        skipna=skipna,
+        verbose=verbose,
     )
-    analysis["columns"] = prepare_output_dict(scores_table, limited_output)
 
-    metier_to_python_type = {
-        "booleen": "bool",
-        "int": "int",
-        "float": "float",
-        "string": "string",
-        "json": "json",
-        "json_geojson": "json",
-        "datetime": "datetime",
-        "datetime_iso": "datetime",
-        "datetime_rfc822": "datetime",
-        "date": "date",
-        "latitude": "float",
-        "latitude_l93": "float",
-        "latitude_wgs": "float",
-        "latitude_wgs_fr_metropole": "float",
-        "longitude": "float",
-        "longitude_l93": "float",
-        "longitude_wgs": "float",
-        "longitude_wgs_fr_metropole": "float",
-    }
-
-    if not limited_output:
-        for detection_method in ["columns_fields", "columns_labels", "columns"]:
-            analysis[detection_method] = {
-                col_name: [
-                    {
-                        "python_type": metier_to_python_type.get(
-                            detection["format"], "string"
-                        ),
-                        **detection,
-                    }
-                    for detection in detections
-                ]
-                for col_name, detections in analysis[detection_method].items()
-            }
-    else:
-        for detection_method in ["columns_fields", "columns_labels", "columns"]:
-            analysis[detection_method] = {
-                col_name: {
-                    "python_type": metier_to_python_type.get(
-                        detection["format"], "string"
-                    ),
-                    **detection,
-                }
-                for col_name, detection in analysis[detection_method].items()
-            }
-
-    # Add detection with formats as keys
-    analysis["formats"] = defaultdict(list)
-    for header, col_metadata in analysis["columns"].items():
-        analysis["formats"][col_metadata["format"]].append(header)
-
-    if output_profile:
-        analysis["profile"] = create_profile(
+    try:
+        return generate_output(
             table=table,
-            dict_cols_fields=analysis["columns"],
+            analysis=analysis,
+            file_path=file_path,
            num_rows=num_rows,
            limited_output=limited_output,
+            save_results=save_results,
+            output_profile=output_profile,
+            output_schema=output_schema,
+            output_df=output_df,
+            cast_json=cast_json,
            verbose=verbose,
+            sheet_name=sheet_name,
        )
+    finally:
+        if verbose:
+            display_logs_depending_process_time(
+                f"Routine completed in {round(time() - start_routine, 3)}s",
+                time() - start_routine
+            )
 
-    if save_results:
-        if isinstance(save_results, str):
-            output_path = save_results
-        else:
-            output_path = os.path.splitext(file_path)[0]
-            if is_url(output_path):
-                output_path = output_path.split('/')[-1]
-            if analysis.get("sheet_name"):
-                output_path += "_sheet-" + str(sheet_name)
-            output_path += ".json"
-        with open(output_path, "w", encoding="utf8") as fp:
-            json.dump(analysis, fp, indent=4, separators=(",", ": "), ensure_ascii=False)
 
-    if output_schema:
-        analysis["schema"] = generate_table_schema(
-            analysis,
-            save_file=False,
-            verbose=verbose
-        )
+def validate_then_detect(
+    file_path: str,
+    previous_analysis: dict,
+    num_rows: int = 500,
+    user_input_tests: Union[str, list[str]] = "ALL",
+    limited_output: bool = True,
+    save_results: Union[bool, str] = True,
+    encoding: str = None,
+    sep: str = None,
+    skipna: bool = True,
+    output_profile: bool = False,
+    output_schema: bool = False,
+    output_df: bool = False,
+    cast_json: bool = True,
+    verbose: bool = False,
+    sheet_name: Union[str, int] = None,
+):
+
    if verbose:
-        display_logs_depending_process_time(
-            f'Routine completed in {round(time() - start_routine, 3)}s',
-            time() - start_routine
+        start_routine = time()
+    if is_url(file_path):
+        logging.info("Path recognized as a URL")
+
+    is_valid, table, analysis = validate(
+        file_path=file_path,
+        previous_analysis=previous_analysis,
+        num_rows=num_rows,
+        encoding=encoding,
+        sep=sep,
+        verbose=verbose,
+        skipna=skipna,
+        sheet_name=sheet_name,
+    )
+    if is_valid:
+        # skipping formats detection as the validation is successful
+        analysis = previous_analysis
+        del analysis["profile"]
+    else:
+        analysis = detect_formats(
+            table=table,
+            analysis=analysis,
+            user_input_tests=user_input_tests,
+            limited_output=limited_output,
+            skipna=skipna,
+            verbose=verbose,
        )
-    if output_df:
-        return analysis, cast_df(
-            df=table,
-            columns=analysis["columns"],
+    try:
+        return generate_output(
+            table=table,
+            analysis=analysis,
+            file_path=file_path,
+            num_rows=num_rows,
+            limited_output=limited_output,
+            save_results=save_results,
+            output_profile=output_profile,
+            output_schema=output_schema,
+            output_df=output_df,
            cast_json=cast_json,
            verbose=verbose,
+            sheet_name=sheet_name,
        )
-    return analysis
+    finally:
+        if verbose:
+            display_logs_depending_process_time(
+                f"Process completed in {round(time() - start_routine, 3)}s",
+                time() - start_routine
+            )
 
 
 def routine_minio(
@@ -307,10 +179,7 @@ def routine_minio(
     tableschema_minio_location: dict[str, str],
     minio_user: str,
     minio_pwd: str,
-    num_rows: int = 500,
-    user_input_tests: Union[str, list[str]] = "ALL",
-    encoding: str = None,
-    sep: str = None,
+    **kwargs,
 ):
     """Returns a dict with information about the csv table and possible
     column contents.
@@ -323,11 +192,7 @@
             None if not uploading the tableschema to Minio.
         minio_user: user name for the minio instance
         minio_pwd: password for the minio instance
-        num_rows: number of rows to sample from the file for analysis ; -1 for analysis of
-            the whole file
-        user_input_tests: tests to run on the file
-        output_mode: LIMITED or ALL, whether or not to return all possible types or only
-            the most likely one for each column
+        kwargs: arguments for routine
 
     Returns:
        dict: a dict with information about the csv and possible types for each column
@@ -378,12 +243,8 @@
 
     analysis = routine(
         file_path,
-        num_rows,
-        user_input_tests,
-        output_mode="LIMITED",
        save_results=True,
-        encoding=encoding,
-        sep=sep,
+        **kwargs,
     )
 
     # Write report JSON file.
@@ -404,8 +265,8 @@
     os.remove(file_path)
 
     generate_table_schema(
-        analysis,
-        True,
+        analysis_report=analysis,
+        save_file=True,
        netloc=tableschema_minio_location["netloc"],
        bucket=tableschema_minio_location["bucket"],
        key=tableschema_minio_location["key"],
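A hedged usage sketch of the new `validate_then_detect` entry point, reusing a report produced by an earlier `routine` run (file names are placeholders):

```python
import json

from csv_detective import validate_then_detect

# Placeholder paths: a previously analysed file and its saved analysis report.
with open("my_file.json") as f:
    previous_analysis = json.load(f)

analysis, df = validate_then_detect(
    file_path="my_file.csv",
    previous_analysis=previous_analysis,
    output_df=True,       # generate_output then returns (analysis, cast DataFrame)
    save_results=False,
)
```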
csv_detective/load_tests.py ADDED
@@ -0,0 +1,62 @@
+import os
+from typing import Union
+
+# flake8: noqa
+from csv_detective import detect_fields, detect_labels
+
+
+def get_all_packages(detect_type) -> list:
+    root_dir = os.path.dirname(os.path.abspath(__file__)) + "/" + detect_type
+    modules = []
+    for dirpath, _, filenames in os.walk(root_dir):
+        for filename in filenames:
+            file = os.path.join(dirpath, filename).replace(root_dir, "")
+            if file.endswith("__init__.py"):
+                module = (
+                    file.replace("__init__.py", "")
+                    .replace("/", ".").replace("\\", ".")[:-1]
+                )
+                if module:
+                    modules.append(detect_type + module)
+    return modules
+
+
+def return_all_tests(
+    user_input_tests: Union[str, list],
+    detect_type: str,
+) -> list:
+    """
+    returns all tests that have a method _is and are listed in the user_input_tests
+    the function can select a sub_package from csv_detective
+    user_input_tests may look like this:
+    - "ALL": all possible tests are made
+    - "FR.other.siren" (or any other path-like string to one of the tests, or a group of tests, like "FR.geo"):
+    this specifc (group of) test(s) only
+    - ["FR.temp.mois_de_annee", "geo", ...]: only the specified tests will be made ; you may also skip
+    specific (groups of) tests by add "-" at the start (e.g "-temp.date")
+    """
+    assert detect_type in ["detect_fields", "detect_labels"]
+    all_packages = get_all_packages(detect_type=detect_type)
+
+    if isinstance(user_input_tests, str):
+        user_input_tests = [user_input_tests]
+    if "ALL" in user_input_tests or all(x[0] == "-" for x in user_input_tests):
+        tests_to_do = [detect_type]
+    else:
+        tests_to_do = [
+            f"{detect_type}.{x}" for x in user_input_tests if x[0] != "-"
+        ]
+    tests_skipped = [
+        f"{detect_type}.{x[1:]}" for x in user_input_tests if x[0] == "-"
+    ]
+    all_tests = [
+        # this is why we need to import detect_fields/labels
+        eval(x) for x in all_packages
+        if any([y == x[: len(y)] for y in tests_to_do])
+        and all([y != x[: len(y)] for y in tests_skipped])
+    ]
+    # to remove groups of tests
+    all_tests = [
+        test for test in all_tests if "_is" in dir(test)
+    ]
+    return all_tests
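The selector syntax documented in the docstring above can be exercised directly; a small sketch:

```python
from csv_detective.load_tests import return_all_tests

# All field tests, then "all but" the temporal date tests (a leading "-" skips a group).
all_fields = return_all_tests("ALL", detect_type="detect_fields")
without_dates = return_all_tests(["-temp.date"], detect_type="detect_fields")
print(len(all_fields), len(without_dates))
```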
csv_detective/output/__init__.py ADDED
@@ -0,0 +1,64 @@
+import json
+import os
+from typing import Union
+
+import pandas as pd
+
+from csv_detective.utils import is_url
+from .dataframe import cast_df
+from .profile import create_profile
+from .schema import generate_table_schema
+
+
+def generate_output(
+    table: pd.DataFrame,
+    analysis: dict,
+    file_path: str,
+    num_rows: int = 500,
+    limited_output: bool = True,
+    save_results: Union[bool, str] = True,
+    output_profile: bool = False,
+    output_schema: bool = False,
+    output_df: bool = False,
+    cast_json: bool = True,
+    verbose: bool = False,
+    sheet_name: Union[str, int] = None,
+) -> Union[dict, tuple[dict, pd.DataFrame]]:
+
+    if output_profile:
+        analysis["profile"] = create_profile(
+            table=table,
+            dict_cols_fields=analysis["columns"],
+            num_rows=num_rows,
+            limited_output=limited_output,
+            verbose=verbose,
+        )
+
+    if save_results:
+        if isinstance(save_results, str):
+            output_path = save_results
+        else:
+            output_path = os.path.splitext(file_path)[0]
+            if is_url(output_path):
+                output_path = output_path.split('/')[-1]
+            if analysis.get("sheet_name"):
+                output_path += "_sheet-" + str(sheet_name)
+            output_path += ".json"
+        with open(output_path, "w", encoding="utf8") as fp:
+            json.dump(analysis, fp, indent=4, separators=(",", ": "), ensure_ascii=False)
+
+    if output_schema:
+        analysis["schema"] = generate_table_schema(
+            analysis,
+            save_file=False,
+            verbose=verbose
+        )
+
+    if output_df:
+        return analysis, cast_df(
+            df=table,
+            columns=analysis["columns"],
+            cast_json=cast_json,
+            verbose=verbose,
+        )
+    return analysis
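The report path rule in `generate_output` can be summarised as: an explicit string in `save_results` wins, otherwise the path derives from `file_path` (extension stripped, URL basename kept, optional sheet suffix, ".json" appended). A simplified standalone sketch of that rule, with a hypothetical helper name and inputs:

```python
import os

def report_path(file_path: str, sheet_name=None, save_results=True) -> str:
    # Simplified: the real code also reduces URLs to their basename and reads
    # the sheet name back from the analysis dict rather than from an argument.
    if isinstance(save_results, str):
        return save_results
    path = os.path.splitext(file_path)[0]
    if sheet_name:
        path += "_sheet-" + str(sheet_name)
    return path + ".json"

print(report_path("my_file.xlsx", sheet_name="Sheet1"))  # my_file_sheet-Sheet1.json
```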
csv_detective/output/dataframe.py: File without changes
csv_detective/output/example.py: File without changes
csv_detective/output/profile.py: File without changes
csv_detective/output/schema.py: File without changes
csv_detective/output/utils.py: File without changes
csv_detective/utils.py CHANGED
@@ -2,6 +2,8 @@ import logging
 import math
 from typing import Optional
 
+logging.basicConfig(level=logging.INFO)
+
 
 def display_logs_depending_process_time(prompt: str, duration: float):
     '''
csv_detective/validate.py ADDED
@@ -0,0 +1,70 @@
+import logging
+from typing import Union
+
+import pandas as pd
+
+from csv_detective.load_tests import return_all_tests
+from .parsing.load import load_file
+
+logging.basicConfig(level=logging.INFO)
+
+tests = {
+    t.__name__.split(".")[-1]: t._is
+    for t in return_all_tests("ALL", "detect_fields")
+}
+
+
+def validate(
+    file_path: str,
+    previous_analysis: dict,
+    num_rows: int = 500,
+    encoding: str = None,
+    sep: str = None,
+    verbose: bool = False,
+    skipna: bool = True,
+    sheet_name: Union[str, int] = None,
+) -> tuple[bool, pd.DataFrame, dict]:
+    """
+    Verify is the given file has the same fields and types as in the previous analysis.
+    """
+    table, analysis = load_file(
+        file_path=file_path,
+        num_rows=num_rows,
+        encoding=encoding,
+        sep=sep,
+        verbose=verbose,
+        sheet_name=sheet_name,
+    )
+    if verbose:
+        logging.info("Comparing table with the previous analysis")
+        logging.info("- Checking if all columns match")
+    if (
+        any(col_name not in list(table.columns) for col_name in previous_analysis["columns"])
+        or any(col_name not in list(previous_analysis["columns"].keys()) for col_name in table.columns)
+    ):
+        logging.warning("> Columns do not match, proceeding with full analysis")
+        return False, table, analysis
+    for col_name, args in previous_analysis["columns"].items():
+        if verbose:
+            logging.info(f"- Testing {col_name} for {args['format']}")
+        if args["format"] == "string":
+            # no test for columns that have not been recognized as a specific format
+            continue
+        test_func = tests[args["format"]]
+        col_data = table[col_name]
+        if skipna:
+            col_data = col_data.loc[~col_data.isna()]
+        if not col_data.apply(test_func).all():
+            logging.warning("> Test failed, proceeding with full analysis")
+            return False, table, analysis
+    if verbose:
+        logging.info("> All checks successful")
+    return True, table, analysis | {
+        k: previous_analysis[k] for k in [
+            "categorical",
+            "columns",
+            "columns_fields",
+            "columns_labels",
+            "formats",
+        ]
+    }
{csv_detective-0.7.5.dev1244.data → csv_detective-0.7.5.dev1286.data}/data/share/csv_detective/CHANGELOG.md CHANGED
@@ -7,11 +7,13 @@
 - Better naming, hint types and minor refactors [#103](https://github.com/datagouv/csv-detective/pull/103)
 - The returned dataframe has its columns properly cast to the detected types [#104](https://github.com/datagouv/csv-detective/pull/104)
 - Raise an error if the encoding could not be guessed [#106](https://github.com/datagouv/csv-detective/pull/106)
+- Fix CLI and minio routine [#107](https://github.com/datagouv/csv-detective/pull/107)
 - Allow to only specify tests to skip ("all but...") [#108](https://github.com/datagouv/csv-detective/pull/108)
 - Fix bool casting [#109](https://github.com/datagouv/csv-detective/pull/109)
 - Handle csv.gz files [#110](https://github.com/datagouv/csv-detective/pull/110)
 - Refactor file tests [#110](https://github.com/datagouv/csv-detective/pull/110)
 - Restructure repo (breaking changes) [#111](https://github.com/datagouv/csv-detective/pull/111)
+- Add validation function and associated flow [#112](https://github.com/datagouv/csv-detective/pull/112)
 - Better float detection [#113](https://github.com/datagouv/csv-detective/pull/113)
 
 ## 0.7.4 (2024-11-15)
{csv_detective-0.7.5.dev1244.dist-info → csv_detective-0.7.5.dev1286.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: csv_detective
-Version: 0.7.5.dev1244
+Version: 0.7.5.dev1286
 Summary: Detect CSV column content
 Home-page: https://github.com/etalab/csv_detective
 Author: Etalab
{csv_detective-0.7.5.dev1244.dist-info → csv_detective-0.7.5.dev1286.dist-info}/RECORD CHANGED
@@ -1,8 +1,10 @@
-csv_detective/__init__.py,sha256=GCHgu0BhH5ACV7cf-1gDr9nRyvSoeQ1vRw9SjEHeMT4,143
-csv_detective/cli.py,sha256=Ua7SE1wMH2uFUsTmfumh4nJk7O06okpMd2gvjUDO1II,1048
-csv_detective/explore_csv.py,sha256=aJ2pG7lK4sgY9Pv31zEzFVGByxkfw4wwgrQqfgUtBOo,14903
+csv_detective/__init__.py,sha256=vpK7WMkIQbcJzu6HKOwcn7PpHsNCCaXZ1YLMS5Wq9tM,165
+csv_detective/cli.py,sha256=itooHtpyfC6DUsL_DchPKe1xo7m0MYJIp1L4R8eqoTk,1401
+csv_detective/explore_csv.py,sha256=ocWlUEtuwZ-6bjDc6gfhC2-6DljMVhvXhHrfICCXGfQ,8986
+csv_detective/load_tests.py,sha256=GILvfkd4OVI-72mA4nzbPlZqgcXZ4wznOhGfZ1ucWkM,2385
 csv_detective/s3_utils.py,sha256=1cIVdQUYY2ovErbMwp72Gqtqx2bkB8nfVhn-QaOFTT0,1451
-csv_detective/utils.py,sha256=KAYfSJXnPuAXnSc38Jm57oQ_JP_0kUkmI1OV6gN5_ys,1116
+csv_detective/utils.py,sha256=Bx_1k4Sdpd5PCjuAy4AeayCmmw7TMR_zgtKIHNLi5g0,1157
+csv_detective/validate.py,sha256=o4Qulf8E-x1zsWT9OD4Fpw83Gku1WA3JlX83j7bu0DA,2314
 csv_detective/detect_fields/__init__.py,sha256=NVfE3BQVExgXb-BPbhDvlkM5-0naEVLpZ4aM_OGHYfE,931
 csv_detective/detect_fields/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detect_fields/FR/geo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -53,7 +55,7 @@ csv_detective/detect_fields/geo/longitude_wgs/__init__.py,sha256=G7afWOKiGh_Tv7g
 csv_detective/detect_fields/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 csv_detective/detect_fields/other/booleen/__init__.py,sha256=wn_yyTAmGxqo0l0b7JRpGb0da_E27iGxES9zWCrnsqc,497
 csv_detective/detect_fields/other/email/__init__.py,sha256=O9tgJmq0O8Q-8iin63NqEEDhlsUJjxFZNaNFM4GZaws,178
-csv_detective/detect_fields/other/float/__init__.py,sha256=7bXuPAmBuIhKJEhq7d20B60WVol1AUpqRkWhreQpWfU,578
+csv_detective/detect_fields/other/float/__init__.py,sha256=AT4Kpgwoz5PuAoLx00u0SL8DjjXZxsE8zSRbN18uAv4,578
 csv_detective/detect_fields/other/int/__init__.py,sha256=QN3kQJLYqLRBiubUK7g4Xq03PlA5wqVwx2pPPIO9FdI,320
 csv_detective/detect_fields/other/json/__init__.py,sha256=DhzyvT12kOqgum89silIu3uoSYXmC_s_AaxLtXAD4eU,540
 csv_detective/detect_fields/other/mongo_object_id/__init__.py,sha256=7fcrHsOZAqXp2_N0IjPskYJ_qi4xRlo9iyNNDQVLzsU,156
@@ -126,10 +128,12 @@ csv_detective/detect_labels/temp/year/__init__.py,sha256=3U9j8Hux432KdGtIyArq_-v
 csv_detective/detection/columns.py,sha256=vfE-DKESA6J9Rfsl-a8tjgZfE21VmzArO5TrbzL0KmE,2905
 csv_detective/detection/encoding.py,sha256=tpjJEMNM_2TcLXDzn1lNQPnSRnsWYjs83tQ8jNwTj4E,973
 csv_detective/detection/engine.py,sha256=HiIrU-l9EO5Fbc2Vh8W_Uy5-dpKcQQzlxCqMuWc09LY,1530
+csv_detective/detection/formats.py,sha256=VwFazRAFJN6eaYUK7IauVU88vuUBHccESY4UD8EgGUo,5386
 csv_detective/detection/headers.py,sha256=wrVII2RQpsVmHhrO1DHf3dmiu8kbtOjBlskf41cnQmc,1172
 csv_detective/detection/rows.py,sha256=3qvsbsBcMxiqqfSYYkOgsRpX777rk22tnRHDwUA97kU,742
 csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
 csv_detective/detection/variables.py,sha256=3qEMtjZ_zyIFXvTnFgK7ZMDx8C12uQXKfFjEj2moyJc,3558
+csv_detective/output/__init__.py,sha256=XDS4Dgvv6oloIao9JquHa0m1nnlQ_q2gHuEPGlaETic,1890
 csv_detective/output/dataframe.py,sha256=89iQRE59cHQyQQEsujQVIKP2YAUYpPklWkdDOqZE-wE,2183
 csv_detective/output/example.py,sha256=i8PkdXxidF7qR_9aK8vh12JpZdJQryhBgyrMS8iy5rk,8642
 csv_detective/output/profile.py,sha256=B8YU541T_YPDezJGh4dkHckOShiwHSrZd9GS8jbmz7A,2919
@@ -141,18 +145,19 @@ csv_detective/parsing/csv.py,sha256=11mibDnJhIjykXLGZvA5ZEU5U7KgxIrbyO6BNv6jlro,
 csv_detective/parsing/excel.py,sha256=AslE2S1e67o8yTIAIhp-lAnJ6-XqeBBRz1-VMFqhZBM,7055
 csv_detective/parsing/load.py,sha256=SpP0pfxswOAPPpwbZfoP1blh0EKV5VMs0TpTgQJKzjs,3621
 csv_detective/parsing/text.py,sha256=rsfk66BCmdpsCOd0kDJ8tmqMsEWd-OeBkEisWc4Ej9k,1246
-csv_detective-0.7.5.dev1244.data/data/share/csv_detective/CHANGELOG.md,sha256=OhgKjKnSxoFPvzW0BUdnOijCDlaCazFPCH76ZWhlijo,7709
-csv_detective-0.7.5.dev1244.data/data/share/csv_detective/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
-csv_detective-0.7.5.dev1244.data/data/share/csv_detective/README.md,sha256=Qr8xRXc-dxQ-tdXCpCTCKp1Uliqq84r0UOlPRNuGCpI,9506
-csv_detective-0.7.5.dev1244.dist-info/licenses/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
+csv_detective-0.7.5.dev1286.data/data/share/csv_detective/CHANGELOG.md,sha256=Gqw7W41bXK_JgIYi80vdOPR6JLY5rgABeNsiDStE4XA,7901
+csv_detective-0.7.5.dev1286.data/data/share/csv_detective/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
+csv_detective-0.7.5.dev1286.data/data/share/csv_detective/README.md,sha256=Qr8xRXc-dxQ-tdXCpCTCKp1Uliqq84r0UOlPRNuGCpI,9506
+csv_detective-0.7.5.dev1286.dist-info/licenses/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_example.py,sha256=0NfChooJQlFxTo2nY5FOQIcsK4zzWA_SBmt2LwVQovY,2014
-tests/test_fields.py,sha256=LPLx09cX5u9XHAh65XvTgIqzKylToiHZxXzKhpV0wsk,11148
-tests/test_file.py,sha256=EleTssys5fCP4N0W1eTZN35uijzoF15e3dIcuIlrMsk,7865
+tests/test_fields.py,sha256=53kiUQiqGt4_fnyCoxhNLeEsuN1LRDB-7HGT3p_Ed9I,11147
+tests/test_file.py,sha256=9APE1d43lQ8Dk8lwJFNUK_YekYYsQ0ae2_fgpcPE9mk,8116
 tests/test_labels.py,sha256=6MOKrGznkwU5fjZ_3oiB6Scmb480Eu-9geBJs0UDLds,159
-tests/test_structure.py,sha256=SVsnluVoIIprYw_67I1_gB3cp9m1wlO8C7SpdsLW8cM,1161
-csv_detective-0.7.5.dev1244.dist-info/METADATA,sha256=h8jO7r1lWud7Afh7T0eye7GbdJmukr_a9ZHPl_HeXbs,1386
-csv_detective-0.7.5.dev1244.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
-csv_detective-0.7.5.dev1244.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
-csv_detective-0.7.5.dev1244.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
-csv_detective-0.7.5.dev1244.dist-info/RECORD,,
+tests/test_structure.py,sha256=bv-tjgXohvQAxwmxzH0BynFpK2TyPjcxvtIAmIRlZmA,1393
+tests/test_validation.py,sha256=VwtBcnGAQ_eSFrBibWnMSTDjuy6y2JLlqvc3Zb667NY,479
+csv_detective-0.7.5.dev1286.dist-info/METADATA,sha256=rLptgL-FkLZzfkxPt7_0I-k7EKPKbEHhd3Ei2qt54KI,1386
+csv_detective-0.7.5.dev1286.dist-info/WHEEL,sha256=pxyMxgL8-pra_rKaQ4drOZAegBVuX-G_4nRHjjgWbmo,91
+csv_detective-0.7.5.dev1286.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+csv_detective-0.7.5.dev1286.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
+csv_detective-0.7.5.dev1286.dist-info/RECORD,,
{csv_detective-0.7.5.dev1244.dist-info → csv_detective-0.7.5.dev1286.dist-info}/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (78.1.0)
+Generator: setuptools (79.0.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
tests/test_fields.py CHANGED
@@ -48,7 +48,7 @@ from csv_detective.detection.variables import (
     detect_continuous_variable,
     detect_categorical_variable,
 )
-from csv_detective.explore_csv import return_all_tests
+from csv_detective.load_tests import return_all_tests
 from csv_detective.output.dataframe import cast
 
 
tests/test_file.py CHANGED
@@ -28,7 +28,7 @@ def test_columns_output_on_file():
         "STRUCTURED_INFO",
         "GEO_INFO",
     ]
-    assert output["total_lines"] == 414
+    assert output["total_lines"] == 404
     assert output["nb_duplicates"] == 7
     assert output["columns"]["NOMCOM"]["format"] == "commune"
     assert output["columns"]["NOMDEP"]["format"] == "departement"
@@ -48,7 +48,7 @@ def test_profile_output_on_file():
     )
     assert all(
         [
-            c in list(output["profile"]["NUMCOM"].keys())
+            c in list(output["profile"]["TXCOUVGLO_COM_2014"].keys())
             for c in [
                 "min",
                 "max",
@@ -60,12 +60,22 @@
             ]
         ]
     )
-    assert len(output["profile"]["NOMCOM"].keys()) == 3
-    assert output["profile"]["NUMCOM"]["min"] == 1001
-    assert output["profile"]["NUMCOM"]["max"] == 6125
-    assert round(output["profile"]["NUMCOM"]["mean"]) == 1245
-    assert round(output["profile"]["NUMCOM"]["std"]) == 363
-    assert output["profile"]["TXCOUVGLO_COM_2014"]["nb_distinct"] == 296
+    assert not any(
+        [
+            c in list(output["profile"]["NUMCOM"].keys())
+            for c in [
+                "min",
+                "max",
+                "mean",
+                "std",
+            ]
+        ]
+    )
+    assert output["profile"]["TXCOUVGLO_COM_2014"]["min"] == 0.0
+    assert output["profile"]["TXCOUVGLO_COM_2014"]["max"] == 200.2
+    assert round(output["profile"]["TXCOUVGLO_COM_2014"]["mean"]) == 60
+    assert round(output["profile"]["TXCOUVGLO_COM_2014"]["std"]) == 36
+    assert output["profile"]["TXCOUVGLO_COM_2014"]["nb_distinct"] == 290
     assert output["profile"]["TXCOUVGLO_COM_2014"]["nb_missing_values"] == 3
     assert output["profile"]["GEO_INFO"]["nb_distinct"] == 1
 
@@ -175,7 +185,7 @@ def mocked_responses():
     "params",
     # ideally we'd like to do the same with params_others but pandas.read_excel uses urllib
     # which doesn't support the way we mock the response, TBC
-    params_csv + [("a_test_file.csv", {"separator": ";", "header_row_idx": 2, "total_lines": 414})]
+    params_csv + [("a_test_file.csv", {"separator": ";", "header_row_idx": 2, "total_lines": 404})]
 )
 def test_urls(mocked_responses, params):
     file_name, checks = params
tests/test_structure.py CHANGED
@@ -1,6 +1,7 @@
 import os
 # flake8: noqa
 from csv_detective import detect_fields, detect_labels
+from csv_detective.load_tests import return_all_tests
 
 
 def tests_conformity():
@@ -29,3 +30,8 @@ def tests_conformity():
             .replace("/", ".")
         )
         assert "_is" in dir(_package)
+
+
+def test_all_tests_have_unique_name():
+    names = [t.__name__.split(".")[-1] for t in return_all_tests("ALL", "detect_fields")]
+    assert len(names) == len(set(names))
tests/test_validation.py ADDED
@@ -0,0 +1,18 @@
+import json
+
+import pandas as pd
+
+from csv_detective.validate import validate
+
+
+def test_validation():
+    with open("tests/data/a_test_file.json", "r") as f:
+        previous_analysis = json.load(f)
+    is_valid, table, analysis = validate(
+        "tests/data/a_test_file.csv",
+        previous_analysis=previous_analysis,
+        num_rows=-1,
+    )
+    assert is_valid is True
+    assert isinstance(table, pd.DataFrame)
+    assert isinstance(analysis, dict)