csv-detective 0.9.3.dev2215__py3-none-any.whl → 0.9.3.dev2241__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
csv_detective/parsing/columns.py CHANGED
@@ -3,10 +3,13 @@ from time import time
 from typing import Callable
 
 import pandas as pd
+from more_itertools import peekable
 
+from csv_detective.parsing.csv import CHUNK_SIZE
 from csv_detective.utils import display_logs_depending_process_time
 
-MAX_ROWS_ANALYSIS = int(1e4)
+# above this threshold, a column is not considered categorical
+MAX_NUMBER_CATEGORICAL_VALUES = 25
 
 
 def test_col_val(
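A quick note on the new constant before the behavioural changes below: `MAX_NUMBER_CATEGORICAL_VALUES` is applied further down (in `test_col_chunks` and `validate`) to flag columns as categorical. A minimal sketch with made-up value counts:

```python
# minimal sketch; the column names and counts are made up
MAX_NUMBER_CATEGORICAL_VALUES = 25

distinct_values_per_column = {"status": 3, "user_id": 48212}
categorical = [
    col
    for col, n in distinct_values_per_column.items()
    if n <= MAX_NUMBER_CATEGORICAL_VALUES
]
print(categorical)  # ['status']
```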
@@ -34,28 +37,24 @@ def test_col_val(
         serie = serie[serie.notnull()]
         ser_len = len(serie)
         if ser_len == 0:
-            return 0.0
+            # being here means the whole column is NaN, so if skipna it's a pass
+            return 1.0 if skipna else 0.0
         if not limited_output:
             result = apply_test_func(serie, test_func, ser_len).sum() / ser_len
             return result if result >= proportion else 0.0
         else:
-            if proportion == 1:  # Then try first 1 value, then 5, then all
+            if proportion == 1:
+                # early stops (1 then 5 rows) to not waste time if directly unsuccessful
                 for _range in [
                     min(1, ser_len),
                     min(5, ser_len),
                     ser_len,
-                ]:  # To avoid unnecessary work, we start with 1,
-                    # then 5 values, then the whole series
-                    if all(apply_test_func(serie, test_func, _range)):
-                        pass
-                    else:
+                ]:
+                    if not all(apply_test_func(serie, test_func, _range)):
                         return 0.0
                 return 1.0
             else:
-                # if we have a proportion, statistically it's OK to analyse up to 10k rows
-                # (arbitrary number) and get a significant result
-                to_analyse = min(ser_len, MAX_ROWS_ANALYSIS)
-                result = apply_test_func(serie, test_func, to_analyse).sum() / to_analyse
+                result = apply_test_func(serie, test_func, ser_len).sum() / ser_len
                 return result if result >= proportion else 0.0
     finally:
         if verbose and time() - start > 3:
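The rewritten early-stop branch is easier to see in isolation. A self-contained sketch of the idea with a stand-in test function (not the package's `_is` tests): when `proportion == 1`, a single failing value settles the outcome, so checking 1 row, then 5, then all of them avoids scanning long columns that fail immediately.

```python
import pandas as pd

def all_match(serie: pd.Series, test_func) -> float:
    # stand-in for the proportion == 1 branch of test_col_val
    values = serie[serie.notnull()]
    for sample_size in (min(1, len(values)), min(5, len(values)), len(values)):
        if not values.head(sample_size).map(test_func).all():
            return 0.0  # one failing value is enough to reject the type
    return 1.0

print(all_match(pd.Series(["1", "2", "x"]), str.isdigit))  # 0.0, rejected at the 5-row step
```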
@@ -81,7 +80,7 @@ def test_col_label(
 
 def test_col(
     table: pd.DataFrame,
-    all_tests: list,
+    all_tests: dict[str, dict],
     limited_output: bool,
     skipna: bool = True,
     verbose: bool = False,
@@ -89,25 +88,18 @@ def test_col(
     if verbose:
         start = time()
         logging.info("Testing columns to get types")
-    test_funcs = {
-        test.__name__.split(".")[-1]: {
-            "func": test._is,
-            "prop": test.PROPORTION,
-        }
-        for test in all_tests
-    }
     return_table = pd.DataFrame(columns=table.columns)
-    for idx, (key, value) in enumerate(test_funcs.items()):
+    for idx, (name, attributes) in enumerate(all_tests.items()):
         if verbose:
             start_type = time()
-            logging.info(f"\t- Starting with type '{key}'")
+            logging.info(f"\t- Starting with type '{name}'")
         # improvement lead: run the longest tests last, and only if previous tests are not satisfactory
         # => the following needs to change, "apply" means all columns are tested for one type at once
-        return_table.loc[key] = table.apply(
+        return_table.loc[name] = table.apply(
             lambda serie: test_col_val(
                 serie,
-                value["func"],
-                value["prop"],
+                attributes["func"],
+                attributes["prop"],
                 skipna=skipna,
                 limited_output=limited_output,
                 verbose=verbose,
@@ -115,7 +107,7 @@ def test_col(
         )
         if verbose:
             display_logs_depending_process_time(
-                f'\t> Done with type "{key}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(test_funcs)})',
+                f'\t> Done with type "{name}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(all_tests)})',
                 time() - start_type,
             )
     if verbose:
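`test_col` above (and `test_label` below) now receive the name-to-test mapping ready-made instead of rebuilding it from the test modules. The expected shape, sketched with stand-in values (real entries are built by `return_all_tests` from each detection module's `_is` function and `PROPORTION` constant):

```python
# hypothetical entry; "float" stands in for a real detection module name
all_tests = {
    "float": {
        "func": lambda v: v.replace(".", "", 1).isdigit(),  # stand-in for _is
        "prop": 0.9,  # stand-in for PROPORTION
    },
}
```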
@@ -125,26 +117,24 @@ def test_col(
     return return_table
 
 
-def test_label(table: pd.DataFrame, all_tests: list, limited_output: bool, verbose: bool = False):
+def test_label(
+    columns: list[str], all_tests: dict[str, dict], limited_output: bool, verbose: bool = False
+):
     if verbose:
         start = time()
         logging.info("Testing labels to get types")
-    test_funcs = dict()
-    for test in all_tests:
-        name = test.__name__.split(".")[-1]
-        test_funcs[name] = {"func": test._is, "prop": test.PROPORTION}
 
-    return_table = pd.DataFrame(columns=table.columns)
-    for idx, (key, value) in enumerate(test_funcs.items()):
+    return_table = pd.DataFrame(columns=columns)
+    for idx, (key, value) in enumerate(all_tests.items()):
         if verbose:
             start_type = time()
         return_table.loc[key] = [
             test_col_label(col_name, value["func"], value["prop"], limited_output=limited_output)
-            for col_name in table.columns
+            for col_name in columns
         ]
         if verbose:
             display_logs_depending_process_time(
-                f'\t- Done with type "{key}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(test_funcs)})',
+                f'\t- Done with type "{key}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(all_tests)})',
                 time() - start_type,
             )
     if verbose:
@@ -152,3 +142,111 @@ def test_label(table: pd.DataFrame, all_tests: list, limited_output: bool, verbose: bool = False):
             f"Done testing labels in {round(time() - start, 3)}s", time() - start
         )
     return return_table
+
+
+def test_col_chunks(
+    table: pd.DataFrame,
+    file_path: str,
+    analysis: dict,
+    all_tests: dict[str, dict],
+    limited_output: bool,
+    skipna: bool = True,
+    verbose: bool = False,
+) -> tuple[pd.DataFrame, dict, dict[str, pd.Series]]:
+    def build_remaining_tests_per_col(return_table: pd.DataFrame) -> dict[str, list[str]]:
+        return {
+            col: [test for test in return_table.index if return_table.loc[test, col] > 0]
+            for col in return_table.columns
+        }
+
+    if verbose:
+        start = time()
+        logging.info("Testing columns to get types on chunks")
+
+    # analysing the sample to get a first guess
+    return_table = test_col(table, all_tests, limited_output, skipna=skipna, verbose=verbose)
+    remaining_tests_per_col = build_remaining_tests_per_col(return_table)
+
+    # hashing rows to get nb_duplicates
+    row_hashes_count = table.apply(lambda row: hash(tuple(row)), axis=1).value_counts()
+    # getting values for profile to read the file only once
+    col_values = {col: table[col].value_counts(dropna=False) for col in table.columns}
+
+    # only csv files can end up here, can't chunk excel
+    chunks = pd.read_csv(
+        file_path,
+        dtype=str,
+        encoding=analysis["encoding"],
+        sep=analysis["separator"],
+        skiprows=analysis["header_row_idx"],
+        compression=analysis.get("compression"),
+        chunksize=CHUNK_SIZE,
+    )
+    analysis["total_lines"] = CHUNK_SIZE
+    batch, batch_number = [], 1
+    iterator = peekable(enumerate(chunks))
+    while iterator:
+        idx, chunk = next(iterator)
+        if idx == 0:
+            # we have read and analysed the first chunk already
+            continue
+        # it's too slow to process chunks directly, but we want to keep the first analysis
+        # on a "small" chunk, so partial analyses are done on batches of chunks
+        batch.append(chunk)
+        if len(batch) < 10:
+            # we don't know when the chunks end, and doing one additional step
+            # for the final batch is ugly
+            try:
+                iterator.peek()
+                continue
+            except StopIteration:
+                pass
+        if verbose:
+            logging.info(f"> Testing batch number {batch_number}")
+        batch = pd.concat(batch, ignore_index=True)
+        analysis["total_lines"] += len(batch)
+        row_hashes_count = row_hashes_count.add(
+            batch.apply(lambda row: hash(tuple(row)), axis=1).value_counts(),
+            fill_value=0,
+        )
+        for col in batch.columns:
+            col_values[col] = col_values[col].add(
+                batch[col].value_counts(dropna=False),
+                fill_value=0,
+            )
+        if not any(remaining_tests for remaining_tests in remaining_tests_per_col.values()):
+            # no more potential tests to do on any column, early stop
+            break
+        for col, tests in remaining_tests_per_col.items():
+            # testing each column with the tests that are still competing
+            # after previous batches' analyses
+            for test in tests:
+                batch_col_test = test_col_val(
+                    batch[col],
+                    all_tests[test]["func"],
+                    all_tests[test]["prop"],
+                    limited_output=limited_output,
+                    skipna=skipna,
+                )
+                return_table.loc[test, col] = (
+                    # if this batch's column tested 0 then the test fails overall
+                    0
+                    if batch_col_test == 0
+                    # otherwise updating the score with a weighted average
+                    else ((return_table.loc[test, col] * idx + batch_col_test) / (idx + 1))
+                )
+        remaining_tests_per_col = build_remaining_tests_per_col(return_table)
+        batch, batch_number = [], batch_number + 1
+    analysis["nb_duplicates"] = sum(row_hashes_count > 1)
+    analysis["categorical"] = [
+        col for col, values in col_values.items() if len(values) <= MAX_NUMBER_CATEGORICAL_VALUES
+    ]
+    # empty (all-NaN) columns score 1 on every test, reset them to 0
+    for col in return_table.columns:
+        if sum(return_table[col]) == len(return_table):
+            return_table[col] = 0
+    if verbose:
+        display_logs_depending_process_time(
+            f"Done testing chunks in {round(time() - start, 3)}s", time() - start
+        )
+    return return_table, analysis, col_values
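The score update in the batch loop above is a running mean. A hedged sketch with equal-sized chunks (the real code weights by the chunk index while batches can vary in size, so it is an approximation there):

```python
# per-chunk proportions for one (test, column) pair; values are made up
scores = [1.0, 0.9, 1.0]
running = scores[0]
for idx, chunk_score in enumerate(scores[1:], start=1):
    # a single zero fails the test overall, otherwise update the running mean
    running = 0.0 if chunk_score == 0 else (running * idx + chunk_score) / (idx + 1)
print(round(running, 3))  # 0.967, the mean of the three chunk scores
```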
csv_detective/parsing/csv.py CHANGED
@@ -1,11 +1,14 @@
 import logging
 from time import time
-from typing import TextIO
+from typing import Optional, TextIO
 
 import pandas as pd
 
 from csv_detective.utils import display_logs_depending_process_time
 
+# the number of rows for the first analysis, and the number of rows per chunk of the df iterator
+CHUNK_SIZE = int(1e4)
+
 
 def parse_csv(
     the_file: TextIO,
@@ -15,36 +18,36 @@ def parse_csv(
     skiprows: int,
     random_state: int = 42,
     verbose: bool = False,
-) -> tuple[pd.DataFrame, int, int]:
+) -> tuple[pd.DataFrame, Optional[int], Optional[int]]:
     if verbose:
         start = time()
         logging.info("Parsing table")
-    table = None
 
     if not isinstance(the_file, str):
         the_file.seek(0)
 
-    total_lines = None
-    for encoding in [encoding, "ISO-8859-1", "utf-8"]:
-        if encoding is None:
-            continue
-
-        if "ISO-8859" in encoding:
-            encoding = "ISO-8859-1"
-        try:
-            table = pd.read_csv(the_file, sep=sep, dtype=str, encoding=encoding, skiprows=skiprows)
-            total_lines = len(table)
+    try:
+        table = pd.read_csv(
+            the_file,
+            sep=sep,
+            dtype=str,
+            encoding=encoding,
+            skiprows=skiprows,
+            nrows=CHUNK_SIZE,
+        )
+        total_lines = len(table)
+        # branch between small and big files starts here
+        if total_lines == CHUNK_SIZE:
+            if verbose:
+                logging.warning(f"File is too long, analysing in chunks of {CHUNK_SIZE} rows")
+            total_lines, nb_duplicates = None, None
+        else:
             nb_duplicates = len(table.loc[table.duplicated()])
-            if num_rows > 0:
-                num_rows = min(num_rows - 1, total_lines)
-                table = table.sample(num_rows, random_state=random_state)
-            # else : table is unchanged
-            break
-        except TypeError:
-            print("Trying encoding : {encoding}".format(encoding=encoding))
-
-    if table is None:
-        raise ValueError("Could not load file")
+        if num_rows > 0:
+            num_rows = min(num_rows, total_lines or len(table))
+            table = table.sample(num_rows, random_state=random_state)
+    except Exception as e:
+        raise ValueError("Could not load file") from e
     if verbose:
         display_logs_depending_process_time(
             f"Table parsed successfully in {round(time() - start, 3)}s",
csv_detective/parsing/load.py CHANGED
@@ -44,6 +44,8 @@ def load_file(
         sheet_name=sheet_name,
         verbose=verbose,
     )
+    if table.empty:
+        raise ValueError("Table seems to be empty")
     header = table.columns.to_list()
     analysis = {
         "engine": engine,
@@ -66,34 +68,45 @@ def load_file(
         binary_file.seek(0)
         # decoding and reading file
         if is_url(file_path) or engine in COMPRESSION_ENGINES:
-            str_file = StringIO(binary_file.read().decode(encoding=encoding))
+            str_file = StringIO()
+            while True:
+                chunk = binary_file.read(1024**2)
+                if not chunk:
+                    break
+                str_file.write(chunk.decode(encoding=encoding))
+            del binary_file
+            str_file.seek(0)
         else:
             str_file = open(file_path, "r", encoding=encoding)
         if sep is None:
             sep = detect_separator(str_file, verbose=verbose)
         header_row_idx, header = detect_headers(str_file, sep, verbose=verbose)
-        if header is None:
-            return {"error": True}
-        elif isinstance(header, list):
-            if any([x is None for x in header]):
-                return {"error": True}
+        if header is None or (isinstance(header, list) and any([h is None for h in header])):
+            raise ValueError("Could not retrieve headers")
         heading_columns = detect_heading_columns(str_file, sep, verbose=verbose)
         trailing_columns = detect_trailing_columns(str_file, sep, heading_columns, verbose=verbose)
         table, total_lines, nb_duplicates = parse_csv(
             str_file, encoding, sep, num_rows, header_row_idx, verbose=verbose
         )
+        del str_file
+        if table.empty:
+            raise ValueError("Table seems to be empty")
         analysis = {
             "encoding": encoding,
             "separator": sep,
             "heading_columns": heading_columns,
             "trailing_columns": trailing_columns,
         }
+        if engine is not None:
+            analysis["compression"] = engine
         analysis.update(
             {
                 "header_row_idx": header_row_idx,
                 "header": header,
-                "total_lines": total_lines,
-                "nb_duplicates": nb_duplicates,
             }
         )
+        if total_lines is not None:
+            analysis["total_lines"] = total_lines
+        if nb_duplicates is not None:
+            analysis["nb_duplicates"] = nb_duplicates
     return table, analysis
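One caveat on the 1 MiB streaming decode above: a fixed-size byte read can split a multi-byte character (UTF-8 accents, for instance) at a chunk boundary, and a plain `chunk.decode()` raises on the partial sequence. A hedged variant of the same loop using an incremental decoder; a sketch of the safer pattern, not what the package ships:

```python
import codecs
from io import BytesIO, StringIO

def stream_decode(binary_file, encoding: str) -> StringIO:
    # an incremental decoder buffers partial multi-byte sequences between reads
    decoder = codecs.getincrementaldecoder(encoding)()
    str_file = StringIO()
    while True:
        chunk = binary_file.read(1024**2)
        if not chunk:
            str_file.write(decoder.decode(b"", final=True))  # flush pending bytes
            break
        str_file.write(decoder.decode(chunk))
    str_file.seek(0)
    return str_file

print(stream_decode(BytesIO("héllo".encode("utf-8")), "utf-8").read())  # héllo
```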
csv_detective/validate.py CHANGED
@@ -3,76 +3,121 @@ import logging
 import pandas as pd
 
 from csv_detective.load_tests import return_all_tests
-from csv_detective.parsing.columns import test_col_val
-from csv_detective.parsing.load import load_file
+from csv_detective.parsing.columns import MAX_NUMBER_CATEGORICAL_VALUES, test_col_val
 
+VALIDATION_CHUNK_SIZE = int(1e5)
 logging.basicConfig(level=logging.INFO)
 
-tests = {
-    t.__name__.split(".")[-1]: {
-        "func": t._is,
-        "prop": t.PROPORTION,
-    }
-    for t in return_all_tests("ALL", "detect_fields")
-}
+tests = return_all_tests("ALL", "detect_fields")
 
 
 def validate(
     file_path: str,
     previous_analysis: dict,
-    num_rows: int = 500,
-    encoding: str | None = None,
-    sep: str | None = None,
     verbose: bool = False,
     skipna: bool = True,
-    sheet_name: str | int | None = None,
-) -> tuple[bool, pd.DataFrame | None, dict | None]:
+) -> tuple[bool, pd.DataFrame | None, dict | None, dict[str, pd.Series] | None]:
     """
-    Verify is the given file has the same fields and types as in the previous analysis.
+    Verify that the given file has the same fields and types as in the given analysis.
     """
     try:
-        table, analysis = load_file(
-            file_path=file_path,
-            num_rows=num_rows,
-            encoding=encoding,
-            sep=sep,
-            verbose=verbose,
-            sheet_name=sheet_name,
+        if previous_analysis.get("separator"):
+            # loading the table in chunks
+            chunks = pd.read_csv(
+                file_path,
+                dtype=str,
+                sep=previous_analysis["separator"],
+                encoding=previous_analysis["encoding"],
+                skiprows=previous_analysis["header_row_idx"],
+                compression=previous_analysis.get("compression"),
+                chunksize=VALIDATION_CHUNK_SIZE,
+            )
+            analysis = {
+                k: v
+                for k, v in previous_analysis.items()
+                if k
+                in ["encoding", "separator", "compression", "heading_columns", "trailing_columns"]
+                and v is not None
+            }
+        else:
+            # or a chunks-like iterator if the file cannot be chunked
+            chunks = iter(
+                [
+                    pd.read_excel(
+                        file_path,
+                        dtype=str,
+                        engine=previous_analysis["engine"],
+                        sheet_name=previous_analysis["sheet_name"],
+                    )
+                ]
+            )
+            analysis = {k: v for k, v in previous_analysis.items() if k in ["engine", "sheet_name"]}
+        first_chunk = next(chunks)
+        analysis.update(
+            {k: v for k, v in previous_analysis.items() if k in ["header_row_idx", "header"]}
         )
     except Exception as e:
         if verbose:
             logging.warning(f"> Could not load the file with previous analysis values: {e}")
-        return False, None, None
+        return False, None, None, None
     if verbose:
         logging.info("Comparing table with the previous analysis")
         logging.info("- Checking if all columns match")
-    if any(col_name not in analysis["header"] for col_name in previous_analysis["header"]) or any(
-        col_name not in previous_analysis["header"] for col_name in analysis["header"]
+    if len(first_chunk.columns) != len(previous_analysis["header"]) or any(
+        list(first_chunk.columns)[k] != previous_analysis["header"][k]
+        for k in range(len(previous_analysis["header"]))
     ):
         if verbose:
             logging.warning("> Columns do not match, proceeding with full analysis")
-        return False, None, None
-    for col_name, args in previous_analysis["columns"].items():
+        return False, None, None, None
+    if verbose:
+        logging.info(
+            f"Testing previously detected formats on chunks of {VALIDATION_CHUNK_SIZE} rows"
+        )
+
+    # hashing rows to get nb_duplicates
+    row_hashes_count = first_chunk.apply(lambda row: hash(tuple(row)), axis=1).value_counts()
+    # getting values for profile to read the file only once
+    col_values = {col: first_chunk[col].value_counts(dropna=False) for col in first_chunk.columns}
+    analysis["total_lines"] = 0
+    for idx, chunk in enumerate([first_chunk, *chunks]):
         if verbose:
-            logging.info(f"- Testing {col_name} for {args['format']}")
-        if args["format"] == "string":
-            # no test for columns that have not been recognized as a specific format
-            continue
-        test_result: float = test_col_val(
-            serie=table[col_name],
-            test_func=tests[args["format"]]["func"],
-            proportion=tests[args["format"]]["prop"],
-            skipna=skipna,
+            logging.info(f"> Testing chunk number {idx}")
+        analysis["total_lines"] += len(chunk)
+        row_hashes_count = row_hashes_count.add(
+            chunk.apply(lambda row: hash(tuple(row)), axis=1).value_counts(),
+            fill_value=0,
         )
-        if not bool(test_result):
+        for col in chunk.columns:
+            col_values[col] = col_values[col].add(
+                chunk[col].value_counts(dropna=False),
+                fill_value=0,
+            )
+        for col_name, args in previous_analysis["columns"].items():
             if verbose:
-                logging.warning("> Test failed, proceeding with full analysis")
-            return False, table, analysis
+                logging.info(f"- Testing {col_name} for {args['format']}")
+            if args["format"] == "string":
+                # no test for columns that have not been recognized as a specific format
+                continue
+            test_result: float = test_col_val(
+                serie=chunk[col_name],
+                test_func=tests[args["format"]]["func"],
+                proportion=tests[args["format"]]["prop"],
+                skipna=skipna,
+            )
+            if not bool(test_result):
+                if verbose:
+                    logging.warning("> Test failed, proceeding with full analysis")
+                return False, first_chunk, analysis, None
     if verbose:
         logging.info("> All checks successful")
+    analysis["nb_duplicates"] = sum(row_hashes_count > 1)
+    analysis["categorical"] = [
+        col for col, values in col_values.items() if len(values) <= MAX_NUMBER_CATEGORICAL_VALUES
+    ]
     return (
         True,
-        table,
+        first_chunk,
         analysis
         | {
             k: previous_analysis[k]
@@ -84,4 +129,5 @@ def validate(
                 "formats",
             ]
         },
+        col_values,
     )
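The row-hashing trick shared by `validate` and `test_col_chunks` merges duplicate counts across chunks without keeping the rows themselves in memory. A small self-contained example:

```python
import pandas as pd

chunk1 = pd.DataFrame({"a": ["x", "y"], "b": ["1", "2"]})
chunk2 = pd.DataFrame({"a": ["x"], "b": ["1"]})

# one hash per row, counted per chunk, then merged
counts = chunk1.apply(lambda row: hash(tuple(row)), axis=1).value_counts()
counts = counts.add(
    chunk2.apply(lambda row: hash(tuple(row)), axis=1).value_counts(),
    fill_value=0,
)
print(sum(counts > 1))  # 1: the row ("x", "1") appears twice across chunks
```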
csv_detective-0.9.3.dev2241.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: csv-detective
-Version: 0.9.3.dev2215
+Version: 0.9.3.dev2241
 Summary: Detect tabular files column content
 Author-email: Etalab <opendatateam@data.gouv.fr>
 License: MIT
@@ -22,6 +22,7 @@ Requires-Dist: python-magic==0.4.27
 Requires-Dist: frformat==0.4.0
 Requires-Dist: Faker>=33.0.0
 Requires-Dist: rstr==3.2.2
+Requires-Dist: more-itertools>=10.8.0
 Provides-Extra: dev
 Requires-Dist: pytest>=8.3.0; extra == "dev"
 Requires-Dist: responses>=0.25.0; extra == "dev"
@@ -30,7 +31,7 @@ Dynamic: license-file
 
 # CSV Detective
 
-This is a package to **automatically detect column content in tabular files**. The script reads either the whole file or the first few rows and performs various checks to see for each column if it matches with various content types. This is currently done through regex and string comparison.
+This is a package to **automatically detect column content in tabular files**. The script reads either the whole file or the first few rows and performs various checks (regex, casting, comparison with official lists...) to determine, for each column, whether it matches known content types.
 
 Currently supported file types: csv, xls, xlsx, ods.
 
@@ -50,7 +51,7 @@ pip install csv-detective
 
 Say you have a tabular file located at `file_path`. This is how you could use `csv_detective`:
 
-```
+```python
 # Import the csv_detective package
 from csv_detective import routine
 import os  # for this example only
@@ -158,13 +159,26 @@ The program creates a `Python` dictionary with the following information:
 ```
 
 The output slightly differs depending on the file format:
-- csv files have `encoding` and `separator`
+- csv files have `encoding` and `separator` (and `compression` if relevant)
 - xls, xlsx, ods files have `engine` and `sheet_name`
 
+You may also set `output_df` to `True`, in which case the output is a tuple of two elements:
+- the analysis (as described above)
+- an iterator of `pd.DataFrame`s containing the columns cast to the detected types (usable with `pd.concat` or in a loop):
+```python
+inspection, df_chunks = routine(
+    file_path=file_path,
+    num_rows=-1,
+    output_df=True,
+)
+cast_df = pd.concat(df_chunks, ignore_index=True)
+# if "col1" has been detected as a float, then cast_df["col1"] contains floats
+```
+
 ### What Formats Can Be Detected
 
 Includes:
-
+- types (float, int, dates, datetimes, JSON) and more specific ones (latitude, longitude, geoJSON...)
 - Communes, Départements, Régions, Pays
 - Codes Communes, Codes Postaux, Codes Departement, ISO Pays
 - Codes CSP, Description CSP, SIREN
@@ -172,6 +186,16 @@ Includes:
 - Years, Dates, Jours de la Semaine FR
 - UUIDs, Mongo ObjectIds
 
+### Validation
+If you have a pre-made analysis of a file, you can check whether another file conforms to the same analysis:
+```python
+from csv_detective import validate
+is_valid, *_ = validate(
+    file_path,
+    previous_analysis,  # exactly as it came out of the routine function
+)
+```
+
 ### Format detection and scoring
 For each column, 3 scores are computed for each format; the higher the score, the more likely the format:
 - the field score based on the values contained in the column (0.0 to 1.0).
@@ -199,7 +223,6 @@ Only the format with the highest score is present in the output.
 Related ideas:
 
 - store column names to make a learning model based on column names (a possible pre-screen)
-- normalising data based on column prediction
 - entity resolution (good luck...)
 
 ## Why Could This Be of Any Use ?