csv-detective 0.7.5.dev1180__py3-none-any.whl → 0.7.5.dev1209__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. csv_detective/__init__.py +1 -1
  2. csv_detective/detect_fields/FR/geo/adresse/__init__.py +1 -1
  3. csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +1 -1
  4. csv_detective/detect_fields/FR/other/csp_insee/__init__.py +1 -1
  5. csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +1 -1
  6. csv_detective/detect_fields/FR/other/sexe/__init__.py +1 -1
  7. csv_detective/detect_fields/temp/date/__init__.py +5 -1
  8. csv_detective/detect_labels/FR/geo/adresse/__init__.py +1 -1
  9. csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +1 -1
  10. csv_detective/detect_labels/FR/geo/code_departement/__init__.py +1 -1
  11. csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +1 -1
  12. csv_detective/detect_labels/FR/geo/code_postal/__init__.py +1 -1
  13. csv_detective/detect_labels/FR/geo/code_region/__init__.py +1 -1
  14. csv_detective/detect_labels/FR/geo/commune/__init__.py +1 -1
  15. csv_detective/detect_labels/FR/geo/departement/__init__.py +1 -1
  16. csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +1 -1
  17. csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +1 -1
  18. csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +1 -1
  19. csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +1 -1
  20. csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +1 -1
  21. csv_detective/detect_labels/FR/geo/pays/__init__.py +1 -1
  22. csv_detective/detect_labels/FR/geo/region/__init__.py +1 -1
  23. csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +1 -1
  24. csv_detective/detect_labels/FR/other/code_rna/__init__.py +1 -1
  25. csv_detective/detect_labels/FR/other/code_waldec/__init__.py +1 -1
  26. csv_detective/detect_labels/FR/other/csp_insee/__init__.py +1 -1
  27. csv_detective/detect_labels/FR/other/date_fr/__init__.py +1 -1
  28. csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +1 -1
  29. csv_detective/detect_labels/FR/other/sexe/__init__.py +1 -1
  30. csv_detective/detect_labels/FR/other/siren/__init__.py +1 -1
  31. csv_detective/detect_labels/FR/other/siret/__init__.py +1 -1
  32. csv_detective/detect_labels/FR/other/tel_fr/__init__.py +1 -1
  33. csv_detective/detect_labels/FR/other/uai/__init__.py +1 -1
  34. csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +1 -1
  35. csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +1 -1
  36. csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +1 -1
  37. csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +1 -1
  38. csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +1 -1
  39. csv_detective/detect_labels/geo/json_geojson/__init__.py +1 -1
  40. csv_detective/detect_labels/geo/latitude_wgs/__init__.py +1 -1
  41. csv_detective/detect_labels/geo/latlon_wgs/__init__.py +1 -1
  42. csv_detective/detect_labels/geo/longitude_wgs/__init__.py +1 -1
  43. csv_detective/detect_labels/other/booleen/__init__.py +1 -1
  44. csv_detective/detect_labels/other/email/__init__.py +1 -1
  45. csv_detective/detect_labels/other/float/__init__.py +1 -1
  46. csv_detective/detect_labels/other/int/__init__.py +1 -1
  47. csv_detective/detect_labels/other/mongo_object_id/__init__.py +1 -1
  48. csv_detective/detect_labels/other/twitter/__init__.py +1 -1
  49. csv_detective/detect_labels/other/url/__init__.py +1 -1
  50. csv_detective/detect_labels/other/uuid/__init__.py +1 -1
  51. csv_detective/detect_labels/temp/date/__init__.py +1 -1
  52. csv_detective/detect_labels/temp/datetime_iso/__init__.py +1 -1
  53. csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +1 -1
  54. csv_detective/detect_labels/temp/year/__init__.py +1 -1
  55. csv_detective/detection/columns.py +89 -0
  56. csv_detective/detection/encoding.py +27 -0
  57. csv_detective/detection/engine.py +46 -0
  58. csv_detective/detection/headers.py +32 -0
  59. csv_detective/detection/rows.py +18 -0
  60. csv_detective/detection/separator.py +44 -0
  61. csv_detective/detection/variables.py +98 -0
  62. csv_detective/explore_csv.py +40 -110
  63. csv_detective/output/dataframe.py +55 -0
  64. csv_detective/{create_example.py → output/example.py} +10 -9
  65. csv_detective/output/profile.py +87 -0
  66. csv_detective/{schema_generation.py → output/schema.py} +344 -343
  67. csv_detective/output/utils.py +51 -0
  68. csv_detective/parsing/columns.py +141 -0
  69. csv_detective/parsing/compression.py +11 -0
  70. csv_detective/parsing/csv.py +55 -0
  71. csv_detective/parsing/excel.py +169 -0
  72. csv_detective/parsing/load.py +97 -0
  73. csv_detective/utils.py +10 -236
  74. {csv_detective-0.7.5.dev1180.data → csv_detective-0.7.5.dev1209.data}/data/share/csv_detective/CHANGELOG.md +3 -0
  75. {csv_detective-0.7.5.dev1180.dist-info → csv_detective-0.7.5.dev1209.dist-info}/METADATA +3 -2
  76. {csv_detective-0.7.5.dev1180.dist-info → csv_detective-0.7.5.dev1209.dist-info}/RECORD +85 -71
  77. {csv_detective-0.7.5.dev1180.dist-info → csv_detective-0.7.5.dev1209.dist-info}/WHEEL +1 -1
  78. tests/test_fields.py +7 -6
  79. tests/test_file.py +56 -57
  80. csv_detective/detection.py +0 -618
  81. /csv_detective/{process_text.py → parsing/text.py} +0 -0
  82. {csv_detective-0.7.5.dev1180.data → csv_detective-0.7.5.dev1209.data}/data/share/csv_detective/LICENSE.AGPL.txt +0 -0
  83. {csv_detective-0.7.5.dev1180.data → csv_detective-0.7.5.dev1209.data}/data/share/csv_detective/README.md +0 -0
  84. {csv_detective-0.7.5.dev1180.dist-info → csv_detective-0.7.5.dev1209.dist-info}/entry_points.txt +0 -0
  85. {csv_detective-0.7.5.dev1180.dist-info → csv_detective-0.7.5.dev1209.dist-info/licenses}/LICENSE.AGPL.txt +0 -0
  86. {csv_detective-0.7.5.dev1180.dist-info → csv_detective-0.7.5.dev1209.dist-info}/top_level.txt +0 -0
csv_detective/detection/headers.py
@@ -0,0 +1,32 @@
+ import logging
+ from time import time
+ from typing import Optional, TextIO
+
+ from csv_detective.utils import display_logs_depending_process_time
+
+
+ def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int, Optional[list]]:
+     """Tests 10 first rows for possible header (in case header is not 1st row)"""
+     if verbose:
+         start = time()
+         logging.info("Detecting headers")
+     file.seek(0)
+     for i in range(10):
+         header = file.readline()
+         position = file.tell()
+         chaine = [c for c in header.replace("\n", "").split(sep) if c]
+         if chaine[-1] not in ["", "\n"] and all(
+             [mot not in ["", "\n"] for mot in chaine[1:-1]]
+         ):
+             next_row = file.readline()
+             file.seek(position)
+             if header != next_row:
+                 if verbose:
+                     display_logs_depending_process_time(
+                         f'Detected headers in {round(time() - start, 3)}s',
+                         time() - start,
+                     )
+                 return i, chaine
+     if verbose:
+         logging.info('No header detected')
+     return 0, None
csv_detective/detection/rows.py
@@ -0,0 +1,18 @@
+ import pandas as pd
+
+
+ def remove_empty_first_rows(table: pd.DataFrame) -> tuple[pd.DataFrame, int]:
+     """Analog process to detect_headers for csv files, determines how many rows to skip
+     to end up with the header at the right place"""
+     idx = 0
+     if all([str(c).startswith('Unnamed:') for c in table.columns]):
+         # there is on offset between the index in the file (idx here)
+         # and the index in the dataframe, because of the header
+         idx = 1
+         while table.iloc[idx - 1].isna().all():
+             idx += 1
+         cols = table.iloc[idx - 1]
+         table = table.iloc[idx:]
+         table.columns = cols.to_list()
+     # +1 here because the headers should count as a row
+     return table, idx
csv_detective/detection/separator.py
@@ -0,0 +1,44 @@
+ import csv
+ import logging
+ from time import time
+ from typing import TextIO
+
+ from csv_detective.utils import display_logs_depending_process_time
+
+
+ def detect_separator(file: TextIO, verbose: bool = False) -> str:
+     """Detects csv separator"""
+     # TODO: add a robust detection:
+     # si on a un point virgule comme texte et \t comme séparateur, on renvoie
+     # pour l'instant un point virgule
+     if verbose:
+         start = time()
+         logging.info("Detecting separator")
+     file.seek(0)
+     header = file.readline()
+     possible_separators = [";", ",", "|", "\t"]
+     sep_count = dict()
+     for sep in possible_separators:
+         sep_count[sep] = header.count(sep)
+     sep = max(sep_count, key=sep_count.get)
+     # testing that the first 10 (arbitrary) rows all have the same number of fields
+     # as the header. Prevents downstream unwanted behaviour where pandas can load
+     # the file (in a weird way) but the process is irrelevant.
+     file.seek(0)
+     reader = csv.reader(file, delimiter=sep)
+     rows_lengths = set()
+     for idx, row in enumerate(reader):
+         if idx > 10:
+             break
+         rows_lengths.add(len(row))
+     if len(rows_lengths) > 1:
+         raise ValueError(
+             f"Number of columns is not even across the first 10 rows (detected separator: {sep})."
+         )
+
+     if verbose:
+         display_logs_depending_process_time(
+             f'Detected separator: "{sep}" in {round(time() - start, 3)}s',
+             time() - start,
+         )
+     return sep
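Usage sketch (illustrative only, not part of the diff): the separator is picked among `;`, `,`, `|` and tab by counting occurrences in the first line, then validated against the column counts of the first rows.

```python
from io import StringIO

from csv_detective.detection.separator import detect_separator

sample = StringIO("id;label;value\n1;foo;3.5\n2;bar;4.2\n")
print(detect_separator(sample))  # ";"

# A file whose first rows do not all have the same number of fields now raises
# a ValueError instead of being parsed inconsistently downstream.
```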
csv_detective/detection/variables.py
@@ -0,0 +1,98 @@
+ from ast import literal_eval
+ import logging
+ from time import time
+
+ import pandas as pd
+
+ from csv_detective.utils import display_logs_depending_process_time
+
+
+ def detect_continuous_variable(table: pd.DataFrame, continuous_th: float = 0.9, verbose: bool = False):
+     """
+     Detects whether a column contains continuous variables. We consider a continuous column
+     one that contains a considerable amount of float values.
+     We removed the integers as we then end up with postal codes, insee codes, and all sort
+     of codes and types.
+     This is not optimal but it will do for now.
+     """
+     # if we need this again in the future, could be first based on columns detected as int/float to cut time
+
+     def check_threshold(serie: pd.Series, continuous_th: float) -> bool:
+         count = serie.value_counts().to_dict()
+         total_nb = len(serie)
+         if float in count:
+             nb_floats = count[float]
+         else:
+             return False
+         if nb_floats / total_nb >= continuous_th:
+             return True
+         else:
+             return False
+
+     def parses_to_integer(value: str):
+         try:
+             value = value.replace(",", ".")
+             value = literal_eval(value)
+             return type(value)
+         # flake8: noqa
+         except:
+             return False
+
+     if verbose:
+         start = time()
+         logging.info("Detecting continuous columns")
+     res = table.apply(
+         lambda serie: check_threshold(serie.apply(parses_to_integer), continuous_th)
+     )
+     if verbose:
+         display_logs_depending_process_time(
+             f"Detected {sum(res)} continuous columns in {round(time() - start, 3)}s",
+             time() - start,
+         )
+     return res.index[res]
+
+
+ def detect_categorical_variable(
+     table: pd.DataFrame,
+     threshold_pct_categorical: float = 0.05,
+     max_number_categorical_values: int = 25,
+     verbose: bool = False,
+ ):
+     """
+     Heuristically detects whether a table (df) contains categorical values according to
+     the number of unique values contained.
+     As the idea of detecting categorical values is to then try to learn models to predict
+     them, we limit categorical values to at most 25 different modes or at most 5% disparity.
+     Postal code, insee code, code region and so on, may be thus not considered categorical values.
+     :param table:
+     :param threshold_pct_categorical:
+     :param max_number_categorical_values:
+     :return:
+     """
+
+     def abs_number_different_values(column_values: pd.Series):
+         return column_values.nunique()
+
+     def rel_number_different_values(column_values: pd.Series):
+         return column_values.nunique() / len(column_values)
+
+     def detect_categorical(column_values: pd.Series):
+         abs_unique_values = abs_number_different_values(column_values)
+         rel_unique_values = rel_number_different_values(column_values)
+         if (
+             abs_unique_values <= max_number_categorical_values
+             or rel_unique_values <= threshold_pct_categorical
+         ):
+             return True
+         return False
+
+     if verbose:
+         start = time()
+         logging.info("Detecting categorical columns")
+     res = table.apply(lambda serie: detect_categorical(serie))
+     if verbose:
+         display_logs_depending_process_time(
+             f"Detected {sum(res)} categorical columns out of {len(table.columns)} in {round(time() - start, 3)}s",
+             time() - start,
+         )
+     return res.index[res], res
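The renamed `detect_categorical_variable` (previously `detetect_categorical_variable`) keeps the same heuristic: at most 25 distinct values, or at most 5% distinct values per column. A small illustrative check, not part of the diff:

```python
import pandas as pd

from csv_detective.detection.variables import detect_categorical_variable

df = pd.DataFrame({
    "statut": ["actif", "inactif"] * 50,  # 2 distinct values -> categorical
    "identifiant": range(100),            # 100 distinct values -> not categorical
})
categorical_cols, mask = detect_categorical_variable(df)
print(list(categorical_cols))  # ['statut']
```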
csv_detective/explore_csv.py
@@ -1,49 +1,28 @@
- """
- Ce script analyse les premières lignes d'un CSV pour essayer de déterminer le
- contenu possible des champs
- """
-
- from typing import Dict, List, Union
  from collections import defaultdict
  import json
- import numpy as np
+ import logging
  import os
  import tempfile
- import logging
  from time import time
- import requests
- from io import StringIO
+ from typing import Union
+
+ import numpy as np
  import pandas as pd

  # flake8: noqa
  from csv_detective import detect_fields, detect_labels
- from csv_detective.s3_utils import download_from_minio, upload_to_minio
- from csv_detective.schema_generation import generate_table_schema
- from csv_detective.utils import (
-     cast_df,
-     display_logs_depending_process_time,
-     prepare_output_dict,
-     test_col,
-     test_label,
- )
- from .detection import (
-     detect_engine,
-     detect_separator,
-     detect_encoding,
-     detect_headers,
-     detect_heading_columns,
-     detect_trailing_columns,
-     parse_table,
-     parse_excel,
-     create_profile,
-     detetect_categorical_variable,
+ from .detection.variables import (
+     detect_categorical_variable,
      # detect_continuous_variable,
-     is_url,
-     XLS_LIKE_EXT,
  )
-
-
- logging.basicConfig(level=logging.INFO)
+ from .output.dataframe import cast_df
+ from .output.profile import create_profile
+ from .output.schema import generate_table_schema
+ from .output.utils import prepare_output_dict
+ from .parsing.load import load_file
+ from .parsing.columns import test_col, test_label
+ from .s3_utils import download_from_minio, upload_to_minio
+ from .utils import display_logs_depending_process_time, is_url


  def get_all_packages(detect_type) -> list:
@@ -104,9 +83,9 @@ def return_all_tests(


  def routine(
-     csv_file_path: str,
+     file_path: str,
      num_rows: int = 500,
-     user_input_tests: Union[str, List[str]] = "ALL",
+     user_input_tests: Union[str, list[str]] = "ALL",
      limited_output: bool = True,
      save_results: Union[bool, str] = True,
      encoding: str = None,
@@ -123,7 +102,7 @@ def routine(
      column contents.

      Args:
-         csv_file_path: local path to CSV file if not using Minio
+         file_path: local path to CSV file if not using Minio
          num_rows: number of rows to sample from the file for analysis ; -1 for analysis
              of the whole file
          user_input_tests: tests to run on the file
@@ -140,89 +119,40 @@ def routine(
      Returns:
          dict: a dict with information about the csv and possible types for each column
      """
-     if not csv_file_path:
-         raise ValueError("csv_file_path is required.")

      if not (isinstance(save_results, bool) or (isinstance(save_results, str) and save_results.endswith(".json"))):
          raise ValueError("`save_results` must be a bool or a valid path to a json file.")

      if verbose:
          start_routine = time()
-         if is_url(csv_file_path):
+         if is_url(file_path):
              logging.info("Path recognized as a URL")

-     file_name = csv_file_path.split('/')[-1]
-     engine = None
-     if '.' not in file_name:
-         # file has no extension, we'll investigate how to read it
-         engine = detect_engine(csv_file_path, verbose=verbose)
-
-     is_xls_like = False
-     if engine or any([csv_file_path.endswith(k) for k in XLS_LIKE_EXT]):
-         is_xls_like = True
-         encoding, sep, heading_columns, trailing_columns = None, None, None, None
-         table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx = parse_excel(
-             csv_file_path=csv_file_path,
-             num_rows=num_rows,
-             engine=engine,
-             sheet_name=sheet_name,
-             verbose=verbose,
-         )
-         header = table.columns.to_list()
-     else:
-         if encoding is None:
-             encoding = detect_encoding(csv_file_path, verbose=verbose)
-         if is_url(csv_file_path):
-             r = requests.get(csv_file_path, allow_redirects=True)
-             r.raise_for_status()
-             str_file = StringIO(r.content.decode(encoding=encoding))
-         else:
-             str_file = open(csv_file_path, "r", encoding=encoding)
-         if sep is None:
-             sep = detect_separator(str_file, verbose=verbose)
-         header_row_idx, header = detect_headers(str_file, sep, verbose=verbose)
-         if header is None:
-             return {"error": True}
-         elif isinstance(header, list):
-             if any([x is None for x in header]):
-                 return {"error": True}
-         heading_columns = detect_heading_columns(str_file, sep, verbose=verbose)
-         trailing_columns = detect_trailing_columns(str_file, sep, heading_columns, verbose=verbose)
-         table, total_lines, nb_duplicates = parse_table(
-             str_file, encoding, sep, num_rows, header_row_idx, verbose=verbose
-         )
+     table, analysis = load_file(
+         file_path=file_path,
+         num_rows=num_rows,
+         encoding=encoding,
+         sep=sep,
+         verbose=verbose,
+         sheet_name=sheet_name,
+     )

      if table.empty:
          res_categorical = []
          # res_continuous = []
      else:
          # Detects columns that are categorical
-         res_categorical, categorical_mask = detetect_categorical_variable(table, verbose=verbose)
+         res_categorical, categorical_mask = detect_categorical_variable(table, verbose=verbose)
          res_categorical = list(res_categorical)
          # Detect columns that are continuous (we already know the categorical) : we don't need this for now, cuts processing time
          # res_continuous = list(
          #     detect_continuous_variable(table.iloc[:, ~categorical_mask.values], verbose=verbose)
          # )

-     # Creating return dictionary
-     analysis = {
-         "header_row_idx": header_row_idx,
-         "header": header,
-         "total_lines": total_lines,
-         "nb_duplicates": nb_duplicates,
-         "heading_columns": heading_columns,
-         "trailing_columns": trailing_columns,
+     analysis.update({
          "categorical": res_categorical,
          # "continuous": res_continuous,
-     }
-     # this is only relevant for xls-like
-     if is_xls_like:
-         analysis["engine"] = engine
-         analysis["sheet_name"] = sheet_name
-     # this is only relevant for csv
-     else:
-         analysis["encoding"] = encoding
-         analysis["separator"] = sep
+     })

      # list testing to be performed
      all_tests_fields = return_all_tests(
@@ -341,10 +271,10 @@ def routine(
          if isinstance(save_results, str):
              output_path = save_results
          else:
-             output_path = os.path.splitext(csv_file_path)[0]
+             output_path = os.path.splitext(file_path)[0]
              if is_url(output_path):
                  output_path = output_path.split('/')[-1]
-             if is_xls_like:
+             if analysis.get("sheet_name"):
                  output_path += "_sheet-" + str(sheet_name)
              output_path += ".json"
          with open(output_path, "w", encoding="utf8") as fp:
@@ -372,13 +302,13 @@


  def routine_minio(
-     csv_minio_location: Dict[str, str],
-     output_minio_location: Dict[str, str],
-     tableschema_minio_location: Dict[str, str],
+     csv_minio_location: dict[str, str],
+     output_minio_location: dict[str, str],
+     tableschema_minio_location: dict[str, str],
      minio_user: str,
      minio_pwd: str,
      num_rows: int = 500,
-     user_input_tests: Union[str, List[str]] = "ALL",
+     user_input_tests: Union[str, list[str]] = "ALL",
      encoding: str = None,
      sep: str = None,
  ):
@@ -436,18 +366,18 @@
      ):
          raise ValueError("Minio location dict must contain url, bucket and key")

-     csv_file_path = tempfile.NamedTemporaryFile(delete=False).name
+     file_path = tempfile.NamedTemporaryFile(delete=False).name
      download_from_minio(
          netloc=csv_minio_location["netloc"],
          bucket=csv_minio_location["bucket"],
          key=csv_minio_location["key"],
-         filepath=csv_file_path,
+         filepath=file_path,
          minio_user=minio_user,
          minio_pwd=minio_pwd,
      )

      analysis = routine(
-         csv_file_path,
+         file_path,
          num_rows,
          user_input_tests,
          output_mode="LIMITED",
@@ -457,7 +387,7 @@ def routine_minio(
      )

      # Write report JSON file.
-     output_path_to_store_minio_file = os.path.splitext(csv_file_path)[0] + ".json"
+     output_path_to_store_minio_file = os.path.splitext(file_path)[0] + ".json"
      with open(output_path_to_store_minio_file, "w", encoding="utf8") as fp:
          json.dump(analysis, fp, indent=4, separators=(",", ": "))

@@ -471,7 +401,7 @@ def routine_minio(
      )

      os.remove(output_path_to_store_minio_file)
-     os.remove(csv_file_path)
+     os.remove(file_path)

      generate_table_schema(
          analysis,
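With this refactor the public entry point takes `file_path` rather than `csv_file_path`, and the parsing metadata (encoding, separator, sheet name, header, ...) is assembled by `load_file` before the categorical detection step. A hedged usage sketch, not part of the diff; the CSV path is hypothetical:

```python
from csv_detective.explore_csv import routine

analysis = routine(
    "data/example.csv",   # hypothetical local file
    num_rows=500,
    save_results=False,
)
# Keys such as "encoding", "separator", "header" and "categorical" come from
# the analysis dict built by load_file() and updated in routine().
print(sorted(analysis.keys()))
```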
csv_detective/output/dataframe.py
@@ -0,0 +1,55 @@
+ from datetime import date, datetime
+ import json
+ from typing import Optional, Union
+ from time import time
+
+ import pandas as pd
+
+ from csv_detective.detect_fields.other.booleen import bool_casting
+ from csv_detective.detect_fields.other.float import float_casting
+ from csv_detective.detect_fields.temp.date import date_casting
+ from csv_detective.utils import display_logs_depending_process_time
+
+
+ def cast(value: str, _type: str) -> Optional[Union[str, float, bool, date, datetime]]:
+     if not isinstance(value, str) or not value:
+         # None is the current default value in hydra, should we keep this?
+         return None
+     if _type == "float":
+         return float_casting(value)
+     if _type == "bool":
+         return bool_casting(value)
+     if _type == "json":
+         # in hydra json are given to postgres as strings, conversion is done by postgres
+         return json.loads(value)
+     if _type == "date":
+         _date = date_casting(value)
+         return _date.date() if _date else None
+     if _type == "datetime":
+         return date_casting(value)
+     raise ValueError(f"Unknown type `{_type}`")
+
+
+ def cast_df(df: pd.DataFrame, columns: dict, cast_json: bool = True, verbose: bool = False) -> pd.DataFrame:
+     if verbose:
+         start = time()
+     output_df = pd.DataFrame()
+     for col_name, detection in columns.items():
+         if detection["python_type"] == "string" or (detection["python_type"] == "json" and not cast_json):
+             # no change if detected type is string
+             output_df[col_name] = df[col_name].copy()
+         elif detection["python_type"] == "int":
+             # to allow having ints and NaN in the same column
+             output_df[col_name] = df[col_name].copy().astype(pd.Int64Dtype())
+         else:
+             output_df[col_name] = df[col_name].apply(
+                 lambda col: cast(col, _type=detection["python_type"])
+             )
+         # to save RAM
+         del df[col_name]
+     if verbose:
+         display_logs_depending_process_time(
+             f'Casting columns completed in {round(time() - start, 3)}s',
+             time() - start,
+         )
+     return output_df
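A sketch of the new `cast_df` helper (illustrative only; the column metadata is reduced to the `python_type` field the function actually reads). Note that it consumes the input frame column by column to save memory:

```python
import pandas as pd

from csv_detective.output.dataframe import cast_df

raw = pd.DataFrame({
    "taux": ["1,5", "2,0", ""],
    "commune": ["Paris", "Lyon", "Nantes"],
})
columns = {
    "taux": {"python_type": "float"},      # cast via float_casting, empty strings become None
    "commune": {"python_type": "string"},  # copied unchanged
}
typed = cast_df(raw, columns)
print(typed["taux"].tolist())  # [1.5, 2.0, None], assuming float_casting accepts the comma decimal
```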
csv_detective/{create_example.py → output/example.py}
@@ -1,13 +1,14 @@
+ from datetime import datetime
+ import json
  import random
- import uuid
  import string
- from datetime import datetime
+ from typing import Union, Optional, Any, Type
+ import uuid
+
+ from faker import Faker
  import pandas as pd
- from typing import List, Union, Optional, Any, Type
- import json
  import requests
  import rstr
- from faker import Faker

  fake = Faker()

@@ -69,7 +70,7 @@ def create_example_csv_file(
          return str(uuid.uuid4())

      def _date(
-         date_range: Union[None, List[str]] = None,
+         date_range: Union[None, list[str]] = None,
          format: str = '%Y-%m-%d',
          required: bool = True,
      ) -> str:
@@ -98,7 +99,7 @@ def create_example_csv_file(
          return fake.time(format)

      def _datetime(
-         datetime_range: Optional[List[str]] = None,
+         datetime_range: Optional[list[str]] = None,
          format: str = '%Y-%m-%d %H-%M-%S',
          required: bool = True,
      ) -> str:
@@ -123,7 +124,7 @@ def create_example_csv_file(

      def _number(
          num_type: Type[Union[int, float]] = int,
-         num_range: Optional[List[float]] = None,
+         num_range: Optional[list[float]] = None,
          enum: Optional[list] = None,
          required: bool = True,
      ) -> Union[int, float]:
@@ -144,7 +145,7 @@ def create_example_csv_file(
              return ''
          return random.randint(0, 1) == 0

-     def _array(enum: List[Any], required: bool = True) -> str:
+     def _array(enum: list[Any], required: bool = True) -> str:
          if potential_skip(required):
              return ''
          return f"[{','.join(random.sample(enum, random.randint(1, len(enum))))}]"
csv_detective/output/profile.py
@@ -0,0 +1,87 @@
+ from collections import defaultdict
+ import logging
+ from time import time
+
+ import pandas as pd
+
+ from csv_detective.detect_fields.other.float import float_casting
+ from csv_detective.utils import display_logs_depending_process_time, prevent_nan
+
+
+ def create_profile(
+     table: pd.DataFrame,
+     dict_cols_fields: dict,
+     num_rows: int,
+     limited_output: bool = True,
+     verbose: bool = False,
+ ) -> dict:
+     if verbose:
+         start = time()
+         logging.info("Creating profile")
+     map_python_types = {
+         "string": str,
+         "int": float,
+         "float": float,
+     }
+
+     if num_rows > 0:
+         raise ValueError("To create profiles num_rows has to be set to -1")
+     safe_table = table.copy()
+     if not limited_output:
+         dict_cols_fields = {
+             k: v[0] if v else {'python_type': 'string', 'format': 'string', 'score': 1.0}
+             for k, v in dict_cols_fields.items()
+         }
+     dtypes = {
+         k: map_python_types.get(v["python_type"], str)
+         for k, v in dict_cols_fields.items()
+     }
+     for c in safe_table.columns:
+         if dtypes[c] == float:
+             safe_table[c] = safe_table[c].apply(
+                 lambda s: float_casting(s) if isinstance(s, str) else s
+             )
+     profile = defaultdict(dict)
+     for c in safe_table.columns:
+         if map_python_types.get(dict_cols_fields[c]["python_type"], str) in [
+             float,
+             int,
+         ]:
+             profile[c].update(
+                 min=prevent_nan(map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                     safe_table[c].min()
+                 )),
+                 max=prevent_nan(map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                     safe_table[c].max()
+                 )),
+                 mean=prevent_nan(map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                     safe_table[c].mean()
+                 )),
+                 std=prevent_nan(map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                     safe_table[c].std()
+                 )),
+             )
+         tops_bruts = (
+             safe_table[safe_table[c].notna()][c]
+             .value_counts(dropna=True)
+             .reset_index()
+             .iloc[:10]
+             .to_dict(orient="records")
+         )
+         tops = []
+         for tb in tops_bruts:
+             tops.append({
+                 "count": tb["count"],
+                 "value": tb[c],
+             })
+         profile[c].update(
+             tops=tops,
+             nb_distinct=safe_table[c].nunique(),
+             nb_missing_values=len(safe_table[c].loc[safe_table[c].isna()]),
+         )
+     if verbose:
+         display_logs_depending_process_time(
+             f"Created profile in {round(time() - start, 3)}s",
+             time() - start,
+         )
+     return profile
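Finally, an illustrative call to the relocated `create_profile` (moved out of the old detection.py). It requires a full read of the file, hence `num_rows=-1`; the per-column metadata below only carries the `python_type` key the function reads in the limited-output case, the other keys are shown for shape only.

```python
import pandas as pd

from csv_detective.output.profile import create_profile

table = pd.DataFrame({"valeur": ["1,5", "2,5", "4,0"]})
dict_cols_fields = {"valeur": {"python_type": "float", "format": "float", "score": 1.0}}

profile = create_profile(table, dict_cols_fields, num_rows=-1)
print(profile["valeur"]["min"], profile["valeur"]["nb_distinct"])  # 1.5 3, assuming float_casting accepts the comma
```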