csv-detective 0.7.5.dev1197__py3-none-any.whl → 0.7.5.dev1209__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. csv_detective/__init__.py +1 -1
  2. csv_detective/detect_fields/FR/geo/adresse/__init__.py +1 -1
  3. csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +1 -1
  4. csv_detective/detect_fields/FR/other/csp_insee/__init__.py +1 -1
  5. csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +1 -1
  6. csv_detective/detect_fields/FR/other/sexe/__init__.py +1 -1
  7. csv_detective/detect_labels/FR/geo/adresse/__init__.py +1 -1
  8. csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +1 -1
  9. csv_detective/detect_labels/FR/geo/code_departement/__init__.py +1 -1
  10. csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +1 -1
  11. csv_detective/detect_labels/FR/geo/code_postal/__init__.py +1 -1
  12. csv_detective/detect_labels/FR/geo/code_region/__init__.py +1 -1
  13. csv_detective/detect_labels/FR/geo/commune/__init__.py +1 -1
  14. csv_detective/detect_labels/FR/geo/departement/__init__.py +1 -1
  15. csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +1 -1
  16. csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +1 -1
  17. csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +1 -1
  18. csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +1 -1
  19. csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +1 -1
  20. csv_detective/detect_labels/FR/geo/pays/__init__.py +1 -1
  21. csv_detective/detect_labels/FR/geo/region/__init__.py +1 -1
  22. csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +1 -1
  23. csv_detective/detect_labels/FR/other/code_rna/__init__.py +1 -1
  24. csv_detective/detect_labels/FR/other/code_waldec/__init__.py +1 -1
  25. csv_detective/detect_labels/FR/other/csp_insee/__init__.py +1 -1
  26. csv_detective/detect_labels/FR/other/date_fr/__init__.py +1 -1
  27. csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +1 -1
  28. csv_detective/detect_labels/FR/other/sexe/__init__.py +1 -1
  29. csv_detective/detect_labels/FR/other/siren/__init__.py +1 -1
  30. csv_detective/detect_labels/FR/other/siret/__init__.py +1 -1
  31. csv_detective/detect_labels/FR/other/tel_fr/__init__.py +1 -1
  32. csv_detective/detect_labels/FR/other/uai/__init__.py +1 -1
  33. csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +1 -1
  34. csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +1 -1
  35. csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +1 -1
  36. csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +1 -1
  37. csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +1 -1
  38. csv_detective/detect_labels/geo/json_geojson/__init__.py +1 -1
  39. csv_detective/detect_labels/geo/latitude_wgs/__init__.py +1 -1
  40. csv_detective/detect_labels/geo/latlon_wgs/__init__.py +1 -1
  41. csv_detective/detect_labels/geo/longitude_wgs/__init__.py +1 -1
  42. csv_detective/detect_labels/other/booleen/__init__.py +1 -1
  43. csv_detective/detect_labels/other/email/__init__.py +1 -1
  44. csv_detective/detect_labels/other/float/__init__.py +1 -1
  45. csv_detective/detect_labels/other/int/__init__.py +1 -1
  46. csv_detective/detect_labels/other/mongo_object_id/__init__.py +1 -1
  47. csv_detective/detect_labels/other/twitter/__init__.py +1 -1
  48. csv_detective/detect_labels/other/url/__init__.py +1 -1
  49. csv_detective/detect_labels/other/uuid/__init__.py +1 -1
  50. csv_detective/detect_labels/temp/date/__init__.py +1 -1
  51. csv_detective/detect_labels/temp/datetime_iso/__init__.py +1 -1
  52. csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +1 -1
  53. csv_detective/detect_labels/temp/year/__init__.py +1 -1
  54. csv_detective/detection/columns.py +89 -0
  55. csv_detective/detection/encoding.py +27 -0
  56. csv_detective/detection/engine.py +46 -0
  57. csv_detective/detection/headers.py +32 -0
  58. csv_detective/detection/rows.py +18 -0
  59. csv_detective/detection/separator.py +44 -0
  60. csv_detective/detection/variables.py +98 -0
  61. csv_detective/explore_csv.py +40 -124
  62. csv_detective/output/dataframe.py +55 -0
  63. csv_detective/{create_example.py → output/example.py} +10 -9
  64. csv_detective/output/profile.py +87 -0
  65. csv_detective/{schema_generation.py → output/schema.py} +344 -343
  66. csv_detective/output/utils.py +51 -0
  67. csv_detective/parsing/columns.py +141 -0
  68. csv_detective/parsing/compression.py +11 -0
  69. csv_detective/parsing/csv.py +55 -0
  70. csv_detective/parsing/excel.py +169 -0
  71. csv_detective/parsing/load.py +97 -0
  72. csv_detective/utils.py +10 -236
  73. {csv_detective-0.7.5.dev1197.data → csv_detective-0.7.5.dev1209.data}/data/share/csv_detective/CHANGELOG.md +1 -0
  74. {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1209.dist-info}/METADATA +1 -1
  75. {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1209.dist-info}/RECORD +84 -70
  76. tests/test_fields.py +7 -6
  77. tests/test_file.py +15 -14
  78. csv_detective/detection.py +0 -633
  79. /csv_detective/{process_text.py → parsing/text.py} +0 -0
  80. {csv_detective-0.7.5.dev1197.data → csv_detective-0.7.5.dev1209.data}/data/share/csv_detective/LICENSE.AGPL.txt +0 -0
  81. {csv_detective-0.7.5.dev1197.data → csv_detective-0.7.5.dev1209.data}/data/share/csv_detective/README.md +0 -0
  82. {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1209.dist-info}/WHEEL +0 -0
  83. {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1209.dist-info}/entry_points.txt +0 -0
  84. {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1209.dist-info}/licenses/LICENSE.AGPL.txt +0 -0
  85. {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1209.dist-info}/top_level.txt +0 -0
csv_detective/detection/rows.py
@@ -0,0 +1,18 @@
+ import pandas as pd
+
+
+ def remove_empty_first_rows(table: pd.DataFrame) -> tuple[pd.DataFrame, int]:
+     """Analog process to detect_headers for csv files, determines how many rows to skip
+     to end up with the header at the right place"""
+     idx = 0
+     if all([str(c).startswith('Unnamed:') for c in table.columns]):
+         # there is an offset between the index in the file (idx here)
+         # and the index in the dataframe, because of the header
+         idx = 1
+         while table.iloc[idx - 1].isna().all():
+             idx += 1
+         cols = table.iloc[idx - 1]
+         table = table.iloc[idx:]
+         table.columns = cols.to_list()
+     # +1 here because the headers should count as a row
+     return table, idx
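Illustrative only, not part of the diff: a minimal usage sketch of the new helper, assuming the hunk above is csv_detective/detection/rows.py (inferred from the files-changed list).

import pandas as pd

from csv_detective.detection.rows import remove_empty_first_rows  # assumed module path

# An Excel-like sheet whose first row is blank: pandas names every column "Unnamed: N".
raw = pd.DataFrame([[None, None], ["col_a", "col_b"], [1, 2]])
raw.columns = ["Unnamed: 0", "Unnamed: 1"]

table, skipped = remove_empty_first_rows(raw)
# `table` now uses ["col_a", "col_b"] as its header, `skipped` is the number of rows
# to skip when re-reading the file (here 2: the blank row plus the header row).
print(skipped, list(table.columns))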
csv_detective/detection/separator.py
@@ -0,0 +1,44 @@
+ import csv
+ import logging
+ from time import time
+ from typing import TextIO
+
+ from csv_detective.utils import display_logs_depending_process_time
+
+
+ def detect_separator(file: TextIO, verbose: bool = False) -> str:
+     """Detects csv separator"""
+     # TODO: add a robust detection:
+     # if a semicolon appears in the text and \t is the separator, for now
+     # we still return the semicolon
+     if verbose:
+         start = time()
+         logging.info("Detecting separator")
+     file.seek(0)
+     header = file.readline()
+     possible_separators = [";", ",", "|", "\t"]
+     sep_count = dict()
+     for sep in possible_separators:
+         sep_count[sep] = header.count(sep)
+     sep = max(sep_count, key=sep_count.get)
+     # testing that the first 10 (arbitrary) rows all have the same number of fields
+     # as the header. Prevents downstream unwanted behaviour where pandas can load
+     # the file (in a weird way) but the process is irrelevant.
+     file.seek(0)
+     reader = csv.reader(file, delimiter=sep)
+     rows_lengths = set()
+     for idx, row in enumerate(reader):
+         if idx > 10:
+             break
+         rows_lengths.add(len(row))
+     if len(rows_lengths) > 1:
+         raise ValueError(
+             f"Number of columns is not even across the first 10 rows (detected separator: {sep})."
+         )
+
+     if verbose:
+         display_logs_depending_process_time(
+             f'Detected separator: "{sep}" in {round(time() - start, 3)}s',
+             time() - start,
+         )
+     return sep
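Illustrative only, not part of the diff: a minimal sketch of calling the extracted helper on an in-memory file, assuming the hunk above is csv_detective/detection/separator.py.

import io

from csv_detective.detection.separator import detect_separator  # assumed module path

csv_text = "code;label;value\nA;foo;1\nB;bar;2\n"
sep = detect_separator(io.StringIO(csv_text))
# The candidate with the highest count in the header line wins (";" here);
# a ValueError is raised if the first rows do not all have the same number of fields.
print(sep)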
csv_detective/detection/variables.py
@@ -0,0 +1,98 @@
+ from ast import literal_eval
+ import logging
+ from time import time
+
+ import pandas as pd
+
+ from csv_detective.utils import display_logs_depending_process_time
+
+
+ def detect_continuous_variable(table: pd.DataFrame, continuous_th: float = 0.9, verbose: bool = False):
+     """
+     Detects whether a column contains continuous variables. We consider a continuous column
+     one that contains a considerable amount of float values.
+     We removed the integers as we then end up with postal codes, insee codes, and all sort
+     of codes and types.
+     This is not optimal but it will do for now.
+     """
+     # if we need this again in the future, could be first based on columns detected as int/float to cut time
+
+     def check_threshold(serie: pd.Series, continuous_th: float) -> bool:
+         count = serie.value_counts().to_dict()
+         total_nb = len(serie)
+         if float in count:
+             nb_floats = count[float]
+         else:
+             return False
+         if nb_floats / total_nb >= continuous_th:
+             return True
+         else:
+             return False
+
+     def parses_to_integer(value: str):
+         try:
+             value = value.replace(",", ".")
+             value = literal_eval(value)
+             return type(value)
+         # flake8: noqa
+         except:
+             return False
+
+     if verbose:
+         start = time()
+         logging.info("Detecting continuous columns")
+     res = table.apply(
+         lambda serie: check_threshold(serie.apply(parses_to_integer), continuous_th)
+     )
+     if verbose:
+         display_logs_depending_process_time(
+             f"Detected {sum(res)} continuous columns in {round(time() - start, 3)}s",
+             time() - start,
+         )
+     return res.index[res]
+
+
+ def detect_categorical_variable(
+     table: pd.DataFrame,
+     threshold_pct_categorical: float = 0.05,
+     max_number_categorical_values: int = 25,
+     verbose: bool = False,
+ ):
+     """
+     Heuristically detects whether a table (df) contains categorical values according to
+     the number of unique values contained.
+     As the idea of detecting categorical values is to then try to learn models to predict
+     them, we limit categorical values to at most 25 different modes or at most 5% disparity.
+     Postal code, insee code, code region and so on, may be thus not considered categorical values.
+     :param table:
+     :param threshold_pct_categorical:
+     :param max_number_categorical_values:
+     :return:
+     """
+
+     def abs_number_different_values(column_values: pd.Series):
+         return column_values.nunique()
+
+     def rel_number_different_values(column_values: pd.Series):
+         return column_values.nunique() / len(column_values)
+
+     def detect_categorical(column_values: pd.Series):
+         abs_unique_values = abs_number_different_values(column_values)
+         rel_unique_values = rel_number_different_values(column_values)
+         if (
+             abs_unique_values <= max_number_categorical_values
+             or rel_unique_values <= threshold_pct_categorical
+         ):
+             return True
+         return False
+
+     if verbose:
+         start = time()
+         logging.info("Detecting categorical columns")
+     res = table.apply(lambda serie: detect_categorical(serie))
+     if verbose:
+         display_logs_depending_process_time(
+             f"Detected {sum(res)} categorical columns out of {len(table.columns)} in {round(time() - start, 3)}s",
+             time() - start,
+         )
+     return res.index[res], res
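Illustrative only, not part of the diff: a small sketch of the categorical heuristic as exposed by the new module (the module path matches the import added in explore_csv.py below).

import pandas as pd

from csv_detective.detection.variables import detect_categorical_variable

df = pd.DataFrame({
    "status": ["open", "closed"] * 50,        # 2 distinct values -> categorical
    "amount": [str(i) for i in range(100)],   # 100 distinct values out of 100 -> not categorical
})

categorical_cols, mask = detect_categorical_variable(df)
# A column is kept if it has at most 25 distinct values, or if distinct/total <= 5%.
print(list(categorical_cols))  # ['status']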
csv_detective/explore_csv.py
@@ -1,52 +1,28 @@
- """
- This script analyses the first rows of a CSV file to try to determine
- the possible content of the fields
- """
-
- from typing import Dict, List, Union
  from collections import defaultdict
  import json
- import numpy as np
+ import logging
  import os
  import tempfile
- import logging
  from time import time
- import requests
- from io import BytesIO, StringIO
+ from typing import Union
+
+ import numpy as np
  import pandas as pd

  # flake8: noqa
  from csv_detective import detect_fields, detect_labels
- from csv_detective.s3_utils import download_from_minio, upload_to_minio
- from csv_detective.schema_generation import generate_table_schema
- from csv_detective.utils import (
-     cast_df,
-     display_logs_depending_process_time,
-     prepare_output_dict,
-     test_col,
-     test_label,
- )
- from .detection import (
-     detect_engine,
-     detect_separator,
-     detect_encoding,
-     detect_headers,
-     detect_heading_columns,
-     detect_trailing_columns,
-     parse_table,
-     parse_excel,
-     create_profile,
-     detetect_categorical_variable,
+ from .detection.variables import (
+     detect_categorical_variable,
      # detect_continuous_variable,
-     is_url,
-     unzip,
-     XLS_LIKE_EXT,
-     EXCEL_ENGINES,
-     COMPRESSION_ENGINES,
  )
-
-
- logging.basicConfig(level=logging.INFO)
+ from .output.dataframe import cast_df
+ from .output.profile import create_profile
+ from .output.schema import generate_table_schema
+ from .output.utils import prepare_output_dict
+ from .parsing.load import load_file
+ from .parsing.columns import test_col, test_label
+ from .s3_utils import download_from_minio, upload_to_minio
+ from .utils import display_logs_depending_process_time, is_url


  def get_all_packages(detect_type) -> list:
@@ -107,9 +83,9 @@ def return_all_tests(


  def routine(
-     csv_file_path: str,
+     file_path: str,
      num_rows: int = 500,
-     user_input_tests: Union[str, List[str]] = "ALL",
+     user_input_tests: Union[str, list[str]] = "ALL",
      limited_output: bool = True,
      save_results: Union[bool, str] = True,
      encoding: str = None,
@@ -126,7 +102,7 @@ def routine(
      column contents.

      Args:
-         csv_file_path: local path to CSV file if not using Minio
+         file_path: local path to CSV file if not using Minio
          num_rows: number of rows to sample from the file for analysis ; -1 for analysis
              of the whole file
          user_input_tests: tests to run on the file
@@ -143,100 +119,40 @@ def routine(
      Returns:
          dict: a dict with information about the csv and possible types for each column
      """
-     if not csv_file_path:
-         raise ValueError("csv_file_path is required.")

      if not (isinstance(save_results, bool) or (isinstance(save_results, str) and save_results.endswith(".json"))):
          raise ValueError("`save_results` must be a bool or a valid path to a json file.")

      if verbose:
          start_routine = time()
-     if is_url(csv_file_path):
+     if is_url(file_path):
          logging.info("Path recognized as a URL")

-     file_name = csv_file_path.split('/')[-1]
-     engine = None
-     if '.' not in file_name or not file_name.endswith("csv"):
-         # file has no extension, we'll investigate how to read it
-         engine = detect_engine(csv_file_path, verbose=verbose)
-
-     is_xls_like = False
-     if engine in EXCEL_ENGINES or any([csv_file_path.endswith(k) for k in XLS_LIKE_EXT]):
-         is_xls_like = True
-         encoding, sep, heading_columns, trailing_columns = None, None, None, None
-         table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx = parse_excel(
-             csv_file_path=csv_file_path,
-             num_rows=num_rows,
-             engine=engine,
-             sheet_name=sheet_name,
-             verbose=verbose,
-         )
-         header = table.columns.to_list()
-     else:
-         # fetching or reading file as binary
-         if is_url(csv_file_path):
-             r = requests.get(csv_file_path, allow_redirects=True)
-             r.raise_for_status()
-             binary_file = BytesIO(r.content)
-         else:
-             binary_file = open(csv_file_path, "rb")
-         # handling compression
-         if engine in COMPRESSION_ENGINES:
-             binary_file: BytesIO = unzip(binary_file=binary_file, engine=engine)
-         # detecting encoding if not specified
-         if encoding is None:
-             encoding: str = detect_encoding(binary_file, verbose=verbose)
-             binary_file.seek(0)
-         # decoding and reading file
-         if is_url(csv_file_path) or engine in COMPRESSION_ENGINES:
-             str_file = StringIO(binary_file.read().decode(encoding=encoding))
-         else:
-             str_file = open(csv_file_path, "r", encoding=encoding)
-         if sep is None:
-             sep = detect_separator(str_file, verbose=verbose)
-         header_row_idx, header = detect_headers(str_file, sep, verbose=verbose)
-         if header is None:
-             return {"error": True}
-         elif isinstance(header, list):
-             if any([x is None for x in header]):
-                 return {"error": True}
-         heading_columns = detect_heading_columns(str_file, sep, verbose=verbose)
-         trailing_columns = detect_trailing_columns(str_file, sep, heading_columns, verbose=verbose)
-         table, total_lines, nb_duplicates = parse_table(
-             str_file, encoding, sep, num_rows, header_row_idx, verbose=verbose
-         )
+     table, analysis = load_file(
+         file_path=file_path,
+         num_rows=num_rows,
+         encoding=encoding,
+         sep=sep,
+         verbose=verbose,
+         sheet_name=sheet_name,
+     )

      if table.empty:
          res_categorical = []
          # res_continuous = []
      else:
          # Detects columns that are categorical
-         res_categorical, categorical_mask = detetect_categorical_variable(table, verbose=verbose)
+         res_categorical, categorical_mask = detect_categorical_variable(table, verbose=verbose)
          res_categorical = list(res_categorical)
          # Detect columns that are continuous (we already know the categorical) : we don't need this for now, cuts processing time
          # res_continuous = list(
          #     detect_continuous_variable(table.iloc[:, ~categorical_mask.values], verbose=verbose)
          # )

-     # Creating return dictionary
-     analysis = {
-         "header_row_idx": header_row_idx,
-         "header": header,
-         "total_lines": total_lines,
-         "nb_duplicates": nb_duplicates,
-         "heading_columns": heading_columns,
-         "trailing_columns": trailing_columns,
+     analysis.update({
          "categorical": res_categorical,
          # "continuous": res_continuous,
-     }
-     # this is only relevant for xls-like
-     if is_xls_like:
-         analysis["engine"] = engine
-         analysis["sheet_name"] = sheet_name
-     # this is only relevant for csv
-     else:
-         analysis["encoding"] = encoding
-         analysis["separator"] = sep
+     })

      # list testing to be performed
      all_tests_fields = return_all_tests(
@@ -355,10 +271,10 @@ def routine(
          if isinstance(save_results, str):
              output_path = save_results
          else:
-             output_path = os.path.splitext(csv_file_path)[0]
+             output_path = os.path.splitext(file_path)[0]
              if is_url(output_path):
                  output_path = output_path.split('/')[-1]
-             if is_xls_like:
+             if analysis.get("sheet_name"):
                  output_path += "_sheet-" + str(sheet_name)
              output_path += ".json"
          with open(output_path, "w", encoding="utf8") as fp:
@@ -386,13 +302,13 @@


  def routine_minio(
-     csv_minio_location: Dict[str, str],
-     output_minio_location: Dict[str, str],
-     tableschema_minio_location: Dict[str, str],
+     csv_minio_location: dict[str, str],
+     output_minio_location: dict[str, str],
+     tableschema_minio_location: dict[str, str],
      minio_user: str,
      minio_pwd: str,
      num_rows: int = 500,
-     user_input_tests: Union[str, List[str]] = "ALL",
+     user_input_tests: Union[str, list[str]] = "ALL",
      encoding: str = None,
      sep: str = None,
  ):
@@ -450,18 +366,18 @@ def routine_minio(
      ):
          raise ValueError("Minio location dict must contain url, bucket and key")

-     csv_file_path = tempfile.NamedTemporaryFile(delete=False).name
+     file_path = tempfile.NamedTemporaryFile(delete=False).name
      download_from_minio(
          netloc=csv_minio_location["netloc"],
          bucket=csv_minio_location["bucket"],
          key=csv_minio_location["key"],
-         filepath=csv_file_path,
+         filepath=file_path,
          minio_user=minio_user,
          minio_pwd=minio_pwd,
      )

      analysis = routine(
-         csv_file_path,
+         file_path,
          num_rows,
          user_input_tests,
          output_mode="LIMITED",
@@ -471,7 +387,7 @@ def routine_minio(
      )

      # Write report JSON file.
-     output_path_to_store_minio_file = os.path.splitext(csv_file_path)[0] + ".json"
+     output_path_to_store_minio_file = os.path.splitext(file_path)[0] + ".json"
      with open(output_path_to_store_minio_file, "w", encoding="utf8") as fp:
          json.dump(analysis, fp, indent=4, separators=(",", ": "))

@@ -485,7 +401,7 @@ def routine_minio(
      )

      os.remove(output_path_to_store_minio_file)
-     os.remove(csv_file_path)
+     os.remove(file_path)

      generate_table_schema(
          analysis,
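The user-facing change in this module is the rename of routine()'s first argument from csv_file_path to file_path, with the loading logic moved into parsing/load.py. Illustrative only, not part of the diff: a hedged usage sketch of the entry point; the file name is made up.

from csv_detective.explore_csv import routine

# `file_path` replaces the former `csv_file_path`; xls-like and compressed files are
# now dispatched internally by load_file() instead of branching here.
analysis = routine(
    file_path="data/some_dataset.csv",  # illustrative path
    num_rows=500,
    user_input_tests="ALL",
    save_results=False,
    verbose=True,
)
print(analysis["categorical"])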
csv_detective/output/dataframe.py
@@ -0,0 +1,55 @@
+ from datetime import date, datetime
+ import json
+ from typing import Optional, Union
+ from time import time
+
+ import pandas as pd
+
+ from csv_detective.detect_fields.other.booleen import bool_casting
+ from csv_detective.detect_fields.other.float import float_casting
+ from csv_detective.detect_fields.temp.date import date_casting
+ from csv_detective.utils import display_logs_depending_process_time
+
+
+ def cast(value: str, _type: str) -> Optional[Union[str, float, bool, date, datetime]]:
+     if not isinstance(value, str) or not value:
+         # None is the current default value in hydra, should we keep this?
+         return None
+     if _type == "float":
+         return float_casting(value)
+     if _type == "bool":
+         return bool_casting(value)
+     if _type == "json":
+         # in hydra json are given to postgres as strings, conversion is done by postgres
+         return json.loads(value)
+     if _type == "date":
+         _date = date_casting(value)
+         return _date.date() if _date else None
+     if _type == "datetime":
+         return date_casting(value)
+     raise ValueError(f"Unknown type `{_type}`")
+
+
+ def cast_df(df: pd.DataFrame, columns: dict, cast_json: bool = True, verbose: bool = False) -> pd.DataFrame:
+     if verbose:
+         start = time()
+     output_df = pd.DataFrame()
+     for col_name, detection in columns.items():
+         if detection["python_type"] == "string" or (detection["python_type"] == "json" and not cast_json):
+             # no change if detected type is string
+             output_df[col_name] = df[col_name].copy()
+         elif detection["python_type"] == "int":
+             # to allow having ints and NaN in the same column
+             output_df[col_name] = df[col_name].copy().astype(pd.Int64Dtype())
+         else:
+             output_df[col_name] = df[col_name].apply(
+                 lambda col: cast(col, _type=detection["python_type"])
+             )
+         # to save RAM
+         del df[col_name]
+     if verbose:
+         display_logs_depending_process_time(
+             f'Casting columns completed in {round(time() - start, 3)}s',
+             time() - start,
+         )
+     return output_df
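Illustrative only, not part of the diff: a minimal sketch of the new cast_df() entry point. The `columns` mapping stands in for the per-column detections; only the "python_type" key is read here.

import pandas as pd

from csv_detective.output.dataframe import cast_df

df = pd.DataFrame({
    "price": ["1,5", "2.75", ""],
    "active": ["true", "false", "true"],
    "label": ["a", "b", "c"],
})
columns = {
    "price": {"python_type": "float"},
    "active": {"python_type": "bool"},
    "label": {"python_type": "string"},
}

typed = cast_df(df, columns)
# "price" goes through the package's float_casting, "active" through bool_casting,
# "label" is copied unchanged; empty strings become None. Note that cast_df deletes
# each column from the input frame as it goes, to save RAM.
print(typed.dtypes)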
csv_detective/{create_example.py → output/example.py}
@@ -1,13 +1,14 @@
+ from datetime import datetime
+ import json
  import random
- import uuid
  import string
- from datetime import datetime
+ from typing import Union, Optional, Any, Type
+ import uuid
+
+ from faker import Faker
  import pandas as pd
- from typing import List, Union, Optional, Any, Type
- import json
  import requests
  import rstr
- from faker import Faker

  fake = Faker()

@@ -69,7 +70,7 @@ def create_example_csv_file(
          return str(uuid.uuid4())

      def _date(
-         date_range: Union[None, List[str]] = None,
+         date_range: Union[None, list[str]] = None,
          format: str = '%Y-%m-%d',
          required: bool = True,
      ) -> str:
@@ -98,7 +99,7 @@ def create_example_csv_file(
          return fake.time(format)

      def _datetime(
-         datetime_range: Optional[List[str]] = None,
+         datetime_range: Optional[list[str]] = None,
          format: str = '%Y-%m-%d %H-%M-%S',
          required: bool = True,
      ) -> str:
@@ -123,7 +124,7 @@ def create_example_csv_file(

      def _number(
          num_type: Type[Union[int, float]] = int,
-         num_range: Optional[List[float]] = None,
+         num_range: Optional[list[float]] = None,
          enum: Optional[list] = None,
          required: bool = True,
      ) -> Union[int, float]:
@@ -144,7 +145,7 @@ def create_example_csv_file(
              return ''
          return random.randint(0, 1) == 0

-     def _array(enum: List[Any], required: bool = True) -> str:
+     def _array(enum: list[Any], required: bool = True) -> str:
          if potential_skip(required):
              return ''
          return f"[{','.join(random.sample(enum, random.randint(1, len(enum))))}]"
csv_detective/output/profile.py
@@ -0,0 +1,87 @@
+ from collections import defaultdict
+ import logging
+ from time import time
+
+ import pandas as pd
+
+ from csv_detective.detect_fields.other.float import float_casting
+ from csv_detective.utils import display_logs_depending_process_time, prevent_nan
+
+
+ def create_profile(
+     table: pd.DataFrame,
+     dict_cols_fields: dict,
+     num_rows: int,
+     limited_output: bool = True,
+     verbose: bool = False,
+ ) -> dict:
+     if verbose:
+         start = time()
+         logging.info("Creating profile")
+     map_python_types = {
+         "string": str,
+         "int": float,
+         "float": float,
+     }
+
+     if num_rows > 0:
+         raise ValueError("To create profiles num_rows has to be set to -1")
+     safe_table = table.copy()
+     if not limited_output:
+         dict_cols_fields = {
+             k: v[0] if v else {'python_type': 'string', 'format': 'string', 'score': 1.0}
+             for k, v in dict_cols_fields.items()
+         }
+     dtypes = {
+         k: map_python_types.get(v["python_type"], str)
+         for k, v in dict_cols_fields.items()
+     }
+     for c in safe_table.columns:
+         if dtypes[c] == float:
+             safe_table[c] = safe_table[c].apply(
+                 lambda s: float_casting(s) if isinstance(s, str) else s
+             )
+     profile = defaultdict(dict)
+     for c in safe_table.columns:
+         if map_python_types.get(dict_cols_fields[c]["python_type"], str) in [
+             float,
+             int,
+         ]:
+             profile[c].update(
+                 min=prevent_nan(map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                     safe_table[c].min()
+                 )),
+                 max=prevent_nan(map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                     safe_table[c].max()
+                 )),
+                 mean=prevent_nan(map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                     safe_table[c].mean()
+                 )),
+                 std=prevent_nan(map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                     safe_table[c].std()
+                 )),
+             )
+         tops_bruts = (
+             safe_table[safe_table[c].notna()][c]
+             .value_counts(dropna=True)
+             .reset_index()
+             .iloc[:10]
+             .to_dict(orient="records")
+         )
+         tops = []
+         for tb in tops_bruts:
+             tops.append({
+                 "count": tb["count"],
+                 "value": tb[c],
+             })
+         profile[c].update(
+             tops=tops,
+             nb_distinct=safe_table[c].nunique(),
+             nb_missing_values=len(safe_table[c].loc[safe_table[c].isna()]),
+         )
+     if verbose:
+         display_logs_depending_process_time(
+             f"Created profile in {round(time() - start, 3)}s",
+             time() - start,
+         )
+     return profile
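Illustrative only, not part of the diff: a sketch of the relocated create_profile() helper. Note the guard that num_rows must be -1 (whole-file analysis) before a profile is computed; the column names and detections below are made up.

import pandas as pd

from csv_detective.output.profile import create_profile

table = pd.DataFrame({
    "population": ["100", "250", "175"],
    "city": ["Lille", "Lyon", "Lille"],
})
# with limited_output=True each column is expected to map directly to its selected detection
dict_cols_fields = {
    "population": {"python_type": "int"},
    "city": {"python_type": "string"},
}

profile = create_profile(table, dict_cols_fields, num_rows=-1)
# numeric columns get min/max/mean/std; every column gets tops, nb_distinct, nb_missing_values
print(profile["population"]["min"], profile["city"]["nb_distinct"])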