csv-detective 0.7.5.dev1069__py3-none-any.whl → 0.7.5.dev1113__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -54,4 +54,4 @@ from .geo import (
  )

  from .FR.temp import jour_de_la_semaine, mois_de_annee
- from .temp import year, date, datetime_iso, datetime_rfc822
+ from .temp import year, date, datetime, datetime_iso, datetime_rfc822
@@ -1,21 +1,27 @@
  PROPORTION = 1
- liste_bool = {
-     '0',
-     '1',
-     'vrai',
-     'faux',
-     'true',
-     'false',
-     'oui',
-     'non',
-     'yes',
-     'no',
-     'y',
-     'n',
-     'o'
+ bool_mapping = {
+     "1": True,
+     "0": False,
+     "vrai": True,
+     "faux": False,
+     "true": True,
+     "false": False,
+     "oui": True,
+     "non": False,
+     "yes": True,
+     "no": False,
+     "y": True,
+     "n": False,
+     "o": True,
  }

+ liste_bool = set(bool_mapping.keys())

- def _is(val):
-     '''Détection les booléens'''
+
+ def bool_casting(val: str) -> bool:
+     return bool_mapping.get(val)
+
+
+ def _is(val: str) -> bool:
+     '''Détecte les booléens'''
      return isinstance(val, str) and val.lower() in liste_bool
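The new module keeps a `liste_bool` set for detection and adds a lookup for casting. A short usage sketch (assuming this version of csv-detective is installed; behaviour follows the mapping shown above):

```python
from csv_detective.detect_fields.other.booleen import _is, bool_casting

print(_is("Oui"))           # True: detection lowercases the value before the membership test
print(bool_casting("oui"))  # True: casting maps the lowercased string to a Python bool
print(bool_casting("2"))    # None: strings outside the mapping are not cast
```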
@@ -1,8 +1,8 @@
  PROPORTION = 1


- def float_casting(str2cast):
-     return float(str2cast.replace(',', '.'))
+ def float_casting(val: str) -> float:
+     return float(val.replace(',', '.'))


  def _is(val):
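A one-line usage sketch of the renamed helper (same assumption that the package is installed):

```python
from csv_detective.detect_fields.other.float import float_casting

print(float_casting("3,14"))  # 3.14 -- the French decimal comma is normalised before float()
```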
@@ -1,46 +1,30 @@
1
- import re
2
- from dateutil.parser import parse, ParserError
3
- from csv_detective.detect_fields.other.float import _is as is_float
4
- from unidecode import unidecode
1
+ from datetime import datetime
2
+ from typing import Optional
3
+
4
+ from dateparser import parse as date_parser
5
+ from dateutil.parser import parse as dateutil_parser, ParserError
5
6
 
6
7
  PROPORTION = 1
7
8
  # /!\ this is only for dates, not datetimes which are handled by other utils
8
9
 
9
10
 
10
- def is_dateutil_date(val: str) -> bool:
11
- # we don't want to get datetimes here, so length restriction
12
- # longest date string expected here is DD-septembre-YYYY, so 17 characters
13
- if len(val) > 17:
14
- return False
11
+ def date_casting(val: str) -> Optional[datetime]:
12
+ """For performance reasons, we try first with dateutil and fallback on dateparser"""
15
13
  try:
16
- res = parse(val, fuzzy=False)
17
- if res.hour or res.minute or res.second:
18
- return False
19
- return True
20
- except (ParserError, ValueError, TypeError, OverflowError):
21
- return False
22
-
23
-
24
- seps = r'[\s/\-\*_\|;.,]'
25
- # matches JJ-MM-AAAA with any of the listed separators
26
- pat = r'^(0[1-9]|[12][0-9]|3[01])SEP(0[1-9]|1[0-2])SEP((19|20)\d{2})$'.replace('SEP', seps)
27
- # matches AAAA-MM-JJ with any of the listed separators OR NO SEPARATOR
28
- tap = r'^((19|20)\d{2})SEP(0[1-9]|1[0-2])SEP(0[1-9]|[12][0-9]|3[01])$'.replace('SEP', seps + '?')
29
- # matches JJ-mmm-AAAA and JJ-mmm...mm-AAAA with any of the listed separators OR NO SEPARATOR
30
- letters = (
31
- r'^(0[1-9]|[12][0-9]|3[01])SEP(jan|fev|feb|mar|avr|apr'
32
- r'|mai|may|jun|jui|jul|aou|aug|sep|oct|nov|dec|janvier|fevrier|mars|avril|'
33
- r'mai|juin|jullet|aout|septembre|octobre|novembre|decembre)SEP'
34
- r'(\d{2}|\d{4})$'
35
- ).replace('SEP', seps + '?')
14
+ return dateutil_parser(val)
15
+ except ParserError:
16
+ return date_parser(val)
36
17
 
37
18
 
38
19
  def _is(val):
39
- '''Renvoie True si val peut être une date, False sinon
40
- On ne garde que les regex pour les cas où parse() ne convient pas'''
41
- return isinstance(val, str) and (
42
- (is_dateutil_date(val) and not is_float(val))
43
- or bool(re.match(letters, unidecode(val)))
44
- or bool(re.match(pat, val))
45
- or bool(re.match(tap, val))
46
- )
20
+ '''Renvoie True si val peut être une date, False sinon'''
21
+ # early stops, to cut processing time
22
+ if not isinstance(val, str) or len(val) > 20 or len(val) < 8:
23
+ return False
24
+ threshold = 0.3
25
+ if sum([char.isdigit() for char in val]) / len(val) < threshold:
26
+ return False
27
+ res = date_casting(val)
28
+ if not res or res.hour or res.minute or res.second:
29
+ return False
30
+ return True
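A usage sketch of the dateutil-first, dateparser-fallback strategy introduced above (assuming this package version and its dateparser dependency are installed; the sample values are illustrative):

```python
from csv_detective.detect_fields.temp.date import _is, date_casting

print(date_casting("2024-09-23"))         # parsed directly by dateutil
print(date_casting("23 septembre 2024"))  # dateutil raises ParserError, dateparser handles the French month name
print(_is("2024-09-23"))                  # True
print(_is("2024-09-23 17:32:07"))         # False: values with a time part are left to the datetime detectors
```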
@@ -0,0 +1,19 @@
+ from typing import Any, Optional
+
+ from csv_detective.detect_fields.temp.date import date_casting
+
+ PROPORTION = 1
+
+
+ def _is(val: Optional[Any]) -> bool:
+     '''Renvoie True si val peut être un datetime, False sinon'''
+     # early stops, to cut processing time
+     if not isinstance(val, str) or len(val) > 30 or len(val) < 15:
+         return False
+     threshold = 0.7
+     if sum([char.isdigit() for char in val]) / len(val) < threshold:
+         return False
+     res = date_casting(val)
+     if res and (res.hour or res.minute or res.second):
+         return True
+     return False
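Both the date and datetime detectors share the same early stops: reject values whose length or digit ratio makes a match implausible before paying for any parsing. A standalone sketch of the ratio check, using the thresholds from the diff (0.3 for dates, 0.7 for datetimes):

```python
def digit_ratio(val: str) -> float:
    # share of characters that are digits, as used by the early stops above
    return sum(char.isdigit() for char in val) / len(val)

print(digit_ratio("2024-09-23"))           # 0.8   -> clears the 0.3 date threshold
print(digit_ratio("2024-09-23 17:32:07"))  # ~0.74 -> clears the 0.7 datetime threshold
print(digit_ratio("lundi 23 septembre"))   # ~0.11 -> rejected before any parsing
```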
@@ -27,7 +27,9 @@ def _is(header):
          'dateouv',
          'date der maj',
          'dmaj',
-         'jour'
+         'jour',
+         'yyyymmdd',
+         'aaaammjj',
      ]
      processed_header = _process_text(header)

@@ -1,4 +1,5 @@
- from typing import TextIO, Optional
+ from typing import TextIO, Optional, Union
+ from collections import defaultdict
  import pandas as pd
  import math
  import csv
@@ -27,7 +28,7 @@ engine_to_file = {
  }


- def is_url(csv_file_path: str):
+ def is_url(csv_file_path: str) -> bool:
  # could be more sophisticated if needed
  return csv_file_path.startswith('http')

@@ -35,17 +36,14 @@ def is_url(csv_file_path: str):
  def detect_continuous_variable(table: pd.DataFrame, continuous_th: float = 0.9, verbose: bool = False):
  """
  Detects whether a column contains continuous variables. We consider a continuous column
- one that contains
- a considerable amount of float values.
+ one that contains a considerable amount of float values.
  We removed the integers as we then end up with postal codes, insee codes, and all sort
  of codes and types.
  This is not optimal but it will do for now.
- :param table:
- :return:
  """
  # if we need this again in the future, could be first based on columns detected as int/float to cut time

- def check_threshold(serie: pd.Series, continuous_th: float):
+ def check_threshold(serie: pd.Series, continuous_th: float) -> bool:
  count = serie.value_counts().to_dict()
  total_nb = len(serie)
  if float in count:
@@ -75,7 +73,7 @@ def detect_continuous_variable(table: pd.DataFrame, continuous_th: float = 0.9,
  if verbose:
  display_logs_depending_process_time(
  f"Detected {sum(res)} continuous columns in {round(time() - start, 3)}s",
- time() - start
+ time() - start,
  )
  return res.index[res]

@@ -121,12 +119,12 @@ def detetect_categorical_variable(
  if verbose:
  display_logs_depending_process_time(
  f"Detected {sum(res)} categorical columns out of {len(table.columns)} in {round(time() - start, 3)}s",
- time() - start
+ time() - start,
  )
  return res.index[res], res


- def detect_engine(csv_file_path: str, verbose=False):
+ def detect_engine(csv_file_path: str, verbose=False) -> Optional[str]:
  if verbose:
  start = time()
  mapping = {
@@ -145,12 +143,12 @@ def detect_engine(csv_file_path: str, verbose=False):
  if verbose:
  display_logs_depending_process_time(
  f'File has no extension, detected {engine_to_file.get(engine, "csv")}',
- time() - start
+ time() - start,
  )
  return engine


- def detect_separator(file: TextIO, verbose: bool = False):
+ def detect_separator(file: TextIO, verbose: bool = False) -> str:
  """Detects csv separator"""
  # TODO: add a robust detection:
  # si on a un point virgule comme texte et \t comme séparateur, on renvoie
@@ -181,12 +179,12 @@ def detect_separator(file: TextIO, verbose: bool = False):
  if verbose:
  display_logs_depending_process_time(
  f'Detected separator: "{sep}" in {round(time() - start, 3)}s',
- time() - start
+ time() - start,
  )
  return sep


- def detect_encoding(csv_file_path: str, verbose: bool = False):
+ def detect_encoding(csv_file_path: str, verbose: bool = False) -> str:
  """
  Detects file encoding using faust-cchardet (forked from the original cchardet)
  """
@@ -205,7 +203,7 @@ def detect_encoding(csv_file_path: str, verbose: bool = False):
  message += f' in {round(time() - start, 3)}s (confidence: {round(encoding_dict["confidence"]*100)}%)'
  display_logs_depending_process_time(
  message,
- time() - start
+ time() - start,
  )
  return encoding_dict['encoding']

@@ -218,8 +216,7 @@ def parse_table(
  skiprows: int,
  random_state: int = 42,
  verbose : bool = False,
- ):
- # Takes care of some problems
+ ) -> tuple[pd.DataFrame, int, int]:
  if verbose:
  start = time()
  logging.info("Parsing table")
@@ -230,7 +227,6 @@

  total_lines = None
  for encoding in [encoding, "ISO-8859-1", "utf-8"]:
- # TODO : modification systematique
  if encoding is None:
  continue

@@ -251,17 +247,16 @@
  print("Trying encoding : {encoding}".format(encoding=encoding))

  if table is None:
- logging.error(" >> encoding not found")
- return table, "NA", "NA"
+ raise ValueError("Could not load file")
  if verbose:
  display_logs_depending_process_time(
  f'Table parsed successfully in {round(time() - start, 3)}s',
- time() - start
+ time() - start,
  )
  return table, total_lines, nb_duplicates


- def remove_empty_first_rows(table: pd.DataFrame):
+ def remove_empty_first_rows(table: pd.DataFrame) -> tuple[pd.DataFrame, int]:
  """Analog process to detect_headers for csv files, determines how many rows to skip
  to end up with the header at the right place"""
  idx = 0
@@ -274,7 +269,7 @@ def remove_empty_first_rows(table: pd.DataFrame):
  cols = table.iloc[idx - 1]
  table = table.iloc[idx:]
  table.columns = cols.to_list()
- # +1 here because the columns should count as a row
+ # +1 here because the headers should count as a row
  return table, idx


@@ -285,7 +280,7 @@ def parse_excel(
  sheet_name: Optional[str] = None,
  random_state: int = 42,
  verbose : bool = False,
- ):
+ ) -> tuple[pd.DataFrame, int, int, str, str, int]:
  """"Excel-like parsing is really slow, could be a good improvement for future development"""
  if verbose:
  start = time()
@@ -309,7 +304,7 @@
  if verbose:
  display_logs_depending_process_time(
  f'Detected {engine_to_file[engine]} file, no sheet specified, reading the largest one',
- time() - start
+ time() - start,
  )
  try:
  if engine == "openpyxl":
@@ -341,7 +336,7 @@
  if verbose:
  display_logs_depending_process_time(
  'Could not read file with classic xls reader, trying with ODS',
- time() - start
+ time() - start,
  )
  engine = "odf"

@@ -354,33 +349,33 @@
  if verbose:
  display_logs_depending_process_time(
  f'Detected {engine_to_file[engine]} file, no sheet specified, reading the largest one',
- time() - start
+ time() - start,
  )
  tables = pd.read_excel(
  csv_file_path,
  engine="odf",
  sheet_name=None,
- dtype="unicode"
+ dtype="unicode",
  )
  sizes = {sheet_name: table.size for sheet_name, table in tables.items()}
  sheet_name = max(sizes, key=sizes.get)
  if verbose:
  display_logs_depending_process_time(
  f'Going forwards with sheet "{sheet_name}"',
- time() - start
+ time() - start,
  )
  table = tables[sheet_name]
  else:
  if verbose:
  display_logs_depending_process_time(
  f'Detected {engine_to_file[engine]} file, reading sheet "{sheet_name}"',
- time() - start
+ time() - start,
  )
  table = pd.read_excel(
  csv_file_path,
  engine="odf",
  sheet_name=sheet_name,
- dtype="unicode"
+ dtype="unicode",
  )
  table, header_row_idx = remove_empty_first_rows(table)
  total_lines = len(table)
@@ -391,7 +386,7 @@
  if verbose:
  display_logs_depending_process_time(
  f'Table parsed successfully in {round(time() - start, 3)}s',
- time() - start
+ time() - start,
  )
  return table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx

@@ -400,18 +395,18 @@
  if no_sheet_specified:
  display_logs_depending_process_time(
  f'Going forwards with sheet "{sheet_name}"',
- time() - start
+ time() - start,
  )
  else:
  display_logs_depending_process_time(
  f'Detected {engine_to_file[engine]} file, reading sheet "{sheet_name}"',
- time() - start
+ time() - start,
  )
  table = pd.read_excel(
  csv_file_path,
  engine=engine,
  sheet_name=sheet_name,
- dtype="unicode"
+ dtype="unicode",
  )
  table, header_row_idx = remove_empty_first_rows(table)
  total_lines = len(table)
@@ -422,12 +417,12 @@
  if verbose:
  display_logs_depending_process_time(
  f'Table parsed successfully in {round(time() - start, 3)}s',
- time() - start
+ time() - start,
  )
  return table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx


- def prevent_nan(value: float):
+ def prevent_nan(value: float) -> Optional[float]:
  if math.isnan(value):
  return None
  return value
@@ -439,7 +434,7 @@ def create_profile(
  num_rows: int,
  limited_output: bool = True,
  verbose: bool = False,
- ):
+ ) -> dict:
  if verbose:
  start = time()
  logging.info("Creating profile")
@@ -466,9 +461,8 @@
  safe_table[c] = safe_table[c].apply(
  lambda s: float_casting(s) if isinstance(s, str) else s
  )
- profile = {}
+ profile = defaultdict(dict)
  for c in safe_table.columns:
- profile[c] = {}
  if map_python_types.get(dict_cols_fields[c]["python_type"], str) in [
  float,
  int,
@@ -494,10 +488,10 @@
  .to_dict(orient="records")
  tops = []
  for tb in tops_bruts:
- top = {}
- top["count"] = tb["count"]
- top["value"] = tb[c]
- tops.append(top)
+ tops.append({
+ "count": tb["count"],
+ "value": tb[c],
+ })
  profile[c].update(
  tops=tops,
  nb_distinct=safe_table[c].nunique(),
@@ -506,7 +500,7 @@
  if verbose:
  display_logs_depending_process_time(
  f"Created profile in {round(time() - start, 3)}s",
- time() - start
+ time() - start,
  )
  return profile

@@ -540,7 +534,7 @@ def detect_extra_columns(file: TextIO, sep: str):
  return nb_useless_col, retour


- def detect_headers(file: TextIO, sep: str, verbose: bool = False):
+ def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int, Optional[list]]:
  """Tests 10 first rows for possible header (header not in 1st line)"""
  if verbose:
  start = time()
@@ -559,7 +553,7 @@ def detect_headers(file: TextIO, sep: str, verbose: bool = False):
  if verbose:
  display_logs_depending_process_time(
  f'Detected headers in {round(time() - start, 3)}s',
- time() - start
+ time() - start,
  )
  return i, chaine
  if verbose:
@@ -567,7 +561,7 @@ def detect_headers(file: TextIO, sep: str, verbose: bool = False):
  return 0, None


- def detect_heading_columns(file: TextIO, sep: str, verbose : bool = False):
+ def detect_heading_columns(file: TextIO, sep: str, verbose : bool = False) -> int:
  """Tests first 10 lines to see if there are empty heading columns"""
  if verbose:
  start = time()
@@ -581,18 +575,18 @@ def detect_heading_columns(file: TextIO, sep: str, verbose : bool = False):
  if verbose:
  display_logs_depending_process_time(
  f'No heading column detected in {round(time() - start, 3)}s',
- time() - start
+ time() - start,
  )
  return 0
  if verbose:
  display_logs_depending_process_time(
  f'{return_int} heading columns detected in {round(time() - start, 3)}s',
- time() - start
+ time() - start,
  )
  return return_int


- def detect_trailing_columns(file: TextIO, sep: str, heading_columns: int, verbose : bool = False):
+ def detect_trailing_columns(file: TextIO, sep: str, heading_columns: int, verbose : bool = False) -> int:
  """Tests first 10 lines to see if there are empty trailing columns"""
  if verbose:
  start = time()
@@ -611,12 +605,12 @@ def detect_trailing_columns(file: TextIO, sep: str, heading_columns: int, verbos
  if verbose:
  display_logs_depending_process_time(
  f'No trailing column detected in {round(time() - start, 3)}s',
- time() - start
+ time() - start,
  )
  return 0
  if verbose:
  display_logs_depending_process_time(
  f'{return_int} trailing columns detected in {round(time() - start, 3)}s',
- time() - start
+ time() - start,
  )
  return return_int
@@ -4,6 +4,7 @@ contenu possible des champs
  """

  from typing import Dict, List, Union
+ from collections import defaultdict
  import json
  import numpy as np
  import os
@@ -18,7 +19,13 @@ import pandas as pd
  from csv_detective import detect_fields, detect_labels
  from csv_detective.s3_utils import download_from_minio, upload_to_minio
  from csv_detective.schema_generation import generate_table_schema
- from csv_detective.utils import test_col, test_label, prepare_output_dict, display_logs_depending_process_time
+ from csv_detective.utils import (
+ cast_df,
+ display_logs_depending_process_time,
+ prepare_output_dict,
+ test_col,
+ test_label,
+ )
  from .detection import (
  detect_engine,
  detect_separator,
@@ -39,7 +46,7 @@ from .detection import (
  logging.basicConfig(level=logging.INFO)


- def get_all_packages(detect_type):
+ def get_all_packages(detect_type) -> list:
  root_dir = os.path.dirname(os.path.abspath(__file__)) + "/" + detect_type
  modules = []
  for dirpath, _, filenames in os.walk(root_dir):
@@ -58,7 +65,7 @@ def get_all_packages(detect_type):
  def return_all_tests(
  user_input_tests: Union[str, list],
  detect_type: str,
- ):
+ ) -> list:
  """
  returns all tests that have a method _is and are listed in the user_input_tests
  the function can select a sub_package from csv_detective
@@ -110,6 +117,7 @@ def routine(
  output_profile: bool = False,
  output_schema: bool = False,
  output_df: bool = False,
+ cast_json: bool = True,
  verbose: bool = False,
  sheet_name: Union[str, int] = None,
  ) -> Union[dict, tuple[dict, pd.DataFrame]]:
@@ -126,6 +134,7 @@ def routine(
  output_profile: whether or not to add the 'profile' field to the output
  output_schema: whether or not to add the 'schema' field to the output (tableschema)
  output_df: whether or not to return the loaded DataFrame along with the analysis report
+ cast_json: whether or not to cast json columns into objects (otherwise they are returned as strings)
  verbose: whether or not to print process logs in console
  sheet_name: if reading multi-sheet file (xls-like), which sheet to consider
  skipna: whether to keep NaN (empty cells) for tests
@@ -175,12 +184,10 @@ def routine(
  sep = detect_separator(str_file, verbose=verbose)
  header_row_idx, header = detect_headers(str_file, sep, verbose=verbose)
  if header is None:
- return_dict = {"error": True}
- return return_dict
+ return {"error": True}
  elif isinstance(header, list):
  if any([x is None for x in header]):
- return_dict = {"error": True}
- return return_dict
+ return {"error": True}
  heading_columns = detect_heading_columns(str_file, sep, verbose=verbose)
  trailing_columns = detect_trailing_columns(str_file, sep, heading_columns, verbose=verbose)
  table, total_lines, nb_duplicates = parse_table(
@@ -200,7 +207,7 @@
  # )

  # Creating return dictionary
- return_dict = {
+ analysis = {
  "header_row_idx": header_row_idx,
  "header": header,
  "total_lines": total_lines,
@@ -212,12 +219,12 @@
  }
  # this is only relevant for xls-like
  if is_xls_like:
- return_dict["engine"] = engine
- return_dict["sheet_name"] = sheet_name
+ analysis["engine"] = engine
+ analysis["sheet_name"] = sheet_name
  # this is only relevant for csv
  else:
- return_dict["encoding"] = encoding
- return_dict["separator"] = sep
+ analysis["encoding"] = encoding
+ analysis["separator"] = sep

  # list testing to be performed
  all_tests_fields = return_all_tests(
@@ -229,25 +236,24 @@

  # if no testing then return
  if not all_tests_fields and not all_tests_labels:
- return return_dict
+ return analysis

  # Perform testing on fields
- return_table_fields = test_col(table, all_tests_fields, limited_output, skipna=skipna, verbose=verbose)
- return_dict["columns_fields"] = prepare_output_dict(return_table_fields, limited_output)
+ scores_table_fields = test_col(table, all_tests_fields, limited_output, skipna=skipna, verbose=verbose)
+ analysis["columns_fields"] = prepare_output_dict(scores_table_fields, limited_output)

  # Perform testing on labels
- return_table_labels = test_label(table, all_tests_labels, limited_output, verbose=verbose)
- return_dict["columns_labels"] = prepare_output_dict(return_table_labels, limited_output)
+ scores_table_labels = test_label(table, all_tests_labels, limited_output, verbose=verbose)
+ analysis["columns_labels"] = prepare_output_dict(scores_table_labels, limited_output)

  # Multiply the results of the fields by 1 + 0.5 * the results of the labels.
  # This is because the fields are more important than the labels and yields a max
  # of 1.5 for the final score.
- return_table = return_table_fields * (
+ scores_table = scores_table_fields * (
  1
- + return_table_labels.reindex(
- index=return_table_fields.index, fill_value=0
- ).values
- / 2
+ + scores_table_labels.reindex(
+ index=scores_table_fields.index, fill_value=0
+ ).values / 2
  )

  # To reduce false positives: ensure these formats are detected only if the label yields
@@ -263,12 +269,12 @@
  "latitude_l93",
  "longitude_l93",
  ]
- return_table.loc[formats_with_mandatory_label, :] = np.where(
- return_table_labels.loc[formats_with_mandatory_label, :],
- return_table.loc[formats_with_mandatory_label, :],
+ scores_table.loc[formats_with_mandatory_label, :] = np.where(
+ scores_table_labels.loc[formats_with_mandatory_label, :],
+ scores_table.loc[formats_with_mandatory_label, :],
  0,
  )
- return_dict["columns"] = prepare_output_dict(return_table, limited_output)
+ analysis["columns"] = prepare_output_dict(scores_table, limited_output)

  metier_to_python_type = {
  "booleen": "bool",
@@ -278,6 +284,8 @@
  "json": "json",
  "json_geojson": "json",
  "datetime": "datetime",
+ "datetime_iso": "datetime",
+ "datetime_rfc822": "datetime",
  "date": "date",
  "latitude": "float",
  "latitude_l93": "float",
@@ -291,7 +299,7 @@

  if not limited_output:
  for detection_method in ["columns_fields", "columns_labels", "columns"]:
- return_dict[detection_method] = {
+ analysis[detection_method] = {
  col_name: [
  {
  "python_type": metier_to_python_type.get(
@@ -301,32 +309,29 @@
  }
  for detection in detections
  ]
- for col_name, detections in return_dict[detection_method].items()
+ for col_name, detections in analysis[detection_method].items()
  }
  else:
  for detection_method in ["columns_fields", "columns_labels", "columns"]:
- return_dict[detection_method] = {
+ analysis[detection_method] = {
  col_name: {
  "python_type": metier_to_python_type.get(
  detection["format"], "string"
  ),
  **detection,
  }
- for col_name, detection in return_dict[detection_method].items()
+ for col_name, detection in analysis[detection_method].items()
  }

  # Add detection with formats as keys
- return_dict["formats"] = {
- column_metadata["format"]: []
- for column_metadata in return_dict["columns"].values()
- }
- for header, col_metadata in return_dict["columns"].items():
- return_dict["formats"][col_metadata["format"]].append(header)
+ analysis["formats"] = defaultdict(list)
+ for header, col_metadata in analysis["columns"].items():
+ analysis["formats"][col_metadata["format"]].append(header)

  if output_profile:
- return_dict["profile"] = create_profile(
+ analysis["profile"] = create_profile(
  table=table,
- dict_cols_fields=return_dict["columns"],
+ dict_cols_fields=analysis["columns"],
  num_rows=num_rows,
  limited_output=limited_output,
  verbose=verbose,
@@ -343,11 +348,11 @@
  output_path += "_sheet-" + str(sheet_name)
  output_path += ".json"
  with open(output_path, "w", encoding="utf8") as fp:
- json.dump(return_dict, fp, indent=4, separators=(",", ": "), ensure_ascii=False)
+ json.dump(analysis, fp, indent=4, separators=(",", ": "), ensure_ascii=False)

  if output_schema:
- return_dict["schema"] = generate_table_schema(
- return_dict,
+ analysis["schema"] = generate_table_schema(
+ analysis,
  save_file=False,
  verbose=verbose
  )
@@ -357,8 +362,13 @@
  time() - start_routine
  )
  if output_df:
- return return_dict, table
- return return_dict
+ return analysis, cast_df(
+ df=table,
+ columns=analysis["columns"],
+ cast_json=cast_json,
+ verbose=verbose,
+ )
+ return analysis


  def routine_minio(
@@ -436,7 +446,7 @@ def routine_minio(
  minio_pwd=minio_pwd,
  )

- return_dict = routine(
+ analysis = routine(
  csv_file_path,
  num_rows,
  user_input_tests,
@@ -449,7 +459,7 @@
  # Write report JSON file.
  output_path_to_store_minio_file = os.path.splitext(csv_file_path)[0] + ".json"
  with open(output_path_to_store_minio_file, "w", encoding="utf8") as fp:
- json.dump(return_dict, fp, indent=4, separators=(",", ": "))
+ json.dump(analysis, fp, indent=4, separators=(",", ": "))

  upload_to_minio(
  netloc=output_minio_location["netloc"],
@@ -464,7 +474,7 @@
  os.remove(csv_file_path)

  generate_table_schema(
- return_dict,
+ analysis,
  True,
  netloc=tableschema_minio_location["netloc"],
  bucket=tableschema_minio_location["bucket"],
@@ -473,4 +483,4 @@
  minio_pwd=minio_pwd,
  )

- return return_dict
+ return analysis
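With the new cast_json flag, routine can now hand back a typed DataFrame alongside the analysis. A hypothetical call mirroring the tests further down (the file path is a placeholder):

```python
from csv_detective.explore_csv import routine

analysis, df = routine(
    csv_file_path="some_file.csv",  # placeholder path
    num_rows=-1,
    output_profile=False,
    save_results=False,
    output_df=True,   # also return the loaded DataFrame
    cast_json=True,   # json columns come back as Python objects instead of strings
)
print(analysis["formats"])
print(df.dtypes)
```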
csv_detective/utils.py CHANGED
@@ -1,7 +1,13 @@
- from typing import Callable
+ from typing import Callable, Optional, Union
+ import json
  import pandas as pd
  import logging
  from time import time
+ from datetime import date, datetime
+
+ from csv_detective.detect_fields.other.booleen import bool_casting
+ from csv_detective.detect_fields.other.float import float_casting
+ from csv_detective.detect_fields.temp.date import date_casting

  logging.basicConfig(level=logging.INFO)

@@ -210,7 +216,52 @@ def prepare_output_dict(return_table: pd.DataFrame, limited_output: bool):

  def full_word_strictly_inside_string(word: str, string: str):
      return (
-         (" " + word + " " in string)
+         word == string
+         or (" " + word + " " in string)
          or (string.startswith(word + " "))
          or (string.endswith(" " + word))
      )
+
+
+ def cast(value: str, _type: str) -> Optional[Union[str, float, bool, date, datetime]]:
+     if not isinstance(value, str) or not value:
+         # None is the current default value in hydra, should we keep this?
+         return None
+     if _type == "float":
+         return float_casting(value)
+     if _type == "bool":
+         return bool_casting(value)
+     if _type == "json":
+         # in hydra json are given to postgres as strings, conversion is done by postgres
+         return json.loads(value)
+     if _type == "date":
+         _date = date_casting(value)
+         return _date.date() if _date else None
+     if _type == "datetime":
+         return date_casting(value)
+     raise ValueError(f"Unknown type `{_type}`")
+
+
+ def cast_df(df: pd.DataFrame, columns: dict, cast_json: bool = True, verbose: bool = False) -> pd.DataFrame:
+     if verbose:
+         start = time()
+     output_df = pd.DataFrame()
+     for col_name, detection in columns.items():
+         if detection["python_type"] == "string" or (detection["python_type"] == "json" and not cast_json):
+             # no change if detected type is string
+             output_df[col_name] = df[col_name].copy()
+         elif detection["python_type"] == "int":
+             # to allow having ints and NaN in the same column
+             output_df[col_name] = df[col_name].copy().astype(pd.Int64Dtype())
+         else:
+             output_df[col_name] = df[col_name].apply(
+                 lambda col: cast(col, _type=detection["python_type"])
+             )
+         # to save RAM
+         del df[col_name]
+     if verbose:
+         display_logs_depending_process_time(
+             f'Casting columns completed in {round(time() - start, 3)}s',
+             time() - start,
+         )
+     return output_df
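An illustrative use of the new cast/cast_df helpers (assuming this package version is installed; cast_df only reads the "python_type" key of each column entry, and the sample data is made up):

```python
import pandas as pd
from csv_detective.utils import cast_df

df = pd.DataFrame({
    "price": ["1,5", "2,75", ""],                     # French decimal commas, one empty cell
    "active": ["oui", "non", "oui"],
    "payload": ['{"a": 1}', '{"b": 2}', '{"c": 3}'],
})
columns = {
    "price": {"python_type": "float"},
    "active": {"python_type": "bool"},
    "payload": {"python_type": "json"},
}
typed = cast_df(df, columns, cast_json=True)
print(typed["price"].tolist())   # comma decimals parsed to floats, the empty cell becomes a null
print(typed["payload"][0]["a"])  # 1 -- json strings are now Python objects
```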
@@ -4,6 +4,8 @@

  - New function that creates a csv from a list of fields and constraints, or from a TableSchema [#101](https://github.com/datagouv/csv-detective/pull/101)
  - Enable outputing loaded dataframe [#102](https://github.com/datagouv/csv-detective/pull/102)
+ - Better naming, hint types and minor refactors [#103](https://github.com/datagouv/csv-detective/pull/103)
+ - The returned dataframe has its columns properly cast to the detected types [#104](https://github.com/datagouv/csv-detective/pull/104)

  ## 0.7.4 (2024-11-15)

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: csv_detective
- Version: 0.7.5.dev1069
+ Version: 0.7.5.dev1113
  Summary: Detect CSV column content
  Home-page: https://github.com/etalab/csv_detective
  Author: Etalab
@@ -15,6 +15,7 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
  Description-Content-Type: text/markdown
  License-File: LICENSE.AGPL.txt
  Requires-Dist: boto3==1.34.0
+ Requires-Dist: dateparser==1.2.0
  Requires-Dist: faust-cchardet==2.1.19
  Requires-Dist: pandas==2.2.0
  Requires-Dist: pytest==8.3.0
@@ -1,13 +1,13 @@
  csv_detective/__init__.py,sha256=Au4bNJ_Gi6P6o0uO4R56nYdshG7M6-7Rg_xX4whLmLI,143
  csv_detective/cli.py,sha256=Ua7SE1wMH2uFUsTmfumh4nJk7O06okpMd2gvjUDO1II,1048
  csv_detective/create_example.py,sha256=358e7Q7RWMrY_eEo3pUteJWmg2smFb5edJ_AzcQPrqA,8646
- csv_detective/detection.py,sha256=AuXlPOZfzqznZY2ybAAgaXIq6qVITYd3MXf2CoigI3I,22097
- csv_detective/explore_csv.py,sha256=6kGl1E061_CefAdei-wgwafZT1g8oKWg0eE1D5zWTOk,17216
+ csv_detective/detection.py,sha256=SUNGMvvuM_bj3gKYw-x6-CjjkirqCPoeAm0NCPkijrM,22225
+ csv_detective/explore_csv.py,sha256=-AxnM0hGlhrbI4w1wdZwC_w-DYgoOCFpMQ94agIaD5U,17380
  csv_detective/process_text.py,sha256=rsfk66BCmdpsCOd0kDJ8tmqMsEWd-OeBkEisWc4Ej9k,1246
  csv_detective/s3_utils.py,sha256=1cIVdQUYY2ovErbMwp72Gqtqx2bkB8nfVhn-QaOFTT0,1451
  csv_detective/schema_generation.py,sha256=D1Cq4QRajsKtY8EJSwbRTIB-T_Cb2ZpcmYtCrJ6DvJQ,13135
- csv_detective/utils.py,sha256=3nzHNjMaNtAhwhQv_leVuBFXEYgPVFmWy1KzNCybblw,8556
- csv_detective/detect_fields/__init__.py,sha256=CchNbi1vrgIGh_uBexXZTzfjBETDY0kQLjI-PAquU8M,921
+ csv_detective/utils.py,sha256=yO9INaLh-QX-FFL2A153AlMqftE04wb0hpN6HJvsKGg,10581
+ csv_detective/detect_fields/__init__.py,sha256=NVfE3BQVExgXb-BPbhDvlkM5-0naEVLpZ4aM_OGHYfE,931
  csv_detective/detect_fields/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  csv_detective/detect_fields/FR/geo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  csv_detective/detect_fields/FR/geo/adresse/__init__.py,sha256=e5JqMNOPxx0Ivju3zAHCGMopZroCpR4vr3DJKlQhMz4,1675
@@ -55,9 +55,9 @@ csv_detective/detect_fields/geo/latitude_wgs/__init__.py,sha256=ArS6PuYEd0atZwSq
  csv_detective/detect_fields/geo/latlon_wgs/__init__.py,sha256=3nlBqFYD4kVSVxw4b9DTPcxW59oL0T3Kj0OxPlyP9og,268
  csv_detective/detect_fields/geo/longitude_wgs/__init__.py,sha256=G7afWOKiGh_Tv7gwDNGt1a4B_A8hkCBkIxn3THDCUFk,330
  csv_detective/detect_fields/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- csv_detective/detect_fields/other/booleen/__init__.py,sha256=rM__y88CGoLkMXoRkonC4YxJT2E-HfjAXocKFjIqoxU,281
+ csv_detective/detect_fields/other/booleen/__init__.py,sha256=1qIEI681iEaPVb9XxmH2ewxDdfmYhHe4-s3MZ6L1A9Q,489
  csv_detective/detect_fields/other/email/__init__.py,sha256=O9tgJmq0O8Q-8iin63NqEEDhlsUJjxFZNaNFM4GZaws,178
- csv_detective/detect_fields/other/float/__init__.py,sha256=tdHBimi668qpJhVc87w-msUfGGUcKY_tex31u5W_VQs,545
+ csv_detective/detect_fields/other/float/__init__.py,sha256=dpEd5ZijmjQ7gqcTnYRoRoLGGJae0RyGwVC6MPra9go,549
  csv_detective/detect_fields/other/int/__init__.py,sha256=QN3kQJLYqLRBiubUK7g4Xq03PlA5wqVwx2pPPIO9FdI,320
  csv_detective/detect_fields/other/json/__init__.py,sha256=DhzyvT12kOqgum89silIu3uoSYXmC_s_AaxLtXAD4eU,540
  csv_detective/detect_fields/other/mongo_object_id/__init__.py,sha256=7fcrHsOZAqXp2_N0IjPskYJ_qi4xRlo9iyNNDQVLzsU,156
@@ -65,7 +65,8 @@ csv_detective/detect_fields/other/twitter/__init__.py,sha256=qbwLKsTBRFQ4PyTNVeE
  csv_detective/detect_fields/other/url/__init__.py,sha256=9WaTqCglEsw_lJG_xZsBMdxJXg2yuQ92_fkX6CXWNV0,286
  csv_detective/detect_fields/other/uuid/__init__.py,sha256=3-z0fDax29SJc57zPjNGR6DPICJu6gfuNGC5L3jh4d0,223
  csv_detective/detect_fields/temp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- csv_detective/detect_fields/temp/date/__init__.py,sha256=9-XhY3sMYRFQliEbprwKhfXCNz4_imgweZs_4Mbno9M,1784
+ csv_detective/detect_fields/temp/date/__init__.py,sha256=aFP1feFWFhCpR6Q9s_4BBwWxFtwFiMXY1iduSeQIjdA,943
+ csv_detective/detect_fields/temp/datetime/__init__.py,sha256=Ykwhk2ospjY9P0KOG0AitgqN0sld6UmhOlbMz_XGQzQ,597
  csv_detective/detect_fields/temp/datetime_iso/__init__.py,sha256=DOfli-A7gPlZmiV2J6Ka5_yDUCaOgxis29LET_tfhA4,444
  csv_detective/detect_fields/temp/datetime_rfc822/__init__.py,sha256=JtUzg3BXYd-XJMLGxQ0P1OAJGOQ7DlYMD4fCU9yndg0,511
  csv_detective/detect_fields/temp/year/__init__.py,sha256=RjsiIHoplnI4Odi5587TzRhKTDT-FTqGOBpdartuShA,194
@@ -122,22 +123,22 @@ csv_detective/detect_labels/other/twitter/__init__.py,sha256=D8G4vGsFL9a99OJz-03
  csv_detective/detect_labels/other/url/__init__.py,sha256=vqUQvn5o6JZU8iRsSG3AYqggjlhzagozVYWwpuSReV8,1202
  csv_detective/detect_labels/other/uuid/__init__.py,sha256=OdMUxqvqMdGaY5nph7CbIF_Q0LSxljxE72kCMT4m-Zk,931
  csv_detective/detect_labels/temp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- csv_detective/detect_labels/temp/date/__init__.py,sha256=GrIbo64WVM3hi7ShBRKKyKUZxkZlVKhpgk41FxkM1VI,1281
+ csv_detective/detect_labels/temp/date/__init__.py,sha256=-R7VqlryozelSn4wH_7w9x6ks77DP1kw2XMBYSLrzXE,1322
  csv_detective/detect_labels/temp/datetime_iso/__init__.py,sha256=Ih9l56nBcdmGLyWDavVUWuUUuVZBz9QUDE1hHzADvVg,1157
  csv_detective/detect_labels/temp/datetime_rfc822/__init__.py,sha256=DQ_h4uDW1e6qu2rATEhgGKw6O-vVi7HbDhbEDDCT9uY,1175
  csv_detective/detect_labels/temp/year/__init__.py,sha256=zPF_mvhzhXMAlHPAskS8mhuxjLj2AlKpV4ss8Q4tDms,1150
- csv_detective-0.7.5.dev1069.data/data/share/csv_detective/CHANGELOG.md,sha256=QbZKEEWbkt7a-TMHB6CpzzliDqv3BLECa_zkJgZOFkY,6820
- csv_detective-0.7.5.dev1069.data/data/share/csv_detective/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
- csv_detective-0.7.5.dev1069.data/data/share/csv_detective/README.md,sha256=Qr8xRXc-dxQ-tdXCpCTCKp1Uliqq84r0UOlPRNuGCpI,9506
+ csv_detective-0.7.5.dev1113.data/data/share/csv_detective/CHANGELOG.md,sha256=S9f0BlHhNQhrJ8bbw7bThthn2AG-gP5n8eg4Eep05IA,7063
+ csv_detective-0.7.5.dev1113.data/data/share/csv_detective/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
+ csv_detective-0.7.5.dev1113.data/data/share/csv_detective/README.md,sha256=Qr8xRXc-dxQ-tdXCpCTCKp1Uliqq84r0UOlPRNuGCpI,9506
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  tests/test_example.py,sha256=0NfChooJQlFxTo2nY5FOQIcsK4zzWA_SBmt2LwVQovY,2014
- tests/test_fields.py,sha256=kXel-hiyQYrJ3OLmwUMg1K3DKbbwBLvUplxZWxpp18I,10605
- tests/test_file.py,sha256=oQITvAxdcrqDby2wWSh_X9TCwFqdFaP34XNy92ibXyg,6725
+ tests/test_fields.py,sha256=_96htvTzvM7u-W57RpOBbsacWirIm4R36PP7JhPEaYQ,11123
+ tests/test_file.py,sha256=HO-Zqv0ZDFy3d0ZrpjWQPXBrwgUmzesseoEofy8G2UU,7529
  tests/test_labels.py,sha256=6MOKrGznkwU5fjZ_3oiB6Scmb480Eu-9geBJs0UDLds,159
  tests/test_structure.py,sha256=SVsnluVoIIprYw_67I1_gB3cp9m1wlO8C7SpdsLW8cM,1161
- csv_detective-0.7.5.dev1069.dist-info/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
- csv_detective-0.7.5.dev1069.dist-info/METADATA,sha256=sqa9hWFoiOj9-MpBX1uuwOl5qyPCSoca3wo0RrglmNY,1145
- csv_detective-0.7.5.dev1069.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
- csv_detective-0.7.5.dev1069.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
- csv_detective-0.7.5.dev1069.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
- csv_detective-0.7.5.dev1069.dist-info/RECORD,,
+ csv_detective-0.7.5.dev1113.dist-info/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
+ csv_detective-0.7.5.dev1113.dist-info/METADATA,sha256=7kqAw_UnjMjoBSfLqk59j7OYdY9PB0bPC35p9QxXbFY,1178
+ csv_detective-0.7.5.dev1113.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+ csv_detective-0.7.5.dev1113.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+ csv_detective-0.7.5.dev1113.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
+ csv_detective-0.7.5.dev1113.dist-info/RECORD,,
tests/test_fields.py CHANGED
@@ -1,5 +1,7 @@
  import pandas as pd
  from numpy import random
+ import pytest
+ from datetime import date as _date, datetime as _datetime

  from csv_detective.detect_fields.FR.geo import (
      adresse,
@@ -46,6 +48,7 @@ from csv_detective.detection import (
      detetect_categorical_variable,
  )
  from csv_detective.explore_csv import return_all_tests
+ from csv_detective.utils import cast


  def test_all_tests_return_bool():
@@ -504,3 +507,19 @@ def test_match_float():
  def test_not_match_float():
      for val in ["01053", "01053.89", "1e3", "123_456", "123_456.78", "+35", "+35.9"]:
          assert not test_float._is(val)
+
+
+ @pytest.mark.parametrize(
+     "args",
+     (
+         ("1.9", "float", float),
+         ("oui", "bool", bool),
+         ("[1, 2]", "json", list),
+         ('{"a": 1}', "json", dict),
+         ("2022-08-01", "date", _date),
+         ("2024-09-23 17:32:07", "datetime", _datetime),
+     ),
+ )
+ def test_cast(args):
+     value, detected_type, cast_type = args
+     assert isinstance(cast(value, detected_type), cast_type)
tests/test_file.py CHANGED
@@ -232,3 +232,31 @@ def test_output_df():
      assert isinstance(output, dict)
      assert isinstance(df, pd.DataFrame)
      assert len(df) == 6
+     assert df["partly_empty"].dtype == pd.Int64Dtype()
+
+
+ @pytest.mark.parametrize(
+     "cast_json",
+     (
+         (True, dict),
+         (False, str),
+     ),
+ )
+ def test_cast_json(mocked_responses, cast_json):
+     cast_json, expected_type = cast_json
+     expected_content = 'id,a_simple_dict\n1,{"a": 1}\n2,{"b": 2}\n3,{"c": 3}\n'
+     mocked_responses.get(
+         'http://example.com/test.csv',
+         body=expected_content,
+         status=200,
+     )
+     analysis, df = routine(
+         csv_file_path='http://example.com/test.csv',
+         num_rows=-1,
+         output_profile=False,
+         save_results=False,
+         output_df=True,
+         cast_json=cast_json,
+     )
+     assert analysis['columns']["a_simple_dict"]["python_type"] == "json"
+     assert isinstance(df["a_simple_dict"][0], expected_type)