csv-detective 0.7.5.dev1078__py3-none-any.whl → 0.7.5.dev1139__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -54,4 +54,4 @@ from .geo import (
54
54
  )
55
55
 
56
56
  from .FR.temp import jour_de_la_semaine, mois_de_annee
57
- from .temp import year, date, datetime_iso, datetime_rfc822
57
+ from .temp import year, date, datetime, datetime_iso, datetime_rfc822
@@ -1,21 +1,27 @@
1
1
  PROPORTION = 1
2
# Lower-cased textual representations of booleans (French and English)
# mapped to the Python value they stand for.
bool_mapping = {
    "1": True,
    "0": False,
    "vrai": True,
    "faux": False,
    "true": True,
    "false": False,
    "oui": True,
    "non": False,
    "yes": True,
    "no": False,
    "y": True,
    "n": False,
    "o": True,
}

# Accepted spellings, used for detection; derived from the mapping so that
# detection and casting can never disagree on the vocabulary.
liste_bool = set(bool_mapping)


def bool_casting(val: str) -> bool:
    """Convert a textual boolean into a Python bool.

    The lookup is case-insensitive, exactly like the detection done by
    `_is`, so every value that is detected as a boolean can also be cast
    (e.g. "Oui", "TRUE", "N").

    Returns None when `val` is not one of the known spellings.
    """
    # `_is` lower-cases before testing membership; do the same here, otherwise
    # capitalised values pass detection but are cast to None.
    return bool_mapping.get(val.lower())


def _is(val: str) -> bool:
    """Detect booleans: True if `val` is a string spelling a known boolean."""
    return isinstance(val, str) and val.lower() in liste_bool
@@ -1,8 +1,8 @@
1
1
  PROPORTION = 1
2
2
 
3
3
 
4
- def float_casting(str2cast):
5
- return float(str2cast.replace(',', '.'))
4
+ def float_casting(val: str) -> float:
5
+ return float(val.replace(',', '.'))
6
6
 
7
7
 
8
8
  def _is(val):
@@ -1,46 +1,30 @@
1
- import re
2
- from dateutil.parser import parse, ParserError
3
- from csv_detective.detect_fields.other.float import _is as is_float
4
- from unidecode import unidecode
1
+ from datetime import datetime
2
+ from typing import Optional
3
+
4
+ from dateparser import parse as date_parser
5
+ from dateutil.parser import parse as dateutil_parser, ParserError
5
6
 
6
7
  PROPORTION = 1
7
8
  # /!\ this is only for dates, not datetimes which are handled by other utils
8
9
 
9
10
 
10
def date_casting(val: str) -> Optional[datetime]:
    """Parse a date-like string into a datetime.

    For performance reasons, dateutil is tried first and dateparser is only
    used as a fallback when dateutil cannot interpret the string.

    Returns the parsed datetime, or None when the value cannot be a date.
    The fallback result is returned as is (dateparser is expected to return
    None for unparseable input -- see its documentation).
    """
    try:
        return dateutil_parser(val)
    except OverflowError:
        # dateutil raises OverflowError (which is not a ParserError) when a
        # numeric token is too large, e.g. a long string of digits. Such a
        # value is not a date: report it as unparseable instead of crashing.
        return None
    except ParserError:
        return date_parser(val)
36
17
 
37
18
 
38
19
  def _is(val):
39
- '''Renvoie True si val peut être une date, False sinon
40
- On ne garde que les regex pour les cas où parse() ne convient pas'''
41
- return isinstance(val, str) and (
42
- (is_dateutil_date(val) and not is_float(val))
43
- or bool(re.match(letters, unidecode(val)))
44
- or bool(re.match(pat, val))
45
- or bool(re.match(tap, val))
46
- )
20
+ '''Renvoie True si val peut être une date, False sinon'''
21
+ # early stops, to cut processing time
22
+ if not isinstance(val, str) or len(val) > 20 or len(val) < 8:
23
+ return False
24
+ threshold = 0.3
25
+ if sum([char.isdigit() for char in val]) / len(val) < threshold:
26
+ return False
27
+ res = date_casting(val)
28
+ if not res or res.hour or res.minute or res.second:
29
+ return False
30
+ return True
@@ -0,0 +1,19 @@
1
+ from typing import Any, Optional
2
+
3
+ from csv_detective.detect_fields.temp.date import date_casting
4
+
5
+ PROPORTION = 1
6
+
7
+
8
+ def _is(val: Optional[Any]) -> bool:
9
+ '''Renvoie True si val peut être un datetime, False sinon'''
10
+ # early stops, to cut processing time
11
+ if not isinstance(val, str) or len(val) > 30 or len(val) < 15:
12
+ return False
13
+ threshold = 0.7
14
+ if sum([char.isdigit() for char in val]) / len(val) < threshold:
15
+ return False
16
+ res = date_casting(val)
17
+ if res and (res.hour or res.minute or res.second):
18
+ return True
19
+ return False
@@ -27,7 +27,9 @@ def _is(header):
27
27
  'dateouv',
28
28
  'date der maj',
29
29
  'dmaj',
30
- 'jour'
30
+ 'jour',
31
+ 'yyyymmdd',
32
+ 'aaaammjj',
31
33
  ]
32
34
  processed_header = _process_text(header)
33
35
 
@@ -198,6 +198,8 @@ def detect_encoding(csv_file_path: str, verbose: bool = False) -> str:
198
198
  else:
199
199
  binary_file = open(csv_file_path, mode="rb")
200
200
  encoding_dict = detect(binary_file.read())
201
+ if not encoding_dict["encoding"]:
202
+ raise ValueError("Could not detect the file's encoding. Consider specifying it in the routine call.")
201
203
  if verbose:
202
204
  message = f'Detected encoding: "{encoding_dict["encoding"]}"'
203
205
  message += f' in {round(time() - start, 3)}s (confidence: {round(encoding_dict["confidence"]*100)}%)'
@@ -19,7 +19,13 @@ import pandas as pd
19
19
  from csv_detective import detect_fields, detect_labels
20
20
  from csv_detective.s3_utils import download_from_minio, upload_to_minio
21
21
  from csv_detective.schema_generation import generate_table_schema
22
- from csv_detective.utils import test_col, test_label, prepare_output_dict, display_logs_depending_process_time
22
+ from csv_detective.utils import (
23
+ cast_df,
24
+ display_logs_depending_process_time,
25
+ prepare_output_dict,
26
+ test_col,
27
+ test_label,
28
+ )
23
29
  from .detection import (
24
30
  detect_engine,
25
31
  detect_separator,
@@ -111,6 +117,7 @@ def routine(
111
117
  output_profile: bool = False,
112
118
  output_schema: bool = False,
113
119
  output_df: bool = False,
120
+ cast_json: bool = True,
114
121
  verbose: bool = False,
115
122
  sheet_name: Union[str, int] = None,
116
123
  ) -> Union[dict, tuple[dict, pd.DataFrame]]:
@@ -127,6 +134,7 @@ def routine(
127
134
  output_profile: whether or not to add the 'profile' field to the output
128
135
  output_schema: whether or not to add the 'schema' field to the output (tableschema)
129
136
  output_df: whether or not to return the loaded DataFrame along with the analysis report
137
+ cast_json: whether or not to cast json columns into objects (otherwise they are returned as strings)
130
138
  verbose: whether or not to print process logs in console
131
139
  sheet_name: if reading multi-sheet file (xls-like), which sheet to consider
132
140
  skipna: whether to keep NaN (empty cells) for tests
@@ -276,6 +284,8 @@ def routine(
276
284
  "json": "json",
277
285
  "json_geojson": "json",
278
286
  "datetime": "datetime",
287
+ "datetime_iso": "datetime",
288
+ "datetime_rfc822": "datetime",
279
289
  "date": "date",
280
290
  "latitude": "float",
281
291
  "latitude_l93": "float",
@@ -352,7 +362,12 @@ def routine(
352
362
  time() - start_routine
353
363
  )
354
364
  if output_df:
355
- return analysis, table
365
+ return analysis, cast_df(
366
+ df=table,
367
+ columns=analysis["columns"],
368
+ cast_json=cast_json,
369
+ verbose=verbose,
370
+ )
356
371
  return analysis
357
372
 
358
373
 
csv_detective/utils.py CHANGED
@@ -1,7 +1,13 @@
1
- from typing import Callable
1
+ from typing import Callable, Optional, Union
2
+ import json
2
3
  import pandas as pd
3
4
  import logging
4
5
  from time import time
6
+ from datetime import date, datetime
7
+
8
+ from csv_detective.detect_fields.other.booleen import bool_casting
9
+ from csv_detective.detect_fields.other.float import float_casting
10
+ from csv_detective.detect_fields.temp.date import date_casting
5
11
 
6
12
  logging.basicConfig(level=logging.INFO)
7
13
 
@@ -210,7 +216,52 @@ def prepare_output_dict(return_table: pd.DataFrame, limited_output: bool):
210
216
 
211
217
  def full_word_strictly_inside_string(word: str, string: str):
212
218
  return (
213
- (" " + word + " " in string)
219
+ word == string
220
+ or (" " + word + " " in string)
214
221
  or (string.startswith(word + " "))
215
222
  or (string.endswith(" " + word))
216
223
  )
224
+
225
+
226
+ def cast(value: str, _type: str) -> Optional[Union[str, float, bool, date, datetime]]:
227
+ if not isinstance(value, str) or not value:
228
+ # None is the current default value in hydra, should we keep this?
229
+ return None
230
+ if _type == "float":
231
+ return float_casting(value)
232
+ if _type == "bool":
233
+ return bool_casting(value)
234
+ if _type == "json":
235
+ # in hydra json are given to postgres as strings, conversion is done by postgres
236
+ return json.loads(value)
237
+ if _type == "date":
238
+ _date = date_casting(value)
239
+ return _date.date() if _date else None
240
+ if _type == "datetime":
241
+ return date_casting(value)
242
+ raise ValueError(f"Unknown type `{_type}`")
243
+
244
+
245
+ def cast_df(df: pd.DataFrame, columns: dict, cast_json: bool = True, verbose: bool = False) -> pd.DataFrame:
246
+ if verbose:
247
+ start = time()
248
+ output_df = pd.DataFrame()
249
+ for col_name, detection in columns.items():
250
+ if detection["python_type"] == "string" or (detection["python_type"] == "json" and not cast_json):
251
+ # no change if detected type is string
252
+ output_df[col_name] = df[col_name].copy()
253
+ elif detection["python_type"] == "int":
254
+ # to allow having ints and NaN in the same column
255
+ output_df[col_name] = df[col_name].copy().astype(pd.Int64Dtype())
256
+ else:
257
+ output_df[col_name] = df[col_name].apply(
258
+ lambda col: cast(col, _type=detection["python_type"])
259
+ )
260
+ # to save RAM
261
+ del df[col_name]
262
+ if verbose:
263
+ display_logs_depending_process_time(
264
+ f'Casting columns completed in {round(time() - start, 3)}s',
265
+ time() - start,
266
+ )
267
+ return output_df
@@ -5,6 +5,8 @@
5
5
  - New function that creates a csv from a list of fields and constraints, or from a TableSchema [#101](https://github.com/datagouv/csv-detective/pull/101)
6
6
  - Enable outputting loaded dataframe [#102](https://github.com/datagouv/csv-detective/pull/102)
7
7
  - Better naming, type hints and minor refactors [#103](https://github.com/datagouv/csv-detective/pull/103)
8
+ - The returned dataframe has its columns properly cast to the detected types [#104](https://github.com/datagouv/csv-detective/pull/104)
9
+ - Raise an error if the encoding could not be guessed [#106](https://github.com/datagouv/csv-detective/pull/106)
8
10
 
9
11
  ## 0.7.4 (2024-11-15)
10
12
 
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: csv_detective
3
- Version: 0.7.5.dev1078
3
+ Version: 0.7.5.dev1139
4
4
  Summary: Detect CSV column content
5
5
  Home-page: https://github.com/etalab/csv_detective
6
6
  Author: Etalab
@@ -15,6 +15,7 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
15
15
  Description-Content-Type: text/markdown
16
16
  License-File: LICENSE.AGPL.txt
17
17
  Requires-Dist: boto3==1.34.0
18
+ Requires-Dist: dateparser==1.2.0
18
19
  Requires-Dist: faust-cchardet==2.1.19
19
20
  Requires-Dist: pandas==2.2.0
20
21
  Requires-Dist: pytest==8.3.0
@@ -29,3 +30,12 @@ Requires-Dist: python-magic==0.4.27
29
30
  Requires-Dist: frformat==0.4.0
30
31
  Requires-Dist: faker==33.0.0
31
32
  Requires-Dist: rstr==3.2.2
33
+ Dynamic: author
34
+ Dynamic: author-email
35
+ Dynamic: classifier
36
+ Dynamic: description-content-type
37
+ Dynamic: home-page
38
+ Dynamic: keywords
39
+ Dynamic: license
40
+ Dynamic: requires-dist
41
+ Dynamic: summary
@@ -1,13 +1,13 @@
1
1
  csv_detective/__init__.py,sha256=Au4bNJ_Gi6P6o0uO4R56nYdshG7M6-7Rg_xX4whLmLI,143
2
2
  csv_detective/cli.py,sha256=Ua7SE1wMH2uFUsTmfumh4nJk7O06okpMd2gvjUDO1II,1048
3
3
  csv_detective/create_example.py,sha256=358e7Q7RWMrY_eEo3pUteJWmg2smFb5edJ_AzcQPrqA,8646
4
- csv_detective/detection.py,sha256=SUNGMvvuM_bj3gKYw-x6-CjjkirqCPoeAm0NCPkijrM,22225
5
- csv_detective/explore_csv.py,sha256=i1m1JmnMSILlGnPhXlXsUbDVcgXaJ1E2nKE7_6D2xEE,16996
4
+ csv_detective/detection.py,sha256=zrP8qvLDvhVXTHi7Ty8G_ga4zfZPjBhuyApqFQkPq2Y,22373
5
+ csv_detective/explore_csv.py,sha256=-AxnM0hGlhrbI4w1wdZwC_w-DYgoOCFpMQ94agIaD5U,17380
6
6
  csv_detective/process_text.py,sha256=rsfk66BCmdpsCOd0kDJ8tmqMsEWd-OeBkEisWc4Ej9k,1246
7
7
  csv_detective/s3_utils.py,sha256=1cIVdQUYY2ovErbMwp72Gqtqx2bkB8nfVhn-QaOFTT0,1451
8
8
  csv_detective/schema_generation.py,sha256=D1Cq4QRajsKtY8EJSwbRTIB-T_Cb2ZpcmYtCrJ6DvJQ,13135
9
- csv_detective/utils.py,sha256=3nzHNjMaNtAhwhQv_leVuBFXEYgPVFmWy1KzNCybblw,8556
10
- csv_detective/detect_fields/__init__.py,sha256=CchNbi1vrgIGh_uBexXZTzfjBETDY0kQLjI-PAquU8M,921
9
+ csv_detective/utils.py,sha256=yO9INaLh-QX-FFL2A153AlMqftE04wb0hpN6HJvsKGg,10581
10
+ csv_detective/detect_fields/__init__.py,sha256=NVfE3BQVExgXb-BPbhDvlkM5-0naEVLpZ4aM_OGHYfE,931
11
11
  csv_detective/detect_fields/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
12
  csv_detective/detect_fields/FR/geo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
13
  csv_detective/detect_fields/FR/geo/adresse/__init__.py,sha256=e5JqMNOPxx0Ivju3zAHCGMopZroCpR4vr3DJKlQhMz4,1675
@@ -55,9 +55,9 @@ csv_detective/detect_fields/geo/latitude_wgs/__init__.py,sha256=ArS6PuYEd0atZwSq
55
55
  csv_detective/detect_fields/geo/latlon_wgs/__init__.py,sha256=3nlBqFYD4kVSVxw4b9DTPcxW59oL0T3Kj0OxPlyP9og,268
56
56
  csv_detective/detect_fields/geo/longitude_wgs/__init__.py,sha256=G7afWOKiGh_Tv7gwDNGt1a4B_A8hkCBkIxn3THDCUFk,330
57
57
  csv_detective/detect_fields/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
58
- csv_detective/detect_fields/other/booleen/__init__.py,sha256=rM__y88CGoLkMXoRkonC4YxJT2E-HfjAXocKFjIqoxU,281
58
+ csv_detective/detect_fields/other/booleen/__init__.py,sha256=1qIEI681iEaPVb9XxmH2ewxDdfmYhHe4-s3MZ6L1A9Q,489
59
59
  csv_detective/detect_fields/other/email/__init__.py,sha256=O9tgJmq0O8Q-8iin63NqEEDhlsUJjxFZNaNFM4GZaws,178
60
- csv_detective/detect_fields/other/float/__init__.py,sha256=tdHBimi668qpJhVc87w-msUfGGUcKY_tex31u5W_VQs,545
60
+ csv_detective/detect_fields/other/float/__init__.py,sha256=dpEd5ZijmjQ7gqcTnYRoRoLGGJae0RyGwVC6MPra9go,549
61
61
  csv_detective/detect_fields/other/int/__init__.py,sha256=QN3kQJLYqLRBiubUK7g4Xq03PlA5wqVwx2pPPIO9FdI,320
62
62
  csv_detective/detect_fields/other/json/__init__.py,sha256=DhzyvT12kOqgum89silIu3uoSYXmC_s_AaxLtXAD4eU,540
63
63
  csv_detective/detect_fields/other/mongo_object_id/__init__.py,sha256=7fcrHsOZAqXp2_N0IjPskYJ_qi4xRlo9iyNNDQVLzsU,156
@@ -65,7 +65,8 @@ csv_detective/detect_fields/other/twitter/__init__.py,sha256=qbwLKsTBRFQ4PyTNVeE
65
65
  csv_detective/detect_fields/other/url/__init__.py,sha256=9WaTqCglEsw_lJG_xZsBMdxJXg2yuQ92_fkX6CXWNV0,286
66
66
  csv_detective/detect_fields/other/uuid/__init__.py,sha256=3-z0fDax29SJc57zPjNGR6DPICJu6gfuNGC5L3jh4d0,223
67
67
  csv_detective/detect_fields/temp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
68
- csv_detective/detect_fields/temp/date/__init__.py,sha256=9-XhY3sMYRFQliEbprwKhfXCNz4_imgweZs_4Mbno9M,1784
68
+ csv_detective/detect_fields/temp/date/__init__.py,sha256=aFP1feFWFhCpR6Q9s_4BBwWxFtwFiMXY1iduSeQIjdA,943
69
+ csv_detective/detect_fields/temp/datetime/__init__.py,sha256=Ykwhk2ospjY9P0KOG0AitgqN0sld6UmhOlbMz_XGQzQ,597
69
70
  csv_detective/detect_fields/temp/datetime_iso/__init__.py,sha256=DOfli-A7gPlZmiV2J6Ka5_yDUCaOgxis29LET_tfhA4,444
70
71
  csv_detective/detect_fields/temp/datetime_rfc822/__init__.py,sha256=JtUzg3BXYd-XJMLGxQ0P1OAJGOQ7DlYMD4fCU9yndg0,511
71
72
  csv_detective/detect_fields/temp/year/__init__.py,sha256=RjsiIHoplnI4Odi5587TzRhKTDT-FTqGOBpdartuShA,194
@@ -122,22 +123,22 @@ csv_detective/detect_labels/other/twitter/__init__.py,sha256=D8G4vGsFL9a99OJz-03
122
123
  csv_detective/detect_labels/other/url/__init__.py,sha256=vqUQvn5o6JZU8iRsSG3AYqggjlhzagozVYWwpuSReV8,1202
123
124
  csv_detective/detect_labels/other/uuid/__init__.py,sha256=OdMUxqvqMdGaY5nph7CbIF_Q0LSxljxE72kCMT4m-Zk,931
124
125
  csv_detective/detect_labels/temp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
125
- csv_detective/detect_labels/temp/date/__init__.py,sha256=GrIbo64WVM3hi7ShBRKKyKUZxkZlVKhpgk41FxkM1VI,1281
126
+ csv_detective/detect_labels/temp/date/__init__.py,sha256=-R7VqlryozelSn4wH_7w9x6ks77DP1kw2XMBYSLrzXE,1322
126
127
  csv_detective/detect_labels/temp/datetime_iso/__init__.py,sha256=Ih9l56nBcdmGLyWDavVUWuUUuVZBz9QUDE1hHzADvVg,1157
127
128
  csv_detective/detect_labels/temp/datetime_rfc822/__init__.py,sha256=DQ_h4uDW1e6qu2rATEhgGKw6O-vVi7HbDhbEDDCT9uY,1175
128
129
  csv_detective/detect_labels/temp/year/__init__.py,sha256=zPF_mvhzhXMAlHPAskS8mhuxjLj2AlKpV4ss8Q4tDms,1150
129
- csv_detective-0.7.5.dev1078.data/data/share/csv_detective/CHANGELOG.md,sha256=5M95hTftsY9Ic2q_jexDNp-MgAFAXuPZyWGyFABi3l4,6927
130
- csv_detective-0.7.5.dev1078.data/data/share/csv_detective/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
131
- csv_detective-0.7.5.dev1078.data/data/share/csv_detective/README.md,sha256=Qr8xRXc-dxQ-tdXCpCTCKp1Uliqq84r0UOlPRNuGCpI,9506
130
+ csv_detective-0.7.5.dev1139.data/data/share/csv_detective/CHANGELOG.md,sha256=jgcSpxGkuEdOQtQ08tRFflkCNuHstGFHN_29Tv6M1dE,7176
131
+ csv_detective-0.7.5.dev1139.data/data/share/csv_detective/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
132
+ csv_detective-0.7.5.dev1139.data/data/share/csv_detective/README.md,sha256=Qr8xRXc-dxQ-tdXCpCTCKp1Uliqq84r0UOlPRNuGCpI,9506
132
133
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
133
134
  tests/test_example.py,sha256=0NfChooJQlFxTo2nY5FOQIcsK4zzWA_SBmt2LwVQovY,2014
134
- tests/test_fields.py,sha256=kXel-hiyQYrJ3OLmwUMg1K3DKbbwBLvUplxZWxpp18I,10605
135
- tests/test_file.py,sha256=oQITvAxdcrqDby2wWSh_X9TCwFqdFaP34XNy92ibXyg,6725
135
+ tests/test_fields.py,sha256=_96htvTzvM7u-W57RpOBbsacWirIm4R36PP7JhPEaYQ,11123
136
+ tests/test_file.py,sha256=HO-Zqv0ZDFy3d0ZrpjWQPXBrwgUmzesseoEofy8G2UU,7529
136
137
  tests/test_labels.py,sha256=6MOKrGznkwU5fjZ_3oiB6Scmb480Eu-9geBJs0UDLds,159
137
138
  tests/test_structure.py,sha256=SVsnluVoIIprYw_67I1_gB3cp9m1wlO8C7SpdsLW8cM,1161
138
- csv_detective-0.7.5.dev1078.dist-info/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
139
- csv_detective-0.7.5.dev1078.dist-info/METADATA,sha256=NSxmqCJpApiSavZ59QEMfRuzeB_pmOk2Wm_zTy-o2eQ,1145
140
- csv_detective-0.7.5.dev1078.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
141
- csv_detective-0.7.5.dev1078.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
142
- csv_detective-0.7.5.dev1078.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
143
- csv_detective-0.7.5.dev1078.dist-info/RECORD,,
139
+ csv_detective-0.7.5.dev1139.dist-info/LICENSE.AGPL.txt,sha256=2N5ReRelkdqkR9a-KP-y-shmcD5P62XoYiG-miLTAzo,34519
140
+ csv_detective-0.7.5.dev1139.dist-info/METADATA,sha256=L-WEqkw3fze2JUmOVwfqV1Ttgsf5lhi_WqEULi0AKkA,1364
141
+ csv_detective-0.7.5.dev1139.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
142
+ csv_detective-0.7.5.dev1139.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
143
+ csv_detective-0.7.5.dev1139.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
144
+ csv_detective-0.7.5.dev1139.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.6.0)
2
+ Generator: setuptools (75.8.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
tests/test_fields.py CHANGED
@@ -1,5 +1,7 @@
1
1
  import pandas as pd
2
2
  from numpy import random
3
+ import pytest
4
+ from datetime import date as _date, datetime as _datetime
3
5
 
4
6
  from csv_detective.detect_fields.FR.geo import (
5
7
  adresse,
@@ -46,6 +48,7 @@ from csv_detective.detection import (
46
48
  detetect_categorical_variable,
47
49
  )
48
50
  from csv_detective.explore_csv import return_all_tests
51
+ from csv_detective.utils import cast
49
52
 
50
53
 
51
54
  def test_all_tests_return_bool():
@@ -504,3 +507,19 @@ def test_match_float():
504
507
  def test_not_match_float():
505
508
  for val in ["01053", "01053.89", "1e3", "123_456", "123_456.78", "+35", "+35.9"]:
506
509
  assert not test_float._is(val)
510
+
511
+
512
+ @pytest.mark.parametrize(
513
+ "args",
514
+ (
515
+ ("1.9", "float", float),
516
+ ("oui", "bool", bool),
517
+ ("[1, 2]", "json", list),
518
+ ('{"a": 1}', "json", dict),
519
+ ("2022-08-01", "date", _date),
520
+ ("2024-09-23 17:32:07", "datetime", _datetime),
521
+ ),
522
+ )
523
+ def test_cast(args):
524
+ value, detected_type, cast_type = args
525
+ assert isinstance(cast(value, detected_type), cast_type)
tests/test_file.py CHANGED
@@ -232,3 +232,31 @@ def test_output_df():
232
232
  assert isinstance(output, dict)
233
233
  assert isinstance(df, pd.DataFrame)
234
234
  assert len(df) == 6
235
+ assert df["partly_empty"].dtype == pd.Int64Dtype()
236
+
237
+
238
+ @pytest.mark.parametrize(
239
+ "cast_json",
240
+ (
241
+ (True, dict),
242
+ (False, str),
243
+ ),
244
+ )
245
+ def test_cast_json(mocked_responses, cast_json):
246
+ cast_json, expected_type = cast_json
247
+ expected_content = 'id,a_simple_dict\n1,{"a": 1}\n2,{"b": 2}\n3,{"c": 3}\n'
248
+ mocked_responses.get(
249
+ 'http://example.com/test.csv',
250
+ body=expected_content,
251
+ status=200,
252
+ )
253
+ analysis, df = routine(
254
+ csv_file_path='http://example.com/test.csv',
255
+ num_rows=-1,
256
+ output_profile=False,
257
+ save_results=False,
258
+ output_df=True,
259
+ cast_json=cast_json,
260
+ )
261
+ assert analysis['columns']["a_simple_dict"]["python_type"] == "json"
262
+ assert isinstance(df["a_simple_dict"][0], expected_type)