csv-detective 0.9.2.dev1874__py3-none-any.whl → 0.9.3.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. csv_detective/__init__.py +1 -2
  2. csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +1 -1
  3. csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +1 -1
  4. csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +1 -1
  5. csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +1 -1
  6. csv_detective/detect_fields/geo/latitude_wgs/__init__.py +1 -1
  7. csv_detective/detect_fields/geo/longitude_wgs/__init__.py +1 -1
  8. csv_detective/detect_fields/other/email/__init__.py +2 -2
  9. csv_detective/detect_fields/temp/date/__init__.py +1 -2
  10. csv_detective/detect_fields/temp/datetime_aware/__init__.py +7 -6
  11. csv_detective/detect_fields/temp/datetime_naive/__init__.py +4 -8
  12. csv_detective/detection/engine.py +1 -2
  13. csv_detective/detection/formats.py +14 -8
  14. csv_detective/detection/headers.py +2 -2
  15. csv_detective/explore_csv.py +11 -119
  16. csv_detective/load_tests.py +1 -2
  17. csv_detective/output/__init__.py +11 -14
  18. csv_detective/output/dataframe.py +1 -2
  19. csv_detective/output/example.py +12 -12
  20. csv_detective/output/profile.py +13 -10
  21. csv_detective/output/schema.py +7 -86
  22. csv_detective/parsing/excel.py +2 -3
  23. csv_detective/parsing/load.py +3 -4
  24. csv_detective/utils.py +4 -3
  25. csv_detective/validate.py +4 -5
  26. {csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.3.dev0.dist-info}/METADATA +18 -26
  27. {csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.3.dev0.dist-info}/RECORD +34 -36
  28. tests/test_fields.py +37 -4
  29. tests/test_file.py +68 -0
  30. venv/bin/activate_this.py +1 -1
  31. csv_detective/s3_utils.py +0 -44
  32. venv/bin/jp.py +0 -54
  33. {csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.3.dev0.dist-info}/WHEEL +0 -0
  34. {csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.3.dev0.dist-info}/entry_points.txt +0 -0
  35. {csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.3.dev0.dist-info}/licenses/LICENSE +0 -0
  36. {csv_detective-0.9.2.dev1874.dist-info → csv_detective-0.9.3.dev0.dist-info}/top_level.txt +0 -0
csv_detective/__init__.py CHANGED
@@ -1,7 +1,6 @@
- from csv_detective.explore_csv import routine, routine_minio, validate_then_detect
+ from csv_detective.explore_csv import routine, validate_then_detect
 
  __all__ = [
  "routine",
- "routine_minio",
  "validate_then_detect",
  ]
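With routine_minio gone, the package-level exports are down to two functions; a quick sanity check of the new import surface:

# the only entry points now re-exported at package level
from csv_detective import routine, validate_then_detect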
@@ -3,7 +3,7 @@ from frformat import LatitudeL93
  from csv_detective.detect_fields.other.float import _is as is_float
  from csv_detective.detect_fields.other.float import float_casting
 
- PROPORTION = 0.9
+ PROPORTION = 1
 
  _latitudel93 = LatitudeL93()
 
@@ -1,6 +1,6 @@
  from csv_detective.detect_fields.other.float import _is as is_float
 
- PROPORTION = 0.9
+ PROPORTION = 1
 
 
  def _is(val):
@@ -3,7 +3,7 @@ from frformat import LongitudeL93
  from csv_detective.detect_fields.other.float import _is as is_float
  from csv_detective.detect_fields.other.float import float_casting
 
- PROPORTION = 0.9
+ PROPORTION = 1
 
  _longitudel93 = LongitudeL93()
 
@@ -1,6 +1,6 @@
  from csv_detective.detect_fields.other.float import _is as is_float
 
- PROPORTION = 0.9
+ PROPORTION = 1
 
 
  def _is(val):
@@ -1,6 +1,6 @@
  from csv_detective.detect_fields.other.float import _is as is_float
 
- PROPORTION = 0.9
+ PROPORTION = 1
 
 
  def _is(val):
@@ -1,6 +1,6 @@
  from csv_detective.detect_fields.other.float import _is as is_float
 
- PROPORTION = 0.9
+ PROPORTION = 1
 
 
  def _is(val):
@@ -1,10 +1,10 @@
  import re
 
- PROPORTION = 1
+ PROPORTION = 0.9
 
 
  def _is(val):
  """Detects e-mails"""
  return isinstance(val, str) and bool(
- re.match(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}$", val)
+ re.match(r"^[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}$", val, re.IGNORECASE)
  )
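The rewritten e-mail pattern keeps case handling through re.IGNORECASE instead of spelling out both letter ranges, and drops the 6-character cap on the TLD. A quick standalone check of both effects (the addresses are made up):

import re

pat = r"^[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}$"
# still matches: the IGNORECASE flag replaces the explicit A-Z ranges
print(bool(re.match(pat, "Jane.Doe@Example.ORG", re.IGNORECASE)))     # True
# TLDs longer than 6 characters now pass; the old {2,6} bound rejected them
print(bool(re.match(pat, "user@startup.technology", re.IGNORECASE)))  # True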
@@ -1,6 +1,5 @@
  import re
  from datetime import datetime
- from typing import Optional
 
  from dateparser import parse as date_parser
  from dateutil.parser import ParserError
@@ -10,7 +9,7 @@ PROPORTION = 1
  # /!\ this is only for dates, not datetimes which are handled by other utils
 
 
- def date_casting(val: str) -> Optional[datetime]:
+ def date_casting(val: str) -> datetime | None:
  """For performance reasons, we try first with dateutil and fallback on dateparser"""
  try:
  return dateutil_parser(val)
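Only the annotation changes here (a PEP 604 union instead of Optional); the dateutil-first, dateparser-fallback behaviour described in the docstring stays the same. A minimal sketch of that fallback, assuming the except clause (not shown in this hunk) simply catches dateutil's ParserError:

from datetime import datetime

from dateparser import parse as date_parser
from dateutil.parser import ParserError
from dateutil.parser import parse as dateutil_parser


def date_casting(val: str) -> datetime | None:
    # fast path: dateutil is much quicker than dateparser
    try:
        return dateutil_parser(val)
    except ParserError:
        # slow path: dateparser handles more exotic formats and returns None on failure
        return date_parser(val)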
@@ -1,24 +1,25 @@
  import re
- from typing import Any, Optional
+ from typing import Any
 
  from csv_detective.detect_fields.temp.date import aaaammjj_pattern, date_casting
 
  PROPORTION = 1
  threshold = 0.7
 
- # matches AAAA-MM-JJTHH:MM:SS(.dddddd)±HH:MM with any of the listed separators for the date OR NO SEPARATOR
+ # matches AAAA-MM-JJTHH:MM:SS(.dddddd)(±HH:MM|Z) with any of the listed separators for the date OR NO SEPARATOR
  pat = (
  aaaammjj_pattern.replace("$", "")
- + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})?[+-](0\d|1[0-9]|2[0-3]):([0-5][0-9])$"
+ + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})"
+ + r"?(([+-](0\d|1[0-9]|2[0-3]):([0-5][0-9]))|Z)$"
  )
 
 
- def _is(val: Optional[Any]) -> bool:
+ def _is(val: Any | None) -> bool:
  """Detects timezone-aware datetimes only"""
  # early stops, to cut processing time
- # 21 is the minimal length of a datetime format YYMMDDTHH:MM:SS+HH:MM
+ # 16 is the minimal length of a datetime format YYMMDDTHH:MM:SSZ
  # 32 is the maximal length of an ISO datetime format YYYY-MM-DDTHH:MM:SS.dddddd+HH:MM, keeping some slack
- if not isinstance(val, str) or len(val) > 35 or len(val) < 21:
+ if not isinstance(val, str) or len(val) > 35 or len(val) < 16:
  return False
  # if usual format, no need to parse
  if bool(re.match(pat, val)):
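The aware-datetime fast path now accepts a trailing Z as well as a numeric offset, and the minimum-length early stop drops from 21 to 16 so Z-suffixed values get through. A quick check of the new suffix alternation, with an assumed YYYY-MM-DD stand-in for aaaammjj_pattern (the real pattern lives in the date module and also allows other separators or none):

import re

# assumed stand-in for aaaammjj_pattern with its trailing "$" already stripped
date_prefix = r"^\d{4}-\d{2}-\d{2}"
pat = (
    date_prefix
    + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})"
    + r"?(([+-](0\d|1[0-9]|2[0-3]):([0-5][0-9]))|Z)$"
)

print(bool(re.match(pat, "2024-06-01T12:30:00Z")))       # True, newly accepted
print(bool(re.match(pat, "2024-06-01 12:30:00+02:00")))  # True, as before
print(bool(re.match(pat, "2024-06-01T12:30:00")))        # False, no offset: not timezone-aware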
@@ -1,5 +1,5 @@
  import re
- from typing import Any, Optional
+ from typing import Any
 
  from csv_detective.detect_fields.temp.date import aaaammjj_pattern, date_casting
 
@@ -9,11 +9,11 @@ threshold = 0.7
  # matches AAAA-MM-JJTHH:MM:SS(.dddddd)Z with any of the listed separators for the date OR NO SEPARATOR
  pat = (
  aaaammjj_pattern.replace("$", "")
- + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})?Z$"
+ + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})?$"
  )
 
 
- def _is(val: Optional[Any]) -> bool:
+ def _is(val: Any | None) -> bool:
  """Detects naive datetimes only"""
  # early stops, to cut processing time
  # 15 is the minimal length of a datetime format YYMMDDTHH:MM:SS
@@ -26,8 +26,4 @@ def _is(val: Optional[Any]) -> bool:
  if sum([char.isdigit() or char in {"-", "/", ":", " "} for char in val]) / len(val) < threshold:
  return False
  res = date_casting(val)
- return (
- res is not None
- and bool(res.hour or res.minute or res.second or res.microsecond)
- and not bool(res.tzinfo)
- )
+ return res is not None and not bool(res.tzinfo)
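On the naive side, the regex no longer requires a trailing Z (that case now belongs to the aware detector) and the final check is reduced to "parsed and carries no tzinfo", so midnight timestamps with all-zero time parts are no longer rejected. A small illustration of the new check, using dateutil's parse as a stand-in for date_casting:

from dateutil.parser import parse

for val in ("2024-06-01 00:00:00", "2024-06-01T12:30:00+02:00"):
    res = parse(val)
    # naive means: parsed successfully and carrying no timezone information
    print(val, "->", res is not None and not bool(res.tzinfo))
# 2024-06-01 00:00:00 -> True (previously False because hour, minute and second were all 0)
# 2024-06-01T12:30:00+02:00 -> False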
@@ -1,5 +1,4 @@
  from time import time
- from typing import Optional
 
  import magic
  import requests
@@ -16,7 +15,7 @@ engine_to_file = {
  }
 
 
- def detect_engine(file_path: str, verbose=False) -> Optional[str]:
+ def detect_engine(file_path: str, verbose=False) -> str | None:
  if verbose:
  start = time()
  mapping = {
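Only the annotation changes in this hunk; detect_engine still sniffs the file with python-magic and maps the detected type to a reading engine. As a rough idea of that approach (the MIME strings, engine names and function name below are assumptions for illustration, not the package's actual mapping):

import magic


def guess_engine(file_path: str) -> str | None:
    # libmagic gives a MIME type without trusting the file extension
    mime = magic.from_file(file_path, mime=True)
    mapping = {
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "openpyxl",
        "application/vnd.ms-excel": "xlrd",
        "application/vnd.oasis.opendocument.spreadsheet": "odf",
    }
    return mapping.get(mime)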
@@ -1,6 +1,5 @@
  import logging
  from collections import defaultdict
- from typing import Union
 
  import numpy as np
  import pandas as pd
@@ -22,7 +21,7 @@ def detect_formats(
  table: pd.DataFrame,
  analysis: dict,
  file_path: str,
- user_input_tests: Union[str, list[str]] = "ALL",
+ user_input_tests: str | list[str] = "ALL",
  limited_output: bool = True,
  skipna: bool = True,
  verbose: bool = False,
@@ -30,7 +29,7 @@
  on_sample = len(table) > MAX_ROWS_ANALYSIS
  if on_sample:
  if verbose:
- logging.warning(f"File is too long, analysing the {MAX_ROWS_ANALYSIS} first rows")
+ logging.warning(f"File is too long, analysing a sample of {MAX_ROWS_ANALYSIS} rows")
  table = build_sample(table)
 
  if table.empty:
@@ -183,13 +182,15 @@ def build_sample(table: pd.DataFrame) -> pd.DataFrame:
  samples = pd.concat(
  [
  # one row with the minimum of the column
- table.loc[table[col] == table[col].dropna().min()].iloc[[0]]
+ table.loc[table[col] == val].iloc[[0]]
  for col in table.columns
+ if not pd.isna(val := table[col].dropna().min())
  ]
  + [
  # one row with the maximum of the column
- table.loc[table[col] == table[col].dropna().max()].iloc[[0]]
+ table.loc[table[col] == val].iloc[[0]]
  for col in table.columns
+ if not pd.isna(val := table[col].dropna().max())
  ]
  + [
  # one row with a NaN value if the column has any
@@ -199,7 +200,12 @@
  ],
  ignore_index=True,
  )
- return pd.concat(
- [samples, table.sample(n=MAX_ROWS_ANALYSIS - len(samples), random_state=1)],
- ignore_index=True,
+ return (
+ pd.concat(
+ [samples, table.sample(n=MAX_ROWS_ANALYSIS - len(samples), random_state=1)],
+ ignore_index=True,
+ )
+ # this is very unlikely but we never know
+ if len(samples) <= MAX_ROWS_ANALYSIS
+ else samples.sample(n=MAX_ROWS_ANALYSIS, random_state=1)
  )
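build_sample now binds each column's minimum and maximum once with a walrus and skips columns whose min or max is NaN (i.e. all-NaN columns), which previously produced an empty selection and an IndexError on .iloc[[0]]. A small self-contained illustration of that guard (toy data, not from the package):

import pandas as pd

df = pd.DataFrame({"a": [3, 1, 2], "b": [float("nan")] * 3})

rows = [
    df.loc[df[col] == val].iloc[[0]]                # one row holding the column minimum
    for col in df.columns
    if not pd.isna(val := df[col].dropna().min())   # all-NaN columns are skipped safely
]
print(pd.concat(rows, ignore_index=True))           # only column "a" contributes a row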
@@ -1,11 +1,11 @@
  import logging
  from time import time
- from typing import Optional, TextIO
+ from typing import TextIO
 
  from csv_detective.utils import display_logs_depending_process_time
 
 
- def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int, Optional[list]]:
+ def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int, list | None]:
  """Tests 10 first rows for possible header (in case header is not 1st row)"""
  if verbose:
  start = time()
@@ -1,16 +1,11 @@
- import json
  import logging
- import os
- import tempfile
  from time import time
- from typing import Optional, Union
 
  import pandas as pd
 
  from csv_detective.detection.formats import detect_formats
- from csv_detective.output import generate_output, generate_table_schema
+ from csv_detective.output import generate_output
  from csv_detective.parsing.load import load_file
- from csv_detective.s3_utils import download_from_minio, upload_to_minio
  from csv_detective.utils import display_logs_depending_process_time, is_url
  from csv_detective.validate import validate
 
@@ -20,24 +15,24 @@ logging.basicConfig(level=logging.INFO)
  def routine(
  file_path: str,
  num_rows: int = 500,
- user_input_tests: Union[str, list[str]] = "ALL",
+ user_input_tests: str | list[str] = "ALL",
  limited_output: bool = True,
- save_results: Union[bool, str] = True,
- encoding: Optional[str] = None,
- sep: Optional[str] = None,
+ save_results: bool | str = True,
+ encoding: str | None = None,
+ sep: str | None = None,
  skipna: bool = True,
  output_profile: bool = False,
  output_schema: bool = False,
  output_df: bool = False,
  cast_json: bool = True,
  verbose: bool = False,
- sheet_name: Optional[Union[str, int]] = None,
- ) -> Union[dict, tuple[dict, pd.DataFrame]]:
- """Returns a dict with information about the csv table and possible
+ sheet_name: str | int | None = None,
+ ) -> dict | tuple[dict, pd.DataFrame]:
+ """Returns a dict with information about the table and possible
  column contents, and if requested the DataFrame with columns cast according to analysis.
 
  Args:
- file_path: local path to CSV file if not using Minio
+ file_path: local path or URL to file
  num_rows: number of rows to sample from the file for analysis ; -1 for analysis
  of the whole file
  user_input_tests: tests to run on the file
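routine keeps its keyword arguments but moves to PEP 604 annotations and now documents file_path as a local path or URL. A hedged usage sketch built only from the parameters visible above (the file name is a placeholder, and the save_results comment is an assumption):

from csv_detective import routine

# analyse the whole file and also get back the DataFrame cast according to the analysis
analysis, df = routine(
    file_path="my_data.csv",   # local path or URL to the file
    num_rows=-1,               # -1 analyses every row instead of a sample
    save_results=False,        # assumed to skip writing the report to disk
    output_df=True,            # return (analysis, DataFrame) instead of just the dict
)
print(analysis["columns"])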
@@ -111,9 +106,9 @@ def validate_then_detect(
  file_path: str,
  previous_analysis: dict,
  num_rows: int = 500,
- user_input_tests: Union[str, list[str]] = "ALL",
+ user_input_tests: str | list[str] = "ALL",
  limited_output: bool = True,
- save_results: Union[bool, str] = True,
+ save_results: bool | str = True,
  skipna: bool = True,
  output_profile: bool = False,
  output_schema: bool = False,
@@ -173,106 +168,3 @@ def validate_then_detect(
  display_logs_depending_process_time(
  f"Process completed in {round(time() - start_routine, 3)}s", time() - start_routine
  )
-
-
- def routine_minio(
- csv_minio_location: dict[str, str],
- output_minio_location: dict[str, str],
- tableschema_minio_location: dict[str, str],
- minio_user: str,
- minio_pwd: str,
- **kwargs,
- ):
- """Returns a dict with information about the csv table and possible
- column contents.
-
- Args:
- csv_minio_location: dict with Minio URL, bucket and key of the CSV file
- output_minio_location: Minio URL, bucket and key to store output file. None if
- not uploading to Minio.
- tableschema_minio_location: Minio URL, bucket and key to store tableschema file.
- None if not uploading the tableschema to Minio.
- minio_user: user name for the minio instance
- minio_pwd: password for the minio instance
- kwargs: arguments for routine
-
- Returns:
- dict: a dict with information about the csv and possible types for each column
- """
-
- if (
- (
- any(
- [
- location_dict is not None
- for location_dict in [
- csv_minio_location,
- output_minio_location,
- tableschema_minio_location,
- ]
- ]
- )
- )
- and (minio_user is None)
- or (minio_pwd is None)
- ):
- raise ValueError("Minio credentials are required if using Minio")
-
- for location_dict in [
- csv_minio_location,
- output_minio_location,
- tableschema_minio_location,
- ]:
- if location_dict is not None:
- if any(
- [
- (location_key not in location_dict) or (location_dict[location_key] is None)
- for location_key in ["netloc", "bucket", "key"]
- ]
- ):
- raise ValueError("Minio location dict must contain url, bucket and key")
-
- file_path = tempfile.NamedTemporaryFile(delete=False).name
- download_from_minio(
- netloc=csv_minio_location["netloc"],
- bucket=csv_minio_location["bucket"],
- key=csv_minio_location["key"],
- filepath=file_path,
- minio_user=minio_user,
- minio_pwd=minio_pwd,
- )
-
- analysis = routine(
- file_path,
- save_results=True,
- **kwargs,
- )
-
- # Write report JSON file.
- output_path_to_store_minio_file = os.path.splitext(file_path)[0] + ".json"
- with open(output_path_to_store_minio_file, "w", encoding="utf8") as fp:
- json.dump(analysis, fp, indent=4, separators=(",", ": "))
-
- upload_to_minio(
- netloc=output_minio_location["netloc"],
- bucket=output_minio_location["bucket"],
- key=output_minio_location["key"],
- filepath=output_path_to_store_minio_file,
- minio_user=minio_user,
- minio_pwd=minio_pwd,
- )
-
- os.remove(output_path_to_store_minio_file)
- os.remove(file_path)
-
- generate_table_schema(
- analysis_report=analysis,
- save_file=True,
- netloc=tableschema_minio_location["netloc"],
- bucket=tableschema_minio_location["bucket"],
- key=tableschema_minio_location["key"],
- minio_user=minio_user,
- minio_pwd=minio_pwd,
- )
-
- return analysis
@@ -1,5 +1,4 @@
  import os
- from typing import Union
 
  from csv_detective import detect_fields, detect_labels # noqa
 
@@ -18,7 +17,7 @@ def get_all_packages(detect_type) -> list:
 
 
  def return_all_tests(
- user_input_tests: Union[str, list],
+ user_input_tests: str | list,
  detect_type: str,
  ) -> list:
  """
@@ -1,6 +1,5 @@
  import json
  import os
- from typing import Optional, Union
 
  import pandas as pd
 
@@ -17,22 +16,15 @@ def generate_output(
  file_path: str,
  num_rows: int = 500,
  limited_output: bool = True,
- save_results: Union[bool, str] = True,
+ save_results: bool | str = True,
  output_profile: bool = False,
  output_schema: bool = False,
  output_df: bool = False,
  cast_json: bool = True,
  verbose: bool = False,
- sheet_name: Optional[Union[str, int]] = None,
- ) -> Union[dict, tuple[dict, pd.DataFrame]]:
- if output_profile or output_df:
- # to create the profile we have to cast columns, so using the dedicated function
- table = cast_df(
- df=table,
- columns=analysis["columns"],
- cast_json=cast_json,
- verbose=verbose,
- )
+ sheet_name: str | int | None = None,
+ ) -> dict | tuple[dict, pd.DataFrame]:
+ if output_profile:
  analysis["profile"] = create_profile(
  table=table,
  columns=analysis["columns"],
@@ -58,8 +50,13 @@ def generate_output(
  )
 
  if output_schema:
- analysis["schema"] = generate_table_schema(analysis, save_file=False, verbose=verbose)
+ analysis["schema"] = generate_table_schema(analysis, save_results=False, verbose=verbose)
 
  if output_df:
- return analysis, table
+ return analysis, cast_df(
+ df=table,
+ columns=analysis["columns"],
+ cast_json=cast_json,
+ verbose=verbose,
+ )
  return analysis
@@ -1,7 +1,6 @@
  import json
  from datetime import date, datetime
  from time import time
- from typing import Optional, Union
 
  import pandas as pd
 
@@ -11,7 +10,7 @@ from csv_detective.detect_fields.temp.date import date_casting
  from csv_detective.utils import display_logs_depending_process_time
 
 
- def cast(value: str, _type: str) -> Optional[Union[str, float, bool, date, datetime]]:
+ def cast(value: str, _type: str) -> str | float | bool | date | datetime | None:
  if not isinstance(value, str) or not value:
  # None is the current default value in hydra, should we keep this?
  return None
@@ -3,7 +3,7 @@ import random
  import string
  import uuid
  from datetime import datetime
- from typing import Any, Optional, Type, Union
+ from typing import Any, Type
 
  import pandas as pd
  import requests
@@ -14,10 +14,10 @@ fake = Faker()
 
 
  def create_example_csv_file(
- fields: Optional[dict] = None,
- schema_path: Optional[str] = None,
+ fields: dict | None = None,
+ schema_path: str | None = None,
  file_length: int = 10,
- output_name: Optional[str] = "example_file.csv",
+ output_name: str | None = "example_file.csv",
  output_sep: str = ";",
  encoding: str = "utf-8",
  ignore_required: bool = False,
@@ -49,8 +49,8 @@ def create_example_csv_file(
  def _string(
  length: int = 10,
  required: bool = True,
- pattern: Optional[str] = None,
- enum: Optional[str] = None,
+ pattern: str | None = None,
+ enum: str | None = None,
  ) -> str:
  if potential_skip(required):
  return ""
@@ -70,7 +70,7 @@ def create_example_csv_file(
  return str(uuid.uuid4())
 
  def _date(
- date_range: Optional[list[str]] = None,
+ date_range: list[str] | None = None,
  format: str = "%Y-%m-%d",
  required: bool = True,
  ) -> str:
@@ -99,7 +99,7 @@ def create_example_csv_file(
  return fake.time(format)
 
  def _datetime(
- datetime_range: Optional[list[str]] = None,
+ datetime_range: list[str] | None = None,
  format: str = "%Y-%m-%d %H-%M-%S",
  required: bool = True,
  ) -> str:
@@ -123,11 +123,11 @@ def create_example_csv_file(
  return f"http://{rstr.domainsafe()}.{rstr.letters(3)}/{rstr.urlsafe()}"
 
  def _number(
- num_type: Type[Union[int, float]] = int,
- num_range: Optional[list[float]] = None,
- enum: Optional[list] = None,
+ num_type: Type[int | float] = int,
+ num_range: list[float] | None = None,
+ enum: list | None = None,
  required: bool = True,
- ) -> Union[int, float]:
+ ) -> int | float:
  assert num_range is None or len(num_range) == 2
  if potential_skip(required):
  return ""
@@ -4,7 +4,8 @@ from time import time
 
  import pandas as pd
 
- from csv_detective.utils import display_logs_depending_process_time, prevent_nan
+ from csv_detective.detect_fields.other.float import float_casting
+ from csv_detective.utils import cast_prevent_nan, display_logs_depending_process_time
 
 
  def create_profile(
@@ -18,11 +19,6 @@ def create_profile(
  if verbose:
  start = time()
  logging.info("Creating profile")
- map_python_types = {
- "string": str,
- "int": float,
- "float": float,
- }
 
  if num_rows > 0:
  raise ValueError("To create profiles num_rows has to be set to -1")
@@ -35,12 +31,19 @@ def create_profile(
  for c in table.columns:
  # for numerical formats we want min, max, mean, std
  if columns[c]["python_type"] in ["float", "int"]:
+ # we locally cast the column to perform the operations, using the same method as in cast_df
+ cast_col = (
+ table[c].astype(pd.Int64Dtype())
+ if columns[c]["python_type"] == "int"
+ else table[c].apply(lambda x: float_casting(x) if isinstance(x, str) else pd.NA)
+ )
  profile[c].update(
- min=prevent_nan(map_python_types[columns[c]["python_type"]](table[c].min())),
- max=prevent_nan(map_python_types[columns[c]["python_type"]](table[c].max())),
- mean=prevent_nan(map_python_types[columns[c]["python_type"]](table[c].mean())),
- std=prevent_nan(map_python_types[columns[c]["python_type"]](table[c].std())),
+ min=cast_prevent_nan(cast_col.min(), columns[c]["python_type"]),
+ max=cast_prevent_nan(cast_col.max(), columns[c]["python_type"]),
+ mean=cast_prevent_nan(cast_col.mean(), columns[c]["python_type"]),
+ std=cast_prevent_nan(cast_col.std(), columns[c]["python_type"]),
  )
+ del cast_col
  # for all formats we want most frequent values, nb unique values and nb missing values
  tops_bruts = (
  table.loc[table[c].notna(), c]
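create_profile now casts each numeric column locally (nullable Int64 for ints, float_casting for floats) before computing the statistics, instead of coercing the results through a python-type map afterwards. A toy illustration of the nullable-integer part of that cast (the data is made up; cast_prevent_nan is the package's own helper and is not reproduced here):

import pandas as pd

col = pd.Series([3, 1, None, 2])            # read as float64 with a NaN
cast_col = col.astype(pd.Int64Dtype())      # nullable integer dtype keeps the missing value
print(cast_col.min(), cast_col.max(), cast_col.mean(), cast_col.std())
# 1 3 2.0 1.0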