csv-detective 0.9.2.dev1896__py3-none-any.whl → 0.9.3.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. csv_detective/__init__.py +1 -2
  2. csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +1 -1
  3. csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +1 -1
  4. csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +1 -1
  5. csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +1 -1
  6. csv_detective/detect_fields/geo/latitude_wgs/__init__.py +1 -1
  7. csv_detective/detect_fields/geo/longitude_wgs/__init__.py +1 -1
  8. csv_detective/detect_fields/other/email/__init__.py +2 -2
  9. csv_detective/detect_fields/temp/date/__init__.py +1 -2
  10. csv_detective/detect_fields/temp/datetime_aware/__init__.py +7 -6
  11. csv_detective/detect_fields/temp/datetime_naive/__init__.py +4 -8
  12. csv_detective/detection/engine.py +1 -2
  13. csv_detective/detection/formats.py +14 -8
  14. csv_detective/detection/headers.py +2 -2
  15. csv_detective/explore_csv.py +11 -119
  16. csv_detective/load_tests.py +1 -2
  17. csv_detective/output/__init__.py +4 -5
  18. csv_detective/output/dataframe.py +1 -2
  19. csv_detective/output/example.py +12 -12
  20. csv_detective/output/schema.py +7 -86
  21. csv_detective/parsing/excel.py +2 -3
  22. csv_detective/parsing/load.py +3 -4
  23. csv_detective/utils.py +1 -2
  24. csv_detective/validate.py +4 -5
  25. {csv_detective-0.9.2.dev1896.dist-info → csv_detective-0.9.3.dev0.dist-info}/METADATA +18 -26
  26. {csv_detective-0.9.2.dev1896.dist-info → csv_detective-0.9.3.dev0.dist-info}/RECORD +33 -35
  27. tests/test_fields.py +37 -4
  28. tests/test_file.py +18 -0
  29. venv/bin/activate_this.py +1 -1
  30. csv_detective/s3_utils.py +0 -44
  31. venv/bin/jp.py +0 -54
  32. {csv_detective-0.9.2.dev1896.dist-info → csv_detective-0.9.3.dev0.dist-info}/WHEEL +0 -0
  33. {csv_detective-0.9.2.dev1896.dist-info → csv_detective-0.9.3.dev0.dist-info}/entry_points.txt +0 -0
  34. {csv_detective-0.9.2.dev1896.dist-info → csv_detective-0.9.3.dev0.dist-info}/licenses/LICENSE +0 -0
  35. {csv_detective-0.9.2.dev1896.dist-info → csv_detective-0.9.3.dev0.dist-info}/top_level.txt +0 -0
csv_detective/__init__.py CHANGED
@@ -1,7 +1,6 @@
- from csv_detective.explore_csv import routine, routine_minio, validate_then_detect
+ from csv_detective.explore_csv import routine, validate_then_detect

  __all__ = [
      "routine",
-     "routine_minio",
      "validate_then_detect",
  ]
csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py CHANGED
@@ -3,7 +3,7 @@ from frformat import LatitudeL93
  from csv_detective.detect_fields.other.float import _is as is_float
  from csv_detective.detect_fields.other.float import float_casting

- PROPORTION = 0.9
+ PROPORTION = 1

  _latitudel93 = LatitudeL93()

csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py CHANGED
@@ -1,6 +1,6 @@
  from csv_detective.detect_fields.other.float import _is as is_float

- PROPORTION = 0.9
+ PROPORTION = 1


  def _is(val):
csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py CHANGED
@@ -3,7 +3,7 @@ from frformat import LongitudeL93
  from csv_detective.detect_fields.other.float import _is as is_float
  from csv_detective.detect_fields.other.float import float_casting

- PROPORTION = 0.9
+ PROPORTION = 1

  _longitudel93 = LongitudeL93()

csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py CHANGED
@@ -1,6 +1,6 @@
  from csv_detective.detect_fields.other.float import _is as is_float

- PROPORTION = 0.9
+ PROPORTION = 1


  def _is(val):
csv_detective/detect_fields/geo/latitude_wgs/__init__.py CHANGED
@@ -1,6 +1,6 @@
  from csv_detective.detect_fields.other.float import _is as is_float

- PROPORTION = 0.9
+ PROPORTION = 1


  def _is(val):
csv_detective/detect_fields/geo/longitude_wgs/__init__.py CHANGED
@@ -1,6 +1,6 @@
  from csv_detective.detect_fields.other.float import _is as is_float

- PROPORTION = 0.9
+ PROPORTION = 1


  def _is(val):
csv_detective/detect_fields/other/email/__init__.py CHANGED
@@ -1,10 +1,10 @@
  import re

- PROPORTION = 1
+ PROPORTION = 0.9


  def _is(val):
      """Detects e-mails"""
      return isinstance(val, str) and bool(
-         re.match(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}$", val)
+         re.match(r"^[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}$", val, re.IGNORECASE)
      )
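The loosened TLD bound plus `re.IGNORECASE` is what makes upper-case addresses and long top-level domains detectable; a minimal sketch of the revised check, reusing values from the updated test suite further down this diff:

```python
import re

# pattern as introduced above: lower-case classes used with re.IGNORECASE,
# and a TLD of at least 2 characters with no upper bound
EMAIL_PATTERN = r"^[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}$"

def looks_like_email(val) -> bool:
    return isinstance(val, str) and bool(re.match(EMAIL_PATTERN, val, re.IGNORECASE))

assert looks_like_email("P.NOM@CIE.LONGDOMAIN")  # upper case + 10-char TLD, rejected by the old {2,6} bound
assert not looks_like_email("cdo@@gouv.sfd")     # double "@" is still rejected
```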
csv_detective/detect_fields/temp/date/__init__.py CHANGED
@@ -1,6 +1,5 @@
  import re
  from datetime import datetime
- from typing import Optional

  from dateparser import parse as date_parser
  from dateutil.parser import ParserError
@@ -10,7 +9,7 @@ PROPORTION = 1
  # /!\ this is only for dates, not datetimes which are handled by other utils


- def date_casting(val: str) -> Optional[datetime]:
+ def date_casting(val: str) -> datetime | None:
      """For performance reasons, we try first with dateutil and fallback on dateparser"""
      try:
          return dateutil_parser(val)
csv_detective/detect_fields/temp/datetime_aware/__init__.py CHANGED
@@ -1,24 +1,25 @@
  import re
- from typing import Any, Optional
+ from typing import Any

  from csv_detective.detect_fields.temp.date import aaaammjj_pattern, date_casting

  PROPORTION = 1
  threshold = 0.7

- # matches AAAA-MM-JJTHH:MM:SS(.dddddd)±HH:MM with any of the listed separators for the date OR NO SEPARATOR
+ # matches AAAA-MM-JJTHH:MM:SS(.dddddd)(±HH:MM|Z) with any of the listed separators for the date OR NO SEPARATOR
  pat = (
      aaaammjj_pattern.replace("$", "")
-     + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})?[+-](0\d|1[0-9]|2[0-3]):([0-5][0-9])$"
+     + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})"
+     + r"?(([+-](0\d|1[0-9]|2[0-3]):([0-5][0-9]))|Z)$"
  )


- def _is(val: Optional[Any]) -> bool:
+ def _is(val: Any | None) -> bool:
      """Detects timezone-aware datetimes only"""
      # early stops, to cut processing time
-     # 21 is the minimal length of a datetime format YYMMDDTHH:MM:SS+HH:MM
+     # 16 is the minimal length of a datetime format YYMMDDTHH:MM:SSZ
      # 32 is the maximal length of an ISO datetime format YYYY-MM-DDTHH:MM:SS.dddddd+HH:MM, keeping some slack
-     if not isinstance(val, str) or len(val) > 35 or len(val) < 21:
+     if not isinstance(val, str) or len(val) > 35 or len(val) < 16:
          return False
      # if usual format, no need to parse
      if bool(re.match(pat, val)):
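The widened pattern and the lower length floor are what let plain "Z"-suffixed timestamps be caught by the fast regex path instead of falling through to the parser. A rough, self-contained sketch of the new time/offset part — the `date_part` below is a simplified stand-in for `aaaammjj_pattern`, which is defined elsewhere in the package:

```python
import re

# simplified stand-in for the date prefix (assumption: the real aaaammjj_pattern
# also allows other separators or none at all); the time part comes from the diff above
date_part = r"^(\d{4})(-|/|_|\s)?(0[1-9]|1[0-2])(-|/|_|\s)?(0[1-9]|[12]\d|3[01])"
time_part = (
    r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})"
    r"?(([+-](0\d|1[0-9]|2[0-3]):([0-5][0-9]))|Z)$"
)
pat = date_part + time_part

# values taken from this release's test suite
assert re.match(pat, "2000-12-21 10:20:10.1Z")        # trailing "Z" is now accepted
assert re.match(pat, "2021-06-22 10:20:10-04:00")     # numeric offsets still accepted
assert not re.match(pat, "2021-06-22T30:20:10")       # invalid hour, no offset
```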
csv_detective/detect_fields/temp/datetime_naive/__init__.py CHANGED
@@ -1,5 +1,5 @@
  import re
- from typing import Any, Optional
+ from typing import Any

  from csv_detective.detect_fields.temp.date import aaaammjj_pattern, date_casting

@@ -9,11 +9,11 @@ threshold = 0.7
  # matches AAAA-MM-JJTHH:MM:SS(.dddddd)Z with any of the listed separators for the date OR NO SEPARATOR
  pat = (
      aaaammjj_pattern.replace("$", "")
-     + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})?Z$"
+     + r"(T|\s)(0\d|1[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])(.\d{1,6})?$"
  )


- def _is(val: Optional[Any]) -> bool:
+ def _is(val: Any | None) -> bool:
      """Detects naive datetimes only"""
      # early stops, to cut processing time
      # 15 is the minimal length of a datetime format YYMMDDTHH:MM:SS
@@ -26,8 +26,4 @@ def _is(val: Optional[Any]) -> bool:
      if sum([char.isdigit() or char in {"-", "/", ":", " "} for char in val]) / len(val) < threshold:
          return False
      res = date_casting(val)
-     return (
-         res is not None
-         and bool(res.hour or res.minute or res.second or res.microsecond)
-         and not bool(res.tzinfo)
-     )
+     return res is not None and not bool(res.tzinfo)
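Together with the previous file, the split is now: a trailing `Z` or numeric offset means timezone-aware, and anything date-castable without timezone info counts as naive, even at midnight. A quick usage sketch with values taken from this release's test suite (assumes csv_detective is installed):

```python
from csv_detective.detect_fields.temp import datetime_aware, datetime_naive

assert datetime_naive._is("2030/06-22 00:00:00")       # midnight alone is now enough for "naive"
assert not datetime_naive._is("1999-12-01T00:00:00Z")  # a "Z" suffix no longer counts as naive...
assert datetime_aware._is("1925 12 20 14:30:00Z")      # ...it is picked up by the aware detector
```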
csv_detective/detection/engine.py CHANGED
@@ -1,5 +1,4 @@
  from time import time
- from typing import Optional

  import magic
  import requests
@@ -16,7 +15,7 @@ engine_to_file = {
  }


- def detect_engine(file_path: str, verbose=False) -> Optional[str]:
+ def detect_engine(file_path: str, verbose=False) -> str | None:
      if verbose:
          start = time()
      mapping = {
csv_detective/detection/formats.py CHANGED
@@ -1,6 +1,5 @@
  import logging
  from collections import defaultdict
- from typing import Union

  import numpy as np
  import pandas as pd
@@ -22,7 +21,7 @@ def detect_formats(
      table: pd.DataFrame,
      analysis: dict,
      file_path: str,
-     user_input_tests: Union[str, list[str]] = "ALL",
+     user_input_tests: str | list[str] = "ALL",
      limited_output: bool = True,
      skipna: bool = True,
      verbose: bool = False,
@@ -30,7 +29,7 @@ def detect_formats(
      on_sample = len(table) > MAX_ROWS_ANALYSIS
      if on_sample:
          if verbose:
-             logging.warning(f"File is too long, analysing the {MAX_ROWS_ANALYSIS} first rows")
+             logging.warning(f"File is too long, analysing a sample of {MAX_ROWS_ANALYSIS} rows")
          table = build_sample(table)

      if table.empty:
@@ -183,13 +182,15 @@ def build_sample(table: pd.DataFrame) -> pd.DataFrame:
      samples = pd.concat(
          [
              # one row with the minimum of the column
-             table.loc[table[col] == table[col].dropna().min()].iloc[[0]]
+             table.loc[table[col] == val].iloc[[0]]
              for col in table.columns
+             if not pd.isna(val := table[col].dropna().min())
          ]
          + [
              # one row with the maximum of the column
-             table.loc[table[col] == table[col].dropna().max()].iloc[[0]]
+             table.loc[table[col] == val].iloc[[0]]
              for col in table.columns
+             if not pd.isna(val := table[col].dropna().max())
          ]
          + [
              # one row with a NaN value if the column has any
@@ -199,7 +200,12 @@ def build_sample(table: pd.DataFrame) -> pd.DataFrame:
          ],
          ignore_index=True,
      )
-     return pd.concat(
-         [samples, table.sample(n=MAX_ROWS_ANALYSIS - len(samples), random_state=1)],
-         ignore_index=True,
+     return (
+         pd.concat(
+             [samples, table.sample(n=MAX_ROWS_ANALYSIS - len(samples), random_state=1)],
+             ignore_index=True,
+         )
+         # this is very unlikely but we never know
+         if len(samples) <= MAX_ROWS_ANALYSIS
+         else samples.sample(n=MAX_ROWS_ANALYSIS, random_state=1)
      )
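The walrus guards matter for columns that contain only NaN: their min/max is itself NaN, equality with NaN never matches, and the old `.iloc[[0]]` on the resulting empty frame raised an IndexError during sampling. A small illustration, mirroring the `only_nan` column added to the tests at the end of this diff:

```python
import numpy as np
import pandas as pd

table = pd.DataFrame({"only_nan": [np.nan, np.nan], "second_col": [1, 2]})

col_min = table["only_nan"].dropna().min()
print(pd.isna(col_min))                              # True  -> column is skipped by the new guard
print(len(table.loc[table["only_nan"] == col_min]))  # 0     -> nothing to take .iloc[[0]] from
```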
csv_detective/detection/headers.py CHANGED
@@ -1,11 +1,11 @@
  import logging
  from time import time
- from typing import Optional, TextIO
+ from typing import TextIO

  from csv_detective.utils import display_logs_depending_process_time


- def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int, Optional[list]]:
+ def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int, list | None]:
      """Tests 10 first rows for possible header (in case header is not 1st row)"""
      if verbose:
          start = time()
csv_detective/explore_csv.py CHANGED
@@ -1,16 +1,11 @@
- import json
  import logging
- import os
- import tempfile
  from time import time
- from typing import Optional, Union

  import pandas as pd

  from csv_detective.detection.formats import detect_formats
- from csv_detective.output import generate_output, generate_table_schema
+ from csv_detective.output import generate_output
  from csv_detective.parsing.load import load_file
- from csv_detective.s3_utils import download_from_minio, upload_to_minio
  from csv_detective.utils import display_logs_depending_process_time, is_url
  from csv_detective.validate import validate

@@ -20,24 +15,24 @@ logging.basicConfig(level=logging.INFO)
  def routine(
      file_path: str,
      num_rows: int = 500,
-     user_input_tests: Union[str, list[str]] = "ALL",
+     user_input_tests: str | list[str] = "ALL",
      limited_output: bool = True,
-     save_results: Union[bool, str] = True,
-     encoding: Optional[str] = None,
-     sep: Optional[str] = None,
+     save_results: bool | str = True,
+     encoding: str | None = None,
+     sep: str | None = None,
      skipna: bool = True,
      output_profile: bool = False,
      output_schema: bool = False,
      output_df: bool = False,
      cast_json: bool = True,
      verbose: bool = False,
-     sheet_name: Optional[Union[str, int]] = None,
- ) -> Union[dict, tuple[dict, pd.DataFrame]]:
-     """Returns a dict with information about the csv table and possible
+     sheet_name: str | int | None = None,
+ ) -> dict | tuple[dict, pd.DataFrame]:
+     """Returns a dict with information about the table and possible
      column contents, and if requested the DataFrame with columns cast according to analysis.

      Args:
-         file_path: local path to CSV file if not using Minio
+         file_path: local path or URL to file
          num_rows: number of rows to sample from the file for analysis ; -1 for analysis
              of the whole file
          user_input_tests: tests to run on the file
@@ -111,9 +106,9 @@ def validate_then_detect(
      file_path: str,
      previous_analysis: dict,
      num_rows: int = 500,
-     user_input_tests: Union[str, list[str]] = "ALL",
+     user_input_tests: str | list[str] = "ALL",
      limited_output: bool = True,
-     save_results: Union[bool, str] = True,
+     save_results: bool | str = True,
      skipna: bool = True,
      output_profile: bool = False,
      output_schema: bool = False,
@@ -173,106 +168,3 @@ def validate_then_detect(
      display_logs_depending_process_time(
          f"Process completed in {round(time() - start_routine, 3)}s", time() - start_routine
      )
-
-
- def routine_minio(
-     csv_minio_location: dict[str, str],
-     output_minio_location: dict[str, str],
-     tableschema_minio_location: dict[str, str],
-     minio_user: str,
-     minio_pwd: str,
-     **kwargs,
- ):
-     """Returns a dict with information about the csv table and possible
-     column contents.
-
-     Args:
-         csv_minio_location: dict with Minio URL, bucket and key of the CSV file
-         output_minio_location: Minio URL, bucket and key to store output file. None if
-             not uploading to Minio.
-         tableschema_minio_location: Minio URL, bucket and key to store tableschema file.
-             None if not uploading the tableschema to Minio.
-         minio_user: user name for the minio instance
-         minio_pwd: password for the minio instance
-         kwargs: arguments for routine
-
-     Returns:
-         dict: a dict with information about the csv and possible types for each column
-     """
-
-     if (
-         (
-             any(
-                 [
-                     location_dict is not None
-                     for location_dict in [
-                         csv_minio_location,
-                         output_minio_location,
-                         tableschema_minio_location,
-                     ]
-                 ]
-             )
-         )
-         and (minio_user is None)
-         or (minio_pwd is None)
-     ):
-         raise ValueError("Minio credentials are required if using Minio")
-
-     for location_dict in [
-         csv_minio_location,
-         output_minio_location,
-         tableschema_minio_location,
-     ]:
-         if location_dict is not None:
-             if any(
-                 [
-                     (location_key not in location_dict) or (location_dict[location_key] is None)
-                     for location_key in ["netloc", "bucket", "key"]
-                 ]
-             ):
-                 raise ValueError("Minio location dict must contain url, bucket and key")
-
-     file_path = tempfile.NamedTemporaryFile(delete=False).name
-     download_from_minio(
-         netloc=csv_minio_location["netloc"],
-         bucket=csv_minio_location["bucket"],
-         key=csv_minio_location["key"],
-         filepath=file_path,
-         minio_user=minio_user,
-         minio_pwd=minio_pwd,
-     )
-
-     analysis = routine(
-         file_path,
-         save_results=True,
-         **kwargs,
-     )
-
-     # Write report JSON file.
-     output_path_to_store_minio_file = os.path.splitext(file_path)[0] + ".json"
-     with open(output_path_to_store_minio_file, "w", encoding="utf8") as fp:
-         json.dump(analysis, fp, indent=4, separators=(",", ": "))
-
-     upload_to_minio(
-         netloc=output_minio_location["netloc"],
-         bucket=output_minio_location["bucket"],
-         key=output_minio_location["key"],
-         filepath=output_path_to_store_minio_file,
-         minio_user=minio_user,
-         minio_pwd=minio_pwd,
-     )
-
-     os.remove(output_path_to_store_minio_file)
-     os.remove(file_path)
-
-     generate_table_schema(
-         analysis_report=analysis,
-         save_file=True,
-         netloc=tableschema_minio_location["netloc"],
-         bucket=tableschema_minio_location["bucket"],
-         key=tableschema_minio_location["key"],
-         minio_user=minio_user,
-         minio_pwd=minio_pwd,
-     )
-
-     return analysis
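With `routine_minio` and `s3_utils` gone (and boto3 dropped from the dependencies), object-store handling is now left to the caller. A hypothetical replacement sketch, not an official migration path — the endpoint, bucket and key names are placeholders, and boto3 must be installed separately:

```python
import tempfile

import boto3

from csv_detective import routine

# placeholder endpoint and credentials (taken from env/profile in practice)
s3 = boto3.client("s3", endpoint_url="https://minio.example.org")

with tempfile.NamedTemporaryFile(suffix=".csv") as tmp:
    s3.download_file("my-bucket", "data/resource.csv", tmp.name)  # fetch the object locally
    analysis = routine(
        tmp.name,
        num_rows=-1,                   # analyse the whole file
        save_results="analysis.json",  # save_results accepts a path per the bool | str signature
        output_schema=True,            # embed the table schema in the report
    )
```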
csv_detective/load_tests.py CHANGED
@@ -1,5 +1,4 @@
  import os
- from typing import Union

  from csv_detective import detect_fields, detect_labels # noqa

@@ -18,7 +17,7 @@ def get_all_packages(detect_type) -> list:


  def return_all_tests(
-     user_input_tests: Union[str, list],
+     user_input_tests: str | list,
      detect_type: str,
  ) -> list:
      """
csv_detective/output/__init__.py CHANGED
@@ -1,6 +1,5 @@
  import json
  import os
- from typing import Optional, Union

  import pandas as pd

@@ -17,14 +16,14 @@ def generate_output(
      file_path: str,
      num_rows: int = 500,
      limited_output: bool = True,
-     save_results: Union[bool, str] = True,
+     save_results: bool | str = True,
      output_profile: bool = False,
      output_schema: bool = False,
      output_df: bool = False,
      cast_json: bool = True,
      verbose: bool = False,
-     sheet_name: Optional[Union[str, int]] = None,
- ) -> Union[dict, tuple[dict, pd.DataFrame]]:
+     sheet_name: str | int | None = None,
+ ) -> dict | tuple[dict, pd.DataFrame]:
      if output_profile:
          analysis["profile"] = create_profile(
              table=table,
@@ -51,7 +50,7 @@ def generate_output(
      )

      if output_schema:
-         analysis["schema"] = generate_table_schema(analysis, save_file=False, verbose=verbose)
+         analysis["schema"] = generate_table_schema(analysis, save_results=False, verbose=verbose)

      if output_df:
          return analysis, cast_df(
csv_detective/output/dataframe.py CHANGED
@@ -1,7 +1,6 @@
  import json
  from datetime import date, datetime
  from time import time
- from typing import Optional, Union

  import pandas as pd

@@ -11,7 +10,7 @@ from csv_detective.detect_fields.temp.date import date_casting
  from csv_detective.utils import display_logs_depending_process_time


- def cast(value: str, _type: str) -> Optional[Union[str, float, bool, date, datetime]]:
+ def cast(value: str, _type: str) -> str | float | bool | date | datetime | None:
      if not isinstance(value, str) or not value:
          # None is the current default value in hydra, should we keep this?
          return None
csv_detective/output/example.py CHANGED
@@ -3,7 +3,7 @@ import random
  import string
  import uuid
  from datetime import datetime
- from typing import Any, Optional, Type, Union
+ from typing import Any, Type

  import pandas as pd
  import requests
@@ -14,10 +14,10 @@ fake = Faker()


  def create_example_csv_file(
-     fields: Optional[dict] = None,
-     schema_path: Optional[str] = None,
+     fields: dict | None = None,
+     schema_path: str | None = None,
      file_length: int = 10,
-     output_name: Optional[str] = "example_file.csv",
+     output_name: str | None = "example_file.csv",
      output_sep: str = ";",
      encoding: str = "utf-8",
      ignore_required: bool = False,
@@ -49,8 +49,8 @@ def create_example_csv_file(
      def _string(
          length: int = 10,
          required: bool = True,
-         pattern: Optional[str] = None,
-         enum: Optional[str] = None,
+         pattern: str | None = None,
+         enum: str | None = None,
      ) -> str:
          if potential_skip(required):
              return ""
@@ -70,7 +70,7 @@ def create_example_csv_file(
          return str(uuid.uuid4())

      def _date(
-         date_range: Optional[list[str]] = None,
+         date_range: list[str] | None = None,
          format: str = "%Y-%m-%d",
          required: bool = True,
      ) -> str:
@@ -99,7 +99,7 @@ def create_example_csv_file(
          return fake.time(format)

      def _datetime(
-         datetime_range: Optional[list[str]] = None,
+         datetime_range: list[str] | None = None,
          format: str = "%Y-%m-%d %H-%M-%S",
          required: bool = True,
      ) -> str:
@@ -123,11 +123,11 @@ def create_example_csv_file(
          return f"http://{rstr.domainsafe()}.{rstr.letters(3)}/{rstr.urlsafe()}"

      def _number(
-         num_type: Type[Union[int, float]] = int,
-         num_range: Optional[list[float]] = None,
-         enum: Optional[list] = None,
+         num_type: Type[int | float] = int,
+         num_range: list[float] | None = None,
+         enum: list | None = None,
          required: bool = True,
-     ) -> Union[int, float]:
+     ) -> int | float:
          assert num_range is None or len(num_range) == 2
          if potential_skip(required):
              return ""
csv_detective/output/schema.py CHANGED
@@ -1,14 +1,8 @@
  import json
  import logging
- import os
- import tempfile
  from datetime import datetime
  from time import time
- from typing import Optional

- from botocore.exceptions import ClientError
-
- from csv_detective.s3_utils import download_from_minio, get_s3_client, upload_to_minio
  from csv_detective.utils import display_logs_depending_process_time


@@ -202,25 +196,14 @@ def get_constraints(format: str) -> dict:

  def generate_table_schema(
      analysis_report: dict,
-     save_file: bool,
-     netloc: Optional[str] = None,
-     bucket: Optional[str] = None,
-     key: Optional[str] = None,
-     minio_user: Optional[str] = None,
-     minio_pwd: Optional[str] = None,
+     save_results: bool | str = True,
      verbose: bool = False,
  ) -> dict:
      """Generates a table schema from the analysis report

      Args:
          analysis_report (dict): The analysis report from csv_detective
-         save_file (bool): indicate if schema should be saved into minio or just returned
-         netloc (str): The netloc of the minio instance to upload the tableschema
-         bucket (str): The bucket to save the schema in
-         key (str): The key to save the schema in (without extension as we will append
-             version number and extension)
-         minio_user (str): The minio user
-         minio_pwd (str): The minio password
+         save_results (bool or str): whether and where to save the results

      Returns:
      """
@@ -277,71 +260,9 @@ def generate_table_schema(
          f"Created schema in {round(time() - start, 3)}s", time() - start
      )

-     if not save_file:
-         return schema
-
-     if save_file:
-         if not all([netloc, key, bucket, minio_user, minio_pwd]):
-             raise Exception(
-                 "To save schema into minio, parameters : netloc, key, bucket, "
-                 "minio_user, minio_pwd should be provided"
-             )
-
-         # Create bucket if does not exist
-         client = get_s3_client(netloc, minio_user, minio_pwd)
-         try:
-             client.head_bucket(Bucket=bucket)
-         except ClientError:
-             client.create_bucket(Bucket=bucket)
-
-         tableschema_objects = client.list_objects(Bucket=bucket, Prefix=key, Delimiter="/")
-         if "Contents" in tableschema_objects:
-             tableschema_keys = [
-                 tableschema["Key"]
-                 for tableschema in client.list_objects(Bucket=bucket, Prefix=key, Delimiter="/")[
-                     "Contents"
-                 ]
-             ]
-             tableschema_versions = [
-                 os.path.splitext(tableschema_key)[0].split("_")[-1]
-                 for tableschema_key in tableschema_keys
-             ]
-             latest_version = max(tableschema_versions)
+     if save_results:
+         output_path = save_results if isinstance(save_results, str) else "schema.json"
+         with open(output_path, "w", encoding="utf8") as fp:
+             json.dump(schema, fp, indent=4, separators=(",", ": "), ensure_ascii=False, default=str)

-             with tempfile.NamedTemporaryFile() as latest_schema_file:
-                 with open(latest_schema_file.name, "w") as fp:
-                     download_from_minio(
-                         netloc,
-                         bucket,
-                         f"{key}_{latest_version}.json",
-                         latest_schema_file.name,
-                         minio_user,
-                         minio_pwd,
-                     )
-                 # Check if files are different
-                 with open(latest_schema_file.name, "r") as fp:
-                     latest_schema = json.load(fp)
-                 if latest_schema["fields"] != fields:
-                     latest_version_split = latest_version.split(".")
-                     new_version = (
-                         latest_version_split[0]
-                         + "."
-                         + latest_version_split[1]
-                         + "."
-                         + str(int(latest_version_split[2]) + 1)
-                     )
-                 else:
-                     return None
-
-             schema["version"] = new_version
-
-             tableschema_file = tempfile.NamedTemporaryFile(delete=False)
-             with open(tableschema_file.name, "w") as fp:
-                 json.dump(schema, fp, indent=4)
-
-             new_version_key = f"{key}_{new_version}.json"
-             upload_to_minio(
-                 netloc, bucket, new_version_key, tableschema_file.name, minio_user, minio_pwd
-             )
-             os.unlink(tableschema_file.name)
-             return {"netloc": netloc, "bucket": bucket, "key": new_version_key}
+     return schema
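`generate_table_schema` now follows the same `save_results` convention as the rest of the API: `False` only returns the schema dict, `True` writes `schema.json` in the current working directory, and a string is treated as the output path. A short usage sketch (the file names are illustrative):

```python
from csv_detective import routine
from csv_detective.output.schema import generate_table_schema

analysis = routine("data.csv", save_results=False)  # "data.csv" is a placeholder

schema = generate_table_schema(analysis, save_results=False)             # returned only
schema = generate_table_schema(analysis, save_results="my_schema.json")  # also written to the given path
```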
csv_detective/parsing/excel.py CHANGED
@@ -1,6 +1,5 @@
  from io import BytesIO
  from time import time
- from typing import Optional

  import openpyxl
  import pandas as pd
@@ -23,8 +22,8 @@ XLS_LIKE_EXT = NEW_EXCEL_EXT + OLD_EXCEL_EXT + OPEN_OFFICE_EXT
  def parse_excel(
      file_path: str,
      num_rows: int = -1,
-     engine: Optional[str] = None,
-     sheet_name: Optional[str] = None,
+     engine: str | None = None,
+     sheet_name: str | None = None,
      random_state: int = 42,
      verbose: bool = False,
  ) -> tuple[pd.DataFrame, int, int, str, str, int]:
csv_detective/parsing/load.py CHANGED
@@ -1,5 +1,4 @@
  from io import BytesIO, StringIO
- from typing import Optional, Union

  import pandas as pd
  import requests
@@ -26,10 +25,10 @@ from .excel import (
  def load_file(
      file_path: str,
      num_rows: int = 500,
-     encoding: Optional[str] = None,
-     sep: Optional[str] = None,
+     encoding: str | None = None,
+     sep: str | None = None,
      verbose: bool = False,
-     sheet_name: Optional[Union[str, int]] = None,
+     sheet_name: str | int | None = None,
  ) -> tuple[pd.DataFrame, dict]:
      file_name = file_path.split("/")[-1]
      engine = None
csv_detective/utils.py CHANGED
@@ -1,5 +1,4 @@
  import logging
- from typing import Optional, Union

  import pandas as pd

@@ -31,7 +30,7 @@ def is_url(file_path: str) -> bool:
      return file_path.startswith("http")


- def cast_prevent_nan(value: float, _type: str) -> Optional[Union[float, int]]:
+ def cast_prevent_nan(value: float, _type: str) -> float | int | None:
      if _type not in {"int", "float"}:
          raise ValueError(f"Invalid type was passed: {_type}")
      return None if pd.isna(value) else eval(_type)(value)
csv_detective/validate.py CHANGED
@@ -1,5 +1,4 @@
  import logging
- from typing import Optional, Union

  import pandas as pd

@@ -22,12 +21,12 @@ def validate(
      file_path: str,
      previous_analysis: dict,
      num_rows: int = 500,
-     encoding: Optional[str] = None,
-     sep: Optional[str] = None,
+     encoding: str | None = None,
+     sep: str | None = None,
      verbose: bool = False,
      skipna: bool = True,
-     sheet_name: Optional[Union[str, int]] = None,
- ) -> tuple[bool, Optional[pd.DataFrame], Optional[dict]]:
+     sheet_name: str | int | None = None,
+ ) -> tuple[bool, pd.DataFrame | None, dict | None]:
      """
      Verify is the given file has the same fields and types as in the previous analysis.
      """
{csv_detective-0.9.2.dev1896.dist-info → csv_detective-0.9.3.dev0.dist-info}/METADATA CHANGED
@@ -1,15 +1,14 @@
  Metadata-Version: 2.4
  Name: csv-detective
- Version: 0.9.2.dev1896
+ Version: 0.9.3.dev0
  Summary: Detect tabular files column content
  Author-email: Etalab <opendatateam@data.gouv.fr>
  License: MIT
  Project-URL: Source, https://github.com/datagouv/csv_detective
  Keywords: CSV,data processing,encoding,guess,parser,tabular
- Requires-Python: <3.14,>=3.9
+ Requires-Python: <3.14,>=3.10
  Description-Content-Type: text/markdown
  License-File: LICENSE
- Requires-Dist: boto3<2,>=1.34.0
  Requires-Dist: dateparser<2,>=1.2.0
  Requires-Dist: faust-cchardet==2.1.19
  Requires-Dist: pandas<3,>=2.2.0
@@ -26,7 +25,6 @@ Requires-Dist: rstr==3.2.2
  Provides-Extra: dev
  Requires-Dist: pytest>=8.3.0; extra == "dev"
  Requires-Dist: responses>=0.25.0; extra == "dev"
- Requires-Dist: bumpx>=0.3.10; extra == "dev"
  Requires-Dist: ruff>=0.9.3; extra == "dev"
  Dynamic: license-file

@@ -221,32 +219,26 @@ ruff check --fix .
  ruff format .
  ```

- ## Release
+ ### 🏷️ Release

- The release process uses `bumpx`.
+ The release process uses the [`tag_version.sh`](tag_version.sh) script to create git tags and update [CHANGELOG.md](CHANGELOG.md) and [pyproject.toml](pyproject.toml) automatically.

- ```shell
- pip install -e .[dev]
- ```
-
- ### Process
-
- 1. `bumpx` will handle bumping the version according to your command (patch, minor, major)
- 2. It will update the CHANGELOG according to the new version being published
- 3. It will push a tag with the given version to github
- 4. CircleCI will pickup this tag, build the package and publish it to pypi
- 5. `bumpx` will have everything ready for the next version (version, changelog...)
+ ```bash
+ # Create a new release
+ ./tag_version.sh <version>

- ### Dry run
+ # Example
+ ./tag_version.sh 2.5.0

- ```shell
- bumpx -d -v
+ # Dry run to see what would happen
+ ./tag_version.sh 2.5.0 --dry-run
  ```

- ### Release
-
- This will release a patch version:
+ **Prerequisites**: GitHub CLI (`gh`) must be installed and authenticated, and you must be on the main branch with a clean working directory.

- ```shell
- bumpx -v
- ```
+ The script automatically:
+ - Updates the version in pyproject.toml
+ - Extracts commits since the last tag and formats them for CHANGELOG.md
+ - Identifies breaking changes (commits with `!:` in the subject)
+ - Creates a git tag and pushes it to the remote repository
+ - Creates a GitHub release with the changelog content
{csv_detective-0.9.2.dev1896.dist-info → csv_detective-0.9.3.dev0.dist-info}/RECORD CHANGED
@@ -1,10 +1,9 @@
- csv_detective/__init__.py,sha256=FsL6q5F-gKLMnWy05-1CJpa4cz9tquheZ2LS1tjkVgI,162
+ csv_detective/__init__.py,sha256=qvjDQBcw1ZIpapIrdGg1IUjBJ1q5KPhQda_05fevleg,126
  csv_detective/cli.py,sha256=mu5anmBmaDk52_uZGiA4T37wYZCuV43gZAepjs1Cqzc,1389
- csv_detective/explore_csv.py,sha256=sEMza4Z27ac88fGq7tUiK1zlfvuftztHhHVoa0c2EVU,9191
- csv_detective/load_tests.py,sha256=uVKweLq3cf-yB5ZZI-m9tBVs_SWNcOw8sDJa97TOJGo,2266
- csv_detective/s3_utils.py,sha256=z1KTVVkdurMv21o-rZu7_aluMJnSi-d5uxnQbqT2NoI,1407
- csv_detective/utils.py,sha256=xiIO7ZDqkTm9Rnhnq6RaDdnrPIfoG0JV9AsmaOG6plA,1162
- csv_detective/validate.py,sha256=RLHXLrRuynkdcvHUlSEbyglPvdbNYlT1Z4nQI-BdYdA,2898
+ csv_detective/explore_csv.py,sha256=uXMFu_IIsRh8ky_PfdPTDVco_j4jSDahzMW6rnjXveE,5726
+ csv_detective/load_tests.py,sha256=75iCxSlIeLUT-nH1fTaSjLofIPJ2AIBczkIZWaO_mkw,2234
+ csv_detective/utils.py,sha256=RJ_zFOJ1DRY8HtDrKPiCdNk5gU6-KwOrOKOyfSkBZZY,1118
+ csv_detective/validate.py,sha256=bC9-OWH9lU45Ibma-QryvOdmcncDUBiNk0G2NADrjmQ,2841
  csv_detective/detect_fields/__init__.py,sha256=ZZ7u9zsMtCqPC2xxeLp57UTCbqpKFJi6D_LO1ew15BU,1980
  csv_detective/detect_fields/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  csv_detective/detect_fields/FR/geo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -17,10 +16,10 @@ csv_detective/detect_fields/FR/geo/code_region/__init__.py,sha256=9pR2tVS4J2Kryt
  csv_detective/detect_fields/FR/geo/commune/__init__.py,sha256=5vw4zjlmWaR2djxuQOUrmwsNIc9HgAE-zdxwerVR3S0,380
  csv_detective/detect_fields/FR/geo/departement/__init__.py,sha256=UsMEW1EVVgnw-daOc1jBkEaGKvqTONSAGnj1s3QgM8w,400
  csv_detective/detect_fields/FR/geo/insee_canton/__init__.py,sha256=YsAGiblFexBxvu_E3XaXhy_bordc6c1oKPgDzTsDeXw,374
- csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py,sha256=RjkDSZzIbp4nnvDpa5GomDpyIJGvwErX7TgC4dlBJ14,437
- csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py,sha256=7xmYpTYoHvFfcuocAhm6dP_j4sMII_hG1PMSrWId4FY,344
- csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py,sha256=JbKuGK5UmUGAQKPFpN4RSLf3axJ5D1aCjzRXYHW-iXU,441
- csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py,sha256=5VWDaHZvGhJAJu5XQrj6gLx5CVA9dNOE30eTXQ3pSf0,344
+ csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py,sha256=7ONo0MxrJY1gPWRwyPCX4ZDbCINmxnKRV85zscADxT8,435
+ csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py,sha256=lIgWebNapfrnPt0XeNUMs78Xa_csGNAtTk8VEk9wXXo,342
+ csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py,sha256=YXTWSymmcXW9eD2OfiSlmX7N-IUtZkDrNYHd6vTnJTc,439
+ csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py,sha256=7tZ8sgIkQ9zuSOZ-vGYBkH04Vv1xlPlJDM78xYfD57Y,342
  csv_detective/detect_fields/FR/geo/pays/__init__.py,sha256=85y-5qNRAWJrKqL0wh9iPMUBQjvPwc9lv1cYB2m0daQ,364
  csv_detective/detect_fields/FR/geo/region/__init__.py,sha256=6mJRaGsCPBY5JHHe8EWxEjDpAOIfvBPTaZKJb3_n3gU,1077
  csv_detective/detect_fields/FR/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -50,13 +49,13 @@ csv_detective/detect_fields/geo/iso_country_code_alpha3/iso_country_code_alpha3.
  csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py,sha256=AnAridM4C8hcm4PeNdr8969czgrzM4KemGVZWAJSM1U,436
  csv_detective/detect_fields/geo/iso_country_code_numeric/iso_country_code_numeric.txt,sha256=2GtEhuporsHYV-pU4q9kfXU5iOtfW5C0GYBTTKQtnnA,1004
  csv_detective/detect_fields/geo/json_geojson/__init__.py,sha256=6wlwlxQmsVIZ21g-THvH3nBj-I8FuoF2sBlZAoEMGiQ,393
- csv_detective/detect_fields/geo/latitude_wgs/__init__.py,sha256=sdor-L1WDHv5opg1Le13mru4ImSA-yEbxchlWENuUFE,327
+ csv_detective/detect_fields/geo/latitude_wgs/__init__.py,sha256=PI-wlTJmPk6nznzu_Fou_SSCET90wIf78mXwb1W1K70,325
  csv_detective/detect_fields/geo/latlon_wgs/__init__.py,sha256=IXDTqD4YFUJYI1FYZ5ZfkqXY6KvNY7sgBVFRAvgTHtI,454
- csv_detective/detect_fields/geo/longitude_wgs/__init__.py,sha256=gPnNTe-L9xjBVE-30VCJiK6IVZttj6Cy6zu1IL5907Y,330
+ csv_detective/detect_fields/geo/longitude_wgs/__init__.py,sha256=B7YFfvEI48DfAn8xbc-vpVERQaKh9_59ERfieo2D6OY,328
  csv_detective/detect_fields/geo/lonlat_wgs/__init__.py,sha256=CnBMYevfGdhBvureF3oc_zqT-RZjG419iAuUlugQFLc,454
  csv_detective/detect_fields/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  csv_detective/detect_fields/other/booleen/__init__.py,sha256=37ZUJACrZA9FQBYLDeVJGze7_I9x-ZWv5yWuBcqHcwI,497
- csv_detective/detect_fields/other/email/__init__.py,sha256=p235wILf0fR9TeSEuyuPgoysAv9zg23a4vzdy3YJlxE,192
+ csv_detective/detect_fields/other/email/__init__.py,sha256=O09KVBDsI-_wOecOlqyrtgr8V1ubPqB5EwPhOCxVLJ8,199
  csv_detective/detect_fields/other/float/__init__.py,sha256=AT4Kpgwoz5PuAoLx00u0SL8DjjXZxsE8zSRbN18uAv4,578
  csv_detective/detect_fields/other/int/__init__.py,sha256=4SQAgaYTafeBL6hdT7Wp_xwcRNQsOWlYjaXKl78EuDw,320
  csv_detective/detect_fields/other/json/__init__.py,sha256=AkRWZAidEM1dWkVRFThEBI5M7kMUu5Yu12iCViGM8lU,310
@@ -67,9 +66,9 @@ csv_detective/detect_fields/other/twitter/__init__.py,sha256=Npu6ZbyNfHq1y7xn0Gd
  csv_detective/detect_fields/other/url/__init__.py,sha256=L7h9fZldh1w86XwCx0x3Q1TXSJ_nIId1C-l1yFzZYrA,299
  csv_detective/detect_fields/other/uuid/__init__.py,sha256=XFxbIsdIhRw0dtFxBXQBhicE4yy7P4jmwYXeJhq6FVY,215
  csv_detective/detect_fields/temp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- csv_detective/detect_fields/temp/date/__init__.py,sha256=JtWaK8hkzBaIUc-fu0G7lIFpWqCfraRh6l0Mo65U3b0,2155
- csv_detective/detect_fields/temp/datetime_aware/__init__.py,sha256=ZDNUcbU0ZJzaxUt0Utc1Y9dRrq4HHW9uCbcnOuz5Sfk,1247
- csv_detective/detect_fields/temp/datetime_naive/__init__.py,sha256=QoVOA98lT_GVSGO_mQwKtAy2o-REs8C9d6JB9d_L_B4,1189
+ csv_detective/detect_fields/temp/date/__init__.py,sha256=j066luXADCti4Mbb-jvznrL1jf3p5TpEpVzW8vThRDE,2124
+ csv_detective/detect_fields/temp/datetime_aware/__init__.py,sha256=oDaZIhkW0SXSYeuK5R5TIzajvSmu-XjUn8GpqITFLnY,1250
+ csv_detective/detect_fields/temp/datetime_naive/__init__.py,sha256=z5wpuHiDl8j7ZeQjfZ5wO9lG6H9Ps6X218ANNw19Dag,1073
  csv_detective/detect_fields/temp/datetime_rfc822/__init__.py,sha256=-pFdIIPgaLq2_QbFJ9zwy4YIwZuC73F0A_cNDntTuvQ,512
  csv_detective/detect_fields/temp/year/__init__.py,sha256=gHchVciZExbGZLMBcbBaDXB0IgGptkQc4RhfSOMY0Ww,194
  csv_detective/detect_labels/__init__.py,sha256=93s93DRNeFw9fJiGp0rW3iRWZX3WOeVau2PAaF4QlPE,1777
@@ -131,38 +130,37 @@ csv_detective/detect_labels/temp/year/__init__.py,sha256=7uWaCZY7dOG7nolW46IgBWm
  csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvcaGd0Go,2887
  csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
- csv_detective/detection/engine.py,sha256=1Z4vzjxwPRZ9-vv8nw-zU2sgBZtOsEz0UoKjGaSwVJU,1543
- csv_detective/detection/formats.py,sha256=YFFEJHhlMw7IMtbotpam1qYt07djnYMHd8j6AvOA3XA,7459
- csv_detective/detection/headers.py,sha256=y5iR4jWH5fUtAH_Zg0zxWSVG_INCHlXJFMbhPpI2YMo,1148
+ csv_detective/detection/engine.py,sha256=NpWUgqsNXogBnVclPYccqJZVtDd780houVY-YIMr5c0,1511
+ csv_detective/detection/formats.py,sha256=QXdxdECU5uC_ytLBT_6-xe0VAiaMptXF4KYiShRUVCA,7702
+ csv_detective/detection/headers.py,sha256=hvYU13Nq8GWci5skc5vVUOxM0DwOUwbjVMlmY94lWhA,1135
  csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
  csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
  csv_detective/detection/variables.py,sha256=wfsA_MOk14TPMOY7gkvpTGpo9-USzMnFaAou3MPHqxc,3536
- csv_detective/output/__init__.py,sha256=bMsLp-XCVf4sNymIof_kdMdqFIY7GocOas-lPNekfQg,1930
- csv_detective/output/dataframe.py,sha256=pjxvpzIWVUW9_xvT3JjoPnOIVUUHnzL7kZo1xQdMDxQ,2139
- csv_detective/output/example.py,sha256=XrnPS_uC0cICn7tgnLWNctpUbnPzl7fIMzNTzJEWGJc,8655
+ csv_detective/output/__init__.py,sha256=3g6aR6tg1WM-bPFrAdSPSFbNEj2y7tnZiAC_DAhw9_Q,1876
+ csv_detective/output/dataframe.py,sha256=Ao7hyfkyQxpmQ9PGBq4bFYJnJaURczl10H7q0oUcYEw,2097
+ csv_detective/output/example.py,sha256=R7nxBBawM6KT9nipO7PAAc2zaIXjY-YxzWTd1NqK4xA,8599
  csv_detective/output/profile.py,sha256=thckCcfy9cES5yYNW6TDGV82gP1OFWJuLhInT1g7JpI,2814
- csv_detective/output/schema.py,sha256=5Duw5qnsJ-LaVC6JgF7p1zZAkehDzsbXA4iTSJUgLNM,13760
+ csv_detective/output/schema.py,sha256=vXPlEw44zRR4GcYd-PQ_R_qXeCaefEDxW2XmprdNP_c,10453
  csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
  csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  csv_detective/parsing/columns.py,sha256=HRHJBJ1gftuheegJHzhQmg-u83pVAXXuQ9GKR34mKgk,5696
  csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
  csv_detective/parsing/csv.py,sha256=fJkjKvyk7InkNnYKtmivyi48mmcwvrha7gvZ5J4-86A,1588
- csv_detective/parsing/excel.py,sha256=sKD5PRN1TlzPPOKFnZ3VRb0r1yIjPLlpxVWmZQeLYFk,7027
- csv_detective/parsing/load.py,sha256=C3M8nvgWenOb8aDFi5dpDGCoAw9EBqr4EB63zbz2M14,3699
+ csv_detective/parsing/excel.py,sha256=oAVTuoDccJc4-kVjHXiIPLQx3lq3aZRRZQxkG1c06JQ,6992
+ csv_detective/parsing/load.py,sha256=-pQlwOPTYVpvgt21ERa4K9ObcLozWBJbZ3kWO1U0wkE,3648
  csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
- csv_detective-0.9.2.dev1896.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
+ csv_detective-0.9.3.dev0.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  tests/test_example.py,sha256=uTWswvUzBWEADGXZmMAdZvKhKvIjvT5zWOVVABgCDN4,1987
- tests/test_fields.py,sha256=5901OxKDReGMPQm3ZJ36oDjtJ8H3El5jPxf1YNu5wVg,12542
- tests/test_file.py,sha256=ZL0Jx499RUpmKFvcPQVnAeafSbyc23fqwt93ZrYg9GE,10258
+ tests/test_fields.py,sha256=R6r6dcUwPx9XWIoc1xH4z0HlCnTj_bmxw91H5Gfqq5I,13762
+ tests/test_file.py,sha256=QEBv69P0bAKWBzhQ3KKOR1Z1RQSf5CVEilqBojwP2Yc,10791
  tests/test_labels.py,sha256=Y0XlOpztCyV65pk7iAS_nMMfdysoBujlBmz10vHul9A,469
  tests/test_structure.py,sha256=GRDYKy0UcdqlN4qglzsRC0puFj5cb-SVvONjvcPvtAA,1400
  tests/test_validation.py,sha256=ie-Xf0vk6-M6GQq-x7kY5yse1EmXfxQkbaV7fR3fvYo,3308
- venv/bin/activate_this.py,sha256=NRy3waFmwW1pOaNUp33wNN0vD1Kzkd-zXX-Sgl4EiVI,1286
- venv/bin/jp.py,sha256=7z7dvRg0M7HzpZG4ssQID7nScjvQx7bcYTxJWDOrS6E,1717
+ venv/bin/activate_this.py,sha256=wS7qPipy8R-dS_0ICD8PqqUQ8F-PrtcpiJw2DUPngYM,1287
  venv/bin/runxlrd.py,sha256=YlZMuycM_V_hzNt2yt3FyXPuwouMCmMhvj1oZaBeeuw,16092
- csv_detective-0.9.2.dev1896.dist-info/METADATA,sha256=2ZrcsJkSf2uY3pxlmwvui5uFbicmYpa8nDnxmkp4-xM,9767
- csv_detective-0.9.2.dev1896.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- csv_detective-0.9.2.dev1896.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
- csv_detective-0.9.2.dev1896.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
- csv_detective-0.9.2.dev1896.dist-info/RECORD,,
+ csv_detective-0.9.3.dev0.dist-info/METADATA,sha256=Xga9fj8KjfrMOhp5ZIoXsJLcAI2Jz31yNsdfFJca2DU,9928
+ csv_detective-0.9.3.dev0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ csv_detective-0.9.3.dev0.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
+ csv_detective-0.9.3.dev0.dist-info/top_level.txt,sha256=cYKb4Ok3XgYA7rMDOYtxysjSJp_iUA9lJjynhVzue8g,30
+ csv_detective-0.9.3.dev0.dist-info/RECORD,,
tests/test_fields.py CHANGED
@@ -79,6 +79,7 @@ from csv_detective.detection.variables import (
  from csv_detective.load_tests import return_all_tests
  from csv_detective.output.dataframe import cast
  from csv_detective.output.utils import prepare_output_dict
+ from csv_detective.parsing.columns import test_col as col_test  # to prevent pytest from testing it


  def test_all_tests_return_bool():
@@ -284,7 +285,7 @@ fields = {
          False: ["nein", "ja", "2", "-0"],
      },
      email: {
-         True: ["cdo_intern@data.gouv.fr"],
+         True: ["cdo_intern@data.gouv.fr", "P.NOM@CIE.LONGDOMAIN"],
          False: ["cdo@@gouv.sfd"],
      },
      json: {
@@ -356,17 +357,25 @@ fields = {
          True: [
              "2021-06-22 10:20:10-04:00",
              "2030-06-22 00:00:00.0028+02:00",
+             "2000-12-21 10:20:10.1Z",
              "2024-12-19T10:53:36.428000+00:00",
              "1996/06/22 10:20:10 GMT",
          ],
          False: ["2021-06-22T30:20:10", "Sun, 06 Nov 1994 08:49:37 GMT", "2021-06-44 10:20:10"],
      },
      datetime_naive: {
-         True: ["2021-06-22 10:20:10", "2030/06/22 00:00:00.0028"],
+         True: [
+             "2021-06-22 10:20:10",
+             "2030/06-22 00:00:00",
+             "2030/06/22 00:00:00.0028",
+         ],
          False: [
              "2021-06-22T30:20:10",
              "Sun, 06 Nov 1994 08:49:37 GMT",
              "2021-06-44 10:20:10+02:00",
+             "1999-12-01T00:00:00Z",
+             "2021-06-44",
+             "15 décembre 1985",
          ],
      },
      datetime_rfc822: {
@@ -451,8 +460,8 @@ def test_priority(args):
      ("28/01/2000", date),
      ("2025-08-20T14:30:00+02:00", datetime_aware),
      ("2025/08/20 14:30:00.2763-12:00", datetime_aware),
-     ("1925_12_20T14:30:00.2763Z", datetime_naive),
-     ("1925 12 20 14:30:00Z", datetime_naive),
+     ("1925_12_20T14:30:00.2763", datetime_naive),
+     ("1925 12 20 14:30:00Z", datetime_aware),
      ),
  )
  def test_early_detection(args):
@@ -461,3 +470,27 @@ def test_early_detection(args):
      res = module._is(value)
      assert res
      mock_func.assert_not_called()
+
+
+ def test_all_proportion_1():
+     all_tests = return_all_tests("ALL", "detect_fields")
+     prop_1 = {
+         t.__name__.split(".")[-1]: eval(
+             t.__name__.split(".")[-1]
+             if t.__name__.split(".")[-1] not in ["int", "float"]
+             else "test_" + t.__name__.split(".")[-1]
+         )
+         for t in all_tests
+         if t.PROPORTION == 1
+     }
+     # building a table that uses only correct values for these formats, except on one row
+     table = pd.DataFrame(
+         {
+             test_name: (fields[test_module][True] * 100)[:100] + ["not_suitable"]
+             for test_name, test_module in prop_1.items()
+         }
+     )
+     # testing columns for all formats
+     returned_table = col_test(table, all_tests, limited_output=True)
+     # the analysis should have found no match on any format
+     assert all(returned_table[col].sum() == 0 for col in table.columns)
tests/test_file.py CHANGED
@@ -6,6 +6,7 @@ import responses

  from csv_detective import routine
  from csv_detective.output.profile import create_profile
+ from csv_detective.parsing.columns import MAX_ROWS_ANALYSIS


  @pytest.mark.parametrize(
@@ -343,3 +344,20 @@ def test_almost_uniform_column(mocked_responses):
          save_results=False,
      )
      assert analysis["columns"][col_name]["format"] == "int"
+
+
+ def test_full_nan_column(mocked_responses):
+     # we want a file that needs sampling
+     expected_content = "only_nan,second_col\n" + ",1\n" * (MAX_ROWS_ANALYSIS + 1)
+     mocked_responses.get(
+         "http://example.com/test.csv",
+         body=expected_content,
+         status=200,
+     )
+     # just testing it doesn't fail
+     routine(
+         file_path="http://example.com/test.csv",
+         num_rows=-1,
+         output_profile=False,
+         save_results=False,
+     )
venv/bin/activate_this.py CHANGED
@@ -29,7 +29,7 @@ os.environ["VIRTUAL_ENV_PROMPT"] = '' or os.path.basename(base)

  # add the virtual environments libraries to the host python import mechanism
  prev_length = len(sys.path)
- for lib in '../lib/python3.9/site-packages'.split(os.pathsep):
+ for lib in '../lib/python3.11/site-packages'.split(os.pathsep):
      path = os.path.realpath(os.path.join(bin_dir, lib))
      site.addsitedir(path.decode("utf-8") if '' else path)
  sys.path[:] = sys.path[prev_length:] + sys.path[0:prev_length]
csv_detective/s3_utils.py DELETED
@@ -1,44 +0,0 @@
- import logging
-
- import boto3
- from botocore.client import Config
- from botocore.exceptions import ClientError
-
-
- def get_minio_url(netloc: str, bucket: str, key: str) -> str:
-     """Returns location of given resource in minio once it is saved"""
-     return netloc + "/" + bucket + "/" + key
-
-
- def get_s3_client(url: str, minio_user: str, minio_pwd: str) -> boto3.client:
-     return boto3.client(
-         "s3",
-         endpoint_url=url,
-         aws_access_key_id=minio_user,
-         aws_secret_access_key=minio_pwd,
-         config=Config(signature_version="s3v4"),
-     )
-
-
- def download_from_minio(
-     netloc: str, bucket: str, key: str, filepath: str, minio_user: str, minio_pwd: str
- ) -> None:
-     logging.info("Downloading from minio")
-     s3 = get_s3_client(netloc, minio_user, minio_pwd)
-     try:
-         s3.download_file(bucket, key, filepath)
-         logging.info(f"Resource downloaded from minio at {get_minio_url(netloc, bucket, key)}")
-     except ClientError as e:
-         logging.error(e)
-
-
- def upload_to_minio(
-     netloc: str, bucket: str, key: str, filepath: str, minio_user: str, minio_pwd: str
- ) -> None:
-     logging.info("Saving to minio")
-     s3 = get_s3_client(netloc, minio_user, minio_pwd)
-     try:
-         s3.upload_file(filepath, bucket, key)
-         logging.info(f"Resource saved into minio at {get_minio_url(netloc, bucket, key)}")
-     except ClientError as e:
-         logging.error(e)
venv/bin/jp.py DELETED
@@ -1,54 +0,0 @@
- #!/home/circleci/project/venv/bin/python
-
- import sys
- import json
- import argparse
- from pprint import pformat
-
- import jmespath
- from jmespath import exceptions
-
-
- def main():
-     parser = argparse.ArgumentParser()
-     parser.add_argument('expression')
-     parser.add_argument('-f', '--filename',
-                         help=('The filename containing the input data. '
-                               'If a filename is not given then data is '
-                               'read from stdin.'))
-     parser.add_argument('--ast', action='store_true',
-                         help=('Pretty print the AST, do not search the data.'))
-     args = parser.parse_args()
-     expression = args.expression
-     if args.ast:
-         # Only print the AST
-         expression = jmespath.compile(args.expression)
-         sys.stdout.write(pformat(expression.parsed))
-         sys.stdout.write('\n')
-         return 0
-     if args.filename:
-         with open(args.filename, 'r') as f:
-             data = json.load(f)
-     else:
-         data = sys.stdin.read()
-         data = json.loads(data)
-     try:
-         sys.stdout.write(json.dumps(
-             jmespath.search(expression, data), indent=4, ensure_ascii=False))
-         sys.stdout.write('\n')
-     except exceptions.ArityError as e:
-         sys.stderr.write("invalid-arity: %s\n" % e)
-         return 1
-     except exceptions.JMESPathTypeError as e:
-         sys.stderr.write("invalid-type: %s\n" % e)
-         return 1
-     except exceptions.UnknownFunctionError as e:
-         sys.stderr.write("unknown-function: %s\n" % e)
-         return 1
-     except exceptions.ParseError as e:
-         sys.stderr.write("syntax-error: %s\n" % e)
-         return 1
-
-
- if __name__ == '__main__':
-     sys.exit(main())