csv-detective 0.8.1.dev1703__py3-none-any.whl → 0.8.1.dev1729__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
Files changed (79)
  1. csv_detective/cli.py +6 -9
  2. csv_detective/detect_fields/FR/geo/adresse/__init__.py +78 -78
  3. csv_detective/detect_fields/FR/geo/code_departement/__init__.py +2 -2
  4. csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -1
  5. csv_detective/detect_fields/FR/geo/code_region/__init__.py +1 -1
  6. csv_detective/detect_fields/FR/geo/commune/__init__.py +2 -2
  7. csv_detective/detect_fields/FR/geo/departement/__init__.py +2 -2
  8. csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +2 -2
  9. csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +1 -2
  10. csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +1 -1
  11. csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +1 -2
  12. csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +1 -1
  13. csv_detective/detect_fields/FR/geo/pays/__init__.py +6 -6
  14. csv_detective/detect_fields/FR/geo/region/__init__.py +6 -4
  15. csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +15 -14
  16. csv_detective/detect_fields/FR/other/csp_insee/__init__.py +4 -3
  17. csv_detective/detect_fields/FR/other/date_fr/__init__.py +3 -3
  18. csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +4 -3
  19. csv_detective/detect_fields/FR/other/sexe/__init__.py +2 -2
  20. csv_detective/detect_fields/FR/other/siren/__init__.py +3 -3
  21. csv_detective/detect_fields/FR/other/siret/__init__.py +3 -3
  22. csv_detective/detect_fields/FR/other/tel_fr/__init__.py +3 -3
  23. csv_detective/detect_fields/FR/other/uai/__init__.py +2 -2
  24. csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +15 -15
  25. csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +27 -27
  26. csv_detective/detect_fields/__init__.py +94 -43
  27. csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +5 -5
  28. csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +5 -5
  29. csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +5 -5
  30. csv_detective/detect_fields/geo/latitude_wgs/__init__.py +1 -1
  31. csv_detective/detect_fields/geo/longitude_wgs/__init__.py +1 -1
  32. csv_detective/detect_fields/other/booleen/__init__.py +1 -1
  33. csv_detective/detect_fields/other/email/__init__.py +4 -2
  34. csv_detective/detect_fields/other/int/__init__.py +3 -3
  35. csv_detective/detect_fields/other/mongo_object_id/__init__.py +2 -2
  36. csv_detective/detect_fields/other/twitter/__init__.py +2 -2
  37. csv_detective/detect_fields/other/uuid/__init__.py +4 -5
  38. csv_detective/detect_fields/temp/date/__init__.py +3 -2
  39. csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +6 -6
  40. csv_detective/detect_fields/temp/year/__init__.py +1 -1
  41. csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -1
  42. csv_detective/detect_labels/__init__.py +51 -1
  43. csv_detective/detect_labels/geo/lonlat_wgs/__init__.py +1 -0
  44. csv_detective/detect_labels/other/mongo_object_id/__init__.py +1 -1
  45. csv_detective/detection/columns.py +9 -9
  46. csv_detective/detection/encoding.py +6 -4
  47. csv_detective/detection/engine.py +6 -5
  48. csv_detective/detection/formats.py +19 -19
  49. csv_detective/detection/headers.py +3 -5
  50. csv_detective/detection/rows.py +1 -1
  51. csv_detective/detection/variables.py +6 -7
  52. csv_detective/explore_csv.py +7 -8
  53. csv_detective/load_tests.py +7 -16
  54. csv_detective/output/__init__.py +3 -7
  55. csv_detective/output/dataframe.py +9 -5
  56. csv_detective/output/example.py +13 -13
  57. csv_detective/output/profile.py +30 -23
  58. csv_detective/output/schema.py +20 -23
  59. csv_detective/output/utils.py +15 -15
  60. csv_detective/parsing/columns.py +23 -12
  61. csv_detective/parsing/csv.py +1 -1
  62. csv_detective/parsing/excel.py +10 -11
  63. csv_detective/parsing/load.py +11 -8
  64. csv_detective/parsing/text.py +4 -9
  65. csv_detective/s3_utils.py +3 -7
  66. csv_detective/utils.py +4 -2
  67. csv_detective/validate.py +18 -13
  68. {csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/METADATA +12 -2
  69. {csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/RECORD +79 -79
  70. tests/test_example.py +2 -6
  71. tests/test_fields.py +16 -10
  72. tests/test_file.py +10 -9
  73. tests/test_labels.py +3 -2
  74. tests/test_structure.py +4 -3
  75. tests/test_validation.py +9 -6
  76. {csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/WHEEL +0 -0
  77. {csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/entry_points.txt +0 -0
  78. {csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/licenses/LICENSE +0 -0
  79. {csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/top_level.txt +0 -0
csv_detective/detection/formats.py

@@ -1,16 +1,17 @@
-from collections import defaultdict
 import logging
+from collections import defaultdict
 from typing import Union

 import numpy as np
 import pandas as pd
+
 from csv_detective.detection.variables import (
     detect_categorical_variable,
     # detect_continuous_variable,
 )
 from csv_detective.load_tests import return_all_tests
 from csv_detective.output.utils import prepare_output_dict
-from csv_detective.parsing.columns import test_col, test_label, MAX_ROWS_ANALYSIS
+from csv_detective.parsing.columns import MAX_ROWS_ANALYSIS, test_col, test_label
 from csv_detective.validate import validate


@@ -42,10 +43,12 @@ def detect_formats(
     # detect_continuous_variable(table.iloc[:, ~categorical_mask.values], verbose=verbose)
     # )

-    analysis.update({
-        "categorical": res_categorical,
-        # "continuous": res_continuous,
-    })
+    analysis.update(
+        {
+            "categorical": res_categorical,
+            # "continuous": res_continuous,
+        }
+    )

     # list testing to be performed
     all_tests_fields = return_all_tests(
@@ -60,7 +63,9 @@ def detect_formats(
         return analysis

     # Perform testing on fields
-    scores_table_fields = test_col(table, all_tests_fields, limited_output, skipna=skipna, verbose=verbose)
+    scores_table_fields = test_col(
+        table, all_tests_fields, limited_output, skipna=skipna, verbose=verbose
+    )
     analysis["columns_fields"] = prepare_output_dict(scores_table_fields, limited_output)

     # Perform testing on labels
@@ -71,16 +76,14 @@ def detect_formats(
     # This is because the fields are more important than the labels and yields a max
     # of 1.5 for the final score.
     scores_table = scores_table_fields * (
-        1
-        + scores_table_labels.reindex(
-            index=scores_table_fields.index, fill_value=0
-        ).values / 2
+        1 + scores_table_labels.reindex(index=scores_table_fields.index, fill_value=0).values / 2
     )

     # To reduce false positives: ensure these formats are detected only if the label yields
     # a detection (skipping the ones that have been excluded by the users).
     formats_with_mandatory_label = [
-        f for f in [
+        f
+        for f in [
            "code_departement",
            "code_commune_insee",
            "code_postal",
@@ -90,7 +93,8 @@ def detect_formats(
            "longitude_wgs_fr_metropole",
            "latitude_l93",
            "longitude_l93",
-        ] if f in scores_table.index
+        ]
+        if f in scores_table.index
     ]
     scores_table.loc[formats_with_mandatory_label, :] = np.where(
         scores_table_labels.loc[formats_with_mandatory_label, :],
@@ -123,9 +127,7 @@ def detect_formats(
            analysis[detection_method] = {
                col_name: [
                    {
-                        "python_type": metier_to_python_type.get(
-                            detection["format"], "string"
-                        ),
+                        "python_type": metier_to_python_type.get(detection["format"], "string"),
                        **detection,
                    }
                    for detection in detections
@@ -136,9 +138,7 @@ def detect_formats(
        for detection_method in ["columns_fields", "columns_labels", "columns"]:
            analysis[detection_method] = {
                col_name: {
-                    "python_type": metier_to_python_type.get(
-                        detection["format"], "string"
-                    ),
+                    "python_type": metier_to_python_type.get(detection["format"], "string"),
                    **detection,
                }
                for col_name, detection in analysis[detection_method].items()
csv_detective/detection/headers.py

@@ -15,18 +15,16 @@ def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int,
         header = file.readline()
         position = file.tell()
         chaine = [c for c in header.replace("\n", "").split(sep) if c]
-        if chaine[-1] not in ["", "\n"] and all(
-            [mot not in ["", "\n"] for mot in chaine[1:-1]]
-        ):
+        if chaine[-1] not in ["", "\n"] and all([mot not in ["", "\n"] for mot in chaine[1:-1]]):
             next_row = file.readline()
             file.seek(position)
             if header != next_row:
                 if verbose:
                     display_logs_depending_process_time(
-                        f'Detected headers in {round(time() - start, 3)}s',
+                        f"Detected headers in {round(time() - start, 3)}s",
                         time() - start,
                     )
                 return i, chaine
     if verbose:
-        logging.info('No header detected')
+        logging.info("No header detected")
     return 0, None
csv_detective/detection/rows.py

@@ -5,7 +5,7 @@ def remove_empty_first_rows(table: pd.DataFrame) -> tuple[pd.DataFrame, int]:
     """Analog process to detect_headers for csv files, determines how many rows to skip
     to end up with the header at the right place"""
     idx = 0
-    if all([str(c).startswith('Unnamed:') for c in table.columns]):
+    if all([str(c).startswith("Unnamed:") for c in table.columns]):
         # there is on offset between the index in the file (idx here)
         # and the index in the dataframe, because of the header
         idx = 1
csv_detective/detection/variables.py

@@ -1,5 +1,5 @@
-from ast import literal_eval
 import logging
+from ast import literal_eval
 from time import time

 import pandas as pd
@@ -7,7 +7,9 @@ import pandas as pd
 from csv_detective.utils import display_logs_depending_process_time


-def detect_continuous_variable(table: pd.DataFrame, continuous_th: float = 0.9, verbose: bool = False):
+def detect_continuous_variable(
+    table: pd.DataFrame, continuous_th: float = 0.9, verbose: bool = False
+):
     """
     Detects whether a column contains continuous variables. We consider a continuous column
     one that contains a considerable amount of float values.
@@ -34,16 +36,13 @@ def detect_continuous_variable(table: pd.DataFrame, continuous_th: float = 0.9,
             value = value.replace(",", ".")
             value = literal_eval(value)
             return type(value)
-        # flake8: noqa
-        except:
+        except Exception:
             return False

     if verbose:
         start = time()
         logging.info("Detecting continuous columns")
-    res = table.apply(
-        lambda serie: check_threshold(serie.apply(parses_to_integer), continuous_th)
-    )
+    res = table.apply(lambda serie: check_threshold(serie.apply(parses_to_integer), continuous_th))
     if verbose:
         display_logs_depending_process_time(
             f"Detected {sum(res)} continuous columns in {round(time() - start, 3)}s",
csv_detective/explore_csv.py

@@ -55,7 +55,10 @@ def routine(
         dict: a dict with information about the csv and possible types for each column
     """

-    if not (isinstance(save_results, bool) or (isinstance(save_results, str) and save_results.endswith(".json"))):
+    if not (
+        isinstance(save_results, bool)
+        or (isinstance(save_results, str) and save_results.endswith(".json"))
+    ):
         raise ValueError("`save_results` must be a bool or a valid path to a json file.")

     if verbose:
@@ -100,8 +103,7 @@ def routine(
     finally:
         if verbose:
             display_logs_depending_process_time(
-                f"Routine completed in {round(time() - start_routine, 3)}s",
-                time() - start_routine
+                f"Routine completed in {round(time() - start_routine, 3)}s", time() - start_routine
             )


@@ -119,7 +121,6 @@ def validate_then_detect(
     cast_json: bool = True,
     verbose: bool = False,
 ):
-
     if verbose:
         start_routine = time()
     if is_url(file_path):
@@ -170,8 +171,7 @@ def validate_then_detect(
     finally:
         if verbose:
             display_logs_depending_process_time(
-                f"Process completed in {round(time() - start_routine, 3)}s",
-                time() - start_routine
+                f"Process completed in {round(time() - start_routine, 3)}s", time() - start_routine
             )


@@ -226,8 +226,7 @@ def routine_minio(
     if location_dict is not None:
         if any(
             [
-                (location_key not in location_dict)
-                or (location_dict[location_key] is None)
+                (location_key not in location_dict) or (location_dict[location_key] is None)
                 for location_key in ["netloc", "bucket", "key"]
             ]
         ):
csv_detective/load_tests.py

@@ -1,8 +1,7 @@
 import os
 from typing import Union

-# flake8: noqa
-from csv_detective import detect_fields, detect_labels
+from csv_detective import detect_fields, detect_labels  # noqa


 def get_all_packages(detect_type) -> list:
@@ -12,10 +11,7 @@ def get_all_packages(detect_type) -> list:
         for filename in filenames:
             file = os.path.join(dirpath, filename).replace(root_dir, "")
             if file.endswith("__init__.py"):
-                module = (
-                    file.replace("__init__.py", "")
-                    .replace("/", ".").replace("\\", ".")[:-1]
-                )
+                module = file.replace("__init__.py", "").replace("/", ".").replace("\\", ".")[:-1]
                 if module:
                     modules.append(detect_type + module)
     return modules
@@ -43,20 +39,15 @@ def return_all_tests(
     if "ALL" in user_input_tests or all(x[0] == "-" for x in user_input_tests):
         tests_to_do = [detect_type]
     else:
-        tests_to_do = [
-            f"{detect_type}.{x}" for x in user_input_tests if x[0] != "-"
-        ]
-        tests_skipped = [
-            f"{detect_type}.{x[1:]}" for x in user_input_tests if x[0] == "-"
-        ]
+        tests_to_do = [f"{detect_type}.{x}" for x in user_input_tests if x[0] != "-"]
+        tests_skipped = [f"{detect_type}.{x[1:]}" for x in user_input_tests if x[0] == "-"]
     all_tests = [
         # this is why we need to import detect_fields/labels
-        eval(x) for x in all_packages
+        eval(x)
+        for x in all_packages
         if any([y == x[: len(y)] for y in tests_to_do])
         and all([y != x[: len(y)] for y in tests_skipped])
     ]
     # to remove groups of tests
-    all_tests = [
-        test for test in all_tests if "_is" in dir(test)
-    ]
+    all_tests = [test for test in all_tests if "_is" in dir(test)]
     return all_tests
csv_detective/output/__init__.py

@@ -5,6 +5,7 @@ from typing import Optional, Union
 import pandas as pd

 from csv_detective.utils import is_url
+
 from .dataframe import cast_df
 from .profile import create_profile
 from .schema import generate_table_schema
@@ -24,7 +25,6 @@ def generate_output(
     verbose: bool = False,
     sheet_name: Optional[Union[str, int]] = None,
 ) -> Union[dict, tuple[dict, pd.DataFrame]]:
-
     if output_profile:
         analysis["profile"] = create_profile(
             table=table,
@@ -40,7 +40,7 @@ def generate_output(
         else:
             output_path = os.path.splitext(file_path)[0]
             if is_url(output_path):
-                output_path = output_path.split('/')[-1]
+                output_path = output_path.split("/")[-1]
             if analysis.get("sheet_name"):
                 output_path += "_sheet-" + str(sheet_name)
             output_path += ".json"
@@ -48,11 +48,7 @@ def generate_output(
             json.dump(analysis, fp, indent=4, separators=(",", ": "), ensure_ascii=False)

     if output_schema:
-        analysis["schema"] = generate_table_schema(
-            analysis,
-            save_file=False,
-            verbose=verbose
-        )
+        analysis["schema"] = generate_table_schema(analysis, save_file=False, verbose=verbose)

     if output_df:
         return analysis, cast_df(
csv_detective/output/dataframe.py

@@ -1,7 +1,7 @@
-from datetime import date, datetime
 import json
-from typing import Optional, Union
+from datetime import date, datetime
 from time import time
+from typing import Optional, Union

 import pandas as pd

@@ -30,12 +30,16 @@ def cast(value: str, _type: str) -> Optional[Union[str, float, bool, date, datet
     raise ValueError(f"Unknown type `{_type}`")


-def cast_df(df: pd.DataFrame, columns: dict, cast_json: bool = True, verbose: bool = False) -> pd.DataFrame:
+def cast_df(
+    df: pd.DataFrame, columns: dict, cast_json: bool = True, verbose: bool = False
+) -> pd.DataFrame:
     if verbose:
         start = time()
     output_df = pd.DataFrame()
     for col_name, detection in columns.items():
-        if detection["python_type"] == "string" or (detection["python_type"] == "json" and not cast_json):
+        if detection["python_type"] == "string" or (
+            detection["python_type"] == "json" and not cast_json
+        ):
             # no change if detected type is string
             output_df[col_name] = df[col_name].copy()
         elif detection["python_type"] == "int":
@@ -49,7 +53,7 @@ def cast_df(df: pd.DataFrame, columns: dict, cast_json: bool = True, verbose: bo
         del df[col_name]
     if verbose:
         display_logs_depending_process_time(
-            f'Casting columns completed in {round(time() - start, 3)}s',
+            f"Casting columns completed in {round(time() - start, 3)}s",
             time() - start,
         )
     return output_df
csv_detective/output/example.py

@@ -1,14 +1,14 @@
-from datetime import datetime
 import json
 import random
 import string
-from typing import Union, Optional, Any, Type
 import uuid
+from datetime import datetime
+from typing import Any, Optional, Type, Union

-from faker import Faker
 import pandas as pd
 import requests
 import rstr
+from faker import Faker

 fake = Faker()

@@ -135,7 +135,7 @@ def create_example_csv_file(
             return random.choice(enum)
         if num_range is None:
             num_range = [0, 1000]
-        if num_type == int:
+        if num_type is int:
             return random.randint(num_range[0], num_range[1])
         else:
             return round(random.uniform(num_range[0], num_range[1]), 1)
@@ -179,7 +179,7 @@ def create_example_csv_file(
         "yearmonth": "date",
         "time": "time",
         "datetime": "datetime",
-        "array": "array"
+        "array": "array",
     }

     if schema_path:
@@ -188,7 +188,7 @@ def create_example_csv_file(
         else:
             with open(schema_path, encoding=encoding) as jsonfile:
                 schema = json.load(jsonfile)
-        if not ("fields" in schema.keys()):
+        if "fields" not in schema.keys():
             raise ValueError("The schema must have a 'fields' key.")
         else:
             fields = [
@@ -198,12 +198,14 @@ def create_example_csv_file(
                     # when frformat is supported in TableSchema, we can build args for French standards
                     # linked to https://github.com/datagouv/fr-format/issues/26
                     "args": (
-                        build_args_from_constraints(f["constraints"]) if "constraints" in f.keys()
+                        build_args_from_constraints(f["constraints"])
+                        if "constraints" in f.keys()
                         else build_args_from_constraints(f["arrayItem"]["constraints"])
                         if "arrayItem" in f.keys() and "constraints" in f["arrayItem"].keys()
                         else {}
-                    )
-                } for f in schema["fields"]
+                    ),
+                }
+                for f in schema["fields"]
             ]

     for k in range(len(fields)):
@@ -234,10 +236,8 @@ def create_example_csv_file(
     # would it be better to create by column or by row (as for now)?
     output = pd.DataFrame(
         [
-            [
-                types_to_func.get(f["type"], "str")(**f["args"])
-                for f in fields
-            ] for _ in range(file_length)
+            [types_to_func.get(f["type"], "str")(**f["args"]) for f in fields]
+            for _ in range(file_length)
         ],
         columns=[f["name"] for f in fields],
     )
csv_detective/output/profile.py

@@ -1,5 +1,5 @@
-from collections import defaultdict
 import logging
+from collections import defaultdict
 from time import time

 import pandas as pd
@@ -29,15 +29,12 @@ def create_profile(
     safe_table = table.copy()
     if not limited_output:
         dict_cols_fields = {
-            k: v[0] if v else {'python_type': 'string', 'format': 'string', 'score': 1.0}
+            k: v[0] if v else {"python_type": "string", "format": "string", "score": 1.0}
             for k, v in dict_cols_fields.items()
         }
-    dtypes = {
-        k: map_python_types.get(v["python_type"], str)
-        for k, v in dict_cols_fields.items()
-    }
+    dtypes = {k: map_python_types.get(v["python_type"], str) for k, v in dict_cols_fields.items()}
     for c in safe_table.columns:
-        if dtypes[c] == float:
+        if dtypes[c] is float:
             safe_table[c] = safe_table[c].apply(
                 lambda s: float_casting(s) if isinstance(s, str) else s
             )
@@ -48,18 +45,26 @@ def create_profile(
            int,
        ]:
            profile[c].update(
-                min=prevent_nan(map_python_types.get(dict_cols_fields[c]["python_type"], str)(
-                    safe_table[c].min()
-                )),
-                max=prevent_nan(map_python_types.get(dict_cols_fields[c]["python_type"], str)(
-                    safe_table[c].max()
-                )),
-                mean=prevent_nan(map_python_types.get(dict_cols_fields[c]["python_type"], str)(
-                    safe_table[c].mean()
-                )),
-                std=prevent_nan(map_python_types.get(dict_cols_fields[c]["python_type"], str)(
-                    safe_table[c].std()
-                )),
+                min=prevent_nan(
+                    map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                        safe_table[c].min()
+                    )
+                ),
+                max=prevent_nan(
+                    map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                        safe_table[c].max()
+                    )
+                ),
+                mean=prevent_nan(
+                    map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                        safe_table[c].mean()
+                    )
+                ),
+                std=prevent_nan(
+                    map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                        safe_table[c].std()
+                    )
+                ),
            )
            tops_bruts = (
                safe_table[safe_table[c].notna()][c]
@@ -70,10 +75,12 @@ def create_profile(
            )
            tops = []
            for tb in tops_bruts:
-                tops.append({
-                    "count": tb["count"],
-                    "value": tb[c],
-                })
+                tops.append(
+                    {
+                        "count": tb["count"],
+                        "value": tb[c],
+                    }
+                )
            profile[c].update(
                tops=tops,
                nb_distinct=safe_table[c].nunique(),
csv_detective/output/schema.py

@@ -1,14 +1,14 @@
-from datetime import datetime
 import json
 import logging
 import os
 import tempfile
+from datetime import datetime
 from time import time
 from typing import Optional

 from botocore.exceptions import ClientError

-from csv_detective.s3_utils import get_s3_client, download_from_minio, upload_to_minio
+from csv_detective.s3_utils import download_from_minio, get_s3_client, upload_to_minio
 from csv_detective.utils import display_logs_depending_process_time


@@ -26,13 +26,11 @@ def get_description(format: str) -> str:
         "insee_canton": "Le nom du canton",
         "latitude_l93": "La latitude au format Lambert 93",
         "latitude_wgs_fr_metropole": (
-            "La latitude au format WGS. Ne concerne que des latitudes "
-            "de la métropole française"
+            "La latitude au format WGS. Ne concerne que des latitudes de la métropole française"
         ),
         "longitude_l93": "La longitude au format Lambert 93",
         "longitude_wgs_fr_metropole": (
-            "La longitude au format WGS. Ne concerne que des longitudes "
-            "de la métropole française"
+            "La longitude au format WGS. Ne concerne que des longitudes de la métropole française"
         ),
         "pays": "Le nom du pays",
         "region": "Le nom de la région",
@@ -86,13 +84,13 @@ def get_pattern(format: str) -> str:
         ),
         "uai": r"^(0[0-8][0-9]|09[0-5]|9[78][0-9]|[67]20)[0-9]{4}[A-Z]$",
         "email": r"^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$",
-        "twitter": r'^@[A-Za-z0-9_]+$',
-        "mongo_object_id": r'^[0-9a-fA-F]{24}$',
-        "uuid": r'^[{]?[0-9a-fA-F]{8}' + '-?([0-9a-fA-F]{4}-?)' + '{3}[0-9a-fA-F]{12}[}]?$',
+        "twitter": r"^@[A-Za-z0-9_]+$",
+        "mongo_object_id": r"^[0-9a-fA-F]{24}$",
+        "uuid": r"^[{]?[0-9a-fA-F]{8}" + "-?([0-9a-fA-F]{4}-?)" + "{3}[0-9a-fA-F]{12}[}]?$",
         "url": (
-            r'^https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]'
-            r'{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)$'
-        )
+            r"^https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]"
+            r"{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)$"
+        ),
     }
     if format in format_to_pattern:
         return {"pattern": format_to_pattern[format]}
@@ -210,7 +208,7 @@ def generate_table_schema(
     key: Optional[str] = None,
     minio_user: Optional[str] = None,
     minio_pwd: Optional[str] = None,
-    verbose: bool = False
+    verbose: bool = False,
 ) -> dict:
     """Generates a table schema from the analysis report

@@ -236,7 +234,7 @@ def generate_table_schema(
            "example": get_example(field_report["format"]),
            "type": get_validata_type(field_report["format"]),
            "formatFR": field_report["format"],
-            "constraints": get_constraints(field_report["format"])
+            "constraints": get_constraints(field_report["format"]),
        }
        for header, field_report in analysis_report["columns"].items()
     ]
@@ -255,12 +253,9 @@ def generate_table_schema(
         "sources": [
             {
                 "title": "Spécification Tableschema",
-                "path": "https://specs.frictionlessdata.io/table-schema"
+                "path": "https://specs.frictionlessdata.io/table-schema",
             },
-            {
-                "title": "schema.data.gouv.fr",
-                "path": "https://schema.data.gouv.fr"
-            }
+            {"title": "schema.data.gouv.fr", "path": "https://schema.data.gouv.fr"},
         ],
         "created": datetime.today().strftime("%Y-%m-%d"),
         "lastModified": datetime.today().strftime("%Y-%m-%d"),
@@ -278,7 +273,9 @@ def generate_table_schema(
     }

     if verbose:
-        display_logs_depending_process_time(f'Created schema in {round(time() - start, 3)}s', time() - start)
+        display_logs_depending_process_time(
+            f"Created schema in {round(time() - start, 3)}s", time() - start
+        )

     if not save_file:
         return schema
@@ -301,9 +298,9 @@ def generate_table_schema(
     if "Contents" in tableschema_objects:
         tableschema_keys = [
             tableschema["Key"]
-            for tableschema in client.list_objects(
-                Bucket=bucket, Prefix=key, Delimiter="/"
-            )["Contents"]
+            for tableschema in client.list_objects(Bucket=bucket, Prefix=key, Delimiter="/")[
+                "Contents"
+            ]
         ]
         tableschema_versions = [
             os.path.splitext(tableschema_key)[0].split("_")[-1]
csv_detective/output/utils.py

@@ -19,14 +19,17 @@ def prepare_output_dict(return_table: pd.DataFrame, limited_output: bool):
         # no need to specify int and float everywhere, they are deprioritized anyway
         ("int", ("float",)),
         # bool over everything
-        ("booleen", (
-            "latitude_l93",
-            "latitude_wgs",
-            "latitude_wgs_fr_metropole",
-            "longitude_l93",
-            "longitude_wgs",
-            "longitude_wgs_fr_metropole",
-        )),
+        (
+            "booleen",
+            (
+                "latitude_l93",
+                "latitude_wgs",
+                "latitude_wgs_fr_metropole",
+                "longitude_l93",
+                "longitude_wgs",
+                "longitude_wgs_fr_metropole",
+            ),
+        ),
         ("geojson", ("json",)),
         # latlon over lonlat if no longitude allows to discriminate
         ("latlon_wgs", ("json", "lonlat_wgs")),
@@ -49,13 +52,10 @@ def prepare_output_dict(return_table: pd.DataFrame, limited_output: bool):
         for prio_format, secondary_formats in priorities:
             if prio_format in detected_formats:
                 for secondary in secondary_formats:
-                    if (
-                        secondary in detected_formats
-                        and (
-                            return_dict_cols[column_name][prio_format]
-                            >= return_dict_cols[column_name][secondary]
-                            or return_dict_cols[column_name][prio_format] >= 1
-                        )
+                    if secondary in detected_formats and (
+                        return_dict_cols[column_name][prio_format]
+                        >= return_dict_cols[column_name][secondary]
+                        or return_dict_cols[column_name][prio_format] >= 1
                     ):
                         formats_to_remove.add(secondary)