csv-detective 0.8.1.dev1674__py3-none-any.whl → 0.8.1.dev1720__py3-none-any.whl

This diff shows the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Files changed (84)
  1. csv_detective/__init__.py +0 -2
  2. csv_detective/cli.py +6 -9
  3. csv_detective/detect_fields/FR/geo/adresse/__init__.py +78 -78
  4. csv_detective/detect_fields/FR/geo/code_departement/__init__.py +2 -2
  5. csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -1
  6. csv_detective/detect_fields/FR/geo/code_region/__init__.py +1 -1
  7. csv_detective/detect_fields/FR/geo/commune/__init__.py +2 -2
  8. csv_detective/detect_fields/FR/geo/departement/__init__.py +2 -2
  9. csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +2 -2
  10. csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +1 -2
  11. csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +1 -1
  12. csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +1 -2
  13. csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +1 -1
  14. csv_detective/detect_fields/FR/geo/pays/__init__.py +6 -6
  15. csv_detective/detect_fields/FR/geo/region/__init__.py +6 -4
  16. csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +15 -14
  17. csv_detective/detect_fields/FR/other/csp_insee/__init__.py +4 -3
  18. csv_detective/detect_fields/FR/other/date_fr/__init__.py +3 -3
  19. csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +4 -3
  20. csv_detective/detect_fields/FR/other/sexe/__init__.py +2 -2
  21. csv_detective/detect_fields/FR/other/siren/__init__.py +3 -3
  22. csv_detective/detect_fields/FR/other/siret/__init__.py +3 -3
  23. csv_detective/detect_fields/FR/other/tel_fr/__init__.py +3 -3
  24. csv_detective/detect_fields/FR/other/uai/__init__.py +2 -2
  25. csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +15 -15
  26. csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +27 -27
  27. csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +5 -5
  28. csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +5 -5
  29. csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +5 -5
  30. csv_detective/detect_fields/geo/latitude_wgs/__init__.py +1 -1
  31. csv_detective/detect_fields/geo/longitude_wgs/__init__.py +1 -1
  32. csv_detective/detect_fields/other/booleen/__init__.py +1 -1
  33. csv_detective/detect_fields/other/email/__init__.py +4 -2
  34. csv_detective/detect_fields/other/int/__init__.py +3 -3
  35. csv_detective/detect_fields/other/mongo_object_id/__init__.py +2 -2
  36. csv_detective/detect_fields/other/twitter/__init__.py +2 -2
  37. csv_detective/detect_fields/other/uuid/__init__.py +4 -5
  38. csv_detective/detect_fields/temp/date/__init__.py +3 -2
  39. csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +6 -6
  40. csv_detective/detect_fields/temp/year/__init__.py +1 -1
  41. csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -1
  42. csv_detective/detect_labels/geo/lonlat_wgs/__init__.py +1 -0
  43. csv_detective/detect_labels/other/mongo_object_id/__init__.py +1 -1
  44. csv_detective/detection/columns.py +9 -9
  45. csv_detective/detection/encoding.py +6 -4
  46. csv_detective/detection/engine.py +6 -5
  47. csv_detective/detection/formats.py +19 -19
  48. csv_detective/detection/headers.py +3 -5
  49. csv_detective/detection/rows.py +1 -1
  50. csv_detective/detection/variables.py +4 -4
  51. csv_detective/explore_csv.py +7 -8
  52. csv_detective/load_tests.py +6 -14
  53. csv_detective/output/__init__.py +3 -7
  54. csv_detective/output/dataframe.py +9 -5
  55. csv_detective/output/example.py +13 -13
  56. csv_detective/output/profile.py +30 -23
  57. csv_detective/output/schema.py +20 -23
  58. csv_detective/output/utils.py +15 -15
  59. csv_detective/parsing/columns.py +23 -12
  60. csv_detective/parsing/csv.py +1 -1
  61. csv_detective/parsing/excel.py +10 -11
  62. csv_detective/parsing/load.py +11 -8
  63. csv_detective/parsing/text.py +4 -9
  64. csv_detective/s3_utils.py +3 -7
  65. csv_detective/utils.py +4 -2
  66. csv_detective/validate.py +18 -13
  67. csv_detective-0.8.1.dev1674.data/data/share/csv_detective/README.md → csv_detective-0.8.1.dev1720.dist-info/METADATA +32 -0
  68. {csv_detective-0.8.1.dev1674.dist-info → csv_detective-0.8.1.dev1720.dist-info}/RECORD +81 -81
  69. {csv_detective-0.8.1.dev1674.dist-info → csv_detective-0.8.1.dev1720.dist-info}/top_level.txt +2 -0
  70. tests/test_example.py +2 -6
  71. tests/test_fields.py +16 -10
  72. tests/test_file.py +10 -9
  73. tests/test_labels.py +3 -2
  74. tests/test_structure.py +3 -1
  75. tests/test_validation.py +9 -6
  76. venv/bin/activate_this.py +38 -0
  77. venv/bin/jp.py +54 -0
  78. venv/bin/runxlrd.py +410 -0
  79. csv_detective-0.8.1.dev1674.data/data/share/csv_detective/CHANGELOG.md +0 -186
  80. csv_detective-0.8.1.dev1674.dist-info/METADATA +0 -268
  81. csv_detective-0.8.1.dev1674.dist-info/licenses/LICENSE +0 -21
  82. {csv_detective-0.8.1.dev1674.dist-info → csv_detective-0.8.1.dev1720.dist-info}/WHEEL +0 -0
  83. {csv_detective-0.8.1.dev1674.dist-info → csv_detective-0.8.1.dev1720.dist-info}/entry_points.txt +0 -0
  84. {csv_detective-0.8.1.dev1674.data/data/share/csv_detective → csv_detective-0.8.1.dev1720.dist-info/licenses}/LICENSE +0 -0
csv_detective/output/__init__.py CHANGED
@@ -5,6 +5,7 @@ from typing import Optional, Union
 import pandas as pd
 
 from csv_detective.utils import is_url
+
 from .dataframe import cast_df
 from .profile import create_profile
 from .schema import generate_table_schema
@@ -24,7 +25,6 @@ def generate_output(
     verbose: bool = False,
     sheet_name: Optional[Union[str, int]] = None,
 ) -> Union[dict, tuple[dict, pd.DataFrame]]:
-
     if output_profile:
         analysis["profile"] = create_profile(
             table=table,
@@ -40,7 +40,7 @@ def generate_output(
         else:
             output_path = os.path.splitext(file_path)[0]
         if is_url(output_path):
-            output_path = output_path.split('/')[-1]
+            output_path = output_path.split("/")[-1]
         if analysis.get("sheet_name"):
             output_path += "_sheet-" + str(sheet_name)
         output_path += ".json"
@@ -48,11 +48,7 @@ def generate_output(
             json.dump(analysis, fp, indent=4, separators=(",", ": "), ensure_ascii=False)
 
     if output_schema:
-        analysis["schema"] = generate_table_schema(
-            analysis,
-            save_file=False,
-            verbose=verbose
-        )
+        analysis["schema"] = generate_table_schema(analysis, save_file=False, verbose=verbose)
 
     if output_df:
         return analysis, cast_df(
csv_detective/output/dataframe.py CHANGED
@@ -1,7 +1,7 @@
-from datetime import date, datetime
 import json
-from typing import Optional, Union
+from datetime import date, datetime
 from time import time
+from typing import Optional, Union
 
 import pandas as pd
 
@@ -30,12 +30,16 @@ def cast(value: str, _type: str) -> Optional[Union[str, float, bool, date, datet
         raise ValueError(f"Unknown type `{_type}`")
 
 
-def cast_df(df: pd.DataFrame, columns: dict, cast_json: bool = True, verbose: bool = False) -> pd.DataFrame:
+def cast_df(
+    df: pd.DataFrame, columns: dict, cast_json: bool = True, verbose: bool = False
+) -> pd.DataFrame:
     if verbose:
         start = time()
     output_df = pd.DataFrame()
     for col_name, detection in columns.items():
-        if detection["python_type"] == "string" or (detection["python_type"] == "json" and not cast_json):
+        if detection["python_type"] == "string" or (
+            detection["python_type"] == "json" and not cast_json
+        ):
             # no change if detected type is string
             output_df[col_name] = df[col_name].copy()
         elif detection["python_type"] == "int":
@@ -49,7 +53,7 @@ def cast_df(df: pd.DataFrame, columns: dict, cast_json: bool = True, verbose: bo
         del df[col_name]
     if verbose:
         display_logs_depending_process_time(
-            f'Casting columns completed in {round(time() - start, 3)}s',
+            f"Casting columns completed in {round(time() - start, 3)}s",
             time() - start,
         )
     return output_df
csv_detective/output/example.py CHANGED
@@ -1,14 +1,14 @@
-from datetime import datetime
 import json
 import random
 import string
-from typing import Union, Optional, Any, Type
 import uuid
+from datetime import datetime
+from typing import Any, Optional, Type, Union
 
-from faker import Faker
 import pandas as pd
 import requests
 import rstr
+from faker import Faker
 
 fake = Faker()
 
@@ -135,7 +135,7 @@ def create_example_csv_file(
             return random.choice(enum)
         if num_range is None:
            num_range = [0, 1000]
-        if num_type == int:
+        if num_type is int:
            return random.randint(num_range[0], num_range[1])
        else:
            return round(random.uniform(num_range[0], num_range[1]), 1)
@@ -179,7 +179,7 @@ def create_example_csv_file(
         "yearmonth": "date",
         "time": "time",
         "datetime": "datetime",
-        "array": "array"
+        "array": "array",
     }
 
     if schema_path:
@@ -188,7 +188,7 @@ def create_example_csv_file(
         else:
             with open(schema_path, encoding=encoding) as jsonfile:
                 schema = json.load(jsonfile)
-        if not ("fields" in schema.keys()):
+        if "fields" not in schema.keys():
            raise ValueError("The schema must have a 'fields' key.")
        else:
            fields = [
@@ -198,12 +198,14 @@ def create_example_csv_file(
                    # when frformat is supported in TableSchema, we can build args for French standards
                    # linked to https://github.com/datagouv/fr-format/issues/26
                    "args": (
-                        build_args_from_constraints(f["constraints"]) if "constraints" in f.keys()
+                        build_args_from_constraints(f["constraints"])
+                        if "constraints" in f.keys()
                        else build_args_from_constraints(f["arrayItem"]["constraints"])
                        if "arrayItem" in f.keys() and "constraints" in f["arrayItem"].keys()
                        else {}
-                    )
-                } for f in schema["fields"]
+                    ),
+                }
+                for f in schema["fields"]
            ]
 
        for k in range(len(fields)):
@@ -234,10 +236,8 @@ def create_example_csv_file(
     # would it be better to create by column or by row (as for now)?
     output = pd.DataFrame(
         [
-            [
-                types_to_func.get(f["type"], "str")(**f["args"])
-                for f in fields
-            ] for _ in range(file_length)
+            [types_to_func.get(f["type"], "str")(**f["args"]) for f in fields]
+            for _ in range(file_length)
         ],
         columns=[f["name"] for f in fields],
     )
csv_detective/output/profile.py CHANGED
@@ -1,5 +1,5 @@
-from collections import defaultdict
 import logging
+from collections import defaultdict
 from time import time
 
 import pandas as pd
@@ -29,15 +29,12 @@ def create_profile(
     safe_table = table.copy()
     if not limited_output:
         dict_cols_fields = {
-            k: v[0] if v else {'python_type': 'string', 'format': 'string', 'score': 1.0}
+            k: v[0] if v else {"python_type": "string", "format": "string", "score": 1.0}
             for k, v in dict_cols_fields.items()
         }
-    dtypes = {
-        k: map_python_types.get(v["python_type"], str)
-        for k, v in dict_cols_fields.items()
-    }
+    dtypes = {k: map_python_types.get(v["python_type"], str) for k, v in dict_cols_fields.items()}
     for c in safe_table.columns:
-        if dtypes[c] == float:
+        if dtypes[c] is float:
             safe_table[c] = safe_table[c].apply(
                 lambda s: float_casting(s) if isinstance(s, str) else s
             )
@@ -48,18 +45,26 @@ def create_profile(
             int,
         ]:
             profile[c].update(
-                min=prevent_nan(map_python_types.get(dict_cols_fields[c]["python_type"], str)(
-                    safe_table[c].min()
-                )),
-                max=prevent_nan(map_python_types.get(dict_cols_fields[c]["python_type"], str)(
-                    safe_table[c].max()
-                )),
-                mean=prevent_nan(map_python_types.get(dict_cols_fields[c]["python_type"], str)(
-                    safe_table[c].mean()
-                )),
-                std=prevent_nan(map_python_types.get(dict_cols_fields[c]["python_type"], str)(
-                    safe_table[c].std()
-                )),
+                min=prevent_nan(
+                    map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                        safe_table[c].min()
+                    )
+                ),
+                max=prevent_nan(
+                    map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                        safe_table[c].max()
+                    )
+                ),
+                mean=prevent_nan(
+                    map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                        safe_table[c].mean()
+                    )
+                ),
+                std=prevent_nan(
+                    map_python_types.get(dict_cols_fields[c]["python_type"], str)(
+                        safe_table[c].std()
+                    )
+                ),
             )
             tops_bruts = (
                 safe_table[safe_table[c].notna()][c]
@@ -70,10 +75,12 @@ def create_profile(
             )
             tops = []
             for tb in tops_bruts:
-                tops.append({
-                    "count": tb["count"],
-                    "value": tb[c],
-                })
+                tops.append(
+                    {
+                        "count": tb["count"],
+                        "value": tb[c],
+                    }
+                )
             profile[c].update(
                 tops=tops,
                 nb_distinct=safe_table[c].nunique(),
csv_detective/output/schema.py CHANGED
@@ -1,14 +1,14 @@
-from datetime import datetime
 import json
 import logging
 import os
 import tempfile
+from datetime import datetime
 from time import time
 from typing import Optional
 
 from botocore.exceptions import ClientError
 
-from csv_detective.s3_utils import get_s3_client, download_from_minio, upload_to_minio
+from csv_detective.s3_utils import download_from_minio, get_s3_client, upload_to_minio
 from csv_detective.utils import display_logs_depending_process_time
 
 
@@ -26,13 +26,11 @@ def get_description(format: str) -> str:
         "insee_canton": "Le nom du canton",
         "latitude_l93": "La latitude au format Lambert 93",
         "latitude_wgs_fr_metropole": (
-            "La latitude au format WGS. Ne concerne que des latitudes "
-            "de la métropole française"
+            "La latitude au format WGS. Ne concerne que des latitudes de la métropole française"
         ),
         "longitude_l93": "La longitude au format Lambert 93",
         "longitude_wgs_fr_metropole": (
-            "La longitude au format WGS. Ne concerne que des longitudes "
-            "de la métropole française"
+            "La longitude au format WGS. Ne concerne que des longitudes de la métropole française"
         ),
         "pays": "Le nom du pays",
         "region": "Le nom de la région",
@@ -86,13 +84,13 @@ def get_pattern(format: str) -> str:
         ),
         "uai": r"^(0[0-8][0-9]|09[0-5]|9[78][0-9]|[67]20)[0-9]{4}[A-Z]$",
         "email": r"^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$",
-        "twitter": r'^@[A-Za-z0-9_]+$',
-        "mongo_object_id": r'^[0-9a-fA-F]{24}$',
-        "uuid": r'^[{]?[0-9a-fA-F]{8}' + '-?([0-9a-fA-F]{4}-?)' + '{3}[0-9a-fA-F]{12}[}]?$',
+        "twitter": r"^@[A-Za-z0-9_]+$",
+        "mongo_object_id": r"^[0-9a-fA-F]{24}$",
+        "uuid": r"^[{]?[0-9a-fA-F]{8}" + "-?([0-9a-fA-F]{4}-?)" + "{3}[0-9a-fA-F]{12}[}]?$",
         "url": (
-            r'^https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]'
-            r'{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)$'
-        )
+            r"^https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]"
+            r"{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)$"
+        ),
     }
     if format in format_to_pattern:
         return {"pattern": format_to_pattern[format]}
@@ -210,7 +208,7 @@ def generate_table_schema(
     key: Optional[str] = None,
     minio_user: Optional[str] = None,
     minio_pwd: Optional[str] = None,
-    verbose: bool = False
+    verbose: bool = False,
 ) -> dict:
     """Generates a table schema from the analysis report
 
@@ -236,7 +234,7 @@ def generate_table_schema(
             "example": get_example(field_report["format"]),
             "type": get_validata_type(field_report["format"]),
             "formatFR": field_report["format"],
-            "constraints": get_constraints(field_report["format"])
+            "constraints": get_constraints(field_report["format"]),
         }
         for header, field_report in analysis_report["columns"].items()
     ]
@@ -255,12 +253,9 @@ def generate_table_schema(
         "sources": [
             {
                 "title": "Spécification Tableschema",
-                "path": "https://specs.frictionlessdata.io/table-schema"
+                "path": "https://specs.frictionlessdata.io/table-schema",
             },
-            {
-                "title": "schema.data.gouv.fr",
-                "path": "https://schema.data.gouv.fr"
-            }
+            {"title": "schema.data.gouv.fr", "path": "https://schema.data.gouv.fr"},
         ],
         "created": datetime.today().strftime("%Y-%m-%d"),
         "lastModified": datetime.today().strftime("%Y-%m-%d"),
@@ -278,7 +273,9 @@ def generate_table_schema(
     }
 
     if verbose:
-        display_logs_depending_process_time(f'Created schema in {round(time() - start, 3)}s', time() - start)
+        display_logs_depending_process_time(
+            f"Created schema in {round(time() - start, 3)}s", time() - start
+        )
 
     if not save_file:
         return schema
@@ -301,9 +298,9 @@ def generate_table_schema(
     if "Contents" in tableschema_objects:
         tableschema_keys = [
             tableschema["Key"]
-            for tableschema in client.list_objects(
-                Bucket=bucket, Prefix=key, Delimiter="/"
-            )["Contents"]
+            for tableschema in client.list_objects(Bucket=bucket, Prefix=key, Delimiter="/")[
+                "Contents"
+            ]
         ]
         tableschema_versions = [
            os.path.splitext(tableschema_key)[0].split("_")[-1]
csv_detective/output/utils.py CHANGED
@@ -19,14 +19,17 @@ def prepare_output_dict(return_table: pd.DataFrame, limited_output: bool):
         # no need to specify int and float everywhere, they are deprioritized anyway
         ("int", ("float",)),
         # bool over everything
-        ("booleen", (
-            "latitude_l93",
-            "latitude_wgs",
-            "latitude_wgs_fr_metropole",
-            "longitude_l93",
-            "longitude_wgs",
-            "longitude_wgs_fr_metropole",
-        )),
+        (
+            "booleen",
+            (
+                "latitude_l93",
+                "latitude_wgs",
+                "latitude_wgs_fr_metropole",
+                "longitude_l93",
+                "longitude_wgs",
+                "longitude_wgs_fr_metropole",
+            ),
+        ),
         ("geojson", ("json",)),
         # latlon over lonlat if no longitude allows to discriminate
         ("latlon_wgs", ("json", "lonlat_wgs")),
@@ -49,13 +52,10 @@ def prepare_output_dict(return_table: pd.DataFrame, limited_output: bool):
     for prio_format, secondary_formats in priorities:
         if prio_format in detected_formats:
             for secondary in secondary_formats:
-                if (
-                    secondary in detected_formats
-                    and (
-                        return_dict_cols[column_name][prio_format]
-                        >= return_dict_cols[column_name][secondary]
-                        or return_dict_cols[column_name][prio_format] >= 1
-                    )
+                if secondary in detected_formats and (
+                    return_dict_cols[column_name][prio_format]
+                    >= return_dict_cols[column_name][secondary]
+                    or return_dict_cols[column_name][prio_format] >= 1
                 ):
                     formats_to_remove.add(secondary)
 
csv_detective/parsing/columns.py CHANGED
@@ -28,6 +28,7 @@ def test_col_val(
     # TODO : change for a cleaner method and only test columns in modules labels
     def apply_test_func(serie: pd.Series, test_func: Callable, _range: int):
         return serie.sample(n=_range).apply(test_func)
+
     try:
         if skipna:
             serie = serie[serie.notnull()]
@@ -60,11 +61,13 @@ def test_col_val(
     if verbose and time() - start > 3:
         display_logs_depending_process_time(
             f"\t/!\\ Column '{serie.name}' took too long ({round(time() - start, 3)}s)",
-            time() - start
+            time() - start,
         )
 
 
-def test_col_label(label: str, test_func: Callable, proportion: float = 1, limited_output: bool = False):
+def test_col_label(
+    label: str, test_func: Callable, proportion: float = 1, limited_output: bool = False
+):
     """Tests label (from header) using test_func.
     - proportion : indicates the minimum score to pass the test for the serie
     to be detected as a certain format
@@ -76,7 +79,13 @@ def test_col_label(label: str, test_func: Callable, proportion: float = 1, limit
     return result if result >= proportion else 0
 
 
-def test_col(table: pd.DataFrame, all_tests: list, limited_output: bool, skipna: bool = True, verbose: bool = False):
+def test_col(
+    table: pd.DataFrame,
+    all_tests: list,
+    limited_output: bool,
+    skipna: bool = True,
+    verbose: bool = False,
+):
     if verbose:
         start = time()
         logging.info("Testing columns to get types")
@@ -106,11 +115,13 @@ def test_col(table: pd.DataFrame, all_tests: list, limited_output: bool, skipna:
         )
         if verbose:
             display_logs_depending_process_time(
-                f'\t> Done with type "{key}" in {round(time() - start_type, 3)}s ({idx+1}/{len(test_funcs)})',
-                time() - start_type
+                f'\t> Done with type "{key}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(test_funcs)})',
+                time() - start_type,
             )
     if verbose:
-        display_logs_depending_process_time(f"Done testing columns in {round(time() - start, 3)}s", time() - start)
+        display_logs_depending_process_time(
+            f"Done testing columns in {round(time() - start, 3)}s", time() - start
+        )
     return return_table
 
 
@@ -128,16 +139,16 @@ def test_label(table: pd.DataFrame, all_tests: list, limited_output: bool, verbo
         if verbose:
             start_type = time()
         return_table.loc[key] = [
-            test_col_label(
-                col_name, value["func"], value["prop"], limited_output=limited_output
-            )
+            test_col_label(col_name, value["func"], value["prop"], limited_output=limited_output)
             for col_name in table.columns
         ]
         if verbose:
             display_logs_depending_process_time(
-                f'\t- Done with type "{key}" in {round(time() - start_type, 3)}s ({idx+1}/{len(test_funcs)})',
-                time() - start_type
+                f'\t- Done with type "{key}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(test_funcs)})',
+                time() - start_type,
             )
     if verbose:
-        display_logs_depending_process_time(f"Done testing labels in {round(time() - start, 3)}s", time() - start)
+        display_logs_depending_process_time(
+            f"Done testing labels in {round(time() - start, 3)}s", time() - start
+        )
     return return_table
csv_detective/parsing/csv.py CHANGED
@@ -49,7 +49,7 @@ def parse_csv(
         raise ValueError("Could not load file")
     if verbose:
         display_logs_depending_process_time(
-            f'Table parsed successfully in {round(time() - start, 3)}s',
+            f"Table parsed successfully in {round(time() - start, 3)}s",
             time() - start,
         )
     return table, total_lines, nb_duplicates
csv_detective/parsing/excel.py CHANGED
@@ -28,14 +28,13 @@ def parse_excel(
     random_state: int = 42,
     verbose: bool = False,
 ) -> tuple[pd.DataFrame, int, int, str, str, int]:
-    """"Excel-like parsing is really slow, could be a good improvement for future development"""
+    """ "Excel-like parsing is really slow, could be a good improvement for future development"""
     if verbose:
         start = time()
     no_sheet_specified = sheet_name is None
 
-    if (
-        engine in ['openpyxl', 'xlrd'] or
-        any([file_path.endswith(k) for k in NEW_EXCEL_EXT + OLD_EXCEL_EXT])
+    if engine in ["openpyxl", "xlrd"] or any(
+        [file_path.endswith(k) for k in NEW_EXCEL_EXT + OLD_EXCEL_EXT]
     ):
         remote_content = None
         if is_url(file_path):
@@ -50,7 +49,7 @@ def parse_excel(
         if sheet_name is None:
             if verbose:
                 display_logs_depending_process_time(
-                    f'Detected {engine_to_file[engine]} file, no sheet specified, reading the largest one',
+                    f"Detected {engine_to_file[engine]} file, no sheet specified, reading the largest one",
                     time() - start,
                 )
             try:
@@ -58,8 +57,8 @@ def parse_excel(
                 # openpyxl doesn't want to open files that don't have a valid extension
                 # see: https://foss.heptapod.net/openpyxl/openpyxl/-/issues/2157
                 # if the file is remote, we have a remote content anyway so it's fine
-                if not remote_content and '.' not in file_path.split('/')[-1]:
-                    with open(file_path, 'rb') as f:
+                if not remote_content and "." not in file_path.split("/")[-1]:
+                    with open(file_path, "rb") as f:
                         remote_content = BytesIO(f.read())
                 # faster than loading all sheets
                 wb = openpyxl.load_workbook(remote_content or file_path, read_only=True)
@@ -82,7 +81,7 @@ def parse_excel(
             # sometimes a xls file is recognized as ods
             if verbose:
                 display_logs_depending_process_time(
-                    'Could not read file with classic xls reader, trying with ODS',
+                    "Could not read file with classic xls reader, trying with ODS",
                     time() - start,
                 )
             engine = "odf"
@@ -95,7 +94,7 @@ def parse_excel(
         if sheet_name is None:
             if verbose:
                 display_logs_depending_process_time(
-                    f'Detected {engine_to_file[engine]} file, no sheet specified, reading the largest one',
+                    f"Detected {engine_to_file[engine]} file, no sheet specified, reading the largest one",
                     time() - start,
                 )
             tables = pd.read_excel(
@@ -132,7 +131,7 @@ def parse_excel(
             table = table.sample(num_rows, random_state=random_state)
         if verbose:
             display_logs_depending_process_time(
-                f'Table parsed successfully in {round(time() - start, 3)}s',
+                f"Table parsed successfully in {round(time() - start, 3)}s",
                 time() - start,
             )
         return table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx
@@ -163,7 +162,7 @@ def parse_excel(
     table = table.sample(num_rows, random_state=random_state)
     if verbose:
         display_logs_depending_process_time(
-            f'Table parsed successfully in {round(time() - start, 3)}s',
+            f"Table parsed successfully in {round(time() - start, 3)}s",
            time() - start,
        )
    return table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx
csv_detective/parsing/load.py CHANGED
@@ -14,6 +14,7 @@ from csv_detective.detection.engine import (
 from csv_detective.detection.headers import detect_headers
 from csv_detective.detection.separator import detect_separator
 from csv_detective.utils import is_url
+
 from .compression import unzip
 from .csv import parse_csv
 from .excel import (
@@ -30,9 +31,9 @@ def load_file(
     verbose: bool = False,
     sheet_name: Optional[Union[str, int]] = None,
 ) -> tuple[pd.DataFrame, dict]:
-    file_name = file_path.split('/')[-1]
+    file_name = file_path.split("/")[-1]
     engine = None
-    if '.' not in file_name or not file_name.endswith("csv"):
+    if "." not in file_name or not file_name.endswith("csv"):
         # file has no extension, we'll investigate how to read it
         engine = detect_engine(file_path, verbose=verbose)
 
@@ -88,10 +89,12 @@ def load_file(
         "heading_columns": heading_columns,
         "trailing_columns": trailing_columns,
     }
-    analysis.update({
-        "header_row_idx": header_row_idx,
-        "header": header,
-        "total_lines": total_lines,
-        "nb_duplicates": nb_duplicates,
-    })
+    analysis.update(
+        {
+            "header_row_idx": header_row_idx,
+            "header": header,
+            "total_lines": total_lines,
+            "nb_duplicates": nb_duplicates,
+        }
+    )
     return table, analysis
csv_detective/parsing/text.py CHANGED
@@ -2,9 +2,7 @@ from re import finditer
 
 
 def camel_case_split(identifier: str):
-    matches = finditer(
-        ".+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)", identifier
-    )
+    matches = finditer(".+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)", identifier)
     return " ".join([m.group(0) for m in matches])
 
 
@@ -46,15 +44,12 @@ def header_score(header: str, words_combinations_list: list[str]) -> float:
     processed_header = _process_text(header)
 
     header_matches_words_combination = float(
-        any(
-            words_combination == processed_header for words_combination in words_combinations_list
-        )
+        any(words_combination == processed_header for words_combination in words_combinations_list)
     )
     words_combination_in_header = 0.5 * (
         any(
-            is_word_in_string(
-                words_combination, processed_header
-            ) for words_combination in words_combinations_list
+            is_word_in_string(words_combination, processed_header)
+            for words_combination in words_combinations_list
        )
    )
 
csv_detective/s3_utils.py CHANGED
@@ -1,6 +1,6 @@
-import boto3
 import logging
 
+import boto3
 from botocore.client import Config
 from botocore.exceptions import ClientError
 
@@ -27,9 +27,7 @@ def download_from_minio(
     s3 = get_s3_client(netloc, minio_user, minio_pwd)
     try:
         s3.download_file(bucket, key, filepath)
-        logging.info(
-            f"Resource downloaded from minio at {get_minio_url(netloc, bucket, key)}"
-        )
+        logging.info(f"Resource downloaded from minio at {get_minio_url(netloc, bucket, key)}")
     except ClientError as e:
         logging.error(e)
 
@@ -41,8 +39,6 @@ def upload_to_minio(
     s3 = get_s3_client(netloc, minio_user, minio_pwd)
     try:
         s3.upload_file(filepath, bucket, key)
-        logging.info(
-            f"Resource saved into minio at {get_minio_url(netloc, bucket, key)}"
-        )
+        logging.info(f"Resource saved into minio at {get_minio_url(netloc, bucket, key)}")
     except ClientError as e:
         logging.error(e)
csv_detective/utils.py CHANGED
@@ -4,7 +4,9 @@ from typing import Optional
 import pandas as pd
 
 logging.basicConfig(level=logging.INFO)
-logging.addLevelName(logging.CRITICAL, "\033[1;41m%s\033[1;0m" % logging.getLevelName(logging.CRITICAL))
+logging.addLevelName(
+    logging.CRITICAL, "\033[1;41m%s\033[1;0m" % logging.getLevelName(logging.CRITICAL)
+)
 logging.addLevelName(logging.WARN, "\033[1;31m%s\033[1;0m" % logging.getLevelName(logging.WARN))
 
 THRESHOLD_WARN = 1
@@ -26,7 +28,7 @@ def display_logs_depending_process_time(prompt: str, duration: float) -> None:
 
 def is_url(file_path: str) -> bool:
     # could be more sophisticated if needed
     # using the URL detection test was considered but too broad (schema required to use requests)
-    return file_path.startswith('http')
+    return file_path.startswith("http")
 
 def prevent_nan(value: float) -> Optional[float]: