csv-detective 0.8.1.dev1674__py3-none-any.whl → 0.8.1.dev1720__py3-none-any.whl

This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Files changed (84)
  1. csv_detective/__init__.py +0 -2
  2. csv_detective/cli.py +6 -9
  3. csv_detective/detect_fields/FR/geo/adresse/__init__.py +78 -78
  4. csv_detective/detect_fields/FR/geo/code_departement/__init__.py +2 -2
  5. csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -1
  6. csv_detective/detect_fields/FR/geo/code_region/__init__.py +1 -1
  7. csv_detective/detect_fields/FR/geo/commune/__init__.py +2 -2
  8. csv_detective/detect_fields/FR/geo/departement/__init__.py +2 -2
  9. csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +2 -2
  10. csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +1 -2
  11. csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +1 -1
  12. csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +1 -2
  13. csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +1 -1
  14. csv_detective/detect_fields/FR/geo/pays/__init__.py +6 -6
  15. csv_detective/detect_fields/FR/geo/region/__init__.py +6 -4
  16. csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +15 -14
  17. csv_detective/detect_fields/FR/other/csp_insee/__init__.py +4 -3
  18. csv_detective/detect_fields/FR/other/date_fr/__init__.py +3 -3
  19. csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +4 -3
  20. csv_detective/detect_fields/FR/other/sexe/__init__.py +2 -2
  21. csv_detective/detect_fields/FR/other/siren/__init__.py +3 -3
  22. csv_detective/detect_fields/FR/other/siret/__init__.py +3 -3
  23. csv_detective/detect_fields/FR/other/tel_fr/__init__.py +3 -3
  24. csv_detective/detect_fields/FR/other/uai/__init__.py +2 -2
  25. csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +15 -15
  26. csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +27 -27
  27. csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +5 -5
  28. csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +5 -5
  29. csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +5 -5
  30. csv_detective/detect_fields/geo/latitude_wgs/__init__.py +1 -1
  31. csv_detective/detect_fields/geo/longitude_wgs/__init__.py +1 -1
  32. csv_detective/detect_fields/other/booleen/__init__.py +1 -1
  33. csv_detective/detect_fields/other/email/__init__.py +4 -2
  34. csv_detective/detect_fields/other/int/__init__.py +3 -3
  35. csv_detective/detect_fields/other/mongo_object_id/__init__.py +2 -2
  36. csv_detective/detect_fields/other/twitter/__init__.py +2 -2
  37. csv_detective/detect_fields/other/uuid/__init__.py +4 -5
  38. csv_detective/detect_fields/temp/date/__init__.py +3 -2
  39. csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +6 -6
  40. csv_detective/detect_fields/temp/year/__init__.py +1 -1
  41. csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -1
  42. csv_detective/detect_labels/geo/lonlat_wgs/__init__.py +1 -0
  43. csv_detective/detect_labels/other/mongo_object_id/__init__.py +1 -1
  44. csv_detective/detection/columns.py +9 -9
  45. csv_detective/detection/encoding.py +6 -4
  46. csv_detective/detection/engine.py +6 -5
  47. csv_detective/detection/formats.py +19 -19
  48. csv_detective/detection/headers.py +3 -5
  49. csv_detective/detection/rows.py +1 -1
  50. csv_detective/detection/variables.py +4 -4
  51. csv_detective/explore_csv.py +7 -8
  52. csv_detective/load_tests.py +6 -14
  53. csv_detective/output/__init__.py +3 -7
  54. csv_detective/output/dataframe.py +9 -5
  55. csv_detective/output/example.py +13 -13
  56. csv_detective/output/profile.py +30 -23
  57. csv_detective/output/schema.py +20 -23
  58. csv_detective/output/utils.py +15 -15
  59. csv_detective/parsing/columns.py +23 -12
  60. csv_detective/parsing/csv.py +1 -1
  61. csv_detective/parsing/excel.py +10 -11
  62. csv_detective/parsing/load.py +11 -8
  63. csv_detective/parsing/text.py +4 -9
  64. csv_detective/s3_utils.py +3 -7
  65. csv_detective/utils.py +4 -2
  66. csv_detective/validate.py +18 -13
  67. csv_detective-0.8.1.dev1674.data/data/share/csv_detective/README.md → csv_detective-0.8.1.dev1720.dist-info/METADATA +32 -0
  68. {csv_detective-0.8.1.dev1674.dist-info → csv_detective-0.8.1.dev1720.dist-info}/RECORD +81 -81
  69. {csv_detective-0.8.1.dev1674.dist-info → csv_detective-0.8.1.dev1720.dist-info}/top_level.txt +2 -0
  70. tests/test_example.py +2 -6
  71. tests/test_fields.py +16 -10
  72. tests/test_file.py +10 -9
  73. tests/test_labels.py +3 -2
  74. tests/test_structure.py +3 -1
  75. tests/test_validation.py +9 -6
  76. venv/bin/activate_this.py +38 -0
  77. venv/bin/jp.py +54 -0
  78. venv/bin/runxlrd.py +410 -0
  79. csv_detective-0.8.1.dev1674.data/data/share/csv_detective/CHANGELOG.md +0 -186
  80. csv_detective-0.8.1.dev1674.dist-info/METADATA +0 -268
  81. csv_detective-0.8.1.dev1674.dist-info/licenses/LICENSE +0 -21
  82. {csv_detective-0.8.1.dev1674.dist-info → csv_detective-0.8.1.dev1720.dist-info}/WHEEL +0 -0
  83. {csv_detective-0.8.1.dev1674.dist-info → csv_detective-0.8.1.dev1720.dist-info}/entry_points.txt +0 -0
  84. {csv_detective-0.8.1.dev1674.data/data/share/csv_detective → csv_detective-0.8.1.dev1720.dist-info/licenses}/LICENSE +0 -0
csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py
@@ -2,37 +2,37 @@ from unidecode import unidecode
 
 PROPORTION = 1
 mois = {
-    'janvier',
-    'fevrier',
-    'mars',
-    'avril',
-    'mai',
-    'juin',
-    'juillet',
-    'aout',
-    'septembre',
-    'octobre',
-    'novembre',
-    'decembre',
-    'jan',
-    'fev',
-    'mar',
-    'avr',
-    'mai',
-    'jun',
-    'jui',
-    'juil',
-    'aou',
-    'sep',
-    'sept',
-    'oct',
-    'nov',
-    'dec'
+    "janvier",
+    "fevrier",
+    "mars",
+    "avril",
+    "mai",
+    "juin",
+    "juillet",
+    "aout",
+    "septembre",
+    "octobre",
+    "novembre",
+    "decembre",
+    "jan",
+    "fev",
+    "mar",
+    "avr",
+    "mai",
+    "jun",
+    "jui",
+    "juil",
+    "aou",
+    "sep",
+    "sept",
+    "oct",
+    "nov",
+    "dec",
 }
 
 
 def _is(val):
-    '''Renvoie True si les champs peuvent être des mois de l'année'''
+    """Renvoie True si les champs peuvent être des mois de l'année"""
     if not isinstance(val, str):
         return False
     val = unidecode(val.lower())
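For context on the hunks above and below, which are mostly quote-style and line-wrapping changes: each detection module exposes a module-level PROPORTION constant (by all appearances, the share of a column's values that must pass the test) and an _is(val) predicate. A minimal sketch of the pattern, with an abridged month set and an assumed membership-test return (the hunk above is cut off before the module's return line):

from unidecode import unidecode

PROPORTION = 1  # assumed meaning: share of a column's values that must pass _is
mois = {"janvier", "fevrier", "mars", "jan", "fev", "mar"}  # abridged set


def _is(val):
    """Sketch: True if val looks like a French month name."""
    if not isinstance(val, str):
        return False
    # strip accents and lowercase, as the module above does
    return unidecode(val.lower()) in mois

With this, _is("Février") normalizes to "fevrier" and returns True.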
csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py
@@ -1,15 +1,15 @@
-from os.path import dirname, join
 import re
+from os.path import dirname, join
 
 PROPORTION = 1
 
-with open(join(dirname(__file__), 'iso_country_code_alpha2.txt'), 'r') as iofile:
-    liste_pays = iofile.read().split('\n')
+with open(join(dirname(__file__), "iso_country_code_alpha2.txt"), "r") as iofile:
+    liste_pays = iofile.read().split("\n")
 liste_pays = set(liste_pays)
 
 
 def _is(val):
-    '''Renvoie True si val peut etre un code iso pays alpha-2, False sinon'''
-    if not isinstance(val, str) or not bool(re.match(r'[A-Z]{2}$', val)):
+    """Renvoie True si val peut etre un code iso pays alpha-2, False sinon"""
+    if not isinstance(val, str) or not bool(re.match(r"[A-Z]{2}$", val)):
         return False
     return val in liste_pays
csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py
@@ -1,14 +1,14 @@
-from os.path import dirname, join
 import re
+from os.path import dirname, join
 
 PROPORTION = 1
 
-with open(join(dirname(__file__), 'iso_country_code_alpha3.txt'), 'r') as iofile:
-    liste_pays = iofile.read().split('\n')
+with open(join(dirname(__file__), "iso_country_code_alpha3.txt"), "r") as iofile:
+    liste_pays = iofile.read().split("\n")
 
 
 def _is(val):
-    '''Renvoie True si val peut etre un code iso pays alpha-3, False sinon'''
-    if not isinstance(val, str) or not bool(re.match(r'[A-Z]{3}$', val)):
+    """Renvoie True si val peut etre un code iso pays alpha-3, False sinon"""
+    if not isinstance(val, str) or not bool(re.match(r"[A-Z]{3}$", val)):
         return False
     return val in set(liste_pays)
csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py
@@ -1,15 +1,15 @@
-from os.path import dirname, join
 import re
+from os.path import dirname, join
 
 PROPORTION = 1
 
-with open(join(dirname(__file__), 'iso_country_code_numeric.txt'), 'r') as iofile:
-    liste_pays = iofile.read().split('\n')
+with open(join(dirname(__file__), "iso_country_code_numeric.txt"), "r") as iofile:
+    liste_pays = iofile.read().split("\n")
 liste_pays = set(liste_pays)
 
 
 def _is(val):
-    '''Renvoie True si val peut etre un code iso pays numerique, False sinon'''
-    if not isinstance(val, str) or not bool(re.match(r'[0-9]{3}$', val)):
+    """Renvoie True si val peut etre un code iso pays numerique, False sinon"""
+    if not isinstance(val, str) or not bool(re.match(r"[0-9]{3}$", val)):
         return False
     return val in liste_pays
csv_detective/detect_fields/geo/latitude_wgs/__init__.py
@@ -4,7 +4,7 @@ PROPORTION = 0.9
 
 
 def _is(val):
-    '''Renvoie True si val peut etre une latitude'''
+    """Renvoie True si val peut etre une latitude"""
     try:
         return is_float(val) and float(val) >= -90 and float(val) <= 90
     except ValueError:
csv_detective/detect_fields/geo/longitude_wgs/__init__.py
@@ -4,7 +4,7 @@ PROPORTION = 0.9
 
 
 def _is(val):
-    '''Renvoie True si val peut etre une longitude'''
+    """Renvoie True si val peut etre une longitude"""
     try:
         return is_float(val) and float(val) >= -180 and float(val) <= 180
     except ValueError:
csv_detective/detect_fields/other/booleen/__init__.py
@@ -23,5 +23,5 @@ def bool_casting(val: str) -> bool:
 
 
 def _is(val: str) -> bool:
-    '''Détecte les booléens'''
+    """Détecte les booléens"""
     return isinstance(val, str) and val.lower() in liste_bool
csv_detective/detect_fields/other/email/__init__.py
@@ -4,5 +4,7 @@ PROPORTION = 1
 
 
 def _is(val):
-    '''Detects e-mails'''
-    return isinstance(val, str) and bool(re.match(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}$', val))
+    """Detects e-mails"""
+    return isinstance(val, str) and bool(
+        re.match(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}$", val)
+    )
csv_detective/detect_fields/other/int/__init__.py
@@ -2,11 +2,11 @@ PROPORTION = 1
 
 
 def _is(val):
-    '''Detects integers'''
+    """Detects integers"""
     if (
         not isinstance(val, str)
-        or any([v in val for v in ['.', '_', '+']])
-        or (val.startswith('0') and len(val) > 1)
+        or any([v in val for v in [".", "_", "+"]])
+        or (val.startswith("0") and len(val) > 1)
     ):
         return False
     try:
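A note on the guards in _is above: Python's int() accepts several forms that usually should not count as plain integers in a CSV column, which is why the test rejects them before ever calling int(). A quick illustration:

# int() parses all of these, so the test filters them out up front:
print(int("1_000"))  # 1000 - underscore separators
print(int("+5"))     # 5    - explicit sign
print(int("007"))    # 7    - leading zeros, usually codes rather than numbers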
csv_detective/detect_fields/other/mongo_object_id/__init__.py
@@ -4,5 +4,5 @@ PROPORTION = 0.8
 
 
 def _is(val):
-    '''Detects Mongo ObjectIds'''
-    return isinstance(val, str) and bool(re.match(r'^[0-9a-fA-F]{24}$', val))
+    """Detects Mongo ObjectIds"""
+    return isinstance(val, str) and bool(re.match(r"^[0-9a-fA-F]{24}$", val))
csv_detective/detect_fields/other/twitter/__init__.py
@@ -4,5 +4,5 @@ PROPORTION = 1
 
 
 def _is(val):
-    '''Detects twitter accounts'''
-    return isinstance(val, str) and bool(re.match(r'^@[A-Za-z0-9_]+$', val))
+    """Detects twitter accounts"""
+    return isinstance(val, str) and bool(re.match(r"^@[A-Za-z0-9_]+$", val))
csv_detective/detect_fields/other/uuid/__init__.py
@@ -4,8 +4,7 @@ PROPORTION = 0.8
 
 
 def _is(val):
-    '''Detects UUIDs'''
-    return isinstance(val, str) and bool(re.match(
-        r'^[{]?[0-9a-fA-F]{8}' + '-?([0-9a-fA-F]{4}-?)' + '{3}[0-9a-fA-F]{12}[}]?$',
-        val
-    ))
+    """Detects UUIDs"""
+    return isinstance(val, str) and bool(
+        re.match(r"^[{]?[0-9a-fA-F]{8}" + "-?([0-9a-fA-F]{4}-?)" + "{3}[0-9a-fA-F]{12}[}]?$", val)
+    )
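The reformatted UUID pattern is easier to read once concatenated. A small demo of what it accepts, using the regex from the hunk above (the test values are illustrative):

import re

# hyphens and surrounding braces are both optional in this pattern
UUID_RE = r"^[{]?[0-9a-fA-F]{8}-?([0-9a-fA-F]{4}-?){3}[0-9a-fA-F]{12}[}]?$"

for candidate in (
    "123e4567-e89b-12d3-a456-426614174000",    # canonical form
    "{123e4567-e89b-12d3-a456-426614174000}",  # braced, Microsoft style
    "123e4567e89b12d3a456426614174000",        # compact, no hyphens
    "not-a-uuid",
):
    print(candidate, bool(re.match(UUID_RE, candidate)))  # True, True, True, False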
csv_detective/detect_fields/temp/date/__init__.py
@@ -2,7 +2,8 @@ from datetime import datetime
 from typing import Optional
 
 from dateparser import parse as date_parser
-from dateutil.parser import parse as dateutil_parser, ParserError
+from dateutil.parser import ParserError
+from dateutil.parser import parse as dateutil_parser
 
 PROPORTION = 1
 # /!\ this is only for dates, not datetimes which are handled by other utils
@@ -22,7 +23,7 @@ threshold = 0.3
 
 
 def _is(val):
-    '''Renvoie True si val peut être une date, False sinon'''
+    """Renvoie True si val peut être une date, False sinon"""
     # early stops, to cut processing time
     if not isinstance(val, str) or len(val) > 20 or len(val) < 8:
         return False
csv_detective/detect_fields/temp/datetime_rfc822/__init__.py
@@ -4,15 +4,15 @@ PROPORTION = 1
 
 
 def _is(val):
-    '''Renvoie True si val peut être une date au format rfc822, False sinon
-    Exemple: Tue, 19 Dec 2023 15:30:45 +0000'''
+    """Renvoie True si val peut être une date au format rfc822, False sinon
+    Exemple: Tue, 19 Dec 2023 15:30:45 +0000"""
 
     return isinstance(val, str) and bool(
         re.match(
-            r'^[A-Za-z]{3}, (0[1-9]|[1-2][0-9]|3[01]) [A-Za-z]{3} \d{4} '
-            r'([0-2])([0-9]):([0-5])([0-9]):([0-5])([0-9]) '
-            r'(ut|gmt|est|edt|cst|cdt|mst|mdt|pst|pdt|[+\-](0[0-9]|1[0-3])00)$',
+            r"^[A-Za-z]{3}, (0[1-9]|[1-2][0-9]|3[01]) [A-Za-z]{3} \d{4} "
+            r"([0-2])([0-9]):([0-5])([0-9]):([0-5])([0-9]) "
+            r"(ut|gmt|est|edt|cst|cdt|mst|mdt|pst|pdt|[+\-](0[0-9]|1[0-3])00)$",
             val.lower(),
-            re.IGNORECASE
+            re.IGNORECASE,
         )
     )
csv_detective/detect_fields/temp/year/__init__.py
@@ -2,7 +2,7 @@ PROPORTION = 1
 
 
 def _is(val):
-    '''Returns True if val can be a year'''
+    """Returns True if val can be a year"""
     try:
         val = int(val)
     except ValueError:
csv_detective/detect_labels/FR/other/tel_fr/__init__.py
@@ -4,7 +4,6 @@ PROPORTION = 0.5
 
 
 def _is(header: str) -> float:
-
     words_combinations_list = [
         "telephone",
         "tel",
csv_detective/detect_labels/geo/lonlat_wgs/__init__.py
@@ -1,4 +1,5 @@
 from csv_detective.parsing.text import header_score
+
 from ..latlon_wgs import COMMON_COORDS_LABELS
 
 PROPORTION = 0.5
csv_detective/detect_labels/other/mongo_object_id/__init__.py
@@ -4,5 +4,5 @@ PROPORTION = 0.5
 
 
 def _is(header: str) -> float:
-    words_combinations_list = ['id', 'objectid']
+    words_combinations_list = ["id", "objectid"]
     return header_score(header, words_combinations_list)
csv_detective/detection/columns.py
@@ -1,6 +1,6 @@
 import logging
-from typing import TextIO
 from time import time
+from typing import TextIO
 
 from csv_detective.utils import display_logs_depending_process_time
 
@@ -47,19 +47,21 @@ def detect_heading_columns(file: TextIO, sep: str, verbose: bool = False) -> int
     if return_int == 0:
         if verbose:
             display_logs_depending_process_time(
-                f'No heading column detected in {round(time() - start, 3)}s',
+                f"No heading column detected in {round(time() - start, 3)}s",
                 time() - start,
             )
         return 0
     if verbose:
         display_logs_depending_process_time(
-            f'{return_int} heading columns detected in {round(time() - start, 3)}s',
+            f"{return_int} heading columns detected in {round(time() - start, 3)}s",
            time() - start,
         )
     return return_int
 
 
-def detect_trailing_columns(file: TextIO, sep: str, heading_columns: int, verbose: bool = False) -> int:
+def detect_trailing_columns(
+    file: TextIO, sep: str, heading_columns: int, verbose: bool = False
+) -> int:
     """Tests first 10 lines to see if there are empty trailing columns"""
     if verbose:
         start = time()
@@ -70,20 +72,18 @@ def detect_trailing_columns(file: TextIO, sep: str, heading_columns: int, verbos
         line = file.readline()
         return_int = min(
             return_int,
-            len(line.replace("\n", ""))
-            - len(line.replace("\n", "").strip(sep))
-            - heading_columns,
+            len(line.replace("\n", "")) - len(line.replace("\n", "").strip(sep)) - heading_columns,
         )
     if return_int == 0:
         if verbose:
             display_logs_depending_process_time(
-                f'No trailing column detected in {round(time() - start, 3)}s',
+                f"No trailing column detected in {round(time() - start, 3)}s",
                 time() - start,
            )
         return 0
    if verbose:
        display_logs_depending_process_time(
-            f'{return_int} trailing columns detected in {round(time() - start, 3)}s',
+            f"{return_int} trailing columns detected in {round(time() - start, 3)}s",
            time() - start,
        )
    return return_int
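The trailing-column arithmetic that was joined onto one line above is compact; here is a worked example of what it computes (a sketch, not the package's own code):

# str.strip(sep) removes the separator from BOTH ends, so the length
# difference counts leading plus trailing separators; the already-known
# heading columns are then subtracted to keep only the trailing ones.
sep, heading_columns = ";", 1
line = ";a;b;;\n"
stripped = line.replace("\n", "")  # ";a;b;;"
print(len(stripped) - len(stripped.strip(sep)) - heading_columns)  # 2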
csv_detective/detection/encoding.py
@@ -1,6 +1,6 @@
 import logging
-from time import time
 from io import BytesIO
+from time import time
 
 from cchardet import detect
 
@@ -16,12 +16,14 @@ def detect_encoding(binary_file: BytesIO, verbose: bool = False) -> str:
         logging.info("Detecting encoding")
     encoding_dict = detect(binary_file.read())
     if not encoding_dict["encoding"]:
-        raise ValueError("Could not detect the file's encoding. Consider specifying it in the routine call.")
+        raise ValueError(
+            "Could not detect the file's encoding. Consider specifying it in the routine call."
+        )
     if verbose:
         message = f'Detected encoding: "{encoding_dict["encoding"]}"'
-        message += f' in {round(time() - start, 3)}s (confidence: {round(encoding_dict["confidence"]*100)}%)'
+        message += f" in {round(time() - start, 3)}s (confidence: {round(encoding_dict['confidence'] * 100)}%)"
         display_logs_depending_process_time(
             message,
             time() - start,
         )
-    return encoding_dict['encoding']
+    return encoding_dict["encoding"]
csv_detective/detection/engine.py
@@ -22,11 +22,11 @@ def detect_engine(file_path: str, verbose=False) -> Optional[str]:
     mapping = {
         "application/gzip": "gzip",
         "application/x-gzip": "gzip",
-        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'openpyxl',
-        'application/vnd.ms-excel': 'xlrd',
-        'application/vnd.oasis.opendocument.spreadsheet': 'odf',
+        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "openpyxl",
+        "application/vnd.ms-excel": "xlrd",
+        "application/vnd.oasis.opendocument.spreadsheet": "odf",
         # all these files could be recognized as zip, may need to check all cases then
-        'application/zip': 'openpyxl',
+        "application/zip": "openpyxl",
     }
     # if none of the above, we move forwards with the csv process
     if is_url(file_path):
@@ -37,7 +37,8 @@ def detect_engine(file_path: str, verbose=False) -> Optional[str]:
     if verbose:
         message = (
             f"File is not csv, detected {engine_to_file.get(engine, 'csv')}"
-            if engine else "Processing the file as a csv"
+            if engine
+            else "Processing the file as a csv"
         )
         display_logs_depending_process_time(
             message,
csv_detective/detection/formats.py
@@ -1,16 +1,17 @@
-from collections import defaultdict
 import logging
+from collections import defaultdict
 from typing import Union
 
 import numpy as np
 import pandas as pd
+
 from csv_detective.detection.variables import (
     detect_categorical_variable,
     # detect_continuous_variable,
 )
 from csv_detective.load_tests import return_all_tests
 from csv_detective.output.utils import prepare_output_dict
-from csv_detective.parsing.columns import test_col, test_label, MAX_ROWS_ANALYSIS
+from csv_detective.parsing.columns import MAX_ROWS_ANALYSIS, test_col, test_label
 from csv_detective.validate import validate
 
 
@@ -42,10 +43,12 @@ def detect_formats(
     # detect_continuous_variable(table.iloc[:, ~categorical_mask.values], verbose=verbose)
     # )
 
-    analysis.update({
-        "categorical": res_categorical,
-        # "continuous": res_continuous,
-    })
+    analysis.update(
+        {
+            "categorical": res_categorical,
+            # "continuous": res_continuous,
+        }
+    )
 
     # list testing to be performed
     all_tests_fields = return_all_tests(
@@ -60,7 +63,9 @@ def detect_formats(
         return analysis
 
     # Perform testing on fields
-    scores_table_fields = test_col(table, all_tests_fields, limited_output, skipna=skipna, verbose=verbose)
+    scores_table_fields = test_col(
+        table, all_tests_fields, limited_output, skipna=skipna, verbose=verbose
+    )
     analysis["columns_fields"] = prepare_output_dict(scores_table_fields, limited_output)
 
     # Perform testing on labels
@@ -71,16 +76,14 @@ def detect_formats(
     # This is because the fields are more important than the labels and yields a max
     # of 1.5 for the final score.
     scores_table = scores_table_fields * (
-        1
-        + scores_table_labels.reindex(
-            index=scores_table_fields.index, fill_value=0
-        ).values / 2
+        1 + scores_table_labels.reindex(index=scores_table_fields.index, fill_value=0).values / 2
     )
 
     # To reduce false positives: ensure these formats are detected only if the label yields
     # a detection (skipping the ones that have been excluded by the users).
     formats_with_mandatory_label = [
-        f for f in [
+        f
+        for f in [
             "code_departement",
             "code_commune_insee",
             "code_postal",
@@ -90,7 +93,8 @@ def detect_formats(
             "longitude_wgs_fr_metropole",
             "latitude_l93",
             "longitude_l93",
-        ] if f in scores_table.index
+        ]
+        if f in scores_table.index
     ]
     scores_table.loc[formats_with_mandatory_label, :] = np.where(
         scores_table_labels.loc[formats_with_mandatory_label, :],
@@ -123,9 +127,7 @@ def detect_formats(
         analysis[detection_method] = {
             col_name: [
                 {
-                    "python_type": metier_to_python_type.get(
-                        detection["format"], "string"
-                    ),
+                    "python_type": metier_to_python_type.get(detection["format"], "string"),
                     **detection,
                 }
                 for detection in detections
@@ -136,9 +138,7 @@ def detect_formats(
         for detection_method in ["columns_fields", "columns_labels", "columns"]:
             analysis[detection_method] = {
                 col_name: {
-                    "python_type": metier_to_python_type.get(
-                        detection["format"], "string"
-                    ),
+                    "python_type": metier_to_python_type.get(detection["format"], "string"),
                     **detection,
                 }
                 for col_name, detection in analysis[detection_method].items()
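The scoring reformat above keeps the logic the comments describe: each field score is multiplied by one plus half the label score, so field detection dominates and the combined score tops out at 1.5. The arithmetic, worked through (a sketch of the formula, not the package API):

field_score, label_score = 1.0, 1.0
print(field_score * (1 + label_score / 2))  # 1.5, the documented maximum
print(field_score * (1 + 0.0 / 2))          # 1.0, unchanged when the label misses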
csv_detective/detection/headers.py
@@ -15,18 +15,16 @@ def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int,
         header = file.readline()
         position = file.tell()
         chaine = [c for c in header.replace("\n", "").split(sep) if c]
-        if chaine[-1] not in ["", "\n"] and all(
-            [mot not in ["", "\n"] for mot in chaine[1:-1]]
-        ):
+        if chaine[-1] not in ["", "\n"] and all([mot not in ["", "\n"] for mot in chaine[1:-1]]):
             next_row = file.readline()
             file.seek(position)
             if header != next_row:
                 if verbose:
                     display_logs_depending_process_time(
-                        f'Detected headers in {round(time() - start, 3)}s',
+                        f"Detected headers in {round(time() - start, 3)}s",
                         time() - start,
                     )
                 return i, chaine
     if verbose:
-        logging.info('No header detected')
+        logging.info("No header detected")
     return 0, None
csv_detective/detection/rows.py
@@ -5,7 +5,7 @@ def remove_empty_first_rows(table: pd.DataFrame) -> tuple[pd.DataFrame, int]:
     """Analog process to detect_headers for csv files, determines how many rows to skip
     to end up with the header at the right place"""
     idx = 0
-    if all([str(c).startswith('Unnamed:') for c in table.columns]):
+    if all([str(c).startswith("Unnamed:") for c in table.columns]):
         # there is on offset between the index in the file (idx here)
         # and the index in the dataframe, because of the header
         idx = 1
csv_detective/detection/variables.py
@@ -7,7 +7,9 @@ import pandas as pd
 from csv_detective.utils import display_logs_depending_process_time
 
 
-def detect_continuous_variable(table: pd.DataFrame, continuous_th: float = 0.9, verbose: bool = False):
+def detect_continuous_variable(
+    table: pd.DataFrame, continuous_th: float = 0.9, verbose: bool = False
+):
     """
     Detects whether a column contains continuous variables. We consider a continuous column
     one that contains a considerable amount of float values.
@@ -41,9 +43,7 @@ def detect_continuous_variable(table: pd.DataFrame, continuous_th: float = 0.9,
     if verbose:
         start = time()
         logging.info("Detecting continuous columns")
-    res = table.apply(
-        lambda serie: check_threshold(serie.apply(parses_to_integer), continuous_th)
-    )
+    res = table.apply(lambda serie: check_threshold(serie.apply(parses_to_integer), continuous_th))
     if verbose:
         display_logs_depending_process_time(
             f"Detected {sum(res)} continuous columns in {round(time() - start, 3)}s",
csv_detective/explore_csv.py
@@ -55,7 +55,10 @@ def routine(
         dict: a dict with information about the csv and possible types for each column
     """
 
-    if not (isinstance(save_results, bool) or (isinstance(save_results, str) and save_results.endswith(".json"))):
+    if not (
+        isinstance(save_results, bool)
+        or (isinstance(save_results, str) and save_results.endswith(".json"))
+    ):
         raise ValueError("`save_results` must be a bool or a valid path to a json file.")
 
     if verbose:
@@ -100,8 +103,7 @@ def routine(
     finally:
         if verbose:
             display_logs_depending_process_time(
-                f"Routine completed in {round(time() - start_routine, 3)}s",
-                time() - start_routine
+                f"Routine completed in {round(time() - start_routine, 3)}s", time() - start_routine
             )
 
 
@@ -119,7 +121,6 @@ def validate_then_detect(
     cast_json: bool = True,
     verbose: bool = False,
 ):
-
     if verbose:
         start_routine = time()
     if is_url(file_path):
@@ -170,8 +171,7 @@ def validate_then_detect(
     finally:
         if verbose:
             display_logs_depending_process_time(
-                f"Process completed in {round(time() - start_routine, 3)}s",
-                time() - start_routine
+                f"Process completed in {round(time() - start_routine, 3)}s", time() - start_routine
             )
 
 
@@ -226,8 +226,7 @@ def routine_minio(
     if location_dict is not None:
         if any(
             [
-                (location_key not in location_dict)
-                or (location_dict[location_key] is None)
+                (location_key not in location_dict) or (location_dict[location_key] is None)
                 for location_key in ["netloc", "bucket", "key"]
             ]
         ):
csv_detective/load_tests.py
@@ -12,10 +12,7 @@ def get_all_packages(detect_type) -> list:
         for filename in filenames:
             file = os.path.join(dirpath, filename).replace(root_dir, "")
             if file.endswith("__init__.py"):
-                module = (
-                    file.replace("__init__.py", "")
-                    .replace("/", ".").replace("\\", ".")[:-1]
-                )
+                module = file.replace("__init__.py", "").replace("/", ".").replace("\\", ".")[:-1]
                 if module:
                     modules.append(detect_type + module)
     return modules
@@ -43,20 +40,15 @@ def return_all_tests(
     if "ALL" in user_input_tests or all(x[0] == "-" for x in user_input_tests):
         tests_to_do = [detect_type]
     else:
-        tests_to_do = [
-            f"{detect_type}.{x}" for x in user_input_tests if x[0] != "-"
-        ]
-        tests_skipped = [
-            f"{detect_type}.{x[1:]}" for x in user_input_tests if x[0] == "-"
-        ]
+        tests_to_do = [f"{detect_type}.{x}" for x in user_input_tests if x[0] != "-"]
+        tests_skipped = [f"{detect_type}.{x[1:]}" for x in user_input_tests if x[0] == "-"]
     all_tests = [
         # this is why we need to import detect_fields/labels
-        eval(x) for x in all_packages
+        eval(x)
+        for x in all_packages
        if any([y == x[: len(y)] for y in tests_to_do])
        and all([y != x[: len(y)] for y in tests_skipped])
     ]
     # to remove groups of tests
-    all_tests = [
-        test for test in all_tests if "_is" in dir(test)
-    ]
+    all_tests = [test for test in all_tests if "_is" in dir(test)]
     return all_tests
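For readers tracing the reformatted comprehension in return_all_tests: tests_to_do and tests_skipped hold dotted module-path prefixes, and y == x[: len(y)] is simply a prefix test. A sketch with hypothetical module paths (the real list comes from get_all_packages):

# hypothetical paths standing in for get_all_packages() output
all_packages = [
    "detect_fields.FR.geo.commune",
    "detect_fields.temp.year",
    "detect_fields.other.email",
]
tests_to_do = ["detect_fields"]         # e.g. user requested "ALL"
tests_skipped = ["detect_fields.temp"]  # e.g. user passed "-temp"

kept = [
    x
    for x in all_packages
    if any(x.startswith(y) for y in tests_to_do)
    and all(not x.startswith(y) for y in tests_skipped)
]
print(kept)  # ['detect_fields.FR.geo.commune', 'detect_fields.other.email']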