csv-detective 0.8.1.dev1703__py3-none-any.whl → 0.8.1.dev1729__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. csv_detective/cli.py +6 -9
  2. csv_detective/detect_fields/FR/geo/adresse/__init__.py +78 -78
  3. csv_detective/detect_fields/FR/geo/code_departement/__init__.py +2 -2
  4. csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -1
  5. csv_detective/detect_fields/FR/geo/code_region/__init__.py +1 -1
  6. csv_detective/detect_fields/FR/geo/commune/__init__.py +2 -2
  7. csv_detective/detect_fields/FR/geo/departement/__init__.py +2 -2
  8. csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +2 -2
  9. csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +1 -2
  10. csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +1 -1
  11. csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +1 -2
  12. csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +1 -1
  13. csv_detective/detect_fields/FR/geo/pays/__init__.py +6 -6
  14. csv_detective/detect_fields/FR/geo/region/__init__.py +6 -4
  15. csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +15 -14
  16. csv_detective/detect_fields/FR/other/csp_insee/__init__.py +4 -3
  17. csv_detective/detect_fields/FR/other/date_fr/__init__.py +3 -3
  18. csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +4 -3
  19. csv_detective/detect_fields/FR/other/sexe/__init__.py +2 -2
  20. csv_detective/detect_fields/FR/other/siren/__init__.py +3 -3
  21. csv_detective/detect_fields/FR/other/siret/__init__.py +3 -3
  22. csv_detective/detect_fields/FR/other/tel_fr/__init__.py +3 -3
  23. csv_detective/detect_fields/FR/other/uai/__init__.py +2 -2
  24. csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +15 -15
  25. csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +27 -27
  26. csv_detective/detect_fields/__init__.py +94 -43
  27. csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +5 -5
  28. csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +5 -5
  29. csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +5 -5
  30. csv_detective/detect_fields/geo/latitude_wgs/__init__.py +1 -1
  31. csv_detective/detect_fields/geo/longitude_wgs/__init__.py +1 -1
  32. csv_detective/detect_fields/other/booleen/__init__.py +1 -1
  33. csv_detective/detect_fields/other/email/__init__.py +4 -2
  34. csv_detective/detect_fields/other/int/__init__.py +3 -3
  35. csv_detective/detect_fields/other/mongo_object_id/__init__.py +2 -2
  36. csv_detective/detect_fields/other/twitter/__init__.py +2 -2
  37. csv_detective/detect_fields/other/uuid/__init__.py +4 -5
  38. csv_detective/detect_fields/temp/date/__init__.py +3 -2
  39. csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +6 -6
  40. csv_detective/detect_fields/temp/year/__init__.py +1 -1
  41. csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -1
  42. csv_detective/detect_labels/__init__.py +51 -1
  43. csv_detective/detect_labels/geo/lonlat_wgs/__init__.py +1 -0
  44. csv_detective/detect_labels/other/mongo_object_id/__init__.py +1 -1
  45. csv_detective/detection/columns.py +9 -9
  46. csv_detective/detection/encoding.py +6 -4
  47. csv_detective/detection/engine.py +6 -5
  48. csv_detective/detection/formats.py +19 -19
  49. csv_detective/detection/headers.py +3 -5
  50. csv_detective/detection/rows.py +1 -1
  51. csv_detective/detection/variables.py +6 -7
  52. csv_detective/explore_csv.py +7 -8
  53. csv_detective/load_tests.py +7 -16
  54. csv_detective/output/__init__.py +3 -7
  55. csv_detective/output/dataframe.py +9 -5
  56. csv_detective/output/example.py +13 -13
  57. csv_detective/output/profile.py +30 -23
  58. csv_detective/output/schema.py +20 -23
  59. csv_detective/output/utils.py +15 -15
  60. csv_detective/parsing/columns.py +23 -12
  61. csv_detective/parsing/csv.py +1 -1
  62. csv_detective/parsing/excel.py +10 -11
  63. csv_detective/parsing/load.py +11 -8
  64. csv_detective/parsing/text.py +4 -9
  65. csv_detective/s3_utils.py +3 -7
  66. csv_detective/utils.py +4 -2
  67. csv_detective/validate.py +18 -13
  68. {csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/METADATA +12 -2
  69. {csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/RECORD +79 -79
  70. tests/test_example.py +2 -6
  71. tests/test_fields.py +16 -10
  72. tests/test_file.py +10 -9
  73. tests/test_labels.py +3 -2
  74. tests/test_structure.py +4 -3
  75. tests/test_validation.py +9 -6
  76. {csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/WHEEL +0 -0
  77. {csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/entry_points.txt +0 -0
  78. {csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/licenses/LICENSE +0 -0
  79. {csv_detective-0.8.1.dev1703.dist-info → csv_detective-0.8.1.dev1729.dist-info}/top_level.txt +0 -0
@@ -2,37 +2,37 @@ from unidecode import unidecode
2
2
 
3
3
  PROPORTION = 1
4
4
  mois = {
5
- 'janvier',
6
- 'fevrier',
7
- 'mars',
8
- 'avril',
9
- 'mai',
10
- 'juin',
11
- 'juillet',
12
- 'aout',
13
- 'septembre',
14
- 'octobre',
15
- 'novembre',
16
- 'decembre',
17
- 'jan',
18
- 'fev',
19
- 'mar',
20
- 'avr',
21
- 'mai',
22
- 'jun',
23
- 'jui',
24
- 'juil',
25
- 'aou',
26
- 'sep',
27
- 'sept',
28
- 'oct',
29
- 'nov',
30
- 'dec'
5
+ "janvier",
6
+ "fevrier",
7
+ "mars",
8
+ "avril",
9
+ "mai",
10
+ "juin",
11
+ "juillet",
12
+ "aout",
13
+ "septembre",
14
+ "octobre",
15
+ "novembre",
16
+ "decembre",
17
+ "jan",
18
+ "fev",
19
+ "mar",
20
+ "avr",
21
+ "mai",
22
+ "jun",
23
+ "jui",
24
+ "juil",
25
+ "aou",
26
+ "sep",
27
+ "sept",
28
+ "oct",
29
+ "nov",
30
+ "dec",
31
31
  }
32
32
 
33
33
 
34
34
  def _is(val):
35
- '''Renvoie True si les champs peuvent être des mois de l'année'''
35
+ """Renvoie True si les champs peuvent être des mois de l'année"""
36
36
  if not isinstance(val, str):
37
37
  return False
38
38
  val = unidecode(val.lower())
@@ -1,61 +1,112 @@
1
- # flake8: noqa
2
- from .FR.other import (
3
- code_csp_insee,
4
- csp_insee,
5
- sexe,
6
- siren,
7
- tel_fr,
8
- uai,
9
- siret,
10
- insee_ape700,
11
- date_fr,
12
- code_import,
13
- code_waldec,
14
- code_rna,
15
- )
16
-
17
- from .other import (
18
- email,
19
- url,
20
- booleen,
21
- money,
22
- mongo_object_id,
23
- percent,
24
- twitter,
25
- float,
26
- int,
27
- uuid,
28
- json,
29
- )
30
-
31
1
  from .FR.geo import (
32
2
  adresse,
33
3
  code_commune_insee,
34
- code_postal,
35
- commune,
36
- departement,
37
- pays,
38
- region,
39
4
  code_departement,
40
5
  code_fantoir,
41
- longitude_wgs_fr_metropole,
42
- latitude_wgs_fr_metropole,
6
+ code_postal,
43
7
  code_region,
8
+ commune,
9
+ departement,
10
+ insee_canton,
44
11
  latitude_l93,
12
+ latitude_wgs_fr_metropole,
45
13
  longitude_l93,
46
- insee_canton,
14
+ longitude_wgs_fr_metropole,
15
+ pays,
16
+ region,
47
17
  )
48
-
18
+ from .FR.other import (
19
+ code_csp_insee,
20
+ code_import,
21
+ code_rna,
22
+ code_waldec,
23
+ csp_insee,
24
+ date_fr,
25
+ insee_ape700,
26
+ sexe,
27
+ siren,
28
+ siret,
29
+ tel_fr,
30
+ uai,
31
+ )
32
+ from .FR.temp import jour_de_la_semaine, mois_de_annee
49
33
  from .geo import (
50
34
  iso_country_code_alpha2,
51
35
  iso_country_code_alpha3,
52
36
  iso_country_code_numeric,
37
+ json_geojson,
53
38
  latitude_wgs,
54
- longitude_wgs,
55
39
  latlon_wgs,
40
+ longitude_wgs,
56
41
  lonlat_wgs,
57
- json_geojson,
58
42
  )
43
+ from .other import (
44
+ booleen,
45
+ email,
46
+ float,
47
+ int,
48
+ json,
49
+ money,
50
+ mongo_object_id,
51
+ percent,
52
+ twitter,
53
+ url,
54
+ uuid,
55
+ )
56
+ from .temp import date, datetime_aware, datetime_naive, datetime_rfc822, year
59
57
 
60
- from .FR.temp import jour_de_la_semaine, mois_de_annee
61
- from .temp import year, date, datetime_aware, datetime_naive, datetime_rfc822
58
+ __all__ = [
59
+ "adresse",
60
+ "code_commune_insee",
61
+ "code_departement",
62
+ "code_fantoir",
63
+ "code_postal",
64
+ "code_region",
65
+ "commune",
66
+ "departement",
67
+ "insee_canton",
68
+ "latitude_l93",
69
+ "latitude_wgs_fr_metropole",
70
+ "longitude_l93",
71
+ "longitude_wgs_fr_metropole",
72
+ "pays",
73
+ "region",
74
+ "code_csp_insee",
75
+ "code_import",
76
+ "code_rna",
77
+ "code_waldec",
78
+ "csp_insee",
79
+ "date_fr",
80
+ "insee_ape700",
81
+ "sexe",
82
+ "siren",
83
+ "siret",
84
+ "tel_fr",
85
+ "uai",
86
+ "jour_de_la_semaine",
87
+ "mois_de_annee",
88
+ "iso_country_code_alpha2",
89
+ "iso_country_code_alpha3",
90
+ "iso_country_code_numeric",
91
+ "json_geojson",
92
+ "latitude_wgs",
93
+ "latlon_wgs",
94
+ "longitude_wgs",
95
+ "lonlat_wgs",
96
+ "booleen",
97
+ "email",
98
+ "float",
99
+ "int",
100
+ "json",
101
+ "money",
102
+ "mongo_object_id",
103
+ "percent",
104
+ "twitter",
105
+ "url",
106
+ "uuid",
107
+ "date",
108
+ "datetime_aware",
109
+ "datetime_naive",
110
+ "datetime_rfc822",
111
+ "year",
112
+ ]
@@ -1,15 +1,15 @@
1
- from os.path import dirname, join
2
1
  import re
2
+ from os.path import dirname, join
3
3
 
4
4
  PROPORTION = 1
5
5
 
6
- with open(join(dirname(__file__), 'iso_country_code_alpha2.txt'), 'r') as iofile:
7
- liste_pays = iofile.read().split('\n')
6
+ with open(join(dirname(__file__), "iso_country_code_alpha2.txt"), "r") as iofile:
7
+ liste_pays = iofile.read().split("\n")
8
8
  liste_pays = set(liste_pays)
9
9
 
10
10
 
11
11
  def _is(val):
12
- '''Renvoie True si val peut etre un code iso pays alpha-2, False sinon'''
13
- if not isinstance(val, str) or not bool(re.match(r'[A-Z]{2}$', val)):
12
+ """Renvoie True si val peut etre un code iso pays alpha-2, False sinon"""
13
+ if not isinstance(val, str) or not bool(re.match(r"[A-Z]{2}$", val)):
14
14
  return False
15
15
  return val in liste_pays
@@ -1,14 +1,14 @@
1
- from os.path import dirname, join
2
1
  import re
2
+ from os.path import dirname, join
3
3
 
4
4
  PROPORTION = 1
5
5
 
6
- with open(join(dirname(__file__), 'iso_country_code_alpha3.txt'), 'r') as iofile:
7
- liste_pays = iofile.read().split('\n')
6
+ with open(join(dirname(__file__), "iso_country_code_alpha3.txt"), "r") as iofile:
7
+ liste_pays = iofile.read().split("\n")
8
8
 
9
9
 
10
10
  def _is(val):
11
- '''Renvoie True si val peut etre un code iso pays alpha-3, False sinon'''
12
- if not isinstance(val, str) or not bool(re.match(r'[A-Z]{3}$', val)):
11
+ """Renvoie True si val peut etre un code iso pays alpha-3, False sinon"""
12
+ if not isinstance(val, str) or not bool(re.match(r"[A-Z]{3}$", val)):
13
13
  return False
14
14
  return val in set(liste_pays)
@@ -1,15 +1,15 @@
1
- from os.path import dirname, join
2
1
  import re
2
+ from os.path import dirname, join
3
3
 
4
4
  PROPORTION = 1
5
5
 
6
- with open(join(dirname(__file__), 'iso_country_code_numeric.txt'), 'r') as iofile:
7
- liste_pays = iofile.read().split('\n')
6
+ with open(join(dirname(__file__), "iso_country_code_numeric.txt"), "r") as iofile:
7
+ liste_pays = iofile.read().split("\n")
8
8
  liste_pays = set(liste_pays)
9
9
 
10
10
 
11
11
  def _is(val):
12
- '''Renvoie True si val peut etre un code iso pays numerique, False sinon'''
13
- if not isinstance(val, str) or not bool(re.match(r'[0-9]{3}$', val)):
12
+ """Renvoie True si val peut etre un code iso pays numerique, False sinon"""
13
+ if not isinstance(val, str) or not bool(re.match(r"[0-9]{3}$", val)):
14
14
  return False
15
15
  return val in liste_pays
@@ -4,7 +4,7 @@ PROPORTION = 0.9
4
4
 
5
5
 
6
6
  def _is(val):
7
- '''Renvoie True si val peut etre une latitude'''
7
+ """Renvoie True si val peut etre une latitude"""
8
8
  try:
9
9
  return is_float(val) and float(val) >= -90 and float(val) <= 90
10
10
  except ValueError:
@@ -4,7 +4,7 @@ PROPORTION = 0.9
4
4
 
5
5
 
6
6
  def _is(val):
7
- '''Renvoie True si val peut etre une longitude'''
7
+ """Renvoie True si val peut etre une longitude"""
8
8
  try:
9
9
  return is_float(val) and float(val) >= -180 and float(val) <= 180
10
10
  except ValueError:
@@ -23,5 +23,5 @@ def bool_casting(val: str) -> bool:
23
23
 
24
24
 
25
25
  def _is(val: str) -> bool:
26
- '''Détecte les booléens'''
26
+ """Détecte les booléens"""
27
27
  return isinstance(val, str) and val.lower() in liste_bool
@@ -4,5 +4,7 @@ PROPORTION = 1
4
4
 
5
5
 
6
6
  def _is(val):
7
- '''Detects e-mails'''
8
- return isinstance(val, str) and bool(re.match(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}$', val))
7
+ """Detects e-mails"""
8
+ return isinstance(val, str) and bool(
9
+ re.match(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}$", val)
10
+ )
@@ -2,11 +2,11 @@ PROPORTION = 1
2
2
 
3
3
 
4
4
  def _is(val):
5
- '''Detects integers'''
5
+ """Detects integers"""
6
6
  if (
7
7
  not isinstance(val, str)
8
- or any([v in val for v in ['.', '_', '+']])
9
- or (val.startswith('0') and len(val) > 1)
8
+ or any([v in val for v in [".", "_", "+"]])
9
+ or (val.startswith("0") and len(val) > 1)
10
10
  ):
11
11
  return False
12
12
  try:
@@ -4,5 +4,5 @@ PROPORTION = 0.8
4
4
 
5
5
 
6
6
  def _is(val):
7
- '''Detects Mongo ObjectIds'''
8
- return isinstance(val, str) and bool(re.match(r'^[0-9a-fA-F]{24}$', val))
7
+ """Detects Mongo ObjectIds"""
8
+ return isinstance(val, str) and bool(re.match(r"^[0-9a-fA-F]{24}$", val))
@@ -4,5 +4,5 @@ PROPORTION = 1
4
4
 
5
5
 
6
6
  def _is(val):
7
- '''Detects twitter accounts'''
8
- return isinstance(val, str) and bool(re.match(r'^@[A-Za-z0-9_]+$', val))
7
+ """Detects twitter accounts"""
8
+ return isinstance(val, str) and bool(re.match(r"^@[A-Za-z0-9_]+$", val))
@@ -4,8 +4,7 @@ PROPORTION = 0.8
4
4
 
5
5
 
6
6
  def _is(val):
7
- '''Detects UUIDs'''
8
- return isinstance(val, str) and bool(re.match(
9
- r'^[{]?[0-9a-fA-F]{8}' + '-?([0-9a-fA-F]{4}-?)' + '{3}[0-9a-fA-F]{12}[}]?$',
10
- val
11
- ))
7
+ """Detects UUIDs"""
8
+ return isinstance(val, str) and bool(
9
+ re.match(r"^[{]?[0-9a-fA-F]{8}" + "-?([0-9a-fA-F]{4}-?)" + "{3}[0-9a-fA-F]{12}[}]?$", val)
10
+ )
@@ -2,7 +2,8 @@ from datetime import datetime
2
2
  from typing import Optional
3
3
 
4
4
  from dateparser import parse as date_parser
5
- from dateutil.parser import parse as dateutil_parser, ParserError
5
+ from dateutil.parser import ParserError
6
+ from dateutil.parser import parse as dateutil_parser
6
7
 
7
8
  PROPORTION = 1
8
9
  # /!\ this is only for dates, not datetimes which are handled by other utils
@@ -22,7 +23,7 @@ threshold = 0.3
22
23
 
23
24
 
24
25
  def _is(val):
25
- '''Renvoie True si val peut être une date, False sinon'''
26
+ """Renvoie True si val peut être une date, False sinon"""
26
27
  # early stops, to cut processing time
27
28
  if not isinstance(val, str) or len(val) > 20 or len(val) < 8:
28
29
  return False
@@ -4,15 +4,15 @@ PROPORTION = 1
4
4
 
5
5
 
6
6
  def _is(val):
7
- '''Renvoie True si val peut être une date au format rfc822, False sinon
8
- Exemple: Tue, 19 Dec 2023 15:30:45 +0000'''
7
+ """Renvoie True si val peut être une date au format rfc822, False sinon
8
+ Exemple: Tue, 19 Dec 2023 15:30:45 +0000"""
9
9
 
10
10
  return isinstance(val, str) and bool(
11
11
  re.match(
12
- r'^[A-Za-z]{3}, (0[1-9]|[1-2][0-9]|3[01]) [A-Za-z]{3} \d{4} '
13
- r'([0-2])([0-9]):([0-5])([0-9]):([0-5])([0-9]) '
14
- r'(ut|gmt|est|edt|cst|cdt|mst|mdt|pst|pdt|[+\-](0[0-9]|1[0-3])00)$',
12
+ r"^[A-Za-z]{3}, (0[1-9]|[1-2][0-9]|3[01]) [A-Za-z]{3} \d{4} "
13
+ r"([0-2])([0-9]):([0-5])([0-9]):([0-5])([0-9]) "
14
+ r"(ut|gmt|est|edt|cst|cdt|mst|mdt|pst|pdt|[+\-](0[0-9]|1[0-3])00)$",
15
15
  val.lower(),
16
- re.IGNORECASE
16
+ re.IGNORECASE,
17
17
  )
18
18
  )
@@ -2,7 +2,7 @@ PROPORTION = 1
2
2
 
3
3
 
4
4
  def _is(val):
5
- '''Returns True if val can be a year'''
5
+ """Returns True if val can be a year"""
6
6
  try:
7
7
  val = int(val)
8
8
  except ValueError:
@@ -4,7 +4,6 @@ PROPORTION = 0.5
4
4
 
5
5
 
6
6
  def _is(header: str) -> float:
7
-
8
7
  words_combinations_list = [
9
8
  "telephone",
10
9
  "tel",
@@ -1,4 +1,3 @@
1
- # flake8: noqa
2
1
  from .FR.geo import (
3
2
  adresse,
4
3
  code_commune_insee,
@@ -42,3 +41,54 @@ from .geo import (
42
41
  )
43
42
  from .other import booleen, email, float, int, money, mongo_object_id, twitter, url, uuid
44
43
  from .temp import date, datetime_rfc822, year
44
+
45
+ __all__ = [
46
+ "adresse",
47
+ "code_commune_insee",
48
+ "code_departement",
49
+ "code_fantoir",
50
+ "code_postal",
51
+ "code_region",
52
+ "commune",
53
+ "departement",
54
+ "insee_canton",
55
+ "latitude_l93",
56
+ "latitude_wgs_fr_metropole",
57
+ "longitude_l93",
58
+ "longitude_wgs_fr_metropole",
59
+ "pays",
60
+ "region",
61
+ "code_csp_insee",
62
+ "code_rna",
63
+ "code_waldec",
64
+ "csp_insee",
65
+ "date_fr",
66
+ "insee_ape700",
67
+ "sexe",
68
+ "siren",
69
+ "siret",
70
+ "tel_fr",
71
+ "uai",
72
+ "iso_country_code_alpha2",
73
+ "iso_country_code_alpha3",
74
+ "iso_country_code_numeric",
75
+ "json_geojson",
76
+ "latitude_wgs",
77
+ "latlon_wgs",
78
+ "longitude_wgs",
79
+ "lonlat_wgs",
80
+ "jour_de_la_semaine",
81
+ "mois_de_annee",
82
+ "booleen",
83
+ "email",
84
+ "float",
85
+ "int",
86
+ "money",
87
+ "mongo_object_id",
88
+ "twitter",
89
+ "url",
90
+ "uuid",
91
+ "date",
92
+ "datetime_rfc822",
93
+ "year",
94
+ ]
@@ -1,4 +1,5 @@
1
1
  from csv_detective.parsing.text import header_score
2
+
2
3
  from ..latlon_wgs import COMMON_COORDS_LABELS
3
4
 
4
5
  PROPORTION = 0.5
@@ -4,5 +4,5 @@ PROPORTION = 0.5
4
4
 
5
5
 
6
6
  def _is(header: str) -> float:
7
- words_combinations_list = ['id', 'objectid']
7
+ words_combinations_list = ["id", "objectid"]
8
8
  return header_score(header, words_combinations_list)
@@ -1,6 +1,6 @@
1
1
  import logging
2
- from typing import TextIO
3
2
  from time import time
3
+ from typing import TextIO
4
4
 
5
5
  from csv_detective.utils import display_logs_depending_process_time
6
6
 
@@ -47,19 +47,21 @@ def detect_heading_columns(file: TextIO, sep: str, verbose: bool = False) -> int
47
47
  if return_int == 0:
48
48
  if verbose:
49
49
  display_logs_depending_process_time(
50
- f'No heading column detected in {round(time() - start, 3)}s',
50
+ f"No heading column detected in {round(time() - start, 3)}s",
51
51
  time() - start,
52
52
  )
53
53
  return 0
54
54
  if verbose:
55
55
  display_logs_depending_process_time(
56
- f'{return_int} heading columns detected in {round(time() - start, 3)}s',
56
+ f"{return_int} heading columns detected in {round(time() - start, 3)}s",
57
57
  time() - start,
58
58
  )
59
59
  return return_int
60
60
 
61
61
 
62
- def detect_trailing_columns(file: TextIO, sep: str, heading_columns: int, verbose: bool = False) -> int:
62
+ def detect_trailing_columns(
63
+ file: TextIO, sep: str, heading_columns: int, verbose: bool = False
64
+ ) -> int:
63
65
  """Tests first 10 lines to see if there are empty trailing columns"""
64
66
  if verbose:
65
67
  start = time()
@@ -70,20 +72,18 @@ def detect_trailing_columns(file: TextIO, sep: str, heading_columns: int, verbos
70
72
  line = file.readline()
71
73
  return_int = min(
72
74
  return_int,
73
- len(line.replace("\n", ""))
74
- - len(line.replace("\n", "").strip(sep))
75
- - heading_columns,
75
+ len(line.replace("\n", "")) - len(line.replace("\n", "").strip(sep)) - heading_columns,
76
76
  )
77
77
  if return_int == 0:
78
78
  if verbose:
79
79
  display_logs_depending_process_time(
80
- f'No trailing column detected in {round(time() - start, 3)}s',
80
+ f"No trailing column detected in {round(time() - start, 3)}s",
81
81
  time() - start,
82
82
  )
83
83
  return 0
84
84
  if verbose:
85
85
  display_logs_depending_process_time(
86
- f'{return_int} trailing columns detected in {round(time() - start, 3)}s',
86
+ f"{return_int} trailing columns detected in {round(time() - start, 3)}s",
87
87
  time() - start,
88
88
  )
89
89
  return return_int
@@ -1,6 +1,6 @@
1
1
  import logging
2
- from time import time
3
2
  from io import BytesIO
3
+ from time import time
4
4
 
5
5
  from cchardet import detect
6
6
 
@@ -16,12 +16,14 @@ def detect_encoding(binary_file: BytesIO, verbose: bool = False) -> str:
16
16
  logging.info("Detecting encoding")
17
17
  encoding_dict = detect(binary_file.read())
18
18
  if not encoding_dict["encoding"]:
19
- raise ValueError("Could not detect the file's encoding. Consider specifying it in the routine call.")
19
+ raise ValueError(
20
+ "Could not detect the file's encoding. Consider specifying it in the routine call."
21
+ )
20
22
  if verbose:
21
23
  message = f'Detected encoding: "{encoding_dict["encoding"]}"'
22
- message += f' in {round(time() - start, 3)}s (confidence: {round(encoding_dict["confidence"]*100)}%)'
24
+ message += f" in {round(time() - start, 3)}s (confidence: {round(encoding_dict['confidence'] * 100)}%)"
23
25
  display_logs_depending_process_time(
24
26
  message,
25
27
  time() - start,
26
28
  )
27
- return encoding_dict['encoding']
29
+ return encoding_dict["encoding"]
@@ -22,11 +22,11 @@ def detect_engine(file_path: str, verbose=False) -> Optional[str]:
22
22
  mapping = {
23
23
  "application/gzip": "gzip",
24
24
  "application/x-gzip": "gzip",
25
- 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'openpyxl',
26
- 'application/vnd.ms-excel': 'xlrd',
27
- 'application/vnd.oasis.opendocument.spreadsheet': 'odf',
25
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "openpyxl",
26
+ "application/vnd.ms-excel": "xlrd",
27
+ "application/vnd.oasis.opendocument.spreadsheet": "odf",
28
28
  # all these files could be recognized as zip, may need to check all cases then
29
- 'application/zip': 'openpyxl',
29
+ "application/zip": "openpyxl",
30
30
  }
31
31
  # if none of the above, we move forwards with the csv process
32
32
  if is_url(file_path):
@@ -37,7 +37,8 @@ def detect_engine(file_path: str, verbose=False) -> Optional[str]:
37
37
  if verbose:
38
38
  message = (
39
39
  f"File is not csv, detected {engine_to_file.get(engine, 'csv')}"
40
- if engine else "Processing the file as a csv"
40
+ if engine
41
+ else "Processing the file as a csv"
41
42
  )
42
43
  display_logs_depending_process_time(
43
44
  message,