csv-detective 0.7.5.dev1180__py3-none-any.whl → 0.7.5.dev1209__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. csv_detective/__init__.py +1 -1
  2. csv_detective/detect_fields/FR/geo/adresse/__init__.py +1 -1
  3. csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +1 -1
  4. csv_detective/detect_fields/FR/other/csp_insee/__init__.py +1 -1
  5. csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +1 -1
  6. csv_detective/detect_fields/FR/other/sexe/__init__.py +1 -1
  7. csv_detective/detect_fields/temp/date/__init__.py +5 -1
  8. csv_detective/detect_labels/FR/geo/adresse/__init__.py +1 -1
  9. csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +1 -1
  10. csv_detective/detect_labels/FR/geo/code_departement/__init__.py +1 -1
  11. csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +1 -1
  12. csv_detective/detect_labels/FR/geo/code_postal/__init__.py +1 -1
  13. csv_detective/detect_labels/FR/geo/code_region/__init__.py +1 -1
  14. csv_detective/detect_labels/FR/geo/commune/__init__.py +1 -1
  15. csv_detective/detect_labels/FR/geo/departement/__init__.py +1 -1
  16. csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +1 -1
  17. csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +1 -1
  18. csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +1 -1
  19. csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +1 -1
  20. csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +1 -1
  21. csv_detective/detect_labels/FR/geo/pays/__init__.py +1 -1
  22. csv_detective/detect_labels/FR/geo/region/__init__.py +1 -1
  23. csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +1 -1
  24. csv_detective/detect_labels/FR/other/code_rna/__init__.py +1 -1
  25. csv_detective/detect_labels/FR/other/code_waldec/__init__.py +1 -1
  26. csv_detective/detect_labels/FR/other/csp_insee/__init__.py +1 -1
  27. csv_detective/detect_labels/FR/other/date_fr/__init__.py +1 -1
  28. csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +1 -1
  29. csv_detective/detect_labels/FR/other/sexe/__init__.py +1 -1
  30. csv_detective/detect_labels/FR/other/siren/__init__.py +1 -1
  31. csv_detective/detect_labels/FR/other/siret/__init__.py +1 -1
  32. csv_detective/detect_labels/FR/other/tel_fr/__init__.py +1 -1
  33. csv_detective/detect_labels/FR/other/uai/__init__.py +1 -1
  34. csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +1 -1
  35. csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +1 -1
  36. csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +1 -1
  37. csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +1 -1
  38. csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +1 -1
  39. csv_detective/detect_labels/geo/json_geojson/__init__.py +1 -1
  40. csv_detective/detect_labels/geo/latitude_wgs/__init__.py +1 -1
  41. csv_detective/detect_labels/geo/latlon_wgs/__init__.py +1 -1
  42. csv_detective/detect_labels/geo/longitude_wgs/__init__.py +1 -1
  43. csv_detective/detect_labels/other/booleen/__init__.py +1 -1
  44. csv_detective/detect_labels/other/email/__init__.py +1 -1
  45. csv_detective/detect_labels/other/float/__init__.py +1 -1
  46. csv_detective/detect_labels/other/int/__init__.py +1 -1
  47. csv_detective/detect_labels/other/mongo_object_id/__init__.py +1 -1
  48. csv_detective/detect_labels/other/twitter/__init__.py +1 -1
  49. csv_detective/detect_labels/other/url/__init__.py +1 -1
  50. csv_detective/detect_labels/other/uuid/__init__.py +1 -1
  51. csv_detective/detect_labels/temp/date/__init__.py +1 -1
  52. csv_detective/detect_labels/temp/datetime_iso/__init__.py +1 -1
  53. csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +1 -1
  54. csv_detective/detect_labels/temp/year/__init__.py +1 -1
  55. csv_detective/detection/columns.py +89 -0
  56. csv_detective/detection/encoding.py +27 -0
  57. csv_detective/detection/engine.py +46 -0
  58. csv_detective/detection/headers.py +32 -0
  59. csv_detective/detection/rows.py +18 -0
  60. csv_detective/detection/separator.py +44 -0
  61. csv_detective/detection/variables.py +98 -0
  62. csv_detective/explore_csv.py +40 -110
  63. csv_detective/output/dataframe.py +55 -0
  64. csv_detective/{create_example.py → output/example.py} +10 -9
  65. csv_detective/output/profile.py +87 -0
  66. csv_detective/{schema_generation.py → output/schema.py} +344 -343
  67. csv_detective/output/utils.py +51 -0
  68. csv_detective/parsing/columns.py +141 -0
  69. csv_detective/parsing/compression.py +11 -0
  70. csv_detective/parsing/csv.py +55 -0
  71. csv_detective/parsing/excel.py +169 -0
  72. csv_detective/parsing/load.py +97 -0
  73. csv_detective/utils.py +10 -236
  74. {csv_detective-0.7.5.dev1180.data → csv_detective-0.7.5.dev1209.data}/data/share/csv_detective/CHANGELOG.md +3 -0
  75. {csv_detective-0.7.5.dev1180.dist-info → csv_detective-0.7.5.dev1209.dist-info}/METADATA +3 -2
  76. {csv_detective-0.7.5.dev1180.dist-info → csv_detective-0.7.5.dev1209.dist-info}/RECORD +85 -71
  77. {csv_detective-0.7.5.dev1180.dist-info → csv_detective-0.7.5.dev1209.dist-info}/WHEEL +1 -1
  78. tests/test_fields.py +7 -6
  79. tests/test_file.py +56 -57
  80. csv_detective/detection.py +0 -618
  81. /csv_detective/{process_text.py → parsing/text.py} +0 -0
  82. {csv_detective-0.7.5.dev1180.data → csv_detective-0.7.5.dev1209.data}/data/share/csv_detective/LICENSE.AGPL.txt +0 -0
  83. {csv_detective-0.7.5.dev1180.data → csv_detective-0.7.5.dev1209.data}/data/share/csv_detective/README.md +0 -0
  84. {csv_detective-0.7.5.dev1180.dist-info → csv_detective-0.7.5.dev1209.dist-info}/entry_points.txt +0 -0
  85. {csv_detective-0.7.5.dev1180.dist-info → csv_detective-0.7.5.dev1209.dist-info/licenses}/LICENSE.AGPL.txt +0 -0
  86. {csv_detective-0.7.5.dev1180.dist-info → csv_detective-0.7.5.dev1209.dist-info}/top_level.txt +0 -0
csv_detective/__init__.py CHANGED
@@ -1,4 +1,4 @@
1
1
  from .explore_csv import routine, routine_minio # noqa
2
- from .create_example import create_example_csv_file # noqa
2
+ from .output.example import create_example_csv_file # noqa
3
3
 
4
4
  __version__ = '0.7.5.dev'
@@ -1,4 +1,4 @@
1
- from csv_detective.process_text import _process_text
1
+ from csv_detective.parsing.text import _process_text
2
2
 
3
3
  PROPORTION = 0.55
4
4
  # ajouts d'espaces en fin de mots pour s'assurer que le str n'est pas juste une substr d'un mot plus long
@@ -1,4 +1,4 @@
1
- from csv_detective.process_text import _process_text
1
+ from csv_detective.parsing.text import _process_text
2
2
  import re
3
3
 
4
4
  PROPORTION = 1
@@ -1,5 +1,5 @@
1
1
  from os.path import dirname, join
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 1
5
5
  f = open(join(dirname(__file__), 'csp_insee.txt'), 'r')
@@ -1,5 +1,5 @@
1
1
  from os.path import dirname, join
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 1
5
5
  f = open(join(dirname(__file__), 'insee_ape700.txt'), 'r')
@@ -1,4 +1,4 @@
1
- from csv_detective.process_text import _process_text
1
+ from csv_detective.parsing.text import _process_text
2
2
 
3
3
  PROPORTION = 1
4
4
 
@@ -14,6 +14,11 @@ def date_casting(val: str) -> Optional[datetime]:
14
14
  return dateutil_parser(val)
15
15
  except ParserError:
16
16
  return date_parser(val)
17
+ except OverflowError:
18
+ return None
19
+
20
+
21
+ threshold = 0.3
17
22
 
18
23
 
19
24
  def _is(val):
@@ -21,7 +26,6 @@ def _is(val):
21
26
  # early stops, to cut processing time
22
27
  if not isinstance(val, str) or len(val) > 20 or len(val) < 8:
23
28
  return False
24
- threshold = 0.3
25
29
  if sum([char.isdigit() for char in val]) / len(val) < threshold:
26
30
  return False
27
31
  res = date_casting(val)
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -0,0 +1,89 @@
1
+ import logging
2
+ from typing import TextIO
3
+ from time import time
4
+
5
+ from csv_detective.utils import display_logs_depending_process_time
6
+
7
+
8
def detect_extra_columns(file: TextIO, sep: str):
    """Count the separators that end every one of the first 10 lines.

    The file is rewound, then at most 10 lines are read. For each line the
    run of consecutive ``sep`` characters at its end (line break excluded)
    is counted, and the smallest count over the sampled lines is kept.

    Warning: the file should not contain empty lines; an empty line has no
    trailing separator and therefore makes the result 0.

    Args:
        file: seekable text stream to inspect.
        sep: column separator; each character of the line is compared to it.

    Returns:
        A ``(nb_useless_col, retour)`` tuple: the number of trailing
        separators shared by all sampled lines (0 for an empty file), and
        whether at least one sampled line ended with a line break.
    """
    file.seek(0)
    retour = False
    nb_useless_col = None

    for _ in range(10):
        line = file.readline()
        if not line:
            # End of file: readline() returns "" and there is nothing left
            # to sample (the file may hold fewer than 10 lines).
            break
        if line.endswith("\n"):
            retour = True
            # Drop the line break only when this line really has one; the
            # last line of a file frequently does not.
            line = line[:-1]

        # Count the run of separators at the end of the line.
        k = 0
        for sign in reversed(line):
            if sign != sep:
                break
            k += 1
        if k == 0:
            # One line without trailing separator is enough to conclude.
            return 0, retour
        nb_useless_col = k if nb_useless_col is None else min(k, nb_useless_col)

    if nb_useless_col is None:
        # Nothing could be read: no extra column to report.
        return 0, retour
    return nb_useless_col, retour
35
+
36
+
37
def detect_heading_columns(file: TextIO, sep: str, verbose: bool = False) -> int:
    """Test the first 10 lines to see if there are empty heading columns.

    The file is rewound and at most 10 lines are read. For each line the
    number of leading ``sep`` characters is counted; the smallest count over
    the sampled lines is returned.

    Args:
        file: seekable text stream to inspect.
        sep: column separator stripped from the start of each line.
        verbose: when True, log the outcome and the time spent.

    Returns:
        The number of leading separators shared by all sampled lines
        (0 when the file is empty).
    """
    if verbose:
        start = time()
        logging.info("Detecting heading columns")
    file.seek(0)
    heading = None
    for _ in range(10):
        line = file.readline()
        if not line:
            # End of file: an exhausted stream is not a line without heading
            # columns, so it must not pull the minimum down to 0.
            break
        # Only leading separators count here; trailing ones are handled by
        # detect_trailing_columns.
        count = len(line) - len(line.lstrip(sep))
        heading = count if heading is None else min(heading, count)
        if heading == 0:
            # Cannot go lower, no need to read further.
            break
    if not heading:
        if verbose:
            display_logs_depending_process_time(
                f'No heading column detected in {round(time() - start, 3)}s',
                time() - start,
            )
        return 0
    if verbose:
        display_logs_depending_process_time(
            f'{heading} heading columns detected in {round(time() - start, 3)}s',
            time() - start,
        )
    return heading
60
+
61
+
62
def detect_trailing_columns(file: TextIO, sep: str, heading_columns: int, verbose: bool = False) -> int:
    """Test the first 10 lines to see if there are empty trailing columns.

    The file is rewound and at most 10 lines are read. For each line (line
    breaks removed) the separators stripped from both ends are counted and
    ``heading_columns`` is subtracted, leaving the trailing ones; the
    smallest value over the sampled lines is returned.

    Args:
        file: seekable text stream to inspect.
        sep: column separator stripped from both ends of each line.
        heading_columns: number of leading separators already attributed to
            heading columns, removed from each line's count.
        verbose: when True, log the outcome and the time spent.

    Returns:
        The number of trailing separators shared by all sampled lines
        (0 when the file is empty).
    """
    if verbose:
        start = time()
        logging.info("Detecting trailing columns")
    file.seek(0)
    trailing = None
    for _ in range(10):
        line = file.readline()
        if not line:
            # End of file: an exhausted stream must not be scored, otherwise
            # it contributes ``-heading_columns`` to the minimum.
            break
        content = line.replace("\n", "")
        count = len(content) - len(content.strip(sep)) - heading_columns
        trailing = count if trailing is None else min(trailing, count)
        if trailing == 0:
            # Cannot conclude anything better, no need to read further.
            break
    if not trailing:
        if verbose:
            display_logs_depending_process_time(
                f'No trailing column detected in {round(time() - start, 3)}s',
                time() - start,
            )
        return 0
    if verbose:
        display_logs_depending_process_time(
            f'{trailing} trailing columns detected in {round(time() - start, 3)}s',
            time() - start,
        )
    return trailing
@@ -0,0 +1,27 @@
1
+ import logging
2
+ from time import time
3
+ from io import BytesIO
4
+
5
+ from cchardet import detect
6
+
7
+ from csv_detective.utils import display_logs_depending_process_time
8
+
9
+
10
def detect_encoding(binary_file: BytesIO, verbose: bool = False) -> str:
    """
    Guess the encoding of a binary stream with faust-cchardet
    (forked from the original cchardet).

    The whole stream is read and handed to the detector. The detected
    encoding name is returned; a ValueError is raised when the detector
    reports no encoding. With ``verbose`` on, the result, the time spent and
    the reported confidence are logged.
    """
    if verbose:
        start = time()
        logging.info("Detecting encoding")
    guess = detect(binary_file.read())
    encoding = guess["encoding"]
    if not encoding:
        raise ValueError("Could not detect the file's encoding. Consider specifying it in the routine call.")
    if not verbose:
        return encoding
    # Same two-part message as before: what was found, then timing/confidence.
    found = f'Detected encoding: "{encoding}"'
    details = f' in {round(time() - start, 3)}s (confidence: {round(guess["confidence"]*100)}%)'
    display_logs_depending_process_time(
        found + details,
        time() - start,
    )
    return encoding
@@ -0,0 +1,46 @@
1
+ from time import time
2
+ from typing import Optional
3
+
4
+ import magic
5
+ import requests
6
+
7
+ from csv_detective.utils import display_logs_depending_process_time, is_url
8
+
9
# Engine names grouped by family.
COMPRESSION_ENGINES = ["gzip"]
EXCEL_ENGINES = ["openpyxl", "xlrd", "odf"]
# Human-readable file kind for each engine name, used in log messages.
engine_to_file = dict(
    openpyxl="Excel",
    xlrd="old Excel",
    odf="OpenOffice",
    gzip="csv.gz",
)
17
+
18
+
19
def detect_engine(file_path: str, verbose=False) -> Optional[str]:
    """
    Pick the engine matching the MIME type of a local file or a URL.

    The MIME type is sniffed with ``magic``: from the downloaded content when
    ``file_path`` is a URL, from the file on disk otherwise. The matching
    engine name is returned, or None when the MIME type is not in the table.
    With ``verbose`` on, the outcome is logged along with the time spent.
    """
    if verbose:
        start = time()
    mime_to_engine = {
        "application/gzip": "gzip",
        "application/x-gzip": "gzip",
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'openpyxl',
        'application/vnd.ms-excel': 'xlrd',
        'application/vnd.oasis.opendocument.spreadsheet': 'odf',
        # all these files could be recognized as zip, may need to check all cases then
        'application/zip': 'openpyxl',
    }
    if is_url(file_path):
        mime_type = magic.from_buffer(requests.get(file_path).content, mime=True)
    else:
        mime_type = magic.from_file(file_path, mime=True)
    # if none of the above, we move forwards with the csv process
    engine = mime_to_engine.get(mime_type)
    if verbose:
        if engine:
            message = f"File is not csv, detected {engine_to_file.get(engine, 'csv')}"
        else:
            message = "Processing the file as a csv"
        display_logs_depending_process_time(
            message,
            time() - start,
        )
    return engine