csv-detective 0.7.5.dev1197__py3-none-any.whl → 0.7.5.dev1209__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. csv_detective/__init__.py +1 -1
  2. csv_detective/detect_fields/FR/geo/adresse/__init__.py +1 -1
  3. csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +1 -1
  4. csv_detective/detect_fields/FR/other/csp_insee/__init__.py +1 -1
  5. csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +1 -1
  6. csv_detective/detect_fields/FR/other/sexe/__init__.py +1 -1
  7. csv_detective/detect_labels/FR/geo/adresse/__init__.py +1 -1
  8. csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +1 -1
  9. csv_detective/detect_labels/FR/geo/code_departement/__init__.py +1 -1
  10. csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +1 -1
  11. csv_detective/detect_labels/FR/geo/code_postal/__init__.py +1 -1
  12. csv_detective/detect_labels/FR/geo/code_region/__init__.py +1 -1
  13. csv_detective/detect_labels/FR/geo/commune/__init__.py +1 -1
  14. csv_detective/detect_labels/FR/geo/departement/__init__.py +1 -1
  15. csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +1 -1
  16. csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +1 -1
  17. csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +1 -1
  18. csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +1 -1
  19. csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +1 -1
  20. csv_detective/detect_labels/FR/geo/pays/__init__.py +1 -1
  21. csv_detective/detect_labels/FR/geo/region/__init__.py +1 -1
  22. csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +1 -1
  23. csv_detective/detect_labels/FR/other/code_rna/__init__.py +1 -1
  24. csv_detective/detect_labels/FR/other/code_waldec/__init__.py +1 -1
  25. csv_detective/detect_labels/FR/other/csp_insee/__init__.py +1 -1
  26. csv_detective/detect_labels/FR/other/date_fr/__init__.py +1 -1
  27. csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +1 -1
  28. csv_detective/detect_labels/FR/other/sexe/__init__.py +1 -1
  29. csv_detective/detect_labels/FR/other/siren/__init__.py +1 -1
  30. csv_detective/detect_labels/FR/other/siret/__init__.py +1 -1
  31. csv_detective/detect_labels/FR/other/tel_fr/__init__.py +1 -1
  32. csv_detective/detect_labels/FR/other/uai/__init__.py +1 -1
  33. csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +1 -1
  34. csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +1 -1
  35. csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +1 -1
  36. csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +1 -1
  37. csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +1 -1
  38. csv_detective/detect_labels/geo/json_geojson/__init__.py +1 -1
  39. csv_detective/detect_labels/geo/latitude_wgs/__init__.py +1 -1
  40. csv_detective/detect_labels/geo/latlon_wgs/__init__.py +1 -1
  41. csv_detective/detect_labels/geo/longitude_wgs/__init__.py +1 -1
  42. csv_detective/detect_labels/other/booleen/__init__.py +1 -1
  43. csv_detective/detect_labels/other/email/__init__.py +1 -1
  44. csv_detective/detect_labels/other/float/__init__.py +1 -1
  45. csv_detective/detect_labels/other/int/__init__.py +1 -1
  46. csv_detective/detect_labels/other/mongo_object_id/__init__.py +1 -1
  47. csv_detective/detect_labels/other/twitter/__init__.py +1 -1
  48. csv_detective/detect_labels/other/url/__init__.py +1 -1
  49. csv_detective/detect_labels/other/uuid/__init__.py +1 -1
  50. csv_detective/detect_labels/temp/date/__init__.py +1 -1
  51. csv_detective/detect_labels/temp/datetime_iso/__init__.py +1 -1
  52. csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +1 -1
  53. csv_detective/detect_labels/temp/year/__init__.py +1 -1
  54. csv_detective/detection/columns.py +89 -0
  55. csv_detective/detection/encoding.py +27 -0
  56. csv_detective/detection/engine.py +46 -0
  57. csv_detective/detection/headers.py +32 -0
  58. csv_detective/detection/rows.py +18 -0
  59. csv_detective/detection/separator.py +44 -0
  60. csv_detective/detection/variables.py +98 -0
  61. csv_detective/explore_csv.py +40 -124
  62. csv_detective/output/dataframe.py +55 -0
  63. csv_detective/{create_example.py → output/example.py} +10 -9
  64. csv_detective/output/profile.py +87 -0
  65. csv_detective/{schema_generation.py → output/schema.py} +344 -343
  66. csv_detective/output/utils.py +51 -0
  67. csv_detective/parsing/columns.py +141 -0
  68. csv_detective/parsing/compression.py +11 -0
  69. csv_detective/parsing/csv.py +55 -0
  70. csv_detective/parsing/excel.py +169 -0
  71. csv_detective/parsing/load.py +97 -0
  72. csv_detective/utils.py +10 -236
  73. {csv_detective-0.7.5.dev1197.data → csv_detective-0.7.5.dev1209.data}/data/share/csv_detective/CHANGELOG.md +1 -0
  74. {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1209.dist-info}/METADATA +1 -1
  75. {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1209.dist-info}/RECORD +84 -70
  76. tests/test_fields.py +7 -6
  77. tests/test_file.py +15 -14
  78. csv_detective/detection.py +0 -633
  79. /csv_detective/{process_text.py → parsing/text.py} +0 -0
  80. {csv_detective-0.7.5.dev1197.data → csv_detective-0.7.5.dev1209.data}/data/share/csv_detective/LICENSE.AGPL.txt +0 -0
  81. {csv_detective-0.7.5.dev1197.data → csv_detective-0.7.5.dev1209.data}/data/share/csv_detective/README.md +0 -0
  82. {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1209.dist-info}/WHEEL +0 -0
  83. {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1209.dist-info}/entry_points.txt +0 -0
  84. {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1209.dist-info}/licenses/LICENSE.AGPL.txt +0 -0
  85. {csv_detective-0.7.5.dev1197.dist-info → csv_detective-0.7.5.dev1209.dist-info}/top_level.txt +0 -0
csv_detective/__init__.py CHANGED
@@ -1,4 +1,4 @@
1
1
  from .explore_csv import routine, routine_minio # noqa
2
- from .create_example import create_example_csv_file # noqa
2
+ from .output.example import create_example_csv_file # noqa
3
3
 
4
4
  __version__ = '0.7.5.dev'
@@ -1,4 +1,4 @@
1
- from csv_detective.process_text import _process_text
1
+ from csv_detective.parsing.text import _process_text
2
2
 
3
3
  PROPORTION = 0.55
4
4
  # ajouts d'espaces en fin de mots pour s'assurer que le str n'est pas juste une substr d'un mot plus long
@@ -1,4 +1,4 @@
1
- from csv_detective.process_text import _process_text
1
+ from csv_detective.parsing.text import _process_text
2
2
  import re
3
3
 
4
4
  PROPORTION = 1
@@ -1,5 +1,5 @@
1
1
  from os.path import dirname, join
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 1
5
5
  f = open(join(dirname(__file__), 'csp_insee.txt'), 'r')
@@ -1,5 +1,5 @@
1
1
  from os.path import dirname, join
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 1
5
5
  f = open(join(dirname(__file__), 'insee_ape700.txt'), 'r')
@@ -1,4 +1,4 @@
1
- from csv_detective.process_text import _process_text
1
+ from csv_detective.parsing.text import _process_text
2
2
 
3
3
  PROPORTION = 1
4
4
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -1,5 +1,5 @@
1
1
  from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
2
+ from csv_detective.parsing.text import _process_text
3
3
 
4
4
  PROPORTION = 0.5
5
5
 
@@ -0,0 +1,89 @@
1
+ import logging
2
+ from typing import TextIO
3
+ from time import time
4
+
5
+ from csv_detective.utils import display_logs_depending_process_time
6
+
7
+
8
def detect_extra_columns(file: TextIO, sep: str):
    """Count the useless trailing separators shared by the first 10 lines.

    Args:
        file: seekable text stream; it is rewound to its start before reading.
        sep: column separator, compared character by character.

    Returns:
        A ``(nb_useless_col, retour)`` tuple where ``nb_useless_col`` is the
        smallest number of consecutive trailing separators found on the lines
        read (0 as soon as one line has none, or when the file is empty) and
        ``retour`` tells whether at least one line ended with a newline.
    """
    file.seek(0)
    retour = False
    nb_useless_col = None

    for _ in range(10):
        line = file.readline()
        if not line:
            # End of file reached before 10 lines: stop instead of indexing
            # into an empty string.
            break
        # Drop the line break of *this* line only, so that an unterminated
        # last line does not lose a real character.
        if line[-1] == "\n":
            retour = True
            line = line[:-1]

        # Count the consecutive separators at the end of the line.
        k = 0
        for sign in reversed(line):
            if sign != sep:
                break
            k += 1
        if k == 0:
            return 0, retour
        nb_useless_col = k if nb_useless_col is None else min(k, nb_useless_col)
    return (nb_useless_col or 0), retour
35
+
36
+
37
def detect_heading_columns(file: TextIO, sep: str, verbose: bool = False) -> int:
    """Count the empty heading columns shared by the first 10 lines.

    The result is the smallest number of leading separator characters found
    on the lines read; the scan stops as soon as one line has none.

    Args:
        file: seekable text stream; it is rewound to its start before reading.
        sep: column separator.
        verbose: when true, log the outcome and the time spent.

    Returns:
        The number of leading empty columns common to the inspected lines.
    """
    if verbose:
        start = time()
        logging.info("Detecting heading columns")
    file.seek(0)
    return_int = float("Inf")
    for _ in range(10):
        # NOTE(review): past the end of the file readline() returns "", which
        # has no leading separator, so a file shorter than 10 lines always
        # yields 0. Kept as is because detect_trailing_columns subtracts this
        # value from every line it reads (EOF included); fix both together.
        line = file.readline()
        # lstrip rather than strip: only *leading* separators are heading
        # columns. strip() also removed trailing separators on a last line
        # that has no line break, inflating the count.
        return_int = min(return_int, len(line) - len(line.lstrip(sep)))
        if return_int == 0:
            if verbose:
                display_logs_depending_process_time(
                    f'No heading column detected in {round(time() - start, 3)}s',
                    time() - start,
                )
            return 0
    if verbose:
        display_logs_depending_process_time(
            f'{return_int} heading columns detected in {round(time() - start, 3)}s',
            time() - start,
        )
    return return_int
60
+
61
+
62
def detect_trailing_columns(file: TextIO, sep: str, heading_columns: int, verbose: bool = False) -> int:
    """Count the empty trailing columns shared by the first 10 lines.

    Each line is measured on its own trailing separators, so a line that has
    more leading separators than ``heading_columns`` no longer inflates the
    result. Reading stops at the end of the file, which lets files shorter
    than 10 lines report their trailing columns.

    Args:
        file: seekable text stream; it is rewound to its start before reading.
        sep: column separator.
        heading_columns: number of empty heading columns already detected;
            only used for lines made of separators alone, where heading and
            trailing separators cannot be told apart.
        verbose: when true, log the outcome and the time spent.

    Returns:
        The smallest trailing-separator count among the inspected lines
        (0 for an empty file).
    """
    if verbose:
        start = time()
        logging.info("Detecting trailing columns")
    file.seek(0)
    counts = []
    for _ in range(10):
        line = file.readline()
        if not line:
            # End of file: an absent line says nothing about trailing columns.
            break
        content = line.replace("\n", "")
        kept = content.rstrip(sep)
        if kept:
            trailing = len(content) - len(kept)
        else:
            # Blank or separator-only line: attribute the heading share to
            # the heading columns and never go below zero.
            trailing = max(len(content) - heading_columns, 0)
        counts.append(trailing)
        if trailing == 0:
            break
    return_int = min(counts, default=0)
    if verbose:
        if return_int == 0:
            outcome = 'No trailing column detected'
        else:
            outcome = f'{return_int} trailing columns detected'
        display_logs_depending_process_time(
            f'{outcome} in {round(time() - start, 3)}s',
            time() - start,
        )
    return return_int
@@ -0,0 +1,27 @@
1
+ import logging
2
+ from time import time
3
+ from io import BytesIO
4
+
5
+ from cchardet import detect
6
+
7
+ from csv_detective.utils import display_logs_depending_process_time
8
+
9
+
10
def detect_encoding(binary_file: BytesIO, verbose: bool = False) -> str:
    """
    Guess the encoding of a binary stream with faust-cchardet's ``detect``.

    The whole stream is read from its current position and handed to the
    detector.

    Args:
        binary_file: binary stream to inspect.
        verbose: when true, log the detected encoding, the detector's
            confidence and the time spent.

    Returns:
        The name of the detected encoding.

    Raises:
        ValueError: if the detector reports no encoding.
    """
    if verbose:
        start = time()
        logging.info("Detecting encoding")
    guess = detect(binary_file.read())
    if not guess["encoding"]:
        raise ValueError("Could not detect the file's encoding. Consider specifying it in the routine call.")
    if verbose:
        display_logs_depending_process_time(
            f'Detected encoding: "{guess["encoding"]}"'
            f' in {round(time() - start, 3)}s (confidence: {round(guess["confidence"]*100)}%)',
            time() - start,
        )
    return guess['encoding']
@@ -0,0 +1,46 @@
1
+ from time import time
2
+ from typing import Optional
3
+
4
+ import magic
5
+ import requests
6
+
7
+ from csv_detective.utils import display_logs_depending_process_time, is_url
8
+
9
COMPRESSION_ENGINES = ["gzip"]
EXCEL_ENGINES = ["openpyxl", "xlrd", "odf"]
# Human-readable file kind for each engine, used in the log message below.
engine_to_file = {
    "openpyxl": "Excel",
    "xlrd": "old Excel",
    "odf": "OpenOffice",
    "gzip": "csv.gz",
}


def detect_engine(file_path: str, verbose=False) -> Optional[str]:
    """Pick the reading engine for a file from its MIME type.

    The MIME type is sniffed with ``magic``: from the downloaded content when
    ``is_url(file_path)`` is true, from the file on disk otherwise.

    Args:
        file_path: path or URL of the file to inspect.
        verbose: when true, log which kind of file was recognised.

    Returns:
        The engine name mapped to the MIME type, or ``None`` when the type is
        not in the mapping, in which case the file goes through the csv
        process.
    """
    if verbose:
        start = time()
    mapping = {
        "application/gzip": "gzip",
        "application/x-gzip": "gzip",
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'openpyxl',
        'application/vnd.ms-excel': 'xlrd',
        'application/vnd.oasis.opendocument.spreadsheet': 'odf',
        # all these files could be recognized as zip, may need to check all cases then
        'application/zip': 'openpyxl',
    }
    if is_url(file_path):
        mime_type = magic.from_buffer(requests.get(file_path).content, mime=True)
    else:
        mime_type = magic.from_file(file_path, mime=True)
    # A MIME type absent from the mapping gives None: carry on as csv.
    engine = mapping.get(mime_type)
    if verbose:
        if engine:
            message = f"File is not csv, detected {engine_to_file.get(engine, 'csv')}"
        else:
            message = "Processing the file as a csv"
        display_logs_depending_process_time(message, time() - start)
    return engine
@@ -0,0 +1,32 @@
1
+ import logging
2
+ from time import time
3
+ from typing import Optional, TextIO
4
+
5
+ from csv_detective.utils import display_logs_depending_process_time
6
+
7
+
8
def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int, Optional[list]]:
    """Look for the header row among the first 10 lines.

    The header is the first line that holds at least one non-empty cell and
    differs from the line that follows it. Lines with no non-empty cell
    (blank or made of separators only) are skipped, and the scan stops at the
    end of the file.

    Args:
        file: seekable text stream; it is rewound to its start before reading.
            When a header is found, the stream is left right after that line.
        sep: column separator.
        verbose: when true, log the outcome and the time spent.

    Returns:
        ``(index, cells)`` with the 0-based index of the header line and its
        non-empty cells, or ``(0, None)`` when no header is found.
    """
    if verbose:
        start = time()
        logging.info("Detecting headers")
    file.seek(0)
    for i in range(10):
        header = file.readline()
        if not header:
            # End of file: nothing left to inspect.
            break
        position = file.tell()
        # Empty cells are dropped, so every remaining cell is non-empty.
        chaine = [c for c in header.replace("\n", "").split(sep) if c]
        if not chaine:
            continue
        # Peek at the following row, then come back right after the
        # candidate header.
        next_row = file.readline()
        file.seek(position)
        if header != next_row:
            if verbose:
                display_logs_depending_process_time(
                    f'Detected headers in {round(time() - start, 3)}s',
                    time() - start,
                )
            return i, chaine
    if verbose:
        logging.info('No header detected')
    return 0, None