csv-detective 0.6.7__py3-none-any.whl → 0.9.3.dev2438__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective registries, and is provided for informational purposes only.
Files changed (228)
  1. csv_detective/__init__.py +7 -1
  2. csv_detective/cli.py +33 -21
  3. csv_detective/{detect_fields/FR → detection}/__init__.py +0 -0
  4. csv_detective/detection/columns.py +89 -0
  5. csv_detective/detection/encoding.py +29 -0
  6. csv_detective/detection/engine.py +46 -0
  7. csv_detective/detection/formats.py +156 -0
  8. csv_detective/detection/headers.py +28 -0
  9. csv_detective/detection/rows.py +18 -0
  10. csv_detective/detection/separator.py +44 -0
  11. csv_detective/detection/variables.py +97 -0
  12. csv_detective/explore_csv.py +151 -377
  13. csv_detective/format.py +67 -0
  14. csv_detective/formats/__init__.py +9 -0
  15. csv_detective/formats/adresse.py +116 -0
  16. csv_detective/formats/binary.py +26 -0
  17. csv_detective/formats/booleen.py +35 -0
  18. csv_detective/formats/code_commune_insee.py +26 -0
  19. csv_detective/formats/code_csp_insee.py +36 -0
  20. csv_detective/formats/code_departement.py +29 -0
  21. csv_detective/formats/code_fantoir.py +21 -0
  22. csv_detective/formats/code_import.py +17 -0
  23. csv_detective/formats/code_postal.py +25 -0
  24. csv_detective/formats/code_region.py +22 -0
  25. csv_detective/formats/code_rna.py +29 -0
  26. csv_detective/formats/code_waldec.py +17 -0
  27. csv_detective/formats/commune.py +27 -0
  28. csv_detective/formats/csp_insee.py +31 -0
  29. csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
  30. csv_detective/formats/date.py +99 -0
  31. csv_detective/formats/date_fr.py +22 -0
  32. csv_detective/formats/datetime_aware.py +45 -0
  33. csv_detective/formats/datetime_naive.py +48 -0
  34. csv_detective/formats/datetime_rfc822.py +24 -0
  35. csv_detective/formats/departement.py +37 -0
  36. csv_detective/formats/email.py +28 -0
  37. csv_detective/formats/float.py +29 -0
  38. csv_detective/formats/geojson.py +36 -0
  39. csv_detective/formats/insee_ape700.py +31 -0
  40. csv_detective/formats/insee_canton.py +28 -0
  41. csv_detective/formats/int.py +23 -0
  42. csv_detective/formats/iso_country_code_alpha2.py +30 -0
  43. csv_detective/formats/iso_country_code_alpha3.py +30 -0
  44. csv_detective/formats/iso_country_code_numeric.py +31 -0
  45. csv_detective/formats/jour_de_la_semaine.py +41 -0
  46. csv_detective/formats/json.py +20 -0
  47. csv_detective/formats/latitude_l93.py +48 -0
  48. csv_detective/formats/latitude_wgs.py +42 -0
  49. csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
  50. csv_detective/formats/latlon_wgs.py +53 -0
  51. csv_detective/formats/longitude_l93.py +39 -0
  52. csv_detective/formats/longitude_wgs.py +32 -0
  53. csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
  54. csv_detective/formats/lonlat_wgs.py +36 -0
  55. csv_detective/formats/mois_de_lannee.py +48 -0
  56. csv_detective/formats/money.py +18 -0
  57. csv_detective/formats/mongo_object_id.py +14 -0
  58. csv_detective/formats/pays.py +35 -0
  59. csv_detective/formats/percent.py +16 -0
  60. csv_detective/formats/region.py +70 -0
  61. csv_detective/formats/sexe.py +17 -0
  62. csv_detective/formats/siren.py +37 -0
  63. csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -29
  64. csv_detective/formats/tel_fr.py +36 -0
  65. csv_detective/formats/uai.py +36 -0
  66. csv_detective/formats/url.py +46 -0
  67. csv_detective/formats/username.py +14 -0
  68. csv_detective/formats/uuid.py +16 -0
  69. csv_detective/formats/year.py +28 -0
  70. csv_detective/output/__init__.py +65 -0
  71. csv_detective/output/dataframe.py +96 -0
  72. csv_detective/output/example.py +250 -0
  73. csv_detective/output/profile.py +119 -0
  74. csv_detective/{schema_generation.py → output/schema.py} +268 -343
  75. csv_detective/output/utils.py +74 -0
  76. csv_detective/{detect_fields/FR/geo → parsing}/__init__.py +0 -0
  77. csv_detective/parsing/columns.py +235 -0
  78. csv_detective/parsing/compression.py +11 -0
  79. csv_detective/parsing/csv.py +56 -0
  80. csv_detective/parsing/excel.py +167 -0
  81. csv_detective/parsing/load.py +111 -0
  82. csv_detective/parsing/text.py +56 -0
  83. csv_detective/utils.py +23 -196
  84. csv_detective/validate.py +138 -0
  85. csv_detective-0.9.3.dev2438.dist-info/METADATA +267 -0
  86. csv_detective-0.9.3.dev2438.dist-info/RECORD +92 -0
  87. csv_detective-0.9.3.dev2438.dist-info/WHEEL +4 -0
  88. {csv_detective-0.6.7.dist-info → csv_detective-0.9.3.dev2438.dist-info}/entry_points.txt +1 -0
  89. csv_detective/all_packages.txt +0 -104
  90. csv_detective/detect_fields/FR/geo/adresse/__init__.py +0 -100
  91. csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -24
  92. csv_detective/detect_fields/FR/geo/code_commune_insee/code_commune_insee.txt +0 -37600
  93. csv_detective/detect_fields/FR/geo/code_departement/__init__.py +0 -11
  94. csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -15
  95. csv_detective/detect_fields/FR/geo/code_fantoir/code_fantoir.txt +0 -26122
  96. csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -19
  97. csv_detective/detect_fields/FR/geo/code_postal/code_postal.txt +0 -36822
  98. csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -27
  99. csv_detective/detect_fields/FR/geo/commune/__init__.py +0 -21
  100. csv_detective/detect_fields/FR/geo/commune/commune.txt +0 -36745
  101. csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -19
  102. csv_detective/detect_fields/FR/geo/departement/departement.txt +0 -101
  103. csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +0 -20
  104. csv_detective/detect_fields/FR/geo/insee_canton/canton2017.txt +0 -2055
  105. csv_detective/detect_fields/FR/geo/insee_canton/cantons.txt +0 -2055
  106. csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -13
  107. csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
  108. csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -13
  109. csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
  110. csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -17
  111. csv_detective/detect_fields/FR/geo/pays/pays.txt +0 -248
  112. csv_detective/detect_fields/FR/geo/region/__init__.py +0 -16
  113. csv_detective/detect_fields/FR/geo/region/region.txt +0 -44
  114. csv_detective/detect_fields/FR/other/__init__.py +0 -0
  115. csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +0 -26
  116. csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
  117. csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -8
  118. csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -12
  119. csv_detective/detect_fields/FR/other/csp_insee/__init__.py +0 -16
  120. csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
  121. csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +0 -16
  122. csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -9
  123. csv_detective/detect_fields/FR/other/siren/__init__.py +0 -18
  124. csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -15
  125. csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
  126. csv_detective/detect_fields/FR/temp/__init__.py +0 -0
  127. csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +0 -23
  128. csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +0 -37
  129. csv_detective/detect_fields/__init__.py +0 -57
  130. csv_detective/detect_fields/geo/__init__.py +0 -0
  131. csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
  132. csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
  133. csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
  134. csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -22
  135. csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
  136. csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -15
  137. csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
  138. csv_detective/detect_fields/other/__init__.py +0 -0
  139. csv_detective/detect_fields/other/booleen/__init__.py +0 -21
  140. csv_detective/detect_fields/other/email/__init__.py +0 -8
  141. csv_detective/detect_fields/other/float/__init__.py +0 -17
  142. csv_detective/detect_fields/other/int/__init__.py +0 -12
  143. csv_detective/detect_fields/other/json/__init__.py +0 -24
  144. csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
  145. csv_detective/detect_fields/other/twitter/__init__.py +0 -8
  146. csv_detective/detect_fields/other/url/__init__.py +0 -11
  147. csv_detective/detect_fields/other/uuid/__init__.py +0 -11
  148. csv_detective/detect_fields/temp/__init__.py +0 -0
  149. csv_detective/detect_fields/temp/date/__init__.py +0 -62
  150. csv_detective/detect_fields/temp/datetime_iso/__init__.py +0 -18
  151. csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +0 -21
  152. csv_detective/detect_fields/temp/year/__init__.py +0 -10
  153. csv_detective/detect_labels/FR/__init__.py +0 -0
  154. csv_detective/detect_labels/FR/geo/__init__.py +0 -0
  155. csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -40
  156. csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -42
  157. csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -33
  158. csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -33
  159. csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -41
  160. csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -33
  161. csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -33
  162. csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -47
  163. csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -33
  164. csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -54
  165. csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -55
  166. csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -44
  167. csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -45
  168. csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -45
  169. csv_detective/detect_labels/FR/geo/region/__init__.py +0 -45
  170. csv_detective/detect_labels/FR/other/__init__.py +0 -0
  171. csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -33
  172. csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -38
  173. csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -33
  174. csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -37
  175. csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -33
  176. csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -40
  177. csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -33
  178. csv_detective/detect_labels/FR/other/siren/__init__.py +0 -41
  179. csv_detective/detect_labels/FR/other/siret/__init__.py +0 -40
  180. csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -45
  181. csv_detective/detect_labels/FR/other/uai/__init__.py +0 -50
  182. csv_detective/detect_labels/FR/temp/__init__.py +0 -0
  183. csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -41
  184. csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -33
  185. csv_detective/detect_labels/__init__.py +0 -43
  186. csv_detective/detect_labels/geo/__init__.py +0 -0
  187. csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -41
  188. csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -41
  189. csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -41
  190. csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -42
  191. csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -55
  192. csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -67
  193. csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -45
  194. csv_detective/detect_labels/other/__init__.py +0 -0
  195. csv_detective/detect_labels/other/booleen/__init__.py +0 -34
  196. csv_detective/detect_labels/other/email/__init__.py +0 -45
  197. csv_detective/detect_labels/other/float/__init__.py +0 -33
  198. csv_detective/detect_labels/other/int/__init__.py +0 -33
  199. csv_detective/detect_labels/other/money/__init__.py +0 -11
  200. csv_detective/detect_labels/other/money/check_col_name.py +0 -8
  201. csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -33
  202. csv_detective/detect_labels/other/twitter/__init__.py +0 -33
  203. csv_detective/detect_labels/other/url/__init__.py +0 -48
  204. csv_detective/detect_labels/other/uuid/__init__.py +0 -33
  205. csv_detective/detect_labels/temp/__init__.py +0 -0
  206. csv_detective/detect_labels/temp/date/__init__.py +0 -51
  207. csv_detective/detect_labels/temp/datetime_iso/__init__.py +0 -45
  208. csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -44
  209. csv_detective/detect_labels/temp/year/__init__.py +0 -44
  210. csv_detective/detection.py +0 -361
  211. csv_detective/process_text.py +0 -39
  212. csv_detective/s3_utils.py +0 -48
  213. csv_detective-0.6.7.data/data/share/csv_detective/CHANGELOG.md +0 -118
  214. csv_detective-0.6.7.data/data/share/csv_detective/LICENSE.AGPL.txt +0 -661
  215. csv_detective-0.6.7.data/data/share/csv_detective/README.md +0 -247
  216. csv_detective-0.6.7.dist-info/LICENSE.AGPL.txt +0 -661
  217. csv_detective-0.6.7.dist-info/METADATA +0 -23
  218. csv_detective-0.6.7.dist-info/RECORD +0 -150
  219. csv_detective-0.6.7.dist-info/WHEEL +0 -5
  220. csv_detective-0.6.7.dist-info/top_level.txt +0 -2
  221. tests/__init__.py +0 -0
  222. tests/test_fields.py +0 -360
  223. tests/test_file.py +0 -116
  224. tests/test_labels.py +0 -7
  225. /csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
  226. /csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
  227. /csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
  228. /csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0
csv_detective/parsing/text.py ADDED
@@ -0,0 +1,56 @@
+ from re import finditer
+
+
+ def camel_case_split(identifier: str):
+     matches = finditer(".+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)", identifier)
+     return " ".join([m.group(0) for m in matches])
+
+
+ translate_dict = {
+     " ": ["-", "_", "'", ",", " "],
+     "a": ["à", "â"],
+     "c": ["ç"],
+     "e": ["é", "è", "ê", "é"],
+     "i": ["î", "ï"],
+     "o": ["ô", "ö"],
+     "u": ["ù", "û", "ü"],
+ }
+
+
+ # Process text
+ def _process_text(val: str):
+     """Process strings to standardize them.
+     Several alternatives were tested: .translate, unidecode.unidecode,
+     hybrid methods, but none proved more performant."""
+     val = camel_case_split(val)
+     val = val.lower()
+     for target in translate_dict:
+         for source in translate_dict[target]:
+             val = val.replace(source, target)
+     val = val.strip()
+     return val
+
+
+ def is_word_in_string(word: str, string: str):
+     # if the substring is too short, the test can become irrelevant
+     return len(word) > 2 and word in string
+
+
+ def header_score(header: str, words_combinations_list: list[str]) -> float:
+     """Returns:
+     - 1 if the header is exactly in the specified list
+     - 0.5 if any of the words is within the header
+     - 0 otherwise"""
+     processed_header = _process_text(header)
+
+     header_matches_words_combination = float(
+         any(words_combination == processed_header for words_combination in words_combinations_list)
+     )
+     words_combination_in_header = 0.5 * (
+         any(
+             is_word_in_string(words_combination, processed_header)
+             for words_combination in words_combinations_list
+         )
+     )
+
+     return max(header_matches_words_combination, words_combination_in_header)
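For orientation, a quick usage sketch of these helpers, assuming the hunk above ships as csv_detective.parsing.text (entry 82 in the file list) and the new wheel is installed:

    from csv_detective.parsing.text import _process_text, header_score

    # Accents, camel case, underscores and dashes are all normalised away
    assert _process_text("Code_Postal") == "code postal"
    # 1.0: the processed header matches a known combination exactly
    assert header_score("CodePostal", ["code postal"]) == 1.0
    # 0.5: a known combination appears inside a longer header
    assert header_score("code postal de la commune", ["code postal"]) == 0.5
    # 0.0: no match at all
    assert header_score("identifiant", ["code postal"]) == 0.0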
csv_detective/utils.py CHANGED
@@ -1,209 +1,36 @@
- import pandas as pd
  import logging
- from time import time
+
+ import pandas as pd
 
  logging.basicConfig(level=logging.INFO)
+ logging.addLevelName(
+     logging.CRITICAL, "\033[1;41m%s\033[1;0m" % logging.getLevelName(logging.CRITICAL)
+ )
+ logging.addLevelName(logging.WARN, "\033[1;31m%s\033[1;0m" % logging.getLevelName(logging.WARN))
 
+ THRESHOLD_WARN = 1
+ THRESHOLD_CRITICAL = 3
 
- def display_logs_depending_process_time(prompt: str, duration: float):
-     '''
-     Print colored logs according to the time the operation took.
-     '''
-     logging.addLevelName(logging.CRITICAL, "\033[1;41m%s\033[1;0m" % logging.getLevelName(logging.CRITICAL))
-     logging.addLevelName(logging.WARN, "\033[1;31m%s\033[1;0m" % logging.getLevelName(logging.WARN))
 
-     threshold_warn = 1
-     threshold_critical = 3
-
-     if duration < threshold_warn:
-         logging.info(prompt)
-     elif duration < threshold_critical:
-         logging.warn(prompt)
-     else:
-         logging.critical(prompt)
-
-
- def test_col_val(
-     serie, test_func, proportion=0.9, skipna=True, output_mode="ALL", verbose=False
- ):
-     """Tests values of the serie using test_func.
-     - skipna : if True indicates that NaNs are not counted as False
-     - proportion : indicates the proportion of values that have to pass the test
-     for the serie to be detected as a certain format
+ def display_logs_depending_process_time(prompt: str, duration: float) -> None:
      """
-     if verbose:
-         start = time()
-
-     # TODO : change for a cleaner method and only test columns in modules labels
-     def apply_test_func(serie, test_func, _range):
-         return serie.sample(n=_range).apply(test_func)
-     try:
-         if skipna:
-             serie = serie[serie.notnull()]
-         ser_len = len(serie)
-         if ser_len == 0:
-             return 0.0
-         if output_mode == "ALL":
-             result = apply_test_func(serie, test_func, ser_len).sum() / ser_len
-             return result if result >= proportion else 0.0
-         else:
-             if proportion == 1:  # Then try first 1 value, then 5, then all
-                 for _range in [
-                     min(1, ser_len),
-                     min(5, ser_len),
-                     ser_len,
-                 ]:  # To avoid unnecessary operations, start with 1,
-                     # then 5 values, then the whole serie
-                     if all(apply_test_func(serie, test_func, _range)):
-                         # print(serie.name, ': check OK')
-                         pass
-                     else:
-                         return 0.0
-                 return 1.0
-             else:
-                 # if we have a proportion, statistically it's OK to analyse up to 10k rows
-                 # (arbitrary number) and get a significant result
-                 to_analyse = min(ser_len, 10000)
-                 result = apply_test_func(serie, test_func, to_analyse).sum() / to_analyse
-                 return result if result >= proportion else 0.0
-     finally:
-         if verbose and time() - start > 3:
-             display_logs_depending_process_time(
-                 f"\t/!\\ Column '{serie.name}' took too long ({round(time() - start, 3)}s)",
-                 time() - start
-             )
-
-
- def test_col_label(label, test_func, proportion=1, output_mode="ALL"):
-     """Tests label (from header) using test_func.
-     - proportion : indicates the minimum score to pass the test for the serie
-     to be detected as a certain format
+     Print colored logs according to the time the operation took.
      """
-     if output_mode == "ALL":
-         return test_func(label)
+     if duration < THRESHOLD_WARN:
+         logging.info(prompt)
+     elif duration < THRESHOLD_CRITICAL:
+         logging.warning(prompt)
      else:
-         result = test_func(label)
-         return result if result >= proportion else 0
-
-
- def test_col(table, all_tests, output_mode, verbose: bool = False):
-     # Initialising dict for tests
-     if verbose:
-         start = time()
-         logging.info("Testing columns to get types")
-     test_funcs = dict()
-     for test in all_tests:
-         name = test.__name__.split(".")[-1]
-         test_funcs[name] = {"func": test._is, "prop": test.PROPORTION}
-     return_table = pd.DataFrame(columns=table.columns)
-     for idx, (key, value) in enumerate(test_funcs.items()):
-         if verbose:
-             start_type = time()
-             logging.info(f"\t- Starting with type '{key}'")
-         # improvement lead : put the longest tests behind and make them only if previous tests not satisfactory
-         # => the following needs to change, "apply" means all columns are tested for one type at once
-         return_table.loc[key] = table.apply(
-             lambda serie: test_col_val(
-                 serie,
-                 value["func"],
-                 value["prop"],
-                 output_mode=output_mode,
-                 verbose=verbose,
-             )
-         )
-         if verbose:
-             display_logs_depending_process_time(
-                 f'\t> Done with type "{key}" in {round(time() - start_type, 3)}s ({idx+1}/{len(test_funcs)})',
-                 time() - start_type
-             )
-     if verbose:
-         display_logs_depending_process_time(f"Done testing columns in {round(time() - start, 3)}s", time() - start)
-     return return_table
-
-
- def test_label(table, all_tests, output_mode, verbose: bool = False):
-     # Initialising dict for tests
-     if verbose:
-         start = time()
-         logging.info("Testing labels to get types")
-     test_funcs = dict()
-     for test in all_tests:
-         name = test.__name__.split(".")[-1]
-         test_funcs[name] = {"func": test._is, "prop": test.PROPORTION}
-
-     return_table = pd.DataFrame(columns=table.columns)
-     for idx, (key, value) in enumerate(test_funcs.items()):
-         if verbose:
-             start_type = time()
-         return_table.loc[key] = [
-             test_col_label(
-                 col_name, value["func"], value["prop"], output_mode=output_mode
-             )
-             for col_name in table.columns
-         ]
-         if verbose:
-             display_logs_depending_process_time(
-                 f'\t- Done with type "{key}" in {round(time() - start_type, 3)}s ({idx+1}/{len(test_funcs)})',
-                 time() - start_type
-             )
-     if verbose:
-         display_logs_depending_process_time(f"Done testing labels in {round(time() - start, 3)}s", time() - start)
-     return return_table
-
-
- def prepare_output_dict(return_table, output_mode):
-     return_dict_cols = return_table.to_dict("dict")
-     return_dict_cols_intermediary = {}
-     for column_name in return_dict_cols:
-         return_dict_cols_intermediary[column_name] = []
-         for detected_value_type in return_dict_cols[column_name]:
-             if return_dict_cols[column_name][detected_value_type] == 0:
-                 continue
-             dict_tmp = {}
-             dict_tmp["format"] = detected_value_type
-             dict_tmp["score"] = return_dict_cols[column_name][detected_value_type]
-             return_dict_cols_intermediary[column_name].append(dict_tmp)
-
-         # Clean dict using priorities
-         formats_detected = {
-             x["format"] for x in return_dict_cols_intermediary[column_name]
-         }
-         formats_to_remove = set()
-         # Deprioritise float and int detection vs others
-         if len(formats_detected - {"float", "int"}) > 0:
-             formats_to_remove = formats_to_remove.union({"float", "int"})
-         if "int" in formats_detected:
-             formats_to_remove.add("float")
-         if "latitude_wgs_fr_metropole" in formats_detected:
-             formats_to_remove.add("latitude_l93")
-             formats_to_remove.add("latitude_wgs")
-         if "longitude_wgs_fr_metropole" in formats_detected:
-             formats_to_remove.add("longitude_l93")
-             formats_to_remove.add("longitude_wgs")
-         if "longitude_wgs" in formats_detected:
-             formats_to_remove.add("longitude_l93")
-         if "code_region" in formats_detected:
-             formats_to_remove.add("code_departement")
-
-         formats_to_keep = formats_detected - formats_to_remove
+         logging.critical(prompt)
 
-         detections = return_dict_cols_intermediary[column_name]
-         detections = [x for x in detections if x["format"] in formats_to_keep]
-         if output_mode == "ALL":
-             return_dict_cols_intermediary[column_name] = detections
-         if output_mode == "LIMITED":
-             return_dict_cols_intermediary[column_name] = (
-                 max(detections, key=lambda x: x["score"])
-                 if len(detections) > 0
-                 else {"format": "string", "score": 1.0}
-             )
 
-     return return_dict_cols_intermediary
+ def is_url(file_path: str) -> bool:
+     # could be more sophisticated if needed
+     # using the URL detection test was considered but too broad (schema required to use requests)
+     return file_path.startswith("http")
 
 
- def full_word_strictly_inside_string(word, string):
-     return (
-         (" " + word + " " in string)
-         or (string.startswith(word + " "))
-         or (string.endswith(" " + word))
-     )
+ def cast_prevent_nan(value: float, _type: str) -> float | int | None:
+     if _type not in {"int", "float"}:
+         raise ValueError(f"Invalid type was passed: {_type}")
+     return None if pd.isna(value) else eval(_type)(value)
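A quick behavior sketch for the two helpers that survive in the slimmed-down utils.py, assuming the new wheel is installed and the signatures match the hunk above:

    import math

    from csv_detective.utils import cast_prevent_nan, display_logs_depending_process_time

    print(cast_prevent_nan(3.0, "int"))         # 3: cast to a plain Python int
    print(cast_prevent_nan(math.nan, "float"))  # None: NaN never reaches the output
    try:
        cast_prevent_nan(3.0, "str")
    except ValueError:
        pass                                    # only "int" and "float" are accepted

    # 2.0s sits between THRESHOLD_WARN (1) and THRESHOLD_CRITICAL (3), so this logs a WARNING
    display_logs_depending_process_time("step took 2.0s", duration=2.0)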
csv_detective/validate.py ADDED
@@ -0,0 +1,138 @@
+ import logging
+
+ import pandas as pd
+
+ from csv_detective.format import FormatsManager
+ from csv_detective.parsing.columns import MAX_NUMBER_CATEGORICAL_VALUES, test_col_val
+
+ VALIDATION_CHUNK_SIZE = int(1e5)
+ logging.basicConfig(level=logging.INFO)
+
+ formats = FormatsManager().formats
+
+
+ def validate(
+     file_path: str,
+     previous_analysis: dict,
+     verbose: bool = False,
+     skipna: bool = True,
+ ) -> tuple[bool, pd.DataFrame | None, dict | None, dict[str, pd.Series] | None]:
+     """
+     Verify if the given file has the same fields and types as in the given analysis.
+
+     Args:
+         file_path: the path of the file to validate
+         previous_analysis: the previous analysis to validate against (expected in the same structure as the output of the routine)
+         verbose: whether the code displays the steps it's going through
+         skipna: whether to ignore NaN values in the checks
+     """
+     try:
+         if previous_analysis.get("separator"):
+             # loading the table in chunks
+             chunks = pd.read_csv(
+                 file_path,
+                 dtype=str,
+                 sep=previous_analysis["separator"],
+                 encoding=previous_analysis["encoding"],
+                 skiprows=previous_analysis["header_row_idx"],
+                 compression=previous_analysis.get("compression"),
+                 chunksize=VALIDATION_CHUNK_SIZE,
+             )
+             analysis = {
+                 k: v
+                 for k, v in previous_analysis.items()
+                 if k
+                 in ["encoding", "separator", "compression", "heading_columns", "trailing_columns"]
+                 and v is not None
+             }
+         else:
+             # or chunks-like if not chunkable
+             chunks = iter(
+                 [
+                     pd.read_excel(
+                         file_path,
+                         dtype=str,
+                         engine=previous_analysis["engine"],
+                         sheet_name=previous_analysis["sheet_name"],
+                     )
+                 ]
+             )
+             analysis = {k: v for k, v in previous_analysis.items() if k in ["engine", "sheet_name"]}
+         first_chunk = next(chunks)
+         analysis.update(
+             {k: v for k, v in previous_analysis.items() if k in ["header_row_idx", "header"]}
+         )
+     except Exception as e:
+         if verbose:
+             logging.warning(f"> Could not load the file with previous analysis values: {e}")
+         return False, None, None, None
+     if verbose:
+         logging.info("Comparing table with the previous analysis")
+         logging.info("- Checking if all columns match")
+     if len(first_chunk.columns) != len(previous_analysis["header"]) or any(
+         list(first_chunk.columns)[k] != previous_analysis["header"][k]
+         for k in range(len(previous_analysis["header"]))
+     ):
+         if verbose:
+             logging.warning("> Columns do not match, proceeding with full analysis")
+         return False, None, None, None
+     if verbose:
+         logging.info(
+             f"Testing previously detected formats on chunks of {VALIDATION_CHUNK_SIZE} rows"
+         )
+
+     # hashing rows to get nb_duplicates
+     row_hashes_count = first_chunk.apply(lambda row: hash(tuple(row)), axis=1).value_counts()
+     # getting values for profile to read the file only once
+     col_values = {col: first_chunk[col].value_counts(dropna=False) for col in first_chunk.columns}
+     analysis["total_lines"] = 0
+     for idx, chunk in enumerate([first_chunk, *chunks]):
+         if verbose:
+             logging.info(f"> Testing chunk number {idx}")
+         analysis["total_lines"] += len(chunk)
+         row_hashes_count = row_hashes_count.add(
+             chunk.apply(lambda row: hash(tuple(row)), axis=1).value_counts(),
+             fill_value=0,
+         )
+         for col in chunk.columns:
+             col_values[col] = col_values[col].add(
+                 chunk[col].value_counts(dropna=False),
+                 fill_value=0,
+             )
+         for col_name, args in previous_analysis["columns"].items():
+             if verbose:
+                 logging.info(f"- Testing {col_name} for {args['format']}")
+             if args["format"] == "string":
+                 # no test for columns that have not been recognized as a specific format
+                 continue
+             test_result: float = test_col_val(
+                 serie=chunk[col_name],
+                 format=formats[args["format"]],
+                 skipna=skipna,
+             )
+             if not bool(test_result):
+                 if verbose:
+                     logging.warning("> Test failed, proceeding with full analysis")
+                 return False, first_chunk, analysis, None
+     if verbose:
+         logging.info("> All checks successful")
+     analysis["nb_duplicates"] = sum(row_hashes_count > 1)
+     analysis["categorical"] = [
+         col for col, values in col_values.items() if len(values) <= MAX_NUMBER_CATEGORICAL_VALUES
+     ]
+     return (
+         True,
+         first_chunk,
+         analysis
+         | {
+             k: previous_analysis[k]
+             for k in [
+                 "categorical",
+                 "columns",
+                 "columns_fields",
+                 "columns_labels",
+                 "formats",
+             ]
+         },
+         col_values,
+     )
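To close out, a hedged sketch of how validate() (entry 84 in the file list) is meant to be used: re-check a refreshed file against a stored analysis and only fall back to the full detection pass when something has drifted. File names here are hypothetical, and the analysis dict is assumed to be a saved output of the main routine in explore_csv.py:

    import json

    from csv_detective.validate import validate

    with open("analysis.json") as f:          # hypothetical dump of a previous run's output
        stored_analysis = json.load(f)

    ok, first_chunk, analysis, col_values = validate(
        "data.csv",                           # hypothetical file path
        previous_analysis=stored_analysis,
        verbose=True,
    )
    if not ok:
        print("Columns or formats drifted: re-run the full analysis")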