csv-detective 0.6.7__py3-none-any.whl → 0.9.3.dev2438__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228)
  1. csv_detective/__init__.py +7 -1
  2. csv_detective/cli.py +33 -21
  3. csv_detective/{detect_fields/FR → detection}/__init__.py +0 -0
  4. csv_detective/detection/columns.py +89 -0
  5. csv_detective/detection/encoding.py +29 -0
  6. csv_detective/detection/engine.py +46 -0
  7. csv_detective/detection/formats.py +156 -0
  8. csv_detective/detection/headers.py +28 -0
  9. csv_detective/detection/rows.py +18 -0
  10. csv_detective/detection/separator.py +44 -0
  11. csv_detective/detection/variables.py +97 -0
  12. csv_detective/explore_csv.py +151 -377
  13. csv_detective/format.py +67 -0
  14. csv_detective/formats/__init__.py +9 -0
  15. csv_detective/formats/adresse.py +116 -0
  16. csv_detective/formats/binary.py +26 -0
  17. csv_detective/formats/booleen.py +35 -0
  18. csv_detective/formats/code_commune_insee.py +26 -0
  19. csv_detective/formats/code_csp_insee.py +36 -0
  20. csv_detective/formats/code_departement.py +29 -0
  21. csv_detective/formats/code_fantoir.py +21 -0
  22. csv_detective/formats/code_import.py +17 -0
  23. csv_detective/formats/code_postal.py +25 -0
  24. csv_detective/formats/code_region.py +22 -0
  25. csv_detective/formats/code_rna.py +29 -0
  26. csv_detective/formats/code_waldec.py +17 -0
  27. csv_detective/formats/commune.py +27 -0
  28. csv_detective/formats/csp_insee.py +31 -0
  29. csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
  30. csv_detective/formats/date.py +99 -0
  31. csv_detective/formats/date_fr.py +22 -0
  32. csv_detective/formats/datetime_aware.py +45 -0
  33. csv_detective/formats/datetime_naive.py +48 -0
  34. csv_detective/formats/datetime_rfc822.py +24 -0
  35. csv_detective/formats/departement.py +37 -0
  36. csv_detective/formats/email.py +28 -0
  37. csv_detective/formats/float.py +29 -0
  38. csv_detective/formats/geojson.py +36 -0
  39. csv_detective/formats/insee_ape700.py +31 -0
  40. csv_detective/formats/insee_canton.py +28 -0
  41. csv_detective/formats/int.py +23 -0
  42. csv_detective/formats/iso_country_code_alpha2.py +30 -0
  43. csv_detective/formats/iso_country_code_alpha3.py +30 -0
  44. csv_detective/formats/iso_country_code_numeric.py +31 -0
  45. csv_detective/formats/jour_de_la_semaine.py +41 -0
  46. csv_detective/formats/json.py +20 -0
  47. csv_detective/formats/latitude_l93.py +48 -0
  48. csv_detective/formats/latitude_wgs.py +42 -0
  49. csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
  50. csv_detective/formats/latlon_wgs.py +53 -0
  51. csv_detective/formats/longitude_l93.py +39 -0
  52. csv_detective/formats/longitude_wgs.py +32 -0
  53. csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
  54. csv_detective/formats/lonlat_wgs.py +36 -0
  55. csv_detective/formats/mois_de_lannee.py +48 -0
  56. csv_detective/formats/money.py +18 -0
  57. csv_detective/formats/mongo_object_id.py +14 -0
  58. csv_detective/formats/pays.py +35 -0
  59. csv_detective/formats/percent.py +16 -0
  60. csv_detective/formats/region.py +70 -0
  61. csv_detective/formats/sexe.py +17 -0
  62. csv_detective/formats/siren.py +37 -0
  63. csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -29
  64. csv_detective/formats/tel_fr.py +36 -0
  65. csv_detective/formats/uai.py +36 -0
  66. csv_detective/formats/url.py +46 -0
  67. csv_detective/formats/username.py +14 -0
  68. csv_detective/formats/uuid.py +16 -0
  69. csv_detective/formats/year.py +28 -0
  70. csv_detective/output/__init__.py +65 -0
  71. csv_detective/output/dataframe.py +96 -0
  72. csv_detective/output/example.py +250 -0
  73. csv_detective/output/profile.py +119 -0
  74. csv_detective/{schema_generation.py → output/schema.py} +268 -343
  75. csv_detective/output/utils.py +74 -0
  76. csv_detective/{detect_fields/FR/geo → parsing}/__init__.py +0 -0
  77. csv_detective/parsing/columns.py +235 -0
  78. csv_detective/parsing/compression.py +11 -0
  79. csv_detective/parsing/csv.py +56 -0
  80. csv_detective/parsing/excel.py +167 -0
  81. csv_detective/parsing/load.py +111 -0
  82. csv_detective/parsing/text.py +56 -0
  83. csv_detective/utils.py +23 -196
  84. csv_detective/validate.py +138 -0
  85. csv_detective-0.9.3.dev2438.dist-info/METADATA +267 -0
  86. csv_detective-0.9.3.dev2438.dist-info/RECORD +92 -0
  87. csv_detective-0.9.3.dev2438.dist-info/WHEEL +4 -0
  88. {csv_detective-0.6.7.dist-info → csv_detective-0.9.3.dev2438.dist-info}/entry_points.txt +1 -0
  89. csv_detective/all_packages.txt +0 -104
  90. csv_detective/detect_fields/FR/geo/adresse/__init__.py +0 -100
  91. csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -24
  92. csv_detective/detect_fields/FR/geo/code_commune_insee/code_commune_insee.txt +0 -37600
  93. csv_detective/detect_fields/FR/geo/code_departement/__init__.py +0 -11
  94. csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -15
  95. csv_detective/detect_fields/FR/geo/code_fantoir/code_fantoir.txt +0 -26122
  96. csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -19
  97. csv_detective/detect_fields/FR/geo/code_postal/code_postal.txt +0 -36822
  98. csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -27
  99. csv_detective/detect_fields/FR/geo/commune/__init__.py +0 -21
  100. csv_detective/detect_fields/FR/geo/commune/commune.txt +0 -36745
  101. csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -19
  102. csv_detective/detect_fields/FR/geo/departement/departement.txt +0 -101
  103. csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +0 -20
  104. csv_detective/detect_fields/FR/geo/insee_canton/canton2017.txt +0 -2055
  105. csv_detective/detect_fields/FR/geo/insee_canton/cantons.txt +0 -2055
  106. csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -13
  107. csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
  108. csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -13
  109. csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
  110. csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -17
  111. csv_detective/detect_fields/FR/geo/pays/pays.txt +0 -248
  112. csv_detective/detect_fields/FR/geo/region/__init__.py +0 -16
  113. csv_detective/detect_fields/FR/geo/region/region.txt +0 -44
  114. csv_detective/detect_fields/FR/other/__init__.py +0 -0
  115. csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +0 -26
  116. csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
  117. csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -8
  118. csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -12
  119. csv_detective/detect_fields/FR/other/csp_insee/__init__.py +0 -16
  120. csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
  121. csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +0 -16
  122. csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -9
  123. csv_detective/detect_fields/FR/other/siren/__init__.py +0 -18
  124. csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -15
  125. csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
  126. csv_detective/detect_fields/FR/temp/__init__.py +0 -0
  127. csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +0 -23
  128. csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +0 -37
  129. csv_detective/detect_fields/__init__.py +0 -57
  130. csv_detective/detect_fields/geo/__init__.py +0 -0
  131. csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
  132. csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
  133. csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
  134. csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -22
  135. csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
  136. csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -15
  137. csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
  138. csv_detective/detect_fields/other/__init__.py +0 -0
  139. csv_detective/detect_fields/other/booleen/__init__.py +0 -21
  140. csv_detective/detect_fields/other/email/__init__.py +0 -8
  141. csv_detective/detect_fields/other/float/__init__.py +0 -17
  142. csv_detective/detect_fields/other/int/__init__.py +0 -12
  143. csv_detective/detect_fields/other/json/__init__.py +0 -24
  144. csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
  145. csv_detective/detect_fields/other/twitter/__init__.py +0 -8
  146. csv_detective/detect_fields/other/url/__init__.py +0 -11
  147. csv_detective/detect_fields/other/uuid/__init__.py +0 -11
  148. csv_detective/detect_fields/temp/__init__.py +0 -0
  149. csv_detective/detect_fields/temp/date/__init__.py +0 -62
  150. csv_detective/detect_fields/temp/datetime_iso/__init__.py +0 -18
  151. csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +0 -21
  152. csv_detective/detect_fields/temp/year/__init__.py +0 -10
  153. csv_detective/detect_labels/FR/__init__.py +0 -0
  154. csv_detective/detect_labels/FR/geo/__init__.py +0 -0
  155. csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -40
  156. csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -42
  157. csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -33
  158. csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -33
  159. csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -41
  160. csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -33
  161. csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -33
  162. csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -47
  163. csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -33
  164. csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -54
  165. csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -55
  166. csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -44
  167. csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -45
  168. csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -45
  169. csv_detective/detect_labels/FR/geo/region/__init__.py +0 -45
  170. csv_detective/detect_labels/FR/other/__init__.py +0 -0
  171. csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -33
  172. csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -38
  173. csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -33
  174. csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -37
  175. csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -33
  176. csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -40
  177. csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -33
  178. csv_detective/detect_labels/FR/other/siren/__init__.py +0 -41
  179. csv_detective/detect_labels/FR/other/siret/__init__.py +0 -40
  180. csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -45
  181. csv_detective/detect_labels/FR/other/uai/__init__.py +0 -50
  182. csv_detective/detect_labels/FR/temp/__init__.py +0 -0
  183. csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -41
  184. csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -33
  185. csv_detective/detect_labels/__init__.py +0 -43
  186. csv_detective/detect_labels/geo/__init__.py +0 -0
  187. csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -41
  188. csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -41
  189. csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -41
  190. csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -42
  191. csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -55
  192. csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -67
  193. csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -45
  194. csv_detective/detect_labels/other/__init__.py +0 -0
  195. csv_detective/detect_labels/other/booleen/__init__.py +0 -34
  196. csv_detective/detect_labels/other/email/__init__.py +0 -45
  197. csv_detective/detect_labels/other/float/__init__.py +0 -33
  198. csv_detective/detect_labels/other/int/__init__.py +0 -33
  199. csv_detective/detect_labels/other/money/__init__.py +0 -11
  200. csv_detective/detect_labels/other/money/check_col_name.py +0 -8
  201. csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -33
  202. csv_detective/detect_labels/other/twitter/__init__.py +0 -33
  203. csv_detective/detect_labels/other/url/__init__.py +0 -48
  204. csv_detective/detect_labels/other/uuid/__init__.py +0 -33
  205. csv_detective/detect_labels/temp/__init__.py +0 -0
  206. csv_detective/detect_labels/temp/date/__init__.py +0 -51
  207. csv_detective/detect_labels/temp/datetime_iso/__init__.py +0 -45
  208. csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -44
  209. csv_detective/detect_labels/temp/year/__init__.py +0 -44
  210. csv_detective/detection.py +0 -361
  211. csv_detective/process_text.py +0 -39
  212. csv_detective/s3_utils.py +0 -48
  213. csv_detective-0.6.7.data/data/share/csv_detective/CHANGELOG.md +0 -118
  214. csv_detective-0.6.7.data/data/share/csv_detective/LICENSE.AGPL.txt +0 -661
  215. csv_detective-0.6.7.data/data/share/csv_detective/README.md +0 -247
  216. csv_detective-0.6.7.dist-info/LICENSE.AGPL.txt +0 -661
  217. csv_detective-0.6.7.dist-info/METADATA +0 -23
  218. csv_detective-0.6.7.dist-info/RECORD +0 -150
  219. csv_detective-0.6.7.dist-info/WHEEL +0 -5
  220. csv_detective-0.6.7.dist-info/top_level.txt +0 -2
  221. tests/__init__.py +0 -0
  222. tests/test_fields.py +0 -360
  223. tests/test_file.py +0 -116
  224. tests/test_labels.py +0 -7
  225. /csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
  226. /csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
  227. /csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
  228. /csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0
csv_detective/output/utils.py (new file)
@@ -0,0 +1,74 @@
+import pandas as pd
+
+
+def prepare_output_dict(return_table: pd.DataFrame, limited_output: bool):
+    # -> dict[str, dict | list[dict]] (to be added when upgrading to python>=3.10)
+    return_dict_cols = return_table.to_dict("dict")
+    output_dict = {}
+    for column_name in return_dict_cols:
+        # keep only formats with a non-zero score
+        output_dict[column_name] = [
+            {
+                "format": detected_value_type,
+                "score": return_dict_cols[column_name][detected_value_type],
+            }
+            for detected_value_type in return_dict_cols[column_name]
+            if return_dict_cols[column_name][detected_value_type] > 0
+        ]
+        priorities = [
+            # no need to specify int and float everywhere, they are deprioritized anyway
+            ("int", ("float",)),
+            # bool over everything
+            (
+                "booleen",
+                (
+                    "latitude_l93",
+                    "latitude_wgs",
+                    "latitude_wgs_fr_metropole",
+                    "longitude_l93",
+                    "longitude_wgs",
+                    "longitude_wgs_fr_metropole",
+                ),
+            ),
+            ("geojson", ("json",)),
+            # latlon over lonlat if no longitude column allows us to discriminate
+            ("latlon_wgs", ("json", "lonlat_wgs")),
+            ("lonlat_wgs", ("json",)),
+            ("latitude_wgs_fr_metropole", ("latitude_l93", "latitude_wgs")),
+            ("longitude_wgs_fr_metropole", ("longitude_l93", "longitude_wgs")),
+            ("latitude_wgs", ("latitude_l93",)),
+            ("longitude_wgs", ("longitude_l93",)),
+            ("code_region", ("code_departement",)),
+            ("datetime_rfc822", ("datetime_aware",)),
+        ]
+        detected_formats = set(x["format"] for x in output_dict[column_name])
+        formats_to_remove = set()
+        # Deprioritize float and int detection vs other formats
+        if len(detected_formats - {"float", "int"}) > 0:
+            formats_to_remove = formats_to_remove.union({"float", "int"})
+        # Deprioritize less specific formats if:
+        # the secondary score is equal or worse,
+        # or the priority score is at least 1 (the maximum field score)
+        for prio_format, secondary_formats in priorities:
+            if prio_format in detected_formats:
+                for secondary in secondary_formats:
+                    if secondary in detected_formats and (
+                        return_dict_cols[column_name][prio_format]
+                        >= return_dict_cols[column_name][secondary]
+                        or return_dict_cols[column_name][prio_format] >= 1
+                    ):
+                        formats_to_remove.add(secondary)
+
+        formats_to_keep = detected_formats - formats_to_remove
+
+        detections = [x for x in output_dict[column_name] if x["format"] in formats_to_keep]
+        if not limited_output:
+            output_dict[column_name] = detections
+        else:
+            output_dict[column_name] = (
+                max(detections, key=lambda x: x["score"])
+                if len(detections) > 0
+                else {"format": "string", "score": 1.0}
+            )
+
+    return output_dict
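
As a quick illustration (not part of the released code), here is a minimal sketch of how the new prepare_output_dict helper could be exercised, with made-up scores; rows of the toy table are format labels and its single column is a CSV column named "dep":

    import pandas as pd

    from csv_detective.output.utils import prepare_output_dict

    # toy score table: the "dep" column scores 1.0 for several competing formats
    return_table = pd.DataFrame(
        {"dep": {"code_departement": 1.0, "code_region": 1.0, "int": 1.0}}
    )
    # limited_output=True keeps only the single best format per column;
    # int is deprioritized and code_region outranks code_departement
    print(prepare_output_dict(return_table, limited_output=True))
    # {'dep': {'format': 'code_region', 'score': 1.0}}
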
csv_detective/parsing/columns.py (new file)
@@ -0,0 +1,235 @@
+import logging
+from time import time
+from typing import Callable
+
+import pandas as pd
+from more_itertools import peekable
+
+from csv_detective.format import Format
+from csv_detective.parsing.csv import CHUNK_SIZE
+from csv_detective.utils import display_logs_depending_process_time
+
+# above this threshold, a column is not considered categorical
+MAX_NUMBER_CATEGORICAL_VALUES = 25
+
+
+def test_col_val(
+    serie: pd.Series,
+    format: Format,
+    skipna: bool = True,
+    limited_output: bool = False,
+    verbose: bool = False,
+) -> float:
+    """Tests the values of the series using the format's test function.
+    - skipna: if True, NaNs are considered valid values
+    for the series to be detected as a certain format
+    """
+    if verbose:
+        start = time()
+
+    # TODO: change for a cleaner method and only test columns in modules labels
+    def apply_test_func(serie: pd.Series, test_func: Callable, _range: int):
+        return serie.sample(n=_range).apply(test_func)
+
+    try:
+        if skipna:
+            serie = serie.loc[serie.notnull()]
+        ser_len = len(serie)
+        if ser_len == 0:
+            # being here means the whole column is NaN, so if skipna it's a pass
+            return 1.0 if skipna else 0.0
+        if not limited_output or format.proportion < 1:
+            # we want or have to go through the whole column to have the proportion
+            result: float = serie.apply(format.func).sum() / ser_len
+            return result if result >= format.proportion else 0.0
+        else:
+            # the whole column has to be valid so we have early stops (1 then 5 rows)
+            # to not waste time if directly unsuccessful
+            for _range in [
+                min(1, ser_len),
+                min(5, ser_len),
+                ser_len,
+            ]:
+                if not all(apply_test_func(serie, format.func, _range)):
+                    return 0.0
+            return 1.0
+    finally:
+        if verbose and time() - start > 3:
+            display_logs_depending_process_time(
+                f"\t/!\\ Column '{serie.name}' took too long ({round(time() - start, 3)}s)",
+                time() - start,
+            )
+
+
+def test_col(
+    table: pd.DataFrame,
+    formats: dict[str, Format],
+    limited_output: bool,
+    skipna: bool = True,
+    verbose: bool = False,
+):
+    if verbose:
+        start = time()
+        logging.info("Testing columns to get formats")
+    return_table = pd.DataFrame(columns=table.columns)
+    for idx, (label, format) in enumerate(formats.items()):
+        if verbose:
+            start_type = time()
+            logging.info(f"\t- Starting with format '{label}'")
+        # possible improvement: run the longest tests last and only if previous tests are not satisfactory
+        # => the following needs to change, "apply" means all columns are tested for one type at once
+        return_table.loc[label] = table.apply(
+            lambda serie: test_col_val(
+                serie,
+                format,
+                skipna=skipna,
+                limited_output=limited_output,
+                verbose=verbose,
+            )
+        )
+        if verbose:
+            display_logs_depending_process_time(
+                f'\t> Done with type "{label}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(formats)})',
+                time() - start_type,
+            )
+    if verbose:
+        display_logs_depending_process_time(
+            f"Done testing columns in {round(time() - start, 3)}s", time() - start
+        )
+    return return_table
+
+
+def test_label(
+    columns: list[str], formats: dict[str, Format], limited_output: bool, verbose: bool = False
+):
+    if verbose:
+        start = time()
+        logging.info("Testing labels to get types")
+
+    return_table = pd.DataFrame(columns=columns)
+    for idx, (label, format) in enumerate(formats.items()):
+        if verbose:
+            start_type = time()
+        return_table.loc[label] = [format.is_valid_label(col_name) for col_name in columns]
+        if verbose:
+            display_logs_depending_process_time(
+                f'\t- Done with type "{label}" in {round(time() - start_type, 3)}s ({idx + 1}/{len(formats)})',
+                time() - start_type,
+            )
+    if verbose:
+        display_logs_depending_process_time(
+            f"Done testing labels in {round(time() - start, 3)}s", time() - start
+        )
+    return return_table
+
+
+def test_col_chunks(
+    table: pd.DataFrame,
+    file_path: str,
+    analysis: dict,
+    formats: dict[str, Format],
+    limited_output: bool,
+    skipna: bool = True,
+    verbose: bool = False,
+) -> tuple[pd.DataFrame, dict, dict[str, pd.Series]]:
+    def build_remaining_tests_per_col(return_table: pd.DataFrame) -> dict[str, list[str]]:
+        # returns a dict with the table's columns as keys and the list of remaining format labels to apply
+        return {
+            col: [
+                fmt_label
+                for fmt_label in return_table.index
+                if return_table.loc[fmt_label, col] > 0
+            ]
+            for col in return_table.columns
+        }
+
+    if verbose:
+        start = time()
+        logging.info("Testing columns to get formats on chunks")
+
+    # analysing the sample to get a first guess
+    return_table = test_col(table, formats, limited_output, skipna=skipna, verbose=verbose)
+    remaining_tests_per_col = build_remaining_tests_per_col(return_table)
+
+    # hashing rows to get nb_duplicates
+    row_hashes_count = table.apply(lambda row: hash(tuple(row)), axis=1).value_counts()
+    # getting values for profile to read the file only once
+    col_values = {col: table[col].value_counts(dropna=False) for col in table.columns}
+
+    # only csv files can end up here, can't chunk excel
+    chunks = pd.read_csv(
+        file_path,
+        dtype=str,
+        encoding=analysis["encoding"],
+        sep=analysis["separator"],
+        skiprows=analysis["header_row_idx"],
+        compression=analysis.get("compression"),
+        chunksize=CHUNK_SIZE,
+    )
+    analysis["total_lines"] = CHUNK_SIZE
+    batch, batch_number = [], 1
+    iterator = peekable(enumerate(chunks))
+    while iterator:
+        idx, chunk = next(iterator)
+        if idx == 0:
+            # we have read and analysed the first chunk already
+            continue
+        if len(batch) < 10:
+            # it's too slow to process chunks directly, but we want to keep the first analysis
+            # on a "small" chunk, so partial analyses are done on batches of chunks
+            batch.append(chunk)
+            # we don't know when the chunks end, and doing one additional step
+            # for the final batch is ugly
+            try:
+                iterator.peek()
+                continue
+            except StopIteration:
+                pass
+        if verbose:
+            logging.info(f"> Testing batch number {batch_number}")
+        batch = pd.concat(batch, ignore_index=True)
+        analysis["total_lines"] += len(batch)
+        row_hashes_count = row_hashes_count.add(
+            batch.apply(lambda row: hash(tuple(row)), axis=1).value_counts(),
+            fill_value=0,
+        )
+        for col in batch.columns:
+            col_values[col] = col_values[col].add(
+                batch[col].value_counts(dropna=False),
+                fill_value=0,
+            )
+        if not any(remaining_tests for remaining_tests in remaining_tests_per_col.values()):
+            # no more potential tests to do on any column, early stop
+            break
+        for col, fmt_labels in remaining_tests_per_col.items():
+            # testing each column with the tests that are still competing
+            # after previous batches' analyses
+            for label in fmt_labels:
+                batch_col_test = test_col_val(
+                    batch[col],
+                    formats[label],
+                    limited_output=limited_output,
+                    skipna=skipna,
+                )
+                return_table.loc[label, col] = (
+                    # if this batch's column tested 0 then test fails overall
+                    0
+                    if batch_col_test == 0
+                    # otherwise updating the score with weighted average
+                    else ((return_table.loc[label, col] * idx + batch_col_test) / (idx + 1))
+                )
+        remaining_tests_per_col = build_remaining_tests_per_col(return_table)
+        batch, batch_number = [], batch_number + 1
+    analysis["nb_duplicates"] = sum(row_hashes_count > 1)
+    analysis["categorical"] = [
+        col for col, values in col_values.items() if len(values) <= MAX_NUMBER_CATEGORICAL_VALUES
+    ]
+    # handling that empty columns score 1 everywhere
+    for col in return_table.columns:
+        if sum(return_table[col]) == len(return_table):
+            return_table[col] = 0
+    if verbose:
+        display_logs_depending_process_time(
+            f"Done testing chunks in {round(time() - start, 3)}s", time() - start
+        )
+    return return_table, analysis, col_values
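
As a quick illustration (not part of the released code), here is a sketch of the per-column scoring; a SimpleNamespace stands in for a real Format instance, since test_col_val only reads the func and proportion attributes here:

    from types import SimpleNamespace

    import pandas as pd

    from csv_detective.parsing.columns import test_col_val

    # stand-in format: a value test and the minimum proportion of valid values
    fake_format = SimpleNamespace(func=lambda v: v.isdigit(), proportion=1)
    serie = pd.Series(["1", "2", None, "x"], name="col")
    # with skipna=True the None is ignored; "x" fails, so only 2/3 of values pass,
    # which is below proportion=1 and the score collapses to 0.0
    print(test_col_val(serie, fake_format, skipna=True))  # 0.0
    # with a lower threshold the raw proportion is returned instead
    fake_format.proportion = 0.5
    print(test_col_val(serie, fake_format, skipna=True))  # ~0.67
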
csv_detective/parsing/compression.py (new file)
@@ -0,0 +1,11 @@
+import gzip
+from io import BytesIO
+
+
+def unzip(binary_file: BytesIO, engine: str) -> BytesIO:
+    if engine == "gzip":
+        with gzip.open(binary_file, mode="rb") as binary_file:
+            file_content = binary_file.read()
+    else:
+        raise NotImplementedError(f"{engine} is not yet supported")
+    return BytesIO(file_content)
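
As a quick illustration (not part of the released code), the gzip branch on an in-memory payload:

    import gzip
    from io import BytesIO

    from csv_detective.parsing.compression import unzip

    # build a small gzipped CSV in memory, then get back a plain BytesIO
    compressed = BytesIO(gzip.compress(b"a;b\n1;2\n"))
    print(unzip(binary_file=compressed, engine="gzip").read())  # b'a;b\n1;2\n'
    # any engine other than "gzip" raises NotImplementedError
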
csv_detective/parsing/csv.py (new file)
@@ -0,0 +1,56 @@
+import logging
+from time import time
+from typing import TextIO
+
+import pandas as pd
+
+from csv_detective.utils import display_logs_depending_process_time
+
+# the number of rows for the first analysis, and the number of rows per chunk of the df iterator
+CHUNK_SIZE = int(1e4)
+
+
+def parse_csv(
+    the_file: TextIO,
+    encoding: str,
+    sep: str,
+    num_rows: int,
+    skiprows: int,
+    random_state: int = 42,
+    verbose: bool = False,
+) -> tuple[pd.DataFrame, int | None, int | None]:
+    if verbose:
+        start = time()
+        logging.info("Parsing table")
+
+    if not isinstance(the_file, str):
+        the_file.seek(0)
+
+    try:
+        table = pd.read_csv(
+            the_file,
+            sep=sep,
+            dtype=str,
+            encoding=encoding,
+            skiprows=skiprows,
+            nrows=CHUNK_SIZE,
+        )
+        total_lines = len(table)
+        # branch between small and big files starts here
+        if total_lines == CHUNK_SIZE:
+            if verbose:
+                logging.warning(f"File is too long, analysing in chunks of {CHUNK_SIZE} rows")
+            total_lines, nb_duplicates = None, None
+        else:
+            nb_duplicates = len(table.loc[table.duplicated()])
+            if num_rows > 0:
+                num_rows = min(num_rows, total_lines or len(table))
+                table = table.sample(num_rows, random_state=random_state)
+    except Exception as e:
+        raise ValueError("Could not load file") from e
+    if verbose:
+        display_logs_depending_process_time(
+            f"Table parsed successfully in {round(time() - start, 3)}s",
+            time() - start,
+        )
+    return table, total_lines, nb_duplicates
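
As a quick illustration (not part of the released code), a sketch of calling the CSV parser directly on a hypothetical local file; load_file normally supplies the handle, encoding, separator and header row from the detection steps:

    from csv_detective.parsing.csv import parse_csv

    with open("example.csv", "r", encoding="utf-8") as f:
        table, total_lines, nb_duplicates = parse_csv(
            f, encoding="utf-8", sep=";", num_rows=500, skiprows=0
        )
    # for files shorter than CHUNK_SIZE both counts are known immediately,
    # otherwise they come back as None and are refined chunk by chunk later
    print(total_lines, nb_duplicates)
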
csv_detective/parsing/excel.py (new file)
@@ -0,0 +1,167 @@
+from io import BytesIO
+from time import time
+
+import openpyxl
+import pandas as pd
+import requests
+import xlrd
+
+from csv_detective.detection.engine import engine_to_file
+from csv_detective.detection.rows import remove_empty_first_rows
+from csv_detective.utils import (
+    display_logs_depending_process_time,
+    is_url,
+)
+
+NEW_EXCEL_EXT = [".xlsx", ".xlsm", ".xltx", ".xltm"]
+OLD_EXCEL_EXT = [".xls"]
+OPEN_OFFICE_EXT = [".odf", ".ods", ".odt"]
+XLS_LIKE_EXT = NEW_EXCEL_EXT + OLD_EXCEL_EXT + OPEN_OFFICE_EXT
+
+
+def parse_excel(
+    file_path: str,
+    num_rows: int = -1,
+    engine: str | None = None,
+    sheet_name: str | None = None,
+    random_state: int = 42,
+    verbose: bool = False,
+) -> tuple[pd.DataFrame, int, int, str, str, int]:
+    """Excel-like parsing is really slow, speeding it up could be a good improvement for future development."""
+    if verbose:
+        start = time()
+    no_sheet_specified = sheet_name is None
+
+    if engine in ["openpyxl", "xlrd"] or any(
+        [file_path.endswith(k) for k in NEW_EXCEL_EXT + OLD_EXCEL_EXT]
+    ):
+        remote_content = None
+        if is_url(file_path):
+            r = requests.get(file_path)
+            r.raise_for_status()
+            remote_content = BytesIO(r.content)
+        if not engine:
+            if any([file_path.endswith(k) for k in NEW_EXCEL_EXT]):
+                engine = "openpyxl"
+            else:
+                engine = "xlrd"
+        if sheet_name is None:
+            if verbose:
+                display_logs_depending_process_time(
+                    f"Detected {engine_to_file[engine]} file, no sheet specified, reading the largest one",
+                    time() - start,
+                )
+            try:
+                if engine == "openpyxl":
+                    # openpyxl doesn't want to open files that don't have a valid extension
+                    # see: https://foss.heptapod.net/openpyxl/openpyxl/-/issues/2157
+                    # if the file is remote, we have a remote content anyway so it's fine
+                    if not remote_content and "." not in file_path.split("/")[-1]:
+                        with open(file_path, "rb") as f:
+                            remote_content = BytesIO(f.read())
+                    # faster than loading all sheets
+                    wb = openpyxl.load_workbook(remote_content or file_path, read_only=True)
+                    try:
+                        sizes = {s.title: s.max_row * s.max_column for s in wb.worksheets}
+                    except TypeError:
+                        # sometimes read_only can't get the info, so we have to open the file for real
+                        # this takes more time but it's for a limited number of files
+                        # and it's this or nothing
+                        wb = openpyxl.load_workbook(remote_content or file_path)
+                        sizes = {s.title: s.max_row * s.max_column for s in wb.worksheets}
+                else:
+                    if remote_content:
+                        wb = xlrd.open_workbook(file_contents=remote_content.read())
+                    else:
+                        wb = xlrd.open_workbook(file_path)
+                    sizes = {s.name: s.nrows * s.ncols for s in wb.sheets()}
+                sheet_name = max(sizes, key=sizes.get)
+            except xlrd.biffh.XLRDError:
+                # sometimes an xls file is recognized as ods
+                if verbose:
+                    display_logs_depending_process_time(
+                        "Could not read file with classic xls reader, trying with ODS",
+                        time() - start,
+                    )
+                engine = "odf"
+
+    if engine == "odf" or any([file_path.endswith(k) for k in OPEN_OFFICE_EXT]):
+        # for ODS files, no way to get sheets' sizes without
+        # loading the file one way or another (pandas or pure odfpy)
+        # so all in one
+        engine = "odf"
+        if sheet_name is None:
+            if verbose:
+                display_logs_depending_process_time(
+                    f"Detected {engine_to_file[engine]} file, no sheet specified, reading the largest one",
+                    time() - start,
+                )
+            tables = pd.read_excel(
+                file_path,
+                engine="odf",
+                sheet_name=None,
+                dtype=str,
+            )
+            sizes = {sheet_name: table.size for sheet_name, table in tables.items()}
+            sheet_name = max(sizes, key=sizes.get)
+            if verbose:
+                display_logs_depending_process_time(
+                    f'Going forwards with sheet "{sheet_name}"',
+                    time() - start,
+                )
+            table = tables[sheet_name]
+        else:
+            if verbose:
+                display_logs_depending_process_time(
+                    f'Detected {engine_to_file[engine]} file, reading sheet "{sheet_name}"',
+                    time() - start,
+                )
+            table = pd.read_excel(
+                file_path,
+                engine="odf",
+                sheet_name=sheet_name,
+                dtype=str,
+            )
+        table, header_row_idx = remove_empty_first_rows(table)
+        total_lines = len(table)
+        nb_duplicates = len(table.loc[table.duplicated()])
+        if num_rows > 0:
+            num_rows = min(num_rows - 1, total_lines)
+            table = table.sample(num_rows, random_state=random_state)
+        if verbose:
+            display_logs_depending_process_time(
+                f"Table parsed successfully in {round(time() - start, 3)}s",
+                time() - start,
+            )
+        return table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx
+
+    # so here we end up with (old and new) excel files only
+    if verbose:
+        if no_sheet_specified:
+            display_logs_depending_process_time(
+                f'Going forwards with sheet "{sheet_name}"',
+                time() - start,
+            )
+        else:
+            display_logs_depending_process_time(
+                f'Detected {engine_to_file[engine]} file, reading sheet "{sheet_name}"',
+                time() - start,
+            )
+    table = pd.read_excel(
+        file_path,
+        engine=engine,
+        sheet_name=sheet_name,
+        dtype=str,
+    )
+    table, header_row_idx = remove_empty_first_rows(table)
+    total_lines = len(table)
+    nb_duplicates = len(table.loc[table.duplicated()])
+    if num_rows > 0:
+        num_rows = min(num_rows - 1, total_lines)
+        table = table.sample(num_rows, random_state=random_state)
+    if verbose:
+        display_logs_depending_process_time(
+            f"Table parsed successfully in {round(time() - start, 3)}s",
+            time() - start,
+        )
+    return table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx
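
As a quick illustration (not part of the released code), a sketch of the Excel entry point on a hypothetical workbook; with no sheet_name given, the largest sheet is selected as described above:

    from csv_detective.parsing.excel import parse_excel

    table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx = parse_excel(
        file_path="example.xlsx",
        num_rows=500,
        verbose=True,
    )
    print(sheet_name, engine, total_lines, header_row_idx)
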
csv_detective/parsing/load.py (new file)
@@ -0,0 +1,111 @@
+from io import BytesIO, StringIO
+
+import pandas as pd
+import requests
+
+from csv_detective.detection.columns import detect_heading_columns, detect_trailing_columns
+from csv_detective.detection.encoding import detect_encoding
+from csv_detective.detection.engine import (
+    COMPRESSION_ENGINES,
+    EXCEL_ENGINES,
+    detect_engine,
+)
+from csv_detective.detection.headers import detect_headers
+from csv_detective.detection.separator import detect_separator
+from csv_detective.parsing.compression import unzip
+from csv_detective.parsing.csv import parse_csv
+from csv_detective.parsing.excel import (
+    XLS_LIKE_EXT,
+    parse_excel,
+)
+from csv_detective.utils import is_url
+
+
+def load_file(
+    file_path: str,
+    num_rows: int = 500,
+    encoding: str | None = None,
+    sep: str | None = None,
+    verbose: bool = False,
+    sheet_name: str | int | None = None,
+) -> tuple[pd.DataFrame, dict]:
+    file_name = file_path.split("/")[-1]
+    engine = None
+    if "." not in file_name or not file_name.endswith("csv"):
+        # file has no extension or is not a csv, we'll investigate how to read it
+        engine = detect_engine(file_path, verbose=verbose)
+
+    if engine in EXCEL_ENGINES or any([file_path.endswith(k) for k in XLS_LIKE_EXT]):
+        table, total_lines, nb_duplicates, sheet_name, engine, header_row_idx = parse_excel(
+            file_path=file_path,
+            num_rows=num_rows,
+            engine=engine,
+            sheet_name=sheet_name,
+            verbose=verbose,
+        )
+        if table.empty:
+            raise ValueError("Table seems to be empty")
+        header = table.columns.to_list()
+        if any(col.startswith("Unnamed") for col in header):
+            raise ValueError("Could not retrieve headers")
+        analysis = {
+            "engine": engine,
+            "sheet_name": sheet_name,
+        }
+    else:
+        # fetching or reading file as binary
+        if is_url(file_path):
+            r = requests.get(file_path, allow_redirects=True)
+            r.raise_for_status()
+            binary_file = BytesIO(r.content)
+        else:
+            binary_file = open(file_path, "rb")
+        # handling compression
+        if engine in COMPRESSION_ENGINES:
+            binary_file: BytesIO = unzip(binary_file=binary_file, engine=engine)
+        # detecting encoding if not specified
+        if encoding is None:
+            encoding: str = detect_encoding(binary_file, verbose=verbose)
+            binary_file.seek(0)
+        # decoding and reading file
+        if is_url(file_path) or engine in COMPRESSION_ENGINES:
+            str_file = StringIO()
+            while True:
+                chunk = binary_file.read(1024**2)
+                if not chunk:
+                    break
+                str_file.write(chunk.decode(encoding=encoding))
+            del binary_file
+            str_file.seek(0)
+        else:
+            str_file = open(file_path, "r", encoding=encoding)
+        if sep is None:
+            sep = detect_separator(str_file, verbose=verbose)
+        header_row_idx, header = detect_headers(str_file, sep, verbose=verbose)
+        if header is None or (isinstance(header, list) and any([h is None for h in header])):
+            raise ValueError("Could not retrieve headers")
+        heading_columns = detect_heading_columns(str_file, sep, verbose=verbose)
+        trailing_columns = detect_trailing_columns(str_file, sep, heading_columns, verbose=verbose)
+        table, total_lines, nb_duplicates = parse_csv(
+            str_file, encoding, sep, num_rows, header_row_idx, verbose=verbose
+        )
+        del str_file
+        if table.empty:
+            raise ValueError("Table seems to be empty")
+        analysis = {
+            "encoding": encoding,
+            "separator": sep,
+            "heading_columns": heading_columns,
+            "trailing_columns": trailing_columns,
+        }
+        if engine is not None:
+            analysis["compression"] = engine
+    analysis |= {
+        "header_row_idx": header_row_idx,
+        "header": header,
+    }
+    if total_lines is not None:
+        analysis["total_lines"] = total_lines
+    if nb_duplicates is not None:
+        analysis["nb_duplicates"] = nb_duplicates
+    return table, analysis
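
As a quick illustration (not part of the released code), a sketch of the top-level loader on a hypothetical file; load_file routes to the CSV or Excel path and returns the parsed table together with the detection metadata collected along the way:

    from csv_detective.parsing.load import load_file

    table, analysis = load_file("example.csv", num_rows=500, verbose=True)
    # for a csv, analysis holds encoding, separator, header info and line counts
    print(analysis["encoding"], analysis["separator"], analysis["header"])
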