csv-detective 0.6.7__py3-none-any.whl → 0.9.3.dev2438__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228)
  1. csv_detective/__init__.py +7 -1
  2. csv_detective/cli.py +33 -21
  3. csv_detective/{detect_fields/FR → detection}/__init__.py +0 -0
  4. csv_detective/detection/columns.py +89 -0
  5. csv_detective/detection/encoding.py +29 -0
  6. csv_detective/detection/engine.py +46 -0
  7. csv_detective/detection/formats.py +156 -0
  8. csv_detective/detection/headers.py +28 -0
  9. csv_detective/detection/rows.py +18 -0
  10. csv_detective/detection/separator.py +44 -0
  11. csv_detective/detection/variables.py +97 -0
  12. csv_detective/explore_csv.py +151 -377
  13. csv_detective/format.py +67 -0
  14. csv_detective/formats/__init__.py +9 -0
  15. csv_detective/formats/adresse.py +116 -0
  16. csv_detective/formats/binary.py +26 -0
  17. csv_detective/formats/booleen.py +35 -0
  18. csv_detective/formats/code_commune_insee.py +26 -0
  19. csv_detective/formats/code_csp_insee.py +36 -0
  20. csv_detective/formats/code_departement.py +29 -0
  21. csv_detective/formats/code_fantoir.py +21 -0
  22. csv_detective/formats/code_import.py +17 -0
  23. csv_detective/formats/code_postal.py +25 -0
  24. csv_detective/formats/code_region.py +22 -0
  25. csv_detective/formats/code_rna.py +29 -0
  26. csv_detective/formats/code_waldec.py +17 -0
  27. csv_detective/formats/commune.py +27 -0
  28. csv_detective/formats/csp_insee.py +31 -0
  29. csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
  30. csv_detective/formats/date.py +99 -0
  31. csv_detective/formats/date_fr.py +22 -0
  32. csv_detective/formats/datetime_aware.py +45 -0
  33. csv_detective/formats/datetime_naive.py +48 -0
  34. csv_detective/formats/datetime_rfc822.py +24 -0
  35. csv_detective/formats/departement.py +37 -0
  36. csv_detective/formats/email.py +28 -0
  37. csv_detective/formats/float.py +29 -0
  38. csv_detective/formats/geojson.py +36 -0
  39. csv_detective/formats/insee_ape700.py +31 -0
  40. csv_detective/formats/insee_canton.py +28 -0
  41. csv_detective/formats/int.py +23 -0
  42. csv_detective/formats/iso_country_code_alpha2.py +30 -0
  43. csv_detective/formats/iso_country_code_alpha3.py +30 -0
  44. csv_detective/formats/iso_country_code_numeric.py +31 -0
  45. csv_detective/formats/jour_de_la_semaine.py +41 -0
  46. csv_detective/formats/json.py +20 -0
  47. csv_detective/formats/latitude_l93.py +48 -0
  48. csv_detective/formats/latitude_wgs.py +42 -0
  49. csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
  50. csv_detective/formats/latlon_wgs.py +53 -0
  51. csv_detective/formats/longitude_l93.py +39 -0
  52. csv_detective/formats/longitude_wgs.py +32 -0
  53. csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
  54. csv_detective/formats/lonlat_wgs.py +36 -0
  55. csv_detective/formats/mois_de_lannee.py +48 -0
  56. csv_detective/formats/money.py +18 -0
  57. csv_detective/formats/mongo_object_id.py +14 -0
  58. csv_detective/formats/pays.py +35 -0
  59. csv_detective/formats/percent.py +16 -0
  60. csv_detective/formats/region.py +70 -0
  61. csv_detective/formats/sexe.py +17 -0
  62. csv_detective/formats/siren.py +37 -0
  63. csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -29
  64. csv_detective/formats/tel_fr.py +36 -0
  65. csv_detective/formats/uai.py +36 -0
  66. csv_detective/formats/url.py +46 -0
  67. csv_detective/formats/username.py +14 -0
  68. csv_detective/formats/uuid.py +16 -0
  69. csv_detective/formats/year.py +28 -0
  70. csv_detective/output/__init__.py +65 -0
  71. csv_detective/output/dataframe.py +96 -0
  72. csv_detective/output/example.py +250 -0
  73. csv_detective/output/profile.py +119 -0
  74. csv_detective/{schema_generation.py → output/schema.py} +268 -343
  75. csv_detective/output/utils.py +74 -0
  76. csv_detective/{detect_fields/FR/geo → parsing}/__init__.py +0 -0
  77. csv_detective/parsing/columns.py +235 -0
  78. csv_detective/parsing/compression.py +11 -0
  79. csv_detective/parsing/csv.py +56 -0
  80. csv_detective/parsing/excel.py +167 -0
  81. csv_detective/parsing/load.py +111 -0
  82. csv_detective/parsing/text.py +56 -0
  83. csv_detective/utils.py +23 -196
  84. csv_detective/validate.py +138 -0
  85. csv_detective-0.9.3.dev2438.dist-info/METADATA +267 -0
  86. csv_detective-0.9.3.dev2438.dist-info/RECORD +92 -0
  87. csv_detective-0.9.3.dev2438.dist-info/WHEEL +4 -0
  88. {csv_detective-0.6.7.dist-info → csv_detective-0.9.3.dev2438.dist-info}/entry_points.txt +1 -0
  89. csv_detective/all_packages.txt +0 -104
  90. csv_detective/detect_fields/FR/geo/adresse/__init__.py +0 -100
  91. csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -24
  92. csv_detective/detect_fields/FR/geo/code_commune_insee/code_commune_insee.txt +0 -37600
  93. csv_detective/detect_fields/FR/geo/code_departement/__init__.py +0 -11
  94. csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -15
  95. csv_detective/detect_fields/FR/geo/code_fantoir/code_fantoir.txt +0 -26122
  96. csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -19
  97. csv_detective/detect_fields/FR/geo/code_postal/code_postal.txt +0 -36822
  98. csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -27
  99. csv_detective/detect_fields/FR/geo/commune/__init__.py +0 -21
  100. csv_detective/detect_fields/FR/geo/commune/commune.txt +0 -36745
  101. csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -19
  102. csv_detective/detect_fields/FR/geo/departement/departement.txt +0 -101
  103. csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +0 -20
  104. csv_detective/detect_fields/FR/geo/insee_canton/canton2017.txt +0 -2055
  105. csv_detective/detect_fields/FR/geo/insee_canton/cantons.txt +0 -2055
  106. csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -13
  107. csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
  108. csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -13
  109. csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
  110. csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -17
  111. csv_detective/detect_fields/FR/geo/pays/pays.txt +0 -248
  112. csv_detective/detect_fields/FR/geo/region/__init__.py +0 -16
  113. csv_detective/detect_fields/FR/geo/region/region.txt +0 -44
  114. csv_detective/detect_fields/FR/other/__init__.py +0 -0
  115. csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +0 -26
  116. csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
  117. csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -8
  118. csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -12
  119. csv_detective/detect_fields/FR/other/csp_insee/__init__.py +0 -16
  120. csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
  121. csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +0 -16
  122. csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -9
  123. csv_detective/detect_fields/FR/other/siren/__init__.py +0 -18
  124. csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -15
  125. csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
  126. csv_detective/detect_fields/FR/temp/__init__.py +0 -0
  127. csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +0 -23
  128. csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +0 -37
  129. csv_detective/detect_fields/__init__.py +0 -57
  130. csv_detective/detect_fields/geo/__init__.py +0 -0
  131. csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
  132. csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
  133. csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
  134. csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -22
  135. csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
  136. csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -15
  137. csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
  138. csv_detective/detect_fields/other/__init__.py +0 -0
  139. csv_detective/detect_fields/other/booleen/__init__.py +0 -21
  140. csv_detective/detect_fields/other/email/__init__.py +0 -8
  141. csv_detective/detect_fields/other/float/__init__.py +0 -17
  142. csv_detective/detect_fields/other/int/__init__.py +0 -12
  143. csv_detective/detect_fields/other/json/__init__.py +0 -24
  144. csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
  145. csv_detective/detect_fields/other/twitter/__init__.py +0 -8
  146. csv_detective/detect_fields/other/url/__init__.py +0 -11
  147. csv_detective/detect_fields/other/uuid/__init__.py +0 -11
  148. csv_detective/detect_fields/temp/__init__.py +0 -0
  149. csv_detective/detect_fields/temp/date/__init__.py +0 -62
  150. csv_detective/detect_fields/temp/datetime_iso/__init__.py +0 -18
  151. csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +0 -21
  152. csv_detective/detect_fields/temp/year/__init__.py +0 -10
  153. csv_detective/detect_labels/FR/__init__.py +0 -0
  154. csv_detective/detect_labels/FR/geo/__init__.py +0 -0
  155. csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -40
  156. csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -42
  157. csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -33
  158. csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -33
  159. csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -41
  160. csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -33
  161. csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -33
  162. csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -47
  163. csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -33
  164. csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -54
  165. csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -55
  166. csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -44
  167. csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -45
  168. csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -45
  169. csv_detective/detect_labels/FR/geo/region/__init__.py +0 -45
  170. csv_detective/detect_labels/FR/other/__init__.py +0 -0
  171. csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -33
  172. csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -38
  173. csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -33
  174. csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -37
  175. csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -33
  176. csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -40
  177. csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -33
  178. csv_detective/detect_labels/FR/other/siren/__init__.py +0 -41
  179. csv_detective/detect_labels/FR/other/siret/__init__.py +0 -40
  180. csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -45
  181. csv_detective/detect_labels/FR/other/uai/__init__.py +0 -50
  182. csv_detective/detect_labels/FR/temp/__init__.py +0 -0
  183. csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -41
  184. csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -33
  185. csv_detective/detect_labels/__init__.py +0 -43
  186. csv_detective/detect_labels/geo/__init__.py +0 -0
  187. csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -41
  188. csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -41
  189. csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -41
  190. csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -42
  191. csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -55
  192. csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -67
  193. csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -45
  194. csv_detective/detect_labels/other/__init__.py +0 -0
  195. csv_detective/detect_labels/other/booleen/__init__.py +0 -34
  196. csv_detective/detect_labels/other/email/__init__.py +0 -45
  197. csv_detective/detect_labels/other/float/__init__.py +0 -33
  198. csv_detective/detect_labels/other/int/__init__.py +0 -33
  199. csv_detective/detect_labels/other/money/__init__.py +0 -11
  200. csv_detective/detect_labels/other/money/check_col_name.py +0 -8
  201. csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -33
  202. csv_detective/detect_labels/other/twitter/__init__.py +0 -33
  203. csv_detective/detect_labels/other/url/__init__.py +0 -48
  204. csv_detective/detect_labels/other/uuid/__init__.py +0 -33
  205. csv_detective/detect_labels/temp/__init__.py +0 -0
  206. csv_detective/detect_labels/temp/date/__init__.py +0 -51
  207. csv_detective/detect_labels/temp/datetime_iso/__init__.py +0 -45
  208. csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -44
  209. csv_detective/detect_labels/temp/year/__init__.py +0 -44
  210. csv_detective/detection.py +0 -361
  211. csv_detective/process_text.py +0 -39
  212. csv_detective/s3_utils.py +0 -48
  213. csv_detective-0.6.7.data/data/share/csv_detective/CHANGELOG.md +0 -118
  214. csv_detective-0.6.7.data/data/share/csv_detective/LICENSE.AGPL.txt +0 -661
  215. csv_detective-0.6.7.data/data/share/csv_detective/README.md +0 -247
  216. csv_detective-0.6.7.dist-info/LICENSE.AGPL.txt +0 -661
  217. csv_detective-0.6.7.dist-info/METADATA +0 -23
  218. csv_detective-0.6.7.dist-info/RECORD +0 -150
  219. csv_detective-0.6.7.dist-info/WHEEL +0 -5
  220. csv_detective-0.6.7.dist-info/top_level.txt +0 -2
  221. tests/__init__.py +0 -0
  222. tests/test_fields.py +0 -360
  223. tests/test_file.py +0 -116
  224. tests/test_labels.py +0 -7
  225. /csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
  226. /csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
  227. /csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
  228. /csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0
csv_detective/explore_csv.py
@@ -1,413 +1,187 @@
- """
- Ce script analyse les premières lignes d'un CSV pour essayer de déterminer le
- contenu possible des champs
- """
-
- from typing import Dict, List, Literal, Union
- import json
- import numpy as np
- import os
- import tempfile
- from pkg_resources import resource_string
  import logging
  from time import time

- # flake8: noqa
- from csv_detective import detect_fields
- from csv_detective import detect_labels
- from csv_detective.s3_utils import download_from_minio, upload_to_minio
- from csv_detective.schema_generation import generate_table_schema
- from csv_detective.utils import test_col, test_label, prepare_output_dict, display_logs_depending_process_time
- from .detection import (
-     detect_separator,
-     detect_encoding,
-     detect_headers,
-     detect_heading_columns,
-     detect_trailing_columns,
-     parse_table,
-     create_profile,
-     detetect_categorical_variable,
-     # detect_continuous_variable,
- )
+ import pandas as pd

+ from csv_detective.detection.formats import detect_formats
+ from csv_detective.output import generate_output
+ from csv_detective.parsing.load import load_file
+ from csv_detective.utils import display_logs_depending_process_time, is_url
+ from csv_detective.validate import validate

  logging.basicConfig(level=logging.INFO)

- def return_all_tests(user_input_tests, detect_type="detect_fields"):
-     """
-     returns all tests that have a method _is and are listed in the user_input_tests
-     the function can select a sub_package from csv_detective
-     """
-     all_packages = resource_string(__name__, "all_packages.txt")
-     all_packages = all_packages.decode().split("\n")
-     all_packages.remove("")
-     all_packages.remove("csv_detective")
-     all_packages = [x.replace("csv_detective.", "") for x in all_packages]
-
-     if user_input_tests is None:
-         return []
-
-     if isinstance(user_input_tests, str):
-         assert user_input_tests[0] != "-"
-         if user_input_tests == "ALL":
-             tests_to_do = [detect_type]
-         else:
-             tests_to_do = [detect_type + "." + user_input_tests]
-         tests_to_not_do = []
-     elif isinstance(user_input_tests, list):
-         if "ALL" in user_input_tests:
-             tests_to_do = [detect_type]
-         else:
-             tests_to_do = [
-                 detect_type + "." + x for x in user_input_tests if x[0] != "-"
-             ]
-         tests_to_not_do = [
-             detect_type + "." + x[1:] for x in user_input_tests if x[0] == "-"
-         ]
-
-     all_fields = [
-         x
-         for x in all_packages
-         if any([y == x[: len(y)] for y in tests_to_do])
-         and all([y != x[: len(y)] for y in tests_to_not_do])
-     ]
-     all_tests = [eval(field) for field in all_fields]
-     all_tests = [
-         test for test in all_tests if "_is" in dir(test)
-     ]  # TODO : Fix this shit
-     return all_tests
-

  def routine(
-     csv_file_path: str,
+     file_path: str,
      num_rows: int = 500,
-     user_input_tests: Union[str, List[str]] = "ALL",
-     output_mode: Literal["ALL", "LIMITED"] = "LIMITED",
-     save_results: bool = True,
-     encoding: str = None,
-     sep: str = None,
+     tags: list[str] | None = None,
+     limited_output: bool = True,
+     save_results: bool | str = True,
+     encoding: str | None = None,
+     sep: str | None = None,
+     skipna: bool = True,
      output_profile: bool = False,
      output_schema: bool = False,
-     verbose: bool = False
- ):
-     """Returns a dict with information about the csv table and possible
-     column contents.
+     output_df: bool = False,
+     cast_json: bool = True,
+     verbose: bool = False,
+     sheet_name: str | int | None = None,
+ ) -> dict | tuple[dict, pd.DataFrame]:
+     """
+     Returns a dict with information about the table and possible column contents, and if requested the DataFrame with columns cast according to analysis.

      Args:
-         csv_file_path: local path to CSV file if not using Minio
-         num_rows: number of rows to sample from the file for analysis ; -1 for analysis
-             of the whole file
-         user_input_tests: tests to run on the file
-         output_mode: LIMITED or ALL, whether or not to return all possible types or only
-             the most likely one for each column
-         save_results: whether or not to save the results in a json file
+         file_path: local path or URL to file
+         num_rows: number of rows to sample from the file for analysis ; -1 for analysis of the whole file
+         tags: tags to filter formats (for instance ["geo", "fr"] to run only the checks related to geo and French formats)
+         limited_output: whether or not to return all possible types or only the most likely one for each column
+         save_results: whether or not to save the results in a json file, or the path where to dump the output
          output_profile: whether or not to add the 'profile' field to the output
          output_schema: whether or not to add the 'schema' field to the output (tableschema)
-         verbose: whether or not to print process logs in console
+         output_df: whether or not to return the loaded DataFrame along with the analysis report
+         cast_json: whether or not to cast json columns into objects (otherwise they are returned as strings)
+         verbose: whether or not to print process logs in console
+         sheet_name: if reading a multi-sheet file (xls-like), which sheet to consider
+         skipna: whether to skip NaN (empty cells) in the tests

      Returns:
          dict: a dict with information about the csv and possible types for each column
      """
-     if verbose:
-         start_routine = time()
-     if csv_file_path is None:
-         raise ValueError("csv_file_path is required.")
-
-     if encoding is None:
-         binary_file = open(csv_file_path, mode="rb")
-         encoding = detect_encoding(binary_file, verbose=verbose)
-
-     with open(csv_file_path, "r", encoding=encoding) as str_file:
-         if sep is None:
-             sep = detect_separator(str_file, verbose=verbose)
-         header_row_idx, header = detect_headers(str_file, sep, verbose=verbose)
-         if header is None:
-             return_dict = {"error": True}
-             return return_dict
-         elif isinstance(header, list):
-             if any([x is None for x in header]):
-                 return_dict = {"error": True}
-                 return return_dict
-         heading_columns = detect_heading_columns(str_file, sep, verbose=verbose)
-         trailing_columns = detect_trailing_columns(str_file, sep, heading_columns, verbose=verbose)
-         table, total_lines, nb_duplicates = parse_table(
-             str_file, encoding, sep, num_rows, header_row_idx, verbose=verbose
-         )
-
-     if table.empty:
-         res_categorical = []
-         # res_continuous = []
-     else:
-         # Detects columns that are categorical
-         res_categorical, categorical_mask = detetect_categorical_variable(table, verbose=verbose)
-         res_categorical = list(res_categorical)
-         # Detect columns that are continuous (we already know the categorical) : we don't need this for now, cuts processing time
-         # res_continuous = list(
-         #     detect_continuous_variable(table.iloc[:, ~categorical_mask.values], verbose=verbose)
-         # )
-
-     # Creating return dictionary
-     return_dict = dict()
-     return_dict["encoding"] = encoding
-     return_dict["separator"] = sep
-     return_dict["header_row_idx"] = header_row_idx
-     return_dict["header"] = header
-     return_dict["total_lines"] = total_lines
-     return_dict["nb_duplicates"] = nb_duplicates
-
-     return_dict["heading_columns"] = heading_columns
-     return_dict["trailing_columns"] = trailing_columns

-     # return_dict["continuous"] = res_continuous
-     return_dict["categorical"] = res_categorical
-
-     # list testing to be performed
-     all_tests_fields = return_all_tests(
-         user_input_tests, detect_type="detect_fields"
-     )  # list all tests for the fields
-     all_tests_labels = return_all_tests(
-         user_input_tests, detect_type="detect_labels"
-     )  # list all tests for the labels
-
-     # if no testing then return
-     if not all_tests_fields and not all_tests_labels:
-         return return_dict
-
-     # Perform testing on fields
-     return_table_fields = test_col(table, all_tests_fields, output_mode, verbose=verbose)
-     return_dict_cols_fields = prepare_output_dict(return_table_fields, output_mode)
-     return_dict["columns_fields"] = return_dict_cols_fields
+     if not (
+         isinstance(save_results, bool)
+         or (isinstance(save_results, str) and save_results.endswith(".json"))
+     ):
+         raise ValueError("`save_results` must be a bool or a valid path to a json file.")

-     # Perform testing on labels
-     return_table_labels = test_label(table, all_tests_labels, output_mode, verbose=verbose)
-     return_dict_cols_labels = prepare_output_dict(return_table_labels, output_mode)
-     return_dict["columns_labels"] = return_dict_cols_labels
+     if verbose:
+         start_routine = time()
+     if is_url(file_path):
+         logging.info("Path recognized as a URL")

-     # Multiply the results of the fields by 1 + 0.5 * the results of the labels.
-     # This is because the fields are more important than the labels and yields a max
-     # of 1.5 for the final score.
-     return_table = return_table_fields * (
-         1
-         + return_table_labels.reindex(
-             index=return_table_fields.index, fill_value=0
-         ).values
-         / 2
+     table, analysis = load_file(
+         file_path=file_path,
+         num_rows=num_rows,
+         encoding=encoding,
+         sep=sep,
+         verbose=verbose,
+         sheet_name=sheet_name,
      )

-     # To reduce false positives: ensure these formats are detected only if the label yields
-     # a detection.
-     formats_with_mandatory_label = [
-         "code_departement",
-         "code_commune_insee",
-         "code_postal",
-         "latitude_wgs",
-         "longitude_wgs",
-         "latitude_wgs_fr_metropole",
-         "longitude_wgs_fr_metropole",
-         "latitude_l93",
-         "longitude_l93",
-     ]
-     return_table.loc[formats_with_mandatory_label, :] = np.where(
-         return_table_labels.loc[formats_with_mandatory_label, :],
-         return_table.loc[formats_with_mandatory_label, :],
-         0,
+     analysis, _col_values = detect_formats(
+         table=table,
+         analysis=analysis,
+         file_path=file_path,
+         tags=tags,
+         limited_output=limited_output,
+         skipna=skipna,
+         verbose=verbose,
      )
-     return_dict_cols = prepare_output_dict(return_table, output_mode)
-     return_dict["columns"] = return_dict_cols

-     metier_to_python_type = {
-         "booleen": "bool",
-         "int": "int",
-         "float": "float",
-         "string": "string",
-         "json": "json",
-         "json_geojson": "json",
-         "datetime": "datetime",
-         "date": "date",
-         "latitude": "float",
-         "latitude_l93": "float",
-         "latitude_wgs": "float",
-         "latitude_wgs_fr_metropole": "float",
-         "longitude": "float",
-         "longitude_l93": "float",
-         "longitude_wgs": "float",
-         "longitude_wgs_fr_metropole": "float",
-     }
-
-     if output_mode == "ALL":
-         for detection_method in ["columns_fields", "columns_labels", "columns"]:
-             return_dict[detection_method] = {
-                 col_name: [
-                     {
-                         "python_type": metier_to_python_type.get(
-                             detection["format"], "string"
-                         ),
-                         **detection,
-                     }
-                     for detection in detections
-                 ]
-                 for col_name, detections in return_dict[detection_method].items()
-             }
-     if output_mode == "LIMITED":
-         for detection_method in ["columns_fields", "columns_labels", "columns"]:
-             return_dict[detection_method] = {
-                 col_name: {
-                     "python_type": metier_to_python_type.get(
-                         detection["format"], "string"
-                     ),
-                     **detection,
-                 }
-                 for col_name, detection in return_dict[detection_method].items()
-             }
-
-     # Add detection with formats as keys
-     return_dict["formats"] = {
-         column_metadata["format"]: []
-         for column_metadata in return_dict["columns"].values()
-     }
-     for header, col_metadata in return_dict["columns"].items():
-         return_dict["formats"][col_metadata["format"]].append(header)
-
-     if output_profile:
-         return_dict["profile"] = create_profile(
-             table, return_dict["columns"],
-             sep,
-             encoding,
-             num_rows,
-             header_row_idx,
-             verbose=verbose
-         )
-
-     if save_results:
-         # Write your file as json
-         output_path_to_store_minio_file = os.path.splitext(csv_file_path)[0] + ".json"
-         with open(output_path_to_store_minio_file, "w", encoding="utf8") as fp:
-             json.dump(return_dict, fp, indent=4, separators=(",", ": "), ensure_ascii=False)
-
-     if output_schema and output_mode != "ALL":
-         return_dict["schema"] = generate_table_schema(
-             return_dict,
-             save_file=False,
-             verbose=verbose
-         )
-     if verbose:
-         display_logs_depending_process_time(
-             f'Routine completed in {round(time() - start_routine, 3)}s',
-             time() - start_routine
+     try:
+         return generate_output(
+             table=table,
+             analysis=analysis,
+             file_path=file_path,
+             num_rows=num_rows,
+             limited_output=limited_output,
+             save_results=save_results,
+             output_profile=output_profile,
+             output_schema=output_schema,
+             output_df=output_df,
+             cast_json=cast_json,
+             verbose=verbose,
+             sheet_name=sheet_name,
+             _col_values=_col_values,
          )
-     return return_dict
+     finally:
+         if verbose:
+             display_logs_depending_process_time(
+                 f"Routine completed in {round(time() - start_routine, 3)}s", time() - start_routine
+             )


- def routine_minio(
-     csv_minio_location: Dict[str, str],
-     output_minio_location: Dict[str, str],
-     tableschema_minio_location: Dict[str, str],
-     minio_user: str,
-     minio_pwd: str,
+ def validate_then_detect(
+     file_path: str,
+     previous_analysis: dict,
      num_rows: int = 500,
-     user_input_tests: Union[str, List[str]] = "ALL",
-     encoding: str = None,
-     sep: str = None,
+     tags: list[str] | None = None,
+     limited_output: bool = True,
+     save_results: bool | str = True,
+     skipna: bool = True,
+     output_profile: bool = False,
+     output_schema: bool = False,
+     output_df: bool = False,
+     cast_json: bool = True,
+     verbose: bool = False,
  ):
-     """Returns a dict with information about the csv table and possible
-     column contents.
+     """
+     Performs a validation of the given file against the given analysis.
+     If the validation fails, performs a full analysis and returns it.
+     Otherwise returns the previous analysis (which is therefore still valid).
+     NB: if asked, the profile is recreated in both cases.

      Args:
-         csv_minio_location: dict with Minio URL, bucket and key of the CSV file
-         output_minio_location: Minio URL, bucket and key to store output file. None if
-             not uploading to Minio.
-         tableschema_minio_location: Minio URL, bucket and key to store tableschema file.
-             None if not uploading the tableschema to Minio.
-         minio_user: user name for the minio instance
-         minio_pwd: password for the minio instance
-         num_rows: number of rows to sample from the file for analysis ; -1 for analysis of
-             the whole file
-         user_input_tests: tests to run on the file
-         output_mode: LIMITED or ALL, whether or not to return all possible types or only
-             the most likely one for each column
-
-     Returns:
-         dict: a dict with information about the csv and possible types for each column
+         file_path: the path of the file to validate.
+         previous_analysis: the previous analysis to validate against (expected in the same structure as the output of the routine)
+         num_rows: number of rows to sample from the file for analysis ; -1 for analysis of the whole file
+         tags: tags to filter formats (for instance ["geo", "fr"] to run only the checks related to geo and French formats)
+         limited_output: whether or not to return all possible types or only the most likely one for each column
+         save_results: whether or not to save the results in a json file, or the path where to dump the output
+         skipna: whether to ignore NaN values in the checks
+         output_profile: whether or not to add the 'profile' field to the output
+         output_schema: whether or not to add the 'schema' field to the output (tableschema)
+         output_df: whether or not to return the loaded DataFrame along with the analysis report
+         cast_json: whether or not to cast json columns into objects (otherwise they are returned as strings)
+         verbose: whether the code displays the steps it's going through
      """
-
-     if (
-         (
-             any(
-                 [
-                     location_dict is not None
-                     for location_dict in [
-                         csv_minio_location,
-                         output_minio_location,
-                         tableschema_minio_location,
-                     ]
-                 ]
-             )
-         )
-         and (minio_user is None)
-         or (minio_pwd is None)
-     ):
-         raise ValueError("Minio credentials are required if using Minio")
-
-     for location_dict in [
-         csv_minio_location,
-         output_minio_location,
-         tableschema_minio_location,
-     ]:
-         if location_dict is not None:
-             if any(
-                 [
-                     (location_key not in location_dict)
-                     or (location_dict[location_key] is None)
-                     for location_key in ["netloc", "bucket", "key"]
-                 ]
-             ):
-                 raise ValueError("Minio location dict must contain url, bucket and key")
-
-     csv_file_path = tempfile.NamedTemporaryFile(delete=False).name
-     download_from_minio(
-         netloc=csv_minio_location["netloc"],
-         bucket=csv_minio_location["bucket"],
-         key=csv_minio_location["key"],
-         filepath=csv_file_path,
-         minio_user=minio_user,
-         minio_pwd=minio_pwd,
-     )
-
-     return_dict = routine(
-         csv_file_path,
-         num_rows,
-         user_input_tests,
-         output_mode="LIMITED",
-         save_results=True,
-         encoding=encoding,
-         sep=sep,
-     )
-
-     # Write report JSON file.
-     output_path_to_store_minio_file = os.path.splitext(csv_file_path)[0] + ".json"
-     with open(output_path_to_store_minio_file, "w", encoding="utf8") as fp:
-         json.dump(return_dict, fp, indent=4, separators=(",", ": "))
-
-     upload_to_minio(
-         netloc=output_minio_location["netloc"],
-         bucket=output_minio_location["bucket"],
-         key=output_minio_location["key"],
-         filepath=output_path_to_store_minio_file,
-         minio_user=minio_user,
-         minio_pwd=minio_pwd,
-     )
-
-     os.remove(output_path_to_store_minio_file)
-     os.remove(csv_file_path)
-
-     generate_table_schema(
-         return_dict,
-         True,
-         netloc=tableschema_minio_location["netloc"],
-         bucket=tableschema_minio_location["bucket"],
-         key=tableschema_minio_location["key"],
-         minio_user=minio_user,
-         minio_pwd=minio_pwd,
+     if verbose:
+         start_routine = time()
+     if is_url(file_path):
+         logging.info("Path recognized as a URL")
+
+     is_valid, table, analysis, col_values = validate(
+         file_path=file_path,
+         previous_analysis=previous_analysis,
+         verbose=verbose,
+         skipna=skipna,
      )
-
-     return return_dict
+     if analysis is None:
+         # if loading failed in validate, we load it from scratch
+         table, analysis = load_file(
+             file_path=file_path,
+             num_rows=num_rows,
+             verbose=verbose,
+         )
+     if not is_valid:
+         analysis, col_values = detect_formats(
+             table=table,
+             analysis=analysis,
+             file_path=file_path,
+             tags=tags,
+             limited_output=limited_output,
+             skipna=skipna,
+             verbose=verbose,
+         )
+     try:
+         return generate_output(
+             table=table,
+             analysis=analysis,
+             file_path=file_path,
+             num_rows=num_rows,
+             limited_output=limited_output,
+             save_results=save_results,
+             output_profile=output_profile,
+             output_schema=output_schema,
+             output_df=output_df,
+             cast_json=cast_json,
+             verbose=verbose,
+             sheet_name=analysis.get("sheet_name"),
+             _col_values=col_values,
+         )
+     finally:
+         if verbose:
+             display_logs_depending_process_time(
+                 f"Process completed in {round(time() - start_routine, 3)}s", time() - start_routine
+             )
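
The public entry point thus changes shape between these versions: routine() now takes a local path or a URL, a tags filter instead of user_input_tests, and can hand back the typed DataFrame alongside the report. Below is a minimal sketch of the new calls, assuming routine and validate_then_detect are importable from csv_detective.explore_csv as the diff shows; the file name "data.csv" is hypothetical and not taken from this diff.

from csv_detective.explore_csv import routine, validate_then_detect

# Analyse the first 500 rows; output_df=True makes routine return the
# analysis dict together with the DataFrame cast to the detected formats.
analysis, df = routine(
    file_path="data.csv",   # hypothetical file; a URL is also accepted now
    num_rows=500,
    save_results=False,     # or a path ending in ".json" to dump the report
    output_df=True,
)

# On later runs, validate the stored report first and only re-detect on failure.
report = validate_then_detect(
    file_path="data.csv",
    previous_analysis=analysis,
    save_results=False,
)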
csv_detective/format.py
@@ -0,0 +1,67 @@
+ from typing import Any, Callable
+
+ from csv_detective.parsing.text import header_score
+
+
+ class Format:
+     def __init__(
+         self,
+         name: str,
+         func: Callable[[Any], bool],
+         _test_values: dict[bool, list[str]],
+         labels: list[str] = [],
+         proportion: float = 1,
+         tags: list[str] = [],
+     ) -> None:
+         """
+         Instantiates a Format object.
+
+         Args:
+             name: the name of the format.
+             func: the value test for the format (returns whether a string is valid).
+             _test_values: lists of valid and invalid values, used in the tests
+             labels: the list of hint headers for the header score
+             proportion: the tolerance (between 0 and 1) to say a column is valid for a format (1 => 100% of the column has to pass the func check for the column to be considered valid)
+             tags: to allow users to submit a file to only a subset of formats
+         """
+         self.name: str = name
+         self.func: Callable = func
+         self._test_values: dict[bool, list[str]] = _test_values
+         self.labels: list[str] = labels
+         self.proportion: float = proportion
+         self.tags: list[str] = tags
+
+     def is_valid_label(self, val: str) -> float:
+         return header_score(val, self.labels)
+
+
+ class FormatsManager:
+     formats: dict[str, Format]
+
+     def __init__(self) -> None:
+         import csv_detective.formats as formats
+
+         format_labels = [f for f in dir(formats) if "_is" in dir(getattr(formats, f))]
+         self.formats = {
+             label: Format(
+                 name=label,
+                 func=(module := getattr(formats, label))._is,
+                 _test_values=module._test_values,
+                 **{
+                     attr: val
+                     for attr in ["labels", "proportion", "tags"]
+                     if (val := getattr(module, attr, None))
+                 },
+             )
+             for label in format_labels
+         }
+
+     def get_formats_from_tags(self, tags: list[str]) -> dict[str, Format]:
+         return {
+             label: fmt
+             for label, fmt in self.formats.items()
+             if all(tag in fmt.tags for tag in tags)
+         }
+
+     def available_tags(self) -> set[str]:
+         return set(tag for format in self.formats.values() for tag in format.tags)
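
This Format/FormatsManager pair replaces the old return_all_tests() plumbing: every module under csv_detective.formats exposing an _is check is wrapped into a Format at import time. A short sketch of how the registry might be queried, using only the methods defined above; the "geo" tag value is borrowed from the routine docstring and the printed attributes are purely illustrative.

from csv_detective.format import FormatsManager

manager = FormatsManager()       # wraps every csv_detective.formats module that defines _is
print(manager.available_tags())  # the set of tags declared across all format modules

# Keep only formats carrying all the requested tags, as routine(tags=...) does.
geo_formats = manager.get_formats_from_tags(["geo"])
for name, fmt in geo_formats.items():
    # fmt.func is the module's _is test; fmt.proportion is the share of a
    # column that must pass it for the column to be declared this format.
    print(name, fmt.proportion, fmt.labels)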
csv_detective/formats/__init__.py
@@ -0,0 +1,9 @@
+ import importlib
+ import os
+
+ for file in os.listdir(os.path.dirname(__file__)):
+     if file.endswith(".py") and not file.startswith("_"):
+         module_name = file[:-3]
+         module = importlib.import_module(f"csv_detective.formats.{module_name}")
+         globals()[module_name] = module
+         del module
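
Together with FormatsManager above, this import loop turns csv_detective/formats/ into a plugin directory: dropping a new module there is enough to register it. The hypothetical module below shows the attributes the manager looks for (_is and _test_values are required; labels, proportion and tags are optional); this exact module does not exist in the package.

# csv_detective/formats/my_code.py -- hypothetical example module
import re

labels = ["mon code", "my code"]  # optional: header hints scored by is_valid_label
proportion = 0.9                  # optional: 90% of values must pass _is (default 1)
tags = ["demo"]                   # optional: lets routine(tags=["demo"]) select it

_pattern = re.compile(r"^[A-Z]{2}\d{4}$")

def _is(val: str) -> bool:
    # value-level check: FormatsManager only registers modules that define _is
    return bool(_pattern.match(val))

_test_values = {
    True: ["AB1234", "ZZ0001"],   # values expected to pass, used by the test suite
    False: ["ab1234", "12345"],   # values expected to fail
}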