csv-detective 0.6.7__py3-none-any.whl → 0.9.3.dev2438__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228)
  1. csv_detective/__init__.py +7 -1
  2. csv_detective/cli.py +33 -21
  3. csv_detective/{detect_fields/FR → detection}/__init__.py +0 -0
  4. csv_detective/detection/columns.py +89 -0
  5. csv_detective/detection/encoding.py +29 -0
  6. csv_detective/detection/engine.py +46 -0
  7. csv_detective/detection/formats.py +156 -0
  8. csv_detective/detection/headers.py +28 -0
  9. csv_detective/detection/rows.py +18 -0
  10. csv_detective/detection/separator.py +44 -0
  11. csv_detective/detection/variables.py +97 -0
  12. csv_detective/explore_csv.py +151 -377
  13. csv_detective/format.py +67 -0
  14. csv_detective/formats/__init__.py +9 -0
  15. csv_detective/formats/adresse.py +116 -0
  16. csv_detective/formats/binary.py +26 -0
  17. csv_detective/formats/booleen.py +35 -0
  18. csv_detective/formats/code_commune_insee.py +26 -0
  19. csv_detective/formats/code_csp_insee.py +36 -0
  20. csv_detective/formats/code_departement.py +29 -0
  21. csv_detective/formats/code_fantoir.py +21 -0
  22. csv_detective/formats/code_import.py +17 -0
  23. csv_detective/formats/code_postal.py +25 -0
  24. csv_detective/formats/code_region.py +22 -0
  25. csv_detective/formats/code_rna.py +29 -0
  26. csv_detective/formats/code_waldec.py +17 -0
  27. csv_detective/formats/commune.py +27 -0
  28. csv_detective/formats/csp_insee.py +31 -0
  29. csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
  30. csv_detective/formats/date.py +99 -0
  31. csv_detective/formats/date_fr.py +22 -0
  32. csv_detective/formats/datetime_aware.py +45 -0
  33. csv_detective/formats/datetime_naive.py +48 -0
  34. csv_detective/formats/datetime_rfc822.py +24 -0
  35. csv_detective/formats/departement.py +37 -0
  36. csv_detective/formats/email.py +28 -0
  37. csv_detective/formats/float.py +29 -0
  38. csv_detective/formats/geojson.py +36 -0
  39. csv_detective/formats/insee_ape700.py +31 -0
  40. csv_detective/formats/insee_canton.py +28 -0
  41. csv_detective/formats/int.py +23 -0
  42. csv_detective/formats/iso_country_code_alpha2.py +30 -0
  43. csv_detective/formats/iso_country_code_alpha3.py +30 -0
  44. csv_detective/formats/iso_country_code_numeric.py +31 -0
  45. csv_detective/formats/jour_de_la_semaine.py +41 -0
  46. csv_detective/formats/json.py +20 -0
  47. csv_detective/formats/latitude_l93.py +48 -0
  48. csv_detective/formats/latitude_wgs.py +42 -0
  49. csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
  50. csv_detective/formats/latlon_wgs.py +53 -0
  51. csv_detective/formats/longitude_l93.py +39 -0
  52. csv_detective/formats/longitude_wgs.py +32 -0
  53. csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
  54. csv_detective/formats/lonlat_wgs.py +36 -0
  55. csv_detective/formats/mois_de_lannee.py +48 -0
  56. csv_detective/formats/money.py +18 -0
  57. csv_detective/formats/mongo_object_id.py +14 -0
  58. csv_detective/formats/pays.py +35 -0
  59. csv_detective/formats/percent.py +16 -0
  60. csv_detective/formats/region.py +70 -0
  61. csv_detective/formats/sexe.py +17 -0
  62. csv_detective/formats/siren.py +37 -0
  63. csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -29
  64. csv_detective/formats/tel_fr.py +36 -0
  65. csv_detective/formats/uai.py +36 -0
  66. csv_detective/formats/url.py +46 -0
  67. csv_detective/formats/username.py +14 -0
  68. csv_detective/formats/uuid.py +16 -0
  69. csv_detective/formats/year.py +28 -0
  70. csv_detective/output/__init__.py +65 -0
  71. csv_detective/output/dataframe.py +96 -0
  72. csv_detective/output/example.py +250 -0
  73. csv_detective/output/profile.py +119 -0
  74. csv_detective/{schema_generation.py → output/schema.py} +268 -343
  75. csv_detective/output/utils.py +74 -0
  76. csv_detective/{detect_fields/FR/geo → parsing}/__init__.py +0 -0
  77. csv_detective/parsing/columns.py +235 -0
  78. csv_detective/parsing/compression.py +11 -0
  79. csv_detective/parsing/csv.py +56 -0
  80. csv_detective/parsing/excel.py +167 -0
  81. csv_detective/parsing/load.py +111 -0
  82. csv_detective/parsing/text.py +56 -0
  83. csv_detective/utils.py +23 -196
  84. csv_detective/validate.py +138 -0
  85. csv_detective-0.9.3.dev2438.dist-info/METADATA +267 -0
  86. csv_detective-0.9.3.dev2438.dist-info/RECORD +92 -0
  87. csv_detective-0.9.3.dev2438.dist-info/WHEEL +4 -0
  88. {csv_detective-0.6.7.dist-info → csv_detective-0.9.3.dev2438.dist-info}/entry_points.txt +1 -0
  89. csv_detective/all_packages.txt +0 -104
  90. csv_detective/detect_fields/FR/geo/adresse/__init__.py +0 -100
  91. csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -24
  92. csv_detective/detect_fields/FR/geo/code_commune_insee/code_commune_insee.txt +0 -37600
  93. csv_detective/detect_fields/FR/geo/code_departement/__init__.py +0 -11
  94. csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -15
  95. csv_detective/detect_fields/FR/geo/code_fantoir/code_fantoir.txt +0 -26122
  96. csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -19
  97. csv_detective/detect_fields/FR/geo/code_postal/code_postal.txt +0 -36822
  98. csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -27
  99. csv_detective/detect_fields/FR/geo/commune/__init__.py +0 -21
  100. csv_detective/detect_fields/FR/geo/commune/commune.txt +0 -36745
  101. csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -19
  102. csv_detective/detect_fields/FR/geo/departement/departement.txt +0 -101
  103. csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +0 -20
  104. csv_detective/detect_fields/FR/geo/insee_canton/canton2017.txt +0 -2055
  105. csv_detective/detect_fields/FR/geo/insee_canton/cantons.txt +0 -2055
  106. csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -13
  107. csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
  108. csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -13
  109. csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
  110. csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -17
  111. csv_detective/detect_fields/FR/geo/pays/pays.txt +0 -248
  112. csv_detective/detect_fields/FR/geo/region/__init__.py +0 -16
  113. csv_detective/detect_fields/FR/geo/region/region.txt +0 -44
  114. csv_detective/detect_fields/FR/other/__init__.py +0 -0
  115. csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +0 -26
  116. csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
  117. csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -8
  118. csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -12
  119. csv_detective/detect_fields/FR/other/csp_insee/__init__.py +0 -16
  120. csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
  121. csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +0 -16
  122. csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -9
  123. csv_detective/detect_fields/FR/other/siren/__init__.py +0 -18
  124. csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -15
  125. csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
  126. csv_detective/detect_fields/FR/temp/__init__.py +0 -0
  127. csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +0 -23
  128. csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +0 -37
  129. csv_detective/detect_fields/__init__.py +0 -57
  130. csv_detective/detect_fields/geo/__init__.py +0 -0
  131. csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
  132. csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
  133. csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
  134. csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -22
  135. csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
  136. csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -15
  137. csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
  138. csv_detective/detect_fields/other/__init__.py +0 -0
  139. csv_detective/detect_fields/other/booleen/__init__.py +0 -21
  140. csv_detective/detect_fields/other/email/__init__.py +0 -8
  141. csv_detective/detect_fields/other/float/__init__.py +0 -17
  142. csv_detective/detect_fields/other/int/__init__.py +0 -12
  143. csv_detective/detect_fields/other/json/__init__.py +0 -24
  144. csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
  145. csv_detective/detect_fields/other/twitter/__init__.py +0 -8
  146. csv_detective/detect_fields/other/url/__init__.py +0 -11
  147. csv_detective/detect_fields/other/uuid/__init__.py +0 -11
  148. csv_detective/detect_fields/temp/__init__.py +0 -0
  149. csv_detective/detect_fields/temp/date/__init__.py +0 -62
  150. csv_detective/detect_fields/temp/datetime_iso/__init__.py +0 -18
  151. csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +0 -21
  152. csv_detective/detect_fields/temp/year/__init__.py +0 -10
  153. csv_detective/detect_labels/FR/__init__.py +0 -0
  154. csv_detective/detect_labels/FR/geo/__init__.py +0 -0
  155. csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -40
  156. csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -42
  157. csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -33
  158. csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -33
  159. csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -41
  160. csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -33
  161. csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -33
  162. csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -47
  163. csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -33
  164. csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -54
  165. csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -55
  166. csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -44
  167. csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -45
  168. csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -45
  169. csv_detective/detect_labels/FR/geo/region/__init__.py +0 -45
  170. csv_detective/detect_labels/FR/other/__init__.py +0 -0
  171. csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -33
  172. csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -38
  173. csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -33
  174. csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -37
  175. csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -33
  176. csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -40
  177. csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -33
  178. csv_detective/detect_labels/FR/other/siren/__init__.py +0 -41
  179. csv_detective/detect_labels/FR/other/siret/__init__.py +0 -40
  180. csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -45
  181. csv_detective/detect_labels/FR/other/uai/__init__.py +0 -50
  182. csv_detective/detect_labels/FR/temp/__init__.py +0 -0
  183. csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -41
  184. csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -33
  185. csv_detective/detect_labels/__init__.py +0 -43
  186. csv_detective/detect_labels/geo/__init__.py +0 -0
  187. csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -41
  188. csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -41
  189. csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -41
  190. csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -42
  191. csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -55
  192. csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -67
  193. csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -45
  194. csv_detective/detect_labels/other/__init__.py +0 -0
  195. csv_detective/detect_labels/other/booleen/__init__.py +0 -34
  196. csv_detective/detect_labels/other/email/__init__.py +0 -45
  197. csv_detective/detect_labels/other/float/__init__.py +0 -33
  198. csv_detective/detect_labels/other/int/__init__.py +0 -33
  199. csv_detective/detect_labels/other/money/__init__.py +0 -11
  200. csv_detective/detect_labels/other/money/check_col_name.py +0 -8
  201. csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -33
  202. csv_detective/detect_labels/other/twitter/__init__.py +0 -33
  203. csv_detective/detect_labels/other/url/__init__.py +0 -48
  204. csv_detective/detect_labels/other/uuid/__init__.py +0 -33
  205. csv_detective/detect_labels/temp/__init__.py +0 -0
  206. csv_detective/detect_labels/temp/date/__init__.py +0 -51
  207. csv_detective/detect_labels/temp/datetime_iso/__init__.py +0 -45
  208. csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -44
  209. csv_detective/detect_labels/temp/year/__init__.py +0 -44
  210. csv_detective/detection.py +0 -361
  211. csv_detective/process_text.py +0 -39
  212. csv_detective/s3_utils.py +0 -48
  213. csv_detective-0.6.7.data/data/share/csv_detective/CHANGELOG.md +0 -118
  214. csv_detective-0.6.7.data/data/share/csv_detective/LICENSE.AGPL.txt +0 -661
  215. csv_detective-0.6.7.data/data/share/csv_detective/README.md +0 -247
  216. csv_detective-0.6.7.dist-info/LICENSE.AGPL.txt +0 -661
  217. csv_detective-0.6.7.dist-info/METADATA +0 -23
  218. csv_detective-0.6.7.dist-info/RECORD +0 -150
  219. csv_detective-0.6.7.dist-info/WHEEL +0 -5
  220. csv_detective-0.6.7.dist-info/top_level.txt +0 -2
  221. tests/__init__.py +0 -0
  222. tests/test_fields.py +0 -360
  223. tests/test_file.py +0 -116
  224. tests/test_labels.py +0 -7
  225. /csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
  226. /csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
  227. /csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
  228. /csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0
@@ -0,0 +1,96 @@
+ import json
+ from datetime import date, datetime
+ from time import time
+ from typing import Iterator
+
+ import pandas as pd
+
+ from csv_detective.formats.binary import binary_casting
+ from csv_detective.formats.booleen import bool_casting
+ from csv_detective.formats.date import date_casting
+ from csv_detective.formats.float import float_casting
+ from csv_detective.parsing.csv import CHUNK_SIZE
+ from csv_detective.utils import display_logs_depending_process_time
+
+
+ def cast(value: str, _type: str) -> str | float | bool | date | datetime | bytes | None:
+     if not isinstance(value, str) or not value:
+         # None is the current default value in hydra, should we keep this?
+         return None
+     match _type:
+         case "float":
+             return float_casting(value)
+         case "bool":
+             return bool_casting(value)
+         case "json":
+             # in hydra json are given to postgres as strings, conversion is done by postgres
+             return json.loads(value)
+         case "date":
+             _date = date_casting(value)
+             return _date.date() if _date else None
+         case "datetime":
+             return date_casting(value)
+         case "binary":
+             return binary_casting(value)
+         case _:
+             raise ValueError(f"Unknown type `{_type}`")
+
+
+ def cast_df(
+     df: pd.DataFrame, columns: dict, cast_json: bool = True, verbose: bool = False
+ ) -> pd.DataFrame:
+     # for efficiency this modifies the dataframe in place as we don't need it anymore afterwards
+     if verbose:
+         start = time()
+     for col_name, detection in columns.items():
+         if detection["python_type"] == "string" or (
+             detection["python_type"] == "json" and not cast_json
+         ):
+             # no change if detected type is string
+             continue
+         elif detection["python_type"] == "int":
+             # to allow having ints and NaN in the same column
+             df[col_name] = df[col_name].astype(pd.Int64Dtype())
+         else:
+             df[col_name] = df[col_name].apply(lambda col: cast(col, _type=detection["python_type"]))
+     if verbose:
+         display_logs_depending_process_time(
+             f"Casting columns completed in {round(time() - start, 3)}s",
+             time() - start,
+         )
+     return df
+
+
+ def cast_df_chunks(
+     df: pd.DataFrame,
+     analysis: dict,
+     file_path: str,
+     cast_json: bool = True,
+     verbose: bool = False,
+ ) -> Iterator[pd.DataFrame]:
+     if analysis.get("engine") or analysis["total_lines"] <= CHUNK_SIZE:
+         # the file is loaded in one chunk, so returning the cast df
+         yield cast_df(
+             df=df,
+             columns=analysis["columns"],
+             cast_json=cast_json,
+             verbose=verbose,
+         )
+     else:
+         # loading the csv in chunks using the analysis
+         chunks = pd.read_csv(
+             file_path,
+             dtype=str,
+             sep=analysis["separator"],
+             encoding=analysis["encoding"],
+             skiprows=analysis["header_row_idx"],
+             compression=analysis.get("compression"),
+             chunksize=CHUNK_SIZE,
+         )
+         for chunk in chunks:
+             yield cast_df(
+                 df=chunk,
+                 columns=analysis["columns"],
+                 cast_json=cast_json,
+                 verbose=verbose,
+             )
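
The 96-line hunk above appears to correspond to csv_detective/output/dataframe.py (item 71 in the file list) and introduces cast, cast_df and cast_df_chunks. A minimal usage sketch, assuming that import path and a hand-written columns mapping shaped like the detection output; the example values are mine, not from the package docs, and the exact string forms each caster accepts depend on the formats modules not shown here:

    import pandas as pd

    from csv_detective.output.dataframe import cast_df  # path assumed from the file list above

    # strings as they would come out of a CSV read with dtype=str
    df = pd.DataFrame({"count": ["1", "2", "3"], "ratio": ["0.5", "1.5", "2.0"]})
    columns = {
        "count": {"python_type": "int"},    # cast via pd.Int64Dtype()
        "ratio": {"python_type": "float"},  # cast via float_casting
    }
    typed = cast_df(df, columns=columns)
    print(typed.dtypes)

cast_df_chunks takes the full analysis dict instead (separator, encoding, header_row_idx, compression, total_lines, columns) and re-reads the file in CHUNK_SIZE chunks when it was not loaded in one go.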
@@ -0,0 +1,250 @@
+ import json
+ import random
+ import string
+ import uuid
+ from datetime import datetime
+ from typing import Any, Type
+
+ import pandas as pd
+ import requests
+ import rstr
+ from faker import Faker
+
+ from csv_detective.utils import is_url
+
+ fake = Faker()
+
+
+ def create_example_csv_file(
+     fields: dict | None = None,
+     schema_path: str | None = None,
+     file_length: int = 10,
+     output_name: str | None = "example_file.csv",
+     output_sep: str = ";",
+     encoding: str = "utf-8",
+     ignore_required: bool = False,
+ ) -> pd.DataFrame:
+     """
+     Create an example file based on a list of dicts like follows:
+     fields = [
+         {
+             "name": "column_name",
+             "type": "column_type",
+             "args": {dict_of_args} # optional
+         },
+         ...
+     ]
+     Or from a TableSchema
+     """
+     # need to make a CLI command
+
+     if not (fields or schema_path):
+         raise ValueError("At least fields or schema_path must be specified.")
+
+     def potential_skip(required: bool) -> bool:
+         if ignore_required:
+             return False
+         if not required:
+             # for now 30% chance to have an optional value, this could go as an argument
+             return random.randint(1, 100) <= 30
+
+     def _string(
+         length: int = 10,
+         required: bool = True,
+         pattern: str | None = None,
+         enum: str | None = None,
+     ) -> str:
+         if potential_skip(required):
+             return ""
+         if pattern is not None:
+             return rstr.xeger(pattern)
+         elif enum is not None:
+             return random.choice(enum)
+         else:
+             letters = string.ascii_lowercase
+             return "".join(random.choice(letters) for i in range(length))
+
+     def _id(
+         required: bool = True,
+     ) -> str:
+         if potential_skip(required):
+             return ""
+         return str(uuid.uuid4())
+
+     def _date(
+         date_range: list[str] | None = None,
+         format: str = "%Y-%m-%d",
+         required: bool = True,
+     ) -> str:
+         # the bounds specified in date_range are expected in the same format as the desired output format
+         assert all([k in format for k in ["%d", "%m", "%Y"]])
+         if potential_skip(required):
+             return ""
+         if date_range is None:
+             return fake.date(format)
+         else:
+             if len(date_range) != 2:
+                 raise ValueError("'date_range' must have exactly two elements.")
+             return fake.date_between_dates(
+                 datetime.strptime(date_range[0], format),
+                 datetime.strptime(date_range[1], format),
+             ).strftime(format)
+
+     def _time(
+         format: str = "%H:%M:%S",
+         required: bool = True,
+     ) -> str:
+         assert all([k in format for k in ["%H", "%M", "%S"]])
+         if potential_skip(required):
+             return ""
+         # maybe add a time_range argument?
+         return fake.time(format)
+
+     def _datetime(
+         datetime_range: list[str] | None = None,
+         format: str = "%Y-%m-%d %H-%M-%S",
+         required: bool = True,
+     ) -> str:
+         # the bounds specified in datetime_range are expected in the same format as the desired output format
+         assert all([k in format for k in ["%d", "%m", "%Y", "%H", "%M", "%S"]])
+         if potential_skip(required):
+             return ""
+         if datetime_range is None:
+             return fake.date_time().strftime(format)
+         else:
+             if len(datetime_range) != 2:
+                 raise ValueError("'date_range' must have exactly two elements.")
+             return fake.date_time_between(
+                 datetime.strptime(datetime_range[0], format),
+                 datetime.strptime(datetime_range[1], format),
+             ).strftime(format)
+
+     def _url(required: bool = True) -> str:
+         if potential_skip(required):
+             return ""
+         return f"http://{rstr.domainsafe()}.{rstr.letters(3)}/{rstr.urlsafe()}"
+
+     def _number(
+         num_type: Type[int | float] = int,
+         num_range: list[float] | None = None,
+         enum: list | None = None,
+         required: bool = True,
+     ) -> int | float:
+         assert num_range is None or len(num_range) == 2
+         if potential_skip(required):
+             return ""
+         if enum:
+             return random.choice(enum)
+         if num_range is None:
+             num_range = [0, 1000]
+         if num_type is int:
+             return random.randint(num_range[0], num_range[1])
+         else:
+             return round(random.uniform(num_range[0], num_range[1]), 1)
+
+     def _bool(required: bool = True) -> bool:
+         if potential_skip(required):
+             return ""
+         return random.randint(0, 1) == 0
+
+     def _array(enum: list[Any], required: bool = True) -> str:
+         if potential_skip(required):
+             return ""
+         return f"[{','.join(random.sample(enum, random.randint(1, len(enum))))}]"
+
+     def build_args_from_constraints(constraints: dict) -> dict:
+         args = {}
+         args["required"] = constraints.get("required", False)
+         for _ in ["pattern", "enum", "format"]:
+             if _ in constraints:
+                 args[_] = constraints[_]
+         if "minimum" in constraints and "maximum" in constraints:
+             args["num_range"] = [constraints["minimum"], constraints["maximum"]]
+         # maybe there are better values than these?
+         elif "minimum" in constraints:
+             args["num_range"] = [constraints["minimum"], 10 + constraints["minimum"]]
+         elif "maximum" in constraints:
+             args["num_range"] = [constraints["maximum"] - 10, constraints["maximum"]]
+         if "minLength" in constraints:
+             args["length"] = constraints["minLength"]
+         if "maxLength" in constraints:
+             args["length"] = constraints["maxLength"]
+         return args
+
+     schema_types_to_python = {
+         "number": "float",
+         "integer": "int",
+         "string": "str",
+         "year": "year",
+         "boolean": "bool",
+         "date": "date",
+         "yearmonth": "date",
+         "time": "time",
+         "datetime": "datetime",
+         "array": "array",
+     }
+
+     if schema_path:
+         if is_url(schema_path):
+             schema = requests.get(schema_path).json()
+         else:
+             with open(schema_path, encoding=encoding) as jsonfile:
+                 schema = json.load(jsonfile)
+         if "fields" not in schema.keys():
+             raise ValueError("The schema must have a 'fields' key.")
+         else:
+             fields = [
+                 {
+                     "name": f["name"],
+                     "type": schema_types_to_python.get(f["type"], "str"),
+                     # when frformat is supported in TableSchema, we can build args for French standards
+                     # linked to https://github.com/datagouv/fr-format/issues/26
+                     "args": (
+                         build_args_from_constraints(f["constraints"])
+                         if "constraints" in f.keys()
+                         else build_args_from_constraints(f["arrayItem"]["constraints"])
+                         if "arrayItem" in f.keys() and "constraints" in f["arrayItem"].keys()
+                         else {}
+                     ),
+                 }
+                 for f in schema["fields"]
+             ]
+
+     for k in range(len(fields)):
+         if "args" not in fields[k]:
+             fields[k]["args"] = {}
+         if fields[k]["type"] == "float":
+             fields[k]["args"]["num_type"] = float
+         elif fields[k]["type"] == "int":
+             fields[k]["args"]["num_type"] = int
+         elif fields[k]["type"] == "year":
+             fields[k]["args"]["num_type"] = int
+             fields[k]["args"]["num_range"] = [1990, 2050]
+
+     types_to_func = {
+         "int": _number,
+         "float": _number,
+         "date": _date,
+         "time": _time,
+         "str": _string,
+         "url": _url,
+         "id": _id,
+         "year": _number,
+         "bool": _bool,
+         "datetime": _datetime,
+         "array": _array,
+     }
+
+     # would it be better to create by column or by row (as for now)?
+     output = pd.DataFrame(
+         [
+             [types_to_func.get(f["type"], "str")(**f["args"]) for f in fields]
+             for _ in range(file_length)
+         ],
+         columns=[f["name"] for f in fields],
+     )
+
+     if output_name:
+         output.to_csv(output_name, sep=output_sep, index=False)
+
+     return output
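
This 250-line hunk appears to correspond to csv_detective/output/example.py (item 72 in the file list) and adds create_example_csv_file, which builds a fake CSV either from an inline fields list or from a TableSchema. A hedged sketch of the inline form, with example values of my own rather than anything from the package docs:

    from csv_detective.output.example import create_example_csv_file  # path assumed from the file list above

    fields = [
        {"name": "id", "type": "id"},
        {"name": "amount", "type": "float", "args": {"num_range": [0, 100]}},
        {"name": "created", "type": "date"},
    ]
    # output_name=None skips writing example_file.csv and only returns the DataFrame
    df = create_example_csv_file(fields=fields, file_length=5, output_name=None)
    print(df)

Fields without an explicit "args" entry get an empty one, and the "float", "int" and "year" types have num_type (plus a default 1990-2050 range for years) filled in before generation.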
@@ -0,0 +1,119 @@
+ import logging
+ from collections import defaultdict
+ from time import time
+
+ import numpy as np
+ import pandas as pd
+
+ from csv_detective.formats.float import float_casting
+ from csv_detective.utils import cast_prevent_nan, display_logs_depending_process_time
+
+
+ def create_profile(
+     table: pd.DataFrame,
+     columns: dict,
+     num_rows: int,
+     limited_output: bool = True,
+     cast_json: bool = True,
+     verbose: bool = False,
+     _col_values: dict[str, pd.Series] | None = None,
+ ) -> dict:
+     if verbose:
+         start = time()
+         logging.info("Creating profile")
+
+     if num_rows > 0:
+         raise ValueError("To create profiles num_rows has to be set to -1")
+     if not limited_output:
+         columns = {
+             k: v[0] if v else {"python_type": "string", "format": "string", "score": 1.0}
+             for k, v in columns.items()
+         }
+     # value_counts().reset_index() tries to insert a "count" column, and fails if it's already here
+     _count_col = "count"
+     while _count_col in table.columns:
+         _count_col = "_" + _count_col
+     profile = defaultdict(dict)
+     for c in table.columns:
+         # for numerical formats we want min, max, mean, std
+         if columns[c]["python_type"] in ["float", "int"]:
+             # if we have read the file in chunks we already have what we need
+             if _col_values is None:
+                 # we locally cast the column to perform the operations,
+                 # using the same method as in cast_df
+                 cast_col = (
+                     table[c].astype(pd.Int64Dtype())
+                     if columns[c]["python_type"] == "int"
+                     else table[c].apply(lambda x: float_casting(x) if isinstance(x, str) else pd.NA)
+                 )
+                 stats = {
+                     "min": cast_prevent_nan(cast_col.min(), columns[c]["python_type"]),
+                     "mean": cast_prevent_nan(cast_col.mean(), columns[c]["python_type"]),
+                     "max": cast_prevent_nan(cast_col.max(), columns[c]["python_type"]),
+                     "std": cast_prevent_nan(cast_col.std(), columns[c]["python_type"]),
+                 }
+             else:
+                 cast_col = _col_values[c].reset_index()
+                 cast_col = cast_col.loc[cast_col[c].notna()]
+                 cast_col[c] = (
+                     cast_col[c].astype(pd.Int64Dtype())
+                     if columns[c]["python_type"] == "int"
+                     else cast_col[c].apply(
+                         lambda x: float_casting(x) if isinstance(x, str) else pd.NA
+                     )
+                 )
+                 stats = {
+                     "min": cast_prevent_nan(cast_col[c].min(), columns[c]["python_type"]),
+                     "mean": cast_prevent_nan(
+                         (cast_col[c] * cast_col["count"]).sum() / sum(cast_col["count"]),
+                         columns[c]["python_type"],
+                     ),
+                     "max": cast_prevent_nan(cast_col[c].max(), columns[c]["python_type"]),
+                 }
+                 stats["std"] = cast_prevent_nan(
+                     np.sqrt(
+                         sum(cast_col["count"] * (cast_col[c] - stats["mean"]) ** 2)
+                         / sum(cast_col["count"])
+                     ),
+                     columns[c]["python_type"],
+                 )
+             profile[c].update(**stats)
+             del cast_col
+         # for all formats we want most frequent values, nb unique values and nb missing values
+         tops_bruts = (
+             (table[c].value_counts() if _col_values is None else _col_values[c].sort_values())
+             .reset_index(name=_count_col)
+             .iloc[:10]
+             .to_dict(orient="records")
+         )
+         profile[c].update(
+             tops=[
+                 {
+                     "count": tb[_count_col],
+                     "value": tb[c],
+                 }
+                 for tb in tops_bruts
+             ],
+             nb_distinct=(
+                 (
+                     table[c].nunique()
+                     if columns[c]["python_type"] != "json" or not cast_json
+                     # a column containing cast json is not serializable
+                     else table[c].astype(str).nunique()
+                 )
+                 if _col_values is None
+                 else len(_col_values)
+             ),
+             nb_missing_values=(
+                 len(table[c].loc[table[c].isna()])
+                 if _col_values is None
+                 else (_col_values[c].loc[pd.NA] if pd.NA in _col_values[c].index else 0)
+             ),
+         )
+     if verbose:
+         display_logs_depending_process_time(
+             f"Created profile in {round(time() - start, 3)}s",
+             time() - start,
+         )
+     del _col_values
+     return profile
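
The last hunk (119 lines, apparently csv_detective/output/profile.py, item 73 in the file list) adds create_profile, which computes min/mean/max/std for numeric columns plus top values, distinct counts and missing counts for every column. A small sketch under assumed inputs: a detection dict shaped like {"col": {"python_type": ...}} and columns still holding strings, as after a dtype=str read; this is my own example, not from the package docs:

    import pandas as pd

    from csv_detective.output.profile import create_profile  # path assumed from the file list above

    table = pd.DataFrame({"age": ["12", "30", "25"], "city": ["Paris", "Lyon", "Paris"]})
    columns = {
        "age": {"python_type": "int"},
        "city": {"python_type": "string"},
    }
    # num_rows must be -1: the profile is only meaningful on the full table
    profile = create_profile(table, columns=columns, num_rows=-1)
    # profile["age"]  -> min/mean/max/std plus tops, nb_distinct, nb_missing_values
    # profile["city"] -> tops, nb_distinct, nb_missing_values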