csv-detective 0.6.7__py3-none-any.whl → 0.9.3.dev2438__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228)
  1. csv_detective/__init__.py +7 -1
  2. csv_detective/cli.py +33 -21
  3. csv_detective/{detect_fields/FR → detection}/__init__.py +0 -0
  4. csv_detective/detection/columns.py +89 -0
  5. csv_detective/detection/encoding.py +29 -0
  6. csv_detective/detection/engine.py +46 -0
  7. csv_detective/detection/formats.py +156 -0
  8. csv_detective/detection/headers.py +28 -0
  9. csv_detective/detection/rows.py +18 -0
  10. csv_detective/detection/separator.py +44 -0
  11. csv_detective/detection/variables.py +97 -0
  12. csv_detective/explore_csv.py +151 -377
  13. csv_detective/format.py +67 -0
  14. csv_detective/formats/__init__.py +9 -0
  15. csv_detective/formats/adresse.py +116 -0
  16. csv_detective/formats/binary.py +26 -0
  17. csv_detective/formats/booleen.py +35 -0
  18. csv_detective/formats/code_commune_insee.py +26 -0
  19. csv_detective/formats/code_csp_insee.py +36 -0
  20. csv_detective/formats/code_departement.py +29 -0
  21. csv_detective/formats/code_fantoir.py +21 -0
  22. csv_detective/formats/code_import.py +17 -0
  23. csv_detective/formats/code_postal.py +25 -0
  24. csv_detective/formats/code_region.py +22 -0
  25. csv_detective/formats/code_rna.py +29 -0
  26. csv_detective/formats/code_waldec.py +17 -0
  27. csv_detective/formats/commune.py +27 -0
  28. csv_detective/formats/csp_insee.py +31 -0
  29. csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
  30. csv_detective/formats/date.py +99 -0
  31. csv_detective/formats/date_fr.py +22 -0
  32. csv_detective/formats/datetime_aware.py +45 -0
  33. csv_detective/formats/datetime_naive.py +48 -0
  34. csv_detective/formats/datetime_rfc822.py +24 -0
  35. csv_detective/formats/departement.py +37 -0
  36. csv_detective/formats/email.py +28 -0
  37. csv_detective/formats/float.py +29 -0
  38. csv_detective/formats/geojson.py +36 -0
  39. csv_detective/formats/insee_ape700.py +31 -0
  40. csv_detective/formats/insee_canton.py +28 -0
  41. csv_detective/formats/int.py +23 -0
  42. csv_detective/formats/iso_country_code_alpha2.py +30 -0
  43. csv_detective/formats/iso_country_code_alpha3.py +30 -0
  44. csv_detective/formats/iso_country_code_numeric.py +31 -0
  45. csv_detective/formats/jour_de_la_semaine.py +41 -0
  46. csv_detective/formats/json.py +20 -0
  47. csv_detective/formats/latitude_l93.py +48 -0
  48. csv_detective/formats/latitude_wgs.py +42 -0
  49. csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
  50. csv_detective/formats/latlon_wgs.py +53 -0
  51. csv_detective/formats/longitude_l93.py +39 -0
  52. csv_detective/formats/longitude_wgs.py +32 -0
  53. csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
  54. csv_detective/formats/lonlat_wgs.py +36 -0
  55. csv_detective/formats/mois_de_lannee.py +48 -0
  56. csv_detective/formats/money.py +18 -0
  57. csv_detective/formats/mongo_object_id.py +14 -0
  58. csv_detective/formats/pays.py +35 -0
  59. csv_detective/formats/percent.py +16 -0
  60. csv_detective/formats/region.py +70 -0
  61. csv_detective/formats/sexe.py +17 -0
  62. csv_detective/formats/siren.py +37 -0
  63. csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -29
  64. csv_detective/formats/tel_fr.py +36 -0
  65. csv_detective/formats/uai.py +36 -0
  66. csv_detective/formats/url.py +46 -0
  67. csv_detective/formats/username.py +14 -0
  68. csv_detective/formats/uuid.py +16 -0
  69. csv_detective/formats/year.py +28 -0
  70. csv_detective/output/__init__.py +65 -0
  71. csv_detective/output/dataframe.py +96 -0
  72. csv_detective/output/example.py +250 -0
  73. csv_detective/output/profile.py +119 -0
  74. csv_detective/{schema_generation.py → output/schema.py} +268 -343
  75. csv_detective/output/utils.py +74 -0
  76. csv_detective/{detect_fields/FR/geo → parsing}/__init__.py +0 -0
  77. csv_detective/parsing/columns.py +235 -0
  78. csv_detective/parsing/compression.py +11 -0
  79. csv_detective/parsing/csv.py +56 -0
  80. csv_detective/parsing/excel.py +167 -0
  81. csv_detective/parsing/load.py +111 -0
  82. csv_detective/parsing/text.py +56 -0
  83. csv_detective/utils.py +23 -196
  84. csv_detective/validate.py +138 -0
  85. csv_detective-0.9.3.dev2438.dist-info/METADATA +267 -0
  86. csv_detective-0.9.3.dev2438.dist-info/RECORD +92 -0
  87. csv_detective-0.9.3.dev2438.dist-info/WHEEL +4 -0
  88. {csv_detective-0.6.7.dist-info → csv_detective-0.9.3.dev2438.dist-info}/entry_points.txt +1 -0
  89. csv_detective/all_packages.txt +0 -104
  90. csv_detective/detect_fields/FR/geo/adresse/__init__.py +0 -100
  91. csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -24
  92. csv_detective/detect_fields/FR/geo/code_commune_insee/code_commune_insee.txt +0 -37600
  93. csv_detective/detect_fields/FR/geo/code_departement/__init__.py +0 -11
  94. csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -15
  95. csv_detective/detect_fields/FR/geo/code_fantoir/code_fantoir.txt +0 -26122
  96. csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -19
  97. csv_detective/detect_fields/FR/geo/code_postal/code_postal.txt +0 -36822
  98. csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -27
  99. csv_detective/detect_fields/FR/geo/commune/__init__.py +0 -21
  100. csv_detective/detect_fields/FR/geo/commune/commune.txt +0 -36745
  101. csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -19
  102. csv_detective/detect_fields/FR/geo/departement/departement.txt +0 -101
  103. csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +0 -20
  104. csv_detective/detect_fields/FR/geo/insee_canton/canton2017.txt +0 -2055
  105. csv_detective/detect_fields/FR/geo/insee_canton/cantons.txt +0 -2055
  106. csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -13
  107. csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
  108. csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -13
  109. csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
  110. csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -17
  111. csv_detective/detect_fields/FR/geo/pays/pays.txt +0 -248
  112. csv_detective/detect_fields/FR/geo/region/__init__.py +0 -16
  113. csv_detective/detect_fields/FR/geo/region/region.txt +0 -44
  114. csv_detective/detect_fields/FR/other/__init__.py +0 -0
  115. csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +0 -26
  116. csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
  117. csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -8
  118. csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -12
  119. csv_detective/detect_fields/FR/other/csp_insee/__init__.py +0 -16
  120. csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
  121. csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +0 -16
  122. csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -9
  123. csv_detective/detect_fields/FR/other/siren/__init__.py +0 -18
  124. csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -15
  125. csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
  126. csv_detective/detect_fields/FR/temp/__init__.py +0 -0
  127. csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +0 -23
  128. csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +0 -37
  129. csv_detective/detect_fields/__init__.py +0 -57
  130. csv_detective/detect_fields/geo/__init__.py +0 -0
  131. csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
  132. csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
  133. csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
  134. csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -22
  135. csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
  136. csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -15
  137. csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
  138. csv_detective/detect_fields/other/__init__.py +0 -0
  139. csv_detective/detect_fields/other/booleen/__init__.py +0 -21
  140. csv_detective/detect_fields/other/email/__init__.py +0 -8
  141. csv_detective/detect_fields/other/float/__init__.py +0 -17
  142. csv_detective/detect_fields/other/int/__init__.py +0 -12
  143. csv_detective/detect_fields/other/json/__init__.py +0 -24
  144. csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
  145. csv_detective/detect_fields/other/twitter/__init__.py +0 -8
  146. csv_detective/detect_fields/other/url/__init__.py +0 -11
  147. csv_detective/detect_fields/other/uuid/__init__.py +0 -11
  148. csv_detective/detect_fields/temp/__init__.py +0 -0
  149. csv_detective/detect_fields/temp/date/__init__.py +0 -62
  150. csv_detective/detect_fields/temp/datetime_iso/__init__.py +0 -18
  151. csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +0 -21
  152. csv_detective/detect_fields/temp/year/__init__.py +0 -10
  153. csv_detective/detect_labels/FR/__init__.py +0 -0
  154. csv_detective/detect_labels/FR/geo/__init__.py +0 -0
  155. csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -40
  156. csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -42
  157. csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -33
  158. csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -33
  159. csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -41
  160. csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -33
  161. csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -33
  162. csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -47
  163. csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -33
  164. csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -54
  165. csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -55
  166. csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -44
  167. csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -45
  168. csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -45
  169. csv_detective/detect_labels/FR/geo/region/__init__.py +0 -45
  170. csv_detective/detect_labels/FR/other/__init__.py +0 -0
  171. csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -33
  172. csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -38
  173. csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -33
  174. csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -37
  175. csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -33
  176. csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -40
  177. csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -33
  178. csv_detective/detect_labels/FR/other/siren/__init__.py +0 -41
  179. csv_detective/detect_labels/FR/other/siret/__init__.py +0 -40
  180. csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -45
  181. csv_detective/detect_labels/FR/other/uai/__init__.py +0 -50
  182. csv_detective/detect_labels/FR/temp/__init__.py +0 -0
  183. csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -41
  184. csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -33
  185. csv_detective/detect_labels/__init__.py +0 -43
  186. csv_detective/detect_labels/geo/__init__.py +0 -0
  187. csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -41
  188. csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -41
  189. csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -41
  190. csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -42
  191. csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -55
  192. csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -67
  193. csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -45
  194. csv_detective/detect_labels/other/__init__.py +0 -0
  195. csv_detective/detect_labels/other/booleen/__init__.py +0 -34
  196. csv_detective/detect_labels/other/email/__init__.py +0 -45
  197. csv_detective/detect_labels/other/float/__init__.py +0 -33
  198. csv_detective/detect_labels/other/int/__init__.py +0 -33
  199. csv_detective/detect_labels/other/money/__init__.py +0 -11
  200. csv_detective/detect_labels/other/money/check_col_name.py +0 -8
  201. csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -33
  202. csv_detective/detect_labels/other/twitter/__init__.py +0 -33
  203. csv_detective/detect_labels/other/url/__init__.py +0 -48
  204. csv_detective/detect_labels/other/uuid/__init__.py +0 -33
  205. csv_detective/detect_labels/temp/__init__.py +0 -0
  206. csv_detective/detect_labels/temp/date/__init__.py +0 -51
  207. csv_detective/detect_labels/temp/datetime_iso/__init__.py +0 -45
  208. csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -44
  209. csv_detective/detect_labels/temp/year/__init__.py +0 -44
  210. csv_detective/detection.py +0 -361
  211. csv_detective/process_text.py +0 -39
  212. csv_detective/s3_utils.py +0 -48
  213. csv_detective-0.6.7.data/data/share/csv_detective/CHANGELOG.md +0 -118
  214. csv_detective-0.6.7.data/data/share/csv_detective/LICENSE.AGPL.txt +0 -661
  215. csv_detective-0.6.7.data/data/share/csv_detective/README.md +0 -247
  216. csv_detective-0.6.7.dist-info/LICENSE.AGPL.txt +0 -661
  217. csv_detective-0.6.7.dist-info/METADATA +0 -23
  218. csv_detective-0.6.7.dist-info/RECORD +0 -150
  219. csv_detective-0.6.7.dist-info/WHEEL +0 -5
  220. csv_detective-0.6.7.dist-info/top_level.txt +0 -2
  221. tests/__init__.py +0 -0
  222. tests/test_fields.py +0 -360
  223. tests/test_file.py +0 -116
  224. tests/test_labels.py +0 -7
  225. /csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
  226. /csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
  227. /csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
  228. /csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0
@@ -0,0 +1,267 @@
1
+ Metadata-Version: 2.3
2
+ Name: csv-detective
3
+ Version: 0.9.3.dev2438
4
+ Summary: Detect tabular files column content
5
+ Keywords: CSV,data processing,encoding,guess,parser,tabular
6
+ Author: data.gouv.fr
7
+ Author-email: data.gouv.fr <opendatateam@data.gouv.fr>
8
+ License: MIT
9
+ Requires-Dist: dateparser>=1.2.0,<2
10
+ Requires-Dist: faust-cchardet==2.1.19
11
+ Requires-Dist: pandas>=2.2.0,<3
12
+ Requires-Dist: python-dateutil>=2.8.2,<3
13
+ Requires-Dist: unidecode>=1.3.6,<2
14
+ Requires-Dist: openpyxl>=3.1.5
15
+ Requires-Dist: xlrd>=2.0.1
16
+ Requires-Dist: odfpy>=1.4.1
17
+ Requires-Dist: requests>=2.32.3,<3
18
+ Requires-Dist: python-magic>=0.4.27
19
+ Requires-Dist: frformat==0.4.0
20
+ Requires-Dist: faker>=33.0.0
21
+ Requires-Dist: rstr>=3.2.2
22
+ Requires-Dist: more-itertools>=10.8.0
23
+ Requires-Dist: pytest>=8.3.0 ; extra == 'dev'
24
+ Requires-Dist: responses>=0.25.0 ; extra == 'dev'
25
+ Requires-Dist: ruff>=0.9.3 ; extra == 'dev'
26
+ Requires-Python: >=3.10, <3.15
27
+ Project-URL: Source, https://github.com/datagouv/csv_detective
28
+ Provides-Extra: dev
29
+ Description-Content-Type: text/markdown
30
+
31
+ # CSV Detective
32
+
33
+ This is a package to **automatically detect column content in tabular files**. The script reads either the whole file or only the first few rows and performs various checks (regex, casting, comparison with official lists...) to determine, for each column, whether it matches known content types.
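+
+ Under the hood, each format is a small detection module built around a per-value check. As an illustration, here is a minimal sketch in the spirit of the pre-0.9 `code_commune_insee` detector (whose removal appears at the bottom of this diff): a regex shape check combined with a lookup in the official list. The function name and `known_codes` argument are illustrative, not the package's API:
+
+ ```python
+ import re
+
+ # coherence check covering Corsica (2A/2B) and overseas departments (971-976 except 975)
+ CODE_COMMUNE_REGEX = re.compile(r'^([01345678][0-9]{4}|2[AB1-9][0-9]{3}|9([0-5][0-9]{3}|7[12346][0-9]{2}))$')
+
+
+ def looks_like_code_commune_insee(value: str, known_codes: set[str]) -> bool:
+     """Sketch of a per-value check: right length, right shape, present in the official list."""
+     return len(value) == 5 and bool(CODE_COMMUNE_REGEX.match(value)) and value in known_codes
+ ```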
34
+
35
+ Currently supported file types: csv(.gz), xls, xlsx, ods.
36
+
37
+ You can also pass the URL of a remote file directly (from data.gouv.fr for instance).
38
+
39
+ ## How To?
40
+
41
+ ### Install the package
42
+
43
+ You need to have Python >= 3.10 installed. We recommend using a virtual environment.
44
+
45
+ ```bash
46
+ pip install csv-detective
47
+ ```
48
+
49
+ ### Detect some columns
50
+
51
+ Say you have a tabular file located at `file_path`. This is how you could use `csv_detective`:
52
+
53
+ ```python
54
+ # Import the csv_detective package
55
+ from csv_detective import routine
56
+ import os # for this example only
57
+
58
+ # Replace with your file path
59
+ file_path = os.path.join('.', 'tests', 'code_postaux_v201410.csv')
60
+
61
+ # Open your file and run csv_detective
62
+ inspection_results = routine(
63
+ file_path, # or file URL
64
+ num_rows=-1, # -1 analyzes all lines of the file; set a positive number to analyze only that many rows
65
+ save_results=False, # Default False. If True, saves the output in the same directory as the analyzed file, with the same name and a .json extension
66
+ output_profile=True, # Default False. If True, the returned dict contains a "profile" property with statistics (min, max, mean, tops...) for every column of your file
67
+ output_schema=True, # Default False. If True, the returned dict contains a "schema" property with a basic [tableschema](https://specs.frictionlessdata.io/table-schema/) of your file. This can be used to validate the structure of other csv files expected to match it.
68
+ tags=["fr"], # Default None. If set as a list of strings, only performs checks related to the specified tags (you can see the available tags with FormatsManager().available_tags())
69
+ )
70
+ ```
71
+
72
+ ## So What Do You Get?
73
+
74
+ ### Output
75
+
76
+ The program creates a Python dictionary with the following information:
77
+
78
+ ```
79
+ {
80
+ "encoding": "windows-1252", # Encoding detected
81
+ "separator": ";", # Detected CSV separator
82
+ "header_row_idx": 0 # Index of the header (aka how many lines to skip to get it)
83
+ "headers": ['code commune INSEE', 'nom de la commune', 'code postal', "libellé d'acheminement"], # Header row
84
+ "total_lines": 42, # Number of rows (excluding header)
85
+ "nb_duplicates": 0, # Number of exact duplicates in rows
86
+ "heading_columns": 0, # Number of heading columns
87
+ "trailing_columns": 0, # Number of trailing columns
88
+ "categorical": ['Code commune'] # Columns that contain less than 25 different values (arbitrary threshold)
89
+ "columns": { # Property that conciliate detection from labels and content of a column
90
+ "Code commune": {
91
+ "python_type": "string",
92
+ "format": "code_commune_insee",
93
+ "score": 1.0
94
+ },
95
+ },
96
+ "columns_labels": { # Property that return detection from header columns
97
+ "Code commune": {
98
+ "python_type": "string",
99
+ "format": "code_commune_insee",
100
+ "score": 0.5
101
+ },
102
+ },
103
+ "columns_fields": { # Property that return detection from content columns
104
+ "Code commune": {
105
+ "python_type": "string",
106
+ "format": "code_commune_insee",
107
+ "score": 1.25
108
+ },
109
+ },
110
+ "profile": {
111
+ "column_name" : {
112
+ "min": 1, # only int and float
113
+ "max": 12, # only int and float
114
+ "mean": 5, # only int and float
115
+ "std": 5, # only int and float
116
+ "tops": [ # 10 most frequent values in the column
117
+ "xxx",
118
+ "yyy",
119
+ "..."
120
+ ],
121
+ "nb_distinct": 67, # number of distinct values
122
+ "nb_missing_values": 102 # number of empty cells in the column
123
+ }
124
+ },
125
+ "schema": { # TableSchema of the file if `output_schema` was set to `True`
126
+ "$schema": "https://frictionlessdata.io/schemas/table-schema.json",
127
+ "name": "",
128
+ "title": "",
129
+ "description": "",
130
+ "countryCode": "FR",
131
+ "homepage": "",
132
+ "path": "https://github.com/datagouv/csv-detective",
133
+ "resources": [],
134
+ "sources": [
135
+ {"title": "Spécification Tableschema", "path": "https://specs.frictionlessdata.io/table-schema"},
136
+ {"title": "schema.data.gouv.fr", "path": "https://schema.data.gouv.fr"}
137
+ ],
138
+ "created": "2023-02-10",
139
+ "lastModified": "2023-02-10",
140
+ "version": "0.0.1",
141
+ "contributors": [
142
+ {"title": "Table schema bot", "email": "schema@data.gouv.fr", "organisation": "data.gouv.fr", "role": "author"}
143
+ ],
144
+ "fields": [
145
+ {
146
+ "name": "Code commune",
147
+ "description": "Le code INSEE de la commune",
148
+ "example": "23150",
149
+ "type": "string",
150
+ "formatFR": "code_commune_insee",
151
+ "constraints": {
152
+ "required": False,
153
+ "pattern": "^([013-9]\\d|2[AB1-9])\\d{3}$",
154
+ }
155
+ }
156
+ ]
157
+ }
158
+ }
159
+ ```
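+
+ For instance, the detected format of a column can be read directly from this dictionary (illustrative, reusing `inspection_results` from the example above and the "Code commune" column shown here):
+
+ ```python
+ col = inspection_results["columns"]["Code commune"]
+ print(col["format"], col["python_type"], col["score"])  # code_commune_insee string 1.0
+ ```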
160
+
161
+ The output slightly differs depending on the file format:
162
+ - csv files have `encoding` and `separator` (and `compression` if relevant)
163
+ - xls, xlsx, ods files have `engine` and `sheet_name`
164
+
165
+ You may also set `output_df` to `True`, in which case the output is a tuple of two elements:
166
+ - the analysis (as described above)
167
+ - an iterator of `pd.DataFrame`s whose columns are cast to the detected types (usable with `pd.concat` or in a loop):
168
+ ```python
169
+ inspection, df_chunks = routine(
170
+ file_path=file_path,
171
+ num_rows=-1,
172
+ output_df=True,
173
+ )
174
+ cast_df = pd.concat(df_chunks, ignore_index=True)
175
+ # if "col1" has been detected as a float, then cast_df["col1"] contains floats
176
+ ```
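+
+ For large files, the chunks can also be processed one at a time instead of being concatenated (a sketch; `handle` stands for whatever per-chunk processing you need, it is not part of the package):
+
+ ```python
+ for chunk in df_chunks:
+     handle(chunk)  # e.g. write the typed chunk to a database or an output file
+ ```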
177
+
178
+ ### What Formats Can Be Detected
179
+
180
+ Detected formats include:
181
+ - basic types (float, int, dates, datetimes, JSON) and more specific ones (latitude, longitude, GeoJSON...)
182
+ - Communes, Départements, Régions, Pays
183
+ - Codes Communes, Codes Postaux, Codes Departement, ISO Pays
184
+ - Codes CSP, Description CSP, SIREN
185
+ - E-Mails, URLs, Téléphones FR
186
+ - Years, Dates, Jours de la Semaine FR
187
+ - UUIDs, Mongo ObjectIds
188
+
189
+ ### Validation
190
+ If you have a pre-made analysis of a file, you can check whether another file conforms to the same analysis:
191
+ ```python
192
+ from csv_detective import validate
193
+ is_valid, *_ = validate(
194
+ file_path,
195
+ previous_analysis, # exactly as it came out from the routine function
196
+ )
197
+ ```
198
+
199
+ ### Format detection and scoring
200
+ For each column, three scores are computed for each format; the higher the score, the more likely the format:
201
+ - the field score based on the values contained in the column (0.0 to 1.0).
202
+ - the label score based on the header of the column (0.0 to 1.0).
203
+ - the overall score, computed as `field_score * (1 + label_score/2)` (0.0 to 1.5).
204
+
205
+ The overall score computation aims to give more weight to the column contents while
206
+ still leveraging the column header.
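+
+ In code, the combination is simply the following (a one-line sketch restating the formula above, not the package's internal implementation):
+
+ ```python
+ def overall_score(field_score: float, label_score: float) -> float:
+     # both inputs are in [0.0, 1.0], so the result is in [0.0, 1.5]
+     return field_score * (1 + label_score / 2)
+
+
+ overall_score(1.0, 0.5)  # 1.25: a perfect content match, moderately supported by the header
+ ```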
207
+
208
+ #### `limited_output` - Select the output mode for the JSON report
209
+
210
+ This option selects the output mode of the report. To use it, pass a `limited_output` argument to the `routine` function. It has two possible values:
211
+
212
+ - `limited_output` defaults to `True`, which means the report only contains the detected column formats, based on a pre-selected threshold proportion in the data. This is the standard output (an example can be found above in the 'Output' section).
213
+ Only the format with the highest score is present in the output.
214
+ - `limited_output=False` means the report contains, for each input column, the full list of candidate formats, each with a value corresponding to the proportion of matching values found in the data. With this report, users can apply their own detection threshold and get a better view of detection quality for each column. The results can also easily be turned into a dataframe (format types as columns, column names as rows) for analysis and testing (see the sketch below).
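+
+ A sketch of turning the full report into such a dataframe (this assumes that, with `limited_output=False`, each entry of `columns` lists every candidate format with its score; the exact key layout may differ, so adapt as needed):
+
+ ```python
+ import pandas as pd
+
+ report = routine(file_path, num_rows=-1, limited_output=False)
+ # assumption: each column maps to a list of {"format": ..., "score": ...} candidates
+ scores = pd.DataFrame({
+     col: {cand["format"]: cand["score"] for cand in candidates}
+     for col, candidates in report["columns"].items()
+ }).T
+ # rows: column names, columns: candidate formats; NaN where a format was not scored
+ ```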
215
+
216
+ ## Improvement suggestions
217
+
218
+ - Smarter refactors
219
+ - Performance improvements
220
+ - Test other ways to load and process data (`pandas` alternatives)
221
+ - Add more and more detection modules...
222
+
223
+ Related ideas:
224
+
225
+ - store column names to train a model on them (for a possible pre-screening step)
226
+ - entity resolution (good luck...)
227
+
228
+ ## Why Could This Be of Any Use?
229
+
230
+ Organisations such as [data.gouv.fr](http://data.gouv.fr) aggregate huge amounts of un-normalised data. Cross-examining datasets can be difficult. This tool can help enrich dataset metadata and make it easier to link datasets together.
231
+
232
+ [`udata-hydra`](https://github.com/etalab/udata-hydra) is a crawler that checks, analyzes (using `csv-detective`) and APIfies all tabular files from [data.gouv.fr](http://data.gouv.fr).
233
+
234
+ An early version of this analysis of all resources on data.gouv.fr can be found [here](https://github.com/Leobouloc/data.gouv-exploration).
235
+
236
+ ## Linting
237
+
238
+ Remember to format, lint, and sort imports with [Ruff](https://docs.astral.sh/ruff/) before committing (checks will remind you anyway):
239
+ ```bash
240
+ pip install .[dev]
241
+ ruff check --fix .
242
+ ruff format .
243
+ ```
244
+
245
+ ### 🏷️ Release
246
+
247
+ The release process uses the [`tag_version.sh`](tag_version.sh) script to create git tags and update [CHANGELOG.md](CHANGELOG.md) and [pyproject.toml](pyproject.toml) automatically.
248
+
249
+ **Prerequisites**: [GitHub CLI](https://cli.github.com/) (`gh`) must be installed and authenticated, and you must be on the main branch with a clean working directory.
250
+
251
+ ```bash
252
+ # Create a new release
253
+ ./tag_version.sh <version>
254
+
255
+ # Example
256
+ ./tag_version.sh 2.5.0
257
+
258
+ # Dry run to see what would happen
259
+ ./tag_version.sh 2.5.0 --dry-run
260
+ ```
261
+
262
+ The script automatically:
263
+ - Updates the version in `pyproject.toml`
264
+ - Extracts commits since the last tag and formats them for `CHANGELOG.md`
265
+ - Identifies breaking changes (commits with `!:` in the subject)
266
+ - Creates a git tag and pushes it to the remote repository
267
+ - Creates a GitHub release with the changelog content
@@ -0,0 +1,92 @@
1
+ csv_detective/__init__.py,sha256=zlYElTOp_I2_VG7ZdOTuAu0wuCXSc0cr3sH6gtk2bcg,152
2
+ csv_detective/cli.py,sha256=mu5anmBmaDk52_uZGiA4T37wYZCuV43gZAepjs1Cqzc,1389
3
+ csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvcaGd0Go,2887
5
+ csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
6
+ csv_detective/detection/engine.py,sha256=wQeDKpp2DKF-HcS1R8H6GgQyaUgQme4szPtEHgAjBII,1552
7
+ csv_detective/detection/formats.py,sha256=kQEht5lr9hFhYe0Zn1lfj9jOKaqYrXNrM_tkQX24pEk,5410
8
+ csv_detective/detection/headers.py,sha256=95pTL524Sy5PGxyQ03ofFUaamvlmkxTJQe8u6HfzOkU,1051
9
+ csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
10
+ csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
11
+ csv_detective/detection/variables.py,sha256=-QtZOB96z3pWbqnZ-c1RU3yzoYqcO61A0JzeS6JbkxY,3576
12
+ csv_detective/explore_csv.py,sha256=-LCHr7vyT0Q0oLtXeOO8pEevJ6-8Ib9JP3D7nVgZM8o,7090
13
+ csv_detective/format.py,sha256=XX_cSTQc0jlsQq3GUqHi7Cz36AiRrpjrwPmeoOTLMvo,2396
14
+ csv_detective/formats/__init__.py,sha256=Egiy29kcG3Oz2eE2maYhD3wP29zOSOWyRlOpGD5LGvU,318
15
+ csv_detective/formats/adresse.py,sha256=jALDpEDAWyAcgqEfNVRg_W1r6XaYuJKD_jAaP2l-bxk,1943
16
+ csv_detective/formats/binary.py,sha256=OCGRDh5p27sA4yjrpKIp3b2_PfHJYUe5QxIArf-fCxA,676
17
+ csv_detective/formats/booleen.py,sha256=AnDDKShkSYpWO4POhwY2V7_C4yPWbmqBu8CJPgQ9Gwc,648
18
+ csv_detective/formats/code_commune_insee.py,sha256=MhwCPVAhwWH-MyaNAIVRNbqKfeNe3oiCpzEGfpHkpJY,504
19
+ csv_detective/formats/code_csp_insee.py,sha256=_JQ-YbnHMenNnwIg1xBmNVqgCa1tLD2hbPN1soODhDk,656
20
+ csv_detective/formats/code_departement.py,sha256=odwVbmktgjEhL-dSFHXuCRVwhkF8bL8G7VlpVTnMY2A,628
21
+ csv_detective/formats/code_fantoir.py,sha256=nFVFYJEP2HHE2TyhR_dhGdPCMLfCROBO_B8wxwQn7T8,366
22
+ csv_detective/formats/code_import.py,sha256=N5NVvnHkRwC7ARHoM77R-2cYSeyNmPoRIn6JL3Fbnjs,346
23
+ csv_detective/formats/code_postal.py,sha256=C6XMkiVTxhMFvfyvJmGp3iwvh722EzMwD_UdqQU4aR0,427
24
+ csv_detective/formats/code_region.py,sha256=VFKh1rGYVYTNWBJZ2_m0xS4rhJlrI_Gr8q8RXuZCr-w,366
25
+ csv_detective/formats/code_rna.py,sha256=WExlQtlAUfOFT4N3MKsMBhZVxTdNzgexFjmXhZdRM1w,512
26
+ csv_detective/formats/code_waldec.py,sha256=kJEJfikbhMfVwtA8hBpup0tpeSFoY_rWrEdXQxgNwhg,297
27
+ csv_detective/formats/commune.py,sha256=oVpwINGqpwMOT43KkasozipJ9hBeoQ5FrKV_wIeVJGE,532
28
+ csv_detective/formats/csp_insee.py,sha256=HE6NK6Sw91mLFeAAKwWUXZZfXX6fiA0zK4RI4YdkUFY,656
29
+ csv_detective/formats/data/csp_insee.txt,sha256=kgKaKc-5PHu5U4--ugLjpFyMNtTU9CGdZ9ANU3YAsM4,32879
30
+ csv_detective/formats/data/insee_ape700.txt,sha256=nKgslakENwgE7sPkVNHqR23iXuxF02p9-v5MC2_ntx8,4398
31
+ csv_detective/formats/data/iso_country_code_alpha2.txt,sha256=YyPlDqCdz65ecf4Wes_r0P4rDSJG35niXtjc4MmctXM,1740
32
+ csv_detective/formats/data/iso_country_code_alpha3.txt,sha256=aYqKSohgXuBtcIBfF52f8JWYDdxL_HV_Ol1srGnWBp4,1003
33
+ csv_detective/formats/data/iso_country_code_numeric.txt,sha256=2GtEhuporsHYV-pU4q9kfXU5iOtfW5C0GYBTTKQtnnA,1004
34
+ csv_detective/formats/date.py,sha256=X4ohXaFO8cXPJktUSumc3bfdlbDIWEYTG8S9ugVRcsE,2730
35
+ csv_detective/formats/date_fr.py,sha256=3hTw5RommrhcgECFRSt9KgyB9zyi1j4W3UygEHmRgoE,502
36
+ csv_detective/formats/datetime_aware.py,sha256=-1ZBix6vYlYXTvhXrijP-98AN7iPB0x_DbbwU1QjMCI,1470
37
+ csv_detective/formats/datetime_naive.py,sha256=nvA8qT1fb2RmpXN5_Cw9YZA6pC4BryX_B0V-E6O2UbU,1521
38
+ csv_detective/formats/datetime_rfc822.py,sha256=l-SLb34hSuHxC2JQ-9SD-nG38JqzoozwUZiGtoybb0A,601
39
+ csv_detective/formats/departement.py,sha256=UP9UF23BFq_-mIS8N10K5XkoCXwPmDeSoa_7lCAkI4w,768
40
+ csv_detective/formats/email.py,sha256=Qen2EBDYY5TtWXwxrrTGWRrbIybz0ySlVpl4ZRk8pzA,517
41
+ csv_detective/formats/float.py,sha256=tWs_tW64OuacNQENu3uk5GOEVQMQls2iiteFOacQRAQ,832
42
+ csv_detective/formats/geojson.py,sha256=udbBxCBRmb0o6TD8z5ryemfqdinBz6njNJU0XcbfMig,757
43
+ csv_detective/formats/insee_ape700.py,sha256=cLs3Eersqm4wX6oqsqp0Vb3WGPJb2xY5Za_vh0uLgKc,780
44
+ csv_detective/formats/insee_canton.py,sha256=Q5jczsOmh1wPP2KtDkcmqZ7Hlv50Zz9YvPIbxy46qs0,531
45
+ csv_detective/formats/int.py,sha256=ZBUOn50luMtlNKWPyOaMIkY3J4f4hA0MqwcoFtksozU,482
46
+ csv_detective/formats/iso_country_code_alpha2.py,sha256=vIep_j0xuqlXKyuvk8c8GaJC73HuJqKfQ4QzQKHsPc0,613
47
+ csv_detective/formats/iso_country_code_alpha3.py,sha256=yOmm91O8ot6KoUBfss5cqykDfeeMNCwafDAvPNvbufA,668
48
+ csv_detective/formats/iso_country_code_numeric.py,sha256=989ypOmjIrNTV9vFnrBlbpRWQ9whd3Rv9gNasdF_O4g,685
49
+ csv_detective/formats/jour_de_la_semaine.py,sha256=c5QBw9eZfwRs_jL_Ckm95UH-TxlExdFmfZNYW7-_iZI,606
50
+ csv_detective/formats/json.py,sha256=E-s7IHW0q5WgAJVK0I-5Rv7W_RdofROB5wnIXbNegZQ,446
51
+ csv_detective/formats/latitude_l93.py,sha256=GteGpxAht-jeOBLr_deCuEXA_LliVYIAmyr_7jFAWgI,986
52
+ csv_detective/formats/latitude_wgs.py,sha256=HPcFlLzJNqynLugDQ07vO04rOCNBuAabVJEP8FQ89Q0,780
53
+ csv_detective/formats/latitude_wgs_fr_metropole.py,sha256=ruGzQLJPiMV2AlnsBneQIhMzstseddzWA0bDg5gfTG4,791
54
+ csv_detective/formats/latlon_wgs.py,sha256=CbNi4Y-ZgBfNyYi54xwcZGLpEusiLAWVpFP1YgHtI1M,1224
55
+ csv_detective/formats/longitude_l93.py,sha256=vJE4k_DyQOjAruqu_Q0E2sJKZB4mXGGN6bS9WCelsbs,768
56
+ csv_detective/formats/longitude_wgs.py,sha256=DUZCUxJQl53HHVQbXlz_lWXoAZhy3MvJWcPNdiK5cCM,552
57
+ csv_detective/formats/longitude_wgs_fr_metropole.py,sha256=wPlJP06K0BVWfrx1wwEAKK93AKIqvsuw705gKAlWAfQ,550
58
+ csv_detective/formats/lonlat_wgs.py,sha256=BgtTl2ReI0hSQB-7mcR4TDxx-QzvA1B9fiZWxTb5xPI,1005
59
+ csv_detective/formats/mois_de_lannee.py,sha256=4_mmdr9S83utVCgPaK_epkeBm2mhwdUWQEoB_Fhdh2o,759
60
+ csv_detective/formats/money.py,sha256=HpjrmfUmbG8sXF557XbYzQ7TLtpNVRgpC991gGokO8I,414
61
+ csv_detective/formats/mongo_object_id.py,sha256=XsiP4iMxfBBIeuL-4g5bm3jgS6yUMJC2X5CmrEJ40oI,296
62
+ csv_detective/formats/pays.py,sha256=FRvoQwIWiKbm0RC62Sus1X0Y_yJ-cfvdB5RYhkY-4NY,693
63
+ csv_detective/formats/percent.py,sha256=s6eQBMwJr2uyTZMUCK1_ifA0c4Rt2iEe9_E_hKKU_mk,308
64
+ csv_detective/formats/region.py,sha256=CkN7JTsZB1X3bH5xohbtMCxL5BX9MSpith36_1mHMd4,1483
65
+ csv_detective/formats/sexe.py,sha256=yioD4W6EkgUgo74rxn6KLZtN_0XYXtmA4mqVyI7e1mU,387
66
+ csv_detective/formats/siren.py,sha256=ieLe50vdSnkXadcUI8VXnnId9GFGHyIBWVTP6bJtyMo,758
67
+ csv_detective/formats/siret.py,sha256=ehkZgOH-HggN6IgxF4G0DMut_6giZ3gc4g9wMdwZFHQ,997
68
+ csv_detective/formats/tel_fr.py,sha256=yKCqIlqKO2yKucCoCjYfSjqNKfTjqFcmNXxg6THG0WE,624
69
+ csv_detective/formats/uai.py,sha256=uT5gjdTmoFH9QPZdTFkJgiyuKLW0B6KmT6yqHQeaeOU,711
70
+ csv_detective/formats/url.py,sha256=j6tCbcEzQw7U53ixeeFfhzueN8syVgQsjmAmY7RRWdU,1049
71
+ csv_detective/formats/username.py,sha256=y38OggfWpEQsGi0JnD9QRM30musa29lO6nz-qybR24U,249
72
+ csv_detective/formats/uuid.py,sha256=ekMEFfzQtz0cLudzmu3AoCM0Yf5pu23qAcFNFgHWJ1A,346
73
+ csv_detective/formats/year.py,sha256=pkAfYPKZdy0g1ZoHGgJNpgTS5y5weGEKXCVMGaxIX8k,472
74
+ csv_detective/output/__init__.py,sha256=ALSq_tgX7rGyh--7rmbKz8wHkmResN0h7mNujndow3w,2103
75
+ csv_detective/output/dataframe.py,sha256=Hnd-AY51U0JMACcpuaK9wwO4oCX9Nd7ZLUTqavgJWRA,3406
76
+ csv_detective/output/example.py,sha256=8LWheSBYCeDFfarbnmzBrdCbTd8Alh1U4pfXMKfabOw,8630
77
+ csv_detective/output/profile.py,sha256=VUQp0VJ22dfY4R5TybTpuQW_TOX_rLEp98cOzu-Jf44,4876
78
+ csv_detective/output/schema.py,sha256=XoKljXPXP00DfqPCiz1ydwTHYGAFsvNxnaPCNBuuBIo,10443
79
+ csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
80
+ csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
81
+ csv_detective/parsing/columns.py,sha256=nihmB7Cv5BUNPh2EhMRPLdAxvcjrGZF-QFbJDd6rR2M,9246
82
+ csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
83
+ csv_detective/parsing/csv.py,sha256=0T0gpaXzwJo-sq41IoLQD704GiMUYeDVVASVbat-zWg,1726
84
+ csv_detective/parsing/excel.py,sha256=oAVTuoDccJc4-kVjHXiIPLQx3lq3aZRRZQxkG1c06JQ,6992
85
+ csv_detective/parsing/load.py,sha256=f-8aKiNpy_47qg4Lq-UZUR4NNrbJ_-KEGvcUQZ8cmb0,4317
86
+ csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
87
+ csv_detective/utils.py,sha256=RJ_zFOJ1DRY8HtDrKPiCdNk5gU6-KwOrOKOyfSkBZZY,1118
88
+ csv_detective/validate.py,sha256=XldlbGkUlPaIh0y4z9iaWlmmahwCrD1900s5Cxlq5wI,5430
89
+ csv_detective-0.9.3.dev2438.dist-info/WHEEL,sha256=z-mOpxbJHqy3cq6SvUThBZdaLGFZzdZPtgWLcP2NKjQ,79
90
+ csv_detective-0.9.3.dev2438.dist-info/entry_points.txt,sha256=1J86TQNCanjsLMboAufdEUla03qEQaC9QmVGYgt2FCQ,57
91
+ csv_detective-0.9.3.dev2438.dist-info/METADATA,sha256=FHlHT6UPBByKisTmFR5TImWGG8rXSKiTP_lc7-yHRDU,11063
92
+ csv_detective-0.9.3.dev2438.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: uv 0.9.15
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -1,2 +1,3 @@
1
1
  [console_scripts]
2
2
  csv_detective = csv_detective.cli:run
3
+
@@ -1,104 +0,0 @@
1
- csv_detective
2
- csv_detective.detect_fields
3
- csv_detective.detect_fields.temp
4
- csv_detective.detect_fields.other
5
- csv_detective.detect_fields.geo
6
- csv_detective.detect_fields.FR
7
- csv_detective.detect_fields.temp.date
8
- csv_detective.detect_fields.temp.year
9
- csv_detective.detect_fields.other.email
10
- csv_detective.detect_fields.other.mongo_object_id
11
- csv_detective.detect_fields.other.uuid
12
- csv_detective.detect_fields.other.url
13
- csv_detective.detect_fields.geo.iso_country_code_alpha2
14
- csv_detective.detect_fields.geo.iso_country_code_alpha3
15
- csv_detective.detect_fields.geo.iso_country_code_numeric
16
- csv_detective.detect_fields.FR.temp
17
- csv_detective.detect_fields.FR.other
18
- csv_detective.detect_fields.FR.geo
19
- csv_detective.detect_fields.FR.temp.jour_de_la_semaine
20
- csv_detective.detect_fields.FR.other.csp_insee
21
- csv_detective.detect_fields.FR.other.tel_fr
22
- csv_detective.detect_fields.FR.other.siren
23
- csv_detective.detect_fields.FR.other.code_csp_insee
24
- csv_detective.detect_fields.FR.other.sexe
25
- csv_detective.detect_fields.FR.geo.pays
26
- csv_detective.detect_fields.FR.geo.code_departement
27
- csv_detective.detect_fields.FR.geo.adresse
28
- csv_detective.detect_fields.FR.geo.code_commune_insee
29
- csv_detective.detect_fields.FR.geo.commune
30
- csv_detective.detect_fields.FR.geo.region
31
- csv_detective.detect_fields.FR.geo.code_postal
32
- csv_detective.detect_fields.FR.geo.departement
33
- csv_detective.detect_fields.FR.other.uai
34
- csv_detective.detect_fields.FR.other.siret
35
- csv_detective.detect_fields.geo.latitude_wgs
36
- csv_detective.detect_fields.geo.longitude_wgs
37
- csv_detective.detect_fields.geo.latlon_wgs
38
- csv_detective.detect_fields.geo.json_geojson
39
- csv_detective.detect_fields.FR.geo.code_fantoir
40
- csv_detective.detect_fields.FR.other.insee_ape700
41
- csv_detective.detect_fields.temp.datetime_iso
42
- csv_detective.detect_fields.temp.datetime_rfc822
43
- csv_detective.detect_fields.FR.geo.latitude_wgs_fr_metropole
44
- csv_detective.detect_fields.FR.geo.longitude_wgs_fr_metropole
45
- csv_detective.detect_fields.FR.geo.code_region
46
- csv_detective.detect_fields.other.booleen
47
- csv_detective.detect_fields.other.twitter
48
- csv_detective.detect_fields.other.float
49
- csv_detective.detect_fields.other.int
50
- csv_detective.detect_fields.other.json
51
- csv_detective.detect_fields.FR.geo.latitude_l93
52
- csv_detective.detect_fields.FR.geo.longitude_l93
53
- csv_detective.detect_fields.FR.geo.insee_canton
54
- csv_detective.detect_fields.FR.other.date_fr
55
- csv_detective.detect_fields.FR.other.code_waldec
56
- csv_detective.detect_fields.FR.other.code_rna
57
- csv_detective.detect_labels.FR.geo.adresse
58
- csv_detective.detect_labels.FR.geo.code_commune_insee
59
- csv_detective.detect_labels.FR.geo.code_departement
60
- csv_detective.detect_labels.FR.geo.code_fantoir
61
- csv_detective.detect_labels.FR.geo.code_postal
62
- csv_detective.detect_labels.FR.geo.code_region
63
- csv_detective.detect_labels.FR.geo.commune
64
- csv_detective.detect_labels.FR.geo.departement
65
- csv_detective.detect_labels.FR.geo.insee_canton
66
- csv_detective.detect_labels.FR.geo.latitude_l93
67
- csv_detective.detect_labels.FR.geo.latitude_wgs_fr_metropole
68
- csv_detective.detect_labels.FR.geo.longitude_l93
69
- csv_detective.detect_labels.FR.geo.longitude_wgs_fr_metropole
70
- csv_detective.detect_labels.FR.geo.pays
71
- csv_detective.detect_labels.FR.geo.region
72
- csv_detective.detect_labels.FR.other.code_csp_insee
73
- csv_detective.detect_labels.FR.other.code_rna
74
- csv_detective.detect_labels.FR.other.code_waldec
75
- csv_detective.detect_labels.FR.other.csp_insee
76
- csv_detective.detect_labels.FR.other.date_fr
77
- csv_detective.detect_labels.FR.other.insee_ape700
78
- csv_detective.detect_labels.FR.other.sexe
79
- csv_detective.detect_labels.FR.other.siren
80
- csv_detective.detect_labels.FR.other.siret
81
- csv_detective.detect_labels.FR.other.tel_fr
82
- csv_detective.detect_labels.FR.other.uai
83
- csv_detective.detect_labels.FR.temp.jour_de_la_semaine
84
- csv_detective.detect_labels.FR.temp.mois_de_annee
85
- csv_detective.detect_labels.geo.iso_country_code_alpha2
86
- csv_detective.detect_labels.geo.iso_country_code_alpha3
87
- csv_detective.detect_labels.geo.iso_country_code_numeric
88
- csv_detective.detect_labels.geo.json_geojson
89
- csv_detective.detect_labels.geo.latitude_wgs
90
- csv_detective.detect_labels.geo.latlon_wgs
91
- csv_detective.detect_labels.geo.longitude_wgs
92
- csv_detective.detect_labels.other.booleen
93
- csv_detective.detect_labels.other.email
94
- csv_detective.detect_labels.other.mongo_object_id
95
- csv_detective.detect_labels.other.uuid
96
- csv_detective.detect_labels.other.float
97
- csv_detective.detect_labels.other.int
98
- csv_detective.detect_labels.other.money
99
- csv_detective.detect_labels.other.twitter
100
- csv_detective.detect_labels.other.url
101
- csv_detective.detect_labels.temp.date
102
- csv_detective.detect_labels.temp.datetime_iso
103
- csv_detective.detect_labels.temp.datetime_rfc822
104
- csv_detective.detect_labels.temp.year
@@ -1,100 +0,0 @@
1
- from csv_detective.process_text import _process_text
2
-
3
- PROPORTION = 0.55
4
- # trailing spaces are appended to the words to make sure the string is not just a substring of a longer word
5
- voies = {
6
- 'aire ',
7
- 'allee ',
8
- 'avenue ',
9
- 'base ',
10
- 'boulevard ',
11
- 'cami ',
12
- 'carrefour ',
13
- 'chemin ',
14
- 'cheminement ',
15
- 'chaussee ',
16
- 'cite ',
17
- 'clos ',
18
- 'coin ',
19
- 'corniche ',
20
- 'cote ',
21
- 'cour ',
22
- 'cours ',
23
- 'domaine ',
24
- 'descente ',
25
- 'ecart ',
26
- 'esplanade ',
27
- 'faubourg ',
28
- 'gare ',
29
- 'grande rue',
30
- 'hameau ',
31
- 'halle ',
32
- 'ilot ',
33
- 'impasse ',
34
- 'lieu dit',
35
- 'lotissement ',
36
- 'marche ',
37
- 'montee ',
38
- 'parc ',
39
- 'passage ',
40
- 'place ',
41
- 'plan ',
42
- 'plaine ',
43
- 'plateau ',
44
- 'pont ',
45
- 'port ',
46
- 'promenade ',
47
- 'parvis ',
48
- 'quartier ',
49
- 'quai ',
50
- 'residence ',
51
- 'ruelle ',
52
- 'rocade ',
53
- 'rond point',
54
- 'route ',
55
- 'rue ',
56
- # 'sente - sentier',
57
- 'square ',
58
- 'tour ',
59
- # 'terre-plein',
60
- 'traverse ',
61
- 'villa ',
62
- 'village ',
63
- 'voie ',
64
- 'zone artisanale',
65
- 'zone d’amenagement concerte',
66
- 'zone d’amenagement differe',
67
- 'zone industrielle',
68
- 'zone ',
69
- # 'r',
70
- 'av ',
71
- 'pl ',
72
- 'bd ',
73
- 'cami ',
74
- # 'che',
75
- 'chs ',
76
- 'dom ',
77
- 'ham ',
78
- 'ld ',
79
- # 'pro',
80
- # 'rte',
81
- 'vlge ',
82
- 'za ',
83
- 'zac ',
84
- 'zad ',
85
- 'zi ',
86
- # 'car',
87
- 'fg ',
88
- # 'lot',
89
- 'imp ',
90
- # 'qu',
91
- 'mte'
92
- }
93
-
94
-
95
- def _is(val):
96
- '''Repere des adresses'''
97
- if len(val) > 150:
98
- return False
99
- val = _process_text(val)
100
- return any(x in val for x in voies)
@@ -1,24 +0,0 @@
1
- from os.path import dirname, join
2
- import re
3
-
4
- PROPORTION = 0.75
5
- f = open(join(dirname(__file__), 'code_commune_insee.txt'), 'r')
6
- codes_insee = f.read().split('\n')
7
- # removing empty str due to an additional line in the file
8
- del codes_insee[-1]
9
- codes_insee = set(codes_insee)
10
- f.close()
11
- # consistency check, accounting for Corsica 2A/2B and overseas departments (971-976 except 975)
12
- regex = r'^([01345678][0-9]{4}|2[AB1-9][0-9]{3}|9([0-5][0-9]{3}|7[12346][0-9]{2}))$'
13
-
14
-
15
- def _is(val):
16
- '''Returns True if val could be an INSEE commune code, False otherwise'''
17
- # length check
18
- if len(val) != 5:
19
- return False
20
-
21
- if not bool(re.match(regex, val)):
22
- return False
23
-
24
- return val in codes_insee