csv-detective 0.8.1.dev1674__py3-none-any.whl → 0.8.1.dev1720__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. csv_detective/__init__.py +0 -2
  2. csv_detective/cli.py +6 -9
  3. csv_detective/detect_fields/FR/geo/adresse/__init__.py +78 -78
  4. csv_detective/detect_fields/FR/geo/code_departement/__init__.py +2 -2
  5. csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -1
  6. csv_detective/detect_fields/FR/geo/code_region/__init__.py +1 -1
  7. csv_detective/detect_fields/FR/geo/commune/__init__.py +2 -2
  8. csv_detective/detect_fields/FR/geo/departement/__init__.py +2 -2
  9. csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +2 -2
  10. csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +1 -2
  11. csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +1 -1
  12. csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +1 -2
  13. csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +1 -1
  14. csv_detective/detect_fields/FR/geo/pays/__init__.py +6 -6
  15. csv_detective/detect_fields/FR/geo/region/__init__.py +6 -4
  16. csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +15 -14
  17. csv_detective/detect_fields/FR/other/csp_insee/__init__.py +4 -3
  18. csv_detective/detect_fields/FR/other/date_fr/__init__.py +3 -3
  19. csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +4 -3
  20. csv_detective/detect_fields/FR/other/sexe/__init__.py +2 -2
  21. csv_detective/detect_fields/FR/other/siren/__init__.py +3 -3
  22. csv_detective/detect_fields/FR/other/siret/__init__.py +3 -3
  23. csv_detective/detect_fields/FR/other/tel_fr/__init__.py +3 -3
  24. csv_detective/detect_fields/FR/other/uai/__init__.py +2 -2
  25. csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +15 -15
  26. csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +27 -27
  27. csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +5 -5
  28. csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +5 -5
  29. csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +5 -5
  30. csv_detective/detect_fields/geo/latitude_wgs/__init__.py +1 -1
  31. csv_detective/detect_fields/geo/longitude_wgs/__init__.py +1 -1
  32. csv_detective/detect_fields/other/booleen/__init__.py +1 -1
  33. csv_detective/detect_fields/other/email/__init__.py +4 -2
  34. csv_detective/detect_fields/other/int/__init__.py +3 -3
  35. csv_detective/detect_fields/other/mongo_object_id/__init__.py +2 -2
  36. csv_detective/detect_fields/other/twitter/__init__.py +2 -2
  37. csv_detective/detect_fields/other/uuid/__init__.py +4 -5
  38. csv_detective/detect_fields/temp/date/__init__.py +3 -2
  39. csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +6 -6
  40. csv_detective/detect_fields/temp/year/__init__.py +1 -1
  41. csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -1
  42. csv_detective/detect_labels/geo/lonlat_wgs/__init__.py +1 -0
  43. csv_detective/detect_labels/other/mongo_object_id/__init__.py +1 -1
  44. csv_detective/detection/columns.py +9 -9
  45. csv_detective/detection/encoding.py +6 -4
  46. csv_detective/detection/engine.py +6 -5
  47. csv_detective/detection/formats.py +19 -19
  48. csv_detective/detection/headers.py +3 -5
  49. csv_detective/detection/rows.py +1 -1
  50. csv_detective/detection/variables.py +4 -4
  51. csv_detective/explore_csv.py +7 -8
  52. csv_detective/load_tests.py +6 -14
  53. csv_detective/output/__init__.py +3 -7
  54. csv_detective/output/dataframe.py +9 -5
  55. csv_detective/output/example.py +13 -13
  56. csv_detective/output/profile.py +30 -23
  57. csv_detective/output/schema.py +20 -23
  58. csv_detective/output/utils.py +15 -15
  59. csv_detective/parsing/columns.py +23 -12
  60. csv_detective/parsing/csv.py +1 -1
  61. csv_detective/parsing/excel.py +10 -11
  62. csv_detective/parsing/load.py +11 -8
  63. csv_detective/parsing/text.py +4 -9
  64. csv_detective/s3_utils.py +3 -7
  65. csv_detective/utils.py +4 -2
  66. csv_detective/validate.py +18 -13
  67. csv_detective-0.8.1.dev1674.data/data/share/csv_detective/README.md → csv_detective-0.8.1.dev1720.dist-info/METADATA +32 -0
  68. {csv_detective-0.8.1.dev1674.dist-info → csv_detective-0.8.1.dev1720.dist-info}/RECORD +81 -81
  69. {csv_detective-0.8.1.dev1674.dist-info → csv_detective-0.8.1.dev1720.dist-info}/top_level.txt +2 -0
  70. tests/test_example.py +2 -6
  71. tests/test_fields.py +16 -10
  72. tests/test_file.py +10 -9
  73. tests/test_labels.py +3 -2
  74. tests/test_structure.py +3 -1
  75. tests/test_validation.py +9 -6
  76. venv/bin/activate_this.py +38 -0
  77. venv/bin/jp.py +54 -0
  78. venv/bin/runxlrd.py +410 -0
  79. csv_detective-0.8.1.dev1674.data/data/share/csv_detective/CHANGELOG.md +0 -186
  80. csv_detective-0.8.1.dev1674.dist-info/METADATA +0 -268
  81. csv_detective-0.8.1.dev1674.dist-info/licenses/LICENSE +0 -21
  82. {csv_detective-0.8.1.dev1674.dist-info → csv_detective-0.8.1.dev1720.dist-info}/WHEEL +0 -0
  83. {csv_detective-0.8.1.dev1674.dist-info → csv_detective-0.8.1.dev1720.dist-info}/entry_points.txt +0 -0
  84. {csv_detective-0.8.1.dev1674.data/data/share/csv_detective → csv_detective-0.8.1.dev1720.dist-info/licenses}/LICENSE +0 -0
@@ -1,268 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: csv_detective
3
- Version: 0.8.1.dev1674
4
- Summary: Detect tabular files column content
5
- Home-page: https://github.com/datagouv/csv_detective
6
- Author: Etalab
7
- Author-email: opendatateam@data.gouv.fr
8
- License: https://spdx.org/licenses/MIT.html#licenseText
9
- Project-URL: Source, https://github.com/datagouv/csv_detective
10
- Keywords: CSV data processing encoding guess parser tabular
11
- Classifier: Development Status :: 2 - Pre-Alpha
12
- Classifier: License :: OSI Approved :: MIT License
13
- Classifier: Operating System :: OS Independent
14
- Classifier: Programming Language :: Python :: 3
15
- Classifier: Programming Language :: Python :: 3.9
16
- Classifier: Programming Language :: Python :: 3.10
17
- Classifier: Programming Language :: Python :: 3.11
18
- Classifier: Programming Language :: Python :: 3.12
19
- Classifier: Programming Language :: Python :: 3.13
20
- Classifier: Programming Language :: Python :: Implementation :: CPython
21
- Classifier: Topic :: Scientific/Engineering :: Information Analysis
22
- Requires-Python: >=3.9
23
- Description-Content-Type: text/markdown
24
- License-File: LICENSE
25
- Requires-Dist: boto3<2,>=1.34.0
26
- Requires-Dist: dateparser<2,>=1.2.0
27
- Requires-Dist: faust-cchardet==2.1.19
28
- Requires-Dist: pandas<3,>=2.2.0
29
- Requires-Dist: python-dateutil<3,>=2.8.2
30
- Requires-Dist: Unidecode<2,>=1.3.6
31
- Requires-Dist: openpyxl==3.1.5
32
- Requires-Dist: xlrd==2.0.1
33
- Requires-Dist: odfpy==1.4.1
34
- Requires-Dist: requests<3,>=2.32.3
35
- Requires-Dist: python-magic==0.4.27
36
- Requires-Dist: frformat==0.4.0
37
- Requires-Dist: Faker>=33.0.0
38
- Requires-Dist: rstr==3.2.2
39
- Provides-Extra: dev
40
- Requires-Dist: pytest==8.3.0; extra == "dev"
41
- Requires-Dist: responses==0.25.0; extra == "dev"
42
- Requires-Dist: bumpx==0.3.10; extra == "dev"
43
- Dynamic: author
44
- Dynamic: author-email
45
- Dynamic: classifier
46
- Dynamic: description
47
- Dynamic: description-content-type
48
- Dynamic: home-page
49
- Dynamic: keywords
50
- Dynamic: license
51
- Dynamic: license-file
52
- Dynamic: project-url
53
- Dynamic: provides-extra
54
- Dynamic: requires-dist
55
- Dynamic: requires-python
56
- Dynamic: summary
57
-
58
- # CSV Detective
59
-
60
- This is a package to **automatically detect column content in tabular files**. The script reads either the whole file or the first few rows and performs various checks to see for each column if it matches with various content types. This is currently done through regex and string comparison.
61
-
62
- Currently supported file types: csv, xls, xlsx, ods.
63
-
64
- You can also directly feed the URL of a remote file (from data.gouv.fr for instance).
65
-
66
- ## How To ?
67
-
68
- ### Install the package
69
-
70
- You need to have python >= 3.9 installed. We recommend using a virtual environment.
71
-
72
- ```
73
- pip install csv-detective
74
- ```
75
-
76
- ### Detect some columns
77
-
78
- Say you have a tabular file located at `file_path`. This is how you could use `csv_detective`:
79
-
80
- ```
81
- # Import the csv_detective package
82
- from csv_detective import routine
83
- import os # for this example only
84
-
85
- # Replace by your file path
86
- file_path = os.path.join('.', 'tests', 'code_postaux_v201410.csv')
87
-
88
- # Open your file and run csv_detective
89
- inspection_results = routine(
90
- file_path, # or file URL
91
- num_rows=-1, # Value -1 will analyze all lines of your file, you can change with the number of lines you wish to analyze
92
- save_results=False, # Default False. If True, it will save result output into the same directory as the analyzed file, using the same name as your file and .json extension
93
- output_profile=True, # Default False. If True, returned dict will contain a property "profile" indicating profile (min, max, mean, tops...) of every column of your csv
94
- output_schema=True, # Default False. If True, returned dict will contain a property "schema" containing basic [tableschema](https://specs.frictionlessdata.io/table-schema/) of your file. This can be used to validate the structure of other csv files which should match the same structure.
95
- )
96
- ```
97
-
98
- ## So What Do You Get ?
99
-
100
- ### Output
101
-
102
- The program creates a `Python` dictionary with the following information:
103
-
104
- ```
105
- {
106
- "encoding": "windows-1252", # Encoding detected
107
- "separator": ";", # Detected CSV separator
108
- "header_row_idx": 0, # Index of the header (aka how many lines to skip to get it)
109
- "headers": ['code commune INSEE', 'nom de la commune', 'code postal', "libellé d'acheminement"], # Header row
110
- "total_lines": 42, # Number of rows (excluding header)
111
- "nb_duplicates": 0, # Number of exact duplicates in rows
112
- "heading_columns": 0, # Number of heading columns
113
- "trailing_columns": 0, # Number of trailing columns
114
- "categorical": ['Code commune'] # Columns that contain less than 25 different values (arbitrary threshold)
115
- "columns": { # Property that combines detections from the labels and the content of a column
116
- "Code commune": {
117
- "python_type": "string",
118
- "format": "code_commune_insee",
119
- "score": 1.0
120
- },
121
- },
122
- "columns_labels": { # Property that returns detection from header columns
123
- "Code commune": {
124
- "python_type": "string",
125
- "format": "code_commune_insee",
126
- "score": 0.5
127
- },
128
- },
129
- "columns_fields": { # Property that returns detection from content columns
130
- "Code commune": {
131
- "python_type": "string",
132
- "format": "code_commune_insee",
133
- "score": 1.25
134
- },
135
- },
136
- "profile": {
137
- "column_name" : {
138
- "min": 1, # only int and float
139
- "max": 12, # only int and float
140
- "mean": 5, # only int and float
141
- "std": 5, # only int and float
142
- "tops": [ # 10 most frequent values in the column
143
- "xxx",
144
- "yyy",
145
- "..."
146
- ],
147
- "nb_distinct": 67, # number of distinct values
148
- "nb_missing_values": 102 # number of empty cells in the column
149
- }
150
- },
151
- "schema": { # TableSchema of the file if `output_schema` was set to `True`
152
- "$schema": "https://frictionlessdata.io/schemas/table-schema.json",
153
- "name": "",
154
- "title": "",
155
- "description": "",
156
- "countryCode": "FR",
157
- "homepage": "",
158
- "path": "https://github.com/datagouv/csv-detective",
159
- "resources": [],
160
- "sources": [
161
- {"title": "Spécification Tableschema", "path": "https://specs.frictionlessdata.io/table-schema"},
162
- {"title": "schema.data.gouv.fr", "path": "https://schema.data.gouv.fr"}
163
- ],
164
- "created": "2023-02-10",
165
- "lastModified": "2023-02-10",
166
- "version": "0.0.1",
167
- "contributors": [
168
- {"title": "Table schema bot", "email": "schema@data.gouv.fr", "organisation": "data.gouv.fr", "role": "author"}
169
- ],
170
- "fields": [
171
- {
172
- "name": "Code commune",
173
- "description": "Le code INSEE de la commune",
174
- "example": "23150",
175
- "type": "string",
176
- "formatFR": "code_commune_insee",
177
- "constraints": {
178
- "required": False,
179
- "pattern": "^([013-9]\\d|2[AB1-9])\\d{3}$",
180
- }
181
- }
182
- ]
183
- }
184
- }
185
- ```
186
-
187
- The output slightly differs depending on the file format:
188
- - csv files have `encoding` and `separator`
189
- - xls, xlsx, ods files have `engine` and `sheet_name`
190
-
191
- ### What Formats Can Be Detected
192
-
193
- Includes :
194
-
195
- - Communes, Départements, Régions, Pays
196
- - Codes Communes, Codes Postaux, Codes Departement, ISO Pays
197
- - Codes CSP, Description CSP, SIREN
198
- - E-Mails, URLs, Téléphones FR
199
- - Years, Dates, Jours de la Semaine FR
200
- - UUIDs, Mongo ObjectIds
201
-
202
- ### Format detection and scoring
203
- For each column, 3 scores are computed for each format, the higher the score, the more likely the format:
204
- - the field score based on the values contained in the column (0.0 to 1.0).
205
- - the label score based on the header of the column (0.0 to 1.0).
206
- - the overall score, computed as `field_score * (1 + label_score/2)` (0.0 to 1.5).
207
-
208
- The overall score computation aims to give more weight to the column contents while
209
- still leveraging the column header.
210
-
211
- #### `limited_output` - Select the output mode you want for json report
212
-
213
- This option allows you to select the output mode you want to pass. To do so, you have to pass a `limited_output` argument to the `routine` function. This variable has two possible values:
214
-
215
- - `limited_output` defaults to `True` which means report will contain only detected column formats based on a pre-selected threshold proportion in data. Report result is the standard output (an example can be found above in 'Output' section).
216
- Only the format with highest score is present in the output.
217
- - `limited_output=False` means the report will contain a full list of all column format possibilities for each input data column, with an associated value matching the proportion of the detected column type in the data. With this report, users can adjust their detection rules based on a specific threshold and get a better view of detection quality for each column. Results can also be easily transformed into a dataframe (column types in columns / column names in rows) for analysis and testing.
218
-
219
- ## Improvement suggestions
220
-
221
- - Smarter refactors
222
- - Improve performances
223
- - Test other ways to load and process data (`pandas` alternatives)
224
- - Add more and more detection modules...
225
-
226
- Related ideas:
227
-
228
- - store column names to make a learning model based on column names for (possible pre-screen)
229
- - normalising data based on column prediction
230
- - entity resolution (good luck...)
231
-
232
- ## Why Could This Be of Any Use ?
233
-
234
- Organisations such as [data.gouv.fr](http://data.gouv.fr) aggregate huge amounts of un-normalised data. Performing cross-examination across datasets can be difficult. This tool could help enrich the datasets metadata and facilitate linking them together.
235
-
236
- [`udata-hydra`](https://github.com/etalab/udata-hydra) is a crawler that checks, analyzes (using `csv-detective`) and APIfies all tabular files from [data.gouv.fr](http://data.gouv.fr).
237
-
238
- An early version of this analysis of all resources on data.gouv.fr can be found [here](https://github.com/Leobouloc/data.gouv-exploration).
239
-
240
- ## Release
241
-
242
- The release process uses `bumpx`.
243
-
244
- ```shell
245
- pip install -r requirements-build.txt
246
- ```
247
-
248
- ### Process
249
-
250
- 1. `bumpx` will handle bumping the version according to your command (patch, minor, major)
251
- 2. It will update the CHANGELOG according to the new version being published
252
- 3. It will push a tag with the given version to github
253
- 4. CircleCI will pickup this tag, build the package and publish it to pypi
254
- 5. `bumpx` will have everything ready for the next version (version, changelog...)
255
-
256
- ### Dry run
257
-
258
- ```shell
259
- bumpx -d -v
260
- ```
261
-
262
- ### Release
263
-
264
- This will release a patch version:
265
-
266
- ```shell
267
- bumpx -v
268
- ```
@@ -1,21 +0,0 @@
1
- MIT License
2
-
3
- Copyright (c) 2025 data.gouv.fr
4
-
5
- Permission is hereby granted, free of charge, to any person obtaining a copy
6
- of this software and associated documentation files (the "Software"), to deal
7
- in the Software without restriction, including without limitation the rights
8
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
- copies of the Software, and to permit persons to whom the Software is
10
- furnished to do so, subject to the following conditions:
11
-
12
- The above copyright notice and this permission notice shall be included in all
13
- copies or substantial portions of the Software.
14
-
15
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
- SOFTWARE.