csv-detective 0.6.7__py3-none-any.whl → 0.9.3.dev2438__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/__init__.py +7 -1
- csv_detective/cli.py +33 -21
- csv_detective/{detect_fields/FR → detection}/__init__.py +0 -0
- csv_detective/detection/columns.py +89 -0
- csv_detective/detection/encoding.py +29 -0
- csv_detective/detection/engine.py +46 -0
- csv_detective/detection/formats.py +156 -0
- csv_detective/detection/headers.py +28 -0
- csv_detective/detection/rows.py +18 -0
- csv_detective/detection/separator.py +44 -0
- csv_detective/detection/variables.py +97 -0
- csv_detective/explore_csv.py +151 -377
- csv_detective/format.py +67 -0
- csv_detective/formats/__init__.py +9 -0
- csv_detective/formats/adresse.py +116 -0
- csv_detective/formats/binary.py +26 -0
- csv_detective/formats/booleen.py +35 -0
- csv_detective/formats/code_commune_insee.py +26 -0
- csv_detective/formats/code_csp_insee.py +36 -0
- csv_detective/formats/code_departement.py +29 -0
- csv_detective/formats/code_fantoir.py +21 -0
- csv_detective/formats/code_import.py +17 -0
- csv_detective/formats/code_postal.py +25 -0
- csv_detective/formats/code_region.py +22 -0
- csv_detective/formats/code_rna.py +29 -0
- csv_detective/formats/code_waldec.py +17 -0
- csv_detective/formats/commune.py +27 -0
- csv_detective/formats/csp_insee.py +31 -0
- csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
- csv_detective/formats/date.py +99 -0
- csv_detective/formats/date_fr.py +22 -0
- csv_detective/formats/datetime_aware.py +45 -0
- csv_detective/formats/datetime_naive.py +48 -0
- csv_detective/formats/datetime_rfc822.py +24 -0
- csv_detective/formats/departement.py +37 -0
- csv_detective/formats/email.py +28 -0
- csv_detective/formats/float.py +29 -0
- csv_detective/formats/geojson.py +36 -0
- csv_detective/formats/insee_ape700.py +31 -0
- csv_detective/formats/insee_canton.py +28 -0
- csv_detective/formats/int.py +23 -0
- csv_detective/formats/iso_country_code_alpha2.py +30 -0
- csv_detective/formats/iso_country_code_alpha3.py +30 -0
- csv_detective/formats/iso_country_code_numeric.py +31 -0
- csv_detective/formats/jour_de_la_semaine.py +41 -0
- csv_detective/formats/json.py +20 -0
- csv_detective/formats/latitude_l93.py +48 -0
- csv_detective/formats/latitude_wgs.py +42 -0
- csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
- csv_detective/formats/latlon_wgs.py +53 -0
- csv_detective/formats/longitude_l93.py +39 -0
- csv_detective/formats/longitude_wgs.py +32 -0
- csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
- csv_detective/formats/lonlat_wgs.py +36 -0
- csv_detective/formats/mois_de_lannee.py +48 -0
- csv_detective/formats/money.py +18 -0
- csv_detective/formats/mongo_object_id.py +14 -0
- csv_detective/formats/pays.py +35 -0
- csv_detective/formats/percent.py +16 -0
- csv_detective/formats/region.py +70 -0
- csv_detective/formats/sexe.py +17 -0
- csv_detective/formats/siren.py +37 -0
- csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -29
- csv_detective/formats/tel_fr.py +36 -0
- csv_detective/formats/uai.py +36 -0
- csv_detective/formats/url.py +46 -0
- csv_detective/formats/username.py +14 -0
- csv_detective/formats/uuid.py +16 -0
- csv_detective/formats/year.py +28 -0
- csv_detective/output/__init__.py +65 -0
- csv_detective/output/dataframe.py +96 -0
- csv_detective/output/example.py +250 -0
- csv_detective/output/profile.py +119 -0
- csv_detective/{schema_generation.py → output/schema.py} +268 -343
- csv_detective/output/utils.py +74 -0
- csv_detective/{detect_fields/FR/geo → parsing}/__init__.py +0 -0
- csv_detective/parsing/columns.py +235 -0
- csv_detective/parsing/compression.py +11 -0
- csv_detective/parsing/csv.py +56 -0
- csv_detective/parsing/excel.py +167 -0
- csv_detective/parsing/load.py +111 -0
- csv_detective/parsing/text.py +56 -0
- csv_detective/utils.py +23 -196
- csv_detective/validate.py +138 -0
- csv_detective-0.9.3.dev2438.dist-info/METADATA +267 -0
- csv_detective-0.9.3.dev2438.dist-info/RECORD +92 -0
- csv_detective-0.9.3.dev2438.dist-info/WHEEL +4 -0
- {csv_detective-0.6.7.dist-info → csv_detective-0.9.3.dev2438.dist-info}/entry_points.txt +1 -0
- csv_detective/all_packages.txt +0 -104
- csv_detective/detect_fields/FR/geo/adresse/__init__.py +0 -100
- csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -24
- csv_detective/detect_fields/FR/geo/code_commune_insee/code_commune_insee.txt +0 -37600
- csv_detective/detect_fields/FR/geo/code_departement/__init__.py +0 -11
- csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -15
- csv_detective/detect_fields/FR/geo/code_fantoir/code_fantoir.txt +0 -26122
- csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -19
- csv_detective/detect_fields/FR/geo/code_postal/code_postal.txt +0 -36822
- csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -27
- csv_detective/detect_fields/FR/geo/commune/__init__.py +0 -21
- csv_detective/detect_fields/FR/geo/commune/commune.txt +0 -36745
- csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -19
- csv_detective/detect_fields/FR/geo/departement/departement.txt +0 -101
- csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +0 -20
- csv_detective/detect_fields/FR/geo/insee_canton/canton2017.txt +0 -2055
- csv_detective/detect_fields/FR/geo/insee_canton/cantons.txt +0 -2055
- csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
- csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -17
- csv_detective/detect_fields/FR/geo/pays/pays.txt +0 -248
- csv_detective/detect_fields/FR/geo/region/__init__.py +0 -16
- csv_detective/detect_fields/FR/geo/region/region.txt +0 -44
- csv_detective/detect_fields/FR/other/__init__.py +0 -0
- csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +0 -26
- csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
- csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -8
- csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -12
- csv_detective/detect_fields/FR/other/csp_insee/__init__.py +0 -16
- csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
- csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +0 -16
- csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -9
- csv_detective/detect_fields/FR/other/siren/__init__.py +0 -18
- csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -15
- csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
- csv_detective/detect_fields/FR/temp/__init__.py +0 -0
- csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +0 -23
- csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +0 -37
- csv_detective/detect_fields/__init__.py +0 -57
- csv_detective/detect_fields/geo/__init__.py +0 -0
- csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
- csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
- csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
- csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -22
- csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
- csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -15
- csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
- csv_detective/detect_fields/other/__init__.py +0 -0
- csv_detective/detect_fields/other/booleen/__init__.py +0 -21
- csv_detective/detect_fields/other/email/__init__.py +0 -8
- csv_detective/detect_fields/other/float/__init__.py +0 -17
- csv_detective/detect_fields/other/int/__init__.py +0 -12
- csv_detective/detect_fields/other/json/__init__.py +0 -24
- csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
- csv_detective/detect_fields/other/twitter/__init__.py +0 -8
- csv_detective/detect_fields/other/url/__init__.py +0 -11
- csv_detective/detect_fields/other/uuid/__init__.py +0 -11
- csv_detective/detect_fields/temp/__init__.py +0 -0
- csv_detective/detect_fields/temp/date/__init__.py +0 -62
- csv_detective/detect_fields/temp/datetime_iso/__init__.py +0 -18
- csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +0 -21
- csv_detective/detect_fields/temp/year/__init__.py +0 -10
- csv_detective/detect_labels/FR/__init__.py +0 -0
- csv_detective/detect_labels/FR/geo/__init__.py +0 -0
- csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -40
- csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -42
- csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -41
- csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -47
- csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -33
- csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -54
- csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -55
- csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -44
- csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -45
- csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -45
- csv_detective/detect_labels/FR/geo/region/__init__.py +0 -45
- csv_detective/detect_labels/FR/other/__init__.py +0 -0
- csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -38
- csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -37
- csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -40
- csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -33
- csv_detective/detect_labels/FR/other/siren/__init__.py +0 -41
- csv_detective/detect_labels/FR/other/siret/__init__.py +0 -40
- csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -45
- csv_detective/detect_labels/FR/other/uai/__init__.py +0 -50
- csv_detective/detect_labels/FR/temp/__init__.py +0 -0
- csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -41
- csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -33
- csv_detective/detect_labels/__init__.py +0 -43
- csv_detective/detect_labels/geo/__init__.py +0 -0
- csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -41
- csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -41
- csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -41
- csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -42
- csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -55
- csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -67
- csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -45
- csv_detective/detect_labels/other/__init__.py +0 -0
- csv_detective/detect_labels/other/booleen/__init__.py +0 -34
- csv_detective/detect_labels/other/email/__init__.py +0 -45
- csv_detective/detect_labels/other/float/__init__.py +0 -33
- csv_detective/detect_labels/other/int/__init__.py +0 -33
- csv_detective/detect_labels/other/money/__init__.py +0 -11
- csv_detective/detect_labels/other/money/check_col_name.py +0 -8
- csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -33
- csv_detective/detect_labels/other/twitter/__init__.py +0 -33
- csv_detective/detect_labels/other/url/__init__.py +0 -48
- csv_detective/detect_labels/other/uuid/__init__.py +0 -33
- csv_detective/detect_labels/temp/__init__.py +0 -0
- csv_detective/detect_labels/temp/date/__init__.py +0 -51
- csv_detective/detect_labels/temp/datetime_iso/__init__.py +0 -45
- csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -44
- csv_detective/detect_labels/temp/year/__init__.py +0 -44
- csv_detective/detection.py +0 -361
- csv_detective/process_text.py +0 -39
- csv_detective/s3_utils.py +0 -48
- csv_detective-0.6.7.data/data/share/csv_detective/CHANGELOG.md +0 -118
- csv_detective-0.6.7.data/data/share/csv_detective/LICENSE.AGPL.txt +0 -661
- csv_detective-0.6.7.data/data/share/csv_detective/README.md +0 -247
- csv_detective-0.6.7.dist-info/LICENSE.AGPL.txt +0 -661
- csv_detective-0.6.7.dist-info/METADATA +0 -23
- csv_detective-0.6.7.dist-info/RECORD +0 -150
- csv_detective-0.6.7.dist-info/WHEEL +0 -5
- csv_detective-0.6.7.dist-info/top_level.txt +0 -2
- tests/__init__.py +0 -0
- tests/test_fields.py +0 -360
- tests/test_file.py +0 -116
- tests/test_labels.py +0 -7
- /csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
- /csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: csv-detective
|
|
3
|
+
Version: 0.9.3.dev2438
|
|
4
|
+
Summary: Detect tabular files column content
|
|
5
|
+
Keywords: CSV,data processing,encoding,guess,parser,tabular
|
|
6
|
+
Author: data.gouv.fr
|
|
7
|
+
Author-email: data.gouv.fr <opendatateam@data.gouv.fr>
|
|
8
|
+
License: MIT
|
|
9
|
+
Requires-Dist: dateparser>=1.2.0,<2
|
|
10
|
+
Requires-Dist: faust-cchardet==2.1.19
|
|
11
|
+
Requires-Dist: pandas>=2.2.0,<3
|
|
12
|
+
Requires-Dist: python-dateutil>=2.8.2,<3
|
|
13
|
+
Requires-Dist: unidecode>=1.3.6,<2
|
|
14
|
+
Requires-Dist: openpyxl>=3.1.5
|
|
15
|
+
Requires-Dist: xlrd>=2.0.1
|
|
16
|
+
Requires-Dist: odfpy>=1.4.1
|
|
17
|
+
Requires-Dist: requests>=2.32.3,<3
|
|
18
|
+
Requires-Dist: python-magic>=0.4.27
|
|
19
|
+
Requires-Dist: frformat==0.4.0
|
|
20
|
+
Requires-Dist: faker>=33.0.0
|
|
21
|
+
Requires-Dist: rstr>=3.2.2
|
|
22
|
+
Requires-Dist: more-itertools>=10.8.0
|
|
23
|
+
Requires-Dist: pytest>=8.3.0 ; extra == 'dev'
|
|
24
|
+
Requires-Dist: responses>=0.25.0 ; extra == 'dev'
|
|
25
|
+
Requires-Dist: ruff>=0.9.3 ; extra == 'dev'
|
|
26
|
+
Requires-Python: >=3.10, <3.15
|
|
27
|
+
Project-URL: Source, https://github.com/datagouv/csv_detective
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Description-Content-Type: text/markdown
|
|
30
|
+
|
|
31
|
+
# CSV Detective
|
|
32
|
+
|
|
33
|
+
This is a package to **automatically detect column content in tabular files**. The script reads either the whole file or the first few rows and performs various checks (regex, casting, comparison with official lists...) to determine, for each column, which content types it matches.
|
|
34
|
+
|
|
35
|
+
Currently supported file types: csv(.gz), xls, xlsx, ods.
|
|
36
|
+
|
|
37
|
+
You can also directly feed the URL of a remote file (from data.gouv.fr for instance).
|
|
38
|
+
|
|
39
|
+
## How To?
|
|
40
|
+
|
|
41
|
+
### Install the package
|
|
42
|
+
|
|
43
|
+
You need to have Python >= 3.10 installed. We recommend using a virtual environment.
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
pip install csv-detective
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### Detect some columns
|
|
50
|
+
|
|
51
|
+
Say you have a tabular file located at `file_path`. This is how you could use `csv_detective`:
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
# Import the csv_detective package
|
|
55
|
+
from csv_detective import routine
|
|
56
|
+
import os # for this example only
|
|
57
|
+
|
|
58
|
+
# Replace by your file path
|
|
59
|
+
file_path = os.path.join('.', 'tests', 'code_postaux_v201410.csv')
|
|
60
|
+
|
|
61
|
+
# Open your file and run csv_detective
|
|
62
|
+
inspection_results = routine(
|
|
63
|
+
file_path, # or file URL
|
|
64
|
+
num_rows=-1, # Value -1 will analyze all lines of your file; otherwise, set it to the number of lines you wish to analyze
|
|
65
|
+
save_results=False, # Default False. If True, it will save result output into the same directory as the analyzed file, using the same name as your file and .json extension
|
|
66
|
+
output_profile=True, # Default False. If True, returned dict will contain a property "profile" indicating profile (min, max, mean, tops...) of every column of your csv
|
|
67
|
+
output_schema=True, # Default False. If True, the returned dict will contain a property "schema" containing a basic [tableschema](https://specs.frictionlessdata.io/table-schema/) of your file. This can be used to validate the structure of other CSV files that should match the same structure.
|
|
68
|
+
tags=["fr"], # Default None. If set as a list of strings, only performs checks related to the specified tags (you can see the available tags with FormatsManager().available_tags())
|
|
69
|
+
)
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## So What Do You Get ?
|
|
73
|
+
|
|
74
|
+
### Output
|
|
75
|
+
|
|
76
|
+
The program creates a `python` dictionary with the following information :
|
|
77
|
+
|
|
78
|
+
```
|
|
79
|
+
{
|
|
80
|
+
"encoding": "windows-1252", # Encoding detected
|
|
81
|
+
"separator": ";", # Detected CSV separator
|
|
82
|
+
"header_row_idx": 0, # Index of the header (aka how many lines to skip to get it)
|
|
83
|
+
"headers": ['code commune INSEE', 'nom de la commune', 'code postal', "libellé d'acheminement"], # Header row
|
|
84
|
+
"total_lines": 42, # Number of rows (excluding header)
|
|
85
|
+
"nb_duplicates": 0, # Number of exact duplicates in rows
|
|
86
|
+
"heading_columns": 0, # Number of heading columns
|
|
87
|
+
"trailing_columns": 0, # Number of trailing columns
|
|
88
|
+
"categorical": ['Code commune'], # Columns that contain fewer than 25 different values (arbitrary threshold)
|
|
89
|
+
"columns": { # Property that reconciles the detections from the label and the content of a column
|
|
90
|
+
"Code commune": {
|
|
91
|
+
"python_type": "string",
|
|
92
|
+
"format": "code_commune_insee",
|
|
93
|
+
"score": 1.0
|
|
94
|
+
},
|
|
95
|
+
},
|
|
96
|
+
"columns_labels": { # Property that returns the detection based on the column headers
|
|
97
|
+
"Code commune": {
|
|
98
|
+
"python_type": "string",
|
|
99
|
+
"format": "code_commune_insee",
|
|
100
|
+
"score": 0.5
|
|
101
|
+
},
|
|
102
|
+
},
|
|
103
|
+
"columns_fields": { # Property that returns the detection based on the column contents
|
|
104
|
+
"Code commune": {
|
|
105
|
+
"python_type": "string",
|
|
106
|
+
"format": "code_commune_insee",
|
|
107
|
+
"score": 1.25
|
|
108
|
+
},
|
|
109
|
+
},
|
|
110
|
+
"profile": {
|
|
111
|
+
"column_name" : {
|
|
112
|
+
"min": 1, # only int and float
|
|
113
|
+
"max": 12, # only int and float
|
|
114
|
+
"mean": 5, # only int and float
|
|
115
|
+
"std": 5, # only int and float
|
|
116
|
+
"tops": [ # 10 most frequent values in the column
|
|
117
|
+
"xxx",
|
|
118
|
+
"yyy",
|
|
119
|
+
"..."
|
|
120
|
+
],
|
|
121
|
+
"nb_distinct": 67, # number of distinct values
|
|
122
|
+
"nb_missing_values": 102 # number of empty cells in the column
|
|
123
|
+
}
|
|
124
|
+
},
|
|
125
|
+
"schema": { # TableSchema of the file if `output_schema` was set to `True`
|
|
126
|
+
"$schema": "https://frictionlessdata.io/schemas/table-schema.json",
|
|
127
|
+
"name": "",
|
|
128
|
+
"title": "",
|
|
129
|
+
"description": "",
|
|
130
|
+
"countryCode": "FR",
|
|
131
|
+
"homepage": "",
|
|
132
|
+
"path": "https://github.com/datagouv/csv-detective",
|
|
133
|
+
"resources": [],
|
|
134
|
+
"sources": [
|
|
135
|
+
{"title": "Spécification Tableschema", "path": "https://specs.frictionlessdata.io/table-schema"},
|
|
136
|
+
{"title": "schema.data.gouv.fr", "path": "https://schema.data.gouv.fr"}
|
|
137
|
+
],
|
|
138
|
+
"created": "2023-02-10",
|
|
139
|
+
"lastModified": "2023-02-10",
|
|
140
|
+
"version": "0.0.1",
|
|
141
|
+
"contributors": [
|
|
142
|
+
{"title": "Table schema bot", "email": "schema@data.gouv.fr", "organisation": "data.gouv.fr", "role": "author"}
|
|
143
|
+
],
|
|
144
|
+
"fields": [
|
|
145
|
+
{
|
|
146
|
+
"name": "Code commune",
|
|
147
|
+
"description": "Le code INSEE de la commune",
|
|
148
|
+
"example": "23150",
|
|
149
|
+
"type": "string",
|
|
150
|
+
"formatFR": "code_commune_insee",
|
|
151
|
+
"constraints": {
|
|
152
|
+
"required": False,
|
|
153
|
+
"pattern": "^([013-9]\\d|2[AB1-9])\\d{3}$",
|
|
154
|
+
}
|
|
155
|
+
}
|
|
156
|
+
]
|
|
157
|
+
}
|
|
158
|
+
}
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
The output slightly differs depending on the file format:
|
|
162
|
+
- csv files have `encoding` and `separator` (and `compression` if relevant)
|
|
163
|
+
- xls, xlsx, ods files have `engine` and `sheet_name`
|
|
164
|
+
|
|
165
|
+
You may also set `output_df` to `True`, in which case the output is a tuple of two elements:
|
|
166
|
+
- the analysis (as described above)
|
|
167
|
+
- an iterator of `pd.DataFrame`s which contain the columns cast with the detected types (which can be used with `pd.concat` or in a loop):
|
|
168
|
+
```python
|
|
169
|
+
inspection, df_chunks = routine(
|
|
170
|
+
file_path=file_path,
|
|
171
|
+
num_rows=-1,
|
|
172
|
+
output_df=True,
|
|
173
|
+
)
|
|
174
|
+
cast_df = pd.concat(df_chunks, ignore_index=True)
|
|
175
|
+
# if "col1" has been detected as a float, then cast_df["col1"] contains floats
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
### What Formats Can Be Detected
|
|
179
|
+
|
|
180
|
+
Includes :
|
|
181
|
+
- types (float, int, dates, datetimes, JSON) and more specific ones (latitude, longitude, geoJSON...)
|
|
182
|
+
- Communes, Départements, Régions, Pays
|
|
183
|
+
- Codes Communes, Codes Postaux, Codes Departement, ISO Pays
|
|
184
|
+
- Codes CSP, Description CSP, SIREN
|
|
185
|
+
- E-Mails, URLs, Téléphones FR
|
|
186
|
+
- Years, Dates, Jours de la Semaine FR
|
|
187
|
+
- UUIDs, Mongo ObjectIds
|
|
188
|
+
|
|
189
|
+
### Validation
|
|
190
|
+
If you have a pre-made analysis of a file, you can check whether another file conforms to the same analysis:
|
|
191
|
+
```python
|
|
192
|
+
from csv_detective import validate
|
|
193
|
+
is_valid, *_ = validate(
|
|
194
|
+
file_path,
|
|
195
|
+
previous_analysis, # exactly as it came out from the routine function
|
|
196
|
+
)
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
### Format detection and scoring
|
|
200
|
+
For each column, 3 scores are computed for each format; the higher the score, the more likely the format:
|
|
201
|
+
- the field score based on the values contained in the column (0.0 to 1.0).
|
|
202
|
+
- the label score based on the header of the column (0.0 to 1.0).
|
|
203
|
+
- the overall score, computed as `field_score * (1 + label_score/2)` (0.0 to 1.5).
|
|
204
|
+
|
|
205
|
+
The overall score computation aims to give more weight to the column contents while
|
|
206
|
+
still leveraging the column header.
|
|
207
|
+
|
|
208
|
+
#### `limited_output` - Select the output mode you want for json report
|
|
209
|
+
|
|
210
|
+
This option allows you to select the output mode of the report. To do so, pass a `limited_output` argument to the `routine` function. This argument has two possible values:
|
|
211
|
+
|
|
212
|
+
- `limited_output` defaults to `True`, which means the report will contain only the detected column formats, based on a pre-selected threshold proportion in the data. The report is the standard output (an example can be found above in the 'Output' section).
|
|
213
|
+
Only the format with highest score is present in the output.
|
|
214
|
+
- `limited_output=False` means the report will contain the full list of all possible column formats for each input column, each with an associated value that matches the proportion of that column type found in the data. With this report, users can adjust their detection rules based on a specific threshold and get a better view of the detection quality for each column. Results can also easily be transformed into a dataframe (column types in columns / column names in rows) for analysis and testing.
|
|
215
|
+
|
|
216
|
+
## Improvement suggestions
|
|
217
|
+
|
|
218
|
+
- Smarter refactors
|
|
219
|
+
- Performance improvements
|
|
220
|
+
- Test other ways to load and process data (`pandas` alternatives)
|
|
221
|
+
- Add more and more detection modules...
|
|
222
|
+
|
|
223
|
+
Related ideas:
|
|
224
|
+
|
|
225
|
+
- store column names to build a learning model based on column names (for a possible pre-screen)
|
|
226
|
+
- entity resolution (good luck...)
|
|
227
|
+
|
|
228
|
+
## Why Could This Be of Any Use?
|
|
229
|
+
|
|
230
|
+
Organisations such as [data.gouv.fr](http://data.gouv.fr) aggregate huge amounts of un-normalised data. Performing cross-examination across datasets can be difficult. This tool could help enrich the datasets' metadata and facilitate linking them together.
|
|
231
|
+
|
|
232
|
+
[`udata-hydra`](https://github.com/etalab/udata-hydra) is a crawler that checks, analyzes (using `csv-detective`) and APIfies all tabular files from [data.gouv.fr](http://data.gouv.fr).
|
|
233
|
+
|
|
234
|
+
An early version of this analysis of all resources on data.gouv.fr can be found [here](https://github.com/Leobouloc/data.gouv-exploration).
|
|
235
|
+
|
|
236
|
+
## Linting
|
|
237
|
+
|
|
238
|
+
Remember to format, lint, and sort imports with [Ruff](https://docs.astral.sh/ruff/) before committing (checks will remind you anyway):
|
|
239
|
+
```bash
|
|
240
|
+
pip install .[dev]
|
|
241
|
+
ruff check --fix .
|
|
242
|
+
ruff format .
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
### 🏷️ Release
|
|
246
|
+
|
|
247
|
+
The release process uses the [`tag_version.sh`](tag_version.sh) script to create git tags and update [CHANGELOG.md](CHANGELOG.md) and [pyproject.toml](pyproject.toml) automatically.
|
|
248
|
+
|
|
249
|
+
**Prerequisites**: [GitHub CLI](https://cli.github.com/) (`gh`) must be installed and authenticated, and you must be on the main branch with a clean working directory.
|
|
250
|
+
|
|
251
|
+
```bash
|
|
252
|
+
# Create a new release
|
|
253
|
+
./tag_version.sh <version>
|
|
254
|
+
|
|
255
|
+
# Example
|
|
256
|
+
./tag_version.sh 2.5.0
|
|
257
|
+
|
|
258
|
+
# Dry run to see what would happen
|
|
259
|
+
./tag_version.sh 2.5.0 --dry-run
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
The script automatically:
|
|
263
|
+
- Updates the version in `pyproject.toml`
|
|
264
|
+
- Extracts commits since the last tag and formats them for `CHANGELOG.md`
|
|
265
|
+
- Identifies breaking changes (commits with `!:` in the subject)
|
|
266
|
+
- Creates a git tag and pushes it to the remote repository
|
|
267
|
+
- Creates a GitHub release with the changelog content
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
csv_detective/__init__.py,sha256=zlYElTOp_I2_VG7ZdOTuAu0wuCXSc0cr3sH6gtk2bcg,152
|
|
2
|
+
csv_detective/cli.py,sha256=mu5anmBmaDk52_uZGiA4T37wYZCuV43gZAepjs1Cqzc,1389
|
|
3
|
+
csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
+
csv_detective/detection/columns.py,sha256=_JtZHBr3aoEmSWh2xVe2ISnt-G7hpnA9vqlvcaGd0Go,2887
|
|
5
|
+
csv_detective/detection/encoding.py,sha256=KZ8W8BPfZAq9UiP5wgaeupYa5INU8KPz98E2L3XpX2Y,999
|
|
6
|
+
csv_detective/detection/engine.py,sha256=wQeDKpp2DKF-HcS1R8H6GgQyaUgQme4szPtEHgAjBII,1552
|
|
7
|
+
csv_detective/detection/formats.py,sha256=kQEht5lr9hFhYe0Zn1lfj9jOKaqYrXNrM_tkQX24pEk,5410
|
|
8
|
+
csv_detective/detection/headers.py,sha256=95pTL524Sy5PGxyQ03ofFUaamvlmkxTJQe8u6HfzOkU,1051
|
|
9
|
+
csv_detective/detection/rows.py,sha256=quf3ZTTFPOo09H-faZ9cRKibb1QGHEKHlpivFRx2Va4,742
|
|
10
|
+
csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
|
|
11
|
+
csv_detective/detection/variables.py,sha256=-QtZOB96z3pWbqnZ-c1RU3yzoYqcO61A0JzeS6JbkxY,3576
|
|
12
|
+
csv_detective/explore_csv.py,sha256=-LCHr7vyT0Q0oLtXeOO8pEevJ6-8Ib9JP3D7nVgZM8o,7090
|
|
13
|
+
csv_detective/format.py,sha256=XX_cSTQc0jlsQq3GUqHi7Cz36AiRrpjrwPmeoOTLMvo,2396
|
|
14
|
+
csv_detective/formats/__init__.py,sha256=Egiy29kcG3Oz2eE2maYhD3wP29zOSOWyRlOpGD5LGvU,318
|
|
15
|
+
csv_detective/formats/adresse.py,sha256=jALDpEDAWyAcgqEfNVRg_W1r6XaYuJKD_jAaP2l-bxk,1943
|
|
16
|
+
csv_detective/formats/binary.py,sha256=OCGRDh5p27sA4yjrpKIp3b2_PfHJYUe5QxIArf-fCxA,676
|
|
17
|
+
csv_detective/formats/booleen.py,sha256=AnDDKShkSYpWO4POhwY2V7_C4yPWbmqBu8CJPgQ9Gwc,648
|
|
18
|
+
csv_detective/formats/code_commune_insee.py,sha256=MhwCPVAhwWH-MyaNAIVRNbqKfeNe3oiCpzEGfpHkpJY,504
|
|
19
|
+
csv_detective/formats/code_csp_insee.py,sha256=_JQ-YbnHMenNnwIg1xBmNVqgCa1tLD2hbPN1soODhDk,656
|
|
20
|
+
csv_detective/formats/code_departement.py,sha256=odwVbmktgjEhL-dSFHXuCRVwhkF8bL8G7VlpVTnMY2A,628
|
|
21
|
+
csv_detective/formats/code_fantoir.py,sha256=nFVFYJEP2HHE2TyhR_dhGdPCMLfCROBO_B8wxwQn7T8,366
|
|
22
|
+
csv_detective/formats/code_import.py,sha256=N5NVvnHkRwC7ARHoM77R-2cYSeyNmPoRIn6JL3Fbnjs,346
|
|
23
|
+
csv_detective/formats/code_postal.py,sha256=C6XMkiVTxhMFvfyvJmGp3iwvh722EzMwD_UdqQU4aR0,427
|
|
24
|
+
csv_detective/formats/code_region.py,sha256=VFKh1rGYVYTNWBJZ2_m0xS4rhJlrI_Gr8q8RXuZCr-w,366
|
|
25
|
+
csv_detective/formats/code_rna.py,sha256=WExlQtlAUfOFT4N3MKsMBhZVxTdNzgexFjmXhZdRM1w,512
|
|
26
|
+
csv_detective/formats/code_waldec.py,sha256=kJEJfikbhMfVwtA8hBpup0tpeSFoY_rWrEdXQxgNwhg,297
|
|
27
|
+
csv_detective/formats/commune.py,sha256=oVpwINGqpwMOT43KkasozipJ9hBeoQ5FrKV_wIeVJGE,532
|
|
28
|
+
csv_detective/formats/csp_insee.py,sha256=HE6NK6Sw91mLFeAAKwWUXZZfXX6fiA0zK4RI4YdkUFY,656
|
|
29
|
+
csv_detective/formats/data/csp_insee.txt,sha256=kgKaKc-5PHu5U4--ugLjpFyMNtTU9CGdZ9ANU3YAsM4,32879
|
|
30
|
+
csv_detective/formats/data/insee_ape700.txt,sha256=nKgslakENwgE7sPkVNHqR23iXuxF02p9-v5MC2_ntx8,4398
|
|
31
|
+
csv_detective/formats/data/iso_country_code_alpha2.txt,sha256=YyPlDqCdz65ecf4Wes_r0P4rDSJG35niXtjc4MmctXM,1740
|
|
32
|
+
csv_detective/formats/data/iso_country_code_alpha3.txt,sha256=aYqKSohgXuBtcIBfF52f8JWYDdxL_HV_Ol1srGnWBp4,1003
|
|
33
|
+
csv_detective/formats/data/iso_country_code_numeric.txt,sha256=2GtEhuporsHYV-pU4q9kfXU5iOtfW5C0GYBTTKQtnnA,1004
|
|
34
|
+
csv_detective/formats/date.py,sha256=X4ohXaFO8cXPJktUSumc3bfdlbDIWEYTG8S9ugVRcsE,2730
|
|
35
|
+
csv_detective/formats/date_fr.py,sha256=3hTw5RommrhcgECFRSt9KgyB9zyi1j4W3UygEHmRgoE,502
|
|
36
|
+
csv_detective/formats/datetime_aware.py,sha256=-1ZBix6vYlYXTvhXrijP-98AN7iPB0x_DbbwU1QjMCI,1470
|
|
37
|
+
csv_detective/formats/datetime_naive.py,sha256=nvA8qT1fb2RmpXN5_Cw9YZA6pC4BryX_B0V-E6O2UbU,1521
|
|
38
|
+
csv_detective/formats/datetime_rfc822.py,sha256=l-SLb34hSuHxC2JQ-9SD-nG38JqzoozwUZiGtoybb0A,601
|
|
39
|
+
csv_detective/formats/departement.py,sha256=UP9UF23BFq_-mIS8N10K5XkoCXwPmDeSoa_7lCAkI4w,768
|
|
40
|
+
csv_detective/formats/email.py,sha256=Qen2EBDYY5TtWXwxrrTGWRrbIybz0ySlVpl4ZRk8pzA,517
|
|
41
|
+
csv_detective/formats/float.py,sha256=tWs_tW64OuacNQENu3uk5GOEVQMQls2iiteFOacQRAQ,832
|
|
42
|
+
csv_detective/formats/geojson.py,sha256=udbBxCBRmb0o6TD8z5ryemfqdinBz6njNJU0XcbfMig,757
|
|
43
|
+
csv_detective/formats/insee_ape700.py,sha256=cLs3Eersqm4wX6oqsqp0Vb3WGPJb2xY5Za_vh0uLgKc,780
|
|
44
|
+
csv_detective/formats/insee_canton.py,sha256=Q5jczsOmh1wPP2KtDkcmqZ7Hlv50Zz9YvPIbxy46qs0,531
|
|
45
|
+
csv_detective/formats/int.py,sha256=ZBUOn50luMtlNKWPyOaMIkY3J4f4hA0MqwcoFtksozU,482
|
|
46
|
+
csv_detective/formats/iso_country_code_alpha2.py,sha256=vIep_j0xuqlXKyuvk8c8GaJC73HuJqKfQ4QzQKHsPc0,613
|
|
47
|
+
csv_detective/formats/iso_country_code_alpha3.py,sha256=yOmm91O8ot6KoUBfss5cqykDfeeMNCwafDAvPNvbufA,668
|
|
48
|
+
csv_detective/formats/iso_country_code_numeric.py,sha256=989ypOmjIrNTV9vFnrBlbpRWQ9whd3Rv9gNasdF_O4g,685
|
|
49
|
+
csv_detective/formats/jour_de_la_semaine.py,sha256=c5QBw9eZfwRs_jL_Ckm95UH-TxlExdFmfZNYW7-_iZI,606
|
|
50
|
+
csv_detective/formats/json.py,sha256=E-s7IHW0q5WgAJVK0I-5Rv7W_RdofROB5wnIXbNegZQ,446
|
|
51
|
+
csv_detective/formats/latitude_l93.py,sha256=GteGpxAht-jeOBLr_deCuEXA_LliVYIAmyr_7jFAWgI,986
|
|
52
|
+
csv_detective/formats/latitude_wgs.py,sha256=HPcFlLzJNqynLugDQ07vO04rOCNBuAabVJEP8FQ89Q0,780
|
|
53
|
+
csv_detective/formats/latitude_wgs_fr_metropole.py,sha256=ruGzQLJPiMV2AlnsBneQIhMzstseddzWA0bDg5gfTG4,791
|
|
54
|
+
csv_detective/formats/latlon_wgs.py,sha256=CbNi4Y-ZgBfNyYi54xwcZGLpEusiLAWVpFP1YgHtI1M,1224
|
|
55
|
+
csv_detective/formats/longitude_l93.py,sha256=vJE4k_DyQOjAruqu_Q0E2sJKZB4mXGGN6bS9WCelsbs,768
|
|
56
|
+
csv_detective/formats/longitude_wgs.py,sha256=DUZCUxJQl53HHVQbXlz_lWXoAZhy3MvJWcPNdiK5cCM,552
|
|
57
|
+
csv_detective/formats/longitude_wgs_fr_metropole.py,sha256=wPlJP06K0BVWfrx1wwEAKK93AKIqvsuw705gKAlWAfQ,550
|
|
58
|
+
csv_detective/formats/lonlat_wgs.py,sha256=BgtTl2ReI0hSQB-7mcR4TDxx-QzvA1B9fiZWxTb5xPI,1005
|
|
59
|
+
csv_detective/formats/mois_de_lannee.py,sha256=4_mmdr9S83utVCgPaK_epkeBm2mhwdUWQEoB_Fhdh2o,759
|
|
60
|
+
csv_detective/formats/money.py,sha256=HpjrmfUmbG8sXF557XbYzQ7TLtpNVRgpC991gGokO8I,414
|
|
61
|
+
csv_detective/formats/mongo_object_id.py,sha256=XsiP4iMxfBBIeuL-4g5bm3jgS6yUMJC2X5CmrEJ40oI,296
|
|
62
|
+
csv_detective/formats/pays.py,sha256=FRvoQwIWiKbm0RC62Sus1X0Y_yJ-cfvdB5RYhkY-4NY,693
|
|
63
|
+
csv_detective/formats/percent.py,sha256=s6eQBMwJr2uyTZMUCK1_ifA0c4Rt2iEe9_E_hKKU_mk,308
|
|
64
|
+
csv_detective/formats/region.py,sha256=CkN7JTsZB1X3bH5xohbtMCxL5BX9MSpith36_1mHMd4,1483
|
|
65
|
+
csv_detective/formats/sexe.py,sha256=yioD4W6EkgUgo74rxn6KLZtN_0XYXtmA4mqVyI7e1mU,387
|
|
66
|
+
csv_detective/formats/siren.py,sha256=ieLe50vdSnkXadcUI8VXnnId9GFGHyIBWVTP6bJtyMo,758
|
|
67
|
+
csv_detective/formats/siret.py,sha256=ehkZgOH-HggN6IgxF4G0DMut_6giZ3gc4g9wMdwZFHQ,997
|
|
68
|
+
csv_detective/formats/tel_fr.py,sha256=yKCqIlqKO2yKucCoCjYfSjqNKfTjqFcmNXxg6THG0WE,624
|
|
69
|
+
csv_detective/formats/uai.py,sha256=uT5gjdTmoFH9QPZdTFkJgiyuKLW0B6KmT6yqHQeaeOU,711
|
|
70
|
+
csv_detective/formats/url.py,sha256=j6tCbcEzQw7U53ixeeFfhzueN8syVgQsjmAmY7RRWdU,1049
|
|
71
|
+
csv_detective/formats/username.py,sha256=y38OggfWpEQsGi0JnD9QRM30musa29lO6nz-qybR24U,249
|
|
72
|
+
csv_detective/formats/uuid.py,sha256=ekMEFfzQtz0cLudzmu3AoCM0Yf5pu23qAcFNFgHWJ1A,346
|
|
73
|
+
csv_detective/formats/year.py,sha256=pkAfYPKZdy0g1ZoHGgJNpgTS5y5weGEKXCVMGaxIX8k,472
|
|
74
|
+
csv_detective/output/__init__.py,sha256=ALSq_tgX7rGyh--7rmbKz8wHkmResN0h7mNujndow3w,2103
|
|
75
|
+
csv_detective/output/dataframe.py,sha256=Hnd-AY51U0JMACcpuaK9wwO4oCX9Nd7ZLUTqavgJWRA,3406
|
|
76
|
+
csv_detective/output/example.py,sha256=8LWheSBYCeDFfarbnmzBrdCbTd8Alh1U4pfXMKfabOw,8630
|
|
77
|
+
csv_detective/output/profile.py,sha256=VUQp0VJ22dfY4R5TybTpuQW_TOX_rLEp98cOzu-Jf44,4876
|
|
78
|
+
csv_detective/output/schema.py,sha256=XoKljXPXP00DfqPCiz1ydwTHYGAFsvNxnaPCNBuuBIo,10443
|
|
79
|
+
csv_detective/output/utils.py,sha256=tbji3dEH7bDc6gLCeVSVquqU3xaHA1CQOMuaJT4Hub8,3297
|
|
80
|
+
csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
81
|
+
csv_detective/parsing/columns.py,sha256=nihmB7Cv5BUNPh2EhMRPLdAxvcjrGZF-QFbJDd6rR2M,9246
|
|
82
|
+
csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
|
|
83
|
+
csv_detective/parsing/csv.py,sha256=0T0gpaXzwJo-sq41IoLQD704GiMUYeDVVASVbat-zWg,1726
|
|
84
|
+
csv_detective/parsing/excel.py,sha256=oAVTuoDccJc4-kVjHXiIPLQx3lq3aZRRZQxkG1c06JQ,6992
|
|
85
|
+
csv_detective/parsing/load.py,sha256=f-8aKiNpy_47qg4Lq-UZUR4NNrbJ_-KEGvcUQZ8cmb0,4317
|
|
86
|
+
csv_detective/parsing/text.py,sha256=uz8wfmNTQnOd_4fjrIZ_5rxmFmgrg343hJh2szB73Hc,1770
|
|
87
|
+
csv_detective/utils.py,sha256=RJ_zFOJ1DRY8HtDrKPiCdNk5gU6-KwOrOKOyfSkBZZY,1118
|
|
88
|
+
csv_detective/validate.py,sha256=XldlbGkUlPaIh0y4z9iaWlmmahwCrD1900s5Cxlq5wI,5430
|
|
89
|
+
csv_detective-0.9.3.dev2438.dist-info/WHEEL,sha256=z-mOpxbJHqy3cq6SvUThBZdaLGFZzdZPtgWLcP2NKjQ,79
|
|
90
|
+
csv_detective-0.9.3.dev2438.dist-info/entry_points.txt,sha256=1J86TQNCanjsLMboAufdEUla03qEQaC9QmVGYgt2FCQ,57
|
|
91
|
+
csv_detective-0.9.3.dev2438.dist-info/METADATA,sha256=FHlHT6UPBByKisTmFR5TImWGG8rXSKiTP_lc7-yHRDU,11063
|
|
92
|
+
csv_detective-0.9.3.dev2438.dist-info/RECORD,,
|
csv_detective/all_packages.txt
DELETED
|
@@ -1,104 +0,0 @@
|
|
|
1
|
-
csv_detective
|
|
2
|
-
csv_detective.detect_fields
|
|
3
|
-
csv_detective.detect_fields.temp
|
|
4
|
-
csv_detective.detect_fields.other
|
|
5
|
-
csv_detective.detect_fields.geo
|
|
6
|
-
csv_detective.detect_fields.FR
|
|
7
|
-
csv_detective.detect_fields.temp.date
|
|
8
|
-
csv_detective.detect_fields.temp.year
|
|
9
|
-
csv_detective.detect_fields.other.email
|
|
10
|
-
csv_detective.detect_fields.other.mongo_object_id
|
|
11
|
-
csv_detective.detect_fields.other.uuid
|
|
12
|
-
csv_detective.detect_fields.other.url
|
|
13
|
-
csv_detective.detect_fields.geo.iso_country_code_alpha2
|
|
14
|
-
csv_detective.detect_fields.geo.iso_country_code_alpha3
|
|
15
|
-
csv_detective.detect_fields.geo.iso_country_code_numeric
|
|
16
|
-
csv_detective.detect_fields.FR.temp
|
|
17
|
-
csv_detective.detect_fields.FR.other
|
|
18
|
-
csv_detective.detect_fields.FR.geo
|
|
19
|
-
csv_detective.detect_fields.FR.temp.jour_de_la_semaine
|
|
20
|
-
csv_detective.detect_fields.FR.other.csp_insee
|
|
21
|
-
csv_detective.detect_fields.FR.other.tel_fr
|
|
22
|
-
csv_detective.detect_fields.FR.other.siren
|
|
23
|
-
csv_detective.detect_fields.FR.other.code_csp_insee
|
|
24
|
-
csv_detective.detect_fields.FR.other.sexe
|
|
25
|
-
csv_detective.detect_fields.FR.geo.pays
|
|
26
|
-
csv_detective.detect_fields.FR.geo.code_departement
|
|
27
|
-
csv_detective.detect_fields.FR.geo.adresse
|
|
28
|
-
csv_detective.detect_fields.FR.geo.code_commune_insee
|
|
29
|
-
csv_detective.detect_fields.FR.geo.commune
|
|
30
|
-
csv_detective.detect_fields.FR.geo.region
|
|
31
|
-
csv_detective.detect_fields.FR.geo.code_postal
|
|
32
|
-
csv_detective.detect_fields.FR.geo.departement
|
|
33
|
-
csv_detective.detect_fields.FR.other.uai
|
|
34
|
-
csv_detective.detect_fields.FR.other.siret
|
|
35
|
-
csv_detective.detect_fields.geo.latitude_wgs
|
|
36
|
-
csv_detective.detect_fields.geo.longitude_wgs
|
|
37
|
-
csv_detective.detect_fields.geo.latlon_wgs
|
|
38
|
-
csv_detective.detect_fields.geo.json_geojson
|
|
39
|
-
csv_detective.detect_fields.FR.geo.code_fantoir
|
|
40
|
-
csv_detective.detect_fields.FR.other.insee_ape700
|
|
41
|
-
csv_detective.detect_fields.temp.datetime_iso
|
|
42
|
-
csv_detective.detect_fields.temp.datetime_rfc822
|
|
43
|
-
csv_detective.detect_fields.FR.geo.latitude_wgs_fr_metropole
|
|
44
|
-
csv_detective.detect_fields.FR.geo.longitude_wgs_fr_metropole
|
|
45
|
-
csv_detective.detect_fields.FR.geo.code_region
|
|
46
|
-
csv_detective.detect_fields.other.booleen
|
|
47
|
-
csv_detective.detect_fields.other.twitter
|
|
48
|
-
csv_detective.detect_fields.other.float
|
|
49
|
-
csv_detective.detect_fields.other.int
|
|
50
|
-
csv_detective.detect_fields.other.json
|
|
51
|
-
csv_detective.detect_fields.FR.geo.latitude_l93
|
|
52
|
-
csv_detective.detect_fields.FR.geo.longitude_l93
|
|
53
|
-
csv_detective.detect_fields.FR.geo.insee_canton
|
|
54
|
-
csv_detective.detect_fields.FR.other.date_fr
|
|
55
|
-
csv_detective.detect_fields.FR.other.code_waldec
|
|
56
|
-
csv_detective.detect_fields.FR.other.code_rna
|
|
57
|
-
csv_detective.detect_labels.FR.geo.adresse
|
|
58
|
-
csv_detective.detect_labels.FR.geo.code_commune_insee
|
|
59
|
-
csv_detective.detect_labels.FR.geo.code_departement
|
|
60
|
-
csv_detective.detect_labels.FR.geo.code_fantoir
|
|
61
|
-
csv_detective.detect_labels.FR.geo.code_postal
|
|
62
|
-
csv_detective.detect_labels.FR.geo.code_region
|
|
63
|
-
csv_detective.detect_labels.FR.geo.commune
|
|
64
|
-
csv_detective.detect_labels.FR.geo.departement
|
|
65
|
-
csv_detective.detect_labels.FR.geo.insee_canton
|
|
66
|
-
csv_detective.detect_labels.FR.geo.latitude_l93
|
|
67
|
-
csv_detective.detect_labels.FR.geo.latitude_wgs_fr_metropole
|
|
68
|
-
csv_detective.detect_labels.FR.geo.longitude_l93
|
|
69
|
-
csv_detective.detect_labels.FR.geo.longitude_wgs_fr_metropole
|
|
70
|
-
csv_detective.detect_labels.FR.geo.pays
|
|
71
|
-
csv_detective.detect_labels.FR.geo.region
|
|
72
|
-
csv_detective.detect_labels.FR.other.code_csp_insee
|
|
73
|
-
csv_detective.detect_labels.FR.other.code_rna
|
|
74
|
-
csv_detective.detect_labels.FR.other.code_waldec
|
|
75
|
-
csv_detective.detect_labels.FR.other.csp_insee
|
|
76
|
-
csv_detective.detect_labels.FR.other.date_fr
|
|
77
|
-
csv_detective.detect_labels.FR.other.insee_ape700
|
|
78
|
-
csv_detective.detect_labels.FR.other.sexe
|
|
79
|
-
csv_detective.detect_labels.FR.other.siren
|
|
80
|
-
csv_detective.detect_labels.FR.other.siret
|
|
81
|
-
csv_detective.detect_labels.FR.other.tel_fr
|
|
82
|
-
csv_detective.detect_labels.FR.other.uai
|
|
83
|
-
csv_detective.detect_labels.FR.temp.jour_de_la_semaine
|
|
84
|
-
csv_detective.detect_labels.FR.temp.mois_de_annee
|
|
85
|
-
csv_detective.detect_labels.geo.iso_country_code_alpha2
|
|
86
|
-
csv_detective.detect_labels.geo.iso_country_code_alpha3
|
|
87
|
-
csv_detective.detect_labels.geo.iso_country_code_numeric
|
|
88
|
-
csv_detective.detect_labels.geo.json_geojson
|
|
89
|
-
csv_detective.detect_labels.geo.latitude_wgs
|
|
90
|
-
csv_detective.detect_labels.geo.latlon_wgs
|
|
91
|
-
csv_detective.detect_labels.geo.longitude_wgs
|
|
92
|
-
csv_detective.detect_labels.other.booleen
|
|
93
|
-
csv_detective.detect_labels.other.email
|
|
94
|
-
csv_detective.detect_labels.other.mongo_object_id
|
|
95
|
-
csv_detective.detect_labels.other.uuid
|
|
96
|
-
csv_detective.detect_labels.other.float
|
|
97
|
-
csv_detective.detect_labels.other.int
|
|
98
|
-
csv_detective.detect_labels.other.money
|
|
99
|
-
csv_detective.detect_labels.other.twitter
|
|
100
|
-
csv_detective.detect_labels.other.url
|
|
101
|
-
csv_detective.detect_labels.temp.date
|
|
102
|
-
csv_detective.detect_labels.temp.datetime_iso
|
|
103
|
-
csv_detective.detect_labels.temp.datetime_rfc822
|
|
104
|
-
csv_detective.detect_labels.temp.year
|
|
@@ -1,100 +0,0 @@
|
|
|
1
|
-
from csv_detective.process_text import _process_text
|
|
2
|
-
|
|
3
|
-
PROPORTION = 0.55
|
|
4
|
-
# ajouts d'espaces en fin de mots pour s'assurer que le str n'est pas juste une substr d'un mot plus long
|
|
5
|
-
voies = {
|
|
6
|
-
'aire ',
|
|
7
|
-
'allee ',
|
|
8
|
-
'avenue ',
|
|
9
|
-
'base ',
|
|
10
|
-
'boulevard ',
|
|
11
|
-
'cami ',
|
|
12
|
-
'carrefour ',
|
|
13
|
-
'chemin ',
|
|
14
|
-
'cheminement ',
|
|
15
|
-
'chaussee ',
|
|
16
|
-
'cite ',
|
|
17
|
-
'clos ',
|
|
18
|
-
'coin ',
|
|
19
|
-
'corniche ',
|
|
20
|
-
'cote ',
|
|
21
|
-
'cour ',
|
|
22
|
-
'cours ',
|
|
23
|
-
'domaine ',
|
|
24
|
-
'descente ',
|
|
25
|
-
'ecart ',
|
|
26
|
-
'esplanade ',
|
|
27
|
-
'faubourg ',
|
|
28
|
-
'gare ',
|
|
29
|
-
'grande rue',
|
|
30
|
-
'hameau ',
|
|
31
|
-
'halle ',
|
|
32
|
-
'ilot ',
|
|
33
|
-
'impasse ',
|
|
34
|
-
'lieu dit',
|
|
35
|
-
'lotissement ',
|
|
36
|
-
'marche ',
|
|
37
|
-
'montee ',
|
|
38
|
-
'parc ',
|
|
39
|
-
'passage ',
|
|
40
|
-
'place ',
|
|
41
|
-
'plan ',
|
|
42
|
-
'plaine ',
|
|
43
|
-
'plateau ',
|
|
44
|
-
'pont ',
|
|
45
|
-
'port ',
|
|
46
|
-
'promenade ',
|
|
47
|
-
'parvis ',
|
|
48
|
-
'quartier ',
|
|
49
|
-
'quai ',
|
|
50
|
-
'residence ',
|
|
51
|
-
'ruelle ',
|
|
52
|
-
'rocade ',
|
|
53
|
-
'rond point',
|
|
54
|
-
'route ',
|
|
55
|
-
'rue ',
|
|
56
|
-
# 'sente - sentier',
|
|
57
|
-
'square ',
|
|
58
|
-
'tour ',
|
|
59
|
-
# 'terre-plein',
|
|
60
|
-
'traverse ',
|
|
61
|
-
'villa ',
|
|
62
|
-
'village ',
|
|
63
|
-
'voie ',
|
|
64
|
-
'zone artisanale',
|
|
65
|
-
'zone d’amenagement concerte',
|
|
66
|
-
'zone d’amenagement differe',
|
|
67
|
-
'zone industrielle',
|
|
68
|
-
'zone ',
|
|
69
|
-
# 'r',
|
|
70
|
-
'av ',
|
|
71
|
-
'pl ',
|
|
72
|
-
'bd ',
|
|
73
|
-
'cami ',
|
|
74
|
-
# 'che',
|
|
75
|
-
'chs ',
|
|
76
|
-
'dom ',
|
|
77
|
-
'ham ',
|
|
78
|
-
'ld ',
|
|
79
|
-
# 'pro',
|
|
80
|
-
# 'rte',
|
|
81
|
-
'vlge ',
|
|
82
|
-
'za ',
|
|
83
|
-
'zac ',
|
|
84
|
-
'zad ',
|
|
85
|
-
'zi ',
|
|
86
|
-
# 'car',
|
|
87
|
-
'fg ',
|
|
88
|
-
# 'lot',
|
|
89
|
-
'imp ',
|
|
90
|
-
# 'qu',
|
|
91
|
-
'mte'
|
|
92
|
-
}
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
def _is(val):
|
|
96
|
-
'''Repere des adresses'''
|
|
97
|
-
if len(val) > 150:
|
|
98
|
-
return False
|
|
99
|
-
val = _process_text(val)
|
|
100
|
-
return any(x in val for x in voies)
|
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
from os.path import dirname, join
|
|
2
|
-
import re
|
|
3
|
-
|
|
4
|
-
PROPORTION = 0.75
|
|
5
|
-
f = open(join(dirname(__file__), 'code_commune_insee.txt'), 'r')
|
|
6
|
-
codes_insee = f.read().split('\n')
|
|
7
|
-
# removing empty str due to additionnal line in file
|
|
8
|
-
del codes_insee[-1]
|
|
9
|
-
codes_insee = set(codes_insee)
|
|
10
|
-
f.close()
|
|
11
|
-
# vérification de cohérence avec prise en compte corse 2A/2B et DOM (971-976 sauf 975)
|
|
12
|
-
regex = r'^([01345678][0-9]{4}|2[AB1-9][0-9]{3}|9([0-5][0-9]{3}|7[12346][0-9]{2}))$'
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
def _is(val):
|
|
16
|
-
'''Renvoie True si val peut être un code commune INSEE, False sinon'''
|
|
17
|
-
# test sur la longueur
|
|
18
|
-
if len(val) != 5:
|
|
19
|
-
return False
|
|
20
|
-
|
|
21
|
-
if not bool(re.match(regex, val)):
|
|
22
|
-
return False
|
|
23
|
-
|
|
24
|
-
return val in codes_insee
|