csv-detective 0.10.4.dev1__py3-none-any.whl → 0.10.2549__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/detection/__init__.py +0 -0
- csv_detective/detection/columns.py +0 -0
- csv_detective/detection/encoding.py +0 -0
- csv_detective/detection/engine.py +0 -0
- csv_detective/detection/formats.py +38 -13
- csv_detective/detection/headers.py +14 -12
- csv_detective/detection/rows.py +1 -1
- csv_detective/detection/separator.py +0 -0
- csv_detective/detection/variables.py +0 -0
- csv_detective/explore_csv.py +6 -18
- csv_detective/format.py +5 -12
- csv_detective/formats/__init__.py +0 -0
- csv_detective/formats/adresse.py +9 -9
- csv_detective/formats/binary.py +1 -2
- csv_detective/formats/booleen.py +2 -3
- csv_detective/formats/code_commune_insee.py +10 -12
- csv_detective/formats/code_csp_insee.py +1 -1
- csv_detective/formats/code_departement.py +7 -8
- csv_detective/formats/code_fantoir.py +5 -6
- csv_detective/formats/code_import.py +1 -1
- csv_detective/formats/code_postal.py +9 -10
- csv_detective/formats/code_region.py +6 -7
- csv_detective/formats/code_rna.py +6 -7
- csv_detective/formats/code_waldec.py +1 -1
- csv_detective/formats/commune.py +5 -5
- csv_detective/formats/csp_insee.py +5 -6
- csv_detective/formats/data/insee_ape700.txt +1 -1
- csv_detective/formats/data/iso_country_code_alpha2.txt +397 -153
- csv_detective/formats/data/iso_country_code_alpha3.txt +132 -132
- csv_detective/formats/data/iso_country_code_numeric.txt +94 -94
- csv_detective/formats/date.py +18 -28
- csv_detective/formats/date_fr.py +1 -1
- csv_detective/formats/datetime_aware.py +2 -7
- csv_detective/formats/datetime_naive.py +0 -3
- csv_detective/formats/datetime_rfc822.py +0 -1
- csv_detective/formats/departement.py +15 -15
- csv_detective/formats/email.py +13 -13
- csv_detective/formats/float.py +1 -2
- csv_detective/formats/geojson.py +10 -10
- csv_detective/formats/insee_ape700.py +8 -10
- csv_detective/formats/insee_canton.py +6 -6
- csv_detective/formats/int.py +1 -2
- csv_detective/formats/iso_country_code_alpha2.py +14 -14
- csv_detective/formats/iso_country_code_alpha3.py +13 -6
- csv_detective/formats/iso_country_code_numeric.py +9 -2
- csv_detective/formats/jour_de_la_semaine.py +12 -11
- csv_detective/formats/json.py +0 -6
- csv_detective/formats/latitude_l93.py +22 -8
- csv_detective/formats/latitude_wgs.py +29 -31
- csv_detective/formats/latitude_wgs_fr_metropole.py +30 -7
- csv_detective/formats/latlon_wgs.py +28 -30
- csv_detective/formats/longitude_l93.py +13 -8
- csv_detective/formats/longitude_wgs.py +19 -34
- csv_detective/formats/longitude_wgs_fr_metropole.py +19 -6
- csv_detective/formats/lonlat_wgs.py +11 -12
- csv_detective/formats/mois_de_lannee.py +1 -1
- csv_detective/formats/money.py +1 -1
- csv_detective/formats/mongo_object_id.py +1 -1
- csv_detective/formats/pays.py +13 -11
- csv_detective/formats/percent.py +1 -1
- csv_detective/formats/region.py +13 -13
- csv_detective/formats/sexe.py +1 -1
- csv_detective/formats/siren.py +10 -9
- csv_detective/formats/siret.py +9 -9
- csv_detective/formats/tel_fr.py +13 -7
- csv_detective/formats/uai.py +18 -17
- csv_detective/formats/url.py +16 -16
- csv_detective/formats/username.py +1 -1
- csv_detective/formats/uuid.py +1 -1
- csv_detective/formats/year.py +12 -7
- csv_detective/output/__init__.py +0 -0
- csv_detective/output/dataframe.py +3 -8
- csv_detective/output/example.py +0 -0
- csv_detective/output/profile.py +2 -6
- csv_detective/output/schema.py +0 -0
- csv_detective/output/utils.py +0 -0
- csv_detective/parsing/__init__.py +0 -0
- csv_detective/parsing/columns.py +5 -9
- csv_detective/parsing/compression.py +0 -0
- csv_detective/parsing/csv.py +0 -0
- csv_detective/parsing/excel.py +1 -1
- csv_detective/parsing/load.py +12 -11
- csv_detective/parsing/text.py +12 -13
- csv_detective/validate.py +36 -71
- {csv_detective-0.10.4.dev1.dist-info → csv_detective-0.10.2549.dist-info}/METADATA +18 -15
- csv_detective-0.10.2549.dist-info/RECORD +92 -0
- csv_detective-0.10.2549.dist-info/WHEEL +4 -0
- {csv_detective-0.10.4.dev1.dist-info → csv_detective-0.10.2549.dist-info}/entry_points.txt +1 -0
- csv_detective-0.10.4.dev1.dist-info/RECORD +0 -111
- csv_detective-0.10.4.dev1.dist-info/WHEEL +0 -5
- csv_detective-0.10.4.dev1.dist-info/licenses/LICENSE +0 -21
- csv_detective-0.10.4.dev1.dist-info/top_level.txt +0 -3
- tests/__init__.py +0 -0
- tests/data/a_test_file.csv +0 -407
- tests/data/a_test_file.json +0 -394
- tests/data/b_test_file.csv +0 -7
- tests/data/c_test_file.csv +0 -2
- tests/data/csv_file +0 -7
- tests/data/file.csv.gz +0 -0
- tests/data/file.ods +0 -0
- tests/data/file.xls +0 -0
- tests/data/file.xlsx +0 -0
- tests/data/xlsx_file +0 -0
- tests/test_example.py +0 -67
- tests/test_fields.py +0 -175
- tests/test_file.py +0 -469
- tests/test_labels.py +0 -26
- tests/test_structure.py +0 -45
- tests/test_validation.py +0 -163

csv_detective/detection/__init__.py
File without changes

csv_detective/detection/columns.py
File without changes

csv_detective/detection/encoding.py
File without changes

csv_detective/detection/engine.py
File without changes

csv_detective/detection/formats.py
CHANGED

@@ -11,7 +11,6 @@ from csv_detective.format import Format, FormatsManager
 from csv_detective.output.utils import prepare_output_dict
 from csv_detective.parsing.columns import (
     MAX_NUMBER_CATEGORICAL_VALUES,
-    handle_empty_columns,
     test_col,
     test_col_chunks,
     test_label,

@@ -50,7 +49,6 @@ def detect_formats(
         skipna=skipna,
         verbose=verbose,
     )
-    handle_empty_columns(scores_table_fields)
     res_categorical, _ = detect_categorical_variable(
         table,
         max_number_categorical_values=MAX_NUMBER_CATEGORICAL_VALUES,

@@ -84,7 +82,22 @@ def detect_formats(
     # To reduce false positives: ensure these formats are detected only if the label yields
     # a detection (skipping the ones that have been excluded by the users).
     formats_with_mandatory_label = [
-        f …
+        f
+        for f in [
+            "code_departement",
+            "code_commune_insee",
+            "code_postal",
+            "code_fantoir",
+            "latitude_wgs",
+            "longitude_wgs",
+            "latitude_wgs_fr_metropole",
+            "longitude_wgs_fr_metropole",
+            "latitude_l93",
+            "longitude_l93",
+            "siren",
+            "siret",
+        ]
+        if f in scores_table.index
     ]
     scores_table.loc[formats_with_mandatory_label, :] = np.where(
         scores_table_labels.loc[formats_with_mandatory_label, :],

@@ -93,16 +106,32 @@ def detect_formats(
     )
     analysis["columns"] = prepare_output_dict(scores_table, limited_output)

+    metier_to_python_type = {
+        "booleen": "bool",
+        "int": "int",
+        "float": "float",
+        "string": "string",
+        "json": "json",
+        "geojson": "json",
+        "datetime_aware": "datetime",
+        "datetime_naive": "datetime",
+        "datetime_rfc822": "datetime",
+        "date": "date",
+        "latitude_l93": "float",
+        "latitude_wgs": "float",
+        "latitude_wgs_fr_metropole": "float",
+        "longitude_l93": "float",
+        "longitude_wgs": "float",
+        "longitude_wgs_fr_metropole": "float",
+        "binary": "binary",
+    }
+
     if not limited_output:
         for detection_method in ["columns_fields", "columns_labels", "columns"]:
             analysis[detection_method] = {
                 col_name: [
                     {
-                        "python_type": (
-                            "string"
-                            if detection["format"] == "string"
-                            else fmtm.formats[detection["format"]].python_type
-                        ),
+                        "python_type": metier_to_python_type.get(detection["format"], "string"),
                         **detection,
                     }
                     for detection in detections

@@ -113,11 +142,7 @@ def detect_formats(
         for detection_method in ["columns_fields", "columns_labels", "columns"]:
             analysis[detection_method] = {
                 col_name: {
-                    "python_type": (
-                        "string"
-                        if detection["format"] == "string"
-                        else fmtm.formats[detection["format"]].python_type
-                    ),
+                    "python_type": metier_to_python_type.get(detection["format"], "string"),
                     **detection,
                 }
                 for col_name, detection in analysis[detection_method].items()
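
Note on the change above: python_type is no longer read from each Format object; it is resolved through the new metier_to_python_type dict, with "string" as the fallback for any format not listed. A minimal runnable sketch of that lookup (the dict is abridged from the hunk; the helper function is illustrative, not part of csv_detective):

# Abridged copy of the mapping introduced in detect_formats above.
metier_to_python_type = {
    "booleen": "bool",
    "int": "int",
    "float": "float",
    "json": "json",
    "geojson": "json",
    "datetime_aware": "datetime",
    "date": "date",
    "latitude_wgs": "float",
    "binary": "binary",
    # ... remaining entries as in the hunk
}

def python_type_for(detected_format: str) -> str:
    # Illustrative helper only: unknown formats fall back to "string",
    # exactly like the .get(..., "string") call in the diff.
    return metier_to_python_type.get(detected_format, "string")

print(python_type_for("geojson"))  # json
print(python_type_for("siret"))    # string (not in the mapping)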

csv_detective/detection/headers.py
CHANGED

@@ -5,22 +5,24 @@ from typing import TextIO
 from csv_detective.utils import display_logs_depending_process_time


-def …
+def detect_headers(file: TextIO, sep: str, verbose: bool = False) -> tuple[int, list | None]:
     """Tests 10 first rows for possible header (in case header is not 1st row)"""
     if verbose:
         start = time()
-        logging.info("Detecting …
+        logging.info("Detecting headers")
     file.seek(0)
     for i in range(10):
         row = file.readline()
         position = file.tell()
-        … (old lines 17-26 elided in this view)
+        headers = [c for c in row.replace("\n", "").split(sep) if c]
+        if not any(col == "" for col in headers):
+            next_row = file.readline()
+            file.seek(position)
+            if row != next_row:
+                if verbose:
+                    display_logs_depending_process_time(
+                        f"Detected headers in {round(time() - start, 3)}s",
+                        time() - start,
+                    )
+                return i, headers
+    raise ValueError("Could not retrieve headers")

csv_detective/detection/rows.py
CHANGED

@@ -2,7 +2,7 @@ import pandas as pd


 def remove_empty_first_rows(table: pd.DataFrame) -> tuple[pd.DataFrame, int]:
-    """Analog process to …
+    """Analog process to detect_headers for csv files, determines how many rows to skip
     to end up with the header at the right place"""
     idx = 0
     if all([str(c).startswith("Unnamed:") for c in table.columns]):

csv_detective/detection/separator.py
File without changes

csv_detective/detection/variables.py
File without changes

csv_detective/explore_csv.py
CHANGED

@@ -1,6 +1,5 @@
 import logging
 from time import time
-from typing import Iterator

 import pandas as pd

@@ -28,7 +27,7 @@ def routine(
     cast_json: bool = True,
     verbose: bool = False,
     sheet_name: str | int | None = None,
-) -> dict | tuple[dict, …
+) -> dict | tuple[dict, pd.DataFrame]:
     """
     Returns a dict with information about the table and possible column contents, and if requested the DataFrame with columns cast according to analysis.

@@ -116,7 +115,7 @@ def validate_then_detect(
     output_df: bool = False,
     cast_json: bool = True,
     verbose: bool = False,
-) …
+):
     """
     Performs a validation of the given file against the given analysis.
     If the validation fails, performs a full analysis and return it.

@@ -142,19 +141,20 @@ def validate_then_detect(
     if is_url(file_path):
         logging.info("Path recognized as a URL")

-    is_valid, analysis, col_values = validate(
+    is_valid, table, analysis, col_values = validate(
         file_path=file_path,
         previous_analysis=previous_analysis,
         verbose=verbose,
         skipna=skipna,
     )
-    if …
-        # if loading failed in validate, we load it from scratch
+    if analysis is None:
+        # if loading failed in validate, we load it from scratch
         table, analysis = load_file(
             file_path=file_path,
             num_rows=num_rows,
             verbose=verbose,
         )
+    if not is_valid:
         analysis, col_values = detect_formats(
             table=table,
             analysis=analysis,

@@ -164,18 +164,6 @@ def validate_then_detect(
             skipna=skipna,
             verbose=verbose,
         )
-    else:
-        # successful validation means we have a correct analysis and col_values
-        # only need to reload the table, and we already know how
-        table, _ = load_file(
-            file_path=file_path,
-            num_rows=num_rows,
-            verbose=verbose,
-            sep=analysis.get("separator"),
-            encoding=analysis.get("encoding"),
-            engine=analysis.get("engine"),
-            sheet_name=analysis.get("sheet_name"),
-        )
     try:
         return generate_output(
             table=table,

csv_detective/format.py
CHANGED

@@ -9,11 +9,9 @@ class Format:
         name: str,
         func: Callable[[Any], bool],
         _test_values: dict[bool, list[str]],
-        labels: …
+        labels: list[str] = [],
         proportion: float = 1,
         tags: list[str] = [],
-        mandatory_label: bool = False,
-        python_type: str = "string",
     ) -> None:
         """
         Instanciates a Format object.

@@ -22,18 +20,16 @@ class Format:
         name: the name of the format.
         func: the value test for the format (returns whether a string is valid).
         _test_values: lists of valid and invalid values, used in the tests
-        labels: the …
+        labels: the list of hint headers for the header score
         proportion: the tolerance (between 0 and 1) to say a column is valid for a format. (1 => 100% of the column has to pass the func check for the column to be considered valid)
         tags: to allow users to submit a file to only a subset of formats
         """
         self.name: str = name
-        self.func: Callable …
+        self.func: Callable = func
         self._test_values: dict[bool, list[str]] = _test_values
-        self.labels: …
+        self.labels: list[str] = labels
         self.proportion: float = proportion
         self.tags: list[str] = tags
-        self.mandatory_label: bool = mandatory_label
-        self.python_type: str = python_type

     def is_valid_label(self, val: str) -> float:
         return header_score(val, self.labels)

@@ -53,7 +49,7 @@ class FormatsManager:
                 _test_values=module._test_values,
                 **{
                     attr: val
-                    for attr in ["labels", "proportion", "tags" …
+                    for attr in ["labels", "proportion", "tags"]
                     if (val := getattr(module, attr, None))
                 },
             )

@@ -67,8 +63,5 @@ class FormatsManager:
             if all(tag in fmt.tags for tag in tags)
         }

-    def get_formats_with_mandatory_label(self) -> dict[str, Format]:
-        return {label: fmt for label, fmt in self.formats.items() if fmt.mandatory_label}
-
     def available_tags(self) -> set[str]:
         return set(tag for format in self.formats.values() for tag in format.tags)
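
A stripped-down sketch of the resulting constructor signature: mandatory_label and python_type are gone and labels is a plain list. This is a stand-in class with a hypothetical example format, not the library's Format:

from typing import Any, Callable

class FormatSketch:
    # Mirrors the slimmed-down signature shown in the hunk above.
    def __init__(
        self,
        name: str,
        func: Callable[[Any], bool],
        _test_values: dict[bool, list[str]],
        labels: list[str] = [],
        proportion: float = 1,
        tags: list[str] = [],
    ) -> None:
        self.name = name
        self.func = func
        self._test_values = _test_values
        self.labels = labels
        self.proportion = proportion
        self.tags = tags

# Hypothetical example format, for illustration only (not the real siren check).
siren = FormatSketch(
    name="siren",
    func=lambda v: v.isdigit() and len(v) == 9,
    _test_values={True: ["130025265"], False: ["not-a-siren"]},
    labels=["siren"],
    tags=["fr"],
)
print(siren.func("130025265"))  # True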

csv_detective/formats/__init__.py
File without changes

csv_detective/formats/adresse.py
CHANGED

@@ -2,15 +2,15 @@ from csv_detective.parsing.text import _process_text

 proportion = 0.55
 tags = ["fr", "geo"]
-labels = …
-    "adresse" …
-    "localisation" …
-    "adresse postale" …
-    "adresse geographique" …
-    "adr" …
-    "adresse complete" …
-    "adresse station" …
-…
+labels = [
+    "adresse",
+    "localisation",
+    "adresse postale",
+    "adresse geographique",
+    "adr",
+    "adresse complete",
+    "adresse station",
+]

 voies = {
     "aire ",
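
The labels list feeds the header score computed elsewhere in the package; header_score itself is not part of this diff, so the snippet below is a hypothetical stand-in that only illustrates how such a list could be matched against a column name:

# Illustrative only: not csv_detective's header_score implementation.
def naive_header_match(column_name: str, labels: list[str]) -> float:
    name = column_name.strip().lower().replace("_", " ")
    return 1.0 if name in labels else 0.0

labels = ["adresse", "localisation", "adresse postale"]
print(naive_header_match("Adresse_Postale", labels))  # 1.0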

csv_detective/formats/binary.py
CHANGED

csv_detective/formats/booleen.py
CHANGED

@@ -1,7 +1,6 @@
 proportion = 1
 tags = ["type"]
-
-labels = {"is ": 1, "has ": 1, "est ": 1}
+labels = ["is ", "has ", "est "]

 bool_mapping = {
     "1": True,

@@ -22,7 +21,7 @@ bool_mapping = {
 liste_bool = set(bool_mapping.keys())


-def bool_casting(val: str) -> bool …
+def bool_casting(val: str) -> bool:
     return bool_mapping.get(val.lower())
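
A quick sketch of the corrected bool_casting helper. Only the "1": True entry of bool_mapping is visible in the hunk above; the other keys below are illustrative guesses:

# Assumed subset of bool_mapping; only "1": True appears in the diff.
bool_mapping = {"1": True, "0": False, "true": True, "false": False}

def bool_casting(val: str) -> bool:
    # dict.get returns None for unmapped values, mirroring the diff's behaviour.
    return bool_mapping.get(val.lower())

print(bool_casting("TRUE"))  # True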

csv_detective/formats/code_commune_insee.py
CHANGED

@@ -2,18 +2,16 @@ from frformat import CodeCommuneInsee, Millesime

 proportion = 0.75
 tags = ["fr", "geo"]
-…
-…
-    "code …
-    "…
-    "…
-    "code commune" …
-    "…
-    "…
-    "…
-…
-    "code": 0.5,
-}
+labels = [
+    "code commune insee",
+    "code insee",
+    "codes insee",
+    "code commune",
+    "code insee commune",
+    "insee",
+    "code com",
+    "com",
+]

 _code_commune_insee = CodeCommuneInsee(Millesime.LATEST)

csv_detective/formats/code_departement.py
CHANGED

@@ -2,14 +2,13 @@ from frformat import Millesime, NumeroDepartement, Options

 proportion = 1
 tags = ["fr", "geo"]
-…
-…
-    "…
-    "…
-    "…
-    "…
-…
-}
+labels = [
+    "code departement",
+    "code_departement",
+    "dep",
+    "departement",
+    "dept",
+]

 _options = Options(
     ignore_case=True,

csv_detective/formats/code_fantoir.py
CHANGED

@@ -2,12 +2,11 @@ from frformat import CodeFantoir

 proportion = 1
 tags = ["fr", "geo"]
-…
-…
-    "…
-    "…
-…
-}
+labels = [
+    "cadastre1",
+    "code fantoir",
+    "fantoir",
+]

 _code_fantoir = CodeFantoir()

csv_detective/formats/code_postal.py
CHANGED

@@ -2,16 +2,15 @@ from frformat import CodePostal

 proportion = 0.9
 tags = ["fr", "geo"]
-…
-…
-    "code …
-    "…
-    "…
-    "…
-    "…
-    "…
-…
-}
+labels = [
+    "code postal",
+    "postal code",
+    "postcode",
+    "post code",
+    "cp",
+    "codes postaux",
+    "location postcode",
+]

 _code_postal = CodePostal()

csv_detective/formats/code_region.py
CHANGED

@@ -2,13 +2,12 @@ from frformat import CodeRegion, Millesime

 proportion = 1
 tags = ["fr", "geo"]
-…
-…
-    "…
-    "…
-    "…
-…
-}
+labels = [
+    "code region",
+    "reg",
+    "code insee region",
+    "region",
+]

 _code_region = CodeRegion(Millesime.LATEST)

csv_detective/formats/code_rna.py
CHANGED

@@ -2,13 +2,12 @@ from frformat import CodeRNA

 proportion = 0.9
 tags = ["fr"]
-labels = …
-    "code rna" …
-    "rna" …
-    "n° inscription association" …
-    "identifiant association" …
-…
-}
+labels = [
+    "code rna",
+    "rna",
+    "n° inscription association",
+    "identifiant association",
+]

 _code_rna = CodeRNA()

csv_detective/formats/commune.py
CHANGED

@@ -2,11 +2,11 @@ from frformat import Commune, Millesime, Options

 proportion = 0.8
 tags = ["fr", "geo"]
-labels = …
-    "commune" …
-    "ville" …
-    "libelle commune" …
-…
+labels = [
+    "commune",
+    "ville",
+    "libelle commune",
+]

 _options = Options(
     ignore_case=True,

csv_detective/formats/csp_insee.py
CHANGED

@@ -4,12 +4,11 @@ from csv_detective.parsing.text import _process_text

 proportion = 1
 tags = ["fr"]
-labels = …
-    "csp insee" …
-    "csp" …
-    "categorie socioprofessionnelle" …
-…
-}
+labels = [
+    "csp insee",
+    "csp",
+    "categorie socioprofessionnelle",
+]

 f = open(join(dirname(__file__), "data", "csp_insee.txt"), "r")
 codes_insee = f.read().split("\n")