csv-detective 0.8.1.dev1549__py3-none-any.whl → 0.8.1.dev1599__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csv_detective/detect_fields/__init__.py +1 -0
- csv_detective/detect_fields/geo/json_geojson/__init__.py +6 -10
- csv_detective/detect_fields/geo/latlon_wgs/__init__.py +4 -1
- csv_detective/detect_fields/geo/lonlat_wgs/__init__.py +16 -0
- csv_detective/detect_fields/other/json/__init__.py +3 -13
- csv_detective/detect_labels/__init__.py +4 -3
- csv_detective/detect_labels/geo/latlon_wgs/__init__.py +34 -31
- csv_detective/detect_labels/geo/lonlat_wgs/__init__.py +14 -0
- csv_detective/detect_labels/other/booleen/__init__.py +1 -1
- csv_detective/detection/formats.py +0 -2
- csv_detective/output/schema.py +3 -0
- csv_detective/output/utils.py +56 -35
- csv_detective/parsing/columns.py +0 -1
- {csv_detective-0.8.1.dev1549.data → csv_detective-0.8.1.dev1599.data}/data/share/csv_detective/CHANGELOG.md +2 -0
- {csv_detective-0.8.1.dev1549.dist-info → csv_detective-0.8.1.dev1599.dist-info}/METADATA +1 -1
- {csv_detective-0.8.1.dev1549.dist-info → csv_detective-0.8.1.dev1599.dist-info}/RECORD +23 -21
- tests/test_fields.py +33 -3
- {csv_detective-0.8.1.dev1549.data → csv_detective-0.8.1.dev1599.data}/data/share/csv_detective/LICENSE +0 -0
- {csv_detective-0.8.1.dev1549.data → csv_detective-0.8.1.dev1599.data}/data/share/csv_detective/README.md +0 -0
- {csv_detective-0.8.1.dev1549.dist-info → csv_detective-0.8.1.dev1599.dist-info}/WHEEL +0 -0
- {csv_detective-0.8.1.dev1549.dist-info → csv_detective-0.8.1.dev1599.dist-info}/entry_points.txt +0 -0
- {csv_detective-0.8.1.dev1549.dist-info → csv_detective-0.8.1.dev1599.dist-info}/licenses/LICENSE +0 -0
- {csv_detective-0.8.1.dev1549.dist-info → csv_detective-0.8.1.dev1599.dist-info}/top_level.txt +0 -0
|
@@ -1,22 +1,18 @@
|
|
|
1
1
|
import json
|
|
2
|
-
from json import JSONDecodeError
|
|
3
2
|
|
|
4
3
|
PROPORTION = 0.9
|
|
5
4
|
|
|
6
5
|
|
|
7
6
|
def _is(val):
|
|
8
|
-
|
|
7
|
+
"""Renvoie True si val peut etre un geojson"""
|
|
9
8
|
|
|
10
9
|
try:
|
|
11
10
|
j = json.loads(val)
|
|
12
|
-
if
|
|
13
|
-
|
|
14
|
-
if 'geometry' in j:
|
|
15
|
-
if 'coordinates' in j['geometry']:
|
|
11
|
+
if isinstance(j, dict):
|
|
12
|
+
if "type" in j and "coordinates" in j:
|
|
16
13
|
return True
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
except
|
|
14
|
+
if "geometry" in j and "coordinates" in j["geometry"]:
|
|
15
|
+
return True
|
|
16
|
+
except Exception:
|
|
20
17
|
pass
|
|
21
|
-
|
|
22
18
|
return False
|
|
@@ -5,9 +5,12 @@ PROPORTION = 1
|
|
|
5
5
|
|
|
6
6
|
|
|
7
7
|
def _is(val):
|
|
8
|
-
|
|
8
|
+
"""Renvoie True si val peut etre une latitude,longitude"""
|
|
9
9
|
|
|
10
10
|
if not isinstance(val, str) or val.count(",") != 1:
|
|
11
11
|
return False
|
|
12
12
|
lat, lon = val.split(",")
|
|
13
|
+
# handling [lat,lon]
|
|
14
|
+
if lat.startswith("[") and lon.endswith("]"):
|
|
15
|
+
lat, lon = lat[1:], lon[:-1]
|
|
13
16
|
return is_lat(lat) and is_lon(lon.replace(" ", ""))
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from ..latitude_wgs import _is as is_lat
|
|
2
|
+
from ..longitude_wgs import _is as is_lon
|
|
3
|
+
|
|
4
|
+
PROPORTION = 1
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def _is(val):
|
|
8
|
+
"""Renvoie True si val peut etre une longitude,latitude"""
|
|
9
|
+
|
|
10
|
+
if not isinstance(val, str) or val.count(",") != 1:
|
|
11
|
+
return False
|
|
12
|
+
lon, lat = val.split(",")
|
|
13
|
+
# handling [lon,lat]
|
|
14
|
+
if lon.startswith("[") and lat.endswith("]"):
|
|
15
|
+
lon, lat = lon[1:], lat[:-1]
|
|
16
|
+
return is_lon(lon) and is_lat(lat.replace(" ", ""))
|
|
@@ -5,20 +5,10 @@ PROPORTION = 1
|
|
|
5
5
|
|
|
6
6
|
|
|
7
7
|
def _is(val):
|
|
8
|
-
|
|
8
|
+
"""Detects json"""
|
|
9
9
|
try:
|
|
10
10
|
loaded = json.loads(val)
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
any(
|
|
14
|
-
[
|
|
15
|
-
geo in loaded for geo in ['coordinates', 'geometry']
|
|
16
|
-
]
|
|
17
|
-
)
|
|
18
|
-
)
|
|
19
|
-
):
|
|
20
|
-
return True
|
|
21
|
-
else:
|
|
22
|
-
return False
|
|
11
|
+
# we don't want to consider integers for instance
|
|
12
|
+
return isinstance(loaded, (list, dict))
|
|
23
13
|
except (JSONDecodeError, TypeError):
|
|
24
14
|
return False
|
|
@@ -14,7 +14,7 @@ from .FR.geo import (
|
|
|
14
14
|
longitude_l93,
|
|
15
15
|
longitude_wgs_fr_metropole,
|
|
16
16
|
pays,
|
|
17
|
-
region
|
|
17
|
+
region,
|
|
18
18
|
)
|
|
19
19
|
from .FR.other import (
|
|
20
20
|
code_csp_insee,
|
|
@@ -27,7 +27,7 @@ from .FR.other import (
|
|
|
27
27
|
siren,
|
|
28
28
|
siret,
|
|
29
29
|
tel_fr,
|
|
30
|
-
uai
|
|
30
|
+
uai,
|
|
31
31
|
)
|
|
32
32
|
from .FR.temp import jour_de_la_semaine, mois_de_annee
|
|
33
33
|
from .geo import (
|
|
@@ -37,7 +37,8 @@ from .geo import (
|
|
|
37
37
|
json_geojson,
|
|
38
38
|
latitude_wgs,
|
|
39
39
|
latlon_wgs,
|
|
40
|
-
longitude_wgs
|
|
40
|
+
longitude_wgs,
|
|
41
|
+
lonlat_wgs,
|
|
41
42
|
)
|
|
42
43
|
from .other import booleen, email, float, int, money, mongo_object_id, twitter, url, uuid
|
|
43
44
|
from .temp import date, datetime_rfc822, year
|
|
@@ -2,42 +2,45 @@ from csv_detective.parsing.text import header_score
|
|
|
2
2
|
|
|
3
3
|
PROPORTION = 0.5
|
|
4
4
|
|
|
5
|
+
COMMON_COORDS_LABELS = [
|
|
6
|
+
"c geo",
|
|
7
|
+
"code geo",
|
|
8
|
+
"coord gps",
|
|
9
|
+
"coordonnees",
|
|
10
|
+
"coordonnees ban",
|
|
11
|
+
"coordonnees finales",
|
|
12
|
+
"coordonnees geo",
|
|
13
|
+
"coordonnees geographiques",
|
|
14
|
+
"coordonnees geoloc",
|
|
15
|
+
"coordonnees geoloc",
|
|
16
|
+
"coordonnees gps",
|
|
17
|
+
"coordonnees insee",
|
|
18
|
+
"coordonnees xy",
|
|
19
|
+
"geo",
|
|
20
|
+
"geo coordinates",
|
|
21
|
+
"geo cp",
|
|
22
|
+
"geo localisation",
|
|
23
|
+
"geo point",
|
|
24
|
+
"geo point 2d",
|
|
25
|
+
"geolocalisation",
|
|
26
|
+
"geom x y",
|
|
27
|
+
"geometry x y",
|
|
28
|
+
"geopoint",
|
|
29
|
+
"point geo",
|
|
30
|
+
"point geo insee",
|
|
31
|
+
"position",
|
|
32
|
+
"position geographique",
|
|
33
|
+
"wgs84",
|
|
34
|
+
"x y",
|
|
35
|
+
"xy",
|
|
36
|
+
]
|
|
37
|
+
|
|
5
38
|
|
|
6
39
|
def _is(header: str) -> float:
|
|
7
40
|
words_combinations_list = [
|
|
8
41
|
"latlon wgs",
|
|
9
42
|
"latlon",
|
|
10
|
-
"geo point",
|
|
11
|
-
"geo point 2d",
|
|
12
|
-
"wgs84",
|
|
13
|
-
"geolocalisation",
|
|
14
|
-
"geo",
|
|
15
|
-
"coordonnees finales",
|
|
16
|
-
"coordonnees",
|
|
17
|
-
"coordonnees ban",
|
|
18
|
-
"xy",
|
|
19
|
-
"geometry x y",
|
|
20
|
-
"coordonnees insee",
|
|
21
|
-
"coordonnees geographiques",
|
|
22
|
-
"position",
|
|
23
|
-
"coordonnes gps",
|
|
24
|
-
"geopoint",
|
|
25
|
-
"geom x y",
|
|
26
|
-
"coord gps",
|
|
27
43
|
"latlong",
|
|
28
|
-
"position geographique",
|
|
29
|
-
"c geo",
|
|
30
|
-
"coordonnes geoloc",
|
|
31
44
|
"lat lon",
|
|
32
|
-
|
|
33
|
-
"geo localisation",
|
|
34
|
-
"coordonnes geo",
|
|
35
|
-
"geo cp",
|
|
36
|
-
"x y",
|
|
37
|
-
"geo coordinates",
|
|
38
|
-
"point geo",
|
|
39
|
-
"point geo insee",
|
|
40
|
-
"coordonnees geoloc",
|
|
41
|
-
"coordonnees xy",
|
|
42
|
-
]
|
|
45
|
+
] + COMMON_COORDS_LABELS
|
|
43
46
|
return header_score(header, words_combinations_list)
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from csv_detective.parsing.text import header_score
|
|
2
|
+
from ..latlon_wgs import COMMON_COORDS_LABELS
|
|
3
|
+
|
|
4
|
+
PROPORTION = 0.5
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def _is(header: str) -> float:
|
|
8
|
+
words_combinations_list = [
|
|
9
|
+
"lonlat wgs",
|
|
10
|
+
"lonlat",
|
|
11
|
+
"longlat",
|
|
12
|
+
"lon lat",
|
|
13
|
+
] + COMMON_COORDS_LABELS
|
|
14
|
+
return header_score(header, words_combinations_list)
|
|
@@ -110,11 +110,9 @@ def detect_formats(
|
|
|
110
110
|
"datetime_naive": "datetime",
|
|
111
111
|
"datetime_rfc822": "datetime",
|
|
112
112
|
"date": "date",
|
|
113
|
-
"latitude": "float",
|
|
114
113
|
"latitude_l93": "float",
|
|
115
114
|
"latitude_wgs": "float",
|
|
116
115
|
"latitude_wgs_fr_metropole": "float",
|
|
117
|
-
"longitude": "float",
|
|
118
116
|
"longitude_l93": "float",
|
|
119
117
|
"longitude_wgs": "float",
|
|
120
118
|
"longitude_wgs_fr_metropole": "float",
|
csv_detective/output/schema.py
CHANGED
|
@@ -51,6 +51,7 @@ def get_description(format: str) -> str:
|
|
|
51
51
|
"latitude_wgs": "La latitude au format WGS",
|
|
52
52
|
"longitude_wgs": "La longitude au format WGS",
|
|
53
53
|
"latlon_wgs": "Les coordonnées XY (latitude et longitude)",
|
|
54
|
+
"lonlat_wgs": "Les coordonnées XY (longitude et latitude)",
|
|
54
55
|
"booleen": "Booléen",
|
|
55
56
|
"email": "L'adresse couriel (email)",
|
|
56
57
|
"float": "Nombre flottant (à virgule)",
|
|
@@ -116,6 +117,7 @@ def get_validata_type(format: str) -> str:
|
|
|
116
117
|
"latitude_wgs": "number",
|
|
117
118
|
"latitude_wgs_fr_metropole": "number",
|
|
118
119
|
"latlon_wgs": "geo_point",
|
|
120
|
+
"lonlat_wgs": "geo_point",
|
|
119
121
|
"longitude": "number",
|
|
120
122
|
"longitude_l93": "number",
|
|
121
123
|
"longitude_wgs": "number",
|
|
@@ -162,6 +164,7 @@ def get_example(format: str) -> str:
|
|
|
162
164
|
"latitude_wgs": 42.42,
|
|
163
165
|
"latitude_wgs_fr_metropole": 41.3,
|
|
164
166
|
"latlon_wgs": "42.42, 0.0",
|
|
167
|
+
"lonlat_wgs": "0.0, 42.42",
|
|
165
168
|
"longitude": 0.0,
|
|
166
169
|
"longitude_l93": -357823,
|
|
167
170
|
"longitude_wgs": 0.0,
|
csv_detective/output/utils.py
CHANGED
|
@@ -2,52 +2,73 @@ import pandas as pd
|
|
|
2
2
|
|
|
3
3
|
|
|
4
4
|
def prepare_output_dict(return_table: pd.DataFrame, limited_output: bool):
|
|
5
|
+
# -> dict[str, dict | list[dict]] (to be added when upgrading to python>=3.10)
|
|
5
6
|
return_dict_cols = return_table.to_dict("dict")
|
|
6
|
-
|
|
7
|
+
output_dict = {}
|
|
7
8
|
for column_name in return_dict_cols:
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
9
|
+
# keep only formats with a non-zero score
|
|
10
|
+
output_dict[column_name] = [
|
|
11
|
+
{
|
|
12
|
+
"format": detected_value_type,
|
|
13
|
+
"score": return_dict_cols[column_name][detected_value_type],
|
|
14
|
+
}
|
|
15
|
+
for detected_value_type in return_dict_cols[column_name]
|
|
16
|
+
if return_dict_cols[column_name][detected_value_type] > 0
|
|
17
|
+
]
|
|
18
|
+
priorities = [
|
|
19
|
+
# no need to specify int and float everywhere, they are deprioritized anyway
|
|
20
|
+
("int", ("float",)),
|
|
21
|
+
# bool over everything
|
|
22
|
+
("booleen", (
|
|
23
|
+
"latitude_l93",
|
|
24
|
+
"latitude_wgs",
|
|
25
|
+
"latitude_wgs_fr_metropole",
|
|
26
|
+
"longitude_l93",
|
|
27
|
+
"longitude_wgs",
|
|
28
|
+
"longitude_wgs_fr_metropole",
|
|
29
|
+
)),
|
|
30
|
+
("geojson", ("json",)),
|
|
31
|
+
# latlon over lonlat if no longitude allows to discriminate
|
|
32
|
+
("latlon_wgs", ("json", "lonlat_wgs")),
|
|
33
|
+
("lonlat_wgs", ("json",)),
|
|
34
|
+
("latitude_wgs_fr_metropole", ("latitude_l93", "latitude_wgs")),
|
|
35
|
+
("longitude_wgs_fr_metropole", ("longitude_l93", "longitude_wgs")),
|
|
36
|
+
("latitude_wgs", ("latitude_l93",)),
|
|
37
|
+
("longitude_wgs", ("longitude_l93",)),
|
|
38
|
+
("code_region", ("code_departement",)),
|
|
39
|
+
("datetime_rfc822", ("datetime_aware",)),
|
|
40
|
+
]
|
|
41
|
+
detected_formats = set(x["format"] for x in output_dict[column_name])
|
|
21
42
|
formats_to_remove = set()
|
|
22
43
|
# Deprioritise float and int detection vs others
|
|
23
|
-
if len(
|
|
44
|
+
if len(detected_formats - {"float", "int"}) > 0:
|
|
24
45
|
formats_to_remove = formats_to_remove.union({"float", "int"})
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
46
|
+
# Deprioritize less specific formats if:
|
|
47
|
+
# secondary score is even or worse
|
|
48
|
+
# or priority score is at least 1 (max of the field score)
|
|
49
|
+
for prio_format, secondary_formats in priorities:
|
|
50
|
+
if prio_format in detected_formats:
|
|
51
|
+
for secondary in secondary_formats:
|
|
52
|
+
if (
|
|
53
|
+
secondary in detected_formats
|
|
54
|
+
and (
|
|
55
|
+
return_dict_cols[column_name][prio_format]
|
|
56
|
+
>= return_dict_cols[column_name][secondary]
|
|
57
|
+
or return_dict_cols[column_name][prio_format] >= 1
|
|
58
|
+
)
|
|
59
|
+
):
|
|
60
|
+
formats_to_remove.add(secondary)
|
|
39
61
|
|
|
40
|
-
formats_to_keep =
|
|
62
|
+
formats_to_keep = detected_formats - formats_to_remove
|
|
41
63
|
|
|
42
|
-
detections =
|
|
43
|
-
detections = [x for x in detections if x["format"] in formats_to_keep]
|
|
64
|
+
detections = [x for x in output_dict[column_name] if x["format"] in formats_to_keep]
|
|
44
65
|
if not limited_output:
|
|
45
|
-
|
|
66
|
+
output_dict[column_name] = detections
|
|
46
67
|
else:
|
|
47
|
-
|
|
68
|
+
output_dict[column_name] = (
|
|
48
69
|
max(detections, key=lambda x: x["score"])
|
|
49
70
|
if len(detections) > 0
|
|
50
71
|
else {"format": "string", "score": 1.0}
|
|
51
72
|
)
|
|
52
73
|
|
|
53
|
-
return
|
|
74
|
+
return output_dict
|
csv_detective/parsing/columns.py
CHANGED
|
@@ -10,6 +10,8 @@
|
|
|
10
10
|
- Split aware and naive datetimes for hydra to cast them separately [#130](https://github.com/datagouv/csv-detective/pull/130)
|
|
11
11
|
- Validate using the testing function, to consider PROPORTIONS [#131](https://github.com/datagouv/csv-detective/pull/131)
|
|
12
12
|
- Remove `datetime_iso` format due to ambiguous cast in db (can be naive or aware) [#132](https://github.com/datagouv/csv-detective/pull/132)
|
|
13
|
+
- Add `lonlat_wgs` format and handle optional brackets for `latlon_wgs` [#133](https://github.com/datagouv/csv-detective/pull/133)
|
|
14
|
+
- Refactor format prioritizing [#134](https://github.com/datagouv/csv-detective/pull/134)
|
|
13
15
|
|
|
14
16
|
## 0.8.0 (2025-05-20)
|
|
15
17
|
|
|
@@ -5,7 +5,7 @@ csv_detective/load_tests.py,sha256=GILvfkd4OVI-72mA4nzbPlZqgcXZ4wznOhGfZ1ucWkM,2
|
|
|
5
5
|
csv_detective/s3_utils.py,sha256=1cIVdQUYY2ovErbMwp72Gqtqx2bkB8nfVhn-QaOFTT0,1451
|
|
6
6
|
csv_detective/utils.py,sha256=-tIs9yV7RJPGj65lQ7LjRGch6Iws9UeuIPQsd2uUUJM,1025
|
|
7
7
|
csv_detective/validate.py,sha256=5Li_vfvU9wdfoZjNjef-MBUoKcKoJ-c7381QoX9aDXY,2818
|
|
8
|
-
csv_detective/detect_fields/__init__.py,sha256=
|
|
8
|
+
csv_detective/detect_fields/__init__.py,sha256=0A5SZTp_IhhJ9z7lWeH4K5_0uwMK_VdMudjPm7oggVg,1000
|
|
9
9
|
csv_detective/detect_fields/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
10
|
csv_detective/detect_fields/FR/geo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
11
|
csv_detective/detect_fields/FR/geo/adresse/__init__.py,sha256=NqV8ULf9gY9iFnA1deKR-1Yobr96WwCsn5JfbP_MjiY,1675
|
|
@@ -49,16 +49,17 @@ csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py,sha256=u98rn
|
|
|
49
49
|
csv_detective/detect_fields/geo/iso_country_code_alpha3/iso_country_code_alpha3.txt,sha256=aYqKSohgXuBtcIBfF52f8JWYDdxL_HV_Ol1srGnWBp4,1003
|
|
50
50
|
csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py,sha256=wJAynAkGZN7jKeI3xOeLXQ_irxQBb_J56pRkLDYVClY,436
|
|
51
51
|
csv_detective/detect_fields/geo/iso_country_code_numeric/iso_country_code_numeric.txt,sha256=2GtEhuporsHYV-pU4q9kfXU5iOtfW5C0GYBTTKQtnnA,1004
|
|
52
|
-
csv_detective/detect_fields/geo/json_geojson/__init__.py,sha256=
|
|
52
|
+
csv_detective/detect_fields/geo/json_geojson/__init__.py,sha256=6wlwlxQmsVIZ21g-THvH3nBj-I8FuoF2sBlZAoEMGiQ,393
|
|
53
53
|
csv_detective/detect_fields/geo/latitude_wgs/__init__.py,sha256=ArS6PuYEd0atZwSqNDZhXZz1TwzdiwdV8ovRYTOacpg,327
|
|
54
|
-
csv_detective/detect_fields/geo/latlon_wgs/__init__.py,sha256=
|
|
54
|
+
csv_detective/detect_fields/geo/latlon_wgs/__init__.py,sha256=IXDTqD4YFUJYI1FYZ5ZfkqXY6KvNY7sgBVFRAvgTHtI,454
|
|
55
55
|
csv_detective/detect_fields/geo/longitude_wgs/__init__.py,sha256=G7afWOKiGh_Tv7gwDNGt1a4B_A8hkCBkIxn3THDCUFk,330
|
|
56
|
+
csv_detective/detect_fields/geo/lonlat_wgs/__init__.py,sha256=CnBMYevfGdhBvureF3oc_zqT-RZjG419iAuUlugQFLc,454
|
|
56
57
|
csv_detective/detect_fields/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
57
58
|
csv_detective/detect_fields/other/booleen/__init__.py,sha256=wn_yyTAmGxqo0l0b7JRpGb0da_E27iGxES9zWCrnsqc,497
|
|
58
59
|
csv_detective/detect_fields/other/email/__init__.py,sha256=O9tgJmq0O8Q-8iin63NqEEDhlsUJjxFZNaNFM4GZaws,178
|
|
59
60
|
csv_detective/detect_fields/other/float/__init__.py,sha256=AT4Kpgwoz5PuAoLx00u0SL8DjjXZxsE8zSRbN18uAv4,578
|
|
60
61
|
csv_detective/detect_fields/other/int/__init__.py,sha256=QN3kQJLYqLRBiubUK7g4Xq03PlA5wqVwx2pPPIO9FdI,320
|
|
61
|
-
csv_detective/detect_fields/other/json/__init__.py,sha256=
|
|
62
|
+
csv_detective/detect_fields/other/json/__init__.py,sha256=AkRWZAidEM1dWkVRFThEBI5M7kMUu5Yu12iCViGM8lU,310
|
|
62
63
|
csv_detective/detect_fields/other/money/__init__.py,sha256=g_ZwBZXl9LhldwFYQotC5WqLiE8qQCZHtoI9eJvl_9M,232
|
|
63
64
|
csv_detective/detect_fields/other/mongo_object_id/__init__.py,sha256=7fcrHsOZAqXp2_N0IjPskYJ_qi4xRlo9iyNNDQVLzsU,156
|
|
64
65
|
csv_detective/detect_fields/other/percent/__init__.py,sha256=vgpekNOPBRuunoVBXMi81rwHv4uSOhe78pbVtQ5SBO8,177
|
|
@@ -71,7 +72,7 @@ csv_detective/detect_fields/temp/datetime_aware/__init__.py,sha256=bEfWvXx_GNCRU
|
|
|
71
72
|
csv_detective/detect_fields/temp/datetime_naive/__init__.py,sha256=GtQo55SrrXfoT-L7ZXW63jrlAYvNT5m56wMfhuY3pyI,836
|
|
72
73
|
csv_detective/detect_fields/temp/datetime_rfc822/__init__.py,sha256=JtUzg3BXYd-XJMLGxQ0P1OAJGOQ7DlYMD4fCU9yndg0,511
|
|
73
74
|
csv_detective/detect_fields/temp/year/__init__.py,sha256=RjsiIHoplnI4Odi5587TzRhKTDT-FTqGOBpdartuShA,194
|
|
74
|
-
csv_detective/detect_labels/__init__.py,sha256=
|
|
75
|
+
csv_detective/detect_labels/__init__.py,sha256=8vrFUrMc8a_VOC5gvYNMKL-Do_q9eMTrghJRI9Xotvk,883
|
|
75
76
|
csv_detective/detect_labels/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
76
77
|
csv_detective/detect_labels/FR/geo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
77
78
|
csv_detective/detect_labels/FR/geo/adresse/__init__.py,sha256=fNWFW-Wo3n6azDBfmi0J0qnzP-p2StLxCc9eNiE9NNE,346
|
|
@@ -110,10 +111,11 @@ csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py,sha256=biUZP
|
|
|
110
111
|
csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py,sha256=biUZP8gAsVpjXLTx1WeS19qR4ia0pzpi6R69wJgu4B0,348
|
|
111
112
|
csv_detective/detect_labels/geo/json_geojson/__init__.py,sha256=On8VOCDD0EspZra6fTQCXH4MYao2xmRu-o7xWcab7Jg,355
|
|
112
113
|
csv_detective/detect_labels/geo/latitude_wgs/__init__.py,sha256=ME_KjniqDSdAwXP7XnKXyr5IA75KrGSLIhvPNfsux6E,664
|
|
113
|
-
csv_detective/detect_labels/geo/latlon_wgs/__init__.py,sha256=
|
|
114
|
+
csv_detective/detect_labels/geo/latlon_wgs/__init__.py,sha256=tDndlFyEM7qKS3ATxp0Xs0FsPsOPpRWhDe1ockbWw8s,923
|
|
114
115
|
csv_detective/detect_labels/geo/longitude_wgs/__init__.py,sha256=_8IV2FLtrOjzhQNsk-fsgc9-jbAgzKDVMr4tXu2P-s4,429
|
|
116
|
+
csv_detective/detect_labels/geo/lonlat_wgs/__init__.py,sha256=NNKlFcMsKVqnUKEm_4flGxcNUGS2-iS3m6ihQf2AVTk,345
|
|
115
117
|
csv_detective/detect_labels/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
116
|
-
csv_detective/detect_labels/other/booleen/__init__.py,sha256=
|
|
118
|
+
csv_detective/detect_labels/other/booleen/__init__.py,sha256=zEkarex7L4T3vmYjR5hdhtnhugTVDsvkgG_it6nN0aA,214
|
|
117
119
|
csv_detective/detect_labels/other/email/__init__.py,sha256=Poagn45-eC2a_Wdk5Qs6d2BgYdncCQKZp2yEB50IuNw,431
|
|
118
120
|
csv_detective/detect_labels/other/float/__init__.py,sha256=X0axZN2GAfC_y01zRfIyvOfRsOy2KNQcQ-mlQAKxqT4,216
|
|
119
121
|
csv_detective/detect_labels/other/int/__init__.py,sha256=_1AY7thEBCcgSBQQ2YbY4YaPaxGRQ71BtmaFaX088ig,215
|
|
@@ -130,7 +132,7 @@ csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG
|
|
|
130
132
|
csv_detective/detection/columns.py,sha256=vfE-DKESA6J9Rfsl-a8tjgZfE21VmzArO5TrbzL0KmE,2905
|
|
131
133
|
csv_detective/detection/encoding.py,sha256=tpjJEMNM_2TcLXDzn1lNQPnSRnsWYjs83tQ8jNwTj4E,973
|
|
132
134
|
csv_detective/detection/engine.py,sha256=HiIrU-l9EO5Fbc2Vh8W_Uy5-dpKcQQzlxCqMuWc09LY,1530
|
|
133
|
-
csv_detective/detection/formats.py,sha256=
|
|
135
|
+
csv_detective/detection/formats.py,sha256=c0LFTWbibWbEJSZaPy_86LIMOY3qRxj-I_agwpb4zbI,6284
|
|
134
136
|
csv_detective/detection/headers.py,sha256=wrVII2RQpsVmHhrO1DHf3dmiu8kbtOjBlskf41cnQmc,1172
|
|
135
137
|
csv_detective/detection/rows.py,sha256=3qvsbsBcMxiqqfSYYkOgsRpX777rk22tnRHDwUA97kU,742
|
|
136
138
|
csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
|
|
@@ -139,28 +141,28 @@ csv_detective/output/__init__.py,sha256=5KTevPfp_4MRxByJyOntQjToNfeG7dPQn-_13wSq
|
|
|
139
141
|
csv_detective/output/dataframe.py,sha256=89iQRE59cHQyQQEsujQVIKP2YAUYpPklWkdDOqZE-wE,2183
|
|
140
142
|
csv_detective/output/example.py,sha256=EdPX1iqHhIG4DsiHuYdy-J7JxOkjgUh_o2D5nrfM5fA,8649
|
|
141
143
|
csv_detective/output/profile.py,sha256=B8YU541T_YPDezJGh4dkHckOShiwHSrZd9GS8jbmz7A,2919
|
|
142
|
-
csv_detective/output/schema.py,sha256=
|
|
143
|
-
csv_detective/output/utils.py,sha256=
|
|
144
|
+
csv_detective/output/schema.py,sha256=yC9K1vw6NUTULNv9a7CaMGns9iXmbzFLbtHI4wegqEc,13812
|
|
145
|
+
csv_detective/output/utils.py,sha256=xPM2KYdqousmjU22-w7HnaF6AR74fj8lhQY77Y9xs7w,3310
|
|
144
146
|
csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
145
|
-
csv_detective/parsing/columns.py,sha256=
|
|
147
|
+
csv_detective/parsing/columns.py,sha256=aMdG6-G-2Tj_2JdHotAIveQwaG_r8chGcGieFiUaBRk,5634
|
|
146
148
|
csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
|
|
147
149
|
csv_detective/parsing/csv.py,sha256=11mibDnJhIjykXLGZvA5ZEU5U7KgxIrbyO6BNv6jlro,1626
|
|
148
150
|
csv_detective/parsing/excel.py,sha256=AslE2S1e67o8yTIAIhp-lAnJ6-XqeBBRz1-VMFqhZBM,7055
|
|
149
151
|
csv_detective/parsing/load.py,sha256=u6fbGFZsL2GwPQRzhAXgt32JpUur7vbQdErREHxNJ-w,3661
|
|
150
152
|
csv_detective/parsing/text.py,sha256=_TprGi0gHZlRsafizI3dqQhBehZW4BazqxmypMcAZ-o,1824
|
|
151
|
-
csv_detective-0.8.1.
|
|
152
|
-
csv_detective-0.8.1.
|
|
153
|
-
csv_detective-0.8.1.
|
|
154
|
-
csv_detective-0.8.1.
|
|
153
|
+
csv_detective-0.8.1.dev1599.data/data/share/csv_detective/CHANGELOG.md,sha256=WQ8cTB2D5YkAJ9AsS2ziKtZL8m1sPclGPenTD1BxZ_g,9646
|
|
154
|
+
csv_detective-0.8.1.dev1599.data/data/share/csv_detective/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
|
|
155
|
+
csv_detective-0.8.1.dev1599.data/data/share/csv_detective/README.md,sha256=gKLFmC8kuCCywS9eAhMak_JNriUWWNOsBKleAu5TIEY,8501
|
|
156
|
+
csv_detective-0.8.1.dev1599.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
|
|
155
157
|
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
156
158
|
tests/test_example.py,sha256=JeHxSK0IVDcSrOhSZlNGSQv4JAc_r6mzvJM8PfmLTMw,2018
|
|
157
|
-
tests/test_fields.py,sha256=
|
|
159
|
+
tests/test_fields.py,sha256=IwMpjOn8W5kDCvJYp3Cer4m571qomzjupOAvSRFMg_Q,11819
|
|
158
160
|
tests/test_file.py,sha256=0bHV9wx9mSRoav_DVF19g694yohb1p0bw7rtcBeKG-8,8451
|
|
159
161
|
tests/test_labels.py,sha256=Nkr645bUewrj8hjNDKr67FQ6Sy_TID6f3E5Kfkl231M,464
|
|
160
162
|
tests/test_structure.py,sha256=bv-tjgXohvQAxwmxzH0BynFpK2TyPjcxvtIAmIRlZmA,1393
|
|
161
163
|
tests/test_validation.py,sha256=CTGonR6htxcWF9WH8MxumDD8cF45Y-G4hm94SM4lFjU,3246
|
|
162
|
-
csv_detective-0.8.1.
|
|
163
|
-
csv_detective-0.8.1.
|
|
164
|
-
csv_detective-0.8.1.
|
|
165
|
-
csv_detective-0.8.1.
|
|
166
|
-
csv_detective-0.8.1.
|
|
164
|
+
csv_detective-0.8.1.dev1599.dist-info/METADATA,sha256=NoE1tBjCZxO2uffbH9wSgkuNzOVOgLRA2qkjth7ynyk,10443
|
|
165
|
+
csv_detective-0.8.1.dev1599.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
166
|
+
csv_detective-0.8.1.dev1599.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
|
|
167
|
+
csv_detective-0.8.1.dev1599.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
|
|
168
|
+
csv_detective-0.8.1.dev1599.dist-info/RECORD,,
|
tests/test_fields.py
CHANGED
|
@@ -44,6 +44,7 @@ from csv_detective.detect_fields.geo import (
|
|
|
44
44
|
latitude_wgs,
|
|
45
45
|
latlon_wgs,
|
|
46
46
|
longitude_wgs,
|
|
47
|
+
lonlat_wgs,
|
|
47
48
|
)
|
|
48
49
|
from csv_detective.detect_fields.other import (
|
|
49
50
|
booleen,
|
|
@@ -71,6 +72,7 @@ from csv_detective.detection.variables import (
|
|
|
71
72
|
)
|
|
72
73
|
from csv_detective.load_tests import return_all_tests
|
|
73
74
|
from csv_detective.output.dataframe import cast
|
|
75
|
+
from csv_detective.output.utils import prepare_output_dict
|
|
74
76
|
|
|
75
77
|
|
|
76
78
|
def test_all_tests_return_bool():
|
|
@@ -263,13 +265,17 @@ fields = {
|
|
|
263
265
|
False: ["100"],
|
|
264
266
|
},
|
|
265
267
|
latlon_wgs: {
|
|
266
|
-
True: ["43.2,-22.6", "-10.7,140", "-40.7, 10.8"],
|
|
267
|
-
False: ["0.1,192", "-102, 92"],
|
|
268
|
+
True: ["43.2,-22.6", "-10.7,140", "-40.7, 10.8", "[12,-0.28]"],
|
|
269
|
+
False: ["0.1,192", "-102, 92", "[23.02,4.1", "23.02,4.1]", "160.1,-27"],
|
|
268
270
|
},
|
|
269
271
|
longitude_wgs: {
|
|
270
272
|
True: ["120", "-20.2"],
|
|
271
273
|
False: ["-200"],
|
|
272
274
|
},
|
|
275
|
+
lonlat_wgs: {
|
|
276
|
+
True: ["-22.6,43.2", "140,-10.7", "10.8, -40.7", "[-0.28,12]"],
|
|
277
|
+
False: ["192,0.1", "92, -102", "[4.1,23.02", "4.1,23.02]", "-27,160.1"],
|
|
278
|
+
},
|
|
273
279
|
booleen: {
|
|
274
280
|
True: ["oui", "0", "1", "yes", "false", "True"],
|
|
275
281
|
False: ["nein", "ja", "2", "-0"],
|
|
@@ -280,7 +286,7 @@ fields = {
|
|
|
280
286
|
},
|
|
281
287
|
json: {
|
|
282
288
|
True: ['{"pomme": "fruit", "reponse": 42}', "[1,2,3,4]"],
|
|
283
|
-
False: [
|
|
289
|
+
False: ["5", '{"zefib":', '{"a"}'],
|
|
284
290
|
},
|
|
285
291
|
money: {
|
|
286
292
|
True: ["120€", "-20.2$"],
|
|
@@ -405,3 +411,27 @@ def test_fields_with_values(args):
|
|
|
405
411
|
def test_cast(args):
|
|
406
412
|
value, detected_type, cast_type = args
|
|
407
413
|
assert isinstance(cast(value, detected_type), cast_type)
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
@pytest.mark.parametrize(
|
|
417
|
+
"args",
|
|
418
|
+
(
|
|
419
|
+
# there is a specific numerical format => specific wins
|
|
420
|
+
({"int": 1, "float": 1, "latitude_wgs": 1}, "latitude_wgs"),
|
|
421
|
+
# scores are equal for related formats => priority wins
|
|
422
|
+
({"int": 1, "float": 1}, "int"),
|
|
423
|
+
# score is lower for priority format => secondary wins
|
|
424
|
+
({"int": 0.5, "float": 1}, "float"),
|
|
425
|
+
# score is lower for priority format, but is 1 => priority wins
|
|
426
|
+
({"int": 1, "float": 1.25}, "int"),
|
|
427
|
+
# two rounds of priority => highest priority wins
|
|
428
|
+
({"latlon_wgs": 1, "lonlat_wgs": 1, "json": 1}, "latlon_wgs"),
|
|
429
|
+
# no detection => default to string
|
|
430
|
+
({}, "string"),
|
|
431
|
+
),
|
|
432
|
+
)
|
|
433
|
+
def test_priority(args):
|
|
434
|
+
detections, expected = args
|
|
435
|
+
col = "col1"
|
|
436
|
+
output = prepare_output_dict(pd.DataFrame({col: detections}), limited_output=True)
|
|
437
|
+
assert output[col]["format"] == expected
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{csv_detective-0.8.1.dev1549.dist-info → csv_detective-0.8.1.dev1599.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{csv_detective-0.8.1.dev1549.dist-info → csv_detective-0.8.1.dev1599.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
{csv_detective-0.8.1.dev1549.dist-info → csv_detective-0.8.1.dev1599.dist-info}/top_level.txt
RENAMED
|
File without changes
|