csv-detective 0.8.1.dev1549__py3-none-any.whl → 0.8.1.dev1599__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23) hide show
  1. csv_detective/detect_fields/__init__.py +1 -0
  2. csv_detective/detect_fields/geo/json_geojson/__init__.py +6 -10
  3. csv_detective/detect_fields/geo/latlon_wgs/__init__.py +4 -1
  4. csv_detective/detect_fields/geo/lonlat_wgs/__init__.py +16 -0
  5. csv_detective/detect_fields/other/json/__init__.py +3 -13
  6. csv_detective/detect_labels/__init__.py +4 -3
  7. csv_detective/detect_labels/geo/latlon_wgs/__init__.py +34 -31
  8. csv_detective/detect_labels/geo/lonlat_wgs/__init__.py +14 -0
  9. csv_detective/detect_labels/other/booleen/__init__.py +1 -1
  10. csv_detective/detection/formats.py +0 -2
  11. csv_detective/output/schema.py +3 -0
  12. csv_detective/output/utils.py +56 -35
  13. csv_detective/parsing/columns.py +0 -1
  14. {csv_detective-0.8.1.dev1549.data → csv_detective-0.8.1.dev1599.data}/data/share/csv_detective/CHANGELOG.md +2 -0
  15. {csv_detective-0.8.1.dev1549.dist-info → csv_detective-0.8.1.dev1599.dist-info}/METADATA +1 -1
  16. {csv_detective-0.8.1.dev1549.dist-info → csv_detective-0.8.1.dev1599.dist-info}/RECORD +23 -21
  17. tests/test_fields.py +33 -3
  18. {csv_detective-0.8.1.dev1549.data → csv_detective-0.8.1.dev1599.data}/data/share/csv_detective/LICENSE +0 -0
  19. {csv_detective-0.8.1.dev1549.data → csv_detective-0.8.1.dev1599.data}/data/share/csv_detective/README.md +0 -0
  20. {csv_detective-0.8.1.dev1549.dist-info → csv_detective-0.8.1.dev1599.dist-info}/WHEEL +0 -0
  21. {csv_detective-0.8.1.dev1549.dist-info → csv_detective-0.8.1.dev1599.dist-info}/entry_points.txt +0 -0
  22. {csv_detective-0.8.1.dev1549.dist-info → csv_detective-0.8.1.dev1599.dist-info}/licenses/LICENSE +0 -0
  23. {csv_detective-0.8.1.dev1549.dist-info → csv_detective-0.8.1.dev1599.dist-info}/top_level.txt +0 -0
@@ -53,6 +53,7 @@ from .geo import (
53
53
  latitude_wgs,
54
54
  longitude_wgs,
55
55
  latlon_wgs,
56
+ lonlat_wgs,
56
57
  json_geojson,
57
58
  )
58
59
 
@@ -1,22 +1,18 @@
1
1
  import json
2
- from json import JSONDecodeError
3
2
 
4
3
  PROPORTION = 0.9
5
4
 
6
5
 
7
6
  def _is(val):
8
- '''Renvoie True si val peut etre geojson'''
7
+ """Renvoie True si val peut etre un geojson"""
9
8
 
10
9
  try:
11
10
  j = json.loads(val)
12
- if 'type' in j and 'coordinates' in j:
13
- return True
14
- if 'geometry' in j:
15
- if 'coordinates' in j['geometry']:
11
+ if isinstance(j, dict):
12
+ if "type" in j and "coordinates" in j:
16
13
  return True
17
- except JSONDecodeError:
18
- pass
19
- except TypeError:
14
+ if "geometry" in j and "coordinates" in j["geometry"]:
15
+ return True
16
+ except Exception:
20
17
  pass
21
-
22
18
  return False
@@ -5,9 +5,12 @@ PROPORTION = 1
5
5
 
6
6
 
7
7
  def _is(val):
8
- '''Renvoie True si val peut etre une latitude,longitude'''
8
+ """Renvoie True si val peut etre une latitude,longitude"""
9
9
 
10
10
  if not isinstance(val, str) or val.count(",") != 1:
11
11
  return False
12
12
  lat, lon = val.split(",")
13
+ # handling [lat,lon]
14
+ if lat.startswith("[") and lon.endswith("]"):
15
+ lat, lon = lat[1:], lon[:-1]
13
16
  return is_lat(lat) and is_lon(lon.replace(" ", ""))
@@ -0,0 +1,16 @@
1
+ from ..latitude_wgs import _is as is_lat
2
+ from ..longitude_wgs import _is as is_lon
3
+
4
+ PROPORTION = 1
5
+
6
+
7
+ def _is(val):
8
+ """Renvoie True si val peut etre une longitude,latitude"""
9
+
10
+ if not isinstance(val, str) or val.count(",") != 1:
11
+ return False
12
+ lon, lat = val.split(",")
13
+ # handling [lon,lat]
14
+ if lon.startswith("[") and lat.endswith("]"):
15
+ lon, lat = lon[1:], lat[:-1]
16
+ return is_lon(lon) and is_lat(lat.replace(" ", ""))
@@ -5,20 +5,10 @@ PROPORTION = 1
5
5
 
6
6
 
7
7
  def _is(val):
8
- '''Detects json'''
8
+ """Detects json"""
9
9
  try:
10
10
  loaded = json.loads(val)
11
- if isinstance(loaded, list) or (
12
- isinstance(loaded, dict) and not (
13
- any(
14
- [
15
- geo in loaded for geo in ['coordinates', 'geometry']
16
- ]
17
- )
18
- )
19
- ):
20
- return True
21
- else:
22
- return False
11
+ # we don't want to consider integers for instance
12
+ return isinstance(loaded, (list, dict))
23
13
  except (JSONDecodeError, TypeError):
24
14
  return False
@@ -14,7 +14,7 @@ from .FR.geo import (
14
14
  longitude_l93,
15
15
  longitude_wgs_fr_metropole,
16
16
  pays,
17
- region
17
+ region,
18
18
  )
19
19
  from .FR.other import (
20
20
  code_csp_insee,
@@ -27,7 +27,7 @@ from .FR.other import (
27
27
  siren,
28
28
  siret,
29
29
  tel_fr,
30
- uai
30
+ uai,
31
31
  )
32
32
  from .FR.temp import jour_de_la_semaine, mois_de_annee
33
33
  from .geo import (
@@ -37,7 +37,8 @@ from .geo import (
37
37
  json_geojson,
38
38
  latitude_wgs,
39
39
  latlon_wgs,
40
- longitude_wgs
40
+ longitude_wgs,
41
+ lonlat_wgs,
41
42
  )
42
43
  from .other import booleen, email, float, int, money, mongo_object_id, twitter, url, uuid
43
44
  from .temp import date, datetime_rfc822, year
@@ -2,42 +2,45 @@ from csv_detective.parsing.text import header_score
2
2
 
3
3
  PROPORTION = 0.5
4
4
 
5
+ COMMON_COORDS_LABELS = [
6
+ "c geo",
7
+ "code geo",
8
+ "coord gps",
9
+ "coordonnees",
10
+ "coordonnees ban",
11
+ "coordonnees finales",
12
+ "coordonnees geo",
13
+ "coordonnees geographiques",
14
+ "coordonnees geoloc",
15
+ "coordonnees geoloc",
16
+ "coordonnees gps",
17
+ "coordonnees insee",
18
+ "coordonnees xy",
19
+ "geo",
20
+ "geo coordinates",
21
+ "geo cp",
22
+ "geo localisation",
23
+ "geo point",
24
+ "geo point 2d",
25
+ "geolocalisation",
26
+ "geom x y",
27
+ "geometry x y",
28
+ "geopoint",
29
+ "point geo",
30
+ "point geo insee",
31
+ "position",
32
+ "position geographique",
33
+ "wgs84",
34
+ "x y",
35
+ "xy",
36
+ ]
37
+
5
38
 
6
39
  def _is(header: str) -> float:
7
40
  words_combinations_list = [
8
41
  "latlon wgs",
9
42
  "latlon",
10
- "geo point",
11
- "geo point 2d",
12
- "wgs84",
13
- "geolocalisation",
14
- "geo",
15
- "coordonnees finales",
16
- "coordonnees",
17
- "coordonnees ban",
18
- "xy",
19
- "geometry x y",
20
- "coordonnees insee",
21
- "coordonnees geographiques",
22
- "position",
23
- "coordonnes gps",
24
- "geopoint",
25
- "geom x y",
26
- "coord gps",
27
43
  "latlong",
28
- "position geographique",
29
- "c geo",
30
- "coordonnes geoloc",
31
44
  "lat lon",
32
- "code geo",
33
- "geo localisation",
34
- "coordonnes geo",
35
- "geo cp",
36
- "x y",
37
- "geo coordinates",
38
- "point geo",
39
- "point geo insee",
40
- "coordonnees geoloc",
41
- "coordonnees xy",
42
- ]
45
+ ] + COMMON_COORDS_LABELS
43
46
  return header_score(header, words_combinations_list)
@@ -0,0 +1,14 @@
1
+ from csv_detective.parsing.text import header_score
2
+ from ..latlon_wgs import COMMON_COORDS_LABELS
3
+
4
+ PROPORTION = 0.5
5
+
6
+
7
+ def _is(header: str) -> float:
8
+ words_combinations_list = [
9
+ "lonlat wgs",
10
+ "lonlat",
11
+ "longlat",
12
+ "lon lat",
13
+ ] + COMMON_COORDS_LABELS
14
+ return header_score(header, words_combinations_list)
@@ -4,5 +4,5 @@ PROPORTION = 0.5
4
4
 
5
5
 
6
6
  def _is(header: str) -> float:
7
- words_combinations_list = ["is_", "has_", "est_"]
7
+ words_combinations_list = ["is ", "has ", "est "]
8
8
  return header_score(header, words_combinations_list)
@@ -110,11 +110,9 @@ def detect_formats(
110
110
  "datetime_naive": "datetime",
111
111
  "datetime_rfc822": "datetime",
112
112
  "date": "date",
113
- "latitude": "float",
114
113
  "latitude_l93": "float",
115
114
  "latitude_wgs": "float",
116
115
  "latitude_wgs_fr_metropole": "float",
117
- "longitude": "float",
118
116
  "longitude_l93": "float",
119
117
  "longitude_wgs": "float",
120
118
  "longitude_wgs_fr_metropole": "float",
@@ -51,6 +51,7 @@ def get_description(format: str) -> str:
51
51
  "latitude_wgs": "La latitude au format WGS",
52
52
  "longitude_wgs": "La longitude au format WGS",
53
53
  "latlon_wgs": "Les coordonnées XY (latitude et longitude)",
54
+ "lonlat_wgs": "Les coordonnées XY (longitude et latitude)",
54
55
  "booleen": "Booléen",
55
56
  "email": "L'adresse couriel (email)",
56
57
  "float": "Nombre flottant (à virgule)",
@@ -116,6 +117,7 @@ def get_validata_type(format: str) -> str:
116
117
  "latitude_wgs": "number",
117
118
  "latitude_wgs_fr_metropole": "number",
118
119
  "latlon_wgs": "geo_point",
120
+ "lonlat_wgs": "geo_point",
119
121
  "longitude": "number",
120
122
  "longitude_l93": "number",
121
123
  "longitude_wgs": "number",
@@ -162,6 +164,7 @@ def get_example(format: str) -> str:
162
164
  "latitude_wgs": 42.42,
163
165
  "latitude_wgs_fr_metropole": 41.3,
164
166
  "latlon_wgs": "42.42, 0.0",
167
+ "lonlat_wgs": "0.0, 42.42",
165
168
  "longitude": 0.0,
166
169
  "longitude_l93": -357823,
167
170
  "longitude_wgs": 0.0,
@@ -2,52 +2,73 @@ import pandas as pd
2
2
 
3
3
 
4
4
  def prepare_output_dict(return_table: pd.DataFrame, limited_output: bool):
5
+ # -> dict[str, dict | list[dict]] (to be added when upgrading to python>=3.10)
5
6
  return_dict_cols = return_table.to_dict("dict")
6
- return_dict_cols_intermediary = {}
7
+ output_dict = {}
7
8
  for column_name in return_dict_cols:
8
- return_dict_cols_intermediary[column_name] = []
9
- for detected_value_type in return_dict_cols[column_name]:
10
- if return_dict_cols[column_name][detected_value_type] == 0:
11
- continue
12
- dict_tmp = {}
13
- dict_tmp["format"] = detected_value_type
14
- dict_tmp["score"] = return_dict_cols[column_name][detected_value_type]
15
- return_dict_cols_intermediary[column_name].append(dict_tmp)
16
-
17
- # Clean dict using priorities
18
- formats_detected = {
19
- x["format"] for x in return_dict_cols_intermediary[column_name]
20
- }
9
+ # keep only formats with a non-zero score
10
+ output_dict[column_name] = [
11
+ {
12
+ "format": detected_value_type,
13
+ "score": return_dict_cols[column_name][detected_value_type],
14
+ }
15
+ for detected_value_type in return_dict_cols[column_name]
16
+ if return_dict_cols[column_name][detected_value_type] > 0
17
+ ]
18
+ priorities = [
19
+ # no need to specify int and float everywhere, they are deprioritized anyway
20
+ ("int", ("float",)),
21
+ # bool over everything
22
+ ("booleen", (
23
+ "latitude_l93",
24
+ "latitude_wgs",
25
+ "latitude_wgs_fr_metropole",
26
+ "longitude_l93",
27
+ "longitude_wgs",
28
+ "longitude_wgs_fr_metropole",
29
+ )),
30
+ ("geojson", ("json",)),
31
+ # latlon over lonlat if no longitude allows to discriminate
32
+ ("latlon_wgs", ("json", "lonlat_wgs")),
33
+ ("lonlat_wgs", ("json",)),
34
+ ("latitude_wgs_fr_metropole", ("latitude_l93", "latitude_wgs")),
35
+ ("longitude_wgs_fr_metropole", ("longitude_l93", "longitude_wgs")),
36
+ ("latitude_wgs", ("latitude_l93",)),
37
+ ("longitude_wgs", ("longitude_l93",)),
38
+ ("code_region", ("code_departement",)),
39
+ ("datetime_rfc822", ("datetime_aware",)),
40
+ ]
41
+ detected_formats = set(x["format"] for x in output_dict[column_name])
21
42
  formats_to_remove = set()
22
43
  # Deprioritise float and int detection vs others
23
- if len(formats_detected - {"float", "int"}) > 0:
44
+ if len(detected_formats - {"float", "int"}) > 0:
24
45
  formats_to_remove = formats_to_remove.union({"float", "int"})
25
- if "int" in formats_detected:
26
- formats_to_remove.add("float")
27
- if "latitude_wgs_fr_metropole" in formats_detected:
28
- formats_to_remove.add("latitude_l93")
29
- formats_to_remove.add("latitude_wgs")
30
- if "longitude_wgs_fr_metropole" in formats_detected:
31
- formats_to_remove.add("longitude_l93")
32
- formats_to_remove.add("longitude_wgs")
33
- if "longitude_wgs" in formats_detected:
34
- formats_to_remove.add("longitude_l93")
35
- if "code_region" in formats_detected:
36
- formats_to_remove.add("code_departement")
37
- if "datetime_rfc822" in formats_detected:
38
- formats_to_remove.add("datetime_aware")
46
+ # Deprioritize less specific formats if:
47
+ # secondary score is even or worse
48
+ # or priority score is at least 1 (max of the field score)
49
+ for prio_format, secondary_formats in priorities:
50
+ if prio_format in detected_formats:
51
+ for secondary in secondary_formats:
52
+ if (
53
+ secondary in detected_formats
54
+ and (
55
+ return_dict_cols[column_name][prio_format]
56
+ >= return_dict_cols[column_name][secondary]
57
+ or return_dict_cols[column_name][prio_format] >= 1
58
+ )
59
+ ):
60
+ formats_to_remove.add(secondary)
39
61
 
40
- formats_to_keep = formats_detected - formats_to_remove
62
+ formats_to_keep = detected_formats - formats_to_remove
41
63
 
42
- detections = return_dict_cols_intermediary[column_name]
43
- detections = [x for x in detections if x["format"] in formats_to_keep]
64
+ detections = [x for x in output_dict[column_name] if x["format"] in formats_to_keep]
44
65
  if not limited_output:
45
- return_dict_cols_intermediary[column_name] = detections
66
+ output_dict[column_name] = detections
46
67
  else:
47
- return_dict_cols_intermediary[column_name] = (
68
+ output_dict[column_name] = (
48
69
  max(detections, key=lambda x: x["score"])
49
70
  if len(detections) > 0
50
71
  else {"format": "string", "score": 1.0}
51
72
  )
52
73
 
53
- return return_dict_cols_intermediary
74
+ return output_dict
@@ -46,7 +46,6 @@ def test_col_val(
46
46
  ]: # Pour ne pas faire d'opérations inutiles, on commence par 1,
47
47
  # puis 5 valeurs puis la serie complète
48
48
  if all(apply_test_func(serie, test_func, _range)):
49
- # print(serie.name, ': check OK')
50
49
  pass
51
50
  else:
52
51
  return 0.0
@@ -10,6 +10,8 @@
10
10
  - Split aware and naive datetimes for hydra to cast them separately [#130](https://github.com/datagouv/csv-detective/pull/130)
11
11
  - Validate using the testing function, to consider PROPORTIONS [#131](https://github.com/datagouv/csv-detective/pull/131)
12
12
  - Remove `datetime_iso` format due to ambiguous cast in db (can be naive or aware) [#132](https://github.com/datagouv/csv-detective/pull/132)
13
+ - Add `lonlat_wgs` format and handle optional brackets for `latlon_wgs` [#133](https://github.com/datagouv/csv-detective/pull/133)
14
+ - Refactor format prioritizing [#134](https://github.com/datagouv/csv-detective/pull/134)
13
15
 
14
16
  ## 0.8.0 (2025-05-20)
15
17
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: csv_detective
3
- Version: 0.8.1.dev1549
3
+ Version: 0.8.1.dev1599
4
4
  Summary: Detect tabular files column content
5
5
  Home-page: https://github.com/datagouv/csv_detective
6
6
  Author: Etalab
@@ -5,7 +5,7 @@ csv_detective/load_tests.py,sha256=GILvfkd4OVI-72mA4nzbPlZqgcXZ4wznOhGfZ1ucWkM,2
5
5
  csv_detective/s3_utils.py,sha256=1cIVdQUYY2ovErbMwp72Gqtqx2bkB8nfVhn-QaOFTT0,1451
6
6
  csv_detective/utils.py,sha256=-tIs9yV7RJPGj65lQ7LjRGch6Iws9UeuIPQsd2uUUJM,1025
7
7
  csv_detective/validate.py,sha256=5Li_vfvU9wdfoZjNjef-MBUoKcKoJ-c7381QoX9aDXY,2818
8
- csv_detective/detect_fields/__init__.py,sha256=jThGn0_HO8U0mMoSbf38x8l46ABRQcmHcNLvjZqQQdc,984
8
+ csv_detective/detect_fields/__init__.py,sha256=0A5SZTp_IhhJ9z7lWeH4K5_0uwMK_VdMudjPm7oggVg,1000
9
9
  csv_detective/detect_fields/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  csv_detective/detect_fields/FR/geo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
11
  csv_detective/detect_fields/FR/geo/adresse/__init__.py,sha256=NqV8ULf9gY9iFnA1deKR-1Yobr96WwCsn5JfbP_MjiY,1675
@@ -49,16 +49,17 @@ csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py,sha256=u98rn
49
49
  csv_detective/detect_fields/geo/iso_country_code_alpha3/iso_country_code_alpha3.txt,sha256=aYqKSohgXuBtcIBfF52f8JWYDdxL_HV_Ol1srGnWBp4,1003
50
50
  csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py,sha256=wJAynAkGZN7jKeI3xOeLXQ_irxQBb_J56pRkLDYVClY,436
51
51
  csv_detective/detect_fields/geo/iso_country_code_numeric/iso_country_code_numeric.txt,sha256=2GtEhuporsHYV-pU4q9kfXU5iOtfW5C0GYBTTKQtnnA,1004
52
- csv_detective/detect_fields/geo/json_geojson/__init__.py,sha256=FPHOfTrfXJs62-NgeOcNGOvwPd7I1fEVp8lTdMNfj3w,433
52
+ csv_detective/detect_fields/geo/json_geojson/__init__.py,sha256=6wlwlxQmsVIZ21g-THvH3nBj-I8FuoF2sBlZAoEMGiQ,393
53
53
  csv_detective/detect_fields/geo/latitude_wgs/__init__.py,sha256=ArS6PuYEd0atZwSqNDZhXZz1TwzdiwdV8ovRYTOacpg,327
54
- csv_detective/detect_fields/geo/latlon_wgs/__init__.py,sha256=7_mnO9uC_kI7e2WR8xIer7Kqw8zi-v-JKaAD4zcoGbE,342
54
+ csv_detective/detect_fields/geo/latlon_wgs/__init__.py,sha256=IXDTqD4YFUJYI1FYZ5ZfkqXY6KvNY7sgBVFRAvgTHtI,454
55
55
  csv_detective/detect_fields/geo/longitude_wgs/__init__.py,sha256=G7afWOKiGh_Tv7gwDNGt1a4B_A8hkCBkIxn3THDCUFk,330
56
+ csv_detective/detect_fields/geo/lonlat_wgs/__init__.py,sha256=CnBMYevfGdhBvureF3oc_zqT-RZjG419iAuUlugQFLc,454
56
57
  csv_detective/detect_fields/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
57
58
  csv_detective/detect_fields/other/booleen/__init__.py,sha256=wn_yyTAmGxqo0l0b7JRpGb0da_E27iGxES9zWCrnsqc,497
58
59
  csv_detective/detect_fields/other/email/__init__.py,sha256=O9tgJmq0O8Q-8iin63NqEEDhlsUJjxFZNaNFM4GZaws,178
59
60
  csv_detective/detect_fields/other/float/__init__.py,sha256=AT4Kpgwoz5PuAoLx00u0SL8DjjXZxsE8zSRbN18uAv4,578
60
61
  csv_detective/detect_fields/other/int/__init__.py,sha256=QN3kQJLYqLRBiubUK7g4Xq03PlA5wqVwx2pPPIO9FdI,320
61
- csv_detective/detect_fields/other/json/__init__.py,sha256=DhzyvT12kOqgum89silIu3uoSYXmC_s_AaxLtXAD4eU,540
62
+ csv_detective/detect_fields/other/json/__init__.py,sha256=AkRWZAidEM1dWkVRFThEBI5M7kMUu5Yu12iCViGM8lU,310
62
63
  csv_detective/detect_fields/other/money/__init__.py,sha256=g_ZwBZXl9LhldwFYQotC5WqLiE8qQCZHtoI9eJvl_9M,232
63
64
  csv_detective/detect_fields/other/mongo_object_id/__init__.py,sha256=7fcrHsOZAqXp2_N0IjPskYJ_qi4xRlo9iyNNDQVLzsU,156
64
65
  csv_detective/detect_fields/other/percent/__init__.py,sha256=vgpekNOPBRuunoVBXMi81rwHv4uSOhe78pbVtQ5SBO8,177
@@ -71,7 +72,7 @@ csv_detective/detect_fields/temp/datetime_aware/__init__.py,sha256=bEfWvXx_GNCRU
71
72
  csv_detective/detect_fields/temp/datetime_naive/__init__.py,sha256=GtQo55SrrXfoT-L7ZXW63jrlAYvNT5m56wMfhuY3pyI,836
72
73
  csv_detective/detect_fields/temp/datetime_rfc822/__init__.py,sha256=JtUzg3BXYd-XJMLGxQ0P1OAJGOQ7DlYMD4fCU9yndg0,511
73
74
  csv_detective/detect_fields/temp/year/__init__.py,sha256=RjsiIHoplnI4Odi5587TzRhKTDT-FTqGOBpdartuShA,194
74
- csv_detective/detect_labels/__init__.py,sha256=oVq2fiO6QkaWB0wZImL8YVW7oiwPky8ivmLZAFmK55Q,864
75
+ csv_detective/detect_labels/__init__.py,sha256=8vrFUrMc8a_VOC5gvYNMKL-Do_q9eMTrghJRI9Xotvk,883
75
76
  csv_detective/detect_labels/FR/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
76
77
  csv_detective/detect_labels/FR/geo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
77
78
  csv_detective/detect_labels/FR/geo/adresse/__init__.py,sha256=fNWFW-Wo3n6azDBfmi0J0qnzP-p2StLxCc9eNiE9NNE,346
@@ -110,10 +111,11 @@ csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py,sha256=biUZP
110
111
  csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py,sha256=biUZP8gAsVpjXLTx1WeS19qR4ia0pzpi6R69wJgu4B0,348
111
112
  csv_detective/detect_labels/geo/json_geojson/__init__.py,sha256=On8VOCDD0EspZra6fTQCXH4MYao2xmRu-o7xWcab7Jg,355
112
113
  csv_detective/detect_labels/geo/latitude_wgs/__init__.py,sha256=ME_KjniqDSdAwXP7XnKXyr5IA75KrGSLIhvPNfsux6E,664
113
- csv_detective/detect_labels/geo/latlon_wgs/__init__.py,sha256=dbWX1LKpoev7zwWthw9vlwGQp6CSlgYrTBnPpvyNC-A,989
114
+ csv_detective/detect_labels/geo/latlon_wgs/__init__.py,sha256=tDndlFyEM7qKS3ATxp0Xs0FsPsOPpRWhDe1ockbWw8s,923
114
115
  csv_detective/detect_labels/geo/longitude_wgs/__init__.py,sha256=_8IV2FLtrOjzhQNsk-fsgc9-jbAgzKDVMr4tXu2P-s4,429
116
+ csv_detective/detect_labels/geo/lonlat_wgs/__init__.py,sha256=NNKlFcMsKVqnUKEm_4flGxcNUGS2-iS3m6ihQf2AVTk,345
115
117
  csv_detective/detect_labels/other/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
116
- csv_detective/detect_labels/other/booleen/__init__.py,sha256=BZwnfR-Zcv8dqscLrBKhttgwm4Dqq16M0PaGirxYWio,214
118
+ csv_detective/detect_labels/other/booleen/__init__.py,sha256=zEkarex7L4T3vmYjR5hdhtnhugTVDsvkgG_it6nN0aA,214
117
119
  csv_detective/detect_labels/other/email/__init__.py,sha256=Poagn45-eC2a_Wdk5Qs6d2BgYdncCQKZp2yEB50IuNw,431
118
120
  csv_detective/detect_labels/other/float/__init__.py,sha256=X0axZN2GAfC_y01zRfIyvOfRsOy2KNQcQ-mlQAKxqT4,216
119
121
  csv_detective/detect_labels/other/int/__init__.py,sha256=_1AY7thEBCcgSBQQ2YbY4YaPaxGRQ71BtmaFaX088ig,215
@@ -130,7 +132,7 @@ csv_detective/detection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG
130
132
  csv_detective/detection/columns.py,sha256=vfE-DKESA6J9Rfsl-a8tjgZfE21VmzArO5TrbzL0KmE,2905
131
133
  csv_detective/detection/encoding.py,sha256=tpjJEMNM_2TcLXDzn1lNQPnSRnsWYjs83tQ8jNwTj4E,973
132
134
  csv_detective/detection/engine.py,sha256=HiIrU-l9EO5Fbc2Vh8W_Uy5-dpKcQQzlxCqMuWc09LY,1530
133
- csv_detective/detection/formats.py,sha256=3vf7VdjxTmdt5KaTqGBwT5GuZhHuw98R-sIemTcOIJg,6345
135
+ csv_detective/detection/formats.py,sha256=c0LFTWbibWbEJSZaPy_86LIMOY3qRxj-I_agwpb4zbI,6284
134
136
  csv_detective/detection/headers.py,sha256=wrVII2RQpsVmHhrO1DHf3dmiu8kbtOjBlskf41cnQmc,1172
135
137
  csv_detective/detection/rows.py,sha256=3qvsbsBcMxiqqfSYYkOgsRpX777rk22tnRHDwUA97kU,742
136
138
  csv_detective/detection/separator.py,sha256=XjeDBqhiBxVfkCPJKem9BAgJqs_hOgQltc_pxrH_-Tg,1547
@@ -139,28 +141,28 @@ csv_detective/output/__init__.py,sha256=5KTevPfp_4MRxByJyOntQjToNfeG7dPQn-_13wSq
139
141
  csv_detective/output/dataframe.py,sha256=89iQRE59cHQyQQEsujQVIKP2YAUYpPklWkdDOqZE-wE,2183
140
142
  csv_detective/output/example.py,sha256=EdPX1iqHhIG4DsiHuYdy-J7JxOkjgUh_o2D5nrfM5fA,8649
141
143
  csv_detective/output/profile.py,sha256=B8YU541T_YPDezJGh4dkHckOShiwHSrZd9GS8jbmz7A,2919
142
- csv_detective/output/schema.py,sha256=Hpav3RgIP7gOb93h154s1wNSlEZtHNJVzFDDwp54UcQ,13669
143
- csv_detective/output/utils.py,sha256=RcOkFQihwfmEIOD-gwrUKi2r5CwBbs17vkuAf8n7-Wo,2405
144
+ csv_detective/output/schema.py,sha256=yC9K1vw6NUTULNv9a7CaMGns9iXmbzFLbtHI4wegqEc,13812
145
+ csv_detective/output/utils.py,sha256=xPM2KYdqousmjU22-w7HnaF6AR74fj8lhQY77Y9xs7w,3310
144
146
  csv_detective/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
145
- csv_detective/parsing/columns.py,sha256=rLzAU36cHMpVynEPhj8uMdr3IRO3_Yq58Yw7Z6oLPiQ,5693
147
+ csv_detective/parsing/columns.py,sha256=aMdG6-G-2Tj_2JdHotAIveQwaG_r8chGcGieFiUaBRk,5634
146
148
  csv_detective/parsing/compression.py,sha256=Fnw5tj-PpBNI8NYsWj5gD-DUoWcVLnsVpiKm9MpxmIA,350
147
149
  csv_detective/parsing/csv.py,sha256=11mibDnJhIjykXLGZvA5ZEU5U7KgxIrbyO6BNv6jlro,1626
148
150
  csv_detective/parsing/excel.py,sha256=AslE2S1e67o8yTIAIhp-lAnJ6-XqeBBRz1-VMFqhZBM,7055
149
151
  csv_detective/parsing/load.py,sha256=u6fbGFZsL2GwPQRzhAXgt32JpUur7vbQdErREHxNJ-w,3661
150
152
  csv_detective/parsing/text.py,sha256=_TprGi0gHZlRsafizI3dqQhBehZW4BazqxmypMcAZ-o,1824
151
- csv_detective-0.8.1.dev1549.data/data/share/csv_detective/CHANGELOG.md,sha256=1jO_wJx_-DK1TqmdmIu2bmbnvg2iJ2iX78MEb29MZYY,9425
152
- csv_detective-0.8.1.dev1549.data/data/share/csv_detective/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
153
- csv_detective-0.8.1.dev1549.data/data/share/csv_detective/README.md,sha256=gKLFmC8kuCCywS9eAhMak_JNriUWWNOsBKleAu5TIEY,8501
154
- csv_detective-0.8.1.dev1549.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
153
+ csv_detective-0.8.1.dev1599.data/data/share/csv_detective/CHANGELOG.md,sha256=WQ8cTB2D5YkAJ9AsS2ziKtZL8m1sPclGPenTD1BxZ_g,9646
154
+ csv_detective-0.8.1.dev1599.data/data/share/csv_detective/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
155
+ csv_detective-0.8.1.dev1599.data/data/share/csv_detective/README.md,sha256=gKLFmC8kuCCywS9eAhMak_JNriUWWNOsBKleAu5TIEY,8501
156
+ csv_detective-0.8.1.dev1599.dist-info/licenses/LICENSE,sha256=A1dQrzxyxRHRih02KwibWj1khQyF7GeA6SqdOU87Gk4,1088
155
157
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
156
158
  tests/test_example.py,sha256=JeHxSK0IVDcSrOhSZlNGSQv4JAc_r6mzvJM8PfmLTMw,2018
157
- tests/test_fields.py,sha256=tTFOmlb9gewtCwcZV7B6Gc3aH6xXK5kMUFSEBi7iIy4,10638
159
+ tests/test_fields.py,sha256=IwMpjOn8W5kDCvJYp3Cer4m571qomzjupOAvSRFMg_Q,11819
158
160
  tests/test_file.py,sha256=0bHV9wx9mSRoav_DVF19g694yohb1p0bw7rtcBeKG-8,8451
159
161
  tests/test_labels.py,sha256=Nkr645bUewrj8hjNDKr67FQ6Sy_TID6f3E5Kfkl231M,464
160
162
  tests/test_structure.py,sha256=bv-tjgXohvQAxwmxzH0BynFpK2TyPjcxvtIAmIRlZmA,1393
161
163
  tests/test_validation.py,sha256=CTGonR6htxcWF9WH8MxumDD8cF45Y-G4hm94SM4lFjU,3246
162
- csv_detective-0.8.1.dev1549.dist-info/METADATA,sha256=LQkj1jrN7dsdUjGOV3Z8BRKANccZTxdXdoRWtAvOa6w,10443
163
- csv_detective-0.8.1.dev1549.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
164
- csv_detective-0.8.1.dev1549.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
165
- csv_detective-0.8.1.dev1549.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
166
- csv_detective-0.8.1.dev1549.dist-info/RECORD,,
164
+ csv_detective-0.8.1.dev1599.dist-info/METADATA,sha256=NoE1tBjCZxO2uffbH9wSgkuNzOVOgLRA2qkjth7ynyk,10443
165
+ csv_detective-0.8.1.dev1599.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
166
+ csv_detective-0.8.1.dev1599.dist-info/entry_points.txt,sha256=JjweTReFqKJmuvkegzlew2j3D5pZzfxvbEGOtGVGmaY,56
167
+ csv_detective-0.8.1.dev1599.dist-info/top_level.txt,sha256=M0Nv646VHo-49zWjPkwo2C48UmtfddV8_9mEZeIxy8Q,20
168
+ csv_detective-0.8.1.dev1599.dist-info/RECORD,,
tests/test_fields.py CHANGED
@@ -44,6 +44,7 @@ from csv_detective.detect_fields.geo import (
44
44
  latitude_wgs,
45
45
  latlon_wgs,
46
46
  longitude_wgs,
47
+ lonlat_wgs,
47
48
  )
48
49
  from csv_detective.detect_fields.other import (
49
50
  booleen,
@@ -71,6 +72,7 @@ from csv_detective.detection.variables import (
71
72
  )
72
73
  from csv_detective.load_tests import return_all_tests
73
74
  from csv_detective.output.dataframe import cast
75
+ from csv_detective.output.utils import prepare_output_dict
74
76
 
75
77
 
76
78
  def test_all_tests_return_bool():
@@ -263,13 +265,17 @@ fields = {
263
265
  False: ["100"],
264
266
  },
265
267
  latlon_wgs: {
266
- True: ["43.2,-22.6", "-10.7,140", "-40.7, 10.8"],
267
- False: ["0.1,192", "-102, 92"],
268
+ True: ["43.2,-22.6", "-10.7,140", "-40.7, 10.8", "[12,-0.28]"],
269
+ False: ["0.1,192", "-102, 92", "[23.02,4.1", "23.02,4.1]", "160.1,-27"],
268
270
  },
269
271
  longitude_wgs: {
270
272
  True: ["120", "-20.2"],
271
273
  False: ["-200"],
272
274
  },
275
+ lonlat_wgs: {
276
+ True: ["-22.6,43.2", "140,-10.7", "10.8, -40.7", "[-0.28,12]"],
277
+ False: ["192,0.1", "92, -102", "[4.1,23.02", "4.1,23.02]", "-27,160.1"],
278
+ },
273
279
  booleen: {
274
280
  True: ["oui", "0", "1", "yes", "false", "True"],
275
281
  False: ["nein", "ja", "2", "-0"],
@@ -280,7 +286,7 @@ fields = {
280
286
  },
281
287
  json: {
282
288
  True: ['{"pomme": "fruit", "reponse": 42}', "[1,2,3,4]"],
283
- False: ['{"coordinates": [45.783753, 3.049342], "citycode": "63870"}', "{zefib:"],
289
+ False: ["5", '{"zefib":', '{"a"}'],
284
290
  },
285
291
  money: {
286
292
  True: ["120€", "-20.2$"],
@@ -405,3 +411,27 @@ def test_fields_with_values(args):
405
411
  def test_cast(args):
406
412
  value, detected_type, cast_type = args
407
413
  assert isinstance(cast(value, detected_type), cast_type)
414
+
415
+
416
+ @pytest.mark.parametrize(
417
+ "args",
418
+ (
419
+ # there is a specific numerical format => specific wins
420
+ ({"int": 1, "float": 1, "latitude_wgs": 1}, "latitude_wgs"),
421
+ # scores are equal for related formats => priority wins
422
+ ({"int": 1, "float": 1}, "int"),
423
+ # score is lower for priority format => secondary wins
424
+ ({"int": 0.5, "float": 1}, "float"),
425
+ # score is lower for priority format, but is 1 => priority wins
426
+ ({"int": 1, "float": 1.25}, "int"),
427
+ # two rounds of priority => highest priority wins
428
+ ({"latlon_wgs": 1, "lonlat_wgs": 1, "json": 1}, "latlon_wgs"),
429
+ # no detection => default to string
430
+ ({}, "string"),
431
+ ),
432
+ )
433
+ def test_priority(args):
434
+ detections, expected = args
435
+ col = "col1"
436
+ output = prepare_output_dict(pd.DataFrame({col: detections}), limited_output=True)
437
+ assert output[col]["format"] == expected