csv-detective 0.10.1.dev2590__py3-none-any.whl → 0.10.1.dev2616__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. csv_detective/format.py +3 -3
  2. csv_detective/formats/adresse.py +9 -9
  3. csv_detective/formats/binary.py +1 -1
  4. csv_detective/formats/booleen.py +2 -2
  5. csv_detective/formats/code_commune_insee.py +11 -10
  6. csv_detective/formats/code_csp_insee.py +1 -1
  7. csv_detective/formats/code_departement.py +7 -7
  8. csv_detective/formats/code_fantoir.py +5 -5
  9. csv_detective/formats/code_import.py +1 -1
  10. csv_detective/formats/code_postal.py +9 -9
  11. csv_detective/formats/code_region.py +6 -6
  12. csv_detective/formats/code_rna.py +7 -6
  13. csv_detective/formats/code_waldec.py +1 -1
  14. csv_detective/formats/commune.py +5 -5
  15. csv_detective/formats/csp_insee.py +6 -5
  16. csv_detective/formats/date.py +17 -17
  17. csv_detective/formats/date_fr.py +1 -1
  18. csv_detective/formats/datetime_aware.py +1 -1
  19. csv_detective/formats/departement.py +15 -15
  20. csv_detective/formats/email.py +13 -13
  21. csv_detective/formats/float.py +1 -1
  22. csv_detective/formats/geojson.py +9 -10
  23. csv_detective/formats/insee_ape700.py +10 -8
  24. csv_detective/formats/insee_canton.py +6 -6
  25. csv_detective/formats/int.py +1 -1
  26. csv_detective/formats/iso_country_code_alpha2.py +10 -9
  27. csv_detective/formats/iso_country_code_alpha3.py +2 -9
  28. csv_detective/formats/iso_country_code_numeric.py +2 -9
  29. csv_detective/formats/jour_de_la_semaine.py +11 -12
  30. csv_detective/formats/json.py +5 -0
  31. csv_detective/formats/latitude_l93.py +6 -22
  32. csv_detective/formats/latitude_wgs.py +29 -29
  33. csv_detective/formats/latitude_wgs_fr_metropole.py +5 -30
  34. csv_detective/formats/latlon_wgs.py +28 -28
  35. csv_detective/formats/longitude_l93.py +6 -13
  36. csv_detective/formats/longitude_wgs.py +32 -19
  37. csv_detective/formats/longitude_wgs_fr_metropole.py +4 -19
  38. csv_detective/formats/lonlat_wgs.py +11 -11
  39. csv_detective/formats/mois_de_lannee.py +1 -1
  40. csv_detective/formats/money.py +1 -1
  41. csv_detective/formats/mongo_object_id.py +1 -1
  42. csv_detective/formats/pays.py +11 -13
  43. csv_detective/formats/percent.py +1 -1
  44. csv_detective/formats/region.py +13 -13
  45. csv_detective/formats/sexe.py +1 -1
  46. csv_detective/formats/siren.py +8 -10
  47. csv_detective/formats/siret.py +8 -9
  48. csv_detective/formats/tel_fr.py +7 -13
  49. csv_detective/formats/uai.py +17 -18
  50. csv_detective/formats/url.py +16 -16
  51. csv_detective/formats/username.py +1 -1
  52. csv_detective/formats/uuid.py +1 -1
  53. csv_detective/formats/year.py +6 -12
  54. csv_detective/parsing/text.py +13 -12
  55. {csv_detective-0.10.1.dev2590.dist-info → csv_detective-0.10.1.dev2616.dist-info}/METADATA +1 -1
  56. csv_detective-0.10.1.dev2616.dist-info/RECORD +92 -0
  57. {csv_detective-0.10.1.dev2590.dist-info → csv_detective-0.10.1.dev2616.dist-info}/WHEEL +1 -1
  58. csv_detective-0.10.1.dev2590.dist-info/RECORD +0 -92
  59. {csv_detective-0.10.1.dev2590.dist-info → csv_detective-0.10.1.dev2616.dist-info}/entry_points.txt +0 -0
@@ -2,33 +2,17 @@ from frformat import LatitudeL93
2
2
 
3
3
  from csv_detective.formats.float import _is as is_float
4
4
  from csv_detective.formats.float import float_casting
5
+ from csv_detective.formats.latitude_wgs import SHARED_LATITUDE_LABELS
5
6
 
6
7
  proportion = 1
7
8
  tags = ["fr", "geo"]
8
9
  mandatory_label = True
9
10
  python_type = "float"
10
- labels = [
11
- "latitude",
12
- "lat",
13
- "y",
14
- "yf",
15
- "yd",
16
- "y l93",
17
- "coordonnee y",
18
- "latitude lb93",
19
- "coord y",
20
- "ycoord",
21
- "geocodage y gps",
22
- "location latitude",
23
- "ylatitude",
24
- "ylat",
25
- "latitude (y)",
26
- "latitudeorg",
27
- "coordinates.latitude",
28
- "googlemap latitude",
29
- "latitudelieu",
30
- "latitude googlemap",
31
- ]
11
+ labels = SHARED_LATITUDE_LABELS | {
12
+ "y l93": 1,
13
+ "latitude lb93": 1,
14
+ "lamby": 1,
15
+ }
32
16
 
33
17
  _latitudel93 = LatitudeL93()
34
18
 
@@ -1,44 +1,44 @@
1
1
  from csv_detective.formats.float import _is as is_float
2
+ from csv_detective.formats.int import _is as is_int
2
3
 
3
4
  proportion = 1
4
5
  tags = ["geo"]
5
6
  mandatory_label = True
6
7
  python_type = "float"
7
- labels = [
8
- "latitude",
9
- "lat",
10
- "y",
11
- "yf",
12
- "yd",
13
- "coordonnee y",
14
- "coord y",
15
- "ycoord",
16
- "geocodage y gps",
17
- "location latitude",
18
- "ylatitude",
19
- "ylat",
20
- "latitude (y)",
21
- "latitudeorg",
22
- "coordinates.latitude",
23
- "googlemap latitude",
24
- "latitudelieu",
25
- "latitude googlemap",
26
- "latitude wgs84",
27
- "y wgs84",
28
- "latitude (wgs84)",
29
- ]
8
+ SHARED_LATITUDE_LABELS = {
9
+ "latitude": 1,
10
+ "lat": 0.75,
11
+ "y": 0.5,
12
+ "yf": 0.5,
13
+ "yd": 0.5,
14
+ "coordonnee y": 1,
15
+ "coord y": 1,
16
+ "ycoord": 1,
17
+ "ylat": 1,
18
+ }
19
+ labels = SHARED_LATITUDE_LABELS | {
20
+ "y gps": 1,
21
+ "latitude wgs84": 1,
22
+ "y wgs84": 1,
23
+ "wsg": 0.75,
24
+ "gps": 0.5,
25
+ }
30
26
 
31
27
 
32
28
  def _is(val):
33
29
  try:
34
- return is_float(val) and float(val) >= -90 and float(val) <= 90
35
- except ValueError:
36
- return False
37
- except OverflowError:
30
+ return (
31
+ is_float(val)
32
+ and -90 <= float(val) <= 90
33
+ # we ideally would like a certain level of decimal precision
34
+ # but 1.200 is saved as 1.2 in csv so we just discriminate ints
35
+ and not is_int(val)
36
+ )
37
+ except Exception:
38
38
  return False
39
39
 
40
40
 
41
41
  _test_values = {
42
- True: ["43.2", "-22"],
43
- False: ["100"],
42
+ True: ["43.2872", "-22.61", "-3.0"],
43
+ False: ["100.1973", "40"],
44
44
  }
@@ -1,44 +1,19 @@
1
- from csv_detective.formats.float import _is as is_float
1
+ from csv_detective.formats.latitude_wgs import _is as is_latitude, labels # noqa
2
2
 
3
3
  proportion = 1
4
4
  tags = ["fr", "geo"]
5
5
  mandatory_label = True
6
6
  python_type = "float"
7
- labels = [
8
- "latitude",
9
- "lat",
10
- "y",
11
- "yf",
12
- "yd",
13
- "coordonnee y",
14
- "coord y",
15
- "ycoord",
16
- "geocodage y gps",
17
- "location latitude",
18
- "ylatitude",
19
- "ylat",
20
- "latitude (y)",
21
- "latitudeorg",
22
- "coordinates.latitude",
23
- "googlemap latitude",
24
- "latitudelieu",
25
- "latitude googlemap",
26
- "latitude wgs84",
27
- "y wgs84",
28
- "latitude (wgs84)",
29
- ]
30
7
 
31
8
 
32
9
  def _is(val):
33
10
  try:
34
- return is_float(val) and float(val) >= 41.3 and float(val) <= 51.3
35
- except ValueError:
36
- return False
37
- except OverflowError:
11
+ return is_latitude(val) and 41.3 <= float(val) <= 51.3
12
+ except Exception:
38
13
  return False
39
14
 
40
15
 
41
16
  _test_values = {
42
- True: ["42.5"],
43
- False: ["22.5", "62.5"],
17
+ True: ["42.576", "42.5"],
18
+ False: ["22.5"],
44
19
  }
@@ -5,36 +5,36 @@ proportion = 1
5
5
  tags = ["geo"]
6
6
  mandatory_label = True
7
7
 
8
- SHARED_COORDS_LABELS = [
9
- "ban",
10
- "coordinates",
11
- "coordonnees",
12
- "coordonnees insee",
13
- "geo",
14
- "geopoint",
15
- "geoloc",
16
- "geolocalisation",
17
- "geom",
18
- "geometry",
19
- "gps",
20
- "localisation",
21
- "point",
22
- "position",
23
- "wgs84",
24
- ]
25
-
26
- specific = [
27
- "latlon",
28
- "lat lon",
29
- "x y",
30
- "xy",
31
- ]
8
+ SHARED_COORDS_LABELS = {
9
+ "ban": 1,
10
+ "coordinates": 1,
11
+ "coordonnees": 1,
12
+ "coordonnees insee": 1,
13
+ "geo": 0.5,
14
+ "geopoint": 1,
15
+ "geoloc": 1,
16
+ "geolocalisation": 1,
17
+ "geom": 0.75,
18
+ "geometry": 1,
19
+ "gps": 1,
20
+ "localisation": 1,
21
+ "point": 1,
22
+ "position": 1,
23
+ "wgs84": 1,
24
+ }
25
+
26
+ specific = {
27
+ "latlon": 1,
28
+ "lat lon": 1,
29
+ "x y": 0.75,
30
+ "xy": 0.75,
31
+ }
32
32
 
33
33
  # we aim wide to catch exact matches if possible for the highest possible score
34
34
  labels = (
35
35
  SHARED_COORDS_LABELS
36
- + specific
37
- + [w + sep + suf for suf in specific for w in SHARED_COORDS_LABELS for sep in ["", " "]]
36
+ | specific
37
+ | {w + sep + suf: 1 for suf in specific for w in SHARED_COORDS_LABELS for sep in ["", " "]}
38
38
  )
39
39
 
40
40
 
@@ -49,6 +49,6 @@ def _is(val):
49
49
 
50
50
 
51
51
  _test_values = {
52
- True: ["43.2,-22.6", "-10.7,140", "-40.7, 10.8", "[12,-0.28]"],
53
- False: ["0.1,192", "-102, 92", "[23.02,4.1", "23.02,4.1]", "160.1,-27"],
52
+ True: ["43.2,-22.6", "-10.71,140.0", "-40.791, 10.81", "[12.01,-0.28]"],
53
+ False: ["0.1,192", "-102, 92", "[23.02,4.1", "23.02,4.1]", "160.1,-27", "1,2", "43, -23"],
54
54
  }
@@ -2,24 +2,17 @@ from frformat import LongitudeL93
2
2
 
3
3
  from csv_detective.formats.float import _is as is_float
4
4
  from csv_detective.formats.float import float_casting
5
+ from csv_detective.formats.longitude_wgs import SHARED_LONGITUDE_LABELS
5
6
 
6
7
  proportion = 1
7
8
  tags = ["fr", "geo"]
8
9
  mandatory_label = True
9
10
  python_type = "float"
10
- labels = [
11
- "longitude",
12
- "lon",
13
- "long",
14
- "geocodage x gps",
15
- "location longitude",
16
- "xlongitude",
17
- "lng",
18
- "xlong",
19
- "x",
20
- "xf",
21
- "xd",
22
- ]
11
+ labels = SHARED_LONGITUDE_LABELS | {
12
+ "x l93": 1,
13
+ "longitude lb93": 1,
14
+ "lambx": 1,
15
+ }
23
16
 
24
17
  _longitudel93 = LongitudeL93()
25
18
 
@@ -1,34 +1,47 @@
1
1
  from csv_detective.formats.float import _is as is_float
2
+ from csv_detective.formats.int import _is as is_int
2
3
 
3
4
  proportion = 1
4
5
  tags = ["geo"]
5
6
  mandatory_label = True
6
7
  python_type = "float"
7
- labels = [
8
- "longitude",
9
- "lon",
10
- "long",
11
- "geocodage x gps",
12
- "location longitude",
13
- "xlongitude",
14
- "lng",
15
- "xlong",
16
- "x",
17
- "xf",
18
- "xd",
19
- ]
8
+ SHARED_LONGITUDE_LABELS = {
9
+ "longitude": 1,
10
+ "long": 0.75,
11
+ "lon": 0.75,
12
+ "lng": 0.5,
13
+ "x": 0.5,
14
+ "xf": 0.5,
15
+ "xd": 0.5,
16
+ "coordonnee x": 1,
17
+ "coord x": 1,
18
+ "xcoord": 1,
19
+ "xlon": 1,
20
+ "xlong": 1,
21
+ }
22
+ labels = SHARED_LONGITUDE_LABELS | {
23
+ "x gps": 1,
24
+ "longitude wgs84": 1,
25
+ "x wgs84": 1,
26
+ "wsg": 0.75,
27
+ "gps": 0.5,
28
+ }
20
29
 
21
30
 
22
31
  def _is(val):
23
32
  try:
24
- return is_float(val) and float(val) >= -180 and float(val) <= 180
25
- except ValueError:
26
- return False
27
- except OverflowError:
33
+ return (
34
+ is_float(val)
35
+ and -180 <= float(val) <= 180
36
+ # we ideally would like a certain level of decimal precision
37
+ # but 1.200 is saved as 1.2 in csv so we just discriminate ints
38
+ and not is_int(val)
39
+ )
40
+ except Exception:
28
41
  return False
29
42
 
30
43
 
31
44
  _test_values = {
32
- True: ["120", "-20.2"],
33
- False: ["-200"],
45
+ True: ["120.8263", "-20.27", "31.0"],
46
+ False: ["-200", "20"],
34
47
  }
@@ -1,34 +1,19 @@
1
- from csv_detective.formats.float import _is as is_float
1
+ from csv_detective.formats.longitude_wgs import _is as is_longitude, labels # noqa
2
2
 
3
3
  proportion = 1
4
4
  tags = ["fr", "geo"]
5
5
  mandatory_label = True
6
6
  python_type = "float"
7
- labels = [
8
- "longitude",
9
- "lon",
10
- "long",
11
- "geocodage x gps",
12
- "location longitude",
13
- "xlongitude",
14
- "lng",
15
- "xlong",
16
- "x",
17
- "xf",
18
- "xd",
19
- ]
20
7
 
21
8
 
22
9
  def _is(val):
23
10
  try:
24
- return is_float(val) and float(val) >= -5.5 and float(val) <= 9.8
25
- except ValueError:
26
- return False
27
- except OverflowError:
11
+ return is_longitude(val) and -5.5 <= float(val) <= 9.8
12
+ except Exception:
28
13
  return False
29
14
 
30
15
 
31
16
  _test_values = {
32
- True: ["-2.5"],
17
+ True: ["-2.01", "8.0"],
33
18
  False: ["12.8"],
34
19
  }
@@ -6,18 +6,18 @@ proportion = 1
6
6
  tags = ["geo"]
7
7
  mandatory_label = True
8
8
 
9
- specific = [
10
- "lonlat",
11
- "lon lat",
12
- "y x",
13
- "yx",
14
- ]
9
+ specific = {
10
+ "lonlat": 1,
11
+ "lon lat": 1,
12
+ "y x": 0.75,
13
+ "yx": 0.75,
14
+ }
15
15
 
16
16
  # we aim wide to catch exact matches if possible for the highest possible score
17
- words = (
17
+ labels = (
18
18
  SHARED_COORDS_LABELS
19
- + specific
20
- + [w + sep + suf for suf in specific for w in SHARED_COORDS_LABELS for sep in ["", " "]]
19
+ | specific
20
+ | {w + sep + suf: 1 for suf in specific for w in SHARED_COORDS_LABELS for sep in ["", " "]}
21
21
  )
22
22
 
23
23
 
@@ -32,6 +32,6 @@ def _is(val):
32
32
 
33
33
 
34
34
  _test_values = {
35
- True: ["-22.6,43.2", "140,-10.7", "10.8, -40.7", "[-0.28,12]"],
36
- False: ["192,0.1", "92, -102", "[4.1,23.02", "4.1,23.02]", "-27,160.1"],
35
+ True: ["-22.6,43.012", "140.0,-10.70", "10.829, -40.71", "[-0.28,12.43]"],
36
+ False: ["192,0.1", "92, -102", "[4.1,23.02", "4.1,23.02]", "-27,160.1", "2,4", "-22, 43.0"],
37
37
  }
@@ -2,7 +2,7 @@ from unidecode import unidecode
2
2
 
3
3
  proportion = 1
4
4
  tags = ["fr", "temp"]
5
- labels = ["mois", "month"]
5
+ labels = {"mois": 1, "month": 1}
6
6
 
7
7
  mois = {
8
8
  "janvier",
@@ -1,7 +1,7 @@
1
1
  from csv_detective.formats.float import _is as is_float
2
2
 
3
3
  proportion = 0.8
4
- labels = ["budget", "salaire", "euro", "euros", "prêt", "montant"]
4
+ labels = {"budget": 1, "salaire": 1, "euro": 1, "euros": 1, "prêt": 1, "montant": 1}
5
5
 
6
6
  currencies = {"€", "$", "£", "¥"}
7
7
 
@@ -1,7 +1,7 @@
1
1
  import re
2
2
 
3
3
  proportion = 0.8
4
- labels = ["id", "objectid"]
4
+ labels = {"id": 1, "objectid": 1}
5
5
 
6
6
 
7
7
  def _is(val):
@@ -2,19 +2,17 @@ from frformat import Millesime, Options, Pays
2
2
 
3
3
  proportion = 0.6
4
4
  tags = ["fr", "geo"]
5
- labels = [
6
- "pays",
7
- "payslieu",
8
- "paysorg",
9
- "country",
10
- "pays lib",
11
- "lieupays",
12
- "pays beneficiaire",
13
- "nom du pays",
14
- "journey start country",
15
- "libelle pays",
16
- "journey end country",
17
- ]
5
+ labels = {
6
+ "pays": 1,
7
+ "payslieu": 1,
8
+ "paysorg": 1,
9
+ "country": 1,
10
+ "pays lib": 1,
11
+ "lieupays": 1,
12
+ "pays beneficiaire": 1,
13
+ "nom du pays": 1,
14
+ "libelle pays": 1,
15
+ }
18
16
 
19
17
  _options = Options(
20
18
  ignore_case=True,
@@ -1,7 +1,7 @@
1
1
  from csv_detective.formats.float import _is as is_float
2
2
 
3
3
  proportion = 0.8
4
- labels = []
4
+ labels = {"pourcent": 1, "part": 0.75, "pct": 0.75}
5
5
 
6
6
 
7
7
  def _is(val):
@@ -2,19 +2,19 @@ from frformat import Millesime, Options, Region
2
2
 
3
3
  proportion = 1
4
4
  tags = ["fr", "geo"]
5
- labels = [
6
- "region",
7
- "libelle region",
8
- "nom region",
9
- "libelle reg",
10
- "nom reg",
11
- "reg libusage",
12
- "nom de la region",
13
- "regionorg",
14
- "regionlieu",
15
- "reg",
16
- "nom officiel region",
17
- ]
5
+ labels = {
6
+ "region": 1,
7
+ "libelle region": 1,
8
+ "nom region": 1,
9
+ "libelle reg": 1,
10
+ "nom reg": 1,
11
+ "reg libusage": 1,
12
+ "nom de la region": 1,
13
+ "regionorg": 1,
14
+ "regionlieu": 1,
15
+ "reg": 0.5,
16
+ "nom officiel region": 1,
17
+ }
18
18
 
19
19
  _extra_valid_values_set = frozenset(
20
20
  {
@@ -2,7 +2,7 @@ from csv_detective.parsing.text import _process_text
2
2
 
3
3
  proportion = 1
4
4
  tags = ["fr"]
5
- labels = ["sexe", "sex", "civilite", "genre", "id sexe"]
5
+ labels = {"sexe": 1, "sex": 1, "civilite": 1, "genre": 1}
6
6
 
7
7
 
8
8
  def _is(val):
@@ -3,16 +3,14 @@ import re
3
3
  proportion = 0.9
4
4
  tags = ["fr"]
5
5
  mandatory_label = True
6
- labels = [
7
- "siren",
8
- "siren organisme designe",
9
- "siren organisme designant",
10
- " siren",
11
- "siren organisme",
12
- "siren titulaire",
13
- "numero siren",
14
- "epci",
15
- ]
6
+ labels = {
7
+ "siren": 1,
8
+ "siren": 1,
9
+ "siren organisme": 1,
10
+ "siren titulaire": 1,
11
+ "numero siren": 1,
12
+ "epci": 1,
13
+ }
16
14
 
17
15
 
18
16
  def _is(val):
@@ -3,15 +3,14 @@ import re
3
3
  proportion = 0.8
4
4
  tags = ["fr"]
5
5
  mandatory_label = True
6
- labels = [
7
- "siret",
8
- "siret d",
9
- "num siret",
10
- "siretacheteur",
11
- " siret",
12
- "coll siret",
13
- "epci",
14
- ]
6
+ labels = {
7
+ "siret": 1,
8
+ "num siret": 1,
9
+ "siretacheteur": 1,
10
+ "n° siret": 1,
11
+ "coll siret": 1,
12
+ "epci": 1,
13
+ }
15
14
 
16
15
 
17
16
  def _is(val):
@@ -2,19 +2,13 @@ import re
2
2
 
3
3
  proportion = 0.7
4
4
  tags = ["fr"]
5
- labels = [
6
- "telephone",
7
- "tel",
8
- "tel1",
9
- "tel2",
10
- "phone",
11
- "num tel",
12
- "tel mob",
13
- "telephone sav",
14
- "telephone1",
15
- "coordinates.phone",
16
- "telephone du lieu",
17
- ]
5
+ labels = {
6
+ "telephone": 1,
7
+ "tel": 1,
8
+ "phone": 1,
9
+ "num tel": 1,
10
+ "tel mob": 1,
11
+ }
18
12
 
19
13
 
20
14
  def _is(val):
@@ -2,24 +2,23 @@ import re
2
2
 
3
3
  proportion = 0.8
4
4
  tags = ["fr"]
5
- labels = [
6
- "uai",
7
- "code etablissement",
8
- "code uai",
9
- "uai - identifiant",
10
- "numero uai",
11
- "rne",
12
- "numero de l'etablissement",
13
- "code rne",
14
- "codeetab",
15
- "code uai de l'etablissement",
16
- "ref uai",
17
- "cd rne",
18
- "numerouai",
19
- "numero d etablissement",
20
- "code etablissement",
21
- "numero etablissement",
22
- ]
5
+ labels = {
6
+ "uai": 1,
7
+ "code etablissement": 1,
8
+ "code uai": 1,
9
+ "uai - identifiant": 1,
10
+ "numero uai": 1,
11
+ "rne": 0.75,
12
+ "numero de l'etablissement": 1,
13
+ "code rne": 1,
14
+ "codeetab": 1,
15
+ "code uai de l'etablissement": 1,
16
+ "ref uai": 1,
17
+ "cd rne": 1,
18
+ "numerouai": 1,
19
+ "numero d etablissement": 1,
20
+ "numero etablissement": 1,
21
+ }
23
22
 
24
23
 
25
24
  def _is(val):