csv-detective 0.10.1__py3-none-any.whl → 0.10.1.dev2576__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. csv_detective/detection/formats.py +38 -11
  2. csv_detective/format.py +4 -11
  3. csv_detective/formats/adresse.py +9 -9
  4. csv_detective/formats/binary.py +1 -2
  5. csv_detective/formats/booleen.py +2 -3
  6. csv_detective/formats/code_commune_insee.py +10 -12
  7. csv_detective/formats/code_csp_insee.py +1 -1
  8. csv_detective/formats/code_departement.py +7 -8
  9. csv_detective/formats/code_fantoir.py +5 -6
  10. csv_detective/formats/code_import.py +1 -1
  11. csv_detective/formats/code_postal.py +9 -10
  12. csv_detective/formats/code_region.py +6 -7
  13. csv_detective/formats/code_rna.py +6 -7
  14. csv_detective/formats/code_waldec.py +1 -1
  15. csv_detective/formats/commune.py +5 -5
  16. csv_detective/formats/csp_insee.py +5 -6
  17. csv_detective/formats/data/insee_ape700.txt +1 -1
  18. csv_detective/formats/data/iso_country_code_alpha2.txt +397 -153
  19. csv_detective/formats/data/iso_country_code_alpha3.txt +132 -132
  20. csv_detective/formats/data/iso_country_code_numeric.txt +94 -94
  21. csv_detective/formats/date.py +17 -18
  22. csv_detective/formats/date_fr.py +1 -1
  23. csv_detective/formats/datetime_aware.py +2 -7
  24. csv_detective/formats/datetime_naive.py +0 -3
  25. csv_detective/formats/datetime_rfc822.py +0 -1
  26. csv_detective/formats/departement.py +15 -15
  27. csv_detective/formats/email.py +13 -13
  28. csv_detective/formats/float.py +1 -2
  29. csv_detective/formats/geojson.py +10 -10
  30. csv_detective/formats/insee_ape700.py +8 -10
  31. csv_detective/formats/insee_canton.py +6 -6
  32. csv_detective/formats/int.py +1 -2
  33. csv_detective/formats/iso_country_code_alpha2.py +14 -14
  34. csv_detective/formats/iso_country_code_alpha3.py +13 -6
  35. csv_detective/formats/iso_country_code_numeric.py +9 -2
  36. csv_detective/formats/jour_de_la_semaine.py +12 -11
  37. csv_detective/formats/json.py +0 -6
  38. csv_detective/formats/latitude_l93.py +22 -8
  39. csv_detective/formats/latitude_wgs.py +29 -31
  40. csv_detective/formats/latitude_wgs_fr_metropole.py +30 -7
  41. csv_detective/formats/latlon_wgs.py +28 -30
  42. csv_detective/formats/longitude_l93.py +13 -8
  43. csv_detective/formats/longitude_wgs.py +19 -34
  44. csv_detective/formats/longitude_wgs_fr_metropole.py +19 -6
  45. csv_detective/formats/lonlat_wgs.py +11 -12
  46. csv_detective/formats/mois_de_lannee.py +1 -1
  47. csv_detective/formats/money.py +1 -1
  48. csv_detective/formats/mongo_object_id.py +1 -1
  49. csv_detective/formats/pays.py +13 -11
  50. csv_detective/formats/percent.py +1 -1
  51. csv_detective/formats/region.py +13 -13
  52. csv_detective/formats/sexe.py +1 -1
  53. csv_detective/formats/siren.py +10 -9
  54. csv_detective/formats/siret.py +9 -9
  55. csv_detective/formats/tel_fr.py +13 -7
  56. csv_detective/formats/uai.py +18 -17
  57. csv_detective/formats/url.py +16 -16
  58. csv_detective/formats/username.py +1 -1
  59. csv_detective/formats/uuid.py +1 -1
  60. csv_detective/formats/year.py +12 -7
  61. csv_detective/output/dataframe.py +1 -6
  62. csv_detective/output/profile.py +1 -5
  63. csv_detective/parsing/text.py +12 -13
  64. {csv_detective-0.10.1.dist-info → csv_detective-0.10.1.dev2576.dist-info}/METADATA +2 -2
  65. csv_detective-0.10.1.dev2576.dist-info/RECORD +92 -0
  66. {csv_detective-0.10.1.dist-info → csv_detective-0.10.1.dev2576.dist-info}/WHEEL +1 -1
  67. csv_detective-0.10.1.dist-info/RECORD +0 -92
  68. {csv_detective-0.10.1.dist-info → csv_detective-0.10.1.dev2576.dist-info}/entry_points.txt +0 -0
@@ -2,17 +2,22 @@ from frformat import LongitudeL93
2
2
 
3
3
  from csv_detective.formats.float import _is as is_float
4
4
  from csv_detective.formats.float import float_casting
5
- from csv_detective.formats.longitude_wgs import SHARED_LONGITUDE_LABELS
6
5
 
7
6
  proportion = 1
8
7
  tags = ["fr", "geo"]
9
- mandatory_label = True
10
- python_type = "float"
11
- labels = SHARED_LONGITUDE_LABELS | {
12
- "x l93": 1,
13
- "longitude lb93": 1,
14
- "lambx": 1,
15
- }
8
+ labels = [
9
+ "longitude",
10
+ "lon",
11
+ "long",
12
+ "geocodage x gps",
13
+ "location longitude",
14
+ "xlongitude",
15
+ "lng",
16
+ "xlong",
17
+ "x",
18
+ "xf",
19
+ "xd",
20
+ ]
16
21
 
17
22
  _longitudel93 = LongitudeL93()
18
23
 
@@ -1,47 +1,32 @@
1
1
  from csv_detective.formats.float import _is as is_float
2
- from csv_detective.formats.int import _is as is_int
3
2
 
4
3
  proportion = 1
5
4
  tags = ["geo"]
6
- mandatory_label = True
7
- python_type = "float"
8
- SHARED_LONGITUDE_LABELS = {
9
- "longitude": 1,
10
- "long": 0.75,
11
- "lon": 0.75,
12
- "lng": 0.5,
13
- "x": 0.5,
14
- "xf": 0.5,
15
- "xd": 0.5,
16
- "coordonnee x": 1,
17
- "coord x": 1,
18
- "xcoord": 1,
19
- "xlon": 1,
20
- "xlong": 1,
21
- }
22
- labels = SHARED_LONGITUDE_LABELS | {
23
- "x gps": 1,
24
- "longitude wgs84": 1,
25
- "x wgs84": 1,
26
- "wsg": 0.75,
27
- "gps": 0.5,
28
- }
5
+ labels = [
6
+ "longitude",
7
+ "lon",
8
+ "long",
9
+ "geocodage x gps",
10
+ "location longitude",
11
+ "xlongitude",
12
+ "lng",
13
+ "xlong",
14
+ "x",
15
+ "xf",
16
+ "xd",
17
+ ]
29
18
 
30
19
 
31
20
  def _is(val):
32
21
  try:
33
- return (
34
- is_float(val)
35
- and -180 <= float(val) <= 180
36
- # we ideally would like a certain level of decimal precision
37
- # but 1.200 is saved as 1.2 in csv so we just discriminate ints
38
- and not is_int(val)
39
- )
40
- except Exception:
22
+ return is_float(val) and float(val) >= -180 and float(val) <= 180
23
+ except ValueError:
24
+ return False
25
+ except OverflowError:
41
26
  return False
42
27
 
43
28
 
44
29
  _test_values = {
45
- True: ["120.8263", "-20.27", "31.0"],
46
- False: ["-200", "20"],
30
+ True: ["120", "-20.2"],
31
+ False: ["-200"],
47
32
  }
@@ -1,19 +1,32 @@
1
- from csv_detective.formats.longitude_wgs import _is as is_longitude, labels # noqa
1
+ from csv_detective.formats.float import _is as is_float
2
2
 
3
3
  proportion = 1
4
4
  tags = ["fr", "geo"]
5
- mandatory_label = True
6
- python_type = "float"
5
+ labels = [
6
+ "longitude",
7
+ "lon",
8
+ "long",
9
+ "geocodage x gps",
10
+ "location longitude",
11
+ "xlongitude",
12
+ "lng",
13
+ "xlong",
14
+ "x",
15
+ "xf",
16
+ "xd",
17
+ ]
7
18
 
8
19
 
9
20
  def _is(val):
10
21
  try:
11
- return is_longitude(val) and -5.5 <= float(val) <= 9.8
12
- except Exception:
22
+ return is_float(val) and float(val) >= -5.5 and float(val) <= 9.8
23
+ except ValueError:
24
+ return False
25
+ except OverflowError:
13
26
  return False
14
27
 
15
28
 
16
29
  _test_values = {
17
- True: ["-2.01", "8.0"],
30
+ True: ["-2.5"],
18
31
  False: ["12.8"],
19
32
  }
@@ -4,20 +4,19 @@ from csv_detective.formats.longitude_wgs import _is as is_lon
4
4
 
5
5
  proportion = 1
6
6
  tags = ["geo"]
7
- mandatory_label = True
8
7
 
9
- specific = {
10
- "lonlat": 1,
11
- "lon lat": 1,
12
- "y x": 0.75,
13
- "yx": 0.75,
14
- }
8
+ specific = [
9
+ "lonlat",
10
+ "lon lat",
11
+ "y x",
12
+ "yx",
13
+ ]
15
14
 
16
15
  # we aim wide to catch exact matches if possible for the highest possible score
17
- labels = (
16
+ words = (
18
17
  SHARED_COORDS_LABELS
19
- | specific
20
- | {w + sep + suf: 1 for suf in specific for w in SHARED_COORDS_LABELS for sep in ["", " "]}
18
+ + specific
19
+ + [w + sep + suf for suf in specific for w in SHARED_COORDS_LABELS for sep in ["", " "]]
21
20
  )
22
21
 
23
22
 
@@ -32,6 +31,6 @@ def _is(val):
32
31
 
33
32
 
34
33
  _test_values = {
35
- True: ["-22.6,43.012", "140.0,-10.70", "10.829, -40.71", "[-0.28,12.43]"],
36
- False: ["192,0.1", "92, -102", "[4.1,23.02", "4.1,23.02]", "-27,160.1", "2,4", "-22, 43.0"],
34
+ True: ["-22.6,43.2", "140,-10.7", "10.8, -40.7", "[-0.28,12]"],
35
+ False: ["192,0.1", "92, -102", "[4.1,23.02", "4.1,23.02]", "-27,160.1"],
37
36
  }
@@ -2,7 +2,7 @@ from unidecode import unidecode
2
2
 
3
3
  proportion = 1
4
4
  tags = ["fr", "temp"]
5
- labels = {"mois": 1, "month": 1}
5
+ labels = ["mois", "month"]
6
6
 
7
7
  mois = {
8
8
  "janvier",
@@ -1,7 +1,7 @@
1
1
  from csv_detective.formats.float import _is as is_float
2
2
 
3
3
  proportion = 0.8
4
- labels = {"budget": 1, "salaire": 1, "euro": 1, "euros": 1, "prêt": 1, "montant": 1}
4
+ labels = ["budget", "salaire", "euro", "euros", "prêt", "montant"]
5
5
 
6
6
  currencies = {"€", "$", "£", "¥"}
7
7
 
@@ -1,7 +1,7 @@
1
1
  import re
2
2
 
3
3
  proportion = 0.8
4
- labels = {"id": 1, "objectid": 1}
4
+ labels = ["id", "objectid"]
5
5
 
6
6
 
7
7
  def _is(val):
@@ -2,17 +2,19 @@ from frformat import Millesime, Options, Pays
2
2
 
3
3
  proportion = 0.6
4
4
  tags = ["fr", "geo"]
5
- labels = {
6
- "pays": 1,
7
- "payslieu": 1,
8
- "paysorg": 1,
9
- "country": 1,
10
- "pays lib": 1,
11
- "lieupays": 1,
12
- "pays beneficiaire": 1,
13
- "nom du pays": 1,
14
- "libelle pays": 1,
15
- }
5
+ labels = [
6
+ "pays",
7
+ "payslieu",
8
+ "paysorg",
9
+ "country",
10
+ "pays lib",
11
+ "lieupays",
12
+ "pays beneficiaire",
13
+ "nom du pays",
14
+ "journey start country",
15
+ "libelle pays",
16
+ "journey end country",
17
+ ]
16
18
 
17
19
  _options = Options(
18
20
  ignore_case=True,
@@ -1,7 +1,7 @@
1
1
  from csv_detective.formats.float import _is as is_float
2
2
 
3
3
  proportion = 0.8
4
- labels = {"pourcent": 1, "part": 0.75, "pct": 0.75}
4
+ labels = []
5
5
 
6
6
 
7
7
  def _is(val):
@@ -2,19 +2,19 @@ from frformat import Millesime, Options, Region
2
2
 
3
3
  proportion = 1
4
4
  tags = ["fr", "geo"]
5
- labels = {
6
- "region": 1,
7
- "libelle region": 1,
8
- "nom region": 1,
9
- "libelle reg": 1,
10
- "nom reg": 1,
11
- "reg libusage": 1,
12
- "nom de la region": 1,
13
- "regionorg": 1,
14
- "regionlieu": 1,
15
- "reg": 0.5,
16
- "nom officiel region": 1,
17
- }
5
+ labels = [
6
+ "region",
7
+ "libelle region",
8
+ "nom region",
9
+ "libelle reg",
10
+ "nom reg",
11
+ "reg libusage",
12
+ "nom de la region",
13
+ "regionorg",
14
+ "regionlieu",
15
+ "reg",
16
+ "nom officiel region",
17
+ ]
18
18
 
19
19
  _extra_valid_values_set = frozenset(
20
20
  {
@@ -2,7 +2,7 @@ from csv_detective.parsing.text import _process_text
2
2
 
3
3
  proportion = 1
4
4
  tags = ["fr"]
5
- labels = {"sexe": 1, "sex": 1, "civilite": 1, "genre": 1}
5
+ labels = ["sexe", "sex", "civilite", "genre", "id sexe"]
6
6
 
7
7
 
8
8
  def _is(val):
@@ -2,15 +2,16 @@ import re
2
2
 
3
3
  proportion = 0.9
4
4
  tags = ["fr"]
5
- mandatory_label = True
6
- labels = {
7
- "siren": 1,
8
- " siren": 1,
9
- "siren organisme": 1,
10
- "siren titulaire": 1,
11
- "numero siren": 1,
12
- "epci": 1,
13
- }
5
+ labels = [
6
+ "siren",
7
+ "siren organisme designe",
8
+ "siren organisme designant",
9
+ "siren",
10
+ "siren organisme",
11
+ "siren titulaire",
12
+ "numero siren",
13
+ "epci",
14
+ ]
14
15
 
15
16
 
16
17
  def _is(val):
@@ -2,15 +2,15 @@ import re
2
2
 
3
3
  proportion = 0.8
4
4
  tags = ["fr"]
5
- mandatory_label = True
6
- labels = {
7
- "siret": 1,
8
- "num siret": 1,
9
- "siretacheteur": 1,
10
- "n° siret": 1,
11
- "coll siret": 1,
12
- "epci": 1,
13
- }
5
+ labels = [
6
+ "siret",
7
+ "siret d",
8
+ "num siret",
9
+ "siretacheteur",
10
+ "n° siret",
11
+ "coll siret",
12
+ "epci",
13
+ ]
14
14
 
15
15
 
16
16
  def _is(val):
@@ -2,13 +2,19 @@ import re
2
2
 
3
3
  proportion = 0.7
4
4
  tags = ["fr"]
5
- labels = {
6
- "telephone": 1,
7
- "tel": 1,
8
- "phone": 1,
9
- "num tel": 1,
10
- "tel mob": 1,
11
- }
5
+ labels = [
6
+ "telephone",
7
+ "tel",
8
+ "tel1",
9
+ "tel2",
10
+ "phone",
11
+ "num tel",
12
+ "tel mob",
13
+ "telephone sav",
14
+ "telephone1",
15
+ "coordinates.phone",
16
+ "telephone du lieu",
17
+ ]
12
18
 
13
19
 
14
20
  def _is(val):
@@ -2,23 +2,24 @@ import re
2
2
 
3
3
  proportion = 0.8
4
4
  tags = ["fr"]
5
- labels = {
6
- "uai": 1,
7
- "code etablissement": 1,
8
- "code uai": 1,
9
- "uai - identifiant": 1,
10
- "numero uai": 1,
11
- "rne": 0.75,
12
- "numero de l'etablissement": 1,
13
- "code rne": 1,
14
- "codeetab": 1,
15
- "code uai de l'etablissement": 1,
16
- "ref uai": 1,
17
- "cd rne": 1,
18
- "numerouai": 1,
19
- "numero d etablissement": 1,
20
- "numero etablissement": 1,
21
- }
5
+ labels = [
6
+ "uai",
7
+ "code etablissement",
8
+ "code uai",
9
+ "uai - identifiant",
10
+ "numero uai",
11
+ "rne",
12
+ "numero de l'etablissement",
13
+ "code rne",
14
+ "codeetab",
15
+ "code uai de l'etablissement",
16
+ "ref uai",
17
+ "cd rne",
18
+ "numerouai",
19
+ "numero d etablissement",
20
+ "code etablissement",
21
+ "numero etablissement",
22
+ ]
22
23
 
23
24
 
24
25
  def _is(val):
@@ -1,22 +1,22 @@
1
1
  import re
2
2
 
3
3
  proportion = 1
4
- labels = {
5
- "url": 1,
6
- "url source": 1,
7
- "site web": 1,
8
- "source url": 1,
9
- "site internet": 1,
10
- "remote url": 1,
11
- "web": 1,
12
- "site": 1,
13
- "lien": 1,
14
- "site data": 1,
15
- "lien url": 1,
16
- "lien vers le fichier": 1,
17
- "sitweb": 1,
18
- "interneturl": 1,
19
- }
4
+ labels = [
5
+ "url",
6
+ "url source",
7
+ "site web",
8
+ "source url",
9
+ "site internet",
10
+ "remote url",
11
+ "web",
12
+ "site",
13
+ "lien",
14
+ "site data",
15
+ "lien url",
16
+ "lien vers le fichier",
17
+ "sitweb",
18
+ "interneturl",
19
+ ]
20
20
 
21
21
  pattern = re.compile(
22
22
  r"^((https?|ftp)://|www\.)(([A-Za-z0-9-]+\.)+[A-Za-z]{2,6})"
@@ -1,7 +1,7 @@
1
1
  import re
2
2
 
3
3
  proportion = 1
4
- labels = {"account": 1, "username": 1, "user": 0.75}
4
+ labels = ["account", "username", "user"]
5
5
 
6
6
 
7
7
  def _is(val):
@@ -1,7 +1,7 @@
1
1
  import re
2
2
 
3
3
  proportion = 0.8
4
- labels = {"id": 1, "identifiant": 1}
4
+ labels = ["id", "identifiant"]
5
5
 
6
6
 
7
7
  def _is(val) -> bool:
@@ -1,12 +1,17 @@
1
1
  proportion = 1
2
2
  tags = ["temp"]
3
- python_type = "int"
4
- labels = {
5
- "year": 1,
6
- "annee": 1,
7
- "naissance": 1,
8
- "exercice": 1,
9
- }
3
+ labels = [
4
+ "year",
5
+ "annee",
6
+ "annee depot",
7
+ "an nais",
8
+ "exercice",
9
+ "data year",
10
+ "annee de publication",
11
+ "exercice comptable",
12
+ "annee de naissance",
13
+ "annee ouverture",
14
+ ]
10
15
 
11
16
 
12
17
  def _is(val):
@@ -13,16 +13,11 @@ from csv_detective.parsing.csv import CHUNK_SIZE
13
13
  from csv_detective.utils import display_logs_depending_process_time
14
14
 
15
15
 
16
- def cast(value: str, _type: str) -> str | int | float | bool | date | datetime | bytes | None:
16
+ def cast(value: str, _type: str) -> str | float | bool | date | datetime | bytes | None:
17
17
  if not isinstance(value, str) or not value:
18
18
  # None is the current default value in hydra, should we keep this?
19
19
  return None
20
20
  match _type:
21
- case "string":
22
- # not used here, convenience for external use (cc hydra)
23
- return value
24
- case "int":
25
- return int(value)
26
21
  case "float":
27
22
  return float_casting(value)
28
23
  case "bool":
@@ -81,11 +81,7 @@ def create_profile(
81
81
  del cast_col
82
82
  # for all formats we want most frequent values, nb unique values and nb missing values
83
83
  tops_bruts = (
84
- (
85
- table[c].value_counts()
86
- if _col_values is None
87
- else (s := _col_values[c]).loc[s.index.notna()].sort_values(ascending=False)
88
- )
84
+ (table[c].value_counts() if _col_values is None else _col_values[c].sort_values())
89
85
  .reset_index(name=_count_col)
90
86
  .iloc[:10]
91
87
  .to_dict(orient="records")
@@ -36,22 +36,21 @@ def is_word_in_string(word: str, string: str):
36
36
  return len(word) > 2 and word in string
37
37
 
38
38
 
39
- def header_score(header: str, valid_headers: dict[str, float]) -> float:
39
+ def header_score(header: str, words_combinations_list: list[str]) -> float:
40
40
  """Returns:
41
- - the valid header's credibility if the header is exactly in the valid list
42
- - 0.5*credibility if any of the words is within the valid list
41
+ - 1 if the header is exactly in the specified list
42
+ - 0.5 if any of the words is within the header
43
43
  - 0 otherwise"""
44
44
  processed_header = _process_text(header)
45
45
 
46
- header_matches_valid = max(
47
- (valid == processed_header) * credibility for valid, credibility in valid_headers.items()
46
+ header_matches_words_combination = float(
47
+ any(words_combination == processed_header for words_combination in words_combinations_list)
48
48
  )
49
-
50
- return max(
51
- header_matches_valid,
52
- 0.5
53
- * max(
54
- is_word_in_string(valid, processed_header) * credibility
55
- for valid, credibility in valid_headers.items()
56
- ),
49
+ words_combination_in_header = 0.5 * (
50
+ any(
51
+ is_word_in_string(words_combination, processed_header)
52
+ for words_combination in words_combinations_list
53
+ )
57
54
  )
55
+
56
+ return max(header_matches_words_combination, words_combination_in_header)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: csv-detective
3
- Version: 0.10.1
3
+ Version: 0.10.1.dev2576
4
4
  Summary: Detect tabular files column content
5
5
  Keywords: CSV,data processing,encoding,guess,parser,tabular
6
6
  Author: data.gouv.fr
@@ -24,7 +24,7 @@ Requires-Dist: pytest>=8.3.0 ; extra == 'dev'
24
24
  Requires-Dist: responses>=0.25.0 ; extra == 'dev'
25
25
  Requires-Dist: ruff>=0.9.3 ; extra == 'dev'
26
26
  Requires-Python: >=3.10, <3.15
27
- Project-URL: Source, https://github.com/datagouv/csv-detective
27
+ Project-URL: Source, https://github.com/datagouv/csv_detective
28
28
  Provides-Extra: dev
29
29
  Description-Content-Type: text/markdown
30
30