csv-detective 0.9.3.dev2258__py3-none-any.whl → 0.9.3.dev2348__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (179) hide show
  1. csv_detective/detection/formats.py +12 -15
  2. csv_detective/explore_csv.py +28 -9
  3. csv_detective/format.py +67 -0
  4. csv_detective/formats/__init__.py +9 -0
  5. csv_detective/{detect_fields/FR/geo/adresse/__init__.py → formats/adresse.py} +116 -100
  6. csv_detective/{detect_fields/other/booleen/__init__.py → formats/booleen.py} +35 -27
  7. csv_detective/formats/code_commune_insee.py +26 -0
  8. csv_detective/{detect_fields/FR/other/code_csp_insee/__init__.py → formats/code_csp_insee.py} +36 -29
  9. csv_detective/{detect_fields/FR/geo/code_departement/__init__.py → formats/code_departement.py} +29 -15
  10. csv_detective/formats/code_fantoir.py +21 -0
  11. csv_detective/{detect_fields/FR/other/code_import/__init__.py → formats/code_import.py} +17 -9
  12. csv_detective/formats/code_postal.py +25 -0
  13. csv_detective/formats/code_region.py +22 -0
  14. csv_detective/formats/code_rna.py +29 -0
  15. csv_detective/formats/code_waldec.py +17 -0
  16. csv_detective/{detect_fields/FR/geo/commune/__init__.py → formats/commune.py} +27 -16
  17. csv_detective/{detect_fields/FR/other/csp_insee/__init__.py → formats/csp_insee.py} +31 -19
  18. csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
  19. csv_detective/{detect_fields/temp/date/__init__.py → formats/date.py} +99 -62
  20. csv_detective/formats/date_fr.py +22 -0
  21. csv_detective/{detect_fields/temp/datetime_aware/__init__.py → formats/datetime_aware.py} +18 -7
  22. csv_detective/{detect_fields/temp/datetime_naive/__init__.py → formats/datetime_naive.py} +21 -2
  23. csv_detective/{detect_fields/temp/datetime_rfc822/__init__.py → formats/datetime_rfc822.py} +24 -18
  24. csv_detective/formats/departement.py +37 -0
  25. csv_detective/formats/email.py +28 -0
  26. csv_detective/{detect_fields/other/float/__init__.py → formats/float.py} +29 -21
  27. csv_detective/formats/geojson.py +36 -0
  28. csv_detective/{detect_fields/FR/other/insee_ape700/__init__.py → formats/insee_ape700.py} +31 -19
  29. csv_detective/{detect_fields/FR/geo/insee_canton/__init__.py → formats/insee_canton.py} +28 -15
  30. csv_detective/{detect_fields/other/int/__init__.py → formats/int.py} +23 -16
  31. csv_detective/formats/iso_country_code_alpha2.py +30 -0
  32. csv_detective/formats/iso_country_code_alpha3.py +30 -0
  33. csv_detective/formats/iso_country_code_numeric.py +31 -0
  34. csv_detective/{detect_fields/FR/temp/jour_de_la_semaine/__init__.py → formats/jour_de_la_semaine.py} +41 -25
  35. csv_detective/{detect_fields/other/json/__init__.py → formats/json.py} +20 -14
  36. csv_detective/formats/latitude_l93.py +48 -0
  37. csv_detective/formats/latitude_wgs.py +42 -0
  38. csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
  39. csv_detective/formats/latlon_wgs.py +53 -0
  40. csv_detective/formats/longitude_l93.py +39 -0
  41. csv_detective/formats/longitude_wgs.py +32 -0
  42. csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
  43. csv_detective/formats/lonlat_wgs.py +36 -0
  44. csv_detective/{detect_fields/FR/temp/mois_de_annee/__init__.py → formats/mois_de_lannee.py} +48 -39
  45. csv_detective/formats/money.py +18 -0
  46. csv_detective/formats/mongo_object_id.py +14 -0
  47. csv_detective/formats/pays.py +35 -0
  48. csv_detective/formats/percent.py +16 -0
  49. csv_detective/{detect_fields/FR/geo/region/__init__.py → formats/region.py} +70 -50
  50. csv_detective/formats/sexe.py +17 -0
  51. csv_detective/{detect_fields/FR/other/siren/__init__.py → formats/siren.py} +37 -20
  52. csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -31
  53. csv_detective/formats/tel_fr.py +36 -0
  54. csv_detective/formats/uai.py +36 -0
  55. csv_detective/formats/url.py +45 -0
  56. csv_detective/formats/username.py +14 -0
  57. csv_detective/formats/uuid.py +16 -0
  58. csv_detective/formats/year.py +28 -0
  59. csv_detective/output/__init__.py +3 -4
  60. csv_detective/output/dataframe.py +3 -3
  61. csv_detective/output/profile.py +2 -3
  62. csv_detective/output/schema.py +2 -2
  63. csv_detective/parsing/columns.py +35 -50
  64. csv_detective/parsing/csv.py +2 -2
  65. csv_detective/parsing/load.py +4 -5
  66. csv_detective/validate.py +9 -4
  67. {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2348.dist-info}/METADATA +6 -5
  68. csv_detective-0.9.3.dev2348.dist-info/RECORD +102 -0
  69. tests/test_fields.py +39 -364
  70. tests/test_file.py +1 -1
  71. tests/test_labels.py +5 -3
  72. tests/test_structure.py +40 -36
  73. csv_detective/detect_fields/FR/__init__.py +0 -0
  74. csv_detective/detect_fields/FR/geo/__init__.py +0 -0
  75. csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -9
  76. csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -9
  77. csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -9
  78. csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -10
  79. csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -16
  80. csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -19
  81. csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
  82. csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -19
  83. csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
  84. csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -16
  85. csv_detective/detect_fields/FR/other/__init__.py +0 -0
  86. csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
  87. csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -9
  88. csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -9
  89. csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
  90. csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -11
  91. csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -17
  92. csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
  93. csv_detective/detect_fields/FR/temp/__init__.py +0 -0
  94. csv_detective/detect_fields/__init__.py +0 -112
  95. csv_detective/detect_fields/geo/__init__.py +0 -0
  96. csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
  97. csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
  98. csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
  99. csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -18
  100. csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
  101. csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -16
  102. csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
  103. csv_detective/detect_fields/geo/lonlat_wgs/__init__.py +0 -16
  104. csv_detective/detect_fields/other/__init__.py +0 -0
  105. csv_detective/detect_fields/other/email/__init__.py +0 -10
  106. csv_detective/detect_fields/other/money/__init__.py +0 -11
  107. csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
  108. csv_detective/detect_fields/other/percent/__init__.py +0 -9
  109. csv_detective/detect_fields/other/twitter/__init__.py +0 -8
  110. csv_detective/detect_fields/other/url/__init__.py +0 -14
  111. csv_detective/detect_fields/other/uuid/__init__.py +0 -10
  112. csv_detective/detect_fields/temp/__init__.py +0 -0
  113. csv_detective/detect_fields/temp/year/__init__.py +0 -10
  114. csv_detective/detect_labels/FR/__init__.py +0 -0
  115. csv_detective/detect_labels/FR/geo/__init__.py +0 -0
  116. csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -15
  117. csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -17
  118. csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -15
  119. csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -12
  120. csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -16
  121. csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -14
  122. csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -12
  123. csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -22
  124. csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -13
  125. csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -30
  126. csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -30
  127. csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -21
  128. csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -21
  129. csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -20
  130. csv_detective/detect_labels/FR/geo/region/__init__.py +0 -20
  131. csv_detective/detect_labels/FR/other/__init__.py +0 -0
  132. csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -8
  133. csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -13
  134. csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -8
  135. csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -13
  136. csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -9
  137. csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -15
  138. csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -8
  139. csv_detective/detect_labels/FR/other/siren/__init__.py +0 -17
  140. csv_detective/detect_labels/FR/other/siret/__init__.py +0 -16
  141. csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -20
  142. csv_detective/detect_labels/FR/other/uai/__init__.py +0 -25
  143. csv_detective/detect_labels/FR/temp/__init__.py +0 -0
  144. csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -16
  145. csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -8
  146. csv_detective/detect_labels/__init__.py +0 -94
  147. csv_detective/detect_labels/geo/__init__.py +0 -0
  148. csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -16
  149. csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -16
  150. csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -16
  151. csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -17
  152. csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -30
  153. csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -39
  154. csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -21
  155. csv_detective/detect_labels/geo/lonlat_wgs/__init__.py +0 -23
  156. csv_detective/detect_labels/other/__init__.py +0 -0
  157. csv_detective/detect_labels/other/booleen/__init__.py +0 -8
  158. csv_detective/detect_labels/other/email/__init__.py +0 -20
  159. csv_detective/detect_labels/other/float/__init__.py +0 -8
  160. csv_detective/detect_labels/other/int/__init__.py +0 -8
  161. csv_detective/detect_labels/other/money/__init__.py +0 -8
  162. csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -8
  163. csv_detective/detect_labels/other/twitter/__init__.py +0 -8
  164. csv_detective/detect_labels/other/url/__init__.py +0 -23
  165. csv_detective/detect_labels/other/uuid/__init__.py +0 -8
  166. csv_detective/detect_labels/temp/__init__.py +0 -0
  167. csv_detective/detect_labels/temp/date/__init__.py +0 -28
  168. csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -19
  169. csv_detective/detect_labels/temp/year/__init__.py +0 -19
  170. csv_detective/load_tests.py +0 -59
  171. csv_detective-0.9.3.dev2258.dist-info/RECORD +0 -166
  172. /csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
  173. /csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
  174. /csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
  175. /csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0
  176. {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2348.dist-info}/WHEEL +0 -0
  177. {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2348.dist-info}/entry_points.txt +0 -0
  178. {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2348.dist-info}/licenses/LICENSE +0 -0
  179. {csv_detective-0.9.3.dev2258.dist-info → csv_detective-0.9.3.dev2348.dist-info}/top_level.txt +0 -0
@@ -1,25 +0,0 @@
1
- from csv_detective.parsing.text import header_score
2
-
3
- PROPORTION = 0.5
4
-
5
-
6
- def _is(header: str) -> float:
7
- words_combinations_list = [
8
- "uai",
9
- "code etablissement",
10
- "code uai",
11
- "uai - identifiant",
12
- "numero uai",
13
- "rne",
14
- "numero de l'etablissement",
15
- "code rne",
16
- "codeetab",
17
- "code uai de l'etablissement",
18
- "ref uai",
19
- "cd rne",
20
- "numerouai",
21
- "numero d etablissement",
22
- "code etablissement",
23
- "numero etablissement",
24
- ]
25
- return header_score(header, words_combinations_list)
File without changes
@@ -1,16 +0,0 @@
1
- from csv_detective.parsing.text import header_score
2
-
3
- PROPORTION = 0.5
4
-
5
-
6
- def _is(header: str) -> float:
7
- words_combinations_list = [
8
- "jour semaine",
9
- "type jour",
10
- "jour de la semaine",
11
- "saufjour",
12
- "nomjour",
13
- "jour",
14
- "jour de fermeture",
15
- ]
16
- return header_score(header, words_combinations_list)
@@ -1,8 +0,0 @@
1
- from csv_detective.parsing.text import header_score
2
-
3
- PROPORTION = 0.5
4
-
5
-
6
- def _is(header: str) -> float:
7
- words_combinations_list = ["mois de annee", "mois", "month"]
8
- return header_score(header, words_combinations_list)
@@ -1,94 +0,0 @@
1
- from .FR.geo import (
2
- adresse,
3
- code_commune_insee,
4
- code_departement,
5
- code_fantoir,
6
- code_postal,
7
- code_region,
8
- commune,
9
- departement,
10
- insee_canton,
11
- latitude_l93,
12
- latitude_wgs_fr_metropole,
13
- longitude_l93,
14
- longitude_wgs_fr_metropole,
15
- pays,
16
- region,
17
- )
18
- from .FR.other import (
19
- code_csp_insee,
20
- code_rna,
21
- code_waldec,
22
- csp_insee,
23
- date_fr,
24
- insee_ape700,
25
- sexe,
26
- siren,
27
- siret,
28
- tel_fr,
29
- uai,
30
- )
31
- from .FR.temp import jour_de_la_semaine, mois_de_annee
32
- from .geo import (
33
- iso_country_code_alpha2,
34
- iso_country_code_alpha3,
35
- iso_country_code_numeric,
36
- json_geojson,
37
- latitude_wgs,
38
- latlon_wgs,
39
- longitude_wgs,
40
- lonlat_wgs,
41
- )
42
- from .other import booleen, email, float, int, money, mongo_object_id, twitter, url, uuid
43
- from .temp import date, datetime_rfc822, year
44
-
45
- __all__ = [
46
- "adresse",
47
- "code_commune_insee",
48
- "code_departement",
49
- "code_fantoir",
50
- "code_postal",
51
- "code_region",
52
- "commune",
53
- "departement",
54
- "insee_canton",
55
- "latitude_l93",
56
- "latitude_wgs_fr_metropole",
57
- "longitude_l93",
58
- "longitude_wgs_fr_metropole",
59
- "pays",
60
- "region",
61
- "code_csp_insee",
62
- "code_rna",
63
- "code_waldec",
64
- "csp_insee",
65
- "date_fr",
66
- "insee_ape700",
67
- "sexe",
68
- "siren",
69
- "siret",
70
- "tel_fr",
71
- "uai",
72
- "iso_country_code_alpha2",
73
- "iso_country_code_alpha3",
74
- "iso_country_code_numeric",
75
- "json_geojson",
76
- "latitude_wgs",
77
- "latlon_wgs",
78
- "longitude_wgs",
79
- "lonlat_wgs",
80
- "jour_de_la_semaine",
81
- "mois_de_annee",
82
- "booleen",
83
- "email",
84
- "float",
85
- "int",
86
- "money",
87
- "mongo_object_id",
88
- "twitter",
89
- "url",
90
- "uuid",
91
- "date",
92
- "datetime_rfc822",
93
- "year",
94
- ]
File without changes
@@ -1,16 +0,0 @@
1
- from csv_detective.parsing.text import header_score
2
-
3
- PROPORTION = 0.5
4
-
5
-
6
- def _is(header: str) -> float:
7
- words_combinations_list = [
8
- "iso country code",
9
- "code pays",
10
- "pays",
11
- "country",
12
- "nation",
13
- "pays code",
14
- "code pays (iso)",
15
- ]
16
- return header_score(header, words_combinations_list)
@@ -1,16 +0,0 @@
1
- from csv_detective.parsing.text import header_score
2
-
3
- PROPORTION = 0.5
4
-
5
-
6
- def _is(header: str) -> float:
7
- words_combinations_list = [
8
- "iso country code",
9
- "code pays",
10
- "pays",
11
- "country",
12
- "nation",
13
- "pays code",
14
- "code pays (iso)",
15
- ]
16
- return header_score(header, words_combinations_list)
@@ -1,16 +0,0 @@
1
- from csv_detective.parsing.text import header_score
2
-
3
- PROPORTION = 0.5
4
-
5
-
6
- def _is(header: str) -> float:
7
- words_combinations_list = [
8
- "iso country code",
9
- "code pays",
10
- "pays",
11
- "country",
12
- "nation",
13
- "pays code",
14
- "code pays (iso)",
15
- ]
16
- return header_score(header, words_combinations_list)
@@ -1,17 +0,0 @@
1
- from csv_detective.parsing.text import header_score
2
-
3
- PROPORTION = 0.5
4
-
5
-
6
- def _is(header: str) -> float:
7
- words_combinations_list = [
8
- "json geojson",
9
- "json",
10
- "geojson",
11
- "geo shape",
12
- "geom",
13
- "geometry",
14
- "geo shape",
15
- "geoshape",
16
- ]
17
- return header_score(header, words_combinations_list)
@@ -1,30 +0,0 @@
1
- from csv_detective.parsing.text import header_score
2
-
3
- PROPORTION = 0.5
4
-
5
-
6
- def _is(header: str) -> float:
7
- words_combinations_list = [
8
- "latitude",
9
- "lat",
10
- "y",
11
- "yf",
12
- "yd",
13
- "coordonnee y",
14
- "coord y",
15
- "ycoord",
16
- "geocodage y gps",
17
- "location latitude",
18
- "ylatitude",
19
- "ylat",
20
- "latitude (y)",
21
- "latitudeorg",
22
- "coordinates.latitude",
23
- "googlemap latitude",
24
- "latitudelieu",
25
- "latitude googlemap",
26
- "latitude wgs84",
27
- "y wgs84",
28
- "latitude (wgs84)",
29
- ]
30
- return header_score(header, words_combinations_list)
@@ -1,39 +0,0 @@
1
- from csv_detective.parsing.text import header_score
2
-
3
- PROPORTION = 0.5
4
-
5
- COMMON_COORDS_LABELS = [
6
- "ban",
7
- "coordinates",
8
- "coordonnees",
9
- "coordonnees insee",
10
- "geo",
11
- "geopoint",
12
- "geoloc",
13
- "geolocalisation",
14
- "geom",
15
- "geometry",
16
- "gps",
17
- "localisation",
18
- "point",
19
- "position",
20
- "wgs84",
21
- ]
22
-
23
- specific = [
24
- "latlon",
25
- "lat lon",
26
- "x y",
27
- "xy",
28
- ]
29
-
30
- # we aim wide to catch exact matches if possible for the highest possible score
31
- words = (
32
- COMMON_COORDS_LABELS
33
- + specific
34
- + [w + sep + suf for suf in specific for w in COMMON_COORDS_LABELS for sep in ["", " "]]
35
- )
36
-
37
-
38
- def _is(header: str) -> float:
39
- return header_score(header, words)
@@ -1,21 +0,0 @@
1
- from csv_detective.parsing.text import header_score
2
-
3
- PROPORTION = 0.5
4
-
5
-
6
- def _is(header: str) -> float:
7
- # Does not detect CRS
8
- words_combinations_list = [
9
- "longitude",
10
- "lon",
11
- "long",
12
- "geocodage x gps",
13
- "location longitude",
14
- "xlongitude",
15
- "lng",
16
- "xlong",
17
- "x",
18
- "xf",
19
- "xd",
20
- ]
21
- return header_score(header, words_combinations_list)
@@ -1,23 +0,0 @@
1
- from csv_detective.parsing.text import header_score
2
-
3
- from ..latlon_wgs import COMMON_COORDS_LABELS
4
-
5
- PROPORTION = 0.5
6
-
7
- specific = [
8
- "lonlat",
9
- "lon lat",
10
- "y x",
11
- "yx",
12
- ]
13
-
14
- # we aim wide to catch exact matches if possible for the highest possible score
15
- words = (
16
- COMMON_COORDS_LABELS
17
- + specific
18
- + [w + sep + suf for suf in specific for w in COMMON_COORDS_LABELS for sep in ["", " "]]
19
- )
20
-
21
-
22
- def _is(header: str) -> float:
23
- return header_score(header, words)
File without changes
@@ -1,8 +0,0 @@
1
- from csv_detective.parsing.text import header_score
2
-
3
- PROPORTION = 0.5
4
-
5
-
6
- def _is(header: str) -> float:
7
- words_combinations_list = ["is ", "has ", "est "]
8
- return header_score(header, words_combinations_list)
@@ -1,20 +0,0 @@
1
- from csv_detective.parsing.text import header_score
2
-
3
- PROPORTION = 0.5
4
-
5
-
6
- def _is(header: str) -> float:
7
- words_combinations_list = [
8
- "email",
9
- "mail",
10
- "courriel",
11
- "contact",
12
- "mel",
13
- "lieucourriel",
14
- "coordinates.emailcontact",
15
- "e mail",
16
- "mo mail",
17
- "adresse mail",
18
- "adresse email",
19
- ]
20
- return header_score(header, words_combinations_list)
@@ -1,8 +0,0 @@
1
- from csv_detective.parsing.text import header_score
2
-
3
- PROPORTION = 0.5
4
-
5
-
6
- def _is(header: str) -> float:
7
- words_combinations_list = ["part", "ratio", "taux"]
8
- return header_score(header, words_combinations_list)
@@ -1,8 +0,0 @@
1
- from csv_detective.parsing.text import header_score
2
-
3
- PROPORTION = 0.5
4
-
5
-
6
- def _is(header: str) -> float:
7
- words_combinations_list = ["nb", "nombre", "nbre"]
8
- return header_score(header, words_combinations_list)
@@ -1,8 +0,0 @@
1
- from csv_detective.parsing.text import header_score
2
-
3
- PROPORTION = 0.5
4
-
5
-
6
- def _is(header: str) -> float:
7
- words_combinations_list = ["budget", "salaire", "euro", "euros", "prêt", "montant"]
8
- return header_score(header, words_combinations_list)
@@ -1,8 +0,0 @@
1
- from csv_detective.parsing.text import header_score
2
-
3
- PROPORTION = 0.5
4
-
5
-
6
- def _is(header: str) -> float:
7
- words_combinations_list = ["id", "objectid"]
8
- return header_score(header, words_combinations_list)
@@ -1,8 +0,0 @@
1
- from csv_detective.parsing.text import header_score
2
-
3
- PROPORTION = 0.5
4
-
5
-
6
- def _is(header: str) -> float:
7
- words_combinations_list = ["twitter", "twitter account", "twitter username"]
8
- return header_score(header, words_combinations_list)
@@ -1,23 +0,0 @@
1
- from csv_detective.parsing.text import header_score
2
-
3
- PROPORTION = 0.5
4
-
5
-
6
- def _is(header: str) -> float:
7
- words_combinations_list = [
8
- "url",
9
- "url source",
10
- "site web",
11
- "source url",
12
- "site internet",
13
- "remote url",
14
- "web",
15
- "site",
16
- "lien",
17
- "site data",
18
- "lien url",
19
- "lien vers le fichier",
20
- "sitweb",
21
- "interneturl",
22
- ]
23
- return header_score(header, words_combinations_list)
@@ -1,8 +0,0 @@
1
- from csv_detective.parsing.text import header_score
2
-
3
- PROPORTION = 0.5
4
-
5
-
6
- def _is(header: str) -> float:
7
- words_combinations_list = ["id", "uuid", "guid"]
8
- return header_score(header, words_combinations_list)
File without changes
@@ -1,28 +0,0 @@
1
- from csv_detective.parsing.text import header_score
2
-
3
- PROPORTION = 0.5
4
-
5
-
6
- def _is(header: str) -> float:
7
- words_combinations_list = [
8
- "date",
9
- "jour",
10
- "date de mise a jour",
11
- "sns date",
12
- "date maj",
13
- "rem date",
14
- "periode",
15
- "date de publication",
16
- "dpc",
17
- "extract date",
18
- "date immatriculation",
19
- "date jeu donnees",
20
- "datemaj",
21
- "dateouv",
22
- "date der maj",
23
- "dmaj",
24
- "jour",
25
- "yyyymmdd",
26
- "aaaammjj",
27
- ]
28
- return header_score(header, words_combinations_list)
@@ -1,19 +0,0 @@
1
- from csv_detective.parsing.text import header_score
2
-
3
- PROPORTION = 0.5
4
-
5
-
6
- def _is(header: str) -> float:
7
- words_combinations_list = [
8
- "datetime",
9
- "timestamp",
10
- "osm_timestamp",
11
- "date",
12
- "created at",
13
- "last update",
14
- "date maj",
15
- "createdat",
16
- "date naissance",
17
- "date donnees",
18
- ] # Almost same as IS0, no example in data
19
- return header_score(header, words_combinations_list)
@@ -1,19 +0,0 @@
1
- from csv_detective.parsing.text import header_score
2
-
3
- PROPORTION = 0.5
4
-
5
-
6
- def _is(header: str) -> float:
7
- words_combinations_list = [
8
- "year",
9
- "annee",
10
- "annee depot",
11
- "an nais",
12
- "exercice",
13
- "data year",
14
- "annee de publication",
15
- "exercice comptable",
16
- "annee de naissance",
17
- "annee ouverture",
18
- ]
19
- return header_score(header, words_combinations_list)
@@ -1,59 +0,0 @@
1
- import os
2
-
3
- from csv_detective import detect_fields, detect_labels # noqa
4
-
5
-
6
- def get_all_packages(detect_type) -> list:
7
- root_dir = os.path.dirname(os.path.abspath(__file__)) + "/" + detect_type
8
- modules = []
9
- for dirpath, _, filenames in os.walk(root_dir):
10
- for filename in filenames:
11
- file = os.path.join(dirpath, filename).replace(root_dir, "")
12
- if file.endswith("__init__.py"):
13
- module = file.replace("__init__.py", "").replace("/", ".").replace("\\", ".")[:-1]
14
- if module:
15
- modules.append(detect_type + module)
16
- return modules
17
-
18
-
19
- def return_all_tests(
20
- user_input_tests: str | list,
21
- detect_type: str,
22
- ) -> dict[str, dict]:
23
- """
24
- returns all tests that have a method _is and are listed in the user_input_tests
25
- the function can select a sub_package from csv_detective
26
- user_input_tests may look like this:
27
- - "ALL": all possible tests are made
28
- - "FR.other.siren" (or any other path-like string to one of the tests, or a group of tests, like "FR.geo"):
29
- this specifc (group of) test(s) only
30
- - ["FR.temp.mois_de_annee", "geo", ...]: only the specified tests will be made ; you may also skip
31
- specific (groups of) tests by add "-" at the start (e.g "-temp.date")
32
- """
33
- assert detect_type in ["detect_fields", "detect_labels"]
34
- all_packages = get_all_packages(detect_type=detect_type)
35
-
36
- if isinstance(user_input_tests, str):
37
- user_input_tests = [user_input_tests]
38
- if "ALL" in user_input_tests or all(x[0] == "-" for x in user_input_tests):
39
- tests_to_do = [detect_type]
40
- else:
41
- tests_to_do = [f"{detect_type}.{x}" for x in user_input_tests if x[0] != "-"]
42
- tests_skipped = [f"{detect_type}.{x[1:]}" for x in user_input_tests if x[0] == "-"]
43
- # removing specified (groups of) tests
44
- all_tests = [
45
- # this is why we need to import detect_fields/labels
46
- eval(x)
47
- for x in all_packages
48
- if any([y == x[: len(y)] for y in tests_to_do])
49
- and all([y != x[: len(y)] for y in tests_skipped])
50
- ]
51
- return {
52
- test.__name__.split(".")[-1]: {
53
- "func": test._is,
54
- "prop": test.PROPORTION,
55
- "module": test,
56
- }
57
- for test in all_tests
58
- if "_is" in dir(test)
59
- }