csv-detective 0.6.7__py3-none-any.whl → 0.9.3.dev2438__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228)
  1. csv_detective/__init__.py +7 -1
  2. csv_detective/cli.py +33 -21
  3. csv_detective/{detect_fields/FR → detection}/__init__.py +0 -0
  4. csv_detective/detection/columns.py +89 -0
  5. csv_detective/detection/encoding.py +29 -0
  6. csv_detective/detection/engine.py +46 -0
  7. csv_detective/detection/formats.py +156 -0
  8. csv_detective/detection/headers.py +28 -0
  9. csv_detective/detection/rows.py +18 -0
  10. csv_detective/detection/separator.py +44 -0
  11. csv_detective/detection/variables.py +97 -0
  12. csv_detective/explore_csv.py +151 -377
  13. csv_detective/format.py +67 -0
  14. csv_detective/formats/__init__.py +9 -0
  15. csv_detective/formats/adresse.py +116 -0
  16. csv_detective/formats/binary.py +26 -0
  17. csv_detective/formats/booleen.py +35 -0
  18. csv_detective/formats/code_commune_insee.py +26 -0
  19. csv_detective/formats/code_csp_insee.py +36 -0
  20. csv_detective/formats/code_departement.py +29 -0
  21. csv_detective/formats/code_fantoir.py +21 -0
  22. csv_detective/formats/code_import.py +17 -0
  23. csv_detective/formats/code_postal.py +25 -0
  24. csv_detective/formats/code_region.py +22 -0
  25. csv_detective/formats/code_rna.py +29 -0
  26. csv_detective/formats/code_waldec.py +17 -0
  27. csv_detective/formats/commune.py +27 -0
  28. csv_detective/formats/csp_insee.py +31 -0
  29. csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
  30. csv_detective/formats/date.py +99 -0
  31. csv_detective/formats/date_fr.py +22 -0
  32. csv_detective/formats/datetime_aware.py +45 -0
  33. csv_detective/formats/datetime_naive.py +48 -0
  34. csv_detective/formats/datetime_rfc822.py +24 -0
  35. csv_detective/formats/departement.py +37 -0
  36. csv_detective/formats/email.py +28 -0
  37. csv_detective/formats/float.py +29 -0
  38. csv_detective/formats/geojson.py +36 -0
  39. csv_detective/formats/insee_ape700.py +31 -0
  40. csv_detective/formats/insee_canton.py +28 -0
  41. csv_detective/formats/int.py +23 -0
  42. csv_detective/formats/iso_country_code_alpha2.py +30 -0
  43. csv_detective/formats/iso_country_code_alpha3.py +30 -0
  44. csv_detective/formats/iso_country_code_numeric.py +31 -0
  45. csv_detective/formats/jour_de_la_semaine.py +41 -0
  46. csv_detective/formats/json.py +20 -0
  47. csv_detective/formats/latitude_l93.py +48 -0
  48. csv_detective/formats/latitude_wgs.py +42 -0
  49. csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
  50. csv_detective/formats/latlon_wgs.py +53 -0
  51. csv_detective/formats/longitude_l93.py +39 -0
  52. csv_detective/formats/longitude_wgs.py +32 -0
  53. csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
  54. csv_detective/formats/lonlat_wgs.py +36 -0
  55. csv_detective/formats/mois_de_lannee.py +48 -0
  56. csv_detective/formats/money.py +18 -0
  57. csv_detective/formats/mongo_object_id.py +14 -0
  58. csv_detective/formats/pays.py +35 -0
  59. csv_detective/formats/percent.py +16 -0
  60. csv_detective/formats/region.py +70 -0
  61. csv_detective/formats/sexe.py +17 -0
  62. csv_detective/formats/siren.py +37 -0
  63. csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -29
  64. csv_detective/formats/tel_fr.py +36 -0
  65. csv_detective/formats/uai.py +36 -0
  66. csv_detective/formats/url.py +46 -0
  67. csv_detective/formats/username.py +14 -0
  68. csv_detective/formats/uuid.py +16 -0
  69. csv_detective/formats/year.py +28 -0
  70. csv_detective/output/__init__.py +65 -0
  71. csv_detective/output/dataframe.py +96 -0
  72. csv_detective/output/example.py +250 -0
  73. csv_detective/output/profile.py +119 -0
  74. csv_detective/{schema_generation.py → output/schema.py} +268 -343
  75. csv_detective/output/utils.py +74 -0
  76. csv_detective/{detect_fields/FR/geo → parsing}/__init__.py +0 -0
  77. csv_detective/parsing/columns.py +235 -0
  78. csv_detective/parsing/compression.py +11 -0
  79. csv_detective/parsing/csv.py +56 -0
  80. csv_detective/parsing/excel.py +167 -0
  81. csv_detective/parsing/load.py +111 -0
  82. csv_detective/parsing/text.py +56 -0
  83. csv_detective/utils.py +23 -196
  84. csv_detective/validate.py +138 -0
  85. csv_detective-0.9.3.dev2438.dist-info/METADATA +267 -0
  86. csv_detective-0.9.3.dev2438.dist-info/RECORD +92 -0
  87. csv_detective-0.9.3.dev2438.dist-info/WHEEL +4 -0
  88. {csv_detective-0.6.7.dist-info → csv_detective-0.9.3.dev2438.dist-info}/entry_points.txt +1 -0
  89. csv_detective/all_packages.txt +0 -104
  90. csv_detective/detect_fields/FR/geo/adresse/__init__.py +0 -100
  91. csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -24
  92. csv_detective/detect_fields/FR/geo/code_commune_insee/code_commune_insee.txt +0 -37600
  93. csv_detective/detect_fields/FR/geo/code_departement/__init__.py +0 -11
  94. csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -15
  95. csv_detective/detect_fields/FR/geo/code_fantoir/code_fantoir.txt +0 -26122
  96. csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -19
  97. csv_detective/detect_fields/FR/geo/code_postal/code_postal.txt +0 -36822
  98. csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -27
  99. csv_detective/detect_fields/FR/geo/commune/__init__.py +0 -21
  100. csv_detective/detect_fields/FR/geo/commune/commune.txt +0 -36745
  101. csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -19
  102. csv_detective/detect_fields/FR/geo/departement/departement.txt +0 -101
  103. csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +0 -20
  104. csv_detective/detect_fields/FR/geo/insee_canton/canton2017.txt +0 -2055
  105. csv_detective/detect_fields/FR/geo/insee_canton/cantons.txt +0 -2055
  106. csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -13
  107. csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
  108. csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -13
  109. csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
  110. csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -17
  111. csv_detective/detect_fields/FR/geo/pays/pays.txt +0 -248
  112. csv_detective/detect_fields/FR/geo/region/__init__.py +0 -16
  113. csv_detective/detect_fields/FR/geo/region/region.txt +0 -44
  114. csv_detective/detect_fields/FR/other/__init__.py +0 -0
  115. csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +0 -26
  116. csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
  117. csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -8
  118. csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -12
  119. csv_detective/detect_fields/FR/other/csp_insee/__init__.py +0 -16
  120. csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
  121. csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +0 -16
  122. csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -9
  123. csv_detective/detect_fields/FR/other/siren/__init__.py +0 -18
  124. csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -15
  125. csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
  126. csv_detective/detect_fields/FR/temp/__init__.py +0 -0
  127. csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +0 -23
  128. csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +0 -37
  129. csv_detective/detect_fields/__init__.py +0 -57
  130. csv_detective/detect_fields/geo/__init__.py +0 -0
  131. csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
  132. csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
  133. csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
  134. csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -22
  135. csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
  136. csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -15
  137. csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
  138. csv_detective/detect_fields/other/__init__.py +0 -0
  139. csv_detective/detect_fields/other/booleen/__init__.py +0 -21
  140. csv_detective/detect_fields/other/email/__init__.py +0 -8
  141. csv_detective/detect_fields/other/float/__init__.py +0 -17
  142. csv_detective/detect_fields/other/int/__init__.py +0 -12
  143. csv_detective/detect_fields/other/json/__init__.py +0 -24
  144. csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
  145. csv_detective/detect_fields/other/twitter/__init__.py +0 -8
  146. csv_detective/detect_fields/other/url/__init__.py +0 -11
  147. csv_detective/detect_fields/other/uuid/__init__.py +0 -11
  148. csv_detective/detect_fields/temp/__init__.py +0 -0
  149. csv_detective/detect_fields/temp/date/__init__.py +0 -62
  150. csv_detective/detect_fields/temp/datetime_iso/__init__.py +0 -18
  151. csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +0 -21
  152. csv_detective/detect_fields/temp/year/__init__.py +0 -10
  153. csv_detective/detect_labels/FR/__init__.py +0 -0
  154. csv_detective/detect_labels/FR/geo/__init__.py +0 -0
  155. csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -40
  156. csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -42
  157. csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -33
  158. csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -33
  159. csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -41
  160. csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -33
  161. csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -33
  162. csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -47
  163. csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -33
  164. csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -54
  165. csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -55
  166. csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -44
  167. csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -45
  168. csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -45
  169. csv_detective/detect_labels/FR/geo/region/__init__.py +0 -45
  170. csv_detective/detect_labels/FR/other/__init__.py +0 -0
  171. csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -33
  172. csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -38
  173. csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -33
  174. csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -37
  175. csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -33
  176. csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -40
  177. csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -33
  178. csv_detective/detect_labels/FR/other/siren/__init__.py +0 -41
  179. csv_detective/detect_labels/FR/other/siret/__init__.py +0 -40
  180. csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -45
  181. csv_detective/detect_labels/FR/other/uai/__init__.py +0 -50
  182. csv_detective/detect_labels/FR/temp/__init__.py +0 -0
  183. csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -41
  184. csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -33
  185. csv_detective/detect_labels/__init__.py +0 -43
  186. csv_detective/detect_labels/geo/__init__.py +0 -0
  187. csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -41
  188. csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -41
  189. csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -41
  190. csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -42
  191. csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -55
  192. csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -67
  193. csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -45
  194. csv_detective/detect_labels/other/__init__.py +0 -0
  195. csv_detective/detect_labels/other/booleen/__init__.py +0 -34
  196. csv_detective/detect_labels/other/email/__init__.py +0 -45
  197. csv_detective/detect_labels/other/float/__init__.py +0 -33
  198. csv_detective/detect_labels/other/int/__init__.py +0 -33
  199. csv_detective/detect_labels/other/money/__init__.py +0 -11
  200. csv_detective/detect_labels/other/money/check_col_name.py +0 -8
  201. csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -33
  202. csv_detective/detect_labels/other/twitter/__init__.py +0 -33
  203. csv_detective/detect_labels/other/url/__init__.py +0 -48
  204. csv_detective/detect_labels/other/uuid/__init__.py +0 -33
  205. csv_detective/detect_labels/temp/__init__.py +0 -0
  206. csv_detective/detect_labels/temp/date/__init__.py +0 -51
  207. csv_detective/detect_labels/temp/datetime_iso/__init__.py +0 -45
  208. csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -44
  209. csv_detective/detect_labels/temp/year/__init__.py +0 -44
  210. csv_detective/detection.py +0 -361
  211. csv_detective/process_text.py +0 -39
  212. csv_detective/s3_utils.py +0 -48
  213. csv_detective-0.6.7.data/data/share/csv_detective/CHANGELOG.md +0 -118
  214. csv_detective-0.6.7.data/data/share/csv_detective/LICENSE.AGPL.txt +0 -661
  215. csv_detective-0.6.7.data/data/share/csv_detective/README.md +0 -247
  216. csv_detective-0.6.7.dist-info/LICENSE.AGPL.txt +0 -661
  217. csv_detective-0.6.7.dist-info/METADATA +0 -23
  218. csv_detective-0.6.7.dist-info/RECORD +0 -150
  219. csv_detective-0.6.7.dist-info/WHEEL +0 -5
  220. csv_detective-0.6.7.dist-info/top_level.txt +0 -2
  221. tests/__init__.py +0 -0
  222. tests/test_fields.py +0 -360
  223. tests/test_file.py +0 -116
  224. tests/test_labels.py +0 -7
  225. /csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
  226. /csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
  227. /csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
  228. /csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0
@@ -1,33 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = ['commune', 'ville', 'libelle commune']
14
- processed_header = _process_text(header)
15
-
16
- header_matches_words_combination = float(
17
- any(
18
- [
19
- words_combination == processed_header for words_combination in words_combinations_list
20
- ]
21
- )
22
- )
23
- words_combination_in_header = 0.5 * float(
24
- any(
25
- [
26
- full_word_strictly_inside_string(
27
- words_combination, processed_header
28
- ) for words_combination in words_combinations_list
29
- ]
30
- )
31
- )
32
-
33
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,47 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = [
14
- 'departement',
15
- 'libelle du departement',
16
- 'deplib',
17
- 'nom dept',
18
- 'dept',
19
- 'libdepartement',
20
- 'nom departement',
21
- 'libelle dep',
22
- 'libelle departement',
23
- 'lb departements',
24
- 'dep libusage',
25
- 'lb departement',
26
- 'nom dep'
27
- ]
28
- processed_header = _process_text(header)
29
-
30
- header_matches_words_combination = float(
31
- any(
32
- [
33
- words_combination == processed_header for words_combination in words_combinations_list
34
- ]
35
- )
36
- )
37
- words_combination_in_header = 0.5 * float(
38
- any(
39
- [
40
- full_word_strictly_inside_string(
41
- words_combination, processed_header
42
- ) for words_combination in words_combinations_list
43
- ]
44
- )
45
- )
46
-
47
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,33 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = ['insee canton', 'canton', 'cant', 'nom canton']
14
- processed_header = _process_text(header)
15
-
16
- header_matches_words_combination = float(
17
- any(
18
- [
19
- words_combination == processed_header for words_combination in words_combinations_list
20
- ]
21
- )
22
- )
23
- words_combination_in_header = 0.5 * float(
24
- any(
25
- [
26
- full_word_strictly_inside_string(
27
- words_combination, processed_header
28
- ) for words_combination in words_combinations_list
29
- ]
30
- )
31
- )
32
-
33
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,54 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
- # Does not always detect CRS
13
- words_combinations_list = [
14
- 'latitude',
15
- 'lat',
16
- 'y',
17
- 'yf',
18
- 'yd',
19
- 'y l93',
20
- 'coordonnee y',
21
- 'latitude lb93',
22
- 'coord y',
23
- 'ycoord',
24
- 'geocodage y gps',
25
- 'location latitude',
26
- 'ylatitude',
27
- 'ylat',
28
- 'latitude (y)',
29
- 'latitudeorg',
30
- 'coordinates.latitude',
31
- 'googlemap latitude',
32
- 'latitudelieu',
33
- 'latitude googlemap'
34
- ]
35
- processed_header = _process_text(header)
36
-
37
- header_matches_words_combination = float(
38
- any(
39
- [
40
- words_combination == processed_header for words_combination in words_combinations_list
41
- ]
42
- )
43
- )
44
- words_combination_in_header = 0.5 * float(
45
- any(
46
- [
47
- full_word_strictly_inside_string(
48
- words_combination, processed_header
49
- ) for words_combination in words_combinations_list
50
- ]
51
- )
52
- )
53
-
54
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,55 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = [
14
- 'latitude',
15
- 'lat',
16
- 'y',
17
- 'yf',
18
- 'yd',
19
- 'coordonnee y',
20
- 'coord y',
21
- 'ycoord',
22
- 'geocodage y gps',
23
- 'location latitude',
24
- 'ylatitude',
25
- 'ylat',
26
- 'latitude (y)',
27
- 'latitudeorg',
28
- 'coordinates.latitude',
29
- 'googlemap latitude',
30
- 'latitudelieu',
31
- 'latitude googlemap',
32
- 'latitude wgs84',
33
- 'y wgs84',
34
- 'latitude (wgs84)'
35
- ]
36
- processed_header = _process_text(header)
37
-
38
- header_matches_words_combination = float(
39
- any(
40
- [
41
- words_combination == processed_header for words_combination in words_combinations_list
42
- ]
43
- )
44
- )
45
- words_combination_in_header = 0.5 * float(
46
- any(
47
- [
48
- full_word_strictly_inside_string(
49
- words_combination, processed_header
50
- ) for words_combination in words_combinations_list
51
- ]
52
- )
53
- )
54
-
55
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,44 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
- # Does not detect CRS
13
- words_combinations_list = [
14
- 'longitude',
15
- 'lon',
16
- 'long',
17
- 'geocodage x gps',
18
- 'location longitude',
19
- 'xlongitude',
20
- 'lng',
21
- 'xlong',
22
- 'x',
23
- 'xf',
24
- 'xd']
25
- processed_header = _process_text(header)
26
-
27
- header_matches_words_combination = float(
28
- any(
29
- [
30
- words_combination == processed_header for words_combination in words_combinations_list
31
- ]
32
- )
33
- )
34
- words_combination_in_header = 0.5 * float(
35
- any(
36
- [
37
- full_word_strictly_inside_string(
38
- words_combination, processed_header
39
- ) for words_combination in words_combinations_list
40
- ]
41
- )
42
- )
43
-
44
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,45 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
- # Does not detect CRS
13
- words_combinations_list = [
14
- 'longitude',
15
- 'lon',
16
- 'long',
17
- 'geocodage x gps',
18
- 'location longitude',
19
- 'xlongitude',
20
- 'lng',
21
- 'xlong',
22
- 'x',
23
- 'xf',
24
- 'xd'
25
- ]
26
- processed_header = _process_text(header)
27
-
28
- header_matches_words_combination = float(
29
- any(
30
- [
31
- words_combination == processed_header for words_combination in words_combinations_list
32
- ]
33
- )
34
- )
35
- words_combination_in_header = 0.5 * float(
36
- any(
37
- [
38
- full_word_strictly_inside_string(
39
- words_combination, processed_header
40
- ) for words_combination in words_combinations_list
41
- ]
42
- )
43
- )
44
-
45
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,45 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = [
14
- 'pays',
15
- 'payslieu',
16
- 'paysorg',
17
- 'country',
18
- 'pays lib',
19
- 'lieupays',
20
- 'pays beneficiaire',
21
- 'nom du pays',
22
- 'journey start country',
23
- 'libelle pays',
24
- 'journey end country'
25
- ]
26
- processed_header = _process_text(header)
27
-
28
- header_matches_words_combination = float(
29
- any(
30
- [
31
- words_combination == processed_header for words_combination in words_combinations_list
32
- ]
33
- )
34
- )
35
- words_combination_in_header = 0.5 * float(
36
- any(
37
- [
38
- full_word_strictly_inside_string(
39
- words_combination, processed_header
40
- ) for words_combination in words_combinations_list
41
- ]
42
- )
43
- )
44
-
45
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,45 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = [
14
- 'region',
15
- 'libelle region',
16
- 'nom region',
17
- 'libelle reg',
18
- 'nom reg',
19
- 'reg libusage',
20
- 'nom de la region',
21
- 'regionorg',
22
- 'regionlieu',
23
- 'reg',
24
- 'nom officiel region'
25
- ]
26
- processed_header = _process_text(header)
27
-
28
- header_matches_words_combination = float(
29
- any(
30
- [
31
- words_combination == processed_header for words_combination in words_combinations_list
32
- ]
33
- )
34
- )
35
- words_combination_in_header = 0.5 * float(
36
- any(
37
- [
38
- full_word_strictly_inside_string(
39
- words_combination, processed_header
40
- ) for words_combination in words_combinations_list
41
- ]
42
- )
43
- )
44
-
45
- return max(header_matches_words_combination, words_combination_in_header)
File without changes
@@ -1,33 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = ['code csp insee', 'code csp']
14
- processed_header = _process_text(header)
15
-
16
- header_matches_words_combination = float(
17
- any(
18
- [
19
- words_combination == processed_header for words_combination in words_combinations_list
20
- ]
21
- )
22
- )
23
- words_combination_in_header = 0.5 * float(
24
- any(
25
- [
26
- full_word_strictly_inside_string(
27
- words_combination, processed_header
28
- ) for words_combination in words_combinations_list
29
- ]
30
- )
31
- )
32
-
33
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,38 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = [
14
- 'code rna',
15
- 'rna',
16
- 'n° inscription association',
17
- 'identifiant association'
18
- ]
19
- processed_header = _process_text(header)
20
-
21
- header_matches_words_combination = float(
22
- any(
23
- [
24
- words_combination == processed_header for words_combination in words_combinations_list
25
- ]
26
- )
27
- )
28
- words_combination_in_header = 0.5 * float(
29
- any(
30
- [
31
- full_word_strictly_inside_string(
32
- words_combination, processed_header
33
- ) for words_combination in words_combinations_list
34
- ]
35
- )
36
- )
37
-
38
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,33 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = ['code waldec', 'waldec']
14
- processed_header = _process_text(header)
15
-
16
- header_matches_words_combination = float(
17
- any(
18
- [
19
- words_combination == processed_header for words_combination in words_combinations_list
20
- ]
21
- )
22
- )
23
- words_combination_in_header = 0.5 * float(
24
- any(
25
- [
26
- full_word_strictly_inside_string(
27
- words_combination, processed_header
28
- ) for words_combination in words_combinations_list
29
- ]
30
- )
31
- )
32
-
33
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,37 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
- # To improve? No specific header found in data
13
- words_combinations_list = [
14
- 'csp insee',
15
- 'csp',
16
- 'categorie socioprofessionnelle'
17
- ]
18
- processed_header = _process_text(header)
19
-
20
- header_matches_words_combination = float(
21
- any(
22
- [
23
- words_combination == processed_header for words_combination in words_combinations_list
24
- ]
25
- )
26
- )
27
- words_combination_in_header = 0.5 * float(
28
- any(
29
- [
30
- full_word_strictly_inside_string(
31
- words_combination, processed_header
32
- ) for words_combination in words_combinations_list
33
- ]
34
- )
35
- )
36
-
37
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,33 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
- # To improve: no header specific to 'fr' found in data
13
- words_combinations_list = ['date']
14
- processed_header = _process_text(header)
15
-
16
- header_matches_words_combination = float(
17
- any(
18
- [
19
- words_combination == processed_header for words_combination in words_combinations_list
20
- ]
21
- )
22
- )
23
- words_combination_in_header = 0.5 * float(
24
- any(
25
- [
26
- full_word_strictly_inside_string(
27
- words_combination, processed_header
28
- ) for words_combination in words_combinations_list
29
- ]
30
- )
31
- )
32
-
33
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,40 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = [
14
- 'code ape',
15
- 'code activite (ape)',
16
- 'code naf',
17
- 'code naf organisme designe',
18
- 'code naf organisme designant',
19
- 'base sirene : code ape de l\'etablissement siege'
20
- ]
21
- processed_header = _process_text(header)
22
-
23
- header_matches_words_combination = float(
24
- any(
25
- [
26
- words_combination == processed_header for words_combination in words_combinations_list
27
- ]
28
- )
29
- )
30
- words_combination_in_header = 0.5 * float(
31
- any(
32
- [
33
- full_word_strictly_inside_string(
34
- words_combination, processed_header
35
- ) for words_combination in words_combinations_list
36
- ]
37
- )
38
- )
39
-
40
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,33 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = ['sexe', 'sex', 'civilite', 'genre', 'id sexe']
14
- processed_header = _process_text(header)
15
-
16
- header_matches_words_combination = float(
17
- any(
18
- [
19
- words_combination == processed_header for words_combination in words_combinations_list
20
- ]
21
- )
22
- )
23
- words_combination_in_header = 0.5 * float(
24
- any(
25
- [
26
- full_word_strictly_inside_string(
27
- words_combination, processed_header
28
- ) for words_combination in words_combinations_list
29
- ]
30
- )
31
- )
32
-
33
- return max(header_matches_words_combination, words_combination_in_header)