csv-detective 0.6.7__py3-none-any.whl → 0.9.3.dev2438__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228) hide show
  1. csv_detective/__init__.py +7 -1
  2. csv_detective/cli.py +33 -21
  3. csv_detective/{detect_fields/FR → detection}/__init__.py +0 -0
  4. csv_detective/detection/columns.py +89 -0
  5. csv_detective/detection/encoding.py +29 -0
  6. csv_detective/detection/engine.py +46 -0
  7. csv_detective/detection/formats.py +156 -0
  8. csv_detective/detection/headers.py +28 -0
  9. csv_detective/detection/rows.py +18 -0
  10. csv_detective/detection/separator.py +44 -0
  11. csv_detective/detection/variables.py +97 -0
  12. csv_detective/explore_csv.py +151 -377
  13. csv_detective/format.py +67 -0
  14. csv_detective/formats/__init__.py +9 -0
  15. csv_detective/formats/adresse.py +116 -0
  16. csv_detective/formats/binary.py +26 -0
  17. csv_detective/formats/booleen.py +35 -0
  18. csv_detective/formats/code_commune_insee.py +26 -0
  19. csv_detective/formats/code_csp_insee.py +36 -0
  20. csv_detective/formats/code_departement.py +29 -0
  21. csv_detective/formats/code_fantoir.py +21 -0
  22. csv_detective/formats/code_import.py +17 -0
  23. csv_detective/formats/code_postal.py +25 -0
  24. csv_detective/formats/code_region.py +22 -0
  25. csv_detective/formats/code_rna.py +29 -0
  26. csv_detective/formats/code_waldec.py +17 -0
  27. csv_detective/formats/commune.py +27 -0
  28. csv_detective/formats/csp_insee.py +31 -0
  29. csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
  30. csv_detective/formats/date.py +99 -0
  31. csv_detective/formats/date_fr.py +22 -0
  32. csv_detective/formats/datetime_aware.py +45 -0
  33. csv_detective/formats/datetime_naive.py +48 -0
  34. csv_detective/formats/datetime_rfc822.py +24 -0
  35. csv_detective/formats/departement.py +37 -0
  36. csv_detective/formats/email.py +28 -0
  37. csv_detective/formats/float.py +29 -0
  38. csv_detective/formats/geojson.py +36 -0
  39. csv_detective/formats/insee_ape700.py +31 -0
  40. csv_detective/formats/insee_canton.py +28 -0
  41. csv_detective/formats/int.py +23 -0
  42. csv_detective/formats/iso_country_code_alpha2.py +30 -0
  43. csv_detective/formats/iso_country_code_alpha3.py +30 -0
  44. csv_detective/formats/iso_country_code_numeric.py +31 -0
  45. csv_detective/formats/jour_de_la_semaine.py +41 -0
  46. csv_detective/formats/json.py +20 -0
  47. csv_detective/formats/latitude_l93.py +48 -0
  48. csv_detective/formats/latitude_wgs.py +42 -0
  49. csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
  50. csv_detective/formats/latlon_wgs.py +53 -0
  51. csv_detective/formats/longitude_l93.py +39 -0
  52. csv_detective/formats/longitude_wgs.py +32 -0
  53. csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
  54. csv_detective/formats/lonlat_wgs.py +36 -0
  55. csv_detective/formats/mois_de_lannee.py +48 -0
  56. csv_detective/formats/money.py +18 -0
  57. csv_detective/formats/mongo_object_id.py +14 -0
  58. csv_detective/formats/pays.py +35 -0
  59. csv_detective/formats/percent.py +16 -0
  60. csv_detective/formats/region.py +70 -0
  61. csv_detective/formats/sexe.py +17 -0
  62. csv_detective/formats/siren.py +37 -0
  63. csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -29
  64. csv_detective/formats/tel_fr.py +36 -0
  65. csv_detective/formats/uai.py +36 -0
  66. csv_detective/formats/url.py +46 -0
  67. csv_detective/formats/username.py +14 -0
  68. csv_detective/formats/uuid.py +16 -0
  69. csv_detective/formats/year.py +28 -0
  70. csv_detective/output/__init__.py +65 -0
  71. csv_detective/output/dataframe.py +96 -0
  72. csv_detective/output/example.py +250 -0
  73. csv_detective/output/profile.py +119 -0
  74. csv_detective/{schema_generation.py → output/schema.py} +268 -343
  75. csv_detective/output/utils.py +74 -0
  76. csv_detective/{detect_fields/FR/geo → parsing}/__init__.py +0 -0
  77. csv_detective/parsing/columns.py +235 -0
  78. csv_detective/parsing/compression.py +11 -0
  79. csv_detective/parsing/csv.py +56 -0
  80. csv_detective/parsing/excel.py +167 -0
  81. csv_detective/parsing/load.py +111 -0
  82. csv_detective/parsing/text.py +56 -0
  83. csv_detective/utils.py +23 -196
  84. csv_detective/validate.py +138 -0
  85. csv_detective-0.9.3.dev2438.dist-info/METADATA +267 -0
  86. csv_detective-0.9.3.dev2438.dist-info/RECORD +92 -0
  87. csv_detective-0.9.3.dev2438.dist-info/WHEEL +4 -0
  88. {csv_detective-0.6.7.dist-info → csv_detective-0.9.3.dev2438.dist-info}/entry_points.txt +1 -0
  89. csv_detective/all_packages.txt +0 -104
  90. csv_detective/detect_fields/FR/geo/adresse/__init__.py +0 -100
  91. csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -24
  92. csv_detective/detect_fields/FR/geo/code_commune_insee/code_commune_insee.txt +0 -37600
  93. csv_detective/detect_fields/FR/geo/code_departement/__init__.py +0 -11
  94. csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -15
  95. csv_detective/detect_fields/FR/geo/code_fantoir/code_fantoir.txt +0 -26122
  96. csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -19
  97. csv_detective/detect_fields/FR/geo/code_postal/code_postal.txt +0 -36822
  98. csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -27
  99. csv_detective/detect_fields/FR/geo/commune/__init__.py +0 -21
  100. csv_detective/detect_fields/FR/geo/commune/commune.txt +0 -36745
  101. csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -19
  102. csv_detective/detect_fields/FR/geo/departement/departement.txt +0 -101
  103. csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +0 -20
  104. csv_detective/detect_fields/FR/geo/insee_canton/canton2017.txt +0 -2055
  105. csv_detective/detect_fields/FR/geo/insee_canton/cantons.txt +0 -2055
  106. csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -13
  107. csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
  108. csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -13
  109. csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
  110. csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -17
  111. csv_detective/detect_fields/FR/geo/pays/pays.txt +0 -248
  112. csv_detective/detect_fields/FR/geo/region/__init__.py +0 -16
  113. csv_detective/detect_fields/FR/geo/region/region.txt +0 -44
  114. csv_detective/detect_fields/FR/other/__init__.py +0 -0
  115. csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +0 -26
  116. csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
  117. csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -8
  118. csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -12
  119. csv_detective/detect_fields/FR/other/csp_insee/__init__.py +0 -16
  120. csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
  121. csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +0 -16
  122. csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -9
  123. csv_detective/detect_fields/FR/other/siren/__init__.py +0 -18
  124. csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -15
  125. csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
  126. csv_detective/detect_fields/FR/temp/__init__.py +0 -0
  127. csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +0 -23
  128. csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +0 -37
  129. csv_detective/detect_fields/__init__.py +0 -57
  130. csv_detective/detect_fields/geo/__init__.py +0 -0
  131. csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
  132. csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
  133. csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
  134. csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -22
  135. csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
  136. csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -15
  137. csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
  138. csv_detective/detect_fields/other/__init__.py +0 -0
  139. csv_detective/detect_fields/other/booleen/__init__.py +0 -21
  140. csv_detective/detect_fields/other/email/__init__.py +0 -8
  141. csv_detective/detect_fields/other/float/__init__.py +0 -17
  142. csv_detective/detect_fields/other/int/__init__.py +0 -12
  143. csv_detective/detect_fields/other/json/__init__.py +0 -24
  144. csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
  145. csv_detective/detect_fields/other/twitter/__init__.py +0 -8
  146. csv_detective/detect_fields/other/url/__init__.py +0 -11
  147. csv_detective/detect_fields/other/uuid/__init__.py +0 -11
  148. csv_detective/detect_fields/temp/__init__.py +0 -0
  149. csv_detective/detect_fields/temp/date/__init__.py +0 -62
  150. csv_detective/detect_fields/temp/datetime_iso/__init__.py +0 -18
  151. csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +0 -21
  152. csv_detective/detect_fields/temp/year/__init__.py +0 -10
  153. csv_detective/detect_labels/FR/__init__.py +0 -0
  154. csv_detective/detect_labels/FR/geo/__init__.py +0 -0
  155. csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -40
  156. csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -42
  157. csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -33
  158. csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -33
  159. csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -41
  160. csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -33
  161. csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -33
  162. csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -47
  163. csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -33
  164. csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -54
  165. csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -55
  166. csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -44
  167. csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -45
  168. csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -45
  169. csv_detective/detect_labels/FR/geo/region/__init__.py +0 -45
  170. csv_detective/detect_labels/FR/other/__init__.py +0 -0
  171. csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -33
  172. csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -38
  173. csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -33
  174. csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -37
  175. csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -33
  176. csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -40
  177. csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -33
  178. csv_detective/detect_labels/FR/other/siren/__init__.py +0 -41
  179. csv_detective/detect_labels/FR/other/siret/__init__.py +0 -40
  180. csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -45
  181. csv_detective/detect_labels/FR/other/uai/__init__.py +0 -50
  182. csv_detective/detect_labels/FR/temp/__init__.py +0 -0
  183. csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -41
  184. csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -33
  185. csv_detective/detect_labels/__init__.py +0 -43
  186. csv_detective/detect_labels/geo/__init__.py +0 -0
  187. csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -41
  188. csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -41
  189. csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -41
  190. csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -42
  191. csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -55
  192. csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -67
  193. csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -45
  194. csv_detective/detect_labels/other/__init__.py +0 -0
  195. csv_detective/detect_labels/other/booleen/__init__.py +0 -34
  196. csv_detective/detect_labels/other/email/__init__.py +0 -45
  197. csv_detective/detect_labels/other/float/__init__.py +0 -33
  198. csv_detective/detect_labels/other/int/__init__.py +0 -33
  199. csv_detective/detect_labels/other/money/__init__.py +0 -11
  200. csv_detective/detect_labels/other/money/check_col_name.py +0 -8
  201. csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -33
  202. csv_detective/detect_labels/other/twitter/__init__.py +0 -33
  203. csv_detective/detect_labels/other/url/__init__.py +0 -48
  204. csv_detective/detect_labels/other/uuid/__init__.py +0 -33
  205. csv_detective/detect_labels/temp/__init__.py +0 -0
  206. csv_detective/detect_labels/temp/date/__init__.py +0 -51
  207. csv_detective/detect_labels/temp/datetime_iso/__init__.py +0 -45
  208. csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -44
  209. csv_detective/detect_labels/temp/year/__init__.py +0 -44
  210. csv_detective/detection.py +0 -361
  211. csv_detective/process_text.py +0 -39
  212. csv_detective/s3_utils.py +0 -48
  213. csv_detective-0.6.7.data/data/share/csv_detective/CHANGELOG.md +0 -118
  214. csv_detective-0.6.7.data/data/share/csv_detective/LICENSE.AGPL.txt +0 -661
  215. csv_detective-0.6.7.data/data/share/csv_detective/README.md +0 -247
  216. csv_detective-0.6.7.dist-info/LICENSE.AGPL.txt +0 -661
  217. csv_detective-0.6.7.dist-info/METADATA +0 -23
  218. csv_detective-0.6.7.dist-info/RECORD +0 -150
  219. csv_detective-0.6.7.dist-info/WHEEL +0 -5
  220. csv_detective-0.6.7.dist-info/top_level.txt +0 -2
  221. tests/__init__.py +0 -0
  222. tests/test_fields.py +0 -360
  223. tests/test_file.py +0 -116
  224. tests/test_labels.py +0 -7
  225. /csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
  226. /csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
  227. /csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
  228. /csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0
@@ -1,41 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = [
14
- 'siren',
15
- 'siren organisme designe',
16
- 'siren organisme designant',
17
- 'n° siren',
18
- 'siren organisme',
19
- 'siren titulaire',
20
- 'numero siren'
21
- ]
22
- processed_header = _process_text(header)
23
-
24
- header_matches_words_combination = float(
25
- any(
26
- [
27
- words_combination == processed_header for words_combination in words_combinations_list
28
- ]
29
- )
30
- )
31
- words_combination_in_header = 0.5 * float(
32
- any(
33
- [
34
- full_word_strictly_inside_string(
35
- words_combination, processed_header
36
- ) for words_combination in words_combinations_list
37
- ]
38
- )
39
- )
40
-
41
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,40 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = [
14
- 'siret',
15
- 'siret d',
16
- 'num siret',
17
- 'siretacheteur',
18
- 'n° siret',
19
- 'coll siret'
20
- ]
21
- processed_header = _process_text(header)
22
-
23
- header_matches_words_combination = float(
24
- any(
25
- [
26
- words_combination == processed_header for words_combination in words_combinations_list
27
- ]
28
- )
29
- )
30
- words_combination_in_header = 0.5 * float(
31
- any(
32
- [
33
- full_word_strictly_inside_string(
34
- words_combination, processed_header
35
- ) for words_combination in words_combinations_list
36
- ]
37
- )
38
- )
39
-
40
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,45 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = [
14
- 'telephone',
15
- 'tel',
16
- 'tel1',
17
- 'tel2',
18
- 'phone',
19
- 'num tel',
20
- 'tel mob',
21
- 'telephone sav',
22
- 'telephone1',
23
- 'coordinates.phone',
24
- 'telephone du lieu'
25
- ]
26
- processed_header = _process_text(header)
27
-
28
- header_matches_words_combination = float(
29
- any(
30
- [
31
- words_combination == processed_header for words_combination in words_combinations_list
32
- ]
33
- )
34
- )
35
- words_combination_in_header = 0.5 * float(
36
- any(
37
- [
38
- full_word_strictly_inside_string(
39
- words_combination, processed_header
40
- ) for words_combination in words_combinations_list
41
- ]
42
- )
43
- )
44
-
45
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,50 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = [
14
- 'uai',
15
- 'code etablissement',
16
- 'code uai',
17
- 'uai - identifiant',
18
- 'numero uai',
19
- 'rne',
20
- "numero de l'etablissement",
21
- 'code rne',
22
- 'codeetab',
23
- "code uai de l'etablissement",
24
- 'ref uai',
25
- 'cd rne',
26
- 'numerouai',
27
- 'numero d etablissement',
28
- 'code etablissement',
29
- 'numero etablissement'
30
- ]
31
- processed_header = _process_text(header)
32
-
33
- header_matches_words_combination = float(
34
- any(
35
- [
36
- words_combination == processed_header for words_combination in words_combinations_list
37
- ]
38
- )
39
- )
40
- words_combination_in_header = 0.5 * float(
41
- any(
42
- [
43
- full_word_strictly_inside_string(
44
- words_combination, processed_header
45
- ) for words_combination in words_combinations_list
46
- ]
47
- )
48
- )
49
-
50
- return max(header_matches_words_combination, words_combination_in_header)
File without changes
@@ -1,41 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = [
14
- 'jour semaine',
15
- 'type jour',
16
- 'jour de la semaine',
17
- 'saufjour',
18
- 'nomjour',
19
- 'jour',
20
- 'jour de fermeture'
21
- ]
22
- processed_header = _process_text(header)
23
-
24
- header_matches_words_combination = float(
25
- any(
26
- [
27
- words_combination == processed_header for words_combination in words_combinations_list
28
- ]
29
- )
30
- )
31
- words_combination_in_header = 0.5 * float(
32
- any(
33
- [
34
- full_word_strictly_inside_string(
35
- words_combination, processed_header
36
- ) for words_combination in words_combinations_list
37
- ]
38
- )
39
- )
40
-
41
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,33 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = ['mois de annee', 'mois']
14
- processed_header = _process_text(header)
15
-
16
- header_matches_words_combination = float(
17
- any(
18
- [
19
- words_combination == processed_header for words_combination in words_combinations_list
20
- ]
21
- )
22
- )
23
- words_combination_in_header = 0.5 * float(
24
- any(
25
- [
26
- full_word_strictly_inside_string(
27
- words_combination, processed_header
28
- ) for words_combination in words_combinations_list
29
- ]
30
- )
31
- )
32
-
33
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,43 +0,0 @@
1
- # flake8: noqa
2
- from .FR.geo import (
3
- adresse,
4
- code_commune_insee,
5
- code_departement,
6
- code_fantoir,
7
- code_postal,
8
- code_region,
9
- commune,
10
- departement,
11
- insee_canton,
12
- latitude_l93,
13
- latitude_wgs_fr_metropole,
14
- longitude_l93,
15
- longitude_wgs_fr_metropole,
16
- pays,
17
- region
18
- )
19
- from .FR.other import (
20
- code_csp_insee,
21
- code_rna,
22
- code_waldec,
23
- csp_insee,
24
- date_fr,
25
- insee_ape700,
26
- sexe,
27
- siren,
28
- siret,
29
- tel_fr,
30
- uai
31
- )
32
- from .FR.temp import jour_de_la_semaine, mois_de_annee
33
- from .geo import (
34
- iso_country_code_alpha2,
35
- iso_country_code_alpha3,
36
- iso_country_code_numeric,
37
- json_geojson,
38
- latitude_wgs,
39
- latlon_wgs,
40
- longitude_wgs
41
- )
42
- from .other import booleen, email, float, int, money, mongo_object_id, twitter, url, uuid
43
- from .temp import date, datetime_iso, datetime_rfc822, year
File without changes
@@ -1,41 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = [
14
- 'iso country code',
15
- 'code pays',
16
- 'pays',
17
- 'country',
18
- 'nation',
19
- 'pays code',
20
- 'code pays (iso)'
21
- ]
22
- processed_header = _process_text(header)
23
-
24
- header_matches_words_combination = float(
25
- any(
26
- [
27
- words_combination == processed_header for words_combination in words_combinations_list
28
- ]
29
- )
30
- )
31
- words_combination_in_header = 0.5 * float(
32
- any(
33
- [
34
- full_word_strictly_inside_string(
35
- words_combination, processed_header
36
- ) for words_combination in words_combinations_list
37
- ]
38
- )
39
- )
40
-
41
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,41 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = [
14
- 'iso country code',
15
- 'code pays',
16
- 'pays',
17
- 'country',
18
- 'nation',
19
- 'pays code',
20
- 'code pays (iso)'
21
- ]
22
- processed_header = _process_text(header)
23
-
24
- header_matches_words_combination = float(
25
- any(
26
- [
27
- words_combination == processed_header for words_combination in words_combinations_list
28
- ]
29
- )
30
- )
31
- words_combination_in_header = 0.5 * float(
32
- any(
33
- [
34
- full_word_strictly_inside_string(
35
- words_combination, processed_header
36
- ) for words_combination in words_combinations_list
37
- ]
38
- )
39
- )
40
-
41
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,41 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = [
14
- 'iso country code',
15
- 'code pays',
16
- 'pays',
17
- 'country',
18
- 'nation',
19
- 'pays code',
20
- 'code pays (iso)'
21
- ]
22
- processed_header = _process_text(header)
23
-
24
- header_matches_words_combination = float(
25
- any(
26
- [
27
- words_combination == processed_header for words_combination in words_combinations_list
28
- ]
29
- )
30
- )
31
- words_combination_in_header = 0.5 * float(
32
- any(
33
- [
34
- full_word_strictly_inside_string(
35
- words_combination, processed_header
36
- ) for words_combination in words_combinations_list
37
- ]
38
- )
39
- )
40
-
41
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,42 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = [
14
- 'json geojson',
15
- 'json',
16
- 'geojson',
17
- 'geo shape',
18
- 'geom',
19
- 'geometry',
20
- 'geo shape',
21
- 'geoshape'
22
- ]
23
- processed_header = _process_text(header)
24
-
25
- header_matches_words_combination = float(
26
- any(
27
- [
28
- words_combination == processed_header for words_combination in words_combinations_list
29
- ]
30
- )
31
- )
32
- words_combination_in_header = 0.5 * float(
33
- any(
34
- [
35
- full_word_strictly_inside_string(
36
- words_combination, processed_header
37
- ) for words_combination in words_combinations_list
38
- ]
39
- )
40
- )
41
-
42
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,55 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = [
14
- 'latitude',
15
- 'lat',
16
- 'y',
17
- 'yf',
18
- 'yd',
19
- 'coordonnee y',
20
- 'coord y',
21
- 'ycoord',
22
- 'geocodage y gps',
23
- 'location latitude',
24
- 'ylatitude',
25
- 'ylat',
26
- 'latitude (y)',
27
- 'latitudeorg',
28
- 'coordinates.latitude',
29
- 'googlemap latitude',
30
- 'latitudelieu',
31
- 'latitude googlemap',
32
- 'latitude wgs84',
33
- 'y wgs84',
34
- 'latitude (wgs84)'
35
- ]
36
- processed_header = _process_text(header)
37
-
38
- header_matches_words_combination = float(
39
- any(
40
- [
41
- words_combination == processed_header for words_combination in words_combinations_list
42
- ]
43
- )
44
- )
45
- words_combination_in_header = 0.5 * float(
46
- any(
47
- [
48
- full_word_strictly_inside_string(
49
- words_combination, processed_header
50
- ) for words_combination in words_combinations_list
51
- ]
52
- )
53
- )
54
-
55
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,67 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = [
14
- 'latlon wgs',
15
- 'latlon',
16
- 'geo point',
17
- 'geo point 2d',
18
- 'wgs84',
19
- 'geolocalisation',
20
- 'geo',
21
- 'coordonnees finales',
22
- 'coordonnees',
23
- 'coordonnees ban',
24
- 'xy',
25
- 'geometry x y',
26
- 'coordonnees insee',
27
- 'coordonnees geographiques',
28
- 'position',
29
- 'coordonnes gps',
30
- 'geopoint',
31
- 'geom x y',
32
- 'coord gps',
33
- 'latlong',
34
- 'position geographique',
35
- 'c geo',
36
- 'coordonnes geoloc',
37
- 'lat lon',
38
- 'code geo',
39
- 'geo localisation',
40
- 'coordonnes geo',
41
- 'geo cp',
42
- 'x y',
43
- 'geo coordinates',
44
- 'point geo',
45
- 'point geo insee',
46
- 'coordonnees geoloc',
47
- 'coordonnees xy'
48
- ]
49
- processed_header = _process_text(header)
50
-
51
- header_matches_words_combination = float(
52
- any(
53
- [
54
- words_combination == processed_header for words_combination in words_combinations_list
55
- ]
56
- )
57
- )
58
- words_combination_in_header = 0.5 * float(
59
- any(
60
- [
61
- full_word_strictly_inside_string(
62
- words_combination, processed_header
63
- ) for words_combination in words_combinations_list
64
- ]
65
- )
66
- )
67
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,45 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
- # Does not detect CRS
13
- words_combinations_list = [
14
- 'longitude',
15
- 'lon',
16
- 'long',
17
- 'geocodage x gps',
18
- 'location longitude',
19
- 'xlongitude',
20
- 'lng',
21
- 'xlong',
22
- 'x',
23
- 'xf',
24
- 'xd'
25
- ]
26
- processed_header = _process_text(header)
27
-
28
- header_matches_words_combination = float(
29
- any(
30
- [
31
- words_combination == processed_header for words_combination in words_combinations_list
32
- ]
33
- )
34
- )
35
- words_combination_in_header = 0.5 * float(
36
- any(
37
- [
38
- full_word_strictly_inside_string(
39
- words_combination, processed_header
40
- ) for words_combination in words_combinations_list
41
- ]
42
- )
43
- )
44
-
45
- return max(header_matches_words_combination, words_combination_in_header)
File without changes
@@ -1,34 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- # Not relevant to make it match with specific words (find other rules)
14
- words_combinations_list = []
15
- processed_header = _process_text(header)
16
-
17
- header_matches_words_combination = float(
18
- any(
19
- [
20
- words_combination == processed_header for words_combination in words_combinations_list
21
- ]
22
- )
23
- )
24
- words_combination_in_header = 0.5 * float(
25
- any(
26
- [
27
- full_word_strictly_inside_string(
28
- words_combination, processed_header
29
- ) for words_combination in words_combinations_list
30
- ]
31
- )
32
- )
33
-
34
- return max(header_matches_words_combination, words_combination_in_header)