csv-detective 0.6.7__py3-none-any.whl → 0.9.3.dev2438__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228) hide show
  1. csv_detective/__init__.py +7 -1
  2. csv_detective/cli.py +33 -21
  3. csv_detective/{detect_fields/FR → detection}/__init__.py +0 -0
  4. csv_detective/detection/columns.py +89 -0
  5. csv_detective/detection/encoding.py +29 -0
  6. csv_detective/detection/engine.py +46 -0
  7. csv_detective/detection/formats.py +156 -0
  8. csv_detective/detection/headers.py +28 -0
  9. csv_detective/detection/rows.py +18 -0
  10. csv_detective/detection/separator.py +44 -0
  11. csv_detective/detection/variables.py +97 -0
  12. csv_detective/explore_csv.py +151 -377
  13. csv_detective/format.py +67 -0
  14. csv_detective/formats/__init__.py +9 -0
  15. csv_detective/formats/adresse.py +116 -0
  16. csv_detective/formats/binary.py +26 -0
  17. csv_detective/formats/booleen.py +35 -0
  18. csv_detective/formats/code_commune_insee.py +26 -0
  19. csv_detective/formats/code_csp_insee.py +36 -0
  20. csv_detective/formats/code_departement.py +29 -0
  21. csv_detective/formats/code_fantoir.py +21 -0
  22. csv_detective/formats/code_import.py +17 -0
  23. csv_detective/formats/code_postal.py +25 -0
  24. csv_detective/formats/code_region.py +22 -0
  25. csv_detective/formats/code_rna.py +29 -0
  26. csv_detective/formats/code_waldec.py +17 -0
  27. csv_detective/formats/commune.py +27 -0
  28. csv_detective/formats/csp_insee.py +31 -0
  29. csv_detective/{detect_fields/FR/other/insee_ape700 → formats/data}/insee_ape700.txt +0 -0
  30. csv_detective/formats/date.py +99 -0
  31. csv_detective/formats/date_fr.py +22 -0
  32. csv_detective/formats/datetime_aware.py +45 -0
  33. csv_detective/formats/datetime_naive.py +48 -0
  34. csv_detective/formats/datetime_rfc822.py +24 -0
  35. csv_detective/formats/departement.py +37 -0
  36. csv_detective/formats/email.py +28 -0
  37. csv_detective/formats/float.py +29 -0
  38. csv_detective/formats/geojson.py +36 -0
  39. csv_detective/formats/insee_ape700.py +31 -0
  40. csv_detective/formats/insee_canton.py +28 -0
  41. csv_detective/formats/int.py +23 -0
  42. csv_detective/formats/iso_country_code_alpha2.py +30 -0
  43. csv_detective/formats/iso_country_code_alpha3.py +30 -0
  44. csv_detective/formats/iso_country_code_numeric.py +31 -0
  45. csv_detective/formats/jour_de_la_semaine.py +41 -0
  46. csv_detective/formats/json.py +20 -0
  47. csv_detective/formats/latitude_l93.py +48 -0
  48. csv_detective/formats/latitude_wgs.py +42 -0
  49. csv_detective/formats/latitude_wgs_fr_metropole.py +42 -0
  50. csv_detective/formats/latlon_wgs.py +53 -0
  51. csv_detective/formats/longitude_l93.py +39 -0
  52. csv_detective/formats/longitude_wgs.py +32 -0
  53. csv_detective/formats/longitude_wgs_fr_metropole.py +32 -0
  54. csv_detective/formats/lonlat_wgs.py +36 -0
  55. csv_detective/formats/mois_de_lannee.py +48 -0
  56. csv_detective/formats/money.py +18 -0
  57. csv_detective/formats/mongo_object_id.py +14 -0
  58. csv_detective/formats/pays.py +35 -0
  59. csv_detective/formats/percent.py +16 -0
  60. csv_detective/formats/region.py +70 -0
  61. csv_detective/formats/sexe.py +17 -0
  62. csv_detective/formats/siren.py +37 -0
  63. csv_detective/{detect_fields/FR/other/siret/__init__.py → formats/siret.py} +47 -29
  64. csv_detective/formats/tel_fr.py +36 -0
  65. csv_detective/formats/uai.py +36 -0
  66. csv_detective/formats/url.py +46 -0
  67. csv_detective/formats/username.py +14 -0
  68. csv_detective/formats/uuid.py +16 -0
  69. csv_detective/formats/year.py +28 -0
  70. csv_detective/output/__init__.py +65 -0
  71. csv_detective/output/dataframe.py +96 -0
  72. csv_detective/output/example.py +250 -0
  73. csv_detective/output/profile.py +119 -0
  74. csv_detective/{schema_generation.py → output/schema.py} +268 -343
  75. csv_detective/output/utils.py +74 -0
  76. csv_detective/{detect_fields/FR/geo → parsing}/__init__.py +0 -0
  77. csv_detective/parsing/columns.py +235 -0
  78. csv_detective/parsing/compression.py +11 -0
  79. csv_detective/parsing/csv.py +56 -0
  80. csv_detective/parsing/excel.py +167 -0
  81. csv_detective/parsing/load.py +111 -0
  82. csv_detective/parsing/text.py +56 -0
  83. csv_detective/utils.py +23 -196
  84. csv_detective/validate.py +138 -0
  85. csv_detective-0.9.3.dev2438.dist-info/METADATA +267 -0
  86. csv_detective-0.9.3.dev2438.dist-info/RECORD +92 -0
  87. csv_detective-0.9.3.dev2438.dist-info/WHEEL +4 -0
  88. {csv_detective-0.6.7.dist-info → csv_detective-0.9.3.dev2438.dist-info}/entry_points.txt +1 -0
  89. csv_detective/all_packages.txt +0 -104
  90. csv_detective/detect_fields/FR/geo/adresse/__init__.py +0 -100
  91. csv_detective/detect_fields/FR/geo/code_commune_insee/__init__.py +0 -24
  92. csv_detective/detect_fields/FR/geo/code_commune_insee/code_commune_insee.txt +0 -37600
  93. csv_detective/detect_fields/FR/geo/code_departement/__init__.py +0 -11
  94. csv_detective/detect_fields/FR/geo/code_fantoir/__init__.py +0 -15
  95. csv_detective/detect_fields/FR/geo/code_fantoir/code_fantoir.txt +0 -26122
  96. csv_detective/detect_fields/FR/geo/code_postal/__init__.py +0 -19
  97. csv_detective/detect_fields/FR/geo/code_postal/code_postal.txt +0 -36822
  98. csv_detective/detect_fields/FR/geo/code_region/__init__.py +0 -27
  99. csv_detective/detect_fields/FR/geo/commune/__init__.py +0 -21
  100. csv_detective/detect_fields/FR/geo/commune/commune.txt +0 -36745
  101. csv_detective/detect_fields/FR/geo/departement/__init__.py +0 -19
  102. csv_detective/detect_fields/FR/geo/departement/departement.txt +0 -101
  103. csv_detective/detect_fields/FR/geo/insee_canton/__init__.py +0 -20
  104. csv_detective/detect_fields/FR/geo/insee_canton/canton2017.txt +0 -2055
  105. csv_detective/detect_fields/FR/geo/insee_canton/cantons.txt +0 -2055
  106. csv_detective/detect_fields/FR/geo/latitude_l93/__init__.py +0 -13
  107. csv_detective/detect_fields/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -13
  108. csv_detective/detect_fields/FR/geo/longitude_l93/__init__.py +0 -13
  109. csv_detective/detect_fields/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -13
  110. csv_detective/detect_fields/FR/geo/pays/__init__.py +0 -17
  111. csv_detective/detect_fields/FR/geo/pays/pays.txt +0 -248
  112. csv_detective/detect_fields/FR/geo/region/__init__.py +0 -16
  113. csv_detective/detect_fields/FR/geo/region/region.txt +0 -44
  114. csv_detective/detect_fields/FR/other/__init__.py +0 -0
  115. csv_detective/detect_fields/FR/other/code_csp_insee/__init__.py +0 -26
  116. csv_detective/detect_fields/FR/other/code_csp_insee/code_csp_insee.txt +0 -498
  117. csv_detective/detect_fields/FR/other/code_rna/__init__.py +0 -8
  118. csv_detective/detect_fields/FR/other/code_waldec/__init__.py +0 -12
  119. csv_detective/detect_fields/FR/other/csp_insee/__init__.py +0 -16
  120. csv_detective/detect_fields/FR/other/date_fr/__init__.py +0 -12
  121. csv_detective/detect_fields/FR/other/insee_ape700/__init__.py +0 -16
  122. csv_detective/detect_fields/FR/other/sexe/__init__.py +0 -9
  123. csv_detective/detect_fields/FR/other/siren/__init__.py +0 -18
  124. csv_detective/detect_fields/FR/other/tel_fr/__init__.py +0 -15
  125. csv_detective/detect_fields/FR/other/uai/__init__.py +0 -15
  126. csv_detective/detect_fields/FR/temp/__init__.py +0 -0
  127. csv_detective/detect_fields/FR/temp/jour_de_la_semaine/__init__.py +0 -23
  128. csv_detective/detect_fields/FR/temp/mois_de_annee/__init__.py +0 -37
  129. csv_detective/detect_fields/__init__.py +0 -57
  130. csv_detective/detect_fields/geo/__init__.py +0 -0
  131. csv_detective/detect_fields/geo/iso_country_code_alpha2/__init__.py +0 -15
  132. csv_detective/detect_fields/geo/iso_country_code_alpha3/__init__.py +0 -14
  133. csv_detective/detect_fields/geo/iso_country_code_numeric/__init__.py +0 -15
  134. csv_detective/detect_fields/geo/json_geojson/__init__.py +0 -22
  135. csv_detective/detect_fields/geo/latitude_wgs/__init__.py +0 -13
  136. csv_detective/detect_fields/geo/latlon_wgs/__init__.py +0 -15
  137. csv_detective/detect_fields/geo/longitude_wgs/__init__.py +0 -13
  138. csv_detective/detect_fields/other/__init__.py +0 -0
  139. csv_detective/detect_fields/other/booleen/__init__.py +0 -21
  140. csv_detective/detect_fields/other/email/__init__.py +0 -8
  141. csv_detective/detect_fields/other/float/__init__.py +0 -17
  142. csv_detective/detect_fields/other/int/__init__.py +0 -12
  143. csv_detective/detect_fields/other/json/__init__.py +0 -24
  144. csv_detective/detect_fields/other/mongo_object_id/__init__.py +0 -8
  145. csv_detective/detect_fields/other/twitter/__init__.py +0 -8
  146. csv_detective/detect_fields/other/url/__init__.py +0 -11
  147. csv_detective/detect_fields/other/uuid/__init__.py +0 -11
  148. csv_detective/detect_fields/temp/__init__.py +0 -0
  149. csv_detective/detect_fields/temp/date/__init__.py +0 -62
  150. csv_detective/detect_fields/temp/datetime_iso/__init__.py +0 -18
  151. csv_detective/detect_fields/temp/datetime_rfc822/__init__.py +0 -21
  152. csv_detective/detect_fields/temp/year/__init__.py +0 -10
  153. csv_detective/detect_labels/FR/__init__.py +0 -0
  154. csv_detective/detect_labels/FR/geo/__init__.py +0 -0
  155. csv_detective/detect_labels/FR/geo/adresse/__init__.py +0 -40
  156. csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +0 -42
  157. csv_detective/detect_labels/FR/geo/code_departement/__init__.py +0 -33
  158. csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +0 -33
  159. csv_detective/detect_labels/FR/geo/code_postal/__init__.py +0 -41
  160. csv_detective/detect_labels/FR/geo/code_region/__init__.py +0 -33
  161. csv_detective/detect_labels/FR/geo/commune/__init__.py +0 -33
  162. csv_detective/detect_labels/FR/geo/departement/__init__.py +0 -47
  163. csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +0 -33
  164. csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +0 -54
  165. csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +0 -55
  166. csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +0 -44
  167. csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +0 -45
  168. csv_detective/detect_labels/FR/geo/pays/__init__.py +0 -45
  169. csv_detective/detect_labels/FR/geo/region/__init__.py +0 -45
  170. csv_detective/detect_labels/FR/other/__init__.py +0 -0
  171. csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +0 -33
  172. csv_detective/detect_labels/FR/other/code_rna/__init__.py +0 -38
  173. csv_detective/detect_labels/FR/other/code_waldec/__init__.py +0 -33
  174. csv_detective/detect_labels/FR/other/csp_insee/__init__.py +0 -37
  175. csv_detective/detect_labels/FR/other/date_fr/__init__.py +0 -33
  176. csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +0 -40
  177. csv_detective/detect_labels/FR/other/sexe/__init__.py +0 -33
  178. csv_detective/detect_labels/FR/other/siren/__init__.py +0 -41
  179. csv_detective/detect_labels/FR/other/siret/__init__.py +0 -40
  180. csv_detective/detect_labels/FR/other/tel_fr/__init__.py +0 -45
  181. csv_detective/detect_labels/FR/other/uai/__init__.py +0 -50
  182. csv_detective/detect_labels/FR/temp/__init__.py +0 -0
  183. csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +0 -41
  184. csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +0 -33
  185. csv_detective/detect_labels/__init__.py +0 -43
  186. csv_detective/detect_labels/geo/__init__.py +0 -0
  187. csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +0 -41
  188. csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +0 -41
  189. csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +0 -41
  190. csv_detective/detect_labels/geo/json_geojson/__init__.py +0 -42
  191. csv_detective/detect_labels/geo/latitude_wgs/__init__.py +0 -55
  192. csv_detective/detect_labels/geo/latlon_wgs/__init__.py +0 -67
  193. csv_detective/detect_labels/geo/longitude_wgs/__init__.py +0 -45
  194. csv_detective/detect_labels/other/__init__.py +0 -0
  195. csv_detective/detect_labels/other/booleen/__init__.py +0 -34
  196. csv_detective/detect_labels/other/email/__init__.py +0 -45
  197. csv_detective/detect_labels/other/float/__init__.py +0 -33
  198. csv_detective/detect_labels/other/int/__init__.py +0 -33
  199. csv_detective/detect_labels/other/money/__init__.py +0 -11
  200. csv_detective/detect_labels/other/money/check_col_name.py +0 -8
  201. csv_detective/detect_labels/other/mongo_object_id/__init__.py +0 -33
  202. csv_detective/detect_labels/other/twitter/__init__.py +0 -33
  203. csv_detective/detect_labels/other/url/__init__.py +0 -48
  204. csv_detective/detect_labels/other/uuid/__init__.py +0 -33
  205. csv_detective/detect_labels/temp/__init__.py +0 -0
  206. csv_detective/detect_labels/temp/date/__init__.py +0 -51
  207. csv_detective/detect_labels/temp/datetime_iso/__init__.py +0 -45
  208. csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +0 -44
  209. csv_detective/detect_labels/temp/year/__init__.py +0 -44
  210. csv_detective/detection.py +0 -361
  211. csv_detective/process_text.py +0 -39
  212. csv_detective/s3_utils.py +0 -48
  213. csv_detective-0.6.7.data/data/share/csv_detective/CHANGELOG.md +0 -118
  214. csv_detective-0.6.7.data/data/share/csv_detective/LICENSE.AGPL.txt +0 -661
  215. csv_detective-0.6.7.data/data/share/csv_detective/README.md +0 -247
  216. csv_detective-0.6.7.dist-info/LICENSE.AGPL.txt +0 -661
  217. csv_detective-0.6.7.dist-info/METADATA +0 -23
  218. csv_detective-0.6.7.dist-info/RECORD +0 -150
  219. csv_detective-0.6.7.dist-info/WHEEL +0 -5
  220. csv_detective-0.6.7.dist-info/top_level.txt +0 -2
  221. tests/__init__.py +0 -0
  222. tests/test_fields.py +0 -360
  223. tests/test_file.py +0 -116
  224. tests/test_labels.py +0 -7
  225. /csv_detective/{detect_fields/FR/other/csp_insee → formats/data}/csp_insee.txt +0 -0
  226. /csv_detective/{detect_fields/geo/iso_country_code_alpha2 → formats/data}/iso_country_code_alpha2.txt +0 -0
  227. /csv_detective/{detect_fields/geo/iso_country_code_alpha3 → formats/data}/iso_country_code_alpha3.txt +0 -0
  228. /csv_detective/{detect_fields/geo/iso_country_code_numeric → formats/data}/iso_country_code_numeric.txt +0 -0
@@ -1,45 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = [
14
- 'email',
15
- 'mail',
16
- 'courriel',
17
- 'contact',
18
- 'mel',
19
- 'lieucourriel',
20
- 'coordinates.emailcontact',
21
- 'e mail',
22
- 'mo mail',
23
- 'adresse mail',
24
- 'adresse email'
25
- ]
26
- processed_header = _process_text(header)
27
-
28
- header_matches_words_combination = float(
29
- any(
30
- [
31
- words_combination == processed_header for words_combination in words_combinations_list
32
- ]
33
- )
34
- )
35
- words_combination_in_header = 0.5 * float(
36
- any(
37
- [
38
- full_word_strictly_inside_string(
39
- words_combination, processed_header
40
- ) for words_combination in words_combinations_list
41
- ]
42
- )
43
- )
44
-
45
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,33 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = ['part', 'ratio']
14
- processed_header = _process_text(header)
15
-
16
- header_matches_words_combination = float(
17
- any(
18
- [
19
- words_combination == processed_header for words_combination in words_combinations_list
20
- ]
21
- )
22
- )
23
- words_combination_in_header = 0.5 * float(
24
- any(
25
- [
26
- full_word_strictly_inside_string(
27
- words_combination, processed_header
28
- ) for words_combination in words_combinations_list
29
- ]
30
- )
31
- )
32
-
33
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,33 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = ['nb', 'nombre', 'nbre']
14
- processed_header = _process_text(header)
15
-
16
- header_matches_words_combination = float(
17
- any(
18
- [
19
- words_combination == processed_header for words_combination in words_combinations_list
20
- ]
21
- )
22
- )
23
- words_combination_in_header = 0.5 * float(
24
- any(
25
- [
26
- full_word_strictly_inside_string(
27
- words_combination, processed_header
28
- ) for words_combination in words_combinations_list
29
- ]
30
- )
31
- )
32
-
33
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,11 +0,0 @@
1
- PROPORTION = 0.5
2
-
3
-
4
- def _is(header):
5
- '''
6
- Returns 1 if at least one of the mentionned words is in the label, else 0
7
- '''
8
-
9
- words_list = ['budget', 'salaire', 'euro', 'euros', 'prêt', 'montant']
10
-
11
- return float(any([word in header.lower() for word in words_list]))
@@ -1,8 +0,0 @@
1
- def is_col_name_related_to_money(name):
2
- # TODO : make this a little bit more clever (spacy ?)
3
- col_name_related_to_money = False
4
- money_themes = ['budget', 'salaire', 'euro', 'euros', 'prêt', 'montant']
5
- # TODO attention 'européeen' est détecté OK
6
- for theme in money_themes:
7
- col_name_related_to_money = col_name_related_to_money or (theme in name)
8
- return col_name_related_to_money
@@ -1,33 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = ['id', 'objectid']
14
- processed_header = _process_text(header)
15
-
16
- header_matches_words_combination = float(
17
- any(
18
- [
19
- words_combination == processed_header for words_combination in words_combinations_list
20
- ]
21
- )
22
- )
23
- words_combination_in_header = 0.5 * float(
24
- any(
25
- [
26
- full_word_strictly_inside_string(
27
- words_combination, processed_header
28
- ) for words_combination in words_combinations_list
29
- ]
30
- )
31
- )
32
-
33
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,33 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = ['twitter', 'twitter account', 'twitter username']
14
- processed_header = _process_text(header)
15
-
16
- header_matches_words_combination = float(
17
- any(
18
- [
19
- words_combination == processed_header for words_combination in words_combinations_list
20
- ]
21
- )
22
- )
23
- words_combination_in_header = 0.5 * float(
24
- any(
25
- [
26
- full_word_strictly_inside_string(
27
- words_combination, processed_header
28
- ) for words_combination in words_combinations_list
29
- ]
30
- )
31
- )
32
-
33
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,48 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = [
14
- 'url',
15
- 'url source',
16
- 'site web',
17
- 'source url',
18
- 'site internet',
19
- 'remote url',
20
- 'web',
21
- 'site',
22
- 'lien',
23
- 'site data',
24
- 'lien url',
25
- 'lien vers le fichier',
26
- 'sitweb',
27
- 'interneturl'
28
- ]
29
- processed_header = _process_text(header)
30
-
31
- header_matches_words_combination = float(
32
- any(
33
- [
34
- words_combination == processed_header for words_combination in words_combinations_list
35
- ]
36
- )
37
- )
38
- words_combination_in_header = 0.5 * float(
39
- any(
40
- [
41
- full_word_strictly_inside_string(
42
- words_combination, processed_header
43
- ) for words_combination in words_combinations_list
44
- ]
45
- )
46
- )
47
-
48
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,33 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = ['id', 'uuid', 'guid']
14
- processed_header = _process_text(header)
15
-
16
- header_matches_words_combination = float(
17
- any(
18
- [
19
- words_combination == processed_header for words_combination in words_combinations_list
20
- ]
21
- )
22
- )
23
- words_combination_in_header = 0.5 * float(
24
- any(
25
- [
26
- full_word_strictly_inside_string(
27
- words_combination, processed_header
28
- ) for words_combination in words_combinations_list
29
- ]
30
- )
31
- )
32
-
33
- return max(header_matches_words_combination, words_combination_in_header)
File without changes
@@ -1,51 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = [
14
- 'date',
15
- 'jour',
16
- 'date de mise a jour',
17
- 'sns date',
18
- 'date maj',
19
- 'rem date',
20
- 'periode',
21
- 'date de publication',
22
- 'dpc',
23
- 'extract date',
24
- 'date immatriculation',
25
- 'date jeu donnees',
26
- 'datemaj',
27
- 'dateouv',
28
- 'date der maj',
29
- 'dmaj',
30
- 'jour'
31
- ]
32
- processed_header = _process_text(header)
33
-
34
- header_matches_words_combination = float(
35
- any(
36
- [
37
- words_combination == processed_header for words_combination in words_combinations_list
38
- ]
39
- )
40
- )
41
- words_combination_in_header = 0.5 * float(
42
- any(
43
- [
44
- full_word_strictly_inside_string(
45
- words_combination, processed_header
46
- ) for words_combination in words_combinations_list
47
- ]
48
- )
49
- )
50
-
51
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,45 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = [
14
- 'datetime iso',
15
- 'datetime',
16
- 'timestamp',
17
- 'osm_timestamp',
18
- 'date',
19
- 'created at',
20
- 'last update',
21
- 'date maj',
22
- 'createdat',
23
- 'date naissance',
24
- 'date donnees'
25
- ]
26
- processed_header = _process_text(header)
27
-
28
- header_matches_words_combination = float(
29
- any(
30
- [
31
- words_combination == processed_header for words_combination in words_combinations_list
32
- ]
33
- )
34
- )
35
- words_combination_in_header = 0.5 * float(
36
- any(
37
- [
38
- full_word_strictly_inside_string(
39
- words_combination, processed_header
40
- ) for words_combination in words_combinations_list
41
- ]
42
- )
43
- )
44
-
45
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,44 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = [
14
- 'datetime',
15
- 'timestamp',
16
- 'osm_timestamp',
17
- 'date',
18
- 'created at',
19
- 'last update',
20
- 'date maj',
21
- 'createdat',
22
- 'date naissance',
23
- 'date donnees'
24
- ] # Almost same as IS0, no example in data
25
- processed_header = _process_text(header)
26
-
27
- header_matches_words_combination = float(
28
- any(
29
- [
30
- words_combination == processed_header for words_combination in words_combinations_list
31
- ]
32
- )
33
- )
34
- words_combination_in_header = 0.5 * float(
35
- any(
36
- [
37
- full_word_strictly_inside_string(
38
- words_combination, processed_header
39
- ) for words_combination in words_combinations_list
40
- ]
41
- )
42
- )
43
-
44
- return max(header_matches_words_combination, words_combination_in_header)
@@ -1,44 +0,0 @@
1
- from csv_detective.utils import full_word_strictly_inside_string
2
- from csv_detective.process_text import _process_text
3
-
4
- PROPORTION = 0.5
5
-
6
-
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = [
14
- 'year',
15
- 'annee',
16
- 'annee depot',
17
- 'an nais',
18
- 'exercice',
19
- 'data year',
20
- 'annee de publication',
21
- 'exercice comptable',
22
- 'annee de naissance',
23
- 'annee ouverture'
24
- ]
25
- processed_header = _process_text(header)
26
-
27
- header_matches_words_combination = float(
28
- any(
29
- [
30
- words_combination == processed_header for words_combination in words_combinations_list
31
- ]
32
- )
33
- )
34
- words_combination_in_header = 0.5 * float(
35
- any(
36
- [
37
- full_word_strictly_inside_string(
38
- words_combination, processed_header
39
- ) for words_combination in words_combinations_list
40
- ]
41
- )
42
- )
43
-
44
- return max(header_matches_words_combination, words_combination_in_header)