csv-detective 0.8.1.dev1362__py3-none-any.whl → 0.8.1.dev1380__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. csv_detective/detect_labels/FR/geo/adresse/__init__.py +9 -34
  2. csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py +11 -36
  3. csv_detective/detect_labels/FR/geo/code_departement/__init__.py +11 -29
  4. csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py +8 -29
  5. csv_detective/detect_labels/FR/geo/code_postal/__init__.py +10 -35
  6. csv_detective/detect_labels/FR/geo/code_region/__init__.py +10 -29
  7. csv_detective/detect_labels/FR/geo/commune/__init__.py +8 -29
  8. csv_detective/detect_labels/FR/geo/departement/__init__.py +16 -41
  9. csv_detective/detect_labels/FR/geo/insee_canton/__init__.py +9 -29
  10. csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py +24 -48
  11. csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py +24 -49
  12. csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py +15 -38
  13. csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py +14 -38
  14. csv_detective/detect_labels/FR/geo/pays/__init__.py +14 -39
  15. csv_detective/detect_labels/FR/geo/region/__init__.py +14 -39
  16. csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py +4 -29
  17. csv_detective/detect_labels/FR/other/code_rna/__init__.py +7 -32
  18. csv_detective/detect_labels/FR/other/code_waldec/__init__.py +4 -29
  19. csv_detective/detect_labels/FR/other/csp_insee/__init__.py +6 -30
  20. csv_detective/detect_labels/FR/other/date_fr/__init__.py +5 -29
  21. csv_detective/detect_labels/FR/other/insee_ape700/__init__.py +9 -34
  22. csv_detective/detect_labels/FR/other/sexe/__init__.py +4 -29
  23. csv_detective/detect_labels/FR/other/siren/__init__.py +10 -35
  24. csv_detective/detect_labels/FR/other/siret/__init__.py +9 -34
  25. csv_detective/detect_labels/FR/other/tel_fr/__init__.py +14 -38
  26. csv_detective/detect_labels/FR/other/uai/__init__.py +17 -42
  27. csv_detective/detect_labels/FR/temp/jour_de_la_semaine/__init__.py +10 -35
  28. csv_detective/detect_labels/FR/temp/mois_de_annee/__init__.py +4 -29
  29. csv_detective/detect_labels/geo/iso_country_code_alpha2/__init__.py +10 -35
  30. csv_detective/detect_labels/geo/iso_country_code_alpha3/__init__.py +10 -35
  31. csv_detective/detect_labels/geo/iso_country_code_numeric/__init__.py +10 -35
  32. csv_detective/detect_labels/geo/json_geojson/__init__.py +11 -36
  33. csv_detective/detect_labels/geo/latitude_wgs/__init__.py +24 -49
  34. csv_detective/detect_labels/geo/latlon_wgs/__init__.py +37 -61
  35. csv_detective/detect_labels/geo/longitude_wgs/__init__.py +14 -38
  36. csv_detective/detect_labels/other/booleen/__init__.py +4 -30
  37. csv_detective/detect_labels/other/email/__init__.py +14 -39
  38. csv_detective/detect_labels/other/float/__init__.py +4 -29
  39. csv_detective/detect_labels/other/int/__init__.py +4 -29
  40. csv_detective/detect_labels/other/money/__init__.py +5 -8
  41. csv_detective/detect_labels/other/mongo_object_id/__init__.py +3 -28
  42. csv_detective/detect_labels/other/twitter/__init__.py +4 -29
  43. csv_detective/detect_labels/other/url/__init__.py +17 -42
  44. csv_detective/detect_labels/other/uuid/__init__.py +4 -29
  45. csv_detective/detect_labels/temp/date/__init__.py +22 -47
  46. csv_detective/detect_labels/temp/datetime_iso/__init__.py +14 -39
  47. csv_detective/detect_labels/temp/datetime_rfc822/__init__.py +13 -38
  48. csv_detective/detect_labels/temp/year/__init__.py +13 -38
  49. csv_detective/parsing/text.py +42 -20
  50. csv_detective/utils.py +0 -4
  51. {csv_detective-0.8.1.dev1362.data → csv_detective-0.8.1.dev1380.data}/data/share/csv_detective/CHANGELOG.md +1 -1
  52. {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1380.dist-info}/METADATA +1 -1
  53. {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1380.dist-info}/RECORD +60 -61
  54. {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1380.dist-info}/WHEEL +1 -1
  55. tests/test_labels.py +18 -2
  56. csv_detective/detect_labels/other/money/check_col_name.py +0 -8
  57. {csv_detective-0.8.1.dev1362.data → csv_detective-0.8.1.dev1380.data}/data/share/csv_detective/LICENSE.AGPL.txt +0 -0
  58. {csv_detective-0.8.1.dev1362.data → csv_detective-0.8.1.dev1380.data}/data/share/csv_detective/README.md +0 -0
  59. {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1380.dist-info}/entry_points.txt +0 -0
  60. {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1380.dist-info}/licenses/LICENSE.AGPL.txt +0 -0
  61. {csv_detective-0.8.1.dev1362.dist-info → csv_detective-0.8.1.dev1380.dist-info}/top_level.txt +0 -0
@@ -1,45 +1,20 @@
1
- from csv_detective.utils import is_word_in_string
2
- from csv_detective.parsing.text import _process_text
1
+ from csv_detective.parsing.text import header_score
3
2
 
4
3
  PROPORTION = 0.5
5
4
 
6
5
 
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
6
+ def _is(header: str) -> float:
13
7
  words_combinations_list = [
14
- 'email',
15
- 'mail',
16
- 'courriel',
17
- 'contact',
18
- 'mel',
19
- 'lieucourriel',
20
- 'coordinates.emailcontact',
21
- 'e mail',
22
- 'mo mail',
23
- 'adresse mail',
24
- 'adresse email'
8
+ "email",
9
+ "mail",
10
+ "courriel",
11
+ "contact",
12
+ "mel",
13
+ "lieucourriel",
14
+ "coordinates.emailcontact",
15
+ "e mail",
16
+ "mo mail",
17
+ "adresse mail",
18
+ "adresse email",
25
19
  ]
26
- processed_header = _process_text(header)
27
-
28
- header_matches_words_combination = float(
29
- any(
30
- [
31
- words_combination == processed_header for words_combination in words_combinations_list
32
- ]
33
- )
34
- )
35
- words_combination_in_header = 0.5 * float(
36
- any(
37
- [
38
- is_word_in_string(
39
- words_combination, processed_header
40
- ) for words_combination in words_combinations_list
41
- ]
42
- )
43
- )
44
-
45
- return max(header_matches_words_combination, words_combination_in_header)
20
+ return header_score(header, words_combinations_list)
@@ -1,33 +1,8 @@
1
- from csv_detective.utils import is_word_in_string
2
- from csv_detective.parsing.text import _process_text
1
+ from csv_detective.parsing.text import header_score
3
2
 
4
3
  PROPORTION = 0.5
5
4
 
6
5
 
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = ['part', 'ratio']
14
- processed_header = _process_text(header)
15
-
16
- header_matches_words_combination = float(
17
- any(
18
- [
19
- words_combination == processed_header for words_combination in words_combinations_list
20
- ]
21
- )
22
- )
23
- words_combination_in_header = 0.5 * float(
24
- any(
25
- [
26
- is_word_in_string(
27
- words_combination, processed_header
28
- ) for words_combination in words_combinations_list
29
- ]
30
- )
31
- )
32
-
33
- return max(header_matches_words_combination, words_combination_in_header)
6
+ def _is(header: str) -> float:
7
+ words_combinations_list = ["part", "ratio", "taux"]
8
+ return header_score(header, words_combinations_list)
@@ -1,33 +1,8 @@
1
- from csv_detective.utils import is_word_in_string
2
- from csv_detective.parsing.text import _process_text
1
+ from csv_detective.parsing.text import header_score
3
2
 
4
3
  PROPORTION = 0.5
5
4
 
6
5
 
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = ['nb', 'nombre', 'nbre']
14
- processed_header = _process_text(header)
15
-
16
- header_matches_words_combination = float(
17
- any(
18
- [
19
- words_combination == processed_header for words_combination in words_combinations_list
20
- ]
21
- )
22
- )
23
- words_combination_in_header = 0.5 * float(
24
- any(
25
- [
26
- is_word_in_string(
27
- words_combination, processed_header
28
- ) for words_combination in words_combinations_list
29
- ]
30
- )
31
- )
32
-
33
- return max(header_matches_words_combination, words_combination_in_header)
6
+ def _is(header: str) -> float:
7
+ words_combinations_list = ["nb", "nombre", "nbre"]
8
+ return header_score(header, words_combinations_list)
@@ -1,11 +1,8 @@
1
- PROPORTION = 0.5
2
-
1
+ from csv_detective.parsing.text import header_score
3
2
 
4
- def _is(header):
5
- '''
6
- Returns 1 if at least one of the mentionned words is in the label, else 0
7
- '''
3
+ PROPORTION = 0.5
8
4
 
9
- words_list = ['budget', 'salaire', 'euro', 'euros', 'prêt', 'montant']
10
5
 
11
- return float(any([word in header.lower() for word in words_list]))
6
+ def _is(header: str) -> float:
7
+ words_combinations_list = ["budget", "salaire", "euro", "euros", "prêt", "montant"]
8
+ return header_score(header, words_combinations_list)
@@ -1,33 +1,8 @@
1
- from csv_detective.utils import is_word_in_string
2
- from csv_detective.parsing.text import _process_text
1
+ from csv_detective.parsing.text import header_score
3
2
 
4
3
  PROPORTION = 0.5
5
4
 
6
5
 
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
6
+ def _is(header: str) -> float:
13
7
  words_combinations_list = ['id', 'objectid']
14
- processed_header = _process_text(header)
15
-
16
- header_matches_words_combination = float(
17
- any(
18
- [
19
- words_combination == processed_header for words_combination in words_combinations_list
20
- ]
21
- )
22
- )
23
- words_combination_in_header = 0.5 * float(
24
- any(
25
- [
26
- is_word_in_string(
27
- words_combination, processed_header
28
- ) for words_combination in words_combinations_list
29
- ]
30
- )
31
- )
32
-
33
- return max(header_matches_words_combination, words_combination_in_header)
8
+ return header_score(header, words_combinations_list)
@@ -1,33 +1,8 @@
1
- from csv_detective.utils import is_word_in_string
2
- from csv_detective.parsing.text import _process_text
1
+ from csv_detective.parsing.text import header_score
3
2
 
4
3
  PROPORTION = 0.5
5
4
 
6
5
 
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = ['twitter', 'twitter account', 'twitter username']
14
- processed_header = _process_text(header)
15
-
16
- header_matches_words_combination = float(
17
- any(
18
- [
19
- words_combination == processed_header for words_combination in words_combinations_list
20
- ]
21
- )
22
- )
23
- words_combination_in_header = 0.5 * float(
24
- any(
25
- [
26
- is_word_in_string(
27
- words_combination, processed_header
28
- ) for words_combination in words_combinations_list
29
- ]
30
- )
31
- )
32
-
33
- return max(header_matches_words_combination, words_combination_in_header)
6
+ def _is(header: str) -> float:
7
+ words_combinations_list = ["twitter", "twitter account", "twitter username"]
8
+ return header_score(header, words_combinations_list)
@@ -1,48 +1,23 @@
1
- from csv_detective.utils import is_word_in_string
2
- from csv_detective.parsing.text import _process_text
1
+ from csv_detective.parsing.text import header_score
3
2
 
4
3
  PROPORTION = 0.5
5
4
 
6
5
 
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
6
+ def _is(header: str) -> float:
13
7
  words_combinations_list = [
14
- 'url',
15
- 'url source',
16
- 'site web',
17
- 'source url',
18
- 'site internet',
19
- 'remote url',
20
- 'web',
21
- 'site',
22
- 'lien',
23
- 'site data',
24
- 'lien url',
25
- 'lien vers le fichier',
26
- 'sitweb',
27
- 'interneturl'
8
+ "url",
9
+ "url source",
10
+ "site web",
11
+ "source url",
12
+ "site internet",
13
+ "remote url",
14
+ "web",
15
+ "site",
16
+ "lien",
17
+ "site data",
18
+ "lien url",
19
+ "lien vers le fichier",
20
+ "sitweb",
21
+ "interneturl",
28
22
  ]
29
- processed_header = _process_text(header)
30
-
31
- header_matches_words_combination = float(
32
- any(
33
- [
34
- words_combination == processed_header for words_combination in words_combinations_list
35
- ]
36
- )
37
- )
38
- words_combination_in_header = 0.5 * float(
39
- any(
40
- [
41
- is_word_in_string(
42
- words_combination, processed_header
43
- ) for words_combination in words_combinations_list
44
- ]
45
- )
46
- )
47
-
48
- return max(header_matches_words_combination, words_combination_in_header)
23
+ return header_score(header, words_combinations_list)
@@ -1,33 +1,8 @@
1
- from csv_detective.utils import is_word_in_string
2
- from csv_detective.parsing.text import _process_text
1
+ from csv_detective.parsing.text import header_score
3
2
 
4
3
  PROPORTION = 0.5
5
4
 
6
5
 
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
13
- words_combinations_list = ['id', 'uuid', 'guid']
14
- processed_header = _process_text(header)
15
-
16
- header_matches_words_combination = float(
17
- any(
18
- [
19
- words_combination == processed_header for words_combination in words_combinations_list
20
- ]
21
- )
22
- )
23
- words_combination_in_header = 0.5 * float(
24
- any(
25
- [
26
- is_word_in_string(
27
- words_combination, processed_header
28
- ) for words_combination in words_combinations_list
29
- ]
30
- )
31
- )
32
-
33
- return max(header_matches_words_combination, words_combination_in_header)
6
+ def _is(header: str) -> float:
7
+ words_combinations_list = ["id", "uuid", "guid"]
8
+ return header_score(header, words_combinations_list)
@@ -1,53 +1,28 @@
1
- from csv_detective.utils import is_word_in_string
2
- from csv_detective.parsing.text import _process_text
1
+ from csv_detective.parsing.text import header_score
3
2
 
4
3
  PROPORTION = 0.5
5
4
 
6
5
 
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
6
+ def _is(header: str) -> float:
13
7
  words_combinations_list = [
14
- 'date',
15
- 'jour',
16
- 'date de mise a jour',
17
- 'sns date',
18
- 'date maj',
19
- 'rem date',
20
- 'periode',
21
- 'date de publication',
22
- 'dpc',
23
- 'extract date',
24
- 'date immatriculation',
25
- 'date jeu donnees',
26
- 'datemaj',
27
- 'dateouv',
28
- 'date der maj',
29
- 'dmaj',
30
- 'jour',
31
- 'yyyymmdd',
32
- 'aaaammjj',
8
+ "date",
9
+ "jour",
10
+ "date de mise a jour",
11
+ "sns date",
12
+ "date maj",
13
+ "rem date",
14
+ "periode",
15
+ "date de publication",
16
+ "dpc",
17
+ "extract date",
18
+ "date immatriculation",
19
+ "date jeu donnees",
20
+ "datemaj",
21
+ "dateouv",
22
+ "date der maj",
23
+ "dmaj",
24
+ "jour",
25
+ "yyyymmdd",
26
+ "aaaammjj",
33
27
  ]
34
- processed_header = _process_text(header)
35
-
36
- header_matches_words_combination = float(
37
- any(
38
- [
39
- words_combination == processed_header for words_combination in words_combinations_list
40
- ]
41
- )
42
- )
43
- words_combination_in_header = 0.5 * float(
44
- any(
45
- [
46
- is_word_in_string(
47
- words_combination, processed_header
48
- ) for words_combination in words_combinations_list
49
- ]
50
- )
51
- )
52
-
53
- return max(header_matches_words_combination, words_combination_in_header)
28
+ return header_score(header, words_combinations_list)
@@ -1,45 +1,20 @@
1
- from csv_detective.utils import is_word_in_string
2
- from csv_detective.parsing.text import _process_text
1
+ from csv_detective.parsing.text import header_score
3
2
 
4
3
  PROPORTION = 0.5
5
4
 
6
5
 
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
6
+ def _is(header: str) -> float:
13
7
  words_combinations_list = [
14
- 'datetime iso',
15
- 'datetime',
16
- 'timestamp',
17
- 'osm_timestamp',
18
- 'date',
19
- 'created at',
20
- 'last update',
21
- 'date maj',
22
- 'createdat',
23
- 'date naissance',
24
- 'date donnees'
8
+ "datetime iso",
9
+ "datetime",
10
+ "timestamp",
11
+ "osm_timestamp",
12
+ "date",
13
+ "created at",
14
+ "last update",
15
+ "date maj",
16
+ "createdat",
17
+ "date naissance",
18
+ "date donnees",
25
19
  ]
26
- processed_header = _process_text(header)
27
-
28
- header_matches_words_combination = float(
29
- any(
30
- [
31
- words_combination == processed_header for words_combination in words_combinations_list
32
- ]
33
- )
34
- )
35
- words_combination_in_header = 0.5 * float(
36
- any(
37
- [
38
- is_word_in_string(
39
- words_combination, processed_header
40
- ) for words_combination in words_combinations_list
41
- ]
42
- )
43
- )
44
-
45
- return max(header_matches_words_combination, words_combination_in_header)
20
+ return header_score(header, words_combinations_list)
@@ -1,44 +1,19 @@
1
- from csv_detective.utils import is_word_in_string
2
- from csv_detective.parsing.text import _process_text
1
+ from csv_detective.parsing.text import header_score
3
2
 
4
3
  PROPORTION = 0.5
5
4
 
6
5
 
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
6
+ def _is(header: str) -> float:
13
7
  words_combinations_list = [
14
- 'datetime',
15
- 'timestamp',
16
- 'osm_timestamp',
17
- 'date',
18
- 'created at',
19
- 'last update',
20
- 'date maj',
21
- 'createdat',
22
- 'date naissance',
23
- 'date donnees'
8
+ "datetime",
9
+ "timestamp",
10
+ "osm_timestamp",
11
+ "date",
12
+ "created at",
13
+ "last update",
14
+ "date maj",
15
+ "createdat",
16
+ "date naissance",
17
+ "date donnees",
24
18
  ] # Almost same as IS0, no example in data
25
- processed_header = _process_text(header)
26
-
27
- header_matches_words_combination = float(
28
- any(
29
- [
30
- words_combination == processed_header for words_combination in words_combinations_list
31
- ]
32
- )
33
- )
34
- words_combination_in_header = 0.5 * float(
35
- any(
36
- [
37
- is_word_in_string(
38
- words_combination, processed_header
39
- ) for words_combination in words_combinations_list
40
- ]
41
- )
42
- )
43
-
44
- return max(header_matches_words_combination, words_combination_in_header)
19
+ return header_score(header, words_combinations_list)
@@ -1,44 +1,19 @@
1
- from csv_detective.utils import is_word_in_string
2
- from csv_detective.parsing.text import _process_text
1
+ from csv_detective.parsing.text import header_score
3
2
 
4
3
  PROPORTION = 0.5
5
4
 
6
5
 
7
- def _is(header):
8
- '''
9
- Returns 1 if the (processed) header matches one of the expected words combination,
10
- else 0
11
- '''
12
-
6
+ def _is(header: str) -> float:
13
7
  words_combinations_list = [
14
- 'year',
15
- 'annee',
16
- 'annee depot',
17
- 'an nais',
18
- 'exercice',
19
- 'data year',
20
- 'annee de publication',
21
- 'exercice comptable',
22
- 'annee de naissance',
23
- 'annee ouverture'
8
+ "year",
9
+ "annee",
10
+ "annee depot",
11
+ "an nais",
12
+ "exercice",
13
+ "data year",
14
+ "annee de publication",
15
+ "exercice comptable",
16
+ "annee de naissance",
17
+ "annee ouverture",
24
18
  ]
25
- processed_header = _process_text(header)
26
-
27
- header_matches_words_combination = float(
28
- any(
29
- [
30
- words_combination == processed_header for words_combination in words_combinations_list
31
- ]
32
- )
33
- )
34
- words_combination_in_header = 0.5 * float(
35
- any(
36
- [
37
- is_word_in_string(
38
- words_combination, processed_header
39
- ) for words_combination in words_combinations_list
40
- ]
41
- )
42
- )
43
-
44
- return max(header_matches_words_combination, words_combination_in_header)
19
+ return header_score(header, words_combinations_list)
@@ -8,6 +8,17 @@ def camel_case_split(identifier: str):
8
8
  return " ".join([m.group(0) for m in matches])
9
9
 
10
10
 
11
+ translate_dict = {
12
+ " ": ["-", "_", "'", ",", " "],
13
+ "a": ["à", "â"],
14
+ "c": ["ç"],
15
+ "e": ["é", "è", "ê", "é"],
16
+ "i": ["î", "ï"],
17
+ "o": ["ô", "ö"],
18
+ "u": ["ù", "û", "ü"],
19
+ }
20
+
21
+
11
22
  # Process text
12
23
  def _process_text(val: str):
13
24
  """Traitement des chaînes de caractères pour les standardiser.
@@ -15,25 +26,36 @@ def _process_text(val: str):
15
26
  des méthodes hybrides, mais aucune ne s'est avérée plus performante."""
16
27
  val = camel_case_split(val)
17
28
  val = val.lower()
18
- val = val.replace("-", " ")
19
- val = val.replace("_", " ")
20
- val = val.replace("'", " ")
21
- val = val.replace(",", " ")
22
- val = val.replace(" ", " ")
23
- val = val.replace("à", "a")
24
- val = val.replace("â", "a")
25
- val = val.replace("ç", "c")
26
- val = val.replace("é", "e")
27
- val = val.replace("é", "e")
28
- val = val.replace("è", "e")
29
- val = val.replace("ê", "e")
30
- val = val.replace("î", "i")
31
- val = val.replace("ï", "i")
32
- val = val.replace("ô", "o")
33
- val = val.replace("ö", "o")
34
- val = val.replace("î", "i")
35
- val = val.replace("û", "u")
36
- val = val.replace("ù", "u")
37
- val = val.replace("ü", "u")
29
+ for target in translate_dict:
30
+ for source in translate_dict[target]:
31
+ val = val.replace(source, target)
38
32
  val = val.strip()
39
33
  return val
34
+
35
+
36
+ def is_word_in_string(word: str, string: str):
37
+ # if the substring is too short, the test can become irrelevant
38
+ return len(word) > 2 and word in string
39
+
40
+
41
+ def header_score(header: str, words_combinations_list: list[str]) -> float:
42
+ """Returns:
43
+ - 1 if the header is exactly in the specified list
44
+ - 0.5 if any of the words is within the header
45
+ - 0 otherwise"""
46
+ processed_header = _process_text(header)
47
+
48
+ header_matches_words_combination = float(
49
+ any(
50
+ words_combination == processed_header for words_combination in words_combinations_list
51
+ )
52
+ )
53
+ words_combination_in_header = 0.5 * (
54
+ any(
55
+ is_word_in_string(
56
+ words_combination, processed_header
57
+ ) for words_combination in words_combinations_list
58
+ )
59
+ )
60
+
61
+ return max(header_matches_words_combination, words_combination_in_header)
csv_detective/utils.py CHANGED
@@ -32,7 +32,3 @@ def prevent_nan(value: float) -> Optional[float]:
32
32
  if math.isnan(value):
33
33
  return None
34
34
  return value
35
-
36
-
37
- def is_word_in_string(word: str, string: str):
38
- return word in string