sportdb-config 0.5.2 → 0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/sportdb/config/club.rb +36 -19
- data/lib/sportdb/config/club_index.rb +4 -16
- data/lib/sportdb/config/variants.rb +85 -6
- data/lib/sportdb/config/version.rb +1 -1
- data/lib/sportdb/config/wiki_index.rb +12 -13
- data/test/test_variants.rb +4 -3
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 43d84070a1a63efe77220f0be88991956c6696a7
|
4
|
+
data.tar.gz: 4b3c0a3a728065b648f52083f32eba1ebaeab43f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2461e383c70749bfc398de2561a057ab087a1d366644b4cf20d11530c19858d70974e0852b96db71041a6a1f7ebdabb34492a5874785227beea0140911f48f77
|
7
|
+
data.tar.gz: 72e12603145d651a9a2b6266755d1838d962513be478d3f1dbcfb2025b649a8bf4adcea875cdb9bc9424e95cdcb8511c433493125d90f1e5ce1c7e49dfec9993
|
data/lib/sportdb/config/club.rb
CHANGED
@@ -47,7 +47,7 @@ class Club
|
|
47
47
|
## check for duplicates
|
48
48
|
def duplicates?
|
49
49
|
names = [name] + alt_names + alt_names_auto
|
50
|
-
names = names.map { |name| normalize( name ) }
|
50
|
+
names = names.map { |name| normalize( sanitize(name) ) }
|
51
51
|
|
52
52
|
names.size != names.uniq.size
|
53
53
|
end
|
@@ -57,7 +57,7 @@ class Club
|
|
57
57
|
|
58
58
|
## calculate (count) frequency and select if greater than one
|
59
59
|
names.reduce( Hash.new ) do |h,name|
|
60
|
-
norm = normalize( name )
|
60
|
+
norm = normalize( sanitize(name) )
|
61
61
|
h[norm] ||= []
|
62
62
|
h[norm] << name; h
|
63
63
|
end.select { |norm,names| names.size > 1 }
|
@@ -83,26 +83,14 @@ class Club
|
|
83
83
|
|
84
84
|
def self.has_year?( name ) name =~ YEAR_REGEX; end
|
85
85
|
|
86
|
-
LANG_REGEX = /\[[a-z]{2}\]/
|
86
|
+
LANG_REGEX = /\[[a-z]{1,2}\]/ ## note also allow [a] or [d] or [e] - why? why not?
|
87
87
|
def self.strip_lang( name )
|
88
88
|
name.gsub( LANG_REGEX, '' ).strip
|
89
89
|
end
|
90
90
|
|
91
91
|
def self.has_lang?( name ) name =~ LANG_REGEX; end
|
92
92
|
|
93
|
-
|
94
|
-
## note: remove all dots (.), dash (-), ', º, /, etc.
|
95
|
-
## for norm(alizing) names
|
96
|
-
def self.strip_norm( name )
|
97
|
-
name.gsub( NORM_REGEX, '' )
|
98
|
-
end
|
99
|
-
|
100
|
-
def strip_year( name ) self.class.strip_year( name ); end
|
101
|
-
def strip_lang( name ) self.class.strip_lang( name ); end
|
102
|
-
def strip_norm( name ) self.class.strip_norm( name ); end
|
103
|
-
|
104
|
-
private
|
105
|
-
def sanitize( name )
|
93
|
+
def self.sanitize( name )
|
106
94
|
## check for year(s) e.g. (1887-1911), (-2013),
|
107
95
|
## (1946-2001,2013-) etc.
|
108
96
|
name = strip_year( name )
|
@@ -111,18 +99,47 @@ private
|
|
111
99
|
name
|
112
100
|
end
|
113
101
|
|
114
|
-
|
115
|
-
|
102
|
+
|
103
|
+
NORM_REGEX = /[.'º\-\/]/
|
104
|
+
## note: remove all dots (.), dash (-), ', º, /, etc.
|
105
|
+
## for norm(alizing) names
|
106
|
+
def self.strip_norm( name )
|
107
|
+
name.gsub( NORM_REGEX, '' )
|
108
|
+
end
|
109
|
+
|
110
|
+
def self.normalize( name )
|
111
|
+
# note: do NOT call sanitize here (keep normalize "atomic" for reuse)
|
116
112
|
|
117
113
|
## remove all dots (.), dash (-), º, /, etc.
|
118
114
|
name = strip_norm( name )
|
119
115
|
name = name.gsub( ' ', '' ) # note: also remove all spaces!!!
|
120
116
|
|
121
117
|
## todo/fix: use our own downcase - why? why not?
|
122
|
-
name = name
|
118
|
+
name = downcase_i18n( name ) ## do NOT care about upper and lowercase for now
|
123
119
|
name
|
124
120
|
end
|
125
121
|
|
122
|
+
|
123
|
+
def self.strip_wiki( name ) # todo/check: rename to strip_wikipedia_en - why? why not?
|
124
|
+
## note: strip disambiguation qualifier from wikipedia page name if present
|
125
|
+
## note: only remove year and foot... for now
|
126
|
+
## e.g. FC Wacker Innsbruck (2002) => FC Wacker Innsbruck
|
127
|
+
## Willem II (football club) => Willem II
|
128
|
+
##
|
129
|
+
## e.g. do NOT strip others !! e.g.
|
130
|
+
## América Futebol Clube (MG)
|
131
|
+
## only add more "special" cases on demand (that, is) if we find more
|
132
|
+
name = name.gsub( /\([12][^\)]+?\)/, '' ).strip ## starting with a digit 1 or 2 (assuming year)
|
133
|
+
name = name.gsub( /\(foot[^\)]+?\)/, '' ).strip ## starting with foot (assuming football ...)
|
134
|
+
name
|
135
|
+
end
|
136
|
+
|
137
|
+
|
138
|
+
private
|
139
|
+
## private "shortcut" convenience helpers
|
140
|
+
def sanitize( name ) self.class.sanitize( name ); end
|
141
|
+
def normalize( name ) self.class.normalize( name ); end
|
142
|
+
|
126
143
|
def variants( name ) Variant.find( name ); end
|
127
144
|
end # class Club
|
128
145
|
|
@@ -48,7 +48,8 @@ class ClubIndex
|
|
48
48
|
def strip_year( name ) Club.strip_year( name ); end
|
49
49
|
def has_year?( name) Club.has_year?( name ); end
|
50
50
|
def strip_lang( name ) Club.strip_lang( name ); end
|
51
|
-
def
|
51
|
+
def strip_wiki( name ) Club.strip_wiki( name ); end
|
52
|
+
def normalize( name ) Club.normalize( name ); end
|
52
53
|
|
53
54
|
|
54
55
|
def add_wiki( rec_or_recs ) ## add wiki(pedia club record / links
|
@@ -62,9 +63,7 @@ class ClubIndex
|
|
62
63
|
## e.g. do NOT strip others !! e.g.
|
63
64
|
## América Futebol Clube (MG)
|
64
65
|
## only add more "special" cases on demand (that, is) if we find more
|
65
|
-
name = rec.name
|
66
|
-
name = name.gsub( /\([12][^\)]+?\)/, '' ).strip ## starting with a digit 1 or 2 (assuming year)
|
67
|
-
name = name.gsub( /\(foot[^\)]+?\)/, '' ).strip ## starting with foot (assuming football ...)
|
66
|
+
name = strip_wiki( rec.name )
|
68
67
|
|
69
68
|
m = match_by( name: name, country: rec.country )
|
70
69
|
if m.nil?
|
@@ -136,6 +135,7 @@ class ClubIndex
|
|
136
135
|
|
137
136
|
names.each_with_index do |name,i|
|
138
137
|
## check lang codes e.g. [en], [fr], etc.
|
138
|
+
## todo/check/fix: move strip_lang up in the chain - check for duplicates (e.g. only lang code marker different etc.) - why? why not?
|
139
139
|
name = strip_lang( name )
|
140
140
|
norm = normalize( name )
|
141
141
|
alt_recs = @clubs_by_name[ norm ]
|
@@ -205,18 +205,6 @@ class ClubIndex
|
|
205
205
|
end
|
206
206
|
end
|
207
207
|
end
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
private
|
212
|
-
def normalize( name )
|
213
|
-
name = strip_norm( name )
|
214
|
-
name = name.gsub( ' ', '' ) # remove all spaces
|
215
|
-
|
216
|
-
## todo/fix: use our own downcase - why? why not?
|
217
|
-
name = name.downcase ## do NOT care about upper and lowercase for now
|
218
|
-
name
|
219
|
-
end
|
220
208
|
end # class ClubIndex
|
221
209
|
|
222
210
|
|
@@ -21,33 +21,72 @@ end
|
|
21
21
|
## "simple" translation
|
22
22
|
ALPHA_SPECIALS = {
|
23
23
|
'Ä'=>'A', 'ä'=>'a',
|
24
|
-
|
24
|
+
'Á'=>'A', 'á'=>'a',
|
25
25
|
'à'=>'a',
|
26
26
|
'ã'=>'a',
|
27
27
|
'â'=>'a',
|
28
|
+
'Å'=>'A', 'å'=>'a',
|
29
|
+
'æ'=>'ae',
|
30
|
+
'ā'=>'a',
|
31
|
+
'ă'=>'a',
|
32
|
+
'ą'=>'a',
|
28
33
|
|
29
|
-
|
34
|
+
'Ç' =>'C', 'ç'=>'c',
|
35
|
+
'ć'=>'c',
|
36
|
+
'Č'=>'C', 'č'=>'c',
|
30
37
|
|
31
38
|
'É'=>'E', 'é'=>'e',
|
32
39
|
'è'=>'e',
|
33
40
|
'ê'=>'e',
|
41
|
+
'ë'=>'e',
|
42
|
+
'ė'=>'e',
|
43
|
+
'ę'=>'e',
|
34
44
|
|
35
|
-
'
|
45
|
+
'ğ'=>'g',
|
46
|
+
|
47
|
+
'İ'=>'I',
|
48
|
+
'Í'=>'I', 'í'=>'i',
|
36
49
|
'î'=>'i',
|
50
|
+
'ī'=>'i',
|
51
|
+
'ı'=>'i',
|
52
|
+
|
53
|
+
'Ł'=>'L', 'ł'=>'l',
|
37
54
|
|
38
55
|
'ñ'=>'n',
|
56
|
+
'ń'=>'n',
|
57
|
+
'ň'=>'n',
|
39
58
|
|
40
59
|
'Ö'=>'O', 'ö'=>'o',
|
41
60
|
'ó'=>'o',
|
42
61
|
'õ'=>'o',
|
43
62
|
'ô'=>'o',
|
63
|
+
'ø'=>'o',
|
64
|
+
'ő'=>'o',
|
44
65
|
|
45
|
-
|
46
|
-
'ú'=>'u',
|
66
|
+
'ř'=>'r',
|
47
67
|
|
68
|
+
'Ś'=>'S',
|
69
|
+
'Ş'=>'S', 'ş'=>'s',
|
70
|
+
'Š'=>'S', 'š'=>'s',
|
71
|
+
'ș'=>'s', ## U+0219
|
48
72
|
'ß'=>'ss',
|
73
|
+
|
74
|
+
'ţ'=>'t', ## U+0163
|
75
|
+
'ț'=>'t', ## U+021B
|
76
|
+
'þ'=>'th',
|
77
|
+
|
78
|
+
'Ü'=>'U', 'ü'=>'u',
|
79
|
+
'Ú'=>'U', 'ú'=>'u',
|
80
|
+
'ū'=>'u',
|
81
|
+
|
82
|
+
'ý'=>'y',
|
83
|
+
|
84
|
+
'ź'=>'z',
|
85
|
+
'ż'=>'z',
|
86
|
+
'Ž'=>'Z', 'ž'=>'z',
|
49
87
|
}
|
50
88
|
|
89
|
+
|
51
90
|
## de,at,ch translation for umlauts
|
52
91
|
ALPHA_SPECIALS_DE = {
|
53
92
|
'Ä'=>'Ae', 'ä'=>'ae',
|
@@ -59,6 +98,35 @@ ALPHA_SPECIALS_DE = {
|
|
59
98
|
## add ALPHA_SPECIALS_ES - why? why not? is Espanyol catalan spelling or spanish (castillian)?
|
60
99
|
# 'ñ'=>'ny', ## e.g. Español => Espanyol
|
61
100
|
|
101
|
+
ALPHA_DOWNCASE = %w[A B C D E F G H I J K L M N O P Q R S T U V W X Y Z].reduce({}) do |h,ch|
|
102
|
+
h[ch] = ch.downcase
|
103
|
+
h
|
104
|
+
end.merge(
|
105
|
+
'Ä'=>'ä',
|
106
|
+
'Á'=>'á',
|
107
|
+
'Å'=>'å',
|
108
|
+
|
109
|
+
'Ç'=>'ç',
|
110
|
+
'Č'=>'č',
|
111
|
+
|
112
|
+
'É'=>'é',
|
113
|
+
|
114
|
+
'İ'=>'?', ## fix - add lowercase
|
115
|
+
'Í'=>'í',
|
116
|
+
|
117
|
+
'Ł'=>'ł',
|
118
|
+
|
119
|
+
'Ö'=>'ö',
|
120
|
+
|
121
|
+
'Ś'=>'?', ## fix - add lowercase
|
122
|
+
'Ş'=>'ş',
|
123
|
+
'Š'=>'š',
|
124
|
+
|
125
|
+
'Ü'=>'ü',
|
126
|
+
'Ú'=>'ú',
|
127
|
+
|
128
|
+
'Ž'=>'ž',
|
129
|
+
)
|
62
130
|
|
63
131
|
|
64
132
|
def self.alpha_specials_count( freq, mapping )
|
@@ -99,8 +167,19 @@ def self.find( name )
|
|
99
167
|
alt_names = alt_names.uniq
|
100
168
|
alt_names
|
101
169
|
end
|
102
|
-
end # Variant
|
103
170
|
|
171
|
+
def self.downcase_i18n( name ) ## our very own downcase for int'l characters / letters
|
172
|
+
tr( name, ALPHA_DOWNCASE )
|
173
|
+
end
|
174
|
+
|
175
|
+
end # class Variant
|
104
176
|
|
105
177
|
end ## module Import
|
106
178
|
end ## module SportDb
|
179
|
+
|
180
|
+
|
181
|
+
|
182
|
+
## "global" convenience helper
|
183
|
+
def downcase_i18n( name )
|
184
|
+
SportDb::Import::Variant.downcase_i18n( name )
|
185
|
+
end # downcase_i18n
|
@@ -17,6 +17,15 @@ class WikiIndex
|
|
17
17
|
self.new( recs )
|
18
18
|
end
|
19
19
|
|
20
|
+
|
21
|
+
|
22
|
+
## helpers from club - use a helper module for includes - why? why not?
|
23
|
+
def strip_lang( name ) Club.strip_lang( name ); end
|
24
|
+
def strip_year( name ) Club.strip_year( name ); end
|
25
|
+
def normalize( name ) Club.normalize( name ); end
|
26
|
+
def strip_wiki( name) Club.strip_wiki( name ); end
|
27
|
+
|
28
|
+
|
20
29
|
def initialize( recs )
|
21
30
|
@pages_by_country = {}
|
22
31
|
|
@@ -24,21 +33,11 @@ class WikiIndex
|
|
24
33
|
## check for duplicate recs - report and exit on duplicate!!!!!!
|
25
34
|
recs.each do |rec|
|
26
35
|
h = @pages_by_country[ rec.country.key ] ||= {}
|
27
|
-
h[ normalize(rec.name) ] = rec
|
36
|
+
h[ normalize( strip_wiki( rec.name )) ] = rec
|
28
37
|
end
|
29
38
|
end
|
30
39
|
|
31
40
|
|
32
|
-
def normalize( name )
|
33
|
-
## todo/fix: (re)use normalize from Club!!!!
|
34
|
-
name = name.gsub( /[\-\.]/, '' )
|
35
|
-
name = name.gsub( ' ', '' ) ## remove spaces too
|
36
|
-
name = name.downcase
|
37
|
-
name
|
38
|
-
end
|
39
|
-
|
40
|
-
|
41
|
-
|
42
41
|
def find_by( club: ) ## todo/check: use find_by_club - why? why not?
|
43
42
|
find_by_club( club )
|
44
43
|
end
|
@@ -55,8 +54,8 @@ class WikiIndex
|
|
55
54
|
## todo/check: sort names ?
|
56
55
|
## sort by longest first (for best match)
|
57
56
|
names.each do |name|
|
58
|
-
##
|
59
|
-
rec = h[ normalize( name ) ]
|
57
|
+
## note: normalize AND sanitize (e.g. remove/strip year and lang e.g. (1946-2001), [en] too)
|
58
|
+
rec = h[ normalize( strip_year( strip_lang( name ))) ]
|
60
59
|
break if rec ## bingo!! found - break on first match
|
61
60
|
end
|
62
61
|
end
|
data/test/test_variants.rb
CHANGED
@@ -12,9 +12,10 @@ class TestVariants < MiniTest::Test
|
|
12
12
|
def variants( name ) SportDb::Import::Variant.find( name ); end
|
13
13
|
|
14
14
|
|
15
|
-
def
|
16
|
-
|
17
|
-
|
15
|
+
def test_downcase_i18n
|
16
|
+
assert_equal 'abcdefghijklmnopqrstuvwxyz', downcase_i18n( 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' )
|
17
|
+
assert_equal 'äöü', downcase_i18n( 'ÄÖÜ' )
|
18
|
+
assert_equal 'köln', downcase_i18n( 'KÖLN' )
|
18
19
|
end
|
19
20
|
|
20
21
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sportdb-config
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.
|
4
|
+
version: 0.5.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-08-
|
11
|
+
date: 2019-08-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: csvreader
|