sportdb-config 0.5.2 → 0.5.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/sportdb/config/club.rb +36 -19
- data/lib/sportdb/config/club_index.rb +4 -16
- data/lib/sportdb/config/variants.rb +85 -6
- data/lib/sportdb/config/version.rb +1 -1
- data/lib/sportdb/config/wiki_index.rb +12 -13
- data/test/test_variants.rb +4 -3
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 43d84070a1a63efe77220f0be88991956c6696a7
|
4
|
+
data.tar.gz: 4b3c0a3a728065b648f52083f32eba1ebaeab43f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2461e383c70749bfc398de2561a057ab087a1d366644b4cf20d11530c19858d70974e0852b96db71041a6a1f7ebdabb34492a5874785227beea0140911f48f77
|
7
|
+
data.tar.gz: 72e12603145d651a9a2b6266755d1838d962513be478d3f1dbcfb2025b649a8bf4adcea875cdb9bc9424e95cdcb8511c433493125d90f1e5ce1c7e49dfec9993
|
data/lib/sportdb/config/club.rb
CHANGED
@@ -47,7 +47,7 @@ class Club
|
|
47
47
|
## check for duplicates
|
48
48
|
def duplicates?
|
49
49
|
names = [name] + alt_names + alt_names_auto
|
50
|
-
names = names.map { |name| normalize( name ) }
|
50
|
+
names = names.map { |name| normalize( sanitize(name) ) }
|
51
51
|
|
52
52
|
names.size != names.uniq.size
|
53
53
|
end
|
@@ -57,7 +57,7 @@ class Club
|
|
57
57
|
|
58
58
|
## calculate (count) frequency and select if greater than one
|
59
59
|
names.reduce( Hash.new ) do |h,name|
|
60
|
-
norm = normalize( name )
|
60
|
+
norm = normalize( sanitize(name) )
|
61
61
|
h[norm] ||= []
|
62
62
|
h[norm] << name; h
|
63
63
|
end.select { |norm,names| names.size > 1 }
|
@@ -83,26 +83,14 @@ class Club
|
|
83
83
|
|
84
84
|
def self.has_year?( name ) name =~ YEAR_REGEX; end
|
85
85
|
|
86
|
-
LANG_REGEX = /\[[a-z]{2}\]/
|
86
|
+
LANG_REGEX = /\[[a-z]{1,2}\]/ ## note also allow [a] or [d] or [e] - why? why not?
|
87
87
|
def self.strip_lang( name )
|
88
88
|
name.gsub( LANG_REGEX, '' ).strip
|
89
89
|
end
|
90
90
|
|
91
91
|
def self.has_lang?( name ) name =~ LANG_REGEX; end
|
92
92
|
|
93
|
-
|
94
|
-
## note: remove all dots (.), dash (-), ', º, /, etc.
|
95
|
-
## for norm(alizing) names
|
96
|
-
def self.strip_norm( name )
|
97
|
-
name.gsub( NORM_REGEX, '' )
|
98
|
-
end
|
99
|
-
|
100
|
-
def strip_year( name ) self.class.strip_year( name ); end
|
101
|
-
def strip_lang( name ) self.class.strip_lang( name ); end
|
102
|
-
def strip_norm( name ) self.class.strip_norm( name ); end
|
103
|
-
|
104
|
-
private
|
105
|
-
def sanitize( name )
|
93
|
+
def self.sanitize( name )
|
106
94
|
## check for year(s) e.g. (1887-1911), (-2013),
|
107
95
|
## (1946-2001,2013-) etc.
|
108
96
|
name = strip_year( name )
|
@@ -111,18 +99,47 @@ private
|
|
111
99
|
name
|
112
100
|
end
|
113
101
|
|
114
|
-
|
115
|
-
|
102
|
+
|
103
|
+
NORM_REGEX = /[.'º\-\/]/
|
104
|
+
## note: remove all dots (.), dash (-), ', º, /, etc.
|
105
|
+
## for norm(alizing) names
|
106
|
+
def self.strip_norm( name )
|
107
|
+
name.gsub( NORM_REGEX, '' )
|
108
|
+
end
|
109
|
+
|
110
|
+
def self.normalize( name )
|
111
|
+
# note: do NOT call sanitize here (keep normalize "atomic" for reuse)
|
116
112
|
|
117
113
|
## remove all dots (.), dash (-), º, /, etc.
|
118
114
|
name = strip_norm( name )
|
119
115
|
name = name.gsub( ' ', '' ) # note: also remove all spaces!!!
|
120
116
|
|
121
117
|
## todo/fix: use our own downcase - why? why not?
|
122
|
-
name = name
|
118
|
+
name = downcase_i18n( name ) ## do NOT care about upper and lowercase for now
|
123
119
|
name
|
124
120
|
end
|
125
121
|
|
122
|
+
|
123
|
+
def self.strip_wiki( name ) # todo/check: rename to strip_wikipedia_en - why? why not?
|
124
|
+
## note: strip disambiguation qualifier from wikipedia page name if present
|
125
|
+
## note: only remove year and foot... for now
|
126
|
+
## e.g. FC Wacker Innsbruck (2002) => FC Wacker Innsbruck
|
127
|
+
## Willem II (football club) => Willem II
|
128
|
+
##
|
129
|
+
## e.g. do NOT strip others !! e.g.
|
130
|
+
## América Futebol Clube (MG)
|
131
|
+
## only add more "special" cases on demand (that is, if we find more)
|
132
|
+
name = name.gsub( /\([12][^\)]+?\)/, '' ).strip ## starting with a digit 1 or 2 (assuming year)
|
133
|
+
name = name.gsub( /\(foot[^\)]+?\)/, '' ).strip ## starting with foot (assuming football ...)
|
134
|
+
name
|
135
|
+
end
|
136
|
+
|
137
|
+
|
138
|
+
private
|
139
|
+
## private "shortcut" convenience helpers
|
140
|
+
def sanitize( name ) self.class.sanitize( name ); end
|
141
|
+
def normalize( name ) self.class.normalize( name ); end
|
142
|
+
|
126
143
|
def variants( name ) Variant.find( name ); end
|
127
144
|
end # class Club
|
128
145
|
|
@@ -48,7 +48,8 @@ class ClubIndex
|
|
48
48
|
def strip_year( name ) Club.strip_year( name ); end
|
49
49
|
def has_year?( name) Club.has_year?( name ); end
|
50
50
|
def strip_lang( name ) Club.strip_lang( name ); end
|
51
|
-
def
|
51
|
+
def strip_wiki( name ) Club.strip_wiki( name ); end
|
52
|
+
def normalize( name ) Club.normalize( name ); end
|
52
53
|
|
53
54
|
|
54
55
|
def add_wiki( rec_or_recs ) ## add wiki(pedia) club record / links
|
@@ -62,9 +63,7 @@ class ClubIndex
|
|
62
63
|
## e.g. do NOT strip others !! e.g.
|
63
64
|
## América Futebol Clube (MG)
|
64
65
|
## only add more "special" cases on demand (that is, if we find more)
|
65
|
-
name = rec.name
|
66
|
-
name = name.gsub( /\([12][^\)]+?\)/, '' ).strip ## starting with a digit 1 or 2 (assuming year)
|
67
|
-
name = name.gsub( /\(foot[^\)]+?\)/, '' ).strip ## starting with foot (assuming football ...)
|
66
|
+
name = strip_wiki( rec.name )
|
68
67
|
|
69
68
|
m = match_by( name: name, country: rec.country )
|
70
69
|
if m.nil?
|
@@ -136,6 +135,7 @@ class ClubIndex
|
|
136
135
|
|
137
136
|
names.each_with_index do |name,i|
|
138
137
|
## check lang codes e.g. [en], [fr], etc.
|
138
|
+
## todo/check/fix: move strip_lang up in the chain - check for duplicates (e.g. only lang code marker different etc.) - why? why not?
|
139
139
|
name = strip_lang( name )
|
140
140
|
norm = normalize( name )
|
141
141
|
alt_recs = @clubs_by_name[ norm ]
|
@@ -205,18 +205,6 @@ class ClubIndex
|
|
205
205
|
end
|
206
206
|
end
|
207
207
|
end
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
private
|
212
|
-
def normalize( name )
|
213
|
-
name = strip_norm( name )
|
214
|
-
name = name.gsub( ' ', '' ) # remove all spaces
|
215
|
-
|
216
|
-
## todo/fix: use our own downcase - why? why not?
|
217
|
-
name = name.downcase ## do NOT care about upper and lowercase for now
|
218
|
-
name
|
219
|
-
end
|
220
208
|
end # class ClubIndex
|
221
209
|
|
222
210
|
|
@@ -21,33 +21,72 @@ end
|
|
21
21
|
## "simple" translation
|
22
22
|
ALPHA_SPECIALS = {
|
23
23
|
'Ä'=>'A', 'ä'=>'a',
|
24
|
-
|
24
|
+
'Á'=>'A', 'á'=>'a',
|
25
25
|
'à'=>'a',
|
26
26
|
'ã'=>'a',
|
27
27
|
'â'=>'a',
|
28
|
+
'Å'=>'A', 'å'=>'a',
|
29
|
+
'æ'=>'ae',
|
30
|
+
'ā'=>'a',
|
31
|
+
'ă'=>'a',
|
32
|
+
'ą'=>'a',
|
28
33
|
|
29
|
-
|
34
|
+
'Ç' =>'C', 'ç'=>'c',
|
35
|
+
'ć'=>'c',
|
36
|
+
'Č'=>'C', 'č'=>'c',
|
30
37
|
|
31
38
|
'É'=>'E', 'é'=>'e',
|
32
39
|
'è'=>'e',
|
33
40
|
'ê'=>'e',
|
41
|
+
'ë'=>'e',
|
42
|
+
'ė'=>'e',
|
43
|
+
'ę'=>'e',
|
34
44
|
|
35
|
-
'
|
45
|
+
'ğ'=>'g',
|
46
|
+
|
47
|
+
'İ'=>'I',
|
48
|
+
'Í'=>'I', 'í'=>'i',
|
36
49
|
'î'=>'i',
|
50
|
+
'ī'=>'i',
|
51
|
+
'ı'=>'i',
|
52
|
+
|
53
|
+
'Ł'=>'L', 'ł'=>'l',
|
37
54
|
|
38
55
|
'ñ'=>'n',
|
56
|
+
'ń'=>'n',
|
57
|
+
'ň'=>'n',
|
39
58
|
|
40
59
|
'Ö'=>'O', 'ö'=>'o',
|
41
60
|
'ó'=>'o',
|
42
61
|
'õ'=>'o',
|
43
62
|
'ô'=>'o',
|
63
|
+
'ø'=>'o',
|
64
|
+
'ő'=>'o',
|
44
65
|
|
45
|
-
|
46
|
-
'ú'=>'u',
|
66
|
+
'ř'=>'r',
|
47
67
|
|
68
|
+
'Ś'=>'S',
|
69
|
+
'Ş'=>'S', 'ş'=>'s',
|
70
|
+
'Š'=>'S', 'š'=>'s',
|
71
|
+
'ș'=>'s', ## U+0219
|
48
72
|
'ß'=>'ss',
|
73
|
+
|
74
|
+
'ţ'=>'t', ## U+0163
|
75
|
+
'ț'=>'t', ## U+021B
|
76
|
+
'þ'=>'th',
|
77
|
+
|
78
|
+
'Ü'=>'U', 'ü'=>'u',
|
79
|
+
'Ú'=>'U', 'ú'=>'u',
|
80
|
+
'ū'=>'u',
|
81
|
+
|
82
|
+
'ý'=>'y',
|
83
|
+
|
84
|
+
'ź'=>'z',
|
85
|
+
'ż'=>'z',
|
86
|
+
'Ž'=>'Z', 'ž'=>'z',
|
49
87
|
}
|
50
88
|
|
89
|
+
|
51
90
|
## de,at,ch translation for umlauts
|
52
91
|
ALPHA_SPECIALS_DE = {
|
53
92
|
'Ä'=>'Ae', 'ä'=>'ae',
|
@@ -59,6 +98,35 @@ ALPHA_SPECIALS_DE = {
|
|
59
98
|
## add ALPHA_SPECIALS_ES - why? why not? is Espanyol the Catalan spelling or the Spanish (Castilian) one?
|
60
99
|
# 'ñ'=>'ny', ## e.g. Español => Espanyol
|
61
100
|
|
101
|
+
ALPHA_DOWNCASE = %w[A B C D E F G H I J K L M N O P Q R S T U V W X Y Z].reduce({}) do |h,ch|
|
102
|
+
h[ch] = ch.downcase
|
103
|
+
h
|
104
|
+
end.merge(
|
105
|
+
'Ä'=>'ä',
|
106
|
+
'Á'=>'á',
|
107
|
+
'Å'=>'å',
|
108
|
+
|
109
|
+
'Ç'=>'ç',
|
110
|
+
'Č'=>'č',
|
111
|
+
|
112
|
+
'É'=>'é',
|
113
|
+
|
114
|
+
'İ'=>'?', ## fix - add lowercase
|
115
|
+
'Í'=>'í',
|
116
|
+
|
117
|
+
'Ł'=>'ł',
|
118
|
+
|
119
|
+
'Ö'=>'ö',
|
120
|
+
|
121
|
+
'Ś'=>'?', ## fix - add lowercase
|
122
|
+
'Ş'=>'ş',
|
123
|
+
'Š'=>'š',
|
124
|
+
|
125
|
+
'Ü'=>'ü',
|
126
|
+
'Ú'=>'ú',
|
127
|
+
|
128
|
+
'Ž'=>'ž',
|
129
|
+
)
|
62
130
|
|
63
131
|
|
64
132
|
def self.alpha_specials_count( freq, mapping )
|
@@ -99,8 +167,19 @@ def self.find( name )
|
|
99
167
|
alt_names = alt_names.uniq
|
100
168
|
alt_names
|
101
169
|
end
|
102
|
-
end # Variant
|
103
170
|
|
171
|
+
def self.downcase_i18n( name ) ## our very own downcase for int'l characters / letters
|
172
|
+
tr( name, ALPHA_DOWNCASE )
|
173
|
+
end
|
174
|
+
|
175
|
+
end # class Variant
|
104
176
|
|
105
177
|
end ## module Import
|
106
178
|
end ## module SportDb
|
179
|
+
|
180
|
+
|
181
|
+
|
182
|
+
## "global" convenience helper
|
183
|
+
def downcase_i18n( name )
|
184
|
+
SportDb::Import::Variant.downcase_i18n( name )
|
185
|
+
end # downcase_i18n
|
@@ -17,6 +17,15 @@ class WikiIndex
|
|
17
17
|
self.new( recs )
|
18
18
|
end
|
19
19
|
|
20
|
+
|
21
|
+
|
22
|
+
## helpers from club - use a helper module for includes - why? why not?
|
23
|
+
def strip_lang( name ) Club.strip_lang( name ); end
|
24
|
+
def strip_year( name ) Club.strip_year( name ); end
|
25
|
+
def normalize( name ) Club.normalize( name ); end
|
26
|
+
def strip_wiki( name) Club.strip_wiki( name ); end
|
27
|
+
|
28
|
+
|
20
29
|
def initialize( recs )
|
21
30
|
@pages_by_country = {}
|
22
31
|
|
@@ -24,21 +33,11 @@ class WikiIndex
|
|
24
33
|
## check for duplicate recs - report and exit on duplicate!!!!!!
|
25
34
|
recs.each do |rec|
|
26
35
|
h = @pages_by_country[ rec.country.key ] ||= {}
|
27
|
-
h[ normalize(rec.name) ] = rec
|
36
|
+
h[ normalize( strip_wiki( rec.name )) ] = rec
|
28
37
|
end
|
29
38
|
end
|
30
39
|
|
31
40
|
|
32
|
-
def normalize( name )
|
33
|
-
## todo/fix: (re)use normalize from Club!!!!
|
34
|
-
name = name.gsub( /[\-\.]/, '' )
|
35
|
-
name = name.gsub( ' ', '' ) ## remove spaces too
|
36
|
-
name = name.downcase
|
37
|
-
name
|
38
|
-
end
|
39
|
-
|
40
|
-
|
41
|
-
|
42
41
|
def find_by( club: ) ## todo/check: use find_by_club - why? why not?
|
43
42
|
find_by_club( club )
|
44
43
|
end
|
@@ -55,8 +54,8 @@ class WikiIndex
|
|
55
54
|
## todo/check: sort names ?
|
56
55
|
## sort by longest first (for best match)
|
57
56
|
names.each do |name|
|
58
|
-
##
|
59
|
-
rec = h[ normalize( name ) ]
|
57
|
+
## note: normalize AND sanitize (e.g. remove/strip year and lang e.g. (1946-2001), [en] too)
|
58
|
+
rec = h[ normalize( strip_year( strip_lang( name ))) ]
|
60
59
|
break if rec ## bingo!! found - break on first match
|
61
60
|
end
|
62
61
|
end
|
data/test/test_variants.rb
CHANGED
@@ -12,9 +12,10 @@ class TestVariants < MiniTest::Test
|
|
12
12
|
def variants( name ) SportDb::Import::Variant.find( name ); end
|
13
13
|
|
14
14
|
|
15
|
-
def
|
16
|
-
|
17
|
-
|
15
|
+
def test_downcase_i18n
|
16
|
+
assert_equal 'abcdefghijklmnopqrstuvwxyz', downcase_i18n( 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' )
|
17
|
+
assert_equal 'äöü', downcase_i18n( 'ÄÖÜ' )
|
18
|
+
assert_equal 'köln', downcase_i18n( 'KÖLN' )
|
18
19
|
end
|
19
20
|
|
20
21
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sportdb-config
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.2
|
4
|
+
version: 0.5.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-08-
|
11
|
+
date: 2019-08-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: csvreader
|