sportdb-config 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/{HISTORY.md → CHANGELOG.md} +0 -0
- data/Manifest.txt +1 -11
- data/Rakefile +8 -4
- data/lib/sportdb/config.rb +9 -33
- data/lib/sportdb/config/config.rb +36 -4
- data/lib/sportdb/config/version.rb +1 -1
- data/test/test_clubs.rb +76 -20
- metadata +48 -16
- data/lib/sportdb/config/club.rb +0 -147
- data/lib/sportdb/config/club_index.rb +0 -212
- data/lib/sportdb/config/club_reader.rb +0 -278
- data/lib/sportdb/config/countries.rb +0 -58
- data/lib/sportdb/config/variants.rb +0 -185
- data/lib/sportdb/config/wiki_reader.rb +0 -104
- data/test/test_club_index.rb +0 -100
- data/test/test_club_reader.rb +0 -150
- data/test/test_variants.rb +0 -46
- data/test/test_wiki_reader.rb +0 -77
@@ -1,58 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module SportDb
|
4
|
-
module Import
|
5
|
-
|
6
|
-
## built-in countries for (quick starter) auto-add
|
7
|
-
|
8
|
-
## note: Country is an alias for ::Fifa::Country - (re)use the struct from the fifa country gem / library for now
|
9
|
-
Country = ::Fifa::Country
|
10
|
-
|
11
|
-
|
12
|
-
class CountryIndex
  ## Lookup table for country records by code.
  ##
  ##  Every record gets registered under its key (e.g. at) and -
  ##  if different - under its downcased fifa code (e.g. aut).
  ##  Records must respond to key and fifa
  ##    e.g. { key:'af', fifa:'AFG', name:'Afghanistan'}

  ## recs - list of country records to index right away (see add)
  def initialize( recs )
    @countries         = []
    @countries_by_code = {}

    add( recs )
  end

  ## Adds all records to the index.
  ##   Prints an error message and exits (status 1)
  ##   if a code is already taken. Returns recs.
  def add( recs )
    recs.each do |rec|
      @countries << rec

      register( rec.key, rec, 'key' )

      ## add fifa code (only) if present and different from key
      ##   note: records without a fifa code get indexed by key only
      fifa = rec.fifa.to_s.downcase
      register( fifa, rec, 'fifa' )  unless fifa.empty? || fifa == rec.key
    end
  end # method add

  ## Looks up a country by key or fifa code; returns nil if unknown.
  ##   allows symbols (and always downcases e.g. AUT to aut etc.)
  def []( key )
    @countries_by_code[ key.to_s.downcase ]
  end

  private

  ## Stores rec under code; kind (key or fifa) is used for the error message only.
  def register( code, rec, kind )
    if @countries_by_code[ code ]
      puts "** !! ERROR !! country code (#{kind}) >#{code}< already exists!!"
      exit 1
    end

    @countries_by_code[ code ] = rec
  end
end # class CountryIndex
|
55
|
-
|
56
|
-
|
57
|
-
end # module Import
|
58
|
-
end # module SportDb
|
@@ -1,185 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
module SportDb
|
4
|
-
module Import
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
class Variant    ## (spelling) variant finder / builder for names

  ## Counts how often every character occurs in name.
  ##   Returns a hash (char => count); unknown chars count as 0.
  ##   todo/check: use/rename to char_frequency_table
  def self.frequency_table( name )
    name.each_char.each_with_object( Hash.new(0) ) { |ch, freq| freq[ch] += 1 }
  end


  ## "simple" translation
  ALPHA_SPECIALS = {
    'Ä'=>'A',  'ä'=>'a',
    'Á'=>'A',  'á'=>'a',
               'à'=>'a',
               'ã'=>'a',
               'â'=>'a',
    'Å'=>'A',  'å'=>'a',
               'æ'=>'ae',
               'ā'=>'a',
               'ă'=>'a',
               'ą'=>'a',

    'Ç'=>'C',  'ç'=>'c',
               'ć'=>'c',
    'Č'=>'C',  'č'=>'c',

    'É'=>'E',  'é'=>'e',
               'è'=>'e',
               'ê'=>'e',
               'ë'=>'e',
               'ė'=>'e',
               'ę'=>'e',

               'ğ'=>'g',

    'İ'=>'I',
    'Í'=>'I',  'í'=>'i',
               'î'=>'i',
               'ī'=>'i',
               'ı'=>'i',

    'Ł'=>'L',  'ł'=>'l',

               'ñ'=>'n',
               'ń'=>'n',
               'ň'=>'n',

    'Ö'=>'O',  'ö'=>'o',
               'ó'=>'o',
               'õ'=>'o',
               'ô'=>'o',
               'ø'=>'o',
               'ő'=>'o',

               'ř'=>'r',

    'Ś'=>'S',  'ś'=>'s',   ## lowercase added - pairs up with ALPHA_DOWNCASE
    'Ş'=>'S',  'ş'=>'s',
    'Š'=>'S',  'š'=>'s',
               'ș'=>'s',   ## U+0219
               'ß'=>'ss',

               'ţ'=>'t',   ## U+0163
               'ț'=>'t',   ## U+021B
               'þ'=>'th',

    'Ü'=>'U',  'ü'=>'u',
    'Ú'=>'U',  'ú'=>'u',
               'ū'=>'u',

               'ý'=>'y',

               'ź'=>'z',
               'ż'=>'z',
    'Ž'=>'Z',  'ž'=>'z',
  }.freeze


  ## de,at,ch translation for umlauts
  ALPHA_SPECIALS_DE = {
    'Ä'=>'Ae',  'ä'=>'ae',
    'Ö'=>'Oe',  'ö'=>'oe',
    'Ü'=>'Ue',  'ü'=>'ue',
                'ß'=>'ss',
  }.freeze

  ## add ALPHA_SPECIALS_ES - why? why not?  is Espanyol catalan spelling or spanish (castilian)?
  #     'ñ'=>'ny',    ## e.g. Español => Espanyol

  ## uppercase => lowercase for A-Z plus the int'l uppercase letters known so far
  ALPHA_DOWNCASE = ('A'..'Z').each_with_object( {} ) { |ch, h| h[ch] = ch.downcase }.merge(
    'Ä'=>'ä',
    'Á'=>'á',
    'Å'=>'å',

    'Ç'=>'ç',
    'Č'=>'č',

    'É'=>'é',

    'İ'=>'i',   ## note: dotted capital I lowercases to a plain i (turkic rule)
    'Í'=>'í',

    'Ł'=>'ł',

    'Ö'=>'ö',

    'Ś'=>'ś',
    'Ş'=>'ş',
    'Š'=>'š',

    'Ü'=>'ü',
    'Ú'=>'ú',

    'Ž'=>'ž'
  ).freeze


  ## Sums up the counts (from the frequency table freq)
  ##   of all characters listed as keys in mapping.
  def self.alpha_specials_count( freq, mapping )
    mapping.each_key.reduce( 0 ) { |count, ch| count + freq[ch] }
  end

  ## Returns a copy of name with every character found in mapping replaced;
  ##   all other characters pass through unchanged.
  ##   note: the result gets built from the (encoded) characters itself
  ##         and not from a String.new buffer (that starts out as ASCII-8BIT)
  def self.tr( name, mapping )
    name.each_char.map { |ch| mapping[ch] || ch }.join
  end


  ## Returns the alternative (ascii-ified) spellings for name
  ##   e.g. Köln => Koln, Koeln   - or an empty array if there are none.
  ##   Duplicates get removed e.g. Preußen is Preussen only once.
  ##   todo/fix: add / pass-in language/country code and check - why? why not?
  def self.find( name )
    freq = frequency_table( name )

    [ALPHA_SPECIALS, ALPHA_SPECIALS_DE]
      .select { |mapping| alpha_specials_count( freq, mapping ) > 0 }    # check if includes äöü etc.
      .map    { |mapping| tr( name, mapping ) }
      .uniq
  end

  ## our very own downcase for int'l characters / letters
  def self.downcase_i18n( name )
    tr( name, ALPHA_DOWNCASE )
  end

end # class Variant
|
176
|
-
|
177
|
-
end ## module Import
|
178
|
-
end ## module SportDb
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
## "global" convenience helper
|
183
|
-
def downcase_i18n( name )   ## forwards to SportDb::Import::Variant.downcase_i18n and returns its result
|
184
|
-
SportDb::Import::Variant.downcase_i18n( name )
|
185
|
-
end # method downcase_i18n ("global" helper)
|
@@ -1,104 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
|
4
|
-
module SportDb
|
5
|
-
module Import
|
6
|
-
|
7
|
-
|
8
|
-
class WikiReader    ## todo/check: rename to WikiClubReader - why? why not?

  ## One parsed record - the (wiki) name of a club and the country it is listed under.
  class WikiClub
    attr_reader :name, :country

    def initialize( name, country )
      @name    = name
      @country = country
    end
  end

  ## heading line - leading ======, text (note: for now no "inline" = allowed)
  ##   and (optional) trailing ====
  ##   note: like in wikimedia markup (and markdown) all optional trailing ==== too
  ##   todo/check: allow  === Text =-=-=-=-=-=   too - why? why not?
  HEADING_RE      = /^(={1,})([^=]+?)=*$/

  ## country code in heading e.g. Austria (at) => at
  ##   note: allow (at) or (AUT) too
  ##   todo/fix: allow code only e.g. at or aut without enclosing () too - why? why not?
  COUNTRY_CODE_RE = /\(([a-z]{2,3})\)/i


  ## Reads the (utf-8) text file at path and returns the parsed records (see parse).
  ##   use - rename to read_file or from_file etc. - why? why not?
  def self.read( path )
    ## note: block form makes sure the file gets closed again
    txt = File.open( path, 'r:utf-8' ) { |f| f.read }
    parse( txt )
  end


  ## Parses txt line-by-line and returns an array of WikiClub records.
  ##
  ##  - blank lines, comment lines (#) and "decorative" headings (=======) get skipped
  ##  - inline (until end-of-line) comments get stripped
  ##  - a level 1 heading sets the country for all club names that follow
  ##  - every other line is a club name
  ##
  ##  Prints an error message and exits (status 1) on a heading deeper than
  ##  level 1, a heading without a (known) country code or
  ##  a club name before the first country heading.
  ##
  ##  note: supports only one level of headings for now (and that is a country)
  def self.parse( txt )
    recs         = []
    last_country = nil

    txt.each_line do |line|
      line = line.strip

      next if line.empty? || line.start_with?( '#' )   ## skip blank lines and comments

      ## strip inline (until end-of-line) comments too
      ##  e.g Eupen => KAS Eupen,    ## [de]
      ##   => Eupen => KAS Eupen,
      line = line.sub( /#.*/, '' ).strip

      next if line =~ /^={1,}$/          ## skip "decorative" only heading e.g. ========

      heading = HEADING_RE.match( line )
      if heading
        ## count number of = for heading level
        last_country = country_for_heading( heading[1].length, heading[2].strip )
      else
        if last_country.nil?
          puts "** !!! ERROR [wiki reader] !!! - country heading missing for club name; sorry - add country heading to fix"
          exit 1
        end

        ## squish (white)spaces
        #   e.g. New York FC      (2011-)  => New York FC (2011-)
        ## and normalize (allow underscore (_) - replace with space)
        ##  e.g. Cercle_Brugge_K.S.V. => Cercle Brugge K.S.V.
        value = line.gsub( /[ \t]+/, ' ' ).gsub( '_', ' ' )

        recs << WikiClub.new( value, last_country )
      end
    end # each_line

    recs
  end # method parse


  ## Returns the country for a heading; exits (status 1) if the heading
  ##   is deeper than level 1 or has no (known) country code.
  ##   quick hack: if level is 1 assume country for now
  def self.country_for_heading( level, heading )
    if level > 1
      puts "** !!! ERROR [wiki reader] !!! - - headings level too deep - only top / one level supported for now; sorry"
      exit 1
    end

    m = COUNTRY_CODE_RE.match( heading )
    if m.nil?
      puts "!!! error - heading level 1 - missing country code - >#{heading}<"
      exit 1
    end

    ## check country code - MUST exist for now!!!!
    country_code = m[1]
    country      = SportDb::Import.config.countries[ country_code ]
    if country.nil?
      puts "** !!! ERROR [wiki reader] !!! - unknown country with code >#{country_code}< - sorry - add country to config to fix"
      exit 1
    end

    country
  end
  private_class_method :country_for_heading

end # class WikiReader
|
102
|
-
|
103
|
-
end ## module Import
|
104
|
-
end ## module SportDb
|
data/test/test_club_index.rb
DELETED
@@ -1,100 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
###
|
4
|
-
# to run use
|
5
|
-
# ruby -I ./lib -I ./test test/test_club_index.rb
|
6
|
-
|
7
|
-
|
8
|
-
require 'helper'
|
9
|
-
|
10
|
-
## Checks name matching / lookup on the built-in club index (SportDb::Import.config.clubs).
##   note: uses the Minitest constant - the old MiniTest spelling is a legacy alias
class TestClubIndex < Minitest::Test

  def test_clubs
    ## print diagnostics first
    pp clubs.errors

    clubs.dump_duplicates

    ## note: matching is expected to ignore case
    ##   note: all dots (.) get always removed
    ##   note: all spaces and dashes (-) get always removed
    ['Rapid Wien',
     'rapid wien',
     '...r.a.p.i.d w.i.e.n...',
     '--- r a p i d w i e n ---',
     'RAPID WIEN'].each do |name|
      m = clubs.match( name )
      assert_club m[0], 'SK Rapid Wien', 'Austria', 'Wien'
    end

    c = clubs[ 'SK Rapid Wien' ]    ## check canonical name match (only)
    assert_club c, 'SK Rapid Wien', 'Austria', 'Wien'


    m = clubs.match( 'Arsenal' )
    assert_equal 3, m.size

    m = clubs.match( 'ARSENAL' )
    assert_equal 3, m.size

    ## narrow down by country
    m = clubs.match_by( name: 'Arsenal', country: 'eng' )
    assert_equal 1, m.size
    assert_club m[0], 'Arsenal FC', 'England', 'London'

    m = clubs.match_by( name: 'Arsenal', country: 'ar' )
    assert_equal 1, m.size
    assert_club m[0], 'Arsenal de Sarandí', 'Argentina', 'Sarandí'

    m = clubs.match_by( name: 'Arsenal', country: 'ru' )
    assert_equal 1, m.size
    assert_club m[0], 'Arsenal Tula', 'Russia', 'Tula'


    ['Arsenal FC',
     'Arsenal F.C.',
     '...A.r.s.e.n.a.l... F.C...'].each do |name|
      m = clubs.match( name )
      assert_equal 2, m.size
    end


    ##############################################
    ## test wikipedia names and links/urls

    m = clubs.match( 'Club Brugge KV' )
    assert_equal 1, m.size
    assert_equal 'Club Brugge KV', m[0].wikipedia
    assert_equal 'https://en.wikipedia.org/wiki/Club_Brugge_KV', m[0].wikipedia_url

    m = clubs.match( 'RSC Anderlecht' )
    assert_equal 1, m.size
    assert_equal 'R.S.C. Anderlecht', m[0].wikipedia
    assert_equal 'https://en.wikipedia.org/wiki/R.S.C._Anderlecht', m[0].wikipedia_url
  end

  private

  ## shortcut for the club index under test
  def clubs
    SportDb::Import.config.clubs
  end

  ## checks name, country name and city of a single club record
  def assert_club( club, name, country, city )
    assert_equal name,    club.name
    assert_equal country, club.country.name
    assert_equal city,    club.city
  end

end # class TestClubIndex
|