sportdb-config 0.6.0 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,147 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module SportDb
4
- module Import
5
-
6
- ##
7
- # note: use our own (internal) club struct for now - why? why not?
8
- # - check that shape/structure/fields/attributes match
9
- # the Team struct in sportdb-text (in SportDb::Struct::Team) !!!!
10
class Club
  ## Import-only club record: canonical name, alternative names,
  ##  year(s), ground and geo attributes - plus the name clean-up
  ##  helpers (strip_year, strip_lang, sanitize, normalize, strip_wiki).

  ## check for year(s) e.g. (1887-1911), (-2013), (1946-2001, 2013-) etc.
  YEAR_REGEX = /\([0-9,\- ]+?\)/
  ## lang codes e.g. [en], [fr] - note also allow [a] or [d] or [e] - why? why not?
  LANG_REGEX = /\[[a-z]{1,2}\]/
  ## chars removed for norm(alizing) names: dots (.), dash (-), ', º, /
  NORM_REGEX = /[.'º\-\/]/

  ## todo: use just names for alt_names - why? why not?
  attr_accessor :name, :alt_names,
                :year, :ground, :city

  ## more attribs - todo/fix - also add "upstream" to struct & model!!!!!
  attr_accessor :district, :geos, :year_end, :country

  ## special import only attribs
  attr_accessor :alt_names_auto   ## auto-generated alt names
  attr_accessor :wikipedia        # wikipedia page name (for english (en))

  ## start with empty (hand-typed and auto-generated) alt name lists;
  ##  all other attributes stay nil until assigned
  def initialize
    @alt_names      = []
    @alt_names_auto = []
  end

  ## true if an end year is set (that is, the club no longer exists)
  def historic?() @year_end ? true : false; end
  alias_method :past?, :historic?

  ## true if a wikipedia page name is set
  ##  note: returns a strict boolean (like historic?) - use wikipedia for the page name
  def wikipedia?() @wikipedia ? true : false; end

  ## returns the (english) wikipedia page url or nil if no page name is set
  def wikipedia_url
    return nil unless @wikipedia

    ## note: replace spaces with underscore (_)
    ##   e.g. Club Brugge KV => Club_Brugge_KV
    ##  todo/check/fix:
    ##    check if "plain" dash (-) needs to get replaced with typographic dash??
    "https://en.wikipedia.org/wiki/#{@wikipedia.tr( ' ', '_' )}"
  end


  ## helper methods for import only

  ## check for duplicates - true if two (or more) names are the same
  ##  after sanitize and normalize
  def duplicates?
    norms = all_names.map { |a_name| normalize( sanitize( a_name ) ) }
    norms.size != norms.uniq.size
  end

  ## returns a hash of normalized name => [names] for all normalized names
  ##  shared by more than one name (empty hash if there are no duplicates)
  def duplicates
    all_names.group_by { |a_name| normalize( sanitize( a_name ) ) }
             .select { |_norm, same_names| same_names.size > 1 }
  end

  ## auto-add the variants reported by Variant.find for one name or
  ##  an array of names; every name gets sanitized first
  def add_variants( name_or_names )
    names = name_or_names.is_a?( Array ) ? name_or_names : [name_or_names]
    names.each do |a_name|
      self.alt_names_auto += variants( sanitize( a_name ) )
    end
  end


  ###################################
  # "global" helper - move to ___ ? why? why not?

  ## remove year(s) e.g. (1887-1911), (-2013), (1946-2001, 2013-) etc.
  def self.strip_year( name )
    name.gsub( YEAR_REGEX, '' ).strip
  end

  ## returns the match position (truthy) if the name includes year(s) or nil
  def self.has_year?( name ) name =~ YEAR_REGEX; end

  ## remove lang codes e.g. [en], [fr], etc.
  def self.strip_lang( name )
    name.gsub( LANG_REGEX, '' ).strip
  end

  ## returns the match position (truthy) if the name includes a lang code or nil
  def self.has_lang?( name ) name =~ LANG_REGEX; end

  ## remove year(s) first and lang codes second
  def self.sanitize( name )
    strip_lang( strip_year( name ) )
  end

  ## remove all dots (.), dash (-), ', º, /, etc.
  def self.strip_norm( name )
    name.gsub( NORM_REGEX, '' )
  end

  ## norm(alize) a name for lookups / comparison
  ##  note: do NOT call sanitize here (keep normalize "atomic" for reuse)
  def self.normalize( name )
    name = strip_norm( name )
    name = name.gsub( ' ', '' )    # note: also remove all spaces!!!

    ## todo/fix: use our own downcase - why? why not?
    ##  note: downcase_i18n is defined outside of this class
    downcase_i18n( name )          ## do NOT care about upper and lowercase for now
  end

  ## strip disambiguation qualifier from wikipedia page name if present
  ##  note: only remove year and foot... for now
  ##    e.g. FC Wacker Innsbruck (2002) => FC Wacker Innsbruck
  ##         Willem II (football club)  => Willem II
  ##
  ##  e.g. do NOT strip others !! e.g.
  ##         América Futebol Clube (MG)
  ##  only add more "special" cases on demand (that, is) if we find more
  def self.strip_wiki( name )  # todo/check: rename to strip_wikipedia_en - why? why not?
    name = name.gsub( /\([12][^\)]+?\)/, '' ).strip   ## starting with a digit 1 or 2 (assuming year)
    name.gsub( /\(foot[^\)]+?\)/, '' ).strip          ## starting with foot (assuming football ...)
  end


  private
  ## private "shortcut" convenience helpers
  def sanitize( name )  self.class.sanitize( name ); end
  def normalize( name ) self.class.normalize( name ); end

  def variants( name )  Variant.find( name ); end

  ## canonical name + alt names + alt names (auto)
  ##  note: skip missing (nil) names e.g. canonical name not yet assigned
  def all_names
    ([name] + alt_names + alt_names_auto).compact
  end
end # class Club
145
-
146
- end # module Import
147
- end # module SportDb
@@ -1,212 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module SportDb
4
- module Import
5
-
6
-
7
class ClubIndex
  ## Index of club records - by canonical name and
  ##  by norm(alized) name (canonical name + alt names + alt names (auto)).

  ## read all club datafiles found for path and add the records;
  ##  then read all wiki(pedia) datafiles and add the wiki(pedia) page names
  def self.build( path )
    club_recs = Configuration.find_datafiles_clubs( path ).inject( [] ) do |all, datafile|
      all + ClubReader.read( datafile )
    end

    index = new
    index.add( club_recs )

    ## add wiki(pedia) anchored links
    wiki_recs = Configuration.find_datafiles_clubs_wiki( path ).inject( [] ) do |all, datafile|
      all + WikiReader.read( datafile )
    end

    pp wiki_recs
    index.add_wiki( wiki_recs )
    index
  end


  def initialize
    @clubs         = {}   ## clubs (indexed) by canonical name
    @clubs_by_name = {}   ## arrays of clubs (indexed) by norm(alized) name
    @errors        = []   ## collected name conflict warning messages
  end

  attr_reader :errors
  def errors?() !@errors.empty?; end

  def mappings() @clubs_by_name; end   ## todo/check: rename to index or something - why? why not?
  def clubs()    @clubs.values;  end


  ## helpers from club - use a helper module for includes - why? why not?
  def strip_year( name ) Club.strip_year( name ); end
  def has_year?( name )  Club.has_year?( name );  end
  def strip_lang( name ) Club.strip_lang( name ); end
  def strip_wiki( name ) Club.strip_wiki( name ); end
  def normalize( name )  Club.normalize( name );  end


  ## add wiki(pedia) club record(s) / links
  ##  note: every record must match exactly one club (by name and country);
  ##   otherwise report an error and exit
  def add_wiki( rec_or_recs )
    wiki_recs = if rec_or_recs.is_a?( Array )
                  rec_or_recs
                else
                  [rec_or_recs]    ## wrap (single) rec in array
                end

    wiki_recs.each do |rec|
      ## note: strip qualifier () from wikipedia page name if present
      ##  (see Club.strip_wiki for what gets stripped)
      name    = strip_wiki( rec.name )
      matches = match_by( name: name, country: rec.country )

      if matches.nil?
        puts "** !!! ERROR !!! - no matching club found for wiki(pedia) name >#{name}, #{rec.country.name} (#{rec.country.key})<; sorry - to fix add name to clubs"
        exit 1
      end
      if matches.size > 1
        puts "** !!! ERROR !!! - too many (greater than one) matching clubs found for wiki(pedia) name >#{name}, #{rec.country.name} (#{rec.country.key})<"
        pp matches
        exit 1
      end

      matches[0].wikipedia = rec.name
    end
  end # method add_wiki


  ## add club record(s) incl. all (alt) names
  def add( rec_or_recs )
    club_recs = if rec_or_recs.is_a?( Array )
                  rec_or_recs
                else
                  [rec_or_recs]    ## wrap (single) rec in array
                end

    club_recs.each do |rec|
      add_canonical( rec )                                       ## step 1
      names_for( rec ).each { |name| add_name( rec, name ) }     ## step 2
    end
  end # method add


  def []( name )    ## lookup by canonical name only
    @clubs[ name ]
  end

  ## lookup by norm(alized) name; returns an array of clubs or nil if no match
  def match( name )
    ## todo/check: return empty array if no match!!! and NOT nil (add || []) - why? why not?
    @clubs_by_name[ normalize( name ) ]
  end

  ## lookup by name and filter by country; returns an array of clubs or nil if no match
  def match_by( name:, country: )
    ## note: match must for now always include name
    found = match( name )
    return found unless found

    ## filter by country
    ##  note: country assumes / allows the country key or fifa code for now
    ##  note: allow passing in of country struct too
    country_rec = if country.is_a?( SportDb::Import::Country )
                    country    ## (re)use country struct - no need to run lookup again
                  else
                    lookup = SportDb::Import.config.countries[ country ]
                    if lookup.nil?
                      puts "** !!! ERROR !!! - unknown country >#{country}< - no match found, sorry - add to world/countries.txt in config"
                      exit 1
                    end
                    lookup
                  end

    found = found.select { |club| club.country.key == country_rec.key }
    found.empty? ? nil : found    ## note: reset to nil if no more matches
  end


  def dump_duplicates   # debug helper - report duplicate club name records
    @clubs_by_name.each do |name, clubs|
      next unless clubs.size > 1

      puts "#{clubs.size} matching club duplicates for >#{name}<:"
      pp clubs
    end
  end


  private

  ## step 1) add canonical name - report an error and exit if already taken
  def add_canonical( rec )
    old_rec = @clubs[ rec.name ]
    unless old_rec
      @clubs[ rec.name ] = rec
      return
    end

    puts "** !!! ERROR !!! - (canonical) name conflict - duplicate - >#{rec.name}< will overwrite >#{old_rec.name}<:"
    pp old_rec
    pp rec
    exit 1
  end

  ## step 2) collect all names (canonical name + alt names + alt names (auto))
  def names_for( rec )
    names = [rec.name] + rec.alt_names

    ## check "hand-typed" names for year (auto-add)
    ##   check for year(s) e.g. (1887-1911), (-2013),
    ##                          (1946-2001,2013-) etc.
    names += names.select { |name| has_year?( name ) }
                  .map    { |name| strip_year( name ) }

    ## check for duplicates - simple check for now - fix/improve
    ## todo/fix: (auto)remove duplicates - why? why not?
    extra = names.size - names.uniq.size
    if extra != 0
      puts "** !!! ERROR !!! - #{extra} duplicate name(s):"
      pp names
      pp rec
      exit 1
    end

    ## check with auto-names just warn for now and do not exit
    names += rec.alt_names_auto
    extra = names.size - names.uniq.size
    if extra != 0
      puts "** !!! WARN !!! - #{extra} duplicate name(s):"
      pp names
      pp rec
    end

    names
  end

  ## add club record under the norm(alized) name; record (and print) a warning
  ##  if the norm(alized) name is already in use
  def add_name( rec, name )
    ## check lang codes e.g. [en], [fr], etc.
    ##  todo/check/fix: move strip_lang up in the chain - check for duplicates (e.g. only lang code marker different etc.) - why? why not?
    name = strip_lang( name )
    norm = normalize( name )

    alt_recs = @clubs_by_name[ norm ]
    unless alt_recs
      @clubs_by_name[ norm ] = [rec]
      return
    end

    ## check if include club rec already or is new club rec
    if alt_recs.include?( rec )
      ## note: do NOT include duplicate club record
      msg = "** !!! WARN !!! - (norm) name conflict/duplicate for club - >#{name}< normalized to >#{norm}< already included >#{rec.name}, #{rec.country.name}<"
      puts msg
      @errors << msg
    else
      first = alt_recs[0]
      msg = "** !!! WARN !!! - name conflict/duplicate - >#{name}< will overwrite >#{first.name}, #{first.country.name}< with >#{rec.name}, #{rec.country.name}<"
      puts msg
      @errors << msg
      alt_recs << rec
    end
  end
end # class ClubIndex
209
-
210
-
211
- end # module Import
212
- end # module SportDb
@@ -1,278 +0,0 @@
1
- # encoding: utf-8
2
-
3
-
4
- module SportDb
5
- module Import
6
-
7
-
8
class ClubReader
  ## Reads club records from a text (file) with
  ##   = Heading =     lines for the country (level 1) and geo tree (level 2+),
  ##   Name, ...       lines for a club (name, year, @ ground, city / geo tree) and
  ##   | Name | Name   lines for the alt(ernative) names of the club line before.

  ## note: like in wikimedia markup (and markdown) all optional trailing ==== too
  ##  todo/check: allow  === Text =-=-=-=-=-=   too - why? why not?
  HEADING_REGEX = /^(={1,})   ## leading ======
                   ([^=]+?)   ## text - note: for now no inline = allowed
                   =*         ## optional trailing ====
                   $/x

  ## quick hack:
  ##   remove known fill/dummy words incl:
  ##     Provincia San Juan => San Juan   (see argentina, for example)
  ##  use geo tree long term with alternative names - why? why not?
  FILL_WORDS = ['Provincia'].freeze


  ## read the (utf-8) text file at path and parse it; returns an array of club records
  def self.read( path )   ## use - rename to read_file or from_file etc. - why? why not?
    ## note: use the block form to make sure the file gets closed again
    txt = File.open( path, 'r:utf-8' ) { |file| file.read }
    parse( txt )
  end


  ## parse text line-by-line; returns an array of club records
  ##  note: prints debug output (lines, headings) along the way and
  ##        exits (with 1) on format errors
  def self.parse( txt )
    recs     = []
    last_rec = nil
    headings = []     ## headings stack

    txt.each_line do |line|
      line = line.strip

      next if line.empty?
      next if line.start_with?( '#' )   ## skip comments too

      ## strip inline (until end-of-line) comments too
      ##  e.g Eupen => KAS Eupen, ## [de]
      ##   => Eupen => KAS Eupen,
      line = line.sub( /#.*/, '' ).strip
      pp line

      next if line =~ /^={1,}$/          ## skip "decorative" only heading e.g. ========

      heading = HEADING_REGEX.match( line )
      if heading
        ## note: count number of = for heading level
        push_heading( headings, heading[1].length, heading[2].strip )
      elsif line.start_with?( '|' )
        ## assume continuation with line of alternative names
        add_alt_names( last_rec, line )
      else
        last_rec = parse_club( line, headings )
        recs << last_rec
      end
    end # each_line
    recs
  end # method parse


  ### helpers

  ## split a city / geo tree string into its (stripped) parts
  ##   e.g. León › Guanajuato => ['León', 'Guanajuato']
  def self.split_geo( str )
    squish( str ).split( /[<>‹›]/ )      ## note: allow > < or › ‹
                 .map { |geo| geo.strip }  ## remove all (leading and trailing) whitespaces
  end

  ## strip and squish (white)spaces
  ##   e.g. New York FC      (2011-)  => New York FC (2011-)
  def self.squish( str )
    str.strip.gsub( /[ \t]+/, ' ' )
  end

  ## update the headings stack (in-place) for a heading of the level passed in
  def self.push_heading( headings, level, heading )
    puts "heading #{level} >#{heading}<"

    ## 1) first pop headings if present
    headings.pop  while headings.size+1 > level

    ## 2) add missing (hierarchy) level if
    while headings.size+1 < level
      ## todo/fix: issue warning about "skipping" hierarchy level
      puts "!!! warn [team reader] - skipping hierarchy level in headings "
      headings.push( nil )
    end

    ## note: use ? or ?? or ??? to reset level to nil (that is, keep level empty)
    unless heading =~ /^\?+$/
      if level == 1
        headings.push( country_key_for( heading ) )
      else
        FILL_WORDS.each { |word| heading = heading.gsub( word, '' ) }
        headings.push( heading.strip )
      end

      ## assert that hierarchy level is ok
      ##  note: report the size of the headings stack (and not of the heading text)
      if headings.size != level
        puts "!!! error - headings hierarchy/stack out of order - #{headings.size}<=>#{level}"
        exit 1
      end
    end

    pp headings
  end

  ## quick hack: if level is 1 assume country for now
  ##  and extract country code e.g.
  ##    Austria (at)  => at
  ##  returns the key of the country found in the config; exits if missing or unknown
  ## todo/fix: allow code only e.g. at or aut without enclosing () too - why? why not?
  def self.country_key_for( heading )
    unless heading =~ /\(([a-z]{2,3})\)/i    ## note allow (at) or (AUT) too
      puts "!!! error - heading level 1 - missing country code - >#{heading}<"
      exit 1
    end
    country_code = Regexp.last_match( 1 )

    ## check country code - MUST exist for now!!!!
    country = SportDb::Import.config.countries[ country_code ]
    if country.nil?
      puts "!!! error [team reader] - unknown country with code >#{country_code}< - sorry - add country to config to fix"
      exit 1
    end
    country.key
  end

  ## add the alt(ernative) names of a | Name | Name line to the club record
  def self.add_alt_names( rec, line )
    if rec.nil?
      ## note: alt names always belong to the club line before
      puts "!!! error [team reader] - alternative names line without club line before - >#{line}<"
      exit 1
    end

    ## note: skip leading pipe
    values = line[1..-1].split( '|' ).map { |value| squish( value ) }   # team names - allow/use pipe(|)
    rec.alt_names += values
    rec.add_variants( values )   # auto-add (possible) auto-generated variant names

    ## check for duplicates
    if rec.duplicates?
      duplicates = rec.duplicates
      puts "*** !!! WARN !!! - #{duplicates.size} duplicate alt name mapping(s):"
      pp duplicates
      pp rec
      ##
      ## todo/fix: make it only an error with exit 1
      ##   if (not normalized) names are the same (not unique/uniq)
      ##   e.g. don't exit on A.F.C. == AFC etc.
      ## exit 1
    end
  end

  ## build a club record from a comma-separated club line and the headings stack
  def self.parse_club( line, headings )
    values = line.split( ',' )

    rec = Club.new
    rec.name = squish( values.shift )   # canoncial name (global unique "beautiful/long" name)
    rec.add_variants( rec.name )        # auto-add (possible) auto-generated variant names

    assign_years( rec )
    ## todo/check - check for unknown format values
    ##   e.g. too many values, duplicate years, etc.
    ##   check for overwritting, etc.
    values.each { |value| assign_value( rec, squish( value ) ) }
    assign_headings( rec, headings )
    rec
  end

  ## set year and / or year_end if the canonical name includes year(s)
  ##  note: the year(s) stay in the canonical name
  ##  todo: move year out of canonical team name - why? why not?
  def self.assign_years( rec )
    case rec.name
    when /\(([0-9]{4})-\)/               ## e.g. (2014-)
      rec.year     = Regexp.last_match( 1 ).to_i
    when /\(-([0-9]{4})\)/               ## e.g. (-2014)
      rec.year_end = Regexp.last_match( 1 ).to_i
    when /\(([0-9]{4})-([0-9]{4})\)/     ## e.g. (2011-2014)
      rec.year     = Regexp.last_match( 1 ).to_i
      rec.year_end = Regexp.last_match( 2 ).to_i
    else
      ## todo/check: warn about unknown year format
    end
  end

  ## set year, ground or city (plus district and geo tree) from a single value
  def self.assign_value( rec, value )
    if value =~ /^\d{4}$/                 # e.g 1904
      if rec.year
        puts "!!! error - year already set to #{rec.year} - CANNOT overwrite with #{value}:"
        pp rec
        exit 1
      end
      rec.year = value.to_i
    elsif value.start_with?( '@' )        # e.g. @ Anfield
      ## cut-off leading @ and spaces
      rec.ground = value[1..-1].strip
    else
      ## assume city / geo tree
      geos = split_geo( value )
      city = geos[0]
      ## check for "embedded" district e.g. London (Fulham) or Hamburg (St. Pauli) etc.
      if city =~ /\((.+?)\)/              ## note: use non-greedy (?) match
        rec.district = Regexp.last_match( 1 ).strip
        city = city.gsub( /\(.+?\)/, '' ).strip
      end
      rec.city = city

      ## cut-off city and keep the rest (of geo tree)
      rec.geos = geos[1..-1]  if geos.size > 1
    end
  end

  ## use headings text for country and geo tree
  def self.assign_headings( rec, headings )
    ## 1) add country if present
    if headings[0]
      rec.country = SportDb::Import.config.countries[ headings[0] ]
    else
      ## make it an error - why? why not?
      puts "!!! error - country missing in headings hierarchy - sorry - add to quicklist"
      exit 1
    end

    ## 2) check geo tree with headings hierarchy
    return unless headings[1]

    geos = split_geo( headings[1] )
    if rec.geos
      if rec.geos[0] != geos[0]
        puts "!!! error - geo tree - headings mismatch >#{rec.geos[0]}< <=> >#{geos[0]}<"
        exit 1
      end
      if rec.geos[1] && rec.geos[1] != geos[1]   ## check optional 2nd level too
        puts "!!! error - geo tree - headings mismatch >#{rec.geos[1]}< <=> >#{geos[1]}<"
        exit 1
      end
    else
      ## add missing region (state/province) from headings hierarchy
      rec.geos = geos
    end
  end

  private_class_method :squish, :push_heading, :country_key_for,
                       :add_alt_names, :parse_club,
                       :assign_years, :assign_value, :assign_headings
end # class ClubReader
275
-
276
-
277
- end ## module Import
278
- end ## module SportDb