sportdb-config 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,147 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module SportDb
4
- module Import
5
-
6
- ##
7
- # note: use our own (internal) club struct for now - why? why not?
8
- # - check that shape/structure/fields/attributes match
9
- # the Team struct in sportdb-text (in SportDb::Struct::Team) !!!!
10
- class Club
11
- ## todo: use just names for alt_names - why? why not?
12
- attr_accessor :name, :alt_names,
13
- :year, :ground, :city
14
-
15
- ## more attribs - todo/fix - also add "upstream" to struct & model!!!!!
16
- attr_accessor :district, :geos, :year_end, :country
17
-
18
- ## special import only attribs
19
- attr_accessor :alt_names_auto ## auto-generated alt names
20
- attr_accessor :wikipedia # wikipedia page name (for english (en))
21
-
22
- def historic?() @year_end ? true : false; end
23
- alias_method :past?, :historic?
24
-
25
-
26
- def wikipedia?() @wikipedia; end
27
- def wikipedia_url
28
- if @wikipedia
29
- ## note: replace spaces with underscore (_)
30
- ## e.g. Club Brugge KV => Club_Brugge_KV
31
- ## todo/check/fix:
32
- ## check if "plain" dash (-) needs to get replaced with typographic dash??
33
- "https://en.wikipedia.org/wiki/#{@wikipedia.gsub(' ','_')}"
34
- else
35
- nil
36
- end
37
- end
38
-
39
-
40
- def initialize
41
- @alt_names = []
42
- @alt_names_auto = []
43
- end
44
-
45
-
46
- ## helper methods for import only
47
- ## check for duplicates
48
- def duplicates?
49
- names = [name] + alt_names + alt_names_auto
50
- names = names.map { |name| normalize( sanitize(name) ) }
51
-
52
- names.size != names.uniq.size
53
- end
54
-
55
- def duplicates
56
- names = [name] + alt_names + alt_names_auto
57
-
58
- ## calculate (count) frequency and select if greater than one
59
- names.reduce( Hash.new ) do |h,name|
60
- norm = normalize( sanitize(name) )
61
- h[norm] ||= []
62
- h[norm] << name; h
63
- end.select { |norm,names| names.size > 1 }
64
- end
65
-
66
- def add_variants( name_or_names )
67
- names = name_or_names.is_a?(Array) ? name_or_names : [name_or_names]
68
- names.each do |name|
69
- name = sanitize( name )
70
- self.alt_names_auto += variants( name )
71
- end
72
- end
73
-
74
- ###################################
75
- # "global" helper - move to ___ ? why? why not?
76
-
77
- YEAR_REGEX = /\([0-9,\- ]+?\)/
78
- def self.strip_year( name )
79
- ## check for year(s) e.g. (1887-1911), (-2013),
80
- ## (1946-2001, 2013-) etc.
81
- name.gsub( YEAR_REGEX, '' ).strip
82
- end
83
-
84
- def self.has_year?( name ) name =~ YEAR_REGEX; end
85
-
86
- LANG_REGEX = /\[[a-z]{1,2}\]/ ## note also allow [a] or [d] or [e] - why? why not?
87
- def self.strip_lang( name )
88
- name.gsub( LANG_REGEX, '' ).strip
89
- end
90
-
91
- def self.has_lang?( name ) name =~ LANG_REGEX; end
92
-
93
- def self.sanitize( name )
94
- ## check for year(s) e.g. (1887-1911), (-2013),
95
- ## (1946-2001,2013-) etc.
96
- name = strip_year( name )
97
- ## check lang codes e.g. [en], [fr], etc.
98
- name = strip_lang( name )
99
- name
100
- end
101
-
102
-
103
- NORM_REGEX = /[.'º\-\/]/
104
- ## note: remove all dots (.), dash (-), ', º, /, etc.
105
- ## for norm(alizing) names
106
- def self.strip_norm( name )
107
- name.gsub( NORM_REGEX, '' )
108
- end
109
-
110
- def self.normalize( name )
111
- # note: do NOT call sanitize here (keep normalize "atomic" for reuse)
112
-
113
- ## remove all dots (.), dash (-), º, /, etc.
114
- name = strip_norm( name )
115
- name = name.gsub( ' ', '' ) # note: also remove all spaces!!!
116
-
117
- ## todo/fix: use our own downcase - why? why not?
118
- name = downcase_i18n( name ) ## do NOT care about upper and lowercase for now
119
- name
120
- end
121
-
122
-
123
- def self.strip_wiki( name ) # todo/check: rename to strip_wikipedia_en - why? why not?
124
- ## note: strip disambiguation qualifier from wikipedia page name if present
125
- ## note: only remove year and foot... for now
126
- ## e.g. FC Wacker Innsbruck (2002) => FC Wacker Innsbruck
127
- ## Willem II (football club) => Willem II
128
- ##
129
- ## e.g. do NOT strip others !! e.g.
130
- ## América Futebol Clube (MG)
131
- ## only add more "special" cases on demand (that is, if we find more)
132
- name = name.gsub( /\([12][^\)]+?\)/, '' ).strip ## starting with a digit 1 or 2 (assuming year)
133
- name = name.gsub( /\(foot[^\)]+?\)/, '' ).strip ## starting with foot (assuming football ...)
134
- name
135
- end
136
-
137
-
138
- private
139
- ## private "shortcut" convenience helpers
140
- def sanitize( name ) self.class.sanitize( name ); end
141
- def normalize( name ) self.class.normalize( name ); end
142
-
143
- def variants( name ) Variant.find( name ); end
144
- end # class Club
145
-
146
- end # module Import
147
- end # module SportDb
@@ -1,212 +0,0 @@
1
- # encoding: utf-8
2
-
3
- module SportDb
4
- module Import
5
-
6
-
7
- class ClubIndex
8
-
9
- def self.build( path )
10
- recs = []
11
- datafiles = Configuration.find_datafiles_clubs( path )
12
- datafiles.each do |datafile|
13
- recs += ClubReader.read( datafile )
14
- end
15
- recs
16
-
17
- clubs = self.new
18
- clubs.add( recs )
19
-
20
- ## add wiki(pedia) anchored links
21
- recs = []
22
- datafiles = Configuration.find_datafiles_clubs_wiki( path )
23
- datafiles.each do |datafile|
24
- recs += WikiReader.read( datafile )
25
- end
26
-
27
- pp recs
28
- clubs.add_wiki( recs )
29
- clubs
30
- end
31
-
32
-
33
-
34
- def initialize
35
- @clubs = {} ## clubs (indexed) by canonical name
36
- @clubs_by_name = {}
37
- @errors = []
38
- end
39
-
40
- attr_reader :errors
41
- def errors?() @errors.empty? == false; end
42
-
43
- def mappings() @clubs_by_name; end ## todo/check: rename to index or something - why? why not?
44
- def clubs() @clubs.values; end
45
-
46
-
47
- ## helpers from club - use a helper module for includes - why? why not?
48
- def strip_year( name ) Club.strip_year( name ); end
49
- def has_year?( name) Club.has_year?( name ); end
50
- def strip_lang( name ) Club.strip_lang( name ); end
51
- def strip_wiki( name ) Club.strip_wiki( name ); end
52
- def normalize( name ) Club.normalize( name ); end
53
-
54
-
55
- def add_wiki( rec_or_recs ) ## add wiki(pedia club record / links
56
- recs = rec_or_recs.is_a?( Array ) ? rec_or_recs : [rec_or_recs] ## wrap (single) rec in array
57
-
58
- recs.each do |rec|
59
- ## note: strip qualifier () from wikipedia page name if present
60
- ## e.g. FC Wacker Innsbruck (2002) => FC Wacker Innsbruck
61
- ## Willem II (football club) => Willem II
62
- ##
63
- ## e.g. do NOT strip others !! e.g.
64
- ## América Futebol Clube (MG)
65
- ## only add more "special" cases on demand (that is, if we find more)
66
- name = strip_wiki( rec.name )
67
-
68
- m = match_by( name: name, country: rec.country )
69
- if m.nil?
70
- puts "** !!! ERROR !!! - no matching club found for wiki(pedia) name >#{name}, #{rec.country.name} (#{rec.country.key})<; sorry - to fix add name to clubs"
71
- exit 1
72
- end
73
- if m.size > 1
74
- puts "** !!! ERROR !!! - too many (greater than one) matching clubs found for wiki(pedia) name >#{name}, #{rec.country.name} (#{rec.country.key})<"
75
- pp m
76
- exit 1
77
- end
78
- club = m[0]
79
- club.wikipedia = rec.name
80
- end
81
- end # method add_wiki
82
-
83
-
84
- def add( rec_or_recs ) ## add club record / alt_names
85
- recs = rec_or_recs.is_a?( Array ) ? rec_or_recs : [rec_or_recs] ## wrap (single) rec in array
86
-
87
- recs.each do |rec|
88
- ## puts "adding:"
89
- ## pp rec
90
- ### step 1) add canonical name
91
- old_rec = @clubs[ rec.name ]
92
- if old_rec
93
- puts "** !!! ERROR !!! - (canonical) name conflict - duplicate - >#{rec.name}< will overwrite >#{old_rec.name}<:"
94
- pp old_rec
95
- pp rec
96
- exit 1
97
- else
98
- @clubs[ rec.name ] = rec
99
- end
100
-
101
- ## step 2) add all names (canonical name + alt names + alt names (auto))
102
- names = [rec.name] + rec.alt_names
103
- more_names = []
104
- ## check "hand-typed" names for year (auto-add)
105
- ## check for year(s) e.g. (1887-1911), (-2013),
106
- ## (1946-2001,2013-) etc.
107
- names.each do |name|
108
- if has_year?( name )
109
- more_names << strip_year( name )
110
- end
111
- end
112
-
113
- names += more_names
114
- ## check for duplicates - simple check for now - fix/improve
115
- ## todo/fix: (auto)remove duplicates - why? why not?
116
- count = names.size
117
- count_uniq = names.uniq.size
118
- if count != count_uniq
119
- puts "** !!! ERROR !!! - #{count-count_uniq} duplicate name(s):"
120
- pp names
121
- pp rec
122
- exit 1
123
- end
124
-
125
- ## check with auto-names just warn for now and do not exit
126
- names += rec.alt_names_auto
127
- count = names.size
128
- count_uniq = names.uniq.size
129
- if count != count_uniq
130
- puts "** !!! WARN !!! - #{count-count_uniq} duplicate name(s):"
131
- pp names
132
- pp rec
133
- end
134
-
135
-
136
- names.each_with_index do |name,i|
137
- ## check lang codes e.g. [en], [fr], etc.
138
- ## todo/check/fix: move strip_lang up in the chain - check for duplicates (e.g. only lang code marker different etc.) - why? why not?
139
- name = strip_lang( name )
140
- norm = normalize( name )
141
- alt_recs = @clubs_by_name[ norm ]
142
- if alt_recs
143
- ## check if include club rec already or is new club rec
144
- if alt_recs.include?( rec )
145
- ## note: do NOT include duplicate club record
146
- msg = "** !!! WARN !!! - (norm) name conflict/duplicate for club - >#{name}< normalized to >#{norm}< already included >#{rec.name}, #{rec.country.name}<"
147
- puts msg
148
- @errors << msg
149
- else
150
- msg = "** !!! WARN !!! - name conflict/duplicate - >#{name}< will overwrite >#{alt_recs[0].name}, #{alt_recs[0].country.name}< with >#{rec.name}, #{rec.country.name}<"
151
- puts msg
152
- @errors << msg
153
- alt_recs << rec
154
- end
155
- else
156
- @clubs_by_name[ norm ] = [rec]
157
- end
158
- end
159
- end
160
- end # method add
161
-
162
-
163
- def []( name ) ## lookup by canoncial name only
164
- @clubs[ name ]
165
- end
166
-
167
- def match( name )
168
- ## todo/check: return empty array if no match!!! and NOT nil (add || []) - why? why not?
169
- name = normalize( name )
170
- @clubs_by_name[ name ]
171
- end
172
-
173
-
174
- def match_by( name:, country: )
175
- ## note: match must for now always include name
176
- m = match( name )
177
- if m ## filter by country
178
- ## note: country assumes / allows the country key or fifa code for now
179
-
180
- ## note: allow passing in of country struct too
181
- country_rec = if country.is_a?( SportDb::Import::Country )
182
- country ## (re)use country struct - no need to run lookup again
183
- else
184
- rec = SportDb::Import.config.countries[ country ]
185
- if rec.nil?
186
- puts "** !!! ERROR !!! - unknown country >#{country}< - no match found, sorry - add to world/countries.txt in config"
187
- exit 1
188
- end
189
- rec
190
- end
191
-
192
- m = m.select { |club| club.country.key == country_rec.key }
193
- m = nil if m.empty? ## note: reset to nil if no more matches
194
- end
195
- m
196
- end
197
-
198
-
199
-
200
- def dump_duplicates # debug helper - report duplicate club name records
201
- @clubs_by_name.each do |name, clubs|
202
- if clubs.size > 1
203
- puts "#{clubs.size} matching club duplicates for >#{name}<:"
204
- pp clubs
205
- end
206
- end
207
- end
208
- end # class ClubIndex
209
-
210
-
211
- end # module Import
212
- end # module SportDb
@@ -1,278 +0,0 @@
1
- # encoding: utf-8
2
-
3
-
4
- module SportDb
5
- module Import
6
-
7
-
8
- class ClubReader
9
-
10
-
11
- def self.read( path ) ## use - rename to read_file or from_file etc. - why? why not?
12
- txt = File.open( path, 'r:utf-8' ).read
13
- parse( txt )
14
- end
15
-
16
-
17
- def self.parse( txt )
18
- recs = []
19
- last_rec = nil
20
- headings = [] ## headings stack
21
-
22
- txt.each_line do |line|
23
- line = line.strip
24
-
25
- next if line.empty?
26
- next if line.start_with?( '#' ) ## skip comments too
27
-
28
- ## strip inline (until end-of-line) comments too
29
- ## e.g Eupen => KAS Eupen, ## [de]
30
- ## => Eupen => KAS Eupen,
31
- line = line.sub( /#.*/, '' ).strip
32
- pp line
33
-
34
-
35
- next if line =~ /^={1,}$/ ## skip "decorative" only heading e.g. ========
36
-
37
- ## note: like in wikimedia markup (and markdown) all optional trailing ==== too
38
- ## todo/check: allow === Text =-=-=-=-=-= too - why? why not?
39
- if line =~ /^(={1,}) ## leading ======
40
- ([^=]+?) ## text (note: for now no "inline" = allowed)
41
- =* ## (optional) trailing ====
42
- $/x
43
- heading_marker = $1
44
- heading_level = $1.length ## count number of = for heading level
45
- heading = $2.strip
46
-
47
- puts "heading #{heading_level} >#{heading}<"
48
-
49
- ## 1) first pop headings if present
50
- while headings.size+1 > heading_level
51
- headings.pop
52
- end
53
-
54
- ## 2) add missing (hierarchy) level(s) if the heading skips one or more levels
55
- while headings.size+1 < heading_level
56
- ## todo/fix: issue warning about "skipping" hierarchy level
57
- puts "!!! warn [team reader] - skipping hierarchy level in headings "
58
- headings.push( nil )
59
- end
60
-
61
- if heading =~ /^\?+$/ ## note: use ? or ?? or ?? to reset level to nil
62
- ## keep level empty
63
- else
64
-
65
- ## quick hack: if level is 1 assume country for now
66
- ## and extract country code e.g.
67
- ## Austria (at) => at
68
- ## todo/fix: allow code only e.g. at or aut without enclosing () too - why? why not?
69
- if heading_level == 1
70
- if heading =~ /\(([a-z]{2,3})\)/i ## note allow (at) or (AUT) too
71
- country_code = $1
72
-
73
- ## check country code - MUST exist for now!!!!
74
- country = SportDb::Import.config.countries[ country_code ]
75
- if country.nil?
76
- puts "!!! error [team reader] - unknown country with code >#{country_code}< - sorry - add country to config to fix"
77
- exit 1
78
- end
79
-
80
- headings.push( country.key )
81
- else
82
- puts "!!! error - heading level 1 - missing country code - >#{heading}<"
83
- exit 1
84
- end
85
- else
86
- ## quick hack:
87
- ## remove known fill/dummy words incl:
88
- ## Provincia San Juan => San Juan (see argentina, for example)
89
- ##
90
- ## use geo tree long term with alternative names - why? why not?
91
- words = ['Provincia']
92
- words.each { |word| heading = heading.gsub( word, '' ) }
93
- heading = heading.strip
94
-
95
- headings.push( heading )
96
- end
97
-
98
- ## assert that hierarchy level is ok
99
- if headings.size != heading_level
100
- puts "!!! error - headings hierarchy/stack out of order - #{heading.size}<=>#{heading_level}"
101
- exit 1
102
- end
103
- end
104
-
105
- pp headings
106
-
107
- elsif line.start_with?( '|' )
108
- ## assume continuation with line of alternative names
109
- ## note: skip leading pipe
110
- values = line[1..-1].split( '|' ) # team names - allow/use pipe(|)
111
- ## strip and squish (white)spaces
112
- # e.g. New York FC (2011-) => New York FC (2011-)
113
- values = values.map { |value| value.strip.gsub( /[ \t]+/, ' ' ) }
114
- last_rec.alt_names += values
115
- last_rec.add_variants( values ) # auto-add (possible) auto-generated variant names
116
-
117
- ## check for duplicates
118
- if last_rec.duplicates?
119
- duplicates = last_rec.duplicates
120
- puts "*** !!! WARN !!! - #{duplicates.size} duplicate alt name mapping(s):"
121
- pp duplicates
122
- pp last_rec
123
- ##
124
- ## todo/fix: make it only an error with exit 1
125
- ## if (not normalized) names are the same (not unique/uniq)
126
- ## e.g. don't exit on A.F.C. == AFC etc.
127
- ## exit 1
128
- end
129
- else
130
- values = line.split( ',' )
131
-
132
- rec = Club.new
133
- value = values.shift ## get first item
134
- ## strip and squish (white)spaces
135
- # e.g. New York FC (2011-) => New York FC (2011-)
136
- value = value.strip.gsub( /[ \t]+/, ' ' )
137
- rec.name = value # canoncial name (global unique "beautiful/long" name)
138
- rec.add_variants( value ) # auto-add (possible) auto-generated variant names
139
-
140
- ## note:
141
- ## check/todo!!!!!!!!!!!!!!!!!-
142
- ## strip year if open-ended (up to the present) e.g. (2011-)
143
- ##
144
- ## do NOT strip for defunct / historic clubs e.g.
145
- ## (1899-1910)
146
- ## or (-1914) or (-2011) etc.
147
-
148
- ###
149
- ## todo: move year out of canonical team name - why? why not?
150
-
151
- ## check if canonical name include (2011-) or similar in name
152
- ## if yes, remove (2011-) and add to (alt) names
153
- ## e.g. New York FC (2011-) => New York FC
154
- if rec.name =~ /\(.+?\)/ ## note: use non-greedy (?) match
155
- name = rec.name.gsub( /\(.+?\)/, '' ).strip
156
-
157
- if rec.name =~ /\(([0-9]{4})-\)/ ## e.g. (2014-)
158
- rec.year = $1.to_i
159
- elsif rec.name =~ /\(-([0-9]{4})\)/ ## e.g. (-2014)
160
- rec.year_end = $1.to_i
161
- elsif rec.name =~ /\(([0-9]{4})-([0-9]{4})\)/ ## e.g. (2011-2014)
162
- rec.year = $1.to_i
163
- rec.year_end = $2.to_i
164
- else
165
- ## todo/check: warn about unknown year format
166
- end
167
- end
168
-
169
- ## todo/check - check for unknown format values
170
- ## e.g. too many values, duplicate years, etc.
171
- ## check for overwriting, etc.
172
- while values.size > 0
173
- value = values.shift
174
- ## strip and squish (white)spaces
175
- # e.g. León › Guanajuato => León › Guanajuato
176
- value = value.strip.gsub( /[ \t]+/, ' ' )
177
- if value =~/^\d{4}$/ # e.g 1904
178
- ## todo/check: issue warning if year is already set!!!!!!!
179
- if rec.year
180
- puts "!!! error - year already set to #{rec.year} - CANNOT overwrite with #{value}:"
181
- pp rec
182
- exit 1
183
- end
184
- rec.year = value.to_i
185
- elsif value.start_with?( '@' ) # e.g. @ Anfield
186
- ## cut-off leading @ and spaces
187
- rec.ground = value[1..-1].strip
188
- else
189
- ## assume city / geo tree
190
- ## split into geo tree
191
- geos = split_geo( value )
192
- city = geos[0]
193
- ## check for "embedded" district e.g. London (Fulham) or Hamburg (St. Pauli) etc.
194
- if city =~ /\((.+?)\)/ ## note: use non-greedy (?) match
195
- rec.district = $1.strip
196
- city = city.gsub( /\(.+?\)/, '' ).strip
197
- end
198
- rec.city = city
199
-
200
- if geos.size > 1
201
- ## cut-off city and keep the rest (of geo tree)
202
- rec.geos = geos[1..-1]
203
- end
204
- end
205
- end ## while values
206
-
207
-
208
- ###############
209
- ## use headings text for geo tree
210
-
211
- ## 1) add country if present
212
- if headings.size > 0 && headings[0]
213
- country = SportDb::Import.config.countries[ headings[0] ]
214
- rec.country = country
215
- else
216
- ## make it an error - why? why not?
217
- puts "!!! error - country missing in headings hierarchy - sorry - add to quicklist"
218
- exit 1
219
- end
220
-
221
- ## 2) check geo tree with headings hierarchy
222
- if headings.size > 1 && headings[1]
223
- geos = split_geo( headings[1] )
224
- if rec.geos
225
- if rec.geos[0] != geos[0]
226
- puts "!!! error - geo tree - headings mismatch >#{rec.geos[0]}< <=> >#{geos[0]}<"
227
- exit 1
228
- end
229
- if rec.geos[1] && rec.geos[1] != geos[1] ## check optional 2nd level too
230
- puts "!!! error - geo tree - headings mismatch >#{rec.geos[1]}< <=> >#{geos[1]}<"
231
- exit 1
232
- end
233
- else
234
- ## add missing region (state/province) from headings hierarchy
235
- rec.geos = geos
236
- end
237
- end
238
-
239
- last_rec = rec
240
-
241
-
242
- ### todo/fix:
243
- ## auto-add alt name with dots stripped - why? why not?
244
- ## e.g. D.C. United => DC United
245
- ## e.g. Liverpool F.C. => Liverpool FC
246
- ## e.g. St. Albin => St Albin etc.
247
- ## e.g. 1. FC Köln => 1 FC Köln -- make special case for 1. - why? why not?
248
-
249
- ##
250
- ## todo/fix: unify mapping entries
251
- ## always lowercase !!!! (case insensitive)
252
- ## always strip (2011-) !!!
253
- ## always strip dots (e.g. St., F.C, etc.)
254
-
255
- recs << rec
256
- end
257
- end # each_line
258
- recs
259
- end # method read
260
-
261
- ### helpers
262
- def self.split_geo( str )
263
- ## assume city / geo tree
264
- ## strip and squish (white)spaces
265
- # e.g. León › Guanajuato => León › Guanajuato
266
- str = str.strip.gsub( /[ \t]+/, ' ' )
267
-
268
- ## split into geo tree
269
- geos = str.split( /[<>‹›]/ ) ## note: allow > < or › ‹
270
- geos = geos.map { |geo| geo.strip } ## remove all whitespaces
271
- geos
272
- end
273
-
274
- end # class ClubReader
275
-
276
-
277
- end ## module Import
278
- end ## module SportDb