sportdb-formats 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,336 @@
1
+ # encoding: utf-8
2
+
3
+ module SportDb
4
+ module Import
5
+
6
+
7
+ class ClubIndex
8
+
9
+ def self.build( path ) ## build a new club index from all club entries (plus wiki links) of the package at path
10
+ pack = Package.new( path ) ## lets us use directory or zip archive
11
+
12
+ recs = []
13
+ pack.each_clubs do |entry|
14
+ recs += Club.parse( entry.read )
15
+ end
16
+ recs ## note: no-op - the value of this expression statement gets discarded
17
+
18
+ clubs = new
19
+ clubs.add( recs )
20
+
21
+ ## add wiki(pedia) anchored links
22
+ recs = []
23
+ pack.each_clubs_wiki do |entry|
24
+ recs += WikiReader.parse( entry.read )
25
+ end
26
+
27
+ pp recs ## note: pretty prints all parsed wiki records to stdout
28
+ clubs.add_wiki( recs )
29
+ clubs ## return the populated index
30
+ end
31
+
32
+
33
+ def catalog() Import.catalog; end ## shortcut - delegates to the module-level Import.catalog
34
+
35
+ def initialize ## start with an empty index
36
+ @clubs = {} ## clubs (indexed) by canonical name
37
+ @clubs_by_name = {} ## arrays of clubs (indexed) by normalized name - filled in add
38
+ @errors = [] ## name conflict / duplicate messages collected in add
39
+ end
40
+
41
+ attr_reader :errors ## messages collected in add (array of strings)
42
+ def errors?() @errors.empty? == false; end ## true if at least one message got collected
43
+
44
+ def mappings() @clubs_by_name; end ## todo/check: rename to index or something - why? why not?
45
+ def clubs() @clubs.values; end ## all club records (values of the canonical names hash)
46
+ alias_method :all, :clubs ## use ActiveRecord-like alias for clubs
47
+
48
+
49
+ ## helpers from club - use a helper module for includes - why? why not?
50
+ include NameHelper
51
+ ## incl. strip_year( name )
52
+ ## has_year?( name)
53
+ ## strip_lang( name )
54
+ ## normalize( name )
55
+
56
+ def strip_wiki( name ) # todo/check: rename to strip_wikipedia_en - why? why not?
57
+ ## note: strip disambiguation qualifier from wikipedia page name if present
58
+ ## note: only remove year and foot... for now
59
+ ## e.g. FC Wacker Innsbruck (2002) => FC Wacker Innsbruck
60
+ ## Willem II (football club) => Willem II
61
+ ##
62
+ ## e.g. do NOT strip others !! e.g.
63
+ ## América Futebol Clube (MG)
64
+ ## only add more "special" cases on demand (that is) if we find more
65
+ name = name.gsub( /\([12][^\)]+?\)/, '' ).strip ## starting with a digit 1 or 2 (assuming year)
66
+ name = name.gsub( /\(foot[^\)]+?\)/, '' ).strip ## starting with foot (assuming football ...)
67
+ name ## note: returns a new string - the passed in name is not changed in place
68
+ end
69
+
70
+ def add_wiki( rec_or_recs ) ## add wiki(pedia) club record / links
71
+ recs = rec_or_recs.is_a?( Array ) ? rec_or_recs : [rec_or_recs] ## wrap (single) rec in array
72
+
73
+ recs.each do |rec|
74
+ ## note: strip qualifier () from wikipedia page name if present
75
+ ## e.g. FC Wacker Innsbruck (2002) => FC Wacker Innsbruck
76
+ ## Willem II (football club) => Willem II
77
+ ##
78
+ ## e.g. do NOT strip others !! e.g.
79
+ ## América Futebol Clube (MG)
80
+ ## only add more "special" cases on demand (that is) if we find more
81
+ name = strip_wiki( rec.name )
82
+
83
+ m = match_by( name: name, country: rec.country ) ## array of matching clubs or nil
84
+ if m.nil?
85
+ puts "** !!! ERROR !!! - no matching club found for wiki(pedia) name >#{name}, #{rec.country.name} (#{rec.country.key})<; sorry - to fix add name to clubs"
86
+ exit 1 ## note: exits the process on the first unmatched wiki name
87
+ end
88
+ if m.size > 1
89
+ puts "** !!! ERROR !!! - too many (greater than one) matching clubs found for wiki(pedia) name >#{name}, #{rec.country.name} (#{rec.country.key})<"
90
+ pp m
91
+ exit 1 ## note: exits the process on the first ambiguous wiki name
92
+ end
93
+ club = m[0] ## exactly one match left
94
+ club.wikipedia = rec.name ## note: keeps the full (unstripped) wikipedia page name
95
+ end
96
+ end # method add_wiki
97
+
98
+
99
+ def add( rec_or_recs ) ## add club record / alt_names
100
+ recs = rec_or_recs.is_a?( Array ) ? rec_or_recs : [rec_or_recs] ## wrap (single) rec in array
101
+
102
+ recs.each do |rec|
103
+ ## puts "adding:"
104
+ ## pp rec
105
+ ### step 1) add canonical name
106
+ old_rec = @clubs[ rec.name ]
107
+ if old_rec
108
+ puts "** !!! ERROR !!! - (canonical) name conflict - duplicate - >#{rec.name}< will overwrite >#{old_rec.name}<:"
109
+ pp old_rec
110
+ pp rec
111
+ exit 1 ## note: canonical names MUST be unique - exits the process
112
+ else
113
+ @clubs[ rec.name ] = rec
114
+ end
115
+
116
+ ## step 2) add all names (canonical name + alt names + alt names (auto))
117
+ names = [rec.name] + rec.alt_names
118
+ more_names = []
119
+ ## check "hand-typed" names for year (auto-add)
120
+ ## check for year(s) e.g. (1887-1911), (-2013),
121
+ ## (1946-2001,2013-) etc.
122
+ names.each do |name|
123
+ if has_year?( name )
124
+ more_names << strip_year( name )
125
+ end
126
+ end
127
+
128
+ names += more_names
129
+ ## check for duplicates - simple check for now - fix/improve
130
+ ## todo/fix: (auto)remove duplicates - why? why not?
131
+ count = names.size
132
+ count_uniq = names.uniq.size
133
+ if count != count_uniq
134
+ puts "** !!! ERROR !!! - #{count-count_uniq} duplicate name(s):"
135
+ pp names
136
+ pp rec
137
+ exit 1 ## note: duplicates in the "hand-typed" names exit the process
138
+ end
139
+
140
+ ## check with auto-names just warn for now and do not exit
141
+ names += rec.alt_names_auto
142
+ count = names.size
143
+ count_uniq = names.uniq.size
144
+ if count != count_uniq
145
+ puts "** !!! WARN !!! - #{count-count_uniq} duplicate name(s):"
146
+ pp names
147
+ pp rec
148
+ end
149
+
150
+ ## step 3) index the club record by all (normalized) names
151
+ names.each_with_index do |name,i| ## note: index i is not used in the block
152
+ ## check lang codes e.g. [en], [fr], etc.
153
+ ## todo/check/fix: move strip_lang up in the chain - check for duplicates (e.g. only lang code marker different etc.) - why? why not?
154
+ name = strip_lang( name )
155
+ norm = normalize( name )
156
+ alt_recs = @clubs_by_name[ norm ]
157
+ if alt_recs
158
+ ## check if include club rec already or is new club rec
159
+ if alt_recs.include?( rec )
160
+ ## note: do NOT include duplicate club record
161
+ msg = "** !!! WARN !!! - (norm) name conflict/duplicate for club - >#{name}< normalized to >#{norm}< already included >#{rec.name}, #{rec.country.name}<"
162
+ puts msg
163
+ @errors << msg
164
+ else
165
+ msg = "** !!! WARN !!! - name conflict/duplicate - >#{name}< will overwrite >#{alt_recs[0].name}, #{alt_recs[0].country.name}< with >#{rec.name}, #{rec.country.name}<"
166
+ puts msg
167
+ @errors << msg
168
+ alt_recs << rec ## note: nothing gets overwritten - the record is appended to the matches for this name
169
+ end
170
+ else
171
+ @clubs_by_name[ norm ] = [rec]
172
+ end
173
+ end
174
+ end
175
+ end # method add
176
+
177
+
178
+ ## todo/fix/check: use rename to find_canon or find_canonical() or something??
179
+ ## remove (getting used?) - why? why not?
180
+ def []( name ) ## lookup by canonical name only; todo/fix: add find alias why? why not?
181
+ @clubs[ name ] ## note: exact hash lookup - the name is NOT normalized here
182
+ end
183
+
184
+
185
+ ## todo/fix/check: return empty array if no match!!!
186
+ ## and NOT nil (add || []) - why? why not?
187
+ def match( name ) ## lookup by (normalized) name - returns an array of clubs or nil
188
+ name = normalize( name )
189
+ m = @clubs_by_name[ name ]
190
+
191
+ ## no match - retry with unaccented variant if different
192
+ ## e.g. example is Preussen Münster (with mixed accent and unaccented letters) that would go unmatched for now
193
+ ## Preussen Münster => preussenmünster (norm) => preussenmunster (norm+unaccent)
194
+ if m.nil?
195
+ name2 = unaccent( name ) ## NOTE(review): unaccent is not defined in this class - presumably from NameHelper; confirm
196
+ m = @clubs_by_name[ name2 ] if name2 != name
197
+ end
198
+ m
199
+ end
200
+
201
+
202
+ ## helper to always convert (possible) country key to existing country record
203
+ ## todo: make private - why? why not?
204
+ def country( country ) ## string / symbol => country record lookup; anything else gets passed through
205
+ if country.is_a?( String ) || country.is_a?( Symbol )
206
+ ## note: use own "global" countries index setting for ClubIndex - why? why not?
207
+ rec = catalog.countries.find( country.to_s )
208
+ if rec.nil?
209
+ puts "** !!! ERROR !!! - unknown country >#{country}< - no match found, sorry - add to world/countries.txt in config"
210
+ exit 1 ## note: unknown country keys exit the process
211
+ end
212
+ rec
213
+ else
214
+ country ## (re)use country struct - no need to run lookup again
215
+ end
216
+ end
217
+
218
+
219
+ ## match - always returns an array (with one or more matches) or nil
220
+ def match_by( name:, country: nil )
221
+ ## note: allow passing in of country key too (auto-convert)
222
+ ## and country struct too
223
+ ## - country assumes / allows the country key or fifa code for now
224
+ m = match( name )
225
+
226
+ if country
227
+ country = country( country ) ## note: calls the country helper method (local gets reassigned to the record)
228
+
229
+ ## note: match must for now always include name
230
+ if m ## filter by country
231
+ m = m.select { |club| club.country.key == country.key }
232
+ m = nil if m.empty? ## note: reset to nil if no more matches
233
+ end
234
+ end
235
+ m
236
+ end
237
+
238
+ def find( name ) find_by( name: name, country: nil ); end ## "global" lookup (no country) - returns club or nil
239
+ def find!( name ) find_by!( name: name, country: nil ); end ## same as find but exits the process if no match
240
+
241
+ ## find - always returns a single record / match or nil
242
+ ## if there is more than one match then find aborts / fails
243
+ def find_by!( name:, country: nil ) ## todo/fix: add international or league flag?
244
+ club = find_by( name: name, country: country )
245
+
246
+ if club.nil?
247
+ puts "** !!! ERROR - no match for club >#{name}<"
248
+ exit 1 ## note: bang version - no match exits the process (instead of returning nil)
249
+ end
250
+
251
+ club
252
+ end
253
+
254
+
255
+ def find_by( name:, country: nil ) ## todo/fix: add international or league flag?
256
+ ## note: allow passing in of country key too (auto-convert)
257
+ ## and country struct too
258
+ ## - country assumes / allows the country key or fifa code for now
259
+ m = nil
260
+
261
+ if country
262
+ country = country( country ) ## note: calls the country helper method (local gets reassigned to the record)
263
+
264
+ m = match_by( name: name, country: country )
265
+
266
+ if m.nil?
267
+ ## (re)try with second country - quick hacks for known leagues
268
+ ## todo/fix: add league flag to activate!!! - why? why not
269
+ m = match_by( name: name, country: 'wal' ) if country.key == 'eng'
270
+ m = match_by( name: name, country: 'eng' ) if country.key == 'sco'
271
+ m = match_by( name: name, country: 'nir' ) if country.key == 'ie'
272
+ m = match_by( name: name, country: 'mc' ) if country.key == 'fr'
273
+ m = match_by( name: name, country: 'li' ) if country.key == 'ch'
274
+ m = match_by( name: name, country: 'ca' ) if country.key == 'us'
275
+ end
276
+ else ## try "global" search - no country passed in
277
+ m = match( name )
278
+ end
279
+
280
+ ## reduce matches (array or nil) to a single club record or nil
281
+ club = nil
282
+ if m.nil?
283
+ ## puts "** !!! WARN !!! no match for club >#{name}<"
284
+ elsif m.size > 1
285
+ puts "** !!! ERROR - too many matches (#{m.size}) for club >#{name}<:"
286
+ pp m
287
+ exit 1 ## note: ambiguous names exit the process
288
+ else # bingo; match - assume size == 1
289
+ club = m[0]
290
+ end
291
+
292
+ club
293
+ end
294
+
295
+
296
+
297
+ def build_mods( mods ) ## convert club name(s) => club line mappings into club name => club record mappings
298
+ ## e.g.
299
+ ## { 'Arsenal | Arsenal FC' => 'Arsenal, ENG',
300
+ ## 'Liverpool | Liverpool FC' => 'Liverpool, ENG',
301
+ ## 'Barcelona' => 'Barcelona, ESP',
302
+ ## 'Valencia' => 'Valencia, ESP' }
303
+
304
+ mods.reduce({}) do |h,(club_names, club_line)|
305
+
306
+ values = club_line.split( ',' )
307
+ values = values.map { |value| value.strip } ## strip all spaces
308
+
309
+ ## todo/fix: make sure country is present !!!!
310
+ club_name, country_name = values
311
+ club = find_by!( name: club_name, country: country_name ) ## note: exits the process if no club found
312
+
313
+ values = club_names.split( '|' )
314
+ values = values.map { |value| value.strip } ## strip all spaces
315
+
316
+ values.each do |club_name| ## note: block parameter shadows the club_name local from above
317
+ h[club_name] = club
318
+ end
319
+ h ## hand the hash on to the next reduce step
320
+ end
321
+ end
322
+
323
+
324
+ def dump_duplicates # debug helper - report duplicate club name records
325
+ @clubs_by_name.each do |name, clubs|
326
+ if clubs.size > 1 ## more than one club record for the same (normalized) name
327
+ puts "#{clubs.size} matching club duplicates for >#{name}<:"
328
+ pp clubs
329
+ end
330
+ end
331
+ end
332
+ end # class ClubIndex
333
+
334
+
335
+ end # module Import
336
+ end # module SportDb
@@ -0,0 +1,350 @@
1
+ # encoding: utf-8
2
+
3
+
4
+ module SportDb
5
+ module Import
6
+
7
+
8
+ class ClubReader
9
+
10
+ def catalog() Import.catalog; end ## shortcut - delegates to the module-level Import.catalog
11
+
12
+
13
+
14
+ def self.read( path ) ## use - rename to read_file or from_file etc. - why? why not?
15
+ txt = File.open( path, 'r:utf-8' ) { |f| f.read } ## note: block form closes the file again
16
+ parse( txt ) ## returns the parsed club records
17
+ end
18
+
19
+ def self.parse( txt ) ## convenience helper - parse text in one go without new
20
+ new( txt ).parse
21
+ end
22
+
23
+ def initialize( txt ) ## keep the text for a later call to parse
24
+ @txt = txt
25
+ end
26
+
27
+ ## pattern for b (child) team / club marker e.g.
28
+ ## (ii) or ii) or ii.) or (ii.) or (II)
29
+ ## (b) or b) or b.) or (b.) or (B)
30
+ ## (2) or 2) or 2.) or (2.)
31
+ B_TEAM_MARKER_RE = %r{^ \(? # optional opening bracket
32
+ (?: ii|b|2 )
33
+ \.? # optional dot - keep and allow dot - why? why not?
34
+ \) # required closing bracket
35
+ }xi ## note: add case-insensitive (e.g. II/ii or B/b)
36
+
37
+ ## pattern for checking for address line e.g.
38
+ ## use just one style / syntax - why? why not?
39
+ ## Fischhofgasse 12 ~ 1100 Wien or
40
+ ## Fischhofgasse 12 // 1100 Wien or Fischhofgasse 12 /// 1100 Wien
41
+ ## Fischhofgasse 12 ++ 1100 Wien or Fischhofgasse 12 +++ 1100 Wien
42
+ ADDR_MARKER_RE = %r{ (?: ^|[ ] ) # space or beginning of line
43
+ (?: ~ | /{2,} | \+{2,} )
44
+ (?: [ ]|$) # space or end of line
45
+ }x ## note: x (extended) flag - whitespace and comments inside the pattern get ignored
46
+
47
+
48
+ def add_alt_names( rec, names ) ## helper for adding alternate names
49
+
50
+ ## strip and squish (white)spaces
51
+ # e.g. New York FC (2011-) => New York FC (2011-)
52
+ names = names.map { |name| name.gsub( '$', '' ).strip
53
+ .gsub( /[ \t]+/, ' ' ) } ## note: all dollar signs ($) get removed too
54
+ rec.alt_names += names
55
+ rec.add_variants( names ) # auto-add (possible) auto-generated variant names
56
+
57
+ ## check for duplicates
58
+ if rec.duplicates?
59
+ duplicates = rec.duplicates
60
+ puts "*** !!! WARN !!! - #{duplicates.size} duplicate alt name mapping(s):"
61
+ pp duplicates
62
+ pp rec
63
+ ##
64
+ ## todo/fix: make it only an error with exit 1
65
+ ## if (not normalized) names are the same (not unique/uniq)
66
+ ## e.g. don't exit on A.F.C. == AFC etc.
67
+ ## exit 1
68
+ end
69
+ end
70
+
71
+
72
+ def parse
73
+ recs = []
74
+ last_rec = nil
75
+ headings = [] ## headings stack
76
+
77
+ OutlineReader.parse( @txt ).each do |node|
78
+ if [:h1,:h2,:h3,:h4,:h5,:h6].include?( node[0] )
79
+ heading_level = node[0][1].to_i
80
+ heading = node[1]
81
+
82
+ puts "heading #{heading_level} >#{heading}<"
83
+
84
+ ## 1) first pop headings if present
85
+ while headings.size+1 > heading_level
86
+ headings.pop
87
+ end
88
+
89
+ ## 2) add missing (hierarchy) level if
90
+ while headings.size+1 < heading_level
91
+ ## todo/fix: issue warning about "skipping" hierarchy level
92
+ puts "!!! warn [team reader] - skipping hierarchy level in headings "
93
+ headings.push( nil )
94
+ end
95
+
96
+ if heading =~ /^\?+$/ ## note: use ? or ?? or ?? to reset level to nil
97
+ ## keep level empty
98
+ else
99
+ ## note: if level is 1 assume country for now
100
+ if heading_level == 1
101
+ ## assume country in heading; allow all "formats" supported by parse e.g.
102
+ ## Österreich • Austria (at)
103
+ ## Österreich • Austria
104
+ ## Austria
105
+ ## Deutschland (de) • Germany
106
+ country = catalog.countries.parse( heading )
107
+ ## check country code - MUST exist for now!!!!
108
+ if country.nil?
109
+ puts "!!! error [club reader] - unknown country >#{heading}< - sorry - add country to config to fix"
110
+ exit 1
111
+ end
112
+
113
+ headings.push( country.key )
114
+ else
115
+ ## quick hack:
116
+ ## remove known fill/dummy words incl:
117
+ ## Provincia San Juan => San Juan (see argentina, for example)
118
+ ##
119
+ ## use geo tree long term with alternative names - why? why not?
120
+ words = ['Provincia']
121
+ words.each { |word| heading = heading.gsub( word, '' ) }
122
+ heading = heading.strip
123
+
124
+ headings.push( heading )
125
+ end
126
+
127
+ ## assert that hierarchy level is ok
128
+ if headings.size != heading_level
129
+ puts "!!! error - headings hierarchy/stack out of order - #{heading.size}<=>#{heading_level}"
130
+ exit 1
131
+ end
132
+ end
133
+
134
+ pp headings
135
+
136
+ elsif node[0] == :p ## paragraph with (text) lines
137
+ lines = node[1]
138
+ lines.each do |line|
139
+ if line.start_with?( '|' )
140
+ ## assume continuation with line of alternative names
141
+ ## note: skip leading pipe
142
+ values = line[1..-1].split( '|' ) # team names - allow/use pipe(|)
143
+
144
+ add_alt_names( last_rec, values ) ## note: use alt_names helper for (re)use
145
+
146
+ ## check for b (child) team / club marker e.g.
147
+ ## (ii) or ii) or ii.) or (ii.)
148
+ ## (b) or b) or b.) or (b.)
149
+ ## (2) or 2) or 2.) or (2.)
150
+ elsif line =~ B_TEAM_MARKER_RE
151
+ line = line.sub( B_TEAM_MARKER_RE, '' ).strip ## remove (leading) b team marker
152
+
153
+ ## todo/fix: move into "regular" club branch - (re)use, that is, use the same code
154
+ # for both a and b team / club
155
+ rec = Club.new
156
+ value = line ## note: assume / allow just canonical name for now
157
+ ## strip and squish (white)spaces
158
+ # e.g. New York FC (2011-) => New York FC (2011-)
159
+ value = value.gsub( '$', '' ).strip
160
+ .gsub( /[ \t]+/, ' ' )
161
+
162
+ rec.name = value # canoncial name (global unique "beautiful/long" name)
163
+ rec.add_variants( value ) # auto-add (possible) auto-generated variant names
164
+
165
+ ### link a and b team / clubs
166
+ ## assume last_rec is the a team
167
+ ## todo/fix: check last_rec required NOT null
168
+ rec.a = last_rec
169
+ last_rec.b = rec
170
+
171
+ last_rec = rec
172
+ recs << rec
173
+
174
+ ## check for address line e.g.
175
+ ## use just one style / syntax - why? why not?
176
+ ## Fischhofgasse 12 ~ 1100 Wien or
177
+ ## Fischhofgasse 12 // 1100 Wien or Fischhofgasse 12 /// 1100 Wien
178
+ ## Fischhofgasse 12 ++ 1100 Wien or Fischhofgasse 12 +++ 1100 Wien
179
+ elsif line =~ ADDR_MARKER_RE
180
+ # note skip for now!!!
181
+ # todo/fix: add support for address line!!!
182
+ puts " skipping address line for now >#{line}<"
183
+ else
184
+ values = line.split( ',' )
185
+
186
+ rec = Club.new
187
+
188
+ col = values.shift ## get first item
189
+ ## note: allow optional alt names for convenience with required canoncial name
190
+ names = col.split( '|' ) # team names - allow/use pipe(|)
191
+ value = names[0] ## canonical name
192
+ alt_names = names[1..-1] ## optional (inline) alt names
193
+
194
+ ## strip and squish (white)spaces
195
+ # e.g. New York FC (2011-) => New York FC (2011-)
196
+ value = value.gsub( '$', '' ).strip
197
+ .gsub( /[ \t]+/, ' ' )
198
+ rec.name = value # canoncial name (global unique "beautiful/long" name)
199
+ rec.add_variants( value ) # auto-add (possible) auto-generated variant names
200
+
201
+ ## note: add optional (inline) alternate names if present
202
+ add_alt_names( rec, alt_names ) if alt_names.size > 0
203
+
204
+ ## note:
205
+ ## check/todo!!!!!!!!!!!!!!!!!-
206
+ ## strip year if to present e.g. (2011-)
207
+ ##
208
+ ## do NOT strip for defunct / historic clubs e.g.
209
+ ## (1899-1910)
210
+ ## or (-1914) or (-2011) etc.
211
+
212
+ ###
213
+ ## todo: move year out of canonical team name - why? why not?
214
+
215
+ ## check if canonical name include (2011-) or similar in name
216
+ ## if yes, remove (2011-) and add to (alt) names
217
+ ## e.g. New York FC (2011) => New York FC
218
+ if rec.name =~ /\(.+?\)/ ## note: use non-greedy (?) match
219
+ name = rec.name.gsub( /\(.+?\)/, '' ).strip
220
+
221
+ if rec.name =~ /\(([0-9]{4})-\)/ ## e.g. (2014-)
222
+ rec.year = $1.to_i
223
+ elsif rec.name =~ /\(-([0-9]{4})\)/ ## e.g. (-2014)
224
+ rec.year_end = $1.to_i
225
+ elsif rec.name =~ /\(([0-9]{4})-([0-9]{4})\)/ ## e.g. (2011-2014)
226
+ rec.year = $1.to_i
227
+ rec.year_end = $2.to_i
228
+ else
229
+ ## todo/check: warn about unknown year format
230
+ end
231
+ end
232
+
233
+ ## todo/check - check for unknown format values
234
+ ## e.g. too many values, duplicate years, etc.
235
+ ## check for overwritting, etc.
236
+ while values.size > 0
237
+ value = values.shift
238
+ ## strip and squish (white)spaces
239
+ # e.g. León › Guanajuato => León › Guanajuato
240
+ value = value.strip.gsub( /[ \t]+/, ' ' )
241
+ if value =~/^\d{4}$/ # e.g 1904
242
+ ## todo/check: issue warning if year is already set!!!!!!!
243
+ if rec.year
244
+ puts "!!! error - year already set to #{rec.year} - CANNOT overwrite with #{value}:"
245
+ pp rec
246
+ exit 1
247
+ end
248
+ rec.year = value.to_i
249
+ elsif value.start_with?( '@' ) # e.g. @ Anfield
250
+ ## cut-off leading @ and spaces
251
+ rec.ground = value[1..-1].strip
252
+ else
253
+ ## assume city / geo tree
254
+ ## split into geo tree
255
+ geos = split_geo( value )
256
+ city = geos[0]
257
+ ## check for "embedded" district e.g. London (Fulham) or Hamburg (St. Pauli) etc.
258
+ if city =~ /\((.+?)\)/ ## note: use non-greedy (?) match
259
+ rec.district = $1.strip
260
+ city = city.gsub( /\(.+?\)/, '' ).strip
261
+ end
262
+ rec.city = city
263
+
264
+ if geos.size > 1
265
+ ## cut-off city and keep the rest (of geo tree)
266
+ rec.geos = geos[1..-1]
267
+ end
268
+ end
269
+ end ## while values
270
+
271
+
272
+ ###############
273
+ ## use headings text for geo tree
274
+
275
+ ## 1) add country if present
276
+ if headings.size > 0 && headings[0]
277
+ country = catalog.countries.find( headings[0] )
278
+ rec.country = country
279
+ else
280
+ ## make it an error - why? why not?
281
+ puts "!!! error - country missing in headings hierarchy - sorry - add to quicklist"
282
+ exit 1
283
+ end
284
+
285
+ ## 2) check geo tree with headings hierarchy
286
+ if headings.size > 1 && headings[1]
287
+ geos = split_geo( headings[1] )
288
+ if rec.geos
289
+ if rec.geos[0] != geos[0]
290
+ puts "!!! error - geo tree - headings mismatch >#{rec.geos[0]}< <=> >#{geos[0]}<"
291
+ exit 1
292
+ end
293
+ if rec.geos[1] && rec.geos[1] != geos[1] ## check optional 2nd level too
294
+ puts "!!! error - geo tree - headings mismatch >#{rec.geos[1]}< <=> >#{geos[1]}<"
295
+ exit 1
296
+ end
297
+ else
298
+ ## add missing region (state/province) from headings hierarchy
299
+ rec.geos = geos
300
+ end
301
+ end
302
+
303
+ last_rec = rec
304
+
305
+
306
+ ### todo/fix:
307
+ ## auto-add alt name with dots stripped - why? why not?
308
+ ## e.g. D.C. United => DC United
309
+ ## e.g. Liverpool F.C. => Liverpool FC
310
+ ## e.g. St. Albin => St Albin etc.
311
+ ## e.g. 1. FC Köln => 1 FC Köln -- make special case for 1. - why? why not?
312
+
313
+ ##
314
+ ## todo/fix: unify mapping entries
315
+ ## always lowercase !!!! (case insensitive)
316
+ ## always strip (2011-) !!!
317
+ ## always strip dots (e.g. St., F.C, etc.)
318
+
319
+ recs << rec
320
+ end
321
+ end # each line (in paragraph)
322
+ else
323
+ puts "** !!! ERROR !!! [club reader] - unknown line type:"
324
+ pp node
325
+ exit 1
326
+ end
327
+ end
328
+
329
+ recs
330
+ end # method read
331
+
332
+ #######################
333
+ ### helpers
334
+ def split_geo( str ) ## split a geo tree string into an array of (stripped) geo names
335
+ ## assume city / geo tree
336
+ ## strip and squish (white)spaces
337
+ # e.g. León › Guanajuato => León › Guanajuato
338
+ str = str.strip.gsub( /[ \t]+/, ' ' )
339
+
340
+ ## split into geo tree
341
+ geos = str.split( /[<>‹›]/ ) ## note: allow > < or › ‹
342
+ geos = geos.map { |geo| geo.strip } ## remove leading and trailing whitespaces
343
+ geos
344
+ end
345
+
346
+ end # class ClubReader
347
+
348
+
349
+ end ## module Import
350
+ end ## module SportDb