sportdb-formats 1.0.0 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,336 @@
1
+ # encoding: utf-8
2
+
3
+ module SportDb
4
+ module Import
5
+
6
+
7
+ class ClubIndex
8
+
9
## Builds a club index from the package found at +path+.
##
##  1) parses every club datafile of the package and adds the records
##  2) parses every wiki(pedia) datafile and links the page names to the clubs
##
## Returns the populated index.
def self.build( path )
  pack = Package.new( path )   ## lets us use directory or zip archive

  club_recs = []
  pack.each_clubs { |entry| club_recs += Club.parse( entry.read ) }

  index = new
  index.add( club_recs )

  ## add wiki(pedia) anchored links
  wiki_recs = []
  pack.each_clubs_wiki { |entry| wiki_recs += WikiReader.parse( entry.read ) }

  pp wiki_recs
  index.add_wiki( wiki_recs )
  index
end
31
+
32
+
33
## shortcut - returns whatever Import.catalog returns
def catalog
  Import.catalog
end
34
+
35
## starts with an empty index - no clubs, no name mappings, no recorded messages
def initialize
  @errors        = []
  @clubs_by_name = {}   ## normalized name => array of club records (filled by add)
  @clubs         = {}   ## clubs (indexed) by canonical name
end
40
+
41
+ attr_reader :errors
42
## true if at least one message got recorded in @errors
def errors?
  !@errors.empty?
end
43
+
44
## returns the (normalized) name => club records lookup hash
##   todo/check: rename to index or something - why? why not?
def mappings
  @clubs_by_name
end
45
## returns all club records (the values of the canonical name lookup)
def clubs
  @clubs.values
end
46
+ alias_method :all, :clubs ## use ActiveRecord-like alias for clubs
47
+
48
+
49
+ ## helpers from club - use a helper module for includes - why? why not?
50
+ include NameHelper
51
+ ## incl. strip_year( name )
52
+ ## has_year?( name)
53
+ ## strip_lang( name )
54
+ ## normalize( name )
55
+
56
## Strips the disambiguation qualifier from a wikipedia page name (if present).
##
##  only two kinds of qualifiers get removed for now:
##   - starting with a digit 1 or 2 (assuming year)
##       e.g. FC Wacker Innsbruck (2002)  => FC Wacker Innsbruck
##   - starting with foot (assuming football ...)
##       e.g. Willem II (football club)   => Willem II
##
##  all others are kept as is e.g.
##       América Futebol Clube (MG)
##
##  todo/check: rename to strip_wikipedia_en - why? why not?
def strip_wiki( name )
  qualifiers = [ /\([12][^\)]+?\)/,
                 /\(foot[^\)]+?\)/ ]

  qualifiers.reduce( name ) do |text, qualifier_re|
    text.gsub( qualifier_re, '' ).strip
  end
end
69
+
70
## Adds wiki(pedia) club record(s) / links.
##
##  accepts a single record or an array of records; for every record the
##  page name (with the qualifier stripped, see strip_wiki) gets looked up
##  by name and country and MUST match exactly one club -
##  otherwise an error gets printed and the program exits (with status 1).
##  the matching club gets its wikipedia attribute set to the (unstripped) page name.
def add_wiki( rec_or_recs )
  wiki_recs = rec_or_recs.is_a?( Array ) ? rec_or_recs : [rec_or_recs]   ## wrap (single) rec in array

  wiki_recs.each do |wiki|
    name    = strip_wiki( wiki.name )
    matches = match_by( name: name, country: wiki.country )

    if matches.nil?
      puts "** !!! ERROR !!! - no matching club found for wiki(pedia) name >#{name}, #{wiki.country.name} (#{wiki.country.key})<; sorry - to fix add name to clubs"
      exit 1
    end

    if matches.size > 1
      puts "** !!! ERROR !!! - too many (greater than one) matching clubs found for wiki(pedia) name >#{name}, #{wiki.country.name} (#{wiki.country.key})<"
      pp matches
      exit 1
    end

    matches[0].wikipedia = wiki.name
  end
end # method add_wiki
97
+
98
+
99
## Adds club record(s) incl. all (alternative) names.
##
##  accepts a single record or an array of records; for every record:
##   1) the canonical name gets registered - a duplicate canonical name is fatal (exit 1)
##   2) all names (canonical name + alt names + names with the year stripped
##      + auto-generated alt names) get registered under their normalized form;
##      duplicates in the "hand-typed" names are fatal (exit 1),
##      duplicates caused by the auto-generated names only get a warning;
##      normalized name conflicts get printed and recorded in @errors
def add( rec_or_recs )
  list = rec_or_recs.is_a?( Array ) ? rec_or_recs : [rec_or_recs]   ## wrap (single) rec in array

  list.each do |rec|
    ### step 1) add canonical name
    existing = @clubs[ rec.name ]
    if existing
      puts "** !!! ERROR !!! - (canonical) name conflict - duplicate - >#{rec.name}< will overwrite >#{existing.name}<:"
      pp existing
      pp rec
      exit 1
    end
    @clubs[ rec.name ] = rec

    ## step 2) add all names (canonical name + alt names + alt names (auto))
    names = [rec.name] + rec.alt_names
    ##  check "hand-typed" names for year (auto-add the name with the year stripped)
    ##    e.g. (1887-1911), (-2013), (1946-2001,2013-) etc.
    names += names.select { |name| has_year?( name ) }
                  .map    { |name| strip_year( name ) }

    ## check for duplicates - simple check for now - fix/improve
    ## todo/fix: (auto)remove duplicates - why? why not?
    dups = names.size - names.uniq.size
    if dups != 0
      puts "** !!! ERROR !!! - #{dups} duplicate name(s):"
      pp names
      pp rec
      exit 1
    end

    ## check with auto-names just warn for now and do not exit
    names += rec.alt_names_auto
    dups = names.size - names.uniq.size
    if dups != 0
      puts "** !!! WARN !!! - #{dups} duplicate name(s):"
      pp names
      pp rec
    end

    names.each do |raw_name|
      ## check lang codes e.g. [en], [fr], etc.
      ## todo/check/fix: move strip_lang up in the chain - check for duplicates (e.g. only lang code marker different etc.) - why? why not?
      name   = strip_lang( raw_name )
      norm   = normalize( name )
      bucket = @clubs_by_name[ norm ]

      unless bucket     ## first record for this normalized name
        @clubs_by_name[ norm ] = [rec]
        next
      end

      if bucket.include?( rec )
        ## note: do NOT include duplicate club record
        msg = "** !!! WARN !!! - (norm) name conflict/duplicate for club - >#{name}< normalized to >#{norm}< already included >#{rec.name}, #{rec.country.name}<"
      else
        ## note: build the message BEFORE adding the record
        msg = "** !!! WARN !!! - name conflict/duplicate - >#{name}< will overwrite >#{bucket[0].name}, #{bucket[0].country.name}< with >#{rec.name}, #{rec.country.name}<"
        bucket << rec
      end
      puts msg
      @errors << msg
    end
  end
end # method add
176
+
177
+
178
+ ## todo/fix/check: use rename to find_canon or find_canonical() or something??
179
+ ## remove (getting used?) - why? why not?
180
## lookup by canonical name only - returns the club record or nil
##   todo/fix: add find alias why? why not?
def []( name )
  @clubs[ name ]
end
183
+
184
+
185
+ ## todo/fix/check: return empty array if no match!!!
186
+ ## and NOT nil (add || []) - why? why not?
187
## Looks up the club records registered for the (normalized) name;
##  returns an array of records or nil if there is no match.
##
##  no match - retries with the unaccented variant (if different)
##   e.g. example is Preussen Münster (with mixed accent and unaccented letters)
##        that would go unmatched otherwise
##        Preussen Münster => preussenmünster (norm) => preussenmunster (norm+unaccent)
def match( name )
  norm = normalize( name )
  hits = @clubs_by_name[ norm ]
  return hits unless hits.nil?

  plain = unaccent( norm )
  plain == norm ? nil : @clubs_by_name[ plain ]
end
200
+
201
+
202
+ ## helper to always convert (possible) country key to existing country record
203
+ ## todo: make private - why? why not?
204
## Converts a country key (passed in as string or symbol) to the country record
##  found in the catalog; anything else gets passed through as is
##  (that is, (re)use country struct - no need to run lookup again).
##
##  an unknown country key is fatal - prints an error and exits (with status 1).
def country( country )
  return country unless country.is_a?( String ) || country.is_a?( Symbol )

  ## note: use own "global" countries index setting for ClubIndex - why? why not?
  rec = catalog.countries.find( country.to_s )
  if rec.nil?
    puts "** !!! ERROR !!! - unknown country >#{country}< - no match found, sorry - add to world/countries.txt in config"
    exit 1
  end
  rec
end
217
+
218
+
219
+ ## match - always returns an array (with one or more matches) or nil
220
## Looks up the club records by name and (optionally) filters by country.
##
##  note: allow passing in of country key too (auto-convert)
##        and country struct too
##     - country assumes / allows the country key or fifa code for now
##
##  returns an array of records or nil (if nothing matches or nothing is left after filtering).
def match_by( name:, country: nil )
  matches = match( name )
  return matches unless country

  country = country( country )     ## note: convert first - an unknown country key is fatal
  return matches unless matches    ## note: match must for now always include name

  in_country = matches.select { |club| club.country.key == country.key }
  in_country.empty? ? nil : in_country   ## note: reset to nil if no more matches
end
237
+
238
## shortcut for find_by with the name only (no country, that is, "global" search)
def find( name )
  find_by( name: name, country: nil )
end
239
## shortcut for find_by! with the name only (no country, that is, "global" search)
def find!( name )
  find_by!( name: name, country: nil )
end
240
+
241
+ ## find - always returns a single record / match or nil
242
+ ## if there is more than one match then find aborts / fails
243
## same as find_by but no match is fatal - prints an error and exits (with status 1)
##   todo/fix: add international or league flag?
def find_by!( name:, country: nil )
  club = find_by( name: name, country: country )
  return club unless club.nil?

  puts "** !!! ERROR - no match for club >#{name}<"
  exit 1
end
253
+
254
+
255
## Finds a single club record by name and (optional) country; returns the record or nil.
##
##  note: allow passing in of country key too (auto-convert)
##        and country struct too
##     - country assumes / allows the country key or fifa code for now
##
##  without a country all clubs get searched ("global" search).
##  more than one match is fatal - prints an error and exits (with status 1).
##
##  todo/fix: add international or league flag?
def find_by( name:, country: nil )
  matches = nil

  if country
    country = country( country )
    matches = match_by( name: name, country: country )

    if matches.nil?
      ## (re)try with second country - quick hacks for known leagues
      ##  todo/fix: add league flag to activate!!! - why? why not
      second = { 'eng' => 'wal',
                 'sco' => 'eng',
                 'ie'  => 'nir',
                 'fr'  => 'mc',
                 'ch'  => 'li',
                 'us'  => 'ca' }[ country.key ]
      matches = match_by( name: name, country: second ) if second
    end
  else   ## try "global" search - no country passed in
    matches = match( name )
  end

  return nil if matches.nil?    ## no match (no warning for now)

  if matches.size > 1
    puts "** !!! ERROR - too many matches (#{matches.size}) for club >#{name}<:"
    pp matches
    exit 1
  end

  matches[0]    # bingo; match - assume size == 1
end
294
+
295
+
296
+
297
## Converts a hash of club name(s) => club line into a lookup hash of club name => club record.
##
##  e.g.
##    { 'Arsenal   | Arsenal FC'    => 'Arsenal, ENG',
##      'Liverpool | Liverpool FC'  => 'Liverpool, ENG',
##      'Barcelona'                 => 'Barcelona, ESP',
##      'Valencia'                  => 'Valencia, ESP' }
##
##  the club line (name, country) gets resolved with find_by! (no match is fatal);
##  every name in the (pipe-separated) key gets mapped to the club record found.
def build_mods( mods )
  mods.each_with_object( {} ) do |(club_names, club_line), lookup|
    ## todo/fix: make sure country is present !!!!
    club_name, country_name = club_line.split( ',' ).map { |value| value.strip }
    club = find_by!( name: club_name, country: country_name )

    club_names.split( '|' ).each do |alt_name|
      lookup[ alt_name.strip ] = club
    end
  end
end
322
+
323
+
324
## debug helper - reports every (normalized) name that maps to more than one club record
def dump_duplicates
  @clubs_by_name.each do |name, clubs|
    next unless clubs.size > 1

    puts "#{clubs.size} matching club duplicates for >#{name}<:"
    pp clubs
  end
end
332
+ end # class ClubIndex
333
+
334
+
335
+ end # module Import
336
+ end # module SportDb
@@ -0,0 +1,350 @@
1
+ # encoding: utf-8
2
+
3
+
4
+ module SportDb
5
+ module Import
6
+
7
+
8
+ class ClubReader
9
+
10
## shortcut - returns whatever Import.catalog returns
def catalog
  Import.catalog
end
11
+
12
+
13
+
14
## reads the file at path (as utf-8 text) and returns the parsed records
##   use - rename to read_file or from_file etc. - why? why not?
def self.read( path )
  parse( File.read( path, encoding: 'utf-8' ) )
end
18
+
19
## convenience helper - wraps the text in a new reader and returns the result of its parse
def self.parse( txt )
  reader = new( txt )
  reader.parse
end
22
+
23
## keeps the (unparsed) text around for parse
def initialize( txt )
  @txt = txt
end
26
+
27
+ ## pattern for b (child) team / club marker e.g.
28
+ ## (ii) or ii) or ii.) or (ii.) or (II)
29
+ ## (b) or b) or b.) or (b.) or (B)
30
+ ## (2) or 2) or 2.) or (2.)
31
## note: the marker is anchored to the beginning of the line and
##       the closing bracket is required - the opening bracket and the dot are optional
B_TEAM_MARKER_RE = %r{^ \(?            # optional opening bracket
                        (?: ii|b|2 )
                        \.?            # optional dot - keep and allow dot - why? why not?
                        \)             # required closing bracket
                     }xi   ## note: add case-insenstive (e.g. II/ii or B/b)
36
+
37
+ ## pattern for checking for address line e.g.
38
+ ## use just one style / syntax - why? why not?
39
+ ## Fischhofgasse 12 ~ 1100 Wien or
40
+ ## Fischhofgasse 12 // 1100 Wien or Fischhofgasse 12 /// 1100 Wien
41
+ ## Fischhofgasse 12 ++ 1100 Wien or Fischhofgasse 12 +++ 1100 Wien
42
## note: the marker (~ or two or more / or two or more +) must stand alone, that is,
##       have a space or the beginning / end of the line on both sides
ADDR_MARKER_RE = %r{ (?: ^|[ ] )                 # space or beginning of line
                     (?: ~ | /{2,} | \+{2,} )
                     (?: [ ]|$)                  # space or end of line
                   }x
46
+
47
+
48
## helper for adding alternate names to a club record
##
##  the names get cleaned up first, that is,
##    all $ get removed and (white)spaces get stripped and squished
##      e.g. New York FC      (2011-)  => New York FC (2011-)
##  then the names get added to the alt names of the record and
##  passed on to add_variants for (possible) auto-generated variant names.
##
##  duplicates (as reported by the record) only get a warning for now.
def add_alt_names( rec, names )
  cleaned = names.map do |name|
    name.gsub( '$', '' ).strip.gsub( /[ \t]+/, ' ' )
  end

  rec.alt_names += cleaned
  rec.add_variants( cleaned )   # auto-add (possible) auto-generated variant names

  ## check for duplicates
  return unless rec.duplicates?

  dups = rec.duplicates
  puts "*** !!! WARN !!! - #{dups.size} duplicate alt name mapping(s):"
  pp dups
  ##  todo/fix: make it only an error with exit 1
  ##     if (not normalized) names are the same (not unique/uniq)
  ##       e.g. don't exit on  A.F.C. == AFC etc.
  pp rec
end
70
+
71
+
72
## Parses the text (outline) into an array of club records.
##
##  the outline nodes (as returned by OutlineReader.parse) get processed in order:
##   - headings (h1..h6) maintain a headings stack;
##       level 1 MUST be a known country (the country key gets pushed),
##       all other levels are used as geo tree (e.g. region / state / province);
##       a heading with question marks only (e.g. ? or ??) keeps the level empty
##   - paragraphs hold the club lines; every line is one of
##       1) | alt name | alt name      - more alternative names for the preceding club
##       2) (ii) name or b) name etc.  - b (child) team / club of the preceding club
##       3) an address line            - skipped for now
##       4) name [| alt names], year, city › geo tree, @ ground  - a club
##
##  all errors (unknown country, unknown node type, year set twice,
##  geo tree / headings mismatch, alt names or b team without a preceding club)
##  get printed and end the program with exit 1.
##
##  returns the club records in the order read.
def parse
  recs     = []
  last_rec = nil
  headings = []   ## headings stack

  OutlineReader.parse( @txt ).each do |node|
    if [:h1,:h2,:h3,:h4,:h5,:h6].include?( node[0] )
      heading_level = node[0][1].to_i    ## e.g. :h2 => 2
      heading       = node[1]

      puts "heading #{heading_level} >#{heading}<"

      ## 1) first pop headings if present
      while headings.size+1 > heading_level
        headings.pop
      end

      ## 2) add missing (hierarchy) level if
      while headings.size+1 < heading_level
        ## todo/fix: issue warning about "skipping" hierarchy level
        puts "!!! warn [club reader] - skipping hierarchy level in headings "
        headings.push( nil )
      end

      if heading =~ /^\?+$/    ## note: use ? or ?? or ?? to reset level to nil
        ## keep level empty
      else
        ## note: if level is 1 assume country for now
        if heading_level == 1
          ## assume country in heading; allow all "formats" supported by parse e.g.
          ##   Österreich • Austria (at)
          ##   Österreich • Austria
          ##   Austria
          ##   Deutschland (de) • Germany
          country = catalog.countries.parse( heading )
          ## check country code - MUST exist for now!!!!
          if country.nil?
            puts "!!! error [club reader] - unknown country >#{heading}< - sorry - add country to config to fix"
            exit 1
          end

          headings.push( country.key )
        else
          ## quick hack:
          ##   remove known fill/dummy words incl:
          ##     Provincia San Juan  =>  San Juan   (see argentina, for example)
          ##
          ##   use geo tree long term with alternative names - why? why not?
          words = ['Provincia']
          words.each { |word| heading = heading.gsub( word, '' ) }
          heading = heading.strip

          headings.push( heading )
        end

        ## assert that hierarchy level is ok
        ##   note: report the size of the headings stack (and NOT the length of the heading text)
        if headings.size != heading_level
          puts "!!! error - headings hierarchy/stack out of order - #{headings.size}<=>#{heading_level}"
          exit 1
        end
      end

      pp headings

    elsif node[0] == :p    ## paragraph with (text) lines
      lines = node[1]
      lines.each do |line|
        if line.start_with?( '|' )
          ## assume continuation with line of alternative names
          ##  note: requires a preceding club (to add the names to)
          if last_rec.nil?
            puts "!!! error [club reader] - alternative names line >#{line}< without a preceding club line"
            exit 1
          end

          ## note: skip leading pipe
          values = line[1..-1].split( '|' )   # team names - allow/use pipe(|)

          add_alt_names( last_rec, values )   ## note: use alt_names helper for (re)use

        ## check for b (child) team / club marker e.g.
        ##    (ii) or ii) or ii.) or (ii.)
        ##    (b)  or b)  or b.)  or (b.)
        ##    (2)  or 2)  or 2.)  or (2.)
        elsif line =~ B_TEAM_MARKER_RE
          ##  note: requires a preceding club (the a team)
          if last_rec.nil?
            puts "!!! error [club reader] - b team line >#{line}< without a preceding club line"
            exit 1
          end

          line = line.sub( B_TEAM_MARKER_RE, '' ).strip   ## remove (leading) b team marker

          ## todo/fix: move into "regular" club branch - (re)use, that is, use the same code
          #              for both a and b team / club
          rec   = Club.new
          value = line    ## note: assume / allow just canonical name for now
          ## strip and squish (white)spaces
          #   e.g. New York FC      (2011-)  => New York FC (2011-)
          value = value.gsub( '$', '' ).strip
                       .gsub( /[ \t]+/, ' ' )

          rec.name = value            # canoncial name (global unique "beautiful/long" name)
          rec.add_variants( value )   # auto-add (possible) auto-generated variant names

          ### link a and b team / clubs
          ##   assume last_rec is the a team
          rec.a      = last_rec
          last_rec.b = rec

          last_rec = rec
          recs << rec

        ## check for address line e.g.
        ##   use just one style / syntax - why? why not?
        ##  Fischhofgasse 12 ~ 1100 Wien or
        ##  Fischhofgasse 12 // 1100 Wien or Fischhofgasse 12 /// 1100 Wien
        ##  Fischhofgasse 12 ++ 1100 Wien or Fischhofgasse 12 +++ 1100 Wien
        elsif line =~ ADDR_MARKER_RE
          # note skip for now!!!
          # todo/fix: add support for address line!!!
          puts " skipping address line for now >#{line}<"
        else
          values = line.split( ',' )

          rec = Club.new

          col = values.shift    ## get first item
          ## note: allow optional alt names for convenience with required canoncial name
          names     = col.split( '|' )   # team names - allow/use pipe(|)
          value     = names[0]           ## canonical name
          alt_names = names[1..-1]       ## optional (inline) alt names

          ## strip and squish (white)spaces
          #   e.g. New York FC      (2011-)  => New York FC (2011-)
          value = value.gsub( '$', '' ).strip
                       .gsub( /[ \t]+/, ' ' )
          rec.name = value            # canoncial name (global unique "beautiful/long" name)
          rec.add_variants( value )   # auto-add (possible) auto-generated variant names

          ## note: add optional (inline) alternate names if present
          add_alt_names( rec, alt_names ) if alt_names.size > 0

          ## check if canonical name includes a year marker e.g. (2011-) and
          ##   if yes, set the year(s)
          ##  note: the marker stays in the canonical name for now
          ##  todo: move year out of canonical team name - why? why not?
          if rec.name =~ /\(([0-9]{4})-\)/                   ## e.g. (2014-)
            rec.year     = $1.to_i
          elsif rec.name =~ /\(-([0-9]{4})\)/                ## e.g. (-2014)
            rec.year_end = $1.to_i
          elsif rec.name =~ /\(([0-9]{4})-([0-9]{4})\)/      ## e.g. (2011-2014)
            rec.year     = $1.to_i
            rec.year_end = $2.to_i
          else
            ## todo/check: warn about unknown year format
          end

          ## todo/check - check for unknown format values
          ##   e.g. too many values, duplicate years, etc.
          ##           check for overwritting, etc.
          while values.size > 0
            value = values.shift
            ## strip and squish (white)spaces
            #   e.g. León     › Guanajuato  => León › Guanajuato
            value = value.strip.gsub( /[ \t]+/, ' ' )
            if value =~ /^\d{4}$/    # e.g 1904
              if rec.year
                puts "!!! error - year already set to #{rec.year} - CANNOT overwrite with #{value}:"
                pp rec
                exit 1
              end
              rec.year = value.to_i
            elsif value.start_with?( '@' )    # e.g. @ Anfield
              ## cut-off leading @ and spaces
              rec.ground = value[1..-1].strip
            else
              ## assume city / geo tree
              ## split into geo tree
              geos = split_geo( value )
              city = geos[0]
              ## check for "embedded" district e.g. London (Fulham) or Hamburg (St. Pauli) etc.
              if city =~ /\((.+?)\)/    ## note: use non-greedy (?) match
                rec.district = $1.strip
                city = city.gsub( /\(.+?\)/, '' ).strip
              end
              rec.city = city

              if geos.size > 1
                ## cut-off city and keep the rest (of geo tree)
                rec.geos = geos[1..-1]
              end
            end
          end ## while values

          ###############
          ## use headings text for geo tree

          ## 1) add country if present
          if headings.size > 0 && headings[0]
            country = catalog.countries.find( headings[0] )
            rec.country = country
          else
            ## make it an error - why? why not?
            puts "!!! error - country missing in headings hierarchy - sorry - add to quicklist"
            exit 1
          end

          ## 2) check geo tree with headings hierarchy
          if headings.size > 1 && headings[1]
            geos = split_geo( headings[1] )
            if rec.geos
              if rec.geos[0] != geos[0]
                puts "!!! error - geo tree - headings mismatch >#{rec.geos[0]}< <=> >#{geos[0]}<"
                exit 1
              end
              if rec.geos[1] && rec.geos[1] != geos[1]    ## check optional 2nd level too
                puts "!!! error - geo tree - headings mismatch >#{rec.geos[1]}< <=> >#{geos[1]}<"
                exit 1
              end
            else
              ## add missing region (state/province) from headings hierarchy
              rec.geos = geos
            end
          end

          last_rec = rec

          ### todo/fix:
          ##  auto-add alt name with dots stripped - why? why not?
          ##  e.g. D.C. United => DC United
          ##  e.g. Liverpool F.C. => Liverpool FC
          ##  e.g. St. Albin => St Albin etc.
          ##  e.g. 1. FC Köln => 1 FC Köln -- make special case for 1. - why? why not?

          ##
          ## todo/fix: unify mapping entries
          ##   always lowercase !!!! (case insensitive)
          ##   always strip (2011-) !!!
          ##   always strip dots (e.g. St., F.C, etc.)

          recs << rec
        end
      end # each line (in paragraph)
    else
      puts "** !!! ERROR !!! [club reader] - unknown line type:"
      pp node
      exit 1
    end
  end

  recs
end # method parse
331
+
332
+ #######################
333
+ ### helpers
334
## Splits a city / geo tree string into its (stripped) parts.
##
##   note: allow > < or › ‹ as separator
##     e.g. León     › Guanajuato  => ["León", "Guanajuato"]
def split_geo( str )
  str.strip
     .gsub( /[ \t]+/, ' ' )    ## squish (white)spaces
     .split( /[<>‹›]/ )        ## split into geo tree
     .map { |geo| geo.strip }  ## remove (leading and trailing) whitespaces
end
345
+
346
+ end # class ClubReader
347
+
348
+
349
+ end ## module Import
350
+ end ## module SportDb