sportdb-formats 1.1.6 → 1.2.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49) hide show
  1. checksums.yaml +5 -5
  2. data/CHANGELOG.md +2 -0
  3. data/Manifest.txt +4 -25
  4. data/Rakefile +1 -1
  5. data/lib/sportdb/formats/country/country_reader.rb +142 -142
  6. data/lib/sportdb/formats/datafile.rb +59 -59
  7. data/lib/sportdb/formats/event/event_reader.rb +184 -183
  8. data/lib/sportdb/formats/goals.rb +53 -9
  9. data/lib/sportdb/formats/ground/ground_reader.rb +289 -0
  10. data/lib/sportdb/formats/league/league_reader.rb +152 -168
  11. data/lib/sportdb/formats/lines_reader.rb +47 -0
  12. data/lib/sportdb/formats/match/match_parser.rb +130 -13
  13. data/lib/sportdb/formats/match/match_parser_auto_conf.rb +270 -202
  14. data/lib/sportdb/formats/outline_reader.rb +0 -1
  15. data/lib/sportdb/formats/package.rb +394 -374
  16. data/lib/sportdb/formats/search/sport.rb +357 -0
  17. data/lib/sportdb/formats/search/world.rb +139 -0
  18. data/lib/sportdb/formats/team/club_index_history.rb +134 -134
  19. data/lib/sportdb/formats/team/club_reader.rb +318 -350
  20. data/lib/sportdb/formats/team/club_reader_history.rb +203 -203
  21. data/lib/sportdb/formats/team/wiki_reader.rb +108 -108
  22. data/lib/sportdb/formats/version.rb +4 -7
  23. data/lib/sportdb/formats.rb +60 -27
  24. metadata +13 -35
  25. data/lib/sportdb/formats/country/country_index.rb +0 -192
  26. data/lib/sportdb/formats/event/event_index.rb +0 -141
  27. data/lib/sportdb/formats/league/league_index.rb +0 -178
  28. data/lib/sportdb/formats/team/club_index.rb +0 -338
  29. data/lib/sportdb/formats/team/national_team_index.rb +0 -114
  30. data/lib/sportdb/formats/team/team_index.rb +0 -43
  31. data/test/helper.rb +0 -132
  32. data/test/test_club_index.rb +0 -183
  33. data/test/test_club_index_history.rb +0 -107
  34. data/test/test_club_reader.rb +0 -201
  35. data/test/test_club_reader_history.rb +0 -212
  36. data/test/test_club_reader_props.rb +0 -54
  37. data/test/test_country_index.rb +0 -63
  38. data/test/test_country_reader.rb +0 -89
  39. data/test/test_datafile.rb +0 -30
  40. data/test/test_datafile_package.rb +0 -46
  41. data/test/test_goals.rb +0 -113
  42. data/test/test_league_index.rb +0 -157
  43. data/test/test_league_outline_reader.rb +0 -55
  44. data/test/test_league_reader.rb +0 -72
  45. data/test/test_outline_reader.rb +0 -31
  46. data/test/test_package.rb +0 -78
  47. data/test/test_package_match.rb +0 -102
  48. data/test/test_regex.rb +0 -67
  49. data/test/test_wiki_reader.rb +0 -77
@@ -1,350 +1,318 @@
1
- # encoding: utf-8
2
-
3
-
4
- module SportDb
5
- module Import
6
-
7
-
8
- class ClubReader
9
-
10
- def catalog() Import.catalog; end
11
-
12
-
13
-
14
- def self.read( path ) ## use - rename to read_file or from_file etc. - why? why not?
15
- txt = File.open( path, 'r:utf-8' ) { |f| f.read }
16
- parse( txt )
17
- end
18
-
19
- def self.parse( txt )
20
- new( txt ).parse
21
- end
22
-
23
- def initialize( txt )
24
- @txt = txt
25
- end
26
-
27
- ## pattern for b (child) team / club marker e.g.
28
- ## (ii) or ii) or ii.) or (ii.) or (II)
29
- ## (b) or b) or b.) or (b.) or (B)
30
- ## (2) or 2) or 2.) or (2.)
31
- B_TEAM_MARKER_RE = %r{^ \(? # optional opening bracket
32
- (?: ii|b|2 )
33
- \.? # optional dot - keep and allow dot - why? why not?
34
- \) # required closing bracket
35
- }xi ## note: add case-insenstive (e.g. II/ii or B/b)
36
-
37
- ## pattern for checking for address line e.g.
38
- ## use just one style / syntax - why? why not?
39
- ## Fischhofgasse 12 ~ 1100 Wien or
40
- ## Fischhofgasse 12 // 1100 Wien or Fischhofgasse 12 /// 1100 Wien
41
- ## Fischhofgasse 12 ++ 1100 Wien or Fischhofgasse 12 +++ 1100 Wien
42
- ADDR_MARKER_RE = %r{ (?: ^|[ ] ) # space or beginning of line
43
- (?: ~ | /{2,} | \+{2,} )
44
- (?: [ ]|$) # space or end of line
45
- }x
46
-
47
-
48
- def add_alt_names( rec, names ) ## helper for adding alternat names
49
-
50
- ## strip and squish (white)spaces
51
- # e.g. New York FC (2011-) => New York FC (2011-)
52
- names = names.map { |name| name.gsub( '$', '' ).strip
53
- .gsub( /[ \t]+/, ' ' ) }
54
- rec.alt_names += names
55
- rec.add_variants( names ) # auto-add (possible) auto-generated variant names
56
-
57
- ## check for duplicates
58
- if rec.duplicates?
59
- duplicates = rec.duplicates
60
- puts "*** !!! WARN !!! - #{duplicates.size} duplicate alt name mapping(s):"
61
- pp duplicates
62
- pp rec
63
- ##
64
- ## todo/fix: make it only an error with exit 1
65
- ## if (not normalized) names are the same (not unique/uniq)
66
- ## e.g. don't exit on A.F.C. == AFC etc.
67
- ## exit 1
68
- end
69
- end
70
-
71
-
72
- def parse
73
- recs = []
74
- last_rec = nil
75
- headings = [] ## headings stack
76
-
77
- OutlineReader.parse( @txt ).each do |node|
78
- if [:h1,:h2,:h3,:h4,:h5,:h6].include?( node[0] )
79
- heading_level = node[0][1].to_i
80
- heading = node[1]
81
-
82
- puts "heading #{heading_level} >#{heading}<"
83
-
84
- ## 1) first pop headings if present
85
- while headings.size+1 > heading_level
86
- headings.pop
87
- end
88
-
89
- ## 2) add missing (hierarchy) level if
90
- while headings.size+1 < heading_level
91
- ## todo/fix: issue warning about "skipping" hierarchy level
92
- puts "!!! warn [team reader] - skipping hierarchy level in headings "
93
- headings.push( nil )
94
- end
95
-
96
- if heading =~ /^\?+$/ ## note: use ? or ?? or ?? to reset level to nil
97
- ## keep level empty
98
- else
99
- ## note: if level is 1 assume country for now
100
- if heading_level == 1
101
- ## assume country in heading; allow all "formats" supported by parse e.g.
102
- ## Österreich Austria (at)
103
- ## Österreich Austria
104
- ## Austria
105
- ## Deutschland (de) • Germany
106
- country = catalog.countries.parse( heading )
107
- ## check country code - MUST exist for now!!!!
108
- if country.nil?
109
- puts "!!! error [club reader] - unknown country >#{heading}< - sorry - add country to config to fix"
110
- exit 1
111
- end
112
-
113
- headings.push( country.key )
114
- else
115
- ## quick hack:
116
- ## remove known fill/dummy words incl:
117
- ## Provincia San Juan => San Juan (see argentina, for example)
118
- ##
119
- ## use geo tree long term with alternative names - why? why not?
120
- words = ['Provincia']
121
- words.each { |word| heading = heading.gsub( word, '' ) }
122
- heading = heading.strip
123
-
124
- headings.push( heading )
125
- end
126
-
127
- ## assert that hierarchy level is ok
128
- if headings.size != heading_level
129
- puts "!!! error - headings hierarchy/stack out of order - #{heading.size}<=>#{heading_level}"
130
- exit 1
131
- end
132
- end
133
-
134
- pp headings
135
-
136
- elsif node[0] == :p ## paragraph with (text) lines
137
- lines = node[1]
138
- lines.each do |line|
139
- if line.start_with?( '|' )
140
- ## assume continuation with line of alternative names
141
- ## note: skip leading pipe
142
- values = line[1..-1].split( '|' ) # team names - allow/use pipe(|)
143
-
144
- add_alt_names( last_rec, values ) ## note: use alt_names helper for (re)use
145
-
146
- ## check for b (child) team / club marker e.g.
147
- ## (ii) or ii) or ii.) or (ii.)
148
- ## (b) or b) or b.) or (b.)
149
- ## (2) or 2) or 2.) or (2.)
150
- elsif line =~ B_TEAM_MARKER_RE
151
- line = line.sub( B_TEAM_MARKER_RE, '' ).strip ## remove (leading) b team marker
152
-
153
- ## todo/fix: move into "regular" club branch - (re)use, that is, use the same code
154
- # for both a and b team / club
155
- rec = Club.new
156
- value = line ## note: assume / allow just canonical name for now
157
- ## strip and squish (white)spaces
158
- # e.g. New York FC (2011-) => New York FC (2011-)
159
- value = value.gsub( '$', '' ).strip
160
- .gsub( /[ \t]+/, ' ' )
161
-
162
- rec.name = value # canoncial name (global unique "beautiful/long" name)
163
- rec.add_variants( value ) # auto-add (possible) auto-generated variant names
164
-
165
- ### link a and b team / clubs
166
- ## assume last_rec is the a team
167
- ## todo/fix: check last_rec required NOT null
168
- rec.a = last_rec
169
- last_rec.b = rec
170
-
171
- last_rec = rec
172
- recs << rec
173
-
174
- ## check for address line e.g.
175
- ## use just one style / syntax - why? why not?
176
- ## Fischhofgasse 12 ~ 1100 Wien or
177
- ## Fischhofgasse 12 // 1100 Wien or Fischhofgasse 12 /// 1100 Wien
178
- ## Fischhofgasse 12 ++ 1100 Wien or Fischhofgasse 12 +++ 1100 Wien
179
- elsif line =~ ADDR_MARKER_RE
180
- # note skip for now!!!
181
- # todo/fix: add support for address line!!!
182
- puts " skipping address line for now >#{line}<"
183
- else
184
- values = line.split( ',' )
185
-
186
- rec = Club.new
187
-
188
- col = values.shift ## get first item
189
- ## note: allow optional alt names for convenience with required canoncial name
190
- names = col.split( '|' ) # team names - allow/use pipe(|)
191
- value = names[0] ## canonical name
192
- alt_names = names[1..-1] ## optional (inline) alt names
193
-
194
- ## strip and squish (white)spaces
195
- # e.g. New York FC (2011-) => New York FC (2011-)
196
- value = value.gsub( '$', '' ).strip
197
- .gsub( /[ \t]+/, ' ' )
198
- rec.name = value # canoncial name (global unique "beautiful/long" name)
199
- rec.add_variants( value ) # auto-add (possible) auto-generated variant names
200
-
201
- ## note: add optional (inline) alternate names if present
202
- add_alt_names( rec, alt_names ) if alt_names.size > 0
203
-
204
- ## note:
205
- ## check/todo!!!!!!!!!!!!!!!!!-
206
- ## strip year if to present e.g. (2011-)
207
- ##
208
- ## do NOT strip for defunct / historic clubs e.g.
209
- ## (1899-1910)
210
- ## or (-1914) or (-2011) etc.
211
-
212
- ###
213
- ## todo: move year out of canonical team name - why? why not?
214
-
215
- ## check if canonical name include (2011-) or similar in name
216
- ## if yes, remove (2011-) and add to (alt) names
217
- ## e.g. New York FC (2011) => New York FC
218
- if rec.name =~ /\(.+?\)/ ## note: use non-greedy (?) match
219
- name = rec.name.gsub( /\(.+?\)/, '' ).strip
220
-
221
- if rec.name =~ /\(([0-9]{4})-\)/ ## e.g. (2014-)
222
- rec.year = $1.to_i
223
- elsif rec.name =~ /\(-([0-9]{4})\)/ ## e.g. (-2014)
224
- rec.year_end = $1.to_i
225
- elsif rec.name =~ /\(([0-9]{4})-([0-9]{4})\)/ ## e.g. (2011-2014)
226
- rec.year = $1.to_i
227
- rec.year_end = $2.to_i
228
- else
229
- ## todo/check: warn about unknown year format
230
- end
231
- end
232
-
233
- ## todo/check - check for unknown format values
234
- ## e.g. too many values, duplicate years, etc.
235
- ## check for overwritting, etc.
236
- while values.size > 0
237
- value = values.shift
238
- ## strip and squish (white)spaces
239
- # e.g. León › Guanajuato => León › Guanajuato
240
- value = value.strip.gsub( /[ \t]+/, ' ' )
241
- if value =~/^\d{4}$/ # e.g 1904
242
- ## todo/check: issue warning if year is already set!!!!!!!
243
- if rec.year
244
- puts "!!! error - year already set to #{rec.year} - CANNOT overwrite with #{value}:"
245
- pp rec
246
- exit 1
247
- end
248
- rec.year = value.to_i
249
- elsif value.start_with?( '@' ) # e.g. @ Anfield
250
- ## cut-off leading @ and spaces
251
- rec.ground = value[1..-1].strip
252
- else
253
- ## assume city / geo tree
254
- ## split into geo tree
255
- geos = split_geo( value )
256
- city = geos[0]
257
- ## check for "embedded" district e.g. London (Fulham) or Hamburg (St. Pauli) etc.
258
- if city =~ /\((.+?)\)/ ## note: use non-greedy (?) match
259
- rec.district = $1.strip
260
- city = city.gsub( /\(.+?\)/, '' ).strip
261
- end
262
- rec.city = city
263
-
264
- if geos.size > 1
265
- ## cut-off city and keep the rest (of geo tree)
266
- rec.geos = geos[1..-1]
267
- end
268
- end
269
- end ## while values
270
-
271
-
272
- ###############
273
- ## use headings text for geo tree
274
-
275
- ## 1) add country if present
276
- if headings.size > 0 && headings[0]
277
- country = catalog.countries.find( headings[0] )
278
- rec.country = country
279
- else
280
- ## make it an error - why? why not?
281
- puts "!!! error - country missing in headings hierarchy - sorry - add to quicklist"
282
- exit 1
283
- end
284
-
285
- ## 2) check geo tree with headings hierarchy
286
- if headings.size > 1 && headings[1]
287
- geos = split_geo( headings[1] )
288
- if rec.geos
289
- if rec.geos[0] != geos[0]
290
- puts "!!! error - geo tree - headings mismatch >#{rec.geos[0]}< <=> >#{geos[0]}<"
291
- exit 1
292
- end
293
- if rec.geos[1] && rec.geos[1] != geos[1] ## check optional 2nd level too
294
- puts "!!! error - geo tree - headings mismatch >#{rec.geos[1]}< <=> >#{geos[1]}<"
295
- exit 1
296
- end
297
- else
298
- ## add missing region (state/province) from headings hierarchy
299
- rec.geos = geos
300
- end
301
- end
302
-
303
- last_rec = rec
304
-
305
-
306
- ### todo/fix:
307
- ## auto-add alt name with dots stripped - why? why not?
308
- ## e.g. D.C. United => DC United
309
- ## e.g. Liverpool F.C. => Liverpool FC
310
- ## e.g. St. Albin => St Albin etc.
311
- ## e.g. 1. FC Köln => 1 FC Köln -- make special case for 1. - why? why not?
312
-
313
- ##
314
- ## todo/fix: unify mapping entries
315
- ## always lowercase !!!! (case insensitive)
316
- ## always strip (2011-) !!!
317
- ## always strip dots (e.g. St., F.C, etc.)
318
-
319
- recs << rec
320
- end
321
- end # each line (in paragraph)
322
- else
323
- puts "** !!! ERROR !!! [club reader] - unknown line type:"
324
- pp node
325
- exit 1
326
- end
327
- end
328
-
329
- recs
330
- end # method read
331
-
332
- #######################
333
- ### helpers
334
- def split_geo( str )
335
- ## assume city / geo tree
336
- ## strip and squish (white)spaces
337
- # e.g. León › Guanajuato => León › Guanajuato
338
- str = str.strip.gsub( /[ \t]+/, ' ' )
339
-
340
- ## split into geo tree
341
- geos = str.split( /[<>‹›]/ ) ## note: allow > < or › ‹
342
- geos = geos.map { |geo| geo.strip } ## remove all whitespaces
343
- geos
344
- end
345
-
346
- end # class ClubReader
347
-
348
-
349
- end ## module Import
350
- end ## module SportDb
1
+
2
+ module SportDb
3
+ module Import
4
+
5
+
6
+ class ClubReader
7
+
8
+ def world() Import.world; end
9
+
10
+
11
+
12
+ def self.read( path ) ## use - rename to read_file or from_file etc. - why? why not?
13
+ txt = File.open( path, 'r:utf-8' ) { |f| f.read }
14
+ parse( txt )
15
+ end
16
+
17
+ def self.parse( txt )
18
+ new( txt ).parse
19
+ end
20
+
21
+ def initialize( txt )
22
+ @txt = txt
23
+ end
24
+
25
+ ## pattern for b (child) team / club marker e.g.
26
+ ## (ii) or ii) or ii.) or (ii.) or (II)
27
+ ## (b) or b) or b.) or (b.) or (B)
28
+ ## (2) or 2) or 2.) or (2.)
29
+ B_TEAM_MARKER_RE = %r{^ \(? # optional opening bracket
30
+ (?: ii|b|2 )
31
+ \.? # optional dot - keep and allow dot - why? why not?
32
+ \) # required closing bracket
33
+ }xi ## note: add case-insensitive (e.g. II/ii or B/b)
34
+
35
+ ## pattern for checking for address line e.g.
36
+ ## use just one style / syntax - why? why not?
37
+ ## Fischhofgasse 12 ~ 1100 Wien or
38
+ ## Fischhofgasse 12 // 1100 Wien or Fischhofgasse 12 /// 1100 Wien
39
+ ## Fischhofgasse 12 ++ 1100 Wien or Fischhofgasse 12 +++ 1100 Wien
40
+ ADDR_MARKER_RE = %r{ (?: ^|[ ] ) # space or beginning of line
41
+ (?: ~ | /{2,} | \+{2,} )
42
+ (?: [ ]|$) # space or end of line
43
+ }x
44
+
45
+
46
+
47
+ def parse
48
+ recs = []
49
+ last_rec = nil
50
+ headings = [] ## headings stack
51
+
52
+ OutlineReader.parse( @txt ).each do |node|
53
+ if [:h1,:h2,:h3,:h4,:h5,:h6].include?( node[0] )
54
+ heading_level = node[0][1].to_i
55
+ heading = node[1]
56
+
57
+ puts "heading #{heading_level} >#{heading}<"
58
+
59
+ ## 1) first pop headings if present
60
+ while headings.size+1 > heading_level
61
+ headings.pop
62
+ end
63
+
64
+ ## 2) add missing (hierarchy) level(s) if needed
65
+ while headings.size+1 < heading_level
66
+ ## todo/fix: issue warning about "skipping" hierarchy level
67
+ puts "!!! warn [team reader] - skipping hierarchy level in headings "
68
+ headings.push( nil )
69
+ end
70
+
71
+ if heading =~ /^\?+$/ ## note: use ? or ?? or ??? to reset level to nil
72
+ ## keep level empty
73
+ else
74
+ ## note: if level is 1 assume country for now
75
+ if heading_level == 1
76
+ ## assume country in heading; allow all "formats" supported by parse e.g.
77
+ ## Österreich Austria (at)
78
+ ## Österreich Austria
79
+ ## Austria
80
+ ## Deutschland (de) • Germany
81
+ country = world.countries.parse( heading )
82
+ ## check country code - MUST exist for now!!!!
83
+ if country.nil?
84
+ puts "!!! error [club reader] - unknown country >#{heading}< - sorry - add country to config to fix"
85
+ exit 1
86
+ end
87
+
88
+ headings.push( country.key )
89
+ else
90
+ ## quick hack:
91
+ ## remove known fill/dummy words incl:
92
+ ## Provincia San Juan => San Juan (see argentina, for example)
93
+ ##
94
+ ## use geo tree long term with alternative names - why? why not?
95
+ words = ['Provincia']
96
+ words.each { |word| heading = heading.gsub( word, '' ) }
97
+ heading = heading.strip
98
+
99
+ headings.push( heading )
100
+ end
101
+
102
+ ## assert that hierarchy level is ok
103
+ if headings.size != heading_level
104
+ puts "!!! error - headings hierarchy/stack out of order - #{heading.size}<=>#{heading_level}"
105
+ exit 1
106
+ end
107
+ end
108
+
109
+ pp headings
110
+
111
+ elsif node[0] == :p ## paragraph with (text) lines
112
+ lines = node[1]
113
+ lines.each do |line|
114
+ if line.start_with?( '|' )
115
+ ## assume continuation with line of alternative names
116
+ ## note: skip leading pipe
117
+ values = line[1..-1].split( '|' ) # team names - allow/use pipe(|)
118
+ values = values.map {|value| _norm(value) } ## squish/strip etc.
119
+
120
+ last_rec.alt_names += values
121
+
122
+ ## check for b (child) team / club marker e.g.
123
+ ## (ii) or ii) or ii.) or (ii.)
124
+ ## (b) or b) or b.) or (b.)
125
+ ## (2) or 2) or 2.) or (2.)
126
+ elsif line =~ B_TEAM_MARKER_RE
127
+ line = line.sub( B_TEAM_MARKER_RE, '' ).strip ## remove (leading) b team marker
128
+
129
+ ## todo/fix: move into "regular" club branch - (re)use, that is, use the same code
130
+ # for both a and b team / club
131
+ rec = Club.new
132
+ ## note: assume / allow just canonical name for now
133
+ ## strip and squish (white)spaces
134
+ # e.g. New York FC (2011-) => New York FC (2011-)
135
+ rec.name = _norm( line ) # canonical name (global unique "beautiful/long" name)
136
+
137
+ ### link a and b team / clubs
138
+ ## assume last_rec is the a team
139
+ ## todo/fix: check last_rec required NOT null
140
+ rec.a = last_rec
141
+ last_rec.b = rec
142
+
143
+ last_rec = rec
144
+ recs << rec
145
+
146
+ ## check for address line e.g.
147
+ ## use just one style / syntax - why? why not?
148
+ ## Fischhofgasse 12 ~ 1100 Wien or
149
+ ## Fischhofgasse 12 // 1100 Wien or Fischhofgasse 12 /// 1100 Wien
150
+ ## Fischhofgasse 12 ++ 1100 Wien or Fischhofgasse 12 +++ 1100 Wien
151
+ elsif line =~ ADDR_MARKER_RE
152
+ last_rec.address = _squish( line )
153
+ else
154
+ values = line.split( ',' )
155
+
156
+ rec = Club.new
157
+
158
+ col = values.shift ## get first item
159
+ ## note: allow optional alt names for convenience with required canonical name
160
+ names = col.split( '|' ) # team names - allow/use pipe(|)
161
+ names = names.map {|name| _norm(name) } ## squish/strip etc.
162
+
163
+ value = names[0] ## canonical name
164
+ alt_names = names[1..-1] ## optional (inline) alt names
165
+
166
+ rec.name = value # canonical name (global unique "beautiful/long" name)
167
+ ## note: add optional (inline) alternate names if present
168
+ rec.alt_names += alt_names if alt_names.size > 0
169
+
170
+ ## note:
171
+ ## check/todo!!!!!!!!!!!!!!!!!-
172
+ ## strip year if to present e.g. (2011-)
173
+ ##
174
+ ## do NOT strip for defunct / historic clubs e.g.
175
+ ## (1899-1910)
176
+ ## or (-1914) or (-2011) etc.
177
+
178
+ ###
179
+ ## todo: move year out of canonical team name - why? why not?
180
+
181
+ ## check if canonical name include (2011-) or similar in name
182
+ ## if yes, remove (2011-) and add to (alt) names
183
+ ## e.g. New York FC (2011) => New York FC
184
+ if rec.name =~ /\(.+?\)/ ## note: use non-greedy (?) match
185
+ name = rec.name.gsub( /\(.+?\)/, '' ).strip
186
+
187
+ if rec.name =~ /\(([0-9]{4})-\)/ ## e.g. (2014-)
188
+ rec.year = $1.to_i
189
+ elsif rec.name =~ /\(-([0-9]{4})\)/ ## e.g. (-2014)
190
+ rec.year_end = $1.to_i
191
+ elsif rec.name =~ /\(([0-9]{4})-([0-9]{4})\)/ ## e.g. (2011-2014)
192
+ rec.year = $1.to_i
193
+ rec.year_end = $2.to_i
194
+ else
195
+ ## todo/check: warn about unknown year format
196
+ end
197
+ end
198
+
199
+ ## todo/check - check for unknown format values
200
+ ## e.g. too many values, duplicate years, etc.
201
+ ## check for overwriting, etc.
202
+
203
+ ## strip and squish (white)spaces
204
+ # e.g. León › Guanajuato => León › Guanajuato
205
+ values = values.map {|value| _squish(value) }
206
+
207
+ while values.size > 0
208
+ value = values.shift
209
+ if value =~/^\d{4}$/ # e.g 1904
210
+ ## todo/check: issue warning if year is already set!!!!!!!
211
+ if rec.year
212
+ puts "!!! error - year already set to #{rec.year} - CANNOT overwrite with #{value}:"
213
+ pp rec
214
+ exit 1
215
+ end
216
+ rec.year = value.to_i
217
+ elsif value.start_with?( '@' ) # e.g. @ Anfield
218
+ ## cut-off leading @ and spaces
219
+ rec.ground = value[1..-1].strip
220
+ else
221
+ ## assume city / geo tree
222
+ ## split into geo tree
223
+ geos = split_geo( value )
224
+ city = geos[0]
225
+ ## check for "embedded" district e.g. London (Fulham) or Hamburg (St. Pauli) etc.
226
+ if city =~ /\((.+?)\)/ ## note: use non-greedy (?) match
227
+ rec.district = $1.strip
228
+ city = city.gsub( /\(.+?\)/, '' ).strip
229
+ end
230
+ rec.city = city
231
+
232
+ if geos.size > 1
233
+ ## cut-off city and keep the rest (of geo tree)
234
+ rec.geos = geos[1..-1]
235
+ end
236
+ end
237
+ end ## while values
238
+
239
+
240
+ ###############
241
+ ## use headings text for geo tree
242
+
243
+ ## 1) add country if present
244
+ if headings.size > 0 && headings[0]
245
+ country = world.countries.find( headings[0] )
246
+ rec.country = country
247
+ else
248
+ ## make it an error - why? why not?
249
+ puts "!!! error - country missing in headings hierarchy - sorry - add to quicklist"
250
+ exit 1
251
+ end
252
+
253
+ ## 2) check geo tree with headings hierarchy
254
+ if headings.size > 1 && headings[1]
255
+ geos = split_geo( headings[1] )
256
+ if rec.geos
257
+ if rec.geos[0] != geos[0]
258
+ puts "!!! error - geo tree - headings mismatch >#{rec.geos[0]}< <=> >#{geos[0]}<"
259
+ exit 1
260
+ end
261
+ if rec.geos[1] && rec.geos[1] != geos[1] ## check optional 2nd level too
262
+ puts "!!! error - geo tree - headings mismatch >#{rec.geos[1]}< <=> >#{geos[1]}<"
263
+ exit 1
264
+ end
265
+ else
266
+ ## add missing region (state/province) from headings hierarchy
267
+ rec.geos = geos
268
+ end
269
+ end
270
+
271
+ last_rec = rec
272
+
273
+ recs << rec
274
+ end
275
+ end # each line (in paragraph)
276
+ else
277
+ puts "** !!! ERROR !!! [club reader] - unknown line type:"
278
+ pp node
279
+ exit 1
280
+ end
281
+ end
282
+
283
+ recs
284
+ end # method parse
285
+
286
+ #######################
287
+ ### helpers
288
+ def split_geo( str )
289
+ ## assume city / geo tree
290
+ ## strip and squish (white)spaces
291
+ # e.g. León › Guanajuato => León › Guanajuato
292
+ str = _squish( str )
293
+
294
+ ## split into geo tree
295
+ geos = str.split( /[<>‹›]/ ) ## note: allow > < or › ‹
296
+ geos = geos.map { |geo| geo.strip } ## remove leading and trailing whitespace
297
+ geos
298
+ end
299
+
300
+
301
+ ## norm(alize) helper - squish (spaces)
302
+ ## and remove dollars ($$$)
303
+ ## and remove leading and trailing spaces
304
+ def _norm( str )
305
+ ## only extra clean-up of dollars for now ($$$)
306
+ _squish( str.gsub( '$', '' ) )
307
+ end
308
+
309
+ def _squish( str )
310
+ str.gsub( /[ \t\u00a0]+/, ' ' ).strip
311
+ end
312
+
313
+
314
+ end # class ClubReader
315
+
316
+
317
+ end ## module Import
318
+ end ## module SportDb