sportdb-formats 1.0.0 → 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Manifest.txt +21 -0
- data/lib/sportdb/formats.rb +63 -0
- data/lib/sportdb/formats/country/country_index.rb +192 -0
- data/lib/sportdb/formats/country/country_reader.rb +122 -0
- data/lib/sportdb/formats/league/league_index.rb +174 -0
- data/lib/sportdb/formats/league/league_outline_reader.rb +141 -0
- data/lib/sportdb/formats/league/league_reader.rb +162 -0
- data/lib/sportdb/formats/team/club_index.rb +336 -0
- data/lib/sportdb/formats/team/club_reader.rb +350 -0
- data/lib/sportdb/formats/team/club_reader_props.rb +75 -0
- data/lib/sportdb/formats/team/national_team_index.rb +114 -0
- data/lib/sportdb/formats/team/team_index.rb +43 -0
- data/lib/sportdb/formats/team/wiki_reader.rb +108 -0
- data/lib/sportdb/formats/version.rb +1 -1
- data/test/helper.rb +72 -0
- data/test/test_club_index.rb +183 -0
- data/test/test_club_reader.rb +201 -0
- data/test/test_club_reader_props.rb +54 -0
- data/test/test_country_index.rb +63 -0
- data/test/test_country_reader.rb +59 -0
- data/test/test_league_index.rb +157 -0
- data/test/test_league_outline_reader.rb +55 -0
- data/test/test_league_reader.rb +72 -0
- data/test/test_regex.rb +49 -0
- data/test/test_wiki_reader.rb +77 -0
- metadata +22 -1
@@ -0,0 +1,336 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module SportDb
|
4
|
+
module Import
|
5
|
+
|
6
|
+
|
7
|
+
class ClubIndex
|
8
|
+
|
9
|
+
## Build a ready-to-use club index from the datafiles found in path.
##
## Reads all club datafiles (Package#each_clubs) and adds the parsed
## records first; then reads the wiki(pedia) datafiles
## (Package#each_clubs_wiki) and links the page names to the clubs.
##
##   path - passed on to Package.new (a directory or a zip archive
##          according to the original note)
##
## Returns the new index.
def self.build( path )
  pack = Package.new( path )

  club_recs = []
  pack.each_clubs { |entry| club_recs += Club.parse( entry.read ) }

  index = new
  index.add( club_recs )

  ## add wiki(pedia) anchored links
  wiki_recs = []
  pack.each_clubs_wiki { |entry| wiki_recs += WikiReader.parse( entry.read ) }

  pp wiki_recs
  index.add_wiki( wiki_recs )
  index
end
|
31
|
+
|
32
|
+
|
33
|
+
## Shortcut for the Import.catalog lookup.
def catalog
  Import.catalog
end
|
34
|
+
|
35
|
+
## Set up an empty index.
def initialize
  @errors        = []   ## name conflict messages collected by add
  @clubs_by_name = {}   ## normalized name => array of club records
  @clubs         = {}   ## canonical name  => club record
end
|
40
|
+
|
41
|
+
attr_reader :errors
|
42
|
+
## Returns true if at least one message got recorded in @errors.
def errors?
  !@errors.empty?
end
|
43
|
+
|
44
|
+
## Returns the raw lookup hash (normalized name => array of club records).
## todo/check: rename to index or something - why? why not?
def mappings
  @clubs_by_name
end
|
45
|
+
## Returns all club records (the values of the canonical name hash).
def clubs
  @clubs.values
end
alias all clubs   ## use ActiveRecord-like alias for clubs
|
47
|
+
|
48
|
+
|
49
|
+
## helpers from club - use a helper module for includes - why? why not?
|
50
|
+
include NameHelper
|
51
|
+
## incl. strip_year( name )
|
52
|
+
## has_year?( name)
|
53
|
+
## strip_lang( name )
|
54
|
+
## normalize( name )
|
55
|
+
|
56
|
+
## Strip the disambiguation qualifier from a wikipedia page name.
## todo/check: rename to strip_wikipedia_en - why? why not?
##
## Only two kinds of bracketed qualifiers get removed (for now):
##   - starting with the digit 1 or 2 (assuming a year)
##       e.g. FC Wacker Innsbruck (2002) => FC Wacker Innsbruck
##   - starting with foot (assuming football ...)
##       e.g. Willem II (football club)  => Willem II
##
## All others are kept e.g. América Futebol Clube (MG);
## add more "special" cases on demand only.
##
## Returns a new (stripped) string.
def strip_wiki( name )
  qualifiers = [
    /\([12][^\)]+?\)/,      ## (2002), (1946-2001), ...
    /\(foot[^\)]+?\)/,      ## (football club), (footballer), ...
  ]

  qualifiers.reduce( name ) do |stripped, qualifier|
    stripped.gsub( qualifier, '' ).strip
  end
end
|
69
|
+
|
70
|
+
## Link wiki(pedia) records to the already indexed clubs.
##
##   rec_or_recs - a single wiki record or an array of records; every
##                 record must respond to name and country
##
## For every record the page name gets stripped (see strip_wiki) and is
## looked up with match_by in the country of the record; the one and only
## matching club gets its wikipedia attribute set to the (unstripped)
## page name.
##
## Prints an error and calls exit( 1 ) if no club or more than one club
## matches.
def add_wiki( rec_or_recs )
  recs = rec_or_recs
  recs = [recs]  unless recs.is_a?( Array )    ## wrap (single) rec in array

  recs.each do |wiki|
    page_name = strip_wiki( wiki.name )
    matches   = match_by( name: page_name, country: wiki.country )

    if matches.nil?
      puts "** !!! ERROR !!! - no matching club found for wiki(pedia) name >#{page_name}, #{wiki.country.name} (#{wiki.country.key})<; sorry - to fix add name to clubs"
      exit 1
    elsif matches.size > 1
      puts "** !!! ERROR !!! - too many (greater than one) matching clubs found for wiki(pedia) name >#{page_name}, #{wiki.country.name} (#{wiki.country.key})<"
      pp matches
      exit 1
    end

    matches[0].wikipedia = wiki.name
  end
end # method add_wiki
|
97
|
+
|
98
|
+
|
99
|
+
## Add club record(s) incl. all alternate names to the index.
##
##   rec_or_recs - a single club record or an array of records; every
##                 record must respond to name, alt_names, alt_names_auto
##                 and country
##
## Step 1 registers the canonical name (must be unique) and step 2 maps
## every name (canonical + alt names + year-stripped + auto-generated)
## in its normalized form to the record.
##
## Prints an error and calls exit( 1 ) on a duplicate canonical name or on
## duplicates in the "hand-typed" names; duplicates caused by auto names
## and name conflicts between records only print a warning (conflicts get
## recorded in errors too).
def add( rec_or_recs )
  recs = rec_or_recs
  recs = [recs]  unless recs.is_a?( Array )    ## wrap (single) rec in array

  recs.each do |rec|
    ### step 1) add canonical name
    known = @clubs[ rec.name ]
    if known
      puts "** !!! ERROR !!! - (canonical) name conflict - duplicate - >#{rec.name}< will overwrite >#{known.name}<:"
      pp known
      pp rec
      exit 1
    end
    @clubs[ rec.name ] = rec

    ## step 2) add all names (canonical name + alt names + alt names (auto))
    names = [rec.name] + rec.alt_names

    ## check "hand-typed" names for year(s) e.g. (1887-1911), (-2013),
    ##   (1946-2001,2013-) etc. and auto-add the name without the year
    names += names.select { |name| has_year?( name ) }
                  .map    { |name| strip_year( name ) }

    ## check for duplicates - simple check for now - fix/improve
    ## todo/fix: (auto)remove duplicates - why? why not?
    dups = names.size - names.uniq.size
    if dups != 0
      puts "** !!! ERROR !!! - #{dups} duplicate name(s):"
      pp names
      pp rec
      exit 1
    end

    ## check with auto-names just warn for now and do not exit
    names += rec.alt_names_auto
    dups = names.size - names.uniq.size
    if dups != 0
      puts "** !!! WARN !!! - #{dups} duplicate name(s):"
      pp names
      pp rec
    end

    names.each do |name|
      ## check lang codes e.g. [en], [fr], etc.
      ## todo/check/fix: move strip_lang up in the chain - check for duplicates (e.g. only lang code marker different etc.) - why? why not?
      name   = strip_lang( name )
      norm   = normalize( name )
      bucket = @clubs_by_name[ norm ]

      unless bucket     ## first record for this (normalized) name
        @clubs_by_name[ norm ] = [rec]
        next
      end

      ## note: do NOT include the same club record twice
      conflict = !bucket.include?( rec )
      msg = if conflict
              "** !!! WARN !!! - name conflict/duplicate - >#{name}< will overwrite >#{bucket[0].name}, #{bucket[0].country.name}< with >#{rec.name}, #{rec.country.name}<"
            else
              "** !!! WARN !!! - (norm) name conflict/duplicate for club - >#{name}< normalized to >#{norm}< already included >#{rec.name}, #{rec.country.name}<"
            end
      puts msg
      @errors << msg
      bucket << rec  if conflict
    end
  end
end # method add
|
176
|
+
|
177
|
+
|
178
|
+
## todo/fix/check: use rename to find_canon or find_canonical() or something??
|
179
|
+
## remove (getting used?) - why? why not?
|
180
|
+
## Lookup by canonical name only (no alternate names, no normalization).
## todo/fix: add find alias why? why not?
##
## Returns the club record or nil.
def []( name )
  @clubs[ name ]
end
|
183
|
+
|
184
|
+
|
185
|
+
## todo/fix/check: return empty array if no match!!!
|
186
|
+
## and NOT nil (add || []) - why? why not?
|
187
|
+
## Lookup all clubs for a name (canonical or alternate) by normalized name.
##
## If there is no match the lookup gets retried with the unaccented
## variant (if different) e.g.
##   Preussen Münster => preussenmünster (norm) => preussenmunster (norm+unaccent)
##
## Returns an array of club records or nil if no match.
def match( name )
  norm = normalize( name )
  hit  = @clubs_by_name[ norm ]
  return hit  unless hit.nil?

  plain = unaccent( norm )
  plain == norm ? nil : @clubs_by_name[ plain ]
end
|
200
|
+
|
201
|
+
|
202
|
+
## helper to always convert (possible) country key to existing country record
|
203
|
+
## todo: make private - why? why not?
|
204
|
+
## Convert a (possible) country key to a country record.
##
##   country - a String or Symbol gets looked up via
##             catalog.countries.find; everything else is assumed to be a
##             country record already and gets returned as is
##
## Prints an error and calls exit( 1 ) if the lookup finds no country.
def country( country )
  case country
  when String, Symbol
    ## note: use own "global" countries index setting for ClubIndex - why? why not?
    found = catalog.countries.find( country.to_s )
    if found.nil?
      puts "** !!! ERROR !!! - unknown country >#{country}< - no match found, sorry - add to world/countries.txt in config"
      exit 1
    end
    found
  else
    country    ## (re)use country struct - no need to run lookup again
  end
end
|
217
|
+
|
218
|
+
|
219
|
+
## match - always returns an array (with one or more matches) or nil
|
220
|
+
## Lookup all clubs for a name and (optionally) filter by country.
##
##   name    - club name (see match)
##   country - optional; country key or country record (see country)
##
## Returns an array with one or more club records or nil if no match
## (never an empty array).
def match_by( name:, country: nil )
  candidates = match( name )
  return candidates  unless country

  wanted = country( country )    ## note: always resolved, even if there is no candidate
  return candidates  unless candidates

  in_country = candidates.select { |club| club.country.key == wanted.key }
  in_country.empty? ? nil : in_country
end
|
237
|
+
|
238
|
+
## Find a single club by name in all countries ("global" search).
def find( name )
  find_by( name: name, country: nil )
end

## Same as find but a missing club is fatal (see find_by!).
def find!( name )
  find_by!( name: name, country: nil )
end
|
240
|
+
|
241
|
+
## find - always returns a single record / match or nil
|
242
|
+
## if there is more than one match then find aborts / fails
|
243
|
+
## Find a single club by name (and optional country) - the club MUST exist.
## todo/fix: add international or league flag?
##
## Returns the club record; prints an error and calls exit( 1 ) if
## find_by returns nil.
def find_by!( name:, country: nil )
  found = find_by( name: name, country: country )
  return found  unless found.nil?

  puts "** !!! ERROR - no match for club >#{name}<"
  exit 1
end
|
253
|
+
|
254
|
+
|
255
|
+
## Find a single club by name and (optional) country.
## todo/fix: add international or league flag?
##
##   name    - club name (canonical or alternate)
##   country - optional; country key or country record (see country);
##             without a country all countries get searched
##
## If there is no match in the country passed in, a second country gets
## tried for some known leagues (quick hack) e.g. clubs from Wales playing
## in England.
##
## Returns the club record or nil if no match; prints an error and calls
## exit( 1 ) if there is more than one match.
def find_by( name:, country: nil )
  matches =
    if country
      country = country( country )
      direct  = match_by( name: name, country: country )

      if direct.nil?
        ## (re)try with second country - quick hacks for known leagues
        ##  todo/fix: add league flag to activate!!!  - why? why not
        second_countries = { 'eng' => 'wal',
                             'sco' => 'eng',
                             'ie'  => 'nir',
                             'fr'  => 'mc',
                             'ch'  => 'li',
                             'us'  => 'ca' }
        second = second_countries[ country.key ]
        second ? match_by( name: name, country: second ) : nil
      else
        direct
      end
    else  ## try "global" search - no country passed in
      match( name )
    end

  return nil  if matches.nil?    ## puts "** !!! WARN !!! no match for club >#{name}<"

  if matches.size > 1
    puts "** !!! ERROR - too many matches (#{matches.size}) for club >#{name}<:"
    pp matches
    exit 1
  end

  matches[0]    # bingo; match - assume size == 1
end
|
294
|
+
|
295
|
+
|
296
|
+
|
297
|
+
## Build a name => club record lookup hash from name mappings e.g.
##
##   { 'Arsenal   | Arsenal FC'    => 'Arsenal, ENG',
##     'Liverpool | Liverpool FC'  => 'Liverpool, ENG',
##     'Barcelona'                 => 'Barcelona, ESP',
##     'Valencia'                  => 'Valencia, ESP' }
##
## The value holds the club name and the country (separated by a comma)
## and gets resolved with find_by!; every name in the key (separated by a
## pipe) gets mapped to the club found.
##
## todo/fix: make sure country is present !!!!
def build_mods( mods )
  mods.each_with_object( {} ) do |(club_names, club_line), lookup|
    canonical, country_key = club_line.split( ',' ).map( &:strip )
    club = find_by!( name: canonical, country: country_key )

    club_names.split( '|' ).map( &:strip ).each do |club_name|
      lookup[ club_name ] = club
    end
  end
end
|
322
|
+
|
323
|
+
|
324
|
+
## Debug helper - print all (normalized) names that map to more than
## one club record.
def dump_duplicates
  @clubs_by_name.each do |name, clubs|
    next  unless clubs.size > 1

    puts "#{clubs.size} matching club duplicates for >#{name}<:"
    pp clubs
  end
end
|
332
|
+
end # class ClubIndex
|
333
|
+
|
334
|
+
|
335
|
+
end # module Import
|
336
|
+
end # module SportDb
|
@@ -0,0 +1,350 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
module SportDb
|
5
|
+
module Import
|
6
|
+
|
7
|
+
|
8
|
+
class ClubReader
|
9
|
+
|
10
|
+
## Shortcut for the Import.catalog lookup.
def catalog
  Import.catalog
end
|
11
|
+
|
12
|
+
|
13
|
+
|
14
|
+
## Read the file in path (as utf-8 text) and parse it; see parse.
## use - rename to read_file or from_file etc. - why? why not?
def self.read( path )
  parse( File.read( path, encoding: 'utf-8' ) )
end
|
18
|
+
|
19
|
+
## Parse the text passed in - shortcut for new( txt ).parse.
def self.parse( txt )
  reader = new( txt )
  reader.parse
end
|
22
|
+
|
23
|
+
## Keep the text for parsing later (see parse).
##
##   txt - club datafile text; handed to OutlineReader.parse by parse
def initialize( txt )
  @txt = txt
end
|
26
|
+
|
27
|
+
## pattern for b (child) team / club marker e.g.
|
28
|
+
## (ii) or ii) or ii.) or (ii.) or (II)
|
29
|
+
## (b) or b) or b.) or (b.) or (B)
|
30
|
+
## (2) or 2) or 2.) or (2.)
|
31
|
+
B_TEAM_MARKER_RE = %r{^ \(? # optional opening bracket
|
32
|
+
(?: ii|b|2 )
|
33
|
+
\.? # optional dot - keep and allow dot - why? why not?
|
34
|
+
\) # required closing bracket
|
35
|
+
}xi ## note: add case-insenstive (e.g. II/ii or B/b)
|
36
|
+
|
37
|
+
## pattern for checking for address line e.g.
|
38
|
+
## use just one style / syntax - why? why not?
|
39
|
+
## Fischhofgasse 12 ~ 1100 Wien or
|
40
|
+
## Fischhofgasse 12 // 1100 Wien or Fischhofgasse 12 /// 1100 Wien
|
41
|
+
## Fischhofgasse 12 ++ 1100 Wien or Fischhofgasse 12 +++ 1100 Wien
|
42
|
+
ADDR_MARKER_RE = %r{ (?: ^|[ ] ) # space or beginning of line
|
43
|
+
(?: ~ | /{2,} | \+{2,} )
|
44
|
+
(?: [ ]|$) # space or end of line
|
45
|
+
}x
|
46
|
+
|
47
|
+
|
48
|
+
## Helper for adding alternate names to a club record.
##
##   rec   - club record; must respond to alt_names, alt_names=,
##           add_variants, duplicates? and duplicates
##   names - array of (raw) names
##
## Every name gets cleaned up first: dollar signs ($) get removed, leading
## and trailing spaces stripped and runs of spaces / tabs squished to a
## single space. The cleaned up names get appended to rec.alt_names and
## passed on to rec.add_variants (auto-generated variant names).
##
## Duplicates (as reported by the record) only print a warning for now.
## todo/fix: make it only an error with exit 1
##           if (not normalized) names are the same (not unique/uniq)
##           e.g. don't exit on  A.F.C. == AFC etc.
def add_alt_names( rec, names )
  cleaned = names.map do |name|
    name.delete( '$' ).strip.gsub( /[ \t]+/, ' ' )
  end

  rec.alt_names += cleaned
  rec.add_variants( cleaned )

  return  unless rec.duplicates?

  duplicates = rec.duplicates
  puts "*** !!! WARN !!! - #{duplicates.size} duplicate alt name mapping(s):"
  pp duplicates
  pp rec
end
|
70
|
+
|
71
|
+
|
72
|
+
## Parse the club datafile text passed to the constructor.
##
## Walks all nodes returned by OutlineReader.parse( @txt ):
##
##   - heading nodes (:h1 to :h6) maintain the headings stack; a level 1
##     heading must be a country known to catalog.countries (its key gets
##     stored); the text of deeper levels gets stored as is (with the fill
##     word "Provincia" removed); a heading of question marks only
##     (?, ??, ...) leaves its level empty
##   - paragraph nodes (:p) hold one entry per line:
##       | Alt Name | Alt Name      - more alternate names for the last club
##       (b) Name or ii) Name etc.  - b (child) team of the last club
##       Street 12 ~ 1100 City      - address line (skipped for now)
##       Name | Alt Name, 1904, @ Ground, City › Region   - a new club
##
## Returns an array with all club records (a and b teams) in reading order.
##
## Prints a message and calls exit( 1 ) on an unknown country heading, an
## unknown node type, a founding year set twice, a club without a country
## heading, a geo tree that does not match the headings and on an alternate
## names or b team line without a preceding club.
def parse
  recs     = []
  last_rec = nil
  headings = []     ## headings stack; first entry holds the country key

  OutlineReader.parse( @txt ).each do |node|
    case node[0]
    when :h1, :h2, :h3, :h4, :h5, :h6
      heading_level = node[0][1].to_i     ## e.g. :h2 => 2
      heading       = node[1]

      puts "heading #{heading_level} >#{heading}<"

      ## 1) first pop headings if present
      headings.pop  while headings.size+1 > heading_level

      ## 2) add missing (hierarchy) level if skipped
      while headings.size+1 < heading_level
        ## todo/fix: issue warning about "skipping" hierarchy level
        puts "!!! warn [team reader] - skipping hierarchy level in headings "
        headings.push( nil )
      end

      unless heading =~ /^\?+$/    ## note: use ? or ?? or ??? to keep the level empty
        ## note: if level is 1 assume country for now
        if heading_level == 1
          ## assume country in heading; allow all "formats" supported by parse e.g.
          ##   Österreich • Austria (at)
          ##   Österreich • Austria
          ##   Austria
          ##   Deutschland (de) • Germany
          country = catalog.countries.parse( heading )
          ## check country code - MUST exist for now!!!!
          if country.nil?
            puts "!!! error [club reader] - unknown country >#{heading}< - sorry - add country to config to fix"
            exit 1
          end

          headings.push( country.key )
        else
          ## quick hack:
          ##   remove known fill/dummy words incl:
          ##     Provincia San Juan  =>  San Juan   (see argentina, for example)
          ##
          ##   use geo tree long term with alternative names - why? why not?
          ['Provincia'].each { |word| heading = heading.gsub( word, '' ) }

          headings.push( heading.strip )
        end

        ## assert that hierarchy level is ok
        ##   note: reports the depth of the headings stack
        ##         (and not the length of the heading text)
        if headings.size != heading_level
          puts "!!! error - headings hierarchy/stack out of order - #{headings.size}<=>#{heading_level}"
          exit 1
        end
      end

      pp headings

    when :p     ## paragraph with (text) lines
      node[1].each do |line|
        if line.start_with?( '|' )
          ## assume continuation with line of alternative names
          if last_rec.nil?
            puts "!!! error [club reader] - alternate names line without a preceding club >#{line}<"
            exit 1
          end

          ## note: skip leading pipe; team names - allow/use pipe(|)
          add_alt_names( last_rec, line[1..-1].split( '|' ) )   ## note: use alt_names helper for (re)use

        elsif line =~ B_TEAM_MARKER_RE
          ## b (child) team / club marker e.g.
          ##   (ii) or ii) or ii.) or (ii.)
          ##   (b)  or b)  or b.)  or (b.)
          ##   (2)  or 2)  or 2.)  or (2.)
          if last_rec.nil?
            puts "!!! error [club reader] - b team line without a preceding (a team) club >#{line}<"
            exit 1
          end

          ## todo/fix: move into "regular" club branch - (re)use, that is, use the same code
          #             for both a and b team / club
          rec = Club.new

          ## remove (leading) b team marker
          ##   note: assume / allow just canonical name for now
          name = line.sub( B_TEAM_MARKER_RE, '' ).strip
          ## strip and squish (white)spaces
          #   e.g. New York FC      (2011-)  => New York FC (2011-)
          name = name.gsub( '$', '' ).strip.gsub( /[ \t]+/, ' ' )

          rec.name = name              # canonical name (global unique "beautiful/long" name)
          rec.add_variants( name )     # auto-add (possible) auto-generated variant names

          ### link a and b team / clubs
          ##   assume last_rec is the a team
          rec.a      = last_rec
          last_rec.b = rec

          last_rec = rec
          recs << rec

        elsif line =~ ADDR_MARKER_RE
          ## address line e.g.
          ##   Fischhofgasse 12 ~ 1100 Wien     or
          ##   Fischhofgasse 12 // 1100 Wien    or Fischhofgasse 12 /// 1100 Wien
          ##   Fischhofgasse 12 ++ 1100 Wien    or Fischhofgasse 12 +++ 1100 Wien
          # note skip for now!!!
          # todo/fix: add support for address line!!!
          puts "  skipping address line for now >#{line}<"

        else
          values = line.split( ',' )

          rec = Club.new

          ## note: allow optional alt names for convenience with required canonical name
          names     = values.shift.split( '|' )    # team names - allow/use pipe(|)
          alt_names = names[1..-1]                  ## optional (inline) alt names

          ## strip and squish (white)spaces
          #   e.g. New York FC      (2011-)  => New York FC (2011-)
          name = names[0].gsub( '$', '' ).strip.gsub( /[ \t]+/, ' ' )

          rec.name = name              # canonical name (global unique "beautiful/long" name)
          rec.add_variants( name )     # auto-add (possible) auto-generated variant names

          ## note: add optional (inline) alternate names if present
          add_alt_names( rec, alt_names )  if alt_names.size > 0

          ## check if canonical name includes years e.g. (2011-) or similar
          ##   note: the name itself is kept as is (incl. years)
          ##   todo: move year out of canonical team name - why? why not?
          case rec.name
          when /\(([0-9]{4})-\)/               ## e.g. (2014-)
            rec.year     = $1.to_i
          when /\(-([0-9]{4})\)/               ## e.g. (-2014)
            rec.year_end = $1.to_i
          when /\(([0-9]{4})-([0-9]{4})\)/     ## e.g. (2011-2014)
            rec.year     = $1.to_i
            rec.year_end = $2.to_i
          else
            ## todo/check: warn about unknown year format
          end

          ## todo/check - check for unknown format values
          ##   e.g. too many values, duplicate years, etc.
          ##        check for overwriting, etc.
          values.each do |value|
            ## strip and squish (white)spaces
            #   e.g. León     › Guanajuato  => León › Guanajuato
            value = value.strip.gsub( /[ \t]+/, ' ' )

            if value =~ /^\d{4}$/                # e.g 1904
              if rec.year
                puts "!!! error -  year already set to #{rec.year} - CANNOT overwrite with #{value}:"
                pp rec
                exit 1
              end
              rec.year = value.to_i
            elsif value.start_with?( '@' )       # e.g. @ Anfield
              ## cut-off leading @ and spaces
              rec.ground = value[1..-1].strip
            else
              ## assume city / geo tree
              geos = split_geo( value )
              city = geos[0]
              ## check for "embedded" district e.g. London (Fulham) or Hamburg (St. Pauli) etc.
              if city =~ /\((.+?)\)/    ## note: use non-greedy (?) match
                rec.district = $1.strip
                city = city.gsub( /\(.+?\)/, '' ).strip
              end
              rec.city = city

              ## cut-off city and keep the rest (of geo tree)
              rec.geos = geos[1..-1]  if geos.size > 1
            end
          end

          ###############
          ## use headings text for geo tree

          ## 1) add country if present
          if headings[0]
            rec.country = catalog.countries.find( headings[0] )
          else
            ## make it an error - why? why not?
            puts "!!! error - country missing in headings hierarchy - sorry - add to quicklist"
            exit 1
          end

          ## 2) check geo tree with headings hierarchy
          if headings[1]
            geos = split_geo( headings[1] )
            if rec.geos
              if rec.geos[0] != geos[0]
                puts "!!! error - geo tree - headings mismatch >#{rec.geos[0]}< <=> >#{geos[0]}<"
                exit 1
              end
              if rec.geos[1] && rec.geos[1] != geos[1]    ## check optional 2nd level too
                puts "!!! error - geo tree - headings mismatch >#{rec.geos[1]}< <=> >#{geos[1]}<"
                exit 1
              end
            else
              ## add missing region (state/province) from headings hierarchy
              rec.geos = geos
            end
          end

          last_rec = rec

          ### todo/fix:
          ##  auto-add alt name with dots stripped - why? why not?
          ##    e.g. D.C. United  => DC United
          ##    e.g. Liverpool F.C. => Liverpool FC
          ##    e.g. St. Albin  => St Albin etc.
          ##    e.g. 1. FC Köln => 1 FC Köln  -- make special case for 1. - why? why not?
          ##
          ## todo/fix: unify mapping entries
          ##   always lowercase !!!!  (case insensitive)
          ##   always strip (2011-) !!!
          ##   always strip dots (e.g. St., F.C, etc.)

          recs << rec
        end
      end  # each line (in paragraph)
    else
      puts "** !!! ERROR !!! [club reader] - unknown line type:"
      pp node
      exit 1
    end
  end

  recs
end  # method parse
|
331
|
+
|
332
|
+
#######################
|
333
|
+
### helpers
|
334
|
+
## Helper - split a city / geo tree text into its levels.
##
## Leading and trailing spaces get stripped, runs of spaces / tabs get
## squished to a single space and the text gets split on > < or › ‹
##   e.g. León     › Guanajuato  => ["León", "Guanajuato"]
##
## Returns an array of (stripped) strings.
def split_geo( str )
  str.strip
     .gsub( /[ \t]+/, ' ' )
     .split( /[<>‹›]/ )      ## note: allow > < or › ‹
     .map( &:strip )
end
|
345
|
+
|
346
|
+
end # class ClubReader
|
347
|
+
|
348
|
+
|
349
|
+
end ## module Import
|
350
|
+
end ## module SportDb
|