sportdb-formats 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Manifest.txt +21 -0
- data/lib/sportdb/formats.rb +63 -0
- data/lib/sportdb/formats/country/country_index.rb +192 -0
- data/lib/sportdb/formats/country/country_reader.rb +122 -0
- data/lib/sportdb/formats/league/league_index.rb +174 -0
- data/lib/sportdb/formats/league/league_outline_reader.rb +141 -0
- data/lib/sportdb/formats/league/league_reader.rb +162 -0
- data/lib/sportdb/formats/team/club_index.rb +336 -0
- data/lib/sportdb/formats/team/club_reader.rb +350 -0
- data/lib/sportdb/formats/team/club_reader_props.rb +75 -0
- data/lib/sportdb/formats/team/national_team_index.rb +114 -0
- data/lib/sportdb/formats/team/team_index.rb +43 -0
- data/lib/sportdb/formats/team/wiki_reader.rb +108 -0
- data/lib/sportdb/formats/version.rb +1 -1
- data/test/helper.rb +72 -0
- data/test/test_club_index.rb +183 -0
- data/test/test_club_reader.rb +201 -0
- data/test/test_club_reader_props.rb +54 -0
- data/test/test_country_index.rb +63 -0
- data/test/test_country_reader.rb +59 -0
- data/test/test_league_index.rb +157 -0
- data/test/test_league_outline_reader.rb +55 -0
- data/test/test_league_reader.rb +72 -0
- data/test/test_regex.rb +49 -0
- data/test/test_wiki_reader.rb +77 -0
- metadata +22 -1
@@ -0,0 +1,336 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module SportDb
|
4
|
+
module Import
|
5
|
+
|
6
|
+
|
7
|
+
class ClubIndex
|
8
|
+
|
9
|
+
## Build a club index from all club datafiles found in a package.
##
## path - passed as-is to Package.new (directory or zip archive)
##
## All clubs entries are parsed with Club.parse and added to a fresh index;
## afterwards all clubs wiki entries are parsed with WikiReader.parse and
## linked to the already added clubs via add_wiki.
##
## Returns the populated index.
def self.build( path )
  pack = Package.new( path )   ## lets us use directory or zip archive

  recs = []
  pack.each_clubs do |entry|
    recs += Club.parse( entry.read )
  end

  clubs = new
  clubs.add( recs )

  ## add wiki(pedia) anchored links
  recs = []
  pack.each_clubs_wiki do |entry|
    recs += WikiReader.parse( entry.read )
  end

  ## note: the wiki records are no longer dumped (pp) to stdout here;
  ##        that was leftover debug output
  clubs.add_wiki( recs )
  clubs
end
|
31
|
+
|
32
|
+
|
33
|
+
## shortcut for the import catalog (returns Import.catalog)
def catalog
  Import.catalog
end
|
34
|
+
|
35
|
+
## Start with an empty index.
def initialize
  @errors        = []   ## conflict / duplicate messages collected by add
  @clubs_by_name = {}   ## (normalized) name => array of club records
  @clubs         = {}   ## clubs (indexed) by canonical name
end
|
40
|
+
|
41
|
+
attr_reader :errors
|
42
|
+
## true if at least one error message got collected
def errors?
  !@errors.empty?
end
|
43
|
+
|
44
|
+
## returns the (normalized) name to club records mapping hash
## todo/check: rename to index or something - why? why not?
def mappings
  @clubs_by_name
end
|
45
|
+
## returns all club records as an array (the values of the canonical name hash)
def clubs
  @clubs.values
end
|
46
|
+
alias_method :all, :clubs ## use ActiveRecord-like alias for clubs
|
47
|
+
|
48
|
+
|
49
|
+
## helpers from club - use a helper module for includes - why? why not?
|
50
|
+
include NameHelper
|
51
|
+
## incl. strip_year( name )
|
52
|
+
## has_year?( name)
|
53
|
+
## strip_lang( name )
|
54
|
+
## normalize( name )
|
55
|
+
|
56
|
+
## Strip the disambiguation qualifier from a wikipedia page name if present.
## todo/check: rename to strip_wikipedia_en - why? why not?
##
## note: only year and foot... qualifiers get removed for now e.g.
##    FC Wacker Innsbruck (2002)  => FC Wacker Innsbruck
##    Willem II (football club)   => Willem II
##
##  all other qualifiers are kept e.g.
##    América Futebol Clube (MG)
##
##  only add more "special" cases on demand (that is, if we find more)
def strip_wiki( name )
  qualifiers = [
    /\([12][^\)]+?\)/,   ## starting with a digit 1 or 2 (assuming year)
    /\(foot[^\)]+?\)/,   ## starting with foot (assuming football ...)
  ]

  ## remove one qualifier kind after the other; strip after every pass
  qualifiers.reduce( name ) { |str, qualifier| str.gsub( qualifier, '' ).strip }
end
|
69
|
+
|
70
|
+
## Add wiki(pedia) club record(s) / links.
##
## rec_or_recs - a single wiki record or an array of wiki records;
##               every record must respond to name and country
##
## The wikipedia page name gets stripped of its qualifier (see strip_wiki) e.g.
##    FC Wacker Innsbruck (2002)  => FC Wacker Innsbruck
##    Willem II (football club)   => Willem II
## and is then looked up with match_by (name plus country).
## Exactly one club must match; the (unstripped) page name is stored
## in the wikipedia attribute of that club.
##
## note: prints an error and exits (exit 1) on no match or on more than one match
def add_wiki( rec_or_recs )
  ## note: keep the is_a? check - do NOT use Array() here;
  ##        Array() would split up struct-like records into its members
  recs = rec_or_recs
  recs = [recs]   unless recs.is_a?( Array )   ## wrap (single) rec in array

  recs.each do |rec|
    name    = strip_wiki( rec.name )
    matches = match_by( name: name, country: rec.country )

    if matches.nil?
      puts "** !!! ERROR !!! - no matching club found for wiki(pedia) name >#{name}, #{rec.country.name} (#{rec.country.key})<; sorry - to fix add name to clubs"
      exit 1
    end

    if matches.size > 1
      puts "** !!! ERROR !!! - too many (greater than one) matching clubs found for wiki(pedia) name >#{name}, #{rec.country.name} (#{rec.country.key})<"
      pp matches
      exit 1
    end

    matches[0].wikipedia = rec.name
  end
end  # method add_wiki
|
97
|
+
|
98
|
+
|
99
|
+
## Add club record(s) incl. all (alternative) names to the index.
##
## rec_or_recs - a single club record or an array of club records;
##               every record must respond to name, alt_names,
##               alt_names_auto and country
##
## For every record:
##   1) the record gets registered by its canonical name;
##      a duplicate canonical name is fatal (prints an error and exits)
##   2) the canonical name, the alt names, the year-stripped variants and
##      the auto-generated alt names get registered by its normalized form;
##      duplicates in the hand-typed names are fatal (exit 1),
##      duplicates caused by auto-generated names only get a warning
##
## note: if a normalized name is already taken by another club
##        the club gets appended (nothing gets overwritten, whatever the
##        warning says) and the warning gets recorded in errors too
def add( rec_or_recs )
  ## note: keep the is_a? check - do NOT use Array() here;
  ##        Array() would split up struct-like records into its members
  recs = rec_or_recs.is_a?( Array ) ? rec_or_recs : [rec_or_recs]   ## wrap (single) rec in array

  recs.each do |rec|
    ### step 1) add canonical name
    old_rec = @clubs[ rec.name ]
    if old_rec
      puts "** !!! ERROR !!! - (canonical) name conflict - duplicate - >#{rec.name}< will overwrite >#{old_rec.name}<:"
      pp old_rec
      pp rec
      exit 1
    else
      @clubs[ rec.name ] = rec
    end

    ## step 2) add all names (canonical name + alt names + alt names (auto))
    names = [rec.name] + rec.alt_names

    ## check "hand-typed" names for year (auto-add)
    ##   check for year(s) e.g. (1887-1911), (-2013),
    ##                          (1946-2001,2013-) etc.
    names += names.select { |name| has_year?( name ) }
                  .map    { |name| strip_year( name ) }

    ## check for duplicates - simple check for now - fix/improve
    ## todo/fix: (auto)remove duplicates - why? why not?
    count      = names.size
    count_uniq = names.uniq.size
    if count != count_uniq
      puts "** !!! ERROR !!! - #{count-count_uniq} duplicate name(s):"
      pp names
      pp rec
      exit 1
    end

    ## check with auto-names just warn for now and do not exit
    names += rec.alt_names_auto
    count      = names.size
    count_uniq = names.uniq.size
    if count != count_uniq
      puts "** !!! WARN !!! - #{count-count_uniq} duplicate name(s):"
      pp names
      pp rec
    end

    names.each do |raw_name|
      ## check lang codes e.g. [en], [fr], etc.
      ##  todo/check/fix: move strip_lang up in the chain - check for duplicates (e.g. only lang code marker different etc.) - why? why not?
      name     = strip_lang( raw_name )
      norm     = normalize( name )
      alt_recs = @clubs_by_name[ norm ]

      if alt_recs.nil?
        @clubs_by_name[ norm ] = [rec]
      elsif alt_recs.include?( rec )
        ## note: do NOT include duplicate club record
        msg = "** !!! WARN !!! - (norm) name conflict/duplicate for club - >#{name}< normalized to >#{norm}< already included >#{rec.name}, #{rec.country.name}<"
        puts msg
        @errors << msg
      else
        ## another club is using the same (normalized) name - keep both
        msg = "** !!! WARN !!! - name conflict/duplicate - >#{name}< will overwrite >#{alt_recs[0].name}, #{alt_recs[0].country.name}< with >#{rec.name}, #{rec.country.name}<"
        puts msg
        @errors << msg
        alt_recs << rec
      end
    end
  end
end  # method add
|
176
|
+
|
177
|
+
|
178
|
+
## todo/fix/check: use rename to find_canon or find_canonical() or something??
|
179
|
+
## remove (getting used?) - why? why not?
|
180
|
+
## Lookup by canonical name only; returns the club record or nil.
## todo/fix: add find alias why? why not?
def []( name )  @clubs[ name ];  end
|
183
|
+
|
184
|
+
|
185
|
+
## todo/fix/check: return empty array if no match!!!
|
186
|
+
## and NOT nil (add || []) - why? why not?
|
187
|
+
## Lookup club records by (any) name.
##
## The name gets normalized first; if there is no entry for the normalized
## name the lookup gets retried once with the unaccented variant
## (only if different) e.g.
##   Preussen Münster => preussenmünster (norm) => preussenmunster (norm+unaccent)
##
## Returns an array of club records or nil if no match.
def match( name )
  norm = normalize( name )
  hits = @clubs_by_name[ norm ]
  return hits   unless hits.nil?

  ## no match - retry with unaccented variant if different
  plain = unaccent( norm )
  plain == norm ? nil : @clubs_by_name[ plain ]
end
|
200
|
+
|
201
|
+
|
202
|
+
## helper to always convert (possible) country key to existing country record
|
203
|
+
## todo: make private - why? why not?
|
204
|
+
## Convert a (possible) country key to the matching country record.
##
## country - a String or Symbol gets looked up with catalog.countries.find;
##           anything else is assumed to be a country record already
##           and gets returned as-is (no need to run the lookup again)
##
## note: prints an error and exits (exit 1) if the lookup finds no country
def country( country )
  return country   unless country.is_a?( String ) || country.is_a?( Symbol )

  ## note: use own "global" countries index setting for ClubIndex - why? why not?
  rec = catalog.countries.find( country.to_s )
  return rec   unless rec.nil?

  puts "** !!! ERROR !!! - unknown country >#{country}< - no match found, sorry - add to world/countries.txt in config"
  exit 1
end
|
217
|
+
|
218
|
+
|
219
|
+
## match - always returns an array (with one or more matches) or nil
|
220
|
+
## Lookup club records by name and (optional) country.
##
## name    - club name (looked up with match)
## country - optional; a country key (string or symbol) or a country record
##           (gets converted with the country helper)
##
## Returns an array (with one or more matches) or nil.
def match_by( name:, country: nil )
  hits = match( name )
  return hits   unless country

  ## note: the country gets resolved before checking for a match
  ##        so an unknown country key is always reported
  wanted = country( country )

  ## note: match must for now always include name
  return hits   unless hits

  ## filter by country
  in_country = hits.select { |club| club.country.key == wanted.key }
  in_country.empty? ? nil : in_country    ## note: reset to nil if no more matches
end
|
237
|
+
|
238
|
+
## find by name only ("global" search - no country);  see find_by
def find( name )
  find_by( name: name, country: nil )
end
|
239
|
+
## find by name only ("global" search - no country);  see find_by!
def find!( name )
  find_by!( name: name, country: nil )
end
|
240
|
+
|
241
|
+
## find - always returns a single record / match or nil
|
242
|
+
## if there is more than one match then find aborts / fails
|
243
|
+
## Same as find_by but a missing club is fatal.
## todo/fix: add international or league flag?
##
## Returns the single matching club record.
##
## note: prints an error and exits (exit 1) if find_by returns nil
def find_by!( name:, country: nil )
  club = find_by( name: name, country: country )
  return club   unless club.nil?

  puts "** !!! ERROR - no match for club >#{name}<"
  exit 1
end
|
253
|
+
|
254
|
+
|
255
|
+
## Find a single club record by name and (optional) country.
## todo/fix: add international or league flag?
##
## name    - club name
## country - optional; a country key (string or symbol) or a country record
##           (gets converted with the country helper)
##           - country assumes / allows the country key or fifa code for now
##
## With a country the lookup uses match_by; if that finds nothing the lookup
## gets retried once with a second country for some known leagues.
## Without a country the lookup is a "global" search with match.
##
## Returns the club record or nil if no match.
##
## note: prints an error and exits (exit 1) on more than one match
def find_by( name:, country: nil )
  matches =
    if country
      rec  = country( country )
      hits = match_by( name: name, country: rec )

      if hits.nil?
        ## (re)try with second country - quick hacks for known leagues
        ##  todo/fix: add league flag to activate!!!  - why? why not
        second_country = { 'eng' => 'wal',
                           'sco' => 'eng',
                           'ie'  => 'nir',
                           'fr'  => 'mc',
                           'ch'  => 'li',
                           'us'  => 'ca' }[ rec.key ]
        hits = match_by( name: name, country: second_country )  if second_country
      end

      hits
    else   ## try "global" search - no country passed in
      match( name )
    end

  ## note: no match is NOT an error (and there is no warning for now)
  return nil   if matches.nil?

  if matches.size > 1
    puts "** !!! ERROR - too many matches (#{matches.size}) for club >#{name}<:"
    pp matches
    exit 1
  end

  matches[0]    # bingo; match - assume size == 1
end
|
294
|
+
|
295
|
+
|
296
|
+
|
297
|
+
## Build a club name to club record lookup hash from a mods hash.
##
## mods - hash with club name(s) separated by pipe (|) as key and
##        a club line (canonical name, country) as value e.g.
##          { 'Arsenal   | Arsenal FC'    => 'Arsenal, ENG',
##            'Liverpool | Liverpool FC'  => 'Liverpool, ENG',
##            'Barcelona'                 => 'Barcelona, ESP',
##            'Valencia'                  => 'Valencia, ESP' }
##
## Every club line gets resolved with find_by! (fatal if no match);
## returns a hash mapping every (stripped) club name to the club record.
def build_mods( mods )
  mods.each_with_object( {} ) do |(club_names, club_line), lookup|
    ## todo/fix: make sure country is present !!!!
    club_name, country_name = club_line.split( ',' ).map { |value| value.strip }   ## strip all spaces
    club = find_by!( name: club_name, country: country_name )

    club_names.split( '|' ).each do |alt_name|
      lookup[ alt_name.strip ] = club
    end
  end
end
|
322
|
+
|
323
|
+
|
324
|
+
## debug helper - report (print) all names with more than one club record
def dump_duplicates
  @clubs_by_name.each do |name, clubs|
    next   if clubs.size <= 1

    puts "#{clubs.size} matching club duplicates for >#{name}<:"
    pp clubs
  end
end
|
332
|
+
end # class ClubIndex
|
333
|
+
|
334
|
+
|
335
|
+
end # module Import
|
336
|
+
end # module SportDb
|
@@ -0,0 +1,350 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
module SportDb
|
5
|
+
module Import
|
6
|
+
|
7
|
+
|
8
|
+
class ClubReader
|
9
|
+
|
10
|
+
## shortcut for the import catalog (returns Import.catalog)
def catalog
  Import.catalog
end
|
11
|
+
|
12
|
+
|
13
|
+
|
14
|
+
## Read the (utf-8) text file at path and parse it; returns the result of parse.
## use - rename to read_file or from_file etc. - why? why not?
def self.read( path )
  parse( File.read( path, mode: 'r:utf-8' ) )
end
|
18
|
+
|
19
|
+
## Parse the text passed in; shortcut for new( txt ).parse
def self.parse( txt )
  reader = new( txt )
  reader.parse
end
|
22
|
+
|
23
|
+
## txt - the text to parse (kept as-is for the parse method)
def initialize( txt )  @txt = txt;  end
|
26
|
+
|
27
|
+
## pattern for b (child) team / club marker e.g.
|
28
|
+
## (ii) or ii) or ii.) or (ii.) or (II)
|
29
|
+
## (b) or b) or b.) or (b.) or (B)
|
30
|
+
## (2) or 2) or 2.) or (2.)
|
31
|
+
B_TEAM_MARKER_RE = %r{^ \(? # optional opening bracket
|
32
|
+
(?: ii|b|2 )
|
33
|
+
\.? # optional dot - keep and allow dot - why? why not?
|
34
|
+
\) # required closing bracket
|
35
|
+
}xi ## note: add case-insensitive (e.g. II/ii or B/b)
|
36
|
+
|
37
|
+
## pattern for checking for address line e.g.
|
38
|
+
## use just one style / syntax - why? why not?
|
39
|
+
## Fischhofgasse 12 ~ 1100 Wien or
|
40
|
+
## Fischhofgasse 12 // 1100 Wien or Fischhofgasse 12 /// 1100 Wien
|
41
|
+
## Fischhofgasse 12 ++ 1100 Wien or Fischhofgasse 12 +++ 1100 Wien
|
42
|
+
ADDR_MARKER_RE = %r{ (?: ^|[ ] ) # space or beginning of line
|
43
|
+
(?: ~ | /{2,} | \+{2,} )
|
44
|
+
(?: [ ]|$) # space or end of line
|
45
|
+
}x
|
46
|
+
|
47
|
+
|
48
|
+
## Helper for adding alternate names to a club record.
##
## rec   - club record; must respond to alt_names, alt_names=, add_variants,
##         duplicates? and duplicates
## names - array of (raw) alternate names
##
## All names get cleaned up first, that is, dollar signs ($) get removed and
## (white)spaces get stripped and squished e.g.
##    New York FC    (2011-)  => New York FC (2011-)
## The cleaned names get appended to alt_names and passed on to add_variants.
## Duplicates only get reported (printed) for now.
def add_alt_names( rec, names )
  cleaned = names.map do |name|
    name.gsub( '$', '' ).strip.gsub( /[ \t]+/, ' ' )
  end

  rec.alt_names += cleaned
  rec.add_variants( cleaned )   # auto-add (possible) auto-generated variant names

  ## check for duplicates
  if rec.duplicates?
    dups = rec.duplicates
    puts "*** !!! WARN !!! - #{dups.size} duplicate alt name mapping(s):"
    pp dups
    pp rec
    ##
    ##  todo/fix:  make it only an error with exit 1
    ##               if (not normalized) names are the same (not unique/uniq)
    ##                 e.g. don't exit on  A.F.C. == AFC etc.
    ## exit 1
  end
end
|
70
|
+
|
71
|
+
|
72
|
+
## Parse the text (passed in on new) into club records.
##
## The text gets split into outline nodes with OutlineReader.parse;
## every node is an array with the node type first, that is,
##   - headings (:h1 to :h6) build up the geo hierarchy (headings stack);
##     level 1 must be a (known) country,
##     a heading with question marks only (?, ??, etc.) keeps its level empty
##   - paragraphs (:p) hold the club lines; a line is one of
##       - alternative names  - starts with a pipe (|); added to the last club
##       - b (child) team     - starts with a b team marker e.g. (ii), b), 2.);
##                              linked to the last club (a team)
##       - address            - includes an address marker; skipped for now
##       - club               - canonical name (plus optional inline alt names)
##                              followed by comma-separated values, that is,
##                              year (e.g. 1904), ground (e.g. @ Anfield) or
##                              city / geo tree (e.g. León › Guanajuato)
##
## Returns an array of club records (b teams included).
##
## note: prints an error and exits (exit 1) on an unknown country,
##       a missing country heading, a year set twice, a geo tree that does not
##       match the headings, an alternative names or b team line without
##       a preceding club or an unknown node type
def parse
  recs     = []
  last_rec = nil
  headings = []   ## headings stack

  OutlineReader.parse( @txt ).each do |node|
    if [:h1,:h2,:h3,:h4,:h5,:h6].include?( node[0] )
      heading_level = node[0][1].to_i
      heading       = node[1]

      puts "heading #{heading_level} >#{heading}<"

      ## 1) first pop headings if present
      while headings.size+1 > heading_level
        headings.pop
      end

      ## 2) add missing (hierarchy) level if
      while headings.size+1 < heading_level
        ## todo/fix: issue warning about "skipping" hierarchy level
        puts "!!! warn [team reader] - skipping hierarchy level in headings "
        headings.push( nil )
      end

      if heading =~ /^\?+$/    ## note: use ? or ?? or ?? to reset level to nil
        ## keep level empty
      else
        ## note: if level is 1 assume country for now
        if heading_level == 1
          ## assume country in heading; allow all "formats" supported by parse e.g.
          ##   Österreich • Austria (at)
          ##   Österreich • Austria
          ##   Austria
          ##   Deutschland (de) • Germany
          country = catalog.countries.parse( heading )
          ## check country code - MUST exist for now!!!!
          if country.nil?
            puts "!!! error [club reader] - unknown country >#{heading}< - sorry - add country to config to fix"
            exit 1
          end

          headings.push( country.key )
        else
          ## quick hack:
          ##   remove known fill/dummy words incl:
          ##     Provincia San Juan  =>  San Juan   (see argentina, for example)
          ##
          ##   use geo tree long term with alternative names - why? why not?
          words = ['Provincia']
          words.each { |word| heading = heading.gsub( word, '' ) }
          heading = heading.strip

          headings.push( heading )
        end

        ## assert that hierarchy level is ok
        ##   note: report the size of the headings stack
        ##           (and NOT the length of the heading text)
        if headings.size != heading_level
          puts "!!! error - headings hierarchy/stack out of order - #{headings.size}<=>#{heading_level}"
          exit 1
        end
      end

      pp headings

    elsif node[0] == :p    ## paragraph with (text) lines
      lines = node[1]
      lines.each do |line|
        if line.start_with?( '|' )
          ## assume continuation with line of alternative names
          if last_rec.nil?
            puts "!!! error [club reader] - alternative names line without preceding club >#{line}<"
            exit 1
          end

          ## note: skip leading pipe
          values = line[1..-1].split( '|' )   # team names - allow/use pipe(|)

          add_alt_names( last_rec, values )   ## note: use alt_names helper for (re)use

        ## check for b (child) team / club marker e.g.
        ##    (ii) or ii) or ii.) or (ii.)
        ##    (b)  or b)  or b.)  or (b.)
        ##    (2)  or 2)  or 2.)  or (2.)
        elsif line =~ B_TEAM_MARKER_RE
          if last_rec.nil?
            puts "!!! error [club reader] - b team line without preceding (a) team / club >#{line}<"
            exit 1
          end

          line = line.sub( B_TEAM_MARKER_RE, '' ).strip   ## remove (leading) b team marker

          ## todo/fix: move into "regular" club branch - (re)use, that is, use the same code
          #             for both a and b team / club
          rec   = Club.new
          value = line    ## note: assume / allow just canonical name for now
          ## strip and squish (white)spaces
          #   e.g. New York FC    (2011-)  => New York FC (2011-)
          value = value.gsub( '$', '' ).strip
                       .gsub( /[ \t]+/, ' ' )

          rec.name = value            # canonical name (global unique "beautiful/long" name)
          rec.add_variants( value )   # auto-add (possible) auto-generated variant names

          ### link a and b team / clubs
          ##  assume last_rec is the a team
          rec.a      = last_rec
          last_rec.b = rec

          last_rec = rec
          recs << rec

        ## check for address line e.g.
        ##   use just one style / syntax - why? why not?
        ##   Fischhofgasse 12 ~ 1100 Wien or
        ##   Fischhofgasse 12 // 1100 Wien   or Fischhofgasse 12 /// 1100 Wien
        ##   Fischhofgasse 12 ++ 1100 Wien   or Fischhofgasse 12 +++ 1100 Wien
        elsif line =~ ADDR_MARKER_RE
          # note skip for now!!!
          # todo/fix: add support for address line!!!
          puts " skipping address line for now >#{line}<"
        else
          values = line.split( ',' )

          rec = Club.new

          col = values.shift    ## get first item
          ## note: allow optional alt names for convenience with required canonical name
          names     = col.split( '|' )   # team names - allow/use pipe(|)
          value     = names[0]           ## canonical name
          alt_names = names[1..-1]       ## optional (inline) alt names

          ## strip and squish (white)spaces
          #   e.g. New York FC    (2011-)  => New York FC (2011-)
          value = value.gsub( '$', '' ).strip
                       .gsub( /[ \t]+/, ' ' )
          rec.name = value            # canonical name (global unique "beautiful/long" name)
          rec.add_variants( value )   # auto-add (possible) auto-generated variant names

          ## note: add optional (inline) alternate names if present
          add_alt_names( rec, alt_names )  if alt_names.size > 0

          ## note:
          ##   check/todo!!!!!!!!!!!!!!!!!-
          ##    strip year if to present e.g. (2011-)
          ##
          ##    do NOT strip for defunct / historic clubs e.g.
          ##             (1899-1910)
          ##    or       (-1914) or (-2011) etc.

          ###
          ##  todo: move year out of canonical team name - why? why not?

          ## check if canonical name includes (2011-) or similar in name;
          ##   if yes, pick up the year(s)
          ##   note: the name itself is kept as-is for now (nothing gets removed)
          if rec.name =~ /\(.+?\)/   ## note: use non-greedy (?) match
            if rec.name =~ /\(([0-9]{4})-\)/                    ## e.g. (2014-)
              rec.year     = $1.to_i
            elsif rec.name =~ /\(-([0-9]{4})\)/                 ## e.g. (-2014)
              rec.year_end = $1.to_i
            elsif rec.name =~ /\(([0-9]{4})-([0-9]{4})\)/       ## e.g. (2011-2014)
              rec.year     = $1.to_i
              rec.year_end = $2.to_i
            else
              ## todo/check: warn about unknown year format
            end
          end

          ## todo/check - check for unknown format values
          ##    e.g. too many values, duplicate years, etc.
          ##         check for overwriting, etc.
          while values.size > 0
            value = values.shift
            ## strip and squish (white)spaces
            #   e.g. León     › Guanajuato  => León › Guanajuato
            value = value.strip.gsub( /[ \t]+/, ' ' )
            if value =~ /^\d{4}$/    # e.g 1904
              if rec.year
                puts "!!! error - year already set to #{rec.year} - CANNOT overwrite with #{value}:"
                pp rec
                exit 1
              end
              rec.year = value.to_i
            elsif value.start_with?( '@' )   # e.g. @ Anfield
              ## cut-off leading @ and spaces
              rec.ground = value[1..-1].strip
            else
              ## assume city / geo tree
              ## split into geo tree
              geos = split_geo( value )
              city = geos[0]
              ## check for "embedded" district e.g. London (Fulham) or Hamburg (St. Pauli) etc.
              if city =~ /\((.+?)\)/    ## note: use non-greedy (?) match
                rec.district = $1.strip
                city = city.gsub( /\(.+?\)/, '' ).strip
              end
              rec.city = city

              if geos.size > 1
                ## cut-off city and keep the rest (of geo tree)
                rec.geos = geos[1..-1]
              end
            end
          end   ## while values

          ###############
          ## use headings text for geo tree

          ## 1) add country if present
          if headings.size > 0 && headings[0]
            country = catalog.countries.find( headings[0] )
            rec.country = country
          else
            ## make it an error - why? why not?
            puts "!!! error - country missing in headings hierarchy - sorry - add to quicklist"
            exit 1
          end

          ## 2) check geo tree with headings hierarchy
          if headings.size > 1 && headings[1]
            geos = split_geo( headings[1] )
            if rec.geos
              if rec.geos[0] != geos[0]
                puts "!!! error - geo tree - headings mismatch >#{rec.geos[0]}< <=> >#{geos[0]}<"
                exit 1
              end
              if rec.geos[1] && rec.geos[1] != geos[1]   ## check optional 2nd level too
                puts "!!! error - geo tree - headings mismatch >#{rec.geos[1]}< <=> >#{geos[1]}<"
                exit 1
              end
            else
              ## add missing region (state/province) from headings hierarchy
              rec.geos = geos
            end
          end

          last_rec = rec

          ### todo/fix:
          ##  auto-add alt name with dots stripped - why? why not?
          ##  e.g. D.C. United => DC United
          ##  e.g. Liverpool F.C. => Liverpool FC
          ##  e.g. St. Albin => St Albin etc.
          ##  e.g. 1. FC Köln => 1 FC Köln  -- make special case for 1. - why? why not?

          ##
          ## todo/fix: unify mapping entries
          ##   always lowercase !!!! (case insensitive)
          ##   always strip (2011-) !!!
          ##   always strip dots (e.g. St., F.C, etc.)

          recs << rec
        end
      end  # each line (in paragraph)
    else
      puts "** !!! ERROR !!! [club reader] - unknown line type:"
      pp node
      exit 1
    end
  end

  recs
end  # method parse
|
331
|
+
|
332
|
+
#######################
|
333
|
+
### helpers
|
334
|
+
## Split a city / geo tree string into its (stripped) parts.
##
## (White)spaces get stripped and squished first e.g.
##    León     › Guanajuato  => León › Guanajuato
## and then the string gets split on > < or › ‹
##
## Returns an array of strings.
def split_geo( str )
  str.strip
     .gsub( /[ \t]+/, ' ' )
     .split( /[<>‹›]/ )       ## note: allow > < or › ‹
     .map { |geo| geo.strip } ## remove all whitespaces
end
|
345
|
+
|
346
|
+
end # class ClubReader
|
347
|
+
|
348
|
+
|
349
|
+
end ## module Import
|
350
|
+
end ## module SportDb
|