sportdb-formats 1.1.5 → 1.1.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/sportdb/formats/country/country_reader.rb +142 -142
- data/lib/sportdb/formats/datafile.rb +59 -59
- data/lib/sportdb/formats/event/event_index.rb +141 -141
- data/lib/sportdb/formats/event/event_reader.rb +183 -183
- data/lib/sportdb/formats/league/league_outline_reader.rb +1 -0
- data/lib/sportdb/formats/league/league_reader.rb +168 -168
- data/lib/sportdb/formats/match/match_parser_auto_conf.rb +202 -202
- data/lib/sportdb/formats/package.rb +374 -374
- data/lib/sportdb/formats/team/club_index_history.rb +134 -134
- data/lib/sportdb/formats/team/club_reader.rb +350 -350
- data/lib/sportdb/formats/team/club_reader_history.rb +203 -203
- data/lib/sportdb/formats/team/wiki_reader.rb +108 -108
- data/lib/sportdb/formats/version.rb +1 -1
- data/test/test_club_index_history.rb +107 -107
- data/test/test_club_reader.rb +201 -201
- data/test/test_club_reader_history.rb +212 -212
- data/test/test_country_reader.rb +89 -89
- data/test/test_league_outline_reader.rb +55 -55
- data/test/test_league_reader.rb +72 -72
- data/test/test_outline_reader.rb +31 -31
- data/test/test_regex.rb +67 -67
- data/test/test_wiki_reader.rb +77 -77
- metadata +12 -6
@@ -1,183 +1,183 @@
|
|
1
|
-
|
2
|
-
module SportDb
|
3
|
-
module Import
|
4
|
-
|
5
|
-
|
6
|
-
class EventInfo
|
7
|
-
## "high level" info (summary) about event (like a "wikipedia infobox")
|
8
|
-
## use for checking dataset imports; lets you check e.g.
|
9
|
-
## - dates within range
|
10
|
-
## - number of teams e.g. 20
|
11
|
-
## - matches played e.g. 380
|
12
|
-
## - goals scored e.g. 937
|
13
|
-
## etc.
|
14
|
-
|
15
|
-
attr_reader :league,
|
16
|
-
:season,
|
17
|
-
:teams,
|
18
|
-
:matches,
|
19
|
-
:goals,
|
20
|
-
:start_date,
|
21
|
-
:end_date
|
22
|
-
|
23
|
-
def initialize( league:, season:,
|
24
|
-
start_date: nil, end_date: nil,
|
25
|
-
teams: nil,
|
26
|
-
matches: nil,
|
27
|
-
goals: nil )
|
28
|
-
|
29
|
-
@league = league
|
30
|
-
@season = season
|
31
|
-
|
32
|
-
@start_date = start_date
|
33
|
-
@end_date = end_date
|
34
|
-
|
35
|
-
@teams = teams ## todo/check: rename/use teams_count ??
|
36
|
-
@matches = matches ## todo/check: rename/use match_count ??
|
37
|
-
@goals = goals
|
38
|
-
end
|
39
|
-
|
40
|
-
def include?( date )
|
41
|
-
## todo/fix: add options e.g.
|
42
|
-
## - add delta/off_by_one or such?
|
43
|
-
## - add strict (for) only return true if date range (really) defined (no generic auto-rules)
|
44
|
-
|
45
|
-
### note: for now allow off by one error (via timezone/local time errors)
|
46
|
-
## todo/fix: issue warning if off by one!!!!
|
47
|
-
if @start_date && @end_date
|
48
|
-
date >= (@start_date-1) &&
|
49
|
-
date <= (@end_date+1)
|
50
|
-
else
|
51
|
-
if @season.year?
|
52
|
-
# assume generic rule
|
53
|
-
## same year e.g. Jan 1 - Dec 31; always true for now
|
54
|
-
date.year == @season.start_year
|
55
|
-
else
|
56
|
-
# assume generic rule
|
57
|
-
## July 1 - June 30 (Y+1)
|
58
|
-
## - todo/check -start for some countries/leagues in June 1 or August 1 ????
|
59
|
-
date >= Date.new( @season.start_year, 7, 1 ) &&
|
60
|
-
date <= Date.new( @season.end_year, 6, 30 )
|
61
|
-
end
|
62
|
-
end
|
63
|
-
end # method include?
|
64
|
-
alias_method :between?, :include?
|
65
|
-
end # class EventInfo
|
66
|
-
|
67
|
-
|
68
|
-
class EventInfoReader
|
69
|
-
def catalog() Import.catalog; end
|
70
|
-
|
71
|
-
|
72
|
-
def self.read( path )
|
73
|
-
txt = File.open( path, 'r:utf-8') {|f| f.read }
|
74
|
-
new( txt ).parse
|
75
|
-
end
|
76
|
-
|
77
|
-
def self.parse( txt )
|
78
|
-
new( txt ).parse
|
79
|
-
end
|
80
|
-
|
81
|
-
def initialize( txt )
|
82
|
-
@txt = txt
|
83
|
-
end
|
84
|
-
|
85
|
-
def parse
|
86
|
-
recs = []
|
87
|
-
|
88
|
-
parse_csv( @txt ).each do |row|
|
89
|
-
league_col = row['League']
|
90
|
-
season_col = row['Season'] || row['Year']
|
91
|
-
dates_col = row['Dates']
|
92
|
-
|
93
|
-
season = Import::Season.parse( season_col )
|
94
|
-
league = catalog.leagues.find!( league_col )
|
95
|
-
|
96
|
-
|
97
|
-
dates = []
|
98
|
-
if dates_col.nil? || dates_col.empty?
|
99
|
-
## do nothing; no dates - keep dates array empty
|
100
|
-
else
|
101
|
-
## squish spaces
|
102
|
-
dates_col = dates_col.gsub( /[ ]{2,}/, ' ' ) ## squish/fold spaces
|
103
|
-
|
104
|
-
puts "#{league.name} (#{league.key}) | #{season.key} | #{dates_col}"
|
105
|
-
|
106
|
-
### todo/check: check what parts "Aug 15" return ???
|
107
|
-
### short form for "Aug 15 -" - works?
|
108
|
-
|
109
|
-
## todo/fix!!! - check EventInfo.include?
|
110
|
-
## now allow dates with only start_date too!! (WITHOUT end_date)
|
111
|
-
parts = dates_col.split( /[ ]*[–-][ ]*/ )
|
112
|
-
if parts.size == 1
|
113
|
-
pp parts
|
114
|
-
dates << DateFormats.parse( parts[0], start: Date.new( season.start_year, 1, 1 ), lang: 'en' )
|
115
|
-
pp dates
|
116
|
-
elsif parts.size == 2
|
117
|
-
pp parts
|
118
|
-
dates << DateFormats.parse( parts[0], start: Date.new( season.start_year, 1, 1 ), lang: 'en' )
|
119
|
-
dates << DateFormats.parse( parts[1], start: Date.new( season.end_year ? season.end_year : season.start_year, 1, 1 ), lang: 'en' )
|
120
|
-
pp dates
|
121
|
-
|
122
|
-
## assert/check if period is less than 365 days for now
|
123
|
-
diff = dates[1].to_date.jd - dates[0].to_date.jd
|
124
|
-
puts "#{diff}d"
|
125
|
-
if diff > 365
|
126
|
-
puts "!! ERROR - date range / period assertion failed; expected diff < 365 days"
|
127
|
-
exit 1
|
128
|
-
end
|
129
|
-
else
|
130
|
-
puts "!! ERRROR - expected data range / period - one or two dates; got #{parts.size}:"
|
131
|
-
pp dates_col
|
132
|
-
pp parts
|
133
|
-
exit 1
|
134
|
-
end
|
135
|
-
end
|
136
|
-
|
137
|
-
|
138
|
-
teams_col = row['Clubs'] || row['Teams']
|
139
|
-
goals_col = row['Goals']
|
140
|
-
|
141
|
-
## note: remove (and allow) all non-digits e.g. 370 goals, 20 clubs, etc.
|
142
|
-
teams_col = teams_col.gsub( /[^0-9]/, '' ) if teams_col
|
143
|
-
goals_col = goals_col.gsub( /[^0-9]/, '' ) if goals_col
|
144
|
-
|
145
|
-
teams = (teams_col.nil? || teams_col.empty?) ? nil : teams_col.to_i
|
146
|
-
goals = (goals_col.nil? || goals_col.empty?) ? nil : goals_col.to_i
|
147
|
-
|
148
|
-
matches_col = row['Matches']
|
149
|
-
## note: support additions in matches (played) e.g.
|
150
|
-
# 132 + 63 Play-off-Spiele
|
151
|
-
matches_col = matches_col.gsub( /[^0-9+]/, '' ) if matches_col
|
152
|
-
|
153
|
-
matches = if matches_col.nil? || matches_col.empty?
|
154
|
-
nil
|
155
|
-
else
|
156
|
-
if matches_col.index( '+' ) ### check for calculations
|
157
|
-
## note: for now only supports additions
|
158
|
-
matches_col.split( '+' ).reduce( 0 ) do |sum,str|
|
159
|
-
sum + str.to_i
|
160
|
-
end
|
161
|
-
else ## assume single (integer) number
|
162
|
-
matches_col.to_i
|
163
|
-
end
|
164
|
-
end
|
165
|
-
|
166
|
-
rec = EventInfo.new( league: league,
|
167
|
-
season: season,
|
168
|
-
start_date: dates[0],
|
169
|
-
end_date: dates[1],
|
170
|
-
teams: teams,
|
171
|
-
matches: matches,
|
172
|
-
goals: goals
|
173
|
-
)
|
174
|
-
recs << rec
|
175
|
-
end # each row
|
176
|
-
recs
|
177
|
-
end # method parse
|
178
|
-
end # class EventInfoReader
|
179
|
-
|
180
|
-
|
181
|
-
end ## module Import
|
182
|
-
end ## module SportDb
|
183
|
-
|
1
|
+
|
2
|
+
module SportDb
|
3
|
+
module Import
|
4
|
+
|
5
|
+
|
6
|
+
class EventInfo
|
7
|
+
## "high level" info (summary) about event (like a "wikipedia infobox")
|
8
|
+
## use for checking dataset imports; lets you check e.g.
|
9
|
+
## - dates within range
|
10
|
+
## - number of teams e.g. 20
|
11
|
+
## - matches played e.g. 380
|
12
|
+
## - goals scored e.g. 937
|
13
|
+
## etc.
|
14
|
+
|
15
|
+
attr_reader :league,
|
16
|
+
:season,
|
17
|
+
:teams,
|
18
|
+
:matches,
|
19
|
+
:goals,
|
20
|
+
:start_date,
|
21
|
+
:end_date
|
22
|
+
|
23
|
+
def initialize( league:, season:,
|
24
|
+
start_date: nil, end_date: nil,
|
25
|
+
teams: nil,
|
26
|
+
matches: nil,
|
27
|
+
goals: nil )
|
28
|
+
|
29
|
+
@league = league
|
30
|
+
@season = season
|
31
|
+
|
32
|
+
@start_date = start_date
|
33
|
+
@end_date = end_date
|
34
|
+
|
35
|
+
@teams = teams ## todo/check: rename/use teams_count ??
|
36
|
+
@matches = matches ## todo/check: rename/use match_count ??
|
37
|
+
@goals = goals
|
38
|
+
end
|
39
|
+
|
40
|
+
def include?( date )
|
41
|
+
## todo/fix: add options e.g.
|
42
|
+
## - add delta/off_by_one or such?
|
43
|
+
## - add strict (for) only return true if date range (really) defined (no generic auto-rules)
|
44
|
+
|
45
|
+
### note: for now allow off by one error (via timezone/local time errors)
|
46
|
+
## todo/fix: issue warning if off by one!!!!
|
47
|
+
if @start_date && @end_date
|
48
|
+
date >= (@start_date-1) &&
|
49
|
+
date <= (@end_date+1)
|
50
|
+
else
|
51
|
+
if @season.year?
|
52
|
+
# assume generic rule
|
53
|
+
## same year e.g. Jan 1 - Dec 31; always true for now
|
54
|
+
date.year == @season.start_year
|
55
|
+
else
|
56
|
+
# assume generic rule
|
57
|
+
## July 1 - June 30 (Y+1)
|
58
|
+
## - todo/check -start for some countries/leagues in June 1 or August 1 ????
|
59
|
+
date >= Date.new( @season.start_year, 7, 1 ) &&
|
60
|
+
date <= Date.new( @season.end_year, 6, 30 )
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end # method include?
|
64
|
+
alias_method :between?, :include?
|
65
|
+
end # class EventInfo
|
66
|
+
|
67
|
+
|
68
|
+
class EventInfoReader
|
69
|
+
def catalog() Import.catalog; end
|
70
|
+
|
71
|
+
|
72
|
+
def self.read( path )
|
73
|
+
txt = File.open( path, 'r:utf-8') {|f| f.read }
|
74
|
+
new( txt ).parse
|
75
|
+
end
|
76
|
+
|
77
|
+
def self.parse( txt )
|
78
|
+
new( txt ).parse
|
79
|
+
end
|
80
|
+
|
81
|
+
def initialize( txt )
|
82
|
+
@txt = txt
|
83
|
+
end
|
84
|
+
|
85
|
+
def parse
|
86
|
+
recs = []
|
87
|
+
|
88
|
+
parse_csv( @txt ).each do |row|
|
89
|
+
league_col = row['League']
|
90
|
+
season_col = row['Season'] || row['Year']
|
91
|
+
dates_col = row['Dates']
|
92
|
+
|
93
|
+
season = Import::Season.parse( season_col )
|
94
|
+
league = catalog.leagues.find!( league_col )
|
95
|
+
|
96
|
+
|
97
|
+
dates = []
|
98
|
+
if dates_col.nil? || dates_col.empty?
|
99
|
+
## do nothing; no dates - keep dates array empty
|
100
|
+
else
|
101
|
+
## squish spaces
|
102
|
+
dates_col = dates_col.gsub( /[ ]{2,}/, ' ' ) ## squish/fold spaces
|
103
|
+
|
104
|
+
puts "#{league.name} (#{league.key}) | #{season.key} | #{dates_col}"
|
105
|
+
|
106
|
+
### todo/check: check what parts "Aug 15" return ???
|
107
|
+
### short form for "Aug 15 -" - works?
|
108
|
+
|
109
|
+
## todo/fix!!! - check EventInfo.include?
|
110
|
+
## now allow dates with only start_date too!! (WITHOUT end_date)
|
111
|
+
parts = dates_col.split( /[ ]*[–-][ ]*/ )
|
112
|
+
if parts.size == 1
|
113
|
+
pp parts
|
114
|
+
dates << DateFormats.parse( parts[0], start: Date.new( season.start_year, 1, 1 ), lang: 'en' )
|
115
|
+
pp dates
|
116
|
+
elsif parts.size == 2
|
117
|
+
pp parts
|
118
|
+
dates << DateFormats.parse( parts[0], start: Date.new( season.start_year, 1, 1 ), lang: 'en' )
|
119
|
+
dates << DateFormats.parse( parts[1], start: Date.new( season.end_year ? season.end_year : season.start_year, 1, 1 ), lang: 'en' )
|
120
|
+
pp dates
|
121
|
+
|
122
|
+
## assert/check if period is less than 365 days for now
|
123
|
+
diff = dates[1].to_date.jd - dates[0].to_date.jd
|
124
|
+
puts "#{diff}d"
|
125
|
+
if diff > 365
|
126
|
+
puts "!! ERROR - date range / period assertion failed; expected diff < 365 days"
|
127
|
+
exit 1
|
128
|
+
end
|
129
|
+
else
|
130
|
+
puts "!! ERRROR - expected data range / period - one or two dates; got #{parts.size}:"
|
131
|
+
pp dates_col
|
132
|
+
pp parts
|
133
|
+
exit 1
|
134
|
+
end
|
135
|
+
end
|
136
|
+
|
137
|
+
|
138
|
+
teams_col = row['Clubs'] || row['Teams']
|
139
|
+
goals_col = row['Goals']
|
140
|
+
|
141
|
+
## note: remove (and allow) all non-digits e.g. 370 goals, 20 clubs, etc.
|
142
|
+
teams_col = teams_col.gsub( /[^0-9]/, '' ) if teams_col
|
143
|
+
goals_col = goals_col.gsub( /[^0-9]/, '' ) if goals_col
|
144
|
+
|
145
|
+
teams = (teams_col.nil? || teams_col.empty?) ? nil : teams_col.to_i
|
146
|
+
goals = (goals_col.nil? || goals_col.empty?) ? nil : goals_col.to_i
|
147
|
+
|
148
|
+
matches_col = row['Matches']
|
149
|
+
## note: support additions in matches (played) e.g.
|
150
|
+
# 132 + 63 Play-off-Spiele
|
151
|
+
matches_col = matches_col.gsub( /[^0-9+]/, '' ) if matches_col
|
152
|
+
|
153
|
+
matches = if matches_col.nil? || matches_col.empty?
|
154
|
+
nil
|
155
|
+
else
|
156
|
+
if matches_col.index( '+' ) ### check for calculations
|
157
|
+
## note: for now only supports additions
|
158
|
+
matches_col.split( '+' ).reduce( 0 ) do |sum,str|
|
159
|
+
sum + str.to_i
|
160
|
+
end
|
161
|
+
else ## assume single (integer) number
|
162
|
+
matches_col.to_i
|
163
|
+
end
|
164
|
+
end
|
165
|
+
|
166
|
+
rec = EventInfo.new( league: league,
|
167
|
+
season: season,
|
168
|
+
start_date: dates[0],
|
169
|
+
end_date: dates[1],
|
170
|
+
teams: teams,
|
171
|
+
matches: matches,
|
172
|
+
goals: goals
|
173
|
+
)
|
174
|
+
recs << rec
|
175
|
+
end # each row
|
176
|
+
recs
|
177
|
+
end # method parse
|
178
|
+
end # class EventInfoReader
|
179
|
+
|
180
|
+
|
181
|
+
end ## module Import
|
182
|
+
end ## module SportDb
|
183
|
+
|
@@ -151,6 +151,7 @@ class LeagueOutlineReader ## todo/check - rename to LeaguePageReader / LeagueP
|
|
151
151
|
'Playoffs - Relegation',
|
152
152
|
'Playoffs - Challenger',
|
153
153
|
'Finals',
|
154
|
+
'Match 6th Place', # e.g. Super League Greece 2012/13
|
154
155
|
|
155
156
|
'Apertura',
|
156
157
|
'Apertura - Liguilla',
|
@@ -1,168 +1,168 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
|
4
|
-
module SportDb
|
5
|
-
module Import
|
6
|
-
|
7
|
-
|
8
|
-
class LeagueReader
|
9
|
-
|
10
|
-
def catalog() Import.catalog; end
|
11
|
-
|
12
|
-
|
13
|
-
def self.read( path ) ## use - rename to read_file or from_file etc. - why? why not?
|
14
|
-
txt = File.open( path, 'r:utf-8' ) { |f| f.read }
|
15
|
-
parse( txt )
|
16
|
-
end
|
17
|
-
|
18
|
-
def self.parse( txt )
|
19
|
-
new( txt ).parse
|
20
|
-
end
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
include Logging
|
25
|
-
|
26
|
-
def initialize( txt )
|
27
|
-
@txt = txt
|
28
|
-
end
|
29
|
-
|
30
|
-
def parse
|
31
|
-
recs = []
|
32
|
-
last_rec = nil
|
33
|
-
|
34
|
-
country = nil # last country
|
35
|
-
intl = false # is international (league/tournament/cup/competition)
|
36
|
-
clubs = true # or clubs|national teams
|
37
|
-
|
38
|
-
OutlineReader.parse( @txt ).each do |node|
|
39
|
-
if [:h1,:h2,:h3,:h4,:h5,:h6].include?( node[0] )
|
40
|
-
heading_level = node[0][1].to_i
|
41
|
-
heading = node[1]
|
42
|
-
|
43
|
-
logger.debug "heading #{heading_level} >#{heading}<"
|
44
|
-
|
45
|
-
if heading_level != 1
|
46
|
-
puts "** !!! ERROR !!! unsupported heading level; expected heading 1 for now only; sorry"
|
47
|
-
pp line
|
48
|
-
exit 1
|
49
|
-
else
|
50
|
-
logger.debug "heading (#{heading_level}) >#{heading}<"
|
51
|
-
last_heading = heading
|
52
|
-
## map to country or international / int'l or national teams
|
53
|
-
if heading =~ /national team/i ## national team tournament
|
54
|
-
country = nil
|
55
|
-
intl = true
|
56
|
-
clubs = false
|
57
|
-
elsif heading =~ /international|int'l/i ## int'l club tournament
|
58
|
-
country = nil
|
59
|
-
intl = true
|
60
|
-
clubs = true
|
61
|
-
else
|
62
|
-
## assume country in heading; allow all "formats" supported by parse e.g.
|
63
|
-
## Österreich • Austria (at)
|
64
|
-
## Österreich • Austria
|
65
|
-
## Austria
|
66
|
-
## Deutschland (de) • Germany
|
67
|
-
country = catalog.countries.parse( heading )
|
68
|
-
intl = false
|
69
|
-
clubs = true
|
70
|
-
|
71
|
-
## check country code - MUST exist for now!!!!
|
72
|
-
if country.nil?
|
73
|
-
puts "!!! error [league reader] - unknown country >#{heading}< - sorry - add country to config to fix"
|
74
|
-
exit 1
|
75
|
-
end
|
76
|
-
end
|
77
|
-
end
|
78
|
-
elsif node[0] == :p ## paragraph with (text) lines
|
79
|
-
lines = node[1]
|
80
|
-
lines.each do |line|
|
81
|
-
|
82
|
-
if line.start_with?( '|' )
|
83
|
-
## assume continuation with line of alternative names
|
84
|
-
## note: skip leading pipe
|
85
|
-
values = line[1..-1].split( '|' ) # team names - allow/use pipe(|)
|
86
|
-
## 1) strip (commercial) sponsor markers/tags e.g. $$ Liga $$BBV$$ MX
|
87
|
-
## 2) strip and squish (white)spaces
|
88
|
-
# e.g. "New York FC      (2011-)" => "New York FC (2011-)"
|
89
|
-
values = values.map { |value| value.gsub( '$', '' )
|
90
|
-
.gsub( /[ \t]+/, ' ' )
|
91
|
-
.strip }
|
92
|
-
logger.debug "alt_names: #{values.join( '|' )}"
|
93
|
-
|
94
|
-
last_rec.alt_names += values
|
95
|
-
else
|
96
|
-
## assume "regular" line
|
97
|
-
## check if starts with id (todo/check: use a more "strict"/better regex capture pattern!!!)
|
98
|
-
if line =~ /^([a-z0-9][a-z0-9.]*)[ ]+(.+)$/
|
99
|
-
league_key = $1
|
100
|
-
## 1) strip (commercial) sponsor markers/tags e.g $$
|
101
|
-
## 2) strip and squish (white)spaces
|
102
|
-
league_name = $2.gsub( '$', '' )
|
103
|
-
.gsub( /[ \t]+/, ' ' )
|
104
|
-
.strip
|
105
|
-
|
106
|
-
logger.debug "key: >#{league_key}<, name: >#{league_name}<"
|
107
|
-
|
108
|
-
|
109
|
-
alt_names_auto = []
|
110
|
-
if country
|
111
|
-
alt_names_auto << "#{country.key.upcase} #{league_key.upcase.gsub('.', ' ')}"
|
112
|
-
## todo/check: add "hack" for cl (chile) and exclude?
|
113
|
-
## add a list of (auto-)excluded country codes with conflicts? why? why not?
|
114
|
-
## cl - a) Chile b) Champions League
|
115
|
-
alt_names_auto << "#{country.key.upcase}" if league_key == '1' ## add shortcut for top level 1 (just country key)
|
116
|
-
if country.key.upcase != country.code
|
117
|
-
alt_names_auto << "#{country.code} #{league_key.upcase.gsub('.', ' ')}"
|
118
|
-
alt_names_auto << "#{country.code}" if league_key == '1' ## add shortcut for top level 1 (just country key)
|
119
|
-
end
|
120
|
-
alt_names_auto << "#{country.name} #{league_key}" if league_key =~ /^[0-9]+$/ ## if all numeric e.g. add Austria 1 etc.
|
121
|
-
|
122
|
-
## auto-add with country prepended
|
123
|
-
## e.g. England Premier League, Austria Bundesliga etc.
|
124
|
-
## todo/check: also add variants with country alt name if present!!!
|
125
|
-
## todo/check: exclude cups or such from country + league name auto-add - why? why not?
|
126
|
-
alt_names_auto << "#{country.name} #{league_name}"
|
127
|
-
else ## assume int'l (no country) e.g. champions league, etc.
|
128
|
-
## only auto-add key (e.g. CL, EL, etc.)
|
129
|
-
alt_names_auto << league_key.upcase.gsub('.', ' ') ## note: no country code (prefix/leading) used
|
130
|
-
end
|
131
|
-
|
132
|
-
## pp alt_names_auto
|
133
|
-
|
134
|
-
## prepend country key/code if country present
|
135
|
-
## todo/fix: only auto-prepend country if key/code start with a number (level) or incl. cup
|
136
|
-
## why? lets you "overwrite" key if desired - use it - why? why not?
|
137
|
-
if country
|
138
|
-
league_key = "#{country.key}.#{league_key}"
|
139
|
-
end
|
140
|
-
|
141
|
-
rec = League.new( key: league_key,
|
142
|
-
name: league_name,
|
143
|
-
alt_names_auto: alt_names_auto,
|
144
|
-
country: country,
|
145
|
-
intl: intl,
|
146
|
-
clubs: clubs)
|
147
|
-
recs << rec
|
148
|
-
last_rec = rec
|
149
|
-
else
|
150
|
-
puts "** !!! ERROR !!! missing key for (canonical) league name"
|
151
|
-
exit 1
|
152
|
-
end
|
153
|
-
end
|
154
|
-
end # each line
|
155
|
-
else
|
156
|
-
puts "** !!! ERROR !!! [league reader] - unknown line type:"
|
157
|
-
pp node
|
158
|
-
exit 1
|
159
|
-
end
|
160
|
-
## pp line
|
161
|
-
end
|
162
|
-
recs
|
163
|
-
end # method parse
|
164
|
-
|
165
|
-
end # class LeagueReader
|
166
|
-
|
167
|
-
end ## module Import
|
168
|
-
end ## module SportDb
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
module SportDb
|
5
|
+
module Import
|
6
|
+
|
7
|
+
|
8
|
+
class LeagueReader
|
9
|
+
|
10
|
+
def catalog() Import.catalog; end
|
11
|
+
|
12
|
+
|
13
|
+
def self.read( path ) ## use - rename to read_file or from_file etc. - why? why not?
|
14
|
+
txt = File.open( path, 'r:utf-8' ) { |f| f.read }
|
15
|
+
parse( txt )
|
16
|
+
end
|
17
|
+
|
18
|
+
def self.parse( txt )
|
19
|
+
new( txt ).parse
|
20
|
+
end
|
21
|
+
|
22
|
+
|
23
|
+
|
24
|
+
include Logging
|
25
|
+
|
26
|
+
def initialize( txt )
|
27
|
+
@txt = txt
|
28
|
+
end
|
29
|
+
|
30
|
+
def parse
|
31
|
+
recs = []
|
32
|
+
last_rec = nil
|
33
|
+
|
34
|
+
country = nil # last country
|
35
|
+
intl = false # is international (league/tournament/cup/competition)
|
36
|
+
clubs = true # or clubs|national teams
|
37
|
+
|
38
|
+
OutlineReader.parse( @txt ).each do |node|
|
39
|
+
if [:h1,:h2,:h3,:h4,:h5,:h6].include?( node[0] )
|
40
|
+
heading_level = node[0][1].to_i
|
41
|
+
heading = node[1]
|
42
|
+
|
43
|
+
logger.debug "heading #{heading_level} >#{heading}<"
|
44
|
+
|
45
|
+
if heading_level != 1
|
46
|
+
puts "** !!! ERROR !!! unsupported heading level; expected heading 1 for now only; sorry"
|
47
|
+
pp line
|
48
|
+
exit 1
|
49
|
+
else
|
50
|
+
logger.debug "heading (#{heading_level}) >#{heading}<"
|
51
|
+
last_heading = heading
|
52
|
+
## map to country or international / int'l or national teams
|
53
|
+
if heading =~ /national team/i ## national team tournament
|
54
|
+
country = nil
|
55
|
+
intl = true
|
56
|
+
clubs = false
|
57
|
+
elsif heading =~ /international|int'l/i ## int'l club tournament
|
58
|
+
country = nil
|
59
|
+
intl = true
|
60
|
+
clubs = true
|
61
|
+
else
|
62
|
+
## assume country in heading; allow all "formats" supported by parse e.g.
|
63
|
+
## Österreich • Austria (at)
|
64
|
+
## Österreich • Austria
|
65
|
+
## Austria
|
66
|
+
## Deutschland (de) • Germany
|
67
|
+
country = catalog.countries.parse( heading )
|
68
|
+
intl = false
|
69
|
+
clubs = true
|
70
|
+
|
71
|
+
## check country code - MUST exist for now!!!!
|
72
|
+
if country.nil?
|
73
|
+
puts "!!! error [league reader] - unknown country >#{heading}< - sorry - add country to config to fix"
|
74
|
+
exit 1
|
75
|
+
end
|
76
|
+
end
|
77
|
+
end
|
78
|
+
elsif node[0] == :p ## paragraph with (text) lines
|
79
|
+
lines = node[1]
|
80
|
+
lines.each do |line|
|
81
|
+
|
82
|
+
if line.start_with?( '|' )
|
83
|
+
## assume continuation with line of alternative names
|
84
|
+
## note: skip leading pipe
|
85
|
+
values = line[1..-1].split( '|' ) # team names - allow/use pipe(|)
|
86
|
+
## 1) strip (commercial) sponsor markers/tags e.g. $$ Liga $$BBV$$ MX
|
87
|
+
## 2) strip and squish (white)spaces
|
88
|
+
# e.g. "New York FC      (2011-)" => "New York FC (2011-)"
|
89
|
+
values = values.map { |value| value.gsub( '$', '' )
|
90
|
+
.gsub( /[ \t]+/, ' ' )
|
91
|
+
.strip }
|
92
|
+
logger.debug "alt_names: #{values.join( '|' )}"
|
93
|
+
|
94
|
+
last_rec.alt_names += values
|
95
|
+
else
|
96
|
+
## assume "regular" line
|
97
|
+
## check if starts with id (todo/check: use a more "strict"/better regex capture pattern!!!)
|
98
|
+
if line =~ /^([a-z0-9][a-z0-9.]*)[ ]+(.+)$/
|
99
|
+
league_key = $1
|
100
|
+
## 1) strip (commercial) sponsor markers/tags e.g $$
|
101
|
+
## 2) strip and squish (white)spaces
|
102
|
+
league_name = $2.gsub( '$', '' )
|
103
|
+
.gsub( /[ \t]+/, ' ' )
|
104
|
+
.strip
|
105
|
+
|
106
|
+
logger.debug "key: >#{league_key}<, name: >#{league_name}<"
|
107
|
+
|
108
|
+
|
109
|
+
alt_names_auto = []
|
110
|
+
if country
|
111
|
+
alt_names_auto << "#{country.key.upcase} #{league_key.upcase.gsub('.', ' ')}"
|
112
|
+
## todo/check: add "hack" for cl (chile) and exclude?
|
113
|
+
## add a list of (auto-)excluded country codes with conflicts? why? why not?
|
114
|
+
## cl - a) Chile b) Champions League
|
115
|
+
alt_names_auto << "#{country.key.upcase}" if league_key == '1' ## add shortcut for top level 1 (just country key)
|
116
|
+
if country.key.upcase != country.code
|
117
|
+
alt_names_auto << "#{country.code} #{league_key.upcase.gsub('.', ' ')}"
|
118
|
+
alt_names_auto << "#{country.code}" if league_key == '1' ## add shortcut for top level 1 (just country key)
|
119
|
+
end
|
120
|
+
alt_names_auto << "#{country.name} #{league_key}" if league_key =~ /^[0-9]+$/ ## if all numeric e.g. add Austria 1 etc.
|
121
|
+
|
122
|
+
## auto-add with country prepended
|
123
|
+
## e.g. England Premier League, Austria Bundesliga etc.
|
124
|
+
## todo/check: also add variants with country alt name if present!!!
|
125
|
+
## todo/check: exclude cups or such from country + league name auto-add - why? why not?
|
126
|
+
alt_names_auto << "#{country.name} #{league_name}"
|
127
|
+
else ## assume int'l (no country) e.g. champions league, etc.
|
128
|
+
## only auto-add key (e.g. CL, EL, etc.)
|
129
|
+
alt_names_auto << league_key.upcase.gsub('.', ' ') ## note: no country code (prefix/leading) used
|
130
|
+
end
|
131
|
+
|
132
|
+
## pp alt_names_auto
|
133
|
+
|
134
|
+
## prepend country key/code if country present
|
135
|
+
## todo/fix: only auto-prepend country if key/code start with a number (level) or incl. cup
|
136
|
+
## why? lets you "overwrite" key if desired - use it - why? why not?
|
137
|
+
if country
|
138
|
+
league_key = "#{country.key}.#{league_key}"
|
139
|
+
end
|
140
|
+
|
141
|
+
rec = League.new( key: league_key,
|
142
|
+
name: league_name,
|
143
|
+
alt_names_auto: alt_names_auto,
|
144
|
+
country: country,
|
145
|
+
intl: intl,
|
146
|
+
clubs: clubs)
|
147
|
+
recs << rec
|
148
|
+
last_rec = rec
|
149
|
+
else
|
150
|
+
puts "** !!! ERROR !!! missing key for (canonical) league name"
|
151
|
+
exit 1
|
152
|
+
end
|
153
|
+
end
|
154
|
+
end # each line
|
155
|
+
else
|
156
|
+
puts "** !!! ERROR !!! [league reader] - unknown line type:"
|
157
|
+
pp node
|
158
|
+
exit 1
|
159
|
+
end
|
160
|
+
## pp line
|
161
|
+
end
|
162
|
+
recs
|
163
|
+
end # method parse
|
164
|
+
|
165
|
+
end # class LeagueReader
|
166
|
+
|
167
|
+
end ## module Import
|
168
|
+
end ## module SportDb
|