sportdb-formats 1.1.5 → 1.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/sportdb/formats/country/country_reader.rb +142 -142
- data/lib/sportdb/formats/datafile.rb +59 -59
- data/lib/sportdb/formats/event/event_index.rb +141 -141
- data/lib/sportdb/formats/event/event_reader.rb +183 -183
- data/lib/sportdb/formats/league/league_outline_reader.rb +1 -0
- data/lib/sportdb/formats/league/league_reader.rb +168 -168
- data/lib/sportdb/formats/match/match_parser_auto_conf.rb +202 -202
- data/lib/sportdb/formats/package.rb +374 -374
- data/lib/sportdb/formats/team/club_index_history.rb +134 -134
- data/lib/sportdb/formats/team/club_reader.rb +350 -350
- data/lib/sportdb/formats/team/club_reader_history.rb +203 -203
- data/lib/sportdb/formats/team/wiki_reader.rb +108 -108
- data/lib/sportdb/formats/version.rb +1 -1
- data/test/test_club_index_history.rb +107 -107
- data/test/test_club_reader.rb +201 -201
- data/test/test_club_reader_history.rb +212 -212
- data/test/test_country_reader.rb +89 -89
- data/test/test_league_outline_reader.rb +55 -55
- data/test/test_league_reader.rb +72 -72
- data/test/test_outline_reader.rb +31 -31
- data/test/test_regex.rb +67 -67
- data/test/test_wiki_reader.rb +77 -77
- metadata +12 -6
@@ -1,183 +1,183 @@
|
|
1
|
-
|
2
|
-
module SportDb
|
3
|
-
module Import
|
4
|
-
|
5
|
-
|
6
|
-
class EventInfo
|
7
|
-
## "high level" info (summary) about event (like a "wikipedia infobox")
|
8
|
-
## use for checking dataset imports; lets you check e.g.
|
9
|
-
## - dates within range
|
10
|
-
## - number of teams e.g. 20
|
11
|
-
## - matches played e.g. 380
|
12
|
-
## - goals scored e.g. 937
|
13
|
-
## etc.
|
14
|
-
|
15
|
-
attr_reader :league,
|
16
|
-
:season,
|
17
|
-
:teams,
|
18
|
-
:matches,
|
19
|
-
:goals,
|
20
|
-
:start_date,
|
21
|
-
:end_date
|
22
|
-
|
23
|
-
def initialize( league:, season:,
|
24
|
-
start_date: nil, end_date: nil,
|
25
|
-
teams: nil,
|
26
|
-
matches: nil,
|
27
|
-
goals: nil )
|
28
|
-
|
29
|
-
@league = league
|
30
|
-
@season = season
|
31
|
-
|
32
|
-
@start_date = start_date
|
33
|
-
@end_date = end_date
|
34
|
-
|
35
|
-
@teams = teams ## todo/check: rename/use teams_count ??
|
36
|
-
@matches = matches ## todo/check: rename/use match_count ??
|
37
|
-
@goals = goals
|
38
|
-
end
|
39
|
-
|
40
|
-
def include?( date )
|
41
|
-
## todo/fix: add options e.g.
|
42
|
-
## - add delta/off_by_one or such?
|
43
|
-
## - add strict (for) only return true if date range (really) defined (no generic auto-rules)
|
44
|
-
|
45
|
-
### note: for now allow off by one error (via timezone/local time errors)
|
46
|
-
## todo/fix: issue warning if off by one!!!!
|
47
|
-
if @start_date && @end_date
|
48
|
-
date >= (@start_date-1) &&
|
49
|
-
date <= (@end_date+1)
|
50
|
-
else
|
51
|
-
if @season.year?
|
52
|
-
# assume generic rule
|
53
|
-
## same year e.g. Jan 1 - Dec 31; always true for now
|
54
|
-
date.year == @season.start_year
|
55
|
-
else
|
56
|
-
# assume generic rule
|
57
|
-
## July 1 - June 30 (Y+1)
|
58
|
-
## - todo/check -start for some countries/leagues in June 1 or August 1 ????
|
59
|
-
date >= Date.new( @season.start_year, 7, 1 ) &&
|
60
|
-
date <= Date.new( @season.end_year, 6, 30 )
|
61
|
-
end
|
62
|
-
end
|
63
|
-
end # method include?
|
64
|
-
alias_method :between?, :include?
|
65
|
-
end # class EventInfo
|
66
|
-
|
67
|
-
|
68
|
-
class EventInfoReader
|
69
|
-
def catalog() Import.catalog; end
|
70
|
-
|
71
|
-
|
72
|
-
def self.read( path )
|
73
|
-
txt = File.open( path, 'r:utf-8') {|f| f.read }
|
74
|
-
new( txt ).parse
|
75
|
-
end
|
76
|
-
|
77
|
-
def self.parse( txt )
|
78
|
-
new( txt ).parse
|
79
|
-
end
|
80
|
-
|
81
|
-
def initialize( txt )
|
82
|
-
@txt = txt
|
83
|
-
end
|
84
|
-
|
85
|
-
def parse
|
86
|
-
recs = []
|
87
|
-
|
88
|
-
parse_csv( @txt ).each do |row|
|
89
|
-
league_col = row['League']
|
90
|
-
season_col = row['Season'] || row['Year']
|
91
|
-
dates_col = row['Dates']
|
92
|
-
|
93
|
-
season = Import::Season.parse( season_col )
|
94
|
-
league = catalog.leagues.find!( league_col )
|
95
|
-
|
96
|
-
|
97
|
-
dates = []
|
98
|
-
if dates_col.nil? || dates_col.empty?
|
99
|
-
## do nothing; no dates - keep dates array empty
|
100
|
-
else
|
101
|
-
## squish spaces
|
102
|
-
dates_col = dates_col.gsub( /[ ]{2,}/, ' ' ) ## squish/fold spaces
|
103
|
-
|
104
|
-
puts "#{league.name} (#{league.key}) | #{season.key} | #{dates_col}"
|
105
|
-
|
106
|
-
### todo/check: check what parts "Aug 15" return ???
|
107
|
-
### short form for "Aug 15 -" - works?
|
108
|
-
|
109
|
-
## todo/fix!!! - check EventInfo.include?
|
110
|
-
## now allow dates with only start_date too!! (WITHOUT end_date)
|
111
|
-
parts = dates_col.split( /[ ]*[–-][ ]*/ )
|
112
|
-
if parts.size == 1
|
113
|
-
pp parts
|
114
|
-
dates << DateFormats.parse( parts[0], start: Date.new( season.start_year, 1, 1 ), lang: 'en' )
|
115
|
-
pp dates
|
116
|
-
elsif parts.size == 2
|
117
|
-
pp parts
|
118
|
-
dates << DateFormats.parse( parts[0], start: Date.new( season.start_year, 1, 1 ), lang: 'en' )
|
119
|
-
dates << DateFormats.parse( parts[1], start: Date.new( season.end_year ? season.end_year : season.start_year, 1, 1 ), lang: 'en' )
|
120
|
-
pp dates
|
121
|
-
|
122
|
-
## assert/check if period is less than 365 days for now
|
123
|
-
diff = dates[1].to_date.jd - dates[0].to_date.jd
|
124
|
-
puts "#{diff}d"
|
125
|
-
if diff > 365
|
126
|
-
puts "!! ERROR - date range / period assertion failed; expected diff < 365 days"
|
127
|
-
exit 1
|
128
|
-
end
|
129
|
-
else
|
130
|
-
puts "!! ERRROR - expected data range / period - one or two dates; got #{parts.size}:"
|
131
|
-
pp dates_col
|
132
|
-
pp parts
|
133
|
-
exit 1
|
134
|
-
end
|
135
|
-
end
|
136
|
-
|
137
|
-
|
138
|
-
teams_col = row['Clubs'] || row['Teams']
|
139
|
-
goals_col = row['Goals']
|
140
|
-
|
141
|
-
## note: remove (and allow) all non-digits e.g. 370 goals, 20 clubs, etc.
|
142
|
-
teams_col = teams_col.gsub( /[^0-9]/, '' ) if teams_col
|
143
|
-
goals_col = goals_col.gsub( /[^0-9]/, '' ) if goals_col
|
144
|
-
|
145
|
-
teams = (teams_col.nil? || teams_col.empty?) ? nil : teams_col.to_i
|
146
|
-
goals = (goals_col.nil? || goals_col.empty?) ? nil : goals_col.to_i
|
147
|
-
|
148
|
-
matches_col = row['Matches']
|
149
|
-
## note: support additions in matches (played) e.g.
|
150
|
-
# 132 + 63 Play-off-Spiele
|
151
|
-
matches_col = matches_col.gsub( /[^0-9+]/, '' ) if matches_col
|
152
|
-
|
153
|
-
matches = if matches_col.nil? || matches_col.empty?
|
154
|
-
nil
|
155
|
-
else
|
156
|
-
if matches_col.index( '+' ) ### check for calculations
|
157
|
-
## note: for now only supports additions
|
158
|
-
matches_col.split( '+' ).reduce( 0 ) do |sum,str|
|
159
|
-
sum + str.to_i
|
160
|
-
end
|
161
|
-
else ## assume single (integer) number
|
162
|
-
matches_col.to_i
|
163
|
-
end
|
164
|
-
end
|
165
|
-
|
166
|
-
rec = EventInfo.new( league: league,
|
167
|
-
season: season,
|
168
|
-
start_date: dates[0],
|
169
|
-
end_date: dates[1],
|
170
|
-
teams: teams,
|
171
|
-
matches: matches,
|
172
|
-
goals: goals
|
173
|
-
)
|
174
|
-
recs << rec
|
175
|
-
end # each row
|
176
|
-
recs
|
177
|
-
end # method parse
|
178
|
-
end # class EventInfoReader
|
179
|
-
|
180
|
-
|
181
|
-
end ## module Import
|
182
|
-
end ## module SportDb
|
183
|
-
|
1
|
+
|
2
|
+
module SportDb
|
3
|
+
module Import
|
4
|
+
|
5
|
+
|
6
|
+
class EventInfo
|
7
|
+
## "high level" info (summary) about event (like a "wikipedia infobox")
|
8
|
+
## use for checking dataset imports; lets you check e.g.
|
9
|
+
## - dates within range
|
10
|
+
## - number of teams e.g. 20
|
11
|
+
## - matches played e.g. 380
|
12
|
+
## - goals scored e.g. 937
|
13
|
+
## etc.
|
14
|
+
|
15
|
+
attr_reader :league,
|
16
|
+
:season,
|
17
|
+
:teams,
|
18
|
+
:matches,
|
19
|
+
:goals,
|
20
|
+
:start_date,
|
21
|
+
:end_date
|
22
|
+
|
23
|
+
def initialize( league:, season:,
|
24
|
+
start_date: nil, end_date: nil,
|
25
|
+
teams: nil,
|
26
|
+
matches: nil,
|
27
|
+
goals: nil )
|
28
|
+
|
29
|
+
@league = league
|
30
|
+
@season = season
|
31
|
+
|
32
|
+
@start_date = start_date
|
33
|
+
@end_date = end_date
|
34
|
+
|
35
|
+
@teams = teams ## todo/check: rename/use teams_count ??
|
36
|
+
@matches = matches ## todo/check: rename/use match_count ??
|
37
|
+
@goals = goals
|
38
|
+
end
|
39
|
+
|
40
|
+
def include?( date )
|
41
|
+
## todo/fix: add options e.g.
|
42
|
+
## - add delta/off_by_one or such?
|
43
|
+
## - add strict (for) only return true if date range (really) defined (no generic auto-rules)
|
44
|
+
|
45
|
+
### note: for now allow off by one error (via timezone/local time errors)
|
46
|
+
## todo/fix: issue warning if off by one!!!!
|
47
|
+
if @start_date && @end_date
|
48
|
+
date >= (@start_date-1) &&
|
49
|
+
date <= (@end_date+1)
|
50
|
+
else
|
51
|
+
if @season.year?
|
52
|
+
# assume generic rule
|
53
|
+
## same year e.g. Jan 1 - Dec 31; always true for now
|
54
|
+
date.year == @season.start_year
|
55
|
+
else
|
56
|
+
# assume generic rule
|
57
|
+
## July 1 - June 30 (Y+1)
|
58
|
+
## - todo/check -start for some countries/leagues in June 1 or August 1 ????
|
59
|
+
date >= Date.new( @season.start_year, 7, 1 ) &&
|
60
|
+
date <= Date.new( @season.end_year, 6, 30 )
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end # method include?
|
64
|
+
alias_method :between?, :include?
|
65
|
+
end # class EventInfo
|
66
|
+
|
67
|
+
|
68
|
+
class EventInfoReader
|
69
|
+
def catalog() Import.catalog; end
|
70
|
+
|
71
|
+
|
72
|
+
def self.read( path )
|
73
|
+
txt = File.open( path, 'r:utf-8') {|f| f.read }
|
74
|
+
new( txt ).parse
|
75
|
+
end
|
76
|
+
|
77
|
+
def self.parse( txt )
|
78
|
+
new( txt ).parse
|
79
|
+
end
|
80
|
+
|
81
|
+
def initialize( txt )
|
82
|
+
@txt = txt
|
83
|
+
end
|
84
|
+
|
85
|
+
def parse
|
86
|
+
recs = []
|
87
|
+
|
88
|
+
parse_csv( @txt ).each do |row|
|
89
|
+
league_col = row['League']
|
90
|
+
season_col = row['Season'] || row['Year']
|
91
|
+
dates_col = row['Dates']
|
92
|
+
|
93
|
+
season = Import::Season.parse( season_col )
|
94
|
+
league = catalog.leagues.find!( league_col )
|
95
|
+
|
96
|
+
|
97
|
+
dates = []
|
98
|
+
if dates_col.nil? || dates_col.empty?
|
99
|
+
## do nothing; no dates - keep dates array empty
|
100
|
+
else
|
101
|
+
## squish spaces
|
102
|
+
dates_col = dates_col.gsub( /[ ]{2,}/, ' ' ) ## squish/fold spaces
|
103
|
+
|
104
|
+
puts "#{league.name} (#{league.key}) | #{season.key} | #{dates_col}"
|
105
|
+
|
106
|
+
### todo/check: check what parts "Aug 15" return ???
|
107
|
+
### short form for "Aug 15 -" - works?
|
108
|
+
|
109
|
+
## todo/fix!!! - check EventInfo.include?
|
110
|
+
## now allow dates with only start_date too!! (WITHOUT end_date)
|
111
|
+
parts = dates_col.split( /[ ]*[–-][ ]*/ )
|
112
|
+
if parts.size == 1
|
113
|
+
pp parts
|
114
|
+
dates << DateFormats.parse( parts[0], start: Date.new( season.start_year, 1, 1 ), lang: 'en' )
|
115
|
+
pp dates
|
116
|
+
elsif parts.size == 2
|
117
|
+
pp parts
|
118
|
+
dates << DateFormats.parse( parts[0], start: Date.new( season.start_year, 1, 1 ), lang: 'en' )
|
119
|
+
dates << DateFormats.parse( parts[1], start: Date.new( season.end_year ? season.end_year : season.start_year, 1, 1 ), lang: 'en' )
|
120
|
+
pp dates
|
121
|
+
|
122
|
+
## assert/check if period is less than 365 days for now
|
123
|
+
diff = dates[1].to_date.jd - dates[0].to_date.jd
|
124
|
+
puts "#{diff}d"
|
125
|
+
if diff > 365
|
126
|
+
puts "!! ERROR - date range / period assertion failed; expected diff < 365 days"
|
127
|
+
exit 1
|
128
|
+
end
|
129
|
+
else
|
130
|
+
puts "!! ERRROR - expected data range / period - one or two dates; got #{parts.size}:"
|
131
|
+
pp dates_col
|
132
|
+
pp parts
|
133
|
+
exit 1
|
134
|
+
end
|
135
|
+
end
|
136
|
+
|
137
|
+
|
138
|
+
teams_col = row['Clubs'] || row['Teams']
|
139
|
+
goals_col = row['Goals']
|
140
|
+
|
141
|
+
## note: remove (and allow) all non-digits e.g. 370 goals, 20 clubs, etc.
|
142
|
+
teams_col = teams_col.gsub( /[^0-9]/, '' ) if teams_col
|
143
|
+
goals_col = goals_col.gsub( /[^0-9]/, '' ) if goals_col
|
144
|
+
|
145
|
+
teams = (teams_col.nil? || teams_col.empty?) ? nil : teams_col.to_i
|
146
|
+
goals = (goals_col.nil? || goals_col.empty?) ? nil : goals_col.to_i
|
147
|
+
|
148
|
+
matches_col = row['Matches']
|
149
|
+
## note: support additions in matches (played) e.g.
|
150
|
+
# 132 + 63 Play-off-Spiele
|
151
|
+
matches_col = matches_col.gsub( /[^0-9+]/, '' ) if matches_col
|
152
|
+
|
153
|
+
matches = if matches_col.nil? || matches_col.empty?
|
154
|
+
nil
|
155
|
+
else
|
156
|
+
if matches_col.index( '+' ) ### check for calculations
|
157
|
+
## note: for now only supports additions
|
158
|
+
matches_col.split( '+' ).reduce( 0 ) do |sum,str|
|
159
|
+
sum + str.to_i
|
160
|
+
end
|
161
|
+
else ## assume single (integer) number
|
162
|
+
matches_col.to_i
|
163
|
+
end
|
164
|
+
end
|
165
|
+
|
166
|
+
rec = EventInfo.new( league: league,
|
167
|
+
season: season,
|
168
|
+
start_date: dates[0],
|
169
|
+
end_date: dates[1],
|
170
|
+
teams: teams,
|
171
|
+
matches: matches,
|
172
|
+
goals: goals
|
173
|
+
)
|
174
|
+
recs << rec
|
175
|
+
end # each row
|
176
|
+
recs
|
177
|
+
end # method parse
|
178
|
+
end # class EventInfoReader
|
179
|
+
|
180
|
+
|
181
|
+
end ## module Import
|
182
|
+
end ## module SportDb
|
183
|
+
|
@@ -151,6 +151,7 @@ class LeagueOutlineReader ## todo/check - rename to LeaguePageReader / LeagueP
|
|
151
151
|
'Playoffs - Relegation',
|
152
152
|
'Playoffs - Challenger',
|
153
153
|
'Finals',
|
154
|
+
'Match 6th Place', # e.g. Super League Greece 2012/13
|
154
155
|
|
155
156
|
'Apertura',
|
156
157
|
'Apertura - Liguilla',
|
@@ -1,168 +1,168 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
|
4
|
-
module SportDb
|
5
|
-
module Import
|
6
|
-
|
7
|
-
|
8
|
-
class LeagueReader
|
9
|
-
|
10
|
-
def catalog() Import.catalog; end
|
11
|
-
|
12
|
-
|
13
|
-
def self.read( path ) ## use - rename to read_file or from_file etc. - why? why not?
|
14
|
-
txt = File.open( path, 'r:utf-8' ) { |f| f.read }
|
15
|
-
parse( txt )
|
16
|
-
end
|
17
|
-
|
18
|
-
def self.parse( txt )
|
19
|
-
new( txt ).parse
|
20
|
-
end
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
include Logging
|
25
|
-
|
26
|
-
def initialize( txt )
|
27
|
-
@txt = txt
|
28
|
-
end
|
29
|
-
|
30
|
-
def parse
|
31
|
-
recs = []
|
32
|
-
last_rec = nil
|
33
|
-
|
34
|
-
country = nil # last country
|
35
|
-
intl = false # is international (league/tournament/cup/competition)
|
36
|
-
clubs = true # or clubs|national teams
|
37
|
-
|
38
|
-
OutlineReader.parse( @txt ).each do |node|
|
39
|
-
if [:h1,:h2,:h3,:h4,:h5,:h6].include?( node[0] )
|
40
|
-
heading_level = node[0][1].to_i
|
41
|
-
heading = node[1]
|
42
|
-
|
43
|
-
logger.debug "heading #{heading_level} >#{heading}<"
|
44
|
-
|
45
|
-
if heading_level != 1
|
46
|
-
puts "** !!! ERROR !!! unsupported heading level; expected heading 1 for now only; sorry"
|
47
|
-
pp line
|
48
|
-
exit 1
|
49
|
-
else
|
50
|
-
logger.debug "heading (#{heading_level}) >#{heading}<"
|
51
|
-
last_heading = heading
|
52
|
-
## map to country or international / int'l or national teams
|
53
|
-
if heading =~ /national team/i ## national team tournament
|
54
|
-
country = nil
|
55
|
-
intl = true
|
56
|
-
clubs = false
|
57
|
-
elsif heading =~ /international|int'l/i ## int'l club tournament
|
58
|
-
country = nil
|
59
|
-
intl = true
|
60
|
-
clubs = true
|
61
|
-
else
|
62
|
-
## assume country in heading; allow all "formats" supported by parse e.g.
|
63
|
-
## Österreich • Austria (at)
|
64
|
-
## Österreich • Austria
|
65
|
-
## Austria
|
66
|
-
## Deutschland (de) • Germany
|
67
|
-
country = catalog.countries.parse( heading )
|
68
|
-
intl = false
|
69
|
-
clubs = true
|
70
|
-
|
71
|
-
## check country code - MUST exist for now!!!!
|
72
|
-
if country.nil?
|
73
|
-
puts "!!! error [league reader] - unknown country >#{heading}< - sorry - add country to config to fix"
|
74
|
-
exit 1
|
75
|
-
end
|
76
|
-
end
|
77
|
-
end
|
78
|
-
elsif node[0] == :p ## paragraph with (text) lines
|
79
|
-
lines = node[1]
|
80
|
-
lines.each do |line|
|
81
|
-
|
82
|
-
if line.start_with?( '|' )
|
83
|
-
## assume continuation with line of alternative names
|
84
|
-
## note: skip leading pipe
|
85
|
-
values = line[1..-1].split( '|' ) # team names - allow/use pipe(|)
|
86
|
-
## 1) strip (commerical) sponsor markers/tags e.g. $$ Liga $$BBV$$ MX
|
87
|
-
## 2) strip and squish (white)spaces
|
88
|
-
# e.g. New York FC (2011-) => New York FC (2011-)
|
89
|
-
values = values.map { |value| value.gsub( '$', '' )
|
90
|
-
.gsub( /[ \t]+/, ' ' )
|
91
|
-
.strip }
|
92
|
-
logger.debug "alt_names: #{values.join( '|' )}"
|
93
|
-
|
94
|
-
last_rec.alt_names += values
|
95
|
-
else
|
96
|
-
## assume "regular" line
|
97
|
-
## check if starts with id (todo/check: use a more "strict"/better regex capture pattern!!!)
|
98
|
-
if line =~ /^([a-z0-9][a-z0-9.]*)[ ]+(.+)$/
|
99
|
-
league_key = $1
|
100
|
-
## 1) strip (commercial) sponsor markers/tags e.g $$
|
101
|
-
## 2) strip and squish (white)spaces
|
102
|
-
league_name = $2.gsub( '$', '' )
|
103
|
-
.gsub( /[ \t]+/, ' ' )
|
104
|
-
.strip
|
105
|
-
|
106
|
-
logger.debug "key: >#{league_key}<, name: >#{league_name}<"
|
107
|
-
|
108
|
-
|
109
|
-
alt_names_auto = []
|
110
|
-
if country
|
111
|
-
alt_names_auto << "#{country.key.upcase} #{league_key.upcase.gsub('.', ' ')}"
|
112
|
-
## todo/check: add "hack" for cl (chile) and exclude?
|
113
|
-
## add a list of (auto-)excluded country codes with conflicts? why? why not?
|
114
|
-
## cl - a) Chile b) Champions League
|
115
|
-
alt_names_auto << "#{country.key.upcase}" if league_key == '1' ## add shortcut for top level 1 (just country key)
|
116
|
-
if country.key.upcase != country.code
|
117
|
-
alt_names_auto << "#{country.code} #{league_key.upcase.gsub('.', ' ')}"
|
118
|
-
alt_names_auto << "#{country.code}" if league_key == '1' ## add shortcut for top level 1 (just country key)
|
119
|
-
end
|
120
|
-
alt_names_auto << "#{country.name} #{league_key}" if league_key =~ /^[0-9]+$/ ## if all numeric e.g. add Austria 1 etc.
|
121
|
-
|
122
|
-
## auto-add with country prepended
|
123
|
-
## e.g. England Premier League, Austria Bundesliga etc.
|
124
|
-
## todo/check: also add variants with country alt name if present!!!
|
125
|
-
## todo/check: exclude cups or such from country + league name auto-add - why? why not?
|
126
|
-
alt_names_auto << "#{country.name} #{league_name}"
|
127
|
-
else ## assume int'l (no country) e.g. champions league, etc.
|
128
|
-
## only auto-add key (e.g. CL, EL, etc.)
|
129
|
-
alt_names_auto << league_key.upcase.gsub('.', ' ') ## note: no country code (prefix/leading) used
|
130
|
-
end
|
131
|
-
|
132
|
-
## pp alt_names_auto
|
133
|
-
|
134
|
-
## prepend country key/code if country present
|
135
|
-
## todo/fix: only auto-prepend country if key/code start with a number (level) or incl. cup
|
136
|
-
## why? lets you "overwrite" key if desired - use it - why? why not?
|
137
|
-
if country
|
138
|
-
league_key = "#{country.key}.#{league_key}"
|
139
|
-
end
|
140
|
-
|
141
|
-
rec = League.new( key: league_key,
|
142
|
-
name: league_name,
|
143
|
-
alt_names_auto: alt_names_auto,
|
144
|
-
country: country,
|
145
|
-
intl: intl,
|
146
|
-
clubs: clubs)
|
147
|
-
recs << rec
|
148
|
-
last_rec = rec
|
149
|
-
else
|
150
|
-
puts "** !!! ERROR !!! missing key for (canonical) league name"
|
151
|
-
exit 1
|
152
|
-
end
|
153
|
-
end
|
154
|
-
end # each line
|
155
|
-
else
|
156
|
-
puts "** !!! ERROR !!! [league reader] - unknown line type:"
|
157
|
-
pp node
|
158
|
-
exit 1
|
159
|
-
end
|
160
|
-
## pp line
|
161
|
-
end
|
162
|
-
recs
|
163
|
-
end # method parse
|
164
|
-
|
165
|
-
end # class LeagueReader
|
166
|
-
|
167
|
-
end ## module Import
|
168
|
-
end ## module SportDb
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
module SportDb
|
5
|
+
module Import
|
6
|
+
|
7
|
+
|
8
|
+
class LeagueReader
|
9
|
+
|
10
|
+
def catalog() Import.catalog; end
|
11
|
+
|
12
|
+
|
13
|
+
def self.read( path ) ## use - rename to read_file or from_file etc. - why? why not?
|
14
|
+
txt = File.open( path, 'r:utf-8' ) { |f| f.read }
|
15
|
+
parse( txt )
|
16
|
+
end
|
17
|
+
|
18
|
+
def self.parse( txt )
|
19
|
+
new( txt ).parse
|
20
|
+
end
|
21
|
+
|
22
|
+
|
23
|
+
|
24
|
+
include Logging
|
25
|
+
|
26
|
+
def initialize( txt )
|
27
|
+
@txt = txt
|
28
|
+
end
|
29
|
+
|
30
|
+
def parse
|
31
|
+
recs = []
|
32
|
+
last_rec = nil
|
33
|
+
|
34
|
+
country = nil # last country
|
35
|
+
intl = false # is international (league/tournament/cup/competition)
|
36
|
+
clubs = true # or clubs|national teams
|
37
|
+
|
38
|
+
OutlineReader.parse( @txt ).each do |node|
|
39
|
+
if [:h1,:h2,:h3,:h4,:h5,:h6].include?( node[0] )
|
40
|
+
heading_level = node[0][1].to_i
|
41
|
+
heading = node[1]
|
42
|
+
|
43
|
+
logger.debug "heading #{heading_level} >#{heading}<"
|
44
|
+
|
45
|
+
if heading_level != 1
|
46
|
+
puts "** !!! ERROR !!! unsupported heading level; expected heading 1 for now only; sorry"
|
47
|
+
pp line
|
48
|
+
exit 1
|
49
|
+
else
|
50
|
+
logger.debug "heading (#{heading_level}) >#{heading}<"
|
51
|
+
last_heading = heading
|
52
|
+
## map to country or international / int'l or national teams
|
53
|
+
if heading =~ /national team/i ## national team tournament
|
54
|
+
country = nil
|
55
|
+
intl = true
|
56
|
+
clubs = false
|
57
|
+
elsif heading =~ /international|int'l/i ## int'l club tournament
|
58
|
+
country = nil
|
59
|
+
intl = true
|
60
|
+
clubs = true
|
61
|
+
else
|
62
|
+
## assume country in heading; allow all "formats" supported by parse e.g.
|
63
|
+
## Österreich • Austria (at)
|
64
|
+
## Österreich • Austria
|
65
|
+
## Austria
|
66
|
+
## Deutschland (de) • Germany
|
67
|
+
country = catalog.countries.parse( heading )
|
68
|
+
intl = false
|
69
|
+
clubs = true
|
70
|
+
|
71
|
+
## check country code - MUST exist for now!!!!
|
72
|
+
if country.nil?
|
73
|
+
puts "!!! error [league reader] - unknown country >#{heading}< - sorry - add country to config to fix"
|
74
|
+
exit 1
|
75
|
+
end
|
76
|
+
end
|
77
|
+
end
|
78
|
+
elsif node[0] == :p ## paragraph with (text) lines
|
79
|
+
lines = node[1]
|
80
|
+
lines.each do |line|
|
81
|
+
|
82
|
+
if line.start_with?( '|' )
|
83
|
+
## assume continuation with line of alternative names
|
84
|
+
## note: skip leading pipe
|
85
|
+
values = line[1..-1].split( '|' ) # team names - allow/use pipe(|)
|
86
|
+
## 1) strip (commerical) sponsor markers/tags e.g. $$ Liga $$BBV$$ MX
|
87
|
+
## 2) strip and squish (white)spaces
|
88
|
+
# e.g. New York FC (2011-) => New York FC (2011-)
|
89
|
+
values = values.map { |value| value.gsub( '$', '' )
|
90
|
+
.gsub( /[ \t]+/, ' ' )
|
91
|
+
.strip }
|
92
|
+
logger.debug "alt_names: #{values.join( '|' )}"
|
93
|
+
|
94
|
+
last_rec.alt_names += values
|
95
|
+
else
|
96
|
+
## assume "regular" line
|
97
|
+
## check if starts with id (todo/check: use a more "strict"/better regex capture pattern!!!)
|
98
|
+
if line =~ /^([a-z0-9][a-z0-9.]*)[ ]+(.+)$/
|
99
|
+
league_key = $1
|
100
|
+
## 1) strip (commercial) sponsor markers/tags e.g $$
|
101
|
+
## 2) strip and squish (white)spaces
|
102
|
+
league_name = $2.gsub( '$', '' )
|
103
|
+
.gsub( /[ \t]+/, ' ' )
|
104
|
+
.strip
|
105
|
+
|
106
|
+
logger.debug "key: >#{league_key}<, name: >#{league_name}<"
|
107
|
+
|
108
|
+
|
109
|
+
alt_names_auto = []
|
110
|
+
if country
|
111
|
+
alt_names_auto << "#{country.key.upcase} #{league_key.upcase.gsub('.', ' ')}"
|
112
|
+
## todo/check: add "hack" for cl (chile) and exclude?
|
113
|
+
## add a list of (auto-)excluded country codes with conflicts? why? why not?
|
114
|
+
## cl - a) Chile b) Champions League
|
115
|
+
alt_names_auto << "#{country.key.upcase}" if league_key == '1' ## add shortcut for top level 1 (just country key)
|
116
|
+
if country.key.upcase != country.code
|
117
|
+
alt_names_auto << "#{country.code} #{league_key.upcase.gsub('.', ' ')}"
|
118
|
+
alt_names_auto << "#{country.code}" if league_key == '1' ## add shortcut for top level 1 (just country key)
|
119
|
+
end
|
120
|
+
alt_names_auto << "#{country.name} #{league_key}" if league_key =~ /^[0-9]+$/ ## if all numeric e.g. add Austria 1 etc.
|
121
|
+
|
122
|
+
## auto-add with country prepended
|
123
|
+
## e.g. England Premier League, Austria Bundesliga etc.
|
124
|
+
## todo/check: also add variants with country alt name if present!!!
|
125
|
+
## todo/check: exclude cups or such from country + league name auto-add - why? why not?
|
126
|
+
alt_names_auto << "#{country.name} #{league_name}"
|
127
|
+
else ## assume int'l (no country) e.g. champions league, etc.
|
128
|
+
## only auto-add key (e.g. CL, EL, etc.)
|
129
|
+
alt_names_auto << league_key.upcase.gsub('.', ' ') ## note: no country code (prefix/leading) used
|
130
|
+
end
|
131
|
+
|
132
|
+
## pp alt_names_auto
|
133
|
+
|
134
|
+
## prepend country key/code if country present
|
135
|
+
## todo/fix: only auto-prepend country if key/code start with a number (level) or incl. cup
|
136
|
+
## why? lets you "overwrite" key if desired - use it - why? why not?
|
137
|
+
if country
|
138
|
+
league_key = "#{country.key}.#{league_key}"
|
139
|
+
end
|
140
|
+
|
141
|
+
rec = League.new( key: league_key,
|
142
|
+
name: league_name,
|
143
|
+
alt_names_auto: alt_names_auto,
|
144
|
+
country: country,
|
145
|
+
intl: intl,
|
146
|
+
clubs: clubs)
|
147
|
+
recs << rec
|
148
|
+
last_rec = rec
|
149
|
+
else
|
150
|
+
puts "** !!! ERROR !!! missing key for (canonical) league name"
|
151
|
+
exit 1
|
152
|
+
end
|
153
|
+
end
|
154
|
+
end # each line
|
155
|
+
else
|
156
|
+
puts "** !!! ERROR !!! [league reader] - unknown line type:"
|
157
|
+
pp node
|
158
|
+
exit 1
|
159
|
+
end
|
160
|
+
## pp line
|
161
|
+
end
|
162
|
+
recs
|
163
|
+
end # method parse
|
164
|
+
|
165
|
+
end # class LeagueReader
|
166
|
+
|
167
|
+
end ## module Import
|
168
|
+
end ## module SportDb
|