worldfootball 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,186 @@
1
+
2
+ module Worldfootball
3
+ class Page
4
+
5
+ class Report < Page ## note: use nested class for now - why? why not?
6
+
7
+
8
## Build a report page for the given slug:
##   resolve the report url for the slug, read the html
##   for that url via Webcache and wrap it in a new page object.
def self.from_cache( slug )
  new( Webcache.read( Metal.report_url( slug ) ) )
end
13
+
14
+
15
+
16
## Find the goals ("Tore") table in the match report.
##
## The page holds several tables with the css class standard_tabelle;
## the goals table is the one whose very first cell reads "Tore", e.g.
##
##   <table class="standard_tabelle" cellpadding="3" cellspacing="1">
##     <tr>
##       <td colspan="2" class="ueberschrift" align="center">Tore</td>
##     </tr>
##
## Returns the first matching table node or nil if nothing found.
def find_table_tore
  doc.css( 'table.standard_tabelle' ).find do |table|
    first_row = table.css( 'tr' )[0]
    ## note: skip tables without any rows (or cells) instead of
    ##        crashing with a NoMethodError on nil
    first_cell = first_row && first_row.css( 'td' )[0]

    first_cell && first_cell.text == 'Tore'
  end
end
39
+
40
## Parse the goals ("Tore") table of the match report.
##
## Returns an array with one hash per goal (in table order), e.g.
##
##   { score:  '0-1',
##     team:   2,                 # 1 or 2 - derived from the score advance
##     player: 'Luis Phelipe',
##     minute: '34',
##     notes:  'Rechtsschuss (Alexander Prass)' }
##
## plus owngoal: true (Eigentor) or penalty: true (Elfmeter) if flagged;
## the flag text gets removed from the notes and
## the notes key gets only added if not empty.
## A table with "keine" in the first row after the headline results in [].
## The result is memoized in @goals.
##
## note: prints every parsed goal line (for debugging) and
##       exits the process (exit 1) on any unexpected format.
def goals
  @goals ||= begin
    table = find_table_tore
    if table.nil?
      ## note: report missing table (instead of crashing on nil)
      puts "!! ERROR - no goals (Tore) table found"
      exit 1
    end

    rows = []
    last_score1 = 0
    last_score2 = 0

    table.css( 'tr' ).each_with_index do |tr,i|
      next  if i==0    # skip Tore headline row
      break if i==1 && tr.text.strip == 'keine'   ## assume 0:0 - no goals

      # <tr>
      #   <td class="hell" align="center" width="20%">
      #     <b>0 : 1</b>
      #   </td>
      #   <td class="hell" style="padding-left: 50px;">
      #     <a href="/spieler_profil/luis-phelipe/" title="Luis Phelipe">Luis Phelipe</a> 34. / Rechtsschuss &nbsp;(<a href="/spieler_profil/alexander-prass/" title="Alexander Prass">Alexander Prass</a>)
      #   </td>
      # </tr>
      tds = tr.css( 'td' )

      score_str  = squish( tds[0].text )
      player_str = squish( tds[1].text )

      ## note: for debugging - print as we go along (parsing)
      print '[%03d] ' % i
      print score_str
      print " | "
      print player_str
      print "\n"

      ## change 0 : 1 to 0-1
      score_str = score_str.gsub( ':', '-' )
      score_str = score_str.gsub( ' ', '' )   ## remove all white space

      ### todo/fix: use new Score.split helper here
      ##   score1, score2 = Score.split( score_str )
      score_parts = score_str.split('-')
      score1 = score_parts[0].to_i
      score2 = score_parts[1].to_i

      ## the team that scored is the side whose score advanced by one
      if last_score1+1 == score1 && last_score2 == score2
        team = 1
      elsif last_score2+1 == score2 && last_score1 == score1
        team = 2
      else
        puts "!! ERROR - unexpected score advance (one goal at a time expected):"
        puts "  #{last_score1}-#{last_score2}=> #{score1}-#{score2}"
        exit 1
      end

      last_score1 = score1
      last_score2 = score2

      if player_str.index('/')
        ## e.g. Luis Phelipe 34. / Rechtsschuss (Alexander Prass)
        ## note: split on the first divider only -
        ##   keeps notes with more slashes complete and
        ##   a trailing divider results in empty notes (not nil)
        parts = player_str.split( '/', 2 )
        notes = parts[1].strip

        if parts[0].strip =~ /^([^0-9]+)[ ]+([0-9]+)\.$/
          player_name = $1
          goal_minute = $2
        else
          puts "!! ERROR - unknown goal format (in part i):"
          puts player_str
          pp parts
          exit 1
        end
      else   # simple line with no divider (/)
        #  Andrés Andrade 88.  (Nicolas Meister)
        if m = %r{^([^0-9]+)
                    [ ]+
                   ([0-9]+)\.
                   (?:
                     [ ]+
                     (\([^)]+\))
                    )?
                    $}x.match( player_str )
          player_name = m[1]
          goal_minute = m[2]
          notes       = m[3] ? m[3] : ''
        else
          puts "!! ERROR - unknown goal format:"
          puts player_str
          exit 1
        end
      end

      ## check for "flags" e.g. own goal or penalty
      ##   if found - remove from notes (use its own flag)
      owngoal = false
      penalty = false

      if notes.index( 'Eigentor' )
        owngoal = true
        notes = notes.sub('Eigentor', '' ).strip
      elsif notes.index( 'Elfmeter' )
        ## e.g. Elfmeter  (Marco Hausjell)
        penalty = true
        notes = notes.sub('Elfmeter', '' ).strip
      end

      rec = { score:  score_str,
              team:   team,   # 1 or 2
              player: player_name,
              minute: goal_minute
            }
      rec[:owngoal] = true   if owngoal
      rec[:penalty] = true   if penalty
      rec[:notes]   = notes  unless notes.empty?

      rows << rec
    end  ## each tr
    rows
  end
end # goals
180
+
181
+
182
+ end # class Report
183
+
184
+
185
+ end # class Page
186
+ end # module Worldfootball
@@ -0,0 +1,292 @@
1
+
2
+ module Worldfootball
3
+ class Page
4
+
5
+ class Schedule < Page ## note: use nested class for now - why? why not?
6
+
7
+
8
## Build a schedule page for the given slug:
##   resolve the schedule url for the slug, read the html
##   for that url via Webcache and wrap it in a new page object.
def self.from_cache( slug )
  new( Webcache.read( Metal.schedule_url( slug ) ) )
end
13
+
14
+
15
+
16
## Parse the schedule table into match records.
##
## Walks all rows of the first div.data > table.standard_tabelle;
## rows with a round title (e.g. 1. Spieltag, Achtelfinale, Gruppe A)
## set the round for the match rows that follow.
##
## Returns an array of (flat) hashes, e.g.
##
##   { round: '1. Spieltag', date: '2020-08-01', time: '16:00',
##     team1: 'Hibernian FC',     team1_ref: 'hibernian-fc',
##     score: '2-1 (1-1)',
##     team2: 'St. Johnstone FC', team2_ref: 'st-johnstone-fc',
##     report_ref: 'premiership-2020-2021-hibernian-fc-st-johnstone-fc' }
##
## The team refs and the report ref are nil if the cell has no link.
## The result is memoized in @matches.
##
## note: prints every parsed line (for debugging) as we go along.
def matches
  @matches ||= begin
    # <div class="data">
    # <table class="standard_tabelle" cellpadding="3" cellspacing="1">

    ## note: use > for "strict" child (without any in-betweens)
    table = doc.css( 'div.data > table.standard_tabelle' ).first
    if table.nil?
      ## note: report missing table (instead of crashing on nil)
      puts "!! ERROR: no schedule table found (div.data > table.standard_tabelle)"
      exit 1
    end

    rows = []
    i = 0
    last_date_str = nil
    last_round    = nil

    ## ghost trs? what for? see for an example in bra
    ##   check for style display:none - why? why not?
    ##
    ## <tr class="e2-parent" data-liga_id="88" data-gs_match_id="9062777"
    ##     style="display:none;">
    ##   <td colspan="2"></td>
    ##   <td colspan="3">
    ##     <span class="e2" data-liga_id="88" data-gs_match_id="9062777"></span>
    ##   </td>
    ##   <td colspan="2"></td>
    ## </tr>

    table.css( 'tr' ).each do |tr|
      style = tr['style']
      if style && style.index( 'display' ) && style.index( 'none' )
        puts "!! WARN: skipping ghost line >#{tr.text.strip}<"
        next
      end

      i += 1
      line = tr.text.strip

      if line =~ /Spieltag/ ||
         line =~ /[1-9]\.[ ]Runde|
                    Qual\.[ ][1-9]\.[ ]Runde|  # see EL or CL Quali
                    Qualifikation|             # see CA Championship
                    Sechzehntelfinale|         # see EL
                    Achtelfinale|
                    Viertelfinale|
                    Halbfinale|
                    Finale|
                    Gruppe[ ][A-Z]|            # see CL
                    Playoffs                   # see EL Quali
                  /x
        puts
        print '[%03d] ' % i
        print "round >#{line}<"
        print "\n"

        last_round = line
        next
      end

      ## assume table row (tr) is match line
      tds = tr.css( 'td' )

      date_str = squish( tds[0].text )
      time_str = squish( tds[1].text )

      ## note: an empty date cell means same date as the match line before
      date_str = last_date_str  if date_str.empty?

      ## note: for debugging - print as we go along (parsing)
      print '[%03d] ' % i
      print "%-10s | " % date_str
      print "%-5s | " % time_str

      ## <td><a href="/teams/hibernian-fc/" title="Hibernian FC">Hibernian FC</a></td>
      ## todo/check: check if tooltip title always equals text - why? why not?
      team1_anchor = tds[2].css( 'a' )[0]
      if team1_anchor   # note: <a> might be optional (and team name only be plain text)
        team1_str = squish( team1_anchor.text )
        team1_ref = norm_team_ref( team1_anchor[:href] )
      else
        team1_str = squish( tds[2].text )
        team1_ref = nil
        puts "!! WARN: no team1_ref for >#{team1_str}< found"
      end

      print "%-22s | " % team1_str

      ## <td> - </td>
      vs_str = squish( tds[3].text )   ## use to assert column!!!
      assert( vs_str == '-', "- for vs. expected; got #{vs_str}")

      ## <td><a href="/teams/st-johnstone-fc/" title="St. Johnstone FC">St. Johnstone FC</a></td>
      team2_anchor = tds[4].css( 'a' )[0]
      if team2_anchor
        team2_str = squish( team2_anchor.text )
        team2_ref = norm_team_ref( team2_anchor[:href] )
      else
        team2_str = squish( tds[4].text )
        team2_ref = nil
        puts "!! WARN: no team2_ref for >#{team2_str}< found"
      end

      print "%-22s | " % team2_str

      ## <a href="/spielbericht/premiership-2020-2021-hibernian-fc-st-johnstone-fc/" title="Spielschema Hibernian FC - St. Johnstone FC">-:-</a>
      score_anchor = tds[5].css( 'a' )[0]
      if score_anchor   ## note: score ref (match report) is optional!!!!
        score_str = squish( score_anchor.text )
        score_ref = norm_score_ref( score_anchor[:href] )
      else
        score_str = squish( tds[5].text )
        score_ref = nil
      end

      ## todo - find a better way to check for live match
      ##  check for live badge image
      ## <td>
      ##   <img src="https://s.hs-data.com/bilder/shared/live/2.png" /></a>
      ## </td>
      ## note: the badge cell (and the src attribute) is optional -
      ##         do NOT crash on match lines without it
      badge_td = tds[6]
      img      = badge_td && badge_td.css( 'img' )[0]
      if img && img[:src] && img[:src].index( '/live/' )
        puts "!! WARN: live match badge, resetting score from #{score_str} to -:-"
        score_str = '-:-'   # note: -:- gets replaced to ---
      end

      print "%-10s | " % score_str
      print( score_ref ? score_ref : 'n/a' )
      print "\n"

      ## change 2:1 (1:1) to 2-1 (1-1)
      score_str = score_str.gsub( ':', '-' )

      ## convert date from 25.10.2019 to 2019-10-25
      date = Date.strptime( date_str, '%d.%m.%Y' )

      ## note: keep structure flat for now
      ##   (AND not nested e.g. team:{text:,ref:}) - why? why not?
      rows << { round:      last_round,
                date:       date.strftime( '%Y-%m-%d' ),
                time:       time_str,
                team1:      team1_str,
                team1_ref:  team1_ref,
                score:      score_str,
                team2:      team2_str,
                team2_ref:  team2_ref,
                report_ref: score_ref
              }

      last_date_str = date_str
    end # each tr (table row)

    rows
  end
end # matches
189
+
190
+
191
+
192
## Collect all teams used in the matches (memoized in @teams).
##
## Returns an array of records (in order of first appearance), e.g.
##   { count: 2, name: 'Hibernian FC', ref: 'hibernian-fc' }
## where count is the number of matches the team name shows up in and
## ref is the ref seen together with the first appearance of the name.
def teams
  @teams ||= begin
    by_name = {}
    matches.each do |match|
      ## index by name/text for now NOT ref - why? why not?
      sides = [[match[:team1], match[:team1_ref]],
               [match[:team2], match[:team2_ref]]]

      sides.each do |name, ref|
        by_name[ name ] ||= { count: 0,
                              name:  name,
                              ref:   ref }
        by_name[ name ][ :count ] += 1
        ## todo/check: check/assert that name and ref are always equal - why? why not?
      end
    end

    by_name.values
  end
end
212
+
213
## Collect all rounds used in the matches (memoized in @rounds).
##
## Returns an array of records (in order of first appearance), e.g.
##   { count: 6, name: '1. Spieltag' }
## where count is the number of matches in the round.
def rounds
  @rounds ||= begin
    by_round = {}
    matches.each do |match|
      name = match[:round]
      by_round[ name ] ||= { count: 0,
                             name:  name }
      by_round[ name ][ :count ] += 1
    end

    by_round.values
  end
end
225
+
226
+
227
## Collect all seasons from the season drop-down (memoized in @seasons).
##
##   <select name="saison" ...
##
## Returns an array with one record per option, that is,
##   the (squished) option text and
##   the option value normalized via norm_season_ref.
def seasons
  @seasons ||= begin
    select = doc.css( 'select[name="saison"]').first

    select.css( 'option' ).map do |option|
      { text: squish( option.text ),
        ref:  norm_season_ref( option[:value] )
      }
    end
  end
end
242
+
243
+
244
+ ######
245
+ ## helpers
246
+
247
+ ## todo/check - rename/use HREF and not REF - why? why not?
248
## expected format for score (match report) hrefs e.g.
##   /spielbericht/premiership-2020-2021-hibernian-fc-st-johnstone-fc/
REF_SCORE_RE = %r{^/spielbericht/
                    ([a-z0-9_-]+)/$}x

## Check the score (match report) href format and
##   return the slug only (without the /spielbericht/ path and slashes).
## note: prints an error and exits the process (exit 1)
##         if the href does not match the expected format.
def norm_score_ref( str )
  match = REF_SCORE_RE.match( str )
  return match[1]  if match

  puts "!! ERROR: unexpected score href format >#{str}<"
  exit 1
end
260
+
261
+
262
## expected format for team hrefs e.g.
##   /teams/hibernian-fc/
REF_TEAM_RE = %r{^/teams/
                   ([a-z0-9_-]+)/$}x

## Check the team href format and
##   return the slug only (without the /teams/ path and slashes).
## note: prints an error and exits the process (exit 1)
##         if the href does not match the expected format.
def norm_team_ref( str )
  match = REF_TEAM_RE.match( str )
  return match[1]  if match

  puts "!! ERROR: unexpected team href format >#{str}<"
  exit 1
end
274
+
275
+
276
## expected format for season hrefs (option values) e.g.
##   /alle_spiele/aut-bundesliga-2020-2021/
REF_SEASON_RE = %r{^/alle_spiele/
                     ([a-z0-9_-]+)/$}x

## Check the season href format and
##   return the slug only (without the /alle_spiele/ path and slashes).
## note: prints an error and exits the process (exit 1)
##         if the href does not match the expected format.
def norm_season_ref( str )
  match = REF_SEASON_RE.match( str )
  return match[1]  if match

  puts "!! ERROR: unexpected season href format >#{str}<"
  exit 1
end
288
+ end # class Schedule
289
+
290
+
291
+ end # class Page
292
+ end # module Worldfootball
@@ -0,0 +1,66 @@
1
module Worldfootball

  ## all known columns (in order)
  MAX_HEADERS = [
    'Stage',
    'Round',
    'Date',
    'Time',
    'Team 1',
    'FT',
    'HT',
    'Team 2',
    'ET',
    'P',
    'Comments']    ## e.g. awarded, cancelled/canceled, etc.

  MIN_HEADERS = [   ## always keep even if all empty
    'Date',
    'Team 1',
    'FT',
    'Team 2'
  ]

  ## Strip/remove unused columns, that is,
  ##   columns with no values (nil or empty) in any row.
  ##
  ## rows          - array of rows (arrays of cells);
  ##                   cells line up by index with headers;
  ##                   cells must be nil or respond to empty? (e.g. strings)
  ## headers       - column names for the rows (default: MAX_HEADERS)
  ## fixed_headers - column names to always keep even if all empty
  ##                   (default: MIN_HEADERS)
  ##
  ## Returns a pair [rows, headers] with the unused columns removed;
  ##   if no column is unused the rows get passed through as is.
  ##
  ## note: the headers passed in are now really used -
  ##   before the headers keyword was ignored and
  ##   MAX_HEADERS always used for counting and for the header lookup.
  ## note: rows with more cells than headers are not supported.
  def self.vacuum( rows, headers: MAX_HEADERS, fixed_headers: MIN_HEADERS )
    ## count the non-empty cells per column
    counter = Array.new( headers.size, 0 )
    rows.each do |row|
      row.each_with_index do |col, idx|
        counter[idx] += 1  unless col.nil? || col.empty?
      end
    end

    pp counter

    ## split the columns into used (or fixed) and empty
    used_indices, empty_indices = counter.each_index.partition do |idx|
      counter[idx] > 0 || fixed_headers.include?( headers[idx] )
    end

    used_headers = used_indices.map { |idx| headers[idx] }

    unless empty_indices.empty?
      rows = rows.map do |row|
        row.reject.with_index { |_col, idx| empty_indices.include?( idx ) }
      end
    end

    [rows, used_headers]
  end

end # module Worldfootball
@@ -0,0 +1,20 @@
1
+
2
module Worldfootball
  ## version parts  (todo: namespace inside version or something - why? why not??)
  MAJOR = 0
  MINOR = 1
  PATCH = 1
  VERSION = "#{MAJOR}.#{MINOR}.#{PATCH}"

  ## the version string e.g. 0.1.1
  def self.version() VERSION; end

  ## one-line banner with the library version, the ruby version,
  ##   release date & platform and the root directory
  def self.banner
    ruby_info = "Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]"
    "worldfootball/#{VERSION} on #{ruby_info} in (#{root})"
  end

  ## the (expanded) directory three levels up from this file
  def self.root
    dir = __FILE__
    3.times { dir = File.dirname( dir ) }
    File.expand_path( dir )
  end
end  # module Worldfootball
20
+
@@ -0,0 +1,66 @@
1
+ ## 3rd party (our own)
2
+ require 'season/formats' ## add season support
3
+ require 'webget' ## incl. webget, webcache, webclient, etc.
4
+
5
+ require 'cocos'
6
+
7
+ ## 3rd party
8
+ require 'nokogiri'
9
+
10
+
11
+
12
+ ###
13
+ # our own code
14
+ require_relative 'worldfootball/version'
15
+ require_relative 'worldfootball/leagues'
16
+ require_relative 'worldfootball/download'
17
+ require_relative 'worldfootball/page'
18
+ require_relative 'worldfootball/page_schedule'
19
+ require_relative 'worldfootball/page_report'
20
+
21
+
22
+ require_relative 'worldfootball/mods'
23
+ require_relative 'worldfootball/vacuum'
24
+ require_relative 'worldfootball/build'
25
+ require_relative 'worldfootball/convert'
26
+ require_relative 'worldfootball/convert_reports'
27
+
28
+
29
+ require_relative 'worldfootball/generator'
30
+
31
+
32
+
33
module Worldfootball

  class Configuration
    #########
    ## nested configuration classes - use - why? why not?
    class Convert
      ## set the output directory for the convert step
      attr_writer :out_dir

      ## the output directory for the convert step (default: ./o)
      def out_dir
        @out_dir || './o'
      end
    end

    ## the (lazily created) settings for the convert step
    def convert
      @convert ||= Convert.new
    end
  end # class Configuration

  ## lets you use
  ##   Worldfootball.configure do |config|
  ##      config.convert.out_dir = './o'
  ##   end
  def self.configure
    yield( config )
  end

  ## the (lazily created) shared configuration
  def self.config
    @config ||= Configuration.new
  end

end   # module Worldfootball
54
+
55
+
56
+
57
+
58
+ ### for processing tool
59
+ ## (auto-)add sportdb/writer (pulls in sportdb/catalogs and gitti)
60
+ # require 'sportdb/writers'
61
+
62
+
63
+
64
+
65
+ puts Worldfootball.banner ## say hello
66
+