football-sources 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,95 +0,0 @@
1
module Fbref

  ## Convert the cached schedule page of a league/season into a match csv file.
  ##
  ## Steps (all visible below): load the page from the cache, build the
  ## match records, beautify the date column, strip unused columns and
  ## write the result to "<out_dir>/<league>_<season path>.csv".
  ##
  ## @param league  league key - passed through to Page::Schedule and build
  ## @param season  season - passed through as-is and parsed with Season.parse
  ##                only for building the output path
  def self.convert( league:, season: )
    page = Page::Schedule.from_cache( league: league,
                                      season: season )

    puts page.title

    rows = page.matches
    recs = build( rows, league: league, season: season )
    ## pp rows

    ## reformat date / beautify e.g. Sat Aug 7 1993
    ##   note: date is expected in the third column in the format 1993-08-07
    recs.each { |rec| rec[2] = Date.strptime( rec[2], '%Y-%m-%d' ).strftime( '%a %b %-d %Y' ) }

    recs, headers = vacuum( recs )
    pp recs[0..2]

    season = Season.parse( season )
    path = "#{config.convert.out_dir}/#{league}_#{season.to_path}.csv"
    puts "write #{path}..."
    Cache::CsvMatchWriter.write( path, recs, headers: headers )
  end


  #####
  # vacuum helper stuff - todo/fix - (re)use - make more generic - why? why not?

  ## all known columns (in order) - the default for vacuum
  MAX_HEADERS = [
    'Stage',
    'Round',
    'Date',
    'Time',
    'Team 1',
    'FT',
    'HT',
    'Team 2',
    'ET',
    'P',
    'Venue',
    'Att',
    'Comments',    ## e.g. awarded, cancelled/canceled, etc.
  ]

  MIN_HEADERS = [   ## always keep even if all empty
    'Date',
    'Team 1',
    'FT',
    'Team 2'
  ]

  ## Strip columns that are empty (nil or empty string) in every row.
  ##
  ## @param rows          array of rows (arrays of strings/nils); columns are
  ##                      matched to headers by position
  ## @param headers       column names (in column order) - defaults to MAX_HEADERS
  ## @param fixed_headers column names to always keep - even if all empty
  ## @return [Array]      two-element array [rows, headers] - the rows without
  ##                      the unused columns and the names of the kept columns;
  ##                      the rows array is returned as-is if nothing gets removed
  ## @raise [ArgumentError] if a row has more columns than headers
  def self.vacuum( rows, headers: MAX_HEADERS, fixed_headers: MIN_HEADERS )
    ## note: keep a reference to the passed-in headers
    ##   (the list was ignored before and MAX_HEADERS always used)
    all_headers = headers

    ## check for unused columns - count the non-empty values per column
    counter = Array.new( all_headers.size, 0 )
    rows.each do |row|
      if row.size > all_headers.size
        raise ArgumentError, "row with #{row.size} columns but only #{all_headers.size} headers: #{row.inspect}"
      end

      row.each_with_index do |col, idx|
        counter[idx] += 1   unless col.nil? || col.empty?
      end
    end

    pp counter

    ## keep a column if used at least once or if marked as fixed
    keep_indices = counter.each_index.select do |idx|
      counter[idx] > 0 || fixed_headers.include?( all_headers[idx] )
    end
    kept_headers = keep_indices.map { |idx| all_headers[idx] }

    ## strip/remove the unused columns (only if there are any)
    if keep_indices.size < all_headers.size
      rows = rows.map do |row|
        ## note: a row may be shorter than the headers - do NOT pad
        keep_indices.select { |idx| idx < row.size }
                    .map    { |idx| row[idx] }
      end
    end

    [rows, kept_headers]
  end
end   # module Fbref
@@ -1,4 +0,0 @@
1
-
2
- require_relative 'fbref/config'
3
- require_relative 'fbref/build'
4
- require_relative 'fbref/convert'
@@ -1,245 +0,0 @@
1
-
2
module Worldfootball


  ## german (de) cup round names to english (en)
  ROUND_TO_EN = {
    '1. Runde'      => 'Round 1',
    '2. Runde'      => 'Round 2',
    '3. Runde'      => 'Round 3',
    '4. Runde'      => 'Round 4',
    'Achtelfinale'  => 'Round of 16',
    'Viertelfinale' => 'Quarterfinals',
    'Halbfinale'    => 'Semifinals',
    'Finale'        => 'Final',
  }


  ## todo/check: english league cup/trophy has NO ET - also support - make more flexible!!!

  ## build "standard" match records from "raw" table rows
  ##
  ## @param rows    array of raw rows; the keys read are
  ##                :round, :date, :time, :team1, :team2 and :score
  ## @param season  season - cast with Season()
  ## @param league  league key as string (e.g. at.1) - NOT a league struct
  ## @param stage   optional stage name - copied into the first column
  ## @return [Array] match records (arrays) with the columns
  ##                 stage, round, date (as 2019-10-25), time, team 1, ft, ht,
  ##                 team 2, et, pen and comments
  ## @raise [ArgumentError] if league is not a string
  ##
  ## note: an unknown round or an unmapped round name prints an error
  ##         and terminates the program with exit 1
  def self.build( rows, season:, league:, stage: '' )   ## rename to fixup or such - why? why not?
    season = Season( season )  ## cast (ensure) season class (NOT string, integer, etc.)

    raise ArgumentError, "league key as string expected"  unless league.is_a?(String)  ## note: do NOT pass in league struct! pass in key (string)

    print " #{rows.size} rows - build #{league} #{season}"
    print " - #{stage}" unless stage.empty?
    print "\n"


    ## note: use only first part from key for lookup
    ##    e.g. at.1  => at
    ##         eng.1 => eng
    ##         and so on
    mods = MODS[ league.split('.')[0] ] || {}

    score_errors = SCORE_ERRORS[ league ] || {}


    recs = []
    rows.each_with_index do |row, idx|
      ## note: 1-based row number for the progress output
      ##   (was off by one before - the first row got printed as [002])
      row_no = idx + 1

      if row[:round] =~ /Spieltag/
        puts
        print '[%03d] ' % row_no
        print row[:round]

        if m = row[:round].match( /([0-9]+)\. Spieltag/ )
          ## todo/check: always use a string even if number (as a string eg. '1' etc.)
          round = m[1]  ## note: keep as string (NOT number)
          print " => #{round}"
        else
          puts "!! ERROR: cannot find matchday number"
          exit 1
        end
        print "\n"
      elsif row[:round] =~ /[1-9]\.[ ]Runde|
                            Achtelfinale|
                            Viertelfinale|
                            Halbfinale|
                            Finale
                           /x
        puts
        print '[%03d] ' % row_no
        print row[:round]


        ## do NOT translate rounds (to english) - keep in german / deutsch (de)
        if ['at.cup', 'at.1',  ## at.1 - incl. europa league playoff
            'de.cup'].include?( league )
          round = row[:round]
        else
          round = ROUND_TO_EN[ row[:round] ]
          if round.nil?
            puts "!! ERROR: no mapping for round to english (en) found >#{row[:round]}<:"
            pp row
            exit 1
          end
          print " => #{round}"
        end
        print "\n"
      else
        puts "!! ERROR: unknown round >#{row[:round]}< for league >#{league}<:"
        pp row
        exit 1
      end


      date_str  = row[:date]
      time_str  = row[:time]
      team1_str = row[:team1]
      team2_str = row[:team2]
      score_str = row[:score]

      ## convert date from string e.g. 2019-10-25
      date = Date.strptime( date_str, '%Y-%m-%d' )


      ### check for score_error; first (step 1) lookup by date
      ##   note: a score error record is read as [team1, team2, [from, to]]
      score_error = score_errors[ date.strftime('%Y-%m-%d') ]
      if score_error
        if team1_str == score_error[0] &&
           team2_str == score_error[1]
          ## check if team names match too; if yes, apply fix/patch!!
          if score_str != score_error[2][0]
            puts "!! WARN - score fix changed? - expected #{score_error[2][0]}, got #{score_str} - fixing to #{score_error[2][1]}"
            pp row
          end
          puts "FIX - applying score error fix - from #{score_error[2][0]} to => #{score_error[2][1]}"
          score_str = score_error[2][1]
        end
      end


      print '[%03d] ' % row_no
      print "%-10s | " % date_str
      print "%-5s | " % time_str
      print "%-22s | " % team1_str
      print "%-22s | " % team2_str
      print score_str
      print "\n"


      ## check for 0:3 Wert.  - change Wert. to awd.  (awarded)
      score_str = score_str.sub( /Wert\./i, 'awd.' )

      ## clean team name (e.g. remove (old))
      ##   and asciify (e.g. ’ to ' )
      team1_str = norm_team( team1_str )
      team2_str = norm_team( team2_str )

      ## apply the (optional) per-country team name mods
      team1_str = mods[ team1_str ]   if mods[ team1_str ]
      team2_str = mods[ team2_str ]   if mods[ team2_str ]


      ht, ft, et, pen, comments = parse_score( score_str )


      recs << [stage,
               round,
               date.strftime( '%Y-%m-%d' ),
               time_str,
               team1_str,
               ft,
               ht,
               team2_str,
               et,          # extra: incl. extra time
               pen,         # extra: incl. penalties
               comments]
    end  # each row
    recs
  end  # build



  ## split a (german-style) score string into its parts
  ##
  ## supported formats (see the branches below):
  ##   ---                            - no score yet
  ##   n.gesp. / ausg. / annull.      - cancelled
  ##   abgebr.                        - abandoned
  ##   verl.                          - postponed
  ##   5-4 (0-0, 1-1, 2-2) i.E.       - pen (ht, ft, et)
  ##   2-1 (1-0, 1-1) n.V.            - et (ht, ft)
  ##   2-1 (1-0)                      - ft (ht)
  ##   0-3 awd.                       - ft plus free-form comment
  ##   2-1                            - ft only
  ##
  ## @param score_str [String]
  ## @return [Array] five strings [ht, ft, et, pen, comments] -
  ##                 parts not present are empty strings
  ##
  ## note: an unsupported format prints an error
  ##         and terminates the program with exit 1
  def self.parse_score( score_str )
    comments = String.new( '' )     ## check - rename to/use status or such - why? why not?

    ## split score
    ft  = ''
    ht  = ''
    et  = ''
    pen = ''
    if score_str == '---'   ## in the future (no score yet) - was -:-
      ft = ''
      ht = ''
    elsif score_str == 'n.gesp.' ||   ## cancelled (british) / canceled (us)
          score_str == 'ausg.'   ||   ## todo/check: change to some other status ????
          score_str == 'annull.'      ## todo/check: change to some other status (see ie 2012) ????
      ft = '(*)'
      ht = ''
      comments = 'cancelled'
    elsif score_str == 'abgebr.'  ## abandoned  -- waiting for replay?
      ft = '(*)'
      ht = ''
      comments = 'abandoned'
    elsif score_str == 'verl.'   ## postponed
      ft = ''
      ht = ''
      comments = 'postponed'
    # 5-4 (0-0, 1-1, 2-2) i.E.
    elsif score_str =~ /([0-9]+) [ ]*-[ ]* ([0-9]+)
                         [ ]*
                        \(([0-9]+) [ ]*-[ ]* ([0-9]+)
                         [ ]*,[ ]*
                         ([0-9]+) [ ]*-[ ]* ([0-9]+)
                         [ ]*,[ ]*
                         ([0-9]+) [ ]*-[ ]* ([0-9]+)\)
                         [ ]*
                         i\.E\.
                        /x
      pen = "#{$1}-#{$2}"
      ht  = "#{$3}-#{$4}"
      ft  = "#{$5}-#{$6}"
      et  = "#{$7}-#{$8}"
    # 2-1 (1-0, 1-1) n.V
    elsif score_str =~ /([0-9]+) [ ]*-[ ]* ([0-9]+)
                         [ ]*
                        \(([0-9]+) [ ]*-[ ]* ([0-9]+)
                         [ ]*,[ ]*
                         ([0-9]+) [ ]*-[ ]* ([0-9]+)
                        \)
                         [ ]*
                         n\.V\.
                        /x
      et = "#{$1}-#{$2}"
      ht = "#{$3}-#{$4}"
      ft = "#{$5}-#{$6}"
    # 2-1 (1-0)
    elsif score_str =~ /([0-9]+)
                         [ ]*-[ ]*
                        ([0-9]+)
                         [ ]*
                        \(([0-9]+)
                         [ ]*-[ ]*
                        ([0-9]+)
                        \)
                       /x
      ft = "#{$1}-#{$2}"
      ht = "#{$3}-#{$4}"
    # 0-3 awd.   - score followed by a (lowercase) comment
    elsif score_str =~ /([0-9]+)
                         [ ]*-[ ]*
                        ([0-9]+)
                         [ ]*
                        ([a-z.]+)
                       /x
      ft = "#{$1}-#{$2} (*)"
      ht = ''
      comments = $3
    elsif score_str =~ /^([0-9]+)-([0-9]+)$/
      ft = "#{$1}-#{$2}"     ## e.g. see luxemburg and others
      ht = ''
    else
      puts "!! ERROR - unsupported score format >#{score_str}< - sorry; maybe add a score error fix/patch"
      exit 1
    end

    [ht, ft, et, pen, comments]
  end

end # module Worldfootball
@@ -1,16 +0,0 @@
1
module Worldfootball

  ### add some more config options / settings
  class Configuration

    #########
    ## nested configuration classes - use - why? why not?

    ## settings for the convert step
    class Convert
      ## output directory setter
      attr_writer :out_dir

      ## output directory - falls back to ./o if not set (or set to nil/false)
      def out_dir
        @out_dir || './o'
      end
    end

    ## the (lazily created and memoized) convert settings
    def convert
      @convert ||= Convert.new
    end
  end # class Configuration

end # module Worldfootball
@@ -1,100 +0,0 @@
1
-
2
module Worldfootball


  ## Convert the cached schedule page(s) of a league/season into a match csv
  ## file written to "<out_dir>/<season path>/<league key>.csv".
  ##
  ## @param league  league key - resolved with find_league
  ## @param season  season - cast with Season()
  ## @param offset  optional time offset in hours (e.g. -7) applied to the
  ##                date/time columns of all records - see fix_date
  def self.convert( league:, season:, offset: nil )  ## check: rename (optional) offset to time_offset or such?
    season = Season( season )  ## cast (ensure) season class (NOT string, integer, etc.)

    league = find_league( league )

    pages = league.pages( season: season )

    # note: assume stages if pages is an array (of hash table/records)
    #          (and NOT a single hash table/record)
    #   if single (simple) page setup - wrap in array
    staged = pages.is_a?(Array)

    recs = (staged ? pages : [pages]).flat_map do |page_meta|
      slug = page_meta[:slug]

      print " parsing #{slug}..."

      page = Page::Schedule.from_cache( slug )
      print " title=>#{page.title}<..."
      print "\n"

      rows = page.matches
      page_recs = if staged
                    ## todo/fix: report error/check if stage.name is nil!!!
                    build( rows, season: season, league: league.key, stage: page_meta[:stage] )
                  else
                    build( rows, season: season, league: league.key )
                  end

      pp page_recs[0]   ## check first record
      page_recs
    end

    recs = recs.map { |rec| fix_date( rec, offset ) }    if offset

    ##   note:  sort matches by date before saving/writing!!!!
    ##     note: for now assume date in string in 1999-11-30 format (allows sort by "simple" a-z)
    ##   note: assume date is third column!!! (stage/round/date/...)
    ##   note: Array#sort is NOT guaranteed to be stable;
    ##           use the index as tie-breaker to keep the source order
    ##           of matches on the same date
    recs = recs.each_with_index
               .sort_by { |rec, idx| [rec[2], idx] }
               .map { |rec, _idx| rec }
    ## reformat date / beautify e.g. Sat Aug 7 1993
    recs.each { |rec| rec[2] = Date.strptime( rec[2], '%Y-%m-%d' ).strftime( '%a %b %-d %Y' ) }

    ## remove unused columns (e.g. stage, et, p, etc.)
    recs, headers = vacuum( recs )

    puts headers
    pp recs[0]   ## check first record

    out_path = "#{config.convert.out_dir}/#{season.path}/#{league.key}.csv"

    puts "write #{out_path}..."
    Cache::CsvMatchWriter.write( out_path, recs, headers: headers )
  end



  ## helper to fix dates to use local timezone (and not utc/london time)
  ##
  ## shifts the date (third column, format 2002-08-17) and
  ##   the time (fourth column, format 18:30) by the offset in hours;
  ##   note: the row gets changed in-place
  ##
  ## @param row    match record (array)
  ## @param offset offset in hours (e.g. -7)
  ## @return the (same) row - unchanged if the time column is nil or empty
  ##
  ## note: an unknown date format prints an error
  ##         and terminates the program with exit 1
  def self.fix_date( row, offset )
    return row    if row[3].nil? || row[3].empty?   ## note: time (column) required for fix

    col = row[2]
    unless col =~ /^\d{4}-\d{2}-\d{2}$/
      puts "!!! ERROR - wrong (unknown) date format >>#{col}<<; cannot continue; fix it; sorry"
      ## todo/fix: add to errors/warns list - why? why not?
      exit 1
    end
    date_fmt = '%Y-%m-%d'   # e.g. 2002-08-17

    date = DateTime.strptime( "#{row[2]} #{row[3]}", "#{date_fmt} %H:%M" )
    ## NOTE - do NOT use integer division (-7/24 is -1, that is, a full day)!!!
    ##          use an (exact) rational fraction of a day
    date = date + Rational( offset, 24 )

    row[2] = date.strftime( date_fmt )  ## overwrite "old"
    row[3] = date.strftime( '%H:%M' )
    row   ## return row for possible pipelining - why? why not?
  end

end # module Worldfootball
@@ -1,107 +0,0 @@
1
module Worldfootball


  ## Collect the goals from the cached match reports of a league/season and
  ## write them to "<out_dir>/<season path>/<league key>~goals.csv".
  ##
  ## one record per goal with the columns
  ##   Match, Score, Minute, Extra, Player, Notes
  ##
  ## @param league  league key - resolved with find_league
  ## @param season  season - cast with Season()
  def self.convert_reports( league:, season: )
    season = Season( season )  ## cast (ensure) season class (NOT string, integer, etc.)
    league = find_league( league )

    ## note: use only first part from key for lookup
    ##    e.g. at.1  => at
    ##         eng.1 => eng
    ##         and so on
    country_key = league.key.split('.')[0]
    mods = MODS[ country_key ] || {}

    pages = league.pages( season: season )
    ## if single (simple) page setup - wrap in array
    pages = [pages]   unless pages.is_a?(Array)

    goal_recs = []

    pages.each do |page_meta|   # note: use page_info for now (or page_rec or page_meta or such)
      page = Page::Schedule.from_cache( page_meta[:slug] )
      print " page title=>#{page.title}<..."
      print "\n"

      matches = page.matches

      puts "matches - #{matches.size} rows:"
      pp matches[0]

      puts "#{page.generated_in_days_ago} - #{page.generated}"

      matches.each_with_index do |match, match_idx|
        report_ref = match[:report_ref]
        if report_ref.nil?
          puts "!! WARN: no match report ref found for match:"
          pp match
          next
        end

        puts "reading #{match_idx+1}/#{matches.size} - #{report_ref}..."
        report = Page::Report.from_cache( report_ref )

        puts
        puts report.title
        puts report.generated

        goals = report.goals
        puts "goals - #{goals.size} records"
        ## pp goals

        ## nothing to add if no goals
        next   unless goals.size > 0

        date = Date.strptime( match[:date], '%Y-%m-%d')

        ## clean team name (e.g. remove (old))
        ##   and asciify (e.g. ’ to ' )
        home = norm_team( match[:team1] )
        away = norm_team( match[:team2] )

        ## apply the (optional) per-country team name mods
        home = mods[ home ] || home
        away = mods[ away ] || away

        match_id = "#{home} - #{away} | #{date.strftime('%b %-d %Y')}"

        goals.each do |goal|
          ## note: own goal wins over penalty (if both flags are set)
          extra = ''
          if goal[:owngoal]
            extra = '(og)'    ## or use OG or O.G.- why? why not?
          elsif goal[:penalty]
            extra = '(pen)'   ## or use P or PEN - why? why not?
          end

          goal_recs << [match_id,
                        goal[:score],
                        "#{goal[:minute]}'",
                        extra,
                        goal[:player],
                        goal[:notes]]
        end
      end # each match
    end # each page

    ## pp goal_recs

    headers  = ['Match', 'Score', 'Minute', 'Extra', 'Player', 'Notes']
    out_path = "#{config.convert.out_dir}/#{season.path}/#{league.key}~goals.csv"

    puts "write #{out_path}..."
    Cache::CsvMatchWriter.write( out_path, goal_recs, headers: headers )
  end
end # module Worldfootball
107
-
@@ -1,76 +0,0 @@
1
-
2
-
3
module Worldfootball
  ################################
  # add more helpers
  #  move upstream for (re)use - why? why not?

  ## todo/check: what to do: if league is both included and excluded?
  ##   include forces include? or exclude has the last word? - why? why not?
  ##  Excludes match before includes,
  ##   meaning that something that has been excluded cannot be included again

  ## todo - find "proper/classic" timezone ("winter time")

  ##  Brasilia - Distrito Federal, Brasil  (GMT-3)  -- summer time?
  ##  Ciudad de México, CDMX, México       (GMT-5)  -- summer time?
  ##  Londres, Reino Unido (GMT+1)
  ##   Madrid  -- ?
  ##   Lisboa  -- ?
  ##   Moscow  -- ?
  ##
  ## todo/check - quick fix timezone offsets for leagues for now
  ##   - find something better - why? why not?
  ## note: assume time is in GMT+1
  ##
  ## league key => offset in hours (passed on to Worldfootball.convert)
  OFFSETS = {
    'eng.1' => -1,
    'eng.2' => -1,
    'eng.3' => -1,
    'eng.4' => -1,
    'eng.5' => -1,

    'es.1'  => -1,
    'es.2'  => -1,

    'pt.1'  => -1,
    'pt.2'  => -1,

    'br.1'  => -5,
    'mx.1'  => -7,
  }



  ## batch helpers to download / convert many datasets
  ##
  ## a dataset is read as a two-element record: the league key first and
  ##   the list of seasons second, e.g. ['eng.1', ['2019/20', '2020/21']]
  class Job  ## todo/check: use a module (NOT a class) - why? why not?

    ## call Worldfootball.schedule for every league/season
    ##
    ## @param datasets array of [league, seasons] records
    def self.download( datasets )
      datasets.each_with_index do |dataset,i|
        league  = dataset[0]
        seasons = dataset[1]

        puts "downloading [#{i+1}/#{datasets.size}] #{league}..."
        seasons.each_with_index do |season,j|
          ## note: report progress against the number of seasons
          ##         (NOT the size of the season itself)
          puts " season [#{j+1}/#{seasons.size}] #{league} #{season}..."
          Worldfootball.schedule( league: league,
                                  season: season )
        end
      end
    end

    ## call Worldfootball.convert for every league/season;
    ##   the time offset gets looked up by league key in OFFSETS (nil if none)
    ##
    ## @param datasets array of [league, seasons] records
    def self.convert( datasets )
      datasets.each_with_index do |dataset,i|
        league  = dataset[0]
        seasons = dataset[1]

        puts "converting [#{i+1}/#{datasets.size}] #{league}..."
        seasons.each_with_index do |season,j|
          ## note: report progress against the number of seasons
          ##         (NOT the size of the season itself)
          puts " season [#{j+1}/#{seasons.size}] #{league} #{season}..."
          Worldfootball.convert( league: league,
                                 season: season,
                                 offset: OFFSETS[ league ] )
        end
      end
    end
  end # class Job

end # module Worldfootball
76
-