football-sources 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,95 +0,0 @@
1
- module Fbref
2
-
3
## Convert the cached schedule page for a league & season into a match csv file.
##
## league - league key, passed through to the page cache, build and the file name
## season - season, passed through as-is; parsed with Season.parse for the file name
##
## Writes to <out_dir>/<league>_<season path>.csv via Cache::CsvMatchWriter
## and returns whatever the writer returns.
def self.convert( league:, season: )
  schedule = Page::Schedule.from_cache( league: league,
                                        season: season )
  puts schedule.title

  matches = build( schedule.matches, league: league, season: season )

  ## reformat date / beautify e.g. Sat Aug 7 1993
  ##   note: date is expected in the third column
  matches.each do |match|
    day = Date.strptime( match[2], '%Y-%m-%d' )
    match[2] = day.strftime( '%a %b %-d %Y' )
  end

  ## strip the unused (all empty) columns
  matches, headers = vacuum( matches )
  pp matches[0..2]

  parsed_season = Season.parse( season )
  out_path = "#{config.convert.out_dir}/#{league}_#{parsed_season.to_path}.csv"
  puts "write #{out_path}..."
  Cache::CsvMatchWriter.write( out_path, matches, headers: headers )
end
24
-
25
-
26
-
27
-
28
#####
# vacuum helper stuff - todo/fix - (re)use - make more generic - why? why not?

## all known columns (in output order)
MAX_HEADERS = [
  'Stage',
  'Round',
  'Date',
  'Time',
  'Team 1',
  'FT',
  'HT',
  'Team 2',
  'ET',
  'P',
  'Venue',
  'Att',
  'Comments',    ## e.g. awarded, cancelled/canceled, etc.
].freeze

MIN_HEADERS = [   ## always keep even if all empty
  'Date',
  'Team 1',
  'FT',
  'Team 2'
].freeze

## Strip the columns that are empty (nil or blank) in every row.
##
## rows          - array of rows (arrays); cell position maps to headers by index
## headers       - column names for the cell positions (default: MAX_HEADERS)
## fixed_headers - column names to always keep, even if all empty
##                 (default: MIN_HEADERS)
##
## Returns [rows, headers] - the (possibly) slimmed down rows
##   and the names of the columns kept.
##   note: rows get returned as passed in if there is nothing to strip.
##
## Raises ArgumentError if a row holds more cells than there are headers.
def self.vacuum( rows, headers: MAX_HEADERS, fixed_headers: MIN_HEADERS )
  ## count the used (non-empty) cells per column
  counter = Array.new( headers.size, 0 )
  rows.each do |row|
    if row.size > headers.size
      raise ArgumentError, "row with #{row.size} columns; expected #{headers.size} max"
    end

    row.each_with_index do |col, idx|
      ## note: use to_s - cells might be numbers (and NOT strings)
      counter[idx] += 1 unless col.nil? || col.to_s.empty?
    end
  end

  pp counter

  ## keep columns in use or marked as fixed (always keep)
  kept_indices = counter.each_index.select do |idx|
    counter[idx] > 0 || fixed_headers.include?( headers[idx] )
  end

  if kept_indices.size < counter.size
    rows = rows.map do |row|
      row.reject.with_index { |_col, idx| !kept_indices.include?( idx ) }
    end
  end

  [rows, kept_indices.map { |idx| headers[idx] }]
end
95
- end # module Fbref
@@ -1,4 +0,0 @@
1
-
2
- require_relative 'fbref/config'
3
- require_relative 'fbref/build'
4
- require_relative 'fbref/convert'
@@ -1,245 +0,0 @@
1
-
2
- module Worldfootball
3
-
4
-
5
## german (de) knockout round names mapped to english (en)
ROUND_TO_EN = {
  '1. Runde'      => 'Round 1',
  '2. Runde'      => 'Round 2',
  '3. Runde'      => 'Round 3',
  '4. Runde'      => 'Round 4',
  'Achtelfinale'  => 'Round of 16',
  'Viertelfinale' => 'Quarterfinals',
  'Halbfinale'    => 'Semifinals',
  'Finale'        => 'Final',
}


## todo/check: english league cup/trophy has NO ET - also support - make more flexible!!!

## Build "standard" match records from "raw" table rows.
##
## rows   - array of hashes read via :round, :date, :time, :team1, :team2, :score
## season - gets cast via Season()
## league - league key as a string (e.g. at.1); do NOT pass in a league struct
## stage  - optional stage name; ends up in the first column of every record
##
## Returns an array of records with the columns
##   [stage, round, date, time, team1, ft, ht, team2, et, pen, comments]
##   with the date formatted as %Y-%m-%d.
##
## Raises ArgumentError if league is not a string.
## note: prints an error and exits (1) on unknown rounds
##       or on rounds missing an english mapping.
##
## todo: rename to fixup or such - why? why not?
def self.build( rows, season:, league:, stage: '' )
  season = Season( season )   ## cast (ensure) season class (NOT string, integer, etc.)

  raise ArgumentError, "league key as string expected"  unless league.is_a?(String)

  print " #{rows.size} rows - build #{league} #{season}"
  print " - #{stage}"   unless stage.empty?
  print "\n"

  ## note: use only first part from key for lookup
  ##    e.g. at.1  => at
  ##         eng.1 => eng
  mods = MODS[ league.split('.')[0] ] || {}

  score_errors = SCORE_ERRORS[ league ] || {}

  recs = []
  ## note: no is the 1-based row number (used for the progress output only)
  rows.each.with_index( 1 ) do |row, no|

    if row[:round] =~ /Spieltag/
      puts
      print '[%03d] ' % no
      print row[:round]

      if m = row[:round].match( /([0-9]+)\. Spieltag/ )
        ## todo/check: always use a string even if number (as a string eg. '1' etc.)
        round = m[1]    ## note: keep as string (NOT number)
        print " => #{round}"
      else
        puts "!! ERROR: cannot find matchday number"
        exit 1
      end
      print "\n"
    elsif row[:round] =~ /[1-9]\.[ ]Runde|
                          Achtelfinale|
                          Viertelfinale|
                          Halbfinale|
                          Finale
                         /x
      puts
      print '[%03d] ' % no
      print row[:round]

      ## do NOT translate rounds (to english) - keep in german / deutsch (de)
      if ['at.cup', 'at.1',   ## at.1 - incl. europa league playoff
          'de.cup'].include?( league )
        round = row[:round]
      else
        round = ROUND_TO_EN[ row[:round] ]
        if round.nil?
          puts "!! ERROR: no mapping for round to english (en) found >#{row[:round]}<:"
          pp row
          exit 1
        end
        print " => #{round}"
      end
      print "\n"
    else
      puts "!! ERROR: unknown round >#{row[:round]}< for league >#{league}<:"
      pp row
      exit 1
    end

    date_str  = row[:date]
    time_str  = row[:time]
    team1_str = row[:team1]
    team2_str = row[:team2]
    score_str = row[:score]

    ## convert date from string e.g. 2019-10-25
    date = Date.strptime( date_str, '%Y-%m-%d' )

    ### check for score_error; first (step 1) lookup by date
    ##    a score error entry reads [team1, team2, [wrong score, fixed score]]
    score_error = score_errors[ date.strftime('%Y-%m-%d') ]
    if score_error
      if team1_str == score_error[0] &&
         team2_str == score_error[1]
        ## check if team names match too; if yes, apply fix/patch!!
        if score_str != score_error[2][0]
          puts "!! WARN - score fix changed? - expected #{score_error[2][0]}, got #{score_str} - fixing to #{score_error[2][1]}"
          pp row
        end
        puts "FIX - applying score error fix - from #{score_error[2][0]} to => #{score_error[2][1]}"
        score_str = score_error[2][1]
      end
    end

    print '[%03d] ' % no
    print "%-10s | " % date_str
    print "%-5s | " % time_str
    print "%-22s | " % team1_str
    print "%-22s | " % team2_str
    print score_str
    print "\n"

    ## check for 0:3 Wert. - change Wert. to awd. (awarded)
    score_str = score_str.sub( /Wert\./i, 'awd.' )

    ## clean team name (e.g. remove (old))
    ##   and asciify (e.g. ’ to ' )
    team1_str = norm_team( team1_str )
    team2_str = norm_team( team2_str )

    ## apply the (optional) per-country team name mods
    team1_str = mods[ team1_str ]   if mods[ team1_str ]
    team2_str = mods[ team2_str ]   if mods[ team2_str ]

    ht, ft, et, pen, comments = parse_score( score_str )

    recs << [stage,
             round,
             date.strftime( '%Y-%m-%d' ),
             time_str,
             team1_str,
             ft,
             ht,
             team2_str,
             et,        # extra: incl. extra time
             pen,       # extra: incl. penalties
             comments]
  end  # each row
  recs
end  # build
158
-
159
-
160
-
161
## Split a (german-style) score string into its parts.
##
## Returns [ht, ft, et, pen, comments] - all strings; parts not present are empty.
##   e.g. 5-4 (0-0, 1-1, 2-2) i.E.  => penalties, half-time, full-time, extra time
##        2-1 (1-0, 1-1) n.V.       => extra time, half-time, full-time
##        2-1 (1-0)                 => full-time, half-time
##        0-3 awd.                  => full-time marked with (*) plus comment
##
## note: prints an error and exits (1) on unsupported score formats.
def self.parse_score( score_str )
  ht  = ''
  ft  = ''
  et  = ''
  pen = ''
  comments = String.new( '' )   ## check - rename to/use status or such - why? why not?

  case score_str
  when '---'      ## in the future (no score yet) - was -:-
    ## nothing to do - keep all empty
  when 'n.gesp.',   ## cancelled (british) / canceled (us)
       'ausg.',     ## todo/check: change to some other status ????
       'annull.'    ## todo/check: change to some other status (see ie 2012) ????
    ft       = '(*)'
    comments = 'cancelled'
  when 'abgebr.'    ## abandoned  -- waiting for replay?
    ft       = '(*)'
    comments = 'abandoned'
  when 'verl.'      ## postponed
    comments = 'postponed'
  when /([0-9]+) [ ]*-[ ]* ([0-9]+)
          [ ]*
        \(([0-9]+) [ ]*-[ ]* ([0-9]+)
          [ ]*,[ ]*
         ([0-9]+) [ ]*-[ ]* ([0-9]+)
          [ ]*,[ ]*
         ([0-9]+) [ ]*-[ ]* ([0-9]+)\)
          [ ]*
        i\.E\.
       /x
    ## 5-4 (0-0, 1-1, 2-2) i.E.
    m   = Regexp.last_match
    pen = "#{m[1]}-#{m[2]}"
    ht  = "#{m[3]}-#{m[4]}"
    ft  = "#{m[5]}-#{m[6]}"
    et  = "#{m[7]}-#{m[8]}"
  when /([0-9]+) [ ]*-[ ]* ([0-9]+)
          [ ]*
        \(([0-9]+) [ ]*-[ ]* ([0-9]+)
          [ ]*,[ ]*
         ([0-9]+) [ ]*-[ ]* ([0-9]+)
        \)
          [ ]*
        n\.V\.
       /x
    ## 2-1 (1-0, 1-1) n.V.
    m  = Regexp.last_match
    et = "#{m[1]}-#{m[2]}"
    ht = "#{m[3]}-#{m[4]}"
    ft = "#{m[5]}-#{m[6]}"
  when /([0-9]+)
          [ ]*-[ ]*
        ([0-9]+)
          [ ]*
        \(([0-9]+)
          [ ]*-[ ]*
        ([0-9]+)
        \)
       /x
    ## 2-1 (1-0)
    m  = Regexp.last_match
    ft = "#{m[1]}-#{m[2]}"
    ht = "#{m[3]}-#{m[4]}"
  when /([0-9]+)
          [ ]*-[ ]*
        ([0-9]+)
          [ ]*
        ([a-z.]+)
       /x
    ## score with trailing status e.g. 0-3 awd.
    m        = Regexp.last_match
    ft       = "#{m[1]}-#{m[2]} (*)"
    comments = m[3]
  when /^([0-9]+)-([0-9]+)$/
    m  = Regexp.last_match
    ft = "#{m[1]}-#{m[2]}"    ## e.g. see luxemburg and others
  else
    puts "!! ERROR - unsupported score format >#{score_str}< - sorry; maybe add a score error fix/patch"
    exit 1
  end

  [ht, ft, et, pen, comments]
end
244
-
245
- end # module Worldfootball
@@ -1,16 +0,0 @@
1
- module Worldfootball
2
-
3
- ### add some more config options / settings
4
class Configuration
  #########
  ## nested configuration classes - use - why? why not?

  ## settings for the convert step
  class Convert
    attr_writer :out_dir

    ## output directory; falls back to ./o if not set
    def out_dir
      @out_dir || './o'
    end
  end

  ## convert settings (created on first use)
  def convert
    @convert ||= Convert.new
  end
end # class Configuration
14
-
15
-
16
- end # module Worldfootball
@@ -1,100 +0,0 @@
1
-
2
- module Worldfootball
3
-
4
-
5
-
6
## Convert the cached schedule page(s) of a league & season into one match csv file.
##
## league - league key; gets looked up via find_league
## season - gets cast via Season()
## offset - optional time offset in hours; if present all records
##          get passed through fix_date
##          (check: rename to time_offset or such?)
##
## Writes to <out_dir>/<season path>/<league key>.csv via Cache::CsvMatchWriter
## and returns whatever the writer returns.
##
## Raises ArgumentError if a page of a multi-stage season has no stage name.
def self.convert( league:, season:, offset: nil )
  season = Season( season )   ## cast (ensure) season class (NOT string, integer, etc.)

  league = find_league( league )

  pages = league.pages( season: season )

  # note: assume stages if pages is an array (of hash table/records)
  #         (and NOT a single hash table/record)
  staged     = pages.is_a?(Array)
  page_metas = staged ? pages : [pages]

  recs = []
  page_metas.each do |page_meta|
    slug = page_meta[:slug]

    if staged
      stage_name = page_meta[:stage]
      ## note: fail early (and clearly); build cannot handle a missing stage name
      raise ArgumentError, "stage name missing for page >#{slug}< of league >#{league.key}<"  if stage_name.nil?
    end

    print " parsing #{slug}..."

    page = Page::Schedule.from_cache( slug )
    print " title=>#{page.title}<..."
    print "\n"

    rows = page.matches
    page_recs = if staged
                  build( rows, season: season, league: league.key, stage: stage_name )
                else
                  build( rows, season: season, league: league.key )
                end

    pp page_recs[0]   ## check first record
    recs += page_recs
  end

  recs = recs.map { |rec| fix_date( rec, offset ) }    if offset

  ## note: sort matches by date before saving/writing!!!!
  ##  note: for now assume date in string in 1999-11-30 format (allows sort by "simple" a-z)
  ##  note: assume date is third column!!! (stage/round/date/...)
  ##  note: Array#sort is NOT stable - add the position as a tie-breaker
  ##          to keep same-day matches in page order
  recs = recs.each_with_index
             .sort_by { |rec, idx| [rec[2], idx] }
             .map( &:first )

  ## reformat date / beautify e.g. Sat Aug 7 1993
  recs.each { |rec| rec[2] = Date.strptime( rec[2], '%Y-%m-%d' ).strftime( '%a %b %-d %Y' ) }

  ## remove unused columns (e.g. stage, et, p, etc.)
  recs, headers = vacuum( recs )

  puts headers
  pp recs[0]   ## check first record

  out_path = "#{config.convert.out_dir}/#{season.path}/#{league.key}.csv"

  puts "write #{out_path}..."
  Cache::CsvMatchWriter.write( out_path, recs, headers: headers )
end
75
-
76
-
77
-
78
- ## helper to fix dates to use local timezone (and not utc/london time)
79
## Helper to fix dates to use local timezone (and not utc/london time).
##
## row    - match record; date expected in the third column (as 2002-08-17)
##          and time in the fourth column (as 20:45)
## offset - hours to add (use negative numbers to subtract)
##
## Returns the row with date and time overwritten in place;
##   rows without a time get returned untouched.
##
## note: prints an error and exits (1) if the date is not in the expected format.
def self.fix_date( row, offset )
  return row    if row[3].nil? || row[3].empty?   ## note: time (column) required for fix

  col = row[2]
  unless col =~ /^\d{4}-\d{2}-\d{2}$/
    puts "!!! ERROR - wrong (unknown) date format >>#{col}<<; cannot continue; fix it; sorry"
    ## todo/fix: add to errors/warns list - why? why not?
    exit 1
  end
  date_fmt = '%Y-%m-%d'   # e.g. 2002-08-17

  date = DateTime.strptime( "#{col} #{row[3]}", "#{date_fmt} %H:%M" )
  ## note: date arithmetic is in days - add the hours as an exact fraction of a day
  ##   (integer division e.g. -7/24 will NOT work; floats are inexact)
  date = date + Rational( offset, 24 )

  row[2] = date.strftime( date_fmt )   ## overwrite "old"
  row[3] = date.strftime( '%H:%M' )
  row   ## return row for possible pipelining - why? why not?
end
99
-
100
- end # module Worldfootball
@@ -1,107 +0,0 @@
1
- module Worldfootball
2
-
3
-
4
## Collect the goals from all (cached) match reports of a league & season
## and write them to one csv file.
##
## league - league key; gets looked up via find_league
## season - gets cast via Season()
##
## Writes to <out_dir>/<season path>/<league key>~goals.csv via Cache::CsvMatchWriter
## with the columns Match, Score, Minute, Extra, Player, Notes
## and returns whatever the writer returns.
## note: matches without a report ref get skipped (with a warning).
def self.convert_reports( league:, season: )
  season = Season( season )   ## cast (ensure) season class (NOT string, integer, etc.)
  league = find_league( league )

  ## note: use only first part from key for lookup
  ##    e.g. at.1  => at
  ##         eng.1 => eng
  country_key = league.key.split('.')[0]
  mods        = MODS[ country_key ] || {}

  pages = league.pages( season: season )
  ## if single (simple) page setup - wrap in array
  pages = [pages]   unless pages.is_a?(Array)

  goal_recs = []

  pages.each do |page_meta|
    schedule = Page::Schedule.from_cache( page_meta[:slug] )
    print " page title=>#{schedule.title}<..."
    print "\n"

    matches = schedule.matches
    total   = matches.size

    puts "matches - #{total} rows:"
    pp matches[0]

    puts "#{schedule.generated_in_days_ago} - #{schedule.generated}"

    matches.each_with_index do |match, idx|
      ref = match[:report_ref]
      if ref.nil?
        puts "!! WARN: no match report ref found for match:"
        pp match
        next
      end

      puts "reading #{idx+1}/#{matches.size} - #{ref}..."
      report = Page::Report.from_cache( ref )

      puts
      puts report.title
      puts report.generated

      goals = report.goals
      puts "goals - #{goals.size} records"

      next unless goals.size > 0

      ## add goals
      date = Date.strptime( match[:date], '%Y-%m-%d')

      ## clean team names (e.g. remove (old))
      ##   and asciify (e.g. ’ to ' )
      ##   and than apply the (optional) per-country team name mods
      team1, team2 = [match[:team1], match[:team2]]
                       .map { |name| norm_team( name ) }
                       .map { |name| mods[ name ] || name }

      match_id = "#{team1} - #{team2} | #{date.strftime('%b %-d %Y')}"

      goal_recs.concat( goals.map do |goal|
        extra = ''
        extra = '(pen)'   if goal[:penalty]   ## or use P or PEN - why? why not?
        extra = '(og)'    if goal[:owngoal]   ## or use OG or O.G.- why? why not?
                                              ##   note: own goal wins over penalty

        [match_id,
         goal[:score],
         "#{goal[:minute]}'",
         extra,
         goal[:player],
         goal[:notes]]
      end )
    end # each match
  end # each page

  headers  = ['Match', 'Score', 'Minute', 'Extra', 'Player', 'Notes']
  out_path = "#{config.convert.out_dir}/#{season.path}/#{league.key}~goals.csv"

  puts "write #{out_path}..."
  Cache::CsvMatchWriter.write( out_path, goal_recs, headers: headers )
end
106
- end # module Worldfootballl
107
-
@@ -1,76 +0,0 @@
1
-
2
-
3
- module Worldfootball
4
- ################################
5
- # add more helpers
6
- # move upstream for (re)use - why? why not?
7
-
8
- ## todo/check: what to do: if league is both included and excluded?
9
- ## include forces include? or exclude has the last word? - why? why not?
10
- ## Excludes match before includes,
11
- ## meaning that something that has been excluded cannot be included again
12
-
13
- ## todo - find "proper/classic" timezone ("winter time")
14
-
15
- ## Brasilia - Distrito Federal, Brasil (GMT-3) -- summer time?
16
- ## Ciudad de México, CDMX, México (GMT-5) -- summer time?
17
- ## Londres, Reino Unido (GMT+1)
18
- ## Madrid -- ?
19
- ## Lisboa -- ?
20
- ## Moskow -- ?
21
- ##
22
- ## todo/check - quick fix timezone offsets for leagues for now
23
- ## - find something better - why? why not?
24
- ## note: assume time is in GMT+1
25
## time offsets in hours by league key (passed on to Worldfootball.convert by Job.convert)
OFFSETS = {
  'eng.1' => -1,
  'eng.2' => -1,
  'eng.3' => -1,
  'eng.4' => -1,
  'eng.5' => -1,

  'es.1'  => -1,
  'es.2'  => -1,

  'pt.1'  => -1,
  'pt.2'  => -1,

  'br.1'  => -5,
  'mx.1'  => -7,
}


## Batch helpers - run download / convert for many datasets.
##   a dataset is a pair of [league key, seasons]
##    e.g. ['eng.1', ['2019/20', '2020/21']]
class Job     ## todo/check: use a module (NOT a class) - why? why not?

  ## Download the schedule for every league & season in datasets.
  ## Returns datasets.
  def self.download( datasets )
    each_season( 'downloading', datasets ) do |league, season|
      Worldfootball.schedule( league: league,
                              season: season )
    end
  end

  ## Convert every league & season in datasets;
  ##   passes on the league's time offset from OFFSETS (if any).
  ## Returns datasets.
  def self.convert( datasets )
    each_season( 'converting', datasets ) do |league, season|
      Worldfootball.convert( league: league,
                             season: season,
                             offset: OFFSETS[ league ] )
    end
  end

  ## Print the progress and yield league & season for every season of every dataset.
  def self.each_season( action, datasets )
    datasets.each_with_index do |(league, seasons), i|
      puts "#{action} [#{i+1}/#{datasets.size}] #{league}..."
      seasons.each_with_index do |season, j|
        ## note: count the seasons (NOT the characters of the season string)
        puts " season [#{j+1}/#{seasons.size}] #{league} #{season}..."
        yield league, season
      end
    end
  end
  private_class_method :each_season
end  # class Job
74
-
75
- end # module Worldfootball
76
-