worldfootball 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,46 @@
1
+
2
+ module Worldfootball
3
+
4
+
5
+ def self.list_pages ## todo/check - rename to/use list_cached_pages
6
+ start_time = Time.now ## todo: use Timer? t = Timer.start / stop / diff etc. - why? why not?
7
+
8
+ # pages = Dir.glob( './dl/at*' )
9
+ pages = Dir.glob( "#{Webcache.root}/www.weltfussball.de/alle_spiele/*.html" )
10
+ puts " #{pages.size} page(s)" #=> 576 pages
11
+ puts
12
+
13
+
14
+ leagues = {}
15
+
16
+ pages.each do |path|
17
+ basename = File.basename( path, File.extname( path ) )
18
+ print "%-50s" % basename
19
+ print " => "
20
+
21
+ page = Worldfootball.find_page( basename )
22
+ if page
23
+ league_key = page[:league]
24
+ season_key = page[:season]
25
+
26
+ print " "
27
+ print "%-12s" % league_key
28
+ print "| %-10s" % season_key
29
+ print "\n"
30
+
31
+ seasons = leagues[league_key] ||= []
32
+ seasons << season_key unless seasons.include?( season_key )
33
+ else
34
+ print "??"
35
+ print "\n"
36
+ end
37
+ end
38
+
39
+ puts " #{pages.size} page(s)" #=> 576 pages
40
+ puts
41
+
42
+ end_time = Time.now
43
+ diff_time = end_time - start_time
44
+ puts "convert_all: done in #{diff_time} sec(s)"
45
+ end
46
+ end # module Worldfootball
@@ -0,0 +1,220 @@
1
+ module Worldfootball
2
+
3
+ #################
4
+ # todo/fix - use timezone instead of offset !!!
5
+ # e.g.
6
+ =begin
7
+ TIMEZONES = {
8
+ 'eng.1' => 'Europe/London',
9
+ 'eng.2' => 'Europe/London',
10
+
11
+ 'es.1' => 'Europe/Madrid',
12
+
13
+ 'de.1' => 'Europe/Berlin',
14
+ 'fr.1' => 'Europe/Paris',
15
+ 'it.1' => 'Europe/Rome',
16
+ 'nl.1' => 'Europe/Amsterdam',
17
+
18
+ 'pt.1' => 'Europe/Lisbon',
19
+
20
+ ## todo/fix - pt.1
21
+ ## one team in madeira!!! check for different timezone??
22
+ ## CD Nacional da Madeira
23
+
24
+ 'br.1' => 'America/Sao_Paulo',
25
+ ## todo/fix - brazil has 4 timezones
26
+ ## really only two in use for clubs
27
+ ## west and east (amazonas et al)
28
+ ## for now use west for all - why? why not?
29
+ }
30
+ =end
31
+
32
+ ## todo - find "proper/classic" timezone ("winter time")
33
+
34
+ ## Brasilia - Distrito Federal, Brasil (GMT-3) -- summer time?
35
+ ## Ciudad de México, CDMX, México (GMT-5) -- summer time?
36
+ ## Londres, Reino Unido (GMT+1)
37
+ ## Madrid -- ?
38
+ ## Lisboa -- ?
39
+ ## Moscow -- ?
40
+ ##
41
+ ## todo/check - quick fix timezone offsets for leagues for now
42
+ ## - find something better - why? why not?
43
+ ## note: assume time is in GMT/UTC+1 - PLUS SUMMERTIME!!!
44
+ ## todo/fix - consider summertime before conversion too!!!
45
+
46
+
47
+ OFFSETS = {
48
+
49
+ ## fix - change to gmt/utc offset
50
+ ## if offset == 1 (GMT/UTC+1)
51
+ ## do NOTHING (default date/timezone of pages)
52
+
53
+ 'eng' => 0,
54
+ # 'eng.1' => 0,
55
+ # 'eng.2' => 0,
56
+ # 'eng.3' => 0,
57
+ # 'eng.4' => 0,
58
+ # 'eng.5' => 0,
59
+
60
+ 'ie' => 0,
61
+ 'sco' => 0,
62
+
63
+ 'pt' => 0,
64
+ # 'pt.1' => 0,
65
+ # 'pt.2' => 0,
66
+
67
+ 'fi' => 2, # +2
68
+ 'gr' => 2, # +2
69
+ 'ro' => 2, # +2
70
+ 'ua' => 2, # +2
71
+
72
+ 'ru' => 3, # +3
73
+ 'tr' => 3, # +3 turkey time/moscow time
74
+
75
+
76
+ 'us' => -5, # (gmt-5) new york
77
+
78
+ 'mx' => -6,
79
+ # 'mx.1' => -6,
80
+ # 'mx.2' => -6,
81
+ # 'mx.3' => -6,
82
+ # 'mx.cup' => -6,
83
+
84
+ 'cr' => -6, # gmt-6
85
+ 'gt' => -6, # gmt-6
86
+ 'hn' => -6, # gmt-6
87
+ 'sv' => -6, # gmt-6
88
+ 'ni' => -6, # gmt-6
89
+
90
+ 'uy' => -3, # gmt-3
91
+ 'pe' => -5, # gmt-5
92
+ 'ec' => -5, # gmt-5
93
+ 'co' => -5, # gmt-5
94
+ 'bo' => -4, # gmt-4
95
+ 'cl' => -4, # gmt-4
96
+
97
+ 'br' => -4, # gmt-3 - change to -3?
98
+ # 'br.1' => -4,
99
+ 'ar' => -4, # gmt-3 - change to -3?
100
+ # 'ar.1' => -4,
101
+
102
+
103
+ 'eg' => 3, # +3 (gmt+3)
104
+ 'jp' => 9, # +9 (gmt+9)
105
+ 'cn' => 7, # +7 (gmt+7)
106
+
107
+
108
+ ## note - central european time (cet) - no need for date auto-fix
109
+ 'at' => 1,
110
+ 'de' => 1,
111
+ 'ch' => 1,
112
+ 'hu' => 1,
113
+ 'cz' => 1,
114
+ 'pl' => 1,
115
+ 'nl' => 1,
116
+ 'lu' => 1,
117
+ 'be' => 1,
118
+ 'dk' => 1,
119
+ 'se' => 1,
120
+ 'it' => 1,
121
+ 'fr' => 1,
122
+ 'es' => 1,
123
+ ## see https://en.wikipedia.org/wiki/Time_in_Europe
124
+
125
+
126
+ ################
127
+ ## int'l tournaments
128
+ # 'uefa.cl'
129
+ # 'uefa.el'
130
+ 'uefa.cl' => 1,
131
+ 'uefa.el' => 1,
132
+
133
+ 'concacaf.cl' => -6, ### use mx time
134
+ 'copa.l' => -4, ### use brazil time
135
+ }
136
+
137
+
138
+ ####
139
+ # config for slug to local basename / directories
140
+ # e.g.
141
+ # aut-bundesliga-2023-2024 => austria/2023-24/1_bundesliga.txt
142
+
143
+
144
+ ## add (timezone) offset here too - why? why not?
145
+ LEAGUE_SETUPS = {
146
+ ## note - for now auto-generate path via name (downcased)
147
+ ## e.g. Belgium => /belgium
148
+
149
+ ## top five (europe)
150
+ 'eng' => { code: 'eng', name: 'England' },
151
+ 'es' => { code: 'esp', name: 'Spain' },
152
+ # 'fr' => { code: 'fra', name: 'France' },
153
+ # 'de' => { code: '???', name: 'Germany' },
154
+ 'it' => { code: 'ita', name: 'Italy' },
155
+
156
+
157
+ 'be' => { code: 'bel', name: 'Belgium' },
158
+ 'at' => { code: 'aut', name: 'Austria' },
159
+ 'hu' => { code: 'hun', name: 'Hungary' },
160
+
161
+ 'tr' => { code: 'tur', name: 'Turkey' },
162
+ 'nl' => { code: 'ned', name: 'Netherlands' },
163
+ 'ch' => { code: 'sui', name: 'Switzerland' },
164
+
165
+
166
+ 'cz' => { code: 'cze', name: 'Czech Republic' },
167
+ 'dk' => { code: 'den', name: 'Denmark' },
168
+ 'fi' => { code: 'fin', name: 'Finland' },
169
+ 'gr' => { code: 'gre', name: 'Greece' },
170
+
171
+ 'ie' => { code: 'irl', name: 'Ireland' },
172
+ 'sco' => { code: 'sco', name: 'Scotland' },
173
+
174
+ 'lu' => { code: 'lux', name: 'Luxembourg' },
175
+ 'pl' => { code: 'pol', name: 'Poland' },
176
+ 'pt' => { code: 'por', name: 'Portugal' },
177
+ 'ro' => { code: 'rou', name: 'Romania' },
178
+ 'ru' => { code: 'rus', name: 'Russia' },
179
+ 'se' => { code: 'swe', name: 'Sweden' },
180
+ 'ua' => { code: 'ukr', name: 'Ukraine' },
181
+
182
+
183
+ 'eg' => { code: 'egy', name: 'Egypt' },
184
+ 'jp' => { code: 'jpn', name: 'Japan' },
185
+ 'cn' => { code: 'chn', name: 'China' },
186
+
187
+ ## note - for now do NOT add United States to league name
188
+ ## e.g. 1 - Major League Soccer
189
+ ## 2 - USL Championship
190
+ ## cup - U.S. Open Cup
191
+ 'us' => { code: 'usa', name: nil, path: 'united-states' },
192
+
193
+ 'mx' => { code: 'mex', name: 'Mexico' },
194
+ 'ar' => { code: 'arg', name: 'Argentina' },
195
+ 'br' => { code: 'bra', name: 'Brazil' },
196
+
197
+ 'uy' => { code: 'uru', name: 'Uruguay' },
198
+ 'pe' => { code: 'per', name: 'Peru' },
199
+ 'ec' => { code: 'ecu', name: 'Ecuador' },
200
+ 'bo' => { code: 'bol', name: 'Bolivia' },
201
+ 'cl' => { code: 'chi', name: 'Chile' },
202
+ 'co' => { code: 'col', name: 'Colombia' },
203
+
204
+ 'cr' => { code: 'crc', name: 'Costa Rica' },
205
+ 'gt' => { code: 'gua', name: 'Guatemala' },
206
+ 'hn' => { code: 'hon', name: 'Honduras' },
207
+ 'sv' => { code: 'slv', name: 'El Salvador' },
208
+ 'ni' => { code: 'nca', name: 'Nicaragua' },
209
+
210
+
211
+ ## int'l tournaments
212
+ 'uefa.cl' => { code: nil, name: 'UEFA', path: 'europe' },
213
+ 'uefa.el' => { code: nil, name: 'UEFA', path: 'europe' },
214
+ 'concacaf.cl' => { code: nil, name: nil, path: 'north-america' },
215
+ 'copa.l' => { code: nil, name: nil, path: 'south-america' },
216
+ }
217
+
218
+
219
+
220
+ end # module Worldfootball
@@ -2,84 +2,18 @@
2
2
  module Worldfootball
3
3
 
4
4
 
5
- #################
6
- # todo/fix - use timezone instead of offset !!!
7
- # e.g
8
- =begin
9
- TIMEZONES = {
10
- 'eng.1' => 'Europe/London',
11
- 'eng.2' => 'Europe/London',
12
-
13
- 'es.1' => 'Europe/Madrid',
14
-
15
- 'de.1' => 'Europe/Berlin',
16
- 'fr.1' => 'Europe/Paris',
17
- 'it.1' => 'Europe/Rome',
18
- 'nl.1' => 'Europe/Amsterdam',
19
-
20
- 'pt.1' => 'Europe/Lisbon',
21
-
22
- ## todo/fix - pt.1
23
- ## one team in madeira!!! check for different timezone??
24
- ## CD Nacional da Madeira
25
-
26
- 'br.1' => 'America/Sao_Paulo',
27
- ## todo/fix - brazil has 4 timezones
28
- ## really only two in use for clubs
29
- ## west and east (amazonas et al)
30
- ## for now use west for all - why? why not?
31
- }
32
- =end
33
-
34
- ## todo - find "proper/classic" timezone ("winter time")
35
-
36
- ## Brasilia - Distrito Federal, Brasil (GMT-3) -- summer time?
37
- ## Ciudad de México, CDMX, México (GMT-5) -- summer time?
38
- ## Londres, Reino Unido (GMT+1)
39
- ## Madrid -- ?
40
- ## Lisboa -- ?
41
- ## Moskow -- ?
42
- ##
43
- ## todo/check - quick fix timezone offsets for leagues for now
44
- ## - find something better - why? why not?
45
- ## note: assume time is in GMT+1
46
- OFFSETS = {
47
- 'eng.1' => -1,
48
- 'eng.2' => -1,
49
- 'eng.3' => -1,
50
- 'eng.4' => -1,
51
- 'eng.5' => -1,
52
-
53
- 'es.1' => -1,
54
- 'es.2' => -1,
55
-
56
- 'pt.1' => -1,
57
- 'pt.2' => -1,
58
-
59
- 'br.1' => -5,
60
- 'mx.1' => -7,
61
- }
62
-
63
-
64
- def self.convert( league:, season: )
5
+ def self.convert( league:, season: )
65
6
  season = Season( season ) ## cast (ensure) season class (NOT string, integer, etc.)
66
7
 
67
- league = find_league( league )
8
+ league = find_league!( league )
9
+ pages = league.pages!( season: season )
68
10
 
69
- pages = league.pages( season: season )
70
11
 
71
- ## check: rename (optional) offset to time_offset or such?
72
- offset = OFFSETS[ league ]
73
-
74
-
75
- # note: assume stages if pages is an array (of hash table/records)
76
- # (and NOT a single hash table/record)
77
- if pages.is_a?(Array)
78
12
  recs = []
79
- pages.each do |page_meta|
80
- slug = page_meta[:slug]
81
- stage_name = page_meta[:stage]
82
- ## todo/fix: report error/check if stage.name is nil!!!
13
+ pages.each do |slug, stage|
14
+ ## note: stage might be nil
15
+ ## todo/fix: report error/check if stage is nil!!!
16
+ stage ||= ''
83
17
 
84
18
  print " parsing #{slug}..."
85
19
 
@@ -93,35 +27,27 @@ def self.convert( league:, season: )
93
27
  print "\n"
94
28
 
95
29
  rows = page.matches
96
- stage_recs = build( rows, season: season, league: league.key, stage: stage_name )
30
+ stage_recs = build( rows,
31
+ season: season,
32
+ league: league.key,
33
+ stage: stage )
97
34
 
98
35
  pp stage_recs[0] ## check first record
99
36
  recs += stage_recs
100
37
  end
101
- else
102
- page_meta = pages
103
- slug = page_meta[:slug]
104
-
105
- print " parsing #{slug}..."
106
-
107
- page = Page::Schedule.from_cache( slug )
108
- print " title=>#{page.title}<..."
109
- print "\n"
110
38
 
111
- rows = page.matches
112
- recs = build( rows, season: season, league: league.key )
113
39
 
114
- pp recs[0] ## check first record
115
- end
40
+ recs = fix_dates( recs, league: league.key )
116
41
 
117
- recs = recs.map { |rec| fix_date( rec, offset ) } if offset
118
42
 
119
43
  ## note: sort matches by date before saving/writing!!!!
120
44
  ## note: for now assume date in string in 1999-11-30 format (allows sort by "simple" a-z)
121
45
  ## note: assume date is third column!!! (stage/round/date/...)
122
46
  recs = recs.sort { |l,r| l[2] <=> r[2] }
123
47
  ## reformat date / beautify e.g. Sat Aug 7 1993
124
- recs.each { |rec| rec[2] = Date.strptime( rec[2], '%Y-%m-%d' ).strftime( '%a %b %-d %Y' ) }
48
+ recs.each do |rec|
49
+ rec[2] = Date.strptime( rec[2], '%Y-%m-%d' ).strftime( '%a %b %-d %Y' )
50
+ end
125
51
 
126
52
  ## remove unused columns (e.g. stage, et, p, etc.)
127
53
  recs, headers = vacuum( recs )
@@ -136,10 +62,44 @@ recs.each { |rec| rec[2] = Date.strptime( rec[2], '%Y-%m-%d' ).strftime( '%a %b
136
62
  end
137
63
 
138
64
 
139
-
140
65
  ## helper to fix dates to use local timezone (and not utc/london time)
141
- def self.fix_date( row, offset )
142
- return row if row[3].nil? || row[3].empty? ## note: time (column) required for fix
66
+ def self.fix_dates( rows, league: )
67
+
68
+ ## check: rename (optional) offset to time_offset or such?
69
+ ## note - retry with league_country (e.g. eng.1 => eng etc.)
70
+ offset = OFFSETS[ league ] ||
71
+ OFFSETS[ league.split('.')[0] ]
72
+
73
+ if offset.nil?
74
+ puts "!! ERROR - no timezone/offset configured for league >#{league}<:"
75
+ pp rows[0] ## print first row too (for dates etc.)
76
+ exit 1
77
+ end
78
+
79
+
80
+ ## todo/check - rename offset to timezone
81
+ ## or utc_offset or such - why? why not
82
+
83
+ ## note - assume central european time (cet) - GMT/UTC+1
84
+ ## e.g. offset = 1 for cet (and 0 for gmt/london) etc.
85
+ diff_cet = offset-1
86
+
87
+ return rows if diff_cet == 0 ## no need to convert if in cet
88
+
89
+ rows.map { |row| _fix_date( row, offset ) }
90
+ end
91
+
92
+
93
+ def self._fix_date( row, offset )
94
+ ## note: time (column) required for fix
95
+ return row if row[3].nil? || row[3].empty?
96
+
97
+ ## note - assume central european time (cet) - GMT/UTC+1
98
+ diff_cet = offset-1
99
+
100
+ return row if diff_cet == 0
101
+
102
+
143
103
 
144
104
  col = row[2]
145
105
  if col =~ /^\d{4}-\d{2}-\d{2}$/
@@ -152,7 +112,7 @@ def self.fix_date( row, offset )
152
112
 
153
113
  date = DateTime.strptime( "#{row[2]} #{row[3]}", "#{date_fmt} %H:%M" )
154
114
  ## NOTE - MUST be -7/24.0!!!! or such to work
155
- date = date + (offset/24.0)
115
+ date = date + (diff_cet/24.0)
156
116
 
157
117
  row[2] = date.strftime( date_fmt ) ## overwrite "old"
158
118
  row[3] = date.strftime( '%H:%M' )
@@ -7,14 +7,9 @@ module Worldfootball
7
7
  def self.schedule( league:, season: )
8
8
  season = Season( season ) ## cast (ensure) season class (NOT string, integer, etc.)
9
9
 
10
- league = find_league( league )
11
-
12
- pages = league.pages( season: season )
13
-
14
- ## if single (simple) page setup - wrap in array
15
- pages = pages.is_a?(Array) ? pages : [pages]
16
- pages.each do |page_meta|
17
- Metal.download_schedule( page_meta[:slug] )
10
+ pages = find_league_pages!( league: league, season: season )
11
+ pages.each do |slug, _|
12
+ Metal.download_schedule( slug )
18
13
  end # each page
19
14
  end
20
15
 
@@ -22,14 +17,9 @@ end
22
17
  def self.reports( league:, season:, cache: true ) ## todo/check: rename to reports_for_schedule or such - why? why not?
23
18
  season = Season( season ) ## cast (ensure) season class (NOT string, integer, etc.)
24
19
 
25
- league = find_league( league )
26
-
27
- pages = league.pages( season: season )
28
-
29
- ## if single (simple) page setup - wrap in array
30
- pages = pages.is_a?(Array) ? pages : [pages]
31
- pages.each do |page_meta|
32
- Metal.download_reports_for_schedule( page_meta[:slug], cache: cache )
20
+ pages = find_league_pages!( league: league, season: season )
21
+ pages.each do |slug, _|
22
+ Metal.download_reports_for_schedule( slug, cache: cache )
33
23
  end # each page
34
24
  end
35
25
 
@@ -41,7 +31,7 @@ end
41
31
 
42
32
  ## todo/check: put in Downloader namespace/class - why? why not?
43
33
  ## or use Metal - no "porcelain" downloaders / machinery
44
- class Metal
34
+ class Metal
45
35
 
46
36
  BASE_URL = 'https://www.weltfussball.de'
47
37
 
@@ -117,7 +107,7 @@ class Metal
117
107
  end
118
108
  end
119
109
 
120
-
110
+
121
111
  def self.download_page( url ) ## get & record/save to cache
122
112
  response = Webget.page( url ) ## fetch (and cache) html page (via HTTP GET)
123
113