rsssf 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/{HISTORY.md → CHANGELOG.md} +2 -0
- data/Manifest.txt +3 -6
- data/README.md +43 -26
- data/Rakefile +8 -7
- data/lib/rsssf/convert.rb +495 -0
- data/lib/rsssf/download.rb +151 -0
- data/lib/rsssf/page.rb +70 -45
- data/lib/rsssf/repo.rb +77 -153
- data/lib/rsssf/reports/page.rb +30 -19
- data/lib/rsssf/reports/schedule.rb +111 -25
- data/lib/rsssf/schedule.rb +4 -14
- data/lib/rsssf/utils.rb +10 -29
- data/lib/rsssf/version.rb +3 -5
- data/lib/rsssf.rb +42 -19
- metadata +26 -25
- data/.gemtest +0 -0
- data/lib/rsssf/fetch.rb +0 -80
- data/lib/rsssf/html2txt.rb +0 -157
- data/lib/rsssf/patch.rb +0 -28
- data/test/helper.rb +0 -12
- data/test/test_utils.rb +0 -83
@@ -0,0 +1,151 @@
|
|
1
|
+
|
2
|
+
module Rsssf
|
3
|
+
|
4
|
+
## end_year to slug_year
|
5
|
+
## check if generic rule/convention in use ???
|
6
|
+
## 2007-08: tablesd/duit08.html
|
7
|
+
## 2008-09: tablesd/duit09.html
|
8
|
+
## 2009-10: tablesd/duit2010.html
|
9
|
+
## 2010-11: tablesd/duit2011.html
|
10
|
+
## 2011-12: tablesd/duit2012.html
|
11
|
+
|
12
|
+
|
13
|
+
## map country codes to table pages
|
14
|
+
## add options about (char) encoding ??? - why? why not?
|
15
|
+
TABLE = {
|
16
|
+
'eng' => ['tablese/eng{year}', { encoding: 'Windows-1252' } ],
|
17
|
+
'es' => ['tabless/span{year}', { encoding: 'Windows-1252' } ],
|
18
|
+
'de' => ['tablesd/duit{year}', { encoding: 'Windows-1252' } ],
|
19
|
+
'at' => ['tableso/oost{year}', { encoding: 'Windows-1252' } ],
|
20
|
+
'br' => [
|
21
|
+
->(season) {
|
22
|
+
## note: special slug/case for year/season 2000
|
23
|
+
## see rsssf.org/tablesb/brazchamp.html
|
24
|
+
if season == Season('2000')
|
25
|
+
'tablesb/braz-joao{year}' ## use braz-joao00 - why? why not?
|
26
|
+
else
|
27
|
+
'tablesb/braz{year}'
|
28
|
+
end
|
29
|
+
}, { encoding: 'Windows-1252' } ],
|
30
|
+
}
|
31
|
+
|
32
|
+
|
33
|
+
BASE_URL = "https://rsssf.org"
|
34
|
+
|
35
|
+
|
36
|
+
def self.table_url( code, season: )
|
37
|
+
url, _ = table_url_and_encoding( code, season: season )
|
38
|
+
url
|
39
|
+
end
|
40
|
+
|
41
|
+
def self.table_url_and_encoding( code, season: )
|
42
|
+
season = Season( season )
|
43
|
+
|
44
|
+
table = TABLE[ code.downcase ]
|
45
|
+
tmpl = table[0]
|
46
|
+
tmpl = tmpl.call( season ) if tmpl.is_a?(Proc) ## check for proc
|
47
|
+
|
48
|
+
opts = table[1] || {}
|
49
|
+
encoding = opts[:encoding] || 'UTF-8'
|
50
|
+
|
51
|
+
|
52
|
+
slug = if season.end_year < 2010 ## cut off all digits (only keep last two)s
|
53
|
+
## convert end_year to string with leading zero
|
54
|
+
'%02d' % (season.end_year % 100) ## e.g. 00 / 01 / 99 / 98 / 11 / etc.
|
55
|
+
else
|
56
|
+
'%4d' % season.end_year
|
57
|
+
end
|
58
|
+
|
59
|
+
tmpl = tmpl.sub( '{year}', slug )
|
60
|
+
url = "#{BASE_URL}/#{tmpl}.html"
|
61
|
+
|
62
|
+
[url, encoding]
|
63
|
+
end
|
64
|
+
|
65
|
+
|
66
|
+
def self.download_table( code, season: )
|
67
|
+
url, encoding = table_url_and_encoding( code, season: season )
|
68
|
+
|
69
|
+
download_page( url, encoding: encoding )
|
70
|
+
end
|
71
|
+
|
72
|
+
|
73
|
+
def self.download_page( url, encoding: )
|
74
|
+
|
75
|
+
## note: assume plain 7-bit ascii for now
|
76
|
+
## -- assume rsssf uses ISO_8859_15 (updated version of ISO_8859_1)
|
77
|
+
###-- does NOT use utf-8 character encoding!!!
|
78
|
+
response = Webget.page( url, encoding: encoding ) ## fetch (and cache) html page (via HTTP GET)
|
79
|
+
|
80
|
+
## note: exit on get / fetch error - do NOT continue for now - why? why not?
|
81
|
+
exit 1 if response.status.nok? ## e.g. HTTP status code != 200
|
82
|
+
|
83
|
+
|
84
|
+
puts "html:"
|
85
|
+
html = response.text( encoding: encoding )
|
86
|
+
pp html[0..400]
|
87
|
+
html
|
88
|
+
end
|
89
|
+
end # module Rsssf
|
90
|
+
|
91
|
+
|
92
|
+
|
93
|
+
__END__
|
94
|
+
|
95
|
+
1998-99: tablesd/duit99.html
|
96
|
+
1999-00: tablesd/duit00.html ## use 1999-2000 - why?? why not??
|
97
|
+
2000-01: tablesd/duit01.html
|
98
|
+
2001-02: tablesd/duit02.html
|
99
|
+
2002-03: tablesd/duit03.html
|
100
|
+
2003-04: tablesd/duit04.html
|
101
|
+
2004-05: tablesd/duit05.html
|
102
|
+
2005-06: tablesd/duit06.html
|
103
|
+
2006-07: tablesd/duit07.html
|
104
|
+
2007-08: tablesd/duit08.html
|
105
|
+
2008-09: tablesd/duit09.html
|
106
|
+
2009-10: tablesd/duit2010.html
|
107
|
+
2010-11: tablesd/duit2011.html
|
108
|
+
2011-12: tablesd/duit2012.html
|
109
|
+
2012-13: tablesd/duit2013.html
|
110
|
+
2013-14: tablesd/duit2014.html
|
111
|
+
2014-15: tablesd/duit2015.html
|
112
|
+
|
113
|
+
|
114
|
+
2010-11: tableso/oost2011.html
|
115
|
+
2011-12: tableso/oost2012.html
|
116
|
+
2012-13: tableso/oost2013.html
|
117
|
+
2013-14: tableso/oost2014.html
|
118
|
+
2014-15: tableso/oost2015.html
|
119
|
+
2015-16: tableso/oost2016.html
|
120
|
+
|
121
|
+
2011: tablesb/braz2011.html !! Windows-1252
|
122
|
+
2012: tablesb/braz2012.html !! Windows-1252
|
123
|
+
2013: tablesb/braz2013.html !! Windows-1252
|
124
|
+
2014: tablesb/braz2014.html !! Windows-1252
|
125
|
+
2015: tablesb/braz2015.html !! Windows-1252
|
126
|
+
2016: tablesb/braz2016.html !! Windows-1252
|
127
|
+
2017: tablesb/braz2017.html !! Windows-1252
|
128
|
+
2018: tablesb/braz2018.html !! Windows-1252
|
129
|
+
2019: tablesb/braz2019.html !! Windows-1252
|
130
|
+
2020: tablesb/braz2020.html !! Windows-1252 ## 2020/21 - extended for corona
|
131
|
+
2021: tablesb/braz2021.html !! Windows-1252
|
132
|
+
2022: tablesb/braz2022.html !! Windows-1252
|
133
|
+
2023: tablesb/braz2023.html !! Windows-1252
|
134
|
+
2024: tablesb/braz2024.html !! Windows-1252
|
135
|
+
|
136
|
+
2010-11: tablese/eng2011.html !! Windows-1252
|
137
|
+
2011-12: tablese/eng2012.html !! Windows-1252
|
138
|
+
2012-13: tablese/eng2013.html !! Windows-1252
|
139
|
+
2013-14: tablese/eng2014.html !! Windows-1252
|
140
|
+
2014-15: tablese/eng2015.html !! Windows-1252
|
141
|
+
2015-16: tablese/eng2016.html !! Windows-1252
|
142
|
+
2016-17: tablese/eng2017.html !! Windows-1252
|
143
|
+
2017-18: tablese/eng2018.html !! Windows-1252
|
144
|
+
2018-19: tablese/eng2019.html !! Windows-1252
|
145
|
+
2019-20: tablese/eng2020.html !! Windows-1252
|
146
|
+
2020-21: tablese/eng2021.html !! Windows-1252
|
147
|
+
2021-22: tablese/eng2022.html !! Windows-1252
|
148
|
+
2022-23: tablese/eng2023.html !! Windows-1252
|
149
|
+
2023-24: tablese/eng2024.html !! Windows-1252
|
150
|
+
|
151
|
+
|
data/lib/rsssf/page.rb
CHANGED
@@ -1,13 +1,11 @@
|
|
1
|
-
# encoding: utf-8
|
2
1
|
|
3
2
|
|
4
3
|
module Rsssf
|
4
|
+
|
5
5
|
|
6
6
|
PageStat = Struct.new(
|
7
|
-
:source, ## e.g.
|
8
|
-
:basename, ## e.g. duit89 -- note: filename w/o extension (and path)
|
7
|
+
:source, ## e.g. https://rsssf.org/tabled/duit89.html
|
9
8
|
:year, ## e.g. 1989 -- note: always four digits
|
10
|
-
:season, ## e.g. 1990-91 -- note: always a string (NOT a number)
|
11
9
|
:authors,
|
12
10
|
:last_updated,
|
13
11
|
:line_count, ## todo: rename to (just) lines - why? why not?
|
@@ -27,24 +25,41 @@ module Rsssf
|
|
27
25
|
class Page
|
28
26
|
|
29
27
|
include Utils ## e.g. year_from_name, etc.
|
28
|
+
|
29
|
+
def self.read_cache( url ) ### use read_cache /web/html or such - why? why not?
|
30
|
+
html = Webcache.read( url )
|
31
|
+
|
32
|
+
puts "html:"
|
33
|
+
pp html[0..400]
|
34
|
+
|
35
|
+
txt = PageConverter.convert( html, url: url )
|
36
|
+
txt
|
30
37
|
|
31
|
-
|
32
|
-
txt = PageFetcher.new.fetch( src )
|
33
|
-
self.from_string( txt )
|
38
|
+
new( txt )
|
34
39
|
end
|
35
40
|
|
36
41
|
|
37
|
-
def self.
|
38
|
-
|
39
|
-
|
42
|
+
def self.read_txt( path ) ## use read_txt
|
43
|
+
# note: always assume sources (already) converted from html to txt!!!!
|
44
|
+
txt = read_text( path )
|
45
|
+
new( txt )
|
40
46
|
end
|
41
47
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
48
|
+
|
49
|
+
|
50
|
+
### use text alias too (for txt) - why? why not?
|
51
|
+
attr_accessor :txt
|
52
|
+
|
53
|
+
## quick hack? used for auto-patch machinery
|
54
|
+
attr_accessor :patch
|
55
|
+
attr_accessor :url ### source url
|
56
|
+
|
57
|
+
|
46
58
|
def initialize( txt )
|
47
59
|
@txt = txt
|
60
|
+
|
61
|
+
@patch = nil
|
62
|
+
@url = nil
|
48
63
|
end
|
49
64
|
|
50
65
|
|
@@ -61,17 +76,20 @@ CUP_ROUND_REGEX = /\b(
|
|
61
76
|
Final
|
62
77
|
)\b/ix
|
63
78
|
|
64
|
-
|
79
|
+
|
80
|
+
|
81
|
+
## make header required - why? why not?
|
82
|
+
def find_schedule( header: nil,
|
83
|
+
cup: false ) ## change to build_schedule - why? why not???
|
65
84
|
|
66
85
|
## find match schedule/fixtures in multi-league doc
|
67
|
-
new_txt =
|
86
|
+
new_txt = String.new
|
68
87
|
|
69
88
|
## note: keep track of statistics
|
70
89
|
## e.g. number of rounds found
|
71
90
|
|
72
91
|
round_count = 0
|
73
92
|
|
74
|
-
header = opts[:header]
|
75
93
|
if header
|
76
94
|
league_header_found = false
|
77
95
|
|
@@ -89,7 +107,8 @@ def find_schedule( opts={} ) ## change to build_schedule - why? why not???
|
|
89
107
|
header_regex = /^
|
90
108
|
([#]{2,4}\s+(#{header_esc}))
|
91
109
|
|
|
92
|
-
(\*{2}(#{header_esc})\*{2})
|
110
|
+
(\*{2}(#{header_esc})) ## was: \*{2})
|
111
|
+
## do not include trailing ** for now (allows anchors e.g. §)
|
93
112
|
/ix
|
94
113
|
|
95
114
|
## todo:
|
@@ -105,7 +124,7 @@ def find_schedule( opts={} ) ## change to build_schedule - why? why not???
|
|
105
124
|
## pp header_regex
|
106
125
|
|
107
126
|
|
108
|
-
if
|
127
|
+
if cup
|
109
128
|
round_regex = CUP_ROUND_REGEX ## note: only allow final, quaterfinals, etc. if knockout cup
|
110
129
|
else
|
111
130
|
round_regex = LEAGUE_ROUND_REGEX
|
@@ -128,8 +147,10 @@ def find_schedule( opts={} ) ## change to build_schedule - why? why not???
|
|
128
147
|
if line =~ header_regex
|
129
148
|
puts "!!! bingo - found header >#{line}<"
|
130
149
|
league_header_found = true
|
131
|
-
|
132
|
-
|
150
|
+
|
151
|
+
## note - do NOT auto-add header/title !!!
|
152
|
+
# title = line.gsub( /[#*]/, '' ).strip ## quick hack: extract title from header
|
153
|
+
# new_txt << "## #{title}\n\n" # note: use header/stage title (regex group capture)
|
133
154
|
else
|
134
155
|
puts " searching for header >#{header}<; skipping line >#{line}<"
|
135
156
|
next
|
@@ -205,13 +226,24 @@ def find_schedule( opts={} ) ## change to build_schedule - why? why not???
|
|
205
226
|
end
|
206
227
|
end # each line
|
207
228
|
|
208
|
-
|
209
|
-
|
229
|
+
|
230
|
+
## quick hack?
|
231
|
+
### auto-apply patch if patch configured
|
232
|
+
if @patch && @patch.respond_to?(:on_patch)
|
233
|
+
url_path = URI.parse( url ).path
|
234
|
+
basename = File.basename( url_path, File.extname( url_path ))
|
235
|
+
year = year_from_name( basename )
|
236
|
+
new_txt = @patch.on_patch( new_txt, basename, year )
|
237
|
+
end
|
238
|
+
|
239
|
+
schedule = Schedule.new( new_txt )
|
240
|
+
## schedule.rounds = round_count
|
210
241
|
|
211
242
|
schedule
|
212
243
|
end # method find_schedule
|
213
244
|
|
214
245
|
|
246
|
+
|
215
247
|
def build_stat
|
216
248
|
source = nil
|
217
249
|
authors = nil
|
@@ -224,7 +256,7 @@ def build_stat
|
|
224
256
|
end
|
225
257
|
|
226
258
|
##
|
227
|
-
## fix/todo: move authors n last updated whitespace cleanup
|
259
|
+
## fix/todo: move authors n last updated whitespace cleanup - why? why not??
|
228
260
|
|
229
261
|
if @txt =~ /authors?:\s+(.+?)\s+last updated:\s+(\d{1,2} [a-z]{3,10} \d{4})/im
|
230
262
|
last_updated = $2.to_s # note: save a copy first (gets "reset" by next regex)
|
@@ -235,7 +267,15 @@ def build_stat
|
|
235
267
|
end
|
236
268
|
|
237
269
|
puts "*** !!! missing source" if source.nil?
|
238
|
-
puts "*** !!! missing authors
|
270
|
+
puts "*** !!! missing authors and last updated" if authors.nil? || last_updated.nil?
|
271
|
+
|
272
|
+
|
273
|
+
## get year from source (url)
|
274
|
+
url_path = URI.parse( source ).path
|
275
|
+
basename = File.basename( url_path, File.extname( url_path ) ) ## e.g. duit92.txt or duit92.html => duit92
|
276
|
+
puts " basename=>#{basename}<"
|
277
|
+
year = year_from_name( basename )
|
278
|
+
|
239
279
|
|
240
280
|
sections = []
|
241
281
|
|
@@ -248,26 +288,16 @@ def build_stat
|
|
248
288
|
## todo: add more patterns? how? why?
|
249
289
|
if line =~ /####\s+(.+)/
|
250
290
|
puts " found section >#{$1}<"
|
251
|
-
|
291
|
+
## remove anchors first e.g. ‹§sa› etc.
|
292
|
+
## check if anchors with underscore (_) or dash/hyphen (-) ???
|
293
|
+
sections << $1.sub( /‹§[a-z0-9]+›/, '' ).strip
|
252
294
|
end
|
253
295
|
end
|
254
296
|
|
255
297
|
|
256
|
-
# get path from url
|
257
|
-
url = URI.parse( source )
|
258
|
-
## pp url
|
259
|
-
## puts url.host
|
260
|
-
path = url.path
|
261
|
-
extname = File.extname( path )
|
262
|
-
basename = File.basename( path, extname ) ## e.g. duit92.txt or duit92.html => duit92
|
263
|
-
year = year_from_name( basename )
|
264
|
-
season = year_to_season( year )
|
265
|
-
|
266
298
|
rec = PageStat.new
|
267
299
|
rec.source = source # e.g. http://rsssf.org/tabled/duit89.html -- use source_url - why?? why not??
|
268
|
-
rec.
|
269
|
-
rec.year = year # e.g. 89 => 1989 -- note: always four digits
|
270
|
-
rec.season = season
|
300
|
+
rec.year = year
|
271
301
|
rec.authors = authors
|
272
302
|
rec.last_updated = last_updated
|
273
303
|
rec.line_count = line_count
|
@@ -279,17 +309,12 @@ end ## method build_stat
|
|
279
309
|
|
280
310
|
|
281
311
|
def save( path )
|
282
|
-
|
283
|
-
f.write @txt
|
284
|
-
end
|
312
|
+
write_text( path, @txt )
|
285
313
|
end ## method save
|
286
314
|
|
287
315
|
end ## class Page
|
288
316
|
end ## module Rsssf
|
289
317
|
|
290
318
|
|
291
|
-
## add (shortcut) alias
|
292
|
-
RsssfPageStat = Rsssf::PageStat
|
293
|
-
RsssfPage = Rsssf::Page
|
294
319
|
|
295
320
|
|
data/lib/rsssf/repo.rb
CHANGED
@@ -1,174 +1,115 @@
|
|
1
|
-
# encoding: utf-8
|
2
1
|
|
3
2
|
module Rsssf
|
4
3
|
|
5
|
-
## used by Repo#make_schedules
|
6
|
-
ScheduleConfig = Struct.new(
|
7
|
-
:name,
|
8
|
-
:opts_for_year, ## hash or proc ->(year){ Hash[...] }
|
9
|
-
:dir_for_year, ## proc ->(year){ 'path_here'} ## rename to path_for_year - why, why not??
|
10
|
-
:includes ## array of years to include e.g. [2011,2012] etc.
|
11
|
-
)
|
12
|
-
|
13
|
-
|
14
|
-
ScheduleStat = Struct.new(
|
15
|
-
:path, ## e.g. 2012-13 or archive/1980s/1984-85
|
16
|
-
:filename, ## e.g. 1-bundesliga.txt -- note: w/o path
|
17
|
-
:year, ## e.g. 2013 -- note: numeric (integer)
|
18
|
-
:season, ## e.g. 2012-13 -- note: is a string
|
19
|
-
:rounds ## e.g. 36 -- note: numeric (integer)
|
20
|
-
)
|
21
4
|
|
22
5
|
|
23
6
|
class Repo
|
24
|
-
|
25
|
-
include Filters ## e.g. sanitize, etc.
|
26
7
|
include Utils ## e.g. year_from_file, etc.
|
27
8
|
|
28
9
|
|
29
|
-
def initialize( path,
|
10
|
+
def initialize( path, title: 'Your Title Here',
|
11
|
+
patch: nil )
|
30
12
|
@repo_path = path
|
31
|
-
@
|
13
|
+
@title = title
|
14
|
+
@patch = patch
|
32
15
|
end
|
33
16
|
|
34
17
|
|
35
|
-
def
|
36
|
-
|
37
|
-
cfg = YAML.load_file( "#{@repo_path}/tables/config.yml")
|
38
|
-
pp cfg
|
18
|
+
def root() @repo_path; end ## use/rename to path - why? why not?
|
19
|
+
alias_method :root_dir, :root
|
39
20
|
|
40
|
-
dl_base = 'http://rsssf.com'
|
41
21
|
|
42
|
-
|
43
|
-
|
44
|
-
|
22
|
+
## for now use single country repos - why? why not?
|
23
|
+
## add support for all-in-one repos
|
24
|
+
def prepare_pages( code, seasons )
|
25
|
+
seasons.each do |season|
|
26
|
+
url = Rsssf.table_url( code, season: season )
|
45
27
|
|
46
|
-
##
|
47
|
-
|
48
|
-
|
28
|
+
## check if not in cache
|
29
|
+
unless Webcache.cached?( url )
|
30
|
+
## download - if not cached
|
31
|
+
Rsssf.download_table( code, season: season )
|
32
|
+
end
|
49
33
|
|
50
|
-
|
51
|
-
dest_path = "#{@repo_path}/tables/#{basename}.txt"
|
34
|
+
page = Page.read_cache( url )
|
52
35
|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
end # method fetch_pages
|
36
|
+
url_path = URI.parse( url ).path
|
37
|
+
puts " url = >#{url}<"
|
38
|
+
puts " url_path = >#{url_path}<"
|
57
39
|
|
40
|
+
basename = File.basename( url_path, File.extname( url_path ))
|
58
41
|
|
59
|
-
|
60
|
-
|
42
|
+
###
|
43
|
+
## check for on_prepare (apply patches)
|
44
|
+
if @patch && @patch.respond_to?(:on_prepare)
|
45
|
+
year = year_from_name( basename )
|
46
|
+
page.txt = @patch.on_prepare( page.txt, basename, year )
|
47
|
+
end
|
61
48
|
|
62
|
-
files = Dir[ "#{@repo_path}/tables/*.txt" ]
|
63
|
-
files.each do |file|
|
64
|
-
page = Page.from_file( file )
|
65
|
-
stats << page.build_stat
|
66
|
-
end
|
67
49
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
end
|
50
|
+
path = "#{@repo_path}/tables/#{basename}.txt"
|
51
|
+
page.save( path )
|
52
|
+
end
|
53
|
+
end # method prepare_pages
|
72
54
|
|
73
55
|
|
74
|
-
def
|
75
|
-
|
76
|
-
|
77
|
-
|
56
|
+
def each_page( code, seasons, &blk ) ## use each table or such - why? why not?
|
57
|
+
seasons.each do |season|
|
58
|
+
url = Rsssf.table_url( code, season: season )
|
59
|
+
url_path = URI.parse( url ).path
|
60
|
+
puts " url = >#{url}<"
|
61
|
+
puts " url_path = >#{url_path}<"
|
62
|
+
basename = File.basename( url_path, File.extname( url_path ))
|
78
63
|
|
64
|
+
path = "#{@repo_path}/tables/#{basename}.txt"
|
65
|
+
page = Page.read_txt( path )
|
79
66
|
|
67
|
+
## add/pass along patcher if patcher
|
68
|
+
if @patch
|
69
|
+
page.patch = @patch
|
70
|
+
page.url = url
|
71
|
+
end
|
80
72
|
|
81
|
-
|
82
|
-
|
83
|
-
patch_dir( "#{@repo_path}/tables" ) do |txt, name, year|
|
84
|
-
puts "patching #{year} (#{name}) (#{@repo_path})..."
|
85
|
-
patcher.patch( txt, name, year ) ## note: must be last (that is, must return (patcher) t(e)xt)
|
73
|
+
season = Season( season )
|
74
|
+
blk.call( season, page )
|
86
75
|
end
|
87
|
-
end ## method patch_pages
|
88
|
-
|
89
|
-
|
90
|
-
def sanitize_pages
|
91
|
-
## for debugging/testing lets you (re)run sanitize (alreay incl. in html2txt filter by default)
|
92
|
-
sanitize_dir( "#{@repo_path}/tables" )
|
93
76
|
end
|
94
77
|
|
95
78
|
|
79
|
+
def make_pages_summary
|
80
|
+
files = Dir.glob( "#{@repo_path}/tables/*.txt" )
|
81
|
+
report = PageReport.build( files, title: @title ) ## pass in title etc.
|
96
82
|
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
stats = []
|
101
|
-
|
102
|
-
files = Dir[ "#{@repo_path}/tables/*.txt" ]
|
103
|
-
files.each do |file|
|
104
|
-
|
105
|
-
## todo/check/fix:
|
106
|
-
## use source: prop in rsssf page - why? why not???
|
107
|
-
## move year/season/basename into page ???
|
108
|
-
#
|
109
|
-
# assume every rsssf page has at least:
|
110
|
-
## - basename e.g. duit2014
|
111
|
-
## - year e.g. 2014 (numeric)
|
112
|
-
## - season (derived from config lookup???) - string e.g. 2014-15 or 2014 etc.
|
113
|
-
extname = File.extname( file )
|
114
|
-
basename = File.basename( file, extname )
|
115
|
-
year = year_from_name( basename )
|
116
|
-
season = year_to_season( year )
|
117
|
-
|
118
|
-
if cfg.includes && cfg.includes.include?( year ) == false
|
119
|
-
puts " skipping #{basename}; not listed in includes"
|
120
|
-
next
|
121
|
-
end
|
122
|
-
|
123
|
-
|
124
|
-
puts " reading >#{basename}<"
|
83
|
+
### save report as README.md in tables/ folder in repo
|
84
|
+
report.save( "#{@repo_path}/tables/README.md" )
|
85
|
+
end # method make_pages_summary
|
125
86
|
|
126
|
-
page = Page.from_file( file ) # note: always assume sources (already) converted to utf-8
|
127
87
|
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
if cfg.dir_for_year.nil?
|
141
|
-
## use default setting, that is, archive for dir (e.g. archive/1980s/1985-86 etc.)
|
142
|
-
dir_for_year = archive_dir_for_year( year )
|
143
|
-
else
|
144
|
-
## assume it's a proc/lambda
|
145
|
-
dir_for_year = cfg.dir_for_year.call( year )
|
146
|
-
end
|
88
|
+
def make_schedules_summary
|
89
|
+
## find all match datafiles
|
90
|
+
args = [@repo_path]
|
91
|
+
files = SportDb::Parser::Opts.expand_args( args )
|
92
|
+
pp files
|
93
|
+
|
94
|
+
report = ScheduleReport.build( files, title: @title,
|
95
|
+
patch: @patch ) ## pass in title etc.
|
96
|
+
report.save( "#{@repo_path}/README.md" )
|
97
|
+
end
|
147
98
|
|
148
|
-
## -- cfg.name e.g. => 1-liga
|
149
99
|
|
150
|
-
dest_path = "#{@repo_path}/#{dir_for_year}/#{cfg.name}.txt"
|
151
|
-
puts " save to >#{dest_path}<"
|
152
|
-
FileUtils.mkdir_p( File.dirname( dest_path ))
|
153
|
-
schedule.save( dest_path )
|
154
100
|
|
155
|
-
rec = ScheduleStat.new
|
156
|
-
rec.path = dir_for_year
|
157
|
-
rec.filename = "#{cfg.name}.txt" ## change to basename - why?? why not??
|
158
|
-
rec.year = year
|
159
|
-
rec.season = season
|
160
|
-
rec.rounds = schedule.rounds
|
161
101
|
|
162
|
-
|
102
|
+
def patch_pages( patcher )
|
103
|
+
## lets you run/use custom (repo/country-specific) patches e.g. for adding/patching headings etc.
|
104
|
+
patch_dir( "#{@repo_path}/tables" ) do |txt, name, year|
|
105
|
+
puts "patching #{year} (#{name}) (#{@repo_path})..."
|
106
|
+
patcher.patch( txt, name, year ) ## note: must be last (that is, must return (patcher) t(e)xt)
|
163
107
|
end
|
164
|
-
|
165
|
-
stats # return stats for reporting
|
166
|
-
end # method make_schedules
|
108
|
+
end ## method patch_pages
|
167
109
|
|
168
110
|
|
169
|
-
|
170
|
-
|
171
|
-
files = Dir[ "#{root}/*.txt" ]
|
111
|
+
def patch_dir( root, &blk )
|
112
|
+
files = Dir.glob( "#{root}/**/*.txt" )
|
172
113
|
## pp files
|
173
114
|
|
174
115
|
## sort files by year (latest first)
|
@@ -180,41 +121,24 @@ def patch_dir( root )
|
|
180
121
|
end
|
181
122
|
|
182
123
|
files.each do |file|
|
183
|
-
txt =
|
124
|
+
txt = read_text( file ) ## note: assumes already converted to utf-8
|
184
125
|
|
185
126
|
basename = File.basename( file, '.txt' ) ## e.g. duit92.txt => duit92
|
186
127
|
year = year_from_name( basename )
|
187
128
|
|
188
|
-
new_txt =
|
189
|
-
## calculate hash to see if anything changed ?? why? why not??
|
129
|
+
new_txt = blk.call( txt, basename, year )
|
190
130
|
|
191
|
-
|
192
|
-
|
131
|
+
## calculate hash to see if anything changed ?? why? why not??
|
132
|
+
if txt != new_txt
|
133
|
+
puts " patching #{file}, text changed"
|
134
|
+
write_text( file, new_txt )
|
193
135
|
end
|
194
136
|
end # each file
|
195
137
|
end ## patch_dir
|
196
138
|
|
197
|
-
def sanitize_dir( root )
|
198
|
-
files = Dir[ "#{root}/*.txt" ]
|
199
|
-
|
200
|
-
files.each do |file|
|
201
|
-
txt = File.read_utf8( file ) ## note: assumes already converted to utf-8
|
202
|
-
|
203
|
-
new_txt = sanitize( txt )
|
204
139
|
|
205
|
-
File.open( file, 'w' ) do |f|
|
206
|
-
f.write new_txt
|
207
|
-
end
|
208
|
-
end # each file
|
209
|
-
end ## sanitize_dir
|
210
140
|
|
211
141
|
|
212
142
|
end ## class Repo
|
213
143
|
end ## module Rsssf
|
214
144
|
|
215
|
-
## add (shortcut) alias
|
216
|
-
RsssfRepo = Rsssf::Repo
|
217
|
-
RsssfScheduleConfig = Rsssf::ScheduleConfig
|
218
|
-
RsssfScheduleStat = Rsssf::ScheduleStat
|
219
|
-
|
220
|
-
|