rsssf 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/{HISTORY.md → CHANGELOG.md} +2 -0
- data/Manifest.txt +3 -6
- data/README.md +43 -26
- data/Rakefile +8 -7
- data/lib/rsssf/convert.rb +495 -0
- data/lib/rsssf/download.rb +151 -0
- data/lib/rsssf/page.rb +70 -45
- data/lib/rsssf/repo.rb +77 -153
- data/lib/rsssf/reports/page.rb +30 -19
- data/lib/rsssf/reports/schedule.rb +111 -25
- data/lib/rsssf/schedule.rb +4 -14
- data/lib/rsssf/utils.rb +10 -29
- data/lib/rsssf/version.rb +3 -5
- data/lib/rsssf.rb +42 -19
- metadata +26 -25
- data/.gemtest +0 -0
- data/lib/rsssf/fetch.rb +0 -80
- data/lib/rsssf/html2txt.rb +0 -157
- data/lib/rsssf/patch.rb +0 -28
- data/test/helper.rb +0 -12
- data/test/test_utils.rb +0 -83
@@ -0,0 +1,151 @@
|
|
1
|
+
|
2
|
+
module Rsssf
|
3
|
+
|
4
|
+
## end_year to slug_year
|
5
|
+
## check if generic rule/convention in use ???
|
6
|
+
## 2007-08: tablesd/duit08.html
|
7
|
+
## 2008-09: tablesd/duit09.html
|
8
|
+
## 2009-10: tablesd/duit2010.html
|
9
|
+
## 2010-11: tablesd/duit2011.html
|
10
|
+
## 2011-12: tablesd/duit2012.html
|
11
|
+
|
12
|
+
|
13
|
+
## map country codes to table pages
|
14
|
+
## add options about (char) encoding ??? - why? why not?
|
15
|
+
TABLE = {
|
16
|
+
'eng' => ['tablese/eng{year}', { encoding: 'Windows-1252' } ],
|
17
|
+
'es' => ['tabless/span{year}', { encoding: 'Windows-1252' } ],
|
18
|
+
'de' => ['tablesd/duit{year}', { encoding: 'Windows-1252' } ],
|
19
|
+
'at' => ['tableso/oost{year}', { encoding: 'Windows-1252' } ],
|
20
|
+
'br' => [
|
21
|
+
->(season) {
|
22
|
+
## note: special slug/case for year/season 2000
|
23
|
+
## see rsssf.org/tablesb/brazchamp.html
|
24
|
+
if season == Season('2000')
|
25
|
+
'tablesb/braz-joao{year}' ## use braz-joao00 - why? why not?
|
26
|
+
else
|
27
|
+
'tablesb/braz{year}'
|
28
|
+
end
|
29
|
+
}, { encoding: 'Windows-1252' } ],
|
30
|
+
}
|
31
|
+
|
32
|
+
|
33
|
+
BASE_URL = "https://rsssf.org"
|
34
|
+
|
35
|
+
|
36
|
+
def self.table_url( code, season: )
|
37
|
+
url, _ = table_url_and_encoding( code, season: season )
|
38
|
+
url
|
39
|
+
end
|
40
|
+
|
41
|
+
def self.table_url_and_encoding( code, season: )
|
42
|
+
season = Season( season )
|
43
|
+
|
44
|
+
table = TABLE[ code.downcase ]
|
45
|
+
tmpl = table[0]
|
46
|
+
tmpl = tmpl.call( season ) if tmpl.is_a?(Proc) ## check for proc
|
47
|
+
|
48
|
+
opts = table[1] || {}
|
49
|
+
encoding = opts[:encoding] || 'UTF-8'
|
50
|
+
|
51
|
+
|
52
|
+
slug = if season.end_year < 2010 ## cut off all digits (only keep last two)s
|
53
|
+
## convert end_year to string with leading zero
|
54
|
+
'%02d' % (season.end_year % 100) ## e.g. 00 / 01 / 99 / 98 / 11 / etc.
|
55
|
+
else
|
56
|
+
'%4d' % season.end_year
|
57
|
+
end
|
58
|
+
|
59
|
+
tmpl = tmpl.sub( '{year}', slug )
|
60
|
+
url = "#{BASE_URL}/#{tmpl}.html"
|
61
|
+
|
62
|
+
[url, encoding]
|
63
|
+
end
|
64
|
+
|
65
|
+
|
66
|
+
def self.download_table( code, season: )
|
67
|
+
url, encoding = table_url_and_encoding( code, season: season )
|
68
|
+
|
69
|
+
download_page( url, encoding: encoding )
|
70
|
+
end
|
71
|
+
|
72
|
+
|
73
|
+
def self.download_page( url, encoding: )
|
74
|
+
|
75
|
+
## note: assume plain 7-bit ascii for now
|
76
|
+
## -- assume rsssf uses ISO_8859_15 (updated version of ISO_8859_1)
|
77
|
+
###-- does NOT use utf-8 character encoding!!!
|
78
|
+
response = Webget.page( url, encoding: encoding ) ## fetch (and cache) html page (via HTTP GET)
|
79
|
+
|
80
|
+
## note: exit on get / fetch error - do NOT continue for now - why? why not?
|
81
|
+
exit 1 if response.status.nok? ## e.g. HTTP status code != 200
|
82
|
+
|
83
|
+
|
84
|
+
puts "html:"
|
85
|
+
html = response.text( encoding: encoding )
|
86
|
+
pp html[0..400]
|
87
|
+
html
|
88
|
+
end
|
89
|
+
end # module Rsssf
|
90
|
+
|
91
|
+
|
92
|
+
|
93
|
+
__END__
|
94
|
+
|
95
|
+
1998-99: tablesd/duit99.html
|
96
|
+
1999-00: tablesd/duit00.html ## use 1999-2000 - why?? why not??
|
97
|
+
2000-01: tablesd/duit01.html
|
98
|
+
2001-02: tablesd/duit02.html
|
99
|
+
2002-03: tablesd/duit03.html
|
100
|
+
2003-04: tablesd/duit04.html
|
101
|
+
2004-05: tablesd/duit05.html
|
102
|
+
2005-06: tablesd/duit06.html
|
103
|
+
2006-07: tablesd/duit07.html
|
104
|
+
2007-08: tablesd/duit08.html
|
105
|
+
2008-09: tablesd/duit09.html
|
106
|
+
2009-10: tablesd/duit2010.html
|
107
|
+
2010-11: tablesd/duit2011.html
|
108
|
+
2011-12: tablesd/duit2012.html
|
109
|
+
2012-13: tablesd/duit2013.html
|
110
|
+
2013-14: tablesd/duit2014.html
|
111
|
+
2014-15: tablesd/duit2015.html
|
112
|
+
|
113
|
+
|
114
|
+
2010-11: tableso/oost2011.html
|
115
|
+
2011-12: tableso/oost2012.html
|
116
|
+
2012-13: tableso/oost2013.html
|
117
|
+
2013-14: tableso/oost2014.html
|
118
|
+
2014-15: tableso/oost2015.html
|
119
|
+
2015-16: tableso/oost2016.html
|
120
|
+
|
121
|
+
2011: tablesb/braz2011.html !! Windows-1252
|
122
|
+
2012: tablesb/braz2012.html !! Windows-1252
|
123
|
+
2013: tablesb/braz2013.html !! Windows-1252
|
124
|
+
2014: tablesb/braz2014.html !! Windows-1252
|
125
|
+
2015: tablesb/braz2015.html !! Windows-1252
|
126
|
+
2016: tablesb/braz2016.html !! Windows-1252
|
127
|
+
2017: tablesb/braz2017.html !! Windows-1252
|
128
|
+
2018: tablesb/braz2018.html !! Windows-1252
|
129
|
+
2019: tablesb/braz2019.html !! Windows-1252
|
130
|
+
2020: tablesb/braz2020.html !! Windows-1252 ## 2020/21 - extended for corona
|
131
|
+
2021: tablesb/braz2021.html !! Windows-1252
|
132
|
+
2022: tablesb/braz2022.html !! Windows-1252
|
133
|
+
2023: tablesb/braz2023.html !! Windows-1252
|
134
|
+
2024: tablesb/braz2024.html !! Windows-1252
|
135
|
+
|
136
|
+
2010-11: tablese/eng2011.html !! Windows-1252
|
137
|
+
2011-12: tablese/eng2012.html !! Windows-1252
|
138
|
+
2012-13: tablese/eng2013.html !! Windows-1252
|
139
|
+
2013-14: tablese/eng2014.html !! Windows-1252
|
140
|
+
2014-15: tablese/eng2015.html !! Windows-1252
|
141
|
+
2015-16: tablese/eng2016.html !! Windows-1252
|
142
|
+
2016-17: tablese/eng2017.html !! Windows-1252
|
143
|
+
2017-18: tablese/eng2018.html !! Windows-1252
|
144
|
+
2018-19: tablese/eng2019.html !! Windows-1252
|
145
|
+
2019-20: tablese/eng2020.html !! Windows-1252
|
146
|
+
2020-21: tablese/eng2021.html !! Windows-1252
|
147
|
+
2021-22: tablese/eng2022.html !! Windows-1252
|
148
|
+
2022-23: tablese/eng2023.html !! Windows-1252
|
149
|
+
2023-24: tablese/eng2024.html !! Windows-1252
|
150
|
+
|
151
|
+
|
data/lib/rsssf/page.rb
CHANGED
@@ -1,13 +1,11 @@
|
|
1
|
-
# encoding: utf-8
|
2
1
|
|
3
2
|
|
4
3
|
module Rsssf
|
4
|
+
|
5
5
|
|
6
6
|
PageStat = Struct.new(
|
7
|
-
:source, ## e.g.
|
8
|
-
:basename, ## e.g. duit89 -- note: filename w/o extension (and path)
|
7
|
+
:source, ## e.g. https://rsssf.org/tabled/duit89.html
|
9
8
|
:year, ## e.g. 1989 -- note: always four digits
|
10
|
-
:season, ## e.g. 1990-91 -- note: always a string (NOT a number)
|
11
9
|
:authors,
|
12
10
|
:last_updated,
|
13
11
|
:line_count, ## todo: rename to (just) lines - why? why not?
|
@@ -27,24 +25,41 @@ module Rsssf
|
|
27
25
|
class Page
|
28
26
|
|
29
27
|
include Utils ## e.g. year_from_name, etc.
|
28
|
+
|
29
|
+
def self.read_cache( url ) ### use read_cache /web/html or such - why? why not?
|
30
|
+
html = Webcache.read( url )
|
31
|
+
|
32
|
+
puts "html:"
|
33
|
+
pp html[0..400]
|
34
|
+
|
35
|
+
txt = PageConverter.convert( html, url: url )
|
36
|
+
txt
|
30
37
|
|
31
|
-
|
32
|
-
txt = PageFetcher.new.fetch( src )
|
33
|
-
self.from_string( txt )
|
38
|
+
new( txt )
|
34
39
|
end
|
35
40
|
|
36
41
|
|
37
|
-
def self.
|
38
|
-
|
39
|
-
|
42
|
+
def self.read_txt( path ) ## use read_txt
|
43
|
+
# note: always assume sources (already) converted from html to txt!!!!
|
44
|
+
txt = read_text( path )
|
45
|
+
new( txt )
|
40
46
|
end
|
41
47
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
48
|
+
|
49
|
+
|
50
|
+
### use text alias too (for txt) - why? why not?
|
51
|
+
attr_accessor :txt
|
52
|
+
|
53
|
+
## quick hack? used for auto-patch machinery
|
54
|
+
attr_accessor :patch
|
55
|
+
attr_accessor :url ### source url
|
56
|
+
|
57
|
+
|
46
58
|
def initialize( txt )
|
47
59
|
@txt = txt
|
60
|
+
|
61
|
+
@patch = nil
|
62
|
+
@url = nil
|
48
63
|
end
|
49
64
|
|
50
65
|
|
@@ -61,17 +76,20 @@ CUP_ROUND_REGEX = /\b(
|
|
61
76
|
Final
|
62
77
|
)\b/ix
|
63
78
|
|
64
|
-
|
79
|
+
|
80
|
+
|
81
|
+
## make header required - why? why not?
|
82
|
+
def find_schedule( header: nil,
|
83
|
+
cup: false ) ## change to build_schedule - why? why not???
|
65
84
|
|
66
85
|
## find match schedule/fixtures in multi-league doc
|
67
|
-
new_txt =
|
86
|
+
new_txt = String.new
|
68
87
|
|
69
88
|
## note: keep track of statistics
|
70
89
|
## e.g. number of rounds found
|
71
90
|
|
72
91
|
round_count = 0
|
73
92
|
|
74
|
-
header = opts[:header]
|
75
93
|
if header
|
76
94
|
league_header_found = false
|
77
95
|
|
@@ -89,7 +107,8 @@ def find_schedule( opts={} ) ## change to build_schedule - why? why not???
|
|
89
107
|
header_regex = /^
|
90
108
|
([#]{2,4}\s+(#{header_esc}))
|
91
109
|
|
|
92
|
-
(\*{2}(#{header_esc})\*{2})
|
110
|
+
(\*{2}(#{header_esc})) ## was: \*{2})
|
111
|
+
## do not include trailing ** for now (allows anchors e.g. §)
|
93
112
|
/ix
|
94
113
|
|
95
114
|
## todo:
|
@@ -105,7 +124,7 @@ def find_schedule( opts={} ) ## change to build_schedule - why? why not???
|
|
105
124
|
## pp header_regex
|
106
125
|
|
107
126
|
|
108
|
-
if
|
127
|
+
if cup
|
109
128
|
round_regex = CUP_ROUND_REGEX ## note: only allow final, quaterfinals, etc. if knockout cup
|
110
129
|
else
|
111
130
|
round_regex = LEAGUE_ROUND_REGEX
|
@@ -128,8 +147,10 @@ def find_schedule( opts={} ) ## change to build_schedule - why? why not???
|
|
128
147
|
if line =~ header_regex
|
129
148
|
puts "!!! bingo - found header >#{line}<"
|
130
149
|
league_header_found = true
|
131
|
-
|
132
|
-
|
150
|
+
|
151
|
+
## note - do NOT auto-add header/title !!!
|
152
|
+
# title = line.gsub( /[#*]/, '' ).strip ## quick hack: extract title from header
|
153
|
+
# new_txt << "## #{title}\n\n" # note: use header/stage title (regex group capture)
|
133
154
|
else
|
134
155
|
puts " searching for header >#{header}<; skipping line >#{line}<"
|
135
156
|
next
|
@@ -205,13 +226,24 @@ def find_schedule( opts={} ) ## change to build_schedule - why? why not???
|
|
205
226
|
end
|
206
227
|
end # each line
|
207
228
|
|
208
|
-
|
209
|
-
|
229
|
+
|
230
|
+
## quick hack?
|
231
|
+
### auto-apply patch if patch configured
|
232
|
+
if @patch && @patch.respond_to?(:on_patch)
|
233
|
+
url_path = URI.parse( url ).path
|
234
|
+
basename = File.basename( url_path, File.extname( url_path ))
|
235
|
+
year = year_from_name( basename )
|
236
|
+
new_txt = @patch.on_patch( new_txt, basename, year )
|
237
|
+
end
|
238
|
+
|
239
|
+
schedule = Schedule.new( new_txt )
|
240
|
+
## schedule.rounds = round_count
|
210
241
|
|
211
242
|
schedule
|
212
243
|
end # method find_schedule
|
213
244
|
|
214
245
|
|
246
|
+
|
215
247
|
def build_stat
|
216
248
|
source = nil
|
217
249
|
authors = nil
|
@@ -224,7 +256,7 @@ def build_stat
|
|
224
256
|
end
|
225
257
|
|
226
258
|
##
|
227
|
-
## fix/todo: move authors n last updated whitespace cleanup
|
259
|
+
## fix/todo: move authors n last updated whitespace cleanup - why? why not??
|
228
260
|
|
229
261
|
if @txt =~ /authors?:\s+(.+?)\s+last updated:\s+(\d{1,2} [a-z]{3,10} \d{4})/im
|
230
262
|
last_updated = $2.to_s # note: save a copy first (gets "reset" by next regex)
|
@@ -235,7 +267,15 @@ def build_stat
|
|
235
267
|
end
|
236
268
|
|
237
269
|
puts "*** !!! missing source" if source.nil?
|
238
|
-
puts "*** !!! missing authors
|
270
|
+
puts "*** !!! missing authors and last updated" if authors.nil? || last_updated.nil?
|
271
|
+
|
272
|
+
|
273
|
+
## get year from source (url)
|
274
|
+
url_path = URI.parse( source ).path
|
275
|
+
basename = File.basename( url_path, File.extname( url_path ) ) ## e.g. duit92.txt or duit92.html => duit92
|
276
|
+
puts " basename=>#{basename}<"
|
277
|
+
year = year_from_name( basename )
|
278
|
+
|
239
279
|
|
240
280
|
sections = []
|
241
281
|
|
@@ -248,26 +288,16 @@ def build_stat
|
|
248
288
|
## todo: add more patterns? how? why?
|
249
289
|
if line =~ /####\s+(.+)/
|
250
290
|
puts " found section >#{$1}<"
|
251
|
-
|
291
|
+
## remove anchors first e.g. ‹§sa› etc.
|
292
|
+
## check if anchors with underscore (_) or dash/hyphen (-) ???
|
293
|
+
sections << $1.sub( /‹§[a-z0-9]+›/, '' ).strip
|
252
294
|
end
|
253
295
|
end
|
254
296
|
|
255
297
|
|
256
|
-
# get path from url
|
257
|
-
url = URI.parse( source )
|
258
|
-
## pp url
|
259
|
-
## puts url.host
|
260
|
-
path = url.path
|
261
|
-
extname = File.extname( path )
|
262
|
-
basename = File.basename( path, extname ) ## e.g. duit92.txt or duit92.html => duit92
|
263
|
-
year = year_from_name( basename )
|
264
|
-
season = year_to_season( year )
|
265
|
-
|
266
298
|
rec = PageStat.new
|
267
299
|
rec.source = source # e.g. http://rsssf.org/tabled/duit89.html -- use source_url - why?? why not??
|
268
|
-
rec.
|
269
|
-
rec.year = year # e.g. 89 => 1989 -- note: always four digits
|
270
|
-
rec.season = season
|
300
|
+
rec.year = year
|
271
301
|
rec.authors = authors
|
272
302
|
rec.last_updated = last_updated
|
273
303
|
rec.line_count = line_count
|
@@ -279,17 +309,12 @@ end ## method build_stat
|
|
279
309
|
|
280
310
|
|
281
311
|
def save( path )
|
282
|
-
|
283
|
-
f.write @txt
|
284
|
-
end
|
312
|
+
write_text( path, @txt )
|
285
313
|
end ## method save
|
286
314
|
|
287
315
|
end ## class Page
|
288
316
|
end ## module Rsssf
|
289
317
|
|
290
318
|
|
291
|
-
## add (shortcut) alias
|
292
|
-
RsssfPageStat = Rsssf::PageStat
|
293
|
-
RsssfPage = Rsssf::Page
|
294
319
|
|
295
320
|
|
data/lib/rsssf/repo.rb
CHANGED
@@ -1,174 +1,115 @@
|
|
1
|
-
# encoding: utf-8
|
2
1
|
|
3
2
|
module Rsssf
|
4
3
|
|
5
|
-
## used by Repo#make_schedules
|
6
|
-
ScheduleConfig = Struct.new(
|
7
|
-
:name,
|
8
|
-
:opts_for_year, ## hash or proc ->(year){ Hash[...] }
|
9
|
-
:dir_for_year, ## proc ->(year){ 'path_here'} ## rename to path_for_year - why, why not??
|
10
|
-
:includes ## array of years to include e.g. [2011,2012] etc.
|
11
|
-
)
|
12
|
-
|
13
|
-
|
14
|
-
ScheduleStat = Struct.new(
|
15
|
-
:path, ## e.g. 2012-13 or archive/1980s/1984-85
|
16
|
-
:filename, ## e.g. 1-bundesliga.txt -- note: w/o path
|
17
|
-
:year, ## e.g. 2013 -- note: numeric (integer)
|
18
|
-
:season, ## e.g. 2012-13 -- note: is a string
|
19
|
-
:rounds ## e.g. 36 -- note: numeric (integer)
|
20
|
-
)
|
21
4
|
|
22
5
|
|
23
6
|
class Repo
|
24
|
-
|
25
|
-
include Filters ## e.g. sanitize, etc.
|
26
7
|
include Utils ## e.g. year_from_file, etc.
|
27
8
|
|
28
9
|
|
29
|
-
def initialize( path,
|
10
|
+
def initialize( path, title: 'Your Title Here',
|
11
|
+
patch: nil )
|
30
12
|
@repo_path = path
|
31
|
-
@
|
13
|
+
@title = title
|
14
|
+
@patch = patch
|
32
15
|
end
|
33
16
|
|
34
17
|
|
35
|
-
def
|
36
|
-
|
37
|
-
cfg = YAML.load_file( "#{@repo_path}/tables/config.yml")
|
38
|
-
pp cfg
|
18
|
+
def root() @repo_path; end ## use/rename to path - why? why not?
|
19
|
+
alias_method :root_dir, :root
|
39
20
|
|
40
|
-
dl_base = 'http://rsssf.com'
|
41
21
|
|
42
|
-
|
43
|
-
|
44
|
-
|
22
|
+
## for now use single country repos - why? why not?
|
23
|
+
## add support for all-in-one repos
|
24
|
+
def prepare_pages( code, seasons )
|
25
|
+
seasons.each do |season|
|
26
|
+
url = Rsssf.table_url( code, season: season )
|
45
27
|
|
46
|
-
##
|
47
|
-
|
48
|
-
|
28
|
+
## check if not in cache
|
29
|
+
unless Webcache.cached?( url )
|
30
|
+
## download - if not cached
|
31
|
+
Rsssf.download_table( code, season: season )
|
32
|
+
end
|
49
33
|
|
50
|
-
|
51
|
-
dest_path = "#{@repo_path}/tables/#{basename}.txt"
|
34
|
+
page = Page.read_cache( url )
|
52
35
|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
end # method fetch_pages
|
36
|
+
url_path = URI.parse( url ).path
|
37
|
+
puts " url = >#{url}<"
|
38
|
+
puts " url_path = >#{url_path}<"
|
57
39
|
|
40
|
+
basename = File.basename( url_path, File.extname( url_path ))
|
58
41
|
|
59
|
-
|
60
|
-
|
42
|
+
###
|
43
|
+
## check for on_prepare (apply patches)
|
44
|
+
if @patch && @patch.respond_to?(:on_prepare)
|
45
|
+
year = year_from_name( basename )
|
46
|
+
page.txt = @patch.on_prepare( page.txt, basename, year )
|
47
|
+
end
|
61
48
|
|
62
|
-
files = Dir[ "#{@repo_path}/tables/*.txt" ]
|
63
|
-
files.each do |file|
|
64
|
-
page = Page.from_file( file )
|
65
|
-
stats << page.build_stat
|
66
|
-
end
|
67
49
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
end
|
50
|
+
path = "#{@repo_path}/tables/#{basename}.txt"
|
51
|
+
page.save( path )
|
52
|
+
end
|
53
|
+
end # method prepare_pages
|
72
54
|
|
73
55
|
|
74
|
-
def
|
75
|
-
|
76
|
-
|
77
|
-
|
56
|
+
def each_page( code, seasons, &blk ) ## use each table or such - why? why not?
|
57
|
+
seasons.each do |season|
|
58
|
+
url = Rsssf.table_url( code, season: season )
|
59
|
+
url_path = URI.parse( url ).path
|
60
|
+
puts " url = >#{url}<"
|
61
|
+
puts " url_path = >#{url_path}<"
|
62
|
+
basename = File.basename( url_path, File.extname( url_path ))
|
78
63
|
|
64
|
+
path = "#{@repo_path}/tables/#{basename}.txt"
|
65
|
+
page = Page.read_txt( path )
|
79
66
|
|
67
|
+
## add/pass along patcher if patcher
|
68
|
+
if @patch
|
69
|
+
page.patch = @patch
|
70
|
+
page.url = url
|
71
|
+
end
|
80
72
|
|
81
|
-
|
82
|
-
|
83
|
-
patch_dir( "#{@repo_path}/tables" ) do |txt, name, year|
|
84
|
-
puts "patching #{year} (#{name}) (#{@repo_path})..."
|
85
|
-
patcher.patch( txt, name, year ) ## note: must be last (that is, must return (patcher) t(e)xt)
|
73
|
+
season = Season( season )
|
74
|
+
blk.call( season, page )
|
86
75
|
end
|
87
|
-
end ## method patch_pages
|
88
|
-
|
89
|
-
|
90
|
-
def sanitize_pages
|
91
|
-
## for debugging/testing lets you (re)run sanitize (alreay incl. in html2txt filter by default)
|
92
|
-
sanitize_dir( "#{@repo_path}/tables" )
|
93
76
|
end
|
94
77
|
|
95
78
|
|
79
|
+
def make_pages_summary
|
80
|
+
files = Dir.glob( "#{@repo_path}/tables/*.txt" )
|
81
|
+
report = PageReport.build( files, title: @title ) ## pass in title etc.
|
96
82
|
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
stats = []
|
101
|
-
|
102
|
-
files = Dir[ "#{@repo_path}/tables/*.txt" ]
|
103
|
-
files.each do |file|
|
104
|
-
|
105
|
-
## todo/check/fix:
|
106
|
-
## use source: prop in rsssf page - why? why not???
|
107
|
-
## move year/season/basename into page ???
|
108
|
-
#
|
109
|
-
# assume every rsssf page has at least:
|
110
|
-
## - basename e.g. duit2014
|
111
|
-
## - year e.g. 2014 (numeric)
|
112
|
-
## - season (derived from config lookup???) - string e.g. 2014-15 or 2014 etc.
|
113
|
-
extname = File.extname( file )
|
114
|
-
basename = File.basename( file, extname )
|
115
|
-
year = year_from_name( basename )
|
116
|
-
season = year_to_season( year )
|
117
|
-
|
118
|
-
if cfg.includes && cfg.includes.include?( year ) == false
|
119
|
-
puts " skipping #{basename}; not listed in includes"
|
120
|
-
next
|
121
|
-
end
|
122
|
-
|
123
|
-
|
124
|
-
puts " reading >#{basename}<"
|
83
|
+
### save report as README.md in tables/ folder in repo
|
84
|
+
report.save( "#{@repo_path}/tables/README.md" )
|
85
|
+
end # method make_pages_summary
|
125
86
|
|
126
|
-
page = Page.from_file( file ) # note: always assume sources (already) converted to utf-8
|
127
87
|
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
if cfg.dir_for_year.nil?
|
141
|
-
## use default setting, that is, archive for dir (e.g. archive/1980s/1985-86 etc.)
|
142
|
-
dir_for_year = archive_dir_for_year( year )
|
143
|
-
else
|
144
|
-
## assume it's a proc/lambda
|
145
|
-
dir_for_year = cfg.dir_for_year.call( year )
|
146
|
-
end
|
88
|
+
def make_schedules_summary
|
89
|
+
## find all match datafiles
|
90
|
+
args = [@repo_path]
|
91
|
+
files = SportDb::Parser::Opts.expand_args( args )
|
92
|
+
pp files
|
93
|
+
|
94
|
+
report = ScheduleReport.build( files, title: @title,
|
95
|
+
patch: @patch ) ## pass in title etc.
|
96
|
+
report.save( "#{@repo_path}/README.md" )
|
97
|
+
end
|
147
98
|
|
148
|
-
## -- cfg.name e.g. => 1-liga
|
149
99
|
|
150
|
-
dest_path = "#{@repo_path}/#{dir_for_year}/#{cfg.name}.txt"
|
151
|
-
puts " save to >#{dest_path}<"
|
152
|
-
FileUtils.mkdir_p( File.dirname( dest_path ))
|
153
|
-
schedule.save( dest_path )
|
154
100
|
|
155
|
-
rec = ScheduleStat.new
|
156
|
-
rec.path = dir_for_year
|
157
|
-
rec.filename = "#{cfg.name}.txt" ## change to basename - why?? why not??
|
158
|
-
rec.year = year
|
159
|
-
rec.season = season
|
160
|
-
rec.rounds = schedule.rounds
|
161
101
|
|
162
|
-
|
102
|
+
def patch_pages( patcher )
|
103
|
+
## lets you run/use custom (repo/country-specific) patches e.g. for adding/patching headings etc.
|
104
|
+
patch_dir( "#{@repo_path}/tables" ) do |txt, name, year|
|
105
|
+
puts "patching #{year} (#{name}) (#{@repo_path})..."
|
106
|
+
patcher.patch( txt, name, year ) ## note: must be last (that is, must return (patcher) t(e)xt)
|
163
107
|
end
|
164
|
-
|
165
|
-
stats # return stats for reporting
|
166
|
-
end # method make_schedules
|
108
|
+
end ## method patch_pages
|
167
109
|
|
168
110
|
|
169
|
-
|
170
|
-
|
171
|
-
files = Dir[ "#{root}/*.txt" ]
|
111
|
+
def patch_dir( root, &blk )
|
112
|
+
files = Dir.glob( "#{root}/**/*.txt" )
|
172
113
|
## pp files
|
173
114
|
|
174
115
|
## sort files by year (latest first)
|
@@ -180,41 +121,24 @@ def patch_dir( root )
|
|
180
121
|
end
|
181
122
|
|
182
123
|
files.each do |file|
|
183
|
-
txt =
|
124
|
+
txt = read_text( file ) ## note: assumes already converted to utf-8
|
184
125
|
|
185
126
|
basename = File.basename( file, '.txt' ) ## e.g. duit92.txt => duit92
|
186
127
|
year = year_from_name( basename )
|
187
128
|
|
188
|
-
new_txt =
|
189
|
-
## calculate hash to see if anything changed ?? why? why not??
|
129
|
+
new_txt = blk.call( txt, basename, year )
|
190
130
|
|
191
|
-
|
192
|
-
|
131
|
+
## calculate hash to see if anything changed ?? why? why not??
|
132
|
+
if txt != new_txt
|
133
|
+
puts " patching #{file}, text changed"
|
134
|
+
write_text( file, new_txt )
|
193
135
|
end
|
194
136
|
end # each file
|
195
137
|
end ## patch_dir
|
196
138
|
|
197
|
-
def sanitize_dir( root )
|
198
|
-
files = Dir[ "#{root}/*.txt" ]
|
199
|
-
|
200
|
-
files.each do |file|
|
201
|
-
txt = File.read_utf8( file ) ## note: assumes already converted to utf-8
|
202
|
-
|
203
|
-
new_txt = sanitize( txt )
|
204
139
|
|
205
|
-
File.open( file, 'w' ) do |f|
|
206
|
-
f.write new_txt
|
207
|
-
end
|
208
|
-
end # each file
|
209
|
-
end ## sanitize_dir
|
210
140
|
|
211
141
|
|
212
142
|
end ## class Repo
|
213
143
|
end ## module Rsssf
|
214
144
|
|
215
|
-
## add (shortcut) alias
|
216
|
-
RsssfRepo = Rsssf::Repo
|
217
|
-
RsssfScheduleConfig = Rsssf::ScheduleConfig
|
218
|
-
RsssfScheduleStat = Rsssf::ScheduleStat
|
219
|
-
|
220
|
-
|