rsssf 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,295 @@
1
+ # encoding: utf-8
2
+
3
+
4
+ module Rsssf
5
+
6
##
# Summary record for one rsssf page (filled in by Page#build_stat).
#
#   source       - page url,  e.g. http://rsssf.org/tabled/duit89.html
#   basename     - filename w/o extension (and path), e.g. duit89
#   year         - e.g. 1989  -- note: always four digits
#   season       - e.g. 1990-91  -- note: always a string (NOT a number)
#   authors      - author names as found in the page
#   last_updated - last updated date as found in the page
#   line_count   - number of lines in the page text
#   char_count   - number of characters in the page text
#   sections     - titles of the sections (#### headings) found
PageStat = Struct.new(
  :source, :basename,
  :year, :season,
  :authors, :last_updated,
  :line_count, :char_count,
  :sections
)
16
+
17
+
18
+ ###
19
+ ## note:
20
+ # a rsssf page may contain:
21
+ # many leagues, cups
22
+ # - tables, schedules (rounds), notes, etc.
23
+ #
24
+ # a rsssf page MUST be in plain text (.txt) and utf-8 character encoding assumed
25
+ #
26
+
27
+ class Page
28
+
29
+ include Utils ## e.g. year_from_name, etc.
30
+
31
## build a page from the text that PageFetcher returns for the url passed in
def self.from_url( src )
  from_string( PageFetcher.new.fetch( src ) )
end
35
+
36
+
37
## build a page from a text file
##   note: always assume sources (already) converted to utf-8
def self.from_file( path )
  from_string( File.read_utf8( path ) )
end
41
+
42
## build a page from text (already) in memory
def self.from_string( txt )
  new( txt )
end
45
+
46
## keep the page text; all other methods read from @txt
def initialize( page_txt )
  @txt = page_txt
end
49
+
50
+
51
## round header in a league schedule - the (whole) word "Round", any case
LEAGUE_ROUND_REGEX = %r{\b Round \b}ix

## round header in a knockout cup schedule - note: the round name gets captured in group 1
##   uses %r{} so that the slash in 1/8 and 1/16 needs no escape
CUP_ROUND_REGEX = %r{\b(
                       Round          |
                       1/8\sFinals    |
                       1/16\sFinals   |
                       Quarterfinals  |
                       Semifinals     |
                       Final
                     )\b}ix
63
+
64
##
# Extract the match schedule (round headers plus the text blocks following
# them) for one league or cup from the page text.
#
# opts (all optional):
#   :header - title of the league/cup section to look for; matched
#             case-insensitively as a heading (## to #### Title) or as
#             bold text (**Title**), in both cases at the start of a line.
#             Without a header the page is treated as a single-league file
#             and the search for rounds starts with the first line.
#   :cup    - if truthy use CUP_ROUND_REGEX to spot round headers
#             (Round, 1/8 Finals, Final, etc.); otherwise LEAGUE_ROUND_REGEX.
#
# Returns the object built by Schedule.from_string from the collected text,
# with rounds set to the number of round headers found.
#
# Scanning stops at a section marker (=-=-=-=) or at the next bold
# section/stage header (e.g. **FA Cup**).
def find_schedule( opts={} )    ## change to build_schedule - why? why not???

  ## find match schedule/fixtures in multi-league doc
  new_txt = ''

  ## note: keep track of statistics
  ##   e.g. number of rounds found
  round_count = 0

  header = opts[:header]
  if header
    league_header_found = false

    ## header:
    ##   - assumes heading 2 to 4  e.g. #### Premier League  or
    ##   - bold                    e.g. **FA Cup**  for now
    ##   note: markers must start line (^)

    ## note:
    ##   the header is plain text (NOT a regex) - escape every part so that
    ##   . ( ) + etc. in a title match themselves; spaces get turned into \s
    ##   otherwise no match in regex (using free-form x-flag)!!!
    header_esc = header.split( / /, -1 ).map { |part| Regexp.escape( part ) }.join( '\s' )

    ## note: a plain # followed by {2,4} cannot be used here - in a regex literal
    ##    a hash followed by a curly brace starts a string interpolation
    ##    (and a bare # starts a comment with the x-flag); use [#] instead
    ## note: the (?: ) group makes the line start anchor (^) apply to both
    ##    alternatives (without it only the heading variant is anchored)
    header_regex = /^(?:
                        ([#]{2,4}\s+(#{header_esc}))
                          |
                        (\*{2}(#{header_esc})\*{2})
                      )/ix

    ## todo:
    ##   use new stage_regex e.g. **xxx** - why? why not?
    ##   allow more than one stage in one schedule (e.g. regular stage,playoff stage etc)
  else
    league_header_found = true    # default (no header; assume single league file)
    header_regex = /^---dummy---$/   ## non-matching dummy regex
  end

  ## note: only allow final, quarterfinals, etc. if knockout cup
  round_regex = opts[:cup] ? CUP_ROUND_REGEX : LEAGUE_ROUND_REGEX

  ## stages
  first_round_header_found = false
  round_header_found       = false
  round_body_found         = false   ## allow round header followed by blank lines
  blank_found              = false

  @txt.each_line do |line|

    if !league_header_found
      ## first find start of league header/section
      if line =~ header_regex
        puts "!!! bingo - found header >#{line}<"
        league_header_found = true
        title = line.gsub( /[#*]/, '' ).strip    ## quick hack: extract title from header
        new_txt << "## #{title}\n\n"
      else
        puts " searching for header >#{header}<; skipping line >#{line}<"
      end
    elsif !first_round_header_found
      ## next look for first round (starting w/ Round)
      if line =~ round_regex
        puts "!!! bingo - found first round >#{line}<"
        round_count += 1
        first_round_header_found = true
        round_header_found       = true
        round_body_found         = false
        new_txt << line
      elsif line =~ /^=-=-=-=/
        puts "*** no rounds found; hit section marker (horizontal rule)"
        break
      elsif line =~ /^\*{2}[^*]+\*{2}/    ## e.g. **FA Cup**
        puts "*** no rounds found; hit section/stage header: #{line}"
        break
      else
        puts " searching for first round; skipping line >#{line}<"
      end
    elsif round_header_found
      ## collect rounds;
      ##   assume text block until next blank line
      ##   new block must always start w/ round
      if line =~ /^\s*$/     ## blank line?
        if round_body_found
          round_header_found = false
          blank_found = true    ## keep track of blank (lines) - allow inside round block (can continue w/ date header/marker)
          new_txt << line
        end
        ## note: blanks directly following the round header get skipped
      else
        round_body_found = true
        new_txt << line    ## keep going until next blank line
      end
    else
      ## in-between round blocks; decide if the schedule continues
      if line =~ /^\s*$/
        next    ## skip (more) blank lines
      elsif line =~ round_regex
        puts "!!! bingo - found new round >#{line}<"
        round_count += 1
        round_header_found = true    # more rounds; continue
        round_body_found   = false
        blank_found        = false   # reset blank tracker
        new_txt << line
      elsif blank_found && line =~ /\[[a-z]{3} \d{1,2}\]/i    ## e.g. [Mar 13] or [May 5] with leading blank line; continue round
        puts "!!! bingo - continue round >#{line}<"
        round_header_found = true
        blank_found        = false   # reset blank tracker
        new_txt << line
      elsif blank_found && line =~ /First Legs|Second Legs/i
        puts "!!! bingo - continue round >#{line}<"
        round_header_found = true
        blank_found        = false   # reset blank tracker
        new_txt << line
      elsif line =~ /=-=-=-=/
        puts "!!! stop schedule; hit section marker (horizontal rule)"
        break
      elsif line =~ /^\*{2}[^*]+\*{2}/    ## e.g. **FA Cup**
        puts "!!! stop schedule; hit section/stage header: #{line}"
        break
      else
        blank_found = false
        puts "skipping line in schedule >#{line}<"
      end
    end
  end # each line

  schedule = Schedule.from_string( new_txt )
  schedule.rounds = round_count

  schedule
end # method find_schedule
213
+
214
+
215
##
# Collect summary statistics for the page text.
#
# Scans the text for
#   - a  "source: <url>"  reference,
#   - an "author(s): ...  last updated: <day> <month> <year>"  block and
#   - section headings (#### Title)
# and counts lines and characters.
#
# Returns a PageStat. If no source reference is found a warning gets
# printed and source, basename, year and season are left nil (basename,
# year and season are all derived from the source url).
def build_stat
  source       = nil
  authors      = nil
  last_updated = nil

  ### find source ref
  if (m = /source: ([^ \n]+)/im.match( @txt ))
    source = m[1]
    puts "source: >#{source}<"
  end

  ##
  ## fix/todo: move authors n last updated whitespace cleanup to sanitize - why? why not??
  if (m = /authors?:\s+(.+?)\s+last updated:\s+(\d{1,2} [a-z]{3,10} \d{4})/im.match( @txt ))
    last_updated = m[2]
    authors = m[1].strip.gsub( /\s+/, ' ' )          # cleanup whitespace; squish-style
    authors = authors.gsub( /[ ]*,[ ]*/, ', ' )      # prettify commas - always single space after comma (no space before)
    puts "authors: >#{authors}<"
    puts "last updated: >#{last_updated}<"
  end

  puts "*** !!! missing source" if source.nil?
  puts "*** !!! missing authors n last updated" if authors.nil? || last_updated.nil?

  sections   = []
  line_count = 0
  @txt.each_line do |line|
    line_count += 1

    ### find sections
    ## todo: add more patterns? how? why?
    if (m = /####\s+(.+)/.match( line ))
      puts " found section >#{m[1]}<"
      sections << m[1].strip
    end
  end

  ## basename, year and season all get derived from the path of the source url;
  ##  note: skip if no source found (URI.parse raises an error if passed nil)
  basename = nil
  year     = nil
  season   = nil
  if source
    path     = URI.parse( source ).path
    basename = File.basename( path, File.extname( path ) )   ## e.g. duit92.txt or duit92.html => duit92
    year     = year_from_name( basename )
    season   = year_to_season( year )
  end

  rec = PageStat.new
  rec.source       = source        # e.g. http://rsssf.org/tabled/duit89.html -- use source_url - why?? why not??
  rec.basename     = basename      # e.g. duit89
  rec.year         = year          # e.g. 89 => 1989 -- note: always four digits
  rec.season       = season
  rec.authors      = authors
  rec.last_updated = last_updated
  rec.line_count   = line_count
  rec.char_count   = @txt.size     ## note: String#size counts characters (not bytes)
  rec.sections     = sections

  rec
end ## method build_stat
279
+
280
+
281
## write the page text to path (note: an existing file gets overwritten)
def save( path )
  File.open( path, 'w' ) { |out| out.write( @txt ) }
end ## method save
286
+
287
+ end ## class Page
288
+ end ## module Rsssf
289
+
290
+
291
+ ## add (shortcut) alias
292
## top-level names for use without the Rsssf:: namespace
RsssfPageStat = Rsssf::PageStat
RsssfPage     = Rsssf::Page
294
+
295
+
@@ -0,0 +1,28 @@
1
+ # encoding: utf-8
2
+
3
+ module Rsssf
4
+
5
##
# Helpers for patching page text.
#   note: Repo#patch_pages calls patch( txt, name, year ) on the patcher
#   passed in; this class itself only supplies the heading helper below.
class Patcher

  ## season pattern (regex source as string)
  ##   e.g. 2008/09
  ##   note: also support 1999/2000
  ##   note: use single quotes - backslashes stay as is (e.g. '\d' => "\\d")
  SEASON = '\d{4}\/(\d{2}|\d{4})'

  ##
  # For every regex in rxs (in order) replace the first match in the text
  # with a level-4 heading (#### title) surrounded by blank lines.
  #
  # Returns the patched text (or the text as passed in if rxs is empty).
  def patch_heading( txt, rxs, title )
    rxs.inject( txt ) do |patched, rx|
      patched.sub( rx ) do |hit|
        ## change newlines to $$ for single-line outputs/dumps
        puts " found heading >#{hit.gsub( "\n", '$$' )}<"
        "\n\n#### #{title}\n\n"
      end
    end
  end

end # class Patcher
24
+ end ## module Rsssf
25
+
26
+ ## add (shortcut) alias
27
## top-level name for use without the Rsssf:: namespace
RsssfPatcher = Rsssf::Patcher
28
+
@@ -0,0 +1,220 @@
1
+ # encoding: utf-8
2
+
3
+ module Rsssf
4
+
5
+ ## used by Repo#make_schedules
6
##
# Settings for Repo#make_schedules:
#
#   name          - name (w/o extension) of the schedule file written, e.g. 1-liga
#   opts_for_year - hash or proc ->(year){ Hash[...] }; passed on to Page#find_schedule
#   dir_for_year  - proc ->(year){ 'path_here' }   ## rename to path_for_year - why, why not??
#   includes      - array of years to include e.g. [2011,2012] etc.
ScheduleConfig = Struct.new( :name, :opts_for_year, :dir_for_year, :includes )
12
+
13
+
14
##
# Summary record for one schedule written by Repo#make_schedules:
#
#   path     - e.g. 2012-13 or archive/1980s/1984-85
#   filename - e.g. 1-bundesliga.txt   -- note: w/o path
#   year     - e.g. 2013     -- note: numeric (integer)
#   season   - e.g. 2012-13  -- note: is a string
#   rounds   - e.g. 36       -- note: numeric (integer)
ScheduleStat = Struct.new( :path, :filename, :year, :season, :rounds )
21
+
22
+
23
+ class Repo
24
+
25
+ include Filters ## e.g. sanitize, etc.
26
+ include Utils ## e.g. year_from_file, etc.
27
+
28
+
29
## path - root folder of the repo; opts - passed on to the reports (title etc.)
def initialize( path, opts )
  @repo_path, @opts = path, opts
end
33
+
34
+
35
##
# Download all pages listed in tables/config.yml and save them
# as text files in the tables/ folder of the repo.
#
#   config entries map a season (e.g. 2011-12 - not used here)
#   to a page path (e.g. tablesd/duit2011.html)
def fetch_pages
  puts "fetch_pages:"
  cfg = YAML.load_file( "#{@repo_path}/tables/config.yml")
  pp cfg

  dl_base = 'http://rsssf.com'

  cfg.each do |_season, page_path|
    ## note: assumes extension is .html
    ##    e.g. tablesd/duit2011.html => duit2011
    basename = File.basename( page_path, '.html' )

    page = Page.from_url( "#{dl_base}/#{page_path}" )
    page.save( "#{@repo_path}/tables/#{basename}.txt" )
  end # each season
end # method fetch_pages
57
+
58
+
59
##
# Build the stats for all pages in the tables/ folder and
# save the report as README.md in the tables/ folder in the repo.
def make_pages_summary
  stats = Dir[ "#{@repo_path}/tables/*.txt" ].map do |file|
    Page.from_file( file ).build_stat
  end

  PageReport.new( stats, @opts ).save( "#{@repo_path}/tables/README.md" )   ## pass in title etc.
end # method make_pages_summary
72
+
73
+
74
## save the schedules report as README.md in the repo root
##   note: requires stats (e.g. as returned by make_schedules) to be passed in for now
def make_schedules_summary( stats )
  ScheduleReport.new( stats, @opts ).save( "#{@repo_path}/README.md" )   ## pass in title etc.
end # method make_schedules_summary
78
+
79
+
80
+
81
## run a custom (repo/country-specific) patcher e.g. for adding/patching headings etc.
##   on all pages in the tables/ folder;
##   every page gets replaced by the text returned by patcher.patch
def patch_pages( patcher )
  tables_dir = "#{@repo_path}/tables"
  patch_dir( tables_dir ) do |txt, name, year|
    puts "patching #{year} (#{name}) (#{@repo_path})..."
    ## note: must be last (that is, the block must return the patched text)
    patcher.patch( txt, name, year )
  end
end ## method patch_pages
88
+
89
+
90
## (re)run sanitize on all pages in the tables/ folder - for debugging/testing
##   (sanitize is already incl. in the html2txt filter by default)
def sanitize_pages
  tables_dir = "#{@repo_path}/tables"
  sanitize_dir( tables_dir )
end
94
+
95
+
96
+
97
##
# Extract a schedule from every page in the tables/ folder and save it
# as <dir for year>/<cfg.name>.txt in the repo.
#
# cfg - ScheduleConfig (or anything with the same readers):
#   name          - name of the schedule file (w/o extension) e.g. 1-liga
#   opts_for_year - options passed on to Page#find_schedule; a hash
#                   (used as is for all years), a proc/lambda called with
#                   the year, or nil (no options)
#   dir_for_year  - proc/lambda called with the year; if nil the default
#                   from archive_dir_for_year gets used
#   includes      - if set, pages for years not listed get skipped
#
# Returns an array of ScheduleStat records (one per schedule written; for
# reports e.g. README). Pages get processed in sorted file name order.
def make_schedules( cfg )
  stats = []

  ## note: sort for a stable order of the stats (no matter the ruby version or file system)
  Dir[ "#{@repo_path}/tables/*.txt" ].sort.each do |file|

    ## todo/check/fix:
    ##   use source: prop in rsssf page - why? why not???
    ##   move year/season/basename into page ???
    #
    #  assume every rsssf page has at least:
    ##  - basename  e.g. duit2014
    ##  - year      e.g. 2014 (numeric)
    ##  - season    (derived from config lookup???) - string e.g. 2014-15 or 2014 etc.
    basename = File.basename( file, File.extname( file ) )
    year     = year_from_name( basename )
    season   = year_to_season( year )

    if cfg.includes && !cfg.includes.include?( year )
      puts " skipping #{basename}; not listed in includes"
      next
    end

    puts " reading >#{basename}<"

    page = Page.from_file( file )    # note: always assume sources (already) converted to utf-8

    opts = case cfg.opts_for_year
           when nil  then {}                               ## not set; use find_schedule defaults
           when Hash then cfg.opts_for_year                ## just use as is 1:1 (constant/same for all years)
           else           cfg.opts_for_year.call( year )   ## assume it's a proc/lambda (call to calculate)
           end
    pp opts

    schedule = page.find_schedule( opts )

    dir_for_year = if cfg.dir_for_year.nil?
                     ## use default setting, that is, archive for dir (e.g. archive/1980s/1985-86 etc.)
                     archive_dir_for_year( year )
                   else
                     ## assume it's a proc/lambda
                     cfg.dir_for_year.call( year )
                   end

    ##  -- cfg.name e.g. => 1-liga
    dest_path = "#{@repo_path}/#{dir_for_year}/#{cfg.name}.txt"
    puts " save to >#{dest_path}<"
    FileUtils.mkdir_p( File.dirname( dest_path ))
    schedule.save( dest_path )

    rec = ScheduleStat.new
    rec.path     = dir_for_year
    rec.filename = "#{cfg.name}.txt"    ## change to basename - why?? why not??
    rec.year     = year
    rec.season   = season
    rec.rounds   = schedule.rounds

    stats << rec
  end

  stats   # return stats for reporting
end # method make_schedules
167
+
168
+
169
+ private
170
##
# Run every .txt file in root through the block passed in and write back
# the text returned by the block.
#
#   the block gets passed in the text, the basename (e.g. duit92) and the year;
#   files get processed sorted by year (latest first)
def patch_dir( root )
  latest_first = Dir[ "#{root}/*.txt" ].sort do |l, r|
    l_year, r_year = year_from_file( l ), year_from_file( r )
    r_year <=> l_year
  end

  latest_first.each do |file|
    txt      = File.read_utf8( file )          ## note: assumes already converted to utf-8
    basename = File.basename( file, '.txt' )   ## e.g. duit92.txt => duit92

    patched = yield( txt, basename, year_from_name( basename ) )
    ## calculate hash to see if anything changed ?? why? why not??

    File.open( file, 'w' ) { |out| out.write patched }
  end # each file
end ## patch_dir
196
+
197
## run every .txt file in root through sanitize and write back the result
def sanitize_dir( root )
  Dir[ "#{root}/*.txt" ].each do |file|
    cleaned = sanitize( File.read_utf8( file ) )   ## note: assumes already converted to utf-8
    File.open( file, 'w' ) { |out| out.write cleaned }
  end # each file
end ## sanitize_dir
210
+
211
+
212
+ end ## class Repo
213
+ end ## module Rsssf
214
+
215
+ ## add (shortcut) alias
216
## top-level names for use without the Rsssf:: namespace
RsssfRepo           = Rsssf::Repo
RsssfScheduleConfig = Rsssf::ScheduleConfig
RsssfScheduleStat   = Rsssf::ScheduleStat
219
+
220
+