rsssf 0.0.1 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,295 @@
1
+ # encoding: utf-8
2
+
3
+
4
+ module Rsssf
5
+
6
## summary record for a single rsssf page; gets filled in by Page#build_stat
PageStat = Struct.new(
  :source,         # page url e.g. http://rsssf.org/tabled/duit89.html
  :basename,       # filename w/o extension (and path) e.g. duit89
  :year,           # e.g. 1989 -- note: always four digits
  :season,         # e.g. 1990-91 -- note: always a string (NOT a number)
  :authors,
  :last_updated,
  :line_count,     # todo: rename to (just) lines - why? why not?
  :char_count,     # todo: rename to (just) char(acter)s - why? why not?
  :sections        # titles of the (####) section headings found in the page
)
16
+
17
+
18
+ ###
19
+ ## note:
20
+ # a rsssf page may contain:
21
+ # many leagues, cups
22
+ # - tables, schedules (rounds), notes, etc.
23
+ #
24
+ # a rsssf page MUST be in plain text (.txt) and utf-8 character encoding assumed
25
+ #
26
+
27
+ class Page
28
+
29
+ include Utils ## e.g. year_from_name, etc.
30
+
31
## builds a page from the text that PageFetcher#fetch returns for the url passed in (src)
def self.from_url( src )
  from_string( PageFetcher.new.fetch( src ) )
end
35
+
36
+
37
## builds a page from the text file at path
##  note: always assume sources (already) converted to utf-8
def self.from_file( path )
  from_string( File.read_utf8( path ) )
end
41
+
42
## builds a page from the text passed in (1:1, that is, as is)
def self.from_string( txt )
  new( txt )
end
45
+
46
## txt - the page text; kept as is in @txt
def initialize( txt )
  @txt = txt
end
49
+
50
+
51
## round header marker for leagues - the word "Round" only (case-insensitive)
LEAGUE_ROUND_REGEX = /\b
                        Round
                      \b/ix

## round header markers for (knockout) cups -
##   next to "Round" the stage names 1/8 Finals, 1/16 Finals, Quarterfinals,
##   Semifinals and Final get accepted too (case-insensitive)
CUP_ROUND_REGEX = /\b(
                      Round           |
                      1\/8\sFinals    |
                      1\/16\sFinals   |
                      Quarterfinals   |
                      Semifinals      |
                      Final
                    )\b/ix
63
+
64
## extracts the match schedule (rounds incl. matches) of one league or cup
##  from the page text (@txt) and returns it as a Schedule
##
##  opts:
##    :header - title of the league/cup section to look for;
##                the section must start with a heading e.g. #### Premier League
##                or a bold marker e.g. **FA Cup** at the beginning of a line;
##                if missing a single league page is assumed (no header search)
##    :cup    - if truthy use CUP_ROUND_REGEX (Round, Quarterfinals, Final, etc.)
##                for round headers; otherwise use LEAGUE_ROUND_REGEX (Round only)
##
##  returns the schedule built with Schedule.from_string;
##    rounds gets set to the number of round headers found
def find_schedule( opts={} )    ## change to build_schedule - why? why not???

  ## find match schedule/fixtures in multi-league doc
  new_txt = ''

  ## note: keep track of statistics
  ##   e.g. number of rounds found
  round_count = 0

  header = opts[:header]
  if header
    league_header_found = false

    ## header:
    ##  - assumes heading 2 to 4 e.g. #### Premier League or
    ##  - bold e.g. **FA Cup** for now
    ##  note: markers must start line (^)

    ## note:
    ##   header gsub spaces to \s otherwise no match in regex (using free-form x-flag)!!!
    header_esc = header.gsub( ' ', '\s' )

    ## note: a literal # followed by {2,4} starts a string interpolation in a regex literal;
    ##         use [#] hack
    ## note: both alternatives are wrapped in a (non-capturing) group;
    ##         otherwise ^ only binds to the first alternative
    ##         and the bold marker would match anywhere in the line
    header_regex = /^
                      (?:
                         ([#]{2,4}\s+(#{header_esc}))
                           |
                         (\*{2}(#{header_esc})\*{2})
                      )
                   /ix

    ## todo:
    ##   use new stage_regex e.g. **xxx** - why? why not?
    ##   allow more than one stage in one schedule (e.g. regular stage,playoff stage etc)
  else
    league_header_found = true   # default (no header; assume single league file)
    header_regex = /^---dummy---$/   ## non-matching dummy regex
  end

  ## puts "header_regex:"
  ## pp header_regex

  ## note: only allow final, quarterfinals, etc. if knockout cup
  round_regex = opts[:cup] ? CUP_ROUND_REGEX : LEAGUE_ROUND_REGEX

  ## stages
  first_round_header_found = false
  round_header_found       = false
  round_body_found         = false   ## allow round header followed by blank lines

  blank_found = false

  @txt.each_line do |line|

    if !league_header_found
      ## first find start of league header/section
      if line =~ header_regex
        puts "!!! bingo - found header >#{line}<"
        league_header_found = true
        title = line.gsub( /[#*]/, '' ).strip   ## quick hack: extract title from header
        new_txt << "## #{title}\n\n"            # note: use header/stage title
      else
        puts " searching for header >#{header}<; skipping line >#{line}<"
        next
      end
    elsif !first_round_header_found
      ## next look for first round (starting w/ Round)
      if line =~ round_regex
        puts "!!! bingo - found first round >#{line}<"
        round_count += 1
        first_round_header_found = true
        round_header_found       = true
        round_body_found         = false
        new_txt << line
      elsif line =~ /^=-=-=-=/
        puts "*** no rounds found; hit section marker (horizontal rule)"
        break
      elsif line =~ /^\*{2}[^*]+\*{2}/    ## e.g. **FA Cup**
        puts "*** no rounds found; hit section/stage header: #{line}"
        break
      else
        puts " searching for first round; skipping line >#{line}<"
        next    ## continue; searching
      end
    elsif round_header_found
      ## collect rounds;
      ##   assume text block until next blank line
      ##   new block must always start w/ round
      if line =~ /^\s*$/    ## blank line?
        if round_body_found
          round_header_found = false
          blank_found = true   ## keep track of blank (lines) - allow inside round block (can continue w/ date header/marker)
          new_txt << line
        else
          ## note: skip blanks following header
          next
        end
      else
        round_body_found = true
        new_txt << line    ## keep going until next blank line
      end
    else
      ## in-between round blocks (after the blank line closing a block)
      if line =~ /^\s*$/
        next    ## continue; skip extra blank line
      elsif line =~ round_regex
        puts "!!! bingo - found new round >#{line}<"
        round_count += 1
        round_header_found = true    # more rounds; continue
        round_body_found   = false
        blank_found        = false   # reset blank tracker
        new_txt << line
      elsif blank_found && line =~ /\[[a-z]{3} \d{1,2}\]/i   ## e.g. [Mar 13] or [May 5] with leading blank line; continue round
        puts "!!! bingo - continue round >#{line}<"
        round_header_found = true
        blank_found        = false   # reset blank tracker
        new_txt << line
      elsif blank_found && line =~ /First Legs|Second Legs/i
        puts "!!! bingo - continue round >#{line}<"
        round_header_found = true
        blank_found        = false   # reset blank tracker
        new_txt << line
      elsif line =~ /=-=-=-=/
        puts "!!! stop schedule; hit section marker (horizontal rule)"
        break
      elsif line =~ /^\*{2}[^*]+\*{2}/    ## e.g. **FA Cup**
        puts "!!! stop schedule; hit section/stage header: #{line}"
        break
      else
        blank_found = false
        puts "skipping line in schedule >#{line}<"
        next   # continue
      end
    end
  end  # each line

  schedule = Schedule.from_string( new_txt )
  schedule.rounds = round_count

  schedule
end  # method find_schedule
213
+
214
+
215
## collects the statistics / meta data of the page and returns a PageStat record:
##   - source (url), authors and last updated date get looked up in the page text
##   - basename, year and season get derived from the path of the source url
##       (with the helpers year_from_name and year_to_season)
##   - line and char(acter) counts plus the titles of all #### section headings
##
##  note: if no source reference is found basename, year and season stay nil
##          (before the method crashed with a nil source in URI.parse)
def build_stat
  source       = nil
  authors      = nil
  last_updated = nil

  ### find source ref
  if (m = @txt.match( /source: ([^ \n]+)/im ))
    source = m[1].to_s
    puts "source: >#{source}<"
  end

  ##
  ## fix/todo: move authors n last updated whitespace cleanup to sanitize - why? why not??

  if (m = @txt.match( /authors?:\s+(.+?)\s+last updated:\s+(\d{1,2} [a-z]{3,10} \d{4})/im ))
    last_updated = m[2].to_s
    authors = m[1].to_s.strip.gsub( /\s+/, ' ' )   # cleanup whitespace; squish-style
    authors = authors.gsub( /[ ]*,[ ]*/, ', ' )    # prettify commas - always single space after comma (no space before)
    puts "authors: >#{authors}<"
    puts "last updated: >#{last_updated}<"
  end

  puts "*** !!! missing source"  if source.nil?
  puts "*** !!! missing authors n last updated"  if authors.nil? || last_updated.nil?

  sections = []

  ## count lines
  line_count = 0
  @txt.each_line do |line|
    line_count += 1

    ### find sections
    ## todo: add more patterns? how? why?
    if line =~ /####\s+(.+)/
      puts " found section >#{$1}<"
      sections << $1.strip
    end
  end

  basename = nil
  year     = nil
  season   = nil

  ## note: only possible if a source ref is present (URI.parse raises on nil)
  if source
    # get path from url
    path     = URI.parse( source ).path
    extname  = File.extname( path )
    basename = File.basename( path, extname )   ## e.g. duit92.txt or duit92.html => duit92
    year     = year_from_name( basename )
    season   = year_to_season( year )
  end

  rec = PageStat.new
  rec.source       = source       # e.g. http://rsssf.org/tabled/duit89.html -- use source_url - why?? why not??
  rec.basename     = basename     # e.g. duit89
  rec.year         = year         # e.g. 89 => 1989 -- note: always four digits
  rec.season       = season
  rec.authors      = authors
  rec.last_updated = last_updated
  rec.line_count   = line_count
  rec.char_count   = @txt.size    ## note: String#size counts characters (use bytesize for bytes)
  rec.sections     = sections

  rec
end  ## method build_stat
279
+
280
+
281
## writes the page text (@txt) to the file at path (opened with mode w)
def save( path )
  File.open( path, 'w' ) { |file| file.write( @txt ) }
end  ## method save
286
+
287
+ end ## class Page
288
+ end ## module Rsssf
289
+
290
+
291
+ ## add (shortcut) aliases - lets you use RsssfPageStat and RsssfPage (w/o the Rsssf:: namespace)
292
+ RsssfPageStat = Rsssf::PageStat
293
+ RsssfPage = Rsssf::Page
294
+
295
+
@@ -0,0 +1,28 @@
1
+ # encoding: utf-8
2
+
3
+ module Rsssf
4
+
5
class Patcher

  ## season pattern (as regex source string) e.g. 2008/09
  ##   note: also supports 1999/2000
  ##   note: single quotes on purpose - backslashes are kept as is (e.g. '\d' => "\\d")
  SEASON = '\d{4}\/(\d{2}|\d{4})'

  ## replaces the first match of every regex in rxs (one after the other)
  ##   with a level-4 heading for title and returns the patched text;
  ##   every match found gets logged on a single line
  def patch_heading( txt, rxs, title )
    rxs.inject( txt ) do |text, rx|
      text.sub( rx ) do |found|
        dump = found.gsub( "\n", '$$')   ## change newlines to $$ for single-line outputs/dumps
        puts " found heading >#{dump}<"
        "\n\n#### #{title}\n\n"
      end
    end
  end

end # class Patcher
24
+ end ## module Rsssf
25
+
26
+ ## add (shortcut) alias - lets you use RsssfPatcher (w/o the Rsssf:: namespace)
27
+ RsssfPatcher = Rsssf::Patcher
28
+
@@ -0,0 +1,220 @@
1
+ # encoding: utf-8
2
+
3
+ module Rsssf
4
+
5
+ ## used by Repo#make_schedules
6
## configuration for building the schedules of one league/cup; used by Repo#make_schedules
ScheduleConfig = Struct.new(
  :name,            # name of the schedule file to write (w/o .txt extension)
  :opts_for_year,   # hash or proc ->(year){ Hash[...] }; gets passed on to Page#find_schedule
  :dir_for_year,    # proc ->(year){ 'path_here'} or nil for the default archive dir  ## rename to path_for_year - why, why not??
  :includes         # array of years to include e.g. [2011,2012] etc.; nil for no filter
)
12
+
13
+
14
## summary record for a single schedule written; gets filled in by Repo#make_schedules
ScheduleStat = Struct.new(
  :path,       # e.g. 2012-13 or archive/1980s/1984-85
  :filename,   # e.g. 1-bundesliga.txt -- note: w/o path
  :year,       # e.g. 2013 -- note: numeric (integer)
  :season,     # e.g. 2012-13 -- note: is a string
  :rounds      # e.g. 36 -- note: numeric (integer)
)
21
+
22
+
23
+ class Repo
24
+
25
+ include Filters ## e.g. sanitize, etc.
26
+ include Utils ## e.g. year_from_file, etc.
27
+
28
+
29
## path - root dir of the repo (pages are expected in the tables/ folder inside)
## opts - options passed on to the reports (e.g. title etc.)
def initialize( path, opts )
  @repo_path, @opts = path, opts
end
33
+
34
+
35
## downloads all pages listed in tables/config.yml from rsssf.com
##   and saves them as text files (.txt) in the tables/ folder of the repo
def fetch_pages
  puts "fetch_pages:"
  cfg = YAML.load_file( "#{@repo_path}/tables/config.yml")
  pp cfg

  dl_base = 'http://rsssf.com'

  ## key   - season as string e.g. 2011-12 or 2011 etc. (not used for now)
  ## value - page path as string e.g. tablesd/duit2011.html
  cfg.each do |_season, page_path|
    ## note: assumes extension is .html
    #    e.g. tablesd/duit2011.html => duit2011
    basename = File.basename( page_path, '.html' )

    page = Page.from_url( "#{dl_base}/#{page_path}" )
    page.save( "#{@repo_path}/tables/#{basename}.txt" )
  end # each year
end  # method fetch_pages
57
+
58
+
59
## builds the stats for all pages (.txt) in the tables/ folder
##   and saves the page report as README.md in the tables/ folder in repo
def make_pages_summary
  stats = Dir[ "#{@repo_path}/tables/*.txt" ].map do |file|
    Page.from_file( file ).build_stat
  end

  PageReport.new( stats, @opts ).save( "#{@repo_path}/tables/README.md" )   ## pass in title etc.
end  # method make_pages_summary
72
+
73
+
74
## saves the schedule report as README.md in the root folder of the repo
##   note: requires stats to be passed in for now
def make_schedules_summary( stats )
  ScheduleReport.new( stats, @opts ).save( "#{@repo_path}/README.md" )   ## pass in title etc.
end  # method make_schedules_summary
78
+
79
+
80
+
81
## lets you run/use custom (repo/country-specific) patches
##   e.g. for adding/patching headings etc.;
##   patcher must respond to patch( txt, name, year ) and return the patched text
def patch_pages( patcher )
  tables_dir = "#{@repo_path}/tables"

  patch_dir( tables_dir ) do |txt, name, year|
    puts "patching #{year} (#{name}) (#{@repo_path})..."
    patcher.patch( txt, name, year )   ## note: must be last (that is, must return (patched) text)
  end
end  ## method patch_pages
88
+
89
+
90
## for debugging/testing lets you (re)run sanitize (already incl. in html2txt filter by default)
def sanitize_pages
  tables_dir = "#{@repo_path}/tables"
  sanitize_dir( tables_dir )
end
94
+
95
+
96
+
97
## builds the schedule for every page (.txt) in the tables/ folder using the
##   config (cfg) passed in and saves it to <dir_for_year>/<cfg.name>.txt in the repo
##
##  returns an array of ScheduleStat records (for reports e.g. README)
def make_schedules( cfg )
  stats = []

  Dir[ "#{@repo_path}/tables/*.txt" ].each do |file|

    ## todo/check/fix:
    ##   use source: prop in rsssf page - why? why not???
    ##   move year/season/basename into page ???
    #
    #  assume every rsssf page has at least:
    ##  - basename e.g. duit2014
    ##  - year     e.g. 2014 (numeric)
    ##  - season (derived from config lookup???) - string e.g. 2014-15 or 2014 etc.
    basename = File.basename( file, File.extname( file ) )
    year     = year_from_name( basename )
    season   = year_to_season( year )

    if cfg.includes && !cfg.includes.include?( year )
      puts " skipping #{basename}; not listed in includes"
      next
    end

    puts " reading >#{basename}<"

    page = Page.from_file( file )   # note: always assume sources (already) converted to utf-8

    opts = if cfg.opts_for_year.is_a?( Hash )
             cfg.opts_for_year                 ## just use as is 1:1 (constant/same for all years)
           else
             cfg.opts_for_year.call( year )    ## assume it's a proc/lambda (call to calculate)
           end
    pp opts

    schedule = page.find_schedule( opts )
    ## pp schedule

    dir_for_year = if cfg.dir_for_year.nil?
                     ## use default setting, that is, archive for dir (e.g. archive/1980s/1985-86 etc.)
                     archive_dir_for_year( year )
                   else
                     ## assume it's a proc/lambda
                     cfg.dir_for_year.call( year )
                   end

    ## -- cfg.name e.g. => 1-liga
    dest_path = "#{@repo_path}/#{dir_for_year}/#{cfg.name}.txt"
    puts " save to >#{dest_path}<"
    FileUtils.mkdir_p( File.dirname( dest_path ))
    schedule.save( dest_path )

    rec = ScheduleStat.new
    rec.path     = dir_for_year
    rec.filename = "#{cfg.name}.txt"   ## change to basename - why?? why not??
    rec.year     = year
    rec.season   = season
    rec.rounds   = schedule.rounds

    stats << rec
  end

  stats   # return stats for reporting
end  # method make_schedules
167
+
168
+
169
+ private
170
## yields text, basename and year for every text file (.txt) in root (latest year first)
##   and writes the text returned by the block back to the file
def patch_dir( root )
  ## sort files by year (latest first)
  files = Dir[ "#{root}/*.txt" ].sort do |left, right|
    lyear, ryear = year_from_file( left ), year_from_file( right )
    ryear <=> lyear
  end
  ## pp files

  files.each do |file|
    txt      = File.read_utf8( file )          ## note: assumes already converted to utf-8
    basename = File.basename( file, '.txt' )   ## e.g. duit92.txt => duit92
    year     = year_from_name( basename )

    new_txt = yield( txt, basename, year )
    ## calculate hash to see if anything changed ?? why? why not??

    File.open( file, 'w' ) { |f| f.write( new_txt ) }
  end # each file
end  ## patch_dir
196
+
197
## runs sanitize on every text file (.txt) in root and writes the result back to the file
def sanitize_dir( root )
  Dir[ "#{root}/*.txt" ].each do |file|
    txt     = File.read_utf8( file )   ## note: assumes already converted to utf-8
    new_txt = sanitize( txt )

    File.open( file, 'w' ) { |f| f.write( new_txt ) }
  end # each file
end  ## sanitize_dir
210
+
211
+
212
+ end ## class Repo
213
+ end ## module Rsssf
214
+
215
+ ## add (shortcut) aliases - lets you use RsssfRepo, RsssfScheduleConfig and RsssfScheduleStat (w/o the Rsssf:: namespace)
216
+ RsssfRepo = Rsssf::Repo
217
+ RsssfScheduleConfig = Rsssf::ScheduleConfig
218
+ RsssfScheduleStat = Rsssf::ScheduleStat
219
+
220
+