rsssf 0.0.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/rsssf/page.rb ADDED
@@ -0,0 +1,320 @@
1
+
2
+
3
+ module Rsssf
4
+
5
+
6
## Summary record for a single rsssf page (see Page#build_stat).
PageStat = Struct.new(
  :source,        # source url e.g. https://rsssf.org/tabled/duit89.html
  :year,          # e.g. 1989 -- note: always four digits
  :authors,
  :last_updated,
  :line_count,    # todo: rename to (just) lines - why? why not?
  :char_count,    # todo: rename to (just) char(acter)s - why? why not?
  :sections
)
14
+
15
+
16
+ ###
17
+ ## note:
18
+ # a rsssf page may contain:
19
+ # many leagues, cups
20
+ # - tables, schedules (rounds), notes, etc.
21
+ #
22
+ # a rsssf page MUST be in plain text (.txt) and utf-8 character encoding assumed
23
+ #
24
+
25
+ class Page
26
+
27
+ include Utils ## e.g. year_from_name, etc.
28
+
29
##
# Builds a page from the cached html copy of url.
#
#   The html is read via Webcache.read and converted to plain text via
#   PageConverter.convert; the url is kept on the returned page
#   (used by find_schedule if a patch is set).
#
#   returns the new page
def self.read_cache( url )   ### use read_cache /web/html or such - why? why not?
  html = Webcache.read( url )

  ## debug output - dump the leading characters of the cached html only
  puts "html:"
  pp html[0..400]

  txt = PageConverter.convert( html, url: url )

  page = new( txt )
  page.url = url   ## remember the source url
  page
end
40
+
41
+
42
## Builds a page from the plain text file at path (read via read_text).
##   note: sources are always assumed to be (already) converted from html to txt!
def self.read_txt( path )
  new( read_text( path ) )
end
47
+
48
+
49
+
50
+ ### use text alias too (for txt) - why? why not?
51
+ attr_accessor :txt
52
+
53
+ ## quick hack? used for auto-patch machinery
54
+ attr_accessor :patch
55
+ attr_accessor :url ### source url
56
+
57
+
58
## Wraps the page text; patch and (source) url start out unset (nil).
def initialize( txt )
  @txt   = txt
  @patch = @url = nil
end
64
+
65
+
66
## round marker for leagues - the (whole) word Round; case ignored
LEAGUE_ROUND_REGEX = %r{\b Round \b}ix

## round markers for knockout cups; case ignored
##   note: free-form (x) mode - a literal space must be written as \s
CUP_ROUND_REGEX = %r{\b(
                       Round          |
                       1/8\sFinals    |
                       1/16\sFinals   |
                       Quarterfinals  |
                       Semifinals     |
                       Final
                     )\b}ix
78
+
79
+
80
+
81
+ ## make header required - why? why not?
82
##
# Collects the match schedule (round headers plus the text blocks that
# follow them) from the page text and wraps it in a Schedule.
#
#   header: optional title of the league/cup section to look for first;
#           matched (ignoring case) at the START of a line, either as a
#           heading with 2-4 hashes (e.g. #### Premier League) or as
#           bold text (e.g. **FA Cup - the trailing ** is not required,
#           allows anchors e.g. §).
#           without header the whole page is treated as a single league.
#           note: header is used as a regex fragment - only spaces get
#                 replaced by \s; regex meta characters are NOT escaped
#   cup:    if true use CUP_ROUND_REGEX (Round, Quarterfinals, Final, etc.)
#           for round markers; otherwise LEAGUE_ROUND_REGEX (Round only)
#
#   Scanning stops at a section marker (=-=-=-=) or at the next bold
#   section/stage header (e.g. **FA Cup**).
#
#   If a patch responding to on_patch is set, the collected text gets
#   passed through patch.on_patch( txt, basename, year ) first;
#   basename and year get derived from url (that is, url must be set).
#
#   returns a Schedule built from the collected text
def find_schedule( header: nil,
                   cup: false )    ## change to build_schedule - why? why not???

  ## find match schedule/fixtures in multi-league doc
  new_txt = String.new

  ## note: keep track of statistics
  ##         e.g. number of rounds found  (not reported for now)
  round_count = 0

  if header
    league_header_found = false

    ## note:
    ##   header gsub spaces to \s otherwise no match in regex (using free-form x-flag)!!!
    header_esc = header.gsub( ' ', '\s' )

    ## note: [#] gets used for the hash because a plain # would start
    ##         a comment in free-form (x) mode
    ##
    ## fix: group the alternatives - otherwise the line start anchor (^)
    ##        applies to the heading alternative only and
    ##        the bold alternative matches anywhere in the line
    header_regex = /^(?:
                       [#]{2,4}\s+(?:#{header_esc})
                         |
                       \*{2}(?:#{header_esc})
                     )/ix

    ## todo:
    ##   use new stage_regex e.g. **xxx** - why? why not?
    ##   allow more than one stage in one schedule (e.g. regular stage,playoff stage etc)
  else
    league_header_found = true    # default (no header; assume single league file)
    header_regex = /^---dummy---$/   ## non-matching dummy regex
  end

  ## note: only allow final, quarterfinals, etc. if knockout cup
  round_regex = cup ? CUP_ROUND_REGEX : LEAGUE_ROUND_REGEX

  ## stages
  first_round_header_found = false
  round_header_found       = false
  round_body_found         = false   ## allow round header followed by blank lines

  blank_found = false

  @txt.each_line do |line|

    if league_header_found == false
      ## first find start of league header/section
      if line =~ header_regex
        puts "!!! bingo - found header >#{line}<"
        league_header_found = true
        ## note - do NOT auto-add header/title !!!
      else
        puts " searching for header >#{header}<; skipping line >#{line}<"
        next
      end
    elsif first_round_header_found == false
      ## next look for first round (starting w/ Round)
      if line =~ round_regex
        puts "!!! bingo - found first round >#{line}<"
        round_count += 1
        first_round_header_found = true
        round_header_found       = true
        round_body_found         = false
        new_txt << line
      elsif line =~ /^=-=-=-=/
        puts "*** no rounds found; hit section marker (horizontal rule)"
        break
      elsif line =~ /^\*{2}[^*]+\*{2}/   ## e.g. **FA Cup**
        puts "*** no rounds found; hit section/stage header: #{line}"
        break
      else
        puts " searching for first round; skipping line >#{line}<"
        next   ## continue; searching
      end
    elsif round_header_found == true
      ## collect rounds;
      ##   assume text block until next blank line
      ##   new block must always start w/ round
      if line =~ /^\s*$/   ## blank line?
        if round_body_found
          round_header_found = false
          blank_found = true   ## keep track of blank (lines) - allow inside round block (can continue w/ date header/marker)
          new_txt << line
        else
          ## note: skip blanks following header
          next
        end
      else
        round_body_found = true
        new_txt << line   ## keep going until next blank line
      end
    else
      ## in-between round blocks (after the blank line closing a block)
      if line =~ /^\s*$/
        next   ## continue; skip extra blank line
      elsif line =~ round_regex
        puts "!!! bingo - found new round >#{line}<"
        round_count += 1
        round_header_found = true   # more rounds; continue
        round_body_found   = false
        blank_found        = false  # reset blank tracker
        new_txt << line
      elsif blank_found && line =~ /\[[a-z]{3} \d{1,2}\]/i   ## e.g. [Mar 13] or [May 5] with leading blank line; continue round
        puts "!!! bingo - continue round >#{line}<"
        round_header_found = true
        blank_found        = false  # reset blank tracker
        new_txt << line
      elsif blank_found && line =~ /First Legs|Second Legs/i
        puts "!!! bingo - continue round >#{line}<"
        round_header_found = true
        blank_found        = false  # reset blank tracker
        new_txt << line
      elsif line =~ /=-=-=-=/
        puts "!!! stop schedule; hit section marker (horizontal rule)"
        break
      elsif line =~ /^\*{2}[^*]+\*{2}/   ## e.g. **FA Cup**
        puts "!!! stop schedule; hit section/stage header: #{line}"
        break
      else
        blank_found = false
        puts "skipping line in schedule >#{line}<"
        next   # continue
      end
    end
  end  # each line

  ## quick hack?
  ###  auto-apply patch if patch configured
  if @patch && @patch.respond_to?(:on_patch)
    url_path = URI.parse( url ).path
    basename = File.basename( url_path, File.extname( url_path ))
    year     = year_from_name( basename )
    new_txt  = @patch.on_patch( new_txt, basename, year )
  end

  ## schedule.rounds = round_count
  Schedule.new( new_txt )
end # method find_schedule
244
+
245
+
246
+
247
##
# Builds the page statistics record (PageStat) from the page text.
#
#   source        - first "source: <url>" reference found (case ignored)
#   authors and   - taken from "author(s): ... last updated: <d mmm yyyy>";
#   last_updated     whitespace in authors gets squished
#   year          - derived from the source url's basename via
#                     year_from_name; nil if the page has no source ref
#   line_count    - number of lines
#   char_count    - number of characters
#   sections      - titles of all "#### <title>" headings (anchors removed)
#
#   Missing source or authors/last updated get reported on stdout;
#   the matching fields stay nil.
#
#   returns the PageStat record
def build_stat
  source       = nil
  authors      = nil
  last_updated = nil

  ### find source ref
  if @txt =~ /source: ([^ \n]+)/im
    source = $1.to_s
    puts "source: >#{source}<"
  end

  ##
  ## fix/todo: move authors n last updated whitespace cleanup - why? why not??

  if @txt =~ /authors?:\s+(.+?)\s+last updated:\s+(\d{1,2} [a-z]{3,10} \d{4})/im
    last_updated = $2.to_s    # note: save a copy first (gets "reset" by next regex)
    authors      = $1.to_s.strip.gsub(/\s+/, ' ' )    # cleanup whitespace; squish-style
    authors      = authors.gsub( /[ ]*,[ ]*/, ', ' )  # prettify commas - always single space after comma (no space before)
    puts "authors: >#{authors}<"
    puts "last updated: >#{last_updated}<"
  end

  puts "*** !!! missing source"  if source.nil?
  puts "*** !!! missing authors and last updated"  if authors.nil? || last_updated.nil?

  ## get year from source (url)
  ##   fix: skip if no source found - URI.parse( nil ) raises
  year = nil
  if source
    url_path = URI.parse( source ).path
    basename = File.basename( url_path, File.extname( url_path ) )  ## e.g. duit92.txt or duit92.html => duit92
    puts " basename=>#{basename}<"
    year = year_from_name( basename )
  end

  sections   = []
  line_count = 0
  @txt.each_line do |line|
    line_count += 1

    ### find sections
    ##   todo: add more patterns? how? why?
    if line =~ /####\s+(.+)/
      title = $1
      puts " found section >#{title}<"
      ## remove anchors first e.g. ‹§sa› etc.
      ##   check if anchors with underscore (_) or dash/hyphen (-) ???
      sections << title.sub( /‹§[a-z0-9]+›/, '' ).strip
    end
  end

  rec = PageStat.new
  rec.source       = source      # e.g. http://rsssf.org/tabled/duit89.html   -- use source_url - why?? why not??
  rec.year         = year
  rec.authors      = authors
  rec.last_updated = last_updated
  rec.line_count   = line_count
  rec.char_count   = @txt.size   ## note: String#size counts characters (not bytes)
  rec.sections     = sections
  rec
end  ## method build_stat
309
+
310
+
311
## Writes the page text to the file at path (via write_text).
def save( path ) write_text( path, @txt ); end
314
+
315
+ end ## class Page
316
+ end ## module Rsssf
317
+
318
+
319
+
320
+
data/lib/rsssf/repo.rb ADDED
@@ -0,0 +1,144 @@
1
+
2
+ module Rsssf
3
+
4
+
5
+
6
+ class Repo
7
+ include Utils ## e.g. year_from_file, etc.
8
+
9
+
10
##  path  - root folder of the repo (see root)
##  title - title passed on to the page and schedule reports
##  patch - optional patcher; hooks (on_prepare etc.) get checked via respond_to?
def initialize( path, title: 'Your Title Here',
                      patch: nil )
  @repo_path, @title, @patch = path, title, patch
end
16
+
17
+
18
## root folder of the repo (as passed in on new)
##   use/rename to path - why? why not?
def root
  @repo_path
end
alias root_dir root
20
+
21
+
22
+ ## for now use single country repos - why? why not?
23
+ ## add support for all-in-one repos
24
## Downloads (if not cached yet) and converts the table page for every
## season and saves the text as tables/<basename>.txt in the repo;
## the page text gets passed through patch.on_prepare first if the
## patch responds to on_prepare.
def prepare_pages( code, seasons )
  seasons.each do |season|
    url = Rsssf.table_url( code, season: season )

    ## download - if not (yet) in web cache
    Rsssf.download_table( code, season: season )  unless Webcache.cached?( url )

    page = Page.read_cache( url )

    url_path = URI.parse( url ).path
    puts " url = >#{url}<"
    puts " url_path = >#{url_path}<"

    ## e.g. /tabled/duit89.html => duit89
    basename = File.basename( url_path, File.extname( url_path ))

    ## check for on_prepare (apply patches)
    if @patch && @patch.respond_to?(:on_prepare)
      page.txt = @patch.on_prepare( page.txt, basename, year_from_name( basename ) )
    end

    page.save( "#{@repo_path}/tables/#{basename}.txt" )
  end
end # method prepare_pages
54
+
55
+
56
## Reads tables/<basename>.txt for every season and calls the block
## with the season (converted via Season()) and the page;
## if the repo has a patch, the patch and the source url get passed
## along on the page.
def each_page( code, seasons, &blk )   ## use each table or such - why? why not?
  seasons.each do |season_key|
    url      = Rsssf.table_url( code, season: season_key )
    url_path = URI.parse( url ).path
    puts " url = >#{url}<"
    puts " url_path = >#{url_path}<"

    basename = File.basename( url_path, File.extname( url_path ))
    page     = Page.read_txt( "#{@repo_path}/tables/#{basename}.txt" )

    ## add/pass along patcher if patcher
    if @patch
      page.patch = @patch
      page.url   = url
    end

    blk.call( Season( season_key ), page )
  end
end
77
+
78
+
79
## Builds the page report for all tables/*.txt files and
## saves it as README.md in the tables/ folder of the repo.
def make_pages_summary
  tables_dir = "#{@repo_path}/tables"

  report = PageReport.build( Dir.glob( "#{tables_dir}/*.txt" ),
                             title: @title )   ## pass in title etc.
  report.save( "#{tables_dir}/README.md" )
end # method make_pages_summary
86
+
87
+
88
## Builds the schedule report for all match datafiles found in the repo
## (via SportDb::Parser::Opts.expand_args) and
## saves it as README.md in the repo root.
def make_schedules_summary
  ## find all match datafiles
  files = SportDb::Parser::Opts.expand_args( [@repo_path] )
  pp files

  ## pass in title etc.
  report = ScheduleReport.build( files, title: @title, patch: @patch )
  report.save( "#{@repo_path}/README.md" )
end
98
+
99
+
100
+
101
+
102
## Runs patcher.patch( txt, name, year ) on every page in tables/ (via patch_dir);
##  lets you run/use custom (repo/country-specific) patches e.g. for adding/patching headings etc.
def patch_pages( patcher )
  tables_dir = "#{@repo_path}/tables"

  patch_dir( tables_dir ) do |page_txt, page_name, page_year|
    puts "patching #{page_year} (#{page_name}) (#{@repo_path})..."
    ## note: must be last (that is, must return (patched) t(e)xt)
    patcher.patch( page_txt, page_name, page_year )
  end
end ## method patch_pages
109
+
110
+
111
## Calls the block with text, basename and year for every .txt file
## below root (latest year first) and
## writes the returned text back if it differs from the text read.
def patch_dir( root, &blk )
  ## sort files by year (latest first)
  files = Dir.glob( "#{root}/**/*.txt" ).sort do |a,b|
    a_year, b_year = year_from_file( a ), year_from_file( b )
    b_year <=> a_year
  end

  files.each do |file|
    txt      = read_text( file )             ## note: assumes already converted to utf-8
    basename = File.basename( file, '.txt' ) ## e.g. duit92.txt => duit92
    patched  = blk.call( txt, basename, year_from_name( basename ) )

    ## nothing changed? nothing to write
    next if txt == patched

    puts " patching #{file}, text changed"
    write_text( file, patched )
  end # each file
end ## patch_dir
138
+
139
+
140
+
141
+
142
+ end ## class Repo
143
+ end ## module Rsssf
144
+
@@ -0,0 +1,75 @@
1
+
2
+
3
+ module Rsssf
4
+
5
+ class PageReport
6
+
7
+
8
## Builds the report from the page stats (Page#build_stat)
## of all (page) text files passed in.
def self.build( files, title: )
  stats = files.map { |file| Page.read_txt( file ).build_stat }

  new( stats, title: title )
end
17
+
18
+
19
+ attr_reader :title
20
+
21
##  stats - page stat records (one per page)
##  title - report title
def initialize( stats, title: )
  @stats, @title = stats, title
end
25
+
26
+ ### save report as README.md in repo
27
## Builds the summary and writes it to the file at path (via write_text).
def save( path )
  write_text( path, build_summary )
end
28
+
29
+
30
##
# Builds the report text (markdown):
#   a title header followed by a table with one row per page
#   (file, authors, last updated, lines (chars), sections);
#   rows are sorted by year (latest first).
#
#   Records without a year are listed last; records without a source
#   url get a ? in the file column (instead of raising).
#
#   returns the report text
def build_summary
  ## latest year first; records without year last
  stats = @stats.sort do |l,r|
    if    l.year && r.year then r.year <=> l.year
    elsif l.year           then -1
    elsif r.year           then 1
    else                        0
    end
  end

  ## no longer add last update
  ##   _Last Update: #{Time.now}_

  ## note: use String.new - a string literal might be frozen
  txt = String.new
  txt << "\n# #{title}\n\n"
  txt << "football.db RSSSF Archive Data Summary for #{title}\n\n"

  txt << "| File | Authors | Last Updated | Lines (Chars) | Sections |\n"
  txt << "| :----- | :------- | :----------- | ------------: | :------- |\n"

  ## note - removed season (no longer tracked here)

  stats.each do |stat|
    if stat.source
      ## get basename from source url
      url_path = URI.parse( stat.source ).path
      basename = File.basename( url_path, File.extname( url_path ) )  ## e.g. duit92.txt or duit92.html => duit92
      txt << "| [#{basename}.txt](#{basename}.txt) "
    else
      txt << "| ? "
    end

    txt << "| #{stat.authors} "
    txt << "| #{stat.last_updated} "
    txt << "| #{stat.line_count} (#{stat.char_count}) "
    txt << "| #{Array( stat.sections ).join(', ')} "
    txt << "|\n"
  end

  txt << "\n\n"
  txt
end  # method build_summary
72
+
73
+ end ## class PageReport
74
+ end ## module Rsssf
75
+
@@ -0,0 +1,163 @@
1
+
2
+
3
+ module Rsssf
4
+
5
+
6
## Lint result for a single schedule datafile.
ScheduleStat = Struct.new( :path,      # path to .txt file
                           :errors )   # array or nil
10
+
11
+
12
+
13
+ class ScheduleReport
14
+
15
+ include Utils ## e.g. year_from_file, etc.
16
+
17
+ ##
18
+ ## quick hack? pass along (optional) patch
19
+
20
## Lints all (schedule) datafiles passed in and
## builds the report from the results (one ScheduleStat per file).
##
##  patch - optional; if it responds to on_parse the text gets passed
##          through patch.on_parse( txt, basename, season ) before
##          linting (season derived from the name of the file's folder)
##
##  NOTE(review): all files share one linter - assumes linter.errors
##                  is a fresh list for every parse call; confirm
def self.build( files, title:,
                       patch: nil )
  linter = Parser::Linter.new

  stats = files.each_with_index.map do |file,i|
    puts "==> [#{i+1}/#{files.size}] reading >#{file}<..."

    txt = read_text( file )

    if patch && patch.respond_to?(:on_parse)
      season   = Season( File.basename(File.dirname(file)) )
      basename = File.basename(file, File.extname(file))
      puts " [debug] before patch.on_parse #{basename}, #{season}"
      txt = patch.on_parse( txt, basename, season )
    end

    ## todo/fix - change path to file/filename - why? why not?
    linter.parse( txt, parse: true, path: file )

    ScheduleStat.new( file, linter.errors )
  end

  new( stats, title: title )
end
51
+
52
+
53
+ attr_reader :title
54
+
55
##  stats - schedule stat records (one per datafile)
##  title - report title
def initialize( stats, title: )
  @stats, @title = stats, title
end
59
+
60
+ ### save report as README.md in repo
61
## Builds the summary and writes it to the file at path (via write_text).
def save( path )
  write_text( path, build_summary )
end
62
+
63
+
64
##
# Builds the report text (markdown):
#   a title header followed by a table with one row per datafile
#   (season, link to the datafile, number of errors or OK) and - if there
#   are any errors - the list of all errors in a code block.
#
#   The season is the name of the datafile's folder; the link uses
#   archive_dir_for_season( season ) for the archive path.
#   Stats without errors (nil or empty) count as OK.
#
#   returns the report text
def build_summary
  ## sort start 1) by season (latest first) than
  ##            2) by name (e.g. 1-bundesliga, cup, etc.)
  stats = @stats.sort do |l,r|
    v = File.basename(File.dirname(r.path)) <=> File.basename(File.dirname(l.path))
    v = File.basename(l.path) <=> File.basename(r.path)  if v == 0  ## same season
    v
  end

  ## no longer add last update
  ##   _Last Update: #{Time.now}_
  ## note: questions/comments footer no longer added either

  errors = []

  txt = String.new
  txt << "\n# #{title}\n\n"
  txt << "football.db RSSSF (Rec.Sport.Soccer Statistics Foundation) Archive Data for\n"
  txt << "#{title}\n\n"

  txt << "| Season | League, Cup | Errors |\n"
  txt << "| :----- | :---------- | -----: |\n"

  stats.each do |stat|
    path       = stat.path
    season_dir = File.basename(File.dirname( path ))
    filename   = File.basename( path )   ## incl. extension !!

    season = Season( season_dir )
    ## note - use archive_dir_for_season for archive path

    stat_errors = stat.errors || []   ## note: errors may be nil

    txt << "| #{season_dir} "
    txt << "| [#{filename}](#{archive_dir_for_season(season)}/#{filename}) "
    txt << if stat_errors.empty?
             "| OK "
           else
             "| **!! #{stat_errors.size}** "
           end
    txt << "|\n"

    errors.concat( stat_errors )
  end

  unless errors.empty?
    txt << "\n\n"
    txt << "#{errors.size} errors in #{stats.size} datafile(s)\n\n"

    txt << "```\n"
    errors.each do |path, msg, line|
      season_dir = File.basename(File.dirname( path ))
      filename   = File.basename( path )   ## incl. extension !!

      txt << "#{season_dir}/#{filename} -- #{msg}\n"
      txt << " in line >#{line}<\n"  unless line.nil? || line.empty?
    end
    txt << "```\n"
  end

  txt
end  # method build_summary
160
+
161
+ end ## class ScheduleReport
162
+ end ## module Rsssf
163
+