RubyGems - rsssf - Versions diffs - 0.0.1 → 0.2.0 - Mend

rsssf 0.0.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

checksums.yaml +5 -5
data/{HISTORY.md → CHANGELOG.md} +2 -0
data/Manifest.txt +9 -1
data/README.md +193 -7
data/Rakefile +8 -7
data/lib/rsssf/convert.rb +495 -0
data/lib/rsssf/download.rb +151 -0
data/lib/rsssf/page.rb +320 -0
data/lib/rsssf/repo.rb +144 -0
data/lib/rsssf/reports/page.rb +75 -0
data/lib/rsssf/reports/schedule.rb +163 -0
data/lib/rsssf/schedule.rb +21 -0
data/lib/rsssf/utils.rb +56 -0
data/lib/rsssf/version.rb +4 -6
data/lib/rsssf.rb +46 -10
metadata +32 -19

data/lib/rsssf/page.rb ADDED Viewed

@@ -0,0 +1,320 @@
+module Rsssf
+  ## statistics record for a single page (members in order):
+  ##   source        - e.g. https://rsssf.org/tabled/duit89.html
+  ##   year          - e.g. 1989     -- note: always four digits
+  ##   authors
+  ##   last_updated
+  ##   line_count    - todo: rename to (just) lines - why? why not?
+  ##   char_count    - todo: rename to (just) char(acter)s - why? why not?
+  ##   sections
+  PageStat = Struct.new( *%i[source year authors last_updated line_count char_count sections] )
+###
+## note:
+#    a rsssf page may contain:
+#     many leagues, cups
+#     - tables, schedules (rounds), notes, etc.
+#
+#   a rsssf page MUST be in plain text (.txt); utf-8 character encoding is assumed
+#
+class Page
+  include Utils   ## e.g. year_from_name, etc.
+def self.read_cache( url )  ### use read_cache /web/html or such - why? why not?
+  html = Webcache.read( url )
+  puts "html:"
+  pp html[0..400]
+  txt = PageConverter.convert( html, url: url )
+  txt
+  new( txt )
+end
+def self.read_txt( path )  ## use read_txt
+    # note: always assume sources (already) converted from html to txt!!!!
+  txt = read_text( path )
+  new( txt )
+end
+### use text alias too (for txt) - why? why not?
+attr_accessor :txt
+## quick hack? used for auto-patch machinery
+attr_accessor :patch
+attr_accessor :url  ### source url
+def initialize( txt )
+  @txt = txt
+  @patch = nil
+  @url   = nil
+end
+LEAGUE_ROUND_REGEX = /\b
+                      Round
+                      \b/ix
+CUP_ROUND_REGEX  = /\b(
+                      Round         |
+                      1\/8\sFinals  |
+                      1\/16\sFinals |
+                      Quarterfinals |
+                      Semifinals    |
+                      Final
+                    )\b/ix
+## make header required - why? why not?
+def find_schedule( header: nil,
+                   cup:    false )     ## change to build_schedule - why? why not???
+  ## find match schedule/fixtures in multi-league doc
+  new_txt = String.new
+  ## note: keep track of statistics
+  ##   e.g. number of rounds found
+  round_count = 0
+  if header
+    league_header_found        = false
+     ## header:
+     ##  - assumes heading 4 e.g. #### Premier League or
+     ##  - bold e.g. **FA Cup** for now
+     ##  note: markers must start line (^)
+     ## note:
+     ## header gsub spaces to \s otherwise no match in regex (using free-form x-flag)!!!
+     header_esc   = header.gsub( ' ', '\s' )
+     ## note: somehow #{2,4} will not work with free-form /xi defined (picked up as comment?)
+     ##  use [#] hack ??
+     header_regex = /^
+                      ([#]{2,4}\s+(#{header_esc}))
+                        |
+                      (\*{2}(#{header_esc}))     ## was: \*{2})
+                                                 ##  do not inluce trailing ** for now (allows anchors e.g. §)
+                    /ix
+    ## todo:
+    ##   use new stage_regex e.g. **xxx** - why? why not?
+    ##  allow more than one stage in one schedule (e.g. regular stage,playoff stage etc)
+  else
+    league_header_found = true   # default (no header; assume single league file)
+    header_regex = /^---dummy---$/  ## non-matching dummy regex
+  end
+  ## puts "header_regex:"
+  ## pp header_regex
+  if cup
+    round_regex = CUP_ROUND_REGEX   ## note: only allow final, quaterfinals, etc. if knockout cup
+  else
+    round_regex = LEAGUE_ROUND_REGEX
+  end
+  ## stages
+  first_round_header_found   = false
+  round_header_found         = false
+  round_body_found           = false   ## allow round header followed by blank lines
+  blank_found = false
+  @txt.each_line do |line|
+    if league_header_found == false
+      ## first find start of league header/section
+      if line =~ header_regex
+        puts "!!! bingo - found header >#{line}<"
+        league_header_found = true
+        ## note - do NOT auto-add header/title !!!
+        # title = line.gsub( /[#*]/, '' ).strip   ##  quick hack: extract title from header
+        # new_txt << "## #{title}\n\n"    # note: use header/stage title (regex group capture)
+      else
+        puts "  searching for header >#{header}<; skipping line >#{line}<"
+        next
+      end
+    elsif first_round_header_found == false
+      ## next look for first round (starting w/ Round)
+      if line =~ round_regex
+        puts "!!! bingo - found first round >#{line}<"
+        round_count += 1
+        first_round_header_found = true
+        round_header_found       = true
+        round_body_found         = false
+        new_txt << line
+      elsif line =~ /^=-=-=-=/
+        puts "*** no rounds found; hit section marker (horizontal rule)"
+        break
+      elsif line =~ /^\*{2}[^*]+\*{2}/   ## e.g. **FA Cup**
+        puts "*** no rounds found; hit section/stage header: #{line}"
+        break
+      else
+        puts "  searching for first round; skipping line >#{line}<"
+        next ## continue; searching
+      end
+    elsif round_header_found == true
+      ## collect rounds;
+      ##   assume text block until next blank line
+      ##   new block must allways start w/ round
+      if line =~ /^\s*$/   ## blank line?
+        if round_body_found
+          round_header_found = false
+          blank_found        = true    ## keep track of blank (lines) - allow inside round block (can continue w/ date header/marker)
+          new_txt << line
+        else
+          ## note: skip blanks following header
+          next
+        end
+      else
+        round_body_found = true
+        new_txt << line   ## keep going until next blank line
+      end
+    else
+      ## skip (more) blank lines
+      if line =~ /^\s*$/
+        next  ## continue; skip extra blank line
+      elsif line =~ round_regex
+        puts "!!! bingo - found new round >#{line}<"
+        round_count += 1
+        round_header_found = true   # more rounds; continue
+        round_body_found   = false
+        blank_found        = false  # reset blank tracker
+        new_txt << line
+      elsif blank_found && line =~ /\[[a-z]{3} \d{1,2}\]/i   ## e.g. [Mar 13] or [May 5] with leading blank line; continue round
+        puts "!!! bingo - continue round >#{line}<"
+        round_header_found = true
+        blank_found        = false  # reset blank tracker
+        new_txt << line
+      elsif blank_found && line =~ /First Legs|Second Legs/i
+        puts "!!! bingo - continue round >#{line}<"
+        round_header_found = true
+        blank_found        = false  # reset blank tracker
+        new_txt << line
+      elsif line =~ /=-=-=-=/
+        puts "!!! stop schedule; hit section marker (horizontal rule)"
+        break;
+      elsif line =~ /^\*{2}[^*]+\*{2}/   ## e.g. **FA Cup**
+        puts "!!! stop schedule; hit section/stage header: #{line}"
+        break
+      else
+        blank_found  = false
+        puts "skipping line in schedule >#{line}<"
+        next # continue
+      end
+    end
+  end  # each line
+  ## quick hack?
+  ### auto-apply patch if patch configured
+   if @patch  && @patch.respond_to?(:on_patch)
+      url_path = URI.parse( url ).path
+      basename = File.basename( url_path, File.extname( url_path ))
+      year     = year_from_name( basename )
+      new_txt = @patch.on_patch( new_txt, basename, year )
+   end
+  schedule = Schedule.new( new_txt )
+  ## schedule.rounds = round_count
+  schedule
+end  # method find_schedule
+def build_stat
+  source       = nil
+  authors      = nil
+  last_updated = nil
+  ### find source ref
+  if @txt =~ /source: ([^ \n]+)/im
+    source = $1.to_s
+    puts "source: >#{source}<"
+  end
+  ##
+  ## fix/todo: move authors n last updated  whitespace cleanup  - why? why not??
+  if @txt =~ /authors?:\s+(.+?)\s+last updated:\s+(\d{1,2} [a-z]{3,10} \d{4})/im
+    last_updated = $2.to_s   # note: save a copy first (gets "reset" by next regex)
+    authors      = $1.to_s.strip.gsub(/\s+/, ' ' )  # cleanup whitespace; squish-style
+    authors = authors.gsub( /[ ]*,[ ]*/, ', ' )    # prettify commas - always single space after comma (no space before)
+    puts "authors: >#{authors}<"
+    puts "last updated: >#{last_updated}<"
+  end
+  puts "*** !!! missing source"  if source.nil?
+  puts "*** !!! missing authors and last updated"   if authors.nil? || last_updated.nil?
+  ## get year from source (url)
+  url_path  = URI.parse( source ).path
+  basename  = File.basename( url_path, File.extname( url_path ) )  ## e.g. duit92.txt or duit92.html => duit92
+  puts "   basename=>#{basename}<"
+  year      = year_from_name( basename )
+  sections = []
+  ## count lines
+  line_count = 0
+  @txt.each_line do |line|
+    line_count +=1
+    ### find sections
+    ## todo: add more patterns? how? why?
+    if line =~ /####\s+(.+)/
+      puts "  found section >#{$1}<"
+      ## remove  anchors first   e.g.   ‹§sa› etc.
+      ##   check if anchors with underscore (_) or dash/hyphen (-) ???
+      sections << $1.sub( /‹§[a-z0-9]+›/, '' ).strip
+    end
+  end
+  rec = PageStat.new
+  rec.source       = source         # e.g. http://rsssf.org/tabled/duit89.html   -- use source_url - why?? why not??
+  rec.year         = year
+  rec.authors      = authors
+  rec.last_updated = last_updated
+  rec.line_count   = line_count
+  rec.char_count   = @txt.size      ## fix: use "true" char count not byte count
+  rec.sections     = sections
+  rec
+end  ## method build_stat
+## Write the page text to path (via write_text).
+def save( path ) write_text( path, @txt ); end  ## method save
+end  ## class Page
+end  ## module Rsssf

data/lib/rsssf/repo.rb ADDED Viewed

@@ -0,0 +1,144 @@
+module Rsssf
+class Repo
+  include Utils       ## e.g. year_from_file, etc.
+def initialize( path, title: 'Your Title Here',
+                      patch: nil )
+  @repo_path = path
+  @title     = title
+  @patch     = patch
+end
+def root() @repo_path; end    ## use/rename to path - why? why not?
+alias_method :root_dir, :root
+## for now use single country repos - why? why not?
+##   add support for all-in-one repos
+def prepare_pages( code, seasons )
+  seasons.each do |season|
+    url = Rsssf.table_url( code, season: season )
+    ## check if not in cache
+    unless Webcache.cached?( url )
+        ## download - if not cached
+        Rsssf.download_table( code, season: season )
+    end
+    page = Page.read_cache( url )
+    url_path = URI.parse( url ).path
+    puts "  url = >#{url}<"
+    puts "  url_path = >#{url_path}<"
+    basename = File.basename( url_path, File.extname( url_path ))
+    ###
+    ## check for on_prepare (apply patches)
+    if @patch && @patch.respond_to?(:on_prepare)
+       year = year_from_name( basename )
+       page.txt = @patch.on_prepare( page.txt, basename, year )
+    end
+    path = "#{@repo_path}/tables/#{basename}.txt"
+    page.save( path )
+  end
+end # method prepare_pages
+def each_page( code, seasons, &blk )  ## use each table or such - why? why not?
+  seasons.each do |season|
+    url = Rsssf.table_url( code, season: season )
+    url_path = URI.parse( url ).path
+    puts "  url = >#{url}<"
+    puts "  url_path = >#{url_path}<"
+    basename = File.basename( url_path, File.extname( url_path ))
+    path = "#{@repo_path}/tables/#{basename}.txt"
+     page = Page.read_txt( path )
+     ## add/pass along patcher if patcher
+     if @patch
+       page.patch  = @patch
+       page.url    = url
+     end
+    season = Season( season )
+    blk.call( season, page )
+  end
+end
+def make_pages_summary
+  files = Dir.glob( "#{@repo_path}/tables/*.txt" )
+  report = PageReport.build( files, title: @title )    ## pass in title etc.
+  ### save report as README.md in tables/ folder in repo
+  report.save( "#{@repo_path}/tables/README.md" )
+end  # method make_pages_summary
+def make_schedules_summary
+   ## find all match datafiles
+   args = [@repo_path]
+   files = SportDb::Parser::Opts.expand_args( args )
+   pp files
+   report = ScheduleReport.build( files, title: @title,
+                                         patch: @patch )   ## pass in title etc.
+   report.save( "#{@repo_path}/README.md" )
+end
+def patch_pages( patcher )
+  ## lets you run/use custom (repo/country-specific patches e.g. for adding/patching headings etc.)
+  patch_dir( "#{@repo_path}/tables" ) do |txt, name, year|
+    puts "patching #{year} (#{name}) (#{@repo_path})..."
+    patcher.patch( txt, name, year )    ## note: must be last (that is, must return (patcher) t(e)xt)
+  end
+end ## method  patch_pages
+def patch_dir( root, &blk )
+  files = Dir.glob( "#{root}/**/*.txt" )
+  ## pp files
+  ## sort files by year (latest first)
+  files = files.sort do |l,r|
+    lyear = year_from_file( l )
+    ryear = year_from_file( r )
+    ryear <=> lyear
+  end
+  files.each do |file|
+    txt = read_text( file )    ## note: assumes already converted to utf-8
+    basename = File.basename( file, '.txt' )  ## e.g. duit92.txt => duit92
+    year     = year_from_name( basename )
+    new_txt = blk.call( txt, basename, year )
+    ## calculate hash to see if anything changed ?? why? why not??
+    if txt != new_txt
+      puts "  patching #{file}, text changed"
+      write_text( file, new_txt )
+    end
+  end # each file
+end  ## patch_dir
+end  ## class Repo
+end  ## module Rsssf

data/lib/rsssf/reports/page.rb ADDED Viewed

@@ -0,0 +1,75 @@
+module Rsssf
+class PageReport
+def self.build( files, title:  )
+  stats = []
+  files.each do |file|
+    page = Page.read_txt( file )
+    stats << page.build_stat
+  end
+  new( stats, title: title )
+end
+attr_reader :title
+def initialize( stats, title: )
+  @stats = stats
+  @title = title
+end
+### save report as README.md in repo
+def save( path ) write_text( path, build_summary ); end
+def build_summary
+  stats = @stats.sort do |l,r|
+    r.year <=> l.year
+  end
+  header =<<EOS
+# #{title}
+football.db RSSSF Archive Data Summary for #{title}
+EOS
+## no longer add last update
+##  _Last Update: #{Time.now}_
+  txt = ''
+  txt << header
+  txt << "| File   | Authors  | Last Updated | Lines (Chars) | Sections |\n"
+  txt << "| :----- | :------- | :----------- | ------------: | :------- |\n"
+## note - removed season (no longer tracked here)
+  stats.each do |stat|
+    ## get basename from source url
+    url_path  = URI.parse( stat.source ).path
+    basename  = File.basename( url_path, File.extname( url_path ) )  ## e.g. duit92.txt or duit92.html => duit92
+    txt << "| [#{basename}.txt](#{basename}.txt) "
+    txt << "| #{stat.authors} "
+    txt << "| #{stat.last_updated} "
+    txt << "| #{stat.line_count} (#{stat.char_count}) "
+    txt << "| #{stat.sections.join(', ')} "
+    txt << "|\n"
+  end
+  txt << "\n\n"
+  txt
+end  # method build_summary
+end  ## class PageReport
+end  ## module Rsssf

data/lib/rsssf/reports/schedule.rb ADDED Viewed

@@ -0,0 +1,163 @@
+module Rsssf
+## lint result record for a single schedule datafile:
+##   path   - path to .txt file
+##   errors - array or nil
+ScheduleStat = Struct.new( :path, :errors )
+class ScheduleReport
+  include Utils       ## e.g. year_from_file, etc.
+##
+##  quick hack?  pass along (optional) patch
+def self.build( files, title:,
+                       patch: nil )
+  linter = Parser::Linter.new
+  stats = []
+  files.each_with_index do |file,i|
+    puts "==> [#{i+1}/#{files.size}] reading >#{file}<..."
+    txt = read_text( file )
+    if patch && patch.respond_to?(:on_parse)
+      season_dir = File.basename(File.dirname(file))
+      season     = Season( season_dir )
+      basename   = File.basename(file, File.extname(file))
+      puts "  [debug] before  patch.on_parse #{basename}, #{season}"
+      txt = patch.on_parse( txt, basename, season )
+    end
+    linter.parse( txt, parse: true,
+                       path:  file  )   ## todo/fix - change path to file/filename - why? why not?
+    stat = ScheduleStat.new
+    stat.path   = file
+    stat.errors = linter.errors
+    stats << stat
+  end
+  new( stats, title: title )
+end
+attr_reader :title
+def initialize( stats,  title: )
+  @stats = stats
+  @title = title
+end
+### save report as README.md in repo
+def save( path ) write_text( path, build_summary ); end
+## Build the schedules summary text (markdown):
+##   the title header followed by a table with one row per datafile (season, league/cup, errors)
+##   plus - if any errors - a code block listing all errors.
+##  returns the summary text
+def build_summary
+  ## sort stats 1) by season (latest first) then
+  ##            2) by name (e.g. 1-bundesliga, cup, etc.)
+  ##  note: season is the name of the parent folder of the datafile
+  stats = @stats.sort do |l,r|
+    v =  File.basename(File.dirname(r.path)) <=> File.basename(File.dirname(l.path))
+    v =  File.basename(l.path) <=> File.basename(r.path)    if v == 0  ## same season
+    v
+  end
+  header =<<EOS
+# #{title}
+football.db RSSSF (Rec.Sport.Soccer Statistics Foundation) Archive Data for
+#{title}
+EOS
+## no longer add last update
+## _Last Update: #{Time.now}_
+##
+=begin
+  footer =<<EOS
+## Questions? Comments?
+Send them along to the
+[Open Sports & Friends Forum](http://groups.google.com/group/opensport).
+Thanks!
+EOS
+=end
+  ## all errors (of all datafiles) collected for the listing after the table
+  errors = []
+  txt = String.new
+  txt << header
+  txt << "| Season | League, Cup | Errors |\n"
+  txt << "| :----- | :---------- | -----: |\n"
+  stats.each_with_index do |stat,i|
+      path = stat.path
+      season_dir = File.basename(File.dirname( path ))
+      filename   = File.basename( path ) ## incl. extension !!
+      season = Season( season_dir )
+      ## note - use archive_dir_for_season for archive path
+      txt << "| #{season_dir} "
+      txt << "| [#{filename}](#{archive_dir_for_season(season)}/#{filename}) "
+      ## errors column - error count (in bold) or OK
+      txt <<   if stat.errors.size > 0
+                 "|  **!! #{stat.errors.size}**  "
+               else
+                 "|  OK  "
+               end
+      txt << "|\n"
+      errors += stat.errors  if stat.errors.size > 0
+  end
+   if errors.size > 0
+     txt << "\n\n"
+     txt << "#{errors.size} errors in #{stats.size} datafile(s)\n\n"
+     txt << "```\n"
+     ## note: assumes every error is a [path, msg, line] triplet (with line a string) - todo: confirm with linter
+     errors.each do |path, msg, line|
+        season_dir = File.basename(File.dirname( path ))
+        filename   = File.basename( path ) ## incl. extension !!
+        txt <<"#{season_dir}/#{filename} -- #{msg}\n"
+        txt << "     in line >#{line}<\n"    unless line.empty?
+     end
+     txt << "```\n"
+   end
+=begin
+  stats.each do |stat|
+    txt << "| #{stat.season} "
+    txt << "| [#{stat.filename}](#{stat.path}/#{stat.filename}) "
+    txt << "| #{stat.rounds} "
+    txt << "|\n"
+  end
+=end
+  ## txt << footer
+  txt
+end  # method build_summary
+end  ## class ScheduleReport
+end  ## module Rsssf