RubyGems - rsssf - Versions diffs - 0.0.1 → 0.1.0 - Mend

rsssf 0.0.1 → 0.1.0

Files changed (18) hide show

checksums.yaml +4 -4
data/.gemtest +0 -0
data/Manifest.txt +11 -0
data/README.md +171 -2
data/lib/rsssf.rb +13 -0
data/lib/rsssf/fetch.rb +80 -0
data/lib/rsssf/html2txt.rb +157 -0
data/lib/rsssf/page.rb +295 -0
data/lib/rsssf/patch.rb +28 -0
data/lib/rsssf/repo.rb +220 -0
data/lib/rsssf/reports/page.rb +64 -0
data/lib/rsssf/reports/schedule.rb +77 -0
data/lib/rsssf/schedule.rb +31 -0
data/lib/rsssf/utils.rb +75 -0
data/lib/rsssf/version.rb +2 -2
data/test/helper.rb +12 -0
data/test/test_utils.rb +83 -0
metadata +13 -1

@@ -0,0 +1,295 @@
+# encoding: utf-8
+module Rsssf
+  PageStat = Struct.new(
+    :source,     ## e.g. http://rsssf.org/tabled/duit89.html
+    :basename,   ## e.g. duit89   -- note: filename w/o extension (and path)
+    :year,       ## e.g. 1989     -- note: always four digits
+    :season,     ## e.g. 1990-91  -- note: always a string (NOT a number)
+    :authors,        ## string; whitespace squished and commas normalized to ", "
+    :last_updated,   ## string as found in page (day month year) e.g. 13 Mar 2012
+    :line_count,  ## todo: rename to (just) lines - why? why not?
+    :char_count,  ## todo: rename to (just) char(acter)s  - why? why not?
+    :sections)       ## array of section titles (taken from heading 4 lines, that is, #### Title)
+###
+## note:
+#    an rsssf page may contain:
+#     many leagues and cups
+#     - tables, schedules (rounds), notes, etc.
+#
+#   an rsssf page MUST be plain text (.txt); utf-8 character encoding is assumed
+#
+class Page
+  include Utils   ## e.g. year_from_name, etc.
+def self.from_url( src )
+  txt = PageFetcher.new.fetch( src )
+  self.from_string( txt )
+end
+def self.from_file( path )
+  txt = File.read_utf8( path )  # note: always assume sources (already) converted to utf-8
+  self.from_string( txt )
+end
+def self.from_string( txt )
+  self.new( txt )
+end
+def initialize( txt )    ## txt: complete page text; kept as-is in @txt (used by find_schedule, build_stat and save)
+  @txt = txt
+end
+## round markers used by find_schedule to detect the start of a round
+##   note: matched case-insensitive and on word boundaries only
+LEAGUE_ROUND_REGEX = /\bRound\b/i
+##   note: for (knockout) cups also allow 1/8 finals, quarterfinals, final, etc.
+CUP_ROUND_REGEX    = %r{\b(Round|1/8\sFinals|1/16\sFinals|Quarterfinals|Semifinals|Final)\b}i
+def find_schedule( opts={} )     ## change to build_schedule - why? why not???
+  ## find match schedule/fixtures in multi-league doc
+  new_txt = ''
+  ## note: keep track of statistics
+  ##   e.g. number of rounds found
+  round_count = 0
+  header = opts[:header]
+  if header
+    league_header_found        = false
+     ## header:
+     ##  - assumes heading 4 e.g. #### Premier League or
+     ##  - bold e.g. **FA Cup** for now
+     ##  note: markers must start line (^)
+     ## note:
+     ## header gsub spaces to \s otherwise no match in regex (using free-form x-flag)!!!
+     header_esc   = header.gsub( ' ', '\s' )
+     ## note: somehow #{2,4} will not work with free-form /xi defined (picked up as comment?)
+     ##  use [#] hack ??
+     header_regex = /^
+                      ([#]{2,4}\s+(#{header_esc}))
+                        |
+                      (\*{2}(#{header_esc})\*{2})
+                    /ix
+    ## todo:
+    ##   use new stage_regex e.g. **xxx** - why? why not?
+    ##  allow more than one stage in one schedule (e.g. regular stage,playoff stage etc)
+  else
+    league_header_found = true   # default (no header; assume single league file)
+    header_regex = /^---dummy---$/  ## non-matching dummy regex
+  end
+  ## puts "header_regex:"
+  ## pp header_regex
+  if opts[:cup]
+    round_regex = CUP_ROUND_REGEX   ## note: only allow final, quaterfinals, etc. if knockout cup
+  else
+    round_regex = LEAGUE_ROUND_REGEX
+  end
+  ## stages
+  first_round_header_found   = false
+  round_header_found         = false
+  round_body_found           = false   ## allow round header followed by blank lines
+  blank_found = false
+  @txt.each_line do |line|
+    if league_header_found == false
+      ## first find start of league header/section
+      if line =~ header_regex
+        puts "!!! bingo - found header >#{line}<"
+        league_header_found = true
+        title = line.gsub( /[#*]/, '' ).strip   ##  quick hack: extract title from header
+        new_txt << "## #{title}\n\n"    # note: use header/stage title (regex group capture)
+      else
+        puts "  searching for header >#{header}<; skipping line >#{line}<"
+        next
+      end
+    elsif first_round_header_found == false
+      ## next look for first round (starting w/ Round)
+      if line =~ round_regex
+        puts "!!! bingo - found first round >#{line}<"
+        round_count += 1
+        first_round_header_found = true
+        round_header_found       = true
+        round_body_found         = false
+        new_txt << line
+      elsif line =~ /^=-=-=-=/
+        puts "*** no rounds found; hit section marker (horizontal rule)"
+        break
+      elsif line =~ /^\*{2}[^*]+\*{2}/   ## e.g. **FA Cup**
+        puts "*** no rounds found; hit section/stage header: #{line}"
+        break
+      else
+        puts "  searching for first round; skipping line >#{line}<"
+        next ## continue; searching
+      end
+    elsif round_header_found == true
+      ## collect rounds;
+      ##   assume text block until next blank line
+      ##   new block must allways start w/ round
+      if line =~ /^\s*$/   ## blank line?
+        if round_body_found
+          round_header_found = false
+          blank_found        = true    ## keep track of blank (lines) - allow inside round block (can continue w/ date header/marker)
+          new_txt << line
+        else
+          ## note: skip blanks following header
+          next
+        end
+      else
+        round_body_found = true
+        new_txt << line   ## keep going until next blank line
+      end
+    else
+      ## skip (more) blank lines
+      if line =~ /^\s*$/
+        next  ## continue; skip extra blank line
+      elsif line =~ round_regex
+        puts "!!! bingo - found new round >#{line}<"
+        round_count += 1
+        round_header_found = true   # more rounds; continue
+        round_body_found   = false
+        blank_found        = false  # reset blank tracker
+        new_txt << line
+      elsif blank_found && line =~ /\[[a-z]{3} \d{1,2}\]/i   ## e.g. [Mar 13] or [May 5] with leading blank line; continue round
+        puts "!!! bingo - continue round >#{line}<"
+        round_header_found = true
+        blank_found        = false  # reset blank tracker
+        new_txt << line
+      elsif blank_found && line =~ /First Legs|Second Legs/i
+        puts "!!! bingo - continue round >#{line}<"
+        round_header_found = true
+        blank_found        = false  # reset blank tracker
+        new_txt << line
+      elsif line =~ /=-=-=-=/
+        puts "!!! stop schedule; hit section marker (horizontal rule)"
+        break;
+      elsif line =~ /^\*{2}[^*]+\*{2}/   ## e.g. **FA Cup**
+        puts "!!! stop schedule; hit section/stage header: #{line}"
+        break
+      else
+        blank_found  = false
+        puts "skipping line in schedule >#{line}<"
+        next # continue
+      end
+    end
+  end  # each line
+  schedule = Schedule.from_string( new_txt )
+  schedule.rounds = round_count
+  schedule
+end  # method find_schedule
+def build_stat
+  source       = nil
+  authors      = nil
+  last_updated = nil
+  ### find source ref
+  if @txt =~ /source: ([^ \n]+)/im
+    source = $1.to_s
+    puts "source: >#{source}<"
+  end
+  ##
+  ## fix/todo: move authors n last updated  whitespace cleanup to sanitize - why? why not??
+  if @txt =~ /authors?:\s+(.+?)\s+last updated:\s+(\d{1,2} [a-z]{3,10} \d{4})/im
+    last_updated = $2.to_s   # note: save a copy first (gets "reset" by next regex)
+    authors      = $1.to_s.strip.gsub(/\s+/, ' ' )  # cleanup whitespace; squish-style
+    authors = authors.gsub( /[ ]*,[ ]*/, ', ' )    # prettify commas - always single space after comma (no space before)
+    puts "authors: >#{authors}<"
+    puts "last updated: >#{last_updated}<"
+  end
+  puts "*** !!! missing source"  if source.nil?
+  puts "*** !!! missing authors n last updated"   if authors.nil? || last_updated.nil?
+  sections = []
+  ## count lines
+  line_count = 0
+  @txt.each_line do |line|
+    line_count +=1
+    ### find sections
+    ## todo: add more patterns? how? why?
+    if line =~ /####\s+(.+)/
+      puts "  found section >#{$1}<"
+      sections << $1.strip
+    end
+  end
+  # get path from url
+  url  = URI.parse( source )
+  ## pp url
+  ## puts url.host
+  path = url.path
+  extname  = File.extname( path )
+  basename = File.basename( path, extname )  ## e.g. duit92.txt or duit92.html => duit92
+  year     = year_from_name( basename )
+  season   = year_to_season( year )
+  rec = PageStat.new
+  rec.source       = source         # e.g. http://rsssf.org/tabled/duit89.html   -- use source_url - why?? why not??
+  rec.basename     = basename       # e.g. duit89
+  rec.year         = year           # e.g. 89 => 1989  -- note: always four digits
+  rec.season       = season
+  rec.authors      = authors
+  rec.last_updated = last_updated
+  rec.line_count   = line_count
+  rec.char_count   = @txt.size      ## fix: use "true" char count not byte count
+  rec.sections     = sections
+  rec
+end  ## method build_stat
+def save( path )
+  File.open( path, 'w' ) do |f|
+    f.write @txt
+  end
+end  ## method save
+end  ## class Page
+end  ## module Rsssf
+## add (shortcut) aliases - top-level constants (for use without the Rsssf:: namespace)
+RsssfPageStat = Rsssf::PageStat
+RsssfPage     = Rsssf::Page

data/lib/rsssf/patch.rb ADDED

@@ -0,0 +1,28 @@
+# encoding: utf-8
+module Rsssf
+class Patcher
+## e.g. 2008/09
+##   note: also support 1999/2000
+SEASON = '\d{4}\/(\d{2}|\d{4})'  ## note: use single quotes - quotes do NOT get escaped (e.g. '\d' => "\\d")
+def patch_heading( txt, rxs, title )
+  rxs.each do |rx|
+    txt = txt.sub( rx ) do |match|
+      match = match.gsub( "\n", '$$')  ## change newlines to $$ for single-line outputs/dumps
+      puts "  found heading >#{match}<"
+      "\n\n#### #{title}\n\n"
+    end
+  end
+  txt
+end
+end # class Patcher
+end  ## module Rsssf
+## add (shortcut) alias
+RsssfPatcher = Rsssf::Patcher

data/lib/rsssf/repo.rb ADDED

@@ -0,0 +1,220 @@
+# encoding: utf-8
+module Rsssf
+## used by Repo#make_schedules  (one config per schedule file to build e.g. league or cup)
+ScheduleConfig = Struct.new(
+  :name,           ## name of schedule file (w/o extension) e.g. 1-liga  => 1-liga.txt
+  :opts_for_year,  ## hash (used as is for all years) or proc ->(year){ Hash[...] }; passed on to Page#find_schedule
+  :dir_for_year,  ## proc ->(year){ 'path_here'}; if nil defaults to archive_dir_for_year   ## rename to path_for_year - why, why not??
+  :includes        ## array of years to include e.g. [2011,2012] etc.; if nil all years get included
+)
+ScheduleStat = Struct.new(   ## one record per schedule file written by Repo#make_schedules
+  :path,          ## e.g. 2012-13 or archive/1980s/1984-85   -- note: dir relative to repo root
+  :filename,      ## e.g. 1-bundesliga.txt   -- note: w/o path
+  :year,          ## e.g. 2013      -- note: numeric (integer)
+  :season,        ## e.g. 2012-13   -- note: is a string
+  :rounds         ## e.g. 36   -- note: numeric (integer); number of rounds found in schedule
+)
+class Repo
+  include Filters     ## e.g. sanitize, etc.
+  include Utils       ## e.g. year_from_file, etc.
+def initialize( path, opts )   ## path: repo root dir (pages live in tables/); opts: passed on to reports (title etc.)
+  @repo_path = path
+  @opts      = opts
+end
+def fetch_pages
+  puts "fetch_pages:"
+  cfg = YAML.load_file( "#{@repo_path}/tables/config.yml")
+  pp cfg
+  dl_base = 'http://rsssf.com'
+  cfg.each do |k,v|
+    ## season = k   # as string e.g. 2011-12  or 2011 etc.
+    path      = v  # as string e.g. tablesd/duit2011.html
+    ## note: assumes extension is .html
+    #    e.g. tablesd/duit2011.html => duit2011
+    basename = File.basename( path, '.html' )
+    src_url   = "#{dl_base}/#{path}"
+    dest_path = "#{@repo_path}/tables/#{basename}.txt"
+    page = Page.from_url( src_url )
+    page.save( dest_path )
+  end # each year
+end # method fetch_pages
+def make_pages_summary
+  stats = []
+  files = Dir[ "#{@repo_path}/tables/*.txt" ]
+  files.each do |file|
+    page = Page.from_file( file )
+    stats << page.build_stat
+  end
+  ### save report as README.md in tables/ folder in repo
+  report = PageReport.new( stats, @opts )    ## pass in title etc.
+  report.save( "#{@repo_path}/tables/README.md" )
+end  # method make_pages_summary
+def make_schedules_summary( stats )   ## note: requires stats to be passed in for now
+  report = ScheduleReport.new( stats, @opts )   ## pass in title etc.
+  report.save( "#{@repo_path}/README.md" )
+end  # method make_schedules_summary
+def patch_pages( patcher )
+  ## lets you run/use custom (repo/country-specific patches e.g. for adding/patching headings etc.)
+  patch_dir( "#{@repo_path}/tables" ) do |txt, name, year|
+    puts "patching #{year} (#{name}) (#{@repo_path})..."
+    patcher.patch( txt, name, year )    ## note: must be last (that is, must return (patcher) t(e)xt)
+  end
+end ## method  patch_pages
+def sanitize_pages
+   ## for debugging/testing: lets you (re)run sanitize on all pages in tables/  (already incl. in html2txt filter by default)
+   sanitize_dir( "#{@repo_path}/tables" )
+end
+def make_schedules( cfg )
+  ## note: return stats (for report eg. README)
+  stats = []
+  files = Dir[ "#{@repo_path}/tables/*.txt" ]
+  files.each do |file|
+## todo/check/fix:
+##   use source: prop in rsssf page - why? why not???
+##   move year/season/basename into page ???
+#
+#  assume every rsssf page has at least:
+##    - basename  e.g. duit2014
+##    - year      e.g. 2014 (numeric)
+##    - season    (derived from config lookup???) - string e.g. 2014-15 or 2014 etc.
+    extname  = File.extname( file )
+    basename = File.basename( file, extname )
+    year     = year_from_name( basename )
+    season   = year_to_season( year )
+    if cfg.includes && cfg.includes.include?( year ) == false
+      puts "   skipping #{basename}; not listed in includes"
+      next
+    end
+    puts "  reading >#{basename}<"
+    page = Page.from_file( file ) # note: always assume sources (already) converted to utf-8
+    if cfg.opts_for_year.is_a?( Hash )
+      opts = cfg.opts_for_year    ## just use as is 1:1 (constant/same for all years)
+    else
+      ## assume it's a proc/lambda (call to calculate)
+      opts = cfg.opts_for_year.call( year )
+    end
+    pp opts
+    schedule = page.find_schedule( opts )
+    ## pp schedule
+    if cfg.dir_for_year.nil?
+      ## use default setting, that is, archive for dir (e.g. archive/1980s/1985-86 etc.)
+      dir_for_year = archive_dir_for_year( year )
+    else
+      ## assume it's a proc/lambda
+      dir_for_year = cfg.dir_for_year.call( year )
+    end
+    ## -- cfg.name               e.g. => 1-liga
+    dest_path = "#{@repo_path}/#{dir_for_year}/#{cfg.name}.txt"
+    puts "  save to >#{dest_path}<"
+    FileUtils.mkdir_p( File.dirname( dest_path ))
+    schedule.save( dest_path )
+    rec = ScheduleStat.new
+    rec.path     = dir_for_year
+    rec.filename = "#{cfg.name}.txt"    ## change to basename - why?? why not??
+    rec.year     = year
+    rec.season   = season
+    rec.rounds   = schedule.rounds
+    stats << rec
+  end
+  stats  # return stats for reporting
+end # method make_schedules
+private
+def patch_dir( root )
+  files = Dir[ "#{root}/*.txt" ]
+  ## pp files
+  ## sort files by year (latest first)
+  files = files.sort do |l,r|
+    lyear = year_from_file( l )
+    ryear = year_from_file( r )
+    ryear <=> lyear
+  end
+  files.each do |file|
+    txt = File.read_utf8( file )    ## note: assumes already converted to utf-8
+    basename = File.basename( file, '.txt' )  ## e.g. duit92.txt => duit92
+    year     = year_from_name( basename )
+    new_txt = yield( txt, basename, year )
+    ## calculate hash to see if anything changed ?? why? why not??
+    File.open( file, 'w' ) do |f|
+      f.write new_txt
+    end
+  end # each file
+end  ## patch_dir
+def sanitize_dir( root )
+  files = Dir[ "#{root}/*.txt" ]
+  files.each do |file|
+    txt = File.read_utf8( file )    ## note: assumes already converted to utf-8
+    new_txt = sanitize( txt )
+    File.open( file, 'w' ) do |f|
+      f.write new_txt
+    end
+  end # each file
+end  ## sanitize_dir
+end  ## class Repo
+end  ## module Rsssf
+## add (shortcut) aliases - top-level constants (for use without the Rsssf:: namespace)
+RsssfRepo           = Rsssf::Repo
+RsssfScheduleConfig = Rsssf::ScheduleConfig
+RsssfScheduleStat   = Rsssf::ScheduleStat