RubyGems - rsssf - Versions diffs - 0.1.0 → 0.2.0 - Mend

rsssf 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

checksums.yaml +5 -5
data/{HISTORY.md → CHANGELOG.md} +2 -0
data/Manifest.txt +3 -6
data/README.md +43 -26
data/Rakefile +8 -7
data/lib/rsssf/convert.rb +495 -0
data/lib/rsssf/download.rb +151 -0
data/lib/rsssf/page.rb +70 -45
data/lib/rsssf/repo.rb +77 -153
data/lib/rsssf/reports/page.rb +30 -19
data/lib/rsssf/reports/schedule.rb +111 -25
data/lib/rsssf/schedule.rb +4 -14
data/lib/rsssf/utils.rb +10 -29
data/lib/rsssf/version.rb +3 -5
data/lib/rsssf.rb +42 -19
metadata +26 -25
data/.gemtest +0 -0
data/lib/rsssf/fetch.rb +0 -80
data/lib/rsssf/html2txt.rb +0 -157
data/lib/rsssf/patch.rb +0 -28
data/test/helper.rb +0 -12
data/test/test_utils.rb +0 -83

data/lib/rsssf/download.rb ADDED Viewed

@@ -0,0 +1,151 @@
+module Rsssf
+## end_year to slug_year - two digits for end years before 2010, four digits from 2010 on
+##   todo/check: is this a generic rule/convention (for all countries)? examples:
+## 2007-08: tablesd/duit08.html
+## 2008-09: tablesd/duit09.html
+## 2009-10: tablesd/duit2010.html
+## 2010-11: tablesd/duit2011.html
+## 2011-12: tablesd/duit2012.html
+    ## map (lowercase) country codes to [page path template, options] pairs
+    ##   the template may be a proc (called with the season, see 'br'); options hold the (char) encoding
+  TABLE = {
+    'eng' => ['tablese/eng{year}',   { encoding: 'Windows-1252' } ],
+    'es'  => ['tabless/span{year}',  { encoding: 'Windows-1252' } ],
+    'de'  => ['tablesd/duit{year}', { encoding: 'Windows-1252' } ],
+    'at'  => ['tableso/oost{year}', { encoding: 'Windows-1252' }  ],
+    'br'  => [
+              ->(season) {
+                  ## note: special slug/case for year/season 2000
+                  ##  see rsssf.org/tablesb/brazchamp.html
+                 if season == Season('2000')
+                   'tablesb/braz-joao{year}'  ## use braz-joao00 - why? why not?
+                 else
+                   'tablesb/braz{year}'
+                 end
+              },  { encoding: 'Windows-1252' } ],
+  }
+  BASE_URL = "https://rsssf.org"   ## prefix for all table page urls
+  def self.table_url( code, season: )   ## returns the page url only (drops the encoding)
+     url, _ = table_url_and_encoding( code, season: season )
+     url
+  end
+  def self.table_url_and_encoding( code, season: )   ## returns [url, encoding] for country code and season
+     season = Season( season )   ## convert via Season() helper (defined outside of this file)
+     table = TABLE[ code.downcase ]   ## note: nil for unknown codes - next line raises NoMethodError then
+     tmpl     = table[0]
+     tmpl     = tmpl.call( season )  if tmpl.is_a?(Proc)  ## check for proc - call with season to get the template string
+     opts     = table[1] || {}
+     encoding = opts[:encoding]  || 'UTF-8'   ## fall back to UTF-8 if no encoding configured
+     slug =  if season.end_year < 2010   ## cut off all digits (only keep last two)
+                 ##  convert end_year to string with leading zero
+                 '%02d' % (season.end_year % 100)  ## e.g. 00 / 01 / 99 / 98 / 11 / etc.
+              else
+                '%4d' % season.end_year   ## e.g. 2010 / 2011 / etc.
+              end
+     tmpl = tmpl.sub( '{year}', slug )   ## fill in year slug e.g. tablesd/duit{year} => tablesd/duit2011
+     url = "#{BASE_URL}/#{tmpl}.html"
+     [url, encoding]
+  end
+  def self.download_table( code, season: )   ## downloads table page for country code and season; returns html text
+     url, encoding = table_url_and_encoding( code, season: season )
+     download_page( url, encoding: encoding )
+  end
+  def self.download_page( url, encoding: )   ## fetches page via Webget; returns html text
+    ## note: the (char) encoding gets passed in by the caller (see TABLE for the per-country settings)
+    ##  -- older note: assumed rsssf uses ISO_8859_15 (updated version of ISO_8859_1); TABLE configures Windows-1252 - todo/check
+    ###-- does NOT use utf-8 character encoding!!!
+    response = Webget.page( url, encoding: encoding )  ## fetch (and cache) html page (via HTTP GET)
+    ## note: exit on get / fetch error (terminates the whole process!) - do NOT continue for now - why? why not?
+    exit 1   if response.status.nok?    ## e.g.  HTTP status code != 200
+    puts "html:"   ## debug output
+    html =  response.text( encoding: encoding )
+    pp html[0..400]   ## debug output - print the first 401 chars only
+    html
+  end
+end  # module Rsssf
+__END__
+1998-99: tablesd/duit99.html
+1999-00: tablesd/duit00.html      ## use 1999-2000  - why?? why not??
+2000-01: tablesd/duit01.html
+2001-02: tablesd/duit02.html
+2002-03: tablesd/duit03.html
+2003-04: tablesd/duit04.html
+2004-05: tablesd/duit05.html
+2005-06: tablesd/duit06.html
+2006-07: tablesd/duit07.html
+2007-08: tablesd/duit08.html
+2008-09: tablesd/duit09.html
+2009-10: tablesd/duit2010.html
+2010-11: tablesd/duit2011.html
+2011-12: tablesd/duit2012.html
+2012-13: tablesd/duit2013.html
+2013-14: tablesd/duit2014.html
+2014-15: tablesd/duit2015.html
+2010-11: tableso/oost2011.html
+2011-12: tableso/oost2012.html
+2012-13: tableso/oost2013.html
+2013-14: tableso/oost2014.html
+2014-15: tableso/oost2015.html
+2015-16: tableso/oost2016.html
+2011: tablesb/braz2011.html  !! Windows-1252
+2012: tablesb/braz2012.html  !! Windows-1252
+2013: tablesb/braz2013.html  !! Windows-1252
+2014: tablesb/braz2014.html  !! Windows-1252
+2015: tablesb/braz2015.html  !! Windows-1252
+2016: tablesb/braz2016.html  !! Windows-1252
+2017: tablesb/braz2017.html  !! Windows-1252
+2018: tablesb/braz2018.html  !! Windows-1252
+2019: tablesb/braz2019.html  !! Windows-1252
+2020: tablesb/braz2020.html  !! Windows-1252   ## 2020/21  - extended for corona
+2021: tablesb/braz2021.html  !! Windows-1252
+2022: tablesb/braz2022.html  !! Windows-1252
+2023: tablesb/braz2023.html  !! Windows-1252
+2024: tablesb/braz2024.html  !! Windows-1252
+2010-11: tablese/eng2011.html  !! Windows-1252
+2011-12: tablese/eng2012.html  !! Windows-1252
+2012-13: tablese/eng2013.html  !! Windows-1252
+2013-14: tablese/eng2014.html  !! Windows-1252
+2014-15: tablese/eng2015.html  !! Windows-1252
+2015-16: tablese/eng2016.html  !! Windows-1252
+2016-17: tablese/eng2017.html  !! Windows-1252
+2017-18: tablese/eng2018.html  !! Windows-1252
+2018-19: tablese/eng2019.html  !! Windows-1252
+2019-20: tablese/eng2020.html  !! Windows-1252
+2020-21: tablese/eng2021.html  !! Windows-1252
+2021-22: tablese/eng2022.html  !! Windows-1252
+2022-23: tablese/eng2023.html  !! Windows-1252
+2023-24: tablese/eng2024.html  !! Windows-1252

data/lib/rsssf/page.rb CHANGED Viewed

@@ -1,13 +1,11 @@
-# encoding: utf-8
 module Rsssf
   PageStat = Struct.new(
-    :source,     ## e.g. http://rsssf.org/tabled/duit89.html
-    :basename,   ## e.g. duit89   -- note: filename w/o extension (and path)
+    :source,     ## e.g. https://rsssf.org/tabled/duit89.html
     :year,       ## e.g. 1989     -- note: always four digits
-    :season,     ## e.g. 1990-91  -- note: always a string (NOT a number)
     :authors,
     :last_updated,
     :line_count,  ## todo: rename to (just) lines - why? why not?
@@ -27,24 +25,41 @@ module Rsssf
 class Page
   include Utils   ## e.g. year_from_name, etc.
+def self.read_cache( url )  ### use read_cache /web/html or such - why? why not?
+  html = Webcache.read( url )
+  puts "html:"
+  pp html[0..400]
+  txt = PageConverter.convert( html, url: url )
+  txt
-def self.from_url( src )
-  txt = PageFetcher.new.fetch( src )
-  self.from_string( txt )
+  new( txt )
 end
-def self.from_file( path )
-  txt = File.read_utf8( path )  # note: always assume sources (already) converted to utf-8
-  self.from_string( txt )
+def self.read_txt( path )  ## use read_txt
+    # note: always assume sources (already) converted from html to txt!!!!
+  txt = read_text( path )
+  new( txt )
 end
-def self.from_string( txt )
-  self.new( txt )
-end
+### use text alias too (for txt) - why? why not?
+attr_accessor :txt
+## quick hack? used for auto-patch machinery
+attr_accessor :patch
+attr_accessor :url  ### source url
 def initialize( txt )
   @txt = txt
+  @patch = nil
+  @url   = nil
 end
@@ -61,17 +76,20 @@ CUP_ROUND_REGEX  = /\b(
                       Final
                     )\b/ix
-def find_schedule( opts={} )     ## change to build_schedule - why? why not???
+## make header required - why? why not?
+def find_schedule( header: nil,
+                   cup:    false )     ## change to build_schedule - why? why not???
   ## find match schedule/fixtures in multi-league doc
-  new_txt = ''
+  new_txt = String.new
   ## note: keep track of statistics
   ##   e.g. number of rounds found
   round_count = 0
-  header = opts[:header]
   if header
     league_header_found        = false
@@ -89,7 +107,8 @@ def find_schedule( opts={} )     ## change to build_schedule - why? why not???
      header_regex = /^
                       ([#]{2,4}\s+(#{header_esc}))
                         |
-                      (\*{2}(#{header_esc})\*{2})
+                      (\*{2}(#{header_esc}))     ## was: \*{2})
+                                                 ##  do not inluce trailing ** for now (allows anchors e.g. §)
                     /ix
     ## todo:
@@ -105,7 +124,7 @@ def find_schedule( opts={} )     ## change to build_schedule - why? why not???
   ## pp header_regex
-  if opts[:cup]
+  if cup
     round_regex = CUP_ROUND_REGEX   ## note: only allow final, quaterfinals, etc. if knockout cup
   else
     round_regex = LEAGUE_ROUND_REGEX
@@ -128,8 +147,10 @@ def find_schedule( opts={} )     ## change to build_schedule - why? why not???
       if line =~ header_regex
         puts "!!! bingo - found header >#{line}<"
         league_header_found = true
-        title = line.gsub( /[#*]/, '' ).strip   ##  quick hack: extract title from header
-        new_txt << "## #{title}\n\n"    # note: use header/stage title (regex group capture)
+        ## note - do NOT auto-add header/title !!!
+        # title = line.gsub( /[#*]/, '' ).strip   ##  quick hack: extract title from header
+        # new_txt << "## #{title}\n\n"    # note: use header/stage title (regex group capture)
       else
         puts "  searching for header >#{header}<; skipping line >#{line}<"
         next
@@ -205,13 +226,24 @@ def find_schedule( opts={} )     ## change to build_schedule - why? why not???
     end
   end  # each line
-  schedule = Schedule.from_string( new_txt )
-  schedule.rounds = round_count
+  ## quick hack?
+  ### auto-apply patch if patch configured
+   if @patch  && @patch.respond_to?(:on_patch)
+      url_path = URI.parse( url ).path
+      basename = File.basename( url_path, File.extname( url_path ))
+      year     = year_from_name( basename )
+      new_txt = @patch.on_patch( new_txt, basename, year )
+   end
+  schedule = Schedule.new( new_txt )
+  ## schedule.rounds = round_count
   schedule
 end  # method find_schedule
 def build_stat
   source       = nil
   authors      = nil
@@ -224,7 +256,7 @@ def build_stat
   end
   ##
-  ## fix/todo: move authors n last updated  whitespace cleanup to sanitize - why? why not??
+  ## fix/todo: move authors n last updated  whitespace cleanup  - why? why not??
   if @txt =~ /authors?:\s+(.+?)\s+last updated:\s+(\d{1,2} [a-z]{3,10} \d{4})/im
     last_updated = $2.to_s   # note: save a copy first (gets "reset" by next regex)
@@ -235,7 +267,15 @@ def build_stat
   end
   puts "*** !!! missing source"  if source.nil?
-  puts "*** !!! missing authors n last updated"   if authors.nil? || last_updated.nil?
+  puts "*** !!! missing authors and last updated"   if authors.nil? || last_updated.nil?
+  ## get year from source (url)
+  url_path  = URI.parse( source ).path
+  basename  = File.basename( url_path, File.extname( url_path ) )  ## e.g. duit92.txt or duit92.html => duit92
+  puts "   basename=>#{basename}<"
+  year      = year_from_name( basename )
   sections = []
@@ -248,26 +288,16 @@ def build_stat
     ## todo: add more patterns? how? why?
     if line =~ /####\s+(.+)/
       puts "  found section >#{$1}<"
-      sections << $1.strip
+      ## remove  anchors first   e.g.   ‹§sa› etc.
+      ##   check if anchors with underscore (_) or dash/hyphen (-) ???
+      sections << $1.sub( /‹§[a-z0-9]+›/, '' ).strip
     end
   end
-  # get path from url
-  url  = URI.parse( source )
-  ## pp url
-  ## puts url.host
-  path = url.path
-  extname  = File.extname( path )
-  basename = File.basename( path, extname )  ## e.g. duit92.txt or duit92.html => duit92
-  year     = year_from_name( basename )
-  season   = year_to_season( year )
   rec = PageStat.new
   rec.source       = source         # e.g. http://rsssf.org/tabled/duit89.html   -- use source_url - why?? why not??
-  rec.basename     = basename       # e.g. duit89
-  rec.year         = year           # e.g. 89 => 1989  -- note: always four digits
-  rec.season       = season
+  rec.year         = year
   rec.authors      = authors
   rec.last_updated = last_updated
   rec.line_count   = line_count
@@ -279,17 +309,12 @@ end  ## method build_stat
 def save( path )
-  File.open( path, 'w' ) do |f|
-    f.write @txt
-  end
+  write_text( path, @txt )
 end  ## method save
 end  ## class Page
 end  ## module Rsssf
-## add (shortcut) alias
-RsssfPageStat = Rsssf::PageStat
-RsssfPage     = Rsssf::Page

data/lib/rsssf/repo.rb CHANGED Viewed

@@ -1,174 +1,115 @@
-# encoding: utf-8
 module Rsssf
-## used by Repo#make_schedules
-ScheduleConfig = Struct.new(
-  :name,
-  :opts_for_year,  ## hash or proc ->(year){ Hash[...] }
-  :dir_for_year,  ## proc ->(year){ 'path_here'}     ## rename to path_for_year - why, why not??
-  :includes        ## array of years to include e.g. [2011,2012] etc.
-)
-ScheduleStat = Struct.new(
-  :path,          ## e.g. 2012-13 or archive/1980s/1984-85
-  :filename,      ## e.g. 1-bundesliga.txt   -- note: w/o path
-  :year,          ## e.g. 2013      -- note: numeric (integer)
-  :season,        ## e.g. 2012-13   -- note: is a string
-  :rounds         ## e.g. 36   -- note: numeric (integer)
-)
 class Repo
-  include Filters     ## e.g. sanitize, etc.
   include Utils       ## e.g. year_from_file, etc.
-def initialize( path, opts )   ## pass in title etc.
+def initialize( path, title: 'Your Title Here',
+                      patch: nil )
   @repo_path = path
-  @opts      = opts
+  @title     = title
+  @patch     = patch
 end
-def fetch_pages
-  puts "fetch_pages:"
-  cfg = YAML.load_file( "#{@repo_path}/tables/config.yml")
-  pp cfg
+def root() @repo_path; end    ## use/rename to path - why? why not?
+alias_method :root_dir, :root
-  dl_base = 'http://rsssf.com'
-  cfg.each do |k,v|
-    ## season = k   # as string e.g. 2011-12  or 2011 etc.
-    path      = v  # as string e.g. tablesd/duit2011.html
+## for now use single country repos - why? why not?
+##   add support for all-in-one repos
+def prepare_pages( code, seasons )
+  seasons.each do |season|
+    url = Rsssf.table_url( code, season: season )
-    ## note: assumes extension is .html
-    #    e.g. tablesd/duit2011.html => duit2011
-    basename = File.basename( path, '.html' )
+    ## check if not in cache
+    unless Webcache.cached?( url )
+        ## download - if not cached
+        Rsssf.download_table( code, season: season )
+    end
-    src_url   = "#{dl_base}/#{path}"
-    dest_path = "#{@repo_path}/tables/#{basename}.txt"
+    page = Page.read_cache( url )
-    page = Page.from_url( src_url )
-    page.save( dest_path )
-  end # each year
-end # method fetch_pages
+    url_path = URI.parse( url ).path
+    puts "  url = >#{url}<"
+    puts "  url_path = >#{url_path}<"
+    basename = File.basename( url_path, File.extname( url_path ))
-def make_pages_summary
-  stats = []
+    ###
+    ## check for on_prepare (apply patches)
+    if @patch && @patch.respond_to?(:on_prepare)
+       year = year_from_name( basename )
+       page.txt = @patch.on_prepare( page.txt, basename, year )
+    end
-  files = Dir[ "#{@repo_path}/tables/*.txt" ]
-  files.each do |file|
-    page = Page.from_file( file )
-    stats << page.build_stat
-  end
-  ### save report as README.md in tables/ folder in repo
-  report = PageReport.new( stats, @opts )    ## pass in title etc.
-  report.save( "#{@repo_path}/tables/README.md" )
-end  # method make_pages_summary
+    path = "#{@repo_path}/tables/#{basename}.txt"
+    page.save( path )
+  end
+end # method prepare_pages
-def make_schedules_summary( stats )   ## note: requires stats to be passed in for now
-  report = ScheduleReport.new( stats, @opts )   ## pass in title etc.
-  report.save( "#{@repo_path}/README.md" )
-end  # method make_schedules_summary
+def each_page( code, seasons, &blk )  ## use each table or such - why? why not?
+  seasons.each do |season|
+    url = Rsssf.table_url( code, season: season )
+    url_path = URI.parse( url ).path
+    puts "  url = >#{url}<"
+    puts "  url_path = >#{url_path}<"
+    basename = File.basename( url_path, File.extname( url_path ))
+    path = "#{@repo_path}/tables/#{basename}.txt"
+     page = Page.read_txt( path )
+     ## add/pass along patcher if patcher
+     if @patch
+       page.patch  = @patch
+       page.url    = url
+     end
-def patch_pages( patcher )
-  ## lets you run/use custom (repo/country-specific patches e.g. for adding/patching headings etc.)
-  patch_dir( "#{@repo_path}/tables" ) do |txt, name, year|
-    puts "patching #{year} (#{name}) (#{@repo_path})..."
-    patcher.patch( txt, name, year )    ## note: must be last (that is, must return (patcher) t(e)xt)
+    season = Season( season )
+    blk.call( season, page )
   end
-end ## method  patch_pages
-def sanitize_pages
-   ## for debugging/testing lets you (re)run sanitize  (alreay incl. in html2txt filter by default)
-   sanitize_dir( "#{@repo_path}/tables" )
 end
+def make_pages_summary
+  files = Dir.glob( "#{@repo_path}/tables/*.txt" )
+  report = PageReport.build( files, title: @title )    ## pass in title etc.
-def make_schedules( cfg )
-  ## note: return stats (for report eg. README)
-  stats = []
-  files = Dir[ "#{@repo_path}/tables/*.txt" ]
-  files.each do |file|
-## todo/check/fix:
-##   use source: prop in rsssf page - why? why not???
-##   move year/season/basename into page ???
-#
-#  assume every rsssf page has at least:
-##    - basename  e.g. duit2014
-##    - year      e.g. 2014 (numeric)
-##    - season    (derived from config lookup???) - string e.g. 2014-15 or 2014 etc.
-    extname  = File.extname( file )
-    basename = File.basename( file, extname )
-    year     = year_from_name( basename )
-    season   = year_to_season( year )
-    if cfg.includes && cfg.includes.include?( year ) == false
-      puts "   skipping #{basename}; not listed in includes"
-      next
-    end
-    puts "  reading >#{basename}<"
+  ### save report as README.md in tables/ folder in repo
+  report.save( "#{@repo_path}/tables/README.md" )
+end  # method make_pages_summary
-    page = Page.from_file( file ) # note: always assume sources (already) converted to utf-8
-    if cfg.opts_for_year.is_a?( Hash )
-      opts = cfg.opts_for_year    ## just use as is 1:1 (constant/same for all years)
-    else
-      ## assume it's a proc/lambda (call to calculate)
-      opts = cfg.opts_for_year.call( year )
-    end
-    pp opts
-    schedule = page.find_schedule( opts )
-    ## pp schedule
-    if cfg.dir_for_year.nil?
-      ## use default setting, that is, archive for dir (e.g. archive/1980s/1985-86 etc.)
-      dir_for_year = archive_dir_for_year( year )
-    else
-      ## assume it's a proc/lambda
-      dir_for_year = cfg.dir_for_year.call( year )
-    end
+def make_schedules_summary
+   ## find all match datafiles
+   args = [@repo_path]
+   files = SportDb::Parser::Opts.expand_args( args )
+   pp files
+   report = ScheduleReport.build( files, title: @title,
+                                         patch: @patch )   ## pass in title etc.
+   report.save( "#{@repo_path}/README.md" )
+end
-    ## -- cfg.name               e.g. => 1-liga
-    dest_path = "#{@repo_path}/#{dir_for_year}/#{cfg.name}.txt"
-    puts "  save to >#{dest_path}<"
-    FileUtils.mkdir_p( File.dirname( dest_path ))
-    schedule.save( dest_path )
-    rec = ScheduleStat.new
-    rec.path     = dir_for_year
-    rec.filename = "#{cfg.name}.txt"    ## change to basename - why?? why not??
-    rec.year     = year
-    rec.season   = season
-    rec.rounds   = schedule.rounds
-    stats << rec
+def patch_pages( patcher )
+  ## lets you run/use custom (repo/country-specific patches e.g. for adding/patching headings etc.)
+  patch_dir( "#{@repo_path}/tables" ) do |txt, name, year|
+    puts "patching #{year} (#{name}) (#{@repo_path})..."
+    patcher.patch( txt, name, year )    ## note: must be last (that is, must return (patcher) t(e)xt)
   end
-  stats  # return stats for reporting
-end # method make_schedules
+end ## method  patch_pages
-private
-def patch_dir( root )
-  files = Dir[ "#{root}/*.txt" ]
+def patch_dir( root, &blk )
+  files = Dir.glob( "#{root}/**/*.txt" )
   ## pp files
   ## sort files by year (latest first)
@@ -180,41 +121,24 @@ def patch_dir( root )
   end
   files.each do |file|
-    txt = File.read_utf8( file )    ## note: assumes already converted to utf-8
+    txt = read_text( file )    ## note: assumes already converted to utf-8
     basename = File.basename( file, '.txt' )  ## e.g. duit92.txt => duit92
     year     = year_from_name( basename )
-    new_txt = yield( txt, basename, year )
-    ## calculate hash to see if anything changed ?? why? why not??
+    new_txt = blk.call( txt, basename, year )
-    File.open( file, 'w' ) do |f|
-      f.write new_txt
+    ## calculate hash to see if anything changed ?? why? why not??
+    if txt != new_txt
+      puts "  patching #{file}, text changed"
+      write_text( file, new_txt )
     end
   end # each file
 end  ## patch_dir
-def sanitize_dir( root )
-  files = Dir[ "#{root}/*.txt" ]
-  files.each do |file|
-    txt = File.read_utf8( file )    ## note: assumes already converted to utf-8
-    new_txt = sanitize( txt )
-    File.open( file, 'w' ) do |f|
-      f.write new_txt
-    end
-  end # each file
-end  ## sanitize_dir
 end  ## class Repo
 end  ## module Rsssf
-## add (shortcut) alias
-RsssfRepo           = Rsssf::Repo
-RsssfScheduleConfig = Rsssf::ScheduleConfig
-RsssfScheduleStat   = Rsssf::ScheduleStat