RubyGems - rsssf - Versions diffs - 0.1.0 → 0.3.0 - Mend

rsssf 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60)

checksums.yaml +5 -5
data/{HISTORY.md → CHANGELOG.md} +4 -0
data/Manifest.txt +41 -7
data/README.md +93 -71
data/Rakefile +8 -7
data/config/groups_en.txt +44 -0
data/config/rounds_en.txt +283 -0
data/config/rounds_es.txt +20 -0
data/config/rounds_misc.txt +7 -0
data/lib/_cocos_.rb +158 -0
data/lib/rsssf/convert/convert.rb +71 -0
data/lib/rsssf/convert/errata.rb +103 -0
data/lib/rsssf/convert/html_entities.rb +150 -0
data/lib/rsssf/convert/html_to_txt/beautify_anchors.rb +96 -0
data/lib/rsssf/convert/html_to_txt/make_heading.rb +70 -0
data/lib/rsssf/convert/html_to_txt/remove_emails.rb +43 -0
data/lib/rsssf/convert/html_to_txt/replace_a_href.rb +85 -0
data/lib/rsssf/convert/html_to_txt/replace_a_name.rb +87 -0
data/lib/rsssf/convert/html_to_txt/replace_heading.rb +76 -0
data/lib/rsssf/convert/html_to_txt/replace_hr.rb +25 -0
data/lib/rsssf/convert/html_to_txt.rb +247 -0
data/lib/rsssf/download.rb +20 -0
data/lib/rsssf/fmtfix/dates.rb +541 -0
data/lib/rsssf/fmtfix/dates_helpers.rb +63 -0
data/lib/rsssf/fmtfix/errata.rb +44 -0
data/lib/rsssf/fmtfix/fmtfix-base.rb +68 -0
data/lib/rsssf/fmtfix/fmtfix.rb +101 -0
data/lib/rsssf/fmtfix/goals.rb +173 -0
data/lib/rsssf/fmtfix/headers.rb +326 -0
data/lib/rsssf/fmtfix/outline.rb +228 -0
data/lib/rsssf/fmtfix/patch_headings.rb +141 -0
data/lib/rsssf/fmtfix/rounds.rb +74 -0
data/lib/rsssf/fmtfix/score.rb +92 -0
data/lib/rsssf/fmtfix/tables.rb +316 -0
data/lib/rsssf/fmtfix/topscorers.rb +50 -0
data/lib/rsssf/page-find_schedule.rb +127 -0
data/lib/rsssf/page-meta.rb +68 -0
data/lib/rsssf/page.rb +125 -238
data/lib/rsssf/parse_schedules.rb +34 -0
data/lib/rsssf/prepare/convert-links.rb +77 -0
data/lib/rsssf/prepare/convert-meta.rb +111 -0
data/lib/rsssf/prepare/convert-navlines.rb +154 -0
data/lib/rsssf/prepare/convert-postproc.rb +141 -0
data/lib/rsssf/prepare/convert.rb +100 -0
data/lib/rsssf/prepare/download.rb +40 -0
data/lib/rsssf/project.rb +154 -0
data/lib/rsssf/reports/page.rb +66 -23
data/lib/rsssf/reports/schedule.rb +89 -40
data/lib/rsssf/schedule.rb +4 -14
data/lib/rsssf/utils.rb +37 -45
data/lib/rsssf/version.rb +7 -6
data/lib/rsssf.rb +82 -19
metadata +68 -26
data/.gemtest +0 -0
data/lib/rsssf/fetch.rb +0 -80
data/lib/rsssf/html2txt.rb +0 -157
data/lib/rsssf/patch.rb +0 -28
data/lib/rsssf/repo.rb +0 -220
data/test/helper.rb +0 -12
data/test/test_utils.rb +0 -83

data/lib/rsssf/fetch.rb DELETED Viewed

@@ -1,80 +0,0 @@
-# encoding: utf-8
-module Rsssf
-class PageFetcher
-  include Filters   # e.g. html2text, sanitize etc.
-def initialize
-  @worker = Fetcher::Worker.new
-end
-def fetch( src_url )
-  ## note: assume plain 7-bit ascii for now
-  ##  -- assume rsssf uses ISO_8859_15 (updated version of ISO_8859_1) -- does NOT use utf-8 character encoding!!!
-  html = @worker.read( src_url )
-  ### todo/fix: first check if html is all ascii-7bit e.g.
-  ## includes only chars from 0 to 127!!!
-  ## normalize newlines
-  ##   remove \r (carriage return) used by Windows; just use \n (new line)
-  html = html.gsub( "\r", '' )
-  ## note:
-  ##   assume (default) to ISO 8859-15 (an updated version of ISO 8859-1) for now
-  ##
-  ##  other possible alternatives - try:
-  ##  - Windows CP 1252  or
-  ##  - ISO 8859-2  (for eastern european languages )
-  ##
-  ## note: german umlaut use the same code (int)
-  ##    in ISO 8859-1/15 and 2 and Windows CP1252  (other chars ARE different!!!)
-  html = html.force_encoding( Encoding::ISO_8859_15 )
-  html = html.encode( Encoding::UTF_8 )    # try conversion to utf-8
-  ## check for html entities
-  html = html.gsub( "&auml;", 'ä' )
-  html = html.gsub( "&ouml;", 'ö' )
-  html = html.gsub( "&uuml;", 'ü' )
-  html = html.gsub( "&Auml;", 'Ä' )
-  html = html.gsub( "&Ouml;", 'Ö' )
-  html = html.gsub( "&Uuml;", 'Ü' )
-  html = html.gsub( "&szlig;", 'ß' )
-  html = html.gsub( "&oulm;", 'ö' )    ## support typo in entity (&ouml;)
-  html = html.gsub( "&slig;", "ß" )    ## support typo in entity (&szlig;)
-  html = html.gsub( "&Eacute;", 'É' )
-  html = html.gsub( "&oslash;", 'ø' )
-  ## check for more entities
-  html = html.gsub( /&[^;]+;/) do |match|
-    puts "*** found unencoded html entity #{match}"
-    match   ## pass through as is (1:1)
-  end
-  ## todo/fix: add more entities
-  txt   = html_to_txt( html )
-  header = <<EOS
-<!--
-   source: #{src_url}
-  -->
-EOS
-  header+txt  ## return txt w/ header
-end  ## method fetch
-end  ## class PageFetcher
-end  ## module Rsssf
-## add (shortcut) alias
-RsssfPageFetcher = Rsssf::PageFetcher

data/lib/rsssf/html2txt.rb DELETED Viewed

@@ -1,157 +0,0 @@
-# encoding: utf-8
-module Rsssf
-module Filters
-def html_to_txt( html )
-###
-#   todo: check if any tags (still) present??
-  ## cut off everything before body
-  html = html.sub( /.+?<BODY>\s*/im, '' )
-  ## cut off everything after body (closing)
-  html = html.sub( /<\/BODY>.*/im, '' )
-  ## remove cite
-  html = html.gsub( /<CITE>([^<]+)<\/CITE>/im ) do |_|
-    puts " remove cite >#{$1}<"
-    "#{$1}"
-  end
-  html = html.gsub( /\s*<HR>\s*/im ) do |match|
-    match = match.gsub( "\n", '$$' )  ## make newlines visible for debugging
-    puts " replace horizontal rule (hr) - >#{match}<"
-    "\n=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n"    ## check what hr to use use  - . - . - or =-=-=-= or somehting distinct?
-  end
-  ## replace break (br)
-  ## note: do NOT use m/multiline for now - why? why not??
-  html = html.gsub( /<BR>\s*/i ) do |match|    ## note: include (swallow) "extra" newline
-    match = match.gsub( "\n", '$$' )  ## make newlines visible for debugging
-    puts " replace break (br) - >#{match}<"
-    "\n"
-  end
-  ## remove anchors (a name)
-  html = html.gsub( /<A NAME[^>]*>(.+?)<\/A>/im ) do |match|   ## note: use .+? non-greedy match
-    title = $1.to_s   ## note: "save" caputure first; gets replaced by gsub (next regex call)
-    match = match.gsub( "\n", '$$' )  ## make newlines visible for debugging
-    puts " replace anchor (a) name >#{title}< - >#{match}<"
-    "#{title}"
-  end
-  ## remove anchors (a href)
-  #    note: heading 4 includes anchor (thus, let anchors go first)
-  #  note: <a \newline href is used for authors email - thus incl. support for newline as space
-  html = html.gsub( /<A\s+HREF[^>]*>(.+?)<\/A>/im ) do |_|   ## note: use .+? non-greedy match
-    puts " replace anchor (a) href >#{$1}<"
-    "‹#{$1}›"
-  end
-  ## replace paragraph (p)
-  html = html.gsub( /\s*<P>\s*/im ) do |match|    ## note: include (swallow) "extra" newline
-    match = match.gsub( "\n", '$$' )  ## make newlines visible for debugging
-    puts " replace paragraph (p) - >#{match}<"
-    "\n\n"
-  end
-  html = html.gsub( /<\/P>/i, '' )  ## replace paragraph (p) closing w/ nothing for now
-  ## remove i
-  html = html.gsub( /<I>([^<]+)<\/I>/im ) do |_|
-    puts " remove italic (i) >#{$1}<"
-    "#{$1}"
-  end
-  ## heading 2
-  html = html.gsub( /\s*<H2>([^<]+)<\/H2>\s*/im ) do |_|
-    puts " replace heading 2 (h2) >#{$1}<"
-    "\n\n## #{$1}\n\n"    ## note: make sure to always add two newlines
-  end
-  ## heading 4
-  html = html.gsub( /\s*<H4>([^<]+)<\/H4>\s*/im ) do |_|
-    puts " replace heading 4 (h4) >#{$1}<"
-    "\n\n#### #{$1}\n\n"    ## note: make sure to always add two newlines
-  end
-  ## replace b (bold) w/ markdown **bold**  - note: might include anchors (thus, call after anchors)
-  html = html.gsub( /<B>([^<]+)<\/B>/im ) do |_|
-    puts " remove bold (b) >#{$1}<"
-    "**#{$1}**"
-  end
-  ## replace preformatted (pre)
-  html = html.gsub( /<PRE>|<\/PRE>/i ) do |_|
-    puts " replace preformatted (pre)"
-    ''  # replace w/ nothing for now (keep surrounding newlines)
-  end
-=begin
-  puts
-  puts
-  puts "html:"
-  puts html[0..2000]
-  puts "-- snip --"
-  puts html[-1000..-1]   ## print last hundred chars
-=end
-  ## cleanup whitespaces
-  ##   todo/fix:  convert newlines into spaces first
-  ##                and then collapse spaces etc.!!!
-  txt = ''
-  html.each_line do |line|
-     line = line.gsub( "\t", '  ' ) # replace all tabs w/ two spaces for nwo
-     line = line.rstrip             # remove trailing whitespace (incl. newline/formfeed)
-     txt << line
-     txt << "\n"
-  end
-  ### remove emails etc.
-  txt = sanitize( txt )
-  txt
-end # method html_to_text
-def sanitize( txt )
-  ### remove emails for (spam/privacy) protection
-  ## e.g. (selamm@example.es)
-  ##      (buuu@mscs.dal.ca)
-  ##      (kaxx@rsssf.com)
-  ##      (Manu_Maya@yakoo.co)
-  ##   note add support for optional ‹› enclosure (used by html2txt converted a href :mailto links)
-  ##   e.g. (‹selamm@example.es›)
-  email_pattern = "\\(‹?[a-z][a-z0-9_]+@[a-z]+(\\.[a-z]+)+›?\\)"   ## note: just a string; needs to escape \\ twice!!!
-  ## check for "free-standing e.g. on its own line" emails only for now
-  txt = txt.gsub( /\n#{email_pattern}\n/i ) do |match|
-    puts "removing (free-standing) email >#{match}<"
-    "\n"   # return empty line
-  end
-  txt = txt.gsub( /#{email_pattern}/i ) do |match|
-    puts "remove email >#{match}<"
-    ''
-  end
-  txt
-end # method sanitize
-end # module Filters
-end # module Rsssf
-## add (shortcut) alias
-RsssfFilters = Rsssf::Filters

data/lib/rsssf/patch.rb DELETED Viewed

@@ -1,28 +0,0 @@
-# encoding: utf-8
-module Rsssf
-class Patcher
-## e.g. 2008/09
-##   note: also support 1999/2000
-SEASON = '\d{4}\/(\d{2}|\d{4})'  ## note: use single quotes - quotes do NOT get escaped (e.g. '\d' => "\\d")
-def patch_heading( txt, rxs, title )
-  rxs.each do |rx|
-    txt = txt.sub( rx ) do |match|
-      match = match.gsub( "\n", '$$')  ## change newlines to $$ for single-line outputs/dumps
-      puts "  found heading >#{match}<"
-      "\n\n#### #{title}\n\n"
-    end
-  end
-  txt
-end
-end # class Patcher
-end  ## module Rsssf
-## add (shortcut) alias
-RsssfPatcher = Rsssf::Patcher

data/lib/rsssf/repo.rb DELETED Viewed

@@ -1,220 +0,0 @@
-# encoding: utf-8
-module Rsssf
-## used by Repo#make_schedules
-ScheduleConfig = Struct.new(
-  :name,
-  :opts_for_year,  ## hash or proc ->(year){ Hash[...] }
-  :dir_for_year,  ## proc ->(year){ 'path_here'}     ## rename to path_for_year - why, why not??
-  :includes        ## array of years to include e.g. [2011,2012] etc.
-)
-ScheduleStat = Struct.new(
-  :path,          ## e.g. 2012-13 or archive/1980s/1984-85
-  :filename,      ## e.g. 1-bundesliga.txt   -- note: w/o path
-  :year,          ## e.g. 2013      -- note: numeric (integer)
-  :season,        ## e.g. 2012-13   -- note: is a string
-  :rounds         ## e.g. 36   -- note: numeric (integer)
-)
-class Repo
-  include Filters     ## e.g. sanitize, etc.
-  include Utils       ## e.g. year_from_file, etc.
-def initialize( path, opts )   ## pass in title etc.
-  @repo_path = path
-  @opts      = opts
-end
-def fetch_pages
-  puts "fetch_pages:"
-  cfg = YAML.load_file( "#{@repo_path}/tables/config.yml")
-  pp cfg
-  dl_base = 'http://rsssf.com'
-  cfg.each do |k,v|
-    ## season = k   # as string e.g. 2011-12  or 2011 etc.
-    path      = v  # as string e.g. tablesd/duit2011.html
-    ## note: assumes extension is .html
-    #    e.g. tablesd/duit2011.html => duit2011
-    basename = File.basename( path, '.html' )
-    src_url   = "#{dl_base}/#{path}"
-    dest_path = "#{@repo_path}/tables/#{basename}.txt"
-    page = Page.from_url( src_url )
-    page.save( dest_path )
-  end # each year
-end # method fetch_pages
-def make_pages_summary
-  stats = []
-  files = Dir[ "#{@repo_path}/tables/*.txt" ]
-  files.each do |file|
-    page = Page.from_file( file )
-    stats << page.build_stat
-  end
-  ### save report as README.md in tables/ folder in repo
-  report = PageReport.new( stats, @opts )    ## pass in title etc.
-  report.save( "#{@repo_path}/tables/README.md" )
-end  # method make_pages_summary
-def make_schedules_summary( stats )   ## note: requires stats to be passed in for now
-  report = ScheduleReport.new( stats, @opts )   ## pass in title etc.
-  report.save( "#{@repo_path}/README.md" )
-end  # method make_schedules_summary
-def patch_pages( patcher )
-  ## lets you run/use custom (repo/country-specific) patches, e.g. for adding/patching headings etc.
-  patch_dir( "#{@repo_path}/tables" ) do |txt, name, year|
-    puts "patching #{year} (#{name}) (#{@repo_path})..."
-    patcher.patch( txt, name, year )    ## note: must be last (that is, must return (patcher) t(e)xt)
-  end
-end ## method  patch_pages
-def sanitize_pages
-   ## for debugging/testing lets you (re)run sanitize  (already incl. in html2txt filter by default)
-   sanitize_dir( "#{@repo_path}/tables" )
-end
-def make_schedules( cfg )
-  ## note: return stats (for report eg. README)
-  stats = []
-  files = Dir[ "#{@repo_path}/tables/*.txt" ]
-  files.each do |file|
-## todo/check/fix:
-##   use source: prop in rsssf page - why? why not???
-##   move year/season/basename into page ???
-#
-#  assume every rsssf page has at least:
-##    - basename  e.g. duit2014
-##    - year      e.g. 2014 (numeric)
-##    - season    (derived from config lookup???) - string e.g. 2014-15 or 2014 etc.
-    extname  = File.extname( file )
-    basename = File.basename( file, extname )
-    year     = year_from_name( basename )
-    season   = year_to_season( year )
-    if cfg.includes && cfg.includes.include?( year ) == false
-      puts "   skipping #{basename}; not listed in includes"
-      next
-    end
-    puts "  reading >#{basename}<"
-    page = Page.from_file( file ) # note: always assume sources (already) converted to utf-8
-    if cfg.opts_for_year.is_a?( Hash )
-      opts = cfg.opts_for_year    ## just use as is 1:1 (constant/same for all years)
-    else
-      ## assume it's a proc/lambda (call to calculate)
-      opts = cfg.opts_for_year.call( year )
-    end
-    pp opts
-    schedule = page.find_schedule( opts )
-    ## pp schedule
-    if cfg.dir_for_year.nil?
-      ## use default setting, that is, archive for dir (e.g. archive/1980s/1985-86 etc.)
-      dir_for_year = archive_dir_for_year( year )
-    else
-      ## assume it's a proc/lambda
-      dir_for_year = cfg.dir_for_year.call( year )
-    end
-    ## -- cfg.name               e.g. => 1-liga
-    dest_path = "#{@repo_path}/#{dir_for_year}/#{cfg.name}.txt"
-    puts "  save to >#{dest_path}<"
-    FileUtils.mkdir_p( File.dirname( dest_path ))
-    schedule.save( dest_path )
-    rec = ScheduleStat.new
-    rec.path     = dir_for_year
-    rec.filename = "#{cfg.name}.txt"    ## change to basename - why?? why not??
-    rec.year     = year
-    rec.season   = season
-    rec.rounds   = schedule.rounds
-    stats << rec
-  end
-  stats  # return stats for reporting
-end # method make_schedules
-private
-def patch_dir( root )
-  files = Dir[ "#{root}/*.txt" ]
-  ## pp files
-  ## sort files by year (latest first)
-  files = files.sort do |l,r|
-    lyear = year_from_file( l )
-    ryear = year_from_file( r )
-    ryear <=> lyear
-  end
-  files.each do |file|
-    txt = File.read_utf8( file )    ## note: assumes already converted to utf-8
-    basename = File.basename( file, '.txt' )  ## e.g. duit92.txt => duit92
-    year     = year_from_name( basename )
-    new_txt = yield( txt, basename, year )
-    ## calculate hash to see if anything changed ?? why? why not??
-    File.open( file, 'w' ) do |f|
-      f.write new_txt
-    end
-  end # each file
-end  ## patch_dir
-def sanitize_dir( root )
-  files = Dir[ "#{root}/*.txt" ]
-  files.each do |file|
-    txt = File.read_utf8( file )    ## note: assumes already converted to utf-8
-    new_txt = sanitize( txt )
-    File.open( file, 'w' ) do |f|
-      f.write new_txt
-    end
-  end # each file
-end  ## sanitize_dir
-end  ## class Repo
-end  ## module Rsssf
-## add (shortcut) alias
-RsssfRepo           = Rsssf::Repo
-RsssfScheduleConfig = Rsssf::ScheduleConfig
-RsssfScheduleStat   = Rsssf::ScheduleStat

data/test/helper.rb DELETED Viewed

@@ -1,12 +0,0 @@
-## $:.unshift(File.dirname(__FILE__))
-## minitest setup
-require 'minitest/autorun'
-## our own code
-require 'rsssf'

data/test/test_utils.rb DELETED Viewed

@@ -1,83 +0,0 @@
-# encoding: utf-8
-require 'helper'
-class TestUtils < MiniTest::Test
-  include RsssfUtils    ## e.g. year_from_name etc.
-  def test_year
-    ###########
-    ## year_from_name
-    ##    note: num <= 16   - assume 20xx for now from 00..16
-    ##                      -  else  19xx
-    assert_equal 2000, year_from_name( 'duit00' )
-    assert_equal 2016, year_from_name( 'duit16' )
-    assert_equal 1999, year_from_name( 'duit99' )
-    assert_equal 2001, year_from_name( 'duit2001' )
-    assert_equal 1964, year_from_name( 'duit64' )
-    assert_equal 1965, year_from_name( 'duit1965' )
-    assert_equal 2011, year_from_name( 'duit2011' )
-    ####
-    # year_from_file
-    assert_equal 2000, year_from_file( 'duit00.txt' )
-    assert_equal 2000, year_from_file( 'duit00.html' )
-    assert_equal 2000, year_from_file( './duit00.txt' )
-    assert_equal 2000, year_from_file( 'xxx/1998/xxx/duit00.txt' )
-    assert_equal 2016, year_from_file( 'duit16.txt' )
-    assert_equal 2016, year_from_file( 'duit16.html' )
-    assert_equal 2001, year_from_file( 'duit2001.txt' )
-    assert_equal 2001, year_from_file( 'duit2001.html' )
-    assert_equal 2001, year_from_file( 'xx/1990s/1997/xxx/duit2001.txt' )
-    assert_equal 2000, year_from_file( 'de-deutschland/tables/duit00.txt' )
-    assert_equal 1964, year_from_file( 'de-deutschland/62/tables/duit64.txt' )    # check w/ numbers in path
-    assert_equal 1999, year_from_file( 'de-deutschland/1977/tables/duit99.txt' )  # check w/ numbers in path
-    assert_equal 1965, year_from_file( 'de-deutschland/tables/duit1965.txt' )
-    assert_equal 2011, year_from_file( 'de-deutschland/tables/duit2011.txt' )
-    assert_equal 2000, year_from_file( 'de-deutschland/tables/duit00.html' )
-    assert_equal 1964, year_from_file( 'de-deutschland/62/tables/duit64.html' )    # check w/ numbers in path
-    assert_equal 1999, year_from_file( 'de-deutschland/1977/tables/duit99.html' )  # check w/ numbers in path
-    assert_equal 1965, year_from_file( 'de-deutschland/tables/duit1965.html' )
-    assert_equal 2011, year_from_file( 'de-deutschland/tables/duit2011.html' )
-    #####
-    ## year_to_season
-    assert_equal '1998-99', year_to_season( 1999 )
-    assert_equal '1999-00', year_to_season( 2000 )   ## todo: use 1999-2000 - why? why not??
-    assert_equal '2000-01', year_to_season( 2001 )
-    assert_equal '2014-15', year_to_season( 2015 )
-    assert_equal '1999-00', year_to_season( 0 )
-    assert_equal '1963-64', year_to_season( 64 )
-    assert_equal '1998-99', year_to_season( 99 )
-    assert_equal '1964-65', year_to_season( 1965 )
-    assert_equal '2010-11', year_to_season( 2011 )
-    #######
-    ## archive_dir_for_year
-    ##  note:  year <= 2010  use season 2009-10
-    assert_equal 'archive/1990s/1998-99', archive_dir_for_year( 1999 )
-    assert_equal 'archive/2000s/2000-01', archive_dir_for_year( 2001 )
-    assert_equal '2014-15',               archive_dir_for_year( 2015 )
-    assert true  ## everything ok if get here
-  end
-end # class TestUtils