rsssf 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/{HISTORY.md → CHANGELOG.md} +4 -0
- data/Manifest.txt +41 -7
- data/README.md +93 -71
- data/Rakefile +8 -7
- data/config/groups_en.txt +44 -0
- data/config/rounds_en.txt +283 -0
- data/config/rounds_es.txt +20 -0
- data/config/rounds_misc.txt +7 -0
- data/lib/_cocos_.rb +158 -0
- data/lib/rsssf/convert/convert.rb +71 -0
- data/lib/rsssf/convert/errata.rb +103 -0
- data/lib/rsssf/convert/html_entities.rb +150 -0
- data/lib/rsssf/convert/html_to_txt/beautify_anchors.rb +96 -0
- data/lib/rsssf/convert/html_to_txt/make_heading.rb +70 -0
- data/lib/rsssf/convert/html_to_txt/remove_emails.rb +43 -0
- data/lib/rsssf/convert/html_to_txt/replace_a_href.rb +85 -0
- data/lib/rsssf/convert/html_to_txt/replace_a_name.rb +87 -0
- data/lib/rsssf/convert/html_to_txt/replace_heading.rb +76 -0
- data/lib/rsssf/convert/html_to_txt/replace_hr.rb +25 -0
- data/lib/rsssf/convert/html_to_txt.rb +247 -0
- data/lib/rsssf/download.rb +20 -0
- data/lib/rsssf/fmtfix/dates.rb +541 -0
- data/lib/rsssf/fmtfix/dates_helpers.rb +63 -0
- data/lib/rsssf/fmtfix/errata.rb +44 -0
- data/lib/rsssf/fmtfix/fmtfix-base.rb +68 -0
- data/lib/rsssf/fmtfix/fmtfix.rb +101 -0
- data/lib/rsssf/fmtfix/goals.rb +173 -0
- data/lib/rsssf/fmtfix/headers.rb +326 -0
- data/lib/rsssf/fmtfix/outline.rb +228 -0
- data/lib/rsssf/fmtfix/patch_headings.rb +141 -0
- data/lib/rsssf/fmtfix/rounds.rb +74 -0
- data/lib/rsssf/fmtfix/score.rb +92 -0
- data/lib/rsssf/fmtfix/tables.rb +316 -0
- data/lib/rsssf/fmtfix/topscorers.rb +50 -0
- data/lib/rsssf/page-find_schedule.rb +127 -0
- data/lib/rsssf/page-meta.rb +68 -0
- data/lib/rsssf/page.rb +125 -238
- data/lib/rsssf/parse_schedules.rb +34 -0
- data/lib/rsssf/prepare/convert-links.rb +77 -0
- data/lib/rsssf/prepare/convert-meta.rb +111 -0
- data/lib/rsssf/prepare/convert-navlines.rb +154 -0
- data/lib/rsssf/prepare/convert-postproc.rb +141 -0
- data/lib/rsssf/prepare/convert.rb +100 -0
- data/lib/rsssf/prepare/download.rb +40 -0
- data/lib/rsssf/project.rb +154 -0
- data/lib/rsssf/reports/page.rb +66 -23
- data/lib/rsssf/reports/schedule.rb +89 -40
- data/lib/rsssf/schedule.rb +4 -14
- data/lib/rsssf/utils.rb +37 -45
- data/lib/rsssf/version.rb +7 -6
- data/lib/rsssf.rb +82 -19
- metadata +68 -26
- data/.gemtest +0 -0
- data/lib/rsssf/fetch.rb +0 -80
- data/lib/rsssf/html2txt.rb +0 -157
- data/lib/rsssf/patch.rb +0 -28
- data/lib/rsssf/repo.rb +0 -220
- data/test/helper.rb +0 -12
- data/test/test_utils.rb +0 -83
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
module Rsssf
|
|
2
|
+
class Prep ## todo: find a better name e.g. BatchPrep or ??
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
=begin
|
|
7
|
+
|
|
8
|
+
todo - remove all "trailing" nav links in section
|
|
9
|
+
|
|
10
|
+
‹1974/75, see page oost75›.
|
|
11
|
+
|
|
12
|
+
‹1976/77, see page oost77›.
|
|
13
|
+
|
|
14
|
+
‹list of final tables, see page oosthist›.
|
|
15
|
+
|
|
16
|
+
‹list of champions, see page oostchamp›.
|
|
17
|
+
|
|
18
|
+
‹list of cup finals, see page oostcuphist›.
|
|
19
|
+
|
|
20
|
+
‹list of super cup finals, see page oostsupcuphist›.
|
|
21
|
+
|
|
22
|
+
‹list of foundation dates, see page oostfound›.
|
|
23
|
+
=end
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def strip_navlines( lines, heading: true )
|
|
29
|
+
## note - expects an array of lines (e.g. txt.lines!!!)
|
|
30
|
+
|
|
31
|
+
newlines = []
|
|
32
|
+
navlines = []
|
|
33
|
+
body = false ## hit/seen body?
|
|
34
|
+
lines.each_with_index do |line,lineno|
|
|
35
|
+
## check for optional leading heading line
|
|
36
|
+
## note - first line is heading
|
|
37
|
+
## (only optional for first section)
|
|
38
|
+
if heading && lineno == 0 && line.lstrip.start_with?( '==' )
|
|
39
|
+
newlines << line
|
|
40
|
+
next
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
## possibly remove leading nav link lines
|
|
44
|
+
if !body
|
|
45
|
+
if line.strip.empty?
|
|
46
|
+
newlines << line
|
|
47
|
+
next
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
## remove leading nav link lines only
|
|
51
|
+
newline = line.strip.gsub( /‹.+?›/, '' )
|
|
52
|
+
## check what's left over?
|
|
53
|
+
## if only space or pipe (|) or dot (.) than remove
|
|
54
|
+
if newline.match?( %r{\A
|
|
55
|
+
[ |.]*
|
|
56
|
+
\z}ix )
|
|
57
|
+
## puts " removing nav line #{line}"
|
|
58
|
+
navlines << line
|
|
59
|
+
## eat-up; record edit
|
|
60
|
+
else
|
|
61
|
+
body = true
|
|
62
|
+
newlines << line
|
|
63
|
+
end
|
|
64
|
+
else
|
|
65
|
+
newlines << line
|
|
66
|
+
end
|
|
67
|
+
end # each line
|
|
68
|
+
|
|
69
|
+
[newlines,navlines]
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def proc_navlines_by_sections( txt )
|
|
76
|
+
|
|
77
|
+
edits = []
|
|
78
|
+
|
|
79
|
+
###
|
|
80
|
+
## remove remaing nav html elements
|
|
81
|
+
## <MENU></MENU>
|
|
82
|
+
## <UL></UL>
|
|
83
|
+
## <LI></LI>
|
|
84
|
+
|
|
85
|
+
tags = []
|
|
86
|
+
txt = txt.gsub( %r{ <MENU> | </MENU>
|
|
87
|
+
| <UL> | </UL>
|
|
88
|
+
| <LI> | </LI>
|
|
89
|
+
}ix ) do |match|
|
|
90
|
+
tags << match
|
|
91
|
+
''
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
if tags.size > 0
|
|
95
|
+
edit = String.new
|
|
96
|
+
edit += "-- removed #{tags.size} remaining nav html element(s):\n"
|
|
97
|
+
edit += tags.join( ' ')
|
|
98
|
+
|
|
99
|
+
puts edit
|
|
100
|
+
|
|
101
|
+
edits << edit ## record edit
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
sections = txt.split( %r{^
|
|
106
|
+
(?= [ ]* ={2,} [ ]*
|
|
107
|
+
[\p{L}0-9] ## one letter or digit required
|
|
108
|
+
)
|
|
109
|
+
}ix
|
|
110
|
+
)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
newsections = []
|
|
114
|
+
sections.each_with_index do |sect,sectno|
|
|
115
|
+
newlines, navlines = strip_navlines( sect.lines, heading: true )
|
|
116
|
+
|
|
117
|
+
if navlines.size > 0
|
|
118
|
+
edit = String.new
|
|
119
|
+
edit += "-- removing #{navlines.size} leading nav line(s) in section #{sectno+1}:\n"
|
|
120
|
+
edit += navlines.join
|
|
121
|
+
puts edit
|
|
122
|
+
|
|
123
|
+
edits << edit
|
|
124
|
+
end
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
## special check for last section
|
|
128
|
+
if sectno+1 == sections.size
|
|
129
|
+
## reverse lines
|
|
130
|
+
## and remove trailing navlines until hitting body
|
|
131
|
+
## note - set heading to false
|
|
132
|
+
newlines, navlines = strip_navlines( newlines.reverse, heading: false )
|
|
133
|
+
newlines = newlines.reverse
|
|
134
|
+
navlines = navlines.reverse
|
|
135
|
+
|
|
136
|
+
if navlines.size > 0
|
|
137
|
+
edit = String.new
|
|
138
|
+
edit += "-- removing #{navlines.size} trailing nav line(s) in last section #{sectno+1}:\n"
|
|
139
|
+
edit += navlines.join
|
|
140
|
+
puts edit
|
|
141
|
+
|
|
142
|
+
edits << edit
|
|
143
|
+
end
|
|
144
|
+
end
|
|
145
|
+
|
|
146
|
+
newsections << newlines.join
|
|
147
|
+
end # each section
|
|
148
|
+
|
|
149
|
+
[newsections.join, edits]
|
|
150
|
+
end
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
end ## class Prep
|
|
154
|
+
end ## module Rsssf
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
module Rsssf
  class Prep   ## todo: find a better name e.g. BatchPrep or ??

    ###
    ## Matches the trailing "about this document" meta backmatter, e.g.
    ##   == About this document ‹§about›
    ##
    ## note - anchored with \A to the start of the string
    START_WITH_ABOUT_RE = %r{ \A
                              [ \n]*        ## trailing spaces or blank lines
                              ={2,} [ ]* About [ ]+ this [ ]+ document
                              .*?
                            }ix

    ###
    ## Matches "custom" sections (removed by title), e.g.
    ##   === Index of groups
    START_WITH_CUSTOM_RE = %r{ \A
                               [ \n]*       ## trailing spaces or blank lines
                               ={2,}
                               [ ]*
                               (?<title>
                                  Index [ ] of [ ] groups
                               )
                               [ ]*
                               $
                             }ix

    ##
    ## todo - fix
    ##   remove all menu, ul, li tags etc. BEFORE the nav check -
    ##   see https://rsssf.github.io/tables/2014q.html as an example!!!
    START_WITH_NAV_RE = %r{ \A
                            [ \n]*          ## trailing spaces or blank lines
                            ‹.+?›           ## link (excludes named anchor - why? why not? §)
                          }ix


    ## Post-processes a converted .txt page:
    ##   - splits on horizontal rules (hrs) and drops nav sections,
    ##     "custom" sections (by title) and the about backmatter
    ##   - removes pre comment markers
    ##   - strips leading & trailing nav(igation) lines per section
    ##
    ## Returns [txt, edits, links, about]:
    ##   txt   - the cleaned-up text
    ##   edits - recorded removals (empty array if no edits!!)
    ##   links - collected ‹...› links (NOT collected in the about section)
    ##   about - the about section (or nil if none found)
    def postproc_page( txt, basename:, dirname: )
      edits = []   ## record edits in its own txt file
      links = []
      about = nil

      ###
      ## step 1 - split by horizontal rules (hrs) and remove
      ##   navigation sections starting with links e.g.
      ##     ‹Bundesliga, see §bund›
      sects = txt.split( /^=-=-=-=-=-=-=-=-=-=-=-=-=-=-=$/ )

      sects = sects.select do |sect|
        if START_WITH_NAV_RE.match?( sect )
          links += collect_links( sect, basename: basename,
                                        dirname: dirname )

          edit = "-- removing nav(igation) section:" + sect
          puts edit
          edits << edit   ## record edit

          false   ## remove section
        elsif (m = START_WITH_CUSTOM_RE.match( sect ))
          links += collect_links( sect, basename: basename,
                                        dirname: dirname )

          edit = "-- removing custom section with title >#{m[:title]}<:" + sect
          puts edit
          edits << edit   ## record edit

          false   ## remove section
        elsif START_WITH_ABOUT_RE.match?( sect )
          ## note - do NOT collect links in about section!!!
          about = sect
          false   ## remove (about) section
        else
          links += collect_links( sect, basename: basename,
                                        dirname: dirname )
          true    ## keep section
        end
      end

      ## note - every hr becomes a blank line on re-join
      txt = sects.join( "\n\n" )

      ###
      ## remove pre comment markers
      txt = txt.gsub( "<!-- start pre -->\n", '' )
      txt = txt.gsub( "<!-- end pre -->\n", '' )

      ## try to remove leading and trailing nav(igation) lines
      txt, more_edits = proc_navlines_by_sections( txt )
      edits += more_edits

      ## note - return (new) txt AND recorded edits (& erratas)
      [txt, edits, links, about]
    end

  end ## class Prep
end ## module Rsssf
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
|
|
2
|
+
module Rsssf
  class Prep   ## todo: find a better name e.g. BatchPrep or ??

    ## convenience helper - convert via a "shared" built-in prep
    def self.convert_pages( pages, outdir: )
      @@prep ||= new
      @@prep.convert_pages( pages, outdir: outdir )
    end


    ## Converts (cached) html pages to txt and writes the result -
    ## plus .edits.txt / .links.txt / .about.txt companion files -
    ## into outdir.
    ##
    ## pages  - array of config hashes with a 'page' entry (path on rsssf.org)
    ## outdir - output root directory
    def convert_pages( pages, outdir: )
      pages.each_with_index do |config, idx|
        puts
        puts "==> [#{idx+1}/#{pages.size}] converting #{config.pretty_inspect}..."

        page = config['page']
        url  = "https://rsssf.org/#{page}"

        html = Webcache.read( url )

        edits = []   ## collected edits (recorded in its own .edits.txt file)

        txt, more_edits = PageConverter.convert( html, url: url )
        edits += more_edits

        basename = File.basename( page, File.extname( page ))
        dirname  = File.dirname( page )

        ##
        ## post-process .txt page
        txt, more_edits, links, about = postproc_page( txt, basename: basename,
                                                            dirname:  dirname )
        edits += more_edits

        title = find_title( html ) || 'n/a'
        authors, updated = about ? find_author_n_date( about ) : [nil,nil]

        header_props = <<~EOS
          title: #{title}
          source: #{url}
        EOS

        if authors && updated
          ## assume plural if "and" or comma (,) present
          header_props += if /\band\b|,/i.match( authors )
                            " authors: #{authors}\n"
                          else
                            " author: #{authors}\n"
                          end
          header_props += " updated: #{updated}"
        end

        header = <<~EOS
          <!--
          #{header_props}
          -->
        EOS

        ## note - (auto-) add (comment) header to written out txt!!!
        write_text( "#{outdir}/#{dirname}/#{basename}.txt", header+txt )

        ## todo/check - delete edits file if no edits - why? why not?
        if edits.size > 0
          write_text( "#{outdir}/#{dirname}/#{basename}.edits.txt", edits.join("\n") )
        end

        ## todo/check - delete links file if no links - why? why not?
        if links.size > 0
          buf = links.map do |link_title, pageref|
            "#{'%-30s' % pageref} : #{link_title}"
          end.join( "\n" )
          write_text( "#{outdir}/#{dirname}/#{basename}.links.txt", buf )
        end

        ## todo/check - delete about file if no about - why? why not?
        if about
          write_text( "#{outdir}/#{dirname}/#{basename}.about.txt", about )
        end
      end
    end

  end ## class Prep
end ## module Rsssf
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
|
|
2
|
+
module Rsssf
  class Prep   ## todo: find a better name e.g. BatchPrep or ??

    ## convenience helper - download via a "shared" built-in prep
    def self.download_pages( pages, force: )
      @@prep ||= new
      @@prep.download_pages( pages, force: force )
    end


    ## Downloads all pages into the web cache.
    ##
    ## pages - array of config hashes with 'page' (path on rsssf.org)
    ##         and an optional 'encoding' entry
    ## force - if true, always (re)download, even on a cache hit
    def download_pages( pages, force: )
      pages.each_with_index do |config, i|
        ## todo / double check - fix read_csv upstream
        ##   if the empty column has a comment it is "" (empty string),
        ##   otherwise it is nil!!! ??
        encoding = config['encoding']
        ## fall back to the de-facto rsssf.org default encoding
        encoding = 'windows-1252' if encoding.nil? || encoding.empty?

        page = config['page']
        url  = "https://rsssf.org/#{page}"

        ## check if not in cache
        ## note - use force: true to always (force) download
        ## (was: force == false - avoid explicit boolean comparison)
        if Webcache.cached?( url ) && !force
          puts " CACHE HIT - #{url}"
        else
          puts "==> [#{i+1}/#{pages.size}] download #{config.pretty_inspect}..."
          ## note - return value intentionally discarded (was assigned to an
          ##        unused local); only the cache-filling side effect matters
          Rsssf.download_page( url, encoding: encoding )
        end
      end
    end

  end ## class Prep
end ## module Rsssf
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
|
|
2
|
+
module Rsssf

  ## A (local) rsssf archive project rooted at a directory -
  ## builds summaries (README.md) and extracts (match) schedules.
  class Project
    include Utils   ## e.g. year_from_file, etc.

    attr_reader :title,
                :root_dir

    ## dir   - project root directory
    ## title - project title used in generated summaries
    ## slug  - page basename prefix; might be a proc e.g. ->(season) {}
    def initialize( dir,
                    title: 'Your Title Here',
                    slug: nil )
      @root_dir = dir
      @title    = title
      @slug     = slug
    end


    def pages_dir() "#{root_dir}/pages"; end


    ## Finds all page .txt files below pages_dir.
    def _find_pages
      glob = "#{pages_dir}/**/*.txt"
      print " glob >#{glob}<..."

      files = Dir.glob( glob )
      puts " #{files.size} page(s) .txt found"

      files
    end


    ## Builds a pages summary and saves it as README.md in pages/ dir.
    def make_pages_summary
      files = _find_pages()

      report = PageReport.build( files, title: @title )   ## pass in title etc.

      ### save report as README.md in pages/ dir in project root_dir
      report.save( "#{pages_dir}/README.md" )
    end # method make_pages_summary


    ## Builds a schedules summary and saves it as README.md in root_dir.
    def make_schedules_summary
      ## find all match datafiles
      ## note - looks for season pattern for now
      ##        YYYY-YY or YYYY
      glob = "#{root_dir}/**/{[12][0-9][0-9][0-9]-[0-9][0-9],[12][0-9][0-9][0-9]}/*.txt"
      print " glob >#{glob}<..."
      files = Dir.glob( glob )
      ## fix - "datatfile(s)" typo; stray pp debug dump removed
      ##       (kept commented out, matching _find_pages)
      puts " #{files.size} datafile(s) .txt found"
      ## pp files

      report = ScheduleReport.build( files, title: @title )   ## pass in title etc.
      report.save( "#{root_dir}/README.md" )
    end


    ## Parses schedule configs from txt and writes one schedule file
    ## per season; with archive: true pre-2010 seasons go into
    ## archive/<decade> directories.
    def make_schedules( txt, archive: false )
      configs = parse_schedules( txt )
      ## pp configs

      configs.each do |config|
        header     = config['header']
        seasons    = config['seasons']
        basename   = config['basename'] || config['slug']
        title_tmpl = config['title']

        ## note: header allows hierarchy e.g. (see england and others)
        ##          Cup Tournaments › FA Cup   or
        ##          Cup Tournaments > FA Cup
        header_hiera = header.split( /[ ]* [›>] [ ]*/x )

        puts "==> #{header_hiera.join(' › ')} - #{seasons.size} season(s)..."

        i = 0
        each_page( seasons ) do |season, page|
          title = title_tmpl.sub( '{season}', season.to_s )
          puts " [#{i+1}/#{seasons.size}] #{season} => #{basename}, #{title}..."

          sched = page.find_schedule!( header: header_hiera )

          outpath = if archive
                      ## use archive/1990s and such if season <= 2009/10
                      "#{root_dir}/#{archive_dir_for_season(season)}/#{basename}.txt"
                    else
                      "#{root_dir}/#{season.to_path}/#{basename}.txt"
                    end
          sched.save( outpath, header: "= #{title}\n\n" )
          i += 1
        end
      end
    end


    ## Yields [season, page] for every season (reads the page .txt
    ## from pages_dir via the slug/season naming scheme).
    def each_page( seasons, &blk )
      seasons.each do |season|
        season   = Season( season )
        basename = _mk_basename( season )

        path = "#{pages_dir}/#{basename}.txt"
        page = Page.read_txt( path )

        blk.call( season, page )
      end
    end


    ## Builds the page basename for a season, e.g. braz01, braz09 or braz2010.
    def _mk_basename( season )
      slug = @slug.is_a?(Proc) ? @slug.call( season ) : @slug

      basename = "#{slug}#{_mk_year(season)}"
      basename
    end

    ## Formats a season's end year for use in page basenames.
    ##
    ## note - 00, 01, 02, .. 09           => 2000, 2001, .. 2009
    ##        10, 11, 12, .. 99           => 1910 !!, 1911, 1912, .. 1999
    ##        2010, 2011, 2012, ...       => kept as full (4-digit) years
    ##
    ## fix - check for 18xx ??? requires full year!!!
    ##       only 1910 to 2009 use the two-digit form
    def _mk_year( season )
      ## (was local "slug" - renamed; it holds a year string, not a slug)
      year_str = if season.end_year >= 1910 &&
                    season.end_year < 2010
                   ## cut off all digits (only keep last two)
                   ## convert end_year to string with leading zero
                   ## e.g. 00 / 01 / 99 / 98 / 11 / etc.
                   '%02d' % (season.end_year % 100)
                 else
                   '%4d' % season.end_year
                 end
      year_str
    end

  end ## class Project
end ## module Rsssf
|
data/lib/rsssf/reports/page.rb
CHANGED
|
@@ -1,25 +1,31 @@
|
|
|
1
|
-
# encoding: utf-8
|
|
2
1
|
|
|
3
2
|
|
|
4
3
|
module Rsssf
|
|
5
4
|
|
|
6
5
|
class PageReport
|
|
7
6
|
|
|
7
|
+
|
|
8
|
+
def self.build( files, title: )
|
|
9
|
+
stats = []
|
|
10
|
+
files.each do |file|
|
|
11
|
+
page = Page.read_txt( file )
|
|
12
|
+
stats << page.build_stat
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
new( stats, title: title )
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
|
|
8
19
|
attr_reader :title
|
|
9
20
|
|
|
10
|
-
def initialize( stats,
|
|
21
|
+
def initialize( stats, title: )
|
|
11
22
|
@stats = stats
|
|
12
|
-
@
|
|
13
|
-
|
|
14
|
-
@title = opts[:title] || 'Your Title Here'
|
|
23
|
+
@title = title
|
|
15
24
|
end
|
|
16
25
|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
f.write build_summary
|
|
21
|
-
end
|
|
22
|
-
end
|
|
26
|
+
### save report as README.md in repo
|
|
27
|
+
def save( path ) write_text( path, build_summary ); end
|
|
28
|
+
|
|
23
29
|
|
|
24
30
|
def build_summary
|
|
25
31
|
|
|
@@ -33,32 +39,69 @@ def build_summary
|
|
|
33
39
|
|
|
34
40
|
football.db RSSSF Archive Data Summary for #{title}
|
|
35
41
|
|
|
36
|
-
_Last Update: #{Time.now}_
|
|
37
|
-
|
|
38
42
|
EOS
|
|
39
43
|
|
|
44
|
+
## no longer add last update
|
|
45
|
+
## _Last Update: #{Time.now}_
|
|
46
|
+
|
|
47
|
+
|
|
40
48
|
txt = ''
|
|
41
49
|
txt << header
|
|
42
50
|
|
|
43
|
-
txt << "|
|
|
44
|
-
txt << "| :----- |
|
|
51
|
+
txt << "| File | Sections | Last Updated | Lines (Chars) |\n"
|
|
52
|
+
txt << "| :----- | :------- | :----------- | ------------: |\n"
|
|
53
|
+
|
|
54
|
+
## note - removed season (no longer tracked here)
|
|
45
55
|
|
|
46
56
|
stats.each do |stat|
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
57
|
+
## get basename from source url
|
|
58
|
+
url_path = URI.parse( stat.source ).path
|
|
59
|
+
basename = File.basename( url_path, File.extname( url_path ) ) ## e.g. duit92.txt or duit92.html => duit92
|
|
60
|
+
|
|
61
|
+
txt << "| [#{basename}.txt](#{basename}.txt) "
|
|
62
|
+
txt << "| **#{stat.title}** "
|
|
63
|
+
if stat.sections.size > 0
|
|
64
|
+
txt << "<br> "
|
|
65
|
+
txt << stat.sections.join( " <br> " )
|
|
66
|
+
end
|
|
67
|
+
txt << %Q{| <span title="by #{stat.authors}">#{stat.last_updated}</span> }
|
|
51
68
|
txt << "| #{stat.line_count} (#{stat.char_count}) "
|
|
52
|
-
txt << "| #{stat.sections.join(', ')} "
|
|
53
69
|
txt << "|\n"
|
|
54
70
|
end
|
|
55
71
|
|
|
56
|
-
txt << "\n\n"
|
|
72
|
+
txt << "\n\n"
|
|
57
73
|
txt
|
|
58
74
|
end # method build_summary
|
|
59
75
|
|
|
60
76
|
end ## class PageReport
|
|
61
77
|
end ## module Rsssf
|
|
62
78
|
|
|
63
|
-
|
|
64
|
-
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
__END__
|
|
82
|
+
|
|
83
|
+
old version:
|
|
84
|
+
|
|
85
|
+
txt << header
|
|
86
|
+
|
|
87
|
+
txt << "| File | Authors | Last Updated | Lines (Chars) | Sections |\n"
|
|
88
|
+
txt << "| :----- | :------- | :----------- | ------------: | :------- |\n"
|
|
89
|
+
|
|
90
|
+
## note - removed season (no longer tracked here)
|
|
91
|
+
|
|
92
|
+
stats.each do |stat|
|
|
93
|
+
## get basename from source url
|
|
94
|
+
url_path = URI.parse( stat.source ).path
|
|
95
|
+
basename = File.basename( url_path, File.extname( url_path ) ) ## e.g. duit92.txt or duit92.html => duit92
|
|
96
|
+
|
|
97
|
+
txt << "| [#{basename}.txt](#{basename}.txt) "
|
|
98
|
+
txt << "| #{stat.authors} "
|
|
99
|
+
txt << "| #{stat.last_updated} "
|
|
100
|
+
txt << "| #{stat.line_count} (#{stat.char_count}) "
|
|
101
|
+
txt << "| **#{stat.title}** "
|
|
102
|
+
if stat.sections.size > 0
|
|
103
|
+
txt << "<br> "
|
|
104
|
+
txt << stat.sections.join( " <br> " )
|
|
105
|
+
end
|
|
106
|
+
txt << "|\n"
|
|
107
|
+
end
|