RubyGems - rsssf - Versions diffs - 0.1.0 → 0.3.0 - Mend

rsssf 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60) hide show

checksums.yaml +5 -5
data/{HISTORY.md → CHANGELOG.md} +4 -0
data/Manifest.txt +41 -7
data/README.md +93 -71
data/Rakefile +8 -7
data/config/groups_en.txt +44 -0
data/config/rounds_en.txt +283 -0
data/config/rounds_es.txt +20 -0
data/config/rounds_misc.txt +7 -0
data/lib/_cocos_.rb +158 -0
data/lib/rsssf/convert/convert.rb +71 -0
data/lib/rsssf/convert/errata.rb +103 -0
data/lib/rsssf/convert/html_entities.rb +150 -0
data/lib/rsssf/convert/html_to_txt/beautify_anchors.rb +96 -0
data/lib/rsssf/convert/html_to_txt/make_heading.rb +70 -0
data/lib/rsssf/convert/html_to_txt/remove_emails.rb +43 -0
data/lib/rsssf/convert/html_to_txt/replace_a_href.rb +85 -0
data/lib/rsssf/convert/html_to_txt/replace_a_name.rb +87 -0
data/lib/rsssf/convert/html_to_txt/replace_heading.rb +76 -0
data/lib/rsssf/convert/html_to_txt/replace_hr.rb +25 -0
data/lib/rsssf/convert/html_to_txt.rb +247 -0
data/lib/rsssf/download.rb +20 -0
data/lib/rsssf/fmtfix/dates.rb +541 -0
data/lib/rsssf/fmtfix/dates_helpers.rb +63 -0
data/lib/rsssf/fmtfix/errata.rb +44 -0
data/lib/rsssf/fmtfix/fmtfix-base.rb +68 -0
data/lib/rsssf/fmtfix/fmtfix.rb +101 -0
data/lib/rsssf/fmtfix/goals.rb +173 -0
data/lib/rsssf/fmtfix/headers.rb +326 -0
data/lib/rsssf/fmtfix/outline.rb +228 -0
data/lib/rsssf/fmtfix/patch_headings.rb +141 -0
data/lib/rsssf/fmtfix/rounds.rb +74 -0
data/lib/rsssf/fmtfix/score.rb +92 -0
data/lib/rsssf/fmtfix/tables.rb +316 -0
data/lib/rsssf/fmtfix/topscorers.rb +50 -0
data/lib/rsssf/page-find_schedule.rb +127 -0
data/lib/rsssf/page-meta.rb +68 -0
data/lib/rsssf/page.rb +125 -238
data/lib/rsssf/parse_schedules.rb +34 -0
data/lib/rsssf/prepare/convert-links.rb +77 -0
data/lib/rsssf/prepare/convert-meta.rb +111 -0
data/lib/rsssf/prepare/convert-navlines.rb +154 -0
data/lib/rsssf/prepare/convert-postproc.rb +141 -0
data/lib/rsssf/prepare/convert.rb +100 -0
data/lib/rsssf/prepare/download.rb +40 -0
data/lib/rsssf/project.rb +154 -0
data/lib/rsssf/reports/page.rb +66 -23
data/lib/rsssf/reports/schedule.rb +89 -40
data/lib/rsssf/schedule.rb +4 -14
data/lib/rsssf/utils.rb +37 -45
data/lib/rsssf/version.rb +7 -6
data/lib/rsssf.rb +82 -19
metadata +68 -26
data/.gemtest +0 -0
data/lib/rsssf/fetch.rb +0 -80
data/lib/rsssf/html2txt.rb +0 -157
data/lib/rsssf/patch.rb +0 -28
data/lib/rsssf/repo.rb +0 -220
data/test/helper.rb +0 -12
data/test/test_utils.rb +0 -83

data/lib/rsssf/page.rb CHANGED Viewed

@@ -1,13 +1,12 @@
-# encoding: utf-8
 module Rsssf
   PageStat = Struct.new(
-    :source,     ## e.g. http://rsssf.org/tabled/duit89.html
-    :basename,   ## e.g. duit89   -- note: filename w/o extension (and path)
+    :source,     ## e.g. https://rsssf.org/tabled/duit89.html
     :year,       ## e.g. 1989     -- note: always four digits
-    :season,     ## e.g. 1990-91  -- note: always a string (NOT a number)
+    :title,
     :authors,
     :last_updated,
     :line_count,  ## todo: rename to (just) lines - why? why not?
@@ -28,268 +27,156 @@ class Page
   include Utils   ## e.g. year_from_name, etc.
-def self.from_url( src )
-  txt = PageFetcher.new.fetch( src )
-  self.from_string( txt )
-end
+def self.read_cache( url )  ### builds a page from the cached html for url (read via Webcache, converted to txt via PageConverter); todo: use read_cache /web/html or such - why? why not?
+  html = Webcache.read( url )
+  puts "html:"     ## debug output
+  pp html[0..400]  ## debug output - dumps the first 401 chars of the html
+  txt = PageConverter.convert( html, url: url )
+  txt     ## note: bare expression - has no effect here
-def self.from_file( path )
-  txt = File.read_utf8( path )  # note: always assume sources (already) converted to utf-8
-  self.from_string( txt )
+  new( txt )   ## note: the url passed in is not stored on the new page (@url stays nil)
 end
-def self.from_string( txt )
-  self.new( txt )
+def self.read_txt( path )  ## builds a page from the text read at path (via read_text)
+    # note: always assume sources (already) converted from html to txt!!!!
+  txt = read_text( path )
+  new( txt )
 end
+### use text alias too (for txt) - why? why not?
+attr_accessor :txt
+attr_accessor :url  ### source url
 def initialize( txt )
-  @txt = txt
+  @txt   = txt   ## page text
+  @url   = nil   ## source url - not set here; assign via the url accessor
 end
-LEAGUE_ROUND_REGEX = /\b
-                      Round
-                      \b/ix
-CUP_ROUND_REGEX  = /\b(
-                      Round         |
-                      1\/8\sFinals  |
-                      1\/16\sFinals |
-                      Quarterfinals |
-                      Semifinals    |
-                      Final
-                    )\b/ix
-def find_schedule( opts={} )     ## change to build_schedule - why? why not???
-  ## find match schedule/fixtures in multi-league doc
-  new_txt = ''
-  ## note: keep track of statistics
-  ##   e.g. number of rounds found
-  round_count = 0
-  header = opts[:header]
-  if header
-    league_header_found        = false
-     ## header:
-     ##  - assumes heading 4 e.g. #### Premier League or
-     ##  - bold e.g. **FA Cup** for now
-     ##  note: markers must start line (^)
-     ## note:
-     ## header gsub spaces to \s otherwise no match in regex (using free-form x-flag)!!!
-     header_esc   = header.gsub( ' ', '\s' )
-     ## note: somehow #{2,4} will not work with free-form /xi defined (picked up as comment?)
-     ##  use [#] hack ??
-     header_regex = /^
-                      ([#]{2,4}\s+(#{header_esc}))
-                        |
-                      (\*{2}(#{header_esc})\*{2})
-                    /ix
-    ## todo:
-    ##   use new stage_regex e.g. **xxx** - why? why not?
-    ##  allow more than one stage in one schedule (e.g. regular stage,playoff stage etc)
-  else
-    league_header_found = true   # default (no header; assume single league file)
-    header_regex = /^---dummy---$/  ## non-matching dummy regex
-  end
-  ## puts "header_regex:"
-  ## pp header_regex
-  if opts[:cup]
-    round_regex = CUP_ROUND_REGEX   ## note: only allow final, quaterfinals, etc. if knockout cup
-  else
-    round_regex = LEAGUE_ROUND_REGEX
-  end
-  ## stages
-  first_round_header_found   = false
-  round_header_found         = false
-  round_body_found           = false   ## allow round header followed by blank lines
-  blank_found = false
-  @txt.each_line do |line|
-    if league_header_found == false
-      ## first find start of league header/section
-      if line =~ header_regex
-        puts "!!! bingo - found header >#{line}<"
-        league_header_found = true
-        title = line.gsub( /[#*]/, '' ).strip   ##  quick hack: extract title from header
-        new_txt << "## #{title}\n\n"    # note: use header/stage title (regex group capture)
-      else
-        puts "  searching for header >#{header}<; skipping line >#{line}<"
-        next
-      end
-    elsif first_round_header_found == false
-      ## next look for first round (starting w/ Round)
-      if line =~ round_regex
-        puts "!!! bingo - found first round >#{line}<"
-        round_count += 1
-        first_round_header_found = true
-        round_header_found       = true
-        round_body_found         = false
-        new_txt << line
-      elsif line =~ /^=-=-=-=/
-        puts "*** no rounds found; hit section marker (horizontal rule)"
-        break
-      elsif line =~ /^\*{2}[^*]+\*{2}/   ## e.g. **FA Cup**
-        puts "*** no rounds found; hit section/stage header: #{line}"
-        break
-      else
-        puts "  searching for first round; skipping line >#{line}<"
-        next ## continue; searching
-      end
-    elsif round_header_found == true
-      ## collect rounds;
-      ##   assume text block until next blank line
-      ##   new block must allways start w/ round
-      if line =~ /^\s*$/   ## blank line?
-        if round_body_found
-          round_header_found = false
-          blank_found        = true    ## keep track of blank (lines) - allow inside round block (can continue w/ date header/marker)
-          new_txt << line
-        else
-          ## note: skip blanks following header
-          next
-        end
-      else
-        round_body_found = true
-        new_txt << line   ## keep going until next blank line
-      end
-    else
-      ## skip (more) blank lines
-      if line =~ /^\s*$/
-        next  ## continue; skip extra blank line
-      elsif line =~ round_regex
-        puts "!!! bingo - found new round >#{line}<"
-        round_count += 1
-        round_header_found = true   # more rounds; continue
-        round_body_found   = false
-        blank_found        = false  # reset blank tracker
-        new_txt << line
-      elsif blank_found && line =~ /\[[a-z]{3} \d{1,2}\]/i   ## e.g. [Mar 13] or [May 5] with leading blank line; continue round
-        puts "!!! bingo - continue round >#{line}<"
-        round_header_found = true
-        blank_found        = false  # reset blank tracker
-        new_txt << line
-      elsif blank_found && line =~ /First Legs|Second Legs/i
-        puts "!!! bingo - continue round >#{line}<"
-        round_header_found = true
-        blank_found        = false  # reset blank tracker
-        new_txt << line
-      elsif line =~ /=-=-=-=/
-        puts "!!! stop schedule; hit section marker (horizontal rule)"
-        break;
-      elsif line =~ /^\*{2}[^*]+\*{2}/   ## e.g. **FA Cup**
-        puts "!!! stop schedule; hit section/stage header: #{line}"
-        break
-      else
-        blank_found  = false
-        puts "skipping line in schedule >#{line}<"
-        next # continue
-      end
-    end
-  end  # each line
-  schedule = Schedule.from_string( new_txt )
-  schedule.rounds = round_count
-  schedule
-end  # method find_schedule
+## lets you check for an optional ref e.g. ‹§fin›
+###  todo/fix - change to OPT_REF_RE   - make it a regex
+##     a regex embedded in a regex will use regex.source automatically (no need to escape)!!
+## lets you check for an optional ref e.g. ‹§fin›
+OPT_REF = %q{
+            (?: [ ]*
+              ‹§ (?<ref> [^›]+?) ›
+            )?
+         }
+HX_RE = %r{          ## negative lookahead
+                     ##   do NOT match  =-=
+                     ##   do NOT match  ===========  (without any heading text!!)
+                     ##     e.g.
+                     ##       Fall season
+                     ##       ===========
+                    (?! ^[ ]* (?:    =-=
+                                 |  ={1,} [ ]* $
+                               )
+                     )
+                     ^
+                    [ ]*
+                  (?<marker> ={1,6})
+                     [ ]*
+                  (?<text> .+?)
+                     #{OPT_REF}
+                     [ ]*
+            $}x
+##
+## change to outline - why? why not?
+def _scan_headings()  txt.scan( HX_RE );   end   ## returns one [marker, text, ref] capture array per heading line matched by HX_RE
+def _build_toc( txt )     ## returns an array of "<marker> <text>" strings - one per heading matched by HX_RE in txt
+     hx =  txt.scan( HX_RE )    ## array of [marker, text, ref] captures
+     toc = []
+       hx.each do |marker,text,ref|    ## note: ref is captured but not used in the toc entry
+          toc <<  "#{marker} #{text}"
+       end
+     toc
+end
+=begin
+<!--
+     title:   Austria 2002/03
+     source:  https://rsssf.org/tableso/oost03.html
+     authors: Andreas Exenberger and Karel Stokkermans
+     updated: 15 Jun 2022
+    -->
+=end
 def build_stat
+  title        = nil    ## build_stat - fills a PageStat record from the page meta (see parse_meta), the source url and the heading outline
   source       = nil
   authors      = nil
   last_updated = nil
-  ### find source ref
-  if @txt =~ /source: ([^ \n]+)/im
-    source = $1.to_s
-    puts "source: >#{source}<"
-  end
-  ##
-  ## fix/todo: move authors n last updated  whitespace cleanup to sanitize - why? why not??
-  if @txt =~ /authors?:\s+(.+?)\s+last updated:\s+(\d{1,2} [a-z]{3,10} \d{4})/im
-    last_updated = $2.to_s   # note: save a copy first (gets "reset" by next regex)
-    authors      = $1.to_s.strip.gsub(/\s+/, ' ' )  # cleanup whitespace; squish-style
-    authors = authors.gsub( /[ ]*,[ ]*/, ', ' )    # prettify commas - always single space after comma (no space before)
-    puts "authors: >#{authors}<"
-    puts "last updated: >#{last_updated}<"
-  end
-  puts "*** !!! missing source"  if source.nil?
-  puts "*** !!! missing authors n last updated"   if authors.nil? || last_updated.nil?
-  sections = []
-  ## count lines
-  line_count = 0
-  @txt.each_line do |line|
-    line_count +=1
-    ### find sections
-    ## todo: add more patterns? how? why?
-    if line =~ /####\s+(.+)/
-      puts "  found section >#{$1}<"
-      sections << $1.strip
-    end
-  end
-  # get path from url
-  url  = URI.parse( source )
-  ## pp url
-  ## puts url.host
-  path = url.path
-  extname  = File.extname( path )
-  basename = File.basename( path, extname )  ## e.g. duit92.txt or duit92.html => duit92
-  year     = year_from_name( basename )
-  season   = year_to_season( year )
+  meta = parse_meta( @txt ) || {}    ## fall back to an empty hash if parse_meta returns nil
+  title        = meta[:title]
+  source       = meta[:source]
+  authors      = meta[:author] || meta[:authors]   ## note - check for author & authors !!!
+  last_updated = meta[:updated]
+  puts "*** !!! missing source"        if source.nil?
+  puts "*** !!! missing author(s)"     if authors.nil?
+  puts "**  !!! missing last updated"  if last_updated.nil?
+  ## get year from source (url)
+  ###   move (for reuse) to  year_from_url  in utils - why? why not?
+  url_path  = URI.parse( source ).path    ## NOTE(review): source may still be nil here (only a warning gets printed above) and URI.parse( nil ) raises - confirm if a guard is needed
+  basename  = File.basename( url_path, File.extname( url_path ) )  ## e.g. duit92.txt or duit92.html => duit92
+  puts "   basename=>#{basename}<"
+  year      = year_from_name( basename )
+  sections = _build_toc( txt )    ## heading lines ("<marker> <text>") found in the page text
   rec = PageStat.new
   rec.source       = source         # e.g. http://rsssf.org/tabled/duit89.html   -- use source_url - why?? why not??
-  rec.basename     = basename       # e.g. duit89
-  rec.year         = year           # e.g. 89 => 1989  -- note: always four digits
-  rec.season       = season
+  rec.year         = year       ## note: in 2021/22  - year is always end_year, that is, 2022
+  rec.title        = title
   rec.authors      = authors
   rec.last_updated = last_updated
-  rec.line_count   = line_count
-  rec.char_count   = @txt.size      ## fix: use "true" char count not byte count
-  rec.sections     = sections
+  rec.line_count   = @txt.lines.count    ### or @txt.each_line.count
+  rec.char_count   = @txt.size          ## note - size/length is true char count (@txt.bytesize is byte count!!)
+  rec.sections     = sections
   rec
 end  ## method build_stat
 def save( path )
-  File.open( path, 'w' ) do |f|
-    f.write @txt
-  end
+  write_text( path, @txt )    ## hands the page text (@txt) and the target path to the write_text helper
 end  ## method save
 end  ## class Page
 end  ## module Rsssf
-## add (shortcut) alias
-RsssfPageStat = Rsssf::PageStat
-RsssfPage     = Rsssf::Page

data/lib/rsssf/parse_schedules.rb ADDED Viewed

@@ -0,0 +1,34 @@
+## parse_schedules - parses the config txt into rows (via parse_csv) and replaces every non-empty 'seasons' value with the result of Season.parse_line; returns the rows
+## todo/check - find a better name
+##       rename to parse_sections/leagues/??? - why? why not?
+def parse_schedules( txt )
+   rows = parse_csv( txt )
+   ## transform seasons column to seasons objects
+   rows.each do |row|
+      if row['seasons'] && !row['seasons'].empty?    ## skip rows with a missing or empty seasons value
+        row['seasons'] = Season.parse_line( row['seasons'] )    ## note: updates the row in-place
+      end
+   end
+   rows
+end
+def read_schedules( path )  parse_schedules( read_text(path)); end   ## same as parse_schedules but reads the config text from path first (via read_text)
+__END__
+############
+## sample usage
+configs = parse_schedules( <<TXT )
+header,       seasons,             basename,      title
+Bundesliga,   2010/11..2025/26,    1-bundesliga,  Austria | Bundesliga {season}
+ÖFB Cup,      2010/11..2025/26,    cup,           Austria | ÖFB Cup {season}
+TXT
+## pp configs

data/lib/rsssf/prepare/convert-links.rb ADDED Viewed

@@ -0,0 +1,77 @@
+module Rsssf
+class  Prep    ## todo: find a better name e.g. BatchPrep or ??
+##          see page 2006f
+##   see page ../tablesw/worldcup›
+##  e.g. ‹League C, see page 2023uefanl§lgc›
+##       ‹League A, see page 2023uefanl.html#lga›
+##   todo/fix - fix upstream ?? (e.g. remove .html and replace #=>§)
+LINK_APAGE_RE = %r{  ‹(?<title> [^›]+?)
+                       , [ ] see [ ] page [ ]
+                      (?<pageref> [^›]+?)
+                    ›
+                }ix
+=begin
+["1973/74", "oost74"],
+ ["1975/76", "oost76"],
+ ["list of final tables", "oosthist"],
+ ["list of champions", "oostchamp"],
+ ["list of cup finals", "oostcuphist"],
+ ["list of super cup finals", "oostsupcuphist"],
+ ["list of foundation dates", "oostfound"]]
+=end
+def expand_pageref( pageref, dirname: )    ## expands a "see page" reference into a page path with dir; raises ArgumentError for all unsupported forms
+                  ##
+                  ##  note - pre-process
+                  ##   2023uefanl.html#lga
+                  ##     stkitts2025.html#pres
+                  ##
+                  ##   remove .html
+                  ##    and convert the optional anchor (# => §)
+                  ##
+                  ##   fix - upstream - why? why not?
+                   pageref = pageref.sub(  %r{ \.html\b }ix, '' )
+                   ## check - only really one # allowed in url path???
+                   pageref = pageref.sub(  '#', '§' )    ## note - sub replaces the first # only
+                 if /^[a-z0-9][a-z0-9§-]*$/.match?( pageref )    ## note - lowercase letters, digits, § and - only
+                    ## assume relative page in "local" dir
+                    "#{dirname}/#{pageref}"
+                 elsif pageref.start_with?( '../')
+                    ## ../tablesw/worldcup
+                     pageref.sub( "../", '' )    ## note - strips the first ../ only
+                 elsif pageref.start_with?( './' )
+                     raise ArgumentError, "found (unsupported) ./ pageref >#{pageref}<"
+                 elsif pageref.start_with?( '/' )
+                     raise ArgumentError, "found (unsupported) / pageref >#{pageref}<"
+                 elsif pageref.start_with?( %r{^https?:}i )
+                     raise ArgumentError, "found (unsupported) https?: pageref >#{pageref}<"
+                 else
+                     raise ArgumentError, "found (unsupported) pageref >#{pageref}<"
+                 end
+end
+def collect_links( txt, basename:, dirname: )    ## returns [title, pageref] pairs for all links matched by LINK_APAGE_RE with every pageref expanded; note - basename is not used (yet)
+  links = txt.scan( LINK_APAGE_RE )    ## array of [title, pageref] captures
+  links.map do |link|    ## note - the result of map is discarded; links gets updated in-place via link[1] = below
+                   link[1] = expand_pageref( link[1], dirname: dirname )
+                   link
+               end
+  links
+end
+end    ## class Prep
+end    ## module Rsssf

data/lib/rsssf/prepare/convert-meta.rb ADDED Viewed

@@ -0,0 +1,111 @@
+module Rsssf
+class  Prep    ## todo: find a better name e.g. BatchPrep or ??
+###
+#  note  - check for special cases (later) with no "about this document" section!!
+#
+##   https://rsssf.org/tablesb/braz98.html
+##         has no "about this document" section
+#       and only a last update: 22 Apr 1999   line (no author)
+TITLE_RE = %r{
+    <TITLE>(?<text>.*?)</TITLE>
+}ixm
+def find_title( html )    ## returns the cleaned-up text of the first <TITLE> element (matched case-insensitive) or nil if there is none
+  if m=TITLE_RE.match( html )
+     text = m[:text].strip
+     ## note - convert html entities
+     ##  e.g. Brazil 2000 - Copa Jo&atilde;o Havelange
+     text = PageConverter.convert_html_entities( text )
+     ##  add autofix known typos/erratas here!!!
+     ## note - title quick typo fix (in brazil) remove <
+     ##   e.g. <TITLE>Brazil 1988<</TITLE>
+     text = text.gsub( '<', '' )    ## note - removes every < left in the title text
+     text
+  else
+     nil
+  end
+end
+ABOUT_META_RE = %r{
+    ## (i) author(s) info
+   \b authors? [ ]* :
+    \s+
+      (?<author> .+?)    ## note - non-greedy (may incl. newline break!!)
+    \s+
+    ## (ii) followed by date
+    \b last [ ]+ updated [ ]*:
+      \s*
+      (?<date> \d{1,2} [ ]+              ## day
+                [a-z]{3,10} [ ]+         ## month
+                \d{4} \b)                ## year
+}ixm
+## change name to authors_n_updated or such - why? why not?
+def find_author_n_date( txt )    ## returns [authors, updated] taken from the first ABOUT_META_RE match (author(s) followed by last updated) or [nil,nil] if there is no match
+  ##
+  ## fix/todo: move authors n last updated
+  ##  whitespace cleanup  - why? why not??
+  if m=ABOUT_META_RE.match( txt )
+    authors = m[:author].strip.gsub(/\s+/, ' ' )  # cleanup whitespace; squish-style
+    authors = authors.gsub( /[ ]*,[ ]*/, ', ' )    # prettify commas - always single space after comma (no space before)
+    updated = m[:date].strip.gsub(/\s+/, ' ' )    # same whitespace cleanup for the date e.g. 7 Mar 2023
+    [authors, updated]
+  else
+     ## report error or raise exception??
+     ##  return nil for now
+     [nil,nil]  ## or return (single) nil ??
+  end
+end
+end    ## class Prep
+end    ## module Rsssf
+=begin
+e.g.
+Authors: Hans Schöggl, Jan Schoenmakers and Karel Stokkermans
+Last updated: 7 Mar 2023
+-or-
+Authors: Ambrosius Kutschera
+and Karel Stokkermans
+Last updated: 31 Oct 2004
+-or-
+Author: RSSSF
+Last updated: 15 Jun 2022
+-or-
+Authors: Andreas Exenberger, Hans Schöggl
+and Karel Stokkermans
+Last updated: 15 Jul 2022
+=end