RubyGems - rsssf - Versions diffs - 0.1.0 → 0.3.0 - Mend

rsssf 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60) hide show

checksums.yaml +5 -5
data/{HISTORY.md → CHANGELOG.md} +4 -0
data/Manifest.txt +41 -7
data/README.md +93 -71
data/Rakefile +8 -7
data/config/groups_en.txt +44 -0
data/config/rounds_en.txt +283 -0
data/config/rounds_es.txt +20 -0
data/config/rounds_misc.txt +7 -0
data/lib/_cocos_.rb +158 -0
data/lib/rsssf/convert/convert.rb +71 -0
data/lib/rsssf/convert/errata.rb +103 -0
data/lib/rsssf/convert/html_entities.rb +150 -0
data/lib/rsssf/convert/html_to_txt/beautify_anchors.rb +96 -0
data/lib/rsssf/convert/html_to_txt/make_heading.rb +70 -0
data/lib/rsssf/convert/html_to_txt/remove_emails.rb +43 -0
data/lib/rsssf/convert/html_to_txt/replace_a_href.rb +85 -0
data/lib/rsssf/convert/html_to_txt/replace_a_name.rb +87 -0
data/lib/rsssf/convert/html_to_txt/replace_heading.rb +76 -0
data/lib/rsssf/convert/html_to_txt/replace_hr.rb +25 -0
data/lib/rsssf/convert/html_to_txt.rb +247 -0
data/lib/rsssf/download.rb +20 -0
data/lib/rsssf/fmtfix/dates.rb +541 -0
data/lib/rsssf/fmtfix/dates_helpers.rb +63 -0
data/lib/rsssf/fmtfix/errata.rb +44 -0
data/lib/rsssf/fmtfix/fmtfix-base.rb +68 -0
data/lib/rsssf/fmtfix/fmtfix.rb +101 -0
data/lib/rsssf/fmtfix/goals.rb +173 -0
data/lib/rsssf/fmtfix/headers.rb +326 -0
data/lib/rsssf/fmtfix/outline.rb +228 -0
data/lib/rsssf/fmtfix/patch_headings.rb +141 -0
data/lib/rsssf/fmtfix/rounds.rb +74 -0
data/lib/rsssf/fmtfix/score.rb +92 -0
data/lib/rsssf/fmtfix/tables.rb +316 -0
data/lib/rsssf/fmtfix/topscorers.rb +50 -0
data/lib/rsssf/page-find_schedule.rb +127 -0
data/lib/rsssf/page-meta.rb +68 -0
data/lib/rsssf/page.rb +125 -238
data/lib/rsssf/parse_schedules.rb +34 -0
data/lib/rsssf/prepare/convert-links.rb +77 -0
data/lib/rsssf/prepare/convert-meta.rb +111 -0
data/lib/rsssf/prepare/convert-navlines.rb +154 -0
data/lib/rsssf/prepare/convert-postproc.rb +141 -0
data/lib/rsssf/prepare/convert.rb +100 -0
data/lib/rsssf/prepare/download.rb +40 -0
data/lib/rsssf/project.rb +154 -0
data/lib/rsssf/reports/page.rb +66 -23
data/lib/rsssf/reports/schedule.rb +89 -40
data/lib/rsssf/schedule.rb +4 -14
data/lib/rsssf/utils.rb +37 -45
data/lib/rsssf/version.rb +7 -6
data/lib/rsssf.rb +82 -19
metadata +68 -26
data/.gemtest +0 -0
data/lib/rsssf/fetch.rb +0 -80
data/lib/rsssf/html2txt.rb +0 -157
data/lib/rsssf/patch.rb +0 -28
data/lib/rsssf/repo.rb +0 -220
data/test/helper.rb +0 -12
data/test/test_utils.rb +0 -83

data/config/rounds_es.txt ADDED Viewed

@@ -0,0 +1,20 @@
+## used in mexico
+Playoffs [ ]   \( Liguilla \)
+Recalificación
+## used in argentina
+Round·of·64·-·32·avos·de·final
+Round·of·32·-·16·avos·de·final
+Round·of·16·-·Octavos·de·Final
+Quarter·finals·-·Cuartos·de·final
+Semi·finals·-·Semifinales
+Primera·fase·de·zonas·-·Phase·of·groups

data/config/rounds_misc.txt ADDED Viewed

@@ -0,0 +1,7 @@
+# in cz
+Play-off·o·umístění
+Skupina·o·záchranu

data/lib/_cocos_.rb ADDED Viewed

@@ -0,0 +1,158 @@
+###
+##  move "upstream" to cocos for sharing
+##
+## note - use File.file? instead of File.exist?
+##            (checks if file exists AND file is a file NOT a directory)
+##
+##   todo/fix - add an option to check if file found or not
+##                      return nil if not found or such
+##
+##  use find_file! and find_file or such - why? why not?
+=begin
+def find_file(name, path: [])
+  path.each do |dir|
+    full = File.join(dir, name)
+    return full if File.exist?(full)
+  end
+  nil
+end
+def find_file!(name, path: [])
+  find_file(name, path:) or
+    raise Errno::ENOENT, ""
+end
+plus add  option - raise_on_error: false  - why? why not?
+def find_file!  - find_file(  raise_on_error: false )
+=end
+def find_file!( name, path: )
+    filepath = find_file( name, path: path )
+    raise Errorno::ENOENT, "file <#{name}> not found; looking in path #{path.inspect}"   if filepath.nil?
+    filepath
+end
+##
+##  note - find_file will NOT find directories!!!
+##                          File.file? will only check if a file (not directory) exists!!
+def find_file( name, path: )   ## returns name or the first dir/name (dir taken from path) that is a file; otherwise nil
+    return name    if File.file?( name )   ## name gets checked as is first (before searching path)
+    path.each do |dir|
+        filepath = File.join( dir, name )
+        return filepath   if File.file?(  filepath )   ## first match wins
+    end
+    nil   ## return nil if not found
+end
+####
+#  parse/find_patterns - regexes used by parse_patterns for variable defs and variable references
+## use/rename to VARDEF_LINE or such - why? why not?
+VARDEF_RE = %r{\A
+                [ ]*
+              \$(?<key> [a-z][a-z0-9_]*)
+                [ ]*
+              =
+                [ ]*
+              (?<value> .+?)   ## eat-up (non-greedy) the rest until end-of-line
+                [ ]*
+              \z
+}ix   ## matches a whole string holding a variable def e.g. $key = value  (case-insensitive)
+VAR_RE = %r{  \$(?<key> [a-z][a-z0-9_]*)
+                  \b
+}ix   ## matches a variable reference e.g. $key  (case-insensitive)
+def read_patterns( path )   ## read text at path (via read_text) and pass it on to parse_patterns
+    parse_patterns( read_text( path ))
+end
+def parse_patterns( txt )   ## parse pattern text (one pattern per line); returns array of expanded pattern lines (strings)
+     ## norm newline (windows cr/lf \r\n) to (lf - \n)
+     txt = txt.gsub( /\r\n/, "\n" )
+     ### check for line continuations with backslash (\)
+     ##      note - allow spaces before newline; backslash, spaces and newline get removed (lines get joined)
+     txt = txt.gsub( /\\[ ]*$\n/, '' )
+     vars = {}    ## variable defs seen so far - (downcased) key => value
+     names = [] # array of lines (with words)
+     txt.each_line do |line|
+       line = line.strip
+       next if line.empty?
+       next if line.start_with?( '#' )   ## skip comments too
+       break if line == '__END__'   ## stop here - ignore the rest of the text
+       ## strip inline (until end-of-line) comments too
+       ##   e.g. Janvier  Janv  Jan  ## check janv in use??
+       ##   =>   Janvier  Janv  Jan    (note - everything from the first # on gets dropped)
+       line = line.sub( /#.*/, '' ).strip
+       ## pp line
+       ###
+       ##  check for variable defs - a vardef line gets recorded in vars and is NOT added to names
+       if m = VARDEF_RE.match( line )
+           vars[ m[:key].downcase ] = m[:value ]
+           next
+       end
+       line = line.gsub( VAR_RE ) do |_|   ## substitute variable references ($key) with the recorded value
+                  m = Regexp.last_match
+                  key = m[:key].downcase
+                  value = vars[key]
+                  raise ArgumentError, "subvars - no vardef found for key >#{key}<"   if value.nil?
+                  value
+             end
+        ### squish  - collapse runs of two or more spaces into a single space
+         line = line.gsub( /[ ]{2,}/, ' ' )
+         ## open paren (use for grouping to non-capture grouping) e.g.
+         ##   () => (?: )
+         ##   note - do NOT replace escaped (with backslash) open paren !!!
+         ##         e.g.   Playoffs \( Liguilla \)
+          line = line.gsub( /  ## negative lookbehind
+                                   (?<! \\)
+                                \(
+                               /x, '(?: ')
+         ## expand space shortcuts to ' [ ] '
+         ##     replace  Middle Dot (·)  Unicode: U+00B7 or
+         ##             White Square (□)  Unicode: U+25A1 or
+         ##   White Small Square     (▫)   Unicode: U+25AB
+         ##               Open Box (␣)    Unicode: U+2423 or
+         ##
+         ##  add more - why? why not?
+          line = line.gsub( /[·□▫␣]/, ' [ ] ' )
+       names << line
+     end
+     names
+end

data/lib/rsssf/convert/convert.rb ADDED Viewed

@@ -0,0 +1,71 @@
+module Rsssf
+class PageConverter
+  ## convenience helper - convert html using a "shared" (created on first use) converter instance
+  def self.convert( html, url: )
+    @@converter ||= new   ## use a "shared" built-in converter
+    @@converter.convert( html, url: url )
+  end
+  ##
+  ##  convert html (string) to txt; url gets passed along to convert_html_entities and html_to_txt
+  ##    todo - add anchor: options or such that lets you toggle adding anchors (§premier etc.) - why? why not?
+  def convert( html, url: )
+    ### todo/fix: first check if html is all ascii-7bit e.g.
+    ## includes only chars from 0 to 127!!!
+    ## normalize newlines
+    ##   replace \r\n (carriage return \r + line feed \n) used by Windows - cr+lf;
+    ##         just use \n (new line a.k.a. line feed)
+    html = html.gsub( "\r\n", "\n" )
+    ##  convert tabs to three spaces (or use two or four??)
+    html = html.gsub( "\t", '   ' )
+    html = convert_html_entities( html, url: url )
+ ###################################
+ ### smart quotes quick fixes
+ ### convert all "smart" quote to (standard) single and double quotes
+ ##  D´Alessandro   =>  D'Alessandro
+ ##    81´ and 88'   =>  81' and 88'
+    html = html.gsub( /[´’‘]/, "'" )
+    html = html.gsub( /[“”]/,  '"' )
+  ### convert fancy (unicode) dash to plain dash/hyphen - note: for now only – gets replaced
+     html = html.gsub( '–', '-' )
+    txt   = html_to_txt( html, url: url )
+    txt
+  end  ## method convert
+###################
+# more helpers
+def self.log( msg )
+  ## append msg (plus newline) to ./logs.txt
+  ##     use ./errors.txt - why? why not?
+  File.open( './logs.txt', 'a:utf-8' ) do |f|
+    f.write( msg )
+    f.write( "\n" )
+  end
+end
+def log( msg ) self.class.log( msg ); end   ## instance-level shortcut for PageConverter.log
+end # class PageConverter
+end # module Rsssf

data/lib/rsssf/convert/errata.rb ADDED Viewed

@@ -0,0 +1,103 @@
+module Rsssf
+class PageConverter
+##
+## todo/fix/fix/fix
+##    add filenames/urls for quick fixes!!!
+def self.errata_html( html )
+   ## auto-fix known typos / errors
+   ###   kind of PRE-processing, see errata_txt for POST-processing
+   ###  check - rename to errata_pre/post - why? why not?
+     ## quick fix - rm </ADDRESS>
+    ##  </ADDRESS>
+    ##   tablesb/braz94.html
+    html = html.gsub( '</ADDRESS>', ''  )
+   ## quick fix   </a  => </a>
+   ##  <a href="#play6">Gold League (Calle 6)</a
+   ##  <a href="#zpl">PBZ Premier League 2025/26</a
+   ##  <a href="#lig1">Championnat National Ligue 1</a
+   html = html.gsub( /<\/A
+                          (?! [ ]*>)     ## negative lookahead
+                      /ix, '</A>' )
+   ## quick fix  </br>  => <br>
+   html = html.gsub( /<\/BR>/i, '<BR>' )
+  ## quick fix - change typo <H1></H2>
+  ##  tables/58full.html
+  html = html.gsub( '<H1>Quarterfinals</H2>', '<H2>Quarterfinals</H2>' )
+  ## quick fix - change typo <M>,<N> to <B>
+  ##   tables/54full.html
+  html = html.gsub( '<M>MEX</B>', '<B>MEX</B>' )
+  ##   tables/58full.html
+  html = html.gsub( '<N>CZE</B>', '<B>CZE</B>' )
+  ## quick fix -
+  ##   tablesb/braz88.html
+  html = html.gsub( '<</TITLE>', '</TITLE>' )
+  ## quick fix
+  ##   hr (horizontal rule) via img
+  ##   in  tables/30full.html and others
+  ##
+  ## <IMG SRC="xshadow.gif.pagespeed.ic.AbdeNVcmzw.png" ALT="-----------">
+  ##   look  for
+  ## <IMG   ALT="---">
+  html = html.gsub(  /<IMG
+                            [^>]+?
+                           ALT="-{3,}"
+                         >/ixm, '<HR>' )
+  html
+end
+def errata_html( html ) self.class.errata_html( html ); end
+def self.errata_html_entities( html )
+    ########
+    ## typos / autofix - keep - why? why not?
+    html = html.gsub( "&oulm;", 'ö' )    ## support typo in entity (&ouml;)
+    html = html.gsub( "&uml;",  'ü' )    ## support typo in entity (&uuml;) - why? why not?
+    html = html.gsub( "&slig;", "ß" )    ## support typo in entity (&szlig;)
+    html = html.gsub( "&aaacute;", "á" )  ## typo for &aacute;
+    html = html.gsub( "&nitlde;", "ñ" )  ## typ for &ntilde;
+    html
+end
+def errata_html_entities( html ) self.class.errata_html_entities( html ); end
+def errata_txt( txt )
+  ## kind-of POST-processing, see errata_html for PRE-processing
+   ## quick fix - squish spaces (to single)
+   ##   tables/82full.html
+   txt = txt.gsub( 'Second  phase', 'Second phase' )
+   ## quick fix - add (missing) closing bracket (])
+   ##   tables/70q.html
+   txt = txt.gsub(/^South America Group 10 \[Brazil$/,
+                   'South America Group 10 [Brazil]' )
+  txt
+end
+end # module PageConverter
+end # module Rsssf

data/lib/rsssf/convert/html_entities.rb ADDED Viewed

@@ -0,0 +1,150 @@
+module Rsssf
+class PageConverter
+  ENTITIES =  %w[
+À   &Agrave;
+Á   &Aacute;
+Â   &Acirc;
+Ã   &Atilde;
+Ä   &Auml;
+Å   &Aring;
+à   &agrave;
+á   &aacute;
+â   &acirc;
+ã   &atilde;
+ä   &auml;
+å   &aring;
+Æ   &AElig;
+æ   &aelig;
+ß   &szlig;
+Ç   &Ccedil;
+ç   &ccedil;
+È   &Egrave;
+É   &Eacute;
+Ê   &Ecirc;
+Ë   &Euml;
+è   &egrave;
+é   &eacute;
+ê   &ecirc;
+ë   &euml;
+ð   &eth;
+Ì   &Igrave;
+Í   &Iacute;
+Î   &Icirc;
+Ï   &Iuml;
+ì   &igrave;
+í   &iacute;
+î   &icirc;
+ï   &iuml;
+Ñ   &Ntilde;
+ñ   &ntilde;
+Ò   &Ograve;
+Ó   &Oacute;
+Ô   &Ocirc;
+Õ   &Otilde;
+Ö   &Ouml;
+ò   &ograve;
+ó   &oacute;
+ô   &ocirc;
+õ   &otilde;
+ö   &ouml;
+Ø   &Oslash;
+ø   &oslash;
+Ù   &Ugrave;
+Ú   &Uacute;
+Û   &Ucirc;
+Ü   &Uuml;
+ù   &ugrave;
+ú   &uacute;
+û   &ucirc;
+ü   &uuml;
+Ý   &Yacute;
+ý   &yacute;
+ÿ   &yuml;
+<    &lt;
+>    &gt;
+&    &amp;
+©    &copy;
+®    &reg;
+]   ## flat list of pairs - char followed by its named html entity (consumed two at a time below)
+  def self.convert_html_entities( html, url: nil )   ## replace html entities with chars; url is only used in the warning message
+    ## replace the german umlaut (and sharp s) entities first
+    html = html.gsub( "&auml;", 'ä' )
+    html = html.gsub( "&ouml;", 'ö' )
+    html = html.gsub( "&uuml;", 'ü' )
+    html = html.gsub( "&Auml;", 'Ä' )
+    html = html.gsub( "&Ouml;", 'Ö' )
+    html = html.gsub( "&Uuml;", 'Ü' )
+    html = html.gsub( "&szlig;", 'ß' )
+    html = errata_html_entities( html )   ## fix known misspelled entities
+    ENTITIES.each_slice(2) do |str, entity|
+       html = html.gsub( entity, str )
+    end
+    ##############
+    ## check for more entities
+    ##   limit &---; to length 10 - why? why not?
+    ## check for decimal entities (mapping 1:1 to unicode) - with one exception (&#307; gets replaced by the two letters ij)
+    html = html.gsub(/&#(\d+);/) do |match|
+             uni =  if match == '&#307;'   ## use like Van D&#307;k  -> Van Dijk
+                     'ij'
+                    else
+                     [$1.to_i].pack("U")
+                    end
+              ##puts "   converting numeric html entity #{match} to unicode char #{uni}"
+             uni
+          end
+    html = html.gsub( /&[^; ]{1,10};/) do |match|   ## report any left-over &...; (nothing gets replaced here)
+           ##   ignore weird edge case of &A;
+           ##    e.g. [M&A; moved from pool B] - where M&A is name of club
+           ##
+           ##  in ital03.html:
+           ###    [Eugenio Corini 22pen&36pen; Christian Vieri 69]
+           ##     Francesco Totti 31, Vincenzo Montella 49&68; Antonio Di Natale 11]
+               if match == '&A;' ||
+                  match == '&36pen;' || match == '&68;'
+               else
+                  msg = "found unencoded html entity #{match}"
+                  msg += " in >#{url}<"   if url
+                  puts "*** WARN - #{msg}"
+                  log( msg )  ## log too (see ./logs.txt)
+               end
+               match   ## pass through as is (1:1)
+    end
+    html
+  end
+  def convert_html_entities( html, url: nil ) self.class.convert_html_entities( html, url: url ); end   ## instance-level shortcut
+end # class PageConverter
+end # module Rsssf

data/lib/rsssf/convert/html_to_txt/beautify_anchors.rb ADDED Viewed

@@ -0,0 +1,96 @@
+module Rsssf
+class PageConverter
+def beautify_anchors( html )   ## three gsub passes that move anchor markers (‹§name›) to the end of the line; returns the changed text
+  ## beautify (pass 1 of 3) - anchor followed by heading
+  ##  ‹§2fin›
+  ##
+  ## == Semifinals
+  ##
+  ##  merge anchor (a name) with (following) heading into one line e.g.
+  ##       =>
+  ##  == Semifinals  ‹§2fin›
+   html = html.gsub( /\s*
+                          (?<name>‹§
+                                    [^›]+?
+                                 ›)
+                      \s*
+                          (?<heading>={2,}
+                              [^=\n]+?
+                          )
+                       \n
+                       \s*/ixm ) do |match|
+           m = Regexp.last_match
+           match = match.gsub( "\n", '$$' )  ## make newlines visible for debugging
+           puts "   mergeing anchor (a name) with heading into one line - >#{match}<"
+           "\n\n#{m[:heading]}  #{m[:name]}\n\n"
+    end
+###
+##
+## beautify (pass 2 of 3) - anchor at the start of a line followed by text
+##  ‹§argsquad›Argentine Squad Full Info
+##  ‹§eng›ENGLAND
+##
+##
+##  reformat anchor (a name) start line with text  e.g.
+##       =>
+##  Argentine Squad Full Info  ‹§argsquad›
+##  ENGLAND  ‹§eng›
+   html = html.gsub( /\n
+                          (?<name>‹§
+                                    [^›]+?
+                                 ›)
+                      [ ]*
+                          (?<text>[^\n]+?
+                          )
+                       \n
+                       /ixm ) do |match|
+           m = Regexp.last_match
+           match = match.gsub( "\n", '$$' )  ## make newlines visible for debugging
+           puts "   move anchor (a name) starting line with text to end - >#{match}<"
+           "\n#{m[:text]}  #{m[:name]}\n"
+    end
+###
+## beautify heading (pass 3 of 3) - anchor between heading marker and heading text
+##   ==== ‹§gra›Group A
+##     =>
+##   ==== Group A  ‹§gra›
+   html = html.gsub( /\n
+                          (?<heading_marker>
+                               ={2,})
+                               [ ]*
+                          (?<name>‹§
+                                    [^›]+?
+                                 ›)
+                             [ ]*
+                          (?<heading_text>[^\n]+?
+                          )
+                       \n
+                       /ixm ) do |match|
+           m = Regexp.last_match
+           match = match.gsub( "\n", '$$' )  ## make newlines visible for debugging
+           puts "   move anchor (a name) in heading to end - >#{match}<"
+           "\n#{m[:heading_marker]} #{m[:heading_text]}  #{m[:name]}\n"
+    end
+    html
+end
+end # class PageConverter
+end # module Rsssf

data/lib/rsssf/convert/html_to_txt/make_heading.rb ADDED Viewed

@@ -0,0 +1,70 @@
+###
+## <b><a name="fall">Opening Season 2024</a></b>  => <hb> ... </hb>
+## <u><a name="fplay">Playoff Stage</a></u>       => <hu> ... </hu>
+##
+##  (unofficial) heading "bold", heading "underline"
+##  note - MUST be one single "stand-alone" line  (in pre block) !!!
+=begin
+BU_ANAME_LINE_RE = %r{^ [ ]*  < (?<tag>B|U) >
+             [ ]* (?<text>
+                     <A [ ]+ NAME
+                      .+?
+                     </A>
+                   )
+             [ ]*  </ \k<tag> >
+             [ ]*
+        $}ix
+=end
+## matches a stand-alone line enclosed in <b>..</b> or <u>..</u> (opening and closing tag must be the same); used by make_heading
+BOLD_OR_UNDERLINE_LINE_RE = %r{^ [ ]*  < (?<tag> [BU]) >
+             [ ]* (?<text>
+                      .+?   ## note - use non-greedy match
+                   )
+             [ ]*  </ \k<tag> >
+             [ ]*
+        $}ix
+def make_heading( html )   ## turn b/u-enclosed a name lines into <hb>/<hu> headings; returns [html, edits]
+  edits = []   ## edit log lines (every line starts with --)
+  html = html.gsub( BOLD_OR_UNDERLINE_LINE_RE ) do |match|
+        m = Regexp.last_match
+        tag  = m[:tag].downcase
+        text = m[:text]
+        if text.downcase.start_with?( '<a name' )
+          msg =  "make heading (h#{tag}) out of #{tag}-enclosed a name in line >#{text}<"
+          puts " #{msg}"
+          ## note - edit line MUST start with --
+          ##         might be multi-line
+          edits << "-- #{msg}"
+          "<h#{tag}>#{text}</h#{tag}>"
+        else
+          ## note - skip (false positive) copyright line (in about this document)
+          ##  (C) Copyright RSSSF
+          ##      Copyright
+          if %r{copyright}i.match?( text )
+          else
+            msg =  "found #{tag}-enclosed line >#{text}< - heading?"
+            puts " #{msg}"
+            edits << "-- #{msg}"
+          end
+            match   ## keep as is (do NOT change) - report only
+        end
+  end
+  [html, edits]
+end