rsssf-parser 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +2 -0
- data/Manifest.txt +1 -0
- data/Rakefile +2 -1
- data/bin/rsssf +7 -5
- data/lib/rsssf/parser/linter.rb +90 -26
- data/lib/rsssf/parser/token-date.rb +6 -59
- data/lib/rsssf/parser/token-goals.rb +3 -3
- data/lib/rsssf/parser/token-note.rb +32 -4
- data/lib/rsssf/parser/token-text.rb +3 -2
- data/lib/rsssf/parser/token.rb +75 -16
- data/lib/rsssf/parser/version.rb +24 -0
- data/lib/rsssf/parser.rb +10 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 52ee31bde399793292f0978a0f3109be47f2df56de7e35fb013f6f47d33a5ff0
+  data.tar.gz: d2ecbbb9f5935d97a9520c65e30a4a4ce0fab6b6817e97e70932c4f73e02f269
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 403573de54f0dba9155ec8efa264eb5f69dc3785cb7411cbf97d4fbbf033734370bdfe61d6eaff5d1a5939a2972bd12ea9a2fcf9fb4abdd14077e85b9d9a3d3a
+  data.tar.gz: 54414cabdff9a1804f9ce5256dd34d5d0b70b7df7e9d3d22fb2fa3191712436679763370c019b78b8ccc051196feed31edfd5e2807b42c845789e848bf3e5e50
data/CHANGELOG.md
CHANGED
data/Manifest.txt
CHANGED
data/Rakefile
CHANGED
@@ -1,9 +1,10 @@
 require 'hoe'
+require './lib/rsssf/parser/version.rb'
 
 
 Hoe.spec 'rsssf-parser' do
 
-  self.version =
+  self.version = SportDb::Module::RsssfParser::VERSION
 
   self.summary = "rsssf-parser - football match schedule & results parser (& tokenizer) for the rsssf format / conventions"
   self.description = summary
data/bin/rsssf
CHANGED
@@ -45,8 +45,7 @@ paths = if args.empty?
            '../../../rsssf/austria/2010-11/cup.txt',
           ]
         else
-
-          args
+          SportDb::Parser::Opts.expand_args( args )
         end
 
 
@@ -60,15 +59,18 @@ Rsssf::Parser::Linter.debug = true if opts[:debug]
 linter = Rsssf::Parser::Linter.new
 
 
+errors = []
+
 paths.each_with_index do |path,i|
   puts "==> [#{i+1}/#{paths.size}] reading >#{path}<..."
   linter.read( path, parse: !opts[:metal] )
+  errors += linter.errors if linter.errors?
 end
 
-if
+if errors.size > 0
   puts
-  pp
-  puts "!! #{
+  pp errors
+  puts "!! #{errors.size} parse error(s) in #{paths.size} datafiles(s)"
 else
   puts "OK no parse errors found in #{paths.size} datafile(s)"
 end
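The script now aggregates parse errors across all datafiles instead of inspecting a single run. A minimal sketch of driving the same linter API from plain Ruby, assuming only the calls visible in this diff (read, errors?, errors); the datafile path is hypothetical:

    require 'rsssf/parser'

    paths  = ['austria/2010-11/cup.txt']      # hypothetical datafile(s)
    errors = []

    linter = Rsssf::Parser::Linter.new
    paths.each do |path|
      linter.read( path, parse: true )        # tokenize & parse
      errors += linter.errors  if linter.errors?
    end

    puts errors.empty? ? "OK - no parse errors" : "#{errors.size} parse error(s)"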
data/lib/rsssf/parser/linter.rb
CHANGED
@@ -25,45 +25,78 @@ def errors?() @errors.size > 0; end
 
 
 
+def read( path, parse: false )
+  parse( read_text( path ), parse: parse,
+         path: path )
+end
+
 #########
 ## parse - false (default) - tokenize (only)
 ##         - true - tokenize & parse
-
-
-## fix - (re)use outline reader later!!!
-##        plus check for headings etc.
-
-  text = File.open( path, 'r:utf-8' ) { |f| f.read }
-  lines = text.split( "\n" )
-
+##
+## todo/fix - change path to file or such - why? why not?
 
-  ## process lines
-  tree = []
-  lines.each do |line|
 
-
-    next if line.strip.empty? || line.strip.start_with?('#')
+MAX_ERRORS = 13 ## stop after 13 errors
 
-
-
+def parse( txt, parse: false,
+           path: 'path/to/filename/here' )
+  ## note: every (new) read call - resets errors list to empty
+  @errors = []
 
+  nodes = SportDb::OutlineReader.parse( txt )
 
-
-
-
-
+  ## process nodes
+  h1 = nil
+  orphans = 0 ## track paragraphs with no heading
+  paragraphs = 0 ## track paragraphs with heading
 
-
-
-
-
-
+  nodes.each do |node|
+    type = node[0]
+
+    if type == :h1
+      h1 = node[1] ## get heading text
+      ## puts
+      puts " = Heading 1 >#{node[1]}<"
+    elsif type == :p
+
+      if h1.nil?
+        orphans += 1 ## only warn once (at the end; see below)
+        next
+      end
+
+      paragraphs += 1
+
+      lines = node[1]
+
+      tree = []
+      lines.each_with_index do |line,i|
+
+        if debug?
+          puts
+          puts "line >#{line}<"
+        end
+
+        t, error_messages = if parse
+                              @parser.parse_with_errors( line )
+                            else
+                              @parser.tokenize_with_errors( line )
+                            end
 
 
        if error_messages.size > 0
          ## add to "global" error list
          ## make a triplet tuple (file / msg / line text)
          error_messages.each do |msg|
+
+            ## note - stop processing / adding errors if hit MAX ERRORS
+            if @errors.size >= MAX_ERRORS
+              @errors << [ path,
+                           "stop after #{MAX_ERRORS} errors",
+                           '']
+              return
+            end
+
            @errors << [ path,
                         msg,
                         line
@@ -74,9 +107,40 @@ def read( path, parse: false )
        pp t if debug?
 
        tree << t
-  end
+      end
       ## pp tree
-
+    else
+      pp node
+      raise ArgumentError, "unsupported (node) type >#{type}<"
+    end
+  end # each node
+
+  ## no heading and no orphans => assume empty file (comments only)!!!
+  if h1.nil? && orphans == 0
+    puts " !! WARN - no heading(s) and paragraph(s) found"
+    @errors << [ path,
+                 "warn - no heading(s) and paragraph(s) found",
+                 "" ## pass along empty line
+               ]
+  end
+
+  if orphans > 0
+    puts " !! WARN - no heading for #{orphans} text paragraph(s); skipping parse"
+    @errors << [ path,
+                 "warn - no heading for #{orphans} text paragraph(s); skipping parse",
+                 "" ## pass along empty line
+               ]
+  end
+
+  if h1 && paragraphs == 0
+    puts " !! WARN - heading with no text paragraph(s)"
+    @errors << [ path,
+                 "warn - heading with no text paragraph(s)",
+                 "" ## pass along empty line
+               ]
+  end
+
+end # parse
 end # class Linter
 
 
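Each recorded error is a (file, message, line) triplet, and collection stops once MAX_ERRORS (13) is hit. A small consumption sketch, again assuming only the API shown in this diff; the datafile name is hypothetical:

    linter = Rsssf::Parser::Linter.new
    linter.read( 'cup.txt', parse: true )    # hypothetical datafile

    linter.errors.each do |path, msg, line|
      puts "#{path}: #{msg}"
      puts "  >#{line}<" unless line.empty?
    end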
data/lib/rsssf/parser/token-date.rb
CHANGED
@@ -2,62 +2,9 @@ module Rsssf
 class Parser
 
 
-
-def self.parse_names( txt )
-  lines = [] # array of lines (with words)
-
-  txt.each_line do |line|
-    line = line.strip
-
-    next if line.empty?
-    next if line.start_with?( '#' ) ## skip comments too
-
-    ## strip inline (until end-of-line) comments too
-    ## e.g. Janvier Janv Jan ## check janv in use??
-    ## => Janvier Janv Jan
-
-    line = line.sub( /#.*/, '' ).strip
-    ## pp line
-
-    values = line.split( /[ \t]+/ )
-    ## pp values
-
-    ## todo/fix -- add check for duplicates
-    lines << values
-  end
-  lines
-
-end # method parse
-
-
-def self.build_names( lines )
-  ## join all words together into a single string e.g.
-  ## January|Jan|February|Feb|March|Mar|April|Apr|May|June|Jun|...
-  lines.map { |line| line.join('|') }.join('|')
-end
-
-
-
-## add normalize option (for downcase) - why? why not?
-def self.build_map( lines )
-  ## note: downcase name!!!
-  ## build a lookup map that maps the word to the index (line no) plus 1 e.g.
-  ## {"january" => 1, "jan" => 1,
-  ## "february" => 2, "feb" => 2,
-  ## "march" => 3, "mar" => 3,
-  ## "april" => 4, "apr" => 4,
-  ## "may" => 5,
-  ## "june" => 6, "jun" => 6, ...
-  lines.each_with_index.reduce( {} ) do |h,(line,i)|
-    line.each { |name| h[ name.downcase ] = i+1 } ## note: start mapping with 1 (and NOT zero-based, that is, 0)
-    h
-  end
-end
-
-
 ## note - support only 5 letter max for now
 ## now January|February|August etc.
-MONTH_LINES = parse_names( <<TXT )
+MONTH_LINES = SportDb::Parser.parse_names( <<TXT )
 Jan
 Feb
 March Mar
@@ -72,15 +19,15 @@ Nov
 Dec
 TXT
 
-MONTH_NAMES = build_names( MONTH_LINES )
+MONTH_NAMES = SportDb::Parser.build_names( MONTH_LINES )
 # pp MONTH_NAMES
-MONTH_MAP = build_map( MONTH_LINES )
+MONTH_MAP = SportDb::Parser.build_map( MONTH_LINES, downcase: true )
 # pp MONTH_MAP
 
 
 ### nnote - only support two or three letters
 ## no Tues | Thur | Thurs | Sunday etc.
-DAY_LINES = parse_names( <<TXT )
+DAY_LINES = SportDb::Parser.parse_names( <<TXT )
 Mon Mo
 Tue Tu
 Wed We
@@ -91,9 +38,9 @@ Sun Su
 TXT
 
 
-DAY_NAMES = build_names( DAY_LINES )
+DAY_NAMES = SportDb::Parser.build_names( DAY_LINES )
 # pp DAY_NAMES
-DAY_MAP = build_map( DAY_LINES )
+DAY_MAP = SportDb::Parser.build_map( DAY_LINES, downcase: true )
 # pp DAY_MAP
 
 
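The month/day name helpers (parse_names, build_names, build_map) now come from the upstream sportdb-parser gem, with an explicit downcase: true for the lookup maps. A small sketch of what the helpers return, assuming they behave like the removed inline versions documented in the comments above:

    lines = SportDb::Parser.parse_names( "Jan\nFeb\nMarch Mar\n" )
    #=> [["Jan"], ["Feb"], ["March", "Mar"]]

    SportDb::Parser.build_names( lines )
    #=> "Jan|Feb|March|Mar"                          ## alternatives for a regex

    SportDb::Parser.build_map( lines, downcase: true )
    #=> {"jan"=>1, "feb"=>2, "march"=>3, "mar"=>3}   ## 1-based line index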
data/lib/rsssf/parser/token-goals.rb
CHANGED
@@ -29,12 +29,12 @@ MINUTE_RE = %r{
     (?:
       \d{1,3}
       '? ## optional minute quote (')
-      (?= (og|pen|p)? ([ ;,\]]|$))
+      (?= (og|o|pen|p)? ([ ;,\]\)]|$))
     )
   )
 )
 |
-  (?= (og|pen|p)? ([ ;,\]]|$)) # note - break can be og|pen|p too
+  (?= (og|o|pen|p)? ([ ;,\]\)]|$)) # note - break can be og|pen|p too
 )
 )}ix
 ### note - word boundary (\b) will NOT work for quoet (')
@@ -55,7 +55,7 @@ GOAL_PEN_RE = %r{
 GOAL_OG_RE = %r{
   (?<og>
     (?<=\d|\+|[ ]|') ## must follow a number or plus (e.g. 45og / 45+og / 45 og) or space
-    og
+    (?: og|o )
    \b
   )
 }ix
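The minute and own-goal patterns now also accept a bare "o" abbreviation and treat a closing ")" as a valid terminator, matching the new parenthesis support in token.rb. A standalone approximation of the widened own-goal match, written as a plain regex rather than the GOAL_OG_RE constant itself:

    og_re = /(?<=\d|\+|[ ]|')(?:og|o)\b/i

    '45og'.match?( og_re )    #=> true
    '45+o'.match?( og_re )    #=> true
    '45 og'.match?( og_re )   #=> true
    '45pen'.match?( og_re )   #=> false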
data/lib/rsssf/parser/token-note.rb
CHANGED
@@ -6,8 +6,8 @@ class Parser
 ## move to token-note(s) file !!!!
 ##
 
-
-  \[
+NOTE_BASICS_RE = %r{
+  (?<note_open> \[ )
   (?<note>
     (?: ## starting with ___ PLUS requiring more text
      (?:
@@ -97,16 +97,44 @@ NOTE_RE = %r{
       |
       replay
       ## e.g. [replay]
+      |
+      verified
+      ## e.g. [verified 2:0 wo.]
     )
     ([ ] ## note - optional text
      [^\]]+?
     )? ## slurp all to next ] - (use non-greedy)
    )
-  ) # note capture
-
+  ) # note capture
+
+  (?:
+    (?<note_close> \] )
+    | $ ## note - allow open notes (that continue on next line)
+  )
+}ix
+
+
+NOTE_MORE_RE = %r{
+  (?<=[ ]) ## one (leading) space min. required
+  (?<note_cont>
+    [⮑…] |
+    \.{2,3} ### .. or ...
+  )
+  [ ]*
+  (?<note>
+    [^\]]+? ## non-greeedy
+  )
+  (?:
+    (?<note_close> \] )
+    | $ ## note - allow open notes (that continue on next line)
+  )
 }ix
 
 
+NOTE_RE = Regexp.union( NOTE_BASICS_RE,
+                        NOTE_MORE_RE,
+                      )
+
 
 end # class Parser
 end # module Rsssf
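Notes may now stay open at the end of a line and continue on the next line, introduced by a continuation marker (⮑, …, or two/three dots); NOTE_RE becomes a union of the basic and the continuation pattern. A standalone approximation of the continuation match, shown as a plain regex rather than the NOTE_MORE_RE constant:

    note_more = /(?<=[ ])(?<note_cont>[⮑…]|\.{2,3})[ ]*(?<note>[^\]]+?)(?:(?<note_close>\])|$)/

    m = ' … awarded 2:0; match abandoned]'.match( note_more )
    m[:note_cont]    #=> "…"
    m[:note]         #=> "awarded 2:0; match abandoned"
    m[:note_close]   #=> "]"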
data/lib/rsssf/parser/token-text.rb
CHANGED
@@ -80,8 +80,9 @@ TEXT_STRICT_RE = %r{
 
   ## positive lookahead
   ## cannot use \b if text ends in dot (.) or other non-alphnum
-  ## than \b will not work
-
+  ## than \b will not work
+  ## not - add () too for now - why? why not?
+  (?=[ ,;@|\[\]\(\)]
      |$
   )
 )
data/lib/rsssf/parser/token.rb
CHANGED
@@ -9,7 +9,7 @@ BASICS_RE = %r{
   (?<spaces> [ ]{2,}) |
   (?<space> [ ])
 |
-  (?<sym>[;,@|\[\]])
+  (?<sym>[;,@|\[\]\(\)]) ## note - add () too - why? why not?
 }ix
 
 
@@ -38,10 +38,37 @@ RE = Regexp.union( GROUP_RE, ROUND_RE, LEG_RE,
                    TEXT_RE )
 
 
+
+### rename to dash or to ???
+#### used to add/allow hyphen/dash (-) in INSIDE_RE
+HYPHEN_RE = %r{ ## must be space before and after (or end of line)!!!
+    ## note - uses SYM capture
+    (?<sym>
+      (?<=[ ]) # Positive lookbehind for space
+      -
+      (?=[ ]|$) # positive lookahead for space
+    )
+}ix
+
+### rename to ?? use SCORE_AT for now - why? why not?
+## add support for score at/score points/markers
+### e.g. [1-0 Andrei 08, 1-1 Rydlewicz 24, 1-2 Prica 85, 2-2 Bella 88,
+##        2-3 Arvidsson 102]
+
+SCORE_AT_RE = %r{ (?<score_at>
+                    \b
+                    \d{1,2}-\d{1,2}
+                    \b
+                  )
+}ix
+
+
+
 ## "strict" text match mode inside brackets
 ## ]
-INSIDE_RE = Regexp.union(
-
+INSIDE_RE = Regexp.union( SCORE_AT_RE,
+                          GOAL_OG_RE, GOAL_PEN_RE,
+                          BASICS_RE, HYPHEN_RE,
                           TEXT_STRICT_RE,
                           MINUTE_RE,
                         )
@@ -56,6 +83,11 @@ def log( msg )
 end
 
 
+## open/close pairs - lookup close (by open char)
+SYM_CLOSE = {
+  '(' => ')',
+  '[' => ']',
+}
 
 def tokenize_with_errors( line, debug: false )
   tokens = []
@@ -72,6 +104,7 @@ def tokenize_with_errors( line, debug: false )
   ####
   ## quick hack - keep re state/mode between tokenize calls!!!
   @re ||= RE ## note - switch between RE & INSIDE_RE
+
 
 
   while m = @re.match( line, pos )
@@ -84,10 +117,14 @@ def tokenize_with_errors( line, debug: false )
     if offsets[0] != pos
       ## match NOT starting at start/begin position!!!
       ## report parse error!!!
-
+
+      ctx = @re == INSIDE_RE ? 'INSIDE_RE' : 'RE' ## assume RE
+      ## fix/change - use str.inspect to show tabs (\t)
+      ## and possibly other special characters causing trouble
+      msg = " !! WARN - parse error (#{ctx}) - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
       puts msg
 
-      errors << "parse error - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]}"
+      errors << "parse error (#{ctx}) - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]}"
      log( msg )
     end
 
@@ -109,7 +146,9 @@ def tokenize_with_errors( line, debug: false )
                 [:text, m[:text]] ## keep pos - why? why not?
               elsif m[:minute]
                 [:minute, m[:minute]]
-              elsif m[:
+              elsif m[:score_at]
+                [:score_at, m[:score_at]]
+              elsif m[:og]
                 [:og, m[:og]] ## for typed drop - string version/variants
               elsif m[:pen]
                 [:pen, m[:pen]]
@@ -121,12 +160,21 @@ def tokenize_with_errors( line, debug: false )
               when ';' then [:';']
               when '@' then [:'@']
               when '|' then [:'|']
-              when '['
-
+              when '-' then [:'-']
+              when '[', '('
+                if sym == @sym_open
+                  ## report error - already in inside mode!!!
+                  ## e.g. another [ in [] or ( in ()
+                  log( "warn - unexpected (opening) #{sym} in inside (goal) mode in line >#{line}<" )
+                end
                nil
-              when ']'
-                puts " leave inside match mode"
-                @
+              when ']', ')' ## allow [] AND () for inside mode
+                ## puts " leave inside match mode"
+                if sym == @sym_close
+                  @re = RE
+                  @sym_open = nil ## reset sym_open/close
+                  @sym_close = nil
+                end
                nil
              else
                nil ## ignore others (e.g. brackets [])
@@ -176,12 +224,15 @@ def tokenize_with_errors( line, debug: false )
             when ';' then [:';']
             when '@' then [:'@']
             when '|' then [:'|']
-            when '['
+            when '[', '('
               ## switch to inside mode!!!
-              puts " enter inside match mode"
+              ## puts " enter inside match mode"
               @re = INSIDE_RE
+              @sym_open = sym ## record open/close style - why? why not?
+              @sym_close = SYM_CLOSE[sym]
               nil
-            when ']'
+            when ']', ')'
+              log( "warn - unexpected (closing) #{sym} in standard mode in line >#{line}<" )
               ## already in standard mode/ctx
               ## report warn/error - why? why not?
               nil
@@ -204,13 +255,21 @@ def tokenize_with_errors( line, debug: false )
     end
   end
 
+
   ## check if no match in end of string
   if offsets[1] != line.size
-
+
+    ## note - report regex context
+    ## e.g. RE or INSIDE_RE to help debugging/troubleshooting format errors
+    ctx = @re == INSIDE_RE ? 'INSIDE_RE' : 'RE' ## assume RE
+    ## fix/change - use str.inspect to show tabs (\t)
+    ## and possibly other special characters causing trouble
+
+    msg = " !! WARN - parse error (#{ctx}) - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size} in line >#{line}<"
     puts msg
     log( msg )
 
-    errors << "parse error - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size}"
+    errors << "parse error (#{ctx}) - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size}"
   end
 
 
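Bracketed goal details can now also be wrapped in parentheses (the tokenizer remembers the expected closer via SYM_CLOSE), and inside mode gains a score-at marker for lines like [1-0 Andrei 08, 1-1 Rydlewicz 24, ...]. A standalone approximation of that marker, as a plain regex rather than the SCORE_AT_RE constant, which only applies in inside (bracket) mode:

    score_at = /\b\d{1,2}-\d{1,2}\b/

    '[1-0 Andrei 08, 1-1 Rydlewicz 24]'.scan( score_at )
    #=> ["1-0", "1-1"]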
data/lib/rsssf/parser/version.rb
ADDED
@@ -0,0 +1,24 @@
+
+module SportDb
+module Module
+module RsssfParser
+  MAJOR = 0 ## todo: namespace inside version or something - why? why not??
+  MINOR = 1
+  PATCH = 0
+  VERSION = [MAJOR,MINOR,PATCH].join('.')
+
+  def self.version
+    VERSION
+  end
+
+  def self.banner
+    "rsssf-parser/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}] in (#{root})"
+  end
+
+  def self.root
+    File.expand_path( File.dirname(File.dirname(File.dirname(File.dirname(__FILE__)))) )
+  end
+
+end # module RsssfParser
+end
+end
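Gem version and build info now live in a dedicated module, and parser.rb prints the banner once on require (see the "say hello" line below). Usage sketch; the printed Ruby version and platform depend on the host:

    require 'rsssf/parser'

    SportDb::Module::RsssfParser::VERSION   #=> "0.1.0"
    SportDb::Module::RsssfParser.version    #=> "0.1.0"
    puts SportDb::Module::RsssfParser.banner
    # e.g. rsssf-parser/0.1.0 on Ruby 3.3.4 (2024-07-09) [x86_64-linux] in (...)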
data/lib/rsssf/parser.rb
CHANGED
@@ -2,9 +2,17 @@
 ####
 ## build on "standard" parse
 require 'sportdb/parser'
+## pulled in for/uses only
+## - SportDb::Parser::Tokens !!!
+##
+## plus in the future pull in SportDb::OutlineReader
+##
+## note - pulls in more deps e.g. cococs AND season-formats
+
 
 
 ## our own code
+require_relative 'parser/version'
 require_relative 'parser/token-text'
 require_relative 'parser/token-note'
 require_relative 'parser/token-round' ## round (& group)
@@ -19,3 +27,5 @@ require_relative 'parser/linter'
 
 
 
+# say hello
+puts SportDb::Module::RsssfParser.banner
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rsssf-parser
 version: !ruby/object:Gem::Version
-  version: 0.0
+  version: 0.1.0
 platform: ruby
 authors:
 - Gerald Bauer
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2024-07-
+date: 2024-07-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: sportdb-parser
@@ -84,6 +84,7 @@ files:
 - lib/rsssf/parser/token-score.rb
 - lib/rsssf/parser/token-text.rb
 - lib/rsssf/parser/token.rb
+- lib/rsssf/parser/version.rb
 homepage: https://github.com/sportdb/sport.db
 licenses:
 - Public Domain