RubyGems - rsssf-parser - Versions diffs - 0.0.1 → 0.1.0 - Mend

rsssf-parser 0.0.1 → 0.1.0

Files changed (14) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +2 -0
data/Manifest.txt +1 -0
data/Rakefile +2 -1
data/bin/rsssf +7 -5
data/lib/rsssf/parser/linter.rb +90 -26
data/lib/rsssf/parser/token-date.rb +6 -59
data/lib/rsssf/parser/token-goals.rb +3 -3
data/lib/rsssf/parser/token-note.rb +32 -4
data/lib/rsssf/parser/token-text.rb +3 -2
data/lib/rsssf/parser/token.rb +75 -16
data/lib/rsssf/parser/version.rb +24 -0
data/lib/rsssf/parser.rb +10 -0
metadata +3 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 1b6cfe7842f0f46d242c1c2fc1f52b4c032b5c25fce314939583c7f96a486c65
-  data.tar.gz: ba5244b284f65129dca3b35e87d10984e1bf8906e571b3e42e85a4615eecb733
+  metadata.gz: 52ee31bde399793292f0978a0f3109be47f2df56de7e35fb013f6f47d33a5ff0
+  data.tar.gz: d2ecbbb9f5935d97a9520c65e30a4a4ce0fab6b6817e97e70932c4f73e02f269
 SHA512:
-  metadata.gz: 9a63d121c858e35f757b59c490fc05cfd1457ac5c6e3294a291db787da141061f046c5ce2342fdf275e64bbb647934ef43d8547c2aa53eef537d12405d746185
-  data.tar.gz: e7a8f41d2d53e63fb72e35f22b1a0a0da370e15ab1b4aedf2ae1f37a2a1bc277ff31deb8362e541bbc403f7d49ea3f1120b4b271b9d36982359141ca80617d72
+  metadata.gz: 403573de54f0dba9155ec8efa264eb5f69dc3785cb7411cbf97d4fbbf033734370bdfe61d6eaff5d1a5939a2972bd12ea9a2fcf9fb4abdd14077e85b9d9a3d3a
+  data.tar.gz: 54414cabdff9a1804f9ce5256dd34d5d0b70b7df7e9d3d22fb2fa3191712436679763370c019b78b8ccc051196feed31edfd5e2807b42c845789e848bf3e5e50

data/CHANGELOG.md CHANGED Viewed

@@ -1,3 +1,5 @@
+### 0.1.0
 ### 0.0.1 / 2024-07-17
 * Everything is new. First release.

data/Manifest.txt CHANGED Viewed

@@ -13,3 +13,4 @@ lib/rsssf/parser/token-round.rb
 lib/rsssf/parser/token-score.rb
 lib/rsssf/parser/token-text.rb
 lib/rsssf/parser/token.rb
+lib/rsssf/parser/version.rb

data/Rakefile CHANGED Viewed

@@ -1,9 +1,10 @@
 require 'hoe'
+require './lib/rsssf/parser/version.rb'
 Hoe.spec 'rsssf-parser' do
-  self.version = '0.0.1'
+  self.version = SportDb::Module::RsssfParser::VERSION
   self.summary = "rsssf-parser - football match schedule & results parser (& tokenizer) for the rsssf format / conventions"
   self.description = summary

data/bin/rsssf CHANGED Viewed

@@ -45,8 +45,7 @@ paths =  if args.empty?
             '../../../rsssf/austria/2010-11/cup.txt',
           ]
          else
-            ## todo/fix - expand_args!!!
-            args
+            SportDb::Parser::Opts.expand_args( args )
          end
@@ -60,15 +59,18 @@ Rsssf::Parser::Linter.debug = true    if opts[:debug]
 linter = Rsssf::Parser::Linter.new
+errors = []
 paths.each_with_index do |path,i|
     puts "==> [#{i+1}/#{paths.size}] reading >#{path}<..."
     linter.read( path, parse: !opts[:metal] )
+    errors += linter.errors  if linter.errors?
 end
-if linter.errors?
+if errors.size > 0
     puts
-    pp linter.errors
-    puts "!!   #{linter.errors.size} parse error(s) in #{paths.size} datafiles(s)"
+    pp errors
+    puts "!!   #{errors.size} parse error(s) in #{paths.size} datafiles(s)"
 else
     puts "OK   no parse errors found in #{paths.size} datafile(s)"
 end

data/lib/rsssf/parser/linter.rb CHANGED Viewed

@@ -25,45 +25,78 @@ def errors?() @errors.size > 0; end
+def read( path, parse: false )
+     parse( read_text( path ), parse: parse,
+                               path:  path )
+end
 #########
 ## parse - false (default) - tokenize (only)
 ##       - true            - tokenize & parse
-def read( path, parse: false )
-  ## fix - (re)use outline reader later!!!
-  ##   plus check for headings etc.
-  text = File.open( path, 'r:utf-8' ) { |f| f.read }
-  lines = text.split( "\n" )
+##
+## todo/fix - change path to file or such - why? why not?
-  ##  process lines
-  tree = []
-  lines.each do |line|
-    ## skip blank and comment lines
-    next if line.strip.empty? || line.strip.start_with?('#')
+MAX_ERRORS = 13   ## stop after 13 errors
-    ## strip inline (end-of-line) comments
-    line = line.sub( /#.+$/, '' )
+def parse( txt, parse: false,
+                path: 'path/to/filename/here' )
+  ## note: every (new) parse call (incl. via read) - resets errors list to empty
+  @errors = []
+  nodes = SportDb::OutlineReader.parse( txt )
-    if debug?
-      puts
-      puts "line >#{line}<"
-    end
+  ##  process nodes
+  h1         = nil
+  orphans    = 0    ## track paragraphs with no heading
+  paragraphs = 0    ## track paragraphs with heading
-    t, error_messages  =  if parse
-                            @parser.parse_with_errors( line )
-                          else
-                            @parser.tokenize_with_errors( line )
-                          end
+  nodes.each do |node|
+    type = node[0]
+    if type == :h1
+        h1 = node[1]  ## get heading text
+        ## puts
+        puts "  = Heading 1 >#{node[1]}<"
+    elsif type == :p
+      if h1.nil?
+        orphans += 1    ## only warn once (at the end; see below)
+        next
+      end
+      paragraphs += 1
+      lines = node[1]
+      tree = []
+      lines.each_with_index do |line,i|
+        if debug?
+          puts
+          puts "line >#{line}<"
+        end
+        t, error_messages  =  if parse
+                                @parser.parse_with_errors( line )
+                              else
+                                @parser.tokenize_with_errors( line )
+                              end
     if error_messages.size > 0
       ## add to "global" error list
       ##   make a triplet tuple (file / msg / line text)
             error_messages.each do |msg|
+                ## note - stop processing / adding errors if hit MAX ERRORS
+                if @errors.size >= MAX_ERRORS
+                   @errors << [ path,
+                                 "stop after #{MAX_ERRORS} errors",
+                                 '']
+                   return
+                end
                 @errors << [ path,
                              msg,
                              line
@@ -74,9 +107,40 @@ def read( path, parse: false )
     pp t   if debug?
     tree << t
-  end
+  end
   ## pp tree
-end  # read
+else
+  pp node
+  raise ArgumentError, "unsupported (node) type >#{type}<"
+end
+end  # each node
+  ## no heading and no orphans => assume empty file (comments only)!!!
+  if h1.nil? && orphans == 0
+    puts "  !! WARN - no heading(s) and paragraph(s) found"
+     @errors << [ path,
+                  "warn - no heading(s) and paragraph(s) found",
+                  ""  ## pass along empty line
+                ]
+  end
+  if orphans > 0
+    puts "  !! WARN - no heading for #{orphans} text paragraph(s); skipping parse"
+    @errors << [ path,
+                  "warn - no heading for #{orphans} text paragraph(s); skipping parse",
+                  ""  ## pass along empty line
+               ]
+  end
+  if h1 && paragraphs == 0
+    puts "  !! WARN - heading with no text paragraph(s)"
+    @errors << [ path,
+                  "warn - heading with no text paragraph(s)",
+                  ""  ## pass along empty line
+               ]
+  end
+end  # parse
 end  # class Linter

data/lib/rsssf/parser/token-date.rb CHANGED Viewed

@@ -2,62 +2,9 @@ module Rsssf
 class Parser
-def self.parse_names( txt )
-  lines = [] # array of lines (with words)
-  txt.each_line do |line|
-    line = line.strip
-    next if line.empty?
-    next if line.start_with?( '#' )   ## skip comments too
-    ## strip inline (until end-of-line) comments too
-    ##   e.g. Janvier  Janv  Jan  ## check janv in use??
-    ##   =>   Janvier  Janv  Jan
-    line = line.sub( /#.*/, '' ).strip
-    ## pp line
-    values = line.split( /[ \t]+/ )
-    ## pp values
-    ## todo/fix -- add check for duplicates
-    lines << values
-  end
-  lines
-end # method parse
-def self.build_names( lines )
-  ## join all words together into a single string e.g.
-  ##   January|Jan|February|Feb|March|Mar|April|Apr|May|June|Jun|...
-  lines.map { |line| line.join('|') }.join('|')
-end
-## add normalize option (for downcase) - why? why not?
-def self.build_map( lines )
-    ## note: downcase name!!!
-   ## build a lookup map that maps the word to the index (line no) plus 1 e.g.
-   ##  {"january" => 1,  "jan" => 1,
-   ##   "february" => 2, "feb" => 2,
-   ##   "march" => 3,    "mar" => 3,
-   ##   "april" => 4,    "apr" => 4,
-   ##   "may" => 5,
-   ##   "june" => 6,     "jun" => 6, ...
-   lines.each_with_index.reduce( {} ) do |h,(line,i)|
-     line.each { |name| h[ name.downcase ] = i+1 }  ## note: start mapping with 1 (and NOT zero-based, that is, 0)
-     h
-   end
-end
 ## note -  support only 5 letters max for now
 ##    no January|February|August etc.
-MONTH_LINES = parse_names( <<TXT )
+MONTH_LINES = SportDb::Parser.parse_names( <<TXT )
 Jan
 Feb
 March      Mar
@@ -72,15 +19,15 @@ Nov
 Dec
 TXT
-MONTH_NAMES = build_names( MONTH_LINES )
+MONTH_NAMES = SportDb::Parser.build_names( MONTH_LINES )
 # pp MONTH_NAMES
-MONTH_MAP   = build_map( MONTH_LINES )
+MONTH_MAP   = SportDb::Parser.build_map( MONTH_LINES, downcase: true )
 # pp MONTH_MAP
 ### note - only support two or three letters
 ##    no Tues | Thur | Thurs | Sunday etc.
-DAY_LINES = parse_names( <<TXT )
+DAY_LINES = SportDb::Parser.parse_names( <<TXT )
 Mon  Mo
 Tue  Tu
 Wed  We
@@ -91,9 +38,9 @@ Sun  Su
 TXT
-DAY_NAMES = build_names( DAY_LINES )
+DAY_NAMES = SportDb::Parser.build_names( DAY_LINES )
 # pp DAY_NAMES
-DAY_MAP   = build_map( DAY_LINES )
+DAY_MAP   = SportDb::Parser.build_map( DAY_LINES, downcase: true )
 # pp DAY_MAP

data/lib/rsssf/parser/token-goals.rb CHANGED Viewed

@@ -29,12 +29,12 @@ MINUTE_RE = %r{
                     (?:
                       \d{1,3}
                       '?   ## optional minute quote (')
-                      (?= (og|pen|p)? ([ ;,\]]|$))
+                      (?= (og|o|pen|p)? ([ ;,\]\)]|$))
                     )
                   )
                 )
                 |
-                (?= (og|pen|p)? ([ ;,\]]|$))  # note - break can be og|pen|p too
+                (?= (og|o|pen|p)? ([ ;,\]\)]|$))  # note - break can be og|pen|p too
          )
       )}ix
 ### note - word boundary (\b) will NOT work for quote (')
@@ -55,7 +55,7 @@ GOAL_PEN_RE = %r{
 GOAL_OG_RE = %r{
    (?<og>
         (?<=\d|\+|[ ]|')	## must follow a number or plus (e.g. 45og / 45+og / 45 og) or space
-          og
+          (?: og|o )
           \b
    )
 }ix

data/lib/rsssf/parser/token-note.rb CHANGED Viewed

@@ -6,8 +6,8 @@ class Parser
 ##  move to token-note(s) file !!!!
 ##
-NOTE_RE = %r{
-    \[
+NOTE_BASICS_RE = %r{
+    (?<note_open> \[ )
    (?<note>
      (?:  ##  starting with ___   PLUS requiring more text
        (?:
@@ -97,16 +97,44 @@ NOTE_RE = %r{
               |
               replay
               ## e.g.  [replay]
+              |
+              verified
+              ## e.g.  [verified 2:0 wo.]
           )
         ([ ]    ## note - optional text
           [^\]]+?
          )?         ## slurp all to next ] - (use non-greedy)
       )
-    )    # note capture
-     \]
+    )    # note capture
+     (?:
+         (?<note_close> \] )
+         | $ ## note - allow open notes (that continue on next line)
+      )
+}ix
+NOTE_MORE_RE = %r{
+      (?<=[ ])  ## one (leading) space min. required
+       (?<note_cont>
+             [⮑…] |
+             \.{2,3}   ### .. or ...
+       )
+        [ ]*
+       (?<note>
+            [^\]]+?   ## non-greedy
+          )
+       (?:
+         (?<note_close> \] )
+         | $ ## note - allow open notes (that continue on next line)
+       )
 }ix
+NOTE_RE  = Regexp.union(  NOTE_BASICS_RE,
+                          NOTE_MORE_RE,
+                         )
 end  #   class Parser
 end  #   module Rsssf

data/lib/rsssf/parser/token-text.rb CHANGED Viewed

@@ -80,8 +80,9 @@ TEXT_STRICT_RE = %r{
         ## positive lookahead
         ##   cannot use \b  if text ends in dot (.) or other non-alphnum
-        ##        than \b will not work
-            (?=[ ,;@|\[\]]
+        ##        than \b will not work
+        ##   note   - add () too for now - why? why not?
+            (?=[ ,;@|\[\]\(\)]
                  |$
             )
     )

data/lib/rsssf/parser/token.rb CHANGED Viewed

@@ -9,7 +9,7 @@ BASICS_RE = %r{
     (?<spaces> [ ]{2,}) |
     (?<space>  [ ])
         |
-    (?<sym>[;,@|\[\]])
+    (?<sym>[;,@|\[\]\(\)])     ## note - add () too  - why? why not?
 }ix
@@ -38,10 +38,37 @@ RE = Regexp.union(  GROUP_RE, ROUND_RE, LEG_RE,
                      TEXT_RE )
+###  rename to dash or to ???
+####   used to add/allow hyphen/dash (-) in INSIDE_RE
+HYPHEN_RE = %r{   ## must be space before and after (or end of line)!!!
+                  ##  note - uses SYM capture
+                    (?<sym>
+                      (?<=[ ])	# Positive lookbehind for space
+                         -
+                       (?=[ ]|$)   # positive lookahead for space
+                    )
+                }ix
+### rename to ??  use SCORE_AT for now - why? why not?
+##   add support for score at/score points/markers
+###  e.g.  [1-0 Andrei 08, 1-1 Rydlewicz 24, 1-2 Prica 85, 2-2 Bella 88,
+##      2-3 Arvidsson 102]
+SCORE_AT_RE = %r{ (?<score_at>
+                    \b
+                    \d{1,2}-\d{1,2}
+                    \b
+                  )
+              }ix
 ## "strict" text match mode inside brackets
 ##  ]
-INSIDE_RE  = Regexp.union(  GOAL_OG_RE, GOAL_PEN_RE,
-                            BASICS_RE,
+INSIDE_RE  = Regexp.union(  SCORE_AT_RE,
+                            GOAL_OG_RE, GOAL_PEN_RE,
+                            BASICS_RE, HYPHEN_RE,
                             TEXT_STRICT_RE,
                             MINUTE_RE,
                          )
@@ -56,6 +83,11 @@ def log( msg )
 end
+## open/close pairs - lookup close (by open char)
+SYM_CLOSE = {
+  '(' => ')',
+  '[' => ']',
+}
 def tokenize_with_errors( line, debug: false )
   tokens = []
@@ -72,6 +104,7 @@ def tokenize_with_errors( line, debug: false )
   ####
   ## quick hack - keep re state/mode between tokenize calls!!!
   @re  ||= RE     ## note - switch between RE & INSIDE_RE
   while m = @re.match( line, pos )
@@ -84,10 +117,14 @@ def tokenize_with_errors( line, debug: false )
     if offsets[0] != pos
       ## match NOT starting at start/begin position!!!
       ##  report parse error!!!
-      msg =  "!! WARN - parse error - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
+      ctx = @re == INSIDE_RE ? 'INSIDE_RE' : 'RE'  ## assume RE
+      ## fix/change - use str.inspect to show tabs (\t)
+      ##          and possibly other special characters causing trouble
+      msg =  "  !! WARN - parse error (#{ctx}) - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
       puts msg
-      errors << "parse error - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]}"
+      errors << "parse error (#{ctx}) - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]}"
       log( msg )
     end
@@ -109,7 +146,9 @@ def tokenize_with_errors( line, debug: false )
              [:text, m[:text]]   ## keep pos - why? why not?
            elsif m[:minute]
              [:minute, m[:minute]]
-           elsif m[:og]
+           elsif m[:score_at]
+             [:score_at, m[:score_at]]
+          elsif m[:og]
              [:og, m[:og]]    ## for typed drop - string version/variants
            elsif m[:pen]
              [:pen, m[:pen]]
@@ -121,12 +160,21 @@ def tokenize_with_errors( line, debug: false )
              when ';' then [:';']
              when '@' then [:'@']
              when '|' then [:'|']
-             when '['
-               ## report error - already in inside mode!!!
+             when '-' then [:'-']
+             when '[', '('
+               if sym == @sym_open
+                 ## report error - already in inside mode!!!
+                 ##  e.g. another [ in [] or ( in ()
+                 log( "warn - unexpected (opening) #{sym} in inside (goal) mode in line >#{line}<" )
+               end
                nil
-             when ']'
-               puts "  leave inside match mode"
-               @re = RE
+             when ']', ')'   ## allow [] AND () for inside mode
+               ## puts "  leave inside match mode"
+               if sym == @sym_close
+                   @re = RE
+                   @sym_open  = nil  ## reset sym_open/close
+                   @sym_close = nil
+               end
                nil
              else
               nil  ## ignore others (e.g. brackets [])
@@ -176,12 +224,15 @@ def tokenize_with_errors( line, debug: false )
              when ';' then [:';']
              when '@' then [:'@']
              when '|' then [:'|']
-             when '['
+             when '[', '('
                ##  switch to inside mode!!!
-               puts "  enter inside match mode"
+               ## puts "  enter inside match mode"
                @re = INSIDE_RE
+               @sym_open  =  sym      ## record open/close style - why? why not?
+               @sym_close =  SYM_CLOSE[sym]
                nil
-             when ']'
+             when ']', ')'
+               log( "warn - unexpected (closing) #{sym} in standard mode in line >#{line}<" )
                ## already in standard mode/ctx
                ##  report warn/error - why? why not?
                nil
@@ -204,13 +255,21 @@ def tokenize_with_errors( line, debug: false )
     end
   end
   ## check if no match in end of string
   if offsets[1] != line.size
-    msg =  "!! WARN - parse error - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size} in line >#{line}<"
+    ## note - report regex context
+    ##  e.g.  RE or INSIDE_RE  to help debugging/troubleshooting format errors
+    ctx = @re == INSIDE_RE ? 'INSIDE_RE' : 'RE'  ## assume RE
+    ## fix/change - use str.inspect to show tabs (\t)
+    ##          and possibly other special characters causing trouble
+    msg =  "  !! WARN - parse error (#{ctx}) - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size} in line >#{line}<"
     puts msg
     log( msg )
-    errors << "parse error - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size}"
+    errors << "parse error (#{ctx}) - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size}"
   end

data/lib/rsssf/parser/version.rb ADDED Viewed

@@ -0,0 +1,24 @@
+module SportDb
+  module Module
+    module RsssfParser
+  MAJOR = 0    ## todo: namespace inside version or something - why? why not??
+  MINOR = 1
+  PATCH = 0
+  VERSION = [MAJOR,MINOR,PATCH].join('.')
+  def self.version
+    VERSION
+  end
+  def self.banner
+    "rsssf-parser/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}] in (#{root})"
+  end
+  def self.root
+    File.expand_path( File.dirname(File.dirname(File.dirname(File.dirname(__FILE__)))) )
+  end
+    end   # module RsssfParser
+  end
+end

data/lib/rsssf/parser.rb CHANGED Viewed

@@ -2,9 +2,17 @@
 ####
 ##  build on "standard" parse
 require 'sportdb/parser'
+## pulled in for/uses only
+##  -  SportDb::Parser::Tokens  !!!
+##
+##  plus in the future pull in SportDb::OutlineReader
+##
+##  note - pulls in more deps e.g. cocos AND season-formats
 ## our own code
+require_relative 'parser/version'
 require_relative 'parser/token-text'
 require_relative 'parser/token-note'
 require_relative 'parser/token-round'    ## round (& group)
@@ -19,3 +27,5 @@ require_relative 'parser/linter'
+# say hello
+puts SportDb::Module::RsssfParser.banner

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rsssf-parser
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.1.0
 platform: ruby
 authors:
 - Gerald Bauer
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2024-07-17 00:00:00.000000000 Z
+date: 2024-07-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: sportdb-parser
@@ -84,6 +84,7 @@ files:
 - lib/rsssf/parser/token-score.rb
 - lib/rsssf/parser/token-text.rb
 - lib/rsssf/parser/token.rb
+- lib/rsssf/parser/version.rb
 homepage: https://github.com/sportdb/sport.db
 licenses:
 - Public Domain