RubyGems - sportdb-parser - Versions diffs - 0.2.2 → 0.3.1 - Mend

sportdb-parser 0.2.2 → 0.3.1

Files changed (11) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +1 -1
data/Manifest.txt +3 -0
data/bin/fbtok +67 -0
data/lib/sportdb/parser/lang.rb +3 -0
data/lib/sportdb/parser/linter.rb +156 -0
data/lib/sportdb/parser/outline_reader.rb +97 -0
data/lib/sportdb/parser/token-text.rb +50 -40
data/lib/sportdb/parser/version.rb +2 -2
data/lib/sportdb/parser.rb +5 -0
metadata +7 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 0c9225b21f400b9f9cced2052c3062f41a091ed81d3d4239164c9652f53ebc6e
-  data.tar.gz: f7250eaa21324962df27e7cdd397857afa570c610f00c80c31e5105e40964002
+  metadata.gz: 3721d30a9ec1145f59d6bc84bb9a6cf81330fdafa00314decc2c165f2c6b92c1
+  data.tar.gz: a50b917e0bc5db3ac21cb4ead5507233cbd91f4b6f3b52668ef74ba8c6db6140
 SHA512:
-  metadata.gz: 471c938c233d8f81d7a0fd5e4470a27a52486906764816b6c35ea3d88e19650c81302fd5ff9ee30b85d3a8e9f81ada8eef20b49bd3de924c7238acb106ba6082
-  data.tar.gz: 24d1cf3846404859ad7e751895325b256321d43e2881413fda6325c744ca0c31b52ef2032a9dfc8e56e67d7a06df54a6d2780a297982440b8e40b7055fe06c26
+  metadata.gz: 3e286842bcb5c163d841d414a15d4cbd91359324b88106603e22ebbec19b4cb8ffff3119d92a5c9e302e6aeabbc8a7ba74975efebee11ba275ce1d49f0286c1f
+  data.tar.gz: f237aedda025eb35bb08b31345941a3f1f073cbf8c3495c1afcf6e45de73f067e47a8a4c55981d5881079a5ca45a04af427908f379555208c98851c5d4751aa1

data/CHANGELOG.md CHANGED Viewed

@@ -1,4 +1,4 @@
-### 0.2.2
+### 0.3.1
 ### 0.0.1 / 2024-07-12

data/Manifest.txt CHANGED Viewed

@@ -2,8 +2,11 @@ CHANGELOG.md
 Manifest.txt
 README.md
 Rakefile
+bin/fbtok
 lib/sportdb/parser.rb
 lib/sportdb/parser/lang.rb
+lib/sportdb/parser/linter.rb
+lib/sportdb/parser/outline_reader.rb
 lib/sportdb/parser/parser.rb
 lib/sportdb/parser/token-date.rb
 lib/sportdb/parser/token-score.rb

data/bin/fbtok ADDED Viewed

@@ -0,0 +1,67 @@
+#!/usr/bin/env ruby
+## fbtok - lint (tokenize or parse) the datafiles passed on the command line and print a summary; tip: to test run:
+##   ruby -I ./lib bin/fbtok
+require 'sportdb/parser'
+require 'optparse'   ## check - already auto-required in cocos? keep? why? why not?
+args=ARGV   ## note: same object as ARGV (no copy) - parse! below removes the option flags from it in place
+opts = {
+    debug: true,   ## NOTE(review): default is already true, so --verbose / --debug changes nothing - confirm intended
+    metal: false,  ## false => tokenize & parse;  true (--metal) => tokenize only
+}
+parser = OptionParser.new do |parser|
+  parser.banner = "Usage: #{$PROGRAM_NAME} [options]"
+  parser.on( "--verbose", "--debug",
+               "turn on verbose / debug output (default: #{opts[:debug]})" ) do |debug|
+    opts[:debug] = true   ## block argument (debug) is not used - flag presence is all that counts
+  end
+  parser.on( "--metal",
+                 "turn off typed parse tree; show to the metal tokens"+
+                   " (default: #{opts[:metal]})" ) do |metal|
+    opts[:metal] = true   ## block argument (metal) is not used - flag presence is all that counts
+  end
+end
+parser.parse!( args )   ## after this call args holds the remaining (non-option) arguments only
+puts "OPTS:"
+p opts
+puts "ARGV:"
+p args
+SportDb::Parser::Linter.debug = true    if opts[:debug]   ## class-level flag (set before the linter gets created)
+linter = SportDb::Parser::Linter.new   ## one linter instance gets (re)used for all files
+errors = []   ## errors collected across all files
+paths = args   ## every remaining argument is treated as a datafile path
+paths.each_with_index do |path,i|
+    puts "==> [#{i+1}/#{paths.size}] reading >#{path}<..."
+    linter.read( path, parse: !opts[:metal] )
+    errors += linter.errors    if linter.errors?   ## linter.read resets its own error list on every call, so copy now
+end
+if errors.size > 0   ## note: report only - no explicit exit status gets set in this script
+    puts
+    pp errors
+    puts
+    puts "!!   #{errors.size} parse error(s) in #{paths.size} datafiles(s)"   ## NOTE(review): "datafiles(s)" looks like a typo for "datafile(s)" (see the OK message below)
+else
+    puts
+    puts "OK   no parse errors found in #{paths.size} datafile(s)"
+end
+puts "bye"

data/lib/sportdb/parser/lang.rb CHANGED Viewed

@@ -64,6 +64,9 @@ ROUND_RE = %r{^(
              [ ] Round
        )
        |
+  ## Playoff Round 1
+     (?:  Play-?off [ ] Round [ ] [1-9][0-9]* )
+       |
   ## starting with preliminary
   #   e.g.  Preliminary round
      (?:  Preliminary  [ ]

data/lib/sportdb/parser/linter.rb ADDED Viewed

@@ -0,0 +1,156 @@
+module SportDb
+class Parser
+###  Linter - reads a datafile via OutlineReader, runs every text line through the parser (tokenize only or tokenize & parse) and collects the error messages
+## note - Linter for now nested inside Parser - keep? why? why not?
+class Linter
+def self.debug=(value) @@debug = value; end   ## note: class variable - one flag for the Linter class (not per instance)
+def self.debug?() @@debug ||= false; end  ## note: default is FALSE
+def debug?()  self.class.debug?; end   ## instance-level shortcut for the class-level flag
+attr_reader :errors   ## list of [path, msg, line] entries recorded by the latest read call
+def initialize
+  @errors = []
+  @parser = Parser.new   ## use own parser instance (not shared) - why? why not?
+end
+def errors?() @errors.size > 0; end   ## true if the errors list is not empty
+  ## note:  colon (:) MUST be followed by one (or more) spaces
+  ##      make sure mon feb 12 18:10 will not match
+  ##        allow 1. FC Köln etc.
+  ##               Mainz 05:
+  ##           limit key to 31 chars max (one start char plus up to 30 more)
+  ##          only allow  chars incl. intl but (NOT ()[]/;)
+  ##
+  ##   Group A:
+  ##   Group B:   - remove colon
+  ##    or lookup first
+  ATTRIB_RE = %r{^
+                   [ ]*?     # slurp leading spaces
+                (?<key>[^:|\]\[()\/; -]
+                       [^:|\]\[()\/;]{0,30}
+                 )
+                   [ ]*?     # slurp trailing spaces
+                   :[ ]+
+                (?<value>.+)
+                    [ ]*?   # slurp trailing spaces
+                   $
+                }ix
+## read - lint the datafile at path; errors found get recorded in @errors (see errors / errors?)
+## parse - false (default) - tokenize (only)
+##       - true            - tokenize & parse
+def read( path, parse: false )
+  ## note: every (new) read call - resets errors list to empty
+  @errors = []
+  nodes = OutlineReader.read( path )   ## array of [type, data] nodes
+  ##  process nodes
+  h1 = nil   ## text of latest level 1 heading (nil until the first one shows up)
+  h2 = nil   ## NOTE(review): assigned below but never read
+  orphans = 0    ## track paragraphs with no heading
+  attrib_found = false   ## note: defined outside the loops - state carries over from line to line and paragraph to paragraph
+  nodes.each do |node|
+    type = node[0]
+    if type == :h1
+        h1 = node[1]  ## get heading text
+        puts "  = Heading 1 >#{node[1]}<"
+    elsif type == :h2
+        if h1.nil?
+          puts "!! WARN - no heading for subheading; skipping parse"
+          next
+        end
+        h2 = node[1]  ## get heading text
+        puts "  == Heading 2 >#{node[1]}<"
+    elsif type == :p
+       if h1.nil?
+         orphans += 1    ## NOTE(review): was marked "only warn once" but the warning below prints for every orphan paragraph
+         puts "!! WARN - no heading for #{orphans} text paragraph(s); skipping parse"
+         next
+       end
+       lines = node[1]   ## array of text lines
+       tree = []   ## collects the per-line results; only used by the commented-out pp below
+       lines.each_with_index do |line,i|   ## note: index (i) is not used
+        if debug?
+         puts
+         puts "line >#{line}<"
+        end
+        ## skip new (experimental) attrib syntax - line is NOT passed on to the parser
+        if attrib_found == false &&
+            ATTRIB_RE.match?( line )
+          ## note: check attrib regex AFTER group def e.g.:
+          ##         Group A:
+          ##         Group B:  etc.
+          ##     todo/fix - change Group A: to Group A etc.
+          ##                       Group B: to Group B
+           attrib_found = true
+           ## logger.debug "skipping key/value line - >#{line}<"
+           next
+        end
+        if attrib_found
+          ## check if line ends with dot
+          ##  if not slurp up lines to the next dot!!!
+          ## logger.debug "skipping key/value line - >#{line}<"
+          attrib_found = false   if line.end_with?( '.' )
+              # logger.debug "skipping key/value line (cont.) - >#{line}<"
+              next   ## note: always runs (NOT part of the modifier-if above) - the line ending with a dot gets skipped too
+        end
+        t, error_messages  =  if parse   ## result gets destructured into (tokens or tree) plus error messages
+                                  @parser.parse_with_errors( line )
+                              else
+                                  @parser.tokenize_with_errors( line )
+                              end
+         if error_messages.size > 0
+            ## add to "global" error list
+            ##   make a triplet tuple (file / msg / line text)
+            error_messages.each do |msg|
+                @errors << [ path,
+                             msg,
+                             line
+                           ]
+            end
+         end
+         pp t   if debug?
+         tree << t
+       end
+       ## pp tree
+    else   ## note: OutlineReader emits :h<n> for any heading level - :h3 and deeper end up here
+        pp node
+        raise ArgumentError, "unsupported (node) type >#{type}<"
+    end
+  end  # each node
+end  # read
+end  # class Linter
+end   # class Parser
+end   # module SportDb

data/lib/sportdb/parser/outline_reader.rb ADDED Viewed

@@ -0,0 +1,97 @@
+module SportDb
+class OutlineReader   ## splits text into an outline - an array of heading and paragraph nodes
+  def self.debug=(value) @@debug = value; end   ## note: class variable - one flag for the OutlineReader class (not per instance)
+  def self.debug?() @@debug ||= false; end   ## note: default is false
+  def debug?()  self.class.debug?; end   ## instance-level shortcut for the class-level flag
+  def self.read( path )   ## use - rename to read_file or from_file etc. - why? why not?
+    txt = File.open( path, 'r:utf-8' ) {|f| f.read }   ## read the whole file as utf-8
+    parse( txt )
+  end
+  def self.parse( txt )   ## convenience shortcut for new( txt ).parse
+    new( txt ).parse
+  end
+  def initialize( txt )
+    @txt = txt   ## text to split up; see parse
+  end
+  ## note: skip "decorative" only heading e.g. ========
+  ##  todo/check:  find a better name e.g. HEADING_EMPTY_RE or HEADING_LINE_RE or ???
+  HEADING_BLANK_RE = %r{\A
+                        ={1,}
+                        \z}x
+  ## note: like in wikimedia markup (and markdown) allow optional trailing ==== too
+  HEADING_RE = %r{\A
+                  (?<marker>={1,})       ## 1. leading ======
+                    [ ]*
+                  (?<text>[^=]+)         ## 2. text   (note: for now no "inline" = allowed)
+                    [ ]*
+                    =*                   ## 3. (optional) trailing ====
+                  \z}x
+  def parse   ## returns an array of nodes - [:h<level>, text] for headings or [:p, [line, ...]] for paragraphs
+    outline=[]   ## outline structure
+    start_para = true      ## start new para(graph) on new text line?
+    @txt.each_line do |line|
+        line = line.strip      ## todo/fix: keep leading and trailing spaces - why? why not?
+        if line.empty?    ## todo/fix: keep blank line nodes?? and just remove comments and process headings?! - why? why not?
+          start_para = true   ## blank line ends the current paragraph
+          next
+        end
+        break if line == '__END__'   ## stop at end marker - the rest of the text gets ignored
+        next if line.start_with?( '#' )   ## skip comments too
+        ## strip inline (until end-of-line) comments too
+        ##  e.g Eupen | KAS Eupen ## [de]
+        ##   => Eupen | KAS Eupen
+        ##  e.g bq   Bonaire,  BOE        # CONCACAF
+        ##   => bq   Bonaire,  BOE
+        line = line.sub( /#.*/, '' ).strip
+        pp line    if debug?
+        ## todo/check: also use heading blank as paragraph "breaker" or treat it like a comment ?? - why? why not?
+        next if HEADING_BLANK_RE.match( line )  # skip "decorative" only heading e.g. ========
+         ## note: like in wikimedia markup (and markdown) allow optional trailing ==== too
+        if m=HEADING_RE.match( line )
+           start_para = true   ## heading ends the current paragraph too
+           heading_marker = m[:marker]
+           heading_level  = heading_marker.length   ## count number of = for heading level
+           heading        = m[:text].strip
+           puts "heading #{heading_level} >#{heading}<"   if debug?
+           outline << [:"h#{heading_level}", heading]
+        else    ## assume it's a (plain/regular) text line
+           if start_para
+             outline << [:p, [line]]   ## open a new paragraph node with this line as first entry
+             start_para = false
+           else
+             node = outline[-1]    ## get last entry
+             if node[0] == :p      ##  assert it's a p(aragraph) node!!!
+                node[1] << line    ## add line to p(aragraph)
+             else
+               puts "!! ERROR - invalid outline state / format - expected p(aragraph) node; got:"
+               pp node
+               exit 1   ## NOTE(review): terminates the whole process from inside library code
+             end
+           end
+        end
+    end
+    outline
+  end # method parse
+end # class OutlineReader
+end # module SportDb

data/lib/sportdb/parser/token-text.rb CHANGED Viewed

@@ -1,12 +1,12 @@
-module SportDb
+module SportDb
 class Parser
 ##  note - do NOT allow single alpha text for now
-##   add later??      A - B    C - D  - why?
+##   add later??      A - B    C - D  - why?
 ## opt 1) one alpha
-## (?<text_i> [a-z])    # only allow single letter text (not numbers!!)
+## (?<text_i> [a-z])    # only allow single letter text (not numbers!!)
 ## opt 2) more than one alphanum
@@ -26,19 +26,19 @@ class Parser
 TEXT_RE = %r{
-    ## must start with alpha (allow unicode letters!!)
-    (?<text>
-           ## positive lookbehind
+    ## must start with alpha (allow unicode letters!!)
+    (?<text>
+           ## positive lookbehind
            ##  (MUST be fixed number of chars - no quantifier e.g. +? etc.)
             (?<=[ ,;@|\[\]]
                  |^
             )
-            (?:
+            (?:
                 # opt 1 - start with alpha
                  \p{L}+    ## all unicode letters (e.g. [a-z])
                    |
-                # opt 2 - start with num!! - allow special case (e.g. 1. FC)
+                # opt 2 - start with num!! - allow special case (e.g. 1. FC)
                      \d+  # check for num lookahead (MUST be space or dot)
                       ## MUST be followed by (optional dot) and
                       ##                      required space !!!
@@ -46,69 +46,79 @@ TEXT_RE = %r{
                       \.?     ## optional dot
                       [ ]?   ## make space optional too  - why? why not?
                              ##  yes - eg. 1st, 2nd, 5th etc.
-                       \p{L}+
+                       \p{L}+
                )
               (?:(?:  (?:[ ]
                      (?!vs?\.?[ ])    ## note - exclude (v[ ]/vs[ ]/v.[ ]/vs.[ ])
-                       )
+                       )
                       |     # only single spaces allowed inline!!!
-                     [-]
+                     [-]
                   )?
                 (?:
                   \p{L} |
-                  [&/']
+                  [&/']
                     |
                  (?:
-                   \d+
-                   (?![0-9.:h'/+-])
+                   \d+
+                   (?![0-9.:h'/+-])
                    ## negative lookahead for numbers
                    ##   note - include digits itself!!!
-                 )|
-                 \.
-               )
+                 )|
+                 \.
+               )
               )*  ## must NOT end with space or dash(-)
               ##  todo/fix - possible in regex here
               ##     only end in alphanum a-z0-9 (not dot or & ???)
             ## allow optional at the end
             ##  tag or year
-            ##   make it and in the future - why? why not?
-            ##
+            ##   make it and in the future - why? why not?
+            ##
+            ## change - fix
+            ##   do NOT use (A) for amateur
+            ##   use A or A. with NO ()!!!
             ## (A) -    allow with predefined  alpha only for now
             ##          e.g. (A) - amateur a team or b?
+            ###  same for U21 or U9 etc
+            ##        use with NO ()!!! - why? why not?
             ##      or U21 U9 etc.   - why? why not?
             ##       or etc.
             ## (1879-1893) or allow years e.g. (1879-1893)
-            ###
-            (?:
-               [ ]
-                  \( (?:
-                       A|B|
-                       U\d{1,2}
-                     )
-                  \)
-            )?
+            ###
+            ##    add allow country code three to five letters for now
+            ##       change to generic 1 to 5 - why? why not?
+            ##     e.g. (A), (I),
+            ##          (AUT)
+            ##          (TRNC)   five? for UEFA code for northern cyprus
+            ##     change to 1 to 4 - why? why not?
+            ##   check - fix possible for upper case only here
+            ##                     inline for this group only?
             (?:
-               [ ]
+               [ ]
                \(
                   \d{4}-\d{4}
                \)
-            )?
+            )?
+             (?:
+               [ ]+   ## allow more than once space - why? why not?
+                  \( (?:
+                       [A-Z]{1,5}
+                     )
+                  \)
+             )?
             ## add lookahead/lookbehind
-           ##    must be space!!!
+           ##    must be space!!!
            ##   (or comma or  start/end of string)
            ##   kind of \b !!!
             ## positive lookahead
             (?=[ ,;@|\[\]]
                  |$
             )
-   )
+   )
 }ix
 end # class Parser
-end # module SportDb
+end # module SportDb

data/lib/sportdb/parser/version.rb CHANGED Viewed

@@ -3,8 +3,8 @@ module SportDb
   module Module
     module Parser
   MAJOR = 0    ## todo: namespace inside version or something - why? why not??
-  MINOR = 2
-  PATCH = 2
+  MINOR = 3
+  PATCH = 1
   VERSION = [MAJOR,MINOR,PATCH].join('.')
   def self.version

data/lib/sportdb/parser.rb CHANGED Viewed

@@ -24,6 +24,11 @@ require_relative 'parser/lang'
 require_relative 'parser/parser'
+####
+##  todo/check - move outline reader upstream to cocos - why? why not?
+##       use  read_outline(), parse_outline()  - why? why not?
+require_relative 'parser/outline_reader'
+require_relative 'parser/linter'
 ###
 #  make parser api (easily) available - why? why not?

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: sportdb-parser
 version: !ruby/object:Gem::Version
-  version: 0.2.2
+  version: 0.3.1
 platform: ruby
 authors:
 - Gerald Bauer
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2024-08-27 00:00:00.000000000 Z
+date: 2024-09-18 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: cocos
@@ -74,7 +74,8 @@ dependencies:
         version: '4.1'
 description: sportdb-parser - football.txt match parser (& tokenizer)
 email: gerald.bauer@gmail.com
-executables: []
+executables:
+- fbtok
 extensions: []
 extra_rdoc_files:
 - CHANGELOG.md
@@ -85,8 +86,11 @@ files:
 - Manifest.txt
 - README.md
 - Rakefile
+- bin/fbtok
 - lib/sportdb/parser.rb
 - lib/sportdb/parser/lang.rb
+- lib/sportdb/parser/linter.rb
+- lib/sportdb/parser/outline_reader.rb
 - lib/sportdb/parser/parser.rb
 - lib/sportdb/parser/token-date.rb
 - lib/sportdb/parser/token-score.rb