RubyGems - sportdb-parser - Versions diffs - 0.0.1 - Mend

sportdb-parser 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

checksums.yaml +7 -0
data/CHANGELOG.md +3 -0
data/Manifest.txt +14 -0
data/README.md +8 -0
data/Rakefile +27 -0
data/bin/fbt +144 -0
data/lib/sportdb/parser/lang.rb +111 -0
data/lib/sportdb/parser/linter.rb +153 -0
data/lib/sportdb/parser/outline_reader.rb +101 -0
data/lib/sportdb/parser/parser.rb +196 -0
data/lib/sportdb/parser/token-date.rb +193 -0
data/lib/sportdb/parser/token-score.rb +121 -0
data/lib/sportdb/parser/token-text.rb +114 -0
data/lib/sportdb/parser/token.rb +364 -0
data/lib/sportdb/parser.rb +44 -0
metadata +96 -0

data/lib/sportdb/parser/parser.rb ADDED Viewed

@@ -0,0 +1,196 @@
module SportDb
class Parser

## transforms
##
##  Netherlands  1-2 (1-1)   England
##   =>  text => team
##       score|vs
##       text => team


## Cursor over an array of typed tokens.
##
##  A token is an array - the first item is the token type
##  (a symbol e.g. :text, :score, :vs) and the second item
##  (if present) is the token content.
##
## token iter/find better name
##  e.g. TokenBuffer/Scanner or such ??
class Tokens

    ## index of the current (that is, next unread) token
    attr_reader :pos

    def initialize( tokens )
        @tokens = tokens
        @pos    = 0
    end

    ## end of stream?  true once all tokens are consumed
    def eos?() @pos >= @tokens.size; end

    ## true if any token from the current position onwards
    ##   has one of the passed-in types
    def include?( *types )
        (@pos...@tokens.size).any? do |idx|
            types.include?( @tokens[idx][0] )
        end
    end

    ## pattern e.g. [:text, [:vs,:score], :text]
    ##   every entry is a single type or an array of alternative types;
    ##   entry no. n is checked against the token n positions ahead.
    ##   note - running past the end never matches (peek returns nil)
    def match?( *pattern )
        pattern.each_with_index.all? do |types,offset|
            ## if single symbol wrap in array
            types = [types]   unless types.is_a?( Array )
            types.include?( peek( offset ) )
        end
    end

    ## return token type  (e.g. :text, :num, etc.)
    def cur()  peek(0); end

    ## return content (assumed to be text)
    ##   raises ArgumentError if there is no token at offset
    ##   or if the token is not of type :text
    def text(offset=0)
        if peek( offset ) != :text
            raise ArgumentError, "text(#{offset}) - token not a text type"
        end
        token_at( offset )[1]
    end

    ## return the type of the token offset positions away
    ##   (default: the token after the current one);
    ##   returns nil if there is no token at that position
    def peek(offset=1)
        token = token_at( offset )
        token ? token[0] : nil
    end

    ## note - returns complete token
    ##   returns nil at the end of the stream (and leaves pos untouched,
    ##   so that pos never runs past the number of tokens)
    def next
        return nil   if eos?

        token = @tokens[@pos]
        @pos += 1
        token
    end

    ## consume all remaining tokens and return them in an array;
    ##   if a block is given every token is passed through the block first
    def collect( &blk )
        collected = []
        until eos?
            token = self.next
            collected << (blk ? blk.call( token ) : token)
        end
        collected
    end

    private

    ## complete token offset positions away from the current one
    ##   or nil if out of range.  note - negative positions are
    ##   checked too; a plain array lookup would wrap around
    ##   and return a token counted from the end
    def token_at( offset )
        idx = @pos + offset
        return nil   if idx < 0 || idx >= @tokens.size
        @tokens[idx]
    end
end  # class Tokens


##
##
##  add !!!!
##   collect_until e.g. collect_until( :text )

## Tokenize a line and transform the tokens into (parse tree/ast) nodes.
##
##   line  - the text to parse; handed as-is to tokenize_with_errors
##   debug - accepted for call compatibility; not used in this method
##
##  returns a two-item array [nodes, errors] where errors holds
##   whatever tokenize_with_errors reported.
##   note - a line without any tokens results in no nodes (empty array)
def parse_with_errors( line, debug: false )
    errors = []
    tokens, token_errors = tokenize_with_errors( line, typed: true )
    errors += token_errors

    #############
    ## pass 1
    ##   replace all texts with keyword matches (e.g. group, round, leg, etc.)
    tokens = tokens.map do |t|
        next t   unless t[0] == :text

        text = t[1]
        if    is_group?( text )  then [:group, text]
        elsif is_leg?( text )    then [:leg,   text]
        elsif is_round?( text )  then [:round, text]
        else                          t    ## pass through as-is (1:1)
        end
    end

    ## pass 2
    ##   transform tokens into (parse tree/ast) nodes
    nodes = []
    buf   = Tokens.new( tokens )

    ## note - check for end of stream first;
    ##   otherwise an empty line adds a nil node
    until buf.eos?
        if buf.pos == 0
            ## check for
            ##    group def or round def
            if buf.match?( :round, :'|' )    ## assume round def (change round to round_def)
                nodes << [:round_def, buf.next[1]]
                buf.next    ## swallow pipe
                nodes += buf.collect
                break
            end

            if buf.match?( :group, :'|' )    ## assume group def (change group to group_def)
                nodes << [:group_def, buf.next[1]]
                buf.next    ## swallow pipe
                ## change all text to team
                nodes += buf.collect { |t| t[0] == :text ? [:team, t[1]] : t }
                break
            end
        end

        if buf.match?( :text, [:score, :vs], :text )
            nodes << [:team, buf.next[1]]
            nodes << buf.next
            nodes << [:team, buf.next[1]]
        elsif buf.match?( :text, :minute )
            nodes << [:player, buf.next[1]]
            nodes << buf.next
        elsif buf.cur == :'@'
            ## add all to the end as is
            ##   only change text to geo
            nodes += buf.collect { |t| t[0] == :text ? [:geo, t[1]] : t }
            break
        else
            ## pass through
            nodes << buf.next
        end
    end

    [nodes, errors]
end


### convenience helper - ignore errors by default
##    returns the nodes only (see parse_with_errors)
def parse( line, debug: false )
  nodes, _errors = parse_with_errors( line, debug: debug )
  nodes
end

end #  class Parser
end  # module SportDb

data/lib/sportdb/parser/token-date.rb ADDED Viewed

@@ -0,0 +1,193 @@
module SportDb
class Parser

## Parse a name table (one entry per line; names separated by
##  spaces or tabs) into an array of lines (with words) e.g.
##
##    January  Jan      =>  [["January", "Jan"],
##    May                    ["May"]]
##
##  blank lines and comments (from # until end-of-line) are skipped
##   e.g. Janvier  Janv  Jan  ## check janv in use??
##   =>   Janvier  Janv  Jan
##
## todo/fix -- add check for duplicates
def self.parse_names( txt )
  txt.each_line
     .map    { |line| line.sub( /#.*/, '' ).strip }
     .reject { |line| line.empty? }
     .map    { |line| line.split( /[ \t]+/ ) }
end # method parse_names


## join all words together into a single string e.g.
##   January|Jan|February|Feb|March|Mar|April|Apr|May|June|Jun|...
##  note - the string gets spliced as-is into the regexes below
def self.build_names( lines )
  lines.map { |line| line.join('|') }.join('|')
end


## build a lookup map that maps the word to the index (line no) plus 1 e.g.
##  {"january" => 1,  "jan" => 1,
##   "february" => 2, "feb" => 2,
##   "march" => 3,    "mar" => 3,
##   "april" => 4,    "apr" => 4,
##   "may" => 5,
##   "june" => 6,     "jun" => 6, ...
##
## note: downcase name!!!
## add normalize option (for downcase) - why? why not?
def self.build_map( lines )
  lines.each_with_index.each_with_object( {} ) do |(line,i),h|
    ## note: start mapping with 1 (and NOT zero-based, that is, 0)
    line.each { |name| h[ name.downcase ] = i+1 }
  end
end



MONTH_LINES = parse_names( <<TXT )
January    Jan
February   Feb
March      Mar
April      Apr
May
June       Jun
July       Jul
August     Aug
September  Sept  Sep
October    Oct
November   Nov
December   Dec
TXT

MONTH_NAMES = build_names( MONTH_LINES )
# pp MONTH_NAMES
MONTH_MAP   = build_map( MONTH_LINES )
# pp MONTH_MAP


DAY_LINES = parse_names( <<TXT )
Monday                   Mon  Mo
Tuesday            Tues  Tue  Tu
Wednesday                Wed  We
Thursday    Thurs  Thur  Thu  Th
Friday                   Fri  Fr
Saturday                 Sat  Sa
Sunday                   Sun  Su
TXT

DAY_NAMES = build_names( DAY_LINES )
# pp DAY_NAMES
DAY_MAP   = build_map( DAY_LINES )
# pp DAY_MAP


#=>
# "January|Jan|February|Feb|March|Mar|April|Apr|May|June|Jun|
#  July|Jul|August|Aug|September|Sept|Sep|October|Oct|
#  November|Nov|December|Dec"
#
# "Monday|Mon|Mo|Tuesday|Tues|Tue|Tu|Wednesday|Wed|We|
#  Thursday|Thurs|Thur|Thu|Th|Friday|Fri|Fr|
#  Saturday|Sat|Sa|Sunday|Sun|Su"
#
#  note - inside an entry the longer names go first
#   (e.g. Sept before Sep) so that the regex alternation
#   tries the longest spelling first


## todo - add more date variants !!!!

# e.g. Fri Aug/9  or Fri Aug 9
#   named captures: date, day_name, month_name, day, year
#   note - the groups used for grouping only are
#          marked as non-capturing (?: ) to make it explicit
DATE_RE = %r{
(?<date>
  \b
     ## optional day name
     (?: (?<day_name>#{DAY_NAMES})
          [ ]
     )?
     (?<month_name>#{MONTH_NAMES})
         (?: \/|[ ] )
     (?<day>\d{1,2})
     ## optional year
     (?: [ ]
        (?<year>\d{4})
     )?
  \b
)}ix


###
#  date duration
#   use - or + as separator
#    in theory plus( +) only if dates
#     are two days next to each other
#
#   otherwise  define new dates type in the future? why? why not?
#
#  check for plus (+) if dates are next to each other (t+1) - why? why not?
#
#  Sun Jun/23 - Wed Jun/26   -- YES
#  Jun/23 - Jun/26           -- YES
#  Tue Jun/25 + Wed Jun/26   -- YES
#  Jun/25 + Jun/26           -- YES
#
#  Jun/25 - 26        - why? why not???
#  Jun/25 .. 26        - why? why not???
#  Jun/25 to 26        - why? why not???
#  Jun/25 + 26        - add - why? why not???
#  Sun-Wed Jun/23-26  -  add - why? why not???
#  Wed+Thu Jun/26+27 2024  -  add - why? why not???
#
#  maybe use comma and plus for list of dates
#    Tue Jun/25, Wed Jun/26, Thu Jun/27  ??
#    Tue Jun/25 + Wed Jun/26 + Thu Jun/27  ??
#
#   add back optional comma (before) year - why? why not?
#
#   named captures: duration,
#                   day_name1, month_name1, day1, year1,
#                   day_name2, month_name2, day2, year2
DURATION_RE =  %r{
(?<duration>
    \b
   ## optional day name
   (?: (?<day_name1>#{DAY_NAMES})
      [ ]
   )?
   (?<month_name1>#{MONTH_NAMES})
      (?: \/|[ ] )
   (?<day1>\d{1,2})
   ## optional year
   (?: [ ]
      (?<year1>\d{4})
   )?

   ## support + and -  (add .. or such - why??)
   [ ]*[+-][ ]*

   ## optional day name
   (?: (?<day_name2>#{DAY_NAMES})
      [ ]
   )?
   (?<month_name2>#{MONTH_NAMES})
      (?: \/|[ ] )
   (?<day2>\d{1,2})
   ## optional year
   (?: [ ]
      (?<year2>\d{4})
   )?
   \b
)}ix

end  #   class Parser
end  # module SportDb

data/lib/sportdb/parser/token-score.rb ADDED Viewed

@@ -0,0 +1,121 @@
module SportDb
class Parser

    ## todo/check: use ‹› (unicode chars) to mark optional parts in regex constant name - why? why not?

    #####
    #  english helpers (penalty, extra time, ...)
    #    both are regex source snippets (strings) that get
    #    spliced into the extended-mode score regexes below
    #
    ##   note - p must go last (shortest match)
    #     pso = penalty shootout
    P_EN  =  '(?: pso | pen\.? | p\.? )'     # e.g. p., p, pen, pen., PSO, etc.
    ET_EN =  '(?: aet | a\.e\.t\.? )'     # note: make last . optional (e.g a.e.t) allowed too


    ##  penalty (optional) + extra time - no full time / half time
    ##
    ##  note: allow SPECIAL cases WITHOUT full time scores (just a.e.t or pen. + a.e.t.)
    ##      3-4 pen. 2-2 a.e.t.
    ##      3-4 pen.   2-2 a.e.t.
    ##               2-2 a.e.t.
    ##
    ##  todo/check:  remove lookahead assertion here - why require space?
    ##  note: \b works only after non-alphanum e.g. )
    SCORE__P_ET__RE = %r{
        (?<score>
           \b
            (?:
               (?<p1>\d{1,2}) - (?<p2>\d{1,2})
                 [ ]* #{P_EN} [ ]+
             )?             # note: make penalty (P) score optional for now
            (?<et1>\d{1,2}) - (?<et2>\d{1,2})
               [ ]* #{ET_EN}
               (?=[ \]]|$)
        )}ix


    ##  penalty (optional) + extra time + full time (+ optional half time)
    ##
    ## e.g. 3-4 pen. 2-2 a.e.t. (1-1, 1-1)  or
    ##      3-4p 2-2aet (1-1, )     or
    ##      3-4 pen.  2-2 a.e.t. (1-1)       or
    ##               2-2 a.e.t. (1-1, 1-1)  or
    ##               2-2 a.e.t. (1-1, )     or
    ##               2-2 a.e.t. (1-1)
    ##
    ##  todo/check:  remove lookahead assertion here - why require space?
    ##  note: \b works only after non-alphanum e.g. )
    SCORE__P_ET_FT_HT__RE = %r{
          (?<score>
               \b
               (?:
                (?<p1>\d{1,2}) - (?<p2>\d{1,2})
                   [ ]* #{P_EN} [ ]+
                )?            # note: make penalty (P) score optional for now
               (?<et1>\d{1,2}) - (?<et2>\d{1,2})
                   [ ]* #{ET_EN} [ ]+
                   \(
                   [ ]*
              (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
                   [ ]*
                (?:
                     , [ ]*
                    (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
                        [ ]*
                    )?
                )?              # note: make half time (HT) score optional for now
              \)
             (?=[ \]]|$)
            )}ix


    ###
    ##   special case for case WITHOUT extra time!!
    ##     same as above (but WITHOUT extra time and pen required)
    ##
    ##  todo/check:  remove lookahead assertion here - why require space?
    ##  note: \b works only after non-alphanum e.g. )
    SCORE__P_FT_HT__RE = %r{
             (?<score>
                \b
     (?<p1>\d{1,2}) - (?<p2>\d{1,2})
        [ ]* #{P_EN} [ ]+
        \(
        [ ]*
      (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
        [ ]*
     (?:
          , [ ]*
         (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
             [ ]*
         )?
     )?              # note: make half time (HT) score optional for now
   \)
  (?=[ \]]|$)
    )}ix


    ##  full time (+ optional half time)
    ##
    ## e.g. 2-1 (1-1) or
    ##      2-1
    ##
    ##  todo/check:  remove lookahead assertion here - why require space?
    ##  note: \b works only after non-alphanum e.g. )
    SCORE__FT_HT__RE = %r{
            (?<score>
              \b
              (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
               (?:
                   [ ]+ \( [ ]*
                (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
                   [ ]* \)
               )?   # note: make half time (HT) score optional for now
             (?=[ \]]|$)
             )}ix


#############################################
# map tables
#  note: order matters; first come-first matched/served
#         (the most specific score formats go first)
SCORE_RE = Regexp.union( [
  SCORE__P_ET_FT_HT__RE,  # e.g. 5-1 pen. 2-2 a.e.t. (1-1, 1-0)
  SCORE__P_FT_HT__RE,     # e.g. 5-1 pen. (1-1)
  SCORE__P_ET__RE,        # e.g. 2-2 a.e.t.  or  5-1 pen. 2-2 a.e.t.
  SCORE__FT_HT__RE,       # e.g. 1-1 (1-0)
] )

end  #  class Parser
end  # module SportDb

data/lib/sportdb/parser/token-text.rb ADDED Viewed

@@ -0,0 +1,114 @@
module SportDb
class Parser

##  TEXT_RE - match a (free-standing) text run e.g. a team or player name
##
##   the match must be preceded and followed by one of
##      space , ; @ | [ ]    or the start / end of the line
##
##  start of text
##   opt 1) one or more (unicode) letters
##   opt 2) a number followed by letters
##           (with an optional dot and an optional space in-between) e.g.
##      1 FC
##      1. FC
##      1890 Munich
##      1st, 2nd, 5th
##
##   NOTE(review): earlier notes here said that single alpha text
##     (e.g.  A - B    C - D) and 1FC / 1.FC are NOT allowed (parse error);
##     as written opt 1 accepts a single letter and opt 2 makes both
##     the dot and the space optional - confirm which one is intended
##   allow 1-FC   - why? why not?
##
##  rest of text
##    letters, & / ' and dots plus numbers
##     (numbers only if NOT followed by a digit or one of . : h ' / + -)
##    with an optional single space or dash in-between
##     (note - a space followed by v / vs / v. / vs. plus space is excluded)
##
##    allow Cote'd Ivoir or such
##     e.g. add '
##
##  optional at the end
##    a tag e.g. (A), (B), (U21)   and / or
##    years e.g. (1879-1893)
TEXT_RE = %r{
    ## must start with alpha (allow unicode letters!!)
    (?<text>
           ## positive lookbehind
           ##  (MUST be fixed number of chars - no quantifier e.g. +? etc.)
            (?<=[ ,;@|\[\]]
                 |^
            )
            (?:
                # opt 1 - start with alpha
                 \p{L}+    ## all unicode letters (e.g. [a-z])
                   |
                # opt 2 - start with num!! - allow special case (e.g. 1. FC)
                     \d+  # check for num lookahead (MUST be space or dot)
                      ## MUST be followed by (optional dot) and
                      ##                      required space !!!
                      ## MUST be follow by a to z!!!!
                      \.?     ## optional dot
                      [ ]?   ## make space optional too  - why? why not?
                             ##  yes - eg. 1st, 2nd, 5th etc.
                       \p{L}+
               )
              (?:(?:  (?:[ ]
                     (?!vs?\.?[ ])    ## note - exclude (v[ ]/vs[ ]/v.[ ]/vs.[ ])
                       )
                      |     # only single spaces allowed inline!!!
                     [-]
                  )?
                (?:
                  \p{L} |
                  [&/']
                    |
                 (?:
                   \d+
                   (?![0-9.:h'/+-])
                   ## negative lookahead for numbers
                   ##   note - include digits itself!!!
                 )|
                 \.
               )
              )*  ## must NOT end with space or dash(-)
              ##  todo/fix - possible in regex here
              ##     only end in alphanum a-z0-9 (not dot or & ???)
            ## allow optional at the end
            ##  tag or year
            ##   make it and in the future - why? why not?
            ##
            ## (A) -    allow with predined  alpha only for now
            ##          e.g. (A) - amateur a team or b?
            ##      or U21 U9 etc.   - why? why not?
            ##       or etc.
            ## (1879-1893) or allow years e.g. (1879-1893)
            ###
            (?:
               [ ]
                  \( (?:
                       A|B|
                       U\d{1,2}
                     )
                  \)
            )?
            (?:
               [ ]
               \(
                  \d{4}-\d{4}
               \)
            )?
            ## add lookahead/lookbehind
           ##    must be space!!!
           ##   (or comma or  start/end of string)
           ##   kind of \b !!!
            ## positive lookahead
            (?=[ ,;@|\[\]]
                 |$
            )
   )
}ix

end # class Parser
end # module SportDb