RubyGems - sportdb-parser - Versions diffs - 0.5.9 → 0.6.1 - Mend

sportdb-parser 0.5.9 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +1 -1
data/Manifest.txt +2 -0
data/lib/sportdb/parser/lexer.rb +101 -36
data/lib/sportdb/parser/parser.rb +561 -387
data/lib/sportdb/parser/racc_parser.rb +5 -3
data/lib/sportdb/parser/racc_tree.rb +12 -5
data/lib/sportdb/parser/token-date.rb +81 -13
data/lib/sportdb/parser/token-minute.rb +45 -0
data/lib/sportdb/parser/token-prop.rb +133 -0
data/lib/sportdb/parser/token-score.rb +25 -14
data/lib/sportdb/parser/token-text.rb +9 -2
data/lib/sportdb/parser/token.rb +51 -176
data/lib/sportdb/parser/version.rb +2 -2
data/lib/sportdb/parser.rb +2 -0
metadata +4 -2

data/lib/sportdb/parser/racc_parser.rb CHANGED Viewed

@@ -14,7 +14,7 @@ def initialize( txt,  debug: false )
     ### todo:
     ##  -  pass along debug flag
-    lexer = SportDb::Lexer.new( txt )
+    lexer = SportDb::Lexer.new( txt, debug: debug )
     ##  note - use tokenize_with_errors and add/collect tokenize errors
     @tokens, @errors = lexer.tokenize_with_errors
     ## pp @tokens
@@ -68,12 +68,14 @@ def initialize( txt,  debug: false )
   def on_error(error_token_id, error_value, value_stack)
-    args = [error_token_id, error_value, value_stack]
+    ## auto-add error_token (as string)
+    error_token = Racc_token_to_s_table[error_token_id]
+    args = [error_token, error_token_id, error_value, value_stack]
     puts
     puts "!! on parse error:"
     puts "args=#{args.pretty_inspect}"
-    @errors << "parse error on token: #{error_token_id} with value: #{error_value}, stack: #{value_stack.pretty_inspect}"
+    @errors << "parse error on token: #{error_token} (#{error_token_id}) with value: #{error_value}, stack: #{value_stack.pretty_inspect}"
     ## exit 1  ##   exit for now  -  get and print more info about context etc.!!
   end

data/lib/sportdb/parser/racc_tree.rb CHANGED Viewed

@@ -71,10 +71,14 @@ RoundDef   = Struct.new( :name, :date, :duration )  do
   end
 end
-DateHeader = Struct.new( :date ) do
+DateHeader = Struct.new( :date, :time, :geo, :timezone ) do
   def pretty_print( printer )
     printer.text( "<DateHeader " )
-    printer.text( "#{self.date.pretty_inspect}>" )
+    printer.text( "#{self.date.pretty_inspect}" )
+    printer.text( " time=#{self.time.pretty_inspect}" )          if self.time
+    printer.text( " geo=#{self.geo.pretty_inspect}" )            if self.geo
+    printer.text( " timezone=#{self.timezone}")             if self.timezone
+    printer.text( ">")
   end
 end
@@ -85,14 +89,17 @@ GroupHeader = Struct.new( :name ) do
   end
 end
-RoundHeader = Struct.new( :names ) do
+RoundHeader = Struct.new( :names, :group ) do
   def pretty_print( printer )
     printer.text( "<RoundHeader " )
-    printer.text( "#{self.names.join(', ')}>" )
+    printer.text( "#{self.names.join(', ')}" )
+    printer.text( " group=#{self.group}")    if self.group
+    printer.text( ">" )
   end
 end
-MatchLine   = Struct.new( :ord, :date, :time,
+MatchLine   = Struct.new( :ord, :date, :time, :wday,
                           :team1, :team2, :score,
                           :status,
                           :geo,

data/lib/sportdb/parser/token-date.rb CHANGED Viewed

@@ -146,12 +146,29 @@ DATE_II_RE = %r{
 )}ix
+# e.g. iso-date  -  2011-08-25
+##   note - allow/support ("shortcuts") e.g 2011-8-25  or 2011-8-3 / 2011-08-03 etc.
+DATE_III_RE = %r{
+(?<date>
+  \b
+   (?<year>\d{4})
+       -
+   (?<month>\d{1,2})
+       -
+   (?<day>\d{1,2})
+  \b
+)}ix
 #############################################
 # map tables
 #  note: order matters; first come-first matched/served
 DATE_RE = Regexp.union(
    DATE_I_RE,
-   DATE_II_RE
+   DATE_II_RE,
+   DATE_III_RE,
 )
@@ -197,29 +214,36 @@ end
 #
 #  Sun Jun/23 - Wed Jun/26   -- YES
 #  Jun/23 - Jun/26           -- YES
-#  Tue Jun/25 + Wed Jun/26   -- YES
-#  Jun/25 + Jun/26           -- YES
-#
-#  Jun/25 - 26        - why? why not???
+#  Jun/25 - 26        - why? why not???  - YES - see below variant iii!!!
+#  Tue Jun/25 + Wed Jun/26   -- NO
+#  Jun/25 + Jun/26           -- NO
 #  Jun/25 .. 26        - why? why not???
 #  Jun/25 to 26        - why? why not???
 #  Jun/25 + 26        - add - why? why not???
 #  Sun-Wed Jun/23-26  -  add - why? why not???
 #  Wed+Thu Jun/26+27 2024  -  add - why? why not???
 #
-#  maybe use comman and plus for list of dates
+#  maybe use comma and plus for list of dates
 #    Tue Jun/25, Wed Jun/26, Thu Jun/27  ??
 #    Tue Jun/25 + Wed Jun/26 + Thu Jun/27  ??
 #
 #   add back optional comma (before) year - why? why not?
+#
 ##
 #   todo add plus later on - why? why not?
+###   todo/fix  add optional comma (,) before year
+### regex note/tip/reminder -  \b () \b MUST always get enclosed in parentheses
+##                                     because alternation (|) has lowest priority/binding
 DURATION_I_RE =  %r{
 (?<duration>
     \b
+  (?:
    ## optional day name
    ((?<day_name1>#{DAY_NAMES})
       [ ]
@@ -228,12 +252,13 @@ DURATION_I_RE =  %r{
       (?: \/|[ ] )
    (?<day1>\d{1,2})
    ## optional year
-   ( [ ]
+   (  ,?   # optional comma
+      [ ]
       (?<year1>\d{4})
    )?
    ## support + and -  (add .. or such - why??)
-   [ ]*[-][ ]*
+   [ ]* - [ ]*
    ## optional day name
    ((?<day_name2>#{DAY_NAMES})
@@ -243,20 +268,28 @@ DURATION_I_RE =  %r{
       (?: \/|[ ] )
    (?<day2>\d{1,2})
    ## optional year
-   ( [ ]
+   (  ,?   # optional comma
+      [ ]
       (?<year2>\d{4})
    )?
+  )
    \b
 )}ix
+#   FIX - remove this variant
+#         "standardize on month day [year]" !!!!
+=begin
 ###
 #   variant ii
 # e.g. 26 July - 27 July
-DURATION_II_RE =  %r{
+#      26 July,
+XXX_DURATION_II_RE =  %r{
 (?<duration>
     \b
+  (?
    ## optional day name
    ((?<day_name1>#{DAY_NAMES})
       [ ]
@@ -265,7 +298,8 @@ DURATION_II_RE =  %r{
       [ ]
    (?<month_name1>#{MONTH_NAMES})
    ## optional year
-   ( [ ]
+   (
+       [ ]
       (?<year1>\d{4})
    )?
@@ -283,16 +317,50 @@ DURATION_II_RE =  %r{
    ( [ ]
       (?<year2>\d{4})
    )?
+  )
+   \b
+)}ix
+=end
+#  variant ii
+#  add support for shorthand
+#     August 16-18, 2011
+#     September 13-15, 2011
+#      October 18-20, 2011
+#      March/6-8, 2012
+#      March 6-8 2012
+#      March 6-8
+#
+#   - add support for August 16+17 or such (and check 16+18)
+#       use <op> to check if day2 is a plus or range or such - why? why not?
+DURATION_II_RE =  %r{
+(?<duration>
+    \b
+   (?:
+       (?<month_name1>#{MONTH_NAMES})
+           [ /]
+        (?<day1>\d{1,2})
+             -
+        (?<day2>\d{1,2})
+          (?:
+            ,?     ## optional comma
+            [ ]
+            (?<year1>\d{4})
+          )?     ## optional year
+   )
    \b
 )}ix
 #############################################
 # map tables
 #  note: order matters; first come-first matched/served
 DURATION_RE = Regexp.union(
    DURATION_I_RE,
-   DURATION_II_RE
+   DURATION_II_RE,
 )

data/lib/sportdb/parser/token-minute.rb ADDED Viewed

@@ -0,0 +1,45 @@
+module SportDb
+class Lexer
+#
+#  todo/check - move goal type regexes to goal or somewhere else?
+#
+##   goal types
+# (pen.) or (pen) or (p.) or (p)
+## (o.g.) or (og)
+##   todo/check - keep case-insensitive
+##                   or allow OG or P or PEN or
+##                   only lower case - why? why not?
+GOAL_PEN_RE = %r{
+   (?<pen> \(
+           (?:pen|p)\.?
+           \)
+    )
+}ix
+GOAL_OG_RE = %r{
+   (?<og> \(
+          (?:og|o\.g\.)
+          \)
+   )
+}ix
+MINUTE_RE = %r{
+     (?<minute>
+       (?<=[ (])	 # positive lookbehind for space or opening ( e.g. (61') required
+                     #    todo - add more lookbehinds e.g.  ,) etc. - why? why not?
+           (?<value>\d{1,3})      ## constrain numbers to 0 to 999!!!
+        (?: \+
+            (?<value2>\d{1,3})
+        )?
+        '     ## must have minute marker!!!!
+     )
+}ix
+end   # class Lexer
+end   # module SportDb

data/lib/sportdb/parser/token-prop.rb ADDED Viewed

@@ -0,0 +1,133 @@
+###
+##  team prop mode e.g.
+##
+##
+##    Fri Jun 14 21:00  @ München Fußball Arena, München
+##  (1)  Germany  v  Scotland   5-1 (3-0)
+##  Wirtz 10' Musiala 19' Havertz 45+1' (pen.) Füllkrug 68' Can 90+3'; Rüdiger 87' (o.g.)
+##
+## Germany:    Neuer - Kimmich, Rüdiger, Tah [Y], Mittelstädt - Andrich [Y] (Groß 46'),
+##       Kroos (Can 80') - Musiala (Müller 74'), Gündogan, Wirtz (Sane 63') -
+##       Havertz (Füllkrug 63')
+## Scotland:   Gunn - Porteous [R 44'], Hendry, Tierney (McKenna 78') - Ralston [Y],
+##       McTominay, McGregor (Gilmour 67'), Robertson - Christie (Shankland 82'),
+##       Adams (Hanley 46'), McGinn (McLean 67')
+module SportDb
+class Lexer
+## name different from text (does NOT allow number in name/text)
+PROP_NAME_RE = %r{
+                 (?<prop_name> \b
+                   (?<name>
+                      \p{L}+
+                        \.?    ## optional dot
+                      (?:
+                          [ ]?    # only single spaces allowed inline!!!
+                          (?:
+                              (?:
+                                (?<=\p{L})   ## use lookbehind
+                                 [/'-]   ## must be surrounded by letters
+                                       ## e.g. One/Two NOT
+                                       ##      One/ Two or One / Two or One /Two etc.
+                                (?=\p{L})      ## use lookahead
+                              )
+                                 |
+                              (?:
+                                (?<=[ ])   ## use lookbehind  -- add letter (plus dot) or such - why? why not?
+                                 [']   ## must be surrounded by leading space and
+                                       ## trailing letters  (e.g. UDI 'Beter Bed)
+                                (?=\p{L})      ## use lookahead
+                              )
+                                 |
+                              (?:
+                                (?<=\p{L})   ## use lookbehind
+                                 [']   ## must be surrounded by leading letter and
+                                       ## trailing space PLUS letter  (e.g. UDI' Beter Bed)
+                                (?=[ ]\p{L})      ## use lookahead (space WITH letter
+                              )
+                                 |   ## standard case with letter(s) and optional dot
+                              (?: \p{L}+
+                                    \.?  ## optional dot
+                              )
+                          )+
+                     )*
+                   )
+               ## add lookahead - must be non-alphanum
+                  (?=[ ,;\]\)]|$)
+                  )
+}ix
+##############
+#  add support for props/ attributes e.g.
+#
+#    Germany:    Neuer - Kimmich, Rüdiger, Tah [Y], Mittelstädt - Andrich [Y] (46' Groß),
+#      Kroos (80' Can) - Musiala (74' Müller), Gündogan,
+#      Wirtz (63' Sane) - Havertz (63' Füllkrug)
+#    Scotland:   Gunn - Porteous [R 44'], Hendry, Tierney (78' McKenna) - Ralston [Y],
+#      McTominay, McGregor (67' Gilmour), Robertson - Christie (82' Shankland),
+#      Adams (46' Hanley), McGinn (67' McLean)
+#
+## note:  colon (:) MUST be followed by one (or more) spaces
+##      make sure mon feb 12 18:10 will not match
+##        allow 1. FC Köln etc.
+##               Mainz 05:
+##           limit to 30 chars max
+##          only allow  chars incl. intl but (NOT ()[]/;)
+##
+## todo/fix:
+##   check if   St. Pölten     works; with starting St. ???
+  PROP_KEY_RE = %r{
+                 (?<prop_key> \b
+                   (?<key>
+                       (?:\p{L}+
+                           |
+                           \d+  # check for num lookahead (MUST be space or dot)
+                        ## MUST be followed by (optional dot) and
+                        ##                      required space !!!
+                        ## MUST be followed by a to z!!!!
+                         \.?     ## optional dot
+                         [ ]?   ## make space optional too  - why? why not?
+                             ##  yes - eg. 1st, 2nd, 5th etc.
+                         \p{L}+
+                        )
+                        [\d\p{L}'/° -]*?   ## allow almost anything
+                                          ## fix - add negative lookahead
+                                          ##         no space and dash etc.
+                                          ##    only allowed "inline" not at the end
+                                          ## must end with letter or digit!
+                   )
+                    [ ]*?     # slurp trailing spaces
+                     :
+                    (?=[ ]+)  ## positive lookahead (must be followed by space!!)
+                   )
+                 }ix
+PROP_BASICS_RE = %r{
+    (?<spaces> [ ]{2,}) |
+    (?<space>  [ ])
+        |
+    (?<sym>
+        [;,\(\)\[\]-]
+    )
+}ix
+PROP_RE = Regexp.union(
+   PROP_BASICS_RE,
+   MINUTE_RE,
+   PROP_NAME_RE,
+)
+end  # class Lexer
+end  # module SportDb

data/lib/sportdb/parser/token-score.rb CHANGED Viewed

@@ -17,7 +17,7 @@ class Lexer
     ##      3-4 pen.   2-2 a.e.t.
     ##               2-2 a.e.t.
     SCORE__P_ET__RE = %r{
-        (?<score>
+        (?<score_more>
            \b
             (?:
                (?<p1>\d{1,2}) - (?<p2>\d{1,2})
@@ -34,7 +34,7 @@ class Lexer
     ##  note: allow SPECIAL with penalty only
     ##      3-4 pen.
     SCORE__P__RE = %r{
-        (?<score>
+        (?<score_more>
            \b
               (?<p1>\d{1,2}) - (?<p2>\d{1,2})
                 [ ]* #{P_EN}
@@ -52,7 +52,7 @@ class Lexer
     ##               2-2 a.e.t. (1-1)
     SCORE__P_ET_FT_HT__RE = %r{
-          (?<score>
+          (?<score_more>
                \b
                (?:
                 (?<p1>\d{1,2}) - (?<p2>\d{1,2})
@@ -79,7 +79,7 @@ class Lexer
     ##   special case for case WITHOUT extra time!!
     ##     same as above (but WITHOUT extra time and pen required)
     SCORE__P_FT_HT__RE = %r{
-             (?<score>
+             (?<score_more>
                 \b
      (?<p1>\d{1,2}) - (?<p2>\d{1,2})
         [ ]* #{P_EN} [ ]+
@@ -99,36 +99,47 @@ class Lexer
             ## note: \b works only after non-alphanum e.g. )
-    ## e.g. 2-1 (1-1) or
-    ##      2-1
+    ##########
+    ## e.g. 2-1 (1-1)
     SCORE__FT_HT__RE = %r{
-            (?<score>
+            (?<score_more>
               \b
               (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
-               (?:
                    [ ]+ \( [ ]*
                 (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
                    [ ]* \)
-               )?   # note: make half time (HT) score optional for now
              (?=[ ,\]]|$)
              )}ix    ## todo/check:  remove lookahead assertion here - why require space?
                     ## note: \b works only after non-alphanum e.g. )
+    #####
+    ##      2-1
+    SCORE__FT__RE = %r{
+            (?<score>
+              \b
+              (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
+              \b
+             )}ix
 #############################################
 # map tables
 #  note: order matters; first come-first matched/served
+#
+## check - find a better name for SCORE_MORE - SCORE_EX, SCORE_BIG, or ___ - why? why not?
-SCORE_RE = Regexp.union(
+SCORE_MORE_RE = Regexp.union(
   SCORE__P_ET_FT_HT__RE,  # e.g. 5-1 pen. 2-2 a.e.t. (1-1, 1-0)
   SCORE__P_FT_HT__RE,     # e.g. 5-1 pen. (1-1)
   SCORE__P_ET__RE,        # e.g. 2-2 a.e.t.  or  5-1 pen. 2-2 a.e.t.
   SCORE__P__RE,           # e.g. 5-1 pen.
-  SCORE__FT_HT__RE,        # e.g. 1-1 (1-0) or 1-1  -- note - must go last!!!
+  SCORE__FT_HT__RE,        # e.g. 1-1 (1-0)
+  ##  note - keep basic score as its own token!!!!
+  ##   that is, SCORE & SCORE_MORE
+  ### SCORE__FT__RE,           # e.g. 1-1  -- note - must go last!!!
 )
+SCORE_RE   =   SCORE__FT__RE
 end  #  class Lexer
 end  # module SportDb

data/lib/sportdb/parser/token-text.rb CHANGED Viewed

@@ -24,6 +24,13 @@ class Lexer
 #  allow Cote'd Ivoir or such
 ##   e.g. add '
+## note:
+##  make sure these do NOT match!!!
+## TEXT  =>  "Matchday 1 / Group A"
+## TEXT  =>  "Matchday 2 / Group A"
+## TEXT  =>  "Matchday 3 / Group A"
 TEXT_RE = %r{
     ## must start with alpha (allow unicode letters!!)
@@ -59,11 +66,11 @@ TEXT_RE = %r{
                                ##    AND switch to case-sensitive (via -i!!!)
                       )
                       |     # only single spaces allowed inline!!!
-                     [-]
+                     [-/]
                   )?
                 (?:
                   \p{L} |
-                  [&/'°]
+                  [&'°]
                     |
                  (?:
                    \d+