RubyGems - sportdb-parser - Versions diffs - 0.5.9 → 0.6.0 - Mend

sportdb-parser 0.5.9 → 0.6.0

Files changed (15) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +1 -1
data/Manifest.txt +2 -0
data/lib/sportdb/parser/lexer.rb +47 -28
data/lib/sportdb/parser/parser.rb +421 -344
data/lib/sportdb/parser/racc_parser.rb +1 -1
data/lib/sportdb/parser/racc_tree.rb +12 -5
data/lib/sportdb/parser/token-date.rb +18 -1
data/lib/sportdb/parser/token-minute.rb +45 -0
data/lib/sportdb/parser/token-prop.rb +133 -0
data/lib/sportdb/parser/token-text.rb +9 -2
data/lib/sportdb/parser/token.rb +43 -177
data/lib/sportdb/parser/version.rb +2 -2
data/lib/sportdb/parser.rb +2 -0
metadata +4 -2

data/lib/sportdb/parser/racc_parser.rb CHANGED Viewed

@@ -14,7 +14,7 @@ def initialize( txt,  debug: false )
     ### note:
     ##  -  the debug flag gets passed along to the lexer
-    lexer = SportDb::Lexer.new( txt )
+    lexer = SportDb::Lexer.new( txt, debug: debug )
     ##  note - use tokenize_with_errors and add/collect tokenize errors
     @tokens, @errors = lexer.tokenize_with_errors
     ## pp @tokens

data/lib/sportdb/parser/racc_tree.rb CHANGED Viewed

@@ -71,10 +71,14 @@ RoundDef   = Struct.new( :name, :date, :duration )  do
   end
 end
-DateHeader = Struct.new( :date ) do
+DateHeader = Struct.new( :date, :time, :geo, :timezone ) do
   def pretty_print( printer )
     printer.text( "<DateHeader " )
-    printer.text( "#{self.date.pretty_inspect}>" )
+    printer.text( "#{self.date.pretty_inspect}" )
+    printer.text( " time=#{self.time.pretty_inspect}" )          if self.time
+    printer.text( " geo=#{self.geo.pretty_inspect}" )            if self.geo
+    printer.text( " timezone=#{self.timezone}")             if self.timezone
+    printer.text( ">")
   end
 end
@@ -85,14 +89,17 @@ GroupHeader = Struct.new( :name ) do
   end
 end
-RoundHeader = Struct.new( :names ) do
+RoundHeader = Struct.new( :names, :group ) do
   def pretty_print( printer )
     printer.text( "<RoundHeader " )
-    printer.text( "#{self.names.join(', ')}>" )
+    printer.text( "#{self.names.join(', ')}" )
+    printer.text( " group=#{self.group}")    if self.group
+    printer.text( ">" )
   end
 end
-MatchLine   = Struct.new( :ord, :date, :time,
+MatchLine   = Struct.new( :ord, :date, :time, :wday,
                           :team1, :team2, :score,
                           :status,
                           :geo,

data/lib/sportdb/parser/token-date.rb CHANGED Viewed

@@ -146,12 +146,29 @@ DATE_II_RE = %r{
 )}ix
+# e.g. iso-date  -  2011-08-25
+##    todo/check - allow  2011-8-25  or 2011-8-3 / 2011-08-03 etc. - why? why not?
+DATE_III_RE = %r{
+(?<date>
+  \b
+   (?<year>\d{4})
+       -
+   (?<month>\d{2})
+       -
+   (?<day>\d{2})
+  \b
+)}ix
 #############################################
 # map tables
 #  note: order matters; first come-first matched/served
 DATE_RE = Regexp.union(
    DATE_I_RE,
-   DATE_II_RE
+   DATE_II_RE,
+   DATE_III_RE,
 )

data/lib/sportdb/parser/token-minute.rb ADDED Viewed

@@ -0,0 +1,45 @@
+module SportDb
+class Lexer
+#
+#  todo/check - move goal type regexes to goal or somewhere else?
+#
+##   goal types
+# (pen.) or (pen) or (p.) or (p)
+## (o.g.) or (og)
+##   todo/check - keep case-insensitive
+##                   or allow OG or P or PEN or
+##                   only lower case - why? why not?
+GOAL_PEN_RE = %r{
+   (?<pen> \(
+           (?:pen|p)\.?
+           \)
+    )
+}ix
+GOAL_OG_RE = %r{
+   (?<og> \(
+          (?:og|o\.g\.)
+          \)
+   )
+}ix
+MINUTE_RE = %r{
+     (?<minute>
+       (?<=[ (])	 # positive lookbehind for space or opening ( e.g. (61') required
+                     #    todo - add more lookbehinds e.g.  ,) etc. - why? why not?
+           (?<value>\d{1,3})      ## constrain numbers to 0 to 999!!!
+        (?: \+
+            (?<value2>\d{1,3})
+        )?
+        '     ## must have minute marker!!!!
+     )
+}ix
+end   # class Lexer
+end   # module SportDb

data/lib/sportdb/parser/token-prop.rb ADDED Viewed

@@ -0,0 +1,133 @@
+###
+##  team prop mode e.g.
+##
+##
+##    Fri Jun 14 21:00  @ München Fußball Arena, München
+##  (1)  Germany  v  Scotland   5-1 (3-0)
+##  Wirtz 10' Musiala 19' Havertz 45+1' (pen.) Füllkrug 68' Can 90+3'; Rüdiger 87' (o.g.)
+##
+## Germany:    Neuer - Kimmich, Rüdiger, Tah [Y], Mittelstädt - Andrich [Y] (Groß 46'),
+##       Kroos (Can 80') - Musiala (Müller 74'), Gündogan, Wirtz (Sane 63') -
+##       Havertz (Füllkrug 63')
+## Scotland:   Gunn - Porteous [R 44'], Hendry, Tierney (McKenna 78') - Ralston [Y],
+##       McTominay, McGregor (Gilmour 67'), Robertson - Christie (Shankland 82'),
+##       Adams (Hanley 46'), McGinn (McLean 67')
+module SportDb
+class Lexer
+## name different from text (does NOT allow number in name/text)
+PROP_NAME_RE = %r{
+                 (?<prop_name> \b
+                   (?<name>
+                      \p{L}+
+                        \.?    ## optional dot
+                      (?:
+                          [ ]?    # only single spaces allowed inline!!!
+                          (?:
+                              (?:
+                                (?<=\p{L})   ## use lookbehind
+                                 [/'-]   ## must be surrounded by letters
+                                       ## e.g. One/Two NOT
+                                       ##      One/ Two or One / Two or One /Two etc.
+                                (?=\p{L})      ## use lookahead
+                              )
+                                 |
+                              (?:
+                                (?<=[ ])   ## use lookbehind  -- add letter (plus dot) or such - why? why not?
+                                 [']   ## must be surrounded by leading space and
+                                       ## trailing letters  (e.g. UDI 'Beter Bed)
+                                (?=\p{L})      ## use lookahead
+                              )
+                                 |
+                              (?:
+                                (?<=\p{L})   ## use lookbehind
+                                 [']   ## must be surrounded by leading letter and
+                                       ## trailing space PLUS letter  (e.g. UDI' Beter Bed)
+                                (?=[ ]\p{L})      ## use lookahead (space WITH letter
+                              )
+                                 |   ## standard case with letter(s) and optional dot
+                              (?: \p{L}+
+                                    \.?  ## optional dot
+                              )
+                          )+
+                     )*
+                   )
+               ## add lookahead - must be non-alphanum
+                  (?=[ ,;\]\)]|$)
+                  )
+}ix
+##############
+#  add support for props/ attributes e.g.
+#
+#    Germany:    Neuer - Kimmich, Rüdiger, Tah [Y], Mittelstädt - Andrich [Y] (46' Groß),
+#      Kroos (80' Can) - Musiala (74' Müller), Gündogan,
+#      Wirtz (63' Sane) - Havertz (63' Füllkrug)
+#    Scotland:   Gunn - Porteous [R 44'], Hendry, Tierney (78' McKenna) - Ralston [Y],
+#      McTominay, McGregor (67' Gilmour), Robertson - Christie (82' Shankland),
+#      Adams (46' Hanley), McGinn (67' McLean)
+#
+## note:  colon (:) MUST be followed by one (or more) spaces
+##      make sure mon feb 12 18:10 will not match
+##        allow 1. FC Köln etc.
+##               Mainz 05:
+##           limit to 30 chars max
+##          only allow  chars incl. intl but (NOT ()[]/;)
+##
+## todo/fix:
+##   check if   St. Pölten     works; with starting St. ???
+  PROP_KEY_RE = %r{
+                 (?<prop_key> \b
+                   (?<key>
+                       (?:\p{L}+
+                           |
+                           \d+  # check for num lookahead (MUST be space or dot)
+                        ## MUST be followed by (optional dot) and
+                        ##                      required space !!!
+                        ## MUST be followed by a to z!!!!
+                         \.?     ## optional dot
+                         [ ]?   ## make space optional too  - why? why not?
+                             ##  yes - eg. 1st, 2nd, 5th etc.
+                         \p{L}+
+                        )
+                        [\d\p{L}'/° -]*?   ## allow almost anything
+                                          ## fix - add negative lookahead
+                                          ##         no space and dash etc.
+                                          ##    only allowed "inline" not at the end
+                                          ## must end with letter or digit!
+                   )
+                    [ ]*?     # slurp trailing spaces
+                     :
+                    (?=[ ]+)  ## positive lookahead (must be followed by space!!)
+                   )
+                 }ix
+PROP_BASICS_RE = %r{
+    (?<spaces> [ ]{2,}) |
+    (?<space>  [ ])
+        |
+    (?<sym>
+        [;,\(\)\[\]-]
+    )
+}ix
+PROP_RE = Regexp.union(
+   PROP_BASICS_RE,
+   MINUTE_RE,
+   PROP_NAME_RE,
+)
+end  # class Lexer
+end  # module SportDb

data/lib/sportdb/parser/token-text.rb CHANGED Viewed

@@ -24,6 +24,13 @@ class Lexer
 #  allow Cote'd Ivoir or such
 ##   e.g. add '
+## note:
+##  make sure these do NOT match!!!
+## TEXT  =>  "Matchday 1 / Group A"
+## TEXT  =>  "Matchday 2 / Group A"
+## TEXT  =>  "Matchday 3 / Group A"
 TEXT_RE = %r{
     ## must start with alpha (allow unicode letters!!)
@@ -59,11 +66,11 @@ TEXT_RE = %r{
                                ##    AND switch to case-sensitive (via -i!!!)
                       )
                       |     # only single spaces allowed inline!!!
-                     [-]
+                     [-/]
                   )?
                 (?:
                   \p{L} |
-                  [&/'°]
+                  [&'°]
                     |
                  (?:
                    \d+

data/lib/sportdb/parser/token.rb CHANGED Viewed

@@ -7,13 +7,14 @@ class Lexer
 ##
 #  keep 18h30 - why? why not?
 #    add support for 6:30pm 8:20am etc. - why? why not?
+#
+#    check - only support h e.g. 18h30  or 18H30 too - why? why not?
+# e.g. 18.30 (or 18:30 or 18h30)
 TIME_RE = %r{
-    ## e.g. 18.30 (or 18:30 or 18h30)
     (?<time>  \b
-              (?<hour>\d{1,2})
+        (?:   (?<hour>\d{1,2})
                  (?: :|\.|h )
-              (?<minute>\d{2})
+              (?<minute>\d{2}))
               \b
     )
 }ix
@@ -42,9 +43,12 @@ TIME_RE = %r{
 #   https://en.wikipedia.org/wiki/Time_zone
 #   https://en.wikipedia.org/wiki/List_of_UTC_offsets
 #   https://en.wikipedia.org/wiki/UTC−04:00  etc.
+#
+#  e.g. (UTC-2) or (CEST/UTC-2) etc.
+#    todo check - only allow upcase
+#    or  (utc-2) and (cest/utc-2) too - why? why not?
 TIMEZONE_RE = %r{
-   ## e.g. (UTC-2) or (CEST/UTC-2) etc.
    (?<timezone>
       \(
            ## optional "local" timezone name eg. BRT or CEST etc.
@@ -60,6 +64,35 @@ TIMEZONE_RE = %r{
+## add wday / stand-alone week day - as separate regex or
+##          use TEXT with is_wday? check or such with
+##                requirement of beginning of line (anchored to line) only??
+##       - why? why not?
+WDAY_RE = %r{
+(?<wday>
+  \b     # note - alternation (|) has the lowest precedence, so
+         #    parentheses are required around the day names, i.e. \b(...)\b !!!
+         ## note - case sensitive (NOT case-insensitive)!!!
+       (?<day_name>
+        (?-i:
+          Mon|Mo|
+          Tue|Tu|
+          Wed|We|
+          Thu|Th|
+          Fri|Fr|
+          Sat|Sa|
+          Sun|Su
+       ))
+  \b     ## todo/check - must be followed by two spaces or space + [( etc.
+         ##   to allow words starting with weekday abbreviations - why? why not?
+         ##     check if any names (teams, rounds, etc) come up in practice
+         ##   or maybe remove three letter abbreviations Mon/Tue
+         ##    and keep only Mo/Tu/We etc. - why? why not?
+)}x
 BASICS_RE = %r{
     ## e.g. (51) or (1) etc.  - limit digits of number???
@@ -78,189 +111,22 @@ BASICS_RE = %r{
     (?<spaces> [ ]{2,}) |
     (?<space>  [ ])
         |
-    (?<sym>[;,@|\[\]-])
-}ix
-## removed from basics
-=begin
-    (?<none>
-       (?<=[ \[]|^)	 # Positive lookbehind for space or [
-           -
-        (?=[ ]*;)   # positive lookahead for space
-    )
-       |
-   (?<vs>
-       (?<=[ ])	# Positive lookbehind for space
-       (?:
-          vs\.?|   ## allow optional dot (eg. vs. v.)
-          v\.?|
-          -
-       )   # not bigger match first e.g. vs than v etc.
-       (?=[ ])   # positive lookahead for space
-    )
-       |
-    make - into a simple symbol !!!
-=end
-MINUTE_RE = %r{
-     (?<minute>
-       (?<=[ (])	 # Positive lookbehind for space or opening ( e.g. (61') required
-           (?<value>\d{1,3})      ## constrain numbers to 0 to 999!!!
-        (?: \+
-            (?<value2>\d{1,3})
-        )?
-        '     ## must have minute marker!!!!
-     )
-}ix
-##   goal types
-# (pen.) or (pen) or (p.) or (p)
-## (o.g.) or (og)
-GOAL_PEN_RE = %r{
-   (?<pen> \(
-           (?:pen|p)\.?
-           \)
-    )
-}ix
-GOAL_OG_RE = %r{
-   (?<og> \(
-          (?:og|o\.g\.)
-          \)
-   )
+    (?<sym>[;,/@|\[\]-])
 }ix
-PROP_BASICS_RE = %r{
-    (?<spaces> [ ]{2,}) |
-    (?<space>  [ ])
-        |
-    (?<sym>[.;,\(\)\[\]-])   ## note - dot (.) is the (all-important) end-of-prop marker!!!
-}ix
-## name different from text (does not allow number in name/text)
-##
-##  note - includes special handling for dot (.) if at the end of line!!!
-##            end-of-line dot (.) is the prop end-of-marker - do NOT eat-up!!!
-PROP_NAME_RE = %r{
-                 (?<prop_name> \b
-                   (?<name>
-                      \p{L}+
-                       (?: \. (?: (?![ ]*$) )
-                        )?      ## edge case - check for end of prop marker! (e.g. Stop.)
-                      (?:
-                          [ ]?    # only single spaces allowed inline!!!
-                          (?:
-                              (?:
-                                (?<=\p{L})   ## use lookbehind
-                                 [/'-]   ## must be surrounded by letters
-                                       ## e.g. One/Two NOT
-                                       ##      One/ Two or One / Two or One /Two etc.
-                                (?=\p{L})      ## use lookahead
-                              )
-                                 |
-                              (?:
-                                (?<=[ ])   ## use lookbehind  -- add letter (plus dot) or such - why? why not?
-                                 [']   ## must be surrounded by leading space and
-                                       ## traling letters  (e.g. UDI 'Beter Bed)
-                                (?=\p{L})      ## use lookahead
-                              )
-                                 |
-                              (?:
-                                (?<=\p{L})   ## use lookbehind
-                                 [']   ## must be surrounded by leading letter and
-                                       ## trailing space PLUS letter  (e.g. UDI' Beter Bed)
-                                (?=[ ]\p{L})      ## use lookahead (space WITH letter
-                              )
-                                 |
-                              (?: \p{L}+
-                                  (?: \.
-                                      (?: (?![ ]*$) )
-                                  )?  ## last dot is delimiter!!!
-                              )
-                          )+
-                     )*
-                   )
-               ## add lookahead - must be non-alphanum (or dot)
-                  (?=[ .,;\]\)]|$)
-                  )
-}ix
-##############
-#  add support for props/ attributes e.g.
-#
-#    Germany:    Neuer - Kimmich, Rüdiger, Tah [Y], Mittelstädt – Andrich [Y] (46' Groß),
-#      Kroos (80' Can) – Musiala (74' Müller), Gündogan,
-#      Wirtz (63' Sane) – Havertz (63' Füllkrug).
-#    Scotland:   Gunn – Porteous [R 44'], Hendry, Tierney (78' McKenna) – Ralston [Y],
-#      McTominay, McGregor (67' Gilmour), Robertson – Christie (82' Shankland),
-#      Adams (46' Hanley), McGinn (67' McLean).
-#
-## note:  colon (:) MUST be followed by one (or more) spaces
-##      make sure mon feb 12 18:10 will not match
-##        allow 1. FC Köln etc.
-##               Mainz 05:
-##           limit to 30 chars max
-##          only allow  chars incl. intl but (NOT ()[]/;)
-  PROP_KEY_RE = %r{
-                 (?<prop_key> \b
-                   (?<key>
-                       (?:\p{L}+
-                           |
-                           \d+  # check for num lookahead (MUST be space or dot)
-                        ## MUST be followed by (optional dot) and
-                        ##                      required space !!!
-                        ## MUST be follow by a to z!!!!
-                         \.?     ## optional dot
-                         [ ]?   ## make space optional too  - why? why not?
-                             ##  yes - eg. 1st, 2nd, 5th etc.
-                         \p{L}+
-                        )
-                        [\d\p{L}'/° -]*?   ## allow almost anyting
-                                          ## fix - add negative lookahead
-                                          ##         no space and dash etc.
-                                          ##    only allowed "inline" not at the end
-                                          ## must end with latter or digit!
-                   )
-                    [ ]*?     # slurp trailing spaces
-                     :
-                    (?=[ ]+)  ## possitive lookahead (must be followed by space!!)
-                   )
-                 }ix
-PROP_RE = Regexp.union(
-   PROP_BASICS_RE,
-   MINUTE_RE,
-   PROP_NAME_RE,
-)
 RE = Regexp.union(  PROP_KEY_RE, ##  start with prop key (match will/should switch into prop mode!!!)
                     STATUS_RE,
                     TIMEZONE_RE,
                      TIME_RE,
                      DURATION_RE,  # note - duration MUST match before date
                     DATE_RE,
+                    WDAY_RE,   # allow standalone weekday name (e.g. Mo/Tu/etc.) - why? why not?
                     SCORE_RE,
-                    BASICS_RE, MINUTE_RE,
+                    BASICS_RE,
+                    MINUTE_RE,
                     GOAL_OG_RE, GOAL_PEN_RE,
                      TEXT_RE )

data/lib/sportdb/parser/version.rb CHANGED Viewed

@@ -3,8 +3,8 @@ module SportDb
   module Module
     module Parser
   MAJOR = 0    ## todo: namespace inside version or something - why? why not??
-  MINOR = 5
-  PATCH = 9
+  MINOR = 6
+  PATCH = 0
   VERSION = [MAJOR,MINOR,PATCH].join('.')
   def self.version

data/lib/sportdb/parser.rb CHANGED Viewed

@@ -21,6 +21,8 @@ require_relative 'parser/token-score'
 require_relative 'parser/token-date'
 require_relative 'parser/token-text'
 require_relative 'parser/token-status'
+require_relative 'parser/token-minute'
+require_relative 'parser/token-prop'    ## team prop(erty) mode (note - must be before token)
 require_relative 'parser/token'
 require_relative 'parser/lexer'

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: sportdb-parser
 version: !ruby/object:Gem::Version
-  version: 0.5.9
+  version: 0.6.0
 platform: ruby
 authors:
 - Gerald Bauer
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-01-29 00:00:00.000000000 Z
+date: 2025-01-30 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: cocos
@@ -102,6 +102,8 @@ files:
 - lib/sportdb/parser/racc_parser.rb
 - lib/sportdb/parser/racc_tree.rb
 - lib/sportdb/parser/token-date.rb
+- lib/sportdb/parser/token-minute.rb
+- lib/sportdb/parser/token-prop.rb
 - lib/sportdb/parser/token-score.rb
 - lib/sportdb/parser/token-status.rb
 - lib/sportdb/parser/token-text.rb