sportdb-parser 0.6.3 → 0.6.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -1
- data/lib/sportdb/parser/lexer.rb +140 -17
- data/lib/sportdb/parser/parser.rb +414 -354
- data/lib/sportdb/parser/racc_tree.rb +24 -0
- data/lib/sportdb/parser/token-date.rb +20 -0
- data/lib/sportdb/parser/token-minute.rb +140 -0
- data/lib/sportdb/parser/token-prop.rb +17 -9
- data/lib/sportdb/parser/token.rb +39 -10
- data/lib/sportdb/parser/version.rb +1 -1
- metadata +2 -2
| @@ -122,6 +122,30 @@ MatchLine   = Struct.new( :ord, :date, :time, :wday, | |
| 122 122 |  | 
| 123 123 | 
             
            end
         | 
| 124 124 |  | 
| 125 | 
            +
            ## check - use a different name e.g. GoalLineScore or such - why? why not?
         | 
| 126 | 
            +
            GoalLineAlt = Struct.new( :goals ) do
         | 
| 127 | 
            +
              def pretty_print( printer )
         | 
| 128 | 
            +
                printer.text( "<GoalLineAlt " )
         | 
| 129 | 
            +
                printer.text( "goals=" + self.goals.pretty_inspect + ">" )
         | 
| 130 | 
            +
              end  
         | 
| 131 | 
            +
            end
         | 
| 132 | 
            +
             | 
| 133 | 
            +
            GoalAlt   = Struct.new( :score, :player, :minute ) do
         | 
| 134 | 
            +
              def to_s
         | 
| 135 | 
            +
                buf = String.new
         | 
| 136 | 
            +
                buf << "#{score} "
         | 
| 137 | 
            +
                buf << "#{self.player}"
         | 
| 138 | 
            +
                buf << " #{self.minute}"  if self.minute
         | 
| 139 | 
            +
                buf
         | 
| 140 | 
            +
              end
         | 
| 141 | 
            +
             | 
| 142 | 
            +
              def pretty_print( printer )
         | 
| 143 | 
            +
                printer.text( to_s )
         | 
| 144 | 
            +
              end  
         | 
| 145 | 
            +
            end
         | 
| 146 | 
            +
             | 
| 147 | 
            +
             | 
| 148 | 
            +
             | 
| 125 149 | 
             
            GoalLine    = Struct.new( :goals1, :goals2 ) do
         | 
| 126 150 | 
             
              def pretty_print( printer )
         | 
| 127 151 | 
             
                printer.text( "<GoalLine " )
         | 
| @@ -159,6 +159,25 @@ DATE_III_RE = %r{ | |
| 159 159 | 
             
              \b
         | 
| 160 160 | 
             
            )}ix
         | 
| 161 161 |  | 
| 162 | 
            +
            ## allow (short)"european" style  8.8. 
         | 
| 163 | 
            +
            ##   note - assume day/month!!!
         | 
| 164 | 
            +
            DATE_IIII_RE = %r{
         | 
| 165 | 
            +
            (?<date>
         | 
| 166 | 
            +
              \b
         | 
| 167 | 
            +
               (?<day>\d{1,2})
         | 
| 168 | 
            +
                   \.
         | 
| 169 | 
            +
               (?<month>\d{1,2})
         | 
| 170 | 
            +
                   \.
         | 
| 171 | 
            +
               (?: (?: 
         | 
| 172 | 
            +
                      (?<year>\d{4})        ## optional year 2025 (yyyy)
         | 
| 173 | 
            +
                          |
         | 
| 174 | 
            +
                      (?<yy>\d{2})           ## optional year 25 (yy)
         | 
| 175 | 
            +
                   )
         | 
| 176 | 
            +
                    \b
         | 
| 177 | 
            +
               )?
         | 
| 178 | 
            +
            )
         | 
| 179 | 
            +
            }ix
         | 
| 180 | 
            +
             | 
| 162 181 |  | 
| 163 182 |  | 
| 164 183 |  | 
| @@ -169,6 +188,7 @@ DATE_RE = Regexp.union( | |
| 169 188 | 
             
               DATE_I_RE,
         | 
| 170 189 | 
             
               DATE_II_RE,
         | 
| 171 190 | 
             
               DATE_III_RE,
         | 
| 191 | 
            +
               DATE_IIII_RE,    ## e.g. 8.8. or 13.8.79 or 14.08.1973 (day.month.year)
         | 
| 172 192 | 
             
            )
         | 
| 173 193 |  | 
| 174 194 |  | 
| @@ -54,6 +54,146 @@ MINUTE_RE = %r{ | |
| 54 54 | 
             
            }ix
         | 
| 55 55 |  | 
| 56 56 |  | 
| 57 | 
            +
            #####
         | 
| 58 | 
            +
            #  player with minute (top-level) regex 
         | 
| 59 | 
            +
            #   - starts new player/goal mode (until end of line)!!!
         | 
| 60 | 
            +
            #   - note: allow one or more spaces between name and minute
         | 
| 61 | 
            +
            #
         | 
| 62 | 
            +
            #  note - aaa  bbb 40'
         | 
| 63 | 
            +
            #      make sure anchor (^) - beginning of line - present!!!
         | 
| 64 | 
            +
            #       note - will NOT work with ^ anchor!!
         | 
| 65 | 
            +
            #       use special \G - Matches first matching position !!!!
         | 
| 66 | 
            +
            #          otherwise you get matches such as >bbb 40'< skipping >aaa< etc.!!!
         | 
| 67 | 
            +
            #
         | 
| 68 | 
            +
            #   regex question - check if in a regex union - space regex gets matched
         | 
| 69 | 
            +
            #                          or others with first matching position 
         | 
| 70 | 
            +
            #                          or if chars get eaten-up? 
         | 
| 71 | 
            +
            #                        let us know if \G is required here or not
         | 
| 72 | 
            +
             | 
| 73 | 
            +
             | 
| 74 | 
            +
            PLAYER_WITH_MINUTE_RE = %r{
         | 
| 75 | 
            +
                       ^    ### note - MUST start line; leading spaces optional (eat-up)
         | 
| 76 | 
            +
                       [ ]*
         | 
| 77 | 
            +
                         (?:      # optional open bracket ([) -- remove later
         | 
| 78 | 
            +
                            (?<open_bracket> \[ )
         | 
| 79 | 
            +
                            [ ]*
         | 
| 80 | 
            +
                         )?
         | 
| 81 | 
            +
                         (?:     # optional none a.k.a. -;   - what todo here?
         | 
| 82 | 
            +
                           (?<none>  - [ ]* ; [ ]* )
         | 
| 83 | 
            +
                         )?
         | 
| 84 | 
            +
               (?<player_with_minute>
         | 
| 85 | 
            +
                               (?<name>
         | 
| 86 | 
            +
                                  \p{L}+       
         | 
| 87 | 
            +
                                    \.?    ## optional dot
         | 
| 88 | 
            +
                   
         | 
| 89 | 
            +
                                      (?:
         | 
| 90 | 
            +
                                          ## rule for space; only one single space allowed inline!!!
         | 
| 91 | 
            +
                                          (?:
         | 
| 92 | 
            +
                                            (?<![ ])  ## use negative lookbehind                             
         | 
| 93 | 
            +
                                              [ ] 
         | 
| 94 | 
            +
                                            (?=\p{L}|')      ## use lookahead        
         | 
| 95 | 
            +
                                          )
         | 
| 96 | 
            +
                                              |
         | 
| 97 | 
            +
                                          (?:
         | 
| 98 | 
            +
                                            (?<=\p{L})   ## use lookbehind
         | 
| 99 | 
            +
                                             ['-]   ## must be surrounded by letters
         | 
| 100 | 
            +
                                                   ## e.g. One/Two NOT
         | 
| 101 | 
            +
                                                   ##      One/ Two or One / Two or One /Two etc.
         | 
| 102 | 
            +
                                            (?=\p{L})      ## use lookahead        
         | 
| 103 | 
            +
                                          )
         | 
| 104 | 
            +
                                             |   
         | 
| 105 | 
            +
                                          (?:
         | 
| 106 | 
            +
                                            (?<=[ ])   ## use lookbehind  -- add letter (plus dot) or such - why? why not?
         | 
| 107 | 
            +
                                             [']   ## must be surrounded by leading space and
         | 
| 108 | 
            +
                                                   ## trailing letters  (e.g. UDI 'Beter Bed)
         | 
| 109 | 
            +
                                            (?=\p{L})      ## use lookahead        
         | 
| 110 | 
            +
                                          )   
         | 
| 111 | 
            +
                                             |
         | 
| 112 | 
            +
                                          (?:
         | 
| 113 | 
            +
                                            (?<=\p{L})   ## use lookbehind
         | 
| 114 | 
            +
                                             [']   ## must be surrounded by leading letter and
         | 
| 115 | 
            +
                                                   ## trailing space PLUS letter  (e.g. UDI' Beter Bed)
         | 
| 116 | 
            +
                                            (?=[ ]\p{L})      ## use lookahead (space WITH letter)
         | 
| 117 | 
            +
                                          )   
         | 
| 118 | 
            +
                                             |   ## standard case with letter(s) and optional dot
         | 
| 119 | 
            +
                                          (?: \p{L}+
         | 
| 120 | 
            +
                                                \.?  ## optional dot
         | 
| 121 | 
            +
                                          )
         | 
| 122 | 
            +
                                      )*
         | 
| 123 | 
            +
                               )
         | 
| 124 | 
            +
            #### spaces
         | 
| 125 | 
            +
                 (?: [ ]+)
         | 
| 126 | 
            +
            #### minute (see above)
         | 
| 127 | 
            +
            #####   use MINUTE_RE.source or such - for inline (reference) use? do not copy
         | 
| 128 | 
            +
                 (?<minute>
         | 
| 129 | 
            +
                   (?<=[ (])	 # positive lookbehind for space or opening ( e.g. (61') required
         | 
| 130 | 
            +
                                 #    todo - add more lookbehinds e.g.  ,) etc. - why? why not?
         | 
| 131 | 
            +
                       (?: 
         | 
| 132 | 
            +
                          (?<value>\d{1,3})      ## constrain numbers to 0 to 999!!!
         | 
| 133 | 
            +
                               (?: \+
         | 
| 134 | 
            +
                                 (?<value2>\d{1,3})   
         | 
| 135 | 
            +
                               )?
         | 
| 136 | 
            +
                           |
         | 
| 137 | 
            +
                          (?<value> \?{2} | _{2} )  ## add support for n/a (not/available)
         | 
| 138 | 
            +
                       )           
         | 
| 139 | 
            +
                    '     ## must have minute marker!!!!
         | 
| 140 | 
            +
                 )
         | 
| 141 | 
            +
             
         | 
| 142 | 
            +
               )   
         | 
| 143 | 
            +
            }ix
         | 
| 144 | 
            +
             | 
| 145 | 
            +
             | 
| 146 | 
            +
            PLAYER_WITH_SCORE_RE = %r{
         | 
| 147 | 
            +
                       ^    ### note - MUST start line; leading spaces optional (eat-up)
         | 
| 148 | 
            +
                       [ ]*
         | 
| 149 | 
            +
               (?<player_with_score>
         | 
| 150 | 
            +
                               (?<score>
         | 
| 151 | 
            +
                                 (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
         | 
| 152 | 
            +
                               )
         | 
| 153 | 
            +
                                  [ ]+
         | 
| 154 | 
            +
                               (?<name>
         | 
| 155 | 
            +
                                  \p{L}+       
         | 
| 156 | 
            +
                                    \.?    ## optional dot
         | 
| 157 | 
            +
                   
         | 
| 158 | 
            +
                                      (?:
         | 
| 159 | 
            +
                                          ## rule for space; only one single space allowed inline!!!
         | 
| 160 | 
            +
                                          (?:
         | 
| 161 | 
            +
                                            (?<![ ])  ## use negative lookbehind                             
         | 
| 162 | 
            +
                                              [ ] 
         | 
| 163 | 
            +
                                            (?=\p{L}|')      ## use lookahead        
         | 
| 164 | 
            +
                                          )
         | 
| 165 | 
            +
                                              |
         | 
| 166 | 
            +
                                          (?:
         | 
| 167 | 
            +
                                            (?<=\p{L})   ## use lookbehind
         | 
| 168 | 
            +
                                             ['-]   ## must be surrounded by letters
         | 
| 169 | 
            +
                                                   ## e.g. One/Two NOT
         | 
| 170 | 
            +
                                                   ##      One/ Two or One / Two or One /Two etc.
         | 
| 171 | 
            +
                                            (?=\p{L})      ## use lookahead        
         | 
| 172 | 
            +
                                          )
         | 
| 173 | 
            +
                                             |   
         | 
| 174 | 
            +
                                          (?:
         | 
| 175 | 
            +
                                            (?<=[ ])   ## use lookbehind  -- add letter (plus dot) or such - why? why not?
         | 
| 176 | 
            +
                                             [']   ## must be surrounded by leading space and
         | 
| 177 | 
            +
                                                   ## trailing letters  (e.g. UDI 'Beter Bed)
         | 
| 178 | 
            +
                                            (?=\p{L})      ## use lookahead        
         | 
| 179 | 
            +
                                          )   
         | 
| 180 | 
            +
                                             |
         | 
| 181 | 
            +
                                          (?:
         | 
| 182 | 
            +
                                            (?<=\p{L})   ## use lookbehind
         | 
| 183 | 
            +
                                             [']   ## must be surrounded by leading letter and
         | 
| 184 | 
            +
                                                   ## trailing space PLUS letter  (e.g. UDI' Beter Bed)
         | 
| 185 | 
            +
                                            (?=[ ]\p{L})      ## use lookahead (space WITH letter)
         | 
| 186 | 
            +
                                          )   
         | 
| 187 | 
            +
                                             |   ## standard case with letter(s) and optional dot
         | 
| 188 | 
            +
                                          (?: \p{L}+
         | 
| 189 | 
            +
                                                \.?  ## optional dot
         | 
| 190 | 
            +
                                          )
         | 
| 191 | 
            +
                                      )*
         | 
| 192 | 
            +
                               )   ## name
         | 
| 193 | 
            +
                        ### check/todo - add lookahead  (e.g. must be space or ,$) why? why not?               
         | 
| 194 | 
            +
                )  ## player_with_score 
         | 
| 195 | 
            +
            }ix
         | 
| 196 | 
            +
             | 
| 57 197 |  | 
| 58 198 |  | 
| 59 199 | 
             
            end   # module SportDb
         | 
| @@ -19,18 +19,23 @@ class Lexer | |
| 19 19 |  | 
| 20 20 |  | 
| 21 21 | 
             
            ## name different from text (does NOT allow number in name/text)
         | 
| 22 | 
            -
             | 
| 23 22 | 
             
            PROP_NAME_RE = %r{
         | 
| 24 | 
            -
                             (?<prop_name>  | 
| 23 | 
            +
                             (?<prop_name> 
         | 
| 24 | 
            +
                                  \b
         | 
| 25 25 | 
             
                               (?<name>
         | 
| 26 26 | 
             
                                  \p{L}+       
         | 
| 27 27 | 
             
                                    \.?    ## optional dot
         | 
| 28 | 
            -
                                  (?: 
         | 
| 29 | 
            -
                                      [ ]?    # only single spaces allowed inline!!!
         | 
| 30 28 | 
             
                                      (?:
         | 
| 29 | 
            +
                                         ## rule for space; only one single space allowed inline!!!
         | 
| 31 30 | 
             
                                          (?:
         | 
| 31 | 
            +
                                            (?<![ ])  ## use negative lookbehind                             
         | 
| 32 | 
            +
                                              [ ] 
         | 
| 33 | 
            +
                                            (?=\p{L}|')      ## use lookahead        
         | 
| 34 | 
            +
                                          )
         | 
| 35 | 
            +
                                              |                         
         | 
| 36 | 
            +
                                         (?:
         | 
| 32 37 | 
             
                                            (?<=\p{L})   ## use lookbehind
         | 
| 33 | 
            -
                                             [ | 
| 38 | 
            +
                                             ['-]   ## must be surrounded by letters
         | 
| 34 39 | 
             
                                                   ## e.g. One/Two NOT
         | 
| 35 40 | 
             
                                                   ##      One/ Two or One / Two or One /Two etc.
         | 
| 36 41 | 
             
                                            (?=\p{L})      ## use lookahead        
         | 
| @@ -53,9 +58,8 @@ PROP_NAME_RE = %r{ | |
| 53 58 | 
             
                                          (?: \p{L}+
         | 
| 54 59 | 
             
                                                \.?  ## optional dot
         | 
| 55 60 | 
             
                                          )
         | 
| 56 | 
            -
                                      ) | 
| 57 | 
            -
             | 
| 58 | 
            -
                               )
         | 
| 61 | 
            +
                                      )*
         | 
| 62 | 
            +
                                )
         | 
| 59 63 | 
             
                           ## add lookahead - must be non-alphanum 
         | 
| 60 64 | 
             
                              (?=[ ,;\]\)]|$)
         | 
| 61 65 | 
             
                              )
         | 
| @@ -83,10 +87,14 @@ PROP_NAME_RE = %r{ | |
| 83 87 | 
             
            ##
         | 
| 84 88 | 
             
            ## todo/fix:
         | 
| 85 89 | 
             
            ##   check if   St. Pölten     works; with starting St. ???
         | 
| 90 | 
            +
            ##
         | 
| 91 | 
            +
            ##  note - use special \G - Matches first matching position !!!!
         | 
| 86 92 |  | 
| 87 93 |  | 
| 88 94 | 
             
              PROP_KEY_RE = %r{ 
         | 
| 89 | 
            -
             | 
| 95 | 
            +
                                ^     # note - MUST start line; leading spaces optional (eat-up)
         | 
| 96 | 
            +
                                [ ]*  
         | 
| 97 | 
            +
                             (?<prop_key>
         | 
| 90 98 | 
             
                               (?<key>
         | 
| 91 99 | 
             
                                   (?:\p{L}+
         | 
| 92 100 | 
             
                                       |
         | 
    
        data/lib/sportdb/parser/token.rb
    CHANGED
    
    | @@ -84,7 +84,8 @@ WDAY_RE = %r{ | |
| 84 84 | 
             
                      Sat|Sa|
         | 
| 85 85 | 
             
                      Sun|Su
         | 
| 86 86 | 
             
                   ))
         | 
| 87 | 
            -
             | 
| 87 | 
            +
                   (?=[ ]{2})   # positive lookahead for two spaces
         | 
| 88 | 
            +
                   ## todo/check - must be followed by two spaces or space + [( etc.
         | 
| 88 89 | 
             
                     ##   to allow words starting with weekday abbreviations - why? why not?
         | 
| 89 90 | 
             
                     ##     check if any names (teams, rounds, etc) come up in practice 
         | 
| 90 91 | 
             
                     ##   or maybe remove three letter abbreviations Mon/Tue
         | 
| @@ -123,26 +124,54 @@ BASICS_RE = %r{ | |
| 123 124 | 
             
            }ix
         | 
| 124 125 |  | 
| 125 126 |  | 
| 127 | 
            +
            ## general catch-all  (RECOMMENDED (ALWAYS) use as last entry in union)
         | 
| 128 | 
            +
            ##   to avoid advance of pos match!!!
         | 
| 129 | 
            +
            ANY_RE = %r{
         | 
| 130 | 
            +
                           (?<any> .)
         | 
| 131 | 
            +
                      }ix
         | 
| 126 132 |  | 
| 127 133 |  | 
| 128 | 
            -
            RE = Regexp.union( | 
| 134 | 
            +
            RE = Regexp.union(
         | 
| 129 135 | 
             
                                STATUS_RE,
         | 
| 130 136 | 
             
                                NOTE_RE,
         | 
| 131 137 | 
             
                                TIMEZONE_RE,
         | 
| 138 | 
            +
                                DURATION_RE,  # note - duration MUST match before date
         | 
| 139 | 
            +
                                DATE_RE,  ## note - date must go before time (e.g. 12.12. vs 12.12)
         | 
| 132 140 | 
             
                                 TIME_RE,
         | 
| 133 | 
            -
                                 DURATION_RE,  # note - duration MUST match before date
         | 
| 134 | 
            -
                                DATE_RE,
         | 
| 135 141 | 
             
                                SCORE_MORE_RE, 
         | 
| 136 142 | 
             
                                SCORE_RE,   ## note basic score e.g. 1-1 must go after SCORE_MORE_RE!!!
         | 
| 137 143 | 
             
                                BASICS_RE, 
         | 
| 138 | 
            -
                                 | 
| 139 | 
            -
             | 
| 140 | 
            -
             | 
| 141 | 
            -
             | 
| 142 | 
            -
                                 WDAY_RE,  # allow standalone weekday name (e.g. Mo/Tu/etc.) - why? why not?
         | 
| 143 | 
            -
                                           #    note - wday MUST be after text e.g. Sun Ke 68' is Sun Ke (NOT Sun) etc.
         | 
| 144 | 
            +
                                WDAY_RE,  # allow standalone weekday name (e.g. Mo/Tu/etc.) - why? why not?
         | 
| 145 | 
            +
                                          #    note - wday MUST be after text e.g. Sun Ke 68' is Sun Ke (NOT Sun) etc.
         | 
| 146 | 
            +
                               TEXT_RE,
         | 
| 147 | 
            +
                               ANY_RE,
         | 
| 144 148 | 
             
                                  )
         | 
| 145 149 |  | 
| 146 150 |  | 
| 151 | 
            +
             | 
| 152 | 
            +
            ######################################################
         | 
| 153 | 
            +
            ## goal mode (switched to by PLAYER_WITH_MINUTE_RE)   
         | 
| 154 | 
            +
             | 
| 155 | 
            +
            GOAL_BASICS_RE = %r{
         | 
| 156 | 
            +
                (?<spaces> [ ]{2,}) |
         | 
| 157 | 
            +
                (?<space>  [ ])
         | 
| 158 | 
            +
                    |
         | 
| 159 | 
            +
                (?<sym>  
         | 
| 160 | 
            +
                    [;,\[\]]   ## add (-) dash too - why? why not?   
         | 
| 161 | 
            +
                )   
         | 
| 162 | 
            +
            }ix
         | 
| 163 | 
            +
             | 
| 164 | 
            +
             | 
| 165 | 
            +
            GOAL_RE = Regexp.union(
         | 
| 166 | 
            +
                GOAL_BASICS_RE,
         | 
| 167 | 
            +
                MINUTE_RE,
         | 
| 168 | 
            +
                MINUTE_NA_RE,   ## note - add/allow not/available (n/a,na) minutes hack for now
         | 
| 169 | 
            +
                GOAL_OG_RE, GOAL_PEN_RE,
         | 
| 170 | 
            +
                SCORE_RE,
         | 
| 171 | 
            +
                PROP_NAME_RE,    ## note - (re)use prop name for now for (player) name
         | 
| 172 | 
            +
            )
         | 
| 173 | 
            +
             | 
| 174 | 
            +
             | 
| 175 | 
            +
             | 
| 147 176 | 
             
            end  # class Lexer
         | 
| 148 177 | 
             
            end # module SportDb
         | 
    
        metadata
    CHANGED
    
    | @@ -1,14 +1,14 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: sportdb-parser
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0.6. | 
| 4 | 
            +
              version: 0.6.4
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - Gerald Bauer
         | 
| 8 8 | 
             
            autorequire:
         | 
| 9 9 | 
             
            bindir: bin
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 | 
            -
            date: 2025-02- | 
| 11 | 
            +
            date: 2025-02-27 00:00:00.000000000 Z
         | 
| 12 12 | 
             
            dependencies:
         | 
| 13 13 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 14 14 | 
             
              name: cocos
         |