RubyGems - rsssf-parser - Versions diffs - 0.0.1 - Mend

rsssf-parser 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

checksums.yaml +7 -0
data/CHANGELOG.md +3 -0
data/Manifest.txt +15 -0
data/README.md +11 -0
data/Rakefile +29 -0
data/bin/rsssf +80 -0
data/lib/rsssf/parser/linter.rb +84 -0
data/lib/rsssf/parser/parser.rb +100 -0
data/lib/rsssf/parser/token-date.rb +161 -0
data/lib/rsssf/parser/token-goals.rb +68 -0
data/lib/rsssf/parser/token-note.rb +113 -0
data/lib/rsssf/parser/token-round.rb +102 -0
data/lib/rsssf/parser/token-score.rb +103 -0
data/lib/rsssf/parser/token-text.rb +162 -0
data/lib/rsssf/parser/token.rb +230 -0
data/lib/rsssf/parser.rb +21 -0
metadata +113 -0

data/lib/rsssf/parser/token-round.rb ADDED Viewed

@@ -0,0 +1,102 @@
+module Rsssf
+class Parser
+## Group token - the word "Group", one space and an alphanumeric key, e.g.
+##   Group A (A-Z) or Group 1 (1-99)
+##   Group HEX  # used in CONCACAF World Cup qualifying
+##   Group 1A or A1, B1  - used anywhere
+## note - matching is case-insensitive; the capture includes the word "Group"
+## todo - capture only the "key" of the group - why? why not?
+GROUP_RE = %r{(?<group>
+                 \b
+                Group [ ]
+                   [a-z0-9]+
+            \b)}ix
+ROUND_RE = %r{(?<round>
+                  \b
+   (?:
+   # round  - note - requiers number e.g. round 1,2, etc.
+      (?:  (?: Round |
+              Matchday |
+              Week
+           )
+           [ ] [0-9]+
+      )
+      |
+   # more (kockout) rounds
+   # playoffs  - playoff, play-off, play-offs
+        (?: Play-?offs?
+           (?: [ ]for[ ]quarter-?finals )?
+        )
+        |
+   # round32
+        (?: Round[ ]of[ ]32 |
+            Last[ ]32 |
+            16th[ ]finals |
+            1/16[ ]finals
+            )
+          |
+   # round16
+        (?: Round[ ]of[ ]16 |
+            Last[ ]16 |
+            8th[ ]finals |
+            1/8[ ]finals
+            )
+           |
+   # fifthplace
+         (?:
+             (?: (Fifth|5th)[ -]place
+                  (?: [ ] (?: match|play-?off|final ))?
+              ) |
+             (?: Match[ ]for[ ](?: fifth|5th )[ -]place )
+         )
+          |
+   # thirdplace
+          (?:
+              (?: (Third|3rd)[ -]place
+                     (?: [ ] (?: match|play-?off|final ))?
+               ) |
+              (?: Match[ ]for[ ](?: third|3rd )[ -]place )
+           )
+           |
+   # quarterfinals
+         (?:
+              Quarter-?finals? |
+              Quarters |
+              Last[ ]8
+          )
+          |
+   # semifinals
+        (?:
+             Semi-?finals? |
+             Semis |
+             Last[ ]4
+        )
+        |
+   # final
+         Finals?
+       )
+      \b)}ix
+## Leg token - matches "1st leg(s)" / "First leg(s)" and "2nd leg(s)" / "Second leg(s)"
+##   (case-insensitive); kept separate from the round token - why? why not?
+##
+LEG_RE = %r{ (?<leg>
+                  \b
+  (?:
+   # leg1
+     (?: 1st|First)[ ]legs?
+     |
+  # leg2
+     (?: 2nd|Second)[ ]legs?
+  )
+    \b)}ix
+end  # class Parser
+end  # module Rsssf

data/lib/rsssf/parser/token-score.rb ADDED Viewed

@@ -0,0 +1,103 @@
+module Rsssf
+class Parser
+    ######
+    ## e.g. 2-1
+    SCORE_RE = %r{
+            (?<score>
+                (?<=[ ])	# Positive lookbehind for space
+                   (?<score1>\d{1,2}) - (?<score2>\d{1,2})
+                (?=[ ])   # positive lookahead for space
+            )
+          }ix
+## Bracketed score extension - extra time (aet) and/or penalties (pen), e.g.
+##  [aet]
+##  [aet, 3-2 pen]
+##  [aet; 3-2 pen]
+##  [3-2 pen]
+##  [3-2 pen.]
+##  [aet, 9-8 pen]
+##  [aet,5-3pen]    - spaces after the separator and before pen are optional
+##  note - only the text between the brackets is captured (as score_ext)
+##
+## todo - allow dot (.) as separator too ??
+##     [aet. 3-2 pen]   - NOT matched for now
+SCORE_EXT_RE =  %r{ \[
+                      (?<score_ext>
+                          (?:       ## aet only e.g.  aet
+                             aet
+                             (?:   ##  optional pen
+                               [,;][ ]*
+                               \d{1,2}-\d{1,2} [ ]? pen\.?
+                             )?
+                          )
+                          |
+                          (?:   ##  penalty only e.g. 3-2 pen
+                            \d{1,2}-\d{1,2} [ ]? pen\.?
+                          )
+                      )
+                    \]
+                  }ix
+### awd  - awarded
+SCORE_AWD_RE  = %r{  ## must be space before and after!!!
+                    (?<score_awd>
+                      (?<=[ ])	# Positive lookbehind for space
+                        awd
+                       (?=[ ])   # positive lookahead for space
+                    )
+                }ix
+### abd  -  abandoned
+SCORE_ABD_RE  = %r{  ## must be space before and after!!!
+                    (?<score_abd>
+                      (?<=[ ])	# Positive lookbehind for space
+                        abd
+                       (?=[ ])   # positive lookahead for space
+                    )
+                }ix
+### ppd  - postponed
+SCORE_PPD_RE  = %r{  ## must be space before and after!!!
+                    (?<score_ppd>
+                      (?<=[ ])	# Positive lookbehind for space
+                        ppd
+                       (?=[ ])   # positive lookahead for space
+                    )
+                }ix
+### n/p   - not played
+SCORE_NP_RE    = %r{  ## must be space before and after!!!
+                    (?<score_np>
+                      (?<=[ ])	# Positive lookbehind for space
+                         n/p
+                       (?=[ ])   # positive lookahead for space
+                    )
+                }ix
+## A walkover, also W.O. or w/o (originally two words: "walk over"),
+##  is awarded to the opposing team/player etc,
+## if there are no other players available,
+## or they have been disqualified,
+## because the other contestants have forfeited or
+# the other contestants have withdrawn from the contest.
+##
+##  w/o  - walk over
+SCORE_WO_RE    = %r{  ## must be space before and after!!!
+                    (?<score_wo>
+                      (?<=[ ])	# Positive lookbehind for space
+                         w/o
+                       (?=[ ])   # positive lookahead for space
+                    )
+                }ix
+end  #  class Parser
+end  # module Rsssf

data/lib/rsssf/parser/token-text.rb ADDED Viewed

@@ -0,0 +1,162 @@
+module Rsssf
+class Parser
+##  note - do NOT allow single alpha text for now
+##   add later??      A - B    C - D  - why?
+## opt 1) one alpha
+## (?<text_i> [a-z])    # only allow single letter text (not numbers!!)
+## opt 2) more than one alphanum
+### allow special case - starting text with number e.g.
+##    number must be followed by space or dot (.)
+# 1 FC   ##    allow 1-FC or 1FC   - why? why not?
+# 1. FC
+# 1.FC   - XXXX  - not allowed for now, parse error
+# 1FC    - XXXX  - now allowed for now, parse error
+# 1890 Munich
+#
+##
+#  allow Côte d'Ivoire or such
+##   e.g. add '
+## note - use a more strict text re(gex)
+##         if inside brackets !!!!
+###
+## "simple" strict text regex
+###  no numbers  (or & or such inside)
+##    allows  dash/hyphen (-)
+##      and   dot (.) and apostrophe (') for now
+## simple (double) quoted text
+##   only supports a-z (unicode) PLUS (single) inline space
+##    add more chars - why? why not?
+TEXT_QUOTED =   '(?:  "    ' +
+                 '  \p{L}+  ' +
+                 '     (?: [ ]  ' +
+                 '        \p{L}+ )*   '  +
+                 '    "  )  '
+### might start with "" !!!
+##    e.g.
+##      "Tiago" Cardoso Mendes 80
+##     "Cristiano Ronaldo" dos Santos Aveiro 74
+##     "Zé Castro" José Eduardo Rosa Vale Castro 60og
+TEXT_STRICT_RE = %r{
+   (?<text>
+         (?: \b |  #{TEXT_QUOTED} [ ]   ## note - leading quoted text must be followed by space!!
+          )
+          \p{L}+    ## all unicode letters (e.g. [a-z])
+             (?:
+               (?:[ ]
+                    |     # only single spaces allowed inline!!!
+                   [-]
+               )?
+               (?:
+                  \p{L}+ |
+                   ['.] |
+                   (?:
+                      (?<= [ ])
+                      #{TEXT_QUOTED}
+                      (?= [ ]|$)   ### must be followed by space
+                                  ##  todo/fix - add all end of text lookaheads to (see below)
+                   )
+               )
+              )*
+               ## must NOT end with space or dash(-)
+              ##  todo/fix - possible in regex here
+              ##     only end in alphanum a-z0-9 (not dot or & ???)
+        ## positive lookahead
+        ##   cannot use \b  if text ends in dot (.) or other non-alphnum
+        ##        than \b will not work
+            (?=[ ,;@|\[\]]
+                 |$
+            )
+    )
+}ix
+TEXT_RE = %r{
+    ## must start with alpha (allow unicode letters!!)
+    (?<text>
+             \b   ## use/require word boundary
+            (?:
+                # opt 1 - start with alpha
+                 \p{L}+    ## all unicode letters (e.g. [a-z])
+                   |
+                # opt 2 - start with num!! - allow special case (e.g. 1. FC)
+                     \d+  # check for num lookahead (MUST be space or dot)
+                      ## MUST be followed by (optional dot) and
+                      ##                      required space !!!
+                      ## MUST be follow by a to z!!!!
+                      \.?     ## optional dot
+                      [ ]?   ## make space optional too  - why? why not?
+                             ##  yes - eg. 1st, 2nd, 5th etc.
+                       \p{L}+
+               )
+              (?:(?:  (?:[ ]
+                     (?! (awd|abd|ppd|n/p|w/o)[ ])    ## note - exclude (awd[ ]/abd[ ]/n/p[ ])
+                       )
+                      |     # only single spaces allowed inline!!!
+                     [-]
+                  )?
+                (?:
+                  \p{L}+ | [&/'.]
+                    |
+                 (?:
+                   \d+
+                   (?![0-9.:'/+-])
+                   ## negative lookahead for numbers
+                   ##   note - include digits itself!!!
+                 )
+               )
+              )*  ## must NOT end with space or dash(-)
+              ##  todo/fix - possible in regex here
+              ##     only end in alphanum a-z0-9 (not dot or & ???)
+              ## support (Hamburg) or such at the end (ony)
+              ##   note - no numbers allowed inside () for now!!
+             (?:
+                  [ ]\(\p{L}+
+                      (?:
+                         (?: [ ] |
+                             [-]
+                          )?
+                          \p{L}+ | [&/'.]
+                        )*
+                      \)
+             )?
+            ## add lookahead/lookbehind
+           ##    must be space!!!
+           ##   (or comma or  start/end of string)
+           ##   kind of \b !!!
+            ## positive lookahead
+            ##  note - added : too - why? why not?
+            (?=[ ,;@|:\[\]]
+                 |$
+            )
+   )
+}ix
+end # class Parser
+end # module Rsssf

data/lib/rsssf/parser/token.rb ADDED Viewed

@@ -0,0 +1,230 @@
+module Rsssf
+class Parser
+BASICS_RE = %r{
+    (?<spaces> [ ]{2,}) |
+    (?<space>  [ ])
+        |
+    (?<sym>[;,@|\[\]])
+}ix
+VS_RE = %r{   ## must be space before and after!!!
+                    (?<vs>
+                      (?<=[ ])	# Positive lookbehind for space
+                         -
+                       (?=[ ])   # positive lookahead for space
+                    )
+                }ix
+RE = Regexp.union(  GROUP_RE, ROUND_RE, LEG_RE,
+                    DATE_RE,
+                    VS_RE,
+                    SCORE_RE,
+                    SCORE_AWD_RE, SCORE_ABD_RE, SCORE_PPD_RE, SCORE_NP_RE,
+                       SCORE_WO_RE,
+                    SCORE_EXT_RE,
+                    NOTE_RE,
+                    BASICS_RE,
+                     TEXT_RE )
+## "strict" match mode - the tokenizer switches to this regex on an opening [
+##   and back to the standard regex (RE) on the closing ]
+INSIDE_RE  = Regexp.union(  GOAL_OG_RE, GOAL_PEN_RE,
+                            BASICS_RE,
+                            TEXT_STRICT_RE,
+                            MINUTE_RE,
+                         )
+def log( msg )
+   ## append msg to ./logs.txt
+   ##     use ./errors.txt - why? why not?
+   File.open( './logs.txt', 'a:utf-8' ) do |f|
+     f.write( msg )
+     f.write( "\n" )
+   end
+end
+def tokenize_with_errors( line, debug: false )
+  tokens = []
+  errors = []   ## keep a list of errors - why? why not?
+  puts ">#{line}<"    if debug
+  pos = 0
+  ## track last offsets - to report error on no match
+  ##   or no match in end of string
+  offsets = [0,0]
+  m = nil
+  ####
+  ## quick hack - keep re state/mode between tokenize calls!!!
+  @re  ||= RE     ## note - switch between RE & INSIDE_RE
+  while m = @re.match( line, pos )
+    if debug
+      pp m
+      puts "pos: #{pos}"
+    end
+    offsets = [m.begin(0), m.end(0)]
+    if offsets[0] != pos
+      ## match NOT starting at start/begin position!!!
+      ##  report parse error!!!
+      msg =  "!! WARN - parse error - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
+      puts msg
+      errors << "parse error - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]}"
+      log( msg )
+    end
+    ##
+    ## todo/fix - also check if possible
+    ##   if no match but not yet end off string!!!!
+    ##    report skipped text run too!!!
+    pos = offsets[1]
+    pp offsets   if debug
+    t =  if @re == INSIDE_RE
+           if m[:space]
+             nil   ## skip space
+           elsif m[:spaces]
+             nil  ## skip spaces
+           elsif m[:text]
+             [:text, m[:text]]   ## keep pos - why? why not?
+           elsif m[:minute]
+             [:minute, m[:minute]]
+           elsif m[:og]
+             [:og, m[:og]]    ## for typed drop - string version/variants
+           elsif m[:pen]
+             [:pen, m[:pen]]
+           elsif m[:sym]
+             sym = m[:sym]
+             ## return symbols "inline" as is - why? why not?
+             case sym
+             when ',' then [:',']
+             when ';' then [:';']
+             when '@' then [:'@']
+             when '|' then [:'|']
+             when '['
+               ## report error - already in inside mode!!!
+               nil
+             when ']'
+               puts "  leave inside match mode"
+               @re = RE
+               nil
+             else
+              nil  ## ignore others (e.g. brackets [])
+             end
+           else
+             ## report error  - why? why not?
+             nil
+           end
+         else  ## assume standard mode/ctx
+           if m[:space]
+             nil   ## skip space
+           elsif m[:spaces]
+             nil  ## skip spaces
+           elsif m[:text]
+             [:text, m[:text]]   ## keep pos - why? why not?
+           elsif m[:note]
+             [:note, m[:note]]
+           elsif m[:group]
+             [:group, m[:group]]
+           elsif m[:round]
+             [:round, m[:round]]
+           elsif m[:leg]
+             [:leg, m[:leg]]
+           elsif m[:date]
+             [:date, m[:date]]
+           elsif m[:vs]
+             [:vs, m[:vs]]
+           elsif m[:score]
+             [:score, m[:score]]
+           elsif m[:score_awd]   # awarded (awd)
+             [:score_awd, m[:score_awd]]
+           elsif m[:score_abd]   # abandoned (abd)
+             [:score_abd, m[:score_abd]]
+           elsif m[:score_ppd]   # postponed (ppd)
+             [:score_ppd, m[:score_ppd]]
+           elsif m[:score_np]    # not played (n/p)
+             [:score_np, m[:score_np]]
+           elsif m[:score_wo]    # walk over (w/o)
+             [:score_wo, m[:score_wo]]
+           elsif m[:score_ext]
+             [:score_ext, m[:score_ext]]
+           elsif m[:sym]
+             sym = m[:sym]
+             ## return symbols "inline" as is - why? why not?
+             case sym
+             when ',' then [:',']
+             when ';' then [:';']
+             when '@' then [:'@']
+             when '|' then [:'|']
+             when '['
+               ##  switch to inside mode!!!
+               puts "  enter inside match mode"
+               @re = INSIDE_RE
+               nil
+             when ']'
+               ## already in standard mode/ctx
+               ##  report warn/error - why? why not?
+               nil
+             else
+               nil  ## ignore others (e.g. brackets [])
+             end
+           else
+             ## report error  - why? why not?
+             nil
+           end
+         end
+    tokens << t    if t
+    if debug
+      print ">"
+      print "*" * pos
+      puts "#{line[pos..-1]}<"
+    end
+  end
+  ## check if no match in end of string
+  if offsets[1] != line.size
+    msg =  "!! WARN - parse error - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size} in line >#{line}<"
+    puts msg
+    log( msg )
+    errors << "parse error - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size}"
+  end
+  [tokens,errors]
+end
+### convience helper - ignore errors by default
+def tokenize(  line, debug: false )
+   tokens, _ = tokenize_with_errors( line, debug: debug )
+   tokens
+end
+end  # class Parser
+end # module Rsssf

data/lib/rsssf/parser.rb ADDED Viewed

@@ -0,0 +1,21 @@
+####
+##  build on the "standard" (sportdb) parser
+require 'sportdb/parser'
+## our own code
+require_relative 'parser/token-text'
+require_relative 'parser/token-note'
+require_relative 'parser/token-round'    ## round (& group)
+require_relative 'parser/token-date'
+require_relative 'parser/token-score'
+require_relative 'parser/token-goals'
+require_relative 'parser/token'
+require_relative 'parser/parser'
+require_relative 'parser/linter'