RubyGems - rsssf-parser - Versions diffs - 0.0.1 - Mend

rsssf-parser 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

checksums.yaml +7 -0
data/CHANGELOG.md +3 -0
data/Manifest.txt +15 -0
data/README.md +11 -0
data/Rakefile +29 -0
data/bin/rsssf +80 -0
data/lib/rsssf/parser/linter.rb +84 -0
data/lib/rsssf/parser/parser.rb +100 -0
data/lib/rsssf/parser/token-date.rb +161 -0
data/lib/rsssf/parser/token-goals.rb +68 -0
data/lib/rsssf/parser/token-note.rb +113 -0
data/lib/rsssf/parser/token-round.rb +102 -0
data/lib/rsssf/parser/token-score.rb +103 -0
data/lib/rsssf/parser/token-text.rb +162 -0
data/lib/rsssf/parser/token.rb +230 -0
data/lib/rsssf/parser.rb +21 -0
metadata +113 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: 1b6cfe7842f0f46d242c1c2fc1f52b4c032b5c25fce314939583c7f96a486c65
+  data.tar.gz: ba5244b284f65129dca3b35e87d10984e1bf8906e571b3e42e85a4615eecb733
+SHA512:
+  metadata.gz: 9a63d121c858e35f757b59c490fc05cfd1457ac5c6e3294a291db787da141061f046c5ce2342fdf275e64bbb647934ef43d8547c2aa53eef537d12405d746185
+  data.tar.gz: e7a8f41d2d53e63fb72e35f22b1a0a0da370e15ab1b4aedf2ae1f37a2a1bc277ff31deb8362e541bbc403f7d49ea3f1120b4b271b9d36982359141ca80617d72

data/CHANGELOG.md ADDED Viewed

@@ -0,0 +1,3 @@
+### 0.0.1 / 2024-07-17
+* Everything is new. First release.

data/Manifest.txt ADDED Viewed

@@ -0,0 +1,15 @@
+CHANGELOG.md
+Manifest.txt
+README.md
+Rakefile
+bin/rsssf
+lib/rsssf/parser.rb
+lib/rsssf/parser/linter.rb
+lib/rsssf/parser/parser.rb
+lib/rsssf/parser/token-date.rb
+lib/rsssf/parser/token-goals.rb
+lib/rsssf/parser/token-note.rb
+lib/rsssf/parser/token-round.rb
+lib/rsssf/parser/token-score.rb
+lib/rsssf/parser/token-text.rb
+lib/rsssf/parser/token.rb

data/README.md ADDED Viewed

	@@ -0,0 +1,11 @@
1	+ # rsssf-parser - football match schedule & results parser (& tokenizer) for the rsssf format / conventions
2	+
3	+
4	+
5	+
6	+
7	+
8	+
9	+
10	+
11	+

data/Rakefile ADDED Viewed

@@ -0,0 +1,29 @@
+require 'hoe'
+Hoe.spec 'rsssf-parser' do   ## gem specification (built via hoe)
+  self.version = '0.0.1'
+  self.summary = "rsssf-parser - football match schedule & results parser (& tokenizer) for the rsssf format / conventions"
+  self.description = summary   ## note - description (re)uses the summary text
+  self.urls = { home: 'https://github.com/sportdb/sport.db' }
+  self.author = 'Gerald Bauer'
+  self.email = 'gerald.bauer@gmail.com'
+  # switch extension to .markdown for github formatting
+  self.readme_file  = 'README.md'
+  self.history_file = 'CHANGELOG.md'
+  self.licenses = ['Public Domain']
+  self.extra_deps = [
+      ['sportdb-parser'],     ### (re)use standard football.txt parser machinery - why? why not?
+  ]
+  self.spec_extras = {
+    required_ruby_version: '>= 2.2.2'
+  }
+end

data/bin/rsssf ADDED Viewed

@@ -0,0 +1,80 @@
+#!/usr/bin/env ruby
+## tip: to test run:
+##   ruby -I ./lib -I ../parser/lib  bin/rsssf
+require 'rsssf/parser'
+require 'optparse'
+args = ARGV
+opts = { debug: false,
+          metal: false }
+parser = OptionParser.new do |parser|
+parser.banner = "Usage: #{$PROGRAM_NAME} [options]"
+##
+## check if git has a offline option?? (use same)
+##             check for other tools - why? why not?
+  parser.on( "--verbose", "--debug",
+               "turn on verbose / debug output (default: #{opts[:debug]} )" ) do |debug|
+    opts[:debug] = debug
+  end
+  parser.on( "--metal",
+                 "turn off typed parse tree; show to the metal tokens"+
+                   " (default: #{opts[:metal]})" ) do |metal|
+    opts[:metal] = metal
+  end
+end
+parser.parse!( args )
+puts "OPTS:"
+p opts
+puts "ARGV:"
+p args
+paths =  if args.empty?
+          [
+            '../../../rsssf/austria/2010-11/cup.txt',
+          ]
+         else
+            ## todo/fix - expand_args!!!
+            args
+         end
+pp paths
+Rsssf::Parser::Linter.debug = true    if opts[:debug]
+linter = Rsssf::Parser::Linter.new
+paths.each_with_index do |path,i|
+    puts "==> [#{i+1}/#{paths.size}] reading >#{path}<..."
+    linter.read( path, parse: !opts[:metal] )
+end
+if linter.errors?
+    puts
+    pp linter.errors
+    puts "!!   #{linter.errors.size} parse error(s) in #{paths.size} datafiles(s)"
+else
+    puts "OK   no parse errors found in #{paths.size} datafile(s)"
+end
+puts "bye"

data/lib/rsssf/parser/linter.rb ADDED Viewed

@@ -0,0 +1,84 @@
+module Rsssf
+class Parser
+###
+## note - Linter for now nested inside Parser - keep? why? why not?
+class Linter
+def self.debug=(value) @@debug = value; end
+def self.debug?() @@debug ||= false; end  ## note: default is FALSE
+def debug?()  self.class.debug?; end
+attr_reader :errors
+def initialize
+  @errors = []
+  @parser = Parser.new   ## use own parser instance (not shared) - why? why not?
+end
+def errors?() @errors.size > 0; end
+#########
+## parse - false (default) - tokenize (only)
+##       - true            - tokenize & parse
+def read( path, parse: false )
+  ## fix - (re)use outline reader later!!!
+  ##   plus check for headings etc.
+  text = File.open( path, 'r:utf-8' ) { |f| f.read }
+  lines = text.split( "\n" )
+  ##  process lines
+  tree = []
+  lines.each do |line|
+    ## skip blank and comment lines
+    next if line.strip.empty? || line.strip.start_with?('#')
+    ## strip inline (end-of-line) comments
+    line = line.sub( /#.+$/, '' )
+    if debug?
+      puts
+      puts "line >#{line}<"
+    end
+    t, error_messages  =  if parse
+                            @parser.parse_with_errors( line )
+                          else
+                            @parser.tokenize_with_errors( line )
+                          end
+    if error_messages.size > 0
+      ## add to "global" error list
+      ##   make a triplet tuple (file / msg / line text)
+            error_messages.each do |msg|
+                @errors << [ path,
+                             msg,
+                             line
+                           ]
+            end
+    end
+    pp t   if debug?
+    tree << t
+  end
+  ## pp tree
+end  # read
+end  # class Linter
+end   # class Parser
+end   # module Rsssf

data/lib/rsssf/parser/parser.rb ADDED Viewed

@@ -0,0 +1,100 @@
+module Rsssf
+class Parser
+## transforms
+##
+##  Netherlands  1-2 (1-1)   England
+##   =>  text => team
+##       score|vs
+##       text => team
+##
+##
+##  add !!!!
+##   collect_until e.g. collect_until( :text )
+def parse_with_errors( line, debug: false )
+    errors = []
+    tokens, token_errors = tokenize_with_errors( line )
+    errors += token_errors
+=begin
+#############
+## pass 1
+##   replace all texts with keyword matches (e.g. group, round, leg, etc.)
+     tokens = tokens.map do |t|
+                      if t[0] == :text
+                          text = t[1]
+                          if is_group?( text )
+                             ### expects to be followed by num (or text ABC??)
+                             [:group, text]
+                          elsif is_matchday?( text )
+                             ### expects to be followed by num
+                             ##  use different name e.g. :fix_round or such?
+                             [:matchday, text]
+                          elsif is_leg?( text )
+                             [:leg, text]
+                          elsif is_round?( text )
+                             [:round, text]
+                          else
+                              t   ## pass through as-is (1:1)
+                          end
+                      else
+                         t
+                      end
+                end
+    ## puts "tokens:"
+    ## pp tokens
+=end
+## transform tokens into (parse tree/ast) nodes
+    nodes = []
+    ## note - (re)use token buffer from "standard" parser here !!!!
+    buf = SportDb::Parser::Tokens.new( tokens )
+    ## pp buf
+    loop do
+          if buf.match?( :text, [:score,
+                                 :score_awd,
+                                 :score_abd,
+                                 :score_ppd,
+                                 :score_np,
+                                 :score_wo,
+                                 :vs], :text )
+             nodes << [:team, buf.next[1]]
+             nodes << buf.next
+             nodes << [:team, buf.next[1]]
+          elsif buf.match?( :text, :minute )    ## assume player+minute
+             nodes << [:player, buf.next[1]]
+             nodes << buf.next
+          else
+             ## pass through
+             nodes << buf.next
+          end
+          break if buf.eos?
+    end
+    [nodes,errors]
+end
+### convience helper - ignore errors by default
+def parse( line, debug: false )
+  nodes, _ = parse_with_errors( line, debug: debug )
+  nodes
+end
+end #  class Parser
+end  # module Rsssf

data/lib/rsssf/parser/token-date.rb ADDED Viewed

@@ -0,0 +1,161 @@
+module Rsssf
+class Parser
+def self.parse_names( txt )   ## txt - names table (one entry per line); returns array of word arrays
+  lines = [] # array of lines (every line is an array of words)
+  txt.each_line do |line|
+    line = line.strip
+    next if line.empty?
+    next if line.start_with?( '#' )   ## skip comments too
+    ## strip inline (until end-of-line) comments too
+    ##   e.g. Janvier  Janv  Jan  ## check janv in use??
+    ##   =>   Janvier  Janv  Jan
+    line = line.sub( /#.*/, '' ).strip
+    ## pp line
+    values = line.split( /[ \t]+/ )
+    ## pp values   - note: words get split on (one or more) spaces or tabs
+    ## todo/fix -- add check for duplicates
+    lines << values
+  end
+  lines
+end # method parse_names
+def self.build_names( lines )   ## lines - array of word arrays (as returned by parse_names)
+  ## join all words together into a single string (for use as a regex alternation) e.g.
+  ##   January|Jan|February|Feb|March|Mar|April|Apr|May|June|Jun|...
+  lines.map { |line| line.join('|') }.join('|')
+end
+## add normalize option (for downcase) - why? why not?
+def self.build_map( lines )   ## lines - array of word arrays (as returned by parse_names); returns a hash
+    ## note: downcase name!!! (lookup keys are always lowercase)
+   ## build a lookup map that maps the word to the index (line no) plus 1 e.g.
+   ##  {"january" => 1,  "jan" => 1,
+   ##   "february" => 2, "feb" => 2,
+   ##   "march" => 3,    "mar" => 3,
+   ##   "april" => 4,    "apr" => 4,
+   ##   "may" => 5,
+   ##   "june" => 6,     "jun" => 6, ...
+   lines.each_with_index.reduce( {} ) do |h,(line,i)|
+     line.each { |name| h[ name.downcase ] = i+1 }  ## note: start mapping with 1 (and NOT zero-based, that is, 0)
+     h
+   end
+end
+ ## note -  support only 5 letter max for now (one line per month; line no is the month no)
+ ##    no January|February|August etc.
+MONTH_LINES = parse_names( <<TXT )
+Jan
+Feb
+March      Mar
+April      Apr
+May
+June       Jun
+July       Jul
+Aug
+Sept       Sep
+Oct
+Nov
+Dec
+TXT
+MONTH_NAMES = build_names( MONTH_LINES )
+# pp MONTH_NAMES
+MONTH_MAP   = build_map( MONTH_LINES )
+# pp MONTH_MAP
+### note - only support two or three letters (one line per day name)
+##    no Tues | Thur | Thurs | Sunday etc.
+DAY_LINES = parse_names( <<TXT )
+Mon  Mo
+Tue  Tu
+Wed  We
+Thu  Th
+Fri  Fr
+Sat  Sa
+Sun  Su
+TXT
+DAY_NAMES = build_names( DAY_LINES )
+# pp DAY_NAMES
+DAY_MAP   = build_map( DAY_LINES )
+# pp DAY_MAP
+#=>  MONTH_NAMES and DAY_NAMES (as built above and used inside DATE_RE) look like:
+# "Jan|Feb|March|Mar|April|Apr|May|June|Jun|
+#  July|Jul|Aug|Sept|Sep|Oct|Nov|Dec"
+#
+# "Mon|Mo|Tue|Tu|Wed|We|
+#  Thu|Th|Fri|Fr|Sat|Sa|Sun|Su"
+## todo - add more date variants !!!!
+# e.g.  Fri Aug 9
+DATE_RE = %r{
+ ## note - do not include [] in capture for now - why? why not
+    ## eat-up/consume optional [] - part i
+    (?: \[ | \b
+     )
+(?<date>
+     (?:  ######
+          ## variant I/1/one
+          ###   Fri June 24
+     ## optional day name
+     ((?<day_name>#{DAY_NAMES})
+          [ ]
+     )?
+     ##  allow 1 or 2 spaces e.g. Jul  2 / Jun 27 to pretty print
+     (?<month_name>#{MONTH_NAMES})
+         [ ]{1,2}
+     (?<day>\d{1,2})
+     ## optional year
+     (  [ ]
+        (?<year>\d{4})
+     )?
+     )
+    |
+     (?: ####
+         ## variant II/2/two
+         ##   17- 3-22   - allow space befor mont
+         ##   17-3-22
+            \d{1,2}
+             -
+            [ ]*\d{1,2}
+             -
+             (?:
+                \d{4} |   ## 2024
+                \d{2}     ## or 24 only
+             )
+     )
+     )  ## end date capture
+  ## eat-up/consume optional [] - part ii
+  (?: \] | \b
+  )
+}ix   ## note - only variant I has named parts (day_name, month_name, day, year); flags: i - ignore case, x - extended (free-spacing)
+end  #   class Parser
+end  # module Rsssf

data/lib/rsssf/parser/token-goals.rb ADDED Viewed

@@ -0,0 +1,68 @@
+module Rsssf
+class Parser
+## cannot start with number
+## cannot have number inside
+## cannot end with number!!!
+##   NOTE(review): the three rules above read like rules for a name (text) and NOT for the minute itself - confirm
+##  check if can end in dot - why? why not?
+##    e.g.  jr. or such?
+##
+##   allow  45+/90+  too
+##     or   90+pen or
+##          90+ pen/90+p/90+ og
+MINUTE_RE = %r{
+     (?<minute>
+         \b
+            \d{1,3}
+            '?   ## optional minute quote (')
+            (?:
+               # optional offset/extra e.g. 45+ / 90+ or 45+10 / 90+5
+                (?: \+
+                  (?:
+                     (?! [0-9])   ## negative look ahead (not a number) required
+                     |
+                    (?:
+                      \d{1,3}
+                      '?   ## optional minute quote (')
+                      (?= (og|pen|p)? ([ ;,\]]|$))
+                    )
+                  )
+                )
+                |
+                (?= (og|pen|p)? ([ ;,\]]|$))  # note - break can be og|pen|p too
+         )
+      )}ix   ## minute - 1 to 3 digits, optional quote ('), optional plus (+) with optional offset; flags: i - ignore case, x - extended
+### note - word boundary (\b) will NOT work for quote (')
+##             because quote is NOT alphanum (like dot etc.)
+##   goal types (markers that follow a minute)
+GOAL_PEN_RE = %r{
+   (?<pen>
+        (?<=\d|\+|[ ]|')	## must follow a number or plus (e.g. 45p / 45+p / 45 p / 45'p) or space
+            (?: pen|p )
+            \b
+    )
+}ix   ## pen - presumably penalty marker (pen or p); flags: i - ignore case, x - extended
+GOAL_OG_RE = %r{
+   (?<og>
+        (?<=\d|\+|[ ]|')	## must follow a number or plus (e.g. 45og / 45+og / 45 og) or space
+          og
+          \b
+   )
+}ix   ## og - presumably own goal marker; flags: i - ignore case, x - extended
+end # class Parser
+end # module Rsssf

data/lib/rsssf/parser/token-note.rb ADDED Viewed

@@ -0,0 +1,113 @@
+module Rsssf
+class Parser
+###
+##  NOTE_RE - note in square brackets e.g. [played in Pasching] or [postponed]
+##    (was: move to token-note(s) file !!!! - looks done; NOTE(review): confirm and drop)
+NOTE_RE = %r{
+    \[
+   (?<note>
+     (?:  ##  starting with ___   PLUS requiring more text
+       (?:
+          nb:
+          ##  e.g. [NB: between top-8 of regular season]
+          #        [NB: América, Morelia and Tigres qualified on better record regular season]
+          #        [NB: Celaya qualified on away goals]
+          #        [NB: Alebrijes qualified on away goal]
+          #        [NB: Leones Negros qualified on away goals]
+          #
+          # todo/fix:
+          # add "top-level" NB: version
+          ##   with full (end-of) line note - why? why not?
+          |
+          (?: originally[ ])? scheduled
+          ## e.g. [originally scheduled to play in Mexico City]
+          |
+          rescheduled
+          ## e.g.  [Rescheduled due to earthquake occurred in Mexico on September 19]
+          |
+          remaining
+          ## e.g. [remaining 79']
+          ##      [remaining 84']
+          ##      [remaining 59']
+          ##      [remaining 5']
+          |
+          played
+          ## e.g. [played in Macaé-RJ]
+          ##      [played in Caxias do Sul-RS]
+          ##      [played in Sete Lagoas-MG]
+          ##      [played in Uberlândia-MG]
+          ##      [played in Brasília-DF]
+          ##      [played in Vöcklabruck]
+          ##      [played in Pasching]
+          |
+          declared
+          ## e.g.  [declared void]
+          |
+          inter-group
+          ## e.g. [inter-group A-B]
+          ##      [inter-group C-D]
+       )
+      [ ]
+      [^\]]+?    ## slurp all to next ] - (use non-greedy)
+     )
+      |
+     (?:
+       ## starting with in  - do NOT allow digits
+       ##   name starting with in possible - why? why not?
+           in[ ]
+            [^0-9\]]+?
+       ## e.g. [In Estadio La Corregidora]
+       ##      [in Unidad Deportiva Centenario]
+       ##      [in Estadio Olímpico Universitario]
+       ##      [in Estadio Victoria]
+       ##      [in UD José Brindis]
+       ##      [in Colomos Alfredo "Pistache" Torres stadium]
+     )
+      |
+      (?:
+          (?:
+             postponed
+             ## e.g. [postponed due to problems with the screen of the stadium]
+             ##      [postponed by storm]
+             ##      [postponed due to tropical storm "Hanna"]
+             ##      [postponed from Sep 10-12 due to death Queen Elizabeth II]
+             ##     [postponed]  -- include why? why not?
+             |
+             awarded
+             ## e.g. [awarded match to Leones Negros by undue alignment; original result 1-2]
+             ##     [awarded 3-0 to Cafetaleros by undue alignment; originally ended 2-0]
+             ##     [awarded 3-0; originally 0-2, América used ineligible player (Federico Viñas)]
+             |
+             abandoned
+             ## e.g. [abandoned at 1-1 in 65' due to cardiac arrest Luton player Tom Lockyer]
+             ##      [abandoned at 0-0 in 6' due to waterlogged pitch]
+             ##     [abandoned at 5-0 in 80' due to attack on assistant referee by Cerro; result stood]
+             ##    [abandoned at 1-0 in 31']
+             ##    [abandoned at 0-1' in 85 due to crowd trouble]
+             |
+              suspended
+              ## e.g. [suspended at 0-0 in 12' due to storm]
+              ##      [suspended at 84' by storm; result stood]
+              |
+              annulled
+              ## e.g.  [annulled]
+              |
+              replay
+              ## e.g.  [replay]
+          )
+        ([ ]    ## note - optional text
+          [^\]]+?
+         )?         ## slurp all to next ] - (use non-greedy)
+      )
+    )    # note capture
+     \]
+}ix   ## note - the capture excludes the enclosing []; flags: i - ignore case, x - extended (free-spacing)
+end  #   class Parser
+end  #   module Rsssf