RubyGems - sportdb-formats - Versions diffs - 0.4.0 → 1.0.0 - Mend

sportdb-formats 0.4.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49) hide show

checksums.yaml +4 -4
data/Manifest.txt +24 -4
data/Rakefile +3 -3
data/lib/sportdb/formats.rb +25 -2
data/lib/sportdb/formats/config.rb +40 -0
data/lib/sportdb/formats/datafile.rb +42 -62
data/lib/sportdb/formats/datafile_package.rb +160 -0
data/lib/sportdb/formats/match/conf_parser.rb +120 -0
data/lib/sportdb/formats/match/mapper.rb +319 -0
data/lib/sportdb/formats/match/mapper_teams.rb +23 -0
data/lib/sportdb/formats/match/match_parser.rb +659 -0
data/lib/sportdb/formats/match/match_parser_auto_conf.rb +202 -0
data/lib/sportdb/formats/name_helper.rb +84 -0
data/lib/sportdb/formats/outline_reader.rb +53 -15
data/lib/sportdb/formats/package.rb +172 -160
data/lib/sportdb/formats/parser_helper.rb +81 -0
data/lib/sportdb/formats/score/score_formats.rb +180 -0
data/lib/sportdb/formats/score/score_parser.rb +196 -0
data/lib/sportdb/formats/structs/country.rb +1 -43
data/lib/sportdb/formats/structs/group.rb +25 -0
data/lib/sportdb/formats/structs/league.rb +7 -26
data/lib/sportdb/formats/structs/match.rb +72 -51
data/lib/sportdb/formats/structs/round.rb +14 -4
data/lib/sportdb/formats/structs/season.rb +3 -0
data/lib/sportdb/formats/structs/team.rb +144 -0
data/lib/sportdb/formats/version.rb +2 -2
data/test/helper.rb +83 -1
data/test/test_clubs.rb +3 -3
data/test/test_conf.rb +65 -0
data/test/test_datafile.rb +21 -30
data/test/test_match.rb +0 -6
data/test/test_match_auto.rb +72 -0
data/test/test_match_auto_champs.rb +45 -0
data/test/test_match_auto_euro.rb +37 -0
data/test/test_match_auto_worldcup.rb +61 -0
data/test/test_match_champs.rb +27 -0
data/test/test_match_eng.rb +26 -0
data/test/test_match_euro.rb +27 -0
data/test/test_match_worldcup.rb +27 -0
data/test/test_name_helper.rb +67 -0
data/test/test_outline_reader.rb +3 -3
data/test/test_package.rb +21 -2
data/test/test_package_match.rb +78 -0
data/test/test_scores.rb +67 -51
metadata +32 -12
data/lib/sportdb/formats/scores.rb +0 -253
data/lib/sportdb/formats/structs/club.rb +0 -213
data/test/test_club_helpers.rb +0 -63
data/test/test_datafile_match.rb +0 -65

data/lib/sportdb/formats/match/conf_parser.rb ADDED Viewed

@@ -0,0 +1,120 @@
module SportDb

  ##
  ## ConfParser - parses a "conf" block of text lines, one team per line,
  ##  into a hash keyed by team name.
  ##
  ##  Every line is either
  ##   1) a standings table row (see TABLE_RE) e.g.
  ##         1. Manchester City         38  32  4  2 106:27 100
  ##      or
  ##   2) a plain team name (the full line is used as the name).
  ##
  ##  Either form may carry an (optional) country code introduced by
  ##   one of the separators  < > ‹ › ,   (see COUNTRY_RE).
  class ConfParser

    ## convenience helper: build a parser for lines and return the parsed teams hash
    def self.parse( lines )
      new( lines ).parse
    end


    include Logging         ## e.g. logger#debug, logger#info, etc.
    include ParserHelper    ## e.g. read_lines, etc.

    ## lines - a text block (string) or an array of lines;
    ##   a string gets split into lines via ParserHelper#read_lines
    ## todo/check: change to text instead of array of lines - why? why not?
    def initialize( lines )
      @lines = lines.is_a?( String ) ? read_lines( lines ) : lines
    end


    COUNTRY_RE = %r{ [<>‹›,]
                     [ ]*
                     (?<country>[A-Z]{2,4})   ## todo/check: allow one-letter (motor vehicle plates) or 5 letter possible?
                    \b}xi

    ## standings table row regex matcher e.g.
    ##     1  Manchester City         38  32  4  2 106-27 100
    ## or  1. Manchester City         38  32  4  2 106:27 100
    TABLE_RE = %r{ ^
                    (?:
                      (?<rank>\d+)\.?
                         |
                        [-]
                     )
                    [ ]+
                      (?<team>.+?)   ## note: let's use non-greedy (MINIMUM length) match for now
                    [ ]+
                      (?<pld>\d+)    ## (pl)aye(d)
                    [ ]+
                      (?<w>\d+)      ## (w)ins
                    [ ]+
                      (?<d>\d+)     ## (d)raws
                    [ ]+
                      (?<l>\d+)      ## (l)ost
                    [ ]+
                      (?<gf>\d+)     ## (g)oal (f)or
                        [ ]*
                        [:-]    ## note: allow 10-10 or 10:10 or 10 - 10 or 10 : 10 etc.
                        [ ]*
                      (?<ga>\d+)      ## (g)oal (a)gainst
                     (?:          ## allow optional (g)oal (d)ifference
                        [ ]+
                        (?<gd>[±+-]?\d+)  ## (g)oal (d)ifference
                      )?
                     [ ]+
                      (?<pts>\d+)      ## (p)oin(ts)
                         (?:     ## allow optional deductions e.g. [-7]
                               [ ]+
                            \[(?<deduction>-\d+)\]
                         )?
                      $}x


    ## Returns a hash mapping team name => team hash; a team hash may hold
    ##   :country   - country code as written in the line (if present)
    ##   :rank      - integer rank (table rows with a numeric rank only)
    ##   :standing  - hash with :pld, :w, :d, :l, :gf, :ga, (optional) :gd,
    ##                :pts and (optional) :deduction  (table rows only)
    ##
    ## Raises ArgumentError (from Kernel#Integer) if a captured number
    ##   cannot be converted.
    def parse
      teams = {}

      @lines.each do |line|
        next if line =~ /^[ -]+$/   ## skip decorative lines with dash only (e.g. ---- or - - - -) etc.

        ## quick hack - check for/extract (optional) country code (for teams) first
        ##  allow as separators <>‹›,  NOTE: includes (,) comma for now too
        country = nil
        if (m = COUNTRY_RE.match( line ))
          country = m[:country]
          ## note: cut out the matched span itself; a textual sub( m[0], '' )
          ##   may hit an earlier, identical-looking substring
          ##   (e.g. the start of ", ENGLAND" when the match is ", ENG")
          line = m.pre_match + m.post_match
        end

        if (m = TABLE_RE.match( line ))
          logger.debug "  matching table entry >#{line}<"

          ## todo/fix: track double usage - why? why not? report/raise error/exception on duplicates?
          team = teams[ m[:team] ] ||= {}
          team[ :country ]  = country                  if country
          team[ :rank ]     = Integer( m[:rank], 10 )  if m[:rank]
          team[ :standing ] = build_standing( m )
        else
          ## assume team is full line
          name = line.strip  # note: strip leading and trailing spaces
          next if name.empty?   ## note: nothing left (e.g. empty line) - do not add a team named ""

          team = teams[ name ] ||= {}
          team[ :country ] = country   if country
        end
      end

      teams
    end # method parse


  private

    ## convert the number captures of a TABLE_RE match into a standing hash
    ##  note: always pass base 10 - plain Integer( "08" ) treats the leading
    ##        zero as an octal prefix and raises (and "010" would turn into 8)
    def build_standing( m )
      standing = {
        pld: Integer( m[:pld], 10 ),
        w:   Integer( m[:w], 10 ),
        d:   Integer( m[:d], 10 ),
        l:   Integer( m[:l], 10 ),
        gf:  Integer( m[:gf], 10 ),
        ga:  Integer( m[:ga], 10 ),
      }
      ## note: drop the (optional) ± or + sign; keeps a leading minus
      standing[ :gd ]        = Integer( m[:gd].delete( '±+' ), 10 )   if m[:gd]
      standing[ :pts ]       = Integer( m[:pts], 10 )
      standing[ :deduction ] = Integer( m[:deduction], 10 )           if m[:deduction]
      standing
    end

  end # class ConfParser
end # module SportDb

data/lib/sportdb/formats/match/mapper.rb ADDED Viewed

@@ -0,0 +1,319 @@
+# encoding: utf-8
module SportDb

##
## note: this was/is a cut-n-paste (inline) copy of TextUtils::TitleMapper2
##   see https://github.com/textkit/textutils/blob/master/textutils/lib/textutils/title_mapper2.rb
##
## MapperV2 works on a line of text in two steps:
##   1) map_titles!  replaces every known title found in the line
##                   with a marker  @@oo{key}oo@@
##   2) find_rec! / find_recs!  replace the marker(s) with [TAG] / [TAG1], [TAG2], ...
##                   and return the record(s) registered for the key(s)
##
##  note: all methods ending in ! change the passed-in line in place
class MapperV2      ## todo/check: rename to NameMapper/TitleMapper ? why? why not??
  include Logging

  attr_reader :known_titles   ## rename to mapping or mappings or just titles - why? why not?

  ########
  ##  key:      e.g. augsburg
  ##  title:    e.g. FC Augsburg
  ##  length (of title(!!) - not regex pattern):   e.g. 11   -- do not count dots (e.g. U.S.A. => 3 or 6) why? why not?
  MappingStruct = Struct.new( :key, :title, :length, :pattern )     ## todo/check: use (rename to) TitleStruct - why? why not??

  ######
  ## convenience helper - (auto)build ActiveRecord-like team records/structs
  Record = Struct.new( :key, :title, :synonyms )

  ## marker left behind by map_titles!
  ##   e.g. everything in @@oo .... oo@@ (use non-greedy +? plus all chars but not @, that is [^@])
  MARKER_RE = /@@oo([^@]+?)oo@@/

  ## all chars with a special meaning in a regex pattern (outside of a char class)
  REGEX_SPECIAL_CHARS_RE = /[.()?*+$^\[\]{}|\\]/

  ##  match accented char with or without accents
  ##  add (ü|ue) etc.
  ## also make - optional change to (-| ) e.g. Blau-Weiss == Blau Weiss
  ## todo: add some more
  ## see http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references  for more
  TITLE_ALTERNATIVES = [
    ['-', '(-| )'],   ## e.g. Blau-Weiß Linz
    ['æ', '(æ|ae)'],  ## e.g.
    ['ä', '(ä|ae)'],  ## e.g.
    ['Ö', '(Ö|Oe)'],  ## e.g. Österreich
    ['ö', '(ö|oe)'],  ## e.g. Mönchengladbach
    ['ß', '(ß|ss)'],  ## e.g. Blau-Weiß Linz
    ['ü', '(ü|ue)'],  ## e.g.
    ['á', '(á|a)'],   ## e.g. Bogotá, Sársfield
    ['ã', '(ã|a)'],   ## e.g  São Paulo
    ['ç', '(ç|c)'],   ## e.g. Fenerbahçe
    ['é', '(é|e)'],   ## e.g. Vélez
    ['ê', '(ê|e)'],   ## e.g. Grêmio
    ['ï', '(ï|i)'],   ## e.g. El Djazaïr
    ['ñ', '(ñ|n)'],   ## e.g. Porteño
    ['ň', '(ň|n)'],   ## e.g. Plzeň
    ['ó', '(ó|o)'],   ## e.g. Colón
    ['ō', '(ō|o)'],   ## e.g. Tōkyō
    ['ș', '(ș|s)'],   ## e.g. Bucarești
    ['ú', '(ú|u)'],   ## e.g. Fútbol
  ].freeze


  ## Build Record structs from a text block (string) or an array of lines.
  ##   line format:   title | synonym | synonym ...
  ##
  ##  note: for a text block, empty lines and comment lines (starting with #) get skipped;
  ##        lines with a missing or empty title get skipped for both kinds of input
  ##        (no title to map - an empty title would match everywhere)
  def build_records( txt_or_lines )
    lines = if txt_or_lines.is_a?( String )
              ## todo/fix: use ParserHelper read_lines !!! ????
              txt_or_lines.each_line
                          .map { |line| line.strip }
                          .reject { |line| line.empty? || line.start_with?( '#' ) }
            else
              txt_or_lines
            end

    lines.each_with_object( [] ) do |line, recs|
      values = line.split( '|' ).map { |value| value.strip }
      title  = values[0]
      next if title.nil? || title.empty?

      ## note: quick hack - auto-generate key, that is, remove all non-ascii chars and downcase
      key      = title.downcase.gsub( /[^a-z]/, '' )
      synonyms = values.size > 1 ? values[1..-1].join( '|' ) : nil

      recs << Record.new( key, title, synonyms )
    end
  end


  ## records_or_mapping - one of
  ##    1) text block/string                 (auto-converted via build_records)
  ##    2) array of lines/strings            (auto-converted via build_records)
  ##    3) array of records (responding to key, title, synonyms and optionally code)
  ##    4) "custom" mapping hash table (title/name=>record)
  ## tag - attribute name used for the [TAG] placeholders e.g. team
  def initialize( records_or_mapping, tag )
    if records_or_mapping.is_a?( String ) ||
       (records_or_mapping.is_a?( Array ) && records_or_mapping[0].is_a?( String ))
      records_or_mapping = build_records( records_or_mapping )
    end

    ## build mapping lookup table
    @known_titles = if records_or_mapping.is_a?( Hash )  ## assume "custom" mapping hash table (title/name=>record)
                      build_title_table_for_mapping( records_or_mapping )
                    else  ## assume array of records
                      build_title_table_for_records( records_or_mapping )
                    end

    ## build lookup hash by record (e.g. team/club/etc.) key
    records = if records_or_mapping.is_a?( Array )
                records_or_mapping
              else   ## assume hash (uses values assuming to be all records - note might include duplicates)
                records_or_mapping.values
              end

    @records = records.each_with_object( {} ) { |rec, h| h[rec.key] = rec }

    ## todo: rename tag to attrib or attrib_name - why ?? why not ???
    @tag = tag   # e.g. tag name use for @@brewery@@ @@team@@ etc.
  end


  ## replace all known titles in line with @@oo{key}oo@@ markers (in place);
  ##  keeps going until no more title matches
  def map_titles!( line )   ## rename to just map! - why?? why not???
    loop do
      break unless map_title_for!( @tag, line, @known_titles )
    end
  end

  ## replace the first marker in line with [TAG] and return its record (or nil if no marker found)
  def find_rec!( line )
    find_rec_for!( @tag, line, @records )
  end

  ## replace markers one by one with [TAG1], [TAG2], ... and return the records as an array
  def find_recs!( line )  # note: keys (plural!) - will return array
    recs = []
    counter = 1
    while (rec = find_rec_for!( "#{@tag}#{counter}", line, @records ))
      recs << rec
      counter += 1
    end
    recs
  end


private

  def build_title_table_for_mapping( mapping )
    known_titles = mapping.map do |title, rec|
      ## note: just use "standard" regex escape (e.g. no extras for umlauts,accents,etc.)
      MappingStruct.new( rec.key, title, title.length, Regexp.escape( title ) )
    end

    ## note: sort here by length (largest goes first - best match)
    known_titles.sort { |l,r| r.length <=> l.length }
  end

  def build_title_table_for_records( records )
    ## build known tracks table w/ synonyms e.g.
    #
    # [[ 'wolfsbrug', 'VfL Wolfsburg'],
    #  [ 'augsburg',  'FC Augsburg'],
    #  [ 'augsburg',  'Augi2'],
    #  [ 'augsburg',  'Augi3' ],
    #  [ 'stuttgart', 'VfB Stuttgart']]

    known_titles = []

    records.each_with_index do |rec,index|
      title_candidates = [rec.title]
      title_candidates += rec.synonyms.split('|') if rec.synonyms && !rec.synonyms.empty?

      ## check if title includes subtitle e.g. Grand Prix Japan (Suzuka Circuit)
      #  make subtitle optional by adding title w/o subtitle e.g. Grand Prix Japan
      titles = title_candidates.flat_map do |t|
        if t =~ /\(.+\)/
          # note: strip leading n trailing withspaces too!
          #  -- todo: add squish or something if () is inline e.g. leaves two spaces?
          [t, t.gsub( /\(.+\)/, '' ).strip]
        else
          [t]
        end
      end

      titles.each do |t|
        ## note: escape for regex plus allow subs for special chars/accents
        known_titles << MappingStruct.new( rec.key, t, t.length, title_esc_regex( t ) )
      end

      logger.debug "  #{rec.class.name}[#{index+1}] #{rec.key} >#{titles.join('|')}<"

      ## note: only include code field - if defined
      if rec.respond_to?(:code) && rec.code && !rec.code.empty?
        ## note: match the code literally (no variants allowed for now);
        ##   escape it - the pattern gets interpolated into a regex
        known_titles << MappingStruct.new( rec.key, rec.code, rec.code.length, Regexp.escape( rec.code ) )
      end
    end

    ## note: sort here by length (largest goes first - best match)
    #  exclude code and key (key should always go last)
    known_titles.sort { |l,r| r.length <=> l.length }
  end


  ## try the mappings in order (longest title first); on the first hit replace the
  ##  title in line with its @@oo{key}oo@@ marker and return true; return false if nothing matches
  def map_title_for!( tag, line, mappings )
    ## note: the regex only depends on the pattern - build once and reuse
    regexps = (@title_regexps ||= {})

    mappings.each do |mapping|
      key     = mapping.key
      pattern = mapping.pattern
      ## nb: \b does NOT include space or newline for word boundry (only alphanums e.g. a-z0-9)
      ## (thus add it, allows match for Benfica Lis.  for example - note . at the end)
      ## check add $ e.g. (\b| |\t|$) does this work? - check w/ Benfica Lis.$
      re = regexps[ pattern ] ||= /\b#{pattern}(\b| |\t|$)/   # wrap with world boundry (e.g. match only whole words e.g. not wac in wacker)
      next unless line =~ re

      logger.debug "     match for #{tag.downcase}  >#{key}< >#{pattern}<"
      # make sure @@oo{key}oo@@ doesn't match itself with other key e.g. wacker, wac, etc.
      line.sub!( re, "@@oo#{key}oo@@ " )    # NB: add one space char at end
      return true    # break out after first match (do NOT continue)
    end
    false
  end

  ## replace the first @@oo{key}oo@@ marker in line with [TAG] (upcased tag)
  ##  and return records[key]; return nil if line has no marker
  def find_rec_for!( tag, line, records )
    m = MARKER_RE.match( line )
    return nil unless m

    key = m[1]
    logger.debug "   #{tag.downcase}: >#{key}<"
    line.sub!( MARKER_RE, "[#{tag.upcase}]" )
    records[ key ]  ## note: map key to record (using records hash table mapping)
  end # method find_rec_for!


  ####
  # title helper cut-n-paste copy from TextUtils
  ##  see https://github.com/textkit/textutils/blob/master/textutils/lib/textutils/helper/title_helper.rb
  ##
  ## turn a title into a regex pattern (string):
  ##   1) escape all chars with a special meaning in a regex, that is,
  ##        . ( ) ? * + $ ^ [ ] { } | \
  ##   2) allow alternative spellings for accents, umlauts and dashes (see TITLE_ALTERNATIVES)
  ##
  # e.g. Benfica Lis.
  # e.g. Club Atlético Colón (Santa Fe)
  # e.g. Bauer Anton (????)
  def title_esc_regex( title_unescaped )
    ## NB: cannot use Regexp.escape! will escape space '' to '\ '
    ## note: escape in a single pass - otherwise the backslashes added
    ##        for one char get escaped again for the next
    title = title_unescaped.gsub( REGEX_SPECIAL_CHARS_RE ) { |ch| "\\#{ch}" }

    ### fix/todo:  check for  dot+space e.g. . and make dot optional
    ##
    #  e.g. make  dot (.) optional plus allow alternative optional space e.g.
    #   -- for U.S.A. => allow USA or U S A
    #
    ##    e.g. U. de G. or U de G or U.de G. ??
    ##   collect some more (real-world) examples first!!!!!
    TITLE_ALTERNATIVES.each do |char, alternative|
      title = title.gsub( char, alternative )
    end
    title
  end

end # class MapperV2
end # module SportDb

data/lib/sportdb/formats/match/mapper_teams.rb ADDED Viewed

@@ -0,0 +1,23 @@
+# encoding: utf-8
module SportDb

##
## TeamMapper - small convenience wrapper around MapperV2
##   that always uses 'team' as the tag;
##   all three methods pass the line straight on to the wrapped mapper
class TeamMapper

  ## records_or_mapping gets handed on to MapperV2.new as is
  def initialize( records_or_mapping )
    @mapper = MapperV2.new( records_or_mapping, 'team' )
  end

  ## step 1: delegates to MapperV2#map_titles!
  def map_teams!( line )
    @mapper.map_titles!( line )
  end

  ## step 2 (single team): delegates to MapperV2#find_rec!
  def find_team!( line )
    @mapper.find_rec!( line )
  end

  ## step 2 (many teams): delegates to MapperV2#find_recs!
  ##   note: plural! - returns an array
  def find_teams!( line )
    @mapper.find_recs!( line )
  end

end # class TeamMapper
end # module SportDb