RubyGems - wordcut - Versions diffs - 0.0.1 → 0.0.4 - Mend

wordcut 0.0.1 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

checksums.yaml +4 -4
data/LICENSE +165 -0
data/data/tha/tdict-acronyms.txt +471 -0
data/data/tha/tdict-city.txt +242 -0
data/data/tha/tdict-collection.txt +54 -0
data/data/tha/tdict-common.txt +1234 -0
data/data/tha/tdict-country.txt +217 -0
data/data/tha/tdict-district.txt +114 -0
data/data/tha/tdict-geo.txt +53 -0
data/data/tha/tdict-history.txt +35 -0
data/data/tha/tdict-ict.txt +260 -0
data/data/tha/tdict-lang-ethnic.txt +43 -0
data/data/tha/tdict-proper.txt +393 -0
data/data/tha/tdict-science.txt +146 -0
data/data/tha/tdict-spell.txt +82 -0
data/data/tha/tdict-std-compound.txt +5493 -0
data/data/tha/tdict-std.txt +15374 -0
data/wordcut/dag.rb +82 -78
data/wordcut/dict.rb +39 -38
data/wordcut/dict_seek.rb +24 -22
data/wordcut/edge.rb +21 -17
data/wordcut/edge_builder.rb +16 -14
data/wordcut/pointer.rb +27 -24
data/wordcut/space_slicer.rb +25 -21
data/wordcut/tokenizer.rb +11 -9
metadata +19 -3

data/wordcut/dag.rb CHANGED Viewed

@@ -2,98 +2,102 @@ require_relative "edge_builder"
 require_relative "pointer"
 require_relative "space_slicer"
-module DictDagUpdater
-  def update_by_dict(i, pointers)
-    edge = self.build_edges(pointers).min
-    self[i] = edge
-    return i
+module Wordcut
+  module DictDagUpdater
+    def update_by_dict(i, pointers)
+      edge = self.build_edges(pointers).min
+      self[i] = edge
+      return i
+    end
   end
-end
-module UnkDagUpdater
-  def update_by_unk(i, left)
-    src = self[left]
-    edge = Edge.new(:s => left,
-                    :unk => src.unk + 1,
-                    :chunk => src.chunk + 1,
-                    :etype => :UNK,
-                    :payload => nil)
-    self[i] = edge
-    return left
+  module UnkDagUpdater
+    def update_by_unk(i, left)
+      src = self[left]
+      edge = Edge.new(:s => left,
+                      :unk => src.unk + 1,
+                      :chunk => src.chunk + 1,
+                      :etype => :UNK,
+                      :payload => nil)
+      self[i] = edge
+      return left
+    end
   end
-end
-module SpaceDagUpdater
-  def update_by_space(i, slicer)
-    s = slicer.s
-    src = self[s]
-    edge = Edge.new(:s => s,
-                    :unk => src.unk,
-                    :chunk => src.chunk + 1,
-                    :etype => :SPACE,
-                    :payload => nil)
-    self[i] = edge
-    return i
+  module SpaceDagUpdater
+    def update_by_space(i, slicer)
+      s = slicer.s
+      src = self[s]
+      edge = Edge.new(:s => s,
+                      :unk => src.unk,
+                      :chunk => src.chunk + 1,
+                      :etype => :SPACE,
+                      :payload => nil)
+      self[i] = edge
+      return i
+    end
   end
-end
-module BasicDagUpdater
-  include DictDagUpdater
-  include UnkDagUpdater
-  include SpaceDagUpdater
-  include PointersManipulator
-  def update(i, left, pointers, space_slicer)
-    if not pointers&.empty?
-      update_by_dict(i, pointers)
-    elsif space_slicer&.final
-      update_by_space(i, space_slicer)
-    else
-      update_by_unk(i, left)
+  module BasicDagUpdater
+    include DictDagUpdater
+    include UnkDagUpdater
+    include SpaceDagUpdater
+    include PointersManipulator
+    def update(i, left, pointers, space_slicer)
+      if not pointers&.empty?
+        update_by_dict(i, pointers)
+      elsif space_slicer&.final
+        update_by_space(i, space_slicer)
+      else
+        update_by_unk(i, left)
+      end
     end
   end
-end
-module DagBuilder
-  def build(dict, txt)
-    self[0] = init_edge
-    pointers = []
-    left = 0
-    space_slicer = SpaceSlicer.new(0)
-    for i in 1..txt.length
-      ch = txt[i - 1]
-      next_ch = i < txt.length ? txt[i] : nil
-      space_slicer.transit(ch, next_ch)
-      pointers << new_pointer(i, dict)
-      pointers = transit(pointers, ch)
-      left = update(i, left, pointers.select(&:final), space_slicer)
+  module DagBuilder
+    def build(dict, txt)
+      self[0] = init_edge
+      pointers = []
+      left = 0
+      space_slicer = SpaceSlicer.new(0)
+      for i in 1..txt.length
+        ch = txt[i - 1]
+        next_ch = i < txt.length ? txt[i] : nil
+        space_slicer.transit(ch, next_ch)
+        pointers << new_pointer(i, dict)
+        pointers = transit(pointers, ch)
+        left = update(i, left, pointers.select(&:final), space_slicer)
+      end
     end
   end
-end
-module DagToToken
-  def tokens(txt)
-    toks = []
-    i = txt.length
-    while i > 0
-      s = self[i].s
-      tok = txt.slice(s, i-s)
-      toks << tok
-      i = s
+  module DagToToken
+    def tokens(txt)
+      toks = []
+      i = txt.length
+      while i > 0
+        s = self[i].s
+        tok = txt.slice(s, i-s)
+        toks << tok
+        i = s
+      end
+      toks.reverse
     end
-    toks.reverse
   end
-end
-class BasicDag < Array
-  include EdgeBuilder
-  include BasicDagUpdater
-  include DagBuilder
-  include DagToToken
-  def self.build(dict, txt)
-    dag = BasicDag.new(txt.length + 1)
-    dag.build(dict, txt)
-    return dag
+  class BasicDag < Array
+    include EdgeBuilder
+    include BasicDagUpdater
+    include DagBuilder
+    include DagToToken
+    def self.build(dict, txt)
+      dag = BasicDag.new(txt.length + 1)
+      dag.build(dict, txt)
+      return dag
+    end
   end
 end

data/wordcut/dict.rb CHANGED Viewed

@@ -1,51 +1,52 @@
 require_relative "dict_seek"
-class WordItem
-  attr_reader :headword
-  def initialize(headword)
-    @headword = headword
-  end
-end
-module DictInfo
-  def l
-    0
+module Wordcut
+  class WordItem
+    attr_reader :headword
+    def initialize(headword)
+      @headword = headword
+    end
   end
-  def r
-    return nil if self.empty?
-    self.length - 1
-  end
-end
+  module DictInfo
+    def l
+      0
+    end
-module PathResolver
-  def resolve_path(lang, name)
-    File.expand_path(File.join(__FILE__, '..', '..', 'data', lang, name, ))
+    def r
+      return nil if self.empty?
+      self.length - 1
+    end
   end
-end
-module BasicDictLoader
-  include PathResolver
-  def load_bundle(lang, name)
-    load(resolve_path(lang, name))
+  module PathResolver
+    def resolve_path(lang, name)
+      File.expand_path(File.join(__FILE__, '..', '..', 'data', lang, name, ))
+    end
   end
-  def load(path)
-    self.concat(open(path).each_line
-                 .map(&:strip)
-                 .reject(&:empty?)
-                 .map{|w| WordItem.new w})
+  module BasicDictLoader
+    include PathResolver
+    def load_bundle(lang, name)
+      load(resolve_path(lang, name))
+    end
+    def load(path)
+      self.concat(open(path).each_line
+                   .map(&:strip)
+                   .reject(&:empty?)
+                   .map{|w| WordItem.new w})
+    end
   end
-end
-class BasicDict < Array
-  include DictInfo
-  include DictSeeker
-  include BasicDictLoader
+  class BasicDict < Array
+    include DictInfo
+    include DictSeeker
+    include BasicDictLoader
-  def self.from_bundle(lang, name)
-    dict = self.new
-    dict.load_bundle(lang, name)
-    return dict
+    def self.from_bundle(lang, name)
+      dict = self.new
+      dict.load_bundle(lang, name)
+      return dict
+    end
   end
 end

data/wordcut/dict_seek.rb CHANGED Viewed

@@ -1,28 +1,30 @@
-module DictSeeker
-  def seek(ch, l, r, offset, policy)
-    idx = nil
-    while l <= r
-      m = (l + r) / 2
-      w = self[m].headword
-      wlen = w.length
+module Wordcut
+  module DictSeeker
+    def seek(ch, l, r, offset, policy)
+      idx = nil
+      while l <= r
+        m = (l + r) / 2
+        w = self[m].headword
+        wlen = w.length
-      if wlen <= offset
-        l = m + 1
-      else
-        ch_w = w[offset]
-        if ch_w < ch
+        if wlen <= offset
           l = m + 1
-        elsif ch_w > ch
-          r = m - 1
-        elsif policy == :LEFT
-          idx = m
-          r = m - 1
-        elsif policy == :RIGHT
-          idx = m
-          l = m + 1
-        end
+        else
+          ch_w = w[offset]
+          if ch_w < ch
+            l = m + 1
+          elsif ch_w > ch
+            r = m - 1
+          elsif policy == :LEFT
+            idx = m
+            r = m - 1
+          elsif policy == :RIGHT
+            idx = m
+            l = m + 1
+          end
+        end
       end
+      return idx
     end
-    return idx
   end
 end

data/wordcut/edge.rb CHANGED Viewed

@@ -1,21 +1,25 @@
-class Edge
-  attr_reader :unk, :chunk, :s, :payload, :etype
-  CMP_FUNCS = [lambda {|e| e.unk}, lambda {|e| e.chunk}]
-  def initialize(args = {})
-    @unk = args[:unk] || 0
-    @chunk = args[:chunk] || 0
-    @s = args[:s] || 0
-    @payload = args[:payload]
-    @etype = args[:etype]
-  end
+module Wordcut
+  class Edge
+    attr_reader :unk, :chunk, :s, :payload, :etype
+    CMP_FUNCS = [lambda {|e| e.unk}, lambda {|e| e.chunk}]
+    def initialize(args = {})
+      @unk = args[:unk] || 0
+      @chunk = args[:chunk] || 0
+      @s = args[:s] || 0
+      @payload = args[:payload]
+      @etype = args[:etype]
+    end
-  def <=>(o)
-    for fn in CMP_FUNCS
-      cmp = fn.call(self) <=> fn.call(o)
-      return cmp if cmp != 0
+    def <=>(o)
+      for fn in CMP_FUNCS
+        cmp = fn.call(self) <=> fn.call(o)
+        return cmp if cmp != 0
+      end
+      return 0
     end
-    return 0
   end
 end

data/wordcut/edge_builder.rb CHANGED Viewed

@@ -1,18 +1,20 @@
 require_relative "edge.rb"
-module EdgeBuilder
-  def init_edge
-    Edge.new
-  end
-  def build_edges(pointers)
-    pointers.map do |pointer|
-      src = self[pointer.s]
-      Edge.new(:s => pointer.s,
-               :unk => src.unk,
-               :chunk => src.chunk + 1,
-               :etype => :DICT,
-               :payload => nil)
-    end
+module Wordcut
+  module EdgeBuilder
+    def init_edge
+      Edge.new
+    end
+    def build_edges(pointers)
+      pointers.map do |pointer|
+        src = self[pointer.s]
+        Edge.new(:s => pointer.s,
+                 :unk => src.unk,
+                 :chunk => src.chunk + 1,
+                 :etype => :DICT,
+                 :payload => nil)
+      end
+    end
   end
 end

data/wordcut/pointer.rb CHANGED Viewed

@@ -1,29 +1,32 @@
-class Pointer
-  attr_reader :s, :l, :r, :offset, :dict, :final
-  def initialize(s, l, r, offset, dict, final=false)
-    @s = s
-    @l = l
-    @r = r
-    @offset = offset
-    @dict = dict
-    @final = final
-  end
+module Wordcut
-  def update(ch)
-    l = @dict.seek(ch, @l, @r, @offset, :LEFT)
-    return nil unless l
-    r = @dict.seek(ch, l, @r, @offset, :RIGHT)
-    final = (@dict[l].headword.length == @offset + 1)
-    self.class.new(@s, l, r, @offset + 1, @dict, final)
-  end
-end
+  class Pointer
+    attr_reader :s, :l, :r, :offset, :dict, :final
+    def initialize(s, l, r, offset, dict, final=false)
+      @s = s
+      @l = l
+      @r = r
+      @offset = offset
+      @dict = dict
+      @final = final
+    end
-module PointersManipulator
-  def new_pointer(i, dict)
-    Pointer.new(i-1, dict.l, dict.r, 0, dict)
+    def update(ch)
+      l = @dict.seek(ch, @l, @r, @offset, :LEFT)
+      return nil unless l
+      r = @dict.seek(ch, l, @r, @offset, :RIGHT)
+      final = (@dict[l].headword.length == @offset + 1)
+      self.class.new(@s, l, r, @offset + 1, @dict, final)
+    end
   end
-  def transit(pointers, ch)
-    pointers.map{|p| p.update(ch)}.reject(&:nil?)
+  module PointersManipulator
+    def new_pointer(i, dict)
+      Pointer.new(i-1, dict.l, dict.r, 0, dict)
+    end
+    def transit(pointers, ch)
+      pointers.map{|p| p.update(ch)}.reject(&:nil?)
+    end
   end
 end

data/wordcut/space_slicer.rb CHANGED Viewed

@@ -1,26 +1,30 @@
-class SpaceSlicer
-  attr_reader :s, :offset, :final
-  def initialize(s)
-    @s = s
-    @offset = 0
-    @final = false
-  end
-  def transit(ch, next_ch)
-    current_is_space = (ch =~ /\s/)
-    next_is_space = (not nil? and next_ch =~ /\s/)
+module Wordcut
+  class SpaceSlicer
+    attr_reader :s, :offset, :final
-    if current_is_space and next_is_space
-      @offset += 1
-    elsif current_is_space and not next_is_space
-      @offset += 1
-      @final = true
-    elsif not current_is_space
-      @final = false
-      @s += @offset
-      @s += 1
+    def initialize(s)
+      @s = s
       @offset = 0
+      @final = false
+    end
+    def transit(ch, next_ch)
+      current_is_space = (ch =~ /\s/)
+      next_is_space = (not nil? and next_ch =~ /\s/)
+      if current_is_space and next_is_space
+        @offset += 1
+      elsif current_is_space and not next_is_space
+        @offset += 1
+        @final = true
+      elsif not current_is_space
+        @final = false
+        @s += @offset
+        @s += 1
+        @offset = 0
+      end
     end
   end
 end

data/wordcut/tokenizer.rb CHANGED Viewed

@@ -1,15 +1,17 @@
 require_relative "dag.rb"
-module Tokenizer
-  def tokenize(txt)
-    @dag_class.build(@dict, txt).tokens(txt)
+module Wordcut
+  module Tokenizer
+    def tokenize(txt)
+      @dag_class.build(@dict, txt).tokens(txt)
+    end
   end
-end
-class BasicTokenizer
-  include Tokenizer
-  def initialize(dict)
-    @dict = dict
-    @dag_class = BasicDag
+  class BasicTokenizer
+    include Tokenizer
+    def initialize(dict)
+      @dict = dict
+      @dag_class = BasicDag
+    end
   end
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: wordcut
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.4
 platform: ruby
 authors:
 - Vee Satayamas
@@ -17,7 +17,23 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
+- LICENSE
 - README.md
+- data/tha/tdict-acronyms.txt
+- data/tha/tdict-city.txt
+- data/tha/tdict-collection.txt
+- data/tha/tdict-common.txt
+- data/tha/tdict-country.txt
+- data/tha/tdict-district.txt
+- data/tha/tdict-geo.txt
+- data/tha/tdict-history.txt
+- data/tha/tdict-ict.txt
+- data/tha/tdict-lang-ethnic.txt
+- data/tha/tdict-proper.txt
+- data/tha/tdict-science.txt
+- data/tha/tdict-spell.txt
+- data/tha/tdict-std-compound.txt
+- data/tha/tdict-std.txt
 - wordcut/dag.rb
 - wordcut/dict.rb
 - wordcut/dict_seek.rb
@@ -26,14 +42,14 @@ files:
 - wordcut/pointer.rb
 - wordcut/space_slicer.rb
 - wordcut/tokenizer.rb
-homepage: https://github.com/veer66/wordcut
+homepage: https://github.com/veer66/wordcut.rb
 licenses:
 - LGPL-3.0
 metadata: {}
 post_install_message:
 rdoc_options: []
 require_paths:
-- wordcut
+- "."
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="