RubyGems - rseg - Versions diffs - 0.1.1 - Mend

rseg 0.1.1

Files changed (18) hide show

data/.gitignore ADDED Viewed

@@ -0,0 +1,3 @@
+test
+pkg
+.DS_Store

data/LICENSE ADDED Viewed

@@ -0,0 +1,30 @@
+Rseg includes two built-in dictionaries:
+* CC-CEDICT (http://cc-cedict.org/wiki/): Creative Commons Attribution-Share Alike 3.0 License (http://creativecommons.org/licenses/by-sa/3.0/)
+* Wikipedia Chinese article title list (http://download.wikimedia.org/zhwiki/): Creative Commons Attribution-Share Alike 3.0 License (http://creativecommons.org/licenses/by-sa/3.0/)
+The codes and others in Rseg are licensed under MIT license:
+===============================
+Copyright (c) 2009 Yuanyi Zhang
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.

data/README ADDED Viewed

@@ -0,0 +1,35 @@
+Introduction
+========
+Rseg is a Chinese Word Segmentation(中文分词) routine in pure Ruby.
+The algorithm is based on this article: http://xiecc.blog.163.com/blog/static/14032200671110224190/
+Usage
+========
+It's very easy to use:
+> require 'rubygems'
+> require 'rseg'
+> Rseg.segment("需要分词的文章")
+['需要', '分词', '的', '文章']
+The first call to Rseg.segment takes about 30 seconds to load the dictionary; subsequent calls are very fast.
+Performance
+========
+About 5M character/s on my Macbook (Intel Core 2 Duo 2GHz/4G mem).
+License
+========
+Rseg includes two built-in dictionaries:
+* CC-CEDICT (http://cc-cedict.org/wiki/) with Creative Commons Attribution-Share Alike 3.0 License (http://creativecommons.org/licenses/by-sa/3.0/)
+* Wikipedia Chinese article title list (http://download.wikimedia.org/zhwiki/) with Creative Commons Attribution-Share Alike 3.0 License (http://creativecommons.org/licenses/by-sa/3.0/)
+The code and other files in Rseg are licensed under the MIT license.
+Feedback
+========
+All feedback is welcome: Yuanyi Zhang (zhangyuanyi#gmail.com)

data/Rakefile ADDED Viewed

@@ -0,0 +1,19 @@
+# Rake setup for the rseg gem: loads the rake test and rcov task libraries,
+# then defines the packaging tasks through Jeweler when it is installed.
+require 'rake'
+require 'rake/testtask'
+require 'rcov/rcovtask'
+begin
+  require 'jeweler'
+  # Gem specification handed to Jeweler's task generator.
+  Jeweler::Tasks.new do |s|
+    s.name = "rseg"
+    s.executables = "rseg"
+    s.summary = "A Chinese Word Segmentation(中文分词) routine in pure Ruby"
+    s.email = "zhangyuanyi@gmail.com"
+    s.homepage = "http://github.com/yzhang/rseg"
+    s.description = "A Chinese Word Segmentation(中文分词) routine in pure Ruby"
+    s.authors = ["Yuanyi Zhang"]
+    # Packaged files: top-level files whose names start with a capital letter,
+    # everything under bin/ and lib/, .gitignore and the prebuilt dictionary.
+    s.files =  FileList["[A-Z]*", "{bin,lib}/**/*", '.gitignore', 'dict/dict.hash']
+  end
+rescue LoadError
+  # Jeweler is optional: print install instructions instead of failing.
+  puts "Jeweler, or one of its dependencies, is not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
+end

data/VERSION ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0.1.1

data/bin/rseg ADDED Viewed

@@ -0,0 +1,3 @@
+#!/usr/bin/env ruby
+# Placeholder executable: it only prints a notice.
+puts 'Coming soon'

data/dict/dict.hash ADDED Viewed

Binary file

data/lib/builder/dict.rb ADDED Viewed

@@ -0,0 +1,42 @@
+$KCODE = 'UTF8'
+def process(path, tree)
+  File.open(path, 'r') do |file|
+    file.each_line do |line|
+      node = nil
+      line.chars.each do |c|
+        next if c == "\n" || c == "\r"
+        if node
+          node[c] ||= {}
+          node = node[c]
+        else
+          tree[c] ||= Hash.new
+          node = tree[c]
+        end
+      end
+      node[:end] = true
+    end
+  end
+end
+def build
+  tree = {}
+  dictionaries = ['cedict.zh_CN.utf8', 'wikipedia.zh.utf8']
+  #dictionaries = ['wikipedia.zh.utf8']
+  dictionaries.each do |dictionary|
+    puts "Processing #{dictionary}..."
+    path = File.join(File.dirname(__FILE__), '../../dict', dictionary)
+    process(path, tree)
+  end
+  File.open(hash_path, "wb") {|io| Marshal.dump(tree, io)}
+  puts 'Done'
+end
+def hash_path
+  File.join(File.dirname(__FILE__), '../../dict/dict.hash')
+end
+build

data/lib/engines/dict.rb ADDED Viewed

@@ -0,0 +1,42 @@
+# Dictionary engine: follows a character trie loaded from dict.hash, one input
+# character at a time, to recognise words.
+class Dict < Engine
+  # Trie root shared by every instance; loaded on the first instantiation.
+  @@root = nil
+  # Loads the trie if it is not loaded yet and starts the walk at its root.
+  def initialize
+    @@root ||= load_dict(dict_path)
+    @word = ''
+    @node = @@root
+    super
+  end
+  # Advances the trie walk by one character.
+  #
+  # Returns [match, word]. While +char+ continues a path in the trie, match is
+  # true and word is nil. Otherwise match is false, the walk restarts at the
+  # root, and word carries the text accumulated so far: the String itself when
+  # the current node is flagged :end or the text is exactly one character
+  # long, else an Array of its individual characters.
+  def process(char)
+    match = false
+    word = nil
+    if @node[char]
+      @word << char
+      @node = @node[char]
+      match = true
+    else
+      if @node[:end] || @word.chars.to_a.length == 1
+        word = @word
+      else
+        # Not a complete dictionary word: hand back the separate characters.
+        word = @word.chars.to_a
+      end
+      @node = @@root
+      @word = ''
+      match = false
+    end
+    [match, word]
+  end
+  private
+  # Reads the marshalled trie from +path+.
+  # NOTE(review): Marshal.load must only be given trusted files; here the path
+  # comes from dict_path.
+  def load_dict(path)
+    File.open(path, "rb") {|io| Marshal.load(io)}
+  end
+  # Path of dict.hash relative to this source file.
+  def dict_path
+    File.join(File.dirname(__FILE__), '../../dict/dict.hash')
+  end
+end

data/lib/engines/engine.rb ADDED Viewed

@@ -0,0 +1,17 @@
+class Engine
+  def initialize
+    @running = true
+  end
+  def stop
+    @running = false
+  end
+  def run
+    @running = true
+  end
+  def running?
+    @running
+  end
+end

data/lib/engines/english.rb ADDED Viewed

@@ -0,0 +1,24 @@
+LETTER_SYMBOLS = ('a'..'z').to_a + ('A'..'Z').to_a
+class English < Engine
+  def initialize
+    @word = ''
+    super
+  end
+  def process(char)
+    match = false
+    word = nil
+    if LETTER_SYMBOLS.include?(char)
+      @word << char
+      match = true
+    else
+      word = @word
+      @word = ''
+      match = false
+    end
+    [match, word]
+  end
+end

data/lib/engines/name.rb ADDED Viewed

@@ -0,0 +1,52 @@
+LAST_NAMES = %W(丁 卜 刁 七 弓 干 于 王 尤 孔 方 申 白 甘 田 包 石 左 平 司 皮 史 池 艾 年 匡 充 江 印
+                促 伊 伍 安 任 米 促 牟 向 吉 成 伏 吕 李 吴 沈 何 贝 狄 祁 杜 汪 阮 邢 汲 别 辛 冷 利 沃 谷
+                扶 步 那 沙 周 金 吕 花 孟 和 邵 房 抗 灰 明 屈 松 牧 宓 武 幸 卓 易 尚 邰 空 竺 岳 东 林
+                施 姜 俞 查 封 秋 帅 祖 羿 柯 茅 柳 姚 纪 宣 咸 库 侯 洪 胡 哈 宣 郁 祝 苗 禹 娄
+                秦 奚 倪 度 凌 宰 宦 师 徐 翁 班 马 时 晃 乌 夏 贡 柴 能 家 宫 敖 索 晏 桑 高 凌 桂 容 姬 劳 桑 桂 袁 时 祝 席 徐 高 夏 凌 洪 翁 家 芮 乌 祖 索 贡
+                许 张 曹 戚 梅 屠 盛 崖 章 鱼 国 商 扈 寇 终 冯 苗 康 常 茅 闵 麻 胡 崔 邢 条 符 宿 堵 浦 习 鱼
+                梁 富 曾 程 项 钮 舒 彭 费 童 云 喻 嵇 范 费 贺 毕 付 黄 邵 祁 阮 强 童 邱 解 贲 单 富 钮 荀 惠 邴 焦 班 甯 钭 景 邰 劳 茹 寇 荆
+                莫 际 景 须 杨 詹 郎 雷 贾 路 骆 虞 经 裘 郁 滑 甄 靳 詹 闻 逄 雍 訾 郎 农 路 骆 虞 经 裘 郁 滑 靳 闻 逄 雍
+                赵 黄 褚 凤 郝 齐 臧 熊 管 裴 荣 郗 韶 郜 黎 翟 寿 通
+                卫 葛 鲁 乐 谈 董 樊 万 诸 刘 叶 都 满 广 殴 巩 养
+                郭 钱 陈 陶 鲍 穆 郭 堆 卢 陆 龙 噪 鄂 阴 苍 燕 冀 衡 融 蒯 逯
+                蒋 魏 谢 邹 潘 滕 邬 戴 钟 蔡 缪 应 储 糜 隗 历 蒲 慕 蔚 隆 鞠 关
+                韩 萧 颜 庞 麦 双 璩 濮 聂 丰 看
+                郑 严 蓟 薄 谭 罗
+                买 蓝 蓬 怀 党 饶
+                顾 苏 龚 边 栾 权)
+FIRST_NAMES = %W(文 铭 菁 郁 怡 智 德 祥 志 华 孟 庆 雅 佩 晓 蓉 明 仁 宇 青 慧 豪 琪 安
+                 惠 宗 信 盈 君 秀 敏 伶 佳 国 荣 忠 宏 育 丽 圣 淑 彦 龙 冠 后 静 娟 子
+                 嘉 瑞 柏 弘 芳 正 玮 贞 如 凯 元 士 伟 杰 颍 霖 玲 仪 珮 英 建 政 真 珍
+                 美 世 立 秋 婷 贤 瑜 中 玉 维 莹 翔 家 芬 昌 裕 雯 萍 永 成 宜 鸿 珊 民
+                 欣 哲 良 伦 燕 梦 磊 丹 元 一 昌 红 健)
+class Name < Engine
+  def initialize
+    @word = ''
+    @last = false
+    super
+  end
+  def process(char)
+    match = false
+    word = nil
+    if !@last && LAST_NAMES.include?(char)
+        @word << char
+        match = true
+        @last = true
+    elsif @last && @word.chars.to_a.length < 3 && FIRST_NAMES.include?(char)
+        @word << char
+        match = true
+        @unit = true
+    else
+      word = @word
+      @word = ''
+      @last = false
+      match = false
+    end
+    [match, word]
+  end
+end

data/lib/engines/number.rb ADDED Viewed

@@ -0,0 +1,59 @@
+# Characters accepted as part of a number: ASCII digits, Chinese numerals and
+# a few symbols and prefixes.
+NUMBER_SYMBOLS  = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
+                  '一', '二', '三', '四', '五', '六', '七', '八', '九', '十',
+                  '零', '〇', '百', '千', '壹', '贰', '叁', '肆', '柒', '捌',
+                  '玖', '拾', '之', '%', '¥', '分', '$', '.', '点', '第', '每']
+# Characters accepted between a number and its unit.
+SUBUNIT_SYMBOLS = ['多', '公', '英', '厘', '毫', '微', '纳', '海', '平', '立',
+                   '方', '摄', '华', '氏', '美', '日', '澳', '港', '台', '新',
+                   '个', '百', '佰', '千', '仟', '万', '萬', '亿', '兆', '吉']
+# Characters accepted as a unit after a number or a sub-unit. Several also
+# appear in the two tables above.
+UNIT_SYMBOLS    = ['刻', '章', '回', '节', '名', '个', '届', '次', '集', '元',
+                   '角', '例', '人', '斤', '克', '吨', '米', '里', '升', '码',
+                   '尺', '寸', '杆', '顷', '亩', '磅', '镑', '桶', '度', '秒',
+                   '分', '卡', '焦', '瓦', '匹', '圆', '币', '年', '月', '日',
+                   '时', '秒', '点', '百', '佰', '仟', '千', '万', '萬', '亿',
+                   '兆', '吉', '块', '半', '岁', '家', '所', '期', '场', '投',
+                   '中', '辆', '只', '头']
+# Engine that groups numbers together with their sub-unit and unit characters.
+class Number < Engine
+  def initialize
+    @word = ''       # text confirmed so far (number plus unit)
+    @number = ''     # number/sub-unit characters not yet followed by a unit
+    @unit = false    # set when a unit character has been accepted
+    @subunit = false # set when a sub-unit character has been accepted
+    super
+  end
+  # Feeds one character to the matcher.
+  #
+  # Returns [match, word]. match is true and word is nil while +char+ is
+  # accepted. Otherwise match is false, word is @word when it is not empty or
+  # else @number, and all state is reset.
+  def process(char)
+    match = false
+    word = nil
+    # A number symbol is accepted unless a sub-unit is pending without a unit;
+    # a sub-unit symbol is accepted once a number is pending or a unit was seen.
+    if (!@subunit || @unit) && NUMBER_SYMBOLS.include?(char)
+      @number << char
+      match = true
+      @unit = false
+      @subunit = false
+    elsif (@number != '' || @unit) && SUBUNIT_SYMBOLS.include?(char)
+      @number << char
+      match = true
+      @subunit = true
+    end
+    # Checked independently of the branches above because the tables overlap:
+    # a unit symbol moves the pending number into @word. The character itself
+    # is appended only when it was not already added to @number above.
+    if (@number != '' || @subunit) && UNIT_SYMBOLS.include?(char)
+      @word << @number
+      @word << char if !match
+      @number = ''
+      @unit = true
+      match = true
+    end
+    # Nothing accepted: emit what was collected and start over.
+    if !match
+      word = (@word != '') ? @word : @number
+      @word = ''
+      @number = ''
+      match = false
+      @unit = false
+      @subunit = false
+    end
+    [match, word]
+  end
+end

data/lib/filters/conjunction.rb ADDED Viewed

@@ -0,0 +1,7 @@
+CONJUNCTIONS = %W(给 的 说 对 在 和 是 被 最 所 那 由 这 有 将 你 会 与 他 为 不 没 很 了 啊 哦 呵 把 去 从)
+class Conjunction
+  def self.filter(char)
+    CONJUNCTIONS.include?(char) ? :conjunction : char
+  end
+end

data/lib/filters/fullwidth.rb ADDED Viewed

@@ -0,0 +1,17 @@
+class Fullwidth
+  FULLWIDTH_CHARS = {'１' => '1', '２' => '2', '３' => '3', '４' => '4', '５' => '5', '６' => '6', '７' => '7', '８' => '8',
+                     '９' => '9', '０' => '0', 'ａ' => 'a', 'ｂ' => 'b', 'ｃ' => 'c', 'ｄ' => 'd', 'ｅ' => 'e', 'ｆ' => 'f',
+                     'ｇ' => 'g', 'ｈ' => 'h', 'ｉ' => 'i', 'ｊ' => 'j', 'ｋ' => 'k', 'ｌ' => 'l', 'ｍ' => 'm', 'ｎ' => 'n',
+                     'ｏ' => 'o', 'ｐ' => 'p', 'ｑ' => 'q', 'ｒ' => 'r', 'ｓ' => 's', 'ｔ' => 't', 'ｕ' => 'u', 'ｖ' => 'v',
+                     'ｗ' => 'w', 'ｘ' => 'x', 'ｙ' => 'y', 'ｚ' => 'z', 'Ａ' => 'A', 'Ｂ' => 'B', 'Ｃ' => 'C', 'Ｄ' => 'D',
+                     'Ｅ' => 'E', 'Ｆ' => 'F', 'Ｇ' => 'G', 'Ｈ' => 'H', 'Ｉ' => 'I', 'Ｊ' => 'J', 'Ｋ' => 'K', 'Ｌ' => 'L',
+                     'Ｍ' => 'M', 'Ｎ' => 'N', 'Ｏ' => 'O', 'Ｐ' => 'P', 'Ｑ' => 'Q', 'Ｒ' => 'R', 'Ｓ' => 'S', 'Ｔ' => 'T',
+                     'Ｕ' => 'U', 'Ｖ' => 'V', 'Ｗ' => 'W', 'Ｘ' => 'X', 'Ｙ' => 'Y', 'Ｚ' => 'Z', '－' => '-', '＋' => '+',
+                     '—' => '-', '，' => ',', '／' => '/', '·' => '.'}
+  class << self
+    def filter(char)
+      FULLWIDTH_CHARS[char].nil? ? char : FULLWIDTH_CHARS[char]
+    end
+  end
+end

data/lib/filters/symbol.rb ADDED Viewed

@@ -0,0 +1,13 @@
+SEPARATORS = ['｀', '［', '］', '、', '＝', '‘', '；', '。', '｜', '？', '》',
+              '《', '：', '“', '｛', '｝', '）', '（', '＊', '…', '＃', '！',
+              '～', '’', '”', '〕', '〈', '〉', '「', '」', '『', '』', '〖', '〗',
+              '【', '】', '＜', '＞', '`', '~', '!', '@', '#', '^',
+              '&', '*', '\\', '(', ')', '=', '{', '}', '[', ']',
+              '|', ';', ':', "'", '<', '>', '?', "\n", "\t", "\r",
+              ' ', '-', '/', '+', ',', '　']
+class Symbol
+  def self.filter(char)
+    SEPARATORS.include?(char) ? :symbol : char
+  end
+end

data/lib/rseg.rb ADDED Viewed

@@ -0,0 +1,112 @@
+$KCODE = 'UTF8'
+require File.join(File.dirname(__FILE__), 'engines/engine')
+require File.join(File.dirname(__FILE__), 'engines/dict')
+require File.join(File.dirname(__FILE__), 'engines/english')
+require File.join(File.dirname(__FILE__), 'engines/number')
+require File.join(File.dirname(__FILE__), 'engines/name')
+require File.join(File.dirname(__FILE__), 'filters/fullwidth')
+require File.join(File.dirname(__FILE__), 'filters/symbol')
+require File.join(File.dirname(__FILE__), 'filters/conjunction')
+class Rseg
+  @@engines = nil
+  @@segment = nil
+  @@filters = nil
+  class << self
+    def segment(input)
+      @@segment ||= Rseg.new(input)
+      @@segment.segment
+    end
+  end
+  def initialize(input)
+    @input = input
+    @words = []
+    init_engines
+    init_filters
+  end
+  def segment
+    @words.clear
+    @input.chars.each do |origin|
+      char = filter(origin)
+      process(char, origin)
+    end
+    process(:symbol, '')
+    @words
+  end
+  private
+  def filter(char)
+    result = char
+    @@filters.each do |klass|
+      result = klass.filter(result)
+    end
+    result
+  end
+  def process(char, origin)
+    nomatch = true
+    word = ''
+    engines.each do |engine|
+      next unless engine.running?
+      match, word = engine.process(char)
+      match ? nomatch = false : engine.stop
+    end
+    if nomatch
+      if word == ''
+        @words << origin unless char == :symbol
+        reset_engines
+      else
+        reset_engines
+        @words << word if word.is_a?(String)
+        reprocess(word) if word.is_a?(Array)
+        # re-process current char
+        process(char, origin)
+      end
+    end
+  end
+  def reprocess(word)
+    last = word.pop
+    word.each do |char|
+      process(char, char)
+    end
+    process(:symbol, :symbol)
+    process(last, last)
+  end
+  def reset_engines
+    engines.each do |engine|
+      engine.run
+    end
+  end
+  def engines=(engines)
+    @@engines ||= engines
+  end
+  def engines
+    @@engines
+  end
+  def init_filters
+    @@filters = [Fullwidth, Symbol]
+  end
+  def init_engines
+    @@engines ||= [Dict, English, Number, Name].map do |engine_klass|
+      engine_klass.new
+    end
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,79 @@
+--- !ruby/object:Gem::Specification
+name: rseg
+version: !ruby/object:Gem::Version
+  version: 0.1.1
+platform: ruby
+authors:
+- Yuanyi Zhang
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2009-11-29 00:00:00 +08:00
+default_executable: rseg
+dependencies: []
+description: "A Chinese Word Segmentation(\xE4\xB8\xAD\xE6\x96\x87\xE5\x88\x86\xE8\xAF\x8D) routine in pure Ruby"
+email: zhangyuanyi@gmail.com
+executables:
+- rseg
+extensions: []
+extra_rdoc_files:
+- LICENSE
+- README
+files:
+- .gitignore
+- LICENSE
+- README
+- Rakefile
+- VERSION
+- bin/rseg
+- dict/dict.hash
+- lib/builder/dict.rb
+- lib/engines/dict.rb
+- lib/engines/engine.rb
+- lib/engines/english.rb
+- lib/engines/name.rb
+- lib/engines/number.rb
+- lib/filters/conjunction.rb
+- lib/filters/fullwidth.rb
+- lib/filters/symbol.rb
+- lib/rseg.rb
+has_rdoc: true
+homepage: http://github.com/yzhang/rseg
+licenses: []
+post_install_message:
+rdoc_options:
+- --charset=UTF-8
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements: []
+rubyforge_project:
+rubygems_version: 1.3.5
+signing_key:
+specification_version: 3
+summary: "A Chinese Word Segmentation(\xE4\xB8\xAD\xE6\x96\x87\xE5\x88\x86\xE8\xAF\x8D) routine in pure Ruby"
+test_files:
+- test/test_auto.rb
+- test/test_bench.rb
+- test/test_ent.rb
+- test/test_finance.rb
+- test/test_news.rb
+- test/test_sport.rb
+- test/test_tech.rb
+- test/test_web.rb