rseg 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,3 @@
1
+ test
2
+ pkg
3
+ .DS_Store
data/LICENSE ADDED
@@ -0,0 +1,30 @@
1
+ Rseg includes two built-in dictionaries:
2
+
3
+ * CC-CEDICT (http://cc-cedict.org/wiki/): Creative Commons Attribution-Share Alike 3.0 License (http://creativecommons.org/licenses/by-sa/3.0/)
4
+ * Wikipedia Chinese article title list (http://download.wikimedia.org/zhwiki/): Creative Commons Attribution-Share Alike 3.0 License (http://creativecommons.org/licenses/by-sa/3.0/)
5
+
6
+ The codes and others in Rseg are licensed under MIT license:
7
+
8
+ ===============================
9
+ Copyright (c) 2009 Yuanyi Zhang
10
+
11
+ Permission is hereby granted, free of charge, to any person
12
+ obtaining a copy of this software and associated documentation
13
+ files (the "Software"), to deal in the Software without
14
+ restriction, including without limitation the rights to use,
15
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
16
+ copies of the Software, and to permit persons to whom the
17
+ Software is furnished to do so, subject to the following
18
+ conditions:
19
+
20
+ The above copyright notice and this permission notice shall be
21
+ included in all copies or substantial portions of the Software.
22
+
23
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
25
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
27
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
28
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
29
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
30
+ OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,35 @@
1
+ Introduction
2
+ ========
3
+ Rseg is a Chinese word segmentation (中文分词) routine written in pure Ruby.
4
+
5
+ The algorithm is based on this article: http://xiecc.blog.163.com/blog/static/14032200671110224190/
6
+
7
+ Usage
8
+ ========
9
+
10
+ It's very easy to use:
11
+
12
+ > require 'rubygems'
13
+ > require 'rseg'
14
+ > Rseg.segment("需要分词的文章")
15
+ ['需要', '分词', '的', '文章']
16
+
17
+ The first call to Rseg.segment takes about 30 seconds because it has to load the dictionary; subsequent calls are very fast.
18
+
19
+ Performance
20
+ ========
21
+ About 5M characters/s on my MacBook (Intel Core 2 Duo 2GHz, 4G memory).
22
+
23
+ License
24
+ ========
25
+
26
+ Rseg includes two built-in dictionaries:
27
+
28
+ * CC-CEDICT (http://cc-cedict.org/wiki/) with Creative Commons Attribution-Share Alike 3.0 License (http://creativecommons.org/licenses/by-sa/3.0/)
29
+ * Wikipedia Chinese article title list (http://download.wikimedia.org/zhwiki/) with Creative Commons Attribution-Share Alike 3.0 License (http://creativecommons.org/licenses/by-sa/3.0/)
30
+
31
+ The code and other materials in Rseg are licensed under the MIT license.
32
+
33
+ Feedback
34
+ ========
35
+ All feedback is welcome: Yuanyi Zhang (zhangyuanyi#gmail.com)
data/Rakefile ADDED
@@ -0,0 +1,19 @@
1
# Build configuration for the rseg gem: declares the gem specification via
# Jeweler when that gem is installed.
require 'rake'
require 'rake/testtask'

# rcov is a third-party gem and nothing in this Rakefile uses it, so a missing
# installation must not abort every rake invocation.
begin
  require 'rcov/rcovtask'
rescue LoadError
  # Coverage tooling is simply unavailable; carry on without it.
end

begin
  require 'jeweler'
  Jeweler::Tasks.new do |s|
    s.name = "rseg"
    s.executables = "rseg"
    s.summary = "A Chinese Word Segmentation(中文分词) routine in pure Ruby"
    s.email = "zhangyuanyi@gmail.com"
    s.homepage = "http://github.com/yzhang/rseg"
    s.description = "A Chinese Word Segmentation(中文分词) routine in pure Ruby"
    s.authors = ["Yuanyi Zhang"]
    # Ship top-level docs, the executable, the library and the prebuilt
    # dictionary hash.
    s.files = FileList["[A-Z]*", "{bin,lib}/**/*", '.gitignore', 'dict/dict.hash']
  end
rescue LoadError
  puts "Jeweler, or one of its dependencies, is not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.1
data/bin/rseg ADDED
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ puts 'Coming soon'
data/dict/dict.hash ADDED
Binary file
@@ -0,0 +1,42 @@
1
+ $KCODE = 'UTF8'
2
+
3
# Reads a word list (one word per line) and merges every word into +tree+,
# a nested-hash trie keyed by single characters.
#
# path - path of the word-list file to read.
# tree - Hash acting as the trie root; it is mutated in place.
#
# The node reached by a word's last character gets +:end => true+.
# Blank lines contribute nothing and are skipped (previously they raised
# NoMethodError because no node had been visited yet).
def process(path, tree)
  File.open(path, 'r') do |file|
    file.each_line do |line|
      node = nil
      line.chars.each do |c|
        next if c == "\n" || c == "\r"
        # The first character hangs off the root, later ones off the
        # previously visited node.
        parent = node || tree
        parent[c] ||= {}
        node = parent[c]
      end
      # node is still nil when the line held only line terminators.
      node[:end] = true if node
    end
  end
end
21
+
22
# Merges every bundled word list into one trie and serializes it with
# Marshal to the location returned by hash_path, printing progress to
# standard output.
def build
  trie = {}

  ['cedict.zh_CN.utf8', 'wikipedia.zh.utf8'].each do |name|
    puts "Processing #{name}..."
    source = File.join(File.dirname(__FILE__), '../../dict', name)
    process(source, trie)
  end

  File.open(hash_path, "wb") do |out|
    Marshal.dump(trie, out)
  end
  puts 'Done'
end
36
+
37
+ def hash_path
38
+ File.join(File.dirname(__FILE__), '../../dict/dict.hash')
39
+ end
40
+
41
+ build
42
+
@@ -0,0 +1,42 @@
1
# Dictionary engine: walks a character trie (nested hashes loaded from
# dict.hash) one character at a time and reports the buffered text once the
# walk cannot continue.
class Dict < Engine
  # Trie root shared by every instance so the file is loaded only once.
  @@root = nil

  def initialize
    @@root ||= load_dict(dict_path)
    @word = ''
    @node = @@root
    super
  end

  # Feeds one character to the engine.
  #
  # Returns [true, nil] while +char+ extends the current trie path.
  # Otherwise returns [false, result] and resets the walk, where result is
  # the buffered String when the path ended on a node flagged :end or holds
  # exactly one character, and an Array of the buffered characters in every
  # other case.
  def process(char)
    child = @node[char]
    if child
      @word << char
      @node = child
      return [true, nil]
    end

    letters = @word.chars.to_a
    result = (@node[:end] || letters.length == 1) ? @word : letters

    @node = @@root
    @word = ''
    [false, result]
  end

  private

  # Deserializes the trie. Marshal.load is only appropriate for trusted
  # files such as the dictionary referenced by dict_path.
  def load_dict(path)
    File.open(path, "rb") do |io|
      Marshal.load(io)
    end
  end

  # Path of the serialized dictionary relative to this source file.
  def dict_path
    here = File.dirname(__FILE__)
    File.join(here, '../../dict/dict.hash')
  end
end
@@ -0,0 +1,17 @@
1
# Base class for segmentation engines: it only tracks whether the engine is
# currently taking part in matching.
class Engine
  # A new engine starts out running.
  def initialize
    @running = true
  end

  # Whether the engine is currently active.
  def running?
    @running
  end

  # Re-activates the engine; returns true.
  def run
    @running = true
  end

  # Deactivates the engine; returns false.
  def stop
    @running = false
  end
end
@@ -0,0 +1,24 @@
1
# Every ASCII letter, lower case first.
LETTER_SYMBOLS = ('a'..'z').to_a.concat(('A'..'Z').to_a)

# Engine that collects runs of ASCII letters into a single word.
class English < Engine
  def initialize
    @word = ''
    super
  end

  # Feeds one character to the engine.
  #
  # Returns [true, nil] when +char+ is an ASCII letter (it is appended to the
  # buffer); otherwise returns [false, buffered_text] and empties the buffer.
  def process(char)
    if LETTER_SYMBOLS.include?(char)
      @word << char
      return [true, nil]
    end

    finished = @word
    @word = ''
    [false, finished]
  end
end
@@ -0,0 +1,52 @@
1
+ LAST_NAMES = %W(丁 卜 刁 七 弓 干 于 王 尤 孔 方 申 白 甘 田 包 石 左 平 司 皮 史 池 艾 年 匡 充 江 印
2
+ 促 伊 伍 安 任 米 促 牟 向 吉 成 伏 吕 李 吴 沈 何 贝 狄 祁 杜 汪 阮 邢 汲 别 辛 冷 利 沃 谷
3
+ 扶 步 那 沙 周 金 吕 花 孟 和 邵 房 抗 灰 明 屈 松 牧 宓 武 幸 卓 易 尚 邰 空 竺 岳 东 林
4
+ 施 姜 俞 查 封 秋 帅 祖 羿 柯 茅 柳 姚 纪 宣 咸 库 侯 洪 胡 哈 宣 郁 祝 苗 禹 娄
5
+ 秦 奚 倪 度 凌 宰 宦 师 徐 翁 班 马 时 晃 乌 夏 贡 柴 能 家 宫 敖 索 晏 桑 高 凌 桂 容 姬 劳 桑 桂 袁 时 祝 席 徐 高 夏 凌 洪 翁 家 芮 乌 祖 索 贡
6
+ 许 张 曹 戚 梅 屠 盛 崖 章 鱼 国 商 扈 寇 终 冯 苗 康 常 茅 闵 麻 胡 崔 邢 条 符 宿 堵 浦 习 鱼
7
+ 梁 富 曾 程 项 钮 舒 彭 费 童 云 喻 嵇 范 费 贺 毕 付 黄 邵 祁 阮 强 童 邱 解 贲 单 富 钮 荀 惠 邴 焦 班 甯 钭 景 邰 劳 茹 寇 荆
8
+ 莫 际 景 须 杨 詹 郎 雷 贾 路 骆 虞 经 裘 郁 滑 甄 靳 詹 闻 逄 雍 訾 郎 农 路 骆 虞 经 裘 郁 滑 靳 闻 逄 雍
9
+ 赵 黄 褚 凤 郝 齐 臧 熊 管 裴 荣 郗 韶 郜 黎 翟 寿 通
10
+ 卫 葛 鲁 乐 谈 董 樊 万 诸 刘 叶 都 满 广 殴 巩 养
11
+ 郭 钱 陈 陶 鲍 穆 郭 堆 卢 陆 龙 噪 鄂 阴 苍 燕 冀 衡 融 蒯 逯
12
+ 蒋 魏 谢 邹 潘 滕 邬 戴 钟 蔡 缪 应 储 糜 隗 历 蒲 慕 蔚 隆 鞠 关
13
+ 韩 萧 颜 庞 麦 双 璩 濮 聂 丰 看
14
+ 郑 严 蓟 薄 谭 罗
15
+ 买 蓝 蓬 怀 党 饶
16
+ 顾 苏 龚 边 栾 权)
17
+
18
+ FIRST_NAMES = %W(文 铭 菁 郁 怡 智 德 祥 志 华 孟 庆 雅 佩 晓 蓉 明 仁 宇 青 慧 豪 琪 安
19
+ 惠 宗 信 盈 君 秀 敏 伶 佳 国 荣 忠 宏 育 丽 圣 淑 彦 龙 冠 后 静 娟 子
20
+ 嘉 瑞 柏 弘 芳 正 玮 贞 如 凯 元 士 伟 杰 颍 霖 玲 仪 珮 英 建 政 真 珍
21
+ 美 世 立 秋 婷 贤 瑜 中 玉 维 莹 翔 家 芬 昌 裕 雯 萍 永 成 宜 鸿 珊 民
22
+ 欣 哲 良 伦 燕 梦 磊 丹 元 一 昌 红 健)
23
+
24
# Engine that recognizes short personal names: one character from LAST_NAMES
# followed by characters from FIRST_NAMES, at most three characters in total.
class Name < Engine
  def initialize
    @word = ''
    # True once a family-name character has been accepted.
    @last = false
    super
  end

  # Feeds one character to the engine.
  #
  # Returns [true, nil] when +char+ starts a name (family-name character) or
  # extends one (given-name character while fewer than three characters are
  # buffered). Otherwise returns [false, buffered_text] and resets the state.
  #
  # The original also assigned an @unit flag while extending a name; nothing
  # in this class reads it, so the assignment was dropped.
  def process(char)
    if !@last && LAST_NAMES.include?(char)
      @word << char
      @last = true
      return [true, nil]
    end

    if @last && @word.chars.to_a.length < 3 && FIRST_NAMES.include?(char)
      @word << char
      return [true, nil]
    end

    finished = @word
    @word = ''
    @last = false
    [false, finished]
  end
end
@@ -0,0 +1,59 @@
1
+ NUMBER_SYMBOLS = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
2
+ '一', '二', '三', '四', '五', '六', '七', '八', '九', '十',
3
+ '零', '〇', '百', '千', '壹', '贰', '叁', '肆', '柒', '捌',
4
+ '玖', '拾', '之', '%', '¥', '分', '$', '.', '点', '第', '每']
5
+ SUBUNIT_SYMBOLS = ['多', '公', '英', '厘', '毫', '微', '纳', '海', '平', '立',
6
+ '方', '摄', '华', '氏', '美', '日', '澳', '港', '台', '新',
7
+ '个', '百', '佰', '千', '仟', '万', '萬', '亿', '兆', '吉']
8
+ UNIT_SYMBOLS = ['刻', '章', '回', '节', '名', '个', '届', '次', '集', '元',
9
+ '角', '例', '人', '斤', '克', '吨', '米', '里', '升', '码',
10
+ '尺', '寸', '杆', '顷', '亩', '磅', '镑', '桶', '度', '秒',
11
+ '分', '卡', '焦', '瓦', '匹', '圆', '币', '年', '月', '日',
12
+ '时', '秒', '点', '百', '佰', '仟', '千', '万', '萬', '亿',
13
+ '兆', '吉', '块', '半', '岁', '家', '所', '期', '场', '投',
14
+ '中', '辆', '只', '头']
15
+
16
# Engine that groups number symbols with sub-unit and unit symbols, as listed
# in NUMBER_SYMBOLS, SUBUNIT_SYMBOLS and UNIT_SYMBOLS.
class Number < Engine
  def initialize
    @word = ''        # text already closed off by a unit symbol
    @number = ''      # number/sub-unit symbols not yet followed by a unit
    @unit = false     # set when the latest accepted character was a unit
    @subunit = false  # set when the latest accepted character was a sub-unit
    super
  end

  # Feeds one character to the engine.
  #
  # Returns [true, nil] when +char+ was accepted. Otherwise returns
  # [false, text] where text is the unit-terminated buffer when it is
  # non-empty and the pending number buffer otherwise; all state is reset.
  def process(char)
    accepted = take_number_or_subunit(char)
    # The unit check runs after, and sees the state left by, the step above.
    accepted = true if take_unit(char, accepted)

    accepted ? [true, nil] : [false, flush]
  end

  private

  # Appends +char+ to the pending number when it is an admissible number or
  # sub-unit symbol and updates the flags; returns whether it was taken.
  def take_number_or_subunit(char)
    if (@unit || !@subunit) && NUMBER_SYMBOLS.include?(char)
      @unit = false
      @subunit = false
    elsif (@unit || @number != '') && SUBUNIT_SYMBOLS.include?(char)
      @subunit = true
    else
      return false
    end

    @number << char
    true
  end

  # Closes the pending number with a unit symbol; returns whether it applied.
  # +already_taken+ tells whether +char+ is already part of @number, in which
  # case it must not be appended a second time.
  def take_unit(char, already_taken)
    return false unless (@number != '' || @subunit) && UNIT_SYMBOLS.include?(char)

    @word << @number
    @word << char unless already_taken
    @number = ''
    @unit = true
    true
  end

  # Hands back the collected text and returns the engine to its initial state.
  def flush
    finished = (@word != '') ? @word : @number
    @word = ''
    @number = ''
    @unit = false
    @subunit = false
    finished
  end
end
@@ -0,0 +1,7 @@
1
# Single-character function words that the Conjunction filter collapses.
CONJUNCTIONS = %w(给 的 说 对 在 和 是 被 最 所 那 由 这 有 将 你 会 与 他 为 不 没 很 了 啊 哦 呵 把 去 从)

# Filter that replaces any character listed in CONJUNCTIONS with the
# :conjunction marker and passes everything else through untouched.
class Conjunction
  class << self
    def filter(char)
      return :conjunction if CONJUNCTIONS.include?(char)

      char
    end
  end
end
@@ -0,0 +1,17 @@
1
# Filter that folds fullwidth digits, letters and a few punctuation marks to
# their ASCII counterparts; every other character passes through unchanged.
class Fullwidth
  # Distance between an ASCII code point (U+0021..U+007E) and its fullwidth
  # form (U+FF01..U+FF5E).
  FULLWIDTH_OFFSET = 0xFEE0

  # ASCII characters whose fullwidth forms are folded.
  HALFWIDTH_TARGETS = ('1'..'9').to_a + ['0'] + ('a'..'z').to_a + ('A'..'Z').to_a +
                      ['-', '+', ',', '/']

  # Maps each recognized wide character to its replacement. The keys are
  # derived from code points so the table cannot silently degrade into an
  # identity mapping when this source file is width-normalized. The em dash
  # and middle dot have no fullwidth relationship and are listed explicitly.
  FULLWIDTH_CHARS = {'—' => '-', '·' => '.'}
  HALFWIDTH_TARGETS.each do |ascii|
    wide = [ascii.unpack('U').first + FULLWIDTH_OFFSET].pack('U')
    FULLWIDTH_CHARS[wide] = ascii
  end

  class << self
    # Returns the ASCII replacement for +char+, or +char+ itself when it is
    # not in the table.
    def filter(char)
      FULLWIDTH_CHARS.fetch(char, char)
    end
  end
end
@@ -0,0 +1,13 @@
1
# Characters treated as word separators. ASCII punctuation is listed once and
# its fullwidth forms (code point + 0xFEE0) are derived from it, so both
# widths are recognized even if this source file gets width-normalized; the
# ideographic space (U+3000) is added next to the ASCII whitespace.
SEPARATORS = begin
  ascii = ['`', '~', '!', '@', '#', '^', '&', '*', '\\', '(', ')', '=',
           '{', '}', '[', ']', '|', ';', ':', "'", '<', '>', '?', '-',
           '/', '+', ',']
  cjk = ['、', '‘', '。', '》', '《', '“', '…', '’', '”', '〕', '〈', '〉',
         '「', '」', '『', '』', '〖', '〗', '【', '】']
  whitespace = ["\n", "\t", "\r", ' ', [0x3000].pack('U')]
  fullwidth = ascii.map { |c| [c.unpack('U').first + 0xFEE0].pack('U') }

  (cjk + fullwidth + ascii + whitespace).uniq
end

# Separator filter. NOTE: this reopens Ruby's core Symbol class to attach a
# class-level +filter+; the name is kept because Rseg#init_filters refers to
# the filter as +Symbol+.
class Symbol
  # Returns :symbol when +char+ is a separator, otherwise +char+ unchanged.
  def self.filter(char)
    SEPARATORS.include?(char) ? :symbol : char
  end
end
data/lib/rseg.rb ADDED
@@ -0,0 +1,112 @@
1
+ $KCODE = 'UTF8'
2
+
3
+ require File.join(File.dirname(__FILE__), 'engines/engine')
4
+ require File.join(File.dirname(__FILE__), 'engines/dict')
5
+ require File.join(File.dirname(__FILE__), 'engines/english')
6
+ require File.join(File.dirname(__FILE__), 'engines/number')
7
+ require File.join(File.dirname(__FILE__), 'engines/name')
8
+
9
+ require File.join(File.dirname(__FILE__), 'filters/fullwidth')
10
+ require File.join(File.dirname(__FILE__), 'filters/symbol')
11
+ require File.join(File.dirname(__FILE__), 'filters/conjunction')
12
+
13
# Chinese word segmenter. Every input character is passed through the filters
# and then offered to all running engines; a word is emitted once no engine
# can continue with the current character.
class Rseg
  # Engine instances are shared by all segmenters so they are built only once.
  @@engines = nil
  @@filters = nil

  class << self
    # Segments +input+ and returns an Array of words.
    #
    # A fresh segmenter is created for every call. The previous version
    # memoized the first instance together with its input, so later calls
    # re-segmented the first text and handed out the same mutable array.
    # The engines stay cached in @@engines, so repeated calls do not
    # rebuild them.
    def segment(input)
      new(input).segment
    end
  end

  # input - the String to segment.
  def initialize(input)
    @input = input
    @words = []
    init_engines
    init_filters
  end

  # Segments the text given to the constructor and returns the word list.
  def segment
    @words.clear

    @input.chars.each do |origin|
      process(filter(origin), origin)
    end

    # A trailing separator flushes whatever the engines still buffer.
    process(:symbol, '')
    @words
  end

  private

  # Runs +char+ through every filter in order and returns the result.
  def filter(char)
    @@filters.inject(char) { |result, klass| klass.filter(result) }
  end

  # Offers +char+ (the filtered form of +origin+) to every running engine.
  # Engines that do not match are stopped. When none matched, the result of
  # the last engine that ran decides what is emitted.
  def process(char, origin)
    nomatch = true
    word = ''

    engines.each do |engine|
      next unless engine.running?

      match, word = engine.process(char)
      if match
        nomatch = false
      else
        engine.stop
      end
    end

    return unless nomatch

    if word == ''
      # Nothing was buffered: the character stands alone (separators vanish).
      @words << origin unless char == :symbol
      reset_engines
    else
      reset_engines
      @words << word if word.is_a?(String)
      # An Array means the buffered characters formed no complete word.
      reprocess(word) if word.is_a?(Array)

      # re-process current char
      process(char, origin)
    end
  end

  # Re-runs the characters of an unfinished match: all but the last one are
  # processed and flushed with a separator, then the last one is processed
  # on its own.
  def reprocess(word)
    last = word.pop

    word.each do |char|
      process(char, char)
    end

    process(:symbol, :symbol)
    process(last, last)
  end

  # Puts every engine back into the running state.
  def reset_engines
    engines.each { |engine| engine.run }
  end

  def engines=(engines)
    @@engines ||= engines
  end

  def engines
    @@engines
  end

  def init_filters
    @@filters = [Fullwidth, Symbol]
  end

  def init_engines
    @@engines ||= [Dict, English, Number, Name].map do |engine_klass|
      engine_klass.new
    end
  end
end
metadata ADDED
@@ -0,0 +1,79 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rseg
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Yuanyi Zhang
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-11-29 00:00:00 +08:00
13
+ default_executable: rseg
14
+ dependencies: []
15
+
16
+ description: "A Chinese Word Segmentation(\xE4\xB8\xAD\xE6\x96\x87\xE5\x88\x86\xE8\xAF\x8D) routine in pure Ruby"
17
+ email: zhangyuanyi@gmail.com
18
+ executables:
19
+ - rseg
20
+ extensions: []
21
+
22
+ extra_rdoc_files:
23
+ - LICENSE
24
+ - README
25
+ files:
26
+ - .gitignore
27
+ - LICENSE
28
+ - README
29
+ - Rakefile
30
+ - VERSION
31
+ - bin/rseg
32
+ - dict/dict.hash
33
+ - lib/builder/dict.rb
34
+ - lib/engines/dict.rb
35
+ - lib/engines/engine.rb
36
+ - lib/engines/english.rb
37
+ - lib/engines/name.rb
38
+ - lib/engines/number.rb
39
+ - lib/filters/conjunction.rb
40
+ - lib/filters/fullwidth.rb
41
+ - lib/filters/symbol.rb
42
+ - lib/rseg.rb
43
+ has_rdoc: true
44
+ homepage: http://github.com/yzhang/rseg
45
+ licenses: []
46
+
47
+ post_install_message:
48
+ rdoc_options:
49
+ - --charset=UTF-8
50
+ require_paths:
51
+ - lib
52
+ required_ruby_version: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - ">="
55
+ - !ruby/object:Gem::Version
56
+ version: "0"
57
+ version:
58
+ required_rubygems_version: !ruby/object:Gem::Requirement
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ version: "0"
63
+ version:
64
+ requirements: []
65
+
66
+ rubyforge_project:
67
+ rubygems_version: 1.3.5
68
+ signing_key:
69
+ specification_version: 3
70
+ summary: "A Chinese Word Segmentation(\xE4\xB8\xAD\xE6\x96\x87\xE5\x88\x86\xE8\xAF\x8D) routine in pure Ruby"
71
+ test_files:
72
+ - test/test_auto.rb
73
+ - test/test_bench.rb
74
+ - test/test_ent.rb
75
+ - test/test_finance.rb
76
+ - test/test_news.rb
77
+ - test/test_sport.rb
78
+ - test/test_tech.rb
79
+ - test/test_web.rb