rseg 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,3 @@
1
+ test
2
+ pkg
3
+ .DS_Store
data/LICENSE ADDED
@@ -0,0 +1,30 @@
1
+ Rseg includes two built-in dictionaries:
2
+
3
+ * CC-CEDICT (http://cc-cedict.org/wiki/): Creative Commons Attribution-Share Alike 3.0 License (http://creativecommons.org/licenses/by-sa/3.0/)
4
+ * Wikipedia Chinese article title list (http://download.wikimedia.org/zhwiki/): Creative Commons Attribution-Share Alike 3.0 License (http://creativecommons.org/licenses/by-sa/3.0/)
5
+
6
+ The code and other content in Rseg are licensed under the MIT license:
7
+
8
+ ===============================
9
+ Copyright (c) 2009 Yuanyi Zhang
10
+
11
+ Permission is hereby granted, free of charge, to any person
12
+ obtaining a copy of this software and associated documentation
13
+ files (the "Software"), to deal in the Software without
14
+ restriction, including without limitation the rights to use,
15
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
16
+ copies of the Software, and to permit persons to whom the
17
+ Software is furnished to do so, subject to the following
18
+ conditions:
19
+
20
+ The above copyright notice and this permission notice shall be
21
+ included in all copies or substantial portions of the Software.
22
+
23
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
25
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
27
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
28
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
29
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
30
+ OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,35 @@
1
+ Introduction
2
+ ========
3
+ Rseg is a Chinese Word Segmentation(中文分词) routine in pure Ruby.
4
+
5
+ The algorithm is based on this article: http://xiecc.blog.163.com/blog/static/14032200671110224190/
6
+
7
+ Usage
8
+ ========
9
+
10
+ It's very easy to use:
11
+
12
+ > require 'rubygems'
13
+ > require 'rseg'
14
+ > Rseg.segment("需要分词的文章")
15
+ ['需要', '分词', '的', '文章']
16
+
17
+ The first call to Rseg.segment needs about 30 seconds to load the dictionary; subsequent calls are very fast.
18
+
19
+ Performance
20
+ ========
21
+ About 5M characters per second on my MacBook (Intel Core 2 Duo 2GHz/4G mem).
22
+
23
+ License
24
+ ========
25
+
26
+ Rseg includes two built-in dictionaries:
27
+
28
+ * CC-CEDICT (http://cc-cedict.org/wiki/) with Creative Commons Attribution-Share Alike 3.0 License (http://creativecommons.org/licenses/by-sa/3.0/)
29
+ * Wikipedia Chinese article title list (http://download.wikimedia.org/zhwiki/) with Creative Commons Attribution-Share Alike 3.0 License (http://creativecommons.org/licenses/by-sa/3.0/)
30
+
31
+ The code and other content in Rseg are licensed under the MIT license.
32
+
33
+ Feedback
34
+ ========
35
+ All feedback is welcome: Yuanyi Zhang (zhangyuanyi#gmail.com)
data/Rakefile ADDED
@@ -0,0 +1,19 @@
1
require 'rake'
require 'rake/testtask'

# Nothing in this Rakefile uses rcov, so a missing rcov gem must not abort
# every rake task; the require is guarded like the Jeweler one below.
begin
  require 'rcov/rcovtask'
rescue LoadError
  # rcov is not installed; continue without it.
end

# Gem packaging tasks come from Jeweler when it is installed.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |s|
    s.name = "rseg"
    s.executables = "rseg"
    s.summary = "A Chinese Word Segmentation(中文分词) routine in pure Ruby"
    s.email = "zhangyuanyi@gmail.com"
    s.homepage = "http://github.com/yzhang/rseg"
    s.description = "A Chinese Word Segmentation(中文分词) routine in pure Ruby"
    s.authors = ["Yuanyi Zhang"]
    s.files = FileList["[A-Z]*", "{bin,lib}/**/*", '.gitignore', 'dict/dict.hash']
  end
rescue LoadError
  puts "Jeweler, or one of its dependencies, is not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.1
data/bin/rseg ADDED
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ puts 'Coming soon'
data/dict/dict.hash ADDED
Binary file
@@ -0,0 +1,42 @@
1
+ $KCODE = 'UTF8'
2
+
3
# Adds every word of the dictionary file at +path+ (one word per line) to the
# character trie +tree+.
#
# Each trie node is a Hash keyed by single characters; the node reached by the
# last character of a word additionally gets the key :end set to true.
#
# path - path of the dictionary file to read
# tree - Hash used as the trie root; it is modified in place
def process(path, tree)
  File.open(path, 'r') do |file|
    file.each_line do |line|
      # Line terminators never become trie keys.
      word = line.delete("\r\n")
      # A blank line holds no word; without this guard it would end up
      # calling []= on nil.
      next if word.empty?

      leaf = word.chars.inject(tree) { |node, c| node[c] ||= {} }
      leaf[:end] = true
    end
  end
end
21
+
22
# Reads both bundled dictionaries into a single trie and marshals the result
# to the file named by hash_path, printing progress to stdout.
def build
  trie = {}
  dict_dir = File.join(File.dirname(__FILE__), '../../dict')

  ['cedict.zh_CN.utf8', 'wikipedia.zh.utf8'].each do |name|
    puts "Processing #{name}..."
    process(File.join(dict_dir, name), trie)
  end

  File.open(hash_path, "wb") { |out| Marshal.dump(trie, out) }
  puts 'Done'
end
36
+
37
# Path of the marshalled trie file, expressed relative to this source file.
def hash_path
  here = File.dirname(__FILE__)
  File.join(here, '../../dict/dict.hash')
end
40
+
41
+ build
42
+
@@ -0,0 +1,42 @@
1
# Dictionary engine: follows the input, one character at a time, through a
# trie of nested Hashes loaded from dict/dict.hash. A node that completes a
# dictionary word carries the key :end.
class Dict < Engine
  # Trie root, shared by all instances so the file is unmarshalled only once.
  @@root = nil

  def initialize
    @@root ||= load_dict(dict_path)
    @word = ''
    @node = @@root
    super
  end

  # Feeds one character to the engine.
  #
  # char - the next input unit (normally a one-character String)
  #
  # Returns [true, nil] while +char+ extends the current trie path.
  # Otherwise returns [false, word] and rewinds to the trie root. +word+ is
  # the buffered String when it ends on an :end node or holds at most one
  # character; otherwise it is the Array of buffered characters.
  def process(char)
    child = @node[char]
    if child
      @word << char
      @node = child
      return [true, nil]
    end

    chars = @word.chars.to_a
    # "<= 1" rather than "== 1": an empty buffer yields '' like the other
    # engines do, instead of an empty Array.
    word = (@node[:end] || chars.length <= 1) ? @word : chars

    @node = @@root
    @word = ''
    [false, word]
  end

  private

  # Marshal.load can instantiate arbitrary objects, so +path+ must only ever
  # name a trusted file (here: the dict.hash located by dict_path).
  def load_dict(path)
    File.open(path, "rb") { |io| Marshal.load(io) }
  end

  def dict_path
    File.join(File.dirname(__FILE__), '../../dict/dict.hash')
  end
end
@@ -0,0 +1,17 @@
1
# Base class of the segmentation engines: it only tracks whether the engine
# is currently switched on.
class Engine
  # A new engine starts out running.
  def initialize
    @running = true
  end

  # True while the engine is switched on.
  def running?
    @running
  end

  # Switches the engine on.
  def run
    @running = true
  end

  # Switches the engine off.
  def stop
    @running = false
  end
end
@@ -0,0 +1,24 @@
1
# The 52 unaccented ASCII letters, lower case first.
LETTER_SYMBOLS = ('a'..'z').to_a.concat(('A'..'Z').to_a)

# Engine that collects a run of consecutive ASCII letters into one word.
class English < Engine
  def initialize
    @word = ''
    super
  end

  # Returns [true, nil] when +char+ is an ASCII letter and was appended to
  # the buffer; otherwise returns [false, buffer] and starts a new, empty
  # buffer.
  def process(char)
    if LETTER_SYMBOLS.include?(char)
      @word << char
      return [true, nil]
    end

    finished = @word
    @word = ''
    [false, finished]
  end
end
@@ -0,0 +1,52 @@
1
# Single-character family names. Duplicate entries are harmless because the
# list is only used for membership tests.
LAST_NAMES = %w(丁 卜 刁 七 弓 干 于 王 尤 孔 方 申 白 甘 田 包 石 左 平 司 皮 史 池 艾 年 匡 充 江 印
                促 伊 伍 安 任 米 促 牟 向 吉 成 伏 吕 李 吴 沈 何 贝 狄 祁 杜 汪 阮 邢 汲 别 辛 冷 利 沃 谷
                扶 步 那 沙 周 金 吕 花 孟 和 邵 房 抗 灰 明 屈 松 牧 宓 武 幸 卓 易 尚 邰 空 竺 岳 东 林
                施 姜 俞 查 封 秋 帅 祖 羿 柯 茅 柳 姚 纪 宣 咸 库 侯 洪 胡 哈 宣 郁 祝 苗 禹 娄
                秦 奚 倪 度 凌 宰 宦 师 徐 翁 班 马 时 晃 乌 夏 贡 柴 能 家 宫 敖 索 晏 桑 高 凌 桂 容 姬 劳 桑 桂 袁 时 祝 席 徐 高 夏 凌 洪 翁 家 芮 乌 祖 索 贡
                许 张 曹 戚 梅 屠 盛 崖 章 鱼 国 商 扈 寇 终 冯 苗 康 常 茅 闵 麻 胡 崔 邢 条 符 宿 堵 浦 习 鱼
                梁 富 曾 程 项 钮 舒 彭 费 童 云 喻 嵇 范 费 贺 毕 付 黄 邵 祁 阮 强 童 邱 解 贲 单 富 钮 荀 惠 邴 焦 班 甯 钭 景 邰 劳 茹 寇 荆
                莫 际 景 须 杨 詹 郎 雷 贾 路 骆 虞 经 裘 郁 滑 甄 靳 詹 闻 逄 雍 訾 郎 农 路 骆 虞 经 裘 郁 滑 靳 闻 逄 雍
                赵 黄 褚 凤 郝 齐 臧 熊 管 裴 荣 郗 韶 郜 黎 翟 寿 通
                卫 葛 鲁 乐 谈 董 樊 万 诸 刘 叶 都 满 广 殴 巩 养
                郭 钱 陈 陶 鲍 穆 郭 堆 卢 陆 龙 噪 鄂 阴 苍 燕 冀 衡 融 蒯 逯
                蒋 魏 谢 邹 潘 滕 邬 戴 钟 蔡 缪 应 储 糜 隗 历 蒲 慕 蔚 隆 鞠 关
                韩 萧 颜 庞 麦 双 璩 濮 聂 丰 看
                郑 严 蓟 薄 谭 罗
                买 蓝 蓬 怀 党 饶
                顾 苏 龚 边 栾 权)

# Characters accepted after a family name as part of a given name.
FIRST_NAMES = %w(文 铭 菁 郁 怡 智 德 祥 志 华 孟 庆 雅 佩 晓 蓉 明 仁 宇 青 慧 豪 琪 安
                 惠 宗 信 盈 君 秀 敏 伶 佳 国 荣 忠 宏 育 丽 圣 淑 彦 龙 冠 后 静 娟 子
                 嘉 瑞 柏 弘 芳 正 玮 贞 如 凯 元 士 伟 杰 颍 霖 玲 仪 珮 英 建 政 真 珍
                 美 世 立 秋 婷 贤 瑜 中 玉 维 莹 翔 家 芬 昌 裕 雯 萍 永 成 宜 鸿 珊 民
                 欣 哲 良 伦 燕 梦 磊 丹 元 一 昌 红 健)

# Engine that recognises a personal name: one LAST_NAMES character followed by
# FIRST_NAMES characters, at most three characters in total.
class Name < Engine
  def initialize
    @word = ''
    # True once a family-name character has been buffered.
    @last = false
    super
  end

  # Returns [true, nil] when +char+ starts or extends a name. Otherwise
  # returns [false, buffer] (the buffer may be empty) and resets the engine.
  def process(char)
    match = false
    word = nil

    if !@last && LAST_NAMES.include?(char)
      @word << char
      match = true
      @last = true
    elsif @last && @word.chars.to_a.length < 3 && FIRST_NAMES.include?(char)
      # The length check runs before appending, which caps names at three
      # characters.
      @word << char
      match = true
    else
      word = @word
      @word = ''
      @last = false
    end

    [match, word]
  end
end
@@ -0,0 +1,59 @@
1
# Characters that may appear inside a numeric expression.
NUMBER_SYMBOLS = ('0'..'9').to_a +
                 %w(一 二 三 四 五 六 七 八 九 十
                    零 〇 百 千 壹 贰 叁 肆 柒 捌
                    玖 拾 之 % ¥ 分 $ . 点 第 每)

# Characters accepted between a number and its unit.
SUBUNIT_SYMBOLS = %w(多 公 英 厘 毫 微 纳 海 平 立
                     方 摄 华 氏 美 日 澳 港 台 新
                     个 百 佰 千 仟 万 萬 亿 兆 吉)

# Characters that close a numeric expression as its unit.
UNIT_SYMBOLS = %w(刻 章 回 节 名 个 届 次 集 元
                  角 例 人 斤 克 吨 米 里 升 码
                  尺 寸 杆 顷 亩 磅 镑 桶 度 秒
                  分 卡 焦 瓦 匹 圆 币 年 月 日
                  时 秒 点 百 佰 仟 千 万 萬 亿
                  兆 吉 块 半 岁 家 所 期 场 投
                  中 辆 只 头)

# Engine that collects numbers together with their sub-unit and unit
# characters.
class Number < Engine
  def initialize
    @word = ''       # number + unit groups accepted so far
    @number = ''     # number / sub-unit characters not yet closed by a unit
    @unit = false    # set when a unit character has just been accepted
    @subunit = false # set when a sub-unit character has just been accepted
    super
  end

  # Returns [true, nil] when +char+ was absorbed. Otherwise returns
  # [false, text] and resets the engine, where +text+ is @word if it is
  # non-empty and the pending @number otherwise.
  def process(char)
    matched = false

    if (!@subunit || @unit) && NUMBER_SYMBOLS.include?(char)
      @number << char
      matched = true
      @unit = false
      @subunit = false
    elsif (@number != '' || @unit) && SUBUNIT_SYMBOLS.include?(char)
      @number << char
      matched = true
      @subunit = true
    end

    # Deliberately a second, independent test: a character handled above may
    # also be a unit, in which case it is already part of @number.
    if (@number != '' || @subunit) && UNIT_SYMBOLS.include?(char)
      @word << @number
      @word << char unless matched
      @number = ''
      @unit = true
      matched = true
    end

    return [true, nil] if matched

    finished = (@word != '') ? @word : @number
    @word = ''
    @number = ''
    @unit = false
    @subunit = false
    [false, finished]
  end
end
@@ -0,0 +1,7 @@
1
# Single-character function words recognised by the Conjunction filter.
CONJUNCTIONS = %w(给 的 说 对 在 和 是 被 最 所 那 由 这 有 将 你 会 与 他 为 不 没 很 了 啊 哦 呵 把 去 从)

# Filter that replaces a listed function word with the marker :conjunction.
class Conjunction
  class << self
    # Returns :conjunction when +char+ is listed in CONJUNCTIONS, otherwise
    # returns +char+ unchanged.
    def filter(char)
      return :conjunction if CONJUNCTIONS.include?(char)

      char
    end
  end
end
@@ -0,0 +1,17 @@
1
# Filter that folds fullwidth digits, letters and a few punctuation marks to
# their ASCII equivalents.
class Fullwidth
  # Distance between an ASCII code point (U+0021..U+007E) and its fullwidth
  # form (U+FF01..U+FF5E).
  FULLWIDTH_OFFSET = 0xFEE0

  # ASCII characters whose fullwidth forms are folded.
  FOLDED_ASCII = ('0'..'9').to_a + ('a'..'z').to_a + ('A'..'Z').to_a + ['-', '+', ',', '/']

  # Maps fullwidth form => ASCII character. The keys are derived from code
  # points rather than written as literals, so the table cannot silently turn
  # into an identity map (ASCII => same ASCII) if the source text is ever
  # width-normalised. The em dash and the middle dot are folded as well.
  FULLWIDTH_CHARS = FOLDED_ASCII.inject({ '—' => '-', '·' => '.' }) do |table, ascii|
    table[[ascii.unpack('U').first + FULLWIDTH_OFFSET].pack('U')] = ascii
    table
  end

  class << self
    # Returns the ASCII equivalent of +char+ when it is a folded fullwidth
    # character; any other value is returned unchanged.
    def filter(char)
      FULLWIDTH_CHARS.fetch(char, char)
    end
  end
end
@@ -0,0 +1,13 @@
1
# Characters that separate words. The list holds ASCII punctuation and
# whitespace, CJK punctuation, and the fullwidth forms of the ASCII marks.
# The fullwidth forms (U+FF01..U+FF5E) and the ideographic space (U+3000) are
# derived from code points so they cannot be lost to width normalisation of
# the source text.
SEPARATORS = begin
  ascii_marks = ['`', '~', '!', '@', '#', '^', '&', '*', '\\', '(', ')', '=',
                 '{', '}', '[', ']', '|', ';', ':', "'", '<', '>', '?',
                 '-', '/', '+', ',']
  whitespace = ["\n", "\t", "\r", ' ', [0x3000].pack('U')]
  cjk_marks = ['、', '‘', '。', '》', '《', '“', '…', '’', '”', '〕', '〈', '〉',
               '「', '」', '『', '』', '〖', '〗', '【', '】']
  fullwidth_marks = ascii_marks.map { |mark| [mark.unpack('U').first + 0xFEE0].pack('U') }

  (ascii_marks + whitespace + cjk_marks + fullwidth_marks).uniq
end

# Filter that replaces separator characters with the marker :symbol.
#
# NOTE(review): this reopens Ruby's core Symbol class in order to add a
# class-level +filter+. The name is kept because Rseg registers the filter as
# +Symbol+.
class Symbol
  # Returns :symbol when +char+ is listed in SEPARATORS, otherwise returns
  # +char+ unchanged.
  def self.filter(char)
    SEPARATORS.include?(char) ? :symbol : char
  end
end
data/lib/rseg.rb ADDED
@@ -0,0 +1,112 @@
1
+ $KCODE = 'UTF8'
2
+
3
+ require File.join(File.dirname(__FILE__), 'engines/engine')
4
+ require File.join(File.dirname(__FILE__), 'engines/dict')
5
+ require File.join(File.dirname(__FILE__), 'engines/english')
6
+ require File.join(File.dirname(__FILE__), 'engines/number')
7
+ require File.join(File.dirname(__FILE__), 'engines/name')
8
+
9
+ require File.join(File.dirname(__FILE__), 'filters/fullwidth')
10
+ require File.join(File.dirname(__FILE__), 'filters/symbol')
11
+ require File.join(File.dirname(__FILE__), 'filters/conjunction')
12
+
13
# Word segmenter. Every input character first passes through the filters and
# is then offered to all running engines. An engine that does not match is
# stopped; once no engine matches, the word handed back by the engines is
# emitted and all engines are restarted.
class Rseg
  # Engine instances, created once and shared by every Rseg object.
  @@engines = nil
  @@filters = nil

  class << self
    # Segments +input+ and returns the Array of words found in it.
    #
    # A fresh Rseg is built per call so that each call segments its own
    # input; the engines (and the dictionary behind them) stay cached in
    # @@engines, so this does not reload anything.
    def segment(input)
      new(input).segment
    end
  end

  # input - the String to segment
  def initialize(input)
    @input = input
    @words = []
    init_engines
    init_filters
  end

  # Segments the input given to the constructor and returns the words as a
  # new Array, so results returned by earlier calls are not modified.
  def segment
    @words = []

    @input.chars.each do |origin|
      process(filter(origin), origin)
    end

    # A trailing separator flushes whatever the engines still hold.
    process(:symbol, '')
    @words
  end

  private

  # Runs +char+ through every filter in order and returns the result: either
  # a (possibly replaced) character or a marker such as :symbol.
  def filter(char)
    @@filters.inject(char) { |result, klass| klass.filter(result) }
  end

  # Offers the filtered +char+ to every running engine.
  #
  # char   - filtered character or marker
  # origin - the unfiltered character, emitted as-is when nothing matches
  def process(char, origin)
    nomatch = true
    word = ''

    engines.each do |engine|
      next unless engine.running?
      # +word+ is overwritten on each iteration, so the value from the last
      # running engine in the list is the one that survives.
      match, word = engine.process(char)
      match ? nomatch = false : engine.stop
    end

    if nomatch
      if word == ''
        # Nothing was buffered: emit the character itself unless it is a
        # separator.
        @words << origin unless char == :symbol
        reset_engines
      else
        reset_engines
        @words << word if word.is_a?(String)
        reprocess(word) if word.is_a?(Array)

        # re-process current char
        process(char, origin)
      end
    end
  end

  # Re-segments an Array of characters handed back by an engine: all but the
  # last character are fed again and flushed with a separator, then the last
  # character is fed on its own.
  def reprocess(word)
    last = word.pop

    word.each do |char|
      process(char, char)
    end

    process(:symbol, :symbol)
    process(last, last)
  end

  # Switches every engine back on.
  def reset_engines
    engines.each do |engine|
      engine.run
    end
  end

  # Sets the shared engine list, but only if none has been set yet.
  def engines=(engines)
    @@engines ||= engines
  end

  def engines
    @@engines
  end

  def init_filters
    @@filters = [Fullwidth, Symbol]
  end

  def init_engines
    @@engines ||= [Dict, English, Number, Name].map do |engine_klass|
      engine_klass.new
    end
  end
end
metadata ADDED
@@ -0,0 +1,79 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rseg
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Yuanyi Zhang
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-11-29 00:00:00 +08:00
13
+ default_executable: rseg
14
+ dependencies: []
15
+
16
+ description: "A Chinese Word Segmentation(\xE4\xB8\xAD\xE6\x96\x87\xE5\x88\x86\xE8\xAF\x8D) routine in pure Ruby"
17
+ email: zhangyuanyi@gmail.com
18
+ executables:
19
+ - rseg
20
+ extensions: []
21
+
22
+ extra_rdoc_files:
23
+ - LICENSE
24
+ - README
25
+ files:
26
+ - .gitignore
27
+ - LICENSE
28
+ - README
29
+ - Rakefile
30
+ - VERSION
31
+ - bin/rseg
32
+ - dict/dict.hash
33
+ - lib/builder/dict.rb
34
+ - lib/engines/dict.rb
35
+ - lib/engines/engine.rb
36
+ - lib/engines/english.rb
37
+ - lib/engines/name.rb
38
+ - lib/engines/number.rb
39
+ - lib/filters/conjunction.rb
40
+ - lib/filters/fullwidth.rb
41
+ - lib/filters/symbol.rb
42
+ - lib/rseg.rb
43
+ has_rdoc: true
44
+ homepage: http://github.com/yzhang/rseg
45
+ licenses: []
46
+
47
+ post_install_message:
48
+ rdoc_options:
49
+ - --charset=UTF-8
50
+ require_paths:
51
+ - lib
52
+ required_ruby_version: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - ">="
55
+ - !ruby/object:Gem::Version
56
+ version: "0"
57
+ version:
58
+ required_rubygems_version: !ruby/object:Gem::Requirement
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ version: "0"
63
+ version:
64
+ requirements: []
65
+
66
+ rubyforge_project:
67
+ rubygems_version: 1.3.5
68
+ signing_key:
69
+ specification_version: 3
70
+ summary: "A Chinese Word Segmentation(\xE4\xB8\xAD\xE6\x96\x87\xE5\x88\x86\xE8\xAF\x8D) routine in pure Ruby"
71
+ test_files:
72
+ - test/test_auto.rb
73
+ - test/test_bench.rb
74
+ - test/test_ent.rb
75
+ - test/test_finance.rb
76
+ - test/test_news.rb
77
+ - test/test_sport.rb
78
+ - test/test_tech.rb
79
+ - test/test_web.rb