rseg1.9 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ test
2
+ pkg
3
+ .DS_Store
4
+ *.gem
data/LICENSE ADDED
@@ -0,0 +1,30 @@
1
+ Rseg includes two built-in dictionaries:
2
+
3
+ * CC-CEDICT (http://cc-cedict.org/wiki/): Creative Commons Attribution-Share Alike 3.0 License (http://creativecommons.org/licenses/by-sa/3.0/)
4
+ * Wikipedia Chinese article title list (http://download.wikimedia.org/zhwiki/): Creative Commons Attribution-Share Alike 3.0 License (http://creativecommons.org/licenses/by-sa/3.0/)
5
+
6
+ The codes and others in Rseg are licensed under MIT license:
7
+
8
+ ===============================
9
+ Copyright (c) 2009 Yuanyi Zhang
10
+
11
+ Permission is hereby granted, free of charge, to any person
12
+ obtaining a copy of this software and associated documentation
13
+ files (the "Software"), to deal in the Software without
14
+ restriction, including without limitation the rights to use,
15
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
16
+ copies of the Software, and to permit persons to whom the
17
+ Software is furnished to do so, subject to the following
18
+ conditions:
19
+
20
+ The above copyright notice and this permission notice shall be
21
+ included in all copies or substantial portions of the Software.
22
+
23
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
25
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
27
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
28
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
29
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
30
+ OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,57 @@
1
+ Introduction
2
+ ========
3
+ Rseg is a Chinese Word Segmentation(中文分词) routine in pure Ruby.
4
+
5
+ The algorithm is based on this article: http://xiecc.blog.163.com/blog/static/14032200671110224190/
6
+
7
+ Usage
8
+ ========
9
+
10
+ Rseg now supports two modes: inline and C/S (client/server) mode.
11
+
12
+ 1. Inline mode
13
+
14
+ > require 'rubygems'
15
+ > require 'rseg'
16
+ > Rseg.segment("需要分词的文章")
17
+ ['需要', '分词', '的', '文章']
18
+
19
+ The first call to Rseg.segment needs about 30 seconds to load the dictionary; subsequent calls are very fast. You can also call Rseg.load to load the dictionaries ahead of time.
20
+
21
+ 2. C/S mode
22
+
23
+ $ rseg_server
24
+ == Sinatra/0.9.4 has taken the stage on 4100
25
+
26
+ This will start rseg server on http://localhost:4100
27
+
28
+ You can visit it via your browser or the rseg command.
29
+
30
+ $ rseg '需要分词的文章'
31
+ 需要 分词 的 文章
32
+
33
+ You can also access the server with Rseg.remote_segment:
34
+
35
+ $ irb
36
+ > require 'rubygems'
37
+ > require 'rseg'
38
+ > Rseg.remote_segment("需要分词的文章") # This will be very fast
39
+ ['需要', '分词', '的', '文章']
40
+
41
+ Performance
42
+ ========
43
+ About 5M characters/s on my MacBook (Intel Core 2 Duo 2GHz, 4GB memory).
44
+
45
+ License
46
+ ========
47
+
48
+ Rseg includes two built-in dictionaries:
49
+
50
+ * CC-CEDICT (http://cc-cedict.org/wiki/) with Creative Commons Attribution-Share Alike 3.0 License (http://creativecommons.org/licenses/by-sa/3.0/)
51
+ * Wikipedia Chinese article title list (http://download.wikimedia.org/zhwiki/) with Creative Commons Attribution-Share Alike 3.0 License(http://creativecommons.org/licenses/by-sa/3.0/)
52
+
53
+ The code and other materials in Rseg are licensed under the MIT license.
54
+
55
+ Feedback
56
+ ========
57
+ All feedback is welcome: Yuanyi Zhang (zhangyuanyi#gmail.com)
data/Rakefile ADDED
@@ -0,0 +1,21 @@
1
# Build tasks for the rseg gem. Packaging is delegated to Jeweler.
require 'rake'
require 'rake/testtask'

# No task in this file uses rcov, so treat it as optional: a machine
# without rcov installed must still be able to load the Rakefile.
begin
  require 'rcov/rcovtask'
rescue LoadError
  # Coverage tasks are simply unavailable.
end

begin
  require 'jeweler'
  Jeweler::Tasks.new do |s|
    s.name = "rseg"
    s.executables = ["rseg", 'rseg_server']
    s.summary = "A Chinese Word Segmentation(中文分词) routine in pure Ruby"
    s.email = "zhangyuanyi@gmail.com"
    s.homepage = "http://github.com/yzhang/rseg"
    s.description = "A Chinese Word Segmentation(中文分词) routine in pure Ruby"
    s.authors = ["Yuanyi Zhang"]
    s.files = FileList["[A-Z]*", "{bin,lib,public,views}/**/*", '.gitignore', 'dict/dict.hash']
    s.add_dependency 'haml'
    s.add_dependency 'sinatra'
  end
rescue LoadError
  puts "Jeweler, or one of its dependencies, is not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.7
data/bin/rseg ADDED
@@ -0,0 +1,11 @@
1
#!/usr/bin/env ruby
# Command-line client: passes the first argument to Rseg.remote_segment and
# prints the returned words separated by single spaces.
require File.join(File.dirname(__FILE__), '/../lib/rseg')

text = ARGV.first

# Without any text there is nothing to segment; show the usage line instead.
if text.nil? || text == ''
  puts "Usage: rseg <text>"
  exit
end

words = Rseg.remote_segment(text)
puts words.join(' ')
data/bin/rseg_server ADDED
@@ -0,0 +1,16 @@
1
#!/usr/bin/env ruby
# Launcher for the rseg HTTP server: loads the dictionaries up front, then
# runs the Sinatra application on 127.0.0.1:4100.
bin_dir = File.dirname(__FILE__)
$:.unshift File.expand_path(bin_dir)

require 'rubygems'
require 'haml'

require File.join(bin_dir, '/../lib/rseg')
require File.join(bin_dir, '/../lib/app')

# Load eagerly so that requests do not pay the dictionary loading cost.
puts "Loading dictionaries, this will take about 30 seconds."
puts "Please wait a moment..."
Rseg.load
puts "Dictionaries loaded."

App.run! :host => '127.0.0.1', :port => 4100, :environment => 'production'
exit
data/dict/dict.hash ADDED
Binary file
data/lib/app.rb ADDED
@@ -0,0 +1,22 @@
1
+ require 'sinatra/base'
2
+
3
# Sinatra application exposing Rseg over HTTP: an HTML form ('/' and
# '/segment') and a plain-text endpoint ('/seg').
class App < Sinatra::Base
  set :root, "#{File.dirname(__FILE__)}/.."
  set :app_file, __FILE__

  # Shows the segmentation form.
  get '/' do
    haml :index
  end

  # Handles the form: segments the submitted text and renders the form again
  # with the space-joined result.
  post '/segment' do
    @input = params[:input]
    words = Rseg.segment(@input)
    @result = words.join(' ')
    haml :index
  end

  # Plain-text variant: the response body is the space-joined word list.
  post '/seg' do
    @input = params[:input]
    @result = Rseg.segment(@input)
    @result.join(' ')
  end
end
@@ -0,0 +1,42 @@
1
+ #coding: utf-8
2
+ $KCODE = 'UTF8'
3
+
4
# Adds every line of the word list at +path+ to the trie +tree+.
#
# The trie is a nested Hash keyed by single characters; the node reached by
# the last character of a line is flagged with <tt>:end => true</tt>.
# "\n" and "\r" are ignored. A line with no other characters is skipped:
# no node exists for it, and flagging nil used to raise NoMethodError.
#
# path - String path of a dictionary file, one word per line.
# tree - Hash used as the trie root; it is modified in place.
def process(path, tree)
  File.open(path, 'r') do |file|
    file.each_line do |line|
      node = nil
      line.chars.each do |c|
        next if c == "\n" || c == "\r"
        # Descend from the current node, or from the root for the first char.
        node = ((node || tree)[c] ||= {})
      end
      node[:end] = true if node
    end
  end
end
22
+
23
# Feeds each bundled word list through +process+ into a single trie, then
# writes that trie with Marshal to the file named by +hash_path+.
def build
  trie = {}

  ['cedict.zh_CN.utf8', 'wikipedia.zh.utf8'].each do |name|
    puts "Processing #{name}..."
    source = File.join(File.dirname(__FILE__), '../../dict', name)
    process(source, trie)
  end

  File.open(hash_path, "wb") { |out| Marshal.dump(trie, out) }
  puts 'Done'
end
36
+
37
# Path of the marshalled dictionary file, expressed relative to the
# directory that contains this script.
def hash_path
  script_dir = File.dirname(__FILE__)
  File.join(script_dir, '../../dict/dict.hash')
end
40
+
41
+ build
42
+
@@ -0,0 +1,51 @@
1
module RsegEngine
  # Dictionary engine: walks a character trie (nested Hashes whose word-final
  # nodes carry :end) one character at a time.
  class Dict < Engine
    # The trie is loaded once and shared by every instance.
    @@root = nil
    @@dict_path = File.join(File.dirname(__FILE__), '../../dict/dict.hash')

    # Sets the file the trie is read from. The trie is only loaded while
    # @@root is still unset, so this matters only before the first Dict.new.
    def self.dict_path=(path)
      @@dict_path = path
    end

    def self.dict_path
      @@dict_path
    end

    def initialize
      @@root ||= load_dict(@@dict_path)
      @word = ''
      @node = @@root
      super
    end

    # Feeds one character to the engine.
    #
    # Returns [true, nil] while +char+ extends the current trie path.
    # Otherwise returns [false, word] and starts over at the root; +word+ is
    # the accumulated String when it ends on a word-final node or is one
    # character long, and an Array of its characters when it does not.
    def process(char)
      child = @node[char]

      if child
        @word << char
        @node = child
        return [true, nil]
      end

      complete = @node[:end] || @word.chars.to_a.length == 1
      word = complete ? @word : @word.chars.to_a

      @node = @@root
      @word = ''

      [false, word]
    end

    private

    # Reads the marshalled trie from +path+.
    def load_dict(path)
      File.open(path, "rb") { |io| Marshal.load(io) }
    end
  end
end
@@ -0,0 +1,19 @@
1
module RsegEngine
  # Common base of the segmentation engines: it only keeps a running flag,
  # which starts out true.
  class Engine
    def initialize
      @running = true
    end

    # True while the engine has not been stopped.
    def running?
      @running
    end

    # Marks the engine as running again.
    def run
      @running = true
    end

    # Marks the engine as stopped.
    def stop
      @running = false
    end
  end
end
@@ -0,0 +1,26 @@
1
module RsegEngine
  # The ASCII letters a-z followed by A-Z.
  LETTER_SYMBOLS = ('a'..'z').to_a + ('A'..'Z').to_a

  # Engine that collects consecutive ASCII letters into one word.
  class English < Engine
    def initialize
      @word = ''
      super
    end

    # Returns [true, nil] when +char+ is an ASCII letter (it is appended to
    # the current word). Otherwise returns [false, word] with the letters
    # collected so far (possibly '') and starts a new, empty word.
    def process(char)
      if LETTER_SYMBOLS.include?(char)
        @word << char
        return [true, nil]
      end

      finished = @word
      @word = ''
      [false, finished]
    end
  end
end
@@ -0,0 +1,52 @@
1
+ #coding: utf-8
2
module RsegEngine
  # Engine that recognizes Chinese personal names: one character from the
  # family-name list followed by given-name characters, up to three
  # characters in total.
  class Name < Engine
    @@last_names = %W(丁 卜 刁 七 弓 干 于 王 尤 孔 方 申 白 甘 田 包 石 左 平 司 皮 史 池 艾 年 匡 充 江 印
                      促 伊 伍 安 任 米 促 牟 向 吉 成 伏 吕 李 吴 沈 何 贝 狄 祁 杜 汪 阮 邢 汲 别 辛 冷 利 沃 谷
                      扶 步 那 沙 周 金 吕 花 孟 和 邵 房 抗 灰 明 屈 松 牧 宓 武 幸 卓 易 尚 邰 空 竺 岳 东 林
                      施 姜 俞 查 封 秋 帅 祖 羿 柯 茅 柳 姚 纪 宣 咸 库 侯 洪 胡 哈 宣 郁 祝 苗 禹 娄
                      秦 奚 倪 度 凌 宰 宦 师 徐 翁 班 马 时 晃 乌 夏 贡 柴 能 家 宫 敖 索 晏 桑 高 凌 桂 容 姬 劳 桑 桂 袁 时 祝 席 徐 高 夏 凌 洪 翁 家 芮 乌 祖 索 贡
                      许 张 曹 戚 梅 屠 盛 崖 章 鱼 国 商 扈 寇 终 冯 苗 康 常 茅 闵 麻 胡 崔 邢 条 符 宿 堵 浦 习 鱼
                      梁 富 曾 程 项 钮 舒 彭 费 童 云 喻 嵇 范 费 贺 毕 付 黄 邵 祁 阮 强 童 邱 解 贲 单 富 钮 荀 惠 邴 焦 班 甯 钭 景 邰 劳 茹 寇 荆
                      莫 际 景 须 杨 詹 郎 雷 贾 路 骆 虞 经 裘 郁 滑 甄 靳 詹 闻 逄 雍 訾 郎 农 路 骆 虞 经 裘 郁 滑 靳 闻 逄 雍
                      赵 黄 褚 凤 郝 齐 臧 熊 管 裴 荣 郗 韶 郜 黎 翟 寿 通
                      卫 葛 鲁 乐 谈 董 樊 万 诸 刘 叶 都 满 广 殴 巩 养
                      郭 钱 陈 陶 鲍 穆 郭 堆 卢 陆 龙 噪 鄂 阴 苍 燕 冀 衡 融 蒯 逯
                      蒋 魏 谢 邹 潘 滕 邬 戴 钟 蔡 缪 应 储 糜 隗 历 蒲 慕 蔚 隆 鞠 关
                      韩 萧 颜 庞 麦 双 璩 濮 聂 丰 看
                      郑 严 蓟 薄 谭 罗 买 蓝 蓬 怀 党 饶 顾 苏 龚 边 栾 权) #:nodoc:

    @@first_names = %W(文 铭 菁 郁 怡 智 德 祥 志 华 孟 庆 雅 佩 晓 蓉 明 仁 宇 青 慧 豪 琪 安
                       惠 宗 信 盈 君 秀 敏 伶 佳 国 荣 忠 宏 育 丽 圣 淑 彦 龙 冠 后 静 娟 子
                       嘉 瑞 柏 弘 芳 正 玮 贞 如 凯 元 士 伟 杰 颍 霖 玲 仪 珮 英 建 政 真 珍
                       美 世 立 秋 婷 贤 瑜 中 玉 维 莹 翔 家 芬 昌 裕 雯 萍 永 成 宜 鸿 珊 民
                       欣 哲 良 伦 燕 梦 磊 丹 元 一 昌 红 健) #:nodoc:

    def initialize
      @word = ''
      @last = false # true once a family-name character has been accepted
      super
    end

    # Feeds one character to the engine.
    #
    # Returns [true, nil] when +char+ starts a name (family-name character)
    # or extends one (given-name character while the name has fewer than
    # three characters). Otherwise returns [false, word] with the characters
    # collected so far (possibly '') and resets the engine.
    def process(char)
      match = false
      word = nil

      if !@last && @@last_names.include?(char)
        @word << char
        match = true
        @last = true
      elsif @last && @word.chars.to_a.length < 3 && @@first_names.include?(char)
        # The original also assigned @unit here; nothing in this class reads
        # it (it belongs to the Number engine), so the dead store is gone.
        @word << char
        match = true
      else
        word = @word
        @word = ''
        @last = false
      end

      [match, word]
    end
  end
end
@@ -0,0 +1,62 @@
1
+ #coding: utf-8
2
module RsegEngine
  # Engine that groups number characters, optional sub-unit characters and a
  # trailing unit character into a single word.
  class Number < Engine
    @@number_symbols = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                        '一', '二', '三', '四', '五', '六', '七', '八', '九', '十',
                        '零', '〇', '百', '千', '壹', '贰', '叁', '肆', '柒', '捌',
                        '玖', '拾', '之', '%', '¥', '分', '$', '.', '点', '第', '每']
    @@subunit_symbols = ['多', '公', '英', '厘', '毫', '微', '纳', '海', '平', '立',
                         '方', '摄', '华', '氏', '美', '日', '澳', '港', '台', '新',
                         '个', '百', '佰', '千', '仟', '万', '萬', '亿', '兆', '吉']
    @@unit_symbols = ['刻', '章', '回', '节', '名', '个', '届', '次', '集', '元',
                      '角', '例', '人', '斤', '克', '吨', '米', '里', '升', '码',
                      '尺', '寸', '杆', '顷', '亩', '磅', '镑', '桶', '度', '秒',
                      '分', '卡', '焦', '瓦', '匹', '圆', '币', '年', '月', '日',
                      '时', '秒', '点', '百', '佰', '仟', '千', '万', '萬', '亿',
                      '兆', '吉', '块', '半', '岁', '家', '所', '期', '场', '投',
                      '中', '辆', '只', '头']

    def initialize
      @word = ''       # text already closed off by a unit character
      @number = ''     # number/sub-unit characters not yet moved into @word
      @unit = false    # set when a unit character was accepted
      @subunit = false # set when a sub-unit character was accepted
      super
    end

    # Feeds one character to the engine.
    #
    # Returns [true, nil] when +char+ was accepted as a number, sub-unit or
    # unit character. Otherwise returns [false, word], where +word+ is the
    # closed-off text if there is any, else the pending number text (possibly
    # ''), and clears all state.
    def process(char)
      accepted = false

      # Step 1: try the character as a number, else as a sub-unit.
      if (!@subunit || @unit) && @@number_symbols.include?(char)
        @number << char
        accepted = true
        @unit = false
        @subunit = false
      elsif (!@number.empty? || @unit) && @@subunit_symbols.include?(char)
        @number << char
        accepted = true
        @subunit = true
      end

      # Step 2: a unit character moves the pending number text into the word.
      # The character is appended here only if step 1 did not already take it.
      if (!@number.empty? || @subunit) && @@unit_symbols.include?(char)
        @word << @number
        @word << char unless accepted
        @number = ''
        @unit = true
        accepted = true
      end

      return [true, nil] if accepted

      finished = @word.empty? ? @number : @word
      @word = ''
      @number = ''
      @unit = false
      @subunit = false

      [false, finished]
    end
  end
end
@@ -0,0 +1,10 @@
1
+ #coding: utf-8
2
module RsegFilter
  # Filter that replaces a listed single-character function word with the
  # marker :conjunction and passes everything else through unchanged.
  class Conjunction
    @@conjunctions = %W(给 的 说 对 在 和 是 被 最 所 那 由 这 有 将 你 会 与 他 为 不 没 很 了 啊 哦 呵 把 去 从)

    # Returns :conjunction for a listed character, otherwise +char+ itself.
    def self.filter(char)
      return :conjunction if @@conjunctions.include?(char)

      char
    end
  end
end
@@ -0,0 +1,20 @@
1
+ #coding: utf-8
2
module RsegFilter
  # Filter that folds fullwidth digits, letters and a few punctuation marks
  # to their ASCII equivalents; any other value is returned unchanged.
  #
  # The table used to map ASCII characters to themselves ('1' => '1', ...),
  # which made the filter a no-op for the fullwidth forms it is named after.
  # The keys are therefore derived from code points here, so they cannot be
  # folded back to ASCII by a text tool.
  class Fullwidth
    # Distance between a fullwidth form (U+FF01..U+FF5E) and the ASCII
    # character it corresponds to (U+0021..U+007E).
    FULLWIDTH_OFFSET = 0xFEE0

    @@fullwidth_chars = {}
    (('0'..'9').to_a + ('a'..'z').to_a + ('A'..'Z').to_a + ['-', '+', ',', '/']).each do |ascii|
      @@fullwidth_chars[[ascii.ord + FULLWIDTH_OFFSET].pack('U')] = ascii
    end
    # Non-fullwidth punctuation that the original table also normalized.
    @@fullwidth_chars['—'] = '-'
    @@fullwidth_chars['·'] = '.'

    class << self
      # Returns the ASCII equivalent of +char+ when one is listed, otherwise
      # +char+ itself (including non-String values such as :symbol).
      def filter(char)
        @@fullwidth_chars.fetch(char, char)
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
+ #coding: utf-8
2
module RsegFilter
  # Filter that replaces separator characters (punctuation and whitespace)
  # with the marker :symbol and passes everything else through unchanged.
  #
  # The original list contained many ASCII marks twice. The duplicates are
  # what is left of their fullwidth forms (e.g. U+FF1F, U+FF08), so fullwidth
  # punctuation was no longer treated as a separator. Those forms are
  # rebuilt from code points below.
  class Symbol
    # CJK punctuation and typographic quotes, as listed originally.
    cjk_marks = ['、', '‘', '。', '》', '《', '“', '…', '’', '”', '〕', '〈', '〉',
                 '「', '」', '『', '』', '〖', '〗', '【', '】']

    # ASCII punctuation and whitespace, as listed originally (deduplicated).
    ascii_marks = ['`', '[', ']', '=', ';', '|', '?', ':', '{', '}', ')', '(',
                   '*', '#', '!', '~', '<', '>', '@', '^', '&', '\\', "'",
                   "\n", "\t", "\r", ' ', '-', '/', '+', ',']

    # Marks that appeared twice in the original list; each gets its fullwidth
    # counterpart back (ASCII code point + 0xFEE0).
    doubled = ['`', '[', ']', '=', ';', '|', '?', ':', '{', '}', ')', '(',
               '*', '#', '!', '~', '<', '>']
    fullwidth_marks = doubled.map { |c| [c.ord + 0xFEE0].pack('U') }

    # NOTE(review): the second ' ' entry is assumed to have been the
    # ideographic space U+3000 - confirm against the upstream source.
    @@separators = cjk_marks + ascii_marks + fullwidth_marks + ["\u3000"]

    # Returns :symbol for a separator character, otherwise +char+ itself.
    def self.filter(char)
      @@separators.include?(char) ? :symbol : char
    end
  end
end
data/lib/rseg.rb ADDED
@@ -0,0 +1,139 @@
1
+ #coding: utf-8
2
+ $KCODE = 'UTF8'
3
+
4
+ require 'singleton'
5
+ require 'net/http'
6
+
7
+ require File.join(File.dirname(__FILE__), 'engines/engine')
8
+ require File.join(File.dirname(__FILE__), 'engines/dict')
9
+ require File.join(File.dirname(__FILE__), 'engines/english')
10
+ require File.join(File.dirname(__FILE__), 'engines/number')
11
+ require File.join(File.dirname(__FILE__), 'engines/name')
12
+
13
+ require File.join(File.dirname(__FILE__), 'filters/fullwidth')
14
+ require File.join(File.dirname(__FILE__), 'filters/symbol')
15
+ require File.join(File.dirname(__FILE__), 'filters/conjunction')
16
+
17
# Chinese word segmenter. A single shared instance (Singleton) pushes each
# input character through the filters and then through every running engine.
class Rseg
  include Singleton
  include RsegEngine
  include RsegFilter

  class << self
    # Sets the dictionary file used by the Dict engine.
    def dict_path=(path)
      RsegEngine::Dict.dict_path = path
    end

    # Segments +input+ in-process and returns an Array of words.
    # nil is treated as empty text and yields [].
    def segment(input)
      Rseg.instance.input = input
      Rseg.instance.segment
    end

    # Forces creation of the shared instance (and with it the engines).
    # Always returns nil.
    def load
      Rseg.instance
      nil
    end

    # Segments +input+ by POSTing it to http://127.0.0.1:4100/seg.
    #
    # Returns the words of the response body split on spaces. When the
    # status is not 200 or any StandardError is raised, returns a
    # one-element Array holding a help message instead of raising.
    def remote_segment(input)
      begin
        response = Net::HTTP.post_form(URI.parse('http://127.0.0.1:4100/seg'), :input => input)
        response.code == '200' ? response.body.split(' ') :
          ["Can't connect to http://localhost:4100\nUse rseg_server to start it"]
      rescue
        ["Can't connect to http://localhost:4100\nUse rseg_server to start it"]
      end
    end
  end

  def initialize
    @input = ''
    @words = []
    init_engines
    init_filters
  end

  # Stores the text for the next #segment call. The value is converted with
  # to_s, so nil (e.g. a request without an "input" parameter) becomes ''
  # instead of making #segment fail on nil.chars.
  def input=(input)
    @input = input.to_s
  end

  # Segments the stored input and returns the Array of words.
  def segment
    @words = []

    @input.chars.each do |origin|
      char = filter(origin)
      process(char, origin)
    end

    # A trailing separator flushes whatever the engines still hold.
    process(:symbol, '')
    @words
  end

  private

  # Passes +char+ through every filter in order and returns the result.
  def filter(char)
    result = char
    @filters.each do |klass|
      result = klass.filter(result)
    end
    result
  end

  # Offers +char+ to every running engine and stops those that reject it.
  # +word+ ends up as the value returned by the last engine that ran.
  #
  # If no engine matched:
  # * word == '' - nothing was accumulated; +origin+ is emitted unless +char+
  #   is :symbol, and all engines are restarted.
  # * otherwise  - the engines are restarted, a String word is emitted, an
  #   Array word is re-segmented via #reprocess, and +char+ is processed
  #   again.
  def process(char, origin)
    nomatch = true
    word = ''

    engines.each do |engine|
      next unless engine.running?
      match, word = engine.process(char)
      match ? nomatch = false : engine.stop
    end

    if nomatch
      if word == ''
        @words << origin unless char == :symbol
        reset_engines
      else
        reset_engines
        @words << word if word.is_a?(String)
        reprocess(word) if word.is_a?(Array)

        # re-process current char
        process(char, origin)
      end
    end
  end

  # Re-segments an Array of characters: all but the last are processed,
  # then a :symbol is fed in before the last character is processed.
  # Note that +word+ is mutated (its last element is popped).
  def reprocess(word)
    last = word.pop

    word.each do |char|
      process(char, char)
    end

    process(:symbol, :symbol)
    process(last, last)
  end

  # Marks every engine as running again.
  def reset_engines
    engines.each do |engine|
      engine.run
    end
  end

  # Assigns the engine list only if none has been set yet.
  def engines=(engines)
    @engines ||= engines
  end

  def engines
    @engines
  end

  # Filters applied to each input character, in this order.
  def init_filters
    @filters = [Fullwidth, Symbol]
  end

  # Creates one instance of each engine class unless already done.
  def init_engines
    @engines ||= [Dict, English, Number, Name].map do |engine_klass|
      engine_klass.new
    end
  end
end
data/public/screen.css ADDED
@@ -0,0 +1,123 @@
1
+ div.clear {clear: both;}
2
+ body {background: #EEEEEE; margin: 0; padding: 0;
3
+ font-family: 'Lucida Grande', 'Lucida Sans Unicode',
4
+ 'Garuda';}
5
+ code {font-family: 'Lucida Console', monospace;
6
+ font-size: 12px;}
7
+ li {height: 18px;}
8
+ ul {list-style: none; margin: 0; padding: 0;}
9
+ ol:hover {cursor: pointer;}
10
+ ol li {white-space: pre;}
11
+ #explanation {font-size: 12px; color: #666666;
12
+ margin: 20px 0 0 100px;}
13
+ /* WRAP */
14
+ #wrap {width: 860px; background: #FFFFFF; margin: 0 auto;
15
+ padding: 30px 50px 20px 50px;
16
+ border-left: 1px solid #DDDDDD;
17
+ border-right: 1px solid #DDDDDD;}
18
+ /* HEADER */
19
+ #header {margin: 0 auto 25px auto;}
20
+ h1 {margin: 0; font-size: 36px; color: #981919;}
21
+ h2 {margin: 0; font-size: 22px; color: #333333;}
22
+ #header ul {margin: 0; font-size: 12px; color: #666666;}
23
+ #header ul li strong{color: #444444;}
24
+ #header ul li {display: inline; padding: 0 10px;}
25
+ #header ul li.first {padding-left: 0;}
26
+ #header ul li.last {border: 0; padding-right: 0;}
27
+
28
+ #content {width: 860px; margin: 0 auto 10px auto;}
29
+
30
+ h3 {float: left; width: 100px; margin-bottom: 10px;
31
+ color: #981919; font-size: 14px; font-weight: bold;}
32
+
33
+ #footer {width: 860px; margin: 0 auto 10px auto; clear:both;
34
+ font-size: 18px; border-top:1px solid #000; padding-top: 10px;
35
+ text-align: right;}
36
+
37
+ textarea {font-size: 18px; padding:10px;}
38
+ #segform { width: 430px; float: left; font-size: 18px;}
39
+ #segresult { width: 408px; float: left; font-size: 18px;
40
+ padding: 10px; color: #D12F19;}
41
+ /* --------------------------------------------------------------
42
+
43
+ buttons.css
44
+ * Gives you some great CSS-only buttons.
45
+
46
+ Created by Kevin Hale [particletree.com]
47
+ * particletree.com/features/rediscovering-the-button-element
48
+
49
+ See Readme.txt in this folder for instructions.
50
+
51
+ -------------------------------------------------------------- */
52
+
53
+ button {
54
+ display:block;
55
+ float:left;
56
+ margin:0 0.583em 0.667em 0;
57
+ padding:5px 10px 5px 7px; /* Links */
58
+
59
+ border:1px solid #dedede;
60
+ border-top:1px solid #eee;
61
+ border-left:1px solid #eee;
62
+
63
+ background-color:#f5f5f5;
64
+ font-family:"Lucida Grande", Tahoma, Arial, Verdana, sans-serif;
65
+ font-size:100%;
66
+ line-height:130%;
67
+ text-decoration:none;
68
+ font-weight:bold;
69
+ color:#565656;
70
+ cursor:pointer;
71
+ }
72
+ button {
73
+ width:auto;
74
+ overflow:visible;
75
+ padding:4px 10px 3px 7px; /* IE6 */
76
+ }
77
+ button[type] {
78
+ padding:4px 10px 4px 7px; /* Firefox */
79
+ line-height:17px; /* Safari */
80
+ }
81
+ *:first-child+html button[type] {
82
+ padding:4px 10px 3px 7px; /* IE7 */
83
+ }
84
+ button img {
85
+ margin:0 3px -3px 0 !important;
86
+ padding:0;
87
+ border:none;
88
+ width:16px;
89
+ height:16px;
90
+ float:none;
91
+ }
92
+
93
+
94
+ /* Button colors
95
+ -------------------------------------------------------------- */
96
+
97
+ /* Standard */
98
+ button:hover {
99
+ background-color:#dff4ff;
100
+ border:1px solid #c2e1ef;
101
+ color:#336699;
102
+ }
103
+
104
+ /* Positive */
105
+ body .positive {
106
+ color:#529214;
107
+ }
108
+ button.positive:hover {
109
+ background-color:#E6EFC2;
110
+ border:1px solid #C6D880;
111
+ color:#529214;
112
+ }
113
+
114
+ /* Negative */
115
+ body .negative {
116
+ color:#d12f19;
117
+ }
118
+ button.negative:hover {
119
+ background:#fbe3e4;
120
+ border:1px solid #fbc2c4;
121
+ color:#d12f19;
122
+ }
123
+
data/views/index.haml ADDED
@@ -0,0 +1,8 @@
1
+ #segform
2
+ %form{:action => '/segment', :method => 'post'}
3
+ %p
4
+ %textarea{:id => 'input', :rows => '15', :cols => '35', :name => 'input'}= @input || '输入要分词的文章'
5
+ %p
6
+ %button.negative{ :type => "submit" } 开始分词
7
+ #segresult
8
+ %p= @result
data/views/layout.haml ADDED
@@ -0,0 +1,16 @@
1
+ !!! Strict
2
+ %html{ :lang => "en", :"xml:lang" => "en", :xmlns => "http://www.w3.org/1999/xhtml" }
3
+ %head
4
+ %meta{ :content => "text/html; charset=utf-8", :"http-equiv" => "Content-Type" }
5
+ %meta{ :content => "zh_CN", :"http-equiv" => "Content-Language" }
6
+ %title= "Rseg中文分词"
7
+ %link{ :rel => 'stylesheet', :href => '/screen.css', :type => 'text/css', :media => "screen"}
8
+
9
+ %body
10
+ #wrap
11
+ #header
12
+ %h1= "Rseg 中文分词"
13
+ %address.watermark
14
+ #content.condensed= yield
15
+ #footer= "作者: 张元一 <br />EMail:zhangyuanyi#gmail.com"
16
+
metadata ADDED
@@ -0,0 +1,104 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rseg1.9
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.5
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Yuanyi Zhang
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2010-11-28 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: haml
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: sinatra
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ description: A Chinese Word Segmentation(中文分词) routine in pure Ruby
47
+ email: zhangyuanyi@gmail.com
48
+ executables:
49
+ - rseg
50
+ - rseg_server
51
+ extensions: []
52
+ extra_rdoc_files:
53
+ - LICENSE
54
+ - README
55
+ files:
56
+ - .gitignore
57
+ - LICENSE
58
+ - README
59
+ - Rakefile
60
+ - VERSION
61
+ - bin/rseg
62
+ - bin/rseg_server
63
+ - dict/dict.hash
64
+ - lib/app.rb
65
+ - lib/builder/dict.rb
66
+ - lib/engines/dict.rb
67
+ - lib/engines/engine.rb
68
+ - lib/engines/english.rb
69
+ - lib/engines/name.rb
70
+ - lib/engines/number.rb
71
+ - lib/filters/conjunction.rb
72
+ - lib/filters/fullwidth.rb
73
+ - lib/filters/symbol.rb
74
+ - lib/rseg.rb
75
+ - public/screen.css
76
+ - views/index.haml
77
+ - views/layout.haml
78
+ homepage: http://github.com/yzhang/rseg
79
+ licenses: []
80
+ post_install_message:
81
+ rdoc_options:
82
+ - --charset=UTF-8
83
+ require_paths:
84
+ - lib
85
+ required_ruby_version: !ruby/object:Gem::Requirement
86
+ none: false
87
+ requirements:
88
+ - - ! '>='
89
+ - !ruby/object:Gem::Version
90
+ version: '0'
91
+ required_rubygems_version: !ruby/object:Gem::Requirement
92
+ none: false
93
+ requirements:
94
+ - - ! '>='
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ requirements: []
98
+ rubyforge_project:
99
+ rubygems_version: 1.8.24
100
+ signing_key:
101
+ specification_version: 3
102
+ summary: A Chinese Word Segmentation(中文分词) routine in pure Ruby
103
+ test_files: []
104
+ has_rdoc: