rseg1.9 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ test
2
+ pkg
3
+ .DS_Store
4
+ *.gem
data/LICENSE ADDED
@@ -0,0 +1,30 @@
1
+ Rseg includes two built-in dictionaries:
2
+
3
+ * CC-CEDICT (http://cc-cedict.org/wiki/): Creative Commons Attribution-Share Alike 3.0 License (http://creativecommons.org/licenses/by-sa/3.0/)
4
+ * Wikipedia Chinese article title list (http://download.wikimedia.org/zhwiki/): Creative Commons Attribution-Share Alike 3.0 License (http://creativecommons.org/licenses/by-sa/3.0/)
5
+
6
+ The codes and others in Rseg are licensed under MIT license:
7
+
8
+ ===============================
9
+ Copyright (c) 2009 Yuanyi Zhang
10
+
11
+ Permission is hereby granted, free of charge, to any person
12
+ obtaining a copy of this software and associated documentation
13
+ files (the "Software"), to deal in the Software without
14
+ restriction, including without limitation the rights to use,
15
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
16
+ copies of the Software, and to permit persons to whom the
17
+ Software is furnished to do so, subject to the following
18
+ conditions:
19
+
20
+ The above copyright notice and this permission notice shall be
21
+ included in all copies or substantial portions of the Software.
22
+
23
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
25
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
27
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
28
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
29
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
30
+ OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,57 @@
1
+ Introduction
2
+ ========
3
+ Rseg is a Chinese Word Segmentation(中文分词) routine in pure Ruby.
4
+
5
+ The algorithm is based on this article: http://xiecc.blog.163.com/blog/static/14032200671110224190/
6
+
7
+ Usage
8
+ ========
9
+
10
+ Rseg now supports two modes: inline mode and C/S (client/server) mode.
11
+
12
+ 1. Inline mode
13
+
14
+ > require 'rubygems'
15
+ > require 'rseg'
16
+ > Rseg.segment("需要分词的文章")
17
+ ['需要', '分词', '的', '文章']
18
+
19
+ The first call to Rseg#segment needs about 30 seconds to load the dictionary; subsequent calls are very fast. You can also call Rseg#load to load the dictionaries manually ahead of time.
20
+
21
+ 2. C/S mode
22
+
23
+ $ rseg_server
24
+ == Sinatra/0.9.4 has taken the stage on 4100
25
+
26
+ This will start rseg server on http://localhost:4100
27
+
28
+ You can visit it via your browser or the rseg command.
29
+
30
+ $ rseg '需要分词的文章'
31
+ 需要 分词 的 文章
32
+
33
+ You can also access the server from Ruby with Rseg#remote_segment:
34
+
35
+ $ irb
36
+ > require 'rubygems'
37
+ > require 'rseg'
38
+ > Rseg.remote_segment("需要分词的文章") # This will be very fast
39
+ ['需要', '分词', '的', '文章']
40
+
41
+ Performance
42
+ ========
43
+ About 5M characters/s on my MacBook (Intel Core 2 Duo 2GHz/4G mem).
44
+
45
+ License
46
+ ========
47
+
48
+ Rseg includes two built-in dictionaries:
49
+
50
+ * CC-CEDICT (http://cc-cedict.org/wiki/) with Creative Commons Attribution-Share Alike 3.0 License (http://creativecommons.org/licenses/by-sa/3.0/)
51
+ * Wikipedia Chinese article title list (http://download.wikimedia.org/zhwiki/) with Creative Commons Attribution-Share Alike 3.0 License(http://creativecommons.org/licenses/by-sa/3.0/)
52
+
53
+ The code and all other files in Rseg are licensed under the MIT license.
54
+
55
+ Feedback
56
+ ========
57
+ All feedback is welcome. Contact: Yuanyi Zhang (zhangyuanyi#gmail.com)
data/Rakefile ADDED
@@ -0,0 +1,21 @@
1
require 'rake'
require 'rake/testtask'
require 'rcov/rcovtask'

# Packaging tasks come from Jeweler. When Jeweler (or one of its
# dependencies) cannot be loaded, only an installation hint is printed so the
# rest of the Rakefile keeps working.
begin
  require 'jeweler'

  Jeweler::Tasks.new do |gem|
    # Identity and contact details.
    gem.name = "rseg"
    gem.authors = ["Yuanyi Zhang"]
    gem.email = "zhangyuanyi@gmail.com"
    gem.homepage = "http://github.com/yzhang/rseg"

    # Human-readable texts.
    gem.summary = "A Chinese Word Segmentation(中文分词) routine in pure Ruby"
    gem.description = "A Chinese Word Segmentation(中文分词) routine in pure Ruby"

    # Packaged content: top-level documents, code, web assets and the
    # pre-built dictionary.
    gem.executables = ["rseg", 'rseg_server']
    gem.files = FileList["[A-Z]*", "{bin,lib,public,views}/**/*", '.gitignore', 'dict/dict.hash']

    # Runtime dependencies of the bundled web front end.
    gem.add_dependency 'haml'
    gem.add_dependency 'sinatra'
  end
rescue LoadError
  puts "Jeweler, or one of its dependencies, is not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.7
data/bin/rseg ADDED
@@ -0,0 +1,11 @@
1
#!/usr/bin/env ruby
# Command line client: sends the first argument to a running rseg_server and
# prints the returned words separated by single spaces.
require File.join(File.dirname(__FILE__), '/../lib/rseg')

text = ARGV.first

# Without any text there is nothing to segment: show the usage line and quit.
if text.to_s.empty?
  puts "Usage: rseg <text>"
  exit
end

words = Rseg.remote_segment(text)
puts words.join(' ')
data/bin/rseg_server ADDED
@@ -0,0 +1,16 @@
1
#!/usr/bin/env ruby
# Starts the segmentation web server on 127.0.0.1:4100 after loading the
# dictionaries once up front.
bin_dir = File.dirname(__FILE__)
$LOAD_PATH.unshift File.expand_path(bin_dir)

require 'rubygems'
require 'haml'

# Load the segmenter first, then the Sinatra application that wraps it.
%w[rseg app].each do |lib|
  require File.join(bin_dir, "/../lib/#{lib}")
end

puts "Loading dictionaries, this will take about 30 seconds.",
     "Please wait a moment..."
Rseg.load
puts "Dictionaries loaded."

App.run!(:host => '127.0.0.1', :port => 4100, :environment => 'production')
exit
data/dict/dict.hash ADDED
Binary file
data/lib/app.rb ADDED
@@ -0,0 +1,22 @@
1
+ require 'sinatra/base'
2
+
3
# Sinatra front end for Rseg: an HTML form ('/' and '/segment') plus a
# plain-text endpoint ('/seg') that Rseg.remote_segment posts to.
class App < Sinatra::Base
  set :root, File.dirname(__FILE__) + "/.."
  set :app_file, __FILE__

  # Shows the empty segmentation form.
  get '/' do
    haml :index
  end

  # Segments the submitted form text and renders the form again with the
  # space-separated result next to it.
  post '/segment' do
    @input = params[:input]
    # A request without an "input" field yields nil here; Rseg#segment calls
    # #chars on its input, so nil is coerced to an empty string instead of
    # raising. @input itself stays untouched so the view can still fall back
    # to its placeholder text.
    @result = Rseg.segment(@input.to_s).join(' ')
    haml :index
  end

  # Machine-readable variant: responds with the words joined by single
  # spaces and no HTML around them.
  post '/seg' do
    @input = params[:input]
    # Same nil guard as in '/segment'.
    @result = Rseg.segment(@input.to_s)
    @result.join(' ')
  end
end
@@ -0,0 +1,42 @@
1
+ #coding: utf-8
2
+ $KCODE = 'UTF8'
3
+
4
# Adds every line of the dictionary file at +path+ to the character trie
# +tree+ (a nested Hash keyed by single characters).
#
# Each line is treated as one word; "\n" and "\r" are ignored. The node
# reached by the last character of a word gets the marker +:end => true+.
# Blank lines contribute nothing (previously they raised NoMethodError
# because no node had been visited when the marker was written).
#
# path - String path of a dictionary file with one word per line.
# tree - Hash that is mutated in place.
#
# Returns whatever File.open returns for the block (not meaningful to callers).
def process(path, tree)
  File.open(path, 'r') do |file|
    file.each_line do |line|
      node = nil

      line.each_char do |c|
        next if c == "\n" || c == "\r"

        # Descend from the root for the first character, from the current
        # node afterwards, creating missing children on the way.
        parent = node || tree
        parent[c] ||= {}
        node = parent[c]
      end

      # node is still nil when the line held no characters besides line
      # terminators; there is no word to mark in that case.
      node[:end] = true if node
    end
  end
end
22
+
23
# Builds the trie from the two source dictionaries and writes it, marshalled,
# to the location given by hash_path. Progress is reported on stdout.
def build
  tree = {}
  dict_dir = File.join(File.dirname(__FILE__), '../../dict')

  ['cedict.zh_CN.utf8', 'wikipedia.zh.utf8'].each do |dictionary|
    puts "Processing #{dictionary}..."
    process(File.join(dict_dir, dictionary), tree)
  end

  File.open(hash_path, "wb") do |io|
    Marshal.dump(tree, io)
  end
  puts 'Done'
end
36
+
37
# Location of the marshalled dictionary, relative to this file's directory.
def hash_path
  builder_dir = File.dirname(__FILE__)
  File.join(builder_dir, '../../dict/dict.hash')
end
40
+
41
+ build
42
+
@@ -0,0 +1,51 @@
1
module RsegEngine
  # Dictionary engine: walks a character trie (nested Hashes, with :end
  # marking complete words) one character at a time.
  #
  # The trie is loaded once per process, by the first instance that is
  # created, and shared through @@root. Changing dict_path after that point
  # does not reload it.
  class Dict < Engine
    @@root = nil
    @@dict_path = File.join(File.dirname(__FILE__), '../../dict/dict.hash')

    # Sets the file the trie will be loaded from.
    def self.dict_path=(path)
      @@dict_path = path
    end

    # Returns the file the trie is (or will be) loaded from.
    def self.dict_path
      @@dict_path
    end

    def initialize
      @@root ||= load_dict(@@dict_path)
      @word = ''
      @node = @@root
      super
    end

    # Feeds one character to the engine.
    #
    # Returns [true, nil] while +char+ extends the current trie path.
    # Otherwise returns [false, word] and restarts from the root, where
    # +word+ is the collected String when it is a complete dictionary entry
    # or a single character, and an Array of its characters when it is only
    # a prefix of longer entries.
    def process(char)
      child = @node[char]
      if child
        @word << char
        @node = child
        return [true, nil]
      end

      letters = @word.chars.to_a
      word = (@node[:end] || letters.length == 1) ? @word : letters

      @node = @@root
      @word = ''
      [false, word]
    end

    private

    # Reads the marshalled trie from +path+.
    # NOTE(review): Marshal.load must only ever be pointed at trusted files;
    # dict_path is caller-configurable.
    def load_dict(path)
      File.open(path, "rb") { |io| Marshal.load(io) }
    end
  end
end
@@ -0,0 +1,19 @@
1
module RsegEngine
  # Common base of all engines. It only keeps an on/off flag, so that an
  # engine that stopped matching can be skipped until it is switched on again.
  class Engine
    # A new engine starts out running.
    def initialize
      @running = true
    end

    # True while the engine should still be fed characters.
    def running?
      @running
    end

    # Switches the engine back on. Returns true.
    def run
      @running = true
    end

    # Switches the engine off. Returns false.
    def stop
      @running = false
    end
  end
end
@@ -0,0 +1,26 @@
1
module RsegEngine
  LETTER_SYMBOLS = ('a'..'z').to_a + ('A'..'Z').to_a

  # Engine that glues consecutive ASCII letters together into one word.
  class English < Engine
    def initialize
      @word = ''
      super
    end

    # Feeds one character to the engine.
    #
    # Returns [true, nil] when +char+ is an ASCII letter (it is appended to
    # the pending word). Otherwise returns [false, word], where +word+ is the
    # text collected so far (possibly empty), and starts a fresh word.
    def process(char)
      if LETTER_SYMBOLS.include?(char)
        @word << char
        return [true, nil]
      end

      finished, @word = @word, ''
      [false, finished]
    end
  end
end
@@ -0,0 +1,52 @@
1
+ #coding: utf-8
2
module RsegEngine
  # Engine that recognizes Chinese personal names: one character from the
  # family-name list followed by given-name characters, up to three
  # characters in total.
  class Name < Engine
    @@last_names = %W(丁 卜 刁 七 弓 干 于 王 尤 孔 方 申 白 甘 田 包 石 左 平 司 皮 史 池 艾 年 匡 充 江 印
                      促 伊 伍 安 任 米 促 牟 向 吉 成 伏 吕 李 吴 沈 何 贝 狄 祁 杜 汪 阮 邢 汲 别 辛 冷 利 沃 谷
                      扶 步 那 沙 周 金 吕 花 孟 和 邵 房 抗 灰 明 屈 松 牧 宓 武 幸 卓 易 尚 邰 空 竺 岳 东 林
                      施 姜 俞 查 封 秋 帅 祖 羿 柯 茅 柳 姚 纪 宣 咸 库 侯 洪 胡 哈 宣 郁 祝 苗 禹 娄
                      秦 奚 倪 度 凌 宰 宦 师 徐 翁 班 马 时 晃 乌 夏 贡 柴 能 家 宫 敖 索 晏 桑 高 凌 桂 容 姬 劳 桑 桂 袁 时 祝 席 徐 高 夏 凌 洪 翁 家 芮 乌 祖 索 贡
                      许 张 曹 戚 梅 屠 盛 崖 章 鱼 国 商 扈 寇 终 冯 苗 康 常 茅 闵 麻 胡 崔 邢 条 符 宿 堵 浦 习 鱼
                      梁 富 曾 程 项 钮 舒 彭 费 童 云 喻 嵇 范 费 贺 毕 付 黄 邵 祁 阮 强 童 邱 解 贲 单 富 钮 荀 惠 邴 焦 班 甯 钭 景 邰 劳 茹 寇 荆
                      莫 际 景 须 杨 詹 郎 雷 贾 路 骆 虞 经 裘 郁 滑 甄 靳 詹 闻 逄 雍 訾 郎 农 路 骆 虞 经 裘 郁 滑 靳 闻 逄 雍
                      赵 黄 褚 凤 郝 齐 臧 熊 管 裴 荣 郗 韶 郜 黎 翟 寿 通
                      卫 葛 鲁 乐 谈 董 樊 万 诸 刘 叶 都 满 广 殴 巩 养
                      郭 钱 陈 陶 鲍 穆 郭 堆 卢 陆 龙 噪 鄂 阴 苍 燕 冀 衡 融 蒯 逯
                      蒋 魏 谢 邹 潘 滕 邬 戴 钟 蔡 缪 应 储 糜 隗 历 蒲 慕 蔚 隆 鞠 关
                      韩 萧 颜 庞 麦 双 璩 濮 聂 丰 看
                      郑 严 蓟 薄 谭 罗 买 蓝 蓬 怀 党 饶 顾 苏 龚 边 栾 权) #:nodoc:

    @@first_names = %W(文 铭 菁 郁 怡 智 德 祥 志 华 孟 庆 雅 佩 晓 蓉 明 仁 宇 青 慧 豪 琪 安
                       惠 宗 信 盈 君 秀 敏 伶 佳 国 荣 忠 宏 育 丽 圣 淑 彦 龙 冠 后 静 娟 子
                       嘉 瑞 柏 弘 芳 正 玮 贞 如 凯 元 士 伟 杰 颍 霖 玲 仪 珮 英 建 政 真 珍
                       美 世 立 秋 婷 贤 瑜 中 玉 维 莹 翔 家 芬 昌 裕 雯 萍 永 成 宜 鸿 珊 民
                       欣 哲 良 伦 燕 梦 磊 丹 元 一 昌 红 健) #:nodoc:

    def initialize
      @word = ''
      @last = false # true once a family-name character has been accepted
      super
    end

    # Feeds one character to the engine.
    #
    # Returns [true, nil] when +char+ starts a name (family-name character)
    # or continues one (given-name character while the pending name has fewer
    # than three characters). Otherwise returns [false, word], where +word+
    # is the pending name (possibly empty), and resets the engine.
    #
    # The original implementation also set an instance variable @unit in the
    # given-name branch; nothing in this class reads it, so that write has
    # been dropped.
    def process(char)
      if !@last && @@last_names.include?(char)
        @word << char
        @last = true
        return [true, nil]
      end

      if @last && @word.chars.to_a.length < 3 && @@first_names.include?(char)
        @word << char
        return [true, nil]
      end

      word = @word
      @word = ''
      @last = false
      [false, word]
    end
  end
end
@@ -0,0 +1,62 @@
1
+ #coding: utf-8
2
module RsegEngine
  # Engine that recognizes numeric expressions: a run of number symbols,
  # optionally followed by sub-unit symbols, closed by a unit symbol. Several
  # number+unit groups in a row are collected into one word.
  class Number < Engine
    @@number_symbols = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                        '一', '二', '三', '四', '五', '六', '七', '八', '九', '十',
                        '零', '〇', '百', '千', '壹', '贰', '叁', '肆', '柒', '捌',
                        '玖', '拾', '之', '%', '¥', '分', '$', '.', '点', '第', '每']
    @@subunit_symbols = ['多', '公', '英', '厘', '毫', '微', '纳', '海', '平', '立',
                         '方', '摄', '华', '氏', '美', '日', '澳', '港', '台', '新',
                         '个', '百', '佰', '千', '仟', '万', '萬', '亿', '兆', '吉']
    @@unit_symbols = ['刻', '章', '回', '节', '名', '个', '届', '次', '集', '元',
                      '角', '例', '人', '斤', '克', '吨', '米', '里', '升', '码',
                      '尺', '寸', '杆', '顷', '亩', '磅', '镑', '桶', '度', '秒',
                      '分', '卡', '焦', '瓦', '匹', '圆', '币', '年', '月', '日',
                      '时', '秒', '点', '百', '佰', '仟', '千', '万', '萬', '亿',
                      '兆', '吉', '块', '半', '岁', '家', '所', '期', '场', '投',
                      '中', '辆', '只', '头']

    def initialize
      @word = ''       # completed number+unit groups
      @number = ''     # group currently being collected
      @unit = false    # last accepted character closed a group
      @subunit = false # last accepted character was a sub-unit
      super
    end

    # Feeds one character to the engine.
    #
    # The character is first tried as a number symbol, else as a sub-unit
    # symbol; independently of that outcome it is then tried as a unit symbol
    # (some characters appear in more than one list).
    #
    # Returns [true, nil] when any of these accepted it. Otherwise returns
    # [false, word], where +word+ is the completed text if there is any and
    # the pending number otherwise, and resets the engine.
    def process(char)
      matched = take_number(char) || take_subunit(char)
      matched = true if take_unit(char, matched)

      matched ? [true, nil] : [false, emit_pending]
    end

    private

    # Accepts +char+ as a number symbol unless a sub-unit was just read
    # without a unit having closed the group.
    def take_number(char)
      return false unless (!@subunit || @unit) && @@number_symbols.include?(char)

      @number << char
      @unit = false
      @subunit = false
      true
    end

    # Accepts +char+ as a sub-unit symbol when a number is pending or a unit
    # was just read.
    def take_subunit(char)
      return false unless (@number != '' || @unit) && @@subunit_symbols.include?(char)

      @number << char
      @subunit = true
      true
    end

    # Accepts +char+ as a unit symbol when a number is pending or a sub-unit
    # was just read: the pending number moves into the word. +char+ itself is
    # appended only when an earlier step has not already stored it.
    def take_unit(char, already_stored)
      return false unless (@number != '' || @subunit) && @@unit_symbols.include?(char)

      @word << @number
      @word << char unless already_stored
      @number = ''
      @unit = true
      true
    end

    # Returns the collected text and clears all state.
    def emit_pending
      word = (@word != '') ? @word : @number
      @word = ''
      @number = ''
      @unit = false
      @subunit = false
      word
    end
  end
end
@@ -0,0 +1,10 @@
1
+ #coding: utf-8
2
module RsegFilter
  # Filter that replaces common function words (conjunctions, particles,
  # pronouns, ...) by the marker :conjunction.
  class Conjunction
    @@conjunctions = %W(给 的 说 对 在 和 是 被 最 所 那 由 这 有 将 你 会 与 他 为 不 没 很 了 啊 哦 呵 把 去 从)

    # Returns :conjunction when +char+ is in the list, +char+ itself otherwise.
    def self.filter(char)
      return :conjunction if @@conjunctions.include?(char)

      char
    end
  end
end
@@ -0,0 +1,20 @@
1
+ #coding: utf-8
2
module RsegFilter
  # Filter that converts fullwidth digits, Latin letters and a few
  # punctuation marks to their ASCII counterparts, so that the later stages
  # only have to deal with one form.
  #
  # The lookup table used to be written out literally, but as it stood every
  # ASCII character was mapped to itself, so fullwidth input was never
  # converted. The keys are now derived from the code points instead of being
  # typed in, which keeps them safe from text normalization.
  class Fullwidth
    # In the Unicode "Halfwidth and Fullwidth Forms" block, U+FF01..U+FF5E
    # mirror ASCII U+0021..U+007E at this fixed distance.
    offset = 0xFEE0
    convertible = ('0'..'9').to_a + ('a'..'z').to_a + ('A'..'Z').to_a + ['-', '+', ',', '/']

    @@fullwidth_chars = {}
    convertible.each do |ascii|
      @@fullwidth_chars[[ascii.ord + offset].pack('U')] = ascii
    end
    # Two marks outside the fullwidth block that are normalized as well.
    @@fullwidth_chars['—'] = '-'
    @@fullwidth_chars['·'] = '.'

    # Returns the ASCII replacement of +char+, or +char+ itself when there is
    # none.
    def self.filter(char)
      @@fullwidth_chars.fetch(char, char)
    end
  end
end
@@ -0,0 +1,15 @@
1
+ #coding: utf-8
2
module RsegFilter
  # Filter that replaces punctuation and whitespace by the marker :symbol.
  #
  # The previous literal list contained most ASCII marks twice and no
  # fullwidth punctuation at all, so marks such as the fullwidth question
  # mark, colon or parentheses (and the ideographic space) were not treated
  # as separators. The fullwidth variants are now derived from the ASCII
  # list by code point.
  class Symbol
    cjk = ['、', '‘', '。', '》', '《', '“', '…', '’', '”', '〕', '〈', '〉',
           '「', '」', '『', '』', '〖', '〗', '【', '】']
    ascii = ['`', '[', ']', '=', ';', '|', '?', ':', '{', '}', ')', '(', '*',
             '#', '!', '~', '<', '>', '@', '^', '&', '\\', "'", "\n", "\t",
             "\r", ' ', '-', '/', '+', ',']

    # U+FF01..U+FF5E mirror the printable ASCII range U+0021..U+007E at a
    # fixed distance of 0xFEE0; control characters and the plain space have
    # no counterpart there.
    fullwidth = ascii.select { |c| c.ord.between?(0x21, 0x7E) }
                     .map { |c| [c.ord + 0xFEE0].pack('U') }
    # The fullwidth counterpart of the space is the ideographic space U+3000.
    fullwidth << [0x3000].pack('U')

    @@separators = cjk + ascii + fullwidth

    # Returns :symbol when +char+ is a separator, +char+ itself otherwise.
    def self.filter(char)
      @@separators.include?(char) ? :symbol : char
    end
  end
end
data/lib/rseg.rb ADDED
@@ -0,0 +1,139 @@
1
+ #coding: utf-8
2
+ $KCODE = 'UTF8'
3
+
4
+ require 'singleton'
5
+ require 'net/http'
6
+
7
+ require File.join(File.dirname(__FILE__), 'engines/engine')
8
+ require File.join(File.dirname(__FILE__), 'engines/dict')
9
+ require File.join(File.dirname(__FILE__), 'engines/english')
10
+ require File.join(File.dirname(__FILE__), 'engines/number')
11
+ require File.join(File.dirname(__FILE__), 'engines/name')
12
+
13
+ require File.join(File.dirname(__FILE__), 'filters/fullwidth')
14
+ require File.join(File.dirname(__FILE__), 'filters/symbol')
15
+ require File.join(File.dirname(__FILE__), 'filters/conjunction')
16
+
17
# Entry point of the segmenter.
#
# A single shared instance (Singleton) owns the engines and filters; the
# class-level methods are the public API. Every character of the input is
# first passed through the filters and then offered to all engines that are
# still running; when no engine accepts it, the word collected so far is
# emitted and the character is processed again from a clean state.
class Rseg
  include Singleton
  include RsegEngine
  include RsegFilter

  class << self
    # Sets the dictionary file used by the Dict engine.
    def dict_path=(path)
      RsegEngine::Dict.dict_path = path
    end

    # Segments +input+ in-process and returns an Array of words.
    def segment(input)
      segmenter = instance
      segmenter.input = input
      segmenter.segment
    end

    # Forces creation of the shared instance (and thereby of its engines).
    # Returns nil.
    def load
      instance
      nil
    end

    # Segments +input+ by posting it to a local rseg_server. Returns the
    # Array of words, or a one-element Array holding an error text when the
    # server answers with a non-200 status or the request raises.
    def remote_segment(input)
      failure = ["Can't connect to http://localhost:4100\nUse rseg_server to start it"]

      response = Net::HTTP.post_form(URI.parse('http://127.0.0.1:4100/seg'), :input => input)
      response.code == '200' ? response.body.split(' ') : failure
    rescue
      failure
    end
  end

  # Text to be segmented by the next call to #segment.
  attr_writer :input

  def initialize
    @input = ''
    @words = []
    init_engines
    init_filters
  end

  # Segments the current input and returns the Array of words.
  def segment
    @words = []

    @input.chars.each { |origin| process(filter(origin), origin) }

    # A trailing separator flushes whatever the engines still hold.
    process(:symbol, '')
    @words
  end

  private

  # Runs +char+ through all filters in order; each filter receives the
  # result of the previous one.
  def filter(char)
    @filters.inject(char) { |result, klass| klass.filter(result) }
  end

  # Offers +char+ to every running engine. +origin+ is the unfiltered
  # character, which is what gets emitted when no engine has anything
  # pending.
  def process(char, origin)
    matched = false
    word = ''

    engines.each do |engine|
      if engine.running?
        # word deliberately keeps the answer of the last engine asked.
        match, word = engine.process(char)
        if match
          matched = true
        else
          engine.stop
        end
      end
    end

    return if matched

    # Nobody accepted the character: start over with all engines.
    reset_engines

    if word == ''
      @words << origin unless char == :symbol
      return
    end

    case word
    when String then @words << word
    when Array then reprocess(word)
    end

    # re-process current char
    process(char, origin)
  end

  # Handles an Array of characters handed back by an engine: all but the
  # last are processed again, then a separator, then the last one.
  def reprocess(word)
    last = word.pop

    word.each { |char| process(char, char) }

    process(:symbol, :symbol)
    process(last, last)
  end

  # Switches every engine back on.
  def reset_engines
    engines.each { |engine| engine.run }
  end

  # Assigns the engine list unless one is already set.
  def engines=(engines)
    @engines ||= engines
  end

  def engines
    @engines
  end

  def init_filters
    @filters = [Fullwidth, Symbol]
  end

  def init_engines
    @engines ||= [Dict, English, Number, Name].map { |klass| klass.new }
  end
end
data/public/screen.css ADDED
@@ -0,0 +1,123 @@
1
+ div.clear {clear: both;}
2
+ body {background: #EEEEEE; margin: 0; padding: 0;
3
+ font-family: 'Lucida Grande', 'Lucida Sans Unicode',
4
+ 'Garuda';}
5
+ code {font-family: 'Lucida Console', monospace;
6
+ font-size: 12px;}
7
+ li {height: 18px;}
8
+ ul {list-style: none; margin: 0; padding: 0;}
9
+ ol:hover {cursor: pointer;}
10
+ ol li {white-space: pre;}
11
+ #explanation {font-size: 12px; color: #666666;
12
+ margin: 20px 0 0 100px;}
13
+ /* WRAP */
14
+ #wrap {width: 860px; background: #FFFFFF; margin: 0 auto;
15
+ padding: 30px 50px 20px 50px;
16
+ border-left: 1px solid #DDDDDD;
17
+ border-right: 1px solid #DDDDDD;}
18
+ /* HEADER */
19
+ #header {margin: 0 auto 25px auto;}
20
+ h1 {margin: 0; font-size: 36px; color: #981919;}
21
+ h2 {margin: 0; font-size: 22px; color: #333333;}
22
+ #header ul {margin: 0; font-size: 12px; color: #666666;}
23
+ #header ul li strong{color: #444444;}
24
+ #header ul li {display: inline; padding: 0 10px;}
25
+ #header ul li.first {padding-left: 0;}
26
+ #header ul li.last {border: 0; padding-right: 0;}
27
+
28
+ #content {width: 860px; margin: 0 auto 10px auto;}
29
+
30
+ h3 {float: left; width: 100px; margin-bottom: 10px;
31
+ color: #981919; font-size: 14px; font-weight: bold;}
32
+
33
+ #footer {width: 860px; margin: 0 auto 10px auto; clear:both;
34
+ font-size: 18px; border-top:1px solid #000; padding-top: 10px;
35
+ text-align: right;}
36
+
37
+ textarea {font-size: 18px; padding:10px;}
38
+ #segform { width: 430px; float: left; font-size: 18px;}
39
+ #segresult { width: 408px; float: left; font-size: 18px;
40
+ padding: 10px; color: #D12F19;}
41
+ /* --------------------------------------------------------------
42
+
43
+ buttons.css
44
+ * Gives you some great CSS-only buttons.
45
+
46
+ Created by Kevin Hale [particletree.com]
47
+ * particletree.com/features/rediscovering-the-button-element
48
+
49
+ See Readme.txt in this folder for instructions.
50
+
51
+ -------------------------------------------------------------- */
52
+
53
+ button {
54
+ display:block;
55
+ float:left;
56
+ margin:0 0.583em 0.667em 0;
57
+ padding:5px 10px 5px 7px; /* Links */
58
+
59
+ border:1px solid #dedede;
60
+ border-top:1px solid #eee;
61
+ border-left:1px solid #eee;
62
+
63
+ background-color:#f5f5f5;
64
+ font-family:"Lucida Grande", Tahoma, Arial, Verdana, sans-serif;
65
+ font-size:100%;
66
+ line-height:130%;
67
+ text-decoration:none;
68
+ font-weight:bold;
69
+ color:#565656;
70
+ cursor:pointer;
71
+ }
72
+ button {
73
+ width:auto;
74
+ overflow:visible;
75
+ padding:4px 10px 3px 7px; /* IE6 */
76
+ }
77
+ button[type] {
78
+ padding:4px 10px 4px 7px; /* Firefox */
79
+ line-height:17px; /* Safari */
80
+ }
81
+ *:first-child+html button[type] {
82
+ padding:4px 10px 3px 7px; /* IE7 */
83
+ }
84
+ button img {
85
+ margin:0 3px -3px 0 !important;
86
+ padding:0;
87
+ border:none;
88
+ width:16px;
89
+ height:16px;
90
+ float:none;
91
+ }
92
+
93
+
94
+ /* Button colors
95
+ -------------------------------------------------------------- */
96
+
97
+ /* Standard */
98
+ button:hover {
99
+ background-color:#dff4ff;
100
+ border:1px solid #c2e1ef;
101
+ color:#336699;
102
+ }
103
+
104
+ /* Positive */
105
+ body .positive {
106
+ color:#529214;
107
+ }
108
+ button.positive:hover {
109
+ background-color:#E6EFC2;
110
+ border:1px solid #C6D880;
111
+ color:#529214;
112
+ }
113
+
114
+ /* Negative */
115
+ body .negative {
116
+ color:#d12f19;
117
+ }
118
+ button.negative:hover {
119
+ background:#fbe3e4;
120
+ border:1px solid #fbc2c4;
121
+ color:#d12f19;
122
+ }
123
+
data/views/index.haml ADDED
@@ -0,0 +1,8 @@
1
+ #segform
2
+ %form{:action => '/segment', :method => 'post'}
3
+ %p
4
+ %textarea{:id => 'input', :rows => '15', :cols => '35', :name => 'input'}= @input || '输入要分词的文章'
5
+ %p
6
+ %button.negative{ :type => "submit" } 开始分词
7
+ #segresult
8
+ %p= @result
data/views/layout.haml ADDED
@@ -0,0 +1,16 @@
1
+ !!! Strict
2
+ %html{ :lang => "en", :"xml:lang" => "en", :xmlns => "http://www.w3.org/1999/xhtml" }
3
+ %head
4
+ %meta{ :content => "text/html; charset=utf-8", :"http-equiv" => "Content-Type" }
5
+ %meta{ :content => "zh_CN", :"http-equiv" => "Content-Language" }
6
+ %title= "Rseg中文分词"
7
+ %link{ :rel => 'stylesheet', :href => '/screen.css', :type => 'text/css', :media => "screen"}
8
+
9
+ %body
10
+ #wrap
11
+ #header
12
+ %h1= "Rseg 中文分词"
13
+ %address.watermark
14
+ #content.condensed= yield
15
+ #footer= "作者: 张元一 <br />EMail:zhangyuanyi#gmail.com"
16
+
metadata ADDED
@@ -0,0 +1,104 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rseg1.9
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.5
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Yuanyi Zhang
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2010-11-28 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: haml
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: sinatra
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ description: A Chinese Word Segmentation(中文分词) routine in pure Ruby
47
+ email: zhangyuanyi@gmail.com
48
+ executables:
49
+ - rseg
50
+ - rseg_server
51
+ extensions: []
52
+ extra_rdoc_files:
53
+ - LICENSE
54
+ - README
55
+ files:
56
+ - .gitignore
57
+ - LICENSE
58
+ - README
59
+ - Rakefile
60
+ - VERSION
61
+ - bin/rseg
62
+ - bin/rseg_server
63
+ - dict/dict.hash
64
+ - lib/app.rb
65
+ - lib/builder/dict.rb
66
+ - lib/engines/dict.rb
67
+ - lib/engines/engine.rb
68
+ - lib/engines/english.rb
69
+ - lib/engines/name.rb
70
+ - lib/engines/number.rb
71
+ - lib/filters/conjunction.rb
72
+ - lib/filters/fullwidth.rb
73
+ - lib/filters/symbol.rb
74
+ - lib/rseg.rb
75
+ - public/screen.css
76
+ - views/index.haml
77
+ - views/layout.haml
78
+ homepage: http://github.com/yzhang/rseg
79
+ licenses: []
80
+ post_install_message:
81
+ rdoc_options:
82
+ - --charset=UTF-8
83
+ require_paths:
84
+ - lib
85
+ required_ruby_version: !ruby/object:Gem::Requirement
86
+ none: false
87
+ requirements:
88
+ - - ! '>='
89
+ - !ruby/object:Gem::Version
90
+ version: '0'
91
+ required_rubygems_version: !ruby/object:Gem::Requirement
92
+ none: false
93
+ requirements:
94
+ - - ! '>='
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ requirements: []
98
+ rubyforge_project:
99
+ rubygems_version: 1.8.24
100
+ signing_key:
101
+ specification_version: 3
102
+ summary: A Chinese Word Segmentation(中文分词) routine in pure Ruby
103
+ test_files: []
104
+ has_rdoc: