rseg_ggharry 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: d6b111f0c4f9c4a3b6ee178975667fae0f72e355
4
+ data.tar.gz: f5ae41efa261a06aafb4ebfe24f0a4887889eb9d
5
+ SHA512:
6
+ metadata.gz: c5547ccbab57137b0c4d52f0a7b406766d8823261e8045760db9fa52c2106b420be76c11cb1a8db5ce7beeb528b12d0a9114c4d3919e753cbf99c9dcb30fdbc6
7
+ data.tar.gz: 04c3476ec8cc0798daa7fe4fc33ebbb9222f75977e823fd0574592e19fd03683ff433495bf8f8691191d6b95e59748838247e676696126767d19f72013a9fff7
data/.gitignore ADDED
@@ -0,0 +1,3 @@
1
+ test
2
+ pkg
3
+ .DS_Store
data/LICENSE ADDED
@@ -0,0 +1,30 @@
1
+ Rseg includes two built-in dictionaries:
2
+
3
+ * CC-CEDICT (http://cc-cedict.org/wiki/): Creative Commons Attribution-Share Alike 3.0 License (http://creativecommons.org/licenses/by-sa/3.0/)
4
+ * Wikipedia Chinese article title list (http://download.wikimedia.org/zhwiki/): Creative Commons Attribution-Share Alike 3.0 License (http://creativecommons.org/licenses/by-sa/3.0/)
5
+
6
+ The codes and others in Rseg are licensed under MIT license:
7
+
8
+ ===============================
9
+ Copyright (c) 2009 Yuanyi Zhang
10
+
11
+ Permission is hereby granted, free of charge, to any person
12
+ obtaining a copy of this software and associated documentation
13
+ files (the "Software"), to deal in the Software without
14
+ restriction, including without limitation the rights to use,
15
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
16
+ copies of the Software, and to permit persons to whom the
17
+ Software is furnished to do so, subject to the following
18
+ conditions:
19
+
20
+ The above copyright notice and this permission notice shall be
21
+ included in all copies or substantial portions of the Software.
22
+
23
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
25
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
27
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
28
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
29
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
30
+ OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,57 @@
1
+ Introduction
2
+ ========
3
+ Rseg is a Chinese Word Segmentation(中文分词) routine in pure Ruby.
4
+
5
+ The algorithm is based on this article: http://xiecc.blog.163.com/blog/static/14032200671110224190/
6
+
7
+ Usage
8
+ ========
9
+
10
+ Rseg now supports two modes: inline mode and C/S (client/server) mode.
11
+
12
+ 1. Inline mode
13
+
14
+ > require 'rubygems'
15
+ > require 'rseg'
16
+ > Rseg.segment("需要分词的文章")
17
+ ['需要', '分词', '的', '文章']
18
+
19
+ The first call to Rseg#segment takes about 30 seconds because it has to load the dictionary; subsequent calls are very fast. You can also call Rseg#load to load the dictionaries ahead of time.
20
+
21
+ 2. C/S mode
22
+
23
+ $ rseg_server
24
+ == Sinatra/0.9.4 has taken the stage on 4100
25
+
26
+ This will start rseg server on http://localhost:4100
27
+
28
+ You can visit it via your browser or the rseg command.
29
+
30
+ $ rseg '需要分词的文章'
31
+ 需要 分词 的 文章
32
+
33
+ You can also access the server with Rseg#remote_segment:
34
+
35
+ $ irb
36
+ > require 'rubygems'
37
+ > require 'rseg'
38
+ > Rseg.remote_segment("需要分词的文章") # This will be very fast
39
+ ['需要', '分词', '的', '文章']
40
+
41
+ Performance
42
+ ========
43
+ About 5M character/s on my Macbook (Intel Core 2 Duo 2GHz/4G mem).
44
+
45
+ License
46
+ ========
47
+
48
+ Rseg includes two built-in dictionaries:
49
+
50
+ * CC-CEDICT (http://cc-cedict.org/wiki/) with Creative Commons Attribution-Share Alike 3.0 License (http://creativecommons.org/licenses/by-sa/3.0/)
51
+ * Wikipedia Chinese article title list (http://download.wikimedia.org/zhwiki/) with Creative Commons Attribution-Share Alike 3.0 License(http://creativecommons.org/licenses/by-sa/3.0/)
52
+
53
+ The code and other files in Rseg are licensed under the MIT license.
54
+
55
+ Feedback
56
+ ========
57
+ All feedback is welcome: Yuanyi Zhang (zhangyuanyi#gmail.com)
data/Rakefile ADDED
@@ -0,0 +1,21 @@
1
+ require 'rake'
2
+ require 'rake/testtask'
3
+ require 'rcov/rcovtask'
4
+
5
# Gem packaging tasks. Jeweler is optional: when it (or one of its
# dependencies) cannot be loaded, no packaging task is defined and an
# installation hint is printed instead.
begin
  require 'jeweler'

  Jeweler::Tasks.new do |gem|
    # Identity
    gem.name     = "rseg"
    gem.authors  = ["Yuanyi Zhang"]
    gem.email    = "zhangyuanyi@gmail.com"
    gem.homepage = "http://github.com/yzhang/rseg"

    # Descriptions
    gem.summary     = "A Chinese Word Segmentation(中文分词) routine in pure Ruby"
    gem.description = "A Chinese Word Segmentation(中文分词) routine in pure Ruby"

    # Packaged content
    gem.executables = ["rseg", 'rseg_server']
    gem.files = FileList["[A-Z]*", "{bin,lib,public,views}/**/*", '.gitignore', 'dict/dict.hash']

    # Runtime dependencies
    ['haml', 'sinatra'].each { |dependency| gem.add_dependency dependency }
  end
rescue LoadError
  puts "Jeweler, or one of its dependencies, is not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.7
data/bin/rseg ADDED
@@ -0,0 +1,11 @@
1
#!/usr/bin/env ruby
# Command-line client: sends the first argument to the rseg server through
# Rseg.remote_segment and prints the returned words separated by spaces.
require File.join(File.dirname(__FILE__), '/../lib/rseg')

text = ARGV.first

# No text (or an empty string) given: show the usage line and quit.
if text.to_s == ''
  puts "Usage: rseg <text>"
  exit
end

words = Rseg.remote_segment(text)
puts words.join(' ')
data/bin/rseg_server ADDED
@@ -0,0 +1,16 @@
1
#!/usr/bin/env ruby
# Starts the Rseg HTTP server: loads the dictionaries up front, then runs the
# Sinatra application on 127.0.0.1:4100.
bin_dir = File.dirname(__FILE__)
$:.unshift File.expand_path(bin_dir)

require 'rubygems'
require 'haml'

# Load the segmenter first, then the web application that calls it.
['/../lib/rseg', '/../lib/app'].each do |relative|
  require File.join(bin_dir, relative)
end

puts "Loading dictionaries, this will take about 30 seconds."
puts "Please wait a moment..."
Rseg.load
puts "Dictionaries loaded."

App.run! :host => '127.0.0.1', :port => 4100, :environment => 'production'
exit
data/dict/dict.hash ADDED
Binary file
data/lib/app.rb ADDED
@@ -0,0 +1,22 @@
1
+ require 'sinatra/base'
2
+
3
# Sinatra front end for Rseg.
#
# Routes:
#   GET  /         renders the input form.
#   POST /segment  segments params[:input] and re-renders the form with the
#                  space-joined result.
#   POST /seg      segments params[:input] and returns the space-joined
#                  result as the response body (used by Rseg.remote_segment).
class App < Sinatra::Base
  set :root, File.dirname(__FILE__) + "/.."
  set :app_file, __FILE__

  get '/' do
    haml :index
  end

  post '/segment' do
    # A request without an `input` field yields nil; coerce it to an empty
    # string so the segmenter is never handed nil.
    @input = params[:input].to_s
    @result = Rseg.segment(@input).join(' ')
    haml :index
  end

  post '/seg' do
    # Same nil guard as /segment.
    @input = params[:input].to_s
    @result = Rseg.segment(@input)
    @result.join(' ')
  end
end
@@ -0,0 +1,41 @@
1
+ # encoding: utf-8
2
+
3
# Adds every line of the word list at +path+ to the nested-hash trie +tree+.
#
# Each character of a line becomes one level of nesting; the node reached by
# the last character is marked with <tt>:end => true</tt>. "\n" and "\r" are
# ignored, and lines that contain nothing else are skipped (they used to
# raise NoMethodError because no node had been created for them).
#
# path - String path of a dictionary file with one word per line.
# tree - Hash that is updated in place.
def process(path, tree)
  File.open(path, 'r') do |file|
    file.each_line do |line|
      node = nil
      line.chars.each do |c|
        next if c == "\n" || c == "\r"
        # The first character hangs off the root; later ones off the
        # node reached so far.
        parent = node || tree
        parent[c] ||= {}
        node = parent[c]
      end
      # node is still nil for a blank line: nothing to mark.
      node[:end] = true if node
    end
  end
end
21
+
22
# Builds the dictionary trie from the two bundled word lists and writes it,
# marshalled, to the file named by hash_path.
def build
  trie = {}
  dict_dir = File.join(File.dirname(__FILE__), '../../dict')

  ['cedict.zh_CN.utf8', 'wikipedia.zh.utf8'].each do |dictionary|
    puts "Processing #{dictionary}..."
    process(File.join(dict_dir, dictionary), trie)
  end

  File.open(hash_path, "wb") do |io|
    Marshal.dump(trie, io)
  end
  puts 'Done'
end
35
+
36
# Location of the marshalled dictionary, relative to this file's directory.
def hash_path
  here = File.dirname(__FILE__)
  File.join(here, '..', '..', 'dict', 'dict.hash')
end
39
+
40
+ build
41
+
@@ -0,0 +1,51 @@
1
module RsegEngine
  # Dictionary engine: walks a nested-hash trie one character at a time.
  #
  # The trie is loaded once, on the first instantiation, from +dict_path+ and
  # then shared by all instances through a class variable.
  class Dict < Engine
    @@root = nil
    @@dict_path = File.join(File.dirname(__FILE__), '../../dict/dict.hash')

    # Sets the file the trie is loaded from.
    def self.dict_path=(path)
      @@dict_path = path
    end

    # Returns the file the trie is loaded from.
    def self.dict_path
      @@dict_path
    end

    def initialize
      @@root ||= load_dict(@@dict_path)
      @word = ''
      @node = @@root
      super
    end

    # Feeds one character to the engine.
    #
    # Returns [true, nil] while +char+ extends the current trie path.
    # Otherwise resets to the root and returns [false, word], where word is
    # the collected String when the path ended on a complete entry (or is a
    # single character), or else an Array of the collected characters.
    def process(char)
      child = @node[char]
      if child
        @word << char
        @node = child
        return [true, nil]
      end

      collected = @word
      complete = @node[:end] || collected.chars.to_a.length == 1
      word = complete ? collected : collected.chars.to_a

      @node = @@root
      @word = ''
      [false, word]
    end

    private

    # Reads the marshalled trie from +path+.
    def load_dict(path)
      File.open(path, "rb") do |io|
        Marshal.load(io)
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
module RsegEngine
  # Base class of all engines: only tracks whether the engine is running.
  class Engine
    # A new engine starts out running.
    def initialize
      @running = true
    end

    # Marks the engine as stopped.
    def stop; @running = false; end

    # Marks the engine as running again.
    def run; @running = true; end

    # Returns the current running flag.
    def running?; @running; end
  end
end
@@ -0,0 +1,26 @@
1
module RsegEngine
  # The ASCII letters a-z followed by A-Z.
  LETTER_SYMBOLS = [*'a'..'z', *'A'..'Z']

  # Engine that collects runs of ASCII letters into one word.
  class English < Engine
    def initialize
      @word = ''
      super
    end

    # Feeds one character to the engine.
    #
    # Returns [true, nil] when +char+ is a letter (it is appended to the
    # current word); otherwise returns [false, word] with the word collected
    # so far (possibly empty) and starts a new one.
    def process(char)
      if LETTER_SYMBOLS.include?(char)
        @word << char
        return [true, nil]
      end

      finished = @word
      @word = ''
      [false, finished]
    end
  end
end
@@ -0,0 +1,51 @@
1
module RsegEngine
  # Engine that recognises short personal names: one character from the
  # last-name list followed by characters from the first-name list, up to
  # three characters in total.
  class Name < Engine
    @@last_names = %W(丁 卜 刁 七 弓 干 于 王 尤 孔 方 申 白 甘 田 包 石 左 平 司 皮 史 池 艾 年 匡 充 江 印
                      促 伊 伍 安 任 米 促 牟 向 吉 成 伏 吕 李 吴 沈 何 贝 狄 祁 杜 汪 阮 邢 汲 别 辛 冷 利 沃 谷
                      扶 步 那 沙 周 金 吕 花 孟 和 邵 房 抗 灰 明 屈 松 牧 宓 武 幸 卓 易 尚 邰 空 竺 岳 东 林
                      施 姜 俞 查 封 秋 帅 祖 羿 柯 茅 柳 姚 纪 宣 咸 库 侯 洪 胡 哈 宣 郁 祝 苗 禹 娄
                      秦 奚 倪 度 凌 宰 宦 师 徐 翁 班 马 时 晃 乌 夏 贡 柴 能 家 宫 敖 索 晏 桑 高 凌 桂 容 姬 劳 桑 桂 袁 时 祝 席 徐 高 夏 凌 洪 翁 家 芮 乌 祖 索 贡
                      许 张 曹 戚 梅 屠 盛 崖 章 鱼 国 商 扈 寇 终 冯 苗 康 常 茅 闵 麻 胡 崔 邢 条 符 宿 堵 浦 习 鱼
                      梁 富 曾 程 项 钮 舒 彭 费 童 云 喻 嵇 范 费 贺 毕 付 黄 邵 祁 阮 强 童 邱 解 贲 单 富 钮 荀 惠 邴 焦 班 甯 钭 景 邰 劳 茹 寇 荆
                      莫 际 景 须 杨 詹 郎 雷 贾 路 骆 虞 经 裘 郁 滑 甄 靳 詹 闻 逄 雍 訾 郎 农 路 骆 虞 经 裘 郁 滑 靳 闻 逄 雍
                      赵 黄 褚 凤 郝 齐 臧 熊 管 裴 荣 郗 韶 郜 黎 翟 寿 通
                      卫 葛 鲁 乐 谈 董 樊 万 诸 刘 叶 都 满 广 殴 巩 养
                      郭 钱 陈 陶 鲍 穆 郭 堆 卢 陆 龙 噪 鄂 阴 苍 燕 冀 衡 融 蒯 逯
                      蒋 魏 谢 邹 潘 滕 邬 戴 钟 蔡 缪 应 储 糜 隗 历 蒲 慕 蔚 隆 鞠 关
                      韩 萧 颜 庞 麦 双 璩 濮 聂 丰 看
                      郑 严 蓟 薄 谭 罗 买 蓝 蓬 怀 党 饶 顾 苏 龚 边 栾 权) #:nodoc:

    @@first_names = %W(文 铭 菁 郁 怡 智 德 祥 志 华 孟 庆 雅 佩 晓 蓉 明 仁 宇 青 慧 豪 琪 安
                       惠 宗 信 盈 君 秀 敏 伶 佳 国 荣 忠 宏 育 丽 圣 淑 彦 龙 冠 后 静 娟 子
                       嘉 瑞 柏 弘 芳 正 玮 贞 如 凯 元 士 伟 杰 颍 霖 玲 仪 珮 英 建 政 真 珍
                       美 世 立 秋 婷 贤 瑜 中 玉 维 莹 翔 家 芬 昌 裕 雯 萍 永 成 宜 鸿 珊 民
                       欣 哲 良 伦 燕 梦 磊 丹 元 一 昌 红 健) #:nodoc:

    def initialize
      @word = ''
      @last = false # true once a last-name character has been consumed
      super
    end

    # Feeds one character to the engine.
    #
    # Returns [true, nil] when +char+ starts a name (a last-name character)
    # or continues one (a first-name character while the name is shorter
    # than three characters). Otherwise returns [false, word] with the name
    # collected so far (possibly empty) and resets the engine state.
    def process(char)
      match = false
      word = nil

      if !@last && @@last_names.include?(char)
        @word << char
        match = true
        @last = true
      elsif @last && @word.chars.to_a.length < 3 && @@first_names.include?(char)
        # The stray `@unit = true` that used to be set here was removed:
        # nothing in this class reads it.
        @word << char
        match = true
      else
        word = @word
        @word = ''
        @last = false
      end

      [match, word]
    end
  end
end
@@ -0,0 +1,61 @@
1
module RsegEngine
  # Engine that groups numerals, their sub-unit prefixes and unit characters
  # into one word.
  class Number < Engine
    # Digits, Chinese numerals and characters accepted inside a number.
    @@number_symbols = %w(0 1 2 3 4 5 6 7 8 9
                          一 二 三 四 五 六 七 八 九 十
                          零 〇 百 千 壹 贰 叁 肆 柒 捌
                          玖 拾 之 % ¥ 分 $ . 点 第 每)
    # Characters accepted between a number and its unit.
    @@subunit_symbols = %w(多 公 英 厘 毫 微 纳 海 平 立
                           方 摄 华 氏 美 日 澳 港 台 新
                           个 百 佰 千 仟 万 萬 亿 兆 吉)
    # Characters that complete a number as a unit.
    @@unit_symbols = %w(刻 章 回 节 名 个 届 次 集 元
                        角 例 人 斤 克 吨 米 里 升 码
                        尺 寸 杆 顷 亩 磅 镑 桶 度 秒
                        分 卡 焦 瓦 匹 圆 币 年 月 日
                        时 秒 点 百 佰 仟 千 万 萬 亿
                        兆 吉 块 半 岁 家 所 期 场 投
                        中 辆 只 头)

    def initialize
      @word = ''
      @number = ''
      @unit = false
      @subunit = false
      super
    end

    # Feeds one character to the engine.
    #
    # Returns [true, nil] while +char+ extends the current number/unit
    # sequence. Otherwise returns [false, word], where word is the completed
    # word if there is one, or else the pending number text (possibly
    # empty), and resets all state.
    def process(char)
      matched = false

      # Step 1: numeral or sub-unit character.
      if (!@subunit || @unit) && @@number_symbols.include?(char)
        @number << char
        @unit = false
        @subunit = false
        matched = true
      elsif (!@number.empty? || @unit) && @@subunit_symbols.include?(char)
        @number << char
        @subunit = true
        matched = true
      end

      # Step 2: unit character. This also runs when step 1 already consumed
      # char, in which case char is not appended a second time.
      if (!@number.empty? || @subunit) && @@unit_symbols.include?(char)
        @word << @number
        @word << char unless matched
        @number = ''
        @unit = true
        matched = true
      end

      return [true, nil] if matched

      # Nothing matched: hand back what was collected and start over.
      finished = @word.empty? ? @number : @word
      @word = ''
      @number = ''
      @unit = false
      @subunit = false
      [false, finished]
    end
  end
end
@@ -0,0 +1,9 @@
1
module RsegFilter
  # Filter that replaces a listed function character with :conjunction.
  class Conjunction
    @@conjunctions = %W(给 的 说 对 在 和 是 被 最 所 那 由 这 有 将 你 会 与 他 为 不 没 很 了 啊 哦 呵 把 去 从)

    # Returns :conjunction when +char+ is in the list, otherwise +char+
    # unchanged.
    def self.filter(char)
      return :conjunction if @@conjunctions.include?(char)

      char
    end
  end
end
@@ -0,0 +1,19 @@
1
module RsegFilter
  # Filter that folds full-width characters to their ASCII counterparts.
  #
  # Covered: full-width digits, a-z, A-Z and the full-width forms of
  # '-', '+', ',' and '/'. In addition the em dash maps to '-' and the
  # middle dot maps to '.'. Every other character passes through unchanged.
  #
  # NOTE(review): the table used to list ASCII keys mapped to themselves,
  # which looks like the full-width keys were lost to width normalisation;
  # the keys are now built from code points so that cannot happen again.
  # Confirm against the upstream file.
  class Fullwidth
    # A full-width form (U+FF01..U+FF5E) is its ASCII code point plus 0xFEE0.
    fullwidth_offset = 0xFEE0
    foldable = ('0'..'9').to_a + ('a'..'z').to_a + ('A'..'Z').to_a + ['-', '+', ',', '/']

    @@fullwidth_chars = {
      "\u2014" => '-', # em dash
      "\u00B7" => '.'  # middle dot
    }
    foldable.each do |ascii|
      @@fullwidth_chars[[ascii.ord + fullwidth_offset].pack('U')] = ascii
    end

    class << self
      # Returns the ASCII replacement for +char+, or +char+ itself when it
      # has no entry in the table.
      def filter(char)
        @@fullwidth_chars.fetch(char, char)
      end
    end
  end
end
@@ -0,0 +1,14 @@
1
module RsegFilter
  # Filter that replaces separator characters (punctuation and whitespace)
  # with :symbol.
  #
  # NOTE(review): the original list contained many ASCII entries twice,
  # which looks like full-width punctuation that was lost to width
  # normalisation. The list is therefore built from the distinct ASCII
  # separators, the CJK punctuation, and the full-width form of every
  # printable ASCII separator (plus the ideographic space). Confirm against
  # the upstream file.
  class Symbol
    ascii_separators = ['`', '[', ']', '=', ';', '|', '?', ':', '{', '}', ')', '(',
                        '*', '#', '!', '~', '<', '>', '@', '^', '&', '\\', "'",
                        '-', '/', '+', ',', ' ', "\n", "\t", "\r"]

    cjk_separators = ['、', '‘', '。', '》', '《', '“', '…', '’', '”', '〕',
                      '〈', '〉', '「', '」', '『', '』', '〖', '〗', '【', '】']

    # U+FF01..U+FF5E mirror ASCII 0x21..0x7E at an offset of 0xFEE0; the
    # wide counterpart of the space is the ideographic space U+3000.
    fullwidth_separators = ascii_separators.map do |c|
      if c == ' '
        "\u3000"
      elsif c.ord.between?(0x21, 0x7E)
        [c.ord + 0xFEE0].pack('U')
      end
    end.compact

    @@separators = (ascii_separators + cjk_separators + fullwidth_separators).uniq

    # Returns :symbol when +char+ is a separator, otherwise +char+ unchanged.
    def self.filter(char)
      @@separators.include?(char) ? :symbol : char
    end
  end
end
data/lib/rseg.rb ADDED
@@ -0,0 +1,138 @@
1
+ # encoding: utf-8
2
+
3
+ require 'singleton'
4
+ require 'net/http'
5
+
6
+ require File.join(File.dirname(__FILE__), 'engines/engine')
7
+ require File.join(File.dirname(__FILE__), 'engines/dict')
8
+ require File.join(File.dirname(__FILE__), 'engines/english')
9
+ require File.join(File.dirname(__FILE__), 'engines/number')
10
+ require File.join(File.dirname(__FILE__), 'engines/name')
11
+
12
+ require File.join(File.dirname(__FILE__), 'filters/fullwidth')
13
+ require File.join(File.dirname(__FILE__), 'filters/symbol')
14
+ require File.join(File.dirname(__FILE__), 'filters/conjunction')
15
+
16
# Chinese word segmenter.
#
# Rseg is a singleton: the engines (and therefore the dictionary) are created
# once, on first use. Each input character is first passed through the
# filters and then offered to every running engine; when no engine matches,
# the word reported by the engines is emitted and the character is processed
# again from a clean state.
class Rseg
  include Singleton
  include RsegEngine
  include RsegFilter

  class << self
    # Sets the dictionary file used by the dictionary engine.
    def dict_path=(path)
      RsegEngine::Dict.dict_path = path
    end

    # Segments +input+ in-process and returns an Array of words.
    # A nil input is treated as an empty string and yields [].
    def segment(input)
      Rseg.instance.input = input
      Rseg.instance.segment
    end

    # Forces creation of the singleton (and so the dictionary load).
    def load
      Rseg.instance
      nil
    end

    # Segments +input+ through the server at http://127.0.0.1:4100/seg.
    #
    # Returns the Array of words on HTTP 200. On any other status, or when
    # the request raises, returns a one-element Array holding a hint on how
    # to start the server.
    def remote_segment(input)
      unavailable = ["Can't connect to http://localhost:4100\nUse rseg_server to start it"]
      response = Net::HTTP.post_form(URI.parse('http://127.0.0.1:4100/seg'), :input => input)
      response.code == '200' ? response.body.split(' ') : unavailable
    rescue StandardError
      unavailable
    end
  end

  def initialize
    @input = ''
    @words = []
    init_engines
    init_filters
  end

  # Sets the text for the next call to #segment. The value is coerced with
  # to_s so that nil no longer makes #segment raise NoMethodError.
  def input=(input)
    @input = input.to_s
  end

  # Segments the current input and returns the Array of words.
  def segment
    @words = []

    @input.chars.each do |origin|
      char = filter(origin)
      process(char, origin)
    end

    # A trailing separator flushes whatever the engines still hold.
    process(:symbol, '')
    @words
  end

  private

  # Runs +char+ through every filter in order and returns the result.
  def filter(char)
    @filters.inject(char) { |result, klass| klass.filter(result) }
  end

  # Offers +char+ to every running engine. +origin+ is the unfiltered
  # character, emitted as-is when nothing has been collected.
  def process(char, origin)
    nomatch = true
    word = ''

    engines.each do |engine|
      next unless engine.running?
      # word is overwritten on each iteration, so it ends up holding the
      # result of the last engine that was still running.
      match, word = engine.process(char)
      match ? nomatch = false : engine.stop
    end

    if nomatch
      if word == ''
        @words << origin unless char == :symbol
        reset_engines
      else
        reset_engines
        @words << word if word.is_a?(String)
        reprocess(word) if word.is_a?(Array)

        # re-process current char
        process(char, origin)
      end
    end
  end

  # Feeds the characters of an Array result back through #process. A
  # separator is inserted before the last character.
  def reprocess(word)
    last = word.pop

    word.each do |char|
      process(char, char)
    end

    process(:symbol, :symbol)
    process(last, last)
  end

  # Puts every engine back into the running state.
  def reset_engines
    engines.each do |engine|
      engine.run
    end
  end

  # Assigns the engine list only if none has been set yet.
  def engines=(engines)
    @engines ||= engines
  end

  def engines
    @engines
  end

  def init_filters
    @filters = [Fullwidth, Symbol]
  end

  def init_engines
    @engines ||= [Dict, English, Number, Name].map do |engine_klass|
      engine_klass.new
    end
  end
end
data/public/screen.css ADDED
@@ -0,0 +1,123 @@
1
+ div.clear {clear: both;}
2
+ body {background: #EEEEEE; margin: 0; padding: 0;
3
+ font-family: 'Lucida Grande', 'Lucida Sans Unicode',
4
+ 'Garuda';}
5
+ code {font-family: 'Lucida Console', monospace;
6
+ font-size: 12px;}
7
+ li {height: 18px;}
8
+ ul {list-style: none; margin: 0; padding: 0;}
9
+ ol:hover {cursor: pointer;}
10
+ ol li {white-space: pre;}
11
+ #explanation {font-size: 12px; color: #666666;
12
+ margin: 20px 0 0 100px;}
13
+ /* WRAP */
14
+ #wrap {width: 860px; background: #FFFFFF; margin: 0 auto;
15
+ padding: 30px 50px 20px 50px;
16
+ border-left: 1px solid #DDDDDD;
17
+ border-right: 1px solid #DDDDDD;}
18
+ /* HEADER */
19
+ #header {margin: 0 auto 25px auto;}
20
+ h1 {margin: 0; font-size: 36px; color: #981919;}
21
+ h2 {margin: 0; font-size: 22px; color: #333333;}
22
+ #header ul {margin: 0; font-size: 12px; color: #666666;}
23
+ #header ul li strong{color: #444444;}
24
+ #header ul li {display: inline; padding: 0 10px;}
25
+ #header ul li.first {padding-left: 0;}
26
+ #header ul li.last {border: 0; padding-right: 0;}
27
+
28
+ #content {width: 860px; margin: 0 auto 10px auto;}
29
+
30
+ h3 {float: left; width: 100px; margin-bottom: 10px;
31
+ color: #981919; font-size: 14px; font-weight: bold;}
32
+
33
+ #footer {width: 860px; margin: 0 auto 10px auto; clear:both;
34
+ font-size: 18px; border-top:1px solid #000; padding-top: 10px;
35
+ text-align: right;}
36
+
37
+ textarea {font-size: 18px; padding:10px;}
38
+ #segform { width: 430px; float: left; font-size: 18px;}
39
+ #segresult { width: 408px; float: left; font-size: 18px;
40
+ padding: 10px; color: #D12F19;}
41
+ /* --------------------------------------------------------------
42
+
43
+ buttons.css
44
+ * Gives you some great CSS-only buttons.
45
+
46
+ Created by Kevin Hale [particletree.com]
47
+ * particletree.com/features/rediscovering-the-button-element
48
+
49
+ See Readme.txt in this folder for instructions.
50
+
51
+ -------------------------------------------------------------- */
52
+
53
+ button {
54
+ display:block;
55
+ float:left;
56
+ margin:0 0.583em 0.667em 0;
57
+ padding:5px 10px 5px 7px; /* Links */
58
+
59
+ border:1px solid #dedede;
60
+ border-top:1px solid #eee;
61
+ border-left:1px solid #eee;
62
+
63
+ background-color:#f5f5f5;
64
+ font-family:"Lucida Grande", Tahoma, Arial, Verdana, sans-serif;
65
+ font-size:100%;
66
+ line-height:130%;
67
+ text-decoration:none;
68
+ font-weight:bold;
69
+ color:#565656;
70
+ cursor:pointer;
71
+ }
72
+ button {
73
+ width:auto;
74
+ overflow:visible;
75
+ padding:4px 10px 3px 7px; /* IE6 */
76
+ }
77
+ button[type] {
78
+ padding:4px 10px 4px 7px; /* Firefox */
79
+ line-height:17px; /* Safari */
80
+ }
81
+ *:first-child+html button[type] {
82
+ padding:4px 10px 3px 7px; /* IE7 */
83
+ }
84
+ button img {
85
+ margin:0 3px -3px 0 !important;
86
+ padding:0;
87
+ border:none;
88
+ width:16px;
89
+ height:16px;
90
+ float:none;
91
+ }
92
+
93
+
94
+ /* Button colors
95
+ -------------------------------------------------------------- */
96
+
97
+ /* Standard */
98
+ button:hover {
99
+ background-color:#dff4ff;
100
+ border:1px solid #c2e1ef;
101
+ color:#336699;
102
+ }
103
+
104
+ /* Positive */
105
+ body .positive {
106
+ color:#529214;
107
+ }
108
+ button.positive:hover {
109
+ background-color:#E6EFC2;
110
+ border:1px solid #C6D880;
111
+ color:#529214;
112
+ }
113
+
114
+ /* Negative */
115
+ body .negative {
116
+ color:#d12f19;
117
+ }
118
+ button.negative:hover {
119
+ background:#fbe3e4;
120
+ border:1px solid #fbc2c4;
121
+ color:#d12f19;
122
+ }
123
+
data/views/index.haml ADDED
@@ -0,0 +1,8 @@
1
+ #segform
2
+ %form{:action => '/segment', :method => 'post'}
3
+ %p
4
+ %textarea{:id => 'input', :rows => '15', :cols => '35', :name => 'input'}= @input || '输入要分词的文章'
5
+ %p
6
+ %button.negative{ :type => "submit" } 开始分词
7
+ #segresult
8
+ %p= @result
data/views/layout.haml ADDED
@@ -0,0 +1,16 @@
1
+ !!! Strict
2
+ %html{ :lang => "en", :"xml:lang" => "en", :xmlns => "http://www.w3.org/1999/xhtml" }
3
+ %head
4
+ %meta{ :content => "text/html; charset=utf-8", :"http-equiv" => "Content-Type" }
5
+ %meta{ :content => "zh_CN", :"http-equiv" => "Content-Language" }
6
+ %title= "Rseg中文分词"
7
+ %link{ :rel => 'stylesheet', :href => '/screen.css', :type => 'text/css', :media => "screen"}
8
+
9
+ %body
10
+ #wrap
11
+ #header
12
+ %h1= "Rseg 中文分词"
13
+ %address.watermark
14
+ #content.condensed= yield
15
+ #footer= "作者: 张元一 <br />EMail:zhangyuanyi#gmail.com"
16
+
metadata ADDED
@@ -0,0 +1,97 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rseg_ggharry
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Yuanyi Zhang
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2010-11-28 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: haml
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: sinatra
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description: A Chinese Word Segmentation(中文分词) routine in pure Ruby
42
+ email: zhangyuanyi@gmail.com
43
+ executables:
44
+ - rseg
45
+ - rseg_server
46
+ extensions: []
47
+ extra_rdoc_files:
48
+ - LICENSE
49
+ - README
50
+ files:
51
+ - ".gitignore"
52
+ - LICENSE
53
+ - README
54
+ - Rakefile
55
+ - VERSION
56
+ - bin/rseg
57
+ - bin/rseg_server
58
+ - dict/dict.hash
59
+ - lib/app.rb
60
+ - lib/builder/dict.rb
61
+ - lib/engines/dict.rb
62
+ - lib/engines/engine.rb
63
+ - lib/engines/english.rb
64
+ - lib/engines/name.rb
65
+ - lib/engines/number.rb
66
+ - lib/filters/conjunction.rb
67
+ - lib/filters/fullwidth.rb
68
+ - lib/filters/symbol.rb
69
+ - lib/rseg.rb
70
+ - public/screen.css
71
+ - views/index.haml
72
+ - views/layout.haml
73
+ homepage: http://github.com/yzhang/rseg
74
+ licenses: []
75
+ metadata: {}
76
+ post_install_message:
77
+ rdoc_options:
78
+ - "--charset=UTF-8"
79
+ require_paths:
80
+ - lib
81
+ required_ruby_version: !ruby/object:Gem::Requirement
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ required_rubygems_version: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - ">="
89
+ - !ruby/object:Gem::Version
90
+ version: '0'
91
+ requirements: []
92
+ rubyforge_project:
93
+ rubygems_version: 2.4.5
94
+ signing_key:
95
+ specification_version: 3
96
+ summary: A Chinese Word Segmentation(中文分词) routine in pure Ruby
97
+ test_files: []