rseg-ggharry 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 4b936b9ceeec41297940b46e0240f4e55e967e99
4
+ data.tar.gz: c1df52d3609806edee718f9c22fcfbcbcc06952e
5
+ SHA512:
6
+ metadata.gz: be172b75391151d4b590048da0a2d3bc20c6a9952a205a82a8a0eb2701bc555d69c4028fac36cc5680d63e6026b87b9691a134200ee1d04f702b9e3b93d4cbaf
7
+ data.tar.gz: 4e4321619104f9f067a3c4ef67c361491d7272dba74b66aa9b9d7adbca7e1c63754c6f50c0eef498d993c9cfd7982629a15118cf7c7614a1883459604379679f
data/.gitignore ADDED
@@ -0,0 +1,3 @@
1
+ test
2
+ pkg
3
+ .DS_Store
data/LICENSE ADDED
@@ -0,0 +1,30 @@
1
+ Rseg includes two built-in dictionaries:
2
+
3
+ * CC-CEDICT (http://cc-cedict.org/wiki/): Creative Commons Attribution-Share Alike 3.0 License (http://creativecommons.org/licenses/by-sa/3.0/)
4
+ * Wikipedia Chinese article title list (http://download.wikimedia.org/zhwiki/): Creative Commons Attribution-Share Alike 3.0 License (http://creativecommons.org/licenses/by-sa/3.0/)
5
+
6
+ The codes and others in Rseg are licensed under MIT license:
7
+
8
+ ===============================
9
+ Copyright (c) 2009 Yuanyi Zhang
10
+
11
+ Permission is hereby granted, free of charge, to any person
12
+ obtaining a copy of this software and associated documentation
13
+ files (the "Software"), to deal in the Software without
14
+ restriction, including without limitation the rights to use,
15
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
16
+ copies of the Software, and to permit persons to whom the
17
+ Software is furnished to do so, subject to the following
18
+ conditions:
19
+
20
+ The above copyright notice and this permission notice shall be
21
+ included in all copies or substantial portions of the Software.
22
+
23
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
25
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
27
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
28
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
29
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
30
+ OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,57 @@
1
+ Introduction
2
+ ========
3
+ Rseg is a Chinese Word Segmentation(中文分词) routine in pure Ruby.
4
+
5
+ The algorithm is based on this article: http://xiecc.blog.163.com/blog/static/14032200671110224190/
6
+
7
+ Usage
8
+ ========
9
+
10
+ Rseg now supports two modes: inline mode and C/S (client/server) mode.
11
+
12
+ 1. Inline mode
13
+
14
+ > require 'rubygems'
15
+ > require 'rseg'
16
+ > Rseg.segment("需要分词的文章")
17
+ ['需要', '分词', '的', '文章']
18
+
19
+ The first call to Rseg.segment needs about 30 seconds to load the dictionary; subsequent calls are very fast. You can also call Rseg.load to load the dictionaries ahead of time.
20
+
21
+ 2. C/S mode
22
+
23
+ $ rseg_server
24
+ == Sinatra/0.9.4 has taken the stage on 4100
25
+
26
+ This will start rseg server on http://localhost:4100
27
+
28
+ You can visit it via your browser or the rseg command.
29
+
30
+ $ rseg '需要分词的文章'
31
+ 需要 分词 的 文章
32
+
33
+ You can also access the server with Rseg.remote_segment:
34
+
35
+ $ irb
36
+ > require 'rubygems'
37
+ > require 'rseg'
38
+ > Rseg.remote_segment("需要分词的文章") # This will be very fast
39
+ ['需要', '分词', '的', '文章']
40
+
41
+ Performance
42
+ ========
43
+ About 5M characters/s on my MacBook (Intel Core 2 Duo 2GHz, 4 GB memory).
44
+
45
+ License
46
+ ========
47
+
48
+ Rseg includes two built-in dictionaries:
49
+
50
+ * CC-CEDICT (http://cc-cedict.org/wiki/) with Creative Commons Attribution-Share Alike 3.0 License (http://creativecommons.org/licenses/by-sa/3.0/)
51
+ * Wikipedia Chinese article title list (http://download.wikimedia.org/zhwiki/) with Creative Commons Attribution-Share Alike 3.0 License(http://creativecommons.org/licenses/by-sa/3.0/)
52
+
53
+ The code and all other materials in Rseg are licensed under the MIT license.
54
+
55
+ Feedback
56
+ ========
57
+ All feedback is welcome: Yuanyi Zhang (zhangyuanyi#gmail.com)
data/Rakefile ADDED
@@ -0,0 +1,21 @@
1
require 'rake'
require 'rake/testtask'

# rcov is an optional development tool and nothing below uses it, so a
# missing gem must not abort every rake task.
begin
  require 'rcov/rcovtask'
rescue LoadError
  # Coverage tasks are simply unavailable without rcov.
end

# Gem packaging tasks. Jeweler is optional: when it is not installed we only
# print an installation hint and leave the remaining tasks usable.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |s|
    s.name = "rseg"
    s.executables = ["rseg", 'rseg_server']
    s.summary = "A Chinese Word Segmentation(中文分词) routine in pure Ruby"
    s.email = "zhangyuanyi@gmail.com"
    s.homepage = "http://github.com/yzhang/rseg"
    s.description = "A Chinese Word Segmentation(中文分词) routine in pure Ruby"
    s.authors = ["Yuanyi Zhang"]
    s.files = FileList["[A-Z]*", "{bin,lib,public,views}/**/*", '.gitignore', 'dict/dict.hash']
    s.add_dependency 'haml'
    s.add_dependency 'sinatra'
  end
rescue LoadError
  puts "Jeweler, or one of its dependencies, is not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.7
data/bin/rseg ADDED
@@ -0,0 +1,11 @@
1
#!/usr/bin/env ruby
# Command-line client: sends the first argument to the local rseg server and
# prints the returned words separated by single spaces.
require File.join(File.dirname(__FILE__), '/../lib/rseg')

text = ARGV.first

# Without any text there is nothing to segment; show the usage line instead.
if text.nil? || text.empty?
  puts "Usage: rseg <text>"
  exit
end

words = Rseg.remote_segment(text)
puts words.join(' ')
data/bin/rseg_server ADDED
@@ -0,0 +1,16 @@
1
#!/usr/bin/env ruby
# Starts the segmentation web server: loads the dictionaries up front, then
# runs the Sinatra application on 127.0.0.1:4100.
bin_dir = File.dirname(__FILE__)
$LOAD_PATH.unshift File.expand_path(bin_dir)

require 'rubygems'
require 'haml'

require File.join(bin_dir, '/../lib/rseg')
require File.join(bin_dir, '/../lib/app')

# Load before serving so the first request does not pay the loading cost.
puts "Loading dictionaries, this will take about 30 seconds."
puts "Please wait a moment..."
Rseg.load
puts "Dictionaries loaded."

App.run!(:host => '127.0.0.1', :port => 4100, :environment => 'production')
exit
data/dict/dict.hash ADDED
Binary file
data/lib/app.rb ADDED
@@ -0,0 +1,22 @@
1
+ require 'sinatra/base'
2
+
3
# Sinatra front end for Rseg.
#
# Routes:
#   GET  /         renders the input form.
#   POST /segment  renders the form again together with the segmented text.
#   POST /seg      returns the segmented words as plain text, space separated.
#
# NOTE(review): @input and @result are rendered by the :index view; confirm
# that HAML escapes them, since they come straight from the request.
class App < Sinatra::Base
  set :root, File.dirname(__FILE__) + "/.."
  set :app_file, __FILE__

  helpers do
    # Segments the submitted text. A request without an "input" field is
    # treated as empty text instead of raising on nil.
    def segment_input
      Rseg.segment(@input.to_s)
    end
  end

  get '/' do
    haml :index
  end

  post '/segment' do
    @input = params[:input]
    @result = segment_input.join(' ')
    haml :index
  end

  post '/seg' do
    @input = params[:input]
    @result = segment_input
    @result.join(' ')
  end
end
@@ -0,0 +1,41 @@
1
+ # encoding: utf-8
2
+
3
# Adds every line of the dictionary file at +path+ to the character trie
# +tree+.
#
# Each character of a line becomes one nesting level of hashes, and the node
# reached by the last character is flagged with <tt>:end => true</tt>.
# Carriage returns and newlines are ignored. Blank lines are skipped so they
# no longer raise on a nil node.
#
# path - String path of a dictionary file with one entry per line (UTF-8).
# tree - Hash used as the trie root; it is modified in place.
#
# Returns nothing meaningful.
def process(path, tree)
  # Read explicitly as UTF-8 so entries are split into characters rather
  # than depending on the locale's default external encoding.
  File.open(path, 'r:UTF-8') do |file|
    file.each_line do |line|
      node = tree
      line.each_char do |c|
        next if c == "\n" || c == "\r"
        node = (node[c] ||= {})
      end
      # Still at the root means the line held no characters: nothing to mark.
      node[:end] = true unless node.equal?(tree)
    end
  end
end
21
+
22
# Builds one trie from both bundled dictionaries and writes it, marshalled,
# to the file returned by hash_path. Progress is printed to stdout.
def build
  tree = {}
  dict_dir = File.join(File.dirname(__FILE__), '../../dict')

  %w[cedict.zh_CN.utf8 wikipedia.zh.utf8].each do |dictionary|
    puts "Processing #{dictionary}..."
    process(File.join(dict_dir, dictionary), tree)
  end

  File.open(hash_path, "wb") do |io|
    Marshal.dump(tree, io)
  end
  puts 'Done'
end
35
+
36
# Path of the marshalled dictionary file, relative to this source file.
def hash_path
  here = File.dirname(__FILE__)
  File.join(here, '../../dict/dict.hash')
end
39
+
40
+ build
41
+
@@ -0,0 +1,51 @@
1
module RsegEngine
  # Dictionary engine: follows the incoming characters through a nested-hash
  # trie that is loaded once (shared by all instances) from dict_path.
  class Dict < Engine
    @@root = nil
    @@dict_path = File.join(File.dirname(__FILE__), '../../dict/dict.hash')

    # Sets the file the trie is loaded from. The trie is only loaded while
    # @@root is unset, i.e. by the first Dict.new.
    def self.dict_path=(path)
      @@dict_path = path
    end

    # Returns the file the trie is loaded from.
    def self.dict_path
      @@dict_path
    end

    def initialize
      @@root ||= load_dict(@@dict_path)
      @word = ''
      @node = @@root
      super
    end

    # Feeds one character to the engine.
    #
    # Returns [true, nil] while +char+ continues a path in the trie.
    # Otherwise returns [false, word] and starts over at the root: +word+ is
    # the collected String when the current node has :end set or exactly one
    # character was collected, else the Array of collected characters.
    def process(char)
      child = @node[char]
      if child
        @word << char
        @node = child
        return [true, nil]
      end

      collected = @word
      whole = @node[:end] || collected.chars.to_a.length == 1
      word = whole ? collected : collected.chars.to_a

      @node = @@root
      @word = ''
      [false, word]
    end

    private

    # Reads the marshalled trie from +path+.
    # NOTE(review): Marshal.load must only ever be given a trusted file.
    def load_dict(path)
      File.open(path, "rb") do |io|
        Marshal.load(io)
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
module RsegEngine
  # Base class of all engines: keeps a single running/stopped flag.
  class Engine
    # A new engine starts out running.
    def initialize
      @running = true
    end

    # Marks the engine as stopped.
    def stop; @running = false; end

    # Marks the engine as running again.
    def run; @running = true; end

    # Returns the current value of the running flag.
    def running?; @running; end
  end
end
@@ -0,0 +1,26 @@
1
module RsegEngine
  # The 52 ASCII letters, lower case first.
  LETTER_SYMBOLS = [*'a'..'z', *'A'..'Z']

  # Collects runs of ASCII letters into one word.
  class English < Engine
    def initialize
      @word = ''
      super
    end

    # Feeds one character to the engine.
    #
    # Returns [true, nil] when +char+ is a letter and was appended to the
    # current word; otherwise returns [false, word] with the letters
    # collected so far (possibly '') and starts a new empty word.
    def process(char)
      if LETTER_SYMBOLS.include?(char)
        @word << char
        return [true, nil]
      end

      finished, @word = @word, ''
      [false, finished]
    end
  end
end
@@ -0,0 +1,51 @@
1
module RsegEngine
  # Recognises a name: one character from the last-name list followed by
  # characters from the first-name list, up to three characters in total.
  class Name < Engine
    @@last_names = %w(丁 卜 刁 七 弓 干 于 王 尤 孔 方 申 白 甘 田 包 石 左 平 司 皮 史 池 艾 年 匡 充 江 印
                      促 伊 伍 安 任 米 促 牟 向 吉 成 伏 吕 李 吴 沈 何 贝 狄 祁 杜 汪 阮 邢 汲 别 辛 冷 利 沃 谷
                      扶 步 那 沙 周 金 吕 花 孟 和 邵 房 抗 灰 明 屈 松 牧 宓 武 幸 卓 易 尚 邰 空 竺 岳 东 林
                      施 姜 俞 查 封 秋 帅 祖 羿 柯 茅 柳 姚 纪 宣 咸 库 侯 洪 胡 哈 宣 郁 祝 苗 禹 娄
                      秦 奚 倪 度 凌 宰 宦 师 徐 翁 班 马 时 晃 乌 夏 贡 柴 能 家 宫 敖 索 晏 桑 高 凌 桂 容 姬 劳 桑 桂 袁 时 祝 席 徐 高 夏 凌 洪 翁 家 芮 乌 祖 索 贡
                      许 张 曹 戚 梅 屠 盛 崖 章 鱼 国 商 扈 寇 终 冯 苗 康 常 茅 闵 麻 胡 崔 邢 条 符 宿 堵 浦 习 鱼
                      梁 富 曾 程 项 钮 舒 彭 费 童 云 喻 嵇 范 费 贺 毕 付 黄 邵 祁 阮 强 童 邱 解 贲 单 富 钮 荀 惠 邴 焦 班 甯 钭 景 邰 劳 茹 寇 荆
                      莫 际 景 须 杨 詹 郎 雷 贾 路 骆 虞 经 裘 郁 滑 甄 靳 詹 闻 逄 雍 訾 郎 农 路 骆 虞 经 裘 郁 滑 靳 闻 逄 雍
                      赵 黄 褚 凤 郝 齐 臧 熊 管 裴 荣 郗 韶 郜 黎 翟 寿 通
                      卫 葛 鲁 乐 谈 董 樊 万 诸 刘 叶 都 满 广 殴 巩 养
                      郭 钱 陈 陶 鲍 穆 郭 堆 卢 陆 龙 噪 鄂 阴 苍 燕 冀 衡 融 蒯 逯
                      蒋 魏 谢 邹 潘 滕 邬 戴 钟 蔡 缪 应 储 糜 隗 历 蒲 慕 蔚 隆 鞠 关
                      韩 萧 颜 庞 麦 双 璩 濮 聂 丰 看
                      郑 严 蓟 薄 谭 罗 买 蓝 蓬 怀 党 饶 顾 苏 龚 边 栾 权) #:nodoc:

    @@first_names = %w(文 铭 菁 郁 怡 智 德 祥 志 华 孟 庆 雅 佩 晓 蓉 明 仁 宇 青 慧 豪 琪 安
                       惠 宗 信 盈 君 秀 敏 伶 佳 国 荣 忠 宏 育 丽 圣 淑 彦 龙 冠 后 静 娟 子
                       嘉 瑞 柏 弘 芳 正 玮 贞 如 凯 元 士 伟 杰 颍 霖 玲 仪 珮 英 建 政 真 珍
                       美 世 立 秋 婷 贤 瑜 中 玉 维 莹 翔 家 芬 昌 裕 雯 萍 永 成 宜 鸿 珊 民
                       欣 哲 良 伦 燕 梦 磊 丹 元 一 昌 红 健) #:nodoc:

    def initialize
      @word = ''
      @last = false # true once a last-name character has been accepted
      super
    end

    # Feeds one character to the engine.
    #
    # Returns [true, nil] when +char+ starts a name (a last-name character)
    # or extends one (a first-name character while fewer than three
    # characters are collected). Otherwise returns [false, word] with the
    # characters collected so far (possibly '') and resets the engine state.
    def process(char)
      match = false
      word = nil

      if !@last && @@last_names.include?(char)
        @word << char
        match = true
        @last = true
      elsif @last && @word.chars.to_a.length < 3 && @@first_names.include?(char)
        # The former `@unit = true` here was never read by this class and
        # has been dropped.
        @word << char
        match = true
      else
        word = @word
        @word = ''
        @last = false
      end

      [match, word]
    end
  end
end
@@ -0,0 +1,61 @@
1
module RsegEngine
  # Recognises numeric expressions: digits and numerals, optionally followed
  # by sub-unit and unit characters.
  class Number < Engine
    @@number_symbols = %w[0 1 2 3 4 5 6 7 8 9
                          一 二 三 四 五 六 七 八 九 十
                          零 〇 百 千 壹 贰 叁 肆 柒 捌
                          玖 拾 之 % ¥ 分 $ . 点 第 每]
    @@subunit_symbols = %w[多 公 英 厘 毫 微 纳 海 平 立
                           方 摄 华 氏 美 日 澳 港 台 新
                           个 百 佰 千 仟 万 萬 亿 兆 吉]
    @@unit_symbols = %w[刻 章 回 节 名 个 届 次 集 元
                        角 例 人 斤 克 吨 米 里 升 码
                        尺 寸 杆 顷 亩 磅 镑 桶 度 秒
                        分 卡 焦 瓦 匹 圆 币 年 月 日
                        时 秒 点 百 佰 仟 千 万 萬 亿
                        兆 吉 块 半 岁 家 所 期 场 投
                        中 辆 只 头]

    def initialize
      @word = ''        # text already closed off by a unit character
      @number = ''      # numerals/sub-units not yet closed off by a unit
      @unit = false     # last accepted character was a unit
      @subunit = false  # last accepted character was a sub-unit
      super
    end

    # Feeds one character to the engine.
    #
    # Returns [true, nil] when +char+ was accepted as part of the current
    # expression. Otherwise returns [false, word], where +word+ is the
    # closed-off text if there is any, else the pending numerals (possibly
    # ''), and resets the engine state.
    def process(char)
      matched = absorb_numeral(char)

      # A unit character closes off the pending number. It is appended
      # itself only when the step above did not already consume it.
      if (@number != '' || @subunit) && @@unit_symbols.include?(char)
        @word << @number
        @word << char unless matched
        @number = ''
        @unit = true
        matched = true
      end

      return [true, nil] if matched

      [false, flush_word]
    end

    private

    # Appends +char+ to the pending number when it is acceptable as a numeral
    # or as a sub-unit in the current state; returns whether it was appended.
    def absorb_numeral(char)
      if (!@subunit || @unit) && @@number_symbols.include?(char)
        @number << char
        @unit = false
        @subunit = false
        true
      elsif (@number != '' || @unit) && @@subunit_symbols.include?(char)
        @number << char
        @subunit = true
        true
      else
        false
      end
    end

    # Returns the collected text and clears all state.
    def flush_word
      finished = (@word != '') ? @word : @number
      @word = ''
      @number = ''
      @unit = false
      @subunit = false
      finished
    end
  end
end
@@ -0,0 +1,9 @@
1
module RsegFilter
  # Replaces characters from a fixed list with the marker :conjunction.
  class Conjunction
    @@conjunctions = %w[给 的 说 对 在 和 是 被 最 所 那 由 这 有 将 你 会 与 他 为 不 没 很 了 啊 哦 呵 把 去 从]

    class << self
      # Returns :conjunction when +char+ is in the list, else +char+ itself.
      def filter(char)
        return :conjunction if @@conjunctions.include?(char)

        char
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
module RsegFilter
  # Normalises fullwidth characters to their ASCII counterparts.
  #
  # The previous table mapped every digit and letter to itself, so fullwidth
  # input passed through unchanged. The table is now derived from the Unicode
  # "Halfwidth and Fullwidth Forms" block, where each fullwidth form sits
  # 0xFEE0 above its ASCII counterpart.
  class Fullwidth
    FULLWIDTH_OFFSET = 0xFEE0

    # ASCII characters whose fullwidth form is normalised.
    NORMALISED_ASCII = ('0'..'9').to_a + ('a'..'z').to_a + ('A'..'Z').to_a +
                       ['-', '+', ',', '/']

    @@fullwidth_chars = {}
    NORMALISED_ASCII.each do |ascii|
      @@fullwidth_chars[[ascii.ord + FULLWIDTH_OFFSET].pack('U')] = ascii
    end
    # Punctuation that is not a plain offset of its replacement.
    @@fullwidth_chars['—'] = '-'
    @@fullwidth_chars['·'] = '.'

    class << self
      # Returns the ASCII replacement for +char+, or +char+ itself when no
      # replacement is defined (this includes non-String markers).
      def filter(char)
        @@fullwidth_chars.fetch(char, char)
      end
    end
  end
end
@@ -0,0 +1,14 @@
1
module RsegFilter
  # Replaces separator characters (punctuation and whitespace) with the
  # marker :symbol.
  class Symbol
    @@separators = ['`', '[', ']', '、', '=', '‘', ';', '。', '|', '?', '》',
                    '《', ':', '“', '{', '}', ')', '(', '*', '…', '#', '!',
                    '~', '’', '”', '〕', '〈', '〉', '「', '」', '『', '』', '〖', '〗',
                    '【', '】', '<', '>', '`', '~', '!', '@', '#', '^',
                    '&', '*', '\\', '(', ')', '=', '{', '}', '[', ']',
                    '|', ';', ':', "'", '<', '>', '?', "\n", "\t", "\r",
                    ' ', '-', '/', '+', ',', ' ']

    # The list above names each ASCII punctuation mark twice and no fullwidth
    # one, so fullwidth punctuation was not treated as a separator. Add the
    # fullwidth form of every printable ASCII separator (0xFEE0 above it in
    # Unicode) and the ideographic space, then drop the duplicates.
    @@separators += @@separators.select { |c| c.ord.between?(0x21, 0x7E) }
                                .map { |c| [c.ord + 0xFEE0].pack('U') }
    @@separators << "\u3000"
    @@separators.uniq!

    # Returns :symbol when +char+ is a separator, else +char+ itself.
    def self.filter(char)
      @@separators.include?(char) ? :symbol : char
    end
  end
end
data/lib/rseg.rb ADDED
@@ -0,0 +1,138 @@
1
+ # encoding: utf-8
2
+
3
+ require 'singleton'
4
+ require 'net/http'
5
+
6
+ require File.join(File.dirname(__FILE__), 'engines/engine')
7
+ require File.join(File.dirname(__FILE__), 'engines/dict')
8
+ require File.join(File.dirname(__FILE__), 'engines/english')
9
+ require File.join(File.dirname(__FILE__), 'engines/number')
10
+ require File.join(File.dirname(__FILE__), 'engines/name')
11
+
12
+ require File.join(File.dirname(__FILE__), 'filters/fullwidth')
13
+ require File.join(File.dirname(__FILE__), 'filters/symbol')
14
+ require File.join(File.dirname(__FILE__), 'filters/conjunction')
15
+
16
# Chinese word segmenter. One shared instance (Singleton) owns the engines
# and filters; the class-level methods are the public entry points.
class Rseg
  include Singleton
  include RsegEngine
  include RsegFilter

  class << self
    # Sets the dictionary file used by the dictionary engine.
    def dict_path=(path)
      RsegEngine::Dict.dict_path = path
    end

    # Segments +input+ with the shared instance and returns the words.
    def segment(input)
      segmenter = instance
      segmenter.input = input
      segmenter.segment
    end

    # Creates the shared instance (and with it the engines). Returns nil.
    def load
      instance
      nil
    end

    # Segments +input+ through the server on 127.0.0.1:4100. Returns the
    # words, or a one-element array holding a hint when the request fails or
    # the response status is not 200.
    def remote_segment(input)
      unreachable = ["Can't connect to http://localhost:4100\nUse rseg_server to start it"]

      response = Net::HTTP.post_form(URI.parse('http://127.0.0.1:4100/seg'), :input => input)
      return unreachable unless response.code == '200'

      response.body.split(' ')
    rescue
      unreachable
    end
  end

  # Text used by the next call to #segment.
  attr_writer :input

  def initialize
    @input = ''
    @words = []
    init_engines
    init_filters
  end

  # Segments the current input and returns the resulting array of words.
  def segment
    @words = []

    @input.each_char do |origin|
      process(filter(origin), origin)
    end

    # A trailing separator makes the engines hand over whatever they hold.
    process(:symbol, '')
    @words
  end

  private

  # Passes +char+ through every filter in order and returns the result.
  def filter(char)
    @filters.inject(char) { |current, klass| klass.filter(current) }
  end

  # Offers +char+ to every running engine and stops those that do not match.
  # When none matches, the word handed back by the last engine asked is
  # stored (or, for an Array, fed through #reprocess) and +char+ is processed
  # again; when that word is '', +origin+ itself is stored unless +char+ is
  # the :symbol marker.
  def process(char, origin)
    matched = false
    word = ''

    engines.each do |engine|
      next unless engine.running?

      hit, word = engine.process(char)
      if hit
        matched = true
      else
        engine.stop
      end
    end
    return if matched

    reset_engines
    if word == ''
      @words << origin unless char == :symbol
      return
    end

    @words << word if word.is_a?(String)
    reprocess(word) if word.is_a?(Array)

    # re-process current char
    process(char, origin)
  end

  # Re-feeds the characters of +word+: all but the last one, then a :symbol
  # marker, then the last one.
  def reprocess(word)
    last = word.pop
    word.each { |char| process(char, char) }
    process(:symbol, :symbol)
    process(last, last)
  end

  # Marks every engine as running again.
  def reset_engines
    engines.each { |engine| engine.run }
  end

  # Stores +engines+ unless engines are already set.
  def engines=(engines)
    @engines ||= engines
  end

  def engines
    @engines
  end

  # Symbol resolves to RsegFilter::Symbol here because RsegFilter is mixed
  # into this class.
  def init_filters
    @filters = [Fullwidth, Symbol]
  end

  # Instantiates each engine once.
  def init_engines
    @engines ||= [Dict, English, Number, Name].map(&:new)
  end
end
data/public/screen.css ADDED
@@ -0,0 +1,123 @@
1
+ div.clear {clear: both;}
2
+ body {background: #EEEEEE; margin: 0; padding: 0;
3
+ font-family: 'Lucida Grande', 'Lucida Sans Unicode',
4
+ 'Garuda';}
5
+ code {font-family: 'Lucida Console', monospace;
6
+ font-size: 12px;}
7
+ li {height: 18px;}
8
+ ul {list-style: none; margin: 0; padding: 0;}
9
+ ol:hover {cursor: pointer;}
10
+ ol li {white-space: pre;}
11
+ #explanation {font-size: 12px; color: #666666;
12
+ margin: 20px 0 0 100px;}
13
+ /* WRAP */
14
+ #wrap {width: 860px; background: #FFFFFF; margin: 0 auto;
15
+ padding: 30px 50px 20px 50px;
16
+ border-left: 1px solid #DDDDDD;
17
+ border-right: 1px solid #DDDDDD;}
18
+ /* HEADER */
19
+ #header {margin: 0 auto 25px auto;}
20
+ h1 {margin: 0; font-size: 36px; color: #981919;}
21
+ h2 {margin: 0; font-size: 22px; color: #333333;}
22
+ #header ul {margin: 0; font-size: 12px; color: #666666;}
23
+ #header ul li strong{color: #444444;}
24
+ #header ul li {display: inline; padding: 0 10px;}
25
+ #header ul li.first {padding-left: 0;}
26
+ #header ul li.last {border: 0; padding-right: 0;}
27
+
28
+ #content {width: 860px; margin: 0 auto 10px auto;}
29
+
30
+ h3 {float: left; width: 100px; margin-bottom: 10px;
31
+ color: #981919; font-size: 14px; font-weight: bold;}
32
+
33
+ #footer {width: 860px; margin: 0 auto 10px auto; clear:both;
34
+ font-size: 18px; border-top:1px solid #000; padding-top: 10px;
35
+ text-align: right;}
36
+
37
+ textarea {font-size: 18px; padding:10px;}
38
+ #segform { width: 430px; float: left; font-size: 18px;}
39
+ #segresult { width: 408px; float: left; font-size: 18px;
40
+ padding: 10px; color: #D12F19;}
41
+ /* --------------------------------------------------------------
42
+
43
+ buttons.css
44
+ * Gives you some great CSS-only buttons.
45
+
46
+ Created by Kevin Hale [particletree.com]
47
+ * particletree.com/features/rediscovering-the-button-element
48
+
49
+ See Readme.txt in this folder for instructions.
50
+
51
+ -------------------------------------------------------------- */
52
+
53
+ button {
54
+ display:block;
55
+ float:left;
56
+ margin:0 0.583em 0.667em 0;
57
+ padding:5px 10px 5px 7px; /* Links */
58
+
59
+ border:1px solid #dedede;
60
+ border-top:1px solid #eee;
61
+ border-left:1px solid #eee;
62
+
63
+ background-color:#f5f5f5;
64
+ font-family:"Lucida Grande", Tahoma, Arial, Verdana, sans-serif;
65
+ font-size:100%;
66
+ line-height:130%;
67
+ text-decoration:none;
68
+ font-weight:bold;
69
+ color:#565656;
70
+ cursor:pointer;
71
+ }
72
+ button {
73
+ width:auto;
74
+ overflow:visible;
75
+ padding:4px 10px 3px 7px; /* IE6 */
76
+ }
77
+ button[type] {
78
+ padding:4px 10px 4px 7px; /* Firefox */
79
+ line-height:17px; /* Safari */
80
+ }
81
+ *:first-child+html button[type] {
82
+ padding:4px 10px 3px 7px; /* IE7 */
83
+ }
84
+ button img {
85
+ margin:0 3px -3px 0 !important;
86
+ padding:0;
87
+ border:none;
88
+ width:16px;
89
+ height:16px;
90
+ float:none;
91
+ }
92
+
93
+
94
+ /* Button colors
95
+ -------------------------------------------------------------- */
96
+
97
+ /* Standard */
98
+ button:hover {
99
+ background-color:#dff4ff;
100
+ border:1px solid #c2e1ef;
101
+ color:#336699;
102
+ }
103
+
104
+ /* Positive */
105
+ body .positive {
106
+ color:#529214;
107
+ }
108
+ button.positive:hover {
109
+ background-color:#E6EFC2;
110
+ border:1px solid #C6D880;
111
+ color:#529214;
112
+ }
113
+
114
+ /* Negative */
115
+ body .negative {
116
+ color:#d12f19;
117
+ }
118
+ button.negative:hover {
119
+ background:#fbe3e4;
120
+ border:1px solid #fbc2c4;
121
+ color:#d12f19;
122
+ }
123
+
data/views/index.haml ADDED
@@ -0,0 +1,8 @@
1
+ #segform
2
+ %form{:action => '/segment', :method => 'post'}
3
+ %p
4
+ %textarea{:id => 'input', :rows => '15', :cols => '35', :name => 'input'}= @input || '输入要分词的文章'
5
+ %p
6
+ %button.negative{ :type => "submit" } 开始分词
7
+ #segresult
8
+ %p= @result
data/views/layout.haml ADDED
@@ -0,0 +1,16 @@
1
+ !!! Strict
2
+ %html{ :lang => "en", :"xml:lang" => "en", :xmlns => "http://www.w3.org/1999/xhtml" }
3
+ %head
4
+ %meta{ :content => "text/html; charset=utf-8", :"http-equiv" => "Content-Type" }
5
+ %meta{ :content => "zh_CN", :"http-equiv" => "Content-Language" }
6
+ %title= "Rseg中文分词"
7
+ %link{ :rel => 'stylesheet', :href => '/screen.css', :type => 'text/css', :media => "screen"}
8
+
9
+ %body
10
+ #wrap
11
+ #header
12
+ %h1= "Rseg 中文分词"
13
+ %address.watermark
14
+ #content.condensed= yield
15
+ #footer= "作者: 张元一 <br />EMail:zhangyuanyi#gmail.com"
16
+
metadata ADDED
@@ -0,0 +1,97 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rseg-ggharry
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Harry Chen
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-09-16 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: haml
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: sinatra
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description: A Chinese Word Segmentation(中文分词) routine in pure Ruby
42
+ email: harry@harry.gg
43
+ executables:
44
+ - rseg
45
+ - rseg_server
46
+ extensions: []
47
+ extra_rdoc_files:
48
+ - LICENSE
49
+ - README
50
+ files:
51
+ - ".gitignore"
52
+ - LICENSE
53
+ - README
54
+ - Rakefile
55
+ - VERSION
56
+ - bin/rseg
57
+ - bin/rseg_server
58
+ - dict/dict.hash
59
+ - lib/app.rb
60
+ - lib/builder/dict.rb
61
+ - lib/engines/dict.rb
62
+ - lib/engines/engine.rb
63
+ - lib/engines/english.rb
64
+ - lib/engines/name.rb
65
+ - lib/engines/number.rb
66
+ - lib/filters/conjunction.rb
67
+ - lib/filters/fullwidth.rb
68
+ - lib/filters/symbol.rb
69
+ - lib/rseg.rb
70
+ - public/screen.css
71
+ - views/index.haml
72
+ - views/layout.haml
73
+ homepage: http://github.com/yzhang/rseg
74
+ licenses: []
75
+ metadata: {}
76
+ post_install_message:
77
+ rdoc_options:
78
+ - "--charset=UTF-8"
79
+ require_paths:
80
+ - lib
81
+ required_ruby_version: !ruby/object:Gem::Requirement
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ required_rubygems_version: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - ">="
89
+ - !ruby/object:Gem::Version
90
+ version: '0'
91
+ requirements: []
92
+ rubyforge_project:
93
+ rubygems_version: 2.4.5
94
+ signing_key:
95
+ specification_version: 3
96
+ summary: A Chinese Word Segmentation(中文分词) routine in pure Ruby
97
+ test_files: []