ckip_client 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +63 -0
- data/lib/CKIP_Client.rb +65 -0
- data/lib/config/parser.yml +4 -0
- data/lib/config/segment.yml +4 -0
- metadata +48 -0
data/README.md
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
# CKIP_Client
|
2
|
+
|
3
|
+
[RubyGems](http://rubygems.org/gems/ckip_client)
|
4
|
+
|
5
|
+
CKIP_Client是連接[中央研究院][中央研究院][詞庫小組][詞庫小組]研發之[中文斷詞系統][斷詞系統]與[中文剖析系統][剖析系統]的Ruby程式界面。
|
6
|
+
感謝中央研究院[詞庫小組][詞庫小組]多年來之研究成果!
|
7
|
+
|
8
|
+
|
9
|
+
## 安裝 Installation
|
10
|
+
|
11
|
+
請先至中文斷詞系統[網站][斷詞申請]或中文剖析系統[網站][剖析申請]申請:帳號/密碼
|
12
|
+
再安裝本Gem!
|
13
|
+
|
14
|
+
gem install ckip_client
|
15
|
+
|
16
|
+
安裝完成後至Gem所在資料夾中修改帳號密碼資料。
|
17
|
+
資料夾位置通常在:/usr/local/lib/ruby/gems/1.9.1/gems/
|
18
|
+
進入:ckip_client-0.0.3/lib/config/
|
19
|
+
於 segment.yml 檔案中輸入中文斷詞系統之帳號密碼,
|
20
|
+
於 parser.yml 檔案中輸入中文剖析系統之帳號密碼,
|
21
|
+
至此安裝設定就緒。
|
22
|
+
|
23
|
+
|
24
|
+
## 使用 Usage
|
25
|
+
|
26
|
+
將文章斷詞:
|
27
|
+
|
28
|
+
CKIP.segment( text )
|
29
|
+
|
30
|
+
剖析文章:
|
31
|
+
|
32
|
+
CKIP.parser( text )
|
33
|
+
|
34
|
+
也可以讓輸出結果濾除詞性資料,在輸入時加入第二個參數 'neat'
|
35
|
+
|
36
|
+
CKIP.segment( text , 'neat' )
|
37
|
+
CKIP.parser( text , 'neat' )
|
38
|
+
|
39
|
+
注:
|
40
|
+
輸入的字串編碼可以是 UTF-8 或是 Big5 或 Big5-UAO 三種其中之一。
|
41
|
+
但是輸出結果一律為 UTF-8 編碼!
|
42
|
+
|
43
|
+
|
44
|
+
## 範例 Example
|
45
|
+
|
46
|
+
require 'ckip_client'
|
47
|
+
text = File.open('text.txt').read
|
48
|
+
puts CKIP.segment( text )
|
49
|
+
|
50
|
+
|
51
|
+
## 參閱 References
|
52
|
+
|
53
|
+
+ [中研院詞庫小組][詞庫小組]
|
54
|
+
+ [中文斷詞系統][斷詞系統]
|
55
|
+
+ [中文剖析系統][剖析系統]
|
56
|
+
|
57
|
+
|
58
|
+
[中央研究院]: http://www.sinica.edu.tw/
|
59
|
+
[詞庫小組]: http://godel.iis.sinica.edu.tw/CKIP/
|
60
|
+
[斷詞系統]: http://ckipsvr.iis.sinica.edu.tw
|
61
|
+
[剖析系統]: http://parser.iis.sinica.edu.tw
|
62
|
+
[斷詞申請]: http://ckipsvr.iis.sinica.edu.tw/webservice.htm
|
63
|
+
[剖析申請]: http://parser.iis.sinica.edu.tw/v1/apply.htm
|
data/lib/CKIP_Client.rb
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'socket'
|
3
|
+
require 'yaml'
|
4
|
+
|
5
|
+
module CKIP

  # Values of the optional +mode+ argument that strip annotation data from
  # the output of CKIP.segment and CKIP.parser.
  NEAT_MODES = ['compact', 'neat'].freeze

  # Low-level client: sends one XML request over TCP to the service described
  # by lib/config/<sys>.yml and extracts the text payload from the XML reply.
  class Client

    # Input encodings accepted by Client.get, as reported by Encoding#to_s.
    SUPPORTED_ENCODINGS = ['Big5', 'Big5-UAO', 'UTF-8'].freeze

    # Sends +text+ to the service named +sys+ and returns the raw reply.
    #
    # sys  - config name; "config/#{sys}.yml" next to this file must provide
    #        the 'host', 'port', 'username' and 'password' keys.
    # text - String encoded as UTF-8, Big5 or Big5-UAO.
    #
    # Returns the first line of the server reply, tagged with the encoding
    # of +text+.
    # Raises RuntimeError when the encoding is unsupported or when the server
    # closes the connection without sending anything.
    def self.get(sys, text)
      text_encoding = text.encoding.to_s
      unless SUPPORTED_ENCODINGS.include?(text_encoding)
        raise 'Encoding ERROR : CKIP_Client only support UTF-8 or Big5 or Big5-UAO encodings.'
      end
      # Big5-UAO text is announced to the server as plain "big5".
      input_encoding = text_encoding == 'Big5-UAO' ? 'Big5' : text_encoding

      # File.read closes the file handle; File.open(...).read leaked it.
      config = YAML.load(File.read(File.join(File.dirname(__FILE__), 'config', "#{sys}.yml")))

      # NOTE(review): text and credentials are interpolated without XML
      # escaping, so '<' or '&' in the input yields malformed XML. Confirm
      # what the server expects before adding escaping.
      request = "<?xml version=\"1.0\" ?>\n" \
                "<wordsegmentation version=\"0.1\" charsetcode=\"#{input_encoding.downcase}\">\n" \
                "<option showcategory=\"1\" />\n" \
                "<authentication username=\"#{config['username']}\" password=\"#{config['password']}\" />\n" \
                "<text>#{text}</text>\n" \
                "</wordsegmentation>"

      socket = TCPSocket.open(config['host'], config['port'])
      begin
        socket.write(request)
        reply = socket.gets
      ensure
        # Always release the connection, even when write/gets raises.
        socket.close
      end

      raise 'No response from CKIP server.' if reply.nil?
      reply.force_encoding(text_encoding)
    end

    # Extracts the sentences from a server reply.
    #
    # xml - String holding the XML reply; it is not modified.
    #
    # Returns a UTF-8 String with one sentence per line, taken from the
    # <result> element.
    # Raises RuntimeError carrying the <processstatus> text when the reply
    # has no <result>, or "XML result format error!!" when neither element
    # is found.
    def self.xml2str(xml)
      # Non-destructive encode: works on frozen strings and leaves the
      # caller's object untouched.
      utf8_xml = xml.encode('UTF-8')

      if (result = /<result>(.*?)<\/result>/m.match(utf8_xml))
        result[1].
          gsub(/<\/sentence>\r?\n?\t*?\s*?<sentence>/, "\n").
          gsub("\n \n", "\n\n").
          sub(/\t*?\s*?<sentence> ?/, '').
          sub(/<\/sentence>/, '').
          gsub("\n ", "\n")
      elsif (status = /<processstatus code="\d">(.*?)<\/processstatus>/.match(utf8_xml))
        raise status[1]
      else
        raise "XML result format error!!"
      end
    end

  end

  # Segments +text+ into words using the 'segment' service.
  #
  # text - String encoded as UTF-8, Big5 or Big5-UAO.
  # mode - pass 'neat' or 'compact' (String or Symbol) to drop the
  #        parenthesised alphabetic tags such as "(N)" from the output.
  #
  # Returns a UTF-8 String. Always a String: the previous gsub! returned nil
  # when the output contained no tags.
  def self.segment(text, mode = nil)
    output = Client.xml2str(Client.get('segment', text))
    return output unless NEAT_MODES.include?(mode.to_s)

    output.gsub(/\([A-Za-z]+\)/, '')
  end

  # Parses +text+ using the 'parser' service.
  #
  # text - String encoded as UTF-8, Big5 or Big5-UAO. UTF-8 input is sent as
  #        a Big5-UAO copy; the caller's string is left untouched.
  # mode - pass 'neat' or 'compact' (String or Symbol) to strip labels,
  #        '#'/'%' markers and the leading "n:n.[n] " prefix from each line.
  #
  # Returns a UTF-8 String.
  def self.parser(text, mode = nil)
    # NOTE(review): presumably the parser service only accepts Big5 input;
    # confirm against the service documentation.
    request_text = text.encoding.to_s == 'UTF-8' ? text.encode('Big5-UAO') : text

    output = Client.xml2str(Client.get('parser', request_text))
    return output unless NEAT_MODES.include?(mode.to_s)

    # Non-destructive gsub throughout: chaining gsub! breaks with
    # NoMethodError on nil whenever one step finds nothing to replace.
    output.
      gsub(/[A-Za-z]+?:/, '').
      gsub(/[A-Za-z]+?\(/, '(').
      gsub(/[A-Za-z]+?‧.+?\(/, '(').
      gsub(/[#%]/, '').
      gsub(/^\d+:\d+.\[\d+\]\s/, '').
      gsub(/\([A-Z]+?\)$/, '')
  end

end
|
metadata
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ckip_client
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.3
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- xxxooo
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-11-09 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: CKIP_CLient是連接中央研究院詞庫小組研發之中文斷詞系統與中文剖析系統的Ruby程式界面。感謝中央研究院[詞庫小組][詞庫小組]多年來之研究成果!
|
15
|
+
email: xxxooo.tw@gmail.com
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- README.md
|
21
|
+
- lib/CKIP_Client.rb
|
22
|
+
- lib/config/segment.yml
|
23
|
+
- lib/config/parser.yml
|
24
|
+
homepage: http://github.com/xxxooo/ckip_client
|
25
|
+
licenses: []
|
26
|
+
post_install_message:
|
27
|
+
rdoc_options: []
|
28
|
+
require_paths:
|
29
|
+
- lib
|
30
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
31
|
+
none: false
|
32
|
+
requirements:
|
33
|
+
- - ! '>='
|
34
|
+
- !ruby/object:Gem::Version
|
35
|
+
version: '0'
|
36
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
37
|
+
none: false
|
38
|
+
requirements:
|
39
|
+
- - ! '>='
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: '0'
|
42
|
+
requirements: []
|
43
|
+
rubyforge_project:
|
44
|
+
rubygems_version: 1.8.23
|
45
|
+
signing_key:
|
46
|
+
specification_version: 3
|
47
|
+
summary: 連接中研院詞庫小組的中文斷詞系統與中文剖析系統之API
|
48
|
+
test_files: []
|