ckip_client 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +7 -6
- data/lib/CKIP_Client.rb +15 -6
- metadata +2 -2
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
#
|
1
|
+
#CKIP_Client
|
2
2
|
|
3
3
|
[RubyGems](http://rubygems.org/gems/ckip_client)
|
4
4
|
|
@@ -11,11 +11,11 @@ CKIP_Client是連接[中央研究院][中央研究院][詞庫小組][詞庫小
|
|
11
11
|
請先至中文斷詞系統[網站][斷詞申請]或中文剖析系統[網站][剖析申請]申請:帳號/密碼
|
12
12
|
再安裝本Gem
|
13
13
|
|
14
|
-
gem install
|
14
|
+
gem install ckip_client
|
15
15
|
|
16
16
|
安裝完成後至Gem所在資料夾中修改帳號密碼資料。
|
17
17
|
資料夾位置通常在:/usr/local/lib/ruby/gems/1.9.1/gems/
|
18
|
-
進入:ckip_client-0.0.
|
18
|
+
進入:ckip_client-0.0.5/lib/config/
|
19
19
|
於 segment.yml 檔案中輸入中文斷詞系統之帳號密碼,
|
20
20
|
於 parser.yml 檔案中輸入中文剖析系統之帳號密碼,
|
21
21
|
至此安裝設定就緒。
|
@@ -36,9 +36,10 @@ CKIP_Client是連接[中央研究院][中央研究院][詞庫小組][詞庫小
|
|
36
36
|
CKIP.segment( text , 'neat' )
|
37
37
|
CKIP.parser( text , 'neat' )
|
38
38
|
|
39
|
-
|
40
|
-
輸入的字串編碼可以是 UTF-8
|
41
|
-
|
39
|
+
文字編碼:
|
40
|
+
輸入的字串編碼可以是 UTF-8 或 Big5 或是 Big5-UAO 三種其中之一。
|
41
|
+
而輸出結果一律為 UTF-8 編碼。
|
42
|
+
CKIP系統不支援 Big5-HKSCS 之特有港字。
|
42
43
|
|
43
44
|
|
44
45
|
## 範例 Example
|
data/lib/CKIP_Client.rb
CHANGED
@@ -7,12 +7,20 @@ module CKIP
|
|
7
7
|
class Client
|
8
8
|
|
9
9
|
def self.get( sys , text )
|
10
|
+
begin
|
11
|
+
text.encode("Big5-UAO")
|
12
|
+
rescue Encoding::UndefinedConversionError
|
13
|
+
trans_text = text.encode("Big5-UAO", :undef => :replace).encode("UTF-8")
|
14
|
+
un_w = text.delete( trans_text )
|
15
|
+
puts "!!contains unsupported character: #{un_w}!!"
|
16
|
+
raise $!
|
17
|
+
end
|
10
18
|
text_encoding = text.encoding.to_s
|
11
19
|
unless ['Big5','Big5-UAO','UTF-8'].include? text_encoding
|
12
|
-
raise 'Encoding ERROR
|
20
|
+
raise 'Encoding ERROR!! CKIP_Client only support UTF-8 or Big5 or Big5-UAO encodings.'
|
13
21
|
end
|
14
22
|
input_encoding = (text_encoding == 'Big5-UAO')? 'Big5' : text_encoding
|
15
|
-
|
23
|
+
sst = 1.6 - 1020.0 / (text.size + 640)
|
16
24
|
config = YAML::load( File.open( File.dirname(__FILE__) + "/config/#{sys}.yml" ).read )
|
17
25
|
request = "<?xml version=\"1.0\" ?>
|
18
26
|
<wordsegmentation version=\"0.1\" charsetcode=\"#{input_encoding.downcase}\">
|
@@ -24,10 +32,11 @@ module CKIP
|
|
24
32
|
Timeout::timeout(5) {
|
25
33
|
socket = TCPSocket.open( config['host'] , config['port'] )
|
26
34
|
socket.write( request )
|
27
|
-
xml_result = socket.gets.force_encoding( text_encoding )
|
35
|
+
@xml_result = socket.gets.force_encoding( text_encoding )
|
28
36
|
socket.close
|
29
|
-
return xml_result
|
30
37
|
}
|
38
|
+
sleep (rand + 0.25) * sst
|
39
|
+
return @xml_result
|
31
40
|
rescue Timeout::Error
|
32
41
|
puts "CKIP Connection Timeout!!!"
|
33
42
|
raise Timeout::Error
|
@@ -36,6 +45,7 @@ module CKIP
|
|
36
45
|
|
37
46
|
def self.xml2str( xml )
|
38
47
|
xml.encode!('UTF-8')
|
48
|
+
sleep (rand * 0.1 + 0.05)
|
39
49
|
if /<result>(.*?)<\/result>/m.match( xml )
|
40
50
|
return $1.gsub(/<\/sentence>\r?\n?\t*?\s*?<sentence>/,"\n").gsub("\n \n","\n\n").sub(/\t*?\s*?<sentence> ?/,'').sub(/<\/sentence>/,'').gsub("\n ", "\n")
|
41
51
|
elsif /<processstatus code="\d">(.*?)<\/processstatus>/.match( xml )
|
@@ -44,7 +54,7 @@ module CKIP
|
|
44
54
|
raise "XML result format error!!"
|
45
55
|
end
|
46
56
|
end
|
47
|
-
|
57
|
+
|
48
58
|
end
|
49
59
|
|
50
60
|
def self.segment( text , mode = nil )
|
@@ -65,7 +75,6 @@ module CKIP
|
|
65
75
|
else
|
66
76
|
return output
|
67
77
|
end
|
68
|
-
|
69
78
|
end
|
70
79
|
|
71
80
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ckip_client
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.0.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-
|
12
|
+
date: 2013-05-02 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: CKIP_CLient是連接中央研究院詞庫小組研發之中文斷詞系統與中文剖析系統的Ruby程式界面。感謝中央研究院詞庫小組多年來之研究成果!
|
15
15
|
email: xxxooo.tw@gmail.com
|