ckip_client 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. data/README.md +7 -6
  2. data/lib/CKIP_Client.rb +15 -6
  3. metadata +2 -2
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- #CKIP_CLient
1
+ #CKIP_Client
2
2
 
3
3
  [RubyGems](http://rubygems.org/gems/ckip_client)
4
4
 
@@ -11,11 +11,11 @@ CKIP_Client是連接[中央研究院][中央研究院][詞庫小組][詞庫小
11
11
  請先至中文斷詞系統[網站][斷詞申請]或中文剖析系統[網站][剖析申請]申請:帳號/密碼
12
12
  再安裝本Gem
13
13
 
14
- gem install chinese_convt
14
+ gem install ckip_client
15
15
 
16
16
  安裝完成後至Gem所在資料夾中修改帳號密碼資料。
17
17
  資料夾位置通常在:/usr/local/lib/ruby/gems/1.9.1/gems/
18
- 進入:ckip_client-0.0.3/lib/config/
18
+ 進入:ckip_client-0.0.5/lib/config/
19
19
  於 segment.yml 檔案中輸入中文斷詞系統之帳號密碼,
20
20
  於 parser.yml 檔案中輸入中文剖析系統之帳號密碼,
21
21
  至此安裝設定就緒。
@@ -36,9 +36,10 @@ CKIP_Client是連接[中央研究院][中央研究院][詞庫小組][詞庫小
36
36
  CKIP.segment( text , 'neat' )
37
37
  CKIP.parser( text , 'neat' )
38
38
 
39
- 注:
40
- 輸入的字串編碼可以是 UTF-8 或是 Big5 Big5-UAO 三種其中之一。
41
- 但是輸出結果一律為 UTF-8 編碼。
39
+ 文字編碼:
40
+ 輸入的字串編碼可以是 UTF-8 Big5 或是 Big5-UAO 三種其中之一。
41
+ 而輸出結果一律為 UTF-8 編碼。
42
+ CKIP系統不支援 Big5-HKSCS 之特有港字。
42
43
 
43
44
 
44
45
  ## 範例 Example
data/lib/CKIP_Client.rb CHANGED
@@ -7,12 +7,20 @@ module CKIP
7
7
  class Client
8
8
 
9
9
  def self.get( sys , text )
10
+ begin
11
+ text.encode("Big5-UAO")
12
+ rescue Encoding::UndefinedConversionError
13
+ trans_text = text.encode("Big5-UAO", :undef => :replace).encode("UTF-8")
14
+ un_w = text.delete( trans_text )
15
+ puts "!!contains unsupported character: #{un_w}!!"
16
+ raise $!
17
+ end
10
18
  text_encoding = text.encoding.to_s
11
19
  unless ['Big5','Big5-UAO','UTF-8'].include? text_encoding
12
- raise 'Encoding ERROR : CKIP_Client only support UTF-8 or Big5 or Big5-UAO encodings.'
20
+ raise 'Encoding ERROR!! CKIP_Client only support UTF-8 or Big5 or Big5-UAO encodings.'
13
21
  end
14
22
  input_encoding = (text_encoding == 'Big5-UAO')? 'Big5' : text_encoding
15
-
23
+ sst = 1.6 - 1020.0 / (text.size + 640)
16
24
  config = YAML::load( File.open( File.dirname(__FILE__) + "/config/#{sys}.yml" ).read )
17
25
  request = "<?xml version=\"1.0\" ?>
18
26
  <wordsegmentation version=\"0.1\" charsetcode=\"#{input_encoding.downcase}\">
@@ -24,10 +32,11 @@ module CKIP
24
32
  Timeout::timeout(5) {
25
33
  socket = TCPSocket.open( config['host'] , config['port'] )
26
34
  socket.write( request )
27
- xml_result = socket.gets.force_encoding( text_encoding )
35
+ @xml_result = socket.gets.force_encoding( text_encoding )
28
36
  socket.close
29
- return xml_result
30
37
  }
38
+ sleep (rand + 0.25) * sst
39
+ return @xml_result
31
40
  rescue Timeout::Error
32
41
  puts "CKIP Connection Timeout!!!"
33
42
  raise Timeout::Error
@@ -36,6 +45,7 @@ module CKIP
36
45
 
37
46
  def self.xml2str( xml )
38
47
  xml.encode!('UTF-8')
48
+ sleep (rand * 0.1 + 0.05)
39
49
  if /<result>(.*?)<\/result>/m.match( xml )
40
50
  return $1.gsub(/<\/sentence>\r?\n?\t*?\s*?<sentence>/,"\n").gsub("\n \n","\n\n").sub(/\t*?\s*?<sentence> ?/,'').sub(/<\/sentence>/,'').gsub("\n ", "\n")
41
51
  elsif /<processstatus code="\d">(.*?)<\/processstatus>/.match( xml )
@@ -44,7 +54,7 @@ module CKIP
44
54
  raise "XML result format error!!"
45
55
  end
46
56
  end
47
-
57
+
48
58
  end
49
59
 
50
60
  def self.segment( text , mode = nil )
@@ -65,7 +75,6 @@ module CKIP
65
75
  else
66
76
  return output
67
77
  end
68
-
69
78
  end
70
79
 
71
80
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ckip_client
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-04-30 00:00:00.000000000 Z
12
+ date: 2013-05-02 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: CKIP_CLient是連接中央研究院詞庫小組研發之中文斷詞系統與中文剖析系統的Ruby程式界面。感謝中央研究院詞庫小組多年來之研究成果!
15
15
  email: xxxooo.tw@gmail.com