ckip_client 0.0.6 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. data/lib/CKIP_Client.rb +27 -24
  2. metadata +2 -2
@@ -8,21 +8,14 @@ module CKIP
8
8
  class Client
9
9
 
10
10
  def self.get( sys , text )
11
- begin
12
- text.encode("Big5-UAO")
13
- rescue Encoding::UndefinedConversionError
14
- trans_text = text.encode("Big5-UAO", :undef => :replace).encode("UTF-8")
15
- un_w = text.delete( trans_text )
16
- puts "!!contains unsupported character: #{un_w}!!"
17
- raise $!
18
- end
19
11
  text_encoding = text.encoding.to_s
20
12
  unless ['Big5','Big5-UAO','UTF-8'].include? text_encoding
21
13
  raise 'Encoding ERROR!! CKIP_Client only support UTF-8 or Big5 or Big5-UAO encodings.'
22
14
  end
23
15
  input_encoding = (text_encoding == 'Big5-UAO')? 'Big5' : text_encoding
24
- sst = 1.6 - 1020.0 / (text.size + 640)
16
+ sst = 2.0 - 2304.0 / (text.size + 1280)
25
17
  config = YAML::load( File.open( File.dirname(__FILE__) + "/config/#{sys}.yml" ).read )
18
+ sleep rand * 0.25 + 0.1
26
19
  request = "<?xml version=\"1.0\" ?>
27
20
  <wordsegmentation version=\"0.1\" charsetcode=\"#{input_encoding.downcase}\">
28
21
  <option showcategory=\"1\" />
@@ -30,29 +23,40 @@ module CKIP
30
23
  <text>#{text}</text>
31
24
  </wordsegmentation>"
32
25
  begin
33
- Timeout::timeout(5) {
34
- socket = TCPSocket.open( config['host'] , config['port'] )
35
- socket.write( request )
36
- @xml_result = socket.gets.force_encoding( text_encoding )
37
- socket.close
26
+ time0 = Time.now
27
+ xml_result = Timeout::timeout(8.0 * (sst + 1.0)){
28
+ @socket = TCPSocket.open( config['host'] , config['port'] )
29
+ @socket.write( request )
30
+ @socket.gets.force_encoding( text_encoding )
38
31
  }
39
- sleep (rand + 0.25) * sst
40
- return @xml_result
32
+ time1 = (Time.now - time0)
33
+ sleep (rand + 0.5) * sst + time1 * 0.35
34
+
35
+ if xml_result.valid_encoding?
36
+ return xml_result.encode!('UTF-8')
37
+ else
38
+ trans_text = xml_result.encode("UTF-32", :undef => :replace, :invalid => :replace).encode( text_encoding )
39
+ text2 = text.gsub(/[^[:word:]]+/ , "")
40
+ trans_text.each_char{ |c| text2.delete!(c) }
41
+ puts "!!contains unsupported character: #{text2}!!"
42
+ raise Encoding::InvalidByteSequenceError
43
+ end
41
44
  rescue Timeout::Error
42
- puts "CKIP Connection Timeout!!!"
43
- raise Timeout::Error
45
+ time1 = (Time.now - time0)
46
+ puts "!!!Timeout: waited for #{time1.round(2)}s and no response from CKIP server!!!"
47
+ raise $!
48
+ ensure
49
+ @socket.close
44
50
  end
45
51
  end
46
52
 
47
53
  def self.xml2str( xml )
48
- xml.encode!('UTF-8')
49
- sleep (rand * 0.1 + 0.05)
50
54
  if /<result>(.*?)<\/result>/m.match( xml )
51
55
  return $1.gsub(/<\/sentence>\r?\n?\t*?\s*?<sentence>/,"\n").gsub("\n \n","\n\n").sub(/\t*?\s*?<sentence> ?/,'').sub(/<\/sentence>/,'').gsub("\n ", "\n")
52
56
  elsif /<processstatus code="\d">(.*?)<\/processstatus>/.match( xml )
53
57
  raise $1
54
58
  else
55
- raise "XML result format error!!"
59
+ raise "XML return error!!"
56
60
  end
57
61
  end
58
62
 
@@ -61,7 +65,7 @@ module CKIP
61
65
  def self.segment( text , mode = nil )
62
66
  output = Client.xml2str( Client.get( 'segment' , text ) )
63
67
  if ['compact','neat'].include?( mode )
64
- return output.gsub!(/\([A-Za-z]+\)/,'')
68
+ return output.gsub!(/\([A-Za-z_]+\)/,'')
65
69
  else
66
70
  return output
67
71
  end
@@ -69,10 +73,9 @@ module CKIP
69
73
 
70
74
  def self.parser( text , mode = nil )
71
75
  text.encode!('Big5-UAO') if text.encoding.to_s == 'UTF-8'
72
-
73
76
  output = Client.xml2str( Client.get( 'parser' , text ) )
74
77
  if ['compact','neat'].include?( mode )
75
- return output.gsub(/[A-Za-z]+?:/,'').gsub(/[A-Za-z]+?\(/,'(').gsub(/[A-Za-z]+?‧.+?\(/,'(').gsub!(/[#%]/,'').gsub!(/^\d+:\d+.\[\d+\]\s/,'').gsub(/\([A-Z]+?\)$/,'')
78
+ return output.gsub(/[A-Za-z_]+?:/,'').gsub(/[A-Za-z_]+?\(/,'(').gsub(/[A-Za-z_]+?‧.+?\(/,'(').gsub!(/[#%]/,'').gsub!(/^\d+:\d+.\[\d+\]\s/,'').gsub(/\([A-Z]+?\)$/,'')
76
79
  else
77
80
  return output
78
81
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ckip_client
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.6
4
+ version: 0.0.7
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-05-02 00:00:00.000000000 Z
12
+ date: 2013-05-06 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: CKIP_CLient是連接中央研究院詞庫小組研發之中文斷詞系統與中文剖析系統的Ruby程式界面。感謝中央研究院詞庫小組多年來之研究成果!
15
15
  email: xxxooo.tw@gmail.com