ckip_client 0.0.6 → 0.0.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (2)
  1. data/lib/CKIP_Client.rb +27 -24
  2. metadata +2 -2
@@ -8,21 +8,14 @@ module CKIP
8
8
  class Client
9
9
 
10
10
  def self.get( sys , text )
11
- begin
12
- text.encode("Big5-UAO")
13
- rescue Encoding::UndefinedConversionError
14
- trans_text = text.encode("Big5-UAO", :undef => :replace).encode("UTF-8")
15
- un_w = text.delete( trans_text )
16
- puts "!!contains unsupported character: #{un_w}!!"
17
- raise $!
18
- end
19
11
  text_encoding = text.encoding.to_s
20
12
  unless ['Big5','Big5-UAO','UTF-8'].include? text_encoding
21
13
  raise 'Encoding ERROR!! CKIP_Client only support UTF-8 or Big5 or Big5-UAO encodings.'
22
14
  end
23
15
  input_encoding = (text_encoding == 'Big5-UAO')? 'Big5' : text_encoding
24
- sst = 1.6 - 1020.0 / (text.size + 640)
16
+ sst = 2.0 - 2304.0 / (text.size + 1280)
25
17
  config = YAML::load( File.open( File.dirname(__FILE__) + "/config/#{sys}.yml" ).read )
18
+ sleep rand * 0.25 + 0.1
26
19
  request = "<?xml version=\"1.0\" ?>
27
20
  <wordsegmentation version=\"0.1\" charsetcode=\"#{input_encoding.downcase}\">
28
21
  <option showcategory=\"1\" />
@@ -30,29 +23,40 @@ module CKIP
30
23
  <text>#{text}</text>
31
24
  </wordsegmentation>"
32
25
  begin
33
- Timeout::timeout(5) {
34
- socket = TCPSocket.open( config['host'] , config['port'] )
35
- socket.write( request )
36
- @xml_result = socket.gets.force_encoding( text_encoding )
37
- socket.close
26
+ time0 = Time.now
27
+ xml_result = Timeout::timeout(8.0 * (sst + 1.0)){
28
+ @socket = TCPSocket.open( config['host'] , config['port'] )
29
+ @socket.write( request )
30
+ @socket.gets.force_encoding( text_encoding )
38
31
  }
39
- sleep (rand + 0.25) * sst
40
- return @xml_result
32
+ time1 = (Time.now - time0)
33
+ sleep (rand + 0.5) * sst + time1 * 0.35
34
+
35
+ if xml_result.valid_encoding?
36
+ return xml_result.encode!('UTF-8')
37
+ else
38
+ trans_text = xml_result.encode("UTF-32", :undef => :replace, :invalid => :replace).encode( text_encoding )
39
+ text2 = text.gsub(/[^[:word:]]+/ , "")
40
+ trans_text.each_char{ |c| text2.delete!(c) }
41
+ puts "!!contains unsupported character: #{text2}!!"
42
+ raise Encoding::InvalidByteSequenceError
43
+ end
41
44
  rescue Timeout::Error
42
- puts "CKIP Connection Timeout!!!"
43
- raise Timeout::Error
45
+ time1 = (Time.now - time0)
46
+ puts "!!!Timeout: waited for #{time1.round(2)}s and no response from CKIP server!!!"
47
+ raise $!
48
+ ensure
49
+ @socket.close
44
50
  end
45
51
  end
46
52
 
47
53
  def self.xml2str( xml )
48
- xml.encode!('UTF-8')
49
- sleep (rand * 0.1 + 0.05)
50
54
  if /<result>(.*?)<\/result>/m.match( xml )
51
55
  return $1.gsub(/<\/sentence>\r?\n?\t*?\s*?<sentence>/,"\n").gsub("\n \n","\n\n").sub(/\t*?\s*?<sentence> ?/,'').sub(/<\/sentence>/,'').gsub("\n ", "\n")
52
56
  elsif /<processstatus code="\d">(.*?)<\/processstatus>/.match( xml )
53
57
  raise $1
54
58
  else
55
- raise "XML result format error!!"
59
+ raise "XML return error!!"
56
60
  end
57
61
  end
58
62
 
@@ -61,7 +65,7 @@ module CKIP
61
65
  def self.segment( text , mode = nil )
62
66
  output = Client.xml2str( Client.get( 'segment' , text ) )
63
67
  if ['compact','neat'].include?( mode )
64
- return output.gsub!(/\([A-Za-z]+\)/,'')
68
+ return output.gsub!(/\([A-Za-z_]+\)/,'')
65
69
  else
66
70
  return output
67
71
  end
@@ -69,10 +73,9 @@ module CKIP
69
73
 
70
74
  def self.parser( text , mode = nil )
71
75
  text.encode!('Big5-UAO') if text.encoding.to_s == 'UTF-8'
72
-
73
76
  output = Client.xml2str( Client.get( 'parser' , text ) )
74
77
  if ['compact','neat'].include?( mode )
75
- return output.gsub(/[A-Za-z]+?:/,'').gsub(/[A-Za-z]+?\(/,'(').gsub(/[A-Za-z]+?‧.+?\(/,'(').gsub!(/[#%]/,'').gsub!(/^\d+:\d+.\[\d+\]\s/,'').gsub(/\([A-Z]+?\)$/,'')
78
+ return output.gsub(/[A-Za-z_]+?:/,'').gsub(/[A-Za-z_]+?\(/,'(').gsub(/[A-Za-z_]+?‧.+?\(/,'(').gsub!(/[#%]/,'').gsub!(/^\d+:\d+.\[\d+\]\s/,'').gsub(/\([A-Z]+?\)$/,'')
76
79
  else
77
80
  return output
78
81
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ckip_client
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.6
4
+ version: 0.0.7
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-05-02 00:00:00.000000000 Z
12
+ date: 2013-05-06 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: CKIP_CLient是連接中央研究院詞庫小組研發之中文斷詞系統與中文剖析系統的Ruby程式界面。感謝中央研究院詞庫小組多年來之研究成果!
15
15
  email: xxxooo.tw@gmail.com