ckip_client 0.0.6 → 0.0.7
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/CKIP_Client.rb +27 -24
- metadata +2 -2
data/lib/CKIP_Client.rb
CHANGED
@@ -8,21 +8,14 @@ module CKIP
|
|
8
8
|
class Client
|
9
9
|
|
10
10
|
def self.get( sys , text )
|
11
|
-
begin
|
12
|
-
text.encode("Big5-UAO")
|
13
|
-
rescue Encoding::UndefinedConversionError
|
14
|
-
trans_text = text.encode("Big5-UAO", :undef => :replace).encode("UTF-8")
|
15
|
-
un_w = text.delete( trans_text )
|
16
|
-
puts "!!contains unsupported character: #{un_w}!!"
|
17
|
-
raise $!
|
18
|
-
end
|
19
11
|
text_encoding = text.encoding.to_s
|
20
12
|
unless ['Big5','Big5-UAO','UTF-8'].include? text_encoding
|
21
13
|
raise 'Encoding ERROR!! CKIP_Client only support UTF-8 or Big5 or Big5-UAO encodings.'
|
22
14
|
end
|
23
15
|
input_encoding = (text_encoding == 'Big5-UAO')? 'Big5' : text_encoding
|
24
|
-
sst =
|
16
|
+
sst = 2.0 - 2304.0 / (text.size + 1280)
|
25
17
|
config = YAML::load( File.open( File.dirname(__FILE__) + "/config/#{sys}.yml" ).read )
|
18
|
+
sleep rand * 0.25 + 0.1
|
26
19
|
request = "<?xml version=\"1.0\" ?>
|
27
20
|
<wordsegmentation version=\"0.1\" charsetcode=\"#{input_encoding.downcase}\">
|
28
21
|
<option showcategory=\"1\" />
|
@@ -30,29 +23,40 @@ module CKIP
|
|
30
23
|
<text>#{text}</text>
|
31
24
|
</wordsegmentation>"
|
32
25
|
begin
|
33
|
-
|
34
|
-
|
35
|
-
socket.
|
36
|
-
@
|
37
|
-
socket.
|
26
|
+
time0 = Time.now
|
27
|
+
xml_result = Timeout::timeout(8.0 * (sst + 1.0)){
|
28
|
+
@socket = TCPSocket.open( config['host'] , config['port'] )
|
29
|
+
@socket.write( request )
|
30
|
+
@socket.gets.force_encoding( text_encoding )
|
38
31
|
}
|
39
|
-
|
40
|
-
|
32
|
+
time1 = (Time.now - time0)
|
33
|
+
sleep (rand + 0.5) * sst + time1 * 0.35
|
34
|
+
|
35
|
+
if xml_result.valid_encoding?
|
36
|
+
return xml_result.encode!('UTF-8')
|
37
|
+
else
|
38
|
+
trans_text = xml_result.encode("UTF-32", :undef => :replace, :invalid => :replace).encode( text_encoding )
|
39
|
+
text2 = text.gsub(/[^[:word:]]+/ , "")
|
40
|
+
trans_text.each_char{ |c| text2.delete!(c) }
|
41
|
+
puts "!!contains unsupported character: #{text2}!!"
|
42
|
+
raise Encoding::InvalidByteSequenceError
|
43
|
+
end
|
41
44
|
rescue Timeout::Error
|
42
|
-
|
43
|
-
|
45
|
+
time1 = (Time.now - time0)
|
46
|
+
puts "!!!Timeout: waited for #{time1.round(2)}s and no response from CKIP server!!!"
|
47
|
+
raise $!
|
48
|
+
ensure
|
49
|
+
@socket.close
|
44
50
|
end
|
45
51
|
end
|
46
52
|
|
47
53
|
def self.xml2str( xml )
|
48
|
-
xml.encode!('UTF-8')
|
49
|
-
sleep (rand * 0.1 + 0.05)
|
50
54
|
if /<result>(.*?)<\/result>/m.match( xml )
|
51
55
|
return $1.gsub(/<\/sentence>\r?\n?\t*?\s*?<sentence>/,"\n").gsub("\n \n","\n\n").sub(/\t*?\s*?<sentence> ?/,'').sub(/<\/sentence>/,'').gsub("\n ", "\n")
|
52
56
|
elsif /<processstatus code="\d">(.*?)<\/processstatus>/.match( xml )
|
53
57
|
raise $1
|
54
58
|
else
|
55
|
-
raise "XML
|
59
|
+
raise "XML return error!!"
|
56
60
|
end
|
57
61
|
end
|
58
62
|
|
@@ -61,7 +65,7 @@ module CKIP
|
|
61
65
|
def self.segment( text , mode = nil )
|
62
66
|
output = Client.xml2str( Client.get( 'segment' , text ) )
|
63
67
|
if ['compact','neat'].include?( mode )
|
64
|
-
return output.gsub!(/\([A-Za-
|
68
|
+
return output.gsub!(/\([A-Za-z_]+\)/,'')
|
65
69
|
else
|
66
70
|
return output
|
67
71
|
end
|
@@ -69,10 +73,9 @@ module CKIP
|
|
69
73
|
|
70
74
|
def self.parser( text , mode = nil )
|
71
75
|
text.encode!('Big5-UAO') if text.encoding.to_s == 'UTF-8'
|
72
|
-
|
73
76
|
output = Client.xml2str( Client.get( 'parser' , text ) )
|
74
77
|
if ['compact','neat'].include?( mode )
|
75
|
-
return output.gsub(/[A-Za-
|
78
|
+
return output.gsub(/[A-Za-z_]+?:/,'').gsub(/[A-Za-z_]+?\(/,'(').gsub(/[A-Za-z_]+?‧.+?\(/,'(').gsub!(/[#%]/,'').gsub!(/^\d+:\d+.\[\d+\]\s/,'').gsub(/\([A-Z]+?\)$/,'')
|
76
79
|
else
|
77
80
|
return output
|
78
81
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ckip_client
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.6
|
4
|
+
version: 0.0.7
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-05-
|
12
|
+
date: 2013-05-06 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: CKIP_CLient是連接中央研究院詞庫小組研發之中文斷詞系統與中文剖析系統的Ruby程式界面。感謝中央研究院詞庫小組多年來之研究成果!
|
15
15
|
email: xxxooo.tw@gmail.com
|