ckip_client 0.0.6 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/CKIP_Client.rb +27 -24
- metadata +2 -2
data/lib/CKIP_Client.rb
CHANGED
@@ -8,21 +8,14 @@ module CKIP
|
|
8
8
|
class Client
|
9
9
|
|
10
10
|
def self.get( sys , text )
|
11
|
-
begin
|
12
|
-
text.encode("Big5-UAO")
|
13
|
-
rescue Encoding::UndefinedConversionError
|
14
|
-
trans_text = text.encode("Big5-UAO", :undef => :replace).encode("UTF-8")
|
15
|
-
un_w = text.delete( trans_text )
|
16
|
-
puts "!!contains unsupported character: #{un_w}!!"
|
17
|
-
raise $!
|
18
|
-
end
|
19
11
|
text_encoding = text.encoding.to_s
|
20
12
|
unless ['Big5','Big5-UAO','UTF-8'].include? text_encoding
|
21
13
|
raise 'Encoding ERROR!! CKIP_Client only support UTF-8 or Big5 or Big5-UAO encodings.'
|
22
14
|
end
|
23
15
|
input_encoding = (text_encoding == 'Big5-UAO')? 'Big5' : text_encoding
|
24
|
-
sst =
|
16
|
+
sst = 2.0 - 2304.0 / (text.size + 1280)
|
25
17
|
config = YAML::load( File.open( File.dirname(__FILE__) + "/config/#{sys}.yml" ).read )
|
18
|
+
sleep rand * 0.25 + 0.1
|
26
19
|
request = "<?xml version=\"1.0\" ?>
|
27
20
|
<wordsegmentation version=\"0.1\" charsetcode=\"#{input_encoding.downcase}\">
|
28
21
|
<option showcategory=\"1\" />
|
@@ -30,29 +23,40 @@ module CKIP
|
|
30
23
|
<text>#{text}</text>
|
31
24
|
</wordsegmentation>"
|
32
25
|
begin
|
33
|
-
|
34
|
-
|
35
|
-
socket.
|
36
|
-
@
|
37
|
-
socket.
|
26
|
+
time0 = Time.now
|
27
|
+
xml_result = Timeout::timeout(8.0 * (sst + 1.0)){
|
28
|
+
@socket = TCPSocket.open( config['host'] , config['port'] )
|
29
|
+
@socket.write( request )
|
30
|
+
@socket.gets.force_encoding( text_encoding )
|
38
31
|
}
|
39
|
-
|
40
|
-
|
32
|
+
time1 = (Time.now - time0)
|
33
|
+
sleep (rand + 0.5) * sst + time1 * 0.35
|
34
|
+
|
35
|
+
if xml_result.valid_encoding?
|
36
|
+
return xml_result.encode!('UTF-8')
|
37
|
+
else
|
38
|
+
trans_text = xml_result.encode("UTF-32", :undef => :replace, :invalid => :replace).encode( text_encoding )
|
39
|
+
text2 = text.gsub(/[^[:word:]]+/ , "")
|
40
|
+
trans_text.each_char{ |c| text2.delete!(c) }
|
41
|
+
puts "!!contains unsupported character: #{text2}!!"
|
42
|
+
raise Encoding::InvalidByteSequenceError
|
43
|
+
end
|
41
44
|
rescue Timeout::Error
|
42
|
-
|
43
|
-
|
45
|
+
time1 = (Time.now - time0)
|
46
|
+
puts "!!!Timeout: waited for #{time1.round(2)}s and no response from CKIP server!!!"
|
47
|
+
raise $!
|
48
|
+
ensure
|
49
|
+
@socket.close
|
44
50
|
end
|
45
51
|
end
|
46
52
|
|
47
53
|
def self.xml2str( xml )
|
48
|
-
xml.encode!('UTF-8')
|
49
|
-
sleep (rand * 0.1 + 0.05)
|
50
54
|
if /<result>(.*?)<\/result>/m.match( xml )
|
51
55
|
return $1.gsub(/<\/sentence>\r?\n?\t*?\s*?<sentence>/,"\n").gsub("\n \n","\n\n").sub(/\t*?\s*?<sentence> ?/,'').sub(/<\/sentence>/,'').gsub("\n ", "\n")
|
52
56
|
elsif /<processstatus code="\d">(.*?)<\/processstatus>/.match( xml )
|
53
57
|
raise $1
|
54
58
|
else
|
55
|
-
raise "XML
|
59
|
+
raise "XML return error!!"
|
56
60
|
end
|
57
61
|
end
|
58
62
|
|
@@ -61,7 +65,7 @@ module CKIP
|
|
61
65
|
def self.segment( text , mode = nil )
|
62
66
|
output = Client.xml2str( Client.get( 'segment' , text ) )
|
63
67
|
if ['compact','neat'].include?( mode )
|
64
|
-
return output.gsub!(/\([A-Za-
|
68
|
+
return output.gsub!(/\([A-Za-z_]+\)/,'')
|
65
69
|
else
|
66
70
|
return output
|
67
71
|
end
|
@@ -69,10 +73,9 @@ module CKIP
|
|
69
73
|
|
70
74
|
def self.parser( text , mode = nil )
|
71
75
|
text.encode!('Big5-UAO') if text.encoding.to_s == 'UTF-8'
|
72
|
-
|
73
76
|
output = Client.xml2str( Client.get( 'parser' , text ) )
|
74
77
|
if ['compact','neat'].include?( mode )
|
75
|
-
return output.gsub(/[A-Za-
|
78
|
+
return output.gsub(/[A-Za-z_]+?:/,'').gsub(/[A-Za-z_]+?\(/,'(').gsub(/[A-Za-z_]+?‧.+?\(/,'(').gsub!(/[#%]/,'').gsub!(/^\d+:\d+.\[\d+\]\s/,'').gsub(/\([A-Z]+?\)$/,'')
|
76
79
|
else
|
77
80
|
return output
|
78
81
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ckip_client
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.6
|
4
|
+
version: 0.0.7
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-05-
|
12
|
+
date: 2013-05-06 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: CKIP_Client是連接中央研究院詞庫小組研發之中文斷詞系統與中文剖析系統的Ruby程式界面。感謝中央研究院詞庫小組多年來之研究成果!
|
15
15
|
email: xxxooo.tw@gmail.com
|