poliqarpr 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/README.txt +107 -0
- data/Rakefile +23 -0
- data/changelog.txt +37 -0
- data/lib/poliqarpr/client.rb +452 -0
- data/lib/poliqarpr/connector.rb +140 -0
- data/lib/poliqarpr/exceptions.rb +9 -0
- data/lib/poliqarpr/excerpt.rb +63 -0
- data/lib/poliqarpr/lemmata.rb +11 -0
- data/lib/poliqarpr/query_result.rb +73 -0
- data/lib/poliqarpr/segment.rb +23 -0
- data/lib/poliqarpr/util.rb +9 -0
- data/lib/poliqarpr.rb +9 -0
- data/poliqarpr.gemspec +17 -0
- metadata +68 -0
data/README.txt
ADDED
@@ -0,0 +1,107 @@
|
|
1
|
+
= poliqarpr
|
2
|
+
|
3
|
+
* http://github.com/apohllo/poliqarpr
|
4
|
+
|
5
|
+
== DESCRIPTION:
|
6
|
+
|
7
|
+
Poliqarpr is a Ruby client for the Poliqarp server.
|
8
|
+
|
9
|
+
|
10
|
+
== FEATURES/PROBLEMS:
|
11
|
+
|
12
|
+
* built-in pagination of query results
|
13
|
+
* support for lemmatization
|
14
|
+
* asynchronous communication might not be stable (100% CPU usage)
|
15
|
+
* only partial implementation of server protocol:
|
16
|
+
** PING, VERSION
|
17
|
+
** MAKE-SESSION, CLOSE-SESSION
|
18
|
+
** OPEN (corpus), CLOSE (corpus)
|
19
|
+
** CORPUS-STATS, GET-TAGSET
|
20
|
+
** STATUS, CANCEL (used internally)
|
21
|
+
** MAKE-QUERY, RUN-QUERY, BUFFER-STATE (used internally)
|
22
|
+
** GET-RESULTS, GET-CONTEXT, METADATA
|
23
|
+
** SET: lemmata, tags
|
24
|
+
|
25
|
+
== SYNOPSIS:
|
26
|
+
|
27
|
+
Poliqarpr is a Ruby client for the Poliqarp corpus server (see
|
28
|
+
http://poliqarp.sourceforge.net/), which is used to store large texts used in
|
29
|
+
Natural Language Processing.
|
30
|
+
|
31
|
+
|
32
|
+
== REQUIREMENTS:
|
33
|
+
|
34
|
+
Poliqarp server (only C implementation http://poliqarp.sourceforge.net/)
|
35
|
+
|
36
|
+
== INSTALL:
|
37
|
+
|
38
|
+
You need RubyGems v. 1.2
|
39
|
+
|
40
|
+
* gem -v
|
41
|
+
* 1.2.0 #=> ok
|
42
|
+
|
43
|
+
You need the gemcutter.org repository to be added to your sources list:
|
44
|
+
|
45
|
+
* gem sources -a http://gemcutter.org
|
46
|
+
|
47
|
+
Then you can type:
|
48
|
+
|
49
|
+
* sudo gem install poliqarpr
|
50
|
+
|
51
|
+
You can install the optional default corpus (warning: it is distributed under
|
52
|
+
different license!):
|
53
|
+
|
54
|
+
* sudo gem install poliqarpr-corpus
|
55
|
+
|
56
|
+
== BASIC USAGE:
|
57
|
+
|
58
|
+
(You need the poliqarpr-corpus to be installed for this to work. See the last
|
59
|
+
step of installation process).
|
60
|
+
|
61
|
+
Require the gem:
|
62
|
+
|
63
|
+
require 'poliqarpr'
|
64
|
+
|
65
|
+
Create the server client and open default corpus
|
66
|
+
|
67
|
+
client = Poliqarp::Client.new
|
68
|
+
client.open_corpus :default
|
69
|
+
|
70
|
+
Query the corpus for given segment
|
71
|
+
|
72
|
+
result = client.find("kot")
|
73
|
+
result[0].to_s
|
74
|
+
|
75
|
+
Remember to close the client on exit
|
76
|
+
|
77
|
+
client.close
|
78
|
+
|
79
|
+
|
80
|
+
== LICENSE:
|
81
|
+
|
82
|
+
(The MIT License)
|
83
|
+
|
84
|
+
Copyright (c) 2008-2009 Aleksander Pohl
|
85
|
+
|
86
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
87
|
+
a copy of this software and associated documentation files (the
|
88
|
+
'Software'), to deal in the Software without restriction, including
|
89
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
90
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
91
|
+
permit persons to whom the Software is furnished to do so, subject to
|
92
|
+
the following conditions:
|
93
|
+
|
94
|
+
The above copyright notice and this permission notice shall be
|
95
|
+
included in all copies or substantial portions of the Software.
|
96
|
+
|
97
|
+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
98
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
99
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
100
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
101
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
102
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
103
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
104
|
+
|
105
|
+
== FEEDBACK
|
106
|
+
|
107
|
+
* mailto:apohllo@o2.pl
|
data/Rakefile
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# Rake tasks for building, installing, uninstalling and cleaning up the gem.
# The default task builds the gem and installs it.
task :default => [:install]

# Name of the gem; shared by all the shell commands below.
$gem_name = "poliqarpr"

desc "Build the gem"
task :build do
  sh "gem build #$gem_name.gemspec"
end

desc "Install the library at local machine"
task :install => :build do
  sh "sudo gem install #$gem_name"
end

desc "Uninstall the library from local machine"
task :uninstall do
  sh "sudo gem uninstall #$gem_name"
end

desc "Clean"
task :clean do
  # -f keeps the task from failing when no gem file has been built yet.
  sh "rm -f #$gem_name*.gem"
end
|
data/changelog.txt
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
0.0.5
|
2
|
+
- Bugfix: making parallel query might lead to silent thread death
|
3
|
+
- Support for Ruby 1.9 encoding
|
4
|
+
- Require path improvements
|
5
|
+
- Copyright year correction
|
6
|
+
- Documentation now points to gemcutter instead of github
|
7
|
+
|
8
|
+
0.0.4
|
9
|
+
- ping/pong diagnostics
|
10
|
+
- server version
|
11
|
+
- corpus statistics
|
12
|
+
- implementation of asynchronous protocol (not stable)
|
13
|
+
|
14
|
+
|
15
|
+
0.0.3
|
16
|
+
- the license of the corpus included
|
17
|
+
- client rdoc documentation
|
18
|
+
- support for lemmata retrieval
|
19
|
+
- excerpt now contains segments instead of strings
|
20
|
+
- buffer size setter
|
21
|
+
- default corpus moved to separate plugin (sudo gem install apohllo-poliqarpr-corpus)
|
22
|
+
|
23
|
+
0.0.2
|
24
|
+
- query result is full blown class
|
25
|
+
- source divided into client, excerpt and query result
|
26
|
+
- specs for client, excerpt and query result
|
27
|
+
- namespace changed to POLIQARP
|
28
|
+
- default corpus included
|
29
|
+
- singular results properly fetched
|
30
|
+
- valid result for queries containing many words
|
31
|
+
- same queries which are run in sequence are called only once
|
32
|
+
- README.txt included in gem
|
33
|
+
- specs included in gem
|
34
|
+
|
35
|
+
0.0.1
|
36
|
+
- initial implementation
|
37
|
+
- synchronous querying for terms
|
@@ -0,0 +1,452 @@
|
|
1
|
+
# vim:encoding=utf-8
|
2
|
+
module Poliqarp
|
3
|
+
# Author:: Aleksander Pohl (mailto:apohllo@o2.pl)
|
4
|
+
# License:: MIT License
|
5
|
+
#
|
6
|
+
# This class is the implementation of the Poliqarp server client.
|
7
|
+
class Client
|
8
|
+
GROUPS = [:left_context, :left_match, :right_match, :right_context]
|
9
|
+
|
10
|
+
# If debug is turned on, the communication between server and client
|
11
|
+
# is logged to standard output.
|
12
|
+
attr_writer :debug
|
13
|
+
|
14
|
+
# The size of the buffer is the maximum number of excerpts which
|
15
|
+
# are returned for single query.
|
16
|
+
attr_writer :buffer_size
|
17
|
+
|
18
|
+
# Creates new poliqarp server client.
|
19
|
+
#
|
20
|
+
# Parameters:
|
21
|
+
# * +session_name+ the name of the client session. Defaults to "RUBY".
|
22
|
+
# * +debug+ if set to true, all messages sent and received from server
|
23
|
+
# are printed to standard output. Defaults to false.
|
24
|
+
def initialize(session_name="RUBY", debug=false)
|
25
|
+
@session_name = session_name
|
26
|
+
@left_context = 5
|
27
|
+
@right_context = 5
|
28
|
+
@debug = debug
|
29
|
+
@buffer_size = 500000
|
30
|
+
@connector = Connector.new(debug)
|
31
|
+
@answer_queue = Queue.new
|
32
|
+
new_session
|
33
|
+
end
|
34
|
+
|
35
|
+
# A hint about installation of default corpus gem
|
36
|
+
def self.const_missing(const)
  # Reached when a constant is referenced but not defined. For
  # DEFAULT_CORPUS, tell the user which gem to install; the name matches
  # the one given in the installation instructions.
  if const.to_s =~ /DEFAULT_CORPUS/
    raise "You need to install 'poliqarpr-corpus' to use the default corpus"
  end
  # Any other missing constant gets the standard treatment.
  super
end
|
42
|
+
|
43
|
+
# Creates new session for the client with the name given in constructor.
|
44
|
+
# If the session was already opened, it is closed.
|
45
|
+
#
|
46
|
+
# Parameters:
|
47
|
+
# * +port+ - the port on which the poliqarpd server is accepting connections (defaults to 4567)
|
48
|
+
def new_session(port=4567, host="localhost")
  # * +host+ - the host the connector is asked to open (defaults to
  #   "localhost", the value that was previously hard-coded)
  close if @session
  @connector.open(host, port)
  talk("MAKE-SESSION #{@session_name}")
  talk("BUFFER-RESIZE #{@buffer_size}")
  @session = true
  # A fresh session starts with tag and lemmata retrieval switched off
  # for every segment group.
  self.tags = {}
  self.lemmata = {}
end
|
57
|
+
|
58
|
+
# Closes the opened session.
|
59
|
+
def close
|
60
|
+
talk "CLOSE-SESSION"
|
61
|
+
@session = false
|
62
|
+
end
|
63
|
+
|
64
|
+
# Closes the opened corpus.
|
65
|
+
def close_corpus
|
66
|
+
talk "CLOSE"
|
67
|
+
end
|
68
|
+
|
69
|
+
# Sets the size of the left short context. It must be > 0
|
70
|
+
#
|
71
|
+
# The size of the left short context is the number
|
72
|
+
# of segments displayed in the found excerpts left to the
|
73
|
+
# matched segment(s).
|
74
|
+
def left_context=(value)
|
75
|
+
if correct_context_value?(value)
|
76
|
+
result = talk("SET left-context-width #{value}")
|
77
|
+
@left_context = value if result =~ /^R OK/
|
78
|
+
else
|
79
|
+
raise "Invalid argument: #{value}. It must be fixnum greater than 0."
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
# Sets the size of the right short context. It must be > 0
|
84
|
+
#
|
85
|
+
# The size of the right short context is the number
|
86
|
+
# of segments displayed in the found excerpts right to the
|
87
|
+
# matched segment(s).
|
88
|
+
def right_context=(value)
|
89
|
+
if correct_context_value?(value)
|
90
|
+
result = talk("SET right-context-width #{value}")
|
91
|
+
@right_context = value if result =~ /^R OK/
|
92
|
+
else
|
93
|
+
raise "Invalid argument: #{value}. It must be fixnum greater than 0."
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
# Sets the tags' flags. There are four groups of segments
|
98
|
+
# which the flags apply for:
|
99
|
+
# * +left_context+
|
100
|
+
# * +left_match+
|
101
|
+
# * +right_match+
|
102
|
+
# * +right_context+
|
103
|
+
#
|
104
|
+
# If the flag for given group is set to true, all segments
|
105
|
+
# in the group are annotated with grammatical tags. E.g.:
|
106
|
+
# c.find("kot")
|
107
|
+
# ...
|
108
|
+
# "kot" tags: "subst:sg:nom:m2"
|
109
|
+
#
|
110
|
+
# You can pass :all to turn on flags for all groups
|
111
|
+
def tags=(options={})
|
112
|
+
options = set_all_flags if options == :all
|
113
|
+
@tag_flags = options
|
114
|
+
flags = ""
|
115
|
+
GROUPS.each do |flag|
|
116
|
+
flags << (options[flag] ? "1" : "0")
|
117
|
+
end
|
118
|
+
talk("SET retrieve-tags #{flags}")
|
119
|
+
end
|
120
|
+
|
121
|
+
# Sets the lemmatas' flags. There are four groups of segments
|
122
|
+
# which the flags apply for:
|
123
|
+
# * +left_context+
|
124
|
+
# * +left_match+
|
125
|
+
# * +right_match+
|
126
|
+
# * +right_context+
|
127
|
+
#
|
128
|
+
# If the flag for given group is set to true, all segments
|
129
|
+
# in the group are returned with the base form of the lemmata. E.g.:
|
130
|
+
# c.find("kotu")
|
131
|
+
# ...
|
132
|
+
# "kotu" base_form: "kot"
|
133
|
+
#
|
134
|
+
# You can pass :all to turn on flags for all groups
|
135
|
+
def lemmata=(options={})
|
136
|
+
options = set_all_flags if options == :all
|
137
|
+
@lemmata_flags = options
|
138
|
+
flags = ""
|
139
|
+
GROUPS.each do |flag|
|
140
|
+
flags << (options[flag] ? "1" : "0")
|
141
|
+
end
|
142
|
+
talk("SET retrieve-lemmata #{flags}")
|
143
|
+
end
|
144
|
+
|
145
|
+
# *Asynchronous* Opens the corpus given as +path+. To open the default
|
146
|
+
# corpus pass +:default+ as the argument.
|
147
|
+
#
|
148
|
+
# If you don't want to wait until the call is finished, you
|
149
|
+
# have to provide +handler+ for the asynchronous answer.
|
150
|
+
def open_corpus(path, &handler)
|
151
|
+
if path == :default
|
152
|
+
open_corpus(DEFAULT_CORPUS, &handler)
|
153
|
+
else
|
154
|
+
real_handler = handler || lambda{|msg| @answer_queue.push msg }
|
155
|
+
talk("OPEN #{path}", :async, &real_handler)
|
156
|
+
do_wait if handler.nil?
|
157
|
+
end
|
158
|
+
end
|
159
|
+
|
160
|
+
# Server diagnostics -- the result should be :pong
|
161
|
+
def ping
|
162
|
+
:pong if talk("PING") =~ /PONG/
|
163
|
+
end
|
164
|
+
|
165
|
+
# Returns server version
|
166
|
+
def version
|
167
|
+
talk("VERSION")
|
168
|
+
end
|
169
|
+
|
170
|
+
# Returns corpus statistics:
|
171
|
+
# * +:segment_tokens+ the number of segments in the corpus
|
172
|
+
# (two segments which look exactly the same are counted separately)
|
173
|
+
# * +:segment_types+ the number of segment types in the corpus
|
174
|
+
# (two segments which look exactly the same are counted as one type)
|
175
|
+
# * +:lemmata+ the number of lemmata (lexemes) types
|
176
|
+
# (all forms of inflected word, e.g. 'kot', 'kotu', ...
|
177
|
+
# are treated as one "word" -- lemmata)
|
178
|
+
# * +:tags+ the number of different grammar tags (each combination
|
179
|
+
# of atomic tags is treated as different "tag")
|
180
|
+
def stats
|
181
|
+
stats = {}
|
182
|
+
talk("CORPUS-STATS").split.each_with_index do |value, index|
|
183
|
+
case index
|
184
|
+
when 1
|
185
|
+
stats[:segment_tokens] = value.to_i
|
186
|
+
when 2
|
187
|
+
stats[:segment_types] = value.to_i
|
188
|
+
when 3
|
189
|
+
stats[:lemmata] = value.to_i
|
190
|
+
when 4
|
191
|
+
stats[:tags] = value.to_i
|
192
|
+
end
|
193
|
+
end
|
194
|
+
stats
|
195
|
+
end
|
196
|
+
|
197
|
+
# TODO
|
198
|
+
def metadata_types
|
199
|
+
raise "Not implemented"
|
200
|
+
end
|
201
|
+
|
202
|
+
# Returns the tag-set used in the corpus.
|
203
|
+
# It is divided into two groups:
|
204
|
+
# * +:categories+ enlists tags belonging to grammatical categories
|
205
|
+
# (each category has a list of its tags, eg. gender: m1 m2 m3 f n,
|
206
|
+
# means that there are 5 genders: masculine(1,2,3), feminine and neuter)
|
207
|
+
# * +:classes+ enlists grammatical tags used to describe it
|
208
|
+
# (each class has a list of tags used to describe it, eg. adj: degree
|
209
|
+
# gender case number, means that adjectives are described in terms
|
210
|
+
# of degree, gender, case and number)
|
211
|
+
def tagset
|
212
|
+
answer = talk("GET-TAGSET")
|
213
|
+
counters = answer.split
|
214
|
+
result = {}
|
215
|
+
[:categories, :classes].each_with_index do |type, type_index|
|
216
|
+
result[type] = {}
|
217
|
+
counters[type_index+1].to_i.times do |index|
|
218
|
+
values = read_word.split
|
219
|
+
result[type][values[0].to_sym] = values[1..-1].map{|v| v.to_sym}
|
220
|
+
end
|
221
|
+
end
|
222
|
+
result
|
223
|
+
end
|
224
|
+
|
225
|
+
# Send the query to the opened corpus.
|
226
|
+
#
|
227
|
+
# Options:
|
228
|
+
# * +index+ the index of the (only one) result to be returned. The index is relative
|
229
|
+
# to the beginning of the query result. In normal case you should query the
|
230
|
+
# corpus without specifying the index, to see what results are returned.
|
231
|
+
# Then you can use the index and the same query to retrieve one result.
|
232
|
+
# The pair (query, index) is a kind of unique identifier of the excerpt.
|
233
|
+
# * +page_size+ the size of the page of results. If the page size is 0, then
|
234
|
+
# all results are returned on one page. It is ignored if the +index+ option
|
235
|
+
# is present. Defaults to 0.
|
236
|
+
# * +page_index+ the index of the page of results (the first page has index 1, not 0).
|
237
|
+
# It is ignored if the +index+ option is present. Defaults to 1.
|
238
|
+
def find(query,options={})
|
239
|
+
if options[:index]
|
240
|
+
find_one(query, options[:index])
|
241
|
+
else
|
242
|
+
find_many(query, options)
|
243
|
+
end
|
244
|
+
end
|
245
|
+
|
246
|
+
alias query find
|
247
|
+
|
248
|
+
# Returns the number of results for given query.
|
249
|
+
def count(query)
|
250
|
+
count_results(make_query(query))
|
251
|
+
end
|
252
|
+
|
253
|
+
# Returns the long context of the excerpt which is identified by
|
254
|
+
# given (query, index) pair.
|
255
|
+
def context(query,index)
|
256
|
+
make_query(query)
|
257
|
+
result = []
|
258
|
+
talk "GET-CONTEXT #{index}"
|
259
|
+
# 1st part
|
260
|
+
result << read_word
|
261
|
+
# 2nd part
|
262
|
+
result << read_word
|
263
|
+
# 3rd part
|
264
|
+
result << read_word
|
265
|
+
# 4th part
|
266
|
+
result << read_word
|
267
|
+
result
|
268
|
+
end
|
269
|
+
|
270
|
+
# Returns the metadata of the excerpt which is identified by
|
271
|
+
# given (query, index) pair.
|
272
|
+
def metadata(query, index)
  make_query(query)
  result = {}
  answer = talk("METADATA #{index}")
  # The second token of the answer is the number of (type, value) pairs;
  # each pair is then read as two separate messages.
  count = answer.split(" ")[1].to_i
  # No block parameter: the old one shadowed the +index+ argument (and
  # would overwrite it on Ruby 1.8) without being used.
  count.times do
    # Everything except ASCII letters is removed from the key.
    type = read_word.gsub(/[^a-zA-Z]/, "").to_sym
    # The first two characters of the value message are skipped.
    value = read_word[2..-1]
    next if value.nil?
    (result[type] ||= []) << value
  end
  result
end
|
287
|
+
|
288
|
+
protected
|
289
|
+
# Sends a message directly to the server
|
290
|
+
# * +msg+ the message to send
|
291
|
+
# * +mode+ if set to :sync, the method blocks until the message
|
292
|
+
# is received. If :async the method returns immediately.
|
293
|
+
# Default: :sync
|
294
|
+
# * +handler+ the handler of the asynchronous message.
|
295
|
+
# It is ignored when the mode is set to :sync.
|
296
|
+
def talk(msg, mode = :sync, &handler)
|
297
|
+
puts msg if @debug
|
298
|
+
@connector.send(msg, mode, &handler)
|
299
|
+
end
|
300
|
+
|
301
|
+
# Make query and retrieve many results.
|
302
|
+
# * +query+ the query to be sent to the server.
|
303
|
+
# * +options+ see find
|
304
|
+
def find_many(query, options)
|
305
|
+
page_size = (options[:page_size] || 0)
|
306
|
+
page_index = (options[:page_index] || 1)
|
307
|
+
|
308
|
+
answer_offset = page_size * (page_index - 1)
|
309
|
+
if page_size > 0
|
310
|
+
result_count = make_async_query(query,answer_offset)
|
311
|
+
answers_limit = answer_offset + page_size > result_count ?
|
312
|
+
result_count - answer_offset : page_size
|
313
|
+
else
|
314
|
+
# all answers needed -- the call must be synchronous
|
315
|
+
result_count = count_results(make_query(query))
|
316
|
+
answers_limit = result_count
|
317
|
+
end
|
318
|
+
|
319
|
+
page_count = page_size <= 0 ? 1 :
|
320
|
+
result_count / page_size + (result_count % page_size > 0 ? 1 : 0)
|
321
|
+
|
322
|
+
result = QueryResult.new(page_index, page_count,page_size,self,query)
|
323
|
+
if answers_limit > 0
|
324
|
+
talk("GET-RESULTS #{answer_offset} #{answer_offset + answers_limit - 1}")
|
325
|
+
answers_limit.times do |answer_index|
|
326
|
+
result << fetch_result(answer_offset + answer_index, query)
|
327
|
+
end
|
328
|
+
end
|
329
|
+
result
|
330
|
+
end
|
331
|
+
|
332
|
+
# Make query and retrieve only one result
|
333
|
+
# * +query+ the query to be sent to the server
|
334
|
+
# * +index+ the index of the answer to be retrieved
|
335
|
+
def find_one(query,index)
|
336
|
+
make_async_query(query,index)
|
337
|
+
talk("GET-RESULTS #{index} #{index}")
|
338
|
+
fetch_result(index,query)
|
339
|
+
end
|
340
|
+
|
341
|
+
# Fetches one result of the query
|
342
|
+
#
|
343
|
+
# MAKE-QUERY and GET-RESULTS must be sent to the server before
|
344
|
+
# this method is called
|
345
|
+
def fetch_result(index, query)
|
346
|
+
result = Excerpt.new(index, self, query)
|
347
|
+
result << read_segments(:left_context)
|
348
|
+
result << read_segments(:left_match)
|
349
|
+
# XXX
|
350
|
+
#result << read_segments(:right_match)
|
351
|
+
result << read_segments(:right_context)
|
352
|
+
result
|
353
|
+
end
|
354
|
+
|
355
|
+
def read_segments(group)
|
356
|
+
size = read_number()
|
357
|
+
segments = []
|
358
|
+
size.times do |segment_index|
|
359
|
+
segment = Segment.new(read_word)
|
360
|
+
segments << segment
|
361
|
+
if @lemmata_flags[group] || @tag_flags[group]
|
362
|
+
lemmata_size = read_number()
|
363
|
+
lemmata_size.times do |lemmata_index|
|
364
|
+
lemmata = Lemmata.new()
|
365
|
+
if @lemmata_flags[group]
|
366
|
+
lemmata.base_form = read_word
|
367
|
+
end
|
368
|
+
if @tag_flags[group]
|
369
|
+
read_word
|
370
|
+
end
|
371
|
+
segment.lemmata << lemmata
|
372
|
+
end
|
373
|
+
end
|
374
|
+
end
|
375
|
+
segments
|
376
|
+
end
|
377
|
+
|
378
|
+
# Reads number stored in the message received from the server.
|
379
|
+
def read_number
|
380
|
+
@connector.read_message.match(/\d+/)[0].to_i
|
381
|
+
end
|
382
|
+
|
383
|
+
# Counts number of results for given answer
|
384
|
+
def count_results(answer)
|
385
|
+
answer.split(" ")[1].to_i
|
386
|
+
end
|
387
|
+
|
388
|
+
# *Asynchronous* Sends the query to the server
|
389
|
+
# * +query+ query to send
|
390
|
+
# * +handler+ if given, the method returns immediately,
|
391
|
+
# and the answer is sent to the handler. In this case
|
392
|
+
# the result returned by make_query should be IGNORED!
|
393
|
+
def make_query(query, &handler)
  if @last_query != query
    # Forget the cached query first: if anything below raises, the next
    # call must talk to the server again instead of returning a stale
    # @last_result for a query that was never accepted.
    @last_query = nil
    real_handler = handler || lambda { |msg| @answer_queue.push msg }
    begin
      talk("MAKE-QUERY #{query}")
    rescue JobInProgress
      # Another job is still running: cancel it (a failure of the
      # cancellation itself is ignored) and submit the query again.
      talk("CANCEL") rescue nil
      talk("MAKE-QUERY #{query}")
    end
    talk("RUN-QUERY #{@buffer_size}", :async, &real_handler)
    # The server accepted the query, so it is safe to cache it now.
    @last_query = query
    # Without a caller-supplied handler the call is synchronous: wait
    # for the answer pushed to the queue by the default handler.
    @last_result = do_wait if handler.nil?
  end
  @last_result
end
|
412
|
+
|
413
|
+
# Reads string stored in the last message received from server
|
414
|
+
def read_word
|
415
|
+
@connector.read_message
|
416
|
+
end
|
417
|
+
|
418
|
+
private
|
419
|
+
def do_wait
|
420
|
+
loop {
|
421
|
+
status = talk("STATUS") rescue break
|
422
|
+
puts "STATUS: #{status}" if @debug
|
423
|
+
sleep 0.3
|
424
|
+
}
|
425
|
+
@answer_queue.shift
|
426
|
+
end
|
427
|
+
|
428
|
+
def set_all_flags
|
429
|
+
options = {}
|
430
|
+
GROUPS.each{|g| options[g] = true}
|
431
|
+
options
|
432
|
+
end
|
433
|
+
|
434
|
+
# Tells whether +value+ is acceptable as a context width,
# i.e. an integer greater than 0.
def correct_context_value?(value)
  # Integer is used instead of Fixnum: Fixnum was deprecated in Ruby 2.4
  # and removed in Ruby 3.2, where referencing it raises NameError.
  value.is_a?(Integer) && value > 0
end
|
437
|
+
|
438
|
+
def make_async_query(query,answer_offset)
|
439
|
+
# the handler is empty, since we access the result count through
|
440
|
+
# BUFFER-STATE call
|
441
|
+
make_query(query){|msg| }
|
442
|
+
result_count = 0
|
443
|
+
begin
|
444
|
+
# the result count might be not exact!
|
445
|
+
result_count = talk("BUFFER-STATE").split(" ")[2].to_i
|
446
|
+
talk("STATUS") rescue break
|
447
|
+
end while result_count < answer_offset
|
448
|
+
@last_result = "OK #{result_count}"
|
449
|
+
result_count
|
450
|
+
end
|
451
|
+
end
|
452
|
+
end
|
@@ -0,0 +1,140 @@
|
|
1
|
+
# vim:encoding=utf-8
|
2
|
+
require 'socket'
|
3
|
+
require 'thread'
|
4
|
+
require File.join(File.dirname(__FILE__),'util')
|
5
|
+
|
6
|
+
module Poliqarp
|
7
|
+
# Author:: Aleksander Pohl (mailto:apohllo@o2.pl)
|
8
|
+
# License:: MIT License
|
9
|
+
#
|
10
|
+
# This class holds the TCP connection to the server and is responsible
|
11
|
+
# for dispatching synchronous and asynchronous queries and answers.
|
12
|
+
class Connector
|
13
|
+
include Ruby19
|
14
|
+
|
15
|
+
# Error messages assigned to error codes
|
16
|
+
# (taken from poliqarpd implementation)
|
17
|
+
ERRORS = {
|
18
|
+
1 => "Incorrect number of arguments",
|
19
|
+
3 => "No session opened",
|
20
|
+
4 => "Cannot create a session for a connection that",
|
21
|
+
5 => "Not enough memory",
|
22
|
+
6 => "Invalid session ID",
|
23
|
+
7 => "Session with this ID is already bound",
|
24
|
+
8 => "Session user ID does not match the argument",
|
25
|
+
10 => "Session already has an open corpus",
|
26
|
+
12 => "System error while opening the corpus",
|
27
|
+
13 => "No corpus opened",
|
28
|
+
14 => "Invalid job ID",
|
29
|
+
15 => "A job is already in progress",
|
30
|
+
16 => "Incorrect query",
|
31
|
+
17 => "Invalid result range",
|
32
|
+
18 => "Incorrect session option",
|
33
|
+
19 => "Invalid session option value",
|
34
|
+
20 => "Invalid sorting criteria"
|
35
|
+
}
|
36
|
+
|
37
|
+
UTF8 = "utf-8"
|
38
|
+
|
39
|
+
# Creates new connector
|
40
|
+
def initialize(debug)
|
41
|
+
@message_queue = Queue.new
|
42
|
+
@socket_mutex = Mutex.new
|
43
|
+
@loop_mutex = Mutex.new
|
44
|
+
@debug = debug
|
45
|
+
end
|
46
|
+
|
47
|
+
# Opens connection with poliqarp server which runs
|
48
|
+
# on given +host+ and +port+.
|
49
|
+
def open(host, port)
  # The socket is created only once; later calls reuse it.
  @socket_mutex.synchronize do
    @socket = TCPSocket.new(host, port) if @socket.nil?
  end
  # The running flag is checked and set within one critical section, so
  # two concurrent callers cannot both start a reader thread.
  @loop_mutex.synchronize do
    main_loop unless @loop_running
    @loop_running = true
  end
end
|
62
|
+
|
63
|
+
# Sends message to the poliqarp server. Returns the first synchronous
|
64
|
+
# answer of the server.
|
65
|
+
# * +message+ the message to send
|
66
|
+
# * +mode+ synchronous (+:sync+) or asynchronous (+:async+)
|
67
|
+
# * +handler+ the handler of the asynchronous message
|
68
|
+
def send(message, mode, &handler)
  puts "send #{mode} #{message}" if @debug
  # Assign the transcoded string back to +message+; the previous code
  # stored it in a misspelled local, so the untranscoded original was
  # written to the socket.
  message = message.encode(UTF8) if ruby19?
  # Register the handler before the request is written, so an
  # asynchronous answer cannot arrive while no handler is set.
  @handler = handler if mode == :async
  @socket.puts(message)
  read_message
end
|
79
|
+
|
80
|
+
# Retrieves one message from the server.
|
81
|
+
# If the message indicates an error, a new runtime error
|
82
|
+
# containing the error description is raised.
|
83
|
+
def read_message
  message = @message_queue.shift
  return message unless message =~ /^ERR/
  # The first run of digits in the error message is the error code.
  code = message[/\d+/].to_i
  # Code 15 gets its own exception class so callers can rescue it
  # separately from other errors.
  raise JobInProgress.new() if code == 15
  # Codes missing from ERRORS are still reported as a Poliqarp error
  # instead of failing with a TypeError on String + nil.
  description = ERRORS.fetch(code) { "Unknown error (code #{code})" }
  raise RuntimeError.new("Poliqarp Error: " + description)
end
|
93
|
+
|
94
|
+
private
|
95
|
+
def main_loop
|
96
|
+
@loop = Thread.new {
|
97
|
+
loop {
|
98
|
+
receive
|
99
|
+
# XXX ??? needed
|
100
|
+
#sleep 0.001
|
101
|
+
}
|
102
|
+
}
|
103
|
+
end
|
104
|
+
|
105
|
+
def receive
|
106
|
+
result = read_line
|
107
|
+
if ruby19?
|
108
|
+
result.force_encoding(UTF8)
|
109
|
+
end
|
110
|
+
msg = result[2..-2]
|
111
|
+
if result =~ /^M/
|
112
|
+
receive_async(msg)
|
113
|
+
elsif result
|
114
|
+
receive_sync(msg)
|
115
|
+
end
|
116
|
+
# if nil, nothing was received
|
117
|
+
end
|
118
|
+
|
119
|
+
def receive_sync(message)
|
120
|
+
puts "receive sync: #{message}" if @debug
|
121
|
+
@message_queue << message
|
122
|
+
end
|
123
|
+
|
124
|
+
# Passes an asynchronous +message+ to the registered handler, which is
# called in a new thread. Returns that thread.
def receive_async(message)
  puts "receive async: #{message}" if @debug
  # Take a snapshot of the handler: @handler may be replaced by a later
  # send before the new thread gets to run.
  handler = @handler
  Thread.new {
    handler.call(message)
  }
end
|
130
|
+
|
131
|
+
# Reads one line (including the trailing "\n") from the socket.
# Raises EOFError when the connection has been closed by the peer.
def read_line
  # gets returns nil at end of stream. The previous byte-wise recvfrom
  # loop never saw a "\n" after the peer closed the connection and
  # therefore never terminated.
  line = @socket.gets("\n")
  raise EOFError, "Poliqarp server closed the connection" if line.nil?
  line
end
|
139
|
+
end
|
140
|
+
end
|
@@ -0,0 +1,9 @@
|
|
1
|
+
module Poliqarp
|
2
|
+
# Author:: Aleksander Pohl (mailto:apohllo@o2.pl)
|
3
|
+
# License:: MIT License
|
4
|
+
|
5
|
+
# The JobInProgress exception is raised if there was asynchronous call
|
6
|
+
# to the server which hasn't finished, which is interrupted by another
|
7
|
+
# asynchronous call.
|
8
|
+
class JobInProgress < Exception; end
|
9
|
+
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
module Poliqarp
|
2
|
+
# Author:: Aleksander Pohl
|
3
|
+
# License:: MIT License
|
4
|
+
#
|
5
|
+
# The excerpt class is used to store single result of the query,
|
6
|
+
# i.e. the excerpt of the corpus which contains the words which
|
7
|
+
# the corpus was queried for.
|
8
|
+
#
|
9
|
+
# The excerpt is divided into groups, which contain segments,
|
10
|
+
# which the texts in the corpus were divided for.
|
11
|
+
# The first group is the left context, the second -- the matched
|
12
|
+
# query, and the last -- the right context.
|
13
|
+
class Excerpt
|
14
|
+
attr_reader :index, :base_form, :short_context
|
15
|
+
|
16
|
+
def initialize(index, client, base_form)
|
17
|
+
@index = index
|
18
|
+
@client = client
|
19
|
+
@base_form = base_form
|
20
|
+
@short_context = []
|
21
|
+
end
|
22
|
+
|
23
|
+
# Adds segment group to the excerpt
|
24
|
+
def <<(value)
|
25
|
+
@short_context << value
|
26
|
+
end
|
27
|
+
|
28
|
+
|
29
|
+
# Returns the matched query as string
|
30
|
+
def word
  # The second group of the short context holds the matched segments.
  # join concatenates their to_s values on every Ruby version, whereas
  # Array#to_s returns the inspect form on Ruby >= 1.9. Array() keeps
  # the empty-string result when the group is missing.
  Array(@short_context[1]).join("")
end
|
34
|
+
|
35
|
+
alias inflected_form word
|
36
|
+
|
37
|
+
# The string representation of the excerpt is the short
|
38
|
+
# context of the query.
|
39
|
+
def to_s
|
40
|
+
@short_context.join("")
|
41
|
+
end
|
42
|
+
|
43
|
+
# Returns the long context of the query.
|
44
|
+
def context
|
45
|
+
return @context unless @context.nil?
|
46
|
+
@context = @client.context(@base_form, @index)
|
47
|
+
end
|
48
|
+
|
49
|
+
{ :medium => :medium, :style => :styl, :date => :data_wydania,
|
50
|
+
:city => :miejsce_wydania, :publisher => :wydawca, :title => :tytu,
|
51
|
+
:author => :autor}.each do |method, keyword|
|
52
|
+
define_method method do
|
53
|
+
self.metadata[keyword]
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
protected
|
58
|
+
def metadata
|
59
|
+
return @metadata unless @metadata.nil?
|
60
|
+
@metadata = @client.metadata(@base_form, @index)
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
module Poliqarp
|
2
|
+
# Author:: Aleksander Pohl (mailto:apohllo@o2.pl)
|
3
|
+
# License:: MIT License
|
4
|
+
#
|
5
|
+
# The query result class is used to paginate results of the
|
6
|
+
# query. Each query result has information about its context
|
7
|
+
# (the next and previous page).
|
8
|
+
class QueryResult
|
9
|
+
include Enumerable
|
10
|
+
|
11
|
+
attr_accessor :page, :page_count, :query, :page_size
|
12
|
+
|
13
|
+
def initialize(page, page_count, page_size, client, query)
|
14
|
+
@page = page
|
15
|
+
@page_count = page_count
|
16
|
+
@page_size = page_size
|
17
|
+
@client = client
|
18
|
+
@query = query
|
19
|
+
@excerpts = []
|
20
|
+
end
|
21
|
+
|
22
|
+
# Adds excerpt to the query result
|
23
|
+
def <<(excerpt)
|
24
|
+
@excerpts << excerpt
|
25
|
+
end
|
26
|
+
|
27
|
+
# Allows to iterate over the results stored in the result
|
28
|
+
def each
|
29
|
+
@excerpts.each{|e| yield e}
|
30
|
+
end
|
31
|
+
|
32
|
+
[:first, :last, :empty?].each do |method|
|
33
|
+
define_method method do
|
34
|
+
@excerpts.send(method)
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
# Returns excerpt with given index.
|
39
|
+
def [](index)
|
40
|
+
@excerpts[index]
|
41
|
+
end
|
42
|
+
|
43
|
+
# Two query results are equal iff their page number, page count,
|
44
|
+
# query and page size are equal.
|
45
|
+
def ==(other)
|
46
|
+
return false unless other.is_a? QueryResult
|
47
|
+
@page == other.page && @page_count == other.page_count &&
|
48
|
+
@query == other.query && @page_size == other.page_size
|
49
|
+
end
|
50
|
+
|
51
|
+
# Returns the previous page of the query result
|
52
|
+
def previous_page
|
53
|
+
if @page > 1
|
54
|
+
@client.find(@query, :page_size => @page_size,
|
55
|
+
:page_index => @page - 1)
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
# Return the next page of the query result
|
60
|
+
def next_page
|
61
|
+
if @page < @page_count
|
62
|
+
@client.find(@query, :page_size => @page_size,
|
63
|
+
:page_index => @page + 1)
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
# Returns the number of excerpts stored in this page (query result)
|
68
|
+
def size
|
69
|
+
@excerpts.size
|
70
|
+
end
|
71
|
+
|
72
|
+
end
|
73
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Poliqarp
|
2
|
+
# Author:: Aleksander Pohl (mailto:apohllo@o2.pl)
|
3
|
+
# License:: MIT LICENSE
|
4
|
+
#
|
5
|
+
# The segment is the smallest meaningful part of the text.
|
6
|
+
# It may contain many lemmata, since the segments are sometimes
|
7
|
+
# not disambiguated.
|
8
|
+
class Segment
|
9
|
+
attr_reader :literal, :lemmata
|
10
|
+
|
11
|
+
# Creates new segment. The specified argument is the literal
|
12
|
+
# (as found in the text) representation of the segment.
|
13
|
+
def initialize(literal)
|
14
|
+
@literal = literal
|
15
|
+
@lemmata = []
|
16
|
+
end
|
17
|
+
|
18
|
+
# Returns the segment literal
|
19
|
+
def to_s
|
20
|
+
@literal
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
data/lib/poliqarpr.rb
ADDED
data/poliqarpr.gemspec
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
Gem::Specification.new do |s|
|
2
|
+
s.name = "poliqarpr"
|
3
|
+
s.version = "0.0.5"
|
4
|
+
s.date = "2009-12-10"
|
5
|
+
s.summary = "Ruby client for Poliqarp"
|
6
|
+
s.email = "apohllo@o2.pl"
|
7
|
+
s.homepage = "http://www.github.com/apohllo/poliqarpr"
|
8
|
+
s.description = "Ruby client for Poliqarp (NLP corpus server)"
|
9
|
+
s.authors = ['Aleksander Pohl']
|
10
|
+
s.files = ["Rakefile", "poliqarpr.gemspec",
|
11
|
+
"changelog.txt", "README.txt" ] + Dir.glob("lib/**/*")
|
12
|
+
s.test_files = Dir.glob("test/**/*")
|
13
|
+
s.rdoc_options = ["--main", "README.txt"]
|
14
|
+
s.has_rdoc = true
|
15
|
+
s.extra_rdoc_files = ["README.txt"]
|
16
|
+
end
|
17
|
+
|
metadata
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: poliqarpr
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.5
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Aleksander Pohl
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-12-10 00:00:00 +01:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description: Ruby client for Poliqarp (NLP corpus server)
|
17
|
+
email: apohllo@o2.pl
|
18
|
+
executables: []
|
19
|
+
|
20
|
+
extensions: []
|
21
|
+
|
22
|
+
extra_rdoc_files:
|
23
|
+
- README.txt
|
24
|
+
files:
|
25
|
+
- Rakefile
|
26
|
+
- poliqarpr.gemspec
|
27
|
+
- changelog.txt
|
28
|
+
- README.txt
|
29
|
+
- lib/poliqarpr.rb
|
30
|
+
- lib/poliqarpr/exceptions.rb
|
31
|
+
- lib/poliqarpr/lemmata.rb
|
32
|
+
- lib/poliqarpr/query_result.rb
|
33
|
+
- lib/poliqarpr/excerpt.rb
|
34
|
+
- lib/poliqarpr/segment.rb
|
35
|
+
- lib/poliqarpr/client.rb
|
36
|
+
- lib/poliqarpr/util.rb
|
37
|
+
- lib/poliqarpr/connector.rb
|
38
|
+
has_rdoc: true
|
39
|
+
homepage: http://www.github.com/apohllo/poliqarpr
|
40
|
+
licenses: []
|
41
|
+
|
42
|
+
post_install_message:
|
43
|
+
rdoc_options:
|
44
|
+
- --main
|
45
|
+
- README.txt
|
46
|
+
require_paths:
|
47
|
+
- lib
|
48
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
49
|
+
requirements:
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: "0"
|
53
|
+
version:
|
54
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
55
|
+
requirements:
|
56
|
+
- - ">="
|
57
|
+
- !ruby/object:Gem::Version
|
58
|
+
version: "0"
|
59
|
+
version:
|
60
|
+
requirements: []
|
61
|
+
|
62
|
+
rubyforge_project:
|
63
|
+
rubygems_version: 1.3.5
|
64
|
+
signing_key:
|
65
|
+
specification_version: 3
|
66
|
+
summary: Ruby client for Poliqarp
|
67
|
+
test_files: []
|
68
|
+
|