poliqarpr 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.txt +107 -0
- data/Rakefile +23 -0
- data/changelog.txt +37 -0
- data/lib/poliqarpr/client.rb +452 -0
- data/lib/poliqarpr/connector.rb +140 -0
- data/lib/poliqarpr/exceptions.rb +9 -0
- data/lib/poliqarpr/excerpt.rb +63 -0
- data/lib/poliqarpr/lemmata.rb +11 -0
- data/lib/poliqarpr/query_result.rb +73 -0
- data/lib/poliqarpr/segment.rb +23 -0
- data/lib/poliqarpr/util.rb +9 -0
- data/lib/poliqarpr.rb +9 -0
- data/poliqarpr.gemspec +17 -0
- metadata +68 -0
data/README.txt
ADDED
@@ -0,0 +1,107 @@
|
|
1
|
+
= poliqarpr
|
2
|
+
|
3
|
+
* http://github.com/apohllo/poliqarpr
|
4
|
+
|
5
|
+
== DESCRIPTION:
|
6
|
+
|
7
|
+
Poliqarpr is a Ruby client for the Poliqarp server.
|
8
|
+
|
9
|
+
|
10
|
+
== FEATURES/PROBLEMS:
|
11
|
+
|
12
|
+
* built-in pagination of query results
|
13
|
+
* support for lemmatization
|
14
|
+
* asynchronous communication might not be stable (100% CPU usage)
|
15
|
+
* only partial implementation of server protocol:
|
16
|
+
** PING, VERSION
|
17
|
+
** MAKE-SESSION, CLOSE-SESSION
|
18
|
+
** OPEN (corpus), CLOSE (corpus)
|
19
|
+
** CORPUS-STATS, GET-TAGSET
|
20
|
+
** STATUS, CANCEL (used internally)
|
21
|
+
** MAKE-QUERY, RUN-QUERY, BUFFER-STATE (used internally)
|
22
|
+
** GET-RESULTS, GET-CONTEXT, METADATA
|
23
|
+
** SET: lemmata, tags
|
24
|
+
|
25
|
+
== SYNOPSIS:
|
26
|
+
|
27
|
+
Poliqarpr is a Ruby client for the Poliqarp corpus server (see
|
28
|
+
http://poliqarp.sourceforge.net/), which is used to store large texts used in
|
29
|
+
Natural Language Processing.
|
30
|
+
|
31
|
+
|
32
|
+
== REQUIREMENTS:
|
33
|
+
|
34
|
+
Poliqarp server (only C implementation http://poliqarp.sourceforge.net/)
|
35
|
+
|
36
|
+
== INSTALL:
|
37
|
+
|
38
|
+
You need RubyGems v. 1.2
|
39
|
+
|
40
|
+
* gem -v
|
41
|
+
* 1.2.0 #=> ok
|
42
|
+
|
43
|
+
You need the gemcutter.org repository to be added to your sources list:
|
44
|
+
|
45
|
+
* gem sources -a http://gemcutter.org
|
46
|
+
|
47
|
+
Then you can type:
|
48
|
+
|
49
|
+
* sudo gem install poliqarpr
|
50
|
+
|
51
|
+
You can install the optional default corpus (warning: it is distributed under
|
52
|
+
different license!):
|
53
|
+
|
54
|
+
* sudo gem install poliqarpr-corpus
|
55
|
+
|
56
|
+
== BASIC USAGE:
|
57
|
+
|
58
|
+
(You need the poliqarpr-corpus to be installed for this to work. See the last
|
59
|
+
step of the installation process).
|
60
|
+
|
61
|
+
Require the gem:
|
62
|
+
|
63
|
+
require 'poliqarpr'
|
64
|
+
|
65
|
+
Create the server client and open default corpus
|
66
|
+
|
67
|
+
client = Poliqarp::Client.new
|
68
|
+
client.open_corpus :default
|
69
|
+
|
70
|
+
Query the corpus for given segment
|
71
|
+
|
72
|
+
result = client.find("kot")
|
73
|
+
result[0].to_s
|
74
|
+
|
75
|
+
Remember to close the client on exit
|
76
|
+
|
77
|
+
client.close
|
78
|
+
|
79
|
+
|
80
|
+
== LICENSE:
|
81
|
+
|
82
|
+
(The MIT License)
|
83
|
+
|
84
|
+
Copyright (c) 2008-2009 Aleksander Pohl
|
85
|
+
|
86
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
87
|
+
a copy of this software and associated documentation files (the
|
88
|
+
'Software'), to deal in the Software without restriction, including
|
89
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
90
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
91
|
+
permit persons to whom the Software is furnished to do so, subject to
|
92
|
+
the following conditions:
|
93
|
+
|
94
|
+
The above copyright notice and this permission notice shall be
|
95
|
+
included in all copies or substantial portions of the Software.
|
96
|
+
|
97
|
+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
98
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
99
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
100
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
101
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
102
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
103
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
104
|
+
|
105
|
+
== FEEDBACK
|
106
|
+
|
107
|
+
* mailto:apohllo@o2.pl
|
data/Rakefile
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# Rake tasks for building, installing and removing the poliqarpr gem.
# Running plain `rake` builds the gem and installs it.
task :default => [:install]

# Base name of the gem; used for the gemspec file and the built packages.
$gem_name = "poliqarpr"

desc "Build the gem"
task :build do
  sh "gem build #$gem_name.gemspec"
end

desc "Install the library on the local machine"
task :install => :build do
  sh "sudo gem install #$gem_name"
end

desc "Uninstall the library from the local machine"
task :uninstall do
  sh "sudo gem uninstall #$gem_name"
end

desc "Clean"
task :clean do
  # -f: do not fail when no gem package has been built yet
  sh "rm -f #$gem_name*.gem"
end
|
data/changelog.txt
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
0.0.5
|
2
|
+
- Bugfix: making parallel query might lead to silent thread death
|
3
|
+
- Support for Ruby 1.9 encoding
|
4
|
+
- Require path improvements
|
5
|
+
- Copyright year correction
|
6
|
+
- Documentation now points to gemcutter instead of github
|
7
|
+
|
8
|
+
0.0.4
|
9
|
+
- ping/pong diagnostics
|
10
|
+
- server version
|
11
|
+
- corpus statistics
|
12
|
+
- implementation of asynchronous protocol (not stable)
|
13
|
+
|
14
|
+
|
15
|
+
0.0.3
|
16
|
+
- the license of the corpus included
|
17
|
+
- client rdoc documentation
|
18
|
+
- support for lemmata retrieval
|
19
|
+
- excerpt now contains segments instead of strings
|
20
|
+
- buffer size setter
|
21
|
+
- default corpus moved to separate plugin (sudo gem install apohllo-poliqarpr-corpus)
|
22
|
+
|
23
|
+
0.0.2
|
24
|
+
- query result is full blown class
|
25
|
+
- source divided into client, excerpt and query result
|
26
|
+
- specs for client, excerpt and query result
|
27
|
+
- namespace changed to POLIQARP
|
28
|
+
- default corpus included
|
29
|
+
- singular results properly fetched
|
30
|
+
- valid result for queries containing many words
|
31
|
+
- same queries which are run in sequence are called only once
|
32
|
+
- README.txt included in gem
|
33
|
+
- specs included in gem
|
34
|
+
|
35
|
+
0.0.1
|
36
|
+
- initial implementation
|
37
|
+
- synchronous querying for terms
|
@@ -0,0 +1,452 @@
|
|
1
|
+
# vim:encoding=utf-8
|
2
|
+
module Poliqarp
  # Author:: Aleksander Pohl (mailto:apohllo@o2.pl)
  # License:: MIT License
  #
  # This class is the implementation of the Poliqarp server client.
  class Client
    # Segment groups for which retrieval of tags and lemmata can be toggled.
    # The flags in the "SET retrieve-*" commands are written in this order.
    GROUPS = [:left_context, :left_match, :right_match, :right_context]

    # If debug is turned on, the communication between server and client
    # is logged to standard output.
    attr_writer :debug

    # The size of the buffer is the maximum number of excerpts which
    # are returned for single query.
    attr_writer :buffer_size

    # Creates new poliqarp server client.
    #
    # Parameters:
    # * +session_name+ the name of the client session. Defaults to "RUBY".
    # * +debug+ if set to true, all messages sent and received from server
    #   are printed to standard output. Defaults to false.
    def initialize(session_name="RUBY", debug=false)
      @session_name = session_name
      @left_context = 5
      @right_context = 5
      @debug = debug
      @buffer_size = 500000
      @connector = Connector.new(debug)
      @answer_queue = Queue.new
      new_session
    end

    # A hint about installation of default corpus gem
    def self.const_missing(const)
      if const.to_s =~ /DEFAULT_CORPUS/
        raise "You need to install 'poliqarpr-corpus' to use the default corpus"
      end
      super
    end

    # Creates new session for the client with the name given in constructor.
    # If the session was already opened, it is closed.
    #
    # Parameters:
    # * +port+ - the port on which the poliqarpd server is accepting connections (defaults to 4567)
    def new_session(port=4567)
      close if @session
      @connector.open("localhost",port)
      talk("MAKE-SESSION #{@session_name}")
      talk("BUFFER-RESIZE #{@buffer_size}")
      @session = true
      self.tags = {}
      self.lemmata = {}
    end

    # Closes the opened session.
    def close
      talk "CLOSE-SESSION"
      @session = false
    end

    # Closes the opened corpus.
    def close_corpus
      talk "CLOSE"
    end

    # Sets the size of the left short context. It must be > 0
    #
    # The size of the left short context is the number
    # of segments displayed in the found excerpts left to the
    # matched segment(s).
    #
    # Raises RuntimeError if +value+ is not a positive integer.
    def left_context=(value)
      update_context_width("left-context-width", :@left_context, value)
    end

    # Sets the size of the right short context. It must be > 0
    #
    # The size of the right short context is the number
    # of segments displayed in the found excerpts right to the
    # matched segment(s).
    #
    # Raises RuntimeError if +value+ is not a positive integer.
    def right_context=(value)
      update_context_width("right-context-width", :@right_context, value)
    end

    # Sets the tags' flags. There are four groups of segments
    # which the flags apply for:
    # * +left_context+
    # * +left_match+
    # * +right_match+
    # * +right_context+
    #
    # If the flag for given group is set to true, all segments
    # in the group are annotated with grammatical tags. E.g.:
    #  c.find("kot")
    #  ...
    #  "kot" tags: "subst:sg:nom:m2"
    #
    # You can pass :all to turn on flags for all groups
    def tags=(options={})
      options = set_all_flags if options == :all
      @tag_flags = options
      talk("SET retrieve-tags #{flags_for(options)}")
    end

    # Sets the lemmatas' flags. There are four groups of segments
    # which the flags apply for:
    # * +left_context+
    # * +left_match+
    # * +right_match+
    # * +right_context+
    #
    # If the flag for given group is set to true, all segments
    # in the group are returned with the base form of the lemmata. E.g.:
    #  c.find("kotu")
    #  ...
    #  "kotu" base_form: "kot"
    #
    # You can pass :all to turn on flags for all groups
    def lemmata=(options={})
      options = set_all_flags if options == :all
      @lemmata_flags = options
      talk("SET retrieve-lemmata #{flags_for(options)}")
    end

    # *Asynchronous* Opens the corpus given as +path+. To open the default
    # corpus pass +:default+ as the argument.
    #
    # If you don't want to wait until the call is finished, you
    # have to provide +handler+ for the asynchronous answer.
    def open_corpus(path, &handler)
      if path == :default
        open_corpus(DEFAULT_CORPUS, &handler)
      else
        real_handler = handler || lambda{|msg| @answer_queue.push msg }
        talk("OPEN #{path}", :async, &real_handler)
        do_wait if handler.nil?
      end
    end

    # Server diagnostics -- the result should be :pong
    def ping
      :pong if talk("PING") =~ /PONG/
    end

    # Returns server version
    def version
      talk("VERSION")
    end

    # Returns corpus statistics:
    # * +:segment_tokens+ the number of segments in the corpus
    #   (two segments which look exactly the same are counted separately)
    # * +:segment_types+ the number of segment types in the corpus
    #   (two segments which look exactly the same are counted as one type)
    # * +:lemmata+ the number of lemmata (lexemes) types
    #   (all forms of inflected word, e.g. 'kot', 'kotu', ...
    #   are treated as one "word" -- lemmata)
    # * +:tags+ the number of different grammar tags (each combination
    #   of atomic tags is treated as different "tag")
    def stats
      # The first word of the answer is skipped; the counters follow it.
      values = talk("CORPUS-STATS").split
      stats = {}
      [:segment_tokens, :segment_types, :lemmata, :tags].each_with_index do |key, position|
        value = values[position + 1]
        stats[key] = value.to_i unless value.nil?
      end
      stats
    end

    # TODO
    def metadata_types
      raise "Not implemented"
    end

    # Returns the tag-set used in the corpus.
    # It is divided into two groups:
    # * +:categories+ enlists tags belonging to grammatical categories
    #   (each category has a list of its tags, eg. gender: m1 m2 m3 f n,
    #   means that there are 5 genders: masculine(1,2,3), feminine and neuter)
    # * +:classes+ enlists grammatical tags used to describe it
    #   (each class has a list of tags used to describe it, eg. adj: degree
    #   gender case number, means that adjectives are described in terms
    #   of degree, gender, case and number)
    def tagset
      answer = talk("GET-TAGSET")
      # Words 1 and 2 of the answer are the numbers of entries to read
      # for categories and classes respectively.
      counters = answer.split
      result = {}
      [:categories, :classes].each_with_index do |type, type_index|
        result[type] = {}
        counters[type_index+1].to_i.times do
          values = read_word.split
          result[type][values[0].to_sym] = values[1..-1].map{|v| v.to_sym}
        end
      end
      result
    end

    # Send the query to the opened corpus.
    #
    # Options:
    # * +index+ the index of the (only one) result to be returned. The index is relative
    #   to the beginning of the query result. In normal case you should query the
    #   corpus without specifying the index, to see what results are returned.
    #   Then you can use the index and the same query to retrieve one result.
    #   The pair (query, index) is a kind of unique identifier of the excerpt.
    # * +page_size+ the size of the page of results. If the page size is 0, then
    #   all results are returned on one page. It is ignored if the +index+ option
    #   is present. Defaults to 0.
    # * +page_index+ the index of the page of results (the first page has index 1, not 0).
    #   It is ignored if the +index+ option is present. Defaults to 1.
    def find(query,options={})
      if options[:index]
        find_one(query, options[:index])
      else
        find_many(query, options)
      end
    end

    alias query find

    # Returns the number of results for given query.
    def count(query)
      count_results(make_query(query))
    end

    # Returns the long context of the excerpt which is identified by
    # given (query, index) pair. The result is an array of the four
    # consecutive parts sent by the server.
    def context(query,index)
      make_query(query)
      talk "GET-CONTEXT #{index}"
      Array.new(4) { read_word }
    end

    # Returns the metadata of the excerpt which is identified by
    # given (query, index) pair.
    #
    # The result is a hash which maps metadata types (symbols) to arrays
    # of values. Every character of the type name which is not an ASCII
    # letter is dropped before it is converted to a symbol.
    def metadata(query, index)
      make_query(query)
      result = {}
      answer = talk("METADATA #{index}")
      count = answer.split(" ")[1].to_i
      count.times do
        type = read_word.gsub(/[^a-zA-Z]/,"").to_sym
        # the first two characters of the value message are skipped
        value = read_word[2..-1]
        unless value.nil?
          (result[type] ||= []) << value
        end
      end
      result
    end

    protected
    # Sends a message directly to the server
    # * +msg+ the message to send
    # * +mode+ if set to :sync, the method blocks until the message
    #   is received. If :async the method returns immediately.
    #   Default: :sync
    # * +handler+ the handler of the asynchronous message.
    #   It is ignored when the mode is set to :sync.
    def talk(msg, mode = :sync, &handler)
      puts msg if @debug
      @connector.send(msg, mode, &handler)
    end

    # Make query and retrieve many results.
    # * +query+ the query to be sent to the server.
    # * +options+ see find
    def find_many(query, options)
      page_size = (options[:page_size] || 0)
      page_index = (options[:page_index] || 1)

      answer_offset = page_size * (page_index - 1)
      if page_size > 0
        result_count = make_async_query(query,answer_offset)
        # the last page might be shorter than the page size
        answers_limit = answer_offset + page_size > result_count ?
          result_count - answer_offset : page_size
      else
        # all answers needed -- the call must be synchronous
        result_count = count_results(make_query(query))
        answers_limit = result_count
      end

      page_count = page_size <= 0 ? 1 :
        result_count / page_size + (result_count % page_size > 0 ? 1 : 0)

      result = QueryResult.new(page_index, page_count,page_size,self,query)
      if answers_limit > 0
        talk("GET-RESULTS #{answer_offset} #{answer_offset + answers_limit - 1}")
        answers_limit.times do |answer_index|
          result << fetch_result(answer_offset + answer_index, query)
        end
      end
      result
    end

    # Make query and retrieve only one result
    # * +query+ the query to be sent to the server
    # * +index+ the index of the answer to be retrieved
    def find_one(query,index)
      make_async_query(query,index)
      talk("GET-RESULTS #{index} #{index}")
      fetch_result(index,query)
    end

    # Fetches one result of the query
    #
    # MAKE-QUERY and GET-RESULTS must be sent to the server before
    # this method is called
    def fetch_result(index, query)
      result = Excerpt.new(index, self, query)
      result << read_segments(:left_context)
      result << read_segments(:left_match)
      # XXX
      #result << read_segments(:right_match)
      result << read_segments(:right_context)
      result
    end

    # Reads one group of segments from the server. The group is preceded
    # by the number of its segments. If lemmata or tags retrieval is
    # turned on for the +group+, each segment is followed by the number
    # of its interpretations and the interpretations themselves.
    def read_segments(group)
      size = read_number()
      segments = []
      size.times do
        segment = Segment.new(read_word)
        segments << segment
        if @lemmata_flags[group] || @tag_flags[group]
          lemmata_size = read_number()
          lemmata_size.times do
            lemmata = Lemmata.new()
            if @lemmata_flags[group]
              lemmata.base_form = read_word
            end
            if @tag_flags[group]
              # XXX the tags message is consumed but its content is not
              # stored in the lemmata
              read_word
            end
            segment.lemmata << lemmata
          end
        end
      end
      segments
    end

    # Reads number stored in the message received from the server.
    def read_number
      @connector.read_message.match(/\d+/)[0].to_i
    end

    # Counts number of results for given answer
    # (the second word of the answer, e.g. 12 for "OK 12").
    def count_results(answer)
      answer.split(" ")[1].to_i
    end

    # *Asynchronous* Sends the query to the server
    # * +query+ query to send
    # * +handler+ if given, the method returns immediately,
    #   and the answer is sent to the handler. In this case
    #   the result returned by make_query should be IGNORED!
    #
    # If the +query+ is the same as the previous one, nothing is sent
    # to the server and the previous result is returned.
    def make_query(query, &handler)
      if @last_query != query
        @last_query = query
        real_handler = handler || lambda { |msg| @answer_queue.push msg }
        begin
          talk("MAKE-QUERY #{query}")
        rescue JobInProgress
          # cancel the running job and try once again
          talk("CANCEL") rescue nil
          talk("MAKE-QUERY #{query}")
        end
        talk("RUN-QUERY #{@buffer_size}", :async, &real_handler)
        @last_result = do_wait if handler.nil?
      end
      @last_result
    end

    # Reads string stored in the last message received from server
    def read_word
      @connector.read_message
    end

    private
    # Polls the server with STATUS until the call fails, then returns
    # the answer delivered to the answer queue by the default handler.
    def do_wait
      loop {
        status = talk("STATUS") rescue break
        puts "STATUS: #{status}" if @debug
        sleep 0.3
      }
      @answer_queue.shift
    end

    # Returns the flags hash with all groups turned on.
    def set_all_flags
      options = {}
      GROUPS.each{|g| options[g] = true}
      options
    end

    # Builds the flags string for a SET retrieve-* command: one
    # character ("1" or "0") per group, in the order of GROUPS.
    def flags_for(options)
      GROUPS.map { |group| options[group] ? "1" : "0" }.join
    end

    # Validates +value+, sends it to the server as the session +option+
    # and stores it in the instance variable +ivar+ if the server
    # accepted it.
    def update_context_width(option, ivar, value)
      unless correct_context_value?(value)
        raise "Invalid argument: #{value}. It must be fixnum greater than 0."
      end
      answer = talk("SET #{option} #{value}")
      # The connector strips the leading "R " of the answer, so the
      # confirmation is accepted with and without it.
      instance_variable_set(ivar, value) if answer =~ /\A(?:R )?OK/
    end

    # Integer is used instead of Fixnum, which is deprecated since
    # Ruby 2.4 and removed in Ruby 3.2.
    def correct_context_value?(value)
      value.is_a?(Integer) && value > 0
    end

    # Runs the query asynchronously and asks for the buffer state until
    # at least +answer_offset+ results are available or the STATUS call
    # fails. Returns the number of results found so far.
    def make_async_query(query,answer_offset)
      # the handler is empty, since we access the result count through
      # BUFFER-STATE call
      make_query(query){|msg| }
      result_count = 0
      loop do
        # the result count might be not exact!
        result_count = talk("BUFFER-STATE").split(" ")[2].to_i
        begin
          talk("STATUS")
        rescue StandardError
          break
        end
        break unless result_count < answer_offset
      end
      @last_result = "OK #{result_count}"
      result_count
    end
  end
end
|
@@ -0,0 +1,140 @@
|
|
1
|
+
# vim:encoding=utf-8
|
2
|
+
require 'socket'
|
3
|
+
require 'thread'
|
4
|
+
require File.join(File.dirname(__FILE__),'util')
|
5
|
+
|
6
|
+
module Poliqarp
  # Author:: Aleksander Pohl (mailto:apohllo@o2.pl)
  # License:: MIT License
  #
  # This class holds the TCP connection to the server and is responsible
  # for dispatching synchronous and asynchronous queries and answers.
  #
  # A background thread reads the lines sent by the server. Lines which
  # start with "M" are passed to the handler of the asynchronous call,
  # all the other lines are put into the queue of synchronous answers.
  class Connector
    include Ruby19

    # Error messages assigned to error codes
    # (taken from poliqarpd implementation)
    # NOTE(review): the message for code 4 looks truncated -- confirm
    # against the poliqarpd sources.
    ERRORS = {
      1 => "Incorrect number of arguments",
      3 => "No session opened",
      4 => "Cannot create a session for a connection that",
      5 => "Not enough memory",
      6 => "Invalid session ID",
      7 => "Session with this ID is already bound",
      8 => "Session user ID does not match the argument",
      10 => "Session already has an open corpus",
      12 => "System error while opening the corpus",
      13 => "No corpus opened",
      14 => "Invalid job ID",
      15 => "A job is already in progress",
      16 => "Incorrect query",
      17 => "Invalid result range",
      18 => "Incorrect session option",
      19 => "Invalid session option value",
      20 => "Invalid sorting criteria"
    }

    UTF8 = "utf-8"

    # Creates new connector
    # * +debug+ if true, the sent and received messages are printed
    #   to standard output
    def initialize(debug)
      @message_queue = Queue.new
      @socket_mutex = Mutex.new
      @loop_mutex = Mutex.new
      @debug = debug
    end

    # Opens connection with poliqarp server which runs
    # on given +host+ and +port+.
    #
    # The connection and the receiving thread are created only once --
    # subsequent calls reuse them.
    def open(host,port)
      @socket_mutex.synchronize {
        @socket = TCPSocket.new(host,port) if @socket.nil?
      }
      # The check and the update of the flag are done in one critical
      # section, so the receiving thread can't be started twice.
      @loop_mutex.synchronize {
        main_loop unless @loop_running
        @loop_running = true
      }
    end

    # Sends message to the poliqarp server. Returns the first synchronous
    # answer of the server.
    # * +message+ the message to send
    # * +mode+ synchronous (+:sync+) or asynchronous (+:async+)
    # * +handler+ the handler of the asynchronous message
    def send(message, mode, &handler)
      puts "send #{mode} #{message}" if @debug
      # ruby19? comes from the Ruby19 mixin (defined outside of this file)
      message = message.encode(UTF8) if ruby19?
      # The handler is registered before the message is written, so the
      # asynchronous answer can't arrive before its handler is known.
      @handler = handler if mode == :async
      @socket.puts(message)
      read_message
    end

    # Retrieves one message from the server.
    # If the message indicates an error, an exception is raised:
    # * JobInProgress for the error code 15
    # * RuntimeError containing the error description for other codes
    def read_message
      message = @message_queue.shift
      return message unless message =~ /^ERR/

      code = message[/\d+/].to_i
      raise JobInProgress.new() if code == 15
      # not every code has a description -- don't fail with TypeError
      description = ERRORS.fetch(code) { "Unknown error (code #{code})" }
      raise RuntimeError.new("Poliqarp Error: " + description)
    end

    private
    # Starts the thread which receives the messages from the server.
    # The thread stops when the server closes the connection.
    def main_loop
      @loop = Thread.new {
        loop {
          break unless receive
          # XXX ??? needed
          #sleep 0.001
        }
      }
    end

    # Receives one line from the server and dispatches it.
    # Returns false if the connection was closed, true otherwise.
    def receive
      result = read_line
      # nil means that nothing more will be received
      return false if result.nil?
      result.force_encoding(UTF8) if ruby19?
      # skip the two-character prefix and the trailing new line
      msg = result[2..-2]
      if result =~ /^M/
        receive_async(msg)
      else
        receive_sync(msg)
      end
      true
    end

    # Puts the synchronous answer into the queue read by +read_message+.
    def receive_sync(message)
      puts "receive sync: #{message}" if @debug
      @message_queue << message
    end

    # Calls the handler of the asynchronous answer in a separate thread.
    def receive_async(message)
      puts "receive async: #{message}" if @debug
      # The handler is captured now, so a later call of +send+ can't
      # replace it before the thread is run.
      handler = @handler
      return if handler.nil?
      Thread.new{
        handler.call(message)
      }
    end

    # Reads one line (including the trailing new line) from the socket.
    # Returns nil if the connection was closed by the server.
    def read_line
      @socket.gets("\n")
    end
  end
end
|
@@ -0,0 +1,9 @@
|
|
1
|
+
module Poliqarp
  # Author:: Aleksander Pohl (mailto:apohllo@o2.pl)
  # License:: MIT License

  # Raised when an asynchronous call to the server is made while
  # a previous asynchronous call has not finished yet.
  #
  # The class inherits directly from Exception, so it is caught neither
  # by a bare +rescue+ nor by the +rescue+ modifier -- it has to be
  # rescued explicitly.
  class JobInProgress < Exception
  end
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
module Poliqarp
  # Author:: Aleksander Pohl
  # License:: MIT License
  #
  # The excerpt class is used to store single result of the query,
  # i.e. the excerpt of the corpus which contains the words which
  # the corpus was queried for.
  #
  # The excerpt is divided into groups, which contain segments,
  # which the texts in the corpus were divided for.
  # The first group is the left context, the second -- the matched
  # query, and the last -- the right context.
  class Excerpt
    attr_reader :index, :base_form, :short_context

    # Creates new excerpt.
    # * +index+ the index of the excerpt in the results of the query
    # * +client+ the client used to fetch the long context and the metadata
    # * +base_form+ the query the excerpt was found for
    def initialize(index, client, base_form)
      @index = index
      @client = client
      @base_form = base_form
      @short_context = []
    end

    # Adds segment group to the excerpt
    def <<(value)
      @short_context << value
    end


    # Returns the matched query as string
    def word
      # The group is joined explicitly, since Array#to_s returns the
      # inspected form of the array on Ruby 1.9 and newer.
      Array(@short_context[1]).join("")
    end

    alias inflected_form word

    # The string representation of the excerpt is the short
    # context of the query.
    def to_s
      @short_context.join("")
    end

    # Returns the long context of the query.
    # It is fetched from the server only once.
    def context
      @context = @client.context(@base_form, @index) if @context.nil?
      @context
    end

    # Readers of the metadata of the excerpt. Each of them returns the
    # values stored under the given keyword, or nil if there are none.
    { :medium => :medium, :style => :styl, :date => :data_wydania,
      :city => :miejsce_wydania, :publisher => :wydawca, :title => :tytu,
      :author => :autor}.each do |method, keyword|
      define_method method do
        values = self.metadata
        # Client#metadata drops every character which is not an ASCII
        # letter from the keys, so the key without "_" is checked as well.
        values[keyword] || values[keyword.to_s.delete("_").to_sym]
      end
    end

    protected
    # Returns the metadata of the excerpt.
    # It is fetched from the server only once.
    def metadata
      @metadata = @client.metadata(@base_form, @index) if @metadata.nil?
      @metadata
    end
  end
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
module Poliqarp
  # Author:: Aleksander Pohl (mailto:apohllo@o2.pl)
  # License:: MIT License
  #
  # The query result class is used to paginate results of the
  # query. Each query result has information about its context
  # (the next and previous page).
  class QueryResult
    include Enumerable

    attr_accessor :page, :page_count, :query, :page_size

    # Creates new (empty) query result.
    # * +page+ the index of the page (the first page has index 1)
    # * +page_count+ the number of all pages
    # * +page_size+ the size of the page
    # * +client+ the client used to fetch the next and previous pages
    # * +query+ the query which the result was returned for
    def initialize(page, page_count, page_size, client, query)
      @page = page
      @page_count = page_count
      @page_size = page_size
      @client = client
      @query = query
      @excerpts = []
    end

    # Adds excerpt to the query result
    def <<(excerpt)
      @excerpts << excerpt
    end

    # Allows to iterate over the results stored in the result.
    # Returns an enumerator if no block is given.
    def each
      return enum_for(:each) unless block_given?
      @excerpts.each{|e| yield e}
    end

    # The methods below are delegated to the stored excerpts.
    [:first, :last, :empty?].each do |method|
      define_method method do
        @excerpts.send(method)
      end
    end

    # Returns excerpt with given index.
    def [](index)
      @excerpts[index]
    end

    # Two query results are equal iff their page number, page count,
    # query and page size are equal.
    def ==(other)
      return false unless other.is_a? QueryResult
      @page == other.page && @page_count == other.page_count &&
        @query == other.query && @page_size == other.page_size
    end

    # Returns the previous page of the query result
    # (nil if this is the first page).
    def previous_page
      if @page > 1
        @client.find(@query, :page_size => @page_size,
                     :page_index => @page - 1)
      end
    end

    # Return the next page of the query result
    # (nil if this is the last page).
    def next_page
      if @page < @page_count
        @client.find(@query, :page_size => @page_size,
                     :page_index => @page + 1)
      end
    end

    # Returns the number of excerpts stored in this page (query result)
    def size
      @excerpts.size
    end

  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Poliqarp
  # Author:: Aleksander Pohl (mailto:apohllo@o2.pl)
  # License:: MIT LICENSE
  #
  # The segment is the smallest meaningful part of the text.
  # It may contain many lemmata, since the segments are sometimes
  # not disambiguated.
  class Segment
    attr_reader :literal, :lemmata

    # Creates new segment. The specified argument is the literal
    # (as found in the text) representation of the segment.
    # The list of lemmata is initially empty.
    def initialize(literal)
      @literal = literal
      @lemmata = []
    end

    # Returns the segment literal
    # (always as a string, even if the literal is nil).
    def to_s
      @literal.to_s
    end
  end
end
|
data/lib/poliqarpr.rb
ADDED
data/poliqarpr.gemspec
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# Specification of the poliqarpr gem.
Gem::Specification.new do |s|
  s.name = "poliqarpr"
  s.version = "0.0.5"
  s.date = "2009-12-10"
  s.summary = "Ruby client for Poliqarp"
  s.email = "apohllo@o2.pl"
  s.homepage = "http://www.github.com/apohllo/poliqarpr"
  s.description = "Ruby client for Poliqarp (NLP corpus server)"
  s.authors = ['Aleksander Pohl']
  # the license is stated in README.txt
  s.licenses = ["MIT"]
  s.files = ["Rakefile", "poliqarpr.gemspec",
    "changelog.txt", "README.txt" ] + Dir.glob("lib/**/*")
  s.test_files = Dir.glob("test/**/*")
  s.rdoc_options = ["--main", "README.txt"]
  # has_rdoc= is deprecated in RubyGems, so it is only called where
  # it is still available
  s.has_rdoc = true if s.respond_to?(:has_rdoc=)
  s.extra_rdoc_files = ["README.txt"]
end
|
17
|
+
|
metadata
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: poliqarpr
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.5
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Aleksander Pohl
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-12-10 00:00:00 +01:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description: Ruby client for Poliqarp (NLP corpus server)
|
17
|
+
email: apohllo@o2.pl
|
18
|
+
executables: []
|
19
|
+
|
20
|
+
extensions: []
|
21
|
+
|
22
|
+
extra_rdoc_files:
|
23
|
+
- README.txt
|
24
|
+
files:
|
25
|
+
- Rakefile
|
26
|
+
- poliqarpr.gemspec
|
27
|
+
- changelog.txt
|
28
|
+
- README.txt
|
29
|
+
- lib/poliqarpr.rb
|
30
|
+
- lib/poliqarpr/exceptions.rb
|
31
|
+
- lib/poliqarpr/lemmata.rb
|
32
|
+
- lib/poliqarpr/query_result.rb
|
33
|
+
- lib/poliqarpr/excerpt.rb
|
34
|
+
- lib/poliqarpr/segment.rb
|
35
|
+
- lib/poliqarpr/client.rb
|
36
|
+
- lib/poliqarpr/util.rb
|
37
|
+
- lib/poliqarpr/connector.rb
|
38
|
+
has_rdoc: true
|
39
|
+
homepage: http://www.github.com/apohllo/poliqarpr
|
40
|
+
licenses: []
|
41
|
+
|
42
|
+
post_install_message:
|
43
|
+
rdoc_options:
|
44
|
+
- --main
|
45
|
+
- README.txt
|
46
|
+
require_paths:
|
47
|
+
- lib
|
48
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
49
|
+
requirements:
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: "0"
|
53
|
+
version:
|
54
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
55
|
+
requirements:
|
56
|
+
- - ">="
|
57
|
+
- !ruby/object:Gem::Version
|
58
|
+
version: "0"
|
59
|
+
version:
|
60
|
+
requirements: []
|
61
|
+
|
62
|
+
rubyforge_project:
|
63
|
+
rubygems_version: 1.3.5
|
64
|
+
signing_key:
|
65
|
+
specification_version: 3
|
66
|
+
summary: Ruby client for Poliqarp
|
67
|
+
test_files: []
|
68
|
+
|