apohllo-poliqarpr 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/README.txt +30 -0
- data/changelog.txt +8 -0
- data/lib/poliqarpr/client.rb +151 -27
- data/lib/poliqarpr/excerpt.rb +16 -0
- data/lib/poliqarpr/lemmata.rb +11 -0
- data/lib/poliqarpr/query_result.rb +14 -0
- data/lib/poliqarpr/segment.rb +23 -0
- data/lib/poliqarpr.rb +7 -1
- data/poliqarpr.gemspec +4 -39
- data/spec/client.rb +53 -0
- data/spec/excerpt.rb +40 -2
- metadata +4 -37
- data/corpus/frek.cdf +0 -4
- data/corpus/frek.cfg +0 -100
- data/corpus/frek.cfg~ +0 -100
- data/corpus/frek.meta.cfg +0 -1
- data/corpus/frek.meta.lisp +0 -4
- data/corpus/frek.poliqarp.base1.image +0 -0
- data/corpus/frek.poliqarp.base1.offset +0 -0
- data/corpus/frek.poliqarp.base2.image +0 -0
- data/corpus/frek.poliqarp.base2.offset +0 -0
- data/corpus/frek.poliqarp.chunk.image +0 -0
- data/corpus/frek.poliqarp.corpus.image +0 -0
- data/corpus/frek.poliqarp.meta-key.image +0 -0
- data/corpus/frek.poliqarp.meta-key.offset +0 -0
- data/corpus/frek.poliqarp.meta-value.image +0 -0
- data/corpus/frek.poliqarp.meta-value.offset +0 -0
- data/corpus/frek.poliqarp.meta.image +0 -0
- data/corpus/frek.poliqarp.orth.image +0 -0
- data/corpus/frek.poliqarp.orth.index.alpha +0 -0
- data/corpus/frek.poliqarp.orth.index.atergo +0 -0
- data/corpus/frek.poliqarp.orth.offset +0 -0
- data/corpus/frek.poliqarp.rindex.amb +0 -0
- data/corpus/frek.poliqarp.rindex.amb.offset +0 -0
- data/corpus/frek.poliqarp.rindex.disamb +0 -0
- data/corpus/frek.poliqarp.rindex.disamb.offset +0 -0
- data/corpus/frek.poliqarp.rindex.orth +0 -0
- data/corpus/frek.poliqarp.rindex.orth.offset +0 -0
- data/corpus/frek.poliqarp.subchunk.image +0 -0
- data/corpus/frek.poliqarp.subchunk.item.ch +0 -0
- data/corpus/frek.poliqarp.subchunk.offset +0 -0
- data/corpus/frek.poliqarp.subpos1.image +0 -0
- data/corpus/frek.poliqarp.subpos1.offset +0 -0
- data/corpus/frek.poliqarp.subpos2.image +0 -0
- data/corpus/frek.poliqarp.subpos2.offset +0 -0
- data/corpus/frek.poliqarp.tag.image +0 -0
- data/corpus/frek.poliqarp.tag.offset +0 -0
data/README.txt
CHANGED
@@ -9,6 +9,8 @@ Poliqarpr is Ruby client for Poliqarp server.
|
|
9
9
|
|
10
10
|
== FEATURES/PROBLEMS:
|
11
11
|
|
12
|
+
* built-in pagination of query results
|
13
|
+
* support for lemmatization
|
12
14
|
* asynchronous communication is implemented in a synchronous manner
|
13
15
|
* only partial implementation of server protocol
|
14
16
|
|
@@ -38,6 +40,34 @@ Then you can type:
|
|
38
40
|
|
39
41
|
* sudo gem install apohllo-poliqarpr
|
40
42
|
|
43
|
+
You can install the optional default corpus (warning: it is distributed under
|
44
|
+
a different license!):
|
45
|
+
|
46
|
+
* sudo gem install apohllo-poliqarpr-corpus
|
47
|
+
|
48
|
+
== BASIC USAGE:
|
49
|
+
|
50
|
+
(You need the poliqarpr-corpus to be installed for this to work. See the last
|
51
|
+
step of the installation process).
|
52
|
+
|
53
|
+
Require the gem:
|
54
|
+
|
55
|
+
require 'poliqarpr'
|
56
|
+
|
57
|
+
Create the server client and open default corpus
|
58
|
+
|
59
|
+
client = Poliqarp::Client.new
|
60
|
+
client.open_corpus :default
|
61
|
+
|
62
|
+
Query the corpus for given segment
|
63
|
+
|
64
|
+
result = client.find("kot")
|
65
|
+
result[0].to_s
|
66
|
+
|
67
|
+
Remember to close the client on exit
|
68
|
+
|
69
|
+
client.close
|
70
|
+
|
41
71
|
|
42
72
|
== LICENSE:
|
43
73
|
|
data/changelog.txt
CHANGED
@@ -1,3 +1,11 @@
|
|
1
|
+
0.0.3
|
2
|
+
- the license of the corpus included
|
3
|
+
- client rdoc documentation
|
4
|
+
- support for lemmata retrieval
|
5
|
+
- excerpt now contains segments instead of strings
|
6
|
+
- buffer size setter
|
7
|
+
- default corpus moved to separate plugin (sudo gem install apohllo-poliqarpr-corpus)
|
8
|
+
|
1
9
|
0.0.2
|
2
10
|
- query result is full blown class
|
3
11
|
- source divided into client, excerpt and query result
|
data/lib/poliqarpr/client.rb
CHANGED
@@ -1,6 +1,10 @@
|
|
1
|
+
require 'socket'
|
1
2
|
module Poliqarp
|
3
|
+
# Author:: Aleksander Pohl (mailto:apohllo@o2.pl)
|
4
|
+
# License:: MIT License
|
5
|
+
#
|
6
|
+
# This class is the implementation of the Poliqarp server client.
|
2
7
|
class Client
|
3
|
-
DEFAULT_CORPUS = File.join(File.expand_path(File.dirname(__FILE__)),"..", "..", "corpus", "frek")
|
4
8
|
ERRORS = {
|
5
9
|
1 => "Incorrect number of arguments",
|
6
10
|
3 => "No session opened",
|
@@ -20,8 +24,22 @@ module Poliqarp
|
|
20
24
|
19 => "Invalid session option value",
|
21
25
|
20 => "Invalid sorting criteria"
|
22
26
|
}
|
27
|
+
GROUPS = [:left_context, :left_match, :right_match, :right_context]
|
28
|
+
|
29
|
+
# If debug is turned on, the communication between server and client
|
30
|
+
# is logged to standard output.
|
23
31
|
attr_writer :debug
|
24
32
|
|
33
|
+
# The size of the buffer is the maximum number of excerpts which
|
34
|
+
# are returned for single query.
|
35
|
+
attr_writer :buffer_size
|
36
|
+
|
37
|
+
# Creates new poliqarp server client.
|
38
|
+
#
|
39
|
+
# Parameters:
|
40
|
+
# * +session_name+ the name of the client session. Defaults to "RUBY".
|
41
|
+
# * +debug+ if set to true, all messages sent and received from server
|
42
|
+
# are printed to standard output. Defaults to false.
|
25
43
|
def initialize(session_name="RUBY", debug=false)
|
26
44
|
@session_name = session_name
|
27
45
|
@left_context = 5
|
@@ -31,9 +49,22 @@ module Poliqarp
|
|
31
49
|
new_session
|
32
50
|
end
|
33
51
|
|
34
|
-
|
52
|
+
# A hint about uninstalled default corpus gem
|
53
|
+
def self.const_missing(const)
|
54
|
+
if const.to_s =~ /DEFAULT_CORPUS/
|
55
|
+
raise "You need to install 'apohllo-poliqarpr-corpus' to use the default corpus"
|
56
|
+
end
|
57
|
+
super
|
58
|
+
end
|
59
|
+
|
60
|
+
# Creates new session for the client with the name given in constructor.
|
61
|
+
# If the session was already opened, it is closed.
|
62
|
+
#
|
63
|
+
# Parameters:
|
64
|
+
# * +port+ - the port on which the poliqarpd server is accepting connections (defaults to 4567)
|
65
|
+
def new_session(port=4567)
|
35
66
|
close if @session
|
36
|
-
@socket = TCPSocket.new("localhost",
|
67
|
+
@socket = TCPSocket.new("localhost",port)
|
37
68
|
talk "MAKE-SESSION #{@session_name}"
|
38
69
|
rcv_sync
|
39
70
|
talk("BUFFER-RESIZE #{@buffer_size}")
|
@@ -43,11 +74,7 @@ module Poliqarp
|
|
43
74
|
self.lemmata = {}
|
44
75
|
end
|
45
76
|
|
46
|
-
|
47
|
-
puts msg if @debug
|
48
|
-
@socket.puts(msg)
|
49
|
-
end
|
50
|
-
|
77
|
+
# Closes the opened connection to the poliqarpd server.
|
51
78
|
def close
|
52
79
|
#talk "CLOSE"
|
53
80
|
#rcv_sync
|
@@ -57,43 +84,88 @@ module Poliqarp
|
|
57
84
|
@session = false
|
58
85
|
end
|
59
86
|
|
87
|
+
# Sets the size of the left short context. It must be > 0
|
88
|
+
#
|
89
|
+
# The size of the left short context is the number
|
90
|
+
# of segments displayed in the found excerpts to the left of the
|
91
|
+
# matched segment(s).
|
60
92
|
def left_context=(value)
|
61
|
-
if value
|
93
|
+
if correct_context_value?(value)
|
62
94
|
talk "SET left-context-width #{value}"
|
63
95
|
result = rcv_sync
|
64
96
|
@left_context = value if result =~ /^R OK/
|
97
|
+
else
|
98
|
+
raise "Invalid argument: #{value}. It must be fixnum greater than 0."
|
65
99
|
end
|
66
100
|
end
|
67
101
|
|
102
|
+
# Sets the size of the right short context. It must be > 0
|
103
|
+
#
|
104
|
+
# The size of the right short context is the number
|
105
|
+
# of segments displayed in the found excerpts to the right of the
|
106
|
+
# matched segment(s).
|
68
107
|
def right_context=(value)
|
69
|
-
if value
|
108
|
+
if correct_context_value?(value)
|
70
109
|
talk "SET right-context-width #{value}"
|
71
110
|
result = rcv_sync
|
72
111
|
@right_context = value if result =~ /^R OK/
|
112
|
+
else
|
113
|
+
raise "Invalid argument: #{value}. It must be fixnum greater than 0."
|
73
114
|
end
|
74
115
|
end
|
75
116
|
|
117
|
+
# Sets the tags' flags. There are four groups of segments
|
118
|
+
# which the flags apply for:
|
119
|
+
# * +left_context+
|
120
|
+
# * +left_match+
|
121
|
+
# * +right_match+
|
122
|
+
# * +right_context+
|
123
|
+
#
|
124
|
+
# If the flag for given group is set to true, all segments
|
125
|
+
# in the group are annotated with grammatical tags. E.g.:
|
126
|
+
# c.find("kot")
|
127
|
+
# ...
|
128
|
+
# "kot" tags: "subst:sg:nom:m2"
|
129
|
+
#
|
130
|
+
# You can pass :all to turn on flags for all groups
|
76
131
|
def tags=(options={})
|
132
|
+
options = set_all_flags if options == :all
|
133
|
+
@tag_flags = options
|
77
134
|
flags = ""
|
78
|
-
|
79
|
-
:rightM_tags, :right_context_tags].each do |flag|
|
135
|
+
GROUPS.each do |flag|
|
80
136
|
flags << (options[flag] ? "1" : "0")
|
81
137
|
end
|
82
138
|
talk "SET retrieve-tags #{flags}"
|
83
139
|
rcv_sync
|
84
140
|
end
|
85
141
|
|
142
|
+
# Sets the lemmatas' flags. There are four groups of segments
|
143
|
+
# which the flags apply for:
|
144
|
+
# * +left_context+
|
145
|
+
# * +left_match+
|
146
|
+
# * +right_match+
|
147
|
+
# * +right_context+
|
148
|
+
#
|
149
|
+
# If the flag for given group is set to true, all segments
|
150
|
+
# in the group are returned with the base form of the lemmata. E.g.:
|
151
|
+
# c.find("kotu")
|
152
|
+
# ...
|
153
|
+
# "kotu" base_form: "kot"
|
154
|
+
#
|
155
|
+
# You can pass :all to turn on flags for all groups
|
86
156
|
def lemmata=(options={})
|
157
|
+
options = set_all_flags if options == :all
|
158
|
+
@lemmata_flags = options
|
87
159
|
flags = ""
|
88
|
-
|
89
|
-
:rightM_lemmata, :right_context_lemmata].each do |flag|
|
160
|
+
GROUPS.each do |flag|
|
90
161
|
flags << (options[flag] ? "1" : "0")
|
91
162
|
end
|
92
163
|
talk "SET retrieve-lemmata #{flags}"
|
93
164
|
rcv_sync
|
94
165
|
end
|
95
166
|
|
96
|
-
|
167
|
+
# Opens the corpus given as +path+. To open the default
|
168
|
+
# corpus pass +:default+ as the argument.
|
97
169
|
def open_corpus(path)
|
98
170
|
if path == :default
|
99
171
|
open_corpus(DEFAULT_CORPUS)
|
@@ -104,6 +176,19 @@ module Poliqarp
|
|
104
176
|
end
|
105
177
|
end
|
106
178
|
|
179
|
+
# Send the query to the opened corpus.
|
180
|
+
#
|
181
|
+
# Options:
|
182
|
+
# * +index+ the index of the (only one) result to be returned. The index is relative
|
183
|
+
# to the beginning of the query result. In normal case you should query the
|
184
|
+
# corpus without specifying the index, to see what results are returned.
|
185
|
+
# Then you can use the index and the same query to retrieve one result.
|
186
|
+
# The pair (query, index) is a kind of unique identifier of the excerpt.
|
187
|
+
# * +page_size+ the size of the page of results. If the page size is 0, then
|
188
|
+
# all results are returned on one page. It is ignored if the +index+ option
|
189
|
+
# is present. Defaults to 0.
|
190
|
+
# * +page_index+ the index of the page of results (the first page has index 1, not 0).
|
191
|
+
# It is ignored if the +index+ option is present. Defaults to 1.
|
107
192
|
def find(query,options={})
|
108
193
|
if options[:index]
|
109
194
|
find_one(query, options[:index])
|
@@ -114,10 +199,13 @@ module Poliqarp
|
|
114
199
|
|
115
200
|
alias query find
|
116
201
|
|
202
|
+
# Returns the number of results for given query.
|
117
203
|
def count(query)
|
118
204
|
count_results(make_query(query))
|
119
205
|
end
|
120
206
|
|
207
|
+
# Returns the long context of the excerpt which is identified by
|
208
|
+
# given (query, index) pair.
|
121
209
|
def context(query,index)
|
122
210
|
make_query(query)
|
123
211
|
result = []
|
@@ -135,6 +223,8 @@ module Poliqarp
|
|
135
223
|
result
|
136
224
|
end
|
137
225
|
|
226
|
+
# Returns the metadata of the excerpt which is identified by
|
227
|
+
# given (query, index) pair.
|
138
228
|
def metadata(query, index)
|
139
229
|
make_query(query)
|
140
230
|
result = {}
|
@@ -152,6 +242,13 @@ module Poliqarp
|
|
152
242
|
end
|
153
243
|
|
154
244
|
protected
|
245
|
+
# Sends a message directly to the server
|
246
|
+
# * +msg+ the message to send
|
247
|
+
def talk(msg)
|
248
|
+
puts msg if @debug
|
249
|
+
@socket.puts(msg)
|
250
|
+
end
|
251
|
+
|
155
252
|
def find_many(query, options)
|
156
253
|
page_size = (options[:page_size] || 0)
|
157
254
|
page_index = (options[:page_index] || 1)
|
@@ -193,29 +290,45 @@ protected
|
|
193
290
|
end
|
194
291
|
|
195
292
|
# Fetches one result of the query
|
196
|
-
|
293
|
+
##
|
197
294
|
# MAKE-QUERY and GET-RESULTS must be called on server before
|
198
295
|
# this method is called
|
199
296
|
def fetch_result(index, query)
|
200
297
|
result = Excerpt.new(index, self, query)
|
201
|
-
|
202
|
-
result << read_segments
|
203
|
-
#
|
204
|
-
result << read_segments
|
205
|
-
|
206
|
-
result << read_segments
|
298
|
+
result << read_segments(:left_context)
|
299
|
+
result << read_segments(:left_match)
|
300
|
+
# XXX
|
301
|
+
#result << read_segments(:right_match)
|
302
|
+
result << read_segments(:right_context)
|
207
303
|
|
208
304
|
result
|
209
305
|
end
|
210
306
|
|
211
|
-
def read_segments
|
212
|
-
|
213
|
-
size = answer.match(/\d+/)[0].to_i
|
307
|
+
def read_segments(group)
|
308
|
+
size = get_number(rcv_sync)
|
214
309
|
segments = []
|
215
310
|
size.times do |segment_index|
|
216
|
-
|
311
|
+
segment = Segment.new(read_word)
|
312
|
+
segments << segment
|
313
|
+
if @lemmata_flags[group] || @tag_flags[group]
|
314
|
+
lemmata_size = get_number(rcv_sync)
|
315
|
+
lemmata_size.times do |lemmata_index|
|
316
|
+
lemmata = Lemmata.new()
|
317
|
+
if @lemmata_flags[group]
|
318
|
+
lemmata.base_form = read_word
|
319
|
+
end
|
320
|
+
if @tag_flags[group]
|
321
|
+
read_word
|
322
|
+
end
|
323
|
+
segment.lemmata << lemmata
|
324
|
+
end
|
325
|
+
end
|
217
326
|
end
|
218
|
-
segments
|
327
|
+
segments
|
328
|
+
end
|
329
|
+
|
330
|
+
def get_number(str)
|
331
|
+
str.match(/\d+/)[0].to_i
|
219
332
|
end
|
220
333
|
|
221
334
|
def count_results(answer)
|
@@ -268,5 +381,16 @@ protected
|
|
268
381
|
end until line =~ /^M/
|
269
382
|
line
|
270
383
|
end
|
384
|
+
|
385
|
+
private
|
386
|
+
def set_all_flags
|
387
|
+
options = {}
|
388
|
+
GROUPS.each{|g| options[g] = true}
|
389
|
+
options
|
390
|
+
end
|
391
|
+
|
392
|
+
def correct_context_value?(value)
|
393
|
+
value.is_a?(Fixnum) && value > 0
|
394
|
+
end
|
271
395
|
end
|
272
396
|
end
|
data/lib/poliqarpr/excerpt.rb
CHANGED
@@ -1,4 +1,15 @@
|
|
1
1
|
module Poliqarp
|
2
|
+
# Author:: Aleksander Pohl
|
3
|
+
# License:: MIT License
|
4
|
+
#
|
5
|
+
# The excerpt class is used to store single result of the query,
|
6
|
+
# i.e. the excerpt of the corpus which contains the words which
|
7
|
+
# the corpus was queried for.
|
8
|
+
#
|
9
|
+
# The excerpt is divided into groups, which contain segments,
|
10
|
+
# into which the texts in the corpus were divided.
|
11
|
+
# The first group is the left context, the second -- the matched
|
12
|
+
# query, and the last -- the right context.
|
2
13
|
class Excerpt
|
3
14
|
attr_reader :index, :base_form, :short_context
|
4
15
|
|
@@ -9,11 +20,13 @@ module Poliqarp
|
|
9
20
|
@short_context = []
|
10
21
|
end
|
11
22
|
|
23
|
+
# Adds segment group to the excerpt
|
12
24
|
def <<(value)
|
13
25
|
@short_context << value
|
14
26
|
end
|
15
27
|
|
16
28
|
|
29
|
+
# Returns the matched query as string
|
17
30
|
def word
|
18
31
|
#@short_context[0].split(/\s+/)[-1]
|
19
32
|
@short_context[1].to_s
|
@@ -21,10 +34,13 @@ module Poliqarp
|
|
21
34
|
|
22
35
|
alias inflected_form word
|
23
36
|
|
37
|
+
# The string representation of the excerpt is the short
|
38
|
+
# context of the query.
|
24
39
|
def to_s
|
25
40
|
@short_context.join("")
|
26
41
|
end
|
27
42
|
|
43
|
+
# Returns the long context of the query.
|
28
44
|
def context
|
29
45
|
return @context unless @context.nil?
|
30
46
|
@context = @client.context(@base_form, @index)
|
@@ -1,4 +1,10 @@
|
|
1
1
|
module Poliqarp
|
2
|
+
# Author:: Aleksander Pohl (mailto:apohllo@o2.pl)
|
3
|
+
# License:: MIT License
|
4
|
+
#
|
5
|
+
# The query result class is used to paginate results of the
|
6
|
+
# query. Each query result has information about its context
|
7
|
+
# (the next and previous page).
|
2
8
|
class QueryResult
|
3
9
|
include Enumerable
|
4
10
|
|
@@ -13,10 +19,12 @@ module Poliqarp
|
|
13
19
|
@excerpts = []
|
14
20
|
end
|
15
21
|
|
22
|
+
# Adds excerpt to the query result
|
16
23
|
def <<(excerpt)
|
17
24
|
@excerpts << excerpt
|
18
25
|
end
|
19
26
|
|
27
|
+
# Allows iterating over the excerpts stored in the query result
|
20
28
|
def each
|
21
29
|
@excerpts.each{|e| yield e}
|
22
30
|
end
|
@@ -27,16 +35,20 @@ module Poliqarp
|
|
27
35
|
end
|
28
36
|
end
|
29
37
|
|
38
|
+
# Returns excerpt with given index.
|
30
39
|
def [](index)
|
31
40
|
@excerpts[index]
|
32
41
|
end
|
33
42
|
|
43
|
+
# Two query results are equal iff their page number, page count,
|
44
|
+
# query and page size are equal.
|
34
45
|
def ==(other)
|
35
46
|
return false unless other.is_a? QueryResult
|
36
47
|
@page == other.page && @page_count == other.page_count &&
|
37
48
|
@query == other.query && @page_size == other.page_size
|
38
49
|
end
|
39
50
|
|
51
|
+
# Returns the previous page of the query result
|
40
52
|
def previous_page
|
41
53
|
if @page > 1
|
42
54
|
@client.find(@query, :page_size => @page_size,
|
@@ -44,6 +56,7 @@ module Poliqarp
|
|
44
56
|
end
|
45
57
|
end
|
46
58
|
|
59
|
+
# Returns the next page of the query result
|
47
60
|
def next_page
|
48
61
|
if @page < @page_count
|
49
62
|
@client.find(@query, :page_size => @page_size,
|
@@ -51,6 +64,7 @@ module Poliqarp
|
|
51
64
|
end
|
52
65
|
end
|
53
66
|
|
67
|
+
# Returns the number of excerpts stored in this page (query result)
|
54
68
|
def size
|
55
69
|
@excerpts.size
|
56
70
|
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Poliqarp
|
2
|
+
# Author:: Aleksander Pohl (mailto:apohllo@o2.pl)
|
3
|
+
# License:: MIT LICENSE
|
4
|
+
#
|
5
|
+
# The segment is the smallest meaningful part of the text.
|
6
|
+
# It may contain many lemmata, since the segments are sometimes
|
7
|
+
# not disambiguated.
|
8
|
+
class Segment
|
9
|
+
attr_reader :literal, :lemmata
|
10
|
+
|
11
|
+
# Creates new segment. The specified argument is the literal
|
12
|
+
# (as found in the text) representation of the segment.
|
13
|
+
def initialize(literal)
|
14
|
+
@literal = literal
|
15
|
+
@lemmata = []
|
16
|
+
end
|
17
|
+
|
18
|
+
# Returns the segment literal
|
19
|
+
def to_s
|
20
|
+
@literal
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
data/lib/poliqarpr.rb
CHANGED
@@ -2,4 +2,10 @@ path = File.join(File.dirname(__FILE__), 'poliqarpr')
|
|
2
2
|
require File.join(path, 'client')
|
3
3
|
require File.join(path, 'query_result')
|
4
4
|
require File.join(path, 'excerpt')
|
5
|
-
|
5
|
+
require File.join(path, 'segment')
|
6
|
+
require File.join(path, 'lemmata')
|
7
|
+
begin
|
8
|
+
require 'poliqarpr-corpus'
|
9
|
+
rescue LoadError
|
10
|
+
# Do nothing, since the default corpus is optional
|
11
|
+
end
|
data/poliqarpr.gemspec
CHANGED
@@ -1,9 +1,7 @@
|
|
1
|
-
#require 'rake'
|
2
|
-
|
3
1
|
Gem::Specification.new do |s|
|
4
2
|
s.name = "poliqarpr"
|
5
|
-
s.version = "0.0.
|
6
|
-
s.date = "2008-12-
|
3
|
+
s.version = "0.0.3"
|
4
|
+
s.date = "2008-12-20"
|
7
5
|
s.summary = "Ruby client for Poliqarp"
|
8
6
|
s.email = "apohllo@o2.pl"
|
9
7
|
s.homepage = "http://www.apohllo.pl/projekty/poliqarpr"
|
@@ -14,42 +12,9 @@ Gem::Specification.new do |s|
|
|
14
12
|
"lib/poliqarpr/client.rb",
|
15
13
|
"lib/poliqarpr/query_result.rb",
|
16
14
|
"lib/poliqarpr/excerpt.rb",
|
15
|
+
"lib/poliqarpr/lemmata.rb",
|
16
|
+
"lib/poliqarpr/segment.rb",
|
17
17
|
"README.txt",
|
18
|
-
"corpus/frek.cdf",
|
19
|
-
"corpus/frek.poliqarp.base1.image",
|
20
|
-
"corpus/frek.poliqarp.corpus.image",
|
21
|
-
"corpus/frek.poliqarp.meta-value.offset",
|
22
|
-
"corpus/frek.poliqarp.rindex.amb",
|
23
|
-
"corpus/frek.poliqarp.rindex.orth.offset",
|
24
|
-
"corpus/frek.poliqarp.subpos1.offset",
|
25
|
-
"corpus/frek.cfg",
|
26
|
-
"corpus/frek.poliqarp.base1.offset",
|
27
|
-
"corpus/frek.poliqarp.meta.image",
|
28
|
-
"corpus/frek.poliqarp.orth.image",
|
29
|
-
"corpus/frek.poliqarp.rindex.amb.offset",
|
30
|
-
"corpus/frek.poliqarp.subchunk.image",
|
31
|
-
"corpus/frek.poliqarp.subpos2.image",
|
32
|
-
"corpus/frek.cfg~",
|
33
|
-
"corpus/frek.poliqarp.base2.image",
|
34
|
-
"corpus/frek.poliqarp.meta-key.image",
|
35
|
-
"corpus/frek.poliqarp.orth.index.alpha",
|
36
|
-
"corpus/frek.poliqarp.rindex.disamb",
|
37
|
-
"corpus/frek.poliqarp.subchunk.item.ch",
|
38
|
-
"corpus/frek.poliqarp.subpos2.offset",
|
39
|
-
"corpus/frek.meta.cfg",
|
40
|
-
"corpus/frek.poliqarp.base2.offset",
|
41
|
-
"corpus/frek.poliqarp.meta-key.offset",
|
42
|
-
"corpus/frek.poliqarp.orth.index.atergo",
|
43
|
-
"corpus/frek.poliqarp.rindex.disamb.offset",
|
44
|
-
"corpus/frek.poliqarp.subchunk.offset",
|
45
|
-
"corpus/frek.poliqarp.tag.image",
|
46
|
-
"corpus/frek.meta.lisp",
|
47
|
-
"corpus/frek.poliqarp.chunk.image",
|
48
|
-
"corpus/frek.poliqarp.meta-value.image",
|
49
|
-
"corpus/frek.poliqarp.orth.offset",
|
50
|
-
"corpus/frek.poliqarp.rindex.orth",
|
51
|
-
"corpus/frek.poliqarp.subpos1.image",
|
52
|
-
"corpus/frek.poliqarp.tag.offset"
|
53
18
|
]
|
54
19
|
s.test_files = [
|
55
20
|
"spec/client.rb",
|
data/spec/client.rb
CHANGED
@@ -29,6 +29,38 @@ describe Poliqarp::Client do
|
|
29
29
|
@client.close
|
30
30
|
end
|
31
31
|
|
32
|
+
it "should allow to set the right context size" do
|
33
|
+
@client.right_context = 5
|
34
|
+
end
|
35
|
+
|
36
|
+
it "should raise error if the size of right context is not number" do
|
37
|
+
(proc do
|
38
|
+
@client.right_context = "a"
|
39
|
+
end).should raise_error(RuntimeError)
|
40
|
+
end
|
41
|
+
|
42
|
+
it "should rais error if the size of right context is less or equal 0" do
|
43
|
+
(proc do
|
44
|
+
@client.right_context = 0
|
45
|
+
end).should raise_error(RuntimeError)
|
46
|
+
end
|
47
|
+
|
48
|
+
it "should allow to set the left context size" do
|
49
|
+
@client.right_context = 5
|
50
|
+
end
|
51
|
+
|
52
|
+
it "should raise error if the size of left context is not number" do
|
53
|
+
(lambda do
|
54
|
+
@client.left_context = "a"
|
55
|
+
end).should raise_error(RuntimeError)
|
56
|
+
end
|
57
|
+
|
58
|
+
it "should rais error if the size of left context is less or equal 0" do
|
59
|
+
(lambda do
|
60
|
+
@client.left_context = 0
|
61
|
+
end).should raise_error(RuntimeError)
|
62
|
+
end
|
63
|
+
|
32
64
|
it "should allow to find 'kot'" do
|
33
65
|
@client.find("kot").size.should_not == 0
|
34
66
|
end
|
@@ -83,6 +115,27 @@ describe Poliqarp::Client do
|
|
83
115
|
@result.to_s.should == @client.find("nachalny")[0].to_s
|
84
116
|
end
|
85
117
|
end
|
118
|
+
|
119
|
+
describe("(with lemmata flags set to true)") do
|
120
|
+
before(:all) do
|
121
|
+
@client.lemmata = {:left_context => true, :right_context => true,
|
122
|
+
:left_match => true, :right_match => true}
|
123
|
+
end
|
124
|
+
|
125
|
+
it "should allow to find 'kotu'" do
|
126
|
+
@client.find("kotu").size.should_not == 0
|
127
|
+
end
|
128
|
+
|
129
|
+
it "should contain 'kotu' in query result for 'kotu'" do
|
130
|
+
@client.find("kotu")[0].to_s.should match(/\bkotu\b/)
|
131
|
+
end
|
132
|
+
|
133
|
+
it "should contain 'kot' in lemmatized query result for 'kotu'" do
|
134
|
+
@client.find("kotu")[0].short_context.flatten.
|
135
|
+
map{|e| e.lemmata[0].base_form}.join(" ").should match(/\bkot\b/)
|
136
|
+
end
|
137
|
+
|
138
|
+
end
|
86
139
|
end
|
87
140
|
|
88
141
|
end
|
data/spec/excerpt.rb
CHANGED
@@ -23,8 +23,19 @@ describe Poliqarp::Excerpt do
|
|
23
23
|
@excerpt.base_form.should_not == nil
|
24
24
|
end
|
25
25
|
|
26
|
-
it "should
|
27
|
-
@excerpt
|
26
|
+
it "should contain 3 groups in short context" do
|
27
|
+
@excerpt.short_context.size.should == 3
|
28
|
+
end
|
29
|
+
|
30
|
+
it "should allow to add segment group" do
|
31
|
+
@excerpt << [Poliqarp::Segment.new("abc")]
|
32
|
+
end
|
33
|
+
|
34
|
+
|
35
|
+
it "should contain non empty segments in short context" do
|
36
|
+
@excerpt.short_context.flatten.each do |segment|
|
37
|
+
segment.literal.should_not == nil
|
38
|
+
end
|
28
39
|
end
|
29
40
|
|
30
41
|
it "should contain the exact form which it was created for" do
|
@@ -92,4 +103,31 @@ describe Poliqarp::Excerpt do
|
|
92
103
|
@excerpt.author[0].should == "Małgorzata Pamuła"
|
93
104
|
end
|
94
105
|
end
|
106
|
+
|
107
|
+
describe('first result for "kotu" with lemmatization turned on') do
|
108
|
+
before(:all) do
|
109
|
+
@client.lemmata = :all
|
110
|
+
@client.open_corpus(:default)
|
111
|
+
@excerpt = @client.find("kotu")[0]
|
112
|
+
end
|
113
|
+
|
114
|
+
it "should have one lemmata for each segment" do
|
115
|
+
@excerpt.short_context.each do |group|
|
116
|
+
group.each do |segment|
|
117
|
+
segment.lemmata.size.should == 1
|
118
|
+
end
|
119
|
+
end
|
120
|
+
end
|
121
|
+
|
122
|
+
it "should have non-nil lemmata for each segment" do
|
123
|
+
@excerpt.short_context.flatten.each do |segment|
|
124
|
+
segment.lemmata[0].should_not == nil
|
125
|
+
end
|
126
|
+
end
|
127
|
+
|
128
|
+
it "should contain 'kot' as one of the lemmata" do
|
129
|
+
@excerpt.short_context.flatten.
|
130
|
+
any?{|s| s.lemmata[0].base_form == "kot"}.should == true
|
131
|
+
end
|
132
|
+
end
|
95
133
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: apohllo-poliqarpr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Aleksander Pohl
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2008-12-
|
12
|
+
date: 2008-12-20 00:00:00 -08:00
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|
@@ -29,42 +29,9 @@ files:
|
|
29
29
|
- lib/poliqarpr/client.rb
|
30
30
|
- lib/poliqarpr/query_result.rb
|
31
31
|
- lib/poliqarpr/excerpt.rb
|
32
|
+
- lib/poliqarpr/lemmata.rb
|
33
|
+
- lib/poliqarpr/segment.rb
|
32
34
|
- README.txt
|
33
|
-
- corpus/frek.cdf
|
34
|
-
- corpus/frek.poliqarp.base1.image
|
35
|
-
- corpus/frek.poliqarp.corpus.image
|
36
|
-
- corpus/frek.poliqarp.meta-value.offset
|
37
|
-
- corpus/frek.poliqarp.rindex.amb
|
38
|
-
- corpus/frek.poliqarp.rindex.orth.offset
|
39
|
-
- corpus/frek.poliqarp.subpos1.offset
|
40
|
-
- corpus/frek.cfg
|
41
|
-
- corpus/frek.poliqarp.base1.offset
|
42
|
-
- corpus/frek.poliqarp.meta.image
|
43
|
-
- corpus/frek.poliqarp.orth.image
|
44
|
-
- corpus/frek.poliqarp.rindex.amb.offset
|
45
|
-
- corpus/frek.poliqarp.subchunk.image
|
46
|
-
- corpus/frek.poliqarp.subpos2.image
|
47
|
-
- corpus/frek.cfg~
|
48
|
-
- corpus/frek.poliqarp.base2.image
|
49
|
-
- corpus/frek.poliqarp.meta-key.image
|
50
|
-
- corpus/frek.poliqarp.orth.index.alpha
|
51
|
-
- corpus/frek.poliqarp.rindex.disamb
|
52
|
-
- corpus/frek.poliqarp.subchunk.item.ch
|
53
|
-
- corpus/frek.poliqarp.subpos2.offset
|
54
|
-
- corpus/frek.meta.cfg
|
55
|
-
- corpus/frek.poliqarp.base2.offset
|
56
|
-
- corpus/frek.poliqarp.meta-key.offset
|
57
|
-
- corpus/frek.poliqarp.orth.index.atergo
|
58
|
-
- corpus/frek.poliqarp.rindex.disamb.offset
|
59
|
-
- corpus/frek.poliqarp.subchunk.offset
|
60
|
-
- corpus/frek.poliqarp.tag.image
|
61
|
-
- corpus/frek.meta.lisp
|
62
|
-
- corpus/frek.poliqarp.chunk.image
|
63
|
-
- corpus/frek.poliqarp.meta-value.image
|
64
|
-
- corpus/frek.poliqarp.orth.offset
|
65
|
-
- corpus/frek.poliqarp.rindex.orth
|
66
|
-
- corpus/frek.poliqarp.subpos1.image
|
67
|
-
- corpus/frek.poliqarp.tag.offset
|
68
35
|
has_rdoc: true
|
69
36
|
homepage: http://www.apohllo.pl/projekty/poliqarpr
|
70
37
|
post_install_message:
|
data/corpus/frek.cdf
DELETED
data/corpus/frek.cfg
DELETED
@@ -1,100 +0,0 @@
|
|
1
|
-
# Config file format for Oasis release
|
2
|
-
# Config version 1.0
|
3
|
-
|
4
|
-
# The new startup section may contain any command normally accepted by the shell
|
5
|
-
|
6
|
-
[ALIASES]
|
7
|
-
|
8
|
-
masc = m1|m2|m3
|
9
|
-
verb = pact|ppas|winien|praet|bedzie|fin|impt|aglt|ger|imps|inf|pant|pcon
|
10
|
-
noun = subst|depr|xxs|ger|ppron12|ppron3
|
11
|
-
pron = ppron12|ppron3|siebie
|
12
|
-
|
13
|
-
|
14
|
-
[ATTR]
|
15
|
-
|
16
|
-
number = sg pl
|
17
|
-
case = nom gen dat acc inst loc voc
|
18
|
-
gender = m1 m2 m3 f n
|
19
|
-
person = pri sec ter
|
20
|
-
degree = pos comp sup
|
21
|
-
aspect = imperf perf
|
22
|
-
negation = aff neg
|
23
|
-
accommodability = congr rec
|
24
|
-
accentability = akc nakc
|
25
|
-
post-prepositionality = npraep praep
|
26
|
-
agglutination = agl nagl
|
27
|
-
vocalicity = nwok wok
|
28
|
-
|
29
|
-
# Parts of speech no longer need forward declarations, this was inconvenient and ugly.
|
30
|
-
# Also, any attribute may be optional so a declaration such as:
|
31
|
-
# foo = [bar] [froz] fred [wilma]
|
32
|
-
# should no longer cause problems and ctags with such attributes now parse correctly regardless
|
33
|
-
# of presence or absence of any optional attribute
|
34
|
-
|
35
|
-
[POS]
|
36
|
-
|
37
|
-
adja =
|
38
|
-
adjp =
|
39
|
-
conj =
|
40
|
-
interp =
|
41
|
-
pred =
|
42
|
-
xxx =
|
43
|
-
adv = degree
|
44
|
-
imps = aspect
|
45
|
-
inf = aspect
|
46
|
-
pant = aspect
|
47
|
-
pcon = aspect
|
48
|
-
qub = [vocalicity]
|
49
|
-
prep = case [vocalicity]
|
50
|
-
siebie = case
|
51
|
-
subst = number case gender
|
52
|
-
depr = number case gender
|
53
|
-
xxs = number case gender
|
54
|
-
ger = number case gender aspect negation
|
55
|
-
ppron12 = number case gender person [accentability]
|
56
|
-
ppron3 = number case gender person [accentability] [post-prepositionality]
|
57
|
-
num = number case gender [accommodability]
|
58
|
-
adj = number case gender degree
|
59
|
-
pact = number case gender aspect negation
|
60
|
-
ppas = number case gender aspect negation
|
61
|
-
winien = number gender aspect
|
62
|
-
praet = number gender aspect [agglutination]
|
63
|
-
bedzie = number person aspect
|
64
|
-
fin = number person aspect
|
65
|
-
impt = number person aspect
|
66
|
-
aglt = number person aspect vocalicity
|
67
|
-
ign =
|
68
|
-
|
69
|
-
# Named entities replaced old 'special' attributes, name changed mostly because of
|
70
|
-
# unification of 'named-thing' handling code into one named-entity thing
|
71
|
-
# Entity aliasing allows for any existing entity to be seen under different name
|
72
|
-
#
|
73
|
-
# FCQP provides four builtin entities:
|
74
|
-
# entity-current
|
75
|
-
# entity-base
|
76
|
-
# entity-tag
|
77
|
-
# entity-pos
|
78
|
-
|
79
|
-
[NAMED-ENTITY]
|
80
|
-
|
81
|
-
entity-orth = orth
|
82
|
-
entity-base = base
|
83
|
-
entity-tag = tag
|
84
|
-
entity-pos = pos
|
85
|
-
|
86
|
-
# Old 'aliases' for attribute names
|
87
|
-
|
88
|
-
pos = flex
|
89
|
-
number = numb nmb
|
90
|
-
case = cas
|
91
|
-
gender = gnd gend
|
92
|
-
person = per pers
|
93
|
-
degree = deg degr
|
94
|
-
aspect = asp
|
95
|
-
negation = neg
|
96
|
-
accommodability = acco acom acm
|
97
|
-
accentability = acce acen acn
|
98
|
-
post-prepositionality = ppr ppre
|
99
|
-
agglutination = agg aggl
|
100
|
-
vocalicity = vcl
|
data/corpus/frek.cfg~
DELETED
@@ -1,100 +0,0 @@
|
|
1
|
-
# Config file format for Oasis release
|
2
|
-
# Config version 1.0
|
3
|
-
|
4
|
-
# The new startup section may contain any command normally accepted by the shell
|
5
|
-
|
6
|
-
[STARTUP]
|
7
|
-
|
8
|
-
/alias masc = m1 m2 m3
|
9
|
-
/alias verb = pact ppas winien praet bedzie fin impt aglt ger imps inf pant pcon
|
10
|
-
/alias noun = subst depr xxs ger ppron12 ppron3
|
11
|
-
/alias pron = ppron12 ppron3 siebie
|
12
|
-
|
13
|
-
|
14
|
-
[ATTR]
|
15
|
-
|
16
|
-
number = sg pl
|
17
|
-
case = nom gen dat acc inst loc voc
|
18
|
-
gender = m1 m2 m3 f n
|
19
|
-
person = pri sec ter
|
20
|
-
degree = pos comp sup
|
21
|
-
aspect = imperf perf
|
22
|
-
negation = aff neg
|
23
|
-
accommodability = congr rec
|
24
|
-
accentability = akc nakc
|
25
|
-
post-prepositionality = npraep praep
|
26
|
-
agglutination = agl nagl
|
27
|
-
vocalicity = nwok wok
|
28
|
-
|
29
|
-
# Parts of speech no longer need forward declarations, this was inconvenient and ugly.
|
30
|
-
# Also, any attribute may be optional so a declaration such as:
|
31
|
-
# foo = [bar] [froz] fred [wilma]
|
32
|
-
# should no longer cause problems and ctags with such attributes now parse correctly regardless
|
33
|
-
# of presence or absence of any optional attribute
|
34
|
-
|
35
|
-
[POS]
|
36
|
-
|
37
|
-
adja =
|
38
|
-
adjp =
|
39
|
-
conj =
|
40
|
-
interp =
|
41
|
-
pred =
|
42
|
-
xxx =
|
43
|
-
adv = degree
|
44
|
-
imps = aspect
|
45
|
-
inf = aspect
|
46
|
-
pant = aspect
|
47
|
-
pcon = aspect
|
48
|
-
qub = [vocalicity]
|
49
|
-
prep = case [vocalicity]
|
50
|
-
siebie = case
|
51
|
-
subst = number case gender
|
52
|
-
depr = number case gender
|
53
|
-
xxs = number case gender
|
54
|
-
ger = number case gender aspect negation
|
55
|
-
ppron12 = number case gender person [accentability]
|
56
|
-
ppron3 = number case gender person [accentability] [post-prepositionality]
|
57
|
-
num = number case gender [accommodability]
|
58
|
-
adj = number case gender degree
|
59
|
-
pact = number case gender aspect negation
|
60
|
-
ppas = number case gender aspect negation
|
61
|
-
winien = number gender aspect
|
62
|
-
praet = number gender aspect [agglutination]
|
63
|
-
bedzie = number person aspect
|
64
|
-
fin = number person aspect
|
65
|
-
impt = number person aspect
|
66
|
-
aglt = number person aspect vocalicity
|
67
|
-
ign =
|
68
|
-
|
69
|
-
# Named entities replaced old 'special' attributes, name changed mostly because of
|
70
|
-
# unification of 'named-thing' handling code into one named-entity thing
|
71
|
-
# Entity aliasing allows for any existing entity to be seen under different name
|
72
|
-
#
|
73
|
-
# FCQP provides four builtin entities:
|
74
|
-
# entity-current
|
75
|
-
# entity-base
|
76
|
-
# entity-tag
|
77
|
-
# entity-pos
|
78
|
-
|
79
|
-
[NAMED-ENTITY]
|
80
|
-
|
81
|
-
entity-orth = orth
|
82
|
-
entity-base = base
|
83
|
-
entity-tag = tag
|
84
|
-
entity-pos = pos
|
85
|
-
|
86
|
-
# Old 'aliases' for attribute names
|
87
|
-
|
88
|
-
pos = flex
|
89
|
-
number = numb nmb
|
90
|
-
case = cas
|
91
|
-
gender = gnd gend
|
92
|
-
person = per pers
|
93
|
-
degree = deg degr
|
94
|
-
aspect = asp
|
95
|
-
negation = neg
|
96
|
-
accommodability = acco acom acm
|
97
|
-
accentability = acce acen acn
|
98
|
-
post-prepositionality = ppr ppre
|
99
|
-
agglutination = agg aggl
|
100
|
-
vocalicity = vcl
|
data/corpus/frek.meta.cfg
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
S sample
|
data/corpus/frek.meta.lisp
DELETED
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|