apohllo-poliqarpr 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.txt +30 -0
- data/changelog.txt +8 -0
- data/lib/poliqarpr/client.rb +151 -27
- data/lib/poliqarpr/excerpt.rb +16 -0
- data/lib/poliqarpr/lemmata.rb +11 -0
- data/lib/poliqarpr/query_result.rb +14 -0
- data/lib/poliqarpr/segment.rb +23 -0
- data/lib/poliqarpr.rb +7 -1
- data/poliqarpr.gemspec +4 -39
- data/spec/client.rb +53 -0
- data/spec/excerpt.rb +40 -2
- metadata +4 -37
- data/corpus/frek.cdf +0 -4
- data/corpus/frek.cfg +0 -100
- data/corpus/frek.cfg~ +0 -100
- data/corpus/frek.meta.cfg +0 -1
- data/corpus/frek.meta.lisp +0 -4
- data/corpus/frek.poliqarp.base1.image +0 -0
- data/corpus/frek.poliqarp.base1.offset +0 -0
- data/corpus/frek.poliqarp.base2.image +0 -0
- data/corpus/frek.poliqarp.base2.offset +0 -0
- data/corpus/frek.poliqarp.chunk.image +0 -0
- data/corpus/frek.poliqarp.corpus.image +0 -0
- data/corpus/frek.poliqarp.meta-key.image +0 -0
- data/corpus/frek.poliqarp.meta-key.offset +0 -0
- data/corpus/frek.poliqarp.meta-value.image +0 -0
- data/corpus/frek.poliqarp.meta-value.offset +0 -0
- data/corpus/frek.poliqarp.meta.image +0 -0
- data/corpus/frek.poliqarp.orth.image +0 -0
- data/corpus/frek.poliqarp.orth.index.alpha +0 -0
- data/corpus/frek.poliqarp.orth.index.atergo +0 -0
- data/corpus/frek.poliqarp.orth.offset +0 -0
- data/corpus/frek.poliqarp.rindex.amb +0 -0
- data/corpus/frek.poliqarp.rindex.amb.offset +0 -0
- data/corpus/frek.poliqarp.rindex.disamb +0 -0
- data/corpus/frek.poliqarp.rindex.disamb.offset +0 -0
- data/corpus/frek.poliqarp.rindex.orth +0 -0
- data/corpus/frek.poliqarp.rindex.orth.offset +0 -0
- data/corpus/frek.poliqarp.subchunk.image +0 -0
- data/corpus/frek.poliqarp.subchunk.item.ch +0 -0
- data/corpus/frek.poliqarp.subchunk.offset +0 -0
- data/corpus/frek.poliqarp.subpos1.image +0 -0
- data/corpus/frek.poliqarp.subpos1.offset +0 -0
- data/corpus/frek.poliqarp.subpos2.image +0 -0
- data/corpus/frek.poliqarp.subpos2.offset +0 -0
- data/corpus/frek.poliqarp.tag.image +0 -0
- data/corpus/frek.poliqarp.tag.offset +0 -0
data/README.txt
CHANGED
@@ -9,6 +9,8 @@ Poliqarpr is Ruby client for Poliqarp server.
|
|
9
9
|
|
10
10
|
== FEATURES/PROBLEMS:
|
11
11
|
|
12
|
+
* built-in pagination of query results
|
13
|
+
* support for lemmatization
|
12
14
|
* asynchronous communication is implemented in synchronous manner
|
13
15
|
* only partial implementation of server protocol
|
14
16
|
|
@@ -38,6 +40,34 @@ Then you can type:
|
|
38
40
|
|
39
41
|
* sudo gem install apohllo-poliqarpr
|
40
42
|
|
43
|
+
You can install the optional default corpus (warning: it is distributed under
|
44
|
+
a different license!):
|
45
|
+
|
46
|
+
* sudo gem install apohllo-poliqarpr-corpus
|
47
|
+
|
48
|
+
== BASIC USAGE:
|
49
|
+
|
50
|
+
(You need the poliqarpr-corpus to be installed for this to work. See the last
|
51
|
+
step of installation process).
|
52
|
+
|
53
|
+
Require the gem:
|
54
|
+
|
55
|
+
require 'poliqarpr'
|
56
|
+
|
57
|
+
Create the server client and open default corpus
|
58
|
+
|
59
|
+
client = Poliqarp::Client.new
|
60
|
+
client.open_corpus :default
|
61
|
+
|
62
|
+
Query the corpus for given segment
|
63
|
+
|
64
|
+
result = client.find("kot")
|
65
|
+
result[0].to_s
|
66
|
+
|
67
|
+
Remember to close the client on exit
|
68
|
+
|
69
|
+
client.close
|
70
|
+
|
41
71
|
|
42
72
|
== LICENSE:
|
43
73
|
|
data/changelog.txt
CHANGED
@@ -1,3 +1,11 @@
|
|
1
|
+
0.0.3
|
2
|
+
- the license of the corpus included
|
3
|
+
- client rdoc documentation
|
4
|
+
- support for lemmata retrieval
|
5
|
+
- excerpt now contains segments instead of strings
|
6
|
+
- buffer size setter
|
7
|
+
- default corpus moved to separate plugin (sudo gem install apohllo-poliqarpr-corpus)
|
8
|
+
|
1
9
|
0.0.2
|
2
10
|
- query result is full blown class
|
3
11
|
- source divided into client, excerpt and query result
|
data/lib/poliqarpr/client.rb
CHANGED
@@ -1,6 +1,10 @@
|
|
1
|
+
require 'socket'
|
1
2
|
module Poliqarp
|
3
|
+
# Author:: Aleksander Pohl (mailto:apohllo@o2.pl)
|
4
|
+
# License:: MIT License
|
5
|
+
#
|
6
|
+
# This class is the implementation of the Poliqarp server client.
|
2
7
|
class Client
|
3
|
-
DEFAULT_CORPUS = File.join(File.expand_path(File.dirname(__FILE__)),"..", "..", "corpus", "frek")
|
4
8
|
ERRORS = {
|
5
9
|
1 => "Incorrect number of arguments",
|
6
10
|
3 => "No session opened",
|
@@ -20,8 +24,22 @@ module Poliqarp
|
|
20
24
|
19 => "Invalid session option value",
|
21
25
|
20 => "Invalid sorting criteria"
|
22
26
|
}
|
27
|
+
GROUPS = [:left_context, :left_match, :right_match, :right_context]
|
28
|
+
|
29
|
+
# If debug is turned on, the communication between server and client
|
30
|
+
# is logged to standard output.
|
23
31
|
attr_writer :debug
|
24
32
|
|
33
|
+
# The size of the buffer is the maximum number of excerpts which
|
34
|
+
# are returned for single query.
|
35
|
+
attr_writer :buffer_size
|
36
|
+
|
37
|
+
# Creates new poliqarp server client.
|
38
|
+
#
|
39
|
+
# Parameters:
|
40
|
+
# * +session_name+ the name of the client session. Defaults to "RUBY".
|
41
|
+
# * +debug+ if set to true, all messages sent and received from server
|
42
|
+
# are printed to standard output. Defaults to false.
|
25
43
|
def initialize(session_name="RUBY", debug=false)
|
26
44
|
@session_name = session_name
|
27
45
|
@left_context = 5
|
@@ -31,9 +49,22 @@ module Poliqarp
|
|
31
49
|
new_session
|
32
50
|
end
|
33
51
|
|
34
|
-
|
52
|
+
# A hint about uninstalled default corpus gem
|
53
|
+
def self.const_missing(const)
|
54
|
+
if const.to_s =~ /DEFAULT_CORPUS/
|
55
|
+
raise "You need to install 'apohllo-poliqarpr-corpus' to use the default corpus"
|
56
|
+
end
|
57
|
+
super
|
58
|
+
end
|
59
|
+
|
60
|
+
# Creates new session for the client with the name given in constructor.
|
61
|
+
# If the session was already opened, it is closed.
|
62
|
+
#
|
63
|
+
# Parameters:
|
64
|
+
# * +port+ - the port on which the poliqarpd server is accepting connections (defaults to 4567)
|
65
|
+
def new_session(port=4567)
|
35
66
|
close if @session
|
36
|
-
@socket = TCPSocket.new("localhost",
|
67
|
+
@socket = TCPSocket.new("localhost",port)
|
37
68
|
talk "MAKE-SESSION #{@session_name}"
|
38
69
|
rcv_sync
|
39
70
|
talk("BUFFER-RESIZE #{@buffer_size}")
|
@@ -43,11 +74,7 @@ module Poliqarp
|
|
43
74
|
self.lemmata = {}
|
44
75
|
end
|
45
76
|
|
46
|
-
|
47
|
-
puts msg if @debug
|
48
|
-
@socket.puts(msg)
|
49
|
-
end
|
50
|
-
|
77
|
+
# Closes the opened connection to the poliqarpd server.
|
51
78
|
def close
|
52
79
|
#talk "CLOSE"
|
53
80
|
#rcv_sync
|
@@ -57,43 +84,88 @@ module Poliqarp
|
|
57
84
|
@session = false
|
58
85
|
end
|
59
86
|
|
87
|
+
# Sets the size of the left short context. It must be > 0
|
88
|
+
#
|
89
|
+
# The size of the left short context is the number
|
90
|
+
# of segments displayed in the found excerpts left to the
|
91
|
+
# matched segment(s).
|
60
92
|
def left_context=(value)
|
61
|
-
if value
|
93
|
+
if correct_context_value?(value)
|
62
94
|
talk "SET left-context-width #{value}"
|
63
95
|
result = rcv_sync
|
64
96
|
@left_context = value if result =~ /^R OK/
|
97
|
+
else
|
98
|
+
raise "Invalid argument: #{value}. It must be fixnum greater than 0."
|
65
99
|
end
|
66
100
|
end
|
67
101
|
|
102
|
+
# Sets the size of the right short context. It must be > 0
|
103
|
+
#
|
104
|
+
# The size of the right short context is the number
|
105
|
+
# of segments displayed in the found excerpts right to the
|
106
|
+
# matched segment(s).
|
68
107
|
def right_context=(value)
|
69
|
-
if value
|
108
|
+
if correct_context_value?(value)
|
70
109
|
talk "SET right-context-width #{value}"
|
71
110
|
result = rcv_sync
|
72
111
|
@right_context = value if result =~ /^R OK/
|
112
|
+
else
|
113
|
+
raise "Invalid argument: #{value}. It must be fixnum greater than 0."
|
73
114
|
end
|
74
115
|
end
|
75
116
|
|
117
|
+
# Sets the tags' flags. There are four groups of segments
|
118
|
+
# which the flags apply for:
|
119
|
+
# * +left_context+
|
120
|
+
# * +left_match+
|
121
|
+
# * +right_match+
|
122
|
+
# * +right_context+
|
123
|
+
#
|
124
|
+
# If the flag for given group is set to true, all segments
|
125
|
+
# in the group are annotated with grammatical tags. E.g.:
|
126
|
+
# c.find("kot")
|
127
|
+
# ...
|
128
|
+
# "kot" tags: "subst:sg:nom:m2"
|
129
|
+
#
|
130
|
+
# You can pass :all to turn on flags for all groups
|
76
131
|
def tags=(options={})
|
132
|
+
options = set_all_flags if options == :all
|
133
|
+
@tag_flags = options
|
77
134
|
flags = ""
|
78
|
-
|
79
|
-
:rightM_tags, :right_context_tags].each do |flag|
|
135
|
+
GROUPS.each do |flag|
|
80
136
|
flags << (options[flag] ? "1" : "0")
|
81
137
|
end
|
82
138
|
talk "SET retrieve-tags #{flags}"
|
83
139
|
rcv_sync
|
84
140
|
end
|
85
141
|
|
142
|
+
# Sets the lemmatas' flags. There are four groups of segments
|
143
|
+
# which the flags apply for:
|
144
|
+
# * +left_context+
|
145
|
+
# * +left_match+
|
146
|
+
# * +right_match+
|
147
|
+
# * +right_context+
|
148
|
+
#
|
149
|
+
# If the flag for given group is set to true, all segments
|
150
|
+
# in the group are returned with the base form of the lemmata. E.g.:
|
151
|
+
# c.find("kotu")
|
152
|
+
# ...
|
153
|
+
# "kotu" base_form: "kot"
|
154
|
+
#
|
155
|
+
# You can pass :all to turn on flags for all groups
|
86
156
|
def lemmata=(options={})
|
157
|
+
options = set_all_flags if options == :all
|
158
|
+
@lemmata_flags = options
|
87
159
|
flags = ""
|
88
|
-
|
89
|
-
:rightM_lemmata, :right_context_lemmata].each do |flag|
|
160
|
+
GROUPS.each do |flag|
|
90
161
|
flags << (options[flag] ? "1" : "0")
|
91
162
|
end
|
92
163
|
talk "SET retrieve-lemmata #{flags}"
|
93
164
|
rcv_sync
|
94
165
|
end
|
95
166
|
|
96
|
-
|
167
|
+
# Opens the corpus given as +path+. To open the default
|
168
|
+
# corpus pass +:default+ as the argument.
|
97
169
|
def open_corpus(path)
|
98
170
|
if path == :default
|
99
171
|
open_corpus(DEFAULT_CORPUS)
|
@@ -104,6 +176,19 @@ module Poliqarp
|
|
104
176
|
end
|
105
177
|
end
|
106
178
|
|
179
|
+
# Send the query to the opened corpus.
|
180
|
+
#
|
181
|
+
# Options:
|
182
|
+
# * +index+ the index of the (only one) result to be returned. The index is relative
|
183
|
+
# to the beginning of the query result. In normal case you should query the
|
184
|
+
# corpus without specifying the index, to see what results are returned.
|
185
|
+
# Then you can use the index and the same query to retrieve one result.
|
186
|
+
# The pair (query, index) is a kind of unique identifier of the excerpt.
|
187
|
+
# * +page_size+ the size of the page of results. If the page size is 0, then
|
188
|
+
# all results are returned on one page. It is ignored if the +index+ option
|
189
|
+
# is present. Defaults to 0.
|
190
|
+
# * +page_index+ the index of the page of results (the first page has index 1, not 0).
|
191
|
+
# It is ignored if the +index+ option is present. Defaults to 1.
|
107
192
|
def find(query,options={})
|
108
193
|
if options[:index]
|
109
194
|
find_one(query, options[:index])
|
@@ -114,10 +199,13 @@ module Poliqarp
|
|
114
199
|
|
115
200
|
alias query find
|
116
201
|
|
202
|
+
# Returns the number of results for given query.
|
117
203
|
def count(query)
|
118
204
|
count_results(make_query(query))
|
119
205
|
end
|
120
206
|
|
207
|
+
# Returns the long context of the excerpt which is identified by
|
208
|
+
# given (query, index) pair.
|
121
209
|
def context(query,index)
|
122
210
|
make_query(query)
|
123
211
|
result = []
|
@@ -135,6 +223,8 @@ module Poliqarp
|
|
135
223
|
result
|
136
224
|
end
|
137
225
|
|
226
|
+
# Returns the metadata of the excerpt which is identified by
|
227
|
+
# given (query, index) pair.
|
138
228
|
def metadata(query, index)
|
139
229
|
make_query(query)
|
140
230
|
result = {}
|
@@ -152,6 +242,13 @@ module Poliqarp
|
|
152
242
|
end
|
153
243
|
|
154
244
|
protected
|
245
|
+
# Sends a message directly to the server
|
246
|
+
# * +msg+ the message to send
|
247
|
+
def talk(msg)
|
248
|
+
puts msg if @debug
|
249
|
+
@socket.puts(msg)
|
250
|
+
end
|
251
|
+
|
155
252
|
def find_many(query, options)
|
156
253
|
page_size = (options[:page_size] || 0)
|
157
254
|
page_index = (options[:page_index] || 1)
|
@@ -193,29 +290,45 @@ protected
|
|
193
290
|
end
|
194
291
|
|
195
292
|
# Fetches one result of the query
|
196
|
-
|
293
|
+
##
|
197
294
|
# MAKE-QUERY and GET-RESULTS must be called on server before
|
198
295
|
# this method is called
|
199
296
|
def fetch_result(index, query)
|
200
297
|
result = Excerpt.new(index, self, query)
|
201
|
-
|
202
|
-
result << read_segments
|
203
|
-
#
|
204
|
-
result << read_segments
|
205
|
-
|
206
|
-
result << read_segments
|
298
|
+
result << read_segments(:left_context)
|
299
|
+
result << read_segments(:left_match)
|
300
|
+
# XXX
|
301
|
+
#result << read_segments(:right_match)
|
302
|
+
result << read_segments(:right_context)
|
207
303
|
|
208
304
|
result
|
209
305
|
end
|
210
306
|
|
211
|
-
def read_segments
|
212
|
-
|
213
|
-
size = answer.match(/\d+/)[0].to_i
|
307
|
+
def read_segments(group)
|
308
|
+
size = get_number(rcv_sync)
|
214
309
|
segments = []
|
215
310
|
size.times do |segment_index|
|
216
|
-
|
311
|
+
segment = Segment.new(read_word)
|
312
|
+
segments << segment
|
313
|
+
if @lemmata_flags[group] || @tag_flags[group]
|
314
|
+
lemmata_size = get_number(rcv_sync)
|
315
|
+
lemmata_size.times do |lemmata_index|
|
316
|
+
lemmata = Lemmata.new()
|
317
|
+
if @lemmata_flags[group]
|
318
|
+
lemmata.base_form = read_word
|
319
|
+
end
|
320
|
+
if @tag_flags[group]
|
321
|
+
read_word
|
322
|
+
end
|
323
|
+
segment.lemmata << lemmata
|
324
|
+
end
|
325
|
+
end
|
217
326
|
end
|
218
|
-
segments
|
327
|
+
segments
|
328
|
+
end
|
329
|
+
|
330
|
+
def get_number(str)
|
331
|
+
str.match(/\d+/)[0].to_i
|
219
332
|
end
|
220
333
|
|
221
334
|
def count_results(answer)
|
@@ -268,5 +381,16 @@ protected
|
|
268
381
|
end until line =~ /^M/
|
269
382
|
line
|
270
383
|
end
|
384
|
+
|
385
|
+
private
|
386
|
+
def set_all_flags
|
387
|
+
options = {}
|
388
|
+
GROUPS.each{|g| options[g] = true}
|
389
|
+
options
|
390
|
+
end
|
391
|
+
|
392
|
+
def correct_context_value?(value)
|
393
|
+
value.is_a?(Fixnum) && value > 0
|
394
|
+
end
|
271
395
|
end
|
272
396
|
end
|
data/lib/poliqarpr/excerpt.rb
CHANGED
@@ -1,4 +1,15 @@
|
|
1
1
|
module Poliqarp
|
2
|
+
# Author:: Aleksander Pohl
|
3
|
+
# License:: MIT License
|
4
|
+
#
|
5
|
+
# The excerpt class is used to store single result of the query,
|
6
|
+
# i.e. the excerpt of the corpus which contains the words which
|
7
|
+
# the corpus was queried for.
|
8
|
+
#
|
9
|
+
# The excerpt is divided into groups, which contain segments,
|
10
|
+
# which the texts in the corpus were divided for.
|
11
|
+
# The first group is the left context, the second -- the matched
|
12
|
+
# query, and the last -- the right context.
|
2
13
|
class Excerpt
|
3
14
|
attr_reader :index, :base_form, :short_context
|
4
15
|
|
@@ -9,11 +20,13 @@ module Poliqarp
|
|
9
20
|
@short_context = []
|
10
21
|
end
|
11
22
|
|
23
|
+
# Adds segment group to the excerpt
|
12
24
|
def <<(value)
|
13
25
|
@short_context << value
|
14
26
|
end
|
15
27
|
|
16
28
|
|
29
|
+
# Returns the matched query as string
|
17
30
|
def word
|
18
31
|
#@short_context[0].split(/\s+/)[-1]
|
19
32
|
@short_context[1].to_s
|
@@ -21,10 +34,13 @@ module Poliqarp
|
|
21
34
|
|
22
35
|
alias inflected_form word
|
23
36
|
|
37
|
+
# The string representation of the excerpt is the short
|
38
|
+
# context of the query.
|
24
39
|
def to_s
|
25
40
|
@short_context.join("")
|
26
41
|
end
|
27
42
|
|
43
|
+
# Returns the long context of the query.
|
28
44
|
def context
|
29
45
|
return @context unless @context.nil?
|
30
46
|
@context = @client.context(@base_form, @index)
|
@@ -1,4 +1,10 @@
|
|
1
1
|
module Poliqarp
|
2
|
+
# Author:: Aleksander Pohl (mailto:apohllo@o2.pl)
|
3
|
+
# License:: MIT License
|
4
|
+
#
|
5
|
+
# The query result class is used to paginate results of the
|
6
|
+
# query. Each query result has information about its context
|
7
|
+
# (the next and previous page).
|
2
8
|
class QueryResult
|
3
9
|
include Enumerable
|
4
10
|
|
@@ -13,10 +19,12 @@ module Poliqarp
|
|
13
19
|
@excerpts = []
|
14
20
|
end
|
15
21
|
|
22
|
+
# Adds excerpt to the query result
|
16
23
|
def <<(excerpt)
|
17
24
|
@excerpts << excerpt
|
18
25
|
end
|
19
26
|
|
27
|
+
# Allows to iterate over the results stored in the result
|
20
28
|
def each
|
21
29
|
@excerpts.each{|e| yield e}
|
22
30
|
end
|
@@ -27,16 +35,20 @@ module Poliqarp
|
|
27
35
|
end
|
28
36
|
end
|
29
37
|
|
38
|
+
# Returns excerpt with given index.
|
30
39
|
def [](index)
|
31
40
|
@excerpts[index]
|
32
41
|
end
|
33
42
|
|
43
|
+
# Two query results are equal iff their page number, page count,
|
44
|
+
# query and page size are equal.
|
34
45
|
def ==(other)
|
35
46
|
return false unless other.is_a? QueryResult
|
36
47
|
@page == other.page && @page_count == other.page_count &&
|
37
48
|
@query == other.query && @page_size == other.page_size
|
38
49
|
end
|
39
50
|
|
51
|
+
# Returns the previous page of the query result
|
40
52
|
def previous_page
|
41
53
|
if @page > 1
|
42
54
|
@client.find(@query, :page_size => @page_size,
|
@@ -44,6 +56,7 @@ module Poliqarp
|
|
44
56
|
end
|
45
57
|
end
|
46
58
|
|
59
|
+
# Returns the next page of the query result
|
47
60
|
def next_page
|
48
61
|
if @page < @page_count
|
49
62
|
@client.find(@query, :page_size => @page_size,
|
@@ -51,6 +64,7 @@ module Poliqarp
|
|
51
64
|
end
|
52
65
|
end
|
53
66
|
|
67
|
+
# Returns the number of excerpts stored in this page (query result)
|
54
68
|
def size
|
55
69
|
@excerpts.size
|
56
70
|
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Poliqarp
|
2
|
+
# Author:: Aleksander Pohl (mailto:apohllo@o2.pl)
|
3
|
+
# License:: MIT LICENSE
|
4
|
+
#
|
5
|
+
# The segment is the smallest meaningful part of the text.
|
6
|
+
# It may contain many lemmata, since the segments are sometimes
|
7
|
+
# not disambiguated.
|
8
|
+
class Segment
|
9
|
+
attr_reader :literal, :lemmata
|
10
|
+
|
11
|
+
# Creates new segment. The specified argument is the literal
|
12
|
+
# (as found in the text) representation of the segment.
|
13
|
+
def initialize(literal)
|
14
|
+
@literal = literal
|
15
|
+
@lemmata = []
|
16
|
+
end
|
17
|
+
|
18
|
+
# Returns the segment literal
|
19
|
+
def to_s
|
20
|
+
@literal
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
data/lib/poliqarpr.rb
CHANGED
@@ -2,4 +2,10 @@ path = File.join(File.dirname(__FILE__), 'poliqarpr')
|
|
2
2
|
require File.join(path, 'client')
|
3
3
|
require File.join(path, 'query_result')
|
4
4
|
require File.join(path, 'excerpt')
|
5
|
-
|
5
|
+
require File.join(path, 'segment')
|
6
|
+
require File.join(path, 'lemmata')
|
7
|
+
begin
|
8
|
+
require 'poliqarpr-corpus'
|
9
|
+
rescue LoadError
|
10
|
+
# Do nothing, since the default corpus is optional
|
11
|
+
end
|
data/poliqarpr.gemspec
CHANGED
@@ -1,9 +1,7 @@
|
|
1
|
-
#require 'rake'
|
2
|
-
|
3
1
|
Gem::Specification.new do |s|
|
4
2
|
s.name = "poliqarpr"
|
5
|
-
s.version = "0.0.
|
6
|
-
s.date = "2008-12-
|
3
|
+
s.version = "0.0.3"
|
4
|
+
s.date = "2008-12-20"
|
7
5
|
s.summary = "Ruby client for Poliqarp"
|
8
6
|
s.email = "apohllo@o2.pl"
|
9
7
|
s.homepage = "http://www.apohllo.pl/projekty/poliqarpr"
|
@@ -14,42 +12,9 @@ Gem::Specification.new do |s|
|
|
14
12
|
"lib/poliqarpr/client.rb",
|
15
13
|
"lib/poliqarpr/query_result.rb",
|
16
14
|
"lib/poliqarpr/excerpt.rb",
|
15
|
+
"lib/poliqarpr/lemmata.rb",
|
16
|
+
"lib/poliqarpr/segment.rb",
|
17
17
|
"README.txt",
|
18
|
-
"corpus/frek.cdf",
|
19
|
-
"corpus/frek.poliqarp.base1.image",
|
20
|
-
"corpus/frek.poliqarp.corpus.image",
|
21
|
-
"corpus/frek.poliqarp.meta-value.offset",
|
22
|
-
"corpus/frek.poliqarp.rindex.amb",
|
23
|
-
"corpus/frek.poliqarp.rindex.orth.offset",
|
24
|
-
"corpus/frek.poliqarp.subpos1.offset",
|
25
|
-
"corpus/frek.cfg",
|
26
|
-
"corpus/frek.poliqarp.base1.offset",
|
27
|
-
"corpus/frek.poliqarp.meta.image",
|
28
|
-
"corpus/frek.poliqarp.orth.image",
|
29
|
-
"corpus/frek.poliqarp.rindex.amb.offset",
|
30
|
-
"corpus/frek.poliqarp.subchunk.image",
|
31
|
-
"corpus/frek.poliqarp.subpos2.image",
|
32
|
-
"corpus/frek.cfg~",
|
33
|
-
"corpus/frek.poliqarp.base2.image",
|
34
|
-
"corpus/frek.poliqarp.meta-key.image",
|
35
|
-
"corpus/frek.poliqarp.orth.index.alpha",
|
36
|
-
"corpus/frek.poliqarp.rindex.disamb",
|
37
|
-
"corpus/frek.poliqarp.subchunk.item.ch",
|
38
|
-
"corpus/frek.poliqarp.subpos2.offset",
|
39
|
-
"corpus/frek.meta.cfg",
|
40
|
-
"corpus/frek.poliqarp.base2.offset",
|
41
|
-
"corpus/frek.poliqarp.meta-key.offset",
|
42
|
-
"corpus/frek.poliqarp.orth.index.atergo",
|
43
|
-
"corpus/frek.poliqarp.rindex.disamb.offset",
|
44
|
-
"corpus/frek.poliqarp.subchunk.offset",
|
45
|
-
"corpus/frek.poliqarp.tag.image",
|
46
|
-
"corpus/frek.meta.lisp",
|
47
|
-
"corpus/frek.poliqarp.chunk.image",
|
48
|
-
"corpus/frek.poliqarp.meta-value.image",
|
49
|
-
"corpus/frek.poliqarp.orth.offset",
|
50
|
-
"corpus/frek.poliqarp.rindex.orth",
|
51
|
-
"corpus/frek.poliqarp.subpos1.image",
|
52
|
-
"corpus/frek.poliqarp.tag.offset"
|
53
18
|
]
|
54
19
|
s.test_files = [
|
55
20
|
"spec/client.rb",
|
data/spec/client.rb
CHANGED
@@ -29,6 +29,38 @@ describe Poliqarp::Client do
|
|
29
29
|
@client.close
|
30
30
|
end
|
31
31
|
|
32
|
+
it "should allow to set the right context size" do
|
33
|
+
@client.right_context = 5
|
34
|
+
end
|
35
|
+
|
36
|
+
it "should raise error if the size of right context is not number" do
|
37
|
+
(proc do
|
38
|
+
@client.right_context = "a"
|
39
|
+
end).should raise_error(RuntimeError)
|
40
|
+
end
|
41
|
+
|
42
|
+
it "should rais error if the size of right context is less or equal 0" do
|
43
|
+
(proc do
|
44
|
+
@client.right_context = 0
|
45
|
+
end).should raise_error(RuntimeError)
|
46
|
+
end
|
47
|
+
|
48
|
+
it "should allow to set the left context size" do
|
49
|
+
@client.right_context = 5
|
50
|
+
end
|
51
|
+
|
52
|
+
it "should raise error if the size of left context is not number" do
|
53
|
+
(lambda do
|
54
|
+
@client.left_context = "a"
|
55
|
+
end).should raise_error(RuntimeError)
|
56
|
+
end
|
57
|
+
|
58
|
+
it "should rais error if the size of left context is less or equal 0" do
|
59
|
+
(lambda do
|
60
|
+
@client.left_context = 0
|
61
|
+
end).should raise_error(RuntimeError)
|
62
|
+
end
|
63
|
+
|
32
64
|
it "should allow to find 'kot'" do
|
33
65
|
@client.find("kot").size.should_not == 0
|
34
66
|
end
|
@@ -83,6 +115,27 @@ describe Poliqarp::Client do
|
|
83
115
|
@result.to_s.should == @client.find("nachalny")[0].to_s
|
84
116
|
end
|
85
117
|
end
|
118
|
+
|
119
|
+
describe("(with lemmata flags set to true)") do
|
120
|
+
before(:all) do
|
121
|
+
@client.lemmata = {:left_context => true, :right_context => true,
|
122
|
+
:left_match => true, :right_match => true}
|
123
|
+
end
|
124
|
+
|
125
|
+
it "should allow to find 'kotu'" do
|
126
|
+
@client.find("kotu").size.should_not == 0
|
127
|
+
end
|
128
|
+
|
129
|
+
it "should contain 'kotu' in query result for 'kotu'" do
|
130
|
+
@client.find("kotu")[0].to_s.should match(/\bkotu\b/)
|
131
|
+
end
|
132
|
+
|
133
|
+
it "should contain 'kot' in lemmatized query result for 'kotu'" do
|
134
|
+
@client.find("kotu")[0].short_context.flatten.
|
135
|
+
map{|e| e.lemmata[0].base_form}.join(" ").should match(/\bkot\b/)
|
136
|
+
end
|
137
|
+
|
138
|
+
end
|
86
139
|
end
|
87
140
|
|
88
141
|
end
|
data/spec/excerpt.rb
CHANGED
@@ -23,8 +23,19 @@ describe Poliqarp::Excerpt do
|
|
23
23
|
@excerpt.base_form.should_not == nil
|
24
24
|
end
|
25
25
|
|
26
|
-
it "should
|
27
|
-
@excerpt
|
26
|
+
it "should contain 3 groups in short context" do
|
27
|
+
@excerpt.short_context.size.should == 3
|
28
|
+
end
|
29
|
+
|
30
|
+
it "should allow to add segment group" do
|
31
|
+
@excerpt << [Poliqarp::Segment.new("abc")]
|
32
|
+
end
|
33
|
+
|
34
|
+
|
35
|
+
it "should contain non empty segments in short context" do
|
36
|
+
@excerpt.short_context.flatten.each do |segment|
|
37
|
+
segment.literal.should_not == nil
|
38
|
+
end
|
28
39
|
end
|
29
40
|
|
30
41
|
it "should contain the exact form which it was created for" do
|
@@ -92,4 +103,31 @@ describe Poliqarp::Excerpt do
|
|
92
103
|
@excerpt.author[0].should == "Małgorzata Pamuła"
|
93
104
|
end
|
94
105
|
end
|
106
|
+
|
107
|
+
describe('first result for "kotu" with lemmatization turned on') do
|
108
|
+
before(:all) do
|
109
|
+
@client.lemmata = :all
|
110
|
+
@client.open_corpus(:default)
|
111
|
+
@excerpt = @client.find("kotu")[0]
|
112
|
+
end
|
113
|
+
|
114
|
+
it "should have one lemmata for each segment" do
|
115
|
+
@excerpt.short_context.each do |group|
|
116
|
+
group.each do |segment|
|
117
|
+
segment.lemmata.size.should == 1
|
118
|
+
end
|
119
|
+
end
|
120
|
+
end
|
121
|
+
|
122
|
+
it "should have non-nil lemmata for each segment" do
|
123
|
+
@excerpt.short_context.flatten.each do |segment|
|
124
|
+
segment.lemmata[0].should_not == nil
|
125
|
+
end
|
126
|
+
end
|
127
|
+
|
128
|
+
it "should contain 'kot' as one of the lemmata" do
|
129
|
+
@excerpt.short_context.flatten.
|
130
|
+
any?{|s| s.lemmata[0].base_form == "kot"}.should == true
|
131
|
+
end
|
132
|
+
end
|
95
133
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: apohllo-poliqarpr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Aleksander Pohl
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2008-12-
|
12
|
+
date: 2008-12-20 00:00:00 -08:00
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|
@@ -29,42 +29,9 @@ files:
|
|
29
29
|
- lib/poliqarpr/client.rb
|
30
30
|
- lib/poliqarpr/query_result.rb
|
31
31
|
- lib/poliqarpr/excerpt.rb
|
32
|
+
- lib/poliqarpr/lemmata.rb
|
33
|
+
- lib/poliqarpr/segment.rb
|
32
34
|
- README.txt
|
33
|
-
- corpus/frek.cdf
|
34
|
-
- corpus/frek.poliqarp.base1.image
|
35
|
-
- corpus/frek.poliqarp.corpus.image
|
36
|
-
- corpus/frek.poliqarp.meta-value.offset
|
37
|
-
- corpus/frek.poliqarp.rindex.amb
|
38
|
-
- corpus/frek.poliqarp.rindex.orth.offset
|
39
|
-
- corpus/frek.poliqarp.subpos1.offset
|
40
|
-
- corpus/frek.cfg
|
41
|
-
- corpus/frek.poliqarp.base1.offset
|
42
|
-
- corpus/frek.poliqarp.meta.image
|
43
|
-
- corpus/frek.poliqarp.orth.image
|
44
|
-
- corpus/frek.poliqarp.rindex.amb.offset
|
45
|
-
- corpus/frek.poliqarp.subchunk.image
|
46
|
-
- corpus/frek.poliqarp.subpos2.image
|
47
|
-
- corpus/frek.cfg~
|
48
|
-
- corpus/frek.poliqarp.base2.image
|
49
|
-
- corpus/frek.poliqarp.meta-key.image
|
50
|
-
- corpus/frek.poliqarp.orth.index.alpha
|
51
|
-
- corpus/frek.poliqarp.rindex.disamb
|
52
|
-
- corpus/frek.poliqarp.subchunk.item.ch
|
53
|
-
- corpus/frek.poliqarp.subpos2.offset
|
54
|
-
- corpus/frek.meta.cfg
|
55
|
-
- corpus/frek.poliqarp.base2.offset
|
56
|
-
- corpus/frek.poliqarp.meta-key.offset
|
57
|
-
- corpus/frek.poliqarp.orth.index.atergo
|
58
|
-
- corpus/frek.poliqarp.rindex.disamb.offset
|
59
|
-
- corpus/frek.poliqarp.subchunk.offset
|
60
|
-
- corpus/frek.poliqarp.tag.image
|
61
|
-
- corpus/frek.meta.lisp
|
62
|
-
- corpus/frek.poliqarp.chunk.image
|
63
|
-
- corpus/frek.poliqarp.meta-value.image
|
64
|
-
- corpus/frek.poliqarp.orth.offset
|
65
|
-
- corpus/frek.poliqarp.rindex.orth
|
66
|
-
- corpus/frek.poliqarp.subpos1.image
|
67
|
-
- corpus/frek.poliqarp.tag.offset
|
68
35
|
has_rdoc: true
|
69
36
|
homepage: http://www.apohllo.pl/projekty/poliqarpr
|
70
37
|
post_install_message:
|
data/corpus/frek.cdf
DELETED
data/corpus/frek.cfg
DELETED
@@ -1,100 +0,0 @@
|
|
1
|
-
# Config file format for Oasis release
|
2
|
-
# Config version 1.0
|
3
|
-
|
4
|
-
# The new startup section may contain any command normally accepted by the shell
|
5
|
-
|
6
|
-
[ALIASES]
|
7
|
-
|
8
|
-
masc = m1|m2|m3
|
9
|
-
verb = pact|ppas|winien|praet|bedzie|fin|impt|aglt|ger|imps|inf|pant|pcon
|
10
|
-
noun = subst|depr|xxs|ger|ppron12|ppron3
|
11
|
-
pron = ppron12|ppron3|siebie
|
12
|
-
|
13
|
-
|
14
|
-
[ATTR]
|
15
|
-
|
16
|
-
number = sg pl
|
17
|
-
case = nom gen dat acc inst loc voc
|
18
|
-
gender = m1 m2 m3 f n
|
19
|
-
person = pri sec ter
|
20
|
-
degree = pos comp sup
|
21
|
-
aspect = imperf perf
|
22
|
-
negation = aff neg
|
23
|
-
accommodability = congr rec
|
24
|
-
accentability = akc nakc
|
25
|
-
post-prepositionality = npraep praep
|
26
|
-
agglutination = agl nagl
|
27
|
-
vocalicity = nwok wok
|
28
|
-
|
29
|
-
# Parts of speech no longer need forward declarations, this was inconvenient and ugly.
|
30
|
-
# Also, any attribute may be optional so a declaration such as:
|
31
|
-
# foo = [bar] [froz] fred [wilma]
|
32
|
-
# should no longer cause problems and ctags with such attributes now parse correctly regardless
|
33
|
-
# of presence or absence of any optional attribute
|
34
|
-
|
35
|
-
[POS]
|
36
|
-
|
37
|
-
adja =
|
38
|
-
adjp =
|
39
|
-
conj =
|
40
|
-
interp =
|
41
|
-
pred =
|
42
|
-
xxx =
|
43
|
-
adv = degree
|
44
|
-
imps = aspect
|
45
|
-
inf = aspect
|
46
|
-
pant = aspect
|
47
|
-
pcon = aspect
|
48
|
-
qub = [vocalicity]
|
49
|
-
prep = case [vocalicity]
|
50
|
-
siebie = case
|
51
|
-
subst = number case gender
|
52
|
-
depr = number case gender
|
53
|
-
xxs = number case gender
|
54
|
-
ger = number case gender aspect negation
|
55
|
-
ppron12 = number case gender person [accentability]
|
56
|
-
ppron3 = number case gender person [accentability] [post-prepositionality]
|
57
|
-
num = number case gender [accommodability]
|
58
|
-
adj = number case gender degree
|
59
|
-
pact = number case gender aspect negation
|
60
|
-
ppas = number case gender aspect negation
|
61
|
-
winien = number gender aspect
|
62
|
-
praet = number gender aspect [agglutination]
|
63
|
-
bedzie = number person aspect
|
64
|
-
fin = number person aspect
|
65
|
-
impt = number person aspect
|
66
|
-
aglt = number person aspect vocalicity
|
67
|
-
ign =
|
68
|
-
|
69
|
-
# Named entities replaced old 'special' attributes, name changed mostly because of
|
70
|
-
# unification of 'named-thing' handling code into one named-entity thing
|
71
|
-
# Entity aliasing allows for any existing entity to be seen under different name
|
72
|
-
#
|
73
|
-
# FCQP provides four builtin entities:
|
74
|
-
# entity-current
|
75
|
-
# entity-base
|
76
|
-
# entity-tag
|
77
|
-
# entity-pos
|
78
|
-
|
79
|
-
[NAMED-ENTITY]
|
80
|
-
|
81
|
-
entity-orth = orth
|
82
|
-
entity-base = base
|
83
|
-
entity-tag = tag
|
84
|
-
entity-pos = pos
|
85
|
-
|
86
|
-
# Old 'aliases' for attribute names
|
87
|
-
|
88
|
-
pos = flex
|
89
|
-
number = numb nmb
|
90
|
-
case = cas
|
91
|
-
gender = gnd gend
|
92
|
-
person = per pers
|
93
|
-
degree = deg degr
|
94
|
-
aspect = asp
|
95
|
-
negation = neg
|
96
|
-
accommodability = acco acom acm
|
97
|
-
accentability = acce acen acn
|
98
|
-
post-prepositionality = ppr ppre
|
99
|
-
agglutination = agg aggl
|
100
|
-
vocalicity = vcl
|
data/corpus/frek.cfg~
DELETED
@@ -1,100 +0,0 @@
|
|
1
|
-
# Config file format for Oasis release
|
2
|
-
# Config version 1.0
|
3
|
-
|
4
|
-
# The new startup section may contain any command normally accepted by the shell
|
5
|
-
|
6
|
-
[STARTUP]
|
7
|
-
|
8
|
-
/alias masc = m1 m2 m3
|
9
|
-
/alias verb = pact ppas winien praet bedzie fin impt aglt ger imps inf pant pcon
|
10
|
-
/alias noun = subst depr xxs ger ppron12 ppron3
|
11
|
-
/alias pron = ppron12 ppron3 siebie
|
12
|
-
|
13
|
-
|
14
|
-
[ATTR]
|
15
|
-
|
16
|
-
number = sg pl
|
17
|
-
case = nom gen dat acc inst loc voc
|
18
|
-
gender = m1 m2 m3 f n
|
19
|
-
person = pri sec ter
|
20
|
-
degree = pos comp sup
|
21
|
-
aspect = imperf perf
|
22
|
-
negation = aff neg
|
23
|
-
accommodability = congr rec
|
24
|
-
accentability = akc nakc
|
25
|
-
post-prepositionality = npraep praep
|
26
|
-
agglutination = agl nagl
|
27
|
-
vocalicity = nwok wok
|
28
|
-
|
29
|
-
# Parts of speech no longer need forward declarations, this was inconvenient and ugly.
|
30
|
-
# Also, any attribute may be optional so a declaration such as:
|
31
|
-
# foo = [bar] [froz] fred [wilma]
|
32
|
-
# should no longer cause problems and ctags with such attributes now parse correctly regardless
|
33
|
-
# of presence or absence of any optional attribute
|
34
|
-
|
35
|
-
[POS]
|
36
|
-
|
37
|
-
adja =
|
38
|
-
adjp =
|
39
|
-
conj =
|
40
|
-
interp =
|
41
|
-
pred =
|
42
|
-
xxx =
|
43
|
-
adv = degree
|
44
|
-
imps = aspect
|
45
|
-
inf = aspect
|
46
|
-
pant = aspect
|
47
|
-
pcon = aspect
|
48
|
-
qub = [vocalicity]
|
49
|
-
prep = case [vocalicity]
|
50
|
-
siebie = case
|
51
|
-
subst = number case gender
|
52
|
-
depr = number case gender
|
53
|
-
xxs = number case gender
|
54
|
-
ger = number case gender aspect negation
|
55
|
-
ppron12 = number case gender person [accentability]
|
56
|
-
ppron3 = number case gender person [accentability] [post-prepositionality]
|
57
|
-
num = number case gender [accommodability]
|
58
|
-
adj = number case gender degree
|
59
|
-
pact = number case gender aspect negation
|
60
|
-
ppas = number case gender aspect negation
|
61
|
-
winien = number gender aspect
|
62
|
-
praet = number gender aspect [agglutination]
|
63
|
-
bedzie = number person aspect
|
64
|
-
fin = number person aspect
|
65
|
-
impt = number person aspect
|
66
|
-
aglt = number person aspect vocalicity
|
67
|
-
ign =
|
68
|
-
|
69
|
-
# Named entities replaced old 'special' attributes, name changed mostly because of
|
70
|
-
# unification of 'named-thing' handling code into one named-entity thing
|
71
|
-
# Entity aliasing allows for any existing entity to be seen under different name
|
72
|
-
#
|
73
|
-
# FCQP provides four builtin entities:
|
74
|
-
# entity-current
|
75
|
-
# entity-base
|
76
|
-
# entity-tag
|
77
|
-
# entity-pos
|
78
|
-
|
79
|
-
[NAMED-ENTITY]
|
80
|
-
|
81
|
-
entity-orth = orth
|
82
|
-
entity-base = base
|
83
|
-
entity-tag = tag
|
84
|
-
entity-pos = pos
|
85
|
-
|
86
|
-
# Old 'aliases' for attribute names
|
87
|
-
|
88
|
-
pos = flex
|
89
|
-
number = numb nmb
|
90
|
-
case = cas
|
91
|
-
gender = gnd gend
|
92
|
-
person = per pers
|
93
|
-
degree = deg degr
|
94
|
-
aspect = asp
|
95
|
-
negation = neg
|
96
|
-
accommodability = acco acom acm
|
97
|
-
accentability = acce acen acn
|
98
|
-
post-prepositionality = ppr ppre
|
99
|
-
agglutination = agg aggl
|
100
|
-
vocalicity = vcl
|
data/corpus/frek.meta.cfg
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
S sample
|
data/corpus/frek.meta.lisp
DELETED
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|