apohllo-poliqarpr 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. data/README.txt +30 -0
  2. data/changelog.txt +8 -0
  3. data/lib/poliqarpr/client.rb +151 -27
  4. data/lib/poliqarpr/excerpt.rb +16 -0
  5. data/lib/poliqarpr/lemmata.rb +11 -0
  6. data/lib/poliqarpr/query_result.rb +14 -0
  7. data/lib/poliqarpr/segment.rb +23 -0
  8. data/lib/poliqarpr.rb +7 -1
  9. data/poliqarpr.gemspec +4 -39
  10. data/spec/client.rb +53 -0
  11. data/spec/excerpt.rb +40 -2
  12. metadata +4 -37
  13. data/corpus/frek.cdf +0 -4
  14. data/corpus/frek.cfg +0 -100
  15. data/corpus/frek.cfg~ +0 -100
  16. data/corpus/frek.meta.cfg +0 -1
  17. data/corpus/frek.meta.lisp +0 -4
  18. data/corpus/frek.poliqarp.base1.image +0 -0
  19. data/corpus/frek.poliqarp.base1.offset +0 -0
  20. data/corpus/frek.poliqarp.base2.image +0 -0
  21. data/corpus/frek.poliqarp.base2.offset +0 -0
  22. data/corpus/frek.poliqarp.chunk.image +0 -0
  23. data/corpus/frek.poliqarp.corpus.image +0 -0
  24. data/corpus/frek.poliqarp.meta-key.image +0 -0
  25. data/corpus/frek.poliqarp.meta-key.offset +0 -0
  26. data/corpus/frek.poliqarp.meta-value.image +0 -0
  27. data/corpus/frek.poliqarp.meta-value.offset +0 -0
  28. data/corpus/frek.poliqarp.meta.image +0 -0
  29. data/corpus/frek.poliqarp.orth.image +0 -0
  30. data/corpus/frek.poliqarp.orth.index.alpha +0 -0
  31. data/corpus/frek.poliqarp.orth.index.atergo +0 -0
  32. data/corpus/frek.poliqarp.orth.offset +0 -0
  33. data/corpus/frek.poliqarp.rindex.amb +0 -0
  34. data/corpus/frek.poliqarp.rindex.amb.offset +0 -0
  35. data/corpus/frek.poliqarp.rindex.disamb +0 -0
  36. data/corpus/frek.poliqarp.rindex.disamb.offset +0 -0
  37. data/corpus/frek.poliqarp.rindex.orth +0 -0
  38. data/corpus/frek.poliqarp.rindex.orth.offset +0 -0
  39. data/corpus/frek.poliqarp.subchunk.image +0 -0
  40. data/corpus/frek.poliqarp.subchunk.item.ch +0 -0
  41. data/corpus/frek.poliqarp.subchunk.offset +0 -0
  42. data/corpus/frek.poliqarp.subpos1.image +0 -0
  43. data/corpus/frek.poliqarp.subpos1.offset +0 -0
  44. data/corpus/frek.poliqarp.subpos2.image +0 -0
  45. data/corpus/frek.poliqarp.subpos2.offset +0 -0
  46. data/corpus/frek.poliqarp.tag.image +0 -0
  47. data/corpus/frek.poliqarp.tag.offset +0 -0
data/README.txt CHANGED
@@ -9,6 +9,8 @@ Poliqarpr is Ruby client for Poliqarp server.
9
9
 
10
10
  == FEATURES/PROBLEMS:
11
11
 
12
+ * built-in pagination of query results
13
+ * support for lemmatization
12
14
  * asynchronous communication is implemented in synchronous manner
13
15
  * only partial implementation of server protocol
14
16
 
@@ -38,6 +40,34 @@ Then you can type:
38
40
 
39
41
  * sudo gem install apohllo-poliqarpr
40
42
 
43
+ You can install the optional default corpus (warning: it is distributed under
44
+ different license!):
45
+
46
+ * sudo gem install apohllo-poliqarpr-corpus
47
+
48
+ == BASIC USAGE:
49
+
50
+ (You need the poliqarpr-corpus to be installed for this to work. See the last
51
+ step of installation process).
52
+
53
+ Require the gem:
54
+
55
+ require 'poliqarpr'
56
+
57
+ Create the server client and open default corpus
58
+
59
+ client = Poliqarp::Client.new
60
+ client.open_corpus :default
61
+
62
+ Query the corpus for given segment
63
+
64
+ result = client.find("kot")
65
+ result[0].to_s
66
+
67
+ Remember to close the client on exit
68
+
69
+ client.close
70
+
41
71
 
42
72
  == LICENSE:
43
73
 
data/changelog.txt CHANGED
@@ -1,3 +1,11 @@
1
+ 0.0.3
2
+ - the license of the corpus included
3
+ - client rdoc documentation
4
+ - support for lemmata retrieval
5
+ - excerpt now contains segments instead of strings
6
+ - buffer size setter
7
+ - default corpus moved to separate plugin (sudo gem install apohllo-poliqarpr-corpus)
8
+
1
9
  0.0.2
2
10
  - query result is full blown class
3
11
  - source divided into client, excerpt and query result
@@ -1,6 +1,10 @@
1
+ require 'socket'
1
2
  module Poliqarp
3
+ # Author:: Aleksander Pohl (mailto:apohllo@o2.pl)
4
+ # License:: MIT License
5
+ #
6
+ # This class is the implementation of the Poliqarp server client.
2
7
  class Client
3
- DEFAULT_CORPUS = File.join(File.expand_path(File.dirname(__FILE__)),"..", "..", "corpus", "frek")
4
8
  ERRORS = {
5
9
  1 => "Incorrect number of arguments",
6
10
  3 => "No session opened",
@@ -20,8 +24,22 @@ module Poliqarp
20
24
  19 => "Invalid session option value",
21
25
  20 => "Invalid sorting criteria"
22
26
  }
27
+ GROUPS = [:left_context, :left_match, :right_match, :right_context]
28
+
29
+ # If debug is turned on, the communication between server and client
30
+ # is logged to standard output.
23
31
  attr_writer :debug
24
32
 
33
+ # The size of the buffer is the maximum number of excerpts which
34
+ # are returned for single query.
35
+ attr_writer :buffer_size
36
+
37
+ # Creates new poliqarp server client.
38
+ #
39
+ # Parameters:
40
+ # * +session_name+ the name of the client session. Defaults to "RUBY".
41
+ # * +debug+ if set to true, all messages sent and received from server
42
+ # are printed to standard output. Defaults to false.
25
43
  def initialize(session_name="RUBY", debug=false)
26
44
  @session_name = session_name
27
45
  @left_context = 5
@@ -31,9 +49,22 @@ module Poliqarp
31
49
  new_session
32
50
  end
33
51
 
34
- def new_session
52
+ # A hint about uninstalled default corpus gem
53
+ def self.const_missing(const)
54
+ if const.to_s =~ /DEFAULT_CORPUS/
55
+ raise "You need to install 'apohllo-poliqarpr-corpus' to use the default corpus"
56
+ end
57
+ super
58
+ end
59
+
60
+ # Creates new session for the client with the name given in constructor.
61
+ # If the session was already opened, it is closed.
62
+ #
63
+ # Parameters:
64
+ # * +port+ - the port on which the poliqarpd server is accepting connections (defaults to 4567)
65
+ def new_session(port=4567)
35
66
  close if @session
36
- @socket = TCPSocket.new("localhost",4567)
67
+ @socket = TCPSocket.new("localhost",port)
37
68
  talk "MAKE-SESSION #{@session_name}"
38
69
  rcv_sync
39
70
  talk("BUFFER-RESIZE #{@buffer_size}")
@@ -43,11 +74,7 @@ module Poliqarp
43
74
  self.lemmata = {}
44
75
  end
45
76
 
46
- def talk(msg)
47
- puts msg if @debug
48
- @socket.puts(msg)
49
- end
50
-
77
+ # Closes the opened connection to the poliqarpd server.
51
78
  def close
52
79
  #talk "CLOSE"
53
80
  #rcv_sync
@@ -57,43 +84,88 @@ module Poliqarp
57
84
  @session = false
58
85
  end
59
86
 
87
+ # Sets the size of the left short context. It must be > 0
88
+ #
89
+ # The size of the left short context is the number
90
+ # of segments displayed in the found excerpts left to the
91
+ # matched segment(s).
60
92
  def left_context=(value)
61
- if value.is_a? Fixnum
93
+ if correct_context_value?(value)
62
94
  talk "SET left-context-width #{value}"
63
95
  result = rcv_sync
64
96
  @left_context = value if result =~ /^R OK/
97
+ else
98
+ raise "Invalid argument: #{value}. It must be fixnum greater than 0."
65
99
  end
66
100
  end
67
101
 
102
+ # Sets the size of the right short context. It must be > 0
103
+ #
104
+ # The size of the right short context is the number
105
+ # of segments displayed in the found excerpts right to the
106
+ # matched segment(s).
68
107
  def right_context=(value)
69
- if value.is_a? Fixnum
108
+ if correct_context_value?(value)
70
109
  talk "SET right-context-width #{value}"
71
110
  result = rcv_sync
72
111
  @right_context = value if result =~ /^R OK/
112
+ else
113
+ raise "Invalid argument: #{value}. It must be fixnum greater than 0."
73
114
  end
74
115
  end
75
116
 
117
+ # Sets the tags' flags. There are four groups of segments
118
+ # which the flags apply for:
119
+ # * +left_context+
120
+ # * +left_match+
121
+ # * +right_match+
122
+ # * +right_context+
123
+ #
124
+ # If the flag for given group is set to true, all segments
125
+ # in the group are annotated with grammatical tags. E.g.:
126
+ # c.find("kot")
127
+ # ...
128
+ # "kot" tags: "subst:sg:nom:m2"
129
+ #
130
+ # You can pass :all to turn on flags for all groups
76
131
  def tags=(options={})
132
+ options = set_all_flags if options == :all
133
+ @tag_flags = options
77
134
  flags = ""
78
- [:left_context_tags, :leftM_tags,
79
- :rightM_tags, :right_context_tags].each do |flag|
135
+ GROUPS.each do |flag|
80
136
  flags << (options[flag] ? "1" : "0")
81
137
  end
82
138
  talk "SET retrieve-tags #{flags}"
83
139
  rcv_sync
84
140
  end
85
141
 
142
+ # Sets the lemmatas' flags. There are four groups of segments
143
+ # which the flags apply for:
144
+ # * +left_context+
145
+ # * +left_match+
146
+ # * +right_match+
147
+ # * +right_context+
148
+ #
149
+ # If the flag for given group is set to true, all segments
150
+ # in the group are returned with the base form of the lemmata. E.g.:
151
+ # c.find("kotu")
152
+ # ...
153
+ # "kotu" base_form: "kot"
154
+ #
155
+ # You can pass :all to turn on flags for all groups
86
156
  def lemmata=(options={})
157
+ options = set_all_flags if options == :all
158
+ @lemmata_flags = options
87
159
  flags = ""
88
- [:left_context_lemmata, :leftM_lemmata,
89
- :rightM_lemmata, :right_context_lemmata].each do |flag|
160
+ GROUPS.each do |flag|
90
161
  flags << (options[flag] ? "1" : "0")
91
162
  end
92
163
  talk "SET retrieve-lemmata #{flags}"
93
164
  rcv_sync
94
165
  end
95
166
 
96
-
167
+ # Opens the corpus given as +path+. To open the default
168
+ # corpus pass +:default+ as the argument.
97
169
  def open_corpus(path)
98
170
  if path == :default
99
171
  open_corpus(DEFAULT_CORPUS)
@@ -104,6 +176,19 @@ module Poliqarp
104
176
  end
105
177
  end
106
178
 
179
+ # Send the query to the opened corpus.
180
+ #
181
+ # Options:
182
+ # * +index+ the index of the (only one) result to be returned. The index is relative
183
+ # to the beginning of the query result. In normal case you should query the
184
+ # corpus without specifying the index, to see what results are returned.
185
+ # Then you can use the index and the same query to retrieve one result.
186
+ # The pair (query, index) is a kind of unique identifier of the excerpt.
187
+ # * +page_size+ the size of the page of results. If the page size is 0, then
188
+ # all results are returned on one page. It is ignored if the +index+ option
189
+ # is present. Defaults to 0.
190
+ # * +page_index+ the index of the page of results (the first page has index 1, not 0).
191
+ # It is ignored if the +index+ option is present. Defaults to 1.
107
192
  def find(query,options={})
108
193
  if options[:index]
109
194
  find_one(query, options[:index])
@@ -114,10 +199,13 @@ module Poliqarp
114
199
 
115
200
  alias query find
116
201
 
202
+ # Returns the number of results for given query.
117
203
  def count(query)
118
204
  count_results(make_query(query))
119
205
  end
120
206
 
207
+ # Returns the long context of the excerpt which is identified by
208
+ # given (query, index) pair.
121
209
  def context(query,index)
122
210
  make_query(query)
123
211
  result = []
@@ -135,6 +223,8 @@ module Poliqarp
135
223
  result
136
224
  end
137
225
 
226
+ # Returns the metadata of the excerpt which is identified by
227
+ # given (query, index) pair.
138
228
  def metadata(query, index)
139
229
  make_query(query)
140
230
  result = {}
@@ -152,6 +242,13 @@ module Poliqarp
152
242
  end
153
243
 
154
244
  protected
245
+ # Sends a message directly to the server
246
+ # * +msg+ the message to send
247
+ def talk(msg)
248
+ puts msg if @debug
249
+ @socket.puts(msg)
250
+ end
251
+
155
252
  def find_many(query, options)
156
253
  page_size = (options[:page_size] || 0)
157
254
  page_index = (options[:page_index] || 1)
@@ -193,29 +290,45 @@ protected
193
290
  end
194
291
 
195
292
  # Fetches one result of the query
196
- #
293
+ ##
197
294
  # MAKE-QUERY and GET-RESULTS must be called on server before
198
295
  # this method is called
199
296
  def fetch_result(index, query)
200
297
  result = Excerpt.new(index, self, query)
201
- # left_context
202
- result << read_segments
203
- # matched query
204
- result << read_segments
205
- # right context
206
- result << read_segments
298
+ result << read_segments(:left_context)
299
+ result << read_segments(:left_match)
300
+ # XXX
301
+ #result << read_segments(:right_match)
302
+ result << read_segments(:right_context)
207
303
 
208
304
  result
209
305
  end
210
306
 
211
- def read_segments
212
- answer = rcv_sync
213
- size = answer.match(/\d+/)[0].to_i
307
+ def read_segments(group)
308
+ size = get_number(rcv_sync)
214
309
  segments = []
215
310
  size.times do |segment_index|
216
- segments << read_word
311
+ segment = Segment.new(read_word)
312
+ segments << segment
313
+ if @lemmata_flags[group] || @tag_flags[group]
314
+ lemmata_size = get_number(rcv_sync)
315
+ lemmata_size.times do |lemmata_index|
316
+ lemmata = Lemmata.new()
317
+ if @lemmata_flags[group]
318
+ lemmata.base_form = read_word
319
+ end
320
+ if @tag_flags[group]
321
+ read_word
322
+ end
323
+ segment.lemmata << lemmata
324
+ end
325
+ end
217
326
  end
218
- segments.join("")
327
+ segments
328
+ end
329
+
330
+ def get_number(str)
331
+ str.match(/\d+/)[0].to_i
219
332
  end
220
333
 
221
334
  def count_results(answer)
@@ -268,5 +381,16 @@ protected
268
381
  end until line =~ /^M/
269
382
  line
270
383
  end
384
+
385
+ private
386
+ def set_all_flags
387
+ options = {}
388
+ GROUPS.each{|g| options[g] = true}
389
+ options
390
+ end
391
+
392
+ def correct_context_value?(value)
393
+ value.is_a?(Fixnum) && value > 0
394
+ end
271
395
  end
272
396
  end
@@ -1,4 +1,15 @@
1
1
  module Poliqarp
2
+ # Author:: Aleksander Pohl
3
+ # License:: MIT License
4
+ #
5
+ # The excerpt class is used to store single result of the query,
6
+ # i.e. the excerpt of the corpus which contains the words which
7
+ # the corpus was queried for.
8
+ #
9
+ # The excerpt is divided into groups, which contain segments,
10
+ # into which the texts in the corpus were divided.
11
+ # The first group is the left context, the second -- the matched
12
+ # query, and the last -- the right context.
2
13
  class Excerpt
3
14
  attr_reader :index, :base_form, :short_context
4
15
 
@@ -9,11 +20,13 @@ module Poliqarp
9
20
  @short_context = []
10
21
  end
11
22
 
23
+ # Adds segment group to the excerpt
12
24
  def <<(value)
13
25
  @short_context << value
14
26
  end
15
27
 
16
28
 
29
+ # Returns the matched query as string
17
30
  def word
18
31
  #@short_context[0].split(/\s+/)[-1]
19
32
  @short_context[1].to_s
@@ -21,10 +34,13 @@ module Poliqarp
21
34
 
22
35
  alias inflected_form word
23
36
 
37
+ # The string representation of the excerpt is the short
38
+ # context of the query.
24
39
  def to_s
25
40
  @short_context.join("")
26
41
  end
27
42
 
43
+ # Returns the long context of the query.
28
44
  def context
29
45
  return @context unless @context.nil?
30
46
  @context = @client.context(@base_form, @index)
@@ -0,0 +1,11 @@
1
+ module Poliqarp
2
+ # Author:: Aleksander Pohl (mailto:apohllo@o2.pl)
3
+ # License:: MIT License
4
+ #
5
+ # The lemmata contains the base form of the segment
6
+ class Lemmata
7
+ attr_accessor :base_form
8
+ def initialize()
9
+ end
10
+ end
11
+ end
@@ -1,4 +1,10 @@
1
1
  module Poliqarp
2
+ # Author:: Aleksander Pohl (mailto:apohllo@o2.pl)
3
+ # License:: MIT License
4
+ #
5
+ # The query result class is used to paginate results of the
6
+ # query. Each query result has information about its context
7
+ # (the next and previous page).
2
8
  class QueryResult
3
9
  include Enumerable
4
10
 
@@ -13,10 +19,12 @@ module Poliqarp
13
19
  @excerpts = []
14
20
  end
15
21
 
22
+ # Adds excerpt to the query result
16
23
  def <<(excerpt)
17
24
  @excerpts << excerpt
18
25
  end
19
26
 
27
+ # Allows iterating over the excerpts stored in the query result
20
28
  def each
21
29
  @excerpts.each{|e| yield e}
22
30
  end
@@ -27,16 +35,20 @@ module Poliqarp
27
35
  end
28
36
  end
29
37
 
38
+ # Returns excerpt with given index.
30
39
  def [](index)
31
40
  @excerpts[index]
32
41
  end
33
42
 
43
+ # Two query results are equal iff their page number, page count,
44
+ # query and page size are equal.
34
45
  def ==(other)
35
46
  return false unless other.is_a? QueryResult
36
47
  @page == other.page && @page_count == other.page_count &&
37
48
  @query == other.query && @page_size == other.page_size
38
49
  end
39
50
 
51
+ # Returns the previous page of the query result
40
52
  def previous_page
41
53
  if @page > 1
42
54
  @client.find(@query, :page_size => @page_size,
@@ -44,6 +56,7 @@ module Poliqarp
44
56
  end
45
57
  end
46
58
 
59
+ # Returns the next page of the query result
47
60
  def next_page
48
61
  if @page < @page_count
49
62
  @client.find(@query, :page_size => @page_size,
@@ -51,6 +64,7 @@ module Poliqarp
51
64
  end
52
65
  end
53
66
 
67
+ # Returns the number of excerpts stored in this page (query result)
54
68
  def size
55
69
  @excerpts.size
56
70
  end
@@ -0,0 +1,23 @@
1
+ module Poliqarp
2
+ # Author:: Aleksander Pohl (mailto:apohllo@o2.pl)
3
+ # License:: MIT LICENSE
4
+ #
5
+ # The segment is the smallest meaningful part of the text.
6
+ # It may contain many lemmata, since the segments are sometimes
7
+ # not disambiguated.
8
+ class Segment
9
+ attr_reader :literal, :lemmata
10
+
11
+ # Creates new segment. The specified argument is the literal
12
+ # (as found in the text) representation of the segment.
13
+ def initialize(literal)
14
+ @literal = literal
15
+ @lemmata = []
16
+ end
17
+
18
+ # Returns the segment literal
19
+ def to_s
20
+ @literal
21
+ end
22
+ end
23
+ end
data/lib/poliqarpr.rb CHANGED
@@ -2,4 +2,10 @@ path = File.join(File.dirname(__FILE__), 'poliqarpr')
2
2
  require File.join(path, 'client')
3
3
  require File.join(path, 'query_result')
4
4
  require File.join(path, 'excerpt')
5
-
5
+ require File.join(path, 'segment')
6
+ require File.join(path, 'lemmata')
7
+ begin
8
+ require 'poliqarpr-corpus'
9
+ rescue LoadError
10
+ # Do nothing, since the default corpus is optional
11
+ end
data/poliqarpr.gemspec CHANGED
@@ -1,9 +1,7 @@
1
- #require 'rake'
2
-
3
1
  Gem::Specification.new do |s|
4
2
  s.name = "poliqarpr"
5
- s.version = "0.0.2"
6
- s.date = "2008-12-15"
3
+ s.version = "0.0.3"
4
+ s.date = "2008-12-20"
7
5
  s.summary = "Ruby client for Poliqarp"
8
6
  s.email = "apohllo@o2.pl"
9
7
  s.homepage = "http://www.apohllo.pl/projekty/poliqarpr"
@@ -14,42 +12,9 @@ Gem::Specification.new do |s|
14
12
  "lib/poliqarpr/client.rb",
15
13
  "lib/poliqarpr/query_result.rb",
16
14
  "lib/poliqarpr/excerpt.rb",
15
+ "lib/poliqarpr/lemmata.rb",
16
+ "lib/poliqarpr/segment.rb",
17
17
  "README.txt",
18
- "corpus/frek.cdf",
19
- "corpus/frek.poliqarp.base1.image",
20
- "corpus/frek.poliqarp.corpus.image",
21
- "corpus/frek.poliqarp.meta-value.offset",
22
- "corpus/frek.poliqarp.rindex.amb",
23
- "corpus/frek.poliqarp.rindex.orth.offset",
24
- "corpus/frek.poliqarp.subpos1.offset",
25
- "corpus/frek.cfg",
26
- "corpus/frek.poliqarp.base1.offset",
27
- "corpus/frek.poliqarp.meta.image",
28
- "corpus/frek.poliqarp.orth.image",
29
- "corpus/frek.poliqarp.rindex.amb.offset",
30
- "corpus/frek.poliqarp.subchunk.image",
31
- "corpus/frek.poliqarp.subpos2.image",
32
- "corpus/frek.cfg~",
33
- "corpus/frek.poliqarp.base2.image",
34
- "corpus/frek.poliqarp.meta-key.image",
35
- "corpus/frek.poliqarp.orth.index.alpha",
36
- "corpus/frek.poliqarp.rindex.disamb",
37
- "corpus/frek.poliqarp.subchunk.item.ch",
38
- "corpus/frek.poliqarp.subpos2.offset",
39
- "corpus/frek.meta.cfg",
40
- "corpus/frek.poliqarp.base2.offset",
41
- "corpus/frek.poliqarp.meta-key.offset",
42
- "corpus/frek.poliqarp.orth.index.atergo",
43
- "corpus/frek.poliqarp.rindex.disamb.offset",
44
- "corpus/frek.poliqarp.subchunk.offset",
45
- "corpus/frek.poliqarp.tag.image",
46
- "corpus/frek.meta.lisp",
47
- "corpus/frek.poliqarp.chunk.image",
48
- "corpus/frek.poliqarp.meta-value.image",
49
- "corpus/frek.poliqarp.orth.offset",
50
- "corpus/frek.poliqarp.rindex.orth",
51
- "corpus/frek.poliqarp.subpos1.image",
52
- "corpus/frek.poliqarp.tag.offset"
53
18
  ]
54
19
  s.test_files = [
55
20
  "spec/client.rb",
data/spec/client.rb CHANGED
@@ -29,6 +29,38 @@ describe Poliqarp::Client do
29
29
  @client.close
30
30
  end
31
31
 
32
+ it "should allow to set the right context size" do
33
+ @client.right_context = 5
34
+ end
35
+
36
+ it "should raise error if the size of right context is not number" do
37
+ (proc do
38
+ @client.right_context = "a"
39
+ end).should raise_error(RuntimeError)
40
+ end
41
+
42
+ it "should rais error if the size of right context is less or equal 0" do
43
+ (proc do
44
+ @client.right_context = 0
45
+ end).should raise_error(RuntimeError)
46
+ end
47
+
48
+ it "should allow to set the left context size" do
49
+ @client.right_context = 5
50
+ end
51
+
52
+ it "should raise error if the size of left context is not number" do
53
+ (lambda do
54
+ @client.left_context = "a"
55
+ end).should raise_error(RuntimeError)
56
+ end
57
+
58
+ it "should rais error if the size of left context is less or equal 0" do
59
+ (lambda do
60
+ @client.left_context = 0
61
+ end).should raise_error(RuntimeError)
62
+ end
63
+
32
64
  it "should allow to find 'kot'" do
33
65
  @client.find("kot").size.should_not == 0
34
66
  end
@@ -83,6 +115,27 @@ describe Poliqarp::Client do
83
115
  @result.to_s.should == @client.find("nachalny")[0].to_s
84
116
  end
85
117
  end
118
+
119
+ describe("(with lemmata flags set to true)") do
120
+ before(:all) do
121
+ @client.lemmata = {:left_context => true, :right_context => true,
122
+ :left_match => true, :right_match => true}
123
+ end
124
+
125
+ it "should allow to find 'kotu'" do
126
+ @client.find("kotu").size.should_not == 0
127
+ end
128
+
129
+ it "should contain 'kotu' in query result for 'kotu'" do
130
+ @client.find("kotu")[0].to_s.should match(/\bkotu\b/)
131
+ end
132
+
133
+ it "should contain 'kot' in lemmatized query result for 'kotu'" do
134
+ @client.find("kotu")[0].short_context.flatten.
135
+ map{|e| e.lemmata[0].base_form}.join(" ").should match(/\bkot\b/)
136
+ end
137
+
138
+ end
86
139
  end
87
140
 
88
141
  end
data/spec/excerpt.rb CHANGED
@@ -23,8 +23,19 @@ describe Poliqarp::Excerpt do
23
23
  @excerpt.base_form.should_not == nil
24
24
  end
25
25
 
26
- it "should allow to add short context" do
27
- @excerpt << "abc"
26
+ it "should contain 3 groups in short context" do
27
+ @excerpt.short_context.size.should == 3
28
+ end
29
+
30
+ it "should allow to add segment group" do
31
+ @excerpt << [Poliqarp::Segment.new("abc")]
32
+ end
33
+
34
+
35
+ it "should contain non empty segments in short context" do
36
+ @excerpt.short_context.flatten.each do |segment|
37
+ segment.literal.should_not == nil
38
+ end
28
39
  end
29
40
 
30
41
  it "should contain the exact form which it was created for" do
@@ -92,4 +103,31 @@ describe Poliqarp::Excerpt do
92
103
  @excerpt.author[0].should == "Małgorzata Pamuła"
93
104
  end
94
105
  end
106
+
107
+ describe('first result for "kotu" with lemmatization turned on') do
108
+ before(:all) do
109
+ @client.lemmata = :all
110
+ @client.open_corpus(:default)
111
+ @excerpt = @client.find("kotu")[0]
112
+ end
113
+
114
+ it "should have one lemmata for each segment" do
115
+ @excerpt.short_context.each do |group|
116
+ group.each do |segment|
117
+ segment.lemmata.size.should == 1
118
+ end
119
+ end
120
+ end
121
+
122
+ it "should have non-nil lemmata for each segment" do
123
+ @excerpt.short_context.flatten.each do |segment|
124
+ segment.lemmata[0].should_not == nil
125
+ end
126
+ end
127
+
128
+ it "should contain 'kot' as one of the lemmata" do
129
+ @excerpt.short_context.flatten.
130
+ any?{|s| s.lemmata[0].base_form == "kot"}.should == true
131
+ end
132
+ end
95
133
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: apohllo-poliqarpr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Aleksander Pohl
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-12-15 00:00:00 -08:00
12
+ date: 2008-12-20 00:00:00 -08:00
13
13
  default_executable:
14
14
  dependencies: []
15
15
 
@@ -29,42 +29,9 @@ files:
29
29
  - lib/poliqarpr/client.rb
30
30
  - lib/poliqarpr/query_result.rb
31
31
  - lib/poliqarpr/excerpt.rb
32
+ - lib/poliqarpr/lemmata.rb
33
+ - lib/poliqarpr/segment.rb
32
34
  - README.txt
33
- - corpus/frek.cdf
34
- - corpus/frek.poliqarp.base1.image
35
- - corpus/frek.poliqarp.corpus.image
36
- - corpus/frek.poliqarp.meta-value.offset
37
- - corpus/frek.poliqarp.rindex.amb
38
- - corpus/frek.poliqarp.rindex.orth.offset
39
- - corpus/frek.poliqarp.subpos1.offset
40
- - corpus/frek.cfg
41
- - corpus/frek.poliqarp.base1.offset
42
- - corpus/frek.poliqarp.meta.image
43
- - corpus/frek.poliqarp.orth.image
44
- - corpus/frek.poliqarp.rindex.amb.offset
45
- - corpus/frek.poliqarp.subchunk.image
46
- - corpus/frek.poliqarp.subpos2.image
47
- - corpus/frek.cfg~
48
- - corpus/frek.poliqarp.base2.image
49
- - corpus/frek.poliqarp.meta-key.image
50
- - corpus/frek.poliqarp.orth.index.alpha
51
- - corpus/frek.poliqarp.rindex.disamb
52
- - corpus/frek.poliqarp.subchunk.item.ch
53
- - corpus/frek.poliqarp.subpos2.offset
54
- - corpus/frek.meta.cfg
55
- - corpus/frek.poliqarp.base2.offset
56
- - corpus/frek.poliqarp.meta-key.offset
57
- - corpus/frek.poliqarp.orth.index.atergo
58
- - corpus/frek.poliqarp.rindex.disamb.offset
59
- - corpus/frek.poliqarp.subchunk.offset
60
- - corpus/frek.poliqarp.tag.image
61
- - corpus/frek.meta.lisp
62
- - corpus/frek.poliqarp.chunk.image
63
- - corpus/frek.poliqarp.meta-value.image
64
- - corpus/frek.poliqarp.orth.offset
65
- - corpus/frek.poliqarp.rindex.orth
66
- - corpus/frek.poliqarp.subpos1.image
67
- - corpus/frek.poliqarp.tag.offset
68
35
  has_rdoc: true
69
36
  homepage: http://www.apohllo.pl/projekty/poliqarpr
70
37
  post_install_message:
data/corpus/frek.cdf DELETED
@@ -1,4 +0,0 @@
1
- version = 1
2
- endianness = little-endian
3
- indices = oda
4
- index-granularity = 1024
data/corpus/frek.cfg DELETED
@@ -1,100 +0,0 @@
1
- # Config file format for Oasis release
2
- # Config version 1.0
3
-
4
- # The new startup section may contain any command normally accepted by the shell
5
-
6
- [ALIASES]
7
-
8
- masc = m1|m2|m3
9
- verb = pact|ppas|winien|praet|bedzie|fin|impt|aglt|ger|imps|inf|pant|pcon
10
- noun = subst|depr|xxs|ger|ppron12|ppron3
11
- pron = ppron12|ppron3|siebie
12
-
13
-
14
- [ATTR]
15
-
16
- number = sg pl
17
- case = nom gen dat acc inst loc voc
18
- gender = m1 m2 m3 f n
19
- person = pri sec ter
20
- degree = pos comp sup
21
- aspect = imperf perf
22
- negation = aff neg
23
- accommodability = congr rec
24
- accentability = akc nakc
25
- post-prepositionality = npraep praep
26
- agglutination = agl nagl
27
- vocalicity = nwok wok
28
-
29
- # Parts of speech no longer need forward declarations, this was inconvenient and ugly.
30
- # Also, any attribute may be optional so a declaration such as:
31
- # foo = [bar] [froz] fred [wilma]
32
- # should no longer cause problems and ctags with such attributes now parse correctly regardless
33
- # of presence or absence of any optional attribute
34
-
35
- [POS]
36
-
37
- adja =
38
- adjp =
39
- conj =
40
- interp =
41
- pred =
42
- xxx =
43
- adv = degree
44
- imps = aspect
45
- inf = aspect
46
- pant = aspect
47
- pcon = aspect
48
- qub = [vocalicity]
49
- prep = case [vocalicity]
50
- siebie = case
51
- subst = number case gender
52
- depr = number case gender
53
- xxs = number case gender
54
- ger = number case gender aspect negation
55
- ppron12 = number case gender person [accentability]
56
- ppron3 = number case gender person [accentability] [post-prepositionality]
57
- num = number case gender [accommodability]
58
- adj = number case gender degree
59
- pact = number case gender aspect negation
60
- ppas = number case gender aspect negation
61
- winien = number gender aspect
62
- praet = number gender aspect [agglutination]
63
- bedzie = number person aspect
64
- fin = number person aspect
65
- impt = number person aspect
66
- aglt = number person aspect vocalicity
67
- ign =
68
-
69
- # Named entities replaced old 'special' attributes, name changed mostly because of
70
- # unification of 'named-thing' handling code into one named-entity thing
71
- # Entity aliasing allows for any existing entity to be seen under different name
72
- #
73
- # FCQP provides four builtin entities:
74
- # entity-current
75
- # entity-base
76
- # entity-tag
77
- # entity-pos
78
-
79
- [NAMED-ENTITY]
80
-
81
- entity-orth = orth
82
- entity-base = base
83
- entity-tag = tag
84
- entity-pos = pos
85
-
86
- # Old 'aliases' for attribute names
87
-
88
- pos = flex
89
- number = numb nmb
90
- case = cas
91
- gender = gnd gend
92
- person = per pers
93
- degree = deg degr
94
- aspect = asp
95
- negation = neg
96
- accommodability = acco acom acm
97
- accentability = acce acen acn
98
- post-prepositionality = ppr ppre
99
- agglutination = agg aggl
100
- vocalicity = vcl
data/corpus/frek.cfg~ DELETED
@@ -1,100 +0,0 @@
1
- # Config file format for Oasis release
2
- # Config version 1.0
3
-
4
- # The new startup section may contain any command normally accepted by the shell
5
-
6
- [STARTUP]
7
-
8
- /alias masc = m1 m2 m3
9
- /alias verb = pact ppas winien praet bedzie fin impt aglt ger imps inf pant pcon
10
- /alias noun = subst depr xxs ger ppron12 ppron3
11
- /alias pron = ppron12 ppron3 siebie
12
-
13
-
14
- [ATTR]
15
-
16
- number = sg pl
17
- case = nom gen dat acc inst loc voc
18
- gender = m1 m2 m3 f n
19
- person = pri sec ter
20
- degree = pos comp sup
21
- aspect = imperf perf
22
- negation = aff neg
23
- accommodability = congr rec
24
- accentability = akc nakc
25
- post-prepositionality = npraep praep
26
- agglutination = agl nagl
27
- vocalicity = nwok wok
28
-
29
- # Parts of speech no longer need forward declarations, this was inconvenient and ugly.
30
- # Also, any attribute may be optional so a declaration such as:
31
- # foo = [bar] [froz] fred [wilma]
32
- # should no longer cause problems and ctags with such attributes now parse correctly regardless
33
- # of presence or absence of any optional attribute
34
-
35
- [POS]
36
-
37
- adja =
38
- adjp =
39
- conj =
40
- interp =
41
- pred =
42
- xxx =
43
- adv = degree
44
- imps = aspect
45
- inf = aspect
46
- pant = aspect
47
- pcon = aspect
48
- qub = [vocalicity]
49
- prep = case [vocalicity]
50
- siebie = case
51
- subst = number case gender
52
- depr = number case gender
53
- xxs = number case gender
54
- ger = number case gender aspect negation
55
- ppron12 = number case gender person [accentability]
56
- ppron3 = number case gender person [accentability] [post-prepositionality]
57
- num = number case gender [accommodability]
58
- adj = number case gender degree
59
- pact = number case gender aspect negation
60
- ppas = number case gender aspect negation
61
- winien = number gender aspect
62
- praet = number gender aspect [agglutination]
63
- bedzie = number person aspect
64
- fin = number person aspect
65
- impt = number person aspect
66
- aglt = number person aspect vocalicity
67
- ign =
68
-
69
- # Named entities replaced old 'special' attributes, name changed mostly because of
70
- # unification of 'named-thing' handling code into one named-entity thing
71
- # Entity aliasing allows for any existing entity to be seen under different name
72
- #
73
- # FCQP provides four builtin entities:
74
- # entity-current
75
- # entity-base
76
- # entity-tag
77
- # entity-pos
78
-
79
- [NAMED-ENTITY]
80
-
81
- entity-orth = orth
82
- entity-base = base
83
- entity-tag = tag
84
- entity-pos = pos
85
-
86
- # Old 'aliases' for attribute names
87
-
88
- pos = flex
89
- number = numb nmb
90
- case = cas
91
- gender = gnd gend
92
- person = per pers
93
- degree = deg degr
94
- aspect = asp
95
- negation = neg
96
- accommodability = acco acom acm
97
- accentability = acce acen acn
98
- post-prepositionality = ppr ppre
99
- agglutination = agg aggl
100
- vocalicity = vcl
data/corpus/frek.meta.cfg DELETED
@@ -1 +0,0 @@
1
- S sample
@@ -1,4 +0,0 @@
1
- (single "sample"
2
- "/cesHeader/fileDesc/(sourceDesc/biblFull/)*sourceDesc/biblStruct/monogr/h.title")
3
-
4
-
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file