acts_as_searchable 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1025 @@
1
+ #--
2
+ # Ruby interface of Hyper Estraier
3
+ # Copyright (C) 2004-2006 Mikio Hirabayashi
4
+ # All rights reserved.
5
+ # This file is part of Hyper Estraier.
6
+ # Redistribution and use in source and binary forms, with or without modification, are
7
+ # permitted provided that the following conditions are met:
8
+ #
9
+ # * Redistributions of source code must retain the above copyright notice, this list of
10
+ # conditions and the following disclaimer.
11
+ # * Redistributions in binary form must reproduce the above copyright notice, this list of
12
+ # conditions and the following disclaimer in the documentation and/or other materials
13
+ # provided with the distribution.
14
+ # * Neither the name of Mikio Hirabayashi nor the names of its contributors may be used to
15
+ # endorse or promote products derived from this software without specific prior written
16
+ # permission.
17
+ #
18
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS
19
+ # OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
20
+ # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
21
+ # COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
23
+ # GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24
+ # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25
+ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
26
+ # OF THE POSSIBILITY OF SUCH DAMAGE.
27
+ #++
28
+ #:include:overview
29
+
30
+
31
+ require "uri"
32
+ require "socket"
33
+ require "stringio"
34
+
35
+
36
+
37
+ #
38
+ # Module for the namespace of Hyper Estraier
39
+ #
40
+ module EstraierPure
41
+ #----------------------------------------------------------------
42
+ #++ Abstraction of document.
43
+ #----------------------------------------------------------------
44
class Document
  #--------------------------------
  # public methods
  #--------------------------------
  public
  # Add an attribute.
  # `name' specifies the name of an attribute.
  # `value' specifies the value of the attribute.  If it is `nil', the attribute is removed.
  # Runs of whitespace in both the name and the value are collapsed to a single space.
  # The return value is always `nil'.
  def add_attr(name, value)
    Utility::check_types({ name=>String, value=>String }) if $DEBUG
    name = normalize(name)
    if value.nil?
      # Honour the documented contract instead of failing on nil.gsub.
      @attrs.delete(name)
    else
      @attrs[name] = normalize(value)
    end
    nil
  end
  # Add a sentence of text.
  # `text' specifies a sentence of text.  A sentence that is empty after whitespace
  # normalization is ignored.
  # The return value is always `nil'.
  def add_text(text)
    Utility::check_types({ text=>String }) if $DEBUG
    text = normalize(text)
    # 0 is truthy in Ruby, so the emptiness test has to be explicit.
    @dtexts.push(text) unless text.empty?
    nil
  end
  # Add a hidden sentence.
  # `text' specifies a hidden sentence.  A sentence that is empty after whitespace
  # normalization is ignored.
  # The return value is always `nil'.
  def add_hidden_text(text)
    Utility::check_types({ text=>String }) if $DEBUG
    text = normalize(text)
    @htexts.push(text) unless text.empty?
    nil
  end
  # Attach keywords.
  # `kwords' specifies a map object of keywords.  Keys of the map should be keywords of the
  # document and values should be their scores in decimal string.
  # The return value is always `nil'.
  def set_keywords(kwords)
    Utility::check_types({ kwords=>Hash }) if $DEBUG
    @kwords = kwords
    nil
  end
  # Get the ID number.
  # The return value is the ID number of the document object.  If the object has never been
  # registered, -1 is returned.
  def id
    @id
  end
  # Get a list of attribute names of a document object.
  # The return value is a sorted list object of attribute names.
  def attr_names
    @attrs.keys.sort
  end
  # Get the value of an attribute.
  # `name' specifies the name of an attribute.
  # The return value is the value of the attribute or `nil' if it does not exist.
  def attr(name)
    Utility::check_types({ name=>String }) if $DEBUG
    @attrs[name]
  end
  # Get a list of sentences of the text.
  # The return value is a list object of sentences of the text.
  def texts
    @dtexts
  end
  # Concatenate sentences of the text of a document object.
  # The return value is the sentences joined by single spaces.
  def cat_texts
    @dtexts.join(" ")
  end
  # Dump draft data of a document object.
  # The draft consists of the attributes sorted by name (one `name=value' per line), an
  # optional `%VECTOR' line of keywords, an empty line, the sentences of the text, and
  # finally the hidden sentences, each prefixed by a tab.
  # The return value is draft data.
  def dump_draft
    buf = StringIO::new
    @attrs.keys.sort.each do |key|
      buf << key << "=" << @attrs[key] << "\n"
    end
    if @kwords
      buf << "%VECTOR"
      @kwords.each do |key, value|
        buf << "\t" << key << "\t" << value
      end
      buf << "\n"
    end
    buf << "\n"
    @dtexts.each { |text| buf << text << "\n" }
    @htexts.each { |text| buf << "\t" << text << "\n" }
    buf.string
  end
  # Get attached keywords.
  # The return value is a map object of keywords and their scores in decimal string.  If no
  # keyword is attached, `nil' is returned.
  def keywords
    @kwords
  end
  #--------------------------------
  # private methods
  #--------------------------------
  private
  # Create a document object.
  # `draft' specifies a string of draft data.  The part before the first empty line holds
  # attributes and `%' directives, the rest holds sentences (a leading tab marks a hidden one).
  def initialize(draft = "")
    Utility::check_types({ draft=>String }) if $DEBUG
    @id = -1
    @attrs = {}
    @dtexts = []
    @htexts = []
    @kwords = nil
    lines = draft.split(/\n/)
    num = 0
    # Header section: runs up to the first empty line.
    while num < lines.length
      line = lines[num]
      num += 1
      break if line.empty?
      if line =~ /^%/
        # Only the %VECTOR directive is understood; other directives are skipped.
        parse_vector(line) if line =~ /^%VECTOR\t/
        next
      end
      line = normalize(line)
      idx = line.index("=")
      @attrs[line[0...idx]] = line[(idx + 1)..-1] if idx
    end
    # Body section: every remaining non-empty line is a sentence.
    lines[num..-1].each do |line|
      next if line.empty?
      # line[0, 1] yields a one-character String on every Ruby version, whereas
      # line[0] is an Integer on 1.8 and a String on 1.9 and later.
      if line[0, 1] == "\t"
        @htexts.push(line[1..-1]) if line.length > 1
      else
        @dtexts.push(line)
      end
    end
  end
  # Store the keyword/score pairs of a `%VECTOR' draft line into the keyword map.
  def parse_vector(line)
    @kwords ||= {}
    fields = line.split(/\t/)
    fields[1..-1].each_slice(2) do |key, value|
      # A trailing keyword without a score is dropped.
      @kwords[key] = value if value
    end
  end
  # Collapse every run of whitespace into one space and trim both ends.
  def normalize(str)
    str.gsub(/[ \t\r\n\v\f]+/, " ").strip.squeeze(" ")
  end
end
206
+ #----------------------------------------------------------------
207
+ #++ Abstraction of search condition.
208
+ #----------------------------------------------------------------
209
class Condition
  #--------------------------------
  # public constants
  #--------------------------------
  public
  # option: check every N-gram key
  SURE = 1 << 0
  # option: check N-gram keys skipping by one
  USUAL = 1 << 1
  # option: check N-gram keys skipping by two
  FAST = 1 << 2
  # option: check N-gram keys skipping by three
  AGITO = 1 << 3
  # option: without TF-IDF tuning
  NOIDF = 1 << 4
  # option: with the simplified phrase
  SIMPLE = 1 << 10
  #--------------------------------
  # public methods
  #--------------------------------
  public
  # Get the search phrase.
  # The return value is the search phrase, or `nil' if none has been set.
  attr_reader :phrase
  # Get expressions for attributes.
  # The return value is a list object of expressions for attributes.
  attr_reader :attrs
  # Get the order expression.
  # The return value is the order expression, or `nil' if none has been set.
  attr_reader :order
  # Get the maximum number of retrieval.
  # The return value is the maximum number of retrieval.  It is -1 until `set_max' is called.
  attr_reader :max
  # Get the number of skipped documents.
  # The return value is the number of documents to be skipped in the search result.
  attr_reader :skip
  # Get options of retrieval.
  # The return value is options by bitwise or.
  attr_reader :options
  # Set the search phrase.
  # `phrase' specifies a search phrase.  Runs of whitespace are collapsed to one space.
  # The return value is always `nil'.
  def set_phrase(phrase)
    Utility::check_types({ phrase=>String }) if $DEBUG
    @phrase = normalize(phrase)
    nil
  end
  # Add an expression for an attribute.
  # `expr' specifies an expression for an attribute.
  # The return value is always `nil'.
  def add_attr(expr)
    Utility::check_types({ expr=>String }) if $DEBUG
    @attrs.push(normalize(expr))
    nil
  end
  # Set the order of a condition object.
  # `expr' specifies an expression for the order.  By default, the order is by score descending.
  # The return value is always `nil'.
  def set_order(expr)
    Utility::check_types({ expr=>String }) if $DEBUG
    @order = normalize(expr)
    nil
  end
  # Set the maximum number of retrieval.
  # `max' specifies the maximum number of retrieval.  By default, the number of retrieval is
  # not limited.  A negative value is ignored.
  # The return value is always `nil'.
  def set_max(max)
    Utility::check_types({ max=>Integer }) if $DEBUG
    @max = max if max >= 0
    nil
  end
  # Set the number of skipped documents.
  # `skip' specifies the number of documents to be skipped in the search result.  A negative
  # value is ignored.
  # The return value is always `nil'.
  def set_skip(skip)
    Utility::check_types({ skip=>Integer }) if $DEBUG
    @skip = skip if skip >= 0
    nil
  end
  # Set options of retrieval.
  # `options' specifies options: `Condition::SURE' specifies that it checks every N-gram
  # key, `Condition::USUAL', which is the default, specifies that it checks N-gram keys
  # with skipping one key, `Condition::FAST' skips two keys, `Condition::AGITO'
  # skips three keys, `Condition::NOIDF' specifies not to perform TF-IDF tuning,
  # `Condition::SIMPLE' specifies to use simplified phrase.  Each option can be specified at
  # the same time by bitwise or.  If keys are skipped, though search speed is improved, the
  # relevance ratio grows less.  Options accumulate: each call is or-ed into the current set.
  # The return value is always `nil'.
  def set_options(options)
    Utility::check_types({ options=>Integer }) if $DEBUG
    @options |= options
    nil
  end
  #--------------------------------
  # private methods
  #--------------------------------
  private
  # Create a search condition object.
  def initialize
    @phrase = nil
    @attrs = []
    @order = nil
    @max = -1
    @skip = 0
    @options = 0
  end
  # Collapse every run of whitespace into one space and trim both ends.
  def normalize(str)
    str.gsub(/[ \t\r\n\v\f]+/, " ").strip.squeeze(" ")
  end
end
335
+ #----------------------------------------------------------------
336
+ #++ Abstraction of document in result set.
337
+ #----------------------------------------------------------------
338
class ResultDocument
  #--------------------------------
  # public methods
  #--------------------------------
  public
  # Get the URI.
  # The return value is the URI of the result document object.
  attr_reader :uri
  # Get the snippet of a result document object.
  # The return value is a string of the snippet of the result document object.  There are tab
  # separated values.  Each line is a string to be shown.  Though most lines have only one
  # field, some lines have two fields.  If the second field exists, the first field is to be
  # shown with highlighted, and the second field means its normalized form.
  attr_reader :snippet
  # Get keywords.
  # The return value is a string of serialized keywords of the result document object.  There
  # are tab separated values.  Keywords and their scores come alternately.
  attr_reader :keywords
  # Get a list of attribute names.
  # The return value is a sorted list object of attribute names.
  def attr_names
    @attrs.keys.sort
  end
  # Get the value of an attribute.
  # `name' specifies the name of an attribute.
  # The return value is the value of the attribute or `nil' if it does not exist.
  def attr(name)
    Utility::check_types({ name=>String }) if $DEBUG
    @attrs[name]
  end
  #--------------------------------
  # private methods
  #--------------------------------
  private
  # Create a result document object.
  # `uri' specifies the URI of the document.
  # `attrs' specifies a map object of attribute names and values.
  # `snippet' specifies the snippet string.
  # `keywords' specifies the serialized keywords.
  def initialize(uri, attrs, snippet, keywords)
    Utility::check_types({ uri=>String, attrs=>Hash,
                           snippet=>String, keywords=>String }) if $DEBUG
    @uri = uri
    @attrs = attrs
    @snippet = snippet
    @keywords = keywords
  end
end
387
+ #----------------------------------------------------------------
388
+ #++ Abstraction of result set from node.
389
+ #----------------------------------------------------------------
390
class NodeResult
  #--------------------------------
  # public methods
  #--------------------------------
  public
  # Get the number of documents.
  # The return value is the number of documents.
  def doc_num
    @docs.length
  end
  # Get a result document object.
  # `index' specifies the zero-based index of a document in the result.
  # The return value is a result document object or `nil' if the index is out of bounds.
  def get_doc(index)
    Utility::check_types({ index=>Integer }) if $DEBUG
    # Negative indexes are rejected explicitly; Array#[] would count them from the end.
    return nil unless index >= 0 && index < @docs.length
    @docs[index]
  end
  # Get the value of hint information.
  # `key' specifies the key of a hint.  "VERSION", "NODE", "HIT", "HINT#n", "DOCNUM", "WORDNUM",
  # "TIME", "LINK#n", and "VIEW" are provided for keys.
  # The return value is the hint or `nil' if the key does not exist.
  def hint(key)
    Utility::check_types({ key=>String }) if $DEBUG
    @hints[key]
  end
  #--------------------------------
  # private methods
  #--------------------------------
  private
  # Create a node result object.
  # `docs' specifies a list object of result document objects.
  # `hints' specifies a map object of hint keys and values.
  def initialize(docs, hints)
    Utility::check_types({ docs=>Array, hints=>Hash }) if $DEBUG
    @docs = docs
    @hints = hints
  end
end
426
+ #----------------------------------------------------------------
427
+ #++ Abstraction of connection to P2P node.
428
+ #----------------------------------------------------------------
429
+ class Node
430
+ #--------------------------------
431
+ # public methods
432
+ #--------------------------------
433
+ public
434
+ # Set the URL of a node server.
435
+ # `url' specifies the URL of a node.
436
+ # The return value is always `nil'.
437
+ def set_url(url)
438
+ Utility::check_types({ url=>String }) if $DEBUG
439
+ @url = url
440
+ nil
441
+ end
442
+ # Set the proxy information.
443
+ # `host' specifies the host name of a proxy server.
444
+ # `port' specifies the port number of the proxy server.
445
+ # The return value is always `nil'.
446
+ def set_proxy(host, port)
447
+ Utility::check_types({ host=>String, port=>Integer }) if $DEBUG
448
+ @pxhost = host
449
+ @pxport = port
450
+ nil
451
+ end
452
+ # Set timeout of a connection.
453
+ # `sec' specifies timeout of the connection in seconds.
454
+ # The return value is always `nil'.
455
+ def set_timeout(sec)
456
+ Utility::check_types({ sec=>Integer }) if $DEBUG
457
+ @timeout = sec
458
+ nil
459
+ end
460
+ # Set the authentication information.
461
+ # `name' specifies the name of authentication.
462
+ # `passwd' specifies the password of the authentication.
463
+ # The return value is always `nil'.
464
+ def set_auth(name, password)
465
+ Utility::check_types({ name=>String, password=>String }) if $DEBUG
466
+ @auth = name + ":" + password
467
+ nil
468
+ end
469
+ # Get the status code of the last request.
470
+ # The return value is the status code of the last request. -1 means failure of connection.
471
+ def status()
472
+ @status
473
+ end
474
+ # Add a document.
475
+ # `doc' specifies a document object. The document object should have the URI attribute.
476
+ # The return value is true if success, else it is false.
477
+ def put_doc(doc)
478
+ Utility::check_types({ doc=>Document }) if $DEBUG
479
+ @status = -1
480
+ return false if !@url
481
+ turl = @url + "/put_doc"
482
+ reqheads = [ "Content-Type: text/x-estraier-draft" ]
483
+ reqheads.push("Authorization: Basic " + Utility::base_encode(@auth)) if @auth
484
+ reqbody = doc.dump_draft
485
+ rv = Utility::shuttle_url(turl, @pxhost, @pxport, @timeout, reqheads, reqbody, nil, nil)
486
+ @status = rv
487
+ rv == 200
488
+ end
489
+ # Remove a document.
490
+ # `id' specifies the ID number of a registered document.
491
+ # The return value is true if success, else it is false.
492
+ def out_doc(id)
493
+ Utility::check_types({ id=>Integer }) if $DEBUG
494
+ @status = -1
495
+ return false if !@url
496
+ turl = @url + "/out_doc"
497
+ reqheads = [ "Content-Type: application/x-www-form-urlencoded" ]
498
+ reqheads.push("Authorization: Basic " + Utility::base_encode(@auth)) if @auth
499
+ reqbody = "id=" + id.to_s
500
+ rv = Utility::shuttle_url(turl, @pxhost, @pxport, @timeout, reqheads, reqbody, nil, nil)
501
+ @status = rv
502
+ rv == 200
503
+ end
504
+ # Remove a document specified by URI.
505
+ # `uri' specifies the URI of a registered document.
506
+ # The return value is true if success, else it is false.
507
+ def out_doc_by_uri(uri)
508
+ Utility::check_types({ uri=>String }) if $DEBUG
509
+ @status = -1
510
+ return false if !@url
511
+ turl = @url + "/out_doc"
512
+ reqheads = [ "Content-Type: application/x-www-form-urlencoded" ]
513
+ reqheads.push("Authorization: Basic " + Utility::base_encode(@auth)) if @auth
514
+ reqbody = "uri=" + URI::encode(uri)
515
+ rv = Utility::shuttle_url(turl, @pxhost, @pxport, @timeout, reqheads, reqbody, nil, nil)
516
+ @status = rv
517
+ rv == 200
518
+ end
519
+ # Edit attributes of a document.
520
+ # `doc' specifies a document object.
521
+ # The return value is true if success, else it is false.
522
+ def edit_doc(doc)
523
+ Utility::check_types({ doc=>Document }) if $DEBUG
524
+ @status = -1
525
+ return false if !@url
526
+ turl = @url + "/edit_doc"
527
+ reqheads = [ "Content-Type: text/x-estraier-draft" ]
528
+ reqheads.push("Authorization: Basic " + Utility::base_encode(@auth)) if @auth
529
+ reqbody = doc.dump_draft
530
+ rv = Utility::shuttle_url(turl, @pxhost, @pxport, @timeout, reqheads, reqbody, nil, nil)
531
+ @status = rv
532
+ rv == 200
533
+ end
534
+ # Retrieve a document.
535
+ # `id' specifies the ID number of a registered document.
536
+ # The return value is a document object. On error, `nil' is returned.
537
+ def get_doc(id)
538
+ Utility::check_types({ id=>Integer }) if $DEBUG
539
+ @status = -1
540
+ return nil if !@url
541
+ turl = @url + "/get_doc"
542
+ reqheads = [ "Content-Type: application/x-www-form-urlencoded" ]
543
+ reqheads.push("Authorization: Basic " + Utility::base_encode(@auth)) if @auth
544
+ reqbody = "id=" + id.to_s
545
+ resbody = StringIO::new
546
+ rv = Utility::shuttle_url(turl, @pxhost, @pxport, @timeout, reqheads, reqbody, nil, resbody)
547
+ @status = rv
548
+ return nil if rv != 200
549
+ Document::new(resbody.string)
550
+ end
551
+ # Retrieve a document.
552
+ # `uri' specifies the URI of a registered document.
553
+ # The return value is a document object. On error, `nil' is returned.
554
+ def get_doc_by_uri(uri)
555
+ Utility::check_types({ uri=>String }) if $DEBUG
556
+ @status = -1
557
+ return nil if !@url
558
+ turl = @url + "/get_doc"
559
+ reqheads = [ "Content-Type: application/x-www-form-urlencoded" ]
560
+ reqheads.push("Authorization: Basic " + Utility::base_encode(@auth)) if @auth
561
+ reqbody = "uri=" + URI::encode(uri)
562
+ resbody = StringIO::new
563
+ rv = Utility::shuttle_url(turl, @pxhost, @pxport, @timeout, reqheads, reqbody, nil, resbody)
564
+ @status = rv
565
+ return nil if rv != 200
566
+ Document::new(resbody.string)
567
+ end
568
+ # Retrieve the value of an attribute of a document.
569
+ # `id' specifies the ID number of a registered document.
570
+ # `name' specifies the name of an attribute.
571
+ # The return value is the value of the attribute or `nil' if it does not exist.
572
+ def get_doc_attr(id, name)
573
+ Utility::check_types({ id=>Integer, name=>String }) if $DEBUG
574
+ @status = -1
575
+ return nil if !@url
576
+ turl = @url + "/get_doc_attr"
577
+ reqheads = [ "Content-Type: application/x-www-form-urlencoded" ]
578
+ reqheads.push("Authorization: Basic " + Utility::base_encode(@auth)) if @auth
579
+ reqbody = "id=" + id.to_s + "&attr=" + URI::encode(name)
580
+ resbody = StringIO::new
581
+ rv = Utility::shuttle_url(turl, @pxhost, @pxport, @timeout, reqheads, reqbody, nil, resbody)
582
+ @status = rv
583
+ return nil if rv != 200
584
+ resbody.string.chomp
585
+ end
586
+ # Retrieve the value of an attribute of a document specified by URI.
587
+ # `uri' specifies the URI of a registered document.
588
+ # `name' specifies the name of an attribute.
589
+ # The return value is the value of the attribute or `nil' if it does not exist.
590
+ def get_doc_attr_by_uri(uri, name)
591
+ Utility::check_types({ uri=>String, name=>String }) if $DEBUG
592
+ @status = -1
593
+ return nil if !@url
594
+ turl = @url + "/get_doc_attr"
595
+ reqheads = [ "Content-Type: application/x-www-form-urlencoded" ]
596
+ reqheads.push("Authorization: Basic " + Utility::base_encode(@auth)) if @auth
597
+ reqbody = "uri=" + URI::encode(uri) + "&attr=" + URI::encode(name)
598
+ resbody = StringIO::new
599
+ rv = Utility::shuttle_url(turl, @pxhost, @pxport, @timeout, reqheads, reqbody, nil, resbody)
600
+ @status = rv
601
+ return nil if rv != 200
602
+ resbody.string.chomp
603
+ end
604
+ # Extract keywords of a document.
605
+ # `id' specifies the ID number of a registered document.
606
+ # The return value is a hash object of keywords and their scores in decimal string or `nil'
607
+ # on error.
608
+ def etch_doc(id)
609
+ Utility::check_types({ id=>Integer }) if $DEBUG
610
+ @status = -1
611
+ return nil if !@url
612
+ turl = @url + "/etch_doc"
613
+ reqheads = [ "Content-Type: application/x-www-form-urlencoded" ]
614
+ reqheads.push("Authorization: Basic " + Utility::base_encode(@auth)) if @auth
615
+ reqbody = "id=" + id.to_s
616
+ resbody = StringIO::new
617
+ rv = Utility::shuttle_url(turl, @pxhost, @pxport, @timeout, reqheads, reqbody, nil, resbody)
618
+ @status = rv
619
+ return nil if rv != 200
620
+ kwords = {}
621
+ lines = resbody.string.split(/\n/)
622
+ for i in 0...lines.length
623
+ pair = lines[i].split(/\t/)
624
+ next if pair.length < 2
625
+ kwords[pair[0]] = pair[1]
626
+ end
627
+ kwords
628
+ end
629
+ # Extract keywords of a document specified by URI.
630
+ # `uri' specifies the URI of a registered document.
631
+ # The return value is a hash object of keywords and their scores in decimal string or `nil'
632
+ # on error.
633
+ def etch_doc_by_uri(uri)
634
+ Utility::check_types({ uri=>String }) if $DEBUG
635
+ @status = -1
636
+ return nil if !@url
637
+ turl = @url + "/etch_doc"
638
+ reqheads = [ "Content-Type: application/x-www-form-urlencoded" ]
639
+ reqheads.push("Authorization: Basic " + Utility::base_encode(@auth)) if @auth
640
+ reqbody = "uri=" + URI::encode(uri);
641
+ resbody = StringIO::new
642
+ rv = Utility::shuttle_url(turl, @pxhost, @pxport, @timeout, reqheads, reqbody, nil, resbody)
643
+ @status = rv
644
+ return nil if rv != 200
645
+ kwords = {}
646
+ lines = resbody.string.split(/\n/)
647
+ for i in 0...lines.length
648
+ pair = lines[i].split(/\t/)
649
+ next if pair.length < 2
650
+ kwords[pair[0]] = pair[1]
651
+ end
652
+ kwords
653
+ end
654
+ # Get the ID of a document specified by URI.
655
+ # `uri' specifies the URI of a registered document.
656
+ # The return value is the ID of the document. On error, -1 is returned.
657
+ def uri_to_id(uri)
658
+ Utility::check_types({ uri=>String }) if $DEBUG
659
+ @status = -1
660
+ return -1 if !@url
661
+ turl = @url + "/uri_to_id"
662
+ reqheads = [ "Content-Type: application/x-www-form-urlencoded" ]
663
+ reqheads.push("Authorization: Basic " + Utility::base_encode(@auth)) if @auth
664
+ reqbody = "uri=" + URI::encode(uri)
665
+ resbody = StringIO::new
666
+ rv = Utility::shuttle_url(turl, @pxhost, @pxport, @timeout, reqheads, reqbody, nil, resbody)
667
+ @status = rv
668
+ return nil if rv != 200
669
+ resbody.string.chomp
670
+ end
671
+ # Get the name.
672
+ # The return value is the name. On error, `nil' is returned.
673
+ def name()
674
+ set_info if !@name
675
+ @name
676
+ end
677
+ # Get the label.
678
+ # The return value is the label. On error, `nil' is returned.
679
+ def label()
680
+ set_info if !@label
681
+ @label
682
+ end
683
+ # Get the number of documents.
684
+ # The return value is the number of documents. On error, -1 is returned.
685
+ def doc_num()
686
+ set_info if @dnum < 0
687
+ @dnum
688
+ end
689
+ # Get the number of unique words.
690
+ # The return value is the number of unique words. On error, -1 is returned.
691
+ def word_num()
692
+ set_info if @wnum < 0
693
+ @wnum
694
+ end
695
+ # Get the size of the datbase.
696
+ # The return value is the size of the datbase. On error, -1.0 is returned.
697
+ def size()
698
+ set_info if @size < 0.0
699
+ @size
700
+ end
701
+ # Search documents corresponding a condition.
702
+ # `cond' specifies a condition object.
703
+ # `depth' specifies the depth of meta search.
704
+ # The return value is a node result object. On error, `nil' is returned.
705
+ def search(cond, depth)
706
+ Utility::check_types({ cond=>Condition, depth=>Integer }) if $DEBUG
707
+ @status = -1
708
+ return nil if !@url
709
+ turl = @url + "/search"
710
+ reqheads = [ "Content-Type: application/x-www-form-urlencoded" ]
711
+ reqheads.push("Authorization: Basic " + Utility::base_encode(@auth)) if @auth
712
+ reqbody = Utility::cond_to_query(cond, depth, @wwidth, @hwidth, @awidth)
713
+ resbody = StringIO::new
714
+ rv = Utility::shuttle_url(turl, @pxhost, @pxport, @timeout, reqheads, reqbody, nil, resbody)
715
+ @status = rv
716
+ return nil if rv != 200
717
+ lines = resbody.string.split(/\n/)
718
+ return nil if lines.length < 1
719
+ docs = []
720
+ hints = {}
721
+ nres = NodeResult::new(docs, hints)
722
+ border = lines[0]
723
+ isend = false
724
+ lnum = 1
725
+ while lnum < lines.length
726
+ line = lines[lnum]
727
+ lnum += 1
728
+ if line.length >= border.length && line.index(border) == 0
729
+ isend = true if line[border.length...line.length] == ":END"
730
+ break
731
+ end
732
+ lidx = line.index("\t")
733
+ if lidx
734
+ key = line[0...lidx]
735
+ value = line[(lidx+1)...line.length]
736
+ hints[key] = value
737
+ end
738
+ end
739
+ snum = lnum
740
+ while !isend && lnum < lines.length
741
+ line = lines[lnum]
742
+ lnum += 1
743
+ if line.length >= border.length && line.index(border) == 0
744
+ if lnum > snum
745
+ rdattrs = {}
746
+ sb = StringIO::new
747
+ rdvector = ""
748
+ rlnum = snum
749
+ while rlnum < lnum - 1
750
+ rdline = lines[rlnum].strip
751
+ rlnum += 1
752
+ break if rdline.length < 1
753
+ if rdline =~ /^%/
754
+ lidx = rdline.index("\t")
755
+ rdvector = rdline[(lidx+1)...rdline.length] if rdline =~ /%VECTOR/ && lidx
756
+ else
757
+ lidx = rdline.index("=")
758
+ if lidx
759
+ key = rdline[0...lidx]
760
+ value = rdline[(lidx+1)...rdline.length]
761
+ rdattrs[key] = value
762
+ end
763
+ end
764
+ end
765
+ while rlnum < lnum - 1
766
+ rdline = lines[rlnum]
767
+ rlnum += 1
768
+ sb.printf("%s\n", rdline)
769
+ end
770
+ rduri = rdattrs["@uri"]
771
+ rdsnippet = sb.string
772
+ if rduri
773
+ rdoc = ResultDocument::new(rduri, rdattrs, rdsnippet, rdvector)
774
+ docs.push(rdoc)
775
+ end
776
+ end
777
+ snum = lnum
778
+ isend = true if line[border.length...line.length] == ":END"
779
+ end
780
+ end
781
+ return nil if !isend
782
+ return nres
783
+ end
784
+ # Set width of snippet in the result.
785
+ # `wwidth' specifies whole width of a snippet. By default, it is 480. If it is 0, no
786
+ # snippet is sent. If it is negative, whole body text is sent instead of snippet.
787
+ # `hwidth' specifies width of strings picked up from the beginning of the text. By default,
788
+ # it is 96. If it is negative 0, the current setting is not changed.
789
+ # `awidth' specifies width of strings picked up around each highlighted word. By default,
790
+ # it is 96. If it is negative, the current setting is not changed.
791
+ def set_snippet_width(wwidth, hwidth, awidth)
792
+ @wwidth = wwidth;
793
+ @hwidth = hwidth if hwidth >= 0
794
+ @awidth = awidth if awidth >= 0
795
+ end
796
+ # Manage a user account of a node.
797
+ # `name' specifies the name of a user.
798
+ # `mode' specifies the operation mode. 0 means to delete the account. 1 means to set the
799
+ # account as an administrator. 2 means to set the account as a guest.
800
+ # The return value is true if success, else it is false.
801
+ def set_user(name, mode)
802
+ Utility::check_types({ name=>String, mode=>Integer }) if $DEBUG
803
+ @status = -1
804
+ return false if !@url
805
+ turl = @url + "/_set_user"
806
+ reqheads = [ "Content-Type: application/x-www-form-urlencoded" ]
807
+ reqheads.push("Authorization: Basic " + Utility::base_encode(@auth)) if @auth
808
+ reqbody = "name=" + URI::encode(name) + "&mode=" + mode.to_s
809
+ rv = Utility::shuttle_url(turl, @pxhost, @pxport, @timeout, reqheads, reqbody, nil, nil)
810
+ @status = rv
811
+ rv == 200
812
+ end
813
+ # Manage a link of a node.
814
+ # `url' specifies the URL of the target node of a link.
815
+ # `label' specifies the label of the link.
816
+ # `credit' specifies the credit of the link. If it is negative, the link is removed.
817
+ # The return value is true if success, else it is false.
818
+ def set_link(url, label, credit)
819
+ Utility::check_types({ url=>String, label=>String, credit=>Integer }) if $DEBUG
820
+ @status = -1
821
+ return false if !@url
822
+ turl = @url + "/_set_link"
823
+ reqheads = [ "Content-Type: application/x-www-form-urlencoded" ]
824
+ reqheads.push("Authorization: Basic " + Utility::base_encode(@auth)) if @auth
825
+ reqbody = "url=" + URI::encode(url) + "&label=" + label
826
+ reqbody += "&credit=" + credit.to_s if credit >= 0
827
+ rv = Utility::shuttle_url(turl, @pxhost, @pxport, @timeout, reqheads, reqbody, nil, nil)
828
+ @status = rv
829
+ rv == 200
830
+ end
831
+ #--------------------------------
832
+ # private methods
833
+ #--------------------------------
834
+ private
835
+ # Create a node connection object.
836
def initialize
  # Connection settings: no URL, proxy host or credentials; -1 for the numeric values.
  @url = @pxhost = @auth = nil
  @pxport = @timeout = -1
  # Node information, filled in later by `set_info'.
  @name = @label = nil
  @dnum = @wnum = -1
  @size = -1.0
  # Widths used for snippets of search results.
  @wwidth = 480
  @hwidth = @awidth = 96
  # Status code of the last request; -1 while there is none.
  @status = -1
end
852
+ # Set information of the node.
853
def set_info
  # Report failure until a response has actually been received.
  @status = -1
  return unless @url
  target = @url + "/inform"
  headers = []
  headers.push("Authorization: Basic " + Utility::base_encode(@auth)) if @auth
  response = StringIO::new
  code = Utility::shuttle_url(target, @pxhost, @pxport, @timeout, headers, nil, nil, response)
  @status = code
  return unless code == 200
  # Only the first line of the response is used; it must hold exactly five
  # tab-separated fields, otherwise the current values are left untouched.
  first_line = response.string.chomp.split(/\n/).first
  return unless first_line
  fields = first_line.chomp.split(/\t/)
  return unless fields.length == 5
  @name, @label = fields[0], fields[1]
  @dnum = fields[2].to_i
  @wnum = fields[3].to_i
  @size = fields[4].to_f
end
873
+ end
874
+ #:stopdoc:
875
+ #
876
+ # Module for utility
877
+ #
878
module Utility
  # Every method defined below is callable as `Utility::name(...)'.
  module_function

  # Check types of arguments.
  # `types' specifies a hash object whose keys are arguments and values are class objects.
  # An argument which is `nil' is accepted whatever the expected class is.
  # If there is an invalid object, an ArgumentError is raised.
  def check_types(types)
    types.each_with_index do |(value, klass), idx|
      next if value.kind_of?(klass) || value.nil?
      raise ArgumentError, "Argument#" + (idx + 1).to_s + " should be a kind of " + klass.to_s
    end
  end

  # Perform an interaction of a URL.
  # `url' specifies a URL.
  # `pxhost' specifies the host name of a proxy.  If it is `nil', it is not used.
  # `pxport' specifies the port number of the proxy.
  # `outsec' specifies timeout in seconds.  If it is negative, it is not used.
  # `reqheads' specifies a list object of extension headers.  If it is `nil', it is not used.
  # `reqbody' specifies the entity body of the request.  If it is `nil', "GET" method is
  # used, otherwise "POST" method is used.
  # `resheads' specifies a list object into which headers of the response are stored.  If it
  # is `nil', it is not used.
  # `resbody' specifies a stream object into which the entity body of the response is stored.
  # If it is `nil', it is not used.
  # The return value is the status code of the response or -1 on error.
  def shuttle_url(url, pxhost, pxport, outsec, reqheads, reqbody, resheads, resbody)
    status = -1
    # The exchange runs in its own thread so that it can be abandoned on timeout.
    worker = Thread::start do
      begin
        status = perform_exchange(url, pxhost, pxport, reqheads, reqbody, resheads, resbody)
      rescue StandardError
        # Connection and protocol failures are all reported as -1.
        status = -1
      end
    end
    if outsec >= 0
      unless worker.join(outsec)
        # Timed out: kill the worker and wait until its cleanup code has closed the socket.
        worker.exit
        worker.join
        return -1
      end
    else
      worker.join
    end
    status
  rescue StandardError
    -1
  end

  # Run one HTTP/1.0 request/response cycle for `shuttle_url'.
  # The parameters have the same meaning as those of `shuttle_url'.
  # The return value is the status code of the response or -1 if the URL is not usable or
  # the response is malformed.  Socket errors are raised to the caller.
  def perform_exchange(url, pxhost, pxport, reqheads, reqbody, resheads, resbody)
    # Work on the normalized URL: lower-case scheme and host, "/" in place of an empty path.
    uri = URI::parse(url).normalize
    return -1 if uri.scheme != "http" || !uri.host || uri.port < 1
    if pxhost
      # A proxy is given the absolute URL of the resource.
      host = pxhost
      port = pxport
      target = "http://" + uri.host + ":" + uri.port.to_s + uri.path
    else
      host = uri.host
      port = uri.port
      target = uri.path
    end
    # The query string of the URL is forwarded for "GET" requests only.
    target += "?" + uri.query if uri.query && !reqbody
    sock = TCPSocket.open(host, port)
    begin
      # The request is written verbatim, never through a format string, because the
      # target may contain `%' escapes.
      head = [ (reqbody ? "POST " : "GET ") + target + " HTTP/1.0" ]
      head.push("Host: " + uri.host + ":" + uri.port.to_s)
      head.push("Connection: close")
      head.push("User-Agent: HyperEstraierForRuby/1.0.0")
      head.concat(reqheads) if reqheads
      # Content-Length counts bytes, not characters.
      head.push("Content-Length: " + reqbody.bytesize.to_s) if reqbody
      sock.write(head.join("\r\n") + "\r\n\r\n")
      sock.write(reqbody) if reqbody
      line = sock.gets
      return -1 unless line
      line = line.chomp
      # Split on runs of spaces: a pattern able to match the empty string would cut the
      # status line into single characters.
      elems = line.split(/ +/)
      return -1 if elems.length < 3 || elems[0] !~ /^HTTP/
      status = elems[1].to_i
      resheads.push(line) if resheads
      # Read header lines up to and including the empty line which terminates them.
      loop do
        line = sock.gets
        return -1 unless line
        line = line.chomp
        resheads.push(line) if resheads
        break if line.empty?
      end
      # The rest of the stream is the entity body; it is drained even when not stored.
      while (chunk = sock.read(8192))
        resbody.write(chunk) if resbody
      end
      status
    ensure
      sock.close
    end
  end
  private_class_method :perform_exchange

  # Serialize a condition object into a query string.
  # `cond' specifies a condition object.
  # `depth' specifies depth of meta search.
  # `wwidth' specifies whole width of a snippet.
  # `hwidth' specifies width of strings picked up from the beginning of the text.
  # `awidth' specifies width of strings picked up around each highlighted word.
  # The return value is the serialized string.
  def cond_to_query(cond, depth, wwidth, hwidth, awidth)
    params = []
    # Free-text values are form-encoded so that `&', `=' and spaces survive transport.
    params.push("phrase=" + URI.encode_www_form_component(cond.phrase)) if cond.phrase
    cond.attrs.each_with_index do |expr, idx|
      params.push("attr" + (idx + 1).to_s + "=" + URI.encode_www_form_component(expr))
    end
    params.push("order=" + URI.encode_www_form_component(cond.order)) if cond.order
    # A maximum which is not positive is replaced by a very large number.
    params.push("max=" + (cond.max > 0 ? cond.max : 1 << 30).to_s)
    params.push("options=" + cond.options.to_s) if cond.options > 0
    params.push("depth=" + depth.to_s) if depth > 0
    params.push("wwidth=" + wwidth.to_s)
    params.push("hwidth=" + hwidth.to_s)
    params.push("awidth=" + awidth.to_s)
    params.push("skip=" + cond.skip.to_s)
    params.join("&")
  end

  # Encode a byte sequence with Base64 encoding.
  # `data' specifies a string object.
  # The return value is the encoded string, without any line break.
  def base_encode(data)
    [data].pack("m0")
  end
end
1021
+ end
1022
+
1023
+
1024
+
1025
+ # END OF FILE