rsi 0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +25 -0
- data/Makefile +24 -0
- data/Manifest +30 -0
- data/README +49 -0
- data/TODO +30 -0
- data/bin/rsi_search.rb +50 -0
- data/bin/search_bench.rb +47 -0
- data/docs/ATTRIB +14 -0
- data/docs/Changes +25 -0
- data/docs/Roadmap +41 -0
- data/lib/rsi.rb +40 -0
- data/lib/rsi/analysis.rb +79 -0
- data/lib/rsi/compressed_serializers.rb +60 -0
- data/lib/rsi/dictionary.rb +232 -0
- data/lib/rsi/index.rb +245 -0
- data/lib/rsi/logmanager.rb +105 -0
- data/lib/rsi/porter.rb +213 -0
- data/lib/rsi/query.rb +98 -0
- data/lib/rsi/rsi_intro.rb +91 -0
- data/lib/rsi/serializers.rb +31 -0
- data/lib/rsi/stoplist.rb +72 -0
- data/lib/rsi/stoplist.txt +59 -0
- data/rsi.gemspec +59 -0
- data/setup.rb +1360 -0
- data/tests/suite_all.rb +14 -0
- data/tests/t_analysis.rb +43 -0
- data/tests/t_dictionary.rb +76 -0
- data/tests/t_index.rb +78 -0
- data/tests/t_index_multi.rb +71 -0
- data/version.release +1 -0
- metadata +72 -0
data/LICENSE
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
Copyright (c) 2005, Gregory D. Fast
|
2
|
+
All rights reserved.
|
3
|
+
|
4
|
+
Redistribution and use in source and binary forms, with or without
|
5
|
+
modification, are permitted provided that the following conditions
|
6
|
+
are met:
|
7
|
+
|
8
|
+
* Redistributions of source code must retain the above copyright
|
9
|
+
notice, this list of conditions and the following disclaimer.
|
10
|
+
|
11
|
+
* Redistributions in binary form must reproduce the above copyright
|
12
|
+
notice, this list of conditions and the following disclaimer in the
|
13
|
+
documentation and/or other materials provided with the distribution.
|
14
|
+
|
15
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
16
|
+
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
17
|
+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
18
|
+
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
19
|
+
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
20
|
+
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
|
21
|
+
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
22
|
+
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
23
|
+
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
24
|
+
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
25
|
+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
data/Makefile
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
|
2
|
+
# Release packaging for rsi. The version string is read from the
# version.release file at make time.
PKGNAME = rsi
VERSION = $(shell cat version.release)
RELEASE = $(PKGNAME)-$(VERSION)

# Targets that do not name a file they produce. The leading dot is required:
# a plain "PHONY:" just declares an ordinary target called PHONY.
.PHONY: check.manifest dist test

# Build $(RELEASE).tar.gz containing exactly the files listed in Manifest.
dist: check.manifest version.release
	mkdir $(RELEASE)
	tar cf - `cat Manifest` | tar xvf - -C $(RELEASE)
	tar zcvf $(RELEASE).tar.gz $(RELEASE)
	rm -rf $(RELEASE)

# Fail if any file listed in Manifest is missing. The explicit exit is needed
# because a shell "for" loop otherwise only reports the status of its last
# iteration, hiding missing files earlier in the list.
check.manifest: Manifest
	for i in `cat Manifest`; do test -e $$i || { echo "Manifest entry missing: $$i" >&2; exit 1; }; done

# Run the unit test suite against the in-tree lib directory.
test:
	ruby -Ilib tests/suite_all.rb


# ;; default ruby-mode has tabs on? ugh.
# (add-hook 'ruby-mode-hook
#   (lambda ()
#     (setq indent-tabs-mode nil)))
|
data/Manifest
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
LICENSE
|
2
|
+
Makefile
|
3
|
+
Manifest
|
4
|
+
README
|
5
|
+
TODO
|
6
|
+
rsi.gemspec
|
7
|
+
setup.rb
|
8
|
+
version.release
|
9
|
+
bin/rsi_search.rb
|
10
|
+
bin/search_bench.rb
|
11
|
+
docs/ATTRIB
|
12
|
+
docs/Changes
|
13
|
+
docs/Roadmap
|
14
|
+
lib/rsi.rb
|
15
|
+
lib/rsi/analysis.rb
|
16
|
+
lib/rsi/compressed_serializers.rb
|
17
|
+
lib/rsi/dictionary.rb
|
18
|
+
lib/rsi/index.rb
|
19
|
+
lib/rsi/logmanager.rb
|
20
|
+
lib/rsi/porter.rb
|
21
|
+
lib/rsi/query.rb
|
22
|
+
lib/rsi/rsi_intro.rb
|
23
|
+
lib/rsi/serializers.rb
|
24
|
+
lib/rsi/stoplist.rb
|
25
|
+
lib/rsi/stoplist.txt
|
26
|
+
tests/suite_all.rb
|
27
|
+
tests/t_analysis.rb
|
28
|
+
tests/t_index.rb
|
29
|
+
tests/t_index_multi.rb
|
30
|
+
tests/t_dictionary.rb
|
data/README
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
RSI README v0.4
|
2
|
+
===============
|
3
|
+
|
4
|
+
RSI (Ruby Simple Indexer, or perhaps Really Simple Indexer) is a
|
5
|
+
simple full text index.
|
6
|
+
|
7
|
+
RSI is a simple full text search engine implementation in Ruby. It
|
8
|
+
aims to be easily useful within other programs: simple to set up,
|
9
|
+
simple to use.
|
10
|
+
|
11
|
+
An emphasis has been placed on getting functionality out the door,
|
12
|
+
rather than heavy optimization (that can come later). It still
|
13
|
+
appears to be reasonably fast and efficient (although it has
|
14
|
+
not been heavily profiled...).
|
15
|
+
|
16
|
+
Requirements
|
17
|
+
------------
|
18
|
+
|
19
|
+
* Ruby 1.8
|
20
|
+
|
21
|
+
Install
|
22
|
+
-------
|
23
|
+
|
24
|
+
De-compress archive and enter its top directory.
|
25
|
+
Then type:
|
26
|
+
|
27
|
+
($ su)
|
28
|
+
# ruby setup.rb
|
29
|
+
|
30
|
+
These simple steps install this program under the default
|
31
|
+
location of Ruby libraries. You can also install files into
|
32
|
+
your favorite directory by supplying setup.rb some options.
|
33
|
+
Try "ruby setup.rb --help".
|
34
|
+
|
35
|
+
|
36
|
+
Usage
|
37
|
+
-----
|
38
|
+
|
39
|
+
See lib/rsi/rsi_intro.rb.
|
40
|
+
|
41
|
+
|
42
|
+
License
|
43
|
+
-------
|
44
|
+
|
45
|
+
This is free software.
|
46
|
+
See the LICENSE file included in this distribution for terms.
|
47
|
+
|
48
|
+
Copyright 2005, Greg Fast <gdf@speakeasy.net>
|
49
|
+
|
data/TODO
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
#perf - reduce index size (more!)
|
2
|
+
#func - allow fields (date=x, text=x) etc
|
3
|
+
func - allow range queries
|
4
|
+
#stable - store info on module versions, serialization format, etc in db root
|
5
|
+
stable - still need version info stored in root... icky.
|
6
|
+
#stable - tests, benchmarking
|
7
|
+
longterm - index efficiency
|
8
|
+
longterm - benchmarking
|
9
|
+
|
10
|
+
# dist - name project - rsi?
|
11
|
+
# dist - setup.rb config
|
12
|
+
dist - gem config - tests, docs
|
13
|
+
# dist - license
|
14
|
+
# dist2 - docs
|
15
|
+
|
16
|
+
stable - threadsafe
|
17
|
+
|
18
|
+
#func - make bz2 optional
|
19
|
+
|
20
|
+
# dist - finish ATTRIB
|
21
|
+
# dist2 - organize file structure (eg, porter.rb!@#)
|
22
|
+
# dist - no tabs!@ ARGH!
|
23
|
+
|
24
|
+
tests - more unit/regression tests
|
25
|
+
# idx - format store for multiple indexes
|
26
|
+
stable - do something reasonable with logs
|
27
|
+
|
28
|
+
#dist - whoops. use zlib.rb instead of bz2...
|
29
|
+
|
30
|
+
dist - include version numbers in source
|
data/bin/rsi_search.rb
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
#! /opt/ruby-1.8.1/bin/ruby -w
#
# Command-line front end for an RSI index.
#
# Usage:
#   rsi_search.rb [--db /path/to/index/db] --index-dir dir/to/index
#   rsi_search.rb [--db /path/to/index/db] term1 term2 term3
#
# (Not yet implemented:)
#   rsi_search.rb [--db /path/to/index/db] --index-files file file file
#
# With --index-dir, each non-directory entry directly inside the given
# directory is added to the index, skipping names that start with "."
# or end with "~". Without it, the remaining arguments are joined into
# a single query string and the result of find_all is printed.
# The index location defaults to /var/tmp/search.
#
require 'getoptlong'
require 'rsi'

gopt = GetoptLong.new()
gopt.set_options(
  [ '--index-dir', '-i', GetoptLong::REQUIRED_ARGUMENT],
  [ '--db', '-d', GetoptLong::REQUIRED_ARGUMENT]
)
# lame...
OPT = { '--db' => "/var/tmp/search" }
gopt.each_option {|opt, value| OPT[opt] = value }

indexer = RSI::Indexer.new( OPT['--db'] )
#indexer.serializer = RSI::YAMLSerializer.new()
#indexer.serializer = RSI::CompressedSerializer.new()
indexer.serializer = RSI::NativeSerializer.new() # default
indexer.open()

if OPT.has_key?( '--index-dir' )
  to_index = OPT['--index-dir']
  puts "Indexing #{to_index}..."
  Dir.foreach( to_index ) do |filename|
    next if filename =~ /^\./
    next if filename =~ /~$/

    # Dir.foreach yields bare entry names, so resolve against the
    # indexed directory before testing; checking the bare name would
    # look it up relative to the current working directory instead.
    fullpath = File.expand_path( File.join( to_index, filename ) )
    next if FileTest.directory?( fullpath )

    puts "...#{fullpath}"
    uri = fullpath
    contents = File.read( fullpath )
    indexer.add_document( uri, contents )
  end
  puts "Synching..."
  indexer.flush()
else
  query = ARGV.join(" ")
  puts "Query: #{query}"
  puts indexer.find_all( query )
end
|
49
|
+
|
50
|
+
|
data/bin/search_bench.rb
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
#! /opt/ruby-1.8.1/bin/ruby
#
# Benchmark the serializers
#
# For each serializer, builds a fresh index from the files directly
# inside ./testdocs and reports the time taken. Raises if a target db
# location already exists, so earlier runs must be cleaned up by hand.
#
require 'getoptlong'
require 'benchmark'
require 'rsi'

testdocs = "./testdocs"

# db location => serializer to benchmark
dbs = {
  "/var/tmp/search.yaml" => RSI::YAMLSerializer.new(),
  "/var/tmp/search.marshal" => RSI::NativeSerializer.new(),
  "/var/tmp/search.bz2" => RSI::CompressedSerializer.new(),
};


Benchmark.bm do |benchmarker|
  dbs.each do |db, serializer|
    benchmarker.report(db) do

      # exist? (not exists?) is the spelling available on both old
      # and current Ruby versions.
      if FileTest.exist?( db )
        raise "DB location #{db} exists"
      end

      indexer = RSI::Indexer.new( db )
      # NOTE(review): bin/rsi_search.rb sets indexer.serializer and
      # calls indexer.open() before adding documents; confirm whether
      # this script should do the same.
      indexer.dictionary.serializer = serializer

      #puts "Indexing #{testdocs}..."
      Dir.foreach( testdocs ) do |filename|
        next if filename =~ /^\./
        next if filename =~ /~$/

        # Resolve against testdocs before the directory test; the
        # bare entry name would be checked relative to the cwd.
        fullpath = File.expand_path( File.join( testdocs, filename ) )
        next if FileTest.directory?( fullpath )

        #puts "...#{fullpath}"
        uri = fullpath
        contents = File.read( fullpath )
        indexer.add_document( uri, contents )
      end
      #puts "Synching..."
      # finish() was renamed to flush() in 0.3 (see docs/Changes).
      indexer.flush()

    end
  end
end
|
47
|
+
|
data/docs/ATTRIB
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
Misc Attributions
|
2
|
+
-----------------
|
3
|
+
|
4
|
+
`porter.rb` is Ray Pereda's implementation of the Porter Stemming
|
5
|
+
Algorithm (updated by Dave Thomas). The original, and more
|
6
|
+
information, is available at http://www.tartarus.org/~martin/PorterStemmer .
|
7
|
+
|
8
|
+
`stoplist.rb` was generated from a stoplist found at
|
9
|
+
http://www.cs.utep.edu/nigel/nlp/ir/stoplist.txt, and which in turn was
|
10
|
+
attributed as "from Manning and Schutze, 1999, pg 533".
|
11
|
+
|
12
|
+
`setup.rb` is (of course) from Minero Aoki's project at
|
13
|
+
http://i.loveruby.net/en/prog/setup.html .
|
14
|
+
|
data/docs/Changes
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
0.1 - 11 Jan 2005
|
2
|
+
- Initial public release. Basic functions work, lots of nice stuff missing.
|
3
|
+
|
4
|
+
0.2 - 12 Jan 2005
|
5
|
+
- Doc release. Basic documentation, code rearrangement.
|
6
|
+
|
7
|
+
0.3 - 18 Jan 2005
|
8
|
+
- Added some unit tests
|
9
|
+
- Refactored dictionary.rb, analysis.rb, towards multiple index support
|
10
|
+
- API: index.finish() is now index.flush()
|
11
|
+
- API: protected internal methods in index, dictionary
|
12
|
+
|
13
|
+
0.4 - 04 Feb 2005
|
14
|
+
- Improved logging (see rsi/logmanager.rb)
|
15
|
+
- Improved gemspec to a releasable state
|
16
|
+
- API: multi-field indexes supported
|
17
|
+
- API: structured queries supported
|
18
|
+
- Index metadata is stored and used on open, eliminating need to
|
19
|
+
remember what kind of Serializers, etc, the index was created with.
|
20
|
+
- CompressedSerializer moved into its own file, and now uses zlib
|
21
|
+
(which is part of Ruby 1.8 stdlib) rather than bz2. The bz2
|
22
|
+
serializer is still included, and raises a sensible exception if
|
23
|
+
used without BZ2 loaded.
|
24
|
+
- More tests
|
25
|
+
|
data/docs/Roadmap
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
Ye Olde Release Planne
|
2
|
+
----------------------
|
3
|
+
|
4
|
+
[11 Jan 2005 - released]
|
5
|
+
0.1 - initial public release, basic impl
|
6
|
+
|
7
|
+
[12 Jan 2005 - released]
|
8
|
+
0.2 - docs - api, usage/tutorial, roadmap
|
9
|
+
- make sure source has no tabs (argh!)
|
10
|
+
- source layout, module structure
|
11
|
+
|
12
|
+
[18 Jan 2005 - released]
|
13
|
+
0.3 - gem dist, basic stability issues
|
14
|
+
- support multiple indexes (req internal api change) [forward from 0.5]
|
15
|
+
|
16
|
+
[04 Feb 2005]
|
17
|
+
0.4 - support simple field-based queries, including range queries
|
18
|
+
[forward from 0.5]
|
19
|
+
- store index db metainfo in metadata, verify on Index#open()
|
20
|
+
[pushed back from 0.3]
|
21
|
+
|
22
|
+
0.45 - doc update
|
23
|
+
|
24
|
+
0.5 - build range index impl [pushed back from 0.4]
|
25
|
+
- build compound analyzers
|
26
|
+
- (?) build email, html analyzers
|
27
|
+
- arch/rationale docs? [pushed back from 0.3]
|
28
|
+
|
29
|
+
[extreme fuzziness follows]
|
30
|
+
|
31
|
+
0.6 - (?) update/migration mechanism for index schema changes
|
32
|
+
|
33
|
+
0.7 - reasonably complete test, benchmarking suites
|
34
|
+
|
35
|
+
0.8 - support index updates, deletion
|
36
|
+
|
37
|
+
1.0 - threadsafe
|
38
|
+
- reduce index size as much as possible
|
39
|
+
- make searches swift
|
40
|
+
|
41
|
+
1.1 - interesting query analyzer
|
data/lib/rsi.rb
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
#
|
2
|
+
# Simple minimal fulltext indexer/search engine.
|
3
|
+
#
|
4
|
+
# = Synopsis
|
5
|
+
#
|
6
|
+
# require 'rsi'
|
7
|
+
# indexer = RSI::Index.new( "/var/db/index" )
|
8
|
+
# uri = "file:///Users/gregfast/poo.txt"
|
9
|
+
# content = File.read( uri )
|
10
|
+
# indexer.add_document( uri, content )
|
11
|
+
# indexer.flush()
|
12
|
+
#
|
13
|
+
# docs = indexer.find_all( "quick brown fox" )
|
14
|
+
#
|
15
|
+
# = Intro
|
16
|
+
#
|
17
|
+
# See link:files/lib/rsi/rsi_intro_rb.html .
|
18
|
+
#
|
19
|
+
# = Author
|
20
|
+
#
|
21
|
+
# Greg Fast, gdf@speakeasy.net
|
22
|
+
#
|
23
|
+
# = Copyright
|
24
|
+
#
|
25
|
+
# Copyright 2005 Greg Fast <gdf@speakeasy.net>
|
26
|
+
# See LICENSE file distributed with this software.
|
27
|
+
#
|
28
|
+
#--
|
29
|
+
# $Id: rsi.rb 48 2005-01-14 21:57:07Z gdf $
|
30
|
+
#
|
31
|
+
require 'logger'
|
32
|
+
require 'yaml'
|
33
|
+
require 'bz2'
|
34
|
+
require 'rsi/porter'
|
35
|
+
require 'rsi/stoplist'
|
36
|
+
require 'rsi/serializers'
|
37
|
+
require 'rsi/dictionary'
|
38
|
+
require 'rsi/index'
|
39
|
+
require 'rsi/analysis'
|
40
|
+
|
data/lib/rsi/analysis.rb
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
#
|
2
|
+
# Content/query tokenization classes
|
3
|
+
#
|
4
|
+
require 'rsi/stoplist'
|
5
|
+
require 'rsi/query'
|
6
|
+
require 'rsi/logmanager'
|
7
|
+
|
8
|
+
module RSI

  # Constants for field/dictionary types.
  FIELD_TYPE_TEXT = :FIELD_TYPE_TEXT
  FIELD_TYPE_DATE = :FIELD_TYPE_DATE

  # Default analyzer: turns raw text (document content or a query
  # string) into normalized terms, all assigned to the field "text".
  class DefaultTextAnalyzer
    include Loggable

    # Hash whose keys are the normalized terms to discard. Left nil
    # until the first tokenize_text call, which then builds it from
    # RSI::STOPLIST_s. Assign a Hash beforehand to override.
    attr_accessor :stoplist

    def initialize()
      @stoplist = nil
    end

    # Returns a map of fields to field types, for each field returned
    # by this analyzer's tokenize() method.
    # Field names should be safe to be used as file path components.
    def get_field_types()
      return { "text" => RSI::FIELD_TYPE_TEXT }
    end

    # Given a chunk of text content, returns a map from field name to
    # the list of indexable terms contained in that content.
    # The content may not be a complete document.
    # All terms are placed under the single field 'text'.
    def tokenize( content ) # -> { field, [terms...] }..
      return { "text" => tokenize_text(content) }
    end

    # Builds an ANDQuery containing one TermQuery on field 'text' for
    # each term that tokenize_text extracts from the query string.
    def tokenize_query( query )
      q = RSI::ANDQuery.new()
      tokenize_text( query ).each do |t|
        q.add_subquery( RSI::TermQuery.new( 'text', t ) )
      end
      return q
    end

    # Returns the list of normalized terms found in content:
    # possessive/negative contractions are folded ("'s" -> "s",
    # "n't" -> "nt"), everything but ASCII letters and digits becomes
    # whitespace, terms shorter than 3 characters are dropped, the
    # rest are stemmed and upcased, stoplisted terms are removed and
    # duplicates are collapsed.
    def tokenize_text( content )
      initialize_stoplist()
      # Convert first, then copy: works for nil and non-String input
      # and guarantees the caller's string is never mutated below.
      text = content.to_s.dup
      text.gsub!( /\'s\b/, "s" )          # normalize contractions
      text.gsub!( /n\'t\b/, "nt" )
      text.tr!( "^a-zA-Z0-9", " " )       # thunk non-wordy chars to ws
      terms = text.split().reject { |w| w.length < 3 } # remove short terms
      # String#stem is not defined in this file; presumably provided
      # by rsi/porter -- verify it is loaded before use.
      terms = terms.collect { |w| w.stem }.compact     # stem terms
      terms = terms.collect { |w| w.upcase }
      terms = terms.reject { |t| @stoplist.has_key?(t) } # remove stops
      return terms.uniq
    end

    protected

    # done lazily, to account for changing analyzer
    #
    # @stoplist is set to an empty hash before the stoplist text is
    # tokenized, so the nested initialize_stoplist call made by
    # tokenize_text returns immediately and no stops are applied
    # while the list itself is being normalized.
    def initialize_stoplist()
      return unless @stoplist.nil?
      @stoplist = {}
      tokenize_text( RSI::STOPLIST_s ).each do |x|
        @stoplist[ x ] = 1 if x.length > 0
      end
    end

  end

end
|