rsi 0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +25 -0
- data/Makefile +24 -0
- data/Manifest +30 -0
- data/README +49 -0
- data/TODO +30 -0
- data/bin/rsi_search.rb +50 -0
- data/bin/search_bench.rb +47 -0
- data/docs/ATTRIB +14 -0
- data/docs/Changes +25 -0
- data/docs/Roadmap +41 -0
- data/lib/rsi.rb +40 -0
- data/lib/rsi/analysis.rb +79 -0
- data/lib/rsi/compressed_serializers.rb +60 -0
- data/lib/rsi/dictionary.rb +232 -0
- data/lib/rsi/index.rb +245 -0
- data/lib/rsi/logmanager.rb +105 -0
- data/lib/rsi/porter.rb +213 -0
- data/lib/rsi/query.rb +98 -0
- data/lib/rsi/rsi_intro.rb +91 -0
- data/lib/rsi/serializers.rb +31 -0
- data/lib/rsi/stoplist.rb +72 -0
- data/lib/rsi/stoplist.txt +59 -0
- data/rsi.gemspec +59 -0
- data/setup.rb +1360 -0
- data/tests/suite_all.rb +14 -0
- data/tests/t_analysis.rb +43 -0
- data/tests/t_dictionary.rb +76 -0
- data/tests/t_index.rb +78 -0
- data/tests/t_index_multi.rb +71 -0
- data/version.release +1 -0
- metadata +72 -0
data/LICENSE
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
Copyright (c) 2005, Gregory D. Fast
|
2
|
+
All rights reserved.
|
3
|
+
|
4
|
+
Redistribution and use in source and binary forms, with or without
|
5
|
+
modification, are permitted provided that the following conditions
|
6
|
+
are met:
|
7
|
+
|
8
|
+
* Redistributions of source code must retain the above copyright
|
9
|
+
notice, this list of conditions and the following disclaimer.
|
10
|
+
|
11
|
+
* Redistributions in binary form must reproduce the above copyright
|
12
|
+
notice, this list of conditions and the following disclaimer in the
|
13
|
+
documentation and/or other materials provided with the distribution.
|
14
|
+
|
15
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
16
|
+
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
17
|
+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
18
|
+
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
19
|
+
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
20
|
+
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
|
21
|
+
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
22
|
+
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
23
|
+
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
24
|
+
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
25
|
+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
data/Makefile
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
|
2
|
+
# Package name and version; the version string is read from version.release.
PKGNAME = rsi
VERSION = $(shell cat version.release)
RELEASE = $(PKGNAME)-$(VERSION)

# These targets name commands, not files, so make must always run them.
.PHONY: check.manifest dist test

# Build $(RELEASE).tar.gz containing exactly the files listed in Manifest.
dist: check.manifest version.release
	mkdir $(RELEASE)
	tar cf - `cat Manifest` | tar xvf - -C $(RELEASE)
	tar zcvf $(RELEASE).tar.gz $(RELEASE)
	rm -rf $(RELEASE)

# Fail if any file listed in Manifest is missing.  The explicit exit is
# required: a shell for-loop only reports the status of its last iteration,
# so a bare `test -e` would miss every missing file except the final one.
check.manifest: Manifest
	for i in `cat Manifest`; do test -e $$i || { echo "missing: $$i" >&2; exit 1; }; done

# Run the unit test suite against the in-tree library.
test:
	ruby -Ilib tests/suite_all.rb


# ;; default ruby-mode has tabs on?  ugh.
# (add-hook 'ruby-mode-hook
#   (lambda ()
#     (setq indent-tabs-mode nil)))
|
data/Manifest
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
LICENSE
|
2
|
+
Makefile
|
3
|
+
Manifest
|
4
|
+
README
|
5
|
+
TODO
|
6
|
+
rsi.gemspec
|
7
|
+
setup.rb
|
8
|
+
version.release
|
9
|
+
bin/rsi_search.rb
|
10
|
+
bin/search_bench.rb
|
11
|
+
docs/ATTRIB
|
12
|
+
docs/Changes
|
13
|
+
docs/Roadmap
|
14
|
+
lib/rsi.rb
|
15
|
+
lib/rsi/analysis.rb
|
16
|
+
lib/rsi/compressed_serializers.rb
|
17
|
+
lib/rsi/dictionary.rb
|
18
|
+
lib/rsi/index.rb
|
19
|
+
lib/rsi/logmanager.rb
|
20
|
+
lib/rsi/porter.rb
|
21
|
+
lib/rsi/query.rb
|
22
|
+
lib/rsi/rsi_intro.rb
|
23
|
+
lib/rsi/serializers.rb
|
24
|
+
lib/rsi/stoplist.rb
|
25
|
+
lib/rsi/stoplist.txt
|
26
|
+
tests/suite_all.rb
|
27
|
+
tests/t_analysis.rb
|
28
|
+
tests/t_index.rb
|
29
|
+
tests/t_index_multi.rb
|
30
|
+
tests/t_dictionary.rb
|
data/README
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
RSI README v0.4
|
2
|
+
===============
|
3
|
+
|
4
|
+
RSI (Ruby Simple Indexer, or perhaps Really Simple Indexer) is a
|
5
|
+
simple full text index.
|
6
|
+
|
7
|
+
RSI is a simple full text search engine implementation in Ruby. It
|
8
|
+
aims to be easily useful within other programs: simple to set up,
|
9
|
+
simple to use.
|
10
|
+
|
11
|
+
An emphasis has been placed on getting functionality out the door,
|
12
|
+
rather than heavy optimization (that can come later). It still
|
13
|
+
appears to be reasonably fast and efficient (while admitting to have
|
14
|
+
not been heavily profiled...).
|
15
|
+
|
16
|
+
Requirements
|
17
|
+
------------
|
18
|
+
|
19
|
+
* Ruby 1.8
|
20
|
+
|
21
|
+
Install
|
22
|
+
-------
|
23
|
+
|
24
|
+
De-compress archive and enter its top directory.
|
25
|
+
Then type:
|
26
|
+
|
27
|
+
($ su)
|
28
|
+
# ruby setup.rb
|
29
|
+
|
30
|
+
This simple step installs this program under the default
|
31
|
+
location of Ruby libraries. You can also install files into
|
32
|
+
your favorite directory by supplying setup.rb some options.
|
33
|
+
Try "ruby setup.rb --help".
|
34
|
+
|
35
|
+
|
36
|
+
Usage
|
37
|
+
-----
|
38
|
+
|
39
|
+
See rsi_intro.rb .
|
40
|
+
|
41
|
+
|
42
|
+
License
|
43
|
+
-------
|
44
|
+
|
45
|
+
This is free software.
|
46
|
+
See the LICENSE file included in this distribution for terms.
|
47
|
+
|
48
|
+
Copyright 2005, Greg Fast <gdf@speakeasy.net>
|
49
|
+
|
data/TODO
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
#perf - reduce index size (more!)
|
2
|
+
#func - allow fields (date=x, text=x) etc
|
3
|
+
func - allow range queries
|
4
|
+
#stable - store info on module versions, serialization format, etc in db root
|
5
|
+
stable - still need version info stored in root... icky.
|
6
|
+
#stable - tests, benchmarking
|
7
|
+
longterm - index efficiency
|
8
|
+
longterm - benchmarking
|
9
|
+
|
10
|
+
# dist - name project - rsi?
|
11
|
+
# dist - setup.rb config
|
12
|
+
dist - gem config - tests, docs
|
13
|
+
# dist - license
|
14
|
+
# dist2 - docs
|
15
|
+
|
16
|
+
stable - threadsafe
|
17
|
+
|
18
|
+
#func - make bz2 optional
|
19
|
+
|
20
|
+
# dist - finish ATTRIB
|
21
|
+
# dist2 - organize file structure (eg, porter.rb!@#)
|
22
|
+
# dist - no tabs!@ ARGH!
|
23
|
+
|
24
|
+
tests - more unit/regression tests
|
25
|
+
# idx - format store for multiple indexes
|
26
|
+
stable - do something reasonable with logs
|
27
|
+
|
28
|
+
#dist - whoops. use zlib.rb instead of bz2...
|
29
|
+
|
30
|
+
dist - include version numbers in source
|
data/bin/rsi_search.rb
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
#! /opt/ruby-1.8.1/bin/ruby -w
#
# Command-line front end for RSI: either indexes every regular file in a
# directory, or runs a query against an existing index.
#
# Usage:
#   rsi_search.rb [--db /path/to/index/db] --index-dir dir/to/index
#   rsi_search.rb [--db /path/to/index/db] term1 term2 term3
#
# (Not yet implemented:)
#   rsi_search.rb [--db /path/to/index/db] --index-files file file file
#
require 'getoptlong'
require 'rsi'

gopt = GetoptLong.new()
gopt.set_options(
  [ '--index-dir', '-i', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--db',        '-d', GetoptLong::REQUIRED_ARGUMENT ]
)
# lame...
# Option values keyed by long option name; '--db' has a default location.
OPT = { '--db' => "/var/tmp/search" }
gopt.each_option { |opt, value| OPT[opt] = value }

indexer = RSI::Indexer.new( OPT['--db'] )
#indexer.serializer = RSI::YAMLSerializer.new()
#indexer.serializer = RSI::CompressedSerializer.new()
indexer.serializer = RSI::NativeSerializer.new() # default
indexer.open()

if OPT.has_key?( '--index-dir' )
  to_index = OPT['--index-dir']
  puts "Indexing #{to_index}..."
  Dir.foreach( to_index ) do |filename|
    next if filename =~ /^\./   # skip dotfiles, including "." and ".."
    next if filename =~ /~$/    # skip editor backup files

    fullpath = File.expand_path( File.join( to_index, filename ) )
    # Test the full path: a bare filename would be resolved against the
    # current working directory instead of against to_index.
    next if FileTest.directory?( fullpath )

    puts "...#{fullpath}"
    uri = fullpath              # the absolute path doubles as the document URI
    contents = File.read( fullpath )
    indexer.add_document( uri, contents )
  end
  puts "Synching..."
  indexer.flush()
else
  # GetoptLong has already consumed the options; what is left is the query.
  query = ARGV.join(" ")
  puts "Query: #{query}"
  puts indexer.find_all( query )
end
|
49
|
+
|
50
|
+
|
data/bin/search_bench.rb
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
#! /opt/ruby-1.8.1/bin/ruby
#
# Benchmark the serializers
#
# Indexes every regular file in ./testdocs once per serializer, each into
# its own database location, and reports the time taken for each.
#
require 'getoptlong'
require 'benchmark'
require 'rsi'

testdocs = "./testdocs"

# Database location => serializer to benchmark at that location.
dbs = {
  "/var/tmp/search.yaml" => RSI::YAMLSerializer.new(),
  "/var/tmp/search.marshal" => RSI::NativeSerializer.new(),
  "/var/tmp/search.bz2" => RSI::CompressedSerializer.new(),
}


Benchmark.bm do |benchmarker|
  dbs.each do |db, serializer|
    benchmarker.report(db) do

      # Refuse to run against an existing location rather than reuse it.
      if FileTest.exist?( db )
        raise "DB location #{db} exists"
      end

      # NOTE(review): this setup sequence (serializer=, then open) mirrors
      # bin/rsi_search.rb -- confirm against RSI::Indexer.
      indexer = RSI::Indexer.new( db )
      indexer.serializer = serializer
      indexer.open()

      #puts "Indexing #{testdocs}..."
      Dir.foreach( testdocs ) do |filename|
        next if filename =~ /^\./   # skip dotfiles, including "." and ".."
        next if filename =~ /~$/    # skip editor backup files

        fullpath = File.expand_path( File.join( testdocs, filename ) )
        # Test the full path: a bare filename would be resolved against
        # the current working directory instead of against testdocs.
        next if FileTest.directory?( fullpath )

        #puts "...#{fullpath}"
        uri = fullpath
        contents = File.read( fullpath )
        indexer.add_document( uri, contents )
      end
      #puts "Synching..."
      # docs/Changes (0.3): index.finish() is now index.flush().
      indexer.flush()

    end
  end
end
|
47
|
+
|
data/docs/ATTRIB
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
Misc Attributions
|
2
|
+
-----------------
|
3
|
+
|
4
|
+
`porter.rb` is Ray Pereda's implementation of the Porter Stemming
|
5
|
+
Algorithm (updated by Dave Thomas). The original, and more
|
6
|
+
information, is available at http://www.tartarus.org/~martin/PorterStemmer .
|
7
|
+
|
8
|
+
`stoplist.rb` was generated from a stoplist found at
|
9
|
+
http://www.cs.utep.edu/nigel/nlp/ir/stoplist.txt, and which in turn was
|
10
|
+
attributed as "from Manning and Schutze, 1999, pg 533".
|
11
|
+
|
12
|
+
`setup.rb` is (of course) from Minero Aoki's project at
|
13
|
+
http://i.loveruby.net/en/prog/setup.html .
|
14
|
+
|
data/docs/Changes
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
0.1 - 11 Jan 2005
|
2
|
+
- Initial public release. Basic functions work, lots of nice stuff missing.
|
3
|
+
|
4
|
+
0.2 - 12 Jan 2005
|
5
|
+
- Doc release. Basic documentation, code rearrangement.
|
6
|
+
|
7
|
+
0.3 - 18 Jan 2005
|
8
|
+
- Added some unit tests
|
9
|
+
- Refactored dictionary.rb, analysis.rb, towards multiple index support
|
10
|
+
- API: index.finish() is now index.flush()
|
11
|
+
- API: protected internal methods in index, dictionary
|
12
|
+
|
13
|
+
0.4 - 04 Feb 2005
|
14
|
+
- Improved logging (see rsi/logmanager.rb)
|
15
|
+
- Improved gemspec to a releasable state
|
16
|
+
- API: multi-field indexes supported
|
17
|
+
- API: structured queries supported
|
18
|
+
- Index metadata is stored and used on open, eliminating need to
|
19
|
+
remember what kind of Serializers, etc, the index was created with.
|
20
|
+
- CompressedSerializer moved into its own file, and now uses zlib
|
21
|
+
(which is part of Ruby 1.8 stdlib) rather than bz2. The bz2
|
22
|
+
serializer is still included, and raises a sensible exception if
|
23
|
+
used without BZ2 loaded.
|
24
|
+
- More tests
|
25
|
+
|
data/docs/Roadmap
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
Ye Olde Release Planne
|
2
|
+
----------------------
|
3
|
+
|
4
|
+
[11 Jan 2005 - released]
|
5
|
+
0.1 - initial public release, basic impl
|
6
|
+
|
7
|
+
[12 Jan 2005 - released]
|
8
|
+
0.2 - docs - api, usage/tutorial, roadmap
|
9
|
+
- make sure source has no tabs (argh!)
|
10
|
+
- source layout, module structure
|
11
|
+
|
12
|
+
[18 Jan 2005 - released]
|
13
|
+
0.3 - gem dist, basic stability issues
|
14
|
+
- support multiple indexes (req internal api change) [forward from 0.5]
|
15
|
+
|
16
|
+
[04 Feb 2005]
|
17
|
+
0.4 - support simple field-based queries, including range queries
|
18
|
+
[forward from 0.5]
|
19
|
+
- store index db metainfo in metadata, verify on Index#open()
|
20
|
+
[pushed back from 0.3]
|
21
|
+
|
22
|
+
0.45 - doc update
|
23
|
+
|
24
|
+
0.5 - build range index impl [pushed back from 0.4]
|
25
|
+
- build compound analyzers
|
26
|
+
- (?) build email, html analyzers
|
27
|
+
- arch/rationale docs? [pushed back from 0.3]
|
28
|
+
|
29
|
+
[extreme fuzziness follows]
|
30
|
+
|
31
|
+
0.6 - (?) update/migration mechanism for index schema changes
|
32
|
+
|
33
|
+
0.7 - reasonably complete test, benchmarking suites
|
34
|
+
|
35
|
+
0.8 - support index updates, deletion
|
36
|
+
|
37
|
+
1.0 - threadsafe
|
38
|
+
- reduce index size as much as possible
|
39
|
+
- make searches swift
|
40
|
+
|
41
|
+
1.1 - interesting query analyzer
|
data/lib/rsi.rb
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
#
|
2
|
+
# Simple minimal fulltext indexer/search engine.
|
3
|
+
#
|
4
|
+
# = Synopsis
|
5
|
+
#
|
6
|
+
# require 'rsi'
|
7
|
+
# indexer = RSI::Indexer.new( "/var/db/index" )
|
8
|
+
# uri = "file:///Users/gregfast/poo.txt"
|
9
|
+
# content = File.read( uri )
|
10
|
+
# indexer.add_document( uri, content )
|
11
|
+
# indexer.flush()
|
12
|
+
#
|
13
|
+
# docs = indexer.find_all( "quick brown fox" )
|
14
|
+
#
|
15
|
+
# = Intro
|
16
|
+
#
|
17
|
+
# See link:files/lib/rsi/rsi_intro_rb.html .
|
18
|
+
#
|
19
|
+
# = Author
|
20
|
+
#
|
21
|
+
# Greg Fast, gdf@speakeasy.net
|
22
|
+
#
|
23
|
+
# = Copyright
|
24
|
+
#
|
25
|
+
# Copyright 2005 Greg Fast <gdf@speakeasy.net>
|
26
|
+
# See LICENSE file distributed with this software.
|
27
|
+
#
|
28
|
+
#--
|
29
|
+
# $Id: rsi.rb 48 2005-01-14 21:57:07Z gdf $
|
30
|
+
#
|
31
|
+
require 'logger'
|
32
|
+
require 'yaml'
|
33
|
+
# bz2 is an optional extension, not part of the Ruby standard library.
# Tolerate its absence so the rest of the library still loads.
begin
  require 'bz2'
rescue LoadError
  # bz2 not installed; carry on without it.
end
|
34
|
+
require 'rsi/porter'
|
35
|
+
require 'rsi/stoplist'
|
36
|
+
require 'rsi/serializers'
|
37
|
+
require 'rsi/dictionary'
|
38
|
+
require 'rsi/index'
|
39
|
+
require 'rsi/analysis'
|
40
|
+
|
data/lib/rsi/analysis.rb
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
#
|
2
|
+
# Content/query tokenization classes
|
3
|
+
#
|
4
|
+
require 'rsi/stoplist'
|
5
|
+
require 'rsi/query'
|
6
|
+
require 'rsi/logmanager'
|
7
|
+
|
8
|
+
module RSI

  # Constants for field/dictionary types.
  FIELD_TYPE_TEXT = :FIELD_TYPE_TEXT
  FIELD_TYPE_DATE = :FIELD_TYPE_DATE

  # Analyzer that treats a document as one field, "text", and reduces it
  # to a list of normalized terms (see tokenize_text for the steps).
  class DefaultTextAnalyzer
    include Loggable

    # Object answering has_key?(term) for terms that must not be indexed.
    # Left nil until first use, when it is built from RSI::STOPLIST_s.
    attr_accessor :stoplist

    def initialize()
      @stoplist = nil
    end

    # Returns a map of fields to field types, for each field returned
    # by this analyzer's tokenize() method.
    # Field names should be safe to be used as file path components.
    def get_field_types()
      return { "text" => RSI::FIELD_TYPE_TEXT }
    end

    # Given a chunk of text content, returns a map of field name to the
    # list of indexable terms contained in that content.
    # The content may not be a complete document.
    # The terms returned are unique within the given chunk.
    # The terms returned will all be set to field 'text'.
    def tokenize( content ) # -> { field, [terms...] }..
      return { "text" => tokenize_text(content) }
    end

    # Builds a query from a free-text query string: an RSI::ANDQuery with
    # one RSI::TermQuery on field 'text' per term of the tokenized query.
    def tokenize_query( query )
      q = RSI::ANDQuery.new()
      tokenize_text( query ).each do |t|
        q.add_subquery( RSI::TermQuery.new( 'text', t ) )
      end
      return q
    end

    # Reduces content to an array of unique, stemmed, upper-cased terms
    # of at least three characters, minus any term found in the stoplist.
    # The caller's string is never modified; nil content yields [].
    def tokenize_text( content )
      initialize_stoplist()
      # Convert first, then copy: to_s copes with nil and non-String
      # input, and dup keeps the in-place edits below off the caller's
      # string.
      c = content.to_s.dup
      c.gsub!( /\'s\b/, "s" ) # normalize contractions
      c.gsub!( /n\'t\b/, "nt" )
      c.tr!( "^a-zA-Z0-9", " " ) # thunk non-wordy chars to ws
      a = c.split() # split on whitespace
      a.reject! { |x| x.length < 3 } # remove short terms
      # NOTE(review): String#stem is not defined in this file; presumably
      # it comes from rsi/porter -- confirm.
      a.collect! { |x| x.stem } # stem terms
      a.compact!
      a.collect! { |x| x.upcase }
      a.reject! { |x| @stoplist.has_key?(x) } # remove stops
      a.uniq!
      return a
    end

    protected

    # done lazily, to account for changing analyzer
    #
    # Fills @stoplist with the tokenized terms of RSI::STOPLIST_s, unless
    # a stoplist is already set.  @stoplist is assigned an empty hash
    # before tokenize_text is called, so the nested initialize_stoplist
    # call returns at once and the stop words themselves pass through
    # with nothing filtered out.
    def initialize_stoplist()
      return unless @stoplist.nil?
      @stoplist = {}
      tokenize_text( RSI::STOPLIST_s ).each do |x|
        @stoplist[ x ] = 1 if x.length > 0
      end
    end

  end

end
|