rsi 0.4

Sign up to get free protection for your applications and to get access to all the features.
data/LICENSE ADDED
@@ -0,0 +1,25 @@
1
+ Copyright (c) 2005, Gregory D. Fast
2
+ All rights reserved.
3
+
4
+ Redistribution and use in source and binary forms, with or without
5
+ modification, are permitted provided that the following conditions
6
+ are met:
7
+
8
+ * Redistributions of source code must retain the above copyright
9
+ notice, this list of conditions and the following disclaimer.
10
+
11
+ * Redistributions in binary form must reproduce the above copyright
12
+ notice, this list of conditions and the following disclaimer in the
13
+ documentation and/or other materials provided with the distribution.
14
+
15
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
19
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
21
+ TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,24 @@
1
+
2
# Release packaging and test driver for the rsi distribution.
PKGNAME = rsi
# The release number is read from the version.release file.
VERSION = $(shell cat version.release)
RELEASE = $(PKGNAME)-$(VERSION)

# The special target is spelled ".PHONY" (leading dot); without the dot
# make just sees an ordinary target called "PHONY".  "test" is listed so
# that a file or directory of that name can never mask the rule.
.PHONY: check.manifest dist test

# Stage every file named in Manifest under $(RELEASE)/, archive the
# staging directory as $(RELEASE).tar.gz, then remove the staging copy.
dist: check.manifest version.release
	mkdir $(RELEASE)
	tar cf - `cat Manifest` | tar xvf - -C $(RELEASE)
	tar zcvf $(RELEASE).tar.gz $(RELEASE)
	rm -rf $(RELEASE)

# Verify that every file named in Manifest exists.  A shell "for" loop
# returns the status of its last command only, so bail out explicitly on
# the first missing file instead of relying on the loop's exit status.
check.manifest: Manifest
	for i in `cat Manifest`; do test -e $$i || { echo "missing: $$i" >&2; exit 1; }; done

# Run the full unit test suite against the in-tree library.
test:
	ruby -Ilib tests/suite_all.rb
19
+
20
+
21
+ # ;; default ruby-mode has tabs on? ugh.
22
+ # (add-hook 'ruby-mode-hook
23
+ # (lambda ()
24
+ # (setq indent-tabs-mode nil)))
@@ -0,0 +1,30 @@
1
+ LICENSE
2
+ Makefile
3
+ Manifest
4
+ README
5
+ TODO
6
+ rsi.gemspec
7
+ setup.rb
8
+ version.release
9
+ bin/rsi_search.rb
10
+ bin/search_bench.rb
11
+ docs/ATTRIB
12
+ docs/Changes
13
+ docs/Roadmap
14
+ lib/rsi.rb
15
+ lib/rsi/analysis.rb
16
+ lib/rsi/compressed_serializers.rb
17
+ lib/rsi/dictionary.rb
18
+ lib/rsi/index.rb
19
+ lib/rsi/logmanager.rb
20
+ lib/rsi/porter.rb
21
+ lib/rsi/query.rb
22
+ lib/rsi/rsi_intro.rb
23
+ lib/rsi/serializers.rb
24
+ lib/rsi/stoplist.rb
25
+ lib/rsi/stoplist.txt
26
+ tests/suite_all.rb
27
+ tests/t_analysis.rb
28
+ tests/t_index.rb
29
+ tests/t_index_multi.rb
30
+ tests/t_dictionary.rb
data/README ADDED
@@ -0,0 +1,49 @@
1
+ RSI README v0.4
2
+ ===============
3
+
4
+ RSI (Ruby Simple Indexer, or perhaps Really Simple Indexer) is a
5
+ simple full text index.
6
+
7
+ RSI is a simple full text search engine implementation in Ruby. It
8
+ aims to be easily useful within other programs: simple to set up,
9
+ simple to use.
10
+
11
+ An emphasis has been placed on getting functionality out the door,
12
+ rather than heavy optimization (that can come later). It still
13
+ appears to be reasonably fast and efficient (while admitting to have
14
+ not been heavily profiled...).
15
+
16
+ Requirements
17
+ ------------
18
+
19
+ * Ruby 1.8
20
+
21
+ Install
22
+ -------
23
+
24
+ Decompress the archive and enter its top directory.
25
+ Then type:
26
+
27
+ ($ su)
28
+ # ruby setup.rb
29
+
30
+ These simple steps install this program under the default
31
+ location of Ruby libraries. You can also install files into
32
+ your favorite directory by supplying setup.rb some options.
33
+ Try "ruby setup.rb --help".
34
+
35
+
36
+ Usage
37
+ -----
38
+
39
+ See lib/rsi/rsi_intro.rb.
40
+
41
+
42
+ License
43
+ -------
44
+
45
+ This is free software.
46
+ See the LICENSE file included in this distribution for terms.
47
+
48
+ Copyright 2005, Greg Fast <gdf@speakeasy.net>
49
+
data/TODO ADDED
@@ -0,0 +1,30 @@
1
+ #perf - reduce index size (more!)
2
+ #func - allow fields (date=x, text=x) etc
3
+ func - allow range queries
4
+ #stable - store info on module versions, serialization format, etc in db root
5
+ stable - still need version info stored in root... icky.
6
+ #stable - tests, benchmarking
7
+ longterm - index efficiency
8
+ longterm - benchmarking
9
+
10
+ # dist - name project - rsi?
11
+ # dist - setup.rb config
12
+ dist - gem config - tests, docs
13
+ # dist - license
14
+ # dist2 - docs
15
+
16
+ stable - threadsafe
17
+
18
+ #func - make bz2 optional
19
+
20
+ # dist - finish ATTRIB
21
+ # dist2 - organize file structure (eg, porter.rb!@#)
22
+ # dist - no tabs!@ ARGH!
23
+
24
+ tests - more unit/regression tests
25
+ # idx - format store for multiple indexes
26
+ stable - do something reasonable with logs
27
+
28
+ #dist - whoops. use zlib.rb instead of bz2...
29
+
30
+ dist - include version numbers in source
@@ -0,0 +1,50 @@
1
#! /opt/ruby-1.8.1/bin/ruby -w
#
# Command-line front end for an RSI index: either indexes the plain
# files of one directory, or runs a query and prints the result.
#
# Usage:
#   rsi_search.rb [--db /path/to/index/db] --index-dir dir/to/index
#   rsi_search.rb [--db /path/to/index/db] term1 term2 term3
#
# (Not yet implemented:)
#   rsi_search.rb [--db /path/to/index/db] --index-files file file file
#
require 'getoptlong'
require 'rsi'

gopt = GetoptLong.new()
gopt.set_options(
  [ '--index-dir', '-i', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--db',        '-d', GetoptLong::REQUIRED_ARGUMENT ]
)
# Start from the defaults; every option given on the command line
# overrides (or adds to) the entry keyed by its long name.
OPT = { '--db' => "/var/tmp/search" }
gopt.each_option { |opt, value| OPT[opt] = value }

indexer = RSI::Indexer.new( OPT['--db'] )
#indexer.serializer = RSI::YAMLSerializer.new()
#indexer.serializer = RSI::CompressedSerializer.new()
indexer.serializer = RSI::NativeSerializer.new()  # default
indexer.open()

if OPT.has_key?( '--index-dir' )
  to_index = OPT['--index-dir']
  puts "Indexing #{to_index}..."
  Dir.foreach( to_index ) do |filename|
    next if filename =~ /^\./   # dotfiles, including "." and ".."
    next if filename =~ /~$/    # editor backup files

    fullpath = File.expand_path( File.join( to_index, filename ) )
    # Dir.foreach yields bare entry names, so the directory test has to
    # be made on the joined path; testing the bare name would resolve it
    # against the current working directory instead of to_index.
    next if File.directory?( fullpath )

    puts "...#{fullpath}"
    uri = fullpath   # the absolute path doubles as the document URI
    contents = File.read( fullpath )
    indexer.add_document( uri, contents )
  end
  puts "Synching..."
  indexer.flush()
else
  # Everything left in ARGV after option parsing is the query.
  query = ARGV.join(" ")
  puts "Query: #{query}"
  puts indexer.find_all( query )
end
49
+
50
+
@@ -0,0 +1,47 @@
1
#! /opt/ruby-1.8.1/bin/ruby
#
# Benchmark the serializers
#
# Indexes every plain file found in ./testdocs once per serializer,
# each into its own (not yet existing) index location, and reports the
# time taken for each run.
#
require 'getoptlong'
require 'benchmark'
require 'rsi'

testdocs = "./testdocs"

# index location => serializer to benchmark at that location
dbs = {
  "/var/tmp/search.yaml"    => RSI::YAMLSerializer.new(),
  "/var/tmp/search.marshal" => RSI::NativeSerializer.new(),
  "/var/tmp/search.bz2"     => RSI::CompressedSerializer.new(),
};


Benchmark.bm do |benchmarker|
  dbs.each do |db, serializer|
    benchmarker.report(db) do

      # Refuse to run over a location that already exists.
      if File.exist?( db )
        raise "DB location #{db} exists"
      end

      # NOTE(review): serializer=/open()/flush() mirror the calls made by
      # bin/rsi_search.rb, and docs/Changes records finish() being renamed
      # to flush() in 0.3 -- confirm against lib/rsi/index.rb.
      indexer = RSI::Indexer.new( db )
      indexer.serializer = serializer
      indexer.open()

      #puts "Indexing #{testdocs}..."
      Dir.foreach( testdocs ) do |filename|
        next if filename =~ /^\./   # dotfiles, including "." and ".."
        next if filename =~ /~$/    # editor backup files

        fullpath = File.expand_path( File.join( testdocs, filename ) )
        # Dir.foreach yields bare entry names; test the joined path so
        # the check is not made relative to the working directory.
        next if File.directory?( fullpath )

        #puts "...#{fullpath}"
        uri = fullpath
        contents = File.read( fullpath )
        indexer.add_document( uri, contents )
      end
      #puts "Synching..."
      indexer.flush()

    end
  end
end
47
+
@@ -0,0 +1,14 @@
1
+ Misc Attributions
2
+ -----------------
3
+
4
+ `porter.rb` is Ray Pereda's implementation of the Porter Stemming
5
+ Algorithm (updated by Dave Thomas). The original, and more
6
+ information, is available at http://www.tartarus.org/~martin/PorterStemmer .
7
+
8
+ `stoplist.rb` was generated from a stoplist found at
9
+ http://www.cs.utep.edu/nigel/nlp/ir/stoplist.txt, which in turn was
10
+ attributed as "from Manning and Schutze, 1999, pg 533".
11
+
12
+ `setup.rb` is (of course) from Minero Aoki's project at
13
+ http://i.loveruby.net/en/prog/setup.html .
14
+
@@ -0,0 +1,25 @@
1
+ 0.1 - 11 Jan 2005
2
+ - Initial public release. Basic functions work, lots of nice stuff missing.
3
+
4
+ 0.2 - 12 Jan 2005
5
+ - Doc release. Basic documentation, code rearrangement.
6
+
7
+ 0.3 - 18 Jan 2005
8
+ - Added some unit tests
9
+ - Refactored dictionary.rb, analysis.rb, towards multiple index support
10
+ - API: index.finish() is now index.flush()
11
+ - API: protected internal methods in index, dictionary
12
+
13
+ 0.4 - 04 Feb 2005
14
+ - Improved logging (see rsi/logmanager.rb)
15
+ - Improved gemspec to a releasable state
16
+ - API: multi-field indexes supported
17
+ - API: structured queries supported
18
+ - Index metadata is stored and used on open, eliminating need to
19
+ remember what kind of Serializers, etc, the index was created with.
20
+ - CompressedSerializer moved into its own file, and now uses zlib
21
+ (which is part of Ruby 1.8 stdlib) rather than bz2. The bz2
22
+ serializer is still included, and raises a sensible exception if
23
+ used without BZ2 loaded.
24
+ - More tests
25
+
@@ -0,0 +1,41 @@
1
+ Ye Olde Release Planne
2
+ ----------------------
3
+
4
+ [11 Jan 2005 - released]
5
+ 0.1 - initial public release, basic impl
6
+
7
+ [12 Jan 2005 - released]
8
+ 0.2 - docs - api, usage/tutorial, roadmap
9
+ - make sure source has no tabs (argh!)
10
+ - source layout, module structure
11
+
12
+ [18 Jan 2005 - released]
13
+ 0.3 - gem dist, basic stability issues
14
+ - support multiple indexes (req internal api change) [forward from 0.5]
15
+
16
+ [04 Feb 2005]
17
+ 0.4 - support simple field-based queries, including range queries
18
+ [forward from 0.5]
19
+ - store index db metainfo in metadata, verify on Index#open()
20
+ [pushed back from 0.3]
21
+
22
+ 0.45 - doc update
23
+
24
+ 0.5 - build range index impl [pushed back from 0.4]
25
+ - build compound analyzers
26
+ - (?) build email, html analyzers
27
+ - arch/rationale docs? [pushed back from 0.3]
28
+
29
+ [extreme fuzziness follows]
30
+
31
+ 0.6 - (?) update/migration mechanism for index schema changes
32
+
33
+ 0.7 - reasonably complete test, benchmarking suites
34
+
35
+ 0.8 - support index updates, deletion
36
+
37
+ 1.0 - threadsafe
38
+ - reduce index size as much as possible
39
+ - make searches swift
40
+
41
+ 1.1 - interesting query analyzer
@@ -0,0 +1,40 @@
1
+ #
2
+ # Simple minimal fulltext indexer/search engine.
3
+ #
4
+ # = Synopsis
5
+ #
6
+ # require 'rsi'
7
+ # indexer = RSI::Indexer.new( "/var/db/index" )
8
+ # uri = "file:///Users/gregfast/poo.txt"
9
+ # content = File.read( uri )
10
+ # indexer.add_document( uri, content )
11
+ # indexer.flush()
12
+ #
13
+ # docs = indexer.find_all( "quick brown fox" )
14
+ #
15
+ # = Intro
16
+ #
17
+ # See link:files/lib/rsi/rsi_intro_rb.html .
18
+ #
19
+ # = Author
20
+ #
21
+ # Greg Fast, gdf@speakeasy.net
22
+ #
23
+ # = Copyright
24
+ #
25
+ # Copyright 2005 Greg Fast <gdf@speakeasy.net>
26
+ # See LICENSE file distributed with this software.
27
+ #
28
+ #--
29
+ # $Id: rsi.rb 48 2005-01-14 21:57:07Z gdf $
30
+ #
31
+ require 'logger'
32
+ require 'yaml'
33
+ require 'bz2'
34
+ require 'rsi/porter'
35
+ require 'rsi/stoplist'
36
+ require 'rsi/serializers'
37
+ require 'rsi/dictionary'
38
+ require 'rsi/index'
39
+ require 'rsi/analysis'
40
+
@@ -0,0 +1,79 @@
1
+ #
2
+ # Content/query tokenization classes
3
+ #
4
+ require 'rsi/stoplist'
5
+ require 'rsi/query'
6
+ require 'rsi/logmanager'
7
+
8
module RSI

  # Constants for field/dictionary types.
  FIELD_TYPE_TEXT = :FIELD_TYPE_TEXT
  FIELD_TYPE_DATE = :FIELD_TYPE_DATE

  # Analyzer that treats all content as a single field named "text":
  # content is split into alphanumeric terms, which are then stemmed,
  # upcased, stop-word filtered and de-duplicated.
  class DefaultTextAnalyzer
    include Loggable

    # Hash keyed by the (stemmed, upcased) terms to drop.  Left nil until
    # first use, at which point it is built from RSI::STOPLIST_s.
    attr_accessor :stoplist

    def initialize()
      @stoplist = nil
    end

    # Returns a map of fields to field types, for each field returned
    # by this analyzer's tokenize() method.
    # Field names should be safe to be used as file path components.
    def get_field_types()
      return { "text" => RSI::FIELD_TYPE_TEXT }
    end

    # Given a chunk of text content, returns a map of field name to the
    # list of indexable terms contained in that content.
    # The content may not be a complete document.
    # All terms are filed under the single field 'text'.
    def tokenize( content ) # -> { field, [terms...] }..
      return { "text" => tokenize_text(content) }
    end

    # Builds an RSI::ANDQuery holding one RSI::TermQuery on field 'text'
    # for every term that tokenize_text() extracts from the query string.
    def tokenize_query( query )
      q = RSI::ANDQuery.new()
      tokenize_text( query ).each do |t|
        q.add_subquery( RSI::TermQuery.new( 'text', t ) )
      end
      return q
    end

    # Reduces content to an array of distinct index terms, in order of
    # first appearance.  Terms shorter than three characters and terms
    # present in the stoplist are dropped; the rest are stemmed and
    # upcased.
    def tokenize_text( content )
      initialize_stoplist()
      # Coerce first, then copy: the in-place edits below must not touch
      # the caller's string, and to_s-before-dup also accepts values
      # (nil, symbols, ...) that cannot be dup'ed on every Ruby version.
      c = content.to_s.dup
      c.gsub!( /\'s\b/, "s" )       # normalize contractions
      c.gsub!( /n\'t\b/, "nt" )
      c.tr!( "^a-zA-Z0-9", " " )    # thunk non-wordy chars to ws
      terms = c.split()             # split on whitespace
      terms = terms.reject { |x| x.length < 3 }          # remove short terms
      # String#stem is not defined in this file -- presumably supplied by
      # rsi/porter; nil results are discarded as before.
      terms = terms.collect { |x| x.stem }.compact       # stem terms
      terms = terms.collect { |x| x.upcase }
      terms = terms.reject { |x| @stoplist.has_key?(x) } # remove stops
      return terms.uniq
    end

    protected

    # Builds the stoplist on first use; done lazily, to account for
    # changing analyzer.
    # The stop words go through tokenize_text() themselves so that they
    # get the same stemming/upcasing as indexed terms.  @stoplist is set
    # to an empty hash *before* that call: the nested tokenize_text()
    # re-enters this method, sees a non-nil stoplist and returns at once,
    # which is what stops the mutual recursion.
    def initialize_stoplist()
      return unless @stoplist.nil?
      @stoplist = {}
      tokenize_text( RSI::STOPLIST_s ).each do |x|
        @stoplist[ x ] = 1 if x.length > 0
      end
    end

  end

end