rsi 0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/LICENSE ADDED
@@ -0,0 +1,25 @@
1
+ Copyright (c) 2005, Gregory D. Fast
2
+ All rights reserved.
3
+
4
+ Redistribution and use in source and binary forms, with or without
5
+ modification, are permitted provided that the following conditions
6
+ are met:
7
+
8
+ * Redistributions of source code must retain the above copyright
9
+ notice, this list of conditions and the following disclaimer.
10
+
11
+ * Redistributions in binary form must reproduce the above copyright
12
+ notice, this list of conditions and the following disclaimer in the
13
+ documentation and/or other materials provided with the distribution.
14
+
15
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
19
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
21
+ TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,24 @@

PKGNAME = rsi
VERSION = $(shell cat version.release)
RELEASE = $(PKGNAME)-$(VERSION)

# Targets that name actions rather than files.  The special target must be
# spelled ".PHONY" (with the leading dot); a bare "PHONY" is an ordinary rule
# and marks nothing as phony.
.PHONY: check.manifest dist test

# Build $(RELEASE).tar.gz from exactly the files listed in Manifest, staging
# them in a temporary $(RELEASE)/ directory that is removed afterwards.
dist: check.manifest version.release
	mkdir $(RELEASE)
	tar cf - `cat Manifest` | tar xvf - -C $(RELEASE)
	tar zcvf $(RELEASE).tar.gz $(RELEASE)
	rm -rf $(RELEASE)

# Fail as soon as any file listed in Manifest is missing.  Without the
# explicit exit, the loop's status is that of the last "test" only, so a
# missing file earlier in the list would go unnoticed.
check.manifest: Manifest
	for i in `cat Manifest`; do test -e $$i || exit 1; done

# Run the unit test suite against the in-tree library.
test:
	ruby -Ilib tests/suite_all.rb


# ;; default ruby-mode has tabs on?  ugh.
# (add-hook 'ruby-mode-hook
#   (lambda ()
#     (setq indent-tabs-mode nil)))
@@ -0,0 +1,30 @@
1
+ LICENSE
2
+ Makefile
3
+ Manifest
4
+ README
5
+ TODO
6
+ rsi.gemspec
7
+ setup.rb
8
+ version.release
9
+ bin/rsi_search.rb
10
+ bin/search_bench.rb
11
+ docs/ATTRIB
12
+ docs/Changes
13
+ docs/Roadmap
14
+ lib/rsi.rb
15
+ lib/rsi/analysis.rb
16
+ lib/rsi/compressed_serializers.rb
17
+ lib/rsi/dictionary.rb
18
+ lib/rsi/index.rb
19
+ lib/rsi/logmanager.rb
20
+ lib/rsi/porter.rb
21
+ lib/rsi/query.rb
22
+ lib/rsi/rsi_intro.rb
23
+ lib/rsi/serializers.rb
24
+ lib/rsi/stoplist.rb
25
+ lib/rsi/stoplist.txt
26
+ tests/suite_all.rb
27
+ tests/t_analysis.rb
28
+ tests/t_index.rb
29
+ tests/t_index_multi.rb
30
+ tests/t_dictionary.rb
data/README ADDED
@@ -0,0 +1,49 @@
1
+ RSI README v0.4
2
+ ===============
3
+
4
+ RSI (Ruby Simple Indexer, or perhaps Really Simple Indexer) is a
5
+ simple full text index.
6
+
7
+ RSI is a simple full text search engine implementation in Ruby. It
8
+ aims to be easily useful within other programs: simple to set up,
9
+ simple to use.
10
+
11
+ An emphasis has been placed on getting functionality out the door,
12
+ rather than heavy optimization (that can come later). It still
13
+ appears to be reasonably fast and efficient (while admitting to have
14
+ not been heavily profiled...).
15
+
16
+ Requirements
17
+ ------------
18
+
19
+ * Ruby 1.8
20
+
21
+ Install
22
+ -------
23
+
24
+ De-compress archive and enter its top directory.
25
+ Then type:
26
+
27
+ ($ su)
28
+ # ruby setup.rb
29
+
30
+ This simple step installs this program under the default
31
+ location of Ruby libraries. You can also install files into
32
+ your favorite directory by supplying setup.rb some options.
33
+ Try "ruby setup.rb --help".
34
+
35
+
36
+ Usage
37
+ -----
38
+
39
+ See rsi_intro.rb .
40
+
41
+
42
+ License
43
+ -------
44
+
45
+ This is free software.
46
+ See the LICENSE file included in this distribution for terms.
47
+
48
+ Copyright 2005, Greg Fast <gdf@speakeasy.net>
49
+
data/TODO ADDED
@@ -0,0 +1,30 @@
1
+ #perf - reduce index size (more!)
2
+ #func - allow fields (date=x, text=x) etc
3
+ func - allow range queries
4
+ #stable - store info on module versions, serialization format, etc in db root
5
+ stable - still need version info stored in root... icky.
6
+ #stable - tests, benchmarking
7
+ longterm - index efficiency
8
+ longterm - benchmarking
9
+
10
+ # dist - name project - rsi?
11
+ # dist - setup.rb config
12
+ dist - gem config - tests, docs
13
+ # dist - license
14
+ # dist2 - docs
15
+
16
+ stable - threadsafe
17
+
18
+ #func - make bz2 optional
19
+
20
+ # dist - finish ATTRIB
21
+ # dist2 - organize file structure (eg, porter.rb!@#)
22
+ # dist - no tabs!@ ARGH!
23
+
24
+ tests - more unit/regression tests
25
+ # idx - format store for multiple indexes
26
+ stable - do something reasonable with logs
27
+
28
+ #dist - whoops. use zlib.rb instead of bz2...
29
+
30
+ dist - include version numbers in source
@@ -0,0 +1,50 @@
#! /opt/ruby-1.8.1/bin/ruby -w
#
# Command-line front end for RSI: either index every plain file in a
# directory, or run a query against an existing index.
#
# Usage:
#   rsi_search.rb [--db /path/to/index/db] --index-dir dir/to/index
#   rsi_search.rb [--db /path/to/index/db] term1 term2 term3
#
# (Not yet implemented:)
#   rsi_search.rb [--db /path/to/index/db] --index-files file file file
#
require 'getoptlong'
require 'rsi'

gopt = GetoptLong.new()
gopt.set_options(
  [ '--index-dir', '-i', GetoptLong::REQUIRED_ARGUMENT],
  [ '--db', '-d', GetoptLong::REQUIRED_ARGUMENT]
)
# lame...
# Parsed options keyed by long option name; '--db' has a default location.
OPT = { '--db' => "/var/tmp/search" }
gopt.each_option {|opt, value| OPT[opt] = value }

indexer = RSI::Indexer.new( OPT['--db'] )
#indexer.serializer = RSI::YAMLSerializer.new()
#indexer.serializer = RSI::CompressedSerializer.new()
indexer.serializer = RSI::NativeSerializer.new() # default
indexer.open()

if OPT.has_key?( '--index-dir' )
  to_index = OPT['--index-dir']
  puts "Indexing #{to_index}..."
  Dir.foreach( to_index ) do |filename|
    next if filename =~ /^\./   # skip dotfiles, "." and ".."
    next if filename =~ /~$/    # skip editor backup files

    fullpath = File.expand_path( File.join( to_index, filename ) )
    # Test the joined path, not the bare entry name: the bare name would be
    # resolved against the current directory rather than to_index, letting
    # subdirectories through to File.read below.
    next if FileTest.directory?( fullpath )

    puts "...#{fullpath}"
    uri = fullpath
    contents = File.read( fullpath )
    indexer.add_document( uri, contents )
  end
  puts "Synching..."
  indexer.flush()
else
  # Everything left in ARGV after option parsing is the query.
  query = ARGV.join(" ")
  puts "Query: #{query}"
  puts indexer.find_all( query )
end

@@ -0,0 +1,47 @@
#! /opt/ruby-1.8.1/bin/ruby
#
# Benchmark the serializers: for each serializer, build a fresh index from
# the files in ./testdocs and report how long indexing plus flushing takes.
#
require 'getoptlong'
require 'benchmark'
require 'rsi'

testdocs = "./testdocs"

# Index location => serializer used to build the index stored there.
dbs = {
  "/var/tmp/search.yaml" => RSI::YAMLSerializer.new(),
  "/var/tmp/search.marshal" => RSI::NativeSerializer.new(),
  "/var/tmp/search.bz2" => RSI::CompressedSerializer.new(),
}


Benchmark.bm do |benchmarker|
  dbs.each do |db, serializer|
    benchmarker.report(db) do

      # Refuse to run against an existing index location.
      if File.exist?( db )
        raise "DB location #{db} exists"
      end

      indexer = RSI::Indexer.new( db )
      # NOTE(review): serializer setup mirrors bin/rsi_search.rb (set the
      # serializer on the indexer, then open()); confirm against
      # RSI::Indexer, which is not visible from this script.
      indexer.serializer = serializer
      indexer.open()

      #puts "Indexing #{testdocs}..."
      Dir.foreach( testdocs ) do |filename|
        next if filename =~ /^\./   # skip dotfiles, "." and ".."
        next if filename =~ /~$/    # skip editor backup files

        fullpath = File.expand_path( File.join( testdocs, filename ) )
        # Test the joined path; the bare entry name would be resolved
        # against the current directory rather than testdocs.
        next if FileTest.directory?( fullpath )

        #puts "...#{fullpath}"
        uri = fullpath
        contents = File.read( fullpath )
        indexer.add_document( uri, contents )
      end
      #puts "Synching..."
      # finish() was renamed to flush() in 0.3 (see docs/Changes).
      indexer.flush()

    end
  end
end

@@ -0,0 +1,14 @@
1
+ Misc Attributions
2
+ -----------------
3
+
4
+ `porter.rb` is Ray Pereda's implementation of the Porter Stemming
5
+ Algorithm (updated by Dave Thomas). The original, and more
6
+ information, is available at http://www.tartarus.org/~martin/PorterStemmer .
7
+
8
+ `stoplist.rb` was generated from a stoplist found at
9
+ http://www.cs.utep.edu/nigel/nlp/ir/stoplist.txt, which in turn was
10
+ attributed as "from Manning and Schutze, 1999, pg 533".
11
+
12
+ `setup.rb` is (of course) from Minero Aoki's project at
13
+ http://i.loveruby.net/en/prog/setup.html .
14
+
@@ -0,0 +1,25 @@
1
+ 0.1 - 11 Jan 2005
2
+ - Initial public release. Basic functions work, lots of nice stuff missing.
3
+
4
+ 0.2 - 12 Jan 2005
5
+ - Doc release. Basic documentation, code rearrangement.
6
+
7
+ 0.3 - 18 Jan 2005
8
+ - Added some unit tests
9
+ - Refactored dictionary.rb, analysis.rb, towards multiple index support
10
+ - API: index.finish() is now index.flush()
11
+ - API: protected internal methods in index, dictionary
12
+
13
+ 0.4 - 04 Feb 2005
14
+ - Improved logging (see rsi/logmanager.rb)
15
+ - Improved gemspec to a releasable state
16
+ - API: multi-field indexes supported
17
+ - API: structured queries supported
18
+ - Index metadata is stored and used on open, eliminating need to
19
+ remember what kind of Serializers, etc, the index was created with.
20
+ - CompressedSerializer moved into its own file, and now uses zlib
21
+ (which is part of Ruby 1.8 stdlib) rather than bz2. The bz2
22
+ serializer is still included, and raises a sensible exception if
23
+ used without BZ2 loaded.
24
+ - More tests
25
+
@@ -0,0 +1,41 @@
1
+ Ye Olde Release Planne
2
+ ----------------------
3
+
4
+ [11 Jan 2005 - released]
5
+ 0.1 - initial public release, basic impl
6
+
7
+ [12 Jan 2005 - released]
8
+ 0.2 - docs - api, usage/tutorial, roadmap
9
+ - make sure source has no tabs (argh!)
10
+ - source layout, module structure
11
+
12
+ [18 Jan 2005 - released]
13
+ 0.3 - gem dist, basic stability issues
14
+ - support multiple indexes (req internal api change) [forward from 0.5]
15
+
16
+ [04 Feb 2005]
17
+ 0.4 - support simple field-based queries, including range queries
18
+ [forward from 0.5]
19
+ - store index db metainfo in metadata, verify on Index#open()
20
+ [pushed back from 0.3]
21
+
22
+ 0.45 - doc update
23
+
24
+ 0.5 - build range index impl [pushed back from 0.4]
25
+ - build compound analyzers
26
+ - (?) build email, html analyzers
27
+ - arch/rationale docs? [pushed back from 0.3]
28
+
29
+ [extreme fuzziness follows]
30
+
31
+ 0.6 - (?) update/migration mechanism for index schema changes
32
+
33
+ 0.7 - reasonably complete test, benchmarking suites
34
+
35
+ 0.8 - support index updates, deletion
36
+
37
+ 1.0 - threadsafe
38
+ - reduce index size as much as possible
39
+ - make searches swift
40
+
41
+ 1.1 - interesting query analyzer
@@ -0,0 +1,40 @@
1
+ #
2
+ # Simple minimal fulltext indexer/search engine.
3
+ #
4
+ # = Synopsis
5
+ #
6
+ # require 'rsi'
7
+ # indexer = RSI::Indexer.new( "/var/db/index" )
8
+ # uri = "file:///Users/gregfast/poo.txt"
9
+ # content = File.read( uri )
10
+ # indexer.add_document( uri, content )
11
+ # indexer.flush()
12
+ #
13
+ # docs = indexer.find_all( "quick brown fox" )
14
+ #
15
+ # = Intro
16
+ #
17
+ # See link:files/lib/rsi/rsi_intro_rb.html .
18
+ #
19
+ # = Author
20
+ #
21
+ # Greg Fast, gdf@speakeasy.net
22
+ #
23
+ # = Copyright
24
+ #
25
+ # Copyright 2005 Greg Fast <gdf@speakeasy.net>
26
+ # See LICENSE file distributed with this software.
27
+ #
28
+ #--
29
+ # $Id: rsi.rb 48 2005-01-14 21:57:07Z gdf $
30
+ #
31
+ require 'logger'
32
+ require 'yaml'
33
+ require 'bz2'
34
+ require 'rsi/porter'
35
+ require 'rsi/stoplist'
36
+ require 'rsi/serializers'
37
+ require 'rsi/dictionary'
38
+ require 'rsi/index'
39
+ require 'rsi/analysis'
40
+
@@ -0,0 +1,79 @@
1
+ #
2
+ # Content/query tokenization classes
3
+ #
4
+ require 'rsi/stoplist'
5
+ require 'rsi/query'
6
+ require 'rsi/logmanager'
7
+
module RSI

  # Constants for field/dictionary types.
  FIELD_TYPE_TEXT = :FIELD_TYPE_TEXT
  FIELD_TYPE_DATE = :FIELD_TYPE_DATE

  # Analyzer that turns free text into normalized terms for a single
  # field named "text".  Terms are stemmed, upcased, and filtered against
  # a stoplist.
  class DefaultTextAnalyzer
    # Presumably supplied by rsi/logmanager (required by this file) -- verify.
    include Loggable

    # Stop-term lookup; anything responding to has_key?(term).  Left nil
    # until the first tokenize_text() call, which then builds the default
    # table from RSI::STOPLIST_s.  Assign before first use to override.
    attr_accessor :stoplist

    def initialize()
      @stoplist = nil
    end

    # Returns a map of fields to field types, for each field returned
    # by this analyzer's tokenize() method.
    # Field names should be safe to be used as file path components.
    def get_field_types()
      return { "text" => RSI::FIELD_TYPE_TEXT }
    end

    # Given a chunk of text content, returns the indexable terms
    # contained in that content, as a map of field name to term list.
    # The content may not be a complete document.
    # Terms are unique within one call only; separate chunks of the same
    # document may yield the same term again.
    # The terms returned will all be set to field 'text'.
    def tokenize( content ) # -> { field, [terms...] }..
      return { "text" => tokenize_text(content) }
    end

    # Builds an RSI::ANDQuery holding one RSI::TermQuery on field 'text'
    # for each term tokenize_text() extracts from the query string.
    def tokenize_query( query )
      q = RSI::ANDQuery.new()
      tokenize_text( query ).each do |t|
        q.add_subquery( RSI::TermQuery.new( 'text', t ) )
      end
      return q
    end

    # Normalizes +content+ into a list of unique terms, in order of first
    # appearance.  Steps: fold "'s" and "n't" contractions, blank out
    # everything but ASCII letters and digits, split on whitespace, drop
    # terms shorter than 3 characters, stem, upcase, drop stoplisted terms.
    # The argument itself is never modified.
    def tokenize_text( content )
      initialize_stoplist()
      # Convert first, then copy: this always yields a private, mutable
      # String, even when content is nil or some other non-String value
      # (dup-then-to_s could hand the in-place edits below a frozen string).
      c = content.to_s.dup
      c.gsub!( /\'s\b/, "s" ) # normalize contractions
      c.gsub!( /n\'t\b/, "nt" )
      c.tr!( "^a-zA-Z0-9", " " ) # thunk non-wordy chars to ws
      terms = c.split().reject { |x| x.length < 3 }      # remove short terms
      terms = terms.collect { |x| x.stem }.compact       # stem terms
      terms = terms.collect { |x| x.upcase }
      terms = terms.reject { |x| @stoplist.has_key?(x) } # remove stops
      return terms.uniq
    end

    protected

    # Builds the default stoplist on first use.  Done lazily, to account
    # for changing analyzer.  @stoplist is set to an empty table *before*
    # the stop words are run through tokenize_text(), so the nested call
    # finds a non-nil stoplist and does not recurse back into this method.
    def initialize_stoplist()
      return unless @stoplist.nil?
      @stoplist = {}
      tokenize_text( RSI::STOPLIST_s ).each do |x|
        @stoplist[ x ] = 1 if x.length > 0
      end
    end

  end

end