SimpleSearch 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
data/README ADDED
@@ -0,0 +1,68 @@
1
+ ===SimpleSearch - Simple vector space search library
2
+
3
+ ==What is SimpleSearch?
4
+ -----
5
+
6
+ SimpleSearch is a simple vector space text search engine.
7
+
8
+ ==Installation
9
+ -----
10
+
11
+ Prerequisites
12
+
13
+ * Ruby 1.8 (http://www.ruby-lang.org/)
14
+
15
+ Optional
16
+
17
+ * RubyGems (http://rubygems.rubyforge.org)
18
+
19
+ ==Installing SimpleSearch
20
+ -----
21
+
22
+ RubyGems (http://rubygems.rubyforge.org):
23
+
24
+ gem install SimpleSearch
25
+
26
+ ...or...
27
+
28
+ .tar.gz installation:
29
+
30
+ ruby setup.rb #not yet available
31
+
32
+
33
+ ==Using SimpleSearch
34
+ -----
35
+
36
+ SimpleSearch comes with a command line program that was primarily written as an example of how to use the API but might actually be useful.
37
+
38
+ To run the command line program, simply type:
39
+ $ search-simple --help
40
+
41
+ An example:
42
+ $ search-simple --cache=/tmp/mycache --dir=/usr/local/lib/ruby/gems/1.8/doc --extensions=html --terms=markup
43
+
44
+ This will cause search-simple to (re)index all of the files with a .html extension in your RubyGems rdoc directory and then search them for the word "markup". The search indices will be stored in /tmp/mycache.
45
+
46
+ At the heart of SimpleSearch is, of course, an API that can be embedded in other programs. The code of SimpleSearch was originally created by Dave Thomas as a search mechanism for his RubLog (http://rubyforge.org/projects/rublog) weblogging package. The API can be used as follows:
47
+
48
+ require 'search/simple'
49
+ contents = Search::Simple::Contents.new
50
+ # silly example
51
+ Dir['**/*'].each do |file_name|
52
+ File.open(file_name) do |file|
53
+ contents << Search::Simple::Content.new(file.read, File.expand_path(file_name), file.mtime)
54
+ end
55
+ end
56
+ s = Search::Simple::Searcher.load(contents, "/tmp/search_cache")
57
+ sr = s.find_words(['some', 'keywords', 'to', 'search', 'for'])
58
+ if sr.contains_matches
59
+ sr.results.sort.each do |res|
60
+ puts "#{res.score}:#{res.name}"
61
+ end
62
+ else
63
+ puts "No matches"
64
+ end
65
+
66
+ ==Credits
67
+ ------
68
+ Almost all of this code was written by Dave Thomas (http://pragprog.com/pragdave). The original code was a complete rewrite of an attempt that Chad Fowler (http://www.chadfowler.com) made at a vector space search for RubLog. Chad Fowler then adapted Dave's working RubLog code to be RubLog-independent and created what is now SimpleSearch out of it.
@@ -0,0 +1,42 @@
1
+ #!/usr/bin/env ruby
2
+ require 'optparse'
3
+ require 'search/simple'
4
+
5
# Parse the command line switches into a hash.
#
# argv - array of command line words; recognised switches are removed
#        from it in place (defaults to ARGV).
#
# Returns a hash that may contain :cachefile, :extensions, :directory and
# :terms, each holding the raw string given on the command line. Switches
# that were not supplied are simply absent.
#
# "--help" prints the option summary and exits. A malformed command line is
# reported as a warning and whatever was parsed up to that point is returned,
# which is what the block form of ARGV.options did.
def parse_options(argv = ARGV)
  parsed = {}
  parser = OptionParser.new do |opts|
    opts.on_tail("--help", "show this message") do
      puts opts
      # Stop here; otherwise the script would carry on and also print its
      # "missing terms" usage error.
      exit
    end
    # Explicit block parameters: the old `{ |options[:key]| }` form is only
    # accepted by Ruby 1.8.
    opts.on('-cCACHEFILE', '--cache=CACHEFILE', "Location of the search cache (defaults to /tmp/search_cache)") { |value| parsed[:cachefile] = value }
    opts.on('-eEXTENSIONS', '--extensions=EXTENSIONS', "Comma separated list of file name extensions to include in the search/index") { |value| parsed[:extensions] = value }
    opts.on('-dCONTENTDIR', '--dir=CONTENTDIR', "Directory from which to get the content to index") { |value| parsed[:directory] = value }
    opts.on('-tTERMS', '--terms=TERMS', "Comma separated list of words to search for") { |value| parsed[:terms] = value }
  end

  begin
    parser.parse!(argv)
  rescue OptionParser::ParseError => e
    parser.warn(e)
  end
  parsed
end

options = parse_options
14
+
15
# Collect the files that should be indexed.
#
# options - hash; :directory is the root to scan (defaults to ".") and
#           :extensions is a comma separated list of file name endings
#           (defaults to "", which matches every file).
#
# Every non-directory entry found below the root is read completely and
# appended, together with its absolute path and modification time, as a
# Search::Simple::Content to a fresh Search::Simple::Contents, which is
# returned.
def content_for_indexing(options)
  collected = Search::Simple::Contents.new
  suffixes = options[:extensions] || ""
  root = options[:directory] || "."
  # "{a,b}" is glob alternation, so "html,txt" selects names ending in
  # either suffix.
  Dir.glob("#{root}/**/*{#{suffixes}}").each do |path|
    unless File.directory?(path)
      File.open(path) do |io|
        collected << Search::Simple::Content.new(io.read, File.expand_path(path), io.mtime)
      end
    end
  end
  collected
end
27
+
28
# Nothing to search for without --terms: point the user at the help text.
# The program name is taken from $0 so the hint matches however the script
# was installed or invoked.
unless options[:terms]
  puts "Usage: #{File.basename($0)} --help"
  exit 1
end

# Load the searcher from the collected content and the cache location
# (default /tmp/search_cache), then look up the comma separated terms.
s = Search::Simple::Searcher.load(content_for_indexing(options), options[:cachefile] || "/tmp/search_cache")
sr = s.find_words(options[:terms].split(/,/))
if sr.contains_matches
  puts "Score\t#File"
  # Results are printed in their own sort order, one "score<TAB>name" per line.
  sr.results.sort.each do |res|
    puts "#{res.score}\t#{res.name}"
  end
else
  puts "No matches"
end
@@ -0,0 +1 @@
1
+ require 'search/simple/searcher'
@@ -0,0 +1,126 @@
1
+ # Maintain a dictionary mapping words to consecutive integers (the
2
+ # first unique word is 0, the second is 1 and so on)
3
+
4
+ require 'search/simple/porter_stemmer'
5
+ module Search
6
+ module Simple
7
+ class Dictionary
8
+ STOP_WORDS = {
9
+ "a" => 1,
10
+ "again" => 1,
11
+ "all" => 1,
12
+ "along" => 1,
13
+ "also" => 1,
14
+ "an" => 1,
15
+ "and" => 1,
16
+ "arialhelvetica" => 1,
17
+ "as" => 1,
18
+ "at" => 1,
19
+ "but" => 1,
20
+ "by" => 1,
21
+ "came" => 1,
22
+ "can" => 1,
23
+ "cant" => 1,
24
+ "couldnt" => 1,
25
+ "did" => 1,
26
+ "didn" => 1,
27
+ "didnt" => 1,
28
+ "do" => 1,
29
+ "doesnt" => 1,
30
+ "dont" => 1,
31
+ "entrytitledetail" => 1,
32
+ "ever" => 1,
33
+ "first" => 1,
34
+ "fontvariant" => 1,
35
+ "from" => 1,
36
+ "have" => 1,
37
+ "her" => 1,
38
+ "here" => 1,
39
+ "him" => 1,
40
+ "how" => 1,
41
+ "i" => 1,
42
+ "if" => 1,
43
+ "in" => 1,
44
+ "into" => 1,
45
+ "is" => 1,
46
+ "isnt" => 1,
47
+ "it" => 1,
48
+ "itll" => 1,
49
+ "just" => 1,
50
+ "last" => 1,
51
+ "least" => 1,
52
+ "like" => 1,
53
+ "most" => 1,
54
+ "my" => 1,
55
+ "new" => 1,
56
+ "no" => 1,
57
+ "not" => 1,
58
+ "now" => 1,
59
+ "of" => 1,
60
+ "on" => 1,
61
+ "or" => 1,
62
+ "should" => 1,
63
+ "sidebartitl" => 1,
64
+ "sinc" => 1,
65
+ "so" => 1,
66
+ "some" => 1,
67
+ "textdecoration" => 1,
68
+ "th" => 1,
69
+ "than" => 1,
70
+ "that" => 1,
71
+ "the" => 1,
72
+ "their" => 1,
73
+ "then" => 1,
74
+ "those" => 1,
75
+ "to" => 1,
76
+ "told" => 1,
77
+ "too" => 1,
78
+ "true" => 1,
79
+ "try" => 1,
80
+ "until" => 1,
81
+ "url" => 1,
82
+ "us" => 1,
83
+ "were" => 1,
84
+ "when" => 1,
85
+ "whether" => 1,
86
+ "while" => 1,
87
+ "with" => 1,
88
+ "within" => 1,
89
+ "yes" => 1,
90
+ "you" => 1,
91
+ "youll" => 1,
92
+ }
93
+
94
+ def initialize
95
+ @words = {}
96
+ end
97
+
98
+ def add_word(word)
99
+ word = Stemmable::stem_porter(word)
100
+ if STOP_WORDS[word]
101
+ nil
102
+ else
103
+ @words[word] ||= @words.size
104
+ end
105
+ end
106
+
107
+ def find(word)
108
+ word = Stemmable::stem_porter(word)
109
+ if STOP_WORDS[word]
110
+ nil
111
+ else
112
+ @words[word]
113
+ end
114
+ end
115
+
116
+ def size
117
+ @words.size
118
+ end
119
+
120
+ def dump
121
+ puts @words.keys.sort
122
+ end
123
+
124
+ end
125
+ end
126
+ end
@@ -0,0 +1,220 @@
1
+ #! /local/ruby/bin/ruby
2
+ #
3
+ # $Id: PorterStemmer.rb,v 1.1.1.1 2004/04/17 13:55:20 pragdave Exp $
4
+ #
5
+ # See example usage at the end of this file.
6
+ #
7
+
8
# Porter stemming for English words.
#
# Usable either as a module function, Stemmable.stem_porter("running"), or
# mixed into a string-like class, which then gains a public #stem method.
module Stemmable

  # Cache of words already stemmed (input word => stem), shared by all callers.
  STEMMED = {}

  # Step 2: suffix => replacement, applied when the remaining stem has m > 0.
  STEP_2_LIST = {
    'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
    'izer'=>'ize', 'bli'=>'ble',
    'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
    'ization'=>'ize', 'ation'=>'ate',
    'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
    'ousness'=>'ous', 'aliti'=>'al',
    'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
  }

  # Step 3: suffix => replacement, applied when the remaining stem has m > 0.
  STEP_3_LIST = {
    'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
    'ical'=>'ic', 'ful'=>'', 'ness'=>''
  }


  # Suffixes recognised by step 2 (the keys of STEP_2_LIST).
  SUFFIX_1_REGEXP = /(
                    ational  |
                    tional   |
                    enci     |
                    anci     |
                    izer     |
                    bli      |
                    alli     |
                    entli    |
                    eli      |
                    ousli    |
                    ization  |
                    ation    |
                    ator     |
                    alism    |
                    iveness  |
                    fulness  |
                    ousness  |
                    aliti    |
                    iviti    |
                    biliti   |
                    logi)$/x


  # Suffixes removed outright by step 4 when the remaining stem has m > 1.
  SUFFIX_2_REGEXP = /(
                      al       |
                      ance     |
                      ence     |
                      er       |
                      ic       |
                      able     |
                      ible     |
                      ant      |
                      ement    |
                      ment     |
                      ent      |
                      ou       |
                      ism      |
                      ate      |
                      iti      |
                      ous      |
                      ive      |
                      ize)$/x


  C = "[^aeiou]"         # consonant
  V = "[aeiouy]"         # vowel
  CC = "#{C}(?>[^aeiouy]*)"  # consonant sequence
  VV = "#{V}(?>[aeiou]*)"    # vowel sequence

  MGR0 = /^(#{CC})?#{VV}#{CC}/o                # [cc]vvcc... is m>0
  MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o       # [cc]vvcc[vv] is m=1
  MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o      # [cc]vvccvvcc... is m>1
  VOWEL_IN_STEM   = /^(#{CC})?#{V}/o           # vowel in stem

  #
  # Porter stemmer in Ruby.
  #
  # This is the Porter stemming algorithm, ported to Ruby from the
  # version coded up in Perl.  It's easy to follow against the rules
  # in the original paper in:
  #
  #   Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
  #   no. 3, pp 130-137,
  #
  # See also http://www.tartarus.org/~martin/PorterStemmer
  #
  # Send comments to raypereda@hotmail.com
  #
  # w - the word to stem. When called as an instance method without an
  #     argument, a copy of the receiver (via to_str) is used.
  #
  # Returns the stem as a string. Words shorter than three characters are
  # returned as given. Results are memoised in STEMMED; for a word that was
  # stemmed before, the cached string object itself is returned.
  # The argument is never modified.
  #

  def stem_porter(w = self.to_str.dup)

    # remember the word exactly as given; it is the cache key
    original_word = w

    return w if w.length < 3

    result = STEMMED[w]
    return result if result

    # Work on a private copy. Several steps below edit the string in place
    # (w[0]=, chop!, <<); without the copy they would change the caller's
    # string, corrupt the cache key and fail on frozen input.
    w = w.dup

    # now map initial y to Y so that the patterns never treat it as vowel
    w[0] = 'Y' if w[0] == ?y

    # Step 1a
    if w =~ /(ss|i)es$/
      w = $` + $1
    elsif w =~ /([^s])s$/
      w = $` + $1
    end

    # Step 1b
    if w =~ /eed$/
      w.chop! if $` =~ MGR0
    elsif w =~ /(ed|ing)$/
      stem = $`
      if stem =~ VOWEL_IN_STEM
        w = stem
        case w
        when /(at|bl|iz)$/             then w << "e"
        when /([^aeiouylsz])\1$/       then w.chop!
        when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
        end
      end
    end

    # Step 1c: a final y becomes i when the stem contains a vowel
    if w =~ /y$/
      stem = $`
      w = stem + "i" if stem =~ VOWEL_IN_STEM
    end

    # Step 2
    if w =~ SUFFIX_1_REGEXP
      stem = $`
      suffix = $1
      # print "stem= " + stem + "\n" + "suffix=" + suffix + "\n"
      if stem =~ MGR0
        w = stem + STEP_2_LIST[suffix]
      end
    end

    # Step 3
    if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
      stem = $`
      suffix = $1
      if stem =~ MGR0
        w = stem + STEP_3_LIST[suffix]
      end
    end

    # Step 4
    if w =~ SUFFIX_2_REGEXP
      stem = $`
      if stem =~ MGR1
        w = stem
      end
    elsif w =~ /(s|t)(ion)$/
      stem = $` + $1
      if stem =~ MGR1
        w = stem
      end
    end

    #  Step 5
    if w =~ /e$/
      stem = $`
      if (stem =~ MGR1) ||
          (stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
        w = stem
      end
    end

    if w =~ /ll$/ && w =~ MGR1
      w.chop!
    end

    # and turn initial Y back to y
    w[0] = 'y' if w[0] == ?Y

    STEMMED[original_word] = w

    w
  end


  #
  # make the stem_porter the default stem method, just in case we
  # feel like having multiple stemmers available later.
  #
  # The alias is created *before* module_function: module_function turns the
  # instance version of stem_porter private, and an alias copies the
  # visibility its target has at that moment. Aliasing first keeps #stem
  # public for classes that include this module.
  #
  alias stem stem_porter

  module_function :stem_porter

end
200
+
201
+
202
+
203
+ #
204
+ # Make this script executable, and send it words on stdin, one per
205
+ # line, and it will output the stemmed versions to stdout.
206
+ #
207
#
# When this file is run directly, read words from stdin, one per line, and
# print the stemmed version of each to stdout.
#
if $0 == __FILE__ then
  class String
    include Stemmable
    # The mixed-in #stem can end up private (it is aliased from a
    # module_function method), so expose it explicitly.
    public :stem
  end

  # the String class, and any subclasses of it you might have, now know
  # how to stem things.

  $stdin.each_line do |line|
    # Drop the line terminator first: it would otherwise count towards the
    # stemmer's "shorter than three characters" check and turn "as" into "a".
    puts line.chomp.stem
  end
end
219
+
220
+