SimpleSearch 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README ADDED
@@ -0,0 +1,68 @@
1
+ ===SimpleSearch - Simple vector space search library
2
+
3
+ ==What is SimpleSearch?
4
+ -----
5
+
6
+ SimpleSearch is a simple vector space text search engine.
7
+
8
+ ==Installation
9
+ -----
10
+
11
+ Prerequisites
12
+
13
+ * Ruby 1.8 (http://www.ruby-lang.org/)
14
+
15
+ Optional
16
+
17
+ * RubyGems (http://rubygems.rubyforge.org)
18
+
19
+ ==Installing SimpleSearch
20
+ -----
21
+
22
+ RubyGems (http://rubygems.rubyforge.org):
23
+
24
+ gem install SimpleSearch
25
+
26
+ ...or...
27
+
28
+ .tar.gz installation:
29
+
30
+ ruby setup.rb #not yet available
31
+
32
+
33
+ ==Using SimpleSearch
34
+ -----
35
+
36
+ SimpleSearch comes with a command line program that was primarily written as an example of how to use the API but might actually be useful.
37
+
38
+ To run the command line program, simply type:
39
+ $ search-simple --help
40
+
41
+ An example:
42
+ $ search-simple --cache=/tmp/mycache --dir=/usr/local/lib/ruby/gems/1.8/doc --extensions=html --terms=markup,html
43
+
44
+ This will cause search-simple to (re)index all of the files with a .html extension in your RubyGems rdoc directory and then search them for the words "markup" and "html". The search indices will be stored in /tmp/mycache.
45
+
46
+ At the heart of SimpleSearch is, of course, an API that can be embedded in other programs. The code of SimpleSearch was originally created by Dave Thomas as a search mechanism for his RubLog (http://rubyforge.org/projects/rublog) weblogging package. The API can be used as follows:
47
+
48
+ require 'search/simple'
49
+ contents = Search::Simple::Contents.new
50
+ # silly example
51
+ Dir['**/*'].each do |file_name|
52
+ File.open(file_name) do |file|
53
+ contents << Search::Simple::Content.new(file.read, File.expand_path(file_name), file.mtime)
54
+ end
55
+ end
56
+ s = Search::Simple::Searcher.load(contents, "/tmp/search_cache")
57
+ sr = s.find_words(['some', 'keywords', 'to', 'search', 'for'])
58
+ if sr.contains_matches
59
+ sr.results.sort.each do |res|
60
+ puts "#{res.score}:#{res.name}"
61
+ end
62
+ else
63
+ puts "No matches"
64
+ end
65
+
66
+ ==Credits
67
+ ------
68
+ Almost all of this code was written by Dave Thomas (http://pragprog.com/pragdave). The original code was a complete rewrite of an attempt that Chad Fowler (http://www.chadfowler.com) made at a vector space search for RubLog. Chad Fowler then adapted Dave's working RubLog code to be RubLog-independent and created what is now SimpleSearch out of it.
@@ -0,0 +1,42 @@
1
+ #!/usr/bin/env ruby
2
+ require 'optparse'
3
+ require 'search/simple'
4
+
5
# Command-line options, keyed by symbol (:cachefile, :extensions,
# :directory, :terms). Each value is the raw string given on the command
# line; options that were not given are simply absent from the hash.
options = {}
ARGV.options do |opts|
  # Print the generated help text and stop; there is nothing to search for.
  opts.on_tail("--help", "show this message") do
    puts opts
    exit
  end
  # Explicit block parameters are used because assigning straight into a
  # hash element from the parameter list is only accepted by Ruby 1.8.
  opts.on('-cCACHEFILE', '--cache=CACHEFILE',
          "Location of the search cache (defaults to /tmp/search_cache)") do |value|
    options[:cachefile] = value
  end
  opts.on('-eEXTENSIONS', '--extensions=EXTENSIONS',
          "Comma separated list of file name extensions to include in the search/index") do |value|
    options[:extensions] = value
  end
  opts.on('-dCONTENTDIR', '--dir=CONTENTDIR',
          "Directory from which to get the content to index") do |value|
    options[:directory] = value
  end
  opts.on('-tTERMS', '--terms=TERMS',
          "Comma separated list of words to search for") do |value|
    options[:terms] = value
  end
  opts.parse!
end
14
+
15
# Collects every regular file below the configured directory into a
# Search::Simple::Contents collection.
#
# options - Hash of command-line options. Reads :directory (defaults to
#           ".") and :extensions (defaults to ""); the extensions string
#           is placed inside a glob brace group, so a comma separated
#           list selects any of the listed file name endings.
#
# Returns the populated Search::Simple::Contents instance. Each entry is
# built from the file's text, its absolute path and its modification time.
def content_for_indexing(options)
  contents = Search::Simple::Contents.new
  root = options[:directory] || "."
  endings = options[:extensions] || ""
  pattern = root + "/**/*" + "{#{endings}}"

  # Directories match the glob as well but have no text to index.
  paths = Dir[pattern].reject { |path| File.directory?(path) }
  paths.each do |path|
    File.open(path) do |file|
      contents << Search::Simple::Content.new(file.read, File.expand_path(path), file.mtime)
    end
  end

  contents
end
27
+
28
# Main program: refuse to run without search terms, then load (or build)
# the index and print one "score<TAB>file" line per match.
unless options[:terms]
  puts "Usage: simplesearch --help"
  exit 1
end

cache_path = options[:cachefile] || "/tmp/search_cache"
s = Search::Simple::Searcher.load(content_for_indexing(options), cache_path)

# Terms arrive as a single comma separated string.
sr = s.find_words(options[:terms].split(/,/))
if sr.contains_matches
  puts "Score\tFile"
  sr.results.sort.each do |res|
    puts "#{res.score}\t#{res.name}"
  end
else
  puts "No matches"
end
@@ -0,0 +1 @@
1
+ require 'search/simple/searcher'
@@ -0,0 +1,126 @@
1
+ # Maintain a dictionary mapping words to consecutive integers (the
2
+ # first unique word is 0, the second is 1 and so on)
3
+
4
+ require 'search/simple/porter_stemmer'
5
module Search
  module Simple
    # Maps stemmed words to consecutive integers in order of first
    # appearance (the first unique word is 0, the second is 1 and so on).
    # Stop words are never stored and never found.
    class Dictionary
      # Words excluded from the dictionary. The list mixes plain words
      # ("yes", "textdecoration") with already-stemmed forms ("sinc",
      # "sidebartitl"), so a word is compared both before and after
      # stemming. Frozen so the shared list cannot be altered by accident.
      STOP_WORDS = {
        "a" => 1, "again" => 1, "all" => 1, "along" => 1, "also" => 1,
        "an" => 1, "and" => 1, "arialhelvetica" => 1, "as" => 1, "at" => 1,
        "but" => 1, "by" => 1, "came" => 1, "can" => 1, "cant" => 1,
        "couldnt" => 1, "did" => 1, "didn" => 1, "didnt" => 1, "do" => 1,
        "doesnt" => 1, "dont" => 1, "entrytitledetail" => 1, "ever" => 1,
        "first" => 1, "fontvariant" => 1, "from" => 1, "have" => 1,
        "her" => 1, "here" => 1, "him" => 1, "how" => 1, "i" => 1,
        "if" => 1, "in" => 1, "into" => 1, "is" => 1, "isnt" => 1,
        "it" => 1, "itll" => 1, "just" => 1, "last" => 1, "least" => 1,
        "like" => 1, "most" => 1, "my" => 1, "new" => 1, "no" => 1,
        "not" => 1, "now" => 1, "of" => 1, "on" => 1, "or" => 1,
        "should" => 1, "sidebartitl" => 1, "sinc" => 1, "so" => 1,
        "some" => 1, "textdecoration" => 1, "th" => 1, "than" => 1,
        "that" => 1, "the" => 1, "their" => 1, "then" => 1, "those" => 1,
        "to" => 1, "told" => 1, "too" => 1, "true" => 1, "try" => 1,
        "until" => 1, "url" => 1, "us" => 1, "were" => 1, "when" => 1,
        "whether" => 1, "while" => 1, "with" => 1, "within" => 1,
        "yes" => 1, "you" => 1, "youll" => 1,
      }.freeze

      def initialize
        @words = {}
      end

      # Registers +word+ (after stemming) and returns its integer index.
      # A word that is already known keeps its existing index. Returns nil
      # for stop words, which are not stored.
      def add_word(word)
        stem = indexable_stem(word)
        stem && (@words[stem] ||= @words.size)
      end

      # Returns the index previously assigned to the stem of +word+, or nil
      # when the word is a stop word or has never been added.
      def find(word)
        stem = indexable_stem(word)
        stem && @words[stem]
      end

      # Number of distinct stems stored.
      def size
        @words.size
      end

      # Prints every stored stem, sorted, one per line.
      def dump
        puts @words.keys.sort
      end

      private

      # Returns the stem of +word+, or nil when either the word as given or
      # its stem is a stop word. The raw form is checked first because
      # stemming changes some listed words (e.g. "yes" becomes "ye"), which
      # would otherwise never match the list.
      def indexable_stem(word)
        return nil if STOP_WORDS[word]
        stem = Stemmable::stem_porter(word)
        STOP_WORDS[stem] ? nil : stem
      end

    end
  end
end
@@ -0,0 +1,220 @@
1
+ #! /local/ruby/bin/ruby
2
+ #
3
+ # $Id: PorterStemmer.rb,v 1.1.1.1 2004/04/17 13:55:20 pragdave Exp $
4
+ #
5
+ # See example usage at the end of this file.
6
+ #
7
+
8
# Porter stemming for English words. Call Stemmable.stem_porter(word)
# directly, or include the module into String (or another class that
# responds to to_str) to gain #stem.
module Stemmable

  # Memo of words already stemmed: original word => frozen stem.
  # Entries are never removed.
  STEMMED = {}

  # Step 2 rewrites: suffix => replacement.
  STEP_2_LIST = {
    'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
    'izer'=>'ize', 'bli'=>'ble',
    'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
    'ization'=>'ize', 'ation'=>'ate',
    'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
    'ousness'=>'ous', 'aliti'=>'al',
    'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
  }.freeze

  # Step 3 rewrites: suffix => replacement.
  STEP_3_LIST = {
    'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
    'ical'=>'ic', 'ful'=>'', 'ness'=>''
  }.freeze

  # Suffixes handled by step 2 (the keys of STEP_2_LIST).
  SUFFIX_1_REGEXP = /(
                    ational  |
                    tional   |
                    enci     |
                    anci     |
                    izer     |
                    bli      |
                    alli     |
                    entli    |
                    eli      |
                    ousli    |
                    ization  |
                    ation    |
                    ator     |
                    alism    |
                    iveness  |
                    fulness  |
                    ousness  |
                    aliti    |
                    iviti    |
                    biliti   |
                    logi)$/x

  # Suffixes removed outright by step 4.
  SUFFIX_2_REGEXP = /(
                      al       |
                      ance     |
                      ence     |
                      er       |
                      ic       |
                      able     |
                      ible     |
                      ant      |
                      ement    |
                      ment     |
                      ent      |
                      ou       |
                      ism      |
                      ate      |
                      iti      |
                      ous      |
                      ive      |
                      ize)$/x

  C = "[^aeiou]"             # consonant
  V = "[aeiouy]"             # vowel
  CC = "#{C}(?>[^aeiouy]*)"  # consonant sequence
  VV = "#{V}(?>[aeiou]*)"    # vowel sequence

  MGR0 = /^(#{CC})?#{VV}#{CC}/o                # [cc]vvcc... is m>0
  MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o       # [cc]vvcc[vv] is m=1
  MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o      # [cc]vvccvvcc... is m>1
  VOWEL_IN_STEM   = /^(#{CC})?#{V}/o           # vowel in stem

  # Whole stem is a consonant sequence, one vowel, then a single
  # consonant other than w, x or y.
  SHORT_STEM = /^#{CC}#{V}[^aeiouwxy]$/o

  #
  # Porter stemmer in Ruby.
  #
  # This is the Porter stemming algorithm, ported to Ruby from the
  # version coded up in Perl.  It's easy to follow against the rules
  # in the original paper in:
  #
  #   Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
  #   no. 3, pp 130-137,
  #
  # See also http://www.tartarus.org/~martin/PorterStemmer
  #
  # Send comments to raypereda@hotmail.com
  #
  # w - the word to stem; defaults to the receiver's to_str when the
  #     module is mixed into a string-like class.
  #
  # Returns a new String holding the stem. Words shorter than three
  # characters are returned unchanged (as a copy). The argument itself is
  # never modified.
  #
  def stem_porter(w = self.to_str.dup)

    # Keep the caller's object as the cache key and do all the in-place
    # edits below (w[0]=, chop!, <<) on a private copy.
    original_word = w
    w = w.dup

    return w if w.length < 3

    # Hand out a copy so callers cannot alter the cached stem.
    cached = STEMMED[original_word]
    return cached.dup if cached

    # now map initial y to Y so that the patterns never treat it as vowel
    w[0] = 'Y' if w[0] == ?y

    # Step 1a
    if w =~ /(ss|i)es$/
      w = $` + $1
    elsif w =~ /([^s])s$/
      w = $` + $1
    end

    # Step 1b
    if w =~ /eed$/
      w.chop! if $` =~ MGR0
    elsif w =~ /(ed|ing)$/
      stem = $`
      if stem =~ VOWEL_IN_STEM
        w = stem
        case w
        when /(at|bl|iz)$/             then w << "e"
        when /([^aeiouylsz])\1$/       then w.chop!
        when SHORT_STEM                then w << "e"
        end
      end
    end

    # Step 1c
    if w =~ /y$/
      stem = $`
      w = stem + "i" if stem =~ VOWEL_IN_STEM
    end

    # Step 2
    if w =~ SUFFIX_1_REGEXP
      stem = $`
      suffix = $1
      if stem =~ MGR0
        w = stem + STEP_2_LIST[suffix]
      end
    end

    # Step 3
    if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
      stem = $`
      suffix = $1
      if stem =~ MGR0
        w = stem + STEP_3_LIST[suffix]
      end
    end

    # Step 4
    if w =~ SUFFIX_2_REGEXP
      stem = $`
      if stem =~ MGR1
        w = stem
      end
    elsif w =~ /(s|t)(ion)$/
      stem = $` + $1
      if stem =~ MGR1
        w = stem
      end
    end

    # Step 5
    if w =~ /e$/
      stem = $`
      if (stem =~ MGR1) ||
          (stem =~ MEQ1 && stem !~ SHORT_STEM)
        w = stem
      end
    end

    if w =~ /ll$/ && w =~ MGR1
      w.chop!
    end

    # and turn initial Y back to y
    w[0] = 'y' if w[0] == ?Y

    STEMMED[original_word] = w.dup.freeze

    w
  end

  #
  # make the stem_porter the default stem method, just in case we
  # feel like having multiple stemmers available later.
  #
  # The alias is created before module_function because an alias copies
  # the visibility of its target, and module_function makes the instance
  # method private; aliasing afterwards would leave #stem uncallable on
  # objects that include this module.
  #
  alias stem stem_porter

  module_function :stem_porter

end
200
+
201
+
202
+
203
+ #
204
+ # Make this script executable, and send it words on stdin, one per
205
+ # line, and it will output the stemmed versions to stdout.
206
+ #
207
# When run as a script: read words from stdin, one per line, and print the
# stem of each to stdout.
if $0 == __FILE__ then
  class String
    include Stemmable
    # `stem` is an alias of a method that module_function made private, and
    # an alias keeps its target's visibility; expose it for explicit calls.
    public :stem
  end

  # the String class, and any subclasses of it you might have, now know
  # how to stem things.

  $stdin.each do |word|
    # Strip the line terminator first: the stemmer trims characters from
    # the end of the word and would otherwise remove the newline instead
    # of a letter (e.g. for "agreed\n").
    puts word.chomp.stem
  end
end
219
+
220
+