SimpleSearch 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README +68 -0
- data/bin/simplesearch +42 -0
- data/lib/search/simple.rb +1 -0
- data/lib/search/simple/dictionary.rb +126 -0
- data/lib/search/simple/porter_stemmer.rb +220 -0
- data/lib/search/simple/searcher.rb +191 -0
- data/lib/search/simple/vector.rb +97 -0
- data/setup.rb +1360 -0
- metadata +46 -0
data/README
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
===SimpleSearch - Simple vector space search library
|
2
|
+
|
3
|
+
==What is SimpleSearch?
|
4
|
+
-----
|
5
|
+
|
6
|
+
SimpleSearch is a simple vector space text search engine.
|
7
|
+
|
8
|
+
==Installation
|
9
|
+
-----
|
10
|
+
|
11
|
+
Prerequisites
|
12
|
+
|
13
|
+
* Ruby 1.8 (http://www.ruby-lang.org/)
|
14
|
+
|
15
|
+
Optional
|
16
|
+
|
17
|
+
* RubyGems (http://rubygems.rubyforge.org)
|
18
|
+
|
19
|
+
==Installing SimpleSearch
|
20
|
+
-----
|
21
|
+
|
22
|
+
RubyGems (http://rubygems.rubyforge.org):
|
23
|
+
|
24
|
+
gem install SimpleSearch
|
25
|
+
|
26
|
+
...or...
|
27
|
+
|
28
|
+
.tar.gz installation:
|
29
|
+
|
30
|
+
ruby setup.rb #not yet available
|
31
|
+
|
32
|
+
|
33
|
+
==Using SimpleSearch
|
34
|
+
-----
|
35
|
+
|
36
|
+
SimpleSearch comes with a command line program that was primarily written as an example of how to use the API but might actually be useful.
|
37
|
+
|
38
|
+
To run the command line program, simply type:
|
39
|
+
$ simplesearch --help
|
40
|
+
|
41
|
+
An example:
|
42
|
+
$ simplesearch --cache=/tmp/mycache --dir=/usr/local/lib/ruby/gems/1.8/doc --extensions=html --terms=markup
|
43
|
+
|
44
|
+
This will cause simplesearch to (re)index all of the files with a .html extension in your RubyGems rdoc directory and then search them for the word "markup". The search indices will be stored in /tmp/mycache.
|
45
|
+
|
46
|
+
At the heart of SimpleSearch is, of course, an API that can be embedded in other programs. The code of SimpleSearch was originally created by Dave Thomas as a search mechanism for his RubLog (http://rubyforge.org/projects/rublog) weblogging package. The API can be used as follows:
|
47
|
+
|
48
|
+
require 'search/simple'
|
49
|
+
contents = Search::Simple::Contents.new
|
50
|
+
# silly example
|
51
|
+
Dir['**/*'].each do |file_name|
|
52
|
+
File.open(file_name) do |file|
|
53
|
+
contents << Search::Simple::Content.new(file.read, File.expand_path(file_name), file.mtime)
|
54
|
+
end
|
55
|
+
end
|
56
|
+
s = Search::Simple::Searcher.load(contents, "/tmp/search_cache")
|
57
|
+
sr = s.find_words(['some', 'keywords', 'to', 'search', 'for'])
|
58
|
+
if sr.contains_matches
|
59
|
+
sr.results.sort.each do |res|
|
60
|
+
puts "#{res.score}:#{res.name}"
|
61
|
+
end
|
62
|
+
else
|
63
|
+
puts "No matches"
|
64
|
+
end
|
65
|
+
|
66
|
+
==Credits
|
67
|
+
------
|
68
|
+
Almost all of this code was written by Dave Thomas (http://pragprog.com/pragdave). The original code was a complete rewrite of an attempt that Chad Fowler (http://www.chadfowler.com) made to do a vector space search for RubLog. Chad Fowler adapted Dave's working RubLog code to be RubLog-independent and created what is now SimpleSearch out of it.
|
data/bin/simplesearch
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'optparse'
|
3
|
+
require 'search/simple'
|
4
|
+
|
5
|
+
# Command-line options collected from ARGV, keyed by symbol
# (:cachefile, :extensions, :directory, :terms). Each value is the raw
# string given on the command line; options that were not supplied are
# simply absent from the hash.
options = {}
ARGV.options do |opts|
  # Print the option summary and stop; nothing else should run after --help.
  opts.on_tail("--help", "show this message") do
    puts opts
    exit
  end
  # The handlers take an ordinary block parameter and assign it explicitly:
  # the `{ |options[:key]| }` form is only accepted by Ruby 1.8.
  opts.on('-cCACHEFILE', '--cache=CACHEFILE',
          "Location of the search cache (defaults to /tmp/search_cache)") do |value|
    options[:cachefile] = value
  end
  opts.on('-eEXTENSIONS', '--extensions=EXTENSIONS',
          "Comma separated list of file name extensions to include in the search/index") do |value|
    options[:extensions] = value
  end
  opts.on('-dCONTENTDIR', '--dir=CONTENTDIR',
          "Directory from which to get the content to index") do |value|
    options[:directory] = value
  end
  opts.on('-tTERMS', '--terms=TERMS',
          "Comma separated list of words to search for") do |value|
    options[:terms] = value
  end
  opts.parse!
end
|
14
|
+
|
15
|
+
# Build the collection of documents to index.
#
# options - Hash of command-line options. Reads:
#           :directory  - root directory to scan (defaults to ".").
#           :extensions - comma separated list of file name extensions
#                         (e.g. "html,txt"); when absent or empty every
#                         file under the root is included.
#
# Walks the directory tree recursively, skips directories, and appends one
# Search::Simple::Content (file text, absolute path, modification time) per
# file to a fresh Search::Simple::Contents, which is returned.
def content_for_indexing(options)
  contents = Search::Simple::Contents.new
  root = options[:directory] || "."

  # Normalise the extension list: tolerate spaces after commas and an
  # optional leading dot (".html" and "html" mean the same thing).
  extensions = (options[:extensions] || "").split(",").
    map { |ext| ext.strip.sub(/\A\./, "") }.
    reject { |ext| ext.empty? }

  # Anchor the extensions on a literal dot so that "html" selects "x.html"
  # but not a file that merely ends in the letters "html".
  pattern = extensions.empty? ? "**/*" : "**/*.{#{extensions.join(',')}}"

  Dir[File.join(root, pattern)].each do |file_name|
    next if File.directory?(file_name)
    File.open(file_name) do |file|
      contents << Search::Simple::Content.new(file.read, File.expand_path(file_name), file.mtime)
    end
  end
  contents
end
|
27
|
+
|
28
|
+
# Script body: a search needs at least one term, so without --terms we
# point the user at the help text and exit with a failure status.
if !options[:terms]
  puts "Usage: simplesearch --help"
  exit 1
end

# Load (or rebuild) the index for the selected files, then look up the
# comma separated search terms.
searcher = Search::Simple::Searcher.load(
  content_for_indexing(options),
  options[:cachefile] || "/tmp/search_cache"
)
search_result = searcher.find_words(options[:terms].split(/,/))

# Report one "score<TAB>name" line per hit, in the results' sort order.
if search_result.contains_matches
  require 'pp'
  puts "Score\t#File"
  search_result.results.sort.each { |hit| puts "#{hit.score}\t#{hit.name}" }
else
  puts "No matches"
end
|
@@ -0,0 +1 @@
|
|
1
|
+
require 'search/simple/searcher'
|
@@ -0,0 +1,126 @@
|
|
1
|
+
# Maintain a dictionary mapping words to consecutive integers (the
|
2
|
+
# first unique word is 0, the second is 1 and so on)
|
3
|
+
|
4
|
+
require 'search/simple/porter_stemmer'
|
5
|
+
module Search
  module Simple
    # Maps stemmed words to consecutive integer ids: the first distinct
    # word added gets 0, the next 1, and so on. Words whose stem appears in
    # STOP_WORDS are never stored and never found.
    class Dictionary
      # Stems that are ignored for indexing and searching. Lookups are made
      # with the *stemmed* word, which is why some entries are stems rather
      # than whole words. Only the keys matter; the values are placeholders.
      STOP_WORDS = {
        "a" => 1,
        "again" => 1,
        "all" => 1,
        "along" => 1,
        "also" => 1,
        "an" => 1,
        "and" => 1,
        "arialhelvetica" => 1,
        "as" => 1,
        "at" => 1,
        "but" => 1,
        "by" => 1,
        "came" => 1,
        "can" => 1,
        "cant" => 1,
        "couldnt" => 1,
        "did" => 1,
        "didn" => 1,
        "didnt" => 1,
        "do" => 1,
        "doesnt" => 1,
        "dont" => 1,
        "entrytitledetail" => 1,
        "ever" => 1,
        "first" => 1,
        "fontvariant" => 1,
        "from" => 1,
        "have" => 1,
        "her" => 1,
        "here" => 1,
        "him" => 1,
        "how" => 1,
        "i" => 1,
        "if" => 1,
        "in" => 1,
        "into" => 1,
        "is" => 1,
        "isnt" => 1,
        "it" => 1,
        "itll" => 1,
        "just" => 1,
        "last" => 1,
        "least" => 1,
        "like" => 1,
        "most" => 1,
        "my" => 1,
        "new" => 1,
        "no" => 1,
        "not" => 1,
        "now" => 1,
        "of" => 1,
        "on" => 1,
        "or" => 1,
        "should" => 1,
        "sidebartitl" => 1,
        "sinc" => 1,
        "so" => 1,
        "some" => 1,
        "textdecoration" => 1,
        "th" => 1,
        "than" => 1,
        "that" => 1,
        "the" => 1,
        "their" => 1,
        "then" => 1,
        "those" => 1,
        "to" => 1,
        "told" => 1,
        "too" => 1,
        "true" => 1,
        "try" => 1,
        "until" => 1,
        "url" => 1,
        "us" => 1,
        "were" => 1,
        "when" => 1,
        "whether" => 1,
        "while" => 1,
        "with" => 1,
        "within" => 1,
        "yes" => 1,
        "you" => 1,
        "youll" => 1,
      }.freeze

      def initialize
        # stem (String) => id (Integer)
        @words = {}
      end

      # Register +word+ and return its id. A word whose stem is already
      # known returns the existing id; a stop word returns nil and is not
      # stored.
      def add_word(word)
        stem = stem_unless_stop_word(word)
        return nil unless stem
        @words[stem] ||= @words.size
      end

      # Return the id previously assigned to +word+'s stem, or nil when the
      # word is a stop word or has never been added.
      def find(word)
        stem = stem_unless_stop_word(word)
        stem && @words[stem]
      end

      # Number of distinct stems stored.
      def size
        @words.size
      end

      # Print every stored stem to standard output, sorted, one per line.
      def dump
        puts @words.keys.sort
      end

      private

      # Stem +word+; returns the stem, or nil when the stem is a stop word.
      def stem_unless_stop_word(word)
        # Defensive copy: guards against a stemmer that edits its argument
        # in place, so the caller's string is never altered.
        stem = Stemmable::stem_porter(word.dup)
        STOP_WORDS.key?(stem) ? nil : stem
      end
    end
  end
end
|
@@ -0,0 +1,220 @@
|
|
1
|
+
#! /local/ruby/bin/ruby
|
2
|
+
#
|
3
|
+
# $Id: PorterStemmer.rb,v 1.1.1.1 2004/04/17 13:55:20 pragdave Exp $
|
4
|
+
#
|
5
|
+
# See example usage at the end of this file.
|
6
|
+
#
|
7
|
+
|
8
|
+
module Stemmable

  # Cache of results: original word => stem. Filled by stem_porter.
  STEMMED = {}

  # Step 2 suffix => replacement.
  STEP_2_LIST = {
    'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
    'izer'=>'ize', 'bli'=>'ble',
    'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
    'ization'=>'ize', 'ation'=>'ate',
    'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
    'ousness'=>'ous', 'aliti'=>'al',
    'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
  }

  # Step 3 suffix => replacement.
  STEP_3_LIST = {
    'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
    'ical'=>'ic', 'ful'=>'', 'ness'=>''
  }


  # Suffixes handled in step 2 (the keys of STEP_2_LIST).
  SUFFIX_1_REGEXP = /(
                    ational  |
                    tional   |
                    enci     |
                    anci     |
                    izer     |
                    bli      |
                    alli     |
                    entli    |
                    eli      |
                    ousli    |
                    ization  |
                    ation    |
                    ator     |
                    alism    |
                    iveness  |
                    fulness  |
                    ousness  |
                    aliti    |
                    iviti    |
                    biliti   |
                    logi)$/x


  # Suffixes removed in step 4.
  SUFFIX_2_REGEXP = /(
                      al       |
                      ance     |
                      ence     |
                      er       |
                      ic       |
                      able     |
                      ible     |
                      ant      |
                      ement    |
                      ment     |
                      ent      |
                      ou       |
                      ism      |
                      ate      |
                      iti      |
                      ous      |
                      ive      |
                      ize)$/x


  C = "[^aeiou]"         # consonant
  V = "[aeiouy]"         # vowel
  CC = "#{C}(?>[^aeiouy]*)"  # consonant sequence
  VV = "#{V}(?>[aeiou]*)"    # vowel sequence

  MGR0 = /^(#{CC})?#{VV}#{CC}/o                # [cc]vvcc... is m>0
  MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o       # [cc]vvcc[vv] is m=1
  MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o      # [cc]vvccvvcc... is m>1
  VOWEL_IN_STEM   = /^(#{CC})?#{V}/o           # vowel in stem

  #
  # Porter stemmer in Ruby.
  #
  # This is the Porter stemming algorithm, ported to Ruby from the
  # version coded up in Perl.  It's easy to follow against the rules
  # in the original paper in:
  #
  #   Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
  #   no. 3, pp 130-137,
  #
  # See also http://www.tartarus.org/~martin/PorterStemmer
  #
  # Send comments to raypereda@hotmail.com
  #
  # w - the word to stem. When omitted (instance-method use on a
  #     string-like receiver) a copy of the receiver is stemmed.
  #
  # Returns the stem as a String. Words shorter than three characters are
  # returned as given. The argument is never modified, so frozen strings
  # are accepted; results are remembered in STEMMED, keyed by the word
  # exactly as it was passed in.
  #

  def stem_porter(w = self.to_str.dup)

    return w if w.length < 3

    cached = STEMMED[w]
    # Hand out a copy so callers cannot alter the cached stem.
    return cached.dup if cached

    # Several steps below edit the word in place ([]=, chop!, <<). Work on
    # a private copy so the caller's string and the cache key stay intact.
    original_word = w
    w = w.dup

    # now map initial y to Y so that the patterns never treat it as vowel
    w[0] = 'Y' if w[0] == ?y

    # Step 1a
    if w =~ /(ss|i)es$/
      w = $` + $1
    elsif w =~ /([^s])s$/
      w = $` + $1
    end

    # Step 1b
    if w =~ /eed$/
      w.chop! if $` =~ MGR0
    elsif w =~ /(ed|ing)$/
      stem = $`
      if stem =~ VOWEL_IN_STEM
        w = stem
        case w
        when /(at|bl|iz)$/             then w << "e"
        when /([^aeiouylsz])\1$/       then w.chop!
        when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
        end
      end
    end

    if w =~ /y$/
      stem = $`
      w = stem + "i" if stem =~ VOWEL_IN_STEM
    end

    # Step 2
    if w =~ SUFFIX_1_REGEXP
      stem = $`
      suffix = $1
      # print "stem= " + stem + "\n" + "suffix=" + suffix + "\n"
      if stem =~ MGR0
        w = stem + STEP_2_LIST[suffix]
      end
    end

    # Step 3
    if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
      stem = $`
      suffix = $1
      if stem =~ MGR0
        w = stem + STEP_3_LIST[suffix]
      end
    end

    # Step 4
    if w =~ SUFFIX_2_REGEXP
      stem = $`
      if stem =~ MGR1
        w = stem
      end
    elsif w =~ /(s|t)(ion)$/
      stem = $` + $1
      if stem =~ MGR1
        w = stem
      end
    end

    #  Step 5
    if w =~ /e$/
      stem = $`
      if (stem =~ MGR1) ||
          (stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
        w = stem
      end
    end

    if w =~ /ll$/ && w =~ MGR1
      w.chop!
    end

    # and turn initial Y back to y
    w[0] = 'y' if w[0] == ?Y

    # Cache a frozen copy; the caller receives its own mutable string.
    STEMMED[original_word] = w.dup.freeze

    w
  end


  module_function :stem_porter
  #
  # make the stem_porter the default stem method, just in case we
  # feel like having multiple stemmers available later.
  #
  alias stem stem_porter
  # module_function made the instance-level stem_porter private and the
  # alias inherits that visibility; make it callable as word.stem.
  public :stem

end
|
200
|
+
|
201
|
+
|
202
|
+
|
203
|
+
#
|
204
|
+
# Make this script executable, and send it words on stdin, one per
|
205
|
+
# line, and it will output the stemmed versions to stdout.
|
206
|
+
#
|
207
|
+
if $0 == __FILE__ then
  # Script mode: read words from standard input, one per line, and print
  # the stem of each to standard output.
  $stdin.each do |line|
    # Drop the line terminator before stemming: it would otherwise count
    # towards the word length and defeat the short-word guard.
    puts Stemmable.stem_porter(line.chomp)
  end
end
|
219
|
+
|
220
|
+
|