SimpleSearch 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +68 -0
- data/bin/simplesearch +42 -0
- data/lib/search/simple.rb +1 -0
- data/lib/search/simple/dictionary.rb +126 -0
- data/lib/search/simple/porter_stemmer.rb +220 -0
- data/lib/search/simple/searcher.rb +191 -0
- data/lib/search/simple/vector.rb +97 -0
- data/setup.rb +1360 -0
- metadata +46 -0
data/README
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
===SimpleSearch - Simple vector space search library
|
2
|
+
|
3
|
+
==What is SimpleSearch?
|
4
|
+
-----
|
5
|
+
|
6
|
+
SimpleSearch is a simple vector space text search engine.
|
7
|
+
|
8
|
+
==Installation
|
9
|
+
-----
|
10
|
+
|
11
|
+
Prerequisites
|
12
|
+
|
13
|
+
* Ruby 1.8 (http://www.ruby-lang.org/)
|
14
|
+
|
15
|
+
Optional
|
16
|
+
|
17
|
+
* RubyGems (http://rubygems.rubyforge.org)
|
18
|
+
|
19
|
+
==Installing SimpleSearch
|
20
|
+
-----
|
21
|
+
|
22
|
+
RubyGems (http://rubygems.rubyforge.org):
|
23
|
+
|
24
|
+
gem install SimpleSearch
|
25
|
+
|
26
|
+
...or...
|
27
|
+
|
28
|
+
.tar.gz installation:
|
29
|
+
|
30
|
+
ruby setup.rb #not yet available
|
31
|
+
|
32
|
+
|
33
|
+
==Using SimpleSearch
|
34
|
+
-----
|
35
|
+
|
36
|
+
SimpleSearch comes with a command line program that was primarily written as an example of how to use the API but might actually be useful.
|
37
|
+
|
38
|
+
To run the command line program, simply type:
|
39
|
+
$ simplesearch --help
|
40
|
+
|
41
|
+
An example:
|
42
|
+
$ simplesearch --cache=/tmp/mycache --dir=/usr/local/lib/ruby/gems/1.8/doc --extensions=html --terms=markup,html
|
43
|
+
|
44
|
+
This will cause simplesearch to (re)index all of the files with a .html extension in your RubyGems rdoc directory and then search them for the words "markup" and "html". The search indices will be stored in /tmp/mycache.
|
45
|
+
|
46
|
+
At the heart of SimpleSearch is, of course, an API that can be embedded in other programs. The code of SimpleSearch was originally created by Dave Thomas as a search mechanism for his RubLog (http://rubyforge.org/projects/rublog) weblogging package. The API can be used as follows:
|
47
|
+
|
48
|
+
require 'search/simple'
|
49
|
+
s = Search::Simple::Searcher.load(content_for_indexing(options), "/tmp/search_cache")
|
50
|
+
contents = Search::Simple::Contents.new
|
51
|
+
# silly example
|
52
|
+
Dir['**/*'].each do |file_name|
|
53
|
+
File.open(file_name) do |file|
|
54
|
+
contents << Search::Simple::Content.new(file.read, File.expand_path(file_name), file.mtime)
|
55
|
+
end
|
56
|
+
end
|
57
|
+
sr = s.find_words(['some', 'keywords', 'to', 'search', 'for'])
|
58
|
+
if sr.contains_matches
|
59
|
+
sr.results.sort.each do |res|
|
60
|
+
puts "#{res.score}:#{res.name}"
|
61
|
+
end
|
62
|
+
else
|
63
|
+
puts "No matches"
|
64
|
+
end
|
65
|
+
|
66
|
+
==Credits
|
67
|
+
------
|
68
|
+
Almost all of this code was written by Dave Thomas (http://pragprog.com/pragdave). The original code was a complete rewrite of an attempt that Chad Fowler (http://www.chadfowler.com) made at a vector space search for RubLog. Chad Fowler adapted Dave's working RubLog code to be RubLog-independent and created what is now SimpleSearch out of it.
|
data/bin/simplesearch
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'optparse'
|
3
|
+
require 'search/simple'
|
4
|
+
|
5
|
+
# Parses the command line switches understood by simplesearch.
#
# argv - Array of command line argument Strings. Recognised switches are
#        removed from it in place (OptionParser#parse!).
#
# Returns a Hash holding only the switches that were supplied, keyed by
# :cachefile, :extensions, :directory and :terms; values are the raw Strings.
# A malformed switch is reported on stderr and parsing stops there, so the
# Hash then holds whatever was parsed before the error. --help prints the
# option summary and exits.
def parse_options(argv)
  options = {}
  parser = OptionParser.new do |opts|
    # Explicit block parameters: the Ruby 1.8 shorthand `{ |options[:key]| }`
    # is a syntax error on every later Ruby.
    opts.on('-cCACHEFILE', '--cache=CACHEFILE',
            "Location of the search cache (defaults to /tmp/search_cache)") { |value| options[:cachefile] = value }
    opts.on('-eEXTENSIONS', '--extensions=EXTENSIONS',
            "Comma separated list of file name extensions to include in the search/index") { |value| options[:extensions] = value }
    opts.on('-dCONTENTDIR', '--dir=CONTENTDIR',
            "Directory from which to get the content to index") { |value| options[:directory] = value }
    opts.on('-tTERMS', '--terms=TERMS',
            "Comma separated list of words to search for") { |value| options[:terms] = value }
    opts.on_tail("--help", "show this message") do
      puts opts
      exit # help was asked for; do not fall through to the usage error
    end
  end

  begin
    parser.parse!(argv)
  rescue OptionParser::ParseError => e
    # Report and carry on with what was parsed so far, instead of aborting
    # with a backtrace.
    warn "#{parser.program_name}: #{e.message}"
  end
  options
end

options = parse_options(ARGV)
|
14
|
+
|
15
|
+
# Collects the files that should be indexed.
#
# options - Hash of parsed command line switches. Reads:
#           :directory  - root that is searched recursively (default ".")
#           :extensions - comma separated file name extensions; a leading
#                         dot and surrounding blanks are ignored, and a
#                         missing or blank list selects every file
#
# Returns a Search::Simple::Contents to which one Search::Simple::Content
# (file text, absolute path, modification time) has been appended for every
# non-directory entry that matched.
def content_for_indexing(options)
  contents = Search::Simple::Contents.new

  extensions = (options[:extensions] || "").split(",")
  extensions = extensions.map { |ext| ext.strip.sub(/\A\./, "") }.reject { |ext| ext.empty? }

  # Anchor the extensions behind a dot; a bare "*{html}" would also pick up
  # names that merely end in those letters (e.g. "nothtml").
  pattern = extensions.empty? ? "**/*" : "**/*.{#{extensions.join(',')}}"

  Dir[File.join(options[:directory] || ".", pattern)].each do |file_name|
    next if File.directory?(file_name)
    File.open(file_name) do |file|
      contents << Search::Simple::Content.new(file.read, File.expand_path(file_name), file.mtime)
    end
  end
  contents
end
|
27
|
+
|
28
|
+
# Command line driver: requires --terms, (re)builds or loads the search
# cache for the selected content, runs the query and prints one
# "score<TAB>name" line per result, or "No matches".
terms = options[:terms]
unless terms
  puts "Usage: simplesearch --help"
  exit 1
end

cache_file = options[:cachefile] || "/tmp/search_cache"
searcher = Search::Simple::Searcher.load(content_for_indexing(options), cache_file)
search_result = searcher.find_words(terms.split(/,/))

if search_result.contains_matches
  require 'pp'
  puts "Score\t#File"
  search_result.results.sort.each { |res| puts "#{res.score}\t#{res.name}" }
else
  puts "No matches"
end
|
@@ -0,0 +1 @@
|
|
1
|
+
require 'search/simple/searcher'
|
@@ -0,0 +1,126 @@
|
|
1
|
+
# Maintain a dictionary mapping words to consecutive integers (the
|
2
|
+
# first unique word is 0, the second is 1 and so on)
|
3
|
+
|
4
|
+
require 'search/simple/porter_stemmer'
|
5
|
+
module Search
  module Simple
    # Maps words to consecutive integer ids (the first unique word is 0,
    # the second is 1 and so on). Words are reduced to their Porter stem
    # before they are stored or looked up, and stop words are never stored.
    class Dictionary
      # Words that are never indexed. The values are irrelevant; the Hash is
      # only used for membership tests.
      STOP_WORDS = {
        "a" => 1,
        "again" => 1,
        "all" => 1,
        "along" => 1,
        "also" => 1,
        "an" => 1,
        "and" => 1,
        "arialhelvetica" => 1,
        "as" => 1,
        "at" => 1,
        "but" => 1,
        "by" => 1,
        "came" => 1,
        "can" => 1,
        "cant" => 1,
        "couldnt" => 1,
        "did" => 1,
        "didn" => 1,
        "didnt" => 1,
        "do" => 1,
        "doesnt" => 1,
        "dont" => 1,
        "entrytitledetail" => 1,
        "ever" => 1,
        "first" => 1,
        "fontvariant" => 1,
        "from" => 1,
        "have" => 1,
        "her" => 1,
        "here" => 1,
        "him" => 1,
        "how" => 1,
        "i" => 1,
        "if" => 1,
        "in" => 1,
        "into" => 1,
        "is" => 1,
        "isnt" => 1,
        "it" => 1,
        "itll" => 1,
        "just" => 1,
        "last" => 1,
        "least" => 1,
        "like" => 1,
        "most" => 1,
        "my" => 1,
        "new" => 1,
        "no" => 1,
        "not" => 1,
        "now" => 1,
        "of" => 1,
        "on" => 1,
        "or" => 1,
        "should" => 1,
        "sidebartitl" => 1,
        "sinc" => 1,
        "so" => 1,
        "some" => 1,
        "textdecoration" => 1,
        "th" => 1,
        "than" => 1,
        "that" => 1,
        "the" => 1,
        "their" => 1,
        "then" => 1,
        "those" => 1,
        "to" => 1,
        "told" => 1,
        "too" => 1,
        "true" => 1,
        "try" => 1,
        "until" => 1,
        "url" => 1,
        "us" => 1,
        "were" => 1,
        "when" => 1,
        "whether" => 1,
        "while" => 1,
        "with" => 1,
        "within" => 1,
        "yes" => 1,
        "you" => 1,
        "youll" => 1,
      }.freeze

      def initialize
        @words = {}
      end

      # Registers +word+ and returns its integer id; a word whose stem is
      # already known gets the existing id. Returns nil for stop words.
      def add_word(word)
        key = index_key(word)
        return nil unless key
        @words[key] ||= @words.size
      end

      # Returns the id previously assigned to +word+ (matched by stem), or
      # nil when the word is a stop word or has never been added.
      def find(word)
        key = index_key(word)
        key ? @words[key] : nil
      end

      # Number of distinct stems stored.
      def size
        @words.size
      end

      # Prints every stored stem, sorted, to stdout.
      def dump
        puts @words.keys.sort
      end

      private

      # Returns the stem under which +word+ is stored, or nil for a stop
      # word. The list is consulted for the word as given as well as for its
      # stem: the list mixes both forms, and entries such as "yes" (stemmed
      # to "ye") would otherwise never match.
      def index_key(word)
        return nil if STOP_WORDS[word]
        stem = Stemmable::stem_porter(word)
        STOP_WORDS[stem] ? nil : stem
      end
    end
  end
end
|
@@ -0,0 +1,220 @@
|
|
1
|
+
#! /local/ruby/bin/ruby
|
2
|
+
#
|
3
|
+
# $Id: PorterStemmer.rb,v 1.1.1.1 2004/04/17 13:55:20 pragdave Exp $
|
4
|
+
#
|
5
|
+
# See example usage at the end of this file.
|
6
|
+
#
|
7
|
+
|
8
|
+
# Porter stemmer. Usable either as a module function,
# Stemmable.stem_porter(word), or mixed into a String-like class, in which
# case the receiver itself is stemmed.
module Stemmable

  # Memo of word => stem for every word of three or more characters stemmed
  # so far. Shared by all callers; nothing in this module removes entries.
  STEMMED = {}

  # Step 2 suffix replacements.
  STEP_2_LIST = {
    'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
    'izer'=>'ize', 'bli'=>'ble',
    'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
    'ization'=>'ize', 'ation'=>'ate',
    'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
    'ousness'=>'ous', 'aliti'=>'al',
    'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
  }.freeze

  # Step 3 suffix replacements.
  STEP_3_LIST = {
    'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
    'ical'=>'ic', 'ful'=>'', 'ness'=>''
  }.freeze


  # Suffixes handled in step 2 (the keys of STEP_2_LIST).
  SUFFIX_1_REGEXP = /(
                    ational  |
                    tional   |
                    enci     |
                    anci     |
                    izer     |
                    bli      |
                    alli     |
                    entli    |
                    eli      |
                    ousli    |
                    ization  |
                    ation    |
                    ator     |
                    alism    |
                    iveness  |
                    fulness  |
                    ousness  |
                    aliti    |
                    iviti    |
                    biliti   |
                    logi)$/x


  # Suffixes stripped in step 4.
  SUFFIX_2_REGEXP = /(
                      al       |
                      ance     |
                      ence     |
                      er       |
                      ic       |
                      able     |
                      ible     |
                      ant      |
                      ement    |
                      ment     |
                      ent      |
                      ou       |
                      ism      |
                      ate      |
                      iti      |
                      ous      |
                      ive      |
                      ize)$/x


  C = "[^aeiou]"         # consonant
  V = "[aeiouy]"         # vowel
  CC = "#{C}(?>[^aeiouy]*)"  # consonant sequence
  VV = "#{V}(?>[aeiou]*)"    # vowel sequence

  MGR0 = /^(#{CC})?#{VV}#{CC}/o                # [cc]vvcc... is m>0
  MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o       # [cc]vvcc[vv] is m=1
  MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o      # [cc]vvccvvcc... is m>1
  VOWEL_IN_STEM   = /^(#{CC})?#{V}/o           # vowel in stem

  #
  # Porter stemmer in Ruby.
  #
  # This is the Porter stemming algorithm, ported to Ruby from the
  # version coded up in Perl.  It's easy to follow against the rules
  # in the original paper in:
  #
  #   Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
  #   no. 3, pp 130-137,
  #
  # See also http://www.tartarus.org/~martin/PorterStemmer
  #
  # Send comments to raypereda@hotmail.com
  #
  # w - the word to stem. Defaults to a copy of the receiver when the module
  #     is mixed into a String-like class. It is never modified, so frozen
  #     strings are accepted.
  #
  # Returns the stem as a String. Words shorter than three characters are
  # returned as given; longer words are memoised in STEMMED, and a repeated
  # call returns the memoised String object.
  def stem_porter(w = self.to_str.dup)

    return w if w.length < 3

    result = STEMMED[w]
    return result if result

    # Work on a private copy: the steps below edit the string in place
    # (w[0]=, chop!, <<), which would otherwise alter the caller's string
    # and, through it, the key the result is memoised under.
    original_word = w
    w = w.dup

    # now map initial y to Y so that the patterns never treat it as vowel
    w[0] = 'Y' if w[0] == ?y

    # Step 1a
    if w =~ /(ss|i)es$/
      w = $` + $1
    elsif w =~ /([^s])s$/
      w = $` + $1
    end

    # Step 1b
    if w =~ /eed$/
      w.chop! if $` =~ MGR0
    elsif w =~ /(ed|ing)$/
      stem = $`
      if stem =~ VOWEL_IN_STEM
        w = stem
        case w
        when /(at|bl|iz)$/             then w << "e"
        when /([^aeiouylsz])\1$/       then w.chop!
        when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
        end
      end
    end

    # Step 1c: terminal y becomes i when the stem contains a vowel
    if w =~ /y$/
      stem = $`
      w = stem + "i" if stem =~ VOWEL_IN_STEM
    end

    # Step 2
    if w =~ SUFFIX_1_REGEXP
      stem = $`
      suffix = $1
      # print "stem= " + stem + "\n" + "suffix=" + suffix + "\n"
      if stem =~ MGR0
        w = stem + STEP_2_LIST[suffix]
      end
    end

    # Step 3
    if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
      stem = $`
      suffix = $1
      if stem =~ MGR0
        w = stem + STEP_3_LIST[suffix]
      end
    end

    # Step 4
    if w =~ SUFFIX_2_REGEXP
      stem = $`
      if stem =~ MGR1
        w = stem
      end
    elsif w =~ /(s|t)(ion)$/
      stem = $` + $1
      if stem =~ MGR1
        w = stem
      end
    end

    #  Step 5
    if w =~ /e$/
      stem = $`
      if (stem =~ MGR1) ||
          (stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
        w = stem
      end
    end

    if w =~ /ll$/ && w =~ MGR1
      w.chop!
    end

    # and turn initial Y back to y
    w[0] = 'y' if w[0] == ?Y

    STEMMED[original_word] = w

    w
  end


  module_function :stem_porter
  #
  # make the stem_porter the default stem method, just in case we
  # feel like having multiple stemmers available later.
  #
  alias stem stem_porter

end
|
200
|
+
|
201
|
+
|
202
|
+
|
203
|
+
#
|
204
|
+
# Make this script executable, and send it words on stdin, one per
|
205
|
+
# line, and it will output the stemmed versions to stdout.
|
206
|
+
#
|
207
|
+
#
# Stand-alone filter, active only when this file is run directly: reads
# words from stdin, one per line, and writes their stems to stdout.
#
if $0 == __FILE__ then
  class String
    include Stemmable
    # Stemmable aliases +stem+ to stem_porter after declaring stem_porter a
    # module_function, so the mixed-in alias is private; publish it here so
    # that word.stem can be called below.
    public :stem
  end

  # the String class, and any subclasses of it you might have, now know
  # how to stem things.

  $stdin.each do |word|
    # Drop the line terminator first: it would count towards the stemmer's
    # "shorter than three characters" check (e.g. "is\n").
    puts word.chomp.stem
  end
end
|
219
|
+
|
220
|
+
|