moonstone 0.6.0

data/Rakefile ADDED
@@ -0,0 +1,61 @@
+ $:.unshift "#{here = File.dirname(__FILE__)}/lib"
+ require 'rake/gempackagetask'
+ require 'rake/rdoctask'
+
+ deps = %w{ rspec }
+
+ task(:install_gems) {
+   deps.each { |g|
+     system "jruby -S gem install #{g}"
+   }
+ }
+
+ spec = Gem::Specification.new { |s|
+   s.platform = Gem::Platform::RUBY
+
+   s.authors = "Matthew King", "Jason Rush", "Jay Donnell", "Dan Yoder"
+   s.email = "self@automatthew.com"
+   s.files = Dir["{lib,doc,bin,ext}/**/*"].delete_if { |f|
+     /\/rdoc(\/|$)/i.match f
+   } + %w(Rakefile)
+   s.require_path = 'lib'
+   s.has_rdoc = true
+   s.extra_rdoc_files = Dir['doc/*'].select(&File.method(:file?))
+   s.extensions << 'ext/extconf.rb' if File.exist? 'ext/extconf.rb'
+   Dir['bin/*'].map(&File.method(:basename)).map(&s.executables.method(:<<))
+
+   s.name = 'moonstone'
+   s.summary = "Moonstone Agile Search Framework"
+   deps.each &s.method(:add_dependency)
+   s.version = '0.6.0'
+ }
+
+ Rake::GemPackageTask.new(spec) { |pkg|
+   pkg.need_tar_bz2 = true
+ }
+
+ task(:uninstall) {
+   system "sudo jruby -S gem uninstall -aIx #{spec.name}"
+ }
+
+ task(:install => [:uninstall, :package]) {
+   g = "pkg/#{spec.name}-#{spec.version}.gem"
+   system "sudo jruby -S gem install --local #{g}"
+ }
+
+ task(:uninstall_no_sudo) {
+   system "jruby -S gem uninstall -aIx #{spec.name}"
+ }
+
+ task(:install_no_sudo => [:uninstall_no_sudo, :package]) {
+   g = "pkg/#{spec.name}-#{spec.version}.gem"
+   system "jruby -S gem install -l #{g}"
+ }
+
+ desc "run some tests"
+ task :test do
+   options = ENV['options']
+   files = FileList['test/**/*.rb'].exclude('test/helpers.rb')
+   puts cmd = "jruby #{options} -I lib -S spec -c #{ files.join(' ') }"
+   system cmd
+ end
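
The tasks above assume a JRuby environment; a typical local cycle (hypothetical shell session) would be:

  jruby -S rake install_gems       # install the rspec dependency
  jruby -S rake install_no_sudo    # uninstall, repackage, and reinstall the gem
  jruby -S rake test               # run the specs under test/ with rspec
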
data/lib/jar/lucene-core-2.4.0.jar ADDED
Binary file
data/lib/jar/lucene-spellchecker-2.4-dev.jar ADDED
Binary file
data/lib/lucene/analysis.rb ADDED
@@ -0,0 +1,58 @@
+ module Lucene
+   module Analysis
+     include_package "org.apache.lucene.analysis"
+
+     module Standard
+       include_package "org.apache.lucene.analysis.standard"
+       [
+         StandardAnalyzer,
+         StandardFilter,
+         StandardTokenizer,
+       ]
+     end
+     include Standard
+
+     TokenStream.module_eval do
+       include Enumerable
+       def each
+         token = Token.new
+         while token = self.next(token) do
+           yield token
+         end
+       end
+     end
+
+     Analyzer.module_eval do
+       def tokenize(field, text)
+         token_stream(field, java.io.StringReader.new(text)).map { |token| token.term_text }
+       end
+     end
+
+     # Biggie Smalls, Biggie Smalls, Biggie Smalls
+     [
+       CachingTokenFilter,
+       CharTokenizer,
+       ISOLatin1AccentFilter,
+       KeywordAnalyzer,
+       KeywordTokenizer,
+       LengthFilter,
+       LetterTokenizer,
+       LowerCaseFilter,
+       LowerCaseTokenizer,
+       PerFieldAnalyzerWrapper,
+       PorterStemFilter,
+       PorterStemmer,
+       SimpleAnalyzer,
+       SinkTokenizer,
+       StopAnalyzer,
+       StopFilter,
+       TeeTokenFilter,
+       Token,
+       TokenFilter,
+       Tokenizer,
+       WhitespaceAnalyzer,
+       WhitespaceTokenizer,
+       WordlistLoader
+     ]
+   end
+ end
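
The bare constant arrays here and in the files below are deliberate: naming each Java class once forces JRuby to resolve it into the wrapping Ruby module at load time (hence the repeated "Biggie Smalls" summoning joke). With the Enumerable and tokenize extensions installed, an analyzer can be exercised directly; a minimal sketch, assuming the bundled Lucene 2.4 jars are loaded:

  analyzer = Lucene::Analysis::StandardAnalyzer.new
  analyzer.tokenize("body", "The Quick Brown Fox")
  # => ["quick", "brown", "fox"] -- StandardAnalyzer lowercases and drops stop words
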
data/lib/lucene/document.rb ADDED
@@ -0,0 +1,149 @@
+ module Lucene
+   module Document
+     include_package "org.apache.lucene.document"
+
+     # avoid naming problems with Lucene::Document::Document
+     Doc = Lucene::Document::Document
+
+     # I spit on final class
+     Doc.module_eval do
+       attr_accessor :score, :id, :tokens, :explanation
+
+       self::Field = Lucene::Document::Field
+
+       @@field_store = {
+         nil => Field::Store::YES,
+         false => Field::Store::NO,
+         :NO => Field::Store::NO,
+         :no => Field::Store::NO,
+         true => Field::Store::YES,
+         :YES => Field::Store::YES,
+         :yes => Field::Store::YES,
+         :compress => Field::Store::COMPRESS,
+         :COMPRESS => Field::Store::COMPRESS
+       }
+       @@field_index = {
+         nil => Field::Index::ANALYZED,
+         false => Field::Index::NO,
+         :NO => Field::Index::NO,
+         :no => Field::Index::NO,
+         true => Field::Index::ANALYZED,
+         :analyzed => Field::Index::ANALYZED,
+         :ANALYZED => Field::Index::ANALYZED,
+         :not_analyzed => Field::Index::NOT_ANALYZED,
+         :NOT_ANALYZED => Field::Index::NOT_ANALYZED,
+         :analyzed_no_norms => Field::Index::ANALYZED_NO_NORMS,
+         :ANALYZED_NO_NORMS => Field::Index::ANALYZED_NO_NORMS,
+         :not_analyzed_no_norms => Field::Index::NOT_ANALYZED_NO_NORMS,
+         :NOT_ANALYZED_NO_NORMS => Field::Index::NOT_ANALYZED_NO_NORMS
+       }
+
+       @@field_term_vector = {
+         nil => Field::TermVector::NO,
+         :NO => Field::TermVector::NO,
+         :no => Field::TermVector::NO,
+         false => Field::TermVector::NO,
+         :YES => Field::TermVector::YES,
+         :yes => Field::TermVector::YES,
+         true => Field::TermVector::YES,
+         :WITH_POSITIONS => Field::TermVector::WITH_POSITIONS,
+         :with_positions => Field::TermVector::WITH_POSITIONS,
+         :WITH_OFFSETS => Field::TermVector::WITH_OFFSETS,
+         :with_offsets => Field::TermVector::WITH_OFFSETS,
+         :WITH_POSITIONS_OFFSETS => Field::TermVector::WITH_POSITIONS_OFFSETS,
+         :with_positions_offsets => Field::TermVector::WITH_POSITIONS_OFFSETS
+       }
+
+       def self.new
+         doc = super()
+         yield doc if block_given?
+         doc
+       end
+
+       def self.create(fields)
+         doc = self.new
+         fields.each { |field| doc.add_field(*field) }
+         doc
+       end
+
+       def add_field(name, value, options={})
+         field = if value.is_a? java.io.Reader
+           Field.new(name, value, @@field_term_vector[options[:term_vector]])
+         else
+           store = @@field_store[options[:store]]
+           index = @@field_index[options[:index]]
+           term_vector = @@field_term_vector[options[:term_vector]]
+           params = [name, value, store, index]
+           params << term_vector if term_vector
+           Field.new(*params)
+         end
+         add(field)
+       end
+
+       # specialty field adders
+       def stored(name, value)
+         add_field(name, value, :store => true, :index => false)
+       end
+
+       def analyzed(name, value)
+         add_field(name, value, :store => true, :index => :analyzed)
+       end
+
+       def unanalyzed(name, value)
+         add_field(name, value, :store => true, :index => :not_analyzed)
+       end
+
+       alias_method :[], :get
+
+       def get_all(field_name)
+         fields.select { |f| f.name == field_name }.map { |f| f.string_value }
+       end
+
+       def field_names
+         fields.map { |f| f.name }.uniq
+       end
+
+       alias_method :keys, :field_names
+
+       def to_hash
+         hash = {}
+         hash["id"] = @id if @id
+         hash["score"] = @score if @score
+         hash["explanation"] = @explanation.toString(1) if @explanation
+         fields = {}
+         hash["fields"] = fields
+         keys.each do |k|
+           values = self.get_all(k)
+           # fields[k] = values.size == 1 ? values.first : values
+           fields[k] = values
+         end
+         hash["tokens"] = @tokens if @tokens
+         hash
+       end
+
+       def to_json
+         to_hash.to_json
+       end
+
+     end
+
+     Field.module_eval do
+
+       alias_method :stored?, :is_stored
+       alias_method :indexed?, :is_indexed
+       alias_method :tokenized?, :is_tokenized
+       alias_method :analyzed?, :is_tokenized
+       alias_method :compressed?, :is_compressed
+
+       def unanalyzed?; indexed? && !analyzed?; end
+       def unindexed?; stored? && !indexed?; end
+
+     end
+
+     # Biggie Smalls, Biggie Smalls, Biggie Smalls
+     [
+       DateField,
+       DateTools
+     ]
+   end
+ end
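
These lookup tables let Ruby literals stand in for Lucene's Field constants. A sketch of the resulting API (field names are invented for illustration):

  doc = Lucene::Document::Doc.create [
    ["title", "Moonstone", { :store => true, :index => :analyzed }],
    ["sku",   "ms-0600",   { :store => true, :index => :not_analyzed }]
  ]
  doc["title"]     # => "Moonstone"  (via the [] alias for get)
  doc.field_names  # => ["title", "sku"]
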
data/lib/lucene/function.rb ADDED
@@ -0,0 +1,9 @@
+ module Lucene
+   module Search
+     module Function
+       include_package 'org.apache.lucene.search.function'
+
+       [FieldScoreQuery, CustomScoreQuery]
+     end
+   end
+ end
data/lib/lucene/index.rb ADDED
@@ -0,0 +1,48 @@
+ module Lucene
+   module Index
+     include_package "org.apache.lucene.index"
+
+     IndexWriter.module_eval do
+       MaxFieldLength = self::MaxFieldLength
+
+       def self.open(*args)
+         args << MaxFieldLength::UNLIMITED unless args.last.is_a? MaxFieldLength
+         writer = new(*args)
+         begin
+           result = yield(writer)
+         ensure
+           writer.close
+         end
+         result
+       end
+
+       def add_documents(docs)
+         docs.each { |doc| add_document(doc) }
+       end
+
+     end
+
+     TermEnum.module_eval do
+       include Enumerable
+
+       def each
+         while self.next do
+           yield term
+         end
+       end
+
+       def for_field(field_name)
+         select { |t| t.field == field_name }.map { |t| t.text }
+       end
+
+     end
+
+     # Biggie Smalls, Biggie Smalls, Biggie Smalls
+     [
+       IndexReader,
+       Payload,
+       Term,
+     ]
+   end
+ end
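
IndexWriter.open mirrors File.open: UNLIMITED field length is appended unless a MaxFieldLength is supplied, and the writer is closed even if the block raises. A sketch, assuming store and analyzer are already built:

  Lucene::Index::IndexWriter.open(store, analyzer) do |writer|
    writer.add_documents(docs)  # docs: any enumerable of Lucene documents
    writer.optimize
  end                           # writer.close runs in the ensure clause
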
data/lib/lucene/query_parser.rb ADDED
@@ -0,0 +1,14 @@
+ module Lucene
+   module QueryParser
+     include_package "org.apache.lucene.queryParser"
+
+     # avoid problems with Lucene::QueryParser::QueryParser
+     Parser = org.apache.lucene.queryParser.QueryParser
+
+     # Biggie Smalls, Biggie Smalls, Biggie Smalls
+     [
+       MultiFieldQueryParser,
+       Token
+     ]
+   end
+ end
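
The Parser alias exists because Lucene::QueryParser::QueryParser would shadow the wrapping module. Usage is the stock Lucene API; a sketch, assuming an analyzer is in hand:

  parser = Lucene::QueryParser::Parser.new("body", analyzer)
  query  = parser.parse("quick AND fox")
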
data/lib/lucene/search.rb ADDED
@@ -0,0 +1,195 @@
+ module Lucene
+   module Search
+     include_package "org.apache.lucene.search"
+
+     [ SortField, Sort ]
+
+     Hits.module_eval do
+       include Enumerable
+       def each
+         i = 0
+         while i < length do
+           yield doc(i)
+           i += 1
+         end
+       end
+
+       def to_a
+         map
+       end
+
+       alias_method :size, :length
+     end
+
+     TopDocs.module_eval do
+       attr_accessor :query
+       include Enumerable
+
+       def each(searcher=nil)
+         # Do we ever want to reinitialize the documents list?
+         initialize_docs(searcher) if searcher && documents.empty?
+         documents.each { |doc| yield doc }
+       end
+
+       def initialize_docs(searcher)
+         @offset ||= 0
+         self.scoreDocs.each_with_index do |sd, i|
+           # For pagination, only init the docs that fit the offset
+           if i >= @offset
+             doc = searcher.doc(sd.doc)
+             doc.score = sd.score
+             doc.id = sd.doc
+             documents << doc
+           end
+         end
+       end
+
+       # Remove docs that precede the offset
+       def offset!(offset)
+         @offset = offset || 0
+         self
+       end
+
+       def offset
+         @offset ||= 0
+       end
+
+       def [](index)
+         documents[index]
+       end
+
+       def first
+         documents[0]
+       end
+
+       def last
+         to_a.last
+       end
+
+       def length
+         self.scoreDocs.length - (@offset || 0)
+       end
+
+       alias_method :size, :length
+
+       def empty?
+         self.length == 0
+       end
+
+       def to_hash
+         {
+           :query => self.query,
+           :total_hits => self.totalHits,
+           :documents => self.to_a
+         }
+       end
+
+       def to_json
+         to_hash.to_json
+       end
+
+       private
+       def documents
+         @documents ||= []
+       end
+     end
+
+     Hit.module_eval do
+       alias_method :[], :get
+     end
+
+     IndexSearcher.module_eval do
+       def self.open(*args)
+         searcher = new(*args)
+         begin
+           result = yield(searcher)
+         ensure
+           searcher.close
+         end
+         result
+       end
+     end
+
+     BooleanQuery.module_eval do
+
+       def self.and(*queries)
+         q = self.new
+         queries.each { |query| q.add(query, BooleanClause::Occur::MUST) }
+         q
+       end
+
+       def self.or(*queries)
+         q = self.new
+         queries.each { |query| q.add(query, BooleanClause::Occur::SHOULD) }
+         q
+       end
+
+       def self.not(*queries)
+         q = self.new
+         queries.each { |query| q.add(query, BooleanClause::Occur::MUST_NOT) }
+         q
+       end
+
+       def and(*queries)
+         queries.each { |query| add(query, BooleanClause::Occur::MUST) }
+         self
+       end
+
+       def or(*queries)
+         queries.each { |query| add(query, BooleanClause::Occur::SHOULD) }
+         self
+       end
+
+       def not(*queries)
+         queries.each { |query| add(query, BooleanClause::Occur::MUST_NOT) }
+         self
+       end
+
+     end
+
+     TermQuery.module_eval do
+
+       def self.new(*args)
+         term = args.first.is_a?(Lucene::Index::Term) ? args.first : Lucene::Index::Term.new(*args)
+         super(term)
+       end
+
+     end
+
+     module Spell
+       include_package 'org.apache.lucene.search.spell'
+       [PlainTextDictionary]
+     end
+
+     PhraseQuery.module_eval do
+       def self.create(field, phrase)
+         raise "I need an array" unless phrase.is_a? Array
+         query = self.new
+         phrase.each do |word|
+           query.add(Index::Term.new(field, word))
+         end
+         query
+       end
+     end
+
+     # Biggie Smalls, Biggie Smalls, Biggie Smalls
+     [
+       Explanation,
+       FilteredQuery,
+       FuzzyQuery,
+       HitIterator,
+       MultiPhraseQuery,
+       PrefixQuery,
+       Query,
+       RangeQuery,
+       ScoreDoc,
+       Searcher,
+       Similarity,
+       TopDocCollector,
+       TopFieldDocCollector,
+       TopFieldDocs,
+       Weight,
+       WildcardQuery
+     ]
+   end
+ end
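
The BooleanQuery helpers map Ruby-ish names onto Occur clauses (and → MUST, or → SHOULD, not → MUST_NOT), and the instance versions return self so clauses chain. A sketch using the TermQuery shorthand defined above (field names are invented):

  include Lucene::Search
  query = BooleanQuery.and(
    TermQuery.new("color", "red"),
    TermQuery.new("shape", "round")
  ).not(TermQuery.new("status", "discontinued"))
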
data/lib/lucene/store.rb ADDED
@@ -0,0 +1,12 @@
+ module Lucene
+   module Store
+     include_package "org.apache.lucene.store"
+
+     # Biggie Smalls, Biggie Smalls, Biggie Smalls
+     [
+       Directory,
+       FSDirectory,
+       RAMDirectory
+     ]
+   end
+ end
data/lib/moonstone/analyzer.rb ADDED
@@ -0,0 +1,23 @@
+ module Moonstone
+   class Analyzer < Lucene::Analysis::Analyzer
+     attr_accessor :filter_chain
+     # Moonstone::Analyzer.new(WhitespaceTokenizer, StandardFilter, StemFilter)
+     # FIXME: Why don't we explicitly require a tokenizer + *filters ?
+     def self.new(*classes)
+       analyzer = super()
+       analyzer.filter_chain = classes
+       analyzer
+     end
+
+     def tokenStream(field_name, reader)
+       tokenizer, *args = @filter_chain[0]
+       stream = tokenizer.new(reader, *args)
+       @filter_chain.slice(1..-1).each do |filter|
+         klass, *args = filter
+         stream = klass.new(stream, *args)
+       end
+       stream
+     end
+
+   end
+ end
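
Each element of the chain is either a bare class or a [class, *extra_args] array; the first element must be a tokenizer (it receives the reader), and each later element wraps the stream before it. A sketch built from the Lucene classes loaded earlier:

  include Lucene::Analysis
  analyzer = Moonstone::Analyzer.new(
    LowerCaseTokenizer,
    PorterStemFilter,
    [LengthFilter, 2, 50]  # extra constructor args ride along in the array
  )
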
data/lib/moonstone/engine.rb ADDED
@@ -0,0 +1,186 @@
+ module Moonstone
+   class Engine
+     include Lucene::Index
+     include Lucene::Search
+
+     attr_reader :store, :similarity
+
+     # :store should be a String or some kind of Lucene::Store::Directory
+     def initialize(options = {})
+       @store = options[:store] || Lucene::Store::RAMDirectory.new
+       @inspect = options[:inspect]
+     end
+
+     # The source should be enumerable.
+     def index(source, optimize=true)
+       IndexWriter.open(@store, analyzer) do |writer|
+         writer.set_similarity(@similarity.new) if @similarity
+
+         source.each_with_index do |record, i|
+           doc = doc_from(record)
+           writer.add_document(doc) if doc
+           Moonstone::Logger.info "Indexed #{i+1} records" if (i+1) % 1000 == 0
+         end
+         writer.optimize if optimize
+         yield writer if block_given? # for post-processing that still needs the writer
+       end
+       refresh_searcher
+     end
+
+     def stamp_metadata
+       metadata = Lucene::Document::Doc.new
+       metadata.add_field 'metadata', 'index', :index => :not_analyzed
+       metadata.add_field 'build_date', Date.today.strftime("%Y-%m-%d"), :index => false
+       metadata.add_field 'engine_name', self.class.name, :index => false
+       metadata.add_field 'engine_version', `git show-ref -s --abbrev HEAD`.chomp, :index => false
+       metadata.add_field 'query_conditions', ENV['query_conditions'].to_s, :index => false
+       writer do |w|
+         w.add_document(metadata)
+       end
+     end
+
+     def index_metadata
+       query = TermQuery.new 'metadata', 'index'
+       @index_metadata ||= search(query).last
+     end
+
+     def doc_count
+       @reader ||= IndexReader.open(@store)
+       @reader.max_doc
+     end
+
+     def document(id)
+       @reader ||= IndexReader.open(@store)
+       if id < @reader.max_doc
+         doc = @reader.document(id)
+         doc.tokens = tokens_for_doc(id)
+         doc.id = id
+         doc
+       end
+     end
+
+     # Adds docs to the index. source must be an enumerable set of objects
+     # that doc_from can turn into documents.
+     def insert_documents(source, optimize=false)
+       index(source, optimize)
+       refresh_searcher
+     end
+
+     def insert_document(source, optimize=false)
+       insert_documents([source], optimize)
+     end
+
+     # docs must be an enumerable set of hashes with the keys
+     # :field, :value, :document
+     # (where field and value combine to make a term to match documents to replace)
+     def update_documents(docs)
+       IndexWriter.open(@store, analyzer) do |writer|
+         writer.set_similarity(@similarity.new) if @similarity
+         docs.each do |doc|
+           raise "Invalid arguments" unless doc[:field] && doc[:value] && doc[:document]
+           term = Term.new(doc[:field], doc[:value])
+           document = doc_from(doc[:document])
+           writer.updateDocument(term, document)
+         end
+       end
+       refresh_searcher
+     end
+
+     def update_document(doc)
+       update_documents([doc])
+     end
+
+     # terms should be an enumerable set of hashes with the keys
+     # :field and :value, which combine to make a term to match documents to delete
+     def delete_documents(terms)
+       IndexWriter.open(@store, analyzer) do |writer|
+         terms.each do |t|
+           term = Term.new(t[:field], t[:value])
+           writer.deleteDocuments(term)
+         end
+       end
+       refresh_searcher
+     end
+
+     def delete_document(term)
+       delete_documents([term])
+     end
+
+     # Takes any kind of input object parsable by your #create_query method. Quack.
+     # Options follow the patterns of org.apache.lucene.search.Searcher (see its javadoc).
+     # Returns a TopDocs object.
+     # Note that Hits is deprecated, so the versions of search() returning a Hits object
+     # are not implemented.
+     def search(input, options = {})
+       query = input.kind_of?(Lucene::Search::Query) ? input : create_query(input)
+       @searcher ||= IndexSearcher.new(@store)
+       top_docs = if (hit_collector = options[:hit_collector])
+         args = [ options[:filter], hit_collector ].compact
+         @searcher.search(query, *args)
+         hit_collector.topDocs
+       else
+         options[:limit] ||= 25
+         options[:offset] ||= 0
+         args = [ options[:filter], (options[:limit] + options[:offset]) ] # always include both, even if nil
+         args << options[:sort] if options[:sort]
+         @searcher.search(query, *args).offset!(options[:offset])
+       end
+       top_docs.each(@searcher) do |doc|
+         doc.tokens = self.tokens_for_doc(doc) if inspect_mode?
+         yield doc if block_given?
+       end
+       top_docs
+     end
+
+     # Reopen the searcher (used when the index has changed).
+     def refresh_searcher
+       @searcher = IndexSearcher.new(@store) if @searcher # if nil, it'll get lazy-loaded
+     end
+
+     def close
+       @searcher.close if @searcher
+       @reader.close if @reader
+     end
+
+     # Returns an instance of the Analyzer class defined within
+     # this class's namespace.
+     def analyzer
+       @analyzer ||= self.class::Analyzer.new
+     end
+
+     # Opens an IndexWriter for the duration of the block.
+     #   engine.writer { |w| w.add_document(doc) }
+     def writer
+       IndexWriter.open(@store, self.class::Analyzer.new) do |writer|
+         writer.set_similarity(@similarity.new) if @similarity
+         yield writer
+       end
+     end
+
+     # Opens an IndexSearcher for the duration of the block.
+     #   engine.searcher { |s| s.search(query_object) }
+     def searcher
+       IndexSearcher.open(@store) do |searcher|
+         searcher.set_similarity(@similarity.new) if @similarity
+         yield searcher
+       end
+     end
+
+     # Opens an IndexReader for the duration of the block.
+     #   engine.reader { |r| r.terms }
+     def reader
+       reader = IndexReader.open(@store)
+       yield reader
+       reader.close
+     end
+
+     def parser(field, analyzer = nil)
+       @parser ||= {}
+       @parser[field.to_sym] ||= Lucene::QueryParser::Parser.new(field, analyzer || self.analyzer)
+     end
+
+     def inspect_mode?
+       @inspect
+     end
+
+   end
+ end
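
Engine is abstract in practice: a working subclass supplies an Analyzer class in its own namespace plus doc_from (record to document) and create_query (input to query), neither of which is defined here. A minimal hypothetical subclass (names invented for illustration):

  class BookEngine < Moonstone::Engine
    class Analyzer < Lucene::Analysis::StandardAnalyzer; end

    def doc_from(record)  # record: a plain hash, e.g. { :title => "..." }
      Lucene::Document::Doc.create [["title", record[:title], { :store => true }]]
    end

    def create_query(input)
      parser("title").parse(input)
    end
  end

  engine = BookEngine.new  # defaults to an in-memory RAMDirectory
  engine.index [{ :title => "Moby-Dick" }, { :title => "Middlemarch" }]
  engine.search("moby") { |doc| puts doc["title"] }
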
data/lib/moonstone/filter.rb ADDED
@@ -0,0 +1,30 @@
+ module Moonstone
+   class Filter < Lucene::Analysis::TokenFilter
+
+     def initialize(stream)
+       if block_given?
+         self.class.module_eval do
+           define_method :process do |token|
+             yield token
+           end
+         end
+       end
+       super
+       @stream = stream
+     end
+
+     def next(token=nil)
+       if token = (token ? @stream.next(token) : @stream.next)
+         text = process(token.term_text)
+         # skip a token if its text is empty
+         if text.empty?
+           token = self.next(token)
+         else
+           token.term_text = text
+           token
+         end
+       end
+     end
+
+   end
+ end
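
Because initialize promotes a supplied block into the process method, throwaway filters need no subclass; process receives each token's text and returns the replacement text (an empty string drops the token). Note the block is installed on the class itself, so this style suits one-off subclasses rather than shared ones. A sketch:

  stream = Lucene::Analysis::WhitespaceTokenizer.new(java.io.StringReader.new("Foo BAR"))
  filter = Moonstone::Filter.new(stream) { |text| text.downcase }
  filter.map { |token| token.term_text }  # => ["foo", "bar"]
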
data/lib/moonstone/filters/synonymer.rb ADDED
@@ -0,0 +1,26 @@
+ module Moonstone
+
+   module Filters
+
+     class Synonymer < Moonstone::QueuedFilter
+
+       def initialize(stream, synonym_hash)
+         @synonym_hash = synonym_hash
+         super(stream)
+       end
+
+       def process(text)
+         if syns = @synonym_hash[text]
+           if syns.is_a?(String)
+             [text, syns]
+           elsif syns.is_a?(Array)
+             [text].concat syns
+           end
+         else
+           text
+         end
+       end
+
+     end
+   end
+ end
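
process may return a string or an array; the QueuedFilter parent (defined later in this diff) emits the head immediately and queues the rest. A sketch with a hypothetical synonym hash:

  synonyms = { "couch" => "sofa", "car" => ["auto", "automobile"] }
  stream = Lucene::Analysis::WhitespaceTokenizer.new(java.io.StringReader.new("red car"))
  filter = Moonstone::Filters::Synonymer.new(stream, synonyms)
  filter.map { |token| token.term_text }  # => ["red", "car", "auto", "automobile"]
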
data/lib/moonstone/index_inspection.rb ADDED
@@ -0,0 +1,38 @@
+ # Methods to assist in index analysis
+ module Moonstone
+   class Engine
+
+     # Return a hash of tokens, keyed on field name, for the given doc.
+     # doc can be either a Document or the integer document id.
+     # Note that if it is a Document, doc.id cannot be nil.
+     def tokens_for_doc(doc, fields = nil)
+       tokens = {}
+       self.reader do |reader|
+         unless doc.kind_of?(Lucene::Document::Doc)
+           doc_id = doc
+           doc = reader.document(doc)
+           doc.id = doc_id
+         end
+         fields = doc.keys if fields.nil?
+         fields.each do |field|
+           tokens[field] = []
+           tfv = reader.getTermFreqVector(doc.id, field)
+           if tfv && tfv.size > 0 && tfv.respond_to?(:getTermPositions)
+             tv = tfv.getTerms
+             tv.length.times do |i|
+               positions = tfv.getTermPositions(i) || []
+               positions.each { |pos| tokens[field][pos] = tv[i] }
+             end
+           end
+         end
+       end
+       tokens
+     end
+
+     # Helper, delegates to tokens_for_doc
+     def tokens_for_field(doc, field)
+       tokens_for_doc(doc, [field])[field]
+     end
+
+   end
+ end
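
Nothing comes back for a field unless it was indexed with term vectors that include positions (the getTermPositions check above); tokens land in the array at their recorded positions, so analyzer-dropped words show up as gaps. A one-line sketch, assuming document 0 has such a 'body' field:

  engine.tokens_for_field(0, "body")  # => e.g. ["quick", "brown", "fox"]
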
data/lib/moonstone/multi_analyzer.rb ADDED
@@ -0,0 +1,25 @@
+ module Moonstone
+   class MultiAnalyzer < Lucene::Analysis::Analyzer
+     attr_accessor :fields
+
+     # Moonstone::MultiAnalyzer.new :name => [KeywordTokenizer, SynonymFilter],
+     #   :categories => [WhitespaceTokenizer, SynonymFilter, StemFilter]
+     def self.new(hash={})
+       analyzer = super()
+       analyzer.fields = hash
+       analyzer
+     end
+
+     def tokenStream(field_name, reader)
+       filter_chain = @fields[field_name.to_sym] || @fields[true]
+       tokenizer, *args = filter_chain[0]
+       stream = tokenizer.new(reader, *args)
+       filter_chain.slice(1..-1).each do |filter|
+         klass, *args = filter
+         stream = klass.new(stream, *args)
+       end
+       stream
+     end
+
+   end
+ end
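
Chains are keyed by field name, with true serving as the fallback for unlisted fields; names arrive from Java as strings, hence the to_sym. A sketch with hypothetical field names:

  include Lucene::Analysis
  analyzer = Moonstone::MultiAnalyzer.new(
    :name => [KeywordTokenizer],
    true   => [WhitespaceTokenizer, LowerCaseFilter]  # default chain
  )
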
data/lib/moonstone/queued_filter.rb ADDED
@@ -0,0 +1,45 @@
+ module Moonstone
+
+   class QueuedFilter < Moonstone::Filter
+
+     def initialize(stream)
+       @buffer = []
+       super
+     end
+
+     def read_buffer(token=nil)
+       if item = @buffer.shift
+         if item.is_a? String
+           token ||= Lucene::Analysis::Token.new
+           token.term_text = item
+           token
+         else
+           raise "What have you done?"
+         end
+       end
+     end
+
+     def next(token=nil)
+       if t = read_buffer(token)
+         t
+       elsif token = (token ? @stream.next(token) : @stream.next)
+         results = process(token.term_text)
+         if results.is_a? Array
+           text = results.shift
+           results.each { |t| @buffer << t }
+         else
+           text = results
+         end
+         # skip a token if its text is empty
+         if text && text.empty?
+           token = self.next(token)
+         else
+           token.term_text = text
+           token
+         end
+       end
+     end
+
+   end
+
+ end
data/lib/moonstone/racker/basic_search.rb ADDED
@@ -0,0 +1,28 @@
+ require 'moonstone/racker'
+ module Moonstone
+   module Racker
+     module BasicSearch
+       include Moonstone::Racker
+
+       # GET /search.html?input=happiness
+       def html_GET_search(request)
+         results = search(request.params['input'], search_options(request))
+         results.join("\n<br>")
+       end
+
+       # GET /search.json?input=happiness
+       def json_GET_search(request)
+         results = search(request.params['input'], search_options(request))
+         results.to_json
+       end
+
+       # POST /search.json
+       def json_POST_search(request)
+         options = search_options(request)
+         data = request.env['rack.input'].read
+         JSON.parse(data).map { |input| search(input, options) }.to_json
+       end
+
+     end
+   end
+ end
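
Mixed into an engine and served under Rack, these actions answer requests such as the following (host and port are hypothetical):

  curl 'http://localhost:9292/search.json?input=happiness&limit=10&offset=20'
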
data/lib/moonstone/racker/local_search.rb ADDED
@@ -0,0 +1,33 @@
+ require 'moonstone/racker'
+ module Moonstone
+   module Racker
+     module LocalSearch
+       include Moonstone::Racker
+
+       def json_GET_search(request)
+         args = request.params.values_at('input', 'lat', 'lon')
+         options = search_options(request)
+         args << options
+         t = Time.now
+         results = search(*args).to_hash
+         results[:time] = Time.now - t
+         results.to_json
+       end
+
+       # JSON body should contain an array of 3-element arrays (topic, lat, lon):
+       #   curl -i -X POST -d '[ ["plumbers", "", ""], ["burgers", "", ""] ]' \
+       #     http://localhost:9292/search.json
+       def json_POST_search(request)
+         options = search_options(request)
+         data = request.env['rack.input'].read
+         JSON.parse(data).map do |input, lat, lon|
+           t = Time.now
+           results = search(input, lat, lon, options).to_hash
+           results[:time] = Time.now - t
+           results
+         end.to_json
+       end
+
+     end
+   end
+ end
data/lib/moonstone/racker.rb ADDED
@@ -0,0 +1,84 @@
+ require 'rack'
+ require 'json'
+ module Moonstone
+   # include me in a Moonstone::Engine, maybe?
+   module Racker
+
+     PathMatcher = %r{^/([\w_]+)\.([\w_]+)$}
+
+     def call(env)
+       request, response = Rack::Request.new(env), Rack::Response.new
+       # Determine (or possibly fake) an HTTP method
+       real = request.request_method.upcase
+       http_method = if (real == 'POST') && (fake = request.params['_method'])
+         fake.upcase
+       else
+         real
+       end
+       # Match against a very limited species of URI path.
+       whole, action, ext = request.path_info.match(PathMatcher).to_a
+       # Poor man's content negotiation
+       content_type = case ext
+       when 'json'
+         'application/json'
+       end
+       response['Content-Type'] = content_type if content_type
+       # Poor man's routing
+       method_name = action ? "#{ext || 'html'}_#{http_method}_#{action}" : nil
+       if method_name && respond_to?(method_name)
+         response.body = send(method_name, request).to_s
+       else
+         response.status, response.body = 404, "404"
+       end
+       response.finish
+     end
+
+     # helper for action methods
+     def search_options(request)
+       params = request.params
+       limit = params['limit']
+       offset = params['offset']
+       options = {}
+       options[:limit] = limit.to_i if limit
+       options[:offset] = offset.to_i if offset
+       options
+     end
+
+     def json_GET_engine_version(request)
+       { :name => self.class.name,
+         :version => `git show-ref -h -s --abbrev HEAD`.chomp.split.first
+       }.to_json
+     end
+
+     def json_GET_index_info(request)
+       md = index_metadata || {}
+       { :build_date => md["build_date"],
+         :build_engine => { :name => md["engine_name"],
+                            :version => md["engine_version"] },
+         :query_conditions => md["query_conditions"],
+         :doc_count => doc_count
+       }.to_json
+     end
+
+     def json_GET_document(request)
+       document(request.params['id'].to_i).to_json
+     end
+
+     def self.generate_rackup_file(engine, store)
+       rackup = <<RACKUP
+ options[:Port] = 9293
+ #{yield}
+ require 'moonstone/racker/local_search'
+ #{engine}.module_eval do
+   include Moonstone::Racker::LocalSearch
+ end
+ run #{engine}.new(:store => "#{File.expand_path store}")
+ RACKUP
+
+       File.open "#{File.dirname(store)}/config.ru", "w" do |f|
+         f.puts rackup
+       end
+     end
+
+   end
+ end
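
Routing is by naming convention, ext_METHOD_action: GET /engine_version.json dispatches to json_GET_engine_version, and a POST carrying a _method parameter can fake other verbs. generate_rackup_file writes a config.ru beside the index directory; a sketch (engine name and paths are hypothetical):

  Moonstone::Racker.generate_rackup_file("BookEngine", "/var/index/books") do
    "require 'book_engine'"  # the yield supplies setup lines for the rackup file
  end
  # => writes /var/index/config.ru, which serves BookEngine via LocalSearch
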
data/lib/moonstone/tokenizer.rb ADDED
@@ -0,0 +1,19 @@
+ module Moonstone
+   class Tokenizer < Lucene::Analysis::Tokenizer
+
+     include Lucene::Analysis
+
+     def initialize(reader)
+       @reader = java.io.BufferedReader.new(reader)
+     end
+
+     # No, this is not terribly useful. Subclass me already.
+     def next(token=nil)
+       line = @reader.read_line or return nil
+       token = (token ? token.clear : Token.new)
+       token.set_term_text line
+       token.set_start_offset 1
+       token.set_end_offset 1
+       token
+     end
+
+   end
+ end
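
As the comment admits, the base class just emits whole lines; real tokenizers subclass it and override next. A hypothetical comma-splitting subclass:

  class CommaTokenizer < Moonstone::Tokenizer
    def initialize(reader)
      super
      @queue = []
    end

    def next(token=nil)
      if @queue.empty?
        line = @reader.read_line or return nil
        @queue.concat(line.split(","))
      end
      token = (token ? token.clear : Token.new)
      token.set_term_text @queue.shift
      token
    end
  end
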
data/lib/moonstone.rb ADDED
@@ -0,0 +1,28 @@
+ require 'java'
+ require 'logger'
+ require 'json'
+ require 'date' # Date.today is used by Moonstone::Engine#stamp_metadata
+
+ $:.unshift(here = File.dirname(__FILE__))
+
+ Dir["#{here}/jar/*.jar"].each { |jar| require jar }
+ require 'lucene/analysis'
+ require 'lucene/document'
+ require 'lucene/function'
+ require 'lucene/index'
+ require 'lucene/query_parser'
+ require 'lucene/search'
+ require 'lucene/store'
+
+ require 'moonstone/engine'
+ require 'moonstone/tokenizer'
+ require 'moonstone/filter'
+ require 'moonstone/queued_filter'
+ require 'moonstone/analyzer'
+ require 'moonstone/multi_analyzer'
+ require 'moonstone/index_inspection'
+
+ require 'moonstone/filters/synonymer'
+
+ require 'moonstone/racker'
+
+ Moonstone::Logger = Logger.new($stderr) unless defined? Moonstone::Logger
metadata ADDED
@@ -0,0 +1,91 @@
+ --- !ruby/object:Gem::Specification
+ extensions: []
+
+ homepage:
+ executables: []
+
+ version: !ruby/object:Gem::Version
+   version: 0.6.0
+ post_install_message:
+ date: 2009-06-16 07:00:00 +00:00
+ files:
+ - lib/jar
+ - lib/lucene
+ - lib/moonstone
+ - lib/moonstone.rb
+ - lib/jar/lucene-core-2.4.0.jar
+ - lib/jar/lucene-spellchecker-2.4-dev.jar
+ - lib/lucene/analysis.rb
+ - lib/lucene/document.rb
+ - lib/lucene/function.rb
+ - lib/lucene/index.rb
+ - lib/lucene/query_parser.rb
+ - lib/lucene/search.rb
+ - lib/lucene/store.rb
+ - lib/moonstone/analyzer.rb
+ - lib/moonstone/engine.rb
+ - lib/moonstone/filter.rb
+ - lib/moonstone/filters
+ - lib/moonstone/index_inspection.rb
+ - lib/moonstone/multi_analyzer.rb
+ - lib/moonstone/queued_filter.rb
+ - lib/moonstone/racker
+ - lib/moonstone/racker.rb
+ - lib/moonstone/tokenizer.rb
+ - lib/moonstone/filters/synonymer.rb
+ - lib/moonstone/racker/basic_search.rb
+ - lib/moonstone/racker/local_search.rb
+ - Rakefile
+ rubygems_version: 1.3.1
+ rdoc_options: []
+
+ signing_key:
+ cert_chain: []
+
+ name: moonstone
+ has_rdoc: true
+ platform: ruby
+ summary: Moonstone Agile Search Framework
+ default_executable:
+ bindir: bin
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   version:
+   requirements:
+   - - '>='
+     - !ruby/object:Gem::Version
+       version: "0"
+ required_ruby_version: !ruby/object:Gem::Requirement
+   version:
+   requirements:
+   - - '>='
+     - !ruby/object:Gem::Version
+       version: "0"
+ require_paths:
+ - lib
+ specification_version: 2
+ test_files: []
+
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   type: :runtime
+   name: rspec
+   version_requirement:
+   version_requirements: !ruby/object:Gem::Requirement
+     version:
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: "0"
+ description:
+ email: self@automatthew.com
+ authors:
+ - Matthew King
+ - Jason Rush
+ - Jay Donnell
+ - Dan Yoder
+ extra_rdoc_files: []
+
+ requirements: []
+
+ rubyforge_project:
+ autorequire: