solrizer 0.1.0.pre2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,19 @@
1
+ .DS_Store
2
+ nohup.out
3
+
4
+ *.sqlite3
5
+ *.log
6
+ *~
7
+ *.swp
8
+
9
+ pkg/
10
+ coverage/*
11
+
12
+ tmp/**/*
13
+ tmp/performance
14
+
15
+ rerun.txt
16
+
17
+ .loadpath
18
+ .project
19
+ .buildpath
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Matt Zumwalt
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,17 @@
1
+ = solrizer
2
+
3
+ A utility for building solr indexes, usually from Fedora repository content.
4
+
5
+ == Note on Patches/Pull Requests
6
+
7
+ * Fork the project.
8
+ * Make your feature addition or bug fix.
9
+ * Add tests for it. This is important so I don't break it in a
10
+ future version unintentionally.
11
+ * Commit, do not mess with rakefile, version, or history.
12
+ (if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
13
+ * Send me a pull request. Bonus points for topic branches.
14
+
15
+ == Copyright
16
+
17
+ Copyright (c) 2010 Matt Zumwalt. See LICENSE for details.
@@ -0,0 +1,46 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
+ begin # Jeweler is optional: a LoadError below is reported with a hint and the rest of the Rakefile still loads.
5
+ require 'jeweler'
6
+ Jeweler::Tasks.new do |gem|
7
+ gem.name = "solrizer"
8
+ gem.summary = %Q{A utility for building solr indexes, usually from Fedora repository content.}
9
+ gem.description = %Q{Use solrizer to populate solr indexes from Fedora repository content or from other sources. You can run solrizer from within your apps, using the provided rake tasks, or as a JMS listener}
10
+ gem.email = "matt.zumwalt@yourmediashelf.com"
11
+ gem.homepage = "http://github.com/projecthydra/solrizer"
12
+ gem.authors = ["Matt Zumwalt"]
13
+ gem.add_dependency "active-fedora", "> 1.1.3"
14
+ gem.add_development_dependency "rspec", ">= 1.2.9"
15
+ # The block argument is a Gem::Specification; see http://www.rubygems.org/read/chapter/20 for additional settings.
16
+ end
17
+ Jeweler::GemcutterTasks.new
18
+ rescue LoadError
19
+ puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
20
+ end
21
+
22
+ require 'spec/rake/spectask'
23
+ Spec::Rake::SpecTask.new(:spec) do |spec| # :spec task over every spec/**/*_spec.rb file
24
+ spec.libs << 'lib' << 'spec'
25
+ spec.spec_files = FileList['spec/**/*_spec.rb']
26
+ end
27
+
28
+ Spec::Rake::SpecTask.new(:rcov) do |spec| # :rcov task: same file pattern as :spec, with the rcov flag switched on
29
+ spec.libs << 'lib' << 'spec'
30
+ spec.pattern = 'spec/**/*_spec.rb'
31
+ spec.rcov = true
32
+ end
33
+
34
+ task :spec => :check_dependencies # NOTE(review): :check_dependencies is not defined in this file; presumably supplied by Jeweler -- confirm :spec still runs when Jeweler fails to load
35
+
36
+ task :default => :spec # a bare `rake` runs the :spec task
37
+
38
require 'rake/rdoctask'

# Generates rdoc for the README and everything under lib/ into ./rdoc.
Rake::RDocTask.new do |rdoc|
  # The VERSION file ends with a newline; strip it so the generated title
  # stays on a single line. Falls back to an empty version when the file
  # is missing.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "solrizer #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0.pre2
@@ -0,0 +1,16 @@
1
+ development:
2
+ fedora:
3
+ url: http://fedoraAdmin:fedoraAdmin@127.0.0.1:8983/fedora
4
+ solr:
5
+ url: http://127.0.0.1:8983/solr/development
6
+ test:
7
+ fedora:
8
+ url: http://fedoraAdmin:fedoraAdmin@127.0.0.1:8983/fedora
9
+ solr:
10
+ url: http://127.0.0.1:8983/solr/test
11
+ production:
12
+ fedora:
13
+ url: http://fedoraAdmin:fedoraAdmin@127.0.0.1:8080/fedora
14
+ solr:
15
+ url: http://127.0.0.1:8080/solr
16
+
@@ -0,0 +1,4 @@
1
+ info:fedora/afmodel:SaltDocument : salt_document
2
+ info:fedora/afmodel:JP2Document : jp2_document
3
+ info:fedora/afmodel:ModsDocument : mods_document
4
+ info:fedora/afmodel:DCDocument : dc_document
@@ -0,0 +1,7 @@
1
+ development:
2
+ url: http://localhost:8983/solr/development
3
+ test: &TEST
4
+ url: http://localhost:8983/solr/test
5
+ production:
6
+ url: http://localhost:8080/solr/production
7
+
@@ -0,0 +1,101 @@
1
+ require 'rubygems'
2
+ require 'solrizer/indexer.rb'
3
+ # require 'fastercsv'
4
+ require "ruby-debug"
5
+
6
+
7
+
8
+ module Solrizer
9
+ class Solrizer
10
+
11
+ attr_accessor :indexer, :index_full_text
12
+
13
#
# Builds a Solrizer together with the Indexer it delegates to.
#
# opts[:index_full_text] - true or the string "true" switches full-text
#                          indexing on; any other value (or no value)
#                          leaves it off, so only fields are indexed.
#
def initialize( opts={} )
  @@index_list = false unless defined?(@@index_list)

  requested = opts[:index_full_text]
  @index_full_text = (requested == true || requested == "true")

  @indexer = Indexer.new( :index_full_text=>@index_full_text )
end
26
+
27
#
# Solrizes a single Fedora object into the search index.
#
# obj - an ActiveFedora::Base instance, or any other value, which is passed
#       to Repository.get_object to load the object.
#
# Progress and timings are written to stdout. A failure while loading or
# indexing is reported and swallowed so a batch run can move on to the next
# object. Only StandardError is rescued: Interrupt (Ctrl-C), SystemExit and
# similar non-standard exceptions propagate instead of being silently eaten.
#
def solrize( obj )
  start = Time.now
  print "Retrieving object #{obj} ..."
  # An already-loaded object is used as-is; anything else is looked up.
  obj = obj.kind_of?(ActiveFedora::Base) ? obj : Repository.get_object( obj )
  puts " completed. Duration: #{Time.now - start}"

  print "\t Indexing object #{obj.pid} ... "
  # add the keywords and facets to the search index
  index_start = Time.now
  indexer.index( obj )
  puts "completed. Duration: #{Time.now - index_start} ."
rescue StandardError => e
  p "unable to index #{obj}. Failed with #{e.inspect}"
end
61
+
62
#
# Solrizes many objects in one run.
#
# When the class variable @@index_list is false, every pid returned by
# Repository.get_pids is solrized. Otherwise @@index_list is treated as the
# path of a CSV file and the first column of each row is solrized.
#
def solrize_objects
  # Upper bound on the number of pids requested from the repository; raise
  # it if the repository holds more objects than this.
  num_docs = 1000000
  puts "WARNING: You have turned off indexing of Full Text content.  Be sure to re-run indexer with @@index_full_text set to true in main.rb" if index_full_text == false

  if @@index_list == false
    pids = Repository.get_pids( num_docs )
    puts "Shelving #{pids.length} Fedora objects"
    pids.each do |pid|
      # Check for nil before empty?: the reverse order raised NoMethodError
      # on a nil entry instead of skipping it.
      next if pid[0].nil? || pid[0].empty?
      solrize( pid )
    end
  elsif File.exist?(@@index_list) # File.exists? was removed in Ruby 3.2
    # NOTE(review): the fastercsv require at the top of this file is
    # commented out, so FasterCSV must be loaded elsewhere -- confirm.
    arr_of_pids = FasterCSV.read(@@index_list, :headers=>false)

    puts "Indexing from list at #{@@index_list}"
    puts "Shelving #{arr_of_pids.length} Fedora objects"

    arr_of_pids.each do |row|
      solrize( row[0] )
    end
  else
    puts "#{@@index_list} does not exists!"
  end
end
99
+
100
+ end #class
101
+ end #module
@@ -0,0 +1,8 @@
1
+
2
+ # FEDORA_URL = 'http://fedoraAdmin:fedoraAdmin@salt-dev.stanford.edu/fedora'
3
+ # FEDORA_SOLR_URL = 'http://salt-dev.stanford.edu:8080/solr'
4
+ # SHELVER_SOLR_URL = 'http://sulwebappdev1.stanford.edu:8100/salt_solr'
5
+ FEDORA_URL = 'http://fedoraAdmin:fedoraAdmin@localhost:8080/fedora' # NOTE(review): user and password are embedded in the URL; presumably local-development defaults -- confirm before deploying
6
+ FEDORA_SOLR_URL = 'http://localhost:8080/solr'
7
+ SHELVER_SOLR_URL = 'http://localhost:8080/bl_solr'
8
+
@@ -0,0 +1,89 @@
1
+ require 'solr'
2
+ require 'rexml/document'
3
+ require "nokogiri"
4
+ require 'yaml'
5
+
6
+ module Solrizer
7
+ class Extractor
8
+
9
+
10
# Parses +text+ as XML and returns one hash holding the archivist_tags and
# donor_tags lists found under /fields. A tag type that is absent from the
# document contributes no key.
def extract_tags(text)
  doc = REXML::Document.new( text )
  archivist = extract_tag(doc, 'archivist_tags')
  donor = extract_tag(doc, 'donor_tags')
  archivist.merge(donor)
end
14
+
15
# Reads the comma-separated tag list stored at /fields/<type>.
#
# doc  - a parsed document whose #elements can be indexed by path.
# type - the element name, e.g. 'archivist_tags'.
#
# Returns {} when the element is absent, otherwise {type => [tags]} with
# surrounding whitespace stripped from every tag. An element that exists
# but has no text yields an empty list rather than raising on nil.
def extract_tag(doc, type)
  tags = doc.elements["/fields/#{type}"]
  return {} unless tags
  {type => tags.text.to_s.split(/,/).map { |t| t.strip }}
end
20
+
21
+
22
#
# Extracts content-model and hydra-type fields from a RELS-EXT datastream.
#
# text     - the RELS-EXT XML as a string.
# solr_doc - document to append to (a new Solr::Document by default).
#
# Every hasModel element adds a :cmodel_t field holding its resource
# attribute. When that value is a key of config/hydra_types.yml, the mapped
# value is added as :hydra_type_t. Returns solr_doc.
#
def extract_rels_ext( text, solr_doc=Solr::Document.new )
  # TODO: only read in this file once
  config_path = if defined?(RAILS_ROOT)
    File.join(RAILS_ROOT, "config")
  else
    File.join(File.dirname(__FILE__), "..", "..", "config")
  end

  # Block form closes the file handle as soon as the YAML is parsed; the
  # `|| {}` covers an empty file, for which YAML.load returns false.
  map = File.open(File.join(config_path, "hydra_types.yml")) { |file| YAML.load(file) } || {}

  doc = Nokogiri::XML(text)
  doc.xpath( '//foo:hasModel', 'foo' => 'info:fedora/fedora-system:def/model#' ).each do |element|
    cmodel = element.attributes['resource'].to_s
    solr_doc << Solr::Field.new( :cmodel_t => cmodel )

    if map.has_key?(cmodel)
      solr_doc << Solr::Field.new( :hydra_type_t => map[cmodel] )
    end
  end

  solr_doc
end
47
+
48
#
# Extracts solr fields from simple XML.
#
# text     - an XML string; each child element of the root becomes one
#            field named "<element name>_t" holding the element's text.
# solr_doc - document to append to (a new Solr::Document by default).
#
# Returns solr_doc. Input without a root element (e.g. an empty string)
# adds no fields instead of raising NoMethodError on a nil root.
#
def xml_to_solr( text, solr_doc=Solr::Document.new )
  root = REXML::Document.new( text ).root
  return solr_doc if root.nil?

  root.elements.each do |element|
    solr_doc << Solr::Field.new( :"#{element.name}_t" => "#{element.text}" )
  end

  solr_doc
end
59
+
60
#
# Turns the HTML content of a datastream into solr fields.
#
# ds       - object whose #content is HTML-escaped markup.
# solr_doc - document to append to (a new Solr::Document by default).
#
# One :story_display field is added per <story> element, holding the XML of
# its children. A single :story_t field then receives the concatenation of
# every text node in the document. Returns solr_doc.
#
# NOTE(review): CGI is used here but 'cgi' is not among this file's
# requires -- confirm it is loaded elsewhere.
#
def html_content_to_solr( ds, solr_doc=Solr::Document.new )
  unescaped = CGI.unescapeHTML(ds.content)
  doc = Nokogiri::HTML(unescaped)

  # html to story_display
  doc.xpath('//story').each do |story|
    solr_doc << Solr::Field.new(:story_display => story.children.to_xml)
  end

  # strip out text and put in story_t
  story_text = doc.xpath("//text()").inject(String.new) do |buffer, text_node|
    buffer << text_node.content
  end
  solr_doc << Solr::Field.new(:story_t => story_text)

  return solr_doc
end
87
+
88
+ end
89
+ end
@@ -0,0 +1,261 @@
1
+ require 'solr'
2
+ require 'solrizer/extractor'
3
+ require 'solrizer/repository'
4
+
5
+
6
+ module Solrizer
7
+ class Indexer
8
+ #
9
+ # Class variables
10
+ #
11
+ @@unique_id = 0
12
+
13
+ def self.unique_id # Class-level reader for the @@unique_id class variable.
14
+ @@unique_id
15
+ end
16
+
17
+ #
18
+ # Member variables
19
+ #
20
+ attr_accessor :connection, :extractor, :index_full_text
21
+
22
#
# Prepares the indexer: creates an Extractor, records whether full-text
# indexing was requested, then connects to Solr.
#
# opts[:index_full_text] - true or the string "true" enables full-text
#                          indexing; anything else disables it.
#
def initialize( opts={} )
  @@index_list = false unless defined?(@@index_list)
  @extractor = Extractor.new

  requested = opts[:index_full_text]
  @index_full_text = (requested == true || requested == "true")

  connect
end
37
+
38
#
# Connects to the Solr instance and stores the connection in @connection.
#
# Settings come from Blacklight.solr_config when Blacklight is defined.
# Otherwise they are read from config/solr.yml: under RAILS_ROOT keyed by
# RAILS_ENV, or relative to this file keyed by ENV["environment"]
# (defaulting to "development").
#
# URL selection: the 'fulltext' section when full-text indexing is on,
# else the 'default' section when present, else the top-level 'url'.
#
# Raises RuntimeError when full-text indexing is on but the configuration
# has no 'fulltext' section.
#
def connect
  ActiveFedora.init if ActiveFedora.fedora_config.empty?

  if defined?(Blacklight)
    solr_config = Blacklight.solr_config
  else
    if defined?(RAILS_ROOT)
      config_dir = File.join(RAILS_ROOT, "config")
      environment = RAILS_ENV
    else
      config_dir = File.join(File.dirname(__FILE__), "..", "..", "config")
      environment = ENV["environment"] || "development"
    end

    # Block form of File.open closes the handle once the YAML is parsed.
    yaml = File.open(File.join(config_dir, "solr.yml")) { |file| YAML.load(file) }
    solr_config = yaml[environment]
    puts solr_config.inspect
  end

  if index_full_text == true
    fulltext = solr_config['fulltext']
    # Fail with a readable message rather than a NoMethodError on nil.
    raise "No 'fulltext' solr configuration found; it is required when index_full_text is enabled" if fulltext.nil?
    url = fulltext['url']
  elsif solr_config.has_key?("default")
    url = solr_config['default']['url']
  else
    url = solr_config['url']
  end

  @connection = Solr::Connection.new(url, :autocommit => :on )
end
82
+
83
#
# Fetches datastream +ds_name+ of +obj+ through Repository.get_datastream
# and passes its content to the extractor's xml_to_solr, appending to
# +solr_doc+ (a new Solr::Document by default). Returns whatever the
# extractor returns.
#
def extract_xml_to_solr( obj, ds_name, solr_doc=Solr::Document.new )
  content = Repository.get_datastream( obj, ds_name ).content
  extractor.xml_to_solr( content, solr_doc )
end
90
+
91
#
# Fetches datastream +ds_name+ of +obj+ through Repository.get_datastream
# and passes its content to the extractor's extract_rels_ext, appending to
# +solr_doc+ (a new Solr::Document by default). Returns whatever the
# extractor returns.
#
def extract_rels_ext( obj, ds_name, solr_doc=Solr::Document.new )
  content = Repository.get_datastream( obj, ds_name ).content
  extractor.extract_rels_ext( content, solr_doc )
end
98
+
99
#
# Adds :month_facet and :day_facet fields derived from the document's
# :date_t value and returns the document.
#
# A document without :date_t first receives the easy-to-find placeholder
# "9999-99-99". The date is parsed leniently with Date._parse rather than
# validated. A month in 1..12 (day in 1..31) is stored as a zero-padded
# two-character string; a missing or out-of-range part is stored as the
# integer 99.
#
def generate_dates(solr_doc)
  solr_doc << Solr::Field.new( :date_t => "9999-99-99") if solr_doc[:date_t].nil?

  parsed = Date._parse(solr_doc[:date_t])

  month = parsed[:mon]
  month_value = (!month.nil? && 0 < month && month < 13) ? month.to_s.rjust(2, '0') : 99
  solr_doc << Solr::Field.new( :month_facet => month_value )

  day = parsed[:mday]
  day_value = (!day.nil? && 0 < day && day < 32) ? day.to_s.rjust(2, '0') : 99
  solr_doc << Solr::Field.new( :day_facet => day_value )

  return solr_doc
end
136
+
137
+
138
#
# Builds the Solr document for +obj+.
#
# The object's own to_solr is called first. The object is then re-loaded
# as each of its other known models (ActiveFedora::Base and the object's
# own class excluded) and each instance adds its fields with
# :model_only=>true, so the base metadata is not added again. Finally
# :id_t is added, plus :id when the document has none, and the class-wide
# @@unique_id counter is incremented. Returns the document.
#
def create_document( obj )
  solr_doc = Solr::Document.new

  other_models = ActiveFedora::ContentModel.known_models_for( obj )
  other_models.delete(ActiveFedora::Base)

  # Both a plain ActiveFedora::Base and a model instance contribute their
  # fields through their own to_solr; only the bookkeeping differs.
  solr_doc = obj.to_solr(solr_doc)
  if obj.class == ActiveFedora::Base
    puts "  added base fields from #{obj.class.to_s}"
  else
    other_models.delete(obj.class)
    puts "  added base fields from #{obj.class.to_s} and model fields from #{obj.class.to_s}"
  end

  other_models.each do |klazz|
    instance = klazz.load_instance(obj.pid)
    solr_doc = instance.to_solr(solr_doc, :model_only=>true)
    puts "  added solr fields from #{klazz.to_s}"
  end

  solr_doc << Solr::Field.new( :id_t => "#{obj.pid}" )
  solr_doc << Solr::Field.new( :id => "#{obj.pid}" ) unless solr_doc[:id]

  # increment the unique id to ensure that all documents in the search index are unique
  @@unique_id += 1

  return solr_doc
end
175
+
176
#
# Adds +obj+ to the Solr search index: builds its document with
# create_document and hands it to the connection's add. Errors are not
# rescued here; they propagate to the caller. Returns the result of
# connection.add.
#
def index( obj )
  connection.add( create_document( obj ) )
end
195
+
196
#
# Queries the Solr search index and returns the response.
#
# query_str - the query string passed to the connection.
# block     - optional; forwarded to the connection's query so callers can
#             iterate over results.
#
# The original body called an undefined `conn`; the accessor declared by
# this class is `connection`.
#
def query( query_str, &block )
  connection.query( query_str, &block )
end
202
+
203
#
# Prints the inspect form of every hit for the given query string.
#
# The block goes straight to the connection's query. Routing it through
# #query printed nothing, because that method never yielded to its block.
#
def printResults( query_str )
  connection.query( query_str ) do |hit|
    puts hit.inspect
  end
end
211
+
212
+ #
213
+ # Deletes the document with the given id from the Solr search index by calling delete on the current connection; returns that call's result.
214
+ #
215
+ def deleteDocument( id )
216
+ connection.delete( id )
217
+ end
218
+
219
# Populates a solr doc with values from a hash.
# Accepts two forms of hashes:
# => {'technology'=>["t1", "t2"], 'company'=>"c1", "person"=>["p1", "p2"]}
# or
# => {:facets => {'technology'=>["t1", "t2"], 'company'=>"c1", "person"=>["p1", "p2"]} }
#
# Entries become "<name>_facet" fields. When the hash has a :symbols key,
# its entries become "<name>_s" fields as well.
#
# A value may be a single string or an array of strings; values of any
# other type are skipped. Values are matched by class (`when String`), not
# by class name, so subclasses of String and Array are accepted too.
#
# Returns solr_doc (a new Solr::Document by default).
def self.solrize( input_hash, solr_doc=Solr::Document.new )
  facets = input_hash.has_key?(:facets) ? input_hash[:facets] : input_hash

  # Each group is [fields hash, field-name suffix]; facets always come first.
  groups = [[facets, "facet"]]
  groups << [input_hash[:symbols], "s"] if input_hash.has_key?(:symbols)

  groups.each do |fields, suffix|
    fields.each_pair do |field_name, value|
      values = case value
               when String then [value]
               when Array  then value
               else []
               end
      values.each { |v| solr_doc << Solr::Field.new( :"#{field_name}_#{suffix}" => "#{v}" ) }
    end
  end

  return solr_doc
end
249
+
250
+
251
+ private :connect, :create_document
252
+
253
# Reports whether +class_name+ names a constant that is a Class.
# Returns false when the constant is a non-class (e.g. a module) or when
# looking it up raises NameError.
def class_exists?(class_name)
  Module.const_get(class_name).is_a?(Class)
rescue NameError
  false
end
259
+
260
+ end
261
+ end