solrizer 0.1.0.pre2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,19 @@
1
+ .DS_Store
2
+ nohup.out
3
+
4
+ *.sqlite3
5
+ *.log
6
+ *~
7
+ *.swp
8
+
9
+ pkg/
10
+ coverage/*
11
+
12
+ tmp/**/*
13
+ tmp/performance
14
+
15
+ rerun.txt
16
+
17
+ .loadpath
18
+ .project
19
+ .buildpath
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Matt Zumwalt
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,17 @@
1
+ = solrizer
2
+
3
+ A utility for building solr indexes, usually from Fedora repository content.
4
+
5
+ == Note on Patches/Pull Requests
6
+
7
+ * Fork the project.
8
+ * Make your feature addition or bug fix.
9
+ * Add tests for it. This is important so I don't break it in a
10
+ future version unintentionally.
11
+ * Commit, but do not modify the Rakefile, VERSION, or history.
12
+ (If you want to have your own version, that is fine, but bump the version in a separate commit that I can ignore when I pull.)
13
+ * Send me a pull request. Bonus points for topic branches.
14
+
15
+ == Copyright
16
+
17
+ Copyright (c) 2010 Matt Zumwalt. See LICENSE for details.
@@ -0,0 +1,46 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
# Gem packaging tasks. Jeweler is optional: when it cannot be loaded we
# only print an installation hint and carry on defining the other tasks.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gemspec|
    gemspec.name        = "solrizer"
    gemspec.summary     = "A utility for building solr indexes, usually from Fedora repository content."
    gemspec.description = "Use solrizer to populate solr indexes from Fedora repository content or from other sources.  You can run solrizer from within your apps, using the provided rake tasks, or as a JMS listener"
    gemspec.email       = "matt.zumwalt@yourmediashelf.com"
    gemspec.homepage    = "http://github.com/projecthydra/solrizer"
    gemspec.authors     = ["Matt Zumwalt"]
    gemspec.add_dependency "active-fedora", "> 1.1.3"
    gemspec.add_development_dependency "rspec", ">= 1.2.9"
    # gemspec is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# Spec tasks (RSpec 1.x rake integration).
require 'spec/rake/spectask'

# rake spec -- runs every *_spec.rb file under spec/.
Spec::Rake::SpecTask.new(:spec) do |t|
  %w[lib spec].each { |dir| t.libs << dir }
  t.spec_files = FileList['spec/**/*_spec.rb']
end

# rake rcov -- same spec files, with rcov turned on.
Spec::Rake::SpecTask.new(:rcov) do |t|
  %w[lib spec].each { |dir| t.libs << dir }
  t.pattern = 'spec/**/*_spec.rb'
  t.rcov = true
end

# :check_dependencies is not defined in this file; presumably it is provided
# by the Jeweler tasks loaded above -- confirm before relying on it.
task :spec => :check_dependencies

task :default => :spec

# rake rdoc -- generates API docs from the README and everything under lib/.
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # The title carries the contents of the VERSION file when that file exists.
  version = if File.exist?('VERSION')
              File.read('VERSION')
            else
              ""
            end

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "solrizer #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0.pre2
@@ -0,0 +1,16 @@
1
+ development:
2
+ fedora:
3
+ url: http://fedoraAdmin:fedoraAdmin@127.0.0.1:8983/fedora
4
+ solr:
5
+ url: http://127.0.0.1:8983/solr/development
6
+ test:
7
+ fedora:
8
+ url: http://fedoraAdmin:fedoraAdmin@127.0.0.1:8983/fedora
9
+ solr:
10
+ url: http://127.0.0.1:8983/solr/test
11
+ production:
12
+ fedora:
13
+ url: http://fedoraAdmin:fedoraAdmin@127.0.0.1:8080/fedora
14
+ solr:
15
+ url: http://127.0.0.1:8080/solr
16
+
@@ -0,0 +1,4 @@
1
+ info:fedora/afmodel:SaltDocument : salt_document
2
+ info:fedora/afmodel:JP2Document : jp2_document
3
+ info:fedora/afmodel:ModsDocument : mods_document
4
+ info:fedora/afmodel:DCDocument : dc_document
@@ -0,0 +1,7 @@
1
+ development:
2
+ url: http://localhost:8983/solr/development
3
+ test: &TEST
4
+ url: http://localhost:8983/solr/test
5
+ production:
6
+ url: http://localhost:8080/solr/production
7
+
@@ -0,0 +1,101 @@
1
+ require 'rubygems'
2
+ require 'solrizer/indexer.rb'
3
+ # require 'fastercsv'
4
+ require "ruby-debug"
5
+
6
+
7
+
8
module Solrizer
  # Drives indexing of Fedora objects into Solr: fetches each object,
  # hands it to an Indexer and reports progress and timings on stdout.
  class Solrizer

    attr_accessor :indexer, :index_full_text

    #
    # Builds the underlying Indexer.
    #
    # opts[:index_full_text] - true (or the string "true") turns on full-text
    #   indexing; any other value, or no value, indexes fields only.
    #
    def initialize( opts={} )
      # @@index_list is either false (index every pid returned by the
      # repository) or the path of a CSV file of pids. It is only given a
      # default here; nothing in this class assigns a path to it.
      @@index_list = false unless defined?(@@index_list)

      requested = opts[:index_full_text]
      @index_full_text = (requested == true || requested == "true")

      @indexer = Indexer.new( :index_full_text=>@index_full_text )
    end

    #
    # Indexes one object.
    #
    # obj - an ActiveFedora::Base instance, or an identifier that is passed
    #   to Repository.get_object to load the object.
    #
    # Failures are reported on stdout and swallowed so that a batch run
    # continues with the next object. Only StandardError is rescued, so
    # Interrupt (Ctrl-C), SystemExit and similar still stop the run.
    #
    def solrize( obj )
      fetch_started = Time.now
      print "Retrieving object #{obj} ..."
      obj = obj.kind_of?(ActiveFedora::Base) ? obj : Repository.get_object( obj )
      puts " completed. Duration: #{Time.now - fetch_started}"

      print "\t Indexing object #{obj.pid} ... "
      index_started = Time.now
      indexer.index( obj )
      puts "completed. Duration: #{Time.now - index_started} ."
    rescue StandardError => e
      p "unable to index #{obj}. Failed with #{e.inspect}"
    end

    #
    # Indexes many objects: every pid returned by Repository.get_pids when
    # @@index_list is false, otherwise the pids found in the first column of
    # the CSV file named by @@index_list.
    #
    def solrize_objects
      # upper bound handed to Repository.get_pids; raise it if the
      # repository holds more objects than this
      num_docs = 1000000
      puts "WARNING: You have turned off indexing of Full Text content.  Be sure to re-run indexer with @@index_full_text set to true in main.rb" if index_full_text == false

      if @@index_list == false
        pids = Repository.get_pids( num_docs )
        puts "Shelving #{pids.length} Fedora objects"
        pids.each do |pid|
          # nil must be tested first: calling empty? on nil raises
          solrize( pid ) unless pid[0].nil? || pid[0].empty?
        end
      elsif File.exist?(@@index_list)
        # NOTE(review): FasterCSV is not required by this file (the require
        # at the top is commented out), so it has to be loaded by the caller.
        arr_of_pids = FasterCSV.read(@@index_list, :headers=>false)

        puts "Indexing from list at #{@@index_list}"
        puts "Shelving #{arr_of_pids.length} Fedora objects"

        arr_of_pids.each do |row|
          solrize( row[0] )
        end
      else
        puts "#{@@index_list} does not exists!"
      end
    end

  end
end
@@ -0,0 +1,8 @@
1
+
2
+ # FEDORA_URL = 'http://fedoraAdmin:fedoraAdmin@salt-dev.stanford.edu/fedora'
3
+ # FEDORA_SOLR_URL = 'http://salt-dev.stanford.edu:8080/solr'
4
+ # SHELVER_SOLR_URL = 'http://sulwebappdev1.stanford.edu:8100/salt_solr'
5
# Local service endpoints. The strings are frozen so that the shared
# constants cannot be modified in place by callers.
# NOTE(review): the Fedora URL embeds a username and password.
FEDORA_URL = 'http://fedoraAdmin:fedoraAdmin@localhost:8080/fedora'.freeze
FEDORA_SOLR_URL = 'http://localhost:8080/solr'.freeze
SHELVER_SOLR_URL = 'http://localhost:8080/bl_solr'.freeze
8
+
@@ -0,0 +1,89 @@
1
+ require 'solr'
2
+ require 'rexml/document'
3
+ require "nokogiri"
4
+ require 'yaml'
5
+
6
require 'cgi' # CGI.unescapeHTML is used by html_content_to_solr

module Solrizer
  # Turns datastream content (simple XML, RELS-EXT RDF, HTML) into
  # Solr::Field entries on a Solr::Document.
  class Extractor

    #
    # Reads the archivist_tags and donor_tags elements found under /fields.
    #
    # text - XML source.
    #
    # Returns a Hash of tag type => Array of tag strings; a type whose
    # element is absent is left out of the Hash.
    #
    def extract_tags(text)
      doc = REXML::Document.new( text )
      extract_tag(doc, 'archivist_tags').merge(extract_tag(doc, 'donor_tags'))
    end

    #
    # Reads one comma-separated tag element.
    #
    # doc  - a parsed REXML document.
    # type - element name looked up at /fields/<type>.
    #
    # Returns {} when the element is absent, otherwise {type => [tags]}
    # with surrounding whitespace stripped from every tag.
    #
    def extract_tag(doc, type)
      tags = doc.elements["/fields/#{type}"]
      return {} unless tags
      # an element without text yields nil; treat it as "no tags"
      raw = tags.text || ""
      {type => raw.split(/,/).map {|t| t.strip}}
    end

    #
    # Extracts content-model and hydra-type from RELS-EXT datastream
    #
    # text     - RELS-EXT XML source.
    # solr_doc - document to append to (a new one by default).
    #
    # Adds a cmodel_t field for every hasModel element and, when the model
    # is listed in config/hydra_types.yml, a hydra_type_t field too.
    # Returns solr_doc.
    #
    def extract_rels_ext( text, solr_doc=Solr::Document.new )
      # TODO: only read in this file once
      config_path = if defined?(RAILS_ROOT)
                      File.join(RAILS_ROOT, "config")
                    else
                      File.join(File.dirname(__FILE__), "..", "..", "config")
                    end
      # block form so the file handle is closed once the YAML is parsed
      map = File.open(File.join(config_path, "hydra_types.yml")) { |f| YAML.load(f) }

      doc = Nokogiri::XML(text)
      doc.xpath( '//foo:hasModel', 'foo' => 'info:fedora/fedora-system:def/model#' ).each do |element|
        cmodel = element.attributes['resource'].to_s
        solr_doc << Solr::Field.new( :cmodel_t => cmodel )

        if map.has_key?(cmodel)
          solr_doc << Solr::Field.new( :hydra_type_t => map[cmodel] )
        end
      end

      return solr_doc
    end

    #
    # This method extracts solr fields from simple xml: every child element
    # of the root becomes a "<element name>_t" field holding its text.
    #
    # text     - XML source.
    # solr_doc - document to append to (a new one by default).
    #
    # Returns solr_doc; it is returned untouched when the XML has no root
    # element (for example an empty string).
    #
    def xml_to_solr( text, solr_doc=Solr::Document.new )
      doc = REXML::Document.new( text )
      return solr_doc unless doc.root

      doc.root.elements.each do |element|
        solr_doc << Solr::Field.new( :"#{element.name}_t" => "#{element.text}" )
      end

      return solr_doc
    end

    #
    # This method strips html tags out and returns content to be indexed in solr
    #
    # ds       - object whose #content is HTML-escaped markup.
    # solr_doc - document to append to (a new one by default).
    #
    # Adds one story_display field per <story> element (its children as
    # XML) and a single story_t field with all text nodes concatenated.
    # Returns solr_doc.
    #
    def html_content_to_solr( ds, solr_doc=Solr::Document.new )
      doc = Nokogiri::HTML( CGI.unescapeHTML(ds.content) )

      # html to story_display
      doc.xpath('//story').each do |story|
        solr_doc << Solr::Field.new(:story_display => story.children.to_xml)
      end

      # strip out text and put in story_t
      plain_text = doc.xpath("//text()").map { |node| node.content }.join
      solr_doc << Solr::Field.new(:story_t => plain_text)

      return solr_doc
    end

  end
end
@@ -0,0 +1,261 @@
1
+ require 'solr'
2
+ require 'solrizer/extractor'
3
+ require 'solrizer/repository'
4
+
5
+
6
require 'date' # Date._parse is used by generate_dates

module Solrizer
  # Builds Solr documents from Fedora objects and writes them to a Solr
  # index through a Solr::Connection.
  class Indexer
    #
    # Class variables
    #
    # counts the documents built by create_document
    @@unique_id = 0

    # Returns the current value of the document counter.
    def self.unique_id
      @@unique_id
    end

    #
    # Member variables
    #
    attr_accessor :connection, :extractor, :index_full_text

    #
    # This method performs initialization tasks: it creates the Extractor,
    # records the full-text flag and opens the Solr connection.
    #
    # opts[:index_full_text] - true (or the string "true") selects the
    #   full-text index; anything else selects the default index.
    #
    def initialize( opts={} )
      # only defaulted here; no method of this class reads it
      @@index_list = false unless defined?(@@index_list)
      @extractor = Extractor.new

      requested = opts[:index_full_text]
      @index_full_text = (requested == true || requested == "true")

      connect
    end

    #
    # This method connects to the Solr instance
    #
    # The configuration comes from Blacklight.solr_config when Blacklight is
    # defined; otherwise from config/solr.yml, keyed by RAILS_ENV under
    # Rails or by ENV["environment"] (default "development") elsewhere.
    #
    def connect
      ActiveFedora.init if ActiveFedora.fedora_config.empty?

      if defined?(Blacklight)
        solr_config = Blacklight.solr_config
      else
        if defined?(RAILS_ROOT)
          config_path = File.join(RAILS_ROOT, "config")
          environment = RAILS_ENV
        else
          config_path = File.join(File.dirname(__FILE__), "..", "..", "config")
          environment = ENV["environment"] || "development"
        end

        # block form so the file handle is closed once the YAML is parsed
        yaml = File.open(File.join(config_path, "solr.yml")) { |f| YAML.load(f) }
        solr_config = yaml[environment]
        puts solr_config.inspect
      end

      # A full-text run needs a 'fulltext' section in the configuration;
      # without one the lookup below raises NoMethodError on nil.
      if index_full_text == true
        url = solr_config['fulltext']['url']
      elsif solr_config.has_key?("default")
        url = solr_config['default']['url']
      else
        url = solr_config['url']
      end
      @connection = Solr::Connection.new(url, :autocommit => :on )
    end

    #
    # Fetches datastream ds_name of obj through Repository.get_datastream
    # and adds one solr field per child element of its XML content.
    # Returns solr_doc.
    #
    def extract_xml_to_solr( obj, ds_name, solr_doc=Solr::Document.new )
      xml_ds = Repository.get_datastream( obj, ds_name )
      extractor.xml_to_solr( xml_ds.content, solr_doc )
    end

    #
    # Fetches datastream ds_name of obj through Repository.get_datastream
    # and adds the content-model fields found in it. Returns solr_doc.
    #
    def extract_rels_ext( obj, ds_name, solr_doc=Solr::Document.new )
      rels_ext_ds = Repository.get_datastream( obj, ds_name )
      extractor.extract_rels_ext( rels_ext_ds.content, solr_doc )
    end

    #
    # This method generates the month and day facets from the date_t in solr_doc
    #
    # A missing date_t is first filled with the easy-to-find value
    # "9999-99-99". month_facet / day_facet receive a zero-padded two
    # character string for values in 1..12 / 1..31 and the integer 99
    # otherwise. Returns solr_doc.
    #
    def generate_dates(solr_doc)
      if solr_doc[:date_t].nil?
        solr_doc << Solr::Field.new( :date_t => "9999-99-99")
      end

      # Date._parse returns a Hash of the parts it recognised
      date_obj = Date._parse(solr_doc[:date_t])

      solr_doc << Solr::Field.new( :month_facet => date_part_facet(date_obj[:mon], 12) )
      solr_doc << Solr::Field.new( :day_facet => date_part_facet(date_obj[:mday], 31) )

      return solr_doc
    end

    #
    # This method creates a Solr-formatted XML document
    #
    # obj - an ActiveFedora::Base or an instance of one of its models.
    #
    # Returns a Solr::Document holding the fields from obj.to_solr, the
    # model-only fields of every other model reported for obj, plus id_t
    # and (unless already set) id.
    #
    def create_document( obj )
      solr_doc = Solr::Document.new

      model_klazz_array = ActiveFedora::ContentModel.known_models_for( obj )
      model_klazz_array.delete(ActiveFedora::Base)

      # obj.to_solr contributes the base fields and, when obj is a model
      # instance rather than a plain ActiveFedora::Base, that model's fields
      # as well -- so its class must not be processed again below.
      solr_doc = obj.to_solr(solr_doc)
      if obj.class == ActiveFedora::Base
        puts "  added base fields from #{obj.class.to_s}"
      else
        model_klazz_array.delete(obj.class)
        puts "    added base fields from #{obj.class.to_s} and model fields from #{obj.class.to_s}"
      end

      # Load the object as an instance of each of its other models and get the corresponding solr fields
      # Include :model_only=>true in the options in order to avoid adding the metadata from ActiveFedora::Base every time.
      model_klazz_array.each do |klazz|
        instance = klazz.load_instance(obj.pid)
        solr_doc = instance.to_solr(solr_doc, :model_only=>true)
        puts "  added solr fields from #{klazz.to_s}"
      end

      solr_doc << Solr::Field.new( :id_t => "#{obj.pid}" )
      solr_doc << Solr::Field.new( :id => "#{obj.pid}" ) unless solr_doc[:id]

      # count the document that was just built
      @@unique_id += 1

      return solr_doc
    end

    #
    # This method adds a document to the Solr search index
    #
    # Errors are not rescued here; they propagate to the caller.
    #
    def index( obj )
      connection.add( create_document( obj ) )
    end

    #
    # This method queries the Solr search index and returns a response
    #
    # query_str - the query string.
    # block     - optional; forwarded unchanged to the connection's query
    #             method.
    #
    def query( query_str, &block )
      connection.query( query_str, &block )
    end

    #
    # This method prints out the results of the given query string by iterating through all the hits
    #
    def printResults( query_str )
      query( query_str ) do |hit|
        puts hit.inspect
      end
    end

    #
    # This method deletes a document from the Solr search index by id
    #
    def deleteDocument( id )
      connection.delete( id )
    end

    # Populates a solr doc with values from a hash.
    # Accepts two forms of hashes:
    # => {'technology'=>["t1", "t2"], 'company'=>"c1", "person"=>["p1", "p2"]}
    # or
    # => {:facets => {'technology'=>["t1", "t2"], 'company'=>"c1", "person"=>["p1", "p2"]} }
    #
    # An optional :symbols entry of the same shape is written to "<name>_s"
    # fields; facets go to "<name>_facet" fields.
    #
    # Note that values for individual fields can be a single string or an array of strings.
    # Values of any other class are ignored. Returns solr_doc.
    def self.solrize( input_hash, solr_doc=Solr::Document.new )
      facets = input_hash.has_key?(:facets) ? input_hash[:facets] : input_hash
      append_suffixed_fields( solr_doc, facets, "facet" )

      if input_hash.has_key?(:symbols)
        append_suffixed_fields( solr_doc, input_hash[:symbols], "s" )
      end

      return solr_doc
    end

    # Adds a "<name>_<suffix>" field to solr_doc for every String value, and
    # for every member of every Array value, in values_by_name.
    def self.append_suffixed_fields( solr_doc, values_by_name, suffix )
      values_by_name.each_pair do |name, value|
        values = case value
                 when String then [value]
                 when Array  then value
                 else []
                 end
        values.each { |v| solr_doc << Solr::Field.new( :"#{name}_#{suffix}" => "#{v}" ) }
      end
    end
    private_class_method :append_suffixed_fields

    # Returns part as a zero-padded two character string when it lies in
    # 1..max, and the integer 99 when it is nil or out of range.
    def date_part_facet( part, max )
      return 99 if part.nil? || part < 1 || part > max
      part.to_s.rjust(2, '0')
    end

    private :connect, :create_document, :date_part_facet

    # Returns true when class_name names a constant that is a Class, and
    # false when the constant is missing, invalid or not a Class.
    def class_exists?(class_name)
      klass = Module.const_get(class_name)
      return klass.is_a?(Class)
    rescue NameError
      return false
    end

  end
end