solrizer-fedora 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,22 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+ /.bundle
21
+
22
+ ## PROJECT::SPECIFIC
data/Gemfile ADDED
@@ -0,0 +1,10 @@
1
+ source "http://rubygems.org"
2
+ gem 'active-fedora', '1.2.4'
3
+ gem 'solrizer', '>=0.3.0'
4
+
5
+ group :development, :test do
6
+ gem 'ruby-debug'
7
+ gem 'ruby-debug-base'
8
+ gem 'rspec', '<2.0.0'
9
+ gem 'mocha'
10
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,44 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ active-fedora (1.2.4)
5
+ mime-types (>= 1.16)
6
+ multipart-post
7
+ nokogiri
8
+ om (>= 1.0)
9
+ solr-ruby (>= 0.0.6)
10
+ xml-simple (>= 1.0.12)
11
+ columnize (0.3.1)
12
+ facets (2.9.0)
13
+ linecache (0.43)
14
+ mime-types (1.16)
15
+ mocha (0.9.9)
16
+ rake
17
+ multipart-post (1.0.1)
18
+ nokogiri (1.4.3.1)
19
+ om (1.0.0)
20
+ facets
21
+ nokogiri (>= 1.4.2)
22
+ rake (0.8.7)
23
+ rspec (1.3.1)
24
+ ruby-debug (0.10.3)
25
+ columnize (>= 0.1)
26
+ ruby-debug-base (~> 0.10.3.0)
27
+ ruby-debug-base (0.10.3)
28
+ linecache (>= 0.3)
29
+ solr-ruby (0.0.8)
30
+ solrizer (0.3.0)
31
+ active-fedora (>= 1.1.5)
32
+ om (>= 1.0.0)
33
+ xml-simple (1.0.12)
34
+
35
+ PLATFORMS
36
+ ruby
37
+
38
+ DEPENDENCIES
39
+ active-fedora (= 1.2.4)
40
+ mocha
41
+ rspec (< 2.0.0)
42
+ ruby-debug
43
+ ruby-debug-base
44
+ solrizer (>= 0.3.0)
data/History.textile ADDED
@@ -0,0 +1,3 @@
1
+ h2. 0.1.0
2
+
3
+ Initial Release -- pretty much a direct replica of all of the fedora-related stuff in solrizer versions older than 0.3.0
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Matt Zumwalt
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.textile ADDED
@@ -0,0 +1,34 @@
1
+ h1. solrizer-fedora
2
+
3
+ An extension to projecthydra/solrizer that provides utilities for loading objects from Fedora Repositories and creating solr documents from them.
4
+
5
+ h2. Usage
6
+
7
+ <pre>gem install solrizer-fedora</pre>
8
+
9
+ You must tell the app where to find fedora and solr. Put that information into config/fedora.yml and config/solr.yml
10
+
11
+ Then...
12
+
13
+ <pre>
14
+ irb
15
+ require "rubygems"
16
+ require "solrizer-fedora"
17
+ solrizer = Solrizer::Fedora::Solrizer.new
18
+ solrizer.solrize("demo:5")
19
+ </pre>
20
+
21
+
22
+ h2. Note on Patches/Pull Requests
23
+
24
+ * Fork the project.
25
+ * Make your feature addition or bug fix.
26
+ * Add tests for it. This is important so I don't break it in a
27
+ future version unintentionally.
28
+ * Commit, do not mess with rakefile, version, or history.
29
+ (if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
30
+ * Send me a pull request. Bonus points for topic branches.
31
+
32
+ h2. Copyright
33
+
34
+ Copyright (c) 2010 Matt Zumwalt and MediaShelf. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,57 @@
1
# Rakefile for the solrizer-fedora gem: packaging (jeweler), specs, coverage and rdoc tasks.
require 'rubygems'
require 'rake'

begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "solrizer-fedora"
    gem.summary = %Q{An extension to solrizer that deals with Fedora objects & Repositories}
    gem.description = %Q{An extension to projecthydra/solrizer that provides utilities for loading objects from Fedora Repositories and creating solr documents from them.}
    gem.email = "matt.zumwalt@yourmediashelf.com"
    gem.homepage = "http://github.com/projecthydra/solrizer-fedora"
    gem.authors = ["Matt Zumwalt"]
    # The library requires "solrizer" and "active-fedora" at load time, so they
    # must be declared as runtime dependencies or a bare `gem install` is unusable.
    # NOTE(review): version floors mirror the Gemfile -- confirm before release.
    gem.add_dependency "solrizer", ">= 0.3.0"
    gem.add_dependency "active-fedora", ">= 1.2.4"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

require 'spec/rake/spectask'
Spec::Rake::SpecTask.new(:spec) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.spec_files = FileList['spec/**/*_spec.rb']
end

Spec::Rake::SpecTask.new(:rcov) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

# :check_dependencies is defined by jeweler; only depend on it when jeweler
# actually loaded, otherwise `rake spec` fails with an unknown-task error.
task :spec => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

task :default => :spec

require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # Strip the trailing newline of the VERSION file so the title stays on one line.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "solrizer-fedora #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end

# NOTE(review): this re-opens the :rcov task defined above and points it at
# test/**/test_*.rb, which looks like leftover generator boilerplate -- confirm
# whether it should be removed in favour of the spec-based :rcov task.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
data/config/fedora.yml ADDED
@@ -0,0 +1,16 @@
1
+ development:
2
+ fedora:
3
+ url: http://fedoraAdmin:fedoraAdmin@127.0.0.1:8983/fedora
4
+ solr:
5
+ url: http://127.0.0.1:8983/solr/development
6
+ test:
7
+ fedora:
8
+ url: http://fedoraAdmin:fedoraAdmin@127.0.0.1:8983/fedora
9
+ solr:
10
+ url: http://127.0.0.1:8983/solr/test
11
+ production:
12
+ fedora:
13
+ url: http://fedoraAdmin:fedoraAdmin@127.0.0.1:8080/fedora
14
+ solr:
15
+ url: http://127.0.0.1:8080/solr
16
+
@@ -0,0 +1,4 @@
1
+ info:fedora/afmodel:SaltDocument : salt_document
2
+ info:fedora/afmodel:JP2Document : jp2_document
3
+ info:fedora/afmodel:ModsDocument : mods_document
4
+ info:fedora/afmodel:DCDocument : dc_document
data/config/solr.yml ADDED
@@ -0,0 +1,7 @@
1
+ development:
2
+ url: http://localhost:8983/solr/development
3
+ test: &TEST
4
+ url: http://localhost:8983/solr/test
5
+ production:
6
+ url: http://localhost:8080/solr/production
7
+
@@ -0,0 +1 @@
1
+ require "solrizer/fedora"
@@ -0,0 +1,7 @@
1
+ require "rubygems"
2
+ require "solrizer"
3
# Namespace for the Fedora-specific extensions to Solrizer.
module Solrizer::Fedora
end

# Load every file in the fedora/ directory next to this file. Dir[] gives no
# ordering guarantee, so sort the paths to make the load order deterministic
# across platforms.
Dir[File.join(File.dirname(__FILE__), "fedora", "*.rb")].sort.each { |file| require file }

# Mix the Fedora-specific extraction methods into Solrizer's extractor.
# (send is used because Module#include is private on older Rubies.)
Solrizer::Extractor.send(:include, Solrizer::Fedora::Extractor)
@@ -0,0 +1,34 @@
1
+ require 'solr'
2
+ require 'rexml/document'
3
+ require "nokogiri"
4
+ require 'yaml'
5
+
6
# Extraction methods that are mixed into Solrizer::Extractor.
module Solrizer::Fedora::Extractor

  #
  # Extracts content-model and hydra-type from RELS-EXT datastream
  #
  # @param text the RELS-EXT RDF/XML, passed straight to Nokogiri::XML
  # @param solr_doc the document to append fields to (a new Solr::Document by default)
  # @return the same solr_doc, with one :cmodel_t field per hasModel element and a
  #   :hydra_type_t field for every content model listed in config/hydra_types.yml
  #
  def extract_rels_ext( text, solr_doc=Solr::Document.new )
    # TODO: only read in this file once

    if defined?(RAILS_ROOT)
      config_path = File.join(RAILS_ROOT, "config")
    else
      config_path = File.join(File.dirname(__FILE__), "..", "..", "..", "config")
    end
    # load_file closes the file handle (the previous File.open never did);
    # an empty YAML file loads as a falsy value, so fall back to an empty mapping.
    map = YAML.load_file(File.join(config_path, "hydra_types.yml")) || {}

    doc = Nokogiri::XML(text)
    doc.xpath( '//foo:hasModel', 'foo' => 'info:fedora/fedora-system:def/model#' ).each do |element|
      cmodel = element.attributes['resource'].to_s
      solr_doc << Solr::Field.new( :cmodel_t => cmodel )

      if map.has_key?(cmodel)
        solr_doc << Solr::Field.new( :hydra_type_t => map[cmodel] )
      end
    end

    return solr_doc
  end

end
@@ -0,0 +1,213 @@
1
+ require 'solr'
2
+ require 'solrizer/extractor'
3
+ require 'solrizer/fedora/repository'
4
+
5
+ module Solrizer::Fedora
6
+ class Indexer
7
+ #
8
+ # Class variables
9
+ #
10
+ @@unique_id = 0
11
+
12
+ def self.unique_id
13
+ @@unique_id
14
+ end
15
+
16
+ #
17
+ # Member variables
18
+ #
19
+ attr_accessor :connection, :extractor, :index_full_text
20
+
21
+ #
22
+ # This method performs initialization tasks
23
+ #
24
# Sets up the extractor, records the full-text flag and opens the Solr connection.
#
# @param opts [Hash] :index_full_text may be true or the string "true" to
#   enable full-text indexing; any other value disables it
def initialize( opts={} )
  @@index_list = false unless defined?(@@index_list)
  @extractor = ::Solrizer::Extractor.new

  full_text_flag = opts[:index_full_text]
  @index_full_text = (full_text_flag == true || full_text_flag == "true")

  connect
end
36
+
37
+ #
38
+ # This method connects to the Solr instance
39
+ #
40
# Initializes ActiveFedora if it has no configuration yet, resolves the Solr
# configuration and stores a Solr::Connection in @connection.
#
# The Solr configuration comes from Blacklight when Blacklight is defined;
# otherwise from config/solr.yml, looked up under RAILS_ROOT for RAILS_ENV when
# RAILS_ROOT is defined, or relative to this file for ENV["environment"]
# (default "development").
#
# @raise [RuntimeError] when solr.yml has no entry for the selected environment
def connect
  ActiveFedora.init if ActiveFedora.fedora_config.empty?

  if defined?(Blacklight)
    solr_config = Blacklight.solr_config
  else
    if defined?(RAILS_ROOT)
      config_path = File.join(RAILS_ROOT, "config")
      environment = RAILS_ENV
    else
      config_path = File.join(File.dirname(__FILE__), "..", "..", "..", "config")
      environment = ENV["environment"] || "development"
    end

    # load_file closes the handle; the previous File.open calls never did.
    solr_yml = File.join(config_path, "solr.yml")
    solr_config = YAML.load_file(solr_yml)[environment]
    # Fail with a readable message instead of a NoMethodError on nil further down.
    raise "No Solr configuration found for environment #{environment.inspect} in #{solr_yml}" if solr_config.nil?
    puts solr_config.inspect
  end

  if index_full_text == true
    url = solr_config['fulltext']['url']
  elsif solr_config.has_key?("default")
    url = solr_config['default']['url']
  else
    url = solr_config['url']
  end
  @connection = Solr::Connection.new(url, :autocommit => :on )
end
81
+
82
+ #
83
+ # This method extracts the facet categories from the given Fedora object's external tag datastream
84
+ #
85
# Fetches the named datastream of obj through Repository.get_datastream and
# hands its content to the extractor's xml_to_solr.
#
# @return whatever extractor.xml_to_solr returns
def extract_xml_to_solr( obj, ds_name, solr_doc=Solr::Document.new )
  extractor.xml_to_solr( Repository.get_datastream( obj, ds_name ).content, solr_doc )
end
89
+
90
+ #
91
+ #
92
+ #
93
# Fetches the named (RELS-EXT) datastream of obj through
# Repository.get_datastream and hands its content to the extractor's
# extract_rels_ext.
#
# @return whatever extractor.extract_rels_ext returns
def extract_rels_ext( obj, ds_name, solr_doc=Solr::Document.new )
  extractor.extract_rels_ext( Repository.get_datastream( obj, ds_name ).content, solr_doc )
end
97
+
98
+ #
99
+ # This method generates the month and day facets from the date_t in solr_doc
100
+ #
101
+
102
# Adds :month_facet and :day_facet fields derived from solr_doc[:date_t].
#
# When the document has no date_t, a placeholder date_t of "9999-99-99" is
# added first. The date is parsed with Date._parse; a month in 1..12 or a day
# in 1..31 becomes a zero-padded two-character string, anything else
# (including a missing component) becomes 99.
#
# @param solr_doc the document to read date_t from and append the facets to
# @return the same solr_doc
def generate_dates(solr_doc)
  # easy-to-find placeholder when no date is present
  solr_doc << Solr::Field.new( :date_t => "9999-99-99") if solr_doc[:date_t].nil?

  parsed = Date._parse(solr_doc[:date_t])
  month = parsed[:mon]
  day = parsed[:mday]

  month_value = (!month.nil? && 0 < month && month < 13) ? month.to_s.rjust(2, '0') : 99
  solr_doc << Solr::Field.new( :month_facet => month_value )

  day_value = (!day.nil? && 0 < day && day < 32) ? day.to_s.rjust(2, '0') : 99
  solr_doc << Solr::Field.new( :day_facet => day_value )

  solr_doc
end
135
+
136
+
137
+ #
138
+ # This method creates a Solr-formatted XML document
139
+ #
140
# Builds the Solr document for obj.
#
# obj.to_solr supplies the initial fields. Every other model class reported by
# ActiveFedora::ContentModel.known_models_for (except ActiveFedora::Base and
# obj's own class) is then loaded for obj.pid and asked for its fields with
# :model_only=>true. Finally :id_t is appended, :id is appended unless already
# present, and the class-wide @@unique_id counter is incremented.
#
# @param obj an object responding to to_solr and pid
# @return the populated solr document
def create_document( obj )
  solr_doc = Solr::Document.new

  other_models = ActiveFedora::ContentModel.known_models_for( obj )
  other_models.delete(ActiveFedora::Base)

  # to_solr is called the same way whether obj is a plain ActiveFedora::Base
  # or a model instance; only the bookkeeping and the log line differ.
  solr_doc = obj.to_solr(solr_doc)
  if obj.class == ActiveFedora::Base
    puts " added base fields from #{obj.class.to_s}"
  else
    other_models.delete(obj.class)
    puts " added base fields from #{obj.class.to_s} and model fields from #{obj.class.to_s}"
  end

  # :model_only=>true is passed so each extra model does not add the
  # ActiveFedora::Base metadata again.
  other_models.each do |model_class|
    solr_doc = model_class.load_instance(obj.pid).to_solr(solr_doc, :model_only=>true)
    puts " added solr fields from #{model_class.to_s}"
  end

  solr_doc << Solr::Field.new( :id_t => "#{obj.pid}" )
  solr_doc << Solr::Field.new( :id => "#{obj.pid}" ) unless solr_doc[:id]

  @@unique_id += 1

  solr_doc
end
174
+
175
+ #
176
+ # This method adds a document to the Solr search index
177
+ #
178
# Builds the Solr document for obj with create_document and adds it to the
# index through the Solr connection. Errors are not rescued here.
#
# @return whatever connection.add returns
def index( obj )
  connection.add( create_document( obj ) )
end
194
+
195
+ #
196
+ # This method queries the Solr search index and returns a response
197
+ #
198
# Runs query_str against the Solr connection held in the connection accessor.
# (The previous body called an undefined `conn`, so every call raised NameError.)
#
# @param query_str the query passed to connection.query
# @return whatever connection.query returns
def query( query_str )
  connection.query( query_str )
end
201
+
202
+
203
+ private :connect, :create_document
204
+
205
# Reports whether class_name resolves, via Module.const_get, to a Class.
#
# @return [Boolean] false when the constant is missing, is not a valid
#   constant name, or is not a Class (for example a module)
def class_exists?(class_name)
  Module.const_get(class_name).is_a?(Class)
rescue NameError
  false
end
211
+
212
+ end
213
+ end
@@ -0,0 +1,44 @@
1
+ require 'active-fedora'
2
+
3
+ module Solrizer::Fedora
4
+ class Repository
5
+
6
+ #
7
+ # This method retrieves a comprehensive list of unique ids in the fedora repository
8
+ #
9
# Queries Solr (through ActiveFedora::SolrService) for documents matching
# "active_fedora_model_field:Document", asking for at most num_docs rows.
#
# @param num_docs the :rows value sent with the query
# @return [Array] the SOLR_DOCUMENT_ID value of every hit
def self.get_pids( num_docs )
  solr = ActiveFedora::SolrService.instance.conn
  results = solr.query( "active_fedora_model_field:Document", { :rows => num_docs } )
  results.hits.map { |hit| hit[SOLR_DOCUMENT_ID] }
end
17
+
18
+ #
19
+ # This method retrieves the object associated with the given unique id
20
+ #
21
# Loads the object with the given pid through ActiveFedora::Base.load_instance.
#
# @return whatever load_instance returns for pid
def self.get_object( pid )
  ActiveFedora::Base.load_instance( pid )
end
24
+
25
+ #
26
+ # This method retrieves a comprehensive list of datastreams for the given object
27
+ #
28
# Lists the keys of obj.datastreams.
#
# @return the result of obj.datastreams.keys
def self.get_datastreams( obj )
  obj.datastreams.keys
end
31
+
32
+ #
33
+ # This method retrieves the datastream for the given object with the given datastream name
34
+ #
35
# Looks up ds_name in obj.datastreams.
#
# @return the datastream stored under ds_name, or nil when the lookup raises
#   a StandardError
def self.get_datastream( obj, ds_name )
  obj.datastreams[ ds_name ]
rescue
  nil
end
42
+
43
+ end
44
+ end
@@ -0,0 +1,118 @@
1
+ require 'solrizer/field_mapper.rb'
2
+ require 'solrizer/field_name_mapper'
3
+
4
+ require 'solrizer/fedora/indexer'
5
+ require 'solrizer/xml'
6
+ require 'solrizer/html'
7
+
8
+ # Let people explicitly require xml support if they want it ...
9
+ # require 'solrizer/xml.rb'
10
+
11
+ # require 'fastercsv'
12
+ require "ruby-debug"
13
+
14
+
15
+ module Solrizer::Fedora
16
+ class Solrizer
17
+
18
+ attr_accessor :indexer, :index_full_text
19
+
20
+ #
21
+ # This method initializes the indexer
22
+ # If passed an argument of :index_full_text=>true, it will perform full-text indexing instead of indexing fields only.
23
+ #
24
# Records the full-text flag and creates the Indexer with the same setting.
#
# @param opts [Hash] :index_full_text may be true or the string "true" to
#   request full-text indexing; any other value means fields only
def initialize( opts={} )
  @@index_list = false unless defined?(@@index_list)

  full_text_flag = opts[:index_full_text]
  @index_full_text = (full_text_flag == true || full_text_flag == "true")

  @indexer = Indexer.new( :index_full_text=>@index_full_text )
end
33
+
34
+ #
35
+ # This method solrizes the given Fedora object's full-text and facets into the search index
36
+ #
37
# Indexes one Fedora object and prints timing information to stdout.
#
# @param obj an ActiveFedora::Base (used as is), a Fedora::FedoraObject
#   (re-loaded by its pid through Repository.get_object) or a String pid
#   (loaded through Repository.get_object)
#
# Any StandardError raised while loading or indexing is reported on stdout
# and swallowed, so one bad object does not abort a bulk run. Interrupt,
# SystemExit and other non-StandardError exceptions now propagate; the
# previous `rescue Exception` made a bulk run impossible to stop with Ctrl-C.
def solrize( obj )
  begin
    start = Time.now
    print "Retrieving object #{obj} ..."

    case obj
    when ActiveFedora::Base
      # already a loaded object; nothing to do
    when Fedora::FedoraObject
      obj = Repository.get_object( obj.pid )
    when String
      obj = Repository.get_object( obj )
    else
      # the message now names the class that is actually checked above
      raise "you must pass either a ActiveFedora::Base, Fedora::FedoraObject, or a String. You submitted a #{obj.class}"
    end

    obj_done = Time.now
    obj_done_elapse = obj_done - start
    puts " completed. Duration: #{obj_done_elapse}"

    print "\t Indexing object #{obj.pid} ... "
    # add the keywords and facets to the search index
    index_start = Time.now
    indexer.index( obj )

    index_done = Time.now
    index_elapsed = index_done - index_start

    puts "completed. Duration: #{index_elapsed} ."
  rescue StandardError => e
    p "unable to index #{obj}. Failed with #{e.inspect}"
  end
end
80
+
81
+ #
82
+ # This method retrieves a comprehensive list of all the unique identifiers in Fedora and
83
+ # solrizes each object's full-text and facets into the search index
84
# Solrizes many objects in one run.
#
# When @@index_list is false, asks ::Fedora::Repository for up to num_docs
# objects and solrizes each one. Otherwise @@index_list is treated as the path
# of a header-less CSV file whose first column holds the pids to solrize.
def solrize_objects
  # retrieve a list of all the pids in the fedora repository
  num_docs = 1000000 # modify this number to guarantee that all the objects are retrieved from the repository
  puts "WARNING: You have turned off indexing of Full Text content. Be sure to re-run indexer with @@index_full_text set to true in main.rb" if index_full_text == false

  if @@index_list == false
    objects = ::Fedora::Repository.instance.find_objects(:limit=>num_docs)

    puts "Shelving #{objects.length} Fedora objects"
    objects.each do |object|
      solrize( object )
    end
  elsif File.exist?(@@index_list) # File.exists? is no longer available on current Rubies
    # The FasterCSV require is commented out at the top of the file, so the
    # constant was undefined here. Use FasterCSV when the host application has
    # loaded it, otherwise the standard library CSV.
    csv_reader = if defined?(::FasterCSV)
                   ::FasterCSV
                 else
                   require 'csv'
                   ::CSV
                 end
    arr_of_pids = csv_reader.read(@@index_list, :headers=>false)

    puts "Indexing from list at #{@@index_list}"
    puts "Shelving #{arr_of_pids.length} Fedora objects"

    arr_of_pids.each do |row|
      solrize( row[0] )
    end
  else
    puts "#{@@index_list} does not exists!"
  end
end
116
+
117
+ end #class
118
+ end #module
@@ -0,0 +1,35 @@
1
# Rake tasks for indexing Fedora objects into Solr.
#
#   rake solrizer:fedora:solrize PID=sample:pid [FULL_TEXT=true]
#   rake solrizer:fedora:solrize_objects [INDEX_LIST=/path/to/pids.csv] [FULL_TEXT=true]
namespace :solrizer do

  # `do` was missing here, which made the whole file a syntax error.
  namespace :fedora do
    desc 'Index a fedora object of the given pid.'
    task :solrize => :environment do
      index_full_text = ENV['FULL_TEXT'] == 'true'
      if ENV['PID']
        puts "indexing #{ENV['PID'].inspect}"
        solrizer = Solrizer::Fedora::Solrizer.new :index_full_text=> index_full_text
        solrizer.solrize(ENV['PID'])
        puts "Finished shelving #{ENV['PID']}"
      else
        # the message now names the task as it is actually defined
        puts "You must provide a pid using the format 'solrizer:fedora:solrize PID=sample:pid'."
      end
    end

    desc 'Index all objects in the repository.'
    task :solrize_objects => :environment do
      index_full_text = ENV['FULL_TEXT'] == 'true'
      if ENV['INDEX_LIST']
        # A top-level @@index_list is a class variable on Object, which is how
        # Solrizer::Fedora::Solrizer sees it; set it explicitly because current
        # Rubies reject class-variable access from the top level.
        Object.send(:class_variable_set, :@@index_list, ENV['INDEX_LIST'])
      end

      puts "Re-indexing Fedora Repository."
      puts "Fedora URL: #{ActiveFedora.fedora_config[:url]}"
      puts "Fedora Solr URL: #{ActiveFedora.solr_config[:url]}"
      # The indexer treats Blacklight as optional, so do not require it here either.
      puts "Blacklight Solr Config: #{Blacklight.solr_config.inspect}" if defined?(Blacklight)
      puts "Doing full text index." if index_full_text
      solrizer = Solrizer::Fedora::Solrizer.new :index_full_text=> index_full_text
      solrizer.solrize_objects
      puts "Solrizer task complete."
    end
  end

end
@@ -0,0 +1,72 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{solrizer-fedora}
8
+ s.version = "0.1.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Matt Zumwalt"]
12
+ s.date = %q{2010-10-26}
13
+ s.description = %q{An extension to projecthydra/solrizer that provides utilities for loading objects from Fedora Repositories and creating solr documents from them.}
14
+ s.email = %q{matt.zumwalt@yourmediashelf.com}
15
+ s.extra_rdoc_files = [
16
+ "LICENSE",
17
+ "README.textile"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ ".gitignore",
22
+ "Gemfile",
23
+ "Gemfile.lock",
24
+ "History.textile",
25
+ "LICENSE",
26
+ "README.textile",
27
+ "Rakefile",
28
+ "VERSION",
29
+ "config/fedora.yml",
30
+ "config/hydra_types.yml",
31
+ "config/solr.yml",
32
+ "lib/solrizer-fedora.rb",
33
+ "lib/solrizer/fedora.rb",
34
+ "lib/solrizer/fedora/extractor.rb",
35
+ "lib/solrizer/fedora/indexer.rb",
36
+ "lib/solrizer/fedora/repository.rb",
37
+ "lib/solrizer/fedora/solrizer.rb",
38
+ "lib/tasks/solrizer-fedora.rake",
39
+ "solrizer-fedora.gemspec",
40
+ "spec/fixtures/rels_ext_cmodel.xml",
41
+ "spec/integration/fedora_indexer_spec.rb",
42
+ "spec/rcov.opts",
43
+ "spec/spec.opts",
44
+ "spec/spec_helper.rb",
45
+ "spec/units/fedora_extractor_spec.rb",
46
+ "spec/units/fedora_indexer_spec.rb",
47
+ "spec/units/fedora_solrizer_spec.rb"
48
+ ]
49
+ s.homepage = %q{http://github.com/projecthydra/solrizer-fedora}
50
+ s.rdoc_options = ["--charset=UTF-8"]
51
+ s.require_paths = ["lib"]
52
+ s.rubygems_version = %q{1.3.7}
53
+ s.summary = %q{An extension to solrizer that deals with Fedora objects & Repositories}
54
+ s.test_files = [
55
+ "spec/integration/fedora_indexer_spec.rb",
56
+ "spec/spec_helper.rb",
57
+ "spec/units/fedora_extractor_spec.rb",
58
+ "spec/units/fedora_indexer_spec.rb",
59
+ "spec/units/fedora_solrizer_spec.rb"
60
+ ]
61
+
62
+ if s.respond_to? :specification_version then
63
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
64
+ s.specification_version = 3
65
+
66
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
67
+ else
68
+ end
69
+ else
70
+ end
71
+ end
72
+
@@ -0,0 +1,8 @@
1
+ <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
2
+ <rdf:Description rdf:about="info:fedora/demo:multipurpose-objects-model_and_sdef">
3
+ <hasModel xmlns="info:fedora/fedora-system:def/model#" rdf:resource="info:fedora/fedora-system:ContentModel-3.0"/>
4
+ <hasModel xmlns="info:fedora/fedora-system:def/model#" rdf:resource="info:fedora/afmodel:SaltDocument"/>
5
+ <hasModel xmlns="info:fedora/fedora-system:def/model#" rdf:resource="info:fedora/afmodel:JP2Document"/>
6
+ <hasModel xmlns="info:fedora/fedora-system:def/model#" rdf:resource="info:fedora/afmodel:DCDocument"/>
7
+ </rdf:Description>
8
+ </rdf:RDF>
@@ -0,0 +1,18 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
+ require 'solrizer'
3
+
4
# Integration spec: indexes a real object through a live Fedora and Solr.
describe Solrizer::Fedora::Indexer do

  before(:each) do
    @indexer = Solrizer::Fedora::Indexer.new
  end

  describe "index" do
    it "should update solr with the metadata from the given object" do
      pending "Got to decide if/how to handle fixtures in this gem. Probably should just mock out Fedora & Solr entirely."
      # Repository is defined under Solrizer::Fedora; Solrizer::Repository does not exist in this gem.
      obj = Solrizer::Fedora::Repository.get_object( "druid:sb733gr4073" )
      @indexer.index( obj )
    end
  end

end
data/spec/rcov.opts ADDED
@@ -0,0 +1,2 @@
1
+ --exclude "spec/*,gems/*"
2
+ --rails
data/spec/spec.opts ADDED
@@ -0,0 +1 @@
1
+ --color
@@ -0,0 +1,18 @@
1
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
2
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
3
+ require 'solrizer/fedora'
4
+ require 'spec'
5
+ require 'spec/autorun'
6
+
7
+ require 'solrizer'
8
+
9
+ Spec::Runner.configure do |config|
10
+
11
+ config.mock_with :mocha
12
+
13
+
14
+ def fixture(file)
15
+ File.new(File.join(File.dirname(__FILE__), 'fixtures', file))
16
+ end
17
+
18
+ end
@@ -0,0 +1,31 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
+ require 'solrizer'
3
+
4
+ describe Solrizer::Fedora::Extractor do
5
+
6
+ before(:all) do
7
+ @extractor = Solrizer::Extractor.new
8
+ end
9
+
10
+ describe "extract_rels_ext" do
11
+ it "should extract the content model of the RELS-EXT datastream of a Fedora object and set hydra_type using hydra_types mapping" do
12
+ rels_ext = fixture("rels_ext_cmodel.xml")
13
+ result = @extractor.extract_rels_ext( rels_ext )
14
+ result[:cmodel_t].should == "info:fedora/fedora-system:ContentModel-3.0"
15
+ result[:hydra_type_t].should == "salt_document"
16
+
17
+ # ... and a hacky way of making sure that it added a field for each of the hasModel values
18
+ result.inspect.include?('@value="info:fedora/afmodel:SaltDocument"').should be_true
19
+ result.inspect.include?('@value="jp2_document"').should be_true
20
+ end
21
+ end
22
+
23
+ describe "extract_hydra_types" do
24
+ it "should extract the hydra_type of a Fedora object" do
25
+ rels_ext = fixture("rels_ext_cmodel.xml")
26
+ result = @extractor.extract_rels_ext( rels_ext )
27
+ result[:hydra_type_t].should == "salt_document"
28
+ end
29
+ end
30
+
31
+ end
@@ -0,0 +1,78 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
+ require 'solrizer'
3
+ require "solrizer/fedora"
4
+
5
+ describe Solrizer::Fedora::Indexer do
6
+
7
+ before(:each) do
8
+ Solrizer::Fedora::Indexer.any_instance.stubs(:connect).returns("foo")
9
+
10
+ @extractor = mock("Extractor")
11
+ @extractor.stubs(:html_content_to_solr).returns(@solr_doc)
12
+ # @solr_doc = mock('solr_doc')
13
+ # @solr_doc.stubs(:<<)
14
+ # @solr_doc.stubs(:[])
15
+
16
+ @solr_doc = Solr::Document.new
17
+
18
+ Solrizer::Extractor.expects(:new).returns(@extractor)
19
+ @indexer = Solrizer::Fedora::Indexer.new
20
+
21
+ end
22
+
23
+ describe "#generate_dates" do
24
+ it "should still give 9999-99-99 date if the solr document does not have a date_t field" do
25
+
26
+ solr_result = @indexer.generate_dates(@solr_doc)
27
+ solr_result.should be_kind_of Solr::Document
28
+ solr_result[:date_t].should == "9999-99-99"
29
+ solr_result[:month_facet].should == "99"
30
+ solr_result[:day_facet].should == '99'
31
+
32
+ end
33
+
34
+ it "should still give 9999-99-99 date if the solr_doc[:date_t] is not valid date in YYYY-MM-DD format " do
35
+
36
+ @solr_doc << Solr::Field.new(:date_t => "Unknown")
37
+ solr_result = @indexer.generate_dates(@solr_doc)
38
+ solr_result.should be_kind_of Solr::Document
39
+ solr_result[:date_t].should == "Unknown"
40
+ solr_result[:month_facet].should == "99"
41
+ solr_result[:day_facet].should == '99'
42
+
43
+ end
44
+
45
+ it "should give month and dates even if the :date_t is not a valid date but is in YYYY-MM-DD format " do
46
+
47
+ @solr_doc << Solr::Field.new(:date_t => "0000-13-11")
48
+ solr_result = @indexer.generate_dates(@solr_doc)
49
+ solr_result.should be_kind_of Solr::Document
50
+ solr_result[:date_t].should == "0000-13-11"
51
+ solr_result[:month_facet].should == "99"
52
+ solr_result[:day_facet].should == '11'
53
+ end
54
+
55
+ it "should give month and day when in a valid date format" do
56
+ @solr_doc << Solr::Field.new(:date_t => "1978-04-11")
57
+ solr_result = @indexer.generate_dates(@solr_doc)
58
+ solr_result.should be_kind_of Solr::Document
59
+ solr_result[:date_t].should == "1978-04-11"
60
+ solr_result[:month_facet].should == "04"
61
+ solr_result[:day_facet].should == '11'
62
+
63
+ end
64
+
65
+ it "should still give two digit strings even if the month/day is single digit" do
66
+
67
+ @solr_doc << Solr::Field.new(:date_t => "1978-4-1")
68
+ solr_result = @indexer.generate_dates(@solr_doc)
69
+ solr_result.should be_kind_of Solr::Document
70
+ solr_result[:date_t].should == "1978-4-1"
71
+ solr_result[:month_facet].should == "04"
72
+ solr_result[:day_facet].should == '01'
73
+
74
+ end
75
+
76
+ end
77
+
78
+ end
@@ -0,0 +1,42 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
+
3
+ describe Solrizer::Fedora::Solrizer do
4
+
5
+ before(:each) do
6
+ @solrizer = Solrizer::Fedora::Solrizer.new
7
+ end
8
+
9
+ describe "solrize" do
10
+ it "should trigger the indexer for the provided object" do
11
+ sample_obj = ActiveFedora::Base.new
12
+ @solrizer.indexer.expects(:index).with( sample_obj )
13
+ @solrizer.solrize( sample_obj )
14
+ end
15
+ it "should work with Fedora::FedoraObject objects" do
16
+ mock_object = Fedora::FedoraObject.new(:pid=>"my:pid", :label=>"my label")
17
+ ActiveFedora::Base.expects(:load_instance).with( mock_object.pid ).returns(mock_object)
18
+ @solrizer.indexer.expects(:index).with( mock_object )
19
+ @solrizer.solrize( mock_object )
20
+ end
21
+ it "should load the object if only a pid is provided" do
22
+ mock_object = mock("my object")
23
+ mock_object.stubs(:pid)
24
+ mock_object.stubs(:label)
25
+ mock_object.stubs(:datastreams).returns({'descMetadata'=>"foo","location"=>"bar"})
26
+
27
+ ActiveFedora::Base.expects(:load_instance).with( "_PID_" ).returns(mock_object)
28
+ @solrizer.indexer.expects(:index).with(mock_object)
29
+ @solrizer.solrize("_PID_")
30
+ end
31
+
32
+ end
33
+
34
+ describe "solrize_objects" do
35
+ it "should call solrize for each object returned by Fedora::Repository.find_objects" do
36
+ objects = [["pid1"], ["pid2"], ["pid3"]]
37
+ Fedora::Repository.any_instance.expects(:find_objects).returns(objects)
38
+ objects.each {|object| @solrizer.expects(:solrize).with( object ) }
39
+ @solrizer.solrize_objects
40
+ end
41
+ end
42
+ end
metadata ADDED
@@ -0,0 +1,99 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: solrizer-fedora
3
+ version: !ruby/object:Gem::Version
4
+ hash: 27
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 0
10
+ version: 0.1.0
11
+ platform: ruby
12
+ authors:
13
+ - Matt Zumwalt
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-10-26 00:00:00 -05:00
19
+ default_executable:
20
+ dependencies: []
21
+
22
+ description: An extension to projecthydra/solrizer that provides utilities for loading objects from Fedora Repositories and creating solr documents from them.
23
+ email: matt.zumwalt@yourmediashelf.com
24
+ executables: []
25
+
26
+ extensions: []
27
+
28
+ extra_rdoc_files:
29
+ - LICENSE
30
+ - README.textile
31
+ files:
32
+ - .document
33
+ - .gitignore
34
+ - Gemfile
35
+ - Gemfile.lock
36
+ - History.textile
37
+ - LICENSE
38
+ - README.textile
39
+ - Rakefile
40
+ - VERSION
41
+ - config/fedora.yml
42
+ - config/hydra_types.yml
43
+ - config/solr.yml
44
+ - lib/solrizer-fedora.rb
45
+ - lib/solrizer/fedora.rb
46
+ - lib/solrizer/fedora/extractor.rb
47
+ - lib/solrizer/fedora/indexer.rb
48
+ - lib/solrizer/fedora/repository.rb
49
+ - lib/solrizer/fedora/solrizer.rb
50
+ - lib/tasks/solrizer-fedora.rake
51
+ - solrizer-fedora.gemspec
52
+ - spec/fixtures/rels_ext_cmodel.xml
53
+ - spec/integration/fedora_indexer_spec.rb
54
+ - spec/rcov.opts
55
+ - spec/spec.opts
56
+ - spec/spec_helper.rb
57
+ - spec/units/fedora_extractor_spec.rb
58
+ - spec/units/fedora_indexer_spec.rb
59
+ - spec/units/fedora_solrizer_spec.rb
60
+ has_rdoc: true
61
+ homepage: http://github.com/projecthydra/solrizer-fedora
62
+ licenses: []
63
+
64
+ post_install_message:
65
+ rdoc_options:
66
+ - --charset=UTF-8
67
+ require_paths:
68
+ - lib
69
+ required_ruby_version: !ruby/object:Gem::Requirement
70
+ none: false
71
+ requirements:
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ hash: 3
75
+ segments:
76
+ - 0
77
+ version: "0"
78
+ required_rubygems_version: !ruby/object:Gem::Requirement
79
+ none: false
80
+ requirements:
81
+ - - ">="
82
+ - !ruby/object:Gem::Version
83
+ hash: 3
84
+ segments:
85
+ - 0
86
+ version: "0"
87
+ requirements: []
88
+
89
+ rubyforge_project:
90
+ rubygems_version: 1.3.7
91
+ signing_key:
92
+ specification_version: 3
93
+ summary: An extension to solrizer that deals with Fedora objects & Repositories
94
+ test_files:
95
+ - spec/integration/fedora_indexer_spec.rb
96
+ - spec/spec_helper.rb
97
+ - spec/units/fedora_extractor_spec.rb
98
+ - spec/units/fedora_indexer_spec.rb
99
+ - spec/units/fedora_solrizer_spec.rb