solrizer-fedora 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.textile
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,22 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+ /.bundle
21
+
22
+ ## PROJECT::SPECIFIC
data/Gemfile ADDED
@@ -0,0 +1,10 @@
1
+ source "http://rubygems.org"
2
+ gem 'active-fedora', '1.2.4'
3
+ gem 'solrizer', '>=0.3.0'
4
+
5
+ group :development, :test do
6
+ gem 'ruby-debug'
7
+ gem 'ruby-debug-base'
8
+ gem 'rspec', '<2.0.0'
9
+ gem 'mocha'
10
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,44 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ active-fedora (1.2.4)
5
+ mime-types (>= 1.16)
6
+ multipart-post
7
+ nokogiri
8
+ om (>= 1.0)
9
+ solr-ruby (>= 0.0.6)
10
+ xml-simple (>= 1.0.12)
11
+ columnize (0.3.1)
12
+ facets (2.9.0)
13
+ linecache (0.43)
14
+ mime-types (1.16)
15
+ mocha (0.9.9)
16
+ rake
17
+ multipart-post (1.0.1)
18
+ nokogiri (1.4.3.1)
19
+ om (1.0.0)
20
+ facets
21
+ nokogiri (>= 1.4.2)
22
+ rake (0.8.7)
23
+ rspec (1.3.1)
24
+ ruby-debug (0.10.3)
25
+ columnize (>= 0.1)
26
+ ruby-debug-base (~> 0.10.3.0)
27
+ ruby-debug-base (0.10.3)
28
+ linecache (>= 0.3)
29
+ solr-ruby (0.0.8)
30
+ solrizer (0.3.0)
31
+ active-fedora (>= 1.1.5)
32
+ om (>= 1.0.0)
33
+ xml-simple (1.0.12)
34
+
35
+ PLATFORMS
36
+ ruby
37
+
38
+ DEPENDENCIES
39
+ active-fedora (= 1.2.4)
40
+ mocha
41
+ rspec (< 2.0.0)
42
+ ruby-debug
43
+ ruby-debug-base
44
+ solrizer (>= 0.2.0)
data/History.textile ADDED
@@ -0,0 +1,3 @@
1
+ h2. 0.1.0
2
+
3
+ Initial Release -- pretty much a direct replica of all of the fedora-related stuff in solrizer versions older than 0.3.0
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Matt Zumwalt
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.textile ADDED
@@ -0,0 +1,34 @@
1
+ h1. solrizer-fedora
2
+
3
+ An extension to projecthydra/solrizer that provides utilities for loading objects from Fedora Repositories and creating solr documents from them.
4
+
5
+ h2. Usage
6
+
7
+ <pre>gem install solrizer-fedora</pre>
8
+
9
+ You must tell the app where to find fedora and solr. Put that information into config/fedora.yml and config/solr.yml
10
+
11
+ Then...
12
+
13
+ <pre>
14
+ irb
15
+ require "rubygems"
16
+ require "solrizer-fedora"
17
+ solrizer = Solrizer::Fedora::Solrizer.new
18
+ solrizer.solrize("demo:5")
19
+ </pre>
20
+
21
+
22
+ h2. Note on Patches/Pull Requests
23
+
24
+ * Fork the project.
25
+ * Make your feature addition or bug fix.
26
+ * Add tests for it. This is important so I don't break it in a
27
+ future version unintentionally.
28
+ * Commit, do not mess with rakefile, version, or history.
29
+ (if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
30
+ * Send me a pull request. Bonus points for topic branches.
31
+
32
+ h2. Copyright
33
+
34
+ Copyright (c) 2010 Matt Zumwalt and MediaShelf. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,57 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
+ begin
5
+ require 'jeweler'
6
+ Jeweler::Tasks.new do |gem|
7
+ gem.name = "solrizer-fedora"
8
+ gem.summary = %Q{An extension to solrizer that deals with Fedora objects & Repositories}
9
+ gem.description = %Q{An extension to projecthydra/solrizer that provides utilities for loading objects from Fedora Repositories and creating solr documents from them.}
10
+ gem.email = "matt.zumwalt@yourmediashelf.com"
11
+ gem.homepage = "http://github.com/projecthydra/solrizer-fedora"
12
+ gem.authors = ["Matt Zumwalt"]
13
+ # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
14
+ end
15
+ Jeweler::GemcutterTasks.new
16
+ rescue LoadError
17
+ puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
18
+ end
19
+
20
+ require 'spec/rake/spectask'
21
+ Spec::Rake::SpecTask.new(:spec) do |spec|
22
+ spec.libs << 'lib' << 'spec'
23
+ spec.spec_files = FileList['spec/**/*_spec.rb']
24
+ end
25
+
26
+ Spec::Rake::SpecTask.new(:rcov) do |spec|
27
+ spec.libs << 'lib' << 'spec'
28
+ spec.pattern = 'spec/**/*_spec.rb'
29
+ spec.rcov = true
30
+ end
31
+
32
+ task :spec => :check_dependencies
33
+
34
+ task :default => :spec
35
+
require 'rake/rdoctask'

# Defines the rdoc task: builds documentation for the README and everything
# under lib/ into the rdoc/ directory.
Rake::RDocTask.new do |rdoc|
  # VERSION is a one-line file; strip the trailing newline so it does not leak
  # into the generated page title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  # This gem is solrizer-fedora; the title previously read "solrizer".
  rdoc.title = "solrizer-fedora #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
45
+
46
+ begin
47
+ require 'rcov/rcovtask'
48
+ Rcov::RcovTask.new do |test|
49
+ test.libs << 'test'
50
+ test.pattern = 'test/**/test_*.rb'
51
+ test.verbose = true
52
+ end
53
+ rescue LoadError
54
+ task :rcov do
55
+ abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
56
+ end
57
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
data/config/fedora.yml ADDED
@@ -0,0 +1,16 @@
1
+ development:
2
+ fedora:
3
+ url: http://fedoraAdmin:fedoraAdmin@127.0.0.1:8983/fedora
4
+ solr:
5
+ url: http://127.0.0.1:8983/solr/development
6
+ test:
7
+ fedora:
8
+ url: http://fedoraAdmin:fedoraAdmin@127.0.0.1:8983/fedora
9
+ solr:
10
+ url: http://127.0.0.1:8983/solr/test
11
+ production:
12
+ fedora:
13
+ url: http://fedoraAdmin:fedoraAdmin@127.0.0.1:8080/fedora
14
+ solr:
15
+ url: http://127.0.0.1:8080/solr
16
+
@@ -0,0 +1,4 @@
1
+ info:fedora/afmodel:SaltDocument : salt_document
2
+ info:fedora/afmodel:JP2Document : jp2_document
3
+ info:fedora/afmodel:ModsDocument : mods_document
4
+ info:fedora/afmodel:DCDocument : dc_document
data/config/solr.yml ADDED
@@ -0,0 +1,7 @@
1
+ development:
2
+ url: http://localhost:8983/solr/development
3
+ test: &TEST
4
+ url: http://localhost:8983/solr/test
5
+ production:
6
+ url: http://localhost:8080/solr/production
7
+
@@ -0,0 +1 @@
1
+ require "solrizer/fedora"
@@ -0,0 +1,7 @@
1
+ require "rubygems"
2
+ require "solrizer"
3
+ module Solrizer::Fedora
4
+ end
5
+ Dir[File.join(File.dirname(__FILE__),"fedora","*.rb")].each {|file| require file }
6
+
7
+ Solrizer::Extractor.send(:include, Solrizer::Fedora::Extractor)
@@ -0,0 +1,34 @@
1
+ require 'solr'
2
+ require 'rexml/document'
3
+ require "nokogiri"
4
+ require 'yaml'
5
+
6
+ module Solrizer::Fedora::Extractor
7
+
#
# Extracts content-model and hydra-type from RELS-EXT datastream
#
# For every hasModel element (namespace info:fedora/fedora-system:def/model#)
# found in +text+, a :cmodel_t field holding the element's resource attribute
# is appended to +solr_doc+. When that value is a key in
# config/hydra_types.yml, a :hydra_type_t field with the mapped value is
# appended as well.
#
# text     - the RELS-EXT XML; handed straight to Nokogiri::XML
# solr_doc - the document to append fields to (a new Solr::Document by default)
#
# Returns solr_doc.
#
def extract_rels_ext( text, solr_doc=Solr::Document.new )
  # TODO: only read in this file once
  config_path =
    if defined?(RAILS_ROOT)
      File.join(RAILS_ROOT, "config")
    else
      File.join(File.dirname(__FILE__), "..", "..", "..", "config")
    end

  # YAML.load_file opens and closes the file itself; the previous
  # YAML.load(File.open(...)) left the handle open on every call.
  # An empty mapping file loads as nil/false, so fall back to an empty map.
  map = YAML.load_file(File.join(config_path, "hydra_types.yml")) || {}

  doc = Nokogiri::XML(text)
  doc.xpath( '//foo:hasModel', 'foo' => 'info:fedora/fedora-system:def/model#' ).each do |element|
    cmodel = element.attributes['resource'].to_s
    solr_doc << Solr::Field.new( :cmodel_t => cmodel )
    solr_doc << Solr::Field.new( :hydra_type_t => map[cmodel] ) if map.has_key?(cmodel)
  end

  solr_doc
end
33
+
34
+ end
@@ -0,0 +1,213 @@
1
+ require 'solr'
2
+ require 'solrizer/extractor'
3
+ require 'solrizer/fedora/repository'
4
+
5
+ module Solrizer::Fedora
6
+ class Indexer
7
+ #
8
+ # Class variables
9
+ #
10
+ @@unique_id = 0
11
+
12
+ def self.unique_id
13
+ @@unique_id
14
+ end
15
+
16
+ #
17
+ # Member variables
18
+ #
19
+ attr_accessor :connection, :extractor, :index_full_text
20
+
21
+ #
22
+ # This method performs initialization tasks
23
+ #
24
+ def initialize( opts={} )
25
+ @@index_list = false unless defined?(@@index_list)
26
+ @extractor = ::Solrizer::Extractor.new
27
+
28
+ if opts[:index_full_text] == true || opts[:index_full_text] == "true"
29
+ @index_full_text = true
30
+ else
31
+ @index_full_text = false
32
+ end
33
+
34
+ connect
35
+ end
36
+
#
# This method connects to the Solr instance
#
# ActiveFedora is initialized first when its fedora_config is empty.
#
# The Solr settings come from Blacklight.solr_config when Blacklight is
# defined. Otherwise they are read from solr.yml:
#   * under RAILS_ROOT/config, keyed by RAILS_ENV, when RAILS_ROOT is defined
#   * otherwise from the config directory three levels above this file, keyed
#     by ENV["environment"] (falling back to "development")
#
# URL selection: ['fulltext']['url'] when index_full_text is true, else
# ['default']['url'] when a "default" key exists, else ['url'].
#
# Stores the new Solr::Connection (autocommit on) in @connection and returns it.
# Raises RuntimeError when solr.yml has no entry for the selected environment.
#
def connect
  ActiveFedora.init if ActiveFedora.fedora_config.empty?

  if defined?(Blacklight)
    solr_config = Blacklight.solr_config
  else
    if defined?(RAILS_ROOT)
      config_path = File.join(RAILS_ROOT, "config")
      environment = RAILS_ENV
    else
      config_path = File.join(File.dirname(__FILE__), "..", "..", "..", "config")
      environment = ENV["environment"] || "development"
    end

    solr_yml = File.join(config_path, "solr.yml")
    # YAML.load_file closes the file for us; the previous
    # YAML.load(File.open(...)) leaked a file handle per connect.
    solr_config = YAML.load_file(solr_yml)[environment]
    if solr_config.nil?
      raise "No Solr configuration found for environment #{environment.inspect} in #{solr_yml}"
    end
    puts solr_config.inspect
  end

  url =
    if index_full_text == true
      solr_config['fulltext']['url']
    elsif solr_config.has_key?("default")
      solr_config['default']['url']
    else
      solr_config['url']
    end

  @connection = Solr::Connection.new(url, :autocommit => :on )
end
81
+
82
+ #
83
+ # This method extracts the facet categories from the given Fedora object's external tag datastream
84
+ #
85
+ def extract_xml_to_solr( obj, ds_name, solr_doc=Solr::Document.new )
86
+ xml_ds = Repository.get_datastream( obj, ds_name )
87
+ extractor.xml_to_solr( xml_ds.content, solr_doc )
88
+ end
89
+
90
+ #
91
+ #
92
+ #
93
+ def extract_rels_ext( obj, ds_name, solr_doc=Solr::Document.new )
94
+ rels_ext_ds = Repository.get_datastream( obj, ds_name )
95
+ extractor.extract_rels_ext( rels_ext_ds.content, solr_doc )
96
+ end
97
+
#
# This method generates the month and day facets from the date_t in solr_doc
#
# When solr_doc has no :date_t value, a placeholder :date_t of "9999-99-99" is
# added first. The :date_t value is then run through Date._parse and two fields
# are appended:
#   :month_facet - two-digit month ("01".."12"), or "99" when the month is
#                  missing or out of range
#   :day_facet   - two-digit day ("01".."31"), or "99" when the day is missing
#                  or out of range
#
# Returns solr_doc.
#
def generate_dates(solr_doc)
  # Date._parse comes from the stdlib date library, which this file never
  # loaded itself; require it here so the method does not depend on load order.
  require 'date'

  # This will check for valid dates, but it seems most of the dates are currently invalid....
  #date_check =  /^(19|20)\d\d([- \/.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])/

  # if there is no date_t, add one with an easy-to-find value
  solr_doc << Solr::Field.new( :date_t => "9999-99-99") if solr_doc[:date_t].nil?

  date_obj = Date._parse(solr_doc[:date_t])

  # Zero-pads an in-range part to two digits; anything else becomes the "99"
  # sentinel (a String, like the valid values, instead of the Integer 99 used before).
  facet_value = lambda do |part, max|
    (part && part > 0 && part <= max) ? part.to_s.rjust(2, '0') : "99"
  end

  solr_doc << Solr::Field.new( :month_facet => facet_value.call(date_obj[:mon], 12) )
  solr_doc << Solr::Field.new( :day_facet => facet_value.call(date_obj[:mday], 31) )

  solr_doc
end
135
+
136
+
137
+ #
138
+ # This method creates a Solr-formatted XML document
139
+ #
140
+ def create_document( obj )
141
+
142
+ solr_doc = Solr::Document.new
143
+
144
+ model_klazz_array = ActiveFedora::ContentModel.known_models_for( obj )
145
+ model_klazz_array.delete(ActiveFedora::Base)
146
+
147
+ # If the object was passed in as an ActiveFedora::Base, call to_solr in order to get the base field entries from ActiveFedora::Base
148
+ # Otherwise, the object was passed in as a model instance other than ActiveFedora::Base,so call its to_solr method & allow it to insert the fields from ActiveFedora::Base
149
+ if obj.class == ActiveFedora::Base
150
+ solr_doc = obj.to_solr(solr_doc)
151
+ puts " added base fields from #{obj.class.to_s}"
152
+ else
153
+ solr_doc = obj.to_solr(solr_doc)
154
+ model_klazz_array.delete(obj.class)
155
+ puts " added base fields from #{obj.class.to_s} and model fields from #{obj.class.to_s}"
156
+ end
157
+
158
+ # Load the object as an instance of each of its other models and get the corresponding solr fields
159
+ # Include :model_only=>true in the options in order to avoid adding the metadata from ActiveFedora::Base every time.
160
+ model_klazz_array.each do |klazz|
161
+ instance = klazz.load_instance(obj.pid)
162
+ solr_doc = instance.to_solr(solr_doc, :model_only=>true)
163
+ puts " added solr fields from #{klazz.to_s}"
164
+ end
165
+
166
+ solr_doc << Solr::Field.new( :id_t => "#{obj.pid}" )
167
+ solr_doc << Solr::Field.new( :id => "#{obj.pid}" ) unless solr_doc[:id]
168
+
169
+ # increment the unique id to ensure that all documents in the search index are unique
170
+ @@unique_id += 1
171
+
172
+ return solr_doc
173
+ end
174
+
175
+ #
176
+ # This method adds a document to the Solr search index
177
+ #
178
+ def index( obj )
179
+ # print "Indexing '#{obj.pid}'..."
180
+ begin
181
+
182
+ solr_doc = create_document( obj )
183
+ connection.add( solr_doc )
184
+
185
+ # puts connection.url
186
+ #puts solr_doc
187
+ # puts "done"
188
+
189
+ # rescue Exception => e
190
+ # p "unable to index #{obj.pid}. Failed with #{e.inspect}"
191
+ end
192
+
193
+ end
194
+
#
# This method queries the Solr search index and returns a response
#
# query_str - the query, passed unchanged to the connection's #query
#
# Goes through the +connection+ accessor. The previous body called `conn`,
# which is not defined anywhere in this class, so every call raised NameError.
#
def query( query_str )
  connection.query( query_str )
end
201
+
202
+
203
+ private :connect, :create_document
204
+
205
+ def class_exists?(class_name)
206
+ klass = Module.const_get(class_name)
207
+ return klass.is_a?(Class)
208
+ rescue NameError
209
+ return false
210
+ end
211
+
212
+ end
213
+ end
@@ -0,0 +1,44 @@
1
+ require 'active-fedora'
2
+
3
+ module Solrizer::Fedora
4
+ class Repository
5
+
6
+ #
7
+ # This method retrieves a comprehensive list of unique ids in the fedora repository
8
+ #
9
+ def self.get_pids( num_docs )
10
+ solr_results = ActiveFedora::SolrService.instance.conn.query( "active_fedora_model_field:Document", { :rows => num_docs } )
11
+ id_array = []
12
+ solr_results.hits.each do |hit|
13
+ id_array << hit[SOLR_DOCUMENT_ID]
14
+ end
15
+ return id_array
16
+ end
17
+
18
+ #
19
+ # This method retrieves the object associated with the given unique id
20
+ #
21
+ def self.get_object( pid )
22
+ object = ActiveFedora::Base.load_instance( pid )
23
+ end
24
+
25
+ #
26
+ # This method retrieves a comprehensive list of datastreams for the given object
27
+ #
28
+ def self.get_datastreams( obj )
29
+ ds_keys = obj.datastreams.keys
30
+ end
31
+
32
+ #
33
+ # This method retrieves the datastream for the given object with the given datastream name
34
+ #
35
+ def self.get_datastream( obj, ds_name )
36
+ begin
37
+ obj.datastreams[ ds_name ]
38
+ rescue
39
+ return nil
40
+ end
41
+ end
42
+
43
+ end
44
+ end
@@ -0,0 +1,118 @@
1
+ require 'solrizer/field_mapper.rb'
2
+ require 'solrizer/field_name_mapper'
3
+
4
+ require 'solrizer/fedora/indexer'
5
+ require 'solrizer/xml'
6
+ require 'solrizer/html'
7
+
8
+ # Let people explicitly require xml support if they want it ...
9
+ # require 'solrizer/xml.rb'
10
+
11
+ # require 'fastercsv'
12
+ require "ruby-debug"
13
+
14
+
15
+ module Solrizer::Fedora
16
+ class Solrizer
17
+
18
+ attr_accessor :indexer, :index_full_text
19
+
20
+ #
21
+ # This method initializes the indexer
22
+ # If passed an argument of :index_full_text=>true, it will perform full-text indexing instead of indexing fields only.
23
+ #
24
+ def initialize( opts={} )
25
+ @@index_list = false unless defined?(@@index_list)
26
+ if opts[:index_full_text] == true || opts[:index_full_text] == "true"
27
+ @index_full_text = true
28
+ else
29
+ @index_full_text = false
30
+ end
31
+ @indexer = Indexer.new( :index_full_text=>@index_full_text )
32
+ end
33
+
34
+ #
35
+ # This method solrizes the given Fedora object's full-text and facets into the search index
36
+ #
37
+ def solrize( obj )
38
+ # retrieve the Fedora object based on the given unique id
39
+
40
+ begin
41
+
42
+ start = Time.now
43
+ print "Retrieving object #{obj} ..."
44
+
45
+ case obj
46
+ when ActiveFedora::Base
47
+ # do nothing
48
+ when Fedora::FedoraObject
49
+ obj = Repository.get_object( obj.pid )
50
+ when String
51
+ obj = Repository.get_object( obj )
52
+ else
53
+ raise "you must pass either a ActiveFedora::Base, Fedora::RepositoryObject, or a String. You submitted a #{obj.class}"
54
+ end
55
+
56
+ # obj = obj.kind_of?(ActiveFedora::Base) ? obj : Repository.get_object( obj )
57
+
58
+ obj_done = Time.now
59
+ obj_done_elapse = obj_done - start
60
+ puts " completed. Duration: #{obj_done_elapse}"
61
+
62
+ print "\t Indexing object #{obj.pid} ... "
63
+ # add the keywords and facets to the search index
64
+ index_start = Time.now
65
+ indexer.index( obj )
66
+
67
+ index_done = Time.now
68
+ index_elapsed = index_done - index_start
69
+
70
+ puts "completed. Duration: #{index_elapsed} ."
71
+
72
+
73
+ rescue Exception => e
74
+ p "unable to index #{obj}. Failed with #{e.inspect}"
75
+
76
+
77
+ end #begin
78
+
79
+ end
80
+
#
# This method retrieves a comprehensive list of all the unique identifiers in Fedora and
# solrizes each object's full-text and facets into the search index
#
# When @@index_list is false, up to num_docs objects are fetched with
# ::Fedora::Repository.instance.find_objects and each one is passed to #solrize.
# Otherwise @@index_list is treated as the path of a CSV file whose first
# column holds the pids to solrize; a missing file is reported on stdout.
#
def solrize_objects
  # retrieve a list of all the pids in the fedora repository
  num_docs = 1000000  # modify this number to guarantee that all the objects are retrieved from the repository
  puts "WARNING: You have turned off indexing of Full Text content. Be sure to re-run indexer with @@index_full_text set to true in main.rb" if index_full_text == false

  if @@index_list == false

    objects = ::Fedora::Repository.instance.find_objects(:limit=>num_docs)

    puts "Shelving #{objects.length} Fedora objects"
    objects.each do |object|
      solrize( object )
    end

  elsif File.exist?(@@index_list) # File.exists? is deprecated and removed in Ruby 3.2
    # The fastercsv require in this file is commented out, so FasterCSV was an
    # undefined constant on this path. The standard library CSV reads the same
    # rows-of-columns structure; it is loaded lazily because only this branch needs it.
    require 'csv'
    arr_of_pids = CSV.read(@@index_list)

    puts "Indexing from list at #{@@index_list}"
    puts "Shelving #{arr_of_pids.length} Fedora objects"

    arr_of_pids.each do |row|
      pid = row[0]
      solrize( pid )
    end
  else
    puts "#{@@index_list} does not exists!"
  end
end #solrize_objects
116
+
117
+ end #class
118
+ end #module
@@ -0,0 +1,35 @@
namespace :solrizer do

  # `namespace :fedora` was missing its `do`, which made this file a syntax error.
  namespace :fedora do
    # Usage: rake solrizer:fedora:solrize PID=sample:pid [FULL_TEXT=true]
    desc 'Index a fedora object of the given pid.'
    task :solrize => :environment do
      index_full_text = ENV['FULL_TEXT'] == 'true'
      if ENV['PID']
        puts "indexing #{ENV['PID'].inspect}"
        solrizer = Solrizer::Fedora::Solrizer.new :index_full_text=> index_full_text
        solrizer.solrize(ENV['PID'])
        puts "Finished shelving #{ENV['PID']}"
      else
        # The hint now names the task as it is actually registered.
        puts "You must provide a pid using the format 'solrizer:fedora:solrize PID=sample:pid'."
      end
    end

    # Usage: rake solrizer:fedora:solrize_objects [INDEX_LIST=path/to/pids.csv] [FULL_TEXT=true]
    desc 'Index all objects in the repository.'
    task :solrize_objects => :environment do
      index_full_text = ENV['FULL_TEXT'] == 'true'
      if ENV['INDEX_LIST']
        # NOTE(review): this sets a class variable from the Rakefile's top level,
        # presumably so Solrizer::Fedora::Solrizer sees it as its @@index_list.
        # Ruby 3.0+ raises on class variable access from toplevel - confirm and
        # replace with an explicit setting if this task must run there.
        @@index_list = ENV['INDEX_LIST']
      end

      puts "Re-indexing Fedora Repository."
      puts "Fedora URL: #{ActiveFedora.fedora_config[:url]}"
      puts "Fedora Solr URL: #{ActiveFedora.solr_config[:url]}"
      puts "Blacklight Solr Config: #{Blacklight.solr_config.inspect}"
      puts "Doing full text index." if index_full_text
      solrizer = Solrizer::Fedora::Solrizer.new :index_full_text=> index_full_text
      solrizer.solrize_objects
      puts "Solrizer task complete."
    end
  end

end
@@ -0,0 +1,72 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{solrizer-fedora}
8
+ s.version = "0.1.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Matt Zumwalt"]
12
+ s.date = %q{2010-10-26}
13
+ s.description = %q{An extension to projecthydra/solrizer that provides utilities for loading objects from Fedora Repositories and creating solr documents from them.}
14
+ s.email = %q{matt.zumwalt@yourmediashelf.com}
15
+ s.extra_rdoc_files = [
16
+ "LICENSE",
17
+ "README.textile"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ ".gitignore",
22
+ "Gemfile",
23
+ "Gemfile.lock",
24
+ "History.textile",
25
+ "LICENSE",
26
+ "README.textile",
27
+ "Rakefile",
28
+ "VERSION",
29
+ "config/fedora.yml",
30
+ "config/hydra_types.yml",
31
+ "config/solr.yml",
32
+ "lib/solrizer-fedora.rb",
33
+ "lib/solrizer/fedora.rb",
34
+ "lib/solrizer/fedora/extractor.rb",
35
+ "lib/solrizer/fedora/indexer.rb",
36
+ "lib/solrizer/fedora/repository.rb",
37
+ "lib/solrizer/fedora/solrizer.rb",
38
+ "lib/tasks/solrizer-fedora.rake",
39
+ "solrizer-fedora.gemspec",
40
+ "spec/fixtures/rels_ext_cmodel.xml",
41
+ "spec/integration/fedora_indexer_spec.rb",
42
+ "spec/rcov.opts",
43
+ "spec/spec.opts",
44
+ "spec/spec_helper.rb",
45
+ "spec/units/fedora_extractor_spec.rb",
46
+ "spec/units/fedora_indexer_spec.rb",
47
+ "spec/units/fedora_solrizer_spec.rb"
48
+ ]
49
+ s.homepage = %q{http://github.com/projecthydra/solrizer-fedora}
50
+ s.rdoc_options = ["--charset=UTF-8"]
51
+ s.require_paths = ["lib"]
52
+ s.rubygems_version = %q{1.3.7}
53
+ s.summary = %q{An extension to solrizer that deals with Fedora objects & Repositories}
54
+ s.test_files = [
55
+ "spec/integration/fedora_indexer_spec.rb",
56
+ "spec/spec_helper.rb",
57
+ "spec/units/fedora_extractor_spec.rb",
58
+ "spec/units/fedora_indexer_spec.rb",
59
+ "spec/units/fedora_solrizer_spec.rb"
60
+ ]
61
+
62
+ if s.respond_to? :specification_version then
63
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
64
+ s.specification_version = 3
65
+
66
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
67
+ else
68
+ end
69
+ else
70
+ end
71
+ end
72
+
@@ -0,0 +1,8 @@
1
+ <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
2
+ <rdf:Description rdf:about="info:fedora/demo:multipurpose-objects-model_and_sdef">
3
+ <hasModel xmlns="info:fedora/fedora-system:def/model#" rdf:resource="info:fedora/fedora-system:ContentModel-3.0"/>
4
+ <hasModel xmlns="info:fedora/fedora-system:def/model#" rdf:resource="info:fedora/afmodel:SaltDocument"/>
5
+ <hasModel xmlns="info:fedora/fedora-system:def/model#" rdf:resource="info:fedora/afmodel:JP2Document"/>
6
+ <hasModel xmlns="info:fedora/fedora-system:def/model#" rdf:resource="info:fedora/afmodel:DCDocument"/>
7
+ </rdf:Description>
8
+ </rdf:RDF>
@@ -0,0 +1,18 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
+ require 'solrizer'
3
+
# Integration spec for the indexer; the single example is still pending until
# the gem decides how to provide Fedora/Solr fixtures.
describe Solrizer::Fedora::Indexer do

  before(:each) do
    @indexer = Solrizer::Fedora::Indexer.new
  end

  describe "index" do
    it "should update solr with the metadata from the given object" do
      pending "Got to decide if/how to handle fixtures in this gem. Probably should just mock out Fedora & Solr entirely."
      # Repository is defined as Solrizer::Fedora::Repository in this gem;
      # the example previously referenced Solrizer::Repository.
      obj = Solrizer::Fedora::Repository.get_object( "druid:sb733gr4073" )
      @indexer.index( obj )
    end
  end

end
data/spec/rcov.opts ADDED
@@ -0,0 +1,2 @@
1
+ --exclude "spec/*,gems/*"
2
+ --rails
data/spec/spec.opts ADDED
@@ -0,0 +1 @@
1
+ --color
@@ -0,0 +1,18 @@
1
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
2
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
3
+ require 'solrizer/fedora'
4
+ require 'spec'
5
+ require 'spec/autorun'
6
+
7
+ require 'solrizer'
8
+
9
+ Spec::Runner.configure do |config|
10
+
11
+ config.mock_with :mocha
12
+
13
+
14
+ def fixture(file)
15
+ File.new(File.join(File.dirname(__FILE__), 'fixtures', file))
16
+ end
17
+
18
+ end
@@ -0,0 +1,31 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
+ require 'solrizer'
3
+
4
+ describe Solrizer::Fedora::Extractor do
5
+
6
+ before(:all) do
7
+ @extractor = Solrizer::Extractor.new
8
+ end
9
+
10
+ describe "extract_rels_ext" do
11
+ it "should extract the content model of the RELS-EXT datastream of a Fedora object and set hydra_type using hydra_types mapping" do
12
+ rels_ext = fixture("rels_ext_cmodel.xml")
13
+ result = @extractor.extract_rels_ext( rels_ext )
14
+ result[:cmodel_t].should == "info:fedora/fedora-system:ContentModel-3.0"
15
+ result[:hydra_type_t].should == "salt_document"
16
+
17
+ # ... and a hacky way of making sure that it added a field for each of the hasModel values and their mapped hydra types
18
+ result.inspect.include?('@value="info:fedora/afmodel:SaltDocument"').should be_true
19
+ result.inspect.include?('@value="jp2_document"').should be_true
20
+ end
21
+ end
22
+
23
+ describe "extract_hydra_types" do
24
+ it "should extract the hydra_type of a Fedora object" do
25
+ rels_ext = fixture("rels_ext_cmodel.xml")
26
+ result = @extractor.extract_rels_ext( rels_ext )
27
+ result[:hydra_type_t].should == "salt_document"
28
+ end
29
+ end
30
+
31
+ end
@@ -0,0 +1,78 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
+ require 'solrizer'
3
+ require "solrizer/fedora"
4
+
5
+ describe Solrizer::Fedora::Indexer do
6
+
# Builds an Indexer whose Solr connection and extractor are both stubbed, plus
# an empty Solr::Document for the examples to fill in.
before(:each) do
  Solrizer::Fedora::Indexer.any_instance.stubs(:connect).returns("foo")

  # Create the document before stubbing with it: the stub below used to be set
  # up while @solr_doc was still nil, so it returned nil instead of a document.
  @solr_doc = Solr::Document.new

  @extractor = mock("Extractor")
  @extractor.stubs(:html_content_to_solr).returns(@solr_doc)

  Solrizer::Extractor.expects(:new).returns(@extractor)
  @indexer = Solrizer::Fedora::Indexer.new
end
22
+
23
+ describe "#generate_dates" do
24
+ it "should still give 9999-99-99 date if the solr document does not have a date_t field" do
25
+
26
+ solr_result = @indexer.generate_dates(@solr_doc)
27
+ solr_result.should be_kind_of Solr::Document
28
+ solr_result[:date_t].should == "9999-99-99"
29
+ solr_result[:month_facet].should == "99"
30
+ solr_result[:day_facet].should == '99'
31
+
32
+ end
33
+
34
+ it "should still give 9999-99-99 date if the solr_doc[:date_t] is not valid date in YYYY-MM-DD format " do
35
+
36
+ @solr_doc << Solr::Field.new(:date_t => "Unknown")
37
+ solr_result = @indexer.generate_dates(@solr_doc)
38
+ solr_result.should be_kind_of Solr::Document
39
+ solr_result[:date_t].should == "Unknown"
40
+ solr_result[:month_facet].should == "99"
41
+ solr_result[:day_facet].should == '99'
42
+
43
+ end
44
+
45
+ it "should give month and dates even if the :date_t is not a valid date but is in YYYY-MM-DD format " do
46
+
47
+ @solr_doc << Solr::Field.new(:date_t => "0000-13-11")
48
+ solr_result = @indexer.generate_dates(@solr_doc)
49
+ solr_result.should be_kind_of Solr::Document
50
+ solr_result[:date_t].should == "0000-13-11"
51
+ solr_result[:month_facet].should == "99"
52
+ solr_result[:day_facet].should == '11'
53
+ end
54
+
55
+ it "should give month and day when in a valid date format" do
56
+ @solr_doc << Solr::Field.new(:date_t => "1978-04-11")
57
+ solr_result = @indexer.generate_dates(@solr_doc)
58
+ solr_result.should be_kind_of Solr::Document
59
+ solr_result[:date_t].should == "1978-04-11"
60
+ solr_result[:month_facet].should == "04"
61
+ solr_result[:day_facet].should == '11'
62
+
63
+ end
64
+
65
+ it "should still give two digit strings even if the month/day is single digit" do
66
+
67
+ @solr_doc << Solr::Field.new(:date_t => "1978-4-1")
68
+ solr_result = @indexer.generate_dates(@solr_doc)
69
+ solr_result.should be_kind_of Solr::Document
70
+ solr_result[:date_t].should == "1978-4-1"
71
+ solr_result[:month_facet].should == "04"
72
+ solr_result[:day_facet].should == '01'
73
+
74
+ end
75
+
76
+ end
77
+
78
+ end
@@ -0,0 +1,42 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
+
3
+ describe Solrizer::Fedora::Solrizer do
4
+
5
+ before(:each) do
6
+ @solrizer = Solrizer::Fedora::Solrizer.new
7
+ end
8
+
9
+ describe "solrize" do
10
+ it "should trigger the indexer for the provided object" do
11
+ sample_obj = ActiveFedora::Base.new
12
+ @solrizer.indexer.expects(:index).with( sample_obj )
13
+ @solrizer.solrize( sample_obj )
14
+ end
15
+ it "should work with Fedora::FedoraObject objects" do
16
+ mock_object = Fedora::FedoraObject.new(:pid=>"my:pid", :label=>"my label")
17
+ ActiveFedora::Base.expects(:load_instance).with( mock_object.pid ).returns(mock_object)
18
+ @solrizer.indexer.expects(:index).with( mock_object )
19
+ @solrizer.solrize( mock_object )
20
+ end
21
+ it "should load the object if only a pid is provided" do
22
+ mock_object = mock("my object")
23
+ mock_object.stubs(:pid)
24
+ mock_object.stubs(:label)
25
+ mock_object.stubs(:datastreams).returns({'descMetadata'=>"foo","location"=>"bar"})
26
+
27
+ ActiveFedora::Base.expects(:load_instance).with( "_PID_" ).returns(mock_object)
28
+ @solrizer.indexer.expects(:index).with(mock_object)
29
+ @solrizer.solrize("_PID_")
30
+ end
31
+
32
+ end
33
+
34
+ describe "solrize_objects" do
35
+ it "should call solrize for each object returned by Fedora::Repository.find_objects" do
36
+ objects = [["pid1"], ["pid2"], ["pid3"]]
37
+ Fedora::Repository.any_instance.expects(:find_objects).returns(objects)
38
+ objects.each {|object| @solrizer.expects(:solrize).with( object ) }
39
+ @solrizer.solrize_objects
40
+ end
41
+ end
42
+ end
metadata ADDED
@@ -0,0 +1,99 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: solrizer-fedora
3
+ version: !ruby/object:Gem::Version
4
+ hash: 27
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 0
10
+ version: 0.1.0
11
+ platform: ruby
12
+ authors:
13
+ - Matt Zumwalt
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-10-26 00:00:00 -05:00
19
+ default_executable:
20
+ dependencies: []
21
+
22
+ description: An extension to projecthydra/solrizer that provides utilities for loading objects from Fedora Repositories and creating solr documents from them.
23
+ email: matt.zumwalt@yourmediashelf.com
24
+ executables: []
25
+
26
+ extensions: []
27
+
28
+ extra_rdoc_files:
29
+ - LICENSE
30
+ - README.textile
31
+ files:
32
+ - .document
33
+ - .gitignore
34
+ - Gemfile
35
+ - Gemfile.lock
36
+ - History.textile
37
+ - LICENSE
38
+ - README.textile
39
+ - Rakefile
40
+ - VERSION
41
+ - config/fedora.yml
42
+ - config/hydra_types.yml
43
+ - config/solr.yml
44
+ - lib/solrizer-fedora.rb
45
+ - lib/solrizer/fedora.rb
46
+ - lib/solrizer/fedora/extractor.rb
47
+ - lib/solrizer/fedora/indexer.rb
48
+ - lib/solrizer/fedora/repository.rb
49
+ - lib/solrizer/fedora/solrizer.rb
50
+ - lib/tasks/solrizer-fedora.rake
51
+ - solrizer-fedora.gemspec
52
+ - spec/fixtures/rels_ext_cmodel.xml
53
+ - spec/integration/fedora_indexer_spec.rb
54
+ - spec/rcov.opts
55
+ - spec/spec.opts
56
+ - spec/spec_helper.rb
57
+ - spec/units/fedora_extractor_spec.rb
58
+ - spec/units/fedora_indexer_spec.rb
59
+ - spec/units/fedora_solrizer_spec.rb
60
+ has_rdoc: true
61
+ homepage: http://github.com/projecthydra/solrizer-fedora
62
+ licenses: []
63
+
64
+ post_install_message:
65
+ rdoc_options:
66
+ - --charset=UTF-8
67
+ require_paths:
68
+ - lib
69
+ required_ruby_version: !ruby/object:Gem::Requirement
70
+ none: false
71
+ requirements:
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ hash: 3
75
+ segments:
76
+ - 0
77
+ version: "0"
78
+ required_rubygems_version: !ruby/object:Gem::Requirement
79
+ none: false
80
+ requirements:
81
+ - - ">="
82
+ - !ruby/object:Gem::Version
83
+ hash: 3
84
+ segments:
85
+ - 0
86
+ version: "0"
87
+ requirements: []
88
+
89
+ rubyforge_project:
90
+ rubygems_version: 1.3.7
91
+ signing_key:
92
+ specification_version: 3
93
+ summary: An extension to solrizer that deals with Fedora objects & Repositories
94
+ test_files:
95
+ - spec/integration/fedora_indexer_spec.rb
96
+ - spec/spec_helper.rb
97
+ - spec/units/fedora_extractor_spec.rb
98
+ - spec/units/fedora_indexer_spec.rb
99
+ - spec/units/fedora_solrizer_spec.rb