solrizer 0.1.0.pre2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,17 @@
1
#!/usr/bin/env ruby
#
# Command-line entry point: registers the Fedora/Solr connections and then
# solrizes every object in the repository.
#
# The interpreter is located through /usr/bin/env, which is where `env` lives
# on most Unix-like systems (the previous /bin/env path is not portable).

# NOTE(review): this sets an instance variable on the top-level object; nothing
# in this script reads it afterwards — confirm that solrizer.rb picks it up.
@index_full_text = false

require 'rubygems'
load 'configuration.rb'
load 'repository.rb'
load 'solrizer.rb'

# initialize connection to Fedora repository
# NOTE(review): Repository is referenced without a namespace here — confirm
# that repository.rb exposes it at the top level.
repository = Repository.new
repository.initialize_repository

# solrize all objects in the Fedora repository
# NOTE(review): assumes solrizer.rb defines an instantiable top-level Solrizer.
solrizer = Solrizer.new
solrizer.solrize_objects
17
+
@@ -0,0 +1,143 @@
1
+ require 'fastercsv'
2
+ REPLICATOR_LIST = false unless defined?(REPLICATOR_LIST)
3
+
4
+
5
module Solrizer
  # Copies objects from a source Fedora repository into a destination Fedora
  # repository as stub objects carrying a jp2 datastream plus a fixed set of
  # metadata datastreams.
  #
  # Source and destination URLs come from config/replicator.yml, keyed by
  # RAILS_ENV.
  class Replicator

    include Stanford::SaltControllerHelper
    attr_accessor :dest_repo, :configs

    # Names of the datastreams copied from the source object onto each stub.
    STUB_DATASTREAMS = ["properties", "extProperties", "descMetadata", "location"].freeze

    # Reads the replicator configuration, opens the destination repository and
    # re-registers ActiveFedora (Fedora and Solr) against the *source* URLs.
    def initialize
      config_path = "#{RAILS_ROOT}/config/replicator.yml"
      # YAML.load_file opens and closes the file itself, so no handle is left
      # dangling the way YAML::load(File.open(...)) left one.
      raw_configs = YAML.load_file(config_path)
      @configs = raw_configs[RAILS_ENV]
      @dest_repo = Fedora::Repository.new(configs["destination"]["fedora"]["url"])

      ActiveFedora.fedora_config[:url] = configs["source"]["fedora"]["url"]
      logger.info("REPLICATOR: re-initializing Fedora with fedora_config: #{ActiveFedora.fedora_config.inspect}")

      Fedora::Repository.register(ActiveFedora.fedora_config[:url])
      logger.info("REPLICATOR: re-initialized Fedora as: #{Fedora::Repository.instance.inspect}")

      # Register Solr
      ActiveFedora.solr_config[:url] = configs["source"]["solr"]["url"]

      logger.info("REPLICATOR: re-initializing ActiveFedora::SolrService with solr_config: #{ActiveFedora.solr_config.inspect}")

      ActiveFedora::SolrService.register(ActiveFedora.solr_config[:url])
    end

    # Replicates either every pid reported by Repository.get_pids or, when
    # REPLICATOR_LIST names a CSV file, the pid in the first column of each row.
    def replicate_objects
      # Upper bound on the number of pids requested; raise it if the
      # repository holds more objects than this.
      num_docs = 1_000_000

      if !REPLICATOR_LIST
        pids = Repository.get_pids(num_docs)
        puts "Replicating #{pids.length} Fedora objects"
        pids.each do |pid|
          # NOTE(review): pid is indexed with [0], so it is presumably an array
          # of values — confirm against what the Solr hits return.
          first_value = pid[0]
          # nil is tested before empty? so a missing value is skipped instead
          # of raising NoMethodError.
          next if first_value.nil? || first_value.empty? || !first_value.include?("druid:")

          puts "Processing #{pid}"
          replicate_object(pid)
        end
      elsif File.exist?(REPLICATOR_LIST) # File.exists? was removed in Ruby 3.2
        arr_of_pids = FasterCSV.read(REPLICATOR_LIST, :headers=>false)

        puts "Replicating from list at #{REPLICATOR_LIST}"
        puts "Replicating #{arr_of_pids.length} Fedora objects"

        arr_of_pids.each do |row|
          replicate_object(row[0])
        end
      else
        puts "#{REPLICATOR_LIST} does not exists!"
      end
    end

    # Replicates a single object.
    #
    # obj may be an ActiveFedora::Base instance or anything accepted by
    # Repository.get_object. A nil object is skipped. Failures raised while
    # creating the stub are reported and swallowed so a batch run continues.
    def replicate_object(obj)
      obj = obj.kind_of?(ActiveFedora::Base) ? obj : Repository.get_object(obj)
      # Guard before the first use of obj.pid; the nil check used to come
      # after the object had already been dereferenced.
      return if obj.nil?

      p "Indexing object #{obj.pid} with label #{obj.label}"
      begin
        create_stub(obj)
        p "Successfully replicated #{obj.pid}"
      rescue StandardError => e
        # StandardError rather than Exception, so interrupts and exit requests
        # still stop the run.
        p "unable to create stub. Failed with #{e.inspect}"
      end
    end

    # Creates a stub object in @dest_repo with the datastreams that we need in the stubs.
    #
    # If the first attempt fails for any StandardError, the source object is
    # given a placeholder jp2 from spec/fixtures/image.jp2, saved, reloaded,
    # and the copy is attempted once more. Errors from that second attempt
    # propagate to the caller.
    def create_stub(source_object)
      copy_stub_to_dest_repo(source_object)
    rescue
      # for object without jp2s
      # this is a temp fix to the downloadables() issue
      # NOTE(review): this branch modifies and saves the SOURCE object, and the
      # fixture path is relative to the current working directory.
      pid = source_object.pid
      p "> #{pid}"

      # Block form closes the fixture file once the datastream has been saved.
      File.open('spec/fixtures/image.jp2') do |jp2_file|
        ds = ActiveFedora::Datastream.new(:dsID => "image.jp2", :dsLabel => 'image.jp2', :controlGroup => 'M', :blob => jp2_file)
        source_object.add_datastream(ds)
        source_object.save
      end

      source_object = ActiveFedora::Base.load_instance(pid)
      dest_repo.save(Fedora::FedoraObject.new(:pid=>source_object.pid))

      copy_stub_to_dest_repo(source_object)
    end

    # Returns the Rails default logger when it is defined, otherwise a logger
    # writing to STDOUT. The choice is memoized.
    def logger
      @logger ||= defined?(RAILS_DEFAULT_LOGGER) ? RAILS_DEFAULT_LOGGER : Logger.new(STDOUT)
    end

    private

    # Saves a stub for source_object in dest_repo, followed by its canonical
    # jp2 and each datastream named in STUB_DATASTREAMS.
    def copy_stub_to_dest_repo(source_object)
      # NOTE(review): downloadables is not defined in this class; presumably it
      # comes from Stanford::SaltControllerHelper.
      jp2 = downloadables(source_object, :canonical=>true, :mime_type=>"image/jp2")
      jp2.new_object = true
      jp2.control_group = 'M'
      jp2.blob = jp2.content

      dest_repo.save(Fedora::FedoraObject.new(:pid=>source_object.pid))
      dest_repo.save(jp2)

      STUB_DATASTREAMS.each do |ds_name|
        ds = source_object.datastreams[ds_name]
        ds.new_object = true
        ds.blob = ds.content
        dest_repo.save(ds)
      end
    end

  end
end
@@ -0,0 +1,54 @@
1
+
2
+ require 'active-fedora'
3
+
4
+
5
module Solrizer
  # Small set of helpers around ActiveFedora for registering the repository
  # and looking up objects and their datastreams.
  class Repository

    #
    # This method initializes the fedora repository and solr instance
    # using the FEDORA_URL and FEDORA_SOLR_URL constants.
    #
    def initialize_repository
      Fedora::Repository.register( FEDORA_URL )
      ActiveFedora::SolrService.register( FEDORA_SOLR_URL )
    end

    #
    # This method queries solr for "active_fedora_model_field:Document",
    # requesting at most num_docs rows, and returns the SOLR_DOCUMENT_ID
    # value of every hit as an array.
    #
    def self.get_pids( num_docs )
      solr_results = ActiveFedora::SolrService.instance.conn.query( "active_fedora_model_field:Document", { :rows => num_docs } )
      solr_results.hits.map { |hit| hit[SOLR_DOCUMENT_ID] }
    end

    #
    # This method retrieves the object associated with the given unique id
    #
    def self.get_object( pid )
      ActiveFedora::Base.load_instance( pid )
    end

    #
    # This method retrieves the list of datastream names (the keys of
    # obj.datastreams) for the given object
    #
    def self.get_datastreams( obj )
      obj.datastreams.keys
    end

    #
    # This method retrieves the datastream for the given object with the given datastream name.
    # Returns nil when the lookup raises a StandardError (for example when
    # obj is nil).
    #
    def self.get_datastream( obj, ds_name )
      obj.datastreams[ ds_name ]
    rescue
      nil
    end

  end
end
@@ -0,0 +1,33 @@
1
# Rake tasks for indexing Fedora content. Both tasks depend on :environment.
namespace :solrizer do

  # Indexes the single object named by ENV['PID'].
  # Set FULL_TEXT=true to request full-text indexing.
  desc 'Index a fedora object of the given pid.'
  task :solrize => :environment do
    index_full_text = ENV['FULL_TEXT'] == 'true'
    if ENV['PID']
      puts "indexing #{ENV['PID'].inspect}"
      solrizer = Solrizer::Solrizer.new :index_full_text=> index_full_text
      solrizer.solrize(ENV['PID'])
      puts "Finished shelving #{ENV['PID']}"
    else
      # The usage hint names this task as it is actually declared
      # (namespace :solrizer, task :solrize).
      puts "You must provide a pid using the format 'solrizer:solrize PID=sample:pid'."
    end
  end

  # Indexes every object via Solrizer::Solrizer#solrize_objects.
  # Set FULL_TEXT=true to request full-text indexing.
  desc 'Index all objects in the repository.'
  task :solrize_objects => :environment do
    index_full_text = ENV['FULL_TEXT'] == 'true'
    if ENV['INDEX_LIST']
      # NOTE(review): a class variable assigned outside any class body; newer
      # Ruby versions reject class-variable access from the top level, and
      # nothing visible here reads @@index_list — confirm how the list path is
      # meant to reach Solrizer::Solrizer.
      @@index_list = ENV['INDEX_LIST']
    end

    puts "Re-indexing Fedora Repository."
    puts "Fedora URL: #{ActiveFedora.fedora_config[:url]}"
    puts "Fedora Solr URL: #{ActiveFedora.solr_config[:url]}"
    puts "Blacklight Solr Config: #{Blacklight.solr_config.inspect}"
    puts "Doing full text index." if index_full_text
    solrizer = Solrizer::Solrizer.new :index_full_text=> index_full_text
    solrizer.solrize_objects
    puts "Solrizer task complete."
  end

end
@@ -0,0 +1,80 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
# Gem specification for solrizer 0.1.0.pre2: metadata, packaged files, test
# files and dependencies (active-fedora at runtime, rspec for development).
Gem::Specification.new do |s|
  s.name = %q{solrizer}
  s.version = "0.1.0.pre2"

  s.required_rubygems_version = Gem::Requirement.new("> 1.3.1") if s.respond_to? :required_rubygems_version=
  s.authors = ["Matt Zumwalt"]
  s.date = %q{2010-05-15}
  s.description = %q{Use solrizer to populate solr indexes from Fedora repository content or from other sources.  You can run solrizer from within your apps, using the provided rake tasks, or as a JMS listener}
  s.email = %q{matt.zumwalt@yourmediashelf.com}
  s.extra_rdoc_files = [
    "LICENSE",
    "README.rdoc"
  ]
  s.files = [
    ".gitignore",
    "LICENSE",
    "README.rdoc",
    "Rakefile",
    "VERSION",
    "config/fedora.yml",
    "config/hydra_types.yml",
    "config/solr.yml",
    "lib/solrizer.rb",
    "lib/solrizer/configuration.rb",
    "lib/solrizer/extractor.rb",
    "lib/solrizer/indexer.rb",
    "lib/solrizer/main.rb",
    "lib/solrizer/replicator.rb",
    "lib/solrizer/repository.rb",
    "lib/tasks/solrizer.rake",
    "solrizer.gemspec",
    "spec/fixtures/druid-bv448hq0314-descMetadata.xml",
    "spec/fixtures/druid-bv448hq0314-extProperties.xml",
    "spec/fixtures/druid-cm234kq4672-extProperties.xml",
    "spec/fixtures/druid-cm234kq4672-stories.xml",
    "spec/fixtures/druid-hc513kw4806-descMetadata.xml",
    "spec/fixtures/rels_ext_cmodel.xml",
    "spec/integration/indexer_spec.rb",
    "spec/rcov.opts",
    "spec/spec.opts",
    "spec/spec_helper.rb",
    "spec/units/extractor_spec.rb",
    "spec/units/indexer_spec.rb",
    "spec/units/shelver_spec.rb"
  ]
  s.homepage = %q{http://github.com/projecthydra/solrizer}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.6}
  s.summary = %q{A utility for building solr indexes, usually from Fedora repository content.}
  s.test_files = [
    "spec/integration/indexer_spec.rb",
    "spec/spec_helper.rb",
    "spec/units/extractor_spec.rb",
    "spec/units/indexer_spec.rb",
    "spec/units/shelver_spec.rb"
  ]

  if s.respond_to? :specification_version then
    s.specification_version = 3

    # Gem::RubyGemsVersion is absent from current RubyGems; prefer Gem::VERSION
    # and only fall back to the old constant when VERSION is not defined.
    running_rubygems = defined?(Gem::VERSION) ? Gem::VERSION : Gem::RubyGemsVersion

    if Gem::Version.new(running_rubygems) >= Gem::Version.new('1.2.0') then
      s.add_runtime_dependency(%q<active-fedora>, ["> 1.1.3"])
      s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
    else
      s.add_dependency(%q<active-fedora>, ["> 1.1.3"])
      s.add_dependency(%q<rspec>, [">= 1.2.9"])
    end
  else
    s.add_dependency(%q<active-fedora>, ["> 1.1.3"])
    s.add_dependency(%q<rspec>, [">= 1.2.9"])
  end
end
80
+
@@ -0,0 +1,11 @@
1
+ <dc xmlns:dcterms="http://purl.org/dc/terms/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
2
+ <dcterms:type xsi:type="DCMITYPE">text</dcterms:type>
3
+ <dcterms:medium>Paper Document</dcterms:medium>
4
+ <dcterms:rights>Presumed under copyright. Do not publish.</dcterms:rights>
5
+ <dcterms:date>1985-12-30</dcterms:date>
6
+ <dcterms:format>application/tiff</dcterms:format>
7
+ <dcterms:format>application/jp2000</dcterms:format>
8
+ <dcterms:format>application/pdf</dcterms:format>
9
+ <dcterms:title>This is a Sample Title</dcterms:title>
10
+ <dcterms:publisher>Sample Unversity</dcterms:publisher>
11
+ </dc>
@@ -0,0 +1,52 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <document>
3
+ <attributes>
4
+ <attribute type="item">5958</attribute>
5
+ <attribute type="objectid">FEI0010-00013142</attribute>
6
+ <attribute type="title">Letter from Ellie Engelmore to Professor K. C. Reddy</attribute>
7
+ <attribute type="copyright">Presumed under copyright. Do not publish.</attribute>
8
+ <attribute type="description"/>
9
+ <attribute type="date">1985-12-30</attribute>
10
+ <attribute type="datestr">30/12/1985</attribute>
11
+ <attribute type="docurl">https://www.stanford.edu/group/salt_project/SLA/Feigenbaum/eaf7000/pdf/00013142.pdf</attribute>
12
+ <attribute type="doctn">http://www.stanford.edu/group/salt_project/cgi-bin/SLA/Feigenbaum/eaf7000/png/small_00013142.png</attribute>
13
+ <attribute type="url"/>
14
+ <attribute type="industryterm"/>
15
+ <attribute type="technology">artificial intelligence</attribute>
16
+ <attribute type="company"/>
17
+ <attribute type="person">ELLIE ENGELMORE</attribute>
18
+ <attribute type="year">1985</attribute>
19
+ <attribute type="organization">Mathematics and Computer/Information Sciences University of Hyderabad Central University P. O. Hyder</attribute>
20
+ <attribute type="sourcelocation">Folder 15</attribute>
21
+ </attributes>
22
+ <facets>
23
+ <facet type="year" id="49">1980s</facet>
24
+ <facet type="year" id="49">1985</facet>
25
+ <facet type="year" id="42">1980s</facet>
26
+ <facet type="sourcelocation" id="592">Feigenbaum</facet>
27
+ <facet type="sourcelocation" id="592">eaf7000</facet>
28
+ <facet type="sourcelocation" id="592">Box 51A</facet>
29
+ <facet type="sourcelocation" id="594">Feigenbaum</facet>
30
+ <facet type="sourcelocation" id="594">eaf7000</facet>
31
+ <facet type="sourcelocation" id="594">Box 51A</facet>
32
+ <facet type="sourcelocation" id="594">Folder 15</facet>
33
+ <facet type="sourcelocation" id="691">Feigenbaum</facet>
34
+ <facet type="sourcelocation" id="692">Feigenbaum</facet>
35
+ <facet type="sourcelocation" id="692">eaf7000</facet>
36
+ <facet type="doctype" id="32">Correspondence</facet>
37
+ <facet type="city" id="82">Ann Arbor</facet>
38
+ <facet type="city" id="910">Hyderabad</facet>
39
+ <facet type="city" id="1519">Palo Alto</facet>
40
+ <facet type="country" id="68">India</facet>
41
+ <facet type="emailaddress" id="288">EENGELMORE@SUMEX-AIM.ARPA</facet>
42
+ <facet type="organization" id="5065">Heuristic Programming Project</facet>
43
+ <facet type="organization" id="7012">Mathematics and Computer/Information Sciences University of Hyderabad Central University P. O. Hyder</facet>
44
+ <facet type="organization" id="8878">Professor K. C. Reddy School of Mathematics and Computer/Information Sciences</facet>
45
+ <facet type="person" id="5810">ELLIE ENGELMORE</facet>
46
+ <facet type="person" id="17934">Reddy</facet>
47
+ <facet type="person" id="5787">EDWARD FEIGENBAUM</facet>
48
+ <facet type="provinceorstate" id="96">Michigan</facet>
49
+ <facet type="provinceorstate" id="27">California</facet>
50
+ <facet type="technology" id="1713">artificial intelligence</facet>
51
+ </facets>
52
+ </document>
@@ -0,0 +1,5 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <document>
3
+ <attributes><attribute type="item">4290</attribute><attribute type="objectid">FEI0010-00011325</attribute><attribute type="title">Letter from Ellie Engelmore to Wemara Lichty</attribute><attribute type="copyright">Presumed under copyright. Do not publish.</attribute><attribute type="description"/><attribute type="date">1984-6-4</attribute><attribute type="datestr">4/6/1984</attribute><attribute type="docurl">https://www.stanford.edu/group/salt_project/SLA/Feigenbaum/eaf7000/pdf/00011325.pdf</attribute><attribute type="doctn">http://www.stanford.edu/group/salt_project/cgi-bin/SLA/Feigenbaum/eaf7000/png/small_00011325.png</attribute><attribute type="url"/><attribute type="industryterm"/><attribute type="technology">artificial intelligence</attribute><attribute type="company"/><attribute type="person">A. FEIGENBAUM</attribute><attribute type="year">1984</attribute><attribute type="organization">McAlcster Hall University</attribute><attribute type="sourcelocation">Folder 5</attribute></attributes>
4
+ <facets><facet type="year" id="48">1980s</facet><facet type="year" id="48">1984</facet><facet type="year" id="42">1980s</facet><facet type="sourcelocation" id="578">Feigenbaum</facet><facet type="sourcelocation" id="578">eaf7000</facet><facet type="sourcelocation" id="578">Box 51</facet><facet type="sourcelocation" id="587">Feigenbaum</facet><facet type="sourcelocation" id="587">eaf7000</facet><facet type="sourcelocation" id="587">Box 51</facet><facet type="sourcelocation" id="587">Folder 5</facet><facet type="sourcelocation" id="692">Feigenbaum</facet><facet type="sourcelocation" id="692">eaf7000</facet><facet type="sourcelocation" id="691">Feigenbaum</facet><facet type="doctype" id="32">Correspondence</facet><facet type="city" id="1948">Stanford</facet><facet type="country" id="33">Columbia</facet><facet type="facility" id="2551">U. Missouri library</facet><facet type="organization" id="5065">Heuristic Programming Project</facet><facet type="organization" id="7026">McAlcster Hall University</facet><facet type="organization" id="9645">STANFORD UNIVERSITY</facet><facet type="organization" id="11964">University of Missouri</facet><facet type="organization" id="12407">Wemara Lichty Psychology Department</facet><facet type="person" id="15650">Morton Hunt</facet><facet type="person" id="37">A. FEIGENBAUM</facet><facet type="person" id="5810">ELLIE ENGELMORE</facet><facet type="provinceorstate" id="27">California</facet><facet type="provinceorstate" id="100">Missouri</facet><facet type="publishedmedium" id="44">Artificial Intelligence</facet><facet type="technology" id="1713">artificial intelligence</facet></facets>
5
+ </document>
@@ -0,0 +1,17 @@
1
+
2
+ <html>
3
+ <body>
4
+ <pre>
5
+ This is
6
+ preformatted text.
7
+ It preserves both spaces
8
+ and line breaks.
9
+ </pre>
10
+ <p>The pre tag is good for displaying computer code:</p>
11
+ <pre>
12
+ for i = 1 to 10
13
+ print i
14
+ next i
15
+ </pre>
16
+ </body>
17
+ </html>
@@ -0,0 +1,11 @@
1
+ <dc xmlns:dcterms="http://purl.org/dc/terms/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
2
+ <dcterms:type xsi:type="DCMITYPE">text</dcterms:type>
3
+ <dcterms:medium>Paper Document</dcterms:medium>
4
+ <dcterms:rights>Copyright © 2006 All rights reserved. Distribution for commercial purposes is prohibited.</dcterms:rights>
5
+ <dcterms:date/>
6
+ <dcterms:format>application/tiff</dcterms:format>
7
+ <dcterms:format>application/jp2000</dcterms:format>
8
+ <dcterms:format>application/pdf</dcterms:format>
9
+ <dcterms:title>The Rise and Fall of the YouTube Empire</dcterms:title>
10
+ <dcterms:publisher>Sample Unversity</dcterms:publisher>
11
+ </dc>
@@ -0,0 +1,8 @@
1
+ <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
2
+ <rdf:Description rdf:about="info:fedora/demo:multipurpose-objects-model_and_sdef">
3
+ <hasModel xmlns="info:fedora/fedora-system:def/model#" rdf:resource="info:fedora/fedora-system:ContentModel-3.0"/>
4
+ <hasModel xmlns="info:fedora/fedora-system:def/model#" rdf:resource="info:fedora/afmodel:SaltDocument"/>
5
+ <hasModel xmlns="info:fedora/fedora-system:def/model#" rdf:resource="info:fedora/afmodel:JP2Document"/>
6
+ <hasModel xmlns="info:fedora/fedora-system:def/model#" rdf:resource="info:fedora/afmodel:DCDocument"/>
7
+ </rdf:Description>
8
+ </rdf:RDF>
@@ -0,0 +1,18 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
+ require 'solrizer'
3
+
4
# Integration spec for Solrizer::Indexer#index against a live repository.
describe Solrizer::Indexer do

  # A fresh indexer for every example.
  before(:each) { @indexer = Solrizer::Indexer.new }

  describe "index" do
    it "should update solr with the metadata from the given object" do
      # Marked pending until the gem decides how fixtures are provided.
      pending "Got to decide if/how to handle fixtures in this gem.  Probably should just mock out Fedora & Solr entirely."
      fedora_object = Solrizer::Repository.get_object( "druid:sb733gr4073" )
      @indexer.index( fedora_object )
    end
  end

end