solrizer 0.1.0.pre2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,17 @@
1
#!/usr/bin/env ruby
#
# Command-line entry point: registers the Fedora repository and then
# solrizes every object in it.
#
# The interpreter is located through /usr/bin/env, the conventional location
# of `env`; /bin/env does not exist on many systems.

# Full-text indexing is switched off for this script.
@index_full_text = false

require 'rubygems'
# The project files are pulled in with `load`, so they are resolved against
# the load path / current directory and re-evaluated on every run.
load 'configuration.rb'
load 'repository.rb'
load 'solrizer.rb'

# initialize connection to Fedora repository
repository = Repository.new
repository.initialize_repository

# solrize all objects in the Fedora repository
solrizer = Solrizer.new
solrizer.solrize_objects
17
+
@@ -0,0 +1,143 @@
1
+ require 'fastercsv'
2
+ REPLICATOR_LIST = false unless defined?(REPLICATOR_LIST)
3
+
4
+
5
module Solrizer
  # Replicates objects from a source Fedora repository into a destination
  # repository as "stub" objects: the canonical jp2 plus a fixed set of
  # datastreams (see STUB_DATASTREAMS).
  #
  # Source and destination URLs are read from config/replicator.yml, keyed by
  # RAILS_ENV. The objects to replicate are either every pid reported by
  # Repository.get_pids, or the pids listed in the CSV file named by the
  # REPLICATOR_LIST constant when that constant is not false.
  class Replicator

    include Stanford::SaltControllerHelper
    attr_accessor :dest_repo, :configs

    # Datastreams copied verbatim from the source object into every stub.
    STUB_DATASTREAMS = ["properties", "extProperties", "descMetadata", "location"].freeze

    # Placeholder image attached to source objects for which the normal copy
    # fails. The path is relative to the current working directory.
    PLACEHOLDER_JP2_PATH = 'spec/fixtures/image.jp2'

    # Reads the replicator configuration, opens the destination repository and
    # re-registers ActiveFedora (Fedora and Solr) against the source URLs.
    def initialize
      config_path = "#{RAILS_ROOT}/config/replicator.yml"
      # Block form so the config file handle is closed once it has been parsed.
      raw_configs = File.open(config_path) { |config_file| YAML::load(config_file) }
      @configs = raw_configs[RAILS_ENV]
      @dest_repo = Fedora::Repository.new(configs["destination"]["fedora"]["url"])

      ActiveFedora.fedora_config[:url] = configs["source"]["fedora"]["url"]
      logger.info("REPLICATOR: re-initializing Fedora with fedora_config: #{ActiveFedora.fedora_config.inspect}")

      Fedora::Repository.register(ActiveFedora.fedora_config[:url])
      logger.info("REPLICATOR: re-initialized Fedora as: #{Fedora::Repository.instance.inspect}")

      # Register Solr
      ActiveFedora.solr_config[:url] = configs["source"]["solr"]["url"]

      logger.info("REPLICATOR: re-initializing ActiveFedora::SolrService with solr_config: #{ActiveFedora.solr_config.inspect}")

      ActiveFedora::SolrService.register(ActiveFedora.solr_config[:url])
    end

    # Replicates either every object known to the source repository
    # (REPLICATOR_LIST == false) or the pids listed in the CSV file that
    # REPLICATOR_LIST points to. A missing list file is reported on stdout.
    def replicate_objects
      # modify this number to guarantee that all the objects are retrieved from the repository
      num_docs = 1000000

      if REPLICATOR_LIST == false
        replicate_from_repository(num_docs)
      elsif File.exist?(REPLICATOR_LIST)
        replicate_from_list(REPLICATOR_LIST)
      else
        puts "#{REPLICATOR_LIST} does not exist!"
      end
    end

    # Replicates a single object.
    #
    # obj - an ActiveFedora::Base instance, or a pid that is looked up through
    #       Repository.get_object.
    #
    # Failures while creating the stub are reported on stdout and do not
    # propagate, so a batch run continues with the next object.
    def replicate_object(obj)
      obj = obj.kind_of?(ActiveFedora::Base) ? obj : Repository.get_object( obj )

      # Guard before touching obj.pid / obj.label: a failed lookup used to
      # raise NoMethodError here, before the nil check was ever reached.
      if obj.nil?
        p "unable to load object. Skipping"
        return
      end

      p "Indexing object #{obj.pid} with label #{obj.label}"
      begin
        create_stub(obj)
        p "Successfully replicated #{obj.pid}"
      rescue StandardError => e
        # StandardError rather than Exception so Ctrl-C and exit requests
        # still stop a long-running batch.
        p "unable to create stub. Failed with #{e.inspect}"
      end
    end

    # Creates a stub object in @dest_repo with the datastreams that we need in the stubs
    #
    # If the copy fails for any reason, a placeholder jp2 is attached to the
    # SOURCE object, the object is reloaded, and the copy is attempted once
    # more; an error from that second attempt propagates to the caller.
    def create_stub(source_object)
      copy_to_destination(source_object)
    rescue
      # for object without jp2s
      # this is a temp fix to the downloadables() issue
      # NOTE(review): this fallback fires on ANY StandardError from the first
      # attempt, not only on a missing jp2.
      pid = source_object.pid
      p "> #{pid}"

      attach_placeholder_jp2(source_object)
      source_object = ActiveFedora::Base.load_instance(pid)
      copy_to_destination(source_object)
    end

    # Returns the Rails default logger when one is defined, otherwise a
    # Logger writing to STDOUT. The choice is memoized.
    def logger
      @logger ||= defined?(RAILS_DEFAULT_LOGGER) ? RAILS_DEFAULT_LOGGER : Logger.new(STDOUT)
    end

    private

    # Replicates every "druid:" pid returned by Repository.get_pids.
    def replicate_from_repository(num_docs)
      pids = Repository.get_pids( num_docs )
      puts "Replicating #{pids.length} Fedora objects"
      pids.each do |pid|
        # The original indexed pid[0], i.e. it expected each pid to arrive
        # wrapped in an Array. Array() accepts both a wrapped and a bare pid
        # and turns nil into "no id" instead of raising.
        id = Array(pid).first
        next unless id && id.to_s.include?("druid:")
        puts "Processing #{id}"
        replicate_object( id )
      end
    end

    # Replicates the pids found in the first column of the CSV file at list_path.
    def replicate_from_list(list_path)
      arr_of_pids = FasterCSV.read(list_path, :headers=>false)

      puts "Replicating from list at #{list_path}"
      puts "Replicating #{arr_of_pids.length} Fedora objects"

      arr_of_pids.each do |row|
        pid = row[0]
        # Blank lines in the list yield rows without a pid; skip them.
        next if pid.nil? || pid.strip.empty?
        replicate_object( pid )
      end
    end

    # Saves a stub for source_object in dest_repo: first the canonical jp2
    # datastream, then each datastream named in STUB_DATASTREAMS.
    def copy_to_destination(source_object)
      jp2 = downloadables(source_object, :canonical=>true, :mime_type=>"image/jp2")
      jp2.new_object = true
      jp2.control_group = 'M'
      jp2.blob = jp2.content

      stub_object = Fedora::FedoraObject.new(:pid=>source_object.pid)
      dest_repo.save(stub_object)
      dest_repo.save(jp2)

      STUB_DATASTREAMS.each do |ds_name|
        ds = source_object.datastreams[ds_name]
        ds.new_object = true
        ds.blob = ds.content
        dest_repo.save(ds)
      end
    end

    # Adds the placeholder image to source_object as an "image.jp2" datastream
    # and saves the object. The file is opened in block form so the handle is
    # released once the save has finished.
    def attach_placeholder_jp2(source_object)
      File.open(PLACEHOLDER_JP2_PATH) do |jp2_file|
        ds = ActiveFedora::Datastream.new(:dsID => "image.jp2", :dsLabel => 'image.jp2', :controlGroup => 'M', :blob => jp2_file)
        source_object.add_datastream(ds)
        source_object.save
      end
    end

  end
end
@@ -0,0 +1,54 @@
1
+
2
+ require 'active-fedora'
3
+
4
+
5
module Solrizer
  # Convenience wrappers around the Fedora repository and its Solr index.
  class Repository

    #
    # This method initializes the fedora repository and solr instance,
    # registering them at FEDORA_URL and FEDORA_SOLR_URL respectively
    #
    def initialize_repository
      Fedora::Repository.register( FEDORA_URL )
      ActiveFedora::SolrService.register( FEDORA_SOLR_URL )
    end

    #
    # This method retrieves a list of unique ids in the fedora repository by
    # querying Solr for documents matching "active_fedora_model_field:Document".
    #
    # num_docs - maximum number of rows requested from Solr
    #
    # Returns an Array holding the SOLR_DOCUMENT_ID value of every hit.
    #
    def self.get_pids( num_docs )
      solr_results = ActiveFedora::SolrService.instance.conn.query( "active_fedora_model_field:Document", { :rows => num_docs } )
      solr_results.hits.map { |hit| hit[SOLR_DOCUMENT_ID] }
    end

    #
    # This method retrieves the object associated with the given unique id
    #
    def self.get_object( pid )
      ActiveFedora::Base.load_instance( pid )
    end

    #
    # This method retrieves the names (keys) of all datastreams of the given object
    #
    def self.get_datastreams( obj )
      obj.datastreams.keys
    end

    #
    # This method retrieves the datastream for the given object with the given datastream name.
    # Returns nil when the lookup raises a StandardError (for example when obj is nil).
    #
    def self.get_datastream( obj, ds_name )
      obj.datastreams[ ds_name ]
    rescue
      nil
    end

  end
end
@@ -0,0 +1,33 @@
1
# Rake tasks for indexing Fedora content into Solr.
#
#   rake solrizer:solrize PID=sample:pid [FULL_TEXT=true]
#   rake solrizer:solrize_objects [FULL_TEXT=true] [INDEX_LIST=path]
namespace :solrizer do

  desc 'Index a fedora object of the given pid.'
  task :solrize => :environment do
    # Full-text indexing is opt-in: only the exact string 'true' enables it.
    index_full_text = ENV['FULL_TEXT'] == 'true'
    if ENV['PID']
      puts "indexing #{ENV['PID'].inspect}"
      solrizer = Solrizer::Solrizer.new :index_full_text=> index_full_text
      solrizer.solrize(ENV['PID'])
      puts "Finished indexing #{ENV['PID']}"
    else
      # The usage hint names the task as it is actually defined above.
      puts "You must provide a pid using the format 'rake solrizer:solrize PID=sample:pid'."
    end
  end

  desc 'Index all objects in the repository.'
  task :solrize_objects => :environment do
    index_full_text = ENV['FULL_TEXT'] == 'true'
    # NOTE(review): this sets a class variable from top-level scope. Whatever
    # reads @@index_list is not visible in this file, so it is left as-is;
    # confirm the consumer before changing it. Recent Ruby versions reject
    # class-variable access from the top level, so this needs revisiting.
    if ENV['INDEX_LIST']
      @@index_list = ENV['INDEX_LIST']
    end

    puts "Re-indexing Fedora Repository."
    puts "Fedora URL: #{ActiveFedora.fedora_config[:url]}"
    puts "Fedora Solr URL: #{ActiveFedora.solr_config[:url]}"
    puts "Blacklight Solr Config: #{Blacklight.solr_config.inspect}"
    puts "Doing full text index." if index_full_text
    solrizer = Solrizer::Solrizer.new :index_full_text=> index_full_text
    solrizer.solrize_objects
    puts "Solrizer task complete."
  end

end
@@ -0,0 +1,80 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
# Gem specification for solrizer 0.1.0.pre2.
# NOTE(review): this file is generated by jeweler; carry the change below into
# the Rakefile / a newer jeweler so it is not lost on regeneration.
Gem::Specification.new do |s|
  s.name = %q{solrizer}
  s.version = "0.1.0.pre2"

  s.required_rubygems_version = Gem::Requirement.new("> 1.3.1") if s.respond_to? :required_rubygems_version=
  s.authors = ["Matt Zumwalt"]
  s.date = %q{2010-05-15}
  s.description = %q{Use solrizer to populate solr indexes from Fedora repository content or from other sources.  You can run solrizer from within your apps, using the provided rake tasks, or as a JMS listener}
  s.email = %q{matt.zumwalt@yourmediashelf.com}
  s.extra_rdoc_files = [
    "LICENSE",
    "README.rdoc"
  ]
  s.files = [
    ".gitignore",
    "LICENSE",
    "README.rdoc",
    "Rakefile",
    "VERSION",
    "config/fedora.yml",
    "config/hydra_types.yml",
    "config/solr.yml",
    "lib/solrizer.rb",
    "lib/solrizer/configuration.rb",
    "lib/solrizer/extractor.rb",
    "lib/solrizer/indexer.rb",
    "lib/solrizer/main.rb",
    "lib/solrizer/replicator.rb",
    "lib/solrizer/repository.rb",
    "lib/tasks/solrizer.rake",
    "solrizer.gemspec",
    "spec/fixtures/druid-bv448hq0314-descMetadata.xml",
    "spec/fixtures/druid-bv448hq0314-extProperties.xml",
    "spec/fixtures/druid-cm234kq4672-extProperties.xml",
    "spec/fixtures/druid-cm234kq4672-stories.xml",
    "spec/fixtures/druid-hc513kw4806-descMetadata.xml",
    "spec/fixtures/rels_ext_cmodel.xml",
    "spec/integration/indexer_spec.rb",
    "spec/rcov.opts",
    "spec/spec.opts",
    "spec/spec_helper.rb",
    "spec/units/extractor_spec.rb",
    "spec/units/indexer_spec.rb",
    "spec/units/shelver_spec.rb"
  ]
  s.homepage = %q{http://github.com/projecthydra/solrizer}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.6}
  s.summary = %q{A utility for building solr indexes, usually from Fedora repository content.}
  s.test_files = [
    "spec/integration/indexer_spec.rb",
    "spec/spec_helper.rb",
    "spec/units/extractor_spec.rb",
    "spec/units/indexer_spec.rb",
    "spec/units/shelver_spec.rb"
  ]

  if s.respond_to? :specification_version then
    s.specification_version = 3

    # Gem::RubyGemsVersion no longer exists in current RubyGems; Gem::VERSION
    # is its replacement. Fall back to the old constant only where the new one
    # is not defined, so very old RubyGems still load this file.
    installed_rubygems = defined?(Gem::VERSION) ? Gem::VERSION : Gem::RubyGemsVersion

    # RubyGems >= 1.2.0 can distinguish runtime from development dependencies;
    # older versions only know add_dependency.
    if Gem::Version.new(installed_rubygems) >= Gem::Version.new('1.2.0') then
      s.add_runtime_dependency(%q<active-fedora>, ["> 1.1.3"])
      s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
    else
      s.add_dependency(%q<active-fedora>, ["> 1.1.3"])
      s.add_dependency(%q<rspec>, [">= 1.2.9"])
    end
  else
    s.add_dependency(%q<active-fedora>, ["> 1.1.3"])
    s.add_dependency(%q<rspec>, [">= 1.2.9"])
  end
end
80
+
@@ -0,0 +1,11 @@
1
+ <dc xmlns:dcterms="http://purl.org/dc/terms/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
2
+ <dcterms:type xsi:type="DCMITYPE">text</dcterms:type>
3
+ <dcterms:medium>Paper Document</dcterms:medium>
4
+ <dcterms:rights>Presumed under copyright. Do not publish.</dcterms:rights>
5
+ <dcterms:date>1985-12-30</dcterms:date>
6
+ <dcterms:format>application/tiff</dcterms:format>
7
+ <dcterms:format>application/jp2000</dcterms:format>
8
+ <dcterms:format>application/pdf</dcterms:format>
9
+ <dcterms:title>This is a Sample Title</dcterms:title>
10
+ <dcterms:publisher>Sample Unversity</dcterms:publisher>
11
+ </dc>
@@ -0,0 +1,52 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <document>
3
+ <attributes>
4
+ <attribute type="item">5958</attribute>
5
+ <attribute type="objectid">FEI0010-00013142</attribute>
6
+ <attribute type="title">Letter from Ellie Engelmore to Professor K. C. Reddy</attribute>
7
+ <attribute type="copyright">Presumed under copyright. Do not publish.</attribute>
8
+ <attribute type="description"/>
9
+ <attribute type="date">1985-12-30</attribute>
10
+ <attribute type="datestr">30/12/1985</attribute>
11
+ <attribute type="docurl">https://www.stanford.edu/group/salt_project/SLA/Feigenbaum/eaf7000/pdf/00013142.pdf</attribute>
12
+ <attribute type="doctn">http://www.stanford.edu/group/salt_project/cgi-bin/SLA/Feigenbaum/eaf7000/png/small_00013142.png</attribute>
13
+ <attribute type="url"/>
14
+ <attribute type="industryterm"/>
15
+ <attribute type="technology">artificial intelligence</attribute>
16
+ <attribute type="company"/>
17
+ <attribute type="person">ELLIE ENGELMORE</attribute>
18
+ <attribute type="year">1985</attribute>
19
+ <attribute type="organization">Mathematics and Computer/Information Sciences University of Hyderabad Central University P. O. Hyder</attribute>
20
+ <attribute type="sourcelocation">Folder 15</attribute>
21
+ </attributes>
22
+ <facets>
23
+ <facet type="year" id="49">1980s</facet>
24
+ <facet type="year" id="49">1985</facet>
25
+ <facet type="year" id="42">1980s</facet>
26
+ <facet type="sourcelocation" id="592">Feigenbaum</facet>
27
+ <facet type="sourcelocation" id="592">eaf7000</facet>
28
+ <facet type="sourcelocation" id="592">Box 51A</facet>
29
+ <facet type="sourcelocation" id="594">Feigenbaum</facet>
30
+ <facet type="sourcelocation" id="594">eaf7000</facet>
31
+ <facet type="sourcelocation" id="594">Box 51A</facet>
32
+ <facet type="sourcelocation" id="594">Folder 15</facet>
33
+ <facet type="sourcelocation" id="691">Feigenbaum</facet>
34
+ <facet type="sourcelocation" id="692">Feigenbaum</facet>
35
+ <facet type="sourcelocation" id="692">eaf7000</facet>
36
+ <facet type="doctype" id="32">Correspondence</facet>
37
+ <facet type="city" id="82">Ann Arbor</facet>
38
+ <facet type="city" id="910">Hyderabad</facet>
39
+ <facet type="city" id="1519">Palo Alto</facet>
40
+ <facet type="country" id="68">India</facet>
41
+ <facet type="emailaddress" id="288">EENGELMORE@SUMEX-AIM.ARPA</facet>
42
+ <facet type="organization" id="5065">Heuristic Programming Project</facet>
43
+ <facet type="organization" id="7012">Mathematics and Computer/Information Sciences University of Hyderabad Central University P. O. Hyder</facet>
44
+ <facet type="organization" id="8878">Professor K. C. Reddy School of Mathematics and Computer/Information Sciences</facet>
45
+ <facet type="person" id="5810">ELLIE ENGELMORE</facet>
46
+ <facet type="person" id="17934">Reddy</facet>
47
+ <facet type="person" id="5787">EDWARD FEIGENBAUM</facet>
48
+ <facet type="provinceorstate" id="96">Michigan</facet>
49
+ <facet type="provinceorstate" id="27">California</facet>
50
+ <facet type="technology" id="1713">artificial intelligence</facet>
51
+ </facets>
52
+ </document>
@@ -0,0 +1,5 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <document>
3
+ <attributes><attribute type="item">4290</attribute><attribute type="objectid">FEI0010-00011325</attribute><attribute type="title">Letter from Ellie Engelmore to Wemara Lichty</attribute><attribute type="copyright">Presumed under copyright. Do not publish.</attribute><attribute type="description"/><attribute type="date">1984-6-4</attribute><attribute type="datestr">4/6/1984</attribute><attribute type="docurl">https://www.stanford.edu/group/salt_project/SLA/Feigenbaum/eaf7000/pdf/00011325.pdf</attribute><attribute type="doctn">http://www.stanford.edu/group/salt_project/cgi-bin/SLA/Feigenbaum/eaf7000/png/small_00011325.png</attribute><attribute type="url"/><attribute type="industryterm"/><attribute type="technology">artificial intelligence</attribute><attribute type="company"/><attribute type="person">A. FEIGENBAUM</attribute><attribute type="year">1984</attribute><attribute type="organization">McAlcster Hall University</attribute><attribute type="sourcelocation">Folder 5</attribute></attributes>
4
+ <facets><facet type="year" id="48">1980s</facet><facet type="year" id="48">1984</facet><facet type="year" id="42">1980s</facet><facet type="sourcelocation" id="578">Feigenbaum</facet><facet type="sourcelocation" id="578">eaf7000</facet><facet type="sourcelocation" id="578">Box 51</facet><facet type="sourcelocation" id="587">Feigenbaum</facet><facet type="sourcelocation" id="587">eaf7000</facet><facet type="sourcelocation" id="587">Box 51</facet><facet type="sourcelocation" id="587">Folder 5</facet><facet type="sourcelocation" id="692">Feigenbaum</facet><facet type="sourcelocation" id="692">eaf7000</facet><facet type="sourcelocation" id="691">Feigenbaum</facet><facet type="doctype" id="32">Correspondence</facet><facet type="city" id="1948">Stanford</facet><facet type="country" id="33">Columbia</facet><facet type="facility" id="2551">U. Missouri library</facet><facet type="organization" id="5065">Heuristic Programming Project</facet><facet type="organization" id="7026">McAlcster Hall University</facet><facet type="organization" id="9645">STANFORD UNIVERSITY</facet><facet type="organization" id="11964">University of Missouri</facet><facet type="organization" id="12407">Wemara Lichty Psychology Department</facet><facet type="person" id="15650">Morton Hunt</facet><facet type="person" id="37">A. FEIGENBAUM</facet><facet type="person" id="5810">ELLIE ENGELMORE</facet><facet type="provinceorstate" id="27">California</facet><facet type="provinceorstate" id="100">Missouri</facet><facet type="publishedmedium" id="44">Artificial Intelligence</facet><facet type="technology" id="1713">artificial intelligence</facet></facets>
5
+ </document>
@@ -0,0 +1,17 @@
1
+
2
+ <html>
3
+ <body>
4
+ <pre>
5
+ This is
6
+ preformatted text.
7
+ It preserves both spaces
8
+ and line breaks.
9
+ </pre>
10
+ <p>The pre tag is good for displaying computer code:</p>
11
+ <pre>
12
+ for i = 1 to 10
13
+ print i
14
+ next i
15
+ </pre>
16
+ </body>
17
+ </html>
@@ -0,0 +1,11 @@
1
+ <dc xmlns:dcterms="http://purl.org/dc/terms/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
2
+ <dcterms:type xsi:type="DCMITYPE">text</dcterms:type>
3
+ <dcterms:medium>Paper Document</dcterms:medium>
4
+ <dcterms:rights>Copyright © 2006 All rights reserved. Distribution for commercial purposes is prohibited.</dcterms:rights>
5
+ <dcterms:date/>
6
+ <dcterms:format>application/tiff</dcterms:format>
7
+ <dcterms:format>application/jp2000</dcterms:format>
8
+ <dcterms:format>application/pdf</dcterms:format>
9
+ <dcterms:title>The Rise and Fall of the YouTube Empire</dcterms:title>
10
+ <dcterms:publisher>Sample Unversity</dcterms:publisher>
11
+ </dc>
@@ -0,0 +1,8 @@
1
+ <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
2
+ <rdf:Description rdf:about="info:fedora/demo:multipurpose-objects-model_and_sdef">
3
+ <hasModel xmlns="info:fedora/fedora-system:def/model#" rdf:resource="info:fedora/fedora-system:ContentModel-3.0"/>
4
+ <hasModel xmlns="info:fedora/fedora-system:def/model#" rdf:resource="info:fedora/afmodel:SaltDocument"/>
5
+ <hasModel xmlns="info:fedora/fedora-system:def/model#" rdf:resource="info:fedora/afmodel:JP2Document"/>
6
+ <hasModel xmlns="info:fedora/fedora-system:def/model#" rdf:resource="info:fedora/afmodel:DCDocument"/>
7
+ </rdf:Description>
8
+ </rdf:RDF>
@@ -0,0 +1,18 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
+ require 'solrizer'
3
+
4
# Integration spec for Solrizer::Indexer. The single example is marked
# pending until the gem settles on how to provide Fedora/Solr fixtures.
describe Solrizer::Indexer do

  # A fresh indexer is built for every example.
  before do
    @indexer = Solrizer::Indexer.new
  end

  describe "index" do
    it "should update solr with the metadata from the given object" do
      pending "Got to decide if/how to handle fixtures in this gem.  Probably should just mock out Fedora & Solr entirely."
      fedora_object = Solrizer::Repository.get_object( "druid:sb733gr4073" )
      @indexer.index( fedora_object )
    end
  end

end