RubyGems - shelver - Versions diffs - 0.0.0 - Mend

shelver 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

data/.gitignore +19 -0
data/LICENSE +20 -0
data/README.rdoc +17 -0
data/Rakefile +45 -0
data/VERSION +1 -0
data/config/hydra_types.yml +4 -0
data/config/solr.yml +24 -0
data/lib/shelver/configuration.rb +8 -0
data/lib/shelver/extractor.rb +89 -0
data/lib/shelver/indexer.rb +251 -0
data/lib/shelver/main.rb +17 -0
data/lib/shelver/replicator.rb +143 -0
data/lib/shelver/repository.rb +54 -0
data/lib/shelver.rb +103 -0
data/lib/tasks/shelver.rake +33 -0
data/shelver.gemspec +74 -0
data/spec/fixtures/druid-bv448hq0314-descMetadata.xml +11 -0
data/spec/fixtures/druid-bv448hq0314-extProperties.xml +52 -0
data/spec/fixtures/druid-cm234kq4672-extProperties.xml +5 -0
data/spec/fixtures/druid-cm234kq4672-stories.xml +17 -0
data/spec/fixtures/druid-hc513kw4806-descMetadata.xml +11 -0
data/spec/fixtures/rels_ext_cmodel.xml +8 -0
data/spec/rcov.opts +2 -0
data/spec/spec.opts +1 -0
data/spec/spec_helper.rb +16 -0
data/spec/units/extractor_spec.rb +50 -0
data/spec/units/indexer_spec.rb +127 -0
data/spec/units/shelver_spec.rb +42 -0
metadata +106 -0

data/.gitignore ADDED Viewed

@@ -0,0 +1,19 @@
+.DS_Store
+nohup.out
+*.sqlite3
+*.log
+*~
+*.swp
+pkg/
+coverage/*
+tmp/**/*
+tmp/performance
+rerun.txt
+.loadpath
+.project
+.buildpath

data/LICENSE ADDED Viewed

@@ -0,0 +1,20 @@
+Copyright (c) 2009 Matt Zumwalt
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.rdoc ADDED Viewed

@@ -0,0 +1,17 @@
+= shelver
+A utility for building solr indexes, usually from Fedora repository content.
+== Note on Patches/Pull Requests
+* Fork the project.
+* Make your feature addition or bug fix.
+* Add tests for it. This is important so I don't break it in a
+  future version unintentionally.
+* Commit, do not mess with rakefile, version, or history.
+  (if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
+* Send me a pull request. Bonus points for topic branches.
+== Copyright
+Copyright (c) 2010 Matt Zumwalt. See LICENSE for details.

data/Rakefile ADDED Viewed

@@ -0,0 +1,45 @@
+require 'rubygems'
+require 'rake'
+begin
+  require 'jeweler'
+  Jeweler::Tasks.new do |gem|
+    gem.name = "shelver"
+    gem.summary = %Q{A utility for building solr indexes, usually from Fedora repository content.}
+    gem.description = %Q{Use shelver to populate solr indexes from Fedora repository content or from other sources.  You can run shelver from within your apps, using the provided rake tasks, or as a JMS listener}
+    gem.email = "matt.zumwalt@yourmediashelf.com"
+    gem.homepage = "http://github.com/mediashelf/shelver"
+    gem.authors = ["Matt Zumwalt"]
+    gem.add_development_dependency "rspec", ">= 1.2.9"
+    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
+  end
+  Jeweler::GemcutterTasks.new
+rescue LoadError
+  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
+end
+require 'spec/rake/spectask'
+Spec::Rake::SpecTask.new(:spec) do |spec|
+  spec.libs << 'lib' << 'spec'
+  spec.spec_files = FileList['spec/**/*_spec.rb']
+end
+Spec::Rake::SpecTask.new(:rcov) do |spec|
+  spec.libs << 'lib' << 'spec'
+  spec.pattern = 'spec/**/*_spec.rb'
+  spec.rcov = true
+end
+task :spec => :check_dependencies
+task :default => :spec
+require 'rake/rdoctask'
+Rake::RDocTask.new do |rdoc|
+  version = File.exist?('VERSION') ? File.read('VERSION') : ""
+  rdoc.rdoc_dir = 'rdoc'
+  rdoc.title = "shelver #{version}"
+  rdoc.rdoc_files.include('README*')
+  rdoc.rdoc_files.include('lib/**/*.rb')
+end

data/VERSION ADDED Viewed

@@ -0,0 +1 @@
+0.0.0

data/config/hydra_types.yml ADDED Viewed

@@ -0,0 +1,4 @@
+info:fedora/afmodel:SaltDocument : salt_document
+info:fedora/afmodel:JP2Document : jp2_document
+info:fedora/afmodel:ModsDocument : mods_document
+info:fedora/afmodel:DCDocument : dc_document

data/config/solr.yml ADDED Viewed

@@ -0,0 +1,24 @@
+staging:
+  default:
+    url: http://salt-dev.stanford.edu:8080/bl_solr/core0
+  fulltext:
+    url: http://salt-dev.stanford.edu:8080/bl_solr/core1
+development:
+  default:
+    url: http://localhost:8080/bl_solr/core0
+  fulltext:
+    url: http://localhost:8080/bl_solr/core1
+test: &TEST
+  default:
+    url: http://localhost:8080/bl_solr/core0
+  fulltext:
+    url: http://localhost:8080/bl_solr/core1
+production:
+  default:
+    url: http://salt-dev.stanford.edu:8080/bl_solr/core0
+  fulltext:
+    url: http://salt-dev.stanford.edu:8080/bl_solr/core1
+# cucumber reuses the test settings (the key was previously listed twice)
+cucumber:
+  <<: *TEST

data/lib/shelver/configuration.rb ADDED Viewed

@@ -0,0 +1,8 @@
+# FEDORA_URL = 'http://fedoraAdmin:fedoraAdmin@salt-dev.stanford.edu/fedora'
+# FEDORA_SOLR_URL = 'http://salt-dev.stanford.edu:8080/solr'
+# SHELVER_SOLR_URL = 'http://sulwebappdev1.stanford.edu:8100/salt_solr'
+FEDORA_URL = 'http://fedoraAdmin:fedoraAdmin@localhost:8080/fedora'
+FEDORA_SOLR_URL = 'http://localhost:8080/solr'
+SHELVER_SOLR_URL = 'http://localhost:8080/bl_solr'

data/lib/shelver/extractor.rb ADDED Viewed

@@ -0,0 +1,89 @@
+require 'cgi'
+require 'rexml/document'
+require 'yaml'
+require "nokogiri"
+require 'solr'
+module Shelver
+class Extractor
+  def extract_tags(text)
+    doc = REXML::Document.new( text )
+    extract_tag(doc, 'archivist_tags').merge(extract_tag(doc, 'donor_tags'))
+  end
+  def extract_tag(doc, type)
+    tags = doc.elements["/fields/#{type}"]
+    return {} unless tags
+    {type => tags.text.split(/,/).map {|t| t.strip}}
+  end
+  #
+  # Extracts content-model and hydra-type from RELS-EXT datastream
+  #
+  def extract_rels_ext( text, solr_doc=Solr::Document.new )
+    # TODO: only read in this file once
+    if defined?(RAILS_ROOT)
+      config_path = File.join(RAILS_ROOT, "config")
+    else
+      config_path = File.join(File.dirname(__FILE__), "..", "..", "config")
+    end
+    map = YAML.load(File.open(File.join(config_path, "hydra_types.yml")))
+    doc = Nokogiri::XML(text)
+    doc.xpath( '//foo:hasModel', 'foo' => 'info:fedora/fedora-system:def/model#' ).each do |element|
+      cmodel = element.attributes['resource'].to_s
+      solr_doc << Solr::Field.new( :cmodel_t => cmodel )
+      if map.has_key?(cmodel)
+        solr_doc << Solr::Field.new( :hydra_type_t => map[cmodel] )
+      end
+    end
+    return solr_doc
+  end
+  #
+  # This method extracts solr fields from simple xml
+  #
+  def xml_to_solr( text, solr_doc=Solr::Document.new )
+    doc = REXML::Document.new( text )
+    doc.root.elements.each do |element|
+      solr_doc << Solr::Field.new( :"#{element.name}_t" => "#{element.text}" )
+    end
+    return solr_doc
+  end
+  #
+  # This method strips html tags out and returns content to be indexed in solr
+  #
+  def html_content_to_solr( ds, solr_doc=Solr::Document.new )
+    text = CGI.unescapeHTML(ds.content)
+    doc = Nokogiri::HTML(text)
+    # html to story_display
+    stories = doc.xpath('//story')
+    stories.each do |story|
+      solr_doc << Solr::Field.new(:story_display => story.children.to_xml)
+    end
+    #strip out text and put in story_t
+    text_nodes = doc.xpath("//text()")
+    text = String.new
+     text_nodes.each do |text_node|
+       text << text_node.content
+     end
+     solr_doc << Solr::Field.new(:story_t => text)
+     return solr_doc
+  end
+end
+end

data/lib/shelver/indexer.rb ADDED Viewed

@@ -0,0 +1,251 @@
+require 'solr'
+require 'shelver/extractor'
+require 'shelver/repository'
+module Shelver
+class Indexer
+  #
+  # Class variables
+  #
+  @@unique_id = 0
+  def self.unique_id
+    @@unique_id
+  end
+  #
+  # Member variables
+  #
+  attr_accessor :connection, :extractor, :index_full_text
+  #
+  # This method performs initialization tasks
+  #
+  def initialize( opts={} )
+    @@index_list = false unless defined?(@@index_list)
+    @extractor = Extractor.new
+    if opts[:index_full_text] == true || opts[:index_full_text] == "true"
+      @index_full_text = true
+    else
+      @index_full_text = false
+    end
+    connect
+  end
+  #
+  # This method connects to the Solr instance
+  #
+  def connect
+    if defined?(Blacklight)
+      solr_config = Blacklight.solr_config
+    else
+      if defined?(RAILS_ROOT)
+        config_path = File.join(RAILS_ROOT, "config")
+        yaml = YAML.load(File.open(File.join(config_path, "solr.yml")))
+        solr_config = yaml[RAILS_ENV]
+      else
+        config_path = File.join(File.dirname(__FILE__), "..", "..", "config")
+        yaml = YAML.load(File.open(File.join(config_path, "solr.yml")))
+        if ENV["environment"].nil?
+          environment = "development"
+        else
+          environment = ENV["environment"]
+        end
+        solr_config = yaml[environment]
+        puts solr_config.inspect
+      end
+    end
+    if index_full_text == true
+      url = solr_config['fulltext']['url']
+    else
+      url = solr_config['default']['url']
+    end
+    @connection = Solr::Connection.new(url, :autocommit => :on )
+  end
+  #
+  # This method extracts the facet categories from the given Fedora object's external tag datastream
+  #
+  def extract_xml_to_solr( obj, ds_name, solr_doc=Solr::Document.new )
+    xml_ds = Repository.get_datastream( obj, ds_name )
+    extractor.xml_to_solr( xml_ds.content, solr_doc )
+  end
+  #
+  #
+  #
+  def extract_rels_ext( obj, ds_name, solr_doc=Solr::Document.new )
+    rels_ext_ds = Repository.get_datastream( obj, ds_name )
+    extractor.extract_rels_ext( rels_ext_ds.content, solr_doc )
+  end
+  #
+  # This method generates the month and day facets from the date_t in solr_doc
+  #
+  def generate_dates(solr_doc)
+    # This will check for valid dates, but it seems most of the dates are currently invalid....
+    #date_check =  /^(19|20)\d\d([- \/.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])/
+   #if there is not date_t, add on with easy-to-find value
+   if solr_doc[:date_t].nil?
+        solr_doc << Solr::Field.new( :date_t => "9999-99-99")
+   end #if
+    # unless date_check !~  solr_doc[:date_t]
+    date_obj = Date._parse(solr_doc[:date_t])
+    if date_obj[:mon].nil?
+       solr_doc << Solr::Field.new(:month_facet => 99)
+    elsif 0 < date_obj[:mon] && date_obj[:mon] < 13
+      solr_doc << Solr::Field.new( :month_facet => date_obj[:mon].to_s.rjust(2, '0'))
+    else
+      solr_doc << Solr::Field.new( :month_facet => 99)
+    end
+    if  date_obj[:mday].nil?
+      solr_doc << Solr::Field.new( :day_facet => 99)
+    elsif 0 < date_obj[:mday] && date_obj[:mday] < 32
+      solr_doc << Solr::Field.new( :day_facet => date_obj[:mday].to_s.rjust(2, '0'))
+    else
+       solr_doc << Solr::Field.new( :day_facet => 99)
+    end
+    return solr_doc
+#      end
+  end
+  #
+  # This method creates a Solr-formatted XML document
+  #
+  def create_document( obj )
+    # retrieve a comprehensive list of all the datastreams associated with the given
+    #   object and categorize each datastream based on its filename
+    ext_properties_ds_names, rels_ext_names, properties_ds_names, stories_ds_names, full_text_ds_names, xml_ds_names, jp2_ds_names,  = [],[],[],[],[],[],[],[]
+    ds_names = Repository.get_datastreams( obj )
+    ds_names.each do |ds_name|
+      if ds_name =~ /descMetadata/
+        xml_ds_names << ds_name
+      elsif ds_name =~ /^properties/
+        properties_ds_names << ds_name
+        xml_ds_names << ds_name
+      elsif ds_name =~ /^RELS-EXT/
+        rels_ext_names << ds_name
+      end
+    end
+    # create the Solr document
+    solr_doc = Solr::Document.new
+    solr_doc << Solr::Field.new( :id => "#{obj.pid}" )
+    solr_doc << Solr::Field.new( :id_t => "#{obj.pid}" )
+    # Pass the solr_doc through extract_simple_xml_to_solr
+      xml_ds_names.each { |ds_name| extract_xml_to_solr(obj, ds_name, solr_doc)}
+    # Generate month_facet and day_facet from date_t value
+      generate_dates(solr_doc)
+    # extract RELS-EXT
+    rels_ext_names.each { |ds_name| extract_rels_ext(obj, ds_name, solr_doc)}
+    # increment the unique id to ensure that all documents in the search index are unique
+    @@unique_id += 1
+    return solr_doc
+  end
+  #
+  # This method adds a document to the Solr search index
+  #
+  def index( obj )
+   # print "Indexing '#{obj.pid}'..."
+    begin
+      solr_doc = create_document( obj )
+      connection.add( solr_doc )
+     # puts connection.url
+     #puts solr_doc
+     #  puts "done"
+    rescue Exception => e
+       p "unable to index #{obj.pid}.  Failed with #{e.inspect}"
+    end
+  end
+  #
+  # This method queries the Solr search index and returns a response
+  #
+  def query( query_str )
+    response = conn.query( query_str )
+  end
+  #
+  # This method prints out the results of the given query string by iterating through all the hits
+  #
+  def printResults( query_str )
+    query( query_str ) do |hit|
+      puts hit.inspect
+    end
+  end
+  #
+  # This method deletes a document from the Solr search index by id
+  #
+  # id - identifier passed straight to the connection's delete call
+  #
+  def deleteDocument( id )
+    connection.delete( id )
+  end
+  # Populates a solr doc with values from a hash.
+  # Accepts two forms of hashes:
+  # => {'technology'=>["t1", "t2"], 'company'=>"c1", "person"=>["p1", "p2"]}
+  # or
+  # => {:facets => {'technology'=>["t1", "t2"], 'company'=>"c1", "person"=>["p1", "p2"]} }
+  #
+  # Note that values for individual fields can be a single string or an array of strings.
+  def self.solrize( input_hash, solr_doc=Solr::Document.new )
+    facets = input_hash.has_key?(:facets) ? input_hash[:facets] : input_hash
+    facets.each_pair do |facet_name, value|
+      case value.class.to_s
+      when "String"
+        solr_doc << Solr::Field.new( :"#{facet_name}_facet" => "#{value}" )
+      when "Array"
+        value.each { |v| solr_doc << Solr::Field.new( :"#{facet_name}_facet" => "#{v}" ) }
+      end
+    end
+    if input_hash.has_key?(:symbols)
+      input_hash[:symbols].each do |symbol_name, value|
+        case value.class.to_s
+        when "String"
+          solr_doc << Solr::Field.new( :"#{symbol_name}_s" => "#{value}" )
+	      when "Array"
+          value.each { |v| solr_doc << Solr::Field.new( :"#{symbol_name}_s" => "#{v}" ) }
+        end
+      end
+    end
+    return solr_doc
+  end
+  private :connect, :create_document
+end
+end

data/lib/shelver/main.rb ADDED Viewed

@@ -0,0 +1,17 @@
+#!/bin/env ruby
+@index_full_text = false
+require 'rubygems'
+load 'configuration.rb'
+load 'repository.rb'
+load 'shelver.rb'
+# initialize connection to Fedora repository
+repository = Repository.new
+repository.initialize_repository
+# shelve all objects in the Fedora repository
+shelver = Shelver.new
+shelver.shelve_objects

data/lib/shelver/replicator.rb ADDED Viewed

@@ -0,0 +1,143 @@
+require 'fastercsv'
+REPLICATOR_LIST = false unless defined?(REPLICATOR_LIST)
+module Shelver
+  class Replicator
+    include Stanford::SaltControllerHelper
+    attr_accessor :dest_repo, :configs
+    def initialize
+      config_path = "#{RAILS_ROOT}/config/replicator.yml"
+      raw_configs = YAML::load(File.open(config_path))
+      @configs = raw_configs[RAILS_ENV]
+      @dest_repo = Fedora::Repository.new(configs["destination"]["fedora"]["url"])
+      ActiveFedora.fedora_config[:url] = configs["source"]["fedora"]["url"]
+      logger.info("REPLICATOR: re-initializing Fedora with fedora_config: #{ActiveFedora.fedora_config.inspect}")
+      Fedora::Repository.register(ActiveFedora.fedora_config[:url])
+      logger.info("REPLICATOR: re-initialized Fedora as: #{Fedora::Repository.instance.inspect}")
+      # Register Solr
+      ActiveFedora.solr_config[:url] = configs["source"]["solr"]["url"]
+      logger.info("REPLICATOR: re-initializing ActiveFedora::SolrService with solr_config: #{ActiveFedora.solr_config.inspect}")
+      ActiveFedora::SolrService.register(ActiveFedora.solr_config[:url])
+    end
+    def replicate_objects
+     # retrieve a list of all the pids in the fedora repository
+      num_docs = 1000000   # modify this number to guarantee that all the objects are retrieved from the repository
+      if REPLICATOR_LIST == false
+         pids = Repository.get_pids( num_docs )
+         puts "Replicating #{pids.length} Fedora objects"
+          pids.each do |pid|
+            unless pid[0].empty? || pid[0].nil? || !pid[0].include?("druid:")
+              puts "Processing #{pid}"
+              replicate_object( pid )
+            end #unless
+          end #pids.each
+      else
+         if File.exists?(REPLICATOR_LIST)
+            arr_of_pids = FasterCSV.read(REPLICATOR_LIST, :headers=>false)
+            puts "Replicating from list at #{REPLICATOR_LIST}"
+            puts "Replicating #{arr_of_pids.length} Fedora objects"
+           arr_of_pids.each do |row|
+              pid = row[0]
+              replicate_object( pid )
+           end #FASTERCSV
+          else
+            puts "#{REPLICATOR_LIST} does not exists!"
+          end #if File.exists
+      end #if Index_LISTS
+    end #replicate_objects
+    def replicate_object(obj)
+	#source_doc = Document.load_instance(pid)
+       obj = obj.kind_of?(ActiveFedora::Base) ? obj : Repository.get_object( obj )
+	     p "Indexing object #{obj.pid} with label #{obj.label}"
+      begin
+        unless obj.nil?
+		      create_stub(obj)
+        	p "Successfully replicated #{obj.pid}"
+   	    end
+      rescue Exception => e
+        p "unable to create stub.  Failed with #{e.inspect}"
+      end
+    end
+    # Creates a stub object in @dest_repo with the datastreams that we need in the stubs
+    def create_stub(source_object)
+      begin
+       jp2 = downloadables(source_object, :canonical=>true, :mime_type=>"image/jp2")
+       jp2.new_object = true
+       jp2.control_group = 'M'
+       jp2.blob = jp2.content
+       	stub_object = Fedora::FedoraObject.new(:pid=>source_object.pid)
+       	dest_repo.save(stub_object)
+	      dest_repo.save(jp2)
+      ["properties", "extProperties", "descMetadata", "location"].each do |ds_name|
+        ds = source_object.datastreams[ds_name]
+        ds.new_object = true
+        ds.blob = ds.content
+        dest_repo.save(ds)
+      end
+     rescue
+         #for object without jp2s
+         #this is a temp fix to the downloadables() issue
+         pid = source_object.pid
+	        p "> #{pid}"
+          jp2_file = File.new('spec/fixtures/image.jp2')
+          ds = ActiveFedora::Datastream.new(:dsID => "image.jp2", :dsLabel => 'image.jp2', :controlGroup => 'M', :blob => jp2_file)
+	        source_object.add_datastream(ds)
+          source_object.save
+	        #  source_object = Document.load_instance(pid)
+ 	        source_object = ActiveFedora::Base.load_instance(pid)
+       	  stub_object = Fedora::FedoraObject.new(:pid=>source_object.pid)
+          dest_repo.save(stub_object)
+          jp2 = downloadables(source_object, :canonical=>true, :mime_type=>"image/jp2")
+          jp2.new_object = true
+          jp2.control_group = 'M'
+          jp2.blob = jp2.content
+          	stub_object = Fedora::FedoraObject.new(:pid=>source_object.pid)
+          	dest_repo.save(stub_object)
+   	      dest_repo.save(jp2)
+         ["properties", "extProperties", "descMetadata", "location"].each do |ds_name|
+           ds = source_object.datastreams[ds_name]
+           ds.new_object = true
+           ds.blob = ds.content
+           dest_repo.save(ds)
+         end
+      end
+    end
+    def logger
+      @logger ||= defined?(RAILS_DEFAULT_LOGGER) ? RAILS_DEFAULT_LOGGER : Logger.new(STDOUT)
+    end
+  end
+end