RubyGems - solrizer-fedora - Versions diffs - 0.1.0 - Mend

solrizer-fedora 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

data/.document +5 -0
data/.gitignore +22 -0
data/Gemfile +10 -0
data/Gemfile.lock +44 -0
data/History.textile +3 -0
data/LICENSE +20 -0
data/README.textile +34 -0
data/Rakefile +57 -0
data/VERSION +1 -0
data/config/fedora.yml +16 -0
data/config/hydra_types.yml +4 -0
data/config/solr.yml +7 -0
data/lib/solrizer-fedora.rb +1 -0
data/lib/solrizer/fedora.rb +7 -0
data/lib/solrizer/fedora/extractor.rb +34 -0
data/lib/solrizer/fedora/indexer.rb +213 -0
data/lib/solrizer/fedora/repository.rb +44 -0
data/lib/solrizer/fedora/solrizer.rb +118 -0
data/lib/tasks/solrizer-fedora.rake +35 -0
data/solrizer-fedora.gemspec +72 -0
data/spec/fixtures/rels_ext_cmodel.xml +8 -0
data/spec/integration/fedora_indexer_spec.rb +18 -0
data/spec/rcov.opts +2 -0
data/spec/spec.opts +1 -0
data/spec/spec_helper.rb +18 -0
data/spec/units/fedora_extractor_spec.rb +31 -0
data/spec/units/fedora_indexer_spec.rb +78 -0
data/spec/units/fedora_solrizer_spec.rb +42 -0
metadata +99 -0

data/.document ADDED Viewed

@@ -0,0 +1,5 @@
+README.textile
+lib/**/*.rb
+bin/*
+features/**/*.feature
+LICENSE

data/.gitignore ADDED Viewed

@@ -0,0 +1,22 @@
+## MAC OS
+.DS_Store
+## TEXTMATE
+*.tmproj
+tmtags
+## EMACS
+*~
+\#*
+.\#*
+## VIM
+*.swp
+## PROJECT::GENERAL
+coverage
+rdoc
+pkg
+/.bundle
+## PROJECT::SPECIFIC

data/Gemfile ADDED Viewed

@@ -0,0 +1,10 @@
+source "http://rubygems.org"
+gem 'active-fedora', '1.2.4'
+gem 'solrizer', '>=0.3.0'
+group :development, :test do
+  gem 'ruby-debug'
+  gem 'ruby-debug-base'
+  gem 'rspec', '<2.0.0'
+  gem 'mocha'
+end

data/Gemfile.lock ADDED Viewed

@@ -0,0 +1,44 @@
+GEM
+  remote: http://rubygems.org/
+  specs:
+    active-fedora (1.2.4)
+      mime-types (>= 1.16)
+      multipart-post
+      nokogiri
+      om (>= 1.0)
+      solr-ruby (>= 0.0.6)
+      xml-simple (>= 1.0.12)
+    columnize (0.3.1)
+    facets (2.9.0)
+    linecache (0.43)
+    mime-types (1.16)
+    mocha (0.9.9)
+      rake
+    multipart-post (1.0.1)
+    nokogiri (1.4.3.1)
+    om (1.0.0)
+      facets
+      nokogiri (>= 1.4.2)
+    rake (0.8.7)
+    rspec (1.3.1)
+    ruby-debug (0.10.3)
+      columnize (>= 0.1)
+      ruby-debug-base (~> 0.10.3.0)
+    ruby-debug-base (0.10.3)
+      linecache (>= 0.3)
+    solr-ruby (0.0.8)
+    solrizer (0.3.0)
+      active-fedora (>= 1.1.5)
+      om (>= 1.0.0)
+    xml-simple (1.0.12)
+PLATFORMS
+  ruby
+DEPENDENCIES
+  active-fedora (= 1.2.4)
+  mocha
+  rspec (< 2.0.0)
+  ruby-debug
+  ruby-debug-base
+  solrizer (>= 0.2.0)

data/History.textile ADDED Viewed

@@ -0,0 +1,3 @@
+h2. 0.1.0
+Initial Release -- pretty much a direct replica of all of the fedora-related stuff in solrizer versions older than 0.3.0

data/LICENSE ADDED Viewed

@@ -0,0 +1,20 @@
+Copyright (c) 2009 Matt Zumwalt
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.textile ADDED Viewed

@@ -0,0 +1,34 @@
+h1. solrizer-fedora
+An extension to projecthydra/solrizer that provides utilities for loading objects from Fedora Repositories and creating solr documents from them.
+h2. Usage
+<pre>gem install solrizer-fedora</pre>
+You must tell the app where to find fedora and solr.  Put that information into config/fedora.yml and config/solr.yml
+Then...
+<pre>
+irb
+require "rubygems"
+require "solrizer-fedora"
+solrizer = Solrizer::Fedora::Solrizer.new
+solrizer.solrize("demo:5")
+</pre>
+h2.  Note on Patches/Pull Requests
+* Fork the project.
+* Make your feature addition or bug fix.
+* Add tests for it. This is important so I don't break it in a
+  future version unintentionally.
+* Commit, do not mess with rakefile, version, or history.
+  (if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
+* Send me a pull request. Bonus points for topic branches.
+h2. Copyright
+Copyright (c) 2010 Matt Zumwalt and MediaShelf. See LICENSE for details.

data/Rakefile ADDED Viewed

@@ -0,0 +1,57 @@
+require 'rubygems'
+require 'rake'
+begin
+  require 'jeweler'
+  Jeweler::Tasks.new do |gem|
+    gem.name = "solrizer-fedora"
+    gem.summary = %Q{An extension to solrizer that deals with Fedora objects & Repositories}
+    gem.description = %Q{An extension to projecthydra/solrizer that provides utilities for loading objects from Fedora Repositories and creating solr documents from them.}
+    gem.email = "matt.zumwalt@yourmediashelf.com"
+    gem.homepage = "http://github.com/projecthydra/solrizer-fedora"
+    gem.authors = ["Matt Zumwalt"]
+    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
+  end
+  Jeweler::GemcutterTasks.new
+rescue LoadError
+  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
+end
+require 'spec/rake/spectask'
+Spec::Rake::SpecTask.new(:spec) do |spec|
+  spec.libs << 'lib' << 'spec'
+  spec.spec_files = FileList['spec/**/*_spec.rb']
+end
+Spec::Rake::SpecTask.new(:rcov) do |spec|
+  spec.libs << 'lib' << 'spec'
+  spec.pattern = 'spec/**/*_spec.rb'
+  spec.rcov = true
+end
+task :spec => :check_dependencies
+task :default => :spec
+require 'rake/rdoctask'
+Rake::RDocTask.new do |rdoc|
+  version = File.exist?('VERSION') ? File.read('VERSION') : ""
+  rdoc.rdoc_dir = 'rdoc'
+  rdoc.title = "solrizer #{version}"
+  rdoc.rdoc_files.include('README*')
+  rdoc.rdoc_files.include('lib/**/*.rb')
+end
+begin
+  require 'rcov/rcovtask'
+  Rcov::RcovTask.new do |test|
+    test.libs << 'test'
+    test.pattern = 'test/**/test_*.rb'
+    test.verbose = true
+  end
+rescue LoadError
+  task :rcov do
+    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
+  end
+end

data/VERSION ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0.1.0

data/config/fedora.yml ADDED Viewed

@@ -0,0 +1,16 @@
+development:
+  fedora:
+    url: http://fedoraAdmin:fedoraAdmin@127.0.0.1:8983/fedora
+  solr:
+    url: http://127.0.0.1:8983/solr/development
+test:
+  fedora:
+    url: http://fedoraAdmin:fedoraAdmin@127.0.0.1:8983/fedora
+  solr:
+    url: http://127.0.0.1:8983/solr/test
+production:
+  fedora:
+    url: http://fedoraAdmin:fedoraAdmin@127.0.0.1:8080/fedora
+  solr:
+    url: http://127.0.0.1:8080/solr

data/config/hydra_types.yml ADDED Viewed

@@ -0,0 +1,4 @@
+info:fedora/afmodel:SaltDocument : salt_document
+info:fedora/afmodel:JP2Document : jp2_document
+info:fedora/afmodel:ModsDocument : mods_document
+info:fedora/afmodel:DCDocument : dc_document

data/config/solr.yml ADDED Viewed

@@ -0,0 +1,7 @@
+development:
+  url: http://localhost:8983/solr/development
+test: &TEST
+  url: http://localhost:8983/solr/test
+production:
+  url: http://localhost:8080/solr/production

data/lib/solrizer-fedora.rb ADDED Viewed

	@@ -0,0 +1 @@
1	+ require "solrizer/fedora"

data/lib/solrizer/fedora.rb ADDED Viewed

@@ -0,0 +1,7 @@
+require "rubygems"
+require "solrizer"
+module Solrizer::Fedora
+end
+# Load every file in the fedora/ directory next to this one. Sorting keeps the load
+# order identical on every platform (Dir[] results were not guaranteed to be ordered
+# before Ruby 3.0).
+Dir[File.join(File.dirname(__FILE__),"fedora","*.rb")].sort.each {|file| require file }
+# Mix the Fedora-specific extraction methods into solrizer's Extractor class.
+Solrizer::Extractor.send(:include, Solrizer::Fedora::Extractor)

data/lib/solrizer/fedora/extractor.rb ADDED Viewed

@@ -0,0 +1,34 @@
+require 'solr'
+require 'rexml/document'
+require "nokogiri"
+require 'yaml'
+module Solrizer::Fedora::Extractor
+  #
+  # Extracts content-model and hydra-type from RELS-EXT datastream
+  #
+  def extract_rels_ext( text, solr_doc=Solr::Document.new )
+    # TODO: only read in this file once
+    if defined?(RAILS_ROOT)
+      config_path = File.join(RAILS_ROOT, "config")
+    else
+      config_path = File.join(File.dirname(__FILE__), "..", "..", "..", "config")
+    end
+    map = YAML.load(File.open(File.join(config_path, "hydra_types.yml")))
+    doc = Nokogiri::XML(text)
+    doc.xpath( '//foo:hasModel', 'foo' => 'info:fedora/fedora-system:def/model#' ).each do |element|
+      cmodel = element.attributes['resource'].to_s
+      solr_doc << Solr::Field.new( :cmodel_t => cmodel )
+      if map.has_key?(cmodel)
+        solr_doc << Solr::Field.new( :hydra_type_t => map[cmodel] )
+      end
+    end
+    return solr_doc
+  end
+end

data/lib/solrizer/fedora/indexer.rb ADDED Viewed

@@ -0,0 +1,213 @@
+require 'solr'
+require 'solrizer/extractor'
+require 'solrizer/fedora/repository'
+module Solrizer::Fedora
+class Indexer
+  #
+  # Class variables
+  #
+  @@unique_id = 0
+  def self.unique_id
+    @@unique_id
+  end
+  #
+  # Member variables
+  #
+  attr_accessor :connection, :extractor, :index_full_text
+  #
+  # This method performs initialization tasks
+  #
+  def initialize( opts={} )
+    @@index_list = false unless defined?(@@index_list)
+    @extractor = ::Solrizer::Extractor.new
+    if opts[:index_full_text] == true || opts[:index_full_text] == "true"
+      @index_full_text = true
+    else
+      @index_full_text = false
+    end
+    connect
+  end
+  #
+  # This method connects to the Solr instance
+  #
+  def connect
+    if ActiveFedora.fedora_config.empty?
+      ActiveFedora.init
+    end
+    if defined?(Blacklight)
+      solr_config = Blacklight.solr_config
+    else
+      if defined?(RAILS_ROOT)
+        config_path = File.join(RAILS_ROOT, "config")
+        yaml = YAML.load(File.open(File.join(config_path, "solr.yml")))
+        solr_config = yaml[RAILS_ENV]
+        puts solr_config.inspect
+      else
+        config_path = File.join(File.dirname(__FILE__), "..", "..", "..", "config")
+        yaml = YAML.load(File.open(File.join(config_path, "solr.yml")))
+        if ENV["environment"].nil?
+          environment = "development"
+        else
+          environment = ENV["environment"]
+        end
+        solr_config = yaml[environment]
+        puts solr_config.inspect
+      end
+    end
+    if index_full_text == true
+      url = solr_config['fulltext']['url']
+    elsif solr_config.has_key?("default")
+      url = solr_config['default']['url']
+    else
+      url = solr_config['url']
+    end
+    @connection = Solr::Connection.new(url, :autocommit => :on )
+  end
+  #
+  # This method extracts the facet categories from the given Fedora object's external tag datastream
+  #
+  def extract_xml_to_solr( obj, ds_name, solr_doc=Solr::Document.new )
+    xml_ds = Repository.get_datastream( obj, ds_name )
+    extractor.xml_to_solr( xml_ds.content, solr_doc )
+  end
+  #
+  #
+  #
+  def extract_rels_ext( obj, ds_name, solr_doc=Solr::Document.new )
+    rels_ext_ds = Repository.get_datastream( obj, ds_name )
+    extractor.extract_rels_ext( rels_ext_ds.content, solr_doc )
+  end
+  #
+  # This method generates the month and day facets from the date_t in solr_doc
+  #
+  def generate_dates(solr_doc)
+    # This will check for valid dates, but it seems most of the dates are currently invalid....
+    #date_check =  /^(19|20)\d\d([- \/.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])/
+   #if there is not date_t, add on with easy-to-find value
+   if solr_doc[:date_t].nil?
+        solr_doc << Solr::Field.new( :date_t => "9999-99-99")
+   end #if
+    # unless date_check !~  solr_doc[:date_t]
+    date_obj = Date._parse(solr_doc[:date_t])
+    if date_obj[:mon].nil?
+       solr_doc << Solr::Field.new(:month_facet => 99)
+    elsif 0 < date_obj[:mon] && date_obj[:mon] < 13
+      solr_doc << Solr::Field.new( :month_facet => date_obj[:mon].to_s.rjust(2, '0'))
+    else
+      solr_doc << Solr::Field.new( :month_facet => 99)
+    end
+    if  date_obj[:mday].nil?
+      solr_doc << Solr::Field.new( :day_facet => 99)
+    elsif 0 < date_obj[:mday] && date_obj[:mday] < 32
+      solr_doc << Solr::Field.new( :day_facet => date_obj[:mday].to_s.rjust(2, '0'))
+    else
+       solr_doc << Solr::Field.new( :day_facet => 99)
+    end
+    return solr_doc
+#      end
+  end
+  #
+  # This method creates a Solr-formatted XML document
+  #
+  def create_document( obj )
+    solr_doc = Solr::Document.new
+    model_klazz_array = ActiveFedora::ContentModel.known_models_for( obj )
+    model_klazz_array.delete(ActiveFedora::Base)
+    # If the object was passed in as an ActiveFedora::Base, call to_solr in order to get the base field entries from ActiveFedora::Base
+    # Otherwise, the object was passed in as a model instance other than ActiveFedora::Base,so call its to_solr method & allow it to insert the fields from ActiveFedora::Base
+    if obj.class == ActiveFedora::Base
+      solr_doc = obj.to_solr(solr_doc)
+      puts "  added base fields from #{obj.class.to_s}"
+    else
+      solr_doc = obj.to_solr(solr_doc)
+      model_klazz_array.delete(obj.class)
+      puts "    added base fields from #{obj.class.to_s} and model fields from #{obj.class.to_s}"
+    end
+    # Load the object as an instance of each of its other models and get the corresponding solr fields
+    # Include :model_only=>true in the options in order to avoid adding the metadata from ActiveFedora::Base every time.
+    model_klazz_array.each do |klazz|
+      instance = klazz.load_instance(obj.pid)
+      solr_doc = instance.to_solr(solr_doc, :model_only=>true)
+      puts "  added solr fields from #{klazz.to_s}"
+    end
+    solr_doc << Solr::Field.new( :id_t => "#{obj.pid}" )
+    solr_doc << Solr::Field.new( :id => "#{obj.pid}" ) unless solr_doc[:id]
+    # increment the unique id to ensure that all documents in the search index are unique
+    @@unique_id += 1
+    return solr_doc
+  end
+  #
+  # This method adds a document to the Solr search index
+  #
+  def index( obj )
+   # print "Indexing '#{obj.pid}'..."
+    begin
+      solr_doc = create_document( obj )
+      connection.add( solr_doc )
+     # puts connection.url
+     #puts solr_doc
+     #  puts "done"
+    # rescue Exception => e
+    #    p "unable to index #{obj.pid}.  Failed with #{e.inspect}"
+    end
+  end
+  #
+  # This method queries the Solr search index and returns a response
+  #
+  def query( query_str )
+    response = conn.query( query_str )
+  end
+  private :connect, :create_document
+  def class_exists?(class_name)
+    klass = Module.const_get(class_name)
+    return klass.is_a?(Class)
+  rescue NameError
+    return false
+  end
+end
+end

data/lib/solrizer/fedora/repository.rb ADDED Viewed

@@ -0,0 +1,44 @@
+require 'active-fedora'
+module Solrizer::Fedora
+class Repository
+  #
+  # This method retrieves a comprehensive list of unique ids in the fedora repository
+  #
+  def self.get_pids( num_docs )
+    solr_results = ActiveFedora::SolrService.instance.conn.query( "active_fedora_model_field:Document", { :rows => num_docs } )
+    id_array = []
+    solr_results.hits.each do |hit|
+      id_array << hit[SOLR_DOCUMENT_ID]
+    end
+    return id_array
+  end
+  #
+  # This method retrieves the object associated with the given unique id
+  #
+  def self.get_object( pid )
+    object = ActiveFedora::Base.load_instance( pid )
+  end
+  #
+  # This method retrieves a comprehensive list of datastreams for the given object
+  #
+  def self.get_datastreams( obj )
+    ds_keys = obj.datastreams.keys
+  end
+  #
+  # This method retrieves the datastream for the given object with the given datastream name
+  #
+  def self.get_datastream( obj, ds_name )
+    begin
+      obj.datastreams[ ds_name ]
+    rescue
+      return nil
+    end
+  end
+end
+end

data/lib/solrizer/fedora/solrizer.rb ADDED Viewed

@@ -0,0 +1,118 @@
+require 'solrizer/field_mapper.rb'
+require 'solrizer/field_name_mapper'
+require 'solrizer/fedora/indexer'
+require 'solrizer/xml'
+require 'solrizer/html'
+# Let people explicitly require xml support if they want it ...
+# require 'solrizer/xml.rb'
+# require 'fastercsv'
+require "ruby-debug"
+module Solrizer::Fedora
+class Solrizer
+  attr_accessor :indexer, :index_full_text
+  #
+  # This method initializes the indexer
+  # If passed an argument of :index_full_text=>true, it will perform full-text indexing instead of indexing fields only.
+  #
+  def initialize( opts={} )
+    @@index_list = false unless defined?(@@index_list)
+    if opts[:index_full_text] == true || opts[:index_full_text] == "true"
+      @index_full_text = true
+    else
+      @index_full_text = false
+    end
+    @indexer = Indexer.new( :index_full_text=>@index_full_text )
+  end
+  #
+  # This method solrizes the given Fedora object's full-text and facets into the search index
+  #
+  def solrize( obj )
+    # retrieve the Fedora object based on the given unique id
+      begin
+      start = Time.now
+      print "Retrieving object #{obj} ..."
+      case obj
+      when ActiveFedora::Base
+        # do nothing
+      when Fedora::FedoraObject
+        obj = Repository.get_object( obj.pid )
+      when String
+        obj = Repository.get_object( obj )
+      else
+        raise "you must pass either a ActiveFedora::Base, Fedora::RepositoryObject, or a String.  You submitted a #{obj.class}"
+      end
+      # obj = obj.kind_of?(ActiveFedora::Base) ? obj : Repository.get_object( obj )
+          obj_done = Time.now
+          obj_done_elapse = obj_done - start
+          puts  " completed. Duration: #{obj_done_elapse}"
+         print "\t Indexing object #{obj.pid} ... "
+         # add the keywords and facets to the search index
+         index_start = Time.now
+         indexer.index( obj )
+         index_done = Time.now
+         index_elapsed = index_done - index_start
+          puts "completed. Duration:  #{index_elapsed} ."
+      rescue Exception => e
+           p "unable to index #{obj}.  Failed with #{e.inspect}"
+      end #begin
+  end
+  #
+  # This method retrieves a comprehensive list of all the unique identifiers in Fedora and
+  # solrizes each object's full-text and facets into the search index
+  def solrize_objects
+    # retrieve a list of all the pids in the fedora repository
+    num_docs = 1000000   # modify this number to guarantee that all the objects are retrieved from the repository
+    puts "WARNING: You have turned off indexing of Full Text content.  Be sure to re-run indexer with @@index_full_text set to true in main.rb" if index_full_text == false
+    if @@index_list == false
+      objects = ::Fedora::Repository.instance.find_objects(:limit=>num_docs)
+      puts "Shelving #{objects.length} Fedora objects"
+      objects.each do |object|
+        solrize( object )
+      end
+    else
+       if File.exists?(@@index_list)
+          arr_of_pids = FasterCSV.read(@@index_list, :headers=>false)
+          puts "Indexing from list at #{@@index_list}"
+          puts "Shelving #{arr_of_pids.length} Fedora objects"
+         arr_of_pids.each do |row|
+            pid = row[0]
+            solrize( pid )
+	 end #FASTERCSV
+        else
+          puts "#{@@index_list} does not exists!"
+        end #if File.exists
+    end #if Index_LISTS
+  end #solrize_objects
+end #class
+end #module

data/lib/tasks/solrizer-fedora.rake ADDED Viewed

@@ -0,0 +1,35 @@
+namespace :solrizer do
+  namespace :fedora
+    desc 'Index a fedora object of the given pid.'
+    task :solrize => :environment do
+      index_full_text = ENV['FULL_TEXT'] == 'true'
+      if ENV['PID']
+        puts "indexing #{ENV['PID'].inspect}"
+        solrizer = Solrizer::Fedora::Solrizer.new :index_full_text=> index_full_text
+        solrizer.solrize(ENV['PID'])
+        puts "Finished shelving #{ENV['PID']}"
+      else
+        puts "You must provide a pid using the format 'solrizer::solrize_object PID=sample:pid'."
+      end
+    end
+    desc 'Index all objects in the repository.'
+    task :solrize_objects => :environment do
+      index_full_text = ENV['FULL_TEXT'] == 'true'
+      if ENV['INDEX_LIST']
+        @@index_list = ENV['INDEX_LIST']
+      end
+      puts "Re-indexing Fedora Repository."
+      puts "Fedora URL: #{ActiveFedora.fedora_config[:url]}"
+      puts "Fedora Solr URL: #{ActiveFedora.solr_config[:url]}"
+      puts "Blacklight Solr Config: #{Blacklight.solr_config.inspect}"
+      puts "Doing full text index." if index_full_text
+      solrizer = Solrizer::Fedora::Solrizer.new :index_full_text=> index_full_text
+      solrizer.solrize_objects
+      puts "Solrizer task complete."
+    end
+  end
+end

data/solrizer-fedora.gemspec ADDED Viewed

@@ -0,0 +1,72 @@
+# Generated by jeweler
+# DO NOT EDIT THIS FILE DIRECTLY
+# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
+# -*- encoding: utf-8 -*-
+Gem::Specification.new do |s|
+  s.name = %q{solrizer-fedora}
+  s.version = "0.1.0"
+  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
+  s.authors = ["Matt Zumwalt"]
+  s.date = %q{2010-10-26}
+  s.description = %q{An extension to projecthydra/solrizer that provides utilities for loading objects from Fedora Repositories and creating solr documents from them.}
+  s.email = %q{matt.zumwalt@yourmediashelf.com}
+  s.extra_rdoc_files = [
+    "LICENSE",
+     "README.textile"
+  ]
+  s.files = [
+    ".document",
+     ".gitignore",
+     "Gemfile",
+     "Gemfile.lock",
+     "History.textile",
+     "LICENSE",
+     "README.textile",
+     "Rakefile",
+     "VERSION",
+     "config/fedora.yml",
+     "config/hydra_types.yml",
+     "config/solr.yml",
+     "lib/solrizer-fedora.rb",
+     "lib/solrizer/fedora.rb",
+     "lib/solrizer/fedora/extractor.rb",
+     "lib/solrizer/fedora/indexer.rb",
+     "lib/solrizer/fedora/repository.rb",
+     "lib/solrizer/fedora/solrizer.rb",
+     "lib/tasks/solrizer-fedora.rake",
+     "solrizer-fedora.gemspec",
+     "spec/fixtures/rels_ext_cmodel.xml",
+     "spec/integration/fedora_indexer_spec.rb",
+     "spec/rcov.opts",
+     "spec/spec.opts",
+     "spec/spec_helper.rb",
+     "spec/units/fedora_extractor_spec.rb",
+     "spec/units/fedora_indexer_spec.rb",
+     "spec/units/fedora_solrizer_spec.rb"
+  ]
+  s.homepage = %q{http://github.com/projecthydra/solrizer-fedora}
+  s.rdoc_options = ["--charset=UTF-8"]
+  s.require_paths = ["lib"]
+  s.rubygems_version = %q{1.3.7}
+  s.summary = %q{An extension to solrizer that deals with Fedora objects & Repositories}
+  s.test_files = [
+    "spec/integration/fedora_indexer_spec.rb",
+     "spec/spec_helper.rb",
+     "spec/units/fedora_extractor_spec.rb",
+     "spec/units/fedora_indexer_spec.rb",
+     "spec/units/fedora_solrizer_spec.rb"
+  ]
+  if s.respond_to? :specification_version then
+    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
+    s.specification_version = 3
+    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
+    else
+    end
+  else
+  end
+end

data/spec/fixtures/rels_ext_cmodel.xml ADDED Viewed

@@ -0,0 +1,8 @@
+<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
+	<rdf:Description rdf:about="info:fedora/demo:multipurpose-objects-model_and_sdef">
+		<hasModel xmlns="info:fedora/fedora-system:def/model#" rdf:resource="info:fedora/fedora-system:ContentModel-3.0"/>
+		<hasModel xmlns="info:fedora/fedora-system:def/model#" rdf:resource="info:fedora/afmodel:SaltDocument"/>
+		<hasModel xmlns="info:fedora/fedora-system:def/model#" rdf:resource="info:fedora/afmodel:JP2Document"/>
+		<hasModel xmlns="info:fedora/fedora-system:def/model#" rdf:resource="info:fedora/afmodel:DCDocument"/>
+	</rdf:Description>
+</rdf:RDF>

data/spec/integration/fedora_indexer_spec.rb ADDED Viewed

@@ -0,0 +1,18 @@
+require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+require 'solrizer'
+describe Solrizer::Fedora::Indexer do
+  before(:each) do
+    @indexer = Solrizer::Fedora::Indexer.new
+  end
+  describe "index" do
+    it "should update solr with the metadata from the given object" do
+      pending "Got to decide if/how to handle fixtures in this gem. Probably should just mock out Fedora & Solr entirely."
+      obj = Solrizer::Repository.get_object( "druid:sb733gr4073" )
+      @indexer.index( obj )
+    end
+  end
+end

data/spec/rcov.opts ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ --exclude "spec/,gems/"
2	+ --rails

data/spec/spec.opts ADDED Viewed

	@@ -0,0 +1 @@
1	+ --color

data/spec/spec_helper.rb ADDED Viewed

@@ -0,0 +1,18 @@
+$LOAD_PATH.unshift(File.dirname(__FILE__))
+$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
+require 'solrizer/fedora'
+require 'spec'
+require 'spec/autorun'
+require 'solrizer'
+Spec::Runner.configure do |config|
+  config.mock_with :mocha
+  def fixture(file)
+    File.new(File.join(File.dirname(__FILE__), 'fixtures', file))
+  end
+end

data/spec/units/fedora_extractor_spec.rb ADDED Viewed

@@ -0,0 +1,31 @@
+require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+require 'solrizer'
+describe Solrizer::Fedora::Extractor do
+  before(:all) do
+    @extractor = Solrizer::Extractor.new
+  end
+  describe "extract_rels_ext" do
+    it "should extract the content model of the RELS-EXT datastream of a Fedora object and set hydra_type using hydra_types mapping" do
+      rels_ext = fixture("rels_ext_cmodel.xml")
+      result = @extractor.extract_rels_ext( rels_ext )
+      result[:cmodel_t].should == "info:fedora/fedora-system:ContentModel-3.0"
+      result[:hydra_type_t].should == "salt_document"
+      # ... and a hacky way of making sure that it added a field for each of the hasModel values
+      result.inspect.include?('@value="info:fedora/afmodel:SaltDocument"').should be_true
+      result.inspect.include?('@value="jp2_document"').should be_true
+    end
+  end
+  describe "extract_hydra_types" do
+    it "should extract the hydra_type of a Fedora object" do
+      rels_ext = fixture("rels_ext_cmodel.xml")
+      result = @extractor.extract_rels_ext( rels_ext )
+      result[:hydra_type_t].should == "salt_document"
+    end
+  end
+end

data/spec/units/fedora_indexer_spec.rb ADDED Viewed

@@ -0,0 +1,78 @@
+require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+require 'solrizer'
+require "solrizer/fedora"
+describe Solrizer::Fedora::Indexer do
+  before(:each) do
+     Solrizer::Fedora::Indexer.any_instance.stubs(:connect).returns("foo")
+     @extractor = mock("Extractor")
+     @extractor.stubs(:html_content_to_solr).returns(@solr_doc)
+#     @solr_doc = mock('solr_doc')
+#     @solr_doc.stubs(:<<)
+#     @solr_doc.stubs(:[])
+     @solr_doc = Solr::Document.new
+     Solrizer::Extractor.expects(:new).returns(@extractor)
+     @indexer = Solrizer::Fedora::Indexer.new
+   end
+  describe "#generate_dates" do
+    it "should still give 9999-99-99 date if the solr document does not have a date_t field" do
+    solr_result = @indexer.generate_dates(@solr_doc)
+    solr_result.should be_kind_of Solr::Document
+    solr_result[:date_t].should == "9999-99-99"
+    solr_result[:month_facet].should == "99"
+    solr_result[:day_facet].should == '99'
+    end
+    it "should still give 9999-99-99 date if the solr_doc[:date_t] is not valid date in YYYY-MM-DD format " do
+      @solr_doc << Solr::Field.new(:date_t => "Unknown")
+      solr_result = @indexer.generate_dates(@solr_doc)
+      solr_result.should be_kind_of Solr::Document
+      solr_result[:date_t].should == "Unknown"
+      solr_result[:month_facet].should == "99"
+      solr_result[:day_facet].should == '99'
+    end
+    it "should give month and dates even if the :date_t is not a valid date but is in YYYY-MM-DD format  " do
+       @solr_doc << Solr::Field.new(:date_t => "0000-13-11")
+       solr_result = @indexer.generate_dates(@solr_doc)
+       solr_result.should be_kind_of Solr::Document
+       solr_result[:date_t].should == "0000-13-11"
+       solr_result[:month_facet].should == "99"
+       solr_result[:day_facet].should == '11'
+     end
+     it "should give month and day when in a valid date format" do
+           @solr_doc << Solr::Field.new(:date_t => "1978-04-11")
+            solr_result = @indexer.generate_dates(@solr_doc)
+            solr_result.should be_kind_of Solr::Document
+            solr_result[:date_t].should == "1978-04-11"
+            solr_result[:month_facet].should == "04"
+            solr_result[:day_facet].should == '11'
+     end
+     it "should still give two digit strings even if the month/day is single digit" do
+         @solr_doc << Solr::Field.new(:date_t => "1978-4-1")
+         solr_result = @indexer.generate_dates(@solr_doc)
+         solr_result.should be_kind_of Solr::Document
+         solr_result[:date_t].should == "1978-4-1"
+         solr_result[:month_facet].should == "04"
+         solr_result[:day_facet].should == '01'
+     end
+  end
+end

data/spec/units/fedora_solrizer_spec.rb ADDED Viewed

@@ -0,0 +1,42 @@
+require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+describe Solrizer::Fedora::Solrizer do
+  before(:each) do
+    @solrizer = Solrizer::Fedora::Solrizer.new
+  end
+  describe "solrize" do
+    it "should trigger the indexer for the provided object" do
+      sample_obj = ActiveFedora::Base.new
+      @solrizer.indexer.expects(:index).with( sample_obj )
+      @solrizer.solrize( sample_obj )
+    end
+    it "should work with Fedora::FedoraObject objects" do
+      mock_object = Fedora::FedoraObject.new(:pid=>"my:pid", :label=>"my label")
+      ActiveFedora::Base.expects(:load_instance).with( mock_object.pid ).returns(mock_object)
+      @solrizer.indexer.expects(:index).with( mock_object )
+      @solrizer.solrize( mock_object )
+    end
+    it "should load the object if only a pid is provided" do
+      mock_object = mock("my object")
+      mock_object.stubs(:pid)
+      mock_object.stubs(:label)
+      mock_object.stubs(:datastreams).returns({'descMetadata'=>"foo","location"=>"bar"})
+      ActiveFedora::Base.expects(:load_instance).with( "_PID_" ).returns(mock_object)
+      @solrizer.indexer.expects(:index).with(mock_object)
+      @solrizer.solrize("_PID_")
+    end
+  end
+  describe "solrize_objects" do
+    it "should call solrize for each object returned by Fedora::Repository.find_objects" do
+      objects = [["pid1"], ["pid2"], ["pid3"]]
+      Fedora::Repository.any_instance.expects(:find_objects).returns(objects)
+      objects.each {|object| @solrizer.expects(:solrize).with( object ) }
+      @solrizer.solrize_objects
+    end
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,99 @@
+--- !ruby/object:Gem::Specification
+name: solrizer-fedora
+version: !ruby/object:Gem::Version
+  hash: 27
+  prerelease: false
+  segments:
+  - 0
+  - 1
+  - 0
+  version: 0.1.0
+platform: ruby
+authors:
+- Matt Zumwalt
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2010-10-26 00:00:00 -05:00
+default_executable:
+dependencies: []
+description: An extension to projecthydra/solrizer that provides utilities for loading objects from Fedora Repositories and creating solr documents from them.
+email: matt.zumwalt@yourmediashelf.com
+executables: []
+extensions: []
+extra_rdoc_files:
+- LICENSE
+- README.textile
+files:
+- .document
+- .gitignore
+- Gemfile
+- Gemfile.lock
+- History.textile
+- LICENSE
+- README.textile
+- Rakefile
+- VERSION
+- config/fedora.yml
+- config/hydra_types.yml
+- config/solr.yml
+- lib/solrizer-fedora.rb
+- lib/solrizer/fedora.rb
+- lib/solrizer/fedora/extractor.rb
+- lib/solrizer/fedora/indexer.rb
+- lib/solrizer/fedora/repository.rb
+- lib/solrizer/fedora/solrizer.rb
+- lib/tasks/solrizer-fedora.rake
+- solrizer-fedora.gemspec
+- spec/fixtures/rels_ext_cmodel.xml
+- spec/integration/fedora_indexer_spec.rb
+- spec/rcov.opts
+- spec/spec.opts
+- spec/spec_helper.rb
+- spec/units/fedora_extractor_spec.rb
+- spec/units/fedora_indexer_spec.rb
+- spec/units/fedora_solrizer_spec.rb
+has_rdoc: true
+homepage: http://github.com/projecthydra/solrizer-fedora
+licenses: []
+post_install_message:
+rdoc_options:
+- --charset=UTF-8
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+requirements: []
+rubyforge_project:
+rubygems_version: 1.3.7
+signing_key:
+specification_version: 3
+summary: An extension to solrizer that deals with Fedora objects & Repositories
+test_files:
+- spec/integration/fedora_indexer_spec.rb
+- spec/spec_helper.rb
+- spec/units/fedora_extractor_spec.rb
+- spec/units/fedora_indexer_spec.rb
+- spec/units/fedora_solrizer_spec.rb