RubyGems - solrizer - Versions diffs - 0.2.0 → 0.3.0 - Mend

solrizer 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

data/.gitignore +3 -0
data/Gemfile +14 -0
data/Gemfile.lock +44 -0
data/History.txt +8 -0
data/Rakefile +10 -3
data/VERSION +1 -1
data/config/solr_mappings.yml +16 -13
data/config/solr_mappings_af_0.1.yml +18 -0
data/lib/solrizer/extractor.rb +31 -72
data/lib/solrizer/field_mapper.rb +351 -0
data/lib/solrizer/field_name_mapper.rb +37 -51
data/lib/solrizer/html/extractor.rb +36 -0
data/lib/solrizer/html.rb +7 -0
data/lib/solrizer/xml/extractor.rb +31 -0
data/lib/solrizer/xml/terminology_based_solrizer.rb +25 -29
data/lib/solrizer/xml.rb +4 -1
data/lib/solrizer.rb +2 -113
data/lib/tasks/solrizer.rake +7 -27
data/solrizer.gemspec +46 -26
data/spec/{spec.opts → .rspec} +0 -0
data/spec/fixtures/test_solr_mappings.yml +16 -0
data/spec/spec_helper.rb +1 -0
data/spec/units/extractor_spec.rb +43 -34
data/spec/units/field_mapper_spec.rb +227 -0
data/spec/units/field_name_mapper_spec.rb +16 -29
data/spec/units/xml_extractor_spec.rb +28 -0
data/spec/units/xml_terminology_based_solrizer_spec.rb +18 -5
metadata +128 -35
data/lib/solrizer/configuration.rb +0 -8
data/lib/solrizer/indexer.rb +0 -261
data/lib/solrizer/main.rb +0 -17
data/lib/solrizer/replicator.rb +0 -143
data/lib/solrizer/repository.rb +0 -54
data/spec/fixtures/rels_ext_cmodel.xml +0 -8
data/spec/fixtures/solr_mappings_af_0.1.yml +0 -16
data/spec/integration/indexer_spec.rb +0 -18
data/spec/units/indexer_spec.rb +0 -127
data/spec/units/shelver_spec.rb +0 -42

data/.gitignore CHANGED Viewed

@@ -17,3 +17,6 @@ rerun.txt
 .loadpath
 .project
 .buildpath
+/.bundle
+/.rvmrc

data/Gemfile ADDED Viewed

@@ -0,0 +1,14 @@
+source "http://rubygems.org"
+gem "solr-ruby"
+gem "nokogiri"
+gem "om", ">= 1.0.0"  # only required by xml/terminology_based_solrizer ...
+gem "mediashelf-loggable"
+group :development, :test do
+  gem "jeweler"
+  gem 'ruby-debug'
+  gem 'ruby-debug-base'
+  gem 'rspec', '<2.0.0'
+  gem 'mocha'
+end

data/Gemfile.lock ADDED Viewed

@@ -0,0 +1,44 @@
+GEM
+  remote: http://rubygems.org/
+  specs:
+    columnize (0.3.1)
+    facets (2.9.0)
+    gemcutter (0.6.1)
+    git (1.2.5)
+    jeweler (1.4.0)
+      gemcutter (>= 0.1.0)
+      git (>= 1.2.5)
+      rubyforge (>= 2.0.0)
+    json_pure (1.4.6)
+    linecache (0.43)
+    mediashelf-loggable (0.4.0)
+    mocha (0.9.9)
+      rake
+    nokogiri (1.4.3.1)
+    om (1.0.0)
+      facets
+      nokogiri (>= 1.4.2)
+    rake (0.8.7)
+    rspec (1.3.1)
+    ruby-debug (0.10.3)
+      columnize (>= 0.1)
+      ruby-debug-base (~> 0.10.3.0)
+    ruby-debug-base (0.10.3)
+      linecache (>= 0.3)
+    rubyforge (2.0.4)
+      json_pure (>= 1.1.7)
+    solr-ruby (0.0.8)
+PLATFORMS
+  ruby
+DEPENDENCIES
+  jeweler
+  mediashelf-loggable
+  mocha
+  nokogiri
+  om (>= 1.0.0)
+  rspec (< 2.0.0)
+  ruby-debug
+  ruby-debug-base
+  solr-ruby

data/History.txt CHANGED Viewed

@@ -1,3 +1,11 @@
+h2. 0.3.0
+HYDRA-286 Re-structure Solrizer to separate solrizer base from fedora-solrizer
+Added TerminologyBasedSolrizer
+Added Extremely Configurable FieldMapper
+Updated FieldNameMapper to use new FieldMapper
 h2. 0.1.2
 Minor: switched active-fedora gem requirement to >= 1.1.5 instead of = 1.1.5 (was breaking apps that use later versions of active-fedora)

data/Rakefile CHANGED Viewed

@@ -10,9 +10,16 @@ begin
     gem.email = "matt.zumwalt@yourmediashelf.com"
     gem.homepage = "http://github.com/projecthydra/solrizer"
     gem.authors = ["Matt Zumwalt"]
-    gem.add_dependency "active-fedora", ">= 1.1.5"
-    gem.add_dependency "om", ">= 1.0.0"  # only required by xml/terminology_based_solrizer ...
-    gem.add_development_dependency "rspec", ">= 1.2.9"
+    gem.add_dependency "solr-ruby"
+    gem.add_dependency "nokogiri"
+    gem.add_dependency "om"
+    gem.add_dependency "nokogiri"
+    gem.add_dependency "mediashelf-loggable"
+    gem.add_development_dependency "jeweler"
+    gem.add_development_dependency 'ruby-debug'
+    gem.add_development_dependency 'ruby-debug-base'
+    gem.add_development_dependency 'rspec', '<2.0.0'
+    gem.add_development_dependency 'mocha'
     # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
   end
   Jeweler::GemcutterTasks.new

data/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 0.2.0
1	+ 0.3.0

data/config/solr_mappings.yml CHANGED Viewed

@@ -1,14 +1,17 @@
 id: id
-date: _dt
-string: _t
-text: _t
-symbol: _s
-integer: _i
-long: _l
-boolean: _b
-float: _f
-double: _d
-facet: _facet
-display: _display
-sort: _sort
-unstemmed_search: _unstem_search
+default: searchable
+searchable:
+  default: _t
+  date: _dt
+  string: _t
+  text: _t
+  symbol: _s
+  integer: _i
+  long: _l
+  boolean: _b
+  float: _f
+  double: _d
+displayable: _display
+facetable: _facet
+sortable: _sort
+unstemmed_searchable: _unstem_search

data/config/solr_mappings_af_0.1.yml ADDED Viewed

@@ -0,0 +1,18 @@
+id: id
+default: searchable
+searchable:
+  date: _date
+  string: _field
+  text: _field
+  symbol: _field
+  integer: _field
+  long: _field
+  boolean: _field
+  float: _field
+  double: _field
+displayable: _display
+facetable: _facet
+sortable: _sort
+unstemmed_searchable: _unstem_search

data/lib/solrizer/extractor.rb CHANGED Viewed

@@ -4,85 +4,44 @@ require "nokogiri"
 require 'yaml'
 module Solrizer
-class Extractor
-  def extract_tags(text)
-    doc = REXML::Document.new( text )
-    extract_tag(doc, 'archivist_tags').merge(extract_tag(doc, 'donor_tags'))
-  end
-  def extract_tag(doc, type)
-    tags = doc.elements["/fields/#{type}"]
-    return {} unless tags
-    {type => tags.text.split(/,/).map {|t| t.strip}}
-  end
+# Provides utilities for extracting solr fields from a variety of objects and/or creating solr documents from a given object
+# Note: These utilities are optional.  You can implement .to_solr directly on your classes if you want to bypass using Extractors.
+#
+# Each of the Solrizer implementations provides its own Extractor module that extends the behaviors of Solrizer::Extractor
+# with methods specific to that implementation (e.g. extract_tag, extract_rels_ext, xml_to_solr, html_to_solr)
+#
+class Extractor
-  #
-  # Extracts content-model and hydra-type from RELS-EXT datastream
+  # Populates a solr doc with values from a hash.
+  # Accepts two forms of hashes:
+  # => {'technology'=>["t1", "t2"], 'company'=>"c1", "person"=>["p1", "p2"]}
+  # or
+  # => {:facets => {'technology'=>["t1", "t2"], 'company'=>"c1", "person"=>["p1", "p2"]} }
   #
-  def extract_rels_ext( text, solr_doc=Solr::Document.new )
-    # TODO: only read in this file once
-    if defined?(RAILS_ROOT)
-      config_path = File.join(RAILS_ROOT, "config")
-    else
-      config_path = File.join(File.dirname(__FILE__), "..", "..", "config")
-    end
-    map = YAML.load(File.open(File.join(config_path, "hydra_types.yml")))
-    doc = Nokogiri::XML(text)
-    doc.xpath( '//foo:hasModel', 'foo' => 'info:fedora/fedora-system:def/model#' ).each do |element|
-      cmodel = element.attributes['resource'].to_s
-      solr_doc << Solr::Field.new( :cmodel_t => cmodel )
-      if map.has_key?(cmodel)
-        solr_doc << Solr::Field.new( :hydra_type_t => map[cmodel] )
+  # Note that values for individual fields can be a single string or an array of strings.
+  def extract_hash( input_hash, solr_doc=Solr::Document.new )
+    facets = input_hash.has_key?(:facets) ? input_hash[:facets] : input_hash
+    facets.each_pair do |facet_name, value|
+      case value.class.to_s
+      when "String"
+        solr_doc << Solr::Field.new( :"#{facet_name}_facet" => "#{value}" )
+      when "Array"
+        value.each { |v| solr_doc << Solr::Field.new( :"#{facet_name}_facet" => "#{v}" ) }
       end
     end
-    return solr_doc
-  end
-  #
-  # This method extracts solr fields from simple xml
-  #
-  def xml_to_solr( text, solr_doc=Solr::Document.new )
-    doc = REXML::Document.new( text )
-    doc.root.elements.each do |element|
-      solr_doc << Solr::Field.new( :"#{element.name}_t" => "#{element.text}" )
-    end
-    return solr_doc
-  end
-  #
-  # This method strips html tags out and returns content to be indexed in solr
-  #
-  def html_content_to_solr( ds, solr_doc=Solr::Document.new )
-    text = CGI.unescapeHTML(ds.content)
-    doc = Nokogiri::HTML(text)
-    # html to story_display
-    stories = doc.xpath('//story')
-    stories.each do |story|
-      solr_doc << Solr::Field.new(:story_display => story.children.to_xml)
+    if input_hash.has_key?(:symbols)
+      input_hash[:symbols].each do |symbol_name, value|
+        case value.class.to_s
+        when "String"
+          solr_doc << Solr::Field.new( :"#{symbol_name}_s" => "#{value}" )
+	      when "Array"
+          value.each { |v| solr_doc << Solr::Field.new( :"#{symbol_name}_s" => "#{v}" ) }
+        end
+      end
     end
-    #strip out text and put in story_t
-    text_nodes = doc.xpath("//text()")
-    text = String.new
-     text_nodes.each do |text_node|
-       text << text_node.content
-     end
-     solr_doc << Solr::Field.new(:story_t => text)
-     return solr_doc
+    return solr_doc
   end
 end

data/lib/solrizer/field_mapper.rb ADDED Viewed

@@ -0,0 +1,351 @@
+require "loggable"
+module Solrizer
+  # Maps Term names and values to Solr fields, based on the Term's data type and any index_as options.
+  #
+  # The basic structure of a mapper is:
+  #
+  # == Mapping on Index Type
+  #
+  # To define a custom mapper:
+  #
+  #   class CustomMapper < Solrizer::FieldMapper
+  #     index_as :searchable, :suffix => '_search'
+  #     index_as :edible,     :suffix => '_food'
+  #   end
+  #
+  #   #   t.dish_name   :index_as => [:searchable]            -maps to->   dish_name_search
+  #   #   t.ingredients :index_as => [:searchable, :edible]   -maps to->   ingredients_search, ingredients_food
+  #
+  # (See Solrizer::XML::TerminologyBasedSolrizer for instructions on applying a custom mapping once you have defined it.)
+  #
+  # == Default Index Types
+  #
+  # You can mark a particular index type as a default. It will then always be included unless terms explicitly
+  # exclude it with the "not_" prefix:
+  #
+  #   class CustomMapper < Solrizer::FieldMapper
+  #     index_as :searchable, :suffix => '_search', :default => true
+  #     index_as :edible,     :suffix => '_food'
+  #   end
+  #
+  #   #   t.dish_name                                                   -maps to->   dish_name_search
+  #   #   t.ingredients :index_as => [:edible]                          -maps to->   ingredients_search, ingredients_food
+  #   #   t.secret_ingredients :index_as => [:not_searchable, :edible]  -maps to->   secret_ingredients_food
+  #
+  # == Mapping on Data Type
+  #
+  # A mapper can apply different suffixes based on a term's data type:
+  #
+  #   class CustomMapper < Solrizer::FieldMapper
+  #     index_as :searchable, :suffix => '_search' do |type|
+  #       type.date    :suffix => '_date'
+  #       type.integer :suffix => '_numeric'
+  #       type.float   :suffix => '_numeric'
+  #     end
+  #     index_as :edible, :suffix => '_food'
+  #   end
+  #
+  #   #   t.published   :type => :date, :index_as => [:searchable]     -maps to->   published_date
+  #   #   t.votes       :type => :integer, :index_as => [:searchable]  -maps to->   votes_numeric
+  #
+  # If a specific data type doesn't appear in the list, the mapper falls back to the suffix given on the index_as:
+  #
+  #   #   t.description :type => :text, :index_as => [:searchable]     -maps to->   description_search
+  #
+  # == Custom Value Converters
+  #
+  # All of the above applies to the generation of Solr names. Mappers can also provide custom conversion logic for the
+  # generation of Solr values by attaching a custom value converter block to a data type:
+  #
+  #   require 'time'
+  #
+  #   class CustomMapper < Solrizer::FieldMapper
+  #     index_as :searchable, :suffix => '_search' do |type|
+  #       type.date do |value|
+  #         Time.parse(value).utc.to_i
+  #       end
+  #     end
+  #   end
+  #
+  # Note that the nesting order is always:
+  #
+  #   FieldMapper definition
+  #     index_as
+  #       data type
+  #         value converter
+  #
+  # You can use the special data type "default" to apply custom value conversion to any data type:
+  #
+  #   require 'time'
+  #
+  #   class CustomMapper < Solrizer::FieldMapper
+  #     index_as :searchable do |type|
+  #       type.date :suffix => '_date' do |value|
+  #         Time.parse(value).utc.to_i
+  #       end
+  #       type.default :suffix => '_search' do |value|
+  #         value.to_s.strip
+  #       end
+  #     end
+  #   end
+  #
+  # This example converts searchable dates to integer seconds since the Unix epoch, and strips extra whitespace from all other searchable data types.
+  #
+  # Note that the :suffix option may appear on the data types and the index_as. The search order for the suffix on a field
+  # of type foo is:
+  # 1. type.foo
+  # 2. type.default
+  # 3. index_as
+  # The suffix is optional in all three places.
+  #
+  # Note that a single Term with multiple index types can translate into multiple Solr fields, because we may want Solr to
+  # index a single field in multiple ways. However, if two different mappings generate both the same solr field name
+  # _and_ the same value, the mapper will only emit a single field.
+  #
+  # == ID Field
+  #
+  # In addition to the normal field mappings, Solrizer gives special treatment to an ID field. If you want that
+  # logic (and you probably do), specify a name for this field:
+  #
+  #   class CustomMapper < Solrizer::FieldMapper
+  #     id_field 'id'
+  #   end
+  #
+  # == Extending the Default
+  #
+  # The default mapper is Solrizer::FieldMapper::Default. You can customize the default mapping by subclassing it.
+  # For example, to override the ID field name and the default suffix for sortable, and inherit everything else:
+  #
+  #   class CustomMapperBasedOnDefault < Solrizer::FieldMapper::Default
+  #     id_field 'guid'
+  #     index_as :sortable, :suffix => '_xsort'
+  #   end
+  class FieldMapper
+    include Loggable
+    # ------ Class methods ------
+    @@instance_init_actions = Hash.new { |h,k| h[k] = [] }
+    def self.id_field(field_name)
+      add_instance_init_action do
+        @id_field = field_name
+      end
+    end
+    def self.index_as(index_type, opts = {}, &block)
+      add_instance_init_action do
+        mapping = (@mappings[index_type] ||= IndexTypeMapping.new)
+        mapping.opts.merge! opts
+        yield DataTypeMappingBuilder.new(mapping) if block_given?
+      end
+    end
+    # Loads solr mappings from yml file.
+    # Assumes that string values are solr field name suffixes.
+    # This is meant as a simple entry point for working with solr mappings.  For more powerful control over solr mappings, create your own subclasses of FieldMapper instead of using a yml file.
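+    # @param [String] config_path Path to the mappings YAML file itself (not its directory). When nil, defaults to "RAILS_ROOT/config/solr_mappings.yml" if RAILS_ROOT is defined and that file exists; otherwise the solr_mappings.yml bundled with the gem is used.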
+    def self.load_mappings( config_path=nil )
+      if config_path.nil?
+        if defined?(RAILS_ROOT)
+          config_path = File.join(RAILS_ROOT, "config", "solr_mappings.yml")
+        end
+        # Default to using the config file within the gem
+        if !File.exist?(config_path.to_s)
+          config_path = File.join(File.dirname(__FILE__), "..", "..", "config", "solr_mappings.yml")
+        end
+      end
+      logger.info("SOLRIZER: loading field name mappings from #{File.expand_path(config_path)}")
+      mappings_from_file = YAML::load(File.open(config_path))
+      self.clear_mappings
+      # Set id_field from file if it is available
+      id_field_from_file = mappings_from_file.delete("id")
+      if id_field_from_file.nil?
+        id_field "id"
+      else
+        id_field id_field_from_file
+      end
+      default_index_type = mappings_from_file.delete("default")
+      mappings_from_file.each_pair do |index_type, type_settings|
+        if type_settings.kind_of?(Hash)
+          index_as index_type.to_sym, :default => index_type == default_index_type do |t|
+            type_settings.each_pair do |field_type, suffix|
+              eval("t.#{field_type} :suffix=>\"#{suffix}\"")
+            end
+          end
+        else
+          index_as index_type.to_sym, :default => index_type == default_index_type, :suffix=>type_settings
+        end
+      end
+    end
+  private
+    def self.add_instance_init_action(&block)
+      @@instance_init_actions[self] << lambda do |mapper|
+        mapper.instance_eval &block
+      end
+    end
+    def self.apply_instance_init_actions(instance)
+      if self.superclass.respond_to? :apply_instance_init_actions
+        self.superclass.apply_instance_init_actions(instance)
+      end
+      @@instance_init_actions[self].each do |action|
+        action.call(instance)
+      end
+    end
+    # Reset all of the mappings
+    def self.clear_mappings
+      logger.debug "resetting mappings for #{self.to_s}"
+      @@instance_init_actions[self] = []
+    end
+  public
+    # ------ Instance methods ------
+    attr_reader :id_field, :default_index_types, :mappings
+    def initialize
+      @mappings = {}
+      self.class.apply_instance_init_actions(self)
+      @default_index_types = @mappings.select { |ix_type, mapping| mapping.opts[:default] }.map(&:first)
+    end
+    # Given a specific field name, data type, and index type, returns the corresponding solr name.
+    def solr_name(field_name, field_type, index_type = :searchable)
+      name, mapping, data_type_mapping = solr_name_and_mappings(field_name, field_type, index_type)
+      name
+    end
+    # Given a field name-value pair, a data type, and an array of index types, returns a hash of
+    # mapped names and values. The values in the hash are _arrays_, and may contain multiple values.
+    def solr_names_and_values(field_name, field_value, field_type, index_types)
+      # Determine the set of index types, adding defaults and removing not_xyz
+      index_types ||= []
+      index_types += default_index_types
+      index_types.uniq!
+      index_types.dup.each do |index_type|
+        if index_type.to_s =~ /^not_(.*)/
+          index_types.delete index_type # not_foo
+          index_types.delete $1.to_sym  # foo
+        end
+      end
+      # Map names and values
+      results = {}
+      index_types.each do |index_type|
+        # Get mapping for field
+        name, mapping, data_type_mapping = solr_name_and_mappings(field_name, field_type, index_type)
+        next unless name
+        # Is there a custom converter?
+        value = if data_type_mapping && data_type_mapping.converter
+          converter = data_type_mapping.converter
+          if converter.arity == 1
+            converter.call(field_value)
+          else
+            converter.call(field_value, field_name)
+          end
+        else
+          field_value
+        end
+        # Add mapped name & value, unless it's a duplicate
+        values = (results[name] ||= [])
+        values << value unless values.include?(value)
+      end
+      results
+    end
+  private
+    def solr_name_and_mappings(field_name, field_type, index_type)
+      field_name = field_name.to_s
+      mapping = @mappings[index_type]
+      unless mapping
+        logger.debug "Unknown index type '#{index_type}' for field #{field_name}"
+        return nil
+      end
+      data_type_mapping = mapping.data_types[field_type] || mapping.data_types[:default]
+      suffix = data_type_mapping.opts[:suffix] if data_type_mapping
+      suffix ||= mapping.opts[:suffix]
+      name = field_name + suffix
+      [name, mapping, data_type_mapping]
+    end
+    class IndexTypeMapping
+      attr_accessor :opts, :data_types
+      def initialize
+        @opts = {}
+        @data_types = {}
+      end
+    end
+    class DataTypeMapping
+      attr_accessor :opts, :converter
+      def initialize
+        @opts = {}
+      end
+    end
+    class DataTypeMappingBuilder
+      def initialize(index_type_mapping)
+        @index_type_mapping = index_type_mapping
+      end
+      def method_missing(method, *args, &block)
+        data_type_mapping = (@index_type_mapping.data_types[method] ||= DataTypeMapping.new)
+        data_type_mapping.opts.merge! args[0] if args.length > 0
+        data_type_mapping.converter = block if block_given?
+      end
+    end
+    # ------ Default mapper ------
+  public
+    class Default < FieldMapper
+      id_field 'id'
+      index_as :searchable, :default => true do |t|
+        t.default :suffix => '_t'
+        t.date    :suffix => '_dt'
+        t.string  :suffix => '_t'
+        t.text    :suffix => '_t'
+        t.symbol  :suffix => '_s'
+        t.integer :suffix => '_i'
+        t.long    :suffix => '_l'
+        t.boolean :suffix => '_b'
+        t.float   :suffix => '_f'
+        t.double  :suffix => '_d'
+      end
+      index_as :displayable,          :suffix => '_display'
+      index_as :facetable,            :suffix => '_facet'
+      index_as :sortable,             :suffix => '_sort'
+      index_as :unstemmed_searchable, :suffix => '_unstem_search'
+    end
+  end
+end