solrizer 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore CHANGED
@@ -17,3 +17,6 @@ rerun.txt
17
17
  .loadpath
18
18
  .project
19
19
  .buildpath
20
+
21
+ /.bundle
22
+ /.rvmrc
data/Gemfile ADDED
@@ -0,0 +1,14 @@
1
+ source "http://rubygems.org"
2
+
3
+ gem "solr-ruby"
4
+ gem "nokogiri"
5
+ gem "om", ">= 1.0.0" # only required by xml/terminology_based_solrizer ...
6
+ gem "mediashelf-loggable"
7
+
8
+ group :development, :test do
9
+ gem "jeweler"
10
+ gem 'ruby-debug'
11
+ gem 'ruby-debug-base'
12
+ gem 'rspec', '<2.0.0'
13
+ gem 'mocha'
14
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,44 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ columnize (0.3.1)
5
+ facets (2.9.0)
6
+ gemcutter (0.6.1)
7
+ git (1.2.5)
8
+ jeweler (1.4.0)
9
+ gemcutter (>= 0.1.0)
10
+ git (>= 1.2.5)
11
+ rubyforge (>= 2.0.0)
12
+ json_pure (1.4.6)
13
+ linecache (0.43)
14
+ mediashelf-loggable (0.4.0)
15
+ mocha (0.9.9)
16
+ rake
17
+ nokogiri (1.4.3.1)
18
+ om (1.0.0)
19
+ facets
20
+ nokogiri (>= 1.4.2)
21
+ rake (0.8.7)
22
+ rspec (1.3.1)
23
+ ruby-debug (0.10.3)
24
+ columnize (>= 0.1)
25
+ ruby-debug-base (~> 0.10.3.0)
26
+ ruby-debug-base (0.10.3)
27
+ linecache (>= 0.3)
28
+ rubyforge (2.0.4)
29
+ json_pure (>= 1.1.7)
30
+ solr-ruby (0.0.8)
31
+
32
+ PLATFORMS
33
+ ruby
34
+
35
+ DEPENDENCIES
36
+ jeweler
37
+ mediashelf-loggable
38
+ mocha
39
+ nokogiri
40
+ om (>= 1.0.0)
41
+ rspec (< 2.0.0)
42
+ ruby-debug
43
+ ruby-debug-base
44
+ solr-ruby
data/History.txt CHANGED
@@ -1,3 +1,11 @@
1
+ h2. 0.3.0
2
+
3
+ HYDRA-286 Re-structure Solrizer to separate solrizer base from fedora-solrizer
4
+
5
+ Added TerminologyBasedSolrizer
6
+ Added Extremely Configurable FieldMapper
7
+ Updated FieldNameMapper to use new FieldMapper
8
+
1
9
  h2. 0.1.2
2
10
 
3
11
  Minor: switched active-fedora gem requirement to >= 1.1.5 instead of = 1.1.5 (was breaking apps that use later versions of active-fedora)
data/Rakefile CHANGED
@@ -10,9 +10,16 @@ begin
10
10
  gem.email = "matt.zumwalt@yourmediashelf.com"
11
11
  gem.homepage = "http://github.com/projecthydra/solrizer"
12
12
  gem.authors = ["Matt Zumwalt"]
13
- gem.add_dependency "active-fedora", ">= 1.1.5"
14
- gem.add_dependency "om", ">= 1.0.0" # only required by xml/terminology_based_solrizer ...
15
- gem.add_development_dependency "rspec", ">= 1.2.9"
13
+ gem.add_dependency "solr-ruby"
14
+ gem.add_dependency "nokogiri"
15
+ gem.add_dependency "om"
16
+ gem.add_dependency "nokogiri"
17
+ gem.add_dependency "mediashelf-loggable"
18
+ gem.add_development_dependency "jeweler"
19
+ gem.add_development_dependency 'ruby-debug'
20
+ gem.add_development_dependency 'ruby-debug-base'
21
+ gem.add_development_dependency 'rspec', '<2.0.0'
22
+ gem.add_development_dependency 'mocha'
16
23
  # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
17
24
  end
18
25
  Jeweler::GemcutterTasks.new
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.0
1
+ 0.3.0
@@ -1,14 +1,17 @@
1
1
  id: id
2
- date: _dt
3
- string: _t
4
- text: _t
5
- symbol: _s
6
- integer: _i
7
- long: _l
8
- boolean: _b
9
- float: _f
10
- double: _d
11
- facet: _facet
12
- display: _display
13
- sort: _sort
14
- unstemmed_search: _unstem_search
2
+ default: searchable
3
+ searchable:
4
+ default: _t
5
+ date: _dt
6
+ string: _t
7
+ text: _t
8
+ symbol: _s
9
+ integer: _i
10
+ long: _l
11
+ boolean: _b
12
+ float: _f
13
+ double: _d
14
+ displayable: _display
15
+ facetable: _facet
16
+ sortable: _sort
17
+ unstemmed_searchable: _unstem_search
@@ -0,0 +1,18 @@
1
+ id: id
2
+ default: searchable
3
+ searchable:
4
+ date: _date
5
+ string: _field
6
+ text: _field
7
+ symbol: _field
8
+ integer: _field
9
+ long: _field
10
+ boolean: _field
11
+ float: _field
12
+ double: _field
13
+ displayable: _display
14
+ facetable: _facet
15
+ sortable: _sort
16
+ unstemmed_searchable: _unstem_search
17
+
18
+
@@ -4,85 +4,44 @@ require "nokogiri"
4
4
  require 'yaml'
5
5
 
6
6
  module Solrizer
7
- class Extractor
8
-
9
-
10
- def extract_tags(text)
11
- doc = REXML::Document.new( text )
12
- extract_tag(doc, 'archivist_tags').merge(extract_tag(doc, 'donor_tags'))
13
- end
14
7
 
15
- def extract_tag(doc, type)
16
- tags = doc.elements["/fields/#{type}"]
17
- return {} unless tags
18
- {type => tags.text.split(/,/).map {|t| t.strip}}
19
- end
8
+ # Provides utilities for extracting solr fields from a variety of objects and/or creating solr documents from a given object
9
+ # Note: These utilities are optional. You can implement .to_solr directly on your classes if you want to bypass using Extractors.
10
+ #
11
+ # Each of the Solrizer implementations provides its own Extractor module that extends the behaviors of Solrizer::Extractor
12
+ # with methods specific to that implementation (e.g. extract_tag, extract_rels_ext, xml_to_solr, html_to_solr)
13
+ #
14
+ class Extractor
20
15
 
21
-
22
- #
23
- # Extracts content-model and hydra-type from RELS-EXT datastream
16
+ # Populates a solr doc with values from a hash.
17
+ # Accepts two forms of hashes:
18
+ # => {'technology'=>["t1", "t2"], 'company'=>"c1", "person"=>["p1", "p2"]}
19
+ # or
20
+ # => {:facets => {'technology'=>["t1", "t2"], 'company'=>"c1", "person"=>["p1", "p2"]} }
24
21
  #
25
- def extract_rels_ext( text, solr_doc=Solr::Document.new )
26
- # TODO: only read in this file once
27
-
28
- if defined?(RAILS_ROOT)
29
- config_path = File.join(RAILS_ROOT, "config")
30
- else
31
- config_path = File.join(File.dirname(__FILE__), "..", "..", "config")
32
- end
33
- map = YAML.load(File.open(File.join(config_path, "hydra_types.yml")))
34
-
35
- doc = Nokogiri::XML(text)
36
- doc.xpath( '//foo:hasModel', 'foo' => 'info:fedora/fedora-system:def/model#' ).each do |element|
37
- cmodel = element.attributes['resource'].to_s
38
- solr_doc << Solr::Field.new( :cmodel_t => cmodel )
39
-
40
- if map.has_key?(cmodel)
41
- solr_doc << Solr::Field.new( :hydra_type_t => map[cmodel] )
22
+ # Note that values for individual fields can be a single string or an array of strings.
23
+ def extract_hash( input_hash, solr_doc=Solr::Document.new )
24
+ facets = input_hash.has_key?(:facets) ? input_hash[:facets] : input_hash
25
+ facets.each_pair do |facet_name, value|
26
+ case value.class.to_s
27
+ when "String"
28
+ solr_doc << Solr::Field.new( :"#{facet_name}_facet" => "#{value}" )
29
+ when "Array"
30
+ value.each { |v| solr_doc << Solr::Field.new( :"#{facet_name}_facet" => "#{v}" ) }
42
31
  end
43
32
  end
44
-
45
- return solr_doc
46
- end
47
-
48
- #
49
- # This method extracts solr fields from simple xml
50
- #
51
- def xml_to_solr( text, solr_doc=Solr::Document.new )
52
- doc = REXML::Document.new( text )
53
- doc.root.elements.each do |element|
54
- solr_doc << Solr::Field.new( :"#{element.name}_t" => "#{element.text}" )
55
- end
56
-
57
- return solr_doc
58
- end
59
-
60
- #
61
- # This method strips html tags out and returns content to be indexed in solr
62
- #
63
- def html_content_to_solr( ds, solr_doc=Solr::Document.new )
64
-
65
- text = CGI.unescapeHTML(ds.content)
66
- doc = Nokogiri::HTML(text)
67
33
 
68
- # html to story_display
69
- stories = doc.xpath('//story')
70
-
71
- stories.each do |story|
72
- solr_doc << Solr::Field.new(:story_display => story.children.to_xml)
34
+ if input_hash.has_key?(:symbols)
35
+ input_hash[:symbols].each do |symbol_name, value|
36
+ case value.class.to_s
37
+ when "String"
38
+ solr_doc << Solr::Field.new( :"#{symbol_name}_s" => "#{value}" )
39
+ when "Array"
40
+ value.each { |v| solr_doc << Solr::Field.new( :"#{symbol_name}_s" => "#{v}" ) }
41
+ end
42
+ end
73
43
  end
74
-
75
- #strip out text and put in story_t
76
- text_nodes = doc.xpath("//text()")
77
- text = String.new
78
-
79
- text_nodes.each do |text_node|
80
- text << text_node.content
81
- end
82
-
83
- solr_doc << Solr::Field.new(:story_t => text)
84
-
85
- return solr_doc
44
+ return solr_doc
86
45
  end
87
46
 
88
47
  end
@@ -0,0 +1,351 @@
1
+ require "loggable"
2
+ module Solrizer
3
+
4
+ # Maps Term names and values to Solr fields, based on the Term's data type and any index_as options.
5
+ #
6
+ # The basic structure of a mapper is:
7
+ #
8
+ # == Mapping on Index Type
9
+ #
10
+ # To define a custom mapper:
11
+ #
12
+ # class CustomMapper < Solrizer::FieldMapper
13
+ # index_as :searchable, :suffix => '_search'
14
+ # index_as :edible, :suffix => '_food'
15
+ # end
16
+ #
17
+ # # t.dish_name :index_as => [:searchable] -maps to-> dish_name_search
18
+ # # t.ingredients :index_as => [:searchable, :edible] -maps to-> ingredients_search, ingredients_food
19
+ #
20
+ # (See Solrizer::XML::TerminologyBasedSolrizer for instructions on applying a custom mapping once you have defined it.)
21
+ #
22
+ # == Default Index Types
23
+ #
24
+ # You can mark a particular index type as a default. It will then always be included unless terms explicitly
25
+ # exclude it with the "not_" prefix:
26
+ #
27
+ # class CustomMapper < Solrizer::FieldMapper
28
+ # index_as :searchable, :suffix => '_search', :default => true
29
+ # index_as :edible, :suffix => '_food'
30
+ # end
31
+ #
32
+ # # t.dish_name -maps to-> dish_name_search
33
+ # # t.ingredients :index_as => [:edible] -maps to-> ingredients_search, ingredients_food
34
+ # # t.secret_ingredients :index_as => [:not_searchable, :edible] -maps to-> secret_ingredients_food
35
+ #
36
+ # == Mapping on Data Type
37
+ #
38
+ # A mapper can apply different suffixes based on a term's data type:
39
+ #
40
+ # class CustomMapper < Solrizer::FieldMapper
41
+ # index_as :searchable, :suffix => '_search' do |type|
42
+ # type.date :suffix => '_date'
43
+ # type.integer :suffix => '_numeric'
44
+ # type.float :suffix => '_numeric'
45
+ # end
46
+ # index_as :edible, :suffix => '_food'
47
+ # end
48
+ #
49
+ # # t.published :type => :date, :index_as => [:searchable] -maps to-> published_date
50
+ # # t.votes :type => :integer, :index_as => [:searchable] -maps to-> votes_numeric
51
+ #
52
+ # If a specific data type doesn't appear in the list, the mapper falls back to the suffix given on the index_as:
53
+ #
54
+ # # t.description :type => :text, :index_as => [:searchable] -maps to-> description_search
55
+ #
56
+ # == Custom Value Converters
57
+ #
58
+ # All of the above applies to the generation of Solr names. Mappers can also provide custom conversion logic for the
59
+ # generation of Solr values by attaching a custom value converter block to a data type:
60
+ #
61
+ # require 'time'
62
+ #
63
+ # class CustomMapper < Solrizer::FieldMapper
64
+ # index_as :searchable, :suffix => '_search' do |type|
65
+ # type.date do |value|
66
+ # Time.parse(value).utc.to_i
67
+ # end
68
+ # end
69
+ # end
70
+ #
71
+ # Note that the nesting order is always:
72
+ #
73
+ # FieldMapper definition
74
+ # index_as
75
+ # data type
76
+ # value converter
77
+ #
78
+ # You can use the special data type "default" to apply custom value conversion to any data type:
79
+ #
80
+ # require 'time'
81
+ #
82
+ # class CustomMapper < Solrizer::FieldMapper
83
+ # index_as :searchable do |type|
84
+ # type.date :suffix => '_date' do |value|
85
+ # Time.parse(value).utc.to_i
86
+ # end
87
+ # type.default :suffix => '_search' do |value|
88
+ # value.to_s.strip
89
+ # end
90
+ # end
91
+ # end
92
+ #
93
+ # This example converts searchable dates to integer seconds since the Unix epoch (Time#to_i), and strips leading and trailing whitespace from all other searchable data types.
94
+ #
95
+ # Note that the :suffix option may appear on the data types and the index_as. The search order for the suffix on a field
96
+ # of type foo is:
97
+ # 1. type.foo
98
+ # 2. type.default
99
+ # 3. index_as
100
+ # The suffix is optional in all three places.
101
+ #
102
+ # Note that a single Term with multiple index types can translate into multiple Solr fields, because we may want Solr to
103
+ # index a single field in multiple ways. However, if two different mappings generate both the same solr field name
104
+ # _and_ the same value, the mapper will only emit a single field.
105
+ #
106
+ # == ID Field
107
+ #
108
+ # In addition to the normal field mappings, Solrizer gives special treatment to an ID field. If you want that
109
+ # logic (and you probably do), specify a name for this field:
110
+ #
111
+ # class CustomMapper < Solrizer::FieldMapper
112
+ # id_field 'id'
113
+ # end
114
+ #
115
+ # == Extending the Default
116
+ #
117
+ # The default mapper is Solrizer::FieldMapper::Default. You can customize the default mapping by subclassing it.
118
+ # For example, to override the ID field name and the default suffix for sortable, and inherit everything else:
119
+ #
120
+ # class CustomMapperBasedOnDefault < Solrizer::FieldMapper::Default
121
+ # id_field 'guid'
122
+ # index_as :sortable, :suffix => '_xsort'
123
+ # end
124
+
125
+ class FieldMapper
126
+
127
+ include Loggable
128
+
129
+ # ------ Class methods ------
130
+
131
+ @@instance_init_actions = Hash.new { |h,k| h[k] = [] }
132
+
133
+ def self.id_field(field_name)
134
+ add_instance_init_action do
135
+ @id_field = field_name
136
+ end
137
+ end
138
+
139
+ def self.index_as(index_type, opts = {}, &block)
140
+ add_instance_init_action do
141
+ mapping = (@mappings[index_type] ||= IndexTypeMapping.new)
142
+ mapping.opts.merge! opts
143
+ yield DataTypeMappingBuilder.new(mapping) if block_given?
144
+ end
145
+ end
146
+
147
+ # Loads solr mappings from yml file.
148
+ # Assumes that string values are solr field name suffixes.
149
+ # This is meant as a simple entry point for working with solr mappings. For more powerful control over solr mappings, create your own subclasses of FieldMapper instead of using a yml file.
150
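+ # @param [String] config_path This is the path to your mappings yml file. When nil, defaults to "RAILS_ROOT/config/solr_mappings.yml" if RAILS_ROOT is defined and that file exists; otherwise falls back to the config/solr_mappings.yml file within the gem.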
151
+ def self.load_mappings( config_path=nil )
152
+
153
+ if config_path.nil?
154
+ if defined?(RAILS_ROOT)
155
+ config_path = File.join(RAILS_ROOT, "config", "solr_mappings.yml")
156
+ end
157
+ # Default to using the config file within the gem
158
+ if !File.exist?(config_path.to_s)
159
+ config_path = File.join(File.dirname(__FILE__), "..", "..", "config", "solr_mappings.yml")
160
+ end
161
+ end
162
+
163
+ logger.info("SOLRIZER: loading field name mappings from #{File.expand_path(config_path)}")
164
+ mappings_from_file = YAML::load(File.open(config_path))
165
+
166
+ self.clear_mappings
167
+
168
+ # Set id_field from file if it is available
169
+ id_field_from_file = mappings_from_file.delete("id")
170
+ if id_field_from_file.nil?
171
+ id_field "id"
172
+ else
173
+ id_field id_field_from_file
174
+ end
175
+
176
+ default_index_type = mappings_from_file.delete("default")
177
+ mappings_from_file.each_pair do |index_type, type_settings|
178
+ if type_settings.kind_of?(Hash)
179
+ index_as index_type.to_sym, :default => index_type == default_index_type do |t|
180
+ type_settings.each_pair do |field_type, suffix|
181
+ eval("t.#{field_type} :suffix=>\"#{suffix}\"")
182
+ end
183
+ end
184
+ else
185
+ index_as index_type.to_sym, :default => index_type == default_index_type, :suffix=>type_settings
186
+ end
187
+ end
188
+ end
189
+
190
+ private
191
+
192
+ def self.add_instance_init_action(&block)
193
+ @@instance_init_actions[self] << lambda do |mapper|
194
+ mapper.instance_eval &block
195
+ end
196
+ end
197
+
198
+ def self.apply_instance_init_actions(instance)
199
+ if self.superclass.respond_to? :apply_instance_init_actions
200
+ self.superclass.apply_instance_init_actions(instance)
201
+ end
202
+ @@instance_init_actions[self].each do |action|
203
+ action.call(instance)
204
+ end
205
+ end
206
+
207
+ # Reset all of the mappings
208
+ def self.clear_mappings
209
+ logger.debug "resetting mappings for #{self.to_s}"
210
+ @@instance_init_actions[self] = []
211
+ end
212
+
213
+ public
214
+
215
+ # ------ Instance methods ------
216
+
217
+ attr_reader :id_field, :default_index_types, :mappings
218
+
219
+ def initialize
220
+ @mappings = {}
221
+ self.class.apply_instance_init_actions(self)
222
+ @default_index_types = @mappings.select { |ix_type, mapping| mapping.opts[:default] }.map(&:first)
223
+ end
224
+
225
+ # Given a specific field name, data type, and index type, returns the corresponding solr name.
226
+
227
+ def solr_name(field_name, field_type, index_type = :searchable)
228
+ name, mapping, data_type_mapping = solr_name_and_mappings(field_name, field_type, index_type)
229
+ name
230
+ end
231
+
232
+ # Given a field name-value pair, a data type, and an array of index types, returns a hash of
233
+ # mapped names and values. The values in the hash are _arrays_, and may contain multiple values.
234
+
235
+ def solr_names_and_values(field_name, field_value, field_type, index_types)
236
+ # Determine the set of index types, adding defaults and removing not_xyz
237
+
238
+ index_types ||= []
239
+ index_types += default_index_types
240
+ index_types.uniq!
241
+ index_types.dup.each do |index_type|
242
+ if index_type.to_s =~ /^not_(.*)/
243
+ index_types.delete index_type # not_foo
244
+ index_types.delete $1.to_sym # foo
245
+ end
246
+ end
247
+
248
+ # Map names and values
249
+
250
+ results = {}
251
+
252
+ index_types.each do |index_type|
253
+ # Get mapping for field
254
+ name, mapping, data_type_mapping = solr_name_and_mappings(field_name, field_type, index_type)
255
+ next unless name
256
+
257
+ # Is there a custom converter?
258
+ value = if data_type_mapping && data_type_mapping.converter
259
+ converter = data_type_mapping.converter
260
+ if converter.arity == 1
261
+ converter.call(field_value)
262
+ else
263
+ converter.call(field_value, field_name)
264
+ end
265
+ else
266
+ field_value
267
+ end
268
+
269
+ # Add mapped name & value, unless it's a duplicate
270
+ values = (results[name] ||= [])
271
+ values << value unless values.contains?(value)
272
+ end
273
+
274
+ results
275
+ end
276
+
277
+ private
278
+
279
+ def solr_name_and_mappings(field_name, field_type, index_type)
280
+ field_name = field_name.to_s
281
+ mapping = @mappings[index_type]
282
+ unless mapping
283
+ logger.debug "Unknown index type '#{index_type}' for field #{field_name}"
284
+ return nil
285
+ end
286
+
287
+ data_type_mapping = mapping.data_types[field_type] || mapping.data_types[:default]
288
+
289
+ suffix = data_type_mapping.opts[:suffix] if data_type_mapping
290
+ suffix ||= mapping.opts[:suffix]
291
+ name = field_name + suffix
292
+
293
+ [name, mapping, data_type_mapping]
294
+ end
295
+
296
+ class IndexTypeMapping
297
+ attr_accessor :opts, :data_types
298
+
299
+ def initialize
300
+ @opts = {}
301
+ @data_types = {}
302
+ end
303
+ end
304
+
305
+ class DataTypeMapping
306
+ attr_accessor :opts, :converter
307
+
308
+ def initialize
309
+ @opts = {}
310
+ end
311
+ end
312
+
313
+ class DataTypeMappingBuilder
314
+ def initialize(index_type_mapping)
315
+ @index_type_mapping = index_type_mapping
316
+ end
317
+
318
+ def method_missing(method, *args, &block)
319
+ data_type_mapping = (@index_type_mapping.data_types[method] ||= DataTypeMapping.new)
320
+ data_type_mapping.opts.merge! args[0] if args.length > 0
321
+ data_type_mapping.converter = block if block_given?
322
+ end
323
+ end
324
+
325
+ # ------ Default mapper ------
326
+
327
+ public
328
+
329
+ class Default < FieldMapper
330
+ id_field 'id'
331
+ index_as :searchable, :default => true do |t|
332
+ t.default :suffix => '_t'
333
+ t.date :suffix => '_dt'
334
+ t.string :suffix => '_t'
335
+ t.text :suffix => '_t'
336
+ t.symbol :suffix => '_s'
337
+ t.integer :suffix => '_i'
338
+ t.long :suffix => '_l'
339
+ t.boolean :suffix => '_b'
340
+ t.float :suffix => '_f'
341
+ t.double :suffix => '_d'
342
+ end
343
+ index_as :displayable, :suffix => '_display'
344
+ index_as :facetable, :suffix => '_facet'
345
+ index_as :sortable, :suffix => '_sort'
346
+ index_as :unstemmed_searchable, :suffix => '_unstem_search'
347
+ end
348
+
349
+ end
350
+
351
+ end