solrizer 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore CHANGED
@@ -17,3 +17,6 @@ rerun.txt
17
17
  .loadpath
18
18
  .project
19
19
  .buildpath
20
+
21
+ /.bundle
22
+ /.rvmrc
data/Gemfile ADDED
@@ -0,0 +1,14 @@
1
+ source "http://rubygems.org"
2
+
3
+ gem "solr-ruby"
4
+ gem "nokogiri"
5
+ gem "om", ">= 1.0.0" # only required by xml/terminology_based_solrizer ...
6
+ gem "mediashelf-loggable"
7
+
8
+ group :development, :test do
9
+ gem "jeweler"
10
+ gem 'ruby-debug'
11
+ gem 'ruby-debug-base'
12
+ gem 'rspec', '<2.0.0'
13
+ gem 'mocha'
14
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,44 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ columnize (0.3.1)
5
+ facets (2.9.0)
6
+ gemcutter (0.6.1)
7
+ git (1.2.5)
8
+ jeweler (1.4.0)
9
+ gemcutter (>= 0.1.0)
10
+ git (>= 1.2.5)
11
+ rubyforge (>= 2.0.0)
12
+ json_pure (1.4.6)
13
+ linecache (0.43)
14
+ mediashelf-loggable (0.4.0)
15
+ mocha (0.9.9)
16
+ rake
17
+ nokogiri (1.4.3.1)
18
+ om (1.0.0)
19
+ facets
20
+ nokogiri (>= 1.4.2)
21
+ rake (0.8.7)
22
+ rspec (1.3.1)
23
+ ruby-debug (0.10.3)
24
+ columnize (>= 0.1)
25
+ ruby-debug-base (~> 0.10.3.0)
26
+ ruby-debug-base (0.10.3)
27
+ linecache (>= 0.3)
28
+ rubyforge (2.0.4)
29
+ json_pure (>= 1.1.7)
30
+ solr-ruby (0.0.8)
31
+
32
+ PLATFORMS
33
+ ruby
34
+
35
+ DEPENDENCIES
36
+ jeweler
37
+ mediashelf-loggable
38
+ mocha
39
+ nokogiri
40
+ om (>= 1.0.0)
41
+ rspec (< 2.0.0)
42
+ ruby-debug
43
+ ruby-debug-base
44
+ solr-ruby
data/History.txt CHANGED
@@ -1,3 +1,11 @@
1
+ h2. 0.3.0
2
+
3
+ HYDRA-286 Re-structure Solrizer to separate solrizer base from fedora-solrizer
4
+
5
+ Added TerminologyBasedSolrizer
6
+ Added Extremely Configurable FieldMapper
7
+ Updated FieldNameMapper to use new FieldMapper
8
+
1
9
  h2. 0.1.2
2
10
 
3
11
  Minor: switched active-fedora gem requirement to >= 1.1.5 instead of = 1.1.5 (was breaking apps that use later versions of active-fedora)
data/Rakefile CHANGED
@@ -10,9 +10,16 @@ begin
10
10
  gem.email = "matt.zumwalt@yourmediashelf.com"
11
11
  gem.homepage = "http://github.com/projecthydra/solrizer"
12
12
  gem.authors = ["Matt Zumwalt"]
13
- gem.add_dependency "active-fedora", ">= 1.1.5"
14
- gem.add_dependency "om", ">= 1.0.0" # only required by xml/terminology_based_solrizer ...
15
- gem.add_development_dependency "rspec", ">= 1.2.9"
13
+ gem.add_dependency "solr-ruby"
14
+ gem.add_dependency "nokogiri"
15
+ gem.add_dependency "om"
16
+ gem.add_dependency "nokogiri"
17
+ gem.add_dependency "mediashelf-loggable"
18
+ gem.add_development_dependency "jeweler"
19
+ gem.add_development_dependency 'ruby-debug'
20
+ gem.add_development_dependency 'ruby-debug-base'
21
+ gem.add_development_dependency 'rspec', '<2.0.0'
22
+ gem.add_development_dependency 'mocha'
16
23
  # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
17
24
  end
18
25
  Jeweler::GemcutterTasks.new
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.0
1
+ 0.3.0
@@ -1,14 +1,17 @@
1
1
  id: id
2
- date: _dt
3
- string: _t
4
- text: _t
5
- symbol: _s
6
- integer: _i
7
- long: _l
8
- boolean: _b
9
- float: _f
10
- double: _d
11
- facet: _facet
12
- display: _display
13
- sort: _sort
14
- unstemmed_search: _unstem_search
2
+ default: searchable
3
+ searchable:
4
+ default: _t
5
+ date: _dt
6
+ string: _t
7
+ text: _t
8
+ symbol: _s
9
+ integer: _i
10
+ long: _l
11
+ boolean: _b
12
+ float: _f
13
+ double: _d
14
+ displayable: _display
15
+ facetable: _facet
16
+ sortable: _sort
17
+ unstemmed_searchable: _unstem_search
@@ -0,0 +1,18 @@
1
+ id: id
2
+ default: searchable
3
+ searchable:
4
+ date: _date
5
+ string: _field
6
+ text: _field
7
+ symbol: _field
8
+ integer: _field
9
+ long: _field
10
+ boolean: _field
11
+ float: _field
12
+ double: _field
13
+ displayable: _display
14
+ facetable: _facet
15
+ sortable: _sort
16
+ unstemmed_searchable: _unstem_search
17
+
18
+
@@ -4,85 +4,44 @@ require "nokogiri"
4
4
  require 'yaml'
5
5
 
6
6
  module Solrizer
7
- class Extractor
8
-
9
-
10
- def extract_tags(text)
11
- doc = REXML::Document.new( text )
12
- extract_tag(doc, 'archivist_tags').merge(extract_tag(doc, 'donor_tags'))
13
- end
14
7
 
15
- def extract_tag(doc, type)
16
- tags = doc.elements["/fields/#{type}"]
17
- return {} unless tags
18
- {type => tags.text.split(/,/).map {|t| t.strip}}
19
- end
8
+ # Provides utilities for extracting solr fields from a variety of objects and/or creating solr documents from a given object
9
+ # Note: These utilities are optional. You can implement .to_solr directly on your classes if you want to bypass using Extractors.
10
+ #
11
+ # Each of the Solrizer implementations provides its own Extractor module that extends the behaviors of Solrizer::Extractor
12
+ # with methods specific to that implementation (e.g. extract_tag, extract_rels_ext, xml_to_solr, html_to_solr)
13
+ #
14
+ class Extractor
20
15
 
21
-
22
- #
23
- # Extracts content-model and hydra-type from RELS-EXT datastream
16
+ # Populates a solr doc with values from a hash.
17
+ # Accepts two forms of hashes:
18
+ # => {'technology'=>["t1", "t2"], 'company'=>"c1", "person"=>["p1", "p2"]}
19
+ # or
20
+ # => {:facets => {'technology'=>["t1", "t2"], 'company'=>"c1", "person"=>["p1", "p2"]} }
24
21
  #
25
- def extract_rels_ext( text, solr_doc=Solr::Document.new )
26
- # TODO: only read in this file once
27
-
28
- if defined?(RAILS_ROOT)
29
- config_path = File.join(RAILS_ROOT, "config")
30
- else
31
- config_path = File.join(File.dirname(__FILE__), "..", "..", "config")
32
- end
33
- map = YAML.load(File.open(File.join(config_path, "hydra_types.yml")))
34
-
35
- doc = Nokogiri::XML(text)
36
- doc.xpath( '//foo:hasModel', 'foo' => 'info:fedora/fedora-system:def/model#' ).each do |element|
37
- cmodel = element.attributes['resource'].to_s
38
- solr_doc << Solr::Field.new( :cmodel_t => cmodel )
39
-
40
- if map.has_key?(cmodel)
41
- solr_doc << Solr::Field.new( :hydra_type_t => map[cmodel] )
22
+ # Note that values for individual fields can be a single string or an array of strings.
23
+ def extract_hash( input_hash, solr_doc=Solr::Document.new )
24
+ facets = input_hash.has_key?(:facets) ? input_hash[:facets] : input_hash
25
+ facets.each_pair do |facet_name, value|
26
+ case value.class.to_s
27
+ when "String"
28
+ solr_doc << Solr::Field.new( :"#{facet_name}_facet" => "#{value}" )
29
+ when "Array"
30
+ value.each { |v| solr_doc << Solr::Field.new( :"#{facet_name}_facet" => "#{v}" ) }
42
31
  end
43
32
  end
44
-
45
- return solr_doc
46
- end
47
-
48
- #
49
- # This method extracts solr fields from simple xml
50
- #
51
- def xml_to_solr( text, solr_doc=Solr::Document.new )
52
- doc = REXML::Document.new( text )
53
- doc.root.elements.each do |element|
54
- solr_doc << Solr::Field.new( :"#{element.name}_t" => "#{element.text}" )
55
- end
56
-
57
- return solr_doc
58
- end
59
-
60
- #
61
- # This method strips html tags out and returns content to be indexed in solr
62
- #
63
- def html_content_to_solr( ds, solr_doc=Solr::Document.new )
64
-
65
- text = CGI.unescapeHTML(ds.content)
66
- doc = Nokogiri::HTML(text)
67
33
 
68
- # html to story_display
69
- stories = doc.xpath('//story')
70
-
71
- stories.each do |story|
72
- solr_doc << Solr::Field.new(:story_display => story.children.to_xml)
34
+ if input_hash.has_key?(:symbols)
35
+ input_hash[:symbols].each do |symbol_name, value|
36
+ case value.class.to_s
37
+ when "String"
38
+ solr_doc << Solr::Field.new( :"#{symbol_name}_s" => "#{value}" )
39
+ when "Array"
40
+ value.each { |v| solr_doc << Solr::Field.new( :"#{symbol_name}_s" => "#{v}" ) }
41
+ end
42
+ end
73
43
  end
74
-
75
- #strip out text and put in story_t
76
- text_nodes = doc.xpath("//text()")
77
- text = String.new
78
-
79
- text_nodes.each do |text_node|
80
- text << text_node.content
81
- end
82
-
83
- solr_doc << Solr::Field.new(:story_t => text)
84
-
85
- return solr_doc
44
+ return solr_doc
86
45
  end
87
46
 
88
47
  end
@@ -0,0 +1,351 @@
1
+ require "loggable"
2
+ module Solrizer
3
+
4
+ # Maps Term names and values to Solr fields, based on the Term's data type and any index_as options.
5
+ #
6
+ # The basic structure of a mapper is:
7
+ #
8
+ # == Mapping on Index Type
9
+ #
10
+ # To define a custom mapper:
11
+ #
12
+ # class CustomMapper < Solrizer::FieldMapper
13
+ # index_as :searchable, :suffix => '_search'
14
+ # index_as :edible, :suffix => '_food'
15
+ # end
16
+ #
17
+ # # t.dish_name :index_as => [:searchable] -maps to-> dish_name_search
18
+ # # t.ingredients :index_as => [:searchable, :edible] -maps to-> ingredients_search, ingredients_food
19
+ #
20
+ # (See Solrizer::XML::TerminologyBasedSolrizer for instructions on applying a custom mapping once you have defined it.)
21
+ #
22
+ # == Default Index Types
23
+ #
24
+ # You can mark a particular index type as a default. It will then always be included unless terms explicitly
25
+ # exclude it with the "not_" prefix:
26
+ #
27
+ # class CustomMapper < Solrizer::FieldMapper
28
+ # index_as :searchable, :suffix => '_search', :default => true
29
+ # index_as :edible, :suffix => '_food'
30
+ # end
31
+ #
32
+ # # t.dish_name -maps to-> dish_name_search
33
+ # # t.ingredients :index_as => [:edible] -maps to-> ingredients_search, ingredients_food
34
+ # # t.secret_ingredients :index_as => [:not_searchable, :edible] -maps to-> secret_ingredients_food
35
+ #
36
+ # == Mapping on Data Type
37
+ #
38
+ # A mapper can apply different suffixes based on a term's data type:
39
+ #
40
+ # class CustomMapper < Solrizer::FieldMapper
41
+ # index_as :searchable, :suffix => '_search' do |type|
42
+ # type.date :suffix => '_date'
43
+ # type.integer :suffix => '_numeric'
44
+ # type.float :suffix => '_numeric'
45
+ # end
46
+ # index_as :edible, :suffix => '_food'
47
+ # end
48
+ #
49
+ # # t.published :type => :date, :index_as => [:searchable] -maps to-> published_date
50
+ # # t.votes :type => :integer, :index_as => [:searchable] -maps to-> votes_numeric
51
+ #
52
+ # If a specific data type doesn't appear in the list, the mapper falls back to the index_as:
53
+ #
54
+ # # t.description :type => :text, :index_as => [:searchable] -maps to-> description_search
55
+ #
56
+ # == Custom Value Converters
57
+ #
58
+ # All of the above applies to the generation of Solr names. Mappers can also provide custom conversion logic for the
59
+ # generation of Solr values by attaching a custom value converter block to a data type:
60
+ #
61
+ # require 'time'
62
+ #
63
+ # class CustomMapper < Solrizer::FieldMapper
64
+ # index_as :searchable, :suffix => '_search' do |type|
65
+ # type.date do |value|
66
+ # Time.parse(value).utc.to_i
67
+ # end
68
+ # end
69
+ # end
70
+ #
71
+ # Note that the nesting order is always:
72
+ #
73
+ # FieldMapper definition
74
+ # index_as
75
+ # data type
76
+ # value converter
77
+ #
78
+ # You can use the special data type "default" to apply custom value conversion to any data type:
79
+ #
80
+ # require 'time'
81
+ #
82
+ # class CustomMapper < Solrizer::FieldMapper
83
+ # index_as :searchable do |type|
84
+ # type.date :suffix => '_date' do |value|
85
+ # Time.parse(value).utc.to_i
86
+ # end
87
+ # type.default :suffix => '_search' do |value|
88
+ # value.to_s.strip
89
+ # end
90
+ # end
91
+ # end
92
+ #
93
+ # This example converts searchable dates to seconds since the Unix epoch (Time#to_i), and strips extra whitespace from all other searchable data types.
94
+ #
95
+ # Note that the :suffix option may appear on the data types and the index_as. The search order for the suffix on a field
96
+ # of type foo is:
97
+ # 1. type.foo
98
+ # 2. type.default
99
+ # 3. index_as
100
+ # The suffix is optional in all three places.
101
+ #
102
+ # Note that a single Term with multiple index types can translate into multiple Solr fields, because we may want Solr to
103
+ # index a single field in multiple ways. However, if two different mappings generate both the same solr field name
104
+ # _and_ the same value, the mapper will only emit a single field.
105
+ #
106
+ # == ID Field
107
+ #
108
+ # In addition to the normal field mappings, Solrizer gives special treatment to an ID field. If you want that
109
+ # logic (and you probably do), specify a name for this field:
110
+ #
111
+ # class CustomMapper < Solrizer::FieldMapper
112
+ # id_field 'id'
113
+ # end
114
+ #
115
+ # == Extending the Default
116
+ #
117
+ # The default mapper is Solrizer::FieldMapper::Default. You can customize the default mapping by subclassing it.
118
+ # For example, to override the ID field name and the default suffix for sortable, and inherit everything else:
119
+ #
120
+ # class CustomMapperBasedOnDefault < Solrizer::FieldMapper::Default
121
+ # id_field 'guid'
122
+ # index_as :sortable, :suffix => '_xsort'
123
+ # end
124
+
125
+ class FieldMapper
126
+
127
+ include Loggable
128
+
129
+ # ------ Class methods ------
130
+
131
+ @@instance_init_actions = Hash.new { |h,k| h[k] = [] }
132
+
133
+ def self.id_field(field_name)
134
+ add_instance_init_action do
135
+ @id_field = field_name
136
+ end
137
+ end
138
+
139
+ def self.index_as(index_type, opts = {}, &block)
140
+ add_instance_init_action do
141
+ mapping = (@mappings[index_type] ||= IndexTypeMapping.new)
142
+ mapping.opts.merge! opts
143
+ yield DataTypeMappingBuilder.new(mapping) if block_given?
144
+ end
145
+ end
146
+
147
# Loads solr mappings from a yml file and registers them on this mapper class,
# replacing any mappings previously registered on it.
#
# Layout of the file, as read by this method:
# * "id"      - name of the id field (falls back to "id" when the key is absent)
# * "default" - name of the index type that should be flagged with :default => true
# * any other key is an index type. A hash value maps data types to solr field name
#   suffixes; any other value is used directly as the suffix of that index type.
#
# This is meant as a simple entry point for working with solr mappings. For more powerful
# control over solr mappings, create your own subclasses of FieldMapper instead of using a yml file.
#
# @param [String] config_path path to the mappings yml file (the file itself, not its directory).
#   When nil, RAILS_ROOT/config/solr_mappings.yml is used if RAILS_ROOT is defined and that file
#   exists; otherwise config/solr_mappings.yml two directories above this source file is used.
def self.load_mappings( config_path=nil )
  if config_path.nil?
    if defined?(RAILS_ROOT)
      config_path = File.join(RAILS_ROOT, "config", "solr_mappings.yml")
    end
    # Default to using the config file within the gem
    unless File.exist?(config_path.to_s)
      config_path = File.join(File.dirname(__FILE__), "..", "..", "config", "solr_mappings.yml")
    end
  end

  logger.info("SOLRIZER: loading field name mappings from #{File.expand_path(config_path)}")
  # Block form of File.open closes the handle as soon as the YAML has been parsed.
  # NOTE(review): the mappings file is parsed with YAML.load and is assumed to be trusted.
  mappings_from_file = File.open(config_path) { |file| YAML.load(file) }

  clear_mappings

  # Set id_field from file if it is available
  id_field_from_file = mappings_from_file.delete("id")
  id_field(id_field_from_file.nil? ? "id" : id_field_from_file)

  default_index_type = mappings_from_file.delete("default")
  mappings_from_file.each_pair do |index_type, type_settings|
    is_default = (index_type == default_index_type)
    if type_settings.kind_of?(Hash)
      index_as index_type.to_sym, :default => is_default do |t|
        type_settings.each_pair do |field_type, suffix|
          # Dispatch the data type name directly instead of eval-ing generated source:
          # values from the file are never interpreted as Ruby code, and suffixes
          # containing quotes no longer break the call.
          t.public_send(field_type, :suffix => suffix)
        end
      end
    else
      index_as index_type.to_sym, :default => is_default, :suffix => type_settings
    end
  end
end
189
+
190
+ private
191
+
192
+ def self.add_instance_init_action(&block)
193
+ @@instance_init_actions[self] << lambda do |mapper|
194
+ mapper.instance_eval &block
195
+ end
196
+ end
197
+
198
+ def self.apply_instance_init_actions(instance)
199
+ if self.superclass.respond_to? :apply_instance_init_actions
200
+ self.superclass.apply_instance_init_actions(instance)
201
+ end
202
+ @@instance_init_actions[self].each do |action|
203
+ action.call(instance)
204
+ end
205
+ end
206
+
207
+ # Reset all of the mappings
208
+ def self.clear_mappings
209
+ logger.debug "resetting mappings for #{self.to_s}"
210
+ @@instance_init_actions[self] = []
211
+ end
212
+
213
+ public
214
+
215
+ # ------ Instance methods ------
216
+
217
+ attr_reader :id_field, :default_index_types, :mappings
218
+
219
+ def initialize
220
+ @mappings = {}
221
+ self.class.apply_instance_init_actions(self)
222
+ @default_index_types = @mappings.select { |ix_type, mapping| mapping.opts[:default] }.map(&:first)
223
+ end
224
+
225
+ # Given a specific field name, data type, and index type, returns the corresponding solr name.
226
+
227
+ def solr_name(field_name, field_type, index_type = :searchable)
228
+ name, mapping, data_type_mapping = solr_name_and_mappings(field_name, field_type, index_type)
229
+ name
230
+ end
231
+
232
# Given a field name-value pair, a data type, and an array of index types, returns a hash of
# mapped names and values. The values in the hash are _arrays_, and may contain multiple values.
#
# The default index types are always added to the requested ones. An entry of the form
# :not_xyz removes both itself and :xyz from the set. Index types for which
# solr_name_and_mappings returns no name are skipped.
#
# @param field_name name of the field; also handed to converters that take two arguments
# @param field_value value to index; run through the data type's converter when one is set
# @param field_type data type of the field, used to select the data type mapping
# @param [Array, nil] index_types requested index types; nil is treated as an empty list
# @return [Hash] solr field name => array of values, without duplicate values per name
def solr_names_and_values(field_name, field_value, field_type, index_types)
  # Determine the set of index types, adding defaults and removing not_xyz.
  # Array#+ builds a new array, so the caller's array is left untouched.
  index_types = ((index_types || []) + default_index_types).uniq
  index_types.dup.each do |index_type|
    exclusion = /^not_(.*)/.match(index_type.to_s)
    next unless exclusion
    index_types.delete index_type          # not_foo
    index_types.delete exclusion[1].to_sym # foo
  end

  # Map names and values
  results = {}

  index_types.each do |index_type|
    # Get mapping for field
    name, _mapping, data_type_mapping = solr_name_and_mappings(field_name, field_type, index_type)
    next unless name

    # Is there a custom converter? One-argument converters receive only the value,
    # all others receive the value and the field name.
    converter = data_type_mapping && data_type_mapping.converter
    value =
      if converter
        converter.arity == 1 ? converter.call(field_value) : converter.call(field_value, field_name)
      else
        field_value
      end

    # Add mapped name & value, unless it's a duplicate.
    # Array#include? is the core membership test; Array has no #contains?.
    values = (results[name] ||= [])
    values << value unless values.include?(value)
  end

  results
end
276
+
277
+ private
278
+
279
# Resolves the solr field name for a field under one index type, together with the
# mappings that were used to build it.
#
# The suffix is searched for in this order: the mapping for the field's data type,
# the :default data type mapping, then the index type's own options. A suffix is
# optional in all three places; when none is found the field name is returned as is.
#
# @param field_name name of the field (converted with to_s)
# @param field_type data type used to look up the data type mapping
# @param index_type key into @mappings
# @return [Array, nil] [solr_name, index_type_mapping, data_type_mapping], where
#   data_type_mapping may be nil; nil when index_type has no registered mapping
def solr_name_and_mappings(field_name, field_type, index_type)
  field_name = field_name.to_s
  mapping = @mappings[index_type]
  unless mapping
    logger.debug "Unknown index type '#{index_type}' for field #{field_name}"
    return nil
  end

  default_type_mapping = mapping.data_types[:default]
  data_type_mapping = mapping.data_types[field_type] || default_type_mapping

  suffix = data_type_mapping.opts[:suffix] if data_type_mapping
  # A data type that only defines a converter still inherits the :default suffix.
  suffix ||= default_type_mapping.opts[:suffix] if default_type_mapping
  suffix ||= mapping.opts[:suffix]

  # Interpolation turns a missing (nil) suffix into an empty string, whereas
  # String#+ would raise TypeError.
  name = "#{field_name}#{suffix}"

  [name, mapping, data_type_mapping]
end
295
+
296
+ class IndexTypeMapping
297
+ attr_accessor :opts, :data_types
298
+
299
+ def initialize
300
+ @opts = {}
301
+ @data_types = {}
302
+ end
303
+ end
304
+
305
+ class DataTypeMapping
306
+ attr_accessor :opts, :converter
307
+
308
+ def initialize
309
+ @opts = {}
310
+ end
311
+ end
312
+
313
+ class DataTypeMappingBuilder
314
+ def initialize(index_type_mapping)
315
+ @index_type_mapping = index_type_mapping
316
+ end
317
+
318
+ def method_missing(method, *args, &block)
319
+ data_type_mapping = (@index_type_mapping.data_types[method] ||= DataTypeMapping.new)
320
+ data_type_mapping.opts.merge! args[0] if args.length > 0
321
+ data_type_mapping.converter = block if block_given?
322
+ end
323
+ end
324
+
325
+ # ------ Default mapper ------
326
+
327
+ public
328
+
329
+ class Default < FieldMapper
330
+ id_field 'id'
331
+ index_as :searchable, :default => true do |t|
332
+ t.default :suffix => '_t'
333
+ t.date :suffix => '_dt'
334
+ t.string :suffix => '_t'
335
+ t.text :suffix => '_t'
336
+ t.symbol :suffix => '_s'
337
+ t.integer :suffix => '_i'
338
+ t.long :suffix => '_l'
339
+ t.boolean :suffix => '_b'
340
+ t.float :suffix => '_f'
341
+ t.double :suffix => '_d'
342
+ end
343
+ index_as :displayable, :suffix => '_display'
344
+ index_as :facetable, :suffix => '_facet'
345
+ index_as :sortable, :suffix => '_sort'
346
+ index_as :unstemmed_searchable, :suffix => '_unstem_search'
347
+ end
348
+
349
+ end
350
+
351
+ end