solrizer 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +3 -0
- data/Gemfile +14 -0
- data/Gemfile.lock +44 -0
- data/History.txt +8 -0
- data/Rakefile +10 -3
- data/VERSION +1 -1
- data/config/solr_mappings.yml +16 -13
- data/config/solr_mappings_af_0.1.yml +18 -0
- data/lib/solrizer/extractor.rb +31 -72
- data/lib/solrizer/field_mapper.rb +351 -0
- data/lib/solrizer/field_name_mapper.rb +37 -51
- data/lib/solrizer/html/extractor.rb +36 -0
- data/lib/solrizer/html.rb +7 -0
- data/lib/solrizer/xml/extractor.rb +31 -0
- data/lib/solrizer/xml/terminology_based_solrizer.rb +25 -29
- data/lib/solrizer/xml.rb +4 -1
- data/lib/solrizer.rb +2 -113
- data/lib/tasks/solrizer.rake +7 -27
- data/solrizer.gemspec +46 -26
- data/spec/{spec.opts → .rspec} +0 -0
- data/spec/fixtures/test_solr_mappings.yml +16 -0
- data/spec/spec_helper.rb +1 -0
- data/spec/units/extractor_spec.rb +43 -34
- data/spec/units/field_mapper_spec.rb +227 -0
- data/spec/units/field_name_mapper_spec.rb +16 -29
- data/spec/units/xml_extractor_spec.rb +28 -0
- data/spec/units/xml_terminology_based_solrizer_spec.rb +18 -5
- metadata +128 -35
- data/lib/solrizer/configuration.rb +0 -8
- data/lib/solrizer/indexer.rb +0 -261
- data/lib/solrizer/main.rb +0 -17
- data/lib/solrizer/replicator.rb +0 -143
- data/lib/solrizer/repository.rb +0 -54
- data/spec/fixtures/rels_ext_cmodel.xml +0 -8
- data/spec/fixtures/solr_mappings_af_0.1.yml +0 -16
- data/spec/integration/indexer_spec.rb +0 -18
- data/spec/units/indexer_spec.rb +0 -127
- data/spec/units/shelver_spec.rb +0 -42
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: solrizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 19
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
-
|
8
|
+
- 3
|
9
9
|
- 0
|
10
|
-
version: 0.
|
10
|
+
version: 0.3.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Matt Zumwalt
|
@@ -15,57 +15,151 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-
|
18
|
+
date: 2010-10-26 00:00:00 -05:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
22
|
-
name:
|
22
|
+
name: solr-ruby
|
23
23
|
prerelease: false
|
24
24
|
requirement: &id001 !ruby/object:Gem::Requirement
|
25
25
|
none: false
|
26
26
|
requirements:
|
27
27
|
- - ">="
|
28
28
|
- !ruby/object:Gem::Version
|
29
|
-
hash:
|
29
|
+
hash: 3
|
30
30
|
segments:
|
31
|
-
-
|
32
|
-
|
33
|
-
- 5
|
34
|
-
version: 1.1.5
|
31
|
+
- 0
|
32
|
+
version: "0"
|
35
33
|
type: :runtime
|
36
34
|
version_requirements: *id001
|
37
35
|
- !ruby/object:Gem::Dependency
|
38
|
-
name:
|
36
|
+
name: nokogiri
|
39
37
|
prerelease: false
|
40
38
|
requirement: &id002 !ruby/object:Gem::Requirement
|
41
39
|
none: false
|
42
40
|
requirements:
|
43
41
|
- - ">="
|
44
42
|
- !ruby/object:Gem::Version
|
45
|
-
hash:
|
43
|
+
hash: 3
|
46
44
|
segments:
|
47
|
-
- 1
|
48
|
-
- 0
|
49
45
|
- 0
|
50
|
-
version:
|
46
|
+
version: "0"
|
51
47
|
type: :runtime
|
52
48
|
version_requirements: *id002
|
53
49
|
- !ruby/object:Gem::Dependency
|
54
|
-
name:
|
50
|
+
name: om
|
55
51
|
prerelease: false
|
56
52
|
requirement: &id003 !ruby/object:Gem::Requirement
|
57
53
|
none: false
|
58
54
|
requirements:
|
59
55
|
- - ">="
|
60
56
|
- !ruby/object:Gem::Version
|
61
|
-
hash:
|
57
|
+
hash: 3
|
58
|
+
segments:
|
59
|
+
- 0
|
60
|
+
version: "0"
|
61
|
+
type: :runtime
|
62
|
+
version_requirements: *id003
|
63
|
+
- !ruby/object:Gem::Dependency
|
64
|
+
name: nokogiri
|
65
|
+
prerelease: false
|
66
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
67
|
+
none: false
|
68
|
+
requirements:
|
69
|
+
- - ">="
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
hash: 3
|
72
|
+
segments:
|
73
|
+
- 0
|
74
|
+
version: "0"
|
75
|
+
type: :runtime
|
76
|
+
version_requirements: *id004
|
77
|
+
- !ruby/object:Gem::Dependency
|
78
|
+
name: mediashelf-loggable
|
79
|
+
prerelease: false
|
80
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ">="
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
hash: 3
|
86
|
+
segments:
|
87
|
+
- 0
|
88
|
+
version: "0"
|
89
|
+
type: :runtime
|
90
|
+
version_requirements: *id005
|
91
|
+
- !ruby/object:Gem::Dependency
|
92
|
+
name: jeweler
|
93
|
+
prerelease: false
|
94
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
95
|
+
none: false
|
96
|
+
requirements:
|
97
|
+
- - ">="
|
98
|
+
- !ruby/object:Gem::Version
|
99
|
+
hash: 3
|
100
|
+
segments:
|
101
|
+
- 0
|
102
|
+
version: "0"
|
103
|
+
type: :development
|
104
|
+
version_requirements: *id006
|
105
|
+
- !ruby/object:Gem::Dependency
|
106
|
+
name: ruby-debug
|
107
|
+
prerelease: false
|
108
|
+
requirement: &id007 !ruby/object:Gem::Requirement
|
109
|
+
none: false
|
110
|
+
requirements:
|
111
|
+
- - ">="
|
112
|
+
- !ruby/object:Gem::Version
|
113
|
+
hash: 3
|
114
|
+
segments:
|
115
|
+
- 0
|
116
|
+
version: "0"
|
117
|
+
type: :development
|
118
|
+
version_requirements: *id007
|
119
|
+
- !ruby/object:Gem::Dependency
|
120
|
+
name: ruby-debug-base
|
121
|
+
prerelease: false
|
122
|
+
requirement: &id008 !ruby/object:Gem::Requirement
|
123
|
+
none: false
|
124
|
+
requirements:
|
125
|
+
- - ">="
|
126
|
+
- !ruby/object:Gem::Version
|
127
|
+
hash: 3
|
128
|
+
segments:
|
129
|
+
- 0
|
130
|
+
version: "0"
|
131
|
+
type: :development
|
132
|
+
version_requirements: *id008
|
133
|
+
- !ruby/object:Gem::Dependency
|
134
|
+
name: rspec
|
135
|
+
prerelease: false
|
136
|
+
requirement: &id009 !ruby/object:Gem::Requirement
|
137
|
+
none: false
|
138
|
+
requirements:
|
139
|
+
- - <
|
140
|
+
- !ruby/object:Gem::Version
|
141
|
+
hash: 15
|
62
142
|
segments:
|
63
|
-
- 1
|
64
143
|
- 2
|
65
|
-
-
|
66
|
-
|
144
|
+
- 0
|
145
|
+
- 0
|
146
|
+
version: 2.0.0
|
67
147
|
type: :development
|
68
|
-
version_requirements: *
|
148
|
+
version_requirements: *id009
|
149
|
+
- !ruby/object:Gem::Dependency
|
150
|
+
name: mocha
|
151
|
+
prerelease: false
|
152
|
+
requirement: &id010 !ruby/object:Gem::Requirement
|
153
|
+
none: false
|
154
|
+
requirements:
|
155
|
+
- - ">="
|
156
|
+
- !ruby/object:Gem::Version
|
157
|
+
hash: 3
|
158
|
+
segments:
|
159
|
+
- 0
|
160
|
+
version: "0"
|
161
|
+
type: :development
|
162
|
+
version_requirements: *id010
|
69
163
|
description: Use solrizer to populate solr indexes from Fedora repository content or from other sources. You can run solrizer from within your apps, using the provided rake tasks, or as a JMS listener
|
70
164
|
email: matt.zumwalt@yourmediashelf.com
|
71
165
|
executables: []
|
@@ -77,6 +171,8 @@ extra_rdoc_files:
|
|
77
171
|
- README.textile
|
78
172
|
files:
|
79
173
|
- .gitignore
|
174
|
+
- Gemfile
|
175
|
+
- Gemfile.lock
|
80
176
|
- History.txt
|
81
177
|
- LICENSE
|
82
178
|
- README.textile
|
@@ -86,34 +182,32 @@ files:
|
|
86
182
|
- config/hydra_types.yml
|
87
183
|
- config/solr.yml
|
88
184
|
- config/solr_mappings.yml
|
185
|
+
- config/solr_mappings_af_0.1.yml
|
89
186
|
- lib/solrizer.rb
|
90
|
-
- lib/solrizer/configuration.rb
|
91
187
|
- lib/solrizer/extractor.rb
|
188
|
+
- lib/solrizer/field_mapper.rb
|
92
189
|
- lib/solrizer/field_name_mapper.rb
|
93
|
-
- lib/solrizer/
|
94
|
-
- lib/solrizer/
|
95
|
-
- lib/solrizer/replicator.rb
|
96
|
-
- lib/solrizer/repository.rb
|
190
|
+
- lib/solrizer/html.rb
|
191
|
+
- lib/solrizer/html/extractor.rb
|
97
192
|
- lib/solrizer/xml.rb
|
193
|
+
- lib/solrizer/xml/extractor.rb
|
98
194
|
- lib/solrizer/xml/terminology_based_solrizer.rb
|
99
195
|
- lib/tasks/solrizer.rake
|
100
196
|
- solrizer.gemspec
|
197
|
+
- spec/.rspec
|
101
198
|
- spec/fixtures/druid-bv448hq0314-descMetadata.xml
|
102
199
|
- spec/fixtures/druid-bv448hq0314-extProperties.xml
|
103
200
|
- spec/fixtures/druid-cm234kq4672-extProperties.xml
|
104
201
|
- spec/fixtures/druid-cm234kq4672-stories.xml
|
105
202
|
- spec/fixtures/druid-hc513kw4806-descMetadata.xml
|
106
203
|
- spec/fixtures/mods_articles/hydrangea_article1.xml
|
107
|
-
- spec/fixtures/
|
108
|
-
- spec/fixtures/solr_mappings_af_0.1.yml
|
109
|
-
- spec/integration/indexer_spec.rb
|
204
|
+
- spec/fixtures/test_solr_mappings.yml
|
110
205
|
- spec/rcov.opts
|
111
|
-
- spec/spec.opts
|
112
206
|
- spec/spec_helper.rb
|
113
207
|
- spec/units/extractor_spec.rb
|
208
|
+
- spec/units/field_mapper_spec.rb
|
114
209
|
- spec/units/field_name_mapper_spec.rb
|
115
|
-
- spec/units/
|
116
|
-
- spec/units/shelver_spec.rb
|
210
|
+
- spec/units/xml_extractor_spec.rb
|
117
211
|
- spec/units/xml_terminology_based_solrizer_spec.rb
|
118
212
|
has_rdoc: true
|
119
213
|
homepage: http://github.com/projecthydra/solrizer
|
@@ -150,10 +244,9 @@ signing_key:
|
|
150
244
|
specification_version: 3
|
151
245
|
summary: A utility for building solr indexes, usually from Fedora repository content.
|
152
246
|
test_files:
|
153
|
-
- spec/integration/indexer_spec.rb
|
154
247
|
- spec/spec_helper.rb
|
155
248
|
- spec/units/extractor_spec.rb
|
249
|
+
- spec/units/field_mapper_spec.rb
|
156
250
|
- spec/units/field_name_mapper_spec.rb
|
157
|
-
- spec/units/
|
158
|
-
- spec/units/shelver_spec.rb
|
251
|
+
- spec/units/xml_extractor_spec.rb
|
159
252
|
- spec/units/xml_terminology_based_solrizer_spec.rb
|
@@ -1,8 +0,0 @@
|
|
1
|
-
|
2
|
-
# FEDORA_URL = 'http://fedoraAdmin:fedoraAdmin@salt-dev.stanford.edu/fedora'
|
3
|
-
# FEDORA_SOLR_URL = 'http://salt-dev.stanford.edu:8080/solr'
|
4
|
-
# SHELVER_SOLR_URL = 'http://sulwebappdev1.stanford.edu:8100/salt_solr'
|
5
|
-
FEDORA_URL = 'http://fedoraAdmin:fedoraAdmin@localhost:8080/fedora'
|
6
|
-
FEDORA_SOLR_URL = 'http://localhost:8080/solr'
|
7
|
-
SHELVER_SOLR_URL = 'http://localhost:8080/bl_solr'
|
8
|
-
|
data/lib/solrizer/indexer.rb
DELETED
@@ -1,261 +0,0 @@
|
|
1
|
-
require 'solr'
|
2
|
-
require 'solrizer/extractor'
|
3
|
-
require 'solrizer/repository'
|
4
|
-
|
5
|
-
|
6
|
-
module Solrizer
|
7
|
-
class Indexer
|
8
|
-
#
|
9
|
-
# Class variables
|
10
|
-
#
|
11
|
-
@@unique_id = 0
|
12
|
-
|
13
|
-
def self.unique_id
|
14
|
-
@@unique_id
|
15
|
-
end
|
16
|
-
|
17
|
-
#
|
18
|
-
# Member variables
|
19
|
-
#
|
20
|
-
attr_accessor :connection, :extractor, :index_full_text
|
21
|
-
|
22
|
-
#
|
23
|
-
# This method performs initialization tasks
|
24
|
-
#
|
25
|
-
def initialize( opts={} )
|
26
|
-
@@index_list = false unless defined?(@@index_list)
|
27
|
-
@extractor = Extractor.new
|
28
|
-
|
29
|
-
if opts[:index_full_text] == true || opts[:index_full_text] == "true"
|
30
|
-
@index_full_text = true
|
31
|
-
else
|
32
|
-
@index_full_text = false
|
33
|
-
end
|
34
|
-
|
35
|
-
connect
|
36
|
-
end
|
37
|
-
|
38
|
-
#
|
39
|
-
# This method connects to the Solr instance
|
40
|
-
#
|
41
|
-
def connect
|
42
|
-
|
43
|
-
if ActiveFedora.fedora_config.empty?
|
44
|
-
ActiveFedora.init
|
45
|
-
end
|
46
|
-
|
47
|
-
if defined?(Blacklight)
|
48
|
-
solr_config = Blacklight.solr_config
|
49
|
-
else
|
50
|
-
|
51
|
-
if defined?(RAILS_ROOT)
|
52
|
-
config_path = File.join(RAILS_ROOT, "config")
|
53
|
-
yaml = YAML.load(File.open(File.join(config_path, "solr.yml")))
|
54
|
-
solr_config = yaml[RAILS_ENV]
|
55
|
-
puts solr_config.inspect
|
56
|
-
else
|
57
|
-
config_path = File.join(File.dirname(__FILE__), "..", "..", "config")
|
58
|
-
yaml = YAML.load(File.open(File.join(config_path, "solr.yml")))
|
59
|
-
|
60
|
-
|
61
|
-
if ENV["environment"].nil?
|
62
|
-
environment = "development"
|
63
|
-
else
|
64
|
-
environment = ENV["environment"]
|
65
|
-
end
|
66
|
-
|
67
|
-
solr_config = yaml[environment]
|
68
|
-
puts solr_config.inspect
|
69
|
-
end
|
70
|
-
|
71
|
-
end
|
72
|
-
|
73
|
-
if index_full_text == true
|
74
|
-
url = solr_config['fulltext']['url']
|
75
|
-
elsif solr_config.has_key?("default")
|
76
|
-
url = solr_config['default']['url']
|
77
|
-
else
|
78
|
-
url = solr_config['url']
|
79
|
-
end
|
80
|
-
@connection = Solr::Connection.new(url, :autocommit => :on )
|
81
|
-
end
|
82
|
-
|
83
|
-
#
|
84
|
-
# This method extracts the facet categories from the given Fedora object's external tag datastream
|
85
|
-
#
|
86
|
-
def extract_xml_to_solr( obj, ds_name, solr_doc=Solr::Document.new )
|
87
|
-
xml_ds = Repository.get_datastream( obj, ds_name )
|
88
|
-
extractor.xml_to_solr( xml_ds.content, solr_doc )
|
89
|
-
end
|
90
|
-
|
91
|
-
#
|
92
|
-
#
|
93
|
-
#
|
94
|
-
def extract_rels_ext( obj, ds_name, solr_doc=Solr::Document.new )
|
95
|
-
rels_ext_ds = Repository.get_datastream( obj, ds_name )
|
96
|
-
extractor.extract_rels_ext( rels_ext_ds.content, solr_doc )
|
97
|
-
end
|
98
|
-
|
99
|
-
#
|
100
|
-
# This method generates the month and day facets from the date_t in solr_doc
|
101
|
-
#
|
102
|
-
|
103
|
-
def generate_dates(solr_doc)
|
104
|
-
|
105
|
-
# This will check for valid dates, but it seems most of the dates are currently invalid....
|
106
|
-
#date_check = /^(19|20)\d\d([- \/.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])/
|
107
|
-
|
108
|
-
#if there is not date_t, add on with easy-to-find value
|
109
|
-
if solr_doc[:date_t].nil?
|
110
|
-
solr_doc << Solr::Field.new( :date_t => "9999-99-99")
|
111
|
-
end #if
|
112
|
-
|
113
|
-
# unless date_check !~ solr_doc[:date_t]
|
114
|
-
date_obj = Date._parse(solr_doc[:date_t])
|
115
|
-
|
116
|
-
if date_obj[:mon].nil?
|
117
|
-
solr_doc << Solr::Field.new(:month_facet => 99)
|
118
|
-
elsif 0 < date_obj[:mon] && date_obj[:mon] < 13
|
119
|
-
solr_doc << Solr::Field.new( :month_facet => date_obj[:mon].to_s.rjust(2, '0'))
|
120
|
-
else
|
121
|
-
solr_doc << Solr::Field.new( :month_facet => 99)
|
122
|
-
end
|
123
|
-
|
124
|
-
if date_obj[:mday].nil?
|
125
|
-
solr_doc << Solr::Field.new( :day_facet => 99)
|
126
|
-
elsif 0 < date_obj[:mday] && date_obj[:mday] < 32
|
127
|
-
solr_doc << Solr::Field.new( :day_facet => date_obj[:mday].to_s.rjust(2, '0'))
|
128
|
-
else
|
129
|
-
solr_doc << Solr::Field.new( :day_facet => 99)
|
130
|
-
end
|
131
|
-
|
132
|
-
return solr_doc
|
133
|
-
# end
|
134
|
-
|
135
|
-
end
|
136
|
-
|
137
|
-
|
138
|
-
#
|
139
|
-
# This method creates a Solr-formatted XML document
|
140
|
-
#
|
141
|
-
def create_document( obj )
|
142
|
-
|
143
|
-
solr_doc = Solr::Document.new
|
144
|
-
|
145
|
-
model_klazz_array = ActiveFedora::ContentModel.known_models_for( obj )
|
146
|
-
model_klazz_array.delete(ActiveFedora::Base)
|
147
|
-
|
148
|
-
# If the object was passed in as an ActiveFedora::Base, call to_solr in order to get the base field entries from ActiveFedora::Base
|
149
|
-
# Otherwise, the object was passed in as a model instance other than ActiveFedora::Base,so call its to_solr method & allow it to insert the fields from ActiveFedora::Base
|
150
|
-
if obj.class == ActiveFedora::Base
|
151
|
-
solr_doc = obj.to_solr(solr_doc)
|
152
|
-
puts " added base fields from #{obj.class.to_s}"
|
153
|
-
else
|
154
|
-
solr_doc = obj.to_solr(solr_doc)
|
155
|
-
model_klazz_array.delete(obj.class)
|
156
|
-
puts " added base fields from #{obj.class.to_s} and model fields from #{obj.class.to_s}"
|
157
|
-
end
|
158
|
-
|
159
|
-
# Load the object as an instance of each of its other models and get the corresponding solr fields
|
160
|
-
# Include :model_only=>true in the options in order to avoid adding the metadata from ActiveFedora::Base every time.
|
161
|
-
model_klazz_array.each do |klazz|
|
162
|
-
instance = klazz.load_instance(obj.pid)
|
163
|
-
solr_doc = instance.to_solr(solr_doc, :model_only=>true)
|
164
|
-
puts " added solr fields from #{klazz.to_s}"
|
165
|
-
end
|
166
|
-
|
167
|
-
solr_doc << Solr::Field.new( :id_t => "#{obj.pid}" )
|
168
|
-
solr_doc << Solr::Field.new( :id => "#{obj.pid}" ) unless solr_doc[:id]
|
169
|
-
|
170
|
-
# increment the unique id to ensure that all documents in the search index are unique
|
171
|
-
@@unique_id += 1
|
172
|
-
|
173
|
-
return solr_doc
|
174
|
-
end
|
175
|
-
|
176
|
-
#
|
177
|
-
# This method adds a document to the Solr search index
|
178
|
-
#
|
179
|
-
def index( obj )
|
180
|
-
# print "Indexing '#{obj.pid}'..."
|
181
|
-
begin
|
182
|
-
|
183
|
-
solr_doc = create_document( obj )
|
184
|
-
connection.add( solr_doc )
|
185
|
-
|
186
|
-
# puts connection.url
|
187
|
-
#puts solr_doc
|
188
|
-
# puts "done"
|
189
|
-
|
190
|
-
# rescue Exception => e
|
191
|
-
# p "unable to index #{obj.pid}. Failed with #{e.inspect}"
|
192
|
-
end
|
193
|
-
|
194
|
-
end
|
195
|
-
|
196
|
-
#
|
197
|
-
# This method queries the Solr search index and returns a response
|
198
|
-
#
|
199
|
-
def query( query_str )
|
200
|
-
response = conn.query( query_str )
|
201
|
-
end
|
202
|
-
|
203
|
-
#
|
204
|
-
# This method prints out the results of the given query string by iterating through all the hits
|
205
|
-
#
|
206
|
-
def printResults( query_str )
|
207
|
-
query( query_str ) do |hit|
|
208
|
-
puts hit.inspect
|
209
|
-
end
|
210
|
-
end
|
211
|
-
|
212
|
-
#
|
213
|
-
# This method deletes a document from the Solr search index by id
|
214
|
-
#
|
215
|
-
def deleteDocument( id )
|
216
|
-
connection.delete( id )
|
217
|
-
end
|
218
|
-
|
219
|
-
# Populates a solr doc with values from a hash.
|
220
|
-
# Accepts two forms of hashes:
|
221
|
-
# => {'technology'=>["t1", "t2"], 'company'=>"c1", "person"=>["p1", "p2"]}
|
222
|
-
# or
|
223
|
-
# => {:facets => {'technology'=>["t1", "t2"], 'company'=>"c1", "person"=>["p1", "p2"]} }
|
224
|
-
#
|
225
|
-
# Note that values for individual fields can be a single string or an array of strings.
|
226
|
-
def self.solrize( input_hash, solr_doc=Solr::Document.new )
|
227
|
-
facets = input_hash.has_key?(:facets) ? input_hash[:facets] : input_hash
|
228
|
-
facets.each_pair do |facet_name, value|
|
229
|
-
case value.class.to_s
|
230
|
-
when "String"
|
231
|
-
solr_doc << Solr::Field.new( :"#{facet_name}_facet" => "#{value}" )
|
232
|
-
when "Array"
|
233
|
-
value.each { |v| solr_doc << Solr::Field.new( :"#{facet_name}_facet" => "#{v}" ) }
|
234
|
-
end
|
235
|
-
end
|
236
|
-
|
237
|
-
if input_hash.has_key?(:symbols)
|
238
|
-
input_hash[:symbols].each do |symbol_name, value|
|
239
|
-
case value.class.to_s
|
240
|
-
when "String"
|
241
|
-
solr_doc << Solr::Field.new( :"#{symbol_name}_s" => "#{value}" )
|
242
|
-
when "Array"
|
243
|
-
value.each { |v| solr_doc << Solr::Field.new( :"#{symbol_name}_s" => "#{v}" ) }
|
244
|
-
end
|
245
|
-
end
|
246
|
-
end
|
247
|
-
return solr_doc
|
248
|
-
end
|
249
|
-
|
250
|
-
|
251
|
-
private :connect, :create_document
|
252
|
-
|
253
|
-
def class_exists?(class_name)
|
254
|
-
klass = Module.const_get(class_name)
|
255
|
-
return klass.is_a?(Class)
|
256
|
-
rescue NameError
|
257
|
-
return false
|
258
|
-
end
|
259
|
-
|
260
|
-
end
|
261
|
-
end
|
data/lib/solrizer/main.rb
DELETED
@@ -1,17 +0,0 @@
|
|
1
|
-
#!/bin/env ruby
|
2
|
-
|
3
|
-
@index_full_text = false
|
4
|
-
|
5
|
-
require 'rubygems'
|
6
|
-
load 'configuration.rb'
|
7
|
-
load 'repository.rb'
|
8
|
-
load 'solrizer.rb'
|
9
|
-
|
10
|
-
# initialize connection to Fedora repository
|
11
|
-
repository = Repository.new
|
12
|
-
repository.initialize_repository
|
13
|
-
|
14
|
-
# solrize all objects in the Fedora repository
|
15
|
-
solrizer = Solrizer.new
|
16
|
-
solrizer.solrize_objects
|
17
|
-
|
data/lib/solrizer/replicator.rb
DELETED
@@ -1,143 +0,0 @@
|
|
1
|
-
require 'fastercsv'
|
2
|
-
REPLICATOR_LIST = false unless defined?(REPLICATOR_LIST)
|
3
|
-
|
4
|
-
|
5
|
-
module Solrizer
|
6
|
-
class Replicator
|
7
|
-
|
8
|
-
include Stanford::SaltControllerHelper
|
9
|
-
attr_accessor :dest_repo, :configs
|
10
|
-
|
11
|
-
def initialize
|
12
|
-
config_path = "#{RAILS_ROOT}/config/replicator.yml"
|
13
|
-
raw_configs = YAML::load(File.open(config_path))
|
14
|
-
@configs = raw_configs[RAILS_ENV]
|
15
|
-
@dest_repo = Fedora::Repository.new(configs["destination"]["fedora"]["url"])
|
16
|
-
|
17
|
-
ActiveFedora.fedora_config[:url] = configs["source"]["fedora"]["url"]
|
18
|
-
logger.info("REPLICATOR: re-initializing Fedora with fedora_config: #{ActiveFedora.fedora_config.inspect}")
|
19
|
-
|
20
|
-
Fedora::Repository.register(ActiveFedora.fedora_config[:url])
|
21
|
-
logger.info("REPLICATOR: re-initialized Fedora as: #{Fedora::Repository.instance.inspect}")
|
22
|
-
|
23
|
-
# Register Solr
|
24
|
-
ActiveFedora.solr_config[:url] = configs["source"]["solr"]["url"]
|
25
|
-
|
26
|
-
logger.info("REPLICATOR: re-initializing ActiveFedora::SolrService with solr_config: #{ActiveFedora.solr_config.inspect}")
|
27
|
-
|
28
|
-
ActiveFedora::SolrService.register(ActiveFedora.solr_config[:url])
|
29
|
-
|
30
|
-
end
|
31
|
-
|
32
|
-
def replicate_objects
|
33
|
-
# retrieve a list of all the pids in the fedora repository
|
34
|
-
num_docs = 1000000 # modify this number to guarantee that all the objects are retrieved from the repository
|
35
|
-
|
36
|
-
if REPLICATOR_LIST == false
|
37
|
-
|
38
|
-
pids = Repository.get_pids( num_docs )
|
39
|
-
puts "Replicating #{pids.length} Fedora objects"
|
40
|
-
pids.each do |pid|
|
41
|
-
unless pid[0].empty? || pid[0].nil? || !pid[0].include?("druid:")
|
42
|
-
puts "Processing #{pid}"
|
43
|
-
replicate_object( pid )
|
44
|
-
end #unless
|
45
|
-
end #pids.each
|
46
|
-
|
47
|
-
else
|
48
|
-
|
49
|
-
if File.exists?(REPLICATOR_LIST)
|
50
|
-
arr_of_pids = FasterCSV.read(REPLICATOR_LIST, :headers=>false)
|
51
|
-
|
52
|
-
puts "Replicating from list at #{REPLICATOR_LIST}"
|
53
|
-
puts "Replicating #{arr_of_pids.length} Fedora objects"
|
54
|
-
|
55
|
-
arr_of_pids.each do |row|
|
56
|
-
pid = row[0]
|
57
|
-
replicate_object( pid )
|
58
|
-
end #FASTERCSV
|
59
|
-
|
60
|
-
else
|
61
|
-
puts "#{REPLICATOR_LIST} does not exists!"
|
62
|
-
end #if File.exists
|
63
|
-
|
64
|
-
end #if Index_LISTS
|
65
|
-
end #replicate_objects
|
66
|
-
|
67
|
-
|
68
|
-
def replicate_object(obj)
|
69
|
-
#source_doc = Document.load_instance(pid)
|
70
|
-
obj = obj.kind_of?(ActiveFedora::Base) ? obj : Repository.get_object( obj )
|
71
|
-
p "Indexing object #{obj.pid} with label #{obj.label}"
|
72
|
-
begin
|
73
|
-
unless obj.nil?
|
74
|
-
create_stub(obj)
|
75
|
-
p "Successfully replicated #{obj.pid}"
|
76
|
-
end
|
77
|
-
rescue Exception => e
|
78
|
-
p "unable to create stub. Failed with #{e.inspect}"
|
79
|
-
end
|
80
|
-
end
|
81
|
-
|
82
|
-
# Creates a stub object in @dest_repo with the datastreams that we need in the stubs
|
83
|
-
def create_stub(source_object)
|
84
|
-
|
85
|
-
begin
|
86
|
-
|
87
|
-
jp2 = downloadables(source_object, :canonical=>true, :mime_type=>"image/jp2")
|
88
|
-
jp2.new_object = true
|
89
|
-
jp2.control_group = 'M'
|
90
|
-
jp2.blob = jp2.content
|
91
|
-
|
92
|
-
stub_object = Fedora::FedoraObject.new(:pid=>source_object.pid)
|
93
|
-
dest_repo.save(stub_object)
|
94
|
-
dest_repo.save(jp2)
|
95
|
-
|
96
|
-
["properties", "extProperties", "descMetadata", "location"].each do |ds_name|
|
97
|
-
ds = source_object.datastreams[ds_name]
|
98
|
-
ds.new_object = true
|
99
|
-
ds.blob = ds.content
|
100
|
-
dest_repo.save(ds)
|
101
|
-
end
|
102
|
-
|
103
|
-
rescue
|
104
|
-
#for object without jp2s
|
105
|
-
#this is a temp fix to the downloadables() issue
|
106
|
-
|
107
|
-
|
108
|
-
pid = source_object.pid
|
109
|
-
p "> #{pid}"
|
110
|
-
|
111
|
-
jp2_file = File.new('spec/fixtures/image.jp2')
|
112
|
-
ds = ActiveFedora::Datastream.new(:dsID => "image.jp2", :dsLabel => 'image.jp2', :controlGroup => 'M', :blob => jp2_file)
|
113
|
-
source_object.add_datastream(ds)
|
114
|
-
source_object.save
|
115
|
-
# source_object = Document.load_instance(pid)
|
116
|
-
source_object = ActiveFedora::Base.load_instance(pid)
|
117
|
-
stub_object = Fedora::FedoraObject.new(:pid=>source_object.pid)
|
118
|
-
dest_repo.save(stub_object)
|
119
|
-
|
120
|
-
jp2 = downloadables(source_object, :canonical=>true, :mime_type=>"image/jp2")
|
121
|
-
jp2.new_object = true
|
122
|
-
jp2.control_group = 'M'
|
123
|
-
jp2.blob = jp2.content
|
124
|
-
|
125
|
-
stub_object = Fedora::FedoraObject.new(:pid=>source_object.pid)
|
126
|
-
dest_repo.save(stub_object)
|
127
|
-
dest_repo.save(jp2)
|
128
|
-
|
129
|
-
["properties", "extProperties", "descMetadata", "location"].each do |ds_name|
|
130
|
-
ds = source_object.datastreams[ds_name]
|
131
|
-
ds.new_object = true
|
132
|
-
ds.blob = ds.content
|
133
|
-
dest_repo.save(ds)
|
134
|
-
end
|
135
|
-
|
136
|
-
end
|
137
|
-
end
|
138
|
-
def logger
|
139
|
-
@logger ||= defined?(RAILS_DEFAULT_LOGGER) ? RAILS_DEFAULT_LOGGER : Logger.new(STDOUT)
|
140
|
-
end
|
141
|
-
|
142
|
-
end
|
143
|
-
end
|