solrizer 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +3 -0
- data/Gemfile +14 -0
- data/Gemfile.lock +44 -0
- data/History.txt +8 -0
- data/Rakefile +10 -3
- data/VERSION +1 -1
- data/config/solr_mappings.yml +16 -13
- data/config/solr_mappings_af_0.1.yml +18 -0
- data/lib/solrizer/extractor.rb +31 -72
- data/lib/solrizer/field_mapper.rb +351 -0
- data/lib/solrizer/field_name_mapper.rb +37 -51
- data/lib/solrizer/html/extractor.rb +36 -0
- data/lib/solrizer/html.rb +7 -0
- data/lib/solrizer/xml/extractor.rb +31 -0
- data/lib/solrizer/xml/terminology_based_solrizer.rb +25 -29
- data/lib/solrizer/xml.rb +4 -1
- data/lib/solrizer.rb +2 -113
- data/lib/tasks/solrizer.rake +7 -27
- data/solrizer.gemspec +46 -26
- data/spec/{spec.opts → .rspec} +0 -0
- data/spec/fixtures/test_solr_mappings.yml +16 -0
- data/spec/spec_helper.rb +1 -0
- data/spec/units/extractor_spec.rb +43 -34
- data/spec/units/field_mapper_spec.rb +227 -0
- data/spec/units/field_name_mapper_spec.rb +16 -29
- data/spec/units/xml_extractor_spec.rb +28 -0
- data/spec/units/xml_terminology_based_solrizer_spec.rb +18 -5
- metadata +128 -35
- data/lib/solrizer/configuration.rb +0 -8
- data/lib/solrizer/indexer.rb +0 -261
- data/lib/solrizer/main.rb +0 -17
- data/lib/solrizer/replicator.rb +0 -143
- data/lib/solrizer/repository.rb +0 -54
- data/spec/fixtures/rels_ext_cmodel.xml +0 -8
- data/spec/fixtures/solr_mappings_af_0.1.yml +0 -16
- data/spec/integration/indexer_spec.rb +0 -18
- data/spec/units/indexer_spec.rb +0 -127
- data/spec/units/shelver_spec.rb +0 -42
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: solrizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 19
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
- 2
|
8
|
+
- 3
|
9
9
|
- 0
|
10
|
-
version: 0.2.0
|
10
|
+
version: 0.3.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Matt Zumwalt
|
@@ -15,57 +15,151 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-
|
18
|
+
date: 2010-10-26 00:00:00 -05:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
22
|
-
name:
|
22
|
+
name: solr-ruby
|
23
23
|
prerelease: false
|
24
24
|
requirement: &id001 !ruby/object:Gem::Requirement
|
25
25
|
none: false
|
26
26
|
requirements:
|
27
27
|
- - ">="
|
28
28
|
- !ruby/object:Gem::Version
|
29
|
-
hash:
|
29
|
+
hash: 3
|
30
30
|
segments:
|
31
|
-
-
|
32
|
-
|
33
|
-
- 5
|
34
|
-
version: 1.1.5
|
31
|
+
- 0
|
32
|
+
version: "0"
|
35
33
|
type: :runtime
|
36
34
|
version_requirements: *id001
|
37
35
|
- !ruby/object:Gem::Dependency
|
38
|
-
name:
|
36
|
+
name: nokogiri
|
39
37
|
prerelease: false
|
40
38
|
requirement: &id002 !ruby/object:Gem::Requirement
|
41
39
|
none: false
|
42
40
|
requirements:
|
43
41
|
- - ">="
|
44
42
|
- !ruby/object:Gem::Version
|
45
|
-
hash:
|
43
|
+
hash: 3
|
46
44
|
segments:
|
47
|
-
- 1
|
48
|
-
- 0
|
49
45
|
- 0
|
50
|
-
version:
|
46
|
+
version: "0"
|
51
47
|
type: :runtime
|
52
48
|
version_requirements: *id002
|
53
49
|
- !ruby/object:Gem::Dependency
|
54
|
-
name:
|
50
|
+
name: om
|
55
51
|
prerelease: false
|
56
52
|
requirement: &id003 !ruby/object:Gem::Requirement
|
57
53
|
none: false
|
58
54
|
requirements:
|
59
55
|
- - ">="
|
60
56
|
- !ruby/object:Gem::Version
|
61
|
-
hash:
|
57
|
+
hash: 3
|
58
|
+
segments:
|
59
|
+
- 0
|
60
|
+
version: "0"
|
61
|
+
type: :runtime
|
62
|
+
version_requirements: *id003
|
63
|
+
- !ruby/object:Gem::Dependency
|
64
|
+
name: nokogiri
|
65
|
+
prerelease: false
|
66
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
67
|
+
none: false
|
68
|
+
requirements:
|
69
|
+
- - ">="
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
hash: 3
|
72
|
+
segments:
|
73
|
+
- 0
|
74
|
+
version: "0"
|
75
|
+
type: :runtime
|
76
|
+
version_requirements: *id004
|
77
|
+
- !ruby/object:Gem::Dependency
|
78
|
+
name: mediashelf-loggable
|
79
|
+
prerelease: false
|
80
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ">="
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
hash: 3
|
86
|
+
segments:
|
87
|
+
- 0
|
88
|
+
version: "0"
|
89
|
+
type: :runtime
|
90
|
+
version_requirements: *id005
|
91
|
+
- !ruby/object:Gem::Dependency
|
92
|
+
name: jeweler
|
93
|
+
prerelease: false
|
94
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
95
|
+
none: false
|
96
|
+
requirements:
|
97
|
+
- - ">="
|
98
|
+
- !ruby/object:Gem::Version
|
99
|
+
hash: 3
|
100
|
+
segments:
|
101
|
+
- 0
|
102
|
+
version: "0"
|
103
|
+
type: :development
|
104
|
+
version_requirements: *id006
|
105
|
+
- !ruby/object:Gem::Dependency
|
106
|
+
name: ruby-debug
|
107
|
+
prerelease: false
|
108
|
+
requirement: &id007 !ruby/object:Gem::Requirement
|
109
|
+
none: false
|
110
|
+
requirements:
|
111
|
+
- - ">="
|
112
|
+
- !ruby/object:Gem::Version
|
113
|
+
hash: 3
|
114
|
+
segments:
|
115
|
+
- 0
|
116
|
+
version: "0"
|
117
|
+
type: :development
|
118
|
+
version_requirements: *id007
|
119
|
+
- !ruby/object:Gem::Dependency
|
120
|
+
name: ruby-debug-base
|
121
|
+
prerelease: false
|
122
|
+
requirement: &id008 !ruby/object:Gem::Requirement
|
123
|
+
none: false
|
124
|
+
requirements:
|
125
|
+
- - ">="
|
126
|
+
- !ruby/object:Gem::Version
|
127
|
+
hash: 3
|
128
|
+
segments:
|
129
|
+
- 0
|
130
|
+
version: "0"
|
131
|
+
type: :development
|
132
|
+
version_requirements: *id008
|
133
|
+
- !ruby/object:Gem::Dependency
|
134
|
+
name: rspec
|
135
|
+
prerelease: false
|
136
|
+
requirement: &id009 !ruby/object:Gem::Requirement
|
137
|
+
none: false
|
138
|
+
requirements:
|
139
|
+
- - <
|
140
|
+
- !ruby/object:Gem::Version
|
141
|
+
hash: 15
|
62
142
|
segments:
|
63
|
-
- 1
|
64
143
|
- 2
|
65
|
-
-
|
66
|
-
|
144
|
+
- 0
|
145
|
+
- 0
|
146
|
+
version: 2.0.0
|
67
147
|
type: :development
|
68
|
-
version_requirements: *id003
|
148
|
+
version_requirements: *id009
|
149
|
+
- !ruby/object:Gem::Dependency
|
150
|
+
name: mocha
|
151
|
+
prerelease: false
|
152
|
+
requirement: &id010 !ruby/object:Gem::Requirement
|
153
|
+
none: false
|
154
|
+
requirements:
|
155
|
+
- - ">="
|
156
|
+
- !ruby/object:Gem::Version
|
157
|
+
hash: 3
|
158
|
+
segments:
|
159
|
+
- 0
|
160
|
+
version: "0"
|
161
|
+
type: :development
|
162
|
+
version_requirements: *id010
|
69
163
|
description: Use solrizer to populate solr indexes from Fedora repository content or from other sources. You can run solrizer from within your apps, using the provided rake tasks, or as a JMS listener
|
70
164
|
email: matt.zumwalt@yourmediashelf.com
|
71
165
|
executables: []
|
@@ -77,6 +171,8 @@ extra_rdoc_files:
|
|
77
171
|
- README.textile
|
78
172
|
files:
|
79
173
|
- .gitignore
|
174
|
+
- Gemfile
|
175
|
+
- Gemfile.lock
|
80
176
|
- History.txt
|
81
177
|
- LICENSE
|
82
178
|
- README.textile
|
@@ -86,34 +182,32 @@ files:
|
|
86
182
|
- config/hydra_types.yml
|
87
183
|
- config/solr.yml
|
88
184
|
- config/solr_mappings.yml
|
185
|
+
- config/solr_mappings_af_0.1.yml
|
89
186
|
- lib/solrizer.rb
|
90
|
-
- lib/solrizer/configuration.rb
|
91
187
|
- lib/solrizer/extractor.rb
|
188
|
+
- lib/solrizer/field_mapper.rb
|
92
189
|
- lib/solrizer/field_name_mapper.rb
|
93
|
-
- lib/solrizer/indexer.rb
|
94
|
-
- lib/solrizer/main.rb
|
95
|
-
- lib/solrizer/replicator.rb
|
96
|
-
- lib/solrizer/repository.rb
|
190
|
+
- lib/solrizer/html.rb
|
191
|
+
- lib/solrizer/html/extractor.rb
|
97
192
|
- lib/solrizer/xml.rb
|
193
|
+
- lib/solrizer/xml/extractor.rb
|
98
194
|
- lib/solrizer/xml/terminology_based_solrizer.rb
|
99
195
|
- lib/tasks/solrizer.rake
|
100
196
|
- solrizer.gemspec
|
197
|
+
- spec/.rspec
|
101
198
|
- spec/fixtures/druid-bv448hq0314-descMetadata.xml
|
102
199
|
- spec/fixtures/druid-bv448hq0314-extProperties.xml
|
103
200
|
- spec/fixtures/druid-cm234kq4672-extProperties.xml
|
104
201
|
- spec/fixtures/druid-cm234kq4672-stories.xml
|
105
202
|
- spec/fixtures/druid-hc513kw4806-descMetadata.xml
|
106
203
|
- spec/fixtures/mods_articles/hydrangea_article1.xml
|
107
|
-
- spec/fixtures/rels_ext_cmodel.xml
|
108
|
-
- spec/fixtures/solr_mappings_af_0.1.yml
|
109
|
-
- spec/integration/indexer_spec.rb
|
204
|
+
- spec/fixtures/test_solr_mappings.yml
|
110
205
|
- spec/rcov.opts
|
111
|
-
- spec/spec.opts
|
112
206
|
- spec/spec_helper.rb
|
113
207
|
- spec/units/extractor_spec.rb
|
208
|
+
- spec/units/field_mapper_spec.rb
|
114
209
|
- spec/units/field_name_mapper_spec.rb
|
115
|
-
- spec/units/indexer_spec.rb
|
116
|
-
- spec/units/shelver_spec.rb
|
210
|
+
- spec/units/xml_extractor_spec.rb
|
117
211
|
- spec/units/xml_terminology_based_solrizer_spec.rb
|
118
212
|
has_rdoc: true
|
119
213
|
homepage: http://github.com/projecthydra/solrizer
|
@@ -150,10 +244,9 @@ signing_key:
|
|
150
244
|
specification_version: 3
|
151
245
|
summary: A utility for building solr indexes, usually from Fedora repository content.
|
152
246
|
test_files:
|
153
|
-
- spec/integration/indexer_spec.rb
|
154
247
|
- spec/spec_helper.rb
|
155
248
|
- spec/units/extractor_spec.rb
|
249
|
+
- spec/units/field_mapper_spec.rb
|
156
250
|
- spec/units/field_name_mapper_spec.rb
|
157
|
-
- spec/units/indexer_spec.rb
|
158
|
-
- spec/units/shelver_spec.rb
|
251
|
+
- spec/units/xml_extractor_spec.rb
|
159
252
|
- spec/units/xml_terminology_based_solrizer_spec.rb
|
@@ -1,8 +0,0 @@
|
|
1
|
-
|
2
|
-
# Connection URLs used by the solrizer scripts.
#
# Each constant can be overridden by setting an environment variable of the
# same name before this file is loaded; when the variable is unset the
# localhost value below is used, which matches the previous hard-coded value.
#
# Remote endpoints that were previously kept here as commented-out alternatives:
#   FEDORA_URL       = 'http://fedoraAdmin:fedoraAdmin@salt-dev.stanford.edu/fedora'
#   FEDORA_SOLR_URL  = 'http://salt-dev.stanford.edu:8080/solr'
#   SHELVER_SOLR_URL = 'http://sulwebappdev1.stanford.edu:8100/salt_solr'
FEDORA_URL       = ENV['FEDORA_URL']       || 'http://fedoraAdmin:fedoraAdmin@localhost:8080/fedora'
FEDORA_SOLR_URL  = ENV['FEDORA_SOLR_URL']  || 'http://localhost:8080/solr'
SHELVER_SOLR_URL = ENV['SHELVER_SOLR_URL'] || 'http://localhost:8080/bl_solr'
|
8
|
-
|
data/lib/solrizer/indexer.rb
DELETED
@@ -1,261 +0,0 @@
|
|
1
|
-
require 'solr'
|
2
|
-
require 'solrizer/extractor'
|
3
|
-
require 'solrizer/repository'
|
4
|
-
|
5
|
-
|
6
|
-
module Solrizer
  # Builds Solr documents from repository objects and sends them to a Solr
  # index through a Solr::Connection.
  class Indexer
    # Facet value written when a month or day cannot be read from date_t.
    UNKNOWN_DATE_PART = 99

    # Incremented once for every document built by #create_document.
    @@unique_id = 0

    # Returns the current value of the document counter.
    def self.unique_id
      @@unique_id
    end

    attr_accessor :connection, :extractor, :index_full_text

    # Creates the extractor, records the full-text flag and connects to Solr.
    #
    # opts[:index_full_text] - true or "true" turns the flag on; any other
    #                          value (or no value) leaves it off.
    def initialize(opts = {})
      @@index_list = false unless defined?(@@index_list)
      @extractor = Extractor.new
      @index_full_text = [true, "true"].include?(opts[:index_full_text])

      connect
    end

    # Opens the Solr connection and stores it in #connection.
    #
    # The Solr settings come from Blacklight when it is defined, otherwise
    # from config/solr.yml under RAILS_ROOT (keyed by RAILS_ENV), otherwise
    # from the config/solr.yml two directories above this file (keyed by
    # ENV["environment"], defaulting to "development").
    def connect
      ActiveFedora.init if ActiveFedora.fedora_config.empty?

      solr_config =
        if defined?(Blacklight)
          Blacklight.solr_config
        elsif defined?(RAILS_ROOT)
          read_solr_config(File.join(RAILS_ROOT, "config"), RAILS_ENV)
        else
          bundled_config_dir = File.join(File.dirname(__FILE__), "..", "..", "config")
          read_solr_config(bundled_config_dir, ENV["environment"] || "development")
        end

      url =
        if index_full_text == true
          solr_config['fulltext']['url']
        elsif solr_config.has_key?("default")
          solr_config['default']['url']
        else
          solr_config['url']
        end

      @connection = Solr::Connection.new(url, :autocommit => :on)
    end

    # Fetches datastream +ds_name+ of +obj+ through Repository.get_datastream
    # and passes its content to the extractor's xml_to_solr.
    def extract_xml_to_solr(obj, ds_name, solr_doc = Solr::Document.new)
      xml_ds = Repository.get_datastream(obj, ds_name)
      extractor.xml_to_solr(xml_ds.content, solr_doc)
    end

    # Fetches datastream +ds_name+ of +obj+ through Repository.get_datastream
    # and passes its content to the extractor's extract_rels_ext.
    def extract_rels_ext(obj, ds_name, solr_doc = Solr::Document.new)
      rels_ext_ds = Repository.get_datastream(obj, ds_name)
      extractor.extract_rels_ext(rels_ext_ds.content, solr_doc)
    end

    # Adds month_facet and day_facet fields derived from the document's
    # date_t value and returns the document.
    #
    # When date_t is missing, the easy-to-find placeholder "9999-99-99" is
    # added first. A month outside 1..12 or a day outside 1..31 (or one that
    # cannot be parsed at all) is recorded as 99; valid parts are written as
    # two-character, zero-padded strings.
    def generate_dates(solr_doc)
      solr_doc << Solr::Field.new(:date_t => "9999-99-99") if solr_doc[:date_t].nil?

      parts = Date._parse(solr_doc[:date_t])
      solr_doc << Solr::Field.new(:month_facet => date_part_facet(parts[:mon], 12))
      solr_doc << Solr::Field.new(:day_facet => date_part_facet(parts[:mday], 31))

      solr_doc
    end

    # Builds the Solr document for +obj+.
    #
    # The object's own to_solr supplies the first set of fields; the object is
    # then loaded as each of its other known models (ActiveFedora::Base and its
    # own class excluded) and each instance adds its fields with
    # :model_only => true. id_t is always added; id only when not already set.
    def create_document(obj)
      model_klazz_array = ActiveFedora::ContentModel.known_models_for(obj)
      model_klazz_array.delete(ActiveFedora::Base)

      solr_doc = obj.to_solr(Solr::Document.new)
      if obj.class == ActiveFedora::Base
        puts " added base fields from #{obj.class.to_s}"
      else
        # obj already contributed its model fields above; don't load it again
        model_klazz_array.delete(obj.class)
        puts " added base fields from #{obj.class.to_s} and model fields from #{obj.class.to_s}"
      end

      model_klazz_array.each do |klazz|
        instance = klazz.load_instance(obj.pid)
        solr_doc = instance.to_solr(solr_doc, :model_only => true)
        puts " added solr fields from #{klazz.to_s}"
      end

      solr_doc << Solr::Field.new(:id_t => "#{obj.pid}")
      solr_doc << Solr::Field.new(:id => "#{obj.pid}") unless solr_doc[:id]

      @@unique_id += 1

      solr_doc
    end

    # Builds the document for +obj+ and adds it to the index.
    # Errors from building or adding are not rescued here.
    def index(obj)
      connection.add(create_document(obj))
    end

    # Sends +query_str+ to the Solr connection and returns its response.
    # A block, when given, is forwarded to the connection's query call.
    #
    # This previously referenced an undefined local (+conn+) and dropped the
    # block, so it raised NameError and #printResults could never print.
    def query(query_str, &block)
      connection.query(query_str, &block)
    end

    # Prints the inspect form of everything the query yields.
    def printResults(query_str)
      query(query_str) { |hit| puts hit.inspect }
    end

    # Deletes the document with the given id from the index.
    def deleteDocument(id)
      connection.delete(id)
    end

    # Populates a solr doc with values from a hash.
    # Accepts two forms of hashes:
    # => {'technology'=>["t1", "t2"], 'company'=>"c1", "person"=>["p1", "p2"]}
    # or
    # => {:facets => {'technology'=>["t1", "t2"], 'company'=>"c1", "person"=>["p1", "p2"]} }
    #
    # Facet entries become "<name>_facet" fields. When the hash has a :symbols
    # key, its entries become "<name>_s" fields. Values may be a single string
    # or an array; values of any other type are skipped.
    def self.solrize(input_hash, solr_doc = Solr::Document.new)
      facets = input_hash.has_key?(:facets) ? input_hash[:facets] : input_hash
      append_fields(solr_doc, facets, "facet")
      append_fields(solr_doc, input_hash[:symbols], "s") if input_hash.has_key?(:symbols)
      solr_doc
    end

    # Adds one "<name>_<suffix>" field per string value in +values_by_name+.
    def self.append_fields(solr_doc, values_by_name, suffix)
      values_by_name.each_pair do |name, value|
        entries = case value
                  when String then [value]
                  when Array  then value
                  else []
                  end
        entries.each { |v| solr_doc << Solr::Field.new(:"#{name}_#{suffix}" => "#{v}") }
      end
    end
    private_class_method :append_fields

    # Returns true when +class_name+ names a constant that is a Class,
    # false when the constant is missing or is not a Class.
    def class_exists?(class_name)
      Module.const_get(class_name).is_a?(Class)
    rescue NameError
      false
    end

    # Reads solr.yml from +config_dir+, prints the section for +environment+
    # and returns it. load_file is used so the file handle is closed.
    def read_solr_config(config_dir, environment)
      settings = YAML.load_file(File.join(config_dir, "solr.yml"))[environment]
      puts settings.inspect
      settings
    end

    # Returns +value+ as a two-character zero-padded string when it lies in
    # 1..max, otherwise UNKNOWN_DATE_PART.
    def date_part_facet(value, max)
      return UNKNOWN_DATE_PART if value.nil? || value < 1 || value > max
      value.to_s.rjust(2, '0')
    end

    private :connect, :create_document, :read_solr_config, :date_part_facet
  end
end
|
data/lib/solrizer/main.rb
DELETED
@@ -1,17 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
# Script entry point: loads the configuration, repository and solrizer files,
# initializes the repository, then solrizes every object in it.
#
# The interpreter is located through /usr/bin/env, the conventional location
# of env; /bin/env is not present on every system.

# NOTE(review): this instance variable is not read anywhere in this script;
# presumably one of the files loaded below reads it from the top-level
# object - confirm before removing.
@index_full_text = false

require 'rubygems'
load 'configuration.rb'
load 'repository.rb'
load 'solrizer.rb'

# initialize connection to Fedora repository
repository = Repository.new
repository.initialize_repository

# solrize all objects in the Fedora repository
solrizer = Solrizer.new
solrizer.solrize_objects
|
17
|
-
|
data/lib/solrizer/replicator.rb
DELETED
@@ -1,143 +0,0 @@
|
|
1
|
-
require 'fastercsv'
|
2
|
-
REPLICATOR_LIST = false unless defined?(REPLICATOR_LIST)
|
3
|
-
|
4
|
-
|
5
|
-
module Solrizer
  # Copies objects from the source Fedora repository into a destination
  # repository as stub objects carrying a jp2 datastream plus a fixed set of
  # metadata datastreams. Source and destination URLs are read from
  # config/replicator.yml under RAILS_ROOT, keyed by RAILS_ENV.
  class Replicator

    include Stanford::SaltControllerHelper

    # Datastreams copied from the source object onto every stub.
    STUB_DATASTREAMS = ["properties", "extProperties", "descMetadata", "location"].freeze

    attr_accessor :dest_repo, :configs

    # Loads the replicator configuration, creates the destination repository
    # and re-registers Fedora and Solr against the configured source URLs.
    def initialize
      config_path = "#{RAILS_ROOT}/config/replicator.yml"
      # load_file closes the file once it has been parsed
      @configs = YAML.load_file(config_path)[RAILS_ENV]
      @dest_repo = Fedora::Repository.new(configs["destination"]["fedora"]["url"])

      ActiveFedora.fedora_config[:url] = configs["source"]["fedora"]["url"]
      logger.info("REPLICATOR: re-initializing Fedora with fedora_config: #{ActiveFedora.fedora_config.inspect}")

      Fedora::Repository.register(ActiveFedora.fedora_config[:url])
      logger.info("REPLICATOR: re-initialized Fedora as: #{Fedora::Repository.instance.inspect}")

      # Register Solr
      ActiveFedora.solr_config[:url] = configs["source"]["solr"]["url"]
      logger.info("REPLICATOR: re-initializing ActiveFedora::SolrService with solr_config: #{ActiveFedora.solr_config.inspect}")

      ActiveFedora::SolrService.register(ActiveFedora.solr_config[:url])
    end

    # Replicates every object in scope.
    #
    # When REPLICATOR_LIST is false, pids come from Repository.get_pids and
    # only entries whose first element contains "druid:" are processed.
    # Otherwise REPLICATOR_LIST is treated as the path of a headerless CSV
    # file whose first column holds the pids.
    def replicate_objects
      # modify this number to guarantee that all the objects are retrieved from the repository
      num_docs = 1000000

      if REPLICATOR_LIST == false
        pids = Repository.get_pids(num_docs)
        puts "Replicating #{pids.length} Fedora objects"
        pids.each do |pid|
          # nil has to be ruled out before empty?/include? are sent to pid[0]
          next if pid[0].nil? || pid[0].empty? || !pid[0].include?("druid:")
          puts "Processing #{pid}"
          # NOTE(review): the whole entry is passed on here, while the CSV
          # branch below passes row[0] - confirm which form get_object expects.
          replicate_object(pid)
        end
      elsif File.exist?(REPLICATOR_LIST)
        arr_of_pids = FasterCSV.read(REPLICATOR_LIST, :headers => false)

        puts "Replicating from list at #{REPLICATOR_LIST}"
        puts "Replicating #{arr_of_pids.length} Fedora objects"

        arr_of_pids.each { |row| replicate_object(row[0]) }
      else
        puts "#{REPLICATOR_LIST} does not exists!"
      end
    end

    # Replicates a single object.
    #
    # obj - an ActiveFedora::Base instance, or a value that
    #       Repository.get_object can resolve to one.
    #
    # Returns without doing anything when the object cannot be resolved.
    # Errors raised while creating the stub are printed, not re-raised;
    # only StandardError is rescued so that interrupts and exits still
    # stop a long-running batch.
    def replicate_object(obj)
      obj = obj.kind_of?(ActiveFedora::Base) ? obj : Repository.get_object(obj)
      return if obj.nil?

      p "Indexing object #{obj.pid} with label #{obj.label}"
      begin
        create_stub(obj)
        p "Successfully replicated #{obj.pid}"
      rescue StandardError => e
        p "unable to create stub. Failed with #{e.inspect}"
      end
    end

    # Creates a stub object in @dest_repo with the datastreams that we need in the stubs
    #
    # If the copy fails (for example because the source object has no jp2),
    # a placeholder image from spec/fixtures/image.jp2 is attached to the
    # source object, the object is saved and reloaded, and the copy is
    # attempted again. This is a temporary fix for the downloadables() issue.
    def create_stub(source_object)
      copy_stub(source_object)
    rescue
      pid = source_object.pid
      p "> #{pid}"

      # block form closes the fixture file once the source object is saved
      File.open('spec/fixtures/image.jp2') do |jp2_file|
        ds = ActiveFedora::Datastream.new(:dsID => "image.jp2", :dsLabel => 'image.jp2', :controlGroup => 'M', :blob => jp2_file)
        source_object.add_datastream(ds)
        source_object.save
      end

      source_object = ActiveFedora::Base.load_instance(pid)
      # NOTE(review): the stub is saved here and again inside copy_stub; this
      # double save is how the fallback has always behaved - confirm whether
      # the first save is needed.
      dest_repo.save(Fedora::FedoraObject.new(:pid => source_object.pid))
      copy_stub(source_object)
    end

    # Saves a stub for +source_object+ in the destination repository, followed
    # by its canonical jp2 and each datastream named in STUB_DATASTREAMS.
    def copy_stub(source_object)
      jp2 = downloadables(source_object, :canonical => true, :mime_type => "image/jp2")
      jp2.new_object = true
      jp2.control_group = 'M'
      jp2.blob = jp2.content

      stub_object = Fedora::FedoraObject.new(:pid => source_object.pid)
      dest_repo.save(stub_object)
      dest_repo.save(jp2)

      STUB_DATASTREAMS.each do |ds_name|
        ds = source_object.datastreams[ds_name]
        ds.new_object = true
        ds.blob = ds.content
        dest_repo.save(ds)
      end
    end
    private :copy_stub

    # Returns RAILS_DEFAULT_LOGGER when it is defined, otherwise a Logger
    # writing to STDOUT. The result is memoized.
    def logger
      @logger ||= defined?(RAILS_DEFAULT_LOGGER) ? RAILS_DEFAULT_LOGGER : Logger.new(STDOUT)
    end

  end
end
|