solrizer 1.2.2 → 2.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +2 -1
- data/History.txt +3 -0
- data/lib/solrizer/version.rb +1 -1
- data/lib/solrizer/xml/terminology_based_solrizer.rb +4 -1
- data/spec/fixtures/mods_article.rb +88 -0
- data/spec/units/xml_terminology_based_solrizer_spec.rb +13 -18
- metadata +19 -13
- data/.rvmrc +0 -35
- data/Gemfile.lock +0 -72
data/.gitignore
CHANGED
data/History.txt
CHANGED
data/lib/solrizer/version.rb
CHANGED
@@ -48,14 +48,17 @@ module Solrizer::XML::TerminologyBasedSolrizer
|
|
48
48
|
|
49
49
|
# Populate a solr document with solr fields corresponding to the given xml node
|
50
50
|
# Field names are generated using settings from the term in the +doc+'s terminology corresponding to +term_pointer+
|
51
|
+
# If the supplied term does not have an index_as attribute, no indexing will be performed.
|
51
52
|
# @param [Nokogiri::XML::Node] node to solrize
|
52
53
|
# @param [OM::XML::Document] doc document the node came from
|
53
54
|
# @param [Array] term_pointer Array pointing to the term that should be used for solrization settings
|
55
|
+
# @param [Term] term the term to be solrized
|
54
56
|
# @param [Hash] (optional) solr_doc (values hash) to populate
|
57
|
+
# @return [Hash] the solr doc
|
55
58
|
def self.solrize_node(node, doc, term_pointer, term, solr_doc = Hash.new, field_mapper = nil, opts = {})
|
59
|
+
return solr_doc unless term.index_as
|
56
60
|
field_mapper ||= self.default_field_mapper
|
57
61
|
terminology = doc.class.terminology
|
58
|
-
# term = terminology.retrieve_term(*term_pointer)
|
59
62
|
|
60
63
|
if term.path.kind_of?(Hash) && term.path.has_key?(:attribute)
|
61
64
|
node_value = node.value
|
@@ -0,0 +1,88 @@
|
|
1
|
+
module Samples
|
2
|
+
class ModsArticle
|
3
|
+
|
4
|
+
include OM::XML::Document
|
5
|
+
|
6
|
+
set_terminology do |t|
|
7
|
+
t.root(:path=>"mods", :xmlns=>"http://www.loc.gov/mods/v3", :schema=>"http://www.loc.gov/standards/mods/v3/mods-3-2.xsd", "xmlns:foo"=>"http://my.custom.namespace")
|
8
|
+
|
9
|
+
|
10
|
+
t.title_info(:path=>"titleInfo") {
|
11
|
+
t.main_title(:index_as=>[:facetable],:path=>"title", :label=>"title") {
|
12
|
+
t.main_title_lang(:path=>{:attribute=> "xml:lang"})
|
13
|
+
}
|
14
|
+
t.french_title(:ref=>[:title_info,:main_title], :attributes=>{"xml:lang"=>"fre"})
|
15
|
+
|
16
|
+
t.language(:index_as=>[:facetable],:path=>{:attribute=>"lang"})
|
17
|
+
}
|
18
|
+
t.language{
|
19
|
+
t.lang_code(:index_as=>[:facetable], :path=>"languageTerm", :attributes=>{:type=>"code"})
|
20
|
+
}
|
21
|
+
t.abstract(:index_as=>[])
|
22
|
+
t.subject {
|
23
|
+
t.topic(:index_as=>[:facetable])
|
24
|
+
}
|
25
|
+
t.topic_tag(:proxy=>[:subject, :topic])
|
26
|
+
# t.topic_tag(:index_as=>[:facetable],:path=>"subject", :default_content_path=>"topic")
|
27
|
+
# This is a mods:name. The underscore is purely to avoid namespace conflicts.
|
28
|
+
t.name_ {
|
29
|
+
# this is a namepart
|
30
|
+
t.namePart(:type=>:string, :label=>"generic name")
|
31
|
+
# affiliations are great
|
32
|
+
t.affiliation
|
33
|
+
t.institution(:path=>"affiliation", :index_as=>[:facetable], :label=>"organization")
|
34
|
+
t.displayForm
|
35
|
+
t.role(:ref=>[:role])
|
36
|
+
t.description(:index_as=>[:facetable])
|
37
|
+
t.date(:path=>"namePart", :attributes=>{:type=>"date"})
|
38
|
+
t.last_name(:path=>"namePart", :attributes=>{:type=>"family"}, :index_as=>[])
|
39
|
+
t.first_name(:path=>"namePart", :attributes=>{:type=>"given"}, :label=>"first name")
|
40
|
+
t.terms_of_address(:path=>"namePart", :attributes=>{:type=>"termsOfAddress"})
|
41
|
+
t.computing_id
|
42
|
+
t.name_content(:path=>"text()")
|
43
|
+
}
|
44
|
+
# lookup :person, :first_name
|
45
|
+
t.person(:ref=>:name, :attributes=>{:type=>"personal"}, :index_as=>[:facetable])
|
46
|
+
t.department(:proxy=>[:person,:description],:index_as=>[:facetable])
|
47
|
+
t.organization(:ref=>:name, :attributes=>{:type=>"corporate"}, :index_as=>[:facetable])
|
48
|
+
t.conference(:ref=>:name, :attributes=>{:type=>"conference"}, :index_as=>[:facetable])
|
49
|
+
t.role {
|
50
|
+
t.text(:path=>"roleTerm",:attributes=>{:type=>"text"}, :index_as=>[])
|
51
|
+
t.code(:path=>"roleTerm",:attributes=>{:type=>"code"})
|
52
|
+
}
|
53
|
+
t.journal(:path=>'relatedItem', :attributes=>{:type=>"host"}) {
|
54
|
+
t.title_info(:index_as=>[:facetable],:ref=>[:title_info])
|
55
|
+
t.origin_info(:path=>"originInfo") {
|
56
|
+
t.publisher
|
57
|
+
t.date_issued(:path=>"dateIssued")
|
58
|
+
t.issuance(:index_as=>[:facetable])
|
59
|
+
}
|
60
|
+
t.issn(:path=>"identifier", :attributes=>{:type=>"issn"})
|
61
|
+
t.issue(:path=>"part") {
|
62
|
+
t.volume(:path=>"detail", :attributes=>{:type=>"volume"}, :default_content_path=>"number")
|
63
|
+
t.level(:path=>"detail", :attributes=>{:type=>"number"}, :default_content_path=>"number")
|
64
|
+
t.extent
|
65
|
+
t.pages(:path=>"extent", :attributes=>{:unit=>"pages"}) {
|
66
|
+
t.start
|
67
|
+
t.end
|
68
|
+
}
|
69
|
+
t.start_page(:proxy=>[:pages, :start])
|
70
|
+
t.end_page(:proxy=>[:pages, :end])
|
71
|
+
t.publication_date(:path=>"date", :index_as=>[])
|
72
|
+
}
|
73
|
+
}
|
74
|
+
t.note
|
75
|
+
t.location(:path=>"location") {
|
76
|
+
t.url(:path=>"url")
|
77
|
+
}
|
78
|
+
t.publication_url(:proxy=>[:location,:url])
|
79
|
+
t.title(:proxy=>[:title_info, :main_title])
|
80
|
+
t.journal_title(:proxy=>[:journal, :title_info, :main_title])
|
81
|
+
end
|
82
|
+
|
83
|
+
# Changes from OM::Properties implementation
|
84
|
+
# renamed family_name => last_name
|
85
|
+
# start_page & end_page now accessible as [:journal, :issue, :pages, :start] (etc.)
|
86
|
+
|
87
|
+
end
|
88
|
+
end
|
@@ -1,15 +1,15 @@
|
|
1
1
|
require 'spec_helper'
|
2
|
-
require '
|
2
|
+
require 'fixtures/mods_article'
|
3
3
|
|
4
4
|
describe Solrizer::XML::TerminologyBasedSolrizer do
|
5
5
|
|
6
6
|
before(:all) do
|
7
|
-
|
7
|
+
Samples::ModsArticle.send(:include, Solrizer::XML::TerminologyBasedSolrizer)
|
8
8
|
end
|
9
9
|
|
10
10
|
before(:each) do
|
11
11
|
article_xml = fixture( File.join("mods_articles", "hydrangea_article1.xml") )
|
12
|
-
@mods_article =
|
12
|
+
@mods_article = Samples::ModsArticle.from_xml(article_xml)
|
13
13
|
end
|
14
14
|
|
15
15
|
describe ".to_solr" do
|
@@ -34,7 +34,7 @@ describe Solrizer::XML::TerminologyBasedSolrizer do
|
|
34
34
|
# ActiveFedora::NokogiriDatastream.stubs(:accessors).returns(mock_accessors)
|
35
35
|
solr_doc = Hash.new
|
36
36
|
@mods_article.field_mapper = Solrizer::FieldMapper::Default.new
|
37
|
-
|
37
|
+
Samples::ModsArticle.terminology.terms.each_pair do |k,v|
|
38
38
|
@mods_article.expects(:solrize_term).with(v, solr_doc, @mods_article.field_mapper)
|
39
39
|
end
|
40
40
|
@mods_article.to_solr(solr_doc)
|
@@ -49,21 +49,16 @@ describe Solrizer::XML::TerminologyBasedSolrizer do
|
|
49
49
|
solr_doc["abstract_t"].should == ["ABSTRACT"]
|
50
50
|
solr_doc["title_info_1_language_t"].should == ["finnish"]
|
51
51
|
solr_doc["person_1_role_0_text_t"].should == ["teacher"]
|
52
|
+
# No index_as on the code field.
|
53
|
+
solr_doc["person_1_role_0_code_t"].should be_nil
|
52
54
|
solr_doc["person_last_name_t"].sort.should == ["FAMILY NAME", "Gautama"]
|
53
|
-
|
54
|
-
# solr_doc["topic_tag_t"].sort.should == ["CONTROLLED TERM", "TOPIC1", "TOPIC2"]
|
55
|
+
solr_doc["topic_tag_t"].sort.should == ["CONTROLLED TERM", "TOPIC 1", "TOPIC 2"]
|
55
56
|
|
56
57
|
# These are a holdover from an old version of OM
|
57
|
-
#
|
58
|
-
|
58
|
+
puts "DOC: #{solr_doc.length}"
|
59
|
+
solr_doc['journal_0_issue_0_publication_date_t'].should == ["FEB. 2007"]
|
59
60
|
|
60
|
-
|
61
|
-
#
|
62
|
-
# solr_doc[:publisher_t].should be_nil
|
63
|
-
# solr_doc[:coverage_t].should be_nil
|
64
|
-
# solr_doc[:creation_date_dt].should be_nil
|
65
|
-
# solr_doc.should == ""
|
66
|
-
|
61
|
+
|
67
62
|
end
|
68
63
|
|
69
64
|
end
|
@@ -72,14 +67,14 @@ describe Solrizer::XML::TerminologyBasedSolrizer do
|
|
72
67
|
|
73
68
|
it "should add fields to a solr document for all nodes corresponding to the given term and its children" do
|
74
69
|
solr_doc = Hash.new
|
75
|
-
result = @mods_article.solrize_term(
|
70
|
+
result = @mods_article.solrize_term(Samples::ModsArticle.terminology.retrieve_term(:title_info), solr_doc)
|
76
71
|
result.should == solr_doc
|
77
|
-
# @mods_article.solrize_term(:title_info,
|
72
|
+
# @mods_article.solrize_term(:title_info, Samples::ModsArticle.terminology.retrieve_term(:title_info), :solr_doc=>solr_doc).should == ""
|
78
73
|
end
|
79
74
|
|
80
75
|
it "should add multiple fields based on index_as" do
|
81
76
|
fake_solr_doc = {}
|
82
|
-
term =
|
77
|
+
term = Samples::ModsArticle.terminology.retrieve_term(:name)
|
83
78
|
term.children[:namePart].index_as = [:displayable, :facetable]
|
84
79
|
|
85
80
|
@mods_article.solrize_term(term, fake_solr_doc)
|
metadata
CHANGED
@@ -1,13 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: solrizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
5
|
-
prerelease:
|
4
|
+
hash: -2879808512
|
5
|
+
prerelease: 6
|
6
6
|
segments:
|
7
|
-
- 1
|
8
|
-
- 2
|
9
7
|
- 2
|
10
|
-
|
8
|
+
- 0
|
9
|
+
- 0
|
10
|
+
- rc
|
11
|
+
- 1
|
12
|
+
version: 2.0.0.rc1
|
11
13
|
platform: ruby
|
12
14
|
authors:
|
13
15
|
- Matt Zumwalt
|
@@ -15,7 +17,8 @@ autorequire:
|
|
15
17
|
bindir: bin
|
16
18
|
cert_chain: []
|
17
19
|
|
18
|
-
date: 2012-
|
20
|
+
date: 2012-10-15 00:00:00 -05:00
|
21
|
+
default_executable:
|
19
22
|
dependencies:
|
20
23
|
- !ruby/object:Gem::Dependency
|
21
24
|
name: nokogiri
|
@@ -188,9 +191,7 @@ extra_rdoc_files:
|
|
188
191
|
- README.textile
|
189
192
|
files:
|
190
193
|
- .gitignore
|
191
|
-
- .rvmrc
|
192
194
|
- Gemfile
|
193
|
-
- Gemfile.lock
|
194
195
|
- History.txt
|
195
196
|
- LICENSE
|
196
197
|
- README.textile
|
@@ -222,6 +223,7 @@ files:
|
|
222
223
|
- spec/fixtures/druid-cm234kq4672-extProperties.xml
|
223
224
|
- spec/fixtures/druid-cm234kq4672-stories.xml
|
224
225
|
- spec/fixtures/druid-hc513kw4806-descMetadata.xml
|
226
|
+
- spec/fixtures/mods_article.rb
|
225
227
|
- spec/fixtures/mods_articles/hydrangea_article1.xml
|
226
228
|
- spec/fixtures/test_solr_mappings.yml
|
227
229
|
- spec/spec_helper.rb
|
@@ -230,6 +232,7 @@ files:
|
|
230
232
|
- spec/units/field_name_mapper_spec.rb
|
231
233
|
- spec/units/xml_extractor_spec.rb
|
232
234
|
- spec/units/xml_terminology_based_solrizer_spec.rb
|
235
|
+
has_rdoc: true
|
233
236
|
homepage: http://github.com/projecthydra/solrizer
|
234
237
|
licenses: []
|
235
238
|
|
@@ -250,16 +253,18 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
250
253
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
251
254
|
none: false
|
252
255
|
requirements:
|
253
|
-
- - "
|
256
|
+
- - ">"
|
254
257
|
- !ruby/object:Gem::Version
|
255
|
-
hash:
|
258
|
+
hash: 25
|
256
259
|
segments:
|
257
|
-
-
|
258
|
-
|
260
|
+
- 1
|
261
|
+
- 3
|
262
|
+
- 1
|
263
|
+
version: 1.3.1
|
259
264
|
requirements: []
|
260
265
|
|
261
266
|
rubyforge_project:
|
262
|
-
rubygems_version: 1.
|
267
|
+
rubygems_version: 1.6.2
|
263
268
|
signing_key:
|
264
269
|
specification_version: 3
|
265
270
|
summary: A utility for building solr indexes, usually from Fedora repository content with solrizer-fedora extension gem.
|
@@ -269,6 +274,7 @@ test_files:
|
|
269
274
|
- spec/fixtures/druid-cm234kq4672-extProperties.xml
|
270
275
|
- spec/fixtures/druid-cm234kq4672-stories.xml
|
271
276
|
- spec/fixtures/druid-hc513kw4806-descMetadata.xml
|
277
|
+
- spec/fixtures/mods_article.rb
|
272
278
|
- spec/fixtures/mods_articles/hydrangea_article1.xml
|
273
279
|
- spec/fixtures/test_solr_mappings.yml
|
274
280
|
- spec/spec_helper.rb
|
data/.rvmrc
DELETED
@@ -1,35 +0,0 @@
|
|
1
|
-
#!/usr/bin/env bash
|
2
|
-
|
3
|
-
# This is an RVM Project .rvmrc file, used to automatically load the ruby
|
4
|
-
# development environment upon cd'ing into the directory
|
5
|
-
|
6
|
-
ruby_string="ree-1.8.7"
|
7
|
-
gemset_name="solrizer"
|
8
|
-
|
9
|
-
#
|
10
|
-
rvm_install_on_use_flag=1
|
11
|
-
|
12
|
-
# Specify our desired <ruby>[@<gemset>], the @gemset name is optional.
|
13
|
-
environment_id="${ruby_string}@${gemset_name}"
|
14
|
-
|
15
|
-
# First, attempt to load the desired environment directly from the environment
|
16
|
-
# file. This is very fast and efficient compared to running through the entire
|
17
|
-
# CLI and selector. If you want feedback on which environment was used then
|
18
|
-
# insert the word 'use' after --create as this triggers verbose mode.
|
19
|
-
#
|
20
|
-
if [[ -d "${rvm_path:-$HOME/.rvm}/environments" \
|
21
|
-
&& -s "${rvm_path:-$HOME/.rvm}/environments/$environment_id" ]] ; then
|
22
|
-
\. "${rvm_path:-$HOME/.rvm}/environments/$environment_id"
|
23
|
-
else
|
24
|
-
# If the environment file has not yet been created, use the RVM CLI to select.
|
25
|
-
rvm --create "$environment_id"
|
26
|
-
fi
|
27
|
-
|
28
|
-
#(
|
29
|
-
# Ensure that Bundler is installed, install it if it is not.
|
30
|
-
if ! command -v bundle ; then
|
31
|
-
printf "The rubygem 'bundler' is not installed, installing it now.\n"
|
32
|
-
gem install bundler
|
33
|
-
fi
|
34
|
-
#)&
|
35
|
-
|
data/Gemfile.lock
DELETED
@@ -1,72 +0,0 @@
|
|
1
|
-
PATH
|
2
|
-
remote: .
|
3
|
-
specs:
|
4
|
-
solrizer (1.2.0)
|
5
|
-
daemons
|
6
|
-
mediashelf-loggable (~> 0.4.7)
|
7
|
-
nokogiri
|
8
|
-
om (>= 1.5.0)
|
9
|
-
stomp
|
10
|
-
xml-simple
|
11
|
-
|
12
|
-
GEM
|
13
|
-
remote: http://rubygems.org/
|
14
|
-
specs:
|
15
|
-
RedCloth (4.2.8)
|
16
|
-
archive-tar-minitar (0.5.2)
|
17
|
-
columnize (0.3.4)
|
18
|
-
daemons (1.1.8)
|
19
|
-
diff-lcs (1.1.3)
|
20
|
-
linecache (0.46)
|
21
|
-
rbx-require-relative (> 0.0.4)
|
22
|
-
linecache19 (0.5.12)
|
23
|
-
ruby_core_source (>= 0.1.4)
|
24
|
-
mediashelf-loggable (0.4.9)
|
25
|
-
metaclass (0.0.1)
|
26
|
-
mocha (0.10.0)
|
27
|
-
metaclass (~> 0.0.1)
|
28
|
-
nokogiri (1.5.2)
|
29
|
-
om (1.6.0)
|
30
|
-
mediashelf-loggable
|
31
|
-
nokogiri (>= 1.4.2)
|
32
|
-
rbx-require-relative (0.0.5)
|
33
|
-
rcov (0.9.10)
|
34
|
-
rspec (2.7.0)
|
35
|
-
rspec-core (~> 2.7.0)
|
36
|
-
rspec-expectations (~> 2.7.0)
|
37
|
-
rspec-mocks (~> 2.7.0)
|
38
|
-
rspec-core (2.7.1)
|
39
|
-
rspec-expectations (2.7.0)
|
40
|
-
diff-lcs (~> 1.1.2)
|
41
|
-
rspec-mocks (2.7.0)
|
42
|
-
ruby-debug (0.10.4)
|
43
|
-
columnize (>= 0.1)
|
44
|
-
ruby-debug-base (~> 0.10.4.0)
|
45
|
-
ruby-debug-base (0.10.4)
|
46
|
-
linecache (>= 0.3)
|
47
|
-
ruby-debug-base19 (0.11.25)
|
48
|
-
columnize (>= 0.3.1)
|
49
|
-
linecache19 (>= 0.5.11)
|
50
|
-
ruby_core_source (>= 0.1.4)
|
51
|
-
ruby-debug19 (0.11.6)
|
52
|
-
columnize (>= 0.3.1)
|
53
|
-
linecache19 (>= 0.5.11)
|
54
|
-
ruby-debug-base19 (>= 0.11.19)
|
55
|
-
ruby_core_source (0.1.5)
|
56
|
-
archive-tar-minitar (>= 0.5.2)
|
57
|
-
stomp (1.2.2)
|
58
|
-
xml-simple (1.1.1)
|
59
|
-
yard (0.7.2)
|
60
|
-
|
61
|
-
PLATFORMS
|
62
|
-
ruby
|
63
|
-
|
64
|
-
DEPENDENCIES
|
65
|
-
RedCloth
|
66
|
-
mocha
|
67
|
-
rcov
|
68
|
-
rspec (~> 2.0)
|
69
|
-
ruby-debug
|
70
|
-
ruby-debug19
|
71
|
-
solrizer!
|
72
|
-
yard
|