solrizer 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +3 -0
- data/Gemfile +14 -0
- data/Gemfile.lock +44 -0
- data/History.txt +8 -0
- data/Rakefile +10 -3
- data/VERSION +1 -1
- data/config/solr_mappings.yml +16 -13
- data/config/solr_mappings_af_0.1.yml +18 -0
- data/lib/solrizer/extractor.rb +31 -72
- data/lib/solrizer/field_mapper.rb +351 -0
- data/lib/solrizer/field_name_mapper.rb +37 -51
- data/lib/solrizer/html/extractor.rb +36 -0
- data/lib/solrizer/html.rb +7 -0
- data/lib/solrizer/xml/extractor.rb +31 -0
- data/lib/solrizer/xml/terminology_based_solrizer.rb +25 -29
- data/lib/solrizer/xml.rb +4 -1
- data/lib/solrizer.rb +2 -113
- data/lib/tasks/solrizer.rake +7 -27
- data/solrizer.gemspec +46 -26
- data/spec/{spec.opts → .rspec} +0 -0
- data/spec/fixtures/test_solr_mappings.yml +16 -0
- data/spec/spec_helper.rb +1 -0
- data/spec/units/extractor_spec.rb +43 -34
- data/spec/units/field_mapper_spec.rb +227 -0
- data/spec/units/field_name_mapper_spec.rb +16 -29
- data/spec/units/xml_extractor_spec.rb +28 -0
- data/spec/units/xml_terminology_based_solrizer_spec.rb +18 -5
- metadata +128 -35
- data/lib/solrizer/configuration.rb +0 -8
- data/lib/solrizer/indexer.rb +0 -261
- data/lib/solrizer/main.rb +0 -17
- data/lib/solrizer/replicator.rb +0 -143
- data/lib/solrizer/repository.rb +0 -54
- data/spec/fixtures/rels_ext_cmodel.xml +0 -8
- data/spec/fixtures/solr_mappings_af_0.1.yml +0 -16
- data/spec/integration/indexer_spec.rb +0 -18
- data/spec/units/indexer_spec.rb +0 -127
- data/spec/units/shelver_spec.rb +0 -42
data/lib/solrizer/repository.rb
DELETED
@@ -1,54 +0,0 @@
|
|
1
|
-
|
2
|
-
require 'active-fedora'
|
3
|
-
|
4
|
-
|
5
|
-
module Solrizer
|
6
|
-
class Repository
|
7
|
-
|
8
|
-
#
|
9
|
-
# This method initializes the fedora repository and solr instance
|
10
|
-
#
|
11
|
-
def initialize_repository
|
12
|
-
Fedora::Repository.register( FEDORA_URL )
|
13
|
-
ActiveFedora::SolrService.register( FEDORA_SOLR_URL )
|
14
|
-
end
|
15
|
-
|
16
|
-
#
|
17
|
-
# This method retrieves a comprehensive list of unique ids in the fedora repository
|
18
|
-
#
|
19
|
-
def self.get_pids( num_docs )
|
20
|
-
solr_results = ActiveFedora::SolrService.instance.conn.query( "active_fedora_model_field:Document", { :rows => num_docs } )
|
21
|
-
id_array = []
|
22
|
-
solr_results.hits.each do |hit|
|
23
|
-
id_array << hit[SOLR_DOCUMENT_ID]
|
24
|
-
end
|
25
|
-
return id_array
|
26
|
-
end
|
27
|
-
|
28
|
-
#
|
29
|
-
# This method retrieves the object associated with the given unique id
|
30
|
-
#
|
31
|
-
def self.get_object( pid )
|
32
|
-
object = ActiveFedora::Base.load_instance( pid )
|
33
|
-
end
|
34
|
-
|
35
|
-
#
|
36
|
-
# This method retrieves a comprehensive list of datastreams for the given object
|
37
|
-
#
|
38
|
-
def self.get_datastreams( obj )
|
39
|
-
ds_keys = obj.datastreams.keys
|
40
|
-
end
|
41
|
-
|
42
|
-
#
|
43
|
-
# This method retrieves the datastream for the given object with the given datastream name
|
44
|
-
#
|
45
|
-
def self.get_datastream( obj, ds_name )
|
46
|
-
begin
|
47
|
-
obj.datastreams[ ds_name ]
|
48
|
-
rescue
|
49
|
-
return nil
|
50
|
-
end
|
51
|
-
end
|
52
|
-
|
53
|
-
end
|
54
|
-
end
|
@@ -1,8 +0,0 @@
|
|
1
|
-
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
|
2
|
-
<rdf:Description rdf:about="info:fedora/demo:multipurpose-objects-model_and_sdef">
|
3
|
-
<hasModel xmlns="info:fedora/fedora-system:def/model#" rdf:resource="info:fedora/fedora-system:ContentModel-3.0"/>
|
4
|
-
<hasModel xmlns="info:fedora/fedora-system:def/model#" rdf:resource="info:fedora/afmodel:SaltDocument"/>
|
5
|
-
<hasModel xmlns="info:fedora/fedora-system:def/model#" rdf:resource="info:fedora/afmodel:JP2Document"/>
|
6
|
-
<hasModel xmlns="info:fedora/fedora-system:def/model#" rdf:resource="info:fedora/afmodel:DCDocument"/>
|
7
|
-
</rdf:Description>
|
8
|
-
</rdf:RDF>
|
@@ -1,16 +0,0 @@
|
|
1
|
-
id: id
|
2
|
-
date: _date
|
3
|
-
string: _field
|
4
|
-
text: _field
|
5
|
-
symbol: _field
|
6
|
-
integer: _field
|
7
|
-
long: _field
|
8
|
-
boolean: _field
|
9
|
-
float: _field
|
10
|
-
double: _field
|
11
|
-
facet: _facet
|
12
|
-
display: _display
|
13
|
-
sort: _sort
|
14
|
-
unstemmed_search: _unstem_search
|
15
|
-
|
16
|
-
|
@@ -1,18 +0,0 @@
|
|
1
|
-
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
-
require 'solrizer'
|
3
|
-
|
4
|
-
describe Solrizer::Indexer do
|
5
|
-
|
6
|
-
before(:each) do
|
7
|
-
@indexer = Solrizer::Indexer.new
|
8
|
-
end
|
9
|
-
|
10
|
-
describe "index" do
|
11
|
-
it "should update solr with the metadata from the given object" do
|
12
|
-
pending "Got to decide if/how to handle fixtures in this gem. Probably should just mock out Fedora & Solr entirely."
|
13
|
-
obj = Solrizer::Repository.get_object( "druid:sb733gr4073" )
|
14
|
-
@indexer.index( obj )
|
15
|
-
end
|
16
|
-
end
|
17
|
-
|
18
|
-
end
|
data/spec/units/indexer_spec.rb
DELETED
@@ -1,127 +0,0 @@
|
|
1
|
-
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
-
require 'solrizer'
|
3
|
-
|
4
|
-
describe Solrizer::Indexer do
|
5
|
-
|
6
|
-
before(:each) do
|
7
|
-
Solrizer::Indexer.any_instance.stubs(:connect).returns("foo")
|
8
|
-
|
9
|
-
@extractor = mock("Extractor")
|
10
|
-
@extractor.stubs(:html_content_to_solr).returns(@solr_doc)
|
11
|
-
# @solr_doc = mock('solr_doc')
|
12
|
-
# @solr_doc.stubs(:<<)
|
13
|
-
# @solr_doc.stubs(:[])
|
14
|
-
|
15
|
-
@solr_doc = Solr::Document.new
|
16
|
-
|
17
|
-
Solrizer::Extractor.expects(:new).returns(@extractor)
|
18
|
-
@indexer = Solrizer::Indexer.new
|
19
|
-
|
20
|
-
end
|
21
|
-
|
22
|
-
describe "#generate_dates" do
|
23
|
-
it "should still give 9999-99-99 date if the solr document does not have a date_t field" do
|
24
|
-
|
25
|
-
solr_result = @indexer.generate_dates(@solr_doc)
|
26
|
-
solr_result.should be_kind_of Solr::Document
|
27
|
-
solr_result[:date_t].should == "9999-99-99"
|
28
|
-
solr_result[:month_facet].should == "99"
|
29
|
-
solr_result[:day_facet].should == '99'
|
30
|
-
|
31
|
-
end
|
32
|
-
|
33
|
-
it "should still give 9999-99-99 date if the solr_doc[:date_t] is not valid date in YYYY-MM-DD format " do
|
34
|
-
|
35
|
-
@solr_doc << Solr::Field.new(:date_t => "Unknown")
|
36
|
-
solr_result = @indexer.generate_dates(@solr_doc)
|
37
|
-
solr_result.should be_kind_of Solr::Document
|
38
|
-
solr_result[:date_t].should == "Unknown"
|
39
|
-
solr_result[:month_facet].should == "99"
|
40
|
-
solr_result[:day_facet].should == '99'
|
41
|
-
|
42
|
-
end
|
43
|
-
|
44
|
-
it "should give month and dates even if the :date_t is not a valid date but is in YYYY-MM-DD format " do
|
45
|
-
|
46
|
-
@solr_doc << Solr::Field.new(:date_t => "0000-13-11")
|
47
|
-
solr_result = @indexer.generate_dates(@solr_doc)
|
48
|
-
solr_result.should be_kind_of Solr::Document
|
49
|
-
solr_result[:date_t].should == "0000-13-11"
|
50
|
-
solr_result[:month_facet].should == "99"
|
51
|
-
solr_result[:day_facet].should == '11'
|
52
|
-
end
|
53
|
-
|
54
|
-
it "should give month and day when in a valid date format" do
|
55
|
-
@solr_doc << Solr::Field.new(:date_t => "1978-04-11")
|
56
|
-
solr_result = @indexer.generate_dates(@solr_doc)
|
57
|
-
solr_result.should be_kind_of Solr::Document
|
58
|
-
solr_result[:date_t].should == "1978-04-11"
|
59
|
-
solr_result[:month_facet].should == "04"
|
60
|
-
solr_result[:day_facet].should == '11'
|
61
|
-
|
62
|
-
end
|
63
|
-
|
64
|
-
it "should still give two digit strings even if the month/day is single digit" do
|
65
|
-
|
66
|
-
@solr_doc << Solr::Field.new(:date_t => "1978-4-1")
|
67
|
-
solr_result = @indexer.generate_dates(@solr_doc)
|
68
|
-
solr_result.should be_kind_of Solr::Document
|
69
|
-
solr_result[:date_t].should == "1978-4-1"
|
70
|
-
solr_result[:month_facet].should == "04"
|
71
|
-
solr_result[:day_facet].should == '01'
|
72
|
-
|
73
|
-
end
|
74
|
-
|
75
|
-
end
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
describe "#solrize" do
|
80
|
-
it "should convert a hash to a solr doc" do
|
81
|
-
example_hash = {"box"=>"Box 51A", "city"=>["Ann Arbor", "Hyderabad", "Palo Alto"], "person"=>["ELLIE ENGELMORE", "Reddy", "EDWARD FEIGENBAUM"], "title"=>"Letter from Ellie Engelmore to Professor K. C. Reddy", "series"=>"eaf7000", "folder"=>"Folder 15", "technology"=>["artificial intelligence"], "year"=>"1985", "organization"=>["Heuristic Programming Project", "Mathematics and Computer/Information Sciences University of Hyderabad Central University P. O. Hyder", "Professor K. C. Reddy School of Mathematics and Computer/Information Sciences"], "collection"=>"e-a-feigenbaum-collection", "state"=>["Michigan", "California"]}
|
82
|
-
|
83
|
-
example_result = Solrizer::Indexer.solrize( example_hash )
|
84
|
-
example_result.should be_kind_of Solr::Document
|
85
|
-
example_hash.each_pair do |key,values|
|
86
|
-
if values.class == String
|
87
|
-
example_result["#{key}_facet"].should == values
|
88
|
-
else
|
89
|
-
values.each do |v|
|
90
|
-
example_result.inspect.include?("@name=\"#{key}_facet\"").should be_true
|
91
|
-
example_result.inspect.include?("@value=\"#{v}\"").should be_true
|
92
|
-
end
|
93
|
-
end
|
94
|
-
end
|
95
|
-
end
|
96
|
-
|
97
|
-
it "should handle hashes with facets listed in a sub-hash" do
|
98
|
-
simple_hash = Hash[:facets => {'technology'=>["t1", "t2"], 'company'=>"c1", "person"=>["p1", "p2"]}]
|
99
|
-
result = Solrizer::Indexer.solrize( simple_hash )
|
100
|
-
result.should be_kind_of Solr::Document
|
101
|
-
result["technology_facet"].should == "t1"
|
102
|
-
result.inspect.include?('@boost=nil').should be_true
|
103
|
-
result.inspect.include?('@name="technology_facet"').should be_true
|
104
|
-
result.inspect.include?('@value="t2"').should be_true
|
105
|
-
result["company_facet"].should == "c1"
|
106
|
-
result["person_facet"].should == "p1"
|
107
|
-
result.inspect.include?('@name="person_facet"').should be_true
|
108
|
-
result.inspect.include?('@value="p2"').should be_true
|
109
|
-
|
110
|
-
end
|
111
|
-
|
112
|
-
it "should create symbols from the :symbols subhash" do
|
113
|
-
simple_hash = Hash[:facets => {'technology'=>["t1", "t2"], 'company'=>"c1", "person"=>["p1", "p2"]}, :symbols=>{'technology'=>["t1", "t2"], 'company'=>"c1", "person"=>["p1", "p2"]}]
|
114
|
-
result = Solrizer::Indexer.solrize( simple_hash )
|
115
|
-
result.should be_kind_of Solr::Document
|
116
|
-
result["technology_s"].should == "t1"
|
117
|
-
result.inspect.include?('@name="technology_s"').should be_true
|
118
|
-
result.inspect.include?('@value="t2"').should be_true
|
119
|
-
|
120
|
-
result["company_s"].should == "c1"
|
121
|
-
result["person_s"].should == "p1"
|
122
|
-
result.inspect.include?('@name="person_s"').should be_true
|
123
|
-
result.inspect.include?('@value="p2"').should be_true
|
124
|
-
|
125
|
-
end
|
126
|
-
end
|
127
|
-
end
|
data/spec/units/shelver_spec.rb
DELETED
@@ -1,42 +0,0 @@
|
|
1
|
-
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
-
|
3
|
-
describe Solrizer::Solrizer do
|
4
|
-
|
5
|
-
before(:each) do
|
6
|
-
@solrizer = Solrizer::Solrizer.new
|
7
|
-
end
|
8
|
-
|
9
|
-
describe "solrize" do
|
10
|
-
it "should trigger the indexer for the provided object" do
|
11
|
-
sample_obj = ActiveFedora::Base.new
|
12
|
-
@solrizer.indexer.expects(:index).with( sample_obj )
|
13
|
-
@solrizer.solrize( sample_obj )
|
14
|
-
end
|
15
|
-
it "should work with Fedora::FedoraObject objects" do
|
16
|
-
mock_object = Fedora::FedoraObject.new(:pid=>"my:pid", :label=>"my label")
|
17
|
-
ActiveFedora::Base.expects(:load_instance).with( mock_object.pid ).returns(mock_object)
|
18
|
-
@solrizer.indexer.expects(:index).with( mock_object )
|
19
|
-
@solrizer.solrize( mock_object )
|
20
|
-
end
|
21
|
-
it "should load the object if only a pid is provided" do
|
22
|
-
mock_object = mock("my object")
|
23
|
-
mock_object.stubs(:pid)
|
24
|
-
mock_object.stubs(:label)
|
25
|
-
mock_object.stubs(:datastreams).returns({'descMetadata'=>"foo","location"=>"bar"})
|
26
|
-
|
27
|
-
ActiveFedora::Base.expects(:load_instance).with( "_PID_" ).returns(mock_object)
|
28
|
-
@solrizer.indexer.expects(:index).with(mock_object)
|
29
|
-
@solrizer.solrize("_PID_")
|
30
|
-
end
|
31
|
-
|
32
|
-
end
|
33
|
-
|
34
|
-
describe "solrize_objects" do
|
35
|
-
it "should call solrize for each object returned by Fedora::Repository.find_objects" do
|
36
|
-
objects = [["pid1"], ["pid2"], ["pid3"]]
|
37
|
-
Fedora::Repository.any_instance.expects(:find_objects).returns(objects)
|
38
|
-
objects.each {|object| @solrizer.expects(:solrize).with( object ) }
|
39
|
-
@solrizer.solrize_objects
|
40
|
-
end
|
41
|
-
end
|
42
|
-
end
|