solrizer 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +3 -0
- data/Gemfile +14 -0
- data/Gemfile.lock +44 -0
- data/History.txt +8 -0
- data/Rakefile +10 -3
- data/VERSION +1 -1
- data/config/solr_mappings.yml +16 -13
- data/config/solr_mappings_af_0.1.yml +18 -0
- data/lib/solrizer/extractor.rb +31 -72
- data/lib/solrizer/field_mapper.rb +351 -0
- data/lib/solrizer/field_name_mapper.rb +37 -51
- data/lib/solrizer/html/extractor.rb +36 -0
- data/lib/solrizer/html.rb +7 -0
- data/lib/solrizer/xml/extractor.rb +31 -0
- data/lib/solrizer/xml/terminology_based_solrizer.rb +25 -29
- data/lib/solrizer/xml.rb +4 -1
- data/lib/solrizer.rb +2 -113
- data/lib/tasks/solrizer.rake +7 -27
- data/solrizer.gemspec +46 -26
- data/spec/{spec.opts → .rspec} +0 -0
- data/spec/fixtures/test_solr_mappings.yml +16 -0
- data/spec/spec_helper.rb +1 -0
- data/spec/units/extractor_spec.rb +43 -34
- data/spec/units/field_mapper_spec.rb +227 -0
- data/spec/units/field_name_mapper_spec.rb +16 -29
- data/spec/units/xml_extractor_spec.rb +28 -0
- data/spec/units/xml_terminology_based_solrizer_spec.rb +18 -5
- metadata +128 -35
- data/lib/solrizer/configuration.rb +0 -8
- data/lib/solrizer/indexer.rb +0 -261
- data/lib/solrizer/main.rb +0 -17
- data/lib/solrizer/replicator.rb +0 -143
- data/lib/solrizer/repository.rb +0 -54
- data/spec/fixtures/rels_ext_cmodel.xml +0 -8
- data/spec/fixtures/solr_mappings_af_0.1.yml +0 -16
- data/spec/integration/indexer_spec.rb +0 -18
- data/spec/units/indexer_spec.rb +0 -127
- data/spec/units/shelver_spec.rb +0 -42
data/lib/solrizer/repository.rb
DELETED
@@ -1,54 +0,0 @@
|
|
1
|
-
|
2
|
-
require 'active-fedora'
|
3
|
-
|
4
|
-
|
5
|
-
module Solrizer
|
6
|
-
class Repository
|
7
|
-
|
8
|
-
#
|
9
|
-
# This method initializes the fedora repository and solr instance
|
10
|
-
#
|
11
|
-
def initialize_repository
|
12
|
-
Fedora::Repository.register( FEDORA_URL )
|
13
|
-
ActiveFedora::SolrService.register( FEDORA_SOLR_URL )
|
14
|
-
end
|
15
|
-
|
16
|
-
#
|
17
|
-
# This method retrieves a comprehensive list of unique ids in the fedora repository
|
18
|
-
#
|
19
|
-
def self.get_pids( num_docs )
|
20
|
-
solr_results = ActiveFedora::SolrService.instance.conn.query( "active_fedora_model_field:Document", { :rows => num_docs } )
|
21
|
-
id_array = []
|
22
|
-
solr_results.hits.each do |hit|
|
23
|
-
id_array << hit[SOLR_DOCUMENT_ID]
|
24
|
-
end
|
25
|
-
return id_array
|
26
|
-
end
|
27
|
-
|
28
|
-
#
|
29
|
-
# This method retrieves the object associated with the given unique id
|
30
|
-
#
|
31
|
-
def self.get_object( pid )
|
32
|
-
object = ActiveFedora::Base.load_instance( pid )
|
33
|
-
end
|
34
|
-
|
35
|
-
#
|
36
|
-
# This method retrieves a comprehensive list of datastreams for the given object
|
37
|
-
#
|
38
|
-
def self.get_datastreams( obj )
|
39
|
-
ds_keys = obj.datastreams.keys
|
40
|
-
end
|
41
|
-
|
42
|
-
#
|
43
|
-
# This method retrieves the datastream for the given object with the given datastream name
|
44
|
-
#
|
45
|
-
def self.get_datastream( obj, ds_name )
|
46
|
-
begin
|
47
|
-
obj.datastreams[ ds_name ]
|
48
|
-
rescue
|
49
|
-
return nil
|
50
|
-
end
|
51
|
-
end
|
52
|
-
|
53
|
-
end
|
54
|
-
end
|
data/spec/fixtures/rels_ext_cmodel.xml
DELETED
@@ -1,8 +0,0 @@
|
|
1
|
-
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
|
2
|
-
<rdf:Description rdf:about="info:fedora/demo:multipurpose-objects-model_and_sdef">
|
3
|
-
<hasModel xmlns="info:fedora/fedora-system:def/model#" rdf:resource="info:fedora/fedora-system:ContentModel-3.0"/>
|
4
|
-
<hasModel xmlns="info:fedora/fedora-system:def/model#" rdf:resource="info:fedora/afmodel:SaltDocument"/>
|
5
|
-
<hasModel xmlns="info:fedora/fedora-system:def/model#" rdf:resource="info:fedora/afmodel:JP2Document"/>
|
6
|
-
<hasModel xmlns="info:fedora/fedora-system:def/model#" rdf:resource="info:fedora/afmodel:DCDocument"/>
|
7
|
-
</rdf:Description>
|
8
|
-
</rdf:RDF>
|
data/spec/fixtures/solr_mappings_af_0.1.yml
DELETED
@@ -1,16 +0,0 @@
|
|
1
|
-
id: id
|
2
|
-
date: _date
|
3
|
-
string: _field
|
4
|
-
text: _field
|
5
|
-
symbol: _field
|
6
|
-
integer: _field
|
7
|
-
long: _field
|
8
|
-
boolean: _field
|
9
|
-
float: _field
|
10
|
-
double: _field
|
11
|
-
facet: _facet
|
12
|
-
display: _display
|
13
|
-
sort: _sort
|
14
|
-
unstemmed_search: _unstem_search
|
15
|
-
|
16
|
-
|
data/spec/integration/indexer_spec.rb
DELETED
@@ -1,18 +0,0 @@
|
|
1
|
-
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
-
require 'solrizer'
|
3
|
-
|
4
|
-
describe Solrizer::Indexer do
|
5
|
-
|
6
|
-
before(:each) do
|
7
|
-
@indexer = Solrizer::Indexer.new
|
8
|
-
end
|
9
|
-
|
10
|
-
describe "index" do
|
11
|
-
it "should update solr with the metadata from the given object" do
|
12
|
-
pending "Got to decide if/how to handle fixtures in this gem. Probably should just mock out Fedora & Solr entirely."
|
13
|
-
obj = Solrizer::Repository.get_object( "druid:sb733gr4073" )
|
14
|
-
@indexer.index( obj )
|
15
|
-
end
|
16
|
-
end
|
17
|
-
|
18
|
-
end
|
data/spec/units/indexer_spec.rb
DELETED
@@ -1,127 +0,0 @@
|
|
1
|
-
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
-
require 'solrizer'
|
3
|
-
|
4
|
-
describe Solrizer::Indexer do
|
5
|
-
|
6
|
-
before(:each) do
|
7
|
-
Solrizer::Indexer.any_instance.stubs(:connect).returns("foo")
|
8
|
-
|
9
|
-
@extractor = mock("Extractor")
|
10
|
-
@extractor.stubs(:html_content_to_solr).returns(@solr_doc)
|
11
|
-
# @solr_doc = mock('solr_doc')
|
12
|
-
# @solr_doc.stubs(:<<)
|
13
|
-
# @solr_doc.stubs(:[])
|
14
|
-
|
15
|
-
@solr_doc = Solr::Document.new
|
16
|
-
|
17
|
-
Solrizer::Extractor.expects(:new).returns(@extractor)
|
18
|
-
@indexer = Solrizer::Indexer.new
|
19
|
-
|
20
|
-
end
|
21
|
-
|
22
|
-
describe "#generate_dates" do
|
23
|
-
it "should still give 9999-99-99 date if the solr document does not have a date_t field" do
|
24
|
-
|
25
|
-
solr_result = @indexer.generate_dates(@solr_doc)
|
26
|
-
solr_result.should be_kind_of Solr::Document
|
27
|
-
solr_result[:date_t].should == "9999-99-99"
|
28
|
-
solr_result[:month_facet].should == "99"
|
29
|
-
solr_result[:day_facet].should == '99'
|
30
|
-
|
31
|
-
end
|
32
|
-
|
33
|
-
it "should still give 9999-99-99 date if the solr_doc[:date_t] is not valid date in YYYY-MM-DD format " do
|
34
|
-
|
35
|
-
@solr_doc << Solr::Field.new(:date_t => "Unknown")
|
36
|
-
solr_result = @indexer.generate_dates(@solr_doc)
|
37
|
-
solr_result.should be_kind_of Solr::Document
|
38
|
-
solr_result[:date_t].should == "Unknown"
|
39
|
-
solr_result[:month_facet].should == "99"
|
40
|
-
solr_result[:day_facet].should == '99'
|
41
|
-
|
42
|
-
end
|
43
|
-
|
44
|
-
it "should give month and dates even if the :date_t is not a valid date but is in YYYY-MM-DD format " do
|
45
|
-
|
46
|
-
@solr_doc << Solr::Field.new(:date_t => "0000-13-11")
|
47
|
-
solr_result = @indexer.generate_dates(@solr_doc)
|
48
|
-
solr_result.should be_kind_of Solr::Document
|
49
|
-
solr_result[:date_t].should == "0000-13-11"
|
50
|
-
solr_result[:month_facet].should == "99"
|
51
|
-
solr_result[:day_facet].should == '11'
|
52
|
-
end
|
53
|
-
|
54
|
-
it "should give month and day when in a valid date format" do
|
55
|
-
@solr_doc << Solr::Field.new(:date_t => "1978-04-11")
|
56
|
-
solr_result = @indexer.generate_dates(@solr_doc)
|
57
|
-
solr_result.should be_kind_of Solr::Document
|
58
|
-
solr_result[:date_t].should == "1978-04-11"
|
59
|
-
solr_result[:month_facet].should == "04"
|
60
|
-
solr_result[:day_facet].should == '11'
|
61
|
-
|
62
|
-
end
|
63
|
-
|
64
|
-
it "should still give two digit strings even if the month/day is single digit" do
|
65
|
-
|
66
|
-
@solr_doc << Solr::Field.new(:date_t => "1978-4-1")
|
67
|
-
solr_result = @indexer.generate_dates(@solr_doc)
|
68
|
-
solr_result.should be_kind_of Solr::Document
|
69
|
-
solr_result[:date_t].should == "1978-4-1"
|
70
|
-
solr_result[:month_facet].should == "04"
|
71
|
-
solr_result[:day_facet].should == '01'
|
72
|
-
|
73
|
-
end
|
74
|
-
|
75
|
-
end
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
describe "#solrize" do
|
80
|
-
it "should convert a hash to a solr doc" do
|
81
|
-
example_hash = {"box"=>"Box 51A", "city"=>["Ann Arbor", "Hyderabad", "Palo Alto"], "person"=>["ELLIE ENGELMORE", "Reddy", "EDWARD FEIGENBAUM"], "title"=>"Letter from Ellie Engelmore to Professor K. C. Reddy", "series"=>"eaf7000", "folder"=>"Folder 15", "technology"=>["artificial intelligence"], "year"=>"1985", "organization"=>["Heuristic Programming Project", "Mathematics and Computer/Information Sciences University of Hyderabad Central University P. O. Hyder", "Professor K. C. Reddy School of Mathematics and Computer/Information Sciences"], "collection"=>"e-a-feigenbaum-collection", "state"=>["Michigan", "California"]}
|
82
|
-
|
83
|
-
example_result = Solrizer::Indexer.solrize( example_hash )
|
84
|
-
example_result.should be_kind_of Solr::Document
|
85
|
-
example_hash.each_pair do |key,values|
|
86
|
-
if values.class == String
|
87
|
-
example_result["#{key}_facet"].should == values
|
88
|
-
else
|
89
|
-
values.each do |v|
|
90
|
-
example_result.inspect.include?("@name=\"#{key}_facet\"").should be_true
|
91
|
-
example_result.inspect.include?("@value=\"#{v}\"").should be_true
|
92
|
-
end
|
93
|
-
end
|
94
|
-
end
|
95
|
-
end
|
96
|
-
|
97
|
-
it "should handle hashes with facets listed in a sub-hash" do
|
98
|
-
simple_hash = Hash[:facets => {'technology'=>["t1", "t2"], 'company'=>"c1", "person"=>["p1", "p2"]}]
|
99
|
-
result = Solrizer::Indexer.solrize( simple_hash )
|
100
|
-
result.should be_kind_of Solr::Document
|
101
|
-
result["technology_facet"].should == "t1"
|
102
|
-
result.inspect.include?('@boost=nil').should be_true
|
103
|
-
result.inspect.include?('@name="technology_facet"').should be_true
|
104
|
-
result.inspect.include?('@value="t2"').should be_true
|
105
|
-
result["company_facet"].should == "c1"
|
106
|
-
result["person_facet"].should == "p1"
|
107
|
-
result.inspect.include?('@name="person_facet"').should be_true
|
108
|
-
result.inspect.include?('@value="p2"').should be_true
|
109
|
-
|
110
|
-
end
|
111
|
-
|
112
|
-
it "should create symbols from the :symbols subhash" do
|
113
|
-
simple_hash = Hash[:facets => {'technology'=>["t1", "t2"], 'company'=>"c1", "person"=>["p1", "p2"]}, :symbols=>{'technology'=>["t1", "t2"], 'company'=>"c1", "person"=>["p1", "p2"]}]
|
114
|
-
result = Solrizer::Indexer.solrize( simple_hash )
|
115
|
-
result.should be_kind_of Solr::Document
|
116
|
-
result["technology_s"].should == "t1"
|
117
|
-
result.inspect.include?('@name="technology_s"').should be_true
|
118
|
-
result.inspect.include?('@value="t2"').should be_true
|
119
|
-
|
120
|
-
result["company_s"].should == "c1"
|
121
|
-
result["person_s"].should == "p1"
|
122
|
-
result.inspect.include?('@name="person_s"').should be_true
|
123
|
-
result.inspect.include?('@value="p2"').should be_true
|
124
|
-
|
125
|
-
end
|
126
|
-
end
|
127
|
-
end
|
data/spec/units/shelver_spec.rb
DELETED
@@ -1,42 +0,0 @@
|
|
1
|
-
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
-
|
3
|
-
describe Solrizer::Solrizer do
|
4
|
-
|
5
|
-
before(:each) do
|
6
|
-
@solrizer = Solrizer::Solrizer.new
|
7
|
-
end
|
8
|
-
|
9
|
-
describe "solrize" do
|
10
|
-
it "should trigger the indexer for the provided object" do
|
11
|
-
sample_obj = ActiveFedora::Base.new
|
12
|
-
@solrizer.indexer.expects(:index).with( sample_obj )
|
13
|
-
@solrizer.solrize( sample_obj )
|
14
|
-
end
|
15
|
-
it "should work with Fedora::FedoraObject objects" do
|
16
|
-
mock_object = Fedora::FedoraObject.new(:pid=>"my:pid", :label=>"my label")
|
17
|
-
ActiveFedora::Base.expects(:load_instance).with( mock_object.pid ).returns(mock_object)
|
18
|
-
@solrizer.indexer.expects(:index).with( mock_object )
|
19
|
-
@solrizer.solrize( mock_object )
|
20
|
-
end
|
21
|
-
it "should load the object if only a pid is provided" do
|
22
|
-
mock_object = mock("my object")
|
23
|
-
mock_object.stubs(:pid)
|
24
|
-
mock_object.stubs(:label)
|
25
|
-
mock_object.stubs(:datastreams).returns({'descMetadata'=>"foo","location"=>"bar"})
|
26
|
-
|
27
|
-
ActiveFedora::Base.expects(:load_instance).with( "_PID_" ).returns(mock_object)
|
28
|
-
@solrizer.indexer.expects(:index).with(mock_object)
|
29
|
-
@solrizer.solrize("_PID_")
|
30
|
-
end
|
31
|
-
|
32
|
-
end
|
33
|
-
|
34
|
-
describe "solrize_objects" do
|
35
|
-
it "should call solrize for each object returned by Fedora::Repository.find_objects" do
|
36
|
-
objects = [["pid1"], ["pid2"], ["pid3"]]
|
37
|
-
Fedora::Repository.any_instance.expects(:find_objects).returns(objects)
|
38
|
-
objects.each {|object| @solrizer.expects(:solrize).with( object ) }
|
39
|
-
@solrizer.solrize_objects
|
40
|
-
end
|
41
|
-
end
|
42
|
-
end
|