solr_ead 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,128 @@
1
+ module SolrEad
2
+
3
+ # The main entry point for your ead going into solr.
4
+ #
5
+ # SolrEad uses RSolr to connect to your solr server and then gives you a couple of
6
+ # simple methods for creating, updating and deleting your ead documents.
7
+ #
8
+ # You'll need to have your solr configuration defined in config/solr.yml. If you're
9
+ # working within the Rails environment, it will obey your environment settings. However,
10
+ # if you are using the gem by itself outside of rails, you can use the RAILS_ENV environment
11
+ # variable, otherwise, it will default to the development url.
12
+ #
13
+ # ==Default indexing
14
+ # This will index your ead into one solr document for the main portion of ead and then
15
+ # multiple documents for the component documents. The fields for the main document
16
+ # are defined in SolrEad::Document and fields for the component are defined in SolrEad::Component.
17
+ # > file = File.new("path/to/your/ead.xml")
18
+ # > indexer = SolrEad::Indexer.new
19
+ # > indexer.create(file)
20
+ # > indexer.delete("EAD-ID")
21
+ #
22
+ # ==Simple indexing
23
+ # By using the :simple option, SolrEad will create only one solr document from one ead.
24
+ # The default implementation of SolrEad is to create multiple documents, so fields
25
+ # defined in SolrEad::Document reflect this. For example, no component fields are defined in
26
+ # SolrEad::Document, so none would be indexed. If you elect to use the :simple option, you'll
27
+ # want to override SolrEad::Document with your own and define any additional component fields
28
+ # you want to appear in your index.
29
+ # > file = File.new("path/to/your/ead.xml")
30
+ # > indexer = SolrEad::Indexer.new(:simple => true)
31
+ # > indexer.create(file)
32
+ # > indexer.delete("EAD-ID")
33
+
34
+ class Indexer
35
+
36
+ include RSolr
37
+ include SolrEad::Behaviors
38
+
39
+ attr_accessor :solr, :options
40
+
41
# Creates a new instance of SolrEad::Indexer and connects to the solr server
# whose url is read from config/solr.yml.
#
# The yaml file and the environment key used to look up the url are chosen as follows:
# * inside Rails: Rails.root/config/solr.yml, keyed by Rails.env
# * outside Rails with RAILS_ENV set: config/solr.yml relative to the current
#   working directory, keyed by the value of RAILS_ENV
# * otherwise: config/solr.yml relative to the current working directory,
#   keyed by "development"
#
# +opts+ is stored as-is in #options. Elsewhere in this class the keys :simple,
# :document and :component are consulted.
def initialize(opts={})
  if defined?(Rails.root)
    config_file = File.join(Rails.root, "config", "solr.yml")
    environment = Rails.env
  else
    # Rails is not loaded in this branch, so Rails.root must not be referenced;
    # only the environment name comes from RAILS_ENV.
    config_file = "config/solr.yml"
    environment = ENV['RAILS_ENV'] || 'development'
  end
  url = YAML.load_file(config_file)[environment]['url']
  self.solr    = RSolr.connect :url => url
  self.options = opts
end
54
+
55
# Indexes your ead and, unless the :simple option is set, its component
# documents from the supplied file, then commits the results to your solr server.
#
# +file+ is handed to #om_document and #add_components unchanged. It is not
# wrapped in an extra File object here, because #om_document opens the file
# itself and the extra handle was never closed.
def create(file)
  solr.add om_document(file).to_solr
  add_components(file) unless options[:simple]
  solr.commit
end
64
+
65
# Updates your ead from a given file by first deleting every solr document whose
# eadid_s field matches the id of the new document, then indexing the supplied
# file again (including components unless the :simple option is set).
# This method also commits the results to your solr server when complete.
#
# +file+ is passed to #om_document unchanged; no extra File handle is opened here.
def update(file)
  solr_doc = om_document(file).to_solr
  # The id sits inside a quoted phrase, so embedded quotes and backslashes are
  # escaped to keep the delete query well-formed.
  escaped_id = solr_doc["id"].gsub(/["\\]/) { |char| "\\#{char}" }
  solr.delete_by_query("eadid_s:\"#{escaped_id}\"")
  solr.add solr_doc
  add_components(file) unless options[:simple]
  solr.commit
end
76
+
77
# Deletes every solr document whose eadid_s field matches +id+ (the ead document
# and any component documents indexed with that ead id) and commits the result.
#
# +id+ is placed inside a quoted phrase, so double quotes and backslashes in it
# are escaped to keep the query well-formed.
def delete(id)
  escaped_id = id.gsub(/["\\]/) { |char| "\\#{char}" }
  solr.delete_by_query("eadid_s:\"#{escaped_id}\"")
  solr.commit
end
83
+
84
+ protected
85
+
86
# Returns an OM document from a given file.
#
# If a class was passed to the indexer under the :document option, its from_xml
# method is used; otherwise SolrEad::Document.from_xml is used.
#
# +file+ may be anything File.open accepts. The file is opened in block form so
# the handle is closed as soon as from_xml returns.
# NOTE(review): this assumes from_xml consumes the IO before returning — confirm
# for any custom :document class.
def om_document(file)
  document_class = options[:document] || SolrEad::Document
  File.open(file) { |io| document_class.from_xml(io) }
end
94
+
95
# Returns an OM document built from a single Nokogiri node.
#
# The node is first passed through #prep. The result is handed to the from_xml
# method of the class given under the :component option, or to
# SolrEad::Component.from_xml when no such option was supplied.
def om_component_from_node(node)
  component_class = options[:component] || SolrEad::Component
  component_class.from_xml(prep(node))
end
103
+
104
# Creates and adds one solr document for each component node returned by
# #components for the given file.
#
# For every node, the OM component from #om_component_from_node is converted with
# #to_solr, which receives the hash from #additional_component_fields(node).
# Any customization of field contents belongs in that #to_solr method.
#
# One final field is added after #to_solr: *sort_i*, the 1-based position of the
# node within the list of components, stored as a string. This preserves the
# order of the <c> nodes as they appear in the original ead document.
#
# Nothing is committed here; callers are expected to commit.
def add_components(file)
  components(file).each_with_index do |node, index|
    solr_doc = om_component_from_node(node).to_solr(additional_component_fields(node))
    solr_doc["sort_i"] = (index + 1).to_s
    solr.add solr_doc
  end
end
126
+
127
+ end
128
+ end
@@ -0,0 +1,38 @@
1
module SolrEad::OmBehaviors

  # This changes how documents are created from existing xml.
  #
  # Instead of using the xml as-is, the from_xml class method defined here removes
  # all namespaces from the xml first and stores the resulting Nokogiri object on
  # the document. This makes working with the terminologies in SolrEad::Document
  # much easier.
  #
  # Any customized ead document definitions should include this module. ex:
  #   class MyDocument < SolrEad::Document
  #
  #     include OM::XML::Document
  #     include Solrizer::XML::TerminologyBasedSolrizer
  #     include SolrEad::OmBehaviors
  #
  #   end

  module ClassMethods

    # Builds a document from +xml+ and returns +tmpl+.
    #
    # * nil: +tmpl+ is returned untouched
    # * a Nokogiri::XML::Node: its namespaces are removed in place and it becomes
    #   the document's ng_xml
    # * anything else: parsed with Nokogiri::XML::Document.parse, then treated
    #   the same way
    def from_xml(xml=nil, tmpl=self.new) # :nodoc:
      case xml
      when nil
        # noop: handled in #ng_xml accessor.. tmpl.ng_xml = self.xml_template
      when Nokogiri::XML::Node
        tmpl.ng_xml = xml.remove_namespaces!
      else
        tmpl.ng_xml = Nokogiri::XML::Document.parse(xml).remove_namespaces!
      end
      tmpl
    end

  end

  # Adds the class-level methods above to any class that includes this module.
  def self.included(base)
    base.extend(ClassMethods)
  end

end
@@ -0,0 +1,7 @@
1
module SolrEad
  # Makes the gem's rake tasks available inside a Rails application.
  class Railtie < Rails::Railtie
    rake_tasks do
      tasks_file = File.join(File.dirname(__FILE__), "..", "tasks", "solr_ead.rake")
      load tasks_file
    end
  end
end
@@ -0,0 +1,3 @@
1
module SolrEad
  # Version of the solr_ead gem.
  VERSION = '0.0.1'
end
data/lib/solr_ead.rb ADDED
@@ -0,0 +1,20 @@
1
+ require "nokogiri"
2
+ require "solrizer"
3
+ require "om"
4
+ require "rsolr"
5
+
6
module SolrEad
  # Returns the version string of the gem (SolrEad::VERSION).
  def self.version
    # solr_ead/version is not among the files required by this entry point, so
    # the constant is loaded on first use when it is not yet defined.
    require "solr_ead/version" unless const_defined?(:VERSION, false)
    VERSION
  end
end
11
+
12
+ require "ead_mapper"
13
+ require "terminology_based_solrizer"
14
+ require "solr_ead/behaviors"
15
+ require "solr_ead/om_behaviors"
16
+ require "solr_ead/indexer"
17
+ require "solr_ead/document"
18
+ require "solr_ead/component"
19
+
20
+ require 'solr_ead/railtie' if defined?(Rails)
@@ -0,0 +1,30 @@
1
+ require "solr_ead"
2
+
3
# Rake tasks for indexing ead files into solr and deleting them again. Each task
# builds a SolrEad::Indexer and takes its input from environment variables.
namespace :solr_ead do

  desc "Index an ead into solr using FILE=<path/to/ead.xml>"
  task :index do
    raise "Please specify your ead, ex. FILE=<path/to/ead.xml>" unless ENV['FILE']
    indexer = SolrEad::Indexer.new
    indexer.update(ENV['FILE'])
  end

  desc "Delete an ead from your solr index using ID='<eadid>'"
  task :delete do
    raise "Please specify your ead id, ex. ID=<eadid>" unless ENV['ID']
    indexer = SolrEad::Indexer.new
    indexer.delete(ENV['ID'])
  end

  desc "Index a directory of ead files given by DIR=path/to/directory"
  task :index_dir do
    raise "Please specify your directory, ex. DIR=path/to/directory" unless ENV['DIR']
    indexer = SolrEad::Indexer.new
    Dir.glob(File.join(ENV['DIR'],"*")).each do |file|
      # Only files whose extension ends in "xml" are indexed. Anything else is
      # skipped without being reported as indexed.
      next unless File.extname(file).match("xml$")
      print "Indexing #{File.basename(file)}..."
      indexer.update(file)
      print "done.\n"
    end
  end

end
@@ -0,0 +1,9 @@
1
# Reopens Solrizer::XML::TerminologyBasedSolrizer from the solrizer gem so that
# the default field mapper is our own custom mapper, EadMapper.
module Solrizer::XML::TerminologyBasedSolrizer

  # Returns the default field mapper, building an EadMapper the first time one
  # is needed and reusing it afterwards.
  def self.default_field_mapper
    return @@default_field_mapper if defined?(@@default_field_mapper) && @@default_field_mapper
    @@default_field_mapper = EadMapper.new
  end

end
data/solr_ead.gemspec ADDED
@@ -0,0 +1,27 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path('../lib/solr_ead/version', __FILE__)
3
+
4
# Gem specification for solr_ead. The file lists are taken from the files
# tracked by git at build time.
Gem::Specification.new do |spec|
  spec.name          = "solr_ead"
  spec.version       = SolrEad::VERSION
  spec.authors       = ["Adam Wead"]
  spec.email         = ["amsterdamos@gmail.com"]
  spec.summary       = %q{A gem for indexing ead into solr using OM}
  spec.description   = %q{A gem indexing ead into solr using OM. Define your own OM terminology to create the solr fields you want from your ead, then use solr-based applications like Blacklight to search and display the results.}
  spec.homepage      = "http://github.com/awead/ead_solr"

  spec.files         = `git ls-files`.split($\)
  spec.executables   = spec.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # Runtime dependencies
  %w[om solrizer rsolr sanitize].each { |name| spec.add_dependency(name) }

  # Development dependencies
  %w[rspec debugger rdoc].each { |name| spec.add_development_dependency(name) }
end
@@ -0,0 +1,105 @@
1
+ require "spec_helper"
2
+
3
# Specs for SolrEad::Behaviors, exercised through a bare class that includes the module.
describe SolrEad::Behaviors do

  before :all do
    @not_numbered = fixture "ARC-0005.xml"
    @numbered = fixture "pp002010.xml"
    class TestClass
      include SolrEad::Behaviors
    end
    @test = TestClass.new
    # Built here rather than inside the "#components" group: instance variables
    # set in a nested group's before(:all) are not visible to sibling groups, so
    # "#prep" used to receive nil.
    @non_numbered_nodeset = @test.components(@not_numbered)
    @numbered_nodeset = @test.components(@numbered)
  end

  describe "#components" do

    it "should return a nodeset" do
      @non_numbered_nodeset.should be_a_kind_of(Nokogiri::XML::NodeSet)
      @non_numbered_nodeset.first.should be_a_kind_of(Nokogiri::XML::Element)
    end

    it "should be able to handle both numbered and non-numbered <c> nodes" do
      @non_numbered_nodeset.count.should == 135
      @numbered_nodeset.count.should == 83
    end
  end

  describe "#prep" do
    it "should return a single component document" do
      # The indexer hands #prep one node at a time, so a single node is used here.
      # NOTE(review): confirm #prep is not also meant to accept a whole nodeset.
      part = @test.prep(@numbered_nodeset.first)
      part.should be_a_kind_of(Nokogiri::XML::Document)
    end
  end

  describe "#component_children?" do

    before :all do
      # A component that contains two child <c> nodes.
      @true = '
        <c id="ref167" level="file">
          <did>
            <unittitle>Zines</unittitle>
          </did>
          <c id="ref169" level="file">
            <did>
              <unittitle>Contagion</unittitle>
              <container id="cid384011" type="Box" label="Graphic materials">SF2</container>
              <container parent="cid384011" type="Folder">8</container>
              <unitdate>1980-1981</unitdate>
            </did>
          </c>
          <c id="ref171" level="file">
            <did>
              <unittitle>Single issues</unittitle>
              <container id="cid384012" type="Box" label="Graphic materials">SF2</container>
              <container parent="cid384012" type="Folder">9</container>
              <unitdate>1977-1985</unitdate>
            </did>
          </c>
        </c>
      '
      # A component without any child <c> nodes.
      @false = '
        <c id="ref167" level="file">
          <did>
            <unittitle>Zines</unittitle>
          </did>
        </c>
      '
    end

    it "should return true for components that have c nodes below them" do
      node = Nokogiri::XML(@true)
      @test.component_children?(node.elements.first).should be_true
    end

    it "should return false for components that do not have c nodes below them" do
      node = Nokogiri::XML(@false)
      @test.component_children?(node.elements.first).should be_false
    end

    it "should return true for series-level components" do
      pending "This is an old test"
      ["series","subseries"].each do |level|
        xml = '<c id="ref42" level="' + level +'"></c>'
        node = Nokogiri::XML(xml)
        @test.component_children?(node.elements.first).should be_true
      end
    end

    it "should return false for item-level components" do
      pending "This is an old test"
      ["file","item"].each do |level|
        xml = '<c id="ref42" level="' + level +'"></c>'
        node = Nokogiri::XML(xml)
        @test.component_children?(node.elements.first).should be_false
      end
    end

  end

end
@@ -0,0 +1,32 @@
1
+ require "spec_helper"
2
+
3
# Specs for the solr document produced by SolrEad::Component#to_solr.
describe SolrEad::Component do

  before(:all) do
    file = "component_template.xml"
    @doc = SolrEad::Component.from_xml(fixture file)
  end

  describe "the solr document" do

    it "should accept additional fields from a hash" do
      additional_fields = {
        "id"                      => "TEST-0001:ref010",
        "eadid_s"                 => "TEST-0001",
        "parent_id_s"             => "ref001",
        "parent_id_list_t"        => ["ref001", "ref002", "ref003"],
        "parent_unittitle_list_t" => ["Series I", "Subseries A", "Subseries 1"],
        # The literal is used because the FALSE constant no longer exists in
        # current Ruby versions.
        "component_children_b"    => false
      }
      solr_doc = @doc.to_solr(additional_fields)
      solr_doc["id"].should == "TEST-0001:ref010"
      solr_doc["level_facet"].should include "item"
      solr_doc["heading_display"].should == "Series I >> Subseries A >> Subseries 1 >> Internal Revenue Service Form Information Return [RESTRICTED]"
      solr_doc["accessrestrict_t"].first.should match(/^This item .* is available.$/)
      solr_doc["accessrestrict_heading_display"].should include "Access Restrictions"
    end

  end

end
@@ -0,0 +1,80 @@
1
+ require "spec_helper"
2
+
3
# Specs for SolrEad::Document: the OM terminology accessors and the solr
# document built by #to_solr, checked against two fixture eads.
describe SolrEad::Document do

  before(:all) do
    @ex1 = SolrEad::Document.from_xml(fixture "ARC-0005.xml")
    @ex2 = SolrEad::Document.from_xml(fixture "pp002010.xml")
    @solr_ex1 = @ex1.to_solr
    @solr_ex2 = @ex2.to_solr
  end

  describe "#terminology" do

    it "should have an id field" do
      @ex1.eadid.first.should == "ARC-0005"
    end

    it "should have a simple title" do
      @ex1.title.first.should match "Eddie Cochran Historical Organization Collection"
    end

    it "should have some subject headings" do
      @ex1.persname.should include "Cochran, Eddie, 1938-1960"
      @ex1.genreform.should include "Newspapers"
      @ex1.subject.should include "Rockabilly music"
      @ex2.corpname.should include "Tuskegee Normal and Industrial Institute--1880-1940."
      @ex2.genreform.should include "Group portraits--1880-1940."
      @ex2.geogname.should include "Washington, D.C."
      @ex2.name.should include "Bell, J.S., Portland, OR"
      @ex2.persname.should include "Johnston, Frances Benjamin, 1864-1952, photographer."
      @ex2.subject.should include "Buildings--1880-1940."
    end

    it "should have scope and contents" do
      @ex2.scopecontent.first.should match(/^Photographs/)
    end

    it "should have one separatedmaterial material note from the archdesc section" do
      @ex1.separatedmaterial.first.should match(/^Commercially-released publications.*materials are available.$/)
    end

    it "should have its xml" do
      # Regexp literals are used so that \s means "any whitespace"; inside a
      # double-quoted string it is only a literal space.
      @ex1.to_xml.should match(/<c\s/)
      @ex2.to_xml.should match(/<c01\s/)
    end

  end

  describe ".to_solr" do

    it "should have the appropriate id fields" do
      @solr_ex1["eadid_s"].should == "ARC-0005"
      @solr_ex1["id"].should == "ARC-0005"
      @solr_ex2["eadid_s"].should == "http://hdl.loc.gov/loc.pnp/eadpnp.pp002010"
      @solr_ex2["id"].should == "http://hdl.loc.gov/loc.pnp/eadpnp.pp002010"
    end

    it "should have faceted terms created from subject headings" do
      @solr_ex1["persname_facet"].should include "Cochran, Eddie, 1938-1960"
      @solr_ex1["genreform_facet"].should include "Newspapers"
      @solr_ex1["subject_facet"].should include "Rockabilly music"
      @solr_ex2["corpname_facet"].should include "Tuskegee Normal and Industrial Institute--1880-1940."
      @solr_ex2["genreform_facet"].should include "Group portraits--1880-1940."
      @solr_ex2["geogname_facet"].should include "Washington, D.C."
      @solr_ex2["name_facet"].should include "Bell, J.S., Portland, OR"
      @solr_ex2["persname_facet"].should include "Johnston, Frances Benjamin, 1864-1952, photographer."
      @solr_ex2["subject_facet"].should include "Buildings--1880-1940."
    end

    it "should index head tags as display and p tags as text" do
      @solr_ex1["separatedmaterial_heading_display"].should include "Separated Materials"
      @solr_ex1["separatedmaterial_t"].first.should match(/^Commercially-released publications.*materials are available.$/)
      @solr_ex2["scopecontent_heading_display"].should include "Scope and Content Note"
      @solr_ex2["scopecontent_t"].first.should match(/^Photographs/)
    end

  end

end