solr_ead 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,128 @@
1
+ module SolrEad
2
+
3
+ # The main entry point for your ead going into solr.
4
+ #
5
+ # SolrEad uses RSolr to connect to your solr server and then gives you a couple of
6
+ # simple methods for creating, updating and deleting your ead documents.
7
+ #
8
+ # You'll need to have your solr configuration defined in config/solr.yml. If you're
9
+ # working within the Rails environment, it will obey your environment settings. However,
10
+ # if you are using the gem by itself outside of rails, you can use the RAILS_ENV environment
11
+ # variable, otherwise, it will default to the development url.
12
+ #
13
+ # ==Default indexing
14
+ # This will index your ead into one solr document for the main portion of ead and then
15
+ # multiple documents for the component documents. The fields for the main document
16
+ # are defined in SolrEad::Document and fields for the component are defined in SolrEad::Component.
17
+ # > file = File.new("path/to/your/ead.xml")
18
+ # > indexer = SolrEad::Indexer.new
19
+ # > indexer.create(file)
20
+ # > indexer.delete("EAD-ID")
21
+ #
22
+ # ==Simple indexing
23
+ # By using the :simple option, SolrEad will create only one solr document from one ead.
24
+ # The default implementation of SolrEad is to create multiple documents, so fields
25
+ # defined in SolrEad::Document reflect this. For example, no component fields are defined in
26
+ # SolrEad::Document, so none would be indexed. If you elect to use the :simple option, you'll
27
+ # want to override SolrEad::Document with your own and define any additional component fields
28
+ # you want to appear in your index.
29
+ # > file = File.new("path/to/your/ead.xml")
30
+ # > indexer = SolrEad::Indexer.new(:simple => true)
31
+ # > indexer.create(file)
32
+ # > indexer.delete("EAD-ID")
33
+
34
+ class Indexer
35
+
36
+ include RSolr
37
+ include SolrEad::Behaviors
38
+
39
+ attr_accessor :solr, :options
40
+
41
+ # Creates a new instance of SolrEad::Indexer and connects to your solr server
42
+ # using the url supplied in your config/solr.yml file.
43
+ def initialize(opts={})
44
+ if defined?(Rails.root)
45
+ url = YAML.load_file(File.join(Rails.root,"config","solr.yml"))[Rails.env]['url']
46
+ elsif ENV['RAILS_ENV']
47
+ url = YAML.load_file(File.join(Rails.root,"config","solr.yml"))[ENV['RAILS_ENV']]['url']
48
+ else
49
+ url = YAML.load_file("config/solr.yml")['development']['url']
50
+ end
51
+ self.solr = RSolr.connect :url => url
52
+ self.options = opts
53
+ end
54
+
55
+ # Indexes your ead and additional component documents with the supplied file, then
56
+ # commits the results to your solr server.
57
+ def create(file)
58
+ doc = om_document(File.new(file))
59
+ solr_doc = doc.to_solr
60
+ solr.add solr_doc
61
+ add_components(file) unless options[:simple]
62
+ solr.commit
63
+ end
64
+
65
+ # Updates your ead from a given file by first deleting the existing ead document and
66
+ # any component documents, then creating a new index from the supplied file.
67
+ # This method will also commit the results to your solr server when complete.
68
+ def update(file)
69
+ doc = om_document(File.new(file))
70
+ solr_doc = doc.to_solr
71
+ solr.delete_by_query( 'eadid_s:"' + solr_doc["id"] + '"' )
72
+ solr.add solr_doc
73
+ add_components(file) unless options[:simple]
74
+ solr.commit
75
+ end
76
+
77
+ # Deletes the ead document and any component documents from your solr index and
78
+ # commits the results.
79
+ def delete(id) # id is matched against the eadid_s field
80
+ solr.delete_by_query( 'eadid_s:"' + id + '"') # id is concatenated as-is, so it must be a String
81
+ solr.commit
82
+ end
83
+
84
+ protected
85
+
86
+ # Returns an OM document from a given file.
87
+ #
88
+ # Determines if you have specified a custom definition for your ead document.
89
+ # If you've defined a class CustomDocument, and have passed it as an option
90
+ # to your indexer, then SolrEad will use that class instead of SolrEad::Document.
91
+ def om_document(file)
92
+ options[:document] ? options[:document].from_xml(File.new(file)) : SolrEad::Document.from_xml(File.new(file))
93
+ end
94
+
95
+ # Returns an OM document from a given Nokogiri node
96
+ #
97
+ # Determines if you have specified a custom definition for your ead component.
98
+ # If you've defined a class CustomComponent, and have passed it as an option
99
+ # to your indexer, then SolrEad will use that class instead of SolrEad::Component.
100
+ def om_component_from_node(node)
101
+ options[:component] ? options[:component].from_xml(prep(node)) : SolrEad::Component.from_xml(prep(node))
102
+ end
103
+
104
+ # Creates solr documents for each individual component node in the ead. Field names
105
+ # and values are determined according to the OM terminology outlined in
106
+ # SolrEad::Component as well as additional fields taken from the rest of the ead
107
+ # document as described in SolrEad::Behaviors#additional_component_fields.
108
+ #
109
+ # Fields from both the terminology and #additional_component_fields are all assembled
110
+ # into one solr document via the SolrEad::Component#to_solr method. Any customizations to
111
+ # the contents or appearance of the fields can take place within that method.
112
+ #
113
+ # Furthermore, one final field is added to the solr document after the #to_solr method.
114
+ # A sorting field *sort_i* is added to the document using the index values from the array
115
+ # of <c> nodes. This allows us to preserve the order of <c> nodes as they appear
116
+ # in the original ead document.
117
+ def add_components(file)
118
+ counter = 1
119
+ components(file).each do |node|
120
+ solr_doc = om_component_from_node(node).to_solr(additional_component_fields(node))
121
+ solr_doc.merge!({"sort_i" => counter.to_s})
122
+ solr.add solr_doc
123
+ counter = counter + 1
124
+ end
125
+ end
126
+
127
+ end
128
+ end
@@ -0,0 +1,38 @@
1
+ module SolrEad::OmBehaviors
2
+
3
+ # This modifies the behavior of the OM gem, specifically, the way it creates
4
+ # documents using existing xml.
5
+ #
6
+ # Instead of using the xml as-is, this module will override OM::XML::Container.from_xml
7
+ # and remove all the namespaces from the xml first, then return the Nokogiri object.
8
+ # This makes working with the terminologies in SolrEad::Document much easier.
9
+ #
10
+ # Any customized ead document definitions should include this module. ex:
11
+ # class MyDocument < SolrEad::Document
12
+ #
13
+ # include OM::XML::Document
14
+ # include Solrizer::XML::TerminologyBasedSolrizer
15
+ # include SolrEad::OmBehaviors
16
+ #
17
+ # end
18
+
19
+ module ClassMethods
20
+
21
+ def from_xml(xml=nil, tmpl=self.new) # :nodoc:
22
+ if xml.nil?
23
+ # noop: handled in #ng_xml accessor.. tmpl.ng_xml = self.xml_template
24
+ elsif xml.kind_of? Nokogiri::XML::Node
25
+ tmpl.ng_xml = xml.remove_namespaces!
26
+ else
27
+ tmpl.ng_xml = Nokogiri::XML::Document.parse(xml).remove_namespaces!
28
+ end
29
+ return tmpl
30
+ end
31
+
32
+ end
33
+
34
+ def self.included(klass)
35
+ klass.extend(ClassMethods)
36
+ end
37
+
38
+ end
@@ -0,0 +1,7 @@
1
+ module SolrEad
2
+ class Railtie < Rails::Railtie # only required when Rails is defined (see lib/solr_ead.rb)
3
+ rake_tasks do # registers the gem's rake file through Rails::Railtie.rake_tasks
4
+ load File.join(File.dirname(__FILE__),"../tasks/solr_ead.rake") # path is relative to this file's directory
5
+ end
6
+ end
7
+ end
@@ -0,0 +1,3 @@
1
+ module SolrEad
2
+ VERSION = "0.0.1"
3
+ end
data/lib/solr_ead.rb ADDED
@@ -0,0 +1,20 @@
1
+ require "nokogiri"
2
+ require "solrizer"
3
+ require "om"
4
+ require "rsolr"
5
+
6
+ module SolrEad
7
+ def self.version
8
+ SolrEad::VERSION
9
+ end
10
+ end
11
+
12
+ require "ead_mapper"
13
+ require "terminology_based_solrizer"
14
+ require "solr_ead/behaviors"
15
+ require "solr_ead/om_behaviors"
16
+ require "solr_ead/indexer"
17
+ require "solr_ead/document"
18
+ require "solr_ead/component"
19
+
20
+ require 'solr_ead/railtie' if defined?(Rails)
@@ -0,0 +1,30 @@
1
+ require "solr_ead"
2
+
3
+ namespace :solr_ead do
4
+
5
+ desc "Index and ead into solr using FILE=<path/to/ead.xml>"
6
+ task :index do
7
+ raise "Please specify your ead, ex. FILE=<path/to/ead.xml" unless ENV['FILE']
8
+ indexer = SolrEad::Indexer.new
9
+ indexer.update(ENV['FILE'])
10
+ end
11
+
12
+ desc "Delete and ead from your solr index using ID='<eadid>'"
13
+ task :delete do
14
+ raise "Please specify your ead id, ex. ID=<eadid>" unless ENV['ID']
15
+ indexer = SolrEad::Indexer.new
16
+ indexer.delete(ENV['ID'])
17
+ end
18
+
19
+ desc "Index a directory of ead files given by DIR=path/to/directory"
20
+ task :index_dir do
21
+ raise "Please specify your direction, ex. DIR=path/to/directory" unless ENV['DIR']
22
+ indexer = SolrEad::Indexer.new
23
+ Dir.glob(File.join(ENV['DIR'],"*")).each do |file|
24
+ print "Indexing #{File.basename(file)}..."
25
+ indexer.update(file) if File.extname(file).match("xml$")
26
+ print "done.\n"
27
+ end
28
+ end
29
+
30
+ end
@@ -0,0 +1,9 @@
1
+ # Overrides Solrizer::XML::TerminologyBasedSolrizer in the solrizer gem and
2
+ # uses our own custom field mapper defined in EadMapper
3
+ module Solrizer::XML::TerminologyBasedSolrizer
4
+
5
+ def self.default_field_mapper
6
+ @@default_field_mapper ||= EadMapper.new # built on the first call, then the same mapper is reused
7
+ end
8
+
9
+ end
data/solr_ead.gemspec ADDED
@@ -0,0 +1,27 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path('../lib/solr_ead/version', __FILE__)
3
+
4
+ Gem::Specification.new do |gem|
5
+ gem.authors = ["Adam Wead"]
6
+ gem.email = ["amsterdamos@gmail.com"]
7
+ gem.description = %q{A gem indexing ead into solr using OM. Define your own OM terminology to create the solr fields you want from your ead, then use solr-based applications like Blacklight to search and display the results.}
8
+ gem.summary = %q{A gem for indexing ead into solr using OM}
9
+ gem.homepage = "http://github.com/awead/ead_solr"
10
+
11
+ gem.files = `git ls-files`.split($\)
12
+ gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
13
+ gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
14
+ gem.name = "solr_ead"
15
+ gem.require_paths = ["lib"]
16
+ gem.version = SolrEad::VERSION
17
+
18
+ # Dependencies
19
+ gem.add_dependency('om')
20
+ gem.add_dependency('solrizer')
21
+ gem.add_dependency('rsolr')
22
+ gem.add_dependency('sanitize')
23
+ # For Development
24
+ gem.add_development_dependency 'rspec'
25
+ gem.add_development_dependency 'debugger'
26
+ gem.add_development_dependency 'rdoc'
27
+ end
@@ -0,0 +1,105 @@
1
+ require "spec_helper"
2
+
3
+ describe SolrEad::Behaviors do
4
+
5
+ before :all do
6
+ @not_numbered = fixture "ARC-0005.xml"
7
+ @numbered = fixture "pp002010.xml"
8
+ class TestClass
9
+ include SolrEad::Behaviors
10
+ end
11
+ @test = TestClass.new
12
+ end
13
+
14
+ describe "#components" do
15
+
16
+ before :all do
17
+ @non_numbered_nodeset = @test.components(@not_numbered)
18
+ @numbered_nodeset = @test.components(@numbered)
19
+ end
20
+
21
+ it "should return a nodeset" do
22
+ @non_numbered_nodeset.should be_a_kind_of(Nokogiri::XML::NodeSet)
23
+ @non_numbered_nodeset.first.should be_a_kind_of(Nokogiri::XML::Element)
24
+ end
25
+
26
+ it "should be able to handle both numbered and non-numbered <c> nodes" do
27
+ @non_numbered_nodeset.count.should == 135
28
+ @numbered_nodeset.count.should == 83
29
+ end
30
+ end
31
+
32
+ describe "#prep" do
33
+ it "should return a single component document" do
34
+ part = @test.prep(@numbered_nodeset) # NOTE(review): @numbered_nodeset is assigned in the sibling "#components" group's before :all - confirm it is set here
35
+ part.should be_a_kind_of(Nokogiri::XML::Document)
36
+ end
37
+ end
38
+
39
+ describe "#component_children?" do
40
+
41
+ before :all do
42
+ @true = '
43
+ <c id="ref167" level="file">
44
+ <did>
45
+ <unittitle>Zines</unittitle>
46
+ </did>
47
+ <c id="ref169" level="file">
48
+ <did>
49
+ <unittitle>Contagion</unittitle>
50
+ <container id="cid384011" type="Box" label="Graphic materials">SF2</container>
51
+ <container parent="cid384011" type="Folder">8</container>
52
+ <unitdate>1980-1981</unitdate>
53
+ </did>
54
+ </c>
55
+ <c id="ref171" level="file">
56
+ <did>
57
+ <unittitle>Single issues</unittitle>
58
+ <container id="cid384012" type="Box" label="Graphic materials">SF2</container>
59
+ <container parent="cid384012" type="Folder">9</container>
60
+ <unitdate>1977-1985</unitdate>
61
+ </did>
62
+ </c>
63
+ </c>
64
+ '
65
+ @false = '
66
+ <c id="ref167" level="file">
67
+ <did>
68
+ <unittitle>Zines</unittitle>
69
+ </did>
70
+ </c>
71
+ '
72
+ end
73
+
74
+
75
+ it "should return true for components that have c nodes below them" do
76
+ node = Nokogiri::XML(@true)
77
+ @test.component_children?(node.elements.first).should be_true
78
+ end
79
+
80
+ it "should return false for components that do not have c nodes below them" do
81
+ node = Nokogiri::XML(@false)
82
+ @test.component_children?(node.elements.first).should be_false
83
+ end
84
+
85
+ it "should return true for series-level components" do
86
+ pending "This is an old test"
87
+ ["series","subseries"].each do |level|
88
+ xml = '<c id="ref42" level="' + level +'"></c>'
89
+ node = Nokogiri::XML(xml)
90
+ @test.component_children?(node.elements.first).should be_true
91
+ end
92
+ end
93
+
94
+ it "should return false for item-level components" do
95
+ pending "This is an old test"
96
+ ["file","item"].each do |level|
97
+ xml = '<c id="ref42" level="' + level +'"></c>'
98
+ node = Nokogiri::XML(xml)
99
+ @test.component_children?(node.elements.first).should be_false
100
+ end
101
+ end
102
+
103
+ end
104
+
105
+ end
@@ -0,0 +1,32 @@
1
+ require "spec_helper"
2
+
3
+ describe SolrEad::Component do
4
+
5
+ before(:all) do
6
+ file = "component_template.xml"
7
+ @doc = SolrEad::Component.from_xml(fixture file)
8
+ end
9
+
10
+ describe "the solr document" do
11
+
12
+ it "should accept additional fields from a hash" do
13
+ additional_fields = {
14
+ "id" => "TEST-0001:ref010",
15
+ "eadid_s" => "TEST-0001",
16
+ "parent_id_s" => "ref001",
17
+ "parent_id_list_t" => ["ref001", "ref002", "ref003"],
18
+ "parent_unittitle_list_t" => ["Series I", "Subseries A", "Subseries 1"],
19
+ "component_children_b" => FALSE
20
+ }
21
+ solr_doc = @doc.to_solr(additional_fields)
22
+ solr_doc["id"].should == "TEST-0001:ref010"
23
+ solr_doc["level_facet"].should include "item"
24
+ solr_doc["heading_display"].should == "Series I >> Subseries A >> Subseries 1 >> Internal Revenue Service Form Information Return [RESTRICTED]"
25
+ solr_doc["accessrestrict_t"].first.should match /^This item .* is available.$/
26
+ solr_doc["accessrestrict_heading_display"].should include "Access Restrictions"
27
+
28
+ end
29
+
30
+ end
31
+
32
+ end
@@ -0,0 +1,80 @@
1
+ require "spec_helper"
2
+
3
+ describe SolrEad::Document do
4
+
5
+ before(:all) do
6
+ @ex1 = SolrEad::Document.from_xml(fixture "ARC-0005.xml")
7
+ @ex2 = SolrEad::Document.from_xml(fixture "pp002010.xml")
8
+ @solr_ex1 = @ex1.to_solr
9
+ @solr_ex2 = @ex2.to_solr
10
+ end
11
+
12
+ describe "#terminology" do
13
+
14
+ it "should have an id field" do
15
+ @ex1.eadid.first.should == "ARC-0005"
16
+ end
17
+
18
+ it "should have a simple title" do
19
+ @ex1.title.first.should match "Eddie Cochran Historical Organization Collection"
20
+ end
21
+
22
+ it "should have some subject headings" do
23
+ @ex1.persname.should include "Cochran, Eddie, 1938-1960"
24
+ @ex1.genreform.should include "Newspapers"
25
+ @ex1.subject.should include "Rockabilly music"
26
+ @ex2.corpname.should include "Tuskegee Normal and Industrial Institute--1880-1940."
27
+ @ex2.genreform.should include "Group portraits--1880-1940."
28
+ @ex2.geogname.should include "Washington, D.C."
29
+ @ex2.name.should include "Bell, J.S., Portland, OR"
30
+ @ex2.persname.should include "Johnston, Frances Benjamin, 1864-1952, photographer."
31
+ @ex2.subject.should include "Buildings--1880-1940."
32
+ end
33
+
34
+ it "should have scope and contents" do
35
+ @ex2.scopecontent.first.should match /^Photographs/
36
+ end
37
+
38
+ it "should have one separatedmaterial material note from the archdesc section" do
39
+ @ex1.separatedmaterial.first.should match /^Commercially-released publications.*materials are available.$/
40
+ end
41
+
42
+ it "should have its xml" do
43
+ @ex1.to_xml.should match "<c\s"
44
+ @ex2.to_xml.should match "<c01\s"\
45
+ end
46
+
47
+ end
48
+
49
+ describe ".to_solr" do
50
+
51
+ it "should have the appropriate id fields" do
52
+ @solr_ex1["eadid_s"].should == "ARC-0005"
53
+ @solr_ex1["id"].should == "ARC-0005"
54
+ @solr_ex2["eadid_s"].should == "http://hdl.loc.gov/loc.pnp/eadpnp.pp002010"
55
+ @solr_ex2["id"].should == "http://hdl.loc.gov/loc.pnp/eadpnp.pp002010"
56
+
57
+ end
58
+
59
+ it "should have faceted terms created from subject headings" do
60
+ @solr_ex1["persname_facet"].should include "Cochran, Eddie, 1938-1960"
61
+ @solr_ex1["genreform_facet"].should include "Newspapers"
62
+ @solr_ex1["subject_facet"].should include "Rockabilly music"
63
+ @solr_ex2["corpname_facet"].should include "Tuskegee Normal and Industrial Institute--1880-1940."
64
+ @solr_ex2["genreform_facet"].should include "Group portraits--1880-1940."
65
+ @solr_ex2["geogname_facet"].should include "Washington, D.C."
66
+ @solr_ex2["name_facet"].should include "Bell, J.S., Portland, OR"
67
+ @solr_ex2["persname_facet"].should include "Johnston, Frances Benjamin, 1864-1952, photographer."
68
+ @solr_ex2["subject_facet"].should include "Buildings--1880-1940."
69
+ end
70
+
71
+ it "should index head tags as display and p tags as text" do
72
+ @solr_ex1["separatedmaterial_heading_display"].should include "Separated Materials"
73
+ @solr_ex1["separatedmaterial_t"].first.should match /^Commercially-released publications.*materials are available.$/
74
+ @solr_ex2["scopecontent_heading_display"].should include "Scope and Content Note"
75
+ @solr_ex2["scopecontent_t"].first.should match /^Photographs/
76
+ end
77
+
78
+ end
79
+
80
+ end