solr_ead 0.6.0 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- ZDQzY2IzYzhhODUxZTg0NTk0Njk1YzczN2U2ZWY1NGM0OTVhNTAwOA==
4
+ NWJmOTVjOGQ3OWMwODIwNjE4NWYyYjA3ZjU4YmM5Y2IwMzI1YzQ0YQ==
5
5
  data.tar.gz: !binary |-
6
- MzIyMzM2N2U3MDkwMjJjNDc0MmZkZTJjNzJhZmFmY2FhMWQzNjRmYg==
6
+ MDA5ZjRiMGI3OTBlYzYzZTM1NDdiYmUwNWRhZmIyYzg0YjEwMDRiZQ==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- ZWMwYTZiYjdlNzY0YjU0YWUwN2FhYjljYzEwOTcyOGExYTVlMjY5MmUzODlh
10
- NjFlMzAxZWIwNDkwNzYyYzJkOGYxMDJmZjA1MWRjZGZlYzE5ZTQ2NzQyODFh
11
- Mzc5MDAwNTJmYjljMzM0ZDQ0YWUzMGQ2ZmIyYzJhNTk2MGMwMDA=
9
+ ZDcxMjhkZjNiYTgwOTZjMDRkZGQyMzgwOTdmYTkxZDE1YTkxOTI2YzQ3ODlm
10
+ ODk1MmJlZWY5MjQ1ZjM1ZWY3ODQ1MThiMDUzNTEyYjIzMjRjMDE0NTM3NTY3
11
+ ODIzMjU1NmFmZWRiZDBjYzk2MmRmYjcwMjA1MDEzM2U1YThmMDk=
12
12
  data.tar.gz: !binary |-
13
- MmQzNjA0NzdjYzA5MDkyMTI3MGNkMjFlZDEyMDQzZTVhYmFjMTc0NDc3ODk4
14
- NjQwMmEyZjUzMTVhZDZhMzZlMmQ3ZjY0ZThjNmUxMDgxMjMxMjRhZTQ1YmIy
15
- YjNlOGExYjM3ZTEyNDdhYmI4ODUwMjgxMDllOTcxNTFkOGIxNDk=
13
+ ZmI5N2NhY2U3MzY3MWUwNDkyNmFjOGJhNGMzNTBhYmE4ODhhYTZmYWQzOGMw
14
+ ZGVmYmVlYjQyNmQ2MDUwZDkwYThjYjFkYjBlNjYxNDNiZjY4OTU3YzVmMzY3
15
+ NDc3MmU5ZGE3NGE2MzI4ODkxMGZiYjdlMjVkZDJjZmU1NmI2OWU=
data/README.md CHANGED
@@ -73,6 +73,21 @@ will be able to apply any xslt processing you wish. Other solutions are possibl
73
73
  xml from the document as well as the component, depending on the needs of your
74
74
  application.
75
75
 
76
+ ### EAD Formatting
77
+
78
+ EAD xml may contain formatted text such as:
79
+
80
+ <title render="italic">this is italicized</title>
81
+
82
+ When OM processes any node that contains formatted text, the formatted nodes will be ignored
83
+ and the text will appear without any of the `<title>` tags denoting format. If you wish
84
+ to have the formatting preserved as converted HTML, you may add the formatted string
85
+ to your solr document:
86
+
87
+ Solrizer.set_field(solr_doc, "title", self.term_to_html("title"), :displayable)
88
+
89
+ See the section on customization for more information.
90
+
76
91
  ## Customization
77
92
 
78
93
  Chances are the default definitions are not sufficient for your needs. If you want to
@@ -202,6 +217,28 @@ solr. In order to have these fields index correctly, include the following in y
202
217
  Note that the type "text_en" is dependent on your particular solr application, but the others should be
203
218
  included in the default installation.
204
219
 
220
+ ### Displaying HTML
221
+
222
+ To convert formatted EAD nodes to HTML, override the term's contents in the `to_solr` method:
223
+
224
+ class CustomDocument < SolrEad::Document
225
+
226
+ use_terminology SolrEad::Document
227
+
228
+ def to_solr(solr_doc = Hash.new)
229
+ super(solr_doc)
230
+ Solrizer.set_field(solr_doc, "title", self.term_to_html("title"), :displayable)
231
+ end
232
+
233
+ end
234
+
235
+ The above example takes the title term as it is defined in `SolrEad::Document` and changes the contents
236
+ of its solr display field. In this case, the contents of the xml node for the "title" OM term are
237
+ processed by the `term_to_html` method which converts the ead xml to html and stores it in the solr
238
+ field given by the `set_field` method.
239
+
240
+ The details of conversion from ead xml to html are specified in `SolrEad::Formatting`.
241
+
205
242
  ## Issues
206
243
 
207
244
  ### eadid format
@@ -2,6 +2,8 @@ require "sanitize"
2
2
 
3
3
  module SolrEad::Behaviors
4
4
 
5
+ include SolrEad::Formatting
6
+
5
7
  # Takes a file as its input and returns a Nokogiri::XML::NodeSet of component <c> nodes
6
8
  #
7
9
  # It'll make an attempt at substituting numbered component levels for non-numbered
@@ -102,23 +104,14 @@ module SolrEad::Behaviors
102
104
  title = xml.at("/c/did/unittitle")
103
105
  date = xml.at("/c/did/unitdate")
104
106
  if !title.nil? and !title.content.empty?
105
- return ead_clean_xml(title.content)
107
+ return ead_to_html(title.content)
106
108
  elsif !date.nil? and !date.content.empty?
107
- return ead_clean_xml(date.content)
109
+ return ead_to_html(date.content)
108
110
  else
109
111
  return "[No title available]"
110
112
  end
111
113
  end
112
114
 
113
- # Converts formatting elements in the ead into html tags
114
- def ead_clean_xml(string)
115
- string.gsub!(/<title/,"<span")
116
- string.gsub!(/<\/title/,"</span")
117
- string.gsub!(/render=/,"class=")
118
- sanitize = Sanitize.clean(string, :elements => ['span'], :attributes => {'span' => ['class']})
119
- sanitize.gsub("\n",'').gsub(/\s+/, ' ').strip
120
- end
121
-
122
115
  # Returns true or false for a component with attached <c> child nodes.
123
116
  def component_children?(node, t = Array.new)
124
117
  node.children.each { |n| t << n.name }
@@ -2,6 +2,7 @@ class SolrEad::Component
2
2
 
3
3
  include OM::XML::Document
4
4
  include OM::XML::TerminologyBasedSolrizer
5
+ include SolrEad::Formatting
5
6
 
6
7
  # Define each term in your ead that you want put into the solr document
7
8
  set_terminology do |t|
@@ -3,6 +3,7 @@ class SolrEad::Document
3
3
  include OM::XML::Document
4
4
  include OM::XML::TerminologyBasedSolrizer
5
5
  include SolrEad::OmBehaviors
6
+ include SolrEad::Formatting
6
7
 
7
8
  # Define each term in your ead that you want put into the solr document
8
9
  set_terminology do |t|
@@ -0,0 +1,56 @@
1
+ require 'sanitize'
2
+
3
+ module SolrEad::Formatting
4
+
5
+ RENDER_ATTRS =
6
+ {
7
+ "altrender" => "em",
8
+ "bold" => "strong",
9
+ "doublequote" => "em",
10
+ "bolddoublequote" => "strong",
11
+ "bolditalic" => "strong",
12
+ "boldsinglequote" => "strong",
13
+ "boldsmcaps" => "strong",
14
+ "boldunderline" => "strong",
15
+ "italic" => "em",
16
+ "italics" => "em",
17
+ "nonproport" => "em",
18
+ "singlequote" => "em",
19
+ "smcaps" => "em",
20
+ "sub" => "sub",
21
+ "super" => "sup",
22
+ "underline" => "em"
23
+ }
24
+
25
+ # If you're within the context of an OM::XML::Document, you can just pass the term you want converted and
26
+ # this will get the xml using the term.
27
+ def term_to_html term
28
+ ead_to_html self.send(term).nodeset.to_xml
29
+ end
30
+
31
+ # Use this method to convert the xml directly
32
+ def ead_to_html xml
33
+ ::Sanitize.clean(transform_render_attributes(xml), :elements => RENDER_ATTRS.values.uniq )
34
+ end
35
+
36
+ private
37
+
38
+ def transform_render_attributes xml
39
+ ::Sanitize.clean(xml, :transformers => transformer)
40
+ end
41
+
42
+ def transformer
43
+ lambda do |env|
44
+ convert_ead_tag_to_html(env[:node])
45
+ {:node_whitelist => [env[:node]]}
46
+ end
47
+ end
48
+
49
+ def convert_ead_tag_to_html node
50
+ if RENDER_ATTRS.keys.include? node["render"]
51
+ node.name = RENDER_ATTRS[node["render"]]
52
+ node.remove_attribute "render"
53
+ end
54
+ end
55
+
56
+ end
@@ -1,3 +1,3 @@
1
1
  module SolrEad
2
- VERSION = "0.6.0"
2
+ VERSION = "0.7.0"
3
3
  end
data/lib/solr_ead.rb CHANGED
@@ -7,6 +7,7 @@ require 'active_support'
7
7
  module SolrEad
8
8
  extend ActiveSupport::Autoload
9
9
 
10
+ autoload :Formatting
10
11
  autoload :Behaviors
11
12
  autoload :OmBehaviors
12
13
  autoload :Indexer
@@ -14,7 +15,6 @@ module SolrEad
14
15
  autoload :Component
15
16
  autoload :Railtie if defined?(Rails)
16
17
 
17
-
18
18
  def self.version
19
19
  SolrEad::VERSION
20
20
  end
@@ -2,13 +2,13 @@ require "spec_helper"
2
2
 
3
3
  describe SolrEad::Component do
4
4
 
5
- before(:all) do
6
- file = "component_template.xml"
7
- @doc = SolrEad::Component.from_xml(fixture file)
8
- end
9
-
10
5
  describe "the solr document" do
11
6
 
7
+ before :all do
8
+ file = "component_template.xml"
9
+ @doc = SolrEad::Component.from_xml(fixture file)
10
+ end
11
+
12
12
  describe "for item-level components" do
13
13
 
14
14
  before :each do
@@ -52,4 +52,17 @@ describe SolrEad::Component do
52
52
 
53
53
  end
54
54
 
55
+ describe "formatting fields as html" do
56
+
57
+ before :all do
58
+ file = "html_component.xml"
59
+ @sample = SolrEad::Component.from_xml(fixture file)
60
+ end
61
+
62
+ it "should format as term as html" do
63
+ @sample.term_to_html("scopecontent").should include "<em>OPAL</em> "
64
+ end
65
+
66
+ end
67
+
55
68
  end
@@ -0,0 +1,20 @@
1
+ <c id="ref202" level="series">
2
+ <did>
3
+ <unittitle>Series VIII: Miscellaneous</unittitle>
4
+ <unitdate>1960</unitdate>
5
+ </did>
6
+ <scopecontent id="ref215">
7
+ <head>Scope and Contents</head>
8
+ <p>
9
+ Series VIII: Miscellaneous, 1960, contains a single issue of the pocket magazine <title render="italic">OPAL</title>, from Pride Publications (Cleveland, Ohio). The staff of <title render="italic">OPAL</title> included Valena M. Williams, executive editor; John Bentley, art director; and Nathaniel Hubbard, circulation manager. Volume 1, issue 5 includes work by photographers James Gayle, Anderson Marlow, and Harvey Bowie; columnists Bill Clark, Harrison Dillard, Edward Jones, William Matlock, and Mary Zachary; and artwork by cartoonist Ted Walker. It is possible one of the photographs in the magazine could be by Baynes, but none are cited as such. The publication contains information from members of the local community on birthday celebrations and other upcoming events, contest winners, and fashion, as well as longer articles on jazz, the importance of education and peaceful protests, the influence of disc jockeys and popular music on teens, WJMO's middle school student disc jockeys, singer Nancy Wilson, and Isabelle Cooley, co-star of the feature film <title render="italic">I Passed for White</title>.
10
+ </p>
11
+ </scopecontent>
12
+ <c id="ref111" level="file">
13
+ <did>
14
+ <unittitle>Cleveland Opal, Volume 1, Issue 5</unittitle>
15
+ <container id="cid1324121" type="Box" label="Periodicals">2</container>
16
+ <container parent="cid1324121" type="Folder">40</container>
17
+ <unitdate>1960 May 18</unitdate>
18
+ </did>
19
+ </c>
20
+ </c>
@@ -0,0 +1,26 @@
1
+ require 'spec_helper'
2
+
3
+ describe SolrEad::Formatting do
4
+
5
+ before :all do
6
+ class SampleClass
7
+ include SolrEad::Formatting
8
+ end
9
+ @sample = SampleClass.new
10
+ end
11
+
12
+ describe "#ead_to_html" do
13
+
14
+ it "should convert ead markup to html" do
15
+ xml = 'This is some text with <title render="italics">italics</title> included in it.'
16
+ @sample.ead_to_html(xml).should == 'This is some text with <em>italics</em> included in it.'
17
+ end
18
+
19
+ it "should remove other tags" do
20
+ xml = 'Blah blah <title render="italics">italics</title> blah <span>blah</span> <title render="bold">italics</title>'
21
+ @sample.ead_to_html(xml).should == 'Blah blah <em>italics</em> blah blah <strong>italics</strong>'
22
+ end
23
+ end
24
+
25
+
26
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: solr_ead
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.0
4
+ version: 0.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Adam Wead
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-11-06 00:00:00.000000000 Z
11
+ date: 2013-11-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: om
@@ -187,6 +187,7 @@ files:
187
187
  - lib/solr_ead/behaviors.rb
188
188
  - lib/solr_ead/component.rb
189
189
  - lib/solr_ead/document.rb
190
+ - lib/solr_ead/formatting.rb
190
191
  - lib/solr_ead/indexer.rb
191
192
  - lib/solr_ead/om_behaviors.rb
192
193
  - lib/solr_ead/railtie.rb
@@ -204,7 +205,9 @@ files:
204
205
  - spec/fixtures/ead_messy_format.xml
205
206
  - spec/fixtures/ead_sample.xml
206
207
  - spec/fixtures/ead_template.xml
208
+ - spec/fixtures/html_component.xml
207
209
  - spec/fixtures/pp002010.xml
210
+ - spec/formatting_spec.rb
208
211
  - spec/indexer_spec.rb
209
212
  - spec/spec_helper.rb
210
213
  homepage: http://github.com/awead/solr_ead
@@ -241,7 +244,9 @@ test_files:
241
244
  - spec/fixtures/ead_messy_format.xml
242
245
  - spec/fixtures/ead_sample.xml
243
246
  - spec/fixtures/ead_template.xml
247
+ - spec/fixtures/html_component.xml
244
248
  - spec/fixtures/pp002010.xml
249
+ - spec/formatting_spec.rb
245
250
  - spec/indexer_spec.rb
246
251
  - spec/spec_helper.rb
247
252
  has_rdoc: