epub-parser 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,18 +1,48 @@
1
1
  require 'epub/content_document'
2
2
  require 'epub/constants'
3
+ require 'epub/parser/utils'
3
4
  require 'nokogiri'
4
5
 
5
6
  module EPUB
6
7
  class Parser
7
8
  class ContentDocument
9
+ include Utils
10
+
8
11
  class << self
9
- def parse
10
- new.parse
12
+ # @param [EPUB::Publication::Package::Manifest::Item] item
13
+ def parse(item)
14
+ new(item).parse
11
15
  end
12
16
  end
13
17
 
18
# Creates a parser bound to a single manifest item; the item is kept in
# @item and is what #parse later classifies by media type and reads.
+ # @param [EPUB::Publication::Package::Manifest::Item] item
19
+ def initialize(item)
20
+ @item = item
21
+ end
22
+
14
23
  def parse
15
- raise 'Not implemented yet'
24
+ content_document = case @item.media_type
25
+ when 'application/xhtml+xml'
26
+ if @item.nav?
27
+ EPUB::ContentDocument::Navigation.new
28
+ else
29
+ EPUB::ContentDocument::XHTML.new
30
+ end
31
+ when 'image/svg+xml'
32
+ EPUB::ContentDocument::SVG.new
33
+ else
34
+ nil
35
+ end
36
+ return content_document if content_document.nil?
37
+ content_document.item = @item
38
+ document = Nokogiri.XML(@item.read)
39
+ # parse_content_document(document)
40
+ if @item.nav?
41
+ content_document.navigations = parse_navigations(document)
42
+ else
43
+ raise NotImplementedError
44
+ end
45
+ content_document
16
46
  end
17
47
 
18
48
  # @param [Nokogiri::HTML::Document] document HTML document or element including nav
@@ -24,13 +54,51 @@ module EPUB
24
54
  # @param [Nokogiri::XML::Element] element nav element
25
55
  # @return [EPUB::ContentDocument::Navigation::Nav] nav Nav object
26
56
  def parse_navigation(element)
27
- nav = EPUB::ContentDocument::Navigation::Nav.new
28
- nav.heading = find_heading element
29
- nav.type = element['type']
57
+ nav = EPUB::ContentDocument::Navigation::Navigation.new
58
+ nav.text = find_heading(element)
59
+ nav.type = extract_attribute(element, 'type', 'epub')
60
+ nav.items = element.xpath('./xhtml:ol/xhtml:li', EPUB::NAMESPACES).map {|elem| parse_navigation_item(elem)}
30
61
 
31
62
  nav
32
63
  end
33
64
 
65
# Builds one Navigation::Item from an li element and recurses into the li
# elements of its first nested ol.
#
# Returns nil (not an Item) when the li has neither an a nor a span child,
# so the arrays produced by the recursive map below can contain nil entries.
+ # @param [Nokogiri::XML::Element] element li element
66
+ def parse_navigation_item(element)
67
+ item = EPUB::ContentDocument::Navigation::Item.new
68
+ a_or_span = element.xpath('./xhtml:a[1]|xhtml:span[1]', EPUB::NAMESPACES).first
69
+ return a_or_span if a_or_span.nil?
70
+
71
+ item.text = a_or_span.text
72
+ if a_or_span.name == 'a'
73
+ if item.text.empty?
# The anchor has no text of its own: try to derive a label from the first
# embedded-content child instead.
74
+ embedded_content = a_or_span.xpath('./xhtml:audio[1]|xhtml:canvas[1]|xhtml:embed[1]|xhtml:iframe[1]|xhtml:img[1]|xhtml:math[1]|xhtml:object[1]|xhtml:svg[1]|xhtml:video[1]', EPUB::NAMESPACES).first
75
+ unless embedded_content.nil?
# Ruby `when` clauses do not fall through: only iframe, img and object
# assign a label here; audio, canvas, embed, math, svg and video leave
# item.text empty.
# NOTE(review): the empty branches look like placeholders (or intended
# C-style grouping) -- confirm which was meant.
76
+ case embedded_content.name
77
+ when 'audio'
78
+ when 'canvas'
79
+ when 'embed'
80
+ when 'iframe'
81
+ item.text = (extract_attribute(embedded_content, 'name') || extract_attribute(embedded_content, 'srcdoc')).to_s
82
+ when 'img'
83
+ item.text = extract_attribute(embedded_content, 'alt').to_s
84
+ when 'math'
85
+ when 'object'
86
+ item.text = extract_attribute(embedded_content, 'name').to_s
87
+ when 'svg'
88
+ when 'video'
89
+ else
90
+ end
91
+ end
# Last resort for a text-less anchor: use its title attribute.
92
+ item.text = extract_attribute(a_or_span, 'title').to_s if item.text.nil? || item.text.empty?
93
+ end
# Only anchors get an href and a linked manifest item; spans get neither.
# NOTE(review): extract_attribute returns nil when href is absent, and the
# next line calls request_uri on the parsed result -- verify what
# Addressable::URI.parse does with nil and whether a guard is needed.
94
+ item.href = Addressable::URI.parse(extract_attribute(a_or_span, 'href'))
95
+ item.item = @item.manifest.items.selector {|it| it.href.request_uri == item.href.request_uri}.first
96
+ end
# Child entries come from the first nested ol only.
97
+ item.items = element.xpath('./xhtml:ol[1]/xhtml:li', EPUB::NAMESPACES).map {|li| parse_navigation_item(li)}
98
+
99
+ item
100
+ end
101
+
34
102
  private
35
103
 
36
104
  # @param [Nokogiri::XML::Element] element nav element
@@ -6,6 +6,8 @@ require 'nokogiri'
6
6
  module EPUB
7
7
  class Parser
8
8
  class OCF
9
+ include Utils
10
+
9
11
  DIRECTORY = 'META-INF'
10
12
  EPUB::OCF::MODULES.each {|m| self.const_set "#{m.upcase}_FILE", "#{m}.xml"}
11
13
 
@@ -38,7 +40,8 @@ module EPUB
38
40
  doc.xpath('/ocf:container/ocf:rootfiles/ocf:rootfile', EPUB::NAMESPACES).each do |elem|
39
41
  rootfile = EPUB::OCF::Container::Rootfile.new
40
42
  %w[full-path media-type].each do |attr|
41
- rootfile.__send__(attr.gsub(/-/, '_') + '=', elem[attr])
43
+ value = extract_attribute(elem, attr)
44
+ rootfile.__send__(attr.gsub(/-/, '_') + '=', value)
42
45
  end
43
46
  container.rootfiles << rootfile
44
47
  end
@@ -8,9 +8,11 @@ require 'epub/constants'
8
8
  module EPUB
9
9
  class Parser
10
10
  class Publication
11
+ include Utils
12
+
11
13
  class << self
12
14
  def parse(zip_archive, file)
13
- opf = zip_archive.fopen(file).read
15
+ opf = zip_archive.fopen(Addressable::URI.unencode(file)).read
14
16
  new(opf, file).parse
15
17
  end
16
18
  end
@@ -35,11 +37,10 @@ module EPUB
35
37
  def parse_package
36
38
  elem = @doc.root
37
39
  %w[version xml:lang dir id].each do |attr|
38
- writer = attr.gsub(/\:/, '_') + '='
39
- @package.__send__(writer, elem[attr])
40
+ @package.__send__ "#{attr.gsub(/\:/, '_')}=", extract_attribute(elem, attr)
40
41
  end
41
42
  @unique_identifier_id = elem['unique-identifier']
42
- @package.prefix = parse_prefix(elem['prefix'])
43
+ @package.prefix = parse_prefix(extract_attribute(elem, 'prefix'))
43
44
 
44
45
  @package
45
46
  end
@@ -52,7 +53,7 @@ module EPUB
52
53
  metadata.identifiers = elem.xpath('./dc:identifier', EPUB::NAMESPACES).collect do |e|
53
54
  identifier = EPUB::Publication::Package::Metadata::DCMES.new
54
55
  identifier.content = e.content
55
- identifier.id = id = e['id']
56
+ identifier.id = id = extract_attribute(e, 'id')
56
57
  metadata.unique_identifier = identifier if id == @unique_identifier_id
57
58
 
58
59
  identifier
@@ -62,7 +63,7 @@ module EPUB
62
63
  metadata.titles = elem.xpath('./dc:title', EPUB::NAMESPACES).collect do |e|
63
64
  title = EPUB::Publication::Package::Metadata::Title.new
64
65
  %w[ id lang dir ].each do |attr|
65
- title.__send__("#{attr}=", e[attr])
66
+ title.__send__("#{attr}=", extract_attribute(e, attr))
66
67
  end
67
68
  title.content = e.content
68
69
 
@@ -85,9 +86,12 @@ module EPUB
85
86
 
86
87
  metadata.metas = elem.xpath('./opf:meta', EPUB::NAMESPACES).collect do |e|
87
88
  meta = EPUB::Publication::Package::Metadata::Meta.new
88
- %w[ property id scheme ].each { |attr| meta.__send__("#{attr}=", e[attr]) }
89
+ %w[property id scheme].each do |attr|
90
+ meta.__send__ "#{attr}=", extract_attribute(e, attr)
91
+ end
89
92
  meta.content = e.content
90
- if (refines = e['refines']) && refines[0] == '#'
93
+ refines = extract_attribute(e, 'refines')
94
+ if refines && refines[0] == '#'
91
95
  id = refines[1..-1]
92
96
  id_map[id] ||= {}
93
97
  id_map[id][:refiners] ||= []
@@ -101,11 +105,12 @@ module EPUB
101
105
  metadata.links = elem.xpath('./opf:link', EPUB::NAMESPACES).collect do |e|
102
106
  link = EPUB::Publication::Package::Metadata::Link.new
103
107
  %w[ id media-type ].each do |attr|
104
- link.__send__(attr.gsub(/-/, '_') + '=', e[attr])
108
+ link.__send__ (attr.gsub(/-/, '_') + '='), extract_attribute(e, attr)
105
109
  end
106
- link.href = Addressable::URI.parse(e['href'])
107
- link.rel = e['rel'].strip.split
108
- if (refines = e['refines']) && refines[0] == '#'
110
+ link.href = Addressable::URI.parse(extract_attribute(e, 'href'))
111
+ link.rel = extract_attribute(e, 'rel').strip.split
112
+ refines = extract_attribute(e, 'refines')
113
+ if refines && refines[0] == '#'
109
114
  id = refines[1..-1]
110
115
  id_map[id] ||= {}
111
116
  id_map[id][:refiners] ||= []
@@ -128,17 +133,19 @@ module EPUB
128
133
  def parse_manifest
129
134
  manifest = @package.manifest = EPUB::Publication::Package::Manifest.new
130
135
  elem = @doc.xpath('/opf:package/opf:manifest', EPUB::NAMESPACES).first
131
- manifest.id = elem['id']
136
+ manifest.id = extract_attribute(elem, 'id')
132
137
 
133
138
  fallback_map = {}
134
139
  elem.xpath('./opf:item', EPUB::NAMESPACES).each do |e|
135
140
  item = EPUB::Publication::Package::Manifest::Item.new
136
141
  %w[ id media-type media-overlay ].each do |attr|
137
- item.__send__("#{attr.gsub(/-/, '_')}=", e[attr])
142
+ item.__send__ "#{attr.gsub(/-/, '_')}=", extract_attribute(e, attr)
138
143
  end
139
- item.href = Addressable::URI.parse(e['href'])
140
- fallback_map[e['fallback']] = item if e['fallback']
141
- item.properties = e['properties'] ? e['properties'].split(' ') : []
144
+ item.href = Addressable::URI.parse(extract_attribute(e, 'href'))
145
+ fallback = extract_attribute(e, 'fallback')
146
+ fallback_map[fallback] = item if fallback
147
+ properties = extract_attribute(e, 'properties')
148
+ item.properties = properties ? properties.split(' ') : []
142
149
  manifest << item
143
150
  end
144
151
  fallback_map.each_pair do |id, from|
@@ -152,16 +159,17 @@ module EPUB
152
159
  spine = @package.spine = EPUB::Publication::Package::Spine.new
153
160
  elem = @doc.xpath('/opf:package/opf:spine', EPUB::NAMESPACES).first
154
161
  %w[ id toc page-progression-direction ].each do |attr|
155
- spine.__send__("#{attr.gsub(/-/, '_')}=", elem[attr])
162
+ spine.__send__ "#{attr.gsub(/-/, '_')}=", extract_attribute(elem, attr)
156
163
  end
157
164
 
158
165
  elem.xpath('./opf:itemref', EPUB::NAMESPACES).each do |e|
159
166
  itemref = EPUB::Publication::Package::Spine::Itemref.new
160
167
  %w[ idref id ].each do |attr|
161
- itemref.__send__("#{attr}=", e[attr])
168
+ itemref.__send__ "#{attr}=", extract_attribute(e, attr)
162
169
  end
163
- itemref.linear = (e['linear'] != 'no')
164
- itemref.properties = e['properties'] ? e['properties'].split(' ') : []
170
+ itemref.linear = (extract_attribute(e, 'linear') != 'no')
171
+ properties = extract_attribute(e, 'properties')
172
+ itemref.properties = properties ? properties.split(' ') : []
165
173
  spine << itemref
166
174
  end
167
175
 
@@ -173,9 +181,9 @@ module EPUB
173
181
  @doc.xpath('/opf:package/opf:guide/opf:reference', EPUB::NAMESPACES).each do |ref|
174
182
  reference = EPUB::Publication::Package::Guide::Reference.new
175
183
  %w[ type title ].each do |attr|
176
- reference.__send__("#{attr}=", ref[attr])
184
+ reference.__send__ "#{attr}=", extract_attribute(ref, attr)
177
185
  end
178
- reference.href = Addressable::URI.parse(ref['href'])
186
+ reference.href = Addressable::URI.parse(extract_attribute(ref, 'href'))
179
187
  guide << reference
180
188
  end
181
189
 
@@ -186,9 +194,9 @@ module EPUB
186
194
  bindings = @package.bindings = EPUB::Publication::Package::Bindings.new
187
195
  @doc.xpath('/opf:package/opf:bindings/opf:mediaType', EPUB::NAMESPACES).each do |elem|
188
196
  media_type = EPUB::Publication::Package::Bindings::MediaType.new
189
- media_type.media_type = elem['media-type']
197
+ media_type.media_type = extract_attribute(elem, 'media-type')
190
198
  items = @package.manifest.items
191
- media_type.handler = items.detect {|item| item.id == elem['handler']}
199
+ media_type.handler = items.detect {|item| item.id == extract_attribute(elem, 'handler')}
192
200
  bindings << media_type
193
201
  end
194
202
 
@@ -218,7 +226,7 @@ module EPUB
218
226
  md = EPUB::Publication::Package::Metadata::DCMES.new
219
227
  md.content = e.content
220
228
  %w[ id lang dir ].each do |attr|
221
- md.__send__("#{attr}=", e[attr])
229
+ md.__send__ "#{attr}=", extract_attribute(e, attr)
222
230
  end
223
231
  yield(md, e) if block_given?
224
232
  md
@@ -0,0 +1,20 @@
1
module EPUB
  class Parser
    # Helpers shared by the parser classes that include this module.
    module Utils
      # Extract the value of an attribute of an element.
      #
      # The namespace handed to Nokogiri is EPUB::NAMESPACES[prefix]. When
      # prefix is omitted that is EPUB::NAMESPACES[nil] (presumably nil, i.e.
      # "no namespace" -- confirm against epub/constants).
      #
      # @todo Refinement Nokogiri::XML::Node instead of use this method after Ruby 2.0 becomes popular
      #
      # @param [Nokogiri::XML::Element] element
      # @param [String] name name of attribute excluding namespace prefix
      # @param [String, nil] prefix XML namespace prefix used as a key into EPUB::NAMESPACES
      # @return [String, nil] value of the attribute, or nil when the attribute doesn't exist
      def extract_attribute(element, name, prefix=nil)
        attr = element.attribute_with_ns(name, EPUB::NAMESPACES[prefix])
        # `&&` short-circuits on a missing attribute and keeps the result nil.
        attr && attr.value
      end
      # Exposes the helper both as Utils.extract_attribute and as a private
      # instance method of every class that includes Utils.
      module_function :extract_attribute
    end
  end
end
@@ -1,5 +1,5 @@
1
1
  module EPUB
2
2
  class Parser
3
- VERSION = "0.1.2"
3
+ VERSION = "0.1.3"
4
4
  end
5
5
  end
@@ -1,5 +1,6 @@
1
1
  require 'enumerabler'
2
2
  require 'epub/constants'
3
+ require 'epub/parser/content_document'
3
4
 
4
5
  module EPUB
5
6
  module Publication
@@ -14,9 +15,8 @@ module EPUB
14
15
  @items[item.id] = item
15
16
  end
16
17
 
17
- # syntax sugar
18
18
  def navs
19
- items.selector {|i| i.properties.include? 'nav'}
19
+ items.selector(&:nav?)
20
20
  end
21
21
 
22
22
  def nav
@@ -62,10 +62,14 @@ module EPUB
62
62
  rootfile = Addressable::URI.parse(manifest.package.book.ocf.container.rootfile.full_path)
63
63
  Zip::Archive.open(manifest.package.book.epub_file) {|zip|
64
64
  path = Addressable::URI.unescape(rootfile + href.normalize.request_uri)
65
- zip.fopen(path.to_s).read
65
+ zip.fopen(path).read
66
66
  }
67
67
  end
68
68
 
69
# Tells whether this manifest item's properties include 'nav'.
# Manifest#navs selects items with this predicate, and the content-document
# parser uses it to choose the Navigation document class.
+ def nav?
70
+ properties.include? 'nav'
71
+ end
72
+
69
73
  # @todo Handle circular fallback chain
70
74
  def use_fallback_chain(options = {})
71
75
  supported = EPUB::MediaType::CORE
@@ -84,6 +88,11 @@ module EPUB
84
88
  raise EPUB::MediaType::UnsupportedError
85
89
  end
86
90
 
91
# Returns the parsed content document for this item, memoized in
# @content_document.
#
# Yields nil without parsing when media_type is neither
# 'application/xhtml+xml' nor 'image/svg+xml'. Because of `||=`, a nil or
# false parse result is not cached and the parser runs again on the next call.
+ def content_document
92
+ return nil unless %w[application/xhtml+xml image/svg+xml].include? media_type
93
+ @content_document ||= Parser::ContentDocument.parse(self)
94
+ end
95
+
87
96
  protected
88
97
 
89
98
  def traverse_fallback_chain(chain)
@@ -31,17 +31,31 @@ module EPUB
31
31
  titles.select {|title| title.title_type.to_s == type}.sort.join(' ')
32
32
  end
33
33
  end
34
+
34
35
  def subtitle
35
36
  titles.select {|title| title.title_type.to_s == 'subtitle'}.sort.join(' ')
36
37
  end
37
38
 
38
- def to_hash
39
# Joins every entry of `descriptions` into one string, separated by a
# single space.
+ def description
40
+ descriptions.join ' '
41
+ end
42
+
43
# Returns the first entry of `dates`.
# NOTE(review): presumably nil when no date is present -- depends on the
# collection type behind `dates`, which is not visible here.
+ def date
44
+ dates.first
45
+ end
46
+
47
+ def to_h
39
48
  DC_ELEMS.inject({}) do |hsh, elem|
40
49
  hsh[elem] = __send__(elem)
41
50
  hsh
42
51
  end
43
52
  end
44
53
 
54
# Obsolete alias kept for callers of the old name: prints an
# "is obsolete" warning through Kernel#warn, then returns the result of to_h.
+ def to_hash
55
+ warn "#{self.class}##{__method__} is obsolete"
56
+ to_h
57
+ end
58
+
45
59
  def primary_metas
46
60
  metas.select {|meta| meta.primary_expression?}
47
61
  end
@@ -69,6 +83,13 @@ module EPUB
69
83
 
70
84
  attr_accessor :content, :id, :lang, :dir
71
85
 
86
+ def inspect
87
+ ivs = instance_variables.map {|iv|
88
+ [iv, instance_variable_get(iv).inspect].join('=')
89
+ }.join(' ')
90
+ '<#%s:%#0x %s>' % [self.class, __id__, ivs]
91
+ end
92
+
72
93
  def to_s
73
94
  content
74
95
  end
@@ -104,6 +125,13 @@ module EPUB
104
125
  ! subexpression?
105
126
  end
106
127
 
128
+ def inspect
129
+ ivs = instance_variables.map {|iv|
130
+ [iv, instance_variable_get(iv).inspect].join('=')
131
+ }.join(' ')
132
+ '<#%s:%#0x %s>' % [self.class, __id__, ivs]
133
+ end
134
+
107
135
  def to_s
108
136
  content
109
137
  end
@@ -4,3 +4,4 @@ SimpleCov.start do
4
4
  end
5
5
 
6
6
  require 'test/unit/full'
7
+ require 'epub/parser'
@@ -1,5 +1,4 @@
1
1
  require File.expand_path 'helper', File.dirname(__FILE__)
2
- require 'epub/parser'
3
2
 
4
3
  class MyBook
5
4
  include EPUB
@@ -1,18 +1,31 @@
1
1
  require_relative 'helper'
2
- require 'epub/parser/content_document'
3
2
 
4
3
  class TestParserContentDocument < Test::Unit::TestCase
5
4
  def setup
5
+ @manifest = EPUB::Publication::Package::Manifest.new
6
+ %w[item-1.xhtml item-2.xhtml nav.xhtml].each.with_index do |href, index|
7
+ item = EPUB::Publication::Package::Manifest::Item.new
8
+ item.id = index
9
+ item.href = Addressable::URI.parse(href)
10
+ @manifest << item
11
+ end
12
+
6
13
  @dir = 'test/fixtures/book'
7
- @parser = EPUB::Parser::ContentDocument.new
14
+ @parser = EPUB::Parser::ContentDocument.new(@manifest.items.last)
8
15
  end
9
16
 
10
17
  def test_parse_navigations
11
18
  doc = Nokogiri.XML open("#{@dir}/OPS/nav.xhtml")
12
19
  navs = @parser.parse_navigations doc
20
+ nav = navs.first
13
21
 
14
22
  assert_equal 1, navs.length
15
- assert_equal 'Table of Contents', navs.first.heading
16
- assert_equal 'toc', navs.first.type
23
+ assert_equal 'Table of Contents', nav.heading
24
+ assert_equal 'toc', nav.type
25
+
26
+ assert_equal 2, nav.items.length
27
+ assert_equal @manifest.items.first, nav.items.first.item
28
+ assert_equal @manifest.items[1], nav.items[1].items[0].item
29
+ assert_equal @manifest.items[1], nav.items[1].items[1].item
17
30
  end
18
31
  end