epub-parser 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,18 +1,48 @@
1
1
  require 'epub/content_document'
2
2
  require 'epub/constants'
3
+ require 'epub/parser/utils'
3
4
  require 'nokogiri'
4
5
 
5
6
  module EPUB
6
7
  class Parser
7
8
  class ContentDocument
9
+ include Utils
10
+
8
11
  class << self
9
- def parse
10
- new.parse
12
+ # @param [EPUB::Publication::Package::Manifest::Item] item
13
+ def parse(item)
14
+ new(item).parse
11
15
  end
12
16
  end
13
17
 
18
+ # @param [EPUB::Publication::Package::Manifest::Item] item
19
+ def initialize(item)
20
+ @item = item
21
+ end
22
+
14
23
  def parse
15
- raise 'Not implemented yet'
24
+ content_document = case @item.media_type
25
+ when 'application/xhtml+xml'
26
+ if @item.nav?
27
+ EPUB::ContentDocument::Navigation.new
28
+ else
29
+ EPUB::ContentDocument::XHTML.new
30
+ end
31
+ when 'image/svg+xml'
32
+ EPUB::ContentDocument::SVG.new
33
+ else
34
+ nil
35
+ end
36
+ return content_document if content_document.nil?
37
+ content_document.item = @item
38
+ document = Nokogiri.XML(@item.read)
39
+ # parse_content_document(document)
40
+ if @item.nav?
41
+ content_document.navigations = parse_navigations(document)
42
+ else
43
+ raise NotImplementedError
44
+ end
45
+ content_document
16
46
  end
17
47
 
18
48
  # @param [Nokogiri::HTML::Document] document HTML document or element including nav
@@ -24,13 +54,51 @@ module EPUB
24
54
  # @param [Nokogiri::XML::Element] element nav element
25
55
  # @return [EPUB::ContentDocument::Navigation::Nav] nav Nav object
26
56
  def parse_navigation(element)
27
- nav = EPUB::ContentDocument::Navigation::Nav.new
28
- nav.heading = find_heading element
29
- nav.type = element['type']
57
+ nav = EPUB::ContentDocument::Navigation::Navigation.new
58
+ nav.text = find_heading(element)
59
+ nav.type = extract_attribute(element, 'type', 'epub')
60
+ nav.items = element.xpath('./xhtml:ol/xhtml:li', EPUB::NAMESPACES).map {|elem| parse_navigation_item(elem)}
30
61
 
31
62
  nav
32
63
  end
33
64
 
65
+ # @param [Nokogiri::XML::Element] element li element
66
+ def parse_navigation_item(element)
67
+ item = EPUB::ContentDocument::Navigation::Item.new
68
+ a_or_span = element.xpath('./xhtml:a[1]|xhtml:span[1]', EPUB::NAMESPACES).first
69
+ return a_or_span if a_or_span.nil?
70
+
71
+ item.text = a_or_span.text
72
+ if a_or_span.name == 'a'
73
+ if item.text.empty?
74
+ embedded_content = a_or_span.xpath('./xhtml:audio[1]|xhtml:canvas[1]|xhtml:embed[1]|xhtml:iframe[1]|xhtml:img[1]|xhtml:math[1]|xhtml:object[1]|xhtml:svg[1]|xhtml:video[1]', EPUB::NAMESPACES).first
75
+ unless embedded_content.nil?
76
+ case embedded_content.name
77
+ when 'audio'
78
+ when 'canvas'
79
+ when 'embed'
80
+ when 'iframe'
81
+ item.text = (extract_attribute(embedded_content, 'name') || extract_attribute(embedded_content, 'srcdoc')).to_s
82
+ when 'img'
83
+ item.text = extract_attribute(embedded_content, 'alt').to_s
84
+ when 'math'
85
+ when 'object'
86
+ item.text = extract_attribute(embedded_content, 'name').to_s
87
+ when 'svg'
88
+ when 'video'
89
+ else
90
+ end
91
+ end
92
+ item.text = extract_attribute(a_or_span, 'title').to_s if item.text.nil? || item.text.empty?
93
+ end
94
+ item.href = Addressable::URI.parse(extract_attribute(a_or_span, 'href'))
95
+ item.item = @item.manifest.items.selector {|it| it.href.request_uri == item.href.request_uri}.first
96
+ end
97
+ item.items = element.xpath('./xhtml:ol[1]/xhtml:li', EPUB::NAMESPACES).map {|li| parse_navigation_item(li)}
98
+
99
+ item
100
+ end
101
+
34
102
  private
35
103
 
36
104
  # @param [Nokogiri::XML::Element] element nav element
@@ -6,6 +6,8 @@ require 'nokogiri'
6
6
  module EPUB
7
7
  class Parser
8
8
  class OCF
9
+ include Utils
10
+
9
11
  DIRECTORY = 'META-INF'
10
12
  EPUB::OCF::MODULES.each {|m| self.const_set "#{m.upcase}_FILE", "#{m}.xml"}
11
13
 
@@ -38,7 +40,8 @@ module EPUB
38
40
  doc.xpath('/ocf:container/ocf:rootfiles/ocf:rootfile', EPUB::NAMESPACES).each do |elem|
39
41
  rootfile = EPUB::OCF::Container::Rootfile.new
40
42
  %w[full-path media-type].each do |attr|
41
- rootfile.__send__(attr.gsub(/-/, '_') + '=', elem[attr])
43
+ value = extract_attribute(elem, attr)
44
+ rootfile.__send__(attr.gsub(/-/, '_') + '=', value)
42
45
  end
43
46
  container.rootfiles << rootfile
44
47
  end
@@ -8,9 +8,11 @@ require 'epub/constants'
8
8
  module EPUB
9
9
  class Parser
10
10
  class Publication
11
+ include Utils
12
+
11
13
  class << self
12
14
  def parse(zip_archive, file)
13
- opf = zip_archive.fopen(file).read
15
+ opf = zip_archive.fopen(Addressable::URI.unencode(file)).read
14
16
  new(opf, file).parse
15
17
  end
16
18
  end
@@ -35,11 +37,10 @@ module EPUB
35
37
  def parse_package
36
38
  elem = @doc.root
37
39
  %w[version xml:lang dir id].each do |attr|
38
- writer = attr.gsub(/\:/, '_') + '='
39
- @package.__send__(writer, elem[attr])
40
+ @package.__send__ "#{attr.gsub(/\:/, '_')}=", extract_attribute(elem, attr)
40
41
  end
41
42
  @unique_identifier_id = elem['unique-identifier']
42
- @package.prefix = parse_prefix(elem['prefix'])
43
+ @package.prefix = parse_prefix(extract_attribute(elem, 'prefix'))
43
44
 
44
45
  @package
45
46
  end
@@ -52,7 +53,7 @@ module EPUB
52
53
  metadata.identifiers = elem.xpath('./dc:identifier', EPUB::NAMESPACES).collect do |e|
53
54
  identifier = EPUB::Publication::Package::Metadata::DCMES.new
54
55
  identifier.content = e.content
55
- identifier.id = id = e['id']
56
+ identifier.id = id = extract_attribute(e, 'id')
56
57
  metadata.unique_identifier = identifier if id == @unique_identifier_id
57
58
 
58
59
  identifier
@@ -62,7 +63,7 @@ module EPUB
62
63
  metadata.titles = elem.xpath('./dc:title', EPUB::NAMESPACES).collect do |e|
63
64
  title = EPUB::Publication::Package::Metadata::Title.new
64
65
  %w[ id lang dir ].each do |attr|
65
- title.__send__("#{attr}=", e[attr])
66
+ title.__send__("#{attr}=", extract_attribute(e, attr))
66
67
  end
67
68
  title.content = e.content
68
69
 
@@ -85,9 +86,12 @@ module EPUB
85
86
 
86
87
  metadata.metas = elem.xpath('./opf:meta', EPUB::NAMESPACES).collect do |e|
87
88
  meta = EPUB::Publication::Package::Metadata::Meta.new
88
- %w[ property id scheme ].each { |attr| meta.__send__("#{attr}=", e[attr]) }
89
+ %w[property id scheme].each do |attr|
90
+ meta.__send__ "#{attr}=", extract_attribute(e, attr)
91
+ end
89
92
  meta.content = e.content
90
- if (refines = e['refines']) && refines[0] == '#'
93
+ refines = extract_attribute(e, 'refines')
94
+ if refines && refines[0] == '#'
91
95
  id = refines[1..-1]
92
96
  id_map[id] ||= {}
93
97
  id_map[id][:refiners] ||= []
@@ -101,11 +105,12 @@ module EPUB
101
105
  metadata.links = elem.xpath('./opf:link', EPUB::NAMESPACES).collect do |e|
102
106
  link = EPUB::Publication::Package::Metadata::Link.new
103
107
  %w[ id media-type ].each do |attr|
104
- link.__send__(attr.gsub(/-/, '_') + '=', e[attr])
108
+ link.__send__ (attr.gsub(/-/, '_') + '='), extract_attribute(e, attr)
105
109
  end
106
- link.href = Addressable::URI.parse(e['href'])
107
- link.rel = e['rel'].strip.split
108
- if (refines = e['refines']) && refines[0] == '#'
110
+ link.href = Addressable::URI.parse(extract_attribute(e, 'href'))
111
+ link.rel = extract_attribute(e, 'rel').strip.split
112
+ refines = extract_attribute(e, 'refines')
113
+ if refines && refines[0] == '#'
109
114
  id = refines[1..-1]
110
115
  id_map[id] ||= {}
111
116
  id_map[id][:refiners] ||= []
@@ -128,17 +133,19 @@ module EPUB
128
133
  def parse_manifest
129
134
  manifest = @package.manifest = EPUB::Publication::Package::Manifest.new
130
135
  elem = @doc.xpath('/opf:package/opf:manifest', EPUB::NAMESPACES).first
131
- manifest.id = elem['id']
136
+ manifest.id = extract_attribute(elem, 'id')
132
137
 
133
138
  fallback_map = {}
134
139
  elem.xpath('./opf:item', EPUB::NAMESPACES).each do |e|
135
140
  item = EPUB::Publication::Package::Manifest::Item.new
136
141
  %w[ id media-type media-overlay ].each do |attr|
137
- item.__send__("#{attr.gsub(/-/, '_')}=", e[attr])
142
+ item.__send__ "#{attr.gsub(/-/, '_')}=", extract_attribute(e, attr)
138
143
  end
139
- item.href = Addressable::URI.parse(e['href'])
140
- fallback_map[e['fallback']] = item if e['fallback']
141
- item.properties = e['properties'] ? e['properties'].split(' ') : []
144
+ item.href = Addressable::URI.parse(extract_attribute(e, 'href'))
145
+ fallback = extract_attribute(e, 'fallback')
146
+ fallback_map[fallback] = item if fallback
147
+ properties = extract_attribute(e, 'properties')
148
+ item.properties = properties ? properties.split(' ') : []
142
149
  manifest << item
143
150
  end
144
151
  fallback_map.each_pair do |id, from|
@@ -152,16 +159,17 @@ module EPUB
152
159
  spine = @package.spine = EPUB::Publication::Package::Spine.new
153
160
  elem = @doc.xpath('/opf:package/opf:spine', EPUB::NAMESPACES).first
154
161
  %w[ id toc page-progression-direction ].each do |attr|
155
- spine.__send__("#{attr.gsub(/-/, '_')}=", elem[attr])
162
+ spine.__send__ "#{attr.gsub(/-/, '_')}=", extract_attribute(elem, attr)
156
163
  end
157
164
 
158
165
  elem.xpath('./opf:itemref', EPUB::NAMESPACES).each do |e|
159
166
  itemref = EPUB::Publication::Package::Spine::Itemref.new
160
167
  %w[ idref id ].each do |attr|
161
- itemref.__send__("#{attr}=", e[attr])
168
+ itemref.__send__ "#{attr}=", extract_attribute(e, attr)
162
169
  end
163
- itemref.linear = (e['linear'] != 'no')
164
- itemref.properties = e['properties'] ? e['properties'].split(' ') : []
170
+ itemref.linear = (extract_attribute(e, 'linear') != 'no')
171
+ properties = extract_attribute(e, 'properties')
172
+ itemref.properties = properties ? properties.split(' ') : []
165
173
  spine << itemref
166
174
  end
167
175
 
@@ -173,9 +181,9 @@ module EPUB
173
181
  @doc.xpath('/opf:package/opf:guide/opf:reference', EPUB::NAMESPACES).each do |ref|
174
182
  reference = EPUB::Publication::Package::Guide::Reference.new
175
183
  %w[ type title ].each do |attr|
176
- reference.__send__("#{attr}=", ref[attr])
184
+ reference.__send__ "#{attr}=", extract_attribute(ref, attr)
177
185
  end
178
- reference.href = Addressable::URI.parse(ref['href'])
186
+ reference.href = Addressable::URI.parse(extract_attribute(ref, 'href'))
179
187
  guide << reference
180
188
  end
181
189
 
@@ -186,9 +194,9 @@ module EPUB
186
194
  bindings = @package.bindings = EPUB::Publication::Package::Bindings.new
187
195
  @doc.xpath('/opf:package/opf:bindings/opf:mediaType', EPUB::NAMESPACES).each do |elem|
188
196
  media_type = EPUB::Publication::Package::Bindings::MediaType.new
189
- media_type.media_type = elem['media-type']
197
+ media_type.media_type = extract_attribute(elem, 'media-type')
190
198
  items = @package.manifest.items
191
- media_type.handler = items.detect {|item| item.id == elem['handler']}
199
+ media_type.handler = items.detect {|item| item.id == extract_attribute(elem, 'handler')}
192
200
  bindings << media_type
193
201
  end
194
202
 
@@ -218,7 +226,7 @@ module EPUB
218
226
  md = EPUB::Publication::Package::Metadata::DCMES.new
219
227
  md.content = e.content
220
228
  %w[ id lang dir ].each do |attr|
221
- md.__send__("#{attr}=", e[attr])
229
+ md.__send__ "#{attr}=", extract_attribute(e, attr)
222
230
  end
223
231
  yield(md, e) if block_given?
224
232
  md
@@ -0,0 +1,20 @@
1
+ module EPUB
2
+ class Parser
3
+ module Utils
4
+ # Extract the value of attribute of element
5
+ #
6
+ # @todo Refinement Nokogiri::XML::Node instead of use this method after Ruby 2.0 becomes popular
7
+ #
8
+ # @param [Nokogiri::XML::Element] element
9
+ # @param [String] name name of attribute excluding namespace prefix
10
+ # @param [String, nil] prefix XML namespace prefix in {EPUB::Constants::NAMESPACES} keys
11
+ # @return [String] value of attribute when the attribute exists
12
+ # @return nil when the attribute doesn't exist
13
+ def extract_attribute(element, name, prefix=nil)
14
+ attr = element.attribute_with_ns(name, EPUB::NAMESPACES[prefix])
15
+ attr.nil? ? nil : attr.value
16
+ end
17
+ module_function :extract_attribute
18
+ end
19
+ end
20
+ end
@@ -1,5 +1,5 @@
1
1
  module EPUB
2
2
  class Parser
3
- VERSION = "0.1.2"
3
+ VERSION = "0.1.3"
4
4
  end
5
5
  end
@@ -1,5 +1,6 @@
1
1
  require 'enumerabler'
2
2
  require 'epub/constants'
3
+ require 'epub/parser/content_document'
3
4
 
4
5
  module EPUB
5
6
  module Publication
@@ -14,9 +15,8 @@ module EPUB
14
15
  @items[item.id] = item
15
16
  end
16
17
 
17
- # syntax sugar
18
18
  def navs
19
- items.selector {|i| i.properties.include? 'nav'}
19
+ items.selector(&:nav?)
20
20
  end
21
21
 
22
22
  def nav
@@ -62,10 +62,14 @@ module EPUB
62
62
  rootfile = Addressable::URI.parse(manifest.package.book.ocf.container.rootfile.full_path)
63
63
  Zip::Archive.open(manifest.package.book.epub_file) {|zip|
64
64
  path = Addressable::URI.unescape(rootfile + href.normalize.request_uri)
65
- zip.fopen(path.to_s).read
65
+ zip.fopen(path).read
66
66
  }
67
67
  end
68
68
 
69
+ def nav?
70
+ properties.include? 'nav'
71
+ end
72
+
69
73
  # @todo Handle circular fallback chain
70
74
  def use_fallback_chain(options = {})
71
75
  supported = EPUB::MediaType::CORE
@@ -84,6 +88,11 @@ module EPUB
84
88
  raise EPUB::MediaType::UnsupportedError
85
89
  end
86
90
 
91
+ def content_document
92
+ return nil unless %w[application/xhtml+xml image/svg+xml].include? media_type
93
+ @content_document ||= Parser::ContentDocument.parse(self)
94
+ end
95
+
87
96
  protected
88
97
 
89
98
  def traverse_fallback_chain(chain)
@@ -31,17 +31,31 @@ module EPUB
31
31
  titles.select {|title| title.title_type.to_s == type}.sort.join(' ')
32
32
  end
33
33
  end
34
+
34
35
  def subtitle
35
36
  titles.select {|title| title.title_type.to_s == 'subtitle'}.sort.join(' ')
36
37
  end
37
38
 
38
- def to_hash
39
+ def description
40
+ descriptions.join ' '
41
+ end
42
+
43
+ def date
44
+ dates.first
45
+ end
46
+
47
+ def to_h
39
48
  DC_ELEMS.inject({}) do |hsh, elem|
40
49
  hsh[elem] = __send__(elem)
41
50
  hsh
42
51
  end
43
52
  end
44
53
 
54
+ def to_hash
55
+ warn "#{self.class}##{__method__} is obsolete"
56
+ to_h
57
+ end
58
+
45
59
  def primary_metas
46
60
  metas.select {|meta| meta.primary_expression?}
47
61
  end
@@ -69,6 +83,13 @@ module EPUB
69
83
 
70
84
  attr_accessor :content, :id, :lang, :dir
71
85
 
86
+ def inspect
87
+ ivs = instance_variables.map {|iv|
88
+ [iv, instance_variable_get(iv).inspect].join('=')
89
+ }.join(' ')
90
+ '<#%s:%#0x %s>' % [self.class, __id__, ivs]
91
+ end
92
+
72
93
  def to_s
73
94
  content
74
95
  end
@@ -104,6 +125,13 @@ module EPUB
104
125
  ! subexpression?
105
126
  end
106
127
 
128
+ def inspect
129
+ ivs = instance_variables.map {|iv|
130
+ [iv, instance_variable_get(iv).inspect].join('=')
131
+ }.join(' ')
132
+ '<#%s:%#0x %s>' % [self.class, __id__, ivs]
133
+ end
134
+
107
135
  def to_s
108
136
  content
109
137
  end
@@ -4,3 +4,4 @@ SimpleCov.start do
4
4
  end
5
5
 
6
6
  require 'test/unit/full'
7
+ require 'epub/parser'
@@ -1,5 +1,4 @@
1
1
  require File.expand_path 'helper', File.dirname(__FILE__)
2
- require 'epub/parser'
3
2
 
4
3
  class MyBook
5
4
  include EPUB
@@ -1,18 +1,31 @@
1
1
  require_relative 'helper'
2
- require 'epub/parser/content_document'
3
2
 
4
3
  class TestParserContentDocument < Test::Unit::TestCase
5
4
  def setup
5
+ @manifest = EPUB::Publication::Package::Manifest.new
6
+ %w[item-1.xhtml item-2.xhtml nav.xhtml].each.with_index do |href, index|
7
+ item = EPUB::Publication::Package::Manifest::Item.new
8
+ item.id = index
9
+ item.href = Addressable::URI.parse(href)
10
+ @manifest << item
11
+ end
12
+
6
13
  @dir = 'test/fixtures/book'
7
- @parser = EPUB::Parser::ContentDocument.new
14
+ @parser = EPUB::Parser::ContentDocument.new(@manifest.items.last)
8
15
  end
9
16
 
10
17
  def test_parse_navigations
11
18
  doc = Nokogiri.XML open("#{@dir}/OPS/nav.xhtml")
12
19
  navs = @parser.parse_navigations doc
20
+ nav = navs.first
13
21
 
14
22
  assert_equal 1, navs.length
15
- assert_equal 'Table of Contents', navs.first.heading
16
- assert_equal 'toc', navs.first.type
23
+ assert_equal 'Table of Contents', nav.heading
24
+ assert_equal 'toc', nav.type
25
+
26
+ assert_equal 2, nav.items.length
27
+ assert_equal @manifest.items.first, nav.items.first.item
28
+ assert_equal @manifest.items[1], nav.items[1].items[0].item
29
+ assert_equal @manifest.items[1], nav.items[1].items[1].item
17
30
  end
18
31
  end