epub-parser 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.travis.yml +3 -0
- data/.yardopts +3 -0
- data/Gemfile +1 -1
- data/MIT-LICENSE +1 -1
- data/README.markdown +64 -11
- data/Rakefile +16 -6
- data/bin/epub-open +25 -0
- data/bin/epubinfo +1 -1
- data/epub-parser.gemspec +1 -2
- data/lib/epub.rb +1 -2
- data/lib/epub/constants.rb +3 -1
- data/lib/epub/content_document/navigation.rb +31 -42
- data/lib/epub/content_document/xhtml.rb +12 -0
- data/lib/epub/parser.rb +36 -6
- data/lib/epub/parser/content_document.rb +74 -6
- data/lib/epub/parser/ocf.rb +4 -1
- data/lib/epub/parser/publication.rb +34 -26
- data/lib/epub/parser/utils.rb +20 -0
- data/lib/epub/parser/version.rb +1 -1
- data/lib/epub/publication/package/manifest.rb +12 -3
- data/lib/epub/publication/package/metadata.rb +29 -1
- data/test/helper.rb +1 -0
- data/test/test_parser.rb +0 -1
- data/test/test_parser_content_document.rb +17 -4
- data/test/test_parser_ocf.rb +0 -1
- data/test/test_parser_publication.rb +0 -1
- data/test/test_publication.rb +20 -0
- metadata +140 -101
@@ -1,18 +1,48 @@
|
|
1
1
|
require 'epub/content_document'
|
2
2
|
require 'epub/constants'
|
3
|
+
require 'epub/parser/utils'
|
3
4
|
require 'nokogiri'
|
4
5
|
|
5
6
|
module EPUB
|
6
7
|
class Parser
|
7
8
|
class ContentDocument
|
9
|
+
include Utils
|
10
|
+
|
8
11
|
# Convenience entry point: build a parser for +item+ and run it.
#
# @param [EPUB::Publication::Package::Manifest::Item] item
# @return the result of the instance-level #parse for +item+
def self.parse(item)
  parser = new(item)
  parser.parse
end
|
13
17
|
|
18
|
+
# @param [EPUB::Publication::Package::Manifest::Item] item
|
19
|
+
# Remember the manifest item this parser works on; #parse reads its
# media type and content later.
def initialize(item)
  @item = item
end
|
22
|
+
|
14
23
|
# Build the content document object for @item.
#
# The class is chosen from the item's media type (and, for XHTML, from
# whether the item is a navigation document). Items of any other media
# type yield nil.
#
# @return the populated content document, or nil for unsupported media types
# @raise [NotImplementedError] for XHTML/SVG items that are not navigation documents
def parse
  klass =
    case @item.media_type
    when 'application/xhtml+xml'
      @item.nav? ? EPUB::ContentDocument::Navigation : EPUB::ContentDocument::XHTML
    when 'image/svg+xml'
      EPUB::ContentDocument::SVG
    end
  return nil if klass.nil?

  content_document = klass.new
  content_document.item = @item
  document = Nokogiri.XML(@item.read)
  # Only navigation documents are parsed so far.
  raise NotImplementedError unless @item.nav?

  content_document.navigations = parse_navigations(document)
  content_document
end
|
17
47
|
|
18
48
|
# @param [Nokogiri::HTML::Document] document HTML document or element including nav
|
@@ -24,13 +54,51 @@ module EPUB
|
|
24
54
|
# @param [Nokogiri::XML::Element] element nav element
|
25
55
|
# @return [EPUB::ContentDocument::Navigation::Nav] nav Nav object
|
26
56
|
# Build a Navigation object from a nav element: its heading text, its
# epub:type attribute and the entries of its top-level ordered list.
def parse_navigation(element)
  nav = EPUB::ContentDocument::Navigation::Navigation.new
  nav.text = find_heading(element)
  nav.type = extract_attribute(element, 'type', 'epub')

  list_items = element.xpath('./xhtml:ol/xhtml:li', EPUB::NAMESPACES)
  # parse_navigation_item returns nil for an li that has neither an a nor
  # a span child; drop those so nav.items only ever holds real items.
  nav.items = list_items.map {|li| parse_navigation_item(li)}.compact

  nav
end
|
33
64
|
|
65
|
+
# @param [Nokogiri::XML::Element] element li element
|
66
|
+
# Build a navigation Item from one li element of a nav list.
#
# The label comes from the li's first a or span child. For a link with
# no text, the label falls back to an attribute of its first embedded
# element (iframe: name or srcdoc, img: alt, object: name) and then to
# the link's own title attribute. Nested ol/li children are parsed
# recursively into item.items.
#
# @return the parsed item, or nil when the li has neither an a nor a span
def parse_navigation_item(element)
  a_or_span = element.xpath('./xhtml:a[1]|xhtml:span[1]', EPUB::NAMESPACES).first
  return nil if a_or_span.nil?

  item = EPUB::ContentDocument::Navigation::Item.new
  item.text = a_or_span.text
  if a_or_span.name == 'a'
    if item.text.empty?
      embedded_content = a_or_span.xpath('./xhtml:audio[1]|xhtml:canvas[1]|xhtml:embed[1]|xhtml:iframe[1]|xhtml:img[1]|xhtml:math[1]|xhtml:object[1]|xhtml:svg[1]|xhtml:video[1]', EPUB::NAMESPACES).first
      unless embedded_content.nil?
        # Only these three element kinds contribute a label; any other
        # embedded element leaves the text empty for the title fallback.
        case embedded_content.name
        when 'iframe'
          item.text = (extract_attribute(embedded_content, 'name') || extract_attribute(embedded_content, 'srcdoc')).to_s
        when 'img'
          item.text = extract_attribute(embedded_content, 'alt').to_s
        when 'object'
          item.text = extract_attribute(embedded_content, 'name').to_s
        end
      end
      item.text = extract_attribute(a_or_span, 'title').to_s if item.text.nil? || item.text.empty?
    end

    item.href = Addressable::URI.parse(extract_attribute(a_or_span, 'href'))
    # An a element without href parses to a nil URI; skip the manifest
    # lookup then instead of calling request_uri on nil.
    unless item.href.nil?
      request_uri = item.href.request_uri
      item.item = @item.manifest.items.detect {|it| it.href.request_uri == request_uri}
    end
  end

  # Recursive calls return nil for li elements without a/span; drop them.
  item.items = element.xpath('./xhtml:ol[1]/xhtml:li', EPUB::NAMESPACES).map {|li| parse_navigation_item(li)}.compact

  item
end
|
101
|
+
|
34
102
|
private
|
35
103
|
|
36
104
|
# @param [Nokogiri::XML::Element] element nav element
|
data/lib/epub/parser/ocf.rb
CHANGED
@@ -6,6 +6,8 @@ require 'nokogiri'
|
|
6
6
|
module EPUB
|
7
7
|
class Parser
|
8
8
|
class OCF
|
9
|
+
include Utils
|
10
|
+
|
9
11
|
DIRECTORY = 'META-INF'
|
10
12
|
EPUB::OCF::MODULES.each {|m| self.const_set "#{m.upcase}_FILE", "#{m}.xml"}
|
11
13
|
|
@@ -38,7 +40,8 @@ module EPUB
|
|
38
40
|
doc.xpath('/ocf:container/ocf:rootfiles/ocf:rootfile', EPUB::NAMESPACES).each do |elem|
|
39
41
|
rootfile = EPUB::OCF::Container::Rootfile.new
|
40
42
|
%w[full-path media-type].each do |attr|
|
41
|
-
|
43
|
+
value = extract_attribute(elem, attr)
|
44
|
+
rootfile.__send__(attr.gsub(/-/, '_') + '=', value)
|
42
45
|
end
|
43
46
|
container.rootfiles << rootfile
|
44
47
|
end
|
@@ -8,9 +8,11 @@ require 'epub/constants'
|
|
8
8
|
module EPUB
|
9
9
|
class Parser
|
10
10
|
class Publication
|
11
|
+
include Utils
|
12
|
+
|
11
13
|
# Read the package document +file+ out of the archive and parse it.
#
# +file+ is percent-decoded before it is used as the archive member
# name; the undecoded value is what gets passed on to the parser instance.
def self.parse(zip_archive, file)
  member_name = Addressable::URI.unencode(file)
  opf = zip_archive.fopen(member_name).read
  new(opf, file).parse
end
|
@@ -35,11 +37,10 @@ module EPUB
|
|
35
37
|
# Copy the attributes of the document's root element onto @package and
# remember the unique-identifier id for later metadata parsing.
#
# @return the populated @package
def parse_package
  elem = @doc.root
  %w[version xml:lang dir id].each do |attr|
    value = extract_attribute(elem, attr)
    if value.nil? && attr.start_with?('xml:')
      # extract_attribute looks the name up with no namespace, which is not
      # expected to match an attribute bound to the XML namespace (xml:lang),
      # so retry with the local name and that namespace.
      # NOTE(review): relies on Nokogiri's attribute_with_ns matching on
      # local name + namespace URI - confirm against the fixture OPF.
      xml_attr = elem.attribute_with_ns(attr.sub(/\Axml:/, ''), 'http://www.w3.org/XML/1998/namespace')
      value = xml_attr.value unless xml_attr.nil?
    end
    @package.__send__ "#{attr.gsub(/\:/, '_')}=", value
  end
  @unique_identifier_id = elem['unique-identifier']
  @package.prefix = parse_prefix(extract_attribute(elem, 'prefix'))

  @package
end
|
@@ -52,7 +53,7 @@ module EPUB
|
|
52
53
|
metadata.identifiers = elem.xpath('./dc:identifier', EPUB::NAMESPACES).collect do |e|
|
53
54
|
identifier = EPUB::Publication::Package::Metadata::DCMES.new
|
54
55
|
identifier.content = e.content
|
55
|
-
identifier.id = id = e
|
56
|
+
identifier.id = id = extract_attribute(e, 'id')
|
56
57
|
metadata.unique_identifier = identifier if id == @unique_identifier_id
|
57
58
|
|
58
59
|
identifier
|
@@ -62,7 +63,7 @@ module EPUB
|
|
62
63
|
metadata.titles = elem.xpath('./dc:title', EPUB::NAMESPACES).collect do |e|
|
63
64
|
title = EPUB::Publication::Package::Metadata::Title.new
|
64
65
|
%w[ id lang dir ].each do |attr|
|
65
|
-
title.__send__("#{attr}=", e
|
66
|
+
title.__send__("#{attr}=", extract_attribute(e, attr))
|
66
67
|
end
|
67
68
|
title.content = e.content
|
68
69
|
|
@@ -85,9 +86,12 @@ module EPUB
|
|
85
86
|
|
86
87
|
metadata.metas = elem.xpath('./opf:meta', EPUB::NAMESPACES).collect do |e|
|
87
88
|
meta = EPUB::Publication::Package::Metadata::Meta.new
|
88
|
-
%w[
|
89
|
+
%w[property id scheme].each do |attr|
|
90
|
+
meta.__send__ "#{attr}=", extract_attribute(e, attr)
|
91
|
+
end
|
89
92
|
meta.content = e.content
|
90
|
-
|
93
|
+
refines = extract_attribute(e, 'refines')
|
94
|
+
if refines && refines[0] == '#'
|
91
95
|
id = refines[1..-1]
|
92
96
|
id_map[id] ||= {}
|
93
97
|
id_map[id][:refiners] ||= []
|
@@ -101,11 +105,12 @@ module EPUB
|
|
101
105
|
metadata.links = elem.xpath('./opf:link', EPUB::NAMESPACES).collect do |e|
|
102
106
|
link = EPUB::Publication::Package::Metadata::Link.new
|
103
107
|
%w[ id media-type ].each do |attr|
|
104
|
-
link.__send__(attr.gsub(/-/, '_') + '=', e
|
108
|
+
link.__send__ (attr.gsub(/-/, '_') + '='), extract_attribute(e, attr)
|
105
109
|
end
|
106
|
-
link.href = Addressable::URI.parse(e
|
107
|
-
link.rel = e
|
108
|
-
|
110
|
+
link.href = Addressable::URI.parse(extract_attribute(e, 'href'))
|
111
|
+
link.rel = extract_attribute(e, 'rel').strip.split
|
112
|
+
refines = extract_attribute(e, 'refines')
|
113
|
+
if refines && refines[0] == '#'
|
109
114
|
id = refines[1..-1]
|
110
115
|
id_map[id] ||= {}
|
111
116
|
id_map[id][:refiners] ||= []
|
@@ -128,17 +133,19 @@ module EPUB
|
|
128
133
|
def parse_manifest
|
129
134
|
manifest = @package.manifest = EPUB::Publication::Package::Manifest.new
|
130
135
|
elem = @doc.xpath('/opf:package/opf:manifest', EPUB::NAMESPACES).first
|
131
|
-
manifest.id = elem
|
136
|
+
manifest.id = extract_attribute(elem, 'id')
|
132
137
|
|
133
138
|
fallback_map = {}
|
134
139
|
elem.xpath('./opf:item', EPUB::NAMESPACES).each do |e|
|
135
140
|
item = EPUB::Publication::Package::Manifest::Item.new
|
136
141
|
%w[ id media-type media-overlay ].each do |attr|
|
137
|
-
item.__send__
|
142
|
+
item.__send__ "#{attr.gsub(/-/, '_')}=", extract_attribute(e, attr)
|
138
143
|
end
|
139
|
-
item.href = Addressable::URI.parse(e
|
140
|
-
|
141
|
-
|
144
|
+
item.href = Addressable::URI.parse(extract_attribute(e, 'href'))
|
145
|
+
fallback = extract_attribute(e, 'fallback')
|
146
|
+
fallback_map[fallback] = item if fallback
|
147
|
+
properties = extract_attribute(e, 'properties')
|
148
|
+
item.properties = properties ? properties.split(' ') : []
|
142
149
|
manifest << item
|
143
150
|
end
|
144
151
|
fallback_map.each_pair do |id, from|
|
@@ -152,16 +159,17 @@ module EPUB
|
|
152
159
|
spine = @package.spine = EPUB::Publication::Package::Spine.new
|
153
160
|
elem = @doc.xpath('/opf:package/opf:spine', EPUB::NAMESPACES).first
|
154
161
|
%w[ id toc page-progression-direction ].each do |attr|
|
155
|
-
spine.__send__
|
162
|
+
spine.__send__ "#{attr.gsub(/-/, '_')}=", extract_attribute(elem, attr)
|
156
163
|
end
|
157
164
|
|
158
165
|
elem.xpath('./opf:itemref', EPUB::NAMESPACES).each do |e|
|
159
166
|
itemref = EPUB::Publication::Package::Spine::Itemref.new
|
160
167
|
%w[ idref id ].each do |attr|
|
161
|
-
itemref.__send__
|
168
|
+
itemref.__send__ "#{attr}=", extract_attribute(e, attr)
|
162
169
|
end
|
163
|
-
itemref.linear = (e
|
164
|
-
|
170
|
+
itemref.linear = (extract_attribute(e, 'linear') != 'no')
|
171
|
+
properties = extract_attribute(e, 'properties')
|
172
|
+
itemref.properties = properties ? properties.split(' ') : []
|
165
173
|
spine << itemref
|
166
174
|
end
|
167
175
|
|
@@ -173,9 +181,9 @@ module EPUB
|
|
173
181
|
@doc.xpath('/opf:package/opf:guide/opf:reference', EPUB::NAMESPACES).each do |ref|
|
174
182
|
reference = EPUB::Publication::Package::Guide::Reference.new
|
175
183
|
%w[ type title ].each do |attr|
|
176
|
-
reference.__send__
|
184
|
+
reference.__send__ "#{attr}=", extract_attribute(ref, attr)
|
177
185
|
end
|
178
|
-
reference.href = Addressable::URI.parse(ref
|
186
|
+
reference.href = Addressable::URI.parse(extract_attribute(ref, 'href'))
|
179
187
|
guide << reference
|
180
188
|
end
|
181
189
|
|
@@ -186,9 +194,9 @@ module EPUB
|
|
186
194
|
bindings = @package.bindings = EPUB::Publication::Package::Bindings.new
|
187
195
|
@doc.xpath('/opf:package/opf:bindings/opf:mediaType', EPUB::NAMESPACES).each do |elem|
|
188
196
|
media_type = EPUB::Publication::Package::Bindings::MediaType.new
|
189
|
-
media_type.media_type = elem
|
197
|
+
media_type.media_type = extract_attribute(elem, 'media-type')
|
190
198
|
items = @package.manifest.items
|
191
|
-
media_type.handler = items.detect {|item| item.id == elem
|
199
|
+
media_type.handler = items.detect {|item| item.id == extract_attribute(elem, 'handler')}
|
192
200
|
bindings << media_type
|
193
201
|
end
|
194
202
|
|
@@ -218,7 +226,7 @@ module EPUB
|
|
218
226
|
md = EPUB::Publication::Package::Metadata::DCMES.new
|
219
227
|
md.content = e.content
|
220
228
|
%w[ id lang dir ].each do |attr|
|
221
|
-
md.__send__
|
229
|
+
md.__send__ "#{attr}=", extract_attribute(e, attr)
|
222
230
|
end
|
223
231
|
yield(md, e) if block_given?
|
224
232
|
md
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module EPUB
  class Parser
    module Utils
      # Look up an attribute on an element and return its value.
      #
      # @todo Replace this method with a refinement of Nokogiri::XML::Node once Ruby 2.0 is in common use
      #
      # @param [Nokogiri::XML::Element] element
      # @param [String] name name of attribute excluding namespace prefix
      # @param [String, nil] prefix XML namespace prefix in {EPUB::Constants::NAMESPACES} keys
      # @return [String] value of attribute when the attribute exists
      # @return nil when the attribute doesn't exist
      def extract_attribute(element, name, prefix=nil)
        node = element.attribute_with_ns(name, EPUB::NAMESPACES[prefix])
        node && node.value
      end
      module_function :extract_attribute
    end
  end
end
|
data/lib/epub/parser/version.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'enumerabler'
|
2
2
|
require 'epub/constants'
|
3
|
+
require 'epub/parser/content_document'
|
3
4
|
|
4
5
|
module EPUB
|
5
6
|
module Publication
|
@@ -14,9 +15,8 @@ module EPUB
|
|
14
15
|
@items[item.id] = item
|
15
16
|
end
|
16
17
|
|
17
|
-
# syntax sugar
|
18
18
|
# Manifest items for which #nav? is true.
def navs
  items.selector {|item| item.nav?}
end
|
21
21
|
|
22
22
|
def nav
|
@@ -62,10 +62,14 @@ module EPUB
|
|
62
62
|
rootfile = Addressable::URI.parse(manifest.package.book.ocf.container.rootfile.full_path)
|
63
63
|
Zip::Archive.open(manifest.package.book.epub_file) {|zip|
|
64
64
|
path = Addressable::URI.unescape(rootfile + href.normalize.request_uri)
|
65
|
-
zip.fopen(path
|
65
|
+
zip.fopen(path).read
|
66
66
|
}
|
67
67
|
end
|
68
68
|
|
69
|
+
# Whether this item lists 'nav' among its properties.
#
# An item whose properties were never assigned is treated as having
# none, so the predicate answers false instead of raising on nil.
def nav?
  (properties || []).include? 'nav'
end
|
72
|
+
|
69
73
|
# @todo Handle circular fallback chain
|
70
74
|
def use_fallback_chain(options = {})
|
71
75
|
supported = EPUB::MediaType::CORE
|
@@ -84,6 +88,11 @@ module EPUB
|
|
84
88
|
raise EPUB::MediaType::UnsupportedError
|
85
89
|
end
|
86
90
|
|
91
|
+
# Parsed content document for this item, memoized after the first call.
#
# @return nil unless the media type is application/xhtml+xml or image/svg+xml
def content_document
  case media_type
  when 'application/xhtml+xml', 'image/svg+xml'
    @content_document ||= Parser::ContentDocument.parse(self)
  end
end
|
95
|
+
|
87
96
|
protected
|
88
97
|
|
89
98
|
def traverse_fallback_chain(chain)
|
@@ -31,17 +31,31 @@ module EPUB
|
|
31
31
|
titles.select {|title| title.title_type.to_s == type}.sort.join(' ')
|
32
32
|
end
|
33
33
|
end
|
34
|
+
|
34
35
|
# Titles whose title_type is 'subtitle', sorted and joined with single spaces.
def subtitle
  subtitles = titles.select {|candidate| candidate.title_type.to_s == 'subtitle'}
  subtitles.sort.join(' ')
end
|
37
38
|
|
38
|
-
def
|
39
|
+
# All descriptions joined into one string, separated by single spaces.
def description
  descriptions.join(' ')
end
|
42
|
+
|
43
|
+
# The first entry of dates, or nil when there are none.
def date
  dates.first
end
|
46
|
+
|
47
|
+
# Hash keyed by each entry of DC_ELEMS, holding the value of calling
# the method of that name on this object.
def to_h
  DC_ELEMS.each_with_object({}) do |elem, hsh|
    hsh[elem] = __send__(elem)
  end
end
|
44
53
|
|
54
|
+
# Obsolete name for #to_h: emits a warning naming this class and
# method, then returns the #to_h result.
#
# @deprecated Use {#to_h} instead.
def to_hash
  warn "#{self.class}##{__method__} is obsolete"
  to_h
end
|
58
|
+
|
45
59
|
# The metas for which primary_expression? is true.
def primary_metas
  metas.select(&:primary_expression?)
end
|
@@ -69,6 +83,13 @@ module EPUB
|
|
69
83
|
|
70
84
|
attr_accessor :content, :id, :lang, :dir
|
71
85
|
|
86
|
+
# Debug representation: class name, object id in hex, and each
# instance variable rendered as name=inspected-value.
def inspect
  pairs = instance_variables.map {|name| "#{name}=#{instance_variable_get(name).inspect}"}
  format('<#%s:%#0x %s>', self.class, __id__, pairs.join(' '))
end
|
92
|
+
|
72
93
|
# String form of the element: simply its content.
def to_s
  content
end
|
@@ -104,6 +125,13 @@ module EPUB
|
|
104
125
|
! subexpression?
|
105
126
|
end
|
106
127
|
|
128
|
+
# Debug representation: class name, object id in hex, and each
# instance variable rendered as name=inspected-value.
def inspect
  pairs = instance_variables.map {|name| "#{name}=#{instance_variable_get(name).inspect}"}
  format('<#%s:%#0x %s>', self.class, __id__, pairs.join(' '))
end
|
134
|
+
|
107
135
|
# String form of the meta: simply its content.
def to_s
  content
end
|
data/test/helper.rb
CHANGED
data/test/test_parser.rb
CHANGED
@@ -1,18 +1,31 @@
|
|
1
1
|
require_relative 'helper'
|
2
|
-
require 'epub/parser/content_document'
|
3
2
|
|
4
3
|
# Exercises EPUB::Parser::ContentDocument against the nav.xhtml fixture.
class TestParserContentDocument < Test::Unit::TestCase
  # Builds a three-item manifest (two chapters plus the navigation
  # document) and a parser bound to the last item, nav.xhtml.
  def setup
    @manifest = EPUB::Publication::Package::Manifest.new
    %w[item-1.xhtml item-2.xhtml nav.xhtml].each_with_index do |href, index|
      item = EPUB::Publication::Package::Manifest::Item.new
      item.id = index
      item.href = Addressable::URI.parse(href)
      @manifest << item
    end

    @dir = 'test/fixtures/book'
    @parser = EPUB::Parser::ContentDocument.new(@manifest.items.last)
  end

  def test_parse_navigations
    # Block form closes the fixture file once it has been parsed.
    doc = File.open("#{@dir}/OPS/nav.xhtml") {|file| Nokogiri.XML(file)}
    navs = @parser.parse_navigations doc
    nav = navs.first

    assert_equal 1, navs.length
    assert_equal 'Table of Contents', nav.heading
    assert_equal 'toc', nav.type

    # Navigation entries must resolve to the manifest items built in setup.
    assert_equal 2, nav.items.length
    assert_equal @manifest.items.first, nav.items.first.item
    assert_equal @manifest.items[1], nav.items[1].items[0].item
    assert_equal @manifest.items[1], nav.items[1].items[1].item
  end
end
|