epub-parser 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.travis.yml +3 -0
- data/.yardopts +3 -0
- data/Gemfile +1 -1
- data/MIT-LICENSE +1 -1
- data/README.markdown +64 -11
- data/Rakefile +16 -6
- data/bin/epub-open +25 -0
- data/bin/epubinfo +1 -1
- data/epub-parser.gemspec +1 -2
- data/lib/epub.rb +1 -2
- data/lib/epub/constants.rb +3 -1
- data/lib/epub/content_document/navigation.rb +31 -42
- data/lib/epub/content_document/xhtml.rb +12 -0
- data/lib/epub/parser.rb +36 -6
- data/lib/epub/parser/content_document.rb +74 -6
- data/lib/epub/parser/ocf.rb +4 -1
- data/lib/epub/parser/publication.rb +34 -26
- data/lib/epub/parser/utils.rb +20 -0
- data/lib/epub/parser/version.rb +1 -1
- data/lib/epub/publication/package/manifest.rb +12 -3
- data/lib/epub/publication/package/metadata.rb +29 -1
- data/test/helper.rb +1 -0
- data/test/test_parser.rb +0 -1
- data/test/test_parser_content_document.rb +17 -4
- data/test/test_parser_ocf.rb +0 -1
- data/test/test_parser_publication.rb +0 -1
- data/test/test_publication.rb +20 -0
- metadata +140 -101
@@ -1,18 +1,48 @@
|
|
1
1
|
require 'epub/content_document'
|
2
2
|
require 'epub/constants'
|
3
|
+
require 'epub/parser/utils'
|
3
4
|
require 'nokogiri'
|
4
5
|
|
5
6
|
module EPUB
|
6
7
|
class Parser
|
7
8
|
class ContentDocument
|
9
|
+
include Utils
|
10
|
+
|
8
11
|
class << self
|
9
|
-
|
10
|
-
|
12
|
+
# @param [EPUB::Publication::Package::Manifest::Item] item
|
13
|
+
def parse(item)
|
14
|
+
new(item).parse
|
11
15
|
end
|
12
16
|
end
|
13
17
|
|
18
|
+
# @param [EPUB::Publication::Package::Manifest::Item] item
|
19
|
+
def initialize(item)
|
20
|
+
@item = item
|
21
|
+
end
|
22
|
+
|
14
23
|
# Build the content document object for the manifest item given to the constructor.
#
# Only navigation documents are actually parsed at the moment; any other
# supported document is rejected before the item is read, so no I/O or XML
# parsing is spent on a document that cannot be handled yet.
#
# @return [EPUB::ContentDocument::Navigation] when the item is a navigation document
# @return [nil] when the media type of the item is neither XHTML nor SVG
# @raise [NotImplementedError] when the item is XHTML or SVG but not a navigation document
def parse
  content_document = case @item.media_type
                     when 'application/xhtml+xml'
                       @item.nav? ? EPUB::ContentDocument::Navigation.new : EPUB::ContentDocument::XHTML.new
                     when 'image/svg+xml'
                       EPUB::ContentDocument::SVG.new
                     end
  return nil if content_document.nil?

  content_document.item = @item
  # Reject unsupported documents before touching the archive.
  raise NotImplementedError unless @item.nav?

  document = Nokogiri.XML(@item.read)
  content_document.navigations = parse_navigations(document)
  content_document
end
|
17
47
|
|
18
48
|
# @param [Nokogiri::HTML::Document] document HTML document or element including nav
|
@@ -24,13 +54,51 @@ module EPUB
|
|
24
54
|
# @param [Nokogiri::XML::Element] element nav element
|
25
55
|
# @return [EPUB::ContentDocument::Navigation::Nav] nav Nav object
|
26
56
|
def parse_navigation(element)
|
27
|
-
nav = EPUB::ContentDocument::Navigation::
|
28
|
-
nav.
|
29
|
-
nav.type = element
|
57
|
+
nav = EPUB::ContentDocument::Navigation::Navigation.new
|
58
|
+
nav.text = find_heading(element)
|
59
|
+
nav.type = extract_attribute(element, 'type', 'epub')
|
60
|
+
nav.items = element.xpath('./xhtml:ol/xhtml:li', EPUB::NAMESPACES).map {|elem| parse_navigation_item(elem)}
|
30
61
|
|
31
62
|
nav
|
32
63
|
end
|
33
64
|
|
65
|
+
# Parse one entry (li element) of a navigation list, including its nested entries.
#
# The label is the text of the first a or span child. When an a element has no
# text, the label falls back to its embedded content (iframe: name or srcdoc,
# img: alt, object: name) and finally to the title attribute of the a element.
# For a elements with an href, the manifest item whose href has the same
# request URI is looked up and attached.
#
# @param [Nokogiri::XML::Element] element li element
# @return [EPUB::ContentDocument::Navigation::Item] the parsed entry
# @return [nil] when the li element has neither an a nor a span child
def parse_navigation_item(element)
  a_or_span = element.xpath('./xhtml:a[1]|xhtml:span[1]', EPUB::NAMESPACES).first
  return nil if a_or_span.nil?

  item = EPUB::ContentDocument::Navigation::Item.new
  item.text = a_or_span.text
  if a_or_span.name == 'a'
    if item.text.empty?
      embedded_content = a_or_span.xpath('./xhtml:audio[1]|xhtml:canvas[1]|xhtml:embed[1]|xhtml:iframe[1]|xhtml:img[1]|xhtml:math[1]|xhtml:object[1]|xhtml:svg[1]|xhtml:video[1]', EPUB::NAMESPACES).first
      # Only iframe, img and object provide an alternative label; every other
      # kind of embedded content (or none at all) leaves the label empty.
      label = case embedded_content && embedded_content.name
              when 'iframe'
                (extract_attribute(embedded_content, 'name') || extract_attribute(embedded_content, 'srcdoc')).to_s
              when 'img'
                extract_attribute(embedded_content, 'alt').to_s
              when 'object'
                extract_attribute(embedded_content, 'name').to_s
              else
                ''
              end
      label = extract_attribute(a_or_span, 'title').to_s if label.empty?
      item.text = label
    end

    href = Addressable::URI.parse(extract_attribute(a_or_span, 'href'))
    item.href = href
    # Addressable::URI.parse returns nil for a missing href; such an entry
    # simply has no manifest item instead of raising NoMethodError.
    unless href.nil?
      request_uri = href.request_uri
      item.item = @item.manifest.items.find {|candidate| candidate.href.request_uri == request_uri}
    end
  end
  # NOTE(review): li children without an a or span yield nil entries here — confirm callers tolerate that.
  item.items = element.xpath('./xhtml:ol[1]/xhtml:li', EPUB::NAMESPACES).map {|li| parse_navigation_item(li)}

  item
end
|
101
|
+
|
34
102
|
private
|
35
103
|
|
36
104
|
# @param [Nokogiri::XML::Element] element nav element
|
data/lib/epub/parser/ocf.rb
CHANGED
@@ -6,6 +6,8 @@ require 'nokogiri'
|
|
6
6
|
module EPUB
|
7
7
|
class Parser
|
8
8
|
class OCF
|
9
|
+
include Utils
|
10
|
+
|
9
11
|
DIRECTORY = 'META-INF'
|
10
12
|
EPUB::OCF::MODULES.each {|m| self.const_set "#{m.upcase}_FILE", "#{m}.xml"}
|
11
13
|
|
@@ -38,7 +40,8 @@ module EPUB
|
|
38
40
|
doc.xpath('/ocf:container/ocf:rootfiles/ocf:rootfile', EPUB::NAMESPACES).each do |elem|
|
39
41
|
rootfile = EPUB::OCF::Container::Rootfile.new
|
40
42
|
%w[full-path media-type].each do |attr|
|
41
|
-
|
43
|
+
value = extract_attribute(elem, attr)
|
44
|
+
rootfile.__send__(attr.gsub(/-/, '_') + '=', value)
|
42
45
|
end
|
43
46
|
container.rootfiles << rootfile
|
44
47
|
end
|
@@ -8,9 +8,11 @@ require 'epub/constants'
|
|
8
8
|
module EPUB
|
9
9
|
class Parser
|
10
10
|
class Publication
|
11
|
+
include Utils
|
12
|
+
|
11
13
|
class << self
|
12
14
|
def parse(zip_archive, file)
|
13
|
-
opf = zip_archive.fopen(file).read
|
15
|
+
opf = zip_archive.fopen(Addressable::URI.unencode(file)).read
|
14
16
|
new(opf, file).parse
|
15
17
|
end
|
16
18
|
end
|
@@ -35,11 +37,10 @@ module EPUB
|
|
35
37
|
# Populate the package object from the attributes of the root element of the
# package document and remember the id of the unique identifier.
#
# @return the package object held in @package
def parse_package
  elem = @doc.root
  %w[version xml:lang dir id].each do |attr|
    value = extract_attribute(elem, attr)
    # A qualified name such as "xml:lang" passed as a local name without a
    # namespace is not expected to match the namespaced attribute, so look it
    # up again by local name in the XML namespace.
    if value.nil? && attr.start_with?('xml:')
      xml_attr = elem.attribute_with_ns(attr.sub(/\Axml:/, ''), 'http://www.w3.org/XML/1998/namespace')
      value = xml_attr.value unless xml_attr.nil?
    end
    @package.__send__ "#{attr.gsub(/\:/, '_')}=", value
  end
  @unique_identifier_id = elem['unique-identifier']
  @package.prefix = parse_prefix(extract_attribute(elem, 'prefix'))

  @package
end
|
@@ -52,7 +53,7 @@ module EPUB
|
|
52
53
|
metadata.identifiers = elem.xpath('./dc:identifier', EPUB::NAMESPACES).collect do |e|
|
53
54
|
identifier = EPUB::Publication::Package::Metadata::DCMES.new
|
54
55
|
identifier.content = e.content
|
55
|
-
identifier.id = id = e
|
56
|
+
identifier.id = id = extract_attribute(e, 'id')
|
56
57
|
metadata.unique_identifier = identifier if id == @unique_identifier_id
|
57
58
|
|
58
59
|
identifier
|
@@ -62,7 +63,7 @@ module EPUB
|
|
62
63
|
metadata.titles = elem.xpath('./dc:title', EPUB::NAMESPACES).collect do |e|
|
63
64
|
title = EPUB::Publication::Package::Metadata::Title.new
|
64
65
|
%w[ id lang dir ].each do |attr|
|
65
|
-
title.__send__("#{attr}=", e
|
66
|
+
title.__send__("#{attr}=", extract_attribute(e, attr))
|
66
67
|
end
|
67
68
|
title.content = e.content
|
68
69
|
|
@@ -85,9 +86,12 @@ module EPUB
|
|
85
86
|
|
86
87
|
metadata.metas = elem.xpath('./opf:meta', EPUB::NAMESPACES).collect do |e|
|
87
88
|
meta = EPUB::Publication::Package::Metadata::Meta.new
|
88
|
-
%w[
|
89
|
+
%w[property id scheme].each do |attr|
|
90
|
+
meta.__send__ "#{attr}=", extract_attribute(e, attr)
|
91
|
+
end
|
89
92
|
meta.content = e.content
|
90
|
-
|
93
|
+
refines = extract_attribute(e, 'refines')
|
94
|
+
if refines && refines[0] == '#'
|
91
95
|
id = refines[1..-1]
|
92
96
|
id_map[id] ||= {}
|
93
97
|
id_map[id][:refiners] ||= []
|
@@ -101,11 +105,12 @@ module EPUB
|
|
101
105
|
metadata.links = elem.xpath('./opf:link', EPUB::NAMESPACES).collect do |e|
|
102
106
|
link = EPUB::Publication::Package::Metadata::Link.new
|
103
107
|
%w[ id media-type ].each do |attr|
|
104
|
-
link.__send__(attr.gsub(/-/, '_') + '=', e
|
108
|
+
link.__send__ (attr.gsub(/-/, '_') + '='), extract_attribute(e, attr)
|
105
109
|
end
|
106
|
-
link.href = Addressable::URI.parse(e
|
107
|
-
link.rel = e
|
108
|
-
|
110
|
+
link.href = Addressable::URI.parse(extract_attribute(e, 'href'))
|
111
|
+
link.rel = extract_attribute(e, 'rel').strip.split
|
112
|
+
refines = extract_attribute(e, 'refines')
|
113
|
+
if refines && refines[0] == '#'
|
109
114
|
id = refines[1..-1]
|
110
115
|
id_map[id] ||= {}
|
111
116
|
id_map[id][:refiners] ||= []
|
@@ -128,17 +133,19 @@ module EPUB
|
|
128
133
|
def parse_manifest
|
129
134
|
manifest = @package.manifest = EPUB::Publication::Package::Manifest.new
|
130
135
|
elem = @doc.xpath('/opf:package/opf:manifest', EPUB::NAMESPACES).first
|
131
|
-
manifest.id = elem
|
136
|
+
manifest.id = extract_attribute(elem, 'id')
|
132
137
|
|
133
138
|
fallback_map = {}
|
134
139
|
elem.xpath('./opf:item', EPUB::NAMESPACES).each do |e|
|
135
140
|
item = EPUB::Publication::Package::Manifest::Item.new
|
136
141
|
%w[ id media-type media-overlay ].each do |attr|
|
137
|
-
item.__send__
|
142
|
+
item.__send__ "#{attr.gsub(/-/, '_')}=", extract_attribute(e, attr)
|
138
143
|
end
|
139
|
-
item.href = Addressable::URI.parse(e
|
140
|
-
|
141
|
-
|
144
|
+
item.href = Addressable::URI.parse(extract_attribute(e, 'href'))
|
145
|
+
fallback = extract_attribute(e, 'fallback')
|
146
|
+
fallback_map[fallback] = item if fallback
|
147
|
+
properties = extract_attribute(e, 'properties')
|
148
|
+
item.properties = properties ? properties.split(' ') : []
|
142
149
|
manifest << item
|
143
150
|
end
|
144
151
|
fallback_map.each_pair do |id, from|
|
@@ -152,16 +159,17 @@ module EPUB
|
|
152
159
|
spine = @package.spine = EPUB::Publication::Package::Spine.new
|
153
160
|
elem = @doc.xpath('/opf:package/opf:spine', EPUB::NAMESPACES).first
|
154
161
|
%w[ id toc page-progression-direction ].each do |attr|
|
155
|
-
spine.__send__
|
162
|
+
spine.__send__ "#{attr.gsub(/-/, '_')}=", extract_attribute(elem, attr)
|
156
163
|
end
|
157
164
|
|
158
165
|
elem.xpath('./opf:itemref', EPUB::NAMESPACES).each do |e|
|
159
166
|
itemref = EPUB::Publication::Package::Spine::Itemref.new
|
160
167
|
%w[ idref id ].each do |attr|
|
161
|
-
itemref.__send__
|
168
|
+
itemref.__send__ "#{attr}=", extract_attribute(e, attr)
|
162
169
|
end
|
163
|
-
itemref.linear = (e
|
164
|
-
|
170
|
+
itemref.linear = (extract_attribute(e, 'linear') != 'no')
|
171
|
+
properties = extract_attribute(e, 'properties')
|
172
|
+
itemref.properties = properties ? properties.split(' ') : []
|
165
173
|
spine << itemref
|
166
174
|
end
|
167
175
|
|
@@ -173,9 +181,9 @@ module EPUB
|
|
173
181
|
@doc.xpath('/opf:package/opf:guide/opf:reference', EPUB::NAMESPACES).each do |ref|
|
174
182
|
reference = EPUB::Publication::Package::Guide::Reference.new
|
175
183
|
%w[ type title ].each do |attr|
|
176
|
-
reference.__send__
|
184
|
+
reference.__send__ "#{attr}=", extract_attribute(ref, attr)
|
177
185
|
end
|
178
|
-
reference.href = Addressable::URI.parse(ref
|
186
|
+
reference.href = Addressable::URI.parse(extract_attribute(ref, 'href'))
|
179
187
|
guide << reference
|
180
188
|
end
|
181
189
|
|
@@ -186,9 +194,9 @@ module EPUB
|
|
186
194
|
bindings = @package.bindings = EPUB::Publication::Package::Bindings.new
|
187
195
|
@doc.xpath('/opf:package/opf:bindings/opf:mediaType', EPUB::NAMESPACES).each do |elem|
|
188
196
|
media_type = EPUB::Publication::Package::Bindings::MediaType.new
|
189
|
-
media_type.media_type = elem
|
197
|
+
media_type.media_type = extract_attribute(elem, 'media-type')
|
190
198
|
items = @package.manifest.items
|
191
|
-
media_type.handler = items.detect {|item| item.id == elem
|
199
|
+
media_type.handler = items.detect {|item| item.id == extract_attribute(elem, 'handler')}
|
192
200
|
bindings << media_type
|
193
201
|
end
|
194
202
|
|
@@ -218,7 +226,7 @@ module EPUB
|
|
218
226
|
md = EPUB::Publication::Package::Metadata::DCMES.new
|
219
227
|
md.content = e.content
|
220
228
|
%w[ id lang dir ].each do |attr|
|
221
|
-
md.__send__
|
229
|
+
md.__send__ "#{attr}=", extract_attribute(e, attr)
|
222
230
|
end
|
223
231
|
yield(md, e) if block_given?
|
224
232
|
md
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module EPUB
  class Parser
    # Helpers shared by the parser classes that include this module.
    module Utils
      # Extract the value of an attribute of an element.
      #
      # @todo Refine Nokogiri::XML::Node instead of using this method once Ruby 2.0 becomes popular
      #
      # @param [Nokogiri::XML::Element] element
      # @param [String] name name of attribute excluding namespace prefix
      # @param [String, nil] prefix XML namespace prefix in {EPUB::Constants::NAMESPACES} keys;
      #   nil (the default) looks the attribute up without a namespace
      # @return [String] value of attribute when the attribute exists
      # @return [nil] when the attribute doesn't exist
      def extract_attribute(element, name, prefix=nil)
        # NOTE(review): a prefix missing from EPUB::NAMESPACES presumably resolves to nil
        # and is therefore treated like "no namespace" — confirm that is intended.
        attr = element.attribute_with_ns(name, EPUB::NAMESPACES[prefix])
        attr && attr.value
      end
      module_function :extract_attribute
    end
  end
end
|
data/lib/epub/parser/version.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'enumerabler'
|
2
2
|
require 'epub/constants'
|
3
|
+
require 'epub/parser/content_document'
|
3
4
|
|
4
5
|
module EPUB
|
5
6
|
module Publication
|
@@ -14,9 +15,8 @@ module EPUB
|
|
14
15
|
@items[item.id] = item
|
15
16
|
end
|
16
17
|
|
17
|
-
# syntax sugar
|
18
18
|
def navs
|
19
|
-
items.selector
|
19
|
+
items.selector(&:nav?)
|
20
20
|
end
|
21
21
|
|
22
22
|
def nav
|
@@ -62,10 +62,14 @@ module EPUB
|
|
62
62
|
rootfile = Addressable::URI.parse(manifest.package.book.ocf.container.rootfile.full_path)
|
63
63
|
Zip::Archive.open(manifest.package.book.epub_file) {|zip|
|
64
64
|
path = Addressable::URI.unescape(rootfile + href.normalize.request_uri)
|
65
|
-
zip.fopen(path
|
65
|
+
zip.fopen(path).read
|
66
66
|
}
|
67
67
|
end
|
68
68
|
|
69
|
+
def nav?
|
70
|
+
properties.include? 'nav'
|
71
|
+
end
|
72
|
+
|
69
73
|
# @todo Handle circular fallback chain
|
70
74
|
def use_fallback_chain(options = {})
|
71
75
|
supported = EPUB::MediaType::CORE
|
@@ -84,6 +88,11 @@ module EPUB
|
|
84
88
|
raise EPUB::MediaType::UnsupportedError
|
85
89
|
end
|
86
90
|
|
91
|
+
def content_document
|
92
|
+
return nil unless %w[application/xhtml+xml image/svg+xml].include? media_type
|
93
|
+
@content_document ||= Parser::ContentDocument.parse(self)
|
94
|
+
end
|
95
|
+
|
87
96
|
protected
|
88
97
|
|
89
98
|
def traverse_fallback_chain(chain)
|
@@ -31,17 +31,31 @@ module EPUB
|
|
31
31
|
titles.select {|title| title.title_type.to_s == type}.sort.join(' ')
|
32
32
|
end
|
33
33
|
end
|
34
|
+
|
34
35
|
def subtitle
|
35
36
|
titles.select {|title| title.title_type.to_s == 'subtitle'}.sort.join(' ')
|
36
37
|
end
|
37
38
|
|
38
|
-
def
|
39
|
+
def description
|
40
|
+
descriptions.join ' '
|
41
|
+
end
|
42
|
+
|
43
|
+
def date
|
44
|
+
dates.first
|
45
|
+
end
|
46
|
+
|
47
|
+
def to_h
|
39
48
|
DC_ELEMS.inject({}) do |hsh, elem|
|
40
49
|
hsh[elem] = __send__(elem)
|
41
50
|
hsh
|
42
51
|
end
|
43
52
|
end
|
44
53
|
|
54
|
+
def to_hash
|
55
|
+
warn "#{self.class}##{__method__} is obsolete"
|
56
|
+
to_h
|
57
|
+
end
|
58
|
+
|
45
59
|
def primary_metas
|
46
60
|
metas.select {|meta| meta.primary_expression?}
|
47
61
|
end
|
@@ -69,6 +83,13 @@ module EPUB
|
|
69
83
|
|
70
84
|
attr_accessor :content, :id, :lang, :dir
|
71
85
|
|
86
|
+
# Developer-oriented representation listing the class, the object id in
# hexadecimal and every instance variable with its inspected value.
#
# @return [String] e.g. +#<Class:0x3fd2a4 @content="Title" @id=nil>+
def inspect
  ivs = instance_variables.map {|iv|
    "#{iv}=#{instance_variable_get(iv).inspect}"
  }.join(' ')
  # Ruby's conventional opener is "#<", not "<#".
  '#<%s:%#0x %s>' % [self.class, __id__, ivs]
end
|
92
|
+
|
72
93
|
def to_s
|
73
94
|
content
|
74
95
|
end
|
@@ -104,6 +125,13 @@ module EPUB
|
|
104
125
|
! subexpression?
|
105
126
|
end
|
106
127
|
|
128
|
+
# Developer-oriented representation listing the class, the object id in
# hexadecimal and every instance variable with its inspected value.
#
# @return [String] e.g. +#<Class:0x3fd2a4 @property="title-type" @content="main">+
def inspect
  ivs = instance_variables.map {|iv|
    "#{iv}=#{instance_variable_get(iv).inspect}"
  }.join(' ')
  # Ruby's conventional opener is "#<", not "<#".
  '#<%s:%#0x %s>' % [self.class, __id__, ivs]
end
|
134
|
+
|
107
135
|
def to_s
|
108
136
|
content
|
109
137
|
end
|
data/test/helper.rb
CHANGED
data/test/test_parser.rb
CHANGED
@@ -1,18 +1,31 @@
|
|
1
1
|
require_relative 'helper'
|
2
|
-
require 'epub/parser/content_document'
|
3
2
|
|
4
3
|
class TestParserContentDocument < Test::Unit::TestCase
|
5
4
|
def setup
|
5
|
+
@manifest = EPUB::Publication::Package::Manifest.new
|
6
|
+
%w[item-1.xhtml item-2.xhtml nav.xhtml].each.with_index do |href, index|
|
7
|
+
item = EPUB::Publication::Package::Manifest::Item.new
|
8
|
+
item.id = index
|
9
|
+
item.href = Addressable::URI.parse(href)
|
10
|
+
@manifest << item
|
11
|
+
end
|
12
|
+
|
6
13
|
@dir = 'test/fixtures/book'
|
7
|
-
@parser = EPUB::Parser::ContentDocument.new
|
14
|
+
@parser = EPUB::Parser::ContentDocument.new(@manifest.items.last)
|
8
15
|
end
|
9
16
|
|
10
17
|
def test_parse_navigations
|
11
18
|
doc = Nokogiri.XML open("#{@dir}/OPS/nav.xhtml")
|
12
19
|
navs = @parser.parse_navigations doc
|
20
|
+
nav = navs.first
|
13
21
|
|
14
22
|
assert_equal 1, navs.length
|
15
|
-
assert_equal 'Table of Contents',
|
16
|
-
assert_equal 'toc',
|
23
|
+
assert_equal 'Table of Contents', nav.heading
|
24
|
+
assert_equal 'toc', nav.type
|
25
|
+
|
26
|
+
assert_equal 2, nav.items.length
|
27
|
+
assert_equal @manifest.items.first, nav.items.first.item
|
28
|
+
assert_equal @manifest.items[1], nav.items[1].items[0].item
|
29
|
+
assert_equal @manifest.items[1], nav.items[1].items[1].item
|
17
30
|
end
|
18
31
|
end
|