epub-parser 0.2.5 → 0.2.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +3 -3
- data/.yardopts +2 -0
- data/CHANGELOG.markdown +14 -1
- data/README.markdown +15 -29
- data/Rakefile +39 -4
- data/docs/FixedLayout.markdown +1 -1
- data/docs/Item.markdown +1 -1
- data/epub-parser.gemspec +2 -0
- data/examples/exctract-content-using-cfi.rb +111 -0
- data/examples/find-elements-and-cfis.rb +54 -0
- data/lib/epub/book/features.rb +36 -29
- data/lib/epub/constants.rb +2 -1
- data/lib/epub/inspector.rb +8 -3
- data/lib/epub/metadata.rb +178 -0
- data/lib/epub/ocf/container.rb +2 -1
- data/lib/epub/ocf/metadata.rb +2 -1
- data/lib/epub/ocf/physical_container.rb +11 -2
- data/lib/epub/ocf/physical_container/archive_zip.rb +7 -5
- data/lib/epub/ocf/physical_container/unpacked_directory.rb +4 -0
- data/lib/epub/ocf/physical_container/unpacked_uri.rb +4 -0
- data/lib/epub/ocf/physical_container/zipruby.rb +17 -5
- data/lib/epub/parser.rb +12 -5
- data/lib/epub/parser/metadata.rb +67 -0
- data/lib/epub/parser/ocf.rb +19 -4
- data/lib/epub/parser/publication.rb +32 -88
- data/lib/epub/parser/version.rb +1 -1
- data/lib/epub/publication/package/metadata.rb +2 -167
- data/test/fixtures/book/META-INF/metadata.xml +6 -0
- data/test/helper.rb +3 -0
- data/test/test_epub.rb +5 -1
- data/test/test_inspect.rb +4 -4
- data/test/test_parser_fixed_layout.rb +3 -2
- data/test/test_parser_ocf.rb +16 -1
- data/test/test_parser_publication.rb +14 -13
- data/test/test_publication.rb +36 -0
- data/test/test_searcher.rb +1 -1
- metadata +35 -3
data/lib/epub/parser/ocf.rb
CHANGED
@@ -1,12 +1,14 @@
|
|
1
1
|
require 'epub/constants'
|
2
2
|
require 'epub/ocf'
|
3
3
|
require 'epub/ocf/physical_container'
|
4
|
+
require 'epub/parser/metadata'
|
4
5
|
require 'nokogiri'
|
5
6
|
|
6
7
|
module EPUB
|
7
8
|
class Parser
|
8
9
|
class OCF
|
9
10
|
include Utils
|
11
|
+
include Metadata
|
10
12
|
|
11
13
|
DIRECTORY = 'META-INF'
|
12
14
|
|
@@ -26,9 +28,7 @@ module EPUB
|
|
26
28
|
begin
|
27
29
|
data = @container.read(File.join(DIRECTORY, "#{m}.xml"))
|
28
30
|
@ocf.__send__ "#{m}=", __send__("parse_#{m}", data)
|
29
|
-
rescue EPUB::OCF::PhysicalContainer::NoEntry
|
30
|
-
rescue => error
|
31
|
-
raise error unless (Object.const_defined? :Zip and ::Zip.const_defined? :Error and error.kind_of? ::Zip::Error)
|
31
|
+
rescue EPUB::OCF::PhysicalContainer::NoEntry
|
32
32
|
end
|
33
33
|
end
|
34
34
|
|
@@ -59,7 +59,14 @@ module EPUB
|
|
59
59
|
end
|
60
60
|
|
61
61
|
def parse_metadata(content)
|
62
|
-
|
62
|
+
doc = Nokogiri.XML(content)
|
63
|
+
unless multiple_rendition_metadata?(doc)
|
64
|
+
warn "Not implemented: #{self.class}##{__method__}" if $VERBOSE
|
65
|
+
metadata = EPUB::OCF::UnknownFormatMetadata.new
|
66
|
+
metadata.content = content
|
67
|
+
return metadata
|
68
|
+
end
|
69
|
+
super(doc.root, doc.root['unique-identifier'], 'metadata')
|
63
70
|
end
|
64
71
|
|
65
72
|
def parse_rights(content)
|
@@ -69,6 +76,14 @@ module EPUB
|
|
69
76
|
def parse_signatures(content)
|
70
77
|
warn "Not implemented: #{self.class}##{__method__}" if $VERBOSE
|
71
78
|
end
|
79
|
+
|
80
|
+
private
|
81
|
+
|
82
|
+
def multiple_rendition_metadata?(doc)
|
83
|
+
doc.root &&
|
84
|
+
doc.root.name == 'metadata' &&
|
85
|
+
doc.namespaces['xmlns'] == EPUB::NAMESPACES['metadata']
|
86
|
+
end
|
72
87
|
end
|
73
88
|
end
|
74
89
|
end
|
data/lib/epub/parser/publication.rb
CHANGED
@@ -2,79 +2,56 @@ require 'strscan'
|
|
2
2
|
require 'nokogiri'
|
3
3
|
require 'epub/publication'
|
4
4
|
require 'epub/constants'
|
5
|
+
require 'epub/parser/metadata'
|
5
6
|
|
6
7
|
module EPUB
|
7
8
|
class Parser
|
8
9
|
class Publication
|
9
10
|
include Utils
|
11
|
+
include Metadata
|
10
12
|
|
11
13
|
class << self
|
12
14
|
def parse(container, file)
|
13
15
|
opf = container.read(Addressable::URI.unencode(file))
|
14
16
|
|
15
|
-
new(opf
|
17
|
+
new(opf).parse
|
16
18
|
end
|
17
19
|
end
|
18
20
|
|
19
|
-
def initialize(opf, rootfile)
|
20
|
-
|
21
|
-
@rootfile = Addressable::URI.parse(rootfile)
|
21
|
+
def initialize(opf, rootfile=nil)
|
22
|
+
warn "Second argument for #{self.class}.new is deprecated" if rootfile
|
22
23
|
@doc = Nokogiri.XML(opf)
|
23
24
|
end
|
24
25
|
|
25
26
|
def parse
|
26
|
-
|
27
|
-
|
27
|
+
package = parse_package(@doc)
|
28
|
+
(EPUB::Publication::Package::CONTENT_MODELS - [:bindings]).each do |model|
|
29
|
+
package.__send__ "#{model}=", __send__("parse_#{model}", @doc)
|
28
30
|
end
|
31
|
+
package.bindings = parse_bindings(@doc, package.manifest)
|
29
32
|
|
30
|
-
|
33
|
+
package
|
31
34
|
end
|
32
35
|
|
33
|
-
def parse_package
|
34
|
-
|
36
|
+
def parse_package(doc)
|
37
|
+
package = EPUB::Publication::Package.new
|
38
|
+
elem = doc.root
|
35
39
|
%w[version xml:lang dir id].each do |attr|
|
36
|
-
|
40
|
+
package.__send__ "#{attr.gsub(/\:/, '_')}=", extract_attribute(elem, attr)
|
37
41
|
end
|
38
|
-
|
39
|
-
|
40
|
-
EPUB::Publication.__send__ :include, EPUB::Publication::FixedLayout if @package.prefix.key? EPUB::Publication::FixedLayout::PREFIX_KEY
|
42
|
+
package.prefix = parse_prefix(extract_attribute(elem, 'prefix'))
|
43
|
+
EPUB::Publication.__send__ :include, EPUB::Publication::FixedLayout if package.prefix.key? EPUB::Publication::FixedLayout::PREFIX_KEY
|
41
44
|
|
42
|
-
|
45
|
+
package
|
43
46
|
end
|
44
47
|
|
45
|
-
def parse_metadata
|
46
|
-
|
47
|
-
elem = @doc.xpath('/opf:package/opf:metadata', EPUB::NAMESPACES).first
|
48
|
-
id_map = {}
|
49
|
-
|
50
|
-
metadata.identifiers = extract_model(elem, id_map, './dc:identifier', :Identifier, ['id']) {|identifier, e|
|
51
|
-
identifier.scheme = extract_attribute(e, 'scheme', 'opf')
|
52
|
-
metadata.unique_identifier = identifier if identifier.id == @unique_identifier_id
|
53
|
-
}
|
54
|
-
metadata.titles = extract_model(elem, id_map, './dc:title', :Title)
|
55
|
-
metadata.languages = extract_model(elem, id_map, './dc:language', :DCMES, %w[id])
|
56
|
-
%w[contributor coverage creator date description format publisher relation source subject type].each do |dcmes|
|
57
|
-
metadata.__send__ "#{dcmes}s=", extract_model(elem, id_map, "./dc:#{dcmes}")
|
58
|
-
end
|
59
|
-
metadata.rights = extract_model(elem, id_map, './dc:rights')
|
60
|
-
metadata.metas = extract_refinee(elem, id_map, './opf:meta', :Meta, %w[property id scheme])
|
61
|
-
metadata.links = extract_refinee(elem, id_map, './opf:link', :Link, %w[id media-type]) {|link, e|
|
62
|
-
link.href = extract_attribute(e, 'href')
|
63
|
-
link.rel = Set.new(extract_attribute(e, 'rel').split(nil))
|
64
|
-
}
|
65
|
-
|
66
|
-
id_map.values.each do |hsh|
|
67
|
-
next unless hsh[:refiners]
|
68
|
-
next unless hsh[:metadata]
|
69
|
-
hsh[:refiners].each {|meta| meta.refines = hsh[:metadata]}
|
70
|
-
end
|
71
|
-
|
72
|
-
metadata
|
48
|
+
def parse_metadata(doc)
|
49
|
+
super(doc.xpath('/opf:package/opf:metadata', EPUB::NAMESPACES).first, doc.root['unique-identifier'], 'opf')
|
73
50
|
end
|
74
51
|
|
75
|
-
def parse_manifest
|
76
|
-
manifest =
|
77
|
-
elem =
|
52
|
+
def parse_manifest(doc)
|
53
|
+
manifest = EPUB::Publication::Package::Manifest.new
|
54
|
+
elem = doc.xpath('/opf:package/opf:manifest', EPUB::NAMESPACES).first
|
78
55
|
manifest.id = extract_attribute(elem, 'id')
|
79
56
|
|
80
57
|
fallback_map = {}
|
@@ -97,9 +74,9 @@ module EPUB
|
|
97
74
|
manifest
|
98
75
|
end
|
99
76
|
|
100
|
-
def parse_spine
|
101
|
-
spine =
|
102
|
-
elem =
|
77
|
+
def parse_spine(doc)
|
78
|
+
spine = EPUB::Publication::Package::Spine.new
|
79
|
+
elem = doc.xpath('/opf:package/opf:spine', EPUB::NAMESPACES).first
|
103
80
|
%w[id toc page-progression-direction].each do |attr|
|
104
81
|
spine.__send__ "#{attr.gsub(/-/, '_')}=", extract_attribute(elem, attr)
|
105
82
|
end
|
@@ -118,9 +95,9 @@ module EPUB
|
|
118
95
|
spine
|
119
96
|
end
|
120
97
|
|
121
|
-
def parse_guide
|
122
|
-
guide =
|
123
|
-
|
98
|
+
def parse_guide(doc)
|
99
|
+
guide = EPUB::Publication::Package::Guide.new
|
100
|
+
doc.xpath('/opf:package/opf:guide/opf:reference', EPUB::NAMESPACES).each do |ref|
|
124
101
|
reference = EPUB::Publication::Package::Guide::Reference.new
|
125
102
|
%w[type title].each do |attr|
|
126
103
|
reference.__send__ "#{attr}=", extract_attribute(ref, attr)
|
@@ -132,12 +109,12 @@ module EPUB
|
|
132
109
|
guide
|
133
110
|
end
|
134
111
|
|
135
|
-
def parse_bindings
|
136
|
-
bindings =
|
137
|
-
|
112
|
+
def parse_bindings(doc, handler_map)
|
113
|
+
bindings = EPUB::Publication::Package::Bindings.new
|
114
|
+
doc.xpath('/opf:package/opf:bindings/opf:mediaType', EPUB::NAMESPACES).each do |elem|
|
138
115
|
media_type = EPUB::Publication::Package::Bindings::MediaType.new
|
139
116
|
media_type.media_type = extract_attribute(elem, 'media-type')
|
140
|
-
media_type.handler =
|
117
|
+
media_type.handler = handler_map[extract_attribute(elem, 'handler')]
|
141
118
|
bindings << media_type
|
142
119
|
end
|
143
120
|
|
@@ -161,39 +138,6 @@ module EPUB
|
|
161
138
|
end
|
162
139
|
prefixes
|
163
140
|
end
|
164
|
-
|
165
|
-
def extract_model(elem, id_map, xpath, klass=:DCMES, attributes=%w[id lang dir])
|
166
|
-
models = elem.xpath(xpath, EPUB::NAMESPACES).collect do |e|
|
167
|
-
model = EPUB::Publication::Package::Metadata.const_get(klass).new
|
168
|
-
attributes.each do |attr|
|
169
|
-
model.__send__ "#{attr.gsub(/-/, '_')}=", extract_attribute(e, attr)
|
170
|
-
end
|
171
|
-
model.content = e.content unless klass == :Link
|
172
|
-
|
173
|
-
yield model, e if block_given?
|
174
|
-
|
175
|
-
model
|
176
|
-
end
|
177
|
-
|
178
|
-
models.each do |model|
|
179
|
-
id_map[model.id] = {metadata: model} if model.respond_to?(:id) && model.id
|
180
|
-
end
|
181
|
-
|
182
|
-
models
|
183
|
-
end
|
184
|
-
|
185
|
-
def extract_refinee(elem, id_map, xpath, klass, attributes)
|
186
|
-
extract_model(elem, id_map, xpath, klass, attributes) {|model, e|
|
187
|
-
yield model, e if block_given?
|
188
|
-
refines = extract_attribute(e, 'refines')
|
189
|
-
if refines && refines[0] == '#'
|
190
|
-
id = refines[1..-1]
|
191
|
-
id_map[id] ||= {}
|
192
|
-
id_map[id][:refiners] ||= []
|
193
|
-
id_map[id][:refiners] << model
|
194
|
-
end
|
195
|
-
}
|
196
|
-
end
|
197
141
|
end
|
198
142
|
end
|
199
143
|
end
|
data/lib/epub/parser/version.rb
CHANGED
data/lib/epub/publication/package/metadata.rb
CHANGED
@@ -1,174 +1,9 @@
|
|
1
|
-
require '
|
1
|
+
require 'epub/metadata'
|
2
2
|
|
3
3
|
module EPUB
|
4
4
|
module Publication
|
5
5
|
class Package
|
6
|
-
|
7
|
-
include Inspector::PublicationModel
|
8
|
-
|
9
|
-
DC_ELEMS = [:identifiers, :titles, :languages] +
|
10
|
-
[:contributors, :coverages, :creators, :dates, :descriptions, :formats, :publishers,
|
11
|
-
:relations, :rights, :sources, :subjects, :types]
|
12
|
-
attr_accessor :package, :unique_identifier, :metas, :links,
|
13
|
-
*(DC_ELEMS.collect {|elem| "dc_#{elem}"})
|
14
|
-
DC_ELEMS.each do |elem|
|
15
|
-
alias_method elem, "dc_#{elem}"
|
16
|
-
alias_method "#{elem}=", "dc_#{elem}="
|
17
|
-
end
|
18
|
-
|
19
|
-
def initialize
|
20
|
-
(DC_ELEMS + [:metas, :links]).each do |elem|
|
21
|
-
__send__ "#{elem}=", []
|
22
|
-
end
|
23
|
-
end
|
24
|
-
|
25
|
-
def title
|
26
|
-
return extended_title unless extended_title.empty?
|
27
|
-
compositted = titles.select {|title| title.display_seq}.sort.join("\n")
|
28
|
-
return compositted unless compositted.empty?
|
29
|
-
return main_title unless main_title.empty?
|
30
|
-
titles.sort.join("\n")
|
31
|
-
end
|
32
|
-
|
33
|
-
%w[main short collection edition extended].each do |type|
|
34
|
-
define_method "#{type}_title" do
|
35
|
-
titles.select {|title| title.title_type.to_s == type}.sort.join(' ')
|
36
|
-
end
|
37
|
-
end
|
38
|
-
|
39
|
-
def subtitle
|
40
|
-
titles.select {|title| title.title_type.to_s == 'subtitle'}.sort.join(' ')
|
41
|
-
end
|
42
|
-
|
43
|
-
def description
|
44
|
-
descriptions.join(' ')
|
45
|
-
end
|
46
|
-
|
47
|
-
def date
|
48
|
-
dates.first
|
49
|
-
end
|
50
|
-
|
51
|
-
def language
|
52
|
-
languages.first
|
53
|
-
end
|
54
|
-
|
55
|
-
def modified
|
56
|
-
metas.find {|meta| meta.property == 'dcterms:modified'}
|
57
|
-
end
|
58
|
-
|
59
|
-
def to_h
|
60
|
-
DC_ELEMS.inject({}) do |hsh, elem|
|
61
|
-
hsh[elem] = __send__(elem)
|
62
|
-
hsh
|
63
|
-
end
|
64
|
-
end
|
65
|
-
|
66
|
-
def primary_metas
|
67
|
-
metas.select {|meta| meta.primary_expression?}
|
68
|
-
end
|
69
|
-
|
70
|
-
module Refinee
|
71
|
-
PROPERTIES = %w[alternate-script display-seq file-as group-position identifier-type meta-auth role title-type]
|
72
|
-
|
73
|
-
attr_writer :refiners
|
74
|
-
|
75
|
-
def refiners
|
76
|
-
@refiners ||= Set.new
|
77
|
-
end
|
78
|
-
|
79
|
-
PROPERTIES.each do |voc|
|
80
|
-
met = voc.gsub(/-/, '_')
|
81
|
-
attr_writer met
|
82
|
-
define_method met do
|
83
|
-
refiners.find {|refiner| refiner.property == voc}
|
84
|
-
end
|
85
|
-
end
|
86
|
-
end
|
87
|
-
|
88
|
-
class DCMES
|
89
|
-
include Refinee
|
90
|
-
|
91
|
-
attr_accessor :content, :id, :lang, :dir
|
92
|
-
|
93
|
-
def to_s
|
94
|
-
content.to_s
|
95
|
-
end
|
96
|
-
end
|
97
|
-
|
98
|
-
class Identifier < DCMES
|
99
|
-
# @note This is ad-hoc
|
100
|
-
# @todo Define and include OPF module for opf:scheme attribute
|
101
|
-
# @todo Define general way to handle with identifier-type refiners
|
102
|
-
attr_accessor :scheme
|
103
|
-
|
104
|
-
# @note This is ad-hoc
|
105
|
-
# @todo Define and include OPF module for opf:scheme attribute
|
106
|
-
# @todo Define general way to handle with identifier-type refiners
|
107
|
-
def isbn?
|
108
|
-
scheme == 'ISBN' or
|
109
|
-
content.to_s.downcase.start_with? 'urn:isbn' or
|
110
|
-
refiners.any? {|refiner|
|
111
|
-
refiner.property == 'identifier-type' and
|
112
|
-
refiner.scheme == 'onix:codelist5' and
|
113
|
-
%w[02 15].include? refiner.content
|
114
|
-
}
|
115
|
-
end
|
116
|
-
end
|
117
|
-
|
118
|
-
class Title < DCMES
|
119
|
-
include Comparable
|
120
|
-
|
121
|
-
def <=>(other)
|
122
|
-
return 1 if other.display_seq.nil?
|
123
|
-
return -1 if display_seq.nil?
|
124
|
-
display_seq.to_s.to_i <=> other.display_seq.to_s.to_i
|
125
|
-
end
|
126
|
-
end
|
127
|
-
|
128
|
-
class Meta
|
129
|
-
include Refinee
|
130
|
-
|
131
|
-
attr_accessor :property, :id, :scheme, :content
|
132
|
-
attr_reader :refines
|
133
|
-
|
134
|
-
def refines=(refinee)
|
135
|
-
refinee.refiners << self
|
136
|
-
@refines = refinee
|
137
|
-
end
|
138
|
-
|
139
|
-
def refines?
|
140
|
-
! refines.nil?
|
141
|
-
end
|
142
|
-
alias subexpression? refines?
|
143
|
-
|
144
|
-
def primary_expression?
|
145
|
-
! subexpression?
|
146
|
-
end
|
147
|
-
|
148
|
-
def inspect
|
149
|
-
ivs = instance_variables.map {|iv|
|
150
|
-
[iv, instance_variable_get(iv).inspect].join('=')
|
151
|
-
}.join(' ')
|
152
|
-
'<#%s:%#0x %s>' % [self.class, __id__, ivs]
|
153
|
-
end
|
154
|
-
|
155
|
-
def to_s
|
156
|
-
content.to_s
|
157
|
-
end
|
158
|
-
end
|
159
|
-
|
160
|
-
class Link
|
161
|
-
include Refinee
|
162
|
-
|
163
|
-
attr_accessor :href, :rel, :id, :media_type
|
164
|
-
attr_reader :refines
|
165
|
-
|
166
|
-
def refines=(refinee)
|
167
|
-
refinee.refiners << self
|
168
|
-
@refines = refinee
|
169
|
-
end
|
170
|
-
end
|
171
|
-
end
|
6
|
+
Metadata = EPUB::Metadata
|
172
7
|
end
|
173
8
|
end
|
174
9
|
end
|
data/test/fixtures/book/META-INF/metadata.xml
ADDED
@@ -0,0 +1,6 @@
|
|
1
|
+
<metadata xmlns="http://www.idpf.org/2013/metadata"
|
2
|
+
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
3
|
+
unique-identifier="pub-id">
|
4
|
+
<dc:identifier id="pub-id">urn:uuid:A1B0D67E-2E81-4DF5-9E67-A64CBE366809</dc:identifier>
|
5
|
+
<meta property="dcterms:modified">2011-01-01T12:00:00Z</meta>
|
6
|
+
</metadata>
|
data/test/helper.rb
CHANGED
data/test/test_epub.rb
CHANGED
@@ -17,5 +17,9 @@ class TestEUPB < Test::Unit::TestCase
|
|
17
17
|
assert_kind_of EPUB::Publication::Package::Manifest::Item, entry
|
18
18
|
end
|
19
19
|
end
|
20
|
-
end
|
21
20
|
|
21
|
+
def test_book_has_multiple_packages
|
22
|
+
book = EPUB::Parser.parse(@file)
|
23
|
+
assert_instance_of Array, book.packages
|
24
|
+
end
|
25
|
+
end
|