epub-parser 0.2.5 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +3 -3
- data/.yardopts +2 -0
- data/CHANGELOG.markdown +14 -1
- data/README.markdown +15 -29
- data/Rakefile +39 -4
- data/docs/FixedLayout.markdown +1 -1
- data/docs/Item.markdown +1 -1
- data/epub-parser.gemspec +2 -0
- data/examples/exctract-content-using-cfi.rb +111 -0
- data/examples/find-elements-and-cfis.rb +54 -0
- data/lib/epub/book/features.rb +36 -29
- data/lib/epub/constants.rb +2 -1
- data/lib/epub/inspector.rb +8 -3
- data/lib/epub/metadata.rb +178 -0
- data/lib/epub/ocf/container.rb +2 -1
- data/lib/epub/ocf/metadata.rb +2 -1
- data/lib/epub/ocf/physical_container.rb +11 -2
- data/lib/epub/ocf/physical_container/archive_zip.rb +7 -5
- data/lib/epub/ocf/physical_container/unpacked_directory.rb +4 -0
- data/lib/epub/ocf/physical_container/unpacked_uri.rb +4 -0
- data/lib/epub/ocf/physical_container/zipruby.rb +17 -5
- data/lib/epub/parser.rb +12 -5
- data/lib/epub/parser/metadata.rb +67 -0
- data/lib/epub/parser/ocf.rb +19 -4
- data/lib/epub/parser/publication.rb +32 -88
- data/lib/epub/parser/version.rb +1 -1
- data/lib/epub/publication/package/metadata.rb +2 -167
- data/test/fixtures/book/META-INF/metadata.xml +6 -0
- data/test/helper.rb +3 -0
- data/test/test_epub.rb +5 -1
- data/test/test_inspect.rb +4 -4
- data/test/test_parser_fixed_layout.rb +3 -2
- data/test/test_parser_ocf.rb +16 -1
- data/test/test_parser_publication.rb +14 -13
- data/test/test_publication.rb +36 -0
- data/test/test_searcher.rb +1 -1
- metadata +35 -3
data/lib/epub/parser/ocf.rb
CHANGED
@@ -1,12 +1,14 @@
|
|
1
1
|
require 'epub/constants'
|
2
2
|
require 'epub/ocf'
|
3
3
|
require 'epub/ocf/physical_container'
|
4
|
+
require 'epub/parser/metadata'
|
4
5
|
require 'nokogiri'
|
5
6
|
|
6
7
|
module EPUB
|
7
8
|
class Parser
|
8
9
|
class OCF
|
9
10
|
include Utils
|
11
|
+
include Metadata
|
10
12
|
|
11
13
|
DIRECTORY = 'META-INF'
|
12
14
|
|
@@ -26,9 +28,7 @@ module EPUB
|
|
26
28
|
begin
|
27
29
|
data = @container.read(File.join(DIRECTORY, "#{m}.xml"))
|
28
30
|
@ocf.__send__ "#{m}=", __send__("parse_#{m}", data)
|
29
|
-
rescue EPUB::OCF::PhysicalContainer::NoEntry
|
30
|
-
rescue => error
|
31
|
-
raise error unless (Object.const_defined? :Zip and ::Zip.const_defined? :Error and error.kind_of? ::Zip::Error)
|
31
|
+
rescue EPUB::OCF::PhysicalContainer::NoEntry
|
32
32
|
end
|
33
33
|
end
|
34
34
|
|
@@ -59,7 +59,14 @@ module EPUB
|
|
59
59
|
end
|
60
60
|
|
61
61
|
def parse_metadata(content)
|
62
|
-
|
62
|
+
doc = Nokogiri.XML(content)
|
63
|
+
unless multiple_rendition_metadata?(doc)
|
64
|
+
warn "Not implemented: #{self.class}##{__method__}" if $VERBOSE
|
65
|
+
metadata = EPUB::OCF::UnknownFormatMetadata.new
|
66
|
+
metadata.content = content
|
67
|
+
return metadata
|
68
|
+
end
|
69
|
+
super(doc.root, doc.root['unique-identifier'], 'metadata')
|
63
70
|
end
|
64
71
|
|
65
72
|
def parse_rights(content)
|
@@ -69,6 +76,14 @@ module EPUB
|
|
69
76
|
def parse_signatures(content)
|
70
77
|
warn "Not implemented: #{self.class}##{__method__}" if $VERBOSE
|
71
78
|
end
|
79
|
+
|
80
|
+
private
|
81
|
+
|
82
|
+
def multiple_rendition_metadata?(doc)
|
83
|
+
doc.root &&
|
84
|
+
doc.root.name == 'metadata' &&
|
85
|
+
doc.namespaces['xmlns'] == EPUB::NAMESPACES['metadata']
|
86
|
+
end
|
72
87
|
end
|
73
88
|
end
|
74
89
|
end
|
data/lib/epub/parser/publication.rb
CHANGED
@@ -2,79 +2,56 @@ require 'strscan'
|
|
2
2
|
require 'nokogiri'
|
3
3
|
require 'epub/publication'
|
4
4
|
require 'epub/constants'
|
5
|
+
require 'epub/parser/metadata'
|
5
6
|
|
6
7
|
module EPUB
|
7
8
|
class Parser
|
8
9
|
class Publication
|
9
10
|
include Utils
|
11
|
+
include Metadata
|
10
12
|
|
11
13
|
class << self
|
12
14
|
def parse(container, file)
|
13
15
|
opf = container.read(Addressable::URI.unencode(file))
|
14
16
|
|
15
|
-
new(opf
|
17
|
+
new(opf).parse
|
16
18
|
end
|
17
19
|
end
|
18
20
|
|
19
|
-
def initialize(opf, rootfile)
|
20
|
-
|
21
|
-
@rootfile = Addressable::URI.parse(rootfile)
|
21
|
+
def initialize(opf, rootfile=nil)
|
22
|
+
warn "Second argument for #{self.class}.new is deprecated" if rootfile
|
22
23
|
@doc = Nokogiri.XML(opf)
|
23
24
|
end
|
24
25
|
|
25
26
|
def parse
|
26
|
-
|
27
|
-
|
27
|
+
package = parse_package(@doc)
|
28
|
+
(EPUB::Publication::Package::CONTENT_MODELS - [:bindings]).each do |model|
|
29
|
+
package.__send__ "#{model}=", __send__("parse_#{model}", @doc)
|
28
30
|
end
|
31
|
+
package.bindings = parse_bindings(@doc, package.manifest)
|
29
32
|
|
30
|
-
|
33
|
+
package
|
31
34
|
end
|
32
35
|
|
33
|
-
def parse_package
|
34
|
-
|
36
|
+
def parse_package(doc)
|
37
|
+
package = EPUB::Publication::Package.new
|
38
|
+
elem = doc.root
|
35
39
|
%w[version xml:lang dir id].each do |attr|
|
36
|
-
|
40
|
+
package.__send__ "#{attr.gsub(/\:/, '_')}=", extract_attribute(elem, attr)
|
37
41
|
end
|
38
|
-
|
39
|
-
|
40
|
-
EPUB::Publication.__send__ :include, EPUB::Publication::FixedLayout if @package.prefix.key? EPUB::Publication::FixedLayout::PREFIX_KEY
|
42
|
+
package.prefix = parse_prefix(extract_attribute(elem, 'prefix'))
|
43
|
+
EPUB::Publication.__send__ :include, EPUB::Publication::FixedLayout if package.prefix.key? EPUB::Publication::FixedLayout::PREFIX_KEY
|
41
44
|
|
42
|
-
|
45
|
+
package
|
43
46
|
end
|
44
47
|
|
45
|
-
def parse_metadata
|
46
|
-
|
47
|
-
elem = @doc.xpath('/opf:package/opf:metadata', EPUB::NAMESPACES).first
|
48
|
-
id_map = {}
|
49
|
-
|
50
|
-
metadata.identifiers = extract_model(elem, id_map, './dc:identifier', :Identifier, ['id']) {|identifier, e|
|
51
|
-
identifier.scheme = extract_attribute(e, 'scheme', 'opf')
|
52
|
-
metadata.unique_identifier = identifier if identifier.id == @unique_identifier_id
|
53
|
-
}
|
54
|
-
metadata.titles = extract_model(elem, id_map, './dc:title', :Title)
|
55
|
-
metadata.languages = extract_model(elem, id_map, './dc:language', :DCMES, %w[id])
|
56
|
-
%w[contributor coverage creator date description format publisher relation source subject type].each do |dcmes|
|
57
|
-
metadata.__send__ "#{dcmes}s=", extract_model(elem, id_map, "./dc:#{dcmes}")
|
58
|
-
end
|
59
|
-
metadata.rights = extract_model(elem, id_map, './dc:rights')
|
60
|
-
metadata.metas = extract_refinee(elem, id_map, './opf:meta', :Meta, %w[property id scheme])
|
61
|
-
metadata.links = extract_refinee(elem, id_map, './opf:link', :Link, %w[id media-type]) {|link, e|
|
62
|
-
link.href = extract_attribute(e, 'href')
|
63
|
-
link.rel = Set.new(extract_attribute(e, 'rel').split(nil))
|
64
|
-
}
|
65
|
-
|
66
|
-
id_map.values.each do |hsh|
|
67
|
-
next unless hsh[:refiners]
|
68
|
-
next unless hsh[:metadata]
|
69
|
-
hsh[:refiners].each {|meta| meta.refines = hsh[:metadata]}
|
70
|
-
end
|
71
|
-
|
72
|
-
metadata
|
48
|
+
def parse_metadata(doc)
|
49
|
+
super(doc.xpath('/opf:package/opf:metadata', EPUB::NAMESPACES).first, doc.root['unique-identifier'], 'opf')
|
73
50
|
end
|
74
51
|
|
75
|
-
def parse_manifest
|
76
|
-
manifest =
|
77
|
-
elem =
|
52
|
+
def parse_manifest(doc)
|
53
|
+
manifest = EPUB::Publication::Package::Manifest.new
|
54
|
+
elem = doc.xpath('/opf:package/opf:manifest', EPUB::NAMESPACES).first
|
78
55
|
manifest.id = extract_attribute(elem, 'id')
|
79
56
|
|
80
57
|
fallback_map = {}
|
@@ -97,9 +74,9 @@ module EPUB
|
|
97
74
|
manifest
|
98
75
|
end
|
99
76
|
|
100
|
-
def parse_spine
|
101
|
-
spine =
|
102
|
-
elem =
|
77
|
+
def parse_spine(doc)
|
78
|
+
spine = EPUB::Publication::Package::Spine.new
|
79
|
+
elem = doc.xpath('/opf:package/opf:spine', EPUB::NAMESPACES).first
|
103
80
|
%w[id toc page-progression-direction].each do |attr|
|
104
81
|
spine.__send__ "#{attr.gsub(/-/, '_')}=", extract_attribute(elem, attr)
|
105
82
|
end
|
@@ -118,9 +95,9 @@ module EPUB
|
|
118
95
|
spine
|
119
96
|
end
|
120
97
|
|
121
|
-
def parse_guide
|
122
|
-
guide =
|
123
|
-
|
98
|
+
def parse_guide(doc)
|
99
|
+
guide = EPUB::Publication::Package::Guide.new
|
100
|
+
doc.xpath('/opf:package/opf:guide/opf:reference', EPUB::NAMESPACES).each do |ref|
|
124
101
|
reference = EPUB::Publication::Package::Guide::Reference.new
|
125
102
|
%w[type title].each do |attr|
|
126
103
|
reference.__send__ "#{attr}=", extract_attribute(ref, attr)
|
@@ -132,12 +109,12 @@ module EPUB
|
|
132
109
|
guide
|
133
110
|
end
|
134
111
|
|
135
|
-
def parse_bindings
|
136
|
-
bindings =
|
137
|
-
|
112
|
+
def parse_bindings(doc, handler_map)
|
113
|
+
bindings = EPUB::Publication::Package::Bindings.new
|
114
|
+
doc.xpath('/opf:package/opf:bindings/opf:mediaType', EPUB::NAMESPACES).each do |elem|
|
138
115
|
media_type = EPUB::Publication::Package::Bindings::MediaType.new
|
139
116
|
media_type.media_type = extract_attribute(elem, 'media-type')
|
140
|
-
media_type.handler =
|
117
|
+
media_type.handler = handler_map[extract_attribute(elem, 'handler')]
|
141
118
|
bindings << media_type
|
142
119
|
end
|
143
120
|
|
@@ -161,39 +138,6 @@ module EPUB
|
|
161
138
|
end
|
162
139
|
prefixes
|
163
140
|
end
|
164
|
-
|
165
|
-
def extract_model(elem, id_map, xpath, klass=:DCMES, attributes=%w[id lang dir])
|
166
|
-
models = elem.xpath(xpath, EPUB::NAMESPACES).collect do |e|
|
167
|
-
model = EPUB::Publication::Package::Metadata.const_get(klass).new
|
168
|
-
attributes.each do |attr|
|
169
|
-
model.__send__ "#{attr.gsub(/-/, '_')}=", extract_attribute(e, attr)
|
170
|
-
end
|
171
|
-
model.content = e.content unless klass == :Link
|
172
|
-
|
173
|
-
yield model, e if block_given?
|
174
|
-
|
175
|
-
model
|
176
|
-
end
|
177
|
-
|
178
|
-
models.each do |model|
|
179
|
-
id_map[model.id] = {metadata: model} if model.respond_to?(:id) && model.id
|
180
|
-
end
|
181
|
-
|
182
|
-
models
|
183
|
-
end
|
184
|
-
|
185
|
-
def extract_refinee(elem, id_map, xpath, klass, attributes)
|
186
|
-
extract_model(elem, id_map, xpath, klass, attributes) {|model, e|
|
187
|
-
yield model, e if block_given?
|
188
|
-
refines = extract_attribute(e, 'refines')
|
189
|
-
if refines && refines[0] == '#'
|
190
|
-
id = refines[1..-1]
|
191
|
-
id_map[id] ||= {}
|
192
|
-
id_map[id][:refiners] ||= []
|
193
|
-
id_map[id][:refiners] << model
|
194
|
-
end
|
195
|
-
}
|
196
|
-
end
|
197
141
|
end
|
198
142
|
end
|
199
143
|
end
|
data/lib/epub/parser/version.rb
CHANGED
data/lib/epub/publication/package/metadata.rb
CHANGED
@@ -1,174 +1,9 @@
|
|
1
|
-
require '
|
1
|
+
require 'epub/metadata'
|
2
2
|
|
3
3
|
module EPUB
|
4
4
|
module Publication
|
5
5
|
class Package
|
6
|
-
|
7
|
-
include Inspector::PublicationModel
|
8
|
-
|
9
|
-
DC_ELEMS = [:identifiers, :titles, :languages] +
|
10
|
-
[:contributors, :coverages, :creators, :dates, :descriptions, :formats, :publishers,
|
11
|
-
:relations, :rights, :sources, :subjects, :types]
|
12
|
-
attr_accessor :package, :unique_identifier, :metas, :links,
|
13
|
-
*(DC_ELEMS.collect {|elem| "dc_#{elem}"})
|
14
|
-
DC_ELEMS.each do |elem|
|
15
|
-
alias_method elem, "dc_#{elem}"
|
16
|
-
alias_method "#{elem}=", "dc_#{elem}="
|
17
|
-
end
|
18
|
-
|
19
|
-
def initialize
|
20
|
-
(DC_ELEMS + [:metas, :links]).each do |elem|
|
21
|
-
__send__ "#{elem}=", []
|
22
|
-
end
|
23
|
-
end
|
24
|
-
|
25
|
-
def title
|
26
|
-
return extended_title unless extended_title.empty?
|
27
|
-
compositted = titles.select {|title| title.display_seq}.sort.join("\n")
|
28
|
-
return compositted unless compositted.empty?
|
29
|
-
return main_title unless main_title.empty?
|
30
|
-
titles.sort.join("\n")
|
31
|
-
end
|
32
|
-
|
33
|
-
%w[main short collection edition extended].each do |type|
|
34
|
-
define_method "#{type}_title" do
|
35
|
-
titles.select {|title| title.title_type.to_s == type}.sort.join(' ')
|
36
|
-
end
|
37
|
-
end
|
38
|
-
|
39
|
-
def subtitle
|
40
|
-
titles.select {|title| title.title_type.to_s == 'subtitle'}.sort.join(' ')
|
41
|
-
end
|
42
|
-
|
43
|
-
def description
|
44
|
-
descriptions.join(' ')
|
45
|
-
end
|
46
|
-
|
47
|
-
def date
|
48
|
-
dates.first
|
49
|
-
end
|
50
|
-
|
51
|
-
def language
|
52
|
-
languages.first
|
53
|
-
end
|
54
|
-
|
55
|
-
def modified
|
56
|
-
metas.find {|meta| meta.property == 'dcterms:modified'}
|
57
|
-
end
|
58
|
-
|
59
|
-
def to_h
|
60
|
-
DC_ELEMS.inject({}) do |hsh, elem|
|
61
|
-
hsh[elem] = __send__(elem)
|
62
|
-
hsh
|
63
|
-
end
|
64
|
-
end
|
65
|
-
|
66
|
-
def primary_metas
|
67
|
-
metas.select {|meta| meta.primary_expression?}
|
68
|
-
end
|
69
|
-
|
70
|
-
module Refinee
|
71
|
-
PROPERTIES = %w[alternate-script display-seq file-as group-position identifier-type meta-auth role title-type]
|
72
|
-
|
73
|
-
attr_writer :refiners
|
74
|
-
|
75
|
-
def refiners
|
76
|
-
@refiners ||= Set.new
|
77
|
-
end
|
78
|
-
|
79
|
-
PROPERTIES.each do |voc|
|
80
|
-
met = voc.gsub(/-/, '_')
|
81
|
-
attr_writer met
|
82
|
-
define_method met do
|
83
|
-
refiners.find {|refiner| refiner.property == voc}
|
84
|
-
end
|
85
|
-
end
|
86
|
-
end
|
87
|
-
|
88
|
-
class DCMES
|
89
|
-
include Refinee
|
90
|
-
|
91
|
-
attr_accessor :content, :id, :lang, :dir
|
92
|
-
|
93
|
-
def to_s
|
94
|
-
content.to_s
|
95
|
-
end
|
96
|
-
end
|
97
|
-
|
98
|
-
class Identifier < DCMES
|
99
|
-
# @note This is ad-hoc
|
100
|
-
# @todo Define and include OPF module for opf:scheme attribute
|
101
|
-
# @todo Define general way to handle with identifier-type refiners
|
102
|
-
attr_accessor :scheme
|
103
|
-
|
104
|
-
# @note This is ad-hoc
|
105
|
-
# @todo Define and include OPF module for opf:scheme attribute
|
106
|
-
# @todo Define general way to handle with identifier-type refiners
|
107
|
-
def isbn?
|
108
|
-
scheme == 'ISBN' or
|
109
|
-
content.to_s.downcase.start_with? 'urn:isbn' or
|
110
|
-
refiners.any? {|refiner|
|
111
|
-
refiner.property == 'identifier-type' and
|
112
|
-
refiner.scheme == 'onix:codelist5' and
|
113
|
-
%w[02 15].include? refiner.content
|
114
|
-
}
|
115
|
-
end
|
116
|
-
end
|
117
|
-
|
118
|
-
class Title < DCMES
|
119
|
-
include Comparable
|
120
|
-
|
121
|
-
def <=>(other)
|
122
|
-
return 1 if other.display_seq.nil?
|
123
|
-
return -1 if display_seq.nil?
|
124
|
-
display_seq.to_s.to_i <=> other.display_seq.to_s.to_i
|
125
|
-
end
|
126
|
-
end
|
127
|
-
|
128
|
-
class Meta
|
129
|
-
include Refinee
|
130
|
-
|
131
|
-
attr_accessor :property, :id, :scheme, :content
|
132
|
-
attr_reader :refines
|
133
|
-
|
134
|
-
def refines=(refinee)
|
135
|
-
refinee.refiners << self
|
136
|
-
@refines = refinee
|
137
|
-
end
|
138
|
-
|
139
|
-
def refines?
|
140
|
-
! refines.nil?
|
141
|
-
end
|
142
|
-
alias subexpression? refines?
|
143
|
-
|
144
|
-
def primary_expression?
|
145
|
-
! subexpression?
|
146
|
-
end
|
147
|
-
|
148
|
-
def inspect
|
149
|
-
ivs = instance_variables.map {|iv|
|
150
|
-
[iv, instance_variable_get(iv).inspect].join('=')
|
151
|
-
}.join(' ')
|
152
|
-
'<#%s:%#0x %s>' % [self.class, __id__, ivs]
|
153
|
-
end
|
154
|
-
|
155
|
-
def to_s
|
156
|
-
content.to_s
|
157
|
-
end
|
158
|
-
end
|
159
|
-
|
160
|
-
class Link
|
161
|
-
include Refinee
|
162
|
-
|
163
|
-
attr_accessor :href, :rel, :id, :media_type
|
164
|
-
attr_reader :refines
|
165
|
-
|
166
|
-
def refines=(refinee)
|
167
|
-
refinee.refiners << self
|
168
|
-
@refines = refinee
|
169
|
-
end
|
170
|
-
end
|
171
|
-
end
|
6
|
+
Metadata = EPUB::Metadata
|
172
7
|
end
|
173
8
|
end
|
174
9
|
end
|
data/test/fixtures/book/META-INF/metadata.xml
ADDED
@@ -0,0 +1,6 @@
|
|
1
|
+
<metadata xmlns="http://www.idpf.org/2013/metadata"
|
2
|
+
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
3
|
+
unique-identifier="pub-id">
|
4
|
+
<dc:identifier id="pub-id">urn:uuid:A1B0D67E-2E81-4DF5-9E67-A64CBE366809</dc:identifier>
|
5
|
+
<meta property="dcterms:modified">2011-01-01T12:00:00Z</meta>
|
6
|
+
</metadata>
|
data/test/helper.rb
CHANGED
data/test/test_epub.rb
CHANGED
@@ -17,5 +17,9 @@ class TestEUPB < Test::Unit::TestCase
|
|
17
17
|
assert_kind_of EPUB::Publication::Package::Manifest::Item, entry
|
18
18
|
end
|
19
19
|
end
|
20
|
-
end
|
21
20
|
|
21
|
+
def test_book_has_multiple_packages
|
22
|
+
book = EPUB::Parser.parse(@file)
|
23
|
+
assert_instance_of Array, book.packages
|
24
|
+
end
|
25
|
+
end
|