epub-parser 0.2.5 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +3 -3
- data/.yardopts +2 -0
- data/CHANGELOG.markdown +14 -1
- data/README.markdown +15 -29
- data/Rakefile +39 -4
- data/docs/FixedLayout.markdown +1 -1
- data/docs/Item.markdown +1 -1
- data/epub-parser.gemspec +2 -0
- data/examples/exctract-content-using-cfi.rb +111 -0
- data/examples/find-elements-and-cfis.rb +54 -0
- data/lib/epub/book/features.rb +36 -29
- data/lib/epub/constants.rb +2 -1
- data/lib/epub/inspector.rb +8 -3
- data/lib/epub/metadata.rb +178 -0
- data/lib/epub/ocf/container.rb +2 -1
- data/lib/epub/ocf/metadata.rb +2 -1
- data/lib/epub/ocf/physical_container.rb +11 -2
- data/lib/epub/ocf/physical_container/archive_zip.rb +7 -5
- data/lib/epub/ocf/physical_container/unpacked_directory.rb +4 -0
- data/lib/epub/ocf/physical_container/unpacked_uri.rb +4 -0
- data/lib/epub/ocf/physical_container/zipruby.rb +17 -5
- data/lib/epub/parser.rb +12 -5
- data/lib/epub/parser/metadata.rb +67 -0
- data/lib/epub/parser/ocf.rb +19 -4
- data/lib/epub/parser/publication.rb +32 -88
- data/lib/epub/parser/version.rb +1 -1
- data/lib/epub/publication/package/metadata.rb +2 -167
- data/test/fixtures/book/META-INF/metadata.xml +6 -0
- data/test/helper.rb +3 -0
- data/test/test_epub.rb +5 -1
- data/test/test_inspect.rb +4 -4
- data/test/test_parser_fixed_layout.rb +3 -2
- data/test/test_parser_ocf.rb +16 -1
- data/test/test_parser_publication.rb +14 -13
- data/test/test_publication.rb +36 -0
- data/test/test_searcher.rb +1 -1
- metadata +35 -3
data/lib/epub/constants.rb
CHANGED
@@ -7,7 +7,8 @@ module EPUB
|
|
7
7
|
'epub' => 'http://www.idpf.org/2007/ops',
|
8
8
|
'm' => 'http://www.w3.org/1998/Math/MathML',
|
9
9
|
'svg' => 'http://www.w3.org/2000/svg',
|
10
|
-
'smil' => 'http://www.w3.org/ns/SMIL'
|
10
|
+
'smil' => 'http://www.w3.org/ns/SMIL',
|
11
|
+
'metadata' => 'http://www.idpf.org/2013/metadata'
|
11
12
|
}
|
12
13
|
|
13
14
|
module MediaType
|
data/lib/epub/inspector.rb
CHANGED
@@ -25,7 +25,6 @@ module EPUB
|
|
25
25
|
end
|
26
26
|
|
27
27
|
module PublicationModel
|
28
|
-
TEMPLATE = "#<%{class}:%{object_id} @package=%{package} %{attributes}>"
|
29
28
|
class << self
|
30
29
|
def included(mod)
|
31
30
|
mod.__send__ :include, Inspector
|
@@ -33,13 +32,19 @@ module EPUB
|
|
33
32
|
end
|
34
33
|
|
35
34
|
def inspect
|
36
|
-
|
35
|
+
template % {
|
37
36
|
:class => self.class,
|
38
|
-
:package => package.inspect_simply,
|
37
|
+
:package => (package && package.inspect_simply),
|
39
38
|
:object_id => inspect_object_id,
|
40
39
|
:attributes => inspect_instance_variables(exclude: [:@package])
|
41
40
|
}
|
42
41
|
end
|
42
|
+
|
43
|
+
def template
|
44
|
+
t = "#<%{class}:%{object_id}"
|
45
|
+
t << " @package=%{package}" if package
|
46
|
+
t << " %{attributes}>"
|
47
|
+
end
|
43
48
|
end
|
44
49
|
end
|
45
50
|
end
|
data/lib/epub/metadata.rb
ADDED
@@ -0,0 +1,178 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
module EPUB
|
4
|
+
class Metadata
|
5
|
+
include Inspector::PublicationModel
|
6
|
+
|
7
|
+
DC_ELEMS = [:identifiers, :titles, :languages] +
|
8
|
+
[:contributors, :coverages, :creators, :dates, :descriptions, :formats, :publishers,
|
9
|
+
:relations, :rights, :sources, :subjects, :types]
|
10
|
+
attr_accessor :package, :unique_identifier, :metas, :links,
|
11
|
+
*(DC_ELEMS.collect {|elem| "dc_#{elem}"})
|
12
|
+
DC_ELEMS.each do |elem|
|
13
|
+
alias_method elem, "dc_#{elem}"
|
14
|
+
alias_method "#{elem}=", "dc_#{elem}="
|
15
|
+
end
|
16
|
+
|
17
|
+
def initialize
|
18
|
+
(DC_ELEMS + [:metas, :links]).each do |elem|
|
19
|
+
__send__ "#{elem}=", []
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
def release_identifier
|
24
|
+
"#{unique_identifier}@#{modified}"
|
25
|
+
end
|
26
|
+
alias package_identifier release_identifier
|
27
|
+
|
28
|
+
def title
|
29
|
+
return extended_title unless extended_title.empty?
|
30
|
+
compositted = titles.select {|title| title.display_seq}.sort.join("\n")
|
31
|
+
return compositted unless compositted.empty?
|
32
|
+
return main_title unless main_title.empty?
|
33
|
+
titles.sort.join("\n")
|
34
|
+
end
|
35
|
+
|
36
|
+
%w[main short collection edition extended].each do |type|
|
37
|
+
define_method "#{type}_title" do
|
38
|
+
titles.select {|title| title.title_type.to_s == type}.sort.join(' ')
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
def subtitle
|
43
|
+
titles.select {|title| title.title_type.to_s == 'subtitle'}.sort.join(' ')
|
44
|
+
end
|
45
|
+
|
46
|
+
def description
|
47
|
+
descriptions.join(' ')
|
48
|
+
end
|
49
|
+
|
50
|
+
def date
|
51
|
+
dates.first
|
52
|
+
end
|
53
|
+
|
54
|
+
def language
|
55
|
+
languages.first
|
56
|
+
end
|
57
|
+
|
58
|
+
def modified
|
59
|
+
metas.find {|meta|
|
60
|
+
meta.property == 'dcterms:modified' &&
|
61
|
+
meta.refiners.empty?
|
62
|
+
}
|
63
|
+
end
|
64
|
+
|
65
|
+
def to_h
|
66
|
+
DC_ELEMS.inject({}) do |hsh, elem|
|
67
|
+
hsh[elem] = __send__(elem)
|
68
|
+
hsh
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
def primary_metas
|
73
|
+
metas.select {|meta| meta.primary_expression?}
|
74
|
+
end
|
75
|
+
|
76
|
+
module Refinee
|
77
|
+
PROPERTIES = %w[alternate-script display-seq file-as group-position identifier-type meta-auth role title-type]
|
78
|
+
|
79
|
+
attr_writer :refiners
|
80
|
+
|
81
|
+
def refiners
|
82
|
+
@refiners ||= Set.new
|
83
|
+
end
|
84
|
+
|
85
|
+
PROPERTIES.each do |voc|
|
86
|
+
met = voc.gsub(/-/, '_')
|
87
|
+
attr_writer met
|
88
|
+
define_method met do
|
89
|
+
refiners.find {|refiner| refiner.property == voc}
|
90
|
+
end
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
class DCMES
|
95
|
+
include Refinee
|
96
|
+
|
97
|
+
attr_accessor :content, :id, :lang, :dir
|
98
|
+
|
99
|
+
def to_s
|
100
|
+
content.to_s
|
101
|
+
end
|
102
|
+
end
|
103
|
+
|
104
|
+
class Identifier < DCMES
|
105
|
+
# @note This is ad-hoc
|
106
|
+
# @todo Define and include OPF module for opf:scheme attribute
|
107
|
+
# @todo Define general way to handle with identifier-type refiners
|
108
|
+
attr_accessor :scheme
|
109
|
+
|
110
|
+
# @note This is ad-hoc
|
111
|
+
# @todo Define and include OPF module for opf:scheme attribute
|
112
|
+
# @todo Define general way to handle with identifier-type refiners
|
113
|
+
def isbn?
|
114
|
+
scheme == 'ISBN' or
|
115
|
+
content.to_s.downcase.start_with? 'urn:isbn' or
|
116
|
+
refiners.any? {|refiner|
|
117
|
+
refiner.property == 'identifier-type' and
|
118
|
+
refiner.scheme == 'onix:codelist5' and
|
119
|
+
%w[02 15].include? refiner.content
|
120
|
+
}
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
124
|
+
class Title < DCMES
|
125
|
+
include Comparable
|
126
|
+
|
127
|
+
def <=>(other)
|
128
|
+
return 1 if other.display_seq.nil?
|
129
|
+
return -1 if display_seq.nil?
|
130
|
+
display_seq.to_s.to_i <=> other.display_seq.to_s.to_i
|
131
|
+
end
|
132
|
+
end
|
133
|
+
|
134
|
+
class Meta
|
135
|
+
include Refinee
|
136
|
+
|
137
|
+
attr_accessor :property, :id, :scheme, :content
|
138
|
+
attr_reader :refines
|
139
|
+
|
140
|
+
def refines=(refinee)
|
141
|
+
refinee.refiners << self
|
142
|
+
@refines = refinee
|
143
|
+
end
|
144
|
+
|
145
|
+
def refines?
|
146
|
+
! refines.nil?
|
147
|
+
end
|
148
|
+
alias subexpression? refines?
|
149
|
+
|
150
|
+
def primary_expression?
|
151
|
+
! subexpression?
|
152
|
+
end
|
153
|
+
|
154
|
+
def inspect
|
155
|
+
ivs = instance_variables.map {|iv|
|
156
|
+
[iv, instance_variable_get(iv).inspect].join('=')
|
157
|
+
}.join(' ')
|
158
|
+
'<#%s:%#0x %s>' % [self.class, __id__, ivs]
|
159
|
+
end
|
160
|
+
|
161
|
+
def to_s
|
162
|
+
content.to_s
|
163
|
+
end
|
164
|
+
end
|
165
|
+
|
166
|
+
class Link
|
167
|
+
include Refinee
|
168
|
+
|
169
|
+
attr_accessor :href, :rel, :id, :media_type
|
170
|
+
attr_reader :refines
|
171
|
+
|
172
|
+
def refines=(refinee)
|
173
|
+
refinee.refiners << self
|
174
|
+
@refines = refinee
|
175
|
+
end
|
176
|
+
end
|
177
|
+
end
|
178
|
+
end
|
data/lib/epub/ocf/container.rb
CHANGED
data/lib/epub/ocf/metadata.rb
CHANGED
data/lib/epub/ocf/physical_container.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'monitor'
|
1
2
|
require 'epub/ocf/physical_container/archive_zip'
|
2
3
|
require 'epub/ocf/physical_container/unpacked_directory'
|
3
4
|
require 'epub/ocf/physical_container/unpacked_uri'
|
@@ -11,6 +12,14 @@ module EPUB
|
|
11
12
|
@adapter = ArchiveZip
|
12
13
|
|
13
14
|
class << self
|
15
|
+
def find_adapter(adapter)
|
16
|
+
return adapter if adapter.instance_of? Class
|
17
|
+
if adapter == :Zipruby && ! const_defined?(adapter)
|
18
|
+
require 'epub/ocf/physical_container/zipruby'
|
19
|
+
end
|
20
|
+
const_get adapter
|
21
|
+
end
|
22
|
+
|
14
23
|
def adapter
|
15
24
|
raise NoMethodError, "undefined method `#{__method__}' for #{self}" unless self == PhysicalContainer
|
16
25
|
@adapter
|
@@ -18,8 +27,7 @@ module EPUB
|
|
18
27
|
|
19
28
|
def adapter=(adapter)
|
20
29
|
raise NoMethodError, "undefined method `#{__method__}' for #{self}" unless self == PhysicalContainer
|
21
|
-
@adapter =
|
22
|
-
adapter
|
30
|
+
@adapter = find_adapter(adapter)
|
23
31
|
end
|
24
32
|
|
25
33
|
def open(container_path)
|
@@ -43,6 +51,7 @@ module EPUB
|
|
43
51
|
|
44
52
|
def initialize(container_path)
|
45
53
|
@container_path = container_path
|
54
|
+
@monitor = Monitor.new
|
46
55
|
end
|
47
56
|
end
|
48
57
|
end
|
data/lib/epub/ocf/physical_container/archive_zip.rb
CHANGED
@@ -12,11 +12,13 @@ module EPUB
|
|
12
12
|
|
13
13
|
def open
|
14
14
|
Archive::Zip.open @container_path do |archive|
|
15
|
-
@
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
15
|
+
@monitor.synchronize do
|
16
|
+
@archive = archive
|
17
|
+
begin
|
18
|
+
yield self
|
19
|
+
ensure
|
20
|
+
@archive = nil
|
21
|
+
end
|
20
22
|
end
|
21
23
|
end
|
22
24
|
end
|
data/lib/epub/ocf/physical_container/zipruby.rb
CHANGED
@@ -6,11 +6,17 @@ module EPUB
|
|
6
6
|
class Zipruby < self
|
7
7
|
def open
|
8
8
|
Zip::Archive.open @container_path do |archive|
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
9
|
+
@monitor.synchronize do
|
10
|
+
begin
|
11
|
+
@archive = archive
|
12
|
+
yield self
|
13
|
+
rescue ::Zip::Error => error
|
14
|
+
no_entry = NoEntry.new(error.message)
|
15
|
+
no_entry.set_backtrace error.backtrace
|
16
|
+
raise no_entry
|
17
|
+
ensure
|
18
|
+
@archive = nil
|
19
|
+
end
|
14
20
|
end
|
15
21
|
end
|
16
22
|
end
|
@@ -21,6 +27,12 @@ module EPUB
|
|
21
27
|
else
|
22
28
|
open {|container| container.read(path_name)}
|
23
29
|
end
|
30
|
+
rescue ::Zip::Error => error
|
31
|
+
no_entry = NoEntry.new(error.message)
|
32
|
+
no_entry.set_backtrace error.backtrace
|
33
|
+
raise no_entry
|
34
|
+
ensure
|
35
|
+
@archive = nil
|
24
36
|
end
|
25
37
|
end
|
26
38
|
end
|
data/lib/epub/parser.rb
CHANGED
@@ -46,22 +46,29 @@ module EPUB
|
|
46
46
|
options[:container_adapter] == :UnpackedURI or
|
47
47
|
EPUB::OCF::PhysicalContainer.adapter == EPUB::OCF::PhysicalContainer::UnpackedURI)
|
48
48
|
|
49
|
-
raise "File #{filepath} not
|
50
|
-
!path_is_uri and !File.
|
49
|
+
raise "File #{filepath} not found" if
|
50
|
+
!path_is_uri and !File.exist?(filepath)
|
51
51
|
|
52
52
|
@filepath = path_is_uri ? filepath : File.realpath(filepath)
|
53
53
|
@book = create_book(options)
|
54
|
+
if File.directory? @filepath
|
55
|
+
@book.container_adapter = :UnpackedDirectory
|
56
|
+
end
|
54
57
|
@book.epub_file = @filepath
|
55
58
|
if options[:container_adapter]
|
56
|
-
|
57
|
-
@book.container_adapter = adapter
|
59
|
+
@book.container_adapter = options[:container_adapter]
|
58
60
|
end
|
59
61
|
end
|
60
62
|
|
61
63
|
def parse
|
62
64
|
@book.container_adapter.open @filepath do |container|
|
63
65
|
@book.ocf = OCF.parse(container)
|
64
|
-
@book.
|
66
|
+
@book.ocf.container.rootfiles.each {|rootfile|
|
67
|
+
package = Publication.parse(container, rootfile.full_path.to_s)
|
68
|
+
rootfile.package = package
|
69
|
+
@book.packages << package
|
70
|
+
package.book = @book
|
71
|
+
}
|
65
72
|
end
|
66
73
|
|
67
74
|
@book
|
data/lib/epub/parser/metadata.rb
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
module EPUB
|
2
|
+
class Parser
|
3
|
+
module Metadata
|
4
|
+
def parse_metadata(elem, unique_identifier_id, default_namespace)
|
5
|
+
metadata = EPUB::Publication::Package::Metadata.new
|
6
|
+
id_map = {}
|
7
|
+
|
8
|
+
metadata.identifiers = extract_model(elem, id_map, './dc:identifier', :Identifier, ['id']) {|identifier, e|
|
9
|
+
identifier.scheme = extract_attribute(e, 'scheme', 'opf')
|
10
|
+
metadata.unique_identifier = identifier if identifier.id == unique_identifier_id
|
11
|
+
}
|
12
|
+
metadata.titles = extract_model(elem, id_map, './dc:title', :Title)
|
13
|
+
metadata.languages = extract_model(elem, id_map, './dc:language', :DCMES, %w[id])
|
14
|
+
%w[contributor coverage creator date description format publisher relation source subject type].each do |dcmes|
|
15
|
+
metadata.__send__ "#{dcmes}s=", extract_model(elem, id_map, "./dc:#{dcmes}")
|
16
|
+
end
|
17
|
+
metadata.rights = extract_model(elem, id_map, './dc:rights')
|
18
|
+
metadata.metas = extract_refinee(elem, id_map, "./#{default_namespace}:meta", :Meta, %w[property id scheme])
|
19
|
+
metadata.links = extract_refinee(elem, id_map, "./#{default_namespace}:link", :Link, %w[id media-type]) {|link, e|
|
20
|
+
link.href = extract_attribute(e, 'href')
|
21
|
+
link.rel = Set.new(extract_attribute(e, 'rel').split(nil))
|
22
|
+
}
|
23
|
+
|
24
|
+
id_map.values.each do |hsh|
|
25
|
+
next unless hsh[:refiners]
|
26
|
+
next unless hsh[:metadata]
|
27
|
+
hsh[:refiners].each {|meta| meta.refines = hsh[:metadata]}
|
28
|
+
end
|
29
|
+
|
30
|
+
metadata
|
31
|
+
end
|
32
|
+
|
33
|
+
def extract_model(elem, id_map, xpath, klass=:DCMES, attributes=%w[id lang dir])
|
34
|
+
models = elem.xpath(xpath, EPUB::NAMESPACES).collect do |e|
|
35
|
+
model = EPUB::Publication::Package::Metadata.const_get(klass).new
|
36
|
+
attributes.each do |attr|
|
37
|
+
model.__send__ "#{attr.gsub(/-/, '_')}=", extract_attribute(e, attr)
|
38
|
+
end
|
39
|
+
model.content = e.content unless klass == :Link
|
40
|
+
|
41
|
+
yield model, e if block_given?
|
42
|
+
|
43
|
+
model
|
44
|
+
end
|
45
|
+
|
46
|
+
models.each do |model|
|
47
|
+
id_map[model.id] = {metadata: model} if model.respond_to?(:id) && model.id
|
48
|
+
end
|
49
|
+
|
50
|
+
models
|
51
|
+
end
|
52
|
+
|
53
|
+
def extract_refinee(elem, id_map, xpath, klass, attributes)
|
54
|
+
extract_model(elem, id_map, xpath, klass, attributes) {|model, e|
|
55
|
+
yield model, e if block_given?
|
56
|
+
refines = extract_attribute(e, 'refines')
|
57
|
+
if refines && refines[0] == '#'
|
58
|
+
id = refines[1..-1]
|
59
|
+
id_map[id] ||= {}
|
60
|
+
id_map[id][:refiners] ||= []
|
61
|
+
id_map[id][:refiners] << model
|
62
|
+
end
|
63
|
+
}
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|