epub-parser-io 0.1.6a

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. data/.gemtest +0 -0
  2. data/.gitignore +12 -0
  3. data/.gitmodules +3 -0
  4. data/.travis.yml +4 -0
  5. data/.yardopts +10 -0
  6. data/CHANGELOG.markdown +61 -0
  7. data/Gemfile +2 -0
  8. data/MIT-LICENSE +7 -0
  9. data/README.markdown +174 -0
  10. data/Rakefile +68 -0
  11. data/bin/epub-open +25 -0
  12. data/bin/epubinfo +64 -0
  13. data/docs/EpubOpen.markdown +43 -0
  14. data/docs/Epubinfo.markdown +37 -0
  15. data/docs/FixedLayout.markdown +96 -0
  16. data/docs/Home.markdown +128 -0
  17. data/docs/Item.markdown +80 -0
  18. data/docs/Navigation.markdown +58 -0
  19. data/docs/Publication.markdown +54 -0
  20. data/epub-parser.gemspec +49 -0
  21. data/features/epubinfo.feature +6 -0
  22. data/features/step_definitions/epubinfo_steps.rb +5 -0
  23. data/features/support/env.rb +1 -0
  24. data/lib/epub/book/features.rb +85 -0
  25. data/lib/epub/book.rb +7 -0
  26. data/lib/epub/constants.rb +48 -0
  27. data/lib/epub/content_document/navigation.rb +104 -0
  28. data/lib/epub/content_document/xhtml.rb +41 -0
  29. data/lib/epub/content_document.rb +2 -0
  30. data/lib/epub/inspector.rb +45 -0
  31. data/lib/epub/ocf/container.rb +28 -0
  32. data/lib/epub/ocf/encryption.rb +7 -0
  33. data/lib/epub/ocf/manifest.rb +6 -0
  34. data/lib/epub/ocf/metadata.rb +6 -0
  35. data/lib/epub/ocf/rights.rb +6 -0
  36. data/lib/epub/ocf/signatures.rb +6 -0
  37. data/lib/epub/ocf.rb +8 -0
  38. data/lib/epub/parser/content_document.rb +111 -0
  39. data/lib/epub/parser/ocf.rb +73 -0
  40. data/lib/epub/parser/publication.rb +200 -0
  41. data/lib/epub/parser/utils.rb +20 -0
  42. data/lib/epub/parser/version.rb +5 -0
  43. data/lib/epub/parser.rb +103 -0
  44. data/lib/epub/publication/fixed_layout.rb +208 -0
  45. data/lib/epub/publication/package/bindings.rb +31 -0
  46. data/lib/epub/publication/package/guide.rb +51 -0
  47. data/lib/epub/publication/package/manifest.rb +180 -0
  48. data/lib/epub/publication/package/metadata.rb +170 -0
  49. data/lib/epub/publication/package/spine.rb +106 -0
  50. data/lib/epub/publication/package.rb +68 -0
  51. data/lib/epub/publication.rb +2 -0
  52. data/lib/epub.rb +14 -0
  53. data/man/epubinfo.1.ronn +19 -0
  54. data/schemas/epub-nav-30.rnc +10 -0
  55. data/schemas/epub-nav-30.sch +72 -0
  56. data/schemas/epub-xhtml-30.sch +377 -0
  57. data/schemas/ocf-container-30.rnc +16 -0
  58. data/test/fixtures/book/META-INF/container.xml +6 -0
  59. data/test/fixtures/book/OPS/%E6%97%A5%E6%9C%AC%E8%AA%9E.xhtml +10 -0
  60. data/test/fixtures/book/OPS/case-sensitive.xhtml +9 -0
  61. data/test/fixtures/book/OPS/containing space.xhtml +10 -0
  62. data/test/fixtures/book/OPS/containing%20space.xhtml +10 -0
  63. data/test/fixtures/book/OPS/nav.xhtml +28 -0
  64. data/test/fixtures/book/OPS//343/203/253/343/203/274/343/203/210/343/203/225/343/202/241/343/202/244/343/203/253.opf +119 -0
  65. data/test/fixtures/book/OPS//346/227/245/346/234/254/350/252/236.xhtml +10 -0
  66. data/test/fixtures/book/mimetype +1 -0
  67. data/test/helper.rb +9 -0
  68. data/test/test_content_document.rb +92 -0
  69. data/test/test_epub.rb +21 -0
  70. data/test/test_fixed_layout.rb +257 -0
  71. data/test/test_inspect.rb +121 -0
  72. data/test/test_parser.rb +60 -0
  73. data/test/test_parser_content_document.rb +36 -0
  74. data/test/test_parser_fixed_layout.rb +16 -0
  75. data/test/test_parser_ocf.rb +38 -0
  76. data/test/test_parser_publication.rb +247 -0
  77. data/test/test_publication.rb +324 -0
  78. metadata +445 -0
@@ -0,0 +1,28 @@
1
module EPUB
  class OCF
    # Model of the OCF container document: holds the list of rootfiles.
    class Container
      FILE = 'container.xml'

      # A single rootfile entry: its path and its media type.
      class Rootfile
        attr_accessor :full_path, :media_type

        # @param full_path [Addressable::URI|nil]
        # @param media_type [String]
        def initialize(full_path=nil, media_type=EPUB::MediaType::ROOTFILE)
          @full_path = full_path
          @media_type = media_type
        end
      end

      attr_reader :rootfiles

      # Starts with an empty rootfile list; callers append to +rootfiles+.
      def initialize
        @rootfiles = []
      end

      # Shortcut for the first rootfile.
      #
      # @return [Rootfile, nil] nil while the list is empty
      def rootfile
        rootfiles.first
      end
    end
  end
end
@@ -0,0 +1,7 @@
1
module EPUB
  class OCF
    # Value holder for the encryption document; +content+ is stored as given.
    class Encryption
      attr_reader :content
      attr_writer :content
    end
  end
end
@@ -0,0 +1,6 @@
1
module EPUB
  class OCF
    # Placeholder type: no attributes or behavior are defined.
    class Manifest; end
  end
end
@@ -0,0 +1,6 @@
1
module EPUB
  class OCF
    # Placeholder type: no attributes or behavior are defined.
    class Metadata; end
  end
end
@@ -0,0 +1,6 @@
1
module EPUB
  class OCF
    # Placeholder type: no attributes or behavior are defined.
    class Rights; end
  end
end
@@ -0,0 +1,6 @@
1
module EPUB
  class OCF
    # Placeholder type: no attributes or behavior are defined.
    class Signatures; end
  end
end
data/lib/epub/ocf.rb ADDED
@@ -0,0 +1,8 @@
1
module EPUB
  # Aggregate of the OCF components, one accessor per entry of MODULES,
  # plus a +book+ accessor.
  class OCF
    # Component names. Each one names a file under epub/ocf/ that is required
    # below and an accessor defined on this class. Frozen so the shared list
    # cannot be mutated by accident after the accessors were generated from it.
    MODULES = %w[container encryption manifest metadata rights signatures].freeze
    MODULES.each {|m| require "epub/ocf/#{m}"}

    attr_accessor :book, *MODULES
  end
end
@@ -0,0 +1,111 @@
1
+ require 'epub/content_document'
2
+ require 'epub/constants'
3
+ require 'epub/parser/utils'
4
+ require 'nokogiri'
5
+
6
+ module EPUB
7
+ class Parser
8
+ class ContentDocument
9
+ include Utils
10
+
11
# Remembers the manifest item whose content +parse+ will read.
#
# @param [EPUB::Publication::Package::Manifest::Item] item
def initialize(item)
  @item = item
end
15
+
16
# Builds the content document object matching the item's media type.
#
# XHTML items become Navigation (when the item is a nav) or XHTML objects,
# SVG items become SVG objects; any other media type yields nil.
#
# @return [Object, nil] the content document, or nil for unsupported media types
def parse
  content_document =
    case @item.media_type
    when 'application/xhtml+xml'
      @item.nav? ? EPUB::ContentDocument::Navigation.new : EPUB::ContentDocument::XHTML.new
    when 'image/svg+xml'
      EPUB::ContentDocument::SVG.new
    end
  return nil if content_document.nil?

  content_document.item = @item
  # The item is read and parsed even when only the nav branch uses the result.
  document = Nokogiri.XML(@item.read)
  # parse_content_document(document)
  content_document.navigations = parse_navigations(document) if @item.nav?
  content_document
end
38
+
39
# Collects every nav element found below the XHTML body and parses each one.
#
# @param [Nokogiri::HTML::Document] document HTML document or element including nav
# @return [Array<EPUB::ContentDocument::Navigation::Nav>] navs array of Nav object
def parse_navigations(document)
  nav_elements = document.search('/xhtml:html/xhtml:body//xhtml:nav', EPUB::NAMESPACES)
  nav_elements.map {|nav_element| parse_navigation(nav_element)}
end
44
+
45
# Converts one nav element into a Navigation object: heading text, hidden
# flag, epub:type and the items of its direct ol/li children.
#
# @param [Nokogiri::XML::Element] element nav element
# @return [EPUB::ContentDocument::Navigation::Nav] nav Nav object
def parse_navigation(element)
  nav = EPUB::ContentDocument::Navigation::Navigation.new
  nav.text = find_heading(element)
  # Presence of the attribute is what matters; its value is ignored.
  nav.hidden = extract_attribute(element, 'hidden').nil? ? nil : true
  nav.type = extract_attribute(element, 'type', 'epub')

  list_items = element.xpath('./xhtml:ol/xhtml:li', EPUB::NAMESPACES)
  list_items.each {|li| nav.items << parse_navigation_item(li)}

  nav
end
59
+
60
# Builds a navigation Item from one li element, recursing into a nested ol.
#
# The label comes from the first a or span child. For an a element with no
# text, a text alternative is taken from an embedded iframe (name, then
# srcdoc), img (alt) or object (name), and finally from the a's title.
#
# @param [Nokogiri::XML::Element] element li element
# @return [EPUB::ContentDocument::Navigation::Item, nil] nil when the li has
#   neither an a nor a span child
def parse_navigation_item(element)
  item = EPUB::ContentDocument::Navigation::Item.new
  a_or_span = element.xpath('./xhtml:a[1]|xhtml:span[1]', EPUB::NAMESPACES).first
  return nil if a_or_span.nil?

  item.text = a_or_span.text
  if a_or_span.name == 'a'
    if item.text.empty?
      embedded_content = a_or_span.xpath('./xhtml:audio[1]|xhtml:canvas[1]|xhtml:embed[1]|xhtml:iframe[1]|xhtml:img[1]|xhtml:math[1]|xhtml:object[1]|xhtml:svg[1]|xhtml:video[1]', EPUB::NAMESPACES).first
      unless embedded_content.nil?
        # Only these three element kinds supply a label; audio, canvas, embed,
        # math, svg and video leave the text untouched.
        case embedded_content.name
        when 'iframe'
          item.text = extract_attribute(embedded_content, 'name') || extract_attribute(embedded_content, 'srcdoc')
        when 'img'
          item.text = extract_attribute(embedded_content, 'alt')
        when 'object'
          item.text = extract_attribute(embedded_content, 'name')
        end
      end
      item.text = extract_attribute(a_or_span, 'title').to_s if item.text.nil? || item.text.empty?
    end
    item.href = Addressable::URI.parse(extract_attribute(a_or_span, 'href'))
    # Resolve the manifest item the link points at. +find+ replaces the former
    # +selector {...}.first+, which is not an Array/Enumerable method.
    # NOTE(review): assumes manifest.items is Enumerable -- confirm against Manifest.
    # An a element without href leaves item.item unset instead of raising.
    unless item.href.nil?
      item.item = @item.manifest.items.find {|it| it.href.request_uri == item.href.request_uri}
    end
  end
  item.items = element.xpath('./xhtml:ol[1]/xhtml:li', EPUB::NAMESPACES).map {|li| parse_navigation_item(li)}

  item
end
96
+
97
+ private
98
+
99
# Finds the heading text of a nav element.
#
# Looks at the first h1-h6 or hgroup child. For an hgroup, the text of the
# highest-ranked heading it contains is used.
#
# @param [Nokogiri::XML::Element] element nav element
# @return [String, nil] heading heading text, or nil when there is no heading
def find_heading(element)
  heading = element.xpath('./xhtml:h1|xhtml:h2|xhtml:h3|xhtml:h4|xhtml:h5|xhtml:h6|xhtml:hgroup', EPUB::NAMESPACES).first

  return nil if heading.nil?
  return heading.text unless heading.name == 'hgroup'

  # An empty search result is still truthy, so chaining the searches with ||
  # never got past h1. Check each rank explicitly and stop at the first hit.
  %w[h1 h2 h3 h4 h5 h6].each do |rank|
    found = (heading/rank).first
    return found.text unless found.nil?
  end
  nil
end
109
+ end
110
+ end
111
+ end
@@ -0,0 +1,73 @@
1
+ require 'epub/constants'
2
+ require 'epub/ocf'
3
+ require 'zipruby'
4
+ require 'nokogiri'
5
+
6
+ module EPUB
7
+ class Parser
8
+ class OCF
9
+ include Utils
10
+
11
+ DIRECTORY = 'META-INF'
12
+ EPUB::OCF::MODULES.each {|m| self.const_set "#{m.upcase}_FILE", "#{m}.xml"} # Deprecated
13
+
14
# Convenience entry point: builds a parser for +zip_archive+ and runs it.
#
# @param zip_archive the opened archive handed to +new+
# @return whatever the instance-level +parse+ returns
def self.parse(zip_archive)
  parser = new(zip_archive)
  parser.parse
end
19
+
20
# Stores the archive to read from and creates the empty OCF model that
# +parse+ fills in.
#
# @param zip_archive the opened archive; +parse+ calls +fopen+ on it
def initialize(zip_archive)
  @ocf = EPUB::OCF.new
  @zip = zip_archive
end
24
+
25
# Reads META-INF/<name>.xml for every OCF component, hands the raw content
# to the matching parse_<name> method and stores the result on the OCF model.
#
# @return [EPUB::OCF] the populated model
def parse
  EPUB::OCF::MODULES.each do |m|
    begin
      content = nil
      # The block form closes the archive entry once it has been read;
      # previously the handle returned by fopen was never closed.
      @zip.fopen(File.join(DIRECTORY, "#{m}.xml")) {|file| content = file.read}
      @ocf.__send__ "#{m}=", __send__("parse_#{m}", content)
    rescue Zip::Error
      # Deliberately ignored: the component is left unset when the archive
      # raises for this entry (presumably because the file is absent).
    end
  end

  @ocf
end
36
+
37
# Parses container XML into a Container with one Rootfile per rootfile element.
#
# @param xml [String] content of the container document
# @return [EPUB::OCF::Container]
def parse_container(xml)
  container = EPUB::OCF::Container.new
  rootfile_nodes = Nokogiri.XML(xml).xpath('/ocf:container/ocf:rootfiles/ocf:rootfile', EPUB::NAMESPACES)
  rootfile_nodes.each do |node|
    full_path = Addressable::URI.parse(extract_attribute(node, 'full-path'))
    media_type = extract_attribute(node, 'media-type')
    container.rootfiles << EPUB::OCF::Container::Rootfile.new(full_path, media_type)
  end

  container
end
49
+
50
# Wraps the raw encryption document in an Encryption object without
# interpreting it.
#
# @param content [String] raw content of the encryption document
# @return [EPUB::OCF::Encryption]
def parse_encryption(content)
  EPUB::OCF::Encryption.new.tap {|encryption| encryption.content = content}
end
55
+
56
# Not implemented: returns nil, emitting a warning only in verbose mode.
def parse_manifest(content)
  return unless $VERBOSE

  warn "Not implemented: #{self.class}##{__method__}"
end
59
+
60
# Not implemented: returns nil, emitting a warning only in verbose mode.
def parse_metadata(content)
  return unless $VERBOSE

  warn "Not implemented: #{self.class}##{__method__}"
end
63
+
64
# Not implemented: returns nil, emitting a warning only in verbose mode.
def parse_rights(content)
  return unless $VERBOSE

  warn "Not implemented: #{self.class}##{__method__}"
end
67
+
68
# Not implemented: returns nil, emitting a warning only in verbose mode.
def parse_signatures(content)
  return unless $VERBOSE

  warn "Not implemented: #{self.class}##{__method__}"
end
71
+ end
72
+ end
73
+ end
@@ -0,0 +1,200 @@
1
+ require 'strscan'
2
+ require 'zipruby'
3
+ require 'nokogiri'
4
+ require 'addressable/uri'
5
+ require 'epub/publication'
6
+ require 'epub/constants'
7
+
8
+ module EPUB
9
+ class Parser
10
+ class Publication
11
+ include Utils
12
+
13
class << self
  # Reads the package document out of the archive and parses it.
  #
  # @param zip_archive the opened archive; must respond to +fopen+
  # @param file [String] path of the package document; it is URI-unencoded
  #   before being looked up in the archive
  # @return the result of the instance-level +parse+
  def parse(zip_archive, file)
    opf = nil
    # The block form closes the archive entry after reading; previously the
    # handle returned by fopen was never closed.
    zip_archive.fopen(Addressable::URI.unencode(file)) {|entry| opf = entry.read}
    new(opf, file).parse
  end
end
19
+
20
# Prepares an empty Package, the parsed rootfile URI and the parsed OPF
# document for the parse_* methods.
#
# @param opf [String] content of the package document
# @param rootfile [String] path of the package document
def initialize(opf, rootfile)
  @package  = EPUB::Publication::Package.new
  @rootfile = Addressable::URI.parse(rootfile)
  @doc      = Nokogiri.XML(opf)
end
25
+
26
# Runs parse_package followed by one parse_<model> method per entry of
# Package::CONTENT_MODELS.
#
# @return [EPUB::Publication::Package] the populated package
def parse
  steps = [:package] + EPUB::Publication::Package::CONTENT_MODELS
  steps.each {|step| __send__ "parse_#{step}"}

  @package
end
33
+
34
# Copies the root element's attributes onto the package, remembers the
# unique-identifier id for parse_metadata and parses the prefix attribute.
#
# @return [EPUB::Publication::Package]
def parse_package
  root = @doc.root
  %w[version xml:lang dir id].each do |name|
    # "xml:lang" is assigned through the xml_lang= writer.
    @package.__send__ "#{name.tr(':', '_')}=", extract_attribute(root, name)
  end
  @unique_identifier_id = root['unique-identifier']
  @package.prefix = parse_prefix(extract_attribute(root, 'prefix'))
  # Mixes FixedLayout into EPUB::Publication itself when the prefix is declared.
  if @package.prefix.key?(EPUB::Publication::FixedLayout::PREFIX_KEY)
    EPUB::Publication.__send__ :include, EPUB::Publication::FixedLayout
  end

  @package
end
45
+
46
# Parses the metadata element: identifiers, titles, languages, the remaining
# DCMES elements, metas and links, then connects each refining meta/link to
# the metadata object its +refines+ attribute points at.
#
# @return [EPUB::Publication::Package::Metadata]
def parse_metadata
  metadata = @package.metadata = EPUB::Publication::Package::Metadata.new
  elem = @doc.xpath('/opf:package/opf:metadata', EPUB::NAMESPACES).first
  # id => {metadata: <object with that id>, refiners: [<objects refining it>]}
  id_map = {}

  metadata.identifiers = extract_model(elem, id_map, './dc:identifier', :Identifier, ['id']) {|identifier, e|
    identifier.scheme = extract_attribute(e, 'scheme', 'opf')
    metadata.unique_identifier = identifier if identifier.id == @unique_identifier_id
  }
  metadata.titles = extract_model(elem, id_map, './dc:title', :Title)
  metadata.languages = extract_model(elem, id_map, './dc:language', :DCMES, %w[id])
  %w[ contributor coverage creator date description format publisher relation source subject type ].each do |dcmes|
    metadata.__send__ "#{dcmes}s=", extract_model(elem, id_map, "./dc:#{dcmes}")
  end
  metadata.rights = extract_model(elem, id_map, './dc:rights')
  metadata.metas = extract_refinee(elem, id_map, './opf:meta', :Meta, %w[property id scheme])
  metadata.links = extract_refinee(elem, id_map, './opf:link', :Link, %w[id media-type]) {|link, e|
    link.href = Addressable::URI.parse(extract_attribute(e, 'href'))
    # A link without a rel attribute yields an empty set instead of raising
    # NoMethodError on nil and aborting the whole parse.
    link.rel = Set.new(extract_attribute(e, 'rel').to_s.split)
  }

  id_map.each_value do |hsh|
    # Skip ids that are only referenced, or only defined, but not both.
    next unless hsh[:refiners]
    next unless hsh[:metadata]
    hsh[:refiners].each {|meta| meta.refines = hsh[:metadata]}
  end

  metadata
end
75
+
76
# Parses the manifest element into a Manifest of Items and resolves each
# item's fallback reference once every item is registered.
#
# @return [EPUB::Publication::Package::Manifest]
def parse_manifest
  manifest = @package.manifest = EPUB::Publication::Package::Manifest.new
  elem = @doc.xpath('/opf:package/opf:manifest', EPUB::NAMESPACES).first
  manifest.id = extract_attribute(elem, 'id')

  # [item, fallback id] pairs. A list rather than a hash keyed by fallback id:
  # several items may name the same fallback, and a hash kept only the last.
  pending_fallbacks = []
  elem.xpath('./opf:item', EPUB::NAMESPACES).each do |e|
    item = EPUB::Publication::Package::Manifest::Item.new
    %w[ id media-type media-overlay ].each do |attr|
      item.__send__ "#{attr.tr('-', '_')}=", extract_attribute(e, attr)
    end
    item.href = Addressable::URI.parse(extract_attribute(e, 'href'))
    fallback = extract_attribute(e, 'fallback')
    pending_fallbacks << [item, fallback] if fallback
    properties = extract_attribute(e, 'properties')
    item.properties = properties.split(' ') if properties
    manifest << item
  end
  # Resolved after the loop because a fallback may point at a later item.
  pending_fallbacks.each do |item, fallback_id|
    item.fallback = manifest[fallback_id]
  end

  manifest
end
100
+
101
# Parses the spine element and its itemref children.
#
# @return [EPUB::Publication::Package::Spine]
def parse_spine
  spine = @package.spine = EPUB::Publication::Package::Spine.new
  spine_elem = @doc.xpath('/opf:package/opf:spine', EPUB::NAMESPACES).first
  %w[ id toc page-progression-direction ].each do |name|
    spine.__send__ "#{name.tr('-', '_')}=", extract_attribute(spine_elem, name)
  end

  spine_elem.xpath('./opf:itemref', EPUB::NAMESPACES).each do |ref|
    itemref = EPUB::Publication::Package::Spine::Itemref.new
    itemref.idref = extract_attribute(ref, 'idref')
    itemref.id = extract_attribute(ref, 'id')
    # Linear unless the attribute is literally "no" (a missing attribute counts as linear).
    itemref.linear = (extract_attribute(ref, 'linear') != 'no')
    properties = extract_attribute(ref, 'properties')
    itemref.properties = properties.split(' ') unless properties.nil?
    spine << itemref
  end

  spine
end
121
+
122
# Parses the guide's reference elements. The guide is created even when the
# document contains no guide element.
#
# @return [EPUB::Publication::Package::Guide]
def parse_guide
  guide = @package.guide = EPUB::Publication::Package::Guide.new
  reference_nodes = @doc.xpath('/opf:package/opf:guide/opf:reference', EPUB::NAMESPACES)
  reference_nodes.each do |node|
    reference = EPUB::Publication::Package::Guide::Reference.new
    reference.type = extract_attribute(node, 'type')
    reference.title = extract_attribute(node, 'title')
    reference.href = Addressable::URI.parse(extract_attribute(node, 'href'))
    guide << reference
  end

  guide
end
135
+
136
# Parses the bindings' mediaType elements; each handler is looked up in the
# already-parsed manifest by id.
#
# @return [EPUB::Publication::Package::Bindings]
def parse_bindings
  bindings = @package.bindings = EPUB::Publication::Package::Bindings.new
  media_type_nodes = @doc.xpath('/opf:package/opf:bindings/opf:mediaType', EPUB::NAMESPACES)
  media_type_nodes.each do |node|
    binding_entry = EPUB::Publication::Package::Bindings::MediaType.new
    binding_entry.media_type = extract_attribute(node, 'media-type')
    handler_id = extract_attribute(node, 'handler')
    binding_entry.handler = @package.manifest[handler_id]
    bindings << binding_entry
  end

  bindings
end
147
+
148
# Parses a prefix attribute value ("name: IRI name: IRI ...") into a hash.
#
# A prefix with no IRI after it is reported with a warning and skipped.
#
# @param str [String, nil] raw attribute value
# @return [Hash{String => String}] prefix name => IRI; empty for nil or ""
def parse_prefix(str)
  mapping = {}
  return mapping if str.nil? || str.empty?

  tokens = StringScanner.new(str)
  tokens.skip(/\s*/)
  while (name = tokens.scan(/[^\:\s]+/))
    # Consume the colon and/or whitespace separating the name from its IRI.
    tokens.skip(/[\:\s]+/)
    iri = tokens.scan(/[^\s]+/)
    if iri.nil? || iri.empty?
      warn "no IRI detected for prefix `#{name}`"
    else
      mapping[name] = iri
    end
    tokens.skip(/\s*/)
  end
  mapping
end
165
+
166
# Builds one metadata object per node matched by +xpath+ and registers every
# object that has an id in +id_map+.
#
# @param elem node to search from; must respond to +xpath+
# @param id_map [Hash] id => {metadata:, refiners:}; updated in place
# @param xpath [String] query selecting the source nodes
# @param klass [Symbol] name of the class under Package::Metadata to instantiate
# @param attributes [Array<String>] attribute names copied onto each object
#   ("-" in a name becomes "_" in the writer name)
# @yield [model, e] optional hook run for each object and its source node
# @return [Array] the created objects, in document order
def extract_model(elem, id_map, xpath, klass=:DCMES, attributes=%w[id lang dir])
  models = elem.xpath(xpath, EPUB::NAMESPACES).collect do |e|
    model = EPUB::Publication::Package::Metadata.const_get(klass).new
    attributes.each do |attr|
      model.__send__ "#{attr.gsub(/-/, '_')}=", extract_attribute(e, attr)
    end
    model.content = e.content unless klass == :Link

    yield model, e if block_given?

    model
  end

  models.each do |model|
    next unless model.respond_to?(:id) && model.id
    # Merge into an existing entry instead of replacing it: refiners may have
    # been registered for this id before the object itself was parsed (a meta
    # refining another meta, or refining a link that is parsed later), and
    # overwriting the entry dropped them.
    (id_map[model.id] ||= {})[:metadata] = model
  end

  models
end
185
+
186
# Like extract_model, and additionally records every object whose +refines+
# attribute is a fragment reference ("#id") as a refiner of that id.
#
# @param elem node to search from
# @param id_map [Hash] id => {metadata:, refiners:}; updated in place
# @param xpath [String] query selecting the source nodes
# @param klass [Symbol] class name under Package::Metadata
# @param attributes [Array<String>] attribute names copied onto each object
# @yield [model, e] optional hook, run before the refines bookkeeping
# @return [Array] the created objects
def extract_refinee(elem, id_map, xpath, klass, attributes)
  extract_model(elem, id_map, xpath, klass, attributes) do |model, e|
    yield model, e if block_given?

    refines = extract_attribute(e, 'refines')
    # Only same-document references of the form "#id" are tracked.
    next unless refines && refines.start_with?('#')

    entry = (id_map[refines[1..-1]] ||= {})
    (entry[:refiners] ||= []) << model
  end
end
198
+ end
199
+ end
200
+ end