epub-parser-io 0.1.6a

Sign up to get free protection for your applications and to get access to all the features.
Files changed (78) hide show
  1. data/.gemtest +0 -0
  2. data/.gitignore +12 -0
  3. data/.gitmodules +3 -0
  4. data/.travis.yml +4 -0
  5. data/.yardopts +10 -0
  6. data/CHANGELOG.markdown +61 -0
  7. data/Gemfile +2 -0
  8. data/MIT-LICENSE +7 -0
  9. data/README.markdown +174 -0
  10. data/Rakefile +68 -0
  11. data/bin/epub-open +25 -0
  12. data/bin/epubinfo +64 -0
  13. data/docs/EpubOpen.markdown +43 -0
  14. data/docs/Epubinfo.markdown +37 -0
  15. data/docs/FixedLayout.markdown +96 -0
  16. data/docs/Home.markdown +128 -0
  17. data/docs/Item.markdown +80 -0
  18. data/docs/Navigation.markdown +58 -0
  19. data/docs/Publication.markdown +54 -0
  20. data/epub-parser.gemspec +49 -0
  21. data/features/epubinfo.feature +6 -0
  22. data/features/step_definitions/epubinfo_steps.rb +5 -0
  23. data/features/support/env.rb +1 -0
  24. data/lib/epub/book/features.rb +85 -0
  25. data/lib/epub/book.rb +7 -0
  26. data/lib/epub/constants.rb +48 -0
  27. data/lib/epub/content_document/navigation.rb +104 -0
  28. data/lib/epub/content_document/xhtml.rb +41 -0
  29. data/lib/epub/content_document.rb +2 -0
  30. data/lib/epub/inspector.rb +45 -0
  31. data/lib/epub/ocf/container.rb +28 -0
  32. data/lib/epub/ocf/encryption.rb +7 -0
  33. data/lib/epub/ocf/manifest.rb +6 -0
  34. data/lib/epub/ocf/metadata.rb +6 -0
  35. data/lib/epub/ocf/rights.rb +6 -0
  36. data/lib/epub/ocf/signatures.rb +6 -0
  37. data/lib/epub/ocf.rb +8 -0
  38. data/lib/epub/parser/content_document.rb +111 -0
  39. data/lib/epub/parser/ocf.rb +73 -0
  40. data/lib/epub/parser/publication.rb +200 -0
  41. data/lib/epub/parser/utils.rb +20 -0
  42. data/lib/epub/parser/version.rb +5 -0
  43. data/lib/epub/parser.rb +103 -0
  44. data/lib/epub/publication/fixed_layout.rb +208 -0
  45. data/lib/epub/publication/package/bindings.rb +31 -0
  46. data/lib/epub/publication/package/guide.rb +51 -0
  47. data/lib/epub/publication/package/manifest.rb +180 -0
  48. data/lib/epub/publication/package/metadata.rb +170 -0
  49. data/lib/epub/publication/package/spine.rb +106 -0
  50. data/lib/epub/publication/package.rb +68 -0
  51. data/lib/epub/publication.rb +2 -0
  52. data/lib/epub.rb +14 -0
  53. data/man/epubinfo.1.ronn +19 -0
  54. data/schemas/epub-nav-30.rnc +10 -0
  55. data/schemas/epub-nav-30.sch +72 -0
  56. data/schemas/epub-xhtml-30.sch +377 -0
  57. data/schemas/ocf-container-30.rnc +16 -0
  58. data/test/fixtures/book/META-INF/container.xml +6 -0
  59. data/test/fixtures/book/OPS/%E6%97%A5%E6%9C%AC%E8%AA%9E.xhtml +10 -0
  60. data/test/fixtures/book/OPS/case-sensitive.xhtml +9 -0
  61. data/test/fixtures/book/OPS/containing space.xhtml +10 -0
  62. data/test/fixtures/book/OPS/containing%20space.xhtml +10 -0
  63. data/test/fixtures/book/OPS/nav.xhtml +28 -0
  64. data/test/fixtures/book/OPS/ルートファイル.opf +119 -0
  65. data/test/fixtures/book/OPS/日本語.xhtml +10 -0
  66. data/test/fixtures/book/mimetype +1 -0
  67. data/test/helper.rb +9 -0
  68. data/test/test_content_document.rb +92 -0
  69. data/test/test_epub.rb +21 -0
  70. data/test/test_fixed_layout.rb +257 -0
  71. data/test/test_inspect.rb +121 -0
  72. data/test/test_parser.rb +60 -0
  73. data/test/test_parser_content_document.rb +36 -0
  74. data/test/test_parser_fixed_layout.rb +16 -0
  75. data/test/test_parser_ocf.rb +38 -0
  76. data/test/test_parser_publication.rb +247 -0
  77. data/test/test_publication.rb +324 -0
  78. metadata +445 -0
@@ -0,0 +1,28 @@
1
module EPUB
  class OCF
    # Model of a container file: a list of rootfile entries.
    class Container
      FILE = 'container.xml'

      # @return [Array<Rootfile>] rootfiles in the order they were appended
      attr_reader :rootfiles

      def initialize
        @rootfiles = []
      end

      # Shortcut for the first rootfile.
      # @return [Rootfile, nil] nil while no rootfile has been appended
      def rootfile
        rootfiles[0]
      end

      # One rootfile entry: where the package document lives and its type.
      class Rootfile
        attr_accessor :full_path, :media_type

        # @param full_path [Addressable::URI|nil]
        # @param media_type [String]
        def initialize(full_path=nil, media_type=EPUB::MediaType::ROOTFILE)
          @full_path = full_path
          @media_type = media_type
        end
      end
    end
  end
end
@@ -0,0 +1,7 @@
1
module EPUB
  class OCF
    # Holder for whatever content was stored for the encryption file.
    class Encryption
      # Readable and writable; nil until assigned.
      attr_reader :content
      attr_writer :content
    end
  end
end
@@ -0,0 +1,6 @@
1
module EPUB
  class OCF
    # Placeholder: defines no attributes or behaviour of its own.
    class Manifest; end
  end
end
@@ -0,0 +1,6 @@
1
module EPUB
  class OCF
    # Placeholder: defines no attributes or behaviour of its own.
    class Metadata; end
  end
end
@@ -0,0 +1,6 @@
1
module EPUB
  class OCF
    # Placeholder: defines no attributes or behaviour of its own.
    class Rights; end
  end
end
@@ -0,0 +1,6 @@
1
module EPUB
  class OCF
    # Placeholder: defines no attributes or behaviour of its own.
    class Signatures; end
  end
end
data/lib/epub/ocf.rb ADDED
@@ -0,0 +1,8 @@
1
module EPUB
  # Aggregate of the OCF parts of a book. Each name in MODULES is both a file
  # required from epub/ocf/ and a read/write attribute on instances.
  class OCF
    MODULES = %w[container encryption manifest metadata rights signatures]
    MODULES.each do |name|
      require "epub/ocf/#{name}"
    end

    attr_accessor :book
    attr_accessor(*MODULES)
  end
end
@@ -0,0 +1,111 @@
1
+ require 'epub/content_document'
2
+ require 'epub/constants'
3
+ require 'epub/parser/utils'
4
+ require 'nokogiri'
5
+
6
+ module EPUB
7
+ class Parser
8
+ class ContentDocument
9
+ include Utils
10
+
11
# Stores the manifest item that #parse will later read and classify.
# @param item [EPUB::Publication::Package::Manifest::Item]
def initialize(item)
  @item = item
end
15
+
16
# Builds the content document object that matches the item's media type.
#
# 'application/xhtml+xml' items become a Navigation (when the item answers
# true to nav?) or an XHTML document, 'image/svg+xml' items become an SVG
# document, and every other media type yields nil. Navigations are only
# parsed for items that answer true to nav?.
# @return [EPUB::ContentDocument::Navigation, EPUB::ContentDocument::XHTML, EPUB::ContentDocument::SVG, nil]
def parse
  document_class =
    case @item.media_type
    when 'application/xhtml+xml'
      @item.nav? ? EPUB::ContentDocument::Navigation : EPUB::ContentDocument::XHTML
    when 'image/svg+xml'
      EPUB::ContentDocument::SVG
    end
  return nil if document_class.nil?

  content_document = document_class.new
  content_document.item = @item
  document = Nokogiri.XML(@item.read)
  # parse_content_document(document)
  content_document.navigations = parse_navigations(document) if @item.nav?
  content_document
end
38
+
39
# Parses every nav element found below the XHTML body.
# @param document [#search] parsed document (or element) containing nav elements
# @return [Array<EPUB::ContentDocument::Navigation::Navigation>] one entry per nav element
def parse_navigations(document)
  nav_elements = document.search('/xhtml:html/xhtml:body//xhtml:nav', EPUB::NAMESPACES)
  nav_elements.map { |nav_element| parse_navigation(nav_element) }
end
44
+
45
# Converts one nav element into a navigation object.
#
# The heading becomes the text, the presence of a `hidden` attribute is
# reduced to true (nil when absent), the `type` attribute is looked up with
# the 'epub' prefix argument, and each `ol/li` child is parsed into an item.
# @param element [Nokogiri::XML::Element] nav element
# @return [EPUB::ContentDocument::Navigation::Navigation]
def parse_navigation(element)
  navigation = EPUB::ContentDocument::Navigation::Navigation.new
  navigation.text = find_heading(element)
  hidden_attr = extract_attribute(element, 'hidden')
  navigation.hidden = (true unless hidden_attr.nil?)
  navigation.type = extract_attribute(element, 'type', 'epub')
  element.xpath('./xhtml:ol/xhtml:li', EPUB::NAMESPACES).each do |li|
    navigation.items << parse_navigation_item(li)
  end

  navigation
end
59
+
60
# Converts one li element of a nav list into a navigation item, recursing
# into a nested ol.
#
# The label comes from the first a or span child. For an a element with no
# text, the label falls back to an attribute of its first embedded element
# (iframe: name or srcdoc, img: alt, object: name) and then to the anchor's
# title attribute. For a elements the href is parsed and the manifest item
# with the same request URI is attached.
# @param element [Nokogiri::XML::Element] li element
# @return [EPUB::ContentDocument::Navigation::Item, nil] nil when the li has
#   neither an a nor a span child
def parse_navigation_item(element)
  item = EPUB::ContentDocument::Navigation::Item.new
  a_or_span = element.xpath('./xhtml:a[1]|xhtml:span[1]', EPUB::NAMESPACES).first
  return nil if a_or_span.nil?

  item.text = a_or_span.text
  if a_or_span.name == 'a'
    if item.text.empty?
      embedded_content = a_or_span.xpath('./xhtml:audio[1]|xhtml:canvas[1]|xhtml:embed[1]|xhtml:iframe[1]|xhtml:img[1]|xhtml:math[1]|xhtml:object[1]|xhtml:svg[1]|xhtml:video[1]', EPUB::NAMESPACES).first
      unless embedded_content.nil?
        # audio, canvas, embed, math, svg and video contribute no label.
        case embedded_content.name
        when 'iframe'
          item.text = extract_attribute(embedded_content, 'name') || extract_attribute(embedded_content, 'srcdoc')
        when 'img'
          item.text = extract_attribute(embedded_content, 'alt')
        when 'object'
          item.text = extract_attribute(embedded_content, 'name')
        end
      end
      item.text = extract_attribute(a_or_span, 'title').to_s if item.text.nil? || item.text.empty?
    end
    item.href = Addressable::URI.parse(extract_attribute(a_or_span, 'href'))
    # An anchor without href parses to nil; there is nothing to look up then.
    unless item.href.nil?
      target = item.href.request_uri
      item.item = @item.manifest.items.find {|candidate| candidate.href.request_uri == target}
    end
  end
  item.items = element.xpath('./xhtml:ol[1]/xhtml:li', EPUB::NAMESPACES).map {|li| parse_navigation_item(li)}

  item
end
96
+
97
+ private
98
+
99
# Finds the text of the heading that labels a nav element.
#
# The first h1-h6 or hgroup child is used. For an hgroup the text of its
# highest-ranked heading (h1 before h2, and so on) is returned.
# @param element [Nokogiri::XML::Element] nav element
# @return [String, nil] heading text; nil when there is no heading, or when
#   an hgroup contains no h1-h6
def find_heading(element)
  heading = element.xpath('./xhtml:h1|xhtml:h2|xhtml:h3|xhtml:h4|xhtml:h5|xhtml:h6|xhtml:hgroup', EPUB::NAMESPACES).first

  return nil if heading.nil?
  return heading.text unless heading.name == 'hgroup'

  # An empty search result is still truthy, so `a || b` cannot be used to
  # fall through the ranks; probe each rank for an actual element instead.
  %w[h1 h2 h3 h4 h5 h6].each do |rank|
    ranked = (heading / rank).first
    return ranked.text if ranked
  end
  nil
end
109
+ end
110
+ end
111
+ end
@@ -0,0 +1,73 @@
1
+ require 'epub/constants'
2
+ require 'epub/ocf'
3
+ require 'zipruby'
4
+ require 'nokogiri'
5
+
6
+ module EPUB
7
+ class Parser
8
+ class OCF
9
+ include Utils
10
+
11
DIRECTORY = 'META-INF'
# Deprecated: defines one <NAME>_FILE constant ("<name>.xml") per OCF module.
EPUB::OCF::MODULES.each do |name|
  self.const_set "#{name.upcase}_FILE", "#{name}.xml"
end
13
+
14
# Convenience entry point: builds a parser for the archive and runs it.
# @param zip_archive [Object] archive passed on to #initialize
# @return [EPUB::OCF]
def self.parse(zip_archive)
  new(zip_archive).parse
end
19
+
20
# Keeps the archive to read from and prepares an empty OCF to fill in.
# @param zip_archive [Object] archive that responds to #fopen
def initialize(zip_archive)
  @zip, @ocf = zip_archive, EPUB::OCF.new
end
24
+
25
# Reads META-INF/<module>.xml for every OCF module and stores the result of
# the matching parse_<module> method on the OCF object.
#
# A Zip::Error raised while opening or reading an entry is swallowed, so the
# corresponding attribute is simply left unset (presumably the entry is
# missing from the archive).
# @return [EPUB::OCF]
def parse
  EPUB::OCF::MODULES.each do |m|
    begin
      # Block form so the entry handle is closed as soon as it has been read.
      @zip.fopen(File.join(DIRECTORY, "#{m}.xml")) do |file|
        @ocf.__send__ "#{m}=", __send__("parse_#{m}", file.read)
      end
    rescue Zip::Error
    end
  end

  @ocf
end
36
+
37
# Builds a Container from container XML, adding one Rootfile per
# /container/rootfiles/rootfile element with its full-path (parsed as a URI)
# and media-type attributes.
# @param xml [String] XML source
# @return [EPUB::OCF::Container]
def parse_container(xml)
  container = EPUB::OCF::Container.new
  rootfile_elems = Nokogiri.XML(xml).xpath('/ocf:container/ocf:rootfiles/ocf:rootfile', EPUB::NAMESPACES)
  rootfile_elems.each do |rootfile_elem|
    rootfile = EPUB::OCF::Container::Rootfile.new
    rootfile.full_path = Addressable::URI.parse(extract_attribute(rootfile_elem, 'full-path'))
    rootfile.media_type = extract_attribute(rootfile_elem, 'media-type')
    container.rootfiles << rootfile
  end

  container
end
49
+
50
# Wraps the given content, unparsed, in an Encryption object.
# @param content [String] content read from the encryption entry
# @return [EPUB::OCF::Encryption]
def parse_encryption(content)
  EPUB::OCF::Encryption.new.tap { |encryption| encryption.content = content }
end
55
+
56
# Not implemented: ignores content and only warns when $VERBOSE is set.
# @return [nil]
def parse_manifest(content)
  return unless $VERBOSE

  warn "Not implemented: #{self.class}##{__method__}"
end
59
+
60
# Not implemented: ignores content and only warns when $VERBOSE is set.
# @return [nil]
def parse_metadata(content)
  return unless $VERBOSE

  warn "Not implemented: #{self.class}##{__method__}"
end
63
+
64
# Not implemented: ignores content and only warns when $VERBOSE is set.
# @return [nil]
def parse_rights(content)
  return unless $VERBOSE

  warn "Not implemented: #{self.class}##{__method__}"
end
67
+
68
# Not implemented: ignores content and only warns when $VERBOSE is set.
# @return [nil]
def parse_signatures(content)
  return unless $VERBOSE

  warn "Not implemented: #{self.class}##{__method__}"
end
71
+ end
72
+ end
73
+ end
@@ -0,0 +1,200 @@
1
+ require 'strscan'
2
+ require 'zipruby'
3
+ require 'nokogiri'
4
+ require 'addressable/uri'
5
+ require 'epub/publication'
6
+ require 'epub/constants'
7
+
8
+ module EPUB
9
+ class Parser
10
+ class Publication
11
+ include Utils
12
+
13
class << self
  # Reads the package document out of the archive and parses it.
  #
  # The entry name is the rootfile IRI with its percent-escapes decoded; the
  # undecoded value is what gets passed on to the parser instance.
  # @param zip_archive [#fopen] archive containing the package document
  # @param file [String] IRI of the rootfile
  # @return [EPUB::Publication::Package]
  def parse(zip_archive, file)
    opf = nil
    # Block form so the entry handle is closed once it has been read; the
    # value is captured explicitly rather than relying on fopen's return.
    zip_archive.fopen(Addressable::URI.unencode(file)) {|entry| opf = entry.read}
    new(opf, file).parse
  end
end
19
+
20
# Prepares an empty Package plus the parsed forms of both arguments.
# @param opf [String] XML source of the package document
# @param rootfile [String] IRI of the rootfile, kept as an Addressable::URI
def initialize(opf, rootfile)
  @package = EPUB::Publication::Package.new
  @rootfile = Addressable::URI.parse(rootfile)
  @doc = Nokogiri.XML(opf)
end
25
+
26
# Runs parse_package followed by one parse_<model> method for each entry of
# Package::CONTENT_MODELS, in that order.
# @return [EPUB::Publication::Package] the populated package
def parse
  steps = [:package, *EPUB::Publication::Package::CONTENT_MODELS]
  steps.each { |step| __send__("parse_#{step}") }

  @package
end
33
+
34
# Copies the root element's attributes onto the package.
#
# version, xml:lang (stored through xml_lang=), dir and id are copied as
# they are, unique-identifier is remembered for parse_metadata, and prefix
# is parsed into a Hash. When the prefixes contain FixedLayout::PREFIX_KEY,
# EPUB::Publication::FixedLayout is included into EPUB::Publication.
# @return [EPUB::Publication::Package]
def parse_package
  root = @doc.root
  %w[version xml:lang dir id].each do |attr|
    setter = "#{attr.tr(':', '_')}="
    @package.__send__ setter, extract_attribute(root, attr)
  end
  @unique_identifier_id = root['unique-identifier']
  @package.prefix = parse_prefix(extract_attribute(root, 'prefix'))
  if @package.prefix.key?(EPUB::Publication::FixedLayout::PREFIX_KEY)
    EPUB::Publication.__send__ :include, EPUB::Publication::FixedLayout
  end

  @package
end
45
+
46
# Builds the package metadata from /package/metadata.
#
# Dublin Core elements, meta elements and link elements are turned into
# model objects. Every model carrying an id is recorded in a local id map,
# and once everything is parsed each meta/link with a `refines="#id"`
# attribute gets its `refines` set to the model registered under that id.
# @return [EPUB::Publication::Package::Metadata]
def parse_metadata
  metadata = @package.metadata = EPUB::Publication::Package::Metadata.new
  elem = @doc.xpath('/opf:package/opf:metadata', EPUB::NAMESPACES).first
  id_map = {}

  metadata.identifiers = extract_model(elem, id_map, './dc:identifier', :Identifier, ['id']) {|identifier, e|
    identifier.scheme = extract_attribute(e, 'scheme', 'opf')
    # The identifier named by the package's unique-identifier attribute.
    metadata.unique_identifier = identifier if identifier.id == @unique_identifier_id
  }
  metadata.titles = extract_model(elem, id_map, './dc:title', :Title)
  metadata.languages = extract_model(elem, id_map, './dc:language', :DCMES, %w[id])
  %w[ contributor coverage creator date description format publisher relation source subject type ].each do |dcmes|
    metadata.__send__ "#{dcmes}s=", extract_model(elem, id_map, "./dc:#{dcmes}")
  end
  metadata.rights = extract_model(elem, id_map, './dc:rights')
  metadata.metas = extract_refinee(elem, id_map, './opf:meta', :Meta, %w[property id scheme])
  metadata.links = extract_refinee(elem, id_map, './opf:link', :Link, %w[id media-type]) {|link, e|
    link.href = Addressable::URI.parse(extract_attribute(e, 'href'))
    # A link without rel yields an empty set instead of failing on nil.
    link.rel = Set.new(extract_attribute(e, 'rel').to_s.split(nil))
  }

  # Resolve refinements only where both the refiners and their target exist.
  id_map.each_value do |hsh|
    next unless hsh[:refiners]
    next unless hsh[:metadata]
    hsh[:refiners].each {|meta| meta.refines = hsh[:metadata]}
  end

  metadata
end
75
+
76
# Builds the package manifest from /package/manifest.
#
# Each item element becomes a Manifest::Item with its id, media type, media
# overlay, href (parsed as a URI) and space-separated properties. Fallback
# references are resolved after all items are registered, because a
# fallback may point at an item that appears later in the manifest.
# @return [EPUB::Publication::Package::Manifest]
def parse_manifest
  manifest = @package.manifest = EPUB::Publication::Package::Manifest.new
  elem = @doc.xpath('/opf:package/opf:manifest', EPUB::NAMESPACES).first
  manifest.id = extract_attribute(elem, 'id')

  # [item, fallback id] pairs. Several items may name the same fallback, so
  # this cannot be a hash keyed by fallback id.
  pending_fallbacks = []
  elem.xpath('./opf:item', EPUB::NAMESPACES).each do |e|
    item = EPUB::Publication::Package::Manifest::Item.new
    %w[ id media-type media-overlay ].each do |attr|
      item.__send__ "#{attr.gsub(/-/, '_')}=", extract_attribute(e, attr)
    end
    item.href = Addressable::URI.parse(extract_attribute(e, 'href'))
    fallback = extract_attribute(e, 'fallback')
    pending_fallbacks << [item, fallback] if fallback
    properties = extract_attribute(e, 'properties')
    item.properties = properties.split(' ') if properties
    manifest << item
  end
  pending_fallbacks.each do |item, fallback_id|
    item.fallback = manifest[fallback_id]
  end

  manifest
end
100
+
101
# Builds the package spine from /package/spine.
#
# The spine takes the id, toc and page-progression-direction attributes.
# Each itemref child becomes an Itemref that is linear unless its linear
# attribute is exactly 'no', with properties split on spaces when present.
# @return [EPUB::Publication::Package::Spine]
def parse_spine
  spine = @package.spine = EPUB::Publication::Package::Spine.new
  spine_elem = @doc.xpath('/opf:package/opf:spine', EPUB::NAMESPACES).first
  %w[ id toc page-progression-direction ].each do |attr|
    spine.__send__ "#{attr.tr('-', '_')}=", extract_attribute(spine_elem, attr)
  end

  spine_elem.xpath('./opf:itemref', EPUB::NAMESPACES).each do |ref_elem|
    itemref = EPUB::Publication::Package::Spine::Itemref.new
    %w[ idref id ].each do |attr|
      itemref.__send__ "#{attr}=", extract_attribute(ref_elem, attr)
    end
    itemref.linear = extract_attribute(ref_elem, 'linear') != 'no'
    props = extract_attribute(ref_elem, 'properties')
    itemref.properties = props.split(' ') unless props.nil?
    spine << itemref
  end

  spine
end
121
+
122
# Builds the package guide: one Reference (type, title, href parsed as a
# URI) per /package/guide/reference element.
# @return [EPUB::Publication::Package::Guide]
def parse_guide
  guide = @package.guide = EPUB::Publication::Package::Guide.new
  reference_elems = @doc.xpath('/opf:package/opf:guide/opf:reference', EPUB::NAMESPACES)
  reference_elems.each do |ref|
    reference = EPUB::Publication::Package::Guide::Reference.new
    %w[ type title ].each do |attr|
      reference.__send__ "#{attr}=", extract_attribute(ref, attr)
    end
    reference.href = Addressable::URI.parse(extract_attribute(ref, 'href'))
    guide << reference
  end

  guide
end
135
+
136
# Builds the package bindings: one MediaType per
# /package/bindings/mediaType element, whose handler is looked up in the
# package manifest by the element's handler attribute.
# @return [EPUB::Publication::Package::Bindings]
def parse_bindings
  bindings = @package.bindings = EPUB::Publication::Package::Bindings.new
  media_type_elems = @doc.xpath('/opf:package/opf:bindings/opf:mediaType', EPUB::NAMESPACES)
  media_type_elems.each do |media_type_elem|
    media_type = EPUB::Publication::Package::Bindings::MediaType.new
    media_type.media_type = extract_attribute(media_type_elem, 'media-type')
    handler_id = extract_attribute(media_type_elem, 'handler')
    media_type.handler = @package.manifest[handler_id]
    bindings << media_type
  end

  bindings
end
147
+
148
# Parses a prefix attribute value ("name: iri name: iri ...") into a Hash.
#
# A prefix that is not followed by an IRI is reported with a warning and
# left out of the result.
# @param str [String, nil] attribute value
# @return [Hash{String => String}] prefix name => IRI; empty for nil or ''
def parse_prefix(str)
  prefixes = {}
  return prefixes if str.nil? || str.empty?

  scanner = StringScanner.new(str)
  scanner.scan(/\s*/)
  while (prefix = scanner.scan(/[^:\s]+/))
    # Skip the colon and whitespace separating the prefix from its IRI.
    scanner.scan(/[:\s]+/)
    iri = scanner.scan(/[^\s]+/)
    if iri.nil? || iri.empty?
      warn "no IRI detected for prefix `#{prefix}`"
    else
      prefixes[prefix] = iri
    end
    scanner.scan(/\s*/)
  end
  prefixes
end
165
+
166
# Builds one Metadata model per element matched by xpath and registers
# every model that has an id in id_map.
#
# id_map entries have the shape {metadata: model, refiners: [models]}.
# Registration only sets the :metadata key, so refiners already collected
# for the same id (for example a meta refining another meta) are kept.
# @param elem [#xpath] element to search from
# @param id_map [Hash] id => entry; updated in place
# @param xpath [String] expression selecting the elements to convert
# @param klass [Symbol] name of the model class under Package::Metadata
# @param attributes [Array<String>] attributes copied onto each model
#   (hyphens become underscores in the setter name)
# @yield [model, element] optional extra initialisation for each model
# @return [Array] the models, in match order
def extract_model(elem, id_map, xpath, klass=:DCMES, attributes=%w[id lang dir])
  models = elem.xpath(xpath, EPUB::NAMESPACES).collect do |e|
    model = EPUB::Publication::Package::Metadata.const_get(klass).new
    attributes.each do |attr|
      model.__send__ "#{attr.gsub(/-/, '_')}=", extract_attribute(e, attr)
    end
    model.content = e.content unless klass == :Link

    yield model, e if block_given?

    model
  end

  models.each do |model|
    next unless model.respond_to?(:id) && model.id
    # Merge rather than replace: the entry may already hold :refiners.
    (id_map[model.id] ||= {})[:metadata] = model
  end

  models
end
185
+
186
# Same as extract_model, but additionally records every model whose element
# has a `refines="#id"` attribute under id_map[id][:refiners], so the
# refinement can be resolved once all models are known.
# @param elem [#xpath] element to search from
# @param id_map [Hash] id => entry; updated in place
# @param xpath [String] expression selecting the elements to convert
# @param klass [Symbol] name of the model class under Package::Metadata
# @param attributes [Array<String>] attributes copied onto each model
# @yield [model, element] optional extra initialisation, run before the
#   refines attribute is inspected
# @return [Array] the models built by extract_model
def extract_refinee(elem, id_map, xpath, klass, attributes)
  extract_model(elem, id_map, xpath, klass, attributes) do |model, e|
    yield model, e if block_given?
    refines = extract_attribute(e, 'refines')
    next unless refines && refines[0] == '#'

    target_id = refines[1..-1]
    entry = (id_map[target_id] ||= {})
    (entry[:refiners] ||= []) << model
  end
end
198
+ end
199
+ end
200
+ end