peregrin 1.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,553 @@
1
+ class Peregrin::Epub
2
+
3
# Name of the format handled by this class.
FORMAT = "EPUB"

# XML namespace bindings keyed by short name. Each value is a
# prefix => URI hash of the shape passed to the XPath calls in this class.
NAMESPACES = {
  :ocf => { 'ocf' => 'urn:oasis:names:tc:opendocument:xmlns:container' },
  :opf => { 'opf' => 'http://www.idpf.org/2007/opf' },
  :dc => { 'dc' => 'http://purl.org/dc/elements/1.1/' },
  :ncx => { 'ncx' => 'http://www.daisy.org/z3986/2005/ncx/' },
  :svg => { 'svg' => 'http://www.w3.org/2000/svg' }
}.freeze

# Path of the OCF container document inside the archive.
OCF_PATH = "META-INF/container.xml"

# Element names that root_to_xhtml rewrites into <div class="...">.
HTML5_TAGNAMES = %w[section nav article aside hgroup header footer figure figcaption].freeze # FIXME: Which to divify? Which to leave as-is?

# File extension => media type. (The '.odt' entry used to be listed twice,
# which Ruby reports as a duplicated hash key; it now appears once.)
MIMETYPE_MAP = {
  '.xhtml' => 'application/xhtml+xml',
  '.odt' => 'application/x-dtbook+xml',
  '.ncx' => 'application/x-dtbncx+xml',
  '.epub' => 'application/epub+zip'
}.freeze

# Directory (inside the working dir) that holds the publication files.
OEBPS = "OEBPS"

# Base names (without extension) of the generated NCX and OPF files.
NCX = 'content'
OPF = 'content'
24
+
25
+
26
# Checks that +path+ points at an archive whose OCF, OPF and NCX documents
# can be loaded.
#
# Raises FileNotFound when +path+ is not a regular file and NotAZipArchive
# when the archive cannot be opened. Any error raised while loading the
# configuration documents is re-raised as a new instance of the same class,
# constructed with +path+. The archive is closed on the way out if it was
# opened.
def self.validate(path)
  raise FileNotFound.new(path) unless File.file?(path)

  begin
    archive = Zip::Archive.open(path)
  rescue
    raise NotAZipArchive.new(path)
  end

  begin
    # load_config_documents is protected, hence the send.
    new(Peregrin::Book.new).send(:load_config_documents, archive)
  rescue => err
    raise err.class.new(path)
  end
ensure
  archive.close if archive
end
44
+
45
+
46
# Builds an instance from the EPUB file at +path+, loading it into a fresh
# Peregrin::Book.
def self.read(path)
  new(Peregrin::Book.new, path)
end
50
+
51
+
52
# Wraps +book+. When +epub_path+ is given, the book is populated from that
# EPUB file straight away.
def initialize(book, epub_path = nil)
  @book = book
  load_from_path(epub_path) if epub_path
end
58
+
59
+
60
# Writes the book out as an EPUB at +path+.
#
# The individual files are generated inside a working directory derived
# from +path+ and then zipped up under the base name of +path+. Returns
# whatever with_working_dir returns for the block (the value of zip_it_up).
def write(path)
  with_working_dir(path) do
    build_ocf
    build_ncx
    write_components
    build_opf
    zip_it_up(File.basename(path))
  end
end
69
+
70
+
71
# Returns a deep clone of the wrapped book.
#
# +options+ is accepted for interface compatibility but is not used here.
def to_book(options = {})
  @book.deep_clone
end
74
+
75
+
76
+ protected
77
+
78
+ #---------------------------------------------------------------------------
79
+ # READING
80
+ #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
81
+
82
# Populates @book from the EPUB archive at +epub_path+.
#
# The configuration documents are loaded first, then properties,
# components, chapters and the cover are extracted while the archive is
# open. Afterwards @book is given a read_resource_proc that re-opens the
# archive each time it is called and reads the resource's entry.
def load_from_path(epub_path)
  docs = nil
  Zip::Archive.open(epub_path) { |zipfile|
    docs = load_config_documents(zipfile)
    extract_properties(docs[:opf])
    extract_components(zipfile, docs[:opf], docs[:opf_root])
    extract_chapters(zipfile, docs[:ncx])
    extract_cover(zipfile, docs)
  }
  @book.read_resource_proc = lambda { |resource|
    media_path = from_opf_root(docs[:opf_root], resource.src)
    # URI.unescape was removed in Ruby 3.0; the parser object offers the
    # same percent-decoding.
    media_path = URI::DEFAULT_PARSER.unescape(media_path)
    Zip::Archive.open(epub_path) { |zipfile| zipfile.read(media_path) }
  }
end
97
+
98
+
99
# Reads and parses the three configuration documents from +zipfile+.
#
# Returns a hash with the keys :ocf, :opf_path, :opf_root, :opf, :ncx_path
# and :ncx. Any failure in a stage is reported as FailureLoadingOCF,
# FailureLoadingOPF or FailureLoadingNCX respectively.
def load_config_documents(zipfile)
  parse_entry = lambda { |entry_path|
    Nokogiri::XML::Document.parse(zipfile.read(entry_path))
  }

  # Stage 1: the OCF container, at a fixed location.
  begin
    docs = { :ocf => parse_entry.call(OCF_PATH) }
  rescue
    raise FailureLoadingOCF
  end

  # Stage 2: the OPF package, located through the container's rootfile.
  begin
    rootfile = docs[:ocf].at_xpath(
      '//ocf:rootfile[@media-type="application/oebps-package+xml"]',
      NAMESPACES[:ocf]
    )
    docs[:opf_path] = rootfile['full-path']
    docs[:opf_root] = File.dirname(docs[:opf_path])
    docs[:opf] = parse_entry.call(docs[:opf_path])
  rescue
    raise FailureLoadingOPF
  end

  # Stage 3: the NCX, found via the manifest item named by the spine's
  # toc attribute (falling back to the id 'ncx').
  begin
    toc_id = docs[:opf].at_xpath('//opf:spine', NAMESPACES[:opf])['toc'] || 'ncx'
    ncx_item = docs[:opf].at_xpath(
      "//opf:manifest/opf:item[@id=#{escape_for_xpath(toc_id)}]",
      NAMESPACES[:opf]
    )
    docs[:ncx_path] = from_opf_root(docs[:opf_root], ncx_item['href'])
    docs[:ncx] = parse_entry.call(docs[:ncx_path])
  rescue
    raise FailureLoadingNCX
  end

  docs
end
137
+
138
+
139
# Copies every element child of the OPF <metadata> element into @book as a
# property.
#
# For <meta> elements the property name and value come from the 'name' and
# 'content' attributes; for any other element they are the element name and
# its text content. All remaining attributes are passed along as a hash.
def extract_properties(opf_doc)
  metadata = opf_doc.at_xpath('//opf:metadata', NAMESPACES[:opf])
  metadata.children.select { |child| child.element? }.each do |node|
    if node.name == "meta"
      prop_name, prop_value = node['name'], node['content']
    else
      prop_name, prop_value = node.name, node.content
    end

    extra = {}
    node.attributes.each do |key, attr|
      # 'name' and 'content' are consumed above, so they are not repeated.
      next if ["name", "content"].include?(key)
      extra[key] = attr.value
    end

    @book.add_property(prop_name, prop_value, extra)
  end
end
162
+
163
+
164
# Fills @book with components, resources and guide annotations taken from
# the OPF document.
#
# 1. Every spine itemref that has a matching manifest item becomes a
#    component, with its content read from +zipfile+.
# 2. Every other manifest item (the NCX excepted) becomes a resource.
# 3. Each guide reference whose href equals the src of one of the book's
#    files sets :guide_type and :guide on that file.
def extract_components(zipfile, opf_doc, opf_root)
  manifest = opf_doc.at_xpath('//opf:manifest', NAMESPACES[:opf])
  spine = opf_doc.at_xpath('//opf:spine', NAMESPACES[:opf])

  spine.search('//opf:itemref', NAMESPACES[:opf]).each { |iref|
    id = iref['idref']
    item = manifest.at_xpath(
      "//opf:item[@id=#{escape_for_xpath(id)}]",
      NAMESPACES[:opf]
    )
    next unless item

    href = item['href']
    linear = iref['linear'] != 'no'
    begin
      content = zipfile.read(from_opf_root(opf_root, href))
    rescue
      # The entry may be stored under its percent-decoded name; retry with
      # that. (URI.unescape was removed in Ruby 3.0.)
      href = URI::DEFAULT_PARSER.unescape(href)
      content = zipfile.read(from_opf_root(opf_root, href))
    end
    @book.add_component(
      href,
      content,
      item['media-type'],
      :id => id,
      :linear => linear ? "yes" : "no"
    )
  }

  # Collect component ids once so each manifest item is checked with a hash
  # lookup instead of rescanning the component list.
  component_ids = {}
  @book.components.each { |cmpt| component_ids[cmpt.attributes[:id]] = true }

  manifest.search('//opf:item', NAMESPACES[:opf]).each { |item|
    id = item['id']
    next if item['media-type'] == MIMETYPE_MAP['.ncx']
    next if component_ids.key?(id)
    @book.add_resource(item['href'], item['media-type'], :id => id)
  }

  opf_doc.search("//opf:guide/opf:reference", NAMESPACES[:opf]).each { |ref|
    if it = @book.all_files.detect { |cmpt| cmpt.src == ref['href'] }
      it.attributes[:guide_type] = ref['type']
      it.attributes[:guide] = ref['title']
    end
  }
end
207
+
208
+
209
# Builds the chapter tree of @book from the NCX navMap.
#
# Each navPoint element becomes a Peregrin::Chapter created from its label
# text, its playOrder attribute and the src of its content element; nested
# navPoints become child chapters. +zipfile+ is not used.
def extract_chapters(zipfile, ncx_doc)
  ns = NAMESPACES[:ncx]
  nav_point = lambda { |node| node.element? && node.name == "navPoint" }

  to_chapter = lambda { |point|
    chapter = Peregrin::Chapter.new(
      point.at_xpath('.//ncx:text', ns).content,
      point['playOrder'],
      point.at_xpath('.//ncx:content', ns)['src']
    )
    point.children.each { |child|
      chapter.children.push(to_chapter.call(child)) if nav_point.call(child)
    }
    chapter
  }

  ncx_doc.at_xpath("//ncx:navMap", ns).children.each { |child|
    @book.chapters.push(to_chapter.call(child)) if nav_point.call(child)
  }
end
227
+
228
+
229
# Picks a cover for @book and returns it (nil when none is found).
#
# A candidate file is chosen by the first rule that matches:
#   1. the file whose id equals the 'cover' metadata property;
#   2. a file marked with guide type 'cover';
#   3. a file with the id 'cover-image', 'cover' or 'coverpage';
#   4. the first file of the book.
# If the candidate's media type starts with "image/", it is the cover.
# Otherwise it is parsed as XML and the resource whose src equals the
# src/href of its first <img> (or, failing that, SVG <image>) is used.
def extract_cover(zipfile, docs)
  @book.cover = nil

  by_id = lambda { |wanted|
    @book.all_files.detect { |file| file.attributes[:id] == wanted }
  }

  cover_id = @book.property_for('cover')
  candidate = by_id.call(cover_id) if cover_id
  candidate ||= @book.all_files.detect { |file| file.attributes[:guide_type] == 'cover' }
  candidate ||= by_id.call('cover-image') || by_id.call('cover') || by_id.call('coverpage')
  candidate ||= @book.all_files.first

  return unless candidate

  if candidate.media_type.match(/^image\//)
    @book.cover = candidate
  else
    entry_path = from_opf_root(docs[:opf_root], candidate.src)
    begin
      markup = Nokogiri::XML::Document.parse(zipfile.read(entry_path))
      html_img = markup.at_css('img')
      svg_img = html_img ? nil : markup.at_xpath('//svg:image', NAMESPACES[:svg])
      image_src =
        if html_img
          html_img['src']
        elsif svg_img
          svg_img['href']
        end
      if image_src
        @book.cover = @book.resources.detect { |r| r.src == image_src }
      end
    rescue
      # The candidate is neither an image nor a readable XML document;
      # leave the cover unset.
    end
  end

  @book.cover
end
272
+
273
+
274
+ #---------------------------------------------------------------------------
275
+ # WRITING
276
+ #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
277
+
278
# Runs the given block with @working_dir set to a freshly created directory
# named after +path+ minus its extension (a sibling of +path+).
#
# Any existing directory of that name is removed first. @working_dir is
# reset to nil afterwards; the directory itself is left on disk (the
# cleanup call is commented out). Raises ArgumentError without a block.
def with_working_dir(path)
  raise ArgumentError unless block_given?
  stem = File.basename(path, File.extname(path))
  @working_dir = File.join(File.dirname(path), stem)
  FileUtils.rm_rf(@working_dir)
  FileUtils.mkdir_p(@working_dir)
  yield
ensure
  #FileUtils.rm_rf(@working_dir)
  @working_dir = nil
end
291
+
292
+
293
# Joins @working_dir with the given path segments. Segments may be nested
# arrays; nil entries (including an unset @working_dir) are dropped.
def working_dir(*args)
  segments = [@working_dir, *args].flatten.compact
  File.join(*segments)
end
296
+
297
+
298
# Writes the OCF container document into the working directory. Its single
# rootfile entry points at the OPF file inside OEBPS.
def build_ocf
  rootfile_attrs = {
    "full-path" => "OEBPS/#{OPF}.opf",
    "media-type" => "application/oebps-package+xml"
  }
  build_xml_file(working_dir(OCF_PATH)) do |xml|
    xml.container(:xmlns => NAMESPACES[:ocf]["ocf"], :version => "1.0") do
      xml.rootfiles do
        xml.rootfile(rootfile_attrs)
      end
    end
  end
end
310
+
311
+
312
# Writes the NCX table of contents into OEBPS and remembers the path
# returned by build_xml_file in @ncx_path.
#
# The navMap mirrors @book.chapters recursively; chapters for which
# empty_leaf? is true are left out. navPoint ids are numbered in document
# order.
def build_ncx
  ncx_path = build_xml_file(working_dir(OEBPS, "#{NCX}.ncx")) { |xml|
    xml.ncx('xmlns' => NAMESPACES[:ncx]["ncx"], :version => "2005-1") {
      xml.head {
        xml.meta(:name => "dtb:uid", :content => unique_identifier)
        xml.meta(:name => "dtb:depth", :content => heading_depth)
        xml.meta(:name => "dtb:totalPageCount", :content => "0")
        xml.meta(:name => "dtb:maxPageNumber", :content => "0")
      }
      xml.docTitle {
        # Same fallback as the dc:title written by build_opf, so the two
        # documents agree when the book has no title.
        xml.text_(@book.property_for('title') || 'Untitled')
      }
      xml.navMap {
        counter = 0
        emit_nav_points = lambda { |chapters|
          chapters.each { |chapter|
            next if chapter.empty_leaf?
            xml.navPoint(
              :id => "navPoint#{counter += 1}",
              :playOrder => chapter.position
            ) {
              xml.navLabel { xml.text_(chapter.title) }
              xml.content(:src => chapter.src)
              emit_nav_points.call(chapter.children) if chapter.children.any?
            }
          }
        }
        emit_nav_points.call(@book.chapters)
      }
    }
  }
  @ncx_path = ncx_path
end
344
+
345
+
346
# Writes every component and resource of @book beneath OEBPS in the working
# directory, assigning an :id attribute to any file that lacks one.
#
# Components are parsed as HTML and written out through root_to_xhtml;
# resources are copied with @book.copy_resource_to.
def write_components
  # Linear components.
  @book.components.each { |cmpt|
    cmpt.attributes[:id] ||= File.basename(cmpt.src, File.extname(cmpt.src))

    doc = Nokogiri::HTML::Document.parse(cmpt.contents)
    html = root_to_xhtml(doc.root)
    dest_path = working_dir(OEBPS, cmpt.src)
    # A component src may contain sub-directories; create them first, as is
    # done for resources below.
    FileUtils.mkdir_p(File.dirname(dest_path))
    File.open(dest_path, 'w') { |f| f.write(html) }
  }

  # Other components (@book.resources)
  @book.resources.each { |res|
    # Derive an id from the path: non-word runs become '-', leading dashes
    # are dropped, and a leading digit is prefixed with 'a-'.
    res.attributes[:id] ||= (
      "#{File.dirname(res.src)}-#{File.basename(res.src)}"
    ).gsub(/[^\w]+/, '-').gsub(/^-+/, '').gsub(/^(\d)/, 'a-\1')

    dest_path = working_dir(OEBPS, res.src)
    FileUtils.mkdir_p(File.dirname(dest_path))
    @book.copy_resource_to(res, dest_path)
  }
end
367
+
368
+
369
# Writes the OPF package document into OEBPS.
#
# metadata: title, identifier and language (with defaults), the remaining
#           Dublin Core properties (one element per line of the property
#           value) and a cover <meta> when the book has a cover.
# manifest: every component (always declared as XHTML), every resource and
#           the NCX.
# spine:    every component, in order.
# guide:    one reference per component carrying a :guide attribute. The
#           element is omitted when there are none, because OPF 2.0 does
#           not allow an empty <guide>.
def build_opf
  build_xml_file(working_dir(OEBPS, "#{OPF}.opf")) { |xml|
    xml.package(
      'xmlns' => "http://www.idpf.org/2007/opf",
      'xmlns:dc' => "http://purl.org/dc/elements/1.1/",
      'version' => "2.0",
      'unique-identifier' => 'bookid'
    ) {
      xml.metadata {
        xml['dc'].title(@book.property_for('title') || 'Untitled')
        xml['dc'].identifier(unique_identifier, :id => 'bookid')
        xml['dc'].language(@book.property_for('language') || 'en')
        [
          'creator',
          'subject',
          'description',
          'publisher',
          'contributor',
          'date',
          'source',
          'relation',
          'coverage',
          'rights'
        ].each { |dc|
          if val = @book.property_for(dc)
            val.split(/\n/).each { |v| xml['dc'].send(dc, v) }
          end
        }
        if @book.cover
          cover_id = @book.cover.attributes[:id] || "cover"
          xml.meta(:name => "cover", :content => cover_id)
        end
      }
      xml.manifest {
        @book.components.each { |item|
          xml.item(
            'id' => item.attributes[:id],
            'href' => item.src,
            'media-type' => MIMETYPE_MAP['.xhtml']
          )
        }
        @book.resources.each { |item|
          xml.item(
            'id' => item.attributes[:id],
            'href' => item.src,
            'media-type' => item.media_type
          )
        }
        xml.item(
          'id' => NCX,
          'href' => @ncx_path,
          'media-type' => MIMETYPE_MAP['.ncx']
        )
      }
      xml.spine(:toc => NCX) {
        @book.components.each { |item|
          xml.itemref(
            :idref => item.attributes[:id],
            :linear => item.attributes[:linear] || 'yes'
          )
        }
      }
      guide_items = @book.components.select { |it| it.attributes[:guide] }
      unless guide_items.empty?
        xml.guide {
          guide_items.each { |guide_item|
            xml.reference(
              :type => (
                guide_item.attributes[:guide_type] ||
                guide_item.attributes[:id]
              ),
              :title => guide_item.attributes[:guide],
              :href => guide_item.src
            )
          }
        }
      end
    }
  }
end
449
+
450
+
451
# Packs the working directory into +filename+, placed next to the working
# directory, and returns the archive path.
#
# The 'mimetype' file is written and added first, uncompressed (-0);
# everything else is then added recursively at maximum compression. Any
# existing archive at the destination is removed beforehand.
#
# The zip commands are run without a shell (argument-list form of system),
# so file names containing spaces or shell metacharacters are passed
# through verbatim. As before, a failing zip command is not reported.
def zip_it_up(filename)
  path = working_dir("..", filename)
  File.open(working_dir("mimetype"), 'w') { |f|
    f.write(MIMETYPE_MAP['.epub'])
  }
  File.unlink(path) if File.exist?(path)

  target = File.join("..", filename)
  # Equivalent of the shell's `*`: top-level entries, hidden ones excluded.
  entries = Dir.entries(working_dir).reject { |entry| entry.start_with?('.') }.sort
  system("zip", "-0Xq", target, "mimetype", :chdir => working_dir) &&
    system("zip", "-Xr9Dq", target, *entries, :chdir => working_dir)
  path
end
465
+
466
+
467
# Returns the identifier used for this book: its 'bookid' property if it
# has one, otherwise a random 12-character string. The value is memoized in
# @uid.
def unique_identifier
  return @uid if @uid
  @uid = @book.property_for('bookid') || random_string(12)
end
470
+
471
+
472
# Returns a random string of +len+ lowercase hexadecimal characters.
#
# Uses SecureRandom rather than hashing the clock and process id, which
# also lifts the old limit of 40 characters (the length of a SHA1 digest).
def random_string(len)
  require 'securerandom'
  # hex(n) yields 2n characters; round up for odd lengths, then trim.
  SecureRandom.hex((len + 1) / 2)[0, len]
end
482
+
483
+
484
# Returns the number of levels in the deepest branch of the chapter tree
# (0 when the book has no chapters).
def heading_depth
  depth_of = lambda { |chapters|
    chapters.inject(0) { |deepest, chapter|
      below = chapter.children.any? ? depth_of.call(chapter.children) : 0
      [deepest, below + 1].max
    }
  }
  depth_of.call(@book.chapters)
end
498
+
499
+
500
# Builds an XML document with Nokogiri's builder (yielded to the block) and
# writes it to +path+ as indented UTF-8, creating parent directories as
# needed.
#
# Returns +path+ with the leading "<working dir>/OEBPS/" stripped, or
# +path+ unchanged when it does not lie inside that directory. Raises
# ArgumentError without a block.
def build_xml_file(path)
  raise ArgumentError unless block_given?
  builder = Nokogiri::XML::Builder.new(:encoding => 'UTF-8') { |xml|
    yield(xml)
  }
  FileUtils.mkdir_p(File.dirname(path))
  File.open(path, 'w') { |f|
    builder.doc.write_xml_to(f, :encoding => 'UTF-8', :indent => 2)
  }
  # Escape the directory before using it in a pattern: a working directory
  # such as "book (1)" or "c++" contains regex metacharacters.
  path.sub(/\A#{Regexp.escape(working_dir(OEBPS))}\//, '')
end
511
+
512
+
513
# Serializes +root+ as XHTML after adapting it for EPUB:
# the 'manifest' and 'xmlns' attributes are removed from the root, and each
# element named in HTML5_TAGNAMES is renamed to <div>, with its former
# tag name appended to its class attribute.
def root_to_xhtml(root)
  root.remove_attribute('manifest')

  selector = HTML5_TAGNAMES.join(', ')
  root.css(selector).each do |node|
    existing = node['class']
    prefix = (existing.nil? || existing.empty?) ? '' : "#{existing} "
    node['class'] = "#{prefix}#{node.name}"
    node.name = "div"
  end

  root.remove_attribute('xmlns')
  root.to_xhtml(:indent => 2, :encoding => root.document.encoding)
end
523
+
524
+
525
# Joins +args+ into a path, prefixed with +opf_root+ unless the root is
# nil, empty or '.' (i.e. the OPF file sits at the top of the archive).
def from_opf_root(opf_root, *args)
  at_top_level = !opf_root || opf_root.empty? || opf_root == '.'
  segments = at_top_level ? args : [opf_root, *args]
  File.join(*segments)
end
532
+
533
+
534
# Returns +str+ as an XPath 1.0 string expression.
#
# XPath 1.0 string literals have no escape mechanism, so:
# * a value without single quotes is wrapped in single quotes;
# * a value with single but no double quotes is wrapped in double quotes;
# * a value containing both is assembled with concat() from
#   single-quoted pieces joined by a double-quoted "'".
# nil is treated as the empty string.
def escape_for_xpath(str)
  str = str.to_s
  return "'#{str}'" unless str.include?("'")
  return "\"#{str}\"" unless str.include?('"')

  pieces = str.split("'", -1).map { |piece| "'#{piece}'" }
  "concat(#{pieces.join(%q{, "'", })})"
end
537
+
538
+
539
# Base class for the errors raised while validating or loading a book.
# Carries the path of the offending file or directory, when one was given.
class ValidationError < ::RuntimeError

  # The path this error refers to, or nil.
  attr_reader :path

  def initialize(path = nil)
    # No message is passed on, so #message stays the class name as before.
    super()
    @path = path
  end

end

# The given path is not a regular file.
class FileNotFound < ValidationError; end
# The file could not be opened as a zip archive.
class NotAZipArchive < ValidationError; end
# The OCF container document could not be read or parsed.
class FailureLoadingOCF < ValidationError; end
# The OPF package document could not be located, read or parsed.
class FailureLoadingOPF < ValidationError; end
# The NCX document could not be located, read or parsed.
class FailureLoadingNCX < ValidationError; end
552
+
553
+ end
@@ -0,0 +1,113 @@
1
+ class Peregrin::Ochook < Peregrin::Zhook
2
+
3
# Name of the format handled by this class.
FORMAT = 'Ochook'

# File name of the cache manifest, relative to the book directory.
MANIFEST_PATH = 'ochook.manifest'
5
+
6
# Checks that the directory at +path+ has the layout of an Ochook.
#
# Raises, in this order:
#   DirectoryNotFound           - +path+ is not a directory
#   MissingIndexHTML            - INDEX_PATH does not exist in it
#   MissingCoverPNG             - COVER_PATH does not exist in it
#   MissingManifest             - MANIFEST_PATH does not exist in it
#   IndexHTMLRootHasId          - the index's root element has an id
#   IndexHTMLRootHasNoManifest  - the root's manifest attribute is not
#                                 MANIFEST_PATH
# Returns nil when every check passes.
def self.validate(path)
  path = path.gsub(/\/$/, '')
  raise DirectoryNotFound.new(path) unless File.directory?(path)
  raise MissingIndexHTML.new(path) unless File.exist?(File.join(path, INDEX_PATH))
  raise MissingCoverPNG.new(path) unless File.exist?(File.join(path, COVER_PATH))
  raise MissingManifest.new(path) unless File.exist?(File.join(path, MANIFEST_PATH))

  doc = Nokogiri::HTML::Document.parse(IO.read(File.join(path, INDEX_PATH)))
  raise IndexHTMLRootHasId.new(path) if doc.root['id']
  # This is a comparison. The previous code assigned the attribute here,
  # which made the check always pass.
  unless doc.root['manifest'] == MANIFEST_PATH
    raise IndexHTMLRootHasNoManifest.new(path)
  end
end
27
+
28
+
29
# Reads the Ochook directory at +path+ into a new instance.
#
# After validation, the index becomes the book's single component and every
# other file beneath +path+ (the manifest excepted) is registered as a
# resource under its path relative to +path+. Resource contents are read
# from disk on demand.
def self.read(path)
  path = path.gsub(/\/$/, '')
  validate(path)
  book = Peregrin::Book.new
  book.add_component(INDEX_PATH, IO.read(File.join(path, INDEX_PATH)))

  excluded = [INDEX_PATH, MANIFEST_PATH]
  # Escape the directory so that regex metacharacters in it (e.g. "(1)",
  # "+") are matched literally when stripping the prefix.
  prefix = /\A#{Regexp.escape(path)}\//
  Dir.glob(File.join(path, '**', '*')).each { |fpath|
    mpath = fpath.sub(prefix, '')
    unless File.directory?(fpath) || excluded.include?(mpath)
      book.add_resource(mpath)
    end
  }
  book.read_resource_proc = lambda { |resource|
    IO.read(File.join(path, resource.src))
  }
  extract_properties_from_index(book)
  new(book)
end
47
+
48
+
49
# Initializes through the superclass, then stamps the manifest attribute
# onto the index's root element.
def initialize(book)
  super(book)
  insert_manifest_attribute
end
53
+
54
+
55
# Writes the book into the directory +dir+, replacing it if it exists.
#
# Output: the index (through htmlize), every resource, the cover converted
# with to_png_data, and finally the cache manifest.
def write(dir)
  FileUtils.rm_rf(dir) if File.directory?(dir)
  FileUtils.mkdir_p(dir)

  # Index
  index_path = File.join(dir, INDEX_PATH)
  File.open(index_path, 'w') { |f| f << htmlize(index) }

  # Resources
  @book.resources.each { |resource|
    full_path = File.join(dir, resource.src)
    FileUtils.mkdir_p(File.dirname(full_path))
    # Resources can be binary (images, fonts), so write them in binary
    # mode, as is done for the cover below.
    File.open(full_path, 'wb') { |f| f << @book.read_resource(resource) }
  }

  # Cover
  # NOTE(review): this compares @book.cover with a path string; if cover is
  # a resource object the condition is always true -- confirm the intent.
  unless @book.cover == COVER_PATH
    cover_path = File.join(dir, COVER_PATH)
    File.open(cover_path, 'wb') { |f| f << to_png_data(@book.cover) }
    unless @book.resources.detect { |r| r.src == COVER_PATH }
      @book.add_resource(COVER_PATH)
    end
  end

  # Manifest
  manifest_path = File.join(dir, MANIFEST_PATH)
  File.open(manifest_path, 'w') { |f| f << manifest.join("\n") }
end
83
+
84
+
85
# Strips the manifest attribute from the index root, then defers to the
# superclass implementation with the same +options+.
def to_book(options = {})
  remove_manifest_attribute
  super
end
89
+
90
+
91
+ protected
92
+
93
# Returns the lines of the cache manifest: a fixed header that allows all
# network access, followed by the index and the src of every resource.
def manifest
  lines = ["CACHE MANIFEST", "", "NETWORK:", "*", "", "CACHE:", INDEX_PATH]
  lines.concat(@book.resources.map { |resource| resource.src })
end
97
+
98
+
99
# Sets manifest="<MANIFEST_PATH>" on the index's <html> element.
def insert_manifest_attribute
  html_root = index.at_xpath('/html')
  html_root.set_attribute('manifest', MANIFEST_PATH)
end
102
+
103
+
104
# Removes the manifest attribute from the index's <html> element.
def remove_manifest_attribute
  html_root = index.at_xpath('/html')
  html_root.remove_attribute('manifest')
end
107
+
108
+
109
# Raised by validate when the given path is not a directory.
class DirectoryNotFound < ValidationError
end

# Raised by validate when the directory has no manifest file.
class MissingManifest < ValidationError
end

# Raised by validate when the index root does not reference the manifest.
class IndexHTMLRootHasNoManifest < ValidationError
end
112
+
113
+ end