peregrin 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,553 @@
1
+ class Peregrin::Epub
2
+
3
# Name of the format handled by this class.
FORMAT = "EPUB"

# Namespace bindings (prefix => URI) grouped by the kind of document they are
# used with. Each inner hash is passed as the namespace argument of the XPath
# queries made in this class.
NAMESPACES = {
  :ocf => { 'ocf' => 'urn:oasis:names:tc:opendocument:xmlns:container' },
  :opf => { 'opf' => 'http://www.idpf.org/2007/opf' },
  :dc => { 'dc' => 'http://purl.org/dc/elements/1.1/' },
  :ncx => { 'ncx' => 'http://www.daisy.org/z3986/2005/ncx/' },
  :svg => { 'svg' => 'http://www.w3.org/2000/svg' }
}

# Path of the OCF container document inside the archive / working directory.
OCF_PATH = "META-INF/container.xml"

# Element names that root_to_xhtml renames to <div> when writing components.
HTML5_TAGNAMES = %w[section nav article aside hgroup header footer figure figcaption] # FIXME: Which to divify? Which to leave as-is?

# File extension => media type. The '.odt' entry used to be listed twice with
# the same value; the duplicate key has been dropped.
MIMETYPE_MAP = {
  '.xhtml' => 'application/xhtml+xml',
  '.odt' => 'application/x-dtbook+xml',
  '.ncx' => 'application/x-dtbncx+xml',
  '.epub' => 'application/epub+zip'
}

# Directory (inside the working directory) that receives the package files.
OEBPS = "OEBPS"

# Base file names (without extension) of the generated NCX and OPF documents.
# NCX is also used as the manifest id of the NCX item and as the spine's toc.
NCX = 'content'
OPF = 'content'
24
+
25
+
26
# Checks that +path+ is a zip archive whose configuration documents can be
# loaded by load_config_documents.
#
# Raises FileNotFound when +path+ is not a regular file and NotAZipArchive
# when the archive cannot be opened. Any error raised while loading is
# re-raised as a new instance of the same exception class built from +path+.
# The archive, if it was opened, is always closed.
def self.validate(path)
  raise FileNotFound.new(path) unless File.file?(path)

  archive =
    begin
      Zip::Archive.open(path)
    rescue
      raise NotAZipArchive.new(path)
    end

  begin
    blank_book = Peregrin::Book.new
    new(blank_book).send(:load_config_documents, archive)
  rescue => failure
    raise failure.class.new(path)
  end
ensure
  archive.close if archive
end
44
+
45
+
46
# Creates an instance backed by a new Peregrin::Book and loads the archive at
# +path+ into it.
def self.read(path)
  new(Peregrin::Book.new, path)
end
50
+
51
+
52
# Wraps +book+. When +epub_path+ is given, the archive at that path is loaded
# into the book immediately.
def initialize(book, epub_path = nil)
  @book = book
  load_from_path(epub_path) if epub_path
end
58
+
59
+
60
# Writes the book as an archive named after +path+. The individual documents
# are first generated inside a working directory (see with_working_dir) and
# then zipped; the order of the steps matters because build_opf uses the NCX
# path recorded by build_ncx and the ids assigned by write_components.
def write(path)
  with_working_dir(path) do
    build_ocf
    build_ncx
    write_components
    build_opf
    archive_name = File.basename(path)
    zip_it_up(archive_name)
  end
end
69
+
70
+
71
# Returns a deep clone of the wrapped book. +options+ is accepted but unused.
def to_book(options = {})
  @book.deep_clone
end
74
+
75
+
76
+ protected
77
+
78
+ #---------------------------------------------------------------------------
79
+ # READING
80
+ #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
81
+
82
# Reads the archive at +epub_path+ into @book: properties, components,
# chapters and cover are extracted while the archive is open, and a proc is
# installed on the book that re-opens the archive to read a resource on
# demand.
def load_from_path(epub_path)
  docs = nil
  Zip::Archive.open(epub_path) { |zipfile|
    docs = load_config_documents(zipfile)
    extract_properties(docs[:opf])
    extract_components(zipfile, docs[:opf], docs[:opf_root])
    extract_chapters(zipfile, docs[:ncx])
    extract_cover(zipfile, docs)
  }
  @book.read_resource_proc = lambda { |resource|
    media_path = from_opf_root(docs[:opf_root], resource.src)
    # URI.unescape was removed in Ruby 3.0; the parser object still provides
    # the same percent-decoding.
    media_path = URI::DEFAULT_PARSER.unescape(media_path)
    Zip::Archive.open(epub_path) { |zipfile| zipfile.read(media_path) }
  }
end
97
+
98
+
99
# Parses the three configuration documents out of +zipfile+.
#
# Returns a hash with the keys :ocf, :opf_path, :opf_root, :opf, :ncx_path
# and :ncx. Raises FailureLoadingOCF, FailureLoadingOPF or FailureLoadingNCX
# when the corresponding step fails for any reason.
def load_config_documents(zipfile)
  docs = {}

  # The OCF file.
  begin
    docs[:ocf] = Nokogiri::XML::Document.parse(zipfile.read(OCF_PATH))
  rescue
    raise FailureLoadingOCF
  end

  # The OPF file, located through the OCF rootfile entry.
  begin
    rootfile = docs[:ocf].at_xpath(
      '//ocf:rootfile[@media-type="application/oebps-package+xml"]',
      NAMESPACES[:ocf]
    )
    docs[:opf_path] = rootfile['full-path']
    docs[:opf_root] = File.dirname(docs[:opf_path])
    docs[:opf] = Nokogiri::XML::Document.parse(zipfile.read(docs[:opf_path]))
  rescue
    raise FailureLoadingOPF
  end

  # The NCX file, located through the spine's toc attribute (or the id 'ncx'
  # when the spine does not name one).
  begin
    spine = docs[:opf].at_xpath('//opf:spine', NAMESPACES[:opf])
    ncx_id = spine['toc'] || 'ncx'
    item = docs[:opf].at_xpath(
      "//opf:manifest/opf:item[@id=#{escape_for_xpath(ncx_id)}]",
      NAMESPACES[:opf]
    )

    docs[:ncx_path] = from_opf_root(docs[:opf_root], item['href'])
    docs[:ncx] = Nokogiri::XML::Document.parse(zipfile.read(docs[:ncx_path]))
  rescue
    raise FailureLoadingNCX
  end

  docs
end
137
+
138
+
139
# Adds one property to @book for every element child of the OPF metadata
# element. For <meta> elements the name/content attributes supply the
# property; for any other element the tag name and text content do. All
# remaining attributes (except "name" and "content") are passed along.
def extract_properties(opf_doc)
  metadata = opf_doc.at_xpath('//opf:metadata', NAMESPACES[:opf])
  metadata.children.select { |child| child.element? }.each do |elem|
    name, content =
      if elem.name == "meta"
        [elem['name'], elem['content']]
      else
        [elem.name, elem.content]
      end

    atts = elem.attributes.each_with_object({}) do |(key, attr), acc|
      acc[key] = attr.value unless ["name", "content"].include?(key)
    end

    @book.add_property(name, content, atts)
  end
end
162
+
163
+
164
# Populates @book from the OPF manifest, spine and guide.
#
# * Every spine itemref that has a matching manifest item becomes a component
#   whose content is read from +zipfile+.
# * Every other manifest item (apart from the NCX) becomes a resource.
# * Guide references are recorded as :guide_type / :guide attributes on the
#   file whose src equals the reference's href.
def extract_components(zipfile, opf_doc, opf_root)
  manifest = opf_doc.at_xpath('//opf:manifest', NAMESPACES[:opf])
  spine = opf_doc.at_xpath('//opf:spine', NAMESPACES[:opf])

  spine.search('//opf:itemref', NAMESPACES[:opf]).each { |iref|
    id = iref['idref']
    item = manifest.at_xpath(
      "//opf:item[@id=#{escape_for_xpath(id)}]",
      NAMESPACES[:opf]
    )
    next unless item

    href = item['href']
    linear = iref['linear'] != 'no'
    begin
      content = zipfile.read(from_opf_root(opf_root, href))
    rescue
      # Retry with the percent-decoded href. URI.unescape was removed in
      # Ruby 3.0; the parser object still provides the same decoding.
      href = URI::DEFAULT_PARSER.unescape(href)
      content = zipfile.read(from_opf_root(opf_root, href))
    end
    @book.add_component(
      href,
      content,
      item['media-type'],
      :id => id,
      :linear => linear ? "yes" : "no"
    )
  }

  manifest.search('//opf:item', NAMESPACES[:opf]).each { |item|
    id = item['id']
    next if item['media-type'] == MIMETYPE_MAP['.ncx']
    # Skip items already added as components above.
    next if @book.components.detect { |cmpt| cmpt.attributes[:id] == id }
    @book.add_resource(item['href'], item['media-type'], :id => id)
  }

  opf_doc.search("//opf:guide/opf:reference", NAMESPACES[:opf]).each { |ref|
    if it = @book.all_files.detect { |cmpt| cmpt.src == ref['href'] }
      it.attributes[:guide_type] = ref['type']
      it.attributes[:guide] = ref['title']
    end
  }
end
207
+
208
+
209
# Builds the chapter tree of @book from the navMap of +ncx_doc+. Each
# navPoint element becomes a Peregrin::Chapter (title, playOrder, content
# src) and nested navPoints become its children. +zipfile+ is not used.
def extract_chapters(zipfile, ncx_doc)
  nav_point = lambda { |node| node.element? && node.name == "navPoint" }

  to_chapter = lambda { |point|
    chapter = Peregrin::Chapter.new(
      point.at_xpath('.//ncx:text', NAMESPACES[:ncx]).content,
      point['playOrder'],
      point.at_xpath('.//ncx:content', NAMESPACES[:ncx])['src']
    )
    point.children.each { |child|
      chapter.children.push(to_chapter.call(child)) if nav_point.call(child)
    }
    chapter
  }

  nav_map = ncx_doc.at_xpath("//ncx:navMap", NAMESPACES[:ncx])
  nav_map.children.each { |child|
    @book.chapters.push(to_chapter.call(child)) if nav_point.call(child)
  }
end
227
+
228
+
229
# Chooses the cover of @book and returns it (nil when none is found).
#
# A candidate file is picked by the first rule that yields one:
#   1. the file whose id equals the 'cover' property;
#   2. the first file whose guide type is 'cover';
#   3. the first file with the id 'cover-image', 'cover' or 'coverpage'
#      (tried in that order);
#   4. the first of all files.
# When the candidate's media type starts with "image/" it is the cover.
# Otherwise it is parsed as XML and the resource whose src equals that of the
# first <img> (or, failing that, the href of the first SVG <image>) is used.
def extract_cover(zipfile, docs)
  @book.cover = nil
  candidate = nil

  # 1. Cover image referenced from metadata
  cover_id = @book.property_for('cover')
  if cover_id
    candidate = @book.all_files.detect { |f| f.attributes[:id] == cover_id }
  end

  # 2. File listed in the guide as 'cover'
  candidate ||= @book.all_files.detect { |f| f.attributes[:guide_type] == 'cover' }

  # 3. A file with the id of 'cover-image', or 'cover', or 'coverpage'.
  %w[cover-image cover coverpage].each do |known_id|
    break if candidate
    candidate = @book.all_files.detect { |f| f.attributes[:id] == known_id }
  end

  # 4. The first file.
  candidate ||= @book.all_files.first

  return unless candidate

  if candidate.media_type.match(/^image\//)
    @book.cover = candidate
  else
    path = from_opf_root(docs[:opf_root], candidate.src)
    begin
      doc = Nokogiri::XML::Document.parse(zipfile.read(path))
      src =
        if img = doc.at_css('img')
          img['src']
        elsif img = doc.at_xpath('//svg:image', NAMESPACES[:svg])
          img['href']
        end
      @book.cover = @book.resources.detect { |r| r.src == src } if src
    rescue
      # The candidate is neither an image nor a readable XML document;
      # leave the cover unset.
    end
  end

  @book.cover
end
272
+
273
+
274
+ #---------------------------------------------------------------------------
275
+ # WRITING
276
+ #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
277
+
278
# Runs the given block with @working_dir set to a freshly emptied directory
# that sits next to +path+ and is named after it without its extension.
# Returns the block's value. @working_dir is reset to nil afterwards; the
# directory itself is left on disk.
#
# Raises ArgumentError when no block is given.
def with_working_dir(path)
  raise ArgumentError unless block_given?
  parent = File.dirname(path)
  stem = File.basename(path, File.extname(path))
  @working_dir = File.join(parent, stem)
  FileUtils.rm_rf(@working_dir)
  FileUtils.mkdir_p(@working_dir)
  yield
ensure
  #FileUtils.rm_rf(@working_dir)
  @working_dir = nil
end
291
+
292
+
293
# Joins @working_dir with the given path segments. Nested arrays are
# flattened and nil entries (including an unset @working_dir) are dropped.
def working_dir(*args)
  segments = [@working_dir, *args].flatten.compact
  File.join(*segments)
end
296
+
297
+
298
# Writes the OCF container document into the working directory. Its single
# rootfile entry points at the OPF document under OEBPS.
def build_ocf
  container_attrs = { :xmlns => NAMESPACES[:ocf]["ocf"], :version => "1.0" }
  rootfile_attrs = {
    "full-path" => "OEBPS/#{OPF}.opf",
    "media-type" => "application/oebps-package+xml"
  }
  build_xml_file(working_dir(OCF_PATH)) do |xml|
    xml.container(container_attrs) do
      xml.rootfiles do
        xml.rootfile(rootfile_attrs)
      end
    end
  end
end
310
+
311
+
312
# Writes the NCX document under OEBPS and remembers the path returned by
# build_xml_file in @ncx_path. The navMap mirrors the chapter tree of @book;
# chapters for which empty_leaf? is true are skipped, and navPoint ids are
# numbered in the order the points are emitted.
def build_ncx
  @ncx_path = build_xml_file(working_dir(OEBPS, "#{NCX}.ncx")) do |xml|
    xml.ncx('xmlns' => NAMESPACES[:ncx]["ncx"], :version => "2005-1") do
      xml.head do
        xml.meta(:name => "dtb:uid", :content => unique_identifier)
        xml.meta(:name => "dtb:depth", :content => heading_depth)
        xml.meta(:name => "dtb:totalPageCount", :content => "0")
        xml.meta(:name => "dtb:maxPageNumber", :content => "0")
      end
      xml.docTitle do
        xml.text_(@book.property_for('title'))
      end
      xml.navMap do
        counter = 0
        emit = lambda do |chapters|
          chapters.each do |chapter|
            next if chapter.empty_leaf?
            counter += 1
            xml.navPoint(:id => "navPoint#{counter}", :playOrder => chapter.position) do
              xml.navLabel { xml.text_(chapter.title) }
              xml.content(:src => chapter.src)
              emit.call(chapter.children) if chapter.children.any?
            end
          end
        end
        emit.call(@book.chapters)
      end
    end
  end
end
344
+
345
+
346
# Writes every component and resource of @book below OEBPS in the working
# directory, assigning an :id attribute to any that lack one.
#
# Components are parsed as HTML and serialised through root_to_xhtml;
# resources are copied by the book itself. The destination directory is
# created for both, so a component whose src contains sub-directories can be
# written (previously only resources had their directory created).
def write_components
  # Linear components.
  @book.components.each { |cmpt|
    cmpt.attributes[:id] ||= File.basename(cmpt.src, File.extname(cmpt.src))

    doc = Nokogiri::HTML::Document.parse(cmpt.contents)
    html = root_to_xhtml(doc.root)
    dest_path = working_dir(OEBPS, cmpt.src)
    FileUtils.mkdir_p(File.dirname(dest_path))
    File.open(dest_path, 'w') { |f| f.write(html) }
  }

  # Other components (@book.resources)
  @book.resources.each { |res|
    # Default id: "<dirname>-<basename>" with runs of non-word characters
    # collapsed to '-', leading dashes removed and 'a-' prepended when the
    # result starts with a digit.
    res.attributes[:id] ||= (
      "#{File.dirname(res.src)}-#{File.basename(res.src)}"
    ).gsub(/[^\w]+/, '-').gsub(/^-+/, '').gsub(/^(\d)/, 'a-\1')

    dest_path = working_dir(OEBPS, res.src)
    FileUtils.mkdir_p(File.dirname(dest_path))
    @book.copy_resource_to(res, dest_path)
  }
end
367
+
368
+
369
# Writes the OPF package document under OEBPS.
#
# * metadata: title, identifier and language are always written (with
#   fallbacks 'Untitled', unique_identifier and 'en'); each optional Dublin
#   Core property found on the book is written once per line of its value;
#   a <meta name="cover"> is added when the book has a cover.
# * manifest: one item per component (always with the XHTML media type), one
#   per resource, plus the NCX document recorded in @ncx_path.
# * spine: one itemref per component.
# * guide: one reference per component that has a :guide attribute.
def build_opf
  package_attrs = {
    'xmlns' => "http://www.idpf.org/2007/opf",
    'xmlns:dc' => "http://purl.org/dc/elements/1.1/",
    'version' => "2.0",
    'unique-identifier' => 'bookid'
  }
  optional_terms = %w[
    creator subject description publisher contributor
    date source relation coverage rights
  ]

  build_xml_file(working_dir(OEBPS, "#{OPF}.opf")) do |xml|
    xml.package(package_attrs) do
      xml.metadata do
        xml['dc'].title(@book.property_for('title') || 'Untitled')
        xml['dc'].identifier(unique_identifier, :id => 'bookid')
        xml['dc'].language(@book.property_for('language') || 'en')
        optional_terms.each do |term|
          value = @book.property_for(term)
          next unless value
          value.split(/\n/).each do |line|
            xml['dc'].send(term, line) if line
          end
        end
        if @book.cover
          xml.meta(:name => "cover", :content => @book.cover.attributes[:id] || "cover")
        end
      end

      xml.manifest do
        @book.components.each do |component|
          xml.item(
            'id' => component.attributes[:id],
            'href' => component.src,
            'media-type' => MIMETYPE_MAP['.xhtml']
          )
        end
        @book.resources.each do |resource|
          xml.item(
            'id' => resource.attributes[:id],
            'href' => resource.src,
            'media-type' => resource.media_type
          )
        end
        xml.item(
          'id' => NCX,
          'href' => @ncx_path,
          'media-type' => MIMETYPE_MAP['.ncx']
        )
      end

      xml.spine(:toc => NCX) do
        @book.components.each do |component|
          xml.itemref(
            :idref => component.attributes[:id],
            :linear => component.attributes[:linear] || 'yes'
          )
        end
      end

      xml.guide do
        @book.components.each do |component|
          title = component.attributes[:guide]
          next unless title
          type = component.attributes[:guide_type] || component.attributes[:id]
          xml.reference(:type => type, :title => title, :href => component.src)
        end
      end
    end
  end
end
449
+
450
+
451
# Zips the working directory into +filename+, placed in the parent of the
# working directory, and returns the archive's path.
#
# The mimetype file is written first and added on its own with no
# compression (-0); everything else is added by a second, recursive pass
# that excludes mimetype so the stored entry is not rewritten. Relies on the
# external `zip` command; its exit status is not checked.
def zip_it_up(filename)
  require 'shellwords'

  path = working_dir("..", filename)
  File.open(working_dir("mimetype"), 'w') { |f|
    f.write(MIMETYPE_MAP['.epub'])
  }
  # File.exists? was removed in Ruby 3.2.
  File.unlink(path) if File.exist?(path)

  # Escape everything interpolated into the shell command so that paths
  # containing spaces or shell metacharacters are passed through intact.
  target = Shellwords.escape("../#{filename}")
  cmd = [
    "cd #{Shellwords.escape(working_dir)}",
    "zip -0Xq #{target} mimetype",
    "zip -Xr9Dq #{target} * -x mimetype"
  ]
  `#{cmd.join(" && ")}`
  path
end
465
+
466
+
467
# Returns the book's identifier: its 'bookid' property when present,
# otherwise a random 12-character string. The value is memoised in @uid.
def unique_identifier
  return @uid if @uid
  @uid = @book.property_for('bookid') || random_string(12)
end
470
+
471
+
472
# Returns a random string of +len+ lowercase hexadecimal characters.
#
# Uses SecureRandom instead of a slice of a SHA1 digest of the current time
# and pid, which could never yield more than 40 characters.
def random_string(len)
  require 'securerandom'
  # Each random byte yields two hex digits; round up and trim to length.
  SecureRandom.hex((len + 1) / 2)[0, len]
end
482
+
483
+
484
# Returns the depth of the deepest chapter in the book's chapter tree:
# 0 when there are no chapters, 1 when no chapter has children, and so on.
def heading_depth
  deepest = lambda { |chapters, level|
    chapters.inject(level) { |best, chp|
      reached = level + 1
      reached = deepest.call(chp.children, reached) if chp.children.any?
      [best, reached].max
    }
  }
  deepest.call(@book.chapters, 0)
end
498
+
499
+
500
# Builds an XML document by yielding a Nokogiri builder to the given block
# and writes it, UTF-8 encoded and indented, to +path+ (creating the parent
# directory when needed).
#
# Returns +path+ with the leading "<working dir>/OEBPS/" removed when it has
# that prefix, otherwise a copy of +path+ unchanged.
# Raises ArgumentError when no block is given.
def build_xml_file(path)
  raise ArgumentError unless block_given?
  builder = Nokogiri::XML::Builder.new(:encoding => 'UTF-8') { |xml|
    yield(xml)
  }
  FileUtils.mkdir_p(File.dirname(path))
  File.open(path, 'w') { |f|
    builder.doc.write_xml_to(f, :encoding => 'UTF-8', :indent => 2)
  }
  # The directory name is escaped so that regex metacharacters in it
  # (".", "+", "(" ...) are matched literally.
  path.sub(/\A#{Regexp.escape(working_dir(OEBPS))}\//, '')
end
511
+
512
+
513
# Serialises +root+ as XHTML after (in place) removing its 'manifest' and
# 'xmlns' attributes and renaming every element listed in HTML5_TAGNAMES to
# <div>, with the original tag name appended to the element's class.
def root_to_xhtml(root)
  root.remove_attribute('manifest')

  selector = HTML5_TAGNAMES.join(', ')
  root.css(selector).each do |elem|
    existing = elem['class']
    prefix = existing.nil? || existing.empty? ? '' : "#{existing} "
    elem['class'] = "#{prefix}#{elem.name}"
    elem.name = "div"
  end

  root.remove_attribute('xmlns')
  root.to_xhtml(:indent => 2, :encoding => root.document.encoding)
end
523
+
524
+
525
# Joins +args+ into a path, prefixed with +opf_root+ unless that is nil,
# empty or '.'.
def from_opf_root(opf_root, *args)
  rootless = !opf_root || opf_root.empty? || opf_root == '.'
  segments = rootless ? args : [opf_root, *args]
  File.join(*segments)
end
532
+
533
+
534
# Returns +str+ as an XPath 1.0 string expression.
#
# XPath 1.0 literals have no escape character, so the quoting depends on the
# content: single quotes when +str+ has no apostrophe, double quotes when it
# has an apostrophe but no double quote, and a concat(...) of single-quoted
# pieces joined by "'" when it contains both.
def escape_for_xpath(str)
  return "'#{str}'" unless str.include?("'")
  return "\"#{str}\"" unless str.include?('"')

  pieces = str.split("'", -1).map { |piece| "'#{piece}'" }
  "concat(#{pieces.join(%q{,"'",})})"
end
537
+
538
+
539
# Base class of the errors raised while validating or loading a book.
# Instances are constructed with the path being processed rather than a
# message, so #message falls back to Ruby's default for exceptions created
# without one.
class ValidationError < ::RuntimeError

  # The path given at construction (nil when none was given).
  attr_reader :path

  def initialize(path = nil)
    @path = path
  end

end

# The given path is not a regular file.
class FileNotFound < ValidationError; end
# The file could not be opened as a zip archive.
class NotAZipArchive < ValidationError; end
# The OCF container document could not be read or parsed.
class FailureLoadingOCF < ValidationError; end
# The OPF package document could not be located, read or parsed.
class FailureLoadingOPF < ValidationError; end
# The NCX document could not be located, read or parsed.
class FailureLoadingNCX < ValidationError; end
552
+
553
+ end
@@ -0,0 +1,113 @@
1
+ class Peregrin::Ochook < Peregrin::Zhook
2
+
3
# Name of the format handled by this class.
FORMAT = 'Ochook'
# File name of the cache manifest inside the book directory.
MANIFEST_PATH = 'ochook.manifest'
5
+
6
# Checks that the directory at +path+ (a trailing slash is ignored) has the
# expected layout. Returns nil when it does.
#
# Raises DirectoryNotFound, MissingIndexHTML, MissingCoverPNG or
# MissingManifest when the directory or one of its required files is absent,
# IndexHTMLRootHasId when the root element of the index has an id attribute,
# and IndexHTMLRootHasNoManifest when the root's manifest attribute is not
# MANIFEST_PATH.
def self.validate(path)
  path = path.gsub(/\/$/, '')
  raise DirectoryNotFound.new(path) unless File.directory?(path)

  # File.exists? was removed in Ruby 3.2.
  unless File.exist?(File.join(path, INDEX_PATH))
    raise MissingIndexHTML.new(path)
  end
  unless File.exist?(File.join(path, COVER_PATH))
    raise MissingCoverPNG.new(path)
  end
  unless File.exist?(File.join(path, MANIFEST_PATH))
    raise MissingManifest.new(path)
  end

  doc = Nokogiri::HTML::Document.parse(IO.read(File.join(path, INDEX_PATH)))
  raise IndexHTMLRootHasId.new(path) if doc.root['id']
  # This must be a comparison: the earlier `=` assigned the attribute, which
  # made the condition always truthy and the check a no-op.
  unless doc.root['manifest'] == MANIFEST_PATH
    raise IndexHTMLRootHasNoManifest.new(path)
  end
end
27
+
28
+
29
# Validates the directory at +path+ (a trailing slash is ignored) and builds
# an instance from it: the index becomes the book's single component and
# every other file below the directory, apart from the manifest, becomes a
# resource that is read from disk on demand.
def self.read(path)
  path = path.gsub(/\/$/, '')
  validate(path)
  book = Peregrin::Book.new
  book.add_component(INDEX_PATH, IO.read(File.join(path, INDEX_PATH)))

  excluded = [INDEX_PATH, MANIFEST_PATH]
  # Escaped so that regex metacharacters in the directory name are matched
  # literally when the prefix is stripped from each globbed path.
  prefix = /^#{Regexp.escape(path)}\//
  Dir.glob(File.join(path, '**', '*')).each { |fpath|
    mpath = fpath.gsub(prefix, '')
    unless File.directory?(fpath) || excluded.include?(mpath)
      book.add_resource(mpath)
    end
  }
  book.read_resource_proc = lambda { |resource|
    IO.read(File.join(path, resource.src))
  }
  extract_properties_from_index(book)
  new(book)
end
47
+
48
+
49
# Initialises the parent class with +book+, then adds the manifest attribute
# to the index's root element.
def initialize(book)
  super(book)
  insert_manifest_attribute
end
53
+
54
+
55
# Writes the book into the directory +dir+, replacing it when it already
# exists: the index, every resource, the cover as COVER_PATH and finally the
# cache manifest.
def write(dir)
  FileUtils.rm_rf(dir) if File.directory?(dir)
  FileUtils.mkdir_p(dir)

  # Index
  index_path = File.join(dir, INDEX_PATH)
  File.open(index_path, 'w') { |f| f << htmlize(index) }

  # Resources. Written in binary mode, like the cover below, so that images
  # and other non-text files are copied byte for byte on every platform.
  @book.resources.each { |resource|
    full_path = File.join(dir, resource.src)
    FileUtils.mkdir_p(File.dirname(full_path))
    File.open(full_path, 'wb') { |f| f << @book.read_resource(resource) }
  }

  # Cover
  # NOTE(review): this compares the book's cover with a path string; confirm
  # what type @book.cover holds.
  unless @book.cover == COVER_PATH
    cover_path = File.join(dir, COVER_PATH)
    File.open(cover_path, 'wb') { |f| f << to_png_data(@book.cover) }
    # Register the generated cover so that it is listed in the manifest.
    unless @book.resources.detect { |r| r.src == COVER_PATH }
      @book.add_resource(COVER_PATH)
    end
  end

  # Manifest
  manifest_path = File.join(dir, MANIFEST_PATH)
  File.open(manifest_path, 'w') { |f| f << manifest.join("\n") }
end
83
+
84
+
85
# Removes the manifest attribute from the index's root element, then returns
# whatever the parent class's to_book returns for +options+.
def to_book(options = {})
  remove_manifest_attribute
  super
end
89
+
90
+
91
+ protected
92
+
93
# Returns the lines of the cache manifest: a fixed header that allows all
# network access, followed by the index and the src of every resource.
def manifest
  header = ["CACHE MANIFEST", "", "NETWORK:", "*", "", "CACHE:", INDEX_PATH]
  header + @book.resources.map { |resource| resource.src }
end
97
+
98
+
99
# Sets the manifest attribute of the index's <html> element to MANIFEST_PATH.
def insert_manifest_attribute
  html = index.at_xpath('/html')
  html.set_attribute('manifest', MANIFEST_PATH)
end
102
+
103
+
104
# Removes the manifest attribute from the index's <html> element.
def remove_manifest_attribute
  html = index.at_xpath('/html')
  html.remove_attribute('manifest')
end
107
+
108
+
109
# Raised by validate when the given path is not a directory.
class DirectoryNotFound < ValidationError
end

# Raised by validate when the directory has no manifest file.
class MissingManifest < ValidationError
end

# Raised by validate for an index whose root element lacks the expected
# manifest attribute.
class IndexHTMLRootHasNoManifest < ValidationError
end
112
+
113
+ end