RubyGems - peregrin - Versions diffs - 1.1.1 - Mend

peregrin 1.1.1

Files changed (24) hide show

data/MIT-LICENSE +20 -0
data/README.md +148 -0
data/bin/peregrin +6 -0
data/lib/formats/epub.rb +553 -0
data/lib/formats/ochook.rb +113 -0
data/lib/formats/zhook.rb +394 -0
data/lib/peregrin/book.rb +87 -0
data/lib/peregrin/chapter.rb +31 -0
data/lib/peregrin/component.rb +12 -0
data/lib/peregrin/componentizer.rb +118 -0
data/lib/peregrin/outliner.rb +204 -0
data/lib/peregrin/property.rb +16 -0
data/lib/peregrin/resource.rb +24 -0
data/lib/peregrin/version.rb +5 -0
data/lib/peregrin/zip_patch.rb +11 -0
data/lib/peregrin.rb +139 -0
data/test/conversion_test.rb +80 -0
data/test/formats/epub_test.rb +159 -0
data/test/formats/ochook_test.rb +104 -0
data/test/formats/zhook_test.rb +219 -0
data/test/test_helper.rb +16 -0
data/test/utils/componentizer_test.rb +78 -0
data/test/utils/outliner_test.rb +49 -0
metadata +135 -0

data/lib/formats/epub.rb ADDED Viewed

@@ -0,0 +1,553 @@
+class Peregrin::Epub
+  FORMAT = "EPUB"
+  NAMESPACES = {
+    :ocf => { 'ocf' => 'urn:oasis:names:tc:opendocument:xmlns:container' },
+    :opf => { 'opf' => 'http://www.idpf.org/2007/opf' },
+    :dc => { 'dc' => 'http://purl.org/dc/elements/1.1/' },
+    :ncx => { 'ncx' => 'http://www.daisy.org/z3986/2005/ncx/' },
+    :svg => { 'svg' => 'http://www.w3.org/2000/svg' }
+  }
+  OCF_PATH = "META-INF/container.xml"
+  HTML5_TAGNAMES = %w[section nav article aside hgroup header footer figure figcaption] # FIXME: Which to divify? Which to leave as-is?
+  MIMETYPE_MAP = {
+    '.xhtml' => 'application/xhtml+xml',
+    '.odt' => 'application/x-dtbook+xml',
+    '.odt' => 'application/x-dtbook+xml',
+    '.ncx' => 'application/x-dtbncx+xml',
+    '.epub' => 'application/epub+zip'
+  }
+  OEBPS = "OEBPS"
+  NCX = 'content'
+  OPF = 'content'
+  def self.validate(path)
+    raise FileNotFound.new(path)  unless File.file?(path)
+    begin
+      zf = Zip::Archive.open(path)
+    rescue => e
+      raise NotAZipArchive.new(path)
+    end
+    begin
+      book = Peregrin::Book.new
+      epub = new(book)
+      epub.send(:load_config_documents, zf)
+    rescue => e
+      raise e.class.new(path)
+    end
+  ensure
+    zf.close  if zf
+  end
+  def self.read(path)
+    book = Peregrin::Book.new
+    new(book, path)
+  end
+  def initialize(book, epub_path = nil)
+    @book = book
+    if epub_path
+      load_from_path(epub_path)
+    end
+  end
+  def write(path)
+    with_working_dir(path) {
+      build_ocf
+      build_ncx
+      write_components
+      build_opf
+      zip_it_up(File.basename(path))
+    }
+  end
+  def to_book(options = {})
+    bk = @book.deep_clone
+  end
+  protected
+    #---------------------------------------------------------------------------
+    # READING
+    #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+    def load_from_path(epub_path)
+      docs = nil
+      Zip::Archive.open(epub_path) { |zipfile|
+        docs = load_config_documents(zipfile)
+        extract_properties(docs[:opf])
+        extract_components(zipfile, docs[:opf], docs[:opf_root])
+        extract_chapters(zipfile, docs[:ncx])
+        extract_cover(zipfile, docs)
+      }
+      @book.read_resource_proc = lambda { |resource|
+        media_path = from_opf_root(docs[:opf_root], resource.src)
+        media_path = URI.unescape(media_path)
+        Zip::Archive.open(epub_path) { |zipfile| zipfile.read(media_path) }
+      }
+    end
+    def load_config_documents(zipfile)
+      # The OCF file.
+      begin
+        docs = { :ocf => Nokogiri::XML::Document.parse(zipfile.read(OCF_PATH)) }
+      rescue
+        raise FailureLoadingOCF
+      end
+      # The OPF file.
+      begin
+        docs[:opf_path] = docs[:ocf].at_xpath(
+          '//ocf:rootfile[@media-type="application/oebps-package+xml"]',
+          NAMESPACES[:ocf]
+        )['full-path']
+        docs[:opf_root] = File.dirname(docs[:opf_path])
+        docs[:opf] = Nokogiri::XML::Document.parse(zipfile.read(docs[:opf_path]))
+      rescue
+        raise FailureLoadingOPF
+      end
+      # The NCX file.
+      begin
+        spine = docs[:opf].at_xpath('//opf:spine', NAMESPACES[:opf])
+        ncx_id = spine['toc'] ? spine['toc'] : 'ncx'
+        item = docs[:opf].at_xpath(
+          "//opf:manifest/opf:item[@id=#{escape_for_xpath(ncx_id)}]",
+          NAMESPACES[:opf]
+        )
+        docs[:ncx_path] = from_opf_root(docs[:opf_root], item['href'])
+        ncx_content = zipfile.read(docs[:ncx_path])
+        docs[:ncx] = Nokogiri::XML::Document.parse(ncx_content)
+      rescue => e
+        raise FailureLoadingNCX
+      end
+      docs
+    end
+    def extract_properties(opf_doc)
+      meta_elems = opf_doc.at_xpath(
+        '//opf:metadata',
+        NAMESPACES[:opf]
+      ).children.select { |ch|
+        ch.element?
+      }
+      meta_elems.each { |elem|
+        if elem.name == "meta"
+          name = elem['name']
+          content = elem['content']
+        else
+          name = elem.name
+          content = elem.content
+        end
+        atts = elem.attributes.inject({}) { |acc, pair|
+          key, attr = pair
+          acc[key] = attr.value  unless ["name", "content"].include?(key)
+          acc
+        }
+        @book.add_property(name, content, atts)
+      }
+    end
+    def extract_components(zipfile, opf_doc, opf_root)
+      ids = {}
+      manifest = opf_doc.at_xpath('//opf:manifest', NAMESPACES[:opf])
+      spine = opf_doc.at_xpath('//opf:spine', NAMESPACES[:opf])
+      spine.search('//opf:itemref', NAMESPACES[:opf]).each { |iref|
+        id = iref['idref']
+        if item = manifest.at_xpath(
+          "//opf:item[@id=#{escape_for_xpath(id)}]",
+          NAMESPACES[:opf]
+        )
+          href = item['href']
+          linear = iref['linear'] != 'no'
+          begin
+            content = zipfile.read(from_opf_root(opf_root, href))
+          rescue
+            href = URI.unescape(href)
+            content = zipfile.read(from_opf_root(opf_root, href))
+          end
+          @book.add_component(
+            href,
+            content,
+            item['media-type'],
+            :id => id,
+            :linear => linear ? "yes" : "no"
+          )
+        end
+      }
+      manifest.search('//opf:item', NAMESPACES[:opf]).each { |item|
+        id = item['id']
+        next  if item['media-type'] == MIMETYPE_MAP['.ncx']
+        next  if @book.components.detect { |cmpt| cmpt.attributes[:id] == id }
+        @book.add_resource(item['href'], item['media-type'], :id => id)
+      }
+      opf_doc.search("//opf:guide/opf:reference", NAMESPACES[:opf]).each { |ref|
+        if it = @book.all_files.detect { |cmpt| cmpt.src == ref['href'] }
+          it.attributes[:guide_type] = ref['type']
+          it.attributes[:guide] = ref['title']
+        end
+      }
+    end
+    def extract_chapters(zipfile, ncx_doc)
+      curse = lambda { |point|
+        chp = Peregrin::Chapter.new(
+          point.at_xpath('.//ncx:text', NAMESPACES[:ncx]).content,
+          point['playOrder'],
+          point.at_xpath('.//ncx:content', NAMESPACES[:ncx])['src']
+        )
+        point.children.each { |pt|
+          next  unless pt.element? && pt.name == "navPoint"
+          chp.children.push(curse.call(pt))
+        }
+        chp
+      }
+      ncx_doc.at_xpath("//ncx:navMap", NAMESPACES[:ncx]).children.each { |pt|
+        next  unless pt.element? && pt.name == "navPoint"
+        @book.chapters.push(curse.call(pt))
+      }
+    end
+    def extract_cover(zipfile, docs)
+      @book.cover = nil
+      # 1. Cover image referenced from metadata
+      if id = @book.property_for('cover')
+        res = @book.all_files.detect { |r| r.attributes[:id] == id }
+      end
+      # 2. First image in a component listed in the guide as 'cover'
+      res ||= @book.all_files.detect { |r| r.attributes[:guide_type] == 'cover' }
+      # 3. A component with the id of 'cover-image', or 'cover', or 'coverpage'.
+      ['cover-image', 'cover', 'coverpage'].each { |cvr_id|
+        res ||= @book.all_files.detect { |r| r.attributes[:id] == cvr_id }
+      }
+      # 4. First image in first component.
+      res ||= @book.all_files.first
+      return  unless res
+      if res.media_type.match(/^image\//)
+        @book.cover = res
+      else
+        path = from_opf_root(docs[:opf_root], res.src)
+        begin
+          doc = Nokogiri::XML::Document.parse(zipfile.read(path))
+          src = nil
+          if img = doc.at_css('img')
+            src = img['src']
+          elsif img = doc.at_xpath('//svg:image', NAMESPACES[:svg])
+            src = img['href']
+          end
+          if src
+            @book.cover = @book.resources.detect { |r| r.src == src }
+          end
+        rescue
+          #puts "Cover component is not an image or an XML document."
+        end
+      end
+      @book.cover
+    end
+    #---------------------------------------------------------------------------
+    # WRITING
+    #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+    def with_working_dir(path)
+      raise ArgumentError  unless block_given?
+      @working_dir = File.join(
+        File.dirname(path),
+        File.basename(path, File.extname(path))
+      )
+      FileUtils.rm_rf(@working_dir)
+      FileUtils.mkdir_p(@working_dir)
+      yield
+    ensure
+      #FileUtils.rm_rf(@working_dir)
+      @working_dir = nil
+    end
+    def working_dir(*args)
+      File.join(*([@working_dir, args].flatten.compact))
+    end
+    def build_ocf
+      build_xml_file(working_dir(OCF_PATH)) { |xml|
+        xml.container(:xmlns => NAMESPACES[:ocf]["ocf"], :version => "1.0") {
+          xml.rootfiles {
+            xml.rootfile(
+              "full-path" => "OEBPS/#{OPF}.opf",
+              "media-type" => "application/oebps-package+xml"
+            )
+          }
+        }
+      }
+    end
+    def build_ncx
+      ncx_path = build_xml_file(working_dir(OEBPS, "#{NCX}.ncx")) { |xml|
+        xml.ncx('xmlns' => NAMESPACES[:ncx]["ncx"], :version => "2005-1") {
+          xml.head {
+            xml.meta(:name => "dtb:uid", :content => unique_identifier)
+            xml.meta(:name => "dtb:depth", :content => heading_depth)
+            xml.meta(:name => "dtb:totalPageCount", :content => "0")
+            xml.meta(:name => "dtb:maxPageNumber", :content => "0")
+          }
+          xml.docTitle {
+            xml.text_(@book.property_for('title'))
+          }
+          xml.navMap {
+            i = 0
+            curse = lambda { |children|
+              children.each { |chapter|
+                xml.navPoint(
+                  :id => "navPoint#{i+=1}",
+                  :playOrder => chapter.position
+                ) {
+                  xml.navLabel { xml.text_(chapter.title) }
+                  xml.content(:src => chapter.src)
+                  curse.call(chapter.children)  if chapter.children.any?
+                }  unless chapter.empty_leaf?
+              }
+            }
+            curse.call(@book.chapters)
+          }
+        }
+      }
+      @ncx_path = ncx_path
+    end
+    def write_components
+      # Linear components.
+      @book.components.each { |cmpt|
+        cmpt.attributes[:id] ||= File.basename(cmpt.src, File.extname(cmpt.src))
+        doc = Nokogiri::HTML::Document.parse(cmpt.contents)
+        html = root_to_xhtml(doc.root)
+        File.open(working_dir(OEBPS, cmpt.src), 'w') { |f| f.write(html) }
+      }
+      # Other components (@book.resources)
+      @book.resources.each { |res|
+        res.attributes[:id] ||= (
+          "#{File.dirname(res.src)}-#{File.basename(res.src)}"
+        ).gsub(/[^\w]+/, '-').gsub(/^-+/, '').gsub(/^(\d)/, 'a-\1')
+        dest_path = working_dir(OEBPS, res.src)
+        FileUtils.mkdir_p(File.dirname(dest_path))
+        @book.copy_resource_to(res, dest_path)
+      }
+    end
+    def build_opf
+      build_xml_file(working_dir(OEBPS, "#{OPF}.opf")) { |xml|
+        xml.package(
+          'xmlns' => "http://www.idpf.org/2007/opf",
+          'xmlns:dc' => "http://purl.org/dc/elements/1.1/",
+          'version' => "2.0",
+          'unique-identifier' => 'bookid'
+        ) {
+          xml.metadata {
+            xml['dc'].title(@book.property_for('title') || 'Untitled')
+            xml['dc'].identifier(unique_identifier, :id => 'bookid')
+            xml['dc'].language(@book.property_for('language') || 'en')
+            [
+              'creator',
+              'subject',
+              'description',
+              'publisher',
+              'contributor',
+              'date',
+              'source',
+              'relation',
+              'coverage',
+              'rights'
+            ].each { |dc|
+              if val = @book.property_for(dc)
+                val.split(/\n/).each { |v|
+                  xml['dc'].send(dc, v)  if v
+                }
+              end
+            }
+            if @book.cover
+              cover_id = @book.cover.attributes[:id] || "cover"
+              xml.meta(:name => "cover", :content => cover_id)
+            end
+          }
+          xml.manifest {
+            @book.components.each { |item|
+              xml.item(
+                'id' => item.attributes[:id],
+                'href' => item.src,
+                'media-type' => MIMETYPE_MAP['.xhtml']
+              )
+            }
+            @book.resources.each { |item|
+              xml.item(
+                'id' => item.attributes[:id],
+                'href' => item.src,
+                'media-type' => item.media_type
+              )
+            }
+            xml.item(
+              'id' => NCX,
+              'href' => @ncx_path,
+              'media-type' => MIMETYPE_MAP['.ncx']
+            )
+          }
+          xml.spine(:toc => NCX) {
+            @book.components.each { |item|
+              xml.itemref(
+                :idref => item.attributes[:id],
+                :linear => item.attributes[:linear] || 'yes'
+              )
+            }
+          }
+          xml.guide {
+            guide_items = @book.components.select { |it| it.attributes[:guide] }
+            guide_items.each { |guide_item|
+              xml.reference(
+                :type => (
+                  guide_item.attributes[:guide_type] ||
+                  guide_item.attributes[:id]
+                ),
+                :title => guide_item.attributes[:guide],
+                :href => guide_item.src
+              )
+            }
+          }
+        }
+      }
+    end
+    def zip_it_up(filename)
+      path = working_dir("..", filename)
+      File.open(working_dir("mimetype"), 'w') { |f|
+        f.write(MIMETYPE_MAP['.epub'])
+      }
+      File.unlink(path)  if File.exists?(path)
+      cmd = [
+        "cd #{working_dir}",
+        "zip -0Xq ../#{filename} mimetype",
+        "zip -Xr9Dq ../#{filename} *"
+      ]
+      `#{cmd.join(" && ")}`
+      path
+    end
+    def unique_identifier
+      @uid ||= @book.property_for('bookid') || random_string(12)
+    end
+    def random_string(len)
+      require 'digest/sha1'
+      s = Digest::SHA1.new
+      s << Time.now.to_s
+      s << String(Time.now.usec)
+      s << String(rand(0))
+      s << String($$)
+      str = s.hexdigest
+      str.slice(rand(str.size - len), len)
+    end
+    def heading_depth
+      max = 0
+      curr = 0
+      curse = lambda { |children|
+        children.each { |chp|
+          curr += 1
+          max = [curr, max].max
+          curse.call(chp.children)  if chp.children.any?
+          curr -= 1
+        }
+      }
+      curse.call(@book.chapters)
+      max
+    end
+    def build_xml_file(path)
+      raise ArgumentError  unless block_given?
+      builder = Nokogiri::XML::Builder.new(:encoding => 'UTF-8') { |xml|
+        yield(xml)
+      }
+      FileUtils.mkdir_p(File.dirname(path))
+      File.open(path, 'w') { |f|
+        builder.doc.write_xml_to(f, :encoding => 'UTF-8', :indent => 2)
+      }
+      path.gsub(/^#{working_dir(OEBPS)}\//, '')
+    end
+    def root_to_xhtml(root)
+      root.remove_attribute('manifest')
+      root.css(HTML5_TAGNAMES.join(', ')).each { |elem|
+        k = elem['class']
+        elem['class'] = "#{k.nil? || k.empty? ? '' : "#{k} " }#{elem.name}"
+        elem.name = "div"
+      }
+      root.remove_attribute('xmlns')
+      root.to_xhtml(:indent => 2, :encoding => root.document.encoding)
+    end
+    def from_opf_root(opf_root, *args)
+      if opf_root && !opf_root.empty? && opf_root != '.'
+        File.join(opf_root, *args)
+      else
+        File.join(*args)
+      end
+    end
+    def escape_for_xpath(str)
+      str.index("'") ? '"'+str+'"' : "'#{str}'"
+    end
+  class ValidationError < ::RuntimeError
+    def initialize(path = nil)
+      @path = path
+    end
+  end
+  class FileNotFound < ValidationError; end
+  class NotAZipArchive < ValidationError; end
+  class FailureLoadingOCF < ValidationError; end
+  class FailureLoadingOPF < ValidationError; end
+  class FailureLoadingNCX < ValidationError; end
+end

data/lib/formats/ochook.rb ADDED Viewed

@@ -0,0 +1,113 @@
+class Peregrin::Ochook < Peregrin::Zhook
+  FORMAT = "Ochook"
+  MANIFEST_PATH = "ochook.manifest"
+  def self.validate(path)
+    path = path.gsub(/\/$/, '')
+    unless File.directory?(path)
+      raise DirectoryNotFound.new(path)
+    end
+    unless File.exists?(File.join(path, INDEX_PATH))
+      raise MissingIndexHTML.new(path)
+    end
+    unless File.exists?(File.join(path, COVER_PATH))
+      raise MissingCoverPNG.new(path)
+    end
+    unless File.exists?(File.join(path, MANIFEST_PATH))
+      raise MissingManifest.new(path)
+    end
+    doc = Nokogiri::HTML::Document.parse(IO.read(File.join(path, INDEX_PATH)))
+    raise IndexHTMLRootHasId.new(path)  if doc.root['id']
+    unless doc.root['manifest'] = MANIFEST_PATH
+      raise IndexHTMLRootHasNoManifest.new(path)
+    end
+  end
+  def self.read(path)
+    path = path.gsub(/\/$/, '')
+    validate(path)
+    book = Peregrin::Book.new
+    book.add_component(INDEX_PATH, IO.read(File.join(path, INDEX_PATH)))
+    Dir.glob(File.join(path, '**', '*')).each { |fpath|
+      ex = [INDEX_PATH, MANIFEST_PATH]
+      mpath = fpath.gsub(/^#{path}\//,'')
+      unless File.directory?(fpath) || ex.include?(mpath)
+        book.add_resource(mpath)
+      end
+    }
+    book.read_resource_proc = lambda { |resource|
+      IO.read(File.join(path, resource.src))
+    }
+    extract_properties_from_index(book)
+    new(book)
+  end
+  def initialize(book)
+    super
+    insert_manifest_attribute
+  end
+  def write(dir)
+    FileUtils.rm_rf(dir)  if File.directory?(dir)
+    FileUtils.mkdir_p(dir)
+    # Index
+    index_path = File.join(dir, INDEX_PATH)
+    File.open(index_path, 'w') { |f| f << htmlize(index) }
+    # Resources
+    @book.resources.each { |resource|
+      full_path = File.join(dir, resource.src)
+      FileUtils.mkdir_p(File.dirname(full_path))
+      File.open(full_path, 'w') { |f| f << @book.read_resource(resource) }
+    }
+    # Cover
+    unless @book.cover == COVER_PATH
+      cover_path = File.join(dir, COVER_PATH)
+      File.open(cover_path, 'wb') { |f| f << to_png_data(@book.cover) }
+      unless @book.resources.detect { |r| r.src == COVER_PATH }
+        @book.add_resource(COVER_PATH)
+      end
+    end
+    # Manifest
+    manifest_path = File.join(dir, MANIFEST_PATH)
+    File.open(manifest_path, 'w') { |f| f << manifest.join("\n") }
+  end
+  def to_book(options = {})
+    remove_manifest_attribute
+    super(options)
+  end
+  protected
+    def manifest
+      manifest = ["CACHE MANIFEST", "", "NETWORK:", "*", "", "CACHE:", INDEX_PATH]
+      @book.resources.inject(manifest) { |mf, resource| mf << resource.src; mf }
+    end
+    def insert_manifest_attribute
+      index.at_xpath('/html').set_attribute('manifest', MANIFEST_PATH)
+    end
+    def remove_manifest_attribute
+      index.at_xpath('/html').remove_attribute('manifest')
+    end
+  class DirectoryNotFound < ValidationError; end
+  class MissingManifest < ValidationError; end
+  class IndexHTMLRootHasNoManifest < ValidationError; end
+end