RubyGems - pdf-core - Versions diffs - 0.0.1 - Mend

pdf-core 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

checksums.yaml +7 -0
data/lib/pdf/core.rb +35 -0
data/lib/pdf/core/annotations.rb +60 -0
data/lib/pdf/core/byte_string.rb +9 -0
data/lib/pdf/core/destinations.rb +90 -0
data/lib/pdf/core/document_state.rb +78 -0
data/lib/pdf/core/filter_list.rb +51 -0
data/lib/pdf/core/filters.rb +36 -0
data/lib/pdf/core/graphics_state.rb +68 -0
data/lib/pdf/core/literal_string.rb +16 -0
data/lib/pdf/core/name_tree.rb +177 -0
data/lib/pdf/core/object_store.rb +308 -0
data/lib/pdf/core/outline.rb +315 -0
data/lib/pdf/core/page.rb +212 -0
data/lib/pdf/core/page_geometry.rb +126 -0
data/lib/pdf/core/pdf_object.rb +99 -0
data/lib/pdf/core/reference.rb +103 -0
data/lib/pdf/core/stream.rb +98 -0
data/lib/pdf/core/text.rb +275 -0
data/pdf-core.gemspec +26 -0
metadata +140 -0

data/lib/pdf/core/literal_string.rb ADDED Viewed

@@ -0,0 +1,16 @@
+# encoding: utf-8
+module PDF
+  module Core
+    # Marker subclass of String. It adds no behaviour of its own; it is
+    # used to differentiate strings that must be encoded as
+    # a *literal* string, versus those that can be encoded in
+    # the PDF hexadecimal format.
+    #
+    # Some features of the PDF format appear to require that literal
+    # strings be used. One such feature is the /Dest key of a link
+    # annotation; if a hex encoded string is used there, the links
+    # do not work (as tested in Mac OS X Preview, and Adobe Acrobat
+    # Reader).
+    class LiteralString < String #:nodoc:
+    end
+  end
+end

data/lib/pdf/core/name_tree.rb ADDED Viewed

@@ -0,0 +1,177 @@
+# encoding: utf-8
+# name_tree.rb : Implements NameTree for PDF
+#
+# Copyright November 2008, Jamis Buck. All Rights Reserved.
+#
+# This is free software. Please see the LICENSE and COPYING files for details.
+#
+module PDF
+  module Core
+    module NameTree #:nodoc:
+      class Node #:nodoc:
+        attr_reader :children
+        attr_reader :limit
+        attr_reader :document
+        attr_accessor :parent
+        attr_accessor :ref
+        def initialize(document, limit, parent=nil)
+          @document = document
+          @children = []
+          @limit = limit
+          @parent = parent
+          @ref = nil
+        end
+        def empty?
+          children.empty?
+        end
+        def size
+          leaf? ? children.size : children.inject(0) { |sum, child| sum + child.size }
+        end
+        def leaf?
+          children.empty? || children.first.is_a?(Value)
+        end
+        def add(name, value)
+          self << Value.new(name, value)
+        end
+        def to_hash
+          hash = {}
+          hash[:Limits] = [least, greatest] if parent
+          if leaf?
+            hash[:Names] = children if leaf?
+          else
+            hash[:Kids] = children.map { |child| child.ref }
+          end
+          return hash
+        end
+        def least
+          if leaf?
+            children.first.name
+          else
+            children.first.least
+          end
+        end
+        def greatest
+          if leaf?
+            children.last.name
+          else
+            children.last.greatest
+          end
+        end
+        def <<(value)
+          if children.empty?
+            children << value
+          elsif leaf?
+            children.insert(insertion_point(value), value)
+            split! if children.length > limit
+          else
+            fit = children.detect { |child| child >= value }
+            fit = children.last unless fit
+            fit << value
+          end
+          value
+        end
+        def >=(value)
+          children.empty? || children.last >= value
+        end
+        def split!
+          if parent
+            parent.split(self)
+          else
+            left, right = new_node(self), new_node(self)
+            split_children(self, left, right)
+            children.replace([left, right])
+          end
+        end
+        # Returns a deep copy of this node, without copying expensive things
+        # like the ref to @document.
+        #
+        def deep_copy
+          node = dup
+          node.instance_variable_set("@children",
+                                     Marshal.load(Marshal.dump(children)))
+          node.instance_variable_set("@ref",
+                                     node.ref ? node.ref.deep_copy : nil)
+          node
+        end
+        protected
+          def split(node)
+            new_child = new_node(self)
+            split_children(node, node, new_child)
+            index = children.index(node)
+            children.insert(index+1, new_child)
+            split! if children.length > limit
+          end
+        private
+          def new_node(parent=nil)
+            node = Node.new(document, limit, parent)
+            node.ref = document.ref!(node)
+            return node
+          end
+          def split_children(node, left, right)
+            half = (node.limit+1)/2
+            left_children, right_children = node.children[0...half], node.children[half..-1]
+            left.children.replace(left_children)
+            right.children.replace(right_children)
+            unless node.leaf?
+              left_children.each { |child| child.parent = left }
+              right_children.each { |child| child.parent = right }
+            end
+          end
+          def insertion_point(value)
+            children.each_with_index do |child, index|
+              return index if child >= value
+            end
+            return children.length
+          end
+      end
+      class Value #:nodoc:
+        include Comparable
+        attr_reader :name
+        attr_reader :value
+        def initialize(name, value)
+          @name, @value = PDF::Core::LiteralString.new(name), value
+        end
+        def <=>(leaf)
+          name <=> leaf.name
+        end
+        def inspect
+          "#<Value: #{name.inspect} : #{value.inspect}>"
+        end
+        def to_s
+          "#{name} : #{value}"
+        end
+      end
+    end
+  end
+end

data/lib/pdf/core/object_store.rb ADDED Viewed

@@ -0,0 +1,308 @@
+# encoding: utf-8
+# Implements PDF object repository
+#
+# Copyright August 2009, Brad Ediger.  All Rights Reserved.
+#
+# This is free software. Please see the LICENSE and COPYING files for details.
+require 'pdf/reader'
+module PDF
+  module Core
+    class ObjectStore #:nodoc:
+      include Enumerable
+      attr_reader :min_version
+      BASE_OBJECTS = %w[info pages root]
+      def initialize(opts = {})
+        @objects = {}
+        @identifiers = []
+        load_file(opts[:template]) if opts[:template]
+        @info  ||= ref(opts[:info] || {}).identifier
+        @root  ||= ref(:Type => :Catalog).identifier
+        if pages.nil?
+          root.data[:Pages] = ref(:Type => :Pages, :Count => 0, :Kids => [])
+        end
+      end
+      def ref(data, &block)
+        push(size + 1, data, &block)
+      end
+      # The stored object whose identifier was recorded as @info.
+      def info
+        @objects[@info]
+      end
+      # The stored object whose identifier was recorded as @root (created
+      # with :Type => :Catalog unless a template supplied one).
+      def root
+        @objects[@root]
+      end
+      def pages
+        root.data[:Pages]
+      end
+      def page_count
+        pages.data[:Count]
+      end
+      # Adds the given reference to the store and returns the reference object.
+      # If the object provided is not a PDF::Core::Reference, one is created from the
+      # arguments provided.
+      #
+      def push(*args, &block)
+        reference = if args.first.is_a?(PDF::Core::Reference)
+          args.first
+        else
+          PDF::Core::Reference.new(*args, &block)
+        end
+        @objects[reference.identifier] = reference
+        @identifiers << reference.identifier
+        reference
+      end
+      alias_method :<<, :push
+      def each
+        @identifiers.each do |id|
+          yield @objects[id]
+        end
+      end
+      # Looks up a stored object by its identifier; nil when unknown.
+      def [](id)
+        @objects[id]
+      end
+      def size
+        @identifiers.size
+      end
+      alias_method :length, :size
+      def compact
+        # Clear live markers
+        each { |o| o.live = false }
+        # Recursively mark reachable objects live, starting from the roots
+        # (the only objects referenced in the trailer)
+        root.mark_live
+        info.mark_live
+        # Renumber live objects to eliminate gaps (shrink the xref table)
+        if @objects.any?{ |_, o| !o.live }
+          new_id = 1
+          new_objects = {}
+          new_identifiers = []
+          each do |obj|
+            if obj.live
+              obj.identifier = new_id
+              new_objects[new_id] = obj
+              new_identifiers << new_id
+              new_id += 1
+            end
+          end
+          @objects = new_objects
+          @identifiers = new_identifiers
+        end
+      end
+      # returns the object ID for a particular page in the document. Pages
+      # are indexed starting at 1 (not 0!).
+      #
+      #   object_id_for_page(1)
+      #   => 5
+      #   object_id_for_page(10)
+      #   => 87
+      #   object_id_for_page(-11)
+      #   => 17
+      #
+      def object_id_for_page(k)
+        k -= 1 if k > 0
+        flat_page_ids = get_page_objects(pages).flatten
+        flat_page_ids[k]
+      end
+      # Imports all objects required to render a page from another PDF. The
+      # objects are added to the current object store, but NOT linked
+      # anywhere.
+      #
+      # The object ID of the root Page object is returned; it's up to the
+      # calling code to link that into the document structure somewhere. If
+      # this isn't done the imported objects will just be removed when the
+      # store is compacted.
+      #
+      # input is either an IO-like object (responds to seek and read) or a
+      # filename; anything else raises ArgumentError. page_num is 1 indexed,
+      # so 1 indicates the first page.
+      #
+      # Imports nothing and returns nil if the requested page number doesn't
+      # exist, which includes any page number below 1.
+      #
+      # Raises PDF::Core::Errors::TemplateError when the source cannot be
+      # read as a PDF or uses unsupported features.
+      #
+      def import_page(input, page_num)
+        @loaded_objects = {}
+        if template_id = indexed_template(input, page_num)
+          return template_id
+        end
+        # Only open (and, for a filename, read) the input when its parsed
+        # object hash is not cached yet; a cached hash never needs the IO.
+        hash = hash_index[indexing_key(input)]
+        unless hash
+          io = if input.respond_to?(:seek) && input.respond_to?(:read)
+                 input
+               elsif File.file?(input.to_s)
+                 StringIO.new(File.binread(input.to_s))
+               else
+                 raise ArgumentError, "input must be an IO-like object or a filename"
+               end
+          hash = indexed_hash(input, io)
+        end
+        # Reject page numbers below 1 explicitly: a negative array index
+        # would otherwise silently select a page counted from the end.
+        page_ref = page_num >= 1 ? hash.page_references[page_num - 1] : nil
+        return nil if page_ref.nil?
+        index_template(input, page_num, load_object_graph(hash, page_ref).identifier)
+      rescue PDF::Reader::MalformedPDFError, PDF::Reader::InvalidObjectError => e
+        msg = "Error reading template file. If you are sure it's a valid PDF, it may be a bug.\n#{e.message}"
+        raise PDF::Core::Errors::TemplateError, msg
+      rescue PDF::Reader::UnsupportedFeatureError
+        msg = "Template file contains unsupported PDF features"
+        raise PDF::Core::Errors::TemplateError, msg
+      end
+      private
+      # An index for page templates so that their loaded object graph
+      # can be reused without multiple loading. Created empty on first use;
+      # maps the indexing key of an input to a hash of
+      # page number => identifier of the imported page.
+      def template_index
+        @template_index ||= {}
+      end
+      # An index for the read object hash of a pdf template so that the
+      # object hash does not need to be parsed multiple times when using
+      # different pages of the pdf as page templates. Created empty on
+      # first use; keyed by the indexing key of the input.
+      def hash_index
+        @hash_index ||= {}
+      end
+      # returns the indexed object graph identifier for a template page if
+      # it exists
+      def indexed_template(input, page_number)
+        key = indexing_key(input)
+        template_index[key] && template_index[key][page_number]
+      end
+      # indexes the identifier for a page from a template
+      def index_template(input, page_number, id)
+        (template_index[indexing_key(input)] ||= {})[page_number] ||= id
+      end
+      # reads and indexes a new IO for a template
+      # if the IO has been indexed already then the parsed object hash
+      # is returned directly
+      def indexed_hash(input, io)
+        hash_index[indexing_key(input)] ||= PDF::Reader::ObjectHash.new(io)
+      end
+      # The index key for the input.
+      # Uses object_id so that both a string filename or an IO stream can be
+      # indexed and reused provided the same object gets used in multiple page
+      # template calls. Two distinct String objects holding the same filename
+      # therefore produce different keys.
+      def indexing_key(input)
+        input.object_id
+      end
+      # returns a nested array of object IDs for all pages in this object store.
+      #
+      def get_page_objects(obj)
+        if obj.data[:Type] == :Page
+          obj.identifier
+        elsif obj.data[:Type] == :Pages
+          obj.data[:Kids].map { |kid| get_page_objects(kid) }
+        end
+      end
+      # Takes a source PDF and uses it as a template for this document.
+      #
+      # template is an IO-like object (responds to seek and read) or the
+      # name of an existing file; anything else raises ArgumentError. The
+      # template's PDF version is recorded as min_version, and its Info and
+      # Root objects, when present, are imported and become @info and @root.
+      #
+      # Raises PDF::Core::Errors::TemplateError for encrypted, unreadable or
+      # unsupported templates.
+      #
+      def load_file(template)
+        io_like = template.respond_to?(:seek) && template.respond_to?(:read)
+        raise ArgumentError, "#{template} does not exist" unless io_like || File.file?(template)
+        hash = PDF::Reader::ObjectHash.new(template)
+        trailer = hash.trailer
+        src_info = trailer[:Info]
+        src_root = trailer[:Root]
+        @min_version = hash.pdf_version.to_f
+        if hash.trailer[:Encrypt]
+          msg = "Template file is an encrypted PDF, it can't be used as a template"
+          raise PDF::Core::Errors::TemplateError, msg
+        end
+        @info = load_object_graph(hash, src_info).identifier if src_info
+        @root = load_object_graph(hash, src_root).identifier if src_root
+      rescue PDF::Reader::MalformedPDFError, PDF::Reader::InvalidObjectError => e
+        msg = "Error reading template file. If you are sure it's a valid PDF, it may be a bug.\n#{e.message}"
+        raise PDF::Core::Errors::TemplateError, msg
+      rescue PDF::Reader::UnsupportedFeatureError
+        msg = "Template file contains unsupported PDF features"
+        raise PDF::Core::Errors::TemplateError, msg
+      end
+      # Recurse down an object graph from a source PDF, importing all the
+      # indirect objects we find.
+      #
+      # hash is the PDF::Reader::ObjectHash to extract objects from, object is
+      # the object to extract.
+      #
+      # Returns the imported counterpart of object: the reference in this
+      # store for a source reference, the same (mutated) Hash, a new Array,
+      # the String itself or a ByteString wrapping it, and anything else
+      # unchanged.
+      #
+      def load_object_graph(hash, object)
+        # Maps source reference ids to the references created in this store.
+        @loaded_objects ||= {}
+        case object
+        when ::Hash then
+          # Values are replaced in place, so the source hash is mutated.
+          object.each { |key,value| object[key] = load_object_graph(hash, value) }
+          object
+        when Array then
+          object.map { |item| load_object_graph(hash, item)}
+        when PDF::Reader::Reference then
+          unless @loaded_objects.has_key?(object.id)
+            # Register an empty placeholder before recursing, so that the same
+            # source reference met again further down resolves to this
+            # placeholder instead of being loaded a second time.
+            @loaded_objects[object.id] = ref(nil)
+            new_obj = load_object_graph(hash, hash[object])
+            if new_obj.kind_of?(PDF::Reader::Stream)
+              # The stream dictionary becomes the reference's data and the
+              # stream's data is appended to the reference.
+              stream_dict = load_object_graph(hash, new_obj.hash)
+              @loaded_objects[object.id].data = stream_dict
+              @loaded_objects[object.id] << new_obj.data
+            else
+              @loaded_objects[object.id].data = new_obj
+            end
+          end
+          @loaded_objects[object.id]
+        when PDF::Reader::Stream
+          # Streams are returned untouched. The original note here says Stream
+          # is a subclass of String, so this branch has to come before the
+          # String branch below to keep the stream from being wrapped (the
+          # wrapper applied there is ByteString).
+          object
+        when String
+          is_utf8?(object) ? object : PDF::Core::ByteString.new(object)
+        else
+          object
+        end
+      end
+      def is_utf8?(str)
+        str.force_encoding(::Encoding::UTF_8)
+        str.valid_encoding?
+      end
+    end
+  end
+end