RubyGems - combine_pdf - Versions diffs - 0.2.30 → 0.2.31 - Mend

combine_pdf 0.2.30 → 0.2.31

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

checksums.yaml +4 -4
data/CHANGELOG.md +13 -1
data/lib/combine_pdf/parser.rb +1 -1
data/lib/combine_pdf/pdf_protected.rb +62 -97
data/lib/combine_pdf/renderer.rb +3 -3
data/lib/combine_pdf/version.rb +1 -1
data/test/automated +16 -4
metadata +2 -3

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: f81f9412da41045468ecaa6e10104fc9062eee8d
-  data.tar.gz: 23410127dcabe19c6b9ddee352752f9b7bd0abb6
+  metadata.gz: 63b0c324e1bf003b0c0fc963eb2071b6c4672c18
+  data.tar.gz: 34c200edda06074773888c098b9d4f9a6479d752
 SHA512:
-  metadata.gz: 03fdcce50faf9045930e435cbdf4d31e4ed96419f6e594796df89ab7c6e0891568467d5c1b632338c5e33fe17f3e68fe14c1c7c6c199e6d6d8a33ec47e438a46
-  data.tar.gz: 24442ecac5ee2ed427de851eb5de4081ab8bb4a8625cdcaa2c9d4a84da86c79feaf3b02928ebee22462b2963a94ea9683fde5a47a3e6a35400d9ace2f3b13489
+  metadata.gz: ebf1cd2a7c1077f71d6f41037f0ad341c06e6bbb305bfa030833956feb0bb80576e2eaa5fd7324000ff95834f8c125cafc40a782ac97c16c7e3c7e7c02166794
+  data.tar.gz: a4cc257441939fbc0dd59dffe2d3faf11ade63b89fa2637bee8b54df7ea9e31a2ec5612d366ec9208856d750deac8fe10f1ddcaab6f9bf21996e6aa52c775e90

data/CHANGELOG.md CHANGED

@@ -2,9 +2,21 @@
 ***
+Change log v.0.2.31
+**Broke**: Broke the fix for issue #65 so that Radio buttons data might be lost... working on a fix.
+**Fix**: Fixed issue #82 (reintroduction of issue #19 due to core engine rewrite) related to a workaround for an issue with AcrobatReader. Credit to @gyuchang for testing and helping with the fix.
+**Merge**: Merged pull request #80, fixing an issue with byte decoding. Credit to @gyuchang for the PR.
+**Performance**: Improved performance for the reference and duplicate object resolution. Credit to @gyuchang for pointing some optimization options.
+***
 Change log v.0.2.30
-**Fix**: Fixed an issue where HTTP artifacts before the beginning of a PDF file / string would prevent the PDF from being parsed. This fixes issue #78 reported by @robvitaro.
+**Fix**: Fixed an issue where HTTP artifacts before the beginning of a PDF file / string would prevent the PDF from being parsed. This should fix issue #78 reported by @robvitaro.
 ***

data/lib/combine_pdf/parser.rb CHANGED

@@ -200,7 +200,7 @@ module CombinePDF
           # instead, a non-strict RegExp is used:
           str = @scanner.scan_until(/endstream/)
           # raise error if the stream doesn't end.
-          raise "Parsing Error: PDF file error - a stream object wasn't properly colsed using 'endstream'!" unless str
+          raise "Parsing Error: PDF file error - a stream object wasn't properly closed using 'endstream'!" unless str
           # need to remove end of stream
           if out.last.is_a? Hash
             # out.last[:raw_stream_content] = str[0...-10] #cuts only one EON char (\n or \r)

data/lib/combine_pdf/pdf_protected.rb CHANGED

@@ -19,84 +19,42 @@ module CombinePDF
     # this function adds the references contained in `@objects`.
     #
     # this is used for internal operations, such as injectng data using the << operator.
-    def add_referenced
+    def add_referenced(should_resolve = [])
       # add references but not root
-      should_resolve = @objects.dup
       dup_pages = nil
-      resolved = [].to_set
+      # an existing object map
+      resolved = {}.dup
+      existing = {}.dup
+      @objects.each { |obj| existing[obj] = obj }
+      # loop until should_resolve is empty
       while should_resolve.any?
         obj = should_resolve.pop
+        next if resolved[obj.object_id] # the object exists
         if obj.is_a?(Hash)
-          next if resolved.include? obj.object_id
-          resolved << obj.object_id
-          if obj[:referenced_object]
-            tmp = @objects.find_index(obj[:referenced_object])
+          referenced = obj[:referenced_object]
+          if referenced && referenced.any?
+            tmp = resolved[referenced.object_id] || existing[referenced]
             if tmp
-              tmp = @objects[tmp]
               obj[:referenced_object] = tmp
             else
-              tmp = obj[:referenced_object]
-              should_resolve << tmp
-              @objects << tmp
+              resolved[obj.object_id] = referenced
+              existing[referenced] = referenced
+              should_resolve << referenced
+              @objects << referenced
             end
           else
-            obj.keys.each { |k| should_resolve << obj[k] unless k == :Parent || resolved.include?(obj[k].object_id) || !obj[k].is_a?(Enumerable) }
+            resolved[obj.object_id] = obj
+            obj.keys.each { |k| should_resolve << obj[k] unless !obj[k].is_a?(Enumerable) || resolved[obj[k].object_id] }
           end
         elsif obj.is_a?(Array)
-          next if resolved.include? obj.object_id
-          resolved << obj.object_id
+          resolved[obj.object_id] = obj
           should_resolve.concat obj
         end
       end
       resolved.clear
+      existing.clear
     end
-    # # @private
-    # # Some PDF objects contain references to other PDF objects.
-    # #
-    # # this function adds the references contained in "object", but DOESN'T add the object itself.
-    # #
-    # # this is used for internal operations, such as injectng data using the << operator.
-    # def add_referenced(object, dup_pages = true)
-    #   # add references but not root
-    #   if object.is_a?(Array)
-    #     object.each { |it| add_referenced(it, dup_pages) }
-    #     return true
-    #   elsif object.is_a?(Hash)
-    #     # first if statement is actually a workaround for a bug in Acrobat Reader, regarding duplicate pages.
-    #     if dup_pages && object[:is_reference_only] && object[:referenced_object] && object[:referenced_object].is_a?(Hash) && object[:referenced_object][:Type] == :Page
-    #       if @objects.find_index object[:referenced_object]
-    #         @objects << (object[:referenced_object] = object[:referenced_object].dup)
-    #       else
-    #         @objects << object[:referenced_object]
-    #       end
-    #     elsif object[:is_reference_only] && object[:referenced_object]
-    #       found_at = @objects.find_index object[:referenced_object]
-    #       if found_at
-    #         # if the objects are equal, they might still be different objects!
-    #         # so, we need to make sure they are the same object for the pointers to effect id numbering
-    #         # and formatting operations.
-    #         object[:referenced_object] = @objects[found_at]
-    #         # stop this path, there is no need to run over the Hash's keys and values
-    #         return true
-    #       else
-    #         # stop if page propegation is false
-    #         return true if !dup_pages && object[:referenced_object][:Type] == :Page
-    #         # @objects.include? object[:referenced_object] is bound to be false
-    #         # the object wasn't found - add it to the @objects array
-    #         @objects << object[:referenced_object]
-    #       end
-    #
-    #     end
-    #     object.each do |k, v|
-    #         add_referenced(v, dup_pages) unless RECORSIVE_PROTECTION[k]
-    #     end
-    #   else
-    #     return false
-    #   end
-    #   true
-    # end
     # @private
     def rebuild_catalog(*with_pages)
       # # build page list v.1 Slow but WORKS
@@ -113,38 +71,62 @@ module CombinePDF
       # add pages to catalog, if requested
       page_list.concat(with_pages) unless with_pages.empty?
+      # duplicate any non-unique pages - This is a special case to resolve Adobe Acrobat Reader issues (see issues #19 and #81)
+      uniqueness = {}.dup
+      page_list.each { |page| page = page.dup if uniqueness[page.object_id]; uniqueness[page.object_id] = page }
+      page_list.clear
+      page_list = uniqueness.values
+      uniqueness.clear
       # build new Pages object
-      pages_object = { Type: :Pages, Count: page_list.length, Kids: page_list.map { |p| { referenced_object: p, is_reference_only: true } } }
+      page_object_kids = [].dup
+      pages_object = { Type: :Pages, Count: page_list.length, Kids: page_object_kids }
+      pages_object_reference = { referenced_object: pages_object, is_reference_only: true }
+      page_list.each { |pg| pg[:Parent] = pages_object_reference; page_object_kids << ({ referenced_object: pg, is_reference_only: true }) }
       # rebuild/rename the names dictionary
       rebuild_names
       # build new Catalog object
       catalog_object = { Type: :Catalog,
-                         Pages: { referenced_object: pages_object, is_reference_only: true },
-                         Names: { referenced_object: @names, is_reference_only: true },
-                         Outlines: { referenced_object: @outlines, is_reference_only: true } }
+                         Pages: { referenced_object: pages_object, is_reference_only: true } }
+      # pages_object[:Parent] = { referenced_object: catalog_object, is_reference_only: true } # causes AcrobatReader to fail
       catalog_object[:ViewerPreferences] = @viewer_preferences unless @viewer_preferences.empty?
-      # rebuild/rename the forms dictionary
-      if @forms_data.nil? || @forms_data.empty?
-        @forms_data = nil
-      else
-        @forms_data = { referenced_object: (@forms_data[:referenced_object] || @forms_data), is_reference_only: true }
-        catalog_object[:AcroForm] = @forms_data
-      end
       # point old Pages pointers to new Pages object
       ## first point known pages objects - enough?
       pages.each { |p| p[:Parent] = { referenced_object: pages_object, is_reference_only: true } }
       ## or should we, go over structure? (fails)
       # each_object {|obj| obj[:Parent][:referenced_object] = pages_object if obj.is_a?(Hash) && obj[:Parent].is_a?(Hash) && obj[:Parent][:referenced_object] && obj[:Parent][:referenced_object][:Type] == :Pages}
-      # remove old catalog and pages objects
-      @objects.reject! { |obj| obj.is_a?(Hash) && (obj[:Type] == :Catalog || obj[:Type] == :Pages) }
+      # # remove old catalog and pages objects
+      # @objects.reject! { |obj| obj.is_a?(Hash) && (obj[:Type] == :Catalog || obj[:Type] == :Pages) }
+      # remove old objects list and trees
+      @objects.clear
       # inject new catalog and pages objects
-      @objects << pages_object
+      @objects << @info if @info
       @objects << catalog_object
+      @objects << pages_object
+      # rebuild/rename the forms dictionary
+      if @forms_data.nil? || @forms_data.empty?
+        @forms_data = nil
+      else
+        @forms_data = { referenced_object: (@forms_data[:referenced_object] || @forms_data), is_reference_only: true }
+        catalog_object[:AcroForm] = @forms_data
+        @objects << @forms_data[:referenced_object]
+      end
+      # add the names dictionary
+      if @names && @names.length > 1
+        @objects << @names
+        catalog_object[:Names] = { referenced_object: @names, is_reference_only: true }
+      end
+      # add the outlines dictionary
+      if @outlines && @outlines.any?
+        @objects << @outlines
+        catalog_object[:Outlines] = { referenced_object: @outlines, is_reference_only: true }
+      end
       catalog_object
     end
@@ -166,26 +148,9 @@ module CombinePDF
     # there is no point is calling the method before preparing the output.
     def rebuild_catalog_and_objects
       catalog = rebuild_catalog
-      @objects.clear
-      @objects << @info
-      @objects << catalog
-      # fix Acrobat Reader issue with page reference uniqueness (must be unique or older Acrobat Reader fails)
-      catalog[:Pages][:referenced_object][:Kids].each do |page|
-        tmp = page[:referenced_object]
-        tmp = page[:referenced_object] = tmp.dup if @objects.include? tmp
-        @objects << tmp
-      end
+      page_objects = catalog[:Pages][:referenced_object][:Kids].map { |e| @objects << e[:referenced_object]; e[:referenced_object] }
       # adds every referenced object to the @objects (root), addition is performed as pointers rather then copies
-      # puts (Benchmark.measure do
-      add_referenced
-      # end)
-      # @objects << @info
-      # add_referenced @info
-      # add_referenced catalog
-      # add_referenced catalog[:Pages]
-      # add_referenced catalog[:Names], false
-      # add_referenced catalog[:Outlines], false
-      # add_referenced catalog[:AcroForm], false
+      add_referenced([page_objects, @forms_data, @names, @outlines, @info])
       catalog
     end
@@ -304,9 +269,9 @@ module CombinePDF
         # parent - the outline base node of the resulting merged outline
         # FIXME implement the possibility to insert somewhere in the middle of the outline
         prev = nil
-        pos = first = actual_object(((position != 0) ? old_data : new_data)[:First])
-        last = actual_object(((position != 0) ? new_data : old_data)[:Last])
-        median = { is_reference_only: true, referenced_object: actual_object(((position != 0) ? new_data : old_data)[:First]) }
+        pos = first = actual_object((position.nonzero? ? old_data : new_data)[:First])
+        last = actual_object((position.nonzero? ? new_data : old_data)[:Last])
+        median = { is_reference_only: true, referenced_object: actual_object((position.nonzero? ? new_data : old_data)[:First]) }
         old_data[:First] = { is_reference_only: true, referenced_object: first }
         old_data[:Last] = { is_reference_only: true, referenced_object: last }
         parent = { is_reference_only: true, referenced_object: old_data }

data/lib/combine_pdf/renderer.rb CHANGED

@@ -21,7 +21,7 @@ module CombinePDF
       elsif object.is_a?(Array)
         return format_array_to_pdf object
       elsif object.is_a?(Fixnum) || object.is_a?(Float) || object.is_a?(TrueClass) || object.is_a?(FalseClass)
-        return object.to_s + ' '
+        return object.to_s
       elsif object.is_a?(Hash)
         return format_hash_to_pdf object
       else
@@ -33,12 +33,12 @@ module CombinePDF
                                 "\x0D" => '\\r',
                                 "\x09" => '\\t',
                                 "\x08" => '\\b',
-                                "\xFF" => '\\f',
+                                "\x0C" => '\\f', # form-feed (\f) == 0x0C
                                 "\x28" => '\\(',
                                 "\x29" => '\\)',
                                 "\x5C" => '\\\\' }.dup
     32.times { |i| STRING_REPLACEMENT_HASH[i.chr] ||= "\\#{i}" }
-    (256 - 128).times { |i| STRING_REPLACEMENT_HASH[(i + 127).chr] ||= "\\#{i + 127}" }
+    (256 - 127).times { |i| STRING_REPLACEMENT_HASH[(i + 127).chr] ||= "\\#{i + 127}" }
     def format_string_to_pdf(object)
       # object.force_encoding(Encoding::ASCII_8BIT)

data/lib/combine_pdf/version.rb CHANGED

@@ -1,3 +1,3 @@
 module CombinePDF
-  VERSION = '0.2.30'.freeze
+  VERSION = '0.2.31'.freeze
 end

data/test/automated CHANGED

@@ -28,15 +28,16 @@ pdf = CombinePDF.load './Ruby/test pdfs/names_go_haywire_0.pdf'
 pdf << CombinePDF.load('./Ruby/test pdfs/names_go_haywire_1.pdf')
 pdf.save '04_check_view_and_names_reference.pdf'
-str = IO.binread './Ruby/test pdfs/outlines/self_merge_err.pdf'
+pdf = CombinePDF.load('./Ruby/test pdfs/outlines/self_merge_err.pdf')
+pdf.save '05_x1_scribus_test.pdf'
 pdf = CombinePDF.load('./Ruby/test pdfs/outlines/self_merge_err.pdf')
 pdf << CombinePDF.load('./Ruby/test pdfs/outlines/self_merge_err.pdf')
-pdf.save '05_scribus_test.pdf'
+pdf.save '05_x2_scribus_test.pdf'
 # pdf = CombinePDF.load "./Ruby/test pdfs/named_dest.pdf";nil
 # pdf.save '05_check_named_dest_links.pdf' # this will take a while
 # pdf = CombinePDF.load "./Ruby/test pdfs/named_dest.pdf";nil
-# pdf << CombinePDF.load("./Ruby/test pdfs/named_dest.pdf");nil
-# pdf.save '05_1_check_named_dest_links.pdf' # never ends... :-(
+pdf << CombinePDF.load('./Ruby/test pdfs/named_dest.pdf'); nil
+pdf.save '05_1_timeless_check_named_dest_links.pdf' # never ends... :-(
 pdf = CombinePDF.load './Ruby/test pdfs/outline_small.pdf'
 pdf << CombinePDF.load('./Ruby/test pdfs/outline_small.pdf')
@@ -55,6 +56,17 @@ CombinePDF.load("./Ruby/test\ pdfs/Scribus-unknown_err2.pdf").save '08_2-unknown
 CombinePDF.load("./Ruby/test\ pdfs/Scribus-unknown_err3.pdf").save '08_3-unknown-err-empty-str.pdf'
 CombinePDF.load("/Users/2Be/Ruby/test\ pdfs/nil_object.pdf").save('09_nil_in_parsed_array.pdf')
+require 'prawn'
+IO.binwrite '10_prawn.pdf', (Prawn::Document.new { text 'Hello World!' }).render
+page = CombinePDF.parse((Prawn::Document.new { text 'Hello World!' }).render)
+pdf = CombinePDF.new
+pdf << page
+pdf.save '10_parsed_from_prawn.pdf'
+pdf = CombinePDF.new
+pdf << page << page
+pdf.save('10_AcrobatReader_is_unique_page.pdf')
 # unify = [
 #   "./Ruby/test\ pdfs/AESv2\ encrypted.pdf",
 #   "./Ruby/test\ pdfs/data-in-comment.pdf",

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: combine_pdf
 version: !ruby/object:Gem::Version
-  version: 0.2.30
+  version: 0.2.31
 platform: ruby
 authors:
 - Boaz Segev
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-07-27 00:00:00.000000000 Z
+date: 2016-08-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: ruby-rc4
@@ -111,4 +111,3 @@ test_files:
 - test/automated
 - test/console
 - test/named_dest
-has_rdoc: