RubyGems - combine_pdf - Versions diffs - 0.2.37 → 1.0.0 - Mend

combine_pdf 0.2.37 → 1.0.0

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +13 -1
data/lib/combine_pdf.rb +1 -1
data/lib/combine_pdf/parser.rb +69 -60
data/lib/combine_pdf/pdf_protected.rb +11 -7
data/lib/combine_pdf/pdf_public.rb +1 -1
data/lib/combine_pdf/renderer.rb +1 -1
data/lib/combine_pdf/version.rb +1 -1
data/test/automated +11 -1
metadata +3 -3

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: cac27b28f3653156374b1ea4a429676625ba0c9f
-  data.tar.gz: 8ce9f60a9bdcbd763a72461703c51845dbab0f2c
+  metadata.gz: 3663c5f5602eeed30aba5405fc0503ab9a865432
+  data.tar.gz: f6e07e2fbb180065146c32a440f29348fb2a2808
 SHA512:
-  metadata.gz: 78aa47281a6f9fa5723a99ed9ce666479999348b69a19778e02eb2144e1c507d4a62ac23a07e4b46079193d14fecf77cbb9dac06591ac2354e92046ba0ba5d20
-  data.tar.gz: 2b92948efba5ab031a46865416b13b1aecb892a1763c0d45190598313e6e0139907ece00c129349060ff030c7b38531d1d848f9696168d5fec499a4c65121db8
+  metadata.gz: 835236c99911009df5112cc92c3a042e6fe2dcd634e9b189f9977aef9f9f42ae33ac9865e94d08e8d26b6ff0328d7bf84ccc0dc858897c2c0ceae10c3c80c944
+  data.tar.gz: f11df3aa2c055a17be86c83766ea85f536afc5dcf0b299cd0e5be48344724909619ea54c8b52b7a9997907ec9d5c585d21f9ba81eeb17a74a77234aba2a79994

data/CHANGELOG.md CHANGED

@@ -2,7 +2,19 @@
 ***
-#### Change log v.0.2.37 (Release Candidate)
+#### Change log v.1.0.0
+**Fix**: Fixed a possible issue with string corruption... it might have only existed in the development version, I'm not sure, but it's fixed anyway.
+**Fix** (degrade): Fixed an issue related to deeply nested objects causing unreasonable slowdowns. The issue was resolved by degrading the PDF optimization process to review only objects with `stream` data instead of reviewing every object. This means more duplicate objects might be observed when similar PDF files are merged.
+**Fix**: Fixed an issue related to form data where font information was lost during the PDF optimization process.
+**Fix**: Fixed issue #108 by adding support for PDFs that have spaces and missing zeros in their hex encoded strings. Credit to @emmanuelmillionaer.
+***
+#### Change log v.0.2.37
 **Fix**: Fixed `Page_Methods#textbox` default `:x`,`:y` to allow for non-zero/cropped page origin. Credit to @donnguyen for exposing the issue.

data/lib/combine_pdf.rb CHANGED

@@ -138,7 +138,7 @@ end
 # arrays are Array
 # strings are String
 # names are Symbols (String.to_sym)
-# numbers are Fixnum or Float
+# numbers are Integer or Float (Numeric)
 # boolean are TrueClass or FalseClass
 ## test performance with:

data/lib/combine_pdf/parser.rb CHANGED

@@ -107,7 +107,7 @@ module CombinePDF
           @scanner = StringScanner.new o[:raw_stream_content]
           stream_data = _parse_
           id_array = []
-          while stream_data[0].is_a? (Integer)
+          while stream_data[0].is_a? (Numeric)
             id_array << stream_data.shift
             stream_data.shift
           end
@@ -180,57 +180,34 @@ module CombinePDF
         if @scanner.scan(/\[/)
           out << _parse_
         ##########################################
-        ## parse a Dictionary
-        ##########################################
-        elsif @scanner.scan(/<</)
-          data = _parse_
-          obj = {}
-          obj[data.shift] = data.shift while data[0]
-          out << obj
-        ##########################################
-        ## return content of array or dictionary
-        ##########################################
-        elsif @scanner.scan(/\]/) || @scanner.scan(/>>/)
-          return out
-        ##########################################
-        ## parse a Stream
+        ## Parse a Name
         ##########################################
-        elsif @scanner.scan(/stream[\r\n]/)
-          @scanner.pos += 1 if @scanner.peek(1) == "\n".freeze && @scanner.matched[-1] != "\n".freeze
-          # the following was dicarded because some PDF files didn't have an EOL marker as required
-          # str = @scanner.scan_until(/(\r\n|\r|\n)endstream/)
-          # instead, a non-strict RegExp is used:
-          str = @scanner.scan_until(/endstream/)
-          # raise error if the stream doesn't end.
-          raise "Parsing Error: PDF file error - a stream object wasn't properly closed using 'endstream'!" unless str
-          # need to remove end of stream
-          if out.last.is_a? Hash
-            # out.last[:raw_stream_content] = str[0...-10] #cuts only one EON char (\n or \r)
-            out.last[:raw_stream_content] = unify_string str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT)
-          else
-            warn 'Stream not attached to dictionary!'
-            out << str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT)
-          end
+        # old, probably working version: when str = @scanner.scan(/\/[\#\w\d\.\+\-\\\?\,]+/)
+        # I don't know how to write the /[\x21-\x7e___subtract_certain_hex_values_here____]+/
+        # all allowed regular caracters between ! and ~ : /[\x21-\x24\x26\x27\x2a-\x2e\x30-\x3b\x3d\x3f-\x5a\x5c\x5e-\x7a\x7c\x7e]+
+        # all characters that aren't white space or special: /[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+
+        elsif str = @scanner.scan(/\/[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]*/)
+          out << (str[1..-1].gsub(/\#[0-9a-fA-F]{2}/) { |a| a[1..2].hex.chr }).to_sym
         ##########################################
-        ## parse an Object after finished
+        ## Parse a Number
         ##########################################
-        elsif str = @scanner.scan(/endobj/)
-          # what to do when this is an object?
-          if out.last.is_a? Hash
-            out << out.pop.merge(indirect_generation_number: out.pop, indirect_reference_id: out.pop)
-          else
-            out << { indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop }
-          end
-          fresh = true
-          # fix wkhtmltopdf use of PDF 1.1 Dest using symbols instead of strings
-          out.last[:Dest] = unify_string(out.last[:Dest].to_s) if out.last[:Dest] && out.last[:Dest].is_a?(Symbol)
-        # puts "!!!!!!!!! Error with :indirect_reference_id\n\nObject #{out.last}  :indirect_reference_id = #{out.last[:indirect_reference_id]}" unless out.last[:indirect_reference_id].is_a?(Integer)
+        elsif str = @scanner.scan(/[\+\-\.\d]+/)
+          str =~ /\./ ? (out << str.to_f) : (out << str.to_i)
         ##########################################
         ## parse a Hex String
         ##########################################
-        elsif str = @scanner.scan(/<[0-9a-fA-F]*>/)
+        elsif str = @scanner.scan(/\<[0-9a-fA-F]*\>/)
           # warn "Found a hex string"
-          out << unify_string([str[1..-2]].pack('H*').force_encoding(Encoding::ASCII_8BIT))
+          str = str.slice(1..-2).force_encoding(Encoding::ASCII_8BIT)
+          # str = "0#{str}" if str.length.odd?
+          out << unify_string([str].pack('H*').force_encoding(Encoding::ASCII_8BIT))
+        ##########################################
+        ## parse a space delimited Hex String
+        ##########################################
+        elsif str = @scanner.scan(/\<[0-9a-fA-F\s]*\>/)
+          # warn "Found a space seperated hex string"
+          str = str.force_encoding(Encoding::ASCII_8BIT).split(/\s/).map! {|b| b.length.odd? ? "0#{b}" : b}
+          out << unify_string(str.pack('H*' * str.length).force_encoding(Encoding::ASCII_8BIT))
         ##########################################
         ## parse a Literal String
         ##########################################
@@ -315,6 +292,52 @@ module CombinePDF
           end
           out << unify_string(str.pack('C*').force_encoding(Encoding::ASCII_8BIT))
         ##########################################
+        ## parse a Dictionary
+        ##########################################
+        elsif @scanner.scan(/<</)
+          data = _parse_
+          obj = {}
+          obj[data.shift] = data.shift while data[0]
+          out << obj
+        ##########################################
+        ## return content of array or dictionary
+        ##########################################
+        elsif @scanner.scan(/\]/) || @scanner.scan(/>>/)
+          return out
+        ##########################################
+        ## parse a Stream
+        ##########################################
+        elsif @scanner.scan(/stream[\r\n]/)
+          @scanner.pos += 1 if @scanner.peek(1) == "\n".freeze && @scanner.matched[-1] != "\n".freeze
+          # the following was dicarded because some PDF files didn't have an EOL marker as required
+          # str = @scanner.scan_until(/(\r\n|\r|\n)endstream/)
+          # instead, a non-strict RegExp is used:
+          str = @scanner.scan_until(/endstream/)
+          # raise error if the stream doesn't end.
+          raise "Parsing Error: PDF file error - a stream object wasn't properly closed using 'endstream'!" unless str
+          # need to remove end of stream
+          if out.last.is_a? Hash
+            # out.last[:raw_stream_content] = str[0...-10] #cuts only one EON char (\n or \r)
+            out.last[:raw_stream_content] = unify_string str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT)
+          else
+            warn 'Stream not attached to dictionary!'
+            out << str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT)
+          end
+        ##########################################
+        ## parse an Object after finished
+        ##########################################
+        elsif str = @scanner.scan(/endobj/)
+          # what to do when this is an object?
+          if out.last.is_a? Hash
+            out << out.pop.merge(indirect_generation_number: out.pop, indirect_reference_id: out.pop)
+          else
+            out << { indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop }
+          end
+          fresh = true
+          # fix wkhtmltopdf use of PDF 1.1 Dest using symbols instead of strings
+          out.last[:Dest] = unify_string(out.last[:Dest].to_s) if out.last[:Dest] && out.last[:Dest].is_a?(Symbol)
+        # puts "!!!!!!!!! Error with :indirect_reference_id\n\nObject #{out.last}  :indirect_reference_id = #{out.last[:indirect_reference_id]}" unless out.last[:indirect_reference_id].is_a?(Numeric)
+        ##########################################
         ## Parse a comment
         ##########################################
         elsif str = @scanner.scan(/\%/)
@@ -326,20 +349,6 @@ module CombinePDF
           end
         # puts "AFTER COMMENT: #{@scanner.peek 8}"
         ##########################################
-        ## Parse a Name
-        ##########################################
-        # old, probably working version: when str = @scanner.scan(/\/[\#\w\d\.\+\-\\\?\,]+/)
-        # I don't know how to write the /[\x21-\x7e___subtract_certain_hex_values_here____]+/
-        # all allowed regular caracters between ! and ~ : /[\x21-\x24\x26\x27\x2a-\x2e\x30-\x3b\x3d\x3f-\x5a\x5c\x5e-\x7a\x7c\x7e]+
-        # all characters that aren't white space or special: /[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+
-        elsif str = @scanner.scan(/\/[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]*/)
-          out << (str[1..-1].gsub(/\#[0-9a-fA-F]{2}/) { |a| a[1..2].hex.chr }).to_sym
-        ##########################################
-        ## Parse a Number
-        ##########################################
-        elsif str = @scanner.scan(/[\+\-\.\d]+/)
-          str =~ /\./ ? (out << str.to_f) : (out << str.to_i)
-        ##########################################
         ## Parse an Object Reference
         ##########################################
         elsif @scanner.scan(/R/)
@@ -562,7 +571,7 @@ module CombinePDF
                   o = nil
                 else
                   o[:referenced_object] = obj_dir[[o[:indirect_reference_id], o[:indirect_generation_number]]]
-                  warn "Couldn't connect reference for #{o}" if o[:referenced_object].nil?
+                  warn "Couldn't connect reference for #{o}" if o[:referenced_object].nil? && (o[:indirect_reference_id] + o[:indirect_generation_number] != 0)
                   o.delete :indirect_reference_id
                   o.delete :indirect_generation_number
                   o = (o[:referenced_object] && o[:referenced_object][:indirect_without_dictionary]) || o

data/lib/combine_pdf/pdf_protected.rb CHANGED

@@ -33,12 +33,14 @@ module CombinePDF
         if obj.is_a?(Hash)
           referenced = obj[:referenced_object]
           if referenced && referenced.any?
-            tmp = resolved[referenced.object_id] || existing[referenced]
+            #         tmp = resolved[referenced.object_id] || existing[referenced]
+            tmp = resolved[referenced.object_id] || (referenced[:raw_stream_content] && existing[referenced[:raw_stream_content]])
             if tmp
               obj[:referenced_object] = tmp
             else
               resolved[obj.object_id] = referenced
-              existing[referenced] = referenced
+              #        existing[referenced] = referenced
+              existing[referenced[:raw_stream_content]] = referenced
               should_resolve << referenced
               @objects << referenced
             end
@@ -150,7 +152,8 @@ module CombinePDF
       catalog = rebuild_catalog
       page_objects = catalog[:Pages][:referenced_object][:Kids].map { |e| @objects << e[:referenced_object]; e[:referenced_object] }
       # adds every referenced object to the @objects (root), addition is performed as pointers rather then copies
-      add_referenced([page_objects, @forms_data, @names, @outlines, @info])
+      # add_referenced([page_objects, @forms_data, @names, @outlines, @info])
+      add_referenced(@objects.dup)
       catalog
     end
@@ -163,7 +166,7 @@ module CombinePDF
     def renumber_object_ids(start = nil)
       @set_start_id = start || @set_start_id
       start = @set_start_id
-      history = {}
+      # history = {}
       @objects.each do |obj|
         obj[:indirect_reference_id] = start
         start += 1
@@ -191,7 +194,7 @@ module CombinePDF
             if pos[0].is_a? String
               (pos.length / 2).times do |i|
                 dic << (pos[i * 2].clear << base.next!)
-                pos[(i * 2) + 1][0] = {is_reference_only: true, referenced_object: pages[pos[(i * 2) + 1][0]]} if(pos[(i * 2) + 1].is_a?(Array) && pos[(i * 2) + 1][0].is_a?(Integer))
+                pos[(i * 2) + 1][0] = {is_reference_only: true, referenced_object: pages[pos[(i * 2) + 1][0]]} if(pos[(i * 2) + 1].is_a?(Array) && pos[(i * 2) + 1][0].is_a?(Numeric))
                 dic << (pos[(i * 2) + 1].is_a?(Array) ? { is_reference_only: true, referenced_object: { indirect_without_dictionary: pos[(i * 2) + 1] } } : pos[(i * 2) + 1])
                 # dic << pos[(i * 2) + 1]
               end
@@ -225,12 +228,13 @@ module CombinePDF
     # preffering the new over the old.
     def self.hash_merge_new_no_page(_key, old_data, new_data)
       return old_data unless new_data
+      return new_data unless old_data
       if old_data.is_a?(Hash) && new_data.is_a?(Hash)
         return old_data if (old_data[:Type] == :Page)
         old_data.merge(new_data, &(@hash_merge_new_no_page_proc ||= method(:hash_merge_new_no_page)))
       elsif old_data.is_a? Array
-        new_data = [new_data] unless new_data.is_a? Array
-        old_data + new_data
+        return old_data + new_data if new_data.is_a?(Array)
+        return old_data.dup << new_data
       elsif new_data.is_a? Array
         new_data + [old_data]
       else

data/lib/combine_pdf/pdf_public.rb CHANGED

@@ -100,7 +100,7 @@ module CombinePDF
       @names = parser.names_object || {}
       @forms_data = parser.forms_object || {}
       @outlines = parser.outlines_object || {}
-      # rebuild the catalo, to fix wkhtmltopdf's use of static page numbers
+      # rebuild the catalog, to fix wkhtmltopdf's use of static page numbers
       rebuild_catalog
       # general globals

data/lib/combine_pdf/renderer.rb CHANGED

@@ -20,7 +20,7 @@ module CombinePDF
         return format_name_to_pdf object
       elsif object.is_a?(Array)
         return format_array_to_pdf object
-      elsif object.is_a?(Integer) || object.is_a?(Float) || object.is_a?(TrueClass) || object.is_a?(FalseClass)
+      elsif object.is_a?(Numeric) || object.is_a?(TrueClass) || object.is_a?(FalseClass)
         return object.to_s
       elsif object.is_a?(Hash)
         return format_hash_to_pdf object

data/lib/combine_pdf/version.rb CHANGED

@@ -1,3 +1,3 @@
 module CombinePDF
-  VERSION = '0.2.37'.freeze
+  VERSION = '1.0.0'.freeze
 end

data/test/automated CHANGED

@@ -14,6 +14,7 @@ require 'bundler/setup'
 pdf = CombinePDF.load "./Ruby/test\ pdfs/filled_form.pdf"
 pdf.save '01_check_radio_buttuns.pdf'
+pdf = CombinePDF.load "./Ruby/test\ pdfs/filled_form.pdf"
 pdf << CombinePDF.load("./Ruby/test\ pdfs/empty_form.pdf")
 pdf << CombinePDF.load("./Ruby/test\ pdfs/filled_form.pdf")
 pdf.save '02_check_form_unification_middle_is_empty.pdf'
@@ -51,7 +52,16 @@ pdf.save '06_check_links_to_second_copy.pdf'
 lists = %w(./Ruby/test\ pdfs/outlines/self_merge_err.pdf ./Ruby/test\ pdfs/outlines/big_toc.pdf ./Ruby/test\ pdfs/outlines/bigger_toc.pdf ./Ruby/test\ pdfs/outlines/named_dest_no_toc.pdf ./Ruby/test\ pdfs/outlines/named_dest_no_toc2.pdf ./Ruby/test\ pdfs/outlines/named_dest.pdf ./Ruby/test\ pdfs/outlines/named_dest2.pdf)
 i = 0
-lists.each { |n| CombinePDF.load(n).save("07_#{(i += 1)}_#{n.split('/')[-1]}"); (CombinePDF.load(n) << CombinePDF.load(n)).save("07_#{i}x2_#{n.split('/')[-1]}") }
+lists.each do |n|
+  # puts "loading #{n}"
+  pdf = CombinePDF.load(n)
+  # puts "saving 07_#{(i += 1)}_#{n.split('/')[-1]}"
+  pdf.save("07_#{(i += 1)}_#{n.split('/')[-1]}")
+  # puts "loading #{n}X2"
+  pdf = CombinePDF.load(n) << CombinePDF.load(n)
+  # puts "saving 07_#{i}x2_#{n.split('/')[-1]}"
+  pdf.save("07_#{i}x2_#{n.split('/')[-1]}")
+end
 pdf = CombinePDF.new
 lists.each { |n| pdf << CombinePDF.load(n) }
 pdf.save('07_named destinations.pdf')

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: combine_pdf
 version: !ruby/object:Gem::Version
-  version: 0.2.37
+  version: 1.0.0
 platform: ruby
 authors:
 - Boaz Segev
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2017-05-10 00:00:00.000000000 Z
+date: 2017-05-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: ruby-rc4
@@ -104,7 +104,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.6.8
+rubygems_version: 2.6.11
 signing_key:
 specification_version: 4
 summary: Combine, stamp and watermark PDF files in pure Ruby.