RubyGems - pdf-reader - Versions diffs - 1.1.1 → 2.5.0 - Mend

pdf-reader 1.1.1 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (82) hide show

checksums.yaml +7 -0
data/CHANGELOG +87 -2
data/{README.rdoc → README.md} +43 -31
data/Rakefile +21 -16
data/bin/pdf_callbacks +1 -1
data/bin/pdf_object +4 -1
data/bin/pdf_text +1 -3
data/examples/callbacks.rb +2 -1
data/examples/extract_images.rb +11 -6
data/examples/fuzzy_paragraphs.rb +24 -0
data/lib/pdf/reader/afm/Courier-Bold.afm +342 -0
data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -0
data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -0
data/lib/pdf/reader/afm/Courier.afm +342 -0
data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -0
data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -0
data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -0
data/lib/pdf/reader/afm/Helvetica.afm +3051 -0
data/lib/pdf/reader/afm/MustRead.html +19 -0
data/lib/pdf/reader/afm/Symbol.afm +213 -0
data/lib/pdf/reader/afm/Times-Bold.afm +2588 -0
data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -0
data/lib/pdf/reader/afm/Times-Italic.afm +2667 -0
data/lib/pdf/reader/afm/Times-Roman.afm +2419 -0
data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -0
data/lib/pdf/reader/buffer.rb +90 -63
data/lib/pdf/reader/cid_widths.rb +63 -0
data/lib/pdf/reader/cmap.rb +69 -38
data/lib/pdf/reader/encoding.rb +74 -48
data/lib/pdf/reader/error.rb +24 -4
data/lib/pdf/reader/filter/ascii85.rb +28 -0
data/lib/pdf/reader/filter/ascii_hex.rb +30 -0
data/lib/pdf/reader/filter/depredict.rb +141 -0
data/lib/pdf/reader/filter/flate.rb +53 -0
data/lib/pdf/reader/filter/lzw.rb +21 -0
data/lib/pdf/reader/filter/null.rb +18 -0
data/lib/pdf/reader/filter/run_length.rb +45 -0
data/lib/pdf/reader/filter.rb +15 -234
data/lib/pdf/reader/font.rb +107 -43
data/lib/pdf/reader/font_descriptor.rb +80 -0
data/lib/pdf/reader/form_xobject.rb +26 -4
data/lib/pdf/reader/glyph_hash.rb +56 -18
data/lib/pdf/reader/lzw.rb +6 -4
data/lib/pdf/reader/null_security_handler.rb +17 -0
data/lib/pdf/reader/object_cache.rb +40 -16
data/lib/pdf/reader/object_hash.rb +94 -40
data/lib/pdf/reader/object_stream.rb +1 -0
data/lib/pdf/reader/orientation_detector.rb +34 -0
data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
data/lib/pdf/reader/page.rb +48 -3
data/lib/pdf/reader/page_layout.rb +125 -0
data/lib/pdf/reader/page_state.rb +185 -70
data/lib/pdf/reader/page_text_receiver.rb +70 -20
data/lib/pdf/reader/pages_strategy.rb +4 -293
data/lib/pdf/reader/parser.rb +37 -61
data/lib/pdf/reader/print_receiver.rb +6 -0
data/lib/pdf/reader/reference.rb +4 -1
data/lib/pdf/reader/register_receiver.rb +17 -31
data/lib/pdf/reader/resource_methods.rb +1 -0
data/lib/pdf/reader/standard_security_handler.rb +82 -42
data/lib/pdf/reader/standard_security_handler_v5.rb +91 -0
data/lib/pdf/reader/stream.rb +5 -2
data/lib/pdf/reader/synchronized_cache.rb +33 -0
data/lib/pdf/reader/text_run.rb +99 -0
data/lib/pdf/reader/token.rb +4 -1
data/lib/pdf/reader/transformation_matrix.rb +195 -0
data/lib/pdf/reader/unimplemented_security_handler.rb +17 -0
data/lib/pdf/reader/width_calculator/built_in.rb +67 -0
data/lib/pdf/reader/width_calculator/composite.rb +28 -0
data/lib/pdf/reader/width_calculator/true_type.rb +56 -0
data/lib/pdf/reader/width_calculator/type_one_or_three.rb +33 -0
data/lib/pdf/reader/width_calculator/type_zero.rb +25 -0
data/lib/pdf/reader/width_calculator.rb +12 -0
data/lib/pdf/reader/xref.rb +41 -9
data/lib/pdf/reader.rb +45 -104
data/lib/pdf-reader.rb +4 -1
metadata +220 -101
data/bin/pdf_list_callbacks +0 -17
data/lib/pdf/hash.rb +0 -15
data/lib/pdf/reader/abstract_strategy.rb +0 -81
data/lib/pdf/reader/metadata_strategy.rb +0 -56
data/lib/pdf/reader/text_receiver.rb +0 -264

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: 652d05cf6a22fad5ecb4b92de1e27ba60cafc6525c5ca524e24c7f9796fe1b83
+  data.tar.gz: 2c7448e97890a9fcbd10ec2cd5bafb9025db2fb75dabaf71a4074c542b1065a1
+SHA512:
+  metadata.gz: ac82452924cf46af98ee15f2a20642b1d06d5b9c22104fe171b5b4612665e482f341e12473805016ccb9d921fc15324ba51675170b369adeace8b278cd1279fb
+  data.tar.gz: b1dc1c4422b0e6bf01092cf724630ba7424fdef1fdaf34f33aaa3a31397caf6ef5a73185a98e6e2828a9e082d87cbca311565397cb064cac20d86e72be27626f

data/CHANGELOG CHANGED Viewed

@@ -1,3 +1,88 @@
+v2.5.0 (6th June 2021)
+- bump minimum ruby version to 2.0
+- Correctly handle transcoding to UTF-8 from some fonts that use a difference table [#344](https://github.com/yob/pdf-reader/pull/344/)
+- Fix some character spacing issues with the TJ operator [#343](https://github.com/yob/pdf-reader/pull/343)
+- Fix crash with some encrypted PDFs [#348](https://github.com/yob/pdf-reader/pull/348/)
+- Fix positions of text on some PDFs with pages rotated 90° [#350](https://github.com/yob/pdf-reader/pull/350/)
+v2.4.2 (28th January 2021)
+- relax ASCII85 dependency to allow 1.x
+- improved support for decompressing objects with slightly malformed zlib data
+v2.4.1 (24th September 2020)
+- Re-vendor font metrics from Adobe to clarify their license
+v2.4.0 (21st November 2019)
+- Optimise overlapping characters code introduced in 2.3.0. Text extraction of pages with
+  thousands of characters is still slower than it was in 2.2.1, but it might be tolerable
+  for now. See https://github.com/yob/pdf-reader/pull/308 for details.
+- Implement very basic font substitution for Type1 and TrueType fonts that aren't embedded
+- Remove PDF::Hash class. It's been deprecated since 2010, and it's hard to believe anyone
+  is still using it.
+- Several small bug fixes
+v2.3.0 (7th November 2019)
+- Text extraction now makes an effort to skip duplicate characters that overlap, a
+  common approach used for a fake "bold" effect. This will make text extraction a bit
+  slower - if that turns out to be an issue I'll look into further optimisations or
+  provide a toggle to turn it off
+- Several small bug fixes
+v2.2.1 (27th July 2019)
+- Improve utf8 text extraction from CMaps that contain surrogate pair ligatures
+v2.2.0 (18th December 2018)
+- Support additional XRef Stream variants (thanks Stefan Wienert)
+- Add frozen_strings pragma to reduce object allocations on ruby 2.3+
+- various bug fixes
+v2.1.0 (15th February 2018)
+- Support extra encrypted PDF variants (thanks to Gyuchang Jun)
+- various bug fixes
+v2.0.0 (25th February 2017)
+- various bug fixes
+v2.0.0.beta1 (15th February 2017)
+- BREAKING CHANGE: remove all methods that were deprecated in 1.0.0
+- Bug: Support extra encrypted PDF variants (thanks to Gyuchang Jun)
+- various bug fixes
+v1.4.1 (2nd January 2017)
+- improve compatibility with ruby 2.4 (thanks Akira Matsuda)
+- various bug fixes
+v1.4.0 (22nd February 2016)
+- raise minimum ruby version to 1.9.3
+- print warnings to stderr when deprecated methods are used. These methods have been
+  deprecated for 4 years, so hopefully few people are depending on them
+- Fix exception when a non-breaking space (character 160) is used with a
+  built-in font (helvetica, etc)
+- various bug fixes
+v1.3.3 (7th April 2013)
+- various bug fixes
+v1.3.2 (26th February 2013)
+- various bug fixes
+v1.3.1 (12th February 2013)
+- various bug fixes
+v1.3.0 (30th December 2012)
+- Numerous performance optimisations (thanks Alex Dowad)
+- Improved text extraction (thanks Nathaniel Madura)
+- Load less of the hashery gem to reduce core monkey patches
+- various bug fixes
+v1.2.0 (28th August 2012)
+- Feature: correctly extract text using surrogate pairs and ligatures
+  (thanks Nathaniel Madura)
+- Speed optimisation: cache tokenised Form XObjects to avoid re-parsing them
+- Feature: support opening documents with some junk bytes prepended to file
+  (thanks Paul Gallagher)
+  - Acrobat does this, so it seemed reasonable to add support
 v1.1.1 (9th May 2012)
 - bugfix release to improve parsing of some PDFs
@@ -56,10 +141,10 @@ v0.9.2 (24th April 2011)
 v0.9.1 (21st December 2010)
 - force gem to only install on ruby 1.8.7 or higher
-  - maintaining supprot for earlier versions takes more time than I have
+  - maintaining support for earlier versions takes more time than I have
     available at the moment
 - bug: fix parsing of obscure pdf name format
-- bug: fix behaviour when loaded in confunction with htmldoc gem
+- bug: fix behaviour when loaded in conjunction with htmldoc gem
 v0.9.0 (19th November 2010)
 - support for pdf 1.5+ files that use object and xref streams

data/{README.rdoc → README.md} RENAMED Viewed

@@ -1,4 +1,4 @@
-= Release Notes
+# pdf-reader
 The PDF::Reader library implements a PDF parser conforming as much as possible
 to the PDF specification from Adobe.
@@ -15,46 +15,55 @@ higher level functionality - it's not going to render a PDF for you. There are
 a few exceptions to support very common use cases like extracting text from a
 page.
-= Installation
+# Installation
 The recommended installation method is via Rubygems.
+```ruby
   gem install pdf-reader
+```
-= Usage
+# Usage
 Begin by creating a PDF::Reader instance that points to a PDF file. Document
 level information (metadata, page count, bookmarks, etc) is available via
 this object.
+```ruby
     reader = PDF::Reader.new("somefile.pdf")
     puts reader.pdf_version
     puts reader.info
     puts reader.metadata
     puts reader.page_count
+ ```
 PDF::Reader.new accepts an IO stream or a filename. Here's an example with
 an IO stream:
+```ruby
     require 'open-uri'
     io     = open('http://example.com/somefile.pdf')
     reader = PDF::Reader.new(io)
     puts reader.info
+ ```
 If you open a PDF with File#open or IO#open, I strongly recommend using "rb"
 mode to ensure the file isn't mangled by ruby being 'helpful'. This is
 particularly important on windows and MRI >= 1.9.2.
+```ruby
     File.open("somefile.pdf", "rb") do |io|
       reader = PDF::Reader.new(io)
       puts reader.info
     end
+ ```
 PDF is a page based file format, so most visible information is available via
 page-based iteration
+```ruby
     reader = PDF::Reader.new("somefile.pdf")
     reader.pages.each do |page|
@@ -62,10 +71,12 @@ page-based iteration
       puts page.text
       puts page.raw_content
     end
+```
 If you need to access the full program for rendering a page, use the walk() method
 of PDF::Reader::Page.
+```ruby
     class RedGreenBlue
       def set_rgb_color_for_nonstroking(r, g, b)
         puts "R: #{r}, G: #{g}, B: #{b}"
@@ -76,37 +87,32 @@ of PDF::Reader::Page.
     page     = reader.page(1)
     receiver = RedGreenBlue.new
     page.walk(receiver)
+```
-For low level access to the objects in a PDF file, use the ObjectHash class. You can
-build an ObjectHash instance directly:
-    puts PDF::Reader::ObjectHash.new("somefile.pdf")
-or via a PDF::Reader instance:
+For low level access to the objects in a PDF file, use the ObjectHash class like
+so:
+```ruby
     reader  = PDF::Reader.new("somefile.pdf")
-    puts reader.objects
-The second method is preferred to increase the effectiveness of internal caching.
+    puts reader.objects.inspect
+```
-= Text Encoding
+# Text Encoding
 Regardless of the internal encoding used in the PDF all text will be converted
 to UTF-8 before it is passed back from PDF::Reader.
-Strings that contain binary data (like font blobs) will be marked as such on
-M17N aware VMs.
+Strings that contain binary data (like font blobs) will be marked as such.
-= Former API
+# Former API
 Version 1.0.0 of PDF::Reader introduced a new page-based API that provides
 efficient and easy access to any page.
-The previous API is marked as deprecated but will continue to work for the
-time being. Eventually calls to the old API will begin triggering deprecation
-warnings before it is completely removed in version 2.0.0.
+The pre-1.0 API was deprecated during the 1.x release series, and has been
+removed from 2.0.0.
-= Exceptions
+# Exceptions
 There are two key exceptions that you will need to watch out for when processing a
 PDF file:
@@ -126,7 +132,7 @@ don't, 'rescue MalformedPDFError' will catch all the subclassed errors as well.
 Any other exceptions should be considered bugs in PDF::Reader (please
 report them!).
-= PDF Integrity
+# PDF Integrity
 Windows developers may run into problems when running specs due to MalformedPDFErrors.
 This is usually because CRLF characters are automatically added to some of the PDF's in
@@ -134,18 +140,20 @@ the spec folder when you checkout a branch from Git.
 To remove any invalid CRLF characters added while checking out a branch from Git, run:
+```ruby
     rake fix_integrity
+```
-= Maintainers
+# Maintainers
-- James Healy <mailto:jimmy@deefa.com>
+* James Healy <mailto:jimmy@deefa.com>
-= Licensing
+# Licensing
 This library is distributed under the terms of the MIT License. See the included file for
 more detail.
-= Mailing List
+# Mailing List
 Any questions or feedback should be sent to the PDF::Reader google group. It's
 better that any answers be available for others instead of hiding in someone's
@@ -153,19 +161,23 @@ inbox.
 http://groups.google.com/group/pdf-reader
-= Examples
+# Examples
 The easiest way to explain how this works in practice is to show some examples.
 Check out the examples/ directory for a few files.
-= Known Limitations
+# Known Limitations
 Occasionally some text cannot be extracted properly due to the way it has been
 stored, or the use of invalid bytes. In these cases PDF::Reader will output a
 little UTF-8 friendly box to indicate an unrecognisable character.
-= Resources
+# Resources
+* PDF::Reader Code Repository: http://github.com/yob/pdf-reader
+* PDF Specification: http://www.adobe.com/devnet/pdf/pdf_reference.html
+* PDF Tutorial Slide Presentations: https://web.archive.org/web/20150110042057/http://home.comcast.net/~jk05/presentations/PDFTutorials.html
-- PDF::Reader Code Repository: http://github.com/yob/pdf-reader
-- PDF Specification: http://www.adobe.com/devnet/pdf/pdf_reference.html
-- PDF Tutorial Slide Presentations: http://home.comcast.net/~jk05/presentations/PDFTutorials.html
+* Developing with PDF (book): http://shop.oreilly.com/product/0636920025269.do

data/Rakefile CHANGED Viewed

@@ -1,19 +1,26 @@
-require "rubygems"
-require "bundler"
-Bundler.setup
-require 'rake'
-require 'rake/rdoctask'
-require 'rspec/core/rake_task'
-require 'roodi'
-require 'roodi_task'
+require "bundler/gem_tasks"
+require "digest/md5"
+require "rdoc/task"
+require "rspec/core/rake_task"
+require "yaml"
 desc "Default Task"
-task :default => [ :spec ]
+task :default => [ :quality, :spec ]
+require 'cane/rake_task'
+require 'morecane'
+desc "Run cane to check quality metrics"
+Cane::RakeTask.new(:quality) do |cane|
+  cane.abc_max = 20
+  cane.style_measure = 100
+  cane.max_violations = 31
+  cane.use Morecane::EncodingCheck, :encoding_glob => "{app,lib,spec}/**/*.rb"
+end
-# run all rspecs
 desc "Run all rspec files"
-RSpec::Core::RakeTask.new("spec") do |t|
+RSpec::Core::RakeTask.new(:spec) do |t|
   t.rspec_opts  = ["--color", "--format progress"]
   t.ruby_opts = "-w"
 end
@@ -31,16 +38,14 @@ Rake::RDocTask.new("doc") do |rdoc|
   rdoc.options << "--inline-source"
 end
-RoodiTask.new 'roodi', ['lib/**/*.rb']
 desc "Create a YAML file of integrity info for PDFs in the spec suite"
 task :integrity_yaml do
   data = {}
-  Dir.glob("spec/data/**/*.*").each do |path|
+  Dir.glob("spec/data/**/*.*").sort.each do |path|
     path_without_spec = path.gsub("spec/","")
     data[path_without_spec] = {
       :bytes => File.size(path),
-      :md5   => `md5sum "#{path}"`.split.first
+      :md5 => Digest::MD5.hexdigest(File.read(path))
     } if File.file?(path)
   end
   File.open("spec/integrity.yml","wb") { |f| f.write YAML.dump(data)}

data/bin/pdf_callbacks CHANGED Viewed

@@ -9,7 +9,7 @@ require 'pdf/reader'
 receiver = PDF::Reader::PrintReceiver.new
 if ARGV.empty?
-  browser = PDF::Reader.new($stdin)
+  browser = PDF::Reader.new(StringIO.new(ARGF.read))
 else
   browser = PDF::Reader.new(ARGV[0])
 end

data/bin/pdf_object CHANGED Viewed

@@ -25,7 +25,10 @@ gen = gen.to_i
 # make magic happen
 begin
-  obj = PDF::Reader.object_file(filename, id, gen)
+  obj = nil
+  PDF::Reader.open(filename) do |pdf|
+    obj = pdf.objects[PDF::Reader::Reference.new(id, gen)]
+  end
   case obj
   when Hash, Array

data/bin/pdf_text CHANGED Viewed

@@ -1,12 +1,10 @@
 #!/usr/bin/env ruby
 require 'rubygems'
-$LOAD_PATH.unshift(File.dirname(__FILE__) + "/../lib")
 require 'pdf/reader'
 if ARGV.empty?
-  browser = PDF::Reader.new($stdin)
+  browser = PDF::Reader.new(StringIO.new(ARGF.read))
 else
   browser = PDF::Reader.new(ARGV[0])
 end

data/examples/callbacks.rb CHANGED Viewed

@@ -9,12 +9,13 @@
 require 'rubygems'
 require 'pdf/reader'
-receiver = PDF::Reader::RegisterReceiver.new
 filename = File.expand_path(File.dirname(__FILE__)) + "/../spec/data/cairo-basic.pdf"
 PDF::Reader.open(filename) do |reader|
   reader.pages.each do |page|
+    receiver = PDF::Reader::RegisterReceiver.new
     page.walk(receiver)
     receiver.callbacks.each do |cb|
       puts cb
     end

data/examples/extract_images.rb CHANGED Viewed

@@ -86,14 +86,15 @@ module ExtractImages
       tiff = header.dup
       tiff << short_tag.call( 256, 1, w ) # image width
       tiff << short_tag.call( 257, 1, h ) # image height
-      tiff << long_tag.call( 258, 4, (header.size + (tag_count*12))) # bits per pixel
+      tiff << long_tag.call( 258, 4, (header.size + (tag_count*12) + 4)) # bits per pixel
       tiff << short_tag.call( 259, 1, 1 ) # compression
       tiff << short_tag.call( 262, 1, 5 ) # colorspace - separation
-      tiff << long_tag.call( 273, 1, (10 + (tag_count*12) + 16) ) # data offset
+      tiff << long_tag.call( 273, 1, (10 + (tag_count*12) + 20) ) # data offset
       tiff << short_tag.call( 277, 1, 4 ) # samples per pixel
       tiff << long_tag.call( 279, 1, stream.unfiltered_data.size) # data byte size
       tiff << short_tag.call( 284, 1, 1 ) # planer config
       tiff << long_tag.call( 332, 1, 1)   # inkset - CMYK
+      tiff << [0].pack("I") # next IFD pointer
       tiff << [bpc, bpc, bpc, bpc].pack("IIII")
       tiff << stream.unfiltered_data
       File.open(filename, "wb") { |file| file.write tiff }
@@ -119,10 +120,12 @@ module ExtractImages
       tiff << short_tag.call( 258, 1, 8 ) # bits per pixel
       tiff << short_tag.call( 259, 1, 1 ) # compression
       tiff << short_tag.call( 262, 1, 1 ) # colorspace - grayscale
-      tiff << long_tag.call( 273, 1, (10 + (tag_count*12)) ) # data offset
+      tiff << long_tag.call( 273, 1, (10 + (tag_count*12) + 4) ) # data offset
       tiff << short_tag.call( 277, 1, 1 ) # samples per pixel
       tiff << long_tag.call( 279, 1, stream.unfiltered_data.size) # data byte size
       tiff << short_tag.call( 284, 1, 1 ) # planer config
+      tiff << [0].pack("I") # next IFD pointer
+      p stream.unfiltered_data.size
       tiff << stream.unfiltered_data
       File.open(filename, "wb") { |file| file.write tiff }
     end
@@ -144,12 +147,13 @@ module ExtractImages
       tiff = header.dup
       tiff << short_tag.call( 256, 1, w ) # image width
       tiff << short_tag.call( 257, 1, h ) # image height
-      tiff << long_tag.call( 258, 3, (header.size + (tag_count*12))) # bits per pixel
+      tiff << long_tag.call( 258, 3, (header.size + (tag_count*12) + 4)) # bits per pixel
       tiff << short_tag.call( 259, 1, 1 ) # compression
       tiff << short_tag.call( 262, 1, 2 ) # colorspace - RGB
-      tiff << long_tag.call( 273, 1, (header.size + (tag_count*12) + 12) ) # data offset
+      tiff << long_tag.call( 273, 1, (header.size + (tag_count*12) + 16) ) # data offset
       tiff << short_tag.call( 277, 1, 3 ) # samples per pixel
       tiff << long_tag.call( 279, 1, stream.unfiltered_data.size) # data byte size
+      tiff << [0].pack("I") # next IFD pointer
       tiff << [bpc, bpc, bpc].pack("III")
       tiff << stream.unfiltered_data
       File.open(filename, "wb") { |file| file.write tiff }
@@ -209,8 +213,9 @@ module ExtractImages
       + short_tag.call( 256, cols ) \
       + short_tag.call( 257, h ) \
       + short_tag.call( 259, 4 ) \
-      + long_tag.call( 273, (10 + (5*12)) ) \
+      + long_tag.call( 273, (10 + (5*12) + 4) ) \
       + long_tag.call( 279, len) \
+      + [0].pack("I") \
       + stream.data
       File.open(filename, "wb") { |file| file.write tiff }
     end