pdf-reader 2.2.0 → 2.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +90 -0
  3. data/README.md +18 -3
  4. data/Rakefile +1 -1
  5. data/bin/pdf_callbacks +1 -1
  6. data/bin/pdf_text +1 -1
  7. data/examples/extract_fonts.rb +12 -7
  8. data/examples/rspec.rb +1 -0
  9. data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
  10. data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
  11. data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
  12. data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
  13. data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
  14. data/lib/pdf/reader/afm/Courier.afm +342 -342
  15. data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
  16. data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
  17. data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
  18. data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
  19. data/lib/pdf/reader/afm/MustRead.html +19 -0
  20. data/lib/pdf/reader/afm/Symbol.afm +213 -213
  21. data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
  22. data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
  23. data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
  24. data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
  25. data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
  26. data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
  27. data/lib/pdf/reader/buffer.rb +91 -47
  28. data/lib/pdf/reader/cid_widths.rb +7 -4
  29. data/lib/pdf/reader/cmap.rb +83 -59
  30. data/lib/pdf/reader/encoding.rb +17 -14
  31. data/lib/pdf/reader/error.rb +15 -3
  32. data/lib/pdf/reader/filter/ascii85.rb +7 -1
  33. data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
  34. data/lib/pdf/reader/filter/depredict.rb +12 -10
  35. data/lib/pdf/reader/filter/flate.rb +30 -16
  36. data/lib/pdf/reader/filter/lzw.rb +2 -0
  37. data/lib/pdf/reader/filter/null.rb +1 -1
  38. data/lib/pdf/reader/filter/run_length.rb +19 -13
  39. data/lib/pdf/reader/filter.rb +11 -11
  40. data/lib/pdf/reader/font.rb +89 -26
  41. data/lib/pdf/reader/font_descriptor.rb +22 -18
  42. data/lib/pdf/reader/form_xobject.rb +18 -5
  43. data/lib/pdf/reader/glyph_hash.rb +28 -13
  44. data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
  45. data/lib/pdf/reader/key_builder_v5.rb +138 -0
  46. data/lib/pdf/reader/lzw.rb +28 -11
  47. data/lib/pdf/reader/no_text_filter.rb +14 -0
  48. data/lib/pdf/reader/null_security_handler.rb +1 -4
  49. data/lib/pdf/reader/object_cache.rb +1 -0
  50. data/lib/pdf/reader/object_hash.rb +292 -63
  51. data/lib/pdf/reader/object_stream.rb +3 -2
  52. data/lib/pdf/reader/overlapping_runs_filter.rb +72 -0
  53. data/lib/pdf/reader/page.rb +143 -16
  54. data/lib/pdf/reader/page_layout.rb +43 -39
  55. data/lib/pdf/reader/page_state.rb +26 -17
  56. data/lib/pdf/reader/page_text_receiver.rb +74 -4
  57. data/lib/pdf/reader/pages_strategy.rb +1 -0
  58. data/lib/pdf/reader/parser.rb +34 -14
  59. data/lib/pdf/reader/point.rb +25 -0
  60. data/lib/pdf/reader/print_receiver.rb +1 -0
  61. data/lib/pdf/reader/rc4_security_handler.rb +38 -0
  62. data/lib/pdf/reader/rectangle.rb +113 -0
  63. data/lib/pdf/reader/reference.rb +3 -1
  64. data/lib/pdf/reader/register_receiver.rb +1 -0
  65. data/lib/pdf/reader/{resource_methods.rb → resources.rb} +17 -9
  66. data/lib/pdf/reader/security_handler_factory.rb +79 -0
  67. data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -94
  68. data/lib/pdf/reader/stream.rb +3 -2
  69. data/lib/pdf/reader/synchronized_cache.rb +1 -0
  70. data/lib/pdf/reader/text_run.rb +40 -5
  71. data/lib/pdf/reader/token.rb +1 -0
  72. data/lib/pdf/reader/transformation_matrix.rb +8 -7
  73. data/lib/pdf/reader/type_check.rb +98 -0
  74. data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
  75. data/lib/pdf/reader/validating_receiver.rb +262 -0
  76. data/lib/pdf/reader/width_calculator/built_in.rb +27 -17
  77. data/lib/pdf/reader/width_calculator/composite.rb +6 -1
  78. data/lib/pdf/reader/width_calculator/true_type.rb +10 -11
  79. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +6 -4
  80. data/lib/pdf/reader/width_calculator/type_zero.rb +6 -2
  81. data/lib/pdf/reader/width_calculator.rb +1 -0
  82. data/lib/pdf/reader/xref.rb +37 -11
  83. data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
  84. data/lib/pdf/reader.rb +49 -24
  85. data/lib/pdf-reader.rb +1 -0
  86. data/rbi/pdf-reader.rbi +2048 -0
  87. metadata +39 -23
  88. data/lib/pdf/hash.rb +0 -20
  89. data/lib/pdf/reader/orientation_detector.rb +0 -34
  90. data/lib/pdf/reader/standard_security_handler_v5.rb +0 -91
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: cfc4ed13692a51d8b78fc181d67fcf8b5e00fb1679dbca36137961f63365edaf
4
- data.tar.gz: de5556fabc41642746fd242a2623c92c9424c56da2d845507c49624c312b646b
3
+ metadata.gz: 2c84983c18d983798ff5f2ede514b540ee55a788229501976474b7341bf57fba
4
+ data.tar.gz: 79b8f092e72a194110062cf7d7e9425c0a6531e145009c9b7c10c2c072b3d1d5
5
5
  SHA512:
6
- metadata.gz: 4074d5dd87f1ad9286f4022ad46a4160f44c6afed2341f9115029770770ae80b248ace9a8d5df0e444046bed662f9aa5a9334822b23222abec9574523d9e7c36
7
- data.tar.gz: a69837921f7581d2aeb9226d0791b4b0dd5925a9f83e9cb4cee4dbaf43af33e6a7a570292650a14006ffc9d1759f2ea4ef268381e5aa63fc6da5c1a6d38f46a7
6
+ metadata.gz: '09c97a875bb46389172ed48ae8b2779ba3a8e032852b6a9943f187de13c23649e2398a5374358c62b64cf9e13bbf7f819bb5072d9aaa6882b9b94e96d23f5c13'
7
+ data.tar.gz: ed92250acee85f4e355785dd043f7774a5883550fe82b01b3cd9e10011f93a1fcdd500108b0e1f4e2af562bddd833c03ca601078b3eba8ee2e9990fd5e76305a
data/CHANGELOG CHANGED
@@ -1,3 +1,93 @@
1
+ v2.11.0 (26th October 2022)
2
+ - Various bug fixes
3
+ - Expanded sorbet type annotations
4
+
5
+ v2.10.0 (12th May 2022)
6
+ - Various bug fixes
7
+ - Expanded sorbet type annotations
8
+
9
+ v2.9.2 (20th February 2022)
10
+ - Fix PDF::Reader::ObjectHash#page_references to return an Array of PDF::Reader::Reference (http://github.com/yob/pdf-reader/pull/444)
11
+
12
+ v2.9.1 (4th February 2022)
13
+ - Fix exception in Page#walk introduced in 2.9.0 (http://github.com/yob/pdf-reader/pull/442)
14
+ - Other small bug fixes
15
+
16
+ v2.9.0 (24th January 2022)
17
+ - Support additional encryption standards (http://github.com/yob/pdf-reader/pull/419)
18
+ - Return CropBox correctly from Page#rectangles (https://github.com/yob/pdf-reader/pull/420)
19
+ - For sorbet users, additional type annotations are included in the gem
20
+
21
+ v2.8.0 (28th December 2021)
22
+ - Add PDF::Reader::Page#runs for extracting text from a page with positioning metadata (http://github.com/yob/pdf-reader/pull/411)
23
+ - Add options to PDF::Reader::Page#text to make some behaviour configurable (http://github.com/yob/pdf-reader/pull/411)
24
+ - including extracting the text for only part of the page
25
+ - Improve text positioning and extraction for Type3 fonts (http://github.com/yob/pdf-reader/pull/412)
26
+ - Skip extracting text that is positioned outside the page (http://github.com/yob/pdf-reader/pull/413)
27
+ - Fix occasional crash when reading some streams (http://github.com/yob/pdf-reader/pull/405)
28
+
29
+ v2.7.0 (13th December 2021)
30
+ - Include RBI type files in the gem
31
+ - Downstream users of pdf-reader who also use sorbet *should* find many parts of the API will
32
+ now be type checked by sorbet
33
+ - Fix glyph positioning in some rotation scenarios (http://github.com/yob/pdf-reader/pull/403)
34
+ - Improved text extraction on some rotated pages, and rotated text on normal pages
35
+ - Add PDF::Reader::Page#rectangles (http://github.com/yob/pdf-reader/pull/402)
36
+ - Returns page boxes (MediaBox, etc) with rotation applied, and as PORO rather than arrays of numbers
37
+ - Add PDF::Reader::Page#origin (http://github.com/yob/pdf-reader/pull/400)
38
+ - Add PDF::Reader::Page#{height,width} (http://github.com/yob/pdf-reader/pull/399)
39
+ - Overlap filter should only drop characters that overlap *and* match (http://github.com/yob/pdf-reader/pull/401)
40
+
41
+ v2.6.0 (12th November 2021)
42
+ - Text extraction improvements
43
+ - Improved text layout on pages with a variety of font sizes (http://github.com/yob/pdf-reader/pull/355)
44
+ - Fixed text positioning for some rotated pages (http://github.com/yob/pdf-reader/pull/356)
45
+ - Improved character width calculation for PDFs using built-in (non-embedded) ZapfDingbats (http://github.com/yob/pdf-reader/pull/373)
46
+ - Skip zero-width characters (http://github.com/yob/pdf-reader/pull/372)
47
+ - Performance improvements
48
+ - Reduced memory pressure when decoding TIFF images (http://github.com/yob/pdf-reader/pull/360)
49
+ - Optional dependency on ascii85_native gem for faster processing of files using the ascii85 filter (http://github.com/yob/pdf-reader/pull/359)
50
+ - Successfully parse more files
51
+ - Gracefully handle some non-spec compliant CR/LF issues (http://github.com/yob/pdf-reader/pull/364)
52
+ - Fix parsing of some escape sequences in content streams (http://github.com/yob/pdf-reader/pull/368)
53
+ - Increase the amount of junk bytes we detect and skip at the end of a file (http://github.com/yob/pdf-reader/pull/382)
54
+ - Ignore "/Prev 0" in trailers (http://github.com/yob/pdf-reader/pull/383)
55
+ - Fix parsing of some inline images (BI ID EI tokens) (http://github.com/yob/pdf-reader/pull/389)
56
+ - Gracefully handle some xref tables that incorrectly start with 1 (http://github.com/yob/pdf-reader/pull/384)
57
+
58
+ v2.5.0 (6th June 2021)
59
+ - bump minimum ruby version to 2.0
60
+ - Correctly handle transcoding to UTF-8 from some fonts that use a difference table [#344](https://github.com/yob/pdf-reader/pull/344/)
61
+ - Fix some character spacing issues with the TJ operator [#343](https://github.com/yob/pdf-reader/pull/343)
62
+ - Fix crash with some encrypted PDFs [#348](https://github.com/yob/pdf-reader/pull/348/)
63
+ - Fix positions of text on some PDFs with pages rotated 90° [#350](https://github.com/yob/pdf-reader/pull/350/)
64
+
65
+ v2.4.2 (28th January 2021)
66
+ - relax ASCII85 dependency to allow 1.x
67
+ - improved support for decompressing objects with slightly malformed zlib data
68
+
69
+ v2.4.1 (24th September 2020)
70
+ - Re-vendor font metrics from Adobe to clarify their license
71
+
72
+ v2.4.0 (21st November 2019)
73
+ - Optimise overlapping characters code introduced in 2.3.0. Text extraction of pages with
74
+ thousands of characters is still slower than it was in 2.2.1, but it might be tolerable
75
+ for now. See https://github.com/yob/pdf-reader/pull/308 for details.
76
+ - Implement very basic font substitution for Type1 and TrueType fonts that aren't embedded
77
+ - Remove PDF::Hash class. It's been deprecated since 2010, and it's hard to believe anyone
78
+ is still using it.
79
+ - Several small bug fixes
80
+
81
+ v2.3.0 (7th November 2019)
82
+ - Text extraction now makes an effort to skip duplicate characters that overlap, a
83
+ common approach used for a fake "bold" effect. This will make text extraction a bit
84
+ slower - if that turns out to be an issue I'll look into further optimisations or
85
+ provide a toggle to turn it off
86
+ - Several small bug fixes
87
+
88
+ v2.2.1 (27th July 2019)
89
+ - Improve utf8 text extraction from CMaps that contain surrogate pair ligatures
90
+
1
91
  v2.2.0 (18th December 2018)
2
92
  - Support additional XRef Stream variants (thanks Stefan Wienert)
3
93
  - Add frozen_strings pragma to reduce object allocations on ruby 2.3+
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # Release Notes
1
+ # pdf-reader
2
2
 
3
3
  The PDF::Reader library implements a PDF parser conforming as much as possible
4
4
  to the PDF specification from Adobe.
@@ -166,6 +166,19 @@ http://groups.google.com/group/pdf-reader
166
166
  The easiest way to explain how this works in practice is to show some examples.
167
167
  Check out the examples/ directory for a few files.
168
168
 
169
+ # Alternate Decoder
170
+
171
+ For PDF files containing Ascii85 streams, the [ascii85_native](https://github.com/AnomalousBit/ascii85_native) gem can be used for increased performance. If the ascii85_native gem is detected, pdf-reader will automatically use the gem.
172
+
173
+ First, run `gem install ascii85_native` and then require the gem alongside pdf-reader:
174
+
175
+ ```ruby
176
+ require "pdf-reader"
177
+ require "ascii85_native"
178
+ ```
179
+
180
+ Another way of enabling native Ascii85 decoding is to place `gem 'ascii85_native'` in your project's `Gemfile`.
181
+
169
182
  # Known Limitations
170
183
 
171
184
  Occasionally some text cannot be extracted properly due to the way it has been
@@ -176,8 +189,10 @@ little UTF-8 friendly box to indicate an unrecognisable character.
176
189
 
177
190
  * PDF::Reader Code Repository: http://github.com/yob/pdf-reader
178
191
 
179
- * PDF Specification: http://www.adobe.com/devnet/pdf/pdf_reference.html
192
+ * PDF Specification: https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf
193
+
194
+ * Adobe PDF Developer Resources: http://www.adobe.com/devnet/pdf/pdf_reference.html
180
195
 
181
- * PDF Tutorial Slide Presentations: http://home.comcast.net/~jk05/presentations/PDFTutorials.html
196
+ * PDF Tutorial Slide Presentations: https://web.archive.org/web/20150110042057/http://home.comcast.net/~jk05/presentations/PDFTutorials.html
182
197
 
183
198
  * Developing with PDF (book): http://shop.oreilly.com/product/0636920025269.do
data/Rakefile CHANGED
@@ -14,7 +14,7 @@ desc "Run cane to check quality metrics"
14
14
  Cane::RakeTask.new(:quality) do |cane|
15
15
  cane.abc_max = 20
16
16
  cane.style_measure = 100
17
- cane.max_violations = 31
17
+ cane.max_violations = 33
18
18
 
19
19
  cane.use Morecane::EncodingCheck, :encoding_glob => "{app,lib,spec}/**/*.rb"
20
20
  end
data/bin/pdf_callbacks CHANGED
@@ -9,7 +9,7 @@ require 'pdf/reader'
9
9
  receiver = PDF::Reader::PrintReceiver.new
10
10
 
11
11
  if ARGV.empty?
12
- browser = PDF::Reader.new($stdin)
12
+ browser = PDF::Reader.new(StringIO.new(ARGF.read))
13
13
  else
14
14
  browser = PDF::Reader.new(ARGV[0])
15
15
  end
data/bin/pdf_text CHANGED
@@ -4,7 +4,7 @@ require 'rubygems'
4
4
  require 'pdf/reader'
5
5
 
6
6
  if ARGV.empty?
7
- browser = PDF::Reader.new($stdin)
7
+ browser = PDF::Reader.new(StringIO.new(ARGF.read))
8
8
  else
9
9
  browser = PDF::Reader.new(ARGV[0])
10
10
  end
@@ -17,8 +17,8 @@ module ExtractFonts
17
17
  return count if page.fonts.nil? || page.fonts.empty?
18
18
 
19
19
  page.fonts.each do |label, font|
20
- next if complete_refs[font]
21
- complete_refs[font] = true
20
+ next if complete_refs[label]
21
+ complete_refs[label] = true
22
22
 
23
23
  process_font(page, font)
24
24
 
@@ -39,7 +39,7 @@ module ExtractFonts
39
39
  when :TrueType, :CIDFontType2 then
40
40
  ExtractFonts::TTF.new(page.objects, font).save("#{font[:BaseFont]}.ttf")
41
41
  else
42
- $stderr.puts "unsupported font type #{font[:Subtype]}"
42
+ $stderr.puts "unsupported font type #{font[:Subtype]} for #{font[:BaseFont]}"
43
43
  end
44
44
  end
45
45
 
@@ -68,10 +68,15 @@ module ExtractFonts
68
68
  end
69
69
  end
70
70
 
71
- filename = File.expand_path(File.dirname(__FILE__)) + "/../spec/data/cairo-unicode.pdf"
71
+ if ARGV.size == 0 # default file name
72
+ ARGV << File.expand_path(File.join(File.dirname(__dir__), "spec", "data", "cairo-unicode.pdf"))
73
+ end
74
+
72
75
  extractor = ExtractFonts::Extractor.new
73
76
 
74
- PDF::Reader.open(filename) do |reader|
75
- page = reader.page(1)
76
- extractor.page(page)
77
+ ARGV.each do |arg|
78
+ PDF::Reader.open(arg) do |reader|
79
+ page = reader.page(1)
80
+ extractor.page(page)
81
+ end
77
82
  end
data/examples/rspec.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
  # coding: utf-8
3
+ # typed: ignore
3
4
 
4
5
  # Basic RSpec of a generated PDF
5
6
  #
@@ -0,0 +1,41 @@
1
+ # coding: utf-8
2
+ # typed: strict
3
+ # frozen_string_literal: true
4
+
5
+ require 'digest/md5'
6
+
7
+ class PDF::Reader
8
+
9
+ # Decrypts data using the AESV2 algorithm defined in the PDF spec. Requires
10
+ # a decryption key, which is usually generated by PDF::Reader::StandardKeyBuilder
11
+ #
12
+ class AesV2SecurityHandler
13
+
14
+ def initialize(key)
15
+ @encrypt_key = key
16
+ end
17
+
18
+ ##7.6.2 General Encryption Algorithm
19
+ #
20
+ # Algorithm 1: Encryption of data using the AES-128-CBC algorithm
21
+ #
22
+ # version == 4 and CFM == AESV2
23
+ #
24
+ # buf - a string to decrypt
25
+ # ref - a PDF::Reader::Reference for the object to decrypt
26
+ #
27
+ def decrypt( buf, ref )
28
+ objKey = @encrypt_key.dup
29
+ (0..2).each { |e| objKey << (ref.id >> e*8 & 0xFF ) }
30
+ (0..1).each { |e| objKey << (ref.gen >> e*8 & 0xFF ) }
31
+ objKey << 'sAlT' # Algorithm 1, b)
32
+ length = objKey.length < 16 ? objKey.length : 16
33
+ cipher = OpenSSL::Cipher.new("AES-#{length << 3}-CBC")
34
+ cipher.decrypt
35
+ cipher.key = Digest::MD5.digest(objKey)[0,length]
36
+ cipher.iv = buf[0..15]
37
+ cipher.update(buf[16..-1]) + cipher.final
38
+ end
39
+
40
+ end
41
+ end
@@ -0,0 +1,38 @@
1
+ # coding: utf-8
2
+ # typed: strict
3
+ # frozen_string_literal: true
4
+
5
+ require 'digest'
6
+ require 'openssl'
7
+
8
+ class PDF::Reader
9
+
10
+ # Decrypts data using the AESV3 algorithm defined in the PDF 1.7, Extension Level 3 spec.
11
+ # Requires a decryption key, which is usually generated by PDF::Reader::KeyBuilderV5
12
+ #
13
+ class AesV3SecurityHandler
14
+
15
+ def initialize(key)
16
+ @encrypt_key = key
17
+ @cipher = "AES-256-CBC"
18
+ end
19
+
20
+ ##7.6.2 General Encryption Algorithm
21
+ #
22
+ # Algorithm 1: Encryption of data using the RC4 or AES algorithms
23
+ #
24
+ # used to decrypt RC4/AES encrypted PDF streams (buf)
25
+ #
26
+ # buf - a string to decrypt
27
+ # ref - a PDF::Reader::Reference for the object to decrypt
28
+ #
29
+ def decrypt( buf, ref )
30
+ cipher = OpenSSL::Cipher.new(@cipher)
31
+ cipher.decrypt
32
+ cipher.key = @encrypt_key.dup
33
+ cipher.iv = buf[0..15]
34
+ cipher.update(buf[16..-1]) + cipher.final
35
+ end
36
+
37
+ end
38
+ end