pdf-reader 1.4.1 → 2.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (79) hide show
  1. checksums.yaml +5 -5
  2. data/CHANGELOG +53 -3
  3. data/{README.rdoc → README.md} +40 -23
  4. data/Rakefile +2 -2
  5. data/bin/pdf_callbacks +1 -1
  6. data/bin/pdf_object +4 -1
  7. data/bin/pdf_text +1 -1
  8. data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
  9. data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
  10. data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
  11. data/lib/pdf/reader/afm/Courier.afm +342 -342
  12. data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
  13. data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
  14. data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
  15. data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
  16. data/lib/pdf/reader/afm/MustRead.html +19 -0
  17. data/lib/pdf/reader/afm/Symbol.afm +213 -213
  18. data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
  19. data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
  20. data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
  21. data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
  22. data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
  23. data/lib/pdf/reader/buffer.rb +14 -12
  24. data/lib/pdf/reader/cid_widths.rb +2 -0
  25. data/lib/pdf/reader/cmap.rb +48 -36
  26. data/lib/pdf/reader/encoding.rb +16 -18
  27. data/lib/pdf/reader/error.rb +5 -0
  28. data/lib/pdf/reader/filter/ascii85.rb +1 -0
  29. data/lib/pdf/reader/filter/ascii_hex.rb +2 -0
  30. data/lib/pdf/reader/filter/depredict.rb +1 -0
  31. data/lib/pdf/reader/filter/flate.rb +29 -16
  32. data/lib/pdf/reader/filter/lzw.rb +2 -0
  33. data/lib/pdf/reader/filter/null.rb +2 -0
  34. data/lib/pdf/reader/filter/run_length.rb +4 -6
  35. data/lib/pdf/reader/filter.rb +2 -0
  36. data/lib/pdf/reader/font.rb +12 -13
  37. data/lib/pdf/reader/font_descriptor.rb +1 -0
  38. data/lib/pdf/reader/form_xobject.rb +1 -0
  39. data/lib/pdf/reader/glyph_hash.rb +7 -2
  40. data/lib/pdf/reader/lzw.rb +4 -4
  41. data/lib/pdf/reader/null_security_handler.rb +17 -0
  42. data/lib/pdf/reader/object_cache.rb +1 -0
  43. data/lib/pdf/reader/object_hash.rb +91 -37
  44. data/lib/pdf/reader/object_stream.rb +1 -0
  45. data/lib/pdf/reader/orientation_detector.rb +5 -4
  46. data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
  47. data/lib/pdf/reader/page.rb +30 -1
  48. data/lib/pdf/reader/page_layout.rb +19 -24
  49. data/lib/pdf/reader/page_state.rb +8 -5
  50. data/lib/pdf/reader/page_text_receiver.rb +23 -1
  51. data/lib/pdf/reader/pages_strategy.rb +2 -304
  52. data/lib/pdf/reader/parser.rb +10 -7
  53. data/lib/pdf/reader/print_receiver.rb +1 -0
  54. data/lib/pdf/reader/reference.rb +1 -0
  55. data/lib/pdf/reader/register_receiver.rb +1 -0
  56. data/lib/pdf/reader/resource_methods.rb +1 -0
  57. data/lib/pdf/reader/standard_security_handler.rb +80 -42
  58. data/lib/pdf/reader/standard_security_handler_v5.rb +91 -0
  59. data/lib/pdf/reader/stream.rb +1 -0
  60. data/lib/pdf/reader/synchronized_cache.rb +1 -0
  61. data/lib/pdf/reader/text_run.rb +28 -9
  62. data/lib/pdf/reader/token.rb +1 -0
  63. data/lib/pdf/reader/transformation_matrix.rb +1 -0
  64. data/lib/pdf/reader/unimplemented_security_handler.rb +17 -0
  65. data/lib/pdf/reader/width_calculator/built_in.rb +25 -16
  66. data/lib/pdf/reader/width_calculator/composite.rb +1 -0
  67. data/lib/pdf/reader/width_calculator/true_type.rb +2 -2
  68. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
  69. data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
  70. data/lib/pdf/reader/width_calculator.rb +1 -0
  71. data/lib/pdf/reader/xref.rb +11 -5
  72. data/lib/pdf/reader.rb +30 -119
  73. data/lib/pdf-reader.rb +1 -0
  74. metadata +35 -61
  75. data/bin/pdf_list_callbacks +0 -17
  76. data/lib/pdf/hash.rb +0 -19
  77. data/lib/pdf/reader/abstract_strategy.rb +0 -81
  78. data/lib/pdf/reader/metadata_strategy.rb +0 -56
  79. data/lib/pdf/reader/text_receiver.rb +0 -265
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: fb8a5be7c95212f559bb4d26af5fbdb484d21e77
4
- data.tar.gz: f8fe70bf868dfff03b47a0b81993d1e680593e84
2
+ SHA256:
3
+ metadata.gz: 652d05cf6a22fad5ecb4b92de1e27ba60cafc6525c5ca524e24c7f9796fe1b83
4
+ data.tar.gz: 2c7448e97890a9fcbd10ec2cd5bafb9025db2fb75dabaf71a4074c542b1065a1
5
5
  SHA512:
6
- metadata.gz: b881cecddfa41e3ad15dcafd31d4109290c664d0cf06478f3af6769aa7ced108e3ba082db54c6759c117d7559cc118e0d3a971c17b59cb23bf4e50024089fa6b
7
- data.tar.gz: 50d61b135d79840dce5e5ca712b5db5185deefeee5de13d2adc63c1a8e1eb4b383bb0e8bb491c03bea49d11c4edf130b0fdb3b2eafea63ee0b85ca0390e047a0
6
+ metadata.gz: ac82452924cf46af98ee15f2a20642b1d06d5b9c22104fe171b5b4612665e482f341e12473805016ccb9d921fc15324ba51675170b369adeace8b278cd1279fb
7
+ data.tar.gz: b1dc1c4422b0e6bf01092cf724630ba7424fdef1fdaf34f33aaa3a31397caf6ef5a73185a98e6e2828a9e082d87cbca311565397cb064cac20d86e72be27626f
data/CHANGELOG CHANGED
@@ -1,5 +1,55 @@
1
+ v2.5.0 (6th June 2021)
2
+ - bump minimum ruby version to 2.0
3
+ - Correctly handle transcoding to UTF-8 from some fonts that use a difference table [#344](https://github.com/yob/pdf-reader/pull/344/)
4
+ - Fix some character spacing issues with the TJ operator [#343](https://github.com/yob/pdf-reader/pull/343)
5
+ - Fix crash with some encrypted PDFs [#348](https://github.com/yob/pdf-reader/pull/348/)
6
+ - Fix positions of text on some PDFs with pages rotated 90° [#350](https://github.com/yob/pdf-reader/pull/350/)
7
+
8
+ v2.4.2 (28th January 2021)
9
+ - relax ASCII85 dependency to allow 1.x
10
+ - improved support for decompressing objects with slightly malformed zlib data
11
+
12
+ v2.4.1 (24th September 2020)
13
+ - Re-vendor font metrics from Adobe to clarify their license
14
+
15
+ v2.4.0 (21st November 2019)
16
+ - Optimise overlapping characters code introduced in 2.3.0. Text extraction of pages with
17
+ thousands of characters is still slower than it was in 2.2.1, but it might be tolerable
18
+ for now. See https://github.com/yob/pdf-reader/pull/308 for details.
19
+ - Implement very basic font substitution for Type1 and TrueType fonts that aren't embedded
20
+ - Remove PDF::Hash class. It's been deprecated since 2010, and it's hard to believe anyone
21
+ is still using it.
22
+ - Several small bug fixes
23
+
24
+ v2.3.0 (7th November 2019)
25
+ - Text extraction now makes an effort to skip duplicate characters that overlap, a
26
+ common approach used for a fake "bold" effect. This will make text extraction a bit
27
+ slower - if that turns out to be an issue I'll look into further optimisations or
28
+ provide a toggle to turn it off
29
+ - Several small bug fixes
30
+
31
+ v2.2.1 (27th July 2019)
32
+ - Improve utf8 text extraction from CMaps that contain surrogate pair ligatures
33
+
34
+ v2.2.0 (18th December 2018)
35
+ - Support additional XRef Stream variants (thanks Stefan Wienert)
36
+ - Add frozen_strings pragma to reduce object allocations on ruby 2.3+
37
+ - various bug fixes
38
+
39
+ v2.1.0 (15th February 2018)
40
+ - Support extra encrypted PDF variants (thanks to Gyuchang Jun)
41
+ - various bug fixes
42
+
43
+ v2.0.0 (25th February 2017)
44
+ - various bug fixes
45
+
46
+ v2.0.0.beta1 (15th February 2017)
47
+ - BREAKING CHANGE: remove all methods that were deprecated in 1.0.0
48
+ - Bug: Support extra encrypted PDF variants (thanks to Gyuchang Jun)
49
+ - various bug fixes
50
+
1
51
  v1.4.1 (2nd January 2017)
2
- - improve compatability with ruby 2.4 (thanks Akira Matsuda)
52
+ - improve compatibility with ruby 2.4 (thanks Akira Matsuda)
3
53
  - various bug fixes
4
54
 
5
55
  v1.4.0 (22nd February 2016)
@@ -91,10 +141,10 @@ v0.9.2 (24th April 2011)
91
141
 
92
142
  v0.9.1 (21st December 2010)
93
143
  - force gem to only install on ruby 1.8.7 or higher
94
- - maintaining supprot for earlier versions takes more time than I have
144
+ - maintaining support for earlier versions takes more time than I have
95
145
  available at the moment
96
146
  - bug: fix parsing of obscure pdf name format
97
- - bug: fix behaviour when loaded in confunction with htmldoc gem
147
+ - bug: fix behaviour when loaded in conjunction with htmldoc gem
98
148
 
99
149
  v0.9.0 (19th November 2010)
100
150
  - support for pdf 1.5+ files that use object and xref streams
@@ -1,4 +1,4 @@
1
- = Release Notes
1
+ # pdf-reader
2
2
 
3
3
  The PDF::Reader library implements a PDF parser conforming as much as possible
4
4
  to the PDF specification from Adobe.
@@ -15,46 +15,55 @@ higher level functionality - it's not going to render a PDF for you. There are
15
15
  a few exceptions to support very common use cases like extracting text from a
16
16
  page.
17
17
 
18
- = Installation
18
+ # Installation
19
19
 
20
20
  The recommended installation method is via Rubygems.
21
21
 
22
+ ```ruby
22
23
  gem install pdf-reader
24
+ ```
23
25
 
24
- = Usage
26
+ # Usage
25
27
 
26
28
  Begin by creating a PDF::Reader instance that points to a PDF file. Document
27
29
  level information (metadata, page count, bookmarks, etc) is available via
28
30
  this object.
29
31
 
32
+ ```ruby
30
33
  reader = PDF::Reader.new("somefile.pdf")
31
34
 
32
35
  puts reader.pdf_version
33
36
  puts reader.info
34
37
  puts reader.metadata
35
38
  puts reader.page_count
39
+ ```
36
40
 
37
41
  PDF::Reader.new accepts an IO stream or a filename. Here's an example with
38
42
  an IO stream:
39
43
 
44
+ ```ruby
40
45
  require 'open-uri'
41
46
 
42
47
  io = open('http://example.com/somefile.pdf')
43
48
  reader = PDF::Reader.new(io)
44
49
  puts reader.info
50
+ ```
45
51
 
46
52
  If you open a PDF with File#open or IO#open, I strongly recommend using "rb"
47
53
  mode to ensure the file isn't mangled by ruby being 'helpful'. This is
48
54
  particularly important on windows and MRI >= 1.9.2.
49
55
 
56
+ ```ruby
50
57
  File.open("somefile.pdf", "rb") do |io|
51
58
  reader = PDF::Reader.new(io)
52
59
  puts reader.info
53
60
  end
61
+ ```
54
62
 
55
63
  PDF is a page based file format, so most visible information is available via
56
64
  page-based iteration
57
65
 
66
+ ```ruby
58
67
  reader = PDF::Reader.new("somefile.pdf")
59
68
 
60
69
  reader.pages.each do |page|
@@ -62,10 +71,12 @@ page-based iteration
62
71
  puts page.text
63
72
  puts page.raw_content
64
73
  end
74
+ ```
65
75
 
66
76
  If you need to access the full program for rendering a page, use the walk() method
67
77
  of PDF::Reader::Page.
68
78
 
79
+ ```ruby
69
80
  class RedGreenBlue
70
81
  def set_rgb_color_for_nonstroking(r, g, b)
71
82
  puts "R: #{r}, G: #{g}, B: #{b}"
@@ -76,31 +87,32 @@ of PDF::Reader::Page.
76
87
  page = reader.page(1)
77
88
  receiver = RedGreenBlue.new
78
89
  page.walk(receiver)
90
+ ```
79
91
 
80
92
  For low level access to the objects in a PDF file, use the ObjectHash class like
81
93
  so:
82
94
 
95
+ ```ruby
83
96
  reader = PDF::Reader.new("somefile.pdf")
84
97
  puts reader.objects.inspect
98
+ ```
85
99
 
86
- = Text Encoding
100
+ # Text Encoding
87
101
 
88
102
  Regardless of the internal encoding used in the PDF all text will be converted
89
103
  to UTF-8 before it is passed back from PDF::Reader.
90
104
 
91
- Strings that contain binary data (like font blobs) will be marked as such on
92
- M17N aware VMs.
105
+ Strings that contain binary data (like font blobs) will be marked as such.
93
106
 
94
- = Former API
107
+ # Former API
95
108
 
96
109
  Version 1.0.0 of PDF::Reader introduced a new page-based API that provides
97
110
  efficient and easy access to any page.
98
111
 
99
- The previous API is marked as deprecated but will continue to work for the
100
- time being. Eventually calls to the old API will begin triggering deprecation
101
- warnings before it is completely removed in version 2.0.0.
112
+ The pre-1.0 API was deprecated during the 1.x release series, and has been
113
+ removed from 2.0.0.
102
114
 
103
- = Exceptions
115
+ # Exceptions
104
116
 
105
117
  There are two key exceptions that you will need to watch out for when processing a
106
118
  PDF file:
@@ -120,7 +132,7 @@ don't, 'rescue MalformedPDFError' will catch all the subclassed errors as well.
120
132
  Any other exceptions should be considered bugs in either PDF::Reader (please
121
133
  report it!).
122
134
 
123
- = PDF Integrity
135
+ # PDF Integrity
124
136
 
125
137
  Windows developers may run into problems when running specs due to MalformedPDFError's
126
138
  This is usually because CRLF characters are automatically added to some of the PDF's in
@@ -128,18 +140,20 @@ the spec folder when you checkout a branch from Git.
128
140
 
129
141
  To remove any invalid CRLF characters added while checking out a branch from Git, run:
130
142
 
143
+ ```ruby
131
144
  rake fix_integrity
145
+ ```
132
146
 
133
- = Maintainers
147
+ # Maintainers
134
148
 
135
- - James Healy <mailto:jimmy@deefa.com>
149
+ * James Healy <mailto:jimmy@deefa.com>
136
150
 
137
- = Licensing
151
+ # Licensing
138
152
 
139
153
  This library is distributed under the terms of the MIT License. See the included file for
140
154
  more detail.
141
155
 
142
- = Mailing List
156
+ # Mailing List
143
157
 
144
158
  Any questions or feedback should be sent to the PDF::Reader google group. It's
145
159
  better that any answers be available for others instead of hiding in someone's
@@ -147,20 +161,23 @@ inbox.
147
161
 
148
162
  http://groups.google.com/group/pdf-reader
149
163
 
150
- = Examples
164
+ # Examples
151
165
 
152
166
  The easiest way to explain how this works in practice is to show some examples.
153
167
  Check out the examples/ directory for a few files.
154
168
 
155
- = Known Limitations
169
+ # Known Limitations
156
170
 
157
171
  Occasionally some text cannot be extracted properly due to the way it has been
158
172
  stored, or the use of invalid bytes. In these cases PDF::Reader will output a
159
173
  little UTF-8 friendly box to indicate an unrecognisable character.
160
174
 
161
- = Resources
175
+ # Resources
162
176
 
163
- - PDF::Reader Code Repository: http://github.com/yob/pdf-reader
164
- - PDF Specification: http://www.adobe.com/devnet/pdf/pdf_reference.html
165
- - PDF Tutorial Slide Presentations: http://home.comcast.net/~jk05/presentations/PDFTutorials.html
166
- - Developing with PDF (book): http://shop.oreilly.com/product/0636920025269.do
177
+ * PDF::Reader Code Repository: http://github.com/yob/pdf-reader
178
+
179
+ * PDF Specification: http://www.adobe.com/devnet/pdf/pdf_reference.html
180
+
181
+ * PDF Tutorial Slide Presentations: https://web.archive.org/web/20150110042057/http://home.comcast.net/~jk05/presentations/PDFTutorials.html
182
+
183
+ * Developing with PDF (book): http://shop.oreilly.com/product/0636920025269.do
data/Rakefile CHANGED
@@ -14,7 +14,7 @@ desc "Run cane to check quality metrics"
14
14
  Cane::RakeTask.new(:quality) do |cane|
15
15
  cane.abc_max = 20
16
16
  cane.style_measure = 100
17
- cane.max_violations = 93
17
+ cane.max_violations = 31
18
18
 
19
19
  cane.use Morecane::EncodingCheck, :encoding_glob => "{app,lib,spec}/**/*.rb"
20
20
  end
@@ -41,7 +41,7 @@ end
41
41
  desc "Create a YAML file of integrity info for PDFs in the spec suite"
42
42
  task :integrity_yaml do
43
43
  data = {}
44
- Dir.glob("spec/data/**/*.*").each do |path|
44
+ Dir.glob("spec/data/**/*.*").sort.each do |path|
45
45
  path_without_spec = path.gsub("spec/","")
46
46
  data[path_without_spec] = {
47
47
  :bytes => File.size(path),
data/bin/pdf_callbacks CHANGED
@@ -9,7 +9,7 @@ require 'pdf/reader'
9
9
  receiver = PDF::Reader::PrintReceiver.new
10
10
 
11
11
  if ARGV.empty?
12
- browser = PDF::Reader.new($stdin)
12
+ browser = PDF::Reader.new(StringIO.new(ARGF.read))
13
13
  else
14
14
  browser = PDF::Reader.new(ARGV[0])
15
15
  end
data/bin/pdf_object CHANGED
@@ -25,7 +25,10 @@ gen = gen.to_i
25
25
 
26
26
  # make magic happen
27
27
  begin
28
- obj = PDF::Reader.object_file(filename, id, gen)
28
+ obj = nil
29
+ PDF::Reader.open(filename) do |pdf|
30
+ obj = pdf.objects[PDF::Reader::Reference.new(id, gen)]
31
+ end
29
32
 
30
33
  case obj
31
34
  when Hash, Array
data/bin/pdf_text CHANGED
@@ -4,7 +4,7 @@ require 'rubygems'
4
4
  require 'pdf/reader'
5
5
 
6
6
  if ARGV.empty?
7
- browser = PDF::Reader.new($stdin)
7
+ browser = PDF::Reader.new(StringIO.new(ARGF.read))
8
8
  else
9
9
  browser = PDF::Reader.new(ARGV[0])
10
10
  end