pdf-reader 1.4.1 → 2.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/CHANGELOG +53 -3
- data/{README.rdoc → README.md} +40 -23
- data/Rakefile +2 -2
- data/bin/pdf_callbacks +1 -1
- data/bin/pdf_object +4 -1
- data/bin/pdf_text +1 -1
- data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
- data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier.afm +342 -342
- data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
- data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
- data/lib/pdf/reader/afm/MustRead.html +19 -0
- data/lib/pdf/reader/afm/Symbol.afm +213 -213
- data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
- data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
- data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
- data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
- data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
- data/lib/pdf/reader/buffer.rb +14 -12
- data/lib/pdf/reader/cid_widths.rb +2 -0
- data/lib/pdf/reader/cmap.rb +48 -36
- data/lib/pdf/reader/encoding.rb +16 -18
- data/lib/pdf/reader/error.rb +5 -0
- data/lib/pdf/reader/filter/ascii85.rb +1 -0
- data/lib/pdf/reader/filter/ascii_hex.rb +2 -0
- data/lib/pdf/reader/filter/depredict.rb +1 -0
- data/lib/pdf/reader/filter/flate.rb +29 -16
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +2 -0
- data/lib/pdf/reader/filter/run_length.rb +4 -6
- data/lib/pdf/reader/filter.rb +2 -0
- data/lib/pdf/reader/font.rb +12 -13
- data/lib/pdf/reader/font_descriptor.rb +1 -0
- data/lib/pdf/reader/form_xobject.rb +1 -0
- data/lib/pdf/reader/glyph_hash.rb +7 -2
- data/lib/pdf/reader/lzw.rb +4 -4
- data/lib/pdf/reader/null_security_handler.rb +17 -0
- data/lib/pdf/reader/object_cache.rb +1 -0
- data/lib/pdf/reader/object_hash.rb +91 -37
- data/lib/pdf/reader/object_stream.rb +1 -0
- data/lib/pdf/reader/orientation_detector.rb +5 -4
- data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
- data/lib/pdf/reader/page.rb +30 -1
- data/lib/pdf/reader/page_layout.rb +19 -24
- data/lib/pdf/reader/page_state.rb +8 -5
- data/lib/pdf/reader/page_text_receiver.rb +23 -1
- data/lib/pdf/reader/pages_strategy.rb +2 -304
- data/lib/pdf/reader/parser.rb +10 -7
- data/lib/pdf/reader/print_receiver.rb +1 -0
- data/lib/pdf/reader/reference.rb +1 -0
- data/lib/pdf/reader/register_receiver.rb +1 -0
- data/lib/pdf/reader/resource_methods.rb +1 -0
- data/lib/pdf/reader/standard_security_handler.rb +80 -42
- data/lib/pdf/reader/standard_security_handler_v5.rb +91 -0
- data/lib/pdf/reader/stream.rb +1 -0
- data/lib/pdf/reader/synchronized_cache.rb +1 -0
- data/lib/pdf/reader/text_run.rb +28 -9
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +1 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +17 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +25 -16
- data/lib/pdf/reader/width_calculator/composite.rb +1 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +2 -2
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
- data/lib/pdf/reader/width_calculator.rb +1 -0
- data/lib/pdf/reader/xref.rb +11 -5
- data/lib/pdf/reader.rb +30 -119
- data/lib/pdf-reader.rb +1 -0
- metadata +35 -61
- data/bin/pdf_list_callbacks +0 -17
- data/lib/pdf/hash.rb +0 -19
- data/lib/pdf/reader/abstract_strategy.rb +0 -81
- data/lib/pdf/reader/metadata_strategy.rb +0 -56
- data/lib/pdf/reader/text_receiver.rb +0 -265
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 652d05cf6a22fad5ecb4b92de1e27ba60cafc6525c5ca524e24c7f9796fe1b83
|
4
|
+
data.tar.gz: 2c7448e97890a9fcbd10ec2cd5bafb9025db2fb75dabaf71a4074c542b1065a1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ac82452924cf46af98ee15f2a20642b1d06d5b9c22104fe171b5b4612665e482f341e12473805016ccb9d921fc15324ba51675170b369adeace8b278cd1279fb
|
7
|
+
data.tar.gz: b1dc1c4422b0e6bf01092cf724630ba7424fdef1fdaf34f33aaa3a31397caf6ef5a73185a98e6e2828a9e082d87cbca311565397cb064cac20d86e72be27626f
|
data/CHANGELOG
CHANGED
@@ -1,5 +1,55 @@
|
|
1
|
+
v2.5.0 (6th June 2021)
|
2
|
+
- bump minimum ruby version to 2.0
|
3
|
+
- Correctly handle transcoding to UTF-8 from some fonts that use a difference table [#344](https://github.com/yob/pdf-reader/pull/344/)
|
4
|
+
- Fix some character spacing issues with the TJ operator [#343](https://github.com/yob/pdf-reader/pull/343)
|
5
|
+
- Fix crash with some encrypted PDFs [#348](https://github.com/yob/pdf-reader/pull/348/)
|
6
|
+
- Fix positions of text on some PDFs with pages rotated 90° [#350](https://github.com/yob/pdf-reader/pull/350/)
|
7
|
+
|
8
|
+
v2.4.2 (28th January 2021)
|
9
|
+
- relax ASCII85 dependency to allow 1.x
|
10
|
+
- improved support for decompressing objects with slightly malformed zlib data
|
11
|
+
|
12
|
+
v2.4.1 (24th September 2020)
|
13
|
+
- Re-vendor font metrics from Adobe to clarify their license
|
14
|
+
|
15
|
+
v2.4.0 (21st November 2019)
|
16
|
+
- Optimise overlapping characters code introduced in 2.3.0. Text extraction of pages with
|
17
|
+
thousands of characters is still slower than it was in 2.2.1, but it might be tolerable
|
18
|
+
for now. See https://github.com/yob/pdf-reader/pull/308 for details.
|
19
|
+
- Implement very basic font substitution for Type1 and TrueType fonts that aren't embedded
|
20
|
+
- Remove PDF::Hash class. It's been deprecated since 2010, and it's hard to believe anyone
|
21
|
+
is still using it.
|
22
|
+
- Several small bug fixes
|
23
|
+
|
24
|
+
v2.3.0 (7th November 2019)
|
25
|
+
- Text extraction now makes an effort to skip duplicate characters that overlap, a
|
26
|
+
common approach used for a fake "bold" effect. This will make text extraction a bit
|
27
|
+
slower - if that turns out to be an issue I'll look into further optimisations or
|
28
|
+
provide a toggle to turn it off
|
29
|
+
- Several small bug fixes
|
30
|
+
|
31
|
+
v2.2.1 (27th July 2019)
|
32
|
+
- Improve utf8 text extraction from CMaps that contain surrogate pair ligatures
|
33
|
+
|
34
|
+
v2.2.0 (18th December 2018)
|
35
|
+
- Support additional XRef Stream variants (thanks Stefan Wienert)
|
36
|
+
- Add frozen_strings pragma to reduce object allocations on ruby 2.3+
|
37
|
+
- various bug fixes
|
38
|
+
|
39
|
+
v2.1.0 (15th February 2018)
|
40
|
+
- Support extra encrypted PDF variants (thanks to Gyuchang Jun)
|
41
|
+
- various bug fixes
|
42
|
+
|
43
|
+
v2.0.0 (25th February 2017)
|
44
|
+
- various bug fixes
|
45
|
+
|
46
|
+
v2.0.0.beta1 (15th February 2017)
|
47
|
+
- BREAKING CHANGE: remove all methods that were deprecated in 1.0.0
|
48
|
+
- Bug: Support extra encrypted PDF variants (thanks to Gyuchang Jun)
|
49
|
+
- various bug fixes
|
50
|
+
|
1
51
|
v1.4.1 (2nd January 2017)
|
2
|
-
- improve
|
52
|
+
- improve compatibility with ruby 2.4 (thanks Akira Matsuda)
|
3
53
|
- various bug fixes
|
4
54
|
|
5
55
|
v1.4.0 (22nd February 2016)
|
@@ -91,10 +141,10 @@ v0.9.2 (24th April 2011)
|
|
91
141
|
|
92
142
|
v0.9.1 (21st December 2010)
|
93
143
|
- force gem to only install on ruby 1.8.7 or higher
|
94
|
-
- maintaining
|
144
|
+
- maintaining support for earlier versions takes more time than I have
|
95
145
|
available at the moment
|
96
146
|
- bug: fix parsing of obscure pdf name format
|
97
|
-
- bug: fix behaviour when loaded in
|
147
|
+
- bug: fix behaviour when loaded in conjunction with htmldoc gem
|
98
148
|
|
99
149
|
v0.9.0 (19th November 2010)
|
100
150
|
- support for pdf 1.5+ files that use object and xref streams
|
data/{README.rdoc → README.md}
RENAMED
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
# pdf-reader
|
2
2
|
|
3
3
|
The PDF::Reader library implements a PDF parser conforming as much as possible
|
4
4
|
to the PDF specification from Adobe.
|
@@ -15,46 +15,55 @@ higher level functionality - it's not going to render a PDF for you. There are
|
|
15
15
|
a few exceptions to support very common use cases like extracting text from a
|
16
16
|
page.
|
17
17
|
|
18
|
-
|
18
|
+
# Installation
|
19
19
|
|
20
20
|
The recommended installation method is via Rubygems.
|
21
21
|
|
22
|
+
```ruby
|
22
23
|
gem install pdf-reader
|
24
|
+
```
|
23
25
|
|
24
|
-
|
26
|
+
# Usage
|
25
27
|
|
26
28
|
Begin by creating a PDF::Reader instance that points to a PDF file. Document
|
27
29
|
level information (metadata, page count, bookmarks, etc) is available via
|
28
30
|
this object.
|
29
31
|
|
32
|
+
```ruby
|
30
33
|
reader = PDF::Reader.new("somefile.pdf")
|
31
34
|
|
32
35
|
puts reader.pdf_version
|
33
36
|
puts reader.info
|
34
37
|
puts reader.metadata
|
35
38
|
puts reader.page_count
|
39
|
+
```
|
36
40
|
|
37
41
|
PDF::Reader.new accepts an IO stream or a filename. Here's an example with
|
38
42
|
an IO stream:
|
39
43
|
|
44
|
+
```ruby
|
40
45
|
require 'open-uri'
|
41
46
|
|
42
47
|
io = open('http://example.com/somefile.pdf')
|
43
48
|
reader = PDF::Reader.new(io)
|
44
49
|
puts reader.info
|
50
|
+
```
|
45
51
|
|
46
52
|
If you open a PDF with File#open or IO#open, I strongly recommend using "rb"
|
47
53
|
mode to ensure the file isn't mangled by ruby being 'helpful'. This is
|
48
54
|
particularly important on windows and MRI >= 1.9.2.
|
49
55
|
|
56
|
+
```ruby
|
50
57
|
File.open("somefile.pdf", "rb") do |io|
|
51
58
|
reader = PDF::Reader.new(io)
|
52
59
|
puts reader.info
|
53
60
|
end
|
61
|
+
```
|
54
62
|
|
55
63
|
PDF is a page based file format, so most visible information is available via
|
56
64
|
page-based iteration
|
57
65
|
|
66
|
+
```ruby
|
58
67
|
reader = PDF::Reader.new("somefile.pdf")
|
59
68
|
|
60
69
|
reader.pages.each do |page|
|
@@ -62,10 +71,12 @@ page-based iteration
|
|
62
71
|
puts page.text
|
63
72
|
puts page.raw_content
|
64
73
|
end
|
74
|
+
```
|
65
75
|
|
66
76
|
If you need to access the full program for rendering a page, use the walk() method
|
67
77
|
of PDF::Reader::Page.
|
68
78
|
|
79
|
+
```ruby
|
69
80
|
class RedGreenBlue
|
70
81
|
def set_rgb_color_for_nonstroking(r, g, b)
|
71
82
|
puts "R: #{r}, G: #{g}, B: #{b}"
|
@@ -76,31 +87,32 @@ of PDF::Reader::Page.
|
|
76
87
|
page = reader.page(1)
|
77
88
|
receiver = RedGreenBlue.new
|
78
89
|
page.walk(receiver)
|
90
|
+
```
|
79
91
|
|
80
92
|
For low level access to the objects in a PDF file, use the ObjectHash class like
|
81
93
|
so:
|
82
94
|
|
95
|
+
```ruby
|
83
96
|
reader = PDF::Reader.new("somefile.pdf")
|
84
97
|
puts reader.objects.inspect
|
98
|
+
```
|
85
99
|
|
86
|
-
|
100
|
+
# Text Encoding
|
87
101
|
|
88
102
|
Regardless of the internal encoding used in the PDF all text will be converted
|
89
103
|
to UTF-8 before it is passed back from PDF::Reader.
|
90
104
|
|
91
|
-
Strings that contain binary data (like font blobs) will be marked as such
|
92
|
-
M17N aware VMs.
|
105
|
+
Strings that contain binary data (like font blobs) will be marked as such.
|
93
106
|
|
94
|
-
|
107
|
+
# Former API
|
95
108
|
|
96
109
|
Version 1.0.0 of PDF::Reader introduced a new page-based API that provides
|
97
110
|
efficient and easy access to any page.
|
98
111
|
|
99
|
-
The
|
100
|
-
|
101
|
-
warnings before it is completely removed in version 2.0.0.
|
112
|
+
The pre-1.0 API was deprecated during the 1.x release series, and has been
|
113
|
+
removed from 2.0.0.
|
102
114
|
|
103
|
-
|
115
|
+
# Exceptions
|
104
116
|
|
105
117
|
There are two key exceptions that you will need to watch out for when processing a
|
106
118
|
PDF file:
|
@@ -120,7 +132,7 @@ don't, 'rescue MalformedPDFError' will catch all the subclassed errors as well.
|
|
120
132
|
Any other exceptions should be considered bugs in PDF::Reader (please
|
121
133
|
report it!).
|
122
134
|
|
123
|
-
|
135
|
+
# PDF Integrity
|
124
136
|
|
125
137
|
Windows developers may run into problems when running specs due to MalformedPDFErrors.
|
126
138
|
This is usually because CRLF characters are automatically added to some of the PDF's in
|
@@ -128,18 +140,20 @@ the spec folder when you checkout a branch from Git.
|
|
128
140
|
|
129
141
|
To remove any invalid CRLF characters added while checking out a branch from Git, run:
|
130
142
|
|
143
|
+
```ruby
|
131
144
|
rake fix_integrity
|
145
|
+
```
|
132
146
|
|
133
|
-
|
147
|
+
# Maintainers
|
134
148
|
|
135
|
-
|
149
|
+
* James Healy <mailto:jimmy@deefa.com>
|
136
150
|
|
137
|
-
|
151
|
+
# Licensing
|
138
152
|
|
139
153
|
This library is distributed under the terms of the MIT License. See the included file for
|
140
154
|
more detail.
|
141
155
|
|
142
|
-
|
156
|
+
# Mailing List
|
143
157
|
|
144
158
|
Any questions or feedback should be sent to the PDF::Reader google group. It's
|
145
159
|
better that any answers be available for others instead of hiding in someone's
|
@@ -147,20 +161,23 @@ inbox.
|
|
147
161
|
|
148
162
|
http://groups.google.com/group/pdf-reader
|
149
163
|
|
150
|
-
|
164
|
+
# Examples
|
151
165
|
|
152
166
|
The easiest way to explain how this works in practice is to show some examples.
|
153
167
|
Check out the examples/ directory for a few files.
|
154
168
|
|
155
|
-
|
169
|
+
# Known Limitations
|
156
170
|
|
157
171
|
Occasionally some text cannot be extracted properly due to the way it has been
|
158
172
|
stored, or the use of invalid bytes. In these cases PDF::Reader will output a
|
159
173
|
little UTF-8 friendly box to indicate an unrecognisable character.
|
160
174
|
|
161
|
-
|
175
|
+
# Resources
|
162
176
|
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
177
|
+
* PDF::Reader Code Repository: http://github.com/yob/pdf-reader
|
178
|
+
|
179
|
+
* PDF Specification: http://www.adobe.com/devnet/pdf/pdf_reference.html
|
180
|
+
|
181
|
+
* PDF Tutorial Slide Presentations: https://web.archive.org/web/20150110042057/http://home.comcast.net/~jk05/presentations/PDFTutorials.html
|
182
|
+
|
183
|
+
* Developing with PDF (book): http://shop.oreilly.com/product/0636920025269.do
|
data/Rakefile
CHANGED
@@ -14,7 +14,7 @@ desc "Run cane to check quality metrics"
|
|
14
14
|
Cane::RakeTask.new(:quality) do |cane|
|
15
15
|
cane.abc_max = 20
|
16
16
|
cane.style_measure = 100
|
17
|
-
cane.max_violations =
|
17
|
+
cane.max_violations = 31
|
18
18
|
|
19
19
|
cane.use Morecane::EncodingCheck, :encoding_glob => "{app,lib,spec}/**/*.rb"
|
20
20
|
end
|
@@ -41,7 +41,7 @@ end
|
|
41
41
|
desc "Create a YAML file of integrity info for PDFs in the spec suite"
|
42
42
|
task :integrity_yaml do
|
43
43
|
data = {}
|
44
|
-
Dir.glob("spec/data/**/*.*").each do |path|
|
44
|
+
Dir.glob("spec/data/**/*.*").sort.each do |path|
|
45
45
|
path_without_spec = path.gsub("spec/","")
|
46
46
|
data[path_without_spec] = {
|
47
47
|
:bytes => File.size(path),
|
data/bin/pdf_callbacks
CHANGED
data/bin/pdf_object
CHANGED
@@ -25,7 +25,10 @@ gen = gen.to_i
|
|
25
25
|
|
26
26
|
# make magic happen
|
27
27
|
begin
|
28
|
-
obj =
|
28
|
+
obj = nil
|
29
|
+
PDF::Reader.open(filename) do |pdf|
|
30
|
+
obj = pdf.objects[PDF::Reader::Reference.new(id, gen)]
|
31
|
+
end
|
29
32
|
|
30
33
|
case obj
|
31
34
|
when Hash, Array
|