pdf-reader 1.4.1 → 2.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/CHANGELOG +53 -3
- data/{README.rdoc → README.md} +40 -23
- data/Rakefile +2 -2
- data/bin/pdf_callbacks +1 -1
- data/bin/pdf_object +4 -1
- data/bin/pdf_text +1 -1
- data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
- data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier.afm +342 -342
- data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
- data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
- data/lib/pdf/reader/afm/MustRead.html +19 -0
- data/lib/pdf/reader/afm/Symbol.afm +213 -213
- data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
- data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
- data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
- data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
- data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
- data/lib/pdf/reader/buffer.rb +14 -12
- data/lib/pdf/reader/cid_widths.rb +2 -0
- data/lib/pdf/reader/cmap.rb +48 -36
- data/lib/pdf/reader/encoding.rb +16 -18
- data/lib/pdf/reader/error.rb +5 -0
- data/lib/pdf/reader/filter/ascii85.rb +1 -0
- data/lib/pdf/reader/filter/ascii_hex.rb +2 -0
- data/lib/pdf/reader/filter/depredict.rb +1 -0
- data/lib/pdf/reader/filter/flate.rb +29 -16
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +2 -0
- data/lib/pdf/reader/filter/run_length.rb +4 -6
- data/lib/pdf/reader/filter.rb +2 -0
- data/lib/pdf/reader/font.rb +12 -13
- data/lib/pdf/reader/font_descriptor.rb +1 -0
- data/lib/pdf/reader/form_xobject.rb +1 -0
- data/lib/pdf/reader/glyph_hash.rb +7 -2
- data/lib/pdf/reader/lzw.rb +4 -4
- data/lib/pdf/reader/null_security_handler.rb +17 -0
- data/lib/pdf/reader/object_cache.rb +1 -0
- data/lib/pdf/reader/object_hash.rb +91 -37
- data/lib/pdf/reader/object_stream.rb +1 -0
- data/lib/pdf/reader/orientation_detector.rb +5 -4
- data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
- data/lib/pdf/reader/page.rb +30 -1
- data/lib/pdf/reader/page_layout.rb +19 -24
- data/lib/pdf/reader/page_state.rb +8 -5
- data/lib/pdf/reader/page_text_receiver.rb +23 -1
- data/lib/pdf/reader/pages_strategy.rb +2 -304
- data/lib/pdf/reader/parser.rb +10 -7
- data/lib/pdf/reader/print_receiver.rb +1 -0
- data/lib/pdf/reader/reference.rb +1 -0
- data/lib/pdf/reader/register_receiver.rb +1 -0
- data/lib/pdf/reader/resource_methods.rb +1 -0
- data/lib/pdf/reader/standard_security_handler.rb +80 -42
- data/lib/pdf/reader/standard_security_handler_v5.rb +91 -0
- data/lib/pdf/reader/stream.rb +1 -0
- data/lib/pdf/reader/synchronized_cache.rb +1 -0
- data/lib/pdf/reader/text_run.rb +28 -9
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +1 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +17 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +25 -16
- data/lib/pdf/reader/width_calculator/composite.rb +1 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +2 -2
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
- data/lib/pdf/reader/width_calculator.rb +1 -0
- data/lib/pdf/reader/xref.rb +11 -5
- data/lib/pdf/reader.rb +30 -119
- data/lib/pdf-reader.rb +1 -0
- metadata +35 -61
- data/bin/pdf_list_callbacks +0 -17
- data/lib/pdf/hash.rb +0 -19
- data/lib/pdf/reader/abstract_strategy.rb +0 -81
- data/lib/pdf/reader/metadata_strategy.rb +0 -56
- data/lib/pdf/reader/text_receiver.rb +0 -265
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 652d05cf6a22fad5ecb4b92de1e27ba60cafc6525c5ca524e24c7f9796fe1b83
|
4
|
+
data.tar.gz: 2c7448e97890a9fcbd10ec2cd5bafb9025db2fb75dabaf71a4074c542b1065a1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ac82452924cf46af98ee15f2a20642b1d06d5b9c22104fe171b5b4612665e482f341e12473805016ccb9d921fc15324ba51675170b369adeace8b278cd1279fb
|
7
|
+
data.tar.gz: b1dc1c4422b0e6bf01092cf724630ba7424fdef1fdaf34f33aaa3a31397caf6ef5a73185a98e6e2828a9e082d87cbca311565397cb064cac20d86e72be27626f
|
data/CHANGELOG
CHANGED
@@ -1,5 +1,55 @@
|
|
1
|
+
v2.5.0 (6th June 2021)
|
2
|
+
- bump minimum ruby version to 2.0
|
3
|
+
- Correctly handle transcoding to UTF-8 from some fonts that use a difference table [#344](https://github.com/yob/pdf-reader/pull/344/)
|
4
|
+
- Fix some character spacing issues with the TJ operator [#343](https://github.com/yob/pdf-reader/pull/343)
|
5
|
+
- Fix crash with some encrypted PDFs [#348](https://github.com/yob/pdf-reader/pull/348/)
|
6
|
+
- Fix positions of text on some PDFs with pages rotated 90° [#350](https://github.com/yob/pdf-reader/pull/350/)
|
7
|
+
|
8
|
+
v2.4.2 (28th January 2021)
|
9
|
+
- relax ASCII85 dependency to allow 1.x
|
10
|
+
- improved support for decompressing objects with slightly malformed zlib data
|
11
|
+
|
12
|
+
v2.4.1 (24th September 2020)
|
13
|
+
- Re-vendor font metrics from Adobe to clarify their license
|
14
|
+
|
15
|
+
v2.4.0 (21st November 2019)
|
16
|
+
- Optimise overlapping characters code introduced in 2.3.0. Text extraction of pages with
|
17
|
+
thousands of characters is still slower than it was in 2.2.1, but it might be tolerable
|
18
|
+
for now. See https://github.com/yob/pdf-reader/pull/308 for details.
|
19
|
+
- Implement very basic font substitution for Type1 and TrueType fonts that aren't embedded
|
20
|
+
- Remove PDF::Hash class. It's been deprecated since 2010, and it's hard to believe anyone
|
21
|
+
is still using it.
|
22
|
+
- Several small bug fixes
|
23
|
+
|
24
|
+
v2.3.0 (7th November 2019)
|
25
|
+
- Text extraction now makes an effort to skip duplicate characters that overlap, a
|
26
|
+
common approach used for a fake "bold" effect. This will make text extraction a bit
|
27
|
+
slower - if that turns out to be an issue I'll look into further optimisations or
|
28
|
+
provide a toggle to turn it off
|
29
|
+
- Several small bug fixes
|
30
|
+
|
31
|
+
v2.2.1 (27th July 2019)
|
32
|
+
- Improve utf8 text extraction from CMaps that contain surrogate pair ligatures
|
33
|
+
|
34
|
+
v2.2.0 (18th December 2018)
|
35
|
+
- Support additional XRef Stream variants (thanks Stefan Wienert)
|
36
|
+
- Add frozen_strings pragma to reduce object allocations on ruby 2.3+
|
37
|
+
- various bug fixes
|
38
|
+
|
39
|
+
v2.1.0 (15th February 2018)
|
40
|
+
- Support extra encrypted PDF variants (thanks to Gyuchang Jun)
|
41
|
+
- various bug fixes
|
42
|
+
|
43
|
+
v2.0.0 (25th February 2017)
|
44
|
+
- various bug fixes
|
45
|
+
|
46
|
+
v2.0.0.beta1 (15th February 2017)
|
47
|
+
- BREAKING CHANGE: remove all methods that were deprecated in 1.0.0
|
48
|
+
- Bug: Support extra encrypted PDF variants (thanks to Gyuchang Jun)
|
49
|
+
- various bug fixes
|
50
|
+
|
1
51
|
v1.4.1 (2nd January 2017)
|
2
|
-
- improve
|
52
|
+
- improve compatibility with ruby 2.4 (thanks Akira Matsuda)
|
3
53
|
- various bug fixes
|
4
54
|
|
5
55
|
v1.4.0 (22nd February 2016)
|
@@ -91,10 +141,10 @@ v0.9.2 (24th April 2011)
|
|
91
141
|
|
92
142
|
v0.9.1 (21st December 2010)
|
93
143
|
- force gem to only install on ruby 1.8.7 or higher
|
94
|
-
- maintaining
|
144
|
+
- maintaining support for earlier versions takes more time than I have
|
95
145
|
available at the moment
|
96
146
|
- bug: fix parsing of obscure pdf name format
|
97
|
-
- bug: fix behaviour when loaded in
|
147
|
+
- bug: fix behaviour when loaded in conjunction with htmldoc gem
|
98
148
|
|
99
149
|
v0.9.0 (19th November 2010)
|
100
150
|
- support for pdf 1.5+ files that use object and xref streams
|
data/{README.rdoc → README.md}
RENAMED
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
# pdf-reader
|
2
2
|
|
3
3
|
The PDF::Reader library implements a PDF parser conforming as much as possible
|
4
4
|
to the PDF specification from Adobe.
|
@@ -15,46 +15,55 @@ higher level functionality - it's not going to render a PDF for you. There are
|
|
15
15
|
a few exceptions to support very common use cases like extracting text from a
|
16
16
|
page.
|
17
17
|
|
18
|
-
|
18
|
+
# Installation
|
19
19
|
|
20
20
|
The recommended installation method is via Rubygems.
|
21
21
|
|
22
|
+
```ruby
|
22
23
|
gem install pdf-reader
|
24
|
+
```
|
23
25
|
|
24
|
-
|
26
|
+
# Usage
|
25
27
|
|
26
28
|
Begin by creating a PDF::Reader instance that points to a PDF file. Document
|
27
29
|
level information (metadata, page count, bookmarks, etc) is available via
|
28
30
|
this object.
|
29
31
|
|
32
|
+
```ruby
|
30
33
|
reader = PDF::Reader.new("somefile.pdf")
|
31
34
|
|
32
35
|
puts reader.pdf_version
|
33
36
|
puts reader.info
|
34
37
|
puts reader.metadata
|
35
38
|
puts reader.page_count
|
39
|
+
```
|
36
40
|
|
37
41
|
PDF::Reader.new accepts an IO stream or a filename. Here's an example with
|
38
42
|
an IO stream:
|
39
43
|
|
44
|
+
```ruby
|
40
45
|
require 'open-uri'
|
41
46
|
|
42
47
|
io = open('http://example.com/somefile.pdf')
|
43
48
|
reader = PDF::Reader.new(io)
|
44
49
|
puts reader.info
|
50
|
+
```
|
45
51
|
|
46
52
|
If you open a PDF with File#open or IO#open, I strongly recommend using "rb"
|
47
53
|
mode to ensure the file isn't mangled by ruby being 'helpful'. This is
|
48
54
|
particularly important on windows and MRI >= 1.9.2.
|
49
55
|
|
56
|
+
```ruby
|
50
57
|
File.open("somefile.pdf", "rb") do |io|
|
51
58
|
reader = PDF::Reader.new(io)
|
52
59
|
puts reader.info
|
53
60
|
end
|
61
|
+
```
|
54
62
|
|
55
63
|
PDF is a page based file format, so most visible information is available via
|
56
64
|
page-based iteration
|
57
65
|
|
66
|
+
```ruby
|
58
67
|
reader = PDF::Reader.new("somefile.pdf")
|
59
68
|
|
60
69
|
reader.pages.each do |page|
|
@@ -62,10 +71,12 @@ page-based iteration
|
|
62
71
|
puts page.text
|
63
72
|
puts page.raw_content
|
64
73
|
end
|
74
|
+
```
|
65
75
|
|
66
76
|
If you need to access the full program for rendering a page, use the walk() method
|
67
77
|
of PDF::Reader::Page.
|
68
78
|
|
79
|
+
```ruby
|
69
80
|
class RedGreenBlue
|
70
81
|
def set_rgb_color_for_nonstroking(r, g, b)
|
71
82
|
puts "R: #{r}, G: #{g}, B: #{b}"
|
@@ -76,31 +87,32 @@ of PDF::Reader::Page.
|
|
76
87
|
page = reader.page(1)
|
77
88
|
receiver = RedGreenBlue.new
|
78
89
|
page.walk(receiver)
|
90
|
+
```
|
79
91
|
|
80
92
|
For low level access to the objects in a PDF file, use the ObjectHash class like
|
81
93
|
so:
|
82
94
|
|
95
|
+
```ruby
|
83
96
|
reader = PDF::Reader.new("somefile.pdf")
|
84
97
|
puts reader.objects.inspect
|
98
|
+
```
|
85
99
|
|
86
|
-
|
100
|
+
# Text Encoding
|
87
101
|
|
88
102
|
Regardless of the internal encoding used in the PDF all text will be converted
|
89
103
|
to UTF-8 before it is passed back from PDF::Reader.
|
90
104
|
|
91
|
-
Strings that contain binary data (like font blobs) will be marked as such
|
92
|
-
M17N aware VMs.
|
105
|
+
Strings that contain binary data (like font blobs) will be marked as such.
|
93
106
|
|
94
|
-
|
107
|
+
# Former API
|
95
108
|
|
96
109
|
Version 1.0.0 of PDF::Reader introduced a new page-based API that provides
|
97
110
|
efficient and easy access to any page.
|
98
111
|
|
99
|
-
The
|
100
|
-
|
101
|
-
warnings before it is completely removed in version 2.0.0.
|
112
|
+
The pre-1.0 API was deprecated during the 1.x release series, and has been
|
113
|
+
removed from 2.0.0.
|
102
114
|
|
103
|
-
|
115
|
+
# Exceptions
|
104
116
|
|
105
117
|
There are two key exceptions that you will need to watch out for when processing a
|
106
118
|
PDF file:
|
@@ -120,7 +132,7 @@ don't, 'rescue MalformedPDFError' will catch all the subclassed errors as well.
|
|
120
132
|
Any other exceptions should be considered bugs in PDF::Reader (please
|
121
133
|
report it!).
|
122
134
|
|
123
|
-
|
135
|
+
# PDF Integrity
|
124
136
|
|
125
137
|
Windows developers may run into problems when running specs due to MalformedPDFError's.
|
126
138
|
This is usually because CRLF characters are automatically added to some of the PDF's in
|
@@ -128,18 +140,20 @@ the spec folder when you checkout a branch from Git.
|
|
128
140
|
|
129
141
|
To remove any invalid CRLF characters added while checking out a branch from Git, run:
|
130
142
|
|
143
|
+
```ruby
|
131
144
|
rake fix_integrity
|
145
|
+
```
|
132
146
|
|
133
|
-
|
147
|
+
# Maintainers
|
134
148
|
|
135
|
-
|
149
|
+
* James Healy <mailto:jimmy@deefa.com>
|
136
150
|
|
137
|
-
|
151
|
+
# Licensing
|
138
152
|
|
139
153
|
This library is distributed under the terms of the MIT License. See the included file for
|
140
154
|
more detail.
|
141
155
|
|
142
|
-
|
156
|
+
# Mailing List
|
143
157
|
|
144
158
|
Any questions or feedback should be sent to the PDF::Reader google group. It's
|
145
159
|
better that any answers be available for others instead of hiding in someone's
|
@@ -147,20 +161,23 @@ inbox.
|
|
147
161
|
|
148
162
|
http://groups.google.com/group/pdf-reader
|
149
163
|
|
150
|
-
|
164
|
+
# Examples
|
151
165
|
|
152
166
|
The easiest way to explain how this works in practice is to show some examples.
|
153
167
|
Check out the examples/ directory for a few files.
|
154
168
|
|
155
|
-
|
169
|
+
# Known Limitations
|
156
170
|
|
157
171
|
Occasionally some text cannot be extracted properly due to the way it has been
|
158
172
|
stored, or the use of invalid bytes. In these cases PDF::Reader will output a
|
159
173
|
little UTF-8 friendly box to indicate an unrecognisable character.
|
160
174
|
|
161
|
-
|
175
|
+
# Resources
|
162
176
|
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
177
|
+
* PDF::Reader Code Repository: http://github.com/yob/pdf-reader
|
178
|
+
|
179
|
+
* PDF Specification: http://www.adobe.com/devnet/pdf/pdf_reference.html
|
180
|
+
|
181
|
+
* PDF Tutorial Slide Presentations: https://web.archive.org/web/20150110042057/http://home.comcast.net/~jk05/presentations/PDFTutorials.html
|
182
|
+
|
183
|
+
* Developing with PDF (book): http://shop.oreilly.com/product/0636920025269.do
|
data/Rakefile
CHANGED
@@ -14,7 +14,7 @@ desc "Run cane to check quality metrics"
|
|
14
14
|
Cane::RakeTask.new(:quality) do |cane|
|
15
15
|
cane.abc_max = 20
|
16
16
|
cane.style_measure = 100
|
17
|
-
cane.max_violations =
|
17
|
+
cane.max_violations = 31
|
18
18
|
|
19
19
|
cane.use Morecane::EncodingCheck, :encoding_glob => "{app,lib,spec}/**/*.rb"
|
20
20
|
end
|
@@ -41,7 +41,7 @@ end
|
|
41
41
|
desc "Create a YAML file of integrity info for PDFs in the spec suite"
|
42
42
|
task :integrity_yaml do
|
43
43
|
data = {}
|
44
|
-
Dir.glob("spec/data/**/*.*").each do |path|
|
44
|
+
Dir.glob("spec/data/**/*.*").sort.each do |path|
|
45
45
|
path_without_spec = path.gsub("spec/","")
|
46
46
|
data[path_without_spec] = {
|
47
47
|
:bytes => File.size(path),
|
data/bin/pdf_callbacks
CHANGED
data/bin/pdf_object
CHANGED
@@ -25,7 +25,10 @@ gen = gen.to_i
|
|
25
25
|
|
26
26
|
# make magic happen
|
27
27
|
begin
|
28
|
-
obj =
|
28
|
+
obj = nil
|
29
|
+
PDF::Reader.open(filename) do |pdf|
|
30
|
+
obj = pdf.objects[PDF::Reader::Reference.new(id, gen)]
|
31
|
+
end
|
29
32
|
|
30
33
|
case obj
|
31
34
|
when Hash, Array
|