pdf-reader 1.1.1 → 2.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG +87 -2
- data/{README.rdoc → README.md} +43 -31
- data/Rakefile +21 -16
- data/bin/pdf_callbacks +1 -1
- data/bin/pdf_object +4 -1
- data/bin/pdf_text +1 -3
- data/examples/callbacks.rb +2 -1
- data/examples/extract_images.rb +11 -6
- data/examples/fuzzy_paragraphs.rb +24 -0
- data/lib/pdf/reader/afm/Courier-Bold.afm +342 -0
- data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -0
- data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -0
- data/lib/pdf/reader/afm/Courier.afm +342 -0
- data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -0
- data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -0
- data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -0
- data/lib/pdf/reader/afm/Helvetica.afm +3051 -0
- data/lib/pdf/reader/afm/MustRead.html +19 -0
- data/lib/pdf/reader/afm/Symbol.afm +213 -0
- data/lib/pdf/reader/afm/Times-Bold.afm +2588 -0
- data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -0
- data/lib/pdf/reader/afm/Times-Italic.afm +2667 -0
- data/lib/pdf/reader/afm/Times-Roman.afm +2419 -0
- data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -0
- data/lib/pdf/reader/buffer.rb +90 -63
- data/lib/pdf/reader/cid_widths.rb +63 -0
- data/lib/pdf/reader/cmap.rb +69 -38
- data/lib/pdf/reader/encoding.rb +74 -48
- data/lib/pdf/reader/error.rb +24 -4
- data/lib/pdf/reader/filter/ascii85.rb +28 -0
- data/lib/pdf/reader/filter/ascii_hex.rb +30 -0
- data/lib/pdf/reader/filter/depredict.rb +141 -0
- data/lib/pdf/reader/filter/flate.rb +53 -0
- data/lib/pdf/reader/filter/lzw.rb +21 -0
- data/lib/pdf/reader/filter/null.rb +18 -0
- data/lib/pdf/reader/filter/run_length.rb +45 -0
- data/lib/pdf/reader/filter.rb +15 -234
- data/lib/pdf/reader/font.rb +107 -43
- data/lib/pdf/reader/font_descriptor.rb +80 -0
- data/lib/pdf/reader/form_xobject.rb +26 -4
- data/lib/pdf/reader/glyph_hash.rb +56 -18
- data/lib/pdf/reader/lzw.rb +6 -4
- data/lib/pdf/reader/null_security_handler.rb +17 -0
- data/lib/pdf/reader/object_cache.rb +40 -16
- data/lib/pdf/reader/object_hash.rb +94 -40
- data/lib/pdf/reader/object_stream.rb +1 -0
- data/lib/pdf/reader/orientation_detector.rb +34 -0
- data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
- data/lib/pdf/reader/page.rb +48 -3
- data/lib/pdf/reader/page_layout.rb +125 -0
- data/lib/pdf/reader/page_state.rb +185 -70
- data/lib/pdf/reader/page_text_receiver.rb +70 -20
- data/lib/pdf/reader/pages_strategy.rb +4 -293
- data/lib/pdf/reader/parser.rb +37 -61
- data/lib/pdf/reader/print_receiver.rb +6 -0
- data/lib/pdf/reader/reference.rb +4 -1
- data/lib/pdf/reader/register_receiver.rb +17 -31
- data/lib/pdf/reader/resource_methods.rb +1 -0
- data/lib/pdf/reader/standard_security_handler.rb +82 -42
- data/lib/pdf/reader/standard_security_handler_v5.rb +91 -0
- data/lib/pdf/reader/stream.rb +5 -2
- data/lib/pdf/reader/synchronized_cache.rb +33 -0
- data/lib/pdf/reader/text_run.rb +99 -0
- data/lib/pdf/reader/token.rb +4 -1
- data/lib/pdf/reader/transformation_matrix.rb +195 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +17 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +67 -0
- data/lib/pdf/reader/width_calculator/composite.rb +28 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +56 -0
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +33 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +25 -0
- data/lib/pdf/reader/width_calculator.rb +12 -0
- data/lib/pdf/reader/xref.rb +41 -9
- data/lib/pdf/reader.rb +45 -104
- data/lib/pdf-reader.rb +4 -1
- metadata +220 -101
- data/bin/pdf_list_callbacks +0 -17
- data/lib/pdf/hash.rb +0 -15
- data/lib/pdf/reader/abstract_strategy.rb +0 -81
- data/lib/pdf/reader/metadata_strategy.rb +0 -56
- data/lib/pdf/reader/text_receiver.rb +0 -264
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 652d05cf6a22fad5ecb4b92de1e27ba60cafc6525c5ca524e24c7f9796fe1b83
|
4
|
+
data.tar.gz: 2c7448e97890a9fcbd10ec2cd5bafb9025db2fb75dabaf71a4074c542b1065a1
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: ac82452924cf46af98ee15f2a20642b1d06d5b9c22104fe171b5b4612665e482f341e12473805016ccb9d921fc15324ba51675170b369adeace8b278cd1279fb
|
7
|
+
data.tar.gz: b1dc1c4422b0e6bf01092cf724630ba7424fdef1fdaf34f33aaa3a31397caf6ef5a73185a98e6e2828a9e082d87cbca311565397cb064cac20d86e72be27626f
|
data/CHANGELOG
CHANGED
@@ -1,3 +1,88 @@
|
|
1
|
+
v2.5.0 (6th June 2021)
|
2
|
+
- bump minimum ruby version to 2.0
|
3
|
+
- Correctly handle transcoding to UTF-8 from some fonts that use a difference table [#344](https://github.com/yob/pdf-reader/pull/344/)
|
4
|
+
- Fix some character spacing issues with the TJ operator [#343](https://github.com/yob/pdf-reader/pull/343)
|
5
|
+
- Fix crash with some encrypted PDFs [#348](https://github.com/yob/pdf-reader/pull/348/)
|
6
|
+
- Fix positions of text on some PDFs with pages rotated 90° [#350](https://github.com/yob/pdf-reader/pull/350/)
|
7
|
+
|
8
|
+
v2.4.2 (28th January 2021)
|
9
|
+
- relax ASCII85 dependency to allow 1.x
|
10
|
+
- improved support for decompressing objects with slightly malformed zlib data
|
11
|
+
|
12
|
+
v2.4.1 (24th September 2020)
|
13
|
+
- Re-vendor font metrics from Adobe to clarify their license
|
14
|
+
|
15
|
+
v2.4.0 (21st November 2019)
|
16
|
+
- Optimise overlapping characters code introduced in 2.3.0. Text extraction of pages with
|
17
|
+
thousands of characters is still slower than it was in 2.2.1, but it might be tolerable
|
18
|
+
for now. See https://github.com/yob/pdf-reader/pull/308 for details.
|
19
|
+
- Implement very basic font substitution for Type1 and TrueType fonts that aren't embedded
|
20
|
+
- Remove PDF::Hash class. It's been deprecated since 2010, and it's hard to believe anyone
|
21
|
+
is still using it.
|
22
|
+
- Several small bug fixes
|
23
|
+
|
24
|
+
v2.3.0 (7th November 2019)
|
25
|
+
- Text extraction now makes an effort to skip duplicate characters that overlap, a
|
26
|
+
common approach used for a fake "bold" effect. This will make text extraction a bit
|
27
|
+
slower - if that turns out to be an issue I'll look into further optimisations or
|
28
|
+
provide a toggle to turn it off
|
29
|
+
- Several small bug fixes
|
30
|
+
|
31
|
+
v2.2.1 (27th July 2019)
|
32
|
+
- Improve utf8 text extraction from CMaps that contain surrogate pair ligatures
|
33
|
+
|
34
|
+
v2.2.0 (18th December 2018)
|
35
|
+
- Support additional XRef Stream variants (thanks Stefan Wienert)
|
36
|
+
- Add frozen_strings pragma to reduce object allocations on ruby 2.3+
|
37
|
+
- various bug fixes
|
38
|
+
|
39
|
+
v2.1.0 (15th February 2018)
|
40
|
+
- Support extra encrypted PDF variants (thanks to Gyuchang Jun)
|
41
|
+
- various bug fixes
|
42
|
+
|
43
|
+
v2.0.0 (25th February 2017)
|
44
|
+
- various bug fixes
|
45
|
+
|
46
|
+
v2.0.0.beta1 (15th February 2017)
|
47
|
+
- BREAKING CHANGE: remove all methods that were deprecated in 1.0.0
|
48
|
+
- Bug: Support extra encrypted PDF variants (thanks to Gyuchang Jun)
|
49
|
+
- various bug fixes
|
50
|
+
|
51
|
+
v1.4.1 (2nd January 2017)
|
52
|
+
- improve compatibility with ruby 2.4 (thanks Akira Matsuda)
|
53
|
+
- various bug fixes
|
54
|
+
|
55
|
+
v1.4.0 (22nd February 2016)
|
56
|
+
- raise minimum ruby version to 1.9.3
|
57
|
+
- print warnings to stderr when deprecated methods are used. These methods have been
|
58
|
+
deprecated for 4 years, so hopefully few people are depending on them
|
59
|
+
- Fix exception when a non-breaking space (character 160) is used with a
|
60
|
+
built-in font (helvetica, etc)
|
61
|
+
- various bug fixes
|
62
|
+
|
63
|
+
v1.3.3 (7th April 2013)
|
64
|
+
- various bug fixes
|
65
|
+
|
66
|
+
v1.3.2 (26th February 2013)
|
67
|
+
- various bug fixes
|
68
|
+
|
69
|
+
v1.3.1 (12th February 2013)
|
70
|
+
- various bug fixes
|
71
|
+
|
72
|
+
v1.3.0 (30th December 2012)
|
73
|
+
- Numerous performance optimisations (thanks Alex Dowad)
|
74
|
+
- Improved text extraction (thanks Nathaniel Madura)
|
75
|
+
- Load less of the hashery gem to reduce core monkey patches
|
76
|
+
- various bug fixes
|
77
|
+
|
78
|
+
v1.2.0 (28th August 2012)
|
79
|
+
- Feature: correctly extract text using surrogate pairs and ligatures
|
80
|
+
(thanks Nathaniel Madura)
|
81
|
+
- Speed optimisation: cache tokenised Form XObjects to avoid re-parsing them
|
82
|
+
- Feature: support opening documents with some junk bytes prepended to file
|
83
|
+
(thanks Paul Gallagher)
|
84
|
+
- Acrobat does this, so it seemed reasonable to add support
|
85
|
+
|
1
86
|
v1.1.1 (9th May 2012)
|
2
87
|
- bugfix release to improve parsing of some PDFs
|
3
88
|
|
@@ -56,10 +141,10 @@ v0.9.2 (24th April 2011)
|
|
56
141
|
|
57
142
|
v0.9.1 (21st December 2010)
|
58
143
|
- force gem to only install on ruby 1.8.7 or higher
|
59
|
-
- maintaining
|
144
|
+
- maintaining support for earlier versions takes more time than I have
|
60
145
|
available at the moment
|
61
146
|
- bug: fix parsing of obscure pdf name format
|
62
|
-
- bug: fix behaviour when loaded in
|
147
|
+
- bug: fix behaviour when loaded in conjunction with htmldoc gem
|
63
148
|
|
64
149
|
v0.9.0 (19th November 2010)
|
65
150
|
- support for pdf 1.5+ files that use object and xref streams
|
data/{README.rdoc → README.md}
RENAMED
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
# pdf-reader
|
2
2
|
|
3
3
|
The PDF::Reader library implements a PDF parser conforming as much as possible
|
4
4
|
to the PDF specification from Adobe.
|
@@ -15,46 +15,55 @@ higher level functionality - it's not going to render a PDF for you. There are
|
|
15
15
|
a few exceptions to support very common use cases like extracting text from a
|
16
16
|
page.
|
17
17
|
|
18
|
-
|
18
|
+
# Installation
|
19
19
|
|
20
20
|
The recommended installation method is via Rubygems.
|
21
21
|
|
22
|
+
```ruby
|
22
23
|
gem install pdf-reader
|
24
|
+
```
|
23
25
|
|
24
|
-
|
26
|
+
# Usage
|
25
27
|
|
26
28
|
Begin by creating a PDF::Reader instance that points to a PDF file. Document
|
27
29
|
level information (metadata, page count, bookmarks, etc) is available via
|
28
30
|
this object.
|
29
31
|
|
32
|
+
```ruby
|
30
33
|
reader = PDF::Reader.new("somefile.pdf")
|
31
34
|
|
32
35
|
puts reader.pdf_version
|
33
36
|
puts reader.info
|
34
37
|
puts reader.metadata
|
35
38
|
puts reader.page_count
|
39
|
+
```
|
36
40
|
|
37
41
|
PDF::Reader.new accepts an IO stream or a filename. Here's an example with
|
38
42
|
an IO stream:
|
39
43
|
|
44
|
+
```ruby
|
40
45
|
require 'open-uri'
|
41
46
|
|
42
47
|
io = open('http://example.com/somefile.pdf')
|
43
48
|
reader = PDF::Reader.new(io)
|
44
49
|
puts reader.info
|
50
|
+
```
|
45
51
|
|
46
52
|
If you open a PDF with File#open or IO#open, I strongly recommend using "rb"
|
47
53
|
mode to ensure the file isn't mangled by ruby being 'helpful'. This is
|
48
54
|
particularly important on windows and MRI >= 1.9.2.
|
49
55
|
|
56
|
+
```ruby
|
50
57
|
File.open("somefile.pdf", "rb") do |io|
|
51
58
|
reader = PDF::Reader.new(io)
|
52
59
|
puts reader.info
|
53
60
|
end
|
61
|
+
```
|
54
62
|
|
55
63
|
PDF is a page based file format, so most visible information is available via
|
56
64
|
page-based iteration
|
57
65
|
|
66
|
+
```ruby
|
58
67
|
reader = PDF::Reader.new("somefile.pdf")
|
59
68
|
|
60
69
|
reader.pages.each do |page|
|
@@ -62,10 +71,12 @@ page-based iteration
|
|
62
71
|
puts page.text
|
63
72
|
puts page.raw_content
|
64
73
|
end
|
74
|
+
```
|
65
75
|
|
66
76
|
If you need to access the full program for rendering a page, use the walk() method
|
67
77
|
of PDF::Reader::Page.
|
68
78
|
|
79
|
+
```ruby
|
69
80
|
class RedGreenBlue
|
70
81
|
def set_rgb_color_for_nonstroking(r, g, b)
|
71
82
|
puts "R: #{r}, G: #{g}, B: #{b}"
|
@@ -76,37 +87,32 @@ of PDF::Reader::Page.
|
|
76
87
|
page = reader.page(1)
|
77
88
|
receiver = RedGreenBlue.new
|
78
89
|
page.walk(receiver)
|
90
|
+
```
|
79
91
|
|
80
|
-
For low level access to the objects in a PDF file, use the ObjectHash class
|
81
|
-
|
82
|
-
|
83
|
-
puts PDF::Reader::ObjectHash.new("somefile.pdf")
|
84
|
-
|
85
|
-
or via a PDF::Reader instance:
|
92
|
+
For low level access to the objects in a PDF file, use the ObjectHash class like
|
93
|
+
so:
|
86
94
|
|
95
|
+
```ruby
|
87
96
|
reader = PDF::Reader.new("somefile.pdf")
|
88
|
-
puts reader.objects
|
89
|
-
|
90
|
-
The second method is preferred to increase the effectiveness of internal caching.
|
97
|
+
puts reader.objects.inspect
|
98
|
+
```
|
91
99
|
|
92
|
-
|
100
|
+
# Text Encoding
|
93
101
|
|
94
102
|
Regardless of the internal encoding used in the PDF all text will be converted
|
95
103
|
to UTF-8 before it is passed back from PDF::Reader.
|
96
104
|
|
97
|
-
Strings that contain binary data (like font blobs) will be marked as such
|
98
|
-
M17N aware VMs.
|
105
|
+
Strings that contain binary data (like font blobs) will be marked as such.
|
99
106
|
|
100
|
-
|
107
|
+
# Former API
|
101
108
|
|
102
109
|
Version 1.0.0 of PDF::Reader introduced a new page-based API that provides
|
103
110
|
efficient and easy access to any page.
|
104
111
|
|
105
|
-
The
|
106
|
-
|
107
|
-
warnings before it is completely removed in version 2.0.0.
|
112
|
+
The pre-1.0 API was deprecated during the 1.x release series, and has been
|
113
|
+
removed from 2.0.0.
|
108
114
|
|
109
|
-
|
115
|
+
# Exceptions
|
110
116
|
|
111
117
|
There are two key exceptions that you will need to watch out for when processing a
|
112
118
|
PDF file:
|
@@ -126,7 +132,7 @@ don't, 'rescue MalformedPDFError' will catch all the subclassed errors as well.
|
|
126
132
|
Any other exceptions should be considered bugs in PDF::Reader (please
|
127
133
|
report it!).
|
128
134
|
|
129
|
-
|
135
|
+
# PDF Integrity
|
130
136
|
|
131
137
|
Windows developers may run into problems when running specs due to MalformedPDFError's
|
132
138
|
This is usually because CRLF characters are automatically added to some of the PDF's in
|
@@ -134,18 +140,20 @@ the spec folder when you checkout a branch from Git.
|
|
134
140
|
|
135
141
|
To remove any invalid CRLF characters added while checking out a branch from Git, run:
|
136
142
|
|
143
|
+
```ruby
|
137
144
|
rake fix_integrity
|
145
|
+
```
|
138
146
|
|
139
|
-
|
147
|
+
# Maintainers
|
140
148
|
|
141
|
-
|
149
|
+
* James Healy <mailto:jimmy@deefa.com>
|
142
150
|
|
143
|
-
|
151
|
+
# Licensing
|
144
152
|
|
145
153
|
This library is distributed under the terms of the MIT License. See the included file for
|
146
154
|
more detail.
|
147
155
|
|
148
|
-
|
156
|
+
# Mailing List
|
149
157
|
|
150
158
|
Any questions or feedback should be sent to the PDF::Reader google group. It's
|
151
159
|
better that any answers be available for others instead of hiding in someone's
|
@@ -153,19 +161,23 @@ inbox.
|
|
153
161
|
|
154
162
|
http://groups.google.com/group/pdf-reader
|
155
163
|
|
156
|
-
|
164
|
+
# Examples
|
157
165
|
|
158
166
|
The easiest way to explain how this works in practice is to show some examples.
|
159
167
|
Check out the examples/ directory for a few files.
|
160
168
|
|
161
|
-
|
169
|
+
# Known Limitations
|
162
170
|
|
163
171
|
Occasionally some text cannot be extracted properly due to the way it has been
|
164
172
|
stored, or the use of invalid bytes. In these cases PDF::Reader will output a
|
165
173
|
little UTF-8 friendly box to indicate an unrecognisable character.
|
166
174
|
|
167
|
-
|
175
|
+
# Resources
|
176
|
+
|
177
|
+
* PDF::Reader Code Repository: http://github.com/yob/pdf-reader
|
178
|
+
|
179
|
+
* PDF Specification: http://www.adobe.com/devnet/pdf/pdf_reference.html
|
180
|
+
|
181
|
+
* PDF Tutorial Slide Presentations: https://web.archive.org/web/20150110042057/http://home.comcast.net/~jk05/presentations/PDFTutorials.html
|
168
182
|
|
169
|
-
|
170
|
-
- PDF Specification: http://www.adobe.com/devnet/pdf/pdf_reference.html
|
171
|
-
- PDF Tutorial Slide Presentations: http://home.comcast.net/~jk05/presentations/PDFTutorials.html
|
183
|
+
* Developing with PDF (book): http://shop.oreilly.com/product/0636920025269.do
|
data/Rakefile
CHANGED
@@ -1,19 +1,26 @@
|
|
1
|
-
require "
|
2
|
-
require "
|
3
|
-
|
4
|
-
|
5
|
-
require
|
6
|
-
require 'rake/rdoctask'
|
7
|
-
require 'rspec/core/rake_task'
|
8
|
-
require 'roodi'
|
9
|
-
require 'roodi_task'
|
1
|
+
require "bundler/gem_tasks"
|
2
|
+
require "digest/md5"
|
3
|
+
require "rdoc/task"
|
4
|
+
require "rspec/core/rake_task"
|
5
|
+
require "yaml"
|
10
6
|
|
11
7
|
desc "Default Task"
|
12
|
-
task :default => [ :spec ]
|
8
|
+
task :default => [ :quality, :spec ]
|
9
|
+
|
10
|
+
require 'cane/rake_task'
|
11
|
+
require 'morecane'
|
12
|
+
|
13
|
+
desc "Run cane to check quality metrics"
|
14
|
+
Cane::RakeTask.new(:quality) do |cane|
|
15
|
+
cane.abc_max = 20
|
16
|
+
cane.style_measure = 100
|
17
|
+
cane.max_violations = 31
|
18
|
+
|
19
|
+
cane.use Morecane::EncodingCheck, :encoding_glob => "{app,lib,spec}/**/*.rb"
|
20
|
+
end
|
13
21
|
|
14
|
-
# run all rspecs
|
15
22
|
desc "Run all rspec files"
|
16
|
-
RSpec::Core::RakeTask.new(
|
23
|
+
RSpec::Core::RakeTask.new(:spec) do |t|
|
17
24
|
t.rspec_opts = ["--color", "--format progress"]
|
18
25
|
t.ruby_opts = "-w"
|
19
26
|
end
|
@@ -31,16 +38,14 @@ Rake::RDocTask.new("doc") do |rdoc|
|
|
31
38
|
rdoc.options << "--inline-source"
|
32
39
|
end
|
33
40
|
|
34
|
-
RoodiTask.new 'roodi', ['lib/**/*.rb']
|
35
|
-
|
36
41
|
desc "Create a YAML file of integrity info for PDFs in the spec suite"
|
37
42
|
task :integrity_yaml do
|
38
43
|
data = {}
|
39
|
-
Dir.glob("spec/data/**/*.*").each do |path|
|
44
|
+
Dir.glob("spec/data/**/*.*").sort.each do |path|
|
40
45
|
path_without_spec = path.gsub("spec/","")
|
41
46
|
data[path_without_spec] = {
|
42
47
|
:bytes => File.size(path),
|
43
|
-
:md5
|
48
|
+
:md5 => Digest::MD5.hexdigest(File.read(path))
|
44
49
|
} if File.file?(path)
|
45
50
|
end
|
46
51
|
File.open("spec/integrity.yml","wb") { |f| f.write YAML.dump(data)}
|
data/bin/pdf_callbacks
CHANGED
data/bin/pdf_object
CHANGED
@@ -25,7 +25,10 @@ gen = gen.to_i
|
|
25
25
|
|
26
26
|
# make magic happen
|
27
27
|
begin
|
28
|
-
obj =
|
28
|
+
obj = nil
|
29
|
+
PDF::Reader.open(filename) do |pdf|
|
30
|
+
obj = pdf.objects[PDF::Reader::Reference.new(id, gen)]
|
31
|
+
end
|
29
32
|
|
30
33
|
case obj
|
31
34
|
when Hash, Array
|
data/bin/pdf_text
CHANGED
@@ -1,12 +1,10 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
3
|
require 'rubygems'
|
4
|
-
$LOAD_PATH.unshift(File.dirname(__FILE__) + "/../lib")
|
5
|
-
|
6
4
|
require 'pdf/reader'
|
7
5
|
|
8
6
|
if ARGV.empty?
|
9
|
-
browser = PDF::Reader.new(
|
7
|
+
browser = PDF::Reader.new(StringIO.new(ARGF.read))
|
10
8
|
else
|
11
9
|
browser = PDF::Reader.new(ARGV[0])
|
12
10
|
end
|
data/examples/callbacks.rb
CHANGED
@@ -9,12 +9,13 @@
|
|
9
9
|
require 'rubygems'
|
10
10
|
require 'pdf/reader'
|
11
11
|
|
12
|
-
receiver = PDF::Reader::RegisterReceiver.new
|
13
12
|
filename = File.expand_path(File.dirname(__FILE__)) + "/../spec/data/cairo-basic.pdf"
|
14
13
|
|
15
14
|
PDF::Reader.open(filename) do |reader|
|
16
15
|
reader.pages.each do |page|
|
16
|
+
receiver = PDF::Reader::RegisterReceiver.new
|
17
17
|
page.walk(receiver)
|
18
|
+
|
18
19
|
receiver.callbacks.each do |cb|
|
19
20
|
puts cb
|
20
21
|
end
|
data/examples/extract_images.rb
CHANGED
@@ -86,14 +86,15 @@ module ExtractImages
|
|
86
86
|
tiff = header.dup
|
87
87
|
tiff << short_tag.call( 256, 1, w ) # image width
|
88
88
|
tiff << short_tag.call( 257, 1, h ) # image height
|
89
|
-
tiff << long_tag.call( 258, 4, (header.size + (tag_count*12))) # bits per pixel
|
89
|
+
tiff << long_tag.call( 258, 4, (header.size + (tag_count*12) + 4)) # bits per pixel
|
90
90
|
tiff << short_tag.call( 259, 1, 1 ) # compression
|
91
91
|
tiff << short_tag.call( 262, 1, 5 ) # colorspace - separation
|
92
|
-
tiff << long_tag.call( 273, 1, (10 + (tag_count*12) +
|
92
|
+
tiff << long_tag.call( 273, 1, (10 + (tag_count*12) + 20) ) # data offset
|
93
93
|
tiff << short_tag.call( 277, 1, 4 ) # samples per pixel
|
94
94
|
tiff << long_tag.call( 279, 1, stream.unfiltered_data.size) # data byte size
|
95
95
|
tiff << short_tag.call( 284, 1, 1 ) # planer config
|
96
96
|
tiff << long_tag.call( 332, 1, 1) # inkset - CMYK
|
97
|
+
tiff << [0].pack("I") # next IFD pointer
|
97
98
|
tiff << [bpc, bpc, bpc, bpc].pack("IIII")
|
98
99
|
tiff << stream.unfiltered_data
|
99
100
|
File.open(filename, "wb") { |file| file.write tiff }
|
@@ -119,10 +120,12 @@ module ExtractImages
|
|
119
120
|
tiff << short_tag.call( 258, 1, 8 ) # bits per pixel
|
120
121
|
tiff << short_tag.call( 259, 1, 1 ) # compression
|
121
122
|
tiff << short_tag.call( 262, 1, 1 ) # colorspace - grayscale
|
122
|
-
tiff << long_tag.call( 273, 1, (10 + (tag_count*12)) ) # data offset
|
123
|
+
tiff << long_tag.call( 273, 1, (10 + (tag_count*12) + 4) ) # data offset
|
123
124
|
tiff << short_tag.call( 277, 1, 1 ) # samples per pixel
|
124
125
|
tiff << long_tag.call( 279, 1, stream.unfiltered_data.size) # data byte size
|
125
126
|
tiff << short_tag.call( 284, 1, 1 ) # planer config
|
127
|
+
tiff << [0].pack("I") # next IFD pointer
|
128
|
+
p stream.unfiltered_data.size
|
126
129
|
tiff << stream.unfiltered_data
|
127
130
|
File.open(filename, "wb") { |file| file.write tiff }
|
128
131
|
end
|
@@ -144,12 +147,13 @@ module ExtractImages
|
|
144
147
|
tiff = header.dup
|
145
148
|
tiff << short_tag.call( 256, 1, w ) # image width
|
146
149
|
tiff << short_tag.call( 257, 1, h ) # image height
|
147
|
-
tiff << long_tag.call( 258, 3, (header.size + (tag_count*12))) # bits per pixel
|
150
|
+
tiff << long_tag.call( 258, 3, (header.size + (tag_count*12) + 4)) # bits per pixel
|
148
151
|
tiff << short_tag.call( 259, 1, 1 ) # compression
|
149
152
|
tiff << short_tag.call( 262, 1, 2 ) # colorspace - RGB
|
150
|
-
tiff << long_tag.call( 273, 1, (header.size + (tag_count*12) +
|
153
|
+
tiff << long_tag.call( 273, 1, (header.size + (tag_count*12) + 16) ) # data offset
|
151
154
|
tiff << short_tag.call( 277, 1, 3 ) # samples per pixel
|
152
155
|
tiff << long_tag.call( 279, 1, stream.unfiltered_data.size) # data byte size
|
156
|
+
tiff << [0].pack("I") # next IFD pointer
|
153
157
|
tiff << [bpc, bpc, bpc].pack("III")
|
154
158
|
tiff << stream.unfiltered_data
|
155
159
|
File.open(filename, "wb") { |file| file.write tiff }
|
@@ -209,8 +213,9 @@ module ExtractImages
|
|
209
213
|
+ short_tag.call( 256, cols ) \
|
210
214
|
+ short_tag.call( 257, h ) \
|
211
215
|
+ short_tag.call( 259, 4 ) \
|
212
|
-
+ long_tag.call( 273, (10 + (5*12)) ) \
|
216
|
+
+ long_tag.call( 273, (10 + (5*12) + 4) ) \
|
213
217
|
+ long_tag.call( 279, len) \
|
218
|
+
+ [0].pack("I") \
|
214
219
|
+ stream.data
|
215
220
|
File.open(filename, "wb") { |file| file.write tiff }
|
216
221
|
end
|