pdf-reader 1.1.1 → 2.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG +87 -2
- data/{README.rdoc → README.md} +43 -31
- data/Rakefile +21 -16
- data/bin/pdf_callbacks +1 -1
- data/bin/pdf_object +4 -1
- data/bin/pdf_text +1 -3
- data/examples/callbacks.rb +2 -1
- data/examples/extract_images.rb +11 -6
- data/examples/fuzzy_paragraphs.rb +24 -0
- data/lib/pdf/reader/afm/Courier-Bold.afm +342 -0
- data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -0
- data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -0
- data/lib/pdf/reader/afm/Courier.afm +342 -0
- data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -0
- data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -0
- data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -0
- data/lib/pdf/reader/afm/Helvetica.afm +3051 -0
- data/lib/pdf/reader/afm/MustRead.html +19 -0
- data/lib/pdf/reader/afm/Symbol.afm +213 -0
- data/lib/pdf/reader/afm/Times-Bold.afm +2588 -0
- data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -0
- data/lib/pdf/reader/afm/Times-Italic.afm +2667 -0
- data/lib/pdf/reader/afm/Times-Roman.afm +2419 -0
- data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -0
- data/lib/pdf/reader/buffer.rb +90 -63
- data/lib/pdf/reader/cid_widths.rb +63 -0
- data/lib/pdf/reader/cmap.rb +69 -38
- data/lib/pdf/reader/encoding.rb +74 -48
- data/lib/pdf/reader/error.rb +24 -4
- data/lib/pdf/reader/filter/ascii85.rb +28 -0
- data/lib/pdf/reader/filter/ascii_hex.rb +30 -0
- data/lib/pdf/reader/filter/depredict.rb +141 -0
- data/lib/pdf/reader/filter/flate.rb +53 -0
- data/lib/pdf/reader/filter/lzw.rb +21 -0
- data/lib/pdf/reader/filter/null.rb +18 -0
- data/lib/pdf/reader/filter/run_length.rb +45 -0
- data/lib/pdf/reader/filter.rb +15 -234
- data/lib/pdf/reader/font.rb +107 -43
- data/lib/pdf/reader/font_descriptor.rb +80 -0
- data/lib/pdf/reader/form_xobject.rb +26 -4
- data/lib/pdf/reader/glyph_hash.rb +56 -18
- data/lib/pdf/reader/lzw.rb +6 -4
- data/lib/pdf/reader/null_security_handler.rb +17 -0
- data/lib/pdf/reader/object_cache.rb +40 -16
- data/lib/pdf/reader/object_hash.rb +94 -40
- data/lib/pdf/reader/object_stream.rb +1 -0
- data/lib/pdf/reader/orientation_detector.rb +34 -0
- data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
- data/lib/pdf/reader/page.rb +48 -3
- data/lib/pdf/reader/page_layout.rb +125 -0
- data/lib/pdf/reader/page_state.rb +185 -70
- data/lib/pdf/reader/page_text_receiver.rb +70 -20
- data/lib/pdf/reader/pages_strategy.rb +4 -293
- data/lib/pdf/reader/parser.rb +37 -61
- data/lib/pdf/reader/print_receiver.rb +6 -0
- data/lib/pdf/reader/reference.rb +4 -1
- data/lib/pdf/reader/register_receiver.rb +17 -31
- data/lib/pdf/reader/resource_methods.rb +1 -0
- data/lib/pdf/reader/standard_security_handler.rb +82 -42
- data/lib/pdf/reader/standard_security_handler_v5.rb +91 -0
- data/lib/pdf/reader/stream.rb +5 -2
- data/lib/pdf/reader/synchronized_cache.rb +33 -0
- data/lib/pdf/reader/text_run.rb +99 -0
- data/lib/pdf/reader/token.rb +4 -1
- data/lib/pdf/reader/transformation_matrix.rb +195 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +17 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +67 -0
- data/lib/pdf/reader/width_calculator/composite.rb +28 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +56 -0
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +33 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +25 -0
- data/lib/pdf/reader/width_calculator.rb +12 -0
- data/lib/pdf/reader/xref.rb +41 -9
- data/lib/pdf/reader.rb +45 -104
- data/lib/pdf-reader.rb +4 -1
- metadata +220 -101
- data/bin/pdf_list_callbacks +0 -17
- data/lib/pdf/hash.rb +0 -15
- data/lib/pdf/reader/abstract_strategy.rb +0 -81
- data/lib/pdf/reader/metadata_strategy.rb +0 -56
- data/lib/pdf/reader/text_receiver.rb +0 -264
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 652d05cf6a22fad5ecb4b92de1e27ba60cafc6525c5ca524e24c7f9796fe1b83
|
4
|
+
data.tar.gz: 2c7448e97890a9fcbd10ec2cd5bafb9025db2fb75dabaf71a4074c542b1065a1
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: ac82452924cf46af98ee15f2a20642b1d06d5b9c22104fe171b5b4612665e482f341e12473805016ccb9d921fc15324ba51675170b369adeace8b278cd1279fb
|
7
|
+
data.tar.gz: b1dc1c4422b0e6bf01092cf724630ba7424fdef1fdaf34f33aaa3a31397caf6ef5a73185a98e6e2828a9e082d87cbca311565397cb064cac20d86e72be27626f
|
data/CHANGELOG
CHANGED
@@ -1,3 +1,88 @@
|
|
1
|
+
v2.5.0 (6th June 2021)
|
2
|
+
- bump minimum ruby version to 2.0
|
3
|
+
- Correctly handle transcoding to UTF-8 from some fonts that use a difference table [#344](https://github.com/yob/pdf-reader/pull/344/)
|
4
|
+
- Fix some character spacing issues with the TJ operator [#343](https://github.com/yob/pdf-reader/pull/343)
|
5
|
+
- Fix crash with some encrypted PDFs [#348](https://github.com/yob/pdf-reader/pull/348/)
|
6
|
+
- Fix positions of text on some PDFs with pages rotated 90° [#350](https://github.com/yob/pdf-reader/pull/350/)
|
7
|
+
|
8
|
+
v2.4.2 (28th January 2021)
|
9
|
+
- relax ASCII85 dependency to allow 1.x
|
10
|
+
- improved support for decompressing objects with slightly malformed zlib data
|
11
|
+
|
12
|
+
v2.4.1 (24th September 2020)
|
13
|
+
- Re-vendor font metrics from Adobe to clarify their license
|
14
|
+
|
15
|
+
v2.4.0 (21st November 2019)
|
16
|
+
- Optimise overlapping characters code introduced in 2.3.0. Text extraction of pages with
|
17
|
+
thousands of characters is still slower than it was in 2.2.1, but it might be tolerable
|
18
|
+
for now. See https://github.com/yob/pdf-reader/pull/308 for details.
|
19
|
+
- Implement very basic font substitution for Type1 and TrueType fonts that aren't embedded
|
20
|
+
- Remove PDF::Hash class. It's been deprecated since 2010, and it's hard to believe anyone
|
21
|
+
is still using it.
|
22
|
+
- Several small bug fixes
|
23
|
+
|
24
|
+
v2.3.0 (7th November 2019)
|
25
|
+
- Text extraction now makes an effort to skip duplicate characters that overlap, a
|
26
|
+
common approach used for a fake "bold" effect. This will make text extraction a bit
|
27
|
+
slower - if that turns out to be an issue I'll look into further optimisations or
|
28
|
+
provide a toggle to turn it off
|
29
|
+
- Several small bug fixes
|
30
|
+
|
31
|
+
v2.2.1 (27th July 2019)
|
32
|
+
- Improve utf8 text extraction from CMaps that contain surrogate pair ligatures
|
33
|
+
|
34
|
+
v2.2.0 (18th December 2018)
|
35
|
+
- Support additional XRef Stream variants (thanks Stefan Wienert)
|
36
|
+
- Add frozen_strings pragma to reduce object allocations on ruby 2.3+
|
37
|
+
- various bug fixes
|
38
|
+
|
39
|
+
v2.1.0 (15th February 2018)
|
40
|
+
- Support extra encrypted PDF variants (thanks to Gyuchang Jun)
|
41
|
+
- various bug fixes
|
42
|
+
|
43
|
+
v2.0.0 (25th February 2017)
|
44
|
+
- various bug fixes
|
45
|
+
|
46
|
+
v2.0.0.beta1 (15th February 2017)
|
47
|
+
- BREAKING CHANGE: remove all methods that were deprecated in 1.0.0
|
48
|
+
- Bug: Support extra encrypted PDF variants (thanks to Gyuchang Jun)
|
49
|
+
- various bug fixes
|
50
|
+
|
51
|
+
v1.4.1 (2nd January 2017)
|
52
|
+
- improve compatibility with ruby 2.4 (thanks Akira Matsuda)
|
53
|
+
- various bug fixes
|
54
|
+
|
55
|
+
v1.4.0 (22nd February 2016)
|
56
|
+
- raise minimum ruby version to 1.9.3
|
57
|
+
- print warnings to stderr when deprecated methods are used. These methods have been
|
58
|
+
deprecated for 4 years, so hopefully few people are depending on them
|
59
|
+
- Fix exception when a non-breaking space (character 160) is used with a
|
60
|
+
built-in font (helvetica, etc)
|
61
|
+
- various bug fixes
|
62
|
+
|
63
|
+
v1.3.3 (7th April 2013)
|
64
|
+
- various bug fixes
|
65
|
+
|
66
|
+
v1.3.2 (26th February 2013)
|
67
|
+
- various bug fixes
|
68
|
+
|
69
|
+
v1.3.1 (12th February 2013)
|
70
|
+
- various bug fixes
|
71
|
+
|
72
|
+
v1.3.0 (30th December 2012)
|
73
|
+
- Numerous performance optimisations (thanks Alex Dowad)
|
74
|
+
- Improved text extraction (thanks Nathaniel Madura)
|
75
|
+
- Load less of the hashery gem to reduce core monkey patches
|
76
|
+
- various bug fixes
|
77
|
+
|
78
|
+
v1.2.0 (28th August 2012)
|
79
|
+
- Feature: correctly extract text using surrogate pairs and ligatures
|
80
|
+
(thanks Nathaniel Madura)
|
81
|
+
- Speed optimisation: cache tokenised Form XObjects to avoid re-parsing them
|
82
|
+
- Feature: support opening documents with some junk bytes prepended to file
|
83
|
+
(thanks Paul Gallagher)
|
84
|
+
- Acrobat does this, so it seemed reasonable to add support
|
85
|
+
|
1
86
|
v1.1.1 (9th May 2012)
|
2
87
|
- bugfix release to improve parsing of some PDFs
|
3
88
|
|
@@ -56,10 +141,10 @@ v0.9.2 (24th April 2011)
|
|
56
141
|
|
57
142
|
v0.9.1 (21st December 2010)
|
58
143
|
- force gem to only install on ruby 1.8.7 or higher
|
59
|
-
- maintaining
|
144
|
+
- maintaining support for earlier versions takes more time than I have
|
60
145
|
available at the moment
|
61
146
|
- bug: fix parsing of obscure pdf name format
|
62
|
-
- bug: fix behaviour when loaded in
|
147
|
+
- bug: fix behaviour when loaded in conjunction with htmldoc gem
|
63
148
|
|
64
149
|
v0.9.0 (19th November 2010)
|
65
150
|
- support for pdf 1.5+ files that use object and xref streams
|
data/{README.rdoc → README.md}
RENAMED
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
# pdf-reader
|
2
2
|
|
3
3
|
The PDF::Reader library implements a PDF parser conforming as much as possible
|
4
4
|
to the PDF specification from Adobe.
|
@@ -15,46 +15,55 @@ higher level functionality - it's not going to render a PDF for you. There are
|
|
15
15
|
a few exceptions to support very common use cases like extracting text from a
|
16
16
|
page.
|
17
17
|
|
18
|
-
|
18
|
+
# Installation
|
19
19
|
|
20
20
|
The recommended installation method is via Rubygems.
|
21
21
|
|
22
|
+
```ruby
|
22
23
|
gem install pdf-reader
|
24
|
+
```
|
23
25
|
|
24
|
-
|
26
|
+
# Usage
|
25
27
|
|
26
28
|
Begin by creating a PDF::Reader instance that points to a PDF file. Document
|
27
29
|
level information (metadata, page count, bookmarks, etc) is available via
|
28
30
|
this object.
|
29
31
|
|
32
|
+
```ruby
|
30
33
|
reader = PDF::Reader.new("somefile.pdf")
|
31
34
|
|
32
35
|
puts reader.pdf_version
|
33
36
|
puts reader.info
|
34
37
|
puts reader.metadata
|
35
38
|
puts reader.page_count
|
39
|
+
```
|
36
40
|
|
37
41
|
PDF::Reader.new accepts an IO stream or a filename. Here's an example with
|
38
42
|
an IO stream:
|
39
43
|
|
44
|
+
```ruby
|
40
45
|
require 'open-uri'
|
41
46
|
|
42
47
|
io = open('http://example.com/somefile.pdf')
|
43
48
|
reader = PDF::Reader.new(io)
|
44
49
|
puts reader.info
|
50
|
+
```
|
45
51
|
|
46
52
|
If you open a PDF with File#open or IO#open, I strongly recommend using "rb"
|
47
53
|
mode to ensure the file isn't mangled by ruby being 'helpful'. This is
|
48
54
|
particularly important on windows and MRI >= 1.9.2.
|
49
55
|
|
56
|
+
```ruby
|
50
57
|
File.open("somefile.pdf", "rb") do |io|
|
51
58
|
reader = PDF::Reader.new(io)
|
52
59
|
puts reader.info
|
53
60
|
end
|
61
|
+
```
|
54
62
|
|
55
63
|
PDF is a page based file format, so most visible information is available via
|
56
64
|
page-based iteration
|
57
65
|
|
66
|
+
```ruby
|
58
67
|
reader = PDF::Reader.new("somefile.pdf")
|
59
68
|
|
60
69
|
reader.pages.each do |page|
|
@@ -62,10 +71,12 @@ page-based iteration
|
|
62
71
|
puts page.text
|
63
72
|
puts page.raw_content
|
64
73
|
end
|
74
|
+
```
|
65
75
|
|
66
76
|
If you need to access the full program for rendering a page, use the walk() method
|
67
77
|
of PDF::Reader::Page.
|
68
78
|
|
79
|
+
```ruby
|
69
80
|
class RedGreenBlue
|
70
81
|
def set_rgb_color_for_nonstroking(r, g, b)
|
71
82
|
puts "R: #{r}, G: #{g}, B: #{b}"
|
@@ -76,37 +87,32 @@ of PDF::Reader::Page.
|
|
76
87
|
page = reader.page(1)
|
77
88
|
receiver = RedGreenBlue.new
|
78
89
|
page.walk(receiver)
|
90
|
+
```
|
79
91
|
|
80
|
-
For low level access to the objects in a PDF file, use the ObjectHash class
|
81
|
-
|
82
|
-
|
83
|
-
puts PDF::Reader::ObjectHash.new("somefile.pdf")
|
84
|
-
|
85
|
-
or via a PDF::Reader instance:
|
92
|
+
For low level access to the objects in a PDF file, use the ObjectHash class like
|
93
|
+
so:
|
86
94
|
|
95
|
+
```ruby
|
87
96
|
reader = PDF::Reader.new("somefile.pdf")
|
88
|
-
puts reader.objects
|
89
|
-
|
90
|
-
The second method is preferred to increase the effectiveness of internal caching.
|
97
|
+
puts reader.objects.inspect
|
98
|
+
```
|
91
99
|
|
92
|
-
|
100
|
+
# Text Encoding
|
93
101
|
|
94
102
|
Regardless of the internal encoding used in the PDF all text will be converted
|
95
103
|
to UTF-8 before it is passed back from PDF::Reader.
|
96
104
|
|
97
|
-
Strings that contain binary data (like font blobs) will be marked as such
|
98
|
-
M17N aware VMs.
|
105
|
+
Strings that contain binary data (like font blobs) will be marked as such.
|
99
106
|
|
100
|
-
|
107
|
+
# Former API
|
101
108
|
|
102
109
|
Version 1.0.0 of PDF::Reader introduced a new page-based API that provides
|
103
110
|
efficient and easy access to any page.
|
104
111
|
|
105
|
-
The
|
106
|
-
|
107
|
-
warnings before it is completely removed in version 2.0.0.
|
112
|
+
The pre-1.0 API was deprecated during the 1.x release series, and has been
|
113
|
+
removed from 2.0.0.
|
108
114
|
|
109
|
-
|
115
|
+
# Exceptions
|
110
116
|
|
111
117
|
There are two key exceptions that you will need to watch out for when processing a
|
112
118
|
PDF file:
|
@@ -126,7 +132,7 @@ don't, 'rescue MalformedPDFError' will catch all the subclassed errors as well.
|
|
126
132
|
Any other exceptions should be considered bugs in PDF::Reader (please
|
127
133
|
report it!).
|
128
134
|
|
129
|
-
|
135
|
+
# PDF Integrity
|
130
136
|
|
131
137
|
Windows developers may run into problems when running specs due to MalformedPDFErrors.
|
132
138
|
This is usually because CRLF characters are automatically added to some of the PDF's in
|
@@ -134,18 +140,20 @@ the spec folder when you checkout a branch from Git.
|
|
134
140
|
|
135
141
|
To remove any invalid CRLF characters added while checking out a branch from Git, run:
|
136
142
|
|
143
|
+
```ruby
|
137
144
|
rake fix_integrity
|
145
|
+
```
|
138
146
|
|
139
|
-
|
147
|
+
# Maintainers
|
140
148
|
|
141
|
-
|
149
|
+
* James Healy <mailto:jimmy@deefa.com>
|
142
150
|
|
143
|
-
|
151
|
+
# Licensing
|
144
152
|
|
145
153
|
This library is distributed under the terms of the MIT License. See the included file for
|
146
154
|
more detail.
|
147
155
|
|
148
|
-
|
156
|
+
# Mailing List
|
149
157
|
|
150
158
|
Any questions or feedback should be sent to the PDF::Reader google group. It's
|
151
159
|
better that any answers be available for others instead of hiding in someone's
|
@@ -153,19 +161,23 @@ inbox.
|
|
153
161
|
|
154
162
|
http://groups.google.com/group/pdf-reader
|
155
163
|
|
156
|
-
|
164
|
+
# Examples
|
157
165
|
|
158
166
|
The easiest way to explain how this works in practice is to show some examples.
|
159
167
|
Check out the examples/ directory for a few files.
|
160
168
|
|
161
|
-
|
169
|
+
# Known Limitations
|
162
170
|
|
163
171
|
Occasionally some text cannot be extracted properly due to the way it has been
|
164
172
|
stored, or the use of invalid bytes. In these cases PDF::Reader will output a
|
165
173
|
little UTF-8 friendly box to indicate an unrecognisable character.
|
166
174
|
|
167
|
-
|
175
|
+
# Resources
|
176
|
+
|
177
|
+
* PDF::Reader Code Repository: http://github.com/yob/pdf-reader
|
178
|
+
|
179
|
+
* PDF Specification: http://www.adobe.com/devnet/pdf/pdf_reference.html
|
180
|
+
|
181
|
+
* PDF Tutorial Slide Presentations: https://web.archive.org/web/20150110042057/http://home.comcast.net/~jk05/presentations/PDFTutorials.html
|
168
182
|
|
169
|
-
|
170
|
-
- PDF Specification: http://www.adobe.com/devnet/pdf/pdf_reference.html
|
171
|
-
- PDF Tutorial Slide Presentations: http://home.comcast.net/~jk05/presentations/PDFTutorials.html
|
183
|
+
* Developing with PDF (book): http://shop.oreilly.com/product/0636920025269.do
|
data/Rakefile
CHANGED
@@ -1,19 +1,26 @@
|
|
1
|
-
require "
|
2
|
-
require "
|
3
|
-
|
4
|
-
|
5
|
-
require
|
6
|
-
require 'rake/rdoctask'
|
7
|
-
require 'rspec/core/rake_task'
|
8
|
-
require 'roodi'
|
9
|
-
require 'roodi_task'
|
1
|
+
require "bundler/gem_tasks"
|
2
|
+
require "digest/md5"
|
3
|
+
require "rdoc/task"
|
4
|
+
require "rspec/core/rake_task"
|
5
|
+
require "yaml"
|
10
6
|
|
11
7
|
desc "Default Task"
|
12
|
-
task :default => [ :spec ]
|
8
|
+
task :default => [ :quality, :spec ]
|
9
|
+
|
10
|
+
require 'cane/rake_task'
|
11
|
+
require 'morecane'
|
12
|
+
|
13
|
+
desc "Run cane to check quality metrics"
|
14
|
+
Cane::RakeTask.new(:quality) do |cane|
|
15
|
+
cane.abc_max = 20
|
16
|
+
cane.style_measure = 100
|
17
|
+
cane.max_violations = 31
|
18
|
+
|
19
|
+
cane.use Morecane::EncodingCheck, :encoding_glob => "{app,lib,spec}/**/*.rb"
|
20
|
+
end
|
13
21
|
|
14
|
-
# run all rspecs
|
15
22
|
desc "Run all rspec files"
|
16
|
-
RSpec::Core::RakeTask.new(
|
23
|
+
RSpec::Core::RakeTask.new(:spec) do |t|
|
17
24
|
t.rspec_opts = ["--color", "--format progress"]
|
18
25
|
t.ruby_opts = "-w"
|
19
26
|
end
|
@@ -31,16 +38,14 @@ Rake::RDocTask.new("doc") do |rdoc|
|
|
31
38
|
rdoc.options << "--inline-source"
|
32
39
|
end
|
33
40
|
|
34
|
-
RoodiTask.new 'roodi', ['lib/**/*.rb']
|
35
|
-
|
36
41
|
desc "Create a YAML file of integrity info for PDFs in the spec suite"
|
37
42
|
task :integrity_yaml do
|
38
43
|
data = {}
|
39
|
-
Dir.glob("spec/data/**/*.*").each do |path|
|
44
|
+
Dir.glob("spec/data/**/*.*").sort.each do |path|
|
40
45
|
path_without_spec = path.gsub("spec/","")
|
41
46
|
data[path_without_spec] = {
|
42
47
|
:bytes => File.size(path),
|
43
|
-
:md5
|
48
|
+
:md5 => Digest::MD5.hexdigest(File.read(path))
|
44
49
|
} if File.file?(path)
|
45
50
|
end
|
46
51
|
File.open("spec/integrity.yml","wb") { |f| f.write YAML.dump(data)}
|
data/bin/pdf_callbacks
CHANGED
data/bin/pdf_object
CHANGED
@@ -25,7 +25,10 @@ gen = gen.to_i
|
|
25
25
|
|
26
26
|
# make magic happen
|
27
27
|
begin
|
28
|
-
obj =
|
28
|
+
obj = nil
|
29
|
+
PDF::Reader.open(filename) do |pdf|
|
30
|
+
obj = pdf.objects[PDF::Reader::Reference.new(id, gen)]
|
31
|
+
end
|
29
32
|
|
30
33
|
case obj
|
31
34
|
when Hash, Array
|
data/bin/pdf_text
CHANGED
@@ -1,12 +1,10 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
3
|
require 'rubygems'
|
4
|
-
$LOAD_PATH.unshift(File.dirname(__FILE__) + "/../lib")
|
5
|
-
|
6
4
|
require 'pdf/reader'
|
7
5
|
|
8
6
|
if ARGV.empty?
|
9
|
-
browser = PDF::Reader.new(
|
7
|
+
browser = PDF::Reader.new(StringIO.new(ARGF.read))
|
10
8
|
else
|
11
9
|
browser = PDF::Reader.new(ARGV[0])
|
12
10
|
end
|
data/examples/callbacks.rb
CHANGED
@@ -9,12 +9,13 @@
|
|
9
9
|
require 'rubygems'
|
10
10
|
require 'pdf/reader'
|
11
11
|
|
12
|
-
receiver = PDF::Reader::RegisterReceiver.new
|
13
12
|
filename = File.expand_path(File.dirname(__FILE__)) + "/../spec/data/cairo-basic.pdf"
|
14
13
|
|
15
14
|
PDF::Reader.open(filename) do |reader|
|
16
15
|
reader.pages.each do |page|
|
16
|
+
receiver = PDF::Reader::RegisterReceiver.new
|
17
17
|
page.walk(receiver)
|
18
|
+
|
18
19
|
receiver.callbacks.each do |cb|
|
19
20
|
puts cb
|
20
21
|
end
|
data/examples/extract_images.rb
CHANGED
@@ -86,14 +86,15 @@ module ExtractImages
|
|
86
86
|
tiff = header.dup
|
87
87
|
tiff << short_tag.call( 256, 1, w ) # image width
|
88
88
|
tiff << short_tag.call( 257, 1, h ) # image height
|
89
|
-
tiff << long_tag.call( 258, 4, (header.size + (tag_count*12))) # bits per pixel
|
89
|
+
tiff << long_tag.call( 258, 4, (header.size + (tag_count*12) + 4)) # bits per pixel
|
90
90
|
tiff << short_tag.call( 259, 1, 1 ) # compression
|
91
91
|
tiff << short_tag.call( 262, 1, 5 ) # colorspace - separation
|
92
|
-
tiff << long_tag.call( 273, 1, (10 + (tag_count*12) +
|
92
|
+
tiff << long_tag.call( 273, 1, (10 + (tag_count*12) + 20) ) # data offset
|
93
93
|
tiff << short_tag.call( 277, 1, 4 ) # samples per pixel
|
94
94
|
tiff << long_tag.call( 279, 1, stream.unfiltered_data.size) # data byte size
|
95
95
|
tiff << short_tag.call( 284, 1, 1 ) # planer config
|
96
96
|
tiff << long_tag.call( 332, 1, 1) # inkset - CMYK
|
97
|
+
tiff << [0].pack("I") # next IFD pointer
|
97
98
|
tiff << [bpc, bpc, bpc, bpc].pack("IIII")
|
98
99
|
tiff << stream.unfiltered_data
|
99
100
|
File.open(filename, "wb") { |file| file.write tiff }
|
@@ -119,10 +120,12 @@ module ExtractImages
|
|
119
120
|
tiff << short_tag.call( 258, 1, 8 ) # bits per pixel
|
120
121
|
tiff << short_tag.call( 259, 1, 1 ) # compression
|
121
122
|
tiff << short_tag.call( 262, 1, 1 ) # colorspace - grayscale
|
122
|
-
tiff << long_tag.call( 273, 1, (10 + (tag_count*12)) ) # data offset
|
123
|
+
tiff << long_tag.call( 273, 1, (10 + (tag_count*12) + 4) ) # data offset
|
123
124
|
tiff << short_tag.call( 277, 1, 1 ) # samples per pixel
|
124
125
|
tiff << long_tag.call( 279, 1, stream.unfiltered_data.size) # data byte size
|
125
126
|
tiff << short_tag.call( 284, 1, 1 ) # planer config
|
127
|
+
tiff << [0].pack("I") # next IFD pointer
|
128
|
+
p stream.unfiltered_data.size
|
126
129
|
tiff << stream.unfiltered_data
|
127
130
|
File.open(filename, "wb") { |file| file.write tiff }
|
128
131
|
end
|
@@ -144,12 +147,13 @@ module ExtractImages
|
|
144
147
|
tiff = header.dup
|
145
148
|
tiff << short_tag.call( 256, 1, w ) # image width
|
146
149
|
tiff << short_tag.call( 257, 1, h ) # image height
|
147
|
-
tiff << long_tag.call( 258, 3, (header.size + (tag_count*12))) # bits per pixel
|
150
|
+
tiff << long_tag.call( 258, 3, (header.size + (tag_count*12) + 4)) # bits per pixel
|
148
151
|
tiff << short_tag.call( 259, 1, 1 ) # compression
|
149
152
|
tiff << short_tag.call( 262, 1, 2 ) # colorspace - RGB
|
150
|
-
tiff << long_tag.call( 273, 1, (header.size + (tag_count*12) +
|
153
|
+
tiff << long_tag.call( 273, 1, (header.size + (tag_count*12) + 16) ) # data offset
|
151
154
|
tiff << short_tag.call( 277, 1, 3 ) # samples per pixel
|
152
155
|
tiff << long_tag.call( 279, 1, stream.unfiltered_data.size) # data byte size
|
156
|
+
tiff << [0].pack("I") # next IFD pointer
|
153
157
|
tiff << [bpc, bpc, bpc].pack("III")
|
154
158
|
tiff << stream.unfiltered_data
|
155
159
|
File.open(filename, "wb") { |file| file.write tiff }
|
@@ -209,8 +213,9 @@ module ExtractImages
|
|
209
213
|
+ short_tag.call( 256, cols ) \
|
210
214
|
+ short_tag.call( 257, h ) \
|
211
215
|
+ short_tag.call( 259, 4 ) \
|
212
|
-
+ long_tag.call( 273, (10 + (5*12)) ) \
|
216
|
+
+ long_tag.call( 273, (10 + (5*12) + 4) ) \
|
213
217
|
+ long_tag.call( 279, len) \
|
218
|
+
+ [0].pack("I") \
|
214
219
|
+ stream.data
|
215
220
|
File.open(filename, "wb") { |file| file.write tiff }
|
216
221
|
end
|