fireinc-pdf-reader 0.11.0.alpha
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +168 -0
- data/MIT-LICENSE +21 -0
- data/README.rdoc +137 -0
- data/Rakefile +34 -0
- data/TODO +45 -0
- data/bin/pdf_list_callbacks +15 -0
- data/bin/pdf_object +48 -0
- data/bin/pdf_text +15 -0
- data/examples/callbacks.rb +21 -0
- data/examples/extract_bates.rb +49 -0
- data/examples/extract_images.rb +108 -0
- data/examples/hash.rb +12 -0
- data/examples/metadata.rb +25 -0
- data/examples/page_counter_improved.rb +23 -0
- data/examples/page_counter_naive.rb +24 -0
- data/examples/rspec.rb +57 -0
- data/examples/text.rb +40 -0
- data/examples/version.rb +25 -0
- data/lib/pdf/hash.rb +15 -0
- data/lib/pdf/reader/abstract_strategy.rb +81 -0
- data/lib/pdf/reader/buffer.rb +346 -0
- data/lib/pdf/reader/cmap.rb +138 -0
- data/lib/pdf/reader/encoding.rb +190 -0
- data/lib/pdf/reader/encodings/mac_expert.txt +159 -0
- data/lib/pdf/reader/encodings/mac_roman.txt +128 -0
- data/lib/pdf/reader/encodings/pdf_doc.txt +40 -0
- data/lib/pdf/reader/encodings/standard.txt +47 -0
- data/lib/pdf/reader/encodings/symbol.txt +154 -0
- data/lib/pdf/reader/encodings/win_ansi.txt +29 -0
- data/lib/pdf/reader/encodings/zapf_dingbats.txt +201 -0
- data/lib/pdf/reader/error.rb +53 -0
- data/lib/pdf/reader/filter.rb +219 -0
- data/lib/pdf/reader/font.rb +133 -0
- data/lib/pdf/reader/form_xobject.rb +83 -0
- data/lib/pdf/reader/glyphlist.txt +4322 -0
- data/lib/pdf/reader/lzw.rb +123 -0
- data/lib/pdf/reader/metadata_strategy.rb +56 -0
- data/lib/pdf/reader/object_cache.rb +85 -0
- data/lib/pdf/reader/object_hash.rb +289 -0
- data/lib/pdf/reader/object_stream.rb +51 -0
- data/lib/pdf/reader/page.rb +185 -0
- data/lib/pdf/reader/page_text_receiver.rb +278 -0
- data/lib/pdf/reader/pages_strategy.rb +475 -0
- data/lib/pdf/reader/parser.rb +225 -0
- data/lib/pdf/reader/print_receiver.rb +18 -0
- data/lib/pdf/reader/reference.rb +66 -0
- data/lib/pdf/reader/register_receiver.rb +95 -0
- data/lib/pdf/reader/stream.rb +69 -0
- data/lib/pdf/reader/text_receiver.rb +264 -0
- data/lib/pdf/reader/token.rb +41 -0
- data/lib/pdf/reader/xref.rb +220 -0
- data/lib/pdf/reader.rb +296 -0
- data/lib/pdf-reader.rb +1 -0
- metadata +211 -0
data/CHANGELOG
ADDED
@@ -0,0 +1,168 @@
|
|
1
|
+
v0.9.4 (XXX)
|
2
|
+
- support multiple receivers within a single pass over a source file
|
3
|
+
- massive time saving when dealing with multiple receivers
|
4
|
+
|
5
|
+
v0.9.3 (2nd July 2011)
|
6
|
+
- add PDF::Reader::Reference#hash method
|
7
|
+
- improves behaviour of Reference objects when they're used as Hash keys
|
8
|
+
|
9
|
+
v0.9.2 (24th April 2011)
|
10
|
+
- add basic support for fonts with Identity-V encoding.
|
11
|
+
- bug: improve robustness of text extraction
|
12
|
+
- thanks to Evan Arnold for reporting
|
13
|
+
- bug: fix loading of nested resources on XObjects
|
14
|
+
- thanks to Samuel Williams for reporting
|
15
|
+
- bug: improve parsing of files with XRef object streams
|
16
|
+
|
17
|
+
v0.9.1 (21st December 2010)
|
18
|
+
- force gem to only install on ruby 1.8.7 or higher
|
19
|
+
- maintaining support for earlier versions takes more time than I have
|
20
|
+
available at the moment
|
21
|
+
- bug: fix parsing of obscure pdf name format
|
22
|
+
- bug: fix behaviour when loaded in conjunction with htmldoc gem
|
23
|
+
|
24
|
+
v0.9.0 (19th November 2010)
|
25
|
+
- support for pdf 1.5+ files that use object and xref streams
|
26
|
+
- support streams that use a flate filter with the predictor option
|
27
|
+
- ensure all content instructions are parsed when split over multiple streams
|
28
|
+
- thanks to Jack Rusher for reporting
|
29
|
+
- Various string parsing bugs
|
30
|
+
- some character conversions to utf-8 were failing (thanks Andrea Barisani)
|
31
|
+
- hashes with nested hex strings were tokenising wrongly (thanks Evan Arnold)
|
32
|
+
- escaping bug in tokenising of literal strings (thanks David Westerink)
|
33
|
+
- Fix a bug that prevented PDFs with white space after the EOF marker from loading
|
34
|
+
- thanks to Solomon White for reporting the issue
|
35
|
+
- Add support for de-filtering some LZW compressed streams
|
36
|
+
- thanks to Jose Ignacio Rubio Iradi for the patch
|
37
|
+
- some small speed improvements
|
38
|
+
- API CHANGE: PDF::Hash renamed to PDF::Reader::ObjectHash
|
39
|
+
- having a class named Hash was confusing for users
|
40
|
+
|
41
|
+
v0.8.6 (27th August 2010)
|
42
|
+
- new method: hash#page_references
|
43
|
+
- returns references to all page objects, gives rapid access to objects
|
44
|
+
for a given page
|
45
|
+
|
46
|
+
v0.8.5 (11th April 2010)
|
47
|
+
- fix a regression introduced in 0.8.4.
|
48
|
+
- Parameters passed to resource_font callback were inadvertently changed
|
49
|
+
|
50
|
+
v0.8.4 (30th March 2010)
|
51
|
+
- fix parsing of files that use Form XObjects
|
52
|
+
- thanks to Andrea Barisani for reporting the issue
|
53
|
+
- fix two issues that caused a small number of characters to convert to Unicode
|
54
|
+
incorrectly
|
55
|
+
- thanks to Andrea Barisani for reporting the issue
|
56
|
+
- require 'pdf-reader' now works as well as 'pdf/reader'
|
57
|
+
- good practice to have the require file match the gem name
|
58
|
+
- thanks to Chris O'Meara for highlighting this
|
59
|
+
|
60
|
+
v0.8.3 (14th February 2010)
|
61
|
+
- Fix a bug in tokenising of hex strings inside dictionaries
|
62
|
+
- Thanks to Brad Ediger for detecting the issue and proposing a solution
|
63
|
+
|
64
|
+
v0.8.2 (1st January 2010)
|
65
|
+
- Fix parsing of files that use Form XObjects behind an indirect reference
|
66
|
+
(thanks Cornelius Illi and Patrick Crosby)
|
67
|
+
- Rewrote Buffer class to fix various speed issues reported over the years
|
68
|
+
- On my sample file extracting full text reduced from 220 seconds to 9 seconds.
|
69
|
+
|
70
|
+
v0.8.1 (27th November 2009)
|
71
|
+
- Added PDF::Hash#version. Provides access to the source file PDF version
|
72
|
+
|
73
|
+
v0.8.0 (20th November 2009)
|
74
|
+
- Added PDF::Hash. It provides direct access to objects from a PDF file
|
75
|
+
with an API that emulates the standard Ruby hash
|
76
|
+
|
77
|
+
v0.7.7 (11th September 2009)
|
78
|
+
- Trigger callbacks contained in Form XObjects when we encounter them in a
|
79
|
+
content stream
|
80
|
+
- Fix inheritance of page resources to comply with section 3.6.2
|
81
|
+
|
82
|
+
v0.7.6 (28th August 2009)
|
83
|
+
- Various bug fixes that increase the files we can successfully parse
|
84
|
+
- Treat float and integer tokens differently (thanks Neil)
|
85
|
+
- Correctly handle PDFs where the Kids element of a Pages dict is an indirect
|
86
|
+
reference (thanks Rob Holland)
|
87
|
+
- Fix conversion of PDF strings to Ruby strings on 1.8.6 (thanks Andrès Koetsier)
|
88
|
+
- Fix decoding with ASCII85 and ASCIIHex filters (thanks Andrès Koetsier)
|
89
|
+
- Fix extracting inline images from content streams (thanks Andrès Koetsier)
|
90
|
+
- Fix extracting [ ] from content streams (thanks Christian Rishøj)
|
91
|
+
- Fix conversion of text to UTF8 when the cmap uses bfrange (thanks Federico Gonzalez Lutteroth)
|
92
|
+
|
93
|
+
v0.7.5 (27th August 2008)
|
94
|
+
- Fix a 1.8.7ism
|
95
|
+
|
96
|
+
v0.7.4 (7th August 2008)
|
97
|
+
- Raise a MalformedPDFError if a content stream contains an unterminated string
|
98
|
+
- Fix a bug that was causing an endless loop on some OSX systems
|
99
|
+
- valid strings were incorrectly thought to be unterminated
|
100
|
+
- thanks to Jeff Webb for playing email ping pong with me as I tracked this
|
101
|
+
issue down
|
102
|
+
|
103
|
+
v0.7.3 (11th June 2008)
|
104
|
+
- Add a high level way to get direct access to a PDF object, including a new executable: pdf_object
|
105
|
+
- Fix a hard loop bug caused by a content stream that is missing a final operator
|
106
|
+
- Significantly simplified the internal code for encoding conversions
|
107
|
+
- Fixes YACC parsing bug that occurs on Fedora 8's ruby VM
|
108
|
+
- New callbacks
|
109
|
+
- page_count
|
110
|
+
- pdf_version
|
111
|
+
- Fix a bug that prevented a font's BaseFont from being recorded correctly
|
112
|
+
|
113
|
+
v0.7.2 (20th May 2008)
|
114
|
+
- Throw an UnsupportedFeatureError if we try to open an encrypted/secure PDF
|
115
|
+
- Correctly handle page content instruction sets with trailing whitespace
|
116
|
+
- Represent PDF Streams with a new object, PDF::Reader::Stream
|
117
|
+
- there really wasn't any point in separating the stream content from its associated dict. You need both
|
118
|
+
parts to correctly interpret the content
|
119
|
+
|
120
|
+
v0.7.1 (6th May 2008)
|
121
|
+
- Non-page strings (ie. metadata, etc) are now converted to UTF-8 more accurately
|
122
|
+
- Fixed a regression between 0.6.2 and 0.7 that prevented difference tables from being applied
|
123
|
+
correctly when translating text into UTF-8
|
124
|
+
|
125
|
+
v0.7 (6th May 2008)
|
126
|
+
- API INCOMPATIBLE CHANGE: any hashes that are passed to callbacks use symbols as keys instead of PDF::Reader::Name instances.
|
127
|
+
- Improved support for converting text in some PDF files to unicode
|
128
|
+
- Behave as expected if the Contents key in a Page Dict is a reference
|
129
|
+
- Include some basic metadata callbacks
|
130
|
+
- Don't interpret a comment token (%) inside a string as a comment
|
131
|
+
- Small fixes to improve 1.9 compatibility
|
132
|
+
- Improved our Zlib deflating to make it slightly more robust - still some more issues to work out though
|
133
|
+
- Throw an UnsupportedFeatureError if a pdf that uses XRef streams is opened
|
134
|
+
- Added an option to PDF::Reader#file and PDF::Reader#string to enable parsing of only parts of a PDF file(ie. only metadata, etc)
|
135
|
+
|
136
|
+
v0.6.2 (22nd March 2008)
|
137
|
+
- Catch low level errors when applying filters to a content stream and raise a MalformedPDFError instead.
|
138
|
+
- Added support for processing inline images
|
139
|
+
- Support for parsing XRef tables that have multiple subsections
|
140
|
+
- Added a few callbacks to improve the way we supply information on page resources
|
141
|
+
- Ignore whitespace in hex strings, as required by the spec (section 3.2.3)
|
142
|
+
- Use our "unknown character box" when a single character in an Identity-H string fails to decode
|
143
|
+
- Support ToUnicode CMaps that use the bfrange operator
|
144
|
+
- Tweaked tokenising code to ensure whitespace doesn't get in the way
|
145
|
+
|
146
|
+
v0.6.1 (12th March 2008)
|
147
|
+
- Tweaked behaviour when we encounter Identity-H encoded text that doesn't have a ToUnicode mapping. We
|
148
|
+
just replace each character with a little box.
|
149
|
+
- Use the same little box when invalid characters are found in other encodings instead of throwing an ugly
|
150
|
+
NoMethodError.
|
151
|
+
- Added a method to RegisterReceiver that returns all occurrences of a callback
|
152
|
+
|
153
|
+
v0.6.0 (27th February 2008)
|
154
|
+
- all text is now transparently converted to UTF-8 before being passed to the callbacks.
|
155
|
+
before this version, text was just passed as a byte level copy of what was in the PDF file, which
|
156
|
+
was mildly annoying with some encodings, and resulted in garbled text for Unicode encoded text.
|
157
|
+
- Fonts that use a difference table are now handled correctly
|
158
|
+
- fixed some 1.9 incompatible syntax
|
159
|
+
- expanded RegisterReceiver class to record extra info
|
160
|
+
- expanded rspec coverage
|
161
|
+
- tweaked a README example
|
162
|
+
|
163
|
+
v0.5.1 (1st January 2008)
|
164
|
+
- Several documentation tweaks
|
165
|
+
- Improve support for parsing PDFs under windows (thanks to Jari Williamsson)
|
166
|
+
|
167
|
+
v0.5 (14th December 2007)
|
168
|
+
- Initial Release
|
data/MIT-LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
Copyright (c) 2009 Peter Jones
|
2
|
+
Copyright (c) 2009 James Healy
|
3
|
+
|
4
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
a copy of this software and associated documentation files (the
|
6
|
+
"Software"), to deal in the Software without restriction, including
|
7
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be
|
13
|
+
included in all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,137 @@
|
|
1
|
+
The PDF::Reader library implements a PDF parser conforming as much as possible
|
2
|
+
to the PDF specification from Adobe.
|
3
|
+
|
4
|
+
It provides programmatic access to the contents of a PDF file with a high
|
5
|
+
degree of flexibility.
|
6
|
+
|
7
|
+
The PDF 1.7 specification is a weighty document and not all aspects are
|
8
|
+
currently supported. I welcome submission of PDF files that exhibit
|
9
|
+
unsupported aspects of the spec to assist with improving our support.
|
10
|
+
|
11
|
+
= Installation
|
12
|
+
|
13
|
+
The recommended installation method is via Rubygems.
|
14
|
+
|
15
|
+
gem install pdf-reader
|
16
|
+
|
17
|
+
= Usage
|
18
|
+
|
19
|
+
Begin by creating a PDF::Reader instance that points to a PDF file. Document
|
20
|
+
level information (metadata, page count, bookmarks, etc) is available via
|
21
|
+
this object.
|
22
|
+
|
23
|
+
reader = PDF::Reader.new("somefile.pdf")
|
24
|
+
|
25
|
+
puts reader.pdf_version
|
26
|
+
puts reader.info
|
27
|
+
puts reader.metadata
|
28
|
+
puts reader.page_count
|
29
|
+
|
30
|
+
PDF::Reader.new can accept an IO stream or a filename. Here's an example with
|
31
|
+
an IO stream:
|
32
|
+
|
33
|
+
require 'open-uri'
|
34
|
+
|
35
|
+
io = open('http://example.com/somefile.pdf')
|
36
|
+
reader = PDF::Reader.new(io)
|
37
|
+
puts reader.info
|
38
|
+
|
39
|
+
PDF is a page based file format, so most visible information is available via
|
40
|
+
page-based iteration
|
41
|
+
|
42
|
+
reader = PDF::Reader.new("somefile.pdf")
|
43
|
+
|
44
|
+
reader.pages.each do |page|
|
45
|
+
puts page.fonts
|
46
|
+
puts page.text
|
47
|
+
puts page.raw_content
|
48
|
+
end
|
49
|
+
|
50
|
+
If you need to access the full program for rendering a page, use the walk() method
|
51
|
+
of PDF::Reader::Page.
|
52
|
+
|
53
|
+
class RedGreenBlue
|
54
|
+
def set_rgb_color_for_nonstroking(r, g, b)
|
55
|
+
puts "R: #{r}, G: #{g}, B: #{b}"
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
reader = PDF::Reader.new("somefile.pdf")
|
60
|
+
page = reader.page(1)
|
61
|
+
receiver = RedGreenBlue.new
|
62
|
+
page.walk(receiver)
|
63
|
+
|
64
|
+
For low level access to the objects in a PDF file, use the ObjectHash class. You can
|
65
|
+
build an ObjectHash instance directly:
|
66
|
+
|
67
|
+
puts PDF::Reader::ObjectHash.new("somefile.pdf")
|
68
|
+
|
69
|
+
or via a PDF::Reader instance:
|
70
|
+
|
71
|
+
reader = PDF::Reader.new("somefile.pdf")
|
72
|
+
puts reader.objects
|
73
|
+
|
74
|
+
The second method is preferred to increase the effectiveness of internal caching.
|
75
|
+
|
76
|
+
= Text Encoding
|
77
|
+
|
78
|
+
Internally, text can be stored inside a PDF in various encodings, including
|
79
|
+
ZapfDingbats, win-1252, mac roman and a form of Unicode. To avoid confusion, all
|
80
|
+
text will be converted to UTF-8 before it is passed back from PDF::Reader.
|
81
|
+
|
82
|
+
Strings that contain binary data (like font blobs) will be marked as such on
|
83
|
+
M17N aware VMs.
|
84
|
+
|
85
|
+
= Exceptions
|
86
|
+
|
87
|
+
There are two key exceptions that you will need to watch out for when processing a
|
88
|
+
PDF file:
|
89
|
+
|
90
|
+
MalformedPDFError - The PDF appears to be corrupt in some way. If you believe the
|
91
|
+
file should be valid, or that a corrupt file didn't raise an exception, please
|
92
|
+
forward a copy of the file to the maintainers (preferably via the google group)
|
93
|
+
and we can attempt to improve the code.
|
94
|
+
|
95
|
+
UnsupportedFeatureError - The PDF uses a feature that PDF::Reader doesn't currently
|
96
|
+
support. Again, we welcome submissions of PDF files that exhibit these features to help
|
97
|
+
us with future code improvements.
|
98
|
+
|
99
|
+
MalformedPDFError has some subclasses if you want to detect finer grained issues. If you
|
100
|
+
don't, 'rescue MalformedPDFError' will catch all the subclassed errors as well.
|
101
|
+
|
102
|
+
Any other exceptions should be considered bugs in PDF::Reader (please
|
103
|
+
report it!).
|
104
|
+
|
105
|
+
= Maintainers
|
106
|
+
|
107
|
+
- James Healy <mailto:jimmy@deefa.com>
|
108
|
+
|
109
|
+
= Licensing
|
110
|
+
|
111
|
+
This library is distributed under the terms of the MIT License. See the included file for
|
112
|
+
more detail.
|
113
|
+
|
114
|
+
= Mailing List
|
115
|
+
|
116
|
+
Any questions or feedback should be sent to the PDF::Reader google group. It's
|
117
|
+
better that any answers be available for others instead of hiding in someone's
|
118
|
+
inbox.
|
119
|
+
|
120
|
+
http://groups.google.com/group/pdf-reader
|
121
|
+
|
122
|
+
= Examples
|
123
|
+
|
124
|
+
The easiest way to explain how this works in practice is to show some examples.
|
125
|
+
Check out the examples/ directory for a few files.
|
126
|
+
|
127
|
+
= Known Limitations
|
128
|
+
|
129
|
+
Occasionally some text cannot be extracted properly due to the way it has been
|
130
|
+
stored, or the use of invalid bytes. In these cases PDF::Reader will output a
|
131
|
+
little UTF-8 friendly box to indicate an unrecognisable character.
|
132
|
+
|
133
|
+
= Resources
|
134
|
+
|
135
|
+
- PDF::Reader Code Repository: http://github.com/yob/pdf-reader
|
136
|
+
- PDF Specification: http://www.adobe.com/devnet/pdf/pdf_reference.html
|
137
|
+
- PDF Tutorial Slide Presentations: http://home.comcast.net/~jk05/presentations/PDFTutorials.html
|
data/Rakefile
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
require "rubygems"
|
2
|
+
require "bundler"
|
3
|
+
Bundler.setup
|
4
|
+
|
5
|
+
require 'rake'
|
6
|
+
require 'rake/rdoctask'
|
7
|
+
require 'rspec/core/rake_task'
|
8
|
+
require 'roodi'
|
9
|
+
require 'roodi_task'
|
10
|
+
|
11
|
+
desc "Default Task"
task :default => [:spec]

# Run the complete rspec suite with ruby warnings (-w) switched on
desc "Run all rspec files"
RSpec::Core::RakeTask.new("spec") do |spec|
  spec.rspec_opts = ["--color", "--format progress"]
  spec.ruby_opts  = "-w"
end

# Generate the RDoc documentation; the output directory can be redirected
# through the CC_BUILD_ARTIFACTS environment variable
desc "Create documentation"
Rake::RDocTask.new("doc") do |rdoc|
  output_root = ENV['CC_BUILD_ARTIFACTS'] || 'doc'

  rdoc.title    = "pdf-reader"
  rdoc.rdoc_dir = output_root + '/rdoc'

  # files that make up the generated documentation, in their original order
  ['README.rdoc', 'TODO', 'CHANGELOG', 'MIT-LICENSE', 'lib/**/*.rb'].each do |pattern|
    rdoc.rdoc_files.include(pattern)
  end

  rdoc.options << "--inline-source"
end

RoodiTask.new 'roodi', ['lib/**/*.rb']
|
data/TODO
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
v0.8
|
2
|
+
- add extra callbacks
|
3
|
+
- list implemented features
|
4
|
+
- encrypted? tagged? bookmarks? annotated? optimised?
|
5
|
+
- Allow more than just page content and metadata to be parsed (see spec section 3.6.1)
|
6
|
+
- bookmarks?
|
7
|
+
- outline?
|
8
|
+
- articles?
|
9
|
+
- viewer prefs?
|
10
|
+
- Don't remove comment when tokenising in the middle of a string
|
11
|
+
- Tweak encoding mappings to differentiate between bytes that are invalid for an encoding, and bytes that are unchanged.
|
12
|
+
poppler seems to do this in a quite reasonable way. Original Encoding -> Glyph Names -> Unicode. As of 0.6 we go straight
|
13
|
+
from the Original encoding to Unicode.
|
14
|
+
- detect when a font's encoding is a CMap (generally used for pre-Unicode, multibyte asian encodings), and display a user friendly error
|
15
|
+
- Improve interpretation of non content stream data (ie metadata). recognise dates, etc
|
16
|
+
- Fix inheritance of page attributes. Resources has been done, but plenty of other attributes
|
17
|
+
are inheritable. See table 3.2.7 in the spec
|
18
|
+
|
19
|
+
v0.9
|
20
|
+
- Add a way to extract raster images
|
21
|
+
- see XObjects section of spec (section 4.7)
|
22
|
+
- Add a way to extract font data?
|
23
|
+
|
24
|
+
Sometime
|
25
|
+
- Support for CJK text (convert to UTF-8 like all other encodings. See Section 5.9 of the PDF spec)
|
26
|
+
- Will require significantly improved handling of CMaps, including creating a bunch of predefined ones
|
27
|
+
|
28
|
+
- Work out why specs/data/zlib*.pdf isn't parsed correctly when all the major PDF viewers can display it correctly
|
29
|
+
|
30
|
+
- Ship some extra receivers in the standard package, particularly ones that are useful for running
|
31
|
+
rspec over generated PDF files
|
32
|
+
|
33
|
+
- When we encounter Identity-H encoded text with no ToUnicode CMap, render the glyphs and treat them as images, as there's no
|
34
|
+
sensible way to convert them to unicode
|
35
|
+
|
36
|
+
- Add support for additional filters: ASCIIHexDecode, ASCII85Decode, LZWDecode, RunLengthDecode, CCITTFaxDecode, JBIG2Decode, DCTDecode, JPXDecode, Crypt?
|
37
|
+
|
38
|
+
- Add support for additional encodings:
|
39
|
+
- Identity-V(I *think* this relates to vertical text. Not sure how we'd support it sensibly)
|
40
|
+
|
41
|
+
- Investigate how R->L text is handled
|
42
|
+
|
43
|
+
- fix all callbacks to only ever return basic ruby objects (strings, ints,
|
44
|
+
arrays, symbols, hashes, etc). No PDF::Reader::Reference or
|
45
|
+
PDF::Reader::Font, etc.
|
@@ -0,0 +1,15 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
|
5
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__) + "/../lib")
|
6
|
+
|
7
|
+
require 'pdf/reader'
|
8
|
+
|
9
|
+
# Receiver handed to the parser; PrintReceiver is provided by pdf/reader
receiver = PDF::Reader::PrintReceiver.new

# Parse the file named by the first argument, or standard input when no
# argument was supplied
path = ARGV.first
if path
  PDF::Reader.file(path, receiver)
else
  PDF::Reader.new.parse($stdin, receiver)
end
|
data/bin/pdf_object
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
|
5
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__) + "/../lib")
|
6
|
+
|
7
|
+
USAGE = "USAGE: " + File.basename(__FILE__) + " <file> <object id> [generation]"
|
8
|
+
|
9
|
+
require 'pdf/reader'
|
10
|
+
|
11
|
+
# Command line arguments: <file> <object id> [generation]
filename, id, gen = *ARGV

# Usage problems are errors, so report them on stderr like every other
# failure in this script
if filename.nil? || id.nil?
  $stderr.puts USAGE
  exit 1
elsif !File.file?(filename)
  $stderr.puts "#{filename} does not exist"
  exit 1
end

# tweak the users options
#
# String#to_i silently turns non-numeric input into 0, which would look up
# the wrong object; insist on plain decimal digits instead.
gen ||= "0"
unless id =~ /\A\d+\z/ && gen =~ /\A\d+\z/
  $stderr.puts USAGE
  exit 1
end
id  = id.to_i
gen = gen.to_i

# make magic happen
begin
  obj = PDF::Reader.object_file(filename, id, gen)

  case obj
  when Hash, Array
    puts obj.inspect
  when PDF::Reader::Stream
    # print the stream dictionary first, then the unfiltered stream content
    puts obj.hash.inspect
    puts obj.unfiltered_data
  else
    puts obj
  end
rescue PDF::Reader::InvalidObjectError
  $stderr.puts "Error retrieving object #{id}, gen #{gen}. Does it exist?"
  exit 1
rescue PDF::Reader::MalformedPDFError => e
  $stderr.puts "Malformed PDF file: #{e.message}"
  exit 1
rescue PDF::Reader::UnsupportedFeatureError => e
  $stderr.puts "PDF file implements a feature unsupported by PDF::Reader: #{e.message}"
  exit 1
end
|
data/bin/pdf_text
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__) + "/../lib")
|
5
|
+
|
6
|
+
require 'pdf/reader'
|
7
|
+
|
8
|
+
# Read from the file named by the first argument, or from standard input when
# no argument was given
source  = ARGV.empty? ? $stdin : ARGV[0]
browser = PDF::Reader.new(source)

# Print the text of every page, in page order
browser.pages.each { |page| puts page.text }
|
@@ -0,0 +1,21 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# coding: utf-8
|
3
|
+
|
4
|
+
# List all callbacks generated by each page
|
5
|
+
#
|
6
|
+
# WARNING: this will generate a *lot* of output, so you probably want to pipe
|
7
|
+
# it through less or to a text file.
|
8
|
+
|
9
|
+
require 'rubygems'
|
10
|
+
require 'pdf/reader'
|
11
|
+
|
12
|
+
# A single receiver is shared by every page walked below
receiver = PDF::Reader::RegisterReceiver.new

PDF::Reader.open("somefile.pdf") do |reader|
  reader.pages.each do |page|
    page.walk(receiver)

    # dump everything the receiver currently holds, one callback per line
    receiver.callbacks.each { |callback| puts callback }
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# coding: utf-8
|
3
|
+
|
4
|
+
# A sample script that attempts to extract bates numbers from a PDF file.
|
5
|
+
# Bates numbers are often used to markup documents being used in legal
|
6
|
+
# cases. For more info, see http://en.wikipedia.org/wiki/Bates_numbering
|
7
|
+
#
|
8
|
+
# Acrobat 9 introduced a markup syntax that directly specifies the bates
|
9
|
+
# number for each page. For earlier versions, the easiest way to find
|
10
|
+
# the number is to look for words that match a pattern.
|
11
|
+
#
|
12
|
+
# This example attempts to extract numbers using the Acrobat 9 syntax.
|
13
|
+
# As a fall back, you can use a regular expression to look for words
|
14
|
+
# that match the numbers you expect in the page content.
|
15
|
+
|
16
|
+
require 'rubygems'
|
17
|
+
require 'pdf/reader'
|
18
|
+
|
19
|
+
# Receiver that collects bates numbers from marked content callbacks.
#
# A callback is recorded when its first argument is :Artifact and its second
# argument is a Hash whose :Subtype is :BatesN; the value stored under
# :Contents is appended to #numbers.
class BatesReceiver

  # Array of the :Contents values collected so far, in callback order
  attr_reader :numbers

  def initialize
    @numbers = []
  end

  # Inspect one marked content callback.
  #
  # args - the callback arguments; only the first two are examined.
  #
  # Returns nil when the callback is ignored, otherwise the updated
  # numbers array.
  def begin_marked_content(*args)
    tag, properties = args

    return unless tag == :Artifact
    # The second operand is not always a property Hash (it may be missing or
    # a bare name); indexing such a value with a Symbol would raise.
    return unless properties.is_a?(Hash)
    return unless properties[:Subtype] == :BatesN

    @numbers << properties[:Contents]
  end
  alias :begin_marked_content_with_pl :begin_marked_content

end
|
37
|
+
|
38
|
+
|
39
|
+
# For every page, print the bates numbers found in its marked content. When a
# page has none, fall back to scanning the page text for words that look like
# the expected numbers.
PDF::Reader.open("bates.pdf") do |reader|
  reader.pages.each do |page|
    # a fresh receiver per page keeps each page's numbers separate
    receiver = BatesReceiver.new
    page.walk(receiver)
    if receiver.numbers.empty?
      # the regular expression has to run over the extracted text of the page
      # NOTE(review): assumes the page object itself has no #scan - confirm
      puts page.text.scan(/CC.+/)
    else
      puts receiver.numbers.inspect
    end
  end
end
|
@@ -0,0 +1,108 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
# This demonstrates a way to extract some images (those based on the JPG or
|
4
|
+
# TIFF formats) from a PDF. There are other ways to store images, so
|
5
|
+
# it may need to be expanded for real world usage, but it should serve
|
6
|
+
# as a good guide.
|
7
|
+
#
|
8
|
+
# Thanks to Jack Rusher for the initial version of this example.
|
9
|
+
#
|
10
|
+
# USAGE:
|
11
|
+
#
|
12
|
+
# ruby extract_images.rb somefile.pdf
|
13
|
+
|
14
|
+
require 'pdf/reader'
|
15
|
+
|
16
|
+
# Helpers for writing the images found in a PDF to files in the current
# working directory.
module ExtractImages

  # Receiver whose resource_xobject callback saves each image it is given.
  class Receiver
    # Number of image XObjects handled so far; also used as filename prefix
    attr_reader :count

    def initialize
      @count = 0
    end

    # Handle one XObject.
    #
    # name   - the resource name, used in the output filename
    # stream - object responding to #hash (the stream dictionary) and #data
    #
    # XObjects whose :Subtype is not :Image are ignored. Images are written
    # as "<count>-<name>.tif" (:CCITTFaxDecode) or "<count>-<name>.jpg"
    # (:DCTDecode); any other :Filter value is reported on stderr.
    def resource_xobject(name, stream)
      return unless stream.hash[:Subtype] == :Image
      increment_count

      case stream.hash[:Filter]
      when :CCITTFaxDecode
        ExtractImages::Tiff.new(stream).save("#{count}-#{name}.tif")
      when :DCTDecode
        ExtractImages::Jpg.new(stream).save("#{count}-#{name}.jpg")
      else
        $stderr.puts "unrecognized image filter '#{stream.hash[:Filter]}'!"
      end
    end

    def increment_count
      @count += 1
    end
    private :increment_count

  end

  # Writes the data of a :DCTDecode stream to a file without modification.
  class Jpg
    attr_reader :stream

    def initialize(stream)
      @stream = stream
    end

    # Print the image dimensions and write the stream data to filename.
    def save(filename)
      w = stream.hash[:Width]
      h = stream.hash[:Height]
      puts "#{filename}: h=#{h}, w=#{w}"
      File.open(filename, "wb") { |file| file.write stream.data }
    end
  end

  # Wraps the data of a :CCITTFaxDecode stream in a minimal TIFF container.
  class Tiff
    # Size in bytes of the TIFF file header (byte order, magic, IFD offset)
    HEADER_SIZE      = 8
    # Size in bytes of the entry count that starts the image file directory
    ENTRY_COUNT_SIZE = 2
    # Size in bytes of one directory entry: tag, type, count, value
    ENTRY_SIZE       = 12
    # Size in bytes of the "next directory" offset that ends a directory
    NEXT_IFD_SIZE    = 4
    # Number of directory entries written by save_group_four
    ENTRY_COUNT      = 5

    attr_reader :stream

    def initialize(stream)
      @stream = stream
    end

    # Save the stream as a TIFF when its :K decode parameter is <= 0;
    # otherwise report on stderr that the image is not handled.
    #
    # NOTE(review): K == 0 is handled exactly like a negative K here, as in
    # the original example - confirm that is intended.
    def save(filename)
      if stream.hash[:DecodeParms][:K] <= 0
        save_group_four(filename)
      else
        $stderr.puts "#{filename}: CCITT non-group 4/2D image."
      end
    end

    private

    # Group 4, 2D
    #
    # Prints a summary of the stream dictionary, then writes a little-endian
    # TIFF made of: header, one image file directory with five entries, the
    # terminating next-directory offset, and the raw stream data.
    def save_group_four(filename)
      k    = stream.hash[:DecodeParms][:K]
      h    = stream.hash[:Height]
      w    = stream.hash[:Width]
      bpc  = stream.hash[:BitsPerComponent]
      mask = stream.hash[:ImageMask]
      len  = stream.hash[:Length]
      cols = stream.hash[:DecodeParms][:Columns]
      puts "#{filename}: h=#{h}, w=#{w}, bpc=#{bpc}, mask=#{mask}, len=#{len}, cols=#{cols}, k=#{k}"

      # Synthesize a TIFF header
      #
      # The header announces little-endian byte order ("II"), so every field
      # is packed with explicit little-endian directives (v = 16 bit,
      # V = 32 bit) instead of the native-endian s/I.
      long_tag  = lambda {|tag, value| [ tag, 4, 1, value ].pack( "vvVV" ) }
      short_tag = lambda {|tag, value| [ tag, 3, 1, value ].pack( "vvVV" ) }

      # The image data starts after the header, the directory and the 4 byte
      # next-directory offset that has to terminate every directory.
      data_offset = HEADER_SIZE + ENTRY_COUNT_SIZE +
                    (ENTRY_COUNT * ENTRY_SIZE) + NEXT_IFD_SIZE

      # header = byte order, version magic, offset of directory, directory count,
      # followed by a series of tags containing metadata: 259 is a magic number for
      # the compression type; 273 is the offset of the image data.
      tiff = [ 73, 73, 42, HEADER_SIZE, ENTRY_COUNT ].pack("ccvVv") \
           + short_tag.call( 256, cols ) \
           + short_tag.call( 257, h ) \
           + short_tag.call( 259, 4 ) \
           + long_tag.call( 273, data_offset ) \
           + long_tag.call( 279, len ) \
           + [ 0 ].pack("V") \
           + stream.data
      File.open(filename, "wb") { |file| file.write tiff }
    end
  end
end
|
106
|
+
|
107
|
+
# Run the file named by the first command line argument through a fresh
# image-extracting receiver
PDF::Reader.file(ARGV[0], ExtractImages::Receiver.new)
|
data/examples/hash.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# coding: utf-8
|
3
|
+
|
4
|
+
# get direct access to PDF objects
|
5
|
+
#
|
6
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__) + '/../lib')
|
7
|
+
|
8
|
+
require 'pdf/reader'
|
9
|
+
|
10
|
+
# Sample PDF, located relative to this script
filename = "#{File.dirname(__FILE__)}/../specs/data/cairo-unicode.pdf"

# Open the file as an ObjectHash and print what is stored under key 3
objects = PDF::Reader::ObjectHash.new(filename)
puts objects[3]
|