fireinc-pdf-reader 0.11.0.alpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +168 -0
- data/MIT-LICENSE +21 -0
- data/README.rdoc +137 -0
- data/Rakefile +34 -0
- data/TODO +45 -0
- data/bin/pdf_list_callbacks +15 -0
- data/bin/pdf_object +48 -0
- data/bin/pdf_text +15 -0
- data/examples/callbacks.rb +21 -0
- data/examples/extract_bates.rb +49 -0
- data/examples/extract_images.rb +108 -0
- data/examples/hash.rb +12 -0
- data/examples/metadata.rb +25 -0
- data/examples/page_counter_improved.rb +23 -0
- data/examples/page_counter_naive.rb +24 -0
- data/examples/rspec.rb +57 -0
- data/examples/text.rb +40 -0
- data/examples/version.rb +25 -0
- data/lib/pdf/hash.rb +15 -0
- data/lib/pdf/reader/abstract_strategy.rb +81 -0
- data/lib/pdf/reader/buffer.rb +346 -0
- data/lib/pdf/reader/cmap.rb +138 -0
- data/lib/pdf/reader/encoding.rb +190 -0
- data/lib/pdf/reader/encodings/mac_expert.txt +159 -0
- data/lib/pdf/reader/encodings/mac_roman.txt +128 -0
- data/lib/pdf/reader/encodings/pdf_doc.txt +40 -0
- data/lib/pdf/reader/encodings/standard.txt +47 -0
- data/lib/pdf/reader/encodings/symbol.txt +154 -0
- data/lib/pdf/reader/encodings/win_ansi.txt +29 -0
- data/lib/pdf/reader/encodings/zapf_dingbats.txt +201 -0
- data/lib/pdf/reader/error.rb +53 -0
- data/lib/pdf/reader/filter.rb +219 -0
- data/lib/pdf/reader/font.rb +133 -0
- data/lib/pdf/reader/form_xobject.rb +83 -0
- data/lib/pdf/reader/glyphlist.txt +4322 -0
- data/lib/pdf/reader/lzw.rb +123 -0
- data/lib/pdf/reader/metadata_strategy.rb +56 -0
- data/lib/pdf/reader/object_cache.rb +85 -0
- data/lib/pdf/reader/object_hash.rb +289 -0
- data/lib/pdf/reader/object_stream.rb +51 -0
- data/lib/pdf/reader/page.rb +185 -0
- data/lib/pdf/reader/page_text_receiver.rb +278 -0
- data/lib/pdf/reader/pages_strategy.rb +475 -0
- data/lib/pdf/reader/parser.rb +225 -0
- data/lib/pdf/reader/print_receiver.rb +18 -0
- data/lib/pdf/reader/reference.rb +66 -0
- data/lib/pdf/reader/register_receiver.rb +95 -0
- data/lib/pdf/reader/stream.rb +69 -0
- data/lib/pdf/reader/text_receiver.rb +264 -0
- data/lib/pdf/reader/token.rb +41 -0
- data/lib/pdf/reader/xref.rb +220 -0
- data/lib/pdf/reader.rb +296 -0
- data/lib/pdf-reader.rb +1 -0
- metadata +211 -0
data/CHANGELOG
ADDED
@@ -0,0 +1,168 @@
|
|
1
|
+
v0.9.4 (XXX)
|
2
|
+
- support multiple receivers within a single pass over a source file
|
3
|
+
- massive time saving when dealing with multiple receivers
|
4
|
+
|
5
|
+
v0.9.3 (2nd July 2011)
|
6
|
+
- add PDF::Reader::Reference#hash method
|
7
|
+
- improves behaviour of Reference objects when they're used as Hash keys
|
8
|
+
|
9
|
+
v0.9.2 (24th April 2011)
|
10
|
+
- add basic support for fonts with Identity-V encoding.
|
11
|
+
- bug: improve robustness of text extraction
|
12
|
+
- thanks to Evan Arnold for reporting
|
13
|
+
- bug: fix loading of nested resources on XObjects
|
14
|
+
- thanks to Samuel Williams for reporting
|
15
|
+
- bug: improve parsing of files with XRef object streams
|
16
|
+
|
17
|
+
v0.9.1 (21st December 2010)
|
18
|
+
- force gem to only install on ruby 1.8.7 or higher
|
19
|
+
- maintaining support for earlier versions takes more time than I have
|
20
|
+
available at the moment
|
21
|
+
- bug: fix parsing of obscure pdf name format
|
22
|
+
- bug: fix behaviour when loaded in conjunction with htmldoc gem
|
23
|
+
|
24
|
+
v0.9.0 (19th November 2010)
|
25
|
+
- support for pdf 1.5+ files that use object and xref streams
|
26
|
+
- support streams that use a flate filter with the predictor option
|
27
|
+
- ensure all content instructions are parsed when split over multiple streams
|
28
|
+
- thanks to Jack Rusher for reporting
|
29
|
+
- Various string parsing bugs
|
30
|
+
- some character conversions to utf-8 were failing (thanks Andrea Barisani)
|
31
|
+
- hashes with nested hex strings were tokenising wrongly (thanks Evan Arnold)
|
32
|
+
- escaping bug in tokenising of literal strings (thanks David Westerink)
|
33
|
+
- Fix a bug that prevented PDFs with white space after the EOF marker from loading
|
34
|
+
- thanks to Solomon White for reporting the issue
|
35
|
+
- Add support for de-filtering some LZW compressed streams
|
36
|
+
- thanks to Jose Ignacio Rubio Iradi for the patch
|
37
|
+
- some small speed improvements
|
38
|
+
- API CHANGE: PDF::Hash renamed to PDF::Reader::ObjectHash
|
39
|
+
- having a class named Hash was confusing for users
|
40
|
+
|
41
|
+
v0.8.6 (27th August 2010)
|
42
|
+
- new method: hash#page_references
|
43
|
+
- returns references to all page objects, gives rapid access to objects
|
44
|
+
for a given page
|
45
|
+
|
46
|
+
v0.8.5 (11th April 2010)
|
47
|
+
- fix a regression introduced in 0.8.4.
|
48
|
+
- Parameters passed to resource_font callback were inadvertently changed
|
49
|
+
|
50
|
+
v0.8.4 (30th March 2010)
|
51
|
+
- fix parsing of files that use Form XObjects
|
52
|
+
- thanks to Andrea Barisani for reporting the issue
|
53
|
+
- fix two issues that caused a small number of characters to convert to Unicode
|
54
|
+
incorrectly
|
55
|
+
- thanks to Andrea Barisani for reporting the issue
|
56
|
+
- require 'pdf-reader' now works as well as 'pdf/reader'
|
57
|
+
- good practice to have the require file match the gem name
|
58
|
+
- thanks to Chris O'Meara for highlighting this
|
59
|
+
|
60
|
+
v0.8.3 (14th February 2010)
|
61
|
+
- Fix a bug in tokenising of hex strings inside dictionaries
|
62
|
+
- Thanks to Brad Ediger for detecting the issue and proposing a solution
|
63
|
+
|
64
|
+
v0.8.2 (1st January 2010)
|
65
|
+
- Fix parsing of files that use Form XObjects behind an indirect reference
|
66
|
+
(thanks Cornelius Illi and Patrick Crosby)
|
67
|
+
- Rewrote Buffer class to fix various speed issues reported over the years
|
68
|
+
- On my sample file extracting full text reduced from 220 seconds to 9 seconds.
|
69
|
+
|
70
|
+
v0.8.1 (27th November 2009)
|
71
|
+
- Added PDF::Hash#version. Provides access to the source file PDF version
|
72
|
+
|
73
|
+
v0.8.0 (20th November 2009)
|
74
|
+
- Added PDF::Hash. It provides direct access to objects from a PDF file
|
75
|
+
with an API that emulates the standard Ruby hash
|
76
|
+
|
77
|
+
v0.7.7 (11th September 2009)
|
78
|
+
- Trigger callbacks contained in Form XObjects when we encounter them in a
|
79
|
+
content stream
|
80
|
+
- Fix inheritance of page resources to comply with section 3.6.2
|
81
|
+
|
82
|
+
v0.7.6 (28th August 2009)
|
83
|
+
- Various bug fixes that increase the files we can successfully parse
|
84
|
+
- Treat float and integer tokens differently (thanks Neil)
|
85
|
+
- Correctly handle PDFs where the Kids element of a Pages dict is an indirect
|
86
|
+
reference (thanks Rob Holland)
|
87
|
+
- Fix conversion of PDF strings to Ruby strings on 1.8.6 (thanks Andrès Koetsier)
|
88
|
+
- Fix decoding with ASCII85 and ASCIIHex filters (thanks Andrès Koetsier)
|
89
|
+
- Fix extracting inline images from content streams (thanks Andrès Koetsier)
|
90
|
+
- Fix extracting [ ] from content streams (thanks Christian Rishøj)
|
91
|
+
- Fix conversion of text to UTF8 when the cmap uses bfrange (thanks Federico Gonzalez Lutteroth)
|
92
|
+
|
93
|
+
v0.7.5 (27th August 2008)
|
94
|
+
- Fix a 1.8.7ism
|
95
|
+
|
96
|
+
v0.7.4 (7th August 2008)
|
97
|
+
- Raise a MalformedPDFError if a content stream contains an unterminated string
|
98
|
+
- Fix a bug that was causing an endless loop on some OSX systems
|
99
|
+
- valid strings were incorrectly thought to be unterminated
|
100
|
+
- thanks to Jeff Webb for playing email ping pong with me as I tracked this
|
101
|
+
issue down
|
102
|
+
|
103
|
+
v0.7.3 (11th June 2008)
|
104
|
+
- Add a high level way to get direct access to a PDF object, including a new executable: pdf_object
|
105
|
+
- Fix a hard loop bug caused by a content stream that is missing a final operator
|
106
|
+
- Significantly simplified the internal code for encoding conversions
|
107
|
+
- Fixes YACC parsing bug that occurs on Fedora 8's ruby VM
|
108
|
+
- New callbacks
|
109
|
+
- page_count
|
110
|
+
- pdf_version
|
111
|
+
- Fix a bug that prevented a font's BaseFont from being recorded correctly
|
112
|
+
|
113
|
+
v0.7.2 (20th May 2008)
|
114
|
+
- Throw an UnsupportedFeatureError if we try to open an encrypted/secure PDF
|
115
|
+
- Correctly handle page content instruction sets with trailing whitespace
|
116
|
+
- Represent PDF Streams with a new object, PDF::Reader::Stream
|
117
|
+
- there really wasn't any point in separating the stream content from its associated dict. You need both
|
118
|
+
parts to correctly interpret the content
|
119
|
+
|
120
|
+
v0.7.1 (6th May 2008)
|
121
|
+
- Non-page strings (ie. metadata, etc) are now converted to UTF-8 more accurately
|
122
|
+
- Fixed a regression between 0.6.2 and 0.7 that prevented difference tables from being applied
|
123
|
+
correctly when translating text into UTF-8
|
124
|
+
|
125
|
+
v0.7 (6th May 2008)
|
126
|
+
- API INCOMPATIBLE CHANGE: any hashes that are passed to callbacks use symbols as keys instead of PDF::Reader::Name instances.
|
127
|
+
- Improved support for converting text in some PDF files to unicode
|
128
|
+
- Behave as expected if the Contents key in a Page Dict is a reference
|
129
|
+
- Include some basic metadata callbacks
|
130
|
+
- Don't interpret a comment token (%) inside a string as a comment
|
131
|
+
- Small fixes to improve 1.9 compatibility
|
132
|
+
- Improved our Zlib deflating to make it slightly more robust - still some more issues to work out though
|
133
|
+
- Throw an UnsupportedFeatureError if a pdf that uses XRef streams is opened
|
134
|
+
- Added an option to PDF::Reader#file and PDF::Reader#string to enable parsing of only parts of a PDF file (ie. only metadata, etc)
|
135
|
+
|
136
|
+
v0.6.2 (22nd March 2008)
|
137
|
+
- Catch low level errors when applying filters to a content stream and raise a MalformedPDFError instead.
|
138
|
+
- Added support for processing inline images
|
139
|
+
- Support for parsing XRef tables that have multiple subsections
|
140
|
+
- Added a few callbacks to improve the way we supply information on page resources
|
141
|
+
- Ignore whitespace in hex strings, as required by the spec (section 3.2.3)
|
142
|
+
- Use our "unknown character box" when a single character in an Identity-H string fails to decode
|
143
|
+
- Support ToUnicode CMaps that use the bfrange operator
|
144
|
+
- Tweaked tokenising code to ensure whitespace doesn't get in the way
|
145
|
+
|
146
|
+
v0.6.1 (12th March 2008)
|
147
|
+
- Tweaked behaviour when we encounter Identity-H encoded text that doesn't have a ToUnicode mapping. We
|
148
|
+
just replace each character with a little box.
|
149
|
+
- Use the same little box when invalid characters are found in other encodings instead of throwing an ugly
|
150
|
+
NoMethodError.
|
151
|
+
- Added a method to RegisterReceiver that returns all occurrences of a callback
|
152
|
+
|
153
|
+
v0.6.0 (27th February 2008)
|
154
|
+
- all text is now transparently converted to UTF-8 before being passed to the callbacks.
|
155
|
+
before this version, text was just passed as a byte level copy of what was in the PDF file, which
|
156
|
+
was mildly annoying with some encodings, and resulted in garbled text for Unicode encoded text.
|
157
|
+
- Fonts that use a difference table are now handled correctly
|
158
|
+
- fixed some 1.9 incompatible syntax
|
159
|
+
- expanded RegisterReceiver class to record extra info
|
160
|
+
- expanded rspec coverage
|
161
|
+
- tweaked a README example
|
162
|
+
|
163
|
+
v0.5.1 (1st January 2008)
|
164
|
+
- Several documentation tweaks
|
165
|
+
- Improve support for parsing PDFs under windows (thanks to Jari Williamsson)
|
166
|
+
|
167
|
+
v0.5 (14th December 2007)
|
168
|
+
- Initial Release
|
data/MIT-LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
Copyright (c) 2009 Peter Jones
|
2
|
+
Copyright (c) 2009 James Healy
|
3
|
+
|
4
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
a copy of this software and associated documentation files (the
|
6
|
+
"Software"), to deal in the Software without restriction, including
|
7
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be
|
13
|
+
included in all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,137 @@
|
|
1
|
+
The PDF::Reader library implements a PDF parser conforming as much as possible
|
2
|
+
to the PDF specification from Adobe.
|
3
|
+
|
4
|
+
It provides programmatic access to the contents of a PDF file with a high
|
5
|
+
degree of flexibility.
|
6
|
+
|
7
|
+
The PDF 1.7 specification is a weighty document and not all aspects are
|
8
|
+
currently supported. I welcome submission of PDF files that exhibit
|
9
|
+
unsupported aspects of the spec to assist with improving our support.
|
10
|
+
|
11
|
+
= Installation
|
12
|
+
|
13
|
+
The recommended installation method is via Rubygems.
|
14
|
+
|
15
|
+
gem install pdf-reader
|
16
|
+
|
17
|
+
= Usage
|
18
|
+
|
19
|
+
Begin by creating a PDF::Reader instance that points to a PDF file. Document
|
20
|
+
level information (metadata, page count, bookmarks, etc) is available via
|
21
|
+
this object.
|
22
|
+
|
23
|
+
reader = PDF::Reader.new("somefile.pdf")
|
24
|
+
|
25
|
+
puts reader.pdf_version
|
26
|
+
puts reader.info
|
27
|
+
puts reader.metadata
|
28
|
+
puts reader.page_count
|
29
|
+
|
30
|
+
PDF::Reader.new can accept an IO stream or a filename. Here's an example with
|
31
|
+
an IO stream:
|
32
|
+
|
33
|
+
require 'open-uri'
|
34
|
+
|
35
|
+
io = open('http://example.com/somefile.pdf')
|
36
|
+
reader = PDF::Reader.new(io)
|
37
|
+
puts reader.info
|
38
|
+
|
39
|
+
PDF is a page based file format, so most visible information is available via
|
40
|
+
page-based iteration
|
41
|
+
|
42
|
+
reader = PDF::Reader.new("somefile.pdf")
|
43
|
+
|
44
|
+
reader.pages.each do |page|
|
45
|
+
puts page.fonts
|
46
|
+
puts page.text
|
47
|
+
puts page.raw_content
|
48
|
+
end
|
49
|
+
|
50
|
+
If you need to access the full program for rendering a page, use the walk() method
|
51
|
+
of PDF::Reader::Page.
|
52
|
+
|
53
|
+
class RedGreenBlue
|
54
|
+
def set_rgb_color_for_nonstroking(r, g, b)
|
55
|
+
puts "R: #{r}, G: #{g}, B: #{b}"
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
reader = PDF::Reader.new("somefile.pdf")
|
60
|
+
page = reader.page(1)
|
61
|
+
receiver = RedGreenBlue.new
|
62
|
+
page.walk(receiver)
|
63
|
+
|
64
|
+
For low level access to the objects in a PDF file, use the ObjectHash class. You can
|
65
|
+
build an ObjectHash instance directly:
|
66
|
+
|
67
|
+
puts PDF::Reader::ObjectHash.new("somefile.pdf")
|
68
|
+
|
69
|
+
or via a PDF::Reader instance:
|
70
|
+
|
71
|
+
reader = PDF::Reader.new("somefile.pdf")
|
72
|
+
puts reader.objects
|
73
|
+
|
74
|
+
The second method is preferred to increase the effectiveness of internal caching.
|
75
|
+
|
76
|
+
= Text Encoding
|
77
|
+
|
78
|
+
Internally, text can be stored inside a PDF in various encodings, including
|
79
|
+
Zapf Dingbats, win-1252, mac roman and a form of Unicode. To avoid confusion, all
|
80
|
+
text will be converted to UTF-8 before it is passed back from PDF::Reader.
|
81
|
+
|
82
|
+
Strings that contain binary data (like font blobs) will be marked as such on
|
83
|
+
M17N aware VMs.
|
84
|
+
|
85
|
+
= Exceptions
|
86
|
+
|
87
|
+
There are two key exceptions that you will need to watch out for when processing a
|
88
|
+
PDF file:
|
89
|
+
|
90
|
+
MalformedPDFError - The PDF appears to be corrupt in some way. If you believe the
|
91
|
+
file should be valid, or that a corrupt file didn't raise an exception, please
|
92
|
+
forward a copy of the file to the maintainers (preferably via the google group)
|
93
|
+
and we can attempt to improve the code.
|
94
|
+
|
95
|
+
UnsupportedFeatureError - The PDF uses a feature that PDF::Reader doesn't currently
|
96
|
+
support. Again, we welcome submissions of PDF files that exhibit these features to help
|
97
|
+
us with future code improvements.
|
98
|
+
|
99
|
+
MalformedPDFError has some subclasses if you want to detect finer grained issues. If you
|
100
|
+
don't, 'rescue MalformedPDFError' will catch all the subclassed errors as well.
|
101
|
+
|
102
|
+
Any other exceptions should be considered bugs in PDF::Reader (please
|
103
|
+
report it!).
|
104
|
+
|
105
|
+
= Maintainers
|
106
|
+
|
107
|
+
- James Healy <mailto:jimmy@deefa.com>
|
108
|
+
|
109
|
+
= Licensing
|
110
|
+
|
111
|
+
This library is distributed under the terms of the MIT License. See the included MIT-LICENSE file for
|
112
|
+
more detail.
|
113
|
+
|
114
|
+
= Mailing List
|
115
|
+
|
116
|
+
Any questions or feedback should be sent to the PDF::Reader google group. It's
|
117
|
+
better that any answers be available for others instead of hiding in someone's
|
118
|
+
inbox.
|
119
|
+
|
120
|
+
http://groups.google.com/group/pdf-reader
|
121
|
+
|
122
|
+
= Examples
|
123
|
+
|
124
|
+
The easiest way to explain how this works in practice is to show some examples.
|
125
|
+
Check out the examples/ directory for a few files.
|
126
|
+
|
127
|
+
= Known Limitations
|
128
|
+
|
129
|
+
Occasionally some text cannot be extracted properly due to the way it has been
|
130
|
+
stored, or the use of invalid bytes. In these cases PDF::Reader will output a
|
131
|
+
little UTF-8 friendly box to indicate an unrecognisable character.
|
132
|
+
|
133
|
+
= Resources
|
134
|
+
|
135
|
+
- PDF::Reader Code Repository: http://github.com/yob/pdf-reader
|
136
|
+
- PDF Specification: http://www.adobe.com/devnet/pdf/pdf_reference.html
|
137
|
+
- PDF Tutorial Slide Presentations: http://home.comcast.net/~jk05/presentations/PDFTutorials.html
|
data/Rakefile
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
require "rubygems"
|
2
|
+
require "bundler"
|
3
|
+
Bundler.setup
|
4
|
+
|
5
|
+
require 'rake'
|
6
|
+
require 'rake/rdoctask'
|
7
|
+
require 'rspec/core/rake_task'
|
8
|
+
require 'roodi'
|
9
|
+
require 'roodi_task'
|
10
|
+
|
11
|
+
desc "Default Task"
|
12
|
+
task :default => [ :spec ]
|
13
|
+
|
14
|
+
# run all rspecs
|
15
|
+
desc "Run all rspec files"
|
16
|
+
RSpec::Core::RakeTask.new("spec") do |t|
|
17
|
+
t.rspec_opts = ["--color", "--format progress"]
|
18
|
+
t.ruby_opts = "-w"
|
19
|
+
end
|
20
|
+
|
21
|
+
# Generate the RDoc documentation
|
22
|
+
desc "Create documentation"
|
23
|
+
Rake::RDocTask.new("doc") do |rdoc|
|
24
|
+
rdoc.title = "pdf-reader"
|
25
|
+
rdoc.rdoc_dir = (ENV['CC_BUILD_ARTIFACTS'] || 'doc') + '/rdoc'
|
26
|
+
rdoc.rdoc_files.include('README.rdoc')
|
27
|
+
rdoc.rdoc_files.include('TODO')
|
28
|
+
rdoc.rdoc_files.include('CHANGELOG')
|
29
|
+
rdoc.rdoc_files.include('MIT-LICENSE')
|
30
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
31
|
+
rdoc.options << "--inline-source"
|
32
|
+
end
|
33
|
+
|
34
|
+
RoodiTask.new 'roodi', ['lib/**/*.rb']
|
data/TODO
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
v0.8
|
2
|
+
- add extra callbacks
|
3
|
+
- list implemented features
|
4
|
+
- encrypted? tagged? bookmarks? annotated? optimised?
|
5
|
+
- Allow more than just page content and metadata to be parsed (see spec section 3.6.1)
|
6
|
+
- bookmarks?
|
7
|
+
- outline?
|
8
|
+
- articles?
|
9
|
+
- viewer prefs?
|
10
|
+
- Don't remove comment when tokenising in the middle of a string
|
11
|
+
- Tweak encoding mappings to differentiate between bytes that are invalid for an encoding, and bytes that are unchanged.
|
12
|
+
poppler seems to do this in a quite reasonable way. Original Encoding -> Glyph Names -> Unicode. As of 0.6 we go straight
|
13
|
+
from the Original encoding to Unicode.
|
14
|
+
- detect when a font's encoding is a CMap (generally used for pre-Unicode, multibyte asian encodings), and display a user friendly error
|
15
|
+
- Improve interpretation of non content stream data (ie metadata). recognise dates, etc
|
16
|
+
- Fix inheritance of page attributes. Resources has been done, but plenty of other attributes
|
17
|
+
are inheritable. See table 3.2.7 in the spec
|
18
|
+
|
19
|
+
v0.9
|
20
|
+
- Add a way to extract raster images
|
21
|
+
- see XObjects section of spec (section 4.7)
|
22
|
+
- Add a way to extract font data?
|
23
|
+
|
24
|
+
Sometime
|
25
|
+
- Support for CJK text (convert to UTF-8 like all other encodings. See Section 5.9 of the PDF spec)
|
26
|
+
- Will require significantly improved handling of CMaps, including creating a bunch of predefined ones
|
27
|
+
|
28
|
+
- Work out why specs/data/zlib*.pdf isn't parsed correctly when all the major PDF viewers can display it correctly
|
29
|
+
|
30
|
+
- Ship some extra receivers in the standard package, particularly ones that are useful for running
|
31
|
+
rspec over generated PDF files
|
32
|
+
|
33
|
+
- When we encounter Identity-H encoded text with no ToUnicode CMap, render the glyphs and treat them as images, as there's no
|
34
|
+
sensible way to convert them to unicode
|
35
|
+
|
36
|
+
- Add support for additional filters: ASCIIHexDecode, ASCII85Decode, LZWDecode, RunLengthDecode, CCITTFaxDecode, JBIG2Decode, DCTDecode, JPXDecode, Crypt?
|
37
|
+
|
38
|
+
- Add support for additional encodings:
|
39
|
+
- Identity-V (I *think* this relates to vertical text. Not sure how we'd support it sensibly)
|
40
|
+
|
41
|
+
- Investigate how R->L text is handled
|
42
|
+
|
43
|
+
- fix all callbacks to only ever return basic ruby objects (strings, ints,
|
44
|
+
arrays, symbols, hashes, etc). No PDF::Reader::Reference or
|
45
|
+
PDF::Reader::Font, etc.
|
@@ -0,0 +1,15 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
|
5
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__) + "/../lib")
|
6
|
+
|
7
|
+
require 'pdf/reader'
|
8
|
+
|
9
|
+
receiver = PDF::Reader::PrintReceiver.new
|
10
|
+
|
11
|
+
if ARGV.empty?
|
12
|
+
PDF::Reader.new.parse($stdin, receiver)
|
13
|
+
else
|
14
|
+
PDF::Reader.file(ARGV[0], receiver)
|
15
|
+
end
|
data/bin/pdf_object
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
|
5
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__) + "/../lib")
|
6
|
+
|
7
|
+
USAGE = "USAGE: " + File.basename(__FILE__) + " <file> <object id> [generation]"
|
8
|
+
|
9
|
+
require 'pdf/reader'
|
10
|
+
|
11
|
+
filename, id, gen = *ARGV
|
12
|
+
|
13
|
+
if filename.nil? || id.nil?
|
14
|
+
puts USAGE
|
15
|
+
exit 1
|
16
|
+
elsif !File.file?(filename)
|
17
|
+
$stderr.puts "#{filename} does not exist"
|
18
|
+
exit 1
|
19
|
+
end
|
20
|
+
|
21
|
+
# tweak the user's options
|
22
|
+
id = id.to_i
|
23
|
+
gen ||= 0
|
24
|
+
gen = gen.to_i
|
25
|
+
|
26
|
+
# make magic happen
|
27
|
+
begin
|
28
|
+
obj = PDF::Reader.object_file(filename, id, gen)
|
29
|
+
|
30
|
+
case obj
|
31
|
+
when Hash, Array
|
32
|
+
puts obj.inspect
|
33
|
+
when PDF::Reader::Stream
|
34
|
+
puts obj.hash.inspect
|
35
|
+
puts obj.unfiltered_data
|
36
|
+
else
|
37
|
+
puts obj
|
38
|
+
end
|
39
|
+
rescue PDF::Reader::InvalidObjectError
|
40
|
+
$stderr.puts "Error retreiving object #{id}, gen #{gen}. Does it exist?"
|
41
|
+
exit 1
|
42
|
+
rescue PDF::Reader::MalformedPDFError => e
|
43
|
+
$stderr.puts "Malformed PDF file: #{e.message}"
|
44
|
+
exit 1
|
45
|
+
rescue PDF::Reader::UnsupportedFeatureError => e
|
46
|
+
$stderr.puts "PDF file implements a feature unsupported by PDF::Reader: #{e.message}"
|
47
|
+
exit 1
|
48
|
+
end
|
data/bin/pdf_text
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__) + "/../lib")
|
5
|
+
|
6
|
+
require 'pdf/reader'
|
7
|
+
|
8
|
+
if ARGV.empty?
|
9
|
+
browser = PDF::Reader.new($stdin)
|
10
|
+
else
|
11
|
+
browser = PDF::Reader.new(ARGV[0])
|
12
|
+
end
|
13
|
+
browser.pages.each do |page|
|
14
|
+
puts page.text
|
15
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# coding: utf-8
|
3
|
+
|
4
|
+
# List all callbacks generated by each page
|
5
|
+
#
|
6
|
+
# WARNING: this will generate a *lot* of output, so you probably want to pipe
|
7
|
+
# it through less or to a text file.
|
8
|
+
|
9
|
+
require 'rubygems'
|
10
|
+
require 'pdf/reader'
|
11
|
+
|
12
|
+
receiver = PDF::Reader::RegisterReceiver.new
|
13
|
+
|
14
|
+
PDF::Reader.open("somefile.pdf") do |reader|
|
15
|
+
reader.pages.each do |page|
|
16
|
+
page.walk(receiver)
|
17
|
+
receiver.callbacks.each do |cb|
|
18
|
+
puts cb
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# coding: utf-8
|
3
|
+
|
4
|
+
# A sample script that attempts to extract bates numbers from a PDF file.
|
5
|
+
# Bates numbers are often used to mark up documents being used in legal
|
6
|
+
# cases. For more info, see http://en.wikipedia.org/wiki/Bates_numbering
|
7
|
+
#
|
8
|
+
# Acrobat 9 introduced a markup syntax that directly specifies the bates
|
9
|
+
# number for each page. For earlier versions, the easiest way to find
|
10
|
+
# the number is to look for words that match a pattern.
|
11
|
+
#
|
12
|
+
# This example attempts to extract numbers using the Acrobat 9 syntax.
|
13
|
+
# As a fall back, you can use a regular expression to look for words
|
14
|
+
# that match the numbers you expect in the page content.
|
15
|
+
|
16
|
+
require 'rubygems'
|
17
|
+
require 'pdf/reader'
|
18
|
+
|
19
|
+
class BatesReceiver
|
20
|
+
|
21
|
+
attr_reader :numbers
|
22
|
+
|
23
|
+
def initialize
|
24
|
+
@numbers = []
|
25
|
+
end
|
26
|
+
|
27
|
+
def begin_marked_content(*args)
|
28
|
+
return unless args.size >= 2
|
29
|
+
return unless args.first == :Artifact
|
30
|
+
return unless args[1][:Subtype] == :BatesN
|
31
|
+
|
32
|
+
@numbers << args[1][:Contents]
|
33
|
+
end
|
34
|
+
alias :begin_marked_content_with_pl :begin_marked_content
|
35
|
+
|
36
|
+
end
|
37
|
+
|
38
|
+
|
39
|
+
PDF::Reader.open("bates.pdf") do |reader|
|
40
|
+
reader.pages.each do |page|
|
41
|
+
receiver = BatesReceiver.new
|
42
|
+
page.walk(receiver)
|
43
|
+
if receiver.numbers.empty?
|
44
|
+
puts page.scan(/CC.+/)
|
45
|
+
else
|
46
|
+
puts receiver.numbers.inspect
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
@@ -0,0 +1,108 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
# This demonstrates a way to extract some images (those based on the JPG or
|
4
|
+
# TIFF formats) from a PDF. There are other ways to store images, so
|
5
|
+
# it may need to be expanded for real world usage, but it should serve
|
6
|
+
# as a good guide.
|
7
|
+
#
|
8
|
+
# Thanks to Jack Rusher for the initial version of this example.
|
9
|
+
#
|
10
|
+
# USAGE:
|
11
|
+
#
|
12
|
+
# ruby extract_images.rb somefile.pdf
|
13
|
+
|
14
|
+
require 'pdf/reader'
|
15
|
+
|
16
|
+
module ExtractImages
|
17
|
+
|
18
|
+
class Receiver
|
19
|
+
attr_reader :count
|
20
|
+
|
21
|
+
def initialize
|
22
|
+
@count = 0
|
23
|
+
end
|
24
|
+
|
25
|
+
def resource_xobject(name, stream)
|
26
|
+
return unless stream.hash[:Subtype] == :Image
|
27
|
+
increment_count
|
28
|
+
|
29
|
+
case stream.hash[:Filter]
|
30
|
+
when :CCITTFaxDecode
|
31
|
+
ExtractImages::Tiff.new(stream).save("#{count}-#{name}.tif")
|
32
|
+
when :DCTDecode
|
33
|
+
ExtractImages::Jpg.new(stream).save("#{count}-#{name}.jpg")
|
34
|
+
else
|
35
|
+
$stderr.puts "unrecognized image filter '#{stream.hash[:Filter]}'!"
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
def increment_count
|
40
|
+
@count += 1
|
41
|
+
end
|
42
|
+
private :increment_count
|
43
|
+
|
44
|
+
end
|
45
|
+
|
46
|
+
class Jpg
|
47
|
+
attr_reader :stream
|
48
|
+
|
49
|
+
def initialize(stream)
|
50
|
+
@stream = stream
|
51
|
+
end
|
52
|
+
|
53
|
+
def save(filename)
|
54
|
+
w = stream.hash[:Width]
|
55
|
+
h = stream.hash[:Height]
|
56
|
+
puts "#{filename}: h=#{h}, w=#{w}"
|
57
|
+
File.open(filename, "wb") { |file| file.write stream.data }
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
class Tiff
|
62
|
+
attr_reader :stream
|
63
|
+
|
64
|
+
def initialize(stream)
|
65
|
+
@stream = stream
|
66
|
+
end
|
67
|
+
|
68
|
+
def save(filename)
|
69
|
+
if stream.hash[:DecodeParms][:K] <= 0
|
70
|
+
save_group_four(filename)
|
71
|
+
else
|
72
|
+
$stderr.puts "#{filename}: CCITT non-group 4/2D image."
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
private
|
77
|
+
|
78
|
+
# Group 4, 2D
|
79
|
+
def save_group_four(filename)
|
80
|
+
k = stream.hash[:DecodeParms][:K]
|
81
|
+
h = stream.hash[:Height]
|
82
|
+
w = stream.hash[:Width]
|
83
|
+
bpc = stream.hash[:BitsPerComponent]
|
84
|
+
mask = stream.hash[:ImageMask]
|
85
|
+
len = stream.hash[:Length]
|
86
|
+
cols = stream.hash[:DecodeParms][:Columns]
|
87
|
+
puts "#{filename}: h=#{h}, w=#{w}, bpc=#{bpc}, mask=#{mask}, len=#{len}, cols=#{cols}, k=#{k}"
|
88
|
+
|
89
|
+
# Synthesize a TIFF header
|
90
|
+
long_tag = lambda {|tag, value| [ tag, 4, 1, value ].pack( "ssII" ) }
|
91
|
+
short_tag = lambda {|tag, value| [ tag, 3, 1, value ].pack( "ssII" ) }
|
92
|
+
# header = byte order, version magic, offset of directory, directory count,
|
93
|
+
# followed by a series of tags containing metadata: 259 is a magic number for
|
94
|
+
# the compression type; 273 is the offset of the image data.
|
95
|
+
tiff = [ 73, 73, 42, 8, 5 ].pack("ccsIs") \
|
96
|
+
+ short_tag.call( 256, cols ) \
|
97
|
+
+ short_tag.call( 257, h ) \
|
98
|
+
+ short_tag.call( 259, 4 ) \
|
99
|
+
+ long_tag.call( 273, (10 + (5*12)) ) \
|
100
|
+
+ long_tag.call( 279, len) \
|
101
|
+
+ stream.data
|
102
|
+
File.open(filename, "wb") { |file| file.write tiff }
|
103
|
+
end
|
104
|
+
end
|
105
|
+
end
|
106
|
+
|
107
|
+
receiver = ExtractImages::Receiver.new
|
108
|
+
PDF::Reader.file(ARGV[0], receiver)
|
data/examples/hash.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# coding: utf-8
|
3
|
+
|
4
|
+
# get direct access to PDF objects
|
5
|
+
#
|
6
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__) + '/../lib')
|
7
|
+
|
8
|
+
require 'pdf/reader'
|
9
|
+
|
10
|
+
filename = File.dirname(__FILE__) + "/../specs/data/cairo-unicode.pdf"
|
11
|
+
hash = PDF::Reader::ObjectHash.new(filename)
|
12
|
+
puts hash[3]
|