fireinc-pdf-reader 0.11.0.alpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. data/CHANGELOG +168 -0
  2. data/MIT-LICENSE +21 -0
  3. data/README.rdoc +137 -0
  4. data/Rakefile +34 -0
  5. data/TODO +45 -0
  6. data/bin/pdf_list_callbacks +15 -0
  7. data/bin/pdf_object +48 -0
  8. data/bin/pdf_text +15 -0
  9. data/examples/callbacks.rb +21 -0
  10. data/examples/extract_bates.rb +49 -0
  11. data/examples/extract_images.rb +108 -0
  12. data/examples/hash.rb +12 -0
  13. data/examples/metadata.rb +25 -0
  14. data/examples/page_counter_improved.rb +23 -0
  15. data/examples/page_counter_naive.rb +24 -0
  16. data/examples/rspec.rb +57 -0
  17. data/examples/text.rb +40 -0
  18. data/examples/version.rb +25 -0
  19. data/lib/pdf/hash.rb +15 -0
  20. data/lib/pdf/reader/abstract_strategy.rb +81 -0
  21. data/lib/pdf/reader/buffer.rb +346 -0
  22. data/lib/pdf/reader/cmap.rb +138 -0
  23. data/lib/pdf/reader/encoding.rb +190 -0
  24. data/lib/pdf/reader/encodings/mac_expert.txt +159 -0
  25. data/lib/pdf/reader/encodings/mac_roman.txt +128 -0
  26. data/lib/pdf/reader/encodings/pdf_doc.txt +40 -0
  27. data/lib/pdf/reader/encodings/standard.txt +47 -0
  28. data/lib/pdf/reader/encodings/symbol.txt +154 -0
  29. data/lib/pdf/reader/encodings/win_ansi.txt +29 -0
  30. data/lib/pdf/reader/encodings/zapf_dingbats.txt +201 -0
  31. data/lib/pdf/reader/error.rb +53 -0
  32. data/lib/pdf/reader/filter.rb +219 -0
  33. data/lib/pdf/reader/font.rb +133 -0
  34. data/lib/pdf/reader/form_xobject.rb +83 -0
  35. data/lib/pdf/reader/glyphlist.txt +4322 -0
  36. data/lib/pdf/reader/lzw.rb +123 -0
  37. data/lib/pdf/reader/metadata_strategy.rb +56 -0
  38. data/lib/pdf/reader/object_cache.rb +85 -0
  39. data/lib/pdf/reader/object_hash.rb +289 -0
  40. data/lib/pdf/reader/object_stream.rb +51 -0
  41. data/lib/pdf/reader/page.rb +185 -0
  42. data/lib/pdf/reader/page_text_receiver.rb +278 -0
  43. data/lib/pdf/reader/pages_strategy.rb +475 -0
  44. data/lib/pdf/reader/parser.rb +225 -0
  45. data/lib/pdf/reader/print_receiver.rb +18 -0
  46. data/lib/pdf/reader/reference.rb +66 -0
  47. data/lib/pdf/reader/register_receiver.rb +95 -0
  48. data/lib/pdf/reader/stream.rb +69 -0
  49. data/lib/pdf/reader/text_receiver.rb +264 -0
  50. data/lib/pdf/reader/token.rb +41 -0
  51. data/lib/pdf/reader/xref.rb +220 -0
  52. data/lib/pdf/reader.rb +296 -0
  53. data/lib/pdf-reader.rb +1 -0
  54. metadata +211 -0
data/CHANGELOG ADDED
@@ -0,0 +1,168 @@
1
+ v0.9.4 (XXX)
2
+ - support multiple receivers within a single pass over a source file
3
+ - massive time saving when dealing with multiple receivers
4
+
5
+ v0.9.3 (2nd July 2011)
6
+ - add PDF::Reader::Reference#hash method
7
+ - improves behaviour of Reference objects when they're used as Hash keys
8
+
9
+ v0.9.2 (24th April 2011)
10
+ - add basic support for fonts with Identity-V encoding.
11
+ - bug: improve robustness of text extraction
12
+ - thanks to Evan Arnold for reporting
13
+ - bug: fix loading of nested resources on XObjects
14
+ - thanks to Samuel Williams for reporting
15
+ - bug: improve parsing of files with XRef object streams
16
+
17
+ v0.9.1 (21st December 2010)
18
+ - force gem to only install on ruby 1.8.7 or higher
19
+ - maintaining support for earlier versions takes more time than I have
20
+ available at the moment
21
+ - bug: fix parsing of obscure pdf name format
22
+ - bug: fix behaviour when loaded in conjunction with htmldoc gem
23
+
24
+ v0.9.0 (19th November 2010)
25
+ - support for pdf 1.5+ files that use object and xref streams
26
+ - support streams that use a flate filter with the predictor option
27
+ - ensure all content instructions are parsed when split over multiple streams
28
+ - thanks to Jack Rusher for reporting
29
+ - Various string parsing bugs
30
+ - some character conversions to utf-8 were failing (thanks Andrea Barisani)
31
+ - hashes with nested hex strings were tokenising wrongly (thanks Evan Arnold)
32
+ - escaping bug in tokenising of literal strings (thanks David Westerink)
33
+ - Fix a bug that prevented PDFs with white space after the EOF marker from loading
34
+ - thanks to Solomon White for reporting the issue
35
+ - Add support for de-filtering some LZW compressed streams
36
+ - thanks to Jose Ignacio Rubio Iradi for the patch
37
+ - some small speed improvements
38
+ - API CHANGE: PDF::Hash renamed to PDF::Reader::ObjectHash
39
+ - having a class named Hash was confusing for users
40
+
41
+ v0.8.6 (27th August 2010)
42
+ - new method: hash#page_references
43
+ - returns references to all page objects, gives rapid access to objects
44
+ for a given page
45
+
46
+ v0.8.5 (11th April 2010)
47
+ - fix a regression introduced in 0.8.4.
48
+ - Parameters passed to resource_font callback were inadvertently changed
49
+
50
+ v0.8.4 (30th March 2010)
51
+ - fix parsing of files that use Form XObjects
52
+ - thanks to Andrea Barisani for reporting the issue
53
+ - fix two issues that caused a small number of characters to convert to Unicode
54
+ incorrectly
55
+ - thanks to Andrea Barisani for reporting the issue
56
+ - require 'pdf-reader' now works as well as 'pdf/reader'
57
+ - good practice to have the require file match the gem name
58
+ - thanks to Chris O'Meara for highlighting this
59
+
60
+ v0.8.3 (14th February 2010)
61
+ - Fix a bug in tokenising of hex strings inside dictionaries
62
+ - Thanks to Brad Ediger for detecting the issue and proposing a solution
63
+
64
+ v0.8.2 (1st January 2010)
65
+ - Fix parsing of files that use Form XObjects behind an indirect reference
66
+ (thanks Cornelius Illi and Patrick Crosby)
67
+ - Rewrote Buffer class to fix various speed issues reported over the years
68
+ - On my sample file extracting full text reduced from 220 seconds to 9 seconds.
69
+
70
+ v0.8.1 (27th November 2009)
71
+ - Added PDF::Hash#version. Provides access to the source file PDF version
72
+
73
+ v0.8.0 (20th November 2009)
74
+ - Added PDF::Hash. It provides direct access to objects from a PDF file
75
+ with an API that emulates the standard Ruby hash
76
+
77
+ v0.7.7 (11th September 2009)
78
+ - Trigger callbacks contained in Form XObjects when we encounter them in a
79
+ content stream
80
+ - Fix inheritance of page resources to comply with section 3.6.2
81
+
82
+ v0.7.6 (28th August 2009)
83
+ - Various bug fixes that increase the files we can successfully parse
84
+ - Treat float and integer tokens differently (thanks Neil)
85
+ - Correctly handle PDFs where the Kids element of a Pages dict is an indirect
86
+ reference (thanks Rob Holland)
87
+ - Fix conversion of PDF strings to Ruby strings on 1.8.6 (thanks Andrès Koetsier)
88
+ - Fix decoding with ASCII85 and ASCIIHex filters (thanks Andrès Koetsier)
89
+ - Fix extracting inline images from content streams (thanks Andrès Koetsier)
90
+ - Fix extracting [ ] from content streams (thanks Christian Rishøj)
91
+ - Fix conversion of text to UTF8 when the cmap uses bfrange (thanks Federico Gonzalez Lutteroth)
92
+
93
+ v0.7.5 (27th August 2008)
94
+ - Fix a 1.8.7ism
95
+
96
+ v0.7.4 (7th August 2008)
97
+ - Raise a MalformedPDFError if a content stream contains an unterminated string
98
+ - Fix a bug that was causing an endless loop on some OSX systems
99
+ - valid strings were incorrectly thought to be unterminated
100
+ - thanks to Jeff Webb for playing email ping pong with me as I tracked this
101
+ issue down
102
+
103
+ v0.7.3 (11th June 2008)
104
+ - Add a high level way to get direct access to a PDF object, including a new executable: pdf_object
105
+ - Fix a hard loop bug caused by a content stream that is missing a final operator
106
+ - Significantly simplified the internal code for encoding conversions
107
+ - Fixes YACC parsing bug that occurs on Fedora 8's ruby VM
108
+ - New callbacks
109
+ - page_count
110
+ - pdf_version
111
+ - Fix a bug that prevented a font's BaseFont from being recorded correctly
112
+
113
+ v0.7.2 (20th May 2008)
114
+ - Throw an UnsupportedFeatureError if we try to open an encrypted/secure PDF
115
+ - Correctly handle page content instruction sets with trailing whitespace
116
+ - Represent PDF Streams with a new object, PDF::Reader::Stream
117
+ - there really wasn't any point in separating the stream content from its associated dict. You need both
118
+ parts to correctly interpret the content
119
+
120
+ v0.7.1 (6th May 2008)
121
+ - Non-page strings (ie. metadata, etc) are now converted to UTF-8 more accurately
122
+ - Fixed a regression between 0.6.2 and 0.7 that prevented difference tables from being applied
123
+ correctly when translating text into UTF-8
124
+
125
+ v0.7 (6th May 2008)
126
+ - API INCOMPATIBLE CHANGE: any hashes that are passed to callbacks use symbols as keys instead of PDF::Reader::Name instances.
127
+ - Improved support for converting text in some PDF files to unicode
128
+ - Behave as expected if the Contents key in a Page Dict is a reference
129
+ - Include some basic metadata callbacks
130
+ - Don't interpret a comment token (%) inside a string as a comment
131
+ - Small fixes to improve 1.9 compatibility
132
+ - Improved our Zlib deflating to make it slightly more robust - still some more issues to work out though
133
+ - Throw an UnsupportedFeatureError if a pdf that uses XRef streams is opened
134
+ - Added an option to PDF::Reader#file and PDF::Reader#string to enable parsing of only parts of a PDF file (ie. only metadata, etc)
135
+
136
+ v0.6.2 (22nd March 2008)
137
+ - Catch low level errors when applying filters to a content stream and raise a MalformedPDFError instead.
138
+ - Added support for processing inline images
139
+ - Support for parsing XRef tables that have multiple subsections
140
+ - Added a few callbacks to improve the way we supply information on page resources
141
+ - Ignore whitespace in hex strings, as required by the spec (section 3.2.3)
142
+ - Use our "unknown character box" when a single character in an Identity-H string fails to decode
143
+ - Support ToUnicode CMaps that use the bfrange operator
144
+ - Tweaked tokenising code to ensure whitespace doesn't get in the way
145
+
146
+ v0.6.1 (12th March 2008)
147
+ - Tweaked behaviour when we encounter Identity-H encoded text that doesn't have a ToUnicode mapping. We
148
+ just replace each character with a little box.
149
+ - Use the same little box when invalid characters are found in other encodings instead of throwing an ugly
150
+ NoMethodError.
151
+ - Added a method to RegisterReceiver that returns all occurrences of a callback
152
+
153
+ v0.6.0 (27th February 2008)
154
+ - all text is now transparently converted to UTF-8 before being passed to the callbacks.
155
+ before this version, text was just passed as a byte level copy of what was in the PDF file, which
156
+ was mildly annoying with some encodings, and resulted in garbled text for Unicode encoded text.
157
+ - Fonts that use a difference table are now handled correctly
158
+ - fixed some 1.9 incompatible syntax
159
+ - expanded RegisterReceiver class to record extra info
160
+ - expanded rspec coverage
161
+ - tweaked a README example
162
+
163
+ v0.5.1 (1st January 2008)
164
+ - Several documentation tweaks
165
+ - Improve support for parsing PDFs under windows (thanks to Jari Williamsson)
166
+
167
+ v0.5 (14th December 2007)
168
+ - Initial Release
data/MIT-LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ Copyright (c) 2009 Peter Jones
2
+ Copyright (c) 2009 James Healy
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining
5
+ a copy of this software and associated documentation files (the
6
+ "Software"), to deal in the Software without restriction, including
7
+ without limitation the rights to use, copy, modify, merge, publish,
8
+ distribute, sublicense, and/or sell copies of the Software, and to
9
+ permit persons to whom the Software is furnished to do so, subject to
10
+ the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be
13
+ included in all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,137 @@
1
+ The PDF::Reader library implements a PDF parser conforming as much as possible
2
+ to the PDF specification from Adobe.
3
+
4
+ It provides programmatic access to the contents of a PDF file with a high
5
+ degree of flexibility.
6
+
7
+ The PDF 1.7 specification is a weighty document and not all aspects are
8
+ currently supported. I welcome submission of PDF files that exhibit
9
+ unsupported aspects of the spec to assist with improving our support.
10
+
11
+ = Installation
12
+
13
+ The recommended installation method is via Rubygems.
14
+
15
+ gem install pdf-reader
16
+
17
+ = Usage
18
+
19
+ Begin by creating a PDF::Reader instance that points to a PDF file. Document
20
+ level information (metadata, page count, bookmarks, etc) is available via
21
+ this object.
22
+
23
+ reader = PDF::Reader.new("somefile.pdf")
24
+
25
+ puts reader.pdf_version
26
+ puts reader.info
27
+ puts reader.metadata
28
+ puts reader.page_count
29
+
30
+ PDF::Reader.new can accept an IO stream or a filename. Here's an example with
31
+ an IO stream:
32
+
33
+ require 'open-uri'
34
+
35
+ io = open('http://example.com/somefile.pdf')
36
+ reader = PDF::Reader.new(io)
37
+ puts reader.info
38
+
39
+ PDF is a page based file format, so most visible information is available via
40
+ page-based iteration
41
+
42
+ reader = PDF::Reader.new("somefile.pdf")
43
+
44
+ reader.pages.each do |page|
45
+ puts page.fonts
46
+ puts page.text
47
+ puts page.raw_content
48
+ end
49
+
50
+ If you need to access the full program for rendering a page, use the walk() method
51
+ of PDF::Reader::Page.
52
+
53
+ class RedGreenBlue
54
+ def set_rgb_color_for_nonstroking(r, g, b)
55
+ puts "R: #{r}, G: #{g}, B: #{b}"
56
+ end
57
+ end
58
+
59
+ reader = PDF::Reader.new("somefile.pdf")
60
+ page = reader.page(1)
61
+ receiver = RedGreenBlue.new
62
+ page.walk(receiver)
63
+
64
+ For low level access to the objects in a PDF file, use the ObjectHash class. You can
65
+ build an ObjectHash instance directly:
66
+
67
+ puts PDF::Reader::ObjectHash.new("somefile.pdf")
68
+
69
+ or via a PDF::Reader instance:
70
+
71
+ reader = PDF::Reader.new("somefile.pdf")
72
+ puts reader.objects
73
+
74
+ The second method is preferred to increase the effectiveness of internal caching.
75
+
76
+ = Text Encoding
77
+
78
+ Internally, text can be stored inside a PDF in various encodings, including
79
+ ZapfDingbats, win-1252, mac roman and a form of Unicode. To avoid confusion, all
80
+ text will be converted to UTF-8 before it is passed back from PDF::Reader.
81
+
82
+ Strings that contain binary data (like font blobs) will be marked as such on
83
+ M17N aware VMs.
84
+
85
+ = Exceptions
86
+
87
+ There are two key exceptions that you will need to watch out for when processing a
88
+ PDF file:
89
+
90
+ MalformedPDFError - The PDF appears to be corrupt in some way. If you believe the
91
+ file should be valid, or that a corrupt file didn't raise an exception, please
92
+ forward a copy of the file to the maintainers (preferably via the google group)
93
+ and we can attempt to improve the code.
94
+
95
+ UnsupportedFeatureError - The PDF uses a feature that PDF::Reader doesn't currently
96
+ support. Again, we welcome submissions of PDF files that exhibit these features to help
97
+ us with future code improvements.
98
+
99
+ MalformedPDFError has some subclasses if you want to detect finer grained issues. If you
100
+ don't, 'rescue MalformedPDFError' will catch all the subclassed errors as well.
101
+
102
+ Any other exceptions should be considered bugs in PDF::Reader (please
103
+ report them!).
104
+
105
+ = Maintainers
106
+
107
+ - James Healy <mailto:jimmy@deefa.com>
108
+
109
+ = Licensing
110
+
111
+ This library is distributed under the terms of the MIT License. See the included MIT-LICENSE file for
112
+ more detail.
113
+
114
+ = Mailing List
115
+
116
+ Any questions or feedback should be sent to the PDF::Reader google group. It's
117
+ better that any answers be available for others instead of hiding in someone's
118
+ inbox.
119
+
120
+ http://groups.google.com/group/pdf-reader
121
+
122
+ = Examples
123
+
124
+ The easiest way to explain how this works in practice is to show some examples.
125
+ Check out the examples/ directory for a few files.
126
+
127
+ = Known Limitations
128
+
129
+ Occasionally some text cannot be extracted properly due to the way it has been
130
+ stored, or the use of invalid bytes. In these cases PDF::Reader will output a
131
+ little UTF-8 friendly box to indicate an unrecognisable character.
132
+
133
+ = Resources
134
+
135
+ - PDF::Reader Code Repository: http://github.com/yob/pdf-reader
136
+ - PDF Specification: http://www.adobe.com/devnet/pdf/pdf_reference.html
137
+ - PDF Tutorial Slide Presentations: http://home.comcast.net/~jk05/presentations/PDFTutorials.html
data/Rakefile ADDED
@@ -0,0 +1,34 @@
1
+ require "rubygems"
2
+ require "bundler"
3
+ Bundler.setup
4
+
5
+ require 'rake'
6
+ require 'rake/rdoctask'
7
+ require 'rspec/core/rake_task'
8
+ require 'roodi'
9
+ require 'roodi_task'
10
+
11
+ desc "Default Task"
12
+ task :default => [ :spec ]
13
+
14
+ # run all rspecs
15
+ desc "Run all rspec files"
16
+ RSpec::Core::RakeTask.new("spec") do |t|
17
+ t.rspec_opts = ["--color", "--format progress"]
18
+ t.ruby_opts = "-w"
19
+ end
20
+
21
+ # Generate the RDoc documentation
22
+ desc "Create documentation"
23
+ Rake::RDocTask.new("doc") do |rdoc|
24
+ rdoc.title = "pdf-reader"
25
+ rdoc.rdoc_dir = (ENV['CC_BUILD_ARTIFACTS'] || 'doc') + '/rdoc'
26
+ rdoc.rdoc_files.include('README.rdoc')
27
+ rdoc.rdoc_files.include('TODO')
28
+ rdoc.rdoc_files.include('CHANGELOG')
29
+ rdoc.rdoc_files.include('MIT-LICENSE')
30
+ rdoc.rdoc_files.include('lib/**/*.rb')
31
+ rdoc.options << "--inline-source"
32
+ end
33
+
34
+ RoodiTask.new 'roodi', ['lib/**/*.rb']
data/TODO ADDED
@@ -0,0 +1,45 @@
1
+ v0.8
2
+ - add extra callbacks
3
+ - list implemented features
4
+ - encrypted? tagged? bookmarks? annotated? optimised?
5
+ - Allow more than just page content and metadata to be parsed (see spec section 3.6.1)
6
+ - bookmarks?
7
+ - outline?
8
+ - articles?
9
+ - viewer prefs?
10
+ - Don't remove comment when tokenising in the middle of a string
11
+ - Tweak encoding mappings to differentiate between bytes that are invalid for an encoding, and bytes that are unchanged.
12
+ poppler seems to do this in a quite reasonable way. Original Encoding -> Glyph Names -> Unicode. As of 0.6 we go straight
13
+ from the Original encoding to Unicode.
14
+ - detect when a font's encoding is a CMap (generally used for pre-Unicode, multibyte asian encodings), and display a user friendly error
15
+ - Improve interpretation of non content stream data (ie metadata). recognise dates, etc
16
+ - Fix inheritance of page attributes. Resources has been done, but plenty of other attributes
17
+ are inheritable. See table 3.2.7 in the spec
18
+
19
+ v0.9
20
+ - Add a way to extract raster images
21
+ - see XObjects section of spec (section 4.7)
22
+ - Add a way to extract font data?
23
+
24
+ Sometime
25
+ - Support for CJK text (convert to UTF-8 like all other encodings. See Section 5.9 of the PDF spec)
26
+ - Will require significantly improved handling of CMaps, including creating a bunch of predefined ones
27
+
28
+ - Work out why specs/data/zlib*.pdf isn't parsed correctly when all the major PDF viewers can display it correctly
29
+
30
+ - Ship some extra receivers in the standard package, particularly ones that are useful for running
31
+ rspec over generated PDF files
32
+
33
+ - When we encounter Identity-H encoded text with no ToUnicode CMap, render the glyphs and treat them as images, as there's no
34
+ sensible way to convert them to unicode
35
+
36
+ - Add support for additional filters: ASCIIHexDecode, ASCII85Decode, LZWDecode, RunLengthDecode, CCITTFaxDecode, JBIG2Decode, DCTDecode, JPXDecode, Crypt?
37
+
38
+ - Add support for additional encodings:
39
+ - Identity-V (I *think* this relates to vertical text. Not sure how we'd support it sensibly)
40
+
41
+ - Investigate how R->L text is handled
42
+
43
+ - fix all callbacks to only ever return basic ruby objects (strings, ints,
44
+ arrays, symbols, hashes, etc). No PDF::Reader::Reference or
45
+ PDF::Reader::Font, etc.
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rubygems'
4
+
5
+ $LOAD_PATH.unshift(File.dirname(__FILE__) + "/../lib")
6
+
7
+ require 'pdf/reader'
8
+
9
+ receiver = PDF::Reader::PrintReceiver.new
10
+
11
+ if ARGV.empty?
12
+ PDF::Reader.new.parse($stdin, receiver)
13
+ else
14
+ PDF::Reader.file(ARGV[0], receiver)
15
+ end
data/bin/pdf_object ADDED
@@ -0,0 +1,48 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rubygems'
4
+
5
+ $LOAD_PATH.unshift(File.dirname(__FILE__) + "/../lib")
6
+
7
+ USAGE = "USAGE: " + File.basename(__FILE__) + " <file> <object id> [generation]"
8
+
9
+ require 'pdf/reader'
10
+
11
+ filename, id, gen = *ARGV
12
+
13
+ if filename.nil? || id.nil?
14
+ puts USAGE
15
+ exit 1
16
+ elsif !File.file?(filename)
17
+ $stderr.puts "#{filename} does not exist"
18
+ exit 1
19
+ end
20
+
21
+ # tweak the user's options
22
+ id = id.to_i
23
+ gen ||= 0
24
+ gen = gen.to_i
25
+
26
+ # make magic happen
27
+ begin
28
+ obj = PDF::Reader.object_file(filename, id, gen)
29
+
30
+ case obj
31
+ when Hash, Array
32
+ puts obj.inspect
33
+ when PDF::Reader::Stream
34
+ puts obj.hash.inspect
35
+ puts obj.unfiltered_data
36
+ else
37
+ puts obj
38
+ end
39
+ rescue PDF::Reader::InvalidObjectError
40
+ $stderr.puts "Error retreiving object #{id}, gen #{gen}. Does it exist?"
41
+ exit 1
42
+ rescue PDF::Reader::MalformedPDFError => e
43
+ $stderr.puts "Malformed PDF file: #{e.message}"
44
+ exit 1
45
+ rescue PDF::Reader::UnsupportedFeatureError => e
46
+ $stderr.puts "PDF file implements a feature unsupported by PDF::Reader: #{e.message}"
47
+ exit 1
48
+ end
data/bin/pdf_text ADDED
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rubygems'
4
+ $LOAD_PATH.unshift(File.dirname(__FILE__) + "/../lib")
5
+
6
+ require 'pdf/reader'
7
+
8
+ if ARGV.empty?
9
+ browser = PDF::Reader.new($stdin)
10
+ else
11
+ browser = PDF::Reader.new(ARGV[0])
12
+ end
13
+ browser.pages.each do |page|
14
+ puts page.text
15
+ end
@@ -0,0 +1,21 @@
1
+ #!/usr/bin/env ruby
2
+ # coding: utf-8
3
+
4
+ # List all callbacks generated by each page
5
+ #
6
+ # WARNING: this will generate a *lot* of output, so you probably want to pipe
7
+ # it through less or to a text file.
8
+
9
+ require 'rubygems'
10
+ require 'pdf/reader'
11
+
12
+ receiver = PDF::Reader::RegisterReceiver.new
13
+
14
+ PDF::Reader.open("somefile.pdf") do |reader|
15
+ reader.pages.each do |page|
16
+ page.walk(receiver)
17
+ receiver.callbacks.each do |cb|
18
+ puts cb
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,49 @@
1
+ #!/usr/bin/env ruby
2
+ # coding: utf-8
3
+
4
+ # A sample script that attempts to extract bates numbers from a PDF file.
5
+ # Bates numbers are often used to markup documents being used in legal
6
+ # cases. For more info, see http://en.wikipedia.org/wiki/Bates_numbering
7
+ #
8
+ # Acrobat 9 introduced a markup syntax that directly specifies the bates
9
+ # number for each page. For earlier versions, the easiest way to find
10
+ # the number is to look for words that match a pattern.
11
+ #
12
+ # This example attempts to extract numbers using the Acrobat 9 syntax.
13
+ # As a fall back, you can use a regular expression to look for words
14
+ # that match the numbers you expect in the page content.
15
+
16
+ require 'rubygems'
17
+ require 'pdf/reader'
18
+
19
+ class BatesReceiver
20
+
21
+ attr_reader :numbers
22
+
23
+ def initialize
24
+ @numbers = []
25
+ end
26
+
27
+ def begin_marked_content(*args)
28
+ return unless args.size >= 2
29
+ return unless args.first == :Artifact
30
+ return unless args[1][:Subtype] == :BatesN
31
+
32
+ @numbers << args[1][:Contents]
33
+ end
34
+ alias :begin_marked_content_with_pl :begin_marked_content
35
+
36
+ end
37
+
38
+
39
+ PDF::Reader.open("bates.pdf") do |reader|
40
+ reader.pages.each do |page|
41
+ receiver = BatesReceiver.new
42
+ page.walk(receiver)
43
+ if receiver.numbers.empty?
44
+ puts page.scan(/CC.+/)
45
+ else
46
+ puts receiver.numbers.inspect
47
+ end
48
+ end
49
+ end
@@ -0,0 +1,108 @@
1
+ # coding: utf-8
2
+
3
+ # This demonstrates a way to extract some images (those based on the JPG or
4
+ # TIFF formats) from a PDF. There are other ways to store images, so
5
+ # it may need to be expanded for real world usage, but it should serve
6
+ # as a good guide.
7
+ #
8
+ # Thanks to Jack Rusher for the initial version of this example.
9
+ #
10
+ # USAGE:
11
+ #
12
+ # ruby extract_images.rb somefile.pdf
13
+
14
+ require 'pdf/reader'
15
+
16
+ module ExtractImages
17
+
18
+ class Receiver
19
+ attr_reader :count
20
+
21
+ def initialize
22
+ @count = 0
23
+ end
24
+
25
+ def resource_xobject(name, stream)
26
+ return unless stream.hash[:Subtype] == :Image
27
+ increment_count
28
+
29
+ case stream.hash[:Filter]
30
+ when :CCITTFaxDecode
31
+ ExtractImages::Tiff.new(stream).save("#{count}-#{name}.tif")
32
+ when :DCTDecode
33
+ ExtractImages::Jpg.new(stream).save("#{count}-#{name}.jpg")
34
+ else
35
+ $stderr.puts "unrecognized image filter '#{stream.hash[:Filter]}'!"
36
+ end
37
+ end
38
+
39
+ def increment_count
40
+ @count += 1
41
+ end
42
+ private :increment_count
43
+
44
+ end
45
+
46
+ class Jpg
47
+ attr_reader :stream
48
+
49
+ def initialize(stream)
50
+ @stream = stream
51
+ end
52
+
53
+ def save(filename)
54
+ w = stream.hash[:Width]
55
+ h = stream.hash[:Height]
56
+ puts "#{filename}: h=#{h}, w=#{w}"
57
+ File.open(filename, "wb") { |file| file.write stream.data }
58
+ end
59
+ end
60
+
61
+ class Tiff
62
+ attr_reader :stream
63
+
64
+ def initialize(stream)
65
+ @stream = stream
66
+ end
67
+
68
+ def save(filename)
69
+ if stream.hash[:DecodeParms][:K] <= 0
70
+ save_group_four(filename)
71
+ else
72
+ $stderr.puts "#{filename}: CCITT non-group 4/2D image."
73
+ end
74
+ end
75
+
76
+ private
77
+
78
+ # Group 4, 2D
79
+ def save_group_four(filename)
80
+ k = stream.hash[:DecodeParms][:K]
81
+ h = stream.hash[:Height]
82
+ w = stream.hash[:Width]
83
+ bpc = stream.hash[:BitsPerComponent]
84
+ mask = stream.hash[:ImageMask]
85
+ len = stream.hash[:Length]
86
+ cols = stream.hash[:DecodeParms][:Columns]
87
+ puts "#{filename}: h=#{h}, w=#{w}, bpc=#{bpc}, mask=#{mask}, len=#{len}, cols=#{cols}, k=#{k}"
88
+
89
+ # Synthesize a TIFF header
90
+ long_tag = lambda {|tag, value| [ tag, 4, 1, value ].pack( "ssII" ) }
91
+ short_tag = lambda {|tag, value| [ tag, 3, 1, value ].pack( "ssII" ) }
92
+ # header = byte order, version magic, offset of directory, directory count,
93
+ # followed by a series of tags containing metadata: 259 is a magic number for
94
+ # the compression type; 273 is the offset of the image data.
95
+ tiff = [ 73, 73, 42, 8, 5 ].pack("ccsIs") \
96
+ + short_tag.call( 256, cols ) \
97
+ + short_tag.call( 257, h ) \
98
+ + short_tag.call( 259, 4 ) \
99
+ + long_tag.call( 273, (10 + (5*12)) ) \
100
+ + long_tag.call( 279, len) \
101
+ + stream.data
102
+ File.open(filename, "wb") { |file| file.write tiff }
103
+ end
104
+ end
105
+ end
106
+
107
+ receiver = ExtractImages::Receiver.new
108
+ PDF::Reader.file(ARGV[0], receiver)
data/examples/hash.rb ADDED
@@ -0,0 +1,12 @@
1
+ #!/usr/bin/env ruby
2
+ # coding: utf-8
3
+
4
+ # get direct access to PDF objects
5
+ #
6
+ $LOAD_PATH.unshift(File.dirname(__FILE__) + '/../lib')
7
+
8
+ require 'pdf/reader'
9
+
10
+ filename = File.dirname(__FILE__) + "/../specs/data/cairo-unicode.pdf"
11
+ hash = PDF::Reader::ObjectHash.new(filename)
12
+ puts hash[3]