panjiva-pdf-reader 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +222 -0
- data/MIT-LICENSE +21 -0
- data/README.rdoc +192 -0
- data/Rakefile +93 -0
- data/TODO +34 -0
- data/bin/pdf_callbacks +23 -0
- data/bin/pdf_list_callbacks +17 -0
- data/bin/pdf_object +48 -0
- data/bin/pdf_text +13 -0
- data/examples/callbacks.rb +22 -0
- data/examples/extract_bates.rb +50 -0
- data/examples/extract_fonts.rb +77 -0
- data/examples/extract_images.rb +231 -0
- data/examples/hash.rb +12 -0
- data/examples/metadata.rb +14 -0
- data/examples/page_count.rb +13 -0
- data/examples/rspec.rb +33 -0
- data/examples/text.rb +15 -0
- data/examples/version.rb +13 -0
- data/lib/pdf-reader.rb +3 -0
- data/lib/pdf/hash.rb +19 -0
- data/lib/pdf/reader.rb +374 -0
- data/lib/pdf/reader/abstract_strategy.rb +81 -0
- data/lib/pdf/reader/afm/Courier-Bold.afm +342 -0
- data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -0
- data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -0
- data/lib/pdf/reader/afm/Courier.afm +342 -0
- data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -0
- data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -0
- data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -0
- data/lib/pdf/reader/afm/Helvetica.afm +3051 -0
- data/lib/pdf/reader/afm/Symbol.afm +213 -0
- data/lib/pdf/reader/afm/Times-Bold.afm +2588 -0
- data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -0
- data/lib/pdf/reader/afm/Times-Italic.afm +2667 -0
- data/lib/pdf/reader/afm/Times-Roman.afm +2419 -0
- data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -0
- data/lib/pdf/reader/buffer.rb +389 -0
- data/lib/pdf/reader/cmap.rb +161 -0
- data/lib/pdf/reader/encoding.rb +349 -0
- data/lib/pdf/reader/encodings/mac_expert.txt +159 -0
- data/lib/pdf/reader/encodings/mac_roman.txt +128 -0
- data/lib/pdf/reader/encodings/pdf_doc.txt +40 -0
- data/lib/pdf/reader/encodings/standard.txt +47 -0
- data/lib/pdf/reader/encodings/symbol.txt +154 -0
- data/lib/pdf/reader/encodings/win_ansi.txt +29 -0
- data/lib/pdf/reader/encodings/zapf_dingbats.txt +201 -0
- data/lib/pdf/reader/error.rb +69 -0
- data/lib/pdf/reader/filter.rb +58 -0
- data/lib/pdf/reader/filter/ascii85.rb +27 -0
- data/lib/pdf/reader/filter/ascii_hex.rb +28 -0
- data/lib/pdf/reader/filter/depredict.rb +140 -0
- data/lib/pdf/reader/filter/flate.rb +40 -0
- data/lib/pdf/reader/filter/lzw.rb +19 -0
- data/lib/pdf/reader/filter/null.rb +16 -0
- data/lib/pdf/reader/filter/run_length.rb +47 -0
- data/lib/pdf/reader/font.rb +208 -0
- data/lib/pdf/reader/font_descriptor.rb +80 -0
- data/lib/pdf/reader/form_xobject.rb +107 -0
- data/lib/pdf/reader/glyph_hash.rb +96 -0
- data/lib/pdf/reader/glyphlist.txt +4405 -0
- data/lib/pdf/reader/lzw.rb +126 -0
- data/lib/pdf/reader/metadata_strategy.rb +56 -0
- data/lib/pdf/reader/object_cache.rb +108 -0
- data/lib/pdf/reader/object_hash.rb +350 -0
- data/lib/pdf/reader/object_stream.rb +51 -0
- data/lib/pdf/reader/page.rb +178 -0
- data/lib/pdf/reader/page_layout.rb +125 -0
- data/lib/pdf/reader/page_receiver.rb +412 -0
- data/lib/pdf/reader/page_text_receiver.rb +51 -0
- data/lib/pdf/reader/pages_strategy.rb +488 -0
- data/lib/pdf/reader/parser.rb +219 -0
- data/lib/pdf/reader/print_receiver.rb +23 -0
- data/lib/pdf/reader/reference.rb +68 -0
- data/lib/pdf/reader/register_receiver.rb +95 -0
- data/lib/pdf/reader/resource_methods.rb +92 -0
- data/lib/pdf/reader/standard_security_handler.rb +190 -0
- data/lib/pdf/reader/stream.rb +71 -0
- data/lib/pdf/reader/synchronized_cache.rb +32 -0
- data/lib/pdf/reader/text_receiver.rb +265 -0
- data/lib/pdf/reader/text_run.rb +80 -0
- data/lib/pdf/reader/token.rb +43 -0
- data/lib/pdf/reader/transformation_matrix.rb +239 -0
- data/lib/pdf/reader/width_calculator.rb +11 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +58 -0
- data/lib/pdf/reader/width_calculator/composite.rb +109 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +63 -0
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +39 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +30 -0
- data/lib/pdf/reader/xref.rb +246 -0
- metadata +358 -0
data/CHANGELOG
ADDED
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
v1.3.0 (30th December 2012)
|
|
2
|
+
- Numerous performance optimisations (thanks Alex Dowad)
|
|
3
|
+
- Improved text extraction (thanks Nathaniel Madura)
|
|
4
|
+
- Load less of the hashery gem to reduce core monkey patches
|
|
5
|
+
- various bug fixes
|
|
6
|
+
|
|
7
|
+
v1.2.0 (28th August 2012)
|
|
8
|
+
- Feature: correctly extract text using surrogate pairs and ligatures
|
|
9
|
+
(thanks Nathaniel Madura)
|
|
10
|
+
- Speed optimisation: cache tokenised Form XObjects to avoid re-parsing them
|
|
11
|
+
- Feature: support opening documents with some junk bytes prepended to file
|
|
12
|
+
(thanks Paul Gallagher)
|
|
13
|
+
- Acrobat does this, so it seemed reasonable to add support
|
|
14
|
+
|
|
15
|
+
v1.1.1 (9th May 2012)
|
|
16
|
+
- bugfix release to improve parsing of some PDFs
|
|
17
|
+
|
|
18
|
+
v1.1.0 (25th March 2012)
|
|
19
|
+
- new PageState class for handling common state tracking in page receivers
|
|
20
|
+
- see PageTextReceiver for example usage
|
|
21
|
+
- various bugfixes to support reading more PDF dialects
|
|
22
|
+
|
|
23
|
+
v1.0.0 (16th January 2012)
|
|
24
|
+
- support a new encryption variation
|
|
25
|
+
- bugfix in PageTextRender (thanks Paul Gallagher)
|
|
26
|
+
|
|
27
|
+
v1.0.0.rc1 (19th December 2011)
|
|
28
|
+
- performance optimisations (all by Bernerd Schaefer)
|
|
29
|
+
- some improvements to text extraction from form xobjects
|
|
30
|
+
- assume invalid font encodings are StandardEncoding
|
|
31
|
+
- use binary mode when opening PDFs to stop ruby being helpful and transcoding
|
|
32
|
+
bytes for us
|
|
33
|
+
|
|
34
|
+
v1.0.0.beta1 (6th October 2011)
|
|
35
|
+
- ensure inline images that contain "EI" are correctly parsed
|
|
36
|
+
(thanks Bernerd Schaefer)
|
|
37
|
+
- fix parsing of inline image data
|
|
38
|
+
|
|
39
|
+
v0.12.0.alpha (28th August 2011)
|
|
40
|
+
- small breaking changes to the page-based API - it's alpha for a reason
|
|
41
|
+
- resource related methods on Page object return raw PDF objects
|
|
42
|
+
- callers who want the resources wrapped in a more convenient
|
|
43
|
+
Ruby object (like PDF::Reader::Font or PDF::Reader::FormXObject), they will
|
|
44
|
+
need to do so themselves
|
|
45
|
+
- add support for RunLengthDecode filters (thanks Bernerd Schaefer)
|
|
46
|
+
- add support for standard PDF encryption (thanks Evan Brunner)
|
|
47
|
+
- add support for decoding stream with TIFF prediction
|
|
48
|
+
- new PDF::Reader::FormXObject class to simplify working with form XObjects
|
|
49
|
+
|
|
50
|
+
v0.11.0.alpha (19th July 2011)
|
|
51
|
+
- introduce experimental new page-based API
|
|
52
|
+
- old API is deprecated but will continue to work with no warnings
|
|
53
|
+
- add transparent caching of common objects to ObjectHash
|
|
54
|
+
|
|
55
|
+
v0.10.0 (6th July 2011)
|
|
56
|
+
- support multiple receivers within a single pass over a source file
|
|
57
|
+
- massive time saving when dealing with multiple receivers
|
|
58
|
+
|
|
59
|
+
v0.9.3 (2nd July 2011)
|
|
60
|
+
- add PDF::Reader::Reference#hash method
|
|
61
|
+
- improves behaviour of Reference objects when they're used as Hash keys
|
|
62
|
+
|
|
63
|
+
v0.9.2 (24th April 2011)
|
|
64
|
+
- add basic support for fonts with Identity-V encoding.
|
|
65
|
+
- bug: improve robustness of text extraction
|
|
66
|
+
- thanks to Evan Arnold for reporting
|
|
67
|
+
- bug: fix loading of nested resources on XObjects
|
|
68
|
+
- thanks to Samuel Williams for reporting
|
|
69
|
+
- bug: improve parsing of files with XRef object streams
|
|
70
|
+
|
|
71
|
+
v0.9.1 (21st December 2010)
|
|
72
|
+
- force gem to only install on ruby 1.8.7 or higher
|
|
73
|
+
- maintaining support for earlier versions takes more time than I have
|
|
74
|
+
available at the moment
|
|
75
|
+
- bug: fix parsing of obscure pdf name format
|
|
76
|
+
- bug: fix behaviour when loaded in conjunction with htmldoc gem
|
|
77
|
+
|
|
78
|
+
v0.9.0 (19th November 2010)
|
|
79
|
+
- support for pdf 1.5+ files that use object and xref streams
|
|
80
|
+
- support streams that use a flate filter with the predictor option
|
|
81
|
+
- ensure all content instructions are parsed when split over multiple streams
|
|
82
|
+
- thanks to Jack Rusher for reporting
|
|
83
|
+
- Various string parsing bugs
|
|
84
|
+
- some character conversions to utf-8 were failing (thanks Andrea Barisani)
|
|
85
|
+
- hashes with nested hex strings were tokenising wrongly (thanks Evan Arnold)
|
|
86
|
+
- escaping bug in tokenising of literal strings (thanks David Westerink)
|
|
87
|
+
- Fix a bug that prevented PDFs with white space after the EOF marker from loading
|
|
88
|
+
- thanks to Solomon White for reporting the issue
|
|
89
|
+
- Add support for de-filtering some LZW compressed streams
|
|
90
|
+
- thanks to Jose Ignacio Rubio Iradi for the patch
|
|
91
|
+
- some small speed improvements
|
|
92
|
+
- API CHANGE: PDF::Hash renamed to PDF::Reader::ObjectHash
|
|
93
|
+
- having a class named Hash was confusing for users
|
|
94
|
+
|
|
95
|
+
v0.8.6 (27th August 2010)
|
|
96
|
+
- new method: hash#page_references
|
|
97
|
+
- returns references to all page objects, gives rapid access to objects
|
|
98
|
+
for a given page
|
|
99
|
+
|
|
100
|
+
v0.8.5 (11th April 2010)
|
|
101
|
+
- fix a regression introduced in 0.8.4.
|
|
102
|
+
- Parameters passed to resource_font callback were inadvertently changed
|
|
103
|
+
|
|
104
|
+
v0.8.4 (30th March 2010)
|
|
105
|
+
- fix parsing of files that use Form XObjects
|
|
106
|
+
- thanks to Andrea Barisani for reporting the issue
|
|
107
|
+
- fix two issues that caused a small number of characters to convert to Unicode
|
|
108
|
+
incorrectly
|
|
109
|
+
- thanks to Andrea Barisani for reporting the issue
|
|
110
|
+
- require 'pdf-reader' now works as well as 'pdf/reader'
|
|
111
|
+
- good practice to have the require file match the gem name
|
|
112
|
+
- thanks to Chris O'Meara for highlighting this
|
|
113
|
+
|
|
114
|
+
v0.8.3 (14th February 2010)
|
|
115
|
+
- Fix a bug in tokenising of hex strings inside dictionaries
|
|
116
|
+
- Thanks to Brad Ediger for detecting the issue and proposing a solution
|
|
117
|
+
|
|
118
|
+
v0.8.2 (1st January 2010)
|
|
119
|
+
- Fix parsing of files that use Form XObjects behind an indirect reference
|
|
120
|
+
(thanks Cornelius Illi and Patrick Crosby)
|
|
121
|
+
- Rewrote Buffer class to fix various speed issues reported over the years
|
|
122
|
+
- On my sample file extracting full text reduced from 220 seconds to 9 seconds.
|
|
123
|
+
|
|
124
|
+
v0.8.1 (27th November 2009)
|
|
125
|
+
- Added PDF::Hash#version. Provides access to the source file PDF version
|
|
126
|
+
|
|
127
|
+
v0.8.0 (20th November 2009)
|
|
128
|
+
- Added PDF::Hash. It provides direct access to objects from a PDF file
|
|
129
|
+
with an API that emulates the standard Ruby hash
|
|
130
|
+
|
|
131
|
+
v0.7.7 (11th September 2009)
|
|
132
|
+
- Trigger callbacks contained in Form XObjects when we encounter them in a
|
|
133
|
+
content stream
|
|
134
|
+
- Fix inheritance of page resources to comply with section 3.6.2
|
|
135
|
+
|
|
136
|
+
v0.7.6 (28th August 2009)
|
|
137
|
+
- Various bug fixes that increase the files we can successfully parse
|
|
138
|
+
- Treat float and integer tokens differently (thanks Neil)
|
|
139
|
+
- Correctly handle PDFs where the Kids element of a Pages dict is an indirect
|
|
140
|
+
reference (thanks Rob Holland)
|
|
141
|
+
- Fix conversion of PDF strings to Ruby strings on 1.8.6 (thanks Andrès Koetsier)
|
|
142
|
+
- Fix decoding with ASCII85 and ASCIIHex filters (thanks Andrès Koetsier)
|
|
143
|
+
- Fix extracting inline images from content streams (thanks Andrès Koetsier)
|
|
144
|
+
- Fix extracting [ ] from content streams (thanks Christian Rishøj)
|
|
145
|
+
- Fix conversion of text to UTF8 when the cmap uses bfrange (thanks Federico Gonzalez Lutteroth)
|
|
146
|
+
|
|
147
|
+
v0.7.5 (27th August 2008)
|
|
148
|
+
- Fix a 1.8.7ism
|
|
149
|
+
|
|
150
|
+
v0.7.4 (7th August 2008)
|
|
151
|
+
- Raise a MalformedPDFError if a content stream contains an unterminated string
|
|
152
|
+
- Fix a bug that was causing an endless loop on some OSX systems
|
|
153
|
+
- valid strings were incorrectly thought to be unterminated
|
|
154
|
+
- thanks to Jeff Webb for playing email ping pong with me as I tracked this
|
|
155
|
+
issue down
|
|
156
|
+
|
|
157
|
+
v0.7.3 (11th June 2008)
|
|
158
|
+
- Add a high level way to get direct access to a PDF object, including a new executable: pdf_object
|
|
159
|
+
- Fix a hard loop bug caused by a content stream that is missing a final operator
|
|
160
|
+
- Significantly simplified the internal code for encoding conversions
|
|
161
|
+
- Fixes YACC parsing bug that occurs on Fedora 8's ruby VM
|
|
162
|
+
- New callbacks
|
|
163
|
+
- page_count
|
|
164
|
+
- pdf_version
|
|
165
|
+
- Fix a bug that prevented a font's BaseFont from being recorded correctly
|
|
166
|
+
|
|
167
|
+
v0.7.2 (20th May 2008)
|
|
168
|
+
- Throw an UnsupportedFeatureError if we try to open an encrypted/secure PDF
|
|
169
|
+
- Correctly handle page content instruction sets with trailing whitespace
|
|
170
|
+
- Represent PDF Streams with a new object, PDF::Reader::Stream
|
|
171
|
+
- there really wasn't any point in separating the stream content from its associated dict. You need both
|
|
172
|
+
parts to correctly interpret the content
|
|
173
|
+
|
|
174
|
+
v0.7.1 (6th May 2008)
|
|
175
|
+
- Non-page strings (ie. metadata, etc) are now converted to UTF-8 more accurately
|
|
176
|
+
- Fixed a regression between 0.6.2 and 0.7 that prevented difference tables from being applied
|
|
177
|
+
correctly when translating text into UTF-8
|
|
178
|
+
|
|
179
|
+
v0.7 (6th May 2008)
|
|
180
|
+
- API INCOMPATIBLE CHANGE: any hashes that are passed to callbacks use symbols as keys instead of PDF::Reader::Name instances.
|
|
181
|
+
- Improved support for converting text in some PDF files to unicode
|
|
182
|
+
- Behave as expected if the Contents key in a Page Dict is a reference
|
|
183
|
+
- Include some basic metadata callbacks
|
|
184
|
+
- Don't interpret a comment token (%) inside a string as a comment
|
|
185
|
+
- Small fixes to improve 1.9 compatibility
|
|
186
|
+
- Improved our Zlib deflating to make it slightly more robust - still some more issues to work out though
|
|
187
|
+
- Throw an UnsupportedFeatureError if a pdf that uses XRef streams is opened
|
|
188
|
+
- Added an option to PDF::Reader#file and PDF::Reader#string to enable parsing of only parts of a PDF file (ie. only metadata, etc)
|
|
189
|
+
|
|
190
|
+
v0.6.2 (22nd March 2008)
|
|
191
|
+
- Catch low level errors when applying filters to a content stream and raise a MalformedPDFError instead.
|
|
192
|
+
- Added support for processing inline images
|
|
193
|
+
- Support for parsing XRef tables that have multiple subsections
|
|
194
|
+
- Added a few callbacks to improve the way we supply information on page resources
|
|
195
|
+
- Ignore whitespace in hex strings, as required by the spec (section 3.2.3)
|
|
196
|
+
- Use our "unknown character box" when a single character in an Identity-H string fails to decode
|
|
197
|
+
- Support ToUnicode CMaps that use the bfrange operator
|
|
198
|
+
- Tweaked tokenising code to ensure whitespace doesn't get in the way
|
|
199
|
+
|
|
200
|
+
v0.6.1 (12th March 2008)
|
|
201
|
+
- Tweaked behaviour when we encounter Identity-H encoded text that doesn't have a ToUnicode mapping. We
|
|
202
|
+
just replace each character with a little box.
|
|
203
|
+
- Use the same little box when invalid characters are found in other encodings instead of throwing an ugly
|
|
204
|
+
NoMethodError.
|
|
205
|
+
- Added a method to RegisterReceiver that returns all occurrences of a callback
|
|
206
|
+
|
|
207
|
+
v0.6.0 (27th February 2008)
|
|
208
|
+
- all text is now transparently converted to UTF-8 before being passed to the callbacks.
|
|
209
|
+
before this version, text was just passed as a byte level copy of what was in the PDF file, which
|
|
210
|
+
was mildly annoying with some encodings, and resulted in garbled text for Unicode encoded text.
|
|
211
|
+
- Fonts that use a difference table are now handled correctly
|
|
212
|
+
- fixed some 1.9 incompatible syntax
|
|
213
|
+
- expanded RegisterReceiver class to record extra info
|
|
214
|
+
- expanded rspec coverage
|
|
215
|
+
- tweaked a README example
|
|
216
|
+
|
|
217
|
+
v0.5.1 (1st January 2008)
|
|
218
|
+
- Several documentation tweaks
|
|
219
|
+
- Improve support for parsing PDFs under windows (thanks to Jari Williamsson)
|
|
220
|
+
|
|
221
|
+
v0.5 (14th December 2007)
|
|
222
|
+
- Initial Release
|
data/MIT-LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
Copyright (c) 2009 Peter Jones
|
|
2
|
+
Copyright (c) 2009 James Healy
|
|
3
|
+
|
|
4
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
|
5
|
+
a copy of this software and associated documentation files (the
|
|
6
|
+
"Software"), to deal in the Software without restriction, including
|
|
7
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
|
8
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
|
9
|
+
permit persons to whom the Software is furnished to do so, subject to
|
|
10
|
+
the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be
|
|
13
|
+
included in all copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
16
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
17
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
18
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
|
19
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
|
20
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
|
21
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
= Experimental Feature Branch
|
|
2
|
+
|
|
3
|
+
This experimental fork of the pdf-reader project was created to fix some bugs
|
|
4
|
+
in the core parser and to simplify part of the codebase.
|
|
5
|
+
|
|
6
|
+
Implemented features include:
|
|
7
|
+
-Created PDF::Reader::PageReceiver to ease the process of making custom receiver subclasses
|
|
8
|
+
-Fixed bugs relating to the positioning of text
|
|
9
|
+
-Support for vertical-mode text rendering
|
|
10
|
+
|
|
11
|
+
Remaining goals include:
|
|
12
|
+
-Import fonts that support vertical-mode rendering
|
|
13
|
+
-Implement the ability to calculate the precise bounding rectangle for any character on the page
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
Although the documented external API remains the same, anything dependent on the
|
|
18
|
+
internal API of PageState and PageTextReceiver will probably break.
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
= Release Notes
|
|
22
|
+
|
|
23
|
+
The PDF::Reader library implements a PDF parser conforming as much as possible
|
|
24
|
+
to the PDF specification from Adobe.
|
|
25
|
+
|
|
26
|
+
It provides programmatic access to the contents of a PDF file with a high
|
|
27
|
+
degree of flexibility.
|
|
28
|
+
|
|
29
|
+
The PDF 1.7 specification is a weighty document and not all aspects are
|
|
30
|
+
currently supported. I welcome submission of PDF files that exhibit
|
|
31
|
+
unsupported aspects of the spec to assist with improving our support.
|
|
32
|
+
|
|
33
|
+
This is primarily a low-level library that should be used as the foundation for
|
|
34
|
+
higher level functionality - it's not going to render a PDF for you. There are
|
|
35
|
+
a few exceptions to support very common use cases like extracting text from a
|
|
36
|
+
page.
|
|
37
|
+
|
|
38
|
+
= Installation
|
|
39
|
+
|
|
40
|
+
The recommended installation method is via Rubygems.
|
|
41
|
+
|
|
42
|
+
gem install pdf-reader
|
|
43
|
+
|
|
44
|
+
= Usage
|
|
45
|
+
|
|
46
|
+
Begin by creating a PDF::Reader instance that points to a PDF file. Document
|
|
47
|
+
level information (metadata, page count, bookmarks, etc) is available via
|
|
48
|
+
this object.
|
|
49
|
+
|
|
50
|
+
reader = PDF::Reader.new("somefile.pdf")
|
|
51
|
+
|
|
52
|
+
puts reader.pdf_version
|
|
53
|
+
puts reader.info
|
|
54
|
+
puts reader.metadata
|
|
55
|
+
puts reader.page_count
|
|
56
|
+
|
|
57
|
+
PDF::Reader.new accepts an IO stream or a filename. Here's an example with
|
|
58
|
+
an IO stream:
|
|
59
|
+
|
|
60
|
+
require 'open-uri'
|
|
61
|
+
|
|
62
|
+
io = open('http://example.com/somefile.pdf')
|
|
63
|
+
reader = PDF::Reader.new(io)
|
|
64
|
+
puts reader.info
|
|
65
|
+
|
|
66
|
+
If you open a PDF with File#open or IO#open, I strongly recommend using "rb"
|
|
67
|
+
mode to ensure the file isn't mangled by ruby being 'helpful'. This is
|
|
68
|
+
particularly important on windows and MRI >= 1.9.2.
|
|
69
|
+
|
|
70
|
+
File.open("somefile.pdf", "rb") do |io|
|
|
71
|
+
reader = PDF::Reader.new(io)
|
|
72
|
+
puts reader.info
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
PDF is a page based file format, so most visible information is available via
|
|
76
|
+
page-based iteration
|
|
77
|
+
|
|
78
|
+
reader = PDF::Reader.new("somefile.pdf")
|
|
79
|
+
|
|
80
|
+
reader.pages.each do |page|
|
|
81
|
+
puts page.fonts
|
|
82
|
+
puts page.text
|
|
83
|
+
puts page.raw_content
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
If you need to access the full program for rendering a page, use the walk() method
|
|
87
|
+
of PDF::Reader::Page.
|
|
88
|
+
|
|
89
|
+
class RedGreenBlue
|
|
90
|
+
def set_rgb_color_for_nonstroking(r, g, b)
|
|
91
|
+
puts "R: #{r}, G: #{g}, B: #{b}"
|
|
92
|
+
end
|
|
93
|
+
end
|
|
94
|
+
|
|
95
|
+
reader = PDF::Reader.new("somefile.pdf")
|
|
96
|
+
page = reader.page(1)
|
|
97
|
+
receiver = RedGreenBlue.new
|
|
98
|
+
page.walk(receiver)
|
|
99
|
+
|
|
100
|
+
For low level access to the objects in a PDF file, use the ObjectHash class. You can
|
|
101
|
+
build an ObjectHash instance directly:
|
|
102
|
+
|
|
103
|
+
puts PDF::Reader::ObjectHash.new("somefile.pdf")
|
|
104
|
+
|
|
105
|
+
or via a PDF::Reader instance:
|
|
106
|
+
|
|
107
|
+
reader = PDF::Reader.new("somefile.pdf")
|
|
108
|
+
puts reader.objects
|
|
109
|
+
|
|
110
|
+
The second method is preferred to increase the effectiveness of internal caching.
|
|
111
|
+
|
|
112
|
+
= Text Encoding
|
|
113
|
+
|
|
114
|
+
Regardless of the internal encoding used in the PDF all text will be converted
|
|
115
|
+
to UTF-8 before it is passed back from PDF::Reader.
|
|
116
|
+
|
|
117
|
+
Strings that contain binary data (like font blobs) will be marked as such on
|
|
118
|
+
M17N aware VMs.
|
|
119
|
+
|
|
120
|
+
= Former API
|
|
121
|
+
|
|
122
|
+
Version 1.0.0 of PDF::Reader introduced a new page-based API that provides
|
|
123
|
+
efficient and easy access to any page.
|
|
124
|
+
|
|
125
|
+
The previous API is marked as deprecated but will continue to work for the
|
|
126
|
+
time being. Eventually calls to the old API will begin triggering deprecation
|
|
127
|
+
warnings before it is completely removed in version 2.0.0.
|
|
128
|
+
|
|
129
|
+
= Exceptions
|
|
130
|
+
|
|
131
|
+
There are two key exceptions that you will need to watch out for when processing a
|
|
132
|
+
PDF file:
|
|
133
|
+
|
|
134
|
+
MalformedPDFError - The PDF appears to be corrupt in some way. If you believe the
|
|
135
|
+
file should be valid, or that a corrupt file didn't raise an exception, please
|
|
136
|
+
forward a copy of the file to the maintainers (preferably via the google group)
|
|
137
|
+
and we will attempt to improve the code.
|
|
138
|
+
|
|
139
|
+
UnsupportedFeatureError - The PDF uses a feature that PDF::Reader doesn't currently
|
|
140
|
+
support. Again, we welcome submissions of PDF files that exhibit these features to help
|
|
141
|
+
us with future code improvements.
|
|
142
|
+
|
|
143
|
+
MalformedPDFError has some subclasses if you want to detect finer grained issues. If you
|
|
144
|
+
don't, 'rescue MalformedPDFError' will catch all the subclassed errors as well.
|
|
145
|
+
|
|
146
|
+
Any other exceptions should be considered bugs in PDF::Reader (please
|
|
147
|
+
report it!).
|
|
148
|
+
|
|
149
|
+
= PDF Integrity
|
|
150
|
+
|
|
151
|
+
Windows developers may run into problems when running specs due to MalformedPDFErrors.
|
|
152
|
+
This is usually because CRLF characters are automatically added to some of the PDFs in
|
|
153
|
+
the spec folder when you checkout a branch from Git.
|
|
154
|
+
|
|
155
|
+
To remove any invalid CRLF characters added while checking out a branch from Git, run:
|
|
156
|
+
|
|
157
|
+
rake fix_integrity
|
|
158
|
+
|
|
159
|
+
= Maintainers
|
|
160
|
+
|
|
161
|
+
- James Healy <mailto:jimmy@deefa.com>
|
|
162
|
+
|
|
163
|
+
= Licensing
|
|
164
|
+
|
|
165
|
+
This library is distributed under the terms of the MIT License. See the included MIT-LICENSE file for
|
|
166
|
+
more detail.
|
|
167
|
+
|
|
168
|
+
= Mailing List
|
|
169
|
+
|
|
170
|
+
Any questions or feedback should be sent to the PDF::Reader google group. It's
|
|
171
|
+
better that any answers be available for others instead of hiding in someone's
|
|
172
|
+
inbox.
|
|
173
|
+
|
|
174
|
+
http://groups.google.com/group/pdf-reader
|
|
175
|
+
|
|
176
|
+
= Examples
|
|
177
|
+
|
|
178
|
+
The easiest way to explain how this works in practice is to show some examples.
|
|
179
|
+
Check out the examples/ directory for a few files.
|
|
180
|
+
|
|
181
|
+
= Known Limitations
|
|
182
|
+
|
|
183
|
+
Occasionally some text cannot be extracted properly due to the way it has been
|
|
184
|
+
stored, or the use of invalid bytes. In these cases PDF::Reader will output a
|
|
185
|
+
little UTF-8 friendly box to indicate an unrecognisable character.
|
|
186
|
+
|
|
187
|
+
= Resources
|
|
188
|
+
|
|
189
|
+
- PDF::Reader Code Repository: http://github.com/yob/pdf-reader
|
|
190
|
+
- PDF Specification: http://www.adobe.com/devnet/pdf/pdf_reference.html
|
|
191
|
+
- PDF Tutorial Slide Presentations: http://home.comcast.net/~jk05/presentations/PDFTutorials.html
|
|
192
|
+
- Developing with PDF (book): http://shop.oreilly.com/product/0636920025269.do
|
data/Rakefile
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
require "rubygems"
|
|
2
|
+
require "bundler"
|
|
3
|
+
Bundler.setup
|
|
4
|
+
|
|
5
|
+
require 'rake'
|
|
6
|
+
require 'rdoc/task'
|
|
7
|
+
require 'rspec/core/rake_task'
|
|
8
|
+
|
|
9
|
+
# Cane requires ripper, which appears to only work on MRI 1.9
|
|
10
|
+
if RUBY_VERSION >= "1.9" && RUBY_ENGINE == "ruby"
|
|
11
|
+
|
|
12
|
+
desc "Default Task"
|
|
13
|
+
task :default => [ :quality, :spec ]
|
|
14
|
+
|
|
15
|
+
require 'cane/rake_task'
|
|
16
|
+
require 'morecane'
|
|
17
|
+
|
|
18
|
+
desc "Run cane to check quality metrics"
|
|
19
|
+
Cane::RakeTask.new(:quality) do |cane|
|
|
20
|
+
cane.abc_max = 20
|
|
21
|
+
cane.style_measure = 100
|
|
22
|
+
cane.max_violations = 94
|
|
23
|
+
|
|
24
|
+
cane.use Morecane::EncodingCheck, :encoding_glob => "{app,lib,spec}/**/*.rb"
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
else
|
|
28
|
+
desc "Default Task"
|
|
29
|
+
task :default => [ :spec ]
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
desc "Run all rspec files"
|
|
33
|
+
RSpec::Core::RakeTask.new("spec") do |t|
|
|
34
|
+
t.rspec_opts = ["--color", "--format progress"]
|
|
35
|
+
t.ruby_opts = "-w"
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
# Generate the RDoc documentation
|
|
39
|
+
desc "Create documentation"
|
|
40
|
+
Rake::RDocTask.new("doc") do |rdoc|
|
|
41
|
+
rdoc.title = "pdf-reader"
|
|
42
|
+
rdoc.rdoc_dir = (ENV['CC_BUILD_ARTIFACTS'] || 'doc') + '/rdoc'
|
|
43
|
+
rdoc.rdoc_files.include('README.rdoc')
|
|
44
|
+
rdoc.rdoc_files.include('TODO')
|
|
45
|
+
rdoc.rdoc_files.include('CHANGELOG')
|
|
46
|
+
rdoc.rdoc_files.include('MIT-LICENSE')
|
|
47
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
|
48
|
+
rdoc.options << "--inline-source"
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
desc "Create a YAML file of integrity info for PDFs in the spec suite"
|
|
52
|
+
task :integrity_yaml do
|
|
53
|
+
data = {}
|
|
54
|
+
Dir.glob("spec/data/**/*.*").each do |path|
|
|
55
|
+
path_without_spec = path.gsub("spec/","")
|
|
56
|
+
data[path_without_spec] = {
|
|
57
|
+
:bytes => File.size(path),
|
|
58
|
+
:md5 => `md5sum "#{path}"`.split.first
|
|
59
|
+
} if File.file?(path)
|
|
60
|
+
end
|
|
61
|
+
File.open("spec/integrity.yml","wb") { |f| f.write YAML.dump(data)}
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
desc "Remove any CRLF characters added by Git"
|
|
65
|
+
task :fix_integrity do
|
|
66
|
+
yaml_path = File.expand_path("spec/integrity.yml",File.dirname(__FILE__))
|
|
67
|
+
integrity = YAML.load_file(yaml_path)
|
|
68
|
+
|
|
69
|
+
Dir.glob("spec/data/**/*.pdf").each do |path|
|
|
70
|
+
path_relative_to_spec_folder = path[/.+(data\/.+)/,1]
|
|
71
|
+
item = integrity[path_relative_to_spec_folder]
|
|
72
|
+
|
|
73
|
+
if File.file?(path)
|
|
74
|
+
file_contents = File.open(path, "rb") { |f| f.read }
|
|
75
|
+
md5 = Digest::MD5.hexdigest(file_contents)
|
|
76
|
+
|
|
77
|
+
unless md5 == item[:md5]
|
|
78
|
+
#file md5 does not match what was checked into Git
|
|
79
|
+
|
|
80
|
+
if Digest::MD5.hexdigest(file_contents.gsub(/\r\n/, "\n")) == item[:md5]
|
|
81
|
+
#pdf file is fixable by swapping CRLF characters
|
|
82
|
+
|
|
83
|
+
File.open(path, "wb") do |f|
|
|
84
|
+
f.write(file_contents.gsub(/\r\n/, "\n"))
|
|
85
|
+
end
|
|
86
|
+
puts "Replaced CRLF characters in: #{path}"
|
|
87
|
+
else
|
|
88
|
+
puts "Failed to fix: #{path}"
|
|
89
|
+
end
|
|
90
|
+
end
|
|
91
|
+
end
|
|
92
|
+
end
|
|
93
|
+
end
|