panjiva-pdf-reader 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. data/CHANGELOG +222 -0
  2. data/MIT-LICENSE +21 -0
  3. data/README.rdoc +192 -0
  4. data/Rakefile +93 -0
  5. data/TODO +34 -0
  6. data/bin/pdf_callbacks +23 -0
  7. data/bin/pdf_list_callbacks +17 -0
  8. data/bin/pdf_object +48 -0
  9. data/bin/pdf_text +13 -0
  10. data/examples/callbacks.rb +22 -0
  11. data/examples/extract_bates.rb +50 -0
  12. data/examples/extract_fonts.rb +77 -0
  13. data/examples/extract_images.rb +231 -0
  14. data/examples/hash.rb +12 -0
  15. data/examples/metadata.rb +14 -0
  16. data/examples/page_count.rb +13 -0
  17. data/examples/rspec.rb +33 -0
  18. data/examples/text.rb +15 -0
  19. data/examples/version.rb +13 -0
  20. data/lib/pdf-reader.rb +3 -0
  21. data/lib/pdf/hash.rb +19 -0
  22. data/lib/pdf/reader.rb +374 -0
  23. data/lib/pdf/reader/abstract_strategy.rb +81 -0
  24. data/lib/pdf/reader/afm/Courier-Bold.afm +342 -0
  25. data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -0
  26. data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -0
  27. data/lib/pdf/reader/afm/Courier.afm +342 -0
  28. data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -0
  29. data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -0
  30. data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -0
  31. data/lib/pdf/reader/afm/Helvetica.afm +3051 -0
  32. data/lib/pdf/reader/afm/Symbol.afm +213 -0
  33. data/lib/pdf/reader/afm/Times-Bold.afm +2588 -0
  34. data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -0
  35. data/lib/pdf/reader/afm/Times-Italic.afm +2667 -0
  36. data/lib/pdf/reader/afm/Times-Roman.afm +2419 -0
  37. data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -0
  38. data/lib/pdf/reader/buffer.rb +389 -0
  39. data/lib/pdf/reader/cmap.rb +161 -0
  40. data/lib/pdf/reader/encoding.rb +349 -0
  41. data/lib/pdf/reader/encodings/mac_expert.txt +159 -0
  42. data/lib/pdf/reader/encodings/mac_roman.txt +128 -0
  43. data/lib/pdf/reader/encodings/pdf_doc.txt +40 -0
  44. data/lib/pdf/reader/encodings/standard.txt +47 -0
  45. data/lib/pdf/reader/encodings/symbol.txt +154 -0
  46. data/lib/pdf/reader/encodings/win_ansi.txt +29 -0
  47. data/lib/pdf/reader/encodings/zapf_dingbats.txt +201 -0
  48. data/lib/pdf/reader/error.rb +69 -0
  49. data/lib/pdf/reader/filter.rb +58 -0
  50. data/lib/pdf/reader/filter/ascii85.rb +27 -0
  51. data/lib/pdf/reader/filter/ascii_hex.rb +28 -0
  52. data/lib/pdf/reader/filter/depredict.rb +140 -0
  53. data/lib/pdf/reader/filter/flate.rb +40 -0
  54. data/lib/pdf/reader/filter/lzw.rb +19 -0
  55. data/lib/pdf/reader/filter/null.rb +16 -0
  56. data/lib/pdf/reader/filter/run_length.rb +47 -0
  57. data/lib/pdf/reader/font.rb +208 -0
  58. data/lib/pdf/reader/font_descriptor.rb +80 -0
  59. data/lib/pdf/reader/form_xobject.rb +107 -0
  60. data/lib/pdf/reader/glyph_hash.rb +96 -0
  61. data/lib/pdf/reader/glyphlist.txt +4405 -0
  62. data/lib/pdf/reader/lzw.rb +126 -0
  63. data/lib/pdf/reader/metadata_strategy.rb +56 -0
  64. data/lib/pdf/reader/object_cache.rb +108 -0
  65. data/lib/pdf/reader/object_hash.rb +350 -0
  66. data/lib/pdf/reader/object_stream.rb +51 -0
  67. data/lib/pdf/reader/page.rb +178 -0
  68. data/lib/pdf/reader/page_layout.rb +125 -0
  69. data/lib/pdf/reader/page_receiver.rb +412 -0
  70. data/lib/pdf/reader/page_text_receiver.rb +51 -0
  71. data/lib/pdf/reader/pages_strategy.rb +488 -0
  72. data/lib/pdf/reader/parser.rb +219 -0
  73. data/lib/pdf/reader/print_receiver.rb +23 -0
  74. data/lib/pdf/reader/reference.rb +68 -0
  75. data/lib/pdf/reader/register_receiver.rb +95 -0
  76. data/lib/pdf/reader/resource_methods.rb +92 -0
  77. data/lib/pdf/reader/standard_security_handler.rb +190 -0
  78. data/lib/pdf/reader/stream.rb +71 -0
  79. data/lib/pdf/reader/synchronized_cache.rb +32 -0
  80. data/lib/pdf/reader/text_receiver.rb +265 -0
  81. data/lib/pdf/reader/text_run.rb +80 -0
  82. data/lib/pdf/reader/token.rb +43 -0
  83. data/lib/pdf/reader/transformation_matrix.rb +239 -0
  84. data/lib/pdf/reader/width_calculator.rb +11 -0
  85. data/lib/pdf/reader/width_calculator/built_in.rb +58 -0
  86. data/lib/pdf/reader/width_calculator/composite.rb +109 -0
  87. data/lib/pdf/reader/width_calculator/true_type.rb +63 -0
  88. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +39 -0
  89. data/lib/pdf/reader/width_calculator/type_zero.rb +30 -0
  90. data/lib/pdf/reader/xref.rb +246 -0
  91. metadata +358 -0
@@ -0,0 +1,222 @@
1
+ v1.3.0 (30th December 2012)
2
+ - Numerous performance optimisations (thanks Alex Dowad)
3
+ - Improved text extraction (thanks Nathaniel Madura)
4
+ - Load less of the hashery gem to reduce core monkey patches
5
+ - various bug fixes
6
+
7
+ v1.2.0 (28th August 2012)
8
+ - Feature: correctly extract text using surrogate pairs and ligatures
9
+ (thanks Nathaniel Madura)
10
+ - Speed optimisation: cache tokenised Form XObjects to avoid re-parsing them
11
+ - Feature: support opening documents with some junk bytes prepended to file
12
+ (thanks Paul Gallagher)
13
+ - Acrobat does this, so it seemed reasonable to add support
14
+
15
+ v1.1.1 (9th May 2012)
16
+ - bugfix release to improve parsing of some PDFs
17
+
18
+ v1.1.0 (25th March 2012)
19
+ - new PageState class for handling common state tracking in page receivers
20
+ - see PageTextReceiver for example usage
21
+ - various bugfixes to support reading more PDF dialects
22
+
23
+ v1.0.0 (16th January 2012)
24
+ - support a new encryption variation
25
+ - bugfix in PageTextRender (thanks Paul Gallagher)
26
+
27
+ v1.0.0.rc1 (19th December 2011)
28
+ - performance optimisations (all by Bernerd Schaefer)
29
+ - some improvements to text extraction from form xobjects
30
+ - assume invalid font encodings are StandardEncoding
31
+ - use binary mode when opening PDFs to stop ruby being helpful and transcoding
32
+ bytes for us
33
+
34
+ v1.0.0.beta1 (6th October 2011)
35
+ - ensure inline images that contain "EI" are correctly parsed
36
+ (thanks Bernerd Schaefer)
37
+ - fix parsing of inline image data
38
+
39
+ v0.12.0.alpha (28th August 2011)
40
+ - small breaking changes to the page-based API - it's alpha for a reason
41
+ - resource related methods on Page object return raw PDF objects
42
+ - if the caller wants the resources wrapped in a more convenient
43
+ Ruby object (like PDF::Reader::Font or PDF::Reader::FormXObject) will
44
+ need to do so themselves
45
+ - add support for RunLengthDecode filters (thanks Bernerd Schaefer)
46
+ - add support for standard PDF encryption (thanks Evan Brunner)
47
+ - add support for decoding stream with TIFF prediction
48
+ - new PDF::Reader::FormXObject class to simplify working with form XObjects
49
+
50
+ v0.11.0.alpha (19th July 2011)
51
+ - introduce experimental new page-based API
52
+ - old API is deprecated but will continue to work with no warnings
53
+ - add transparent caching of common objects to ObjectHash
54
+
55
+ v0.10.0 (6th July 2011)
56
+ - support multiple receivers within a single pass over a source file
57
+ - massive time saving when dealing with multiple receivers
58
+
59
+ v0.9.3 (2nd July 2011)
60
+ - add PDF::Reader::Reference#hash method
61
+ - improves behaviour of Reference objects when they're used as Hash keys
62
+
63
+ v0.9.2 (24th April 2011)
64
+ - add basic support for fonts with Identity-V encoding.
65
+ - bug: improve robustness of text extraction
66
+ - thanks to Evan Arnold for reporting
67
+ - bug: fix loading of nested resources on XObjects
68
+ - thanks to Samuel Williams for reporting
69
+ - bug: improve parsing of files with XRef object streams
70
+
71
+ v0.9.1 (21st December 2010)
72
+ - force gem to only install on ruby 1.8.7 or higher
73
+ - maintaining support for earlier versions takes more time than I have
74
+ available at the moment
75
+ - bug: fix parsing of obscure pdf name format
76
+ - bug: fix behaviour when loaded in conjunction with htmldoc gem
77
+
78
+ v0.9.0 (19th November 2010)
79
+ - support for pdf 1.5+ files that use object and xref streams
80
+ - support streams that use a flate filter with the predictor option
81
+ - ensure all content instructions are parsed when split over multiple streams
82
+ - thanks to Jack Rusher for reporting
83
+ - Various string parsing bugs
84
+ - some character conversions to utf-8 were failing (thanks Andrea Barisani)
85
+ - hashes with nested hex strings were tokenising wrongly (thanks Evan Arnold)
86
+ - escaping bug in tokenising of literal strings (thanks David Westerink)
87
+ - Fix a bug that prevented PDFs with white space after the EOF marker from loading
88
+ - thanks to Solomon White for reporting the issue
89
+ - Add support for de-filtering some LZW compressed streams
90
+ - thanks to Jose Ignacio Rubio Iradi for the patch
91
+ - some small speed improvements
92
+ - API CHANGE: PDF::Hash renamed to PDF::Reader::ObjectHash
93
+ - having a class named Hash was confusing for users
94
+
95
+ v0.8.6 (27th August 2010)
96
+ - new method: hash#page_references
97
+ - returns references to all page objects, gives rapid access to objects
98
+ for a given page
99
+
100
+ v0.8.5 (11th April 2010)
101
+ - fix a regression introduced in 0.8.4.
102
+ - Parameters passed to resource_font callback were inadvertently changed
103
+
104
+ v0.8.4 (30th March 2010)
105
+ - fix parsing of files that use Form XObjects
106
+ - thanks to Andrea Barisani for reporting the issue
107
+ - fix two issues that caused a small number of characters to convert to Unicode
108
+ incorrectly
109
+ - thanks to Andrea Barisani for reporting the issue
110
+ - require 'pdf-reader' now works as well as 'pdf/reader'
111
+ - good practice to have the require file match the gem name
112
+ - thanks to Chris O'Meara for highlighting this
113
+
114
+ v0.8.3 (14th February 2010)
115
+ - Fix a bug in tokenising of hex strings inside dictionaries
116
+ - Thanks to Brad Ediger for detecting the issue and proposing a solution
117
+
118
+ v0.8.2 (1st January 2010)
119
+ - Fix parsing of files that use Form XObjects behind an indirect reference
120
+ (thanks Cornelius Illi and Patrick Crosby)
121
+ - Rewrote Buffer class to fix various speed issues reported over the years
122
+ - On my sample file extracting full text reduced from 220 seconds to 9 seconds.
123
+
124
+ v0.8.1 (27th November 2009)
125
+ - Added PDF::Hash#version. Provides access to the source file PDF version
126
+
127
+ v0.8.0 (20th November 2009)
128
+ - Added PDF::Hash. It provides direct access to objects from a PDF file
129
+ with an API that emulates the standard Ruby hash
130
+
131
+ v0.7.7 (11th September 2009)
132
+ - Trigger callbacks contained in Form XObjects when we encounter them in a
133
+ content stream
134
+ - Fix inheritance of page resources to comply with section 3.6.2
135
+
136
+ v0.7.6 (28th August 2009)
137
+ - Various bug fixes that increase the files we can successfully parse
138
+ - Treat float and integer tokens differently (thanks Neil)
139
+ - Correctly handle PDFs where the Kids element of a Pages dict is an indirect
140
+ reference (thanks Rob Holland)
141
+ - Fix conversion of PDF strings to Ruby strings on 1.8.6 (thanks Andrès Koetsier)
142
+ - Fix decoding with ASCII85 and ASCIIHex filters (thanks Andrès Koetsier)
143
+ - Fix extracting inline images from content streams (thanks Andrès Koetsier)
144
+ - Fix extracting [ ] from content streams (thanks Christian Rishøj)
145
+ - Fix conversion of text to UTF8 when the cmap uses bfrange (thanks Federico Gonzalez Lutteroth)
146
+
147
+ v0.7.5 (27th August 2008)
148
+ - Fix a 1.8.7ism
149
+
150
+ v0.7.4 (7th August 2008)
151
+ - Raise a MalformedPDFError if a content stream contains an unterminated string
152
+ - Fix a bug that was causing an endless loop on some OSX systems
153
+ - valid strings were incorrectly thought to be unterminated
154
+ - thanks to Jeff Webb for playing email ping pong with me as I tracked this
155
+ issue down
156
+
157
+ v0.7.3 (11th June 2008)
158
+ - Add a high level way to get direct access to a PDF object, including a new executable: pdf_object
159
+ - Fix a hard loop bug caused by a content stream that is missing a final operator
160
+ - Significantly simplified the internal code for encoding conversions
161
+ - Fixes YACC parsing bug that occurs on Fedora 8's ruby VM
162
+ - New callbacks
163
+ - page_count
164
+ - pdf_version
165
+ - Fix a bug that prevented a font's BaseFont from being recorded correctly
166
+
167
+ v0.7.2 (20th May 2008)
168
+ - Throw an UnsupportedFeatureError if we try to open an encrypted/secure PDF
169
+ - Correctly handle page content instruction sets with trailing whitespace
170
+ - Represent PDF Streams with a new object, PDF::Reader::Stream
171
+ - there really wasn't any point in separating the stream content from its associated dict. You need both
172
+ parts to correctly interpret the content
173
+
174
+ v0.7.1 (6th May 2008)
175
+ - Non-page strings (ie. metadata, etc) are now converted to UTF-8 more accurately
176
+ - Fixed a regression between 0.6.2 and 0.7 that prevented difference tables from being applied
177
+ correctly when translating text into UTF-8
178
+
179
+ v0.7 (6th May 2008)
180
+ - API INCOMPATIBLE CHANGE: any hashes that are passed to callbacks use symbols as keys instead of PDF::Reader::Name instances.
181
+ - Improved support for converting text in some PDF files to unicode
182
+ - Behave as expected if the Contents key in a Page Dict is a reference
183
+ - Include some basic metadata callbacks
184
+ - Don't interpret a comment token (%) inside a string as a comment
185
+ - Small fixes to improve 1.9 compatibility
186
+ - Improved our Zlib deflating to make it slightly more robust - still some more issues to work out though
187
+ - Throw an UnsupportedFeatureError if a pdf that uses XRef streams is opened
188
+ - Added an option to PDF::Reader#file and PDF::Reader#string to enable parsing of only parts of a PDF file (i.e. only metadata, etc)
189
+
190
+ v0.6.2 (22nd March 2008)
191
+ - Catch low level errors when applying filters to a content stream and raise a MalformedPDFError instead.
192
+ - Added support for processing inline images
193
+ - Support for parsing XRef tables that have multiple subsections
194
+ - Added a few callbacks to improve the way we supply information on page resources
195
+ - Ignore whitespace in hex strings, as required by the spec (section 3.2.3)
196
+ - Use our "unknown character box" when a single character in an Identity-H string fails to decode
197
+ - Support ToUnicode CMaps that use the bfrange operator
198
+ - Tweaked tokenising code to ensure whitespace doesn't get in the way
199
+
200
+ v0.6.1 (12th March 2008)
201
+ - Tweaked behaviour when we encounter Identity-H encoded text that doesn't have a ToUnicode mapping. We
202
+ just replace each character with a little box.
203
+ - Use the same little box when invalid characters are found in other encodings instead of throwing an ugly
204
+ NoMethodError.
205
+ - Added a method to RegisterReceiver that returns all occurrences of a callback
206
+
207
+ v0.6.0 (27th February 2008)
208
+ - all text is now transparently converted to UTF-8 before being passed to the callbacks.
209
+ before this version, text was just passed as a byte level copy of what was in the PDF file, which
210
+ was mildly annoying with some encodings, and resulted in garbled text for Unicode encoded text.
211
+ - Fonts that use a difference table are now handled correctly
212
+ - fixed some 1.9 incompatible syntax
213
+ - expanded RegisterReceiver class to record extra info
214
+ - expanded rspec coverage
215
+ - tweaked a README example
216
+
217
+ v0.5.1 (1st January 2008)
218
+ - Several documentation tweaks
219
+ - Improve support for parsing PDFs under windows (thanks to Jari Williamsson)
220
+
221
+ v0.5 (14th December 2007)
222
+ - Initial Release
@@ -0,0 +1,21 @@
1
+ Copyright (c) 2009 Peter Jones
2
+ Copyright (c) 2009 James Healy
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining
5
+ a copy of this software and associated documentation files (the
6
+ "Software"), to deal in the Software without restriction, including
7
+ without limitation the rights to use, copy, modify, merge, publish,
8
+ distribute, sublicense, and/or sell copies of the Software, and to
9
+ permit persons to whom the Software is furnished to do so, subject to
10
+ the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be
13
+ included in all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,192 @@
1
+ = Experimental Feature Branch
2
+
3
+ This experimental fork of the pdf-reader project was created to fix some bugs
4
+ in the core parser and to simplify part of the codebase.
5
+
6
+ Implemented features include:
7
+ -Created PDF::Reader::PageReceiver to ease the process of making custom receiver subclasses
8
+ -Fixed bugs relating to the positioning of text
9
+ -Support for vertical-mode text rendering
10
+
11
+ Remaining goals include:
12
+ -Import fonts that support vertical-mode rendering
13
+ -Implement the ability to calculate the precise bounding rectangle for any character on the page
14
+
15
+
16
+
17
+ Although the documented external API remains the same, anything dependent on the
18
+ internal API of PageState and PageTextReceiver will probably break.
19
+
20
+
21
+ = Release Notes
22
+
23
+ The PDF::Reader library implements a PDF parser conforming as much as possible
24
+ to the PDF specification from Adobe.
25
+
26
+ It provides programmatic access to the contents of a PDF file with a high
27
+ degree of flexibility.
28
+
29
+ The PDF 1.7 specification is a weighty document and not all aspects are
30
+ currently supported. I welcome submission of PDF files that exhibit
31
+ unsupported aspects of the spec to assist with improving our support.
32
+
33
+ This is primarily a low-level library that should be used as the foundation for
34
+ higher level functionality - it's not going to render a PDF for you. There are
35
+ a few exceptions to support very common use cases like extracting text from a
36
+ page.
37
+
38
+ = Installation
39
+
40
+ The recommended installation method is via Rubygems.
41
+
42
+ gem install pdf-reader
43
+
44
+ = Usage
45
+
46
+ Begin by creating a PDF::Reader instance that points to a PDF file. Document
47
+ level information (metadata, page count, bookmarks, etc) is available via
48
+ this object.
49
+
50
+ reader = PDF::Reader.new("somefile.pdf")
51
+
52
+ puts reader.pdf_version
53
+ puts reader.info
54
+ puts reader.metadata
55
+ puts reader.page_count
56
+
57
+ PDF::Reader.new accepts an IO stream or a filename. Here's an example with
58
+ an IO stream:
59
+
60
+ require 'open-uri'
61
+
62
+ io = open('http://example.com/somefile.pdf')
63
+ reader = PDF::Reader.new(io)
64
+ puts reader.info
65
+
66
+ If you open a PDF with File#open or IO#open, I strongly recommend using "rb"
67
+ mode to ensure the file isn't mangled by ruby being 'helpful'. This is
68
+ particularly important on windows and MRI >= 1.9.2.
69
+
70
+ File.open("somefile.pdf", "rb") do |io|
71
+ reader = PDF::Reader.new(io)
72
+ puts reader.info
73
+ end
74
+
75
+ PDF is a page based file format, so most visible information is available via
76
+ page-based iteration
77
+
78
+ reader = PDF::Reader.new("somefile.pdf")
79
+
80
+ reader.pages.each do |page|
81
+ puts page.fonts
82
+ puts page.text
83
+ puts page.raw_content
84
+ end
85
+
86
+ If you need to access the full program for rendering a page, use the walk() method
87
+ of PDF::Reader::Page.
88
+
89
+ class RedGreenBlue
90
+ def set_rgb_color_for_nonstroking(r, g, b)
91
+ puts "R: #{r}, G: #{g}, B: #{b}"
92
+ end
93
+ end
94
+
95
+ reader = PDF::Reader.new("somefile.pdf")
96
+ page = reader.page(1)
97
+ receiver = RedGreenBlue.new
98
+ page.walk(receiver)
99
+
100
+ For low level access to the objects in a PDF file, use the ObjectHash class. You can
101
+ build an ObjectHash instance directly:
102
+
103
+ puts PDF::Reader::ObjectHash.new("somefile.pdf")
104
+
105
+ or via a PDF::Reader instance:
106
+
107
+ reader = PDF::Reader.new("somefile.pdf")
108
+ puts reader.objects
109
+
110
+ The second method is preferred to increase the effectiveness of internal caching.
111
+
112
+ = Text Encoding
113
+
114
+ Regardless of the internal encoding used in the PDF all text will be converted
115
+ to UTF-8 before it is passed back from PDF::Reader.
116
+
117
+ Strings that contain binary data (like font blobs) will be marked as such on
118
+ M17N aware VMs.
119
+
120
+ = Former API
121
+
122
+ Version 1.0.0 of PDF::Reader introduced a new page-based API that provides
123
+ efficient and easy access to any page.
124
+
125
+ The previous API is marked as deprecated but will continue to work for the
126
+ time being. Eventually calls to the old API will begin triggering deprecation
127
+ warnings before it is completely removed in version 2.0.0.
128
+
129
+ = Exceptions
130
+
131
+ There are two key exceptions that you will need to watch out for when processing a
132
+ PDF file:
133
+
134
+ MalformedPDFError - The PDF appears to be corrupt in some way. If you believe the
135
+ file should be valid, or that a corrupt file didn't raise an exception, please
136
+ forward a copy of the file to the maintainers (preferably via the google group)
137
+ and we will attempt to improve the code.
138
+
139
+ UnsupportedFeatureError - The PDF uses a feature that PDF::Reader doesn't currently
140
+ support. Again, we welcome submissions of PDF files that exhibit these features to help
141
+ us with future code improvements.
142
+
143
+ MalformedPDFError has some subclasses if you want to detect finer grained issues. If you
144
+ don't, 'rescue MalformedPDFError' will catch all the subclassed errors as well.
145
+
146
+ Any other exceptions should be considered bugs in PDF::Reader (please
147
+ report them!).
148
+
149
+ = PDF Integrity
150
+
151
+ Windows developers may run into problems when running specs due to MalformedPDFErrors.
152
+ This is usually because CRLF characters are automatically added to some of the PDFs in
153
+ the spec folder when you checkout a branch from Git.
154
+
155
+ To remove any invalid CRLF characters added while checking out a branch from Git, run:
156
+
157
+ rake fix_integrity
158
+
159
+ = Maintainers
160
+
161
+ - James Healy <mailto:jimmy@deefa.com>
162
+
163
+ = Licensing
164
+
165
+ This library is distributed under the terms of the MIT License. See the included file for
166
+ more detail.
167
+
168
+ = Mailing List
169
+
170
+ Any questions or feedback should be sent to the PDF::Reader google group. It's
171
+ better that any answers be available for others instead of hiding in someone's
172
+ inbox.
173
+
174
+ http://groups.google.com/group/pdf-reader
175
+
176
+ = Examples
177
+
178
+ The easiest way to explain how this works in practice is to show some examples.
179
+ Check out the examples/ directory for a few files.
180
+
181
+ = Known Limitations
182
+
183
+ Occasionally some text cannot be extracted properly due to the way it has been
184
+ stored, or the use of invalid bytes. In these cases PDF::Reader will output a
185
+ little UTF-8 friendly box to indicate an unrecognisable character.
186
+
187
+ = Resources
188
+
189
+ - PDF::Reader Code Repository: http://github.com/yob/pdf-reader
190
+ - PDF Specification: http://www.adobe.com/devnet/pdf/pdf_reference.html
191
+ - PDF Tutorial Slide Presentations: http://home.comcast.net/~jk05/presentations/PDFTutorials.html
192
+ - Developing with PDF (book): http://shop.oreilly.com/product/0636920025269.do
@@ -0,0 +1,93 @@
1
+ require "rubygems"
2
+ require "bundler"
3
+ Bundler.setup
4
+
5
+ require 'rake'
6
+ require 'rdoc/task'
7
+ require 'rspec/core/rake_task'
8
+
9
+ # Cane requires ripper, which appears to only work on MRI 1.9
10
+ if RUBY_VERSION >= "1.9" && RUBY_ENGINE == "ruby"
11
+
12
+ desc "Default Task"
13
+ task :default => [ :quality, :spec ]
14
+
15
+ require 'cane/rake_task'
16
+ require 'morecane'
17
+
18
+ desc "Run cane to check quality metrics"
19
+ Cane::RakeTask.new(:quality) do |cane|
20
+ cane.abc_max = 20
21
+ cane.style_measure = 100
22
+ cane.max_violations = 94
23
+
24
+ cane.use Morecane::EncodingCheck, :encoding_glob => "{app,lib,spec}/**/*.rb"
25
+ end
26
+
27
+ else
28
+ desc "Default Task"
29
+ task :default => [ :spec ]
30
+ end
31
+
32
+ desc "Run all rspec files"
33
+ RSpec::Core::RakeTask.new("spec") do |t|
34
+ t.rspec_opts = ["--color", "--format progress"]
35
+ t.ruby_opts = "-w"
36
+ end
37
+
38
+ # Generate the RDoc documentation
39
+ desc "Create documentation"
40
+ Rake::RDocTask.new("doc") do |rdoc|
41
+ rdoc.title = "pdf-reader"
42
+ rdoc.rdoc_dir = (ENV['CC_BUILD_ARTIFACTS'] || 'doc') + '/rdoc'
43
+ rdoc.rdoc_files.include('README.rdoc')
44
+ rdoc.rdoc_files.include('TODO')
45
+ rdoc.rdoc_files.include('CHANGELOG')
46
+ rdoc.rdoc_files.include('MIT-LICENSE')
47
+ rdoc.rdoc_files.include('lib/**/*.rb')
48
+ rdoc.options << "--inline-source"
49
+ end
50
+
51
+ desc "Create a YAML file of integrity info for PDFs in the spec suite"
52
+ task :integrity_yaml do
53
+ data = {}
54
+ Dir.glob("spec/data/**/*.*").each do |path|
55
+ path_without_spec = path.gsub("spec/","")
56
+ data[path_without_spec] = {
57
+ :bytes => File.size(path),
58
+ :md5 => `md5sum "#{path}"`.split.first
59
+ } if File.file?(path)
60
+ end
61
+ File.open("spec/integrity.yml","wb") { |f| f.write YAML.dump(data)}
62
+ end
63
+
64
+ desc "Remove any CRLF characters added by Git"
65
+ task :fix_integrity do
66
+ yaml_path = File.expand_path("spec/integrity.yml",File.dirname(__FILE__))
67
+ integrity = YAML.load_file(yaml_path)
68
+
69
+ Dir.glob("spec/data/**/*.pdf").each do |path|
70
+ path_relative_to_spec_folder = path[/.+(data\/.+)/,1]
71
+ item = integrity[path_relative_to_spec_folder]
72
+
73
+ if File.file?(path)
74
+ file_contents = File.open(path, "rb") { |f| f.read }
75
+ md5 = Digest::MD5.hexdigest(file_contents)
76
+
77
+ unless md5 == item[:md5]
78
+ #file md5 does not match what was checked into Git
79
+
80
+ if Digest::MD5.hexdigest(file_contents.gsub(/\r\n/, "\n")) == item[:md5]
81
+ #pdf file is fixable by swapping CRLF characters
82
+
83
+ File.open(path, "wb") do |f|
84
+ f.write(file_contents.gsub(/\r\n/, "\n"))
85
+ end
86
+ puts "Replaced CRLF characters in: #{path}"
87
+ else
88
+ puts "Failed to fix: #{path}"
89
+ end
90
+ end
91
+ end
92
+ end
93
+ end