pdf-reader 1.1.1 → 2.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (82) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG +87 -2
  3. data/{README.rdoc → README.md} +43 -31
  4. data/Rakefile +21 -16
  5. data/bin/pdf_callbacks +1 -1
  6. data/bin/pdf_object +4 -1
  7. data/bin/pdf_text +1 -3
  8. data/examples/callbacks.rb +2 -1
  9. data/examples/extract_images.rb +11 -6
  10. data/examples/fuzzy_paragraphs.rb +24 -0
  11. data/lib/pdf/reader/afm/Courier-Bold.afm +342 -0
  12. data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -0
  13. data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -0
  14. data/lib/pdf/reader/afm/Courier.afm +342 -0
  15. data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -0
  16. data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -0
  17. data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -0
  18. data/lib/pdf/reader/afm/Helvetica.afm +3051 -0
  19. data/lib/pdf/reader/afm/MustRead.html +19 -0
  20. data/lib/pdf/reader/afm/Symbol.afm +213 -0
  21. data/lib/pdf/reader/afm/Times-Bold.afm +2588 -0
  22. data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -0
  23. data/lib/pdf/reader/afm/Times-Italic.afm +2667 -0
  24. data/lib/pdf/reader/afm/Times-Roman.afm +2419 -0
  25. data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -0
  26. data/lib/pdf/reader/buffer.rb +90 -63
  27. data/lib/pdf/reader/cid_widths.rb +63 -0
  28. data/lib/pdf/reader/cmap.rb +69 -38
  29. data/lib/pdf/reader/encoding.rb +74 -48
  30. data/lib/pdf/reader/error.rb +24 -4
  31. data/lib/pdf/reader/filter/ascii85.rb +28 -0
  32. data/lib/pdf/reader/filter/ascii_hex.rb +30 -0
  33. data/lib/pdf/reader/filter/depredict.rb +141 -0
  34. data/lib/pdf/reader/filter/flate.rb +53 -0
  35. data/lib/pdf/reader/filter/lzw.rb +21 -0
  36. data/lib/pdf/reader/filter/null.rb +18 -0
  37. data/lib/pdf/reader/filter/run_length.rb +45 -0
  38. data/lib/pdf/reader/filter.rb +15 -234
  39. data/lib/pdf/reader/font.rb +107 -43
  40. data/lib/pdf/reader/font_descriptor.rb +80 -0
  41. data/lib/pdf/reader/form_xobject.rb +26 -4
  42. data/lib/pdf/reader/glyph_hash.rb +56 -18
  43. data/lib/pdf/reader/lzw.rb +6 -4
  44. data/lib/pdf/reader/null_security_handler.rb +17 -0
  45. data/lib/pdf/reader/object_cache.rb +40 -16
  46. data/lib/pdf/reader/object_hash.rb +94 -40
  47. data/lib/pdf/reader/object_stream.rb +1 -0
  48. data/lib/pdf/reader/orientation_detector.rb +34 -0
  49. data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
  50. data/lib/pdf/reader/page.rb +48 -3
  51. data/lib/pdf/reader/page_layout.rb +125 -0
  52. data/lib/pdf/reader/page_state.rb +185 -70
  53. data/lib/pdf/reader/page_text_receiver.rb +70 -20
  54. data/lib/pdf/reader/pages_strategy.rb +4 -293
  55. data/lib/pdf/reader/parser.rb +37 -61
  56. data/lib/pdf/reader/print_receiver.rb +6 -0
  57. data/lib/pdf/reader/reference.rb +4 -1
  58. data/lib/pdf/reader/register_receiver.rb +17 -31
  59. data/lib/pdf/reader/resource_methods.rb +1 -0
  60. data/lib/pdf/reader/standard_security_handler.rb +82 -42
  61. data/lib/pdf/reader/standard_security_handler_v5.rb +91 -0
  62. data/lib/pdf/reader/stream.rb +5 -2
  63. data/lib/pdf/reader/synchronized_cache.rb +33 -0
  64. data/lib/pdf/reader/text_run.rb +99 -0
  65. data/lib/pdf/reader/token.rb +4 -1
  66. data/lib/pdf/reader/transformation_matrix.rb +195 -0
  67. data/lib/pdf/reader/unimplemented_security_handler.rb +17 -0
  68. data/lib/pdf/reader/width_calculator/built_in.rb +67 -0
  69. data/lib/pdf/reader/width_calculator/composite.rb +28 -0
  70. data/lib/pdf/reader/width_calculator/true_type.rb +56 -0
  71. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +33 -0
  72. data/lib/pdf/reader/width_calculator/type_zero.rb +25 -0
  73. data/lib/pdf/reader/width_calculator.rb +12 -0
  74. data/lib/pdf/reader/xref.rb +41 -9
  75. data/lib/pdf/reader.rb +45 -104
  76. data/lib/pdf-reader.rb +4 -1
  77. metadata +220 -101
  78. data/bin/pdf_list_callbacks +0 -17
  79. data/lib/pdf/hash.rb +0 -15
  80. data/lib/pdf/reader/abstract_strategy.rb +0 -81
  81. data/lib/pdf/reader/metadata_strategy.rb +0 -56
  82. data/lib/pdf/reader/text_receiver.rb +0 -264
@@ -1,81 +0,0 @@
1
- # coding: utf-8
2
-
3
- class PDF::Reader
4
-
5
# DEPRECATED: this class was deprecated in version 0.11.0 and will
# eventually be removed
#
# Base class for the legacy strategy objects. It stores the object hash, the
# receivers and the options, and provides private helpers for dispatching
# callbacks and for reaching the trailer, root, info and pages objects.
class AbstractStrategy # :nodoc:

  # ohash     - the object hash; must respond to #object and #trailer
  # receivers - a single receiver object or an Array of receivers
  # options   - options Hash (defaults to an empty Hash)
  def initialize(ohash, receivers, options = {})
    @ohash = ohash
    @options = options
    # a lone receiver is wrapped so #callback can always iterate
    @receivers = receivers.is_a?(Array) ? receivers : [receivers]
  end

  private

  # The options passed to the constructor, or an empty Hash if nil was given.
  def options
    @options || {}
  end

  # calls the name callback method on every receiver that responds to it,
  # with params splatted as the arguments
  #
  def callback(name, params = [])
    @receivers.each do |receiver|
      receiver.send(name, *params) if receiver.respond_to?(name)
    end
  end

  # strings outside of page content should be in either PDFDocEncoding or UTF-16.
  #
  # Recursively converts every String found in obj (including those nested
  # in Hashes and Arrays) to UTF-8 and returns the converted structure.
  # Hashes and Arrays are returned as new objects; the argument is never
  # modified, so dictionaries owned by the object hash stay untouched.
  # Objects of any other type are returned unchanged.
  def decode_strings(obj)
    case obj
    when String
      # a leading 0xFE 0xFF byte order mark flags UTF-16 text
      if obj[0, 2].unpack("C*").slice(0, 2) == [254, 255]
        PDF::Reader::Encoding.new(:UTF16Encoding).to_utf8(obj[2, obj.size])
      else
        PDF::Reader::Encoding.new(:PDFDocEncoding).to_utf8(obj)
      end
    when Hash
      obj.inject({}) do |decoded, (key, val)|
        decoded[key] = decode_strings(val)
        decoded
      end
    when Array
      obj.map { |item| decode_strings(item) }
    else
      obj
    end
  end

  # The object referenced by the trailer's :Info entry.
  def info
    ohash.object(trailer[:Info])
  end

  # true if the :Info lookup returned something truthy.
  def info?
    info ? true : false
  end

  # The object hash passed to the constructor.
  def ohash
    @ohash
  end

  # The object referenced by the root object's :Pages entry.
  def pages
    ohash.object(root[:Pages])
  end

  # true if the :Pages lookup returned something truthy.
  def pages?
    pages ? true : false
  end

  # The object referenced by the trailer's :Root entry.
  def root
    ohash.object(trailer[:Root])
  end

  # true if the :Root lookup returned something truthy.
  def root?
    root ? true : false
  end

  # The trailer as reported by the object hash.
  def trailer
    ohash.trailer
  end

end
81
- end
@@ -1,56 +0,0 @@
1
- # coding: utf-8
2
-
3
- class PDF::Reader
4
-
5
# DEPRECATED: this class was deprecated in version 0.11.0 and will
# eventually be removed
#
# Strategy that reports document-level information to the receivers: the
# PDF version, the info dictionary, the XML metadata stream and the page
# count.
class MetadataStrategy < AbstractStrategy # :nodoc:

  # Symbol that identifies this strategy.
  def self.to_sym
    :metadata
  end

  # Fires the metadata callbacks. Returns false straight away when the
  # :metadata option is not set.
  def process
    return false unless options[:metadata]

    # may be useful to some people
    callback(:pdf_version, ohash.pdf_version)

    # ye olde metadata
    callback(:metadata, [decoded_info]) if info?

    # new style xml metadata
    callback(:xml_metadata, [xml_metadata]) if xml_metadata?

    # page count
    return unless pages?

    callback(:page_count, ohash.object(pages[:Count]).to_i)
  end

  private

  # The unfiltered data of the stream referenced by the root's :Metadata
  # entry, tagged as UTF-8 where the string supports it, or nil when the
  # root has no such entry. The result (including nil) is memoised.
  def xml_metadata
    return @xml_metadata if defined?(@xml_metadata)

    @xml_metadata =
      unless root[:Metadata].nil?
        raw = ohash.object(root[:Metadata]).unfiltered_data
        raw.force_encoding("utf-8") if raw.respond_to?(:force_encoding)
        raw
      end
  end

  # true when an XML metadata stream was found.
  def xml_metadata?
    !!xml_metadata
  end

  # The info dictionary with its strings run through decode_strings,
  # memoised after the first call.
  def decoded_info
    @decoded_info ||= decode_strings(info)
  end

end
56
- end
@@ -1,264 +0,0 @@
1
- ################################################################################
2
- #
3
- # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
- #
5
- # Permission is hereby granted, free of charge, to any person obtaining
6
- # a copy of this software and associated documentation files (the
7
- # "Software"), to deal in the Software without restriction, including
8
- # without limitation the rights to use, copy, modify, merge, publish,
9
- # distribute, sublicense, and/or sell copies of the Software, and to
10
- # permit persons to whom the Software is furnished to do so, subject to
11
- # the following conditions:
12
- #
13
- # The above copyright notice and this permission notice shall be
14
- # included in all copies or substantial portions of the Software.
15
- #
16
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
- # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
- # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
- # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
- # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
- # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
- # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
- #
24
- ################################################################################
25
-
26
- class PDF::Reader
27
################################################################################
# An example receiver class that processes all text found in a PDF file. The
# text of each page is assembled line by line and appended (with <<) to the
# object given to the constructor when the page ends.
#
# Usage:
#   receiver = PDF::Reader::TextReceiver.new($stdout)
#   PDF::Reader.file("somefile.pdf", receiver)
#
# DEPRECATED: this class was deprecated in version 0.11.0 and will
# eventually be removed
class TextReceiver
  ################################################################################
  # Initialize with the library user's receiver. It only needs to respond to <<.
  def initialize(main_receiver)
    @main_receiver = main_receiver
    @upper_corners = []
  end

  ################################################################################
  # Called when the document parsing begins
  def begin_document(root)
    @upper_corners = []
  end

  ################################################################################
  # Called when the document parsing ends
  def end_document
    # @state is only created by begin_page, so a document without pages
    # leaves it unset
    @state.clear if @state
  end

  ################################################################################
  # Remembers the page size declared by a page container so that nested
  # pages can fall back to it.
  def begin_page_container(page)
    @upper_corners.push(media_box_check(page))
  end

  ################################################################################
  def end_page_container
    @upper_corners.pop
  end

  ################################################################################
  # Called when new page parsing begins. Resets the text state and all of the
  # per-page line bookkeeping.
  def begin_page(info)
    @page = info

    @state = [{
      :char_spacing  => 0,
      :word_spacing  => 0,
      :hori_scaling  => 100,
      :leading       => 0,
      :tj_adjustment => 0,
    }]

    @upper_corners.push(media_box_check(info))

    @output         = []
    @line           = 0
    @location       = 0
    @displacement   = {}
    @smallest_y_loc = @upper_corners.last[:ury]
    @written_to     = false
  end

  ################################################################################
  # Called when page parsing ends. Hands the page's lines, joined with
  # newlines, to the main receiver.
  def end_page
    @main_receiver << @output.join("\n")
    @upper_corners.pop
  end

  ################################################################################
  # PDF operator BT
  def begin_text_object
    @state.push(@state.last.dup)
  end

  ################################################################################
  # PDF operator ET
  def end_text_object
    @state.pop
  end

  ################################################################################
  # PDF operator Tm
  #
  # Only the sixth operand (f in the PDF spec, the vertical translation) is
  # used; it selects the output line.
  def set_text_matrix_and_text_line_matrix(*args)
    calculate_line_and_location(args[5])
  end

  ################################################################################
  # PDF operator Tc
  def set_character_spacing(n)
    @state.last[:char_spacing] = n
  end

  ################################################################################
  # PDF operator Tw
  def set_word_spacing(n)
    @state.last[:word_spacing] = n
  end

  ################################################################################
  # PDF operator Tz
  #
  # The operand is a percentage; it is stored as a ratio. Float division is
  # used so that integer percentages below 100 do not truncate to zero.
  # NOTE(review): begin_page seeds :hori_scaling with 100, not 1 - nothing in
  # this class reads the value, so confirm the intended unit before relying
  # on it.
  def set_horizontal_text_scaling(n)
    @state.last[:hori_scaling] = n / 100.0
  end

  ################################################################################
  # PDF operator TL
  def set_text_leading(n)
    @state.last[:leading] = n
  end

  ################################################################################
  # PDF operator T*
  #
  # Equivalent to "0 -Tl Td": leading is a positive distance and moving to
  # the next line decreases the y coordinate.
  def move_to_start_of_next_line
    move_text_position(0, -@state.last[:leading])
  end

  ################################################################################
  # PDF operator Td
  #
  # Only the vertical offset matters to this receiver; tx is ignored.
  def move_text_position(tx, ty)
    calculate_line_and_location(@location + ty)
  end

  ################################################################################
  # PDF operator TD
  #
  # Equivalent to "-ty TL" followed by "tx ty Td".
  def move_text_position_and_set_leading(tx, ty)
    set_text_leading(-ty)
    move_text_position(tx, ty)
  end

  ################################################################################
  # PDF operator Tj
  #
  # Appends string to the current output line. A pending TJ adjustment
  # below -1000 is rendered as one space per 900 units.
  def show_text(string)
    place = (@output[@line] ||= "".dup)

    adjustment = @state.last[:tj_adjustment]
    place << " " * (adjustment.abs / 900) if adjustment < -1000
    place << string

    @written_to = true
  end

  ################################################################################
  # NOTE(review): this method reads @tm, TS_UNITS_PER_H_CHAR,
  # TS_UNITS_PER_V_CHAR and Matrix, none of which are defined in this class
  # as far as the visible code shows, and nothing here calls it. It is kept
  # unchanged for interface compatibility only.
  def super_show_text(string)
    urx = @upper_corners.last[:urx]/TS_UNITS_PER_H_CHAR
    ury = @upper_corners.last[:ury]/TS_UNITS_PER_V_CHAR

    x = (@tm[2,0]/TS_UNITS_PER_H_CHAR).to_i
    y = (ury - (@tm[2,1]/TS_UNITS_PER_V_CHAR)).to_i

    place = (@output[y] ||= (" " * urx.to_i))
    return if x+string.size >= urx

    string.split(//).each do |c|
      chars = 1

      case c
      when " "
        chars += @state.last[:word_spacing].to_i
        place[x-1, chars] = (" " * chars)
      else
        chars += @state.last[:char_spacing].to_i
        chars -= (@state.last[:tj_adjustment]/1000).to_i if @state.last[:tj_adjustment]
        chars = 1 if chars < 1

        place[x-1] = c
        place[x, chars-1] = (" " * (chars-1)) if chars > 1
      end

      x += chars
    end

    @tm += Matrix.rows([[1, 0, 0], [0, 1, 0], [x*TS_UNITS_PER_H_CHAR, y*TS_UNITS_PER_V_CHAR, 1]])
  end

  ################################################################################
  # PDF operator TJ
  #
  # params mixes strings with numeric adjustments. Each number becomes the
  # adjustment applied to the strings that follow it; the previous
  # adjustment is restored once the whole array has been shown.
  def show_text_with_positioning(params)
    prev_adjustment = @state.last[:tj_adjustment]

    params.each do |p|
      case p
      when Numeric # Fixnum no longer exists on current Rubies
        @state.last[:tj_adjustment] = p
      else
        show_text(p)
      end
    end

    @state.last[:tj_adjustment] = prev_adjustment
  end

  ################################################################################
  # PDF operator '
  def move_to_next_line_and_show_text(string)
    move_to_start_of_next_line
    show_text(string)
  end

  ################################################################################
  # PDF operator "
  def set_spacing_next_line_show_text(aw, ac, string)
    set_word_spacing(aw)
    set_character_spacing(ac)
    move_to_next_line_and_show_text(string)
  end

  ################################################################################
  # Returns a Hash with :urx and :ury holding the width and height taken from
  # dict[:MediaBox]. When dict has no :MediaBox the values of the enclosing
  # container are returned, or zeros when there is none.
  def media_box_check(dict)
    corners = (@upper_corners.last || {:urx => 0, :ury => 0}).dup

    if dict.has_key?(:MediaBox)
      media_box = dict[:MediaBox]
      corners[:urx] = media_box[2] - media_box[0]
      corners[:ury] = media_box[3] - media_box[1]
    end

    corners
  end

  ################################################################################
  # Maps the vertical position new_loc to an output line number and makes it
  # the current location.
  #
  # Until something has been written every position maps to line 0. After
  # that, a position not seen before that lies below the current location (or
  # below every position seen so far) starts a new line; any other unseen
  # position is snapped to the closest known position beneath it.
  def calculate_line_and_location(new_loc)
    key = new_loc

    if !@written_to
      @displacement[key] = 0
    elsif !@displacement.has_key?(key)
      if key < @location || key < @smallest_y_loc
        @displacement[key] = @line + 1
      else
        nearest = @displacement.keys.select { |y| key > y }.max
        # with no known position underneath, keep the requested one
        # rather than continuing with nil
        key = nearest if nearest
        @displacement[key] = 0 unless @displacement.has_key?(key)
      end
    end

    @smallest_y_loc = key if key < @smallest_y_loc
    @location = key
    @line = @displacement[key]
  end
  ################################################################################
end
262
- ################################################################################
263
- end
264
- ################################################################################