pdf-reader 1.1.1 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG +87 -2
  3. data/{README.rdoc → README.md} +43 -31
  4. data/Rakefile +21 -16
  5. data/bin/pdf_callbacks +1 -1
  6. data/bin/pdf_object +4 -1
  7. data/bin/pdf_text +1 -3
  8. data/examples/callbacks.rb +2 -1
  9. data/examples/extract_images.rb +11 -6
  10. data/examples/fuzzy_paragraphs.rb +24 -0
  11. data/lib/pdf/reader/afm/Courier-Bold.afm +342 -0
  12. data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -0
  13. data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -0
  14. data/lib/pdf/reader/afm/Courier.afm +342 -0
  15. data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -0
  16. data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -0
  17. data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -0
  18. data/lib/pdf/reader/afm/Helvetica.afm +3051 -0
  19. data/lib/pdf/reader/afm/MustRead.html +19 -0
  20. data/lib/pdf/reader/afm/Symbol.afm +213 -0
  21. data/lib/pdf/reader/afm/Times-Bold.afm +2588 -0
  22. data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -0
  23. data/lib/pdf/reader/afm/Times-Italic.afm +2667 -0
  24. data/lib/pdf/reader/afm/Times-Roman.afm +2419 -0
  25. data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -0
  26. data/lib/pdf/reader/buffer.rb +90 -63
  27. data/lib/pdf/reader/cid_widths.rb +63 -0
  28. data/lib/pdf/reader/cmap.rb +69 -38
  29. data/lib/pdf/reader/encoding.rb +74 -48
  30. data/lib/pdf/reader/error.rb +24 -4
  31. data/lib/pdf/reader/filter/ascii85.rb +28 -0
  32. data/lib/pdf/reader/filter/ascii_hex.rb +30 -0
  33. data/lib/pdf/reader/filter/depredict.rb +141 -0
  34. data/lib/pdf/reader/filter/flate.rb +53 -0
  35. data/lib/pdf/reader/filter/lzw.rb +21 -0
  36. data/lib/pdf/reader/filter/null.rb +18 -0
  37. data/lib/pdf/reader/filter/run_length.rb +45 -0
  38. data/lib/pdf/reader/filter.rb +15 -234
  39. data/lib/pdf/reader/font.rb +107 -43
  40. data/lib/pdf/reader/font_descriptor.rb +80 -0
  41. data/lib/pdf/reader/form_xobject.rb +26 -4
  42. data/lib/pdf/reader/glyph_hash.rb +56 -18
  43. data/lib/pdf/reader/lzw.rb +6 -4
  44. data/lib/pdf/reader/null_security_handler.rb +17 -0
  45. data/lib/pdf/reader/object_cache.rb +40 -16
  46. data/lib/pdf/reader/object_hash.rb +94 -40
  47. data/lib/pdf/reader/object_stream.rb +1 -0
  48. data/lib/pdf/reader/orientation_detector.rb +34 -0
  49. data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
  50. data/lib/pdf/reader/page.rb +48 -3
  51. data/lib/pdf/reader/page_layout.rb +125 -0
  52. data/lib/pdf/reader/page_state.rb +185 -70
  53. data/lib/pdf/reader/page_text_receiver.rb +70 -20
  54. data/lib/pdf/reader/pages_strategy.rb +4 -293
  55. data/lib/pdf/reader/parser.rb +37 -61
  56. data/lib/pdf/reader/print_receiver.rb +6 -0
  57. data/lib/pdf/reader/reference.rb +4 -1
  58. data/lib/pdf/reader/register_receiver.rb +17 -31
  59. data/lib/pdf/reader/resource_methods.rb +1 -0
  60. data/lib/pdf/reader/standard_security_handler.rb +82 -42
  61. data/lib/pdf/reader/standard_security_handler_v5.rb +91 -0
  62. data/lib/pdf/reader/stream.rb +5 -2
  63. data/lib/pdf/reader/synchronized_cache.rb +33 -0
  64. data/lib/pdf/reader/text_run.rb +99 -0
  65. data/lib/pdf/reader/token.rb +4 -1
  66. data/lib/pdf/reader/transformation_matrix.rb +195 -0
  67. data/lib/pdf/reader/unimplemented_security_handler.rb +17 -0
  68. data/lib/pdf/reader/width_calculator/built_in.rb +67 -0
  69. data/lib/pdf/reader/width_calculator/composite.rb +28 -0
  70. data/lib/pdf/reader/width_calculator/true_type.rb +56 -0
  71. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +33 -0
  72. data/lib/pdf/reader/width_calculator/type_zero.rb +25 -0
  73. data/lib/pdf/reader/width_calculator.rb +12 -0
  74. data/lib/pdf/reader/xref.rb +41 -9
  75. data/lib/pdf/reader.rb +45 -104
  76. data/lib/pdf-reader.rb +4 -1
  77. metadata +220 -101
  78. data/bin/pdf_list_callbacks +0 -17
  79. data/lib/pdf/hash.rb +0 -15
  80. data/lib/pdf/reader/abstract_strategy.rb +0 -81
  81. data/lib/pdf/reader/metadata_strategy.rb +0 -56
  82. data/lib/pdf/reader/text_receiver.rb +0 -264
@@ -1,81 +0,0 @@
1
- # coding: utf-8
2
-
3
- class PDF::Reader
4
-
5
- # DEPRECATED: this class was deprecated in version 0.11.0 and will
6
- # eventually be removed
7
- class AbstractStrategy # :nodoc:
8
-
9
- def initialize(ohash, receivers, options = {})
10
- @ohash, @options = ohash, options
11
- if receivers.is_a?(Array)
12
- @receivers = receivers
13
- else
14
- @receivers = [receivers]
15
- end
16
- end
17
-
18
- private
19
-
20
- def options
21
- @options || {}
22
- end
23
-
24
- # calls the name callback method on the receiver class with params as the arguments
25
- #
26
- def callback (name, params=[])
27
- @receivers.each do |receiver|
28
- receiver.send(name, *params) if receiver.respond_to?(name)
29
- end
30
- end
31
-
32
- # strings outside of page content should be in either PDFDocEncoding or UTF-16.
33
- def decode_strings(obj)
34
- case obj
35
- when String then
36
- if obj[0,2].unpack("C*").slice(0,2) == [254,255]
37
- PDF::Reader::Encoding.new(:UTF16Encoding).to_utf8(obj[2, obj.size])
38
- else
39
- PDF::Reader::Encoding.new(:PDFDocEncoding).to_utf8(obj)
40
- end
41
- when Hash then obj.each { |key,val| obj[key] = decode_strings(val) }
42
- when Array then obj.collect { |item| decode_strings(item) }
43
- else
44
- obj
45
- end
46
- end
47
-
48
- def info
49
- ohash.object(trailer[:Info])
50
- end
51
-
52
- def info?
53
- info ? true : false
54
- end
55
-
56
- def ohash
57
- @ohash
58
- end
59
-
60
- def pages
61
- ohash.object(root[:Pages])
62
- end
63
-
64
- def pages?
65
- pages ? true : false
66
- end
67
-
68
- def root
69
- ohash.object(trailer[:Root])
70
- end
71
-
72
- def root?
73
- root ? true : false
74
- end
75
-
76
- def trailer
77
- ohash.trailer
78
- end
79
-
80
- end
81
- end
@@ -1,56 +0,0 @@
1
- # coding: utf-8
2
-
3
- class PDF::Reader
4
-
5
- # DEPRECATED: this class was deprecated in version 0.11.0 and will
6
- # eventually be removed
7
- #
8
- class MetadataStrategy < AbstractStrategy # :nodoc:
9
-
10
- def self.to_sym
11
- :metadata
12
- end
13
-
14
- def process
15
- return false unless options[:metadata]
16
-
17
- # may be useful to some people
18
- callback(:pdf_version, ohash.pdf_version)
19
-
20
- # ye olde metadata
21
- callback(:metadata, [decoded_info]) if info?
22
-
23
- # new style xml metadata
24
- callback(:xml_metadata, [xml_metadata]) if xml_metadata?
25
-
26
- # page count
27
- if pages?
28
- count = ohash.object(pages[:Count])
29
- callback(:page_count, count.to_i)
30
- end
31
- end
32
-
33
- private
34
-
35
- def xml_metadata
36
- return @xml_metadata if defined?(@xml_metadata)
37
-
38
- if root[:Metadata].nil?
39
- @xml_metadata = nil
40
- else
41
- string = ohash.object(root[:Metadata]).unfiltered_data
42
- string.force_encoding("utf-8") if string.respond_to?(:force_encoding)
43
- @xml_metadata = string
44
- end
45
- end
46
-
47
- def xml_metadata?
48
- xml_metadata ? true : false
49
- end
50
-
51
- def decoded_info
52
- @decoded_info ||= decode_strings(info)
53
- end
54
-
55
- end
56
- end
@@ -1,264 +0,0 @@
1
- ################################################################################
2
- #
3
- # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
- #
5
- # Permission is hereby granted, free of charge, to any person obtaining
6
- # a copy of this software and associated documentation files (the
7
- # "Software"), to deal in the Software without restriction, including
8
- # without limitation the rights to use, copy, modify, merge, publish,
9
- # distribute, sublicense, and/or sell copies of the Software, and to
10
- # permit persons to whom the Software is furnished to do so, subject to
11
- # the following conditions:
12
- #
13
- # The above copyright notice and this permission notice shall be
14
- # included in all copies or substantial portions of the Software.
15
- #
16
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
- # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
- # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
- # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
- # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
- # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
- # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
- #
24
- ################################################################################
25
-
26
- class PDF::Reader
27
- ################################################################################
28
- # An example receiver class that processes all text found in a PDF file. All text that
29
- # is found will be printed to the IO object specified in the constructor.
30
- #
31
- # Usage:
32
- # receiver = PDF::Reader::TextReceiver.new($stdout)
33
- # PDF::Reader.file("somefile.pdf", receiver)
34
- #
35
- # DEPRECATED: this class was deprecated in version 0.11.0 and will
36
- # eventually be removed
37
- class TextReceiver
38
- ################################################################################
39
- # Initialize with the library user's receiver
40
- def initialize (main_receiver)
41
- @main_receiver = main_receiver
42
- @upper_corners = []
43
- end
44
- ################################################################################
45
- # Called when the document parsing begins
46
- def begin_document (root)
47
- @upper_corners = []
48
- end
49
- ################################################################################
50
- # Called when the document parsing ends
51
- def end_document
52
- @state.clear
53
- end
54
- ################################################################################
55
- def begin_page_container (page)
56
- @upper_corners.push(media_box_check(page))
57
- end
58
- ################################################################################
59
- def end_page_container
60
- @upper_corners.pop
61
- end
62
- ################################################################################
63
- # Called when new page parsing begins
64
- def begin_page (info)
65
- @page = info
66
-
67
- @state = [{
68
- :char_spacing => 0,
69
- :word_spacing => 0,
70
- :hori_scaling => 100,
71
- :leading => 0,
72
- :tj_adjustment => 0,
73
- }]
74
-
75
- @upper_corners.push(media_box_check(info))
76
-
77
- @output = []
78
- @line = 0
79
- @location = 0
80
- @displacement = {}
81
- @smallest_y_loc = @upper_corners.last[:ury]
82
- @written_to = false
83
- end
84
- ################################################################################
85
- # Called when page parsing ends
86
- def end_page
87
- @main_receiver << @output.join("\n")
88
- @upper_corners.pop
89
- end
90
- ################################################################################
91
- # PDF operator BT
92
- def begin_text_object
93
- @state.push(@state.last.dup)
94
- end
95
- ################################################################################
96
- # PDF operator ET
97
- def end_text_object
98
- @state.pop
99
- end
100
- ################################################################################
101
- # PDF operator Tm
102
- def set_text_matrix_and_text_line_matrix (*args)
103
- # these variable names look bad, but they're from the PDF spec
104
- a, b, c, d, e, f = *args
105
- calculate_line_and_location(f)
106
- end
107
- ################################################################################
108
- # PDF operator Tc
109
- def set_character_spacing (n)
110
- @state.last[:char_spacing] = n
111
- end
112
- ################################################################################
113
- # PDF operator Tw
114
- def set_word_spacing (n)
115
- @state.last[:word_spacing] = n
116
- end
117
- ################################################################################
118
- # PDF operator Tz
119
- def set_horizontal_text_scaling (n)
120
- @state.last[:hori_scaling] = n/100
121
- end
122
- ################################################################################
123
- # PDF operator TL
124
- def set_text_leading (n)
125
- @state.last[:leading] = n
126
- end
127
- ################################################################################
128
- # PDF operator T*
129
- def move_to_start_of_next_line
130
- move_text_position(0, @state.last[:leading])
131
- end
132
- ################################################################################
133
- # PDF operator Td
134
- def move_text_position (tx, ty)
135
- #puts "#{tx} #{ty} Td"
136
- calculate_line_and_location(@location + ty)
137
- end
138
- ################################################################################
139
- # PDF operator TD
140
- def move_text_position_and_set_leading (tx, ty)
141
- set_text_leading(ty)# * -1)
142
- move_text_position(tx, ty)
143
- end
144
- ################################################################################
145
- # PDF operator Tj
146
- def show_text (string)
147
- #puts "getting line #@line"
148
-
149
- place = (@output[@line] ||= "")
150
- #place << " " unless place.empty?
151
-
152
- place << " " * (@state.last[:tj_adjustment].abs/900) if @state.last[:tj_adjustment] < -1000
153
- place << string
154
-
155
- #puts "place is now: #{place}"
156
- @written_to = true
157
- end
158
- def super_show_text (string)
159
- urx = @upper_corners.last[:urx]/TS_UNITS_PER_H_CHAR
160
- ury = @upper_corners.last[:ury]/TS_UNITS_PER_V_CHAR
161
-
162
- x = (@tm[2,0]/TS_UNITS_PER_H_CHAR).to_i
163
- y = (ury - (@tm[2,1]/TS_UNITS_PER_V_CHAR)).to_i
164
-
165
- #puts "rendering '#{string}' to #{x}x#{y}"
166
-
167
- place = (@output[y] ||= (" " * urx.to_i))
168
- #puts "#{urx} #{place.size} #{string.size} #{x}"
169
- return if x+string.size >= urx
170
-
171
- string.split(//).each do |c|
172
- chars = 1
173
-
174
- case c
175
- when " "
176
- chars += @state.last[:word_spacing].to_i
177
- place[x-1, chars] = (" " * chars)
178
- else
179
- chars += @state.last[:char_spacing].to_i
180
- chars -= (@state.last[:tj_adjustment]/1000).to_i if @state.last[:tj_adjustment]
181
- chars = 1 if chars < 1
182
-
183
- place[x-1] = c
184
- place[x, chars-1] = (" " * (chars-1)) if chars > 1
185
- end
186
-
187
- x += chars
188
- end
189
-
190
- @tm += Matrix.rows([[1, 0, 0], [0, 1, 0], [x*TS_UNITS_PER_H_CHAR, y*TS_UNITS_PER_V_CHAR, 1]])
191
- end
192
- ################################################################################
193
- # PDF operator TJ
194
- def show_text_with_positioning (params)
195
- prev_adjustment = @state.last[:tj_adjustment]
196
-
197
- params.each do |p|
198
- case p
199
- when Float, Fixnum
200
- @state.last[:tj_adjustment] = p
201
- else
202
- show_text(p)
203
- end
204
- end
205
-
206
- @state.last[:tj_adjustment] = prev_adjustment
207
- end
208
- ################################################################################
209
- # PDF operator '
210
- def move_to_next_line_and_show_text (string)
211
- move_to_start_of_next_line
212
- show_text(string)
213
- end
214
- ################################################################################
215
- # PDF operator "
216
- def set_spacing_next_line_show_text (aw, ac, string)
217
- set_word_spacing(aw)
218
- set_character_spacing(ac)
219
- move_to_next_line_and_show_text(string)
220
- end
221
- ################################################################################
222
- def media_box_check (dict)
223
- corners = (@upper_corners.last || {:urx => 0, :ury => 0}).dup
224
-
225
- if dict.has_key?(:MediaBox)
226
- media_box = dict[:MediaBox]
227
- corners[:urx] = media_box[2] - media_box[0]
228
- corners[:ury] = media_box[3] - media_box[1]
229
- end
230
-
231
- corners
232
- end
233
- ################################################################################
234
- def calculate_line_and_location (new_loc)
235
- ##puts "calculate_line_and_location(#{new_loc})"
236
- key = new_loc; key.freeze
237
-
238
- #key = new_loc.to_s # because hashes with string keys are magic (auto-freeze)
239
-
240
- if @written_to
241
- unless @displacement.has_key?(key)
242
- if key < @location
243
- @displacement[key] = @line + 1
244
- elsif key < @smallest_y_loc
245
- @displacement[key] = @line + 1
246
- else
247
- key = @displacement.keys.find_all {|i| key > i}.sort.last
248
- @displacement[key] = 0 unless @displacement.has_key?(key)
249
- end
250
- end
251
- else
252
- @displacement[key] = 0
253
- end
254
-
255
- @smallest_y_loc = key if key < @smallest_y_loc
256
- @location = key
257
- @line = @displacement[key]
258
- #puts "calculate_line_and_location: @location=#@location @line=#@line smallest_y_loc=#@smallest_y_loc"
259
- end
260
- ################################################################################
261
- end
262
- ################################################################################
263
- end
264
- ################################################################################