pdf-reader 2.1.0 → 2.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +28 -1
  3. data/README.md +2 -2
  4. data/bin/pdf_callbacks +1 -1
  5. data/bin/pdf_text +1 -1
  6. data/lib/pdf-reader.rb +1 -0
  7. data/lib/pdf/reader.rb +2 -2
  8. data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
  9. data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
  10. data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
  11. data/lib/pdf/reader/afm/Courier.afm +342 -342
  12. data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
  13. data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
  14. data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
  15. data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
  16. data/lib/pdf/reader/afm/MustRead.html +19 -0
  17. data/lib/pdf/reader/afm/Symbol.afm +213 -213
  18. data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
  19. data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
  20. data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
  21. data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
  22. data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
  23. data/lib/pdf/reader/buffer.rb +12 -11
  24. data/lib/pdf/reader/cid_widths.rb +2 -0
  25. data/lib/pdf/reader/cmap.rb +22 -12
  26. data/lib/pdf/reader/encoding.rb +12 -9
  27. data/lib/pdf/reader/error.rb +1 -0
  28. data/lib/pdf/reader/filter.rb +1 -0
  29. data/lib/pdf/reader/filter/ascii85.rb +1 -0
  30. data/lib/pdf/reader/filter/ascii_hex.rb +2 -0
  31. data/lib/pdf/reader/filter/depredict.rb +1 -0
  32. data/lib/pdf/reader/filter/flate.rb +6 -4
  33. data/lib/pdf/reader/filter/lzw.rb +2 -0
  34. data/lib/pdf/reader/filter/null.rb +2 -0
  35. data/lib/pdf/reader/filter/run_length.rb +3 -1
  36. data/lib/pdf/reader/font.rb +11 -2
  37. data/lib/pdf/reader/font_descriptor.rb +1 -0
  38. data/lib/pdf/reader/form_xobject.rb +1 -0
  39. data/lib/pdf/reader/glyph_hash.rb +1 -0
  40. data/lib/pdf/reader/lzw.rb +2 -1
  41. data/lib/pdf/reader/null_security_handler.rb +1 -0
  42. data/lib/pdf/reader/object_cache.rb +1 -0
  43. data/lib/pdf/reader/object_hash.rb +22 -10
  44. data/lib/pdf/reader/object_stream.rb +1 -0
  45. data/lib/pdf/reader/orientation_detector.rb +5 -4
  46. data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
  47. data/lib/pdf/reader/page.rb +29 -0
  48. data/lib/pdf/reader/page_layout.rb +10 -5
  49. data/lib/pdf/reader/page_state.rb +10 -1
  50. data/lib/pdf/reader/page_text_receiver.rb +5 -1
  51. data/lib/pdf/reader/pages_strategy.rb +1 -0
  52. data/lib/pdf/reader/parser.rb +5 -4
  53. data/lib/pdf/reader/print_receiver.rb +1 -0
  54. data/lib/pdf/reader/reference.rb +1 -0
  55. data/lib/pdf/reader/register_receiver.rb +1 -0
  56. data/lib/pdf/reader/resource_methods.rb +1 -0
  57. data/lib/pdf/reader/standard_security_handler.rb +1 -0
  58. data/lib/pdf/reader/standard_security_handler_v5.rb +2 -0
  59. data/lib/pdf/reader/stream.rb +1 -0
  60. data/lib/pdf/reader/synchronized_cache.rb +1 -0
  61. data/lib/pdf/reader/text_run.rb +25 -0
  62. data/lib/pdf/reader/token.rb +1 -0
  63. data/lib/pdf/reader/transformation_matrix.rb +1 -0
  64. data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
  65. data/lib/pdf/reader/width_calculator.rb +1 -0
  66. data/lib/pdf/reader/width_calculator/built_in.rb +18 -1
  67. data/lib/pdf/reader/width_calculator/composite.rb +1 -0
  68. data/lib/pdf/reader/width_calculator/true_type.rb +2 -2
  69. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
  70. data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
  71. data/lib/pdf/reader/xref.rb +11 -5
  72. metadata +17 -13
  73. data/lib/pdf/hash.rb +0 -19
@@ -1,4 +1,7 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ require 'pdf/reader/overlapping_runs_filter'
2
5
 
3
6
  class PDF::Reader
4
7
 
@@ -14,13 +17,15 @@ class PDF::Reader
14
17
  def initialize(runs, mediabox)
15
18
  raise ArgumentError, "a mediabox must be provided" if mediabox.nil?
16
19
 
17
- @runs = merge_runs(runs)
20
+ @runs = merge_runs(OverlappingRunsFilter.exclude_redundant_runs(runs))
18
21
  @mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
19
22
  @mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
20
23
  @mean_glyph_width = mean(@runs.map(&:mean_character_width)) || 0
21
- @page_width = mediabox[2] - mediabox[0]
22
- @page_height = mediabox[3] - mediabox[1]
23
- @x_offset = @runs.map(&:x).sort.first
24
+ @page_width = (mediabox[2] - mediabox[0]).abs
25
+ @page_height = (mediabox[3] - mediabox[1]).abs
26
+ @x_offset = @runs.map(&:x).sort.first || 0
27
+ lowest_y = @runs.map(&:y).sort.first || 0
28
+ @y_offset = lowest_y > 0 ? 0 : lowest_y
24
29
  end
25
30
 
26
31
  def to_s
@@ -29,7 +34,7 @@ class PDF::Reader
29
34
  page = row_count.times.map { |i| " " * col_count }
30
35
  @runs.each do |run|
31
36
  x_pos = ((run.x - @x_offset) / col_multiplier).round
32
- y_pos = row_count - (run.y / row_multiplier).round
37
+ y_pos = row_count - ((run.y - @y_offset) / row_multiplier).round
33
38
  if y_pos <= row_count && y_pos >= 0 && x_pos <= col_count && x_pos >= 0
34
39
  local_string_insert(page[y_pos-1], run.text, x_pos)
35
40
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'pdf/reader/transformation_matrix'
4
5
 
@@ -29,7 +30,15 @@ class PDF::Reader
29
30
  @xobject_stack = [page.xobjects]
30
31
  @cs_stack = [page.color_spaces]
31
32
  @stack = [DEFAULT_GRAPHICS_STATE.dup]
32
- state[:ctm] = identity_matrix
33
+ if page.rotate == 0
34
+ state[:ctm] = identity_matrix
35
+ else
36
+ rotate_cos = Math.cos(page.rotate * (Math::PI/180.0)).round(2)
37
+ rotate_sin = Math.sin(page.rotate * (Math::PI/180.0)).round(2)
38
+ state[:ctm] = TransformationMatrix.new(rotate_cos, rotate_sin,
39
+ rotate_sin * -1, rotate_cos,
40
+ 0, 0)
41
+ end
33
42
  end
34
43
 
35
44
  #####################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'forwardable'
4
5
  require 'pdf/reader/page_layout'
@@ -43,10 +44,13 @@ module PDF
43
44
  @content = []
44
45
  @characters = []
45
46
  @mediabox = page.objects.deref(page.attributes[:MediaBox])
47
+ device_bl = @state.ctm_transform(@mediabox[0], @mediabox[1])
48
+ device_tr = @state.ctm_transform(@mediabox[2], @mediabox[3])
49
+ @device_mediabox = [ device_bl.first, device_bl.last, device_tr.first, device_tr.last]
46
50
  end
47
51
 
48
52
  def content
49
- PageLayout.new(@characters, @mediabox).to_s
53
+ PageLayout.new(@characters, @device_mediabox).to_s
50
54
  end
51
55
 
52
56
  #####################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  ################################################################################
4
5
  #
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  ################################################################################
4
5
  #
@@ -132,7 +133,7 @@ class PDF::Reader
132
133
  # reads a PDF name from the buffer and converts it to a Ruby Symbol
133
134
  def pdf_name
134
135
  tok = @buffer.token
135
- tok.gsub!(/#([A-Fa-f0-9]{2})/) do |match|
136
+ tok = tok.dup.gsub(/#([A-Fa-f0-9]{2})/) do |match|
136
137
  match[1, 2].hex.chr
137
138
  end
138
139
  tok.to_sym
@@ -154,7 +155,7 @@ class PDF::Reader
154
155
  ################################################################################
155
156
  # Reads a PDF hex string from the buffer and converts it to a Ruby String
156
157
  def hex_string
157
- str = ""
158
+ str = "".dup
158
159
 
159
160
  loop do
160
161
  token = @buffer.token
@@ -171,11 +172,11 @@ class PDF::Reader
171
172
  # Reads a PDF String from the buffer and converts it to a Ruby String
172
173
  def string
173
174
  str = @buffer.token
174
- return "".force_encoding("binary") if str == ")"
175
+ return "".dup.force_encoding("binary") if str == ")"
175
176
  Error.assert_equal(parse_token, ")")
176
177
 
177
178
  str.gsub!(/\\([nrtbf()\\\n]|\d{1,3})?|\r\n?|\n\r/m) do |match|
178
- MAPPING[match] || ""
179
+ MAPPING[match] || "".dup
179
180
  end
180
181
  str.force_encoding("binary")
181
182
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  class PDF::Reader
4
5
  # A simple receiver that prints all operaters and parameters in the content
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  ################################################################################
4
5
  #
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  # Copyright (C) 2010 James Healy (jimmy@deefa.com)
4
5
 
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  module PDF
4
5
  class Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  ################################################################################
4
5
  #
@@ -1,4 +1,6 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
3
+
2
4
  require 'digest'
3
5
  require 'openssl'
4
6
 
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  ################################################################################
4
5
  #
@@ -1,4 +1,5 @@
1
1
  # encoding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  # utilities.rb : General-purpose utility classes which don't fit anywhere else
4
5
  #
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  class PDF::Reader
4
5
  # A value object that represents one or more consecutive characters on a page.
@@ -37,6 +38,10 @@ class PDF::Reader
37
38
  @endx ||= x + width
38
39
  end
39
40
 
41
+ def endy
42
+ @endy ||= y + font_size
43
+ end
44
+
40
45
  def mean_character_width
41
46
  @width / character_count
42
47
  end
@@ -59,8 +64,28 @@ class PDF::Reader
59
64
  "#{text} w:#{width} f:#{font_size} @#{x},#{y}"
60
65
  end
61
66
 
67
+ def intersect?(other_run)
68
+ x <= other_run.endx && endx >= other_run.x &&
69
+ endy >= other_run.y && y <= other_run.endy
70
+ end
71
+
72
+ # return what percentage of this text run is overlapped by another run
73
+ def intersection_area_percent(other_run)
74
+ return 0 unless intersect?(other_run)
75
+
76
+ dx = [endx, other_run.endx].min - [x, other_run.x].max
77
+ dy = [endy, other_run.endy].min - [y, other_run.y].max
78
+ intersection_area = dx*dy
79
+
80
+ intersection_area.to_f / area
81
+ end
82
+
62
83
  private
63
84
 
85
+ def area
86
+ (endx - x) * (endy - y)
87
+ end
88
+
64
89
  def mergable_range
65
90
  @mergable_range ||= Range.new(endx - 3, endx + font_size)
66
91
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  ################################################################################
4
5
  #
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  class PDF::Reader
4
5
  # co-ordinate systems in PDF files are specified using a 3x3 matrix that looks
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  class PDF::Reader
4
5
 
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  # PDF files may define fonts in a number of ways. Each approach means we must
4
5
  # calculate glyph widths differently, so this set of classes conform to an
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'afm'
4
5
  require 'pdf/reader/synchronized_cache'
@@ -11,11 +12,20 @@ class PDF::Reader
11
12
  # see Section 9.6.2.2, PDF 32000-1:2008, pp 256
12
13
  class BuiltIn
13
14
 
15
+ BUILTINS = [
16
+ :Courier, :"Courier-Bold", :"Courier-BoldOblique", :"Courier-Oblique",
17
+ :Helvetica, :"Helvetica-Bold", :"Helvetica-BoldOblique", :"Helvetica-Oblique",
18
+ :Symbol,
19
+ :"Times-Roman", :"Times-Bold", :"Times-BoldItalic", :"Times-Italic",
20
+ :ZapfDingbats
21
+ ]
22
+
14
23
  def initialize(font)
15
24
  @font = font
16
25
  @@all_metrics ||= PDF::Reader::SynchronizedCache.new
17
26
 
18
- metrics_path = File.join(File.dirname(__FILE__), "..","afm","#{font.basefont}.afm")
27
+ basefont = extract_basefont(font.basefont)
28
+ metrics_path = File.join(File.dirname(__FILE__), "..","afm","#{basefont}.afm")
19
29
 
20
30
  if File.file?(metrics_path)
21
31
  @metrics = @@all_metrics[metrics_path] ||= AFM::Font.new(metrics_path)
@@ -53,6 +63,13 @@ class PDF::Reader
53
63
  @font.encoding.int_to_name(code_point).first.to_s[/\Acontrol..\Z/]
54
64
  end
55
65
 
66
+ def extract_basefont(font_name)
67
+ if BUILTINS.include?(font_name)
68
+ font_name
69
+ else
70
+ "Times-Roman"
71
+ end
72
+ end
56
73
  end
57
74
  end
58
75
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  class PDF::Reader
4
5
  module WidthCalculator
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  class PDF::Reader
4
5
  module WidthCalculator
@@ -17,8 +18,7 @@ class PDF::Reader
17
18
 
18
19
  def glyph_width(code_point)
19
20
  return 0 if code_point.nil? || code_point < 0
20
-
21
- glyph_width_from_font(code_point) || glyph_width_from_descriptor(code_point)
21
+ glyph_width_from_font(code_point) || glyph_width_from_descriptor(code_point) || 0
22
22
  end
23
23
 
24
24
  private
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  class PDF::Reader
4
5
  module WidthCalculator
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  class PDF::Reader
4
5
  module WidthCalculator
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  ################################################################################
4
5
  #
@@ -203,8 +204,10 @@ class PDF::Reader
203
204
  ("\x00" + bytes).unpack("N")[0]
204
205
  elsif bytes.size == 4
205
206
  bytes.unpack("N")[0]
207
+ elsif bytes.size == 8
208
+ bytes.unpack("Q>")[0]
206
209
  else
207
- raise UnsupportedFeatureError, "Unable to unpack xref stream entries with more than 4 bytes"
210
+ raise UnsupportedFeatureError, "Unable to unpack xref stream entries of #{bytes.size} bytes"
208
211
  end
209
212
  end
210
213
  ################################################################################
@@ -227,18 +230,21 @@ class PDF::Reader
227
230
  # should always be 0, but all sort of crazy junk is prefixed to PDF files
228
231
  # in the real world.
229
232
  #
230
- # Checks up to 50 chars into the file, returns nil if no PDF data detected.
233
+ # Checks up to 1024 chars into the file,
234
+ # returns nil if no PDF data detected.
235
+ # Adobe PDF 1.4 spec (3.4.1) 12. Acrobat viewers require only that the
236
+ # header appear somewhere within the first 1024 bytes of the file
231
237
  #
232
238
  def calc_junk_offset(io)
233
239
  io.rewind
234
240
  offset = io.pos
235
- until (c = io.readchar) == '%' || c == 37 || offset > 50
241
+ until (c = io.readchar) == '%' || c == 37 || offset > 1024
236
242
  offset += 1
237
243
  end
238
244
  io.rewind
239
- offset < 50 ? offset : nil
245
+ offset < 1024 ? offset : nil
240
246
  rescue EOFError
241
- return nil
247
+ nil
242
248
  end
243
249
  end
244
250
  ################################################################################
metadata CHANGED
@@ -1,29 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf-reader
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.0
4
+ version: 2.4.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Healy
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-02-15 00:00:00.000000000 Z
11
+ date: 2020-09-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ">="
17
+ - - "<"
18
18
  - !ruby/object:Gem::Version
19
- version: '0'
19
+ version: '13.0'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ">="
24
+ - - "<"
25
25
  - !ruby/object:Gem::Version
26
- version: '0'
26
+ version: '13.0'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rspec
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -67,7 +67,7 @@ dependencies:
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0.2'
69
69
  - !ruby/object:Gem::Dependency
70
- name: ir_b
70
+ name: pry
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
73
  - - ">="
@@ -167,7 +167,7 @@ dependencies:
167
167
  description: The PDF::Reader library implements a PDF parser conforming as much as
168
168
  possible to the PDF specification from Adobe
169
169
  email:
170
- - jimmy@deefa.com
170
+ - james@yob.id.au
171
171
  executables:
172
172
  - pdf_object
173
173
  - pdf_text
@@ -199,7 +199,6 @@ files:
199
199
  - examples/text.rb
200
200
  - examples/version.rb
201
201
  - lib/pdf-reader.rb
202
- - lib/pdf/hash.rb
203
202
  - lib/pdf/reader.rb
204
203
  - lib/pdf/reader/afm/Courier-Bold.afm
205
204
  - lib/pdf/reader/afm/Courier-BoldOblique.afm
@@ -209,6 +208,7 @@ files:
209
208
  - lib/pdf/reader/afm/Helvetica-BoldOblique.afm
210
209
  - lib/pdf/reader/afm/Helvetica-Oblique.afm
211
210
  - lib/pdf/reader/afm/Helvetica.afm
211
+ - lib/pdf/reader/afm/MustRead.html
212
212
  - lib/pdf/reader/afm/Symbol.afm
213
213
  - lib/pdf/reader/afm/Times-Bold.afm
214
214
  - lib/pdf/reader/afm/Times-BoldItalic.afm
@@ -246,6 +246,7 @@ files:
246
246
  - lib/pdf/reader/object_hash.rb
247
247
  - lib/pdf/reader/object_stream.rb
248
248
  - lib/pdf/reader/orientation_detector.rb
249
+ - lib/pdf/reader/overlapping_runs_filter.rb
249
250
  - lib/pdf/reader/page.rb
250
251
  - lib/pdf/reader/page_layout.rb
251
252
  - lib/pdf/reader/page_state.rb
@@ -271,10 +272,14 @@ files:
271
272
  - lib/pdf/reader/width_calculator/type_one_or_three.rb
272
273
  - lib/pdf/reader/width_calculator/type_zero.rb
273
274
  - lib/pdf/reader/xref.rb
274
- homepage: http://github.com/yob/pdf-reader
275
+ homepage: https://github.com/yob/pdf-reader
275
276
  licenses:
276
277
  - MIT
277
- metadata: {}
278
+ metadata:
279
+ bug_tracker_uri: https://github.com/yob/pdf-reader/issues
280
+ changelog_uri: https://github.com/yob/pdf-reader/blob/v2.4.1/CHANGELOG
281
+ documentation_uri: https://www.rubydoc.info/gems/pdf-reader/2.4.1
282
+ source_code_uri: https://github.com/yob/pdf-reader/tree/v2.4.1
278
283
  post_install_message:
279
284
  rdoc_options:
280
285
  - "--title"
@@ -295,8 +300,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
295
300
  - !ruby/object:Gem::Version
296
301
  version: '0'
297
302
  requirements: []
298
- rubyforge_project:
299
- rubygems_version: 2.7.3
303
+ rubygems_version: 3.0.3
300
304
  signing_key:
301
305
  specification_version: 4
302
306
  summary: A library for accessing the content of PDF files