pdf-reader 2.11.0 → 2.13.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG +9 -0
- data/README.md +33 -33
- data/lib/pdf/reader/buffer.rb +4 -5
- data/lib/pdf/reader/cmap.rb +3 -3
- data/lib/pdf/reader/encoding.rb +2 -3
- data/lib/pdf/reader/font.rb +2 -2
- data/lib/pdf/reader/lzw.rb +1 -1
- data/lib/pdf/reader/object_hash.rb +1 -1
- data/lib/pdf/reader/pages_strategy.rb +1 -1
- data/lib/pdf/reader/parser.rb +1 -3
- data/lib/pdf/reader/width_calculator/built_in.rb +1 -1
- data/rbi/pdf-reader.rbi +2 -2
- metadata +20 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5db630726ada74d004eb928e8cf164d9a65070150bc99268bed3c3c22a4b16fd
|
4
|
+
data.tar.gz: 186960431832f9808e292e823a1b8cd3ccbe96bf89b7f8e6801b111b0899b690
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4cc29c3f7d3dd36ff55178c6825dab455bbfd9f3e3b62298dac03a835c741ffeebaa1959f3b3ceba19c82fe8a516acad554ad41e5142bd4a8c75a9725857fc96
|
7
|
+
data.tar.gz: a89f8815c83d6f89bc51e3aa232776d6d365eb45f0cbfd01ae9de157390144c9ff8bdbdf3e1359048612d3febeffbfd77a01d0c2b08da0b53dad64b6290f6292
|
data/CHANGELOG
CHANGED
@@ -1,3 +1,12 @@
|
|
1
|
+
v2.13.0 (2nd November 2024)
|
2
|
+
- Permit Ascii85 v1.0 and v2.0 (https://github.com/yob/pdf-reader/pull/539)
|
3
|
+
- Allow StringIO type for PDF::Reader input (https://github.com/yob/pdf-reader/pull/535)
|
4
|
+
|
5
|
+
v2.12.0 (26th December 2023)
|
6
|
+
- Fix a sorbet method signature (http://github.com/yob/pdf-reader/pull/512)
|
7
|
+
- Reduce allocations when parsing PDFs with hex strings (http://github.com/yob/pdf-reader/pull/528)
|
8
|
+
- Fix text extraction of some rare unicode codepoints (http://github.com/yob/pdf-reader/pull/529)
|
9
|
+
|
1
10
|
v2.11.0 (26th October 2022)
|
2
11
|
- Various bug fixes
|
3
12
|
- Expanded sorbet type annotations
|
data/README.md
CHANGED
@@ -20,7 +20,7 @@ page.
|
|
20
20
|
The recommended installation method is via Rubygems.
|
21
21
|
|
22
22
|
```ruby
|
23
|
-
|
23
|
+
gem install pdf-reader
|
24
24
|
```
|
25
25
|
|
26
26
|
# Usage
|
@@ -30,23 +30,23 @@ level information (metadata, page count, bookmarks, etc) is available via
|
|
30
30
|
this object.
|
31
31
|
|
32
32
|
```ruby
|
33
|
-
|
33
|
+
reader = PDF::Reader.new("somefile.pdf")
|
34
34
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
35
|
+
puts reader.pdf_version
|
36
|
+
puts reader.info
|
37
|
+
puts reader.metadata
|
38
|
+
puts reader.page_count
|
39
39
|
```
|
40
40
|
|
41
41
|
PDF::Reader.new accepts an IO stream or a filename. Here's an example with
|
42
42
|
an IO stream:
|
43
43
|
|
44
44
|
```ruby
|
45
|
-
|
45
|
+
require 'open-uri'
|
46
46
|
|
47
|
-
|
48
|
-
|
49
|
-
|
47
|
+
io = open('http://example.com/somefile.pdf')
|
48
|
+
reader = PDF::Reader.new(io)
|
49
|
+
puts reader.info
|
50
50
|
```
|
51
51
|
|
52
52
|
If you open a PDF with File#open or IO#open, I strongly recommend using "rb"
|
@@ -54,47 +54,47 @@ mode to ensure the file isn't mangled by ruby being 'helpful'. This is
|
|
54
54
|
particularly important on Windows and MRI >= 1.9.2.
|
55
55
|
|
56
56
|
```ruby
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
57
|
+
File.open("somefile.pdf", "rb") do |io|
|
58
|
+
reader = PDF::Reader.new(io)
|
59
|
+
puts reader.info
|
60
|
+
end
|
61
61
|
```
|
62
62
|
|
63
63
|
PDF is a page based file format, so most visible information is available via
|
64
64
|
page-based iteration
|
65
65
|
|
66
66
|
```ruby
|
67
|
-
|
67
|
+
reader = PDF::Reader.new("somefile.pdf")
|
68
68
|
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
69
|
+
reader.pages.each do |page|
|
70
|
+
puts page.fonts
|
71
|
+
puts page.text
|
72
|
+
puts page.raw_content
|
73
|
+
end
|
74
74
|
```
|
75
75
|
|
76
76
|
If you need to access the full program for rendering a page, use the walk() method
|
77
77
|
of PDF::Reader::Page.
|
78
78
|
|
79
79
|
```ruby
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
80
|
+
class RedGreenBlue
|
81
|
+
def set_rgb_color_for_nonstroking(r, g, b)
|
82
|
+
puts "R: #{r}, G: #{g}, B: #{b}"
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
reader = PDF::Reader.new("somefile.pdf")
|
87
|
+
page = reader.page(1)
|
88
|
+
receiver = RedGreenBlue.new
|
89
|
+
page.walk(receiver)
|
90
90
|
```
|
91
91
|
|
92
92
|
For low level access to the objects in a PDF file, use the ObjectHash class like
|
93
93
|
so:
|
94
94
|
|
95
95
|
```ruby
|
96
|
-
|
97
|
-
|
96
|
+
reader = PDF::Reader.new("somefile.pdf")
|
97
|
+
puts reader.objects.inspect
|
98
98
|
```
|
99
99
|
|
100
100
|
# Text Encoding
|
@@ -141,7 +141,7 @@ the spec folder when you checkout a branch from Git.
|
|
141
141
|
To remove any invalid CRLF characters added while checking out a branch from Git, run:
|
142
142
|
|
143
143
|
```ruby
|
144
|
-
|
144
|
+
rake fix_integrity
|
145
145
|
```
|
146
146
|
|
147
147
|
# Maintainers
|
data/lib/pdf/reader/buffer.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
# coding: ASCII-8BIT
|
2
|
-
# typed:
|
2
|
+
# typed: true
|
3
3
|
# frozen_string_literal: true
|
4
4
|
|
5
5
|
################################################################################
|
@@ -300,13 +300,12 @@ class PDF::Reader
|
|
300
300
|
# we find a closing >
|
301
301
|
#
|
302
302
|
def prepare_hex_token
|
303
|
-
finished = :false
|
304
303
|
str = "".dup
|
305
304
|
|
306
|
-
|
305
|
+
loop do
|
307
306
|
byte = @io.getbyte
|
308
307
|
if byte.nil?
|
309
|
-
|
308
|
+
break
|
310
309
|
elsif (48..57).include?(byte) || (65..90).include?(byte) || (97..122).include?(byte)
|
311
310
|
str << byte
|
312
311
|
elsif byte <= 32
|
@@ -315,7 +314,7 @@ class PDF::Reader
|
|
315
314
|
@tokens << str if str.size > 0
|
316
315
|
@tokens << ">" if byte != 0x3E # '>'
|
317
316
|
@tokens << byte.chr
|
318
|
-
|
317
|
+
break
|
319
318
|
end
|
320
319
|
end
|
321
320
|
end
|
data/lib/pdf/reader/cmap.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
-
# typed:
|
2
|
+
# typed: true
|
3
3
|
# frozen_string_literal: true
|
4
4
|
|
5
5
|
################################################################################
|
@@ -118,8 +118,8 @@ class PDF::Reader
|
|
118
118
|
result = []
|
119
119
|
while unpacked_string.any? do
|
120
120
|
if unpacked_string.size >= 2 &&
|
121
|
-
unpacked_string.first.to_i
|
122
|
-
unpacked_string.first.to_i
|
121
|
+
unpacked_string.first.to_i >= 0xD800 &&
|
122
|
+
unpacked_string.first.to_i <= 0xDBFF
|
123
123
|
# this is a Unicode UTF-16 "Surrogate Pair" see Unicode Spec. Chapter 3.7
|
124
124
|
# let's convert to UTF-32. (the high surrogate is between 0xD800-0xDBFF, the
|
125
125
|
# low surrogate is between 0xDC00-0xDFFF) for example: U+1D44E (U+D835 U+DC4E)
|
data/lib/pdf/reader/encoding.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
-
# typed:
|
2
|
+
# typed: true
|
3
3
|
# frozen_string_literal: true
|
4
4
|
|
5
5
|
################################################################################
|
@@ -119,7 +119,7 @@ class PDF::Reader
|
|
119
119
|
# => [:A]
|
120
120
|
#
|
121
121
|
def int_to_name(glyph_code)
|
122
|
-
if @enc_name == "Identity-H" || @enc_name == "Identity-V"
|
122
|
+
if @enc_name == :"Identity-H" || @enc_name == :"Identity-V"
|
123
123
|
[]
|
124
124
|
elsif differences[glyph_code]
|
125
125
|
[differences[glyph_code]]
|
@@ -143,7 +143,6 @@ class PDF::Reader
|
|
143
143
|
CONTROL_CHARS.include?(i) ? [i, UNKNOWN_CHAR] : [i,i]
|
144
144
|
}
|
145
145
|
mapping = Hash[tuples]
|
146
|
-
mapping[nil] = UNKNOWN_CHAR
|
147
146
|
mapping
|
148
147
|
end
|
149
148
|
|
data/lib/pdf/reader/font.rb
CHANGED
@@ -82,8 +82,8 @@ class PDF::Reader
|
|
82
82
|
glyph_width_in_glyph_space = glyph_width(code_point)
|
83
83
|
|
84
84
|
if @subtype == :Type3
|
85
|
-
x1,
|
86
|
-
x2,
|
85
|
+
x1, _y1 = font_matrix_transform(0,0)
|
86
|
+
x2, _y2 = font_matrix_transform(glyph_width_in_glyph_space, 0)
|
87
87
|
(x2 - x1).abs.round(2)
|
88
88
|
else
|
89
89
|
glyph_width_in_glyph_space / 1000.0
|
data/lib/pdf/reader/lzw.rb
CHANGED
@@ -42,7 +42,7 @@ module PDF
|
|
42
42
|
while bits_left_in_chunk > 0 and @current_pos < @data.size
|
43
43
|
chunk = 0 if chunk < 0
|
44
44
|
codepoint = @data[@current_pos, 1].to_s.unpack("C*")[0].to_i
|
45
|
-
current_byte = codepoint & (2**@bits_left_in_byte - 1) #clear consumed bits
|
45
|
+
current_byte = codepoint & (2**@bits_left_in_byte - 1).to_i #clear consumed bits
|
46
46
|
dif = bits_left_in_chunk - @bits_left_in_byte
|
47
47
|
if dif > 0 then current_byte <<= dif
|
48
48
|
elsif dif < 0 then current_byte >>= dif.abs
|
data/lib/pdf/reader/parser.rb
CHANGED
@@ -173,9 +173,7 @@ class PDF::Reader
|
|
173
173
|
|
174
174
|
# add a missing digit if required, as required by the spec
|
175
175
|
str << "0" unless str.size % 2 == 0
|
176
|
-
str.
|
177
|
-
nibbles.join("").hex.chr
|
178
|
-
}.join.force_encoding("binary")
|
176
|
+
[str].pack('H*')
|
179
177
|
end
|
180
178
|
################################################################################
|
181
179
|
# Reads a PDF String from the buffer and converts it to a Ruby String
|
data/rbi/pdf-reader.rbi
CHANGED
@@ -4,7 +4,7 @@ module PDF
|
|
4
4
|
sig { returns(PDF::Reader::ObjectHash) }
|
5
5
|
attr_reader :objects
|
6
6
|
|
7
|
-
sig { params(input: T.any(String, Tempfile, IO), opts: T::Hash[T.untyped, T.untyped]).void }
|
7
|
+
sig { params(input: T.any(String, Tempfile, IO, StringIO), opts: T::Hash[T.untyped, T.untyped]).void }
|
8
8
|
def initialize(input, opts = {})
|
9
9
|
@cache = T.let(T.unsafe(nil), PDF::Reader::ObjectCache)
|
10
10
|
@objects = T.let(T.unsafe(nil), PDF::Reader::ObjectHash)
|
@@ -842,7 +842,7 @@ module PDF
|
|
842
842
|
sig { params(runs: T::Array[PDF::Reader::TextRun]).returns(T::Array[PDF::Reader::TextRun]) }
|
843
843
|
def self.exclude_redundant_runs(runs); end
|
844
844
|
|
845
|
-
sig { params(sweep_line_status: T::Array[PDF::Reader::TextRun], event_point: EventPoint).returns(T::Boolean) }
|
845
|
+
sig { params(sweep_line_status: T::Array[PDF::Reader::TextRun], event_point: PDF::Reader::EventPoint).returns(T::Boolean) }
|
846
846
|
def self.detect_intersection(sweep_line_status, event_point); end
|
847
847
|
end
|
848
848
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf-reader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.
|
4
|
+
version: 2.13.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Healy
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-11-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
@@ -98,16 +98,28 @@ dependencies:
|
|
98
98
|
name: Ascii85
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
100
100
|
requirements:
|
101
|
-
- - "
|
101
|
+
- - ">="
|
102
102
|
- !ruby/object:Gem::Version
|
103
103
|
version: '1.0'
|
104
|
+
- - "<"
|
105
|
+
- !ruby/object:Gem::Version
|
106
|
+
version: '3.0'
|
107
|
+
- - "!="
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: 2.0.0
|
104
110
|
type: :runtime
|
105
111
|
prerelease: false
|
106
112
|
version_requirements: !ruby/object:Gem::Requirement
|
107
113
|
requirements:
|
108
|
-
- - "
|
114
|
+
- - ">="
|
109
115
|
- !ruby/object:Gem::Version
|
110
116
|
version: '1.0'
|
117
|
+
- - "<"
|
118
|
+
- !ruby/object:Gem::Version
|
119
|
+
version: '3.0'
|
120
|
+
- - "!="
|
121
|
+
- !ruby/object:Gem::Version
|
122
|
+
version: 2.0.0
|
111
123
|
- !ruby/object:Gem::Dependency
|
112
124
|
name: ruby-rc4
|
113
125
|
requirement: !ruby/object:Gem::Requirement
|
@@ -289,9 +301,9 @@ licenses:
|
|
289
301
|
- MIT
|
290
302
|
metadata:
|
291
303
|
bug_tracker_uri: https://github.com/yob/pdf-reader/issues
|
292
|
-
changelog_uri: https://github.com/yob/pdf-reader/blob/v2.
|
293
|
-
documentation_uri: https://www.rubydoc.info/gems/pdf-reader/2.
|
294
|
-
source_code_uri: https://github.com/yob/pdf-reader/tree/v2.
|
304
|
+
changelog_uri: https://github.com/yob/pdf-reader/blob/v2.13.0/CHANGELOG
|
305
|
+
documentation_uri: https://www.rubydoc.info/gems/pdf-reader/2.13.0
|
306
|
+
source_code_uri: https://github.com/yob/pdf-reader/tree/v2.13.0
|
295
307
|
post_install_message:
|
296
308
|
rdoc_options:
|
297
309
|
- "--title"
|
@@ -312,7 +324,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
312
324
|
- !ruby/object:Gem::Version
|
313
325
|
version: '0'
|
314
326
|
requirements: []
|
315
|
-
rubygems_version: 3.
|
327
|
+
rubygems_version: 3.4.10
|
316
328
|
signing_key:
|
317
329
|
specification_version: 4
|
318
330
|
summary: A library for accessing the content of PDF files
|