pdf-reader 2.11.0 → 2.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +9 -0
- data/README.md +33 -33
- data/lib/pdf/reader/buffer.rb +4 -5
- data/lib/pdf/reader/cmap.rb +3 -3
- data/lib/pdf/reader/encoding.rb +2 -3
- data/lib/pdf/reader/font.rb +2 -2
- data/lib/pdf/reader/lzw.rb +1 -1
- data/lib/pdf/reader/object_hash.rb +1 -1
- data/lib/pdf/reader/pages_strategy.rb +1 -1
- data/lib/pdf/reader/parser.rb +1 -3
- data/lib/pdf/reader/width_calculator/built_in.rb +1 -1
- data/rbi/pdf-reader.rbi +2 -2
- metadata +20 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5db630726ada74d004eb928e8cf164d9a65070150bc99268bed3c3c22a4b16fd
|
4
|
+
data.tar.gz: 186960431832f9808e292e823a1b8cd3ccbe96bf89b7f8e6801b111b0899b690
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4cc29c3f7d3dd36ff55178c6825dab455bbfd9f3e3b62298dac03a835c741ffeebaa1959f3b3ceba19c82fe8a516acad554ad41e5142bd4a8c75a9725857fc96
|
7
|
+
data.tar.gz: a89f8815c83d6f89bc51e3aa232776d6d365eb45f0cbfd01ae9de157390144c9ff8bdbdf3e1359048612d3febeffbfd77a01d0c2b08da0b53dad64b6290f6292
|
data/CHANGELOG
CHANGED
@@ -1,3 +1,12 @@
|
|
1
|
+
v2.13.0 (2nd November 2024)
|
2
|
+
- Permit Ascii85 v1.0 and v2.0 (https://github.com/yob/pdf-reader/pull/539)
|
3
|
+
- Allow StringIO type for PDF::Reader input (https://github.com/yob/pdf-reader/pull/535)
|
4
|
+
|
5
|
+
v2.12.0 (26th December 2023)
|
6
|
+
- Fix a sorbet method signature (http://github.com/yob/pdf-reader/pull/512)
|
7
|
+
- Reduce allocations when parsing PDFs with hex strings (http://github.com/yob/pdf-reader/pull/528)
|
8
|
+
- Fix text extraction of some rare unicode codepoints (http://github.com/yob/pdf-reader/pull/529)
|
9
|
+
|
1
10
|
v2.11.0 (26th October 2022)
|
2
11
|
- Various bug fixes
|
3
12
|
- Expanded sorbet type annotations
|
data/README.md
CHANGED
@@ -20,7 +20,7 @@ page.
|
|
20
20
|
The recommended installation method is via Rubygems.
|
21
21
|
|
22
22
|
```ruby
|
23
|
-
|
23
|
+
gem install pdf-reader
|
24
24
|
```
|
25
25
|
|
26
26
|
# Usage
|
@@ -30,23 +30,23 @@ level information (metadata, page count, bookmarks, etc) is available via
|
|
30
30
|
this object.
|
31
31
|
|
32
32
|
```ruby
|
33
|
-
|
33
|
+
reader = PDF::Reader.new("somefile.pdf")
|
34
34
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
35
|
+
puts reader.pdf_version
|
36
|
+
puts reader.info
|
37
|
+
puts reader.metadata
|
38
|
+
puts reader.page_count
|
39
39
|
```
|
40
40
|
|
41
41
|
PDF::Reader.new accepts an IO stream or a filename. Here's an example with
|
42
42
|
an IO stream:
|
43
43
|
|
44
44
|
```ruby
|
45
|
-
|
45
|
+
require 'open-uri'
|
46
46
|
|
47
|
-
|
48
|
-
|
49
|
-
|
47
|
+
io = open('http://example.com/somefile.pdf')
|
48
|
+
reader = PDF::Reader.new(io)
|
49
|
+
puts reader.info
|
50
50
|
```
|
51
51
|
|
52
52
|
If you open a PDF with File#open or IO#open, I strongly recommend using "rb"
|
@@ -54,47 +54,47 @@ mode to ensure the file isn't mangled by ruby being 'helpful'. This is
|
|
54
54
|
particularly important on windows and MRI >= 1.9.2.
|
55
55
|
|
56
56
|
```ruby
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
57
|
+
File.open("somefile.pdf", "rb") do |io|
|
58
|
+
reader = PDF::Reader.new(io)
|
59
|
+
puts reader.info
|
60
|
+
end
|
61
61
|
```
|
62
62
|
|
63
63
|
PDF is a page based file format, so most visible information is available via
|
64
64
|
page-based iteration
|
65
65
|
|
66
66
|
```ruby
|
67
|
-
|
67
|
+
reader = PDF::Reader.new("somefile.pdf")
|
68
68
|
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
69
|
+
reader.pages.each do |page|
|
70
|
+
puts page.fonts
|
71
|
+
puts page.text
|
72
|
+
puts page.raw_content
|
73
|
+
end
|
74
74
|
```
|
75
75
|
|
76
76
|
If you need to access the full program for rendering a page, use the walk() method
|
77
77
|
of PDF::Reader::Page.
|
78
78
|
|
79
79
|
```ruby
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
80
|
+
class RedGreenBlue
|
81
|
+
def set_rgb_color_for_nonstroking(r, g, b)
|
82
|
+
puts "R: #{r}, G: #{g}, B: #{b}"
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
reader = PDF::Reader.new("somefile.pdf")
|
87
|
+
page = reader.page(1)
|
88
|
+
receiver = RedGreenBlue.new
|
89
|
+
page.walk(receiver)
|
90
90
|
```
|
91
91
|
|
92
92
|
For low level access to the objects in a PDF file, use the ObjectHash class like
|
93
93
|
so:
|
94
94
|
|
95
95
|
```ruby
|
96
|
-
|
97
|
-
|
96
|
+
reader = PDF::Reader.new("somefile.pdf")
|
97
|
+
puts reader.objects.inspect
|
98
98
|
```
|
99
99
|
|
100
100
|
# Text Encoding
|
@@ -141,7 +141,7 @@ the spec folder when you checkout a branch from Git.
|
|
141
141
|
To remove any invalid CRLF characters added while checking out a branch from Git, run:
|
142
142
|
|
143
143
|
```ruby
|
144
|
-
|
144
|
+
rake fix_integrity
|
145
145
|
```
|
146
146
|
|
147
147
|
# Maintainers
|
data/lib/pdf/reader/buffer.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
# coding: ASCII-8BIT
|
2
|
-
# typed:
|
2
|
+
# typed: true
|
3
3
|
# frozen_string_literal: true
|
4
4
|
|
5
5
|
################################################################################
|
@@ -300,13 +300,12 @@ class PDF::Reader
|
|
300
300
|
# we find a closing >
|
301
301
|
#
|
302
302
|
def prepare_hex_token
|
303
|
-
finished = :false
|
304
303
|
str = "".dup
|
305
304
|
|
306
|
-
|
305
|
+
loop do
|
307
306
|
byte = @io.getbyte
|
308
307
|
if byte.nil?
|
309
|
-
|
308
|
+
break
|
310
309
|
elsif (48..57).include?(byte) || (65..90).include?(byte) || (97..122).include?(byte)
|
311
310
|
str << byte
|
312
311
|
elsif byte <= 32
|
@@ -315,7 +314,7 @@ class PDF::Reader
|
|
315
314
|
@tokens << str if str.size > 0
|
316
315
|
@tokens << ">" if byte != 0x3E # '>'
|
317
316
|
@tokens << byte.chr
|
318
|
-
|
317
|
+
break
|
319
318
|
end
|
320
319
|
end
|
321
320
|
end
|
data/lib/pdf/reader/cmap.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
-
# typed:
|
2
|
+
# typed: true
|
3
3
|
# frozen_string_literal: true
|
4
4
|
|
5
5
|
################################################################################
|
@@ -118,8 +118,8 @@ class PDF::Reader
|
|
118
118
|
result = []
|
119
119
|
while unpacked_string.any? do
|
120
120
|
if unpacked_string.size >= 2 &&
|
121
|
-
unpacked_string.first.to_i
|
122
|
-
unpacked_string.first.to_i
|
121
|
+
unpacked_string.first.to_i >= 0xD800 &&
|
122
|
+
unpacked_string.first.to_i <= 0xDBFF
|
123
123
|
# this is a Unicode UTF-16 "Surrogate Pair" see Unicode Spec. Chapter 3.7
|
124
124
|
# let's convert to UTF-32. (the high surrogate is between 0xD800-0xDBFF, the
|
125
125
|
# low surrogate is between 0xDC00-0xDFFF) for example: U+1D44E (U+D835 U+DC4E)
|
data/lib/pdf/reader/encoding.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
-
# typed:
|
2
|
+
# typed: true
|
3
3
|
# frozen_string_literal: true
|
4
4
|
|
5
5
|
################################################################################
|
@@ -119,7 +119,7 @@ class PDF::Reader
|
|
119
119
|
# => [:A]
|
120
120
|
#
|
121
121
|
def int_to_name(glyph_code)
|
122
|
-
if @enc_name == "Identity-H" || @enc_name == "Identity-V"
|
122
|
+
if @enc_name == :"Identity-H" || @enc_name == :"Identity-V"
|
123
123
|
[]
|
124
124
|
elsif differences[glyph_code]
|
125
125
|
[differences[glyph_code]]
|
@@ -143,7 +143,6 @@ class PDF::Reader
|
|
143
143
|
CONTROL_CHARS.include?(i) ? [i, UNKNOWN_CHAR] : [i,i]
|
144
144
|
}
|
145
145
|
mapping = Hash[tuples]
|
146
|
-
mapping[nil] = UNKNOWN_CHAR
|
147
146
|
mapping
|
148
147
|
end
|
149
148
|
|
data/lib/pdf/reader/font.rb
CHANGED
@@ -82,8 +82,8 @@ class PDF::Reader
|
|
82
82
|
glyph_width_in_glyph_space = glyph_width(code_point)
|
83
83
|
|
84
84
|
if @subtype == :Type3
|
85
|
-
x1,
|
86
|
-
x2,
|
85
|
+
x1, _y1 = font_matrix_transform(0,0)
|
86
|
+
x2, _y2 = font_matrix_transform(glyph_width_in_glyph_space, 0)
|
87
87
|
(x2 - x1).abs.round(2)
|
88
88
|
else
|
89
89
|
glyph_width_in_glyph_space / 1000.0
|
data/lib/pdf/reader/lzw.rb
CHANGED
@@ -42,7 +42,7 @@ module PDF
|
|
42
42
|
while bits_left_in_chunk > 0 and @current_pos < @data.size
|
43
43
|
chunk = 0 if chunk < 0
|
44
44
|
codepoint = @data[@current_pos, 1].to_s.unpack("C*")[0].to_i
|
45
|
-
current_byte = codepoint & (2**@bits_left_in_byte - 1) #clear consumed bits
|
45
|
+
current_byte = codepoint & (2**@bits_left_in_byte - 1).to_i #clear consumed bits
|
46
46
|
dif = bits_left_in_chunk - @bits_left_in_byte
|
47
47
|
if dif > 0 then current_byte <<= dif
|
48
48
|
elsif dif < 0 then current_byte >>= dif.abs
|
data/lib/pdf/reader/parser.rb
CHANGED
@@ -173,9 +173,7 @@ class PDF::Reader
|
|
173
173
|
|
174
174
|
# add a missing digit if required, as required by the spec
|
175
175
|
str << "0" unless str.size % 2 == 0
|
176
|
-
str.
|
177
|
-
nibbles.join("").hex.chr
|
178
|
-
}.join.force_encoding("binary")
|
176
|
+
[str].pack('H*')
|
179
177
|
end
|
180
178
|
################################################################################
|
181
179
|
# Reads a PDF String from the buffer and converts it to a Ruby String
|
data/rbi/pdf-reader.rbi
CHANGED
@@ -4,7 +4,7 @@ module PDF
|
|
4
4
|
sig { returns(PDF::Reader::ObjectHash) }
|
5
5
|
attr_reader :objects
|
6
6
|
|
7
|
-
sig { params(input: T.any(String, Tempfile, IO), opts: T::Hash[T.untyped, T.untyped]).void }
|
7
|
+
sig { params(input: T.any(String, Tempfile, IO, StringIO), opts: T::Hash[T.untyped, T.untyped]).void }
|
8
8
|
def initialize(input, opts = {})
|
9
9
|
@cache = T.let(T.unsafe(nil), PDF::Reader::ObjectCache)
|
10
10
|
@objects = T.let(T.unsafe(nil), PDF::Reader::ObjectHash)
|
@@ -842,7 +842,7 @@ module PDF
|
|
842
842
|
sig { params(runs: T::Array[PDF::Reader::TextRun]).returns(T::Array[PDF::Reader::TextRun]) }
|
843
843
|
def self.exclude_redundant_runs(runs); end
|
844
844
|
|
845
|
-
sig { params(sweep_line_status: T::Array[PDF::Reader::TextRun], event_point: EventPoint).returns(T::Boolean) }
|
845
|
+
sig { params(sweep_line_status: T::Array[PDF::Reader::TextRun], event_point: PDF::Reader::EventPoint).returns(T::Boolean) }
|
846
846
|
def self.detect_intersection(sweep_line_status, event_point); end
|
847
847
|
end
|
848
848
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf-reader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.
|
4
|
+
version: 2.13.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Healy
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-11-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
@@ -98,16 +98,28 @@ dependencies:
|
|
98
98
|
name: Ascii85
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
100
100
|
requirements:
|
101
|
-
- - "
|
101
|
+
- - ">="
|
102
102
|
- !ruby/object:Gem::Version
|
103
103
|
version: '1.0'
|
104
|
+
- - "<"
|
105
|
+
- !ruby/object:Gem::Version
|
106
|
+
version: '3.0'
|
107
|
+
- - "!="
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: 2.0.0
|
104
110
|
type: :runtime
|
105
111
|
prerelease: false
|
106
112
|
version_requirements: !ruby/object:Gem::Requirement
|
107
113
|
requirements:
|
108
|
-
- - "
|
114
|
+
- - ">="
|
109
115
|
- !ruby/object:Gem::Version
|
110
116
|
version: '1.0'
|
117
|
+
- - "<"
|
118
|
+
- !ruby/object:Gem::Version
|
119
|
+
version: '3.0'
|
120
|
+
- - "!="
|
121
|
+
- !ruby/object:Gem::Version
|
122
|
+
version: 2.0.0
|
111
123
|
- !ruby/object:Gem::Dependency
|
112
124
|
name: ruby-rc4
|
113
125
|
requirement: !ruby/object:Gem::Requirement
|
@@ -289,9 +301,9 @@ licenses:
|
|
289
301
|
- MIT
|
290
302
|
metadata:
|
291
303
|
bug_tracker_uri: https://github.com/yob/pdf-reader/issues
|
292
|
-
changelog_uri: https://github.com/yob/pdf-reader/blob/v2.
|
293
|
-
documentation_uri: https://www.rubydoc.info/gems/pdf-reader/2.
|
294
|
-
source_code_uri: https://github.com/yob/pdf-reader/tree/v2.
|
304
|
+
changelog_uri: https://github.com/yob/pdf-reader/blob/v2.13.0/CHANGELOG
|
305
|
+
documentation_uri: https://www.rubydoc.info/gems/pdf-reader/2.13.0
|
306
|
+
source_code_uri: https://github.com/yob/pdf-reader/tree/v2.13.0
|
295
307
|
post_install_message:
|
296
308
|
rdoc_options:
|
297
309
|
- "--title"
|
@@ -312,7 +324,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
312
324
|
- !ruby/object:Gem::Version
|
313
325
|
version: '0'
|
314
326
|
requirements: []
|
315
|
-
rubygems_version: 3.
|
327
|
+
rubygems_version: 3.4.10
|
316
328
|
signing_key:
|
317
329
|
specification_version: 4
|
318
330
|
summary: A library for accessing the content of PDF files
|