pdf-reader 2.13.0 → 2.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +7 -0
- data/Rakefile +1 -1
- data/lib/pdf/reader/advanced_text_run_filter.rb +137 -0
- data/lib/pdf/reader/buffer.rb +2 -2
- data/lib/pdf/reader/encoding.rb +1 -1
- data/lib/pdf/reader/page_text_receiver.rb +8 -0
- data/lib/pdf/reader.rb +1 -0
- data/rbi/pdf-reader.rbi +60 -14
- metadata +8 -10
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 38765d176ae7b8f4cff7ea6f10fff00b811f6812629d76a2b966f36139c23188
|
4
|
+
data.tar.gz: a406d525e4fccb84cc9e86b28aab06a12854c6f0f297a1a479d26b3f845267f6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 010c16b1528d4c46d0175737c9694e2e326092b5e7091cbdd0e0ca41567e662b1adabe989c33b0b919a021bee9f985fa4f2862058bd144762c090e718b3089cc
|
7
|
+
data.tar.gz: 996fe5b0761280edd67c5523d00c04519b7c682c5ededd86d8dfd412df6e11d554d162ab5b4eb231709f4d3013c5963129b32358ef0b49a4521e8ba72dcf490b
|
data/CHANGELOG
CHANGED
@@ -1,3 +1,10 @@
|
|
1
|
+
v2.14.1 (4th February 2025)
|
2
|
+
- Fix issue in RBI signatures, introduced in v2.14.0 (https://github.com/yob/pdf-reader/pull/550)
|
3
|
+
|
4
|
+
v2.14.0 (29th January 2025)
|
5
|
+
- Raise minimum supported ruby to 2.1 (https://github.com/yob/pdf-reader/pull/543)
|
6
|
+
- Add support for filtering to Page#text (https://github.com/yob/pdf-reader/pull/545)
|
7
|
+
|
1
8
|
v2.13.0 (2nd November 2024)
|
2
9
|
- Permit Ascii86 v1.0 and v2.0 (https://github.com/yob/pdf-reader/pull/539)
|
3
10
|
- Allow StringIO type for PDF::Reader input (https://github.com/yob/pdf-reader/pull/535)
|
data/Rakefile
CHANGED
@@ -41,7 +41,7 @@ end
|
|
41
41
|
desc "Create a YAML file of integrity info for PDFs in the spec suite"
|
42
42
|
task :integrity_yaml do
|
43
43
|
data = {}
|
44
|
-
Dir.glob("spec/data
|
44
|
+
Dir.glob("spec/data/**/*.pdf").sort.each do |path|
|
45
45
|
path_without_spec = path.gsub("spec/","")
|
46
46
|
data[path_without_spec] = {
|
47
47
|
:bytes => File.size(path),
|
data/lib/pdf/reader/advanced_text_run_filter.rb
ADDED
@@ -0,0 +1,137 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
# typed: strict
|
4
|
+
|
5
|
+
class PDF::Reader
|
6
|
+
# Filter a collection of TextRun objects based on a set of conditions.
# It can be used to filter text runs based on their attributes.
# The filter can return the text runs that match the conditions (only) or
# the text runs that do not match the conditions (exclude).
#
# A filter is a Hash with Symbol keys. Each key is either:
#
# * an attribute name, mapped to a Hash of { operator => value } pairs, where
#   every operator is one of VALID_OPERATORS; or
# * one of the combinators :and / :or, mapped to an Array of filter Hashes.
#
# All keys of a single filter Hash must match (they are ANDed together), and
# the :and / :or combinators can be nested to any depth.
#
# Examples:
#   1. Single condition
#     AdvancedTextRunFilter.exclude(text_runs, text: { include: 'sample' })
#
#   2. Multiple conditions (and)
#     AdvancedTextRunFilter.exclude(text_runs, {
#       font_size: { greater_than: 10, less_than: 15 }
#     })
#
#   3. Multiple possible values (or)
#     AdvancedTextRunFilter.exclude(text_runs, {
#       font_size: { equal: [10, 12] }
#     })
#
#   4. Complex AND/OR filter
#     AdvancedTextRunFilter.exclude(text_runs, {
#       and: [
#         { font_size: { greater_than: 10 } },
#         { or: [
#           { text: { include: "sample" } },
#           { width: { greater_than: 100 } }
#         ]}
#       ]
#     })
class AdvancedTextRunFilter
  # Operators accepted inside an attribute's condition Hash.
  VALID_OPERATORS = %i[
    equal
    not_equal
    greater_than
    less_than
    greater_than_or_equal
    less_than_or_equal
    include
    exclude
  ].freeze

  # Return the members of text_runs that match filter_hash.
  def self.only(text_runs, filter_hash)
    new(text_runs, filter_hash).only
  end

  # Return the members of text_runs that do NOT match filter_hash.
  def self.exclude(text_runs, filter_hash)
    new(text_runs, filter_hash).exclude
  end

  attr_reader :text_runs, :filter_hash

  # text_runs   - the collection to filter
  # filter_hash - the filter description (see the class comment)
  def initialize(text_runs, filter_hash)
    @text_runs = text_runs
    @filter_hash = filter_hash
  end

  # Text runs matching the filter. An empty filter returns the collection
  # untouched (the very same object, not a copy).
  def only
    return text_runs if filter_hash.empty?

    text_runs.select { |text_run| evaluate_filter(text_run) }
  end

  # Text runs not matching the filter. An empty filter returns the collection
  # untouched (the very same object, not a copy).
  def exclude
    return text_runs if filter_hash.empty?

    text_runs.reject { |text_run| evaluate_filter(text_run) }
  end

  private

  # Does text_run satisfy the top-level filter?
  def evaluate_filter(text_run)
    evaluate_filters(text_run, filter_hash)
  end

  # True when at least one of the nested filter Hashes matches.
  def evaluate_or_filters(text_run, conditions)
    conditions.any? { |condition| evaluate_filters(text_run, condition) }
  end

  # True when every nested filter Hash matches.
  def evaluate_and_filters(text_run, conditions)
    conditions.all? { |condition| evaluate_filters(text_run, condition) }
  end

  # Evaluate one filter Hash. Every key has to match. The :or / :and keys
  # recurse into their nested filters so combinators work at any depth;
  # any other key is treated as an attribute of the text run.
  def evaluate_filters(text_run, filter_hash)
    filter_hash.all? do |attribute, conditions|
      case attribute
      when :or  then evaluate_or_filters(text_run, conditions)
      when :and then evaluate_and_filters(text_run, conditions)
      else evaluate_attribute_conditions(text_run, attribute, conditions)
      end
    end
  end

  # Check every { operator => value } pair against a single attribute.
  # Raises ArgumentError for an operator that is not in VALID_OPERATORS.
  def evaluate_attribute_conditions(text_run, attribute, conditions)
    conditions.all? do |operator, value|
      unless VALID_OPERATORS.include?(operator)
        raise ArgumentError, "Invalid operator: #{operator}"
      end

      # public_send: the attribute name comes from the caller's filter, so it
      # must not be able to reach private methods of the text run.
      apply_operator(text_run.public_send(attribute), operator, value)
    end
  end

  # Compare attribute_value with filter_value using operator.
  # :equal / :not_equal / :include / :exclude accept a single value or an
  # Array of candidate values; :include / :exclude compare as strings.
  def apply_operator(attribute_value, operator, filter_value)
    case operator
    when :equal
      Array(filter_value).include?(attribute_value)
    when :not_equal
      !Array(filter_value).include?(attribute_value)
    when :greater_than
      attribute_value > filter_value
    when :less_than
      attribute_value < filter_value
    when :greater_than_or_equal
      attribute_value >= filter_value
    when :less_than_or_equal
      attribute_value <= filter_value
    when :include
      Array(filter_value).any? { |v| attribute_value.to_s.include?(v.to_s) }
    when :exclude
      Array(filter_value).none? { |v| attribute_value.to_s.include?(v.to_s) }
    end
  end
end
|
137
|
+
end
|
data/lib/pdf/reader/buffer.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
# coding: ASCII-8BIT
|
2
|
-
# typed:
|
2
|
+
# typed: strict
|
3
3
|
# frozen_string_literal: true
|
4
4
|
|
5
5
|
################################################################################
|
@@ -52,7 +52,7 @@ class PDF::Reader
|
|
52
52
|
CR = "\r"
|
53
53
|
LF = "\n"
|
54
54
|
CRLF = "\r\n"
|
55
|
-
WHITE_SPACE = [
|
55
|
+
WHITE_SPACE = ["\n", "\r", ' ']
|
56
56
|
|
57
57
|
# Quite a few PDFs have trailing junk.
|
58
58
|
# This can be several k of nuls in some cases
|
data/lib/pdf/reader/encoding.rb
CHANGED
(hunk not captured in this extract)
data/lib/pdf/reader/page_text_receiver.rb
CHANGED
@@ -68,6 +68,14 @@ module PDF
|
|
68
68
|
runs = merge_runs(runs)
|
69
69
|
end
|
70
70
|
|
71
|
+
if (only_filter = opts.fetch(:only, nil))
|
72
|
+
runs = AdvancedTextRunFilter.only(runs, only_filter)
|
73
|
+
end
|
74
|
+
|
75
|
+
if (exclude_filter = opts.fetch(:exclude, nil))
|
76
|
+
runs = AdvancedTextRunFilter.exclude(runs, exclude_filter)
|
77
|
+
end
|
78
|
+
|
71
79
|
runs
|
72
80
|
end
|
73
81
|
|
data/lib/pdf/reader.rb
CHANGED
@@ -280,6 +280,7 @@ end
|
|
280
280
|
################################################################################
|
281
281
|
|
282
282
|
require 'pdf/reader/resources'
|
283
|
+
require 'pdf/reader/advanced_text_run_filter'
|
283
284
|
require 'pdf/reader/buffer'
|
284
285
|
require 'pdf/reader/bounding_rectangle_runs_filter'
|
285
286
|
require 'pdf/reader/cid_widths'
|
data/rbi/pdf-reader.rbi
CHANGED
@@ -75,20 +75,20 @@ module PDF
|
|
75
75
|
end
|
76
76
|
|
77
77
|
class Buffer
|
78
|
-
TOKEN_WHITESPACE = T.let(T
|
79
|
-
TOKEN_DELIMITER = T.let(T
|
80
|
-
LEFT_PAREN = T.let(
|
81
|
-
LESS_THAN = T.let(
|
82
|
-
STREAM = T.let(
|
83
|
-
ID = T.let(
|
84
|
-
FWD_SLASH = T.let(
|
85
|
-
NULL_BYTE = T.let(
|
86
|
-
CR = T.let(
|
87
|
-
LF = T.let(
|
88
|
-
CRLF = T.let(
|
89
|
-
WHITE_SPACE = T.let(T
|
90
|
-
TRAILING_BYTECOUNT = T.let(
|
91
|
-
DIGITS_ONLY = T.let(
|
78
|
+
TOKEN_WHITESPACE = T.let(T::Array[String])
|
79
|
+
TOKEN_DELIMITER = T.let(T::Array[Integer])
|
80
|
+
LEFT_PAREN = T.let(String)
|
81
|
+
LESS_THAN = T.let(String)
|
82
|
+
STREAM = T.let(String)
|
83
|
+
ID = T.let(String)
|
84
|
+
FWD_SLASH = T.let(String)
|
85
|
+
NULL_BYTE = T.let(String)
|
86
|
+
CR = T.let(String)
|
87
|
+
LF = T.let(String)
|
88
|
+
CRLF = T.let(String)
|
89
|
+
WHITE_SPACE = T.let(T::Array[String])
|
90
|
+
TRAILING_BYTECOUNT = T.let(Integer)
|
91
|
+
DIGITS_ONLY = T.let(Regexp)
|
92
92
|
|
93
93
|
sig { returns(Integer) }
|
94
94
|
attr_reader :pos
|
@@ -851,6 +851,52 @@ module PDF
|
|
851
851
|
def self.exclude_empty_strings(runs); end
|
852
852
|
end
|
853
853
|
|
854
|
+
class AdvancedTextRunFilter
|
855
|
+
VALID_OPERATORS = T.let(T::Array[Symbol])
|
856
|
+
|
857
|
+
sig { params(text_runs: T::Array[PDF::Reader::TextRun], filter_hash: T::Hash[Symbol, T.untyped]).returns(T::Array[PDF::Reader::TextRun]) }
|
858
|
+
def self.only(text_runs, filter_hash); end
|
859
|
+
|
860
|
+
sig { params(text_runs: T::Array[PDF::Reader::TextRun], filter_hash: T::Hash[Symbol, T.untyped]).returns(T::Array[PDF::Reader::TextRun]) }
|
861
|
+
def self.exclude(text_runs, filter_hash); end
|
862
|
+
|
863
|
+
sig { returns(T::Array[PDF::Reader::TextRun]) }
|
864
|
+
attr_reader :text_runs
|
865
|
+
|
866
|
+
sig { returns(T::Hash[Symbol, T.untyped]) }
|
867
|
+
attr_reader :filter_hash
|
868
|
+
|
869
|
+
sig { params(text_runs: T::Array[PDF::Reader::TextRun], filter_hash: T::Hash[Symbol, T.untyped]).void }
|
870
|
+
def initialize(text_runs, filter_hash)
|
871
|
+
@text_runs = T.let(T.unsafe(nil), T::Array[PDF::Reader::TextRun])
|
872
|
+
@filter_hash = T.let(T.unsafe(nil), T::Hash[Symbol, T.untyped])
|
873
|
+
end
|
874
|
+
|
875
|
+
sig { returns(T::Array[PDF::Reader::TextRun]) }
|
876
|
+
def only; end
|
877
|
+
|
878
|
+
sig { returns(T::Array[PDF::Reader::TextRun]) }
|
879
|
+
def exclude; end
|
880
|
+
|
881
|
+
sig { params(text_run: PDF::Reader::TextRun).returns(T::Boolean) }
|
882
|
+
def evaluate_filter(text_run); end
|
883
|
+
|
884
|
+
sig { params(text_run: PDF::Reader::TextRun, conditions: T::Array[T::Hash[Symbol, T.untyped]]).returns(T::Boolean) }
|
885
|
+
def evaluate_or_filters(text_run, conditions); end
|
886
|
+
|
887
|
+
sig { params(text_run: PDF::Reader::TextRun, conditions: T::Array[T::Hash[Symbol, T.untyped]]).returns(T::Boolean) }
|
888
|
+
def evaluate_and_filters(text_run, conditions); end
|
889
|
+
|
890
|
+
sig { params(text_run: PDF::Reader::TextRun, filter_hash: T::Hash[Symbol, T.untyped]).returns(T::Boolean) }
|
891
|
+
def evaluate_filters(text_run, filter_hash); end
|
892
|
+
|
893
|
+
sig { params(text_run: PDF::Reader::TextRun, attribute: Symbol, conditions: T::Hash[Symbol, T.untyped]).returns(T::Boolean) }
|
894
|
+
def evaluate_attribute_conditions(text_run, attribute, conditions); end
|
895
|
+
|
896
|
+
sig { params(attribute_value: T.untyped, operator: Symbol, filter_value: T.untyped).returns(T::Boolean) }
|
897
|
+
def apply_operator(attribute_value, operator, filter_value); end
|
898
|
+
end
|
899
|
+
|
854
900
|
class EventPoint
|
855
901
|
sig { returns(Numeric) }
|
856
902
|
attr_reader :x
|
metadata
CHANGED
@@ -1,14 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf-reader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.
|
4
|
+
version: 2.14.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Healy
|
8
|
-
autorequire:
|
9
8
|
bindir: bin
|
10
9
|
cert_chain: []
|
11
|
-
date:
|
10
|
+
date: 2025-02-04 00:00:00.000000000 Z
|
12
11
|
dependencies:
|
13
12
|
- !ruby/object:Gem::Dependency
|
14
13
|
name: rake
|
@@ -212,6 +211,7 @@ files:
|
|
212
211
|
- examples/version.rb
|
213
212
|
- lib/pdf-reader.rb
|
214
213
|
- lib/pdf/reader.rb
|
214
|
+
- lib/pdf/reader/advanced_text_run_filter.rb
|
215
215
|
- lib/pdf/reader/aes_v2_security_handler.rb
|
216
216
|
- lib/pdf/reader/aes_v3_security_handler.rb
|
217
217
|
- lib/pdf/reader/afm/Courier-Bold.afm
|
@@ -301,10 +301,9 @@ licenses:
|
|
301
301
|
- MIT
|
302
302
|
metadata:
|
303
303
|
bug_tracker_uri: https://github.com/yob/pdf-reader/issues
|
304
|
-
changelog_uri: https://github.com/yob/pdf-reader/blob/v2.
|
305
|
-
documentation_uri: https://www.rubydoc.info/gems/pdf-reader/2.
|
306
|
-
source_code_uri: https://github.com/yob/pdf-reader/tree/v2.
|
307
|
-
post_install_message:
|
304
|
+
changelog_uri: https://github.com/yob/pdf-reader/blob/v2.14.1/CHANGELOG
|
305
|
+
documentation_uri: https://www.rubydoc.info/gems/pdf-reader/2.14.1
|
306
|
+
source_code_uri: https://github.com/yob/pdf-reader/tree/v2.14.1
|
308
307
|
rdoc_options:
|
309
308
|
- "--title"
|
310
309
|
- PDF::Reader Documentation
|
@@ -317,15 +316,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
317
316
|
requirements:
|
318
317
|
- - ">="
|
319
318
|
- !ruby/object:Gem::Version
|
320
|
-
version: '2.
|
319
|
+
version: '2.1'
|
321
320
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
322
321
|
requirements:
|
323
322
|
- - ">="
|
324
323
|
- !ruby/object:Gem::Version
|
325
324
|
version: '0'
|
326
325
|
requirements: []
|
327
|
-
rubygems_version: 3.
|
328
|
-
signing_key:
|
326
|
+
rubygems_version: 3.6.2
|
329
327
|
specification_version: 4
|
330
328
|
summary: A library for accessing the content of PDF files
|
331
329
|
test_files: []
|