pdf-reader 2.13.0 → 2.14.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Rakefile +1 -1
- data/lib/pdf/reader/advanced_text_run_filter.rb +137 -0
- data/lib/pdf/reader/buffer.rb +2 -2
- data/lib/pdf/reader/encoding.rb +1 -1
- data/lib/pdf/reader/page_text_receiver.rb +8 -0
- data/lib/pdf/reader.rb +1 -0
- data/rbi/pdf-reader.rbi +60 -14
- metadata +8 -10
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7174f6e8c3c655cc9a1c120e5f0b99d06b0c2355803480d1cf8347c1825ddb01
|
4
|
+
data.tar.gz: ade7c031fe3c3d6e022125ccd3ef65a9482e21a7d90e6ea9d9d65aeeee3b30e8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: '08d343015a23dd678264053ade37a3449c91bdfda9764c65bc6ae196529062c33272ffe191beaea9f1e20d31cb1de9c117535e27fee763968e822288324931c6'
|
7
|
+
data.tar.gz: 8b0df463cc6292048f0ad68dd682131d53a4bc9630f84841103f8c73a6c837e91cdf5af08f98c255bcefc7d2a67146387d26ab0c3cbece42853f2581981a613b
|
data/Rakefile
CHANGED
@@ -41,7 +41,7 @@ end
|
|
41
41
|
desc "Create a YAML file of integrity info for PDFs in the spec suite"
|
42
42
|
task :integrity_yaml do
|
43
43
|
data = {}
|
44
|
-
Dir.glob("spec/data
|
44
|
+
Dir.glob("spec/data/**/*.pdf").sort.each do |path|
|
45
45
|
path_without_spec = path.gsub("spec/","")
|
46
46
|
data[path_without_spec] = {
|
47
47
|
:bytes => File.size(path),
|
data/lib/pdf/reader/advanced_text_run_filter.rb
ADDED
@@ -0,0 +1,137 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
# typed: strict
|
4
|
+
|
5
|
+
class PDF::Reader
|
6
|
+
# Filter a collection of TextRun objects based on a set of conditions.
|
7
|
+
# It can be used to filter text runs based on their attributes.
|
8
|
+
# The filter can return the text runs that matches the conditions (only) or
|
9
|
+
# the text runs that do not match the conditions (exclude).
|
10
|
+
#
|
11
|
+
# You can filter the text runs based on all its attributes with the operators
|
12
|
+
# mentioned in VALID_OPERATORS.
|
13
|
+
# The filter can be nested with 'or' and 'and' conditions.
|
14
|
+
#
|
15
|
+
# Examples:
|
16
|
+
# 1. Single condition
|
17
|
+
# AdvancedTextRunFilter.exclude(text_runs, text: { include: 'sample' })
|
18
|
+
#
|
19
|
+
# 2. Multiple conditions (and)
|
20
|
+
# AdvancedTextRunFilter.exclude(text_runs, {
|
21
|
+
# font_size: { greater_than: 10, less_than: 15 }
|
22
|
+
# })
|
23
|
+
#
|
24
|
+
# 3. Multiple possible values (or)
|
25
|
+
# AdvancedTextRunFilter.exclude(text_runs, {
|
26
|
+
# font_size: { equal: [10, 12] }
|
27
|
+
# })
|
28
|
+
#
|
29
|
+
# 4. Complex AND/OR filter
|
30
|
+
# AdvancedTextRunFilter.exclude(text_runs, {
|
31
|
+
# and: [
|
32
|
+
# { font_size: { greater_than: 10 } },
|
33
|
+
# { or: [
|
34
|
+
# { text: { include: "sample" } },
|
35
|
+
# { width: { greater_than: 100 } }
|
36
|
+
# ]}
|
37
|
+
# ]
|
38
|
+
# })
|
39
|
+
class AdvancedTextRunFilter
|
40
|
+
VALID_OPERATORS = %i[
|
41
|
+
equal
|
42
|
+
not_equal
|
43
|
+
greater_than
|
44
|
+
less_than
|
45
|
+
greater_than_or_equal
|
46
|
+
less_than_or_equal
|
47
|
+
include
|
48
|
+
exclude
|
49
|
+
]
|
50
|
+
|
51
|
+
def self.only(text_runs, filter_hash)
|
52
|
+
new(text_runs, filter_hash).only
|
53
|
+
end
|
54
|
+
|
55
|
+
def self.exclude(text_runs, filter_hash)
|
56
|
+
new(text_runs, filter_hash).exclude
|
57
|
+
end
|
58
|
+
|
59
|
+
attr_reader :text_runs, :filter_hash
|
60
|
+
|
61
|
+
def initialize(text_runs, filter_hash)
|
62
|
+
@text_runs = text_runs
|
63
|
+
@filter_hash = filter_hash
|
64
|
+
end
|
65
|
+
|
66
|
+
def only
|
67
|
+
return text_runs if filter_hash.empty?
|
68
|
+
text_runs.select { |text_run| evaluate_filter(text_run) }
|
69
|
+
end
|
70
|
+
|
71
|
+
def exclude
|
72
|
+
return text_runs if filter_hash.empty?
|
73
|
+
text_runs.reject { |text_run| evaluate_filter(text_run) }
|
74
|
+
end
|
75
|
+
|
76
|
+
private
|
77
|
+
|
78
|
+
def evaluate_filter(text_run)
|
79
|
+
if filter_hash[:or]
|
80
|
+
evaluate_or_filters(text_run, filter_hash[:or])
|
81
|
+
elsif filter_hash[:and]
|
82
|
+
evaluate_and_filters(text_run, filter_hash[:and])
|
83
|
+
else
|
84
|
+
evaluate_filters(text_run, filter_hash)
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
def evaluate_or_filters(text_run, conditions)
|
89
|
+
conditions.any? do |condition|
|
90
|
+
evaluate_filters(text_run, condition)
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
def evaluate_and_filters(text_run, conditions)
|
95
|
+
conditions.all? do |condition|
|
96
|
+
evaluate_filters(text_run, condition)
|
97
|
+
end
|
98
|
+
end
|
99
|
+
|
100
|
+
def evaluate_filters(text_run, filter_hash)
|
101
|
+
filter_hash.all? do |attribute, conditions|
|
102
|
+
evaluate_attribute_conditions(text_run, attribute, conditions)
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
106
|
+
def evaluate_attribute_conditions(text_run, attribute, conditions)
|
107
|
+
conditions.all? do |operator, value|
|
108
|
+
unless VALID_OPERATORS.include?(operator)
|
109
|
+
raise ArgumentError, "Invalid operator: #{operator}"
|
110
|
+
end
|
111
|
+
|
112
|
+
apply_operator(text_run.send(attribute), operator, value)
|
113
|
+
end
|
114
|
+
end
|
115
|
+
|
116
|
+
def apply_operator(attribute_value, operator, filter_value)
|
117
|
+
case operator
|
118
|
+
when :equal
|
119
|
+
Array(filter_value).include?(attribute_value)
|
120
|
+
when :not_equal
|
121
|
+
!Array(filter_value).include?(attribute_value)
|
122
|
+
when :greater_than
|
123
|
+
attribute_value > filter_value
|
124
|
+
when :less_than
|
125
|
+
attribute_value < filter_value
|
126
|
+
when :greater_than_or_equal
|
127
|
+
attribute_value >= filter_value
|
128
|
+
when :less_than_or_equal
|
129
|
+
attribute_value <= filter_value
|
130
|
+
when :include
|
131
|
+
Array(filter_value).any? { |v| attribute_value.to_s.include?(v.to_s) }
|
132
|
+
when :exclude
|
133
|
+
Array(filter_value).none? { |v| attribute_value.to_s.include?(v.to_s) }
|
134
|
+
end
|
135
|
+
end
|
136
|
+
end
|
137
|
+
end
|
data/lib/pdf/reader/buffer.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
# coding: ASCII-8BIT
|
2
|
-
# typed:
|
2
|
+
# typed: strict
|
3
3
|
# frozen_string_literal: true
|
4
4
|
|
5
5
|
################################################################################
|
@@ -52,7 +52,7 @@ class PDF::Reader
|
|
52
52
|
CR = "\r"
|
53
53
|
LF = "\n"
|
54
54
|
CRLF = "\r\n"
|
55
|
-
WHITE_SPACE = [
|
55
|
+
WHITE_SPACE = ["\n", "\r", ' ']
|
56
56
|
|
57
57
|
# Quite a few PDFs have trailing junk.
|
58
58
|
# This can be several k of nuls in some cases
|
data/lib/pdf/reader/encoding.rb
CHANGED
(+1 -1; hunk missing from this capture)
data/lib/pdf/reader/page_text_receiver.rb
CHANGED
@@ -68,6 +68,14 @@ module PDF
|
|
68
68
|
runs = merge_runs(runs)
|
69
69
|
end
|
70
70
|
|
71
|
+
if (only_filter = opts.fetch(:only, nil))
|
72
|
+
runs = AdvancedTextRunFilter.only(runs, only_filter)
|
73
|
+
end
|
74
|
+
|
75
|
+
if (exclude_filter = opts.fetch(:exclude, nil))
|
76
|
+
runs = AdvancedTextRunFilter.exclude(runs, exclude_filter)
|
77
|
+
end
|
78
|
+
|
71
79
|
runs
|
72
80
|
end
|
73
81
|
|
data/lib/pdf/reader.rb
CHANGED
@@ -280,6 +280,7 @@ end
|
|
280
280
|
################################################################################
|
281
281
|
|
282
282
|
require 'pdf/reader/resources'
|
283
|
+
require 'pdf/reader/advanced_text_run_filter'
|
283
284
|
require 'pdf/reader/buffer'
|
284
285
|
require 'pdf/reader/bounding_rectangle_runs_filter'
|
285
286
|
require 'pdf/reader/cid_widths'
|
data/rbi/pdf-reader.rbi
CHANGED
@@ -75,20 +75,20 @@ module PDF
|
|
75
75
|
end
|
76
76
|
|
77
77
|
class Buffer
|
78
|
-
TOKEN_WHITESPACE = T.let(T
|
79
|
-
TOKEN_DELIMITER = T.let(T
|
80
|
-
LEFT_PAREN = T.let(
|
81
|
-
LESS_THAN = T.let(
|
82
|
-
STREAM = T.let(
|
83
|
-
ID = T.let(
|
84
|
-
FWD_SLASH = T.let(
|
85
|
-
NULL_BYTE = T.let(
|
86
|
-
CR = T.let(
|
87
|
-
LF = T.let(
|
88
|
-
CRLF = T.let(
|
89
|
-
WHITE_SPACE = T.let(T
|
90
|
-
TRAILING_BYTECOUNT = T.let(
|
91
|
-
DIGITS_ONLY = T.let(
|
78
|
+
TOKEN_WHITESPACE = T.let(T::Array[String])
|
79
|
+
TOKEN_DELIMITER = T.let(T::Array[Integer])
|
80
|
+
LEFT_PAREN = T.let(String)
|
81
|
+
LESS_THAN = T.let(String)
|
82
|
+
STREAM = T.let(String)
|
83
|
+
ID = T.let(String)
|
84
|
+
FWD_SLASH = T.let(String)
|
85
|
+
NULL_BYTE = T.let(String)
|
86
|
+
CR = T.let(String)
|
87
|
+
LF = T.let(String)
|
88
|
+
CRLF = T.let(String)
|
89
|
+
WHITE_SPACE = T.let(T::Array[String])
|
90
|
+
TRAILING_BYTECOUNT = T.let(Integer)
|
91
|
+
DIGITS_ONLY = T.let(Regexp)
|
92
92
|
|
93
93
|
sig { returns(Integer) }
|
94
94
|
attr_reader :pos
|
@@ -851,6 +851,52 @@ module PDF
|
|
851
851
|
def self.exclude_empty_strings(runs); end
|
852
852
|
end
|
853
853
|
|
854
|
+
class AdvancedTextRunFilter
|
855
|
+
VALID_OPERATORS = T.let(T::Array[Symbol])
|
856
|
+
|
857
|
+
sig { params(text_runs: T::Array[TextRun], filter_hash: T::Hash[Symbol, T.untyped]).returns(T::Array[TextRun]) }
|
858
|
+
def self.only(text_runs, filter_hash); end
|
859
|
+
|
860
|
+
sig { params(text_runs: T::Array[TextRun], filter_hash: T::Hash[Symbol, T.untyped]).returns(T::Array[TextRun]) }
|
861
|
+
def self.exclude(text_runs, filter_hash); end
|
862
|
+
|
863
|
+
sig { returns(T::Array[TextRun]) }
|
864
|
+
attr_reader :text_runs
|
865
|
+
|
866
|
+
sig { returns(T::Hash[Symbol, T.untyped]) }
|
867
|
+
attr_reader :filter_hash
|
868
|
+
|
869
|
+
sig { params(text_runs: T::Array[TextRun], filter_hash: T::Hash[Symbol, T.untyped]).void }
|
870
|
+
def initialize(text_runs, filter_hash)
|
871
|
+
@text_runs = T.let(T.unsafe(nil), T::Array[TextRun])
|
872
|
+
@filter_hash = T.let(T.unsafe(nil), T::Hash[Symbol, T.untyped])
|
873
|
+
end
|
874
|
+
|
875
|
+
sig { returns(T::Array[TextRun]) }
|
876
|
+
def only; end
|
877
|
+
|
878
|
+
sig { returns(T::Array[TextRun]) }
|
879
|
+
def exclude; end
|
880
|
+
|
881
|
+
sig { params(text_run: TextRun).returns(T::Boolean) }
|
882
|
+
def evaluate_filter(text_run); end
|
883
|
+
|
884
|
+
sig { params(text_run: TextRun, conditions: T::Array[T::Hash[Symbol, T.untyped]]).returns(T::Boolean) }
|
885
|
+
def evaluate_or_filters(text_run, conditions); end
|
886
|
+
|
887
|
+
sig { params(text_run: TextRun, conditions: T::Array[T::Hash[Symbol, T.untyped]]).returns(T::Boolean) }
|
888
|
+
def evaluate_and_filters(text_run, conditions); end
|
889
|
+
|
890
|
+
sig { params(text_run: TextRun, filter_hash: T::Hash[Symbol, T.untyped]).returns(T::Boolean) }
|
891
|
+
def evaluate_filters(text_run, filter_hash); end
|
892
|
+
|
893
|
+
sig { params(text_run: TextRun, attribute: Symbol, conditions: T::Hash[Symbol, T.untyped]).returns(T::Boolean) }
|
894
|
+
def evaluate_attribute_conditions(text_run, attribute, conditions); end
|
895
|
+
|
896
|
+
sig { params(attribute_value: T.untyped, operator: Symbol, filter_value: T.untyped).returns(T::Boolean) }
|
897
|
+
def apply_operator(attribute_value, operator, filter_value); end
|
898
|
+
end
|
899
|
+
|
854
900
|
class EventPoint
|
855
901
|
sig { returns(Numeric) }
|
856
902
|
attr_reader :x
|
metadata
CHANGED
@@ -1,14 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf-reader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.13.0
|
4
|
+
version: 2.14.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Healy
|
8
|
-
autorequire:
|
9
8
|
bindir: bin
|
10
9
|
cert_chain: []
|
11
|
-
date:
|
10
|
+
date: 2025-01-29 00:00:00.000000000 Z
|
12
11
|
dependencies:
|
13
12
|
- !ruby/object:Gem::Dependency
|
14
13
|
name: rake
|
@@ -212,6 +211,7 @@ files:
|
|
212
211
|
- examples/version.rb
|
213
212
|
- lib/pdf-reader.rb
|
214
213
|
- lib/pdf/reader.rb
|
214
|
+
- lib/pdf/reader/advanced_text_run_filter.rb
|
215
215
|
- lib/pdf/reader/aes_v2_security_handler.rb
|
216
216
|
- lib/pdf/reader/aes_v3_security_handler.rb
|
217
217
|
- lib/pdf/reader/afm/Courier-Bold.afm
|
@@ -301,10 +301,9 @@ licenses:
|
|
301
301
|
- MIT
|
302
302
|
metadata:
|
303
303
|
bug_tracker_uri: https://github.com/yob/pdf-reader/issues
|
304
|
-
changelog_uri: https://github.com/yob/pdf-reader/blob/v2.13.0/CHANGELOG
|
305
|
-
documentation_uri: https://www.rubydoc.info/gems/pdf-reader/2.13.0
|
306
|
-
source_code_uri: https://github.com/yob/pdf-reader/tree/v2.13.0
|
307
|
-
post_install_message:
|
304
|
+
changelog_uri: https://github.com/yob/pdf-reader/blob/v2.14.0/CHANGELOG
|
305
|
+
documentation_uri: https://www.rubydoc.info/gems/pdf-reader/2.14.0
|
306
|
+
source_code_uri: https://github.com/yob/pdf-reader/tree/v2.14.0
|
308
307
|
rdoc_options:
|
309
308
|
- "--title"
|
310
309
|
- PDF::Reader Documentation
|
@@ -317,15 +316,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
317
316
|
requirements:
|
318
317
|
- - ">="
|
319
318
|
- !ruby/object:Gem::Version
|
320
|
-
version: '2.
|
319
|
+
version: '2.1'
|
321
320
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
322
321
|
requirements:
|
323
322
|
- - ">="
|
324
323
|
- !ruby/object:Gem::Version
|
325
324
|
version: '0'
|
326
325
|
requirements: []
|
327
|
-
rubygems_version: 3.
|
328
|
-
signing_key:
|
326
|
+
rubygems_version: 3.6.2
|
329
327
|
specification_version: 4
|
330
328
|
summary: A library for accessing the content of PDF files
|
331
329
|
test_files: []
|