pdf-reader 2.13.0 → 2.14.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5db630726ada74d004eb928e8cf164d9a65070150bc99268bed3c3c22a4b16fd
4
- data.tar.gz: 186960431832f9808e292e823a1b8cd3ccbe96bf89b7f8e6801b111b0899b690
3
+ metadata.gz: 7174f6e8c3c655cc9a1c120e5f0b99d06b0c2355803480d1cf8347c1825ddb01
4
+ data.tar.gz: ade7c031fe3c3d6e022125ccd3ef65a9482e21a7d90e6ea9d9d65aeeee3b30e8
5
5
  SHA512:
6
- metadata.gz: 4cc29c3f7d3dd36ff55178c6825dab455bbfd9f3e3b62298dac03a835c741ffeebaa1959f3b3ceba19c82fe8a516acad554ad41e5142bd4a8c75a9725857fc96
7
- data.tar.gz: a89f8815c83d6f89bc51e3aa232776d6d365eb45f0cbfd01ae9de157390144c9ff8bdbdf3e1359048612d3febeffbfd77a01d0c2b08da0b53dad64b6290f6292
6
+ metadata.gz: '08d343015a23dd678264053ade37a3449c91bdfda9764c65bc6ae196529062c33272ffe191beaea9f1e20d31cb1de9c117535e27fee763968e822288324931c6'
7
+ data.tar.gz: 8b0df463cc6292048f0ad68dd682131d53a4bc9630f84841103f8c73a6c837e91cdf5af08f98c255bcefc7d2a67146387d26ab0c3cbece42853f2581981a613b
data/Rakefile CHANGED
@@ -41,7 +41,7 @@ end
41
41
  desc "Create a YAML file of integrity info for PDFs in the spec suite"
42
42
  task :integrity_yaml do
43
43
  data = {}
44
- Dir.glob("spec/data/**/*.*").sort.each do |path|
44
+ Dir.glob("spec/data/**/*.pdf").sort.each do |path|
45
45
  path_without_spec = path.gsub("spec/","")
46
46
  data[path_without_spec] = {
47
47
  :bytes => File.size(path),
data/lib/pdf/reader/advanced_text_run_filter.rb ADDED
@@ -0,0 +1,137 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+ # typed: strict
4
+
5
+ class PDF::Reader
6
+ # Filter a collection of TextRun objects based on a set of conditions.
7
+ # It can be used to filter text runs based on their attributes.
8
+ # The filter can return the text runs that match the conditions (only) or
9
+ # the text runs that do not match the conditions (exclude).
10
+ #
11
+ # You can filter the text runs based on any of their attributes, using the
11
+ # operators listed in VALID_OPERATORS.
13
+ # The filter can be nested with 'or' and 'and' conditions.
14
+ #
15
+ # Examples:
16
+ # 1. Single condition
17
+ # AdvancedTextRunFilter.exclude(text_runs, text: { include: 'sample' })
18
+ #
19
+ # 2. Multiple conditions (and)
20
+ # AdvancedTextRunFilter.exclude(text_runs, {
21
+ # font_size: { greater_than: 10, less_than: 15 }
22
+ # })
23
+ #
24
+ # 3. Multiple possible values (or)
25
+ # AdvancedTextRunFilter.exclude(text_runs, {
26
+ # font_size: { equal: [10, 12] }
27
+ # })
28
+ #
29
+ # 4. Complex AND/OR filter
30
+ # AdvancedTextRunFilter.exclude(text_runs, {
31
+ # and: [
32
+ # { font_size: { greater_than: 10 } },
33
+ # { or: [
34
+ # { text: { include: "sample" } },
35
+ # { width: { greater_than: 100 } }
36
+ # ]}
37
+ # ]
38
+ # })
39
+ class AdvancedTextRunFilter
40
+ VALID_OPERATORS = %i[
41
+ equal
42
+ not_equal
43
+ greater_than
44
+ less_than
45
+ greater_than_or_equal
46
+ less_than_or_equal
47
+ include
48
+ exclude
49
+ ]
50
+
51
+ def self.only(text_runs, filter_hash)
52
+ new(text_runs, filter_hash).only
53
+ end
54
+
55
+ def self.exclude(text_runs, filter_hash)
56
+ new(text_runs, filter_hash).exclude
57
+ end
58
+
59
+ attr_reader :text_runs, :filter_hash
60
+
61
+ def initialize(text_runs, filter_hash)
62
+ @text_runs = text_runs
63
+ @filter_hash = filter_hash
64
+ end
65
+
66
+ def only
67
+ return text_runs if filter_hash.empty?
68
+ text_runs.select { |text_run| evaluate_filter(text_run) }
69
+ end
70
+
71
+ def exclude
72
+ return text_runs if filter_hash.empty?
73
+ text_runs.reject { |text_run| evaluate_filter(text_run) }
74
+ end
75
+
76
+ private
77
+
78
+ def evaluate_filter(text_run)
79
+ if filter_hash[:or]
80
+ evaluate_or_filters(text_run, filter_hash[:or])
81
+ elsif filter_hash[:and]
82
+ evaluate_and_filters(text_run, filter_hash[:and])
83
+ else
84
+ evaluate_filters(text_run, filter_hash)
85
+ end
86
+ end
87
+
88
+ def evaluate_or_filters(text_run, conditions)
89
+ conditions.any? do |condition|
90
+ evaluate_filters(text_run, condition)
91
+ end
92
+ end
93
+
94
+ def evaluate_and_filters(text_run, conditions)
95
+ conditions.all? do |condition|
96
+ evaluate_filters(text_run, condition)
97
+ end
98
+ end
99
+
100
+ def evaluate_filters(text_run, filter_hash)
101
+ filter_hash.all? do |attribute, conditions|
102
+ evaluate_attribute_conditions(text_run, attribute, conditions)
103
+ end
104
+ end
105
+
106
+ def evaluate_attribute_conditions(text_run, attribute, conditions)
107
+ conditions.all? do |operator, value|
108
+ unless VALID_OPERATORS.include?(operator)
109
+ raise ArgumentError, "Invalid operator: #{operator}"
110
+ end
111
+
112
+ apply_operator(text_run.send(attribute), operator, value)
113
+ end
114
+ end
115
+
116
+ def apply_operator(attribute_value, operator, filter_value)
117
+ case operator
118
+ when :equal
119
+ Array(filter_value).include?(attribute_value)
120
+ when :not_equal
121
+ !Array(filter_value).include?(attribute_value)
122
+ when :greater_than
123
+ attribute_value > filter_value
124
+ when :less_than
125
+ attribute_value < filter_value
126
+ when :greater_than_or_equal
127
+ attribute_value >= filter_value
128
+ when :less_than_or_equal
129
+ attribute_value <= filter_value
130
+ when :include
131
+ Array(filter_value).any? { |v| attribute_value.to_s.include?(v.to_s) }
132
+ when :exclude
133
+ Array(filter_value).none? { |v| attribute_value.to_s.include?(v.to_s) }
134
+ end
135
+ end
136
+ end
137
+ end
@@ -1,5 +1,5 @@
1
1
  # coding: ASCII-8BIT
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
5
  ################################################################################
@@ -52,7 +52,7 @@ class PDF::Reader
52
52
  CR = "\r"
53
53
  LF = "\n"
54
54
  CRLF = "\r\n"
55
- WHITE_SPACE = [LF, CR, ' ']
55
+ WHITE_SPACE = ["\n", "\r", ' ']
56
56
 
57
57
  # Quite a few PDFs have trailing junk.
58
58
  # This can be several k of nuls in some cases
@@ -1,5 +1,5 @@
1
1
  # coding: utf-8
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
5
  ################################################################################
@@ -68,6 +68,14 @@ module PDF
68
68
  runs = merge_runs(runs)
69
69
  end
70
70
 
71
+ if (only_filter = opts.fetch(:only, nil))
72
+ runs = AdvancedTextRunFilter.only(runs, only_filter)
73
+ end
74
+
75
+ if (exclude_filter = opts.fetch(:exclude, nil))
76
+ runs = AdvancedTextRunFilter.exclude(runs, exclude_filter)
77
+ end
78
+
71
79
  runs
72
80
  end
73
81
 
data/lib/pdf/reader.rb CHANGED
@@ -280,6 +280,7 @@ end
280
280
  ################################################################################
281
281
 
282
282
  require 'pdf/reader/resources'
283
+ require 'pdf/reader/advanced_text_run_filter'
283
284
  require 'pdf/reader/buffer'
284
285
  require 'pdf/reader/bounding_rectangle_runs_filter'
285
286
  require 'pdf/reader/cid_widths'
data/rbi/pdf-reader.rbi CHANGED
@@ -75,20 +75,20 @@ module PDF
75
75
  end
76
76
 
77
77
  class Buffer
78
- TOKEN_WHITESPACE = T.let(T.unsafe(nil), T::Array[Integer])
79
- TOKEN_DELIMITER = T.let(T.unsafe(nil), T::Array[Integer])
80
- LEFT_PAREN = T.let(T.unsafe(nil), String)
81
- LESS_THAN = T.let(T.unsafe(nil), String)
82
- STREAM = T.let(T.unsafe(nil), String)
83
- ID = T.let(T.unsafe(nil), String)
84
- FWD_SLASH = T.let(T.unsafe(nil), String)
85
- NULL_BYTE = T.let(T.unsafe(nil), String)
86
- CR = T.let(T.unsafe(nil), String)
87
- LF = T.let(T.unsafe(nil), String)
88
- CRLF = T.let(T.unsafe(nil), String)
89
- WHITE_SPACE = T.let(T.unsafe(nil), T::Array[String])
90
- TRAILING_BYTECOUNT = T.let(T.unsafe(nil), Integer)
91
- DIGITS_ONLY = T.let(T.unsafe(nil), Regexp)
78
+ TOKEN_WHITESPACE = T.let(T::Array[String])
79
+ TOKEN_DELIMITER = T.let(T::Array[Integer])
80
+ LEFT_PAREN = T.let(String)
81
+ LESS_THAN = T.let(String)
82
+ STREAM = T.let(String)
83
+ ID = T.let(String)
84
+ FWD_SLASH = T.let(String)
85
+ NULL_BYTE = T.let(String)
86
+ CR = T.let(String)
87
+ LF = T.let(String)
88
+ CRLF = T.let(String)
89
+ WHITE_SPACE = T.let(T::Array[String])
90
+ TRAILING_BYTECOUNT = T.let(Integer)
91
+ DIGITS_ONLY = T.let(Regexp)
92
92
 
93
93
  sig { returns(Integer) }
94
94
  attr_reader :pos
@@ -851,6 +851,52 @@ module PDF
851
851
  def self.exclude_empty_strings(runs); end
852
852
  end
853
853
 
854
+ class AdvancedTextRunFilter
855
+ VALID_OPERATORS = T.let(T::Array[Symbol])
856
+
857
+ sig { params(text_runs: T::Array[TextRun], filter_hash: T::Hash[Symbol, T.untyped]).returns(T::Array[TextRun]) }
858
+ def self.only(text_runs, filter_hash); end
859
+
860
+ sig { params(text_runs: T::Array[TextRun], filter_hash: T::Hash[Symbol, T.untyped]).returns(T::Array[TextRun]) }
861
+ def self.exclude(text_runs, filter_hash); end
862
+
863
+ sig { returns(T::Array[TextRun]) }
864
+ attr_reader :text_runs
865
+
866
+ sig { returns(T::Hash[Symbol, T.untyped]) }
867
+ attr_reader :filter_hash
868
+
869
+ sig { params(text_runs: T::Array[TextRun], filter_hash: T::Hash[Symbol, T.untyped]).void }
870
+ def initialize(text_runs, filter_hash)
871
+ @text_runs = T.let(T.unsafe(nil), T::Array[TextRun])
872
+ @filter_hash = T.let(T.unsafe(nil), T::Hash[Symbol, T.untyped])
873
+ end
874
+
875
+ sig { returns(T::Array[TextRun]) }
876
+ def only; end
877
+
878
+ sig { returns(T::Array[TextRun]) }
879
+ def exclude; end
880
+
881
+ sig { params(text_run: TextRun).returns(T::Boolean) }
882
+ def evaluate_filter(text_run); end
883
+
884
+ sig { params(text_run: TextRun, conditions: T::Array[T::Hash[Symbol, T.untyped]]).returns(T::Boolean) }
885
+ def evaluate_or_filters(text_run, conditions); end
886
+
887
+ sig { params(text_run: TextRun, conditions: T::Array[T::Hash[Symbol, T.untyped]]).returns(T::Boolean) }
888
+ def evaluate_and_filters(text_run, conditions); end
889
+
890
+ sig { params(text_run: TextRun, filter_hash: T::Hash[Symbol, T.untyped]).returns(T::Boolean) }
891
+ def evaluate_filters(text_run, filter_hash); end
892
+
893
+ sig { params(text_run: TextRun, attribute: Symbol, conditions: T::Hash[Symbol, T.untyped]).returns(T::Boolean) }
894
+ def evaluate_attribute_conditions(text_run, attribute, conditions); end
895
+
896
+ sig { params(attribute_value: T.untyped, operator: Symbol, filter_value: T.untyped).returns(T::Boolean) }
897
+ def apply_operator(attribute_value, operator, filter_value); end
898
+ end
899
+
854
900
  class EventPoint
855
901
  sig { returns(Numeric) }
856
902
  attr_reader :x
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf-reader
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.13.0
4
+ version: 2.14.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Healy
8
- autorequire:
9
8
  bindir: bin
10
9
  cert_chain: []
11
- date: 2024-11-02 00:00:00.000000000 Z
10
+ date: 2025-01-29 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
13
  name: rake
@@ -212,6 +211,7 @@ files:
212
211
  - examples/version.rb
213
212
  - lib/pdf-reader.rb
214
213
  - lib/pdf/reader.rb
214
+ - lib/pdf/reader/advanced_text_run_filter.rb
215
215
  - lib/pdf/reader/aes_v2_security_handler.rb
216
216
  - lib/pdf/reader/aes_v3_security_handler.rb
217
217
  - lib/pdf/reader/afm/Courier-Bold.afm
@@ -301,10 +301,9 @@ licenses:
301
301
  - MIT
302
302
  metadata:
303
303
  bug_tracker_uri: https://github.com/yob/pdf-reader/issues
304
- changelog_uri: https://github.com/yob/pdf-reader/blob/v2.13.0/CHANGELOG
305
- documentation_uri: https://www.rubydoc.info/gems/pdf-reader/2.13.0
306
- source_code_uri: https://github.com/yob/pdf-reader/tree/v2.13.0
307
- post_install_message:
304
+ changelog_uri: https://github.com/yob/pdf-reader/blob/v2.14.0/CHANGELOG
305
+ documentation_uri: https://www.rubydoc.info/gems/pdf-reader/2.14.0
306
+ source_code_uri: https://github.com/yob/pdf-reader/tree/v2.14.0
308
307
  rdoc_options:
309
308
  - "--title"
310
309
  - PDF::Reader Documentation
@@ -317,15 +316,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
317
316
  requirements:
318
317
  - - ">="
319
318
  - !ruby/object:Gem::Version
320
- version: '2.0'
319
+ version: '2.1'
321
320
  required_rubygems_version: !ruby/object:Gem::Requirement
322
321
  requirements:
323
322
  - - ">="
324
323
  - !ruby/object:Gem::Version
325
324
  version: '0'
326
325
  requirements: []
327
- rubygems_version: 3.4.10
328
- signing_key:
326
+ rubygems_version: 3.6.2
329
327
  specification_version: 4
330
328
  summary: A library for accessing the content of PDF files
331
329
  test_files: []