pdf-reader 2.13.0 → 2.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5db630726ada74d004eb928e8cf164d9a65070150bc99268bed3c3c22a4b16fd
4
- data.tar.gz: 186960431832f9808e292e823a1b8cd3ccbe96bf89b7f8e6801b111b0899b690
3
+ metadata.gz: 38765d176ae7b8f4cff7ea6f10fff00b811f6812629d76a2b966f36139c23188
4
+ data.tar.gz: a406d525e4fccb84cc9e86b28aab06a12854c6f0f297a1a479d26b3f845267f6
5
5
  SHA512:
6
- metadata.gz: 4cc29c3f7d3dd36ff55178c6825dab455bbfd9f3e3b62298dac03a835c741ffeebaa1959f3b3ceba19c82fe8a516acad554ad41e5142bd4a8c75a9725857fc96
7
- data.tar.gz: a89f8815c83d6f89bc51e3aa232776d6d365eb45f0cbfd01ae9de157390144c9ff8bdbdf3e1359048612d3febeffbfd77a01d0c2b08da0b53dad64b6290f6292
6
+ metadata.gz: 010c16b1528d4c46d0175737c9694e2e326092b5e7091cbdd0e0ca41567e662b1adabe989c33b0b919a021bee9f985fa4f2862058bd144762c090e718b3089cc
7
+ data.tar.gz: 996fe5b0761280edd67c5523d00c04519b7c682c5ededd86d8dfd412df6e11d554d162ab5b4eb231709f4d3013c5963129b32358ef0b49a4521e8ba72dcf490b
data/CHANGELOG CHANGED
@@ -1,3 +1,10 @@
1
+ v2.14.1 (4th February 2025)
2
+ - Fix issue in RBI signatures, introduced in v2.14.0 (https://github.com/yob/pdf-reader/pull/550)
3
+
4
+ v2.14.0 (29th January 2025)
5
+ - Raise minimum supported ruby to 2.1 (https://github.com/yob/pdf-reader/pull/543)
6
+ - Add support for filtering to Page#text (https://github.com/yob/pdf-reader/pull/545)
7
+
1
8
  v2.13.0 (2nd November 2024)
2
9
  - Permit Ascii86 v1.0 and v2.0 (https://github.com/yob/pdf-reader/pull/539)
3
10
  - Allow StringIO type for PDF::Reader input (https://github.com/yob/pdf-reader/pull/535)
data/Rakefile CHANGED
@@ -41,7 +41,7 @@ end
41
41
  desc "Create a YAML file of integrity info for PDFs in the spec suite"
42
42
  task :integrity_yaml do
43
43
  data = {}
44
- Dir.glob("spec/data/**/*.*").sort.each do |path|
44
+ Dir.glob("spec/data/**/*.pdf").sort.each do |path|
45
45
  path_without_spec = path.gsub("spec/","")
46
46
  data[path_without_spec] = {
47
47
  :bytes => File.size(path),
@@ -0,0 +1,137 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+ # typed: strict
4
+
5
+ class PDF::Reader
6
# Filter a collection of TextRun objects based on a set of conditions.
#
# The filter can return only the text runs that match the conditions
# (`only`) or every text run that does not match them (`exclude`).
#
# Text runs can be filtered on their attributes using the operators listed
# in VALID_OPERATORS. Conditions can be combined with `:and` / `:or` keys,
# and those combinators can be nested to any depth.
#
# Within a single hash an `:or` key takes precedence over an `:and` key,
# which in turn takes precedence over attribute keys; keys that sit next to
# a combinator in the same hash are ignored.
#
# Examples:
#   1. Single condition
#     AdvancedTextRunFilter.exclude(text_runs, text: { include: 'sample' })
#
#   2. Multiple conditions (and)
#     AdvancedTextRunFilter.exclude(text_runs, {
#       font_size: { greater_than: 10, less_than: 15 }
#     })
#
#   3. Multiple possible values (or)
#     AdvancedTextRunFilter.exclude(text_runs, {
#       font_size: { equal: [10, 12] }
#     })
#
#   4. Complex AND/OR filter
#     AdvancedTextRunFilter.exclude(text_runs, {
#       and: [
#         { font_size: { greater_than: 10 } },
#         { or: [
#           { text: { include: "sample" } },
#           { width: { greater_than: 100 } }
#         ]}
#       ]
#     })
class AdvancedTextRunFilter
  # Operators accepted inside an attribute's condition hash.
  VALID_OPERATORS = %i[
    equal
    not_equal
    greater_than
    less_than
    greater_than_or_equal
    less_than_or_equal
    include
    exclude
  ].freeze

  # Return the text runs that match filter_hash.
  def self.only(text_runs, filter_hash)
    new(text_runs, filter_hash).only
  end

  # Return the text runs that do NOT match filter_hash.
  def self.exclude(text_runs, filter_hash)
    new(text_runs, filter_hash).exclude
  end

  attr_reader :text_runs, :filter_hash

  # text_runs   - the collection of runs to filter
  # filter_hash - a Hash of conditions, see the class documentation
  def initialize(text_runs, filter_hash)
    @text_runs = text_runs
    @filter_hash = filter_hash
  end

  # Keep only the runs that satisfy the filter. An empty filter is a no-op
  # and returns the original collection untouched.
  def only
    return text_runs if filter_hash.empty?
    text_runs.select { |text_run| evaluate_filter(text_run) }
  end

  # Drop the runs that satisfy the filter. An empty filter is a no-op and
  # returns the original collection untouched.
  def exclude
    return text_runs if filter_hash.empty?
    text_runs.reject { |text_run| evaluate_filter(text_run) }
  end

  private

  # True when text_run satisfies the filter this instance was built with.
  def evaluate_filter(text_run)
    evaluate_filters(text_run, filter_hash)
  end

  # True when text_run satisfies at least one of the conditions. Each
  # condition may itself be a nested :and / :or hash.
  def evaluate_or_filters(text_run, conditions)
    conditions.any? do |condition|
      evaluate_filters(text_run, condition)
    end
  end

  # True when text_run satisfies every one of the conditions. Each
  # condition may itself be a nested :and / :or hash.
  def evaluate_and_filters(text_run, conditions)
    conditions.all? do |condition|
      evaluate_filters(text_run, condition)
    end
  end

  # Evaluate a single condition hash against text_run.
  #
  # The :or / :and dispatch lives here (rather than only at the top level)
  # so that combinators nested inside other combinators are recursed into
  # instead of being mistaken for an attribute name.
  def evaluate_filters(text_run, filter_hash)
    if filter_hash[:or]
      evaluate_or_filters(text_run, filter_hash[:or])
    elsif filter_hash[:and]
      evaluate_and_filters(text_run, filter_hash[:and])
    else
      filter_hash.all? do |attribute, conditions|
        evaluate_attribute_conditions(text_run, attribute, conditions)
      end
    end
  end

  # True when the named attribute of text_run satisfies every
  # operator => value pair in conditions.
  #
  # Raises ArgumentError if an operator is not in VALID_OPERATORS.
  def evaluate_attribute_conditions(text_run, attribute, conditions)
    conditions.all? do |operator, value|
      unless VALID_OPERATORS.include?(operator)
        raise ArgumentError, "Invalid operator: #{operator}"
      end

      # NOTE(review): `send` will invoke any method named by the filter
      # hash, so filter hashes must not be built from untrusted input.
      apply_operator(text_run.send(attribute), operator, value)
    end
  end

  # Compare attribute_value with filter_value using operator.
  #
  # :equal, :not_equal, :include and :exclude accept either a single value
  # or an Array of candidate values. :include / :exclude compare the string
  # forms of both sides as a substring test.
  def apply_operator(attribute_value, operator, filter_value)
    case operator
    when :equal
      Array(filter_value).include?(attribute_value)
    when :not_equal
      !Array(filter_value).include?(attribute_value)
    when :greater_than
      attribute_value > filter_value
    when :less_than
      attribute_value < filter_value
    when :greater_than_or_equal
      attribute_value >= filter_value
    when :less_than_or_equal
      attribute_value <= filter_value
    when :include
      Array(filter_value).any? { |v| attribute_value.to_s.include?(v.to_s) }
    when :exclude
      Array(filter_value).none? { |v| attribute_value.to_s.include?(v.to_s) }
    end
  end
end
137
+ end
@@ -1,5 +1,5 @@
1
1
  # coding: ASCII-8BIT
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
5
  ################################################################################
@@ -52,7 +52,7 @@ class PDF::Reader
52
52
  CR = "\r"
53
53
  LF = "\n"
54
54
  CRLF = "\r\n"
55
- WHITE_SPACE = [LF, CR, ' ']
55
+ WHITE_SPACE = ["\n", "\r", ' ']
56
56
 
57
57
  # Quite a few PDFs have trailing junk.
58
58
  # This can be several k of nuls in some cases
@@ -1,5 +1,5 @@
1
1
  # coding: utf-8
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
5
  ################################################################################
@@ -68,6 +68,14 @@ module PDF
68
68
  runs = merge_runs(runs)
69
69
  end
70
70
 
71
+ if (only_filter = opts.fetch(:only, nil))
72
+ runs = AdvancedTextRunFilter.only(runs, only_filter)
73
+ end
74
+
75
+ if (exclude_filter = opts.fetch(:exclude, nil))
76
+ runs = AdvancedTextRunFilter.exclude(runs, exclude_filter)
77
+ end
78
+
71
79
  runs
72
80
  end
73
81
 
data/lib/pdf/reader.rb CHANGED
@@ -280,6 +280,7 @@ end
280
280
  ################################################################################
281
281
 
282
282
  require 'pdf/reader/resources'
283
+ require 'pdf/reader/advanced_text_run_filter'
283
284
  require 'pdf/reader/buffer'
284
285
  require 'pdf/reader/bounding_rectangle_runs_filter'
285
286
  require 'pdf/reader/cid_widths'
data/rbi/pdf-reader.rbi CHANGED
@@ -75,20 +75,20 @@ module PDF
75
75
  end
76
76
 
77
77
  class Buffer
78
- TOKEN_WHITESPACE = T.let(T.unsafe(nil), T::Array[Integer])
79
- TOKEN_DELIMITER = T.let(T.unsafe(nil), T::Array[Integer])
80
- LEFT_PAREN = T.let(T.unsafe(nil), String)
81
- LESS_THAN = T.let(T.unsafe(nil), String)
82
- STREAM = T.let(T.unsafe(nil), String)
83
- ID = T.let(T.unsafe(nil), String)
84
- FWD_SLASH = T.let(T.unsafe(nil), String)
85
- NULL_BYTE = T.let(T.unsafe(nil), String)
86
- CR = T.let(T.unsafe(nil), String)
87
- LF = T.let(T.unsafe(nil), String)
88
- CRLF = T.let(T.unsafe(nil), String)
89
- WHITE_SPACE = T.let(T.unsafe(nil), T::Array[String])
90
- TRAILING_BYTECOUNT = T.let(T.unsafe(nil), Integer)
91
- DIGITS_ONLY = T.let(T.unsafe(nil), Regexp)
78
+ TOKEN_WHITESPACE = T.let(T::Array[String])
79
+ TOKEN_DELIMITER = T.let(T::Array[Integer])
80
+ LEFT_PAREN = T.let(String)
81
+ LESS_THAN = T.let(String)
82
+ STREAM = T.let(String)
83
+ ID = T.let(String)
84
+ FWD_SLASH = T.let(String)
85
+ NULL_BYTE = T.let(String)
86
+ CR = T.let(String)
87
+ LF = T.let(String)
88
+ CRLF = T.let(String)
89
+ WHITE_SPACE = T.let(T::Array[String])
90
+ TRAILING_BYTECOUNT = T.let(Integer)
91
+ DIGITS_ONLY = T.let(Regexp)
92
92
 
93
93
  sig { returns(Integer) }
94
94
  attr_reader :pos
@@ -851,6 +851,52 @@ module PDF
851
851
  def self.exclude_empty_strings(runs); end
852
852
  end
853
853
 
854
# Sorbet RBI stub for PDF::Reader::AdvancedTextRunFilter: type signatures
# only, the method bodies here are intentionally empty.
+ class AdvancedTextRunFilter
855
# NOTE(review): single-argument T.let; Sorbet documents the form
# T.let(value, Type) - confirm this declaration type-checks as intended.
+ VALID_OPERATORS = T.let(T::Array[Symbol])
856
+
857
+ sig { params(text_runs: T::Array[PDF::Reader::TextRun], filter_hash: T::Hash[Symbol, T.untyped]).returns(T::Array[PDF::Reader::TextRun]) }
858
+ def self.only(text_runs, filter_hash); end
859
+
860
+ sig { params(text_runs: T::Array[PDF::Reader::TextRun], filter_hash: T::Hash[Symbol, T.untyped]).returns(T::Array[PDF::Reader::TextRun]) }
861
+ def self.exclude(text_runs, filter_hash); end
862
+
863
+ sig { returns(T::Array[PDF::Reader::TextRun]) }
864
+ attr_reader :text_runs
865
+
866
+ sig { returns(T::Hash[Symbol, T.untyped]) }
867
+ attr_reader :filter_hash
868
+
869
+ sig { params(text_runs: T::Array[PDF::Reader::TextRun], filter_hash: T::Hash[Symbol, T.untyped]).void }
870
+ def initialize(text_runs, filter_hash)
871
+ @text_runs = T.let(T.unsafe(nil), T::Array[PDF::Reader::TextRun])
872
+ @filter_hash = T.let(T.unsafe(nil), T::Hash[Symbol, T.untyped])
873
+ end
874
+
875
+ sig { returns(T::Array[PDF::Reader::TextRun]) }
876
+ def only; end
877
+
878
+ sig { returns(T::Array[PDF::Reader::TextRun]) }
879
+ def exclude; end
880
+
# NOTE(review): in lib/pdf/reader/advanced_text_run_filter.rb the methods
# from here down are defined after `private`; no visibility modifier is
# declared for them in this stub - confirm whether it should mirror that.
881
+ sig { params(text_run: PDF::Reader::TextRun).returns(T::Boolean) }
882
+ def evaluate_filter(text_run); end
883
+
884
+ sig { params(text_run: PDF::Reader::TextRun, conditions: T::Array[T::Hash[Symbol, T.untyped]]).returns(T::Boolean) }
885
+ def evaluate_or_filters(text_run, conditions); end
886
+
887
+ sig { params(text_run: PDF::Reader::TextRun, conditions: T::Array[T::Hash[Symbol, T.untyped]]).returns(T::Boolean) }
888
+ def evaluate_and_filters(text_run, conditions); end
889
+
890
+ sig { params(text_run: PDF::Reader::TextRun, filter_hash: T::Hash[Symbol, T.untyped]).returns(T::Boolean) }
891
+ def evaluate_filters(text_run, filter_hash); end
892
+
893
+ sig { params(text_run: PDF::Reader::TextRun, attribute: Symbol, conditions: T::Hash[Symbol, T.untyped]).returns(T::Boolean) }
894
+ def evaluate_attribute_conditions(text_run, attribute, conditions); end
895
+
896
+ sig { params(attribute_value: T.untyped, operator: Symbol, filter_value: T.untyped).returns(T::Boolean) }
897
+ def apply_operator(attribute_value, operator, filter_value); end
898
+ end
899
+
854
900
  class EventPoint
855
901
  sig { returns(Numeric) }
856
902
  attr_reader :x
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf-reader
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.13.0
4
+ version: 2.14.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Healy
8
- autorequire:
9
8
  bindir: bin
10
9
  cert_chain: []
11
- date: 2024-11-02 00:00:00.000000000 Z
10
+ date: 2025-02-04 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
13
  name: rake
@@ -212,6 +211,7 @@ files:
212
211
  - examples/version.rb
213
212
  - lib/pdf-reader.rb
214
213
  - lib/pdf/reader.rb
214
+ - lib/pdf/reader/advanced_text_run_filter.rb
215
215
  - lib/pdf/reader/aes_v2_security_handler.rb
216
216
  - lib/pdf/reader/aes_v3_security_handler.rb
217
217
  - lib/pdf/reader/afm/Courier-Bold.afm
@@ -301,10 +301,9 @@ licenses:
301
301
  - MIT
302
302
  metadata:
303
303
  bug_tracker_uri: https://github.com/yob/pdf-reader/issues
304
- changelog_uri: https://github.com/yob/pdf-reader/blob/v2.13.0/CHANGELOG
305
- documentation_uri: https://www.rubydoc.info/gems/pdf-reader/2.13.0
306
- source_code_uri: https://github.com/yob/pdf-reader/tree/v2.13.0
307
- post_install_message:
304
+ changelog_uri: https://github.com/yob/pdf-reader/blob/v2.14.1/CHANGELOG
305
+ documentation_uri: https://www.rubydoc.info/gems/pdf-reader/2.14.1
306
+ source_code_uri: https://github.com/yob/pdf-reader/tree/v2.14.1
308
307
  rdoc_options:
309
308
  - "--title"
310
309
  - PDF::Reader Documentation
@@ -317,15 +316,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
317
316
  requirements:
318
317
  - - ">="
319
318
  - !ruby/object:Gem::Version
320
- version: '2.0'
319
+ version: '2.1'
321
320
  required_rubygems_version: !ruby/object:Gem::Requirement
322
321
  requirements:
323
322
  - - ">="
324
323
  - !ruby/object:Gem::Version
325
324
  version: '0'
326
325
  requirements: []
327
- rubygems_version: 3.4.10
328
- signing_key:
326
+ rubygems_version: 3.6.2
329
327
  specification_version: 4
330
328
  summary: A library for accessing the content of PDF files
331
329
  test_files: []