pdf-reader-turtletext 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +6 -1
- data/README.rdoc +40 -6
- data/lib/pdf/reader/turtletext.rb +10 -8
- data/lib/pdf/reader/turtletext/textangle.rb +48 -14
- data/lib/pdf/reader/turtletext/version.rb +1 -1
- data/pdf-reader-turtletext.gemspec +2 -2
- data/spec/fixtures/pdf_samples/expectations.yml +6 -3
- data/spec/unit/reader/turtletext/textangle_spec.rb +395 -124
- data/spec/unit/reader/turtletext/turtletext_spec.rb +15 -4
- metadata +18 -18
data/CHANGELOG
CHANGED
@@ -1,4 +1,9 @@
|
|
1
|
-
Version 0.2.
|
1
|
+
Version 0.2.2 Release: 1st Aug 2012
|
2
|
+
==================================================
|
3
|
+
* provide better control of inclusive/exclusive behaviour
|
4
|
+
for region selection \#4
|
5
|
+
|
6
|
+
Version 0.2.1 Release: 31st July 2012
|
2
7
|
==================================================
|
3
8
|
* fix row sorting for Rubinius 1.8 mode
|
4
9
|
|
data/README.rdoc
CHANGED
@@ -36,7 +36,9 @@ Then bundle install:
|
|
36
36
|
|
37
37
|
=== How do I install it for gem development?
|
38
38
|
|
39
|
-
If you want to work on enhancements of fix bugs in PDF::Reader::Turtletext, fork and clone the github repository.
|
39
|
+
If you want to work on enhancements or fix bugs in PDF::Reader::Turtletext, fork and clone the github repository. If you are using bundler (recommended), run <tt>bundle</tt> to install development dependencies.
|
40
|
+
|
41
|
+
See the section below on 'Contributing to PDF::Reader::Turtletext' for more information.
|
40
42
|
|
41
43
|
=== How to instantiate Turtletext in code
|
42
44
|
|
@@ -70,6 +72,8 @@ Solution: use the <tt>bounding_box</tt> method to describe the region and extrac
|
|
70
72
|
|
71
73
|
The range of methods that can be used within the <tt>bounding_box</tt> block are all optional, and include:
|
72
74
|
* <tt>page</tt> - specifies the PDF page from which to extract text (default is 1).
|
75
|
+
* <tt>inclusive</tt> - whether region selection should be inclusive or exclusive of the specified positions
|
76
|
+
(default is false).
|
73
77
|
* <tt>below</tt> - a string, regex or number that describes the upper limit of the text box
|
74
78
|
(default is top border of the page).
|
75
79
|
* <tt>above</tt> - a string, regex or number that describes the lower limit of the text box
|
@@ -98,17 +102,47 @@ An explicit block parameter may be used with the <tt>bounding_box</tt> method:
|
|
98
102
|
textangle.text
|
99
103
|
=> [['string','string'],['string']] # array of rows, each row is an array of text elements in the row
|
100
104
|
|
105
|
+
=== How to describe an inclusive <tt>bounding_box</tt> region
|
106
|
+
|
107
|
+
By default, the <tt>bounding_box</tt> method makes exclusive selection (i.e. not including the
|
108
|
+
region limits).
|
109
|
+
|
110
|
+
To specify an inclusive region, use the <tt>inclusive!</tt> command:
|
111
|
+
|
112
|
+
textangle = reader.bounding_box do
|
113
|
+
inclusive!
|
114
|
+
below /electricity/i
|
115
|
+
left_of "Total ($)"
|
116
|
+
end
|
117
|
+
|
118
|
+
Alternatively, set <tt>inclusive</tt> to true:
|
119
|
+
|
120
|
+
textangle = reader.bounding_box do
|
121
|
+
inclusive true
|
122
|
+
below /electricity/i
|
123
|
+
left_of "Total ($)"
|
124
|
+
end
|
125
|
+
|
126
|
+
Or with a block parameter, you may also assign <tt>inclusive</tt> to true:
|
127
|
+
|
128
|
+
textangle = reader.bounding_box do |r|
|
129
|
+
r.inclusive = true
|
130
|
+
r.below /electricity/i
|
131
|
+
r.left_of "Total ($)"
|
132
|
+
end
|
133
|
+
|
101
134
|
=== Extract text for a region with known positional co-ordinates
|
102
135
|
|
103
136
|
If you know (or can calculate) the x,y positions of the required text region, you can extract the region's
|
104
137
|
text using the <tt>text_in_region</tt> method.
|
105
138
|
|
106
139
|
text = reader.text_in_region(
|
107
|
-
10,
|
108
|
-
900,
|
109
|
-
200,
|
110
|
-
400,
|
111
|
-
1 # page
|
140
|
+
10, # minimum x (left-most)
|
141
|
+
900, # maximum x (right-most)
|
142
|
+
200, # minimum y (bottom-most)
|
143
|
+
400, # maximum y (top-most)
|
144
|
+
1, # page (default 1)
|
145
|
+
false # inclusive of x/y position if true (default false)
|
112
146
|
)
|
113
147
|
=> [['string','string'],['string']] # array of rows, each row is an array of text elements in the row
|
114
148
|
|
@@ -76,21 +76,22 @@ class PDF::Reader::Turtletext
|
|
76
76
|
end
|
77
77
|
end
|
78
78
|
|
79
|
-
# Returns an array of text elements found within the x,y limits
|
80
|
-
# x ranges from +xmin+ (left of page) to +xmax+ (right of page)
|
81
|
-
# y ranges from +ymin+ (bottom of page) to +ymax+ (top of page)
|
82
|
-
#
|
79
|
+
# Returns an array of text elements found within the x,y limits on +page+:
|
80
|
+
# * x ranges from +xmin+ (left of page) to +xmax+ (right of page)
|
81
|
+
# * y ranges from +ymin+ (bottom of page) to +ymax+ (top of page)
|
82
|
+
# When +inclusive+ is false (default) the x/y limits do not include the actual x/y value.
|
83
83
|
# Each line of text is an array of the separate text elements found on that line.
|
84
84
|
# [["first line first text", "first line last text"],["second line text"]]
|
85
|
-
def text_in_region(xmin,xmax,ymin,ymax,page=1)
|
85
|
+
def text_in_region(xmin,xmax,ymin,ymax,page=1,inclusive=false)
|
86
|
+
return [] unless xmin && xmax && ymin && ymax
|
86
87
|
text_map = content(page)
|
87
88
|
box = []
|
88
89
|
|
89
90
|
text_map.each do |y,text_row|
|
90
|
-
if y >= ymin && y<= ymax
|
91
|
+
if inclusive ? (y >= ymin && y <= ymax) : (y > ymin && y < ymax)
|
91
92
|
row = []
|
92
93
|
text_row.each do |x,element|
|
93
|
-
if x >= xmin && x<= xmax
|
94
|
+
if inclusive ? (x >= xmin && x <= xmax) : (x > xmin && x < xmax)
|
94
95
|
row << element
|
95
96
|
end
|
96
97
|
end
|
@@ -102,7 +103,8 @@ class PDF::Reader::Turtletext
|
|
102
103
|
|
103
104
|
# Returns the position of +text+ on +page+
|
104
105
|
# {x: val, y: val }
|
105
|
-
# +text+ may be a string (exact match required) or a Regexp
|
106
|
+
# +text+ may be a string (exact match required) or a Regexp.
|
107
|
+
# Returns nil if the text cannot be found.
|
106
108
|
def text_position(text,page=1)
|
107
109
|
item = if text.class <= Regexp
|
108
110
|
content(page).map do |k,v|
|
@@ -10,14 +10,15 @@
|
|
10
10
|
# textangle.text
|
11
11
|
#
|
12
12
|
class PDF::Reader::Turtletext::Textangle
|
13
|
+
|
14
|
+
#
|
13
15
|
attr_reader :reader
|
14
|
-
attr_accessor :page
|
15
|
-
attr_writer :above,:below,:left_of,:right_of
|
16
16
|
|
17
17
|
# +turtletext_reader+ is a PDF::Reader::Turtletext
|
18
18
|
def initialize(turtletext_reader,&block)
|
19
19
|
@reader = turtletext_reader
|
20
20
|
@page = 1
|
21
|
+
@inclusive = false
|
21
22
|
if block_given?
|
22
23
|
if block.arity == 1
|
23
24
|
yield self
|
@@ -27,6 +28,34 @@ class PDF::Reader::Turtletext::Textangle
|
|
27
28
|
end
|
28
29
|
end
|
29
30
|
|
31
|
+
attr_writer :inclusive
|
32
|
+
|
33
|
+
def inclusive(*args)
|
34
|
+
if value = args.first
|
35
|
+
@inclusive = value
|
36
|
+
end
|
37
|
+
@inclusive
|
38
|
+
end
|
39
|
+
|
40
|
+
# Command: sets +inclusive+ to true
|
41
|
+
def inclusive!
|
42
|
+
@inclusive = true
|
43
|
+
end
|
44
|
+
|
45
|
+
# Command: sets +inclusive+ to false
|
46
|
+
def exclusive!
|
47
|
+
@inclusive = false
|
48
|
+
end
|
49
|
+
|
50
|
+
attr_writer :page
|
51
|
+
def page(*args)
|
52
|
+
if value = args.first
|
53
|
+
@page = value
|
54
|
+
end
|
55
|
+
@page
|
56
|
+
end
|
57
|
+
|
58
|
+
attr_writer :above
|
30
59
|
def above(*args)
|
31
60
|
if value = args.first
|
32
61
|
@above = value
|
@@ -34,6 +63,7 @@ class PDF::Reader::Turtletext::Textangle
|
|
34
63
|
@above
|
35
64
|
end
|
36
65
|
|
66
|
+
attr_writer :below
|
37
67
|
def below(*args)
|
38
68
|
if value = args.first
|
39
69
|
@below = value
|
@@ -41,6 +71,7 @@ class PDF::Reader::Turtletext::Textangle
|
|
41
71
|
@below
|
42
72
|
end
|
43
73
|
|
74
|
+
attr_writer :left_of
|
44
75
|
def left_of(*args)
|
45
76
|
if value = args.first
|
46
77
|
@left_of = value
|
@@ -48,6 +79,7 @@ class PDF::Reader::Turtletext::Textangle
|
|
48
79
|
@left_of
|
49
80
|
end
|
50
81
|
|
82
|
+
attr_writer :right_of
|
51
83
|
def right_of(*args)
|
52
84
|
if value = args.first
|
53
85
|
@right_of = value
|
@@ -55,15 +87,17 @@ class PDF::Reader::Turtletext::Textangle
|
|
55
87
|
@right_of
|
56
88
|
end
|
57
89
|
|
58
|
-
# Returns the text
|
90
|
+
# Returns the text array found within the defined region.
|
91
|
+
# Each line of text is an array of the separate text elements found on that line.
|
92
|
+
# [["first line first text", "first line last text"],["second line text"]]
|
59
93
|
def text
|
60
94
|
return unless reader
|
61
95
|
|
62
96
|
xmin = if right_of
|
63
97
|
if [Fixnum,Float].include?(right_of.class)
|
64
98
|
right_of
|
65
|
-
|
66
|
-
|
99
|
+
elsif xy = reader.text_position(right_of,page)
|
100
|
+
xy[:x]
|
67
101
|
end
|
68
102
|
else
|
69
103
|
0
|
@@ -71,18 +105,18 @@ class PDF::Reader::Turtletext::Textangle
|
|
71
105
|
xmax = if left_of
|
72
106
|
if [Fixnum,Float].include?(left_of.class)
|
73
107
|
left_of
|
74
|
-
|
75
|
-
|
108
|
+
elsif xy = reader.text_position(left_of,page)
|
109
|
+
xy[:x]
|
76
110
|
end
|
77
111
|
else
|
78
|
-
99999 # TODO actual limit
|
112
|
+
99999 # TODO: figure out the actual limit?
|
79
113
|
end
|
80
114
|
|
81
115
|
ymin = if above
|
82
116
|
if [Fixnum,Float].include?(above.class)
|
83
117
|
above
|
84
|
-
|
85
|
-
|
118
|
+
elsif xy = reader.text_position(above,page)
|
119
|
+
xy[:y]
|
86
120
|
end
|
87
121
|
else
|
88
122
|
0
|
@@ -90,14 +124,14 @@ class PDF::Reader::Turtletext::Textangle
|
|
90
124
|
ymax = if below
|
91
125
|
if [Fixnum,Float].include?(below.class)
|
92
126
|
below
|
93
|
-
|
94
|
-
|
127
|
+
elsif xy = reader.text_position(below,page)
|
128
|
+
xy[:y]
|
95
129
|
end
|
96
130
|
else
|
97
|
-
99999 # TODO actual limit
|
131
|
+
99999 # TODO: figure out the actual limit?
|
98
132
|
end
|
99
133
|
|
100
|
-
reader.text_in_region(xmin,xmax,ymin,ymax,page)
|
134
|
+
reader.text_in_region(xmin,xmax,ymin,ymax,page,inclusive)
|
101
135
|
end
|
102
136
|
|
103
137
|
end
|
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "pdf-reader-turtletext"
|
8
|
-
s.version = "0.2.
|
8
|
+
s.version = "0.2.2"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Paul Gallagher"]
|
12
|
-
s.date = "2012-
|
12
|
+
s.date = "2012-08-01"
|
13
13
|
s.description = "a library that can read structured and positional text from PDFs. Ideal for asembling structured data from invoices and the like."
|
14
14
|
s.email = "gallagher.paul@gmail.com"
|
15
15
|
s.extra_rdoc_files = [
|
@@ -3,19 +3,22 @@
|
|
3
3
|
# This is a YAML-format file, so beware that indentation is significant
|
4
4
|
---
|
5
5
|
hello_world.pdf:
|
6
|
-
:
|
6
|
+
:test_numeric_above:
|
7
7
|
:above: 100
|
8
8
|
:expected_text:
|
9
9
|
-
|
10
10
|
- "Hello World"
|
11
|
-
:
|
11
|
+
:test_numeric_below:
|
12
12
|
:below: 900
|
13
13
|
:expected_text:
|
14
14
|
-
|
15
15
|
- "Hello World"
|
16
|
-
:
|
16
|
+
:test_numeric_below_na:
|
17
17
|
:below: 10
|
18
18
|
:expected_text: []
|
19
|
+
:test_below_na:
|
20
|
+
:below: "Bertie"
|
21
|
+
:expected_text: []
|
19
22
|
simple_table_text.pdf:
|
20
23
|
:test_above:
|
21
24
|
:above: Table Header
|
@@ -14,7 +14,7 @@ describe PDF::Reader::Turtletext::Textangle do
|
|
14
14
|
it { should be_a(PDF::Reader::Turtletext) }
|
15
15
|
end
|
16
16
|
|
17
|
-
|
17
|
+
context "with mock content" do
|
18
18
|
let(:page) { 1 }
|
19
19
|
before do
|
20
20
|
turtletext_reader.stub(:load_content).and_return(given_page_content)
|
@@ -28,10 +28,10 @@ describe PDF::Reader::Turtletext::Textangle do
|
|
28
28
|
|
29
29
|
context "with block param" do
|
30
30
|
[:above,:below,:left_of,:right_of].each do |positional_method|
|
31
|
-
|
31
|
+
describe "##{positional_method}" do
|
32
32
|
let(:term) { "canary" }
|
33
33
|
|
34
|
-
it "should
|
34
|
+
it "should assign correctly" do
|
35
35
|
textangle = resource_class.new(turtletext_reader) do |r|
|
36
36
|
r.send("#{positional_method}=",term)
|
37
37
|
end
|
@@ -40,159 +40,430 @@ describe PDF::Reader::Turtletext::Textangle do
|
|
40
40
|
|
41
41
|
end
|
42
42
|
end
|
43
|
+
|
44
|
+
describe "#page" do
|
45
|
+
let(:value) { 2 }
|
46
|
+
describe "default" do
|
47
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
48
|
+
end }
|
49
|
+
subject { textangle.page }
|
50
|
+
it { should eql(1) }
|
51
|
+
end
|
52
|
+
it "should assign correctly" do
|
53
|
+
textangle = resource_class.new(turtletext_reader) do |r|
|
54
|
+
r.page = value
|
55
|
+
end
|
56
|
+
textangle.page.should eql(value)
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
describe "#inclusive" do
|
61
|
+
describe "default" do
|
62
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
63
|
+
end }
|
64
|
+
subject { textangle.inclusive }
|
65
|
+
it { should be_false }
|
66
|
+
end
|
67
|
+
it "should assign true correctly" do
|
68
|
+
textangle = resource_class.new(turtletext_reader) do |r|
|
69
|
+
r.inclusive = true
|
70
|
+
end
|
71
|
+
textangle.inclusive.should be_true
|
72
|
+
end
|
73
|
+
it "should assign false correctly" do
|
74
|
+
textangle = resource_class.new(turtletext_reader) do |r|
|
75
|
+
r.inclusive = false
|
76
|
+
end
|
77
|
+
textangle.inclusive.should be_false
|
78
|
+
end
|
79
|
+
end
|
80
|
+
describe "#inclusive!" do
|
81
|
+
it "should assign correctly" do
|
82
|
+
textangle = resource_class.new(turtletext_reader) do |r|
|
83
|
+
r.inclusive!
|
84
|
+
end
|
85
|
+
textangle.inclusive.should be_true
|
86
|
+
end
|
87
|
+
end
|
88
|
+
describe "#exclusive!" do
|
89
|
+
it "should assign correctly" do
|
90
|
+
textangle = resource_class.new(turtletext_reader) do |r|
|
91
|
+
r.exclusive!
|
92
|
+
end
|
93
|
+
textangle.inclusive.should be_false
|
94
|
+
end
|
95
|
+
end
|
43
96
|
end
|
44
97
|
|
45
98
|
context "without block param" do
|
46
|
-
it "#above should
|
99
|
+
it "#above should assign correctly" do
|
47
100
|
textangle = resource_class.new(turtletext_reader) do
|
48
101
|
above "canary"
|
49
102
|
end
|
50
103
|
textangle.above.should eql("canary")
|
51
104
|
end
|
52
|
-
it "#below should
|
105
|
+
it "#below should assign correctly" do
|
53
106
|
textangle = resource_class.new(turtletext_reader) do
|
54
107
|
below "canary"
|
55
108
|
end
|
56
109
|
textangle.below.should eql("canary")
|
57
110
|
end
|
58
|
-
it "#left_of should
|
111
|
+
it "#left_of should assign correctly" do
|
59
112
|
textangle = resource_class.new(turtletext_reader) do
|
60
113
|
left_of "canary"
|
61
114
|
end
|
62
115
|
textangle.left_of.should eql("canary")
|
63
116
|
end
|
64
|
-
it "#below should
|
117
|
+
it "#below should assign correctly" do
|
65
118
|
textangle = resource_class.new(turtletext_reader) do
|
66
119
|
right_of "canary"
|
67
120
|
end
|
68
121
|
textangle.right_of.should eql("canary")
|
69
122
|
end
|
70
|
-
end
|
71
123
|
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
it { should eql(expected) }
|
80
|
-
end
|
81
|
-
context "as a regex" do
|
82
|
-
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
83
|
-
r.below = /Fraud/i
|
84
|
-
end }
|
85
|
-
let(:expected) { [["smoked and streaky for me"]]}
|
86
|
-
subject { textangle.text }
|
87
|
-
it { should eql(expected) }
|
88
|
-
end
|
89
|
-
context "as a number" do
|
90
|
-
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
91
|
-
r.below = 20
|
92
|
-
end }
|
93
|
-
let(:expected) { [["smoked and streaky for me"]]}
|
94
|
-
subject { textangle.text }
|
95
|
-
it { should eql(expected) }
|
124
|
+
describe "#page" do
|
125
|
+
it "should assign correctly" do
|
126
|
+
textangle = resource_class.new(turtletext_reader) do
|
127
|
+
page 2
|
128
|
+
end
|
129
|
+
textangle.page.should eql(2)
|
130
|
+
end
|
96
131
|
end
|
97
|
-
end
|
98
132
|
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
it
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
end
|
112
|
-
|
113
|
-
|
114
|
-
it
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
end
|
120
|
-
|
121
|
-
|
122
|
-
it
|
133
|
+
describe "#inclusive" do
|
134
|
+
it "should assign true correctly" do
|
135
|
+
textangle = resource_class.new(turtletext_reader) do
|
136
|
+
inclusive true
|
137
|
+
end
|
138
|
+
textangle.inclusive.should be_true
|
139
|
+
end
|
140
|
+
it "should assign false correctly" do
|
141
|
+
textangle = resource_class.new(turtletext_reader) do
|
142
|
+
inclusive false
|
143
|
+
end
|
144
|
+
textangle.inclusive.should be_false
|
145
|
+
end
|
146
|
+
end
|
147
|
+
describe "#inclusive!" do
|
148
|
+
it "should assign correctly" do
|
149
|
+
textangle = resource_class.new(turtletext_reader) do
|
150
|
+
inclusive!
|
151
|
+
end
|
152
|
+
textangle.inclusive.should be_true
|
153
|
+
end
|
154
|
+
end
|
155
|
+
describe "#exclusive!" do
|
156
|
+
it "should assign correctly" do
|
157
|
+
textangle = resource_class.new(turtletext_reader) do
|
158
|
+
exclusive!
|
159
|
+
end
|
160
|
+
textangle.inclusive.should be_false
|
161
|
+
end
|
123
162
|
end
|
124
163
|
end
|
125
164
|
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
165
|
+
describe "#text" do
|
166
|
+
|
167
|
+
context "when only below specified" do
|
168
|
+
context "when exclusive (default)" do
|
169
|
+
context "as a string" do
|
170
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
171
|
+
r.below = "turkey bacon"
|
172
|
+
end }
|
173
|
+
let(:expected) { [["smoked and streaky for me"]]}
|
174
|
+
subject { textangle.text }
|
175
|
+
it { should eql(expected) }
|
176
|
+
end
|
177
|
+
context "as a regex" do
|
178
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
179
|
+
r.below = /turkey/i
|
180
|
+
end }
|
181
|
+
let(:expected) { [["smoked and streaky for me"]]}
|
182
|
+
subject { textangle.text }
|
183
|
+
it { should eql(expected) }
|
184
|
+
end
|
185
|
+
context "as a number" do
|
186
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
187
|
+
r.below = 30
|
188
|
+
end }
|
189
|
+
let(:expected) { [["smoked and streaky for me"]]}
|
190
|
+
subject { textangle.text }
|
191
|
+
it { should eql(expected) }
|
192
|
+
end
|
193
|
+
end
|
194
|
+
context "when inclusive" do
|
195
|
+
context "as a string" do
|
196
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
197
|
+
r.inclusive = true
|
198
|
+
r.below = "smoked and streaky for me"
|
199
|
+
end }
|
200
|
+
let(:expected) { [["smoked and streaky for me"]]}
|
201
|
+
subject { textangle.text }
|
202
|
+
it { should eql(expected) }
|
203
|
+
end
|
204
|
+
context "as a regex" do
|
205
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
206
|
+
r.inclusive = true
|
207
|
+
r.below = /Streaky/i
|
208
|
+
end }
|
209
|
+
let(:expected) { [["smoked and streaky for me"]]}
|
210
|
+
subject { textangle.text }
|
211
|
+
it { should eql(expected) }
|
212
|
+
end
|
213
|
+
context "as a number" do
|
214
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
215
|
+
r.inclusive = true
|
216
|
+
r.below = 10
|
217
|
+
end }
|
218
|
+
let(:expected) { [["smoked and streaky for me"]]}
|
219
|
+
subject { textangle.text }
|
220
|
+
it { should eql(expected) }
|
221
|
+
end
|
222
|
+
end
|
223
|
+
context "when no match" do
|
224
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
225
|
+
r.below = "fake"
|
226
|
+
end }
|
227
|
+
let(:expected) { [] }
|
228
|
+
subject { textangle.text }
|
229
|
+
it { should eql(expected) }
|
230
|
+
end
|
159
231
|
end
|
160
|
-
end
|
161
232
|
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
end
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
233
|
+
context "when only above specified" do
|
234
|
+
context "when exclusive (default)" do
|
235
|
+
context "as a string" do
|
236
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
237
|
+
r.above = "bacon on kimchi noodles"
|
238
|
+
end }
|
239
|
+
let(:expected) { [["crunchy bacon"]] }
|
240
|
+
subject { textangle.text }
|
241
|
+
it { should eql(expected) }
|
242
|
+
end
|
243
|
+
context "as a regex" do
|
244
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
245
|
+
r.above = /kimchi/i
|
246
|
+
end }
|
247
|
+
let(:expected) { [["crunchy bacon"]] }
|
248
|
+
subject { textangle.text }
|
249
|
+
it { should eql(expected) }
|
250
|
+
end
|
251
|
+
context "as a number" do
|
252
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
253
|
+
r.above = 40
|
254
|
+
end }
|
255
|
+
let(:expected) { [["crunchy bacon"]] }
|
256
|
+
subject { textangle.text }
|
257
|
+
it { should eql(expected) }
|
258
|
+
end
|
259
|
+
end
|
260
|
+
context "when inclusive" do
|
261
|
+
context "as a string" do
|
262
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
263
|
+
r.inclusive = true
|
264
|
+
r.above = "crunchy bacon"
|
265
|
+
end }
|
266
|
+
let(:expected) { [["crunchy bacon"]] }
|
267
|
+
subject { textangle.text }
|
268
|
+
it { should eql(expected) }
|
269
|
+
end
|
270
|
+
context "as a regex" do
|
271
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
272
|
+
r.inclusive = true
|
273
|
+
r.above = /crunChy/i
|
274
|
+
end }
|
275
|
+
let(:expected) { [["crunchy bacon"]] }
|
276
|
+
subject { textangle.text }
|
277
|
+
it { should eql(expected) }
|
278
|
+
end
|
279
|
+
context "as a number" do
|
280
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
281
|
+
r.inclusive = true
|
282
|
+
r.above = 70
|
283
|
+
end }
|
284
|
+
let(:expected) { [["crunchy bacon"]] }
|
285
|
+
subject { textangle.text }
|
286
|
+
it { should eql(expected) }
|
287
|
+
end
|
288
|
+
end
|
289
|
+
context "when no match" do
|
290
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
291
|
+
r.above = "fake"
|
292
|
+
end }
|
293
|
+
let(:expected) { [] }
|
294
|
+
subject { textangle.text }
|
295
|
+
it { should eql(expected) }
|
296
|
+
end
|
195
297
|
end
|
298
|
+
|
299
|
+
context "when only left_of specified" do
|
300
|
+
context "when exclusive (default)" do
|
301
|
+
context "as a string" do
|
302
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
303
|
+
r.left_of = "turkey bacon"
|
304
|
+
end }
|
305
|
+
let(:expected) { [
|
306
|
+
["crunchy bacon"],
|
307
|
+
["bacon on kimchi noodles", "heaven"]
|
308
|
+
] }
|
309
|
+
subject { textangle.text }
|
310
|
+
it { should eql(expected) }
|
311
|
+
end
|
312
|
+
context "as a regex" do
|
313
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
314
|
+
r.left_of = /turKey/i
|
315
|
+
end }
|
316
|
+
let(:expected) { [
|
317
|
+
["crunchy bacon"],
|
318
|
+
["bacon on kimchi noodles", "heaven"]
|
319
|
+
] }
|
320
|
+
subject { textangle.text }
|
321
|
+
it { should eql(expected) }
|
322
|
+
end
|
323
|
+
context "as a number" do
|
324
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
325
|
+
r.left_of = 30
|
326
|
+
end }
|
327
|
+
let(:expected) { [
|
328
|
+
["crunchy bacon"],
|
329
|
+
["bacon on kimchi noodles", "heaven"]
|
330
|
+
] }
|
331
|
+
subject { textangle.text }
|
332
|
+
it { should eql(expected) }
|
333
|
+
end
|
334
|
+
end
|
335
|
+
context "when inclusive" do
|
336
|
+
context "as a string" do
|
337
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
338
|
+
r.inclusive = true
|
339
|
+
r.left_of = "heaven"
|
340
|
+
end }
|
341
|
+
let(:expected) { [
|
342
|
+
["crunchy bacon"],
|
343
|
+
["bacon on kimchi noodles", "heaven"]
|
344
|
+
] }
|
345
|
+
subject { textangle.text }
|
346
|
+
it { should eql(expected) }
|
347
|
+
end
|
348
|
+
context "as a regex" do
|
349
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
350
|
+
r.inclusive = true
|
351
|
+
r.left_of = /heaVen/i
|
352
|
+
end }
|
353
|
+
let(:expected) { [
|
354
|
+
["crunchy bacon"],
|
355
|
+
["bacon on kimchi noodles", "heaven"]
|
356
|
+
] }
|
357
|
+
subject { textangle.text }
|
358
|
+
it { should eql(expected) }
|
359
|
+
end
|
360
|
+
context "as a number" do
|
361
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
362
|
+
r.inclusive = true
|
363
|
+
r.left_of = 25
|
364
|
+
end }
|
365
|
+
let(:expected) { [
|
366
|
+
["crunchy bacon"],
|
367
|
+
["bacon on kimchi noodles", "heaven"]
|
368
|
+
] }
|
369
|
+
subject { textangle.text }
|
370
|
+
it { should eql(expected) }
|
371
|
+
end
|
372
|
+
end
|
373
|
+
context "when no match" do
|
374
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
375
|
+
r.left_of = "fake"
|
376
|
+
end }
|
377
|
+
let(:expected) { [] }
|
378
|
+
subject { textangle.text }
|
379
|
+
it { should eql(expected) }
|
380
|
+
end
|
381
|
+
end
|
382
|
+
|
383
|
+
context "when only right_of specified" do
|
384
|
+
context "when exclusive (default)" do
|
385
|
+
context "as a string" do
|
386
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
387
|
+
r.right_of = "heaven"
|
388
|
+
end }
|
389
|
+
let(:expected) { [
|
390
|
+
["turkey bacon","fraud"],
|
391
|
+
["smoked and streaky for me"]
|
392
|
+
] }
|
393
|
+
subject { textangle.text }
|
394
|
+
it { should eql(expected) }
|
395
|
+
end
|
396
|
+
context "as a regex" do
|
397
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
398
|
+
r.right_of = /Heaven/i
|
399
|
+
end }
|
400
|
+
let(:expected) { [
|
401
|
+
["turkey bacon","fraud"],
|
402
|
+
["smoked and streaky for me"]
|
403
|
+
] }
|
404
|
+
subject { textangle.text }
|
405
|
+
it { should eql(expected) }
|
406
|
+
end
|
407
|
+
context "as a number" do
|
408
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
409
|
+
r.right_of = 25
|
410
|
+
end }
|
411
|
+
let(:expected) { [
|
412
|
+
["turkey bacon","fraud"],
|
413
|
+
["smoked and streaky for me"]
|
414
|
+
] }
|
415
|
+
subject { textangle.text }
|
416
|
+
it { should eql(expected) }
|
417
|
+
end
|
418
|
+
end
|
419
|
+
context "when inclusive" do
|
420
|
+
context "as a string" do
|
421
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
422
|
+
r.inclusive = true
|
423
|
+
r.right_of = "turkey bacon"
|
424
|
+
end }
|
425
|
+
let(:expected) { [
|
426
|
+
["turkey bacon","fraud"],
|
427
|
+
["smoked and streaky for me"]
|
428
|
+
] }
|
429
|
+
subject { textangle.text }
|
430
|
+
it { should eql(expected) }
|
431
|
+
end
|
432
|
+
context "as a regex" do
|
433
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
434
|
+
r.inclusive = true
|
435
|
+
r.right_of = /turkey/i
|
436
|
+
end }
|
437
|
+
let(:expected) { [
|
438
|
+
["turkey bacon","fraud"],
|
439
|
+
["smoked and streaky for me"]
|
440
|
+
] }
|
441
|
+
subject { textangle.text }
|
442
|
+
it { should eql(expected) }
|
443
|
+
end
|
444
|
+
context "as a number" do
|
445
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
446
|
+
r.inclusive = true
|
447
|
+
r.right_of = 30
|
448
|
+
end }
|
449
|
+
let(:expected) { [
|
450
|
+
["turkey bacon","fraud"],
|
451
|
+
["smoked and streaky for me"]
|
452
|
+
] }
|
453
|
+
subject { textangle.text }
|
454
|
+
it { should eql(expected) }
|
455
|
+
end
|
456
|
+
end
|
457
|
+
context "when no match" do
|
458
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
459
|
+
r.right_of = "fake"
|
460
|
+
end }
|
461
|
+
let(:expected) { [] }
|
462
|
+
subject { textangle.text }
|
463
|
+
it { should eql(expected) }
|
464
|
+
end
|
465
|
+
end
|
466
|
+
|
196
467
|
end
|
197
468
|
|
198
469
|
end
|
@@ -90,7 +90,7 @@ describe PDF::Reader::Turtletext do
|
|
90
90
|
{
|
91
91
|
:with_single_text => {
|
92
92
|
:source_page_content => {10.0=>{10.0=>"a first bit of text"}},
|
93
|
-
:xmin => 0, :xmax => 100, :ymin => 0, :ymax => 100,
|
93
|
+
:xmin => 0, :xmax => 100, :ymin => 0, :ymax => 100, :inclusive => false,
|
94
94
|
:expected_text => [["a first bit of text"]]
|
95
95
|
},
|
96
96
|
:with_single_line_text => {
|
@@ -99,7 +99,7 @@ describe PDF::Reader::Turtletext do
|
|
99
99
|
30.0=>{10.0=>"first part found", 20.0=>"last part found"},
|
100
100
|
10.0=>{10.0=>"last line ignored"}
|
101
101
|
},
|
102
|
-
:xmin => 0, :xmax => 100, :ymin => 20, :ymax => 50,
|
102
|
+
:xmin => 0, :xmax => 100, :ymin => 20, :ymax => 50, :inclusive => false,
|
103
103
|
:expected_text => [["first part found", "last part found"]]
|
104
104
|
},
|
105
105
|
:with_multi_line_text => {
|
@@ -109,11 +109,20 @@ describe PDF::Reader::Turtletext do
|
|
109
109
|
30.0=>{10.0=>"last line first part found", 20.0=>"last line last part found"},
|
110
110
|
10.0=>{10.0=>"last line ignored"}
|
111
111
|
},
|
112
|
-
:xmin => 0, :xmax => 100, :ymin => 20, :ymax => 50,
|
112
|
+
:xmin => 0, :xmax => 100, :ymin => 20, :ymax => 50, :inclusive => false,
|
113
113
|
:expected_text => [
|
114
114
|
["first line first part found", "first line last part found"],
|
115
115
|
["last line first part found", "last line last part found"]
|
116
116
|
]
|
117
|
+
},
|
118
|
+
:with_inclusive_text => {
|
119
|
+
:source_page_content => {
|
120
|
+
70.0=>{10.0=>"first line ignored"},
|
121
|
+
30.0=>{10.0=>"first part found", 20.0=>"last part found"},
|
122
|
+
10.0=>{10.0=>"last line ignored"}
|
123
|
+
},
|
124
|
+
:xmin => 10, :xmax => 100, :ymin => 30, :ymax => 30, :inclusive => true,
|
125
|
+
:expected_text => [["first part found", "last part found"]]
|
117
126
|
}
|
118
127
|
}.each do |test_name,test_expectations|
|
119
128
|
context test_name do
|
@@ -122,8 +131,9 @@ describe PDF::Reader::Turtletext do
|
|
122
131
|
let(:xmax) { test_expectations[:xmax] }
|
123
132
|
let(:ymin) { test_expectations[:ymin] }
|
124
133
|
let(:ymax) { test_expectations[:ymax] }
|
134
|
+
let(:inclusive) { test_expectations[:inclusive] }
|
125
135
|
let(:expected_text) { test_expectations[:expected_text] }
|
126
|
-
subject { turtletext_reader.text_in_region(xmin,xmax,ymin,ymax,page) }
|
136
|
+
subject { turtletext_reader.text_in_region(xmin,xmax,ymin,ymax,page,inclusive) }
|
127
137
|
it { should eql(expected_text) }
|
128
138
|
end
|
129
139
|
end
|
@@ -137,6 +147,7 @@ describe PDF::Reader::Turtletext do
|
|
137
147
|
10.0=>{40.0=>"smoked and streaky da bomb"}
|
138
148
|
} }
|
139
149
|
{
|
150
|
+
:with_no_match => { :match_term => 'bertie beetle', :expected_position => nil },
|
140
151
|
:with_simple_match => { :match_term => 'turkey bacon', :expected_position => {:x=>30.0, :y=>30.0} },
|
141
152
|
:with_match_along_line => { :match_term => 'heaven', :expected_position => {:x=>25.0, :y=>40.0} },
|
142
153
|
:with_regex_match => { :match_term => /kimchi/, :expected_position => {:x=>15.0, :y=>40.0} },
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf-reader-turtletext
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-08-01 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: pdf-reader
|
16
|
-
requirement: &
|
16
|
+
requirement: &70159058920880 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - =
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 1.1.1
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70159058920880
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: bundler
|
27
|
-
requirement: &
|
27
|
+
requirement: &70159058920240 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 1.1.4
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70159058920240
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: jeweler
|
38
|
-
requirement: &
|
38
|
+
requirement: &70159058919460 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: 1.6.4
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70159058919460
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: rake
|
49
|
-
requirement: &
|
49
|
+
requirement: &70159058918900 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ~>
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: 0.9.2.2
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70159058918900
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: rspec
|
60
|
-
requirement: &
|
60
|
+
requirement: &70159058918080 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ~>
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: 2.8.0
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70159058918080
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: rdoc
|
71
|
-
requirement: &
|
71
|
+
requirement: &70159058917260 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ~>
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: '3.11'
|
77
77
|
type: :development
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70159058917260
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: prawn
|
82
|
-
requirement: &
|
82
|
+
requirement: &70159058916580 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ~>
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: 0.12.0
|
88
88
|
type: :development
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *70159058916580
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: guard-rspec
|
93
|
-
requirement: &
|
93
|
+
requirement: &70159058915980 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ~>
|
@@ -98,7 +98,7 @@ dependencies:
|
|
98
98
|
version: 1.2.0
|
99
99
|
type: :development
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *70159058915980
|
102
102
|
description: a library that can read structured and positional text from PDFs. Ideal
|
103
103
|
for asembling structured data from invoices and the like.
|
104
104
|
email: gallagher.paul@gmail.com
|