pdf-reader-turtletext 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +6 -1
- data/README.rdoc +40 -6
- data/lib/pdf/reader/turtletext.rb +10 -8
- data/lib/pdf/reader/turtletext/textangle.rb +48 -14
- data/lib/pdf/reader/turtletext/version.rb +1 -1
- data/pdf-reader-turtletext.gemspec +2 -2
- data/spec/fixtures/pdf_samples/expectations.yml +6 -3
- data/spec/unit/reader/turtletext/textangle_spec.rb +395 -124
- data/spec/unit/reader/turtletext/turtletext_spec.rb +15 -4
- metadata +18 -18
data/CHANGELOG
CHANGED
@@ -1,4 +1,9 @@
|
|
1
|
-
Version 0.2.
|
1
|
+
Version 0.2.2 Release: 1st Aug 2012
|
2
|
+
==================================================
|
3
|
+
* provide better control of inclusive/exclusive behaviour
|
4
|
+
for region selection \#4
|
5
|
+
|
6
|
+
Version 0.2.1 Release: 31st July 2012
|
2
7
|
==================================================
|
3
8
|
* fix row sorting for Rubinius 1.8 mode
|
4
9
|
|
data/README.rdoc
CHANGED
@@ -36,7 +36,9 @@ Then bundle install:
|
|
36
36
|
|
37
37
|
=== How do I install it for gem development?
|
38
38
|
|
39
|
-
If you want to work on enhancements of fix bugs in PDF::Reader::Turtletext, fork and clone the github repository.
|
39
|
+
If you want to work on enhancements or fix bugs in PDF::Reader::Turtletext, fork and clone the GitHub repository. If you are using bundler (recommended), run <tt>bundle</tt> to install development dependencies.
|
40
|
+
|
41
|
+
See the section below on 'Contributing to PDF::Reader::Turtletext' for more information.
|
40
42
|
|
41
43
|
=== How to instantiate Turtletext in code
|
42
44
|
|
@@ -70,6 +72,8 @@ Solution: use the <tt>bounding_box</tt> method to describe the region and extrac
|
|
70
72
|
|
71
73
|
The range of methods that can be used within the <tt>bounding_box</tt> block are all optional, and include:
|
72
74
|
* <tt>page</tt> - specifies the PDF page from which to extract text (default is 1).
|
75
|
+
* <tt>inclusive</tt> - whether region selection should be inclusive or exclusive of the specified positions
|
76
|
+
(default is false).
|
73
77
|
* <tt>below</tt> - a string, regex or number that describes the upper limit of the text box
|
74
78
|
(default is top border of the page).
|
75
79
|
* <tt>above</tt> - a string, regex or number that describes the lower limit of the text box
|
@@ -98,17 +102,47 @@ An explicit block parameter may be used with the <tt>bounding_box</tt> method:
|
|
98
102
|
textangle.text
|
99
103
|
=> [['string','string'],['string']] # array of rows, each row is an array of text elements in the row
|
100
104
|
|
105
|
+
=== How to describe an inclusive <tt>bounding_box</tt> region
|
106
|
+
|
107
|
+
By default, the <tt>bounding_box</tt> method makes exclusive selection (i.e. not including the
|
108
|
+
region limits).
|
109
|
+
|
110
|
+
To specify an inclusive region, use the <tt>inclusive!</tt> command:
|
111
|
+
|
112
|
+
textangle = reader.bounding_box do
|
113
|
+
inclusive!
|
114
|
+
below /electricity/i
|
115
|
+
left_of "Total ($)"
|
116
|
+
end
|
117
|
+
|
118
|
+
Alternatively, set <tt>inclusive</tt> to true:
|
119
|
+
|
120
|
+
textangle = reader.bounding_box do
|
121
|
+
inclusive true
|
122
|
+
below /electricity/i
|
123
|
+
left_of "Total ($)"
|
124
|
+
end
|
125
|
+
|
126
|
+
Or with a block parameter, you may also assign <tt>inclusive</tt> to true:
|
127
|
+
|
128
|
+
textangle = reader.bounding_box do |r|
|
129
|
+
r.inclusive = true
|
130
|
+
r.below /electricity/i
|
131
|
+
r.left_of "Total ($)"
|
132
|
+
end
|
133
|
+
|
101
134
|
=== Extract text for a region with known positional co-ordinates
|
102
135
|
|
103
136
|
If you know (or can calculate) the x,y positions of the required text region, you can extract the region's
|
104
137
|
text using the <tt>text_in_region</tt> method.
|
105
138
|
|
106
139
|
text = reader.text_in_region(
|
107
|
-
10,
|
108
|
-
900,
|
109
|
-
200,
|
110
|
-
400,
|
111
|
-
1 # page
|
140
|
+
10, # minimum x (left-most)
|
141
|
+
900, # maximum x (right-most)
|
142
|
+
200, # minimum y (bottom-most)
|
143
|
+
400, # maximum y (top-most)
|
144
|
+
1, # page (default 1)
|
145
|
+
false # inclusive of x/y position if true (default false)
|
112
146
|
)
|
113
147
|
=> [['string','string'],['string']] # array of rows, each row is an array of text elements in the row
|
114
148
|
|
@@ -76,21 +76,22 @@ class PDF::Reader::Turtletext
|
|
76
76
|
end
|
77
77
|
end
|
78
78
|
|
79
|
-
# Returns an array of text elements found within the x,y limits
|
80
|
-
# x ranges from +xmin+ (left of page) to +xmax+ (right of page)
|
81
|
-
# y ranges from +ymin+ (bottom of page) to +ymax+ (top of page)
|
82
|
-
#
|
79
|
+
# Returns an array of text elements found within the x,y limits on +page+:
|
80
|
+
# * x ranges from +xmin+ (left of page) to +xmax+ (right of page)
|
81
|
+
# * y ranges from +ymin+ (bottom of page) to +ymax+ (top of page)
|
82
|
+
# When +inclusive+ is false (default) the x/y limits do not include the actual x/y value.
|
83
83
|
# Each line of text is an array of the separate text elements found on that line.
|
84
84
|
# [["first line first text", "first line last text"],["second line text"]]
|
85
|
-
def text_in_region(xmin,xmax,ymin,ymax,page=1)
|
85
|
+
def text_in_region(xmin,xmax,ymin,ymax,page=1,inclusive=false)
|
86
|
+
return [] unless xmin && xmax && ymin && ymax
|
86
87
|
text_map = content(page)
|
87
88
|
box = []
|
88
89
|
|
89
90
|
text_map.each do |y,text_row|
|
90
|
-
if y >= ymin && y<= ymax
|
91
|
+
if inclusive ? (y >= ymin && y <= ymax) : (y > ymin && y < ymax)
|
91
92
|
row = []
|
92
93
|
text_row.each do |x,element|
|
93
|
-
if x >= xmin && x<= xmax
|
94
|
+
if inclusive ? (x >= xmin && x <= xmax) : (x > xmin && x < xmax)
|
94
95
|
row << element
|
95
96
|
end
|
96
97
|
end
|
@@ -102,7 +103,8 @@ class PDF::Reader::Turtletext
|
|
102
103
|
|
103
104
|
# Returns the position of +text+ on +page+
|
104
105
|
# {x: val, y: val }
|
105
|
-
# +text+ may be a string (exact match required) or a Regexp
|
106
|
+
# +text+ may be a string (exact match required) or a Regexp.
|
107
|
+
# Returns nil if the text cannot be found.
|
106
108
|
def text_position(text,page=1)
|
107
109
|
item = if text.class <= Regexp
|
108
110
|
content(page).map do |k,v|
|
@@ -10,14 +10,15 @@
|
|
10
10
|
# textangle.text
|
11
11
|
#
|
12
12
|
class PDF::Reader::Turtletext::Textangle
|
13
|
+
|
14
|
+
#
|
13
15
|
attr_reader :reader
|
14
|
-
attr_accessor :page
|
15
|
-
attr_writer :above,:below,:left_of,:right_of
|
16
16
|
|
17
17
|
# +turtletext_reader+ is a PDF::Reader::Turtletext
|
18
18
|
def initialize(turtletext_reader,&block)
|
19
19
|
@reader = turtletext_reader
|
20
20
|
@page = 1
|
21
|
+
@inclusive = false
|
21
22
|
if block_given?
|
22
23
|
if block.arity == 1
|
23
24
|
yield self
|
@@ -27,6 +28,34 @@ class PDF::Reader::Turtletext::Textangle
|
|
27
28
|
end
|
28
29
|
end
|
29
30
|
|
31
|
+
attr_writer :inclusive
|
32
|
+
|
33
|
+
def inclusive(*args)
|
34
|
+
if value = args.first
|
35
|
+
@inclusive = value
|
36
|
+
end
|
37
|
+
@inclusive
|
38
|
+
end
|
39
|
+
|
40
|
+
# Command: sets +inclusive+ to true
|
41
|
+
def inclusive!
|
42
|
+
@inclusive = true
|
43
|
+
end
|
44
|
+
|
45
|
+
# Command: sets +inclusive+ to false
|
46
|
+
def exclusive!
|
47
|
+
@inclusive = false
|
48
|
+
end
|
49
|
+
|
50
|
+
attr_writer :page
|
51
|
+
def page(*args)
|
52
|
+
if value = args.first
|
53
|
+
@page = value
|
54
|
+
end
|
55
|
+
@page
|
56
|
+
end
|
57
|
+
|
58
|
+
attr_writer :above
|
30
59
|
def above(*args)
|
31
60
|
if value = args.first
|
32
61
|
@above = value
|
@@ -34,6 +63,7 @@ class PDF::Reader::Turtletext::Textangle
|
|
34
63
|
@above
|
35
64
|
end
|
36
65
|
|
66
|
+
attr_writer :below
|
37
67
|
def below(*args)
|
38
68
|
if value = args.first
|
39
69
|
@below = value
|
@@ -41,6 +71,7 @@ class PDF::Reader::Turtletext::Textangle
|
|
41
71
|
@below
|
42
72
|
end
|
43
73
|
|
74
|
+
attr_writer :left_of
|
44
75
|
def left_of(*args)
|
45
76
|
if value = args.first
|
46
77
|
@left_of = value
|
@@ -48,6 +79,7 @@ class PDF::Reader::Turtletext::Textangle
|
|
48
79
|
@left_of
|
49
80
|
end
|
50
81
|
|
82
|
+
attr_writer :right_of
|
51
83
|
def right_of(*args)
|
52
84
|
if value = args.first
|
53
85
|
@right_of = value
|
@@ -55,15 +87,17 @@ class PDF::Reader::Turtletext::Textangle
|
|
55
87
|
@right_of
|
56
88
|
end
|
57
89
|
|
58
|
-
# Returns the text
|
90
|
+
# Returns the text array found within the defined region.
|
91
|
+
# Each line of text is an array of the separate text elements found on that line.
|
92
|
+
# [["first line first text", "first line last text"],["second line text"]]
|
59
93
|
def text
|
60
94
|
return unless reader
|
61
95
|
|
62
96
|
xmin = if right_of
|
63
97
|
if [Fixnum,Float].include?(right_of.class)
|
64
98
|
right_of
|
65
|
-
|
66
|
-
|
99
|
+
elsif xy = reader.text_position(right_of,page)
|
100
|
+
xy[:x]
|
67
101
|
end
|
68
102
|
else
|
69
103
|
0
|
@@ -71,18 +105,18 @@ class PDF::Reader::Turtletext::Textangle
|
|
71
105
|
xmax = if left_of
|
72
106
|
if [Fixnum,Float].include?(left_of.class)
|
73
107
|
left_of
|
74
|
-
|
75
|
-
|
108
|
+
elsif xy = reader.text_position(left_of,page)
|
109
|
+
xy[:x]
|
76
110
|
end
|
77
111
|
else
|
78
|
-
99999 # TODO actual limit
|
112
|
+
99999 # TODO: figure out the actual limit?
|
79
113
|
end
|
80
114
|
|
81
115
|
ymin = if above
|
82
116
|
if [Fixnum,Float].include?(above.class)
|
83
117
|
above
|
84
|
-
|
85
|
-
|
118
|
+
elsif xy = reader.text_position(above,page)
|
119
|
+
xy[:y]
|
86
120
|
end
|
87
121
|
else
|
88
122
|
0
|
@@ -90,14 +124,14 @@ class PDF::Reader::Turtletext::Textangle
|
|
90
124
|
ymax = if below
|
91
125
|
if [Fixnum,Float].include?(below.class)
|
92
126
|
below
|
93
|
-
|
94
|
-
|
127
|
+
elsif xy = reader.text_position(below,page)
|
128
|
+
xy[:y]
|
95
129
|
end
|
96
130
|
else
|
97
|
-
99999 # TODO actual limit
|
131
|
+
99999 # TODO: figure out the actual limit?
|
98
132
|
end
|
99
133
|
|
100
|
-
reader.text_in_region(xmin,xmax,ymin,ymax,page)
|
134
|
+
reader.text_in_region(xmin,xmax,ymin,ymax,page,inclusive)
|
101
135
|
end
|
102
136
|
|
103
137
|
end
|
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "pdf-reader-turtletext"
|
8
|
-
s.version = "0.2.
|
8
|
+
s.version = "0.2.2"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Paul Gallagher"]
|
12
|
-
s.date = "2012-
|
12
|
+
s.date = "2012-08-01"
|
13
13
|
s.description = "a library that can read structured and positional text from PDFs. Ideal for asembling structured data from invoices and the like."
|
14
14
|
s.email = "gallagher.paul@gmail.com"
|
15
15
|
s.extra_rdoc_files = [
|
@@ -3,19 +3,22 @@
|
|
3
3
|
# This is a YAML-format file, so beware that indentation is significant
|
4
4
|
---
|
5
5
|
hello_world.pdf:
|
6
|
-
:
|
6
|
+
:test_numeric_above:
|
7
7
|
:above: 100
|
8
8
|
:expected_text:
|
9
9
|
-
|
10
10
|
- "Hello World"
|
11
|
-
:
|
11
|
+
:test_numeric_below:
|
12
12
|
:below: 900
|
13
13
|
:expected_text:
|
14
14
|
-
|
15
15
|
- "Hello World"
|
16
|
-
:
|
16
|
+
:test_numeric_below_na:
|
17
17
|
:below: 10
|
18
18
|
:expected_text: []
|
19
|
+
:test_below_na:
|
20
|
+
:below: "Bertie"
|
21
|
+
:expected_text: []
|
19
22
|
simple_table_text.pdf:
|
20
23
|
:test_above:
|
21
24
|
:above: Table Header
|
@@ -14,7 +14,7 @@ describe PDF::Reader::Turtletext::Textangle do
|
|
14
14
|
it { should be_a(PDF::Reader::Turtletext) }
|
15
15
|
end
|
16
16
|
|
17
|
-
|
17
|
+
context "with mock content" do
|
18
18
|
let(:page) { 1 }
|
19
19
|
before do
|
20
20
|
turtletext_reader.stub(:load_content).and_return(given_page_content)
|
@@ -28,10 +28,10 @@ describe PDF::Reader::Turtletext::Textangle do
|
|
28
28
|
|
29
29
|
context "with block param" do
|
30
30
|
[:above,:below,:left_of,:right_of].each do |positional_method|
|
31
|
-
|
31
|
+
describe "##{positional_method}" do
|
32
32
|
let(:term) { "canary" }
|
33
33
|
|
34
|
-
it "should
|
34
|
+
it "should assign correctly" do
|
35
35
|
textangle = resource_class.new(turtletext_reader) do |r|
|
36
36
|
r.send("#{positional_method}=",term)
|
37
37
|
end
|
@@ -40,159 +40,430 @@ describe PDF::Reader::Turtletext::Textangle do
|
|
40
40
|
|
41
41
|
end
|
42
42
|
end
|
43
|
+
|
44
|
+
describe "#page" do
|
45
|
+
let(:value) { 2 }
|
46
|
+
describe "default" do
|
47
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
48
|
+
end }
|
49
|
+
subject { textangle.page }
|
50
|
+
it { should eql(1) }
|
51
|
+
end
|
52
|
+
it "should assign correctly" do
|
53
|
+
textangle = resource_class.new(turtletext_reader) do |r|
|
54
|
+
r.page = value
|
55
|
+
end
|
56
|
+
textangle.page.should eql(value)
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
describe "#inclusive" do
|
61
|
+
describe "default" do
|
62
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
63
|
+
end }
|
64
|
+
subject { textangle.inclusive }
|
65
|
+
it { should be_false }
|
66
|
+
end
|
67
|
+
it "should assign true correctly" do
|
68
|
+
textangle = resource_class.new(turtletext_reader) do |r|
|
69
|
+
r.inclusive = true
|
70
|
+
end
|
71
|
+
textangle.inclusive.should be_true
|
72
|
+
end
|
73
|
+
it "should assign false correctly" do
|
74
|
+
textangle = resource_class.new(turtletext_reader) do |r|
|
75
|
+
r.inclusive = false
|
76
|
+
end
|
77
|
+
textangle.inclusive.should be_false
|
78
|
+
end
|
79
|
+
end
|
80
|
+
describe "#inclusive!" do
|
81
|
+
it "should assign correctly" do
|
82
|
+
textangle = resource_class.new(turtletext_reader) do |r|
|
83
|
+
r.inclusive!
|
84
|
+
end
|
85
|
+
textangle.inclusive.should be_true
|
86
|
+
end
|
87
|
+
end
|
88
|
+
describe "#exclusive!" do
|
89
|
+
it "should assign correctly" do
|
90
|
+
textangle = resource_class.new(turtletext_reader) do |r|
|
91
|
+
r.exclusive!
|
92
|
+
end
|
93
|
+
textangle.inclusive.should be_false
|
94
|
+
end
|
95
|
+
end
|
43
96
|
end
|
44
97
|
|
45
98
|
context "without block param" do
|
46
|
-
it "#above should
|
99
|
+
it "#above should assign correctly" do
|
47
100
|
textangle = resource_class.new(turtletext_reader) do
|
48
101
|
above "canary"
|
49
102
|
end
|
50
103
|
textangle.above.should eql("canary")
|
51
104
|
end
|
52
|
-
it "#below should
|
105
|
+
it "#below should assign correctly" do
|
53
106
|
textangle = resource_class.new(turtletext_reader) do
|
54
107
|
below "canary"
|
55
108
|
end
|
56
109
|
textangle.below.should eql("canary")
|
57
110
|
end
|
58
|
-
it "#left_of should
|
111
|
+
it "#left_of should assign correctly" do
|
59
112
|
textangle = resource_class.new(turtletext_reader) do
|
60
113
|
left_of "canary"
|
61
114
|
end
|
62
115
|
textangle.left_of.should eql("canary")
|
63
116
|
end
|
64
|
-
it "#below should
|
117
|
+
it "#below should assign correctly" do
|
65
118
|
textangle = resource_class.new(turtletext_reader) do
|
66
119
|
right_of "canary"
|
67
120
|
end
|
68
121
|
textangle.right_of.should eql("canary")
|
69
122
|
end
|
70
|
-
end
|
71
123
|
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
it { should eql(expected) }
|
80
|
-
end
|
81
|
-
context "as a regex" do
|
82
|
-
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
83
|
-
r.below = /Fraud/i
|
84
|
-
end }
|
85
|
-
let(:expected) { [["smoked and streaky for me"]]}
|
86
|
-
subject { textangle.text }
|
87
|
-
it { should eql(expected) }
|
88
|
-
end
|
89
|
-
context "as a number" do
|
90
|
-
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
91
|
-
r.below = 20
|
92
|
-
end }
|
93
|
-
let(:expected) { [["smoked and streaky for me"]]}
|
94
|
-
subject { textangle.text }
|
95
|
-
it { should eql(expected) }
|
124
|
+
describe "#page" do
|
125
|
+
it "should assign correctly" do
|
126
|
+
textangle = resource_class.new(turtletext_reader) do
|
127
|
+
page 2
|
128
|
+
end
|
129
|
+
textangle.page.should eql(2)
|
130
|
+
end
|
96
131
|
end
|
97
|
-
end
|
98
132
|
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
it
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
end
|
112
|
-
|
113
|
-
|
114
|
-
it
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
end
|
120
|
-
|
121
|
-
|
122
|
-
it
|
133
|
+
describe "#inclusive" do
|
134
|
+
it "should assign true correctly" do
|
135
|
+
textangle = resource_class.new(turtletext_reader) do
|
136
|
+
inclusive true
|
137
|
+
end
|
138
|
+
textangle.inclusive.should be_true
|
139
|
+
end
|
140
|
+
it "should assign false correctly" do
|
141
|
+
textangle = resource_class.new(turtletext_reader) do
|
142
|
+
inclusive false
|
143
|
+
end
|
144
|
+
textangle.inclusive.should be_false
|
145
|
+
end
|
146
|
+
end
|
147
|
+
describe "#inclusive!" do
|
148
|
+
it "should assign correctly" do
|
149
|
+
textangle = resource_class.new(turtletext_reader) do
|
150
|
+
inclusive!
|
151
|
+
end
|
152
|
+
textangle.inclusive.should be_true
|
153
|
+
end
|
154
|
+
end
|
155
|
+
describe "#exclusive!" do
|
156
|
+
it "should assign correctly" do
|
157
|
+
textangle = resource_class.new(turtletext_reader) do
|
158
|
+
exclusive!
|
159
|
+
end
|
160
|
+
textangle.inclusive.should be_false
|
161
|
+
end
|
123
162
|
end
|
124
163
|
end
|
125
164
|
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
165
|
+
describe "#text" do
|
166
|
+
|
167
|
+
context "when only below specified" do
|
168
|
+
context "when exclusive (default)" do
|
169
|
+
context "as a string" do
|
170
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
171
|
+
r.below = "turkey bacon"
|
172
|
+
end }
|
173
|
+
let(:expected) { [["smoked and streaky for me"]]}
|
174
|
+
subject { textangle.text }
|
175
|
+
it { should eql(expected) }
|
176
|
+
end
|
177
|
+
context "as a regex" do
|
178
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
179
|
+
r.below = /turkey/i
|
180
|
+
end }
|
181
|
+
let(:expected) { [["smoked and streaky for me"]]}
|
182
|
+
subject { textangle.text }
|
183
|
+
it { should eql(expected) }
|
184
|
+
end
|
185
|
+
context "as a number" do
|
186
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
187
|
+
r.below = 30
|
188
|
+
end }
|
189
|
+
let(:expected) { [["smoked and streaky for me"]]}
|
190
|
+
subject { textangle.text }
|
191
|
+
it { should eql(expected) }
|
192
|
+
end
|
193
|
+
end
|
194
|
+
context "when inclusive" do
|
195
|
+
context "as a string" do
|
196
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
197
|
+
r.inclusive = true
|
198
|
+
r.below = "smoked and streaky for me"
|
199
|
+
end }
|
200
|
+
let(:expected) { [["smoked and streaky for me"]]}
|
201
|
+
subject { textangle.text }
|
202
|
+
it { should eql(expected) }
|
203
|
+
end
|
204
|
+
context "as a regex" do
|
205
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
206
|
+
r.inclusive = true
|
207
|
+
r.below = /Streaky/i
|
208
|
+
end }
|
209
|
+
let(:expected) { [["smoked and streaky for me"]]}
|
210
|
+
subject { textangle.text }
|
211
|
+
it { should eql(expected) }
|
212
|
+
end
|
213
|
+
context "as a number" do
|
214
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
215
|
+
r.inclusive = true
|
216
|
+
r.below = 10
|
217
|
+
end }
|
218
|
+
let(:expected) { [["smoked and streaky for me"]]}
|
219
|
+
subject { textangle.text }
|
220
|
+
it { should eql(expected) }
|
221
|
+
end
|
222
|
+
end
|
223
|
+
context "when no match" do
|
224
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
225
|
+
r.below = "fake"
|
226
|
+
end }
|
227
|
+
let(:expected) { [] }
|
228
|
+
subject { textangle.text }
|
229
|
+
it { should eql(expected) }
|
230
|
+
end
|
159
231
|
end
|
160
|
-
end
|
161
232
|
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
end
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
233
|
+
context "when only above specified" do
|
234
|
+
context "when exclusive (default)" do
|
235
|
+
context "as a string" do
|
236
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
237
|
+
r.above = "bacon on kimchi noodles"
|
238
|
+
end }
|
239
|
+
let(:expected) { [["crunchy bacon"]] }
|
240
|
+
subject { textangle.text }
|
241
|
+
it { should eql(expected) }
|
242
|
+
end
|
243
|
+
context "as a regex" do
|
244
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
245
|
+
r.above = /kimchi/i
|
246
|
+
end }
|
247
|
+
let(:expected) { [["crunchy bacon"]] }
|
248
|
+
subject { textangle.text }
|
249
|
+
it { should eql(expected) }
|
250
|
+
end
|
251
|
+
context "as a number" do
|
252
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
253
|
+
r.above = 40
|
254
|
+
end }
|
255
|
+
let(:expected) { [["crunchy bacon"]] }
|
256
|
+
subject { textangle.text }
|
257
|
+
it { should eql(expected) }
|
258
|
+
end
|
259
|
+
end
|
260
|
+
context "when inclusive" do
|
261
|
+
context "as a string" do
|
262
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
263
|
+
r.inclusive = true
|
264
|
+
r.above = "crunchy bacon"
|
265
|
+
end }
|
266
|
+
let(:expected) { [["crunchy bacon"]] }
|
267
|
+
subject { textangle.text }
|
268
|
+
it { should eql(expected) }
|
269
|
+
end
|
270
|
+
context "as a regex" do
|
271
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
272
|
+
r.inclusive = true
|
273
|
+
r.above = /crunChy/i
|
274
|
+
end }
|
275
|
+
let(:expected) { [["crunchy bacon"]] }
|
276
|
+
subject { textangle.text }
|
277
|
+
it { should eql(expected) }
|
278
|
+
end
|
279
|
+
context "as a number" do
|
280
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
281
|
+
r.inclusive = true
|
282
|
+
r.above = 70
|
283
|
+
end }
|
284
|
+
let(:expected) { [["crunchy bacon"]] }
|
285
|
+
subject { textangle.text }
|
286
|
+
it { should eql(expected) }
|
287
|
+
end
|
288
|
+
end
|
289
|
+
context "when no match" do
|
290
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
291
|
+
r.above = "fake"
|
292
|
+
end }
|
293
|
+
let(:expected) { [] }
|
294
|
+
subject { textangle.text }
|
295
|
+
it { should eql(expected) }
|
296
|
+
end
|
195
297
|
end
|
298
|
+
|
299
|
+
context "when only left_of specified" do
|
300
|
+
context "when exclusive (default)" do
|
301
|
+
context "as a string" do
|
302
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
303
|
+
r.left_of = "turkey bacon"
|
304
|
+
end }
|
305
|
+
let(:expected) { [
|
306
|
+
["crunchy bacon"],
|
307
|
+
["bacon on kimchi noodles", "heaven"]
|
308
|
+
] }
|
309
|
+
subject { textangle.text }
|
310
|
+
it { should eql(expected) }
|
311
|
+
end
|
312
|
+
context "as a regex" do
|
313
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
314
|
+
r.left_of = /turKey/i
|
315
|
+
end }
|
316
|
+
let(:expected) { [
|
317
|
+
["crunchy bacon"],
|
318
|
+
["bacon on kimchi noodles", "heaven"]
|
319
|
+
] }
|
320
|
+
subject { textangle.text }
|
321
|
+
it { should eql(expected) }
|
322
|
+
end
|
323
|
+
context "as a number" do
|
324
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
325
|
+
r.left_of = 30
|
326
|
+
end }
|
327
|
+
let(:expected) { [
|
328
|
+
["crunchy bacon"],
|
329
|
+
["bacon on kimchi noodles", "heaven"]
|
330
|
+
] }
|
331
|
+
subject { textangle.text }
|
332
|
+
it { should eql(expected) }
|
333
|
+
end
|
334
|
+
end
|
335
|
+
context "when inclusive" do
|
336
|
+
context "as a string" do
|
337
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
338
|
+
r.inclusive = true
|
339
|
+
r.left_of = "heaven"
|
340
|
+
end }
|
341
|
+
let(:expected) { [
|
342
|
+
["crunchy bacon"],
|
343
|
+
["bacon on kimchi noodles", "heaven"]
|
344
|
+
] }
|
345
|
+
subject { textangle.text }
|
346
|
+
it { should eql(expected) }
|
347
|
+
end
|
348
|
+
context "as a regex" do
|
349
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
350
|
+
r.inclusive = true
|
351
|
+
r.left_of = /heaVen/i
|
352
|
+
end }
|
353
|
+
let(:expected) { [
|
354
|
+
["crunchy bacon"],
|
355
|
+
["bacon on kimchi noodles", "heaven"]
|
356
|
+
] }
|
357
|
+
subject { textangle.text }
|
358
|
+
it { should eql(expected) }
|
359
|
+
end
|
360
|
+
context "as a number" do
|
361
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
362
|
+
r.inclusive = true
|
363
|
+
r.left_of = 25
|
364
|
+
end }
|
365
|
+
let(:expected) { [
|
366
|
+
["crunchy bacon"],
|
367
|
+
["bacon on kimchi noodles", "heaven"]
|
368
|
+
] }
|
369
|
+
subject { textangle.text }
|
370
|
+
it { should eql(expected) }
|
371
|
+
end
|
372
|
+
end
|
373
|
+
context "when no match" do
|
374
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
375
|
+
r.left_of = "fake"
|
376
|
+
end }
|
377
|
+
let(:expected) { [] }
|
378
|
+
subject { textangle.text }
|
379
|
+
it { should eql(expected) }
|
380
|
+
end
|
381
|
+
end
|
382
|
+
|
383
|
+
context "when only right_of specified" do
|
384
|
+
context "when exclusive (default)" do
|
385
|
+
context "as a string" do
|
386
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
387
|
+
r.right_of = "heaven"
|
388
|
+
end }
|
389
|
+
let(:expected) { [
|
390
|
+
["turkey bacon","fraud"],
|
391
|
+
["smoked and streaky for me"]
|
392
|
+
] }
|
393
|
+
subject { textangle.text }
|
394
|
+
it { should eql(expected) }
|
395
|
+
end
|
396
|
+
context "as a regex" do
|
397
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
398
|
+
r.right_of = /Heaven/i
|
399
|
+
end }
|
400
|
+
let(:expected) { [
|
401
|
+
["turkey bacon","fraud"],
|
402
|
+
["smoked and streaky for me"]
|
403
|
+
] }
|
404
|
+
subject { textangle.text }
|
405
|
+
it { should eql(expected) }
|
406
|
+
end
|
407
|
+
context "as a number" do
|
408
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
409
|
+
r.right_of = 25
|
410
|
+
end }
|
411
|
+
let(:expected) { [
|
412
|
+
["turkey bacon","fraud"],
|
413
|
+
["smoked and streaky for me"]
|
414
|
+
] }
|
415
|
+
subject { textangle.text }
|
416
|
+
it { should eql(expected) }
|
417
|
+
end
|
418
|
+
end
|
419
|
+
context "when inclusive" do
|
420
|
+
context "as a string" do
|
421
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
422
|
+
r.inclusive = true
|
423
|
+
r.right_of = "turkey bacon"
|
424
|
+
end }
|
425
|
+
let(:expected) { [
|
426
|
+
["turkey bacon","fraud"],
|
427
|
+
["smoked and streaky for me"]
|
428
|
+
] }
|
429
|
+
subject { textangle.text }
|
430
|
+
it { should eql(expected) }
|
431
|
+
end
|
432
|
+
context "as a regex" do
|
433
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
434
|
+
r.inclusive = true
|
435
|
+
r.right_of = /turkey/i
|
436
|
+
end }
|
437
|
+
let(:expected) { [
|
438
|
+
["turkey bacon","fraud"],
|
439
|
+
["smoked and streaky for me"]
|
440
|
+
] }
|
441
|
+
subject { textangle.text }
|
442
|
+
it { should eql(expected) }
|
443
|
+
end
|
444
|
+
context "as a number" do
|
445
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
446
|
+
r.inclusive = true
|
447
|
+
r.right_of = 30
|
448
|
+
end }
|
449
|
+
let(:expected) { [
|
450
|
+
["turkey bacon","fraud"],
|
451
|
+
["smoked and streaky for me"]
|
452
|
+
] }
|
453
|
+
subject { textangle.text }
|
454
|
+
it { should eql(expected) }
|
455
|
+
end
|
456
|
+
end
|
457
|
+
context "when no match" do
|
458
|
+
let(:textangle) { resource_class.new(turtletext_reader) do |r|
|
459
|
+
r.right_of = "fake"
|
460
|
+
end }
|
461
|
+
let(:expected) { [] }
|
462
|
+
subject { textangle.text }
|
463
|
+
it { should eql(expected) }
|
464
|
+
end
|
465
|
+
end
|
466
|
+
|
196
467
|
end
|
197
468
|
|
198
469
|
end
|
@@ -90,7 +90,7 @@ describe PDF::Reader::Turtletext do
|
|
90
90
|
{
|
91
91
|
:with_single_text => {
|
92
92
|
:source_page_content => {10.0=>{10.0=>"a first bit of text"}},
|
93
|
-
:xmin => 0, :xmax => 100, :ymin => 0, :ymax => 100,
|
93
|
+
:xmin => 0, :xmax => 100, :ymin => 0, :ymax => 100, :inclusive => false,
|
94
94
|
:expected_text => [["a first bit of text"]]
|
95
95
|
},
|
96
96
|
:with_single_line_text => {
|
@@ -99,7 +99,7 @@ describe PDF::Reader::Turtletext do
|
|
99
99
|
30.0=>{10.0=>"first part found", 20.0=>"last part found"},
|
100
100
|
10.0=>{10.0=>"last line ignored"}
|
101
101
|
},
|
102
|
-
:xmin => 0, :xmax => 100, :ymin => 20, :ymax => 50,
|
102
|
+
:xmin => 0, :xmax => 100, :ymin => 20, :ymax => 50, :inclusive => false,
|
103
103
|
:expected_text => [["first part found", "last part found"]]
|
104
104
|
},
|
105
105
|
:with_multi_line_text => {
|
@@ -109,11 +109,20 @@ describe PDF::Reader::Turtletext do
|
|
109
109
|
30.0=>{10.0=>"last line first part found", 20.0=>"last line last part found"},
|
110
110
|
10.0=>{10.0=>"last line ignored"}
|
111
111
|
},
|
112
|
-
:xmin => 0, :xmax => 100, :ymin => 20, :ymax => 50,
|
112
|
+
:xmin => 0, :xmax => 100, :ymin => 20, :ymax => 50, :inclusive => false,
|
113
113
|
:expected_text => [
|
114
114
|
["first line first part found", "first line last part found"],
|
115
115
|
["last line first part found", "last line last part found"]
|
116
116
|
]
|
117
|
+
},
|
118
|
+
:with_inclusive_text => {
|
119
|
+
:source_page_content => {
|
120
|
+
70.0=>{10.0=>"first line ignored"},
|
121
|
+
30.0=>{10.0=>"first part found", 20.0=>"last part found"},
|
122
|
+
10.0=>{10.0=>"last line ignored"}
|
123
|
+
},
|
124
|
+
:xmin => 10, :xmax => 100, :ymin => 30, :ymax => 30, :inclusive => true,
|
125
|
+
:expected_text => [["first part found", "last part found"]]
|
117
126
|
}
|
118
127
|
}.each do |test_name,test_expectations|
|
119
128
|
context test_name do
|
@@ -122,8 +131,9 @@ describe PDF::Reader::Turtletext do
|
|
122
131
|
let(:xmax) { test_expectations[:xmax] }
|
123
132
|
let(:ymin) { test_expectations[:ymin] }
|
124
133
|
let(:ymax) { test_expectations[:ymax] }
|
134
|
+
let(:inclusive) { test_expectations[:inclusive] }
|
125
135
|
let(:expected_text) { test_expectations[:expected_text] }
|
126
|
-
subject { turtletext_reader.text_in_region(xmin,xmax,ymin,ymax,page) }
|
136
|
+
subject { turtletext_reader.text_in_region(xmin,xmax,ymin,ymax,page,inclusive) }
|
127
137
|
it { should eql(expected_text) }
|
128
138
|
end
|
129
139
|
end
|
@@ -137,6 +147,7 @@ describe PDF::Reader::Turtletext do
|
|
137
147
|
10.0=>{40.0=>"smoked and streaky da bomb"}
|
138
148
|
} }
|
139
149
|
{
|
150
|
+
:with_no_match => { :match_term => 'bertie beetle', :expected_position => nil },
|
140
151
|
:with_simple_match => { :match_term => 'turkey bacon', :expected_position => {:x=>30.0, :y=>30.0} },
|
141
152
|
:with_match_along_line => { :match_term => 'heaven', :expected_position => {:x=>25.0, :y=>40.0} },
|
142
153
|
:with_regex_match => { :match_term => /kimchi/, :expected_position => {:x=>15.0, :y=>40.0} },
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf-reader-turtletext
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-08-01 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: pdf-reader
|
16
|
-
requirement: &
|
16
|
+
requirement: &70159058920880 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - =
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 1.1.1
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70159058920880
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: bundler
|
27
|
-
requirement: &
|
27
|
+
requirement: &70159058920240 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 1.1.4
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70159058920240
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: jeweler
|
38
|
-
requirement: &
|
38
|
+
requirement: &70159058919460 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: 1.6.4
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70159058919460
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: rake
|
49
|
-
requirement: &
|
49
|
+
requirement: &70159058918900 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ~>
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: 0.9.2.2
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70159058918900
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: rspec
|
60
|
-
requirement: &
|
60
|
+
requirement: &70159058918080 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ~>
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: 2.8.0
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70159058918080
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: rdoc
|
71
|
-
requirement: &
|
71
|
+
requirement: &70159058917260 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ~>
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: '3.11'
|
77
77
|
type: :development
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70159058917260
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: prawn
|
82
|
-
requirement: &
|
82
|
+
requirement: &70159058916580 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ~>
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: 0.12.0
|
88
88
|
type: :development
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *70159058916580
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: guard-rspec
|
93
|
-
requirement: &
|
93
|
+
requirement: &70159058915980 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ~>
|
@@ -98,7 +98,7 @@ dependencies:
|
|
98
98
|
version: 1.2.0
|
99
99
|
type: :development
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *70159058915980
|
102
102
|
description: a library that can read structured and positional text from PDFs. Ideal
|
103
103
|
for asembling structured data from invoices and the like.
|
104
104
|
email: gallagher.paul@gmail.com
|