amazon-textract-parser-ruby 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.DS_Store +0 -0
- data/.gitignore +8 -0
- data/.travis.yml +7 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +45 -0
- data/LICENSE.txt +21 -0
- data/README.md +46 -0
- data/Rakefile +10 -0
- data/amazon-textract-parser-ruby.gemspec +31 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/.DS_Store +0 -0
- data/lib/amazon-textract-parser-ruby.rb +590 -0
- data/lib/amazon-textract-parser-ruby/.DS_Store +0 -0
- data/lib/amazon-textract-parser-ruby/version.rb +3 -0
- metadata +129 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 415a439783ef9a00d5d1caf1ddbc111e5311a5bdce51a064daf5b831a6615c34
|
4
|
+
data.tar.gz: 358f5d7b9f74dad6a5f7110188769be65eda89ea9d346072ae047d03c083373f
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 70cd4616be9e8047fac583c6e4db430654ae414934a19c4df9c6e832792a01cdda55d300d4f602714b2bb87f52ebc1c0f1311717d3446a72486bfe9cf521a12f
|
7
|
+
data.tar.gz: 91bf70e8869832452cb896243691090fcb7042e4fb77a3ee121bf64265921ac320ba5ec399322e17fc32b166bd2656b6108cda75aad1417d1dbd6005b2de2205
|
data/.DS_Store
ADDED
Binary file
|
data/.gitignore
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
amazon-textract-parser-ruby (0.1.0)
|
5
|
+
|
6
|
+
GEM
|
7
|
+
remote: https://rubygems.org/
|
8
|
+
specs:
|
9
|
+
activesupport (6.0.3.2)
|
10
|
+
concurrent-ruby (~> 1.0, >= 1.0.2)
|
11
|
+
i18n (>= 0.7, < 2)
|
12
|
+
minitest (~> 5.1)
|
13
|
+
tzinfo (~> 1.1)
|
14
|
+
zeitwerk (~> 2.2, >= 2.2.2)
|
15
|
+
ansi (1.5.0)
|
16
|
+
builder (3.2.4)
|
17
|
+
concurrent-ruby (1.1.6)
|
18
|
+
i18n (1.8.3)
|
19
|
+
concurrent-ruby (~> 1.0)
|
20
|
+
minitest (5.14.1)
|
21
|
+
minitest-reporters (1.4.2)
|
22
|
+
ansi
|
23
|
+
builder
|
24
|
+
minitest (>= 5.0)
|
25
|
+
ruby-progressbar
|
26
|
+
rake (12.3.3)
|
27
|
+
ruby-progressbar (1.10.1)
|
28
|
+
thread_safe (0.3.6)
|
29
|
+
tzinfo (1.2.7)
|
30
|
+
thread_safe (~> 0.1)
|
31
|
+
zeitwerk (2.4.0)
|
32
|
+
|
33
|
+
PLATFORMS
|
34
|
+
ruby
|
35
|
+
|
36
|
+
DEPENDENCIES
|
37
|
+
activesupport (~> 6.0.3.2)
|
38
|
+
amazon-textract-parser-ruby!
|
39
|
+
bundler (~> 1.17)
|
40
|
+
minitest (~> 5.0)
|
41
|
+
minitest-reporters
|
42
|
+
rake (~> 12.3.3)
|
43
|
+
|
44
|
+
BUNDLED WITH
|
45
|
+
1.17.2
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2020 Niels Vanspauwen
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
# Amazon Textract Results Parser
|
2
|
+
|
3
|
+
This is a quick Ruby port of [https://github.com/mludvig/amazon-textract-parser](https://github.com/mludvig/amazon-textract-parser)
|
4
|
+
|
5
|
+
It's useful for interpreting the results returned by Amazon Textract.
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Add this line to your application's Gemfile:
|
10
|
+
|
11
|
+
```ruby
|
12
|
+
gem 'amazon-textract-parser-ruby'
|
13
|
+
```
|
14
|
+
|
15
|
+
And then execute:
|
16
|
+
|
17
|
+
$ bundle
|
18
|
+
|
19
|
+
Or install it yourself as:
|
20
|
+
|
21
|
+
$ gem install amazon-textract-parser-ruby
|
22
|
+
|
23
|
+
## Usage
|
24
|
+
|
25
|
+
```ruby
|
26
|
+
textract = Aws::Textract::Client.new
|
27
|
+
textract.start_document_analysis({...})
|
28
|
+
response = textract.get_document_analysis({...})
|
29
|
+
doc = AmazonTRP::Document.new(response.to_h)
|
30
|
+
```
|
31
|
+
|
32
|
+
## Development
|
33
|
+
|
34
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
35
|
+
|
36
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
37
|
+
|
38
|
+
For more info on creating and maintaining gems, check https://bundler.io/v2.0/guides/creating_gem.html
|
39
|
+
|
40
|
+
## Contributing
|
41
|
+
|
42
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/nielsvanspauwen/amazon-textract-parser-ruby.
|
43
|
+
|
44
|
+
## License
|
45
|
+
|
46
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
|
2
|
+
# Gem specification for amazon-textract-parser-ruby.
#
# Puts the gem's own lib/ directory on the load path so the version constant
# can be read without the gem being installed.
lib = File.expand_path("../lib", __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require "amazon-textract-parser-ruby/version"

Gem::Specification.new do |spec|
  spec.name          = "amazon-textract-parser-ruby"
  spec.version       = AmazonTRP::VERSION
  spec.authors       = ["Niels Vanspauwen"]
  spec.email         = ["niels.vanspauwen@gmail.com"]

  spec.summary       = "Amazon Textract Results Parser"
  # Double-quoted so that "\n" is a real line break; the previous %q{} literal
  # does not process escapes and published a literal backslash-n.
  spec.description   = "This is a quick Ruby port of https://github.com/mludvig/amazon-textract-parser\n" \
                       "It's useful for interpreting the result of Amazon Textract info."
  spec.homepage      = "https://github.com/nielsvanspauwen/amazon-textract-parser-ruby"
  spec.license       = "MIT"

  # Specify which files should be added to the gem when it is released.
  # `git ls-files -z` lists the files tracked by git; test directories and
  # macOS Finder metadata (.DS_Store) are left out of the package.
  spec.files         = Dir.chdir(File.expand_path('..', __FILE__)) do
    `git ls-files -z`.split("\x0").reject do |f|
      f.match(%r{^(test|spec|features)/}) || File.basename(f) == ".DS_Store"
    end
  end
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.17"
  spec.add_development_dependency "rake", "~> 12.3.3"
  spec.add_development_dependency "minitest", "~> 5.0"
  spec.add_development_dependency "minitest-reporters"
  spec.add_development_dependency "activesupport", "~> 6.0.3.2"
end
|
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Development console: loads the bundle and the gem, then opens an IRB
# session so the library can be explored interactively.
require "bundler/setup"
require "amazon-textract-parser-ruby"

# Fixtures or other initialization code can be added here to make
# experimenting with the gem easier.
#
# To use Pry instead of IRB, add pry to the Gemfile and swap the two lines
# at the bottom for:
#   require "pry"
#   Pry.start

require "irb"
IRB.start(__FILE__)
|
data/bin/setup
ADDED
data/lib/.DS_Store
ADDED
Binary file
|
@@ -0,0 +1,590 @@
|
|
1
|
+
require "amazon-textract-parser-ruby/version"
|
2
|
+
|
3
|
+
# Object model for Amazon Textract analysis results.
#
# Every class here is built from plain Ruby hashes with symbol keys
# (:block_type, :id, :geometry, :relationships, ...), as read by the code
# below. Public method and reader names keep their camelCase spelling so
# existing callers continue to work.
module AmazonTRP
  # Base error class for this library.
  class Error < StandardError; end

  # Sorts +e+ by the key the block returns for each element, breaking ties by
  # the element's original position so equal keys keep their input order.
  #
  # @param e [Enumerable] the collection to sort
  # @yield [x] computes the sort key for an element
  # @return [Array] the sorted elements
  def self.stable_sort_by(e)
    e.sort_by.with_index { |x, idx| [yield(x), idx] }
  end

  # Axis-aligned rectangle described by its size and its top-left corner.
  class BoundingBox
    attr_reader :width, :height, :left, :top

    def initialize(width, height, left, top)
      @width = width
      @height = height
      @left = left
      @top = top
    end

    def to_s
      "width: #{@width}, height: #{@height}, left: #{@left}, top: #{@top}"
    end

    # @return [Numeric] x coordinate of the right edge (left + width)
    def right
      @left + @width
    end

    # @return [Numeric] y coordinate of the bottom edge (top + height)
    def bottom
      @top + @height
    end
  end

  # A single polygon vertex.
  class Point
    attr_reader :x, :y

    def initialize(x, y)
      @x = x
      @y = y
    end

    def to_s
      "(#{@x}, #{@y})"
    end
  end

  # Location of a block: its bounding box plus the outline polygon.
  class Geometry
    attr_reader :boundingBox, :polygon

    # @param geometry [Hash] must contain :bounding_box (with :width, :height,
    #   :left, :top) and :polygon (an array of hashes with :x and :y)
    def initialize(geometry)
      bbox = geometry[:bounding_box]
      @boundingBox = BoundingBox.new(bbox[:width], bbox[:height], bbox[:left], bbox[:top])
      @polygon = geometry[:polygon].map { |p| Point.new(p[:x], p[:y]) }
    end

    def to_s
      # Must read @boundingBox; the former @bounding_box was never assigned
      # and always rendered as an empty string.
      "BoundingBox: #{@boundingBox}"
    end
  end

  # A WORD block.
  class Word
    attr_reader :confidence, :geometry, :id, :text, :block

    # @param block [Hash] the WORD block
    # @param blockMap [Hash] id => block lookup (accepted for signature parity
    #   with the other element classes; not used here)
    def initialize(block, blockMap)
      @block = block
      @confidence = block[:confidence]
      @geometry = Geometry.new(block[:geometry])
      @id = block[:id]
      @text = block[:text] || ""
    end

    def to_s
      @text
    end
  end

  # A LINE block together with the WORD blocks it references as children.
  class Line
    attr_reader :confidence, :geometry, :id, :words, :text, :block

    # @param block [Hash] the LINE block
    # @param blockMap [Hash] id => block lookup used to resolve child ids
    def initialize(block, blockMap)
      @block = block
      @confidence = block[:confidence]
      @geometry = Geometry.new(block[:geometry])
      @id = block[:id]
      @text = block[:text] || ""
      @words = []

      (block[:relationships] || []).each do |rs|
        next unless rs[:type] == 'CHILD'

        rs[:ids].each do |cid|
          child = blockMap[cid]
          @words << Word.new(child, blockMap) if child[:block_type] == "WORD"
        end
      end
    end

    def to_s
      "Line: #{@text}\nWords: #{@words.map { |word| "[#{word}]" }.join}"
    end
  end

  # A SELECTION_ELEMENT block (its :selection_status is exposed as selectionStatus).
  class SelectionElement
    attr_reader :confidence, :geometry, :id, :selectionStatus

    def initialize(block, blockMap)
      @confidence = block[:confidence]
      @geometry = Geometry.new(block[:geometry])
      @id = block[:id]
      @selectionStatus = block[:selection_status]
    end
  end

  # Key half of a form field; its text is the child words joined by spaces.
  class FieldKey
    attr_reader :confidence, :geometry, :id, :content, :text, :block

    # @param block [Hash] the KEY_VALUE_SET block holding the key
    # @param children [Array] ids of the key's child blocks
    # @param blockMap [Hash] id => block lookup
    def initialize(block, children, blockMap)
      @block = block
      @confidence = block[:confidence]
      @geometry = Geometry.new(block[:geometry])
      @id = block[:id]
      @content = []

      children.each do |eid|
        wb = blockMap[eid]
        @content << Word.new(wb, blockMap) if wb[:block_type] == "WORD"
      end
      @text = @content.map(&:text).join(' ')
    end

    def to_s
      @text
    end
  end

  # Value half of a form field. Content may mix words and selection elements;
  # the text joins word texts and selection statuses with spaces.
  class FieldValue
    attr_reader :confidence, :geometry, :id, :content, :text, :block

    # @param block [Hash] the KEY_VALUE_SET block holding the value
    # @param children [Array] ids of the value's child blocks
    # @param blockMap [Hash] id => block lookup
    def initialize(block, children, blockMap)
      @block = block
      @confidence = block[:confidence]
      @geometry = Geometry.new(block[:geometry])
      @id = block[:id]
      @content = []

      parts = []
      children.each do |eid|
        wb = blockMap[eid]
        case wb[:block_type]
        when "WORD"
          w = Word.new(wb, blockMap)
          @content << w
          parts << w.text
        when "SELECTION_ELEMENT"
          se = SelectionElement.new(wb, blockMap)
          @content << se
          parts << se.selectionStatus
        end
      end
      @text = parts.join(' ')
    end

    def to_s
      @text
    end
  end

  # A key/value pair built from a KEY block of a KEY_VALUE_SET.
  class Field
    attr_reader :key, :value

    # @param block [Hash] KEY_VALUE_SET block whose entity type is KEY
    # @param blockMap [Hash] id => block lookup
    def initialize(block, blockMap)
      @key = nil
      @value = nil

      # A block without relationships yields a field whose key and value
      # are both nil instead of raising.
      (block[:relationships] || []).each do |item|
        case item[:type]
        when "CHILD"
          @key = FieldKey.new(block, item[:ids], blockMap)
        when "VALUE"
          item[:ids].each { |eid| assign_value(blockMap[eid], blockMap) }
        end
      end
    end

    def to_s
      "Field: #{@key} = #{@value}"
    end

    private

    # Builds @value from a VALUE block's CHILD relationships, if it has any.
    def assign_value(vkvs, blockMap)
      return unless vkvs[:entity_types].include?('VALUE')

      (vkvs[:relationships] || []).each do |vitem|
        @value = FieldValue.new(vkvs, vitem[:ids], blockMap) if vitem[:type] == "CHILD"
      end
    end
  end

  # The fields found on a page, in insertion order and indexed by key text.
  class Form
    attr_reader :fields

    def initialize
      @fields = []
      @fieldsMap = {}
    end

    # Adds +field+; a later field with the same key text replaces the earlier
    # one in the exact-key lookup but both remain in #fields.
    def addField(field)
      @fields << field
      @fieldsMap[field.key.text] = field
    end

    def to_s
      "Form fields:\n#{@fields.map { |field| "#{field}\n" }.join}"
    end

    # @return [Field, nil] the field whose key text equals +key+ exactly
    def getFieldByKey(key)
      @fieldsMap[key]
    end

    # @return [Array<Field>] fields whose key text contains +key+, ignoring case
    def findFieldsByKey(key)
      searchKey = key.downcase
      @fields.select { |field| field.key && field.key.text.downcase.include?(searchKey) }
    end

    # @return [Field, nil] the match from #findFieldsByKey with the shortest
    #   key text (the earliest one on ties), or nil when nothing matches
    def findFieldByKey(key)
      findFieldsByKey(key).min_by { |f| f.key.text.length }
    end
  end

  # A table CELL block with its child words and selection elements.
  class Cell
    attr_reader :confidence, :rowIndex, :columnIndex, :rowSpan, :columnSpan,
                :geometry, :id, :content, :text, :block

    # @param block [Hash] the CELL block
    # @param blockMap [Hash] id => block lookup used to resolve child ids
    def initialize(block, blockMap)
      @block = block
      @confidence = block[:confidence]
      @rowIndex = block[:row_index]
      @columnIndex = block[:column_index]
      @rowSpan = block[:row_span]
      @columnSpan = block[:column_span]
      @geometry = Geometry.new(block[:geometry])
      @id = block[:id]
      @content = []

      parts = []
      (block[:relationships] || []).each do |rs|
        next unless rs[:type] == 'CHILD'

        rs[:ids].each do |cid|
          child = blockMap[cid]
          case child[:block_type]
          when "WORD"
            w = Word.new(child, blockMap)
            @content << w
            parts << "#{w.text} "
          when "SELECTION_ELEMENT"
            se = SelectionElement.new(child, blockMap)
            @content << se
            parts << "#{se.selectionStatus}, "
          end
        end
      end
      # Only surrounding whitespace is stripped, so a trailing selection
      # element still leaves its comma in place.
      @text = parts.join.strip
    end

    def to_s
      @text
    end
  end

  # One table row: an ordered list of cells.
  class Row
    attr_reader :cells

    def initialize
      @cells = []
    end

    def to_s
      @cells.map { |cell| "[#{cell}]" }.join
    end
  end

  # A TABLE block; child cells are grouped into rows by their row index.
  class Table
    attr_reader :confidence, :geometry, :id, :rows, :block

    # @param block [Hash] the TABLE block
    # @param blockMap [Hash] id => block lookup used to resolve cell ids
    def initialize(block, blockMap)
      @block = block
      @confidence = block[:confidence]
      @geometry = Geometry.new(block[:geometry])
      @id = block[:id]
      @rows = []

      ri = 1
      row = Row.new
      (block[:relationships] || []).each do |rs|
        next unless rs[:type] == 'CHILD'

        rs[:ids].each do |cid|
          cell = Cell.new(blockMap[cid], blockMap)
          # A higher row index than the current one starts a new row.
          if cell.rowIndex > ri
            @rows << row
            row = Row.new
            ri = cell.rowIndex
          end
          row.cells << cell
        end
      end
      # Flush the last row only when it holds cells. An empty Array is truthy
      # in Ruby, so the former `if row && row.cells` also appended empty rows.
      @rows << row unless row.cells.empty?
    end

    def to_s
      "Table:\n#{@rows.map { |row| "#{row}\n" }.join}"
    end
  end

  # One document page: its lines, tables and form fields.
  class Page
    attr_reader :blocks, :text, :lines, :form, :tables, :content, :geometry, :id

    # @param blocks [Array<Hash>] the blocks belonging to this page
    # @param blockMap [Hash] id => block lookup for the whole document
    def initialize(blocks, blockMap)
      @blocks = blocks
      @text = ""
      @lines = []
      @form = Form.new
      @tables = []
      @content = []

      _parse(blockMap)
    end

    def to_s
      "Page:\n#{@content.map { |item| "#{item}\n" }.join}"
    end

    # Walks the page's blocks and fills lines, tables, form and content.
    def _parse(blockMap)
      @blocks.each do |item|
        case item[:block_type]
        when "PAGE"
          @geometry = Geometry.new(item[:geometry])
          @id = item[:id]
        when "LINE"
          l = Line.new(item, blockMap)
          @lines << l
          @content << l
          # Double quotes give a real line break; '\n' in single quotes is a
          # literal backslash followed by "n".
          @text += "#{l.text}\n"
        when "TABLE"
          t = Table.new(item, blockMap)
          @tables << t
          @content << t
        when "KEY_VALUE_SET"
          next unless item[:entity_types].include?('KEY')

          f = Field.new(item, blockMap)
          # Fields without a key cannot be indexed by the form, so skip them.
          if f.key
            @form.addField(f)
            @content << f
          end
        end
      end
    end

    # Groups lines into columns by horizontal overlap and returns them column
    # by column, preserving the original order within each column.
    #
    # @return [Array<Hash>] hashes with :column (index) and :text
    def getLinesInReadingOrder
      columns = []
      lines = []
      @lines.each do |item|
        bbox = item.geometry.boundingBox
        bbox_left = bbox.left
        bbox_right = bbox.right
        bbox_centre = bbox.left + bbox.width / 2

        # A line joins the first column whose span contains the line's centre,
        # or whose own centre falls inside the line.
        index = columns.index do |column|
          column_centre = column[:left] + ((column[:right] - column[:left]) / 2)
          (bbox_centre > column[:left] && bbox_centre < column[:right]) ||
            (column_centre > bbox_left && column_centre < bbox_right)
        end

        if index.nil?
          columns << { left: bbox_left, right: bbox_right }
          index = columns.size - 1
        end
        lines << { column: index, text: item.text }
      end

      AmazonTRP.stable_sort_by(lines) { |x| x[:column] }
    end

    # @return [String] the line texts in reading order, each followed by a newline
    def getTextInReadingOrder
      getLinesInReadingOrder.map { |line| "#{line[:text]}\n" }.join
    end

    # @param boundingBox [BoundingBox] the region to search
    # @return [Array<Line>] lines whose top-left corner lies inside +boundingBox+
    def getLinesInBoundingBox(boundingBox)
      @lines.select do |line|
        line_bbox = line.geometry.boundingBox
        line_bbox.left >= boundingBox.left &&
          line_bbox.left <= boundingBox.right &&
          line_bbox.top >= boundingBox.top &&
          line_bbox.top <= boundingBox.bottom
      end
    end
  end

  # Entry point: wraps one response hash (or an array of them) and exposes
  # the parsed pages.
  class Document
    attr_reader :pages

    # @param responsePages [Hash, Array<Hash>] response hash(es), each with an
    #   optional :blocks array
    def initialize(responsePages)
      @responsePages = responsePages.is_a?(Array) ? responsePages : [responsePages]
      @pages = []
      _parse
    end

    # The response hashes as given to the constructor (always an Array).
    # Previously an attr_reader over a never-assigned ivar, so it returned nil.
    def blocks
      @responsePages
    end

    # The blocks regrouped per document page, as an array of { blocks: [...] }.
    # Previously an attr_reader over a never-assigned ivar, so it returned nil.
    def pageBlocks
      @responseDocumentPages
    end

    def to_s
      "\nDocument:\n#{@pages.map { |p| "#{p}\n\n" }.join}"
    end

    # Splits all response blocks into per-page groups (a PAGE block starts a
    # new group) and builds the id => block lookup.
    #
    # @return [Array(Array<Hash>, Hash)] the page groups and the block map
    def _parseDocumentPagesAndBlockMap
      blockMap = {}
      documentPages = []
      documentPage = nil

      @responsePages.each do |page|
        (page[:blocks] || []).each do |block|
          blockMap[block[:id]] = block if block.has_key?(:block_type) && block.has_key?(:id)

          if block[:block_type] == 'PAGE'
            documentPages << { blocks: documentPage } if documentPage
            documentPage = [block]
          else
            # Blocks seen before any PAGE block open an implicit page rather
            # than raising on nil.
            documentPage ||= []
            documentPage << block
          end
        end
      end
      documentPages << { blocks: documentPage } if documentPage

      return documentPages, blockMap
    end

    # Builds a Page for every page group.
    def _parse
      @responseDocumentPages, @blockMap = _parseDocumentPagesAndBlockMap
      @responseDocumentPages.each do |documentPage|
        @pages << Page.new(documentPage[:blocks], @blockMap)
      end
    end

    # @return [Hash, nil] the block with the given id, or nil when unknown
    def getBlockById(blockId)
      @blockMap && @blockMap[blockId]
    end
  end
end
|
Binary file
|
metadata
ADDED
@@ -0,0 +1,129 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: amazon-textract-parser-ruby
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Niels Vanspauwen
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2020-07-17 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.17'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.17'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 12.3.3
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 12.3.3
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: minitest
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '5.0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '5.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: minitest-reporters
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: activesupport
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: 6.0.3.2
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: 6.0.3.2
|
83
|
+
description: This is a quick Ruby port of https://github.com/mludvig/amazon-textract-parser\nIt's
|
84
|
+
useful for interpreting the result of Amazon Textract info.
|
85
|
+
email:
|
86
|
+
- niels.vanspauwen@gmail.com
|
87
|
+
executables: []
|
88
|
+
extensions: []
|
89
|
+
extra_rdoc_files: []
|
90
|
+
files:
|
91
|
+
- ".DS_Store"
|
92
|
+
- ".gitignore"
|
93
|
+
- ".travis.yml"
|
94
|
+
- Gemfile
|
95
|
+
- Gemfile.lock
|
96
|
+
- LICENSE.txt
|
97
|
+
- README.md
|
98
|
+
- Rakefile
|
99
|
+
- amazon-textract-parser-ruby.gemspec
|
100
|
+
- bin/console
|
101
|
+
- bin/setup
|
102
|
+
- lib/.DS_Store
|
103
|
+
- lib/amazon-textract-parser-ruby.rb
|
104
|
+
- lib/amazon-textract-parser-ruby/.DS_Store
|
105
|
+
- lib/amazon-textract-parser-ruby/version.rb
|
106
|
+
homepage: https://github.com/nielsvanspauwen/amazon-textract-parser-ruby
|
107
|
+
licenses:
|
108
|
+
- MIT
|
109
|
+
metadata: {}
|
110
|
+
post_install_message:
|
111
|
+
rdoc_options: []
|
112
|
+
require_paths:
|
113
|
+
- lib
|
114
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
115
|
+
requirements:
|
116
|
+
- - ">="
|
117
|
+
- !ruby/object:Gem::Version
|
118
|
+
version: '0'
|
119
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
120
|
+
requirements:
|
121
|
+
- - ">="
|
122
|
+
- !ruby/object:Gem::Version
|
123
|
+
version: '0'
|
124
|
+
requirements: []
|
125
|
+
rubygems_version: 3.0.1
|
126
|
+
signing_key:
|
127
|
+
specification_version: 4
|
128
|
+
summary: Amazon Textract Results Parser
|
129
|
+
test_files: []
|