amazon-textract-parser-ruby 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 415a439783ef9a00d5d1caf1ddbc111e5311a5bdce51a064daf5b831a6615c34
4
+ data.tar.gz: 358f5d7b9f74dad6a5f7110188769be65eda89ea9d346072ae047d03c083373f
5
+ SHA512:
6
+ metadata.gz: 70cd4616be9e8047fac583c6e4db430654ae414934a19c4df9c6e832792a01cdda55d300d4f602714b2bb87f52ebc1c0f1311717d3446a72486bfe9cf521a12f
7
+ data.tar.gz: 91bf70e8869832452cb896243691090fcb7042e4fb77a3ee121bf64265921ac320ba5ec399322e17fc32b166bd2656b6108cda75aad1417d1dbd6005b2de2205
Binary file
@@ -0,0 +1,8 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
@@ -0,0 +1,7 @@
1
+ ---
2
+ sudo: false
3
+ language: ruby
4
+ cache: bundler
5
+ rvm:
6
+ - 2.6.0
7
+ before_install: gem install bundler -v 1.17.2
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source "https://rubygems.org"
2
+
3
+ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
4
+
5
+ # Specify your gem's dependencies in amazon-textract-parser-ruby.gemspec
6
+ gemspec
@@ -0,0 +1,45 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ amazon-textract-parser-ruby (0.1.0)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ activesupport (6.0.3.2)
10
+ concurrent-ruby (~> 1.0, >= 1.0.2)
11
+ i18n (>= 0.7, < 2)
12
+ minitest (~> 5.1)
13
+ tzinfo (~> 1.1)
14
+ zeitwerk (~> 2.2, >= 2.2.2)
15
+ ansi (1.5.0)
16
+ builder (3.2.4)
17
+ concurrent-ruby (1.1.6)
18
+ i18n (1.8.3)
19
+ concurrent-ruby (~> 1.0)
20
+ minitest (5.14.1)
21
+ minitest-reporters (1.4.2)
22
+ ansi
23
+ builder
24
+ minitest (>= 5.0)
25
+ ruby-progressbar
26
+ rake (12.3.3)
27
+ ruby-progressbar (1.10.1)
28
+ thread_safe (0.3.6)
29
+ tzinfo (1.2.7)
30
+ thread_safe (~> 0.1)
31
+ zeitwerk (2.4.0)
32
+
33
+ PLATFORMS
34
+ ruby
35
+
36
+ DEPENDENCIES
37
+ activesupport (~> 6.0.3.2)
38
+ amazon-textract-parser-ruby!
39
+ bundler (~> 1.17)
40
+ minitest (~> 5.0)
41
+ minitest-reporters
42
+ rake (~> 12.3.3)
43
+
44
+ BUNDLED WITH
45
+ 1.17.2
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2020 Niels Vanspauwen
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,46 @@
1
+ # Amazon Textract Results Parser
2
+
3
+ This is a quick Ruby port of [https://github.com/mludvig/amazon-textract-parser](https://github.com/mludvig/amazon-textract-parser).
4
+
5
+ It is useful for interpreting the results returned by Amazon Textract.
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'amazon-textract-parser-ruby'
13
+ ```
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install amazon-textract-parser-ruby
22
+
23
+ ## Usage
24
+
25
+ ```ruby
26
+ textract = Aws::Textract::Client.new
27
+ textract.start_document_analysis({...})
28
+ response = textract.get_document_analysis({...})
29
+ doc = AmazonTRP::Document.new(response.to_h)
30
+ ```
31
+
32
+ ## Development
33
+
34
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
35
+
36
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
37
+
38
+ For more info on creating and maintaining gems, check https://bundler.io/v2.0/guides/creating_gem.html
39
+
40
+ ## Contributing
41
+
42
+ Bug reports and pull requests are welcome on GitHub at https://github.com/nielsvanspauwen/amazon-textract-parser-ruby.
43
+
44
+ ## License
45
+
46
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
@@ -0,0 +1,10 @@
1
+ require "bundler/gem_tasks"
2
+ require "rake/testtask"
3
+
4
# Defines the `test` task: puts "test" and "lib" on the load path and runs
# every file matching test/**/*_test.rb.
Rake::TestTask.new(:test) do |test_task|
  test_task.libs.push("test", "lib")
  test_task.test_files = FileList["test/**/*_test.rb"]
end

# Running `rake` with no arguments runs the test task.
task default: :test
@@ -0,0 +1,31 @@
1
+
2
+ lib = File.expand_path("../lib", __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require "amazon-textract-parser-ruby/version"
5
+
6
# Gem specification for amazon-textract-parser-ruby.
Gem::Specification.new do |spec|
  spec.name          = "amazon-textract-parser-ruby"
  spec.version       = AmazonTRP::VERSION
  spec.authors       = ["Niels Vanspauwen"]
  spec.email         = ["niels.vanspauwen@gmail.com"]

  spec.summary       = %q{Amazon Textract Results Parser}
  # Double-quoted on purpose: inside %q{} the "\n" stayed a literal backslash
  # followed by "n", which is what ended up in the published description.
  spec.description   = "This is a quick Ruby port of https://github.com/mludvig/amazon-textract-parser\n" \
                       "It's useful for interpreting the result of Amazon Textract info."
  spec.homepage      = "https://github.com/nielsvanspauwen/amazon-textract-parser-ruby"
  spec.license       = "MIT"

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  # Test directories and macOS Finder metadata (.DS_Store) are left out.
  spec.files         = Dir.chdir(File.expand_path('..', __FILE__)) do
    `git ls-files -z`.split("\x0").reject do |f|
      f.match(%r{^(test|spec|features)/}) || File.basename(f) == ".DS_Store"
    end
  end
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.17"
  spec.add_development_dependency "rake", "~> 12.3.3"
  spec.add_development_dependency "minitest", "~> 5.0"
  spec.add_development_dependency "minitest-reporters"
  spec.add_development_dependency "activesupport", "~> 6.0.3.2"
end
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "amazon-textract-parser-ruby"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
Binary file
@@ -0,0 +1,590 @@
1
+ require "amazon-textract-parser-ruby/version"
2
+
3
+ module AmazonTRP
4
+ class Error < StandardError; end
5
+
6
# Sorts +e+ by the key the block returns, keeping elements whose keys compare
# equal in their original relative order (plain sort_by does not guarantee
# that).
#
# Defined with `self.` so the method follows the enclosing module instead of
# repeating its name.
#
# @param e [Enumerable] collection to sort
# @yield [x] each element; must return a comparable sort key
# @return [Array] new array ordered by key, ties kept in input order
def self.stable_sort_by(e)
  # The element's position is the tie-breaker for equal keys.
  e.sort_by.with_index { |x, idx| [yield(x), idx] }
end
9
+
10
+
11
# Rectangle described by its size and its left/top origin.
class BoundingBox
  attr_reader :width, :height, :left, :top

  # @param width [Numeric] horizontal extent
  # @param height [Numeric] vertical extent
  # @param left [Numeric] coordinate of the left edge
  # @param top [Numeric] coordinate of the top edge
  def initialize(width, height, left, top)
    @width, @height, @left, @top = width, height, left, top
  end

  # @return [String] the four stored values, each with its label
  def to_s
    [["width", width], ["height", height], ["left", left], ["top", top]]
      .map { |label, value| "#{label}: #{value}" }
      .join(", ")
  end

  # @return [Numeric] coordinate of the right edge (left + width)
  def right
    left + width
  end

  # @return [Numeric] coordinate of the bottom edge (top + height)
  def bottom
    top + height
  end
end
36
+
37
+
38
# A pair of x/y coordinates.
class Point
  attr_reader :x, :y

  # @param x [Numeric] horizontal coordinate
  # @param y [Numeric] vertical coordinate
  def initialize(x, y)
    @x, @y = x, y
  end

  # @return [String] the point rendered as "(x, y)"
  def to_s
    format("(%s, %s)", x, y)
  end
end
51
+
52
+
53
# Location of a block: its bounding box and the points of its polygon.
class Geometry
  attr_reader :boundingBox, :polygon

  # @param geometry [Hash] must contain :bounding_box (a Hash with :width,
  #   :height, :left and :top); may contain :polygon (a list of Hashes with
  #   :x and :y). A missing :polygon yields an empty list.
  def initialize(geometry)
    bbox = geometry[:bounding_box]
    @boundingBox = BoundingBox.new(bbox[:width], bbox[:height], bbox[:left], bbox[:top])
    @polygon = (geometry[:polygon] || []).map { |pt| Point.new(pt[:x], pt[:y]) }
  end

  # @return [String] "BoundingBox: " followed by the bounding box's own to_s
  def to_s
    # Must read @boundingBox: the previously used @bounding_box is never
    # assigned, so the box was silently rendered as an empty string.
    "BoundingBox: #{@boundingBox}"
  end
end
68
+
69
+
70
# Wrapper around a single word block: keeps the raw block together with its
# id, confidence, geometry and text.
class Word
  attr_reader :confidence, :geometry, :id, :text, :block

  # @param block [Hash] raw block; the keys read are :confidence, :geometry,
  #   :id and :text
  # @param blockMap [Hash] not used by this class
  def initialize(block, blockMap)
    @block      = block
    @id         = block[:id]
    @confidence = block[:confidence]
    @geometry   = Geometry.new(block[:geometry])
    # A block without text is represented by an empty string.
    @text       = block[:text] || ""
  end

  # @return [String] the word's text
  def to_s
    text
  end
end
89
+
90
+
91
# Wrapper around a line block: its own text plus the Word objects built from
# its CHILD relationships.
class Line
  attr_reader :confidence, :geometry, :id, :words, :text, :block

  # @param block [Hash] raw block; the keys read are :confidence, :geometry,
  #   :id, :text and :relationships
  # @param blockMap [Hash] block id => raw block, used to resolve child ids
  def initialize(block, blockMap)
    @block      = block
    @confidence = block[:confidence]
    @geometry   = Geometry.new(block[:geometry])
    @id         = block[:id]
    @text       = block[:text] || ""

    # Only CHILD relationships are followed, and only children whose
    # block type is "WORD" become Word objects.
    child_ids = (block[:relationships] || [])
                .select { |rel| rel[:type] == 'CHILD' }
                .flat_map { |rel| rel[:ids] }
    @words = child_ids
             .map { |cid| blockMap[cid] }
             .select { |child| child[:block_type] == "WORD" }
             .map { |child| Word.new(child, blockMap) }
  end

  # @return [String] the line text on the first row, then every word wrapped
  #   in square brackets on the second
  def to_s
    bracketed = @words.map { |word| "[#{word}]" }.join
    "Line: " + @text + "\n" + "Words: " + bracketed
  end
end
131
+
132
+
133
# Wrapper around a selection-element block exposing its id, confidence,
# geometry and selection status.
class SelectionElement
  attr_reader :confidence, :geometry, :id, :selectionStatus

  # @param block [Hash] raw block; the keys read are :confidence, :geometry,
  #   :id and :selection_status
  # @param blockMap [Hash] not used by this class
  def initialize(block, blockMap)
    @id              = block[:id]
    @confidence      = block[:confidence]
    @selectionStatus = block[:selection_status]
    @geometry        = Geometry.new(block[:geometry])
  end
end
146
+
147
+
148
# Key half of a form field: the key block plus the words that make up its
# label.
class FieldKey
  attr_reader :confidence, :geometry, :id, :content, :text, :block

  # @param block [Hash] raw key block; the keys read are :confidence,
  #   :geometry and :id
  # @param children [Array] ids of the child blocks to look up in blockMap
  # @param blockMap [Hash] block id => raw block
  def initialize(block, children, blockMap)
    @block      = block
    @confidence = block[:confidence]
    @geometry   = Geometry.new(block[:geometry])
    @id         = block[:id]

    # Children that are not "WORD" blocks are ignored.
    @content = children
               .map { |eid| blockMap[eid] }
               .select { |child| child[:block_type] == "WORD" }
               .map { |child| Word.new(child, blockMap) }
    # The key text is the words joined by single spaces ("" when none).
    @text = @content.map(&:text).join(' ')
  end

  # @return [String] the key text
  def to_s
    text
  end
end
180
+
181
+
182
# Value half of a form field: the value block plus the words and selection
# elements it contains.
class FieldValue
  attr_reader :confidence, :geometry, :id, :content, :text, :block

  # @param block [Hash] raw value block; the keys read are :confidence,
  #   :geometry and :id
  # @param children [Array] ids of the child blocks to look up in blockMap
  # @param blockMap [Hash] block id => raw block
  def initialize(block, children, blockMap)
    @block      = block
    @confidence = block[:confidence]
    @geometry   = Geometry.new(block[:geometry])
    @id         = block[:id]
    @content    = []

    labels = []
    children.each do |eid|
      child = blockMap[eid]
      # Words contribute their text, selection elements their status;
      # any other block type is skipped.
      case child[:block_type]
      when "WORD"
        element = Word.new(child, blockMap)
        label = element.text
      when "SELECTION_ELEMENT"
        element = SelectionElement.new(child, blockMap)
        label = element.selectionStatus
      else
        next
      end
      @content << element
      labels << label
    end

    @text = labels.join(' ')
  end

  # @return [String] the value text
  def to_s
    text
  end
end
219
+
220
+
221
# A form field: a FieldKey built from the key block's CHILD relationship and
# a FieldValue built from the block(s) referenced by its VALUE relationship.
class Field
  attr_reader :key, :value

  # @param block [Hash] raw key block; :relationships is optional
  # @param blockMap [Hash] block id => raw block
  def initialize(block, blockMap)
    @key = nil
    @value = nil

    # A block without relationships yields a field with neither key nor
    # value instead of raising on nil (same guard the other element classes
    # apply before iterating relationships).
    (block[:relationships] || []).each do |rel|
      case rel[:type]
      when "CHILD"
        @key = FieldKey.new(block, rel[:ids], blockMap)
      when "VALUE"
        rel[:ids].each do |eid|
          value_block = blockMap[eid]
          next unless value_block[:entity_types].include?('VALUE')

          # The value's content hangs off the value block's own CHILD
          # relationship; a value block without relationships has none.
          (value_block[:relationships] || []).each do |vrel|
            @value = FieldValue.new(value_block, vrel[:ids], blockMap) if vrel[:type] == "CHILD"
          end
        end
      end
    end
  end

  # @return [String] "Field: <key> = <value>"; a missing key or value is
  #   rendered as an empty string
  def to_s
    "Field: #{@key} = #{@value}"
  end
end
257
+
258
+
259
# Collection of form fields with lookup by exact key text and by
# case-insensitive substring.
class Form
  attr_reader :fields

  def initialize
    @fields = []
    @fieldsMap = {}
  end

  # Adds a field and indexes it under its exact key text; a later field with
  # the same key text replaces the earlier one in the index.
  #
  # @param field [Field] field whose key is set
  def addField(field)
    @fields << field
    @fieldsMap[field.key.text] = field
  end

  # @return [String] a header line followed by one line per field
  def to_s
    "Form fields:\n" + @fields.map { |field| field.to_s + "\n" }.join
  end

  # @param key [String] exact key text
  # @return [Field, nil] the field indexed under that text
  def getFieldByKey(key)
    @fieldsMap[key]
  end

  # @param key [String] text to look for, compared case-insensitively
  # @return [Array<Field>] every field whose key text contains +key+
  def findFieldsByKey(key)
    needle = key.downcase
    @fields.select { |field| field.key && field.key.text.downcase.include?(needle) }
  end

  # @param key [String] text to look for, compared case-insensitively
  # @return [Field, nil] the match with the shortest key text (the earliest
  #   one on ties), or nil when nothing matches
  def findFieldByKey(key)
    findFieldsByKey(key).min_by { |field| field.key.text.length }
  end
end
309
+
310
+
311
# Wrapper around a table cell block: its position and span in the table plus
# the words and selection elements it contains.
class Cell
  attr_reader :confidence, :rowIndex, :columnIndex, :rowSpan, :columnSpan,
              :geometry, :id, :content, :text, :block

  # @param block [Hash] raw cell block; the keys read are :confidence,
  #   :row_index, :column_index, :row_span, :column_span, :geometry, :id and
  #   :relationships
  # @param blockMap [Hash] block id => raw block, used to resolve child ids
  def initialize(block, blockMap)
    @block       = block
    @confidence  = block[:confidence]
    @rowIndex    = block[:row_index]
    @columnIndex = block[:column_index]
    @rowSpan     = block[:row_span]
    @columnSpan  = block[:column_span]
    @geometry    = Geometry.new(block[:geometry])
    @id          = block[:id]
    @content     = []
    @text        = ""

    (block[:relationships] || []).each do |rel|
      next unless rel[:type] == 'CHILD'

      rel[:ids].each do |cid|
        child = blockMap[cid]
        case child[:block_type]
        when "WORD"
          word = Word.new(child, blockMap)
          @content << word
          # Words are separated by a single space.
          @text = @text + word.text + ' '
        when "SELECTION_ELEMENT"
          mark = SelectionElement.new(child, blockMap)
          @content << mark
          # Selection statuses are followed by a comma and a space.
          @text = @text + mark.selectionStatus + ', '
        end
      end
    end

    # Only surrounding whitespace is removed; a trailing comma stays.
    @text = @text.strip
  end

  # @return [String] the cell text
  def to_s
    text
  end
end
359
+
360
+
361
# One table row: an ordered, initially empty list of cells.
class Row
  attr_reader :cells

  def initialize
    @cells = []
  end

  # @return [String] every cell wrapped in square brackets, concatenated
  def to_s
    @cells.map { |cell| "[#{cell}]" }.join
  end
end
376
+
377
+
378
# Wrapper around a table block: groups the cells referenced by its CHILD
# relationships into rows.
class Table
  attr_reader :confidence, :geometry, :id, :rows, :block

  # @param block [Hash] raw table block; the keys read are :confidence,
  #   :geometry, :id and :relationships
  # @param blockMap [Hash] block id => raw block, used to resolve cell ids
  def initialize(block, blockMap)
    @block      = block
    @confidence = block[:confidence]
    @geometry   = Geometry.new(block[:geometry])
    @id         = block[:id]
    @rows       = []

    cells = (block[:relationships] || [])
            .select { |rel| rel[:type] == 'CHILD' }
            .flat_map { |rel| rel[:ids] }
            .map { |cid| Cell.new(blockMap[cid], blockMap) }

    # Cells are taken in the order listed; a new row starts whenever a
    # cell's row index exceeds the index of the row being filled.
    current_index = 1
    row = Row.new
    cells.each do |cell|
      if cell.rowIndex > current_index
        @rows << row unless row.cells.empty?
        row = Row.new
        current_index = cell.rowIndex
      end
      row.cells << cell
    end
    # An empty array is truthy in Ruby, so emptiness has to be tested
    # explicitly; otherwise a table without cells gets a phantom empty row.
    @rows << row unless row.cells.empty?
  end

  # @return [String] a header line followed by one line per row
  def to_s
    "Table:\n" + @rows.map { |row| "#{row}\n" }.join
  end
end
423
+
424
+
425
# One page of a document: the lines, tables and form fields found among the
# page's blocks, plus helpers to read the lines column by column.
class Page
  attr_reader :blocks, :text, :lines, :form, :tables, :content, :geometry, :id

  # @param blocks [Array<Hash>] raw blocks belonging to this page
  # @param blockMap [Hash] block id => raw block, passed on to the element
  #   classes so they can resolve relationships
  def initialize(blocks, blockMap)
    @blocks  = blocks
    @text    = ""
    @lines   = []
    @form    = Form.new
    @tables  = []
    @content = []

    _parse(blockMap)
  end

  # @return [String] a header line followed by one line per content item
  def to_s
    "Page:\n" + @content.map { |item| "#{item}\n" }.join
  end

  # Walks the page's blocks and fills geometry, id, lines, tables, form and
  # content according to each block's :block_type. Other types are ignored.
  #
  # @param blockMap [Hash] block id => raw block
  def _parse(blockMap)
    @blocks.each do |item|
      case item[:block_type]
      when "PAGE"
        @geometry = Geometry.new(item[:geometry])
        @id = item[:id]
      when "LINE"
        line = Line.new(item, blockMap)
        @lines << line
        @content << line
        # Double quotes are required: '\n' in single quotes is a literal
        # backslash followed by "n", not a line break.
        @text += "#{line.text}\n"
      when "TABLE"
        table = Table.new(item, blockMap)
        @tables << table
        @content << table
      when "KEY_VALUE_SET"
        # Only KEY entities start a field, and only fields that ended up
        # with a key are kept.
        next unless item[:entity_types].include?('KEY')

        field = Field.new(item, blockMap)
        next unless field.key

        @form.addField(field)
        @content << field
      end
    end
  end

  # Assigns every line to a column and returns the lines ordered by column,
  # keeping the original order inside each column.
  #
  # A line joins the first existing column whose horizontal range contains
  # the line's centre, or whose centre falls inside the line's range;
  # otherwise it opens a new column spanning the line's left and right edges.
  #
  # @return [Array<Hash>] hashes with :column (Integer index) and :text
  def getLinesInReadingOrder
    columns = []
    placed = @lines.map do |item|
      bbox = item.geometry.boundingBox
      bbox_left = bbox.left
      bbox_right = bbox.right
      bbox_centre = bbox_left + bbox.width / 2

      index = columns.index do |column|
        column_centre = column[:left] + ((column[:right] - column[:left]) / 2)
        (bbox_centre > column[:left] && bbox_centre < column[:right]) ||
          (column_centre > bbox_left && column_centre < bbox_right)
      end

      if index.nil?
        columns << {:left => bbox_left, :right => bbox_right}
        index = columns.count - 1
      end

      {:column => index, :text => item.text}
    end

    AmazonTRP::stable_sort_by(placed) { |x| x[:column] }
  end

  # @return [String] the text of every line in reading order, each followed
  #   by a newline
  def getTextInReadingOrder
    getLinesInReadingOrder.map { |line| "#{line[:text]}\n" }.join
  end

  # @param boundingBox [BoundingBox] area to search
  # @return [Array<Line>] lines whose left/top corner lies inside the area
  #   (edges included)
  def getLinesInBoundingBox(boundingBox)
    @lines.select do |line|
      line_bbox = line.geometry.boundingBox
      line_bbox.left.between?(boundingBox.left, boundingBox.right) &&
        line_bbox.top.between?(boundingBox.top, boundingBox.bottom)
    end
  end
end
529
+
530
+
531
# Entry point: takes one or more raw analysis responses, indexes every block
# by id and splits the blocks into Page objects.
class Document
  # blocks     - the response pages the document was built from
  # pageBlocks - the per-page block groups ({:blocks => [...]}) that were
  #              turned into Page objects
  # pages      - the parsed Page objects
  attr_reader :blocks, :pageBlocks, :pages

  # @param responsePages [Hash, Array<Hash>] a single response or a list of
  #   responses; each may carry its blocks under :blocks
  def initialize(responsePages)
    @responsePages = responsePages.is_a?(Array) ? responsePages : [responsePages]
    # Back the public readers; without these assignments they always
    # returned nil.
    @blocks = @responsePages
    @pageBlocks = []
    @pages = []
    _parse
  end

  # @return [String] a header followed by each page and a blank line
  def to_s
    "\nDocument:\n" + @pages.map { |page| "#{page}\n\n" }.join
  end

  # Builds the id => block index and groups the blocks page by page.
  #
  # A block with :block_type 'PAGE' starts a new group; every other block is
  # added to the group in progress, across response boundaries. Blocks seen
  # before any PAGE block are collected into a leading group instead of
  # raising on nil.
  #
  # @return [Array(Array<Hash>, Hash)] the page groups (each
  #   {:blocks => [...]}) and the id => block map
  def _parseDocumentPagesAndBlockMap
    blockMap = {}
    documentPages = []
    current = nil

    @responsePages.each do |response|
      (response[:blocks] || []).each do |block|
        # Only blocks that carry both a type and an id are indexed.
        blockMap[block[:id]] = block if block.key?(:block_type) && block.key?(:id)

        if block[:block_type] == 'PAGE'
          documentPages << {:blocks => current} if current
          current = []
        end
        (current ||= []) << block
      end
    end

    documentPages << {:blocks => current} if current
    [documentPages, blockMap]
  end

  # Runs the grouping above and creates one Page per group.
  def _parse
    @responseDocumentPages, @blockMap = _parseDocumentPagesAndBlockMap
    @pageBlocks = @responseDocumentPages
    @responseDocumentPages.each do |documentPage|
      @pages << Page.new(documentPage[:blocks], @blockMap)
    end
  end

  # @param blockId [String] id of the block to look up
  # @return [Hash, nil] the raw block, or nil when the id is unknown
  def getBlockById(blockId)
    return nil unless @blockMap

    @blockMap[blockId]
  end
end
589
+
590
+ end
@@ -0,0 +1,3 @@
1
module AmazonTRP
  # Gem version; frozen so the constant's string cannot be mutated in place.
  VERSION = "0.1.0".freeze
end
metadata ADDED
@@ -0,0 +1,129 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: amazon-textract-parser-ruby
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Niels Vanspauwen
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2020-07-17 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.17'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.17'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 12.3.3
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 12.3.3
41
+ - !ruby/object:Gem::Dependency
42
+ name: minitest
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '5.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '5.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: minitest-reporters
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: activesupport
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: 6.0.3.2
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: 6.0.3.2
83
+ description: This is a quick Ruby port of https://github.com/mludvig/amazon-textract-parser\nIt's
84
+ useful for interpreting the result of Amazon Textract info.
85
+ email:
86
+ - niels.vanspauwen@gmail.com
87
+ executables: []
88
+ extensions: []
89
+ extra_rdoc_files: []
90
+ files:
91
+ - ".DS_Store"
92
+ - ".gitignore"
93
+ - ".travis.yml"
94
+ - Gemfile
95
+ - Gemfile.lock
96
+ - LICENSE.txt
97
+ - README.md
98
+ - Rakefile
99
+ - amazon-textract-parser-ruby.gemspec
100
+ - bin/console
101
+ - bin/setup
102
+ - lib/.DS_Store
103
+ - lib/amazon-textract-parser-ruby.rb
104
+ - lib/amazon-textract-parser-ruby/.DS_Store
105
+ - lib/amazon-textract-parser-ruby/version.rb
106
+ homepage: https://github.com/nielsvanspauwen/amazon-textract-parser-ruby
107
+ licenses:
108
+ - MIT
109
+ metadata: {}
110
+ post_install_message:
111
+ rdoc_options: []
112
+ require_paths:
113
+ - lib
114
+ required_ruby_version: !ruby/object:Gem::Requirement
115
+ requirements:
116
+ - - ">="
117
+ - !ruby/object:Gem::Version
118
+ version: '0'
119
+ required_rubygems_version: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - ">="
122
+ - !ruby/object:Gem::Version
123
+ version: '0'
124
+ requirements: []
125
+ rubygems_version: 3.0.1
126
+ signing_key:
127
+ specification_version: 4
128
+ summary: Amazon Textract Results Parser
129
+ test_files: []