amazon-textract-parser-ruby 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 415a439783ef9a00d5d1caf1ddbc111e5311a5bdce51a064daf5b831a6615c34
4
+ data.tar.gz: 358f5d7b9f74dad6a5f7110188769be65eda89ea9d346072ae047d03c083373f
5
+ SHA512:
6
+ metadata.gz: 70cd4616be9e8047fac583c6e4db430654ae414934a19c4df9c6e832792a01cdda55d300d4f602714b2bb87f52ebc1c0f1311717d3446a72486bfe9cf521a12f
7
+ data.tar.gz: 91bf70e8869832452cb896243691090fcb7042e4fb77a3ee121bf64265921ac320ba5ec399322e17fc32b166bd2656b6108cda75aad1417d1dbd6005b2de2205
Binary file
@@ -0,0 +1,8 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
@@ -0,0 +1,7 @@
1
+ ---
2
+ sudo: false
3
+ language: ruby
4
+ cache: bundler
5
+ rvm:
6
+ - 2.6.0
7
+ before_install: gem install bundler -v 1.17.2
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
# Gems are resolved from the public RubyGems index.
source "https://rubygems.org"

# Lets `github: "owner/repo"` shorthands resolve to HTTPS GitHub URLs.
git_source(:github) do |repo_name|
  "https://github.com/#{repo_name}"
end

# Dependencies are declared in amazon-textract-parser-ruby.gemspec.
gemspec
@@ -0,0 +1,45 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ amazon-textract-parser-ruby (0.1.0)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ activesupport (6.0.3.2)
10
+ concurrent-ruby (~> 1.0, >= 1.0.2)
11
+ i18n (>= 0.7, < 2)
12
+ minitest (~> 5.1)
13
+ tzinfo (~> 1.1)
14
+ zeitwerk (~> 2.2, >= 2.2.2)
15
+ ansi (1.5.0)
16
+ builder (3.2.4)
17
+ concurrent-ruby (1.1.6)
18
+ i18n (1.8.3)
19
+ concurrent-ruby (~> 1.0)
20
+ minitest (5.14.1)
21
+ minitest-reporters (1.4.2)
22
+ ansi
23
+ builder
24
+ minitest (>= 5.0)
25
+ ruby-progressbar
26
+ rake (12.3.3)
27
+ ruby-progressbar (1.10.1)
28
+ thread_safe (0.3.6)
29
+ tzinfo (1.2.7)
30
+ thread_safe (~> 0.1)
31
+ zeitwerk (2.4.0)
32
+
33
+ PLATFORMS
34
+ ruby
35
+
36
+ DEPENDENCIES
37
+ activesupport (~> 6.0.3.2)
38
+ amazon-textract-parser-ruby!
39
+ bundler (~> 1.17)
40
+ minitest (~> 5.0)
41
+ minitest-reporters
42
+ rake (~> 12.3.3)
43
+
44
+ BUNDLED WITH
45
+ 1.17.2
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2020 Niels Vanspauwen
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,46 @@
1
+ # Amazon Textract Results Parser
2
+
3
+ This is a quick Ruby port of [https://github.com/mludvig/amazon-textract-parser](https://github.com/mludvig/amazon-textract-parser)
4
+
5
+ It's useful for interpreting the results returned by Amazon Textract.
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'amazon-textract-parser-ruby'
13
+ ```
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install amazon-textract-parser-ruby
22
+
23
+ ## Usage
24
+
25
+ ```ruby
26
+ textract = Aws::Textract::Client.new
27
+ textract.start_document_analysis({...})
28
+ response = textract.get_document_analysis({...})
29
+ doc = AmazonTRP::Document.new(response.to_h)
30
+ ```
31
+
32
+ ## Development
33
+
34
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
35
+
36
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
37
+
38
+ For more info on creating and maintaining gems, check https://bundler.io/v2.0/guides/creating_gem.html
39
+
40
+ ## Contributing
41
+
42
+ Bug reports and pull requests are welcome on GitHub at https://github.com/nielsvanspauwen/amazon-textract-parser-ruby.
43
+
44
+ ## License
45
+
46
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
@@ -0,0 +1,10 @@
1
require "bundler/gem_tasks"
require "rake/testtask"

# `rake test` runs every test/**/*_test.rb file with test/ and lib/ on the
# load path.
Rake::TestTask.new(:test) do |test_task|
  test_task.libs.push("test", "lib")
  test_task.test_files = FileList["test/**/*_test.rb"]
end

# Running plain `rake` runs the test suite.
task default: :test
@@ -0,0 +1,31 @@
1
+
2
lib = File.expand_path("../lib", __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require "amazon-textract-parser-ruby/version"

Gem::Specification.new do |spec|
  spec.name          = "amazon-textract-parser-ruby"
  spec.version       = AmazonTRP::VERSION
  spec.authors       = ["Niels Vanspauwen"]
  spec.email         = ["niels.vanspauwen@gmail.com"]

  spec.summary       = "Amazon Textract Results Parser"
  # Double-quoted on purpose: inside %q{} the "\n" escape is not interpreted
  # and ends up as a literal backslash-n in the published description.
  spec.description   = "This is a quick Ruby port of https://github.com/mludvig/amazon-textract-parser\n" \
                       "It's useful for interpreting the result of Amazon Textract info."
  spec.homepage      = "https://github.com/nielsvanspauwen/amazon-textract-parser-ruby"
  spec.license       = "MIT"

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  # Test directories and macOS Finder metadata (.DS_Store) are left out of the package.
  spec.files         = Dir.chdir(File.expand_path('..', __FILE__)) do
    `git ls-files -z`.split("\x0").reject do |f|
      f.match(%r{^(test|spec|features)/}) || File.basename(f) == ".DS_Store"
    end
  end
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.17"
  spec.add_development_dependency "rake", "~> 12.3.3"
  spec.add_development_dependency "minitest", "~> 5.0"
  spec.add_development_dependency "minitest-reporters"
  spec.add_development_dependency "activesupport", "~> 6.0.3.2"
end
@@ -0,0 +1,14 @@
1
#!/usr/bin/env ruby

# Opens an IRB session with the gem and its bundle already loaded.
require "bundler/setup"
require "amazon-textract-parser-ruby"

# Fixtures or initialization code can be added here to make experimenting
# with the gem easier. A different console can be swapped in as well, e.g.
# (remember to add pry to the Gemfile first):
#
#   require "pry"
#   Pry.start

require "irb"
IRB.start(__FILE__)
@@ -0,0 +1,8 @@
1
#!/usr/bin/env bash
# Installs the gem's development dependencies.

# Abort on errors, unset variables and failures anywhere in a pipeline.
set -euo pipefail
# Split words on newlines and tabs only, never on spaces.
IFS=$'\n\t'
# Echo each line as read and each command as executed.
set -vx

bundle install

# Do any other automated setup that you need to do here
Binary file
@@ -0,0 +1,590 @@
1
+ require "amazon-textract-parser-ruby/version"
2
+
3
+ module AmazonTRP
4
# Library-specific error type. Nothing in the code visible here raises it.
class Error < StandardError
end
5
+
6
# Sorts +e+ by the key the block computes for each element. Each key is
# paired with the element's original index, so elements with equal keys keep
# their input order.
#
# @param e [Enumerable] collection to sort
# @yieldparam x [Object] an element of +e+
# @yieldreturn [Object] the sort key for +x+ (must be comparable via <=>)
# @return [Array] the elements of +e+ in sorted order
def self.stable_sort_by(e)
  e.sort_by.with_index { |x, idx| [yield(x), idx] }
end
9
+
10
+
11
# Rectangle described by its size and the position of its left/top edge.
# Values are stored exactly as passed in.
class BoundingBox
  attr_reader :width, :height, :left, :top

  # @param width [Numeric]
  # @param height [Numeric]
  # @param left [Numeric] position of the left edge
  # @param top [Numeric] position of the top edge
  def initialize(width, height, left, top)
    @width, @height = width, height
    @left, @top = left, top
  end

  # @return [String] "width: W, height: H, left: L, top: T"
  def to_s
    %w[width height left top].map { |name| "#{name}: #{public_send(name)}" }.join(", ")
  end

  # @return [Numeric] position of the right edge (left + width)
  def right
    left + width
  end

  # @return [Numeric] position of the bottom edge (top + height)
  def bottom
    top + height
  end
end
36
+
37
+
38
# A single (x, y) coordinate pair.
class Point
  attr_reader :x, :y

  # @param x [Numeric]
  # @param y [Numeric]
  def initialize(x, y)
    @x, @y = x, y
  end

  # @return [String] the point formatted as "(x, y)"
  def to_s
    format("(%s, %s)", x, y)
  end
end
51
+
52
+
53
# Location of a block: its bounding box plus the points of its polygon.
class Geometry
  attr_reader :boundingBox, :polygon

  # @param geometry [Hash] hash with a :bounding_box entry (keys :width,
  #   :height, :left, :top) and a :polygon entry (list of hashes with :x, :y).
  #   A missing :polygon is treated as an empty list.
  def initialize(geometry)
    bbox = geometry[:bounding_box]
    @boundingBox = BoundingBox.new(bbox[:width], bbox[:height], bbox[:left], bbox[:top])
    @polygon = (geometry[:polygon] || []).map { |pt| Point.new(pt[:x], pt[:y]) }
  end

  # @return [String] "BoundingBox: " followed by the bounding box's to_s
  def to_s
    # Must read @boundingBox (the ivar set above); @bounding_box is never assigned.
    "BoundingBox: #{@boundingBox}"
  end
end
68
+
69
+
70
# A single word block: its text, confidence, id and geometry.
class Word
  attr_reader :confidence, :geometry, :id, :text, :block

  # @param block [Hash] the raw block; :confidence, :geometry, :id and :text are read
  # @param blockMap [Hash] accepted for a uniform constructor signature; not used here
  def initialize(block, blockMap)
    @block = block
    @id = block[:id]
    @confidence = block[:confidence]
    @text = block[:text] || ""
    @geometry = Geometry.new(block[:geometry])
  end

  # @return [String] the word's text ("" when the block had none)
  def to_s
    text
  end
end
89
+
90
+
91
# A line block together with the Word objects built from its WORD children.
class Line
  attr_reader :confidence, :geometry, :id, :words, :text, :block

  # @param block [Hash] the raw block; :confidence, :geometry, :id, :text and
  #   :relationships are read
  # @param blockMap [Hash] block id => raw block, used to resolve child ids
  def initialize(block, blockMap)
    @block = block
    @confidence = block[:confidence]
    @geometry = Geometry.new(block[:geometry])
    @id = block[:id]
    @text = block[:text] || ""
    @words = child_blocks(block, blockMap)
             .select { |child| child[:block_type] == "WORD" }
             .map { |child| Word.new(child, blockMap) }
  end

  # @return [String] "Line: <text>" then a "Words: [w1][w2]..." line
  def to_s
    "Line: #{@text}\nWords: #{@words.map { |word| "[#{word}]" }.join}"
  end

  private

  # Raw blocks referenced by the block's CHILD relationships, in order.
  # Ids that are not present in blockMap are skipped instead of raising.
  def child_blocks(block, blockMap)
    (block[:relationships] || [])
      .select { |rs| rs[:type] == 'CHILD' }
      .flat_map { |rs| rs[:ids] || [] }
      .map { |cid| blockMap[cid] }
      .compact
  end
end
131
+
132
+
133
# A selection element block (e.g. a checkbox) and its selection status.
class SelectionElement
  attr_reader :confidence, :geometry, :id, :selectionStatus

  # @param block [Hash] the raw block; :confidence, :geometry, :id and
  #   :selection_status are read
  # @param blockMap [Hash] accepted for a uniform constructor signature; not used here
  def initialize(block, blockMap)
    @id = block[:id]
    @selectionStatus = block[:selection_status]
    @confidence = block[:confidence]
    @geometry = Geometry.new(block[:geometry])
  end
end
146
+
147
+
148
# The key side of a form field: the Word objects that make up the key and
# their texts joined with single spaces.
class FieldKey
  attr_reader :confidence, :geometry, :id, :content, :text, :block

  # @param block [Hash] the raw key block; :confidence, :geometry and :id are read
  # @param children [Array<String>, nil] ids of the key's child blocks
  # @param blockMap [Hash] block id => raw block, used to resolve +children+
  def initialize(block, children, blockMap)
    @block = block
    @confidence = block[:confidence]
    @geometry = Geometry.new(block[:geometry])
    @id = block[:id]

    # Only WORD children contribute; ids missing from blockMap are skipped.
    @content = (children || [])
               .map { |eid| blockMap[eid] }
               .select { |wb| wb && wb[:block_type] == "WORD" }
               .map { |wb| Word.new(wb, blockMap) }
    @text = @content.map(&:text).join(' ')
  end

  # @return [String] the key text
  def to_s
    @text
  end
end
180
+
181
+
182
# The value side of a form field: its Word and SelectionElement children and
# a text built from word texts / selection statuses joined with single spaces.
class FieldValue
  attr_reader :confidence, :geometry, :id, :content, :text, :block

  # @param block [Hash] the raw value block; :confidence, :geometry and :id are read
  # @param children [Array<String>, nil] ids of the value's child blocks
  # @param blockMap [Hash] block id => raw block, used to resolve +children+
  def initialize(block, children, blockMap)
    @block = block
    @confidence = block[:confidence]
    @geometry = Geometry.new(block[:geometry])
    @id = block[:id]
    @content = []

    pieces = []
    (children || []).each do |eid|
      wb = blockMap[eid]
      next unless wb # id not present in blockMap: skip instead of raising

      case wb[:block_type]
      when "WORD"
        word = Word.new(wb, blockMap)
        @content.append(word)
        pieces.append(word.text)
      when "SELECTION_ELEMENT"
        element = SelectionElement.new(wb, blockMap)
        @content.append(element)
        pieces.append(element.selectionStatus)
      end
    end
    @text = pieces.join(' ')
  end

  # @return [String] the value text
  def to_s
    @text
  end
end
219
+
220
+
221
# A form field: a FieldKey and, when one can be resolved, a FieldValue.
class Field
  attr_reader :key, :value

  # Walks the key block's relationships: a CHILD relationship yields the key,
  # a VALUE relationship points at the blocks from which the value is built.
  # A block without relationships produces a field whose key and value are nil.
  #
  # @param block [Hash] the raw key block; :relationships is read here
  # @param blockMap [Hash] block id => raw block
  def initialize(block, blockMap)
    @key = nil
    @value = nil

    (block[:relationships] || []).each do |item|
      case item[:type]
      when "CHILD"
        @key = FieldKey.new(block, item[:ids], blockMap)
      when "VALUE"
        (item[:ids] || []).each { |eid| assign_value(blockMap[eid], blockMap) }
      end
    end
  end

  # @return [String] "Field: <key> = <value>" (missing parts render as "")
  def to_s
    "Field: #{@key} = #{@value}"
  end

  private

  # Builds @value from +vkvs+ when it is tagged VALUE and has a CHILD
  # relationship; a nil block or one without those entries is ignored.
  def assign_value(vkvs, blockMap)
    return unless vkvs && (vkvs[:entity_types] || []).include?('VALUE')

    (vkvs[:relationships] || []).each do |vitem|
      @value = FieldValue.new(vkvs, vitem[:ids], blockMap) if vitem[:type] == "CHILD"
    end
  end
end
257
+
258
+
259
# Collection of form fields with lookup by exact or partial key text.
class Form
  attr_reader :fields

  def initialize
    @fields = []
    @fieldsMap = {}
  end

  # Adds a field. Fields without a key are kept in #fields but cannot be
  # indexed for exact lookup; a later field with the same key text replaces
  # the earlier one in the exact-lookup index.
  #
  # @param field [#key] object whose key (if any) responds to #text
  def addField(field)
    @fields.append(field)
    @fieldsMap[field.key.text] = field if field.key
  end

  # @return [String] "Form fields:" followed by one line per field
  def to_s
    "Form fields:\n" + @fields.map { |field| "#{field}\n" }.join
  end

  # @param key [String] exact key text
  # @return [Object, nil] the field registered under +key+, or nil
  def getFieldByKey(key)
    @fieldsMap[key]
  end

  # @param key [String] text to search for, case-insensitively
  # @return [Array] fields whose key text contains +key+, in insertion order
  def findFieldsByKey(key)
    search_key = key.downcase
    @fields.select { |field| field.key && field.key.text.downcase.include?(search_key) }
  end

  # @param key [String] text to search for, case-insensitively
  # @return [Object, nil] the matching field with the shortest key text (the
  #   first one on ties), or nil when nothing matches
  def findFieldByKey(key)
    findFieldsByKey(key).reduce do |best, candidate|
      candidate.key.text.length < best.key.text.length ? candidate : best
    end
  end
end
309
+
310
+
311
# A table cell: its position/span in the table plus the Word and
# SelectionElement objects found among its children.
class Cell
  attr_reader :confidence, :rowIndex, :columnIndex, :rowSpan, :columnSpan,
              :geometry, :id, :content, :text, :block

  # @param block [Hash] the raw cell block; :confidence, :row_index,
  #   :column_index, :row_span, :column_span, :geometry, :id and
  #   :relationships are read
  # @param blockMap [Hash] block id => raw block, used to resolve child ids
  def initialize(block, blockMap)
    @block = block
    @confidence = block[:confidence]
    @rowIndex = block[:row_index]
    @columnIndex = block[:column_index]
    @rowSpan = block[:row_span]
    @columnSpan = block[:column_span]
    @geometry = Geometry.new(block[:geometry])
    @id = block[:id]
    @content = []

    pieces = []
    child_blocks(block, blockMap).each do |child|
      case child[:block_type]
      when "WORD"
        word = Word.new(child, blockMap)
        @content.append(word)
        pieces << "#{word.text} "
      when "SELECTION_ELEMENT"
        element = SelectionElement.new(child, blockMap)
        @content.append(element)
        # A status of nil contributes no text (concatenating it used to raise).
        pieces << "#{element.selectionStatus}, " if element.selectionStatus
      end
    end
    # Words are followed by ' ' and statuses by ', '; surrounding whitespace
    # is stripped afterwards, so a trailing status leaves its comma behind.
    @text = pieces.join.strip
  end

  # @return [String] the cell text
  def to_s
    @text
  end

  private

  # Raw blocks referenced by the block's CHILD relationships, in order.
  # Ids that are not present in blockMap are skipped.
  def child_blocks(block, blockMap)
    (block[:relationships] || [])
      .select { |rs| rs[:type] == 'CHILD' }
      .flat_map { |rs| rs[:ids] || [] }
      .map { |cid| blockMap[cid] }
      .compact
  end
end
359
+
360
+
361
# A table row: an ordered, mutable list of cells.
class Row
  attr_reader :cells

  def initialize
    @cells = []
  end

  # @return [String] every cell's to_s wrapped in square brackets, concatenated
  def to_s
    cells.map { |cell| "[#{cell}]" }.join
  end
end
376
+
377
+
378
# A table block whose child cells are grouped into Row objects.
class Table
  attr_reader :confidence, :geometry, :id, :rows, :block

  # Cells are read in the order their ids are listed; a new row starts each
  # time a cell's rowIndex exceeds the highest row index seen so far.
  #
  # @param block [Hash] the raw table block; :confidence, :geometry, :id and
  #   :relationships are read
  # @param blockMap [Hash] block id => raw block, used to resolve cell ids
  def initialize(block, blockMap)
    @block = block
    @confidence = block[:confidence]
    @geometry = Geometry.new(block[:geometry])
    @id = block[:id]
    @rows = []

    current_index = 1
    row = Row.new
    child_ids(block).each do |cid|
      cell_block = blockMap[cid]
      next unless cell_block # id not present in blockMap: skip instead of raising

      cell = Cell.new(cell_block, blockMap)
      if cell.rowIndex > current_index
        @rows.append(row)
        row = Row.new
        current_index = cell.rowIndex
      end
      row.cells.append(cell)
    end
    # An empty Array is truthy in Ruby, so emptiness is tested explicitly;
    # the last row is appended once, and only when it actually holds cells.
    @rows.append(row) unless row.cells.empty?
  end

  # @return [String] "Table:" followed by one line per row
  def to_s
    "Table:\n" + @rows.map { |row| "#{row}\n" }.join
  end

  private

  # Ids listed by the block's CHILD relationships, in order.
  def child_ids(block)
    (block[:relationships] || [])
      .select { |rs| rs[:type] == 'CHILD' }
      .flat_map { |rs| rs[:ids] || [] }
  end
end
423
+
424
+
425
# One page of a document: its lines, form fields and tables, built from the
# page's raw blocks.
class Page
  attr_reader :blocks, :text, :lines, :form, :tables, :content, :geometry, :id

  # @param blocks [Array<Hash>] raw blocks belonging to this page
  # @param blockMap [Hash] block id => raw block for the whole document
  def initialize(blocks, blockMap)
    @blocks = blocks
    @text = ""
    @lines = []
    @form = Form.new
    @tables = []
    @content = []

    _parse(blockMap)
  end

  # @return [String] "Page:" followed by one entry per content item
  def to_s
    "Page:\n" + @content.map { |item| "#{item}\n" }.join
  end

  # Dispatches every raw block on its :block_type and fills the page's
  # collections. Blocks of other types are ignored.
  #
  # @param blockMap [Hash] block id => raw block
  def _parse(blockMap)
    @blocks.each do |item|
      case item[:block_type]
      when "PAGE"
        @geometry = Geometry.new(item[:geometry])
        @id = item[:id]
      when "LINE"
        line = Line.new(item, blockMap)
        @lines.append(line)
        @content.append(line)
        # Double quotes are required: '\n' would append a literal backslash-n.
        @text = @text + line.text + "\n"
      when "TABLE"
        table = Table.new(item, blockMap)
        @tables.append(table)
        @content.append(table)
      when "KEY_VALUE_SET"
        next unless (item[:entity_types] || []).include?('KEY')

        field = Field.new(item, blockMap)
        # Fields for which no key could be built are dropped.
        if field.key
          @form.addField(field)
          @content.append(field)
        end
      end
    end
  end

  # Groups the lines into columns by horizontal overlap and returns them
  # column by column. Columns are numbered in the order they are first
  # encountered while scanning @lines; within a column the original line
  # order is kept.
  #
  # @return [Array<Hash>] hashes of the form { column: Integer, text: String }
  def getLinesInReadingOrder
    columns = []
    lines = []
    @lines.each do |item|
      bbox = item.geometry.boundingBox
      bbox_left = bbox.left
      bbox_right = bbox.right
      bbox_centre = bbox.left + bbox.width / 2

      # A line belongs to the first column whose span contains the line's
      # centre, or whose own centre falls inside the line's span.
      index = columns.index do |column|
        column_centre = column[:left] + ((column[:right] - column[:left]) / 2)
        (bbox_centre > column[:left] && bbox_centre < column[:right]) ||
          (column_centre > bbox_left && column_centre < bbox_right)
      end

      if index.nil?
        columns.append({ :left => bbox_left, :right => bbox_right })
        index = columns.count - 1
      end
      lines.append({ :column => index, :text => item.text })
    end

    AmazonTRP::stable_sort_by(lines) { |x| x[:column] }
  end

  # @return [String] the texts from #getLinesInReadingOrder, one per line
  def getTextInReadingOrder
    getLinesInReadingOrder.map { |line| "#{line[:text]}\n" }.join
  end

  # @param boundingBox [#left, #right, #top, #bottom] region to search
  # @return [Array] lines whose left/top corner lies inside +boundingBox+
  #   (edges included)
  def getLinesInBoundingBox(boundingBox)
    @lines.select do |line|
      line_bbox = line.geometry.boundingBox
      line_bbox.left >= boundingBox.left &&
        line_bbox.left <= boundingBox.right &&
        line_bbox.top >= boundingBox.top &&
        line_bbox.top <= boundingBox.bottom
    end
  end
end
529
+
530
+
531
# Entry point: wraps one or more raw response hashes and splits their blocks
# into Page objects.
class Document
  attr_reader :pages

  # @param responsePages [Hash, Array<Hash>] one response hash or a list of
  #   them; each is expected to carry its blocks under :blocks
  def initialize(responsePages)
    @responsePages = responsePages.is_a?(Array) ? responsePages : [responsePages]
    @pages = []
    _parse
  end

  # The raw responses this document was built from (always an Array).
  # NOTE(review): mirrors the `blocks` property of the Python library this
  # gem is ported from; previously this reader always returned nil.
  #
  # @return [Array<Hash>]
  def blocks
    @responsePages
  end

  # The raw blocks grouped per page, as hashes of the form { blocks: [...] }.
  # NOTE(review): mirrors the Python `pageBlocks` property; previously this
  # reader always returned nil.
  #
  # @return [Array<Hash>]
  def pageBlocks
    @responseDocumentPages
  end

  # @return [String] "Document:" followed by every page's to_s
  def to_s
    "\nDocument:\n" + @pages.map { |page| "#{page}\n\n" }.join
  end

  # Builds the id => block lookup and groups the blocks per page. A block of
  # type PAGE starts a new group; every other block joins the current group.
  # Blocks appearing before the first PAGE block are collected into a group
  # of their own instead of raising.
  #
  # @return [Array(Array<Hash>, Hash)] the page groups and the block map
  def _parseDocumentPagesAndBlockMap
    blockMap = {}
    documentPages = []
    documentPage = nil

    @responsePages.each do |page|
      (page[:blocks] || []).each do |block|
        if block.has_key?(:block_type) && block.has_key?(:id)
          blockMap[block[:id]] = block
        end

        if block[:block_type] == 'PAGE'
          documentPages.append({ :blocks => documentPage }) if documentPage
          documentPage = []
        end
        documentPage ||= []
        documentPage.append(block)
      end
    end
    documentPages.append({ :blocks => documentPage }) if documentPage

    return documentPages, blockMap
  end

  # Creates one Page per group of blocks.
  def _parse
    @responseDocumentPages, @blockMap = _parseDocumentPagesAndBlockMap
    @responseDocumentPages.each do |documentPage|
      @pages.append(Page.new(documentPage[:blocks], @blockMap))
    end
  end

  # @param blockId [String] id of the block to look up
  # @return [Hash, nil] the raw block with that id, or nil when unknown
  def getBlockById(blockId)
    return nil unless @blockMap

    @blockMap.fetch(blockId, nil)
  end
end
589
+
590
+ end
@@ -0,0 +1,3 @@
1
module AmazonTRP
  # Version string of the gem; the gemspec reads it as spec.version.
  VERSION = '0.1.0'
end
metadata ADDED
@@ -0,0 +1,129 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: amazon-textract-parser-ruby
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Niels Vanspauwen
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2020-07-17 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.17'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.17'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 12.3.3
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 12.3.3
41
+ - !ruby/object:Gem::Dependency
42
+ name: minitest
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '5.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '5.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: minitest-reporters
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: activesupport
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: 6.0.3.2
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: 6.0.3.2
83
+ description: This is a quick Ruby port of https://github.com/mludvig/amazon-textract-parser\nIt's
84
+ useful for interpreting the result of Amazon Textract info.
85
+ email:
86
+ - niels.vanspauwen@gmail.com
87
+ executables: []
88
+ extensions: []
89
+ extra_rdoc_files: []
90
+ files:
91
+ - ".DS_Store"
92
+ - ".gitignore"
93
+ - ".travis.yml"
94
+ - Gemfile
95
+ - Gemfile.lock
96
+ - LICENSE.txt
97
+ - README.md
98
+ - Rakefile
99
+ - amazon-textract-parser-ruby.gemspec
100
+ - bin/console
101
+ - bin/setup
102
+ - lib/.DS_Store
103
+ - lib/amazon-textract-parser-ruby.rb
104
+ - lib/amazon-textract-parser-ruby/.DS_Store
105
+ - lib/amazon-textract-parser-ruby/version.rb
106
+ homepage: https://github.com/nielsvanspauwen/amazon-textract-parser-ruby
107
+ licenses:
108
+ - MIT
109
+ metadata: {}
110
+ post_install_message:
111
+ rdoc_options: []
112
+ require_paths:
113
+ - lib
114
+ required_ruby_version: !ruby/object:Gem::Requirement
115
+ requirements:
116
+ - - ">="
117
+ - !ruby/object:Gem::Version
118
+ version: '0'
119
+ required_rubygems_version: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - ">="
122
+ - !ruby/object:Gem::Version
123
+ version: '0'
124
+ requirements: []
125
+ rubygems_version: 3.0.1
126
+ signing_key:
127
+ specification_version: 4
128
+ summary: Amazon Textract Results Parser
129
+ test_files: []