dsr 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (5) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +21 -0
  3. data/dsr.gemspec +15 -0
  4. data/lib/dsr.rb +106 -0
  5. metadata +75 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: e94dd71eedb7e3989dde0e7a35c73db47c97891211ab3620ef668bf6fb163dcb
4
+ data.tar.gz: af6324245ad6c157dc3a700f00121860c75bb828dc096ffcd4a4c1e15ae0dd96
5
+ SHA512:
6
+ metadata.gz: 1a607b1cfaaec5321348face51201ec20baa16958775485f3a86fe67a1eb703be6a58bcf555045dbb181397644eb42240fbe28a38b4dd79f361de4a287e23897
7
+ data.tar.gz: 66fefe0318e13f2b2d60f60dd125af1577313ac038d1fc91215016fac21066f91abbb58ce91a6f1eabfcb2b39513c800f563cac567f3539e162b1ec05cfddf10
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Victor Maslov
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/dsr.gemspec ADDED
@@ -0,0 +1,15 @@
1
# Gem specification for "dsr": identity, authorship, runtime dependencies and
# the list of files shipped in the package.
Gem::Specification.new do |gem|
  gem.name    = "dsr"
  gem.version = "0.0.0"
  gem.summary = "[WIP] Document Structure Recognizer -- currently a collection of common routines I use to build A.I.s"

  gem.author   = "Victor Maslov aka Nakilon"
  gem.email    = "nakilon@gmail.com"
  gem.license  = "MIT"
  gem.metadata = {"source_code_uri" => "https://github.com/nakilon/dsr"}

  # Runtime dependencies, declared without version constraints.
  %w{ nakischema hexapdf }.each{ |dependency| gem.add_dependency dependency }

  gem.files = ["LICENSE", "dsr.gemspec", "lib/dsr.rb"]
end
data/lib/dsr.rb ADDED
@@ -0,0 +1,106 @@
1
# Document Structure Recognizer: converts OCR / PDF output into positioned
# text boxes and provides helpers to group those boxes.
module DSR

  # A piece of text together with its bounding box.
  # Both converters below fill it so that +top+ is the smaller and +bottom+ the
  # larger y value (google2struct takes min/max of y; pdf2struct negates y --
  # presumably so that y grows downwards in both cases, confirm for PDFs).
  Struct = ::Struct.new :text, :left, :bottom, :right, :top, :width, :height
  private_constant :Struct

  # An Array of text boxes with a couple of query helpers.
  # The helpers wrap their result so that it is a Texts again.
  class Texts < Array
    # Returns the boxes whose text equals +text+.
    def find_all_by_text text
      self.class.new(select{ |box| text == box.text })
    end
    # Returns the boxes, other than +item+ itself, whose top..bottom span
    # overlaps the top..bottom span of +item+.
    # Note: boxes that are == to +item+ are excluded as well (Array#-).
    def select_intersecting_vertically_with item
      self.class.new((self - [item]).select{ |box| box.bottom >= item.top && box.top <= item.bottom })
    end
  end
  private_constant :Texts

  # Converts a Google Vision style annotation response into Texts.
  #
  # json -- the JSON source (anything JSON.load accepts).
  # The parsed document is validated with Nakischema against the schema below
  # before use; every entry of "textAnnotations" becomes one box whose
  # rectangle is the min/max of the four "boundingPoly" vertices.
  def self.google2struct json
    require "json"
    # create_additions is switched off explicitly: the input is external data
    # and must never be allowed to instantiate arbitrary Ruby objects.
    response = JSON.load json, nil, create_additions: false
    require "nakischema"
    Nakischema.validate response, {
      hash_req: {
        "cropHintsAnnotation" => Hash,
        "fullTextAnnotation" => Hash,
        "imagePropertiesAnnotation" => Hash,
        "labelAnnotations" => Array,
        "safeSearchAnnotation" => Hash,
        "textAnnotations" => { each: {
          hash_req: {
            "boundingPoly" => { hash: {
              "vertices" => { size: 4..4, each: { hash: { "x" => Integer, "y" => Integer } } },
            } },
            "description" => /\A\S(.*\S)?\z/m,
          },
          hash_opt: {
            "locale" => /\A\S(.*\S)?\z/m,
          },
        } },
      },
      hash_opt: {
        "localizedObjectAnnotations" => Array,
      }
    }
    boxes = response["textAnnotations"].map do |annotation|
      vertices = annotation["boundingPoly"]["vertices"]
      left, right = vertices.map{ |vertex| vertex["x"] }.minmax
      top, bottom = vertices.map{ |vertex| vertex["y"] }.minmax
      Struct.new annotation["description"], left, bottom, right, top, right - left, bottom - top
    end
    Texts.new boxes
  end

  # Distributes the boxes of +array+ between +headers+ along one axis.
  #
  # headers   -- boxes defining the columns (or rows); must not be empty.
  #              They are copied, the originals are not modified.
  # array     -- boxes to distribute.
  # direction -- :horizontal (uses left/right) or :vertical (uses top/bottom).
  # alignment -- name of a method of Array that takes a block and returns an
  #              index, e.g. :index or :rindex; it decides which header wins
  #              when the middle of a box falls into several header ranges.
  # priority  -- optional header texts; the first header (in sorted order)
  #              whose text is listed is moved to position 0, so it is both
  #              examined first by :index and reported under index 0.
  #              If no header matches, the order is left untouched.
  #
  # Returns an Array where element i holds the boxes assigned to the i-th
  # header (in the sorted/prioritized order); headers that got no boxes yield
  # nil or are absent from the tail.
  # Raises RuntimeError for an unknown direction, ArgumentError for no headers.
  def self.link headers, array, direction, alignment, *priority
    l, r = case direction
      when :horizontal ; %i{ left right }
      when :vertical   ; %i{ top bottom }
      else             ; fail "invalid direction"
    end
    headers = headers.sort_by(&l).map(&:dup)
    fail ArgumentError, "headers must not be empty" if headers.empty?
    # Stretch neighbours towards each other so that adjacent ranges overlap
    # and no gap is left between them ...
    headers.each_cons(2){ |a, b| a[r], b[l] = [a[r], b[l]].max, [a[r], b[l]].min }
    # ... and make the outermost ranges unbounded.
    headers.first[l] = -Float::INFINITY
    headers.last[r] = +Float::INFINITY
    preferred = headers.index{ |header| priority.include? header.text }
    headers.unshift headers.delete_at preferred if preferred
    array.sort_by(&l).each_with_object([]) do |cell, linked|
      i = headers.public_send(alignment){ |header| (header[l]..header[r]).include?((cell[l] + cell[r]) / 2) }
      linked[i] ||= []
      linked[i] << cell
    end
  end

  # Extracts positioned texts from a PDF.
  #
  # object -- passed to HexaPDF::Document.new as +io:+.
  # Returns an Array with one Texts per page.
  # NOTE(review): only the +show_text+ callback is handled; text emitted via
  # +show_text_with_positioning+ looks like it would be skipped -- confirm.
  def self.pdf2struct object
    require "hexapdf"
    processor = Class.new HexaPDF::Content::Processor do
      attr_reader :texts
      def initialize _
        super
        @texts = Texts.new
      end
      def show_text str
        boxes = decode_text_with_positioning str
        left, right = boxes.lower_left[0], boxes.upper_right[0]
        # y is negated, so the lower edge gets the larger value
        bottom, top = -boxes.lower_left[1], -boxes.upper_right[1]
        # height is bottom - top, i.e. non-negative like in google2struct
        @texts.push Struct.new boxes.string, left, bottom, right, top, right - left, bottom - top
      end
    end
    HexaPDF::Document.new(io: object).pages.map do |page|
      processor.new(page).tap(&page.method(:process_contents)).texts
    end
  end

  # Splits +data+ into groups of connected items.
  #
  # The block receives two items and tells whether they are directly linked.
  # Every item starts in its own group; going through the items in order, all
  # earlier groups containing an item linked to the current one are merged
  # into the current item's group.
  # Returns the non-empty groups as an Array of Arrays.
  def self.subgraphs data
    groups = data.map{ |item| [item] }
    groups.each_with_index do |group, current|
      # pick every earlier group first, merge afterwards
      linked = (0...current).select do |earlier|
        group.product(groups[earlier]).any?{ |a, b| yield a, b }
      end
      linked.each do |earlier|
        group.concat groups[earlier]
        groups[earlier].clear
      end
    end
    groups.reject(&:empty?)
  end

end
metadata ADDED
@@ -0,0 +1,75 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: dsr
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Victor Maslov aka Nakilon
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2023-08-16 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nakischema
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: hexapdf
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description:
42
+ email: nakilon@gmail.com
43
+ executables: []
44
+ extensions: []
45
+ extra_rdoc_files: []
46
+ files:
47
+ - LICENSE
48
+ - dsr.gemspec
49
+ - lib/dsr.rb
50
+ homepage:
51
+ licenses:
52
+ - MIT
53
+ metadata:
54
+ source_code_uri: https://github.com/nakilon/dsr
55
+ post_install_message:
56
+ rdoc_options: []
57
+ require_paths:
58
+ - lib
59
+ required_ruby_version: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - ">="
62
+ - !ruby/object:Gem::Version
63
+ version: '0'
64
+ required_rubygems_version: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ requirements: []
70
+ rubygems_version: 3.3.25
71
+ signing_key:
72
+ specification_version: 4
73
+ summary: "[WIP] Document Structure Recognizer -- currently a collection of common
74
+ routines I use to build A.I.s"
75
+ test_files: []