dsr 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (5) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +21 -0
  3. data/dsr.gemspec +15 -0
  4. data/lib/dsr.rb +106 -0
  5. metadata +75 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: e94dd71eedb7e3989dde0e7a35c73db47c97891211ab3620ef668bf6fb163dcb
4
+ data.tar.gz: af6324245ad6c157dc3a700f00121860c75bb828dc096ffcd4a4c1e15ae0dd96
5
+ SHA512:
6
+ metadata.gz: 1a607b1cfaaec5321348face51201ec20baa16958775485f3a86fe67a1eb703be6a58bcf555045dbb181397644eb42240fbe28a38b4dd79f361de4a287e23897
7
+ data.tar.gz: 66fefe0318e13f2b2d60f60dd125af1577313ac038d1fc91215016fac21066f91abbb58ce91a6f1eabfcb2b39513c800f563cac567f3539e162b1ec05cfddf10
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Victor Maslov
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/dsr.gemspec ADDED
@@ -0,0 +1,15 @@
1
# Gem specification for "dsr": declares the package identity, authorship,
# licence, the two runtime dependencies and the list of packaged files.
Gem::Specification.new do |gem|
  # Identity.
  gem.name    = "dsr"
  gem.version = "0.0.0"
  gem.summary = "[WIP] Document Structure Recognizer -- currently a collection of common routines I use to build A.I.s"

  # Authorship and licensing (plural setters; same effect as the singular ones).
  gem.authors  = ["Victor Maslov aka Nakilon"]
  gem.email    = "nakilon@gmail.com"
  gem.licenses = ["MIT"]
  gem.metadata = {"source_code_uri" => "https://github.com/nakilon/dsr"}

  # Runtime dependencies; both are required lazily inside lib/dsr.rb.
  %w[nakischema hexapdf].each { |dependency| gem.add_dependency dependency }

  # Files shipped inside the gem.
  gem.files = ["LICENSE", "dsr.gemspec", "lib/dsr.rb"]
end
data/lib/dsr.rb ADDED
@@ -0,0 +1,106 @@
1
# Document Structure Recognizer: turns OCR / PDF output into positioned text
# boxes and provides helpers to group those boxes.
module DSR

  # One piece of text together with its bounding box.
  # google2struct fills it so that +top+ is the smallest and +bottom+ the
  # largest y coordinate, i.e. +bottom+ >= +top+.
  Struct = ::Struct.new :text, :left, :bottom, :right, :top, :width, :height
  private_constant :Struct

  # An Array of boxes whose query methods return the same collection class.
  class Texts < Array
    # Returns the boxes whose +text+ equals (by +==+) the given +text+.
    def find_all_by_text text
      matching = select{ |box| text == box.text }
      self.class.new(matching)
    end
    # Returns the other boxes whose top..bottom span overlaps the span of
    # +item+. Every box equal to +item+ is excluded from the result.
    def select_intersecting_vertically_with item
      others = self - [item]
      self.class.new(others.select{ |box| box.bottom >= item.top && box.top <= item.bottom })
    end
  end
  private_constant :Texts

  # Parses a Google Vision style annotation document, validates its shape with
  # Nakischema and returns a Texts with one box per "textAnnotations" entry.
  #
  # json:: the JSON source, passed as is to JSON.load
  # Raises whatever JSON.load or Nakischema.validate raise for bad input.
  def self.google2struct json
    require "json"
    # NOTE(review): JSON.load is more permissive than JSON.parse and is
    # discouraged for untrusted input; kept because callers may rely on the
    # input kinds it accepts -- confirm and consider switching to JSON.parse.
    response = JSON.load json
    require "nakischema"
    Nakischema.validate response, {
      hash_req: {
        "cropHintsAnnotation" => Hash,
        "fullTextAnnotation" => Hash,
        "imagePropertiesAnnotation" => Hash,
        "labelAnnotations" => Array,
        "safeSearchAnnotation" => Hash,
        "textAnnotations" => { each: {
          hash_req: {
            "boundingPoly" => { hash: {
              "vertices" => { size: 4..4, each: { hash: { "x" => Integer, "y" => Integer } } },
            } },
            "description" => /\A\S(.*\S)?\z/m,
          },
          hash_opt: {
            "locale" => /\A\S(.*\S)?\z/m,
          },
        } },
      },
      hash_opt: {
        "localizedObjectAnnotations" => Array,
      }
    }
    boxes = response["textAnnotations"].map do |annotation|
      vertices = annotation["boundingPoly"]["vertices"]
      # one pass per axis instead of recomputing min and max for every field
      left, right = vertices.map{ |vertex| vertex["x"] }.minmax
      top, bottom = vertices.map{ |vertex| vertex["y"] }.minmax
      Struct.new annotation["description"], left, bottom, right, top, right - left, bottom - top
    end
    Texts.new boxes
  end

  # Distributes +array+ cells between +headers+ along one axis.
  #
  # headers::   boxes defining the columns (:horizontal) or rows (:vertical)
  # array::     boxes to distribute
  # direction:: :horizontal (uses left/right) or :vertical (uses top/bottom)
  # alignment:: name of the method called on the headers array with a block to
  #             pick the header index for a cell -- presumably :index or
  #             :rindex, TODO confirm
  # priority::  optional header texts; the first header having one of them is
  #             moved to the front, so it is tried first by :index and gets
  #             index 0 in the result
  #
  # Returns an Array where element i holds the cells linked to the i-th header
  # (in the possibly reordered header list); headers without cells leave nil.
  # The passed headers are not modified, duplicates are adjusted instead.
  # Raises RuntimeError for an unknown direction and ArgumentError when
  # +headers+ is empty.
  def self.link headers, array, direction, alignment, *priority
    l, r = case direction
    when :horizontal ; %i{ left right }
    when :vertical   ; %i{ top bottom }
    else             ; fail "invalid direction"
    end
    headers = headers.sort_by(&l).map(&:dup)
    # previously this ended in NoMethodError on nil a few lines below
    fail ArgumentError, "headers must not be empty" if headers.empty?
    # stretch neighbours over the gap between them so no position is uncovered
    headers.each_cons(2){ |a, b| a[r], b[l] = [a[r], b[l]].max, [a[r], b[l]].min }
    headers.first[l] = -Float::INFINITY
    headers.last[r] = +Float::INFINITY
    unless priority.empty?
      preferred = headers.index{ |header| priority.include? header.text }
      # no header with a priority text: keep the order instead of failing
      # with TypeError from delete_at(nil)
      headers.unshift headers.delete_at preferred if preferred
    end
    array.sort_by(&l).each_with_object([]) do |cell, groups|
      middle = (cell[l] + cell[r]) / 2
      i = headers.public_send(alignment){ |header| (header[l]..header[r]).include? middle }
      groups[i] ||= []
      groups[i] << cell
    end
  end

  # Extracts text boxes from a PDF.
  #
  # object:: passed as +io:+ to HexaPDF::Document.new
  # Returns an Array with one Texts per page.
  def self.pdf2struct object
    require "hexapdf"
    # NOTE(review): only show_text is overridden; text drawn through
    # show_text_with_positioning is presumably not collected -- confirm
    # against the HexaPDF processor documentation.
    processor = Class.new HexaPDF::Content::Processor do
      attr_reader :texts
      def initialize _
        super
        @texts = Texts.new
      end
      def show_text str
        boxes = decode_text_with_positioning str
        left, right = boxes.lower_left[0], boxes.upper_right[0]
        # y is negated, presumably to get the same "bottom >= top" orientation
        # that google2struct produces -- TODO confirm
        bottom, top = -boxes.lower_left[1], -boxes.upper_right[1]
        # height is bottom - top like in google2struct (the sign was inverted)
        @texts.push Struct.new boxes.string, left, bottom, right, top, right - left, bottom - top
      end
    end
    HexaPDF::Document.new(io: object).pages.map do |page|
      collector = processor.new page
      page.process_contents collector
      collector.texts
    end
  end

  # Groups the items of +data+ into clusters: an item joins the earlier groups
  # for which the block returns a truthy value for at least one pair
  # (item of the current group, item of the earlier group).
  # Returns an Array of non-empty Arrays; every item appears exactly once.
  def self.subgraphs data
    groups = data.zip   # wraps every item into its own one-element group
    groups.each_index do |i|
      linked = (0...i).select do |j|
        groups[i].product(groups[j]).any?{ |a, b| yield a, b }
      end
      linked.each do |j|
        groups[i].concat groups[j]
        groups[j].clear   # emptied groups are dropped at the end
      end
    end
    groups.reject(&:empty?)
  end

end
metadata ADDED
@@ -0,0 +1,75 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: dsr
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Victor Maslov aka Nakilon
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2023-08-16 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nakischema
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: hexapdf
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description:
42
+ email: nakilon@gmail.com
43
+ executables: []
44
+ extensions: []
45
+ extra_rdoc_files: []
46
+ files:
47
+ - LICENSE
48
+ - dsr.gemspec
49
+ - lib/dsr.rb
50
+ homepage:
51
+ licenses:
52
+ - MIT
53
+ metadata:
54
+ source_code_uri: https://github.com/nakilon/dsr
55
+ post_install_message:
56
+ rdoc_options: []
57
+ require_paths:
58
+ - lib
59
+ required_ruby_version: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - ">="
62
+ - !ruby/object:Gem::Version
63
+ version: '0'
64
+ required_rubygems_version: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ requirements: []
70
+ rubygems_version: 3.3.25
71
+ signing_key:
72
+ specification_version: 4
73
+ summary: "[WIP] Document Structure Recognizer -- currently a collection of common
74
+ routines I use to build A.I.s"
75
+ test_files: []