dsr 0.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/dsr.gemspec +15 -0
- data/lib/dsr.rb +106 -0
- metadata +75 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: e94dd71eedb7e3989dde0e7a35c73db47c97891211ab3620ef668bf6fb163dcb
|
4
|
+
data.tar.gz: af6324245ad6c157dc3a700f00121860c75bb828dc096ffcd4a4c1e15ae0dd96
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 1a607b1cfaaec5321348face51201ec20baa16958775485f3a86fe67a1eb703be6a58bcf555045dbb181397644eb42240fbe28a38b4dd79f361de4a287e23897
|
7
|
+
data.tar.gz: 66fefe0318e13f2b2d60f60dd125af1577313ac038d1fc91215016fac21066f91abbb58ce91a6f1eabfcb2b39513c800f563cac567f3539e162b1ec05cfddf10
|
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2023 Victor Maslov
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/dsr.gemspec
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# Gem specification for dsr.
Gem::Specification.new do |s|
  # Identity.
  s.name    = "dsr"
  s.version = "0.0.0"
  s.summary = "[WIP] Document Structure Recognizer -- currently a collection of common routines I use to build A.I.s"

  # Authorship, licensing and source location.
  s.authors  = ["Victor Maslov aka Nakilon"]
  s.email    = "nakilon@gmail.com"
  s.licenses = ["MIT"]
  s.metadata = { "source_code_uri" => "https://github.com/nakilon/dsr" }

  # Runtime dependencies (no version constraints are declared).
  %w[nakischema hexapdf].each { |name| s.add_dependency(name) }

  # Files packaged into the gem.
  s.files = %w[LICENSE dsr.gemspec lib/dsr.rb]
end
|
data/lib/dsr.rb
ADDED
@@ -0,0 +1,106 @@
|
|
1
|
+
# Document Structure Recognizer: routines that turn OCR / PDF output into
# positioned text boxes and group those boxes by their geometry.
module DSR

  # A piece of text together with its bounding box.
  # google2struct fills it so that left <= right, top <= bottom,
  # width == right - left and height == bottom - top.
  Struct = ::Struct.new :text, :left, :bottom, :right, :top, :width, :height
  private_constant :Struct

  # An Array of text boxes with a few lookups; every lookup returns a new
  # instance of the receiver's class.
  class Texts < Array
    # Boxes whose text equals +text+ (compared as `text == box.text`).
    def find_all_by_text(text)
      self.class.new(select { |box| text == box.text })
    end

    # Boxes, other than those equal to +item+, whose [top, bottom] interval
    # overlaps the [top, bottom] interval of +item+.
    def select_intersecting_vertically_with(item)
      others = self - [item]
      self.class.new(others.select { |box| box.bottom >= item.top && box.top <= item.bottom })
    end
  end
  private_constant :Texts

  # Converts a Google Vision style annotation response into Texts.
  #
  # json - the JSON document, passed to JSON.load.
  #
  # The parsed document is validated with Nakischema before use, so a
  # document of unexpected shape raises whatever Nakischema.validate raises.
  # Returns Texts with one box per "textAnnotations" entry.
  def self.google2struct(json)
    require "json"
    # NOTE(review): the JSON docs recommend JSON.load for trusted input only;
    # confirm callers never pass untrusted data, otherwise prefer JSON.parse.
    response = JSON.load(json)
    require "nakischema"
    Nakischema.validate response, {
      hash_req: {
        "cropHintsAnnotation" => Hash,
        "fullTextAnnotation" => Hash,
        "imagePropertiesAnnotation" => Hash,
        "labelAnnotations" => Array,
        "safeSearchAnnotation" => Hash,
        "textAnnotations" => { each: {
          hash_req: {
            "boundingPoly" => { hash: {
              "vertices" => { size: 4..4, each: { hash: { "x" => Integer, "y" => Integer } } },
            } },
            "description" => /\A\S(.*\S)?\z/m,
          },
          hash_opt: {
            "locale" => /\A\S(.*\S)?\z/m,
          },
        } },
      },
      hash_opt: {
        "localizedObjectAnnotations" => Array,
      }
    }
    boxes = response["textAnnotations"].map do |annotation|
      vertices = annotation["boundingPoly"]["vertices"]
      # Axis-aligned bounding box of the four vertices; top is the smaller y.
      left, right = vertices.map { |vertex| vertex["x"] }.minmax
      top, bottom = vertices.map { |vertex| vertex["y"] }.minmax
      Struct.new annotation["description"], left, bottom, right, top, right - left, bottom - top
    end
    Texts.new(boxes)
  end

  # Distributes the boxes of +array+ among +headers+ along one axis.
  #
  # headers   - boxes acting as column (or row) headers; they are copied,
  #             the caller's objects are not modified.
  # array     - boxes to distribute.
  # direction - :horizontal (uses left/right) or :vertical (uses top/bottom).
  # alignment - name of a method called on the header list with a block and
  #             expected to return an index, e.g. :index or :rindex; it
  #             decides which header wins when a box fits several headers.
  # priority  - optional header texts; the first header whose text is listed
  #             is moved to the front of the header list before matching.
  #             If no header matches, the order is left untouched.
  #
  # Returns an Array where element i holds the boxes linked to the header at
  # position i of the internal header list (headers sorted by their near
  # edge, then the priority header moved to the front). Positions without
  # boxes that precede a used position are nil.
  # Raises RuntimeError ("invalid direction") for an unknown +direction+.
  def self.link(headers, array, direction, alignment, *priority)
    near, far = case direction
                when :horizontal then %i[left right]
                when :vertical then %i[top bottom]
                else fail "invalid direction"
                end
    spans = headers.sort_by(&near).map(&:dup)
    # Close the gap between neighbours: each one is stretched to the other's
    # edge, so every coordinate is covered by at least one header.
    spans.each_cons(2) do |a, b|
      a[far], b[near] = [a[far], b[near]].max, [a[far], b[near]].min
    end
    spans.first[near] = -Float::INFINITY
    spans.last[far] = +Float::INFINITY
    unless priority.empty?
      preferred = spans.index { |span| priority.include? span.text }
      # Previously a missing match reached delete_at(nil) and raised TypeError.
      spans.unshift spans.delete_at preferred if preferred
    end
    array.sort_by(&near).each_with_object([]) do |cell, linked|
      # A box belongs to the header whose span contains the box's midpoint.
      i = spans.public_send(alignment) { |span| (span[near]..span[far]).include?((cell[near] + cell[far]) / 2) }
      linked[i] ||= []
      linked[i] << cell
    end
  end

  # Extracts text boxes from a PDF using HexaPDF.
  #
  # object - passed as the io: argument of HexaPDF::Document.new.
  #
  # Returns an Array with one Texts per page, each holding one box per
  # processed show_text call.
  def self.pdf2struct(object)
    require "hexapdf"
    processor = Class.new HexaPDF::Content::Processor do
      attr_reader :texts
      def initialize _
        super
        @texts = Texts.new
      end
      def show_text str
        boxes = decode_text_with_positioning str
        # Both y coordinates are negated when stored as bottom/top.
        # NOTE(review): height below is lower_left.y - upper_right.y, i.e.
        # top - bottom of the stored values, whereas google2struct stores
        # bottom - top; confirm whether the sign difference is intended.
        @texts.push Struct.new boxes.string,
          boxes.lower_left[0], -boxes.lower_left[1],
          boxes.upper_right[0], -boxes.upper_right[1],
          boxes.upper_right[0] - boxes.lower_left[0],
          boxes.lower_left[1] - boxes.upper_right[1]
      end
    end
    HexaPDF::Document.new(io: object).pages.map do |page|
      processor.new(page).tap(&page.method(:process_contents)).texts
    end
  end

  # Groups +data+ into clusters of transitively connected items.
  #
  # data  - the items to group.
  # block - called with two items; a truthy result means they are connected.
  #
  # Returns an Array of non-empty Arrays. Each item starts in its own group;
  # item i absorbs every earlier group that contains an item connected to it.
  def self.subgraphs(data)
    groups = data.zip # one single-element group per item
    (0...data.size).each do |i|
      # Decide on all merges first: groups[i] still holds only data[i] here.
      connected = (0...i).select do |j|
        groups[i].product(groups[j]).any? { |a, b| yield a, b }
      end
      connected.each do |j|
        groups[i].concat groups[j]
        groups[j].clear
      end
    end
    groups.reject(&:empty?)
  end

end
|
metadata
ADDED
@@ -0,0 +1,75 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: dsr
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Victor Maslov aka Nakilon
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2023-08-16 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nakischema
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: hexapdf
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
description:
|
42
|
+
email: nakilon@gmail.com
|
43
|
+
executables: []
|
44
|
+
extensions: []
|
45
|
+
extra_rdoc_files: []
|
46
|
+
files:
|
47
|
+
- LICENSE
|
48
|
+
- dsr.gemspec
|
49
|
+
- lib/dsr.rb
|
50
|
+
homepage:
|
51
|
+
licenses:
|
52
|
+
- MIT
|
53
|
+
metadata:
|
54
|
+
source_code_uri: https://github.com/nakilon/dsr
|
55
|
+
post_install_message:
|
56
|
+
rdoc_options: []
|
57
|
+
require_paths:
|
58
|
+
- lib
|
59
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
60
|
+
requirements:
|
61
|
+
- - ">="
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: '0'
|
64
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
requirements: []
|
70
|
+
rubygems_version: 3.3.25
|
71
|
+
signing_key:
|
72
|
+
specification_version: 4
|
73
|
+
summary: "[WIP] Document Structure Recognizer -- currently a collection of common
|
74
|
+
routines I use to build A.I.s"
|
75
|
+
test_files: []
|