datalab 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/datalab.gemspec +35 -0
- data/lib/datalab/error_result.rb +55 -0
- data/lib/datalab/marker_options.rb +34 -0
- data/lib/datalab/marker_request.rb +99 -0
- data/lib/datalab/marker_result.rb +75 -0
- data/lib/datalab/module_methods.rb +18 -0
- data/lib/datalab/ocr_options.rb +30 -0
- data/lib/datalab/ocr_page.rb +23 -0
- data/lib/datalab/ocr_request.rb +98 -0
- data/lib/datalab/ocr_result.rb +64 -0
- data/lib/datalab/ocr_text_line.rb +23 -0
- data/lib/datalab/request.rb +38 -0
- data/lib/datalab/response_methods.rb +15 -0
- data/lib/datalab.rb +29 -0
- metadata +149 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 9ae1f504f63608091a53657a07e315c7578fb6a1d31f260a083db626feabdf20
|
4
|
+
data.tar.gz: d41b72dcba3ce505311318220acf789b6329f56069ba27783c18c6936b76cbe1
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: '08a22067023a2ba935cad056132a9a190a5c5f16068ae77d51f59f68dd42f428eb5306d974599a6d0553a9a7a71ce733aa88e210af2a48af4699a68a223cdc4a'
|
7
|
+
data.tar.gz: f49525b822f5739ef7613d56ab0e87322e903f3debacd22ca43cbb73c8bd7989cab3bf5c80145e613ce74428ab2d9afd6b6fcc9da5665d6a269fc964d8df7077
|
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2024 Endless International
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/datalab.gemspec
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
# Gem specification for the 'datalab' gem: identity, packaged files and dependencies.
Gem::Specification.new do | spec |

  spec.name          = 'datalab'
  spec.version       = '0.1.0'
  spec.authors       = [ 'Kristoph Cichocki-Romanov' ]
  spec.email         = [ 'rubygems.org@kristoph.net' ]

  spec.summary       =
    "The Datalab gem implements a lightweight interface to the Datalab API which provides " \
    "document to Markdown conversion as well as sophisticated OCR for documents and images."
  # the description is shown verbatim on the registry page, so spelling matters here
  spec.description   =
    "The Datalab gem implements a lightweight interface to the Datalab API. The Datalab API " \
    "can convert a number of document formats, including PDF, Word and Powerpoint to Markdown. " \
    "In addition it offers sophisticated OCR, layout and line detection for documents and images."
  spec.license       = 'MIT'
  spec.homepage      = 'https://github.com/EndlessInternational/datalab'
  spec.metadata      = {
    'source_code_uri'   => 'https://github.com/EndlessInternational/datalab',
    'bug_tracker_uri'   => 'https://github.com/EndlessInternational/datalab/issues',
    # 'documentation_uri' => 'https://github.com/EndlessInternational/datalab'
  }

  spec.required_ruby_version = '>= 3.0'
  # Dir[] silently skips patterns that match nothing (e.g. a missing README.md)
  spec.files         = Dir[ "lib/**/*.rb", "LICENSE", "README.md", "datalab.gemspec" ]
  spec.require_paths = [ "lib" ]

  spec.add_runtime_dependency 'faraday', '~> 2.7'
  spec.add_runtime_dependency 'faraday-multipart', '~> 1.0'
  spec.add_runtime_dependency 'dynamicschema', '~> 1.0.0.beta04'

  spec.add_development_dependency 'rspec', '~> 3.13'
  spec.add_development_dependency 'debug', '~> 1.9'
  spec.add_development_dependency 'vcr', '~> 6.3'

end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
module Datalab

  ##
  # The +ErrorResult+ class describes an unsuccessful call to the Datalab API. It is built
  # from the HTTP status code and, when available, the parsed response body.
  #
  # +error_type+ is a symbol derived from the status code ( e.g. +:authentication_error+ )
  # and +error_description+ is a human readable message. When the response body carries a
  # +detail+ array whose first entry is a hash with a +msg+ ( and optionally a +loc+ ),
  # that message replaces the generic description.
  #
  class ErrorResult

    attr_reader :error_type, :error_description

    ##
    # +status_code+ is the HTTP status of the response; +attributes+ is the parsed response
    # body ( a hash with symbol keys ) or +nil+ when the body could not be parsed.
    #
    def initialize( status_code, attributes = nil )
      # note: this must populate @error_type, the variable exposed by the attr_reader
      @error_type, @error_description = status_code_to_error( status_code )

      detail = first_detail( attributes )
      message = detail && detail[ :msg ]
      return unless message

      location = detail[ :loc ]
      @error_description =
        location ? "#{message} : #{[ location ].flatten.join( '/' )}" : message
    end

    private

    # Returns the first entry of the +detail+ array when it is a hash, otherwise +nil+.
    # Anything else ( no body, a string detail, an empty array ) is ignored so that the
    # generic description derived from the status code is kept.
    def first_detail( attributes )
      return nil unless attributes.is_a?( Hash )

      details = attributes[ :detail ]
      entry = details.is_a?( Array ) ? details.first : nil
      entry.is_a?( Hash ) ? entry : nil
    end

    # Maps an HTTP status code to an [ error_type, error_description ] pair.
    def status_code_to_error( status_code )
      case status_code
      # this is here because I've noted invalid payloads being returned with a 200
      when 200
        [ :unexpected_error,
          "The response was successful but it did not include a valid payload." ]
      when 400
        [ :invalid_request_error,
          "There was an issue with the format or content of your request." ]
      when 401
        [ :authentication_error,
          "There's an issue with your API key." ]
      when 402
        [ :payment_required,
          "The request requires a paid account" ]
      when 404
        [ :not_found_error,
          "The requested resource was not found." ]
      when 422
        [ :invalid_data_error,
          "The request body is invalid." ]
      when 429
        [ :rate_limit_error,
          "Your account has hit a rate limit." ]
      when 500..505
        [ :api_error,
          "An unexpected Datalab server error has occurred." ]
      when 529
        [ :overloaded_error,
          "The Datalab service is overloaded." ]
      else
        [ :unknown_error,
          "The Datalab service returned an unexpected status code: '#{status_code}'." ]
      end
    end

  end

end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module Datalab

  ##
  # The +MarkerOptions+ class holds the options that accompany a marker ( document to
  # markdown ) submission. The accepted options are declared in the +schema+ block below
  # and +to_h+ returns the resulting options as a hash.
  #
  # Validation and key renaming ( e.g. +maximum_pages+ to +max_pages+ ) are delegated to the
  # builder provided by +DynamicSchema::Definable+.
  #
  class MarkerOptions
    include DynamicSchema::Definable

    schema do
      maximum_pages     Integer, as: :max_pages, in: (1..)
      languages         String, array: true
      force_ocr         [ TrueClass, FalseClass ]
      paginate          [ TrueClass, FalseClass ]
      extract_images    [ TrueClass, FalseClass ]
    end

    ##
    # Builds a +MarkerOptions+ instance from a hash and/or a builder block.
    #
    def self.build( options = nil, &block )
      new( api_options: builder.build( options, &block ) )
    end

    ##
    # Same as +build+ but uses the builder's +build!+ variant
    # ( presumably the raising form - confirm against the DynamicSchema documentation ).
    #
    def self.build!( options = nil, &block )
      new( api_options: builder.build!( options, &block ) )
    end

    ##
    # +options+ is run through the schema builder; when +api_options+ ( already built
    # options ) are given, the built +options+ are merged over them.
    #
    def initialize( options = {}, api_options: nil )
      @options = self.class.builder.build( options || {} )
      @options = api_options.merge( @options ) if api_options
    end

    ##
    # Returns the options as a hash.
    #
    def to_h
      @options.to_h
    end

  end

end
|
33
|
+
|
34
|
+
|
@@ -0,0 +1,99 @@
|
|
1
|
+
module Datalab

  ##
  # The +MarkerRequest+ class encapsulates a document conversion request in the Datalab API.
  # After instantiating a new +MarkerRequest+ instance you can begin a markdown conversion by
  # calling the +submit+ method and then subsequently retrieve the results by calling the
  # +retrieve+ method.
  #
  # === examples
  #
  #   require 'datalab'
  #
  #   request = Datalab::MarkerRequest.new( api_key: ENV[ 'DATALAB_API_KEY' ] )
  #
  #   file = Faraday::UploadIO.new( ARGV[ 0 ], 'application/pdf' )
  #   response = request.submit( file )
  #   while response.success? && ( result = response.result ).success?
  #     break if result.complete?
  #     sleep( 1 )
  #     response = request.retrieve( result )
  #   end
  #
  #   if response.success?
  #     if response.result.success?
  #       puts response.result.markdown
  #     else
  #       puts response.result.failure_message
  #     end
  #   else
  #     puts response.result.error_description
  #   end
  #
  class MarkerRequest < Request

    ##
    # The +submit+ method makes a Datalab '/marker' POST request which will begin conversion of
    # the given file to markdown.
    #
    # +options+ may be a +MarkerOptions+ instance or anything responding to +to_h+; it is never
    # modified by this method.
    #
    # The response is always an instance of +Faraday::Response+. If +response.success?+ is true,
    # then +response.result+ will be an instance of +MarkerResult+. If the request is not
    # successful then +response.result+ will be an instance of +ErrorResult+.
    #
    # Remember that you should call +response.success?+ to validate that the call to the API was
    # successful and then +response.result.success?+ to validate that the API processed the
    # request successfully.
    #
    def submit( file, options = nil, &block )
      if options
        options = MarkerOptions.build( options.to_h ) unless options.is_a?( MarkerOptions )
        # copy before adding :file so the caller's options object is left untouched
        options = options.to_h.dup
      else
        options = {}
      end
      options[ :file ] = file

      response = post( "#{BASE_URI}/marker", options, &block )
      attributes = parse_attributes( response.body )
      result =
        if response.success?
          MarkerResult.new(
            attributes || { success: false, status: :failed, error: 'An unknown error occured.' }
          )
        else
          ErrorResult.new( response.status, attributes || {} )
        end

      ResponseMethods.install( response, result )
    end

    ##
    # The +retrieve+ method takes the successful result of the submit method and makes a Datalab
    # '/marker/{id}' GET request which will return the conversion progress result or, if
    # conversion has been completed, the conversion results.
    #
    # The response is always an instance of +Faraday::Response+. If +response.success?+ is +true+,
    # then +response.result+ will be an instance of +Datalab::MarkerResult+. If the request is not
    # successful then +response.result+ will be an instance of +Datalab::ErrorResult+.
    #
    # Remember that you should call +response.success?+ to validate that the call to the API was
    # successful and then +response.result.success?+ to validate that the API processed the
    # request successfully.
    #
    # Raises +ArgumentError+ unless +submit_result+ is a +MarkerResult+.
    #
    def retrieve( submit_result, &block )
      raise ArgumentError, "The first argument must be an instance of MarkerResult." \
        unless submit_result.is_a?( MarkerResult )

      response = get( "#{BASE_URI}/marker/#{submit_result.id}", &block )
      attributes = parse_attributes( response.body )
      result =
        if response.success?
          submit_result.merge( attributes || { success: false, status: :failed } )
        else
          ErrorResult.new( response.status, attributes || {} )
        end

      ResponseMethods.install( response, result )
    end

    private

    # Parses a JSON response body into a hash with symbol keys. Returns +nil+ when the body
    # is missing, is not valid JSON or does not decode to a hash.
    def parse_attributes( body )
      attributes = JSON.parse( body, symbolize_names: true )
      attributes.is_a?( Hash ) ? attributes : nil
    rescue StandardError
      nil
    end

  end

end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
module Datalab

  ##
  # The +MarkerResult+ class wraps the attributes ( a hash with symbol keys ) returned by the
  # Datalab API for a marker ( document to markdown ) request.
  #
  class MarkerResult

    ##
    # +attributes+ is the parsed response body; +nil+ is treated as an empty hash.
    #
    def initialize( attributes )
      # normalize first so that a nil payload does not raise below
      @attributes = attributes || {}
      @success =
        @attributes[ :success ] ||
        [ :processing, :complete ].include?( @attributes[ :status ]&.to_sym )
    end

    ##
    # The request id assigned by the API ( the +request_id+ attribute ).
    #
    def id
      @attributes[ :request_id ]
    end

    ##
    # The +success?+ method returns +true+ if the conversion was successful.
    #
    # Note that the response +success?+ tells you if the call to the Datalab API was successful
    # while this +success?+ method tells you if the actual conversion operation began
    # successfully.
    #
    def success?
      @success || false
    end

    ##
    # The +status+ attribute as a symbol; +:processing+ when the API did not report a status.
    #
    def status
      @attributes[ :status ]&.to_sym || :processing
    end

    def processing?
      status == :processing
    end

    def complete?
      status == :complete
    end

    ##
    # If +success?+ returns +false+ this method will return a message explaining the reason
    # for the failure.
    #
    def failure_message
      @attributes[ :error ]
    end

    ##
    # The +markdown+ method returns the markdown content extracted from the given document.
    #
    def markdown
      @attributes[ :markdown ]
    end

    ##
    # The +images+ method returns the +images+ attribute of the API response, or +nil+ when
    # the response did not include one ( presumably the images extracted from the document
    # when the +extract_images+ option is set - confirm against the Datalab API reference ).
    #
    def images
      @attributes[ :images ]
    end

    ##
    # The +metadata+ method returns the +metadata+ attribute with its keys converted to
    # snake case strings ( e.g. +pageCount+ becomes +page_count+ ). The result is memoized.
    #
    def metadata
      @metadata ||= ( @attributes[ :metadata ] || {} ).transform_keys do | key |
        key.to_s.gsub( /([a-z])([A-Z])/, '\1_\2' ).downcase
      end
    end

    ##
    # Returns a new +MarkerResult+ whose attributes are this result's attributes overlaid
    # with the given +attributes+ ; +nil+ is treated as an empty hash.
    #
    def merge( attributes )
      self.class.new( @attributes.merge( attributes || {} ) )
    end

  end

end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module Datalab

  #
  # The ModuleMethods module provides the module level configuration readers/writers
  # ( +connection+ and +api_key+ ). Each method sets the value when an argument is
  # given and always returns the current value.
  #
  module ModuleMethods

    # Faraday connection used when no other connection has been configured.
    DEFAULT_CONNECTION = Faraday.new do | faraday |
      faraday.request :multipart
      faraday.request :url_encoded
      faraday.adapter Faraday.default_adapter
    end

    # Sets ( when given ) and returns the connection, falling back to DEFAULT_CONNECTION.
    def connection( connection = nil )
      @connection = connection if connection
      @connection ||= DEFAULT_CONNECTION
    end

    # Sets ( when given ) and returns the api key; +nil+ when none has been configured.
    def api_key( api_key = nil )
      @api_key = api_key if api_key
      @api_key
    end

  end

end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module Datalab

  #
  # The OcrOptions class holds the options that accompany an OCR submission. The accepted
  # options are declared in the +schema+ block; building and key renaming are delegated to
  # the builder provided by DynamicSchema::Definable.
  #
  class OcrOptions
    include DynamicSchema::Definable

    schema do
      maximum_pages     Integer, as: :max_pages, in: (1..)
      languages         String, array: true
    end

    class << self

      # Builds an instance from a hash and/or a builder block.
      def build( options = nil, &block )
        new( api_options: builder.build( options, &block ) )
      end

      # Same as +build+ but uses the builder's +build!+ variant.
      def build!( options = nil, &block )
        new( api_options: builder.build!( options, &block ) )
      end

    end

    # +options+ is run through the schema builder; when +api_options+ is given the built
    # options are merged over it.
    def initialize( options = {}, api_options: nil )
      built = self.class.builder.build( options || {} )
      @options = api_options ? api_options.merge( built ) : built
    end

    # Returns the options as a hash.
    def to_h
      @options.to_h
    end

  end

end
|
29
|
+
|
30
|
+
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Datalab

  #
  # The OcrPage class wraps the attributes ( a hash with symbol keys ) of a single page
  # of an OCR result.
  #
  class OcrPage

    # Keeps a shallow copy of +attributes+ ; +nil+ becomes an empty hash.
    def initialize( attributes )
      copy = attributes&.dup
      @attributes = copy || {}
    end

    # The +text_lines+ attribute wrapped as OcrTextLine instances ( empty when absent ).
    def text_lines
      lines = @attributes[ :text_lines ] || []
      lines.map { | line | OcrTextLine.new( line ) }
    end

    # The +languages+ attribute of the page.
    def languages
      @attributes[ :languages ]
    end

    # The +page+ attribute of the page.
    def number
      @attributes[ :page ]
    end

    # The +image_bbox+ attribute of the page.
    def bounding_rectangle
      @attributes[ :image_bbox ]
    end

  end

end
|
@@ -0,0 +1,98 @@
|
|
1
|
+
module Datalab

  ##
  # The +OcrRequest+ class encapsulates a document or image recognition request in the Datalab API.
  # After instantiating a new +OcrRequest+ instance you can begin recognition by calling the
  # +submit+ method and then subsequently retrieve the results by calling the +retrieve+ method.
  #
  # === examples
  #
  #   require 'datalab'
  #
  #   request = Datalab::OcrRequest.new( api_key: ENV[ 'DATALAB_API_KEY' ] )
  #
  #   file = Faraday::UploadIO.new( ARGV[ 0 ], 'image/jpeg' )
  #   response = request.submit( file )
  #   while response.success? && ( result = response.result ).success?
  #     break if result.complete?
  #     sleep( 1 )
  #     response = request.retrieve( result )
  #   end
  #
  #   if response.success?
  #     if response.result.success?
  #       response.result.pages.each do | page |
  #         puts page.text_lines.map( &:text ).join( "\n" )
  #       end
  #     else
  #       puts response.result.failure_message
  #     end
  #   else
  #     puts response.result.error_description
  #   end
  #
  class OcrRequest < Request

    ##
    # The +submit+ method makes a Datalab '/ocr' POST request which will begin recognition of the
    # given file.
    #
    # +options+ may be an +OcrOptions+ instance or anything responding to +to_h+; it is never
    # modified by this method.
    #
    # The response is always an instance of +Faraday::Response+. If +response.success?+ is true,
    # then +response.result+ will be an instance of +OcrResult+. If the request is not successful
    # then +response.result+ will be an instance of +ErrorResult+.
    #
    # Remember that you should call +response.success?+ to validate that the call to the API was
    # successful and then +response.result.success?+ to validate that the API processed the
    # request successfully.
    #
    def submit( file, options = nil, &block )
      if options
        options = OcrOptions.build( options.to_h ) unless options.is_a?( OcrOptions )
        # copy before adding :file so the caller's options object is left untouched
        options = options.to_h.dup
      else
        options = {}
      end
      options[ :file ] = file

      response = post( "#{BASE_URI}/ocr", options, &block )
      attributes = parse_attributes( response.body )
      result =
        if response.success?
          OcrResult.new(
            attributes || { success: false, status: :failed, error: 'An unknown error occured.' }
          )
        else
          ErrorResult.new( response.status, attributes || {} )
        end

      ResponseMethods.install( response, result )
    end

    ##
    # The +retrieve+ method takes the successful result of the submit method and makes a Datalab
    # '/ocr/{id}' GET request which will return the recognition progress result or, if recognition
    # has been completed, the recognition results.
    #
    # The response is always an instance of +Faraday::Response+. If +response.success?+ is +true+,
    # then +response.result+ will be an instance of +Datalab::OcrResult+. If the request is not
    # successful then +response.result+ will be an instance of +Datalab::ErrorResult+.
    #
    # Remember that you should call +response.success?+ to validate that the call to the API was
    # successful and then +response.result.success?+ to validate that the API processed the
    # request successfully.
    #
    # Raises +ArgumentError+ unless +submit_result+ is an +OcrResult+.
    #
    def retrieve( submit_result, &block )
      raise ArgumentError, "The first argument must be an instance of OcrResult." \
        unless submit_result.is_a?( OcrResult )

      response = get( "#{BASE_URI}/ocr/#{submit_result.id}", &block )
      attributes = parse_attributes( response.body )
      result =
        if response.success?
          submit_result.merge( attributes || { success: false, status: :failed } )
        else
          ErrorResult.new( response.status, attributes || {} )
        end

      ResponseMethods.install( response, result )
    end

    private

    # Parses a JSON response body into a hash with symbol keys. Returns +nil+ when the body
    # is missing, is not valid JSON or does not decode to a hash.
    def parse_attributes( body )
      attributes = JSON.parse( body, symbolize_names: true )
      attributes.is_a?( Hash ) ? attributes : nil
    rescue StandardError
      nil
    end

  end

end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
module Datalab

  ##
  # The +OcrResult+ class wraps the attributes ( a hash with symbol keys ) returned by the
  # Datalab API for an OCR request.
  #
  class OcrResult

    ##
    # +attributes+ is the parsed response body; +nil+ is treated as an empty hash.
    #
    def initialize( attributes )
      # normalize first so that a nil payload does not raise below
      @attributes = attributes || {}
      @success =
        @attributes[ :success ] ||
        [ :processing, :complete ].include?( @attributes[ :status ]&.to_sym )
    end

    ##
    # The request id assigned by the API ( the +request_id+ attribute ).
    #
    def id
      @attributes[ :request_id ]
    end

    ##
    # The +success?+ method returns +true+ if the recognition was successful.
    #
    # Note that the response +success?+ tells you if the call to the Datalab API was successful
    # while this +success?+ method tells you if the actual recognition operation began
    # successfully.
    #
    def success?
      @success || false
    end

    ##
    # The +status+ attribute as a symbol; +:processing+ when the API did not report a status.
    #
    def status
      @attributes[ :status ]&.to_sym || :processing
    end

    def processing?
      status == :processing
    end

    def complete?
      status == :complete
    end

    ##
    # If +success?+ returns +false+ this method will return a message explaining the reason
    # for the failure.
    #
    def failure_message
      @attributes[ :error ]
    end

    ##
    # The +pages+ method returns the pages extracted from the given document. If the given
    # document was an image the result is an array with a single page. If no pages were recognized
    # the result is an empty array.
    #
    def pages
      ( @attributes[ :pages ] || [] ).map { | page | OcrPage.new( page ) }
    end

    ##
    # The +page_count+ attribute of the API response.
    #
    def page_count
      @attributes[ :page_count ]
    end

    ##
    # Returns a new +OcrResult+ whose attributes are this result's attributes overlaid with
    # the given +attributes+ ; +nil+ is treated as an empty hash.
    #
    def merge( attributes )
      self.class.new( @attributes.merge( attributes || {} ) )
    end

  end

end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Datalab

  #
  # The OcrTextLine class wraps the attributes ( a hash with symbol keys ) of a single
  # recognized line of text.
  #
  class OcrTextLine

    # Keeps a shallow copy of +attributes+ ; +nil+ becomes an empty hash.
    def initialize( attributes )
      copy = attributes&.dup
      @attributes = copy || {}
    end

    # The +text+ attribute of the line.
    def text
      @attributes[ :text ]
    end

    # The +confidence+ attribute of the line.
    def confidence
      @attributes[ :confidence ]
    end

    # The +polygon+ attribute of the line.
    def bounding_polygon
      @attributes[ :polygon ]
    end

    # The +image_bbox+ attribute of the line.
    def bounding_rectangle
      @attributes[ :image_bbox ]
    end

  end

end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module Datalab

  ##
  # The +Request+ class encapsulates a request to the Datalab API. It is the shared base of
  # +MarkerRequest+, +OcrRequest+ and similar classes and is not meant to be used directly.
  #
  class Request

    BASE_URI = 'https://www.datalab.to/api/v1'

    ##
    # Initializes the +Request+ instance. Both the +api_key+ and the ( Faraday ) +connection+
    # are optional here and default to the values configured on the +Datalab+ module; an
    # +ArgumentError+ is raised when no api key is available from either place.
    #
    def initialize( connection: nil, api_key: nil )
      @connection = connection || Datalab.connection
      @api_key = api_key || Datalab.api_key
      return if @api_key

      raise ArgumentError, "An 'api_key' is required unless configured using 'Datalab.api_key'."
    end

    protected

    # Issues a POST to +uri+ with +body+, sending the api key in the 'X-Api-Key' header.
    def post( uri, body, &block )
      headers = { 'X-Api-Key' => @api_key }
      @connection.post( uri, body, headers, &block )
    end

    # Issues a GET to +uri+, sending the api key in the 'X-Api-Key' header; the optional
    # block receives the request after the header has been set.
    def get( uri, &block )
      @connection.get( uri ) do | request |
        request.headers[ 'X-Api-Key' ] = @api_key
        block&.call( request )
      end
    end

  end

end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Datalab

  #
  # The ResponseMethods module extends a Faraday response, adding the +result+ method.
  #
  module ResponseMethods

    # Stores +result+ on the given +response+, extends the response with this module and
    # returns the response.
    def self.install( response, result )
      response.extend( self )
      response.instance_variable_set( :@_datalab_result, result )
      response
    end

    # The result object previously attached with +install+.
    def result
      @_datalab_result
    end

  end

end
|
data/lib/datalab.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'json'
|
2
|
+
require 'base64'
|
3
|
+
require 'uri'
|
4
|
+
|
5
|
+
require 'faraday'
|
6
|
+
require 'faraday/multipart'
|
7
|
+
require 'dynamic_schema'
|
8
|
+
|
9
|
+
require_relative 'datalab/error_result'
|
10
|
+
require_relative 'datalab/request'
|
11
|
+
require_relative 'datalab/response_methods'
|
12
|
+
|
13
|
+
require_relative 'datalab/marker_options'
|
14
|
+
require_relative 'datalab/marker_result'
|
15
|
+
require_relative 'datalab/marker_request'
|
16
|
+
|
17
|
+
require_relative 'datalab/ocr_text_line'
|
18
|
+
require_relative 'datalab/ocr_page'
|
19
|
+
require_relative 'datalab/ocr_options'
|
20
|
+
require_relative 'datalab/ocr_result'
|
21
|
+
require_relative 'datalab/ocr_request'
|
22
|
+
|
23
|
+
require_relative 'datalab/module_methods'
|
24
|
+
|
25
|
+
module Datalab
|
26
|
+
extend ModuleMethods
|
27
|
+
end
|
28
|
+
|
29
|
+
|
metadata
ADDED
@@ -0,0 +1,149 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: datalab
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Kristoph Cichocki-Romanov
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2024-11-14 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: faraday
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '2.7'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '2.7'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: faraday-multipart
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '1.0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '1.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: dynamicschema
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 1.0.0.beta04
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 1.0.0.beta04
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rspec
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '3.13'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '3.13'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: debug
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '1.9'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '1.9'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: vcr
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '6.3'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '6.3'
|
97
|
+
description: The Datalab gem implements a lightweight interface to the Datalab API.
|
98
|
+
The Datalab API can convert a number of document formats, including PDF, Word and
|
99
|
+
Powerpoint to Markdown. In addition in offers sophisticate OCR, layout and line
|
100
|
+
detection for documents an images.
|
101
|
+
email:
|
102
|
+
- rubygems.org@kristoph.net
|
103
|
+
executables: []
|
104
|
+
extensions: []
|
105
|
+
extra_rdoc_files: []
|
106
|
+
files:
|
107
|
+
- LICENSE
|
108
|
+
- datalab.gemspec
|
109
|
+
- lib/datalab.rb
|
110
|
+
- lib/datalab/error_result.rb
|
111
|
+
- lib/datalab/marker_options.rb
|
112
|
+
- lib/datalab/marker_request.rb
|
113
|
+
- lib/datalab/marker_result.rb
|
114
|
+
- lib/datalab/module_methods.rb
|
115
|
+
- lib/datalab/ocr_options.rb
|
116
|
+
- lib/datalab/ocr_page.rb
|
117
|
+
- lib/datalab/ocr_request.rb
|
118
|
+
- lib/datalab/ocr_result.rb
|
119
|
+
- lib/datalab/ocr_text_line.rb
|
120
|
+
- lib/datalab/request.rb
|
121
|
+
- lib/datalab/response_methods.rb
|
122
|
+
homepage: https://github.com/EndlessInternational/datalab
|
123
|
+
licenses:
|
124
|
+
- MIT
|
125
|
+
metadata:
|
126
|
+
source_code_uri: https://github.com/EndlessInternational/datalab
|
127
|
+
bug_tracker_uri: https://github.com/EndlessInternational/datalab/issues
|
128
|
+
post_install_message:
|
129
|
+
rdoc_options: []
|
130
|
+
require_paths:
|
131
|
+
- lib
|
132
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
133
|
+
requirements:
|
134
|
+
- - ">="
|
135
|
+
- !ruby/object:Gem::Version
|
136
|
+
version: '3.0'
|
137
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
138
|
+
requirements:
|
139
|
+
- - ">="
|
140
|
+
- !ruby/object:Gem::Version
|
141
|
+
version: '0'
|
142
|
+
requirements: []
|
143
|
+
rubygems_version: 3.5.19
|
144
|
+
signing_key:
|
145
|
+
specification_version: 4
|
146
|
+
summary: The Datalab gem implements a lightweight interface to the Datalab API which
|
147
|
+
provides document to Markdown conversion as well as sophisticated OCR for documents
|
148
|
+
and images.
|
149
|
+
test_files: []
|