datalab 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/datalab.gemspec +35 -0
- data/lib/datalab/error_result.rb +55 -0
- data/lib/datalab/marker_options.rb +34 -0
- data/lib/datalab/marker_request.rb +99 -0
- data/lib/datalab/marker_result.rb +75 -0
- data/lib/datalab/module_methods.rb +18 -0
- data/lib/datalab/ocr_options.rb +30 -0
- data/lib/datalab/ocr_page.rb +23 -0
- data/lib/datalab/ocr_request.rb +98 -0
- data/lib/datalab/ocr_result.rb +64 -0
- data/lib/datalab/ocr_text_line.rb +23 -0
- data/lib/datalab/request.rb +38 -0
- data/lib/datalab/response_methods.rb +15 -0
- data/lib/datalab.rb +29 -0
- metadata +149 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 9ae1f504f63608091a53657a07e315c7578fb6a1d31f260a083db626feabdf20
|
4
|
+
data.tar.gz: d41b72dcba3ce505311318220acf789b6329f56069ba27783c18c6936b76cbe1
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: '08a22067023a2ba935cad056132a9a190a5c5f16068ae77d51f59f68dd42f428eb5306d974599a6d0553a9a7a71ce733aa88e210af2a48af4699a68a223cdc4a'
|
7
|
+
data.tar.gz: f49525b822f5739ef7613d56ab0e87322e903f3debacd22ca43cbb73c8bd7989cab3bf5c80145e613ce74428ab2d9afd6b6fcc9da5665d6a269fc964d8df7077
|
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2024 Endless International
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/datalab.gemspec
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
# Gem specification for the 'datalab' gem: package identity, the files that are
# shipped, and the runtime / development dependencies.
Gem::Specification.new do | spec |

  spec.name          = 'datalab'
  spec.version       = '0.1.0'
  spec.authors       = [ 'Kristoph Cichocki-Romanov' ]
  spec.email         = [ 'rubygems.org@kristoph.net' ]

  spec.summary       =
    "The Datalab gem implements a lightweight interface to the Datalab API which provides " \
    "document to Markdown conversion as well as sophisticated OCR for documents and images."
  # user facing text shown on the gem's listing page; wording corrected
  # ( "it offers sophisticated OCR", "documents and images" )
  spec.description   =
    "The Datalab gem implements a lightweight interface to the Datalab API. The Datalab API " \
    "can convert a number of document formats, including PDF, Word and Powerpoint to Markdown. " \
    "In addition it offers sophisticated OCR, layout and line detection for documents and images."
  spec.license       = 'MIT'
  spec.homepage      = 'https://github.com/EndlessInternational/datalab'
  spec.metadata      = {
    'source_code_uri' => 'https://github.com/EndlessInternational/datalab',
    'bug_tracker_uri' => 'https://github.com/EndlessInternational/datalab/issues',
    # 'documentation_uri' => 'https://github.com/EndlessInternational/datalab'
  }

  spec.required_ruby_version = '>= 3.0'
  # patterns that match nothing ( e.g. a missing README.md ) are simply skipped by Dir[]
  spec.files         = Dir[ "lib/**/*.rb", "LICENSE", "README.md", "datalab.gemspec" ]
  spec.require_paths = [ "lib" ]

  spec.add_runtime_dependency 'faraday', '~> 2.7'
  spec.add_runtime_dependency 'faraday-multipart', '~> 1.0'
  spec.add_runtime_dependency 'dynamicschema', '~> 1.0.0.beta04'

  spec.add_development_dependency 'rspec', '~> 3.13'
  spec.add_development_dependency 'debug', '~> 1.9'
  spec.add_development_dependency 'vcr', '~> 6.3'

end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
module Datalab

  ##
  # The +ErrorResult+ class describes a failed call to the Datalab API. It is built from the
  # HTTP status code of the response and, when available, the parsed response body.
  #
  # +error_type+ is a symbol derived from the status code ( e.g. +:not_found_error+ ) and
  # +error_description+ is a human readable message. When the body includes a +detail+ list
  # whose first entry has a +msg+, that message ( followed by its +loc+, if any ) replaces
  # the generic description.
  #
  class ErrorResult

    attr_reader :error_type, :error_description

    ##
    # +status_code+ is the HTTP status of the response; +attributes+ is the parsed response
    # body ( a Hash with symbol keys ) or +nil+ when the body could not be parsed.
    #
    def initialize( status_code, attributes = nil )
      # note: this must assign @error_type, which is what the attr_reader exposes
      @error_type, @error_description = status_code_to_error( status_code )

      detail = first_detail( attributes )
      # only a Hash entry can carry :msg / :loc; any other shape keeps the generic description
      return unless detail.is_a?( Hash ) && ( message = detail[ :msg ] )

      @error_description = message
      if ( location = detail[ :loc ] )
        @error_description += " : " + [ location ].flatten.join( '/' )
      end
    end

  private

    # Returns the first entry of attributes[ :detail ], or nil when attributes is nil or the
    # detail is missing or not a collection.
    def first_detail( attributes )
      attributes[ :detail ]&.first
    rescue StandardError
      nil
    end

    # Maps an HTTP status code to an [ error_type, error_description ] pair.
    def status_code_to_error( status_code )
      case status_code
      # this is here because I've noted invalid payloads being returned with a 200
      when 200
        [ :unexpected_error,
          "The response was successful but it did not include a valid payload." ]
      when 400
        [ :invalid_request_error,
          "There was an issue with the format or content of your request." ]
      when 401
        [ :authentication_error,
          "There's an issue with your API key." ]
      when 402
        [ :payment_required,
          "The request requires a paid account" ]
      when 404
        [ :not_found_error,
          "The requested resource was not found." ]
      when 422
        [ :invalid_data_error,
          "The request body is invalid." ]
      when 429
        [ :rate_limit_error,
          "Your account has hit a rate limit." ]
      when 500..505
        [ :api_error,
          "An unexpected Datalab server error has occurred." ]
      when 529
        [ :overloaded_error,
          "The Datalab service is overloaded." ]
      else
        [ :unknown_error,
          "The Datalab service returned an unexpected status code: '#{status_code}'." ]
      end
    end

  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module Datalab

  ##
  # The +MarkerOptions+ class holds the options accepted by +MarkerRequest#submit+. The
  # accepted options are declared in the +schema+ block below and are validated / built by
  # the builder that +DynamicSchema::Definable+ provides.
  #
  class MarkerOptions
    include DynamicSchema::Definable

    schema do
      # NOTE(review): 'as: :max_pages' presumably renames the key sent to the API - confirm
      # against the DynamicSchema documentation
      maximum_pages     Integer, as: :max_pages, in: (1..)
      languages         String, array: true
      force_ocr         [ TrueClass, FalseClass ]
      paginate          [ TrueClass, FalseClass ]
      extract_images    [ TrueClass, FalseClass ]
    end

    ##
    # Builds a +MarkerOptions+ instance from the given options hash and / or block using the
    # schema builder's +build+ method.
    #
    def self.build( options = nil, &block )
      new( api_options: builder.build( options, &block ) )
    end

    ##
    # Same as +build+ but uses the schema builder's +build!+ method.
    #
    def self.build!( options = nil, &block )
      new( api_options: builder.build!( options, &block ) )
    end

    ##
    # +options+ are run through the schema builder; when +api_options+ ( already built
    # options ) are given, the built +options+ are merged over them.
    #
    def initialize( options = {}, api_options: nil )
      @options = self.class.builder.build( options || {} )
      @options = api_options.merge( @options ) if api_options
      # the previous implementation also camelized a :formats option through a
      # 'string_camelize' helper; neither the option nor the helper exist in this class so
      # that step was removed
    end

    ##
    # Returns the options as a Hash.
    #
    def to_h
      @options.to_h
    end

  end
end
|
33
|
+
|
34
|
+
|
@@ -0,0 +1,99 @@
|
|
1
|
+
module Datalab

  ##
  # The +MarkerRequest+ class encapsulates a document conversion request in the Datalab API.
  # After instantiating a new +MarkerRequest+ instance you can begin converting a document to
  # markdown by calling the +submit+ method and then subsequently retrieve the results by
  # calling the +retrieve+ method.
  #
  # === examples
  #
  #   require 'datalab'
  #
  #   request = Datalab::MarkerRequest.new( api_key: ENV[ 'DATALAB_API_KEY' ] )
  #
  #   file = Faraday::UploadIO.new( ARGV[ 0 ], 'application/pdf' )
  #   response = request.submit( file )
  #   while response.success? && ( result = response.result ).success?
  #     break if result.complete?
  #     sleep( 1 )
  #     response = request.retrieve( result )
  #   end
  #
  #   if response.success?
  #     if response.result.success?
  #       puts response.result.markdown
  #     else
  #       puts response.result.failure_message
  #     end
  #   else
  #     puts response.result.error_description
  #   end
  #
  class MarkerRequest < Request

    ##
    # The +submit+ method makes a Datalab '/marker' POST request which will begin conversion of
    # the given file to markdown.
    #
    # +options+ may be +nil+, a +MarkerOptions+ instance or any object that responds to +to_h+.
    # An optional block is forwarded to the connection together with the request.
    #
    # The method returns the response of the connection extended with a +result+ method. If
    # +response.success?+ is true, then +response.result+ will be an instance of
    # +MarkerResult+. If the request is not successful then +response.result+ will be an
    # instance of +ErrorResult+.
    #
    # Remember that you should call +response.success?+ to validate that the call to the API was
    # successful and then +response.result.success?+ to validate that the API processed the
    # request successfully.
    #
    def submit( file, options = nil, &block )
      if options
        options = options.is_a?( MarkerOptions ) ? options : MarkerOptions.build( options.to_h )
        options = options.to_h
      else
        options = {}
      end
      # merge into a new hash: 'to_h' may return the caller's own hash which must not be
      # modified by adding the file to it
      body = options.merge( file: file )

      response = post( "#{BASE_URI}/marker", body, &block )
      attributes = parse_attributes( response.body )
      result =
        if response.success?
          MarkerResult.new(
            attributes || { success: false, status: :failed, error: 'An unknown error occured.' }
          )
        else
          ErrorResult.new( response.status, attributes || {} )
        end

      ResponseMethods.install( response, result )
    end

    ##
    # The +retrieve+ method takes the successful result of the submit method and makes a Datalab
    # '/marker/{id}' GET request which will return the conversion progress result or, if
    # conversion has been completed, the conversion results.
    #
    # The method returns the response of the connection extended with a +result+ method. If
    # +response.success?+ is +true+, then +response.result+ will be an instance of
    # +Datalab::MarkerResult+ ( the given result merged with the newly received attributes ).
    # If the request is not successful then +response.result+ will be an instance of
    # +Datalab::ErrorResult+.
    #
    # An +ArgumentError+ is raised if +submit_result+ is not a +MarkerResult+.
    #
    # Remember that you should call +response.success?+ to validate that the call to the API was
    # successful and then +response.result.success?+ to validate that the API processed the
    # request successfully.
    #
    def retrieve( submit_result, &block )
      raise ArgumentError, "The first argument must be an instance of MarkerResult." \
        unless submit_result.is_a?( MarkerResult )

      response = get( "#{BASE_URI}/marker/#{submit_result.id}", &block )
      attributes = parse_attributes( response.body )
      result =
        if response.success?
          submit_result.merge( attributes || { success: false, status: :failed } )
        else
          ErrorResult.new( response.status, attributes || {} )
        end

      ResponseMethods.install( response, result )
    end

  private

    # Parses a JSON response body into a Hash with symbol keys. Returns nil when the body
    # cannot be parsed or is not a JSON object, so that callers can substitute a fallback.
    def parse_attributes( body )
      attributes = JSON.parse( body, symbolize_names: true )
      attributes.is_a?( Hash ) ? attributes : nil
    rescue StandardError
      nil
    end

  end
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
module Datalab

  ##
  # The +MarkerResult+ class wraps the attributes ( a Hash with symbol keys ) returned by the
  # Datalab API for a markdown conversion request.
  #
  class MarkerResult

    ##
    # +attributes+ is the parsed response body; +nil+ is treated as an empty Hash.
    #
    def initialize( attributes )
      @attributes = attributes || {}
      # successful when the API says so or when the status is one of the non-failure states
      @success =
        @attributes[ :success ] ||
        [ :processing, :complete ].include?( @attributes[ :status ]&.to_sym )
    end

    ##
    # The +id+ method returns the +request_id+ attribute, which +MarkerRequest#retrieve+ uses
    # to request the result.
    #
    def id
      @attributes[ :request_id ]
    end

    ##
    # The +success?+ method returns +true+ if the conversion was successful.
    #
    # Note that the response +success?+ tells you if the call to the Datalab API was successful
    # while this +success?+ method tells you if the actual conversion operation began
    # successfully.
    #
    def success?
      @success || false
    end

    ##
    # The +status+ method returns the status attribute as a symbol, or +:processing+ when
    # the attributes include no status.
    #
    def status
      @attributes[ :status ]&.to_sym || :processing
    end

    def processing?
      status == :processing
    end

    def complete?
      status == :complete
    end

    ##
    # If +success?+ returns +false+ this method will return a message explaining the reason
    # for the failure.
    #
    def failure_message
      @attributes[ :error ]
    end

    ##
    # The +markdown+ method returns the markdown content extracted from the given document.
    #
    def markdown
      @attributes[ :markdown ]
    end

    ##
    # The +images+ method returns the +images+ attribute of the result exactly as it was
    # received, or +nil+ if the result includes no images.
    #
    def images
      @attributes[ :images ]
    end

    ##
    # The +metadata+ method returns the +metadata+ attribute with its keys converted to
    # snake case strings ( e.g. +pageCount+ becomes +'page_count'+ ). The converted Hash is
    # computed once and cached. An empty Hash is returned when there is no metadata.
    #
    def metadata
      @metadata ||= ( @attributes[ :metadata ] || {} ).transform_keys do | key |
        key.to_s.gsub( /([a-z])([A-Z])/, '\1_\2' ).downcase
      end
    end

    ##
    # The +merge+ method returns a new +MarkerResult+ whose attributes are the attributes of
    # this result overridden by the given +attributes+. This result is not modified.
    #
    def merge( attributes )
      self.class.new( @attributes.merge( attributes || {} ) )
    end

  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module Datalab

  ##
  # The +ModuleMethods+ module holds configuration accessors ( +connection+ and +api_key+ )
  # which store their values in instance variables of the object that extends it.
  #
  module ModuleMethods

    # the connection used when no other connection has been configured; it is created with
    # the multipart and url encoded request middleware and Faraday's default adapter
    DEFAULT_CONNECTION = Faraday.new do | faraday |
      faraday.request :multipart
      faraday.request :url_encoded
      faraday.adapter Faraday.default_adapter
    end

    ##
    # When called with a +connection+ it is stored; the method always returns the stored
    # connection, falling back to ( and storing ) +DEFAULT_CONNECTION+ when none was set.
    #
    def connection( connection = nil )
      @connection = connection if connection
      @connection ||= DEFAULT_CONNECTION
    end

    ##
    # When called with an +api_key+ it is stored; the method always returns the stored key
    # ( +nil+ when none was set ).
    #
    def api_key( api_key = nil )
      @api_key = api_key if api_key
      @api_key
    end

  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module Datalab

  ##
  # The +OcrOptions+ class holds the options accepted by +OcrRequest#submit+. The accepted
  # options are declared in the +schema+ block and are built by the builder that
  # +DynamicSchema::Definable+ provides.
  #
  class OcrOptions
    include DynamicSchema::Definable

    schema do
      maximum_pages     Integer, as: :max_pages, in: (1..)
      languages         String, array: true
    end

    class << self

      ##
      # Builds an +OcrOptions+ instance from the given options and / or block using the
      # schema builder's +build+ method.
      #
      def build( options = nil, &block )
        built_options = builder.build( options, &block )
        new( api_options: built_options )
      end

      ##
      # Same as +build+ but uses the schema builder's +build!+ method.
      #
      def build!( options = nil, &block )
        built_options = builder.build!( options, &block )
        new( api_options: built_options )
      end

    end

    ##
    # +options+ are run through the schema builder; when +api_options+ are given the built
    # +options+ are merged over them.
    #
    def initialize( options = {}, api_options: nil )
      @options = self.class.builder.build( options || {} )
      return unless api_options
      @options = api_options.merge( @options )
    end

    ##
    # Returns the options as a Hash.
    #
    def to_h
      @options.to_h
    end

  end
end
|
29
|
+
|
30
|
+
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Datalab

  ##
  # The +OcrPage+ class wraps the attributes ( a Hash with symbol keys ) of a single page of
  # an OCR result.
  #
  class OcrPage

    ##
    # Keeps a shallow copy of +attributes+; a missing value is treated as an empty Hash.
    #
    def initialize( attributes )
      @attributes = attributes ? attributes.dup : {}
    end

    ##
    # Returns an array with one +OcrTextLine+ for every entry of the +text_lines+ attribute
    # ( an empty array when there are none ).
    #
    def text_lines
      lines = @attributes[ :text_lines ] || []
      lines.map { | line | OcrTextLine.new( line ) }
    end

    # Returns the +languages+ attribute of the page.
    def languages
      @attributes[ :languages ]
    end

    # Returns the +page+ attribute of the page.
    def number
      @attributes[ :page ]
    end

    # Returns the +image_bbox+ attribute of the page.
    def bounding_rectangle
      @attributes[ :image_bbox ]
    end

  end
end
|
@@ -0,0 +1,98 @@
|
|
1
|
+
module Datalab

  ##
  # The +OcrRequest+ class encapsulates a document or image recognition request in the Datalab API.
  # After instantiating a new +OcrRequest+ instance you can begin recognition by calling the
  # +submit+ method and then subsequently retrieve the results by calling the +retrieve+ method.
  #
  # === examples
  #
  #   require 'datalab'
  #
  #   request = Datalab::OcrRequest.new( api_key: ENV[ 'DATALAB_API_KEY' ] )
  #
  #   file = Faraday::UploadIO.new( ARGV[ 0 ], 'image/jpeg' )
  #   response = request.submit( file )
  #   while response.success? && ( result = response.result ).success?
  #     break if result.complete?
  #     sleep( 1 )
  #     response = request.retrieve( result )
  #   end
  #
  #   if response.success?
  #     if response.result.success?
  #       response.result.pages.each do | page |
  #         page.text_lines.each { | line | puts line.text }
  #       end
  #     else
  #       puts response.result.failure_message
  #     end
  #   else
  #     puts response.result.error_description
  #   end
  #
  class OcrRequest < Request

    ##
    # The +submit+ method makes a Datalab '/ocr' POST request which will begin recognition of the
    # given file.
    #
    # +options+ may be +nil+, an +OcrOptions+ instance or any object that responds to +to_h+.
    # An optional block is forwarded to the connection together with the request.
    #
    # The method returns the response of the connection extended with a +result+ method. If
    # +response.success?+ is true, then +response.result+ will be an instance of +OcrResult+.
    # If the request is not successful then +response.result+ will be an instance of
    # +ErrorResult+.
    #
    # Remember that you should call +response.success?+ to validate that the call to the API was
    # successful and then +response.result.success?+ to validate that the API processed the
    # request successfully.
    #
    def submit( file, options = nil, &block )
      if options
        options = options.is_a?( OcrOptions ) ? options : OcrOptions.build( options.to_h )
        options = options.to_h
      else
        options = {}
      end
      # merge into a new hash: 'to_h' may return the caller's own hash which must not be
      # modified by adding the file to it
      body = options.merge( file: file )

      response = post( "#{BASE_URI}/ocr", body, &block )
      attributes = parse_attributes( response.body )
      result =
        if response.success?
          OcrResult.new(
            attributes || { success: false, status: :failed, error: 'An unknown error occured.' }
          )
        else
          ErrorResult.new( response.status, attributes || {} )
        end

      ResponseMethods.install( response, result )
    end

    ##
    # The +retrieve+ method takes the successful result of the submit method and makes a Datalab
    # '/ocr/{id}' GET request which will return the recognition progress result or, if recognition
    # has been completed, the recognition results.
    #
    # The method returns the response of the connection extended with a +result+ method. If
    # +response.success?+ is +true+, then +response.result+ will be an instance of
    # +Datalab::OcrResult+ ( the given result merged with the newly received attributes ). If
    # the request is not successful then +response.result+ will be an instance of
    # +Datalab::ErrorResult+.
    #
    # An +ArgumentError+ is raised if +submit_result+ is not an +OcrResult+.
    #
    # Remember that you should call +response.success?+ to validate that the call to the API was
    # successful and then +response.result.success?+ to validate that the API processed the
    # request successfully.
    #
    def retrieve( submit_result, &block )
      raise ArgumentError, "The first argument must be an instance of OcrResult." \
        unless submit_result.is_a?( OcrResult )

      response = get( "#{BASE_URI}/ocr/#{submit_result.id}", &block )
      attributes = parse_attributes( response.body )
      result =
        if response.success?
          submit_result.merge( attributes || { success: false, status: :failed } )
        else
          ErrorResult.new( response.status, attributes || {} )
        end

      ResponseMethods.install( response, result )
    end

  private

    # Parses a JSON response body into a Hash with symbol keys. Returns nil when the body
    # cannot be parsed or is not a JSON object, so that callers can substitute a fallback.
    def parse_attributes( body )
      attributes = JSON.parse( body, symbolize_names: true )
      attributes.is_a?( Hash ) ? attributes : nil
    rescue StandardError
      nil
    end

  end
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
module Datalab

  ##
  # The +OcrResult+ class wraps the attributes ( a Hash with symbol keys ) returned by the
  # Datalab API for a recognition ( OCR ) request.
  #
  class OcrResult

    ##
    # +attributes+ is the parsed response body; +nil+ is treated as an empty Hash.
    #
    def initialize( attributes )
      @attributes = attributes || {}
      # successful when the API says so or when the status is one of the non-failure states
      @success =
        @attributes[ :success ] ||
        [ :processing, :complete ].include?( @attributes[ :status ]&.to_sym )
    end

    ##
    # The +id+ method returns the +request_id+ attribute, which +OcrRequest#retrieve+ uses to
    # request the result.
    #
    def id
      @attributes[ :request_id ]
    end

    ##
    # The +success?+ method returns +true+ if the recognition was successful.
    #
    # Note that the response +success?+ tells you if the call to the Datalab API was successful
    # while this +success?+ method tells you if the actual recognition operation began
    # successfully.
    #
    def success?
      @success || false
    end

    ##
    # The +status+ method returns the status attribute as a symbol, or +:processing+ when
    # the attributes include no status.
    #
    def status
      @attributes[ :status ]&.to_sym || :processing
    end

    def processing?
      status == :processing
    end

    def complete?
      status == :complete
    end

    ##
    # If +success?+ returns +false+ this method will return a message explaining the reason
    # for the failure.
    #
    def failure_message
      @attributes[ :error ]
    end

    ##
    # The +pages+ method returns the pages extracted from the given document. If the given
    # document was an image the result is an array with a single page. If no pages were recognized
    # the result is an empty array.
    #
    def pages
      ( @attributes[ :pages ] || [] ).map { | page | OcrPage.new( page ) }
    end

    ##
    # The +page_count+ method returns the +page_count+ attribute of the result.
    #
    def page_count
      @attributes[ :page_count ]
    end

    ##
    # The +merge+ method returns a new +OcrResult+ whose attributes are the attributes of this
    # result overridden by the given +attributes+. This result is not modified.
    #
    def merge( attributes )
      self.class.new( @attributes.merge( attributes || {} ) )
    end

  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Datalab

  ##
  # The +OcrTextLine+ class wraps the attributes ( a Hash with symbol keys ) of a single
  # recognized line of text of an OCR page.
  #
  class OcrTextLine

    ##
    # Keeps a shallow copy of +attributes+; +nil+ is treated as an empty Hash.
    #
    def initialize( attributes )
      @attributes = attributes&.dup || {}
    end

    # Returns the +text+ attribute of the line.
    def text
      @attributes[ :text ]
    end

    # Returns the +confidence+ attribute of the line.
    def confidence
      @attributes[ :confidence ]
    end

    # Returns the +polygon+ attribute of the line.
    def bounding_polygon
      @attributes[ :polygon ]
    end

    ##
    # Returns the +image_bbox+ attribute of the line or, when it is absent, the +bbox+
    # attribute.
    #
    # NOTE(review): +image_bbox+ appears to be a page level key ( see +OcrPage+ ) while the
    # API appears to report the rectangle of a line as +bbox+ - confirm against the Datalab
    # API documentation. The original key is still preferred to remain backward compatible.
    #
    def bounding_rectangle
      @attributes[ :image_bbox ] || @attributes[ :bbox ]
    end

  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module Datalab

  ##
  # The +Request+ class is the common base of the concrete request classes ( +MarkerRequest+,
  # +OcrRequest+ ). It keeps the connection and API key and offers the +post+ and +get+
  # helpers to its subclasses; it is not intended to be used directly.
  #
  class Request

    BASE_URI = 'https://www.datalab.to/api/v1'

    ##
    # The +initialize+ method accepts an +api_key+ and, optionally, a ( Faraday ) +connection+.
    # Values which are not given are read from +Datalab.connection+ and +Datalab.api_key+.
    # An +ArgumentError+ is raised when no API key is available.
    #
    def initialize( connection: nil, api_key: nil )
      @connection = connection || Datalab.connection
      @api_key = api_key || Datalab.api_key
      return if @api_key

      raise ArgumentError, "An 'api_key' is required unless configured using 'Datalab.api_key'."
    end

  protected

    # Posts +body+ to +uri+ through the connection, sending the API key in the 'X-Api-Key'
    # header. The block, if any, is handed to the connection.
    def post( uri, body, &block )
      headers = { 'X-Api-Key' => @api_key }
      @connection.post( uri, body, headers, &block )
    end

    # Requests +uri+ through the connection, setting the 'X-Api-Key' header on the request.
    # The block, if any, is called with the request after the header has been set.
    def get( uri, &block )
      @connection.get( uri ) do | request |
        request.headers[ 'X-Api-Key' ] = @api_key
        block&.call( request )
      end
    end

  end

end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Datalab

  ##
  # The +ResponseMethods+ module extends a response object ( e.g. a Faraday response ),
  # adding the +result+ method to it.
  #
  module ResponseMethods

    ##
    # Stores +result+ on the given +response+ and extends the +response+ with this module.
    # The extended +response+ is returned.
    #
    def self.install( response, result )
      response.instance_variable_set( :@_datalab_result, result )
      response.extend( self )
    end

    ##
    # Returns the result that was stored by +install+.
    #
    def result
      @_datalab_result
    end

  end
end
|
data/lib/datalab.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'json'
|
2
|
+
require 'base64'
|
3
|
+
require 'uri'
|
4
|
+
|
5
|
+
require 'faraday'
|
6
|
+
require 'faraday/multipart'
|
7
|
+
require 'dynamic_schema'
|
8
|
+
|
9
|
+
require_relative 'datalab/error_result'
|
10
|
+
require_relative 'datalab/request'
|
11
|
+
require_relative 'datalab/response_methods'
|
12
|
+
|
13
|
+
require_relative 'datalab/marker_options'
|
14
|
+
require_relative 'datalab/marker_result'
|
15
|
+
require_relative 'datalab/marker_request'
|
16
|
+
|
17
|
+
require_relative 'datalab/ocr_text_line'
|
18
|
+
require_relative 'datalab/ocr_page'
|
19
|
+
require_relative 'datalab/ocr_options'
|
20
|
+
require_relative 'datalab/ocr_result'
|
21
|
+
require_relative 'datalab/ocr_request'
|
22
|
+
|
23
|
+
require_relative 'datalab/module_methods'
|
24
|
+
|
25
|
+
##
# The +Datalab+ module is the namespace of the gem. It is extended with +ModuleMethods+ so
# that the +connection+ and +api_key+ configuration methods can be called on the module
# itself ( e.g. +Datalab.api_key+ ).
#
module Datalab
  extend ModuleMethods
end
|
28
|
+
|
29
|
+
|
metadata
ADDED
@@ -0,0 +1,149 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: datalab
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Kristoph Cichocki-Romanov
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2024-11-14 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: faraday
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '2.7'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '2.7'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: faraday-multipart
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '1.0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '1.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: dynamicschema
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 1.0.0.beta04
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 1.0.0.beta04
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rspec
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '3.13'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '3.13'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: debug
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '1.9'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '1.9'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: vcr
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '6.3'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '6.3'
|
97
|
+
description: The Datalab gem implements a lightweight interface to the Datalab API.
|
98
|
+
The Datalab API can convert a number of document formats, including PDF, Word and
|
99
|
+
Powerpoint to Markdown. In addition in offers sophisticate OCR, layout and line
|
100
|
+
detection for documents an images.
|
101
|
+
email:
|
102
|
+
- rubygems.org@kristoph.net
|
103
|
+
executables: []
|
104
|
+
extensions: []
|
105
|
+
extra_rdoc_files: []
|
106
|
+
files:
|
107
|
+
- LICENSE
|
108
|
+
- datalab.gemspec
|
109
|
+
- lib/datalab.rb
|
110
|
+
- lib/datalab/error_result.rb
|
111
|
+
- lib/datalab/marker_options.rb
|
112
|
+
- lib/datalab/marker_request.rb
|
113
|
+
- lib/datalab/marker_result.rb
|
114
|
+
- lib/datalab/module_methods.rb
|
115
|
+
- lib/datalab/ocr_options.rb
|
116
|
+
- lib/datalab/ocr_page.rb
|
117
|
+
- lib/datalab/ocr_request.rb
|
118
|
+
- lib/datalab/ocr_result.rb
|
119
|
+
- lib/datalab/ocr_text_line.rb
|
120
|
+
- lib/datalab/request.rb
|
121
|
+
- lib/datalab/response_methods.rb
|
122
|
+
homepage: https://github.com/EndlessInternational/datalab
|
123
|
+
licenses:
|
124
|
+
- MIT
|
125
|
+
metadata:
|
126
|
+
source_code_uri: https://github.com/EndlessInternational/datalab
|
127
|
+
bug_tracker_uri: https://github.com/EndlessInternational/datalab/issues
|
128
|
+
post_install_message:
|
129
|
+
rdoc_options: []
|
130
|
+
require_paths:
|
131
|
+
- lib
|
132
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
133
|
+
requirements:
|
134
|
+
- - ">="
|
135
|
+
- !ruby/object:Gem::Version
|
136
|
+
version: '3.0'
|
137
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
138
|
+
requirements:
|
139
|
+
- - ">="
|
140
|
+
- !ruby/object:Gem::Version
|
141
|
+
version: '0'
|
142
|
+
requirements: []
|
143
|
+
rubygems_version: 3.5.19
|
144
|
+
signing_key:
|
145
|
+
specification_version: 4
|
146
|
+
summary: The Datalab gem implements a lightweight interface to the Datalab API which
|
147
|
+
provides document to Markdown conversion as well as sophisticated OCR for documents
|
148
|
+
and images.
|
149
|
+
test_files: []
|