activestorage-ocr 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/activestorage/ocr/client.rb +29 -18
- data/lib/activestorage/ocr/configuration.rb +30 -0
- data/lib/activestorage/ocr/result.rb +19 -2
- data/lib/activestorage/ocr/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 9ac32a71c542c906c4a3d29ab10e1592a2fd08e79914f447beaa4216b9f3e321
|
|
4
|
+
data.tar.gz: e89646596ca0d7662d48f61633744453153426518e7c509951d9da632ae42571
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: f18b15c1cdc22d5816cf5f7d3c6d2b0cb9b1f89a60f1f78f20f5e020a34e0ca98c5f97108aec75904be08ed84b84937d801d0b2e09415279f028e5f384db439c
|
|
7
|
+
data.tar.gz: cb1422a3f9b30b81a9336ca9c69cc76cef8e17d52dc68711587ca80f26ad525bd26c875e7a0cc7aaf9460101c126c2fd9424216951fdc74b4c6e7673b436e3a5
|
|
@@ -44,6 +44,8 @@ module ActiveStorage
|
|
|
44
44
|
#
|
|
45
45
|
# * +blob+ - An ActiveStorage::Blob instance
|
|
46
46
|
# * +engine+ - OCR engine to use (:ocrs or :leptess). Defaults to configured engine.
|
|
47
|
+
# * +preprocess+ - Preprocessing preset (:none, :minimal, :default, :aggressive).
|
|
48
|
+
# Defaults to configured preset.
|
|
47
49
|
#
|
|
48
50
|
# ==== Returns
|
|
49
51
|
#
|
|
@@ -53,9 +55,9 @@ module ActiveStorage
|
|
|
53
55
|
#
|
|
54
56
|
# * ConnectionError - if the server is unreachable
|
|
55
57
|
# * ServerError - if the server returns an error
|
|
56
|
-
def extract_text(blob, engine: nil)
|
|
58
|
+
def extract_text(blob, engine: nil, preprocess: nil)
|
|
57
59
|
blob.open do |file|
|
|
58
|
-
extract_text_from_file(file, blob.content_type, blob.filename.to_s, engine: engine)
|
|
60
|
+
extract_text_from_file(file, blob.content_type, blob.filename.to_s, engine: engine, preprocess: preprocess)
|
|
59
61
|
end
|
|
60
62
|
end
|
|
61
63
|
|
|
@@ -67,6 +69,8 @@ module ActiveStorage
|
|
|
67
69
|
# * +content_type+ - MIME type (auto-detected if not provided)
|
|
68
70
|
# * +filename+ - Filename to send (defaults to basename of path)
|
|
69
71
|
# * +engine+ - OCR engine to use (:ocrs or :leptess). Defaults to configured engine.
|
|
72
|
+
# * +preprocess+ - Preprocessing preset (:none, :minimal, :default, :aggressive).
|
|
73
|
+
# Defaults to configured preset.
|
|
70
74
|
#
|
|
71
75
|
# ==== Returns
|
|
72
76
|
#
|
|
@@ -76,12 +80,12 @@ module ActiveStorage
|
|
|
76
80
|
#
|
|
77
81
|
# * ConnectionError - if the server is unreachable
|
|
78
82
|
# * ServerError - if the server returns an error
|
|
79
|
-
def extract_text_from_path(path, content_type: nil, filename: nil, engine: nil)
|
|
83
|
+
def extract_text_from_path(path, content_type: nil, filename: nil, engine: nil, preprocess: nil)
|
|
80
84
|
content_type ||= Marcel::MimeType.for(Pathname.new(path))
|
|
81
85
|
filename ||= File.basename(path)
|
|
82
86
|
|
|
83
87
|
File.open(path, "rb") do |file|
|
|
84
|
-
extract_text_from_file(file, content_type, filename, engine: engine)
|
|
88
|
+
extract_text_from_file(file, content_type, filename, engine: engine, preprocess: preprocess)
|
|
85
89
|
end
|
|
86
90
|
end
|
|
87
91
|
|
|
@@ -95,6 +99,8 @@ module ActiveStorage
|
|
|
95
99
|
# * +content_type+ - MIME type of the file
|
|
96
100
|
# * +filename+ - Filename to send to the server
|
|
97
101
|
# * +engine+ - OCR engine to use (:ocrs or :leptess). Defaults to configured engine.
|
|
102
|
+
# * +preprocess+ - Preprocessing preset (:none, :minimal, :default, :aggressive).
|
|
103
|
+
# Defaults to configured preset.
|
|
98
104
|
#
|
|
99
105
|
# ==== Returns
|
|
100
106
|
#
|
|
@@ -104,9 +110,10 @@ module ActiveStorage
|
|
|
104
110
|
#
|
|
105
111
|
# * ConnectionError - if the server is unreachable
|
|
106
112
|
# * ServerError - if the server returns an error
|
|
107
|
-
def extract_text_from_file(file, content_type, filename, engine: nil)
|
|
113
|
+
def extract_text_from_file(file, content_type, filename, engine: nil, preprocess: nil)
|
|
108
114
|
target_engine = engine || @config.engine
|
|
109
|
-
|
|
115
|
+
target_preprocess = preprocess || @config.preprocess
|
|
116
|
+
endpoint = ocr_endpoint_for(target_engine, target_preprocess)
|
|
110
117
|
|
|
111
118
|
response = connection.post(endpoint) do |req|
|
|
112
119
|
req.body = {
|
|
@@ -206,24 +213,27 @@ module ActiveStorage
|
|
|
206
213
|
|
|
207
214
|
private
|
|
208
215
|
|
|
209
|
-
# Returns the OCR endpoint path for the given engine.
|
|
216
|
+
# Returns the OCR endpoint path for the given engine and preprocess preset.
|
|
210
217
|
#
|
|
211
218
|
# ==== Parameters
|
|
212
219
|
#
|
|
213
220
|
# * +engine+ - Engine name (:ocrs or :leptess)
|
|
221
|
+
# * +preprocess+ - Preprocessing preset (:none, :minimal, :default, :aggressive)
|
|
214
222
|
#
|
|
215
223
|
# ==== Returns
|
|
216
224
|
#
|
|
217
|
-
# The endpoint path string (e.g., "/ocr"
|
|
218
|
-
def ocr_endpoint_for(engine)
|
|
219
|
-
case engine.to_sym
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
225
|
+
# The endpoint path string with query parameter (e.g., "/ocr?preprocess=default")
|
|
226
|
+
def ocr_endpoint_for(engine, preprocess)
|
|
227
|
+
base = case engine.to_sym
|
|
228
|
+
when :ocrs
|
|
229
|
+
"/ocr"
|
|
230
|
+
when :leptess
|
|
231
|
+
"/ocr/leptess"
|
|
232
|
+
else
|
|
233
|
+
raise ArgumentError, "Unknown engine: #{engine}"
|
|
234
|
+
end
|
|
235
|
+
|
|
236
|
+
"#{base}?preprocess=#{preprocess}"
|
|
227
237
|
end
|
|
228
238
|
|
|
229
239
|
# Returns the Faraday connection, creating it if necessary.
|
|
@@ -250,7 +260,8 @@ module ActiveStorage
|
|
|
250
260
|
confidence: data[:confidence],
|
|
251
261
|
processing_time_ms: data[:processing_time_ms],
|
|
252
262
|
warnings: data[:warnings] || [],
|
|
253
|
-
engine: data[:engine]
|
|
263
|
+
engine: data[:engine],
|
|
264
|
+
preprocessing: data[:preprocessing]
|
|
254
265
|
)
|
|
255
266
|
end
|
|
256
267
|
end
|
|
@@ -10,6 +10,7 @@ module ActiveStorage
|
|
|
10
10
|
# config.server_url = "http://localhost:9292"
|
|
11
11
|
# config.timeout = 60
|
|
12
12
|
# config.engine = :leptess # Use Tesseract engine instead of default ocrs
|
|
13
|
+
# config.preprocess = :aggressive # Use aggressive preprocessing
|
|
13
14
|
# end
|
|
14
15
|
#
|
|
15
16
|
# == Environment Variables
|
|
@@ -18,11 +19,15 @@ module ActiveStorage
|
|
|
18
19
|
# * +ACTIVESTORAGE_OCR_TIMEOUT+ - Request timeout in seconds (default: 30)
|
|
19
20
|
# * +ACTIVESTORAGE_OCR_OPEN_TIMEOUT+ - Connection timeout in seconds (default: 5)
|
|
20
21
|
# * +ACTIVESTORAGE_OCR_ENGINE+ - OCR engine to use: ocrs (default) or leptess
|
|
22
|
+
# * +ACTIVESTORAGE_OCR_PREPROCESS+ - Preprocessing preset: none, minimal, default, aggressive
|
|
21
23
|
#
|
|
22
24
|
class Configuration
|
|
23
25
|
# Valid OCR engine names
|
|
24
26
|
VALID_ENGINES = %i[ocrs leptess].freeze
|
|
25
27
|
|
|
28
|
+
# Valid preprocessing preset names
|
|
29
|
+
VALID_PREPROCESS = %i[none minimal default aggressive].freeze
|
|
30
|
+
|
|
26
31
|
# The URL of the OCR server.
|
|
27
32
|
attr_accessor :server_url
|
|
28
33
|
|
|
@@ -40,6 +45,11 @@ module ActiveStorage
|
|
|
40
45
|
# Use :leptess for Tesseract-based OCR (better for messy images).
|
|
41
46
|
attr_reader :engine
|
|
42
47
|
|
|
48
|
+
# The preprocessing preset to use (:none, :minimal, :default, :aggressive).
|
|
49
|
+
# Default is :default.
|
|
50
|
+
# Use :none to skip preprocessing, :aggressive for poor quality images.
|
|
51
|
+
attr_reader :preprocess
|
|
52
|
+
|
|
43
53
|
# Creates a new Configuration with default values.
|
|
44
54
|
#
|
|
45
55
|
# Defaults are read from environment variables if set.
|
|
@@ -49,6 +59,7 @@ module ActiveStorage
|
|
|
49
59
|
@open_timeout = ENV.fetch("ACTIVESTORAGE_OCR_OPEN_TIMEOUT", 5).to_i
|
|
50
60
|
@content_types = default_content_types
|
|
51
61
|
self.engine = ENV.fetch("ACTIVESTORAGE_OCR_ENGINE", "ocrs").to_sym
|
|
62
|
+
self.preprocess = ENV.fetch("ACTIVESTORAGE_OCR_PREPROCESS", "default").to_sym
|
|
52
63
|
end
|
|
53
64
|
|
|
54
65
|
# Set the OCR engine to use.
|
|
@@ -69,6 +80,25 @@ module ActiveStorage
|
|
|
69
80
|
@engine = value
|
|
70
81
|
end
|
|
71
82
|
|
|
83
|
+
# Set the preprocessing preset to use.
|
|
84
|
+
#
|
|
85
|
+
# ==== Parameters
|
|
86
|
+
#
|
|
87
|
+
# * +value+ - Preset name (:none, :minimal, :default, :aggressive)
|
|
88
|
+
#
|
|
89
|
+
# ==== Raises
|
|
90
|
+
#
|
|
91
|
+
# * +ArgumentError+ if an invalid preset name is provided
|
|
92
|
+
def preprocess=(value)
|
|
93
|
+
value = value.to_sym
|
|
94
|
+
unless VALID_PREPROCESS.include?(value)
|
|
95
|
+
raise ArgumentError,
|
|
96
|
+
"Invalid preprocess preset: #{value}. Valid presets: #{VALID_PREPROCESS.join(', ')}"
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
@preprocess = value
|
|
100
|
+
end
|
|
101
|
+
|
|
72
102
|
# Returns the default list of supported content types.
|
|
73
103
|
#
|
|
74
104
|
# Includes common image formats and PDF.
|
|
@@ -30,6 +30,10 @@ module ActiveStorage
|
|
|
30
30
|
# The OCR engine that processed this result (e.g., "ocrs" or "leptess").
|
|
31
31
|
attr_reader :engine
|
|
32
32
|
|
|
33
|
+
# Preprocessing statistics (Hash with :preset, :total_time_ms, :steps).
|
|
34
|
+
# nil if preprocessing was skipped.
|
|
35
|
+
attr_reader :preprocessing
|
|
36
|
+
|
|
33
37
|
# Creates a new Result.
|
|
34
38
|
#
|
|
35
39
|
# ==== Parameters
|
|
@@ -39,12 +43,14 @@ module ActiveStorage
|
|
|
39
43
|
# * +processing_time_ms+ - Processing time in milliseconds
|
|
40
44
|
# * +warnings+ - Array of warning messages (optional)
|
|
41
45
|
# * +engine+ - The OCR engine used (optional)
|
|
42
|
-
|
|
46
|
+
# * +preprocessing+ - Preprocessing stats hash (optional)
|
|
47
|
+
def initialize(text:, confidence:, processing_time_ms:, warnings: [], engine: nil, preprocessing: nil)
|
|
43
48
|
@text = text
|
|
44
49
|
@confidence = confidence
|
|
45
50
|
@processing_time_ms = processing_time_ms
|
|
46
51
|
@warnings = warnings
|
|
47
52
|
@engine = engine
|
|
53
|
+
@preprocessing = preprocessing
|
|
48
54
|
end
|
|
49
55
|
|
|
50
56
|
# Returns whether OCR successfully extracted text.
|
|
@@ -56,6 +62,16 @@ module ActiveStorage
|
|
|
56
62
|
!text.nil? && !text.empty?
|
|
57
63
|
end
|
|
58
64
|
|
|
65
|
+
# Returns the preprocessing time in milliseconds, or 0 if not preprocessed.
|
|
66
|
+
def preprocessing_time_ms
|
|
67
|
+
preprocessing&.dig(:total_time_ms) || 0
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
# Returns the preprocessing preset used, or nil if not preprocessed.
|
|
71
|
+
def preprocessing_preset
|
|
72
|
+
preprocessing&.dig(:preset)
|
|
73
|
+
end
|
|
74
|
+
|
|
59
75
|
# Converts the result to a Hash.
|
|
60
76
|
#
|
|
61
77
|
# ==== Returns
|
|
@@ -67,7 +83,8 @@ module ActiveStorage
|
|
|
67
83
|
confidence: confidence,
|
|
68
84
|
processing_time_ms: processing_time_ms,
|
|
69
85
|
warnings: warnings,
|
|
70
|
-
engine: engine
|
|
86
|
+
engine: engine,
|
|
87
|
+
preprocessing: preprocessing
|
|
71
88
|
}
|
|
72
89
|
end
|
|
73
90
|
|