tahweel 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.vscode/extensions.json +15 -0
- data/.vscode/settings.json +16 -0
- data/CHANGELOG.md +54 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE.txt +21 -0
- data/README.md +43 -0
- data/Rakefile +12 -0
- data/bin/tahweel +67 -0
- data/bin/tahweel-clear +14 -0
- data/lib/tahweel/authorizer.rb +166 -0
- data/lib/tahweel/cli/file_collector.rb +33 -0
- data/lib/tahweel/cli/file_processor.rb +126 -0
- data/lib/tahweel/cli/options.rb +115 -0
- data/lib/tahweel/cli/progress_renderer.rb +148 -0
- data/lib/tahweel/converter.rb +131 -0
- data/lib/tahweel/ocr.rb +42 -0
- data/lib/tahweel/pdf_splitter.rb +147 -0
- data/lib/tahweel/processors/google_drive.rb +109 -0
- data/lib/tahweel/templates/success.html +192 -0
- data/lib/tahweel/version.rb +5 -0
- data/lib/tahweel/writer.rb +49 -0
- data/lib/tahweel/writers/docx.rb +82 -0
- data/lib/tahweel/writers/json.rb +32 -0
- data/lib/tahweel/writers/txt.rb +27 -0
- data/lib/tahweel.rb +37 -0
- data/mise.toml +2 -0
- data/sig/tahweel.rbs +4 -0
- metadata +173 -0
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "pathname"
|
|
4
|
+
|
|
5
|
+
module Tahweel
|
|
6
|
+
module CLI
|
|
7
|
+
# Processes a single file by orchestrating conversion/extraction and writing the output.
|
|
8
|
+
#
|
|
9
|
+
# This class acts as the bridge between the CLI inputs and the core library logic.
|
|
10
|
+
# It determines the file type (PDF or Image), calls the appropriate processing method,
|
|
11
|
+
# and directs the results to the {Tahweel::Writer}.
|
|
12
|
+
class FileProcessor
  # Processes the given file according to the provided options.
  #
  # @param file_path [String] The path to the input file.
  # @param options [Hash] Configuration options.
  # @option options [String] :output The directory to save output files (defaults to the input file's directory).
  # @option options [Integer] :dpi DPI for PDF conversion.
  # @option options [Symbol] :processor The OCR processor to use (e.g., :google_drive).
  # @option options [Integer] :page_concurrency Max concurrent operations.
  # @option options [Array<Symbol>] :formats Output formats (e.g., [:txt, :docx]).
  # @option options [String] :page_separator Separator string for TXT output.
  # @option options [String] :base_input_path The base path used to determine relative output structure.
  # @param &block [Proc] A block that will be yielded with progress info.
  # @yield [Hash] Progress info: {
  #   stage: :splitting or :ocr,
  #   current_page: Integer,
  #   percentage: Float,
  #   remaining_pages: Integer
  # }
  # @return [void]
  def self.process(file_path, options, &) = new(file_path, options).process(&)

  # Initializes a new FileProcessor.
  #
  # @param file_path [String] The path to the input file.
  # @param options [Hash] Configuration options (see {.process}).
  def initialize(file_path, options)
    @file_path = file_path
    @options = options
  end

  # Executes the processing logic.
  #
  # 1. Ensures the output directory exists.
  # 2. Checks if output files already exist to avoid redundant processing.
  # 3. Detects if the input is a PDF or an image.
  # 4. Runs the appropriate conversion/extraction pipeline.
  # 5. Writes the results to the configured formats.
  #
  # @param &block [Proc] A block that will be yielded with progress info
  #   (only forwarded to the PDF pipeline; image extraction reports no progress).
  # @yield [Hash] Progress info: {
  #   stage: :splitting or :ocr,
  #   current_page: Integer,
  #   percentage: Float,
  #   remaining_pages: Integer
  # }
  # @return [void]
  def process(&)
    ensure_output_directory_exists

    return if all_outputs_exist?

    pdf? ? process_pdf(&) : process_image
  end

  private

  # Creates the output directory (and any missing parents).
  def ensure_output_directory_exists = FileUtils.mkdir_p(output_directory)

  # @return [Boolean] true when an output file already exists for every requested format.
  def all_outputs_exist?
    @options[:formats].all? do |format|
      extension = Tahweel::Writer.new(format: format).extension
      File.exist?("#{base_output_path}.#{extension}")
    end
  end

  # @return [Boolean] true when the input file has a ".pdf" extension (case-insensitive).
  def pdf? = File.extname(@file_path).downcase == ".pdf"

  # Converts a PDF into per-page texts and writes them out.
  def process_pdf(&)
    texts = Tahweel.convert(
      @file_path,
      dpi: @options[:dpi],
      processor: @options[:processor],
      concurrency: @options.fetch(:page_concurrency, Tahweel::Converter::DEFAULT_CONCURRENCY),
      &
    )

    write_output(texts)
  end

  # Extracts text from a single image and writes it out as a one-page result.
  def process_image = write_output([Tahweel.extract(@file_path, processor: @options[:processor])])

  # Hands the extracted page texts to the writer for every configured format.
  #
  # @param texts [Array<String>] The text of each page.
  def write_output(texts)
    Tahweel::Writer.write(
      texts,
      base_output_path,
      formats: @options[:formats],
      page_separator: @options[:page_separator]
    )
  end

  # @return [String] Output path without extension (output directory + input basename).
  def base_output_path = File.join(output_directory, File.basename(@file_path, ".*"))

  # Determines the output directory.
  #
  # If an output option is provided, it attempts to preserve the directory structure
  # relative to the `base_input_path`. If `base_input_path` is not provided or the
  # calculation fails, it falls back to the provided output directory.
  #
  # If no output option is provided, it defaults to the file's own directory.
  #
  # @return [String] The target output directory.
  def output_directory
    return File.dirname(@file_path) unless @options[:output]

    relative_dir = relative_input_directory
    return @options[:output] if relative_dir.nil? || relative_dir == "."

    File.join(@options[:output], relative_dir)
  end

  # Computes the input file's directory relative to `base_input_path`.
  #
  # @return [String, nil] The relative directory, or nil when no base path is
  #   configured or the two paths cannot be related.
  def relative_input_directory
    base = @options[:base_input_path]
    return unless base

    Pathname.new(File.dirname(@file_path)).relative_path_from(Pathname.new(base)).to_s
  rescue ArgumentError
    # Pathname raises ArgumentError when the paths cannot be related (e.g. one is
    # absolute and the other relative); fall back to the plain output directory.
    nil
  end
end
|
|
125
|
+
end
|
|
126
|
+
end
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "etc"
|
|
4
|
+
require "optparse"
|
|
5
|
+
|
|
6
|
+
module Tahweel
|
|
7
|
+
module CLI
|
|
8
|
+
# Parses command-line arguments for the Tahweel CLI.
|
|
9
|
+
class Options
  # Matches a strictly positive decimal integer, optionally prefixed with "+"
  # and using underscores as digit separators (e.g. "12", "+3", "1_000").
  POSITIVE_INTEGER = /\A\+?[1-9]\d*(?:_\d+)*\z/

  # Parses the command-line arguments.
  #
  # Aborts the process with an error message on invalid options, and prints the
  # usage text and exits with status 1 when no positional arguments remain.
  #
  # @param args [Array<String>] The command-line arguments (option switches are removed in place).
  # @return [Hash] The parsed options.
  def self.parse(args)
    options = default_options
    parser = OptionParser.new { configure_parser(_1, options) }
    begin
      parser.parse!(args)
    rescue OptionParser::ParseError => e
      abort "Error: #{e.message}"
    end

    validate_args!(args, parser)
    options
  end

  # @return [Hash] A fresh hash holding the default value of every option.
  def self.default_options
    {
      dpi: 150,
      processor: :google_drive,
      page_concurrency: Tahweel::Converter::DEFAULT_CONCURRENCY,
      file_concurrency: (Etc.nprocessors - 2).clamp(2..),
      output: nil,
      formats: %i[txt docx],
      page_separator: Tahweel::Writers::Txt::PAGE_SEPARATOR
    }
  end

  # Registers every supported switch on the parser; parsed values are stored in `options`.
  #
  # @param opts [OptionParser] The parser to configure.
  # @param options [Hash] The options hash to fill (also used to render defaults in help text).
  # @return [void]
  def self.configure_parser(opts, options) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
    opts.program_name = "tahweel"
    opts.version = Tahweel::VERSION

    # Register the regexp as BOTH the type key and the match pattern. With the key
    # alone OptionParser matches anything (/.*/m), so malformed values would reach
    # Integer() and escape as a plain ArgumentError instead of a ParseError.
    opts.accept(POSITIVE_INTEGER, POSITIVE_INTEGER) do |value|
      n = Integer(value, 10)
      raise OptionParser::InvalidArgument, "must be a positive integer" if n < 1

      n
    end

    opts.on(
      "-e", "--extensions EXTENSIONS", Array,
      "Comma-separated list of file extensions to process " \
      "(default: #{Tahweel::CLI::FileCollector::SUPPORTED_EXTENSIONS.join(", ")})"
    ) do |value|
      options[:extensions] = value
    end

    opts.on("--dpi DPI", POSITIVE_INTEGER, "DPI for PDF to Image conversion (default: #{options[:dpi]})") do |value|
      options[:dpi] = value
    end

    opts.on(
      "-p", "--processor PROCESSOR", Tahweel::Ocr::AVAILABLE_PROCESSORS,
      "OCR processor to use (default: google_drive). Available: #{Tahweel::Ocr::AVAILABLE_PROCESSORS.join(", ")}"
    ) do |value|
      options[:processor] = value
    end

    opts.on(
      "-P", "--page-concurrency PAGE_CONCURRENCY", POSITIVE_INTEGER,
      "Max concurrent OCR operations (default: #{options[:page_concurrency]})"
    ) do |value|
      options[:page_concurrency] = value
    end

    opts.on(
      "-F", "--file-concurrency FILE_CONCURRENCY", POSITIVE_INTEGER,
      "Max concurrent files to process (default: CPUs - 2 = #{options[:file_concurrency]})"
    ) do |value|
      options[:file_concurrency] = value
    end

    opts.on(
      "-f", "--formats FORMATS", Array,
      "Output formats (comma-separated, default: #{options[:formats].join(", ")}). " \
      "Available: #{Tahweel::Writer::AVAILABLE_FORMATS.join(", ")}"
    ) do |value|
      options[:formats] = value.map(&:to_sym)

      invalid_formats = options[:formats] - Tahweel::Writer::AVAILABLE_FORMATS
      abort "Error: invalid format(s): #{invalid_formats.join(", ")}" if invalid_formats.any?
    end

    opts.on(
      "--page-separator SEPARATOR", String,
      "Separator between pages in TXT output (default: #{options[:page_separator].gsub("\n", "\\n")})"
    ) do |value|
      # Allow a literal "\n" typed on the command line to stand for a newline.
      options[:page_separator] = value.gsub("\\n", "\n")
    end

    opts.on("-o", "--output DIR", String, "Output directory (default: same directory as each input file)") do |value|
      options[:output] = value
    end
  end

  # Prints the usage text and exits with status 1 when no positional arguments were given.
  #
  # @param args [Array<String>] The arguments left after option parsing.
  # @param parser [OptionParser] The parser whose help text is printed.
  # @return [void]
  def self.validate_args!(args, parser)
    return unless args.empty?

    puts parser
    exit 1
  end
end
|
|
114
|
+
end
|
|
115
|
+
end
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "io/console"
|
|
4
|
+
|
|
5
|
+
module Tahweel
|
|
6
|
+
module CLI
|
|
7
|
+
# Handles thread-safe rendering of the progress dashboard for the CLI.
|
|
8
|
+
#
|
|
9
|
+
# This class manages ANSI escape codes to create a dynamic, multi-line progress display
|
|
10
|
+
# showing global status and individual worker threads.
|
|
11
|
+
class ProgressRenderer
  # ANSI Color Codes
  RESET = "\e[0m"
  BOLD = "\e[1m"
  RED = "\e[31m"
  GREEN = "\e[32m"
  YELLOW = "\e[33m"
  BLUE = "\e[34m"
  CYAN = "\e[36m"
  DIM = "\e[2m"

  # Initializes the renderer and prepares the terminal.
  #
  # Hides the cursor, reserves one line for the global status plus one per worker,
  # installs an INT trap that restores the cursor, and starts the refresh thread.
  #
  # @param total_files [Integer] Total number of files to process.
  # @param concurrency [Integer] Number of concurrent worker threads.
  def initialize(total_files, concurrency) # rubocop:disable Metrics/MethodLength
    @total_files = total_files
    @concurrency = concurrency
    @processed_files = 0
    @worker_states = Array.new(concurrency)
    @mutex = Mutex.new
    # Monotonic timestamp: elapsed time stays correct if the wall clock is adjusted.
    @start_time = monotonic_now
    @running = true

    # Hide cursor
    $stdout.print "\e[?25l"

    # Reserve space for global status + 1 line per worker
    $stdout.print "\n" * (@concurrency + 1)

    # Trap Interrupt to restore cursor
    trap("INT") do
      @running = false
      $stdout.print "\e[?25h"
      exit
    end

    start_ticker
  end

  # Updates the state for a worker starting a new file.
  #
  # @param worker_index [Integer] The index of the worker thread (0-based).
  # @param file [String] The path of the file being started.
  def start_file(worker_index, file)
    @mutex.synchronize do
      @worker_states[worker_index] = {
        file:,
        stage: "Starting...",
        percentage: 0,
        details: ""
      }
    end
  end

  # Updates the progress for a specific worker. Ignored when the worker is idle.
  #
  # @param worker_index [Integer] The index of the worker thread.
  # @param progress [Hash] The progress hash; reads :stage, :percentage,
  #   :current_page and :remaining_pages.
  def update(worker_index, progress)
    @mutex.synchronize do
      state = @worker_states[worker_index]
      next unless state

      current_page = progress[:current_page]
      total_pages = current_page + progress[:remaining_pages]

      state[:stage] = progress[:stage].to_s.capitalize
      state[:percentage] = progress[:percentage]
      state[:details] = "(#{current_page}/#{total_pages})"
    end
  end

  # Marks a worker as finished with its current file.
  #
  # @param worker_index [Integer] The index of the worker thread.
  def finish_file(worker_index)
    @mutex.synchronize do
      @processed_files += 1
      @worker_states[worker_index] = nil # Idle
    end
  end

  # Stops the refresh thread, draws the final state and restores the cursor.
  def finish_all
    @running = false
    @ticker_thread&.join
    # Same lock as the ticker renders, so the final frame sees a consistent state.
    @mutex.synchronize { render }
    $stdout.print "\e[?25h"
  end

  private

  # @return [Float] Seconds on the monotonic clock.
  def monotonic_now = Process.clock_gettime(Process::CLOCK_MONOTONIC)

  # Starts the background thread that redraws the dashboard every 0.1s while running.
  def start_ticker
    @ticker_thread = Thread.new do
      while @running
        sleep 0.1
        @mutex.synchronize { render } if @running
      end
    end
  end

  # Redraws the whole dashboard in place. Callers are expected to hold @mutex.
  def render # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
    # Move cursor up to the start of our block
    $stdout.print "\e[#{@concurrency + 1}A"

    # 1. Global Progress
    percent = @total_files.positive? ? ((@processed_files.to_f / @total_files) * 100).round(1) : 0
    elapsed = (monotonic_now - @start_time).round
    $stdout.print "\r\e[K" # Clear line
    puts "#{BOLD}Total Progress:#{RESET} [#{GREEN}#{@processed_files}#{RESET}/#{@total_files}] #{CYAN}#{percent}%#{RESET} | Time: #{YELLOW}#{elapsed}s#{RESET}" # rubocop:disable Layout/LineLength

    # 2. Worker Statuses
    @concurrency.times do |i|
      $stdout.print "\r\e[K" # Clear line
      state = @worker_states[i]
      if state
        # Limit filename length to avoid wrapping issues, truncate from beginning
        fname = truncate_path(state[:file], 40)
        stage_color = state[:stage] == "Splitting" ? BLUE : YELLOW
        puts " [Worker #{i + 1}] #{CYAN}#{fname}#{RESET} | #{stage_color}#{state[:stage].ljust(10)}#{RESET} | #{GREEN}#{state[:percentage].to_s.rjust(5)}%#{RESET} #{DIM}#{state[:details]}#{RESET}" # rubocop:disable Layout/LineLength
      else
        puts " [Worker #{i + 1}] #{DIM}Idle#{RESET}"
      end
    end

    $stdout.flush
  end

  # Pads short paths to `max_length`; longer paths keep their tail, prefixed with "...".
  #
  # @param path [String] The path to display.
  # @param max_length [Integer] The display width.
  # @return [String] The padded or truncated path.
  def truncate_path(path, max_length)
    return path.ljust(max_length) if path.length <= max_length

    "...#{path[-(max_length - 3)..]}"
  end
end
|
|
147
|
+
end
|
|
148
|
+
end
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "pdf_splitter"
|
|
4
|
+
require_relative "ocr"
|
|
5
|
+
require "fileutils"
|
|
6
|
+
|
|
7
|
+
module Tahweel
|
|
8
|
+
# Orchestrates the full conversion process:
|
|
9
|
+
# 1. Splits a PDF into images.
|
|
10
|
+
# 2. Performs OCR on each image concurrently.
|
|
11
|
+
# 3. Returns the aggregated text.
|
|
12
|
+
# 4. Cleans up temporary files.
|
|
13
|
+
class Converter
  # Max concurrent OCR operations to avoid hitting API rate limits too hard.
  DEFAULT_CONCURRENCY = 12

  # Convenience method to convert a PDF file to text.
  #
  # @param pdf_path [String] Path to the PDF file.
  # @param dpi [Integer] DPI for PDF to image conversion (default: PdfSplitter::DEFAULT_DPI).
  # @param processor [Symbol] OCR processor to use (default: :google_drive).
  # @param concurrency [Integer] Max concurrent OCR operations (default: 12).
  # @param &block [Proc] A block that will be yielded with progress info.
  # @yield [Hash] Progress info: {
  #   stage: :splitting or :ocr,
  #   current_page: Integer,
  #   percentage: Float,
  #   remaining_pages: Integer
  # }
  # @return [Array<String>] An array containing the text of each page.
  def self.convert(
    pdf_path,
    dpi: PdfSplitter::DEFAULT_DPI,
    processor: :google_drive,
    concurrency: DEFAULT_CONCURRENCY,
    &
  )
    new(pdf_path, dpi:, processor:, concurrency:).convert(&)
  end

  # Initializes the Converter.
  #
  # @param pdf_path [String] Path to the PDF file.
  # @param dpi [Integer] DPI for PDF to image conversion.
  # @param processor [Symbol] OCR processor to use.
  # @param concurrency [Integer] Max concurrent OCR operations.
  def initialize(pdf_path, dpi: PdfSplitter::DEFAULT_DPI, processor: :google_drive, concurrency: DEFAULT_CONCURRENCY)
    @pdf_path = pdf_path
    @dpi = dpi
    @processor_type = processor
    @concurrency = concurrency
  end

  # Executes the conversion process: splits the PDF into images, runs OCR on them
  # concurrently, and removes the temporary image folder afterwards (even on error).
  #
  # @param &block [Proc] A block that will be yielded with progress info.
  # @yield [Hash] Progress info: {
  #   stage: :splitting or :ocr,
  #   current_page: Integer,
  #   percentage: Float,
  #   remaining_pages: Integer
  # }
  # @return [Array<String>] An array containing the text of each page.
  def convert(&)
    image_paths, temp_dir = PdfSplitter.split(@pdf_path, dpi: @dpi, &).values_at(:image_paths, :folder_path)

    begin
      process_images(image_paths, Ocr.new(processor: @processor_type), &)
    ensure
      FileUtils.rm_rf(temp_dir)
    end
  end

  private

  # Runs OCR over all images and collects the texts in page order.
  #
  # The block parameter is named (rather than anonymous) because it is forwarded
  # from inside a block, which Ruby 3.3.0 rejects for anonymous `&`.
  #
  # @param image_paths [Array<String>] Page images, in page order.
  # @param ocr_engine [#extract] Object used to extract text from one image.
  # @return [Array<String>] The text of each page, indexed like `image_paths`.
  def process_images(image_paths, ocr_engine, &block)
    texts = Array.new(image_paths.size)
    mutex = Mutex.new
    processed_count = 0

    # The counter is only touched inside save_result's critical section.
    run_workers(build_queue(image_paths), ocr_engine, texts, mutex) do
      processed_count += 1
      report_progress(processed_count, image_paths.size, &block)
    end

    texts
  end

  # @param image_paths [Array<String>] Page images, in page order.
  # @return [Queue] A queue of [path, index] pairs.
  def build_queue(image_paths)
    queue = Queue.new
    image_paths.each_with_index { |path, index| queue << [path, index] }
    queue
  end

  # Drains the queue with worker threads and waits for all of them.
  # Never starts more threads than there are queued pages.
  def run_workers(queue, ocr_engine, texts, mutex, &block)
    worker_count = [@concurrency, queue.size].min

    Array.new(worker_count) do
      Thread.new { process_queue_items(queue, ocr_engine, texts, mutex, &block) }
    end.each(&:join)
  end

  # Worker loop: pops pages until the queue is empty, extracting and storing each text.
  def process_queue_items(queue, ocr_engine, texts, mutex, &block)
    loop do
      begin
        # Non-blocking pop: raises ThreadError once the queue is drained.
        path, index = queue.pop(true)
      rescue ThreadError
        break
      end

      text = ocr_engine.extract(path)
      save_result(texts, index, text, mutex, &block)
    end
  end

  # Stores one page's text and yields, both under the mutex.
  def save_result(texts, index, text, mutex)
    mutex.synchronize do
      texts[index] = text
      yield
    end
  end

  # Yields an OCR-stage progress hash to the given block, if any.
  #
  # @param processed [Integer] Number of pages finished so far.
  # @param total [Integer] Total number of pages.
  def report_progress(processed, total)
    return unless block_given?

    yield({
      file_path: @pdf_path,
      stage: :ocr,
      current_page: processed,
      percentage: ((processed.to_f / total) * 100).round(2),
      remaining_pages: total - processed
    })
  end
end
|
|
131
|
+
end
|
data/lib/tahweel/ocr.rb
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "processors/google_drive"
|
|
4
|
+
|
|
5
|
+
module Tahweel
|
|
6
|
+
# The main entry point for Optical Character Recognition (OCR).
|
|
7
|
+
# This class acts as a factory/strategy context, delegating the actual extraction logic
|
|
8
|
+
# to a specific processor.
|
|
9
|
+
#
|
|
10
|
+
# @example Usage with default processor (Google Drive)
|
|
11
|
+
# text = Tahweel::Ocr.extract("image.png")
|
|
12
|
+
#
|
|
13
|
+
# @example Usage with a specific processor (Future-proofing)
|
|
14
|
+
# # text = Tahweel::Ocr.extract("image.png", processor: :tesseract)
|
|
15
|
+
class Ocr
  # Symbols accepted as the `processor:` argument.
  AVAILABLE_PROCESSORS = %i[google_drive].freeze

  # One-shot helper: builds an engine for `processor` and extracts text from `file_path`.
  #
  # @param file_path [String] Path to the image file.
  # @param processor [Symbol] The processor to use (default: :google_drive).
  # @return [String] The extracted text.
  def self.extract(file_path, processor: :google_drive)
    engine = new(processor: processor)
    engine.extract(file_path)
  end

  # Builds an OCR engine backed by the requested processor strategy.
  #
  # @param processor [Symbol] The processor to use (default: :google_drive).
  # @raise [ArgumentError] If an unknown processor is specified.
  def initialize(processor: :google_drive)
    @processor = build_processor(processor)
  end

  # Delegates text extraction to the configured processor.
  #
  # @param file_path [String] Path to the image file.
  # @return [String] The extracted text.
  def extract(file_path)
    @processor.extract(file_path)
  end

  private

  # Maps a processor name to a new processor instance.
  #
  # @param kind [Symbol] The requested processor name.
  # @return [Object] The processor instance.
  # @raise [ArgumentError] If the name is not recognised.
  def build_processor(kind)
    case kind
    when :google_drive
      Processors::GoogleDrive.new
    else
      raise ArgumentError, "Unknown processor: #{kind}"
    end
  end
end
|
|
42
|
+
end
|