tahweel 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,126 @@
1
+ # frozen_string_literal: true
2
+
3
require "fileutils"
require "pathname"
4
+
5
module Tahweel
  module CLI
    # Processes a single file by orchestrating conversion/extraction and writing the output.
    #
    # This class acts as the bridge between the CLI inputs and the core library logic.
    # It determines the file type (PDF or Image), calls the appropriate processing method,
    # and directs the results to the {Tahweel::Writer}.
    class FileProcessor
      # Processes the given file according to the provided options.
      #
      # @param file_path [String] The path to the input file.
      # @param options [Hash] Configuration options.
      # @option options [String] :output The directory to save output files
      #   (defaults to the directory of the input file).
      # @option options [Integer] :dpi DPI for PDF conversion.
      # @option options [Symbol] :processor The OCR processor to use (e.g., :google_drive).
      # @option options [Integer] :page_concurrency Max concurrent operations
      #   (defaults to Tahweel::Converter::DEFAULT_CONCURRENCY).
      # @option options [Array<Symbol>] :formats Output formats (e.g., [:txt, :docx]).
      # @option options [String] :page_separator Separator string for TXT output.
      # @option options [String] :base_input_path The base path used to determine relative output structure.
      # @param &block [Proc] A block that will be yielded with progress info.
      # @yield [Hash] Progress info: {
      #   stage: :splitting or :ocr,
      #   current_page: Integer,
      #   percentage: Float,
      #   remaining_pages: Integer
      # }
      # @return [void]
      def self.process(file_path, options, &) = new(file_path, options).process(&)

      # Initializes a new FileProcessor.
      #
      # @param file_path [String] The path to the input file.
      # @param options [Hash] Configuration options (see {.process}).
      def initialize(file_path, options)
        @file_path = file_path
        @options = options
      end

      # Executes the processing logic.
      #
      # 1. Ensures the output directory exists.
      # 2. Checks if output files already exist to avoid redundant processing.
      # 3. Detects if the input is a PDF or an image.
      # 4. Runs the appropriate conversion/extraction pipeline.
      # 5. Writes the results to the configured formats.
      #
      # @param &block [Proc] A block that will be yielded with progress info (PDF input only).
      # @yield [Hash] Progress info: {
      #   stage: :splitting or :ocr,
      #   current_page: Integer,
      #   percentage: Float,
      #   remaining_pages: Integer
      # }
      # @return [void]
      def process(&)
        ensure_output_directory_exists

        return if all_outputs_exist?

        pdf? ? process_pdf(&) : process_image
      end

      private

      def ensure_output_directory_exists = FileUtils.mkdir_p(output_directory)

      # True when every requested format already has an output file next to
      # {#base_output_path}; used to skip files processed on a previous run.
      def all_outputs_exist?
        @options[:formats].all? do |format|
          extension = Tahweel::Writer.new(format: format).extension
          File.exist?("#{base_output_path}.#{extension}")
        end
      end

      # Anything that is not a ".pdf" (case-insensitive) is treated as an image.
      def pdf? = File.extname(@file_path).downcase == ".pdf"

      def process_pdf(&)
        texts = Tahweel.convert(
          @file_path,
          dpi: @options[:dpi],
          processor: @options[:processor],
          concurrency: @options.fetch(:page_concurrency, Tahweel::Converter::DEFAULT_CONCURRENCY),
          &
        )

        write_output(texts)
      end

      # An image is written as a single-page document.
      def process_image = write_output([Tahweel.extract(@file_path, processor: @options[:processor])])

      def write_output(texts)
        Tahweel::Writer.write(
          texts,
          base_output_path,
          formats: @options[:formats],
          page_separator: @options[:page_separator]
        )
      end

      # Output path without extension: the output directory plus the input's base name.
      def base_output_path = File.join(output_directory, File.basename(@file_path, ".*"))

      # Determines the output directory.
      #
      # If an output option is provided, it attempts to preserve the directory structure
      # relative to the `base_input_path`. If `base_input_path` is not provided or the
      # relative path cannot be computed, it falls back to the provided output directory.
      #
      # If no output option is provided, it defaults to the file's own directory.
      #
      # @return [String] The target output directory.
      def output_directory
        output = @options[:output]
        return File.dirname(@file_path) unless output

        relative_dir = relative_input_directory
        return output if relative_dir.nil? || relative_dir == "."

        File.join(output, relative_dir)
      end

      # Directory of the input file relative to `base_input_path`.
      #
      # @return [String, nil] The relative directory, or nil when no base path was given
      #   or the two paths cannot be related (e.g. one is absolute and the other relative).
      def relative_input_directory
        base = @options[:base_input_path]
        return unless base

        Pathname.new(File.dirname(@file_path)).relative_path_from(Pathname.new(base)).to_s
      rescue ArgumentError
        # Pathname#relative_path_from raises when the paths have different prefixes;
        # the documented behavior is to fall back to the plain output directory.
        nil
      end
    end
  end
end
@@ -0,0 +1,115 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "etc"
4
+ require "optparse"
5
+
6
module Tahweel
  module CLI
    # Parses command-line arguments for the Tahweel CLI.
    class Options
      # Decimal positive integer: optional leading "+", no leading zero,
      # optional "_" separators between digit groups (e.g. "1_000").
      POSITIVE_INTEGER = /\A\+?[1-9]\d*(?:_\d+)*\z/

      # Parses the command-line arguments.
      #
      # Option switches are removed from `args` in place; what remains are the
      # positional arguments. On a parse error the process is aborted with an
      # "Error: ..." message; when no positional argument is left, the usage text
      # is printed and the process exits with status 1.
      #
      # @param args [Array<String>] The command-line arguments.
      # @return [Hash] The parsed options.
      def self.parse(args)
        options = default_options
        parser = OptionParser.new { configure_parser(_1, options) }
        begin
          parser.parse!(args)
        rescue OptionParser::ParseError => e
          abort "Error: #{e.message}"
        end

        validate_args!(args, parser)
        options
      end

      # Default values for every option that has one.
      #
      # @return [Hash] A fresh options hash.
      def self.default_options
        {
          dpi: 150,
          processor: :google_drive,
          page_concurrency: Tahweel::Converter::DEFAULT_CONCURRENCY,
          # Leave two processors free, but never go below two workers.
          file_concurrency: (Etc.nprocessors - 2).clamp(2..),
          output: nil,
          formats: %i[txt docx],
          page_separator: Tahweel::Writers::Txt::PAGE_SEPARATOR
        }
      end

      # Declares all switches on the parser; handlers store values into `options`.
      #
      # @param opts [OptionParser] The parser being configured.
      # @param options [Hash] The options hash, pre-filled with defaults (used in help texts).
      # @return [void]
      def self.configure_parser(opts, options) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
        opts.program_name = "tahweel"
        opts.version = Tahweel::VERSION

        # `accept` with a single argument registers the regexp only as the lookup
        # key (the match pattern defaults to /.*/m), so the value must be validated
        # here. Raising a ParseError subclass keeps bad input inside the
        # `rescue OptionParser::ParseError` in .parse instead of leaking an
        # ArgumentError from Integer().
        opts.accept(POSITIVE_INTEGER) do |value|
          raise OptionParser::InvalidArgument, "must be a positive integer" unless POSITIVE_INTEGER.match?(value)

          Integer(value, 10)
        end

        opts.on(
          "-e", "--extensions EXTENSIONS", Array,
          "Comma-separated list of file extensions to process " \
          "(default: #{Tahweel::CLI::FileCollector::SUPPORTED_EXTENSIONS.join(", ")})"
        ) do |value|
          options[:extensions] = value
        end

        opts.on("--dpi DPI", POSITIVE_INTEGER, "DPI for PDF to Image conversion (default: #{options[:dpi]})") do |value|
          options[:dpi] = value
        end

        opts.on(
          "-p", "--processor PROCESSOR", Tahweel::Ocr::AVAILABLE_PROCESSORS,
          "OCR processor to use (default: google_drive). Available: #{Tahweel::Ocr::AVAILABLE_PROCESSORS.join(", ")}"
        ) do |value|
          options[:processor] = value
        end

        opts.on(
          "-P", "--page-concurrency PAGE_CONCURRENCY", POSITIVE_INTEGER,
          "Max concurrent OCR operations (default: #{options[:page_concurrency]})"
        ) do |value|
          options[:page_concurrency] = value
        end

        opts.on(
          "-F", "--file-concurrency FILE_CONCURRENCY", POSITIVE_INTEGER,
          "Max concurrent files to process (default: CPUs - 2 = #{options[:file_concurrency]})"
        ) do |value|
          options[:file_concurrency] = value
        end

        # The help text is derived from the actual defaults so the two cannot drift apart.
        opts.on(
          "-f", "--formats FORMATS", Array,
          "Output formats (comma-separated, default: #{options[:formats].join(", ")}). " \
          "Available: #{Tahweel::Writer::AVAILABLE_FORMATS.join(", ")}"
        ) do |value|
          formats = value.map(&:to_sym)

          invalid_formats = formats - Tahweel::Writer::AVAILABLE_FORMATS
          abort "Error: invalid format(s): #{invalid_formats.join(", ")}" if invalid_formats.any?

          options[:formats] = formats
        end

        # A literal backslash-n typed on the command line is turned into a real newline.
        opts.on(
          "--page-separator SEPARATOR", String,
          "Separator between pages in TXT output (default: #{options[:page_separator].gsub("\n", "\\n")})"
        ) do |value|
          options[:page_separator] = value.gsub("\\n", "\n")
        end

        opts.on("-o", "--output DIR", String, "Output directory (default: current directory)") do |value|
          options[:output] = value
        end
      end

      # Prints the usage text and exits with status 1 when no positional argument remains.
      #
      # @param args [Array<String>] Arguments left after option parsing.
      # @param parser [OptionParser] The parser, used for its usage text.
      # @return [void]
      def self.validate_args!(args, parser)
        return unless args.empty?

        puts parser
        exit 1
      end
    end
  end
end
@@ -0,0 +1,148 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "io/console"
4
+
5
module Tahweel
  module CLI
    # Thread-safe renderer for the CLI progress dashboard.
    #
    # Draws a multi-line block with ANSI escape codes: one line of global status
    # followed by one line per worker thread, redrawn in place by a background ticker.
    class ProgressRenderer
      # ANSI Color Codes
      RESET = "\e[0m"
      BOLD = "\e[1m"
      RED = "\e[31m"
      GREEN = "\e[32m"
      YELLOW = "\e[33m"
      BLUE = "\e[34m"
      CYAN = "\e[36m"
      DIM = "\e[2m"

      # Sets up the renderer state, prepares the terminal and starts the redraw ticker.
      #
      # @param total_files [Integer] Total number of files to process.
      # @param concurrency [Integer] Number of concurrent worker threads.
      def initialize(total_files, concurrency)
        @total_files = total_files
        @concurrency = concurrency
        @processed_files = 0
        @worker_states = Array.new(concurrency)
        @mutex = Mutex.new
        @start_time = Time.now
        @running = true

        prepare_terminal
        install_interrupt_handler
        start_ticker
      end

      # Records that a worker has picked up a new file.
      #
      # @param worker_index [Integer] The index of the worker thread (0-based).
      # @param file [String] The path of the file being started.
      def start_file(worker_index, file)
        @mutex.synchronize do
          @worker_states[worker_index] = { file:, stage: "Starting...", percentage: 0, details: "" }
        end
      end

      # Applies a progress report to a worker's state; ignored while the worker is idle.
      #
      # @param worker_index [Integer] The index of the worker thread.
      # @param progress [Hash] The progress hash containing stage, percentage, etc.
      def update(worker_index, progress)
        @mutex.synchronize do
          state = @worker_states[worker_index]
          next if state.nil?

          stage_label = progress[:stage].to_s.capitalize
          page = progress[:current_page]
          page_total = page + progress[:remaining_pages]

          state[:stage] = stage_label
          state[:percentage] = progress[:percentage]
          state[:details] = "(#{page}/#{page_total})"
        end
      end

      # Counts the worker's current file as processed and marks the worker idle.
      #
      # @param worker_index [Integer] The index of the worker thread.
      def finish_file(worker_index)
        @mutex.synchronize do
          @processed_files += 1
          @worker_states[worker_index] = nil # Idle
        end
      end

      # Stops the ticker, draws the final state and shows the cursor again.
      def finish_all
        @running = false
        @ticker_thread&.join
        render # Ensure final state is drawn
        $stdout.print "\e[?25h"
      end

      private

      # Hides the cursor and reserves one line for the global status plus one per worker.
      def prepare_terminal
        $stdout.print "\e[?25l"
        $stdout.print "\n" * (@concurrency + 1)
      end

      # On Ctrl-C: stop redrawing, show the cursor again and exit.
      def install_interrupt_handler
        trap("INT") do
          @running = false
          $stdout.print "\e[?25h"
          exit
        end
      end

      # Background thread that redraws the dashboard roughly every 0.1s while running.
      def start_ticker
        @ticker_thread = Thread.new do
          loop do
            break unless @running

            sleep 0.1
            next unless @running

            @mutex.synchronize { render }
          end
        end
      end

      # Redraws the whole block: cursor up to its first line, then every line in order.
      def render
        $stdout.print "\e[#{@concurrency + 1}A"

        draw_line(summary_line)
        (1..@concurrency).each do |number|
          draw_line(worker_line(number, @worker_states[number - 1]))
        end

        $stdout.flush
      end

      # Clears the current terminal line and writes `text` followed by a newline.
      def draw_line(text)
        $stdout.print "\r\e[K"
        $stdout.puts text
      end

      # Global status: processed/total files, percentage and elapsed seconds.
      def summary_line
        percent =
          if @total_files.positive?
            ((@processed_files.to_f / @total_files) * 100).round(1)
          else
            0
          end
        elapsed = (Time.now - @start_time).round

        "#{BOLD}Total Progress:#{RESET} [#{GREEN}#{@processed_files}#{RESET}/#{@total_files}] #{CYAN}#{percent}%#{RESET} | Time: #{YELLOW}#{elapsed}s#{RESET}" # rubocop:disable Layout/LineLength
      end

      # Status line for one worker (1-based `number`); `state` is nil when the worker is idle.
      def worker_line(number, state)
        return " [Worker #{number}] #{DIM}Idle#{RESET}" unless state

        # Fixed-width file name (truncated from the beginning) to avoid wrapping issues.
        name = truncate_path(state[:file], 40)
        stage = state[:stage]
        color = stage == "Splitting" ? BLUE : YELLOW

        " [Worker #{number}] #{CYAN}#{name}#{RESET} | #{color}#{stage.ljust(10)}#{RESET} | #{GREEN}#{state[:percentage].to_s.rjust(5)}%#{RESET} #{DIM}#{state[:details]}#{RESET}" # rubocop:disable Layout/LineLength
      end

      # Pads short paths to `max_length`; longer ones keep their tail behind a "..." prefix.
      def truncate_path(path, max_length)
        return path.ljust(max_length) unless path.length > max_length

        tail_start = 3 - max_length
        "...#{path[tail_start..]}"
      end
    end
  end
end
@@ -0,0 +1,131 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "pdf_splitter"
4
+ require_relative "ocr"
5
+ require "fileutils"
6
+
7
module Tahweel
  # Runs the complete PDF-to-text pipeline:
  # 1. Splits the PDF into page images.
  # 2. OCRs the images concurrently.
  # 3. Returns the page texts in page order.
  # 4. Removes the temporary image folder.
  class Converter
    # Max concurrent OCR operations to avoid hitting API rate limits too hard.
    DEFAULT_CONCURRENCY = 12

    # Shortcut for `new(...).convert`.
    #
    # @param pdf_path [String] Path to the PDF file.
    # @param dpi [Integer] DPI for PDF to image conversion (default: PdfSplitter::DEFAULT_DPI).
    # @param processor [Symbol] OCR processor to use (default: :google_drive).
    # @param concurrency [Integer] Max concurrent OCR operations (default: 12).
    # @param &block [Proc] A block that will be yielded with progress info.
    # @yield [Hash] Progress info; see {#convert}.
    # @return [Array<String>] An array containing the text of each page.
    def self.convert(
      pdf_path,
      dpi: PdfSplitter::DEFAULT_DPI,
      processor: :google_drive,
      concurrency: DEFAULT_CONCURRENCY,
      &
    )
      new(pdf_path, dpi:, processor:, concurrency:).convert(&)
    end

    # Stores the conversion settings; no work happens until {#convert}.
    #
    # @param pdf_path [String] Path to the PDF file.
    # @param dpi [Integer] DPI for PDF to image conversion.
    # @param processor [Symbol] OCR processor to use.
    # @param concurrency [Integer] Max concurrent OCR operations.
    def initialize(pdf_path, dpi: PdfSplitter::DEFAULT_DPI, processor: :google_drive, concurrency: DEFAULT_CONCURRENCY)
      @pdf_path = pdf_path
      @dpi = dpi
      @processor_type = processor
      @concurrency = concurrency
    end

    # Executes the conversion process.
    #
    # The block is handed to the PDF splitter as well as used for OCR progress.
    # OCR progress hashes carry: file_path, stage (:ocr), current_page,
    # percentage and remaining_pages.
    #
    # @param &block [Proc] A block that will be yielded with progress info.
    # @yield [Hash] Progress info: {
    #   stage: :splitting or :ocr,
    #   current_page: Integer,
    #   percentage: Float,
    #   remaining_pages: Integer
    # }
    # @return [Array<String>] An array containing the text of each page.
    def convert(&)
      split_result = PdfSplitter.split(@pdf_path, dpi: @dpi, &)
      pages, scratch_dir = split_result.values_at(:image_paths, :folder_path)

      begin
        engine = Ocr.new(processor: @processor_type)
        process_images(pages, engine, &)
      ensure
        # The page images are only needed for OCR; remove them even on failure.
        FileUtils.rm_rf(scratch_dir)
      end
    end

    private

    # OCRs all images and returns their texts, indexed like `image_paths`.
    def process_images(image_paths, ocr_engine, &on_progress)
      results = Array.new(image_paths.size)
      lock = Mutex.new
      completed = 0

      # The block runs while the worker holds `lock`, so the counter needs no extra guard.
      run_workers(build_queue(image_paths), ocr_engine, results, lock) do
        completed += 1
        report_progress(completed, image_paths.size, &on_progress)
      end

      results
    end

    # Queue of [path, index] jobs, closed so that popping past the last job returns nil.
    def build_queue(image_paths)
      Queue.new.tap do |jobs|
        image_paths.each_with_index { |path, index| jobs << [path, index] }
        jobs.close
      end
    end

    # Starts `@concurrency` worker threads and waits for all of them.
    def run_workers(queue, ocr_engine, texts, mutex, &on_saved)
      workers = Array.new(@concurrency) do
        Thread.new { process_queue_items(queue, ocr_engine, texts, mutex, &on_saved) }
      end
      workers.each(&:join)
    end

    # Worker loop: take jobs until the queue is drained, OCR each image and store
    # the text at its page index, invoking the block once per stored page.
    def process_queue_items(queue, ocr_engine, texts, mutex, &on_saved)
      while (job = queue.pop)
        path, index = job
        text = ocr_engine.extract(path)

        mutex.synchronize do
          texts[index] = text
          on_saved.call
        end
      end
    end

    # Yields an OCR progress hash when a progress block was given.
    def report_progress(processed, total)
      return unless block_given?

      percentage = ((processed.to_f / total) * 100).round(2)
      yield({
        file_path: @pdf_path,
        stage: :ocr,
        current_page: processed,
        percentage:,
        remaining_pages: total - processed
      })
    end
  end
end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "processors/google_drive"
4
+
5
module Tahweel
  # Entry point for Optical Character Recognition (OCR).
  # Selects a processor by name and delegates the actual text extraction to it.
  #
  # @example Usage with default processor (Google Drive)
  #   text = Tahweel::Ocr.extract("image.png")
  #
  # @example Usage with a specific processor (Future-proofing)
  #   # text = Tahweel::Ocr.extract("image.png", processor: :tesseract)
  class Ocr
    AVAILABLE_PROCESSORS = [:google_drive].freeze

    # Processor name => factory. Lambdas keep the processor classes from being
    # resolved until one is actually requested.
    PROCESSOR_FACTORIES = {
      google_drive: -> { Processors::GoogleDrive.new }
    }.freeze
    private_constant :PROCESSOR_FACTORIES

    # Builds an engine for `processor` and extracts the text of a single file.
    #
    # @param file_path [String] Path to the image file.
    # @param processor [Symbol] The processor to use (default: :google_drive).
    # @return [String] The extracted text.
    def self.extract(file_path, processor: :google_drive)
      new(processor:).extract(file_path)
    end

    # Initializes the OCR engine with a specific processor strategy.
    #
    # @param processor [Symbol] The processor to use (default: :google_drive).
    # @raise [ArgumentError] If an unknown processor is specified.
    def initialize(processor: :google_drive)
      factory = PROCESSOR_FACTORIES.fetch(processor) do
        raise ArgumentError, "Unknown processor: #{processor}"
      end

      @processor = factory.call
    end

    # Extracts text from the file using the configured processor.
    #
    # @param file_path [String] Path to the image file.
    # @return [String] The extracted text.
    def extract(file_path)
      @processor.extract(file_path)
    end
  end
end