tahweel 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,147 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "fileutils"
4
+ require "rbconfig"
5
+ require "securerandom"
6
+ require "tmpdir"
7
+ require "vips"
8
+
9
+ module Tahweel
10
# Handles the logic for splitting a PDF file into individual image pages.
# Uses the libvips library (through the `Vips` bindings) to render each page.
class PdfSplitter
  # Default DPI used when converting PDF pages to images.
  # 150 DPI is a good balance between quality and file size for general documents.
  DEFAULT_DPI = 150

  # Convenience class method to initialize and execute the split operation in one go.
  #
  # @param pdf_path [String] The local file path to the PDF document.
  # @param dpi [Integer] The resolution (Dots Per Inch) for rendering the PDF pages. Defaults to 150.
  # @yield [Hash] Progress info, yielded once per extracted page: {
  #   file_path: String,
  #   stage: :splitting,
  #   current_page: Integer,
  #   percentage: Float,
  #   remaining_pages: Integer
  # }
  # @return [Hash] A hash containing the :folder_path (String) and :image_paths (Array<String>).
  def self.split(pdf_path, dpi: DEFAULT_DPI, &) = new(pdf_path, dpi:).split(&)

  # Initializes a new PdfSplitter instance.
  #
  # @param pdf_path [String] The local file path to the PDF document.
  # @param dpi [Integer] The resolution (Dots Per Inch) to use. Defaults to 150.
  def initialize(pdf_path, dpi: DEFAULT_DPI)
    @pdf_path = pdf_path
    @dpi = dpi
    @image_paths = []
  end

  # Executes the PDF splitting process.
  #
  # This method performs the following steps:
  # 1. Checks that the `vips` executable is available (skipped on Windows).
  # 2. Validates the existence of the source PDF file.
  # 3. Creates a fresh, unique temporary directory for output.
  # 4. Iterates through each page of the PDF and converts it to a PNG image.
  #
  # Each call writes into its own directory and returns only the images
  # produced by that call, so the instance can safely be reused.
  #
  # @yield [Hash] Progress info, yielded once per extracted page: {
  #   file_path: String,
  #   stage: :splitting,
  #   current_page: Integer,
  #   percentage: Float,
  #   remaining_pages: Integer
  # }
  # @return [Hash] Result hash with keys:
  #   - :folder_path [String] The temporary directory containing the images.
  #   - :image_paths [Array<String>] Paths of the generated image files, in page order.
  # @raise [RuntimeError] If the PDF file is not found.
  # @raise [SystemExit] If the `vips` executable is missing (the check uses Kernel#abort).
  # @raise [Vips::Error] If the underlying VIPS library encounters an error during processing.
  def split(&)
    check_libvips_installed!
    validate_file_exists!
    setup_output_directory
    process_pages(&)
    result
  end

  private

  attr_reader :pdf_path, :dpi, :image_paths, :output_dir

  # Checks if the `vips` CLI tool is available in the system PATH.
  # Skips this check on Windows hosts (mswin/mingw/cygwin).
  # Aborts execution with an error message if vips is missing.
  def check_libvips_installed!
    return if /mswin|mingw|cygwin/.match?(RbConfig::CONFIG["host_os"])
    return if system("vips --version", out: File::NULL, err: File::NULL)

    abort "Error: libvips is not installed. Please install it before using Tahweel.\n" \
          "MacOS: `brew install vips`\n" \
          "Ubuntu: `sudo apt install libvips42`\n" \
          "Windows: Already installed with the Tahweel gem"
  end

  # Ensures the source PDF file actually exists.
  # @raise [RuntimeError] if the file is missing.
  def validate_file_exists!
    raise "File not found: #{pdf_path}" unless File.exist?(pdf_path)
  end

  # Creates a unique temporary directory (named with a UUID) for this run.
  #
  # Also starts a fresh image list: a new array is assigned (rather than
  # clearing the old one) so results returned by an earlier #split call keep
  # their own paths and are not polluted by, or mixed into, this run.
  def setup_output_directory
    @image_paths = []
    @output_dir = File.join(Dir.tmpdir, "tahweel_#{SecureRandom.uuid}")
    FileUtils.mkdir_p(@output_dir)
  end

  # Iterates through all pages and extracts them, reporting progress after each page.
  #
  # @yield [Hash] Progress info: {
  #   file_path: String,
  #   stage: :splitting,
  #   current_page: Integer,
  #   percentage: Float,
  #   remaining_pages: Integer
  # }
  # @return [void]
  def process_pages(&)
    page_count = total_pages

    page_count.times do |i|
      extract_page(i)

      next unless block_given?

      current = i + 1
      yield({
        file_path: pdf_path, stage: :splitting,
        current_page: current,
        percentage: ((current.to_f / page_count) * 100).round(2),
        remaining_pages: page_count - current
      })
    end
  end

  # Reads the total number of pages from the "pdf-n_pages" metadata field of the first page.
  # The value is memoized for the lifetime of the instance.
  # @return [Integer] The page count.
  def total_pages
    @total_pages ||= Vips::Image.pdfload(pdf_path, page: 0, dpi: dpi, access: :sequential).get("pdf-n_pages")
  end

  # Extracts a specific page from the PDF and saves it as a PNG named `page_<n>.png` (1-based).
  #
  # @param page_num [Integer] The zero-based index of the page to extract.
  def extract_page(page_num)
    output_path = File.join(output_dir, "page_#{page_num + 1}.png")
    Vips::Image.pdfload(pdf_path, page: page_num, dpi: dpi, access: :sequential).write_to_file(output_path)
    image_paths << output_path
  end

  # Constructs the final result hash.
  # @return [Hash]
  def result
    {
      folder_path: output_dir,
      image_paths: image_paths
    }
  end
end
147
+ end
@@ -0,0 +1,109 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "google/apis/drive_v3"
4
+ require "securerandom"
5
+ require "stringio"
6
+
7
+ require_relative "../authorizer"
8
+
9
+ module Tahweel
10
+ module Processors
11
# Handles the conversion of images to text using Google Drive's OCR capabilities.
#
# This class automates the process of:
# 1. Uploading a local image to Google Drive as a Google Document.
# 2. Downloading the content of that document as plain text.
# 3. Cleaning up (deleting) the temporary file from Drive.
#
# Rate-limit, timeout, transmission and server errors are retried without an
# upper bound, using a capped exponential backoff with jitter.
class GoogleDrive
  # Initializes the Google Drive OCR service.
  # Sets up the Google Drive API client and authorizes it using {Tahweel::Authorizer}.
  #
  # @note This operation performs filesystem I/O to read credentials.
  #   For bulk processing, instantiate this once and reuse it.
  def initialize
    @service = Google::Apis::DriveV3::DriveService.new
    @service.client_options.application_name = "Tahweel"
    @service.authorization = Tahweel::Authorizer.authorize
  end

  # Extracts text from an image file using the "Upload -> Export -> Delete" flow.
  #
  # Once the upload has produced a file id, the temporary Drive file is deleted
  # regardless of whether the download succeeds or fails. If the upload itself
  # fails there is nothing to clean up, so no delete request is issued and the
  # upload error propagates untouched.
  #
  # The exported text has CRLF line endings converted to LF, runs of sixteen
  # underscores removed, and surrounding whitespace stripped.
  #
  # @param file_path [String] The path to the image file.
  # @return [String] The extracted text.
  # @raise [RuntimeError] If the file does not exist locally.
  # @raise [Google::Apis::Error] If a non-retriable API error occurs (e.g., 401, 403, 404).
  def extract(file_path)
    raise "File not found: #{file_path}" unless File.exist?(file_path)

    file_id = nil
    begin
      file_id = upload_file(file_path)
      download_text(file_id).gsub("\r\n", "\n").gsub("________________", "").strip
    ensure
      # file_id is still nil when the upload raised; deleting "nil" would send a
      # bogus request and could replace the real error with a misleading one.
      delete_file(file_id) if file_id
    end
  end

  private

  # Uploads the file to Google Drive with the MIME type set to 'application/vnd.google-apps.document'.
  # This triggers Google's automatic OCR processing. The remote file gets a random UUID as its name.
  #
  # @param file_path [String] Path to the local file.
  # @return [String] The ID of the created file on Google Drive.
  def upload_file(file_path)
    execute_with_retry do
      @service.create_file(
        {
          name: SecureRandom.uuid,
          mime_type: "application/vnd.google-apps.document"
        },
        upload_source: file_path,
        fields: "id"
      ).id
    end
  end

  # Exports the Google Document as plain text.
  #
  # @param file_id [String] The ID of the file on Google Drive.
  # @return [String] The content of the file as a string.
  def download_text(file_id)
    execute_with_retry do
      # A fresh buffer per attempt, so a retried export never appends to partial data.
      StringIO.new.tap do |dest|
        @service.export_file(file_id, "text/plain", download_dest: dest)
      end.string
    end
  end

  # Deletes the temporary file from Google Drive.
  #
  # @param file_id [String] The ID of the file to delete.
  # @return [void]
  def delete_file(file_id) = execute_with_retry { @service.delete_file(file_id) }

  # Executes a block with infinite retries and exponential backoff.
  # Designed to handle transient errors (Rate Limits, Network issues, Server errors).
  #
  # The delay before attempt n is min(1.5**n, 15) seconds plus 0 or 1 second of jitter.
  #
  # @yield The block to execute.
  # @return [Object] Whatever the block returns on its first successful run.
  # @raise [Google::Apis::Error] Rethrows non-retriable errors immediately.
  def execute_with_retry
    retries = 0

    begin
      yield
    rescue Google::Apis::RateLimitError, Google::Apis::RequestTimeOutError,
           Google::Apis::TransmissionError, Google::Apis::ServerError
      sleep([1.5**retries, 15].min + rand(0..1))
      retries += 1
      retry
    end
  end
end
108
+ end
109
+ end
@@ -0,0 +1,192 @@
1
+ <!DOCTYPE html>
2
+ <html lang="ar" dir="rtl">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Tahweel Authorization</title>
7
+ <!-- Google Fonts: Poppins for English, Cairo for Arabic -->
8
+ <link rel="preconnect" href="https://fonts.googleapis.com">
9
+ <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
10
+ <link href="https://fonts.googleapis.com/css2?family=Cairo:wght@400;700&family=Poppins:wght@400;600&display=swap" rel="stylesheet">
11
+ <style>
12
+ :root {
13
+ --primary-color: #10b981; /* Emerald Green */
14
+ --primary-hover: #059669;
15
+ --bg-gradient: linear-gradient(135deg, #f0fdf4 0%, #d1fae5 100%);
16
+ --text-dark: #1f2937;
17
+ --text-light: #6b7280;
18
+ --card-shadow: 0 10px 25px -5px rgba(0, 0, 0, 0.1), 0 8px 10px -6px rgba(0, 0, 0, 0.1);
19
+ }
20
+
21
+ body {
22
+ margin: 0;
23
+ min-height: 100vh;
24
+ display: flex;
25
+ align-items: center;
26
+ justify-content: center;
27
+ background: var(--bg-gradient);
28
+ font-family: 'Cairo', sans-serif; /* Default to Arabic font */
29
+ transition: all 0.3s ease;
30
+ }
31
+
32
+ /* English Font Override */
33
+ body.font-en {
34
+ font-family: 'Poppins', sans-serif;
35
+ }
36
+
37
+ .container {
38
+ background: rgba(255, 255, 255, 0.95);
39
+ padding: 3rem;
40
+ border-radius: 24px;
41
+ box-shadow: var(--card-shadow);
42
+ text-align: center;
43
+ max-width: 420px;
44
+ width: 90%;
45
+ position: relative;
46
+ backdrop-filter: blur(10px);
47
+ animation: fadeUp 0.6s cubic-bezier(0.16, 1, 0.3, 1);
48
+ border: 1px solid rgba(255,255,255,0.5);
49
+ }
50
+
51
+ /* Success Icon Animation */
52
+ .icon-wrapper {
53
+ width: 80px;
54
+ height: 80px;
55
+ background: #d1fae5;
56
+ border-radius: 50%;
57
+ display: flex;
58
+ align-items: center;
59
+ justify-content: center;
60
+ margin: 0 auto 1.5rem auto;
61
+ color: var(--primary-color);
62
+ }
63
+
64
+ .icon-wrapper svg {
65
+ width: 40px;
66
+ height: 40px;
67
+ stroke-width: 3;
68
+ stroke: currentColor;
69
+ fill: none;
70
+ stroke-linecap: round;
71
+ stroke-linejoin: round;
72
+ animation: drawCheck 0.8s ease-out forwards;
73
+ }
74
+
75
+ h1 {
76
+ color: var(--text-dark);
77
+ font-size: 1.5rem;
78
+ font-weight: 700;
79
+ margin: 0 0 0.5rem 0;
80
+ }
81
+
82
+ p {
83
+ color: var(--text-light);
84
+ font-size: 1rem;
85
+ line-height: 1.6;
86
+ margin: 0 0 1.5rem 0;
87
+ }
88
+
89
+ .sub-text {
90
+ font-size: 0.875rem;
91
+ opacity: 0.8;
92
+ margin-top: -1rem;
93
+ }
94
+
95
+ /* Language Toggle Button */
96
+ .lang-btn {
97
+ position: absolute;
98
+ top: 20px;
99
+ right: 20px;
100
+ background: white;
101
+ border: 1px solid #e5e7eb;
102
+ padding: 8px 16px;
103
+ border-radius: 20px;
104
+ font-family: inherit;
105
+ font-size: 0.875rem;
106
+ font-weight: 600;
107
+ color: var(--text-dark);
108
+ cursor: pointer;
109
+ box-shadow: 0 2px 5px rgba(0,0,0,0.05);
110
+ transition: all 0.2s ease;
111
+ display: flex;
112
+ align-items: center;
113
+ gap: 6px;
114
+ }
115
+
116
+ html[dir="ltr"] .lang-btn {
117
+ right: auto;
118
+ left: 20px;
119
+ }
120
+
121
+ .lang-btn:hover {
122
+ background: #f9fafb;
123
+ transform: translateY(-1px);
124
+ box-shadow: 0 4px 12px rgba(0,0,0,0.08);
125
+ }
126
+
127
+ /* Animations */
128
+ @keyframes fadeUp {
129
+ from { opacity: 0; transform: translateY(20px); }
130
+ to { opacity: 1; transform: translateY(0); }
131
+ }
132
+
133
+ @keyframes drawCheck {
134
+ 0% { stroke-dasharray: 100; stroke-dashoffset: 100; opacity: 0; transform: scale(0.8); }
135
+ 100% { stroke-dasharray: 100; stroke-dashoffset: 0; opacity: 1; transform: scale(1); }
136
+ }
137
+ </style>
138
+ </head>
139
+ <body>
140
+ <button class="lang-btn" onclick="toggleLanguage()" id="langBtn">
141
+ <span>🌐</span> <span id="btnText">English</span>
142
+ </button>
143
+ <div class="container">
144
+ <div class="icon-wrapper">
145
+ <!-- Simple SVG Checkmark -->
146
+ <svg viewBox="0 0 24 24">
147
+ <path d="M20 6L9 17l-5-5"></path>
148
+ </svg>
149
+ </div>
150
+ <h1 id="title">تمت المُصادقة بنجاح!</h1>
151
+ <p id="msg1">لقد قمت بتسجيل الدخول بنجاح إلى تحويل.</p>
152
+ <p id="msg2" class="sub-text">يمكنك إغلاق هذه النافذة والعودة إلى البرنامج.</p>
153
+ </div>
154
+ <script>
155
+ // Content Dictionary
156
+ const content = {
157
+ ar: {
158
+ btn: "English",
159
+ title: "تمت المُصادقة بنجاح!",
160
+ msg1: "لقد قمت بتسجيل الدخول بنجاح إلى تحويل.",
161
+ msg2: "يمكنك إغلاق هذه النافذة والعودة إلى البرنامج."
162
+ },
163
+ en: {
164
+ btn: "العربية",
165
+ title: "Authorization Successful!",
166
+ msg1: "You have successfully logged in to Tahweel.",
167
+ msg2: "You can close this window and return to your terminal."
168
+ }
169
+ };
170
+
171
+ let currentLang = 'ar';
172
+
173
// Switches the page between Arabic and English: flips the stored language,
// updates the <html> lang/dir attributes, swaps the body font class, and
// rewrites the four visible strings from the `content` dictionary.
function toggleLanguage() {
  const nextLang = currentLang === 'ar' ? 'en' : 'ar';
  currentLang = nextLang;
  const isArabic = nextLang === 'ar';

  // Document-level language and writing direction
  const root = document.documentElement;
  root.lang = nextLang;
  root.dir = isArabic ? 'rtl' : 'ltr';

  // The English font class is only present while English is active
  document.body.classList.toggle('font-en', !isArabic);

  // Element id -> translated text, applied in the same order as before
  const strings = content[nextLang];
  const updates = [
    ['btnText', strings.btn],
    ['title', strings.title],
    ['msg1', strings.msg1],
    ['msg2', strings.msg2]
  ];
  for (const [elementId, text] of updates) {
    document.getElementById(elementId).textContent = text;
  }
}
190
+ </script>
191
+ </body>
192
+ </html>
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module Tahweel
  # Version string of this release of the Tahweel gem.
  VERSION = "0.1.0"
end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "writers/txt"
4
+ require_relative "writers/docx"
5
+ require_relative "writers/json"
6
+
7
+ module Tahweel
8
# Front door for persisting extracted text: picks a format-specific writer
# strategy and delegates the actual file output to it.
class Writer
  AVAILABLE_FORMATS = %i[txt docx json].freeze

  # Writes the texts once per requested format.
  #
  # @param texts [Array<String>] The extracted texts.
  # @param base_path [String] The base output path (without extension).
  # @param formats [Array<Symbol>] The output formats (default: [:txt]).
  # @param options [Hash] Options forwarded to every writer.
  # @return [void]
  def self.write(texts, base_path, formats: [:txt], **options)
    formats.each do |format|
      new(format:).write(texts, base_path, **options)
    end
  end

  # Builds a Writer bound to the strategy for the given format.
  #
  # @param format [Symbol] The output format.
  # @raise [ArgumentError] If the format is unknown.
  def initialize(format: :txt)
    @writer = strategy_class(format).new
  end

  # Writes the texts through the selected strategy.
  # The destination is the base path followed by a dot and the strategy's extension.
  #
  # @param texts [Array<String>] The extracted texts.
  # @param base_path [String] The base output file path.
  # @param options [Hash] Options to pass to the writer (handed over as a plain hash).
  def write(texts, base_path, **options)
    destination = "#{base_path}.#{extension}"
    @writer.write(texts, destination, options)
  end

  # File extension reported by the selected strategy.
  #
  # @return [String] The file extension.
  def extension
    @writer.extension
  end

  private

  # Maps a format symbol to its writer class.
  #
  # @param format [Symbol] The requested format.
  # @return [Class] The writer strategy class.
  # @raise [ArgumentError] If the format is unknown.
  def strategy_class(format)
    case format
    when :txt then Writers::Txt
    when :docx then Writers::Docx
    when :json then Writers::Json
    else raise ArgumentError, "Unknown format: #{format}"
    end
  end
end
49
+ end
@@ -0,0 +1,82 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "caracal"
4
+
5
+ module Tahweel
6
+ module Writers
7
# Writer strategy that stores the extracted pages in a .docx document.
class Docx
  # File extension produced by this writer.
  #
  # @return [String] The file extension.
  def extension
    "docx"
  end

  # Writes the extracted texts to a .docx file, one paragraph per page with a
  # page break between consecutive pages.
  #
  # Each page is prepared before writing:
  # 1. Runs of `\r\n` become a single `\n`.
  # 2. Consecutive identical whitespace characters are collapsed to one.
  # 3. While the page is estimated at more than 40 lines, the shortest pair
  #    of adjacent lines is merged.
  # 4. Alignment (right/left) is chosen from the page content.
  #
  # @param texts [Array<String>] The extracted texts (one per page).
  # @param destination [String] The output file path.
  # @param options [Hash] Options for writing (unused for now).
  # @return [void]
  def write(texts, destination, options = {}) # rubocop:disable Lint/UnusedMethodArgument
    last_index = texts.size - 1

    Caracal::Document.save(destination) do |docx|
      texts.each_with_index do |raw_text, index|
        page_text = fit_to_page(normalize(raw_text))

        docx.p page_text, size: 20, align: alignment_for(page_text)

        docx.page unless index == last_index
      end
    end
  end

  private

  # Normalizes line endings, collapses repeated whitespace and trims the ends.
  #
  # @param text [String] Raw page text.
  # @return [String] The cleaned text.
  def normalize(text)
    unix_text = text.gsub(/(\r\n)+/, "\n")
    unix_text.gsub(/(\s)\1+/, '\1').strip
  end

  # Merges short adjacent lines until the page estimate is at most 40 lines.
  #
  # @param text [String] Cleaned page text.
  # @return [String] Text estimated to fit on one page.
  def fit_to_page(text)
    text = compact_shortest_lines(text) until expected_lines_in_page(text) <= 40
    text
  end

  # Chooses the paragraph alignment by comparing Arabic characters against
  # characters that are neither Arabic, punctuation, digits nor whitespace.
  #
  # @param text [String] The text to analyze.
  # @return [Symbol] :left only when the other characters outnumber the Arabic ones, :right otherwise.
  def alignment_for(text)
    arabic_total = text.scan(/\p{Arabic}/).length
    other_total = text.scan(/[^\p{Arabic}\p{P}\d\s]/).length

    other_total > arabic_total ? :left : :right
  end

  # Estimates the number of lines the text will occupy on a page:
  # one per newline-separated line, plus one extra for every line longer
  # than 80 characters (assumed to wrap once).
  #
  # @param text [String] The text to analyze.
  # @return [Integer] The estimated line count.
  def expected_lines_in_page(text)
    wrapped = text.split("\n").count { |line| line.length > 80 }
    text.count("\n") + 1 + wrapped
  end

  # Merges the adjacent pair of lines with the smallest combined length,
  # joining them with a single space. Text with fewer than two lines is
  # returned unchanged.
  #
  # @param text [String] The text to compact.
  # @return [String] The compacted text.
  def compact_shortest_lines(text)
    lines = text.split("\n")
    return text if lines.size < 2

    at = find_merge_index(lines)
    merged = "#{lines[at]} #{lines[at + 1]}"
    lines[at, 2] = [merged]

    lines.join("\n")
  end

  # Finds where the shortest adjacent pair starts; the earliest pair wins ties.
  #
  # @param lines [Array<String>] The lines to analyze.
  # @return [Integer, nil] Index of the first line of the pair (nil when there is no pair).
  def find_merge_index(lines)
    lines.each_cons(2).with_index.min_by { |(upper, lower), _| upper.length + lower.length }&.last
  end
end
81
+ end
82
+ end
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+
5
+ module Tahweel
6
+ module Writers
7
# Writer strategy that stores the extracted pages in a .json file.
class Json
  # File extension produced by this writer.
  #
  # @return [String] The file extension.
  def extension
    "json"
  end

  # Writes the texts as a pretty-printed JSON array with one object per page,
  # each holding a 1-based `page` number and the stripped page `content`.
  #
  # @param texts [Array<String>] The extracted texts (one per page).
  # @param destination [String] The output file path.
  # @param options [Hash] Options for writing (unused for now).
  # @return [void]
  def write(texts, destination, options = {}) # rubocop:disable Lint/UnusedMethodArgument
    pages = texts.each.with_index(1).map do |text, page_number|
      { page: page_number, content: text.strip }
    end

    File.write(destination, JSON.pretty_generate(pages))
  end
end
31
+ end
32
+ end
@@ -0,0 +1,27 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Tahweel
4
+ module Writers
5
# Writer strategy that stores the extracted pages in a plain .txt file.
class Txt
  PAGE_SEPARATOR = "\n\nPAGE_SEPARATOR\n\n"

  # File extension produced by this writer.
  #
  # @return [String] The file extension.
  def extension
    "txt"
  end

  # Writes the stripped pages to a file, joined by the page separator.
  #
  # @param texts [Array<String>] The extracted texts (one per page).
  # @param destination [String] The output file path.
  # @param options [Hash] Options for writing.
  # @option options [String] :page_separator (PAGE_SEPARATOR) Separator between pages.
  # @return [void]
  def write(texts, destination, options = {})
    custom_separator = options[:page_separator]
    trimmed_pages = texts.map { |page| page.strip }
    body = trimmed_pages.join(custom_separator || PAGE_SEPARATOR)

    File.write(destination, body)
  end
end
26
+ end
27
+ end
data/lib/tahweel.rb ADDED
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "tahweel/version"
4
+ require_relative "tahweel/authorizer"
5
+ require_relative "tahweel/pdf_splitter"
6
+ require_relative "tahweel/ocr"
7
+ require_relative "tahweel/converter"
8
+ require_relative "tahweel/writer"
9
+ require_relative "tahweel/cli/file_processor"
10
+ require_relative "tahweel/cli/file_collector"
11
+ require_relative "tahweel/cli/options"
12
+
13
module Tahweel # rubocop:disable Style/Documentation
  # Gem-specific error type, derived from StandardError.
  class Error < StandardError; end

  # Converts a PDF file to text by delegating to {Converter.convert}.
  # Any block given is forwarded to the converter unchanged.
  #
  # @param pdf_path [String] Path to the PDF file.
  # @param dpi [Integer] DPI for PDF to image conversion (default: PdfSplitter::DEFAULT_DPI).
  # @param processor [Symbol] OCR processor to use (default: :google_drive).
  # @param concurrency [Integer] Max concurrent OCR operations (default: Converter::DEFAULT_CONCURRENCY).
  # @return [Array<String>] An array containing the text of each page.
  def self.convert(pdf_path, dpi: PdfSplitter::DEFAULT_DPI, processor: :google_drive,
                   concurrency: Converter::DEFAULT_CONCURRENCY, &)
    Converter.convert(pdf_path, dpi:, processor:, concurrency:, &)
  end

  # Extracts text from an image file by delegating to {Ocr.extract}.
  #
  # @param image_path [String] Path to the image file.
  # @param processor [Symbol] OCR processor to use (default: :google_drive).
  # @return [String] The extracted text.
  def self.extract(image_path, processor: :google_drive)
    Ocr.extract(image_path, processor:)
  end
end
data/mise.toml ADDED
@@ -0,0 +1,2 @@
1
+ [tools]
2
+ ruby = "3.4.7"