tahweel 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.vscode/settings.json +1 -0
- data/CHANGELOG.md +13 -0
- data/README.en.md +468 -0
- data/README.md +464 -20
- data/assets/logo.png +0 -0
- data/assets/windows/tahweel.ico +0 -0
- data/bin/tahweel +10 -5
- data/bin/tahweel-ui +300 -0
- data/lib/tahweel/authorizer.rb +17 -7
- data/lib/tahweel/cli/file_processor.rb +14 -2
- data/lib/tahweel/cli/options.rb +24 -7
- data/lib/tahweel/cli/progress_renderer.rb +7 -0
- data/lib/tahweel/converter.rb +46 -9
- data/lib/tahweel/pdf_splitter.rb +92 -38
- data/lib/tahweel/poppler_installer.rb +185 -0
- data/lib/tahweel/processors/google_drive.rb +1 -1
- data/lib/tahweel/version.rb +1 -1
- data/lib/tahweel/writer.rb +1 -1
- data/lib/tahweel/writers/docx.rb +26 -3
- data/lib/tahweel.rb +8 -3
- data/website/favicon.ico +0 -0
- data/website/index.html +792 -0
- data/website/logo.png +0 -0
- data/website/privacy.html +489 -0
- metadata +53 -15
data/bin/tahweel-ui
ADDED
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
# Add the ../lib directory to the load path so we can require 'tahweel'
|
|
5
|
+
$LOAD_PATH.unshift(File.expand_path("../lib", __dir__))
|
|
6
|
+
|
|
7
|
+
require "matrix"
|
|
8
|
+
require "glimmer-dsl-libui"
|
|
9
|
+
require "launchy"
|
|
10
|
+
require "tahweel"
|
|
11
|
+
|
|
12
|
+
# Run the Google authorization step before the UI class is defined and launched.
# Skipped whenever the Ocran constant is defined (presumably while Ocran is packaging the executable — confirm).
Tahweel::Authorizer.authorize unless defined?(Ocran)
|
|
13
|
+
|
|
14
|
+
# Desktop (GUI) front end for Tahweel, built with Glimmer DSL for LibUI.
#
# The window offers "convert a single file" and "convert a folder" actions,
# shows overall and per-file progress while a background thread runs the
# conversion, and can switch its interface between Arabic (the default) and
# English.
class TahweelApp # rubocop:disable Metrics/ClassLength
  include Glimmer::LibUI::Application

  # Lower-case file extensions (with the leading dot) accepted by the single-file picker.
  SUPPORTED_EXTENSIONS = %w[.pdf .jpg .jpeg .png].freeze

  # User-facing strings, keyed by language and then by message key.
  TRANSLATIONS = {
    # https://www.perplexity.ai/search/add-direction-unicode-controls-Z5haIOFEQZeCbG5B8Wt2dg#3
    ar: {
      window_title: "تحويل",
      title_label: "تحويل: حوّل الملفات من صيغة PDF إلى TXT و DOCX ↓",
      note_label: "ملاحظة: يدعم تحويل الملفات بصيغة PDF أو صورة (JPG و JPEG و PNG) فقط.",
      file_btn: "تحويل ملف واحد",
      folder_btn: "تحويل مجلد كامل",
      language_btn: "English",
      global_progress: "التقدم العام:",
      file_progress: "تقدم الملف الحالي:",
      status_done: "انتهى التحويل.",
      msg_success_title: "اكتمل التحويل",
      msg_error_title: "خطأ",
      msg_no_files: "لم نعثر على ملفات لتحويلها.",
      msg_bad_extension: "صيغة الملف غير مدعومة.",
      stage_preparing: "جارٍ التحضير...",
      stage_splitting: "جارٍ تقسيم الملف...",
      stage_ocr: "جارٍ استخراج النصوص...",
      stage_done: "انتهى"
    },
    en: {
      window_title: "Tahweel",
      title_label: "Tahweel: Convert PDF files to TXT and DOCX ↓",
      note_label: "Note: Tahweel supports PDF or image files (JPG, JPEG, and PNG) only.",
      file_btn: "Convert a Single File",
      folder_btn: "Convert a Folder",
      language_btn: "العربية",
      global_progress: "Progress:",
      file_progress: "Current File:",
      status_done: "Conversion complete.",
      msg_success_title: "Conversion Complete",
      msg_error_title: "Error",
      msg_no_files: "No files found to convert.",
      msg_bad_extension: "Unsupported file format.",
      stage_preparing: "Preparing...",
      stage_splitting: "Splitting file...",
      stage_ocr: "Extracting text...",
      stage_done: "Done"
    }
  }.freeze

  # Starts in Arabic and prepares the two lists of widgets whose visibility is
  # flipped on every language switch (see #toggle_language).
  def initialize(*args)
    @lang = :ar
    @rtl_components = []
    @ltr_components = []

    super
  end

  body do
    @main_window = window(t(:window_title)) do
      margined true

      vertical_box do
        @header_label = right_aligned_label(t(:title_label))
        convert_buttons
        @note_label = right_aligned_label(t(:note_label))

        # The progress area stays hidden until the first conversion starts.
        @progress_section = progress_section
        @progress_section.visible = false

        horizontal_separator { stretchy false }

        @language_btn = language_button
      end
    end
  end

  private

  # Builds the language-switch row: the button sits between two stretchy
  # spacer labels, only one of which is visible at a time.
  #
  # @return [Object] the button control, so the caller can relabel/disable it.
  def language_button # rubocop:disable Metrics/MethodLength
    ref = nil

    horizontal_box do
      stretchy false

      right_alignment_label = label("") { stretchy true }
      right_alignment_label.visible = false
      @rtl_components << right_alignment_label

      ref = button(t(:language_btn)) do
        stretchy false
        on_clicked { toggle_language }
      end

      left_alignment_label = label("") { stretchy true }
      @ltr_components << left_alignment_label
    end

    ref
  end

  # Builds the row of convert buttons. Two folder buttons exist (one per
  # language, on opposite sides of the file button); exactly one is visible,
  # and #toggle_language swaps them.
  def convert_buttons
    horizontal_box do
      @ar_convert_folder_btn = button(t(:folder_btn, lang: :ar)) { on_clicked { on_folder_click } }
      @rtl_components << @ar_convert_folder_btn

      @convert_file_btn = button(t(:file_btn)) { on_clicked { on_file_click } }

      @en_convert_folder_btn = button(t(:folder_btn, lang: :en)) { on_clicked { on_folder_click } }
      @en_convert_folder_btn.visible = false
      @rtl_components << @en_convert_folder_btn
    end
  end

  # Builds the group holding the overall and per-file progress bars and labels.
  #
  # @return [Object] the group control.
  def progress_section
    group do
      stretchy false

      vertical_box do
        @global_progress_label = right_aligned_label(t(:global_progress))
        @global_progress = progress_bar { stretchy true }

        @file_progress_label = right_aligned_label(t(:file_progress))
        @file_progress = progress_bar { stretchy true }
      end
    end
  end

  # Handler for the single-file button: asks for a file and converts it when
  # one was chosen and its extension is supported.
  def on_file_click
    file = open_file
    convert(File.dirname(file), [file]) if file && valid_file?(file)
  end

  # @param file [String] path chosen by the user.
  # @return [Boolean] true when the extension is supported; otherwise shows an
  #   error box and returns false.
  def valid_file?(file)
    return true if SUPPORTED_EXTENSIONS.include?(File.extname(file).downcase)

    msg_box_error(t(:msg_error_title), t(:msg_bad_extension))
    false
  end

  # Handler for the folder button: asks for a folder and converts every file
  # collected from it.
  def on_folder_click
    folder = open_folder
    # A cancelled dialog yields no folder; stay silent, exactly like #on_file_click.
    return unless folder

    paths = collect_files(folder)
    convert(folder, paths) if paths.any?
  end

  # Collects the convertible files under +folder+.
  #
  # @param folder [String, nil] folder chosen by the user.
  # @return [Array<String>] the collected paths; when there are none, an error
  #   box is shown and an empty array is returned.
  def collect_files(folder)
    paths = folder ? Tahweel::CLI::FileCollector.collect(folder) : []
    return paths if paths.any?

    msg_box_error(t(:msg_error_title), t(:msg_no_files))
    []
  end

  # Locks the window, reveals the progress area and converts +paths+ one by one
  # on a background thread so the UI stays responsive.
  #
  # @param folder [String] folder opened once the conversion is finished.
  # @param paths [Array<String>] files to convert.
  def convert(folder, paths)
    disable_window
    @progress_section.visible = true

    Thread.new do
      paths.each_with_index { |path, index| process_path(path, index, paths.size) }
      finish_conversion(folder, paths.size)
    rescue StandardError => e
      # Without this the thread would die silently and leave every button disabled.
      fail_conversion(e)
    end
  end

  # Re-enables the window and reports a failed conversion to the user.
  #
  # @param error [StandardError] the exception raised by the worker thread.
  def fail_conversion(error)
    Glimmer::LibUI.queue_main do
      enable_window
      msg_box_error(t(:msg_error_title), error.message)
    end
  end

  # Disables every button that could start another action mid-conversion.
  def disable_window
    @convert_file_btn.enabled = false
    @ar_convert_folder_btn.enabled = false
    @en_convert_folder_btn.enabled = false
    @language_btn.enabled = false
  end

  # Converts a single file, forwarding its progress updates to the UI.
  #
  # @param path [String] file to convert.
  # @param index [Integer] zero-based position of the file in the batch.
  # @param total_files [Integer] size of the batch.
  def process_path(path, index, total_files)
    reset_file_progress(index, total_files)

    Tahweel::CLI::FileProcessor.process(path, options(path)) { update_file_progress(_1) }
  end

  # Updates the overall progress for the file about to start and resets the
  # per-file bar. Runs on the UI thread via queue_main.
  def reset_file_progress(index, total_files)
    Glimmer::LibUI.queue_main do
      @global_progress.value = ((index.to_f / total_files) * 100).to_i
      # The \u202B/\u202A/\u202C characters are Unicode directional embedding controls around the counter.
      @global_progress_label.text = "\u202B#{t(:global_progress)} (\u202A#{index}/#{total_files}\u202C)\u202C"
      @file_progress.value = 0
      @file_progress_label.text = "#{t(:file_progress)} #{t(:stage_preparing)}"
    end
  end

  # @param path [String] file being converted.
  # @return [Hash] options handed to Tahweel::CLI::FileProcessor for +path+.
  def options(path)
    {
      dpi: 150,
      processor: :google_drive,
      ocr_concurrency: Tahweel::Converter::DEFAULT_CONCURRENCY,
      file_concurrency: 1,
      formats: %i[txt docx],
      base_input_path: File.directory?(path) ? path : File.dirname(path)
    }
  end

  # Reflects a progress update from the processor in the per-file bar and label.
  #
  # @param progress [Hash] reads :percentage and :stage.
  def update_file_progress(progress)
    Glimmer::LibUI.queue_main do
      @file_progress.value = progress[:percentage].to_i
      @file_progress_label.text = "#{t(:file_progress)} (#{stage_text(progress[:stage])})"
    end
  end

  # @param stage [Symbol] processing stage reported by the processor.
  # @return [String, nil] translated stage name; nil for any other stage.
  def stage_text(stage)
    case stage
    when :splitting then t(:stage_splitting)
    when :ocr then t(:stage_ocr)
    end
  end

  # Marks everything as complete, unlocks the window, shows the success
  # message and opens the output folder.
  #
  # @param folder [String] folder to open.
  # @param total_files [Integer] number of converted files.
  def finish_conversion(folder, total_files)
    Glimmer::LibUI.queue_main do
      @global_progress.value = 100
      @file_progress.value = 100
      @global_progress_label.text = "#{t(:global_progress)} (#{total_files}/#{total_files})"
      @file_progress_label.text = "#{t(:file_progress)} (#{t(:stage_done)})"

      enable_window
      msg_box(t(:msg_success_title), convert_finished_message(total_files))
    end

    Launchy.open(folder)
  end

  # Re-enables the buttons disabled by #disable_window.
  def enable_window
    @convert_file_btn.enabled = true
    @ar_convert_folder_btn.enabled = true
    @en_convert_folder_btn.enabled = true
    @language_btn.enabled = true
  end

  # @param files_count [Integer] number of converted files.
  # @return [String] success message in the current language.
  def convert_finished_message(files_count)
    if @lang == :en
      "Finished converting #{files_count} file(s) successfully."
    else
      arabic_convert_finished_message(files_count)
    end
  end

  # Builds the Arabic success message, choosing the noun form from the count:
  # dedicated sentences for 1 and 2, otherwise a form picked from the last two
  # digits of the count.
  #
  # @param files_count [Integer] number of converted files.
  # @return [String]
  def arabic_convert_finished_message(files_count)
    return "انتهى تحويل ملف واحد بنجاح." if files_count == 1
    return "انتهى تحويل ملفين بنجاح." if files_count == 2

    last_two_digits = files_count % 100

    suffix = case last_two_digits
             when 0..2 then "ملف"
             when 3..10 then "ملفات"
             else "ملفًا"
             end

    "انتهى تحويل #{files_count} #{suffix} بنجاح."
  end

  # Switches between Arabic and English: relabels every text-bearing widget and
  # flips the visibility of the widgets registered in the two component lists.
  def toggle_language # rubocop:disable Metrics/AbcSize
    @lang = @lang == :ar ? :en : :ar

    @main_window.title = t(:window_title)
    @header_label.text = t(:title_label)
    @note_label.text = t(:note_label)
    @convert_file_btn.text = t(:file_btn)
    @language_btn.text = t(:language_btn)

    @global_progress_label.text = t(:global_progress)
    @file_progress_label.text = t(:file_progress)

    @rtl_components.each { _1.visible = !_1.visible }
    @ltr_components.each { _1.visible = !_1.visible }
  end

  # Looks up a translated string.
  #
  # @param key [Symbol] message key in TRANSLATIONS.
  # @param lang [Symbol, nil] language override; defaults to the current language.
  # @return [String, nil]
  def t(key, lang: nil) = TRANSLATIONS[lang || @lang][key]

  # Builds a label row whose alignment follows the language: the text label sits
  # between two stretchy spacer labels, only one of which is visible at a time.
  #
  # @param text [String] initial label text.
  # @return [Object] the text label control.
  def right_aligned_label(text)
    ref = nil

    horizontal_box do
      right_alignment_label = label("") { stretchy true }
      @rtl_components << right_alignment_label

      ref = label(text) { stretchy false }

      left_alignment_label = label("") { stretchy true }
      left_alignment_label.visible = false
      @ltr_components << left_alignment_label
    end

    ref
  end
end
|
|
299
|
+
|
|
300
|
+
# Start the GUI; skipped whenever the Ocran constant is defined
# (presumably so that packaging with Ocran does not open a window — confirm).
TahweelApp.launch unless defined?(Ocran)
|
data/lib/tahweel/authorizer.rb
CHANGED
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
require "
|
|
4
|
-
require "googleauth"
|
|
5
|
-
require "googleauth/stores/file_token_store"
|
|
3
|
+
require "fileutils"
|
|
6
4
|
require "socket"
|
|
7
5
|
require "uri"
|
|
6
|
+
|
|
7
|
+
require "googleauth"
|
|
8
|
+
require "googleauth/stores/file_token_store"
|
|
9
|
+
require "google/apis/drive_v3"
|
|
8
10
|
require "launchy"
|
|
9
|
-
require "fileutils"
|
|
10
11
|
require "xdg"
|
|
11
12
|
|
|
12
13
|
module Tahweel
|
|
@@ -18,8 +19,8 @@ module Tahweel
|
|
|
18
19
|
# 3. Initiating the OAuth 2.0 flow via a local web server if needed.
|
|
19
20
|
# 4. Exchanging the authorization code for credentials and persisting them.
|
|
20
21
|
class Authorizer
|
|
21
|
-
CLIENT_ID = "512416833080-
|
|
22
|
-
CLIENT_SECRET = "GOCSPX-
|
|
22
|
+
CLIENT_ID = "512416833080-808aqp20iith31t9rgtdmsgc53jp0sc2.apps.googleusercontent.com"
|
|
23
|
+
CLIENT_SECRET = "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"
|
|
23
24
|
|
|
24
25
|
PORT = 3027
|
|
25
26
|
REDIRECT_URI = "http://localhost:#{PORT}/".freeze
|
|
@@ -104,7 +105,16 @@ module Tahweel
|
|
|
104
105
|
end
|
|
105
106
|
|
|
106
107
|
# Opens the system default browser to the Google Authorization URL.
|
|
107
|
-
def open_browser_for_auth
|
|
108
|
+
def open_browser_for_auth
  auth_url = @authorizer.get_authorization_url(base_url: REDIRECT_URI)
  return Launchy.open(auth_url) unless Gem.win_platform?

  # On Windows the URL is handed to the shell's `start` command instead of Launchy:
  # https://github.com/copiousfreetime/launchy/issues/167
  system("start \"\" \"#{auth_url}\"")
end
|
|
108
118
|
|
|
109
119
|
# Listens on the local server for the OAuth callback request.
|
|
110
120
|
# Handles multiple incoming requests to filter out noise (like favicon.ico).
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "fileutils"
|
|
3
4
|
require "pathname"
|
|
4
5
|
|
|
5
6
|
module Tahweel
|
|
@@ -17,7 +18,7 @@ module Tahweel
|
|
|
17
18
|
# @option options [String] :output The directory to save output files (defaults to current directory).
|
|
18
19
|
# @option options [Integer] :dpi DPI for PDF conversion (defaults to 150).
|
|
19
20
|
# @option options [Symbol] :processor The OCR processor to use (e.g., :google_drive).
|
|
20
|
-
# @option options [Integer] :
|
|
21
|
+
# @option options [Integer] :ocr_concurrency Max concurrent operations.
|
|
21
22
|
# @option options [Array<Symbol>] :formats Output formats (e.g., [:txt, :docx]).
|
|
22
23
|
# @option options [String] :page_separator Separator string for TXT output.
|
|
23
24
|
# @option options [String] :base_input_path The base path used to determine relative output structure.
|
|
@@ -66,8 +67,11 @@ module Tahweel
|
|
|
66
67
|
|
|
67
68
|
private
|
|
68
69
|
|
|
70
|
+
# Creates the output directory if it doesn't exist.
|
|
69
71
|
def ensure_output_directory_exists
  # mkdir_p also creates missing parent directories and is a no-op when the directory is already there.
  FileUtils.mkdir_p(output_directory)
end
|
|
70
72
|
|
|
73
|
+
# Checks if all requested output formats already exist.
|
|
74
|
+
# @return [Boolean] True if all output files exist.
|
|
71
75
|
def all_outputs_exist?
|
|
72
76
|
@options[:formats].all? do |format|
|
|
73
77
|
extension = Tahweel::Writer.new(format: format).extension
|
|
@@ -75,22 +79,29 @@ module Tahweel
|
|
|
75
79
|
end
|
|
76
80
|
end
|
|
77
81
|
|
|
82
|
+
# Checks if the input file is a PDF.
|
|
83
|
+
# @return [Boolean]
|
|
78
84
|
def pdf?
  # Compare the lower-cased extension so that ".PDF" and ".Pdf" also count.
  extension = File.extname(@file_path).downcase
  extension == ".pdf"
end
|
|
79
85
|
|
|
86
|
+
# Handles PDF processing: splitting, OCR, and saving.
|
|
87
|
+
# @param &block [Proc] Progress callback.
|
|
80
88
|
def process_pdf(&)
  # Fall back to the converter's default when the caller did not set :ocr_concurrency.
  ocr_concurrency = @options.fetch(:ocr_concurrency, Tahweel::Converter::DEFAULT_CONCURRENCY)

  pages = Tahweel.convert(
    @file_path,
    dpi: @options[:dpi],
    processor: @options[:processor],
    concurrency: ocr_concurrency,
    &
  )

  write_output(pages)
end
|
|
91
99
|
|
|
100
|
+
# Handles single image processing.
|
|
92
101
|
def process_image
  # An image is a single page, so the writer receives a one-element list.
  text = Tahweel.extract(@file_path, processor: @options[:processor])
  write_output([text])
end
|
|
93
102
|
|
|
103
|
+
# Writes the extracted text to all configured formats.
|
|
104
|
+
# @param texts [Array<String>] The list of extracted texts (per page).
|
|
94
105
|
def write_output(texts)
|
|
95
106
|
Tahweel::Writer.write(
|
|
96
107
|
texts,
|
|
@@ -100,6 +111,7 @@ module Tahweel
|
|
|
100
111
|
)
|
|
101
112
|
end
|
|
102
113
|
|
|
114
|
+
# @return [String] The full path for output files without extension.
|
|
103
115
|
def base_output_path
  # ".*" strips only the final extension from the input file name.
  stem = File.basename(@file_path, ".*")
  File.join(output_directory, stem)
end
|
|
104
116
|
|
|
105
117
|
# Determines the output directory.
|
data/lib/tahweel/cli/options.rb
CHANGED
|
@@ -3,6 +3,13 @@
|
|
|
3
3
|
require "etc"
|
|
4
4
|
require "optparse"
|
|
5
5
|
|
|
6
|
+
require_relative "../version"
|
|
7
|
+
require_relative "../converter"
|
|
8
|
+
require_relative "../ocr"
|
|
9
|
+
require_relative "../writer"
|
|
10
|
+
require_relative "../writers/txt"
|
|
11
|
+
require_relative "file_collector"
|
|
12
|
+
|
|
6
13
|
module Tahweel
|
|
7
14
|
module CLI
|
|
8
15
|
# Parses command-line arguments for the Tahweel CLI.
|
|
@@ -26,11 +33,13 @@ module Tahweel
|
|
|
26
33
|
options
|
|
27
34
|
end
|
|
28
35
|
|
|
36
|
+
# Returns the default configuration options.
|
|
37
|
+
# @return [Hash] Default options.
|
|
29
38
|
def self.default_options
|
|
30
39
|
{
|
|
31
40
|
dpi: 150,
|
|
32
41
|
processor: :google_drive,
|
|
33
|
-
|
|
42
|
+
ocr_concurrency: Tahweel::Converter::DEFAULT_CONCURRENCY,
|
|
34
43
|
file_concurrency: (Etc.nprocessors - 2).clamp(2..),
|
|
35
44
|
output: nil,
|
|
36
45
|
formats: %i[txt docx],
|
|
@@ -38,6 +47,10 @@ module Tahweel
|
|
|
38
47
|
}
|
|
39
48
|
end
|
|
40
49
|
|
|
50
|
+
# Configures the OptionParser instance.
|
|
51
|
+
#
|
|
52
|
+
# @param opts [OptionParser] The parser instance.
|
|
53
|
+
# @param options [Hash] The options hash to populate.
|
|
41
54
|
def self.configure_parser(opts, options) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
|
|
42
55
|
opts.program_name = "tahweel"
|
|
43
56
|
opts.version = Tahweel::VERSION
|
|
@@ -69,17 +82,17 @@ module Tahweel
|
|
|
69
82
|
end
|
|
70
83
|
|
|
71
84
|
opts.on(
|
|
72
|
-
"-
|
|
73
|
-
"Max concurrent
|
|
85
|
+
"-F", "--file-concurrency FILE_CONCURRENCY", POSITIVE_INTEGER,
|
|
86
|
+
"Max concurrent files to process (default: CPUs - 2 = #{options[:file_concurrency]})"
|
|
74
87
|
) do |value|
|
|
75
|
-
options[:
|
|
88
|
+
options[:file_concurrency] = value
|
|
76
89
|
end
|
|
77
90
|
|
|
78
91
|
opts.on(
|
|
79
|
-
"-
|
|
80
|
-
"Max concurrent
|
|
92
|
+
"-O", "--ocr-concurrency OCR_CONCURRENCY", POSITIVE_INTEGER,
|
|
93
|
+
"Max concurrent OCR operations (default: #{options[:ocr_concurrency]})"
|
|
81
94
|
) do |value|
|
|
82
|
-
options[:
|
|
95
|
+
options[:ocr_concurrency] = value
|
|
83
96
|
end
|
|
84
97
|
|
|
85
98
|
opts.on(
|
|
@@ -104,6 +117,10 @@ module Tahweel
|
|
|
104
117
|
end
|
|
105
118
|
end
|
|
106
119
|
|
|
120
|
+
# Validates that arguments were provided.
|
|
121
|
+
#
|
|
122
|
+
# @param args [Array<String>] The remaining arguments after parsing.
|
|
123
|
+
# @param parser [OptionParser] The parser instance for printing help.
|
|
107
124
|
def self.validate_args!(args, parser)
|
|
108
125
|
return unless args.empty?
|
|
109
126
|
|
|
@@ -102,6 +102,7 @@ module Tahweel
|
|
|
102
102
|
|
|
103
103
|
private
|
|
104
104
|
|
|
105
|
+
# Starts a background thread to refresh the display periodically.
|
|
105
106
|
def start_ticker
|
|
106
107
|
@ticker_thread = Thread.new do
|
|
107
108
|
while @running
|
|
@@ -111,6 +112,7 @@ module Tahweel
|
|
|
111
112
|
end
|
|
112
113
|
end
|
|
113
114
|
|
|
115
|
+
# Renders the progress dashboard to stdout.
|
|
114
116
|
def render # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
|
|
115
117
|
# Move cursor up to the start of our block
|
|
116
118
|
$stdout.print "\e[#{@concurrency + 1}A"
|
|
@@ -138,6 +140,11 @@ module Tahweel
|
|
|
138
140
|
$stdout.flush
|
|
139
141
|
end
|
|
140
142
|
|
|
143
|
+
# Truncates a file path to fit within the display.
|
|
144
|
+
#
|
|
145
|
+
# @param path [String] The file path.
|
|
146
|
+
# @param max_length [Integer] Maximum allowed length.
|
|
147
|
+
# @return [String] The truncated path.
|
|
141
148
|
def truncate_path(path, max_length)
|
|
142
149
|
return path.ljust(max_length) if path.length <= max_length
|
|
143
150
|
|
data/lib/tahweel/converter.rb
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "fileutils"
|
|
4
|
+
|
|
3
5
|
require_relative "pdf_splitter"
|
|
4
6
|
require_relative "ocr"
|
|
5
|
-
require "fileutils"
|
|
6
7
|
|
|
7
8
|
module Tahweel
|
|
8
9
|
# Orchestrates the full conversion process:
|
|
@@ -60,10 +61,10 @@ module Tahweel
|
|
|
60
61
|
# }
|
|
61
62
|
# @return [Array<String>] An array containing the text of each page.
|
|
62
63
|
def convert(&)
|
|
63
|
-
|
|
64
|
+
images_paths, temp_dir = PdfSplitter.split(@pdf_path, dpi: @dpi, &).values_at(:images_paths, :folder_path)
|
|
64
65
|
|
|
65
66
|
begin
|
|
66
|
-
process_images(
|
|
67
|
+
process_images(images_paths, Ocr.new(processor: @processor_type), &)
|
|
67
68
|
ensure
|
|
68
69
|
FileUtils.rm_rf(temp_dir)
|
|
69
70
|
end
|
|
@@ -71,31 +72,55 @@ module Tahweel
|
|
|
71
72
|
|
|
72
73
|
private
|
|
73
74
|
|
|
74
|
-
|
|
75
|
-
|
|
75
|
+
# Processes the list of images concurrently using the specified OCR engine.
|
|
76
|
+
#
|
|
77
|
+
# @param images_paths [Array<String>] List of paths to the image files.
|
|
78
|
+
# @param ocr_engine [Tahweel::Ocr] The initialized OCR engine instance.
|
|
79
|
+
# @param &block [Proc] Block to yield progress updates.
|
|
80
|
+
# @return [Array<String>] The text extracted from the images.
|
|
81
|
+
def process_images(images_paths, ocr_engine, &)
  total = images_paths.size
  results = Array.new(total) # one slot per page, filled in by the workers
  lock = Mutex.new
  finished = 0
  pending = build_queue(images_paths)

  run_workers(pending, ocr_engine, results, lock) do
    finished += 1
    report_progress(finished, total, &)
  end

  results
end
|
|
86
93
|
|
|
87
|
-
|
|
94
|
+
# Builds a queue of images paths and their indices.
|
|
95
|
+
#
|
|
96
|
+
# @param images_paths [Array<String>] List of image paths.
|
|
97
|
+
# @return [Queue] A queue containing [path, index] tuples.
|
|
98
|
+
def build_queue(images_paths)
  # Each entry keeps the page index next to its path.
  images_paths.each_with_index.with_object(Queue.new) do |path_and_index, queue|
    queue << path_and_index
  end
end
|
|
92
103
|
|
|
104
|
+
# Spawns worker threads to process items from the queue.
|
|
105
|
+
#
|
|
106
|
+
# @param queue [Queue] The queue of images to process.
|
|
107
|
+
# @param ocr_engine [Tahweel::Ocr] The OCR engine.
|
|
108
|
+
# @param texts [Array<String>] Shared array to store results.
|
|
109
|
+
# @param mutex [Mutex] Mutex for thread-safe updates.
|
|
110
|
+
# @param &block [Proc] Block to yield progress updates.
|
|
93
111
|
def run_workers(queue, ocr_engine, texts, mutex, &)
  workers = Array.new(@concurrency) do
    Thread.new { process_queue_items(queue, ocr_engine, texts, mutex, &) }
  end

  # Block until every worker thread has finished.
  workers.each(&:join)
end
|
|
98
116
|
|
|
117
|
+
# Processing loop for a single worker thread.
|
|
118
|
+
#
|
|
119
|
+
# @param queue [Queue] The shared queue.
|
|
120
|
+
# @param ocr_engine [Tahweel::Ocr] The OCR engine.
|
|
121
|
+
# @param texts [Array<String>] Shared result array.
|
|
122
|
+
# @param mutex [Mutex] Synchronization primitive.
|
|
123
|
+
# @param &block [Proc] Block to yield progress updates.
|
|
99
124
|
def process_queue_items(queue, ocr_engine, texts, mutex, &)
|
|
100
125
|
loop do
|
|
101
126
|
begin
|
|
@@ -109,6 +134,13 @@ module Tahweel
|
|
|
109
134
|
end
|
|
110
135
|
end
|
|
111
136
|
|
|
137
|
+
# Thread-safe saving of OCR results.
|
|
138
|
+
#
|
|
139
|
+
# @param texts [Array<String>] The results array.
|
|
140
|
+
# @param index [Integer] Index of the current page.
|
|
141
|
+
# @param text [String] Extracted text.
|
|
142
|
+
# @param mutex [Mutex] Synchronization primitive.
|
|
143
|
+
# @yield Executes the progress reporting block within the lock.
|
|
112
144
|
def save_result(texts, index, text, mutex)
|
|
113
145
|
mutex.synchronize do
|
|
114
146
|
texts[index] = text
|
|
@@ -116,6 +148,11 @@ module Tahweel
|
|
|
116
148
|
end
|
|
117
149
|
end
|
|
118
150
|
|
|
151
|
+
# Reports progress to the optional block.
|
|
152
|
+
#
|
|
153
|
+
# @param processed [Integer] Number of pages processed.
|
|
154
|
+
# @param total [Integer] Total number of pages.
|
|
155
|
+
# @yield [Hash] Progress information.
|
|
119
156
|
def report_progress(processed, total)
|
|
120
157
|
return unless block_given?
|
|
121
158
|
|