tahweel 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.vscode/settings.json +1 -0
- data/CHANGELOG.md +7 -0
- data/assets/logo.png +0 -0
- data/assets/windows/tahweel.ico +0 -0
- data/bin/tahweel +10 -5
- data/bin/tahweel-ui +300 -0
- data/lib/tahweel/authorizer.rb +15 -5
- data/lib/tahweel/cli/file_processor.rb +14 -2
- data/lib/tahweel/cli/options.rb +24 -7
- data/lib/tahweel/cli/progress_renderer.rb +7 -0
- data/lib/tahweel/converter.rb +46 -9
- data/lib/tahweel/pdf_splitter.rb +92 -38
- data/lib/tahweel/poppler_installer.rb +185 -0
- data/lib/tahweel/processors/google_drive.rb +1 -1
- data/lib/tahweel/version.rb +1 -1
- data/lib/tahweel/writer.rb +1 -1
- data/lib/tahweel/writers/docx.rb +26 -3
- data/lib/tahweel.rb +8 -3
- metadata +48 -15
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: cfa4c4ffbb0b1229794addc7ae400b8af9fda793e7c17b403d4aa60bd92fa7f6
|
|
4
|
+
data.tar.gz: 2d236076739f2ae1b892669487b6ac839b1a5420c1928621787e6387e7ba50ab
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: de06384d492cd26925dee76392119d0cd7c05d279e1aaafa97d055091558513847b051f5c65bc505a5213623a36004f734fbe0794b414ebc28c6982639841960
|
|
7
|
+
data.tar.gz: 31e2d05fbaf89c4f09ef3859243052ffd00f95ffa277368826e9ebed11539b401d8b8487920c799b6bc2bf5d52ab36cfa4ff4771f9661a38a839d5353dfed2bf
|
data/.vscode/settings.json
CHANGED
data/CHANGELOG.md
CHANGED
data/assets/logo.png
ADDED
|
Binary file
|
|
Binary file
|
data/bin/tahweel
CHANGED
|
@@ -11,10 +11,15 @@ require "tahweel/cli/progress_renderer"
|
|
|
11
11
|
begin
|
|
12
12
|
soft, hard = Process.getrlimit(:NOFILE)
|
|
13
13
|
Process.setrlimit(:NOFILE, [4096, hard].min) if soft < 4096
|
|
14
|
-
rescue
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
14
|
+
rescue Exception # rubocop:disable Lint/RescueException
|
|
15
|
+
if Gem.win_platform?
|
|
16
|
+
puts "\e[33mWarning: Could not adjust file descriptor limit on Windows. Proceeding with default limits."
|
|
17
|
+
puts "If you faced connection errors or the CLI froze, please report at https://github.com/ieasybooks/tahweel.rb/issues.\e[0m"
|
|
18
|
+
else
|
|
19
|
+
puts "\e[33mWarning: Tahweel failed to increase the soft limit of file descriptors to 4096."
|
|
20
|
+
puts "If you faced connection errors or the CLI froze, try running `ulimit -n 4096` in your terminal."
|
|
21
|
+
puts "If you still face issues, please report them at https://github.com/ieasybooks/tahweel.rb/issues.\e[0m"
|
|
22
|
+
end
|
|
18
23
|
end
|
|
19
24
|
|
|
20
25
|
begin
|
|
@@ -27,7 +32,7 @@ begin
|
|
|
27
32
|
exit 0
|
|
28
33
|
end
|
|
29
34
|
|
|
30
|
-
Tahweel::Authorizer.authorize if options[:
|
|
35
|
+
Tahweel::Authorizer.authorize if options[:processor] == :google_drive
|
|
31
36
|
|
|
32
37
|
base_path = File.directory?(input_path) ? input_path : File.dirname(input_path)
|
|
33
38
|
|
data/bin/tahweel-ui
ADDED
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
# Add the ../lib directory to the load path so we can require 'tahweel'
|
|
5
|
+
$LOAD_PATH.unshift(File.expand_path("../lib", __dir__))
|
|
6
|
+
|
|
7
|
+
require "matrix"
|
|
8
|
+
require "glimmer-dsl-libui"
|
|
9
|
+
require "launchy"
|
|
10
|
+
require "tahweel"
|
|
11
|
+
|
|
12
|
+
Tahweel::Authorizer.authorize unless defined?(Ocran)
|
|
13
|
+
|
|
14
|
+
class TahweelApp # rubocop:disable Metrics/ClassLength,Style/Documentation
|
|
15
|
+
include Glimmer::LibUI::Application
|
|
16
|
+
|
|
17
|
+
TRANSLATIONS = {
|
|
18
|
+
# https://www.perplexity.ai/search/add-direction-unicode-controls-Z5haIOFEQZeCbG5B8Wt2dg#3
|
|
19
|
+
ar: {
|
|
20
|
+
window_title: "تحويل",
|
|
21
|
+
title_label: "تحويل: حوّل الملفات من صيغة PDF إلى TXT و DOCX ↓",
|
|
22
|
+
note_label: "ملاحظة: يدعم تحويل الملفات بصيغة PDF أو صورة (JPG و JPEG و PNG) فقط.",
|
|
23
|
+
file_btn: "تحويل ملف واحد",
|
|
24
|
+
folder_btn: "تحويل مجلد كامل",
|
|
25
|
+
language_btn: "English",
|
|
26
|
+
global_progress: "التقدم العام:",
|
|
27
|
+
file_progress: "تقدم الملف الحالي:",
|
|
28
|
+
status_done: "انتهى التحويل.",
|
|
29
|
+
msg_success_title: "اكتمل التحويل",
|
|
30
|
+
msg_error_title: "خطأ",
|
|
31
|
+
msg_no_files: "لم نعثر على ملفات لتحويلها.",
|
|
32
|
+
msg_bad_extension: "صيغة الملف غير مدعومة.",
|
|
33
|
+
stage_preparing: "جارٍ التحضير...",
|
|
34
|
+
stage_splitting: "جارٍ تقسيم الملف...",
|
|
35
|
+
stage_ocr: "جارٍ استخراج النصوص...",
|
|
36
|
+
stage_done: "انتهى"
|
|
37
|
+
},
|
|
38
|
+
en: {
|
|
39
|
+
window_title: "Tahweel",
|
|
40
|
+
title_label: "Tahweel: Convert PDF files to TXT and DOCX ↓",
|
|
41
|
+
note_label: "Note: Tahweel supports PDF or image files (JPG, JPEG, and PNG) only.",
|
|
42
|
+
file_btn: "Convert a Single File",
|
|
43
|
+
folder_btn: "Convert a Folder",
|
|
44
|
+
language_btn: "العربية",
|
|
45
|
+
global_progress: "Progress:",
|
|
46
|
+
file_progress: "Current File:",
|
|
47
|
+
status_done: "Conversion complete.",
|
|
48
|
+
msg_success_title: "Conversion Complete",
|
|
49
|
+
msg_error_title: "Error",
|
|
50
|
+
msg_no_files: "No files found to convert.",
|
|
51
|
+
msg_bad_extension: "Unsupported file format.",
|
|
52
|
+
stage_preparing: "Preparing...",
|
|
53
|
+
stage_splitting: "Splitting file...",
|
|
54
|
+
stage_ocr: "Extracting text...",
|
|
55
|
+
stage_done: "Done"
|
|
56
|
+
}
|
|
57
|
+
}.freeze
|
|
58
|
+
|
|
59
|
+
def initialize(*args)
|
|
60
|
+
@lang = :ar
|
|
61
|
+
@rtl_components = []
|
|
62
|
+
@ltr_components = []
|
|
63
|
+
|
|
64
|
+
super
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
body do
|
|
68
|
+
@main_window = window(t(:window_title)) do
|
|
69
|
+
margined true
|
|
70
|
+
|
|
71
|
+
vertical_box do
|
|
72
|
+
@header_label = right_aligned_label(t(:title_label))
|
|
73
|
+
convert_buttons
|
|
74
|
+
@note_label = right_aligned_label(t(:note_label))
|
|
75
|
+
|
|
76
|
+
@progress_section = progress_section
|
|
77
|
+
@progress_section.visible = false
|
|
78
|
+
|
|
79
|
+
horizontal_separator { stretchy false }
|
|
80
|
+
|
|
81
|
+
@language_btn = language_button
|
|
82
|
+
end
|
|
83
|
+
end
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
private
|
|
87
|
+
|
|
88
|
+
def language_button # rubocop:disable Metrics/MethodLength
|
|
89
|
+
ref = nil
|
|
90
|
+
|
|
91
|
+
horizontal_box do
|
|
92
|
+
stretchy false
|
|
93
|
+
|
|
94
|
+
right_alignment_label = label("") { stretchy true }
|
|
95
|
+
right_alignment_label.visible = false
|
|
96
|
+
@rtl_components << right_alignment_label
|
|
97
|
+
|
|
98
|
+
ref = button(t(:language_btn)) do
|
|
99
|
+
stretchy false
|
|
100
|
+
on_clicked { toggle_language }
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
left_alignment_label = label("") { stretchy true }
|
|
104
|
+
@ltr_components << left_alignment_label
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
ref
|
|
108
|
+
end
|
|
109
|
+
|
|
110
|
+
def convert_buttons
|
|
111
|
+
horizontal_box do
|
|
112
|
+
@ar_convert_folder_btn = button(t(:folder_btn, lang: :ar)) { on_clicked { on_folder_click } }
|
|
113
|
+
@rtl_components << @ar_convert_folder_btn
|
|
114
|
+
|
|
115
|
+
@convert_file_btn = button(t(:file_btn)) { on_clicked { on_file_click } }
|
|
116
|
+
|
|
117
|
+
@en_convert_folder_btn = button(t(:folder_btn, lang: :en)) { on_clicked { on_folder_click } }
|
|
118
|
+
@en_convert_folder_btn.visible = false
|
|
119
|
+
@rtl_components << @en_convert_folder_btn
|
|
120
|
+
end
|
|
121
|
+
end
|
|
122
|
+
|
|
123
|
+
def progress_section
|
|
124
|
+
group do
|
|
125
|
+
stretchy false
|
|
126
|
+
|
|
127
|
+
vertical_box do
|
|
128
|
+
@global_progress_label = right_aligned_label(t(:global_progress))
|
|
129
|
+
@global_progress = progress_bar { stretchy true }
|
|
130
|
+
|
|
131
|
+
@file_progress_label = right_aligned_label(t(:file_progress))
|
|
132
|
+
@file_progress = progress_bar { stretchy true }
|
|
133
|
+
end
|
|
134
|
+
end
|
|
135
|
+
end
|
|
136
|
+
|
|
137
|
+
def on_file_click
|
|
138
|
+
file = open_file
|
|
139
|
+
convert(File.dirname(file), [file]) if file && valid_file?(file)
|
|
140
|
+
end
|
|
141
|
+
|
|
142
|
+
def valid_file?(file)
|
|
143
|
+
return true if %w[.pdf .jpg .jpeg .png].include? File.extname(file).downcase
|
|
144
|
+
|
|
145
|
+
msg_box_error(t(:msg_error_title), t(:msg_bad_extension))
|
|
146
|
+
false
|
|
147
|
+
end
|
|
148
|
+
|
|
149
|
+
def on_folder_click
|
|
150
|
+
folder = open_folder
|
|
151
|
+
paths = collect_files(folder)
|
|
152
|
+
convert(folder, paths) if paths.any?
|
|
153
|
+
end
|
|
154
|
+
|
|
155
|
+
def collect_files(folder)
|
|
156
|
+
paths = folder ? Tahweel::CLI::FileCollector.collect(folder) : []
|
|
157
|
+
return paths if folder && paths.any?
|
|
158
|
+
|
|
159
|
+
msg_box_error(t(:msg_error_title), t(:msg_no_files))
|
|
160
|
+
[]
|
|
161
|
+
end
|
|
162
|
+
|
|
163
|
+
def convert(folder, paths)
|
|
164
|
+
disable_window
|
|
165
|
+
@progress_section.visible = true
|
|
166
|
+
|
|
167
|
+
Thread.new do
|
|
168
|
+
paths.each_with_index { |path, index| process_path(path, index, paths.size) }
|
|
169
|
+
finish_conversion(folder, paths.size)
|
|
170
|
+
end
|
|
171
|
+
end
|
|
172
|
+
|
|
173
|
+
def disable_window
|
|
174
|
+
@convert_file_btn.enabled = false
|
|
175
|
+
@ar_convert_folder_btn.enabled = false
|
|
176
|
+
@en_convert_folder_btn.enabled = false
|
|
177
|
+
@language_btn.enabled = false
|
|
178
|
+
end
|
|
179
|
+
|
|
180
|
+
def process_path(path, index, total_files)
|
|
181
|
+
reset_file_progress(index, total_files)
|
|
182
|
+
|
|
183
|
+
Tahweel::CLI::FileProcessor.process(path, options(path)) { update_file_progress(_1) }
|
|
184
|
+
end
|
|
185
|
+
|
|
186
|
+
def reset_file_progress(index, total_files)
|
|
187
|
+
Glimmer::LibUI.queue_main do
|
|
188
|
+
@global_progress.value = ((index.to_f / total_files) * 100).to_i
|
|
189
|
+
@global_progress_label.text = "\u202B#{t(:global_progress)} (\u202A#{index}/#{total_files}\u202C)\u202C"
|
|
190
|
+
@file_progress.value = 0
|
|
191
|
+
@file_progress_label.text = "#{t(:file_progress)} #{t(:stage_preparing)}"
|
|
192
|
+
end
|
|
193
|
+
end
|
|
194
|
+
|
|
195
|
+
def options(path)
|
|
196
|
+
{
|
|
197
|
+
dpi: 150,
|
|
198
|
+
processor: :google_drive,
|
|
199
|
+
ocr_concurrency: Tahweel::Converter::DEFAULT_CONCURRENCY,
|
|
200
|
+
file_concurrency: 1,
|
|
201
|
+
formats: %i[txt docx],
|
|
202
|
+
base_input_path: File.directory?(path) ? path : File.dirname(path)
|
|
203
|
+
}
|
|
204
|
+
end
|
|
205
|
+
|
|
206
|
+
def update_file_progress(progress)
|
|
207
|
+
Glimmer::LibUI.queue_main do
|
|
208
|
+
@file_progress.value = progress[:percentage].to_i
|
|
209
|
+
@file_progress_label.text = "#{t(:file_progress)} (#{stage_text(progress[:stage])})"
|
|
210
|
+
end
|
|
211
|
+
end
|
|
212
|
+
|
|
213
|
+
def stage_text(stage)
|
|
214
|
+
case stage
|
|
215
|
+
when :splitting then t(:stage_splitting)
|
|
216
|
+
when :ocr then t(:stage_ocr)
|
|
217
|
+
end
|
|
218
|
+
end
|
|
219
|
+
|
|
220
|
+
def finish_conversion(folder, total_files)
|
|
221
|
+
Glimmer::LibUI.queue_main do
|
|
222
|
+
@global_progress.value = 100
|
|
223
|
+
@file_progress.value = 100
|
|
224
|
+
@global_progress_label.text = "#{t(:global_progress)} (#{total_files}/#{total_files})"
|
|
225
|
+
@file_progress_label.text = "#{t(:file_progress)} (#{t(:stage_done)})"
|
|
226
|
+
|
|
227
|
+
enable_window
|
|
228
|
+
msg_box(t(:msg_success_title), convert_finished_message(total_files))
|
|
229
|
+
end
|
|
230
|
+
|
|
231
|
+
Launchy.open(folder)
|
|
232
|
+
end
|
|
233
|
+
|
|
234
|
+
def enable_window
|
|
235
|
+
@convert_file_btn.enabled = true
|
|
236
|
+
@ar_convert_folder_btn.enabled = true
|
|
237
|
+
@en_convert_folder_btn.enabled = true
|
|
238
|
+
@language_btn.enabled = true
|
|
239
|
+
end
|
|
240
|
+
|
|
241
|
+
def convert_finished_message(files_count)
|
|
242
|
+
if @lang == :en
|
|
243
|
+
"Finished converting #{files_count} file(s) successfully."
|
|
244
|
+
else
|
|
245
|
+
arabic_convert_finished_message(files_count)
|
|
246
|
+
end
|
|
247
|
+
end
|
|
248
|
+
|
|
249
|
+
def arabic_convert_finished_message(files_count)
|
|
250
|
+
return "انتهى تحويل ملف واحد بنجاح." if files_count == 1
|
|
251
|
+
return "انتهى تحويل ملفين بنجاح." if files_count == 2
|
|
252
|
+
|
|
253
|
+
last_two_digits = files_count % 100
|
|
254
|
+
|
|
255
|
+
suffix = case last_two_digits
|
|
256
|
+
when 0..2 then "ملف"
|
|
257
|
+
when 3..10 then "ملفات"
|
|
258
|
+
else "ملفًا"
|
|
259
|
+
end
|
|
260
|
+
|
|
261
|
+
"انتهى تحويل #{files_count} #{suffix} بنجاح."
|
|
262
|
+
end
|
|
263
|
+
|
|
264
|
+
def toggle_language # rubocop:disable Metrics/AbcSize
|
|
265
|
+
@lang = @lang == :ar ? :en : :ar
|
|
266
|
+
|
|
267
|
+
@main_window.title = t(:window_title)
|
|
268
|
+
@header_label.text = t(:title_label)
|
|
269
|
+
@note_label.text = t(:note_label)
|
|
270
|
+
@convert_file_btn.text = t(:file_btn)
|
|
271
|
+
@language_btn.text = t(:language_btn)
|
|
272
|
+
|
|
273
|
+
@global_progress_label.text = t(:global_progress)
|
|
274
|
+
@file_progress_label.text = t(:file_progress)
|
|
275
|
+
|
|
276
|
+
@rtl_components.each { _1.visible = !_1.visible }
|
|
277
|
+
@ltr_components.each { _1.visible = !_1.visible }
|
|
278
|
+
end
|
|
279
|
+
|
|
280
|
+
def t(key, lang: nil) = TRANSLATIONS[lang || @lang][key]
|
|
281
|
+
|
|
282
|
+
def right_aligned_label(text)
|
|
283
|
+
ref = nil
|
|
284
|
+
|
|
285
|
+
horizontal_box do
|
|
286
|
+
right_alignment_label = label("") { stretchy true }
|
|
287
|
+
@rtl_components << right_alignment_label
|
|
288
|
+
|
|
289
|
+
ref = label(text) { stretchy false }
|
|
290
|
+
|
|
291
|
+
left_alignment_label = label("") { stretchy true }
|
|
292
|
+
left_alignment_label.visible = false
|
|
293
|
+
@ltr_components << left_alignment_label
|
|
294
|
+
end
|
|
295
|
+
|
|
296
|
+
ref
|
|
297
|
+
end
|
|
298
|
+
end
|
|
299
|
+
|
|
300
|
+
TahweelApp.launch unless defined?(Ocran)
|
data/lib/tahweel/authorizer.rb
CHANGED
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
require "
|
|
4
|
-
require "googleauth"
|
|
5
|
-
require "googleauth/stores/file_token_store"
|
|
3
|
+
require "fileutils"
|
|
6
4
|
require "socket"
|
|
7
5
|
require "uri"
|
|
6
|
+
|
|
7
|
+
require "googleauth"
|
|
8
|
+
require "googleauth/stores/file_token_store"
|
|
9
|
+
require "google/apis/drive_v3"
|
|
8
10
|
require "launchy"
|
|
9
|
-
require "fileutils"
|
|
10
11
|
require "xdg"
|
|
11
12
|
|
|
12
13
|
module Tahweel
|
|
@@ -104,7 +105,16 @@ module Tahweel
|
|
|
104
105
|
end
|
|
105
106
|
|
|
106
107
|
# Opens the system default browser to the Google Authorization URL.
|
|
107
|
-
def open_browser_for_auth
|
|
108
|
+
def open_browser_for_auth
|
|
109
|
+
url = @authorizer.get_authorization_url(base_url: REDIRECT_URI)
|
|
110
|
+
|
|
111
|
+
if Gem.win_platform?
|
|
112
|
+
# https://github.com/copiousfreetime/launchy/issues/167
|
|
113
|
+
system("start \"\" \"#{url}\"")
|
|
114
|
+
else
|
|
115
|
+
Launchy.open(url)
|
|
116
|
+
end
|
|
117
|
+
end
|
|
108
118
|
|
|
109
119
|
# Listens on the local server for the OAuth callback request.
|
|
110
120
|
# Handles multiple incoming requests to filter out noise (like favicon.ico).
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "fileutils"
|
|
3
4
|
require "pathname"
|
|
4
5
|
|
|
5
6
|
module Tahweel
|
|
@@ -17,7 +18,7 @@ module Tahweel
|
|
|
17
18
|
# @option options [String] :output The directory to save output files (defaults to current directory).
|
|
18
19
|
# @option options [Integer] :dpi DPI for PDF conversion (defaults to 150).
|
|
19
20
|
# @option options [Symbol] :processor The OCR processor to use (e.g., :google_drive).
|
|
20
|
-
# @option options [Integer] :
|
|
21
|
+
# @option options [Integer] :ocr_concurrency Max concurrent operations.
|
|
21
22
|
# @option options [Array<Symbol>] :formats Output formats (e.g., [:txt, :docx]).
|
|
22
23
|
# @option options [String] :page_separator Separator string for TXT output.
|
|
23
24
|
# @option options [String] :base_input_path The base path used to determine relative output structure.
|
|
@@ -66,8 +67,11 @@ module Tahweel
|
|
|
66
67
|
|
|
67
68
|
private
|
|
68
69
|
|
|
70
|
+
# Creates the output directory if it doesn't exist.
|
|
69
71
|
def ensure_output_directory_exists = FileUtils.mkdir_p(output_directory)
|
|
70
72
|
|
|
73
|
+
# Checks if all requested output formats already exist.
|
|
74
|
+
# @return [Boolean] True if all output files exist.
|
|
71
75
|
def all_outputs_exist?
|
|
72
76
|
@options[:formats].all? do |format|
|
|
73
77
|
extension = Tahweel::Writer.new(format: format).extension
|
|
@@ -75,22 +79,29 @@ module Tahweel
|
|
|
75
79
|
end
|
|
76
80
|
end
|
|
77
81
|
|
|
82
|
+
# Checks if the input file is a PDF.
|
|
83
|
+
# @return [Boolean]
|
|
78
84
|
def pdf? = File.extname(@file_path).downcase == ".pdf"
|
|
79
85
|
|
|
86
|
+
# Handles PDF processing: splitting, OCR, and saving.
|
|
87
|
+
# @param &block [Proc] Progress callback.
|
|
80
88
|
def process_pdf(&)
|
|
81
89
|
texts = Tahweel.convert(
|
|
82
90
|
@file_path,
|
|
83
91
|
dpi: @options[:dpi],
|
|
84
92
|
processor: @options[:processor],
|
|
85
|
-
concurrency: @options.fetch(:
|
|
93
|
+
concurrency: @options.fetch(:ocr_concurrency, Tahweel::Converter::DEFAULT_CONCURRENCY),
|
|
86
94
|
&
|
|
87
95
|
)
|
|
88
96
|
|
|
89
97
|
write_output(texts)
|
|
90
98
|
end
|
|
91
99
|
|
|
100
|
+
# Handles single image processing.
|
|
92
101
|
def process_image = write_output([Tahweel.extract(@file_path, processor: @options[:processor])])
|
|
93
102
|
|
|
103
|
+
# Writes the extracted text to all configured formats.
|
|
104
|
+
# @param texts [Array<String>] The list of extracted texts (per page).
|
|
94
105
|
def write_output(texts)
|
|
95
106
|
Tahweel::Writer.write(
|
|
96
107
|
texts,
|
|
@@ -100,6 +111,7 @@ module Tahweel
|
|
|
100
111
|
)
|
|
101
112
|
end
|
|
102
113
|
|
|
114
|
+
# @return [String] The full path for output files without extension.
|
|
103
115
|
def base_output_path = File.join(output_directory, File.basename(@file_path, ".*"))
|
|
104
116
|
|
|
105
117
|
# Determines the output directory.
|
data/lib/tahweel/cli/options.rb
CHANGED
|
@@ -3,6 +3,13 @@
|
|
|
3
3
|
require "etc"
|
|
4
4
|
require "optparse"
|
|
5
5
|
|
|
6
|
+
require_relative "../version"
|
|
7
|
+
require_relative "../converter"
|
|
8
|
+
require_relative "../ocr"
|
|
9
|
+
require_relative "../writer"
|
|
10
|
+
require_relative "../writers/txt"
|
|
11
|
+
require_relative "file_collector"
|
|
12
|
+
|
|
6
13
|
module Tahweel
|
|
7
14
|
module CLI
|
|
8
15
|
# Parses command-line arguments for the Tahweel CLI.
|
|
@@ -26,11 +33,13 @@ module Tahweel
|
|
|
26
33
|
options
|
|
27
34
|
end
|
|
28
35
|
|
|
36
|
+
# Returns the default configuration options.
|
|
37
|
+
# @return [Hash] Default options.
|
|
29
38
|
def self.default_options
|
|
30
39
|
{
|
|
31
40
|
dpi: 150,
|
|
32
41
|
processor: :google_drive,
|
|
33
|
-
|
|
42
|
+
ocr_concurrency: Tahweel::Converter::DEFAULT_CONCURRENCY,
|
|
34
43
|
file_concurrency: (Etc.nprocessors - 2).clamp(2..),
|
|
35
44
|
output: nil,
|
|
36
45
|
formats: %i[txt docx],
|
|
@@ -38,6 +47,10 @@ module Tahweel
|
|
|
38
47
|
}
|
|
39
48
|
end
|
|
40
49
|
|
|
50
|
+
# Configures the OptionParser instance.
|
|
51
|
+
#
|
|
52
|
+
# @param opts [OptionParser] The parser instance.
|
|
53
|
+
# @param options [Hash] The options hash to populate.
|
|
41
54
|
def self.configure_parser(opts, options) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
|
|
42
55
|
opts.program_name = "tahweel"
|
|
43
56
|
opts.version = Tahweel::VERSION
|
|
@@ -69,17 +82,17 @@ module Tahweel
|
|
|
69
82
|
end
|
|
70
83
|
|
|
71
84
|
opts.on(
|
|
72
|
-
"-
|
|
73
|
-
"Max concurrent
|
|
85
|
+
"-F", "--file-concurrency FILE_CONCURRENCY", POSITIVE_INTEGER,
|
|
86
|
+
"Max concurrent files to process (default: CPUs - 2 = #{options[:file_concurrency]})"
|
|
74
87
|
) do |value|
|
|
75
|
-
options[:
|
|
88
|
+
options[:file_concurrency] = value
|
|
76
89
|
end
|
|
77
90
|
|
|
78
91
|
opts.on(
|
|
79
|
-
"-
|
|
80
|
-
"Max concurrent
|
|
92
|
+
"-O", "--ocr-concurrency OCR_CONCURRENCY", POSITIVE_INTEGER,
|
|
93
|
+
"Max concurrent OCR operations (default: #{options[:ocr_concurrency]})"
|
|
81
94
|
) do |value|
|
|
82
|
-
options[:
|
|
95
|
+
options[:ocr_concurrency] = value
|
|
83
96
|
end
|
|
84
97
|
|
|
85
98
|
opts.on(
|
|
@@ -104,6 +117,10 @@ module Tahweel
|
|
|
104
117
|
end
|
|
105
118
|
end
|
|
106
119
|
|
|
120
|
+
# Validates that arguments were provided.
|
|
121
|
+
#
|
|
122
|
+
# @param args [Array<String>] The remaining arguments after parsing.
|
|
123
|
+
# @param parser [OptionParser] The parser instance for printing help.
|
|
107
124
|
def self.validate_args!(args, parser)
|
|
108
125
|
return unless args.empty?
|
|
109
126
|
|
|
@@ -102,6 +102,7 @@ module Tahweel
|
|
|
102
102
|
|
|
103
103
|
private
|
|
104
104
|
|
|
105
|
+
# Starts a background thread to refresh the display periodically.
|
|
105
106
|
def start_ticker
|
|
106
107
|
@ticker_thread = Thread.new do
|
|
107
108
|
while @running
|
|
@@ -111,6 +112,7 @@ module Tahweel
|
|
|
111
112
|
end
|
|
112
113
|
end
|
|
113
114
|
|
|
115
|
+
# Renders the progress dashboard to stdout.
|
|
114
116
|
def render # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
|
|
115
117
|
# Move cursor up to the start of our block
|
|
116
118
|
$stdout.print "\e[#{@concurrency + 1}A"
|
|
@@ -138,6 +140,11 @@ module Tahweel
|
|
|
138
140
|
$stdout.flush
|
|
139
141
|
end
|
|
140
142
|
|
|
143
|
+
# Truncates a file path to fit within the display.
|
|
144
|
+
#
|
|
145
|
+
# @param path [String] The file path.
|
|
146
|
+
# @param max_length [Integer] Maximum allowed length.
|
|
147
|
+
# @return [String] The truncated path.
|
|
141
148
|
def truncate_path(path, max_length)
|
|
142
149
|
return path.ljust(max_length) if path.length <= max_length
|
|
143
150
|
|
data/lib/tahweel/converter.rb
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "fileutils"
|
|
4
|
+
|
|
3
5
|
require_relative "pdf_splitter"
|
|
4
6
|
require_relative "ocr"
|
|
5
|
-
require "fileutils"
|
|
6
7
|
|
|
7
8
|
module Tahweel
|
|
8
9
|
# Orchestrates the full conversion process:
|
|
@@ -60,10 +61,10 @@ module Tahweel
|
|
|
60
61
|
# }
|
|
61
62
|
# @return [Array<String>] An array containing the text of each page.
|
|
62
63
|
def convert(&)
|
|
63
|
-
|
|
64
|
+
images_paths, temp_dir = PdfSplitter.split(@pdf_path, dpi: @dpi, &).values_at(:images_paths, :folder_path)
|
|
64
65
|
|
|
65
66
|
begin
|
|
66
|
-
process_images(
|
|
67
|
+
process_images(images_paths, Ocr.new(processor: @processor_type), &)
|
|
67
68
|
ensure
|
|
68
69
|
FileUtils.rm_rf(temp_dir)
|
|
69
70
|
end
|
|
@@ -71,31 +72,55 @@ module Tahweel
|
|
|
71
72
|
|
|
72
73
|
private
|
|
73
74
|
|
|
74
|
-
|
|
75
|
-
|
|
75
|
+
# Processes the list of images concurrently using the specified OCR engine.
|
|
76
|
+
#
|
|
77
|
+
# @param images_paths [Array<String>] List of paths to the image files.
|
|
78
|
+
# @param ocr_engine [Tahweel::Ocr] The initialized OCR engine instance.
|
|
79
|
+
# @param &block [Proc] Block to yield progress updates.
|
|
80
|
+
# @return [Array<String>] The text extracted from the images.
|
|
81
|
+
def process_images(images_paths, ocr_engine, &)
|
|
82
|
+
texts = Array.new(images_paths.size)
|
|
76
83
|
mutex = Mutex.new
|
|
77
84
|
processed_count = 0
|
|
78
85
|
|
|
79
|
-
run_workers(build_queue(
|
|
86
|
+
run_workers(build_queue(images_paths), ocr_engine, texts, mutex) do
|
|
80
87
|
processed_count += 1
|
|
81
|
-
report_progress(processed_count,
|
|
88
|
+
report_progress(processed_count, images_paths.size, &)
|
|
82
89
|
end
|
|
83
90
|
|
|
84
91
|
texts
|
|
85
92
|
end
|
|
86
93
|
|
|
87
|
-
|
|
94
|
+
# Builds a queue of images paths and their indices.
|
|
95
|
+
#
|
|
96
|
+
# @param images_paths [Array<String>] List of image paths.
|
|
97
|
+
# @return [Queue] A queue containing [path, index] tuples.
|
|
98
|
+
def build_queue(images_paths)
|
|
88
99
|
queue = Queue.new
|
|
89
|
-
|
|
100
|
+
images_paths.each_with_index { |path, index| queue << [path, index] }
|
|
90
101
|
queue
|
|
91
102
|
end
|
|
92
103
|
|
|
104
|
+
# Spawns worker threads to process items from the queue.
|
|
105
|
+
#
|
|
106
|
+
# @param queue [Queue] The queue of images to process.
|
|
107
|
+
# @param ocr_engine [Tahweel::Ocr] The OCR engine.
|
|
108
|
+
# @param texts [Array<String>] Shared array to store results.
|
|
109
|
+
# @param mutex [Mutex] Mutex for thread-safe updates.
|
|
110
|
+
# @param &block [Proc] Block to yield progress updates.
|
|
93
111
|
def run_workers(queue, ocr_engine, texts, mutex, &)
|
|
94
112
|
Array.new(@concurrency) do
|
|
95
113
|
Thread.new { process_queue_items(queue, ocr_engine, texts, mutex, &) }
|
|
96
114
|
end.each(&:join)
|
|
97
115
|
end
|
|
98
116
|
|
|
117
|
+
# Processing loop for a single worker thread.
|
|
118
|
+
#
|
|
119
|
+
# @param queue [Queue] The shared queue.
|
|
120
|
+
# @param ocr_engine [Tahweel::Ocr] The OCR engine.
|
|
121
|
+
# @param texts [Array<String>] Shared result array.
|
|
122
|
+
# @param mutex [Mutex] Synchronization primitive.
|
|
123
|
+
# @param &block [Proc] Block to yield progress updates.
|
|
99
124
|
def process_queue_items(queue, ocr_engine, texts, mutex, &)
|
|
100
125
|
loop do
|
|
101
126
|
begin
|
|
@@ -109,6 +134,13 @@ module Tahweel
|
|
|
109
134
|
end
|
|
110
135
|
end
|
|
111
136
|
|
|
137
|
+
# Thread-safe saving of OCR results.
|
|
138
|
+
#
|
|
139
|
+
# @param texts [Array<String>] The results array.
|
|
140
|
+
# @param index [Integer] Index of the current page.
|
|
141
|
+
# @param text [String] Extracted text.
|
|
142
|
+
# @param mutex [Mutex] Synchronization primitive.
|
|
143
|
+
# @yield Executes the progress reporting block within the lock.
|
|
112
144
|
def save_result(texts, index, text, mutex)
|
|
113
145
|
mutex.synchronize do
|
|
114
146
|
texts[index] = text
|
|
@@ -116,6 +148,11 @@ module Tahweel
|
|
|
116
148
|
end
|
|
117
149
|
end
|
|
118
150
|
|
|
151
|
+
# Reports progress to the optional block.
|
|
152
|
+
#
|
|
153
|
+
# @param processed [Integer] Number of pages processed.
|
|
154
|
+
# @param total [Integer] Total number of pages.
|
|
155
|
+
# @yield [Hash] Progress information.
|
|
119
156
|
def report_progress(processed, total)
|
|
120
157
|
return unless block_given?
|
|
121
158
|
|
data/lib/tahweel/pdf_splitter.rb
CHANGED
|
@@ -1,14 +1,15 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "etc"
|
|
3
4
|
require "fileutils"
|
|
4
|
-
require "rbconfig"
|
|
5
5
|
require "securerandom"
|
|
6
6
|
require "tmpdir"
|
|
7
|
-
|
|
7
|
+
|
|
8
|
+
require_relative "poppler_installer"
|
|
8
9
|
|
|
9
10
|
module Tahweel
|
|
10
11
|
# Handles the logic for splitting a PDF file into individual image pages.
|
|
11
|
-
# Uses
|
|
12
|
+
# Uses Poppler utils (pdftoppm, pdfinfo) for high-performance image processing.
|
|
12
13
|
class PdfSplitter
|
|
13
14
|
# Default DPI used when converting PDF pages to images.
|
|
14
15
|
# 150 DPI is a good balance between quality and file size for general documents.
|
|
@@ -25,7 +26,7 @@ module Tahweel
|
|
|
25
26
|
# percentage: Float,
|
|
26
27
|
# remaining_pages: Integer
|
|
27
28
|
# }
|
|
28
|
-
# @return [Hash] A hash containing the :folder_path (String) and :
|
|
29
|
+
# @return [Hash] A hash containing the :folder_path (String) and :images_paths (Array<String>).
|
|
29
30
|
def self.split(pdf_path, dpi: DEFAULT_DPI, &) = new(pdf_path, dpi:).split(&)
|
|
30
31
|
|
|
31
32
|
# Initializes a new PdfSplitter instance.
|
|
@@ -35,13 +36,12 @@ module Tahweel
|
|
|
35
36
|
def initialize(pdf_path, dpi: DEFAULT_DPI)
|
|
36
37
|
@pdf_path = pdf_path
|
|
37
38
|
@dpi = dpi
|
|
38
|
-
@image_paths = []
|
|
39
39
|
end
|
|
40
40
|
|
|
41
41
|
# Executes the PDF splitting process.
|
|
42
42
|
#
|
|
43
43
|
# This method performs the following steps:
|
|
44
|
-
# 1. Checks if
|
|
44
|
+
# 1. Checks if Poppler utils are available (installs if missing on Windows).
|
|
45
45
|
# 2. Validates the existence of the source PDF file.
|
|
46
46
|
# 3. Creates a unique temporary directory for output.
|
|
47
47
|
# 4. Iterates through each page of the PDF and converts it to a PNG image.
|
|
@@ -55,12 +55,11 @@ module Tahweel
|
|
|
55
55
|
# }
|
|
56
56
|
# @return [Hash] Result hash with keys:
|
|
57
57
|
# - :folder_path [String] The absolute path to the temporary directory containing the images.
|
|
58
|
-
# - :
|
|
59
|
-
# @raise [RuntimeError] If the PDF file is not found
|
|
60
|
-
# @raise [Vips::Error] If the underlying VIPS library encounters an error during processing.
|
|
58
|
+
# - :images_paths [Array<String>] List of absolute paths for each generated image file.
|
|
59
|
+
# @raise [RuntimeError] If the PDF file is not found.
|
|
61
60
|
def split(&)
|
|
62
|
-
check_libvips_installed!
|
|
63
61
|
validate_file_exists!
|
|
62
|
+
PopplerInstaller.ensure_installed!
|
|
64
63
|
setup_output_directory
|
|
65
64
|
process_pages(&)
|
|
66
65
|
result
|
|
@@ -68,20 +67,7 @@ module Tahweel
|
|
|
68
67
|
|
|
69
68
|
private
|
|
70
69
|
|
|
71
|
-
attr_reader :pdf_path, :dpi, :
|
|
72
|
-
|
|
73
|
-
# Checks if the `vips` CLI tool is available in the system PATH.
|
|
74
|
-
# Skips this check on Windows systems, assuming the environment is managed differently.
|
|
75
|
-
# Aborts execution with an error message if vips is missing.
|
|
76
|
-
def check_libvips_installed!
|
|
77
|
-
return if /mswin|mingw|cygwin/.match?(RbConfig::CONFIG["host_os"])
|
|
78
|
-
return if system("vips --version", out: File::NULL, err: File::NULL)
|
|
79
|
-
|
|
80
|
-
abort "Error: libvips is not installed. Please install it before using Tahweel.\n" \
|
|
81
|
-
"MacOS: `brew install vips`\n" \
|
|
82
|
-
"Ubuntu: `sudo apt install libvips42`\n" \
|
|
83
|
-
"Windows: Already installed with the Tahweel gem"
|
|
84
|
-
end
|
|
70
|
+
attr_reader :pdf_path, :dpi, :output_dir
|
|
85
71
|
|
|
86
72
|
# Ensures the source PDF file actually exists.
|
|
87
73
|
# @raise [RuntimeError] if the file is missing.
|
|
@@ -106,33 +92,101 @@ module Tahweel
|
|
|
106
92
|
# }
|
|
107
93
|
# @return [void]
|
|
108
94
|
def process_pages(&)
|
|
109
|
-
|
|
110
|
-
|
|
95
|
+
mutex = Mutex.new
|
|
96
|
+
processed_count = 0
|
|
97
|
+
|
|
98
|
+
run_workers(build_queue, mutex) do
|
|
99
|
+
processed_count += 1
|
|
100
|
+
report_progress(processed_count, &)
|
|
101
|
+
end
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
# Builds a queue containing all page indices to be processed.
|
|
105
|
+
# @return [Queue] The queue populated with page numbers.
|
|
106
|
+
def build_queue
|
|
107
|
+
queue = Queue.new
|
|
108
|
+
total_pages.times { queue << _1 }
|
|
109
|
+
queue
|
|
110
|
+
end
|
|
111
111
|
|
|
112
|
-
|
|
112
|
+
# Spawns and manages worker threads to process the queue.
|
|
113
|
+
#
|
|
114
|
+
# @param queue [Queue] The queue of pages to process.
|
|
115
|
+
# @param mutex [Mutex] Synchronization primitive for thread safety.
|
|
116
|
+
# @param &block [Proc] Block to execute when a page is processed.
|
|
117
|
+
def run_workers(queue, mutex, &)
|
|
118
|
+
concurrency = (Etc.nprocessors - 2).clamp(2..)
|
|
119
|
+
|
|
120
|
+
Array.new([concurrency, total_pages].min) do
|
|
121
|
+
Thread.new { process_queue_items(queue, mutex, &) }
|
|
122
|
+
end.each(&:join)
|
|
123
|
+
end
|
|
113
124
|
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
125
|
+
# Worker body: keeps taking page indices off the queue until it is empty.
#
# Each page is extracted first; the block is then run while holding the mutex.
#
# @param queue [Queue] The shared queue of pages.
# @param mutex [Mutex] Lock held while the block runs.
# @param &block [Proc] Block invoked after each extracted page.
def process_queue_items(queue, mutex, &on_page_done)
  loop do
    # A non-blocking pop raises ThreadError once the queue is drained.
    next_page = begin
      queue.pop(true)
    rescue ThreadError
      break
    end

    extract_page(next_page)

    mutex.synchronize(&on_page_done)
  end
end
|
|
122
143
|
|
|
144
|
+
# Sends a progress snapshot to the caller's block, if one was supplied.
#
# @param processed [Integer] Number of pages processed so far.
# @param &block [Proc] The progress callback block; receives a single Hash.
def report_progress(processed, &listener)
  return if listener.nil?

  completed_ratio = processed.to_f / total_pages
  snapshot = {
    file_path: @pdf_path,
    stage: :splitting,
    current_page: processed,
    percentage: (completed_ratio * 100).round(2),
    remaining_pages: total_pages - processed
  }

  listener.call(snapshot)
end
|
|
158
|
+
|
|
123
159
|
# Determines the number of pages in the PDF by running `pdfinfo` on it and
# reading the "Pages:" line of its output. The value is memoized.
#
# The command is spawned with an argument array (no shell), so a path
# containing quotes, spaces, `$(...)` or backticks is passed through verbatim
# instead of being interpreted by a shell.
#
# @return [Integer] The page count.
# @raise [RuntimeError] if `pdfinfo` cannot be started or prints no page count.
def total_pages
  @total_pages ||= begin
    command = [PopplerInstaller.pdfinfo_path.to_s, pdf_path.to_s]

    raw_output =
      begin
        IO.popen(command, &:read).to_s
      rescue SystemCallError => e
        # e.g. the executable is missing; surface it like any other failure.
        raise "Failed to get page count from PDF: #{e.message}"
      end

    output = raw_output.encode("UTF-8", invalid: :replace, undef: :replace, replace: "")

    pages = output[/Pages:\s*(\d+)/, 1]
    raise "Failed to get page count from PDF: #{output}" unless pages

    pages.to_i
  end
end
|
|
128
174
|
|
|
129
175
|
# Extracts a specific page from the PDF and saves it as a PNG.
#
# Runs `pdftoppm` limited to a single page (`-f` and `-l` both set to the
# one-based page number) using the "page" prefix inside the output directory.
#
# @param page_num [Integer] The zero-based index of the page to extract.
# @return [true] when the command succeeds.
# @raise [RuntimeError] if the command cannot be run or exits unsuccessfully,
#   so a missing page image is reported instead of being silently skipped.
def extract_page(page_num)
  page_number = (page_num + 1).to_s
  output_prefix = File.join(output_dir, "page")

  succeeded = system(
    PopplerInstaller.pdftoppm_path.to_s,
    "-png",
    "-r", dpi.to_s,
    "-f", page_number,
    "-l", page_number,
    pdf_path,
    output_prefix
  )
  return true if succeeded

  raise "Failed to extract page #{page_number} from PDF: #{pdf_path}"
end
|
|
137
191
|
|
|
138
192
|
# Constructs the final result hash.
|
|
@@ -140,7 +194,7 @@ module Tahweel
|
|
|
140
194
|
def result
  # Collect the generated page images ("page-*.png") in sorted path order.
  page_images = Dir[File.join(output_dir, "page-*.png")]

  { folder_path: output_dir, images_paths: page_images.sort }
end
|
|
146
200
|
end
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "fileutils"
|
|
4
|
+
require "json"
|
|
5
|
+
require "net/http"
|
|
6
|
+
require "open-uri"
|
|
7
|
+
require "uri"
|
|
8
|
+
|
|
9
|
+
require "xdg"
|
|
10
|
+
require "zip"
|
|
11
|
+
|
|
12
|
+
module Tahweel
  # Handles the installation and path resolution for Poppler utilities.
  #
  # On Windows, this class can automatically download and install the necessary
  # binaries if they are not present. On other platforms, it provides instructions
  # for manual installation.
  class PopplerInstaller
    POPPLER_REPO_API = "https://api.github.com/repos/oschwartz10612/poppler-windows/releases/latest"

    # Executables that must be present in the Windows cache for it to count as installed.
    CACHED_EXECUTABLES = %w[pdftoppm.exe pdfinfo.exe].freeze

    # Ensures that Poppler utilities are installed.
    #
    # On Windows: Installs Poppler locally if not found.
    # On other platforms: Aborts with an error message if Poppler is missing.
    #
    # @raise [SystemExit] if Poppler is missing on non-Windows platforms.
    def self.ensure_installed! # rubocop:disable Metrics/MethodLength
      installer = new
      return if installer.installed?

      if Gem.win_platform?
        installer.install
      else
        abort <<~MSG
          Error: Poppler utilities are not installed. Please install them:
          MacOS: `brew install poppler`
          Ubuntu: `sudo apt install poppler-utils`
        MSG
      end
    end

    # Returns the path to the `pdftoppm` executable.
    # @return [String, nil] path to the executable (see #pdftoppm_path).
    def self.pdftoppm_path = new.pdftoppm_path

    # Returns the path to the `pdfinfo` executable.
    # @return [String, nil] path to the executable (see #pdfinfo_path).
    def self.pdfinfo_path = new.pdfinfo_path

    # Installs Poppler binaries on Windows.
    #
    # Downloads the latest release from GitHub and extracts it to the cache directory.
    # Does nothing if already installed. The downloaded archive is removed afterwards,
    # whether or not extraction succeeded.
    def install
      zip_path = nil
      return if installed?

      zip_path = download_release_file
      extract_zip_file(zip_path)
    ensure
      FileUtils.rm_f(zip_path) if zip_path
    end

    # Checks if Poppler utilities are available.
    #
    # @return [Boolean] true if `pdftoppm` and `pdfinfo` are in the PATH or cached.
    def installed? = (command_exists?("pdftoppm") && command_exists?("pdfinfo")) || cached?

    # Checks if Poppler binaries are present in the local cache (Windows only).
    #
    # Both executables are required: a cache holding only one of them (for
    # example after an interrupted extraction) is treated as not installed.
    #
    # @return [Boolean] true if cached binaries exist.
    def cached?
      return false unless Gem.win_platform?

      bin_dir = cached_bin_path
      return false if bin_dir.empty?

      CACHED_EXECUTABLES.all? { File.exist?(File.join(bin_dir, _1)) }
    end

    # Resolves the path to the `pdftoppm` executable.
    #
    # Prioritizes the system PATH, falling back to the cached version on Windows.
    #
    # @return [String, nil] path to `pdftoppm`; nil when it is not in the PATH
    #   on a non-Windows platform.
    def pdftoppm_path
      return "pdftoppm" if command_exists?("pdftoppm")

      Gem.win_platform? ? File.join(cached_bin_path, "pdftoppm.exe") : nil
    end

    # Resolves the path to the `pdfinfo` executable.
    #
    # Prioritizes the system PATH, falling back to the cached version on Windows.
    #
    # @return [String, nil] path to `pdfinfo`; nil when it is not in the PATH
    #   on a non-Windows platform.
    def pdfinfo_path
      return "pdfinfo" if command_exists?("pdfinfo")

      Gem.win_platform? ? File.join(cached_bin_path, "pdfinfo.exe") : nil
    end

    private

    # Locates the `bin` directory within the cached Poppler installation.
    #
    # Searches for a directory matching "poppler-*" in the cache directory and returns
    # the path to its `Library/bin` subdirectory.
    #
    # @return [String] Path to the `bin` directory, or an empty string if not found.
    def cached_bin_path
      poppler_root = Dir.glob(File.join(cache_dir, "poppler-*")).first
      return "" unless poppler_root

      File.join(poppler_root, "Library", "bin")
    end

    # Checks if a command is available in the system path.
    #
    # Runs `where` (Windows) or `which` (elsewhere) with the command as a
    # separate argument, so no shell parses the command name.
    #
    # @param cmd [String] The command to check for.
    # @return [Boolean] true if the command exists in the PATH.
    def command_exists?(cmd)
      tool = Gem.win_platform? ? "where" : "which"
      # `system` returns nil when the lookup tool itself cannot be run.
      system(tool, cmd.to_s, out: File::NULL, err: File::NULL) == true
    end

    # Downloads the latest Poppler release zip file.
    #
    # Fetches the download URL from the GitHub API and streams the file into the
    # cache directory. A partially written file is removed if the download fails.
    #
    # @return [String] The local path to the downloaded zip file.
    def download_release_file
      release_uri = URI.parse(latest_release_url)
      zip_path = File.join(cache_dir, File.basename(release_uri.path))

      release_uri.open do |remote|
        File.open(zip_path, "wb") { |local| IO.copy_stream(remote, local) }
      end

      zip_path
    rescue StandardError
      FileUtils.rm_f(zip_path) if zip_path
      raise
    end

    # Retrieves the download URL for the latest Windows release of Poppler.
    #
    # Queries the GitHub API for the latest release and finds the asset matching "Release*.zip".
    #
    # @return [String] The download URL of the asset.
    # @raise [SystemExit] if the API request fails or no valid asset is found.
    def latest_release_url # rubocop:disable Metrics/AbcSize
      uri = URI(POPPLER_REPO_API)
      request = Net::HTTP::Get.new(uri)
      request["User-Agent"] = "Tahweel-Gem"

      response = Net::HTTP.start(uri.hostname, uri.port, use_ssl: true) { _1.request(request) }

      unless response.is_a?(Net::HTTPSuccess)
        abort "Failed to fetch Poppler release info: #{response.code} #{response.message}"
      end

      asset = release_assets(response.body).find { _1["name"].to_s.match?(/\ARelease.*\.zip\z/) }

      asset ? asset["browser_download_url"] : abort("No valid Windows release found for Poppler.")
    end

    # Pulls the list of asset hashes out of a release API response body.
    #
    # @param body [String] Raw JSON response body.
    # @return [Array<Hash>] The assets, or an empty array when the body is not
    #   valid JSON or does not have the expected shape.
    def release_assets(body)
      payload = JSON.parse(body.to_s)
      assets = payload.is_a?(Hash) ? payload["assets"] : nil

      assets.is_a?(Array) ? assets.grep(Hash) : []
    rescue JSON::ParserError
      []
    end

    # Extracts the downloaded zip file to the cache directory.
    #
    # Every entry is checked before extraction so that a name such as
    # "../../evil" cannot write outside the cache directory.
    #
    # @param zip_path [String] Path to the zip file to extract.
    # @raise [RuntimeError] if an entry would be extracted outside the cache directory.
    def extract_zip_file(zip_path)
      destination_root = cache_dir

      Zip::File.open(zip_path) do |zip_file|
        zip_file.each do |entry|
          entry_dest = File.join(destination_root, entry.name)
          unless inside_directory?(destination_root, entry_dest)
            raise "Refusing to extract #{entry.name.inspect} outside of #{destination_root}"
          end

          FileUtils.mkdir_p(File.dirname(entry_dest))
          zip_file.extract(entry, entry_dest) { true }
        end
      end
    end

    # Tells whether a path resolves to the given directory or somewhere below it.
    #
    # @param directory [String] The containing directory.
    # @param path [String] The path to check.
    # @return [Boolean] true if the normalized path stays within the directory.
    def inside_directory?(directory, path)
      root = File.expand_path(directory)
      resolved = File.expand_path(path)

      resolved == root || resolved.start_with?(File.join(root, ""))
    end

    # Resolves the directory used for caching downloaded binaries.
    #
    # Uses the XDG cache home directory if available, otherwise defaults to `~/.cache/tahweel/poppler`.
    # The directory is created if it does not exist yet.
    #
    # @return [String] Path to the cache directory.
    def cache_dir
      base = XDG.new.cache_home.to_s
      base = File.join(Dir.home, ".cache") if base.empty?

      dir = File.join(base, "tahweel", "poppler")
      FileUtils.mkdir_p(dir)
      dir
    end
  end
end
|
data/lib/tahweel/version.rb
CHANGED
data/lib/tahweel/writer.rb
CHANGED
data/lib/tahweel/writers/docx.rb
CHANGED
|
@@ -14,10 +14,11 @@ module Tahweel
|
|
|
14
14
|
# Writes the extracted texts to a file.
|
|
15
15
|
#
|
|
16
16
|
# It applies several transformations to the text before writing:
|
|
17
|
-
# 1. Normalizes line endings to `\n`.
|
|
17
|
+
# 1. Normalizes all line endings (`\r\n`, `\r`) to `\n`.
|
|
18
18
|
# 2. Collapses consecutive identical whitespace characters.
|
|
19
19
|
# 3. Compacts the text by merging short lines if the page is too long (> 40 lines).
|
|
20
20
|
# 4. Determines text alignment (RTL/LTR) based on content.
|
|
21
|
+
# 5. Converts `\n` to proper OOXML line breaks for cross-platform compatibility.
|
|
21
22
|
#
|
|
22
23
|
# @param texts [Array<String>] The extracted texts (one per page).
|
|
23
24
|
# @param destination [String] The output file path.
|
|
@@ -26,10 +27,10 @@ module Tahweel
|
|
|
26
27
|
def write(texts, destination, options = {}) # rubocop:disable Lint/UnusedMethodArgument
|
|
27
28
|
Caracal::Document.save(destination) do |docx|
|
|
28
29
|
texts.each_with_index do |text, index|
|
|
29
|
-
text = text.gsub(
|
|
30
|
+
text = text.gsub(/\r\n?/, "\n").gsub(/(\s)\1+/, '\1').strip
|
|
30
31
|
text = compact_shortest_lines(text) while expected_lines_in_page(text) > 40
|
|
31
32
|
|
|
32
|
-
docx
|
|
33
|
+
write_paragraph(docx, text)
|
|
33
34
|
|
|
34
35
|
docx.page if index < texts.size - 1
|
|
35
36
|
end
|
|
@@ -38,6 +39,28 @@ module Tahweel
|
|
|
38
39
|
|
|
39
40
|
private
|
|
40
41
|
|
|
42
|
+
# Emits one paragraph whose lines are joined by explicit line-break elements.
#
# The text is split on "\n"; each piece is written as a text run (size 20)
# and a break is inserted between consecutive pieces via the paragraph's `br`,
# rather than leaving raw newline characters in the run.
#
# @param docx [Caracal::Document] The document to write to.
# @param text [String] The text content with newlines.
# @return [void]
def write_paragraph(docx, text)
  direction = alignment_for(text)
  segments = text.split("\n")
  final_position = segments.size - 1

  docx.p align: direction do
    segments.each_with_index do |segment, position|
      text segment, size: 20
      br unless position == final_position
    end
  end
end
|
|
63
|
+
|
|
41
64
|
# Determines the text alignment based on the ratio of Arabic to non-Arabic characters.
|
|
42
65
|
#
|
|
43
66
|
# @param text [String] The text to analyze.
|
data/lib/tahweel.rb
CHANGED
|
@@ -1,14 +1,19 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require_relative "tahweel/version"
|
|
4
|
+
require_relative "tahweel/cli/options"
|
|
5
|
+
require_relative "tahweel/cli/file_processor"
|
|
6
|
+
require_relative "tahweel/cli/file_collector"
|
|
4
7
|
require_relative "tahweel/authorizer"
|
|
8
|
+
require_relative "tahweel/poppler_installer"
|
|
5
9
|
require_relative "tahweel/pdf_splitter"
|
|
6
10
|
require_relative "tahweel/ocr"
|
|
11
|
+
require_relative "tahweel/processors/google_drive"
|
|
7
12
|
require_relative "tahweel/converter"
|
|
8
13
|
require_relative "tahweel/writer"
|
|
9
|
-
require_relative "tahweel/
|
|
10
|
-
require_relative "tahweel/
|
|
11
|
-
require_relative "tahweel/
|
|
14
|
+
require_relative "tahweel/writers/txt"
|
|
15
|
+
require_relative "tahweel/writers/docx"
|
|
16
|
+
require_relative "tahweel/writers/json"
|
|
12
17
|
|
|
13
18
|
module Tahweel # rubocop:disable Style/Documentation
|
|
14
19
|
class Error < StandardError; end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: tahweel
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.
|
|
4
|
+
version: 0.1.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ali Hamdi Ali Fadel
|
|
@@ -23,6 +23,48 @@ dependencies:
|
|
|
23
23
|
- - "~>"
|
|
24
24
|
- !ruby/object:Gem::Version
|
|
25
25
|
version: '1.4'
|
|
26
|
+
- !ruby/object:Gem::Dependency
|
|
27
|
+
name: csv
|
|
28
|
+
requirement: !ruby/object:Gem::Requirement
|
|
29
|
+
requirements:
|
|
30
|
+
- - "~>"
|
|
31
|
+
- !ruby/object:Gem::Version
|
|
32
|
+
version: '3.3'
|
|
33
|
+
type: :runtime
|
|
34
|
+
prerelease: false
|
|
35
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
36
|
+
requirements:
|
|
37
|
+
- - "~>"
|
|
38
|
+
- !ruby/object:Gem::Version
|
|
39
|
+
version: '3.3'
|
|
40
|
+
- !ruby/object:Gem::Dependency
|
|
41
|
+
name: fiddle
|
|
42
|
+
requirement: !ruby/object:Gem::Requirement
|
|
43
|
+
requirements:
|
|
44
|
+
- - "~>"
|
|
45
|
+
- !ruby/object:Gem::Version
|
|
46
|
+
version: '1.1'
|
|
47
|
+
type: :runtime
|
|
48
|
+
prerelease: false
|
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
50
|
+
requirements:
|
|
51
|
+
- - "~>"
|
|
52
|
+
- !ruby/object:Gem::Version
|
|
53
|
+
version: '1.1'
|
|
54
|
+
- !ruby/object:Gem::Dependency
|
|
55
|
+
name: glimmer-dsl-libui
|
|
56
|
+
requirement: !ruby/object:Gem::Requirement
|
|
57
|
+
requirements:
|
|
58
|
+
- - "~>"
|
|
59
|
+
- !ruby/object:Gem::Version
|
|
60
|
+
version: 0.13.1
|
|
61
|
+
type: :runtime
|
|
62
|
+
prerelease: false
|
|
63
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
64
|
+
requirements:
|
|
65
|
+
- - "~>"
|
|
66
|
+
- !ruby/object:Gem::Version
|
|
67
|
+
version: 0.13.1
|
|
26
68
|
- !ruby/object:Gem::Dependency
|
|
27
69
|
name: google-apis-drive_v3
|
|
28
70
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -79,20 +121,6 @@ dependencies:
|
|
|
79
121
|
- - "~>"
|
|
80
122
|
- !ruby/object:Gem::Version
|
|
81
123
|
version: '3.3'
|
|
82
|
-
- !ruby/object:Gem::Dependency
|
|
83
|
-
name: ruby-vips
|
|
84
|
-
requirement: !ruby/object:Gem::Requirement
|
|
85
|
-
requirements:
|
|
86
|
-
- - "~>"
|
|
87
|
-
- !ruby/object:Gem::Version
|
|
88
|
-
version: '2.2'
|
|
89
|
-
type: :runtime
|
|
90
|
-
prerelease: false
|
|
91
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
92
|
-
requirements:
|
|
93
|
-
- - "~>"
|
|
94
|
-
- !ruby/object:Gem::Version
|
|
95
|
-
version: '2.2'
|
|
96
124
|
- !ruby/object:Gem::Dependency
|
|
97
125
|
name: xdg
|
|
98
126
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -114,6 +142,7 @@ email:
|
|
|
114
142
|
executables:
|
|
115
143
|
- tahweel
|
|
116
144
|
- tahweel-clear
|
|
145
|
+
- tahweel-ui
|
|
117
146
|
extensions: []
|
|
118
147
|
extra_rdoc_files: []
|
|
119
148
|
files:
|
|
@@ -124,8 +153,11 @@ files:
|
|
|
124
153
|
- LICENSE.txt
|
|
125
154
|
- README.md
|
|
126
155
|
- Rakefile
|
|
156
|
+
- assets/logo.png
|
|
157
|
+
- assets/windows/tahweel.ico
|
|
127
158
|
- bin/tahweel
|
|
128
159
|
- bin/tahweel-clear
|
|
160
|
+
- bin/tahweel-ui
|
|
129
161
|
- lib/tahweel.rb
|
|
130
162
|
- lib/tahweel/authorizer.rb
|
|
131
163
|
- lib/tahweel/cli/file_collector.rb
|
|
@@ -135,6 +167,7 @@ files:
|
|
|
135
167
|
- lib/tahweel/converter.rb
|
|
136
168
|
- lib/tahweel/ocr.rb
|
|
137
169
|
- lib/tahweel/pdf_splitter.rb
|
|
170
|
+
- lib/tahweel/poppler_installer.rb
|
|
138
171
|
- lib/tahweel/processors/google_drive.rb
|
|
139
172
|
- lib/tahweel/templates/success.html
|
|
140
173
|
- lib/tahweel/version.rb
|