tahweel 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.vscode/extensions.json +15 -0
- data/.vscode/settings.json +16 -0
- data/CHANGELOG.md +54 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE.txt +21 -0
- data/README.md +43 -0
- data/Rakefile +12 -0
- data/bin/tahweel +67 -0
- data/bin/tahweel-clear +14 -0
- data/lib/tahweel/authorizer.rb +166 -0
- data/lib/tahweel/cli/file_collector.rb +33 -0
- data/lib/tahweel/cli/file_processor.rb +126 -0
- data/lib/tahweel/cli/options.rb +115 -0
- data/lib/tahweel/cli/progress_renderer.rb +148 -0
- data/lib/tahweel/converter.rb +131 -0
- data/lib/tahweel/ocr.rb +42 -0
- data/lib/tahweel/pdf_splitter.rb +147 -0
- data/lib/tahweel/processors/google_drive.rb +109 -0
- data/lib/tahweel/templates/success.html +192 -0
- data/lib/tahweel/version.rb +5 -0
- data/lib/tahweel/writer.rb +49 -0
- data/lib/tahweel/writers/docx.rb +82 -0
- data/lib/tahweel/writers/json.rb +32 -0
- data/lib/tahweel/writers/txt.rb +27 -0
- data/lib/tahweel.rb +37 -0
- data/mise.toml +2 -0
- data/sig/tahweel.rbs +4 -0
- metadata +173 -0
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "fileutils"
|
|
4
|
+
require "rbconfig"
|
|
5
|
+
require "securerandom"
|
|
6
|
+
require "tmpdir"
|
|
7
|
+
require "vips"
|
|
8
|
+
|
|
9
|
+
module Tahweel
|
|
10
|
+
# Handles the logic for splitting a PDF file into individual image pages.
|
|
11
|
+
# Uses the libvips library for high-performance image processing.
|
|
12
|
+
class PdfSplitter
  # Default DPI used when converting PDF pages to images.
  # 150 DPI is a good balance between quality and file size for general documents.
  DEFAULT_DPI = 150

  # Convenience class method to initialize and execute the split operation in one go.
  #
  # @param pdf_path [String] The local file path to the PDF document.
  # @param dpi [Integer] The resolution (Dots Per Inch) for rendering the PDF pages. Defaults to 150.
  # @yield [Hash] Progress info after each page, see {#split}.
  # @return [Hash] A hash containing the :folder_path (String) and :image_paths (Array<String>).
  def self.split(pdf_path, dpi: DEFAULT_DPI, &) = new(pdf_path, dpi:).split(&)

  # Initializes a new PdfSplitter instance.
  #
  # @param pdf_path [String] The local file path to the PDF document.
  # @param dpi [Integer] The resolution (Dots Per Inch) to use. Defaults to 150.
  def initialize(pdf_path, dpi: DEFAULT_DPI)
    @pdf_path = pdf_path
    @dpi = dpi
    @image_paths = []
  end

  # Executes the PDF splitting process.
  #
  # Steps:
  # 1. Checks that the `vips` CLI is available (skipped on Windows).
  # 2. Validates the existence of the source PDF file.
  # 3. Creates a unique temporary directory for output.
  # 4. Renders every page of the PDF to a PNG image inside that directory.
  #
  # If step 4 fails (rendering error or an exception raised by the progress block),
  # the temporary directory is removed before the error is re-raised: the caller never
  # receives :folder_path in that case and therefore could not clean it up itself.
  #
  # @yield [Hash] Progress info after each page: {
  #   file_path: String,
  #   stage: :splitting,
  #   current_page: Integer,
  #   percentage: Float,
  #   remaining_pages: Integer
  # }
  # @return [Hash] Result hash with keys:
  #   - :folder_path [String] The temporary directory containing the images.
  #   - :image_paths [Array<String>] Paths of the generated image files, in page order.
  # @raise [RuntimeError] If the PDF file is not found.
  # @raise [SystemExit] If the `vips` command is missing (the check calls `abort`).
  # @raise [Vips::Error] If the underlying VIPS library encounters an error during processing.
  def split(&)
    check_libvips_installed!
    validate_file_exists!
    setup_output_directory

    begin
      process_pages(&)
    rescue StandardError
      # Do not leave a half-filled directory behind that nobody holds a reference to.
      FileUtils.rm_rf(output_dir)
      raise
    end

    result
  end

  private

  attr_reader :pdf_path, :dpi, :image_paths, :output_dir

  # Checks if the `vips` CLI tool is available in the system PATH.
  # Skips this check on Windows hosts (mswin/mingw/cygwin).
  # Aborts the process with an installation hint if the command cannot be run.
  #
  # @return [void]
  def check_libvips_installed!
    return if /mswin|mingw|cygwin/.match?(RbConfig::CONFIG["host_os"])
    return if system("vips --version", out: File::NULL, err: File::NULL)

    abort "Error: libvips is not installed. Please install it before using Tahweel.\n" \
          "MacOS: `brew install vips`\n" \
          "Ubuntu: `sudo apt install libvips42`\n" \
          "Windows: Already installed with the Tahweel gem"
  end

  # Ensures the source PDF file actually exists.
  #
  # @raise [RuntimeError] if the file is missing.
  # @return [void]
  def validate_file_exists!
    raise "File not found: #{pdf_path}" unless File.exist?(pdf_path)
  end

  # Creates a uniquely named (UUID-suffixed) directory under the system temp dir
  # and remembers it as the output directory.
  #
  # @return [void]
  def setup_output_directory
    @output_dir = File.join(Dir.tmpdir, "tahweel_#{SecureRandom.uuid}")
    FileUtils.mkdir_p(@output_dir)
  end

  # Extracts every page in order and reports progress after each one.
  #
  # @yield [Hash] Progress info (see {#split}); only yielded when a block is given.
  # @return [void]
  def process_pages(&)
    total_pages.times do |i|
      extract_page(i)

      next unless block_given?

      yield({
        file_path: @pdf_path, stage: :splitting,
        current_page: i + 1,
        percentage: (((i + 1).to_f / total_pages) * 100).round(2),
        remaining_pages: total_pages - (i + 1)
      })
    end
  end

  # Reads the page count from the "pdf-n_pages" metadata field of the first page.
  # The value is memoized, so the PDF is only opened once for this purpose.
  #
  # @return [Integer] The page count.
  def total_pages
    @total_pages ||= Vips::Image.pdfload(pdf_path, page: 0, dpi: dpi, access: :sequential).get("pdf-n_pages")
  end

  # Renders a single page of the PDF and saves it as `page_<n>.png` (1-based)
  # in the output directory, recording the path in {#image_paths}.
  #
  # @param page_num [Integer] The zero-based index of the page to extract.
  # @return [void]
  def extract_page(page_num)
    output_path = File.join(output_dir, "page_#{page_num + 1}.png")
    Vips::Image.pdfload(pdf_path, page: page_num, dpi: dpi, access: :sequential).write_to_file(output_path)
    image_paths << output_path
  end

  # Constructs the final result hash.
  #
  # @return [Hash] `{ folder_path: String, image_paths: Array<String> }`
  def result
    {
      folder_path: output_dir,
      image_paths: image_paths
    }
  end
end
|
|
147
|
+
end
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "google/apis/drive_v3"
|
|
4
|
+
require "securerandom"
|
|
5
|
+
require "stringio"
|
|
6
|
+
|
|
7
|
+
require_relative "../authorizer"
|
|
8
|
+
|
|
9
|
+
module Tahweel
|
|
10
|
+
module Processors
|
|
11
|
+
# Handles the conversion of images to text using Google Drive's OCR capabilities.
|
|
12
|
+
#
|
|
13
|
+
# This class automates the process of:
|
|
14
|
+
# 1. Uploading a local image to Google Drive as a Google Document.
|
|
15
|
+
# 2. Downloading the content of that document as plain text.
|
|
16
|
+
# 3. Cleaning up (deleting) the temporary file from Drive.
|
|
17
|
+
#
|
|
18
|
+
# It includes robust error handling with infinite retries and exponential backoff
|
|
19
|
+
# for network issues, rate limits, and server errors.
|
|
20
|
+
class GoogleDrive
  # Initializes the Google Drive OCR service.
  # Sets up the Google Drive API client and authorizes it using {Tahweel::Authorizer}.
  #
  # @note For bulk processing, instantiate this once and reuse it.
  def initialize
    @service = Google::Apis::DriveV3::DriveService.new
    @service.client_options.application_name = "Tahweel"
    @service.authorization = Tahweel::Authorizer.authorize
  end

  # Extracts text from an image file using the "Upload -> Export -> Delete" flow.
  #
  # The exported text has CRLF line endings normalized to LF, every run of sixteen
  # underscores removed, and surrounding whitespace stripped.
  #
  # The temporary document created on Google Drive is deleted whether the download
  # succeeds or fails. If the upload itself fails there is no remote file, so no
  # delete request is issued and the upload error propagates unchanged.
  #
  # @param file_path [String] The path to the image file.
  # @return [String] The extracted text.
  # @raise [RuntimeError] If the file does not exist locally.
  # @raise [Google::Apis::Error] If a non-retriable API error occurs (e.g., 401, 403, 404).
  def extract(file_path)
    raise "File not found: #{file_path}" unless File.exist?(file_path)

    file_id = nil

    begin
      file_id = upload_file(file_path)
      download_text(file_id).gsub("\r\n", "\n").gsub("________________", "").strip
    ensure
      # file_id is still nil when upload_file raised; deleting "nil" would only add a
      # second failure on top of (and potentially instead of) the original error.
      delete_file(file_id) if file_id
    end
  end

  private

  # Uploads the file to Google Drive with the MIME type set to 'application/vnd.google-apps.document'.
  # This triggers Google's automatic OCR processing. The remote file gets a random UUID as its name.
  #
  # @param file_path [String] Path to the local file.
  # @return [String] The ID of the created file on Google Drive.
  def upload_file(file_path)
    execute_with_retry do
      @service.create_file(
        {
          name: SecureRandom.uuid,
          mime_type: "application/vnd.google-apps.document"
        },
        upload_source: file_path,
        fields: "id"
      ).id
    end
  end

  # Exports the Google Document as plain text into an in-memory buffer.
  #
  # @param file_id [String] The ID of the file on Google Drive.
  # @return [String] The content of the file as a string.
  def download_text(file_id)
    execute_with_retry do
      StringIO.new.tap do |dest|
        @service.export_file(file_id, "text/plain", download_dest: dest)
      end.string
    end
  end

  # Deletes the temporary file from Google Drive.
  #
  # @param file_id [String] The ID of the file to delete.
  # @return [void]
  def delete_file(file_id) = execute_with_retry { @service.delete_file(file_id) }

  # Executes a block, retrying without limit on rate-limit, timeout, transmission and
  # server errors. The wait before each retry is 1.5**retries seconds capped at 15,
  # plus a jitter of 0 or 1 second. Any other error propagates immediately.
  #
  # @yield The block to execute.
  # @return [Object] The block's return value.
  # @raise [Google::Apis::Error] Rethrows non-retriable errors immediately.
  def execute_with_retry
    retries = 0

    begin
      yield
    rescue Google::Apis::RateLimitError, Google::Apis::RequestTimeOutError,
           Google::Apis::TransmissionError, Google::Apis::ServerError
      sleep([1.5**retries, 15].min + rand(0..1))
      retries += 1
      retry
    end
  end
end
|
|
108
|
+
end
|
|
109
|
+
end
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
<!DOCTYPE html>
|
|
2
|
+
<html lang="ar" dir="rtl">
|
|
3
|
+
<head>
|
|
4
|
+
<meta charset="UTF-8">
|
|
5
|
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
6
|
+
<title>Tahweel Authorization</title>
|
|
7
|
+
<!-- Google Fonts: Poppins for English, Cairo for Arabic -->
|
|
8
|
+
<link rel="preconnect" href="https://fonts.googleapis.com">
|
|
9
|
+
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
|
10
|
+
<link href="https://fonts.googleapis.com/css2?family=Cairo:wght@400;700&family=Poppins:wght@400;600&display=swap" rel="stylesheet">
|
|
11
|
+
<style>
|
|
12
|
+
:root {
|
|
13
|
+
--primary-color: #10b981; /* Emerald Green */
|
|
14
|
+
--primary-hover: #059669;
|
|
15
|
+
--bg-gradient: linear-gradient(135deg, #f0fdf4 0%, #d1fae5 100%);
|
|
16
|
+
--text-dark: #1f2937;
|
|
17
|
+
--text-light: #6b7280;
|
|
18
|
+
--card-shadow: 0 10px 25px -5px rgba(0, 0, 0, 0.1), 0 8px 10px -6px rgba(0, 0, 0, 0.1);
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
body {
|
|
22
|
+
margin: 0;
|
|
23
|
+
min-height: 100vh;
|
|
24
|
+
display: flex;
|
|
25
|
+
align-items: center;
|
|
26
|
+
justify-content: center;
|
|
27
|
+
background: var(--bg-gradient);
|
|
28
|
+
font-family: 'Cairo', sans-serif; /* Default to Arabic font */
|
|
29
|
+
transition: all 0.3s ease;
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
/* English Font Override */
|
|
33
|
+
body.font-en {
|
|
34
|
+
font-family: 'Poppins', sans-serif;
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
.container {
|
|
38
|
+
background: rgba(255, 255, 255, 0.95);
|
|
39
|
+
padding: 3rem;
|
|
40
|
+
border-radius: 24px;
|
|
41
|
+
box-shadow: var(--card-shadow);
|
|
42
|
+
text-align: center;
|
|
43
|
+
max-width: 420px;
|
|
44
|
+
width: 90%;
|
|
45
|
+
position: relative;
|
|
46
|
+
backdrop-filter: blur(10px);
|
|
47
|
+
animation: fadeUp 0.6s cubic-bezier(0.16, 1, 0.3, 1);
|
|
48
|
+
border: 1px solid rgba(255,255,255,0.5);
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
/* Success Icon Animation */
|
|
52
|
+
.icon-wrapper {
|
|
53
|
+
width: 80px;
|
|
54
|
+
height: 80px;
|
|
55
|
+
background: #d1fae5;
|
|
56
|
+
border-radius: 50%;
|
|
57
|
+
display: flex;
|
|
58
|
+
align-items: center;
|
|
59
|
+
justify-content: center;
|
|
60
|
+
margin: 0 auto 1.5rem auto;
|
|
61
|
+
color: var(--primary-color);
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
.icon-wrapper svg {
|
|
65
|
+
width: 40px;
|
|
66
|
+
height: 40px;
|
|
67
|
+
stroke-width: 3;
|
|
68
|
+
stroke: currentColor;
|
|
69
|
+
fill: none;
|
|
70
|
+
stroke-linecap: round;
|
|
71
|
+
stroke-linejoin: round;
|
|
72
|
+
animation: drawCheck 0.8s ease-out forwards;
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
h1 {
|
|
76
|
+
color: var(--text-dark);
|
|
77
|
+
font-size: 1.5rem;
|
|
78
|
+
font-weight: 700;
|
|
79
|
+
margin: 0 0 0.5rem 0;
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
p {
|
|
83
|
+
color: var(--text-light);
|
|
84
|
+
font-size: 1rem;
|
|
85
|
+
line-height: 1.6;
|
|
86
|
+
margin: 0 0 1.5rem 0;
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
.sub-text {
|
|
90
|
+
font-size: 0.875rem;
|
|
91
|
+
opacity: 0.8;
|
|
92
|
+
margin-top: -1rem;
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
/* Language Toggle Button */
|
|
96
|
+
.lang-btn {
|
|
97
|
+
position: absolute;
|
|
98
|
+
top: 20px;
|
|
99
|
+
right: 20px;
|
|
100
|
+
background: white;
|
|
101
|
+
border: 1px solid #e5e7eb;
|
|
102
|
+
padding: 8px 16px;
|
|
103
|
+
border-radius: 20px;
|
|
104
|
+
font-family: inherit;
|
|
105
|
+
font-size: 0.875rem;
|
|
106
|
+
font-weight: 600;
|
|
107
|
+
color: var(--text-dark);
|
|
108
|
+
cursor: pointer;
|
|
109
|
+
box-shadow: 0 2px 5px rgba(0,0,0,0.05);
|
|
110
|
+
transition: all 0.2s ease;
|
|
111
|
+
display: flex;
|
|
112
|
+
align-items: center;
|
|
113
|
+
gap: 6px;
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
html[dir="ltr"] .lang-btn {
|
|
117
|
+
right: auto;
|
|
118
|
+
left: 20px;
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
.lang-btn:hover {
|
|
122
|
+
background: #f9fafb;
|
|
123
|
+
transform: translateY(-1px);
|
|
124
|
+
box-shadow: 0 4px 12px rgba(0,0,0,0.08);
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
/* Animations */
|
|
128
|
+
@keyframes fadeUp {
|
|
129
|
+
from { opacity: 0; transform: translateY(20px); }
|
|
130
|
+
to { opacity: 1; transform: translateY(0); }
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
@keyframes drawCheck {
|
|
134
|
+
0% { stroke-dasharray: 100; stroke-dashoffset: 100; opacity: 0; transform: scale(0.8); }
|
|
135
|
+
100% { stroke-dasharray: 100; stroke-dashoffset: 0; opacity: 1; transform: scale(1); }
|
|
136
|
+
}
|
|
137
|
+
</style>
|
|
138
|
+
</head>
|
|
139
|
+
<body>
|
|
140
|
+
<button class="lang-btn" onclick="toggleLanguage()" id="langBtn">
|
|
141
|
+
<span>🌐</span> <span id="btnText">English</span>
|
|
142
|
+
</button>
|
|
143
|
+
<div class="container">
|
|
144
|
+
<div class="icon-wrapper">
|
|
145
|
+
<!-- Simple SVG Checkmark -->
|
|
146
|
+
<svg viewBox="0 0 24 24">
|
|
147
|
+
<path d="M20 6L9 17l-5-5"></path>
|
|
148
|
+
</svg>
|
|
149
|
+
</div>
|
|
150
|
+
<h1 id="title">تمت المُصادقة بنجاح!</h1>
|
|
151
|
+
<p id="msg1">لقد قمت بتسجيل الدخول بنجاح إلى تحويل.</p>
|
|
152
|
+
<p id="msg2" class="sub-text">يمكنك إغلاق هذه النافذة والعودة إلى البرنامج.</p>
|
|
153
|
+
</div>
|
|
154
|
+
<script>
|
|
155
|
+
// Content Dictionary
|
|
156
|
+
const content = {
|
|
157
|
+
ar: {
|
|
158
|
+
btn: "English",
|
|
159
|
+
title: "تمت المُصادقة بنجاح!",
|
|
160
|
+
msg1: "لقد قمت بتسجيل الدخول بنجاح إلى تحويل.",
|
|
161
|
+
msg2: "يمكنك إغلاق هذه النافذة والعودة إلى البرنامج."
|
|
162
|
+
},
|
|
163
|
+
en: {
|
|
164
|
+
btn: "العربية",
|
|
165
|
+
title: "Authorization Successful!",
|
|
166
|
+
msg1: "You have successfully logged in to Tahweel.",
|
|
167
|
+
msg2: "You can close this window and return to your terminal."
|
|
168
|
+
}
|
|
169
|
+
};
|
|
170
|
+
|
|
171
|
+
let currentLang = 'ar';
|
|
172
|
+
|
|
173
|
+
// Flips the page between Arabic (RTL) and English (LTR): updates the <html>
// lang/dir attributes, the body font class, and the four translated text nodes.
function toggleLanguage() {
  const switchingToEnglish = currentLang === 'ar';
  currentLang = switchingToEnglish ? 'en' : 'ar';

  // Direction and language attribute live on the root element.
  const root = document.documentElement;
  root.lang = currentLang;
  root.dir = switchingToEnglish ? 'ltr' : 'rtl';

  // The English typeface is applied through a body class.
  document.body.classList.toggle('font-en', switchingToEnglish);

  // Element id -> translated string, applied in the same order as before.
  const strings = content[currentLang];
  const textById = {
    btnText: strings.btn,
    title: strings.title,
    msg1: strings.msg1,
    msg2: strings.msg2
  };
  for (const [id, text] of Object.entries(textById)) {
    document.getElementById(id).textContent = text;
  }
}
|
|
190
|
+
</script>
|
|
191
|
+
</body>
|
|
192
|
+
</html>
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "writers/txt"
|
|
4
|
+
require_relative "writers/docx"
|
|
5
|
+
require_relative "writers/json"
|
|
6
|
+
|
|
7
|
+
module Tahweel
|
|
8
|
+
# Factory class for writing extracted text to different formats.
|
|
9
|
+
class Writer
  AVAILABLE_FORMATS = %i[txt docx json].freeze

  # Writes the texts once per requested format.
  #
  # @param texts [Array<String>] The extracted texts.
  # @param base_path [String] The base output path (without extension).
  # @param formats [Array<Symbol>] The output formats (default: [:txt]).
  # @param options [Hash] Options forwarded to every writer.
  # @return [void]
  def self.write(texts, base_path, formats: [:txt], **options)
    formats.each do |format|
      new(format:).write(texts, base_path, **options)
    end
  end

  # Picks the writer strategy for the given format.
  #
  # @param format [Symbol] The output format (:txt, :docx or :json).
  # @raise [ArgumentError] If the format is unknown.
  def initialize(format: :txt)
    @writer = build_strategy(format)
  end

  # Writes the texts through the selected strategy to "<base_path>.<extension>".
  #
  # @param texts [Array<String>] The extracted texts.
  # @param base_path [String] The base output file path.
  # @param options [Hash] Options passed to the writer as a plain hash.
  def write(texts, base_path, **options)
    destination = "#{base_path}.#{extension}"
    @writer.write(texts, destination, options)
  end

  # The file extension reported by the selected strategy.
  #
  # @return [String] The file extension.
  def extension
    @writer.extension
  end

  private

  # Instantiates the writer class that corresponds to +format+.
  #
  # @param format [Symbol] The output format.
  # @return [Object] A writer responding to #write and #extension.
  # @raise [ArgumentError] If the format is unknown.
  def build_strategy(format)
    case format
    in :txt then Writers::Txt.new
    in :docx then Writers::Docx.new
    in :json then Writers::Json.new
    else raise ArgumentError, "Unknown format: #{format}"
    end
  end
end
|
|
49
|
+
end
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "caracal"
|
|
4
|
+
|
|
5
|
+
module Tahweel
|
|
6
|
+
module Writers
|
|
7
|
+
# Writer class for outputting text to a .docx file.
|
|
8
|
+
class Docx
  # @return [String] The file extension produced by this writer.
  def extension
    "docx"
  end

  # Writes one paragraph per page text into a .docx file, separated by page breaks.
  #
  # Each text is prepared as follows before being written:
  # 1. Runs of CRLF become a single `\n`.
  # 2. Runs of the same whitespace character collapse to one occurrence.
  # 3. While the page is estimated at more than 40 lines, the shortest pair of
  #    adjacent lines is merged.
  # 4. Alignment is chosen from the share of Arabic characters.
  #
  # @param texts [Array<String>] The extracted texts (one per page).
  # @param destination [String] The output file path.
  # @param options [Hash] Options for writing (unused for now).
  # @return [void]
  def write(texts, destination, options = {}) # rubocop:disable Lint/UnusedMethodArgument
    final_index = texts.size - 1

    Caracal::Document.save(destination) do |docx|
      texts.each_with_index do |raw, index|
        page = raw.gsub(/(\r\n)+/, "\n").gsub(/(\s)\1+/, '\1').strip
        page = compact_shortest_lines(page) while expected_lines_in_page(page) > 40

        docx.p page, size: 20, align: alignment_for(page)

        # No trailing page break after the last page.
        docx.page if index < final_index
      end
    end
  end

  private

  # Chooses the alignment by comparing Arabic characters with all remaining
  # characters that are not punctuation, digits or whitespace.
  #
  # @param text [String] The text to analyze.
  # @return [Symbol] :right when Arabic characters are at least as many, :left otherwise.
  def alignment_for(text)
    arabic_total = text.scan(/\p{Arabic}/).size
    other_total = text.scan(/[^\p{Arabic}\p{P}\d\s]/).size
    return :left if arabic_total < other_total

    :right
  end

  # Estimates how many lines the text occupies on a page: one per physical line,
  # plus one extra for every line longer than 80 characters (assumed to wrap once).
  #
  # @param text [String] The text to analyze.
  # @return [Integer] The estimated line count.
  def expected_lines_in_page(text)
    physical_lines = text.count("\n") + 1
    wrapping_lines = text.split("\n").count { |line| line.length > 80 }

    physical_lines + wrapping_lines
  end

  # Merges the adjacent pair of lines with the smallest combined length into one
  # line joined by a single space. Texts with fewer than two lines are returned as is.
  #
  # @param text [String] The text to compact.
  # @return [String] The compacted text.
  def compact_shortest_lines(text)
    lines = text.split("\n")
    return text if lines.size < 2

    at = find_merge_index(lines)
    lines[at, 2] = lines[at, 2].join(" ")

    lines.join("\n")
  end

  # Locates the adjacent pair with the minimum combined length; on ties the
  # earliest pair wins.
  #
  # @param lines [Array<String>] The lines to analyze.
  # @return [Integer, nil] Index of the first line of that pair, or nil when there is no pair.
  def find_merge_index(lines)
    best = lines.each_cons(2).with_index.min_by { |(upper, lower), _| upper.length + lower.length }
    best&.last
  end
end
|
|
81
|
+
end
|
|
82
|
+
end
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
|
|
5
|
+
module Tahweel
|
|
6
|
+
module Writers
|
|
7
|
+
# Writer class for outputting text to a .json file.
|
|
8
|
+
class Json
  # @return [String] The file extension produced by this writer.
  def extension
    "json"
  end

  # Serializes the pages as a pretty-printed JSON array of
  # `{ "page": <1-based number>, "content": <stripped text> }` objects.
  #
  # @param texts [Array<String>] The extracted texts (one per page).
  # @param destination [String] The output file path.
  # @param options [Hash] Options for writing (unused for now).
  # @return [void]
  def write(texts, destination, options = {}) # rubocop:disable Lint/UnusedMethodArgument
    pages = texts.each_with_index.map do |text, index|
      { page: index.succ, content: text.strip }
    end

    File.write(destination, JSON.pretty_generate(pages))
  end
end
|
|
31
|
+
end
|
|
32
|
+
end
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Tahweel
|
|
4
|
+
module Writers
|
|
5
|
+
# Writer class for outputting text to a .txt file.
|
|
6
|
+
class Txt
  PAGE_SEPARATOR = "\n\nPAGE_SEPARATOR\n\n"

  # @return [String] The file extension produced by this writer.
  def extension
    "txt"
  end

  # Writes all pages into one text file, each page stripped of surrounding
  # whitespace and joined by the page separator.
  #
  # @param texts [Array<String>] The extracted texts (one per page).
  # @param destination [String] The output file path.
  # @param options [Hash] Options for writing.
  # @option options [String] :page_separator (PAGE_SEPARATOR) Separator between pages.
  # @return [void]
  def write(texts, destination, options = {})
    divider = options[:page_separator] || PAGE_SEPARATOR
    trimmed_pages = texts.map { |page| page.strip }

    File.write(destination, trimmed_pages.join(divider))
  end
end
|
|
26
|
+
end
|
|
27
|
+
end
|
data/lib/tahweel.rb
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "tahweel/version"
|
|
4
|
+
require_relative "tahweel/authorizer"
|
|
5
|
+
require_relative "tahweel/pdf_splitter"
|
|
6
|
+
require_relative "tahweel/ocr"
|
|
7
|
+
require_relative "tahweel/converter"
|
|
8
|
+
require_relative "tahweel/writer"
|
|
9
|
+
require_relative "tahweel/cli/file_processor"
|
|
10
|
+
require_relative "tahweel/cli/file_collector"
|
|
11
|
+
require_relative "tahweel/cli/options"
|
|
12
|
+
|
|
13
|
+
module Tahweel # rubocop:disable Style/Documentation
  class Error < StandardError; end

  class << self
    # Converts a PDF file to text by delegating to {Converter.convert}.
    #
    # @param pdf_path [String] Path to the PDF file.
    # @param dpi [Integer] DPI for PDF to image conversion (default: PdfSplitter::DEFAULT_DPI).
    # @param processor [Symbol] OCR processor to use (default: :google_drive).
    # @param concurrency [Integer] Max concurrent OCR operations (default: Converter::DEFAULT_CONCURRENCY).
    # @yield Forwarded unchanged to {Converter.convert}.
    # @return [Array<String>] An array containing the text of each page.
    def convert(
      pdf_path,
      dpi: PdfSplitter::DEFAULT_DPI,
      processor: :google_drive,
      concurrency: Converter::DEFAULT_CONCURRENCY,
      &
    )
      Converter.convert(pdf_path, dpi:, processor:, concurrency:, &)
    end

    # Extracts text from an image file by delegating to {Ocr.extract}.
    #
    # @param image_path [String] Path to the image file.
    # @param processor [Symbol] OCR processor to use (default: :google_drive).
    # @return [String] The extracted text.
    def extract(image_path, processor: :google_drive)
      Ocr.extract(image_path, processor:)
    end
  end
end
|
data/mise.toml
ADDED