tahweel 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,14 +1,15 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "etc"
3
4
  require "fileutils"
4
- require "rbconfig"
5
5
  require "securerandom"
6
6
  require "tmpdir"
7
- require "vips"
7
+
8
+ require_relative "poppler_installer"
8
9
 
9
10
  module Tahweel
10
11
  # Handles the logic for splitting a PDF file into individual image pages.
11
- # Uses the libvips library for high-performance image processing.
12
+ # Uses Poppler utils (pdftoppm, pdfinfo) for high-performance image processing.
12
13
  class PdfSplitter
13
14
  # Default DPI used when converting PDF pages to images.
14
15
  # 150 DPI is a good balance between quality and file size for general documents.
@@ -25,7 +26,7 @@ module Tahweel
25
26
  # percentage: Float,
26
27
  # remaining_pages: Integer
27
28
  # }
28
- # @return [Hash] A hash containing the :folder_path (String) and :image_paths (Array<String>).
29
+ # @return [Hash] A hash containing the :folder_path (String) and :images_paths (Array<String>).
29
30
  def self.split(pdf_path, dpi: DEFAULT_DPI, &) = new(pdf_path, dpi:).split(&)
30
31
 
31
32
  # Initializes a new PdfSplitter instance.
@@ -35,13 +36,12 @@ module Tahweel
35
36
  def initialize(pdf_path, dpi: DEFAULT_DPI)
36
37
  @pdf_path = pdf_path
37
38
  @dpi = dpi
38
- @image_paths = []
39
39
  end
40
40
 
41
41
  # Executes the PDF splitting process.
42
42
  #
43
43
  # This method performs the following steps:
44
- # 1. Checks if libvips is installed (skips on Windows).
44
+ # 1. Checks if Poppler utils are available (installs if missing on Windows).
45
45
  # 2. Validates the existence of the source PDF file.
46
46
  # 3. Creates a unique temporary directory for output.
47
47
  # 4. Iterates through each page of the PDF and converts it to a PNG image.
@@ -55,12 +55,11 @@ module Tahweel
55
55
  # }
56
56
  # @return [Hash] Result hash with keys:
57
57
  # - :folder_path [String] The absolute path to the temporary directory containing the images.
58
- # - :image_paths [Array<String>] List of absolute paths for each generated image file.
59
- # @raise [RuntimeError] If the PDF file is not found or libvips is missing.
60
- # @raise [Vips::Error] If the underlying VIPS library encounters an error during processing.
58
+ # - :images_paths [Array<String>] List of absolute paths for each generated image file.
59
+ # @raise [RuntimeError] If the PDF file is not found or its page count cannot be determined.
61
60
  def split(&)
62
- check_libvips_installed!
63
61
  validate_file_exists!
62
+ PopplerInstaller.ensure_installed!
64
63
  setup_output_directory
65
64
  process_pages(&)
66
65
  result
@@ -68,20 +67,7 @@ module Tahweel
68
67
 
69
68
  private
70
69
 
71
- attr_reader :pdf_path, :dpi, :image_paths, :output_dir
72
-
73
- # Checks if the `vips` CLI tool is available in the system PATH.
74
- # Skips this check on Windows systems, assuming the environment is managed differently.
75
- # Aborts execution with an error message if vips is missing.
76
- def check_libvips_installed!
77
- return if /mswin|mingw|cygwin/.match?(RbConfig::CONFIG["host_os"])
78
- return if system("vips --version", out: File::NULL, err: File::NULL)
79
-
80
- abort "Error: libvips is not installed. Please install it before using Tahweel.\n" \
81
- "MacOS: `brew install vips`\n" \
82
- "Ubuntu: `sudo apt install libvips42`\n" \
83
- "Windows: Already installed with the Tahweel gem"
84
- end
70
+ attr_reader :pdf_path, :dpi, :output_dir
85
71
 
86
72
  # Ensures the source PDF file actually exists.
87
73
  # @raise [RuntimeError] if the file is missing.
@@ -106,33 +92,101 @@ module Tahweel
106
92
  # }
107
93
  # @return [void]
108
94
  def process_pages(&)
109
- total_pages.times do |i|
110
- extract_page(i)
95
+ mutex = Mutex.new
96
+ processed_count = 0
97
+
98
+ run_workers(build_queue, mutex) do
99
+ processed_count += 1
100
+ report_progress(processed_count, &)
101
+ end
102
+ end
103
+
104
+ # Builds a queue containing all page indices to be processed.
105
+ # @return [Queue] The queue populated with page numbers.
106
+ def build_queue
107
+ queue = Queue.new
108
+ total_pages.times { queue << _1 }
109
+ queue
110
+ end
111
111
 
112
- next unless block_given?
112
+ # Spawns and manages worker threads to process the queue.
113
+ #
114
+ # @param queue [Queue] The queue of pages to process.
115
+ # @param mutex [Mutex] Synchronization primitive for thread safety.
116
+ # @param &block [Proc] Block to execute when a page is processed.
117
+ def run_workers(queue, mutex, &)
118
+ concurrency = (Etc.nprocessors - 2).clamp(2..)
119
+
120
+ Array.new([concurrency, total_pages].min) do
121
+ Thread.new { process_queue_items(queue, mutex, &) }
122
+ end.each(&:join)
123
+ end
113
124
 
114
- yield({
115
- file_path: @pdf_path, stage: :splitting,
116
- current_page: i + 1,
117
- percentage: (((i + 1).to_f / total_pages) * 100).round(2),
118
- remaining_pages: total_pages - (i + 1)
119
- })
125
+ # Processing loop for individual worker threads.
126
+ #
127
+ # @param queue [Queue] The shared queue of pages.
128
+ # @param mutex [Mutex] Synchronization primitive.
129
+ # @param &block [Proc] Block to yield for progress updates.
130
+ def process_queue_items(queue, mutex, &)
131
+ loop do
132
+ begin
133
+ page_num = queue.pop(true)
134
+ rescue ThreadError
135
+ break
136
+ end
137
+
138
+ extract_page(page_num)
139
+
140
+ mutex.synchronize(&)
120
141
  end
121
142
  end
122
143
 
144
+ # Reports progress back to the caller.
145
+ #
146
+ # @param processed [Integer] Number of pages processed so far.
147
+ # @param &block [Proc] The progress callback block.
148
+ def report_progress(processed, &)
149
+ return unless block_given?
150
+
151
+ yield({
152
+ file_path: @pdf_path, stage: :splitting,
153
+ current_page: processed,
154
+ percentage: ((processed.to_f / total_pages) * 100).round(2),
155
+ remaining_pages: total_pages - processed
156
+ })
157
+ end
158
+
123
159
  # Calculates the total number of pages in the PDF by parsing the `Pages:` field of `pdfinfo` output.
124
160
  # @return [Integer] The page count.
125
161
  def total_pages
126
- @total_pages ||= Vips::Image.pdfload(pdf_path, page: 0, dpi: dpi, access: :sequential).get("pdf-n_pages")
162
+ @total_pages ||= begin
163
+ output = `#{PopplerInstaller.pdfinfo_path} "#{pdf_path}"`.encode(
164
+ "UTF-8",
165
+ invalid: :replace, undef: :replace, replace: ""
166
+ )
167
+
168
+ pages = output[/Pages:\s*(\d+)/, 1]
169
+ raise "Failed to get page count from PDF: #{output}" unless pages
170
+
171
+ pages.to_i
172
+ end
127
173
  end
128
174
 
129
175
  # Extracts a specific page from the PDF and saves it as a PNG.
130
176
  #
131
177
  # @param page_num [Integer] The zero-based index of the page to extract.
132
178
  def extract_page(page_num)
133
- output_path = File.join(output_dir, "page_#{page_num + 1}.png")
134
- Vips::Image.pdfload(pdf_path, page: page_num, dpi: dpi, access: :sequential).write_to_file(output_path)
135
- image_paths << output_path
179
+ output_prefix = File.join(output_dir, "page")
180
+
181
+ system(
182
+ PopplerInstaller.pdftoppm_path,
183
+ "-png",
184
+ "-r", dpi.to_s,
185
+ "-f", (page_num + 1).to_s,
186
+ "-l", (page_num + 1).to_s,
187
+ pdf_path,
188
+ output_prefix
189
+ )
136
190
  end
137
191
 
138
192
  # Constructs the final result hash.
@@ -140,7 +194,7 @@ module Tahweel
140
194
  def result
141
195
  {
142
196
  folder_path: output_dir,
143
- image_paths: image_paths
197
+ images_paths: Dir.glob(File.join(output_dir, "page-*.png")).sort!
144
198
  }
145
199
  end
146
200
  end
@@ -0,0 +1,185 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "fileutils"
4
+ require "json"
5
+ require "net/http"
6
+ require "open-uri"
7
+ require "uri"
8
+
9
+ require "xdg"
10
+ require "zip"
11
+
12
+ module Tahweel
13
+ # Handles the installation and path resolution for Poppler utilities.
14
+ #
15
+ # On Windows, this class can automatically download and install the necessary
16
+ # binaries if they are not present. On other platforms, it provides instructions
17
+ # for manual installation.
18
+ class PopplerInstaller
19
+ POPPLER_REPO_API = "https://api.github.com/repos/oschwartz10612/poppler-windows/releases/latest"
20
+
21
+ # Ensures that Poppler utilities are installed.
22
+ #
23
+ # On Windows: Installs Poppler locally if not found.
24
+ # On other platforms: Aborts with an error message if Poppler is missing.
25
+ #
26
+ # @raise [SystemExit] if Poppler is missing on non-Windows platforms.
27
+ def self.ensure_installed! # rubocop:disable Metrics/MethodLength
28
+ installer = new
29
+ return if installer.installed?
30
+
31
+ if Gem.win_platform?
32
+ installer.install
33
+ else
34
+ abort <<~MSG
35
+ Error: Poppler utilities are not installed. Please install them:
36
+ MacOS: `brew install poppler`
37
+ Ubuntu: `sudo apt install poppler-utils`
38
+ MSG
39
+ end
40
+ end
41
+
42
+ # Returns the path to the `pdftoppm` executable.
43
+ # @return [String] path to the executable.
44
+ def self.pdftoppm_path = new.pdftoppm_path
45
+
46
+ # Returns the path to the `pdfinfo` executable.
47
+ # @return [String] path to the executable.
48
+ def self.pdfinfo_path = new.pdfinfo_path
49
+
50
+ # Installs Poppler binaries on Windows.
51
+ #
52
+ # Downloads the latest release from GitHub and extracts it to the cache directory.
53
+ # Does nothing if already installed.
54
+ def install
55
+ zip_path = nil
56
+ return if installed?
57
+
58
+ zip_path = download_release_file
59
+ extract_zip_file(zip_path)
60
+ ensure
61
+ FileUtils.rm_f(zip_path) if zip_path
62
+ end
63
+
64
+ # Checks if Poppler utilities are available.
65
+ #
66
+ # @return [Boolean] true if `pdftoppm` and `pdfinfo` are in the PATH or cached.
67
+ def installed? = (command_exists?("pdftoppm") && command_exists?("pdfinfo")) || cached?
68
+
69
+ # Checks if Poppler binaries are present in the local cache (Windows only).
70
+ #
71
+ # @return [Boolean] true if cached binaries exist.
72
+ def cached?
73
+ return false unless Gem.win_platform?
74
+
75
+ File.exist?(File.join(cached_bin_path, "pdftoppm.exe"))
76
+ end
77
+
78
+ # Resolves the path to the `pdftoppm` executable.
79
+ #
80
+ # Prioritizes the system PATH, falling back to the cached version on Windows.
81
+ #
82
+ # @return [String, nil] path to `pdftoppm`, or nil if it is not in the PATH on a non-Windows platform.
83
+ def pdftoppm_path
84
+ return "pdftoppm" if command_exists?("pdftoppm")
85
+
86
+ Gem.win_platform? ? File.join(cached_bin_path, "pdftoppm.exe") : nil
87
+ end
88
+
89
+ # Resolves the path to the `pdfinfo` executable.
90
+ #
91
+ # Prioritizes the system PATH, falling back to the cached version on Windows.
92
+ #
93
+ # @return [String, nil] path to `pdfinfo`, or nil if it is not in the PATH on a non-Windows platform.
94
+ def pdfinfo_path
95
+ return "pdfinfo" if command_exists?("pdfinfo")
96
+
97
+ Gem.win_platform? ? File.join(cached_bin_path, "pdfinfo.exe") : nil
98
+ end
99
+
100
+ private
101
+
102
+ # Locates the `bin` directory within the cached Poppler installation.
103
+ #
104
+ # Searches for a directory matching "poppler-*" in the cache directory and returns
105
+ # the path to its `Library/bin` subdirectory.
106
+ #
107
+ # @return [String] Path to the `bin` directory, or an empty string if not found.
108
+ def cached_bin_path
109
+ poppler_root = Dir.glob(File.join(cache_dir, "poppler-*")).first
110
+ return "" unless poppler_root
111
+
112
+ File.join(poppler_root, "Library", "bin")
113
+ end
114
+
115
+ # Checks if a command is available in the system path.
116
+ #
117
+ # @param cmd [String] The command to check for.
118
+ # @return [Boolean] true if the command exists in the PATH.
119
+ def command_exists?(cmd)
120
+ tool = Gem.win_platform? ? "where" : "which"
121
+ system("#{tool} #{cmd} > #{File::NULL} 2>&1")
122
+ end
123
+
124
+ # Downloads the latest Poppler release zip file.
125
+ #
126
+ # Fetches the download URL from the GitHub API and saves the file to the cache directory.
127
+ #
128
+ # @return [String] The local path to the downloaded zip file.
129
+ def download_release_file
130
+ release_url = latest_release_url
131
+ zip_path = File.join(cache_dir, File.basename(release_url))
132
+ URI.parse(release_url).open { File.binwrite(zip_path, _1.read) }
133
+ zip_path
134
+ end
135
+
136
+ # Retrieves the download URL for the latest Windows release of Poppler.
137
+ #
138
+ # Queries the GitHub API for the latest release and finds the asset matching "Release*.zip".
139
+ #
140
+ # @return [String] The download URL of the asset.
141
+ # @raise [SystemExit] if the API request fails or no valid asset is found.
142
+ def latest_release_url # rubocop:disable Metrics/AbcSize
143
+ uri = URI(POPPLER_REPO_API)
144
+ request = Net::HTTP::Get.new(uri)
145
+ request["User-Agent"] = "Tahweel-Gem"
146
+
147
+ response = Net::HTTP.start(uri.hostname, uri.port, use_ssl: true) { _1.request(request) }
148
+
149
+ unless response.is_a?(Net::HTTPSuccess)
150
+ abort "Failed to fetch Poppler release info: #{response.code} #{response.message}"
151
+ end
152
+
153
+ asset = JSON.parse(response.body)["assets"].find { _1["name"].match?(/^Release.*\.zip$/) }
154
+
155
+ asset ? asset["browser_download_url"] : abort("No valid Windows release found for Poppler.")
156
+ end
157
+
158
+ # Extracts the downloaded zip file to the cache directory.
159
+ #
160
+ # @param zip_path [String] Path to the zip file to extract.
161
+ def extract_zip_file(zip_path)
162
+ Zip::File.open(zip_path) do |zip_file|
163
+ zip_file.each do |entry|
164
+ entry_dest = File.join(cache_dir, entry.name)
165
+ FileUtils.mkdir_p(File.dirname(entry_dest))
166
+ zip_file.extract(entry, entry_dest) { true }
167
+ end
168
+ end
169
+ end
170
+
171
+ # Resolves the directory used for caching downloaded binaries.
172
+ #
173
+ # Uses `tahweel/poppler` under the XDG cache home directory if available, otherwise under `~/.cache`; creates the directory if needed.
174
+ #
175
+ # @return [String] Path to the cache directory.
176
+ def cache_dir
177
+ base = XDG.new.cache_home.to_s
178
+ base = File.join(Dir.home, ".cache") if base.empty?
179
+
180
+ dir = File.join(base, "tahweel", "poppler")
181
+ FileUtils.mkdir_p(dir)
182
+ dir
183
+ end
184
+ end
185
+ end
@@ -43,7 +43,7 @@ module Tahweel
43
43
 
44
44
  begin
45
45
  file_id = upload_file(file_path)
46
- download_text(file_id).gsub("\r\n", "\n").gsub("________________", "").strip
46
+ download_text(file_id).gsub("________________", "").strip
47
47
  ensure
48
48
  delete_file(file_id)
49
49
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Tahweel
4
- VERSION = "0.1.1"
4
+ VERSION = "0.1.3"
5
5
  end
@@ -1,8 +1,8 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require_relative "writers/txt"
4
3
  require_relative "writers/docx"
5
4
  require_relative "writers/json"
5
+ require_relative "writers/txt"
6
6
 
7
7
  module Tahweel
8
8
  # Factory class for writing extracted text to different formats.
@@ -14,10 +14,11 @@ module Tahweel
14
14
  # Writes the extracted texts to a file.
15
15
  #
16
16
  # It applies several transformations to the text before writing:
17
- # 1. Normalizes line endings to `\n`.
17
+ # 1. Normalizes all line endings (`\r\n`, `\r`) to `\n`.
18
18
  # 2. Collapses consecutive identical whitespace characters.
19
19
  # 3. Compacts the text by merging short lines if the page is too long (> 40 lines).
20
20
  # 4. Determines text alignment (RTL/LTR) based on content.
21
+ # 5. Converts `\n` to proper OOXML line breaks for cross-platform compatibility.
21
22
  #
22
23
  # @param texts [Array<String>] The extracted texts (one per page).
23
24
  # @param destination [String] The output file path.
@@ -26,10 +27,10 @@ module Tahweel
26
27
  def write(texts, destination, options = {}) # rubocop:disable Lint/UnusedMethodArgument
27
28
  Caracal::Document.save(destination) do |docx|
28
29
  texts.each_with_index do |text, index|
29
- text = text.gsub(/(\r\n)+/, "\n").gsub(/(\s)\1+/, '\1').strip
30
+ text = text.gsub(/\r\n?/, "\n").gsub(/(\s)\1+/, '\1').strip
30
31
  text = compact_shortest_lines(text) while expected_lines_in_page(text) > 40
31
32
 
32
- docx.p text, size: 20, align: alignment_for(text)
33
+ write_paragraph(docx, text)
33
34
 
34
35
  docx.page if index < texts.size - 1
35
36
  end
@@ -38,6 +39,28 @@ module Tahweel
38
39
 
39
40
  private
40
41
 
42
+ # Writes a paragraph with proper OOXML line breaks.
43
+ #
44
+ # Raw newline characters (\n, \r\n) are not valid line breaks in DOCX format.
45
+ # Microsoft Word on Windows requires proper <w:br/> elements for line breaks,
46
+ # while macOS Pages is more lenient. This method uses Caracal's `br` method
47
+ # to insert cross-platform compatible line breaks.
48
+ #
49
+ # @param docx [Caracal::Document] The document to write to.
50
+ # @param text [String] The text content with newlines.
51
+ # @return [void]
52
+ def write_paragraph(docx, text)
53
+ lines = text.split("\n")
54
+ alignment = alignment_for(text)
55
+
56
+ docx.p align: alignment do
57
+ lines.each_with_index do |line, line_index|
58
+ text line, size: 20
59
+ br if line_index < lines.size - 1
60
+ end
61
+ end
62
+ end
63
+
41
64
  # Determines the text alignment based on the ratio of Arabic to non-Arabic characters.
42
65
  #
43
66
  # @param text [String] The text to analyze.
data/lib/tahweel.rb CHANGED
@@ -1,14 +1,19 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative "tahweel/version"
4
+ require_relative "tahweel/cli/options"
5
+ require_relative "tahweel/cli/file_processor"
6
+ require_relative "tahweel/cli/file_collector"
4
7
  require_relative "tahweel/authorizer"
8
+ require_relative "tahweel/poppler_installer"
5
9
  require_relative "tahweel/pdf_splitter"
6
10
  require_relative "tahweel/ocr"
11
+ require_relative "tahweel/processors/google_drive"
7
12
  require_relative "tahweel/converter"
8
13
  require_relative "tahweel/writer"
9
- require_relative "tahweel/cli/file_processor"
10
- require_relative "tahweel/cli/file_collector"
11
- require_relative "tahweel/cli/options"
14
+ require_relative "tahweel/writers/txt"
15
+ require_relative "tahweel/writers/docx"
16
+ require_relative "tahweel/writers/json"
12
17
 
13
18
  module Tahweel # rubocop:disable Style/Documentation
14
19
  class Error < StandardError; end
Binary file