RubyGems - pdf2markdownOCR - Versions diffs - 0.0.1 → 0.0.3 - Mend

pdf2markdownOCR 0.0.1 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

checksums.yaml +4 -4
data/README.md +59 -2
data/lib/pdf2markdownOCR/cli.rb +9 -1
data/lib/pdf2markdownOCR/llm_api.rb +5 -7
data/lib/pdf2markdownOCR/pdf2image.rb +51 -14
data/lib/pdf2markdownOCR/version.rb +1 -1
data/lib/pdf2markdownOCR.rb +4 -5
metadata +61 -4

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: f125dfcb354343c2aa778d36b422f338db5bf24734e48acbfe53bd57ad343569
-  data.tar.gz: 620f9cc746b7da2a38b8b75fa8b9ca87ff44c8d13215aa1cea54e6d0c14aa36d
+  metadata.gz: 1391a8f43986ec5bfcc1b29bc412c71a44a0843fd4bb4364eb34cd5b6c69275b
+  data.tar.gz: f780980fa2a82a1de804e65d162b69c5a067bae4680d9848d99bc6d36c7b3367
 SHA512:
-  metadata.gz: 4dd91a17be2519f2e6f8fb41455f7d27ec7a123f36d55fff0fa891a2cadfcbee36d7bc39d9848b32262abecf4055f2fe21631b69f9e5204a32b60fc695b48198
-  data.tar.gz: bd2aad9b2417f69c9d1240ca8a06e20bfaf2b4d2ed5f7a47609b84acab1d73dd67983a18446467628d8279cac31f046fca013a69187f30cab3c726236a36a90c
+  metadata.gz: 9378b9851b4487119ad3291c27fc4bb5914087713e1eb5a7838d2de5a094b76c981235fdcae98ea829daab763852fc283e34953ec9d25affbc7bfd46a20c9f6e
+  data.tar.gz: c40a6d78f42f6e55872242ebfa1b3bb651e81a8963aac3b5bfd268186d868d571a7ba739fe973da398fbba27bf77a88b8a894615dcc79843b93d596e1996d5e0

data/README.md CHANGED Viewed

@@ -77,7 +77,7 @@ end
 ```ruby
 require 'pdf2markdownOCR'
-markdown = Pdf2MarkdownOCR.convert_pdf("document.pdf")
+markdown = Pdf2MarkdownOCR.convert_pdf(pdf_path: "document.pdf")
 puts markdown
 ```
@@ -85,10 +85,17 @@ puts markdown
 ```ruby
-Pdf2MarkdownOCR.convert_pdf("document.pdf", "output.md")
+Pdf2MarkdownOCR.convert_pdf(pdf_path: "document.pdf", output_file: "output.md")
 # => nil  (content written to output.md)
 ```
+### Convert specific page range
+```ruby
+Pdf2MarkdownOCR.convert_pdf(pdf_path: "document.pdf", output_file: "output.md", pages: "1,2,5-7") #Will convert pages 1,2,5,6,7
+```
 ## Usage as a CLI
 After installation the `pdf2markdownocr` executable is available on your `PATH`. Options are the same as in the configuration block
@@ -104,6 +111,7 @@ Options:
   --llm-model MODEL
   --mode Processing mode: single_thread or multi_thread
   --png-dpi DPI resolution for PNG conversion
+  --pages Page range
   -h, --help Show help message
 ```
@@ -125,6 +133,55 @@ pdf2markdownocr document.pdf -o result.md --llm-api-url http://localhost:9800 --
 pdf2markdownocr --version
 ```
+## Running the models
+### Ollama setup
+Easy to try, but not recommended: performance isn't great because it doesn't process requests in parallel.
+Pull the model
+```bash
+ollama pull deepseek-ocr:latest
+ollama run deepseek-ocr:latest
+```
+Then call the tool with the correct port and model
+```bash
+pdf2markdownocr document.pdf -o result.md --llm-api-url http://localhost:11434 --llm-model deepseek-ocr:latest
+```
+### vLLM
+[Official vLLM Guide](https://docs.vllm.ai/projects/recipes/en/latest/DeepSeek/DeepSeek-OCR-2.html)
+1. Install uv, torch, and vllm
+```bash
+uv venv
+source .venv/bin/activate
+```
+I've had problems with my GPU when using the default vllm install, and I find that installing torch and torchvision separately helps. [Install pytorch](https://pytorch.org/get-started/locally/)
+```bash
+uv run pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cu132 #This will depend on the cuda version installed in your system
+```
+Install vllm
+```bash
+uv pip install -U vllm --torch-backend auto
+```
+Then run the model
+```bash
+uv run vllm serve deepseek-ai/DeepSeek-OCR-2 --logits_processors vllm.model_executor.models.deepseek_ocr:NGramPerReqLogitsProcessor --no-enable-prefix-caching --mm-processor-cache-gb 0
+```
 ## License
 MIT

data/lib/pdf2markdownOCR/cli.rb CHANGED Viewed

@@ -37,6 +37,11 @@ module Pdf2MarkdownOCR
           options[:png_dpi] = dpi
         end
+        opts.on("--pages PAGES", "Pages to process, separated by commas. Ranges can be provided with a dash (e.g., 1,3-5)") do |pages|
+          options[:pages] = pages
+        end
         opts.on("-v", "--version", "Print version") do
           puts Pdf2MarkdownOCR.gem_version
           exit
@@ -67,7 +72,10 @@ module Pdf2MarkdownOCR
         config.png_dpi_resolution = options[:png_dpi] if options[:png_dpi]
       end
-      markdown_content = Pdf2MarkdownOCR.convert_pdf(pdf_path, options[:output])
+      markdown_content = Pdf2MarkdownOCR.convert_pdf(pdf_path: pdf_path,
+        output_file: options[:output],
+        pages: options[:pages])
       if markdown_content && !markdown_content.empty? && !options[:output]
         puts markdown_content
       end

data/lib/pdf2markdownOCR/llm_api.rb CHANGED Viewed

@@ -11,16 +11,16 @@ module Pdf2MarkdownOCR
           {
             role: "user",
             content: [
+              {
+                type: "text",
+                text: "<image>\nFree OCR."
+              },
               {
                 type: "image_url",
                 image_url: {
                   url: "data:image/png;base64,#{image_url}"
                 }
               },
-              {
-                type: "text",
-                text: "<image_url>\n Free OCR."
-              }
             ]
           }
         ],
@@ -57,12 +57,10 @@ module Pdf2MarkdownOCR
               Pdf2MarkdownOCR.configuration.logger.warn "Warning: No Markdown content generated for #{image_path}"
             end
           else
-            Pdf2MarkdownOCR.configuration.logger.error "Error processing #{image_path}: #{response.return_message} (#{response.code})"
+            Pdf2MarkdownOCR.configuration.logger.error "Error processing #{image_path}: #{response.body} (#{response.code})"
           end
         end
         hydra.queue(request)
       end
       hydra.run

data/lib/pdf2markdownOCR/pdf2image.rb CHANGED Viewed

@@ -3,39 +3,76 @@ require 'terrapin'
 module Pdf2MarkdownOCR
   module Pdf2Image
-    def self.single_thread_conversion(pdf_path, output_prefix)
+    def self.single_thread_conversion(pdf_path:, output_prefix:, pages: nil)
       Pdf2MarkdownOCR.configuration.logger.info "Converting #{pdf_path} into images. DPI: #{Pdf2MarkdownOCR.configuration.png_dpi_resolution}"
       t1 = Process.clock_gettime(Process::CLOCK_MONOTONIC)
-      line = Terrapin::CommandLine.new(
-        "pdftoppm",
-        "-png -r #{Pdf2MarkdownOCR.configuration.png_dpi_resolution} #{pdf_path} #{output_prefix}/pdf2ocr"
-      )
-      line.run
+      if pages.nil?
+        line = Terrapin::CommandLine.new(
+          "pdftoppm",
+          "-png -r #{Pdf2MarkdownOCR.configuration.png_dpi_resolution} #{pdf_path} #{output_prefix}/pdf2ocr"
+        )
+        line.run
+      else
+        pages.split(",").each do |page_range|
+          if page_range.include?("-")
+            start_page, end_page = page_range.split("-").map(&:to_i)
+            line = Terrapin::CommandLine.new(
+              "pdftoppm",
+              "-png -r #{Pdf2MarkdownOCR.configuration.png_dpi_resolution}" +
+              " -f #{start_page} -l #{end_page}" +
+              " #{pdf_path} #{output_prefix}/pdf2ocr"
+            )
+            line.run
+          else
+            line = Terrapin::CommandLine.new(
+              "pdftoppm",
+              "-png -r #{Pdf2MarkdownOCR.configuration.png_dpi_resolution}" +
+              " -f #{page_range} -l #{page_range}" +
+              " #{pdf_path} #{output_prefix}/pdf2ocr"
+            )
+            line.run
+          end
+        end
+      end
       t2 = Process.clock_gettime(Process::CLOCK_MONOTONIC)
       Pdf2MarkdownOCR.configuration.logger.info "PDF to image conversion time: #{(t2 - t1).round(2)} seconds"
       Dir.glob("#{output_prefix}/pdf2ocr*.png").sort
     end
-    def self.multi_thread_conversion(pdf_path, output_prefix)
+    def self.multi_thread_conversion(pdf_path:, output_prefix:, pages: nil)
       Pdf2MarkdownOCR.configuration.logger.info "Converting #{pdf_path} into images. DPI: #{Pdf2MarkdownOCR.configuration.png_dpi_resolution}"
-      # Get total page count
-      info = Terrapin::CommandLine.new("pdfinfo", pdf_path).run
-      total_pages = info.match(/^Pages:\s+(\d+)/)[1].to_i
+      pages_to_process = []
+      if pages.nil?
+        info = Terrapin::CommandLine.new("pdfinfo", pdf_path).run
+        total_pages = info.match(/^Pages:\s+(\d+)/)[1].to_i
+        pages_to_process = (1..total_pages).to_a
+      else
+        pages.split(",").each do |page_range|
+          if page_range.include?("-")
+            start_page, end_page = page_range.split("-").map(&:to_i)
+            pages_to_process += (start_page..end_page).to_a
+          else
+            pages_to_process << page_range.to_i
+          end
+        end
+      end
-      Pdf2MarkdownOCR.configuration.logger.info "Total pages: #{total_pages}"
+      Pdf2MarkdownOCR.configuration.logger.info "Total pages to process: #{pages_to_process.size}"
       t1 = Process.clock_gettime(Process::CLOCK_MONOTONIC)
-      threads = (1..total_pages).map do |page|
+      threads = pages_to_process.map do |page|
         Thread.new do
           Terrapin::CommandLine.new(
             "pdftoppm",
             "-png -r #{Pdf2MarkdownOCR.configuration.png_dpi_resolution} -f #{page} -l #{page} #{pdf_path} #{output_prefix}/pdf2ocr"
           ).run
-          Pdf2MarkdownOCR.configuration.logger.info "Converted page #{page}/#{total_pages}"
+          Pdf2MarkdownOCR.configuration.logger.info "Converted page #{page}/#{pages_to_process.size}"
         end
       end

data/lib/pdf2markdownOCR/version.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 # frozen_string_literal: true
 module Pdf2MarkdownOCR
-  VERSION = "0.0.1"
+  VERSION = "0.0.3"
   def self.gem_version
     Gem::Version.new(VERSION)

data/lib/pdf2markdownOCR.rb CHANGED Viewed

@@ -3,7 +3,6 @@
 #
 require "base64"
-require "httparty"
 require 'json'
 require 'typhoeus'
@@ -28,7 +27,7 @@ module Pdf2MarkdownOCR
   end
-  def self.convert_pdf(pdf_path, output_file = nil)
+  def self.convert_pdf(pdf_path:, output_file: nil, pages: nil)
     Pdf2MarkdownOCR.configuration.logger.info "Parsing PDF file: #{pdf_path}"
     unless File.exist?(pdf_path)
@@ -38,13 +37,13 @@ module Pdf2MarkdownOCR
     markdown_content = ""
     begin
-      tempdir = Dir.mktmpdir
+      tempdir = Dir.mktmpdir("pdf2markdownocr")
       images = []
       if Pdf2MarkdownOCR.configuration.mode == :multi_thread
-        images = Pdf2MarkdownOCR::Pdf2Image.multi_thread_conversion(pdf_path, tempdir)
+        images = Pdf2MarkdownOCR::Pdf2Image.multi_thread_conversion(pdf_path: pdf_path, pages: pages, output_prefix: tempdir)
       else
-        images = Pdf2MarkdownOCR::Pdf2Image.single_thread_conversion(pdf_path, tempdir)
+        images = Pdf2MarkdownOCR::Pdf2Image.single_thread_conversion(pdf_path: pdf_path, pages: pages, output_prefix: tempdir)
       end
       markdown_content = Pdf2MarkdownOCR::LlmApi.ocr_images(images)

metadata CHANGED Viewed

@@ -1,16 +1,73 @@
 --- !ruby/object:Gem::Specification
 name: pdf2markdownOCR
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.3
 platform: ruby
 authors:
 - Guillermo Molini
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2026-06-04 00:00:00.000000000 Z
-dependencies: []
-description: A Ruby library for converting PDF documents to Markdown using OCR.
+date: 2026-06-05 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: typhoeus
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.4'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.4'
+- !ruby/object:Gem::Dependency
+  name: terrapin
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 1.1.1
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 1.1.1
+- !ruby/object:Gem::Dependency
+  name: pry
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.14'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.14'
+- !ruby/object:Gem::Dependency
+  name: listen
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.0'
+description: A Ruby library for converting PDF documents to Markdown using OCR using
+  a locally hosted OpenAI compatible server.
 email:
 - guillermo.molini@gmail.com
 executables: