pdf2markdownOCR 0.0.1 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f125dfcb354343c2aa778d36b422f338db5bf24734e48acbfe53bd57ad343569
4
- data.tar.gz: 620f9cc746b7da2a38b8b75fa8b9ca87ff44c8d13215aa1cea54e6d0c14aa36d
3
+ metadata.gz: 1391a8f43986ec5bfcc1b29bc412c71a44a0843fd4bb4364eb34cd5b6c69275b
4
+ data.tar.gz: f780980fa2a82a1de804e65d162b69c5a067bae4680d9848d99bc6d36c7b3367
5
5
  SHA512:
6
- metadata.gz: 4dd91a17be2519f2e6f8fb41455f7d27ec7a123f36d55fff0fa891a2cadfcbee36d7bc39d9848b32262abecf4055f2fe21631b69f9e5204a32b60fc695b48198
7
- data.tar.gz: bd2aad9b2417f69c9d1240ca8a06e20bfaf2b4d2ed5f7a47609b84acab1d73dd67983a18446467628d8279cac31f046fca013a69187f30cab3c726236a36a90c
6
+ metadata.gz: 9378b9851b4487119ad3291c27fc4bb5914087713e1eb5a7838d2de5a094b76c981235fdcae98ea829daab763852fc283e34953ec9d25affbc7bfd46a20c9f6e
7
+ data.tar.gz: c40a6d78f42f6e55872242ebfa1b3bb651e81a8963aac3b5bfd268186d868d571a7ba739fe973da398fbba27bf77a88b8a894615dcc79843b93d596e1996d5e0
data/README.md CHANGED
@@ -77,7 +77,7 @@ end
77
77
  ```ruby
78
78
  require 'pdf2markdownOCR'
79
79
 
80
- markdown = Pdf2MarkdownOCR.convert_pdf("document.pdf")
80
+ markdown = Pdf2MarkdownOCR.convert_pdf(pdf_path: "document.pdf")
81
81
  puts markdown
82
82
  ```
83
83
 
@@ -85,10 +85,17 @@ puts markdown
85
85
 
86
86
  ```ruby
87
87
 
88
- Pdf2MarkdownOCR.convert_pdf("document.pdf", "output.md")
88
+ Pdf2MarkdownOCR.convert_pdf(pdf_path: "document.pdf", output_file: "output.md")
89
89
  # => nil (content written to output.md)
90
90
  ```
91
91
 
92
+ ### Convert specific page range
93
+
94
+ ```ruby
95
+
96
+ Pdf2MarkdownOCR.convert_pdf(pdf_path: "document.pdf", output_file: "output.md", pages: "1,2,5-7") #Will convert pages 1,2,5,6,7
97
+ ```
98
+
92
99
  ## Usage as a CLI
93
100
 
94
101
  After installation the `pdf2markdownocr` executable is available on your `PATH`. Options are the same as in the configuration block
@@ -104,6 +111,7 @@ Options:
104
111
  --llm-model MODEL
105
112
  --mode Processing mode: single_thread or multi_thread
106
113
  --png-dpi DPI resolution for PNG conversion
114
+ --pages PAGES Pages to process, separated by commas; ranges use a dash (e.g., 1,3-5)
107
115
  -h, --help Show help message
108
116
 
109
117
  ```
@@ -125,6 +133,55 @@ pdf2markdownocr document.pdf -o result.md --llm-api-url http://localhost:9800 --
125
133
  pdf2markdownocr --version
126
134
  ```
127
135
 
136
+ ## Running the models
137
+
138
+ ### Ollama setup
139
+
140
+ Easy to try, but not recommended: performance isn't great because it doesn't process requests in parallel.
141
+
142
+ Pull the model
143
+
144
+ ```bash
145
+ ollama pull deepseek-ocr:latest
146
+ ollama run deepseek-ocr:latest
147
+ ```
148
+
149
+ Then call the tool with the correct port and model
150
+
151
+ ```bash
152
+ pdf2markdownocr document.pdf -o result.md --llm-api-url http://localhost:11434 --llm-model deepseek-ocr:latest
153
+
154
+ ```
155
+
156
+ ### vLLM
157
+
158
+ [Official vLLM Guide](https://docs.vllm.ai/projects/recipes/en/latest/DeepSeek/DeepSeek-OCR-2.html)
159
+
160
+ 1. Install uv, torch, and vllm
161
+
162
+ ```bash
163
+ uv venv
164
+ source .venv/bin/activate
165
+ ```
166
+
167
+ I've had problems with my GPU when using the default vllm install, and I find that installing torch and torchvision separately helps. [Install PyTorch](https://pytorch.org/get-started/locally/)
168
+
169
+ ```bash
170
+ uv run pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cu132 #This will depend on the cuda version installed in your system
171
+ ```
172
+
173
+ Install vllm
174
+
175
+ ```bash
176
+ uv pip install -U vllm --torch-backend auto
177
+ ```
178
+
179
+ Then run the model
180
+
181
+ ```bash
182
+ uv run vllm serve deepseek-ai/DeepSeek-OCR-2 --logits_processors vllm.model_executor.models.deepseek_ocr:NGramPerReqLogitsProcessor --no-enable-prefix-caching --mm-processor-cache-gb 0
183
+ ```
184
+
128
185
  ## License
129
186
 
130
187
  MIT
@@ -37,6 +37,11 @@ module Pdf2MarkdownOCR
37
37
  options[:png_dpi] = dpi
38
38
  end
39
39
 
40
+ opts.on("--pages PAGES", "Pages to process, separated by commas. Ranges can be provided with a dash (e.g., 1,3-5)") do |pages|
41
+ options[:pages] = pages
42
+ end
43
+
44
+
40
45
  opts.on("-v", "--version", "Print version") do
41
46
  puts Pdf2MarkdownOCR.gem_version
42
47
  exit
@@ -67,7 +72,10 @@ module Pdf2MarkdownOCR
67
72
  config.png_dpi_resolution = options[:png_dpi] if options[:png_dpi]
68
73
  end
69
74
 
70
- markdown_content = Pdf2MarkdownOCR.convert_pdf(pdf_path, options[:output])
75
+ markdown_content = Pdf2MarkdownOCR.convert_pdf(pdf_path: pdf_path,
76
+ output_file: options[:output],
77
+ pages: options[:pages])
78
+
71
79
  if markdown_content && !markdown_content.empty? && !options[:output]
72
80
  puts markdown_content
73
81
  end
@@ -11,16 +11,16 @@ module Pdf2MarkdownOCR
11
11
  {
12
12
  role: "user",
13
13
  content: [
14
+ {
15
+ type: "text",
16
+ text: "<image>\nFree OCR."
17
+ },
14
18
  {
15
19
  type: "image_url",
16
20
  image_url: {
17
21
  url: "data:image/png;base64,#{image_url}"
18
22
  }
19
23
  },
20
- {
21
- type: "text",
22
- text: "<image_url>\n Free OCR."
23
- }
24
24
  ]
25
25
  }
26
26
  ],
@@ -57,12 +57,10 @@ module Pdf2MarkdownOCR
57
57
  Pdf2MarkdownOCR.configuration.logger.warn "Warning: No Markdown content generated for #{image_path}"
58
58
  end
59
59
  else
60
- Pdf2MarkdownOCR.configuration.logger.error "Error processing #{image_path}: #{response.return_message} (#{response.code})"
60
+ Pdf2MarkdownOCR.configuration.logger.error "Error processing #{image_path}: #{response.body} (#{response.code})"
61
61
  end
62
62
  end
63
63
 
64
-
65
-
66
64
  hydra.queue(request)
67
65
  end
68
66
  hydra.run
@@ -3,39 +3,76 @@ require 'terrapin'
3
3
 
4
4
  module Pdf2MarkdownOCR
5
5
  module Pdf2Image
6
- def self.single_thread_conversion(pdf_path, output_prefix)
6
+
7
+ def self.single_thread_conversion(pdf_path:, output_prefix:, pages: nil)
7
8
  Pdf2MarkdownOCR.configuration.logger.info "Converting #{pdf_path} into images. DPI: #{Pdf2MarkdownOCR.configuration.png_dpi_resolution}"
8
9
 
9
10
  t1 = Process.clock_gettime(Process::CLOCK_MONOTONIC)
10
- line = Terrapin::CommandLine.new(
11
- "pdftoppm",
12
- "-png -r #{Pdf2MarkdownOCR.configuration.png_dpi_resolution} #{pdf_path} #{output_prefix}/pdf2ocr"
13
- )
14
- line.run
11
+
12
+ if pages.nil?
13
+ line = Terrapin::CommandLine.new(
14
+ "pdftoppm",
15
+ "-png -r #{Pdf2MarkdownOCR.configuration.png_dpi_resolution} #{pdf_path} #{output_prefix}/pdf2ocr"
16
+ )
17
+ line.run
18
+ else
19
+ pages.split(",").each do |page_range|
20
+ if page_range.include?("-")
21
+ start_page, end_page = page_range.split("-").map(&:to_i)
22
+ line = Terrapin::CommandLine.new(
23
+ "pdftoppm",
24
+ "-png -r #{Pdf2MarkdownOCR.configuration.png_dpi_resolution}" +
25
+ " -f #{start_page} -l #{end_page}" +
26
+ " #{pdf_path} #{output_prefix}/pdf2ocr"
27
+ )
28
+ line.run
29
+ else
30
+ line = Terrapin::CommandLine.new(
31
+ "pdftoppm",
32
+ "-png -r #{Pdf2MarkdownOCR.configuration.png_dpi_resolution}" +
33
+ " -f #{page_range} -l #{page_range}" +
34
+ " #{pdf_path} #{output_prefix}/pdf2ocr"
35
+ )
36
+ line.run
37
+ end
38
+ end
39
+ end
15
40
 
16
41
  t2 = Process.clock_gettime(Process::CLOCK_MONOTONIC)
17
42
  Pdf2MarkdownOCR.configuration.logger.info "PDF to image conversion time: #{(t2 - t1).round(2)} seconds"
18
43
  Dir.glob("#{output_prefix}/pdf2ocr*.png").sort
19
44
  end
20
45
 
21
- def self.multi_thread_conversion(pdf_path, output_prefix)
46
+ def self.multi_thread_conversion(pdf_path:, output_prefix:, pages: nil)
22
47
 
23
48
  Pdf2MarkdownOCR.configuration.logger.info "Converting #{pdf_path} into images. DPI: #{Pdf2MarkdownOCR.configuration.png_dpi_resolution}"
24
-
25
- # Get total page count
26
- info = Terrapin::CommandLine.new("pdfinfo", pdf_path).run
27
- total_pages = info.match(/^Pages:\s+(\d+)/)[1].to_i
49
+
50
+ pages_to_process = []
51
+ if pages.nil?
52
+ info = Terrapin::CommandLine.new("pdfinfo", pdf_path).run
53
+ total_pages = info.match(/^Pages:\s+(\d+)/)[1].to_i
54
+ pages_to_process = (1..total_pages).to_a
55
+ else
56
+ pages.split(",").each do |page_range|
57
+ if page_range.include?("-")
58
+ start_page, end_page = page_range.split("-").map(&:to_i)
59
+ pages_to_process += (start_page..end_page).to_a
60
+ else
61
+ pages_to_process << page_range.to_i
62
+ end
63
+ end
64
+ end
28
65
 
29
- Pdf2MarkdownOCR.configuration.logger.info "Total pages: #{total_pages}"
66
+ Pdf2MarkdownOCR.configuration.logger.info "Total pages to process: #{pages_to_process.size}"
30
67
  t1 = Process.clock_gettime(Process::CLOCK_MONOTONIC)
31
- threads = (1..total_pages).map do |page|
68
+ threads = pages_to_process.map do |page|
32
69
  Thread.new do
33
70
  Terrapin::CommandLine.new(
34
71
  "pdftoppm",
35
72
  "-png -r #{Pdf2MarkdownOCR.configuration.png_dpi_resolution} -f #{page} -l #{page} #{pdf_path} #{output_prefix}/pdf2ocr"
36
73
  ).run
37
74
 
38
- Pdf2MarkdownOCR.configuration.logger.info "Converted page #{page}/#{total_pages}"
75
+ Pdf2MarkdownOCR.configuration.logger.info "Converted page #{page}/#{pages_to_process.size}"
39
76
  end
40
77
  end
41
78
 
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Pdf2MarkdownOCR
4
- VERSION = "0.0.1"
4
+ VERSION = "0.0.3"
5
5
 
6
6
  def self.gem_version
7
7
  Gem::Version.new(VERSION)
@@ -3,7 +3,6 @@
3
3
  #
4
4
 
5
5
  require "base64"
6
- require "httparty"
7
6
  require 'json'
8
7
  require 'typhoeus'
9
8
 
@@ -28,7 +27,7 @@ module Pdf2MarkdownOCR
28
27
 
29
28
  end
30
29
 
31
- def self.convert_pdf(pdf_path, output_file = nil)
30
+ def self.convert_pdf(pdf_path:, output_file: nil, pages: nil)
32
31
 
33
32
  Pdf2MarkdownOCR.configuration.logger.info "Parsing PDF file: #{pdf_path}"
34
33
  unless File.exist?(pdf_path)
@@ -38,13 +37,13 @@ module Pdf2MarkdownOCR
38
37
 
39
38
  markdown_content = ""
40
39
  begin
41
- tempdir = Dir.mktmpdir
40
+ tempdir = Dir.mktmpdir("pdf2markdownocr")
42
41
 
43
42
  images = []
44
43
  if Pdf2MarkdownOCR.configuration.mode == :multi_thread
45
- images = Pdf2MarkdownOCR::Pdf2Image.multi_thread_conversion(pdf_path, tempdir)
44
+ images = Pdf2MarkdownOCR::Pdf2Image.multi_thread_conversion(pdf_path: pdf_path, pages: pages, output_prefix: tempdir)
46
45
  else
47
- images = Pdf2MarkdownOCR::Pdf2Image.single_thread_conversion(pdf_path, tempdir)
46
+ images = Pdf2MarkdownOCR::Pdf2Image.single_thread_conversion(pdf_path: pdf_path, pages: pages, output_prefix: tempdir)
48
47
  end
49
48
 
50
49
  markdown_content = Pdf2MarkdownOCR::LlmApi.ocr_images(images)
metadata CHANGED
@@ -1,16 +1,73 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf2markdownOCR
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Guillermo Molini
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2026-06-04 00:00:00.000000000 Z
12
- dependencies: []
13
- description: A Ruby library for converting PDF documents to Markdown using OCR.
11
+ date: 2026-06-05 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: typhoeus
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.4'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.4'
27
+ - !ruby/object:Gem::Dependency
28
+ name: terrapin
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 1.1.1
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 1.1.1
41
+ - !ruby/object:Gem::Dependency
42
+ name: pry
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '0.14'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '0.14'
55
+ - !ruby/object:Gem::Dependency
56
+ name: listen
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '3.0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '3.0'
69
+ description: A Ruby library for converting PDF documents to Markdown using OCR using
70
+ a locally hosted OpenAI compatible server.
14
71
  email:
15
72
  - guillermo.molini@gmail.com
16
73
  executables: