pdf2markdownOCR 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +59 -2
- data/lib/pdf2markdownOCR/cli.rb +9 -1
- data/lib/pdf2markdownOCR/llm_api.rb +1 -3
- data/lib/pdf2markdownOCR/pdf2image.rb +51 -14
- data/lib/pdf2markdownOCR/version.rb +1 -1
- data/lib/pdf2markdownOCR.rb +4 -4
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 1391a8f43986ec5bfcc1b29bc412c71a44a0843fd4bb4364eb34cd5b6c69275b
|
|
4
|
+
data.tar.gz: f780980fa2a82a1de804e65d162b69c5a067bae4680d9848d99bc6d36c7b3367
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 9378b9851b4487119ad3291c27fc4bb5914087713e1eb5a7838d2de5a094b76c981235fdcae98ea829daab763852fc283e34953ec9d25affbc7bfd46a20c9f6e
|
|
7
|
+
data.tar.gz: c40a6d78f42f6e55872242ebfa1b3bb651e81a8963aac3b5bfd268186d868d571a7ba739fe973da398fbba27bf77a88b8a894615dcc79843b93d596e1996d5e0
|
data/README.md
CHANGED
|
@@ -77,7 +77,7 @@ end
|
|
|
77
77
|
```ruby
|
|
78
78
|
require 'pdf2markdownOCR'
|
|
79
79
|
|
|
80
|
-
markdown = Pdf2MarkdownOCR.convert_pdf("document.pdf")
|
|
80
|
+
markdown = Pdf2MarkdownOCR.convert_pdf(pdf_path: "document.pdf")
|
|
81
81
|
puts markdown
|
|
82
82
|
```
|
|
83
83
|
|
|
@@ -85,10 +85,17 @@ puts markdown
|
|
|
85
85
|
|
|
86
86
|
```ruby
|
|
87
87
|
|
|
88
|
-
Pdf2MarkdownOCR.convert_pdf("document.pdf", "output.md")
|
|
88
|
+
Pdf2MarkdownOCR.convert_pdf(pdf_path: "document.pdf", output_file: "output.md")
|
|
89
89
|
# => nil (content written to output.md)
|
|
90
90
|
```
|
|
91
91
|
|
|
92
|
+
### Convert specific page range
|
|
93
|
+
|
|
94
|
+
```ruby
|
|
95
|
+
|
|
96
|
+
Pdf2MarkdownOCR.convert_pdf(pdf_path: "document.pdf", output_file: "output.md", pages: "1,2,5-7") #Will convert pages 1,2,5,6,7
|
|
97
|
+
```
|
|
98
|
+
|
|
92
99
|
## Usage as a CLI
|
|
93
100
|
|
|
94
101
|
After installation, the `pdf2markdownocr` executable is available on your `PATH`. Options are the same as in the configuration block.
|
|
@@ -104,6 +111,7 @@ Options:
|
|
|
104
111
|
--llm-model MODEL
|
|
105
112
|
--mode Processing mode: single_thread or multi_thread
|
|
106
113
|
--png-dpi DPI resolution for PNG conversion
|
|
114
|
+
--pages Page range
|
|
107
115
|
-h, --help Show help message
|
|
108
116
|
|
|
109
117
|
```
|
|
@@ -125,6 +133,55 @@ pdf2markdownocr document.pdf -o result.md --llm-api-url http://localhost:9800 --
|
|
|
125
133
|
pdf2markdownocr --version
|
|
126
134
|
```
|
|
127
135
|
|
|
136
|
+
## Running the models
|
|
137
|
+
|
|
138
|
+
### Ollama setup
|
|
139
|
+
|
|
140
|
+
Easy to try, but not recommended: performance isn't great because it doesn't process requests in parallel.
|
|
141
|
+
|
|
142
|
+
Pull the model
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
ollama pull deepseek-ocr:latest
|
|
146
|
+
ollama run deepseek-ocr:latest
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
Then call the tool with the correct port and model
|
|
150
|
+
|
|
151
|
+
```bash
|
|
152
|
+
pdf2markdownocr document.pdf -o result.md --llm-api-url http://localhost:11434 --llm-model deepseek-ocr:latest
|
|
153
|
+
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
### vLLM
|
|
157
|
+
|
|
158
|
+
[Official vLLM Guide](https://docs.vllm.ai/projects/recipes/en/latest/DeepSeek/DeepSeek-OCR-2.html)
|
|
159
|
+
|
|
160
|
+
1. Install uv, torch, and vllm
|
|
161
|
+
|
|
162
|
+
```bash
|
|
163
|
+
uv venv
|
|
164
|
+
source .venv/bin/activate
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
I've had problems with my GPU when using the default vllm install, and I find that installing torch and torchvision separately helps. [Install PyTorch](https://pytorch.org/get-started/locally/)
|
|
168
|
+
|
|
169
|
+
```bash
|
|
170
|
+
uv run pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cu132 #This will depend on the cuda version installed in your system
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
Install vllm
|
|
174
|
+
|
|
175
|
+
```bash
|
|
176
|
+
uv pip install -U vllm --torch-backend auto
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
Then run the model
|
|
180
|
+
|
|
181
|
+
```bash
|
|
182
|
+
uv run vllm serve deepseek-ai/DeepSeek-OCR-2 --logits_processors vllm.model_executor.models.deepseek_ocr:NGramPerReqLogitsProcessor --no-enable-prefix-caching --mm-processor-cache-gb 0
|
|
183
|
+
```
|
|
184
|
+
|
|
128
185
|
## License
|
|
129
186
|
|
|
130
187
|
MIT
|
data/lib/pdf2markdownOCR/cli.rb
CHANGED
|
@@ -37,6 +37,11 @@ module Pdf2MarkdownOCR
|
|
|
37
37
|
options[:png_dpi] = dpi
|
|
38
38
|
end
|
|
39
39
|
|
|
40
|
+
opts.on("--pages PAGES", "Pages to process, separated by commas. Ranges can be provided with a dash (e.g., 1,3-5)") do |pages|
|
|
41
|
+
options[:pages] = pages
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
|
|
40
45
|
opts.on("-v", "--version", "Print version") do
|
|
41
46
|
puts Pdf2MarkdownOCR.gem_version
|
|
42
47
|
exit
|
|
@@ -67,7 +72,10 @@ module Pdf2MarkdownOCR
|
|
|
67
72
|
config.png_dpi_resolution = options[:png_dpi] if options[:png_dpi]
|
|
68
73
|
end
|
|
69
74
|
|
|
70
|
-
markdown_content = Pdf2MarkdownOCR.convert_pdf(pdf_path,
|
|
75
|
+
markdown_content = Pdf2MarkdownOCR.convert_pdf(pdf_path: pdf_path,
|
|
76
|
+
output_file: options[:output],
|
|
77
|
+
pages: options[:pages])
|
|
78
|
+
|
|
71
79
|
if markdown_content && !markdown_content.empty? && !options[:output]
|
|
72
80
|
puts markdown_content
|
|
73
81
|
end
|
|
@@ -57,12 +57,10 @@ module Pdf2MarkdownOCR
|
|
|
57
57
|
Pdf2MarkdownOCR.configuration.logger.warn "Warning: No Markdown content generated for #{image_path}"
|
|
58
58
|
end
|
|
59
59
|
else
|
|
60
|
-
Pdf2MarkdownOCR.configuration.logger.error "Error processing #{image_path}: #{response.
|
|
60
|
+
Pdf2MarkdownOCR.configuration.logger.error "Error processing #{image_path}: #{response.body} (#{response.code})"
|
|
61
61
|
end
|
|
62
62
|
end
|
|
63
63
|
|
|
64
|
-
|
|
65
|
-
|
|
66
64
|
hydra.queue(request)
|
|
67
65
|
end
|
|
68
66
|
hydra.run
|
|
@@ -3,39 +3,76 @@ require 'terrapin'
|
|
|
3
3
|
|
|
4
4
|
module Pdf2MarkdownOCR
|
|
5
5
|
module Pdf2Image
|
|
6
|
-
|
|
6
|
+
|
|
7
|
+
def self.single_thread_conversion(pdf_path:, output_prefix:, pages: nil)
|
|
7
8
|
Pdf2MarkdownOCR.configuration.logger.info "Converting #{pdf_path} into images. DPI: #{Pdf2MarkdownOCR.configuration.png_dpi_resolution}"
|
|
8
9
|
|
|
9
10
|
t1 = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
11
|
+
|
|
12
|
+
if pages.nil?
|
|
13
|
+
line = Terrapin::CommandLine.new(
|
|
14
|
+
"pdftoppm",
|
|
15
|
+
"-png -r #{Pdf2MarkdownOCR.configuration.png_dpi_resolution} #{pdf_path} #{output_prefix}/pdf2ocr"
|
|
16
|
+
)
|
|
17
|
+
line.run
|
|
18
|
+
else
|
|
19
|
+
pages.split(",").each do |page_range|
|
|
20
|
+
if page_range.include?("-")
|
|
21
|
+
start_page, end_page = page_range.split("-").map(&:to_i)
|
|
22
|
+
line = Terrapin::CommandLine.new(
|
|
23
|
+
"pdftoppm",
|
|
24
|
+
"-png -r #{Pdf2MarkdownOCR.configuration.png_dpi_resolution}" +
|
|
25
|
+
" -f #{start_page} -l #{end_page}" +
|
|
26
|
+
" #{pdf_path} #{output_prefix}/pdf2ocr"
|
|
27
|
+
)
|
|
28
|
+
line.run
|
|
29
|
+
else
|
|
30
|
+
line = Terrapin::CommandLine.new(
|
|
31
|
+
"pdftoppm",
|
|
32
|
+
"-png -r #{Pdf2MarkdownOCR.configuration.png_dpi_resolution}" +
|
|
33
|
+
" -f #{page_range} -l #{page_range}" +
|
|
34
|
+
" #{pdf_path} #{output_prefix}/pdf2ocr"
|
|
35
|
+
)
|
|
36
|
+
line.run
|
|
37
|
+
end
|
|
38
|
+
end
|
|
39
|
+
end
|
|
15
40
|
|
|
16
41
|
t2 = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
17
42
|
Pdf2MarkdownOCR.configuration.logger.info "PDF to image conversion time: #{(t2 - t1).round(2)} seconds"
|
|
18
43
|
Dir.glob("#{output_prefix}/pdf2ocr*.png").sort
|
|
19
44
|
end
|
|
20
45
|
|
|
21
|
-
def self.multi_thread_conversion(pdf_path
|
|
46
|
+
def self.multi_thread_conversion(pdf_path:, output_prefix:, pages: nil)
|
|
22
47
|
|
|
23
48
|
Pdf2MarkdownOCR.configuration.logger.info "Converting #{pdf_path} into images. DPI: #{Pdf2MarkdownOCR.configuration.png_dpi_resolution}"
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
49
|
+
|
|
50
|
+
pages_to_process = []
|
|
51
|
+
if pages.nil?
|
|
52
|
+
info = Terrapin::CommandLine.new("pdfinfo", pdf_path).run
|
|
53
|
+
total_pages = info.match(/^Pages:\s+(\d+)/)[1].to_i
|
|
54
|
+
pages_to_process = (1..total_pages).to_a
|
|
55
|
+
else
|
|
56
|
+
pages.split(",").each do |page_range|
|
|
57
|
+
if page_range.include?("-")
|
|
58
|
+
start_page, end_page = page_range.split("-").map(&:to_i)
|
|
59
|
+
pages_to_process += (start_page..end_page).to_a
|
|
60
|
+
else
|
|
61
|
+
pages_to_process << page_range.to_i
|
|
62
|
+
end
|
|
63
|
+
end
|
|
64
|
+
end
|
|
28
65
|
|
|
29
|
-
Pdf2MarkdownOCR.configuration.logger.info "Total pages: #{
|
|
66
|
+
Pdf2MarkdownOCR.configuration.logger.info "Total pages to process: #{pages_to_process.size}"
|
|
30
67
|
t1 = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
31
|
-
threads =
|
|
68
|
+
threads = pages_to_process.map do |page|
|
|
32
69
|
Thread.new do
|
|
33
70
|
Terrapin::CommandLine.new(
|
|
34
71
|
"pdftoppm",
|
|
35
72
|
"-png -r #{Pdf2MarkdownOCR.configuration.png_dpi_resolution} -f #{page} -l #{page} #{pdf_path} #{output_prefix}/pdf2ocr"
|
|
36
73
|
).run
|
|
37
74
|
|
|
38
|
-
Pdf2MarkdownOCR.configuration.logger.info "Converted page #{page}/#{
|
|
75
|
+
Pdf2MarkdownOCR.configuration.logger.info "Converted page #{page}/#{pages_to_process.size}"
|
|
39
76
|
end
|
|
40
77
|
end
|
|
41
78
|
|
data/lib/pdf2markdownOCR.rb
CHANGED
|
@@ -27,7 +27,7 @@ module Pdf2MarkdownOCR
|
|
|
27
27
|
|
|
28
28
|
end
|
|
29
29
|
|
|
30
|
-
def self.convert_pdf(pdf_path
|
|
30
|
+
def self.convert_pdf(pdf_path:, output_file: nil, pages: nil)
|
|
31
31
|
|
|
32
32
|
Pdf2MarkdownOCR.configuration.logger.info "Parsing PDF file: #{pdf_path}"
|
|
33
33
|
unless File.exist?(pdf_path)
|
|
@@ -37,13 +37,13 @@ module Pdf2MarkdownOCR
|
|
|
37
37
|
|
|
38
38
|
markdown_content = ""
|
|
39
39
|
begin
|
|
40
|
-
tempdir = Dir.mktmpdir
|
|
40
|
+
tempdir = Dir.mktmpdir("pdf2markdownocr")
|
|
41
41
|
|
|
42
42
|
images = []
|
|
43
43
|
if Pdf2MarkdownOCR.configuration.mode == :multi_thread
|
|
44
|
-
images = Pdf2MarkdownOCR::Pdf2Image.multi_thread_conversion(pdf_path, tempdir)
|
|
44
|
+
images = Pdf2MarkdownOCR::Pdf2Image.multi_thread_conversion(pdf_path: pdf_path, pages: pages, output_prefix: tempdir)
|
|
45
45
|
else
|
|
46
|
-
images = Pdf2MarkdownOCR::Pdf2Image.single_thread_conversion(pdf_path, tempdir)
|
|
46
|
+
images = Pdf2MarkdownOCR::Pdf2Image.single_thread_conversion(pdf_path: pdf_path, pages: pages, output_prefix: tempdir)
|
|
47
47
|
end
|
|
48
48
|
|
|
49
49
|
markdown_content = Pdf2MarkdownOCR::LlmApi.ocr_images(images)
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: pdf2markdownOCR
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0.
|
|
4
|
+
version: 0.0.3
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Guillermo Molini
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-06-
|
|
11
|
+
date: 2026-06-05 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: typhoeus
|