llamaparserb 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -5
- data/README.md +111 -10
- data/lib/llamaparserb/version.rb +1 -1
- data/lib/llamaparserb.rb +71 -7
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9c49dc1624b1b955cad8032696308fc81357533113cf429f53fe063d5db0cab2
|
4
|
+
data.tar.gz: 2eee40a9054fe05d02a094828bb47a6cc3a5b65ca6fcb5864b9648cff5f0b418
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 50ed45e0d813d776b79fce2e87593cdc88063158045c62ba76e1bc0e353fb1b27b2b03b87dc4253443b4b2a355fec179708d04e1920dc0f47730c271e3ff81cc
|
7
|
+
data.tar.gz: 9f86c50b3f5c4bd987c9bab987c5b139ce1dd998577d04ea3fba53d64b430d3cdcc0af3d880d73ebea1914341bc185fb78469f78d9521098f06510c7024b8746
|
data/CHANGELOG.md
CHANGED
@@ -6,10 +6,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
6
6
|
|
7
7
|
## [Unreleased]
|
8
8
|
|
9
|
+
## [0.3.0] - 2024-11-28
|
10
|
+
### Added
|
11
|
+
- Add support for parsing files from URLs
|
12
|
+
|
13
|
+
## [0.2.3] - 2024-11-28
|
14
|
+
### Added
|
15
|
+
- Add support for all supported optional LlamaParse parameters to `parse_file`
|
16
|
+
|
17
|
+
[0.2.3]: https://github.com/horizing/llamaparserb/compare/v0.2.2...v0.2.3
|
18
|
+
|
9
19
|
## [0.2.2] - 2024-11-28
|
10
20
|
### Fixed
|
11
21
|
- Fix issue with handling file path
|
12
22
|
|
23
|
+
[0.2.2]: https://github.com/horizing/llamaparserb/releases/tag/v0.2.2
|
24
|
+
|
13
25
|
## [0.2.1] - 2024-11-28
|
14
26
|
### Fixed
|
15
27
|
- Fix parse_file to handle files that are not on the local filesystem
|
@@ -25,8 +37,4 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
25
37
|
|
26
38
|
## [0.1.0] - 2024-11-27
|
27
39
|
### Added
|
28
|
-
- Initial release
|
29
|
-
|
30
|
-
[Unreleased]: https://github.com/horizing/llamaparserb/compare/v0.1.0...HEAD
|
31
|
-
[0.1.1]: https://github.com/horizing/llamaparserb/releases/tag/v0.1.1...v0.1.0
|
32
|
-
[0.1.0]: https://github.com/horizing/llamaparserb/releases/tag/v0.1.0
|
40
|
+
- Initial release
|
data/README.md
CHANGED
@@ -43,11 +43,14 @@ text = client.parse_file(file_content, 'pdf')
|
|
43
43
|
# Parse a file to markdown
|
44
44
|
client = Llamaparserb::Client.new(ENV['LLAMA_CLOUD_API_KEY'], result_type: "markdown")
|
45
45
|
markdown = client.parse_file('path/to/document.pdf')
|
46
|
+
|
47
|
+
# Parse a file from a URL
|
48
|
+
markdown = client.parse_file('https://example.com/document.pdf')
|
46
49
|
```
|
47
50
|
|
48
51
|
### File Input Options
|
49
52
|
|
50
|
-
The `parse_file` method accepts
|
53
|
+
The `parse_file` method accepts three types of inputs:
|
51
54
|
|
52
55
|
1. File path (String):
|
53
56
|
```ruby
|
@@ -69,25 +72,123 @@ temp_file = Tempfile.new(['document', '.pdf'])
|
|
69
72
|
client.parse_file(temp_file, 'pdf')
|
70
73
|
```
|
71
74
|
|
75
|
+
3. URL (String):
|
76
|
+
```ruby
|
77
|
+
client.parse_file('https://example.com/document.pdf')
|
78
|
+
```
|
79
|
+
|
72
80
|
### Advanced Options
|
73
81
|
|
74
82
|
```ruby
|
75
83
|
client = Llamaparserb::Client.new(
|
76
84
|
ENV['LLAMA_CLOUD_API_KEY'],
|
77
85
|
{
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
86
|
+
# Basic Configuration
|
87
|
+
result_type: "markdown", # Output format: "text" or "markdown"
|
88
|
+
num_workers: 4, # Number of workers for concurrent processing
|
89
|
+
check_interval: 1, # How often to check job status (seconds)
|
90
|
+
max_timeout: 2000, # Maximum time to wait for parsing (seconds)
|
91
|
+
verbose: true, # Enable detailed logging
|
92
|
+
show_progress: true, # Show progress during parsing
|
93
|
+
ignore_errors: true, # Return nil instead of raising errors
|
94
|
+
|
95
|
+
# Language and Parsing Options
|
96
|
+
language: :en, # Target language for parsing
|
97
|
+
parsing_instruction: "", # Custom parsing instructions
|
98
|
+
skip_diagonal_text: false, # Skip diagonal text in documents
|
99
|
+
invalidate_cache: false, # Force reprocessing of cached documents
|
100
|
+
do_not_cache: false, # Disable caching of results
|
101
|
+
|
102
|
+
# Processing Modes
|
103
|
+
fast_mode: false, # Enable faster processing (may reduce quality)
|
104
|
+
premium_mode: false, # Enable premium parsing features
|
105
|
+
continuous_mode: false, # Process document as continuous text
|
106
|
+
do_not_unroll_columns: false, # Keep columnar text structure
|
107
|
+
|
108
|
+
# Page Handling
|
109
|
+
split_by_page: true, # Split result by pages
|
110
|
+
page_separator: "\n\n", # Custom page separator
|
111
|
+
page_prefix: "Page ", # Text to prepend to each page
|
112
|
+
page_suffix: "\n", # Text to append to each page
|
113
|
+
target_pages: [1,2,3], # Array of specific pages to process
|
114
|
+
bounding_box: { # Specify area to parse (coordinates in pixels)
|
115
|
+
x1: 0, y1: 0, # Top-left corner
|
116
|
+
x2: 612, y2: 792 # Bottom-right corner
|
117
|
+
},
|
118
|
+
|
119
|
+
# OCR and Image Processing
|
120
|
+
disable_ocr: false, # Disable Optical Character Recognition
|
121
|
+
take_screenshot: false, # Capture screenshot of document
|
122
|
+
|
123
|
+
# Advanced Processing Features
|
124
|
+
gpt4o_mode: false, # Enable GPT-4o mode (parse with the GPT-4o model)
|
125
|
+
gpt4o_api_key: "key", # API key used for GPT-4o mode
|
126
|
+
guess_xlsx_sheet_names: false, # Attempt to guess Excel sheet names
|
127
|
+
is_formatting_instruction: false, # Use formatting instructions
|
128
|
+
annotate_links: false, # Include link annotations in output
|
129
|
+
|
130
|
+
# Multimodal Processing
|
131
|
+
vendor_multimodal_api_key: "key", # API key for multimodal processing
|
132
|
+
use_vendor_multimodal_model: false, # Enable multimodal model
|
133
|
+
vendor_multimodal_model_name: "model", # Specify multimodal model
|
134
|
+
|
135
|
+
# Integration Options
|
136
|
+
webhook_url: "https://...", # URL for webhook notifications
|
137
|
+
http_proxy: "http://...", # HTTP proxy configuration
|
138
|
+
|
139
|
+
# Azure OpenAI Configuration
|
140
|
+
azure_openai_deployment_name: "deployment", # Azure OpenAI deployment name
|
141
|
+
azure_openai_endpoint: "endpoint", # Azure OpenAI endpoint
|
142
|
+
azure_openai_api_version: "2023-05-15", # Azure OpenAI API version
|
143
|
+
azure_openai_key: "key" # Azure OpenAI API key
|
87
144
|
}
|
88
145
|
)
|
89
146
|
```
|
90
147
|
|
148
|
+
### Feature-Specific Options
|
149
|
+
|
150
|
+
#### Page Processing
|
151
|
+
- `split_by_page`: Split the document into separate pages
|
152
|
+
- `page_separator`: Custom text to insert between pages
|
153
|
+
- `page_prefix`/`page_suffix`: Add custom text before/after each page
|
154
|
+
- `target_pages`: Process only specific pages
|
155
|
+
- `bounding_box`: Parse only a specific area of the document
|
156
|
+
|
157
|
+
#### OCR and Image Processing
|
158
|
+
- `disable_ocr`: Turn off Optical Character Recognition
|
159
|
+
- `take_screenshot`: Generate document screenshots
|
160
|
+
- `skip_diagonal_text`: Ignore text at diagonal angles
|
161
|
+
|
162
|
+
#### Advanced Processing
|
163
|
+
- `continuous_mode`: Process text as a continuous stream
|
164
|
+
- `do_not_unroll_columns`: Preserve column structure
|
165
|
+
- `guess_xlsx_sheet_names`: Auto-detect Excel sheet names
|
166
|
+
- `annotate_links`: Include document hyperlinks in output
|
167
|
+
- `is_formatting_instruction`: Use special formatting instructions
|
168
|
+
|
169
|
+
#### Performance Options
|
170
|
+
- `fast_mode`: Faster processing with potential quality trade-offs
|
171
|
+
- `premium_mode`: Access to premium features
|
172
|
+
- `invalidate_cache`/`do_not_cache`: Control result caching
|
173
|
+
- `num_workers`: Configure concurrent processing
|
174
|
+
|
175
|
+
#### Integration Features
|
176
|
+
- `webhook_url`: Receive processing notifications
|
177
|
+
- `http_proxy`: Configure proxy settings
|
178
|
+
|
179
|
+
#### Azure OpenAI Integration
|
180
|
+
Configure Azure OpenAI services with:
|
181
|
+
- `azure_openai_deployment_name`
|
182
|
+
- `azure_openai_endpoint`
|
183
|
+
- `azure_openai_api_version`
|
184
|
+
- `azure_openai_key`
|
185
|
+
|
186
|
+
#### Multimodal Processing
|
187
|
+
Enable advanced multimodal processing with:
|
188
|
+
- `vendor_multimodal_api_key`
|
189
|
+
- `use_vendor_multimodal_model`
|
190
|
+
- `vendor_multimodal_model_name`
|
191
|
+
|
91
192
|
### Supported File Types
|
92
193
|
|
93
194
|
The client supports a wide range of file formats including:
|
data/lib/llamaparserb/version.rb
CHANGED
data/lib/llamaparserb.rb
CHANGED
@@ -51,6 +51,9 @@ module Llamaparserb
|
|
51
51
|
elsif File.exist?(file_input)
|
52
52
|
job_id = create_job_from_path(file_input)
|
53
53
|
log "Started parsing file under job_id #{job_id}", :info
|
54
|
+
elsif URI::DEFAULT_PARSER.make_regexp.match?(file_input)
|
55
|
+
job_id = create_job_from_url(file_input)
|
56
|
+
log "Started parsing URL under job_id #{job_id}", :info
|
54
57
|
else
|
55
58
|
raise Error, "file_type parameter is required for binary string input"
|
56
59
|
end
|
@@ -100,7 +103,20 @@ module Llamaparserb
|
|
100
103
|
bounding_box: nil,
|
101
104
|
target_pages: nil,
|
102
105
|
ignore_errors: true,
|
103
|
-
split_by_page: true
|
106
|
+
split_by_page: true,
|
107
|
+
vendor_multimodal_api_key: nil,
|
108
|
+
use_vendor_multimodal_model: false,
|
109
|
+
vendor_multimodal_model_name: nil,
|
110
|
+
take_screenshot: false,
|
111
|
+
disable_ocr: false,
|
112
|
+
is_formatting_instruction: false,
|
113
|
+
annotate_links: false,
|
114
|
+
webhook_url: nil,
|
115
|
+
azure_openai_deployment_name: nil,
|
116
|
+
azure_openai_endpoint: nil,
|
117
|
+
azure_openai_api_version: nil,
|
118
|
+
azure_openai_key: nil,
|
119
|
+
http_proxy: nil
|
104
120
|
}
|
105
121
|
end
|
106
122
|
|
@@ -184,7 +200,7 @@ module Llamaparserb
|
|
184
200
|
def build_connection
|
185
201
|
Faraday.new(url: base_url) do |f|
|
186
202
|
f.request :multipart
|
187
|
-
f.request :
|
203
|
+
f.request :url_encoded
|
188
204
|
f.response :json
|
189
205
|
f.response :raise_error
|
190
206
|
f.adapter Faraday.default_adapter
|
@@ -221,7 +237,13 @@ module Llamaparserb
|
|
221
237
|
temp_file,
|
222
238
|
detect_content_type(temp_file.path)
|
223
239
|
)
|
224
|
-
|
240
|
+
|
241
|
+
response = @connection.post("upload") do |req|
|
242
|
+
req.headers["Authorization"] = "Bearer #{api_key}"
|
243
|
+
req.body = {file: file}
|
244
|
+
end
|
245
|
+
|
246
|
+
response.body["id"]
|
225
247
|
ensure
|
226
248
|
temp_file&.close
|
227
249
|
temp_file&.unlink
|
@@ -236,9 +258,8 @@ module Llamaparserb
|
|
236
258
|
response.body["id"]
|
237
259
|
end
|
238
260
|
|
239
|
-
def upload_params(file)
|
240
|
-
{
|
241
|
-
file: file,
|
261
|
+
def upload_params(file = nil, url = nil)
|
262
|
+
params = {
|
242
263
|
language: @options[:language].to_s,
|
243
264
|
parsing_instruction: @options[:parsing_instruction],
|
244
265
|
invalidate_cache: @options[:invalidate_cache],
|
@@ -250,8 +271,36 @@ module Llamaparserb
|
|
250
271
|
do_not_unroll_columns: @options[:do_not_unroll_columns],
|
251
272
|
gpt4o_mode: @options[:gpt4o_mode],
|
252
273
|
gpt4o_api_key: @options[:gpt4o_api_key],
|
274
|
+
vendor_multimodal_api_key: @options[:vendor_multimodal_api_key],
|
275
|
+
use_vendor_multimodal_model: @options[:use_vendor_multimodal_model],
|
276
|
+
vendor_multimodal_model_name: @options[:vendor_multimodal_model_name],
|
277
|
+
take_screenshot: @options[:take_screenshot],
|
278
|
+
disable_ocr: @options[:disable_ocr],
|
279
|
+
guess_xlsx_sheet_names: @options[:guess_xlsx_sheet_names],
|
280
|
+
is_formatting_instruction: @options[:is_formatting_instruction],
|
281
|
+
annotate_links: @options[:annotate_links],
|
253
282
|
from_ruby_package: true
|
254
|
-
}
|
283
|
+
}
|
284
|
+
|
285
|
+
params[:page_separator] = @options[:page_separator] if @options[:page_separator]
|
286
|
+
params[:page_prefix] = @options[:page_prefix] if @options[:page_prefix]
|
287
|
+
params[:page_suffix] = @options[:page_suffix] if @options[:page_suffix]
|
288
|
+
params[:bounding_box] = @options[:bounding_box] if @options[:bounding_box]
|
289
|
+
params[:target_pages] = @options[:target_pages] if @options[:target_pages]
|
290
|
+
params[:webhook_url] = @options[:webhook_url] if @options[:webhook_url]
|
291
|
+
params[:azure_openai_deployment_name] = @options[:azure_openai_deployment_name] if @options[:azure_openai_deployment_name]
|
292
|
+
params[:azure_openai_endpoint] = @options[:azure_openai_endpoint] if @options[:azure_openai_endpoint]
|
293
|
+
params[:azure_openai_api_version] = @options[:azure_openai_api_version] if @options[:azure_openai_api_version]
|
294
|
+
params[:azure_openai_key] = @options[:azure_openai_key] if @options[:azure_openai_key]
|
295
|
+
params[:http_proxy] = @options[:http_proxy] if @options[:http_proxy]
|
296
|
+
|
297
|
+
if url
|
298
|
+
params[:input_url] = url.to_s
|
299
|
+
elsif file
|
300
|
+
params[:file] = file
|
301
|
+
end
|
302
|
+
|
303
|
+
params.compact
|
255
304
|
end
|
256
305
|
|
257
306
|
def get_job_status(job_id)
|
@@ -300,5 +349,20 @@ module Llamaparserb
|
|
300
349
|
raise Error, "Unsupported file type: #{extension}. Supported types: #{SUPPORTED_FILE_TYPES.join(", ")}"
|
301
350
|
end
|
302
351
|
end
|
352
|
+
|
353
|
+
def create_job_from_url(url)
|
354
|
+
log "Creating job from URL: #{url}", :debug
|
355
|
+
|
356
|
+
response = @connection.post("upload") do |req|
|
357
|
+
req.headers["Authorization"] = "Bearer #{api_key}"
|
358
|
+
req.headers["Accept"] = "application/json"
|
359
|
+
# Create a simple form data request
|
360
|
+
req.options.timeout = 30 # Optional: add timeout
|
361
|
+
req.body = {"input_url" => url.to_s}
|
362
|
+
end
|
363
|
+
|
364
|
+
log "Response: #{response.body.inspect}", :debug
|
365
|
+
response.body["id"]
|
366
|
+
end
|
303
367
|
end
|
304
368
|
end
|