llamaparserb 0.2.2 → 0.3.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -5
- data/README.md +111 -10
- data/lib/llamaparserb/version.rb +1 -1
- data/lib/llamaparserb.rb +71 -7
- metadata +1 -1
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 9c49dc1624b1b955cad8032696308fc81357533113cf429f53fe063d5db0cab2
+  data.tar.gz: 2eee40a9054fe05d02a094828bb47a6cc3a5b65ca6fcb5864b9648cff5f0b418
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 50ed45e0d813d776b79fce2e87593cdc88063158045c62ba76e1bc0e353fb1b27b2b03b87dc4253443b4b2a355fec179708d04e1920dc0f47730c271e3ff81cc
+  data.tar.gz: 9f86c50b3f5c4bd987c9bab987c5b139ce1dd998577d04ea3fba53d64b430d3cdcc0af3d880d73ebea1914341bc185fb78469f78d9521098f06510c7024b8746
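For readers who want to check a downloaded artifact against the new checksums, here is a minimal sketch using Ruby's standard Digest library; the file path is a placeholder, and only the expected value comes from the diff above:

```ruby
require "digest"

# Expected SHA256 for data.tar.gz in 0.3.0, taken from the checksums above.
expected = "2eee40a9054fe05d02a094828bb47a6cc3a5b65ca6fcb5864b9648cff5f0b418"

# Placeholder path: point this at the extracted data.tar.gz of the downloaded gem.
actual = Digest::SHA256.file("data.tar.gz").hexdigest

puts(actual == expected ? "checksum matches" : "checksum mismatch")
```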
data/CHANGELOG.md
CHANGED

@@ -6,10 +6,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.3.0] - 2024-11-28
+### Added
+- Add support for parsing files from URLs
+
+## [0.2.3] - 2024-11-28
+### Added
+- Add support for all optional LlamaParse parameters to `parse_file`
+
+[0.2.3]: https://github.com/horizing/llamaparserb/releases/tag/v0.2.3...v0.2.2
+
 ## [0.2.2] - 2024-11-28
 ### Fixed
 - Fix issue with handling file path
 
+[0.2.2]: https://github.com/horizing/llamaparserb/releases/tag/v0.2.2
+
 ## [0.2.1] - 2024-11-28
 ### Fixed
 - Fix parse_file to handle files that are not on the local filesystem
@@ -25,8 +37,4 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [0.1.0] - 2024-11-27
 ### Added
-- Initial release
-
-[Unreleased]: https://github.com/horizing/llamaparserb/compare/v0.1.0...HEAD
-[0.1.1]: https://github.com/horizing/llamaparserb/releases/tag/v0.1.1...v0.1.0
-[0.1.0]: https://github.com/horizing/llamaparserb/releases/tag/v0.1.0
+- Initial release
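To pick up the URL parsing added in 0.3.0, a Gemfile entry pinned to the new minor version could look like this (illustrative; adjust the constraint to your own policy):

```ruby
# Gemfile (illustrative)
gem "llamaparserb", "~> 0.3.0"
```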
data/README.md
CHANGED

@@ -43,11 +43,14 @@ text = client.parse_file(file_content, 'pdf')
 # Parse a file to markdown
 client = Llamaparserb::Client.new(ENV['LLAMA_CLOUD_API_KEY'], result_type: "markdown")
 markdown = client.parse_file('path/to/document.pdf')
+
+# Parse a file from a URL
+markdown = client.parse_file('https://example.com/document.pdf')
 ```
 
 ### File Input Options
 
-The `parse_file` method accepts
+The `parse_file` method accepts three types of inputs:
 
 1. File path (String):
 ```ruby
@@ -69,25 +72,123 @@ temp_file = Tempfile.new(['document', '.pdf'])
 client.parse_file(temp_file, 'pdf')
 ```
 
+3. URL (String):
+```ruby
+client.parse_file('https://example.com/document.pdf')
+```
+
 ### Advanced Options
 
 ```ruby
 client = Llamaparserb::Client.new(
   ENV['LLAMA_CLOUD_API_KEY'],
   {
-
-
-
-
-
-
-
-
-
+    # Basic Configuration
+    result_type: "markdown",         # Output format: "text" or "markdown"
+    num_workers: 4,                  # Number of workers for concurrent processing
+    check_interval: 1,               # How often to check job status (seconds)
+    max_timeout: 2000,               # Maximum time to wait for parsing (seconds)
+    verbose: true,                   # Enable detailed logging
+    show_progress: true,             # Show progress during parsing
+    ignore_errors: true,             # Return nil instead of raising errors
+
+    # Language and Parsing Options
+    language: :en,                   # Target language for parsing
+    parsing_instruction: "",         # Custom parsing instructions
+    skip_diagonal_text: false,       # Skip diagonal text in documents
+    invalidate_cache: false,         # Force reprocessing of cached documents
+    do_not_cache: false,             # Disable caching of results
+
+    # Processing Modes
+    fast_mode: false,                # Enable faster processing (may reduce quality)
+    premium_mode: false,             # Enable premium parsing features
+    continuous_mode: false,          # Process document as continuous text
+    do_not_unroll_columns: false,    # Keep columnar text structure
+
+    # Page Handling
+    split_by_page: true,             # Split result by pages
+    page_separator: "\n\n",          # Custom page separator
+    page_prefix: "Page ",            # Text to prepend to each page
+    page_suffix: "\n",               # Text to append to each page
+    target_pages: [1,2,3],           # Array of specific pages to process
+    bounding_box: {                  # Specify area to parse (coordinates in pixels)
+      x1: 0, y1: 0,                  # Top-left corner
+      x2: 612, y2: 792               # Bottom-right corner
+    },
+
+    # OCR and Image Processing
+    disable_ocr: false,              # Disable Optical Character Recognition
+    take_screenshot: false,          # Capture screenshot of document
+
+    # Advanced Processing Features
+    gpt4o_mode: false,               # Enable GPT-4 Optimization mode
+    gpt4o_api_key: "key",            # API key for GPT-4 Optimization
+    guess_xlsx_sheet_names: false,   # Attempt to guess Excel sheet names
+    is_formatting_instruction: false, # Use formatting instructions
+    annotate_links: false,           # Include link annotations in output
+
+    # Multimodal Processing
+    vendor_multimodal_api_key: "key",      # API key for multimodal processing
+    use_vendor_multimodal_model: false,    # Enable multimodal model
+    vendor_multimodal_model_name: "model", # Specify multimodal model
+
+    # Integration Options
+    webhook_url: "https://...",      # URL for webhook notifications
+    http_proxy: "http://...",        # HTTP proxy configuration
+
+    # Azure OpenAI Configuration
+    azure_openai_deployment_name: "deployment", # Azure OpenAI deployment name
+    azure_openai_endpoint: "endpoint",          # Azure OpenAI endpoint
+    azure_openai_api_version: "2023-05-15",     # Azure OpenAI API version
+    azure_openai_key: "key"                     # Azure OpenAI API key
   }
 )
 ```
 
+### Feature-Specific Options
+
+#### Page Processing
+- `split_by_page`: Split the document into separate pages
+- `page_separator`: Custom text to insert between pages
+- `page_prefix`/`page_suffix`: Add custom text before/after each page
+- `target_pages`: Process only specific pages
+- `bounding_box`: Parse only a specific area of the document
+
+#### OCR and Image Processing
+- `disable_ocr`: Turn off Optical Character Recognition
+- `take_screenshot`: Generate document screenshots
+- `skip_diagonal_text`: Ignore text at diagonal angles
+
+#### Advanced Processing
+- `continuous_mode`: Process text as a continuous stream
+- `do_not_unroll_columns`: Preserve column structure
+- `guess_xlsx_sheet_names`: Auto-detect Excel sheet names
+- `annotate_links`: Include document hyperlinks in output
+- `is_formatting_instruction`: Use special formatting instructions
+
+#### Performance Options
+- `fast_mode`: Faster processing with potential quality trade-offs
+- `premium_mode`: Access to premium features
+- `invalidate_cache`/`do_not_cache`: Control result caching
+- `num_workers`: Configure concurrent processing
+
+#### Integration Features
+- `webhook_url`: Receive processing notifications
+- `http_proxy`: Configure proxy settings
+
+#### Azure OpenAI Integration
+Configure Azure OpenAI services with:
+- `azure_openai_deployment_name`
+- `azure_openai_endpoint`
+- `azure_openai_api_version`
+- `azure_openai_key`
+
+#### Multimodal Processing
+Enable advanced multimodal processing with:
+- `vendor_multimodal_api_key`
+- `use_vendor_multimodal_model`
+- `vendor_multimodal_model_name`
+
 ### Supported File Types
 
 The client supports a wide range of file formats including:
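As a quick illustration of the options documented above, the sketch below configures a client that parses only the first two pages and reports completion to a webhook; the option names come from the README, while the file path and webhook URL are placeholders:

```ruby
require "llamaparserb"

client = Llamaparserb::Client.new(
  ENV["LLAMA_CLOUD_API_KEY"],
  {
    result_type: "markdown",
    target_pages: [1, 2],                         # only the first two pages
    page_separator: "\n\n---\n\n",                # custom page separator
    webhook_url: "https://example.com/llamaparse" # placeholder endpoint
  }
)

markdown = client.parse_file("path/to/document.pdf")
```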
data/lib/llamaparserb/version.rb
CHANGED
data/lib/llamaparserb.rb
CHANGED

@@ -51,6 +51,9 @@ module Llamaparserb
       elsif File.exist?(file_input)
         job_id = create_job_from_path(file_input)
         log "Started parsing file under job_id #{job_id}", :info
+      elsif URI::DEFAULT_PARSER.make_regexp.match?(file_input)
+        job_id = create_job_from_url(file_input)
+        log "Started parsing URL under job_id #{job_id}", :info
       else
         raise Error, "file_type parameter is required for binary string input"
       end
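The hunk above teaches `parse_file` to recognize URLs: a local path that exists still wins, anything matching `URI::DEFAULT_PARSER.make_regexp` is treated as a URL, and everything else falls through to the binary-string branch. A standalone sketch of that dispatch (not the gem's code) behaves as follows:

```ruby
require "uri"

# Rough illustration of the new input dispatch in parse_file.
input = "https://example.com/document.pdf"

if File.exist?(input)
  puts "local file path"
elsif URI::DEFAULT_PARSER.make_regexp.match?(input)
  puts "URL"                     # => this branch for the example input
else
  puts "binary string (explicit file_type required)"
end
```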
@@ -100,7 +103,20 @@ module Llamaparserb
         bounding_box: nil,
         target_pages: nil,
         ignore_errors: true,
-        split_by_page: true
+        split_by_page: true,
+        vendor_multimodal_api_key: nil,
+        use_vendor_multimodal_model: false,
+        vendor_multimodal_model_name: nil,
+        take_screenshot: false,
+        disable_ocr: false,
+        is_formatting_instruction: false,
+        annotate_links: false,
+        webhook_url: nil,
+        azure_openai_deployment_name: nil,
+        azure_openai_endpoint: nil,
+        azure_openai_api_version: nil,
+        azure_openai_key: nil,
+        http_proxy: nil
       }
     end
 
@@ -184,7 +200,7 @@ module Llamaparserb
     def build_connection
       Faraday.new(url: base_url) do |f|
         f.request :multipart
-        f.request :
+        f.request :url_encoded
         f.response :json
         f.response :raise_error
         f.adapter Faraday.default_adapter
@@ -221,7 +237,13 @@ module Llamaparserb
         temp_file,
         detect_content_type(temp_file.path)
       )
-
+
+      response = @connection.post("upload") do |req|
+        req.headers["Authorization"] = "Bearer #{api_key}"
+        req.body = {file: file}
+      end
+
+      response.body["id"]
     ensure
       temp_file&.close
       temp_file&.unlink
@@ -236,9 +258,8 @@ module Llamaparserb
       response.body["id"]
     end
 
-    def upload_params(file)
-      {
-        file: file,
+    def upload_params(file = nil, url = nil)
+      params = {
         language: @options[:language].to_s,
         parsing_instruction: @options[:parsing_instruction],
         invalidate_cache: @options[:invalidate_cache],
@@ -250,8 +271,36 @@ module Llamaparserb
         do_not_unroll_columns: @options[:do_not_unroll_columns],
         gpt4o_mode: @options[:gpt4o_mode],
         gpt4o_api_key: @options[:gpt4o_api_key],
+        vendor_multimodal_api_key: @options[:vendor_multimodal_api_key],
+        use_vendor_multimodal_model: @options[:use_vendor_multimodal_model],
+        vendor_multimodal_model_name: @options[:vendor_multimodal_model_name],
+        take_screenshot: @options[:take_screenshot],
+        disable_ocr: @options[:disable_ocr],
+        guess_xlsx_sheet_names: @options[:guess_xlsx_sheet_names],
+        is_formatting_instruction: @options[:is_formatting_instruction],
+        annotate_links: @options[:annotate_links],
         from_ruby_package: true
-      }
+      }
+
+      params[:page_separator] = @options[:page_separator] if @options[:page_separator]
+      params[:page_prefix] = @options[:page_prefix] if @options[:page_prefix]
+      params[:page_suffix] = @options[:page_suffix] if @options[:page_suffix]
+      params[:bounding_box] = @options[:bounding_box] if @options[:bounding_box]
+      params[:target_pages] = @options[:target_pages] if @options[:target_pages]
+      params[:webhook_url] = @options[:webhook_url] if @options[:webhook_url]
+      params[:azure_openai_deployment_name] = @options[:azure_openai_deployment_name] if @options[:azure_openai_deployment_name]
+      params[:azure_openai_endpoint] = @options[:azure_openai_endpoint] if @options[:azure_openai_endpoint]
+      params[:azure_openai_api_version] = @options[:azure_openai_api_version] if @options[:azure_openai_api_version]
+      params[:azure_openai_key] = @options[:azure_openai_key] if @options[:azure_openai_key]
+      params[:http_proxy] = @options[:http_proxy] if @options[:http_proxy]
+
+      if url
+        params[:input_url] = url.to_s
+      elsif file
+        params[:file] = file
+      end
+
+      params.compact
     end
 
     def get_job_status(job_id)
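The reworked `upload_params` merges optional settings only when they are set and then chooses `input_url` over `file` for URL jobs. The following standalone snippet mirrors that merge-and-compact pattern on a plain hash (illustrative only; the option values are made up):

```ruby
# Stand-in for the upload_params merge behaviour: unset options never reach
# the payload, and a URL input is sent as :input_url instead of :file.
options = {target_pages: [1, 2], page_separator: nil}
url = "https://example.com/document.pdf"
file = nil

params = {language: "en", from_ruby_package: true}
params[:target_pages] = options[:target_pages] if options[:target_pages]
params[:page_separator] = options[:page_separator] if options[:page_separator]

if url
  params[:input_url] = url.to_s
elsif file
  params[:file] = file
end

p params.compact
# => {:language=>"en", :from_ruby_package=>true, :target_pages=>[1, 2],
#     :input_url=>"https://example.com/document.pdf"}
```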
@@ -300,5 +349,20 @@ module Llamaparserb
         raise Error, "Unsupported file type: #{extension}. Supported types: #{SUPPORTED_FILE_TYPES.join(", ")}"
       end
     end
+
+    def create_job_from_url(url)
+      log "Creating job from URL: #{url}", :debug
+
+      response = @connection.post("upload") do |req|
+        req.headers["Authorization"] = "Bearer #{api_key}"
+        req.headers["Accept"] = "application/json"
+        # Create a simple form data request
+        req.options.timeout = 30 # Optional: add timeout
+        req.body = {"input_url" => url.to_s}
+      end
+
+      log "Response: #{response.body.inspect}", :debug
+      response.body["id"]
+    end
   end
 end
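Putting the new private method together with the public API, an end-to-end call through the URL path might look like the sketch below; the URL is a placeholder, and with the default `ignore_errors: true` a failed parse returns `nil` rather than raising:

```ruby
require "llamaparserb"

client = Llamaparserb::Client.new(
  ENV["LLAMA_CLOUD_API_KEY"],
  result_type: "markdown",
  verbose: true # detailed logging, per the README options
)

markdown = client.parse_file("https://example.com/document.pdf")
puts(markdown || "parsing failed and ignore_errors returned nil")
```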