llamaparserb 0.2.3 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -6
- data/README.md +9 -1
- data/lib/llamaparserb/version.rb +1 -1
- data/lib/llamaparserb.rb +47 -20
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: '045959dc406ac1ba5ce9db3cde8bdfc7e0bbccf83601027a0847b6693220af8f'
|
4
|
+
data.tar.gz: a56317601d9feba955aa6d8c08586ac495673b3d506ed0b332c43aca59a0bacf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 585bd687b193ffedceccf87e3a95550a1053e31d6d14da430f603109e9be74aa4aaf06d901afda2a0aaa9cdf87be93a08420cbc88b876630ce7d865c2e1c08db
|
7
|
+
data.tar.gz: 2d0616205409a56943d31014ca01c910bd31d6fb40a0c98fcbb02e6ba946d92a22925ba77ba05cfeae10ae6af0f3e7753b8dbcefed66739b97663808862d1892
|
data/CHANGELOG.md
CHANGED
@@ -5,15 +5,25 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
5
5
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
6
6
|
|
7
7
|
## [Unreleased]
|
8
|
+
### Added
|
9
|
+
- Add support for all supported optional llamaparse parameters when parsing files from URLs
|
10
|
+
|
11
|
+
## [0.3.0] - 2024-11-28
|
12
|
+
### Added
|
13
|
+
- Add support for parsing files from URLs
|
8
14
|
|
9
15
|
## [0.2.3] - 2024-11-28
|
10
16
|
### Added
|
11
|
-
- Add support for all supported optional
|
17
|
+
- Add support for all supported optional llamaparse parameters to `parse_file`
|
18
|
+
|
19
|
+
[0.2.3]: https://github.com/horizing/llamaparserb/releases/tag/v0.2.3
|
12
20
|
|
13
21
|
## [0.2.2] - 2024-11-28
|
14
22
|
### Fixed
|
15
23
|
- Fix issue with handling file path
|
16
24
|
|
25
|
+
[0.2.2]: https://github.com/horizing/llamaparserb/releases/tag/v0.2.2
|
26
|
+
|
17
27
|
## [0.2.1] - 2024-11-28
|
18
28
|
### Fixed
|
19
29
|
- Fix parse_file to handle files that are not on the local filesystem
|
@@ -29,8 +39,4 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
29
39
|
|
30
40
|
## [0.1.0] - 2024-11-27
|
31
41
|
### Added
|
32
|
-
- Initial release
|
33
|
-
|
34
|
-
[Unreleased]: https://github.com/horizing/llamaparserb/compare/v0.1.0...HEAD
|
35
|
-
[0.1.1]: https://github.com/horizing/llamaparserb/releases/tag/v0.1.1...v0.1.0
|
36
|
-
[0.1.0]: https://github.com/horizing/llamaparserb/releases/tag/v0.1.0
|
42
|
+
- Initial release
|
data/README.md
CHANGED
@@ -43,11 +43,14 @@ text = client.parse_file(file_content, 'pdf')
|
|
43
43
|
# Parse a file to markdown
|
44
44
|
client = Llamaparserb::Client.new(ENV['LLAMA_CLOUD_API_KEY'], result_type: "markdown")
|
45
45
|
markdown = client.parse_file('path/to/document.pdf')
|
46
|
+
|
47
|
+
# Parse a file from a URL
|
48
|
+
markdown = client.parse_file('https://example.com/document.pdf')
|
46
49
|
```
|
47
50
|
|
48
51
|
### File Input Options
|
49
52
|
|
50
|
-
The `parse_file` method accepts two types of inputs:
|
53
|
+
The `parse_file` method accepts three types of inputs:
|
51
54
|
|
52
55
|
1. File path (String):
|
53
56
|
```ruby
|
@@ -69,6 +72,11 @@ temp_file = Tempfile.new(['document', '.pdf'])
|
|
69
72
|
client.parse_file(temp_file, 'pdf')
|
70
73
|
```
|
71
74
|
|
75
|
+
3. URL (String):
|
76
|
+
```ruby
|
77
|
+
client.parse_file('https://example.com/document.pdf')
|
78
|
+
```
|
79
|
+
|
72
80
|
### Advanced Options
|
73
81
|
|
74
82
|
```ruby
|
data/lib/llamaparserb/version.rb
CHANGED
data/lib/llamaparserb.rb
CHANGED
@@ -51,6 +51,9 @@ module Llamaparserb
|
|
51
51
|
elsif File.exist?(file_input)
|
52
52
|
job_id = create_job_from_path(file_input)
|
53
53
|
log "Started parsing file under job_id #{job_id}", :info
|
54
|
+
elsif URI::DEFAULT_PARSER.make_regexp.match?(file_input)
|
55
|
+
job_id = create_job_from_url(file_input)
|
56
|
+
log "Started parsing URL under job_id #{job_id}", :info
|
54
57
|
else
|
55
58
|
raise Error, "file_type parameter is required for binary string input"
|
56
59
|
end
|
@@ -197,7 +200,7 @@ module Llamaparserb
|
|
197
200
|
def build_connection
|
198
201
|
Faraday.new(url: base_url) do |f|
|
199
202
|
f.request :multipart
|
200
|
-
f.request :
|
203
|
+
f.request :url_encoded
|
201
204
|
f.response :json
|
202
205
|
f.response :raise_error
|
203
206
|
f.adapter Faraday.default_adapter
|
@@ -234,7 +237,13 @@ module Llamaparserb
|
|
234
237
|
temp_file,
|
235
238
|
detect_content_type(temp_file.path)
|
236
239
|
)
|
237
|
-
|
240
|
+
|
241
|
+
response = @connection.post("upload") do |req|
|
242
|
+
req.headers["Authorization"] = "Bearer #{api_key}"
|
243
|
+
req.body = upload_params(file)
|
244
|
+
end
|
245
|
+
|
246
|
+
response.body["id"]
|
238
247
|
ensure
|
239
248
|
temp_file&.close
|
240
249
|
temp_file&.unlink
|
@@ -249,9 +258,8 @@ module Llamaparserb
|
|
249
258
|
response.body["id"]
|
250
259
|
end
|
251
260
|
|
252
|
-
def upload_params(file)
|
261
|
+
def upload_params(file = nil, url = nil)
|
253
262
|
params = {
|
254
|
-
file: file,
|
255
263
|
language: @options[:language].to_s,
|
256
264
|
parsing_instruction: @options[:parsing_instruction],
|
257
265
|
invalidate_cache: @options[:invalidate_cache],
|
@@ -261,30 +269,35 @@ module Llamaparserb
|
|
261
269
|
premium_mode: @options[:premium_mode],
|
262
270
|
continuous_mode: @options[:continuous_mode],
|
263
271
|
do_not_unroll_columns: @options[:do_not_unroll_columns],
|
272
|
+
page_separator: @options[:page_separator],
|
273
|
+
page_prefix: @options[:page_prefix],
|
274
|
+
page_suffix: @options[:page_suffix],
|
275
|
+
target_pages: @options[:target_pages],
|
276
|
+
bounding_box: @options[:bounding_box],
|
277
|
+
disable_ocr: @options[:disable_ocr],
|
278
|
+
take_screenshot: @options[:take_screenshot],
|
264
279
|
gpt4o_mode: @options[:gpt4o_mode],
|
265
280
|
gpt4o_api_key: @options[:gpt4o_api_key],
|
266
|
-
vendor_multimodal_api_key: @options[:vendor_multimodal_api_key],
|
267
|
-
use_vendor_multimodal_model: @options[:use_vendor_multimodal_model],
|
268
|
-
vendor_multimodal_model_name: @options[:vendor_multimodal_model_name],
|
269
|
-
take_screenshot: @options[:take_screenshot],
|
270
|
-
disable_ocr: @options[:disable_ocr],
|
271
281
|
guess_xlsx_sheet_names: @options[:guess_xlsx_sheet_names],
|
272
282
|
is_formatting_instruction: @options[:is_formatting_instruction],
|
273
283
|
annotate_links: @options[:annotate_links],
|
284
|
+
vendor_multimodal_api_key: @options[:vendor_multimodal_api_key],
|
285
|
+
use_vendor_multimodal_model: @options[:use_vendor_multimodal_model],
|
286
|
+
vendor_multimodal_model_name: @options[:vendor_multimodal_model_name],
|
287
|
+
webhook_url: @options[:webhook_url],
|
288
|
+
http_proxy: @options[:http_proxy],
|
289
|
+
azure_openai_deployment_name: @options[:azure_openai_deployment_name],
|
290
|
+
azure_openai_endpoint: @options[:azure_openai_endpoint],
|
291
|
+
azure_openai_api_version: @options[:azure_openai_api_version],
|
292
|
+
azure_openai_key: @options[:azure_openai_key],
|
274
293
|
from_ruby_package: true
|
275
294
|
}
|
276
295
|
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
params[:webhook_url] = @options[:webhook_url] if @options[:webhook_url]
|
283
|
-
params[:azure_openai_deployment_name] = @options[:azure_openai_deployment_name] if @options[:azure_openai_deployment_name]
|
284
|
-
params[:azure_openai_endpoint] = @options[:azure_openai_endpoint] if @options[:azure_openai_endpoint]
|
285
|
-
params[:azure_openai_api_version] = @options[:azure_openai_api_version] if @options[:azure_openai_api_version]
|
286
|
-
params[:azure_openai_key] = @options[:azure_openai_key] if @options[:azure_openai_key]
|
287
|
-
params[:http_proxy] = @options[:http_proxy] if @options[:http_proxy]
|
296
|
+
if url
|
297
|
+
params[:input_url] = url.to_s
|
298
|
+
elsif file
|
299
|
+
params[:file] = file
|
300
|
+
end
|
288
301
|
|
289
302
|
params.compact
|
290
303
|
end
|
@@ -335,5 +348,19 @@ module Llamaparserb
|
|
335
348
|
raise Error, "Unsupported file type: #{extension}. Supported types: #{SUPPORTED_FILE_TYPES.join(", ")}"
|
336
349
|
end
|
337
350
|
end
|
351
|
+
|
352
|
+
def create_job_from_url(url)
|
353
|
+
log "Creating job from URL: #{url}", :debug
|
354
|
+
|
355
|
+
response = @connection.post("upload") do |req|
|
356
|
+
req.headers["Authorization"] = "Bearer #{api_key}"
|
357
|
+
req.headers["Accept"] = "application/json"
|
358
|
+
req.options.timeout = 30
|
359
|
+
req.body = upload_params(nil, url)
|
360
|
+
end
|
361
|
+
|
362
|
+
log "Response: #{response.body.inspect}", :debug
|
363
|
+
response.body["id"]
|
364
|
+
end
|
338
365
|
end
|
339
366
|
end
|