llamaparserb 0.2.3 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -5
- data/README.md +9 -1
- data/lib/llamaparserb/version.rb +1 -1
- data/lib/llamaparserb.rb +33 -4
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9c49dc1624b1b955cad8032696308fc81357533113cf429f53fe063d5db0cab2
|
4
|
+
data.tar.gz: 2eee40a9054fe05d02a094828bb47a6cc3a5b65ca6fcb5864b9648cff5f0b418
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 50ed45e0d813d776b79fce2e87593cdc88063158045c62ba76e1bc0e353fb1b27b2b03b87dc4253443b4b2a355fec179708d04e1920dc0f47730c271e3ff81cc
|
7
|
+
data.tar.gz: 9f86c50b3f5c4bd987c9bab987c5b139ce1dd998577d04ea3fba53d64b430d3cdcc0af3d880d73ebea1914341bc185fb78469f78d9521098f06510c7024b8746
|
data/CHANGELOG.md
CHANGED
@@ -6,14 +6,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
6
6
|
|
7
7
|
## [Unreleased]
|
8
8
|
|
9
|
+
## [0.3.0] - 2024-11-28
|
10
|
+
### Added
|
11
|
+
- Add support for parsing files from URLs
|
12
|
+
|
9
13
|
## [0.2.3] - 2024-11-28
|
10
14
|
### Added
|
11
15
|
- Add support for all supported optional llamaparse parameters to `parse_file`
|
12
16
|
|
17
|
+
[0.2.3]: https://github.com/horizing/llamaparserb/releases/tag/v0.2.3
|
18
|
+
|
13
19
|
## [0.2.2] - 2024-11-28
|
14
20
|
### Fixed
|
15
21
|
- Fix issue with handling file path
|
16
22
|
|
23
|
+
[0.2.2]: https://github.com/horizing/llamaparserb/releases/tag/v0.2.2
|
24
|
+
|
17
25
|
## [0.2.1] - 2024-11-28
|
18
26
|
### Fixed
|
19
27
|
- Fix parse_file to handle files that are not on the local filesystem
|
@@ -29,8 +37,4 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
29
37
|
|
30
38
|
## [0.1.0] - 2024-11-27
|
31
39
|
### Added
|
32
|
-
- Initial release
|
33
|
-
|
34
|
-
[Unreleased]: https://github.com/horizing/llamaparserb/compare/v0.1.0...HEAD
|
35
|
-
[0.1.1]: https://github.com/horizing/llamaparserb/releases/tag/v0.1.1...v0.1.0
|
36
|
-
[0.1.0]: https://github.com/horizing/llamaparserb/releases/tag/v0.1.0
|
40
|
+
- Initial release
|
data/README.md
CHANGED
@@ -43,11 +43,14 @@ text = client.parse_file(file_content, 'pdf')
|
|
43
43
|
# Parse a file to markdown
|
44
44
|
client = Llamaparserb::Client.new(ENV['LLAMA_CLOUD_API_KEY'], result_type: "markdown")
|
45
45
|
markdown = client.parse_file('path/to/document.pdf')
|
46
|
+
|
47
|
+
# Parse a file from a URL
|
48
|
+
markdown = client.parse_file('https://example.com/document.pdf')
|
46
49
|
```
|
47
50
|
|
48
51
|
### File Input Options
|
49
52
|
|
50
|
-
The `parse_file` method accepts
|
53
|
+
The `parse_file` method accepts three types of inputs:
|
51
54
|
|
52
55
|
1. File path (String):
|
53
56
|
```ruby
|
@@ -69,6 +72,11 @@ temp_file = Tempfile.new(['document', '.pdf'])
|
|
69
72
|
client.parse_file(temp_file, 'pdf')
|
70
73
|
```
|
71
74
|
|
75
|
+
3. URL (String):
|
76
|
+
```ruby
|
77
|
+
client.parse_file('https://example.com/document.pdf')
|
78
|
+
```
|
79
|
+
|
72
80
|
### Advanced Options
|
73
81
|
|
74
82
|
```ruby
|
data/lib/llamaparserb/version.rb
CHANGED
data/lib/llamaparserb.rb
CHANGED
@@ -51,6 +51,9 @@ module Llamaparserb
|
|
51
51
|
elsif File.exist?(file_input)
|
52
52
|
job_id = create_job_from_path(file_input)
|
53
53
|
log "Started parsing file under job_id #{job_id}", :info
|
54
|
+
elsif URI::DEFAULT_PARSER.make_regexp.match?(file_input)
|
55
|
+
job_id = create_job_from_url(file_input)
|
56
|
+
log "Started parsing URL under job_id #{job_id}", :info
|
54
57
|
else
|
55
58
|
raise Error, "file_type parameter is required for binary string input"
|
56
59
|
end
|
@@ -197,7 +200,7 @@ module Llamaparserb
|
|
197
200
|
def build_connection
|
198
201
|
Faraday.new(url: base_url) do |f|
|
199
202
|
f.request :multipart
|
200
|
-
f.request :
|
203
|
+
f.request :url_encoded
|
201
204
|
f.response :json
|
202
205
|
f.response :raise_error
|
203
206
|
f.adapter Faraday.default_adapter
|
@@ -234,7 +237,13 @@ module Llamaparserb
|
|
234
237
|
temp_file,
|
235
238
|
detect_content_type(temp_file.path)
|
236
239
|
)
|
237
|
-
|
240
|
+
|
241
|
+
response = @connection.post("upload") do |req|
|
242
|
+
req.headers["Authorization"] = "Bearer #{api_key}"
|
243
|
+
req.body = {file: file}
|
244
|
+
end
|
245
|
+
|
246
|
+
response.body["id"]
|
238
247
|
ensure
|
239
248
|
temp_file&.close
|
240
249
|
temp_file&.unlink
|
@@ -249,9 +258,8 @@ module Llamaparserb
|
|
249
258
|
response.body["id"]
|
250
259
|
end
|
251
260
|
|
252
|
-
def upload_params(file)
|
261
|
+
def upload_params(file = nil, url = nil)
|
253
262
|
params = {
|
254
|
-
file: file,
|
255
263
|
language: @options[:language].to_s,
|
256
264
|
parsing_instruction: @options[:parsing_instruction],
|
257
265
|
invalidate_cache: @options[:invalidate_cache],
|
@@ -286,6 +294,12 @@ module Llamaparserb
|
|
286
294
|
params[:azure_openai_key] = @options[:azure_openai_key] if @options[:azure_openai_key]
|
287
295
|
params[:http_proxy] = @options[:http_proxy] if @options[:http_proxy]
|
288
296
|
|
297
|
+
if url
|
298
|
+
params[:input_url] = url.to_s
|
299
|
+
elsif file
|
300
|
+
params[:file] = file
|
301
|
+
end
|
302
|
+
|
289
303
|
params.compact
|
290
304
|
end
|
291
305
|
|
@@ -335,5 +349,20 @@ module Llamaparserb
|
|
335
349
|
raise Error, "Unsupported file type: #{extension}. Supported types: #{SUPPORTED_FILE_TYPES.join(", ")}"
|
336
350
|
end
|
337
351
|
end
|
352
|
+
|
353
|
+
def create_job_from_url(url)
|
354
|
+
log "Creating job from URL: #{url}", :debug
|
355
|
+
|
356
|
+
response = @connection.post("upload") do |req|
|
357
|
+
req.headers["Authorization"] = "Bearer #{api_key}"
|
358
|
+
req.headers["Accept"] = "application/json"
|
359
|
+
# Create a simple form data request
|
360
|
+
req.options.timeout = 30 # Optional: add timeout
|
361
|
+
req.body = {"input_url" => url.to_s}
|
362
|
+
end
|
363
|
+
|
364
|
+
log "Response: #{response.body.inspect}", :debug
|
365
|
+
response.body["id"]
|
366
|
+
end
|
338
367
|
end
|
339
368
|
end
|