llamaparserb 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +30 -1
- data/lib/llamaparserb/version.rb +1 -1
- data/lib/llamaparserb.rb +43 -9
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 14e4e5deea50bc3f5cb0e32c3c3029f6b26fda4cacf91be0f96e9f677f92d1e5
|
4
|
+
data.tar.gz: e3e6cb569456d1c649be22ad3e5af65e30484fce50582d7493f5e22b2ac03f61
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 72121ad4b70f95ddd2bfe23f129d5c9ee634543543ba43e4faa437da67bf3cbdf8bc1ea828aeb58d43073bed46c28d4010a6865b5bc006c32ccfc30a099d1d65
|
7
|
+
data.tar.gz: 5bea9e367b71d38fe8f6a1a80be0b4fd190e3e56b780e369d33c3319dfd91b483283f4b43ab8fc484468ab611c5e3efc639d8302ba611ccfb71258c459fdb783
|
data/CHANGELOG.md
CHANGED
@@ -6,6 +6,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
6
6
|
|
7
7
|
## [Unreleased]
|
8
8
|
|
9
|
+
## [0.2.0] - 2024-11-28
|
10
|
+
### Changed
|
11
|
+
- Allow passing in a string or an IO object to `parse_file`
|
12
|
+
- Add support for file type parameter to `parse_file`
|
13
|
+
|
9
14
|
## [0.1.1] - 2024-11-28
|
10
15
|
### Changed
|
11
16
|
- Move gem ownership to Horizing
|
data/README.md
CHANGED
@@ -32,14 +32,43 @@ require 'llamaparserb'
|
|
32
32
|
# Initialize client with API key
|
33
33
|
client = Llamaparserb::Client.new(ENV['LLAMA_CLOUD_API_KEY'])
|
34
34
|
|
35
|
-
# Parse a file to text
|
35
|
+
# Parse a file from disk (to text by default)
|
36
36
|
text = client.parse_file('path/to/document.pdf')
|
37
37
|
|
38
|
+
# Parse an in-memory file (requires file type)
|
39
|
+
require 'open-uri'
|
40
|
+
file_content = URI.open('https://example.com/document.pdf')
|
41
|
+
text = client.parse_file(file_content, 'pdf')
|
42
|
+
|
38
43
|
# Parse a file to markdown
|
39
44
|
client = Llamaparserb::Client.new(ENV['LLAMA_CLOUD_API_KEY'], result_type: "markdown")
|
40
45
|
markdown = client.parse_file('path/to/document.pdf')
|
41
46
|
```
|
42
47
|
|
48
|
+
### File Input Options
|
49
|
+
|
50
|
+
The `parse_file` method accepts two types of inputs:
|
51
|
+
|
52
|
+
1. File path (String):
|
53
|
+
```ruby
|
54
|
+
client.parse_file('path/to/document.pdf')
|
55
|
+
```
|
56
|
+
|
57
|
+
2. IO object (requires file type parameter):
|
58
|
+
```ruby
|
59
|
+
# From a URL
|
60
|
+
file_content = URI.open('https://example.com/document.pdf')
|
61
|
+
client.parse_file(file_content, 'pdf')
|
62
|
+
|
63
|
+
# From memory
|
64
|
+
io = StringIO.new(file_content.read)
|
65
|
+
client.parse_file(io, 'pdf')
|
66
|
+
|
67
|
+
# From a Tempfile
|
68
|
+
temp_file = Tempfile.new(['document', '.pdf'])
|
69
|
+
client.parse_file(temp_file, 'pdf')
|
70
|
+
```
|
71
|
+
|
43
72
|
### Advanced Options
|
44
73
|
|
45
74
|
```ruby
|
data/lib/llamaparserb/version.rb
CHANGED
data/lib/llamaparserb.rb
CHANGED
@@ -8,6 +8,7 @@ require "mime/types"
|
|
8
8
|
require "uri"
|
9
9
|
require "async"
|
10
10
|
require "logger"
|
11
|
+
require "tempfile"
|
11
12
|
|
12
13
|
module Llamaparserb
|
13
14
|
class Error < StandardError; end
|
@@ -41,17 +42,27 @@ module Llamaparserb
|
|
41
42
|
@connection = build_connection
|
42
43
|
end
|
43
44
|
|
44
|
-
def parse_file(file_path)
|
45
|
-
|
46
|
-
|
45
|
+
def parse_file(file_input, file_type = nil)
|
46
|
+
case file_input
|
47
|
+
when String
|
48
|
+
# Treat as file path
|
49
|
+
job_id = create_job_from_path(file_input)
|
50
|
+
log "Started parsing file under job_id #{job_id}", :info
|
51
|
+
when IO, StringIO, Tempfile
|
52
|
+
# Treat as file object
|
53
|
+
raise Error, "file_type parameter is required for IO objects" unless file_type
|
54
|
+
job_id = create_job_from_io(file_input, file_type)
|
55
|
+
log "Started parsing in-memory file under job_id #{job_id}", :info
|
56
|
+
else
|
57
|
+
raise Error, "Invalid input type. Expected String (file path) or IO object, got #{file_input.class}"
|
58
|
+
end
|
47
59
|
|
48
60
|
wait_for_completion(job_id)
|
49
|
-
|
50
61
|
result = get_result(job_id)
|
51
62
|
log "Successfully retrieved result", :info
|
52
63
|
result
|
53
64
|
rescue => e
|
54
|
-
handle_error(e, file_path)
|
65
|
+
handle_error(e, file_input)
|
55
66
|
end
|
56
67
|
|
57
68
|
private
|
@@ -144,9 +155,9 @@ module Llamaparserb
|
|
144
155
|
end
|
145
156
|
end
|
146
157
|
|
147
|
-
def handle_error(error, file_path)
|
158
|
+
def handle_error(error, file_input)
|
148
159
|
if @options[:ignore_errors]
|
149
|
-
log "Error while parsing file '#{file_path}'", :error
|
160
|
+
log "Error while parsing file '#{file_input}'", :error
|
150
161
|
nil
|
151
162
|
else
|
152
163
|
raise error
|
@@ -163,14 +174,37 @@ module Llamaparserb
|
|
163
174
|
end
|
164
175
|
end
|
165
176
|
|
166
|
-
def create_job(file_path)
|
177
|
+
def create_job_from_path(file_path)
|
167
178
|
validate_file_type!(file_path)
|
168
|
-
|
169
179
|
file = Faraday::Multipart::FilePart.new(
|
170
180
|
file_path,
|
171
181
|
detect_content_type(file_path)
|
172
182
|
)
|
183
|
+
create_job(file)
|
184
|
+
end
|
185
|
+
|
186
|
+
def create_job_from_io(io, file_type)
|
187
|
+
# Ensure file_type starts with a dot
|
188
|
+
file_type = ".#{file_type}" unless file_type.start_with?(".")
|
189
|
+
validate_file_type!(file_type)
|
190
|
+
|
191
|
+
temp_file = Tempfile.new(["upload", file_type])
|
192
|
+
temp_file.binmode
|
193
|
+
io.rewind
|
194
|
+
temp_file.write(io.read)
|
195
|
+
temp_file.rewind
|
196
|
+
|
197
|
+
file = Faraday::Multipart::FilePart.new(
|
198
|
+
temp_file,
|
199
|
+
detect_content_type(temp_file.path)
|
200
|
+
)
|
201
|
+
create_job(file)
|
202
|
+
ensure
|
203
|
+
temp_file&.close
|
204
|
+
temp_file&.unlink
|
205
|
+
end
|
173
206
|
|
207
|
+
def create_job(file)
|
174
208
|
response = @connection.post("upload") do |req|
|
175
209
|
req.headers["Authorization"] = "Bearer #{api_key}"
|
176
210
|
req.body = upload_params(file)
|