llamaparserb 0.1.1 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +30 -1
- data/lib/llamaparserb/version.rb +1 -1
- data/lib/llamaparserb.rb +76 -14
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7bc729e8371ed2f748eaef9a92b82380c2e7d674caf6c4fe1b17e7e53d8ea62a
|
4
|
+
data.tar.gz: 8f510a5be8efc2877b617bd7e4682a4cb0e92c4fd794e6e6ef036481f3d57284
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5e6c3d6df9c69da63cf631a296c9618c074fc127c48cc23b398237dc015f4cb4d2bf4d2f4f50b7ed380ac053ff98c19ea9fc3530d890db34cf090e2084fb0821
|
7
|
+
data.tar.gz: 04b10a441f82670dc58d873d586cdd043b94f1a477b2d72b6494128d9ab76cd2f8656b1c5db0ad61487d4efe51dbc42e63b047c8101f6f74593d29e098965b52
|
data/CHANGELOG.md
CHANGED
@@ -6,6 +6,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
6
6
|
|
7
7
|
## [Unreleased]
|
8
8
|
|
9
|
+
## [0.2.1] - 2024-11-28
|
10
|
+
### Fixed
|
11
|
+
- Fix parse_file to handle files that are not on the local filesystem
|
12
|
+
|
13
|
+
## [0.2.0] - 2024-11-28
|
14
|
+
### Changed
|
15
|
+
- Allow passing in a string or an IO object to `parse_file`
|
16
|
+
- Add support for file type parameter to `parse_file`
|
17
|
+
|
9
18
|
## [0.1.1] - 2024-11-28
|
10
19
|
### Changed
|
11
20
|
- Move gem ownership to Horizing
|
data/README.md
CHANGED
@@ -32,14 +32,43 @@ require 'llamaparserb'
|
|
32
32
|
# Initialize client with API key
|
33
33
|
client = Llamaparserb::Client.new(ENV['LLAMA_CLOUD_API_KEY'])
|
34
34
|
|
35
|
-
# Parse a file to text
|
35
|
+
# Parse a file from disk (to text by default)
|
36
36
|
text = client.parse_file('path/to/document.pdf')
|
37
37
|
|
38
|
+
# Parse an in-memory file (requires file type)
|
39
|
+
require 'open-uri'
|
40
|
+
file_content = URI.open('https://example.com/document.pdf')
|
41
|
+
text = client.parse_file(file_content, 'pdf')
|
42
|
+
|
38
43
|
# Parse a file to markdown
|
39
44
|
client = Llamaparserb::Client.new(ENV['LLAMA_CLOUD_API_KEY'], result_type: "markdown")
|
40
45
|
markdown = client.parse_file('path/to/document.pdf')
|
41
46
|
```
|
42
47
|
|
48
|
+
### File Input Options
|
49
|
+
|
50
|
+
The `parse_file` method accepts two types of inputs:
|
51
|
+
|
52
|
+
1. File path (String):
|
53
|
+
```ruby
|
54
|
+
client.parse_file('path/to/document.pdf')
|
55
|
+
```
|
56
|
+
|
57
|
+
2. IO object (requires file type parameter):
|
58
|
+
```ruby
|
59
|
+
# From a URL
|
60
|
+
file_content = URI.open('https://example.com/document.pdf')
|
61
|
+
client.parse_file(file_content, 'pdf')
|
62
|
+
|
63
|
+
# From memory
|
64
|
+
io = StringIO.new(file_content.read)
|
65
|
+
client.parse_file(io, 'pdf')
|
66
|
+
|
67
|
+
# From a Tempfile
|
68
|
+
temp_file = Tempfile.new(['document', '.pdf'])
|
69
|
+
client.parse_file(temp_file, 'pdf')
|
70
|
+
```
|
71
|
+
|
43
72
|
### Advanced Options
|
44
73
|
|
45
74
|
```ruby
|
data/lib/llamaparserb/version.rb
CHANGED
data/lib/llamaparserb.rb
CHANGED
@@ -8,6 +8,7 @@ require "mime/types"
|
|
8
8
|
require "uri"
|
9
9
|
require "async"
|
10
10
|
require "logger"
|
11
|
+
require "tempfile"
|
11
12
|
|
12
13
|
module Llamaparserb
|
13
14
|
class Error < StandardError; end
|
@@ -41,17 +42,34 @@ module Llamaparserb
|
|
41
42
|
@connection = build_connection
|
42
43
|
end
|
43
44
|
|
44
|
-
def parse_file(
|
45
|
-
|
46
|
-
|
45
|
+
# Uploads a document, waits for the parse job to finish and returns its result.
#
# file_input may be:
#   * a String naming an existing file on disk (file_type not needed),
#   * a String holding raw file contents (file_type required), or
#   * an IO, StringIO or Tempfile (file_type required).
#
# file_type is the extension of in-memory content, e.g. "pdf".
#
# Every failure, including the Error raised for unusable input, is handed to
# handle_error and then re-raised unless the :ignore_errors option is set, in
# which case nil is returned.
def parse_file(file_input, file_type = nil)
  case file_input
  when String
    if file_type
      job_id = create_job_from_io(file_input, file_type)
      log "Started parsing binary data under job_id #{job_id}", :info
    elsif !file_input.b.include?("\0") && File.exist?(file_input)
      # The NUL check comes first because File.exist? raises ArgumentError
      # ("string contains null byte") for binary payloads, which would hide
      # the more helpful error below.
      job_id = create_job_from_path(file_input)
      log "Started parsing file under job_id #{job_id}", :info
    else
      raise Error, "file_type parameter is required for binary string input"
    end
  when IO, StringIO, Tempfile
    raise Error, "file_type parameter is required for IO objects" unless file_type
    job_id = create_job_from_io(file_input, file_type)
    log "Started parsing in-memory file under job_id #{job_id}", :info
  else
    raise Error, "Invalid input type. Expected String (file path) or IO object, got #{file_input.class}"
  end

  wait_for_completion(job_id)
  result = get_result(job_id)
  log "Successfully retrieved result", :info
  result
rescue => e
  handle_error(e, file_input)
  raise unless @options[:ignore_errors]
  nil
end
|
56
74
|
|
57
75
|
private
|
@@ -97,15 +115,19 @@ module Llamaparserb
|
|
97
115
|
|
98
116
|
# Emits +message+ through the client's logger when the :verbose option is on.
#
# The message is stringified and transcoded to UTF-8, with any invalid or
# unconvertible bytes replaced by "?", before it is logged. +level+ selects
# the logger method; anything other than :info, :warn or :error is logged at
# debug level.
def log(message, level = :debug)
  return unless @options[:verbose]

  text = message.to_s.encode("UTF-8", invalid: :replace, undef: :replace, replace: "?")
  severity = %i[info warn error].include?(level) ? level : :debug

  logger.public_send(severity, text)
end
|
111
133
|
|
@@ -144,9 +166,15 @@ module Llamaparserb
|
|
144
166
|
end
|
145
167
|
end
|
146
168
|
|
147
|
-
def handle_error(error,
|
169
|
+
def handle_error(error, file_input)
|
148
170
|
if @options[:ignore_errors]
|
149
|
-
|
171
|
+
safe_message = if file_input.is_a?(String) && !File.exist?(file_input)
|
172
|
+
"binary data"
|
173
|
+
else
|
174
|
+
file_input.class.to_s
|
175
|
+
end
|
176
|
+
|
177
|
+
log "Error while parsing file (#{safe_message}): #{error.message}", :error
|
150
178
|
nil
|
151
179
|
else
|
152
180
|
raise error
|
@@ -163,14 +191,43 @@ module Llamaparserb
|
|
163
191
|
end
|
164
192
|
end
|
165
193
|
|
166
|
-
def
|
194
|
+
# Creates an upload job for a file that lives on the local filesystem.
#
# file_path is the path to the document. Its extension is validated with
# validate_file_type!, then the file is wrapped in a multipart FilePart and
# handed to create_job, whose return value is returned.
def create_job_from_path(file_path)
  # validate_file_type! treats any argument that starts with "." as a bare
  # extension, so a relative path such as "./doc.pdf" or "../doc.pdf" would
  # be rejected as unsupported. Pass only the lower-cased extension.
  validate_file_type!(File.extname(file_path).downcase)

  file = Faraday::Multipart::FilePart.new(
    file_path,
    detect_content_type(file_path)
  )
  create_job(file)
end
|
202
|
+
|
203
|
+
# Creates an upload job from in-memory content.
#
# io_or_string is either a String of raw file bytes or an object that
# responds to read (it is rewound first when it responds to rewind).
# file_type is the extension of the content, with or without a leading dot;
# Strings and Symbols are both accepted.
#
# The bytes are copied into a binary-mode Tempfile, which is wrapped in a
# multipart FilePart and handed to create_job. The Tempfile is always closed
# and removed afterwards. Returns whatever create_job returns.
def create_job_from_io(io_or_string, file_type)
  # to_s allows Symbols such as :pdf. The extension is lower-cased because
  # validate_file_type! lower-cases path extensions before matching.
  extension = file_type.to_s.downcase
  extension = ".#{extension}" unless extension.start_with?(".")
  validate_file_type!(extension)

  temp_file = Tempfile.new(["upload", extension])
  temp_file.binmode

  contents =
    if io_or_string.is_a?(String)
      io_or_string
    else
      io_or_string.rewind if io_or_string.respond_to?(:rewind)
      io_or_string.read
    end

  # String#b returns a binary-tagged copy, so the caller's string is never
  # re-tagged and frozen strings no longer raise FrozenError.
  temp_file.write(contents.to_s.b)
  temp_file.rewind

  file = Faraday::Multipart::FilePart.new(
    temp_file,
    detect_content_type(temp_file.path)
  )
  create_job(file)
ensure
  temp_file&.close
  temp_file&.unlink
end
|
229
|
+
|
230
|
+
def create_job(file)
|
174
231
|
response = @connection.post("upload") do |req|
|
175
232
|
req.headers["Authorization"] = "Bearer #{api_key}"
|
176
233
|
req.body = upload_params(file)
|
@@ -233,7 +290,12 @@ module Llamaparserb
|
|
233
290
|
end
|
234
291
|
|
235
292
|
def validate_file_type!(file_path)
|
236
|
-
extension =
|
293
|
+
extension = if file_path.start_with?(".")
|
294
|
+
file_path
|
295
|
+
else
|
296
|
+
File.extname(file_path).downcase
|
297
|
+
end
|
298
|
+
|
237
299
|
unless SUPPORTED_FILE_TYPES.include?(extension)
|
238
300
|
raise Error, "Unsupported file type: #{extension}. Supported types: #{SUPPORTED_FILE_TYPES.join(", ")}"
|
239
301
|
end
|