llamaparserb 0.1.1 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b0073944d6fc5235f1ee62b86656a396afaf63cdb92aecb2c65468f57767876d
4
- data.tar.gz: 1cedf4c9db42955c7e9c3baac69b8a8a78213817b505e65033c00817fdf73163
3
+ metadata.gz: 7bc729e8371ed2f748eaef9a92b82380c2e7d674caf6c4fe1b17e7e53d8ea62a
4
+ data.tar.gz: 8f510a5be8efc2877b617bd7e4682a4cb0e92c4fd794e6e6ef036481f3d57284
5
5
  SHA512:
6
- metadata.gz: fc4f606f909a118fed822fa47515c71b45daab1e26b8b059229dff9f9e90ad338be1fde4995ac0dd2a47ffe64378a6a7dda3c163c5cc598e7637f7d4126468f1
7
- data.tar.gz: 502b1ffe2170259741d42f1bae218b725e7430de5166cdb58ffa4d03536e4220be0001ed04e07f4abe6c3e3a6158b70ca42dc887217efc61e49a6d382040f35c
6
+ metadata.gz: 5e6c3d6df9c69da63cf631a296c9618c074fc127c48cc23b398237dc015f4cb4d2bf4d2f4f50b7ed380ac053ff98c19ea9fc3530d890db34cf090e2084fb0821
7
+ data.tar.gz: 04b10a441f82670dc58d873d586cdd043b94f1a477b2d72b6494128d9ab76cd2f8656b1c5db0ad61487d4efe51dbc42e63b047c8101f6f74593d29e098965b52
data/CHANGELOG.md CHANGED
@@ -6,6 +6,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
6
6
 
7
7
  ## [Unreleased]
8
8
 
9
+ ## [0.2.1] - 2024-11-28
10
+ ### Fixed
11
+ - Fix parse_file to handle files that are not on the local filesystem
12
+
13
+ ## [0.2.0] - 2024-11-28
14
+ ### Changed
15
+ - Allow passing in a string or an IO object to `parse_file`
16
+ - Add support for file type parameter to `parse_file`
17
+
9
18
  ## [0.1.1] - 2024-11-28
10
19
  ### Changed
11
20
  - Move gem ownership to Horizing
data/README.md CHANGED
@@ -32,14 +32,43 @@ require 'llamaparserb'
32
32
  # Initialize client with API key
33
33
  client = Llamaparserb::Client.new(ENV['LLAMA_CLOUD_API_KEY'])
34
34
 
35
- # Parse a file to text (default)
35
+ # Parse a file from disk (to text by default)
36
36
  text = client.parse_file('path/to/document.pdf')
37
37
 
38
+ # Parse an in-memory file (requires file type)
39
+ require 'open-uri'
40
+ file_content = URI.open('https://example.com/document.pdf')
41
+ text = client.parse_file(file_content, 'pdf')
42
+
38
43
  # Parse a file to markdown
39
44
  client = Llamaparserb::Client.new(ENV['LLAMA_CLOUD_API_KEY'], result_type: "markdown")
40
45
  markdown = client.parse_file('path/to/document.pdf')
41
46
  ```
42
47
 
48
+ ### File Input Options
49
+
50
+ The `parse_file` method accepts two types of inputs:
51
+
52
+ 1. File path (String):
53
+ ```ruby
54
+ client.parse_file('path/to/document.pdf')
55
+ ```
56
+
57
+ 2. IO object (requires file type parameter):
58
+ ```ruby
59
+ # From a URL
60
+ file_content = URI.open('https://example.com/document.pdf')
61
+ client.parse_file(file_content, 'pdf')
62
+
63
+ # From memory
64
+ io = StringIO.new(file_content)
65
+ client.parse_file(io, 'pdf')
66
+
67
+ # From a Tempfile
68
+ temp_file = Tempfile.new(['document', '.pdf'])
69
+ client.parse_file(temp_file, 'pdf')
70
+ ```
71
+
43
72
  ### Advanced Options
44
73
 
45
74
  ```ruby
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Llamaparserb
4
- VERSION = "0.1.1"
4
+ VERSION = "0.2.1"
5
5
  end
data/lib/llamaparserb.rb CHANGED
@@ -8,6 +8,7 @@ require "mime/types"
8
8
  require "uri"
9
9
  require "async"
10
10
  require "logger"
11
+ require "tempfile"
11
12
 
12
13
  module Llamaparserb
13
14
  class Error < StandardError; end
@@ -41,17 +42,34 @@ module Llamaparserb
41
42
  @connection = build_connection
42
43
  end
43
44
 
44
- def parse_file(file_path)
45
- job_id = create_job(file_path)
46
- log "Started parsing file under job_id #{job_id}", :info
45
+ def parse_file(file_input, file_type = nil)
46
+ case file_input
47
+ when String
48
+ if file_type
49
+ job_id = create_job_from_io(file_input, file_type)
50
+ log "Started parsing binary data under job_id #{job_id}", :info
51
+ elsif File.exist?(file_input)
52
+ job_id = create_job_from_path(file_input)
53
+ log "Started parsing file under job_id #{job_id}", :info
54
+ else
55
+ raise Error, "file_type parameter is required for binary string input"
56
+ end
57
+ when IO, StringIO, Tempfile
58
+ raise Error, "file_type parameter is required for IO objects" unless file_type
59
+ job_id = create_job_from_io(file_input, file_type)
60
+ log "Started parsing in-memory file under job_id #{job_id}", :info
61
+ else
62
+ raise Error, "Invalid input type. Expected String (file path) or IO object, got #{file_input.class}"
63
+ end
47
64
 
48
65
  wait_for_completion(job_id)
49
-
50
66
  result = get_result(job_id)
51
67
  log "Successfully retrieved result", :info
52
68
  result
53
69
  rescue => e
54
- handle_error(e, file_path)
70
+ handle_error(e, file_input)
71
+ raise unless @options[:ignore_errors]
72
+ nil
55
73
  end
56
74
 
57
75
  private
@@ -97,15 +115,19 @@ module Llamaparserb
97
115
 
98
116
  def log(message, level = :debug)
99
117
  return unless @options[:verbose]
118
+
119
+ # Convert message to string and force UTF-8 encoding, replacing invalid characters
120
+ safe_message = message.to_s.encode("UTF-8", invalid: :replace, undef: :replace, replace: "?")
121
+
100
122
  case level
101
123
  when :info
102
- logger.info(message)
124
+ logger.info(safe_message)
103
125
  when :warn
104
- logger.warn(message)
126
+ logger.warn(safe_message)
105
127
  when :error
106
- logger.error(message)
128
+ logger.error(safe_message)
107
129
  else
108
- logger.debug(message)
130
+ logger.debug(safe_message)
109
131
  end
110
132
  end
111
133
 
@@ -144,9 +166,15 @@ module Llamaparserb
144
166
  end
145
167
  end
146
168
 
147
- def handle_error(error, file_path)
169
+ def handle_error(error, file_input)
148
170
  if @options[:ignore_errors]
149
- log "Error while parsing file '#{file_path}': #{error.message}", :error
171
+ safe_message = if file_input.is_a?(String) && !File.exist?(file_input)
172
+ "binary data"
173
+ else
174
+ file_input.class.to_s
175
+ end
176
+
177
+ log "Error while parsing file (#{safe_message}): #{error.message}", :error
150
178
  nil
151
179
  else
152
180
  raise error
@@ -163,14 +191,43 @@ module Llamaparserb
163
191
  end
164
192
  end
165
193
 
166
- def create_job(file_path)
194
+ def create_job_from_path(file_path)
167
195
  validate_file_type!(file_path)
168
-
169
196
  file = Faraday::Multipart::FilePart.new(
170
197
  file_path,
171
198
  detect_content_type(file_path)
172
199
  )
200
+ create_job(file)
201
+ end
202
+
203
+ def create_job_from_io(io_or_string, file_type)
204
+ file_type = ".#{file_type}" unless file_type.start_with?(".")
205
+ validate_file_type!(file_type)
173
206
 
207
+ temp_file = Tempfile.new(["upload", file_type])
208
+ temp_file.binmode
209
+
210
+ case io_or_string
211
+ when String
212
+ temp_file.write(io_or_string.force_encoding("ASCII-8BIT"))
213
+ else
214
+ io_or_string.rewind if io_or_string.respond_to?(:rewind)
215
+ temp_file.write(io_or_string.read.force_encoding("ASCII-8BIT"))
216
+ end
217
+
218
+ temp_file.rewind
219
+
220
+ file = Faraday::Multipart::FilePart.new(
221
+ temp_file,
222
+ detect_content_type(temp_file.path)
223
+ )
224
+ create_job(file)
225
+ ensure
226
+ temp_file&.close
227
+ temp_file&.unlink
228
+ end
229
+
230
+ def create_job(file)
174
231
  response = @connection.post("upload") do |req|
175
232
  req.headers["Authorization"] = "Bearer #{api_key}"
176
233
  req.body = upload_params(file)
@@ -233,7 +290,12 @@ module Llamaparserb
233
290
  end
234
291
 
235
292
  def validate_file_type!(file_path)
236
- extension = File.extname(file_path).downcase
293
+ extension = if file_path.start_with?(".")
294
+ file_path
295
+ else
296
+ File.extname(file_path).downcase
297
+ end
298
+
237
299
  unless SUPPORTED_FILE_TYPES.include?(extension)
238
300
  raise Error, "Unsupported file type: #{extension}. Supported types: #{SUPPORTED_FILE_TYPES.join(", ")}"
239
301
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: llamaparserb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Heidar Bernhardsson