llamaparserb 0.1.1 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b0073944d6fc5235f1ee62b86656a396afaf63cdb92aecb2c65468f57767876d
4
- data.tar.gz: 1cedf4c9db42955c7e9c3baac69b8a8a78213817b505e65033c00817fdf73163
3
+ metadata.gz: 7bc729e8371ed2f748eaef9a92b82380c2e7d674caf6c4fe1b17e7e53d8ea62a
4
+ data.tar.gz: 8f510a5be8efc2877b617bd7e4682a4cb0e92c4fd794e6e6ef036481f3d57284
5
5
  SHA512:
6
- metadata.gz: fc4f606f909a118fed822fa47515c71b45daab1e26b8b059229dff9f9e90ad338be1fde4995ac0dd2a47ffe64378a6a7dda3c163c5cc598e7637f7d4126468f1
7
- data.tar.gz: 502b1ffe2170259741d42f1bae218b725e7430de5166cdb58ffa4d03536e4220be0001ed04e07f4abe6c3e3a6158b70ca42dc887217efc61e49a6d382040f35c
6
+ metadata.gz: 5e6c3d6df9c69da63cf631a296c9618c074fc127c48cc23b398237dc015f4cb4d2bf4d2f4f50b7ed380ac053ff98c19ea9fc3530d890db34cf090e2084fb0821
7
+ data.tar.gz: 04b10a441f82670dc58d873d586cdd043b94f1a477b2d72b6494128d9ab76cd2f8656b1c5db0ad61487d4efe51dbc42e63b047c8101f6f74593d29e098965b52
data/CHANGELOG.md CHANGED
@@ -6,6 +6,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
6
6
 
7
7
  ## [Unreleased]
8
8
 
9
+ ## [0.2.1] - 2024-11-28
10
+ ### Fixed
11
+ - Fix `parse_file` to handle files that are not on the local filesystem
12
+
13
+ ## [0.2.0] - 2024-11-28
14
+ ### Changed
15
+ - Allow passing in a string or an IO object to `parse_file`
16
+ - Add support for file type parameter to `parse_file`
17
+
9
18
  ## [0.1.1] - 2024-11-28
10
19
  ### Changed
11
20
  - Move gem ownership to Horizing
data/README.md CHANGED
@@ -32,14 +32,43 @@ require 'llamaparserb'
32
32
  # Initialize client with API key
33
33
  client = Llamaparserb::Client.new(ENV['LLAMA_CLOUD_API_KEY'])
34
34
 
35
- # Parse a file to text (default)
35
+ # Parse a file from disk (to text by default)
36
36
  text = client.parse_file('path/to/document.pdf')
37
37
 
38
+ # Parse an in-memory file (requires file type)
39
+ require 'open-uri'
40
+ file_content = URI.open('https://example.com/document.pdf')
41
+ text = client.parse_file(file_content, 'pdf')
42
+
38
43
  # Parse a file to markdown
39
44
  client = Llamaparserb::Client.new(ENV['LLAMA_CLOUD_API_KEY'], result_type: "markdown")
40
45
  markdown = client.parse_file('path/to/document.pdf')
41
46
  ```
42
47
 
48
+ ### File Input Options
49
+
50
+ The `parse_file` method accepts two types of inputs:
51
+
52
+ 1. File path (String):
53
+ ```ruby
54
+ client.parse_file('path/to/document.pdf')
55
+ ```
56
+
57
+ 2. IO object (requires file type parameter):
58
+ ```ruby
59
+ # From a URL
60
+ file_content = URI.open('https://example.com/document.pdf')
61
+ client.parse_file(file_content, 'pdf')
62
+
63
+ # From memory
64
+ io = StringIO.new(File.binread('path/to/document.pdf'))
65
+ client.parse_file(io, 'pdf')
66
+
67
+ # From a Tempfile
68
+ temp_file = Tempfile.new(['document', '.pdf'])
69
+ client.parse_file(temp_file, 'pdf')
70
+ ```
71
+
43
72
  ### Advanced Options
44
73
 
45
74
  ```ruby
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Llamaparserb
4
- VERSION = "0.1.1"
4
+ VERSION = "0.2.1"
5
5
  end
data/lib/llamaparserb.rb CHANGED
@@ -8,6 +8,7 @@ require "mime/types"
8
8
  require "uri"
9
9
  require "async"
10
10
  require "logger"
11
+ require "tempfile"
11
12
 
12
13
  module Llamaparserb
13
14
  class Error < StandardError; end
@@ -41,17 +42,34 @@ module Llamaparserb
41
42
  @connection = build_connection
42
43
  end
43
44
 
44
- def parse_file(file_path)
45
- job_id = create_job(file_path)
46
- log "Started parsing file under job_id #{job_id}", :info
45
+ def parse_file(file_input, file_type = nil)
46
+ case file_input
47
+ when String
48
+ if file_type
49
+ job_id = create_job_from_io(file_input, file_type)
50
+ log "Started parsing binary data under job_id #{job_id}", :info
51
+ elsif File.exist?(file_input)
52
+ job_id = create_job_from_path(file_input)
53
+ log "Started parsing file under job_id #{job_id}", :info
54
+ else
55
+ raise Error, "file_type parameter is required for binary string input"
56
+ end
57
+ when IO, StringIO, Tempfile
58
+ raise Error, "file_type parameter is required for IO objects" unless file_type
59
+ job_id = create_job_from_io(file_input, file_type)
60
+ log "Started parsing in-memory file under job_id #{job_id}", :info
61
+ else
62
+ raise Error, "Invalid input type. Expected String (file path) or IO object, got #{file_input.class}"
63
+ end
47
64
 
48
65
  wait_for_completion(job_id)
49
-
50
66
  result = get_result(job_id)
51
67
  log "Successfully retrieved result", :info
52
68
  result
53
69
  rescue => e
54
- handle_error(e, file_path)
70
+ handle_error(e, file_input)
71
+ raise unless @options[:ignore_errors]
72
+ nil
55
73
  end
56
74
 
57
75
  private
@@ -97,15 +115,19 @@ module Llamaparserb
97
115
 
98
116
  def log(message, level = :debug)
99
117
  return unless @options[:verbose]
118
+
119
+ # Convert message to string and force UTF-8 encoding, replacing invalid characters
120
+ safe_message = message.to_s.encode("UTF-8", invalid: :replace, undef: :replace, replace: "?")
121
+
100
122
  case level
101
123
  when :info
102
- logger.info(message)
124
+ logger.info(safe_message)
103
125
  when :warn
104
- logger.warn(message)
126
+ logger.warn(safe_message)
105
127
  when :error
106
- logger.error(message)
128
+ logger.error(safe_message)
107
129
  else
108
- logger.debug(message)
130
+ logger.debug(safe_message)
109
131
  end
110
132
  end
111
133
 
@@ -144,9 +166,15 @@ module Llamaparserb
144
166
  end
145
167
  end
146
168
 
147
- def handle_error(error, file_path)
169
+ def handle_error(error, file_input)
148
170
  if @options[:ignore_errors]
149
- log "Error while parsing file '#{file_path}': #{error.message}", :error
171
+ safe_message = if file_input.is_a?(String) && !File.exist?(file_input)
172
+ "binary data"
173
+ else
174
+ file_input.class.to_s
175
+ end
176
+
177
+ log "Error while parsing file (#{safe_message}): #{error.message}", :error
150
178
  nil
151
179
  else
152
180
  raise error
@@ -163,14 +191,43 @@ module Llamaparserb
163
191
  end
164
192
  end
165
193
 
166
- def create_job(file_path)
194
+ def create_job_from_path(file_path)
167
195
  validate_file_type!(file_path)
168
-
169
196
  file = Faraday::Multipart::FilePart.new(
170
197
  file_path,
171
198
  detect_content_type(file_path)
172
199
  )
200
+ create_job(file)
201
+ end
202
+
203
+ def create_job_from_io(io_or_string, file_type)
204
+ file_type = ".#{file_type}" unless file_type.start_with?(".")
205
+ validate_file_type!(file_type)
173
206
 
207
+ temp_file = Tempfile.new(["upload", file_type])
208
+ temp_file.binmode
209
+
210
+ case io_or_string
211
+ when String
212
+ temp_file.write(io_or_string.force_encoding("ASCII-8BIT"))
213
+ else
214
+ io_or_string.rewind if io_or_string.respond_to?(:rewind)
215
+ temp_file.write(io_or_string.read.force_encoding("ASCII-8BIT"))
216
+ end
217
+
218
+ temp_file.rewind
219
+
220
+ file = Faraday::Multipart::FilePart.new(
221
+ temp_file,
222
+ detect_content_type(temp_file.path)
223
+ )
224
+ create_job(file)
225
+ ensure
226
+ temp_file&.close
227
+ temp_file&.unlink
228
+ end
229
+
230
+ def create_job(file)
174
231
  response = @connection.post("upload") do |req|
175
232
  req.headers["Authorization"] = "Bearer #{api_key}"
176
233
  req.body = upload_params(file)
@@ -233,7 +290,12 @@ module Llamaparserb
233
290
  end
234
291
 
235
292
  def validate_file_type!(file_path)
236
- extension = File.extname(file_path).downcase
293
+ extension = if file_path.start_with?(".")
294
+ file_path
295
+ else
296
+ File.extname(file_path).downcase
297
+ end
298
+
237
299
  unless SUPPORTED_FILE_TYPES.include?(extension)
238
300
  raise Error, "Unsupported file type: #{extension}. Supported types: #{SUPPORTED_FILE_TYPES.join(", ")}"
239
301
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: llamaparserb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Heidar Bernhardsson