llamaparserb 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b0073944d6fc5235f1ee62b86656a396afaf63cdb92aecb2c65468f57767876d
4
- data.tar.gz: 1cedf4c9db42955c7e9c3baac69b8a8a78213817b505e65033c00817fdf73163
3
+ metadata.gz: 14e4e5deea50bc3f5cb0e32c3c3029f6b26fda4cacf91be0f96e9f677f92d1e5
4
+ data.tar.gz: e3e6cb569456d1c649be22ad3e5af65e30484fce50582d7493f5e22b2ac03f61
5
5
  SHA512:
6
- metadata.gz: fc4f606f909a118fed822fa47515c71b45daab1e26b8b059229dff9f9e90ad338be1fde4995ac0dd2a47ffe64378a6a7dda3c163c5cc598e7637f7d4126468f1
7
- data.tar.gz: 502b1ffe2170259741d42f1bae218b725e7430de5166cdb58ffa4d03536e4220be0001ed04e07f4abe6c3e3a6158b70ca42dc887217efc61e49a6d382040f35c
6
+ metadata.gz: 72121ad4b70f95ddd2bfe23f129d5c9ee634543543ba43e4faa437da67bf3cbdf8bc1ea828aeb58d43073bed46c28d4010a6865b5bc006c32ccfc30a099d1d65
7
+ data.tar.gz: 5bea9e367b71d38fe8f6a1a80be0b4fd190e3e56b780e369d33c3319dfd91b483283f4b43ab8fc484468ab611c5e3efc639d8302ba611ccfb71258c459fdb783
data/CHANGELOG.md CHANGED
@@ -6,6 +6,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
6
6
 
7
7
  ## [Unreleased]
8
8
 
9
+ ## [0.2.0] - 2024-11-28
10
+ ### Changed
11
+ - Allow passing in a string or an IO object to `parse_file`
12
+ - Add support for a file type parameter to `parse_file`
13
+
9
14
  ## [0.1.1] - 2024-11-28
10
15
  ### Changed
11
16
  - Move gem ownership to Horizing
data/README.md CHANGED
@@ -32,14 +32,43 @@ require 'llamaparserb'
32
32
  # Initialize client with API key
33
33
  client = Llamaparserb::Client.new(ENV['LLAMA_CLOUD_API_KEY'])
34
34
 
35
- # Parse a file to text (default)
35
+ # Parse a file from disk (to text by default)
36
36
  text = client.parse_file('path/to/document.pdf')
37
37
 
38
+ # Parse an in-memory file (requires file type)
39
+ require 'open-uri'
40
+ file_content = URI.open('https://example.com/document.pdf')
41
+ text = client.parse_file(file_content, 'pdf')
42
+
38
43
  # Parse a file to markdown
39
44
  client = Llamaparserb::Client.new(ENV['LLAMA_CLOUD_API_KEY'], result_type: "markdown")
40
45
  markdown = client.parse_file('path/to/document.pdf')
41
46
  ```
42
47
 
48
+ ### File Input Options
49
+
50
+ The `parse_file` method accepts two types of inputs:
51
+
52
+ 1. File path (String):
53
+ ```ruby
54
+ client.parse_file('path/to/document.pdf')
55
+ ```
56
+
57
+ 2. IO object (requires file type parameter):
58
+ ```ruby
59
+ # From a URL
60
+ file_content = URI.open('https://example.com/document.pdf')
61
+ client.parse_file(file_content, 'pdf')
62
+
63
+ # From memory
64
+ io = StringIO.new(File.binread('path/to/document.pdf'))
65
+ client.parse_file(io, 'pdf')
66
+
67
+ # From a Tempfile
68
+ temp_file = Tempfile.new(['document', '.pdf'])
69
+ client.parse_file(temp_file, 'pdf')
70
+ ```
71
+
43
72
  ### Advanced Options
44
73
 
45
74
  ```ruby
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Llamaparserb
4
- VERSION = "0.1.1"
4
+ VERSION = "0.2.0"
5
5
  end
data/lib/llamaparserb.rb CHANGED
@@ -8,6 +8,7 @@ require "mime/types"
8
8
  require "uri"
9
9
  require "async"
10
10
  require "logger"
11
+ require "tempfile"
11
12
 
12
13
  module Llamaparserb
13
14
  class Error < StandardError; end
@@ -41,17 +42,27 @@ module Llamaparserb
41
42
  @connection = build_connection
42
43
  end
43
44
 
44
- def parse_file(file_path)
45
- job_id = create_job(file_path)
46
- log "Started parsing file under job_id #{job_id}", :info
45
+ def parse_file(file_input, file_type = nil)
46
+ case file_input
47
+ when String
48
+ # Treat as file path
49
+ job_id = create_job_from_path(file_input)
50
+ log "Started parsing file under job_id #{job_id}", :info
51
+ when IO, StringIO, Tempfile
52
+ # Treat as file object
53
+ raise Error, "file_type parameter is required for IO objects" unless file_type
54
+ job_id = create_job_from_io(file_input, file_type)
55
+ log "Started parsing in-memory file under job_id #{job_id}", :info
56
+ else
57
+ raise Error, "Invalid input type. Expected String (file path) or IO object, got #{file_input.class}"
58
+ end
47
59
 
48
60
  wait_for_completion(job_id)
49
-
50
61
  result = get_result(job_id)
51
62
  log "Successfully retrieved result", :info
52
63
  result
53
64
  rescue => e
54
- handle_error(e, file_path)
65
+ handle_error(e, file_input)
55
66
  end
56
67
 
57
68
  private
@@ -144,9 +155,9 @@ module Llamaparserb
144
155
  end
145
156
  end
146
157
 
147
- def handle_error(error, file_path)
158
+ def handle_error(error, file_input)
148
159
  if @options[:ignore_errors]
149
- log "Error while parsing file '#{file_path}': #{error.message}", :error
160
+ log "Error while parsing file '#{file_input}'", :error
150
161
  nil
151
162
  else
152
163
  raise error
@@ -163,14 +174,37 @@ module Llamaparserb
163
174
  end
164
175
  end
165
176
 
166
- def create_job(file_path)
177
+ def create_job_from_path(file_path)
167
178
  validate_file_type!(file_path)
168
-
169
179
  file = Faraday::Multipart::FilePart.new(
170
180
  file_path,
171
181
  detect_content_type(file_path)
172
182
  )
183
+ create_job(file)
184
+ end
185
+
186
+ def create_job_from_io(io, file_type)
187
+ # Ensure file_type starts with a dot
188
+ file_type = ".#{file_type}" unless file_type.start_with?(".")
189
+ validate_file_type!(file_type)
190
+
191
+ temp_file = Tempfile.new(["upload", file_type])
192
+ temp_file.binmode
193
+ io.rewind
194
+ temp_file.write(io.read)
195
+ temp_file.rewind
196
+
197
+ file = Faraday::Multipart::FilePart.new(
198
+ temp_file,
199
+ detect_content_type(temp_file.path)
200
+ )
201
+ create_job(file)
202
+ ensure
203
+ temp_file&.close
204
+ temp_file&.unlink
205
+ end
173
206
 
207
+ def create_job(file)
174
208
  response = @connection.post("upload") do |req|
175
209
  req.headers["Authorization"] = "Bearer #{api_key}"
176
210
  req.body = upload_params(file)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: llamaparserb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Heidar Bernhardsson