llamaparserb 0.2.3 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ba81bbf8d24dc79b57a29c8c40764c42d700012e6608d1a494075dd63900d06f
4
- data.tar.gz: 1ce8846e182bf7025d90e8722148554fe2e65d47d82e1671e19f7e6386d30ce9
3
+ metadata.gz: '045959dc406ac1ba5ce9db3cde8bdfc7e0bbccf83601027a0847b6693220af8f'
4
+ data.tar.gz: a56317601d9feba955aa6d8c08586ac495673b3d506ed0b332c43aca59a0bacf
5
5
  SHA512:
6
- metadata.gz: b5c86e77644210049df9a1095049e2a276f70e40208637e80fd14283fded8eee45ec034cc9bd7c205b802ef24be252989bbd5be671ac50981c0acb998876131b
7
- data.tar.gz: 91ea52459cc1fc38f15dd5b050a2c25449147f8905d24af34c76986f48c84f54d0187bad3c0d3b4da81061e8d593982252229611592948deffdc7a6d6d6c066f
6
+ metadata.gz: 585bd687b193ffedceccf87e3a95550a1053e31d6d14da430f603109e9be74aa4aaf06d901afda2a0aaa9cdf87be93a08420cbc88b876630ce7d865c2e1c08db
7
+ data.tar.gz: 2d0616205409a56943d31014ca01c910bd31d6fb40a0c98fcbb02e6ba946d92a22925ba77ba05cfeae10ae6af0f3e7753b8dbcefed66739b97663808862d1892
data/CHANGELOG.md CHANGED
@@ -5,15 +5,25 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
5
5
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
6
6
 
7
7
  ## [Unreleased]
8
+ ### Added
9
+ - Add support for all optional llamaparse parameters when parsing files from URLs
10
+
11
+ ## [0.3.0] - 2024-11-28
12
+ ### Added
13
+ - Add support for parsing files from URLs
8
14
 
9
15
  ## [0.2.3] - 2024-11-28
10
16
  ### Added
11
- - Add support for all supported optional llamaparsse parameters to `parse_file`
17
+ - Add support for all supported optional llamaparse parameters to `parse_file`
18
+
19
+ [0.2.3]: https://github.com/horizing/llamaparserb/compare/v0.2.2...v0.2.3
12
20
 
13
21
  ## [0.2.2] - 2024-11-28
14
22
  ### Fixed
15
23
  - Fix issue with handling file path
16
24
 
25
+ [0.2.2]: https://github.com/horizing/llamaparserb/releases/tag/v0.2.2
26
+
17
27
  ## [0.2.1] - 2024-11-28
18
28
  ### Fixed
19
29
  - Fix parse_file to handle files that are not on the local filesystem
@@ -29,8 +39,4 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
29
39
 
30
40
  ## [0.1.0] - 2024-11-27
31
41
  ### Added
32
- - Initial release
33
-
34
- [Unreleased]: https://github.com/horizing/llamaparserb/compare/v0.1.0...HEAD
35
- [0.1.1]: https://github.com/horizing/llamaparserb/releases/tag/v0.1.1...v0.1.0
36
- [0.1.0]: https://github.com/horizing/llamaparserb/releases/tag/v0.1.0
42
+ - Initial release
data/README.md CHANGED
@@ -43,11 +43,14 @@ text = client.parse_file(file_content, 'pdf')
43
43
  # Parse a file to markdown
44
44
  client = Llamaparserb::Client.new(ENV['LLAMA_CLOUD_API_KEY'], result_type: "markdown")
45
45
  markdown = client.parse_file('path/to/document.pdf')
46
+
47
+ # Parse a file from a URL
48
+ markdown = client.parse_file('https://example.com/document.pdf')
46
49
  ```
47
50
 
48
51
  ### File Input Options
49
52
 
50
- The `parse_file` method accepts two types of inputs:
53
+ The `parse_file` method accepts three types of inputs:
51
54
 
52
55
  1. File path (String):
53
56
  ```ruby
@@ -69,6 +72,11 @@ temp_file = Tempfile.new(['document', '.pdf'])
69
72
  client.parse_file(temp_file, 'pdf')
70
73
  ```
71
74
 
75
+ 3. URL (String):
76
+ ```ruby
77
+ client.parse_file('https://example.com/document.pdf')
78
+ ```
79
+
72
80
  ### Advanced Options
73
81
 
74
82
  ```ruby
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Llamaparserb
4
- VERSION = "0.2.3"
4
+ VERSION = "0.3.1"
5
5
  end
data/lib/llamaparserb.rb CHANGED
@@ -51,6 +51,9 @@ module Llamaparserb
51
51
  elsif File.exist?(file_input)
52
52
  job_id = create_job_from_path(file_input)
53
53
  log "Started parsing file under job_id #{job_id}", :info
54
+ elsif URI::DEFAULT_PARSER.make_regexp.match?(file_input)
55
+ job_id = create_job_from_url(file_input)
56
+ log "Started parsing URL under job_id #{job_id}", :info
54
57
  else
55
58
  raise Error, "file_type parameter is required for binary string input"
56
59
  end
@@ -197,7 +200,7 @@ module Llamaparserb
197
200
  def build_connection
198
201
  Faraday.new(url: base_url) do |f|
199
202
  f.request :multipart
200
- f.request :json
203
+ f.request :url_encoded
201
204
  f.response :json
202
205
  f.response :raise_error
203
206
  f.adapter Faraday.default_adapter
@@ -234,7 +237,13 @@ module Llamaparserb
234
237
  temp_file,
235
238
  detect_content_type(temp_file.path)
236
239
  )
237
- create_job(file)
240
+
241
+ response = @connection.post("upload") do |req|
242
+ req.headers["Authorization"] = "Bearer #{api_key}"
243
+ req.body = upload_params(file)
244
+ end
245
+
246
+ response.body["id"]
238
247
  ensure
239
248
  temp_file&.close
240
249
  temp_file&.unlink
@@ -249,9 +258,8 @@ module Llamaparserb
249
258
  response.body["id"]
250
259
  end
251
260
 
252
- def upload_params(file)
261
+ def upload_params(file = nil, url = nil)
253
262
  params = {
254
- file: file,
255
263
  language: @options[:language].to_s,
256
264
  parsing_instruction: @options[:parsing_instruction],
257
265
  invalidate_cache: @options[:invalidate_cache],
@@ -261,30 +269,35 @@ module Llamaparserb
261
269
  premium_mode: @options[:premium_mode],
262
270
  continuous_mode: @options[:continuous_mode],
263
271
  do_not_unroll_columns: @options[:do_not_unroll_columns],
272
+ page_separator: @options[:page_separator],
273
+ page_prefix: @options[:page_prefix],
274
+ page_suffix: @options[:page_suffix],
275
+ target_pages: @options[:target_pages],
276
+ bounding_box: @options[:bounding_box],
277
+ disable_ocr: @options[:disable_ocr],
278
+ take_screenshot: @options[:take_screenshot],
264
279
  gpt4o_mode: @options[:gpt4o_mode],
265
280
  gpt4o_api_key: @options[:gpt4o_api_key],
266
- vendor_multimodal_api_key: @options[:vendor_multimodal_api_key],
267
- use_vendor_multimodal_model: @options[:use_vendor_multimodal_model],
268
- vendor_multimodal_model_name: @options[:vendor_multimodal_model_name],
269
- take_screenshot: @options[:take_screenshot],
270
- disable_ocr: @options[:disable_ocr],
271
281
  guess_xlsx_sheet_names: @options[:guess_xlsx_sheet_names],
272
282
  is_formatting_instruction: @options[:is_formatting_instruction],
273
283
  annotate_links: @options[:annotate_links],
284
+ vendor_multimodal_api_key: @options[:vendor_multimodal_api_key],
285
+ use_vendor_multimodal_model: @options[:use_vendor_multimodal_model],
286
+ vendor_multimodal_model_name: @options[:vendor_multimodal_model_name],
287
+ webhook_url: @options[:webhook_url],
288
+ http_proxy: @options[:http_proxy],
289
+ azure_openai_deployment_name: @options[:azure_openai_deployment_name],
290
+ azure_openai_endpoint: @options[:azure_openai_endpoint],
291
+ azure_openai_api_version: @options[:azure_openai_api_version],
292
+ azure_openai_key: @options[:azure_openai_key],
274
293
  from_ruby_package: true
275
294
  }
276
295
 
277
- params[:page_separator] = @options[:page_separator] if @options[:page_separator]
278
- params[:page_prefix] = @options[:page_prefix] if @options[:page_prefix]
279
- params[:page_suffix] = @options[:page_suffix] if @options[:page_suffix]
280
- params[:bounding_box] = @options[:bounding_box] if @options[:bounding_box]
281
- params[:target_pages] = @options[:target_pages] if @options[:target_pages]
282
- params[:webhook_url] = @options[:webhook_url] if @options[:webhook_url]
283
- params[:azure_openai_deployment_name] = @options[:azure_openai_deployment_name] if @options[:azure_openai_deployment_name]
284
- params[:azure_openai_endpoint] = @options[:azure_openai_endpoint] if @options[:azure_openai_endpoint]
285
- params[:azure_openai_api_version] = @options[:azure_openai_api_version] if @options[:azure_openai_api_version]
286
- params[:azure_openai_key] = @options[:azure_openai_key] if @options[:azure_openai_key]
287
- params[:http_proxy] = @options[:http_proxy] if @options[:http_proxy]
296
+ if url
297
+ params[:input_url] = url.to_s
298
+ elsif file
299
+ params[:file] = file
300
+ end
288
301
 
289
302
  params.compact
290
303
  end
@@ -335,5 +348,19 @@ module Llamaparserb
335
348
  raise Error, "Unsupported file type: #{extension}. Supported types: #{SUPPORTED_FILE_TYPES.join(", ")}"
336
349
  end
337
350
  end
351
+
352
+ def create_job_from_url(url)
353
+ log "Creating job from URL: #{url}", :debug
354
+
355
+ response = @connection.post("upload") do |req|
356
+ req.headers["Authorization"] = "Bearer #{api_key}"
357
+ req.headers["Accept"] = "application/json"
358
+ req.options.timeout = 30
359
+ req.body = upload_params(nil, url)
360
+ end
361
+
362
+ log "Response: #{response.body.inspect}", :debug
363
+ response.body["id"]
364
+ end
338
365
  end
339
366
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: llamaparserb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Heidar Bernhardsson