llamaparserb 0.2.3 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ba81bbf8d24dc79b57a29c8c40764c42d700012e6608d1a494075dd63900d06f
4
- data.tar.gz: 1ce8846e182bf7025d90e8722148554fe2e65d47d82e1671e19f7e6386d30ce9
3
+ metadata.gz: '045959dc406ac1ba5ce9db3cde8bdfc7e0bbccf83601027a0847b6693220af8f'
4
+ data.tar.gz: a56317601d9feba955aa6d8c08586ac495673b3d506ed0b332c43aca59a0bacf
5
5
  SHA512:
6
- metadata.gz: b5c86e77644210049df9a1095049e2a276f70e40208637e80fd14283fded8eee45ec034cc9bd7c205b802ef24be252989bbd5be671ac50981c0acb998876131b
7
- data.tar.gz: 91ea52459cc1fc38f15dd5b050a2c25449147f8905d24af34c76986f48c84f54d0187bad3c0d3b4da81061e8d593982252229611592948deffdc7a6d6d6c066f
6
+ metadata.gz: 585bd687b193ffedceccf87e3a95550a1053e31d6d14da430f603109e9be74aa4aaf06d901afda2a0aaa9cdf87be93a08420cbc88b876630ce7d865c2e1c08db
7
+ data.tar.gz: 2d0616205409a56943d31014ca01c910bd31d6fb40a0c98fcbb02e6ba946d92a22925ba77ba05cfeae10ae6af0f3e7753b8dbcefed66739b97663808862d1892
data/CHANGELOG.md CHANGED
@@ -5,15 +5,25 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
5
5
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
6
6
 
7
7
  ## [Unreleased]
8
+ ### Added
9
+ - Add support for all supported optional llamaparse parameters when parsing files from URLs
10
+
11
+ ## [0.3.0] - 2024-11-28
12
+ ### Added
13
+ - Add support for parsing files from URLs
8
14
 
9
15
  ## [0.2.3] - 2024-11-28
10
16
  ### Added
11
- - Add support for all supported optional llamaparsse parameters to `parse_file`
17
+ - Add support for all supported optional llamaparse parameters to `parse_file`
18
+
19
+ [0.2.3]: https://github.com/horizing/llamaparserb/compare/v0.2.2...v0.2.3
12
20
 
13
21
  ## [0.2.2] - 2024-11-28
14
22
  ### Fixed
15
23
  - Fix issue with handling file path
16
24
 
25
+ [0.2.2]: https://github.com/horizing/llamaparserb/releases/tag/v0.2.2
26
+
17
27
  ## [0.2.1] - 2024-11-28
18
28
  ### Fixed
19
29
  - Fix parse_file to handle files that are not on the local filesystem
@@ -29,8 +39,4 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
29
39
 
30
40
  ## [0.1.0] - 2024-11-27
31
41
  ### Added
32
- - Initial release
33
-
34
- [Unreleased]: https://github.com/horizing/llamaparserb/compare/v0.1.0...HEAD
35
- [0.1.1]: https://github.com/horizing/llamaparserb/releases/tag/v0.1.1...v0.1.0
36
- [0.1.0]: https://github.com/horizing/llamaparserb/releases/tag/v0.1.0
42
+ - Initial release
data/README.md CHANGED
@@ -43,11 +43,14 @@ text = client.parse_file(file_content, 'pdf')
43
43
  # Parse a file to markdown
44
44
  client = Llamaparserb::Client.new(ENV['LLAMA_CLOUD_API_KEY'], result_type: "markdown")
45
45
  markdown = client.parse_file('path/to/document.pdf')
46
+
47
+ # Parse a file from a URL
48
+ markdown = client.parse_file('https://example.com/document.pdf')
46
49
  ```
47
50
 
48
51
  ### File Input Options
49
52
 
50
- The `parse_file` method accepts two types of inputs:
53
+ The `parse_file` method accepts three types of inputs:
51
54
 
52
55
  1. File path (String):
53
56
  ```ruby
@@ -69,6 +72,11 @@ temp_file = Tempfile.new(['document', '.pdf'])
69
72
  client.parse_file(temp_file, 'pdf')
70
73
  ```
71
74
 
75
+ 3. URL (String):
76
+ ```ruby
77
+ client.parse_file('https://example.com/document.pdf')
78
+ ```
79
+
72
80
  ### Advanced Options
73
81
 
74
82
  ```ruby
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Llamaparserb
4
- VERSION = "0.2.3"
4
+ VERSION = "0.3.1"
5
5
  end
data/lib/llamaparserb.rb CHANGED
@@ -51,6 +51,9 @@ module Llamaparserb
51
51
  elsif File.exist?(file_input)
52
52
  job_id = create_job_from_path(file_input)
53
53
  log "Started parsing file under job_id #{job_id}", :info
54
+ elsif URI::DEFAULT_PARSER.make_regexp.match?(file_input)
55
+ job_id = create_job_from_url(file_input)
56
+ log "Started parsing URL under job_id #{job_id}", :info
54
57
  else
55
58
  raise Error, "file_type parameter is required for binary string input"
56
59
  end
@@ -197,7 +200,7 @@ module Llamaparserb
197
200
  def build_connection
198
201
  Faraday.new(url: base_url) do |f|
199
202
  f.request :multipart
200
- f.request :json
203
+ f.request :url_encoded
201
204
  f.response :json
202
205
  f.response :raise_error
203
206
  f.adapter Faraday.default_adapter
@@ -234,7 +237,13 @@ module Llamaparserb
234
237
  temp_file,
235
238
  detect_content_type(temp_file.path)
236
239
  )
237
- create_job(file)
240
+
241
+ response = @connection.post("upload") do |req|
242
+ req.headers["Authorization"] = "Bearer #{api_key}"
243
+ req.body = upload_params(file)
244
+ end
245
+
246
+ response.body["id"]
238
247
  ensure
239
248
  temp_file&.close
240
249
  temp_file&.unlink
@@ -249,9 +258,8 @@ module Llamaparserb
249
258
  response.body["id"]
250
259
  end
251
260
 
252
- def upload_params(file)
261
+ def upload_params(file = nil, url = nil)
253
262
  params = {
254
- file: file,
255
263
  language: @options[:language].to_s,
256
264
  parsing_instruction: @options[:parsing_instruction],
257
265
  invalidate_cache: @options[:invalidate_cache],
@@ -261,30 +269,35 @@ module Llamaparserb
261
269
  premium_mode: @options[:premium_mode],
262
270
  continuous_mode: @options[:continuous_mode],
263
271
  do_not_unroll_columns: @options[:do_not_unroll_columns],
272
+ page_separator: @options[:page_separator],
273
+ page_prefix: @options[:page_prefix],
274
+ page_suffix: @options[:page_suffix],
275
+ target_pages: @options[:target_pages],
276
+ bounding_box: @options[:bounding_box],
277
+ disable_ocr: @options[:disable_ocr],
278
+ take_screenshot: @options[:take_screenshot],
264
279
  gpt4o_mode: @options[:gpt4o_mode],
265
280
  gpt4o_api_key: @options[:gpt4o_api_key],
266
- vendor_multimodal_api_key: @options[:vendor_multimodal_api_key],
267
- use_vendor_multimodal_model: @options[:use_vendor_multimodal_model],
268
- vendor_multimodal_model_name: @options[:vendor_multimodal_model_name],
269
- take_screenshot: @options[:take_screenshot],
270
- disable_ocr: @options[:disable_ocr],
271
281
  guess_xlsx_sheet_names: @options[:guess_xlsx_sheet_names],
272
282
  is_formatting_instruction: @options[:is_formatting_instruction],
273
283
  annotate_links: @options[:annotate_links],
284
+ vendor_multimodal_api_key: @options[:vendor_multimodal_api_key],
285
+ use_vendor_multimodal_model: @options[:use_vendor_multimodal_model],
286
+ vendor_multimodal_model_name: @options[:vendor_multimodal_model_name],
287
+ webhook_url: @options[:webhook_url],
288
+ http_proxy: @options[:http_proxy],
289
+ azure_openai_deployment_name: @options[:azure_openai_deployment_name],
290
+ azure_openai_endpoint: @options[:azure_openai_endpoint],
291
+ azure_openai_api_version: @options[:azure_openai_api_version],
292
+ azure_openai_key: @options[:azure_openai_key],
274
293
  from_ruby_package: true
275
294
  }
276
295
 
277
- params[:page_separator] = @options[:page_separator] if @options[:page_separator]
278
- params[:page_prefix] = @options[:page_prefix] if @options[:page_prefix]
279
- params[:page_suffix] = @options[:page_suffix] if @options[:page_suffix]
280
- params[:bounding_box] = @options[:bounding_box] if @options[:bounding_box]
281
- params[:target_pages] = @options[:target_pages] if @options[:target_pages]
282
- params[:webhook_url] = @options[:webhook_url] if @options[:webhook_url]
283
- params[:azure_openai_deployment_name] = @options[:azure_openai_deployment_name] if @options[:azure_openai_deployment_name]
284
- params[:azure_openai_endpoint] = @options[:azure_openai_endpoint] if @options[:azure_openai_endpoint]
285
- params[:azure_openai_api_version] = @options[:azure_openai_api_version] if @options[:azure_openai_api_version]
286
- params[:azure_openai_key] = @options[:azure_openai_key] if @options[:azure_openai_key]
287
- params[:http_proxy] = @options[:http_proxy] if @options[:http_proxy]
296
+ if url
297
+ params[:input_url] = url.to_s
298
+ elsif file
299
+ params[:file] = file
300
+ end
288
301
 
289
302
  params.compact
290
303
  end
@@ -335,5 +348,19 @@ module Llamaparserb
335
348
  raise Error, "Unsupported file type: #{extension}. Supported types: #{SUPPORTED_FILE_TYPES.join(", ")}"
336
349
  end
337
350
  end
351
+
352
+ def create_job_from_url(url)
353
+ log "Creating job from URL: #{url}", :debug
354
+
355
+ response = @connection.post("upload") do |req|
356
+ req.headers["Authorization"] = "Bearer #{api_key}"
357
+ req.headers["Accept"] = "application/json"
358
+ req.options.timeout = 30
359
+ req.body = upload_params(nil, url)
360
+ end
361
+
362
+ log "Response: #{response.body.inspect}", :debug
363
+ response.body["id"]
364
+ end
338
365
  end
339
366
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: llamaparserb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Heidar Bernhardsson