llamaparserb 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a1761244b0d8ac8c6ee13eb10b50d3eadcb182d8e7bad24828ba70a290bc7507
4
- data.tar.gz: cf7908683f630f17ef39b4c7e6393cb4637e350ea89c674c78a03a68b1bdf1c5
3
+ metadata.gz: 9c49dc1624b1b955cad8032696308fc81357533113cf429f53fe063d5db0cab2
4
+ data.tar.gz: 2eee40a9054fe05d02a094828bb47a6cc3a5b65ca6fcb5864b9648cff5f0b418
5
5
  SHA512:
6
- metadata.gz: 7bc3f4c44814c1cf63ad480882ee7f2b647af3ee8b34b46d95da7ae90c372b656c0f69fd224ba1c03d26cfa5185ee89a356f05e080169375945fc0f8d3548d8d
7
- data.tar.gz: 5e8d6c3d234e298836f2f373d515321337d1f1ab1904a38e9dfb8a1e6076b6307e69e84d93b5fd27fd02f3f0642e19f8adc38b9de20e5d0ac87b47600b13de90
6
+ metadata.gz: 50ed45e0d813d776b79fce2e87593cdc88063158045c62ba76e1bc0e353fb1b27b2b03b87dc4253443b4b2a355fec179708d04e1920dc0f47730c271e3ff81cc
7
+ data.tar.gz: 9f86c50b3f5c4bd987c9bab987c5b139ce1dd998577d04ea3fba53d64b430d3cdcc0af3d880d73ebea1914341bc185fb78469f78d9521098f06510c7024b8746
data/CHANGELOG.md CHANGED
@@ -6,10 +6,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
6
6
 
7
7
  ## [Unreleased]
8
8
 
9
+ ## [0.3.0] - 2024-11-28
10
+ ### Added
11
+ - Add support for parsing files from URLs
12
+
13
+ ## [0.2.3] - 2024-11-28
14
+ ### Added
15
+ - Add support for all supported optional LlamaParse parameters to `parse_file`
16
+
17
+ [0.2.3]: https://github.com/horizing/llamaparserb/compare/v0.2.2...v0.2.3
18
+
9
19
  ## [0.2.2] - 2024-11-28
10
20
  ### Fixed
11
21
  - Fix issue with handling file path
12
22
 
23
+ [0.2.2]: https://github.com/horizing/llamaparserb/releases/tag/v0.2.2
24
+
13
25
  ## [0.2.1] - 2024-11-28
14
26
  ### Fixed
15
27
  - Fix parse_file to handle files that are not on the local filesystem
@@ -25,8 +37,4 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
25
37
 
26
38
  ## [0.1.0] - 2024-11-27
27
39
  ### Added
28
- - Initial release
29
-
30
- [Unreleased]: https://github.com/horizing/llamaparserb/compare/v0.1.0...HEAD
31
- [0.1.1]: https://github.com/horizing/llamaparserb/releases/tag/v0.1.1...v0.1.0
32
- [0.1.0]: https://github.com/horizing/llamaparserb/releases/tag/v0.1.0
40
+ - Initial release
data/README.md CHANGED
@@ -43,11 +43,14 @@ text = client.parse_file(file_content, 'pdf')
43
43
  # Parse a file to markdown
44
44
  client = Llamaparserb::Client.new(ENV['LLAMA_CLOUD_API_KEY'], result_type: "markdown")
45
45
  markdown = client.parse_file('path/to/document.pdf')
46
+
47
+ # Parse a file from a URL
48
+ markdown = client.parse_file('https://example.com/document.pdf')
46
49
  ```
47
50
 
48
51
  ### File Input Options
49
52
 
50
- The `parse_file` method accepts two types of inputs:
53
+ The `parse_file` method accepts three types of inputs:
51
54
 
52
55
  1. File path (String):
53
56
  ```ruby
@@ -69,25 +72,123 @@ temp_file = Tempfile.new(['document', '.pdf'])
69
72
  client.parse_file(temp_file, 'pdf')
70
73
  ```
71
74
 
75
+ 3. URL (String):
76
+ ```ruby
77
+ client.parse_file('https://example.com/document.pdf')
78
+ ```
79
+
72
80
  ### Advanced Options
73
81
 
74
82
  ```ruby
75
83
  client = Llamaparserb::Client.new(
76
84
  ENV['LLAMA_CLOUD_API_KEY'],
77
85
  {
78
- result_type: "markdown", # Output format: "text" or "markdown"
79
- num_workers: 4, # Number of workers for concurrent processing
80
- check_interval: 1, # How often to check job status (seconds)
81
- max_timeout: 2000, # Maximum time to wait for parsing (seconds)
82
- verbose: true, # Enable detailed logging
83
- language: :en, # Target language
84
- parsing_instruction: "", # Custom parsing instructions
85
- premium_mode: false, # Enable premium parsing features
86
- split_by_page: true # Split result by pages
86
+ # Basic Configuration
87
+ result_type: "markdown", # Output format: "text" or "markdown"
88
+ num_workers: 4, # Number of workers for concurrent processing
89
+ check_interval: 1, # How often to check job status (seconds)
90
+ max_timeout: 2000, # Maximum time to wait for parsing (seconds)
91
+ verbose: true, # Enable detailed logging
92
+ show_progress: true, # Show progress during parsing
93
+ ignore_errors: true, # Return nil instead of raising errors
94
+
95
+ # Language and Parsing Options
96
+ language: :en, # Target language for parsing
97
+ parsing_instruction: "", # Custom parsing instructions
98
+ skip_diagonal_text: false, # Skip diagonal text in documents
99
+ invalidate_cache: false, # Force reprocessing of cached documents
100
+ do_not_cache: false, # Disable caching of results
101
+
102
+ # Processing Modes
103
+ fast_mode: false, # Enable faster processing (may reduce quality)
104
+ premium_mode: false, # Enable premium parsing features
105
+ continuous_mode: false, # Process document as continuous text
106
+ do_not_unroll_columns: false, # Keep columnar text structure
107
+
108
+ # Page Handling
109
+ split_by_page: true, # Split result by pages
110
+ page_separator: "\n\n", # Custom page separator
111
+ page_prefix: "Page ", # Text to prepend to each page
112
+ page_suffix: "\n", # Text to append to each page
113
+ target_pages: [1,2,3], # Array of specific pages to process
114
+ bounding_box: { # Specify area to parse (coordinates in pixels)
115
+ x1: 0, y1: 0, # Top-left corner
116
+ x2: 612, y2: 792 # Bottom-right corner
117
+ },
118
+
119
+ # OCR and Image Processing
120
+ disable_ocr: false, # Disable Optical Character Recognition
121
+ take_screenshot: false, # Capture screenshot of document
122
+
123
+ # Advanced Processing Features
124
+ gpt4o_mode: false, # Enable GPT-4o mode (parse using the GPT-4o model)
125
+ gpt4o_api_key: "key", # API key used for GPT-4o mode
126
+ guess_xlsx_sheet_names: false, # Attempt to guess Excel sheet names
127
+ is_formatting_instruction: false, # Use formatting instructions
128
+ annotate_links: false, # Include link annotations in output
129
+
130
+ # Multimodal Processing
131
+ vendor_multimodal_api_key: "key", # API key for multimodal processing
132
+ use_vendor_multimodal_model: false, # Enable multimodal model
133
+ vendor_multimodal_model_name: "model", # Specify multimodal model
134
+
135
+ # Integration Options
136
+ webhook_url: "https://...", # URL for webhook notifications
137
+ http_proxy: "http://...", # HTTP proxy configuration
138
+
139
+ # Azure OpenAI Configuration
140
+ azure_openai_deployment_name: "deployment", # Azure OpenAI deployment name
141
+ azure_openai_endpoint: "endpoint", # Azure OpenAI endpoint
142
+ azure_openai_api_version: "2023-05-15", # Azure OpenAI API version
143
+ azure_openai_key: "key" # Azure OpenAI API key
87
144
  }
88
145
  )
89
146
  ```
90
147
 
148
+ ### Feature-Specific Options
149
+
150
+ #### Page Processing
151
+ - `split_by_page`: Split the document into separate pages
152
+ - `page_separator`: Custom text to insert between pages
153
+ - `page_prefix`/`page_suffix`: Add custom text before/after each page
154
+ - `target_pages`: Process only specific pages
155
+ - `bounding_box`: Parse only a specific area of the document
156
+
157
+ #### OCR and Image Processing
158
+ - `disable_ocr`: Turn off Optical Character Recognition
159
+ - `take_screenshot`: Generate document screenshots
160
+ - `skip_diagonal_text`: Ignore text at diagonal angles
161
+
162
+ #### Advanced Processing
163
+ - `continuous_mode`: Process text as a continuous stream
164
+ - `do_not_unroll_columns`: Preserve column structure
165
+ - `guess_xlsx_sheet_names`: Auto-detect Excel sheet names
166
+ - `annotate_links`: Include document hyperlinks in output
167
+ - `is_formatting_instruction`: Use special formatting instructions
168
+
169
+ #### Performance Options
170
+ - `fast_mode`: Faster processing with potential quality trade-offs
171
+ - `premium_mode`: Access to premium features
172
+ - `invalidate_cache`/`do_not_cache`: Control result caching
173
+ - `num_workers`: Configure concurrent processing
174
+
175
+ #### Integration Features
176
+ - `webhook_url`: Receive processing notifications
177
+ - `http_proxy`: Configure proxy settings
178
+
179
+ #### Azure OpenAI Integration
180
+ Configure Azure OpenAI services with:
181
+ - `azure_openai_deployment_name`
182
+ - `azure_openai_endpoint`
183
+ - `azure_openai_api_version`
184
+ - `azure_openai_key`
185
+
186
+ #### Multimodal Processing
187
+ Enable advanced multimodal processing with:
188
+ - `vendor_multimodal_api_key`
189
+ - `use_vendor_multimodal_model`
190
+ - `vendor_multimodal_model_name`
191
+
91
192
  ### Supported File Types
92
193
 
93
194
  The client supports a wide range of file formats including:
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Llamaparserb
4
- VERSION = "0.2.2"
4
+ VERSION = "0.3.0"
5
5
  end
data/lib/llamaparserb.rb CHANGED
@@ -51,6 +51,9 @@ module Llamaparserb
51
51
  elsif File.exist?(file_input)
52
52
  job_id = create_job_from_path(file_input)
53
53
  log "Started parsing file under job_id #{job_id}", :info
54
+ elsif URI::DEFAULT_PARSER.make_regexp.match?(file_input)
55
+ job_id = create_job_from_url(file_input)
56
+ log "Started parsing URL under job_id #{job_id}", :info
54
57
  else
55
58
  raise Error, "file_type parameter is required for binary string input"
56
59
  end
@@ -100,7 +103,20 @@ module Llamaparserb
100
103
  bounding_box: nil,
101
104
  target_pages: nil,
102
105
  ignore_errors: true,
103
- split_by_page: true
106
+ split_by_page: true,
107
+ vendor_multimodal_api_key: nil,
108
+ use_vendor_multimodal_model: false,
109
+ vendor_multimodal_model_name: nil,
110
+ take_screenshot: false,
111
+ disable_ocr: false,
112
+ is_formatting_instruction: false,
113
+ annotate_links: false,
114
+ webhook_url: nil,
115
+ azure_openai_deployment_name: nil,
116
+ azure_openai_endpoint: nil,
117
+ azure_openai_api_version: nil,
118
+ azure_openai_key: nil,
119
+ http_proxy: nil
104
120
  }
105
121
  end
106
122
 
@@ -184,7 +200,7 @@ module Llamaparserb
184
200
  def build_connection
185
201
  Faraday.new(url: base_url) do |f|
186
202
  f.request :multipart
187
- f.request :json
203
+ f.request :url_encoded
188
204
  f.response :json
189
205
  f.response :raise_error
190
206
  f.adapter Faraday.default_adapter
@@ -221,7 +237,13 @@ module Llamaparserb
221
237
  temp_file,
222
238
  detect_content_type(temp_file.path)
223
239
  )
224
- create_job(file)
240
+
241
+ response = @connection.post("upload") do |req|
242
+ req.headers["Authorization"] = "Bearer #{api_key}"
243
+ req.body = {file: file}
244
+ end
245
+
246
+ response.body["id"]
225
247
  ensure
226
248
  temp_file&.close
227
249
  temp_file&.unlink
@@ -236,9 +258,8 @@ module Llamaparserb
236
258
  response.body["id"]
237
259
  end
238
260
 
239
- def upload_params(file)
240
- {
241
- file: file,
261
+ def upload_params(file = nil, url = nil)
262
+ params = {
242
263
  language: @options[:language].to_s,
243
264
  parsing_instruction: @options[:parsing_instruction],
244
265
  invalidate_cache: @options[:invalidate_cache],
@@ -250,8 +271,36 @@ module Llamaparserb
250
271
  do_not_unroll_columns: @options[:do_not_unroll_columns],
251
272
  gpt4o_mode: @options[:gpt4o_mode],
252
273
  gpt4o_api_key: @options[:gpt4o_api_key],
274
+ vendor_multimodal_api_key: @options[:vendor_multimodal_api_key],
275
+ use_vendor_multimodal_model: @options[:use_vendor_multimodal_model],
276
+ vendor_multimodal_model_name: @options[:vendor_multimodal_model_name],
277
+ take_screenshot: @options[:take_screenshot],
278
+ disable_ocr: @options[:disable_ocr],
279
+ guess_xlsx_sheet_names: @options[:guess_xlsx_sheet_names],
280
+ is_formatting_instruction: @options[:is_formatting_instruction],
281
+ annotate_links: @options[:annotate_links],
253
282
  from_ruby_package: true
254
- }.compact
283
+ }
284
+
285
+ params[:page_separator] = @options[:page_separator] if @options[:page_separator]
286
+ params[:page_prefix] = @options[:page_prefix] if @options[:page_prefix]
287
+ params[:page_suffix] = @options[:page_suffix] if @options[:page_suffix]
288
+ params[:bounding_box] = @options[:bounding_box] if @options[:bounding_box]
289
+ params[:target_pages] = @options[:target_pages] if @options[:target_pages]
290
+ params[:webhook_url] = @options[:webhook_url] if @options[:webhook_url]
291
+ params[:azure_openai_deployment_name] = @options[:azure_openai_deployment_name] if @options[:azure_openai_deployment_name]
292
+ params[:azure_openai_endpoint] = @options[:azure_openai_endpoint] if @options[:azure_openai_endpoint]
293
+ params[:azure_openai_api_version] = @options[:azure_openai_api_version] if @options[:azure_openai_api_version]
294
+ params[:azure_openai_key] = @options[:azure_openai_key] if @options[:azure_openai_key]
295
+ params[:http_proxy] = @options[:http_proxy] if @options[:http_proxy]
296
+
297
+ if url
298
+ params[:input_url] = url.to_s
299
+ elsif file
300
+ params[:file] = file
301
+ end
302
+
303
+ params.compact
255
304
  end
256
305
 
257
306
  def get_job_status(job_id)
@@ -300,5 +349,20 @@ module Llamaparserb
300
349
  raise Error, "Unsupported file type: #{extension}. Supported types: #{SUPPORTED_FILE_TYPES.join(", ")}"
301
350
  end
302
351
  end
352
+
353
+ def create_job_from_url(url)
354
+ log "Creating job from URL: #{url}", :debug
355
+
356
+ response = @connection.post("upload") do |req|
357
+ req.headers["Authorization"] = "Bearer #{api_key}"
358
+ req.headers["Accept"] = "application/json"
359
+ # Create a simple form data request
360
+ req.options.timeout = 30 # Optional: add timeout
361
+ req.body = {"input_url" => url.to_s}
362
+ end
363
+
364
+ log "Response: #{response.body.inspect}", :debug
365
+ response.body["id"]
366
+ end
303
367
  end
304
368
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: llamaparserb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Heidar Bernhardsson