llamaparserb 0.2.2 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a1761244b0d8ac8c6ee13eb10b50d3eadcb182d8e7bad24828ba70a290bc7507
4
- data.tar.gz: cf7908683f630f17ef39b4c7e6393cb4637e350ea89c674c78a03a68b1bdf1c5
3
+ metadata.gz: 9c49dc1624b1b955cad8032696308fc81357533113cf429f53fe063d5db0cab2
4
+ data.tar.gz: 2eee40a9054fe05d02a094828bb47a6cc3a5b65ca6fcb5864b9648cff5f0b418
5
5
  SHA512:
6
- metadata.gz: 7bc3f4c44814c1cf63ad480882ee7f2b647af3ee8b34b46d95da7ae90c372b656c0f69fd224ba1c03d26cfa5185ee89a356f05e080169375945fc0f8d3548d8d
7
- data.tar.gz: 5e8d6c3d234e298836f2f373d515321337d1f1ab1904a38e9dfb8a1e6076b6307e69e84d93b5fd27fd02f3f0642e19f8adc38b9de20e5d0ac87b47600b13de90
6
+ metadata.gz: 50ed45e0d813d776b79fce2e87593cdc88063158045c62ba76e1bc0e353fb1b27b2b03b87dc4253443b4b2a355fec179708d04e1920dc0f47730c271e3ff81cc
7
+ data.tar.gz: 9f86c50b3f5c4bd987c9bab987c5b139ce1dd998577d04ea3fba53d64b430d3cdcc0af3d880d73ebea1914341bc185fb78469f78d9521098f06510c7024b8746
data/CHANGELOG.md CHANGED
@@ -6,10 +6,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
6
6
 
7
7
  ## [Unreleased]
8
8
 
9
+ ## [0.3.0] - 2024-11-28
10
+ ### Added
11
+ - Add support for parsing files from URLs
12
+
13
+ ## [0.2.3] - 2024-11-28
14
+ ### Added
15
+ - Add support for all supported optional LlamaParse parameters to `parse_file`
16
+
17
+ [0.2.3]: https://github.com/horizing/llamaparserb/compare/v0.2.2...v0.2.3
18
+
9
19
  ## [0.2.2] - 2024-11-28
10
20
  ### Fixed
11
21
  - Fix issue with handling file path
12
22
 
23
+ [0.2.2]: https://github.com/horizing/llamaparserb/releases/tag/v0.2.2
24
+
13
25
  ## [0.2.1] - 2024-11-28
14
26
  ### Fixed
15
27
  - Fix parse_file to handle files that are not on the local filesystem
@@ -25,8 +37,4 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
25
37
 
26
38
  ## [0.1.0] - 2024-11-27
27
39
  ### Added
28
- - Initial release
29
-
30
- [Unreleased]: https://github.com/horizing/llamaparserb/compare/v0.1.0...HEAD
31
- [0.1.1]: https://github.com/horizing/llamaparserb/releases/tag/v0.1.1...v0.1.0
32
- [0.1.0]: https://github.com/horizing/llamaparserb/releases/tag/v0.1.0
40
+ - Initial release
data/README.md CHANGED
@@ -43,11 +43,14 @@ text = client.parse_file(file_content, 'pdf')
43
43
  # Parse a file to markdown
44
44
  client = Llamaparserb::Client.new(ENV['LLAMA_CLOUD_API_KEY'], result_type: "markdown")
45
45
  markdown = client.parse_file('path/to/document.pdf')
46
+
47
+ # Parse a file from a URL
48
+ markdown = client.parse_file('https://example.com/document.pdf')
46
49
  ```
47
50
 
48
51
  ### File Input Options
49
52
 
50
- The `parse_file` method accepts two types of inputs:
53
+ The `parse_file` method accepts three types of inputs:
51
54
 
52
55
  1. File path (String):
53
56
  ```ruby
@@ -69,25 +72,123 @@ temp_file = Tempfile.new(['document', '.pdf'])
69
72
  client.parse_file(temp_file, 'pdf')
70
73
  ```
71
74
 
75
+ 3. URL (String):
76
+ ```ruby
77
+ client.parse_file('https://example.com/document.pdf')
78
+ ```
79
+
72
80
  ### Advanced Options
73
81
 
74
82
  ```ruby
75
83
  client = Llamaparserb::Client.new(
76
84
  ENV['LLAMA_CLOUD_API_KEY'],
77
85
  {
78
- result_type: "markdown", # Output format: "text" or "markdown"
79
- num_workers: 4, # Number of workers for concurrent processing
80
- check_interval: 1, # How often to check job status (seconds)
81
- max_timeout: 2000, # Maximum time to wait for parsing (seconds)
82
- verbose: true, # Enable detailed logging
83
- language: :en, # Target language
84
- parsing_instruction: "", # Custom parsing instructions
85
- premium_mode: false, # Enable premium parsing features
86
- split_by_page: true # Split result by pages
86
+ # Basic Configuration
87
+ result_type: "markdown", # Output format: "text" or "markdown"
88
+ num_workers: 4, # Number of workers for concurrent processing
89
+ check_interval: 1, # How often to check job status (seconds)
90
+ max_timeout: 2000, # Maximum time to wait for parsing (seconds)
91
+ verbose: true, # Enable detailed logging
92
+ show_progress: true, # Show progress during parsing
93
+ ignore_errors: true, # Return nil instead of raising errors
94
+
95
+ # Language and Parsing Options
96
+ language: :en, # Target language for parsing
97
+ parsing_instruction: "", # Custom parsing instructions
98
+ skip_diagonal_text: false, # Skip diagonal text in documents
99
+ invalidate_cache: false, # Force reprocessing of cached documents
100
+ do_not_cache: false, # Disable caching of results
101
+
102
+ # Processing Modes
103
+ fast_mode: false, # Enable faster processing (may reduce quality)
104
+ premium_mode: false, # Enable premium parsing features
105
+ continuous_mode: false, # Process document as continuous text
106
+ do_not_unroll_columns: false, # Keep columnar text structure
107
+
108
+ # Page Handling
109
+ split_by_page: true, # Split result by pages
110
+ page_separator: "\n\n", # Custom page separator
111
+ page_prefix: "Page ", # Text to prepend to each page
112
+ page_suffix: "\n", # Text to append to each page
113
+ target_pages: [1,2,3], # Array of specific pages to process
114
+ bounding_box: { # Specify area to parse (coordinates in pixels)
115
+ x1: 0, y1: 0, # Top-left corner
116
+ x2: 612, y2: 792 # Bottom-right corner
117
+ },
118
+
119
+ # OCR and Image Processing
120
+ disable_ocr: false, # Disable Optical Character Recognition
121
+ take_screenshot: false, # Capture screenshot of document
122
+
123
+ # Advanced Processing Features
124
+ gpt4o_mode: false, # Enable GPT-4o mode (use the GPT-4o model for parsing)
125
+ gpt4o_api_key: "key", # API key used for GPT-4o mode
126
+ guess_xlsx_sheet_names: false, # Attempt to guess Excel sheet names
127
+ is_formatting_instruction: false, # Use formatting instructions
128
+ annotate_links: false, # Include link annotations in output
129
+
130
+ # Multimodal Processing
131
+ vendor_multimodal_api_key: "key", # API key for multimodal processing
132
+ use_vendor_multimodal_model: false, # Enable multimodal model
133
+ vendor_multimodal_model_name: "model", # Specify multimodal model
134
+
135
+ # Integration Options
136
+ webhook_url: "https://...", # URL for webhook notifications
137
+ http_proxy: "http://...", # HTTP proxy configuration
138
+
139
+ # Azure OpenAI Configuration
140
+ azure_openai_deployment_name: "deployment", # Azure OpenAI deployment name
141
+ azure_openai_endpoint: "endpoint", # Azure OpenAI endpoint
142
+ azure_openai_api_version: "2023-05-15", # Azure OpenAI API version
143
+ azure_openai_key: "key" # Azure OpenAI API key
87
144
  }
88
145
  )
89
146
  ```
90
147
 
148
+ ### Feature-Specific Options
149
+
150
+ #### Page Processing
151
+ - `split_by_page`: Split the document into separate pages
152
+ - `page_separator`: Custom text to insert between pages
153
+ - `page_prefix`/`page_suffix`: Add custom text before/after each page
154
+ - `target_pages`: Process only specific pages
155
+ - `bounding_box`: Parse only a specific area of the document
156
+
157
+ #### OCR and Image Processing
158
+ - `disable_ocr`: Turn off Optical Character Recognition
159
+ - `take_screenshot`: Generate document screenshots
160
+ - `skip_diagonal_text`: Ignore text at diagonal angles
161
+
162
+ #### Advanced Processing
163
+ - `continuous_mode`: Process text as a continuous stream
164
+ - `do_not_unroll_columns`: Preserve column structure
165
+ - `guess_xlsx_sheet_names`: Auto-detect Excel sheet names
166
+ - `annotate_links`: Include document hyperlinks in output
167
+ - `is_formatting_instruction`: Use special formatting instructions
168
+
169
+ #### Performance Options
170
+ - `fast_mode`: Faster processing with potential quality trade-offs
171
+ - `premium_mode`: Access to premium features
172
+ - `invalidate_cache`/`do_not_cache`: Control result caching
173
+ - `num_workers`: Configure concurrent processing
174
+
175
+ #### Integration Features
176
+ - `webhook_url`: Receive processing notifications
177
+ - `http_proxy`: Configure proxy settings
178
+
179
+ #### Azure OpenAI Integration
180
+ Configure Azure OpenAI services with:
181
+ - `azure_openai_deployment_name`
182
+ - `azure_openai_endpoint`
183
+ - `azure_openai_api_version`
184
+ - `azure_openai_key`
185
+
186
+ #### Multimodal Processing
187
+ Enable advanced multimodal processing with:
188
+ - `vendor_multimodal_api_key`
189
+ - `use_vendor_multimodal_model`
190
+ - `vendor_multimodal_model_name`
191
+
91
192
  ### Supported File Types
92
193
 
93
194
  The client supports a wide range of file formats including:
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Llamaparserb
4
- VERSION = "0.2.2"
4
+ VERSION = "0.3.0"
5
5
  end
data/lib/llamaparserb.rb CHANGED
@@ -51,6 +51,9 @@ module Llamaparserb
51
51
  elsif File.exist?(file_input)
52
52
  job_id = create_job_from_path(file_input)
53
53
  log "Started parsing file under job_id #{job_id}", :info
54
+ elsif URI::DEFAULT_PARSER.make_regexp.match?(file_input)
55
+ job_id = create_job_from_url(file_input)
56
+ log "Started parsing URL under job_id #{job_id}", :info
54
57
  else
55
58
  raise Error, "file_type parameter is required for binary string input"
56
59
  end
@@ -100,7 +103,20 @@ module Llamaparserb
100
103
  bounding_box: nil,
101
104
  target_pages: nil,
102
105
  ignore_errors: true,
103
- split_by_page: true
106
+ split_by_page: true,
107
+ vendor_multimodal_api_key: nil,
108
+ use_vendor_multimodal_model: false,
109
+ vendor_multimodal_model_name: nil,
110
+ take_screenshot: false,
111
+ disable_ocr: false,
112
+ is_formatting_instruction: false,
113
+ annotate_links: false,
114
+ webhook_url: nil,
115
+ azure_openai_deployment_name: nil,
116
+ azure_openai_endpoint: nil,
117
+ azure_openai_api_version: nil,
118
+ azure_openai_key: nil,
119
+ http_proxy: nil
104
120
  }
105
121
  end
106
122
 
@@ -184,7 +200,7 @@ module Llamaparserb
184
200
  def build_connection
185
201
  Faraday.new(url: base_url) do |f|
186
202
  f.request :multipart
187
- f.request :json
203
+ f.request :url_encoded
188
204
  f.response :json
189
205
  f.response :raise_error
190
206
  f.adapter Faraday.default_adapter
@@ -221,7 +237,13 @@ module Llamaparserb
221
237
  temp_file,
222
238
  detect_content_type(temp_file.path)
223
239
  )
224
- create_job(file)
240
+
241
+ response = @connection.post("upload") do |req|
242
+ req.headers["Authorization"] = "Bearer #{api_key}"
243
+ req.body = {file: file}
244
+ end
245
+
246
+ response.body["id"]
225
247
  ensure
226
248
  temp_file&.close
227
249
  temp_file&.unlink
@@ -236,9 +258,8 @@ module Llamaparserb
236
258
  response.body["id"]
237
259
  end
238
260
 
239
- def upload_params(file)
240
- {
241
- file: file,
261
+ def upload_params(file = nil, url = nil)
262
+ params = {
242
263
  language: @options[:language].to_s,
243
264
  parsing_instruction: @options[:parsing_instruction],
244
265
  invalidate_cache: @options[:invalidate_cache],
@@ -250,8 +271,36 @@ module Llamaparserb
250
271
  do_not_unroll_columns: @options[:do_not_unroll_columns],
251
272
  gpt4o_mode: @options[:gpt4o_mode],
252
273
  gpt4o_api_key: @options[:gpt4o_api_key],
274
+ vendor_multimodal_api_key: @options[:vendor_multimodal_api_key],
275
+ use_vendor_multimodal_model: @options[:use_vendor_multimodal_model],
276
+ vendor_multimodal_model_name: @options[:vendor_multimodal_model_name],
277
+ take_screenshot: @options[:take_screenshot],
278
+ disable_ocr: @options[:disable_ocr],
279
+ guess_xlsx_sheet_names: @options[:guess_xlsx_sheet_names],
280
+ is_formatting_instruction: @options[:is_formatting_instruction],
281
+ annotate_links: @options[:annotate_links],
253
282
  from_ruby_package: true
254
- }.compact
283
+ }
284
+
285
+ params[:page_separator] = @options[:page_separator] if @options[:page_separator]
286
+ params[:page_prefix] = @options[:page_prefix] if @options[:page_prefix]
287
+ params[:page_suffix] = @options[:page_suffix] if @options[:page_suffix]
288
+ params[:bounding_box] = @options[:bounding_box] if @options[:bounding_box]
289
+ params[:target_pages] = @options[:target_pages] if @options[:target_pages]
290
+ params[:webhook_url] = @options[:webhook_url] if @options[:webhook_url]
291
+ params[:azure_openai_deployment_name] = @options[:azure_openai_deployment_name] if @options[:azure_openai_deployment_name]
292
+ params[:azure_openai_endpoint] = @options[:azure_openai_endpoint] if @options[:azure_openai_endpoint]
293
+ params[:azure_openai_api_version] = @options[:azure_openai_api_version] if @options[:azure_openai_api_version]
294
+ params[:azure_openai_key] = @options[:azure_openai_key] if @options[:azure_openai_key]
295
+ params[:http_proxy] = @options[:http_proxy] if @options[:http_proxy]
296
+
297
+ if url
298
+ params[:input_url] = url.to_s
299
+ elsif file
300
+ params[:file] = file
301
+ end
302
+
303
+ params.compact
255
304
  end
256
305
 
257
306
  def get_job_status(job_id)
@@ -300,5 +349,20 @@ module Llamaparserb
300
349
  raise Error, "Unsupported file type: #{extension}. Supported types: #{SUPPORTED_FILE_TYPES.join(", ")}"
301
350
  end
302
351
  end
352
+
353
+ def create_job_from_url(url)
354
+ log "Creating job from URL: #{url}", :debug
355
+
356
+ response = @connection.post("upload") do |req|
357
+ req.headers["Authorization"] = "Bearer #{api_key}"
358
+ req.headers["Accept"] = "application/json"
359
+ # Create a simple form data request
360
+ req.options.timeout = 30 # Optional: add timeout
361
+ req.body = {"input_url" => url.to_s}
362
+ end
363
+
364
+ log "Response: #{response.body.inspect}", :debug
365
+ response.body["id"]
366
+ end
303
367
  end
304
368
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: llamaparserb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Heidar Bernhardsson