RubyGems - llamaparserb - Versions diffs - 0.2.2 → 0.3.0 - Mend

llamaparserb 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: a1761244b0d8ac8c6ee13eb10b50d3eadcb182d8e7bad24828ba70a290bc7507
-  data.tar.gz: cf7908683f630f17ef39b4c7e6393cb4637e350ea89c674c78a03a68b1bdf1c5
+  metadata.gz: 9c49dc1624b1b955cad8032696308fc81357533113cf429f53fe063d5db0cab2
+  data.tar.gz: 2eee40a9054fe05d02a094828bb47a6cc3a5b65ca6fcb5864b9648cff5f0b418
 SHA512:
-  metadata.gz: 7bc3f4c44814c1cf63ad480882ee7f2b647af3ee8b34b46d95da7ae90c372b656c0f69fd224ba1c03d26cfa5185ee89a356f05e080169375945fc0f8d3548d8d
-  data.tar.gz: 5e8d6c3d234e298836f2f373d515321337d1f1ab1904a38e9dfb8a1e6076b6307e69e84d93b5fd27fd02f3f0642e19f8adc38b9de20e5d0ac87b47600b13de90
+  metadata.gz: 50ed45e0d813d776b79fce2e87593cdc88063158045c62ba76e1bc0e353fb1b27b2b03b87dc4253443b4b2a355fec179708d04e1920dc0f47730c271e3ff81cc
+  data.tar.gz: 9f86c50b3f5c4bd987c9bab987c5b139ce1dd998577d04ea3fba53d64b430d3cdcc0af3d880d73ebea1914341bc185fb78469f78d9521098f06510c7024b8746

data/CHANGELOG.md CHANGED Viewed

@@ -6,10 +6,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
+## [0.3.0] - 2024-11-28
+### Added
+- Add support for parsing files from URLs
+## [0.2.3] - 2024-11-28
+### Added
+- Add support for all optional LlamaParse parameters to `parse_file`
+[0.2.3]: https://github.com/horizing/llamaparserb/compare/v0.2.2...v0.2.3
 ## [0.2.2] - 2024-11-28
 ### Fixed
 - Fix issue with handling file path
+[0.2.2]: https://github.com/horizing/llamaparserb/releases/tag/v0.2.2
 ## [0.2.1] - 2024-11-28
 ### Fixed
 - Fix parse_file to handle files that are not on the local filesystem
@@ -25,8 +37,4 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [0.1.0] - 2024-11-27
 ### Added
-- Initial release
-[Unreleased]: https://github.com/horizing/llamaparserb/compare/v0.1.0...HEAD
-[0.1.1]: https://github.com/horizing/llamaparserb/releases/tag/v0.1.1...v0.1.0
-[0.1.0]: https://github.com/horizing/llamaparserb/releases/tag/v0.1.0
+- Initial release

data/README.md CHANGED Viewed

@@ -43,11 +43,14 @@ text = client.parse_file(file_content, 'pdf')
 # Parse a file to markdown
 client = Llamaparserb::Client.new(ENV['LLAMA_CLOUD_API_KEY'], result_type: "markdown")
 markdown = client.parse_file('path/to/document.pdf')
+# Parse a file from a URL
+markdown = client.parse_file('https://example.com/document.pdf')
 ```
 ### File Input Options
-The `parse_file` method accepts two types of inputs:
+The `parse_file` method accepts three types of inputs:
 1. File path (String):
 ```ruby
@@ -69,25 +72,123 @@ temp_file = Tempfile.new(['document', '.pdf'])
 client.parse_file(temp_file, 'pdf')
 ```
+3. URL (String):
+```ruby
+client.parse_file('https://example.com/document.pdf')
+```
 ### Advanced Options
 ```ruby
 client = Llamaparserb::Client.new(
   ENV['LLAMA_CLOUD_API_KEY'],
   {
-    result_type: "markdown",  # Output format: "text" or "markdown"
-    num_workers: 4,           # Number of workers for concurrent processing
-    check_interval: 1,        # How often to check job status (seconds)
-    max_timeout: 2000,        # Maximum time to wait for parsing (seconds)
-    verbose: true,            # Enable detailed logging
-    language: :en,            # Target language
-    parsing_instruction: "",  # Custom parsing instructions
-    premium_mode: false,      # Enable premium parsing features
-    split_by_page: true       # Split result by pages
+    # Basic Configuration
+    result_type: "markdown",    # Output format: "text" or "markdown"
+    num_workers: 4,             # Number of workers for concurrent processing
+    check_interval: 1,          # How often to check job status (seconds)
+    max_timeout: 2000,          # Maximum time to wait for parsing (seconds)
+    verbose: true,              # Enable detailed logging
+    show_progress: true,        # Show progress during parsing
+    ignore_errors: true,        # Return nil instead of raising errors
+    # Language and Parsing Options
+    language: :en,              # Target language for parsing
+    parsing_instruction: "",    # Custom parsing instructions
+    skip_diagonal_text: false,  # Skip diagonal text in documents
+    invalidate_cache: false,    # Force reprocessing of cached documents
+    do_not_cache: false,        # Disable caching of results
+    # Processing Modes
+    fast_mode: false,          # Enable faster processing (may reduce quality)
+    premium_mode: false,       # Enable premium parsing features
+    continuous_mode: false,    # Process document as continuous text
+    do_not_unroll_columns: false, # Keep columnar text structure
+    # Page Handling
+    split_by_page: true,       # Split result by pages
+    page_separator: "\n\n",    # Custom page separator
+    page_prefix: "Page ",      # Text to prepend to each page
+    page_suffix: "\n",         # Text to append to each page
+    target_pages: [1,2,3],     # Array of specific pages to process
+    bounding_box: {            # Specify area to parse (coordinates in pixels)
+      x1: 0, y1: 0,           # Top-left corner
+      x2: 612, y2: 792        # Bottom-right corner
+    },
+    # OCR and Image Processing
+    disable_ocr: false,        # Disable Optical Character Recognition
+    take_screenshot: false,    # Capture screenshot of document
+    # Advanced Processing Features
+    gpt4o_mode: false,         # Enable GPT-4o mode
+    gpt4o_api_key: "key",      # API key for GPT-4o mode
+    guess_xlsx_sheet_names: false, # Attempt to guess Excel sheet names
+    is_formatting_instruction: false, # Use formatting instructions
+    annotate_links: false,     # Include link annotations in output
+    # Multimodal Processing
+    vendor_multimodal_api_key: "key",      # API key for multimodal processing
+    use_vendor_multimodal_model: false,     # Enable multimodal model
+    vendor_multimodal_model_name: "model",  # Specify multimodal model
+    # Integration Options
+    webhook_url: "https://...", # URL for webhook notifications
+    http_proxy: "http://...",   # HTTP proxy configuration
+    # Azure OpenAI Configuration
+    azure_openai_deployment_name: "deployment", # Azure OpenAI deployment name
+    azure_openai_endpoint: "endpoint",         # Azure OpenAI endpoint
+    azure_openai_api_version: "2023-05-15",    # Azure OpenAI API version
+    azure_openai_key: "key"                    # Azure OpenAI API key
   }
 )
 ```
+### Feature-Specific Options
+#### Page Processing
+- `split_by_page`: Split the document into separate pages
+- `page_separator`: Custom text to insert between pages
+- `page_prefix`/`page_suffix`: Add custom text before/after each page
+- `target_pages`: Process only specific pages
+- `bounding_box`: Parse only a specific area of the document
+#### OCR and Image Processing
+- `disable_ocr`: Turn off Optical Character Recognition
+- `take_screenshot`: Generate document screenshots
+- `skip_diagonal_text`: Ignore text at diagonal angles
+#### Advanced Processing
+- `continuous_mode`: Process text as a continuous stream
+- `do_not_unroll_columns`: Preserve column structure
+- `guess_xlsx_sheet_names`: Auto-detect Excel sheet names
+- `annotate_links`: Include document hyperlinks in output
+- `is_formatting_instruction`: Use special formatting instructions
+#### Performance Options
+- `fast_mode`: Faster processing with potential quality trade-offs
+- `premium_mode`: Access to premium features
+- `invalidate_cache`/`do_not_cache`: Control result caching
+- `num_workers`: Configure concurrent processing
+#### Integration Features
+- `webhook_url`: Receive processing notifications
+- `http_proxy`: Configure proxy settings
+#### Azure OpenAI Integration
+Configure Azure OpenAI services with:
+- `azure_openai_deployment_name`
+- `azure_openai_endpoint`
+- `azure_openai_api_version`
+- `azure_openai_key`
+#### Multimodal Processing
+Enable advanced multimodal processing with:
+- `vendor_multimodal_api_key`
+- `use_vendor_multimodal_model`
+- `vendor_multimodal_model_name`
 ### Supported File Types
 The client supports a wide range of file formats including:

data/lib/llamaparserb/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Llamaparserb
-  VERSION = "0.2.2"
+  VERSION = "0.3.0"
 end

data/lib/llamaparserb.rb CHANGED Viewed

@@ -51,6 +51,9 @@ module Llamaparserb
         elsif File.exist?(file_input)
           job_id = create_job_from_path(file_input)
           log "Started parsing file under job_id #{job_id}", :info
+        elsif URI::DEFAULT_PARSER.make_regexp.match?(file_input)
+          job_id = create_job_from_url(file_input)
+          log "Started parsing URL under job_id #{job_id}", :info
         else
           raise Error, "file_type parameter is required for binary string input"
         end
@@ -100,7 +103,20 @@ module Llamaparserb
         bounding_box: nil,
         target_pages: nil,
         ignore_errors: true,
-        split_by_page: true
+        split_by_page: true,
+        vendor_multimodal_api_key: nil,
+        use_vendor_multimodal_model: false,
+        vendor_multimodal_model_name: nil,
+        take_screenshot: false,
+        disable_ocr: false,
+        is_formatting_instruction: false,
+        annotate_links: false,
+        webhook_url: nil,
+        azure_openai_deployment_name: nil,
+        azure_openai_endpoint: nil,
+        azure_openai_api_version: nil,
+        azure_openai_key: nil,
+        http_proxy: nil
       }
     end
@@ -184,7 +200,7 @@ module Llamaparserb
     def build_connection
       Faraday.new(url: base_url) do |f|
         f.request :multipart
-        f.request :json
+        f.request :url_encoded
         f.response :json
         f.response :raise_error
         f.adapter Faraday.default_adapter
@@ -221,7 +237,13 @@ module Llamaparserb
         temp_file,
         detect_content_type(temp_file.path)
       )
-      create_job(file)
+      response = @connection.post("upload") do |req|
+        req.headers["Authorization"] = "Bearer #{api_key}"
+        req.body = {file: file}
+      end
+      response.body["id"]
     ensure
       temp_file&.close
       temp_file&.unlink
@@ -236,9 +258,8 @@ module Llamaparserb
       response.body["id"]
     end
-    def upload_params(file)
-      {
-        file: file,
+    def upload_params(file = nil, url = nil)
+      params = {
         language: @options[:language].to_s,
         parsing_instruction: @options[:parsing_instruction],
         invalidate_cache: @options[:invalidate_cache],
@@ -250,8 +271,36 @@ module Llamaparserb
         do_not_unroll_columns: @options[:do_not_unroll_columns],
         gpt4o_mode: @options[:gpt4o_mode],
         gpt4o_api_key: @options[:gpt4o_api_key],
+        vendor_multimodal_api_key: @options[:vendor_multimodal_api_key],
+        use_vendor_multimodal_model: @options[:use_vendor_multimodal_model],
+        vendor_multimodal_model_name: @options[:vendor_multimodal_model_name],
+        take_screenshot: @options[:take_screenshot],
+        disable_ocr: @options[:disable_ocr],
+        guess_xlsx_sheet_names: @options[:guess_xlsx_sheet_names],
+        is_formatting_instruction: @options[:is_formatting_instruction],
+        annotate_links: @options[:annotate_links],
         from_ruby_package: true
-      }.compact
+      }
+      params[:page_separator] = @options[:page_separator] if @options[:page_separator]
+      params[:page_prefix] = @options[:page_prefix] if @options[:page_prefix]
+      params[:page_suffix] = @options[:page_suffix] if @options[:page_suffix]
+      params[:bounding_box] = @options[:bounding_box] if @options[:bounding_box]
+      params[:target_pages] = @options[:target_pages] if @options[:target_pages]
+      params[:webhook_url] = @options[:webhook_url] if @options[:webhook_url]
+      params[:azure_openai_deployment_name] = @options[:azure_openai_deployment_name] if @options[:azure_openai_deployment_name]
+      params[:azure_openai_endpoint] = @options[:azure_openai_endpoint] if @options[:azure_openai_endpoint]
+      params[:azure_openai_api_version] = @options[:azure_openai_api_version] if @options[:azure_openai_api_version]
+      params[:azure_openai_key] = @options[:azure_openai_key] if @options[:azure_openai_key]
+      params[:http_proxy] = @options[:http_proxy] if @options[:http_proxy]
+      if url
+        params[:input_url] = url.to_s
+      elsif file
+        params[:file] = file
+      end
+      params.compact
     end
     def get_job_status(job_id)
@@ -300,5 +349,20 @@ module Llamaparserb
         raise Error, "Unsupported file type: #{extension}. Supported types: #{SUPPORTED_FILE_TYPES.join(", ")}"
       end
     end
+    def create_job_from_url(url)
+      log "Creating job from URL: #{url}", :debug
+      response = @connection.post("upload") do |req|
+        req.headers["Authorization"] = "Bearer #{api_key}"
+        req.headers["Accept"] = "application/json"
+        # Create a simple form data request
+        req.options.timeout = 30  # Optional: add timeout
+        req.body = {"input_url" => url.to_s}
+      end
+      log "Response: #{response.body.inspect}", :debug
+      response.body["id"]
+    end
   end
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: llamaparserb
 version: !ruby/object:Gem::Version
-  version: 0.2.2
+  version: 0.3.0
 platform: ruby
 authors:
 - Heidar Bernhardsson