RubyGems - llamaparserb - Versions diffs - 0.2.1 → 0.2.3 - Mend

llamaparserb 0.2.1 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 7bc729e8371ed2f748eaef9a92b82380c2e7d674caf6c4fe1b17e7e53d8ea62a
-  data.tar.gz: 8f510a5be8efc2877b617bd7e4682a4cb0e92c4fd794e6e6ef036481f3d57284
+  metadata.gz: ba81bbf8d24dc79b57a29c8c40764c42d700012e6608d1a494075dd63900d06f
+  data.tar.gz: 1ce8846e182bf7025d90e8722148554fe2e65d47d82e1671e19f7e6386d30ce9
 SHA512:
-  metadata.gz: 5e6c3d6df9c69da63cf631a296c9618c074fc127c48cc23b398237dc015f4cb4d2bf4d2f4f50b7ed380ac053ff98c19ea9fc3530d890db34cf090e2084fb0821
-  data.tar.gz: 04b10a441f82670dc58d873d586cdd043b94f1a477b2d72b6494128d9ab76cd2f8656b1c5db0ad61487d4efe51dbc42e63b047c8101f6f74593d29e098965b52
+  metadata.gz: b5c86e77644210049df9a1095049e2a276f70e40208637e80fd14283fded8eee45ec034cc9bd7c205b802ef24be252989bbd5be671ac50981c0acb998876131b
+  data.tar.gz: 91ea52459cc1fc38f15dd5b050a2c25449147f8905d24af34c76986f48c84f54d0187bad3c0d3b4da81061e8d593982252229611592948deffdc7a6d6d6c066f

data/CHANGELOG.md CHANGED Viewed

@@ -6,6 +6,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
+## [0.2.3] - 2024-11-28
+### Added
+- Add support for all supported optional LlamaParse parameters to `parse_file`
+## [0.2.2] - 2024-11-28
+### Fixed
+- Fix issue with handling file path
 ## [0.2.1] - 2024-11-28
 ### Fixed
 - Fix parse_file to handle files that are not on the local filesystem

data/README.md CHANGED Viewed

@@ -75,19 +75,112 @@ client.parse_file(temp_file, 'pdf')
 client = Llamaparserb::Client.new(
   ENV['LLAMA_CLOUD_API_KEY'],
   {
-    result_type: "markdown",  # Output format: "text" or "markdown"
-    num_workers: 4,           # Number of workers for concurrent processing
-    check_interval: 1,        # How often to check job status (seconds)
-    max_timeout: 2000,        # Maximum time to wait for parsing (seconds)
-    verbose: true,            # Enable detailed logging
-    language: :en,            # Target language
-    parsing_instruction: "",  # Custom parsing instructions
-    premium_mode: false,      # Enable premium parsing features
-    split_by_page: true       # Split result by pages
+    # Basic Configuration
+    result_type: "markdown",    # Output format: "text" or "markdown"
+    num_workers: 4,             # Number of workers for concurrent processing
+    check_interval: 1,          # How often to check job status (seconds)
+    max_timeout: 2000,          # Maximum time to wait for parsing (seconds)
+    verbose: true,              # Enable detailed logging
+    show_progress: true,        # Show progress during parsing
+    ignore_errors: true,        # Return nil instead of raising errors
+    # Language and Parsing Options
+    language: :en,              # Target language for parsing
+    parsing_instruction: "",    # Custom parsing instructions
+    skip_diagonal_text: false,  # Skip diagonal text in documents
+    invalidate_cache: false,    # Force reprocessing of cached documents
+    do_not_cache: false,        # Disable caching of results
+    # Processing Modes
+    fast_mode: false,          # Enable faster processing (may reduce quality)
+    premium_mode: false,       # Enable premium parsing features
+    continuous_mode: false,    # Process document as continuous text
+    do_not_unroll_columns: false, # Keep columnar text structure
+    # Page Handling
+    split_by_page: true,       # Split result by pages
+    page_separator: "\n\n",    # Custom page separator
+    page_prefix: "Page ",      # Text to prepend to each page
+    page_suffix: "\n",         # Text to append to each page
+    target_pages: [1,2,3],     # Array of specific pages to process
+    bounding_box: {            # Specify area to parse (coordinates in pixels)
+      x1: 0, y1: 0,           # Top-left corner
+      x2: 612, y2: 792        # Bottom-right corner
+    },
+    # OCR and Image Processing
+    disable_ocr: false,        # Disable Optical Character Recognition
+    take_screenshot: false,    # Capture screenshot of document
+    # Advanced Processing Features
+    gpt4o_mode: false,         # Enable GPT-4o mode (parse with the GPT-4o model)
+    gpt4o_api_key: "key",      # API key for GPT-4o
+    guess_xlsx_sheet_names: false, # Attempt to guess Excel sheet names
+    is_formatting_instruction: false, # Use formatting instructions
+    annotate_links: false,     # Include link annotations in output
+    # Multimodal Processing
+    vendor_multimodal_api_key: "key",      # API key for multimodal processing
+    use_vendor_multimodal_model: false,     # Enable multimodal model
+    vendor_multimodal_model_name: "model",  # Specify multimodal model
+    # Integration Options
+    webhook_url: "https://...", # URL for webhook notifications
+    http_proxy: "http://...",   # HTTP proxy configuration
+    # Azure OpenAI Configuration
+    azure_openai_deployment_name: "deployment", # Azure OpenAI deployment name
+    azure_openai_endpoint: "endpoint",         # Azure OpenAI endpoint
+    azure_openai_api_version: "2023-05-15",    # Azure OpenAI API version
+    azure_openai_key: "key"                    # Azure OpenAI API key
   }
 )
 ```
+### Feature-Specific Options
+#### Page Processing
+- `split_by_page`: Split the document into separate pages
+- `page_separator`: Custom text to insert between pages
+- `page_prefix`/`page_suffix`: Add custom text before/after each page
+- `target_pages`: Process only specific pages
+- `bounding_box`: Parse only a specific area of the document
+#### OCR and Image Processing
+- `disable_ocr`: Turn off Optical Character Recognition
+- `take_screenshot`: Generate document screenshots
+- `skip_diagonal_text`: Ignore text at diagonal angles
+#### Advanced Processing
+- `continuous_mode`: Process text as a continuous stream
+- `do_not_unroll_columns`: Preserve column structure
+- `guess_xlsx_sheet_names`: Auto-detect Excel sheet names
+- `annotate_links`: Include document hyperlinks in output
+- `is_formatting_instruction`: Use special formatting instructions
+#### Performance Options
+- `fast_mode`: Faster processing with potential quality trade-offs
+- `premium_mode`: Access to premium features
+- `invalidate_cache`/`do_not_cache`: Control result caching
+- `num_workers`: Configure concurrent processing
+#### Integration Features
+- `webhook_url`: Receive processing notifications
+- `http_proxy`: Configure proxy settings
+#### Azure OpenAI Integration
+Configure Azure OpenAI services with:
+- `azure_openai_deployment_name`
+- `azure_openai_endpoint`
+- `azure_openai_api_version`
+- `azure_openai_key`
+#### Multimodal Processing
+Enable advanced multimodal processing with:
+- `vendor_multimodal_api_key`
+- `use_vendor_multimodal_model`
+- `vendor_multimodal_model_name`
 ### Supported File Types
 The client supports a wide range of file formats including:

data/lib/llamaparserb/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Llamaparserb
-  VERSION = "0.2.1"
+  VERSION = "0.2.3"
 end

data/lib/llamaparserb.rb CHANGED Viewed

@@ -100,7 +100,20 @@ module Llamaparserb
         bounding_box: nil,
         target_pages: nil,
         ignore_errors: true,
-        split_by_page: true
+        split_by_page: true,
+        vendor_multimodal_api_key: nil,
+        use_vendor_multimodal_model: false,
+        vendor_multimodal_model_name: nil,
+        take_screenshot: false,
+        disable_ocr: false,
+        is_formatting_instruction: false,
+        annotate_links: false,
+        webhook_url: nil,
+        azure_openai_deployment_name: nil,
+        azure_openai_endpoint: nil,
+        azure_openai_api_version: nil,
+        azure_openai_key: nil,
+        http_proxy: nil
       }
     end
@@ -168,10 +181,10 @@ module Llamaparserb
     def handle_error(error, file_input)
       if @options[:ignore_errors]
-        safe_message = if file_input.is_a?(String) && !File.exist?(file_input)
-          "binary data"
+        safe_message = if file_input.is_a?(String) && file_input.start_with?("/")
+          "file path: #{file_input}"
         else
-          file_input.class.to_s
+          "binary data"
         end
         log "Error while parsing file (#{safe_message}): #{error.message}", :error
@@ -237,7 +250,7 @@ module Llamaparserb
     end
     def upload_params(file)
-      {
+      params = {
         file: file,
         language: @options[:language].to_s,
         parsing_instruction: @options[:parsing_instruction],
@@ -250,8 +263,30 @@ module Llamaparserb
         do_not_unroll_columns: @options[:do_not_unroll_columns],
         gpt4o_mode: @options[:gpt4o_mode],
         gpt4o_api_key: @options[:gpt4o_api_key],
+        vendor_multimodal_api_key: @options[:vendor_multimodal_api_key],
+        use_vendor_multimodal_model: @options[:use_vendor_multimodal_model],
+        vendor_multimodal_model_name: @options[:vendor_multimodal_model_name],
+        take_screenshot: @options[:take_screenshot],
+        disable_ocr: @options[:disable_ocr],
+        guess_xlsx_sheet_names: @options[:guess_xlsx_sheet_names],
+        is_formatting_instruction: @options[:is_formatting_instruction],
+        annotate_links: @options[:annotate_links],
         from_ruby_package: true
-      }.compact
+      }
+      params[:page_separator] = @options[:page_separator] if @options[:page_separator]
+      params[:page_prefix] = @options[:page_prefix] if @options[:page_prefix]
+      params[:page_suffix] = @options[:page_suffix] if @options[:page_suffix]
+      params[:bounding_box] = @options[:bounding_box] if @options[:bounding_box]
+      params[:target_pages] = @options[:target_pages] if @options[:target_pages]
+      params[:webhook_url] = @options[:webhook_url] if @options[:webhook_url]
+      params[:azure_openai_deployment_name] = @options[:azure_openai_deployment_name] if @options[:azure_openai_deployment_name]
+      params[:azure_openai_endpoint] = @options[:azure_openai_endpoint] if @options[:azure_openai_endpoint]
+      params[:azure_openai_api_version] = @options[:azure_openai_api_version] if @options[:azure_openai_api_version]
+      params[:azure_openai_key] = @options[:azure_openai_key] if @options[:azure_openai_key]
+      params[:http_proxy] = @options[:http_proxy] if @options[:http_proxy]
+      params.compact
     end
     def get_job_status(job_id)

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: llamaparserb
 version: !ruby/object:Gem::Version
-  version: 0.2.1
+  version: 0.2.3
 platform: ruby
 authors:
 - Heidar Bernhardsson