llamaparserb 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: c22d4883933d23de15c34dd65a2e851c7294ab934fcb43be1fd79cafc8c95515
4
+ data.tar.gz: 6cde8b62919e0a73ccc7e73f00231c5711552f7a7dad1a8e29cccbc8c55551d4
5
+ SHA512:
6
+ metadata.gz: 54fee6b5080d020caf2f28f77645336fea1fdbcace4442ffe2f842bb3101e63f0df62102b45e3af6caaae51a1d0c83cff5278fc870f3e1b0fb81ee36e4c7e0fd
7
+ data.tar.gz: ac05eeeaaceb34c47490797d8966ac12a3fd6e03050b1a4e51fe77ed8ff813d6deaaf3a85ba32219a47b1e53dd9ea25e51e91e1d582aa4f3e1d6c64d2f99c97d
data/CHANGELOG.md ADDED
@@ -0,0 +1,14 @@
1
+ # Changelog
2
+ All notable changes to this project will be documented in this file.
3
+
4
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
5
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
6
+
7
+ ## [Unreleased]
8
+
9
+ ## [0.1.0] - 2024-11-27
10
+ ### Added
11
+ - Initial release
12
+
13
+ [Unreleased]: https://github.com/heidar/llamaparserb/compare/v0.1.0...HEAD
14
+ [0.1.0]: https://github.com/heidar/llamaparserb/releases/tag/v0.1.0
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2024 Heidar Bernhardsson
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,114 @@
1
+ # Llamaparserb
2
+
3
+ A Ruby client for the LlamaIndex Parsing API. This gem allows you to easily parse various document formats (PDF, DOCX, etc.) into text or markdown. Loosely based on the Python version.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ ```ruby
10
+ gem 'llamaparserb'
11
+ ```
12
+
13
+ And then execute:
14
+
15
+ ```bash
16
+ $ bundle install
17
+ ```
18
+
19
+ Or install it yourself as:
20
+
21
+ ```bash
22
+ $ gem install llamaparserb
23
+ ```
24
+
25
+ ## Usage
26
+
27
+ ### Basic Usage
28
+
29
+ ```ruby
30
+ require 'llamaparserb'
31
+
32
+ # Initialize client with API key
33
+ client = Llamaparserb::Client.new(ENV['LLAMA_CLOUD_API_KEY'])
34
+
35
+ # Parse a file to text (default)
36
+ text = client.parse_file('path/to/document.pdf')
37
+
38
+ # Parse a file to markdown
39
+ client = Llamaparserb::Client.new(ENV['LLAMA_CLOUD_API_KEY'], result_type: "markdown")
40
+ markdown = client.parse_file('path/to/document.pdf')
41
+ ```
42
+
43
+ ### Advanced Options
44
+
45
+ ```ruby
46
+ client = Llamaparserb::Client.new(
47
+ ENV['LLAMA_CLOUD_API_KEY'],
48
+ {
49
+ result_type: "markdown", # Output format: "text" or "markdown"
50
+ num_workers: 4, # Number of workers for concurrent processing
51
+ check_interval: 1, # How often to check job status (seconds)
52
+ max_timeout: 2000, # Maximum time to wait for parsing (seconds)
53
+ verbose: true, # Enable detailed logging
54
+ language: :en, # Target language
55
+ parsing_instruction: "", # Custom parsing instructions
56
+ premium_mode: false, # Enable premium parsing features
57
+ split_by_page: true # Split result by pages
58
+ }
59
+ )
60
+ ```
61
+
62
+ ### Supported File Types
63
+
64
+ The client supports a wide range of file formats including:
65
+ - Documents: PDF, DOCX, DOC, RTF, TXT
66
+ - Presentations: PPT, PPTX
67
+ - Spreadsheets: XLS, XLSX, CSV
68
+ - Images: JPG, PNG, TIFF
69
+ - And many more
70
+
71
+ See the `Llamaparserb::Client::SUPPORTED_FILE_TYPES` constant for the complete list.
72
+
73
+ ## Error Handling
74
+
75
+ By default (`ignore_errors: true`), `parse_file` returns `nil` and logs an error message if something goes wrong. Set `ignore_errors: false` to have the error raised instead:
76
+
77
+ ```ruby
78
+ # Raise errors instead of returning nil
79
+ client = Llamaparserb::Client.new(api_key, ignore_errors: false)
80
+ ```
81
+
82
+ ## Logging
83
+
84
+ By default, the client uses Ruby's standard Logger with output to STDOUT. You can configure logging in several ways:
85
+
86
+ ```ruby
87
+ # Use default logger with debug level output
88
+ client = Llamaparserb::Client.new(api_key, verbose: true)
89
+
90
+ # Use default logger at info level; with verbose off the client suppresses its routine (info/debug) log messages
91
+ client = Llamaparserb::Client.new(api_key, verbose: false)
92
+
93
+ # Use custom logger
94
+ custom_logger = Logger.new('llamaparse.log')
95
+ custom_logger.level = Logger::INFO
96
+ client = Llamaparserb::Client.new(api_key, logger: custom_logger)
97
+
98
+ # Use Rails logger in a Rails app
99
+ client = Llamaparserb::Client.new(api_key, logger: Rails.logger)
100
+ ```
101
+
102
+ ## Development
103
+
104
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
105
+
106
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
107
+
108
+ ## Contributing
109
+
110
+ Bug reports and pull requests are welcome on GitHub at https://github.com/heidar/llamaparserb.
111
+
112
+ ## License
113
+
114
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
# Gem namespace; VERSION is the version string of this release of the gem.
+ module Llamaparserb
4
+ VERSION = "0.1.0"
5
+ end
@@ -0,0 +1,242 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "llamaparserb/version"
4
+ require "faraday"
5
+ require "faraday/multipart"
6
+ require "json"
7
+ require "mime/types"
8
+ require "uri"
9
+ require "async"
10
+ require "logger"
11
+
12
module Llamaparserb
  # Error raised by this gem for configuration, validation, job-failure and
  # timeout problems.
  class Error < StandardError; end

  # Client for the LlamaIndex parsing API. It uploads a file, polls the
  # resulting job until it finishes, and returns the parsed content.
  class Client
    DEFAULT_BASE_URL = "https://api.cloud.llamaindex.ai/api/parsing"
    DEFAULT_SEPARATOR = "\n---\n"
    # Job statuses that mean the result is ready to be fetched.
    VALID_STATUSES = ["SUCCESS", "COMPLETED"].freeze
    # Lower-case file extensions accepted by #parse_file.
    SUPPORTED_FILE_TYPES = [
      ".pdf", ".602", ".abw", ".cgm", ".cwk", ".doc", ".docx", ".docm", ".dot",
      ".dotm", ".hwp", ".key", ".lwp", ".mw", ".mcw", ".pages", ".pbd", ".ppt",
      ".pptm", ".pptx", ".pot", ".potm", ".potx", ".rtf", ".sda", ".sdd", ".sdp",
      ".sdw", ".sgl", ".sti", ".sxi", ".sxw", ".stw", ".sxg", ".txt", ".uof",
      ".uop", ".uot", ".vor", ".wpd", ".wps", ".xml", ".zabw", ".epub", ".jpg",
      ".jpeg", ".png", ".gif", ".bmp", ".svg", ".tiff", ".webp", ".htm", ".html",
      ".xlsx", ".xls", ".xlsm", ".xlsb", ".xlw", ".csv", ".dif", ".sylk", ".slk",
      ".prn", ".numbers", ".et", ".ods", ".fods", ".uos1", ".uos2", ".dbf",
      ".wk1", ".wk2", ".wk3", ".wk4", ".wks", ".123", ".wq1", ".wq2", ".wb1",
      ".wb2", ".wb3", ".qpw", ".xlr", ".eth", ".tsv"
    ].freeze
    # Levels that are logged even when the :verbose option is off, so that
    # failures swallowed by :ignore_errors are never completely silent.
    ALWAYS_LOGGED_LEVELS = %i[warn error].freeze

    attr_reader :api_key, :base_url, :options, :logger

    # @param api_key [String, nil] API key; when nil or blank, falls back to
    #   ENV["LLAMA_CLOUD_API_KEY"]
    # @param options [Hash] overrides for #default_options, plus :base_url and
    #   :logger; string keys are accepted and converted to symbols
    # @raise [Error] if no non-blank API key is available
    def initialize(api_key = nil, options = {})
      # An empty or whitespace-only key is treated the same as a missing one.
      @api_key = [api_key, ENV["LLAMA_CLOUD_API_KEY"]].find { |key| !key.to_s.strip.empty? }
      raise Error, "API key is required" unless @api_key

      options = symbolize_keys(options || {})
      @base_url = options[:base_url] || ENV["LLAMA_CLOUD_BASE_URL"] || DEFAULT_BASE_URL
      @options = default_options.merge(options)
      # default_logger reads @options[:verbose], so @options must be set first.
      @logger = @options[:logger] || default_logger
      @connection = build_connection
    end

    # Uploads the file, waits for the parsing job to finish and fetches the
    # result.
    #
    # @param file_path [String] path to a file with a supported extension
    # @return [Object, nil] the parsed content, or nil when an error occurred
    #   and the :ignore_errors option is truthy
    # @raise [StandardError] the original error when :ignore_errors is falsy
    def parse_file(file_path)
      job_id = create_job(file_path)
      log "Started parsing file under job_id #{job_id}", :info

      wait_for_completion(job_id)

      result = get_result(job_id)
      log "Successfully retrieved result", :info
      result
    rescue => e
      handle_error(e, file_path)
    end

    private

    # Option defaults. Only a subset of these is forwarded to the API (see
    # #upload_params); the others are read locally or not used by this class.
    def default_options
      {
        result_type: :text,
        num_workers: 4,
        check_interval: 1,
        max_timeout: 2000,
        verbose: true,
        show_progress: true,
        language: :en,
        parsing_instruction: "",
        skip_diagonal_text: false,
        invalidate_cache: false,
        do_not_cache: false,
        fast_mode: false,
        premium_mode: false,
        continuous_mode: false,
        do_not_unroll_columns: false,
        page_separator: nil,
        page_prefix: nil,
        page_suffix: nil,
        gpt4o_mode: false,
        gpt4o_api_key: nil,
        guess_xlsx_sheet_names: false,
        bounding_box: nil,
        target_pages: nil,
        ignore_errors: true,
        split_by_page: true
      }
    end

    # Returns a copy of +hash+ whose String keys are converted to Symbols;
    # keys that cannot be converted are kept unchanged.
    def symbolize_keys(hash)
      hash.to_h { |key, value| [key.respond_to?(:to_sym) ? key.to_sym : key, value] }
    end

    # Builds the fallback logger: writes bare messages to $stdout, at DEBUG
    # level when :verbose is truthy and INFO otherwise.
    def default_logger
      logger = Logger.new($stdout)
      logger.level = @options[:verbose] ? Logger::DEBUG : Logger::INFO
      logger.formatter = proc { |_severity, _datetime, _progname, msg| "#{msg}\n" }
      logger
    end

    # Sends +message+ to the logger at +level+ (:info, :warn, :error; anything
    # else is logged as debug). Info and debug messages are emitted only when
    # :verbose is truthy; warnings and errors are always emitted.
    def log(message, level = :debug)
      return unless @options[:verbose] || ALWAYS_LOGGED_LEVELS.include?(level)

      case level
      when :info
        logger.info(message)
      when :warn
        logger.warn(message)
      when :error
        logger.error(message)
      else
        logger.debug(message)
      end
    end

    # Current reading of the monotonic clock, in seconds. Used for measuring
    # elapsed time so wall-clock adjustments cannot distort the timeout.
    def monotonic_now
      Process.clock_gettime(Process::CLOCK_MONOTONIC)
    end

    # Polls the job every :check_interval seconds until it reaches one of
    # VALID_STATUSES.
    #
    # @raise [Error] when the job reports ERROR or an unexpected status, or
    #   when :max_timeout seconds pass without completion
    def wait_for_completion(job_id)
      start_time = monotonic_now

      loop do
        sleep(@options[:check_interval])
        response = get_job_status(job_id)
        log "Status: #{response["status"]}", :debug

        # Completion is checked before the timeout so that a job which has
        # just finished is never reported as timed out.
        break if job_completed?(response)

        handle_error_status(response, job_id)
        check_timeout(start_time, job_id)
      end
    end

    # True when the status response carries one of VALID_STATUSES.
    def job_completed?(response)
      VALID_STATUSES.include?(response["status"])
    end

    # Raises Error once more than :max_timeout seconds have elapsed since
    # +start_time+ (a #monotonic_now reading).
    def check_timeout(start_time, job_id)
      return unless monotonic_now - start_time > @options[:max_timeout]

      raise Error, "Job #{job_id} timed out after #{@options[:max_timeout]} seconds"
    end

    # Raises Error for an "ERROR" status and for any status other than
    # "PENDING"; returns quietly while the job is still pending.
    def handle_error_status(response, job_id)
      if response["status"] == "ERROR"
        error_code = response["error_code"] || "No error code found"
        error_message = response["error_message"] || "No error message found"
        raise Error, "Job failed: #{error_code} - #{error_message}"
      end

      unless response["status"] == "PENDING"
        raise Error, "Unexpected status: #{response["status"]}"
      end
    end

    # With :ignore_errors truthy, logs the error and returns nil; otherwise
    # re-raises the original error.
    def handle_error(error, file_path)
      if @options[:ignore_errors]
        log "Error while parsing file '#{file_path}': #{error.message}", :error
        nil
      else
        raise error
      end
    end

    # Builds the Faraday connection used for all requests against base_url.
    def build_connection
      Faraday.new(url: base_url) do |f|
        f.request :multipart
        f.request :json
        f.response :json
        f.response :raise_error
        f.adapter Faraday.default_adapter
      end
    end

    # Validates the extension, uploads the file and returns the new job id.
    #
    # @raise [Error] for an unsupported extension or a response without an id
    def create_job(file_path)
      validate_file_type!(file_path)

      # The block form guarantees the file handle is closed after the upload,
      # whether the request succeeds or raises.
      File.open(file_path, "rb") do |io|
        file = Faraday::Multipart::FilePart.new(
          io,
          detect_content_type(file_path),
          File.basename(file_path)
        )

        response = @connection.post("upload") do |req|
          req.headers["Authorization"] = "Bearer #{api_key}"
          req.body = upload_params(file)
        end

        extract_job_id(response.body)
      end
    end

    # Pulls the job id out of an upload response body.
    #
    # @raise [Error] when the body is not a Hash or has no usable "id"
    def extract_job_id(body)
      job_id = body["id"] if body.is_a?(Hash)
      raise Error, "Upload response did not include a job id" if job_id.nil? || job_id.to_s.empty?

      job_id
    end

    # Form fields sent with the upload; options whose value is nil are omitted.
    def upload_params(file)
      {
        file: file,
        language: @options[:language].to_s,
        parsing_instruction: @options[:parsing_instruction],
        invalidate_cache: @options[:invalidate_cache],
        skip_diagonal_text: @options[:skip_diagonal_text],
        do_not_cache: @options[:do_not_cache],
        fast_mode: @options[:fast_mode],
        premium_mode: @options[:premium_mode],
        continuous_mode: @options[:continuous_mode],
        do_not_unroll_columns: @options[:do_not_unroll_columns],
        gpt4o_mode: @options[:gpt4o_mode],
        gpt4o_api_key: @options[:gpt4o_api_key],
        from_ruby_package: true
      }.compact
    end

    # Fetches and returns the status response body for the job.
    def get_job_status(job_id)
      response = @connection.get("job/#{job_id}") do |req|
        req.headers["Authorization"] = "Bearer #{api_key}"
      end

      response.body
    end

    # Fetches the finished job's result in the configured :result_type and
    # returns its content.
    def get_result(job_id)
      result_type = @options[:result_type].to_s
      response = @connection.get("job/#{job_id}/result/#{result_type}") do |req|
        req.headers["Authorization"] = "Bearer #{api_key}"
      end

      log "Result type: #{result_type}", :info
      log "Raw response body: #{response.body.inspect}", :info

      extract_content(response.body, result_type)
    end

    # For a Hash body returns body[result_type], falling back to
    # body["content"]; any other body is returned as-is. Logs a warning when
    # no content is found.
    def extract_content(body, result_type)
      content = if body.is_a?(Hash)
        body[result_type] || body["content"]
      else
        body
      end

      log "Warning: No content found in response", :warn if content.nil?
      content
    end

    # MIME type for the file name, or "application/octet-stream" when unknown.
    def detect_content_type(filename)
      MIME::Types.type_for(filename).first&.content_type || "application/octet-stream"
    end

    # Raises Error unless the file's extension (case-insensitive) is listed in
    # SUPPORTED_FILE_TYPES.
    def validate_file_type!(file_path)
      extension = File.extname(file_path).downcase
      unless SUPPORTED_FILE_TYPES.include?(extension)
        raise Error, "Unsupported file type: #{extension}. Supported types: #{SUPPORTED_FILE_TYPES.join(", ")}"
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,108 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: llamaparserb
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Heidar Bernhardsson
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2024-11-27 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: faraday
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: faraday-multipart
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: mime-types
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '3.0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '3.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: async
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '2.0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '2.0'
69
+ description: A llamaparse client for Ruby.
70
+ email:
71
+ - heidar@heidarb.com
72
+ executables: []
73
+ extensions: []
74
+ extra_rdoc_files: []
75
+ files:
76
+ - CHANGELOG.md
77
+ - LICENSE.txt
78
+ - README.md
79
+ - lib/llamaparserb.rb
80
+ - lib/llamaparserb/version.rb
81
+ homepage: https://github.com/heidar/llamaparserb
82
+ licenses:
83
+ - MIT
84
+ metadata:
85
+ allowed_push_host: https://rubygems.org
86
+ homepage_uri: https://github.com/heidar/llamaparserb
87
+ source_code_uri: https://github.com/heidar/llamaparserb
88
+ changelog_uri: https://github.com/heidar/llamaparserb/blob/master/CHANGELOG.md
89
+ post_install_message:
90
+ rdoc_options: []
91
+ require_paths:
92
+ - lib
93
+ required_ruby_version: !ruby/object:Gem::Requirement
94
+ requirements:
95
+ - - ">="
96
+ - !ruby/object:Gem::Version
97
+ version: 3.0.0
98
+ required_rubygems_version: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - ">="
101
+ - !ruby/object:Gem::Version
102
+ version: '0'
103
+ requirements: []
104
+ rubygems_version: 3.5.11
105
+ signing_key:
106
+ specification_version: 4
107
+ summary: A llamaparse client for Ruby.
108
+ test_files: []