mistral_rb 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2e43cdb7fa62132f8342d13ec666a4fe48571dc0dbf5b64a2476c23d4da86aa5
4
- data.tar.gz: 402b62dfe6f4ca6a2b77d3e13127b673037374fe75ffbe874c4cafef2fbdaeb3
3
+ metadata.gz: 9a420d04befbc7d822678ac18d5d251e9a3804a4f50413efe6743c05f8781c61
4
+ data.tar.gz: 572d1b152567b1ac54cd59ab0137eaef286732e288afcbfd79b025ced473b10d
5
5
  SHA512:
6
- metadata.gz: 921683a278ab7f4d9f443c2db7a0dd16dd08b81752e6d99ac49fe619fb23e7d2cd145d30e8760988786ffa81647f9d666a73d0ac3a56e6c34ce921bd61a8339e
7
- data.tar.gz: cbc90300023388689b9b5572f25dec507de1cc956f90c7406cfe8fbe685011200a248bbbb7574e564b3ff0b89a9726c60890c9175ac8e4832018b4766d1dabaf
6
+ metadata.gz: ca18ed1fb53190fe6146c26992e2e84525dcc596189396f883871c85dfb55507bb25c399cd290bcdadf72bd781740b8429533b3a6d05249f9a2ef0fff84551b8
7
+ data.tar.gz: 4544e24b6fb292b3b489e5969b3334e75ef035c340d8d7118275c43ffc2ca23413feab2cc881d1310c203c567d0f9d84ceb2eadbc2035223e29963f78d99fa75
data/README.md CHANGED
@@ -44,7 +44,7 @@ end
44
44
  Here is how to use streaming:
45
45
 
46
46
  ```ruby
47
- api = MistralAPI.new("api_key")
47
+ api = MistralAPI.new(api_key: "api_key")
48
48
 
49
49
  api.create_chat_completion(
50
50
  model: 'mistral-tiny',
@@ -56,6 +56,21 @@ end
56
56
 
57
57
  ```
58
58
 
59
+ It is now possible to add RAG (Retrieval Augmented Generation) to your apps with only 6 lines of code:
60
+
61
+ ```ruby
62
+
63
+ vector_store = PineconeService.new(index_name: 'your_index_name')
64
+ llm = MistralAPI.new
65
+ file = "https://www.ycombinator.com/deal"
66
+ embedding_creator = MistralEmbeddingCreator.new
67
+
68
+ responder = Responder.new(vector_store: vector_store,llm: llm,file: file, embedding_creator: embedding_creator)
69
+
70
+ puts responder.call("How much does YC invest per startup ?")
71
+
72
+ ```
73
+
59
74
 
60
75
  ## Development
61
76
 
@@ -0,0 +1,58 @@
1
+ require "mime/types"
2
+ require "httparty"
3
+ require_relative './docx_content_extractor.rb'
4
+ require_relative './html_content_extractor.rb'
5
+ require_relative './pdf_content_extractor.rb'
6
+ require_relative './text_content_extractor.rb'
7
+
8
# Picks the content extractor that matches a document's MIME type.
#
# The document is either an http(s) URL given as a String (its type is
# discovered with an HTTP HEAD request) or any object that responds to
# +content_type+.
class ContentExtractorFactory
  # MIME types of non-HTML documents mapped to the symbolic type used by .for.
  MIME_TYPES = {
    'application/pdf' => :pdf,
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document' => :docx,
    'text/plain' => :txt
  }.freeze

  # Returns a new extractor instance for +file+.
  #
  # @param file [String, #content_type] an http(s) URL or a file-like object
  # @return [PdfContentExtractor, DocxContentExtractor, TxtContentExtractor, HtmlContentExtractor]
  # @raise [RuntimeError] when the detected type has no extractor
  def self.for(file)
    type = determine_file_type(file)

    case type
    when :pdf  then PdfContentExtractor.new(file)
    when :docx then DocxContentExtractor.new(file)
    when :txt  then TxtContentExtractor.new(file)
    when :html then HtmlContentExtractor.new(file)
    else
      raise "Unsupported file type: #{type}"
    end
  end

  # NOTE: a bare `private` does not apply to `def self.` methods, so the
  # helpers below have always been publicly callable; that is kept as-is.

  # Maps +file+ to :pdf, :docx, :txt, :html or :unknown.
  # :html is only ever returned for URLs, because HtmlContentExtractor takes a URL.
  def self.determine_file_type(file)
    from_url = file_is_url?(file)
    raw_type = from_url ? fetch_url_content_type_with_httparty(file) : file.content_type
    mime = normalize_content_type(raw_type)

    return :html if from_url && mime == 'text/html'

    MIME_TYPES.fetch(mime, :unknown)
  end

  # Reduces a Content-Type value to its bare, lower-cased MIME type, dropping
  # parameters such as "; charset=utf-8". nil and non-String values (for
  # example the :unknown fallback below) become a string that matches nothing.
  def self.normalize_content_type(raw_type)
    raw_type.to_s.split(';').first.to_s.strip.downcase
  end

  # True when +file+ is string-like and parses as an absolute http/https URL.
  def self.file_is_url?(file)
    return false unless file.respond_to?(:to_str)

    uri = URI.parse(file.to_str)
    # URI::HTTPS is a subclass of URI::HTTP, so this covers both schemes.
    uri.is_a?(URI::HTTP) && !uri.host.to_s.empty?
  rescue URI::InvalidURIError
    false
  end

  # Issues a HEAD request and returns the Content-Type header value,
  # or :unknown when HTTParty reports an error.
  def self.fetch_url_content_type_with_httparty(url)
    response = HTTParty.head(url)
    response.headers['content-type']
  rescue HTTParty::Error
    :unknown
  end
end
@@ -0,0 +1,77 @@
1
+ require 'docx'
2
+ require 'tempfile'
3
+ require_relative '../utils/sanitizer.rb'
4
+ require_relative '../utils/adapters.rb'
5
+
6
# Extracts plain text from a DOCX document.
#
# The source object must respond to +download+ and yield the file's bytes in
# one or more chunks. The bytes are spooled to a temporary file, opened with
# the docx gem, sanitized, and split into pseudo-pages of WORDS_PER_PAGE words.
class DocxContentExtractor
  # Number of pseudo-pages produced by the last successful #call (nil before).
  attr_reader :page_count

  WORDS_PER_PAGE = 500

  # Error hierarchy: every failure raised by #call is an ExtractionError.
  class ExtractionError < StandardError; end
  class FileDownloadError < ExtractionError; end
  class FileReadError < ExtractionError; end
  class DocxProcessingError < ExtractionError; end

  # @param file [#download] object yielding the document's bytes in chunks
  def initialize(file)
    @file = file
  end

  # Runs the extraction.
  #
  # @return [Array(Array<String>, String)] the pages and the full sanitized text
  # @raise [ExtractionError] or one of its subclasses on any failure
  def call
    extract_content
  rescue ExtractionError
    # Already one of ours: keep the specific subclass instead of re-wrapping.
    raise
  rescue StandardError => e
    raise ExtractionError, "Content extraction failed: #{e.message}"
  end

  private

  def extract_content
    Tempfile.open(['temp', '.docx'], binmode: true) do |tempfile|
      download_to(tempfile)
      doc = open_document(tempfile.path)

      content = extract_and_sanitize_content(doc)
      pages = split_into_pages(content)
      @page_count = pages.size

      [pages, content]
    end
  end

  # Streams the source into +tempfile+ and flushes it, so that readers which
  # open the file by path see every byte.
  def download_to(tempfile)
    # String#b returns a binary copy, leaving the caller's (possibly frozen)
    # chunk untouched.
    @file.download { |chunk| tempfile.write(chunk.b) }
    tempfile.flush
  rescue StandardError => e
    raise FileDownloadError, "Failed to download file: #{e.message}"
  end

  def open_document(path)
    Docx::Document.open(path)
  rescue StandardError => e
    raise DocxProcessingError, "Failed to process DOCX file: #{e.message}"
  end

  # Joins all paragraph texts with newlines and sanitizes the result.
  def extract_and_sanitize_content(doc)
    sanitize_page_content(doc.paragraphs.map(&:text).join("\n"))
  rescue StandardError => e
    raise FileReadError, "Failed to read content from DOCX file: #{e.message}"
  end

  # Splits +content+ on whitespace into groups of WORDS_PER_PAGE words.
  def split_into_pages(content)
    content.split(/\s+/).each_slice(WORDS_PER_PAGE).map { |page_words| page_words.join(' ') }
  end

  # Applies the shared Sanitizer passes in a fixed order.
  def sanitize_page_content(content)
    sanitized_text = Sanitizer.remove_excessive_newlines(content)
    sanitized_text = Sanitizer.remove_excessive_spaces(sanitized_text)
    Sanitizer.remove_bullet_points(sanitized_text)
  end
end
@@ -0,0 +1,83 @@
1
+ require 'nokogiri'
2
+ require 'watir'
3
+ require 'webdrivers'
4
+ require_relative '../utils/sanitizer.rb'
5
+
6
# Extracts the visible text of a web page.
#
# The page is loaded in a headless browser through Watir, trying each entry
# of BROWSERS in turn; the resulting HTML is parsed with Nokogiri, sanitized,
# and split into pseudo-pages of WORDS_PER_PAGE words.
class HtmlContentExtractor
  # Sanitized "title + body text" of the last successful #call.
  attr_reader :content

  WORDS_PER_PAGE = 500

  # Seconds to wait after navigation before reading the page's HTML.
  RENDER_WAIT_SECONDS = 5

  class ExtractionError < StandardError; end
  class UrlDownloadError < ExtractionError; end
  class ParsingError < ExtractionError; end

  # Browsers to try, in order.
  BROWSERS = %i[chrome firefox safari].freeze

  # @param url [String] address of the page to extract
  def initialize(url)
    @url = url
  end

  # @return [Array(Array<String>, String)] the pages and the full sanitized text
  # @raise [UrlDownloadError] when no browser could load the page
  # @raise [ExtractionError] on any other failure
  def call
    extract_content
  rescue ExtractionError
    # Keep the specific subclass instead of re-wrapping it.
    raise
  rescue StandardError => e
    raise ExtractionError, "HTML content extraction failed: #{e.message}"
  end

  private

  def extract_content
    BROWSERS.each do |browser_name|
      begin
        html = download_html_with_watir(browser_name)
        document = parse_html(html)
        text_content = document.xpath('//body').text.strip
        title = document.title

        @content = sanitize_content("#{title}\n\n#{text_content}")
        return [split_into_pages(@content), @content]
      rescue UrlDownloadError
        next # this browser is unavailable or failed; try the next one
      end
    end

    raise UrlDownloadError, "Failed to download URL with all browser drivers"
  end

  # Loads @url in the given browser and returns the rendered HTML.
  #
  # @param browser [Symbol] browser name passed to Watir::Browser.new
  # @raise [UrlDownloadError] when the browser cannot be started or the page
  #   cannot be loaded
  def download_html_with_watir(browser)
    # Use a separate local for the session so that +browser+ stays the Symbol:
    # it is needed for the error message, and the ensure clause must not call
    # Watir methods on it when Watir::Browser.new itself raised.
    session = Watir::Browser.new(browser, headless: true)
    session.goto(@url)
    sleep(RENDER_WAIT_SECONDS) # Adjust as needed for JavaScript to render
    session.html
  rescue StandardError => e
    raise UrlDownloadError, "Failed to download URL using #{browser.to_s.capitalize} browser: #{e.message}"
  ensure
    # session is nil when the browser never started.
    session.quit if session && session.exists?
  end

  def parse_html(html)
    Nokogiri::HTML(html)
  rescue StandardError => e
    raise ParsingError, "Failed to parse HTML content: #{e.message}"
  end

  # Applies the shared Sanitizer passes in a fixed order.
  def sanitize_content(content)
    sanitized_text = Sanitizer.remove_excessive_newlines(content)
    sanitized_text = Sanitizer.remove_excessive_spaces(sanitized_text)
    Sanitizer.remove_bullet_points(sanitized_text)
  end

  # Splits +content+ on whitespace into groups of WORDS_PER_PAGE words.
  def split_into_pages(content)
    content.split(/\s+/).each_slice(WORDS_PER_PAGE).map { |page_words| page_words.join(' ') }
  end
end
@@ -0,0 +1,66 @@
1
+ require 'pdf-reader'
2
+ require 'tempfile'
3
+ require_relative '../utils/sanitizer.rb'
4
+ require_relative '../utils/adapters.rb'
5
+
6
# Extracts text from a PDF document, one string per PDF page.
#
# The source object must respond to +download+ and yield the file's bytes in
# one or more chunks. The bytes are spooled to a temporary file that is read
# with PDF::Reader.
class PdfContentExtractor
  # Page count reported by PDF::Reader for the last #call (0 before).
  attr_reader :page_count

  # Separator placed between pages when building the full document text.
  PAGE_SEPARATOR = "\n\n"

  # Error hierarchy: every failure raised by #call is an ExtractionError.
  class ExtractionError < StandardError; end
  class UnreadableContentError < ExtractionError; end
  class EmptyContentError < ExtractionError; end

  # @param file [#download] object yielding the document's bytes in chunks
  def initialize(file)
    @file = file
    @page_count = 0
  end

  # @return [Array(Array<String>, String)] the sanitized pages and the full
  #   text (the pages joined with PAGE_SEPARATOR)
  # @raise [UnreadableContentError] when PDF::Reader rejects the file
  # @raise [EmptyContentError] when every page is blank
  # @raise [ExtractionError] on any other failure
  def call
    extract_content
  rescue ExtractionError
    # Keep the specific subclass (e.g. EmptyContentError) instead of re-wrapping.
    raise
  rescue PDF::Reader::MalformedPDFError, PDF::Reader::UnsupportedFeatureError => e
    raise UnreadableContentError, "PDF could not be read: #{e.message}"
  rescue StandardError => e
    raise ExtractionError, "Content extraction failed: #{e.message}"
  end

  private

  def extract_content
    Tempfile.open(['extracted_content', '.pdf'], binmode: true) do |tempfile|
      begin
        # String#b returns a binary copy, leaving the caller's chunk untouched.
        @file.download { |chunk| tempfile.write(chunk.b) }
        tempfile.close # Flush and save the data before reading it by path

        reader = PDF::Reader.new(tempfile.path)
        @page_count = reader.page_count

        pages = reader.pages.map { |page| sanitize_page_content(to_utf8(page.text)) }

        if pages.all? { |page| page.to_s.strip.empty? }
          raise EmptyContentError, 'The PDF content is empty or unreadable.'
        end

        # The original returned an undefined local here; the full text is the
        # concatenation of the sanitized pages.
        [pages, pages.join(PAGE_SEPARATOR)]
      ensure
        tempfile.unlink # Delete the tempfile
      end
    end
  end

  # Returns +text+ as valid UTF-8, with undecodable bytes replaced by '?'.
  # The scrub pass is needed because encoding a UTF-8 string to UTF-8 is a
  # no-op that leaves invalid byte sequences in place.
  def to_utf8(text)
    text.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?').scrub('?')
  end

  # Applies the shared Sanitizer passes in a fixed order.
  def sanitize_page_content(page_text)
    sanitized_text = Sanitizer.remove_excessive_newlines(page_text)
    sanitized_text = Sanitizer.remove_excessive_spaces(sanitized_text)
    Sanitizer.remove_bullet_points(sanitized_text)
  end
end
@@ -0,0 +1,66 @@
1
+ require 'tempfile'
2
+ require_relative '../utils/sanitizer.rb'
3
+ require_relative '../utils/adapters.rb'
4
+
5
# Extracts text from a plain-text document.
#
# The source object must respond to +download+ and yield the file's bytes in
# one or more chunks. The text is read as UTF-8, sanitized, and split into
# pseudo-pages of WORDS_PER_PAGE words.
class TxtContentExtractor
  # Number of pseudo-pages produced by the last successful #call (nil before).
  attr_reader :page_count

  WORDS_PER_PAGE = 500

  # Error hierarchy: every failure raised by #call is an ExtractionError.
  class ExtractionError < StandardError; end
  class FileDownloadError < ExtractionError; end
  class FileReadError < ExtractionError; end

  # @param file [#download] object yielding the document's bytes in chunks
  def initialize(file)
    @file = file
  end

  # Runs the extraction.
  #
  # Returns the same [pages, content] pair as the other extractors, which is
  # the shape Responder destructures.
  #
  # @return [Array(Array<String>, String)] the pages and the full sanitized text
  # @raise [ExtractionError] or one of its subclasses on any failure
  def call
    extract_content
  rescue ExtractionError
    # Keep the specific subclass instead of re-wrapping it.
    raise
  rescue StandardError => e
    raise ExtractionError, "Content extraction failed: #{e.message}"
  end

  private

  def extract_content
    Tempfile.open(['temp', '.txt'], binmode: true) do |tempfile|
      download_to(tempfile)
      sanitized_content = sanitize_page_content(read_text(tempfile.path))
      pages = split_into_pages(sanitized_content)
      @page_count = pages.size

      [pages, sanitized_content]
    end
  end

  # Streams the source into +tempfile+ and flushes it, so that the read by
  # path below sees every byte.
  def download_to(tempfile)
    # The tempfile is in binary mode, so bytes are written unchanged;
    # String#b avoids mutating the caller's chunk.
    @file.download { |chunk| tempfile.write(chunk.b) }
    tempfile.flush
  rescue StandardError => e
    raise FileDownloadError, "Failed to download file: #{e.message}"
  end

  # Reads the spooled file back as UTF-8 text.
  def read_text(path)
    File.read(path, encoding: 'UTF-8')
  rescue StandardError => e
    raise FileReadError, "Failed to read file: #{e.message}"
  end

  # Splits +content+ on whitespace into groups of WORDS_PER_PAGE words.
  def split_into_pages(content)
    content.split(/\s+/).each_slice(WORDS_PER_PAGE).map { |page_words| page_words.join(' ') }
  end

  # Applies the shared Sanitizer passes in a fixed order.
  def sanitize_page_content(content)
    sanitized_text = Sanitizer.remove_excessive_newlines(content)
    sanitized_text = Sanitizer.remove_excessive_spaces(sanitized_text)
    Sanitizer.remove_bullet_points(sanitized_text)
  end
end
@@ -0,0 +1,45 @@
1
+ require 'ruby/openai'
2
+
3
# Packs text into chunks whose estimated token count stays within a limit.
#
# Text is split into sentences on ".", "!" or "?" followed by whitespace, and
# sentences are packed greedily into chunks. A sentence that alone exceeds
# the limit is broken on word boundaries. A single word larger than the limit
# cannot be broken further and becomes a chunk of its own.
class BasicTextChunker
  # @param token_limit [Integer] maximum estimated tokens per chunk
  # @param token_counter [#call, nil] optional callable mapping a String to
  #   its token count; defaults to OpenAI.rough_token_count
  def initialize(token_limit=390, token_counter: nil)
    @token_limit = token_limit
    @token_counter = token_counter
  end

  # @param text [String] text to split
  # @return [Array<String>] non-empty chunks in document order
  def split_into_chunks(text)
    chunks = []
    current_chunk = +""
    current_token_count = 0

    text.split(/[.!?]\s+/).each do |sentence|
      next if sentence.strip.empty?

      pieces_within_limit(sentence).each do |piece|
        piece_token_count = count_tokens(piece)

        # Start a new chunk when this piece would push the current one over
        # the limit. The emptiness check keeps blank chunks out of the result.
        if current_token_count + piece_token_count > @token_limit && !current_chunk.empty?
          chunks << current_chunk.strip
          current_chunk = +""
          current_token_count = 0
        end

        current_chunk << piece << " "
        current_token_count += piece_token_count
      end
    end

    chunks << current_chunk.strip unless current_chunk.empty?
    chunks
  end

  private

  # Estimated token count of +text+.
  def count_tokens(text)
    @token_counter ? @token_counter.call(text) : OpenAI.rough_token_count(text)
  end

  # Returns +sentence+ unchanged when it fits the limit; otherwise breaks it
  # into consecutive runs of words, each measured with the token counter
  # (the previous implementation used the word count as if it were a token
  # count).
  def pieces_within_limit(sentence)
    return [sentence] if count_tokens(sentence) <= @token_limit

    pieces = []
    buffer = []
    sentence.split.each do |word|
      if !buffer.empty? && count_tokens((buffer + [word]).join(" ")) > @token_limit
        pieces << buffer.join(" ")
        buffer = [word]
      else
        buffer << word
      end
    end
    pieces << buffer.join(" ") unless buffer.empty?
    pieces
  end
end
@@ -0,0 +1,63 @@
1
+ require 'dotenv'
2
+ require_relative '../mistral_rb.rb'
3
+ require_relative '../content_splitters/basic_sentence_splitter.rb'
4
+
5
+ Dotenv.load()
6
+
7
# Creates embeddings with the Mistral API, either for a list of pages
# (chunked, with page metadata) or for a single piece of text.
class MistralEmbeddingCreator
  # @param api_key [String, nil] Mistral API key; falls back to ENV['MISTRAL_API_KEY']
  # @param chunker [#split_into_chunks] splits a page into chunks
  # @param model [String] embedding model name
  def initialize(api_key = nil, chunker = BasicTextChunker.new, model = "mistral-embed")
    @chunker = chunker
    @model = model
    @api_key = api_key || ENV['MISTRAL_API_KEY']

    if @api_key
      @llm = MistralAPI.new(api_key: @api_key)
    else
      log_error "MISTRAL AI API key not provided. Set the MISTRAL_API_KEY in the ENV variables or pass it as an argument."
    end
  end

  # @param text [Array<String>, String] the pages when +pages_mode+ is true,
  #   otherwise a single string
  # @param pages_mode [Boolean] selects between the two behaviours
  # @return [Array<Hash>] in pages mode: one {id:, values:, metadata:} hash
  #   per chunk ([] when no API client is configured)
  # @return [Object, nil] otherwise: the embedding of +text+ (nil when no API
  #   client is configured)
  def call(text, pages_mode=true)
    # Without an API key there is no client; degrade the same way in both modes.
    return (pages_mode ? [] : nil) unless @llm

    pages_mode ? embed_pages(text) : embed_text(text)
  end

  private

  # Embeds every chunk of every page and records the 1-based page number.
  def embed_pages(pages)
    vectors = []

    pages.each_with_index do |page_content, page_index|
      @chunker.split_into_chunks(page_content).each do |chunk|
        vectors << {
          # Number ids across the whole document: restarting at 1 on every
          # page would give chunks of different pages the same id.
          id: "vec #{vectors.size + 1}",
          values: embed_text(chunk),
          metadata: {
            text: chunk,
            page: page_index + 1
          }
        }
      end
    end

    vectors
  end

  # Requests the embedding of one string and returns it.
  def embed_text(text)
    response = @llm.create_embeddings(
      model: @model,
      input: [text]
    )

    response.data.first.embedding
  end

  # Logs through Rails when it is loaded, otherwise writes to standard error,
  # so the class also works outside a Rails application.
  def log_error(message)
    if defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger
      Rails.logger.error(message)
    else
      warn(message)
    end
  end
end
@@ -0,0 +1,67 @@
1
+ require 'dotenv'
2
+ require 'ruby/openai'
3
+ require_relative '../content_splitters/basic_sentence_splitter.rb'
4
+
5
+ Dotenv.load()
6
+
7
# Creates embeddings with the OpenAI API, either for a list of pages
# (chunked, with page metadata) or for a single piece of text.
class OpenaiEmbeddingCreator
  # @param api_key [String, nil] OpenAI API key; falls back to ENV['OPENAI_API_KEY']
  # @param chunker [#split_into_chunks] splits a page into chunks
  # @param model [String] embedding model name
  def initialize(api_key = nil, chunker = BasicTextChunker.new, model = "text-embedding-ada-002")
    @chunker = chunker
    @model = model
    @api_key = api_key || ENV['OPENAI_API_KEY']

    if @api_key
      @llm = OpenAI::Client.new(access_token: @api_key)
    else
      log_error "OpenAI API key not provided. Set the OPENAI_API_KEY in the ENV variables or pass it as an argument."
    end
  end

  # @param text [Array<String>, String] the pages when +pages_mode+ is true,
  #   otherwise a single string
  # @param pages_mode [Boolean] selects between the two behaviours
  # @return [Array<Hash>] in pages mode: one {id:, values:, metadata:} hash
  #   per chunk ([] when no API client is configured)
  # @return [Object, nil] otherwise: the embedding of +text+ (nil when no API
  #   client is configured)
  def call(text, pages_mode=true)
    # Without an API key there is no client; degrade the same way in both modes.
    return (pages_mode ? [] : nil) unless @llm

    pages_mode ? embed_pages(text) : embed_text(text)
  end

  private

  # Embeds every chunk of every page and records the 1-based page number.
  def embed_pages(pages)
    vectors = []

    pages.each_with_index do |page_content, page_index|
      @chunker.split_into_chunks(page_content).each do |chunk|
        vectors << {
          # Number ids across the whole document: restarting at 1 on every
          # page would give chunks of different pages the same id.
          id: "vec #{vectors.size + 1}",
          values: embed_text(chunk),
          metadata: {
            text: chunk,
            page: page_index + 1
          }
        }
      end
    end

    vectors
  end

  # Requests the embedding of one string and returns it.
  # (The single-text branch previously passed an undefined local as input.)
  def embed_text(text)
    response = @llm.embeddings(
      parameters: {
        model: @model,
        input: text
      }
    )

    response['data'][0]['embedding']
  end

  # Logs through Rails when it is loaded, otherwise writes to standard error,
  # so the class also works outside a Rails application.
  def log_error(message)
    if defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger
      Rails.logger.error(message)
    else
      warn(message)
    end
  end
end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module MistralRb
4
- VERSION = "0.1.3"
4
+ VERSION = "0.1.4"
5
5
  end
data/lib/mistral_rb.rb CHANGED
@@ -4,11 +4,14 @@ require_relative "mistral_rb/version"
4
4
  require "httparty"
5
5
  require "json"
6
6
  require_relative "mistral_rb/response_models"
7
+ require 'dotenv'
8
+
9
+ Dotenv.load()
7
10
 
8
11
  class MistralAPI
9
12
  include HTTParty
10
13
 
11
- def initialize(api_key:, base_uri: "https://api.mistral.ai/v1")
14
+ def initialize(api_key: ENV["MISTRAL_API_KEY"], base_uri: "https://api.mistral.ai/v1")
12
15
  @headers = {
13
16
  "Authorization" => "Bearer #{api_key}",
14
17
  "Content-Type" => "application/json"
data/lib/responder.rb ADDED
@@ -0,0 +1,91 @@
1
+ require 'dotenv'
2
+ require 'ruby/openai'
3
+ require_relative './mistral_rb.rb'
4
+ require_relative './content_extractors/content_extractor_factory.rb'
5
+ require_relative './content_splitters/basic_sentence_splitter.rb'
6
+ require_relative './embedding_engines/mistral_embeddings.rb'
7
+ require_relative './vector_stores/pinecone.rb'
8
+ require_relative './utils/similarity_service.rb'
9
+ require_relative './utils/adapters.rb'
10
+
11
+ Dotenv.load()
12
+
13
# Answers a question about a document with retrieval-augmented generation.
#
# On the first #call the document is extracted, embedded and stored in the
# vector store; the results are memoized for later calls. Each call combines
# the sentences most similar to the question with the vector-store matches
# and sends the resulting prompt to the LLM.
class Responder
  # @param vector_store [#store, #index] vector store wrapper (e.g. PineconeService)
  # @param llm [#create_chat_completion] chat completion client
  # @param file [Object] document accepted by ContentExtractorFactory.for
  # @param embedding_creator [#call] embedding engine
  def initialize(vector_store:, llm: MistralAPI.new, file:, embedding_creator: MistralEmbeddingCreator.new)
    @vector_store = vector_store
    @llm = llm
    @file = file
    @embedding_creator = embedding_creator
  end

  # @param question [String] the user's question
  # @param top_k [Integer] number of sentences and of vector matches to use
  # @return [String] content of the first choice of the chat completion
  def call(question, top_k=10)
    embedding = text_to_embedding(question)
    results = process_similarity(question, top_k)
    context = fetch_context(embedding, top_k)
    merged_text = merge_texts(results, context)
    prompt = construct_prompt(question, merged_text)
    generate_response(prompt)
  end

  private

  # Extracts and memoizes the document's pages and full text.
  def extract_content
    @extractor ||= ContentExtractorFactory.for(@file)
    return unless @pages.nil? || @content.nil?

    extracted_pages, extracted_content = @extractor.call
    @pages ||= extracted_pages
    @content ||= extracted_content
  end

  # Embeds the pages and stores them once; remembers the returned namespace.
  def store_embeddings
    @embeddings ||= @embedding_creator.call(@pages)
    @namespace ||= @vector_store.store(@embeddings, @content)
  end

  def text_to_embedding(question)
    @embedding_creator.call(question, false)
  end

  # Returns the +top_k+ sentences of the document most similar to +question+.
  def process_similarity(question, top_k)
    extract_content # Ensure content is extracted
    SimilarityService.new(question, @pages).most_similar_sentences(top_k)
  end

  # Queries the vector store for the +top_k+ nearest vectors, or returns nil
  # when no namespace is available.
  def fetch_context(embedding, top_k)
    store_embeddings # Ensure embeddings are stored
    return unless @namespace

    @vector_store.index.query(
      vector: embedding,
      namespace: @namespace,
      top_k: top_k,
      include_values: false,
      include_metadata: true
    )
  end

  # Joins the similarity results with the retrieved context.
  def merge_texts(results, context)
    [results, context_text(context)].compact.join(' ')
  end

  # Reduces a query response to the chunk texts stored in the matches'
  # metadata, so the prompt carries document text and not ids or scores.
  # NOTE(review): assumes the response exposes "matches" => [{"metadata" =>
  # {"text" => ...}}] with string keys — confirm against the vector-store
  # client. Any other shape is passed through unchanged, as before.
  def context_text(context)
    return unless context

    matches = context['matches'] if context.respond_to?(:key?)
    return context unless matches.is_a?(Array)

    texts = matches.map { |match| match.dig('metadata', 'text') if match.respond_to?(:key?) }.compact
    texts.empty? ? context : texts.join(' ')
  end

  def construct_prompt(question, merged_text)
    "You are a helpful assistant. Answer this question: #{question}, using these information from the document the user uploaded: #{merged_text} in 60 words. Reply in the language of the question."
  end

  def generate_response(prompt)
    response = @llm.create_chat_completion(
      model: "mistral-tiny",
      messages: [{role: "user", content: prompt}]
    )
    response.choices.first.message.content
  end
end
@@ -0,0 +1,12 @@
1
# Wraps a local file path so it can be used wherever an object with a
# +download+ method that yields file content is expected.
class LocalFileAdapter
  # @param file_path [String] path of the file to read
  def initialize(file_path)
    @file_path = file_path
  end

  # Yields the whole file, read in binary mode, as a single chunk.
  #
  # @yieldparam chunk [String] the file's content
  # @return [Enumerator] when called without a block (instead of raising
  #   LocalJumpError)
  def download
    return enum_for(:download) unless block_given?

    File.open(@file_path, 'rb') { |file| yield file.read }
  end
end
@@ -0,0 +1,16 @@
1
# Stateless text clean-up helpers used by the content extractors.
module Sanitizer
  # Three or more newline-led whitespace groups in a row.
  EXCESSIVE_NEWLINES = /(\n\s*){3,}/
  # Three or more consecutive space characters.
  EXCESSIVE_SPACES = / {3,}/
  # The bullet character that gets stripped.
  BULLET = "•"

  class << self
    # Collapses runs matching EXCESSIVE_NEWLINES into exactly two newlines.
    def remove_excessive_newlines(text)
      text.gsub(EXCESSIVE_NEWLINES, "\n\n")
    end

    # Collapses runs of three or more spaces into a single space.
    def remove_excessive_spaces(text)
      text.gsub(EXCESSIVE_SPACES, ' ')
    end

    # Deletes every bullet character.
    def remove_bullet_points(text)
      text.gsub(BULLET, "")
    end
  end
end
@@ -0,0 +1,40 @@
1
+
2
require 'set' # Set is not autoloaded before Ruby 3.2

# Ranks the sentences of a document by word overlap (Jaccard similarity)
# with a question, ignoring a small list of French and English stop words.
class SimilarityService
  FRENCH_STOP_WORDS = %w(
    je tu il nous vous ils elle me te se le la les et ou mais
    que quand donc or ni car
  ).freeze

  ENGLISH_STOP_WORDS = %w(
    i you he we they she me him us them and or but that when so nor for
  ).freeze

  STOP_WORDS = (FRENCH_STOP_WORDS + ENGLISH_STOP_WORDS).freeze

  # Characters that end a sentence for ranking purposes.
  SENTENCE_DELIMITERS = /[\.\?!:]/

  # @param input_question [String] the question to compare against
  # @param document_chunks [Array<String>] pieces of document text
  def initialize(input_question, document_chunks)
    @input_question = input_question
    @document_chunks = document_chunks
  end

  # Jaccard similarity of the two strings' lower-cased, space-separated words
  # with stop words removed.
  #
  # @return [Float] a value between 0.0 and 1.0; 0.0 when neither string has
  #   any word left (instead of the NaN produced by 0.0 / 0)
  def jaccard_similarity(str1, str2)
    set1 = significant_words(str1)
    set2 = significant_words(str2)
    union = set1 | set2
    return 0.0 if union.empty?

    (set1 & set2).size.to_f / union.size
  end

  # @param top_k [Integer] maximum number of sentences to return
  # @return [String] the +top_k+ most similar sentences joined by spaces
  def most_similar_sentences(top_k)
    all_sentences = @document_chunks.flat_map do |chunk|
      # Consecutive delimiters produce blank sentences; they carry no text.
      chunk.split(SENTENCE_DELIMITERS).map(&:strip).reject(&:empty?)
    end

    similarities = all_sentences.map do |sentence|
      [sentence, jaccard_similarity(@input_question, sentence)]
    end

    # Sort by descending similarity and take the top_k
    similarities.sort_by { |_, similarity| -similarity }.take(top_k).map(&:first).join(' ')
  end

  private

  # Lower-cased words of +text+ that are not stop words, as a Set.
  def significant_words(text)
    text.downcase.split(" ").reject { |word| STOP_WORDS.include?(word) }.to_set
  end
end
@@ -0,0 +1,61 @@
1
+ require 'pinecone'
2
+ require 'digest'
3
+ require 'dotenv'
4
+
5
+ Dotenv.load()
6
+
7
# Thin wrapper around a Pinecone index: configures the client, derives a
# namespace from the document text, and upserts vectors with retries.
class PineconeService
  # The Pinecone index object obtained from the client.
  attr_reader :index

  # @param pinecone_key [String] API key; defaults to ENV['PINECONE_API_KEY']
  # @param pinecone_env [String] environment; defaults to ENV['PINECONE_ENV']
  # @param index_name [String] name of the index to use
  # @raise [ArgumentError] when the key or the environment is missing
  def initialize(pinecone_key: ENV['PINECONE_API_KEY'], pinecone_env: ENV['PINECONE_ENV'], index_name:)
    @pinecone_key = pinecone_key
    @pinecone_env = pinecone_env
    @index_name = index_name

    # Fail fast with a clear error. Previously this case logged through
    # Rails.logger (a NameError outside Rails) and then called #index on nil.
    unless @pinecone_key && @pinecone_env
      raise ArgumentError, "Set the PINECONE_API_KEY and PINECONE_ENV in the ENV variables"
    end

    Pinecone.configure do |config|
      config.api_key = @pinecone_key
      config.environment = @pinecone_env
    end

    @pinecone = Pinecone::Client.new
    @index = @pinecone.index(@index_name)
  end

  # @param text [String]
  # @return [String] the first 44 hex characters of the SHA-256 digest of +text+
  def compute_hash(text)
    Digest::SHA256.hexdigest(text)[0...44]
  end

  # Upserts +embeddings+ into a namespace derived from +text+.
  #
  # @param embeddings [Array<Hash>] vectors to upsert
  # @param text [String] document text the namespace is computed from
  # @return [String] the namespace used
  def store(embeddings, text)
    namespace = compute_hash(text)
    upsert_with_retry(@index, namespace, embeddings)
    namespace
  end

  private

  # Upserts the vectors, retrying while the response's "code" is 9, up to
  # +max_retries+ additional attempts with +retry_delay+ seconds in between.
  #
  # @return [Object] the last upsert response
  def upsert_with_retry(index, namespace, embeddings, max_retries = 5, retry_delay = 10)
    retries = 0

    loop do
      response = index.upsert(
        namespace: namespace,
        vectors: embeddings
      )

      return response if response["code"] != 9 || retries >= max_retries

      puts "Encountered error. Retrying in #{retry_delay} seconds... (Attempt #{retries + 1} of #{max_retries})"
      sleep(retry_delay)
      retries += 1
    end
  end
end
data/mistral_rb.gemspec CHANGED
@@ -31,10 +31,22 @@ Gem::Specification.new do |spec|
31
31
  spec.require_paths = ["lib"]
32
32
 
33
33
  # Specify runtime and development dependencies in gemspec
34
- spec.add_runtime_dependency "httparty", "~> 0.18"
35
34
  spec.add_development_dependency "bundler", "~> 2.0"
36
35
  spec.add_development_dependency "rake", "~> 13.0"
37
36
 
37
+ spec.add_runtime_dependency "httparty", "~> 0.18"
38
+ spec.add_runtime_dependency "mime-types"
39
+ spec.add_runtime_dependency "pdf-reader"
40
+ spec.add_runtime_dependency "pinecone"
41
+ spec.add_runtime_dependency "docx"
42
+ spec.add_runtime_dependency "dotenv-rails"
43
+ # spec.add_runtime_dependency "csv"
44
+ # spec.add_runtime_dependency "daru"
45
+ spec.add_runtime_dependency "nokogiri"
46
+ spec.add_runtime_dependency 'selenium-webdriver', '~> 4.5'
47
+ spec.add_runtime_dependency 'webdrivers', '~> 5.3'
48
+ spec.add_runtime_dependency 'watir'
49
+ spec.add_runtime_dependency 'ruby-openai'
38
50
  # For more information and examples about making a new gem, check out our
39
51
  # guide at: https://bundler.io/guides/creating_gem.html
40
52
  end
metadata CHANGED
@@ -1,15 +1,43 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: mistral_rb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Franck Stephane Ndzomga
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-12-24 00:00:00.000000000 Z
11
+ date: 2023-12-26 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '13.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '13.0'
13
41
  - !ruby/object:Gem::Dependency
14
42
  name: httparty
15
43
  requirement: !ruby/object:Gem::Requirement
@@ -25,33 +53,145 @@ dependencies:
25
53
  - !ruby/object:Gem::Version
26
54
  version: '0.18'
27
55
  - !ruby/object:Gem::Dependency
28
- name: bundler
56
+ name: mime-types
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: pdf-reader
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: pinecone
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: docx
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: dotenv-rails
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: nokogiri
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ type: :runtime
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ">="
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
139
+ - !ruby/object:Gem::Dependency
140
+ name: selenium-webdriver
29
141
  requirement: !ruby/object:Gem::Requirement
30
142
  requirements:
31
143
  - - "~>"
32
144
  - !ruby/object:Gem::Version
33
- version: '2.0'
34
- type: :development
145
+ version: '4.5'
146
+ type: :runtime
35
147
  prerelease: false
36
148
  version_requirements: !ruby/object:Gem::Requirement
37
149
  requirements:
38
150
  - - "~>"
39
151
  - !ruby/object:Gem::Version
40
- version: '2.0'
152
+ version: '4.5'
41
153
  - !ruby/object:Gem::Dependency
42
- name: rake
154
+ name: webdrivers
43
155
  requirement: !ruby/object:Gem::Requirement
44
156
  requirements:
45
157
  - - "~>"
46
158
  - !ruby/object:Gem::Version
47
- version: '13.0'
48
- type: :development
159
+ version: '5.3'
160
+ type: :runtime
49
161
  prerelease: false
50
162
  version_requirements: !ruby/object:Gem::Requirement
51
163
  requirements:
52
164
  - - "~>"
53
165
  - !ruby/object:Gem::Version
54
- version: '13.0'
166
+ version: '5.3'
167
+ - !ruby/object:Gem::Dependency
168
+ name: watir
169
+ requirement: !ruby/object:Gem::Requirement
170
+ requirements:
171
+ - - ">="
172
+ - !ruby/object:Gem::Version
173
+ version: '0'
174
+ type: :runtime
175
+ prerelease: false
176
+ version_requirements: !ruby/object:Gem::Requirement
177
+ requirements:
178
+ - - ">="
179
+ - !ruby/object:Gem::Version
180
+ version: '0'
181
+ - !ruby/object:Gem::Dependency
182
+ name: ruby-openai
183
+ requirement: !ruby/object:Gem::Requirement
184
+ requirements:
185
+ - - ">="
186
+ - !ruby/object:Gem::Version
187
+ version: '0'
188
+ type: :runtime
189
+ prerelease: false
190
+ version_requirements: !ruby/object:Gem::Requirement
191
+ requirements:
192
+ - - ">="
193
+ - !ruby/object:Gem::Version
194
+ version: '0'
55
195
  description: This gem provides an easy-to-use interface for the Mistral AI API.
56
196
  email:
57
197
  - ndzomgafs@gmail.com
@@ -65,9 +205,22 @@ files:
65
205
  - LICENSE.txt
66
206
  - README.md
67
207
  - Rakefile
208
+ - lib/content_extractors/content_extractor_factory.rb
209
+ - lib/content_extractors/docx_content_extractor.rb
210
+ - lib/content_extractors/html_content_extractor.rb
211
+ - lib/content_extractors/pdf_content_extractor.rb
212
+ - lib/content_extractors/text_content_extractor.rb
213
+ - lib/content_splitters/basic_sentence_splitter.rb
214
+ - lib/embedding_engines/mistral_embeddings.rb
215
+ - lib/embedding_engines/openai_embeddings.rb
68
216
  - lib/mistral_rb.rb
69
217
  - lib/mistral_rb/response_models.rb
70
218
  - lib/mistral_rb/version.rb
219
+ - lib/responder.rb
220
+ - lib/utils/adapters.rb
221
+ - lib/utils/sanitizer.rb
222
+ - lib/utils/similarity_service.rb
223
+ - lib/vector_stores/pinecone.rb
71
224
  - mistral_rb.gemspec
72
225
  - sig/mistral_rb.rbs
73
226
  homepage: https://github.com/fsndzomga/mistral_rb
@@ -92,7 +245,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
92
245
  - !ruby/object:Gem::Version
93
246
  version: '0'
94
247
  requirements: []
95
- rubygems_version: 3.3.7
248
+ rubygems_version: 3.5.3
96
249
  signing_key:
97
250
  specification_version: 4
98
251
  summary: A simple wrapper for the Mistral API