mistral_rb 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2e43cdb7fa62132f8342d13ec666a4fe48571dc0dbf5b64a2476c23d4da86aa5
4
- data.tar.gz: 402b62dfe6f4ca6a2b77d3e13127b673037374fe75ffbe874c4cafef2fbdaeb3
3
+ metadata.gz: 9a420d04befbc7d822678ac18d5d251e9a3804a4f50413efe6743c05f8781c61
4
+ data.tar.gz: 572d1b152567b1ac54cd59ab0137eaef286732e288afcbfd79b025ced473b10d
5
5
  SHA512:
6
- metadata.gz: 921683a278ab7f4d9f443c2db7a0dd16dd08b81752e6d99ac49fe619fb23e7d2cd145d30e8760988786ffa81647f9d666a73d0ac3a56e6c34ce921bd61a8339e
7
- data.tar.gz: cbc90300023388689b9b5572f25dec507de1cc956f90c7406cfe8fbe685011200a248bbbb7574e564b3ff0b89a9726c60890c9175ac8e4832018b4766d1dabaf
6
+ metadata.gz: ca18ed1fb53190fe6146c26992e2e84525dcc596189396f883871c85dfb55507bb25c399cd290bcdadf72bd781740b8429533b3a6d05249f9a2ef0fff84551b8
7
+ data.tar.gz: 4544e24b6fb292b3b489e5969b3334e75ef035c340d8d7118275c43ffc2ca23413feab2cc881d1310c203c567d0f9d84ceb2eadbc2035223e29963f78d99fa75
data/README.md CHANGED
@@ -44,7 +44,7 @@ end
44
44
  Here is how to use streaming:
45
45
 
46
46
  ```ruby
47
- api = MistralAPI.new("api_key")
47
+ api = MistralAPI.new(api_key: "api_key")
48
48
 
49
49
  api.create_chat_completion(
50
50
  model: 'mistral-tiny',
@@ -56,6 +56,21 @@ end
56
56
 
57
57
  ```
58
58
 
59
+ It is now possible to add RAG (Retrieval Augmented Generation) to your apps with only 6 lines of code:
60
+
61
+ ```ruby
62
+
63
+ vector_store = PineconeService.new(index_name: 'your_index_name')
64
+ llm = MistralAPI.new
65
+ file = "https://www.ycombinator.com/deal"
66
+ embedding_creator = MistralEmbeddingCreator.new
67
+
68
+ responder = Responder.new(vector_store: vector_store,llm: llm,file: file, embedding_creator: embedding_creator)
69
+
70
+ puts responder.call("How much does YC invest per startup ?")
71
+
72
+ ```
73
+
59
74
 
60
75
  ## Development
61
76
 
@@ -0,0 +1,58 @@
1
+ require "mime/types"
2
+ require "httparty"
3
+ require_relative './docx_content_extractor.rb'
4
+ require_relative './html_content_extractor.rb'
5
+ require_relative './pdf_content_extractor.rb'
6
+ require_relative './text_content_extractor.rb'
7
+
8
# Chooses the content extractor that matches a document's MIME type.
#
# The document is either an http(s) URL given as a String-like object, or an
# object exposing +content_type+; it is handed unchanged to the extractor.
class ContentExtractorFactory
  # Exact MIME types (parameters stripped, lower-cased) and the file type
  # symbol each one maps to.
  MIME_FILE_TYPES = {
    'application/pdf' => :pdf,
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document' => :docx,
    'text/plain' => :txt
  }.freeze

  # Builds the extractor for +file+.
  #
  # @param file [#to_str, #content_type] an http(s) URL or an object with a content type
  # @return [Object] a new Pdf/Docx/Txt/Html content extractor wrapping +file+
  # @raise [RuntimeError] when the detected type is not supported
  def self.for(file)
    type = determine_file_type(file)

    case type
    when :pdf  then PdfContentExtractor.new(file)
    when :docx then DocxContentExtractor.new(file)
    when :txt  then TxtContentExtractor.new(file)
    when :html then HtmlContentExtractor.new(file)
    else raise "Unsupported file type: #{type}"
    end
  end

  # Maps +file+ to :pdf, :docx, :txt, :html or :unknown.
  # :html is only reported for URLs, as in the original lookup.
  def self.determine_file_type(file)
    if file_is_url?(file)
      # to_s guards against a missing header or a failed HEAD request (nil).
      content_type = fetch_url_content_type_with_httparty(file).to_s
      return :html if content_type.include?('text/html')
    else
      content_type = file.content_type.to_s
    end

    # Drop parameters such as "; charset=binary" before the exact lookup.
    mime = content_type.split(';').first.to_s.strip.downcase
    MIME_FILE_TYPES.fetch(mime, :unknown)
  end

  # True when +file+ is string-like and is, in full, an http or https URL.
  def self.file_is_url?(file)
    return false unless file.respond_to?(:to_str)

    /\A#{URI::regexp(['http', 'https'])}\z/.match?(file.to_str)
  end

  # Issues a HEAD request and returns the Content-Type header value, or nil
  # when HTTParty reports an error.
  def self.fetch_url_content_type_with_httparty(url)
    HTTParty.head(url).headers['content-type']
  rescue HTTParty::Error
    nil
  end

  # `private` does not apply to `def self.` methods, so hide the helpers explicitly.
  private_class_method :determine_file_type, :file_is_url?, :fetch_url_content_type_with_httparty
end
@@ -0,0 +1,77 @@
1
+ require 'docx'
2
+ require 'tempfile'
3
+ require_relative '../utils/sanitizer.rb'
4
+ require_relative '../utils/adapters.rb'
5
+
6
# Extracts sanitized text from a DOCX document and splits it into fixed-size
# "pages" of WORDS_PER_PAGE words.
#
# The wrapped file object must respond to +download+ and yield the document's
# bytes in one or more chunks.
class DocxContentExtractor
  # Number of pages produced by the last successful extraction.
  attr_reader :page_count

  WORDS_PER_PAGE = 500

  # Custom error classes
  class ExtractionError < StandardError; end
  class FileDownloadError < ExtractionError; end
  class FileReadError < ExtractionError; end
  class DocxProcessingError < ExtractionError; end

  # @param file [#download] source of the DOCX bytes
  def initialize(file)
    @file = file
  end

  # Runs the extraction.
  #
  # @return [Array(Array<String>, String)] the pages and the full sanitized text
  # @raise [ExtractionError] wrapping any failure raised during extraction
  def call
    extract_content
  rescue StandardError => e
    raise ExtractionError, "Content extraction failed: #{e.message}"
  end

  private

  def extract_content
    Tempfile.open(['temp', '.docx'], binmode: true) do |tempfile|
      download_into(tempfile)
      doc = open_document(tempfile.path)

      content = extract_and_sanitize_content(doc)
      pages = split_into_pages(content)
      @page_count = pages.size

      [pages, content]
    end
  end

  # Streams the download into +tempfile+ and flushes it, so the bytes are on
  # disk before the file is reopened by path.
  def download_into(tempfile)
    @file.download { |chunk| tempfile.write(chunk.force_encoding("ASCII-8BIT")) }
    tempfile.flush
  rescue => e
    raise FileDownloadError, "Failed to download file: #{e.message}"
  end

  def open_document(path)
    Docx::Document.open(path)
  rescue => e
    raise DocxProcessingError, "Failed to process DOCX file: #{e.message}"
  end

  # Joins the text of all paragraphs with newlines and sanitizes the result.
  def extract_and_sanitize_content(doc)
    content = doc.paragraphs.map(&:text).join("\n")
    sanitize_page_content(content)
  rescue => e
    raise FileReadError, "Failed to read content from DOCX file: #{e.message}"
  end

  # Splits on whitespace and regroups the words WORDS_PER_PAGE at a time.
  def split_into_pages(content)
    content.split(/\s+/).each_slice(WORDS_PER_PAGE).map { |page_words| page_words.join(' ') }
  end

  # Applies the Sanitizer passes in order: newlines, spaces, bullet points.
  def sanitize_page_content(content)
    sanitized_text = Sanitizer.remove_excessive_newlines(content)
    sanitized_text = Sanitizer.remove_excessive_spaces(sanitized_text)
    Sanitizer.remove_bullet_points(sanitized_text)
  end
end
@@ -0,0 +1,83 @@
1
+ require 'nokogiri'
2
+ require 'watir'
3
+ require 'webdrivers'
4
+ require_relative '../utils/sanitizer.rb'
5
+
6
# Renders a URL in a headless browser through Watir, extracts the page title
# and body text, sanitizes them and splits the result into "pages" of
# WORDS_PER_PAGE words.
class HtmlContentExtractor
  # Sanitized "title + body" text of the last successful extraction.
  attr_reader :content

  WORDS_PER_PAGE = 500

  class ExtractionError < StandardError; end
  class UrlDownloadError < ExtractionError; end
  class ParsingError < ExtractionError; end

  # Browser drivers tried in order until one downloads the page.
  BROWSERS = [:chrome, :firefox, :safari].freeze

  # @param url [String] page to download
  # @param render_wait [Numeric] seconds to wait after navigation so that
  #   JavaScript can render (defaults to the previous fixed value of 5)
  def initialize(url, render_wait: 5)
    @url = url
    @render_wait = render_wait
  end

  # @return [Array(Array<String>, String)] the pages and the full sanitized text
  # @raise [ExtractionError] wrapping any failure raised during extraction
  def call
    extract_content
  rescue StandardError => e
    raise ExtractionError, "HTML content extraction failed: #{e.message}"
  end

  private

  def extract_content
    BROWSERS.each do |browser|
      begin
        html = download_html_with_watir(browser)
        document = parse_html(html)
        text_content = document.xpath('//body').text.strip
        title = document.title

        @content = sanitize_content("#{title}\n\n#{text_content}")
        return [split_into_pages(@content), @content]
      rescue UrlDownloadError
        next # this driver failed; fall back to the next browser
      end
    end

    raise UrlDownloadError, "Failed to download URL with all browser drivers"
  end

  # Downloads the rendered HTML using the given browser driver name.
  #
  # The Watir session lives in its own variable: +browser+ stays the driver
  # name for the error message, and cleanup is skipped when the browser never
  # started instead of calling browser methods on a Symbol.
  def download_html_with_watir(browser)
    session = nil
    session = Watir::Browser.new(browser, headless: true)
    session.goto(@url)
    sleep(@render_wait) if @render_wait.positive?
    session.html
  rescue => e
    raise UrlDownloadError, "Failed to download URL using #{browser.to_s.capitalize} browser: #{e.message}"
  ensure
    close_session(session)
  end

  # Best-effort shutdown: a failure while quitting must not mask the download
  # result or the original error.
  def close_session(session)
    session.quit if session&.exists?
  rescue StandardError
    nil
  end

  def parse_html(html)
    Nokogiri::HTML(html)
  rescue => e
    raise ParsingError, "Failed to parse HTML content: #{e.message}"
  end

  # Applies the Sanitizer passes in order: newlines, spaces, bullet points.
  def sanitize_content(content)
    sanitized_text = Sanitizer.remove_excessive_newlines(content)
    sanitized_text = Sanitizer.remove_excessive_spaces(sanitized_text)
    Sanitizer.remove_bullet_points(sanitized_text)
  end

  # Splits on whitespace and regroups the words WORDS_PER_PAGE at a time.
  def split_into_pages(content)
    content.split(/\s+/).each_slice(WORDS_PER_PAGE).map { |page_words| page_words.join(' ') }
  end
end
@@ -0,0 +1,66 @@
1
+ require 'pdf-reader'
2
+ require 'tempfile'
3
+ require_relative '../utils/sanitizer.rb'
4
+ require_relative '../utils/adapters.rb'
5
+
6
# Extracts sanitized text from a PDF, one string per PDF page.
#
# The wrapped file object must respond to +download+ and yield the document's
# bytes in one or more chunks.
class PdfContentExtractor
  # Page count reported by PDF::Reader for the last extraction (0 before any).
  attr_reader :page_count

  # Custom error classes
  class ExtractionError < StandardError; end
  class UnreadableContentError < ExtractionError; end
  class EmptyContentError < ExtractionError; end

  # @param file [#download] source of the PDF bytes
  def initialize(file)
    @file = file
    @page_count = 0
  end

  # Runs the extraction.
  #
  # @return [Array(Array<String>, String)] the sanitized text of each page and
  #   the same pages joined with newlines into one document string
  # @raise [UnreadableContentError] when PDF::Reader rejects the file
  # @raise [ExtractionError] wrapping any other failure (including empty content)
  def call
    extract_content
  rescue PDF::Reader::MalformedPDFError, PDF::Reader::UnsupportedFeatureError => e
    raise UnreadableContentError, "PDF could not be read: #{e.message}"
  rescue StandardError => e
    raise ExtractionError, "Content extraction failed: #{e.message}"
  end

  private

  def extract_content
    Tempfile.open(['extracted_content', '.pdf'], binmode: true) do |tempfile|
      begin
        @file.download { |chunk| tempfile.write(chunk.force_encoding("ASCII-8BIT")) }
        tempfile.close # flush everything to disk before reading it back by path

        reader = PDF::Reader.new(tempfile.path)
        @page_count = reader.page_count

        pages = reader.pages.map do |page|
          # Re-encode to UTF-8, replacing bytes that cannot be converted.
          page_text = page.text.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
          sanitize_page_content(page_text)
        end

        if pages.all? { |page| page.nil? || page.strip.empty? }
          raise EmptyContentError, 'The PDF content is empty or unreadable.'
        end

        # The original returned an undefined local here (NameError); build the
        # whole-document text from the pages so the result is a
        # [pages, content] pair like the other extractors.
        content = pages.join("\n")

        [pages, content]
      ensure
        tempfile.unlink # delete the tempfile
      end
    end
  end

  # Applies the Sanitizer passes in order: newlines, spaces, bullet points.
  def sanitize_page_content(page_text)
    sanitized_text = Sanitizer.remove_excessive_newlines(page_text)
    sanitized_text = Sanitizer.remove_excessive_spaces(sanitized_text)
    Sanitizer.remove_bullet_points(sanitized_text)
  end
end
@@ -0,0 +1,66 @@
1
+ require 'tempfile'
2
+ require_relative '../utils/sanitizer.rb'
3
+ require_relative '../utils/adapters.rb'
4
+
5
# Extracts sanitized text from a plain-text document and splits it into
# "pages" of WORDS_PER_PAGE words.
#
# The wrapped file object must respond to +download+ and yield the document's
# bytes in one or more chunks.
class TxtContentExtractor
  # Number of pages produced by the last successful extraction.
  attr_reader :page_count

  WORDS_PER_PAGE = 500

  # Custom error classes
  class ExtractionError < StandardError; end
  class FileDownloadError < ExtractionError; end
  class FileReadError < ExtractionError; end

  # @param file [#download] source of the text bytes
  def initialize(file)
    @file = file
  end

  # Runs the extraction.
  #
  # @return [Array(Array<String>, String)] the pages and the full sanitized
  #   text, the same pair shape the DOCX and HTML extractors return
  # @raise [ExtractionError] wrapping any failure raised during extraction
  def call
    extract_content
  rescue StandardError => e
    raise ExtractionError, "Content extraction failed: #{e.message}"
  end

  private

  def extract_content
    Tempfile.open(['temp', '.txt'], binmode: true) do |tempfile|
      download_into(tempfile)
      content = read_back(tempfile.path)

      sanitized_content = sanitize_page_content(content)
      pages = split_into_pages(sanitized_content)
      @page_count = pages.size

      # Return the pair (not just the pages) so callers that destructure
      # `pages, content = extractor.call` get the intended values.
      [pages, sanitized_content]
    end
  end

  # Streams the download into +tempfile+ and flushes it, so the bytes are on
  # disk before the file is read back by path.
  def download_into(tempfile)
    @file.download { |chunk| tempfile.write(chunk.force_encoding("UTF-8")) }
    tempfile.flush
  rescue => e
    raise FileDownloadError, "Failed to download file: #{e.message}"
  end

  def read_back(path)
    File.read(path)
  rescue => e
    raise FileReadError, "Failed to read file: #{e.message}"
  end

  # Splits on whitespace and regroups the words WORDS_PER_PAGE at a time.
  def split_into_pages(content)
    content.split(/\s+/).each_slice(WORDS_PER_PAGE).map { |page_words| page_words.join(' ') }
  end

  # Applies the Sanitizer passes in order: newlines, spaces, bullet points.
  def sanitize_page_content(content)
    sanitized_text = Sanitizer.remove_excessive_newlines(content)
    sanitized_text = Sanitizer.remove_excessive_spaces(sanitized_text)
    Sanitizer.remove_bullet_points(sanitized_text)
  end
end
@@ -0,0 +1,45 @@
1
+ require 'ruby/openai'
2
+
3
# Groups the sentences of a text into chunks whose estimated token count
# (OpenAI.rough_token_count) stays within a limit.
class BasicTextChunker
  # @param token_limit [Integer] maximum estimated tokens per chunk
  # @raise [ArgumentError] when the limit is not a positive integer (a
  #   non-positive limit could never be satisfied and would loop forever)
  def initialize(token_limit=390)
    unless token_limit.is_a?(Integer) && token_limit.positive?
      raise ArgumentError, "token_limit must be a positive integer"
    end

    @token_limit = token_limit
  end

  # Splits +text+ into sentences on ".", "!" or "?" followed by whitespace
  # (the delimiter is dropped) and packs them greedily into chunks.
  #
  # A sentence that is itself over the limit is cut on word boundaries: as
  # many words as the current chunk has room for (one word per remaining token
  # of budget) are moved into it, the chunk is closed, and the rest of the
  # sentence is re-measured.
  #
  # @param text [String]
  # @return [Array<String>] non-empty chunks in document order
  def split_into_chunks(text)
    chunks = []
    current_chunk = String.new
    current_token_count = 0

    text.split(/[.!?]\s+/).each do |sentence|
      sentence_token_count = OpenAI.rough_token_count(sentence)

      while sentence_token_count > @token_limit
        room = @token_limit - current_token_count
        words = sentence.split

        # Work on the word list itself. Slicing the sentence by the length of
        # the re-joined words duplicated or dropped text whenever the words
        # were separated by more than a single space.
        current_chunk << words.first(room).join(' ')
        closed = current_chunk.strip
        chunks << closed unless closed.empty?
        current_chunk = String.new
        current_token_count = 0

        sentence = words.drop(room).join(' ')
        sentence_token_count = OpenAI.rough_token_count(sentence)
      end

      next if sentence.strip.empty?

      if current_token_count + sentence_token_count <= @token_limit
        current_chunk << sentence << ' '
        current_token_count += sentence_token_count
      else
        chunks << current_chunk.strip
        current_chunk = sentence + ' '
        current_token_count = sentence_token_count
      end
    end

    last = current_chunk.strip
    chunks << last unless last.empty?
    chunks
  end
end
@@ -0,0 +1,63 @@
1
+ require 'dotenv'
2
+ require_relative '../mistral_rb.rb'
3
+ require_relative '../content_splitters/basic_sentence_splitter.rb'
4
+
5
+ Dotenv.load()
6
+
7
# Creates embeddings through the Mistral API, either for a single text or for
# every chunk of every page of a document.
class MistralEmbeddingCreator
  # @param api_key [String, nil] falls back to ENV['MISTRAL_API_KEY']
  # @param chunker [#split_into_chunks] splits a page into chunks
  # @param model [String] embedding model name
  def initialize(api_key = nil, chunker = BasicTextChunker.new, model = "mistral-embed")
    @chunker = chunker
    @model = model
    @api_key = api_key || ENV['MISTRAL_API_KEY']

    if @api_key
      @llm = MistralAPI.new(api_key: @api_key)
    else
      log_error "MISTRAL AI API key not provided. Set the MISTRAL_API_KEY in the ENV variables or pass it as an argument."
    end
  end

  # @param text [Array<String>, String] the pages of a document when
  #   +pages_mode+ is true, otherwise a single text
  # @param pages_mode [Boolean]
  # @return [Array<Hash>, Object] in pages mode, one
  #   +{id:, values:, metadata: {text:, page:}}+ hash per chunk (an empty array
  #   when no API client is configured); otherwise the embedding of +text+
  # @raise [RuntimeError] in single-text mode when no API client is configured
  def call(text, pages_mode=true)
    return embed(text) unless pages_mode
    return [] unless @llm # Return empty if the API client isn't set up

    vectors = []
    text.each_with_index do |page_content, page_index|
      @chunker.split_into_chunks(page_content).each do |chunk|
        vectors << {
          # Number ids across the whole document: restarting at 1 on every
          # page gave chunks of different pages the same id.
          id: "vec #{vectors.size + 1}",
          values: embed(chunk),
          metadata: {
            text: chunk,
            page: page_index + 1,
          }
        }
      end
    end
    vectors
  end

  private

  # Requests the embedding of one input string and returns the embedding of
  # the first item in the response.
  def embed(input)
    raise "MISTRAL AI API client is not configured (missing API key)." unless @llm

    response = @llm.create_embeddings(
      model: @model,
      input: [input]
    )
    response.data.first.embedding
  end

  # Uses the Rails logger when running inside Rails, otherwise writes to
  # stderr, so the class also works in plain Ruby scripts.
  def log_error(message)
    if defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger
      Rails.logger.error message
    else
      warn message
    end
  end
end
@@ -0,0 +1,67 @@
1
+ require 'dotenv'
2
+ require 'ruby/openai'
3
+ require_relative '../content_splitters/basic_sentence_splitter.rb'
4
+
5
+ Dotenv.load()
6
+
7
# Creates embeddings through the OpenAI API, either for a single text or for
# every chunk of every page of a document.
class OpenaiEmbeddingCreator
  # @param api_key [String, nil] falls back to ENV['OPENAI_API_KEY']
  # @param chunker [#split_into_chunks] splits a page into chunks
  # @param model [String] embedding model name
  def initialize(api_key = nil, chunker = BasicTextChunker.new, model = "text-embedding-ada-002")
    @chunker = chunker
    @model = model
    @api_key = api_key || ENV['OPENAI_API_KEY']

    if @api_key
      @llm = OpenAI::Client.new(access_token: @api_key)
    else
      log_error "OpenAI API key not provided. Set the OPENAI_API_KEY in the ENV variables or pass it as an argument."
    end
  end

  # @param text [Array<String>, String] the pages of a document when
  #   +pages_mode+ is true, otherwise a single text
  # @param pages_mode [Boolean]
  # @return [Array<Hash>, Object] in pages mode, one
  #   +{id:, values:, metadata: {text:, page:}}+ hash per chunk (an empty array
  #   when no API client is configured); otherwise the embedding of +text+
  # @raise [RuntimeError] in single-text mode when no API client is configured
  def call(text, pages_mode=true)
    # The single-text branch used to send an undefined local instead of +text+.
    return embed(text) unless pages_mode
    return [] unless @llm # Return empty if the API client isn't set up

    vectors = []
    text.each_with_index do |page_content, page_index|
      @chunker.split_into_chunks(page_content).each do |chunk|
        vectors << {
          # Number ids across the whole document: restarting at 1 on every
          # page gave chunks of different pages the same id.
          id: "vec #{vectors.size + 1}",
          values: embed(chunk),
          metadata: {
            text: chunk,
            page: page_index + 1,
          }
        }
      end
    end
    vectors
  end

  private

  # Requests the embedding of one input string and returns the embedding of
  # the first item in the response.
  def embed(input)
    raise "OpenAI API client is not configured (missing API key)." unless @llm

    response = @llm.embeddings(
      parameters: {
        model: @model,
        input: input
      }
    )
    response['data'][0]['embedding']
  end

  # Uses the Rails logger when running inside Rails, otherwise writes to
  # stderr, so the class also works in plain Ruby scripts.
  def log_error(message)
    if defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger
      Rails.logger.error message
    else
      warn message
    end
  end
end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module MistralRb
4
- VERSION = "0.1.3"
4
+ VERSION = "0.1.4"
5
5
  end
data/lib/mistral_rb.rb CHANGED
@@ -4,11 +4,14 @@ require_relative "mistral_rb/version"
4
4
  require "httparty"
5
5
  require "json"
6
6
  require_relative "mistral_rb/response_models"
7
+ require 'dotenv'
8
+
9
+ Dotenv.load()
7
10
 
8
11
  class MistralAPI
9
12
  include HTTParty
10
13
 
11
- def initialize(api_key:, base_uri: "https://api.mistral.ai/v1")
14
+ def initialize(api_key: ENV["MISTRAL_API_KEY"], base_uri: "https://api.mistral.ai/v1")
12
15
  @headers = {
13
16
  "Authorization" => "Bearer #{api_key}",
14
17
  "Content-Type" => "application/json"
data/lib/responder.rb ADDED
@@ -0,0 +1,91 @@
1
+ require 'dotenv'
2
+ require 'ruby/openai'
3
+ require_relative './mistral_rb.rb'
4
+ require_relative './content_extractors/content_extractor_factory.rb'
5
+ require_relative './content_splitters/basic_sentence_splitter.rb'
6
+ require_relative './embedding_engines/mistral_embeddings.rb'
7
+ require_relative './vector_stores/pinecone.rb'
8
+ require_relative './utils/similarity_service.rb'
9
+ require_relative './utils/adapters.rb'
10
+
11
+ Dotenv.load()
12
+
13
# Answers a question about a document (retrieval augmented generation): the
# document is extracted, embedded and stored in a vector store; the question
# is answered by the LLM from the most similar sentences plus the context
# fetched from the vector store.
class Responder
  # @param vector_store [#store, #index] e.g. a PineconeService
  # @param llm [#create_chat_completion]
  # @param file [Object] document handed to ContentExtractorFactory.for
  # @param embedding_creator [#call]
  def initialize(vector_store:, llm: MistralAPI.new, file:, embedding_creator: MistralEmbeddingCreator.new)
    @vector_store = vector_store
    @llm = llm
    @file = file
    @embedding_creator = embedding_creator
  end

  # @param question [String]
  # @param top_k [Integer] number of similar sentences / vector matches to use
  # @return [Object] content of the first choice's message in the LLM response
  def call(question, top_k=10)
    embedding = text_to_embedding(question)
    results = process_similarity(question, top_k)
    context = fetch_context(embedding, top_k)
    merged_text = merge_texts(results, context)
    prompt = construct_prompt(question, merged_text)
    generate_response(prompt)
  end

  private

  # Extracts the document once and memoizes its pages and full content.
  def extract_content
    @extractor ||= ContentExtractorFactory.for(@file)

    if @pages.nil? || @content.nil?
      extracted_pages, extracted_content = @extractor.call
      @pages ||= extracted_pages
      @content ||= extracted_content
    end
  end

  # Embeds the pages and stores them once, memoizing the namespace.
  def store_embeddings
    extract_content # the pages must exist before they can be embedded
    @embeddings ||= @embedding_creator.call(@pages)
    @namespace ||= @vector_store.store(@embeddings, @content)
  end

  def text_to_embedding(question)
    @embedding_creator.call(question, false)
  end

  # Ranks the document's sentences against the question.
  def process_similarity(question, top_k)
    extract_content # Ensure content is extracted
    similarity_service = SimilarityService.new(question, @pages)
    similarity_service.most_similar_sentences(top_k)
  end

  # Queries the vector store for the matches closest to the embedding;
  # returns nil when nothing was stored.
  def fetch_context(embedding, top_k)
    store_embeddings # Ensure embeddings are stored
    return nil unless @namespace

    @vector_store.index.query(
      vector: embedding,
      namespace: @namespace,
      top_k: top_k,
      include_values: false,
      include_metadata: true
    )
  end

  # Joins the similarity results with the text of the vector-store matches.
  def merge_texts(results, context)
    [results, context_text(context)].compact.join(' ')
  end

  # Pulls the chunk text (stored under metadata "text" by the embedding
  # creators) out of each match, so the prompt gets document text rather than
  # a dump of the whole response with ids and scores.
  # NOTE(review): assumes the response exposes a "matches" list with string
  # keys, as the Pinecone query API documents; any other shape is stringified
  # as before.
  def context_text(context)
    return nil if context.nil?

    matches = context.respond_to?(:dig) ? context.dig('matches') : nil
    return context.to_s unless matches.respond_to?(:map)

    matches.map { |match| match.dig('metadata', 'text') }.compact.join(' ')
  end

  def construct_prompt(question, merged_text)
    "You are a helpful assistant. Answer this question: #{question}, using these information from the document the user uploaded: #{merged_text} in 60 words. Reply in the language of the question."
  end

  def generate_response(prompt)
    response = @llm.create_chat_completion(
      model: "mistral-tiny",
      messages: [{role: "user", content: prompt}]
    )
    response.choices.first.message.content
  end
end
@@ -0,0 +1,12 @@
1
# Wraps a local file path behind the same +download+ interface the content
# extractors call on their file objects.
class LocalFileAdapter
  # @param file_path [String] path of the file on disk
  def initialize(file_path)
    @file_path = file_path
  end

  # Reads the whole file in binary mode and yields its contents once.
  #
  # @yieldparam contents [String] the file's bytes
  # @return [Object] whatever the block returns
  def download
    yield File.binread(@file_path)
  end
end
@@ -0,0 +1,16 @@
1
+ module Sanitizer
2
+ # Remove sequences of more than two newline characters
3
+ def self.remove_excessive_newlines(text)
4
+ text.gsub(/(\n\s*){3,}/, "\n\n")
5
+ end
6
+
7
+ # Remove sequences of more than two spaces and replace with one space
8
+ def self.remove_excessive_spaces(text)
9
+ text.gsub(/ {3,}/, ' ')
10
+ end
11
+
12
+ # Remove bullet point characters
13
+ def self.remove_bullet_points(text)
14
+ text.gsub("•", "")
15
+ end
16
+ end
@@ -0,0 +1,40 @@
1
+
2
# Ranks the sentences of a document by Jaccard similarity to a question,
# ignoring a small list of French and English stop words.
class SimilarityService
  FRENCH_STOP_WORDS = %w(
    je tu il nous vous ils elle me te se le la les et ou mais
    que quand donc or ni car
  ).freeze

  ENGLISH_STOP_WORDS = %w(
    i you he we they she me him us them and or but that when so nor for
  ).freeze

  STOP_WORDS = (FRENCH_STOP_WORDS + ENGLISH_STOP_WORDS).freeze

  # Characters that end a sentence when splitting document chunks.
  SENTENCE_DELIMITERS = /[\.\?!:]/

  # @param input_question [String]
  # @param document_chunks [Array<String>] texts to take sentences from
  def initialize(input_question, document_chunks)
    @input_question = input_question
    @document_chunks = document_chunks
  end

  # Jaccard similarity of the two strings' distinct, lower-cased,
  # space-separated words with stop words removed.
  #
  # @return [Float] between 0.0 and 1.0; 0.0 when neither string has a
  #   significant word (instead of the NaN that 0/0 would produce)
  def jaccard_similarity(str1, str2)
    words1 = significant_words(str1)
    words2 = significant_words(str2)
    union = words1 | words2
    return 0.0 if union.empty?

    (words1 & words2).size.to_f / union.size
  end

  # Returns the +top_k+ sentences most similar to the question, joined with
  # spaces. Empty sentences are skipped; equally similar sentences keep their
  # document order.
  #
  # @param top_k [Integer]
  # @return [String]
  def most_similar_sentences(top_k)
    sentences = @document_chunks
                .flat_map { |chunk| chunk.split(SENTENCE_DELIMITERS) }
                .map(&:strip)
                .reject(&:empty?)

    ranked = sentences.each_with_index.sort_by do |sentence, position|
      # The position breaks ties deterministically.
      [-jaccard_similarity(@input_question, sentence), position]
    end

    ranked.first(top_k).map(&:first).join(' ')
  end

  private

  # Distinct lower-cased words of +text+ without stop words. Plain arrays are
  # used so that no extra library has to be loaded for set operations.
  def significant_words(text)
    text.downcase.split(" ").uniq - STOP_WORDS
  end
end
@@ -0,0 +1,61 @@
1
+ require 'pinecone'
2
+ require 'digest'
3
+ require 'dotenv'
4
+
5
+ Dotenv.load()
6
+
7
# Thin wrapper around a Pinecone index: configures the client and stores
# embeddings under a namespace derived from the document text.
class PineconeService
  # The Pinecone index named by +index_name+.
  attr_reader :index

  # @param pinecone_key [String] defaults to ENV['PINECONE_API_KEY']
  # @param pinecone_env [String] defaults to ENV['PINECONE_ENV']
  # @param index_name [String]
  # @raise [ArgumentError] when the API key or environment is missing
  #   (previously this surfaced as a NameError or NoMethodError)
  def initialize(pinecone_key: ENV['PINECONE_API_KEY'], pinecone_env: ENV['PINECONE_ENV'], index_name:)
    @pinecone_key = pinecone_key
    @pinecone_env = pinecone_env
    @index_name = index_name

    unless @pinecone_key && @pinecone_env
      raise ArgumentError, "Set the PINECONE_API_KEY and PINECONE_ENV in the ENV variables"
    end

    Pinecone.configure do |config|
      config.api_key = @pinecone_key
      config.environment = @pinecone_env
    end

    @pinecone = Pinecone::Client.new
    @index = @pinecone.index(@index_name)
  end

  # First 44 hex characters of the SHA-256 digest of +text+.
  def compute_hash(text)
    Digest::SHA256.hexdigest(text)[0...44]
  end

  # Upserts +embeddings+ into the namespace computed from +text+.
  #
  # @return [String] the namespace the vectors were stored under
  def store(embeddings, text)
    namespace = compute_hash(text)

    upsert_with_retry(@index, namespace, embeddings)

    namespace
  end

  private

  # Upserts the vectors, retrying while the response's "code" is 9, at most
  # +max_retries+ times with +retry_delay+ seconds between attempts.
  #
  # @return [Object] the response of the last attempt
  def upsert_with_retry(index, namespace, embeddings, max_retries = 5, retry_delay = 10)
    retries = 0
    response = nil

    loop do
      response = index.upsert(
        namespace: namespace,
        vectors: embeddings
      )

      break if response["code"] != 9 || retries >= max_retries

      puts "Encountered error. Retrying in #{retry_delay} seconds... (Attempt #{retries + 1} of #{max_retries})"
      sleep(retry_delay)
      retries += 1
    end

    response
  end
end
data/mistral_rb.gemspec CHANGED
@@ -31,10 +31,22 @@ Gem::Specification.new do |spec|
31
31
  spec.require_paths = ["lib"]
32
32
 
33
33
  # Specify runtime and development dependencies in gemspec
34
- spec.add_runtime_dependency "httparty", "~> 0.18"
35
34
  spec.add_development_dependency "bundler", "~> 2.0"
36
35
  spec.add_development_dependency "rake", "~> 13.0"
37
36
 
37
+ spec.add_runtime_dependency "httparty", "~> 0.18"
38
+ spec.add_runtime_dependency "mime-types"
39
+ spec.add_runtime_dependency "pdf-reader"
40
+ spec.add_runtime_dependency "pinecone"
41
+ spec.add_runtime_dependency "docx"
42
+ spec.add_runtime_dependency "dotenv-rails"
43
+ # spec.add_runtime_dependency "csv"
44
+ # spec.add_runtime_dependency "daru"
45
+ spec.add_runtime_dependency "nokogiri"
46
+ spec.add_runtime_dependency 'selenium-webdriver', '~> 4.5'
47
+ spec.add_runtime_dependency 'webdrivers', '~> 5.3'
48
+ spec.add_runtime_dependency 'watir'
49
+ spec.add_runtime_dependency 'ruby-openai'
38
50
  # For more information and examples about making a new gem, check out our
39
51
  # guide at: https://bundler.io/guides/creating_gem.html
40
52
  end
metadata CHANGED
@@ -1,15 +1,43 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: mistral_rb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Franck Stephane Ndzomga
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-12-24 00:00:00.000000000 Z
11
+ date: 2023-12-26 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '13.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '13.0'
13
41
  - !ruby/object:Gem::Dependency
14
42
  name: httparty
15
43
  requirement: !ruby/object:Gem::Requirement
@@ -25,33 +53,145 @@ dependencies:
25
53
  - !ruby/object:Gem::Version
26
54
  version: '0.18'
27
55
  - !ruby/object:Gem::Dependency
28
- name: bundler
56
+ name: mime-types
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: pdf-reader
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: pinecone
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: docx
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: dotenv-rails
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: nokogiri
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ type: :runtime
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ">="
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
139
+ - !ruby/object:Gem::Dependency
140
+ name: selenium-webdriver
29
141
  requirement: !ruby/object:Gem::Requirement
30
142
  requirements:
31
143
  - - "~>"
32
144
  - !ruby/object:Gem::Version
33
- version: '2.0'
34
- type: :development
145
+ version: '4.5'
146
+ type: :runtime
35
147
  prerelease: false
36
148
  version_requirements: !ruby/object:Gem::Requirement
37
149
  requirements:
38
150
  - - "~>"
39
151
  - !ruby/object:Gem::Version
40
- version: '2.0'
152
+ version: '4.5'
41
153
  - !ruby/object:Gem::Dependency
42
- name: rake
154
+ name: webdrivers
43
155
  requirement: !ruby/object:Gem::Requirement
44
156
  requirements:
45
157
  - - "~>"
46
158
  - !ruby/object:Gem::Version
47
- version: '13.0'
48
- type: :development
159
+ version: '5.3'
160
+ type: :runtime
49
161
  prerelease: false
50
162
  version_requirements: !ruby/object:Gem::Requirement
51
163
  requirements:
52
164
  - - "~>"
53
165
  - !ruby/object:Gem::Version
54
- version: '13.0'
166
+ version: '5.3'
167
+ - !ruby/object:Gem::Dependency
168
+ name: watir
169
+ requirement: !ruby/object:Gem::Requirement
170
+ requirements:
171
+ - - ">="
172
+ - !ruby/object:Gem::Version
173
+ version: '0'
174
+ type: :runtime
175
+ prerelease: false
176
+ version_requirements: !ruby/object:Gem::Requirement
177
+ requirements:
178
+ - - ">="
179
+ - !ruby/object:Gem::Version
180
+ version: '0'
181
+ - !ruby/object:Gem::Dependency
182
+ name: ruby-openai
183
+ requirement: !ruby/object:Gem::Requirement
184
+ requirements:
185
+ - - ">="
186
+ - !ruby/object:Gem::Version
187
+ version: '0'
188
+ type: :runtime
189
+ prerelease: false
190
+ version_requirements: !ruby/object:Gem::Requirement
191
+ requirements:
192
+ - - ">="
193
+ - !ruby/object:Gem::Version
194
+ version: '0'
55
195
  description: This gem provides an easy-to-use interface for the Mistral AI API.
56
196
  email:
57
197
  - ndzomgafs@gmail.com
@@ -65,9 +205,22 @@ files:
65
205
  - LICENSE.txt
66
206
  - README.md
67
207
  - Rakefile
208
+ - lib/content_extractors/content_extractor_factory.rb
209
+ - lib/content_extractors/docx_content_extractor.rb
210
+ - lib/content_extractors/html_content_extractor.rb
211
+ - lib/content_extractors/pdf_content_extractor.rb
212
+ - lib/content_extractors/text_content_extractor.rb
213
+ - lib/content_splitters/basic_sentence_splitter.rb
214
+ - lib/embedding_engines/mistral_embeddings.rb
215
+ - lib/embedding_engines/openai_embeddings.rb
68
216
  - lib/mistral_rb.rb
69
217
  - lib/mistral_rb/response_models.rb
70
218
  - lib/mistral_rb/version.rb
219
+ - lib/responder.rb
220
+ - lib/utils/adapters.rb
221
+ - lib/utils/sanitizer.rb
222
+ - lib/utils/similarity_service.rb
223
+ - lib/vector_stores/pinecone.rb
71
224
  - mistral_rb.gemspec
72
225
  - sig/mistral_rb.rbs
73
226
  homepage: https://github.com/fsndzomga/mistral_rb
@@ -92,7 +245,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
92
245
  - !ruby/object:Gem::Version
93
246
  version: '0'
94
247
  requirements: []
95
- rubygems_version: 3.3.7
248
+ rubygems_version: 3.5.3
96
249
  signing_key:
97
250
  specification_version: 4
98
251
  summary: A simple wrapper for the Mistral API