langchainrb 0.3.11 → 0.3.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/Gemfile.lock +2 -1
- data/README.md +22 -6
- data/examples/pdf_store_and_query_with_chroma.rb +2 -2
- data/lib/langchain/loader.rb +80 -0
- data/lib/langchain/processors/base.rb +14 -0
- data/lib/langchain/processors/docx.rb +24 -0
- data/lib/langchain/processors/html.rb +28 -0
- data/lib/langchain/processors/pdf.rb +26 -0
- data/lib/langchain/processors/text.rb +17 -0
- data/lib/langchain.rb +10 -14
- data/lib/langchainrb.rb +1 -0
- data/lib/vectorsearch/base.rb +4 -12
- data/lib/vectorsearch/pinecone.rb +26 -10
- data/lib/version.rb +1 -1
- metadata +9 -8
- data/lib/loader.rb +0 -26
- data/lib/loaders/base.rb +0 -19
- data/lib/loaders/docx.rb +0 -34
- data/lib/loaders/html.rb +0 -38
- data/lib/loaders/pdf.rb +0 -36
- data/lib/loaders/text.rb +0 -24
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 974f0a2b8ce3fe42144016bd740ee9d4f7e597834319cc92fbf1d50bd1f4468e
|
4
|
+
data.tar.gz: 3686a42c37eb117e6d7485ef4f7777c0f12968bb9cdcc3a30c7721c86c0a4325
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a61f9b36d9d19eb6cf87af18c7fb40f55d39771257d08a6af2ec3384988419dfb158ffa8fc81c3769c0149f1ffa8b03200366bbea55b03b0d1553912af8d9ae6
|
7
|
+
data.tar.gz: 7dc53be923fe5b8587f61617198b24c42e8793fbd8e18c42a17035bf68279c59c37c6c691cabe13c83adc5dc2cff66ea293f198297ab9a9de30aa68ca72bd9c4
|
data/CHANGELOG.md
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
langchainrb (0.3.11)
|
4
|
+
langchainrb (0.3.12)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: https://rubygems.org/
|
@@ -271,6 +271,7 @@ GEM
|
|
271
271
|
zeitwerk (2.6.8)
|
272
272
|
|
273
273
|
PLATFORMS
|
274
|
+
arm64-darwin-21
|
274
275
|
arm64-darwin-22
|
275
276
|
x86_64-darwin-19
|
276
277
|
x86_64-darwin-22
|
data/README.md
CHANGED
@@ -268,12 +268,28 @@ agent.run(question: "How many full soccer fields would be needed to cover the di
|
|
268
268
|
|
269
269
|
Need to read data from various sources? Load it up.
|
270
270
|
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
271
|
+
##### Usage
|
272
|
+
|
273
|
+
Just call `Langchain::Loader.load` with the path to the file or a URL you want to load.
|
274
|
+
|
275
|
+
```ruby
|
276
|
+
Langchain::Loader.load('/path/to/file.pdf')
|
277
|
+
```
|
278
|
+
|
279
|
+
or
|
280
|
+
|
281
|
+
```ruby
|
282
|
+
Langchain::Loader.load('https://www.example.com/file.pdf')
|
283
|
+
```
|
284
|
+
|
285
|
+
##### Supported Formats
|
286
|
+
|
287
|
+
| Format | Processor | Gem Requirements |
|
288
|
+
| ------ | ---------------- | :--------------------------: |
|
289
|
+
| docx | Processors::Docx | `gem "docx", "~> 0.8.0"` |
|
290
|
+
| html | Processors::HTML | `gem "nokogiri", "~> 1.13"` |
|
291
|
+
| pdf | Processors::PDF | `gem "pdf-reader", "~> 1.4"` |
|
292
|
+
| text | Processors::Text | |
|
277
293
|
|
278
294
|
## Examples
|
279
295
|
Additional examples available: [/examples](https://github.com/andreibondarev/langchainrb/tree/main/examples)
|
@@ -0,0 +1,80 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "open-uri"
|
4
|
+
|
5
|
+
module Langchain
|
6
|
+
class Loader
|
7
|
+
class FileNotFound < StandardError; end
|
8
|
+
|
9
|
+
class UnknownFormatError < StandardError; end
|
10
|
+
|
11
|
+
URI_REGEX = %r{\A[A-Za-z][A-Za-z0-9+\-.]*://}
|
12
|
+
|
13
|
+
# Load data from a file or url
|
14
|
+
# Equivalent to Langchain::Loader.new(path).load
|
15
|
+
# @param path [String | Pathname] path to file or url
|
16
|
+
# @return [String] file content
|
17
|
+
def self.load(path)
|
18
|
+
new(path).load
|
19
|
+
end
|
20
|
+
|
21
|
+
# Initialize Langchain::Loader
|
22
|
+
# @param path [String | Pathname] path to file or url
|
23
|
+
# @return [Langchain::Loader] loader instance
|
24
|
+
def initialize(path)
|
25
|
+
@path = path
|
26
|
+
end
|
27
|
+
|
28
|
+
# Check if path is url
|
29
|
+
# @return [Boolean] true if path is url
|
30
|
+
def url?
|
31
|
+
return false if @path.is_a?(Pathname)
|
32
|
+
|
33
|
+
!!(@path =~ URI_REGEX)
|
34
|
+
end
|
35
|
+
|
36
|
+
# Load data from a file or url
|
37
|
+
# @return [String] file content
|
38
|
+
def load
|
39
|
+
url? ? from_url(@path) : from_path(@path)
|
40
|
+
end
|
41
|
+
|
42
|
+
private
|
43
|
+
|
44
|
+
def from_url(url)
|
45
|
+
process do
|
46
|
+
data = URI.parse(url).open
|
47
|
+
processor = find_processor(:CONTENT_TYPES, data.content_type)
|
48
|
+
[data, processor]
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
def from_path(path)
|
53
|
+
raise FileNotFound unless File.exist?(path)
|
54
|
+
|
55
|
+
process do
|
56
|
+
[File.open(path), find_processor(:EXTENSIONS, File.extname(path))]
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
def process(&block)
|
61
|
+
data, processor = yield
|
62
|
+
|
63
|
+
raise UnknownFormatError unless processor
|
64
|
+
|
65
|
+
Langchain::Processors.const_get(processor).new.parse(data)
|
66
|
+
end
|
67
|
+
|
68
|
+
def find_processor(constant, value)
|
69
|
+
processors.find { |klass| processor_matches? "#{klass}::#{constant}", value }
|
70
|
+
end
|
71
|
+
|
72
|
+
def processor_matches?(constant, value)
|
73
|
+
Langchain::Processors.const_get(constant).include?(value)
|
74
|
+
end
|
75
|
+
|
76
|
+
def processors
|
77
|
+
Langchain::Processors.constants
|
78
|
+
end
|
79
|
+
end
|
80
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Langchain
|
4
|
+
module Processors
|
5
|
+
class Docx < Base
|
6
|
+
EXTENSIONS = [".docx"]
|
7
|
+
CONTENT_TYPES = ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"]
|
8
|
+
|
9
|
+
def initialize
|
10
|
+
depends_on "docx"
|
11
|
+
require "docx"
|
12
|
+
end
|
13
|
+
|
14
|
+
# Parse the document and return the text
|
15
|
+
# @param [File] data
|
16
|
+
# @return [String]
|
17
|
+
def parse(data)
|
18
|
+
::Docx::Document
|
19
|
+
.open(StringIO.new(data.read))
|
20
|
+
.text
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Langchain
|
4
|
+
module Processors
|
5
|
+
class HTML < Base
|
6
|
+
EXTENSIONS = [".html", ".htm"]
|
7
|
+
CONTENT_TYPES = ["text/html"]
|
8
|
+
|
9
|
+
# We only look for headings and paragraphs
|
10
|
+
TEXT_CONTENT_TAGS = %w[h1 h2 h3 h4 h5 h6 p]
|
11
|
+
|
12
|
+
def initialize
|
13
|
+
depends_on "nokogiri"
|
14
|
+
require "nokogiri"
|
15
|
+
end
|
16
|
+
|
17
|
+
# Parse the document and return the text
|
18
|
+
# @param [File] data
|
19
|
+
# @return [String]
|
20
|
+
def parse(data)
|
21
|
+
Nokogiri::HTML(data.read)
|
22
|
+
.css(TEXT_CONTENT_TAGS.join(","))
|
23
|
+
.map(&:inner_text)
|
24
|
+
.join("\n\n")
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Langchain
|
4
|
+
module Processors
|
5
|
+
class PDF < Base
|
6
|
+
EXTENSIONS = [".pdf"]
|
7
|
+
CONTENT_TYPES = ["application/pdf"]
|
8
|
+
|
9
|
+
def initialize
|
10
|
+
depends_on "pdf-reader"
|
11
|
+
require "pdf-reader"
|
12
|
+
end
|
13
|
+
|
14
|
+
# Parse the document and return the text
|
15
|
+
# @param [File] data
|
16
|
+
# @return [String]
|
17
|
+
def parse(data)
|
18
|
+
::PDF::Reader
|
19
|
+
.new(StringIO.new(data.read))
|
20
|
+
.pages
|
21
|
+
.map(&:text)
|
22
|
+
.join("\n\n")
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Langchain
|
4
|
+
module Processors
|
5
|
+
class Text < Base
|
6
|
+
EXTENSIONS = [".txt"]
|
7
|
+
CONTENT_TYPES = ["text/plain"]
|
8
|
+
|
9
|
+
# Parse the document and return the text
|
10
|
+
# @param [File] data
|
11
|
+
# @return [String]
|
12
|
+
def parse(data)
|
13
|
+
data.read
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
data/lib/langchain.rb
CHANGED
@@ -7,7 +7,6 @@ require_relative "./version"
|
|
7
7
|
require_relative "./dependency_helper"
|
8
8
|
module Langchain
|
9
9
|
class << self
|
10
|
-
attr_accessor :default_loaders
|
11
10
|
attr_accessor :logger
|
12
11
|
|
13
12
|
attr_reader :root
|
@@ -16,6 +15,16 @@ module Langchain
|
|
16
15
|
@logger ||= ::Logger.new($stdout, level: :warn, formatter: ->(severity, datetime, progname, msg) { "[LangChain.rb] #{msg}\n" })
|
17
16
|
|
18
17
|
@root = Pathname.new(__dir__)
|
18
|
+
|
19
|
+
autoload :Loader, "langchain/loader"
|
20
|
+
|
21
|
+
module Processors
|
22
|
+
autoload :Base, "langchain/processors/base"
|
23
|
+
autoload :PDF, "langchain/processors/pdf"
|
24
|
+
autoload :HTML, "langchain/processors/html"
|
25
|
+
autoload :Text, "langchain/processors/text"
|
26
|
+
autoload :Docx, "langchain/processors/docx"
|
27
|
+
end
|
19
28
|
end
|
20
29
|
|
21
30
|
module Agent
|
@@ -55,16 +64,3 @@ module Tool
|
|
55
64
|
autoload :SerpApi, "tool/serp_api"
|
56
65
|
autoload :Wikipedia, "tool/wikipedia"
|
57
66
|
end
|
58
|
-
|
59
|
-
module Loaders
|
60
|
-
autoload :Base, "loaders/base"
|
61
|
-
autoload :Docx, "loaders/docx"
|
62
|
-
autoload :PDF, "loaders/pdf"
|
63
|
-
autoload :Text, "loaders/text"
|
64
|
-
autoload :HTML, "loaders/html"
|
65
|
-
end
|
66
|
-
|
67
|
-
autoload :Loader, "loader"
|
68
|
-
|
69
|
-
# Load the default Loaders
|
70
|
-
Langchain.default_loaders ||= [::Loaders::Text, ::Loaders::PDF, ::Loaders::Docx]
|
data/lib/langchainrb.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "langchain"
|
data/lib/vectorsearch/base.rb
CHANGED
@@ -19,8 +19,6 @@ module Vectorsearch
|
|
19
19
|
@llm_api_key = llm_api_key
|
20
20
|
|
21
21
|
@llm_client = LLM.const_get(LLM::Base::LLMS.fetch(llm)).new(api_key: llm_api_key)
|
22
|
-
|
23
|
-
@loaders = Langchain.default_loaders
|
24
22
|
end
|
25
23
|
|
26
24
|
# Method supported by Vectorsearch DB to create a default schema
|
@@ -74,18 +72,12 @@ module Vectorsearch
|
|
74
72
|
raise ArgumentError, "Either path or paths must be provided" if path.nil? && paths.nil?
|
75
73
|
raise ArgumentError, "Either path or paths must be provided, not both" if !path.nil? && !paths.nil?
|
76
74
|
|
77
|
-
texts =
|
78
|
-
|
79
|
-
|
80
|
-
|
75
|
+
texts = Array(path || paths)
|
76
|
+
.flatten
|
77
|
+
.map { |path| Langchain::Loader.new(path)&.load }
|
78
|
+
.compact
|
81
79
|
|
82
80
|
add_texts(texts: texts)
|
83
81
|
end
|
84
|
-
|
85
|
-
attr_reader :loaders
|
86
|
-
|
87
|
-
def add_loader(*loaders)
|
88
|
-
loaders.each { |loader| @loaders << loader }
|
89
|
-
end
|
90
82
|
end
|
91
83
|
end
|
@@ -25,20 +25,22 @@ module Vectorsearch
|
|
25
25
|
|
26
26
|
# Add a list of texts to the index
|
27
27
|
# @param texts [Array] The list of texts to add
|
28
|
+
# @param namespace [String] The namespace to add the texts to
|
29
|
+
# @param metadata [Hash] The metadata to use for the texts
|
28
30
|
# @return [Hash] The response from the server
|
29
|
-
def add_texts(texts:)
|
31
|
+
def add_texts(texts:, namespace: "", metadata: nil)
|
30
32
|
vectors = texts.map do |text|
|
31
33
|
{
|
32
34
|
# TODO: Allows passing in your own IDs
|
33
35
|
id: SecureRandom.uuid,
|
34
|
-
metadata: {content: text},
|
36
|
+
metadata: metadata || {content: text},
|
35
37
|
values: llm_client.embed(text: text)
|
36
38
|
}
|
37
39
|
end
|
38
40
|
|
39
41
|
index = client.index(index_name)
|
40
42
|
|
41
|
-
index.upsert(vectors: vectors)
|
43
|
+
index.upsert(vectors: vectors, namespace: namespace)
|
42
44
|
end
|
43
45
|
|
44
46
|
# Create the index with the default schema
|
@@ -54,40 +56,54 @@ module Vectorsearch
|
|
54
56
|
# Search for similar texts
|
55
57
|
# @param query [String] The text to search for
|
56
58
|
# @param k [Integer] The number of results to return
|
59
|
+
# @param namespace [String] The namespace to search in
|
60
|
+
# @param filter [String] The filter to use
|
57
61
|
# @return [Array] The list of results
|
58
62
|
def similarity_search(
|
59
63
|
query:,
|
60
|
-
k: 4
|
64
|
+
k: 4,
|
65
|
+
namespace: "",
|
66
|
+
filter: nil
|
61
67
|
)
|
62
68
|
embedding = llm_client.embed(text: query)
|
63
69
|
|
64
70
|
similarity_search_by_vector(
|
65
71
|
embedding: embedding,
|
66
|
-
k: k
|
72
|
+
k: k,
|
73
|
+
namespace: namespace,
|
74
|
+
filter: filter
|
67
75
|
)
|
68
76
|
end
|
69
77
|
|
70
78
|
# Search for similar texts by embedding
|
71
79
|
# @param embedding [Array] The embedding to search for
|
72
80
|
# @param k [Integer] The number of results to return
|
81
|
+
# @param namespace [String] The namespace to search in
|
82
|
+
# @param filter [String] The filter to use
|
73
83
|
# @return [Array] The list of results
|
74
|
-
def similarity_search_by_vector(embedding:, k: 4)
|
84
|
+
def similarity_search_by_vector(embedding:, k: 4, namespace: "", filter: nil)
|
75
85
|
index = client.index(index_name)
|
76
86
|
|
77
|
-
|
87
|
+
query_params = {
|
78
88
|
vector: embedding,
|
89
|
+
namespace: namespace,
|
90
|
+
filter: filter,
|
79
91
|
top_k: k,
|
80
92
|
include_values: true,
|
81
93
|
include_metadata: true
|
82
|
-
|
94
|
+
}.compact
|
95
|
+
|
96
|
+
response = index.query(query_params)
|
83
97
|
response.dig("matches")
|
84
98
|
end
|
85
99
|
|
86
100
|
# Ask a question and return the answer
|
87
101
|
# @param question [String] The question to ask
|
102
|
+
# @param namespace [String] The namespace to search in
|
103
|
+
# @param filter [String] The filter to use
|
88
104
|
# @return [String] The answer to the question
|
89
|
-
def ask(question:)
|
90
|
-
search_results = similarity_search(query: question)
|
105
|
+
def ask(question:, namespace: "", filter: nil)
|
106
|
+
search_results = similarity_search(query: question, namespace: namespace, filter: filter)
|
91
107
|
|
92
108
|
context = search_results.map do |result|
|
93
109
|
result.dig("metadata").to_s
|
data/lib/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: langchainrb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.11
|
4
|
+
version: 0.3.12
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrei Bondarev
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-05-
|
11
|
+
date: 2023-05-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: dotenv-rails
|
@@ -288,18 +288,19 @@ files:
|
|
288
288
|
- lib/agent/chain_of_thought_agent/chain_of_thought_agent_prompt.json
|
289
289
|
- lib/dependency_helper.rb
|
290
290
|
- lib/langchain.rb
|
291
|
+
- lib/langchain/loader.rb
|
292
|
+
- lib/langchain/processors/base.rb
|
293
|
+
- lib/langchain/processors/docx.rb
|
294
|
+
- lib/langchain/processors/html.rb
|
295
|
+
- lib/langchain/processors/pdf.rb
|
296
|
+
- lib/langchain/processors/text.rb
|
297
|
+
- lib/langchainrb.rb
|
291
298
|
- lib/llm/base.rb
|
292
299
|
- lib/llm/cohere.rb
|
293
300
|
- lib/llm/google_palm.rb
|
294
301
|
- lib/llm/hugging_face.rb
|
295
302
|
- lib/llm/openai.rb
|
296
303
|
- lib/llm/replicate.rb
|
297
|
-
- lib/loader.rb
|
298
|
-
- lib/loaders/base.rb
|
299
|
-
- lib/loaders/docx.rb
|
300
|
-
- lib/loaders/html.rb
|
301
|
-
- lib/loaders/pdf.rb
|
302
|
-
- lib/loaders/text.rb
|
303
304
|
- lib/prompt/base.rb
|
304
305
|
- lib/prompt/few_shot_prompt_template.rb
|
305
306
|
- lib/prompt/loading.rb
|
data/lib/loader.rb
DELETED
@@ -1,26 +0,0 @@
|
|
1
|
-
module Loader
|
2
|
-
def self.with(*loaders)
|
3
|
-
LoaderSet.new(loaders)
|
4
|
-
end
|
5
|
-
|
6
|
-
class LoaderSet
|
7
|
-
def initialize(loaders)
|
8
|
-
@loaders = Array(loaders)
|
9
|
-
end
|
10
|
-
|
11
|
-
def load(*paths)
|
12
|
-
Array(paths)
|
13
|
-
.flatten
|
14
|
-
.map { |path| first_loadable_loader(path)&.load }
|
15
|
-
.compact
|
16
|
-
end
|
17
|
-
|
18
|
-
def first_loadable_loader(path)
|
19
|
-
@loaders
|
20
|
-
.each do |loader_klass|
|
21
|
-
loader_instance = loader_klass.new(path)
|
22
|
-
return(loader_instance) if loader_instance.loadable?
|
23
|
-
end
|
24
|
-
end
|
25
|
-
end
|
26
|
-
end
|
data/lib/loaders/base.rb
DELETED
@@ -1,19 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
# TODO: Add chunking options to the loaders
|
4
|
-
|
5
|
-
module Loaders
|
6
|
-
class Base
|
7
|
-
def self.load(path)
|
8
|
-
new.load(path)
|
9
|
-
end
|
10
|
-
|
11
|
-
def initialize(path)
|
12
|
-
@path = path
|
13
|
-
end
|
14
|
-
|
15
|
-
def loadable?
|
16
|
-
raise NotImplementedError
|
17
|
-
end
|
18
|
-
end
|
19
|
-
end
|
data/lib/loaders/docx.rb
DELETED
@@ -1,34 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module Loaders
|
4
|
-
class Docx < Base
|
5
|
-
#
|
6
|
-
# This Loader parses Docx files into text.
|
7
|
-
# If you'd like to use it directly you can do so like this:
|
8
|
-
# Loaders::Docx.new("path/to/my.docx").load
|
9
|
-
#
|
10
|
-
# This parser is also invoked when you're adding data to a Vectorsearch DB:
|
11
|
-
# qdrant = Vectorsearch::Qdrant.new(...)
|
12
|
-
# path = Langchain.root.join("path/to/my.docx")
|
13
|
-
# qdrant.add_data(path: path)
|
14
|
-
#
|
15
|
-
|
16
|
-
def initialize(path)
|
17
|
-
depends_on "docx"
|
18
|
-
require "docx"
|
19
|
-
|
20
|
-
@path = path
|
21
|
-
end
|
22
|
-
|
23
|
-
# Check that the file is a `.docx` file
|
24
|
-
def loadable?
|
25
|
-
@path.to_s.end_with?(".docx")
|
26
|
-
end
|
27
|
-
|
28
|
-
def load
|
29
|
-
::Docx::Document
|
30
|
-
.open(@path.to_s)
|
31
|
-
.text
|
32
|
-
end
|
33
|
-
end
|
34
|
-
end
|
data/lib/loaders/html.rb
DELETED
@@ -1,38 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require "open-uri"
|
4
|
-
|
5
|
-
module Loaders
|
6
|
-
class HTML < Base
|
7
|
-
# We only look for headings and paragraphs
|
8
|
-
TEXT_CONTENT_TAGS = %w[h1 h2 h3 h4 h5 h6 p]
|
9
|
-
|
10
|
-
#
|
11
|
-
# This Loader parses URL into a text.
|
12
|
-
# If you'd like to use it directly you can do so like this:
|
13
|
-
# Loaders::URL.new("https://nokogiri.org/").load
|
14
|
-
#
|
15
|
-
def initialize(url)
|
16
|
-
depends_on "nokogiri"
|
17
|
-
require "nokogiri"
|
18
|
-
|
19
|
-
@url = url
|
20
|
-
end
|
21
|
-
|
22
|
-
# Check that url is a valid URL
|
23
|
-
def loadable?
|
24
|
-
!!(@url =~ URI::DEFAULT_PARSER.make_regexp)
|
25
|
-
end
|
26
|
-
|
27
|
-
def load
|
28
|
-
return unless response.status.first == "200"
|
29
|
-
|
30
|
-
doc = Nokogiri::HTML(response.read)
|
31
|
-
doc.css(TEXT_CONTENT_TAGS.join(",")).map(&:inner_text).join("\n\n")
|
32
|
-
end
|
33
|
-
|
34
|
-
def response
|
35
|
-
@response ||= URI.parse(@url).open
|
36
|
-
end
|
37
|
-
end
|
38
|
-
end
|
data/lib/loaders/pdf.rb
DELETED
@@ -1,36 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module Loaders
|
4
|
-
class PDF < Base
|
5
|
-
#
|
6
|
-
# This Loader parses PDF files into text.
|
7
|
-
# If you'd like to use it directly you can do so like this:
|
8
|
-
# Loaders::PDF.new("path/to/my.pdf").load
|
9
|
-
#
|
10
|
-
# This parser is also invoked when you're adding data to a Vectorsearch DB:
|
11
|
-
# qdrant = Vectorsearch::Qdrant.new(...)
|
12
|
-
# path = Langchain.root.join("path/to/my.pdf")
|
13
|
-
# qdrant.add_data(path: path)
|
14
|
-
#
|
15
|
-
|
16
|
-
def initialize(path)
|
17
|
-
depends_on "pdf-reader"
|
18
|
-
require "pdf-reader"
|
19
|
-
|
20
|
-
@path = path
|
21
|
-
end
|
22
|
-
|
23
|
-
# Check that the file is a PDF file
|
24
|
-
def loadable?
|
25
|
-
@path.to_s.end_with?(".pdf")
|
26
|
-
end
|
27
|
-
|
28
|
-
def load
|
29
|
-
::PDF::Reader
|
30
|
-
.new(@path)
|
31
|
-
.pages
|
32
|
-
.map(&:text)
|
33
|
-
.join("\n\n")
|
34
|
-
end
|
35
|
-
end
|
36
|
-
end
|
data/lib/loaders/text.rb
DELETED
@@ -1,24 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module Loaders
|
4
|
-
class Text < Base
|
5
|
-
#
|
6
|
-
# This Loader parses .txt files.
|
7
|
-
# If you'd like to use it directly you can do so like this:
|
8
|
-
# Loaders::Text.new("path/to/my.txt").load
|
9
|
-
#
|
10
|
-
# This parser is also invoked when you're adding data to a Vectorsearch DB:
|
11
|
-
# qdrant = Vectorsearch::Qdrant.new(...)
|
12
|
-
# path = Langchain.root.join("path/to/my.txt")
|
13
|
-
# qdrant.add_data(path: path)
|
14
|
-
#
|
15
|
-
|
16
|
-
def loadable?
|
17
|
-
@path.to_s.end_with?(".txt")
|
18
|
-
end
|
19
|
-
|
20
|
-
def load
|
21
|
-
@path.read
|
22
|
-
end
|
23
|
-
end
|
24
|
-
end
|