beekeeper-ai 0.6.6__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- beekeeper/_bundle/__init__.py +0 -0
- beekeeper_ai-1.0.1.dist-info/METADATA +46 -0
- beekeeper_ai-1.0.1.dist-info/RECORD +5 -0
- {beekeeper_ai-0.6.6.dist-info → beekeeper_ai-1.0.1.dist-info}/WHEEL +1 -1
- beekeeper_ai-1.0.1.dist-info/licenses/LICENSE +176 -0
- beekeeper/__init__.py +0 -1
- beekeeper/core/document/__init__.py +0 -6
- beekeeper/core/document/schema.py +0 -97
- beekeeper/core/document_loaders/__init__.py +0 -5
- beekeeper/core/document_loaders/base.py +0 -24
- beekeeper/core/embeddings/__init__.py +0 -6
- beekeeper/core/embeddings/base.py +0 -44
- beekeeper/core/text_splitters/utils.py +0 -142
- beekeeper/core/utils/pairwise.py +0 -20
- beekeeper/document_loaders/__init__.py +0 -17
- beekeeper/document_loaders/directory.py +0 -65
- beekeeper/document_loaders/docx.py +0 -31
- beekeeper/document_loaders/html.py +0 -77
- beekeeper/document_loaders/json.py +0 -53
- beekeeper/document_loaders/pdf.py +0 -38
- beekeeper/document_loaders/s3.py +0 -72
- beekeeper/document_loaders/watson_discovery.py +0 -121
- beekeeper/embeddings/__init__.py +0 -7
- beekeeper/embeddings/huggingface.py +0 -66
- beekeeper/embeddings/watsonx.py +0 -100
- beekeeper/evaluation/__init__.py +0 -5
- beekeeper/evaluation/knowledge_base_coverage.py +0 -62
- beekeeper/monitor/__init__.py +0 -11
- beekeeper/monitor/watsonx.py +0 -843
- beekeeper/retrievers/__init__.py +0 -5
- beekeeper/retrievers/watson_discovery.py +0 -121
- beekeeper/text_splitters/__init__.py +0 -9
- beekeeper/text_splitters/semantic.py +0 -139
- beekeeper/text_splitters/sentence.py +0 -107
- beekeeper/text_splitters/token.py +0 -101
- beekeeper/vector_stores/__init__.py +0 -7
- beekeeper/vector_stores/chroma.py +0 -115
- beekeeper/vector_stores/elasticsearch.py +0 -183
- beekeeper_ai-0.6.6.dist-info/LICENSE +0 -7
- beekeeper_ai-0.6.6.dist-info/METADATA +0 -49
- beekeeper_ai-0.6.6.dist-info/RECORD +0 -37
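Practical note: the 1.0.1 wheel keeps only an empty beekeeper/_bundle/__init__.py plus packaging metadata, so every loader, splitter, retriever, and vector store listed above disappears. Below is a minimal, hypothetical guard (standard-library importlib.metadata only) that fails fast if a project still relying on the removed modules picks up the new release:

    from importlib.metadata import version

    # The modules below exist in 0.6.6 but are removed from the 1.0.1 wheel,
    # so pin beekeeper-ai==0.6.6 (or vendor the code) if you still need them.
    installed = version("beekeeper-ai")
    if installed != "0.6.6":
        raise RuntimeError(
            f"beekeeper-ai {installed} no longer ships beekeeper.text_splitters / "
            "beekeeper.vector_stores / beekeeper.retrievers; pin beekeeper-ai==0.6.6."
        )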
beekeeper/retrievers/__init__.py
DELETED

beekeeper/retrievers/watson_discovery.py
DELETED
@@ -1,121 +0,0 @@
from logging import getLogger
from typing import List

from beekeeper.core.document import Document, DocumentWithScore

logger = getLogger(__name__)


class WatsonDiscoveryRetriever:
    """Provides functionality to interact with IBM Watson Discovery for querying documents.

    See https://cloud.ibm.com/docs/discovery-data?topic=discovery-data-getting-started for more info.

    Args:
        url (str): Watson Discovery instance url.
        api_key (str): Watson Discovery API key.
        project_id (str): Watson Discovery project_id.
        version (str, optional): Watson Discovery API version. Defaults to ``2023-03-31``.
        disable_passages (bool, optional): Return the full document instead of passages (only enable this if all documents are short). Defaults to ``False``.

    **Example**

    .. code-block:: python

        from beekeeper.retrievers import WatsonDiscoveryRetriever

        doc_retriever = WatsonDiscoveryRetriever(url="your_url",
                                                 api_key="your_api_key",
                                                 project_id="your_project_id")
    """

    def __init__(self,
                 url: str,
                 api_key: str,
                 project_id: str,
                 version: str = "2023-03-31",
                 disable_passages: bool = False
                 ) -> None:
        try:
            from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
            from ibm_watson import DiscoveryV2

        except ImportError:
            raise ImportError("ibm-watson package not found, please install it with `pip install ibm-watson`")

        self.disable_passages = disable_passages
        self.project_id = project_id

        try:
            authenticator = IAMAuthenticator(api_key)
            self._client = DiscoveryV2(authenticator=authenticator,
                                       version=version)

            self._client.set_service_url(url)
        except Exception as e:
            logger.error(f"Error connecting to IBM Watson Discovery: {e}")
            raise

    def query(self, query: str, filter: str = None, top_k: int = 4) -> List[DocumentWithScore]:
        """Search your data in the Discovery API and return a list of documents.

        Args:
            query (str): Query text.
            filter (str, optional): Searches for documents that match the filter. Use Discovery Query Language syntax.
            top_k (int, optional): Number of top results to return. Defaults to ``4``.

        **Example**

        .. code-block:: python

            docs = doc_retriever.query("What's Beekeeper?")
        """
        from ibm_watson.discovery_v2 import QueryLargePassages
        return_fields = ["extracted_metadata.filename", "extracted_metadata.file_type"]

        if not self.disable_passages:
            return_fields.append("passages")
        else:
            return_fields.append("text")

        discovery_results = self._client.query(
            project_id=self.project_id,
            natural_language_query=query,
            count=top_k,
            return_=return_fields,
            filter=filter,
            passages=QueryLargePassages(enabled=not self.disable_passages,
                                        per_document=False,
                                        count=top_k,
                                        find_answers=False,
                                        characters=600)
        ).get_result()

        docs_and_scores = []

        if not self.disable_passages and len(discovery_results["passages"]) > 0:
            # If not `disable_passages`, always use discovery passages (recommended)
            for passage in discovery_results["passages"]:
                document_id_target = passage["document_id"]
                document = [doc for doc in discovery_results["results"] if doc["document_id"] == document_id_target]

                docs_and_scores.append(DocumentWithScore(
                    document=Document(
                        text=passage["passage_text"],
                        metadata={"collection_id": passage["collection_id"]} | document[0]["extracted_metadata"]),
                    score=passage["passage_score"] / 100))

        elif discovery_results["matching_results"] > 0:
            # If `disable_passages`, use document text (not recommended,
            # make sure that all documents are short to not exceed the model context window)
            logger.warning("Not recommended to disable passages. Make sure that all documents are short to not "
                           "exceed the model context window.")
            for document in discovery_results["results"]:
                docs_and_scores.append(DocumentWithScore(
                    document=Document(
                        text=" ".join(document["text"]),
                        metadata={"collection_id": document["result_metadata"]["collection_id"]} | document[
                            "extracted_metadata"]),
                    score=document["result_metadata"]["confidence"]))

        return docs_and_scores
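For reference, a minimal usage sketch of the removed retriever, assembled from its docstring and the query signature above; the credentials are placeholders, and the attribute access on DocumentWithScore assumes the document/score fields it is constructed with:

    from beekeeper.retrievers import WatsonDiscoveryRetriever

    # Placeholder credentials, taken from the class docstring.
    doc_retriever = WatsonDiscoveryRetriever(url="your_url",
                                             api_key="your_api_key",
                                             project_id="your_project_id")

    # `filter` takes Discovery Query Language; `top_k` caps both results and passages.
    docs = doc_retriever.query("What's Beekeeper?", top_k=4)

    for hit in docs:
        # Assumes DocumentWithScore exposes the `score` and `document` it was built with.
        print(hit.score, hit.document.get_content())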

beekeeper/text_splitters/__init__.py
DELETED
@@ -1,9 +0,0 @@
from beekeeper.text_splitters.semantic import SemanticSplitter
from beekeeper.text_splitters.sentence import SentenceSplitter
from beekeeper.text_splitters.token import TokenTextSplitter

__all__ = [
    "SentenceSplitter",
    "SemanticSplitter",
    "TokenTextSplitter",
]
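The only effect of this __init__.py was to expose the splitters at the package level, i.e. the import path that stops working after the deletion:

    # Package-level imports re-exported by the removed __init__.py (gone in 1.0.1).
    from beekeeper.text_splitters import SemanticSplitter, SentenceSplitter, TokenTextSplitter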

beekeeper/text_splitters/semantic.py
DELETED
@@ -1,139 +0,0 @@
import re
from typing import List, Literal, Tuple

import numpy as np
from pydantic.v1 import BaseModel

from beekeeper.core.document import Document
from beekeeper.core.embeddings import BaseEmbedding
from beekeeper.core.utils.pairwise import cosine_similarity


class SemanticSplitter(BaseModel):
    """Python class designed to split text into chunks using semantic understanding.

    Credit to Greg Kamradt's notebook: `5 Levels Of Text Splitting <https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/main/tutorials/LevelsOfTextSplitting/5_Levels_Of_Text_Splitting.ipynb>`_.

    Args:
        embed_model (BaseEmbedding):
        buffer_size (int, optional): Size of the buffer for semantic chunking. Default is ``1``.
        breakpoint_threshold_amount (int, optional): Threshold percentage for detecting breakpoints. Default is ``95``.
        device (str, optional): Device to use for processing. Currently supports "cpu" and "cuda". Default is ``cpu``.

    **Example**

    .. code-block:: python

        from beekeeper.embeddings import HuggingFaceEmbedding
        from beekeeper.text_splitters import SemanticSplitter

        embedding = HuggingFaceEmbedding()
        text_splitter = SemanticSplitter(embed_model=embedding)
    """

    embed_model: BaseEmbedding
    buffer_size: int = 1
    breakpoint_threshold_amount: int = 95
    device: Literal["cpu", "cuda"] = "cpu"

    class Config:
        arbitrary_types_allowed = True

    def _combine_sentences(self, sentences: List[dict]) -> List[dict]:
        """Combine sentences with neighbors based on buffer size."""
        for i in range(len(sentences)):
            combined_sentence = ""

            # Add previous sentences based on buffer size
            for j in range(i - self.buffer_size, i):
                if j >= 0:
                    combined_sentence += sentences[j]["sentence"] + " "

            # Add the current sentence
            combined_sentence += sentences[i]["sentence"]

            # Add next sentences based on buffer size
            for j in range(i + 1, i + 1 + self.buffer_size):
                if j < len(sentences):
                    combined_sentence += " " + sentences[j]["sentence"]

            sentences[i]["combined_sentence"] = combined_sentence

        return sentences

    def _calculate_cosine_distances(self, single_sentences_list: List[str]) -> Tuple[List[float], List[dict]]:
        _sentences = [{"sentence": x, "index": i} for i, x in enumerate(single_sentences_list)]

        sentences = self._combine_sentences(_sentences)
        embeddings = self.embed_model.get_texts_embedding(
            [x["combined_sentence"] for x in sentences]
        )

        for i, sentence in enumerate(sentences):
            sentence["combined_sentence_embedding"] = embeddings[i]

        distances = []
        for i in range(len(sentences) - 1):
            embedding_current = sentences[i]["combined_sentence_embedding"]
            embedding_next = sentences[i + 1]["combined_sentence_embedding"]

            similarity = cosine_similarity(embedding_current, embedding_next)

            distance = 1 - similarity
            distances.append(distance)

            # Store distance in the dictionary
            sentences[i]["distance_to_next"] = distance

        return distances, sentences

    def _calculate_breakpoint(self, distances: List[float]) -> List:
        distance_threshold = np.percentile(distances, self.breakpoint_threshold_amount)

        return [i for i, x in enumerate(distances) if x > distance_threshold]

    def from_text(self, text: str) -> List[str]:
        """Split text into chunks.

        Args:
            text (str): Input text to split.
        """
        single_sentences_list = re.split(r"(?<=[.?!])\s+", text)
        distances, sentences = self._calculate_cosine_distances(single_sentences_list)

        indices_above_thresh = self._calculate_breakpoint(distances)

        chunks = []
        start_index = 0

        for index in indices_above_thresh:
            # Slice the sentence_dicts from the current start index to the end index
            group = sentences[start_index: index + 1]
            combined_text = " ".join([d["sentence"] for d in group])
            chunks.append(combined_text)

            # Update the start index for the next group
            start_index = index + 1

        # The last group, if any sentences remain
        if start_index < len(sentences):
            combined_text = " ".join([d["sentence"] for d in sentences[start_index:]])
            chunks.append(combined_text)

        return chunks

    def from_documents(self, documents: List[Document]) -> List[Document]:
        """Split documents into chunks.

        Args:
            documents (List[Document]): List of `Document` objects to split.
        """
        chunks = []

        for document in documents:
            texts = self.from_text(document.get_content())

            for text in texts:
                chunks.append(Document(text=text, metadata=document.get_metadata()))

        return chunks
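A short usage sketch based on the class docstring and the defaults above; the input text is a placeholder and needs several sentences, since breakpoints are percentiles over sentence-to-sentence cosine distances:

    from beekeeper.embeddings import HuggingFaceEmbedding
    from beekeeper.text_splitters import SemanticSplitter

    embedding = HuggingFaceEmbedding()
    text_splitter = SemanticSplitter(embed_model=embedding,
                                     buffer_size=1,                    # neighbours merged per sentence before embedding
                                     breakpoint_threshold_amount=95)   # distance percentile treated as a chunk boundary

    # Placeholder multi-sentence text; a chunk ends wherever the cosine distance spikes.
    chunks = text_splitter.from_text(
        "Bees pollinate most crops. Hives are inspected in spring. "
        "Vector stores index embeddings. Retrieval returns the closest chunks."
    )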

beekeeper/text_splitters/sentence.py
DELETED
@@ -1,107 +0,0 @@
from typing import List

from beekeeper.core.document import Document
from beekeeper.core.text_splitters.utils import (
    merge_splits,
    split_by_char,
    split_by_fns,
    split_by_regex,
    split_by_sentence_tokenizer,
    split_by_sep,
    tokenizer,
)


class SentenceSplitter:
    """Designed to split input text into smaller chunks,
    particularly useful for processing large documents or texts, tries to keep sentences and paragraphs together.

    Args:
        chunk_size (int, optional): Size of each chunk. Default is ``512``.
        chunk_overlap (int, optional): Amount of overlap between chunks. Default is ``256``.
        separator (str, optional): Separators used for splitting into words. Default is ``" "``

    **Example**

    .. code-block:: python

        from beekeeper.text_splitters import SentenceSplitter

        text_splitter = SentenceSplitter()
    """

    def __init__(self,
                 chunk_size: int = 512,
                 chunk_overlap: int = 256,
                 separator=" "
                 ) -> None:

        if chunk_overlap > chunk_size:
            raise ValueError(
                f"Got a larger `chunk_overlap` ({chunk_overlap}) than `chunk_size` "
                f"({chunk_size}). `chunk_overlap` should be smaller."
            )

        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

        self._split_fns = [
            split_by_sep("\n\n\n"),
            split_by_sentence_tokenizer()
        ]
        self._sub_split_fns = [
            split_by_regex("[^,.;?!]+[,.;?!]?"),
            split_by_sep(separator),
            split_by_char()
        ]

    def from_text(self, text: str) -> List[str]:
        """Split text into chunks.

        Args:
            text (str): Input text to split.

        **Example**

        .. code-block:: python

            chunks = text_splitter.from_text("Beekeeper is a data framework to load any data in one line of code and connect with AI applications.")
        """
        splits = self._split(text)

        return merge_splits(splits, self.chunk_size, self.chunk_overlap)

    def from_documents(self, documents: List[Document]) -> List[Document]:
        """Split documents into chunks.

        Args:
            documents (List[Document]): List of `Document` objects to split.
        """
        chunks = []

        for document in documents:
            texts = self.from_text(document.get_content())

            for text in texts:
                chunks.append(Document(text=text, metadata=document.get_metadata()))

        return chunks

    def _split(self, text: str) -> List[dict]:

        text_len = len(tokenizer(text))
        if text_len <= self.chunk_size:
            return [{"text": text, "is_sentence": True, "token_size": text_len}]

        text_splits = []
        text_splits_by_fns, is_sentence = split_by_fns(text, self._split_fns, self._sub_split_fns)

        for text_split_by_fns in text_splits_by_fns:
            split_len = len(tokenizer(text_split_by_fns))
            if split_len <= self.chunk_size:
                text_splits.append({"text": text_split_by_fns, "is_sentence": is_sentence, "token_size": split_len})
            else:
                recursive_text_splits = self._split(text_split_by_fns)
                text_splits.extend(recursive_text_splits)

        return text_splits
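A corresponding sketch for the sentence splitter; per the code above, chunk_size and chunk_overlap are measured in tokens via the tokenizer helper:

    from beekeeper.text_splitters import SentenceSplitter

    text_splitter = SentenceSplitter(chunk_size=512, chunk_overlap=256)

    # from_text returns List[str]; from_documents wraps each chunk back into a Document
    # and carries the source document's metadata forward.
    chunks = text_splitter.from_text("Beekeeper is a data framework to load any data in one line "
                                     "of code and connect with AI applications.")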

beekeeper/text_splitters/token.py
DELETED
@@ -1,101 +0,0 @@
from typing import List

from beekeeper.core.document import Document
from beekeeper.core.text_splitters.utils import (
    merge_splits,
    split_by_char,
    split_by_fns,
    split_by_sep,
    tokenizer,
)


class TokenTextSplitter:
    r"""This is the simplest splitting method. Designed to split input text into smaller chunks looking at word tokens.

    Args:
        chunk_size (int, optional): Size of each chunk. Default is ``512``.
        chunk_overlap (int, optional): Amount of overlap between chunks. Default is ``256``.
        separator (str, optional): Separators used for splitting into words. Default is ``\\n\\n``.

    **Example**

    .. code-block:: python

        from beekeeper.text_splitters import TokenTextSplitter

        text_splitter = TokenTextSplitter()
    """

    def __init__(self,
                 chunk_size: int = 512,
                 chunk_overlap: int = 256,
                 separator="\n\n") -> None:

        if chunk_overlap > chunk_size:
            raise ValueError(
                f"Got a larger `chunk_overlap` ({chunk_overlap}) than `chunk_size` "
                f"({chunk_size}). `chunk_overlap` should be smaller."
            )

        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

        self._split_fns = [
            split_by_sep(separator)
        ]

        self._sub_split_fns = [
            split_by_char()
        ]

    def from_text(self, text: str) -> List[str]:
        """Split text into chunks.

        Args:
            text (str): Input text to split.

        **Example**

        .. code-block:: python

            chunks = text_splitter.from_text("Beekeeper is a data framework to load any data in one line of code and connect with AI applications.")
        """
        splits = self._split(text)

        return merge_splits(splits, self.chunk_size, self.chunk_overlap)

    def from_documents(self, documents: List[Document]) -> List[Document]:
        """Split documents into chunks.

        Args:
            documents (List[Document]): List of `Document` objects to split.
        """
        chunks = []

        for document in documents:
            texts = self.from_text(document.get_content())

            for text in texts:
                chunks.append(Document(text=text, metadata=document.get_metadata()))

        return chunks

    def _split(self, text: str) -> List[dict]:

        text_len = len(tokenizer(text))
        if text_len <= self.chunk_size:
            return [{"text": text, "is_sentence": True, "token_size": text_len}]

        text_splits = []
        text_splits_by_fns, is_sentence = split_by_fns(text, self._split_fns, self._sub_split_fns)

        for text_split_by_fns in text_splits_by_fns:
            split_len = len(tokenizer(text_split_by_fns))
            if split_len <= self.chunk_size:
                text_splits.append({"text": text_split_by_fns, "is_sentence": False, "token_size": split_len})
            else:
                recursive_text_splits = self._split(text_split_by_fns)
                text_splits.extend(recursive_text_splits)

        return text_splits
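The token splitter exposes the same from_text/from_documents interface; it only splits on the configured separator and falls back to per-character splits. A sketch with a placeholder input:

    from beekeeper.text_splitters import TokenTextSplitter

    text_splitter = TokenTextSplitter(chunk_size=512,
                                      chunk_overlap=256,
                                      separator="\n\n")   # paragraph breaks first, characters as a last resort

    long_text = "...\n\n..."  # placeholder; any large string to be chunked
    chunks = text_splitter.from_text(long_text)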

beekeeper/vector_stores/chroma.py
DELETED
@@ -1,115 +0,0 @@
import uuid
from logging import getLogger
from typing import List, Literal

from beekeeper.core.document import Document, DocumentWithScore
from beekeeper.core.embeddings import BaseEmbedding

logger = getLogger(__name__)


class ChromaVectorStore:
    """Chroma is the AI-native open-source vector database. Embeddings are stored within a ChromaDB collection.

    Args:
        embed_model (BaseEmbedding):
        collection_name (str, optional): Name of the ChromaDB collection.
        distance_strategy (str, optional): Distance strategy for similarity search. Currently supports "cosine", "ip" and "l2". Defaults to ``cosine``.

    **Example**

    .. code-block:: python

        from beekeeper.embeddings import HuggingFaceEmbedding
        from beekeeper.vector_stores import ChromaVectorStore

        embedding = HuggingFaceEmbedding()
        vector_db = ChromaVectorStore(embed_model=embedding)
    """

    def __init__(self, embed_model: BaseEmbedding,
                 collection_name: str = None,
                 distance_strategy: Literal["cosine", "ip", "l2"] = "cosine") -> None:
        try:
            import chromadb
            import chromadb.config

        except ImportError:
            raise ImportError("chromadb package not found, please install it with `pip install chromadb`")

        self._embed_model = embed_model
        self._client_settings = chromadb.config.Settings()
        self._client = chromadb.Client(self._client_settings)

        if collection_name is None:
            collection_name = "auto-generated-" + str(uuid.uuid4())[:8]
            logger.info(f"collection_name: {collection_name}")

        self._collection = self._client.get_or_create_collection(
            name=collection_name,
            embedding_function=None,
            metadata={"hnsw:space": distance_strategy}
        )

    def add_documents(self, documents: List[Document]) -> List:
        """Add documents to the ChromaDB collection.

        Args:
            documents (List[Document]): List of `Document` objects to add to the collection.
        """
        embeddings = []
        metadatas = []
        ids = []
        chroma_documents = []

        for doc in documents:
            embeddings.append(self._embed_model.get_query_embedding(doc.get_content()))
            metadatas.append(doc.get_metadata() if doc.get_metadata() else None)
            ids.append(doc.doc_id if doc.doc_id else str(uuid.uuid4()))
            chroma_documents.append(doc.get_content())

        self._collection.add(embeddings=embeddings,
                             ids=ids,
                             metadatas=metadatas,
                             documents=chroma_documents)

        return ids

    def query(self, query: str, top_k: int = 4) -> List[DocumentWithScore]:
        """Performs a similarity search for top-k most similar documents.

        Args:
            query (str): Query text.
            top_k (int, optional): Number of top results to return. Defaults to ``4``.
        """
        query_embedding = self._embed_model.get_query_embedding(query)

        results = self._collection.query(
            query_embeddings=query_embedding,
            n_results=top_k
        )

        return [
            DocumentWithScore(document=Document(
                doc_id=result[0],
                text=result[1],
                metadata=result[2]
            ), score=result[3])
            for result in zip(
                results["ids"][0],
                results["documents"][0],
                results["metadatas"][0],
                results["distances"][0],
            )
        ]

    def delete_documents(self, ids: List[str] = None) -> None:
        """Delete documents from the ChromaDB collection.

        Args:
            ids (List[str]): List of `Document` IDs to delete. Defaults to ``None``.
        """
        if not ids:
            raise ValueError("No ids provided to delete.")

        self._collection.delete(ids=ids)
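Finally, a minimal end-to-end sketch of the removed vector store, based on its own method signatures; the Document(text=..., metadata=...) constructor mirrors how the splitters above build documents, and the collection name is a placeholder:

    from beekeeper.core.document import Document
    from beekeeper.embeddings import HuggingFaceEmbedding
    from beekeeper.vector_stores import ChromaVectorStore

    embedding = HuggingFaceEmbedding()
    vector_db = ChromaVectorStore(embed_model=embedding,
                                  collection_name="demo",      # omit to get an auto-generated name
                                  distance_strategy="cosine")

    # add_documents returns the generated (or supplied) document ids.
    ids = vector_db.add_documents([Document(text="Bees pollinate crops.", metadata={"source": "notes"})])

    results = vector_db.query("What do bees do?", top_k=4)     # List[DocumentWithScore]
    vector_db.delete_documents(ids=ids)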