beekeeper-ai 0.6.6__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- beekeeper/_bundle/__init__.py +0 -0
- beekeeper_ai-1.0.1.dist-info/METADATA +46 -0
- beekeeper_ai-1.0.1.dist-info/RECORD +5 -0
- {beekeeper_ai-0.6.6.dist-info → beekeeper_ai-1.0.1.dist-info}/WHEEL +1 -1
- beekeeper_ai-1.0.1.dist-info/licenses/LICENSE +176 -0
- beekeeper/__init__.py +0 -1
- beekeeper/core/document/__init__.py +0 -6
- beekeeper/core/document/schema.py +0 -97
- beekeeper/core/document_loaders/__init__.py +0 -5
- beekeeper/core/document_loaders/base.py +0 -24
- beekeeper/core/embeddings/__init__.py +0 -6
- beekeeper/core/embeddings/base.py +0 -44
- beekeeper/core/text_splitters/utils.py +0 -142
- beekeeper/core/utils/pairwise.py +0 -20
- beekeeper/document_loaders/__init__.py +0 -17
- beekeeper/document_loaders/directory.py +0 -65
- beekeeper/document_loaders/docx.py +0 -31
- beekeeper/document_loaders/html.py +0 -77
- beekeeper/document_loaders/json.py +0 -53
- beekeeper/document_loaders/pdf.py +0 -38
- beekeeper/document_loaders/s3.py +0 -72
- beekeeper/document_loaders/watson_discovery.py +0 -121
- beekeeper/embeddings/__init__.py +0 -7
- beekeeper/embeddings/huggingface.py +0 -66
- beekeeper/embeddings/watsonx.py +0 -100
- beekeeper/evaluation/__init__.py +0 -5
- beekeeper/evaluation/knowledge_base_coverage.py +0 -62
- beekeeper/monitor/__init__.py +0 -11
- beekeeper/monitor/watsonx.py +0 -843
- beekeeper/retrievers/__init__.py +0 -5
- beekeeper/retrievers/watson_discovery.py +0 -121
- beekeeper/text_splitters/__init__.py +0 -9
- beekeeper/text_splitters/semantic.py +0 -139
- beekeeper/text_splitters/sentence.py +0 -107
- beekeeper/text_splitters/token.py +0 -101
- beekeeper/vector_stores/__init__.py +0 -7
- beekeeper/vector_stores/chroma.py +0 -115
- beekeeper/vector_stores/elasticsearch.py +0 -183
- beekeeper_ai-0.6.6.dist-info/LICENSE +0 -7
- beekeeper_ai-0.6.6.dist-info/METADATA +0 -49
- beekeeper_ai-0.6.6.dist-info/RECORD +0 -37
beekeeper/document_loaders/docx.py
DELETED
@@ -1,31 +0,0 @@
-import os
-from pathlib import Path
-from typing import List, Optional
-
-from beekeeper.core.document import Document
-from beekeeper.core.document_loaders import BaseLoader
-
-
-class DocxLoader(BaseLoader):
-    """Microsoft Word (Docx) loader."""
-
-    def load_data(self, input_file: str, extra_info: Optional[dict] = None) -> List[Document]:
-        """Loads data from the specified directory.
-
-        Args:
-            input_file (str): File path to load.
-        """
-        try:
-            import docx2txt  # noqa: F401
-        except ImportError:
-            raise ImportError("docx2txt package not found, please install it with `pip install docx2txt`")
-
-        if not os.path.isfile(input_file):
-            raise ValueError(f"File `{input_file}` does not exist")
-
-        input_file = str(Path(input_file).resolve())
-
-        text = docx2txt.process(input_file)
-        metadata = {"source": input_file}
-
-        return [Document(text=text, metadata=metadata)]
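
For context, a minimal sketch of how the removed loader was used (assuming a 0.6.6 install and that `DocxLoader` was re-exported from `beekeeper.document_loaders`, as the sibling loaders below are; the file path is a placeholder):

    from beekeeper.document_loaders import DocxLoader

    # load_data() returns a one-element list of Document objects,
    # with the resolved file path recorded under metadata["source"].
    docs = DocxLoader().load_data(input_file="report.docx")  # placeholder path
    print(docs[0].metadata["source"])
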
beekeeper/document_loaders/html.py
DELETED
@@ -1,77 +0,0 @@
-import os
-from pathlib import Path
-from typing import List, Optional
-
-from beekeeper.core.document import Document
-from beekeeper.core.document_loaders import BaseLoader
-
-
-class HTMLLoader(BaseLoader):
-    """Load HTML file and extract text from a specific tag.
-
-    Args:
-        tag (str): HTML tag to extract. Defaults to ``section``.
-    """
-
-    tag: str = "section"
-
-    def load_data(self, input_file: str, extra_info: Optional[dict] = None) -> List[Document]:
-        """Loads data from the specified directory.
-
-        Args:
-            input_file (str): File path to load.
-        """
-        try:
-            from bs4 import BeautifulSoup  # noqa: F401
-        except ImportError:
-            raise ImportError("beautifulsoup4 package not found, please install it with `pip install beautifulsoup4`")
-
-        if not os.path.isfile(input_file):
-            raise ValueError(f"File `{input_file}` does not exist")
-
-        input_file = str(Path(input_file).resolve())
-
-        with open(input_file, encoding="utf-8") as html_file:
-            soup = BeautifulSoup(html_file, "html.parser")
-
-        tags = soup.find_all(self.tag)
-        documents = []
-
-        for tag in tags:
-            tag_text = self._extract_text_from_tag(tag)
-
-            metadata = {
-                "tag": self.tag,
-                "source": input_file,
-            }
-
-            doc = Document(
-                text=tag_text,
-                metadata=metadata,
-            )
-
-            documents.append(doc)
-
-        return documents
-
-    def _extract_text_from_tag(self, tag) -> str:
-        """Extract the text from an HTML tag, ignoring other nested tags."""
-        try:
-            from bs4 import NavigableString  # noqa: F401
-        except ImportError:
-            raise ImportError("beautifulsoup4 package not found, please install it with `pip install beautifulsoup4`")
-
-        texts = []
-
-        for elem in tag.children:
-            # Check if the element is a text node, not a tag
-            if isinstance(elem, NavigableString):
-                if elem.strip():
-                    texts.append(elem.strip())
-            # Ignore any tag that matches the main tag being processed (to avoid recursion)
-            elif elem.name == self.tag:
-                continue
-            else:
-                texts.append(elem.get_text().strip())
-
-        return "\n".join(texts)
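
A hedged sketch for the removed `HTMLLoader` (assuming, as the field-style declaration suggests, that `BaseLoader` accepts field overrides as constructor keyword arguments; the path is a placeholder):

    from beekeeper.document_loaders import HTMLLoader

    # One Document is produced per matched tag, so the `tag` field
    # controls the granularity of the split (default is "section").
    loader = HTMLLoader(tag="article")
    docs = loader.load_data(input_file="page.html")
    for doc in docs:
        print(doc.metadata["tag"], doc.get_content()[:80])
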
beekeeper/document_loaders/json.py
DELETED
@@ -1,53 +0,0 @@
-import json
-import os
-from pathlib import Path
-from typing import List, Optional
-
-from beekeeper.core.document import Document
-from beekeeper.core.document_loaders import BaseLoader
-
-
-class JSONLoader(BaseLoader):
-    """JSON loader.
-
-    Args:
-        jq_schema (str, optional): jq schema to use to extract the data from the JSON.
-    """
-
-    jq_schema: Optional[str] = None
-
-    def load_data(self, input_file: str, extra_info: Optional[dict] = None) -> List[Document]:
-        """Loads data from the specified directory.
-
-        Args:
-            input_file (str): File path to load.
-        """
-        try:
-            import jq  # noqa: F401
-        except ImportError:
-            raise ImportError("jq package not found, please install it with `pip install jq`")
-
-        if not os.path.isfile(input_file):
-            raise ValueError(f"File `{input_file}` does not exist")
-
-        documents = []
-        jq_compiler = jq.compile(self.jq_schema)
-        json_file = Path(input_file).resolve().read_text(encoding="utf-8")
-        json_data = jq_compiler.input(json.loads(json_file))
-
-
-        for content in json_data:
-
-            if isinstance(content, str):
-                content = content
-            elif isinstance(content, dict):
-                content = json.dumps(content) if content else ""
-            else:
-                content = str(content) if content is not None else ""
-
-            if content.strip() != "":
-                documents.append(Document(
-                    text=content,
-                    metadata={"source": str(Path(input_file).resolve())}))
-
-        return documents
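
Similarly for the removed `JSONLoader`; note that `jq_schema` defaults to `None` but `jq.compile(None)` would fail, so a schema is effectively required (same constructor assumption as above, placeholder path):

    from beekeeper.document_loaders import JSONLoader

    # Emit one Document per element selected by the jq expression;
    # dicts are serialized with json.dumps, other values with str().
    loader = JSONLoader(jq_schema=".items[]")
    docs = loader.load_data(input_file="data.json")
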
beekeeper/document_loaders/pdf.py
DELETED
@@ -1,38 +0,0 @@
-import logging
-import os
-from pathlib import Path
-from typing import List, Optional
-
-from beekeeper.core.document import Document
-from beekeeper.core.document_loaders import BaseLoader
-
-logging.getLogger("pypdf").setLevel(logging.ERROR)
-
-class PDFLoader(BaseLoader):
-    """PDF loader using PyPDF."""
-
-    def load_data(self, input_file: str, extra_info: Optional[dict] = None) -> List[Document]:
-        """Loads data from the specified directory.
-
-        Args:
-            input_file (str): File path to load.
-        """
-        try:
-            import pypdf  # noqa: F401
-
-        except ImportError:
-            raise ImportError("pypdf package not found, please install it with `pip install pypdf`")
-
-        if not os.path.isfile(input_file):
-            raise ValueError(f"File `{input_file}` does not exist")
-
-        input_file = str(Path(input_file).resolve())
-        pdf_loader = pypdf.PdfReader(input_file)
-
-        return [
-            Document(
-                text=page.extract_text().strip(),
-                metadata={"source": input_file, "page": page_number}
-            )
-            for page_number, page in enumerate(pdf_loader.pages)
-        ]
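
The removed `PDFLoader` emits one `Document` per page; a short sketch (placeholder path):

    from beekeeper.document_loaders import PDFLoader

    # Page numbers come from enumerate(), so metadata["page"] is zero-based.
    docs = PDFLoader().load_data(input_file="paper.pdf")
    print(docs[0].metadata["page"], docs[0].metadata["source"])
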
beekeeper/document_loaders/s3.py
DELETED
@@ -1,72 +0,0 @@
-import os
-import re
-import tempfile
-from typing import List, Optional
-
-from beekeeper.core.document import Document
-from beekeeper.core.document_loaders import BaseLoader
-from beekeeper.document_loaders import DirectoryLoader
-
-
-class S3Loader(BaseLoader):
-    """S3 bucket loader.
-
-    Args:
-        bucket (str): Name of the S3 bucket.
-        ibm_api_key_id (str): IBM Cloud API key.
-        ibm_service_instance_id (str): Service instance ID for the IBM COS.
-        s3_endpoint_url (str): Endpoint URL for the S3 service.
-
-    **Example**
-
-    .. code-block:: python
-
-        from beekeeper.document_loaders import S3Loader
-
-        loader = S3Loader(bucket="your_bucket",
-                          ibm_api_key_id="your_api_key",
-                          ibm_service_instance_id="your_instance_id",
-                          s3_endpoint_url="your_api_url")
-    """
-
-    def __init__(self, bucket: str,
-                 ibm_api_key_id: str = None,
-                 ibm_service_instance_id: str = None,
-                 s3_endpoint_url: str = None
-                 ):
-
-        try:
-            import ibm_boto3
-            from ibm_botocore.client import Config
-
-            self._ibm_boto3 = ibm_boto3
-            self._boto_config = Config
-        except ImportError:
-            raise ImportError("ibm-cos-sdk package not found, please install it with `pip install ibm-cos-sdk`")
-
-        self.bucket = bucket
-        self.ibm_api_key_id = ibm_api_key_id
-        self.ibm_service_instance_id = ibm_service_instance_id
-        self.s3_endpoint_url = s3_endpoint_url
-
-    def load_data(self, extra_info: Optional[dict] = None) -> List[Document]:
-        """Loads data from the specified S3 bucket."""
-        ibm_s3 = self._ibm_boto3.resource(
-            "s3",
-            ibm_api_key_id=self.ibm_api_key_id,
-            ibm_service_instance_id=self.ibm_service_instance_id,
-            config=self._boto_config(signature_version="oauth"),
-            endpoint_url=self.s3_endpoint_url,
-        )
-
-        bucket = ibm_s3.Bucket(self.bucket)
-
-        with tempfile.TemporaryDirectory() as temp_dir:
-            for obj in bucket.objects.filter(Prefix=""):
-                file_path = f"{temp_dir}/{obj.key}"
-                os.makedirs(os.path.dirname(file_path), exist_ok=True)
-                ibm_s3.meta.client.download_file(self.bucket, obj.key, file_path)
-
-            s3_source = re.sub(r"^(https?)://", "", self.s3_endpoint_url)
-
-            return DirectoryLoader(input_dir=temp_dir).load_data(extra_info={"source": f"{s3_source}/{self.bucket}"})
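
The docstring example above stops at construction; retrieval is a separate call. Continuing that sketch:

    # Downloads every object in the bucket to a temporary directory and
    # delegates parsing to DirectoryLoader; each Document's "source"
    # metadata becomes "<endpoint-host>/<bucket>".
    docs = loader.load_data()
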
beekeeper/document_loaders/watson_discovery.py
DELETED
@@ -1,121 +0,0 @@
-from datetime import datetime
-from logging import getLogger
-from typing import List, Optional
-
-from beekeeper.core.document import Document
-from beekeeper.core.document_loaders import BaseLoader
-
-logger = getLogger(__name__)
-
-
-class WatsonDiscoveryLoader(BaseLoader):
-    """Provides functionality to load documents from IBM Watson Discovery.
-
-    See https://cloud.ibm.com/docs/discovery-data?topic=discovery-data-getting-started for more info.
-
-    Args:
-        url (str): Watson Discovery instance url.
-        api_key (str): Watson Discovery API key.
-        project_id (str): Watson Discovery project_id.
-        version (str, optional): Watson Discovery API version. Defaults to ``2023-03-31``.
-        batch_size (int, optional): Batch size for bulk operations. Defaults to ``50``.
-        created_date (str, optional): Load documents created after the date. Expected format ``YYYY-MM-DD``. Defaults to ``datetime.today()``.
-        pre_additional_data_field (str, optional): Additional data field to be added to the beginning of the Document content. Defaults to ``None``.
-
-    **Example**
-
-    .. code-block:: python
-
-        from beekeeper.document_loaders import WatsonDiscoveryLoader
-
-        doc_loader = WatsonDiscoveryLoader(url="your_url",
-                                           api_key="your_api_key",
-                                           project_id="your_project_id")
-    """
-
-    def __init__(self,
-                 url: str,
-                 api_key: str,
-                 project_id: str,
-                 version: str = "2023-03-31",
-                 batch_size: int = 50,
-                 created_date: str = datetime.today().strftime("%Y-%m-%d"),
-                 pre_additional_data_field: str = None
-                 ) -> None:
-        try:
-            from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
-            from ibm_watson import DiscoveryV2
-
-        except ImportError:
-            raise ImportError("ibm-watson package not found, please install it with `pip install ibm-watson`")
-
-        self.project_id = project_id
-        self.batch_size = batch_size
-        self.created_date = created_date
-        self.pre_additional_data_field = pre_additional_data_field
-
-        try:
-            authenticator = IAMAuthenticator(api_key)
-            self._client = DiscoveryV2(authenticator=authenticator,
-                                       version=version)
-
-            self._client.set_service_url(url)
-        except Exception as e:
-            logger.error(f"Error connecting to IBM Watson Discovery: {e}")
-            raise
-
-    def load_data(self, extra_info: Optional[dict] = None) -> List[Document]:
-        """Loads documents from the Watson Discovery.
-
-        **Example**
-
-        .. code-block:: python
-
-            docs = doc_loader.load_data()
-        """
-        from ibm_watson.discovery_v2 import QueryLargePassages
-        last_batch_size = self.batch_size
-        offset_len = 0
-        documents = []
-        return_fields = ["extracted_metadata.filename", "extracted_metadata.file_type", "text"]
-
-        if self.pre_additional_data_field:
-            return_fields.append(self.pre_additional_data_field)
-
-        while last_batch_size == self.batch_size:
-            results = self._client.query(
-                project_id=self.project_id,
-                count=self.batch_size,
-                offset=offset_len,
-                return_=return_fields,
-                filter="extracted_metadata.publicationdate>={}".format(self.created_date),
-                passages=QueryLargePassages(enabled=False)).get_result()
-
-            last_batch_size = len(results["results"])
-            offset_len = offset_len + last_batch_size
-
-            # Make sure all retrieved document 'text' exist
-            results_documents = [doc for doc in results["results"] if "text" in doc]
-
-            if self.pre_additional_data_field:
-                for i, doc in enumerate(results_documents):
-                    doc["text"].insert(0, self._get_nested_value(doc, self.pre_additional_data_field))
-
-            documents.extend([Document(doc_id=doc["document_id"],
-                                       text="\n".join(doc["text"]),
-                                       metadata={"collection_id": doc["result_metadata"]["collection_id"]} | doc[
-                                           "extracted_metadata"])
-                              for doc in results_documents])
-
-        return documents
-
-    @staticmethod
-    def _get_nested_value(d, key_path, separator: Optional[str] = "."):
-        """Accesses a nested value in a dictionary using a string key path."""
-        keys = key_path.split(separator)  # Split the key_path using the separator
-        nested_value = d
-
-        for key in keys:
-            nested_value = nested_value[key]  # Traverse the dictionary by each key
-
-        return nested_value
beekeeper/embeddings/__init__.py
DELETED

beekeeper/embeddings/huggingface.py
DELETED
@@ -1,66 +0,0 @@
-from typing import Any, List, Literal
-
-from pydantic.v1 import BaseModel, PrivateAttr
-
-from beekeeper.core.document import Document
-from beekeeper.core.embeddings import BaseEmbedding, Embedding
-
-
-class HuggingFaceEmbedding(BaseModel, BaseEmbedding):
-    """HuggingFace sentence_transformers embedding models.
-
-    Args:
-        model_name (str): Hugging Face model to be used. Defaults to ``sentence-transformers/all-MiniLM-L6-v2``.
-        device (str, optional): Device to run the model on. Currently supports "cpu" and "cuda". Defaults to ``cpu``.
-
-    **Example**
-
-    .. code-block:: python
-
-        from beekeeper.embeddings import HuggingFaceEmbedding
-
-        embedding = HuggingFaceEmbedding()
-    """
-
-    model_name: str = "sentence-transformers/all-MiniLM-L6-v2"
-    device: Literal["cpu", "cuda"] = "cpu"
-
-    _client: Any = PrivateAttr()
-
-    def __init__(self, **kwargs: Any) -> None:
-        super().__init__(**kwargs)
-        from sentence_transformers import SentenceTransformer
-
-        self._client = SentenceTransformer(self.model_name, device=self.device)
-
-    def get_query_embedding(self, query: str) -> Embedding:
-        """Compute embedding for a text.
-
-        Args:
-            query (str): Input query to compute embedding.
-
-        **Example**
-
-        .. code-block:: python
-
-            embedded_query = embedding.get_query_embedding("Beekeeper is a data framework to load any data in one line of code and connect with AI applications.")
-        """
-        return self.get_texts_embedding([query])[0]
-
-    def get_texts_embedding(self, texts: List[str]) -> List[Embedding]:
-        """Compute embeddings for list of texts.
-
-        Args:
-            texts (List[str]): List of text to compute embeddings.
-        """
-        return self._client.encode(texts).tolist()
-
-    def get_documents_embedding(self, documents: List[Document]) -> List[Embedding]:
-        """Compute embeddings for a list of documents.
-
-        Args:
-            documents (List[Document]): List of `Document` objects to compute embeddings.
-        """
-        texts = [document.get_content() for document in documents]
-
-        return self.get_texts_embedding(texts)
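
Beyond the query example in the docstring, the class embeds `Document` objects directly. A sketch, assuming `Document` can be built from `text` alone (the loaders above always pass `metadata` as well):

    from beekeeper.core.document import Document
    from beekeeper.embeddings import HuggingFaceEmbedding

    embedding = HuggingFaceEmbedding()  # all-MiniLM-L6-v2 on CPU by default
    docs = [Document(text="Bees pollinate most flowering crops."),
            Document(text="A healthy hive has a single queen.")]
    # get_documents_embedding() calls get_content() on each Document and
    # encodes the resulting texts in one batch.
    vectors = embedding.get_documents_embedding(docs)
    print(len(vectors), len(vectors[0]))  # 2 vectors, 384 dims for MiniLM-L6-v2
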
beekeeper/embeddings/watsonx.py
DELETED
@@ -1,100 +0,0 @@
-from typing import Any, List, Optional
-
-from pydantic.v1 import BaseModel, PrivateAttr
-
-from beekeeper.core.document import Document
-from beekeeper.core.embeddings import BaseEmbedding, Embedding
-
-
-class WatsonxEmbedding(BaseModel, BaseEmbedding):
-    """IBM watsonx embedding models.
-
-    Note:
-        One of these parameters is required: ``project_id`` or ``space_id``. Not both.
-
-    See https://cloud.ibm.com/apidocs/watsonx-ai#endpoint-url for the watsonx.ai API endpoints.
-
-    Args:
-        model_name (str): IBM watsonx.ai model to be used. Defaults to ``ibm/slate-30m-english-rtrvr``.
-        api_key (str): watsonx API key.
-        url (str): watsonx instance url.
-        truncate_input_tokens (str): Maximum number of input tokens accepted. Defaults to ``512``
-        project_id (str, optional): watsonx project_id.
-        space_id (str, optional): watsonx space_id.
-
-    **Example**
-
-    .. code-block:: python
-
-        from beekeeper.embeddings import WatsonxEmbedding
-
-        watsonx_embedding = WatsonxEmbedding(api_key="your_api_key",
-                                             url="your_instance_url",
-                                             project_id="your_project_id")
-    """
-
-    model_name: str = "ibm/slate-30m-english-rtrvr"
-    api_key: str
-    url: str
-    truncate_input_tokens: int = 512
-    project_id: Optional[str] = None
-    space_id: Optional[str] = None
-
-    _client: Any = PrivateAttr()
-
-    def __init__(self, **kwargs: Any) -> None:
-        super().__init__(**kwargs)
-        try:
-            from ibm_watsonx_ai import Credentials
-            from ibm_watsonx_ai.foundation_models import Embeddings as WatsonxEmbeddings
-
-        except ImportError:
-            raise ImportError("ibm-watsonx-ai package not found, please install it with `pip install ibm-watsonx-ai`")
-
-        if (not (self.project_id or self.space_id)) or (self.project_id and self.space_id):
-            raise ValueError("Must provide one of these parameters [`project_id`, `space_id`], not both.")
-
-        kwargs_params = {
-            "model_id": self.model_name,
-            "params": {"truncate_input_tokens": self.truncate_input_tokens, "return_options": {"input_text": False}},
-            "credentials": Credentials(api_key=self.api_key, url=self.url)
-        }
-
-        if self.project_id:
-            kwargs_params["project_id"] = self.project_id
-        else:
-            kwargs_params["space_id"] = self.space_id
-
-        self._client = WatsonxEmbeddings(**kwargs_params)
-
-    def get_query_embedding(self, query: str) -> Embedding:
-        """Compute embedding for a text.
-
-        Args:
-            query (str): Input query to compute embedding.
-
-        **Example**
-
-        .. code-block:: python
-
-            embedded_query = watsonx_embedding.get_query_embedding("Beekeeper is a data framework to load any data in one line of code and connect with AI applications.")
-        """
-        return self.get_texts_embedding([query])[0]
-
-    def get_texts_embedding(self, texts: List[str]) -> List[Embedding]:
-        """Compute embeddings for list of texts.
-
-        Args:
-            texts (List[str]): List of text to compute embeddings.
-        """
-        return self._client.embed_documents(texts)
-
-    def get_documents_embedding(self, documents: List[Document]) -> List[Embedding]:
-        """Compute embeddings for a list of documents.
-
-        Args:
-            documents (List[Document]): List of `Document` objects to compute embeddings.
-        """
-        texts = [document.get_content() for document in documents]
-
-        return self.get_texts_embedding(texts)
beekeeper/evaluation/__init__.py
DELETED

beekeeper/evaluation/knowledge_base_coverage.py
DELETED
@@ -1,62 +0,0 @@
-from typing import Dict, List, Literal
-
-import numpy as np
-from pydantic.v1 import BaseModel
-
-from beekeeper.core.embeddings import BaseEmbedding
-
-
-class KnowledgeBaseCoverage(BaseModel):
-    """Measures how much the knowledge base (context) has contributed to the answer’s coverage.
-    Higher value suggests greater proportion of context are in LLM response.
-
-    Args:
-        embed_model (BaseEmbedding):
-        similarity_mode (str, optional): Similarity strategy. Currently supports "cosine", "dot_product" and "euclidean". Defaults to ``cosine``.
-        similarity_threshold (float, optional): Embedding similarity threshold for "passing". Defaults to ``0.8``.
-
-    **Example**
-
-    .. code-block:: python
-
-        from beekeeper.embeddings import HuggingFaceEmbedding
-        from beekeeper.evaluation import KnowledgeBaseCoverage
-
-        embedding = HuggingFaceEmbedding()
-        eval_coverage = KnowledgeBaseCoverage(embed_model=embedding)
-    """
-
-    embed_model: BaseEmbedding
-    similarity_mode: Literal["cosine", "dot_product", "euclidean"] = "cosine"
-    similarity_threshold: float = 0.8
-
-    class Config:
-        arbitrary_types_allowed = True
-
-    def compute_metric(self, contexts: List[str], candidate: str) -> Dict:
-        """
-        Args:
-            contexts (List[str]): List text used as LLM context.
-            candidate (str): LLM response based on given context.
-
-        **Example**
-
-        .. code-block:: python
-
-            context_coverage = eval_coverage.compute_metric(context=[], candidate="<candidate>")
-        """
-        if not contexts or not candidate:
-            raise ValueError("Must provide these parameters [`contexts`, `candidate`]")
-
-        coverage = {"contexts_score": [], "score": 0}
-        candidate_embedding = self.embed_model.get_query_embedding(candidate)
-
-        for context in contexts:
-            context_embedding = self.embed_model.get_query_embedding(context)
-            coverage["contexts_score"].append(
-                self.embed_model.similarity(candidate_embedding, context_embedding, mode=self.similarity_mode))
-
-        coverage["score"] = np.mean(coverage["contexts_score"])
-        coverage["passing"] = coverage["score"] >= self.similarity_threshold
-
-        return coverage
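
For reference, a corrected usage sketch: the deleted docstring example passes `context=[]`, but the parameter is named `contexts` and both arguments must be non-empty or `compute_metric` raises `ValueError`:

    from beekeeper.embeddings import HuggingFaceEmbedding
    from beekeeper.evaluation import KnowledgeBaseCoverage

    eval_coverage = KnowledgeBaseCoverage(embed_model=HuggingFaceEmbedding())
    result = eval_coverage.compute_metric(
        contexts=["Honey bees communicate food locations through the waggle dance."],
        candidate="Bees share where food is by performing the waggle dance.",
    )
    # "score" is the mean embedding similarity across contexts;
    # "passing" compares it against similarity_threshold (default 0.8).
    print(result["score"], result["passing"])
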
beekeeper/monitor/__init__.py
DELETED