beekeeper-core 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- beekeeper_core-1.0.0/.gitignore +86 -0
- beekeeper_core-1.0.0/PKG-INFO +30 -0
- beekeeper_core-1.0.0/README.md +9 -0
- beekeeper_core-1.0.0/beekeeper/core/__init__.py +0 -0
- beekeeper_core-1.0.0/beekeeper/core/document/__init__.py +3 -0
- beekeeper_core-1.0.0/beekeeper/core/document/base.py +103 -0
- beekeeper_core-1.0.0/beekeeper/core/embeddings/__init__.py +7 -0
- beekeeper_core-1.0.0/beekeeper/core/embeddings/base.py +66 -0
- beekeeper_core-1.0.0/beekeeper/core/evaluation/__init__.py +5 -0
- beekeeper_core-1.0.0/beekeeper/core/evaluation/context_similarity.py +73 -0
- beekeeper_core-1.0.0/beekeeper/core/flows/__init__.py +3 -0
- beekeeper_core-1.0.0/beekeeper/core/flows/ingestion_flow.py +178 -0
- beekeeper_core-1.0.0/beekeeper/core/llms/__init__.py +11 -0
- beekeeper_core-1.0.0/beekeeper/core/llms/base.py +35 -0
- beekeeper_core-1.0.0/beekeeper/core/llms/decorators.py +83 -0
- beekeeper_core-1.0.0/beekeeper/core/llms/types.py +33 -0
- beekeeper_core-1.0.0/beekeeper/core/observers/__init__.py +3 -0
- beekeeper_core-1.0.0/beekeeper/core/observers/base.py +36 -0
- beekeeper_core-1.0.0/beekeeper/core/observers/types.py +11 -0
- beekeeper_core-1.0.0/beekeeper/core/prompts/__init__.py +3 -0
- beekeeper_core-1.0.0/beekeeper/core/prompts/base.py +30 -0
- beekeeper_core-1.0.0/beekeeper/core/prompts/utils.py +44 -0
- beekeeper_core-1.0.0/beekeeper/core/readers/__init__.py +4 -0
- beekeeper_core-1.0.0/beekeeper/core/readers/base.py +23 -0
- beekeeper_core-1.0.0/beekeeper/core/readers/directory.py +90 -0
- beekeeper_core-1.0.0/beekeeper/core/schema.py +10 -0
- beekeeper_core-1.0.0/beekeeper/core/text_chunkers/__init__.py +11 -0
- beekeeper_core-1.0.0/beekeeper/core/text_chunkers/base.py +24 -0
- beekeeper_core-1.0.0/beekeeper/core/text_chunkers/semantic.py +164 -0
- beekeeper_core-1.0.0/beekeeper/core/text_chunkers/sentence.py +133 -0
- beekeeper_core-1.0.0/beekeeper/core/text_chunkers/token.py +128 -0
- beekeeper_core-1.0.0/beekeeper/core/text_chunkers/utils.py +149 -0
- beekeeper_core-1.0.0/beekeeper/core/tools/__init__.py +5 -0
- beekeeper_core-1.0.0/beekeeper/core/tools/base.py +39 -0
- beekeeper_core-1.0.0/beekeeper/core/utils/pairwise.py +21 -0
- beekeeper_core-1.0.0/beekeeper/core/vector_stores/__init__.py +3 -0
- beekeeper_core-1.0.0/beekeeper/core/vector_stores/base.py +38 -0
- beekeeper_core-1.0.0/pyproject.toml +39 -0
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
#IDE
|
|
10
|
+
.DS_Store
|
|
11
|
+
.idea
|
|
12
|
+
.vscode
|
|
13
|
+
|
|
14
|
+
# Distribution / packaging
|
|
15
|
+
.Python
|
|
16
|
+
build/
|
|
17
|
+
develop-eggs/
|
|
18
|
+
dist/
|
|
19
|
+
downloads/
|
|
20
|
+
eggs/
|
|
21
|
+
.eggs/
|
|
22
|
+
lib/
|
|
23
|
+
lib64/
|
|
24
|
+
parts/
|
|
25
|
+
sdist/
|
|
26
|
+
var/
|
|
27
|
+
wheels/
|
|
28
|
+
share/python-wheels/
|
|
29
|
+
*.egg-info/
|
|
30
|
+
.installed.cfg
|
|
31
|
+
*.egg
|
|
32
|
+
MANIFEST
|
|
33
|
+
|
|
34
|
+
# PyInstaller
|
|
35
|
+
# Usually these files are written by a python script from a template
|
|
36
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
37
|
+
*.manifest
|
|
38
|
+
*.spec
|
|
39
|
+
|
|
40
|
+
# Installer logs
|
|
41
|
+
pip-log.txt
|
|
42
|
+
pip-delete-this-directory.txt
|
|
43
|
+
|
|
44
|
+
# Unit test / coverage reports
|
|
45
|
+
htmlcov/
|
|
46
|
+
.tox/
|
|
47
|
+
.nox/
|
|
48
|
+
.coverage
|
|
49
|
+
.coverage.*
|
|
50
|
+
.cache
|
|
51
|
+
nosetests.xml
|
|
52
|
+
coverage.xml
|
|
53
|
+
*.cover
|
|
54
|
+
*.py,cover
|
|
55
|
+
.hypothesis/
|
|
56
|
+
.pytest_cache/
|
|
57
|
+
cover/
|
|
58
|
+
|
|
59
|
+
# Sphinx documentation
|
|
60
|
+
docs/_build/
|
|
61
|
+
docs/sphinx-pages/
|
|
62
|
+
sphinx-pages/
|
|
63
|
+
|
|
64
|
+
# Ruff
|
|
65
|
+
.ruff_cache/
|
|
66
|
+
|
|
67
|
+
# PyBuilder
|
|
68
|
+
.pybuilder/
|
|
69
|
+
target/
|
|
70
|
+
|
|
71
|
+
# Jupyter Notebook
|
|
72
|
+
.ipynb_checkpoints
|
|
73
|
+
|
|
74
|
+
# pyenv
|
|
75
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
76
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
77
|
+
# .python-version
|
|
78
|
+
|
|
79
|
+
# Environments
|
|
80
|
+
.env
|
|
81
|
+
.venv
|
|
82
|
+
env/
|
|
83
|
+
venv/
|
|
84
|
+
ENV/
|
|
85
|
+
env.bak/
|
|
86
|
+
venv.bak/
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: beekeeper-core
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Load any data in one line of code and connect with AI applications
|
|
5
|
+
Project-URL: Repository, https://github.com/leonardofurnielis/beekeeper
|
|
6
|
+
Author-email: Leonardo Furnielis <leonardofurnielis@outlook.com>
|
|
7
|
+
License: Apache-2.0
|
|
8
|
+
Keywords: AI,LLM,QA,RAG,data,observability,retrieval,semantic-search
|
|
9
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
10
|
+
Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
|
|
11
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
12
|
+
Requires-Python: <4.0,>=3.10
|
|
13
|
+
Requires-Dist: deprecated<2.0.0,>=1.2.18
|
|
14
|
+
Requires-Dist: nltk<4.0.0,>=3.9.1
|
|
15
|
+
Requires-Dist: numpy<1.27.0,>=1.26.4
|
|
16
|
+
Requires-Dist: pydantic<3.0.0,>=2.11.5
|
|
17
|
+
Requires-Dist: tiktoken<0.10.0,>=0.9.0
|
|
18
|
+
Provides-Extra: dev
|
|
19
|
+
Requires-Dist: ruff>=0.11.13; extra == 'dev'
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
|
|
22
|
+
# Beekeeper Core
|
|
23
|
+
|
|
24
|
+
This is the primary Python package for the Beekeeper. It provides essential classes and abstractions that serve as the backbone for applications built with LLMs, particularly Retrieval-Augmented Generation (RAG). These foundational components include interfaces for LLMs, vector databases, embeddings, storage systems, callable functions, and more.
|
|
25
|
+
|
|
26
|
+
## Installation
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install beekeeper-core
|
|
30
|
+
```
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# Beekeeper Core
|
|
2
|
+
|
|
3
|
+
This is the primary Python package for the Beekeeper. It provides essential classes and abstractions that serve as the backbone for applications built with LLMs, particularly Retrieval-Augmented Generation (RAG). These foundational components include interfaces for LLMs, vector databases, embeddings, storage systems, callable functions, and more.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install beekeeper-core
|
|
9
|
+
```
|
|
File without changes
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
import uuid
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
from hashlib import sha256
|
|
4
|
+
from typing import Any, Dict, List, Optional, Union
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
from pydantic.v1 import BaseModel, Field, validator
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class BaseDocument(ABC, BaseModel):
    """Generic abstract interface for retrievable documents.

    Attributes:
        id_: Unique ID of the document (a random UUID4 string by default).
        metadata: A flat dictionary of metadata fields; ``None`` is coerced to ``{}``.
        embedding: Optional embedding vector of the document (``None`` until computed).
    """

    id_: str = Field(
        default_factory=lambda: str(uuid.uuid4()),
        description="Unique ID of the document.",
    )
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="A flat dictionary of metadata fields.",
    )
    embedding: Optional[Union[List[float], np.ndarray]] = Field(
        # Fix: was `default_factory=None`, which is a no-op; `default=None`
        # states the intent explicitly — the embedding is absent until computed.
        default=None,
        description="Embedding of the document.",
    )

    class Config:
        # Allow non-pydantic field types such as `np.ndarray`.
        arbitrary_types_allowed = True

    @validator("metadata", pre=True)
    def _validate_metadata(cls, v) -> Dict:
        # Normalize `metadata=None` to an empty dict so callers can always
        # treat it as a mapping.
        if v is None:
            return {}
        return v

    @abstractmethod
    def get_content(self) -> str:
        """Get document content."""

    def get_metadata(self) -> dict:
        """Get metadata."""
        return self.metadata

    def get_embedding(self) -> Optional[Union[List[float], np.ndarray]]:
        """Get the document embedding (``None`` if it has not been computed)."""
        return self.embedding

    @property
    @abstractmethod
    def hash(self) -> str:
        """Get hash."""
+
|
|
52
|
+
|
|
53
|
+
class Document(BaseDocument):
    """Generic interface for data document."""

    text: str = Field(default="", description="Text content of the document.")

    @classmethod
    def class_name(cls) -> str:
        return "Document"

    def get_content(self) -> str:
        """Get the text content."""
        return self.text

    @property
    def hash(self) -> str:
        """Get document hash."""
        # Hash the raw text; "surrogatepass" keeps lone surrogates encodable.
        encoded_text = str(self.text).encode("utf-8", "surrogatepass")
        return str(sha256(encoded_text).hexdigest())
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class DocumentWithScore(BaseModel):
    """A document paired with an optional relevance score.

    Attributes:
        document: The wrapped document.
        score: Optional score associated with the document.
    """

    document: BaseDocument
    score: Optional[float] = None

    @classmethod
    def class_name(cls) -> str:
        return "DocumentWithScore"

    def get_score(self) -> float:
        """Get score (``0.0`` when no score was set)."""
        if self.score is None:
            return 0.0
        else:
            return self.score

    # #### pass through methods to BaseDocument ####
    @property
    def id_(self) -> str:
        return self.document.id_

    @property
    def text(self) -> str:
        # Only `Document` is known to carry a `text` field.
        if isinstance(self.document, Document):
            return self.document.text
        else:
            raise ValueError("Must be a Document to get text.")

    def get_content(self) -> str:
        return self.document.get_content()

    def get_metadata(self) -> dict:
        # Fix: return annotation was `-> str`, but this forwards
        # `BaseDocument.get_metadata()`, which returns a dict.
        return self.document.get_metadata()
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from enum import Enum
|
|
3
|
+
from typing import List
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
from beekeeper.core.document import Document
|
|
7
|
+
from beekeeper.core.schema import TransformerComponent
|
|
8
|
+
from beekeeper.core.utils.pairwise import cosine_similarity
|
|
9
|
+
|
|
10
|
+
Embedding = List[float]


class SimilarityMode(str, Enum):
    """Modes for similarity."""

    COSINE = "cosine"
    DOT_PRODUCT = "dot_product"
    EUCLIDEAN = "euclidean"


def similarity(
    embedding1: Embedding,
    embedding2: Embedding,
    mode: SimilarityMode = SimilarityMode.COSINE,
) -> float:
    """Get embedding similarity.

    Args:
        embedding1: First embedding vector.
        embedding2: Second embedding vector.
        mode: Similarity strategy to use. Defaults to ``SimilarityMode.COSINE``.

    Returns:
        float: Similarity score. The Euclidean distance is negated so that a
        larger value means "more similar" in every mode.
    """
    if mode == SimilarityMode.EUCLIDEAN:
        return -float(np.linalg.norm(np.array(embedding1) - np.array(embedding2)))

    elif mode == SimilarityMode.DOT_PRODUCT:
        # Fix: cast the NumPy scalar to a built-in float so every branch has a
        # consistent return type (the EUCLIDEAN branch already casts).
        return float(np.dot(embedding1, embedding2))

    else:
        return cosine_similarity(embedding1, embedding2)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class BaseEmbedding(TransformerComponent, ABC):
    """An interface for embedding models.

    Subclasses implement the three abstract ``get_*_embedding`` methods.
    Instances are also transformer components: calling the instance embeds a
    list of documents (see ``__call__``).
    """

    @classmethod
    def class_name(cls) -> str:
        return "BaseEmbedding"

    @abstractmethod
    def get_text_embedding(self, query: str) -> Embedding:
        """Get the embedding for a single piece of text (query)."""

    @abstractmethod
    def get_texts_embedding(self, texts: List[str]) -> List[Embedding]:
        """Get embeddings for a batch of texts, one embedding per input string."""

    @abstractmethod
    def get_documents_embedding(self, documents: List[Document]) -> List[Document]:
        """Embed the given documents and return the resulting document list."""

    @staticmethod
    def similarity(
        embedding1: Embedding,
        embedding2: Embedding,
        mode: SimilarityMode = SimilarityMode.COSINE,
    ):
        """Get embedding similarity (delegates to the module-level ``similarity``)."""
        return similarity(embedding1, embedding2, mode)

    def __call__(self, documents: List[Document]) -> List[Document]:
        # Makes the embedding model directly usable as a transformer step
        # inside an ingestion flow.
        return self.get_documents_embedding(documents)
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
from typing import Dict, List
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
from beekeeper.core.embeddings import BaseEmbedding, SimilarityMode
|
|
5
|
+
from pydantic.v1 import BaseModel
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ContextSimilarityEvaluator(BaseModel):
    """
    Measures how much the context has contributed to the answer.
    A higher value suggests a greater proportion of the context is present in the LLM's response.

    Args:
        embed_model (BaseEmbedding): The embedding model used to compute vector representations.
        similarity_mode (str, optional): Similarity strategy to use. Supported options are
            `"cosine"`, `"dot_product"`, and `"euclidean"`. Defaults to `"cosine"`.
        similarity_threshold (float, optional): Embedding similarity threshold for determining
            whether a context segment "passes". Defaults to `0.8`.

    Example:
        .. code-block:: python

            from beekeeper.core.evaluation import ContextSimilarityEvaluator
            from beekeeper.embeddings.huggingface import HuggingFaceEmbedding

            embedding = HuggingFaceEmbedding()
            ctx_sim_evaluator = ContextSimilarityEvaluator(embed_model=embedding)
    """

    embed_model: BaseEmbedding
    similarity_mode: SimilarityMode = SimilarityMode.COSINE
    similarity_threshold: float = 0.8

    class Config:
        arbitrary_types_allowed = True

    def evaluate(self, contexts: List[str], generated_text: str) -> Dict:
        """
        Score the similarity between each context and the generated text.

        Args:
            contexts (List[str]): List contexts used to generate LLM response.
            generated_text (str): LLM response based on given context.

        Returns:
            Dict: ``contexts_score`` (per-context similarity scores), ``score``
            (their mean) and ``passing`` (whether ``score`` meets the threshold).

        Raises:
            ValueError: If `contexts` or `generated_text` is empty.

        Example:
            .. code-block:: python

                evaluation_result = ctx_sim_evaluator.evaluate(
                    contexts=[], generated_text="<candidate>"
                )
        """
        if not contexts or not generated_text:
            raise ValueError(
                "Must provide these parameters [`contexts`, `generated_text`]",
            )

        evaluation_result = {"contexts_score": [], "score": 0}
        # Embed the candidate once; each context is embedded and compared to it.
        candidate_embedding = self.embed_model.get_text_embedding(generated_text)

        for context in contexts:
            context_embedding = self.embed_model.get_text_embedding(context)
            evaluation_result["contexts_score"].append(
                self.embed_model.similarity(
                    candidate_embedding,
                    context_embedding,
                    mode=self.similarity_mode,
                ),
            )

        # Fix: cast NumPy scalars (`np.float64`, `np.bool_`) to built-in types
        # so the result dict is plain Python and JSON-serializable.
        evaluation_result["score"] = float(np.mean(evaluation_result["contexts_score"]))
        evaluation_result["passing"] = bool(
            evaluation_result["score"] >= self.similarity_threshold
        )

        return evaluation_result
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
from enum import Enum
|
|
2
|
+
from typing import List, Optional
|
|
3
|
+
|
|
4
|
+
from beekeeper.core.document import Document
|
|
5
|
+
from beekeeper.core.readers import BaseReader
|
|
6
|
+
from beekeeper.core.schema import TransformerComponent
|
|
7
|
+
from beekeeper.core.vector_stores import BaseVectorStore
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class DocStrategy(Enum):
    """
    Document de-duplication strategies work by comparing the hashes in the vector store.
    They require a vector store to be set.
    """

    # Skip incoming documents whose hash already exists in the vector store.
    DUPLICATE_ONLY = "duplicate_only"
    # Skip duplicates AND delete stored documents whose hash is no longer
    # present in the incoming batch.
    DUPLICATE_AND_DELETE = "duplicate_and_delete"
    # Disable de-duplication entirely.
    DEDUPLICATE_OFF = "deduplicate_off"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class IngestionFlow:
    """
    An ingestion flow for processing and storing data.

    Args:
        transformers (List[TransformerComponent]): A list of transformer components applied to the input documents.
        doc_strategy (DocStrategy): The strategy used for handling document duplicates.
            Defaults to `DocStrategy.DUPLICATE_ONLY`.
        post_transformer (bool): Whether document de-duplication should be applied after transformation step.
            Defaults to `False`.
        readers (BaseReader, optional): List of readers for loading or fetching documents.
        vector_store (BaseVectorStore, optional): Vector store for saving processed documents.

    Example:
        .. code-block:: python

            from beekeeper.core.flows import IngestionFlow
            from beekeeper.core.text_chunkers import TokenTextChunker
            from beekeeper.embeddings.huggingface import HuggingFaceEmbedding

            ingestion_flow = IngestionFlow(
                transformers=[
                    TokenTextChunker(),
                    HuggingFaceEmbedding(model_name="intfloat/multilingual-e5-small"),
                ]
            )
    """

    def __init__(
        self,
        transformers: List[TransformerComponent],
        doc_strategy: DocStrategy = DocStrategy.DUPLICATE_ONLY,
        post_transformer: bool = False,
        readers: Optional[List[BaseReader]] = None,
        vector_store: Optional[BaseVectorStore] = None,
    ) -> None:
        self.doc_strategy = doc_strategy
        self.post_transformer = post_transformer
        self.transformers = transformers
        self.readers = readers
        self.vector_store = vector_store

    def _read_documents(self, documents: Optional[List[Document]]) -> List[Document]:
        """Collect input documents from the given list plus any configured readers."""
        input_documents = []

        if documents is not None:
            input_documents.extend(documents)

        if self.readers is not None:
            for reader in self.readers:
                input_documents.extend(reader.load_data())

        return input_documents

    def _handle_duplicates(self, documents: List[Document]) -> List[Document]:
        """Filter out documents already present in the vector store.

        Requires ``self.vector_store`` to be set (callers guard for this).
        Under ``DUPLICATE_AND_DELETE``, also removes stored documents that are
        absent from the incoming batch.
        """
        ids, existing_hashes, existing_ref_hashes = (
            self.vector_store.get_all_document_hashes()
        )

        if self.post_transformer:
            # Use own document hash (chunks level) for de-duplication
            hashes_fallback = existing_hashes
        else:
            # Use parent document hash `ref_doc_hash`
            # Fallback to document own hash if `ref_doc_hash` (parent level) is missing for de-duplication
            hashes_fallback = [
                existing_ref_hashes[i]
                if existing_ref_hashes[i] is not None
                else existing_hashes[i]
                for i in range(len(existing_ref_hashes))
            ]

        current_hashes = []
        current_unique_hashes = []
        dedup_documents_to_run = []

        for doc in documents:
            current_hashes.append(doc.hash)

            # Keep the doc only if it is unseen in the store, unseen in this
            # batch, and non-empty.
            if (
                doc.hash not in hashes_fallback
                and doc.hash not in current_unique_hashes
                and doc.get_content() != ""
            ):
                dedup_documents_to_run.append(doc)
                current_unique_hashes.append(
                    doc.hash,
                )  # Prevent duplicating same document hash in same batch flow execution.

        if self.doc_strategy == DocStrategy.DUPLICATE_AND_DELETE:
            # Stored documents whose hash is missing from the incoming batch
            # are considered stale and deleted.
            ids_to_remove = [
                ids[i]
                for i in range(len(hashes_fallback))
                if hashes_fallback[i] not in current_hashes
            ]

            if self.vector_store is not None:
                self.vector_store.delete_documents(ids_to_remove)

        return dedup_documents_to_run

    def _run_transformers(
        self,
        documents: List[Document],
        transformers: List[TransformerComponent],
    ) -> List[Document]:
        """Apply each transformer in order; a copy keeps the input list intact."""
        _documents = documents.copy()

        for transformer in transformers:
            _documents = transformer(_documents)

        return _documents

    def run(self, documents: Optional[List[Document]] = None) -> List[Document]:
        """
        Run an ingestion flow.

        Args:
            documents: Set of documents to be transformed.

        Returns:
            The transformed (and de-duplicated, if enabled) documents.

        Example:
            .. code-block:: python

                ingestion_flow.run(documents=documents)
        """
        documents_processed = []
        input_documents = self._read_documents(documents)

        if (
            self.vector_store is not None
            and self.doc_strategy != DocStrategy.DEDUPLICATE_OFF
            and not self.post_transformer
        ):
            # De-duplicate BEFORE transforming (parent-document level).

            documents_to_run = self._handle_duplicates(input_documents)
        else:
            # De-dup is disabled, deferred to after transformation, or there
            # is no vector store to compare against.
            documents_to_run = input_documents

        if documents_to_run:
            documents_processed = self._run_transformers(
                documents_to_run,
                self.transformers,
            )

        # De-duplicate AFTER transforming (chunk level).
        if (
            self.vector_store is not None
            and self.doc_strategy != DocStrategy.DEDUPLICATE_OFF
            and self.post_transformer
        ):
            documents_processed = self._handle_duplicates(documents_processed)

        if self.vector_store is not None and documents_processed:
            self.vector_store.add_documents(documents_processed)

        return documents_processed
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
from beekeeper.core.llms.base import BaseLLM
|
|
2
|
+
from beekeeper.core.llms.types import (
|
|
3
|
+
ChatMessage,
|
|
4
|
+
ChatResponse,
|
|
5
|
+
GenerateResponse,
|
|
6
|
+
MessageRole,
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
# Public API of `beekeeper.core.llms`.
# Fix: the original assigned a one-element tuple containing a list
# (`__all__ = ([...],)`), which breaks `from beekeeper.core.llms import *`
# because items of `__all__` must be strings.
__all__ = ["BaseLLM", "ChatMessage", "ChatResponse", "GenerateResponse", "MessageRole"]
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import Any, List, Optional
|
|
3
|
+
|
|
4
|
+
from beekeeper.core.llms.types import ChatMessage, ChatResponse, GenerateResponse
|
|
5
|
+
from beekeeper.core.observers import BaseObserver
|
|
6
|
+
from pydantic import BaseModel
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class BaseLLM(ABC, BaseModel):
    """An interface for LLMs."""

    # Allow non-pydantic field types (e.g. BaseObserver implementations).
    model_config = {"arbitrary_types_allowed": True}
    # NOTE(review): not referenced anywhere in this class — presumably consumed
    # by subclasses or the llms decorators; confirm before relying on it.
    callback_manager: Optional[BaseObserver] = None

    @classmethod
    def class_name(cls) -> str:
        return "BaseLLM"

    def convert_chat_messages(self, messages: List[ChatMessage]) -> List[dict]:
        """Convert chat messages to LLM message format (a list of plain dicts)."""
        return [message.model_dump() for message in messages]

    @abstractmethod
    def completion(self, prompt: str, **kwargs: Any) -> GenerateResponse:
        """Generates a completion for LLM."""

    @abstractmethod
    def text_completion(self, prompt: str, **kwargs: Any) -> str:
        """Generates a text completion for LLM."""

    @abstractmethod
    def chat_completion(
        self, messages: List[ChatMessage], **kwargs: Any
    ) -> ChatResponse:
        """Generates a chat completion for LLM."""