evaluation-embedder 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evaluation_embedder-0.1.0/PKG-INFO +19 -0
- evaluation_embedder-0.1.0/README.md +0 -0
- evaluation_embedder-0.1.0/evaluation_embedder/__init__.py +0 -0
- evaluation_embedder-0.1.0/evaluation_embedder/src/__init__.py +0 -0
- evaluation_embedder-0.1.0/evaluation_embedder/src/constants.py +38 -0
- evaluation_embedder-0.1.0/evaluation_embedder/src/datasets/__init__.py +289 -0
- evaluation_embedder-0.1.0/evaluation_embedder/src/datasets/polars.py +26 -0
- evaluation_embedder-0.1.0/evaluation_embedder/src/evaluation/__init__.py +175 -0
- evaluation_embedder-0.1.0/evaluation_embedder/src/evaluation/embedders.py +27 -0
- evaluation_embedder-0.1.0/evaluation_embedder/src/evaluation/evaluators.py +50 -0
- evaluation_embedder-0.1.0/evaluation_embedder/src/evaluation/processors.py +16 -0
- evaluation_embedder-0.1.0/evaluation_embedder/src/evaluation/retrievers.py +26 -0
- evaluation_embedder-0.1.0/evaluation_embedder/src/evaluation/scores.py +59 -0
- evaluation_embedder-0.1.0/evaluation_embedder/src/evaluation/vector_stores.py +115 -0
- evaluation_embedder-0.1.0/evaluation_embedder/src/mixins.py +61 -0
- evaluation_embedder-0.1.0/evaluation_embedder/src/settings.py +144 -0
- evaluation_embedder-0.1.0/evaluation_embedder/src/utils.py +8 -0
- evaluation_embedder-0.1.0/pyproject.toml +35 -0
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: evaluation-embedder
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary:
|
|
5
|
+
Author: jalal
|
|
6
|
+
Author-email: jalalkhaldi3@gmail.com
|
|
7
|
+
Requires-Python: 3.12.9
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Requires-Dist: faiss-cpu (>=1.13.2,<2.0.0)
|
|
10
|
+
Requires-Dist: langchain (>=1.2.3,<2.0.0)
|
|
11
|
+
Requires-Dist: minio (>=7.2.20,<8.0.0)
|
|
12
|
+
Requires-Dist: numpy (>=2.4.1,<3.0.0)
|
|
13
|
+
Requires-Dist: openai (>=2.15.0,<3.0.0)
|
|
14
|
+
Requires-Dist: polars (>=1.37.1,<2.0.0)
|
|
15
|
+
Requires-Dist: pydantic-settings (>=2.12.0,<3.0.0)
|
|
16
|
+
Requires-Dist: qdrant-client (>=1.16.2,<2.0.0)
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
|
|
19
|
+
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from typing import TYPE_CHECKING, Literal, TypeAlias, TypeVar
|
|
2
|
+
|
|
3
|
+
from polars import Enum
|
|
4
|
+
|
|
5
|
+
if TYPE_CHECKING:
|
|
6
|
+
from evaluation_embedder.src.settings import (
|
|
7
|
+
DatasetSettings,
|
|
8
|
+
EmbedderSettings,
|
|
9
|
+
EvaluatorSettings,
|
|
10
|
+
FromConfigMixinSettings,
|
|
11
|
+
ProcessorSettings,
|
|
12
|
+
RetrieverSettings,
|
|
13
|
+
ScoreSettings,
|
|
14
|
+
VectorStoreSettings,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
CONFIG_PATH = "/config/config.yaml"
|
|
18
|
+
|
|
19
|
+
TDataset = TypeVar("TDataset")
|
|
20
|
+
ParquetCompression: TypeAlias = Literal["lz4", "uncompressed", "snappy", "gzip", "brotli", "zstd"]
|
|
21
|
+
TCFromConfigMixin = TypeVar("TCFromConfigMixin", bound="FromConfigMixinSettings")
|
|
22
|
+
TCDataset = TypeVar("TCDataset", bound="DatasetSettings")
|
|
23
|
+
TCEmbedder = TypeVar("TCEmbedder", bound="EmbedderSettings")
|
|
24
|
+
TCEvaluator = TypeVar("TCEvaluator", bound="EvaluatorSettings")
|
|
25
|
+
TCProcessor = TypeVar("TCProcessor", bound="ProcessorSettings")
|
|
26
|
+
TCRetriever = TypeVar("TCRetriever", bound="RetrieverSettings")
|
|
27
|
+
TCScore = TypeVar("TCScore", bound="ScoreSettings")
|
|
28
|
+
TCVectorStore = TypeVar("TCVectorStore", bound="VectorStoreSettings")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class EmbeddingPurposeEnum(Enum):
|
|
32
|
+
DOCUMENT = "document"
|
|
33
|
+
QUERY = "query"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class FAISSIndexType(Enum):
|
|
37
|
+
FLAT_IP = "flat_ip"
|
|
38
|
+
FLAT_L2 = "flat_l2"
|
|
@@ -0,0 +1,289 @@
|
|
|
1
|
+
import io
|
|
2
|
+
import logging
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import (
|
|
6
|
+
Any,
|
|
7
|
+
Dict,
|
|
8
|
+
Generic,
|
|
9
|
+
Iterable,
|
|
10
|
+
Iterator,
|
|
11
|
+
List,
|
|
12
|
+
Literal,
|
|
13
|
+
Optional,
|
|
14
|
+
Self,
|
|
15
|
+
Tuple,
|
|
16
|
+
Union,
|
|
17
|
+
cast,
|
|
18
|
+
overload,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
import polars as pl
|
|
22
|
+
from langchain_core.documents import Document
|
|
23
|
+
from minio import Minio
|
|
24
|
+
from polars._typing import ColumnNameOrSelector, IntoExpr, IntoExprColumn
|
|
25
|
+
|
|
26
|
+
from evaluation_embedder.src.constants import ParquetCompression, TDataset
|
|
27
|
+
from evaluation_embedder.src.settings import (
|
|
28
|
+
MinioDatasetSettings,
|
|
29
|
+
ParquetDatasetSettings,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
_logger = logging.getLogger(__name__)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class Dataset(ABC, Generic[TDataset]):
    """Abstract dataset wrapper around an arbitrary backing service.

    Subclasses implement the conversions to/from polars frames; the rest
    of the API (parquet/MinIO loading, filtering, iteration) is built on
    top of those conversions. Converted frames are cached per instance.
    """

    def __init__(self, service: TDataset):
        super().__init__()
        self.service = service
        # Lazily-filled caches for the eager/lazy polars conversions.
        self._polars: Optional[pl.DataFrame] = None
        self._lazy_polars: Optional[pl.LazyFrame] = None

    @classmethod
    @abstractmethod
    def from_polars(cls, df: Union[pl.DataFrame, pl.LazyFrame]) -> "Dataset[TDataset]":
        """Build a dataset instance from an (eager or lazy) polars frame."""
        raise NotImplementedError

    @abstractmethod
    def to_polars(self) -> pl.DataFrame:
        """Return the backing data as an eager polars DataFrame."""
        raise NotImplementedError

    @abstractmethod
    def to_lazy_polars(self) -> pl.LazyFrame:
        """Return the backing data as a polars LazyFrame."""
        raise NotImplementedError

    @classmethod
    def from_parquet(
        cls,
        path: Union[str, Path],
        *,
        lazy: bool = True,
    ) -> "Dataset[TDataset]":
        """Load a parquet file; scans lazily by default."""
        df: Union[pl.DataFrame, pl.LazyFrame]
        if lazy:
            df = pl.scan_parquet(path)
        else:
            df = pl.read_parquet(path)

        return cls.from_polars(df)

    @classmethod
    def from_minio(
        cls,
        *,
        bucket: str,
        key: str,
        endpoint_url: str,
        access_key: str,
        secret_key: str,
    ) -> "Dataset[TDataset]":
        """Download a parquet object from MinIO and load it eagerly."""
        client = Minio(
            # Minio() expects a bare host[:port]; the scheme is conveyed via `secure`.
            endpoint_url.replace("http://", "").replace("https://", ""),
            access_key=access_key,
            secret_key=secret_key,
            secure=endpoint_url.startswith("https://"),
        )
        response = client.get_object(bucket, key)
        try:
            buffer = io.BytesIO(response.read())
        finally:
            # Always release the HTTP connection back to the pool.
            response.close()
            response.release_conn()

        df = pl.read_parquet(buffer)
        return cls.from_polars(df)

    @property
    def polars(self) -> pl.DataFrame:
        """Eager frame, converted once and cached."""
        if self._polars is None:
            self._polars = self.to_polars()
        return self._polars

    @property
    def lazy_polars(self) -> pl.LazyFrame:
        """Lazy frame, converted once and cached."""
        if self._lazy_polars is None:
            self._lazy_polars = self.to_lazy_polars()
        return self._lazy_polars

    @property
    def polars_shape(self) -> Tuple[int, int]:
        """(rows, cols) of the eager frame; forces materialization."""
        return self.polars.shape

    @classmethod
    def from_documents(cls, docs: List[Document]) -> "TextDataset[TDataset]":
        """Build a text dataset from langchain Documents (page_content + metadata)."""
        df = pl.DataFrame(
            {
                "page_content": [d.page_content for d in docs],
                "metadata": [d.metadata or {} for d in docs],
            }
        )
        return cast(TextDataset[TDataset], cls.from_polars(df))

    def with_columns(
        self,
        *exprs: IntoExpr | Iterable[IntoExpr],
        **named_exprs: IntoExpr,
    ) -> "Dataset[TDataset]":
        """Return a new dataset with the given columns added/replaced."""
        df = self.polars.with_columns(*exprs, **named_exprs)
        return self.__class__.from_polars(df)

    def filter(
        self,
        *predicates: (IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool]),
        **constraints: Any,
    ) -> "Dataset[TDataset]":
        """Return a new dataset containing only rows matching the predicates."""
        return self.__class__.from_polars(self.polars.filter(*predicates, **constraints))

    def drop(
        self,
        *columns: ColumnNameOrSelector | Iterable[ColumnNameOrSelector],
        strict: bool = True,
    ) -> "Dataset[TDataset]":
        """Return a new dataset without the given columns."""
        return self.__class__.from_polars(self.polars.drop(*columns, strict=strict))

    def __len__(self) -> int:
        """Number of rows (materializes the eager frame)."""
        return self.polars_shape[0]

    @overload
    def iter_rows(self, *, named: Literal[False] = ..., buffer_size: int = ...) -> Iterator[Tuple[Any, ...]]: ...

    @overload
    def iter_rows(self, *, named: Literal[True], buffer_size: int = ...) -> Iterator[Dict[str, Any]]: ...

    def iter_rows(
        self, *, named: Literal[False, True] = False, buffer_size: int = 512
    ) -> Iterator[Tuple[Any, ...]] | Iterator[Dict[str, Any]]:
        """Iterate rows as tuples (or dicts when named=True) via a streaming collect."""
        # NOTE(review): collect(streaming=True) is deprecated in newer polars
        # in favour of engine="streaming" — confirm against the pinned version.
        df_stream = self.lazy_polars.collect(streaming=True)  # type: ignore[call-overload]
        return df_stream.iter_rows(named=named, buffer_size=buffer_size)  # type: ignore[no-any-return]
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
class TextDataset(Dataset[TDataset], Generic[TDataset]):
    """Dataset of text rows with the fixed schema {page_content, metadata}."""

    # Columns every backing frame must provide; checked at construction.
    REQUIRED_COLUMNS = {"page_content", "metadata"}

    def __init__(self, service: TDataset):
        super().__init__(service)
        # Fail fast if the backing frame lacks the required columns.
        self._validate_schema()

    def _validate_schema(self) -> None:
        """Raise ValueError when a required column is missing."""
        df = self.polars
        missing = self.REQUIRED_COLUMNS - set(df.columns)
        if missing:
            raise ValueError(
                f"{self.__class__.__name__} requires columns {self.REQUIRED_COLUMNS}, " f"but missing {missing}"
            )

    def iter_documents(self) -> Iterator[Document]:
        """Yield each row as a langchain Document."""
        for row in self.polars.iter_rows(named=True):
            yield Document(
                page_content=row["page_content"],
                metadata=row["metadata"],
            )

    def dump_documents(
        self,
        out_dir: str,
        prefix: str = "doc",
        ext: str = ".md",
        encoding: str = "utf-8",
    ) -> None:
        """Write each row's page_content to <out_dir>/<prefix>_<i:05d><ext>."""
        out_dir_path = Path(out_dir)
        out_dir_path.mkdir(parents=True, exist_ok=True)
        for i, row in enumerate(self.polars.iter_rows(named=True)):
            path = out_dir_path / f"{prefix}_{i:05d}{ext}"
            path.write_text(row["page_content"], encoding=encoding)

    def to_langchain_documents(
        self,
    ) -> list[Document]:
        """Materialize every row as a langchain Document."""
        docs: list[Document] = []
        for row in self.polars.iter_rows(named=True):
            docs.append(
                Document(
                    page_content=row["page_content"],
                    metadata=row["metadata"],
                )
            )
        return docs

    def to_minio(
        self,
        *,
        bucket: str,
        key: str,
        endpoint_url: str,
        access_key: str,
        secret_key: str,
        compression: ParquetCompression = "zstd",
        row_group_size: int = 100_000,
    ) -> None:
        """Serialize the dataset to parquet and upload it to MinIO.

        NOTE(review): the full parquet file is buffered in memory before
        upload — confirm expected dataset sizes fit in RAM.
        """
        client = Minio(
            # Minio() expects a bare host[:port]; the scheme is conveyed via `secure`.
            endpoint_url.replace("http://", "").replace("https://", ""),
            access_key=access_key,
            secret_key=secret_key,
            secure=endpoint_url.startswith("https://"),
        )
        buffer = io.BytesIO()
        self.polars.write_parquet(
            buffer,
            compression=compression,
            row_group_size=row_group_size,
        )
        buffer.seek(0)
        client.put_object(
            bucket_name=bucket,
            object_name=key,
            data=buffer,
            length=buffer.getbuffer().nbytes,
            content_type="application/octet-stream",
        )

    @overload
    @classmethod
    def from_config(
        cls,
        config: ParquetDatasetSettings,
    ) -> Self: ...

    @overload
    @classmethod
    def from_config(
        cls,
        config: MinioDatasetSettings,
    ) -> Self: ...

    @classmethod
    def from_config(
        cls,
        config: Union[ParquetDatasetSettings, MinioDatasetSettings],
    ) -> Self:
        """Build a dataset from a parquet- or MinIO-style settings object."""
        if isinstance(config, ParquetDatasetSettings):
            df = pl.scan_parquet(config.path) if config.lazy else pl.read_parquet(config.path)
            return cast(Self, cls.from_polars(df))
        if isinstance(config, MinioDatasetSettings):
            return cast(
                Self,
                cls.from_minio(
                    bucket=config.bucket,
                    key=config.key,
                    endpoint_url=config.endpoint,
                    access_key=config.access_key,
                    secret_key=config.secret_key,
                ),
            )

        raise TypeError(f"Unsupported dataset config: {type(config).__name__}")

    @classmethod
    def from_records(
        cls,
        records: List[Tuple[str, Dict[str, Any]]],
    ) -> "TextDataset[TDataset]":
        """Build a dataset from (text, metadata) pairs; rejects empty input."""
        if not records:
            raise ValueError("records must be non-empty")
        df = pl.DataFrame(
            {
                "page_content": [text for text, _ in records],
                "metadata": [meta for _, meta in records],
            }
        )
        return cast("TextDataset[TDataset]", cls.from_polars(df))
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from typing import Union, cast
|
|
2
|
+
|
|
3
|
+
import polars as pl
|
|
4
|
+
|
|
5
|
+
from evaluation_embedder.src.constants import TDataset
|
|
6
|
+
from evaluation_embedder.src.datasets import TextDataset
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class PolarsTextDataset(TextDataset[Union[pl.DataFrame, pl.LazyFrame]]):
    """Text dataset backed directly by a polars DataFrame or LazyFrame."""

    def __init__(self, service: Union[pl.DataFrame, pl.LazyFrame]):
        super().__init__(service)

    @classmethod
    def from_polars(cls, df: Union[pl.DataFrame, pl.LazyFrame]) -> "TextDataset[TDataset]":
        # Wrap the frame as-is; schema validation runs in TextDataset.__init__.
        dataset = cls(df)
        return cast("TextDataset[TDataset]", dataset)

    def to_polars(self) -> pl.DataFrame:
        # Collect only when the backing frame is lazy.
        frame = self.service
        return frame.collect() if isinstance(frame, pl.LazyFrame) else frame

    def to_lazy_polars(self) -> pl.LazyFrame:
        # Promote an eager frame to lazy; pass a lazy frame through unchanged.
        frame = self.service
        return frame if isinstance(frame, pl.LazyFrame) else frame.lazy()
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import logging
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from typing import Any, Generic, List, Optional, cast
|
|
5
|
+
|
|
6
|
+
from langchain_core.documents import (
|
|
7
|
+
Document, # or: from langchain.schema import Document
|
|
8
|
+
)
|
|
9
|
+
from pydantic import BaseModel, Field
|
|
10
|
+
|
|
11
|
+
from evaluation_embedder.src.constants import (
|
|
12
|
+
EmbeddingPurposeEnum,
|
|
13
|
+
TCEmbedder,
|
|
14
|
+
TCEvaluator,
|
|
15
|
+
TCProcessor,
|
|
16
|
+
TCRetriever,
|
|
17
|
+
TCScore,
|
|
18
|
+
TCVectorStore,
|
|
19
|
+
)
|
|
20
|
+
from evaluation_embedder.src.datasets import TextDataset
|
|
21
|
+
from evaluation_embedder.src.mixins import FromConfigMixin
|
|
22
|
+
from evaluation_embedder.src.utils import load_class
|
|
23
|
+
|
|
24
|
+
_logger = logging.getLogger(__name__)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class Score(ABC, Generic[TCScore]):
    """Base class for ranking metrics computed from a per-rank hit list."""

    class ScoreResult(BaseModel):
        # Human-readable metric name, e.g. "Recall@5".
        name: str
        # Metric value; concrete scores in this package produce ratios in [0, 1].
        value: float

    def __init__(self, config: TCScore) -> None:
        self.config = config
        _logger.info(f"Initialized Score | class={self.__class__.__name__} | config={config}")

    @abstractmethod
    def __call__(self, hits: List[bool]) -> ScoreResult:
        """Compute the metric from hits ordered by retrieval rank (best first)."""
        raise NotImplementedError()
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class Processor(FromConfigMixin[TCProcessor], ABC, Generic[TCProcessor]):
    """Text pre-processor applied before embedding (e.g. purpose prefixes)."""

    def __init__(self, config: TCProcessor) -> None:
        self.config = config

    @abstractmethod
    def __call__(self, text: str, purpose: EmbeddingPurposeEnum) -> str:
        """Return *text* transformed for the given embedding *purpose*."""
        raise NotImplementedError()
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class Embedder(FromConfigMixin[TCEmbedder], ABC, Generic[TCEmbedder]):
    """Async embedding backend with optional purpose-aware text processing."""

    def __init__(self, config: TCEmbedder) -> None:
        self.config = config
        _logger.info(f"Initialized Embedder | class={self.__class__.__name__} | config={config}")

    @abstractmethod
    async def _aembed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed already-processed texts; implemented by concrete backends."""
        raise NotImplementedError()

    def process_query(self, text: str, processor: Optional[Processor[Any]]) -> str:
        """Run *text* through *processor* with QUERY purpose, when one is given."""
        if not processor:
            return text
        return processor(text=text, purpose=EmbeddingPurposeEnum.QUERY)  # type: ignore[arg-type]

    def process_document(self, text: str, processor: Optional[Processor[Any]]) -> str:
        """Run *text* through *processor* with DOCUMENT purpose, when one is given."""
        if not processor:
            return text
        return processor(text=text, purpose=EmbeddingPurposeEnum.DOCUMENT)  # type: ignore[arg-type]

    async def aembed_documents(
        self,
        texts: List[str],
        processor: Optional[Processor] = None,
    ) -> List[List[float]]:
        """Process each text with DOCUMENT purpose, then embed the batch."""
        prepared = [self.process_document(raw, processor) for raw in texts]
        return await self._aembed_documents(prepared)

    async def aembed_query(self, text: str, processor: Optional[Processor] = None) -> List[float]:
        """Process *text* with QUERY purpose and return its single embedding."""
        processed = self.process_query(text, processor)
        embeddings = await self.aembed_documents([processed])
        return embeddings[0]
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
class VectorStore(FromConfigMixin[TCVectorStore], ABC, Generic[TCVectorStore]):
    """Abstract nearest-neighbour store queried with a raw embedding vector."""

    class ScoredPoint(BaseModel):
        # Similarity/distance of the point relative to the query vector.
        score: float = Field(..., description="Points vector distance to the query vector")
        document: Document

    class QueryResponse(BaseModel):
        # Points in the order returned by the backend (presumably best first —
        # backend-dependent; confirm per implementation).
        points: List["VectorStore.ScoredPoint"]

    def __init__(self, config: TCVectorStore) -> None:
        self.config = config
        _logger.info(f"Initialized VectorStore | class={self.__class__.__name__} | config={config}")

    @abstractmethod
    def query_points(self, query: List[float], *, limit: int) -> QueryResponse:
        """Return up to *limit* scored points nearest to *query*."""
        raise NotImplementedError()
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class Retriever(FromConfigMixin[TCRetriever], ABC, Generic[TCRetriever]):
    """Combines an embedder, a vector store and a processor into query retrieval.

    All three collaborators are instantiated dynamically from the module
    paths declared in the retriever settings.
    """

    def __init__(self, config: TCRetriever) -> None:
        self.config = config

        _logger.info(f"Initializing Retriever | class={self.__class__.__name__}")

        # Each component class is resolved from its settings' module_path.
        self.embedder: Embedder[Any] = load_class(self.config.embedder.module_path)(self.config.embedder)

        self.vector_store: VectorStore[Any] = load_class(self.config.vector_store.module_path)(self.config.vector_store)

        self.processor: Processor[Any] = load_class(self.config.processor.module_path)(self.config.processor)

        _logger.info(
            f"Retriever initialized | embedder={type(self.embedder).__name__} | "
            f"vector_store={type(self.vector_store).__name__}"
        )

    async def retrieve(self, query: str, *, limit: int) -> VectorStore.QueryResponse:
        """Embed *query* (through the processor) and return the top *limit* points."""
        query_embedding = await self.embedder.aembed_query(query, self.processor)
        return self.vector_store.query_points(
            query=query_embedding,
            limit=limit,
        )
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
class Evaluator(FromConfigMixin[TCEvaluator], ABC, Generic[TCEvaluator]):
    """Runs retrieval over a labeled dataset and aggregates ranking scores."""

    # Progress is logged every this many processed queries.
    NB_LOGS_PER_QUERIES = 100

    def __init__(self, config: TCEvaluator) -> None:
        self.config = config

        _logger.info(f"Initializing Evaluator | class={self.__class__.__name__}")

        # Dataset, retriever and scores are all resolved from module paths.
        self.dataset: TextDataset[Any] = self._load_dataset()
        self.retriever: Retriever[Any] = load_class(self.config.retriever.module_path)(self.config.retriever)
        self.scores: List[Score[Any]] = [load_class(s.module_path)(s) for s in self.config.scores]

        _logger.info(
            f"Evaluator ready | dataset_size={len(self.dataset)} | " f"scores={[type(s).__name__ for s in self.scores]}"
        )

    def _load_dataset(self) -> TextDataset[Any]:
        """Instantiate the dataset class named in the settings and load it."""
        dataset_cls = load_class(self.config.dataset.module_path)
        _logger.info(
            f"Instantiating dataset | class={dataset_cls.__name__} | " f"module_path={self.config.dataset.module_path}"
        )
        dataset = cast(
            TextDataset[Any],
            dataset_cls.from_config(self.config.dataset),
        )
        _logger.info(f"Dataset loaded | type={type(dataset).__name__} | " f"size={len(dataset)}")
        return dataset

    async def eval(self) -> List[List[Score.ScoreResult]]:
        """Evaluate every query in the dataset; returns one score list per query.

        NOTE(review): max() raises on an empty scores list — confirm settings
        guarantee at least one configured score. Also, each row's page_content
        is processed with DOCUMENT purpose before comparison; this assumes the
        indexed documents were processed identically — verify against the
        indexing path (e.g. FaissEvaluator.create embeds raw texts).
        """
        scores_all = []
        # Retrieve once per query with the deepest k any score needs.
        max_k = max(getattr(score.config, "k", 0) for score in self.scores)
        _logger.info(f"Starting evaluation | max_k={max_k} | num_queries={len(self.dataset)}")
        for idx, sample in enumerate(self.dataset.iter_rows(named=True), start=1):
            scores = []
            query = sample["metadata"]["query"]
            page_content = self.retriever.processor(sample["page_content"], purpose=EmbeddingPurposeEnum.DOCUMENT)  # type: ignore[arg-type]
            response = await self.retriever.retrieve(
                query,
                limit=max_k,
            )
            # A hit is an exact page_content match at that rank.
            hits = [p.document.page_content == page_content for p in response.points]
            scores = [s(hits) for s in self.scores]
            scores_all.append(scores)
            if idx % self.__class__.NB_LOGS_PER_QUERIES == 0 or idx == len(self.dataset):
                _logger.info(f"Eval progress | processed={idx}/{len(self.dataset)}")
        _logger.info("Evaluation completed")
        return scores_all
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
|
|
3
|
+
from openai import AsyncOpenAI
|
|
4
|
+
|
|
5
|
+
from evaluation_embedder.src.evaluation import Embedder
|
|
6
|
+
from evaluation_embedder.src.settings import VLLMEmbedderSettings
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class VLLMEmbedder(Embedder[VLLMEmbedderSettings]):
    """Embedder backed by a vLLM server's OpenAI-compatible embeddings endpoint."""

    def __init__(self, config: VLLMEmbedderSettings):
        super().__init__(config)
        # vLLM ignores the API key, but the client still wants the argument.
        self.client = AsyncOpenAI(
            base_url=self.config.base_url,
            api_key="",
        )

    async def _aembed_documents(self, texts: List[str]) -> List[List[float]]:
        """Async embedding via vLLM OpenAI-compatible API."""
        response = await self.client.embeddings.create(
            model=self.config.model_name,
            input=texts,
        )
        # The API returns one item per input, in input order — keep that order.
        return [item.embedding for item in response.data]
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
from typing import List, cast
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
from langchain_core.documents import Document
|
|
5
|
+
|
|
6
|
+
from evaluation_embedder.src.datasets.polars import PolarsTextDataset
|
|
7
|
+
from evaluation_embedder.src.evaluation import Evaluator
|
|
8
|
+
from evaluation_embedder.src.evaluation.vector_stores import FaissVectorStore
|
|
9
|
+
from evaluation_embedder.src.settings import (
|
|
10
|
+
FaissEvaluatorSettings,
|
|
11
|
+
QdrantEvaluatorSettings,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class QdrantEvaluator(Evaluator[QdrantEvaluatorSettings]):
    """Evaluator that queries a pre-populated Qdrant collection (no local indexing)."""

    def __init__(self, config: QdrantEvaluatorSettings):
        super().__init__(config)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class FaissEvaluator(Evaluator[FaissEvaluatorSettings]):
    """Evaluator that builds an in-memory FAISS index from the dataset first."""

    def __init__(self, config: FaissEvaluatorSettings):
        super().__init__(config)

    @classmethod
    async def create(cls, config: FaissEvaluatorSettings) -> "FaissEvaluator":
        """Async factory: instantiate, embed the corpus and populate the index."""
        evaluator = cls(config)

        documents = evaluator.get_docs()
        corpus = [doc.page_content for doc in documents]

        vectors = np.asarray(
            await evaluator.retriever.embedder.aembed_documents(corpus),
            dtype="float32",
        )

        store = cast(FaissVectorStore, evaluator.retriever.vector_store)
        store.index = store.build_faiss_index(vectors.shape[-1])
        store.add_documents(documents, vectors)

        return evaluator

    def get_docs(self) -> List[Document]:
        """Return one Document per distinct metadata doc_id (first occurrence kept)."""
        seen_ids = set()
        unique_rows = []
        for row_idx, row in enumerate(self.dataset.iter_rows(named=True)):
            identifier = row["metadata"]["doc_id"]
            if identifier in seen_ids:
                continue
            seen_ids.add(identifier)
            unique_rows.append(row_idx)
        return PolarsTextDataset(self.dataset.polars[unique_rows]).to_langchain_documents()
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from evaluation_embedder.src.constants import EmbeddingPurposeEnum
|
|
2
|
+
from evaluation_embedder.src.evaluation import Processor
|
|
3
|
+
from evaluation_embedder.src.settings import NomicProcessorSettings
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class NomicProcessor(Processor[NomicProcessorSettings]):
    """Adds the task-specific prefixes Nomic embedding models expect."""

    def __init__(self, config: NomicProcessorSettings) -> None:
        super().__init__(config)

    def __call__(self, text: str, purpose: EmbeddingPurposeEnum) -> str:
        """Prefix *text* with the Nomic task tag matching *purpose*."""
        prefix_by_purpose = {
            EmbeddingPurposeEnum.DOCUMENT: "search_document",
            EmbeddingPurposeEnum.QUERY: "search_query",
        }
        if purpose not in prefix_by_purpose:
            raise ValueError(f"Unsupported embedding purpose {purpose}")
        return f"{prefix_by_purpose[purpose]}: {text}"
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from evaluation_embedder.src.evaluation import Retriever
|
|
2
|
+
from evaluation_embedder.src.settings import (
|
|
3
|
+
FaissVectorStoreSettings,
|
|
4
|
+
NomicProcessorSettings,
|
|
5
|
+
QdrantVectorStoreSettings,
|
|
6
|
+
RetrieverSettings,
|
|
7
|
+
VLLMEmbedderSettings,
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class VLLMFAISSRetriever(
    Retriever[RetrieverSettings[VLLMEmbedderSettings, FaissVectorStoreSettings, NomicProcessorSettings]]
):
    """Retriever wiring a vLLM embedder to an in-memory FAISS store with Nomic prefixes."""

    def __init__(
        self, config: RetrieverSettings[VLLMEmbedderSettings, FaissVectorStoreSettings, NomicProcessorSettings]
    ):
        super().__init__(config)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class VLLMQdrantRetriever(
    Retriever[RetrieverSettings[VLLMEmbedderSettings, QdrantVectorStoreSettings, NomicProcessorSettings]]
):
    """Retriever wiring a vLLM embedder to a Qdrant store with Nomic prefixes."""

    def __init__(
        self, config: RetrieverSettings[VLLMEmbedderSettings, QdrantVectorStoreSettings, NomicProcessorSettings]
    ):
        super().__init__(config)
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
|
|
3
|
+
from evaluation_embedder.src.evaluation import Score
|
|
4
|
+
from evaluation_embedder.src.settings import (
|
|
5
|
+
HitAtKScoreSettings,
|
|
6
|
+
MRRAtKScoreSettings,
|
|
7
|
+
PrecisionAtKScoreSettings,
|
|
8
|
+
RecallAtKScoreSettings,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class RecallAtK(Score[RecallAtKScoreSettings]):
    """Fraction of the top-k retrieved results that are hits.

    NOTE(review): the value is sum(hits)/len(hits) over the top-k, i.e. a
    hit rate among retrieved results — confirm this matches the intended
    recall definition (denominator = number of relevant items).
    """

    def __call__(self, hits: List[bool]) -> Score.ScoreResult:
        top_k = hits[: self.config.k]
        ratio = sum(top_k) / len(top_k) if top_k else 0.0
        return Score.ScoreResult(
            name=f"Recall@{self.config.k}",
            value=ratio,
        )
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class PrecisionAtK(Score[PrecisionAtKScoreSettings]):
    """Number of hits in the top-k divided by k."""

    def __call__(self, hits: List[bool]) -> Score.ScoreResult:
        relevant_in_top_k = sum(hits[: self.config.k])
        return Score.ScoreResult(
            name=f"Precision@{self.config.k}",
            value=relevant_in_top_k / self.config.k,
        )
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class HitAtK(Score[HitAtKScoreSettings]):
    """1.0 when any of the top-k results is a hit, else 0.0."""

    def __call__(self, hits: List[bool]) -> Score.ScoreResult:
        found = any(hits[: self.config.k])
        return Score.ScoreResult(
            name=f"Hit@{self.config.k}",
            value=float(found),
        )
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class MRRAtK(Score[MRRAtKScoreSettings]):
    """Reciprocal rank of the first hit within the top-k (0.0 when none)."""

    def __call__(self, hits: List[bool]) -> Score.ScoreResult:
        # First hit wins: 1/rank for the earliest True, else the 0.0 default.
        reciprocal = next(
            (1.0 / rank for rank, is_hit in enumerate(hits[: self.config.k], start=1) if is_hit),
            0.0,
        )
        return Score.ScoreResult(
            name=f"MRR@{self.config.k}",
            value=reciprocal,
        )
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
from typing import List, Optional

import faiss  # type:ignore[import-untyped]
import numpy as np
from langchain_core.documents import Document
from qdrant_client import QdrantClient

from evaluation_embedder.src.constants import FAISSIndexType
from evaluation_embedder.src.evaluation import VectorStore
from evaluation_embedder.src.settings import (
    FaissVectorStoreSettings,
    QdrantVectorStoreSettings,
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class FaissVectorStore(VectorStore[FaissVectorStoreSettings]):
    """In-memory FAISS store keeping Documents aligned with index rows."""

    def __init__(self, config: FaissVectorStoreSettings):
        super().__init__(config)
        # documents[i] corresponds to vector i in the FAISS index.
        self.documents: List[Document] = []
        # Created lazily via build_faiss_index() once the embedding dim is known.
        self.index: Optional[faiss.Index] = None

    def build_faiss_index(self, dim: int) -> faiss.Index:
        """Create a flat index of dimension *dim* (inner-product or L2)."""
        if self.config.index_type is FAISSIndexType.FLAT_IP:
            return faiss.IndexFlatIP(dim)
        return faiss.IndexFlatL2(dim)

    # --------------------------------------------------
    # Add documents
    # --------------------------------------------------
    def add_documents(
        self,
        documents: List[Document],
        embeddings: np.ndarray,
    ) -> None:
        """Append *documents* and their *embeddings* to the index.

        NOTE: when config.normalize is set, *embeddings* is L2-normalized
        in place, mutating the caller's array.

        Raises:
            ValueError: if the index has not been created yet.
        """
        # Validate before any side effect (the original normalized the
        # caller's array even when it was about to raise).
        if self.index is None:
            raise ValueError("index should be created before adding documents")
        if self.config.normalize:
            faiss.normalize_L2(embeddings)
        self.index.add(embeddings.astype("float32"))
        self.documents.extend(documents)

    # --------------------------------------------------
    # Query (same contract as Qdrant)
    # --------------------------------------------------
    def query_points(
        self,
        query: List[float],
        *,
        limit: int,
    ) -> VectorStore.QueryResponse:
        """Return up to *limit* nearest documents with their FAISS scores.

        Raises:
            ValueError: if the index has not been created yet.
        """
        if self.index is None:
            raise ValueError("index should be created before querying points")

        query_vec = np.asarray(query, dtype="float32")[None, :]
        if self.config.normalize:
            faiss.normalize_L2(query_vec)

        scores, indices = self.index.search(query_vec, limit)

        points: List[VectorStore.ScoredPoint] = []

        for idx, score in zip(indices[0], scores[0]):
            # FAISS pads missing results with -1 when fewer than `limit` exist.
            if idx == -1:
                continue
            points.append(
                VectorStore.ScoredPoint(
                    score=float(score),
                    document=self.documents[idx],
                )
            )
        return VectorStore.QueryResponse(points=points)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class QdrantVectorStore(VectorStore[QdrantVectorStoreSettings]):
    """Vector store backed by a remote Qdrant collection."""

    def __init__(self, config: QdrantVectorStoreSettings):
        super().__init__(config)
        self.client = QdrantClient(url=self.config.url)

    def query_points(self, query: List[float], *, limit: int) -> VectorStore.QueryResponse:
        """Return up to ``limit`` scored documents for ``query`` from Qdrant.

        Raises:
            ValueError: if Qdrant returns a point without a payload.
            KeyError: if a payload lacks 'page_content' or 'metadata'.
        """
        result = self.client.query_points(
            collection_name=self.config.collection_name,
            query=query,
            limit=limit,
        )
        points: List[VectorStore.ScoredPoint] = []
        for idx, point in enumerate(result.points):
            payload = point.payload
            if payload is None:
                raise ValueError(
                    f"Qdrant returned a point with no payload | "
                    f"collection={self.config.collection_name} | "
                    f"index={idx} | score={point.score}"
                )
            if "page_content" not in payload:
                raise KeyError(
                    f"Missing 'page_content' in payload | "
                    f"collection={self.config.collection_name} | "
                    f"keys={list(payload.keys())}"
                )
            # Previously a bare KeyError('metadata') escaped from the subscript
            # below; validate explicitly so the error is diagnosable, matching
            # the 'page_content' check above. Exception type is unchanged.
            if "metadata" not in payload:
                raise KeyError(
                    f"Missing 'metadata' in payload | "
                    f"collection={self.config.collection_name} | "
                    f"keys={list(payload.keys())}"
                )
            page_content = payload["page_content"]
            metadata = dict(payload["metadata"])
            points.append(
                VectorStore.ScoredPoint(
                    score=point.score,
                    document=Document(
                        page_content=page_content,
                        metadata=metadata,
                    ),
                )
            )

        return VectorStore.QueryResponse(points=points)
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
from abc import ABC
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Any, Dict, Generic, Self, Type, get_args
|
|
4
|
+
|
|
5
|
+
import yaml
|
|
6
|
+
|
|
7
|
+
from evaluation_embedder.src.constants import TCFromConfigMixin
|
|
8
|
+
from evaluation_embedder.src.utils import load_class
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class FromConfigMixin(ABC, Generic[TCFromConfigMixin]):
    """Mixin providing uniform construction of runtime objects from a validated
    config object, a YAML file, or Pydantic Settings resolution."""

    def __init__(self, config: TCFromConfigMixin) -> None:
        super().__init__()
        # Validated settings object; subclasses read their options from here.
        self.config = config

    @classmethod
    def from_config(
        cls,
        config: TCFromConfigMixin,
    ) -> Self:
        """Instantiate directly from an already-validated config object."""
        return cls(config)

    @classmethod
    def get_config_class(cls) -> Type[TCFromConfigMixin]:
        """Return the config class bound to this subclass's Generic parameter.

        Introspects the parameterized base, e.g. for
        ``class Foo(FromConfigMixin[FooSettings])`` this returns ``FooSettings``.
        """
        return get_args(cls.__orig_bases__[0])[0]  # type: ignore

    @classmethod
    def from_yaml(
        cls,
        path: str,
        key: str | None = None,
    ) -> Self:
        """
        Load a runtime object from YAML.

        Args:
            path: Path to YAML config file
            key: Optional top-level YAML key (e.g. "retriever")

        Returns:
            Instantiated runtime object
        """
        yaml_path = Path(path)
        with yaml_path.open("r") as f:
            raw: Dict[str, Any] = yaml.safe_load(f)
        if key is not None:
            raw = raw[key]
        # Validate the raw mapping against the subclass's bound settings model.
        settings = cls.get_config_class().model_validate(raw)
        # settings.module_path names the runtime class; import it dynamically.
        runtime_cls = load_class(settings.module_path)
        return runtime_cls(settings)  # type: ignore[no-any-return]

    @classmethod
    def from_settings(cls) -> Self:
        """
        Load runtime object using Pydantic Settings resolution:
        init > yaml > env > dotenv > secrets
        """
        settings = cls.get_config_class()()  # type: ignore[call-arg]
        return load_class(settings.module_path)(settings)  # type: ignore[no-any-return]
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
from typing import Generic, List
|
|
2
|
+
|
|
3
|
+
from pydantic_settings import (
|
|
4
|
+
BaseSettings,
|
|
5
|
+
PydanticBaseSettingsSource,
|
|
6
|
+
SettingsConfigDict,
|
|
7
|
+
YamlConfigSettingsSource,
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
from evaluation_embedder.src.constants import (
|
|
11
|
+
CONFIG_PATH,
|
|
12
|
+
FAISSIndexType,
|
|
13
|
+
TCDataset,
|
|
14
|
+
TCEmbedder,
|
|
15
|
+
TCProcessor,
|
|
16
|
+
TCRetriever,
|
|
17
|
+
TCScore,
|
|
18
|
+
TCVectorStore,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class DatasetSettings(BaseSettings):
    """Base settings shared by all dataset loaders."""

    # Dotted import path of the dataset class to instantiate.
    module_path: str
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class ParquetDatasetSettings(DatasetSettings):
    """Settings for a dataset read from a local Parquet file."""

    # Filesystem path to the Parquet file.
    path: str
    # Whether to read lazily — presumably a polars LazyFrame; TODO confirm
    # against the dataset implementation.
    lazy: bool
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class MinioDatasetSettings(DatasetSettings):
    """Settings for a dataset stored in MinIO.

    Fields may be populated from environment variables prefixed ``MINIO_``
    (e.g. MINIO_ENDPOINT, MINIO_ACCESS_KEY).
    """

    # MinIO server endpoint.
    endpoint: str
    # Bucket containing the dataset object.
    bucket: str
    # Object key of the dataset inside the bucket.
    key: str
    access_key: str
    secret_key: str
    model_config = SettingsConfigDict(env_prefix='MINIO_', extra="ignore")
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class FromConfigMixinSettings(BaseSettings):
    """Base settings for FromConfigMixin runtimes, with YAML file support.

    Values are loaded from the YAML file at CONFIG_PATH in addition to the
    standard Pydantic Settings sources.
    """

    # Dotted import path of the runtime class to build from these settings.
    module_path: str
    model_config = SettingsConfigDict(
        yaml_file=CONFIG_PATH,
        extra="ignore",
    )

    @classmethod
    def settings_customise_sources(
        cls,
        settings_cls: type[BaseSettings],
        init_settings: PydanticBaseSettingsSource,
        env_settings: PydanticBaseSettingsSource,
        dotenv_settings: PydanticBaseSettingsSource,
        file_secret_settings: PydanticBaseSettingsSource,
    ) -> tuple[PydanticBaseSettingsSource, ...]:
        """Insert the YAML source so precedence is:
        init > yaml > env > dotenv > secrets."""
        return (
            init_settings,
            YamlConfigSettingsSource(settings_cls),
            env_settings,
            dotenv_settings,
            file_secret_settings,
        )
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class EmbedderSettings(FromConfigMixinSettings):
    """Base settings for embedding models."""

    # Name of the embedding model to use.
    model_name: str
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class VLLMEmbedderSettings(EmbedderSettings):
    """Settings for an embedder served by vLLM."""

    # Base URL of the vLLM embedding service.
    base_url: str
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class ProcessorSettings(FromConfigMixinSettings):
    """Base settings for processors; adds no fields beyond FromConfigMixinSettings."""

    pass
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class NomicProcessorSettings(ProcessorSettings):
    """Settings for the Nomic processor; currently no extra fields."""

    pass
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class ScoreSettings(BaseSettings):
    """Base settings for evaluation score metrics."""

    # Dotted import path of the score class to instantiate.
    module_path: str
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
class RecallAtKScoreSettings(ScoreSettings):
    """Settings for the recall@k metric."""

    # Cutoff rank k.
    k: int
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class PrecisionAtKScoreSettings(ScoreSettings):
    """Settings for the precision@k metric."""

    # Cutoff rank k.
    k: int
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class HitAtKScoreSettings(ScoreSettings):
    """Settings for the hit@k metric."""

    # Cutoff rank k.
    k: int
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
class MRRAtKScoreSettings(ScoreSettings):
    """Settings for the MRR@k (mean reciprocal rank) metric."""

    # Cutoff rank k.
    k: int
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class VectorStoreSettings(FromConfigMixinSettings):
    """Base settings for vector stores; adds no fields beyond FromConfigMixinSettings."""

    pass
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
class QdrantVectorStoreSettings(VectorStoreSettings):
    """Settings for the Qdrant-backed vector store."""

    # URL of the Qdrant server.
    url: str
    # Name of the Qdrant collection to query.
    collection_name: str
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
class FaissVectorStoreSettings(VectorStoreSettings):
    """Settings for the in-memory FAISS vector store."""

    # Which flat FAISS index to build (inner-product or L2).
    index_type: FAISSIndexType
    # Whether to L2-normalize embeddings before adding/querying.
    normalize: bool
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
class RetrieverSettings(FromConfigMixinSettings, Generic[TCEmbedder, TCVectorStore, TCProcessor]):
    """Composite settings for a retriever: embedder + vector store + processor.

    The generic parameters bind the concrete settings models for each component.
    """

    embedder: TCEmbedder
    vector_store: TCVectorStore
    processor: TCProcessor
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
class EvaluatorSettings(FromConfigMixinSettings, Generic[TCDataset, TCRetriever, TCScore]):
    """Composite settings for an evaluation run: dataset + retriever + scores.

    The generic parameters bind the concrete settings models for each component.
    """

    dataset: TCDataset
    retriever: TCRetriever
    # One or more score-metric configurations to compute.
    scores: List[TCScore]
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
class QdrantEvaluatorSettings(
    EvaluatorSettings[
        MinioDatasetSettings,
        RetrieverSettings[VLLMEmbedderSettings, QdrantVectorStoreSettings, NomicProcessorSettings],
        RecallAtKScoreSettings,
    ]
):
    """Evaluator wired for: MinIO dataset, vLLM embedder + Qdrant store +
    Nomic processor retriever, recall@k scoring."""

    pass
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
class FaissEvaluatorSettings(
    EvaluatorSettings[
        MinioDatasetSettings,
        RetrieverSettings[VLLMEmbedderSettings, FaissVectorStoreSettings, NomicProcessorSettings],
        RecallAtKScoreSettings,
    ]
):
    """Evaluator wired for: MinIO dataset, vLLM embedder + FAISS store +
    Nomic processor retriever, recall@k scoring."""

    pass
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "evaluation-embedder"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = ""
|
|
5
|
+
authors = [
|
|
6
|
+
{name = "jalal",email = "jalalkhaldi3@gmail.com"}
|
|
7
|
+
]
|
|
8
|
+
readme = "README.md"
|
|
9
|
+
requires-python = "3.12.9"
|
|
10
|
+
dependencies = [
|
|
11
|
+
"polars (>=1.37.1,<2.0.0)",
|
|
12
|
+
"minio (>=7.2.20,<8.0.0)",
|
|
13
|
+
"langchain (>=1.2.3,<2.0.0)",
|
|
14
|
+
"pydantic-settings (>=2.12.0,<3.0.0)",
|
|
15
|
+
"openai (>=2.15.0,<3.0.0)",
|
|
16
|
+
"numpy (>=2.4.1,<3.0.0)",
|
|
17
|
+
"qdrant-client (>=1.16.2,<2.0.0)",
|
|
18
|
+
"faiss-cpu (>=1.13.2,<2.0.0)",
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
[dependency-groups]
|
|
23
|
+
notebook = [
|
|
24
|
+
"jupyter (>=1.1.1,<2.0.0)",
|
|
25
|
+
"mypy (>=1.19.0,<2.0.0)",
|
|
26
|
+
"pre-commit (>=4.5.0,<5.0.0)",
|
|
27
|
+
"types-pyyaml (>=6.0.12.20250915,<7.0.0.0)",
|
|
28
|
+
"torch (>=2.9.0,<3.0.0)"
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
[build-system]
|
|
34
|
+
requires = ["poetry-core>=2.0.0,<3.0.0"]
|
|
35
|
+
build-backend = "poetry.core.masonry.api"
|