evaluation-embedder 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,19 @@
1
+ Metadata-Version: 2.4
2
+ Name: evaluation-embedder
3
+ Version: 0.1.0
4
+ Summary:
5
+ Author: jalal
6
+ Author-email: jalalkhaldi3@gmail.com
7
+ Requires-Python: >=3.12,<3.13
8
+ Classifier: Programming Language :: Python :: 3
9
+ Requires-Dist: faiss-cpu (>=1.13.2,<2.0.0)
10
+ Requires-Dist: langchain (>=1.2.3,<2.0.0)
11
+ Requires-Dist: minio (>=7.2.20,<8.0.0)
12
+ Requires-Dist: numpy (>=2.4.1,<3.0.0)
13
+ Requires-Dist: openai (>=2.15.0,<3.0.0)
14
+ Requires-Dist: polars (>=1.37.1,<2.0.0)
15
+ Requires-Dist: pydantic-settings (>=2.12.0,<3.0.0)
16
+ Requires-Dist: qdrant-client (>=1.16.2,<2.0.0)
17
+ Description-Content-Type: text/markdown
18
+
19
+
File without changes
@@ -0,0 +1,38 @@
1
+ from typing import TYPE_CHECKING, Literal, TypeAlias, TypeVar
2
+
3
+ from polars import Enum
4
+
5
+ if TYPE_CHECKING:
6
+ from evaluation_embedder.src.settings import (
7
+ DatasetSettings,
8
+ EmbedderSettings,
9
+ EvaluatorSettings,
10
+ FromConfigMixinSettings,
11
+ ProcessorSettings,
12
+ RetrieverSettings,
13
+ ScoreSettings,
14
+ VectorStoreSettings,
15
+ )
16
+
17
+ CONFIG_PATH = "/config/config.yaml"
18
+
19
+ TDataset = TypeVar("TDataset")
20
+ ParquetCompression: TypeAlias = Literal["lz4", "uncompressed", "snappy", "gzip", "brotli", "zstd"]
21
+ TCFromConfigMixin = TypeVar("TCFromConfigMixin", bound="FromConfigMixinSettings")
22
+ TCDataset = TypeVar("TCDataset", bound="DatasetSettings")
23
+ TCEmbedder = TypeVar("TCEmbedder", bound="EmbedderSettings")
24
+ TCEvaluator = TypeVar("TCEvaluator", bound="EvaluatorSettings")
25
+ TCProcessor = TypeVar("TCProcessor", bound="ProcessorSettings")
26
+ TCRetriever = TypeVar("TCRetriever", bound="RetrieverSettings")
27
+ TCScore = TypeVar("TCScore", bound="ScoreSettings")
28
+ TCVectorStore = TypeVar("TCVectorStore", bound="VectorStoreSettings")
29
+
30
+
31
+ class EmbeddingPurposeEnum(Enum):
32
+ DOCUMENT = "document"
33
+ QUERY = "query"
34
+
35
+
36
+ class FAISSIndexType(Enum):
37
+ FLAT_IP = "flat_ip"
38
+ FLAT_L2 = "flat_l2"
@@ -0,0 +1,289 @@
1
+ import io
2
+ import logging
3
+ from abc import ABC, abstractmethod
4
+ from pathlib import Path
5
+ from typing import (
6
+ Any,
7
+ Dict,
8
+ Generic,
9
+ Iterable,
10
+ Iterator,
11
+ List,
12
+ Literal,
13
+ Optional,
14
+ Self,
15
+ Tuple,
16
+ Union,
17
+ cast,
18
+ overload,
19
+ )
20
+
21
+ import polars as pl
22
+ from langchain_core.documents import Document
23
+ from minio import Minio
24
+ from polars._typing import ColumnNameOrSelector, IntoExpr, IntoExprColumn
25
+
26
+ from evaluation_embedder.src.constants import ParquetCompression, TDataset
27
+ from evaluation_embedder.src.settings import (
28
+ MinioDatasetSettings,
29
+ ParquetDatasetSettings,
30
+ )
31
+
32
+ _logger = logging.getLogger(__name__)
33
+
34
+
35
class Dataset(ABC, Generic[TDataset]):
    """Abstract tabular dataset wrapping a backend `service` object.

    Provides cached eager/lazy polars views and alternate constructors for
    parquet files and MinIO-hosted parquet objects. Subclasses implement the
    polars conversions.
    """

    def __init__(self, service: TDataset):
        super().__init__()
        self.service = service
        # Lazily-populated caches behind the `polars` / `lazy_polars` properties.
        self._polars: Optional[pl.DataFrame] = None
        self._lazy_polars: Optional[pl.LazyFrame] = None

    @classmethod
    @abstractmethod
    def from_polars(cls, df: Union[pl.DataFrame, pl.LazyFrame]) -> "Dataset[TDataset]":
        """Build a dataset from an eager or lazy polars frame."""
        raise NotImplementedError

    @abstractmethod
    def to_polars(self) -> pl.DataFrame:
        """Materialize the dataset as an eager DataFrame."""
        raise NotImplementedError

    @abstractmethod
    def to_lazy_polars(self) -> pl.LazyFrame:
        """Expose the dataset as a LazyFrame."""
        raise NotImplementedError

    @classmethod
    def from_parquet(
        cls,
        path: Union[str, Path],
        *,
        lazy: bool = True,
    ) -> "Dataset[TDataset]":
        """Load from a parquet file; `lazy=True` defers I/O via scan_parquet."""
        df: Union[pl.DataFrame, pl.LazyFrame]
        if lazy:
            df = pl.scan_parquet(path)
        else:
            df = pl.read_parquet(path)

        return cls.from_polars(df)

    @classmethod
    def from_minio(
        cls,
        *,
        bucket: str,
        key: str,
        endpoint_url: str,
        access_key: str,
        secret_key: str,
    ) -> "Dataset[TDataset]":
        """Download a parquet object from MinIO and load it eagerly.

        NOTE: the whole object is buffered in memory before parsing.
        """
        client = Minio(
            # Minio() wants a bare host[:port]; strip the URL scheme.
            endpoint_url.replace("http://", "").replace("https://", ""),
            access_key=access_key,
            secret_key=secret_key,
            secure=endpoint_url.startswith("https://"),
        )
        response = client.get_object(bucket, key)
        try:
            buffer = io.BytesIO(response.read())
        finally:
            # Always release the HTTP connection back to the pool.
            response.close()
            response.release_conn()

        df = pl.read_parquet(buffer)
        return cls.from_polars(df)

    @property
    def polars(self) -> pl.DataFrame:
        """Eager DataFrame view, computed once and cached."""
        if self._polars is None:
            self._polars = self.to_polars()
        return self._polars

    @property
    def lazy_polars(self) -> pl.LazyFrame:
        """Lazy view, computed once and cached."""
        if self._lazy_polars is None:
            self._lazy_polars = self.to_lazy_polars()
        return self._lazy_polars

    @property
    def polars_shape(self) -> Tuple[int, int]:
        # (rows, columns); forces materialization via `polars`.
        return self.polars.shape

    @classmethod
    def from_documents(cls, docs: List[Document]) -> "TextDataset[TDataset]":
        """Build a page_content/metadata dataset from langchain Documents.

        NOTE(review): declared on the base class but typed as returning a
        TextDataset — it relies on being called on a TextDataset subclass.
        """
        df = pl.DataFrame(
            {
                "page_content": [d.page_content for d in docs],
                "metadata": [d.metadata or {} for d in docs],
            }
        )
        return cast(TextDataset[TDataset], cls.from_polars(df))

    def with_columns(
        self,
        *exprs: IntoExpr | Iterable[IntoExpr],
        **named_exprs: IntoExpr,
    ) -> "Dataset[TDataset]":
        """Return a new dataset with columns added/replaced (polars semantics)."""
        df = self.polars.with_columns(*exprs, **named_exprs)
        return self.__class__.from_polars(df)

    def filter(
        self,
        *predicates: (IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool]),
        **constraints: Any,
    ) -> "Dataset[TDataset]":
        """Return a new dataset with rows filtered (polars semantics)."""
        return self.__class__.from_polars(self.polars.filter(*predicates, **constraints))

    def drop(
        self,
        *columns: ColumnNameOrSelector | Iterable[ColumnNameOrSelector],
        strict: bool = True,
    ) -> "Dataset[TDataset]":
        """Return a new dataset with the given columns removed."""
        return self.__class__.from_polars(self.polars.drop(*columns, strict=strict))

    def __len__(self) -> int:
        return self.polars_shape[0]

    @overload
    def iter_rows(self, *, named: Literal[False] = ..., buffer_size: int = ...) -> Iterator[Tuple[Any, ...]]: ...

    @overload
    def iter_rows(self, *, named: Literal[True], buffer_size: int = ...) -> Iterator[Dict[str, Any]]: ...

    def iter_rows(
        self, *, named: Literal[False, True] = False, buffer_size: int = 512
    ) -> Iterator[Tuple[Any, ...]] | Iterator[Dict[str, Any]]:
        """Iterate rows as tuples (default) or dicts (`named=True`).

        NOTE(review): `collect(streaming=True)` is deprecated in recent polars
        in favour of `collect(engine="streaming")` — confirm against the pinned
        polars version.
        """
        df_stream = self.lazy_polars.collect(streaming=True)  # type: ignore[call-overload]
        return df_stream.iter_rows(named=named, buffer_size=buffer_size)  # type: ignore[no-any-return]
158
+
159
+
160
class TextDataset(Dataset[TDataset], Generic[TDataset]):
    """Dataset of text chunks with a fixed schema: page_content + metadata."""

    # Schema contract enforced at construction time.
    REQUIRED_COLUMNS = {"page_content", "metadata"}

    def __init__(self, service: TDataset):
        super().__init__(service)
        self._validate_schema()

    def _validate_schema(self) -> None:
        """Raise ValueError if any required column is missing."""
        df = self.polars
        missing = self.REQUIRED_COLUMNS - set(df.columns)
        if missing:
            raise ValueError(
                f"{self.__class__.__name__} requires columns {self.REQUIRED_COLUMNS}, " f"but missing {missing}"
            )

    def iter_documents(self) -> Iterator[Document]:
        """Yield each row as a langchain Document."""
        for row in self.polars.iter_rows(named=True):
            yield Document(
                page_content=row["page_content"],
                metadata=row["metadata"],
            )

    def dump_documents(
        self,
        out_dir: str,
        prefix: str = "doc",
        ext: str = ".md",
        encoding: str = "utf-8",
    ) -> None:
        """Write each row's page_content to `out_dir/<prefix>_<i>{ext}`."""
        out_dir_path = Path(out_dir)
        out_dir_path.mkdir(parents=True, exist_ok=True)
        for i, row in enumerate(self.polars.iter_rows(named=True)):
            # Zero-padded index keeps lexicographic == insertion order.
            path = out_dir_path / f"{prefix}_{i:05d}{ext}"
            path.write_text(row["page_content"], encoding=encoding)

    def to_langchain_documents(
        self,
    ) -> list[Document]:
        """Materialize all rows as langchain Documents (eager)."""
        docs: list[Document] = []
        for row in self.polars.iter_rows(named=True):
            docs.append(
                Document(
                    page_content=row["page_content"],
                    metadata=row["metadata"],
                )
            )
        return docs

    def to_minio(
        self,
        *,
        bucket: str,
        key: str,
        endpoint_url: str,
        access_key: str,
        secret_key: str,
        compression: ParquetCompression = "zstd",
        row_group_size: int = 100_000,
    ) -> None:
        """Serialize to parquet in memory and upload to a MinIO bucket."""
        client = Minio(
            # Minio() wants a bare host[:port]; strip the URL scheme.
            endpoint_url.replace("http://", "").replace("https://", ""),
            access_key=access_key,
            secret_key=secret_key,
            secure=endpoint_url.startswith("https://"),
        )
        buffer = io.BytesIO()
        self.polars.write_parquet(
            buffer,
            compression=compression,
            row_group_size=row_group_size,
        )
        # Rewind so put_object streams from the start of the buffer.
        buffer.seek(0)
        client.put_object(
            bucket_name=bucket,
            object_name=key,
            data=buffer,
            length=buffer.getbuffer().nbytes,
            content_type="application/octet-stream",
        )

    @overload
    @classmethod
    def from_config(
        cls,
        config: ParquetDatasetSettings,
    ) -> Self: ...

    @overload
    @classmethod
    def from_config(
        cls,
        config: MinioDatasetSettings,
    ) -> Self: ...

    @classmethod
    def from_config(
        cls,
        config: Union[ParquetDatasetSettings, MinioDatasetSettings],
    ) -> Self:
        """Construct from a dataset settings object (parquet or MinIO source)."""
        if isinstance(config, ParquetDatasetSettings):
            df = pl.scan_parquet(config.path) if config.lazy else pl.read_parquet(config.path)
            return cast(Self, cls.from_polars(df))
        if isinstance(config, MinioDatasetSettings):
            return cast(
                Self,
                cls.from_minio(
                    bucket=config.bucket,
                    key=config.key,
                    endpoint_url=config.endpoint,
                    access_key=config.access_key,
                    secret_key=config.secret_key,
                ),
            )

        raise TypeError(f"Unsupported dataset config: {type(config).__name__}")

    @classmethod
    def from_records(
        cls,
        records: List[Tuple[str, Dict[str, Any]]],
    ) -> "TextDataset[TDataset]":
        """Build from (page_content, metadata) tuples; rejects empty input."""
        if not records:
            raise ValueError("records must be non-empty")
        df = pl.DataFrame(
            {
                "page_content": [text for text, _ in records],
                "metadata": [meta for _, meta in records],
            }
        )
        return cast("TextDataset[TDataset]", cls.from_polars(df))
@@ -0,0 +1,26 @@
1
+ from typing import Union, cast
2
+
3
+ import polars as pl
4
+
5
+ from evaluation_embedder.src.constants import TDataset
6
+ from evaluation_embedder.src.datasets import TextDataset
7
+
8
+
9
class PolarsTextDataset(TextDataset[Union[pl.DataFrame, pl.LazyFrame]]):
    """TextDataset backed directly by a polars DataFrame or LazyFrame."""

    def __init__(self, service: Union[pl.DataFrame, pl.LazyFrame]):
        super().__init__(service)

    @classmethod
    def from_polars(cls, df: Union[pl.DataFrame, pl.LazyFrame]) -> "TextDataset[TDataset]":
        # Wrap the frame as-is; its laziness is preserved.
        return cast("TextDataset[TDataset]", cls(df))

    def to_polars(self) -> pl.DataFrame:
        """Materialize as an eager DataFrame (collects when lazy)."""
        frame = self.service
        return frame.collect() if isinstance(frame, pl.LazyFrame) else frame

    def to_lazy_polars(self) -> pl.LazyFrame:
        """Return a lazy view (no-op when already lazy)."""
        frame = self.service
        return frame if isinstance(frame, pl.LazyFrame) else frame.lazy()
@@ -0,0 +1,175 @@
1
+ import asyncio
2
+ import logging
3
+ from abc import ABC, abstractmethod
4
+ from typing import Any, Generic, List, Optional, cast
5
+
6
+ from langchain_core.documents import (
7
+ Document, # or: from langchain.schema import Document
8
+ )
9
+ from pydantic import BaseModel, Field
10
+
11
+ from evaluation_embedder.src.constants import (
12
+ EmbeddingPurposeEnum,
13
+ TCEmbedder,
14
+ TCEvaluator,
15
+ TCProcessor,
16
+ TCRetriever,
17
+ TCScore,
18
+ TCVectorStore,
19
+ )
20
+ from evaluation_embedder.src.datasets import TextDataset
21
+ from evaluation_embedder.src.mixins import FromConfigMixin
22
+ from evaluation_embedder.src.utils import load_class
23
+
24
+ _logger = logging.getLogger(__name__)
25
+
26
+
27
class Score(ABC, Generic[TCScore]):
    """Base class for retrieval metrics computed from an ordered hit list."""

    class ScoreResult(BaseModel):
        # Metric label (e.g. "Recall@5") and its computed value.
        name: str
        value: float

    def __init__(self, config: TCScore) -> None:
        self.config = config
        _logger.info(f"Initialized Score | class={self.__class__.__name__} | config={config}")

    @abstractmethod
    def __call__(self, hits: List[bool]) -> ScoreResult:
        """Compute the metric from `hits`, ordered best-match first."""
        raise NotImplementedError()
40
+
41
+
42
class Processor(FromConfigMixin[TCProcessor], ABC, Generic[TCProcessor]):
    """Transforms raw text before embedding (e.g. model-specific prefixes)."""

    def __init__(self, config: TCProcessor) -> None:
        self.config = config

    @abstractmethod
    def __call__(self, text: str, purpose: EmbeddingPurposeEnum) -> str:
        """Return `text` prepared for the given embedding purpose."""
        raise NotImplementedError()
50
+
51
+
52
class Embedder(FromConfigMixin[TCEmbedder], ABC, Generic[TCEmbedder]):
    """Async embedding interface with optional purpose-aware text processing."""

    def __init__(self, config: TCEmbedder) -> None:
        self.config = config
        _logger.info(f"Initialized Embedder | class={self.__class__.__name__} | config={config}")

    @abstractmethod
    async def _aembed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed already-processed texts; implementations must preserve order."""
        raise NotImplementedError()

    def process_query(self, text: str, processor: Optional[Processor[Any]]) -> str:
        """Apply the processor with QUERY purpose; no-op when processor is None."""
        if processor:
            return processor(text=text, purpose=EmbeddingPurposeEnum.QUERY)  # type: ignore[arg-type]
        return text

    def process_document(self, text: str, processor: Optional[Processor[Any]]) -> str:
        """Apply the processor with DOCUMENT purpose; no-op when processor is None."""
        if processor:
            return processor(text=text, purpose=EmbeddingPurposeEnum.DOCUMENT)  # type: ignore[arg-type]
        return text

    async def aembed_documents(
        self,
        texts: List[str],
        processor: Optional[Processor[Any]] = None,
    ) -> List[List[float]]:
        """Process (optionally) then embed a batch of documents."""
        docs = [self.process_document(t, processor) for t in texts]
        return await self._aembed_documents(docs)

    async def aembed_query(self, text: str, processor: Optional[Processor[Any]] = None) -> List[float]:
        """Process (optionally) then embed a single query string."""
        # Processing happens here with QUERY purpose; aembed_documents is then
        # called without a processor so the text is not re-processed as a document.
        text = self.process_query(text, processor)
        return (await self.aembed_documents([text]))[0]
83
+
84
+
85
class VectorStore(FromConfigMixin[TCVectorStore], ABC, Generic[TCVectorStore]):
    """Query interface over an indexed collection of embedded documents."""

    class ScoredPoint(BaseModel):
        score: float = Field(..., description="Points vector distance to the query vector")
        document: Document

    class QueryResponse(BaseModel):
        # Points are returned best-match first by implementations.
        points: List["VectorStore.ScoredPoint"]

    def __init__(self, config: TCVectorStore) -> None:
        self.config = config
        _logger.info(f"Initialized VectorStore | class={self.__class__.__name__} | config={config}")

    @abstractmethod
    def query_points(self, query: List[float], *, limit: int) -> QueryResponse:
        """Return up to `limit` nearest points for the query embedding."""
        raise NotImplementedError()
100
+
101
+
102
class Retriever(FromConfigMixin[TCRetriever], ABC, Generic[TCRetriever]):
    """Wires an Embedder, VectorStore and Processor from config; retrieves top-k."""

    def __init__(self, config: TCRetriever) -> None:
        self.config = config

        _logger.info(f"Initializing Retriever | class={self.__class__.__name__}")

        # Each component class is resolved dynamically from its module_path.
        self.embedder: Embedder[Any] = load_class(self.config.embedder.module_path)(self.config.embedder)

        self.vector_store: VectorStore[Any] = load_class(self.config.vector_store.module_path)(self.config.vector_store)

        self.processor: Processor[Any] = load_class(self.config.processor.module_path)(self.config.processor)

        _logger.info(
            f"Retriever initialized | embedder={type(self.embedder).__name__} | "
            f"vector_store={type(self.vector_store).__name__}"
        )

    async def retrieve(self, query: str, *, limit: int) -> VectorStore.QueryResponse:
        """Embed `query` (via the configured processor) and fetch `limit` points."""
        query_embedding = await self.embedder.aembed_query(query, self.processor)
        return self.vector_store.query_points(
            query=query_embedding,
            limit=limit,
        )
126
+
127
+
128
class Evaluator(FromConfigMixin[TCEvaluator], ABC, Generic[TCEvaluator]):
    """Runs a retrieval evaluation: for each query in the dataset, retrieve
    the top-k documents and compute every configured score on the hit list."""

    # Emit a progress log line every N processed queries.
    NB_LOGS_PER_QUERIES = 100

    def __init__(self, config: TCEvaluator) -> None:
        self.config = config

        _logger.info(f"Initializing Evaluator | class={self.__class__.__name__}")

        self.dataset: TextDataset[Any] = self._load_dataset()
        self.retriever: Retriever[Any] = load_class(self.config.retriever.module_path)(self.config.retriever)
        self.scores: List[Score[Any]] = [load_class(s.module_path)(s) for s in self.config.scores]

        _logger.info(
            f"Evaluator ready | dataset_size={len(self.dataset)} | " f"scores={[type(s).__name__ for s in self.scores]}"
        )

    def _load_dataset(self) -> TextDataset[Any]:
        """Instantiate the dataset class named in the config and load it."""
        dataset_cls = load_class(self.config.dataset.module_path)
        _logger.info(
            f"Instantiating dataset | class={dataset_cls.__name__} | " f"module_path={self.config.dataset.module_path}"
        )
        dataset = cast(
            TextDataset[Any],
            dataset_cls.from_config(self.config.dataset),
        )
        _logger.info(f"Dataset loaded | type={type(dataset).__name__} | " f"size={len(dataset)}")
        return dataset

    async def eval(self) -> List[List[Score.ScoreResult]]:
        """Evaluate every query and return one list of ScoreResults per query.

        Raises:
            ValueError: if no scores are configured.
        """
        # Guard: max() below raises an opaque ValueError on an empty sequence.
        if not self.scores:
            raise ValueError("Evaluator requires at least one configured score")
        scores_all: List[List[Score.ScoreResult]] = []
        # Retrieve once with the largest k any score needs; each score then
        # truncates the hit list to its own k.
        max_k = max(getattr(score.config, "k", 0) for score in self.scores)
        _logger.info(f"Starting evaluation | max_k={max_k} | num_queries={len(self.dataset)}")
        for idx, sample in enumerate(self.dataset.iter_rows(named=True), start=1):
            query = sample["metadata"]["query"]
            # The stored document text must be processed the same way it was at
            # indexing time, otherwise exact-match hit detection below fails.
            page_content = self.retriever.processor(sample["page_content"], purpose=EmbeddingPurposeEnum.DOCUMENT)  # type: ignore[arg-type]
            response = await self.retriever.retrieve(
                query,
                limit=max_k,
            )
            hits = [p.document.page_content == page_content for p in response.points]
            # (Previously `scores = []` was assigned and immediately clobbered.)
            scores_all.append([s(hits) for s in self.scores])
            if idx % self.__class__.NB_LOGS_PER_QUERIES == 0 or idx == len(self.dataset):
                _logger.info(f"Eval progress | processed={idx}/{len(self.dataset)}")
        _logger.info("Evaluation completed")
        return scores_all
@@ -0,0 +1,27 @@
1
+ from typing import List
2
+
3
+ from openai import AsyncOpenAI
4
+
5
+ from evaluation_embedder.src.evaluation import Embedder
6
+ from evaluation_embedder.src.settings import VLLMEmbedderSettings
7
+
8
+
9
class VLLMEmbedder(Embedder[VLLMEmbedderSettings]):
    """Embedder backed by a vLLM server's OpenAI-compatible embeddings endpoint."""

    def __init__(self, config: VLLMEmbedderSettings):
        super().__init__(config)
        self.client = AsyncOpenAI(
            base_url=self.config.base_url,
            api_key="",  # vLLM does not require a key
        )

    async def _aembed_documents(self, texts: List[str]) -> List[List[float]]:
        """
        Async embedding via vLLM OpenAI-compatible API.
        """
        response = await self.client.embeddings.create(
            model=self.config.model_name,
            input=texts,
        )

        # Preserve order
        # NOTE(review): assumes response.data is already in request order; the
        # OpenAI API carries an `index` field per item — confirm for vLLM.
        return [item.embedding for item in response.data]
@@ -0,0 +1,50 @@
1
+ from typing import List, cast
2
+
3
+ import numpy as np
4
+ from langchain_core.documents import Document
5
+
6
+ from evaluation_embedder.src.datasets.polars import PolarsTextDataset
7
+ from evaluation_embedder.src.evaluation import Evaluator
8
+ from evaluation_embedder.src.evaluation.vector_stores import FaissVectorStore
9
+ from evaluation_embedder.src.settings import (
10
+ FaissEvaluatorSettings,
11
+ QdrantEvaluatorSettings,
12
+ )
13
+
14
+
15
class QdrantEvaluator(Evaluator[QdrantEvaluatorSettings]):
    """Evaluator whose retriever queries an already-populated Qdrant collection."""

    def __init__(self, config: QdrantEvaluatorSettings):
        super().__init__(config)
18
+
19
+
20
class FaissEvaluator(Evaluator[FaissEvaluatorSettings]):
    """Evaluator that builds an in-memory FAISS index from its own dataset."""

    def __init__(self, config: FaissEvaluatorSettings):
        super().__init__(config)

    @classmethod
    async def create(cls, config: FaissEvaluatorSettings) -> "FaissEvaluator":
        """Async factory: construct, embed the corpus, and populate the index.

        Exists because embedding is async and __init__ cannot await.
        """
        self = cls(config)

        docs = self.get_docs()
        texts = [d.page_content for d in docs]

        embeddings = np.asarray(
            await self.retriever.embedder.aembed_documents(texts),
            dtype="float32",
        )

        # The config is expected to pair this evaluator with a FaissVectorStore;
        # the cast makes that assumption explicit.
        vector_store = cast(FaissVectorStore, self.retriever.vector_store)
        vector_store.index = vector_store.build_faiss_index(embeddings.shape[-1])
        vector_store.add_documents(docs, embeddings)

        return self

    def get_docs(self) -> List[Document]:
        """Return one Document per unique metadata["doc_id"] (first occurrence wins)."""
        docs_idx = []
        seen = set()
        for i, row in enumerate(self.dataset.iter_rows(named=True)):
            doc_id = row["metadata"]["doc_id"]
            if doc_id not in seen:
                seen.add(doc_id)
                docs_idx.append(i)
        return PolarsTextDataset(self.dataset.polars[docs_idx]).to_langchain_documents()
@@ -0,0 +1,16 @@
1
+ from evaluation_embedder.src.constants import EmbeddingPurposeEnum
2
+ from evaluation_embedder.src.evaluation import Processor
3
+ from evaluation_embedder.src.settings import NomicProcessorSettings
4
+
5
+
6
class NomicProcessor(Processor[NomicProcessorSettings]):
    """Prepends the task prefix expected by Nomic embedding models."""

    # Task-instruction prefix per embedding purpose.
    _PREFIXES = {
        EmbeddingPurposeEnum.DOCUMENT: "search_document: ",
        EmbeddingPurposeEnum.QUERY: "search_query: ",
    }

    def __init__(self, config: NomicProcessorSettings) -> None:
        super().__init__(config)

    def __call__(self, text: str, purpose: EmbeddingPurposeEnum) -> str:
        prefix = self._PREFIXES.get(purpose)
        if prefix is None:
            raise ValueError(f"Unsupported embedding purpose {purpose}")
        return f"{prefix}{text}"
@@ -0,0 +1,26 @@
1
+ from evaluation_embedder.src.evaluation import Retriever
2
+ from evaluation_embedder.src.settings import (
3
+ FaissVectorStoreSettings,
4
+ NomicProcessorSettings,
5
+ QdrantVectorStoreSettings,
6
+ RetrieverSettings,
7
+ VLLMEmbedderSettings,
8
+ )
9
+
10
+
11
class VLLMFAISSRetriever(
    Retriever[RetrieverSettings[VLLMEmbedderSettings, FaissVectorStoreSettings, NomicProcessorSettings]]
):
    """Retriever wiring a vLLM embedder to a FAISS vector store with Nomic prefixes."""

    def __init__(
        self, config: RetrieverSettings[VLLMEmbedderSettings, FaissVectorStoreSettings, NomicProcessorSettings]
    ):
        super().__init__(config)
18
+
19
+
20
class VLLMQdrantRetriever(
    Retriever[RetrieverSettings[VLLMEmbedderSettings, QdrantVectorStoreSettings, NomicProcessorSettings]]
):
    """Retriever wiring a vLLM embedder to a Qdrant vector store with Nomic prefixes."""

    def __init__(
        self, config: RetrieverSettings[VLLMEmbedderSettings, QdrantVectorStoreSettings, NomicProcessorSettings]
    ):
        super().__init__(config)
@@ -0,0 +1,59 @@
1
+ from typing import List
2
+
3
+ from evaluation_embedder.src.evaluation import Score
4
+ from evaluation_embedder.src.settings import (
5
+ HitAtKScoreSettings,
6
+ MRRAtKScoreSettings,
7
+ PrecisionAtKScoreSettings,
8
+ RecallAtKScoreSettings,
9
+ )
10
+
11
+
12
class RecallAtK(Score[RecallAtKScoreSettings]):
    """Score labelled Recall@k over the truncated top-k hit list."""

    def __call__(self, hits: List[bool]) -> Score.ScoreResult:
        hits_k = hits[: self.config.k]
        # NOTE(review): sum/len is the hit-fraction among *returned* results,
        # which equals precision when exactly k results come back; textbook
        # recall divides by the number of relevant documents — confirm this is
        # the intended definition.
        value = sum(hits_k) / len(hits_k) if hits_k else 0.0

        return Score.ScoreResult(
            name=f"Recall@{self.config.k}",
            value=value,
        )
22
+
23
+
24
class PrecisionAtK(Score[PrecisionAtKScoreSettings]):
    """Precision@k: fraction of the first k retrieved items that are hits."""

    def __call__(self, hits: List[bool]) -> Score.ScoreResult:
        top_k = hits[: self.config.k]
        # Denominator is k (not len(top_k)): missing results count as misses.
        fraction = sum(top_k) / self.config.k
        return Score.ScoreResult(
            name=f"Precision@{self.config.k}",
            value=fraction,
        )
32
+
33
+
34
class HitAtK(Score[HitAtKScoreSettings]):
    """Hit@k: 1.0 if any of the first k retrieved items is a hit, else 0.0."""

    def __call__(self, hits: List[bool]) -> Score.ScoreResult:
        found = any(hits[: self.config.k])
        return Score.ScoreResult(
            name=f"Hit@{self.config.k}",
            value=1.0 if found else 0.0,
        )
44
+
45
+
46
class MRRAtK(Score[MRRAtKScoreSettings]):
    """MRR@k: reciprocal rank of the first hit within the top k (0.0 if none)."""

    def __call__(self, hits: List[bool]) -> Score.ScoreResult:
        reciprocal = next(
            (1.0 / rank for rank, is_hit in enumerate(hits[: self.config.k], start=1) if is_hit),
            0.0,
        )
        return Score.ScoreResult(
            name=f"MRR@{self.config.k}",
            value=reciprocal,
        )
@@ -0,0 +1,115 @@
1
+ from typing import List, Optional
2
+
3
+ import faiss # type:ignore[import-untyped]
4
+ import numpy as np
5
+ from langchain_core.documents import Document
6
+ from qdrant_client import QdrantClient
7
+
8
+ from evaluation_embedder.src.constants import FAISSIndexType
9
+ from evaluation_embedder.src.evaluation import VectorStore
10
+ from evaluation_embedder.src.evaluation.vector_stores import VectorStore
11
+ from evaluation_embedder.src.settings import (
12
+ FaissVectorStoreSettings,
13
+ QdrantVectorStoreSettings,
14
+ )
15
+
16
+
17
class FaissVectorStore(VectorStore[FaissVectorStoreSettings]):
    """In-memory FAISS vector store; documents are kept in insertion order so
    index row ids map 1:1 onto `self.documents`."""

    def __init__(self, config: FaissVectorStoreSettings):
        super().__init__(config)
        self.documents: List[Document] = []  # parallel to row ids in the index
        self.index: Optional[faiss.Index] = None  # built lazily via build_faiss_index

    def build_faiss_index(self, dim: int) -> faiss.Index:
        """Create a flat index of dimension `dim` per the configured index type."""
        if self.config.index_type is FAISSIndexType.FLAT_IP:
            return faiss.IndexFlatIP(dim)
        return faiss.IndexFlatL2(dim)

    def add_documents(
        self,
        documents: List[Document],
        embeddings: np.ndarray,
    ) -> None:
        """Add documents with their embeddings (shape: [n, dim]).

        Raises:
            ValueError: if the index has not been built yet.
        """
        # BUGFIX: check the index *before* normalizing — faiss.normalize_L2
        # mutates the caller's array in place, so failing afterwards would
        # leave `embeddings` altered as a side effect.
        if self.index is None:
            raise ValueError("index should be created before adding documents")
        if self.config.normalize:
            faiss.normalize_L2(embeddings)
        self.index.add(embeddings.astype("float32"))
        self.documents.extend(documents)

    def query_points(
        self,
        query: List[float],
        *,
        limit: int,
    ) -> VectorStore.QueryResponse:
        """Return up to `limit` nearest documents (same contract as Qdrant).

        Raises:
            ValueError: if the index has not been built yet.
        """
        if self.index is None:
            raise ValueError("index should be created before querying points")

        query_vec = np.asarray(query, dtype="float32")[None, :]
        if self.config.normalize:
            faiss.normalize_L2(query_vec)
        scores, indices = self.index.search(query_vec, limit)

        points: List[VectorStore.ScoredPoint] = []
        for idx, score in zip(indices[0], scores[0]):
            # FAISS pads with -1 when fewer than `limit` vectors are indexed.
            if idx == -1:
                continue
            points.append(
                VectorStore.ScoredPoint(
                    score=float(score),
                    document=self.documents[idx],
                )
            )
        return VectorStore.QueryResponse(points=points)
74
+
75
+
76
class QdrantVectorStore(VectorStore[QdrantVectorStoreSettings]):
    """Vector store delegating similarity search to a remote Qdrant collection."""

    def __init__(self, config: QdrantVectorStoreSettings):
        super().__init__(config)
        self.client = QdrantClient(url=self.config.url)

    def query_points(self, query: List[float], *, limit: int) -> VectorStore.QueryResponse:
        """Query Qdrant and rebuild langchain Documents from point payloads.

        Raises:
            ValueError: if a returned point has no payload.
            KeyError: if a payload lacks the "page_content" key.
        """
        result = self.client.query_points(
            collection_name=self.config.collection_name,
            query=query,
            limit=limit,
        )
        points: List[VectorStore.ScoredPoint] = []
        for idx, point in enumerate(result.points):
            payload = point.payload
            # Fail loudly on malformed payloads rather than silently dropping points.
            if payload is None:
                raise ValueError(
                    f"Qdrant returned a point with no payload | "
                    f"collection={self.config.collection_name} | "
                    f"index={idx} | score={point.score}"
                )
            if "page_content" not in payload:
                raise KeyError(
                    f"Missing 'page_content' in payload | "
                    f"collection={self.config.collection_name} | "
                    f"keys={list(payload.keys())}"
                )
            page_content = payload["page_content"]
            # Copy so later mutation of the Document metadata can't alias the payload.
            metadata = dict(payload["metadata"])
            points.append(
                VectorStore.ScoredPoint(
                    score=point.score,
                    document=Document(
                        page_content=page_content,
                        metadata=metadata,
                    ),
                )
            )

        return VectorStore.QueryResponse(points=points)
@@ -0,0 +1,61 @@
1
+ from abc import ABC
2
+ from pathlib import Path
3
+ from typing import Any, Dict, Generic, Self, Type, get_args
4
+
5
+ import yaml
6
+
7
+ from evaluation_embedder.src.constants import TCFromConfigMixin
8
+ from evaluation_embedder.src.utils import load_class
9
+
10
+
11
class FromConfigMixin(ABC, Generic[TCFromConfigMixin]):
    """Mixin giving runtime classes config-driven constructors (YAML / env)."""

    def __init__(self, config: TCFromConfigMixin) -> None:
        super().__init__()
        self.config = config

    @classmethod
    def from_config(
        cls,
        config: TCFromConfigMixin,
    ) -> Self:
        """Construct directly from an already-validated settings object."""
        return cls(config)

    @classmethod
    def get_config_class(cls) -> Type[TCFromConfigMixin]:
        """Extract the concrete settings class from the generic base parameters.

        NOTE(review): assumes the settings type is the first argument of the
        first entry in __orig_bases__ — holds for direct subclasses like
        `class X(FromConfigMixin[XSettings])`; verify for deeper hierarchies.
        """
        return get_args(cls.__orig_bases__[0])[0]  # type: ignore

    @classmethod
    def from_yaml(
        cls,
        path: str,
        key: str | None = None,
    ) -> Self:
        """
        Load a runtime object from YAML.

        Args:
            path: Path to YAML config file
            key: Optional top-level YAML key (e.g. "retriever") to select a sub-mapping

        Returns:
            Instantiated runtime object (class named by the config's module_path)
        """
        yaml_path = Path(path)
        with yaml_path.open("r") as f:
            raw: Dict[str, Any] = yaml.safe_load(f)
        if key is not None:
            raw = raw[key]
        settings = cls.get_config_class().model_validate(raw)
        # The config itself names the concrete runtime class to instantiate.
        runtime_cls = load_class(settings.module_path)
        return runtime_cls(settings)  # type: ignore[no-any-return]

    @classmethod
    def from_settings(cls) -> Self:
        """
        Load runtime object using Pydantic Settings resolution:
        init > yaml > env > dotenv > secrets
        """
        settings = cls.get_config_class()()  # type: ignore[call-arg]
        return load_class(settings.module_path)(settings)  # type: ignore[no-any-return]
@@ -0,0 +1,144 @@
1
+ from typing import Generic, List
2
+
3
+ from pydantic_settings import (
4
+ BaseSettings,
5
+ PydanticBaseSettingsSource,
6
+ SettingsConfigDict,
7
+ YamlConfigSettingsSource,
8
+ )
9
+
10
+ from evaluation_embedder.src.constants import (
11
+ CONFIG_PATH,
12
+ FAISSIndexType,
13
+ TCDataset,
14
+ TCEmbedder,
15
+ TCProcessor,
16
+ TCRetriever,
17
+ TCScore,
18
+ TCVectorStore,
19
+ )
20
+
21
+
22
class DatasetSettings(BaseSettings):
    """Base dataset config; module_path names the Dataset class to load."""

    module_path: str


class ParquetDatasetSettings(DatasetSettings):
    """Local parquet source; `lazy` selects scan_parquet vs read_parquet."""

    path: str
    lazy: bool


class MinioDatasetSettings(DatasetSettings):
    """Parquet object in a MinIO bucket; fields resolve from MINIO_* env vars."""

    endpoint: str
    bucket: str
    key: str
    access_key: str
    secret_key: str
    model_config = SettingsConfigDict(env_prefix='MINIO_', extra="ignore")
38
+
39
+
40
class FromConfigMixinSettings(BaseSettings):
    """Settings base for FromConfigMixin runtime objects.

    Source precedence: init args > YAML (CONFIG_PATH) > env > dotenv > secrets.
    """

    module_path: str
    model_config = SettingsConfigDict(
        yaml_file=CONFIG_PATH,
        extra="ignore",
    )

    @classmethod
    def settings_customise_sources(
        cls,
        settings_cls: type[BaseSettings],
        init_settings: PydanticBaseSettingsSource,
        env_settings: PydanticBaseSettingsSource,
        dotenv_settings: PydanticBaseSettingsSource,
        file_secret_settings: PydanticBaseSettingsSource,
    ) -> tuple[PydanticBaseSettingsSource, ...]:
        # Insert the YAML source right after init args: explicit arguments
        # still win, but YAML values beat environment variables.
        return (
            init_settings,
            YamlConfigSettingsSource(settings_cls),
            env_settings,
            dotenv_settings,
            file_secret_settings,
        )
63
+
64
+
65
class EmbedderSettings(FromConfigMixinSettings):
    """Base embedder config; model_name identifies the embedding model."""

    model_name: str


class VLLMEmbedderSettings(EmbedderSettings):
    """Embedder served by a vLLM OpenAI-compatible endpoint."""

    base_url: str


class ProcessorSettings(FromConfigMixinSettings):
    pass


class NomicProcessorSettings(ProcessorSettings):
    pass


class ScoreSettings(BaseSettings):
    """Base score config; module_path names the Score class to load."""

    module_path: str


class RecallAtKScoreSettings(ScoreSettings):
    k: int


class PrecisionAtKScoreSettings(ScoreSettings):
    k: int


class HitAtKScoreSettings(ScoreSettings):
    k: int


class MRRAtKScoreSettings(ScoreSettings):
    k: int


class VectorStoreSettings(FromConfigMixinSettings):
    pass


class QdrantVectorStoreSettings(VectorStoreSettings):
    """Remote Qdrant collection to query."""

    url: str
    collection_name: str


class FaissVectorStoreSettings(VectorStoreSettings):
    """In-memory FAISS index configuration."""

    index_type: FAISSIndexType
    normalize: bool


class RetrieverSettings(FromConfigMixinSettings, Generic[TCEmbedder, TCVectorStore, TCProcessor]):
    """Composite config: one embedder, one vector store, one processor."""

    embedder: TCEmbedder
    vector_store: TCVectorStore
    processor: TCProcessor


class EvaluatorSettings(FromConfigMixinSettings, Generic[TCDataset, TCRetriever, TCScore]):
    """Composite config: dataset, retriever, and the scores to compute."""

    dataset: TCDataset
    retriever: TCRetriever
    scores: List[TCScore]


class QdrantEvaluatorSettings(
    EvaluatorSettings[
        MinioDatasetSettings,
        RetrieverSettings[VLLMEmbedderSettings, QdrantVectorStoreSettings, NomicProcessorSettings],
        RecallAtKScoreSettings,
    ]
):
    pass


class FaissEvaluatorSettings(
    EvaluatorSettings[
        MinioDatasetSettings,
        RetrieverSettings[VLLMEmbedderSettings, FaissVectorStoreSettings, NomicProcessorSettings],
        RecallAtKScoreSettings,
    ]
):
    pass
@@ -0,0 +1,8 @@
1
+ import importlib
2
+ from typing import Any
3
+
4
+
5
+ def load_class(path: str) -> Any:
6
+ module_path, class_name = path.rsplit(".", 1)
7
+ module = importlib.import_module(module_path)
8
+ return getattr(module, class_name)
@@ -0,0 +1,35 @@
1
+ [project]
2
+ name = "evaluation-embedder"
3
+ version = "0.1.0"
4
+ description = ""
5
+ authors = [
6
+ {name = "jalal",email = "jalalkhaldi3@gmail.com"}
7
+ ]
8
+ readme = "README.md"
9
+ requires-python = ">=3.12,<3.13"
10
+ dependencies = [
11
+ "polars (>=1.37.1,<2.0.0)",
12
+ "minio (>=7.2.20,<8.0.0)",
13
+ "langchain (>=1.2.3,<2.0.0)",
14
+ "pydantic-settings (>=2.12.0,<3.0.0)",
15
+ "openai (>=2.15.0,<3.0.0)",
16
+ "numpy (>=2.4.1,<3.0.0)",
17
+ "qdrant-client (>=1.16.2,<2.0.0)",
18
+ "faiss-cpu (>=1.13.2,<2.0.0)",
19
+ ]
20
+
21
+
22
+ [dependency-groups]
23
+ notebook = [
24
+ "jupyter (>=1.1.1,<2.0.0)",
25
+ "mypy (>=1.19.0,<2.0.0)",
26
+ "pre-commit (>=4.5.0,<5.0.0)",
27
+ "types-pyyaml (>=6.0.12.20250915,<7.0.0.0)",
28
+ "torch (>=2.9.0,<3.0.0)"
29
+ ]
30
+
31
+
32
+
33
+ [build-system]
34
+ requires = ["poetry-core>=2.0.0,<3.0.0"]
35
+ build-backend = "poetry.core.masonry.api"