ragxo 0.1.11__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ragxo/__init__.py
CHANGED
@@ -1,3 +1,3 @@
|
|
1
1
|
|
2
|
-
from .client import Ragxo, Document
|
3
|
-
__all__ = ["Ragxo", "Document"]
|
2
|
+
from .client import Ragxo, Document, EvaluationExample
|
3
|
+
__all__ = ["Ragxo", "Document", "EvaluationExample"]
|
ragxo/client.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
import time
|
2
|
-
from typing import Self, Callable
|
2
|
+
from typing import Literal, Self, Callable
|
3
3
|
from pymilvus import MilvusClient
|
4
|
-
from pydantic import BaseModel
|
4
|
+
from pydantic import BaseModel, Field
|
5
5
|
import boto3
|
6
6
|
import dill
|
7
7
|
import os
|
@@ -20,6 +20,14 @@ class Document(BaseModel):
|
|
20
20
|
metadata: dict
|
21
21
|
id: int
|
22
22
|
|
23
|
+
class EvaluationExample(BaseModel):
|
24
|
+
query: str
|
25
|
+
expected: str
|
26
|
+
|
27
|
+
class EvaluationResults(BaseModel):
|
28
|
+
results: list[str] = Field(description="A list of strings, each either 'correct' or 'incorrect'")
|
29
|
+
|
30
|
+
|
23
31
|
class Ragxo:
|
24
32
|
"""
|
25
33
|
A RAG (Retrieval-Augmented Generation) system that combines vector search with LLM responses.
|
@@ -52,6 +60,13 @@ class Ragxo:
|
|
52
60
|
self.embedding_fn = None
|
53
61
|
self.system_prompt = None
|
54
62
|
self.model = "gpt-4o-mini"
|
63
|
+
self.limit = 10
|
64
|
+
self.temperature = 0.5
|
65
|
+
self.max_tokens = 2000
|
66
|
+
self.top_p = 1.0
|
67
|
+
self.frequency_penalty = 0.0
|
68
|
+
self.presence_penalty = 0.0
|
69
|
+
|
55
70
|
|
56
71
|
def add_preprocess(self, fn: Callable) -> Self:
|
57
72
|
"""
|
@@ -313,7 +328,7 @@ class Ragxo:
|
|
313
328
|
raise
|
314
329
|
|
315
330
|
@classmethod
|
316
|
-
def _load_from_s3(cls, prefix: str, bucket: str) ->
|
331
|
+
def _load_from_s3(cls, prefix: str, bucket: str) -> Self:
|
317
332
|
"""
|
318
333
|
Internal classmethod to handle S3 loading.
|
319
334
|
"""
|
@@ -355,13 +370,17 @@ class Ragxo:
|
|
355
370
|
|
356
371
|
def generate_llm_response(self,
|
357
372
|
query: str,
|
373
|
+
history: list[dict] = [],
|
374
|
+
messages: list[dict] = None,
|
358
375
|
data: list[dict] = None) -> ChatCompletion:
|
359
376
|
"""
|
360
377
|
Generate LLM response based on query and retrieved data.
|
361
378
|
|
362
379
|
Args:
|
363
|
-
query (str): User query
|
380
|
+
query (str): User query, this is used if messages is None
|
364
381
|
data (list[dict], optional): Retrieved documents. If None, performs a new query
|
382
|
+
history (list[dict], optional): History of messages
|
383
|
+
messages (list[dict], optional): Messages to pass to the LLM: [{"role": "system", "content": system_prompt}, {"role": "user", "content": "Some user message"}, {"role": "assistant", "content": "Some assistant message"}]
|
365
384
|
|
366
385
|
Returns:
|
367
386
|
ChatCompletion: LLM response
|
@@ -378,9 +397,10 @@ class Ragxo:
|
|
378
397
|
response = openai.chat.completions.create(
|
379
398
|
model=self.model,
|
380
399
|
messages=[
|
381
|
-
{"role": "system", "content": self.system_prompt}
|
382
|
-
|
383
|
-
|
400
|
+
{"role": "system", "content": self.system_prompt}
|
401
|
+
] + history + [
|
402
|
+
{"role": "user", "content": f"query: {query} data: {data}"}
|
403
|
+
] if messages is None else messages,
|
384
404
|
temperature=self.temperature,
|
385
405
|
max_tokens=self.max_tokens,
|
386
406
|
top_p=self.top_p,
|
@@ -388,4 +408,85 @@ class Ragxo:
|
|
388
408
|
presence_penalty=self.presence_penalty,
|
389
409
|
)
|
390
410
|
|
391
|
-
return response
|
411
|
+
return response
|
412
|
+
|
413
|
+
|
414
|
+
|
415
|
+
@with_loading("Evaluating test dataset")
|
416
|
+
def evaluate(self, test_data: list[EvaluationExample], batch_size: int = 10, judge_model: str = "gpt-4o-mini") -> float:
|
417
|
+
"""
|
418
|
+
Evaluate the performance of the RAG system on a test dataset using a single prompt per batch.
|
419
|
+
|
420
|
+
For each batch:
|
421
|
+
1. Generates an answer for each query.
|
422
|
+
2. Concatenates evaluation details (query, expected, generated answer) into one prompt.
|
423
|
+
3. Instructs the judge to output a JSON object strictly adhering to our schema:
|
424
|
+
{"results": ["correct", "incorrect", ...]}.
|
425
|
+
4. Parses the structured output and computes overall accuracy.
|
426
|
+
|
427
|
+
Args:
|
428
|
+
test_data (list[EvaluationExample]): List of evaluation examples.
|
429
|
+
batch_size (int): Number of examples to process per batch.
|
430
|
+
|
431
|
+
Returns:
|
432
|
+
float: Accuracy as a fraction of correct evaluations.
|
433
|
+
"""
|
434
|
+
total = len(test_data)
|
435
|
+
correct_count = 0
|
436
|
+
|
437
|
+
for i in range(0, total, batch_size):
|
438
|
+
batch = test_data[i : i + batch_size]
|
439
|
+
batch_prompt = "Evaluate the following examples and output your answer as a JSON object with a single key \"results\" that maps to an array of strings. Each element in the array should be either \"correct\" or \"incorrect\", corresponding to each example in order.\n\n"
|
440
|
+
|
441
|
+
# For each example in the batch, generate the answer and include details.
|
442
|
+
for idx, example in enumerate(batch):
|
443
|
+
query = example.query
|
444
|
+
expected = example.expected
|
445
|
+
|
446
|
+
# Generate the answer using the RAG system.
|
447
|
+
llm_response = self.generate_llm_response(query)
|
448
|
+
generated_answer = llm_response.choices[0].message.content.strip()
|
449
|
+
|
450
|
+
batch_prompt += f"Example {idx+1}:\n"
|
451
|
+
batch_prompt += f"Query: {query}\n"
|
452
|
+
batch_prompt += f"Expected Answer: {expected}\n"
|
453
|
+
batch_prompt += f"Generated Answer: {generated_answer}\n\n"
|
454
|
+
|
455
|
+
# Append clear instructions for the structured output.
|
456
|
+
batch_prompt += (
|
457
|
+
"Return your output as a JSON object exactly in this format: "
|
458
|
+
"{\"results\": [\"correct\", \"incorrect\", ...]} with no additional text or markdown formatting."
|
459
|
+
)
|
460
|
+
|
461
|
+
messages = [
|
462
|
+
{"role": "system", "content": "You are an expert evaluator. Evaluate whether each generated answer meets the expected answer."},
|
463
|
+
{"role": "user", "content": batch_prompt}
|
464
|
+
]
|
465
|
+
|
466
|
+
# Call the OpenAI API with a structured response enforced via a JSON Schema.
|
467
|
+
response = openai.beta.chat.completions.parse(
|
468
|
+
model=judge_model,
|
469
|
+
messages=messages,
|
470
|
+
temperature=0, # Deterministic output.
|
471
|
+
response_format=EvaluationResults
|
472
|
+
)
|
473
|
+
|
474
|
+
output_text = response.choices[0].message.content.strip()
|
475
|
+
|
476
|
+
try:
|
477
|
+
# Parse the JSON output using the Pydantic model.
|
478
|
+
eval_results = EvaluationResults.model_validate_json(output_text)
|
479
|
+
except Exception as e:
|
480
|
+
print(f"Error parsing JSON: {e}\nReceived output: {output_text}")
|
481
|
+
eval_results = None
|
482
|
+
|
483
|
+
if eval_results:
|
484
|
+
for result in eval_results.results:
|
485
|
+
if result.lower() == "correct":
|
486
|
+
correct_count += 1
|
487
|
+
else:
|
488
|
+
print("Skipping batch due to parsing error.")
|
489
|
+
|
490
|
+
accuracy = correct_count / total if total > 0 else 0.0
|
491
|
+
print(f"Accuracy: {accuracy * 100:.2f}% ({correct_count}/{total})")
|
492
|
+
return accuracy
|
ragxo/utils.py
CHANGED
@@ -13,7 +13,7 @@ def with_loading(title: str):
|
|
13
13
|
def decorator(func):
|
14
14
|
@functools.wraps(func)
|
15
15
|
def wrapper(self, *args, **kwargs):
|
16
|
-
with alive_bar(title=title, bar=None, stats=False) as bar:
|
16
|
+
with alive_bar(title=title, bar=None, stats=False, monitor=False, stats_end=False) as bar:
|
17
17
|
result = func(self, *args, **kwargs)
|
18
18
|
bar()
|
19
19
|
return result
|
ragxo-0.1.12.dist-info/METADATA
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: ragxo
|
3
|
-
Version: 0.1.11
|
3
|
+
Version: 0.1.12
|
4
4
|
Summary: A RAG (Retrieval-Augmented Generation) toolkit with Milvus integration
|
5
5
|
Home-page: https://github.com/yourusername/ragx
|
6
6
|
License: MIT
|
@@ -19,7 +19,8 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
19
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
20
20
|
Requires-Dist: alive-progress (>=3.1.1,<4.0.0)
|
21
21
|
Requires-Dist: boto3 (>=1.36.14,<2.0.0)
|
22
|
-
Requires-Dist:
|
22
|
+
Requires-Dist: datasets (>=3.2.0,<4.0.0)
|
23
|
+
Requires-Dist: dill (<0.3.9)
|
23
24
|
Requires-Dist: milvus (>=2.3.9,<3.0.0)
|
24
25
|
Requires-Dist: mocker (>=1.1.1,<2.0.0)
|
25
26
|
Requires-Dist: openai (>=1.61.1,<2.0.0)
|
@@ -37,6 +38,24 @@ Export, version and reuse your E2E RAG pipeline everywhere 🚀
|
|
37
38
|
[License: MIT](https://opensource.org/licenses/MIT)
|
38
39
|
[Python 3.8](https://www.python.org/downloads/release/python-380/)
|
39
40
|
|
41
|
+
## Table of Contents
|
42
|
+
- [Features](#features-)
|
43
|
+
- [Installation](#installation-️)
|
44
|
+
- [Quickstart](#quickstart-)
|
45
|
+
- [Build a RAG pipeline](#build-a-rag-pipeline)
|
46
|
+
- [Load a RAG pipeline](#load-a-rag-pipeline)
|
47
|
+
- [Usage Guide](#usage-guide-)
|
48
|
+
- [Import](#import)
|
49
|
+
- [Adding Preprocessing Steps](#adding-preprocessing-steps)
|
50
|
+
- [Custom Embedding Functions](#custom-embedding-functions)
|
51
|
+
- [Creating Documents](#creating-documents)
|
52
|
+
- [LLM Configuration](#llm-configuration)
|
53
|
+
- [Export and Load](#export-and-load)
|
54
|
+
- [Evaluation](#evaluation)
|
55
|
+
- [Best Practices](#best-practices-)
|
56
|
+
- [License](#license-)
|
57
|
+
- [Contributing](#contributing-)
|
58
|
+
|
40
59
|
RagXO extends the capabilities of traditional RAG (Retrieval-Augmented Generation) systems by providing a unified way to package, version, and deploy your entire RAG pipeline with LLM integration. Export your complete system—including embedding functions, preprocessing steps, vector store, and LLM configurations—into a single, portable artifact.
|
41
60
|
|
42
61
|
## Features ✨
|
@@ -212,6 +231,46 @@ ragxo_client.export("rag_pipeline_v1")
|
|
212
231
|
loaded_ragxo_client = Ragxo.load("rag_pipeline_v1")
|
213
232
|
```
|
214
233
|
|
234
|
+
### Evaluation
|
235
|
+
|
236
|
+
```python
|
237
|
+
from ragxo import EvaluationExample
|
238
|
+
|
239
|
+
# Create test examples
|
240
|
+
test_data = [
|
241
|
+
EvaluationExample(
|
242
|
+
query="What is the capital of France?",
|
243
|
+
expected="The capital of France is Paris."
|
244
|
+
),
|
245
|
+
EvaluationExample(
|
246
|
+
query="What is the capital of Germany?",
|
247
|
+
expected="The capital of Germany is Berlin."
|
248
|
+
),
|
249
|
+
]
|
250
|
+
|
251
|
+
# Evaluate the RAG system
|
252
|
+
accuracy = ragxo_client.evaluate(
|
253
|
+
test_data=test_data,
|
254
|
+
batch_size=10, # Process 10 examples at a time
|
255
|
+
judge_model="gpt-4" # Optional: specify a different model for evaluation
|
256
|
+
)
|
257
|
+
|
258
|
+
print(f"Evaluation accuracy: {accuracy * 100:.2f}%")
|
259
|
+
```
|
260
|
+
|
261
|
+
The evaluation process:
|
262
|
+
1. Processes test examples in batches
|
263
|
+
2. Generates RAG responses for each query
|
264
|
+
3. Uses an LLM to compare generated answers with expected answers
|
265
|
+
4. Returns accuracy score (0.0 to 1.0)
|
266
|
+
|
267
|
+
Best practices for evaluation:
|
268
|
+
- Use diverse test examples
|
269
|
+
- Include edge cases
|
270
|
+
- Keep expected answers consistent in format
|
271
|
+
- Use a more capable model for evaluation (e.g., GPT-4)
|
272
|
+
- Adjust batch size based on your rate limits and needs
|
273
|
+
|
215
274
|
## Best Practices 💡
|
216
275
|
|
217
276
|
1. **Version Your Exports**: Use semantic versioning for your exports:
|
ragxo-0.1.12.dist-info/RECORD
ADDED
@@ -0,0 +1,6 @@
|
|
1
|
+
ragxo/__init__.py,sha256=BAVy_mbqGOaAMmXpIqB94za5WgxuK9DAfd6BtJUsM_s,108
|
2
|
+
ragxo/client.py,sha256=pX7v24Rw_MC6HInrxvSJUxNqFa1YdNjf8_-WySljP0o,17676
|
3
|
+
ragxo/utils.py,sha256=BQ3u1oSi-kRqYTnpnJHq1KebuoVnA15u_5REVlYuM1o,569
|
4
|
+
ragxo-0.1.12.dist-info/METADATA,sha256=3aw_8FCoQ86bl2KCt0CEZt39GCquCFxFoDyHOYJOEuk,8233
|
5
|
+
ragxo-0.1.12.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
6
|
+
ragxo-0.1.12.dist-info/RECORD,,
|
ragxo-0.1.11.dist-info/RECORD
DELETED
@@ -1,6 +0,0 @@
|
|
1
|
-
ragxo/__init__.py,sha256=0VVe-z4XkkGQLQIG0hF0Hyf87_RgX0E4T9TRwwTkbmE,68
|
2
|
-
ragxo/client.py,sha256=6VE9h9XoEaS01irBp70brrWev8tNagtYnHOoyvjBVmo,12914
|
3
|
-
ragxo/utils.py,sha256=yy5_ejmxU75mRhfE_XGtKOvKtHo8AoV1QFQlwMDEiFw,537
|
4
|
-
ragxo-0.1.11.dist-info/METADATA,sha256=t3vwZl6LHtRgnrGqI8jMRrWGUy2c9tQlkxG45UDjjf4,6472
|
5
|
-
ragxo-0.1.11.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
6
|
-
ragxo-0.1.11.dist-info/RECORD,,
|
ragxo-0.1.12.dist-info/WHEEL
File without changes
|