ragxo 0.1.11__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ragxo/__init__.py CHANGED
@@ -1,3 +1,3 @@
1
1
 
2
- from .client import Ragxo, Document
3
- __all__ = ["Ragxo", "Document"]
2
+ from .client import Ragxo, Document, EvaluationExample
3
+ __all__ = ["Ragxo", "Document", "EvaluationExample"]
ragxo/client.py CHANGED
@@ -1,7 +1,7 @@
1
1
  import time
2
- from typing import Self, Callable
2
+ from typing import Literal, Self, Callable
3
3
  from pymilvus import MilvusClient
4
- from pydantic import BaseModel
4
+ from pydantic import BaseModel, Field
5
5
  import boto3
6
6
  import dill
7
7
  import os
@@ -20,6 +20,14 @@ class Document(BaseModel):
20
20
  metadata: dict
21
21
  id: int
22
22
 
23
class EvaluationExample(BaseModel):
    # One test case for Ragxo.evaluate.
    # NOTE: kept as comments rather than a docstring, because pydantic copies
    # a model's docstring into its JSON schema description.

    # Question that evaluate() sends through generate_llm_response.
    query: str
    # Reference answer shown to the judge model next to the generated answer.
    expected: str
26
+
27
class EvaluationResults(BaseModel):
    # Structured judge output used by Ragxo.evaluate: passed as the
    # response_format of the judge call and used to validate the returned JSON.
    # NOTE: no docstring on purpose; pydantic would add it to the JSON schema
    # that is sent to the judge model.

    # One verdict per example in the batch, in the order the examples were listed.
    results: list[str] = Field(description="A list of strings, each either 'correct' or 'incorrect'")
29
+
30
+
23
31
  class Ragxo:
24
32
  """
25
33
  A RAG (Retrieval-Augmented Generation) system that combines vector search with LLM responses.
@@ -52,6 +60,13 @@ class Ragxo:
52
60
  self.embedding_fn = None
53
61
  self.system_prompt = None
54
62
  self.model = "gpt-4o-mini"
63
+ self.limit = 10
64
+ self.temperature = 0.5
65
+ self.max_tokens = 2000
66
+ self.top_p = 1.0
67
+ self.frequency_penalty = 0.0
68
+ self.presence_penalty = 0.0
69
+
55
70
 
56
71
  def add_preprocess(self, fn: Callable) -> Self:
57
72
  """
@@ -313,7 +328,7 @@ class Ragxo:
313
328
  raise
314
329
 
315
330
  @classmethod
316
- def _load_from_s3(cls, prefix: str, bucket: str) -> 'Ragx':
331
+ def _load_from_s3(cls, prefix: str, bucket: str) -> Self:
317
332
  """
318
333
  Internal classmethod to handle S3 loading.
319
334
  """
@@ -355,13 +370,17 @@ class Ragxo:
355
370
 
356
371
  def generate_llm_response(self,
357
372
  query: str,
373
+ history: list[dict] = [],
374
+ messages: list[dict] = None,
358
375
  data: list[dict] = None) -> ChatCompletion:
359
376
  """
360
377
  Generate LLM response based on query and retrieved data.
361
378
 
362
379
  Args:
363
- query (str): User query
380
+ query (str): User query; used only if messages is None
364
381
  data (list[dict], optional): Retrieved documents. If None, performs a new query
382
+ history (list[dict], optional): Earlier chat messages, inserted between the system prompt and the new user message; ignored if messages is given
383
+ messages (list[dict], optional): Messages to pass to the LLM: [{"role": "system", "content": system_prompt}, {"role": "user", "content": "Some user message"}, {"role": "assistant", "content": "Some assistant message"}]
365
384
 
366
385
  Returns:
367
386
  ChatCompletion: LLM response
@@ -378,9 +397,10 @@ class Ragxo:
378
397
  response = openai.chat.completions.create(
379
398
  model=self.model,
380
399
  messages=[
381
- {"role": "system", "content": self.system_prompt},
382
- {"role": "user", "content": "query: {} data: {}".format(query, data)}
383
- ],
400
+ {"role": "system", "content": self.system_prompt}
401
+ ] + history + [
402
+ {"role": "user", "content": f"query: {query} data: {data}"}
403
+ ] if messages is None else messages,
384
404
  temperature=self.temperature,
385
405
  max_tokens=self.max_tokens,
386
406
  top_p=self.top_p,
@@ -388,4 +408,85 @@ class Ragxo:
388
408
  presence_penalty=self.presence_penalty,
389
409
  )
390
410
 
391
- return response
411
+ return response
412
+
413
+
414
+
415
@with_loading("Evaluating test dataset")
def evaluate(self, test_data: list[EvaluationExample], batch_size: int = 10, judge_model: str = "gpt-4o-mini") -> float:
    """
    Evaluate the RAG system on a test dataset, using a single judge prompt per batch.

    For each batch:
        1. Generates an answer for each query with ``generate_llm_response``.
        2. Concatenates the evaluation details (query, expected answer,
           generated answer) of every example into one prompt.
        3. Instructs the judge to output a JSON object matching
           ``EvaluationResults``: {"results": ["correct", "incorrect", ...]}.
        4. Counts the "correct" verdicts.

    A batch whose judge output cannot be parsed is skipped. Its examples still
    count towards the total, so a skipped batch lowers the reported accuracy.

    Args:
        test_data (list[EvaluationExample]): List of evaluation examples.
        batch_size (int): Number of examples to process per batch. Must be positive.
        judge_model (str): Model used as the judge. It is called with a
            Pydantic ``response_format``, so it has to accept structured output.

    Returns:
        float: Accuracy as a fraction of correct evaluations (0.0 for an empty dataset).

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    # range() rejects a zero step with an unhelpful message, and a negative
    # step would silently evaluate nothing and report 0% accuracy.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    total = len(test_data)
    correct_count = 0

    for start in range(0, total, batch_size):
        batch = test_data[start : start + batch_size]

        # Collect the prompt pieces and join once instead of repeated "+=".
        prompt_parts = [
            "Evaluate the following examples and output your answer as a JSON object with a single key \"results\" that maps to an array of strings. Each element in the array should be either \"correct\" or \"incorrect\", corresponding to each example in order.\n\n"
        ]

        # For each example in the batch, generate the answer and include details.
        for idx, example in enumerate(batch, start=1):
            llm_response = self.generate_llm_response(example.query)
            # content can be None (e.g. a refusal); treat that as an empty answer.
            generated_answer = (llm_response.choices[0].message.content or "").strip()

            prompt_parts.append(
                f"Example {idx}:\n"
                f"Query: {example.query}\n"
                f"Expected Answer: {example.expected}\n"
                f"Generated Answer: {generated_answer}\n\n"
            )

        # Append clear instructions for the structured output.
        prompt_parts.append(
            "Return your output as a JSON object exactly in this format: "
            "{\"results\": [\"correct\", \"incorrect\", ...]} with no additional text or markdown formatting."
        )
        batch_prompt = "".join(prompt_parts)

        messages = [
            {"role": "system", "content": "You are an expert evaluator. Evaluate whether each generated answer meets the expected answer."},
            {"role": "user", "content": batch_prompt}
        ]

        # Call the OpenAI API with a structured response enforced via a JSON Schema.
        response = openai.beta.chat.completions.parse(
            model=judge_model,
            messages=messages,
            temperature=0,  # Deterministic output.
            response_format=EvaluationResults
        )

        judge_message = response.choices[0].message

        # Prefer the object the parse helper already built; fall back to
        # validating the raw text when it is not available.
        eval_results = getattr(judge_message, "parsed", None)
        if eval_results is None:
            output_text = (judge_message.content or "").strip()
            try:
                # Parse the JSON output using the Pydantic model.
                eval_results = EvaluationResults.model_validate_json(output_text)
            except ValueError as e:
                # pydantic's ValidationError derives from ValueError, so this
                # catches malformed judge output without hiding unrelated errors.
                print(f"Error parsing JSON: {e}\nReceived output: {output_text}")
                eval_results = None

        if eval_results is None:
            print("Skipping batch due to parsing error.")
            continue

        verdicts = eval_results.results
        if len(verdicts) != len(batch):
            print(
                f"Warning: judge returned {len(verdicts)} results for {len(batch)} examples; "
                "extra results are ignored and missing ones count as incorrect."
            )

        # Only the first len(batch) verdicts can belong to real examples;
        # counting more could push the accuracy above 100%.
        correct_count += sum(
            1 for verdict in verdicts[: len(batch)] if verdict.strip().lower() == "correct"
        )

    accuracy = correct_count / total if total > 0 else 0.0
    print(f"Accuracy: {accuracy * 100:.2f}% ({correct_count}/{total})")
    return accuracy
ragxo/utils.py CHANGED
@@ -13,7 +13,7 @@ def with_loading(title: str):
13
13
  def decorator(func):
14
14
  @functools.wraps(func)
15
15
  def wrapper(self, *args, **kwargs):
16
- with alive_bar(title=title, bar=None, stats=False) as bar:
16
+ with alive_bar(title=title, bar=None, stats=False, monitor=False, stats_end=False) as bar:
17
17
  result = func(self, *args, **kwargs)
18
18
  bar()
19
19
  return result
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ragxo
3
- Version: 0.1.11
3
+ Version: 0.1.12
4
4
  Summary: A RAG (Retrieval-Augmented Generation) toolkit with Milvus integration
5
5
  Home-page: https://github.com/yourusername/ragx
6
6
  License: MIT
@@ -19,7 +19,8 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
19
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
20
  Requires-Dist: alive-progress (>=3.1.1,<4.0.0)
21
21
  Requires-Dist: boto3 (>=1.36.14,<2.0.0)
22
- Requires-Dist: dill (>=0.3.9,<0.4.0)
22
+ Requires-Dist: datasets (>=3.2.0,<4.0.0)
23
+ Requires-Dist: dill (<0.3.9)
23
24
  Requires-Dist: milvus (>=2.3.9,<3.0.0)
24
25
  Requires-Dist: mocker (>=1.1.1,<2.0.0)
25
26
  Requires-Dist: openai (>=1.61.1,<2.0.0)
@@ -37,6 +38,24 @@ Export, version and reuse your E2E RAG pipeline everywhere 🚀
37
38
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
38
39
  [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/release/python-380/)
39
40
 
41
+ ## Table of Contents
42
+ - [Features](#features-)
43
+ - [Installation](#installation-️)
44
+ - [Quickstart](#quickstart-)
45
+ - [Build a RAG pipeline](#build-a-rag-pipeline)
46
+ - [Load a RAG pipeline](#load-a-rag-pipeline)
47
+ - [Usage Guide](#usage-guide-)
48
+ - [Import](#import)
49
+ - [Adding Preprocessing Steps](#adding-preprocessing-steps)
50
+ - [Custom Embedding Functions](#custom-embedding-functions)
51
+ - [Creating Documents](#creating-documents)
52
+ - [LLM Configuration](#llm-configuration)
53
+ - [Export and Load](#export-and-load)
54
+ - [Evaluation](#evaluation)
55
+ - [Best Practices](#best-practices-)
56
+ - [License](#license-)
57
+ - [Contributing](#contributing-)
58
+
40
59
  RagXO extends the capabilities of traditional RAG (Retrieval-Augmented Generation) systems by providing a unified way to package, version, and deploy your entire RAG pipeline with LLM integration. Export your complete system—including embedding functions, preprocessing steps, vector store, and LLM configurations—into a single, portable artifact.
41
60
 
42
61
  ## Features ✨
@@ -212,6 +231,46 @@ ragxo_client.export("rag_pipeline_v1")
212
231
  loaded_ragxo_client = Ragxo.load("rag_pipeline_v1")
213
232
  ```
214
233
 
234
+ ### Evaluation
235
+
236
+ ```python
237
+ from ragxo import EvaluationExample
238
+
239
+ # Create test examples
240
+ test_data = [
241
+ EvaluationExample(
242
+ query="What is the capital of France?",
243
+ expected="The capital of France is Paris."
244
+ ),
245
+ EvaluationExample(
246
+ query="What is the capital of Germany?",
247
+ expected="The capital of Germany is Berlin."
248
+ ),
249
+ ]
250
+
251
+ # Evaluate the RAG system
252
+ accuracy = ragxo_client.evaluate(
253
+ test_data=test_data,
254
+ batch_size=10, # Process 10 examples at a time
255
+ judge_model="gpt-4o" # Optional: specify a different model for evaluation (it must support Structured Outputs)
256
+ )
257
+
258
+ print(f"Evaluation accuracy: {accuracy * 100:.2f}%")
259
+ ```
260
+
261
+ The evaluation process:
262
+ 1. Processes test examples in batches
263
+ 2. Generates RAG responses for each query
264
+ 3. Uses an LLM to compare generated answers with expected answers
265
+ 4. Returns accuracy score (0.0 to 1.0)
266
+
267
+ Best practices for evaluation:
268
+ - Use diverse test examples
269
+ - Include edge cases
270
+ - Keep expected answers consistent in format
271
+ - Use a more capable model for evaluation (e.g., GPT-4o); the judge model must support Structured Outputs
272
+ - Adjust batch size based on your rate limits and needs
273
+
215
274
  ## Best Practices 💡
216
275
 
217
276
  1. **Version Your Exports**: Use semantic versioning for your exports:
@@ -0,0 +1,6 @@
1
+ ragxo/__init__.py,sha256=BAVy_mbqGOaAMmXpIqB94za5WgxuK9DAfd6BtJUsM_s,108
2
+ ragxo/client.py,sha256=pX7v24Rw_MC6HInrxvSJUxNqFa1YdNjf8_-WySljP0o,17676
3
+ ragxo/utils.py,sha256=BQ3u1oSi-kRqYTnpnJHq1KebuoVnA15u_5REVlYuM1o,569
4
+ ragxo-0.1.12.dist-info/METADATA,sha256=3aw_8FCoQ86bl2KCt0CEZt39GCquCFxFoDyHOYJOEuk,8233
5
+ ragxo-0.1.12.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
6
+ ragxo-0.1.12.dist-info/RECORD,,
@@ -1,6 +0,0 @@
1
- ragxo/__init__.py,sha256=0VVe-z4XkkGQLQIG0hF0Hyf87_RgX0E4T9TRwwTkbmE,68
2
- ragxo/client.py,sha256=6VE9h9XoEaS01irBp70brrWev8tNagtYnHOoyvjBVmo,12914
3
- ragxo/utils.py,sha256=yy5_ejmxU75mRhfE_XGtKOvKtHo8AoV1QFQlwMDEiFw,537
4
- ragxo-0.1.11.dist-info/METADATA,sha256=t3vwZl6LHtRgnrGqI8jMRrWGUy2c9tQlkxG45UDjjf4,6472
5
- ragxo-0.1.11.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
6
- ragxo-0.1.11.dist-info/RECORD,,
File without changes