ragxo 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ragxo/__init__.py CHANGED
@@ -1,3 +1,3 @@
1
- from ragx import Ragxo, Document
2
1
 
2
+ from .client import Ragxo, Document
3
3
  __all__ = ["Ragxo", "Document"]
ragxo/client.py ADDED
@@ -0,0 +1,253 @@
1
+ import time
2
+ from typing import Self, Callable
3
+ from pymilvus import MilvusClient
4
+ from pydantic import BaseModel
5
+ import boto3
6
+ import dill
7
+ import os
8
+ import shutil
9
+ import logging
10
+ import tempfile
11
+ from botocore.exceptions import ClientError
12
+ import openai
13
+ from openai import ChatCompletion
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ class Document(BaseModel):
18
+ text: str
19
+ metadata: dict
20
+ id: int
21
+
22
+ class Ragxo:
23
+ def __init__(self, dimension: int) -> None:
24
+ self.dimension = dimension
25
+ self.collection_name = "ragx"
26
+ os.makedirs("ragx_artifacts", exist_ok=True)
27
+
28
+ self.db_path = f"ragx_artifacts/milvus_{int(time.time())}.db"
29
+ self.client = MilvusClient(self.db_path)
30
+ self.client.create_collection(self.collection_name, dimension=dimension)
31
+ self.processing_fn = []
32
+ self.embedding_fn = None
33
+ self.system_prompt = None
34
+ self.model = "gpt-4o-mini"
35
+
36
+ def add_preprocess(self, fn: Callable) -> Self:
37
+ self.processing_fn.append(fn)
38
+ return self
39
+
40
+ def add_embedding_fn(self, fn: Callable) -> Self:
41
+ if not fn:
42
+ raise ValueError("Embedding function cannot be None")
43
+ self.embedding_fn = fn
44
+ return self
45
+
46
+ def add_system_prompt(self, prompt: str) -> Self:
47
+ self.system_prompt = prompt
48
+ return self
49
+
50
+ def add_model(self, model: str) -> Self:
51
+ self.model = model
52
+ return self
53
+
54
+ def index(self, data: list[Document]) -> Self:
55
+ if not self.embedding_fn:
56
+ raise ValueError("Embedding function not set")
57
+
58
+ processed_text = []
59
+ for item in data:
60
+ current_text = item.text
61
+ for fn in self.processing_fn:
62
+ current_text = fn(current_text)
63
+ processed_text.append(current_text)
64
+
65
+ embeddings = [
66
+ self.embedding_fn(text)
67
+ for text in processed_text
68
+ ]
69
+
70
+ self.client.insert(self.collection_name, [
71
+ {
72
+ "text": item.text,
73
+ "metadata": item.metadata,
74
+ "id": item.id,
75
+ "vector": embedding
76
+ }
77
+ for item, embedding in zip(data, embeddings)
78
+ ])
79
+ return self
80
+
81
+ def query(self, query: str, output_fields: list[str] = ['text', 'metadata']) -> list[list[dict]]:
82
+ if not self.embedding_fn:
83
+ raise ValueError("Embedding function not set. Please call add_embedding_fn first.")
84
+
85
+ preprocessed_query = query
86
+ for fn in self.processing_fn:
87
+ preprocessed_query = fn(preprocessed_query)
88
+
89
+ embedding = self.embedding_fn(preprocessed_query)
90
+
91
+ return self.client.search(
92
+ collection_name=self.collection_name,
93
+ data=[embedding],
94
+ limit=10,
95
+ output_fields=output_fields
96
+ )
97
+
98
+ def export(self, destination: str, s3_bucket: str = None) -> Self:
99
+ """
100
+ Export the Ragx instance to either local filesystem or S3.
101
+
102
+ Args:
103
+ destination: str - Local path or S3 key prefix
104
+ s3_bucket: str, optional - S3 bucket name. If provided, export to S3
105
+ """
106
+ try:
107
+ # If s3_bucket is provided, export to S3
108
+ if s3_bucket:
109
+ return self._export_to_s3(destination, s3_bucket)
110
+
111
+ # Otherwise, export to local filesystem
112
+ os.makedirs(destination, exist_ok=True)
113
+
114
+ # Save using dill
115
+ pickle_path = os.path.join(destination, "ragx.pkl")
116
+ with open(pickle_path, "wb") as f:
117
+ dill.dump(self, f)
118
+
119
+ # Copy database
120
+ db_dest = os.path.join(destination, "milvus.db")
121
+ shutil.copy(self.db_path, db_dest)
122
+
123
+ return self
124
+
125
+ except Exception as e:
126
+ logger.error(f"Error in export: {e}")
127
+ raise
128
+
129
+ def _export_to_s3(self, prefix: str, bucket: str) -> Self:
130
+ """
131
+ Internal method to handle S3 export.
132
+ """
133
+ try:
134
+ s3_client = boto3.client('s3')
135
+
136
+ # Create a temporary directory for the files
137
+ with tempfile.TemporaryDirectory() as temp_dir:
138
+ # Save pickle file
139
+ pickle_path = os.path.join(temp_dir, "ragx.pkl")
140
+ with open(pickle_path, "wb") as f:
141
+ dill.dump(self, f)
142
+
143
+ # Copy database
144
+ db_path = os.path.join(temp_dir, "milvus.db")
145
+ shutil.copy(self.db_path, db_path)
146
+
147
+ # Upload to S3
148
+ s3_client.upload_file(
149
+ pickle_path,
150
+ bucket,
151
+ f"{prefix}/ragx.pkl"
152
+ )
153
+ s3_client.upload_file(
154
+ db_path,
155
+ bucket,
156
+ f"{prefix}/milvus.db"
157
+ )
158
+
159
+ return self
160
+
161
+ except ClientError as e:
162
+ logger.error(f"Error uploading to S3: {e}")
163
+ raise
164
+ except Exception as e:
165
+ logger.error(f"Error in S3 export: {e}")
166
+ raise
167
+
168
+ @classmethod
169
+ def load(cls, source: str, s3_bucket: str = None) -> Self:
170
+ """
171
+ Load a Ragx instance from either local filesystem or S3.
172
+
173
+ Args:
174
+ source: str - Local path or S3 key prefix
175
+ s3_bucket: str, optional - S3 bucket name. If provided, load from S3
176
+ """
177
+ try:
178
+ # If s3_bucket is provided, load from S3
179
+ if s3_bucket:
180
+ return cls._load_from_s3(source, s3_bucket)
181
+
182
+ # Otherwise, load from local filesystem
183
+ pickle_path = os.path.join(source, "ragx.pkl")
184
+
185
+ with open(pickle_path, "rb") as f:
186
+ instance = dill.load(f)
187
+
188
+ # Restore client
189
+ instance.client = MilvusClient(os.path.join(source, "milvus.db"))
190
+
191
+ return instance
192
+
193
+ except Exception as e:
194
+ logger.error(f"Error in load: {e}")
195
+ raise
196
+
197
+ @classmethod
198
+ def _load_from_s3(cls, prefix: str, bucket: str) -> 'Ragx':
199
+ """
200
+ Internal classmethod to handle S3 loading.
201
+ """
202
+ try:
203
+ s3_client = boto3.client('s3')
204
+
205
+ # Create a temporary directory for the files
206
+ with tempfile.TemporaryDirectory() as temp_dir:
207
+ # Download files from S3
208
+ pickle_path = os.path.join(temp_dir, "ragx.pkl")
209
+ db_path = os.path.join(temp_dir, "milvus.db")
210
+
211
+ s3_client.download_file(
212
+ bucket,
213
+ f"{prefix}/ragx.pkl",
214
+ pickle_path
215
+ )
216
+ s3_client.download_file(
217
+ bucket,
218
+ f"{prefix}/milvus.db",
219
+ db_path
220
+ )
221
+
222
+ # Load the pickle file
223
+ with open(pickle_path, "rb") as f:
224
+ instance = dill.load(f)
225
+
226
+ # Restore client with the downloaded database
227
+ instance.client = MilvusClient(db_path)
228
+
229
+ return instance
230
+
231
+ except ClientError as e:
232
+ logger.error(f"Error downloading from S3: {e}")
233
+ raise
234
+ except Exception as e:
235
+ logger.error(f"Error in S3 load: {e}")
236
+ raise
237
+
238
+ def generate_llm_response(self, query: str, data: list[dict] = None) -> ChatCompletion:
239
+ if data is None:
240
+ data = self.query(query)[0]
241
+
242
+ if not self.system_prompt:
243
+ raise ValueError("System prompt not set. Please call add_system_prompt first.")
244
+
245
+ response = openai.chat.completions.create(
246
+ model=self.model,
247
+ messages=[
248
+ {"role": "system", "content": self.system_prompt},
249
+ {"role": "user", "content": "query: {} data: {}".format(query, data)}
250
+ ]
251
+ )
252
+
253
+ return response
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ragxo
3
- Version: 0.1.3
3
+ Version: 0.1.5
4
4
  Summary: A RAG (Retrieval-Augmented Generation) toolkit with Milvus integration
5
5
  Home-page: https://github.com/yourusername/ragx
6
6
  License: MIT
@@ -17,6 +17,7 @@ Classifier: Programming Language :: Python :: 3.12
17
17
  Classifier: Programming Language :: Python :: 3.13
18
18
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
19
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Requires-Dist: boto3 (>=1.36.14,<2.0.0)
20
21
  Requires-Dist: dill (>=0.3.9,<0.4.0)
21
22
  Requires-Dist: milvus (>=2.3.9,<3.0.0)
22
23
  Requires-Dist: openai (>=1.61.1,<2.0.0)
@@ -48,65 +49,15 @@ RagXO extends the capabilities of traditional RAG (Retrieval-Augmented Generatio
48
49
  pip install ragxo
49
50
  ```
50
51
 
51
- ## Quick Start 🚀
52
-
53
- ```python
54
- from ragxo import Ragxo, Document
55
- from openai import OpenAI
56
- client = OpenAI()
57
-
58
- def get_openai_embeddings(text: str) -> list[float]:
59
- response = client.embeddings.create(
60
- input=text,
61
- model="text-embedding-ada-002"
62
- )
63
- return response.data[0].embedding
64
-
65
- def preprocess_text(text: str) -> str:
66
- return text.lower()
67
-
68
- # Initialize and configure RagXO
69
- ragxo = Ragxo(dimension=384)
70
- ragxo.add_preprocess(preprocess_text)
71
- ragxo.add_embedding_fn(get_openai_embeddings)
72
-
73
- # Add system prompt and model
74
- ragxo.add_system_prompt("You are a helpful assistant.")
75
- ragxo.add_model("gpt-4o-mini")
76
-
77
- # Create and index documents
78
- documents = [
79
- Document(
80
- text="Sample document for indexing",
81
- metadata={"source": "example"},
82
- id=1
83
- )
84
- ]
85
- ragxo.index(documents)
86
-
87
- # Export the pipeline
88
- ragxo.export("my_rag_v1")
89
-
90
- # Load and use elsewhere
91
- loaded_ragxo = Ragxo.load("my_rag_v1")
92
-
93
- # Query and generate response
94
- similar_docs = loaded_ragxo.query("sample query")
95
- llm_response = loaded_ragxo.generate_llm_response("What can you tell me about the sample?")
96
- ```
97
-
98
52
  ## Usage Guide 📚
99
53
 
100
- ### Creating Documents
54
+ ### Import
101
55
 
102
56
  ```python
103
- from ragxo import Document
57
+ from ragxo import Ragxo, Document
58
+
59
+ ragxo_client = Ragxo(dimension=768)
104
60
 
105
- doc = Document(
106
- text="Your document content here",
107
- metadata={"source": "wiki", "category": "science"},
108
- id=1
109
- )
110
61
  ```
111
62
 
112
63
  ### Adding Preprocessing Steps
@@ -120,8 +71,8 @@ def remove_special_chars(text: str) -> str:
120
71
  def lowercase(text: str) -> str:
121
72
  return text.lower()
122
73
 
123
- ragxo.add_preprocess(remove_special_chars)
124
- ragxo.add_preprocess(lowercase)
74
+ ragxo_client.add_preprocess(remove_special_chars)
75
+ ragxo_client.add_preprocess(lowercase)
125
76
  ```
126
77
 
127
78
  ### Custom Embedding Functions
@@ -150,27 +101,43 @@ def get_openai_embeddings(text: str) -> list[float]:
150
101
  ragxo.add_embedding_fn(get_openai_embeddings)
151
102
  ```
152
103
 
104
+
105
+ ### Creating Documents
106
+
107
+ ```python
108
+ from ragxo import Document
109
+
110
+ doc = Document(
111
+ text="Your document content here",
112
+ metadata={"source": "wiki", "category": "science"},
113
+ id=1
114
+ )
115
+
116
+ ragxo_client.index([doc])
117
+
118
+ ```
119
+
153
120
  ### LLM Configuration
154
121
 
155
122
  ```python
156
123
  # Set system prompt
157
- ragxo.add_system_prompt("""
124
+ ragxo_client.add_system_prompt("""
158
125
  You are a helpful assistant. Use the provided context to answer questions accurately.
159
126
  If you're unsure about something, please say so.
160
127
  """)
161
128
 
162
129
  # Set LLM model
163
- ragxo.add_model("gpt-4")
130
+ ragxo_client.add_model("gpt-4")
164
131
  ```
165
132
 
166
133
  ### Export and Load
167
134
 
168
135
  ```python
169
136
  # Export your RAG pipeline
170
- ragxo.export("rag_pipeline_v1")
137
+ ragxo_client.export("rag_pipeline_v1")
171
138
 
172
139
  # Load it elsewhere
173
- loaded_ragxo = Ragxo.load("rag_pipeline_v1")
140
+ loaded_ragxo_client = Ragxo.load("rag_pipeline_v1")
174
141
  ```
175
142
 
176
143
  ## Best Practices 💡
@@ -180,27 +147,15 @@ loaded_ragxo = Ragxo.load("rag_pipeline_v1")
180
147
  ragxo.export("my_rag_v1.0.0")
181
148
  ```
182
149
 
183
- 2. **Validate After Loading**: Always test your loaded pipeline:
184
- ```python
185
- loaded_ragxo = Ragxo.load("my_rag")
186
- try:
187
- # Test similarity search
188
- similar_docs = loaded_ragxo.query("test query")
189
- # Test LLM generation
190
- llm_response = loaded_ragxo.generate_llm_response("test question")
191
- print("Pipeline loaded successfully!")
192
- except Exception as e:
193
- print(f"Error loading pipeline: {e}")
150
+ 2. **S3**: Use S3 to store your exports
151
+
152
+ ```shell
153
+ export AWS_ACCESS_KEY_ID=your_access_key
154
+ export AWS_SECRET_ACCESS_KEY=your_secret_key
194
155
  ```
195
156
 
196
- 3. **Document Your Pipeline Configuration**: Keep track of your setup:
197
157
  ```python
198
- pipeline_config = {
199
- "preprocessing_steps": ["remove_special_chars", "lowercase"],
200
- "embedding_model": "all-MiniLM-L6-v2",
201
- "llm_model": "gpt-4",
202
- "dimension": 384
203
- }
158
+ ragxo_client.export("my_rag_v1.0.0", s3_bucket="my_bucket")
204
159
  ```
205
160
 
206
161
  ## License 📝
@@ -0,0 +1,5 @@
1
+ ragxo/__init__.py,sha256=0VVe-z4XkkGQLQIG0hF0Hyf87_RgX0E4T9TRwwTkbmE,68
2
+ ragxo/client.py,sha256=smS3vt7k0k1p1mDOT8Taa4vpHl0fiuvbo3RdRY_D01k,8300
3
+ ragxo-0.1.5.dist-info/METADATA,sha256=0oS9x3tRULcvKZNQQOyIlKpQ_baDzt1pHB0wTubsb7I,4600
4
+ ragxo-0.1.5.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
5
+ ragxo-0.1.5.dist-info/RECORD,,
ragxo/ragx.py DELETED
@@ -1,145 +0,0 @@
1
- from typing import Self, Callable
2
- from pymilvus import MilvusClient
3
- from pydantic import BaseModel
4
- import dill
5
- import os
6
- import shutil
7
- import logging
8
- import openai
9
- from openai import ChatCompletion
10
-
11
- logging.basicConfig(level=logging.DEBUG)
12
- logger = logging.getLogger(__name__)
13
-
14
- class Document(BaseModel):
15
- text: str
16
- metadata: dict
17
- id: int
18
-
19
- class Ragxo:
20
- def __init__(self, dimension: int) -> None:
21
- self.dimension = dimension
22
- self.collection_name = "ragx"
23
- self.db_path = "milvus.db"
24
- self.client = MilvusClient(self.db_path)
25
- self.client.create_collection(self.collection_name, dimension=dimension)
26
- self.processing_fn = []
27
- self.embedding_fn = None
28
- self.system_prompt = None
29
- self.model = "gpt-4o-mini"
30
-
31
- def add_preprocess(self, fn: Callable) -> Self:
32
- self.processing_fn.append(fn)
33
- return self
34
-
35
- def add_embedding_fn(self, fn: Callable) -> Self:
36
- if not fn:
37
- raise ValueError("Embedding function cannot be None")
38
- self.embedding_fn = fn
39
- return self
40
-
41
- def add_system_prompt(self, prompt: str) -> Self:
42
- self.system_prompt = prompt
43
- return self
44
-
45
- def add_model(self, model: str) -> Self:
46
- self.model = model
47
- return self
48
-
49
- def index(self, data: list[Document]) -> Self:
50
- if not self.embedding_fn:
51
- raise ValueError("Embedding function not set")
52
-
53
- processed_text = []
54
- for item in data:
55
- current_text = item.text
56
- for fn in self.processing_fn:
57
- current_text = fn(current_text)
58
- processed_text.append(current_text)
59
-
60
- embeddings = [
61
- self.embedding_fn(text)
62
- for text in processed_text
63
- ]
64
-
65
- self.client.insert(self.collection_name, [
66
- {
67
- "text": item.text,
68
- "metadata": item.metadata,
69
- "id": item.id,
70
- "vector": embedding
71
- }
72
- for item, embedding in zip(data, embeddings)
73
- ])
74
- return self
75
-
76
- def query(self, query: str, output_fields: list[str] = ['text', 'metadata']) -> list[list[dict]]:
77
- if not self.embedding_fn:
78
- raise ValueError("Embedding function not set. Please call add_embedding_fn first.")
79
-
80
- preprocessed_query = query
81
- for fn in self.processing_fn:
82
- preprocessed_query = fn(preprocessed_query)
83
-
84
- embedding = self.embedding_fn(preprocessed_query)
85
-
86
- return self.client.search(
87
- collection_name=self.collection_name,
88
- data=[embedding],
89
- limit=10,
90
- output_fields=output_fields
91
- )
92
-
93
- def export(self, folder_path: str) -> Self:
94
- try:
95
- os.makedirs(folder_path, exist_ok=True)
96
-
97
- # Save using dill
98
- pickle_path = os.path.join(folder_path, "ragx.pkl")
99
- with open(pickle_path, "wb") as f:
100
- dill.dump(self, f)
101
-
102
- # Copy database
103
- db_dest = os.path.join(folder_path, "milvus.db")
104
- shutil.copy(self.db_path, db_dest)
105
-
106
- return self
107
-
108
- except Exception as e:
109
- logger.error(f"Error in export: {e}")
110
- raise
111
-
112
- @classmethod
113
- def load(cls, folder_path: str) -> 'Ragx':
114
- try:
115
- pickle_path = os.path.join(folder_path, "ragx.pkl")
116
-
117
- with open(pickle_path, "rb") as f:
118
- instance = dill.load(f)
119
-
120
- # Restore client
121
- instance.client = MilvusClient(os.path.join(folder_path, "milvus.db"))
122
-
123
- return instance
124
-
125
- except Exception as e:
126
- logger.error(f"Error in load: {e}")
127
- raise
128
-
129
- def generate_llm_response(self, query: str, data: list[dict] = None) -> ChatCompletion:
130
-
131
- if data is None:
132
- data = self.query(query)[0]
133
-
134
- if not self.system_prompt:
135
- raise ValueError("System prompt not set. Please call add_system_prompt first.")
136
-
137
- response = openai.chat.completions.create(
138
- model=self.model,
139
- messages=[
140
- {"role": "system", "content": self.system_prompt},
141
- {"role": "user", "content": "query: {} data: {}".format(query, data)}
142
- ]
143
- )
144
-
145
- return response
@@ -1,5 +0,0 @@
1
- ragxo/__init__.py,sha256=jI_6iulTUQk9JUDft-jM6NHESpZSmJVPIaVOmd4-jWw,65
2
- ragxo/ragx.py,sha256=_HQCTth_iR2rxV9amMyA6qlOpdGji5_-rSDB5WWG2u4,4537
3
- ragxo-0.1.3.dist-info/METADATA,sha256=FZmy-PL_SZMf9NuDWcniQUsleZna_GYsz5GLoJRbHcM,5960
4
- ragxo-0.1.3.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
5
- ragxo-0.1.3.dist-info/RECORD,,
File without changes