ragit 0.8__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ragit/main.py DELETED
@@ -1,354 +0,0 @@
1
- import chromadb
2
- import pandas as pd
3
- import logging
4
- from sentence_transformers import SentenceTransformer
5
- from typing import List, Dict, Optional, Union
6
- import os
7
-
8
class VectorDBManager:
    """Manage Chroma collections whose documents are embedded text rows.

    Collections are built from CSV files that provide an ``id`` and a ``text``
    column. Texts are embedded with the configured model and stored in a
    persistent Chroma client rooted at ``persist_directory``.

    Most public methods catch every exception, log it, and return a failure
    value (``False``, ``[]``, ``{}`` or ``{"error": ...}``) instead of raising.
    """

    # Columns every input CSV must provide.
    _REQUIRED_COLUMNS = frozenset({"id", "text"})
    # Metric assumed when a collection carries no "hnsw:space" metadata;
    # l2 is Chroma's documented default space.
    _DEFAULT_METRIC = "l2"

    def __init__(
        self,
        persist_directory: str = "./vector_db",
        provider: str = "sentence_transformer",
        model_name: str = "all-mpnet-base-v2",
    ):
        """
        Initialize the Vector Database Manager.

        Args:
            persist_directory (str): Directory to persist the database
            provider (str): Embedding backend; only "sentence_transformer"
                loads a model
            model_name (str): Model name handed to SentenceTransformer
        """
        self.persist_directory = persist_directory
        self.client = chromadb.PersistentClient(path=persist_directory)

        # Always define the attribute so an unsupported provider produces a
        # clear error on first embedding instead of an AttributeError.
        self.model = None
        if provider == "sentence_transformer":
            self.model = SentenceTransformer(model_name)

        logging.basicConfig(
            level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
        )
        self.logger = logging.getLogger(__name__)

        if self.model is None:
            self.logger.warning(
                "Unsupported provider '%s'; no embedding model loaded", provider
            )

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _embed(self, texts: List[str]) -> List[List[float]]:
        """Encode ``texts`` with the loaded model and return plain lists."""
        model = getattr(self, "model", None)
        if model is None:
            raise RuntimeError(
                "No embedding model is loaded; "
                "initialize with provider='sentence_transformer'"
            )
        return model.encode(texts).tolist()

    @staticmethod
    def _id_exists(collection, id_: str) -> bool:
        """Return True if ``id_`` is stored, fetching only that id (no documents)."""
        return bool(collection.get(ids=[id_], include=[])["ids"])

    def _discard_collection(self, collection_name: str) -> None:
        """Best-effort removal of a collection this manager just created."""
        try:
            self.client.delete_collection(collection_name)
        except Exception as cleanup_error:
            self.logger.warning(
                "Could not remove partially created collection '%s': %s",
                collection_name,
                cleanup_error,
            )

    @staticmethod
    def _similarities(distances: List[float], metric: str) -> List[float]:
        """Map raw distances to similarity scores where larger means closer.

        cosine uses ``1 - distance``. Every other metric is scaled relative to
        the returned result set, so those scores are only comparable within a
        single query.
        """
        if metric == "cosine":
            return [1 - dist for dist in distances]

        # Hoisted: these are invariant across the result set.
        min_dist = min(distances)
        max_dist = max(distances)

        if metric == "ip":
            # Chroma reports inner-product distance as 1 - dot(a, b), so the
            # smallest distance is the best match and must score highest.
            span = max_dist - min_dist
            return [
                (max_dist - dist) / span if span > 0 else 1.0 for dist in distances
            ]

        # l1, l2 and any unrecognized metric: scale by the farthest hit.
        return [1 - (dist / max_dist) if max_dist > 0 else 1.0 for dist in distances]

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def create_database(
        self,
        csv_path: str,
        collection_name: str,
        distance_metric: str = "l2",
        collection_metadata: Optional[Dict] = None,
    ) -> bool:
        """
        Create a new database from a CSV file.

        If embedding or insertion fails after the collection has been created,
        the empty collection is removed again so the call can be retried.

        Args:
            csv_path (str): Path to the CSV file containing 'id' and 'text' columns
            collection_name (str): Name of the collection to create
            distance_metric (str): Distance metric (l2, cosine, ip)
            collection_metadata (Dict, optional): Additional metadata for the collection

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            df = pd.read_csv(csv_path)

            if not self._REQUIRED_COLUMNS.issubset(df.columns):
                self.logger.error("CSV must contain 'id' and 'text' columns")
                return False

            collection_meta = {
                "hnsw:space": distance_metric,
                "description": f"Collection created from {csv_path}",
            }
            if collection_metadata:
                collection_meta.update(collection_metadata)

            collection = self.client.create_collection(
                name=collection_name, metadata=collection_meta
            )

            try:
                texts = df["text"].tolist()
                collection.add(
                    ids=[str(id_) for id_ in df["id"]],
                    documents=texts,
                    embeddings=self._embed(texts),
                )
            except Exception:
                # create_collection succeeded above, so the collection is ours
                # to remove; leaving it would block a retry with "already exists".
                self._discard_collection(collection_name)
                raise

            self.logger.info(f"Successfully created collection '{collection_name}'")
            return True

        except Exception as e:
            self.logger.error("Error creating database: %s", e)
            return False

    def add_values_from_csv(
        self, csv_path: str, collection_name: str
    ) -> Dict[str, int]:
        """
        Add values from CSV file to existing collection, skipping existing IDs.

        Args:
            csv_path (str): Path to the CSV file
            collection_name (str): Name of the target collection

        Returns:
            Dict[str, int]: Statistics about the operation
                (total_entries, new_entries_added, skipped_entries),
                or {"error": message} on failure
        """
        try:
            df = pd.read_csv(csv_path)

            if not self._REQUIRED_COLUMNS.issubset(df.columns):
                message = "CSV must contain 'id' and 'text' columns"
                self.logger.error(message)
                return {"error": message}

            collection = self.client.get_collection(collection_name)

            # include=[] returns ids only, avoiding a load of every document.
            existing_ids = set(collection.get(include=[])["ids"])

            new_df = df[~df["id"].astype(str).isin(existing_ids)]

            if not new_df.empty:
                texts = new_df["text"].tolist()
                collection.add(
                    ids=[str(id_) for id_ in new_df["id"]],
                    documents=texts,
                    embeddings=self._embed(texts),
                )

            stats = {
                "total_entries": len(df),
                "new_entries_added": len(new_df),
                "skipped_entries": len(df) - len(new_df),
            }

            self.logger.info(
                f"Added {stats['new_entries_added']} new entries to '{collection_name}'"
            )
            return stats

        except Exception as e:
            self.logger.error("Error adding values from CSV: %s", e)
            return {"error": str(e)}

    def add_single_row(self, id_: str, text: str, collection_name: str) -> bool:
        """
        Add a single entry to the collection.

        Args:
            id_ (str): ID for the new entry
            text (str): Text content
            collection_name (str): Target collection name

        Returns:
            bool: True if successful, False if the ID already exists or on error
        """
        try:
            collection = self.client.get_collection(collection_name)
            entry_id = str(id_)

            if self._id_exists(collection, entry_id):
                self.logger.warning(f"ID {id_} already exists in collection")
                return False

            collection.add(
                ids=[entry_id], documents=[text], embeddings=self._embed([text])
            )

            self.logger.info(f"Successfully added entry with ID {id_}")
            return True

        except Exception as e:
            self.logger.error("Error adding single row: %s", e)
            return False

    def delete_entry_by_id(self, id_: str, collection_name: str) -> bool:
        """
        Delete an entry by its ID.

        Args:
            id_ (str): ID of the entry to delete
            collection_name (str): Collection name

        Returns:
            bool: True if successful, False if the ID is not found or on error
        """
        try:
            collection = self.client.get_collection(collection_name)
            entry_id = str(id_)

            if not self._id_exists(collection, entry_id):
                self.logger.warning(f"ID {id_} not found in collection")
                return False

            collection.delete(ids=[entry_id])

            self.logger.info(f"Successfully deleted entry with ID {id_}")
            return True

        except Exception as e:
            self.logger.error("Error deleting entry: %s", e)
            return False

    def find_nearby_texts(
        self,
        text: str,
        collection_name: str,
        search_string: Optional[str] = None,
        k: int = 5,
    ) -> List[Dict[str, Union[str, float]]]:
        """
        Find nearby texts using similarity search with scores.

        Args:
            text (str): Query text
            collection_name (str): Collection to search in
            search_string (str, optional): If given, only documents containing
                this substring are considered
            k (int): Number of results to return

        Returns:
            List[Dict[str, Union[str, float]]]: One dict per hit with keys
                "id", "text", "similarity" (percentage, rounded to 4 places),
                "raw_distance" and "metric". Empty list when nothing matches
                or on error.
        """
        try:
            collection = self.client.get_collection(collection_name)

            # Collections created outside this class may carry no metadata.
            metadata = collection.metadata or {}
            self.logger.debug("Metadata: %s", metadata)
            distance_metric = metadata.get("hnsw:space", self._DEFAULT_METRIC)

            query_kwargs = {
                "query_embeddings": self._embed([text]),
                "n_results": k,
                "include": ["documents", "distances", "metadatas"],
            }
            if search_string:
                query_kwargs["where_document"] = {"$contains": search_string}

            results = collection.query(**query_kwargs)

            # Results are nested per query; a single query was issued.
            distances = results["distances"][0]
            if not distances:
                return []

            similarities = self._similarities(distances, distance_metric)

            return [
                {
                    "id": id_,
                    "text": text_,
                    "similarity": round(similarity * 100, 4),
                    "raw_distance": dist,
                    "metric": distance_metric,
                }
                for id_, text_, similarity, dist in zip(
                    results["ids"][0], results["documents"][0], similarities, distances
                )
            ]

        except Exception as e:
            self.logger.error("Error finding nearby texts: %s", e)
            return []

    def delete_collection(self, collection_name: str, confirmation: str = "no") -> bool:
        """
        Delete an entire collection.

        Args:
            collection_name (str): Name of collection to delete
            confirmation (str): Must be 'yes' (case-insensitive) to proceed

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            if confirmation.lower() != "yes":
                self.logger.warning("Deletion cancelled - confirmation not provided")
                return False

            self.client.delete_collection(collection_name)
            self.logger.info(f"Successfully deleted collection '{collection_name}'")
            return True

        except Exception as e:
            self.logger.error("Error deleting collection: %s", e)
            return False

    def get_collection_info(self, collection_name: str) -> Dict:
        """
        Get information about a collection.

        Args:
            collection_name (str): Name of the collection

        Returns:
            Dict: "name", "count" and "metadata" of the collection,
                or {"error": message} on failure
        """
        try:
            collection = self.client.get_collection(collection_name)

            return {
                "name": collection_name,
                # count() avoids loading every stored document just to size it.
                "count": collection.count(),
                "metadata": collection.metadata,
            }

        except Exception as e:
            self.logger.error("Error getting collection info: %s", e)
            return {"error": str(e)}

    def get_by_ids(self, ids: List[str], collection_name: str) -> Dict[str, str]:
        """
        Get texts for given IDs in batch.

        Args:
            ids (List[str]): List of IDs to fetch (converted to str)
            collection_name (str): Name of the collection

        Returns:
            Dict[str, str]: Dictionary mapping the IDs returned by the lookup
                to their corresponding texts; empty dict on error
        """
        try:
            collection = self.client.get_collection(collection_name)

            results = collection.get(
                ids=[str(id_) for id_ in ids], include=["documents"]
            )

            return dict(zip(results["ids"], results["documents"]))

        except Exception as e:
            self.logger.error("Error getting texts by IDs: %s", e)
            return {}
@@ -1,21 +0,0 @@
1
- MIT License
2
-
3
- Copyright (c) 2025 Salman Faroz
4
-
5
- Permission is hereby granted, free of charge, to any person obtaining a copy
6
- of this software and associated documentation files (the "Software"), to deal
7
- in the Software without restriction, including without limitation the rights
8
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
- copies of the Software, and to permit persons to whom the Software is
10
- furnished to do so, subject to the following conditions:
11
-
12
- The above copyright notice and this permission notice shall be included in all
13
- copies or substantial portions of the Software.
14
-
15
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
- SOFTWARE.
@@ -1,176 +0,0 @@
1
- Metadata-Version: 2.2
2
- Name: ragit
3
- Version: 0.8
4
- Home-page: https://github.com/stsfaroz/ragit
5
- Author: Salman Faroz
6
- License: MIT
7
- Description-Content-Type: text/markdown
8
- License-File: LICENSE
9
- Requires-Dist: sentence-transformers>=3.4.1
10
- Requires-Dist: pandas>=2.2.3
11
- Requires-Dist: chromadb>=0.6.3
12
- Requires-Dist: setuptools>=75.8.0
13
- Requires-Dist: wheel>=0.45.1
14
- Requires-Dist: twine>=6.1.0
15
- Dynamic: author
16
- Dynamic: description
17
- Dynamic: description-content-type
18
- Dynamic: home-page
19
- Dynamic: license
20
- Dynamic: requires-dist
21
-
22
-
23
- # Ragit
24
- 🚀 Smart, Fast, Scalable Search 🚀
25
-
26
- ## Installation
27
- ```
28
- pip install ragit
29
- ```
30
-
31
- **ragit** is a lightweight Python library that simplifies the management of vector databases. With **ragit**, you can easily create, update, query, and manage your vector database, all from CSV files containing text data.
32
-
33
- ## Features
34
-
35
- - **Create a Vector Database:** Build your database from a CSV file with two required columns: `id` and `text`.
36
- - **Add New Entries:** Insert additional entries from CSV files or add them individually.
37
- - **Similarity Search:** Find nearby texts using various distance metrics (e.g., cosine, L2) with similarity scores.
38
- - **Data Retrieval:** Fetch entries by IDs or exact text matches.
39
- - **Deletion:** Remove single entries or entire collections when needed.
40
-
41
- ## CSV File Format
42
- ragit expects your CSV file to contain at least two columns: `id` and `text`. Any additional columns are ignored. **Note:** Each `id` must be unique.
43
-
44
- ## Example CSV (`data.csv`):
45
-
46
- ```csv
47
- id,text
48
- 1,The quick brown fox jumps over the lazy dog.
49
- 2,Another sample entry for testing.
50
- ```
51
-
52
- ## Usage
53
- Below are some examples that demonstrate how to use `ragit`. The examples cover creating a database, adding entries, performing similarity searches, and more.
54
-
55
- ### 1. Importing and Initializing
56
- First, import the `VectorDBManager` class from `ragit` and initialize it:
57
-
58
- ```python
59
- from ragit import VectorDBManager
60
-
61
- # Initialize the vector database manager with a custom persistence directory and model
62
- db_manager = VectorDBManager(
63
- persist_directory="./my_vector_db", # Optional # default : "./vector_db"
64
- provider="sentence_transformer", # Optional # default : "sentence_transformer"
65
- model_name="all-mpnet-base-v2" # Optional # default : "all-mpnet-base-v2"
66
- )
67
- ```
68
-
69
- ### 2. Creating a Database
70
- Create a new collection (named `my_collection`) using your CSV file. In this example, the `distance_metric` is set to "cosine" (available options: l2, cosine, ip, l1):
71
-
72
- ```python
73
- db_manager.create_database(
74
- csv_path="data.csv",
75
- collection_name="my_collection",
76
- distance_metric="cosine" # Optional # default : l2
77
- )
78
- ```
79
- ### Reloading Your Database
80
-
81
- To reuse your existing vector database, initialize VectorDBManager with the same parameters that were used when creating the database.
82
-
83
- ```python
84
- from ragit import VectorDBManager
85
-
86
- db_manager = VectorDBManager(
87
- persist_directory="./my_vector_db",
88
- provider="sentence_transformer",
89
- model_name="all-mpnet-base-v2"
90
- )
91
- ```
92
-
93
- ### 3. Adding a Single Entry
94
- Add an individual entry to the collection:
95
-
96
- ```python
97
- db_manager.add_single_row(
98
- id_="101",
99
- text="This is a new test entry for the database.",
100
- collection_name="my_collection"
101
- )
102
- ```
103
-
104
- ### 4. Adding Multiple Entries from CSV
105
- You can also add multiple entries from a CSV file. This function skips any entries that already exist in the collection:
106
-
107
- ```python
108
- stats = db_manager.add_values_from_csv(
109
- csv_path="data.csv",
110
- collection_name="my_collection"
111
- )
112
- print(f"Added {stats['new_entries_added']} new entries")
113
- ```
114
-
115
- ### 5. Retrieving Collection Information
116
- Fetch and display information about your collection:
117
-
118
- ```python
119
- info = db_manager.get_collection_info("my_collection")
120
- print(f"Collection size: {info['count']} entries")
121
- ```
122
-
123
- ### 6. Performing a Similarity Search
124
- Find texts that are similar to your query. In this example, the query text is "ai", and the search is filtered using the string "Artificial intelligence". The top 2 results are returned:
125
-
126
- ```python
127
- results = db_manager.find_nearby_texts(
128
- text="ai",
129
- collection_name="my_collection",
130
- k=2,
131
- search_string="Artificial intelligence" # Optional
132
- )
133
-
134
- print("Results:")
135
- for item in results:
136
- print(f"\nID: {item['id']}")
137
- print(f"Text: {item['text']}")
138
- print(f"Similarity: {item['similarity']}%")
139
- print(f"Distance ({item['metric']}): {item['raw_distance']}")
140
- ```
141
-
142
- ### 7. Fetching Texts by IDs
143
- Retrieve text entries for a list of IDs:
144
-
145
- ```python
146
- ids_to_fetch = ["1", "2", "3"]
147
- texts = db_manager.get_by_ids(ids_to_fetch, "my_collection")
148
- print("Texts:", texts)
149
- ```
150
-
151
- ### 8. Deleting a Row / Collection
152
-
153
- Remove an entry from the collection by its ID:
154
-
155
- ```python
156
- db_manager.delete_entry_by_id(
157
- id_="1",
158
- collection_name="my_collection"
159
- )
160
- ```
161
-
162
-
163
- Delete an entire collection. **Note:** You must pass `confirmation="yes"` to proceed with deletion.
164
-
165
- ```python
166
- db_manager.delete_collection(
167
- collection_name="my_collection",
168
- confirmation="yes"
169
- )
170
- ```
171
-
172
- ## Contributing
173
- Contributions are welcome! If you encounter any issues or have suggestions for improvements, please feel free to open an issue or submit a pull request on GitHub.
174
-
175
- ## License
176
- This project is licensed under the MIT License. See the `LICENSE` file for details.
@@ -1,7 +0,0 @@
1
- ragit/__init__.py,sha256=GECJxYFL_0PMy6tbcVFpW9Fhe1JiI2uXH4iJWhUHpKs,48
2
- ragit/main.py,sha256=f2kDfZPxP26DBvzmP7aF6VhnNAE1hC-ZONU5ZH6RVBM,11774
3
- ragit-0.8.dist-info/LICENSE,sha256=L8f7hg7lQm80qoZhSCoW1ACAKph-FpJaNaa9MyNDBqo,1069
4
- ragit-0.8.dist-info/METADATA,sha256=H73xJZU_viExL4wcb-knClT7-BmMvhtlAaeQ07gliXM,5230
5
- ragit-0.8.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
6
- ragit-0.8.dist-info/top_level.txt,sha256=pkPbG7yrw61wt9_y_xcLE2vq2a55fzockASD0yq0g4s,6
7
- ragit-0.8.dist-info/RECORD,,