ragit 0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ragit-0.1/PKG-INFO ADDED
@@ -0,0 +1,10 @@
1
+ Metadata-Version: 2.2
2
+ Name: ragit
3
+ Version: 0.1
4
+ Requires-Dist: sentence-transformers>=3.4.1
5
+ Requires-Dist: pandas>=2.2.3
6
+ Requires-Dist: chromadb>=0.6.3
7
+ Requires-Dist: setuptools>=75.8.0
8
+ Requires-Dist: wheel>=0.45.1
9
+ Requires-Dist: twine>=6.1.0
10
+ Dynamic: requires-dist
ragit-0.1/README.md ADDED
@@ -0,0 +1,153 @@
1
+ # ragit
2
+
3
+ **ragit** is a lightweight Python library that simplifies the management of vector databases using [ChromaDB](https://github.com/chroma-core/chroma) and [Sentence Transformers](https://www.sbert.net/). With ragit, you can easily create, update, query, and manage your vector database—all from CSV files containing text data.
4
+
5
+ ## Features
6
+
7
+ - **Create a Vector Database:** Build your database from a CSV file with two required columns: `id` and `text`.
8
+ - **Add New Entries:** Insert additional entries from CSV files or add them individually.
9
+ - **Similarity Search:** Find nearby texts using various distance metrics (e.g., cosine, L2) with similarity scores.
10
+ - **Data Retrieval:** Fetch entries by IDs or exact text matches.
11
+ - **Deletion:** Remove single entries or entire collections when needed.
12
+
13
+ ## Installation
14
+
15
+ Install ragit from PyPI using pip:
16
+
17
+ ```bash
18
+ pip install ragit
19
+ ```
20
+
21
+ ## CSV File Format
22
+ ragit expects your CSV file to contain at least two columns, `id` and `text`; any additional columns are ignored.
23
+
24
+ ## Example CSV (`data.csv`):
25
+
26
+ ```csv
27
+ id,text
28
+ 1,The quick brown fox jumps over the lazy dog.
29
+ 2,Another sample entry for testing.
30
+ ```
31
+
32
+ ## Usage
33
+ Below are some examples that demonstrate how to use `ragit`. The examples cover creating a database, adding entries, performing similarity searches, and more.
34
+
35
+ ### 1. Importing and Initializing
36
+ First, import the `VectorDBManager` class from `ragit` and initialize it:
37
+
38
+ ```python
39
+ from ragit import VectorDBManager
40
+
41
+ # Initialize the vector database manager with a custom persistence directory and model
42
+ db_manager = VectorDBManager(
43
+ persist_directory="./my_vector_db", # Optional # default : "./vector_db"
44
+ provider="sentence_transformer", # Optional # default : "sentence_transformer"
45
+ model_name="all-mpnet-base-v2" # Optional # default : "all-mpnet-base-v2"
46
+ )
47
+ ```
48
+
49
+ ### 2. Creating a Database
50
+ Create a new collection (named `my_collection`) using your CSV file. In this example, the `distance_metric` is set to `"cosine"` (available options: `l2`, `cosine`, `ip`):
51
+
52
+ ```python
53
+ db_manager.create_database(
54
+ csv_path="data.csv",
55
+ collection_name="my_collection",
56
+ distance_metric="cosine" # Optional # default : l2
57
+ )
58
+ ```
59
+
60
+ ### 3. Adding a Single Entry
61
+ Add an individual entry to the collection:
62
+
63
+ ```python
64
+ db_manager.add_single_row(
65
+ id_="101",
66
+ text="This is a new test entry for the database.",
67
+ collection_name="my_collection"
68
+ )
69
+ ```
70
+
71
+ ### 4. Adding Multiple Entries from CSV
72
+ You can also add multiple entries from a CSV file. This function skips any rows whose `id` already exists in the collection:
73
+
74
+ ```python
75
+ stats = db_manager.add_values_from_csv(
76
+ csv_path="data.csv",
77
+ collection_name="my_collection"
78
+ )
79
+ print(f"Added {stats['new_entries_added']} new entries")
80
+ ```
81
+
82
+ ### 5. Retrieving Collection Information
83
+ Fetch and display information about your collection:
84
+
85
+ ```python
86
+ info = db_manager.get_collection_info("my_collection")
87
+ print(f"Collection size: {info['count']} entries")
88
+ ```
89
+
90
+ ### 6. Performing a Similarity Search
91
+ Find texts that are similar to your query. In this example, the query text is "ai", and the results are restricted to documents that contain the string "Artificial intelligence". The top 2 matches are returned:
92
+
93
+ ```python
94
+ results = db_manager.find_nearby_texts(
95
+ text="ai",
96
+ collection_name="my_collection",
97
+ k=2,
98
+ search_string="Artificial intelligence" # Optional
99
+ )
100
+
101
+ print("Results:")
102
+ for item in results:
103
+ print(f"\nID: {item['id']}")
104
+ print(f"Text: {item['text']}")
105
+ print(f"Similarity: {item['similarity']}%")
106
+ print(f"Distance ({item['metric']}): {item['raw_distance']}")
107
+ ```
108
+
109
+ ### 7. Deleting an Entry
110
+ Remove an entry from the collection by its ID:
111
+
112
+ ```python
113
+ db_manager.delete_entry_by_id(
114
+ id_="1",
115
+ collection_name="my_collection"
116
+ )
117
+ ```
118
+
119
+ ### 8. Fetching Texts by IDs
120
+ Retrieve text entries for a list of IDs:
121
+
122
+ ```python
123
+ ids_to_fetch = ["1", "2", "3"]
124
+ texts = db_manager.get_by_ids(ids_to_fetch, "my_collection")
125
+ print("Texts:", texts)
126
+ ```
127
+
128
+ ### 9. Fetching IDs by Texts
129
+ For an exact text match, get the corresponding IDs:
130
+
131
+ ```python
132
+ texts_to_fetch = [
133
+ "Plato was an ancient Greek philosopher of the Classical period who is considered a foundational thinker in Western philosophy"
134
+ ]
135
+ ids = db_manager.get_by_texts(texts_to_fetch, "my_collection")
136
+ print("IDs:", ids)
137
+ ```
138
+
139
+ ### 10. Deleting a Collection
140
+ Delete an entire collection. **Note:** You must pass `confirmation="yes"` to proceed with deletion.
141
+
142
+ ```python
143
+ db_manager.delete_collection(
144
+ collection_name="my_collection",
145
+ confirmation="yes"
146
+ )
147
+ ```
148
+
149
+ ## Contributing
150
+ Contributions are welcome! If you encounter any issues or have suggestions for improvements, please feel free to open an issue or submit a pull request on GitHub.
151
+
152
+ ## License
153
+ This project is licensed under the MIT License. See the `LICENSE` file for details.
@@ -0,0 +1,2 @@
1
+ # __init__.py
2
+ from .main import VectorDBManager
@@ -0,0 +1,384 @@
1
+ import chromadb
2
+ import pandas as pd
3
+ import logging
4
+ from sentence_transformers import SentenceTransformer
5
+ from typing import List, Dict, Optional, Union
6
+ import os
7
+
8
+
9
class VectorDBManager:
    """Manage ChromaDB collections of sentence-transformer text embeddings.

    Collections are stored through a persistent ChromaDB client. Texts are
    embedded with a SentenceTransformer model before being added or queried.
    Every public method catches its own exceptions, logs them, and returns a
    failure value (``False``, ``[]``, ``{}`` or ``{"error": ...}``) instead of
    raising.
    """

    # Metric assumed when a collection carries no "hnsw:space" metadata entry;
    # this matches the default of ``create_database``.
    _DEFAULT_METRIC = "l2"

    def __init__(
        self,
        persist_directory: str = "./vector_db",
        provider: str = "sentence_transformer",
        model_name: str = "all-mpnet-base-v2",
    ):
        """
        Initialize the Vector Database Manager.

        Args:
            persist_directory (str): Directory to persist the database
            provider (str): Embedding backend; only "sentence_transformer"
                is supported
            model_name (str): Name of the SentenceTransformer model to load
        """
        # Set up logging first so that problems during start-up can be
        # reported. basicConfig does nothing if the root logger already has
        # handlers, so an application's own configuration takes precedence.
        logging.basicConfig(
            level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
        )
        self.logger = logging.getLogger(__name__)

        self.persist_directory = persist_directory
        self.client = chromadb.PersistentClient(path=persist_directory)

        # Always define the attribute so that an unsupported provider leads to
        # a clear error from _encode instead of an AttributeError later on.
        self.model = None
        if provider == "sentence_transformer":
            self.model = SentenceTransformer(model_name)
        else:
            self.logger.error(
                "Unsupported embedding provider '%s'; no model was loaded", provider
            )

    def _encode(self, texts: List[str]) -> List[List[float]]:
        """Embed ``texts`` and return one plain list of floats per text."""
        if self.model is None:
            raise RuntimeError(
                "No embedding model is loaded; "
                "create the manager with provider='sentence_transformer'"
            )
        return self.model.encode(texts).tolist()

    @staticmethod
    def _has_id(collection, id_: str) -> bool:
        """Tell whether ``id_`` is stored, fetching only that single ID."""
        return bool(collection.get(ids=[id_], include=[])["ids"])

    @staticmethod
    def _distances_to_similarities(distances: List[float], metric: str) -> List[float]:
        """Convert raw distances into similarity scores (higher is closer).

        ``cosine`` uses ``1 - distance``. ``ip`` is min-max scaled over the
        result set so that the smallest distance scores 1.0. ``l1`` and ``l2``
        are scaled by the largest distance in the result set.

        Raises:
            ValueError: If ``metric`` is not one of the supported names.
        """
        if not distances:
            return []

        if metric == "cosine":
            return [1 - dist for dist in distances]

        lowest, highest = min(distances), max(distances)

        if metric == "ip":
            spread = highest - lowest
            if spread <= 0:
                return [1.0] * len(distances)
            # A smaller distance means a closer match, so the closest hit
            # must receive the highest score.
            return [(highest - dist) / spread for dist in distances]

        if metric in ("l1", "l2"):
            if highest <= 0:
                return [1.0] * len(distances)
            return [1 - (dist / highest) for dist in distances]

        raise ValueError(f"Unsupported distance metric: {metric!r}")

    def create_database(
        self,
        csv_path: str,
        collection_name: str,
        distance_metric: str = "l2",
        collection_metadata: Optional[Dict] = None,
    ) -> bool:
        """
        Create a new database from a CSV file.

        Args:
            csv_path (str): Path to the CSV file containing 'id' and 'text' columns
            collection_name (str): Name of the collection to create
            distance_metric (str): Distance metric (l2, cosine, ip)
            collection_metadata (Dict, optional): Additional metadata for the collection

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            df = pd.read_csv(csv_path)

            if not {"id", "text"}.issubset(df.columns):
                self.logger.error("CSV must contain 'id' and 'text' columns")
                return False

            if df.empty:
                self.logger.error("CSV '%s' contains no rows", csv_path)
                return False

            ids = df["id"].astype(str).tolist()
            documents = df["text"].tolist()

            # Embed before creating the collection so that an encoding
            # failure does not leave an empty collection behind.
            embeddings = self._encode(documents)

            collection_meta = {
                "hnsw:space": distance_metric,
                "description": f"Collection created from {csv_path}",
            }
            if collection_metadata:
                collection_meta.update(collection_metadata)

            collection = self.client.create_collection(
                name=collection_name, metadata=collection_meta
            )

            try:
                collection.add(ids=ids, documents=documents, embeddings=embeddings)
            except Exception:
                # Roll back, otherwise a retry would fail because the
                # collection name is already taken.
                self.client.delete_collection(collection_name)
                raise

            self.logger.info("Successfully created collection '%s'", collection_name)
            return True

        except Exception as e:
            self.logger.error("Error creating database: %s", e)
            return False

    def add_values_from_csv(
        self, csv_path: str, collection_name: str
    ) -> Dict[str, Union[int, str]]:
        """
        Add values from CSV file to existing collection, skipping existing IDs.

        Args:
            csv_path (str): Path to the CSV file
            collection_name (str): Name of the target collection

        Returns:
            Dict[str, Union[int, str]]: On success, the counters
                'total_entries', 'new_entries_added' and 'skipped_entries';
                on failure, a single 'error' key holding the message
        """
        try:
            df = pd.read_csv(csv_path)

            if not {"id", "text"}.issubset(df.columns):
                message = "CSV must contain 'id' and 'text' columns"
                self.logger.error(message)
                return {"error": message}

            collection = self.client.get_collection(collection_name)

            # include=[] requests the IDs only, without documents or metadata.
            existing_ids = set(collection.get(include=[])["ids"])
            new_df = df[~df["id"].astype(str).isin(existing_ids)]

            if not new_df.empty:
                documents = new_df["text"].tolist()
                collection.add(
                    ids=new_df["id"].astype(str).tolist(),
                    documents=documents,
                    embeddings=self._encode(documents),
                )

            stats = {
                "total_entries": len(df),
                "new_entries_added": len(new_df),
                "skipped_entries": len(df) - len(new_df),
            }

            self.logger.info(
                "Added %s new entries to '%s'",
                stats["new_entries_added"],
                collection_name,
            )
            return stats

        except Exception as e:
            self.logger.error("Error adding values from CSV: %s", e)
            return {"error": str(e)}

    def add_single_row(self, id_: str, text: str, collection_name: str) -> bool:
        """
        Add a single entry to the collection.

        Args:
            id_ (str): ID for the new entry
            text (str): Text content
            collection_name (str): Target collection name

        Returns:
            bool: True if successful, False if the ID already exists or an
                error occurred
        """
        try:
            collection = self.client.get_collection(collection_name)
            entry_id = str(id_)

            if self._has_id(collection, entry_id):
                self.logger.warning("ID %s already exists in collection", id_)
                return False

            collection.add(
                ids=[entry_id], documents=[text], embeddings=self._encode([text])
            )

            self.logger.info("Successfully added entry with ID %s", id_)
            return True

        except Exception as e:
            self.logger.error("Error adding single row: %s", e)
            return False

    def delete_entry_by_id(self, id_: str, collection_name: str) -> bool:
        """
        Delete an entry by its ID.

        Args:
            id_ (str): ID of the entry to delete
            collection_name (str): Collection name

        Returns:
            bool: True if successful, False if the ID is unknown or an error
                occurred
        """
        try:
            collection = self.client.get_collection(collection_name)
            entry_id = str(id_)

            if not self._has_id(collection, entry_id):
                self.logger.warning("ID %s not found in collection", id_)
                return False

            collection.delete(ids=[entry_id])

            self.logger.info("Successfully deleted entry with ID %s", id_)
            return True

        except Exception as e:
            self.logger.error("Error deleting entry: %s", e)
            return False

    def find_nearby_texts(
        self,
        text: str,
        collection_name: str,
        search_string: Optional[str] = None,
        k: int = 5,
    ) -> List[Dict[str, Union[str, float]]]:
        """
        Find nearby texts using similarity search with scores.

        Args:
            text (str): Query text
            collection_name (str): Collection to search in
            search_string (str, optional): If given, only documents that
                contain this string are considered
            k (int): Number of results to return

        Returns:
            List[Dict[str, Union[str, float]]]: One dict per hit with the keys
                'id', 'text', 'similarity' (percentage rounded to 4 decimals),
                'raw_distance' and 'metric'; empty on no hits or on error
        """
        try:
            collection = self.client.get_collection(collection_name)

            # The metadata may be missing, or may lack the metric entry, for
            # collections that were not created through create_database.
            metadata = collection.metadata or {}
            distance_metric = metadata.get("hnsw:space", self._DEFAULT_METRIC)

            query_args = {
                "query_embeddings": self._encode([text]),
                "n_results": k,
                "include": ["documents", "distances"],
            }
            if search_string:
                query_args["where_document"] = {"$contains": search_string}

            results = collection.query(**query_args)

            # Results are nested per query; only one query is sent.
            distances = results["distances"][0]
            if not distances:
                return []

            similarities = self._distances_to_similarities(distances, distance_metric)

            return [
                {
                    "id": id_,
                    "text": text_,
                    "similarity": round(similarity * 100, 4),
                    "raw_distance": dist,
                    "metric": distance_metric,
                }
                for id_, text_, similarity, dist in zip(
                    results["ids"][0], results["documents"][0], similarities, distances
                )
            ]

        except Exception as e:
            self.logger.error("Error finding nearby texts: %s", e)
            return []

    def delete_collection(self, collection_name: str, confirmation: str = "no") -> bool:
        """
        Delete an entire collection.

        Args:
            collection_name (str): Name of collection to delete
            confirmation (str): Must be 'yes' (case-insensitive) to proceed

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            if confirmation.lower() != "yes":
                self.logger.warning("Deletion cancelled - confirmation not provided")
                return False

            self.client.delete_collection(collection_name)
            self.logger.info("Successfully deleted collection '%s'", collection_name)
            return True

        except Exception as e:
            self.logger.error("Error deleting collection: %s", e)
            return False

    def get_collection_info(self, collection_name: str) -> Dict:
        """
        Get information about a collection.

        Args:
            collection_name (str): Name of the collection

        Returns:
            Dict: The keys 'name', 'count' and 'metadata' on success, or a
                single 'error' key holding the message on failure
        """
        try:
            collection = self.client.get_collection(collection_name)

            return {
                "name": collection_name,
                # count() avoids loading every stored entry just to count them.
                "count": collection.count(),
                "metadata": collection.metadata,
            }

        except Exception as e:
            self.logger.error("Error getting collection info: %s", e)
            return {"error": str(e)}

    def get_by_ids(self, ids: List[str], collection_name: str) -> Dict[str, str]:
        """
        Get texts for given IDs in batch.

        Args:
            ids (List[str]): List of IDs to fetch
            collection_name (str): Name of the collection

        Returns:
            Dict[str, str]: Dictionary mapping IDs to their corresponding
                texts; empty when ``ids`` is empty or on error
        """
        try:
            str_ids = [str(id_) for id_ in ids]
            if not str_ids:
                return {}

            collection = self.client.get_collection(collection_name)
            results = collection.get(ids=str_ids, include=["documents"])

            return dict(zip(results["ids"], results["documents"]))

        except Exception as e:
            self.logger.error("Error getting texts by IDs: %s", e)
            return {}

    def get_by_texts(self, texts: List[str], collection_name: str) -> Dict[str, str]:
        """
        Get IDs for given texts in batch.
        Note: For exact text matching. For similar texts, use find_nearby_texts.

        Args:
            texts (List[str]): List of texts to fetch
            collection_name (str): Name of the collection

        Returns:
            Dict[str, str]: Dictionary mapping texts to their corresponding
                IDs; if several entries share a text, the last one read wins
        """
        try:
            # A set gives constant-time membership tests per stored document.
            wanted = set(texts)
            if not wanted:
                return {}

            collection = self.client.get_collection(collection_name)
            all_data = collection.get(include=["documents"])

            return {
                text: id_
                for text, id_ in zip(all_data["documents"], all_data["ids"])
                if text in wanted
            }

        except Exception as e:
            self.logger.error("Error getting IDs by texts: %s", e)
            return {}
@@ -0,0 +1,10 @@
1
+ Metadata-Version: 2.2
2
+ Name: ragit
3
+ Version: 0.1
4
+ Requires-Dist: sentence-transformers>=3.4.1
5
+ Requires-Dist: pandas>=2.2.3
6
+ Requires-Dist: chromadb>=0.6.3
7
+ Requires-Dist: setuptools>=75.8.0
8
+ Requires-Dist: wheel>=0.45.1
9
+ Requires-Dist: twine>=6.1.0
10
+ Dynamic: requires-dist
@@ -0,0 +1,9 @@
1
+ README.md
2
+ setup.py
3
+ ragit/__init__.py
4
+ ragit/main.py
5
+ ragit.egg-info/PKG-INFO
6
+ ragit.egg-info/SOURCES.txt
7
+ ragit.egg-info/dependency_links.txt
8
+ ragit.egg-info/requires.txt
9
+ ragit.egg-info/top_level.txt
@@ -0,0 +1,6 @@
1
+ sentence-transformers>=3.4.1
2
+ pandas>=2.2.3
3
+ chromadb>=0.6.3
4
+ setuptools>=75.8.0
5
+ wheel>=0.45.1
6
+ twine>=6.1.0
@@ -0,0 +1 @@
1
+ ragit
ragit-0.1/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
ragit-0.1/setup.py ADDED
@@ -0,0 +1,11 @@
1
from setuptools import setup, find_packages

# Packaging script for ragit.
# install_requires lists only the packages that ragit imports at runtime.
# Build and publishing tools (setuptools, wheel, twine) are left out so they
# are not installed into every user's environment.
setup(
    name="ragit",
    version="0.1",
    packages=find_packages(),
    install_requires=[
        "sentence-transformers>=3.4.1",
        "pandas>=2.2.3",
        "chromadb>=0.6.3",
    ],
)