langroid 0.39.5__py3-none-any.whl → 0.40.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -50,6 +50,13 @@ except ImportError:
50
50
  pass
51
51
 
52
52
  try:
53
+ from . import postgres
54
+ from .postgres import PostgresDB, PostgresDBConfig
55
+
56
+ postgres # silence linters
57
+ PostgresDB
58
+ PostgresDBConfig
59
+ __all__.extend(["postgres", "PostgresDB", "PostgresDBConfig"])
53
60
  from . import weaviatedb
54
61
  from .weaviatedb import WeaviateDBConfig, WeaviateDB
55
62
 
@@ -59,6 +59,7 @@ class VectorStore(ABC):
59
59
  from langroid.vector_store.lancedb import LanceDB, LanceDBConfig
60
60
  from langroid.vector_store.meilisearch import MeiliSearch, MeiliSearchConfig
61
61
  from langroid.vector_store.momento import MomentoVI, MomentoVIConfig
62
+ from langroid.vector_store.postgres import PostgresDB, PostgresDBConfig
62
63
  from langroid.vector_store.qdrantdb import QdrantDB, QdrantDBConfig
63
64
  from langroid.vector_store.weaviatedb import WeaviateDB, WeaviateDBConfig
64
65
 
@@ -72,6 +73,8 @@ class VectorStore(ABC):
72
73
  return LanceDB(config)
73
74
  elif isinstance(config, MeiliSearchConfig):
74
75
  return MeiliSearch(config)
76
+ elif isinstance(config, PostgresDBConfig):
77
+ return PostgresDB(config)
75
78
  elif isinstance(config, WeaviateDBConfig):
76
79
  return WeaviateDB(config)
77
80
 
@@ -0,0 +1,414 @@
1
+ import hashlib
2
+ import json
3
+ import logging
4
+ import os
5
+ import uuid
6
+ from typing import Any, Dict, List, Optional, Sequence, Tuple
7
+
8
+ from sqlalchemy import (
9
+ Column,
10
+ MetaData,
11
+ String,
12
+ Table,
13
+ case,
14
+ create_engine,
15
+ inspect,
16
+ text,
17
+ )
18
+ from sqlalchemy.dialects.postgresql import JSONB
19
+ from sqlalchemy.engine import Connection, Engine
20
+ from sqlalchemy.orm import sessionmaker
21
+ from sqlalchemy.sql.expression import insert
22
+
23
+ from langroid.embedding_models.base import (
24
+ EmbeddingModelsConfig,
25
+ )
26
+ from langroid.embedding_models.models import OpenAIEmbeddingsConfig
27
+ from langroid.exceptions import LangroidImportError
28
+ from langroid.mytypes import DocMetaData, Document
29
+ from langroid.vector_store.base import VectorStore, VectorStoreConfig
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+
34
# Settings for the pgvector-backed `PostgresDB` vector store.
class PostgresDBConfig(VectorStoreConfig):
    # Name of the table that holds the documents and their embeddings.
    collection_name: str = "embeddings"

    # When True, the connection string is read from the
    # POSTGRES_CONNECTION_STRING environment variable.
    cloud: bool = False

    # When True (and `cloud` is False), the connection is assembled from
    # POSTGRES_USER / POSTGRES_PASSWORD / POSTGRES_DB plus `host` and `port`.
    docker: bool = True
    host: str = "127.0.0.1"
    port: int = 5432

    # When True, an existing collection of the same name is dropped before
    # the table is (re)created.
    replace_collection: bool = False

    # Embedding model configuration used by the store.
    embedding: EmbeddingModelsConfig = OpenAIEmbeddingsConfig()

    # Passed straight to SQLAlchemy's `create_engine`.
    pool_size: int = 10
    max_overflow: int = 20

    # Emitted as the `m` and `ef_construction` options of the HNSW index.
    hnsw_m: int = 16
    hnsw_ef_construction: int = 200
46
+
47
+
48
class PostgresDB(VectorStore):
    """Vector store backed by PostgreSQL with the pgvector extension.

    Each collection is a single table with the columns ``id``, ``embedding``,
    ``document`` and ``cmetadata`` (JSONB), plus an HNSW cosine-distance index
    on ``embedding``.
    """

    # NOTE(review): the default `config` instance is created once at class
    # definition time and is mutated by `set_collection`; callers that rely on
    # the default share that state. Kept as-is to preserve the signature.
    def __init__(self, config: PostgresDBConfig = PostgresDBConfig()):
        """Connect to the database and make sure the collection table exists.

        Args:
            config: connection and collection settings.
        """
        super().__init__(config)
        self.config: PostgresDBConfig = config
        self.engine = self._create_engine()
        PostgresDB._create_vector_extension(self.engine)
        self.SessionLocal = sessionmaker(
            autocommit=False, autoflush=False, bind=self.engine
        )
        self.metadata = MetaData()
        self._setup_table()

    # ------------------------------------------------------------------ #
    # Connection / DDL helpers
    # ------------------------------------------------------------------ #

    def _create_engine(self) -> Engine:
        """Create a SQLAlchemy engine based on the configuration.

        Returns:
            An engine for either the cloud connection string or the
            docker-style host/port/credentials.

        Raises:
            ValueError: if the required environment variables are missing or
                neither ``cloud`` nor ``docker`` is enabled.
        """
        # Local import: the module-level import block does not bring in URL.
        from sqlalchemy.engine import URL

        target: Any
        if self.config.cloud:
            connection_string = os.getenv("POSTGRES_CONNECTION_STRING")
            if not connection_string:
                raise ValueError("Provide the POSTGRES_CONNECTION_STRING.")
            # SQLAlchemy does not accept the legacy "postgres://" scheme.
            if connection_string.startswith("postgres://"):
                connection_string = connection_string.replace(
                    "postgres://", "postgresql+psycopg2://", 1
                )
            target = connection_string
        elif self.config.docker:
            username = os.getenv("POSTGRES_USER", "postgres")
            password = os.getenv("POSTGRES_PASSWORD", "postgres")
            database = os.getenv("POSTGRES_DB", "langroid")

            if not (username and password and database):
                raise ValueError(
                    "Provide POSTGRES_USER, POSTGRES_PASSWORD, " "POSTGRES_DB. "
                )

            # Build the URL from components so that credentials containing
            # characters such as "@", ":" or "/" are escaped correctly.
            target = URL.create(
                drivername="postgresql+psycopg2",
                username=username,
                password=password,
                host=self.config.host,
                port=self.config.port,
                database=database,
            )
        else:
            raise ValueError(
                "Provide either Docker or Cloud config to connect to the database."
            )

        return create_engine(
            target,
            pool_size=self.config.pool_size,
            max_overflow=self.config.max_overflow,
        )

    def _quote_identifier(self, name: str) -> str:
        """Quote ``name`` for use in raw SQL, only if the dialect requires it."""
        return str(self.engine.dialect.identifier_preparer.quote(name))

    @staticmethod
    def _hnsw_index_name(collection_name: str) -> str:
        """Return the name of the HNSW index belonging to ``collection_name``."""
        return f"hnsw_index_{collection_name}_embedding"

    def _autocommit_connection(self) -> Connection:
        """Return a connection in AUTOCOMMIT mode.

        ``CREATE/DROP INDEX CONCURRENTLY`` cannot run inside a transaction
        block, so these statements need a connection that does not open one.
        """
        return self.engine.connect().execution_options(isolation_level="AUTOCOMMIT")

    def _setup_table(self) -> None:
        """Create the collection table and its HNSW index if they are missing.

        Drops the existing collection first when ``config.replace_collection``
        is set.

        Raises:
            LangroidImportError: if ``pgvector`` is not installed.
        """
        try:
            from pgvector.sqlalchemy import Vector
        except ImportError as e:
            raise LangroidImportError(extra="postgres", error=str(e)) from e

        if self.config.replace_collection:
            self.delete_collection(self.config.collection_name)

        self.embeddings_table = Table(
            self.config.collection_name,
            self.metadata,
            Column("id", String, primary_key=True, nullable=False, unique=True),
            Column("embedding", Vector(self.embedding_dim)),
            Column("document", String),
            Column("cmetadata", JSONB),
            extend_existing=True,
        )

        self.metadata.create_all(self.engine)
        self.metadata.reflect(bind=self.engine, only=[self.config.collection_name])

        # Create the HNSW index on the embedding column if it does not exist.
        # It serves nearest-neighbour search by cosine distance; see
        # https://github.com/pgvector/pgvector?tab=readme-ov-file#hnsw
        index_name = self._hnsw_index_name(self.config.collection_name)
        with self._autocommit_connection() as connection:
            if self.index_exists(connection, index_name):
                return
            # Identifiers are quoted the same way SQLAlchemy quotes the table
            # name above, so mixed-case / special-character names match.
            create_index_query = text(
                f"""
                CREATE INDEX CONCURRENTLY IF NOT EXISTS
                    {self._quote_identifier(index_name)}
                ON {self._quote_identifier(self.config.collection_name)}
                USING hnsw (embedding vector_cosine_ops)
                WITH (
                    m = {int(self.config.hnsw_m)},
                    ef_construction = {int(self.config.hnsw_ef_construction)}
                );
                """
            )
            connection.execute(create_index_query)

    def index_exists(self, connection: Connection, index_name: str) -> bool:
        """Check whether an index called ``index_name`` is listed in pg_indexes."""
        query = text(
            "SELECT 1 FROM pg_indexes WHERE indexname = :index_name"
        ).bindparams(index_name=index_name)
        return bool(connection.execute(query).scalar())

    @staticmethod
    def _create_vector_extension(conn: Engine) -> None:
        """Ensure the ``vector`` extension is installed.

        Args:
            conn: engine used to open the connection.
        """
        with conn.connect() as connection:
            with connection.begin():
                # Transaction-scoped advisory lock so concurrent callers do not
                # race on CREATE EXTENSION. The number is an arbitrary 64-bit
                # key; the lock is released when the transaction ends.
                connection.execute(
                    text("SELECT pg_advisory_xact_lock(1573678846307946496)")
                )
                connection.execute(text("CREATE EXTENSION IF NOT EXISTS vector"))

    def _is_table_empty(self, table_name: str) -> bool:
        """Return True if ``table_name`` has no rows.

        The probe runs in its own short-lived session so that its transaction
        is finished before the caller does anything else with the table.
        """
        table = Table(table_name, self.metadata, autoload_with=self.engine)
        with self.SessionLocal() as session:
            has_rows = session.query(table.select().limit(1).exists()).scalar()
        return not has_rows

    # ------------------------------------------------------------------ #
    # Collection management
    # ------------------------------------------------------------------ #

    def set_collection(self, collection_name: str, replace: bool = False) -> None:
        """Switch to ``collection_name``, creating (or re-creating) its table.

        Does nothing when the requested collection is already the current one,
        its table exists, and ``replace`` is False.
        """
        table_exists = collection_name in inspect(self.engine).get_table_names()
        already_current = (
            collection_name == self.config.collection_name
            and table_exists
            and not replace
        )
        if already_current:
            return

        self.config.collection_name = collection_name
        self.config.replace_collection = replace
        self._setup_table()

    def list_collections(self, empty: bool = True) -> List[str]:
        """List table names in the database.

        Args:
            empty: when True, include tables without rows; when False, return
                only tables that contain at least one row.
        """
        table_names = list(inspect(self.engine).get_table_names())
        if empty:
            # No need to look inside the tables when everything is wanted.
            return table_names
        return [name for name in table_names if not self._is_table_empty(name)]

    def create_collection(self, collection_name: str, replace: bool = False) -> None:
        """Create ``collection_name`` (see `set_collection`)."""
        self.set_collection(collection_name, replace=replace)

    def delete_collection(self, collection_name: str) -> None:
        """Drop a collection's HNSW index and table, then refresh metadata."""
        # Drop the index first; CONCURRENTLY requires running outside a
        # transaction block, hence the AUTOCOMMIT connection.
        index_name = self._hnsw_index_name(collection_name)
        with self._autocommit_connection() as connection:
            connection.execute(
                text(
                    "DROP INDEX CONCURRENTLY IF EXISTS "
                    f"{self._quote_identifier(index_name)}"
                )
            )

        # Drop the table itself (no-op if it does not exist).
        table = Table(collection_name, self.metadata)
        table.drop(self.engine, checkfirst=True)

        # Rebuild the metadata so it no longer references the dropped table.
        self.metadata.clear()
        self.metadata.reflect(bind=self.engine)

    def clear_all_collections(self, really: bool = False, prefix: str = "") -> int:
        """Delete every table whose name starts with ``prefix``.

        Args:
            really: safety switch; nothing is deleted unless True.
            prefix: only tables with this name prefix are deleted.

        Returns:
            Number of tables deleted.
        """
        if not really:
            logger.warning("Not deleting all tables, set really=True to confirm")
            return 0

        targets = [
            name
            for name in inspect(self.engine).get_table_names()
            if name.startswith(prefix)
        ]
        for name in targets:
            # delete_collection handles both the index and the table.
            self.delete_collection(name)

        deleted_count = len(targets)
        logger.warning(f"Deleted {deleted_count} tables with prefix '{prefix}'.")
        return deleted_count

    def clear_empty_collections(self) -> int:
        """Delete every table that contains no rows.

        Returns:
            Number of tables deleted.
        """
        table_names = inspect(self.engine).get_table_names()

        # Probe all tables first, each in its own closed session, and only
        # then drop: an open read transaction on a table can block a DROP
        # issued for that table from another connection.
        empty_tables = [name for name in table_names if self._is_table_empty(name)]
        for name in empty_tables:
            self.delete_collection(name)

        deleted_count = len(empty_tables)
        logger.warning(f"Deleted {deleted_count} empty tables.")
        return deleted_count

    # ------------------------------------------------------------------ #
    # Documents
    # ------------------------------------------------------------------ #

    def _parse_embedding_store_record(self, res: Any) -> Dict[str, Any]:
        """Convert a result row into keyword arguments for `Document`.

        Args:
            res: row exposing ``id``, ``document`` and ``cmetadata``.
        """
        # Build a new dict rather than mutating the row's JSON value.
        metadata = {**(res.cmetadata or {}), "id": res.id}
        return {
            "content": res.document,
            "metadata": DocMetaData(**metadata),
        }

    def get_all_documents(self, where: str = "") -> List[Document]:
        """Return all documents, optionally filtered by metadata.

        Args:
            where: JSON string; rows whose ``cmetadata`` contains this JSON
                value are returned. Empty means no filter.

        Returns:
            Matching documents; an empty list if ``where`` is not valid JSON
            (the error is logged).
        """
        with self.SessionLocal() as session:
            query = session.query(self.embeddings_table)

            if where:
                try:
                    where_json = json.loads(where)
                except json.JSONDecodeError:
                    logger.error(f"Invalid JSON in 'where' clause: {where}")
                    return []
                query = query.filter(
                    self.embeddings_table.c.cmetadata.contains(where_json)
                )

            rows = query.all()
            return [Document(**self._parse_embedding_store_record(r)) for r in rows]

    def get_documents_by_ids(self, ids: List[str]) -> List[Document]:
        """Return the documents with the given ids, in the order of ``ids``.

        Ids that are not present in the table are skipped.
        """
        if not ids:
            # A CASE expression needs at least one WHEN branch.
            return []

        with self.SessionLocal() as session:
            # CASE id WHEN <id0> THEN 0 WHEN <id1> THEN 1 ... preserves order.
            position = case(
                {id_: index for index, id_ in enumerate(ids)},
                value=self.embeddings_table.c.id,
            )
            rows = (
                session.query(self.embeddings_table)
                .filter(self.embeddings_table.c.id.in_(ids))
                .order_by(position)
                .all()
            )
            return [Document(**self._parse_embedding_store_record(r)) for r in rows]

    def add_documents(self, documents: Sequence[Document]) -> None:
        """Embed ``documents`` and insert them in batches of ``config.batch_size``.

        Each document's ``metadata.id`` is replaced by a UUID string (see
        `_id_to_uuid`) before insertion.
        """
        if not documents:
            return

        super().maybe_add_ids(documents)
        for doc in documents:
            doc.metadata.id = str(PostgresDB._id_to_uuid(doc.metadata.id, doc.metadata))

        embeddings = self.embedding_fn([doc.content for doc in documents])

        batch_size = self.config.batch_size
        with self.SessionLocal() as session:
            for start in range(0, len(documents), batch_size):
                batch_docs = documents[start : start + batch_size]
                batch_embeddings = embeddings[start : start + batch_size]

                new_records = [
                    {
                        "id": doc.metadata.id,
                        "embedding": embedding,
                        "document": doc.content,
                        "cmetadata": doc.metadata.dict(),
                    }
                    for doc, embedding in zip(batch_docs, batch_embeddings)
                ]

                if new_records:
                    session.execute(insert(self.embeddings_table).values(new_records))
                    session.commit()

    @staticmethod
    def _id_to_uuid(id: str, obj: object) -> str:
        """Return ``id`` as a canonical UUID string.

        If ``id`` is not already a valid UUID, a deterministic UUIDv5 is
        derived from ``id`` combined with a SHA-256 hash of ``repr(obj)``.
        """
        try:
            return str(uuid.UUID(id))
        except ValueError:
            obj_hash = hashlib.sha256(repr(obj).encode()).hexdigest()
            return str(uuid.uuid5(uuid.NAMESPACE_DNS, f"{id}-{obj_hash}"))

    def similar_texts_with_scores(
        self,
        query: str,
        k: int = 1,
        where: Optional[str] = None,
        neighbors: int = 1,  # Parameter not used in this implementation
    ) -> List[Tuple[Document, float]]:
        """Return the ``k`` documents most similar to ``query``.

        Args:
            query: text to embed and search for.
            k: maximum number of results.
            where: optional JSON string; only rows whose ``cmetadata``
                contains this JSON value are considered. None or empty
                means no filter.
            neighbors: unused.

        Returns:
            ``(document, score)`` pairs, best first, where
            ``score = 1 - cosine_distance``.

        Raises:
            ValueError: if ``where`` is not valid JSON.
        """
        table = self.embeddings_table

        filters = []
        if where:
            try:
                filters.append(table.c.cmetadata.contains(json.loads(where)))
            except json.JSONDecodeError as err:
                raise ValueError(f"Invalid JSON in 'where' clause: {where}") from err

        embedding = self.embedding_fn([query])[0]
        distance = table.c.embedding.cosine_distance(embedding)
        score = (1 - distance).label("score")

        with self.SessionLocal() as session:
            rows = (
                session.query(table.c.id, table.c.document, table.c.cmetadata, score)
                .filter(*filters)
                # Order by the raw distance, ascending: same ranking as
                # score DESC, and the form the pgvector README documents
                # for index-backed nearest-neighbour queries.
                .order_by(distance)
                .limit(k)
                .all()
            )

        return [
            (Document(**self._parse_embedding_store_record(row)), row.score)
            for row in rows
        ]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: langroid
3
- Version: 0.39.5
3
+ Version: 0.40.0
4
4
  Summary: Harness LLMs with Multi-Agent Programming
5
5
  Author-email: Prasad Chalasani <pchalasani@gmail.com>
6
6
  License: MIT
@@ -64,6 +64,8 @@ Requires-Dist: litellm<2.0.0,>=1.30.1; extra == 'all'
64
64
  Requires-Dist: metaphor-python<0.2.0,>=0.1.23; extra == 'all'
65
65
  Requires-Dist: neo4j<6.0.0,>=5.14.1; extra == 'all'
66
66
  Requires-Dist: pdf2image<2.0.0,>=1.17.0; extra == 'all'
67
+ Requires-Dist: pgvector>=0.3.6; extra == 'all'
68
+ Requires-Dist: psycopg2-binary>=2.9.10; extra == 'all'
67
69
  Requires-Dist: psycopg2<3.0.0,>=2.9.7; extra == 'all'
68
70
  Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17; extra == 'all'
69
71
  Requires-Dist: pymupdf<2.0.0,>=1.23.3; extra == 'all'
@@ -140,7 +142,10 @@ Requires-Dist: pymupdf<2.0.0,>=1.23.3; extra == 'pdf-parsers'
140
142
  Requires-Dist: pypdf>=5.1.0; extra == 'pdf-parsers'
141
143
  Requires-Dist: pytesseract<0.4.0,>=0.3.10; extra == 'pdf-parsers'
142
144
  Provides-Extra: postgres
145
+ Requires-Dist: pgvector>=0.3.6; extra == 'postgres'
146
+ Requires-Dist: psycopg2-binary>=2.9.10; extra == 'postgres'
143
147
  Requires-Dist: psycopg2<3.0.0,>=2.9.7; extra == 'postgres'
148
+ Requires-Dist: sqlalchemy<3.0.0,>=2.0.19; extra == 'postgres'
144
149
  Provides-Extra: pymupdf4llm
145
150
  Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17; extra == 'pymupdf4llm'
146
151
  Provides-Extra: scrapy
@@ -116,15 +116,16 @@ langroid/utils/output/__init__.py,sha256=7P0f--4IZneNsTxXY5fd6d6iW-CeVe-KSsl-87s
116
116
  langroid/utils/output/citations.py,sha256=9T69O_N6mxPQjQ-qC1vKS8_kyg1z5hDQXMhBsA45xkk,3147
117
117
  langroid/utils/output/printing.py,sha256=yzPJZN-8_jyOJmI9N_oLwEDfjMwVgk3IDiwnZ4eK_AE,2962
118
118
  langroid/utils/output/status.py,sha256=rzbE7mDJcgNNvdtylCseQcPGCGghtJvVq3lB-OPJ49E,1049
119
- langroid/vector_store/__init__.py,sha256=BcoOm1tG3y0EqjkIGmMOHkY9iTUhDHgyruknWDKgqIg,1214
120
- langroid/vector_store/base.py,sha256=69keYWkUD0fcGXC0STcdO1-jn8H4Ez-L_fnxmRvUoNw,14412
119
+ langroid/vector_store/__init__.py,sha256=iRAwrMn72NNQutdmYwtGFHywjX8r0rVwioUJBBPMESM,1432
120
+ langroid/vector_store/base.py,sha256=On7SY2hU7fvtuAvoHNjQEcaBBUx4OJem8BKyKri2Wx8,14581
121
121
  langroid/vector_store/chromadb.py,sha256=p9mEqJwO2BrL2jSSXfa23kCPlPOwWpF3xJYd5zoWw_c,8661
122
122
  langroid/vector_store/lancedb.py,sha256=Qd20gKjWozPWfW5-D66J6U8dSrJo1yl-maj6s1lbf1c,14688
123
123
  langroid/vector_store/meilisearch.py,sha256=6frB7GFWeWmeKzRfLZIvzRjllniZ1cYj3HmhHQICXLs,11663
124
124
  langroid/vector_store/momento.py,sha256=xOaU7Hlyyn_5ihb0ARS5JHtmrKrTCt2IdRA-ioMM5ek,10307
125
+ langroid/vector_store/postgres.py,sha256=-bQ_AXpIkoK_lg8k6qt7pEz8gZuTXHuhnAPXhqYpUQ0,15697
125
126
  langroid/vector_store/qdrantdb.py,sha256=v7TAsIoj_vxeKDYS9tpwJLBZA8fuTweTYxHo0X_uawM,17949
126
127
  langroid/vector_store/weaviatedb.py,sha256=cMg9kqJXlD1WURs6QivHvwausCyLYGr4mOK2v9uYkhw,11105
127
- langroid-0.39.5.dist-info/METADATA,sha256=BPira_zYZOFY685gg9eWMVM7q-x8o710qWCNBRKJAMw,60634
128
- langroid-0.39.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
129
- langroid-0.39.5.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
130
- langroid-0.39.5.dist-info/RECORD,,
128
+ langroid-0.40.0.dist-info/METADATA,sha256=6e_B25ingRSVwU_fIbpAu2pP5sYCHc3Bz1Y-TTXyMA0,60910
129
+ langroid-0.40.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
130
+ langroid-0.40.0.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
131
+ langroid-0.40.0.dist-info/RECORD,,