langchain-postgres 0.0.9__tar.gz → 0.0.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,17 +1,17 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: langchain-postgres
3
- Version: 0.0.9
3
+ Version: 0.0.11
4
4
  Summary: An integration package connecting Postgres and LangChain
5
5
  Home-page: https://github.com/langchain-ai/langchain-postgres
6
6
  License: MIT
7
- Requires-Python: >=3.8.1,<4.0.0
7
+ Requires-Python: >=3.9,<4.0
8
8
  Classifier: License :: OSI Approved :: MIT License
9
9
  Classifier: Programming Language :: Python :: 3
10
10
  Classifier: Programming Language :: Python :: 3.9
11
11
  Classifier: Programming Language :: Python :: 3.10
12
12
  Classifier: Programming Language :: Python :: 3.11
13
13
  Classifier: Programming Language :: Python :: 3.12
14
- Requires-Dist: langchain-core (>=0.1.50,<0.3)
14
+ Requires-Dist: langchain-core (>=0.2.13,<0.4.0)
15
15
  Requires-Dist: numpy (>=1,<2)
16
16
  Requires-Dist: pgvector (>=0.2.5,<0.3.0)
17
17
  Requires-Dist: psycopg (>=3,<4)
@@ -30,10 +30,8 @@ def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray:
30
30
 
31
31
  X = np.array(X, dtype=np.float32)
32
32
  Y = np.array(Y, dtype=np.float32)
33
- Z = 1 - simd.cdist(X, Y, metric="cosine")
34
- if isinstance(Z, float):
35
- return np.array([Z])
36
- return np.array(Z)
33
+ Z = 1 - np.array(simd.cdist(X, Y, metric="cosine"))
34
+ return Z
37
35
  except ImportError:
38
36
  logger.debug(
39
37
  "Unable to import simsimd, defaulting to NumPy implementation. If you want "
@@ -246,98 +246,130 @@ DBConnection = Union[sqlalchemy.engine.Engine, str]
246
246
 
247
247
 
248
248
  class PGVector(VectorStore):
249
- """Vectorstore implementation using Postgres as the backend.
249
+ """Postgres vector store integration.
250
250
 
251
- Currently, there is no mechanism for supporting data migration.
251
+ Setup:
252
+ Install ``langchain-postgres`` and start the pgvector-enabled Postgres docker container.
252
253
 
253
- So breaking changes in the vectorstore schema will require the user to recreate
254
- the tables and re-add the documents.
254
+ .. code-block:: bash
255
255
 
256
- If this is a concern, please use a different vectorstore. If
257
- not, this implementation should be fine for your use case.
256
+ pip install -qU langchain-postgres
257
+ docker run --name pgvector-container -e POSTGRES_USER=langchain -e POSTGRES_PASSWORD=langchain -e POSTGRES_DB=langchain -p 6024:5432 -d pgvector/pgvector:pg16
258
258
 
259
- To use this vectorstore you need to have the `vector` extension installed.
260
- The `vector` extension is a Postgres extension that provides vector
261
- similarity search capabilities.
259
+ Key init args — indexing params:
260
+ collection_name: str
261
+ Name of the collection.
262
+ embeddings: Embeddings
263
+ Embedding function to use.
262
264
 
263
- ```sh
264
- docker run --name pgvector-container -e POSTGRES_PASSWORD=...
265
- -d pgvector/pgvector:pg16
266
- ```
265
+ Key init args — client params:
266
+ connection: Union[None, DBConnection, Engine, AsyncEngine, str]
267
+ Connection string or engine.
267
268
 
268
- Example:
269
+ Instantiate:
269
270
  .. code-block:: python
270
271
 
272
+ from langchain_postgres import PGVector
271
273
  from langchain_postgres.vectorstores import PGVector
272
- from langchain_openai.embeddings import OpenAIEmbeddings
273
-
274
- connection_string = "postgresql+psycopg://..."
275
- collection_name = "state_of_the_union_test"
276
- embeddings = OpenAIEmbeddings()
277
- vectorstore = PGVector.from_documents(
278
- embedding=embeddings,
279
- documents=docs,
280
- connection=connection_string,
274
+ from langchain_openai import OpenAIEmbeddings
275
+
276
+ # See docker command above to launch a postgres instance with pgvector enabled.
277
+ connection = "postgresql+psycopg://langchain:langchain@localhost:6024/langchain" # Uses psycopg3!
278
+ collection_name = "my_docs"
279
+
280
+ vector_store = PGVector(
281
+ embeddings=OpenAIEmbeddings(model="text-embedding-3-large"),
281
282
  collection_name=collection_name,
283
+ connection=connection,
282
284
  use_jsonb=True,
283
- async_mode=False,
284
285
  )
285
286
 
287
+ Add Documents:
288
+ .. code-block:: python
289
+
290
+ from langchain_core.documents import Document
291
+
292
+ document_1 = Document(page_content="foo", metadata={"baz": "bar"})
293
+ document_2 = Document(page_content="thud", metadata={"bar": "baz"})
294
+ document_3 = Document(page_content="i will be deleted :(")
295
+
296
+ documents = [document_1, document_2, document_3]
297
+ ids = ["1", "2", "3"]
298
+ vector_store.add_documents(documents=documents, ids=ids)
299
+
300
+ Delete Documents:
301
+ .. code-block:: python
302
+
303
+ vector_store.delete(ids=["3"])
304
+
305
+ Search:
306
+ .. code-block:: python
307
+
308
+ results = vector_store.similarity_search(query="thud",k=1)
309
+ for doc in results:
310
+ print(f"* {doc.page_content} [{doc.metadata}]")
311
+
312
+ .. code-block:: python
313
+
314
+ * thud [{'bar': 'baz'}]
315
+
316
+ Search with filter:
317
+ .. code-block:: python
318
+
319
+ results = vector_store.similarity_search(query="thud",k=1,filter={"bar": "baz"})
320
+ for doc in results:
321
+ print(f"* {doc.page_content} [{doc.metadata}]")
322
+
323
+ .. code-block:: python
324
+
325
+ * thud [{'bar': 'baz'}]
326
+
327
+ Search with score:
328
+ .. code-block:: python
329
+
330
+ results = vector_store.similarity_search_with_score(query="qux",k=1)
331
+ for doc, score in results:
332
+ print(f"* [SIM={score:3f}] {doc.page_content} [{doc.metadata}]")
333
+
334
+ .. code-block:: python
335
+
336
+ * [SIM=0.499243] foo [{'baz': 'bar'}]
337
+
338
+ Async:
339
+ .. code-block:: python
340
+
341
+ # add documents
342
+ # await vector_store.aadd_documents(documents=documents, ids=ids)
343
+
344
+ # delete documents
345
+ # await vector_store.adelete(ids=["3"])
346
+
347
+ # search
348
+ # results = await vector_store.asimilarity_search(query="thud",k=1)
349
+
350
+ # search with score
351
+ results = await vector_store.asimilarity_search_with_score(query="qux",k=1)
352
+ for doc,score in results:
353
+ print(f"* [SIM={score:3f}] {doc.page_content} [{doc.metadata}]")
354
+
355
+ .. code-block:: python
356
+
357
+ * [SIM=0.499243] foo [{'baz': 'bar'}]
358
+
359
+ Use as Retriever:
360
+ .. code-block:: python
361
+
362
+ retriever = vector_store.as_retriever(
363
+ search_type="mmr",
364
+ search_kwargs={"k": 1, "fetch_k": 2, "lambda_mult": 0.5},
365
+ )
366
+ retriever.invoke("thud")
367
+
368
+ .. code-block:: python
369
+
370
+ [Document(metadata={'bar': 'baz'}, page_content='thud')]
286
371
 
287
- This code has been ported over from langchain_community with minimal changes
288
- to allow users to easily transition from langchain_community to langchain_postgres.
289
-
290
- Some changes had to be made to address issues with the community implementation:
291
- * langchain_postgres now works with psycopg3. Please update your
292
- connection strings from `postgresql+psycopg2://...` to
293
- `postgresql+psycopg://langchain:langchain@...`
294
- (yes, the driver name is `psycopg` not `psycopg3`)
295
- * The schema of the embedding store and collection have been changed to make
296
- add_documents work correctly with user specified ids, specifically
297
- when overwriting existing documents.
298
- You will need to recreate the tables if you are using an existing database.
299
- * A Connection object has to be provided explicitly. Connections will not be
300
- picked up automatically based on env variables.
301
- * langchain_postgres now accept async connections. If you want to use the async
302
- version, you need to set `async_mode=True` when initializing the store or
303
- use an async engine.
304
-
305
- Supported filter operators:
306
-
307
- * $eq: Equality operator
308
- * $ne: Not equal operator
309
- * $lt: Less than operator
310
- * $lte: Less than or equal operator
311
- * $gt: Greater than operator
312
- * $gte: Greater than or equal operator
313
- * $in: In operator
314
- * $nin: Not in operator
315
- * $between: Between operator
316
- * $exists: Exists operator
317
- * $like: Like operator
318
- * $ilike: Case insensitive like operator
319
- * $and: Logical AND operator
320
- * $or: Logical OR operator
321
- * $not: Logical NOT operator
322
-
323
- Example:
324
-
325
- .. code-block:: python
326
-
327
- vectorstore.similarity_search('kitty', k=10, filter={
328
- 'id': {'$in': [1, 5, 2, 9]}
329
- })
330
- #%% md
331
-
332
- If you provide a dict with multiple fields, but no operators,
333
- the top level will be interpreted as a logical **AND** filter
334
-
335
- vectorstore.similarity_search('ducks', k=10, filter={
336
- 'id': {'$in': [1, 5, 2, 9]},
337
- 'location': {'$in': ["pond", "market"]}
338
- })
339
-
340
- """
372
+ """ # noqa: E501
341
373
 
342
374
  def __init__(
343
375
  self,
@@ -714,7 +746,7 @@ class PGVector(VectorStore):
714
746
 
715
747
  def add_embeddings(
716
748
  self,
717
- texts: Iterable[str],
749
+ texts: Sequence[str],
718
750
  embeddings: List[List[float]],
719
751
  metadatas: Optional[List[dict]] = None,
720
752
  ids: Optional[List[str]] = None,
@@ -732,7 +764,9 @@ class PGVector(VectorStore):
732
764
  """
733
765
  assert not self._async_engine, "This method must be called with sync_mode"
734
766
  if ids is None:
735
- ids = [str(uuid.uuid4()) for _ in texts]
767
+ ids_ = [str(uuid.uuid4()) for _ in texts]
768
+ else:
769
+ ids_ = [id if id is not None else str(uuid.uuid4()) for id in ids]
736
770
 
737
771
  if not metadatas:
738
772
  metadatas = [{} for _ in texts]
@@ -750,7 +784,7 @@ class PGVector(VectorStore):
750
784
  "cmetadata": metadata or {},
751
785
  }
752
786
  for text, metadata, embedding, id in zip(
753
- texts, metadatas, embeddings, ids
787
+ texts, metadatas, embeddings, ids_
754
788
  )
755
789
  ]
756
790
  stmt = insert(self.EmbeddingStore).values(data)
@@ -766,11 +800,11 @@ class PGVector(VectorStore):
766
800
  session.execute(on_conflict_stmt)
767
801
  session.commit()
768
802
 
769
- return ids
803
+ return ids_
770
804
 
771
805
  async def aadd_embeddings(
772
806
  self,
773
- texts: Iterable[str],
807
+ texts: Sequence[str],
774
808
  embeddings: List[List[float]],
775
809
  metadatas: Optional[List[dict]] = None,
776
810
  ids: Optional[List[str]] = None,
@@ -787,8 +821,11 @@ class PGVector(VectorStore):
787
821
  kwargs: vectorstore specific parameters
788
822
  """
789
823
  await self.__apost_init__() # Lazy async init
824
+
790
825
  if ids is None:
791
- ids = [str(uuid.uuid1()) for _ in texts]
826
+ ids_ = [str(uuid.uuid4()) for _ in texts]
827
+ else:
828
+ ids_ = [id if id is not None else str(uuid.uuid4()) for id in ids]
792
829
 
793
830
  if not metadatas:
794
831
  metadatas = [{} for _ in texts]
@@ -806,7 +843,7 @@ class PGVector(VectorStore):
806
843
  "cmetadata": metadata or {},
807
844
  }
808
845
  for text, metadata, embedding, id in zip(
809
- texts, metadatas, embeddings, ids
846
+ texts, metadatas, embeddings, ids_
810
847
  )
811
848
  ]
812
849
  stmt = insert(self.EmbeddingStore).values(data)
@@ -822,7 +859,7 @@ class PGVector(VectorStore):
822
859
  await session.execute(on_conflict_stmt)
823
860
  await session.commit()
824
861
 
825
- return ids
862
+ return ids_
826
863
 
827
864
  def add_texts(
828
865
  self,
@@ -844,9 +881,14 @@ class PGVector(VectorStore):
844
881
  List of ids from adding the texts into the vectorstore.
845
882
  """
846
883
  assert not self._async_engine, "This method must be called without async_mode"
847
- embeddings = self.embedding_function.embed_documents(list(texts))
884
+ texts_ = list(texts)
885
+ embeddings = self.embedding_function.embed_documents(texts_)
848
886
  return self.add_embeddings(
849
- texts=texts, embeddings=embeddings, metadatas=metadatas, ids=ids, **kwargs
887
+ texts=texts_,
888
+ embeddings=list(embeddings),
889
+ metadatas=list(metadatas) if metadatas else None,
890
+ ids=list(ids) if ids else None,
891
+ **kwargs,
850
892
  )
851
893
 
852
894
  async def aadd_texts(
@@ -869,9 +911,14 @@ class PGVector(VectorStore):
869
911
  List of ids from adding the texts into the vectorstore.
870
912
  """
871
913
  await self.__apost_init__() # Lazy async init
872
- embeddings = await self.embedding_function.aembed_documents(list(texts))
914
+ texts_ = list(texts)
915
+ embeddings = await self.embedding_function.aembed_documents(texts_)
873
916
  return await self.aadd_embeddings(
874
- texts=texts, embeddings=embeddings, metadatas=metadatas, ids=ids, **kwargs
917
+ texts=texts_,
918
+ embeddings=list(embeddings),
919
+ metadatas=list(metadatas) if metadatas else None,
920
+ ids=list(ids) if ids else None,
921
+ **kwargs,
875
922
  )
876
923
 
877
924
  def similarity_search(
@@ -1014,6 +1061,7 @@ class PGVector(VectorStore):
1014
1061
  docs = [
1015
1062
  (
1016
1063
  Document(
1064
+ id=str(result.EmbeddingStore.id),
1017
1065
  page_content=result.EmbeddingStore.document,
1018
1066
  metadata=result.EmbeddingStore.cmetadata,
1019
1067
  ),
@@ -2178,3 +2226,54 @@ class PGVector(VectorStore):
2178
2226
  )
2179
2227
  async with self.session_maker() as session:
2180
2228
  yield typing_cast(AsyncSession, session)
2229
+
2230
def get_by_ids(self, ids: Sequence[str], /) -> List[Document]:
    """Get documents by ids.

    Selects the embedding-store rows whose id is in ``ids`` and whose
    ``collection_id`` equals the uuid of this store's collection, and
    converts each row into a ``Document``.

    Args:
        ids: Ids of the documents to fetch (positional-only).

    Returns:
        One ``Document`` per matching row. The query applies no explicit
        ordering, and ids without a matching row simply produce no entry.
    """
    documents = []
    with self._make_sync_session() as session:
        collection = self.get_collection(session)
        # Restrict the lookup to rows that belong to this collection.
        filter_by = [self.EmbeddingStore.collection_id == collection.uuid]
        stmt = (
            select(
                self.EmbeddingStore,
            )
            .where(self.EmbeddingStore.id.in_(ids))
            .filter(*filter_by)
        )

        for result in session.execute(stmt).scalars().all():
            documents.append(
                Document(
                    # Coerce to str so the sync path returns the same id
                    # type as aget_by_ids and the similarity-search results.
                    id=str(result.id),
                    page_content=result.document,
                    metadata=result.cmetadata,
                )
            )
    return documents
2253
+
2254
async def aget_by_ids(self, ids: Sequence[str], /) -> List[Document]:
    """Asynchronously fetch documents by id.

    Args:
        ids: Ids of the documents to fetch (positional-only).

    Returns:
        A list with one ``Document`` for every embedding-store row whose id
        is in ``ids`` and whose ``collection_id`` matches this store's
        collection. Each document id is the row id converted with ``str``.
    """
    async with self._make_async_session() as session:
        collection = await self.aget_collection(session)
        # Only rows belonging to this collection are eligible.
        in_collection = self.EmbeddingStore.collection_id == collection.uuid
        stmt = (
            select(self.EmbeddingStore)
            .where(self.EmbeddingStore.id.in_(ids))
            .filter(in_collection)
        )

        rows: Sequence[Any] = (await session.execute(stmt)).scalars().all()

        # Build the documents while the session is still open.
        found = [
            Document(
                id=str(row.id),
                page_content=row.document,
                metadata=row.cmetadata,
            )
            for row in rows
        ]
    return found
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "langchain-postgres"
3
- version = "0.0.9"
3
+ version = "0.0.11"
4
4
  description = "An integration package connecting Postgres and LangChain"
5
5
  authors = []
6
6
  readme = "README.md"
@@ -11,8 +11,8 @@ license = "MIT"
11
11
  "Source Code" = "https://github.com/langchain-ai/langchain-postgres/tree/master/langchain_postgres"
12
12
 
13
13
  [tool.poetry.dependencies]
14
- python = "^3.8.1"
15
- langchain-core = ">=0.1.50,<0.3"
14
+ python = "^3.9"
15
+ langchain-core = ">=0.2.13,<0.4.0"
16
16
  psycopg = "^3"
17
17
  psycopg-pool = "^3.2.1"
18
18
  sqlalchemy = "^2"
@@ -24,6 +24,7 @@ numpy = "^1"
24
24
  [tool.poetry.group.dev.dependencies]
25
25
  jupyterlab = "^3.6.1"
26
26
 
27
+
27
28
  [tool.poetry.group.test]
28
29
  optional = true
29
30
 
@@ -33,6 +34,8 @@ pytest-asyncio = "^0.23.2"
33
34
  pytest-socket = "^0.7.0"
34
35
  pytest-cov = "^5.0.0"
35
36
  pytest-timeout = "^2.3.1"
37
+ langchain-core = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/core/"}
38
+ langchain-standard-tests = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/standard-tests/"}
36
39
 
37
40
  [tool.poetry.group.codespell]
38
41
  optional = true