langchain-postgres 0.0.14rc1__py3-none-any.whl → 0.0.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langchain_postgres/__init__.py +1 -1
- langchain_postgres/chat_message_histories.py +1 -0
- langchain_postgres/utils/pgvector_migrator.py +1 -1
- langchain_postgres/v2/async_vectorstore.py +190 -49
- langchain_postgres/v2/engine.py +65 -6
- langchain_postgres/v2/hybrid_search_config.py +212 -0
- langchain_postgres/v2/vectorstores.py +37 -0
- langchain_postgres/vectorstores.py +1 -8
- {langchain_postgres-0.0.14rc1.dist-info → langchain_postgres-0.0.16.dist-info}/METADATA +40 -29
- langchain_postgres-0.0.16.dist-info/RECORD +17 -0
- {langchain_postgres-0.0.14rc1.dist-info → langchain_postgres-0.0.16.dist-info}/WHEEL +1 -1
- langchain_postgres-0.0.14rc1.dist-info/RECORD +0 -16
- {langchain_postgres-0.0.14rc1.dist-info → langchain_postgres-0.0.16.dist-info/licenses}/LICENSE +0 -0
langchain_postgres/__init__.py
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
from importlib import metadata
|
2
2
|
|
3
3
|
from langchain_postgres.chat_message_histories import PostgresChatMessageHistory
|
4
|
-
from langchain_postgres.v2.engine import Column, PGEngine, ColumnDict
|
5
4
|
from langchain_postgres.translator import PGVectorTranslator
|
5
|
+
from langchain_postgres.v2.engine import Column, ColumnDict, PGEngine
|
6
6
|
from langchain_postgres.v2.vectorstores import PGVectorStore
|
7
7
|
from langchain_postgres.vectorstores import PGVector
|
8
8
|
|
@@ -68,7 +68,7 @@ async def __aextract_pgvector_collection(
|
|
68
68
|
if not rows:
|
69
69
|
break
|
70
70
|
yield [row._mapping for row in rows]
|
71
|
-
except ValueError
|
71
|
+
except ValueError:
|
72
72
|
raise ValueError(f"Collection, {collection_name} does not exist.")
|
73
73
|
except SQLAlchemyError as e:
|
74
74
|
raise ProgrammingError(
|
@@ -14,6 +14,7 @@ from sqlalchemy import RowMapping, text
|
|
14
14
|
from sqlalchemy.ext.asyncio import AsyncEngine
|
15
15
|
|
16
16
|
from .engine import PGEngine
|
17
|
+
from .hybrid_search_config import HybridSearchConfig
|
17
18
|
from .indexes import (
|
18
19
|
DEFAULT_DISTANCE_STRATEGY,
|
19
20
|
DEFAULT_INDEX_NAME_SUFFIX,
|
@@ -77,6 +78,7 @@ class AsyncPGVectorStore(VectorStore):
|
|
77
78
|
fetch_k: int = 20,
|
78
79
|
lambda_mult: float = 0.5,
|
79
80
|
index_query_options: Optional[QueryOptions] = None,
|
81
|
+
hybrid_search_config: Optional[HybridSearchConfig] = None,
|
80
82
|
):
|
81
83
|
"""AsyncPGVectorStore constructor.
|
82
84
|
Args:
|
@@ -95,6 +97,7 @@ class AsyncPGVectorStore(VectorStore):
|
|
95
97
|
fetch_k (int): Number of Documents to fetch to pass to MMR algorithm.
|
96
98
|
lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5.
|
97
99
|
index_query_options (QueryOptions): Index query option.
|
100
|
+
hybrid_search_config (HybridSearchConfig): Hybrid search configuration. Defaults to None.
|
98
101
|
|
99
102
|
|
100
103
|
Raises:
|
@@ -119,6 +122,7 @@ class AsyncPGVectorStore(VectorStore):
|
|
119
122
|
self.fetch_k = fetch_k
|
120
123
|
self.lambda_mult = lambda_mult
|
121
124
|
self.index_query_options = index_query_options
|
125
|
+
self.hybrid_search_config = hybrid_search_config
|
122
126
|
|
123
127
|
@classmethod
|
124
128
|
async def create(
|
@@ -139,6 +143,7 @@ class AsyncPGVectorStore(VectorStore):
|
|
139
143
|
fetch_k: int = 20,
|
140
144
|
lambda_mult: float = 0.5,
|
141
145
|
index_query_options: Optional[QueryOptions] = None,
|
146
|
+
hybrid_search_config: Optional[HybridSearchConfig] = None,
|
142
147
|
) -> AsyncPGVectorStore:
|
143
148
|
"""Create an AsyncPGVectorStore instance.
|
144
149
|
|
@@ -158,6 +163,7 @@ class AsyncPGVectorStore(VectorStore):
|
|
158
163
|
fetch_k (int): Number of Documents to fetch to pass to MMR algorithm.
|
159
164
|
lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5.
|
160
165
|
index_query_options (QueryOptions): Index query option.
|
166
|
+
hybrid_search_config (HybridSearchConfig): Hybrid search configuration. Defaults to None.
|
161
167
|
|
162
168
|
Returns:
|
163
169
|
AsyncPGVectorStore
|
@@ -193,9 +199,18 @@ class AsyncPGVectorStore(VectorStore):
|
|
193
199
|
raise ValueError(
|
194
200
|
f"Content column, {content_column}, is type, {content_type}. It must be a type of character string."
|
195
201
|
)
|
202
|
+
if hybrid_search_config:
|
203
|
+
tsv_column_name = (
|
204
|
+
hybrid_search_config.tsv_column
|
205
|
+
if hybrid_search_config.tsv_column
|
206
|
+
else content_column + "_tsv"
|
207
|
+
)
|
208
|
+
if tsv_column_name not in columns or columns[tsv_column_name] != "tsvector":
|
209
|
+
# mark tsv_column as empty because there is no TSV column in table
|
210
|
+
hybrid_search_config.tsv_column = ""
|
196
211
|
if embedding_column not in columns:
|
197
212
|
raise ValueError(f"Embedding column, {embedding_column}, does not exist.")
|
198
|
-
if columns[embedding_column]
|
213
|
+
if columns[embedding_column] not in ["USER-DEFINED", "vector"]:
|
199
214
|
raise ValueError(
|
200
215
|
f"Embedding column, {embedding_column}, is not type Vector."
|
201
216
|
)
|
@@ -236,6 +251,7 @@ class AsyncPGVectorStore(VectorStore):
|
|
236
251
|
fetch_k=fetch_k,
|
237
252
|
lambda_mult=lambda_mult,
|
238
253
|
index_query_options=index_query_options,
|
254
|
+
hybrid_search_config=hybrid_search_config,
|
239
255
|
)
|
240
256
|
|
241
257
|
@property
|
@@ -273,17 +289,30 @@ class AsyncPGVectorStore(VectorStore):
|
|
273
289
|
if len(self.metadata_columns) > 0
|
274
290
|
else ""
|
275
291
|
)
|
276
|
-
|
292
|
+
hybrid_search_column = (
|
293
|
+
f', "{self.hybrid_search_config.tsv_column}"'
|
294
|
+
if self.hybrid_search_config and self.hybrid_search_config.tsv_column
|
295
|
+
else ""
|
296
|
+
)
|
297
|
+
insert_stmt = f'INSERT INTO "{self.schema_name}"."{self.table_name}"("{self.id_column}", "{self.content_column}", "{self.embedding_column}"{hybrid_search_column}{metadata_col_names}'
|
277
298
|
values = {
|
278
|
-
"
|
299
|
+
"langchain_id": id,
|
279
300
|
"content": content,
|
280
301
|
"embedding": str([float(dimension) for dimension in embedding]),
|
281
302
|
}
|
282
|
-
values_stmt = "VALUES (:
|
303
|
+
values_stmt = "VALUES (:langchain_id, :content, :embedding"
|
283
304
|
|
284
305
|
if not embedding and can_inline_embed:
|
285
|
-
values_stmt = f"VALUES (:
|
306
|
+
values_stmt = f"VALUES (:langchain_id, :content, {self.embedding_service.embed_query_inline(content)}" # type: ignore
|
286
307
|
|
308
|
+
if self.hybrid_search_config and self.hybrid_search_config.tsv_column:
|
309
|
+
lang = (
|
310
|
+
f"'{self.hybrid_search_config.tsv_lang}',"
|
311
|
+
if self.hybrid_search_config.tsv_lang
|
312
|
+
else ""
|
313
|
+
)
|
314
|
+
values_stmt += f", to_tsvector({lang} :tsv_content)"
|
315
|
+
values["tsv_content"] = content
|
287
316
|
# Add metadata
|
288
317
|
extra = copy.deepcopy(metadata)
|
289
318
|
for metadata_column in self.metadata_columns:
|
@@ -308,6 +337,9 @@ class AsyncPGVectorStore(VectorStore):
|
|
308
337
|
|
309
338
|
upsert_stmt = f' ON CONFLICT ("{self.id_column}") DO UPDATE SET "{self.content_column}" = EXCLUDED."{self.content_column}", "{self.embedding_column}" = EXCLUDED."{self.embedding_column}"'
|
310
339
|
|
340
|
+
if self.hybrid_search_config and self.hybrid_search_config.tsv_column:
|
341
|
+
upsert_stmt += f', "{self.hybrid_search_config.tsv_column}" = EXCLUDED."{self.hybrid_search_config.tsv_column}"'
|
342
|
+
|
311
343
|
if self.metadata_json_column:
|
312
344
|
upsert_stmt += f', "{self.metadata_json_column}" = EXCLUDED."{self.metadata_json_column}"'
|
313
345
|
|
@@ -408,6 +440,7 @@ class AsyncPGVectorStore(VectorStore):
|
|
408
440
|
fetch_k: int = 20,
|
409
441
|
lambda_mult: float = 0.5,
|
410
442
|
index_query_options: Optional[QueryOptions] = None,
|
443
|
+
hybrid_search_config: Optional[HybridSearchConfig] = None,
|
411
444
|
**kwargs: Any,
|
412
445
|
) -> AsyncPGVectorStore:
|
413
446
|
"""Create an AsyncPGVectorStore instance from texts.
|
@@ -453,6 +486,7 @@ class AsyncPGVectorStore(VectorStore):
|
|
453
486
|
fetch_k=fetch_k,
|
454
487
|
lambda_mult=lambda_mult,
|
455
488
|
index_query_options=index_query_options,
|
489
|
+
hybrid_search_config=hybrid_search_config,
|
456
490
|
)
|
457
491
|
await vs.aadd_texts(texts, metadatas=metadatas, ids=ids, **kwargs)
|
458
492
|
return vs
|
@@ -478,6 +512,7 @@ class AsyncPGVectorStore(VectorStore):
|
|
478
512
|
fetch_k: int = 20,
|
479
513
|
lambda_mult: float = 0.5,
|
480
514
|
index_query_options: Optional[QueryOptions] = None,
|
515
|
+
hybrid_search_config: Optional[HybridSearchConfig] = None,
|
481
516
|
**kwargs: Any,
|
482
517
|
) -> AsyncPGVectorStore:
|
483
518
|
"""Create an AsyncPGVectorStore instance from documents.
|
@@ -524,6 +559,7 @@ class AsyncPGVectorStore(VectorStore):
|
|
524
559
|
fetch_k=fetch_k,
|
525
560
|
lambda_mult=lambda_mult,
|
526
561
|
index_query_options=index_query_options,
|
562
|
+
hybrid_search_config=hybrid_search_config,
|
527
563
|
)
|
528
564
|
texts = [doc.page_content for doc in documents]
|
529
565
|
metadatas = [doc.metadata for doc in documents]
|
@@ -538,16 +574,30 @@ class AsyncPGVectorStore(VectorStore):
|
|
538
574
|
filter: Optional[dict] = None,
|
539
575
|
**kwargs: Any,
|
540
576
|
) -> Sequence[RowMapping]:
|
541
|
-
"""
|
542
|
-
|
577
|
+
"""
|
578
|
+
Perform similarity search (or hybrid search) query on database.
|
579
|
+
Queries might be slow if the hybrid search column does not exist.
|
580
|
+
For best hybrid search performance, consider creating a TSV column
|
581
|
+
and adding GIN index.
|
582
|
+
"""
|
583
|
+
hybrid_search_config = kwargs.get(
|
584
|
+
"hybrid_search_config", self.hybrid_search_config
|
585
|
+
)
|
586
|
+
|
587
|
+
final_k = k if k is not None else self.k
|
588
|
+
|
589
|
+
dense_limit = final_k
|
590
|
+
if hybrid_search_config:
|
591
|
+
dense_limit = hybrid_search_config.primary_top_k
|
592
|
+
|
543
593
|
operator = self.distance_strategy.operator
|
544
594
|
search_function = self.distance_strategy.search_function
|
545
595
|
|
546
|
-
columns =
|
596
|
+
columns = [
|
547
597
|
self.id_column,
|
548
598
|
self.content_column,
|
549
599
|
self.embedding_column,
|
550
|
-
]
|
600
|
+
] + self.metadata_columns
|
551
601
|
if self.metadata_json_column:
|
552
602
|
columns.append(self.metadata_json_column)
|
553
603
|
|
@@ -557,16 +607,19 @@ class AsyncPGVectorStore(VectorStore):
|
|
557
607
|
filter_dict = None
|
558
608
|
if filter and isinstance(filter, dict):
|
559
609
|
safe_filter, filter_dict = self._create_filter_clause(filter)
|
560
|
-
|
610
|
+
|
561
611
|
inline_embed_func = getattr(self.embedding_service, "embed_query_inline", None)
|
562
612
|
if not embedding and callable(inline_embed_func) and "query" in kwargs:
|
563
613
|
query_embedding = self.embedding_service.embed_query_inline(kwargs["query"]) # type: ignore
|
614
|
+
embedding_data_string = f"{query_embedding}"
|
564
615
|
else:
|
565
616
|
query_embedding = f"{[float(dimension) for dimension in embedding]}"
|
566
|
-
|
567
|
-
|
617
|
+
embedding_data_string = ":query_embedding"
|
618
|
+
where_filters = f"WHERE {safe_filter}" if safe_filter else ""
|
619
|
+
dense_query_stmt = f"""SELECT {column_names}, {search_function}("{self.embedding_column}", {embedding_data_string}) as distance
|
620
|
+
FROM "{self.schema_name}"."{self.table_name}" {where_filters} ORDER BY "{self.embedding_column}" {operator} {embedding_data_string} LIMIT :dense_limit;
|
568
621
|
"""
|
569
|
-
param_dict = {"query_embedding": query_embedding, "
|
622
|
+
param_dict = {"query_embedding": query_embedding, "dense_limit": dense_limit}
|
570
623
|
if filter_dict:
|
571
624
|
param_dict.update(filter_dict)
|
572
625
|
if self.index_query_options:
|
@@ -575,15 +628,49 @@ class AsyncPGVectorStore(VectorStore):
|
|
575
628
|
for query_option in self.index_query_options.to_parameter():
|
576
629
|
query_options_stmt = f"SET LOCAL {query_option};"
|
577
630
|
await conn.execute(text(query_options_stmt))
|
578
|
-
result = await conn.execute(text(
|
631
|
+
result = await conn.execute(text(dense_query_stmt), param_dict)
|
579
632
|
result_map = result.mappings()
|
580
|
-
|
633
|
+
dense_results = result_map.fetchall()
|
581
634
|
else:
|
582
635
|
async with self.engine.connect() as conn:
|
583
|
-
result = await conn.execute(text(
|
636
|
+
result = await conn.execute(text(dense_query_stmt), param_dict)
|
637
|
+
result_map = result.mappings()
|
638
|
+
dense_results = result_map.fetchall()
|
639
|
+
|
640
|
+
fts_query = (
|
641
|
+
hybrid_search_config.fts_query
|
642
|
+
if hybrid_search_config and hybrid_search_config.fts_query
|
643
|
+
else kwargs.get("fts_query", "")
|
644
|
+
)
|
645
|
+
if hybrid_search_config and fts_query:
|
646
|
+
hybrid_search_config.fusion_function_parameters["fetch_top_k"] = final_k
|
647
|
+
# do the sparse query
|
648
|
+
lang = (
|
649
|
+
f"'{hybrid_search_config.tsv_lang}',"
|
650
|
+
if hybrid_search_config.tsv_lang
|
651
|
+
else ""
|
652
|
+
)
|
653
|
+
query_tsv = f"plainto_tsquery({lang} :fts_query)"
|
654
|
+
param_dict["fts_query"] = fts_query
|
655
|
+
if hybrid_search_config.tsv_column:
|
656
|
+
content_tsv = f'"{hybrid_search_config.tsv_column}"'
|
657
|
+
else:
|
658
|
+
content_tsv = f'to_tsvector({lang} "{self.content_column}")'
|
659
|
+
and_filters = f"AND ({safe_filter})" if safe_filter else ""
|
660
|
+
sparse_query_stmt = f'SELECT {column_names}, ts_rank_cd({content_tsv}, {query_tsv}) as distance FROM "{self.schema_name}"."{self.table_name}" WHERE {content_tsv} @@ {query_tsv} {and_filters} ORDER BY distance desc LIMIT {hybrid_search_config.secondary_top_k};'
|
661
|
+
async with self.engine.connect() as conn:
|
662
|
+
result = await conn.execute(text(sparse_query_stmt), param_dict)
|
584
663
|
result_map = result.mappings()
|
585
|
-
|
586
|
-
|
664
|
+
sparse_results = result_map.fetchall()
|
665
|
+
|
666
|
+
combined_results = hybrid_search_config.fusion_function(
|
667
|
+
dense_results,
|
668
|
+
sparse_results,
|
669
|
+
**hybrid_search_config.fusion_function_parameters,
|
670
|
+
distance_strategy=self.distance_strategy,
|
671
|
+
)
|
672
|
+
return combined_results
|
673
|
+
return dense_results
|
587
674
|
|
588
675
|
async def asimilarity_search(
|
589
676
|
self,
|
@@ -601,6 +688,14 @@ class AsyncPGVectorStore(VectorStore):
|
|
601
688
|
)
|
602
689
|
kwargs["query"] = query
|
603
690
|
|
691
|
+
# add fts_query to hybrid_search_config
|
692
|
+
hybrid_search_config = kwargs.get(
|
693
|
+
"hybrid_search_config", self.hybrid_search_config
|
694
|
+
)
|
695
|
+
if hybrid_search_config and not hybrid_search_config.fts_query:
|
696
|
+
hybrid_search_config.fts_query = query
|
697
|
+
kwargs["hybrid_search_config"] = hybrid_search_config
|
698
|
+
|
604
699
|
return await self.asimilarity_search_by_vector(
|
605
700
|
embedding=embedding, k=k, filter=filter, **kwargs
|
606
701
|
)
|
@@ -632,6 +727,14 @@ class AsyncPGVectorStore(VectorStore):
|
|
632
727
|
)
|
633
728
|
kwargs["query"] = query
|
634
729
|
|
730
|
+
# add fts_query to hybrid_search_config
|
731
|
+
hybrid_search_config = kwargs.get(
|
732
|
+
"hybrid_search_config", self.hybrid_search_config
|
733
|
+
)
|
734
|
+
if hybrid_search_config and not hybrid_search_config.fts_query:
|
735
|
+
hybrid_search_config.fts_query = query
|
736
|
+
kwargs["hybrid_search_config"] = hybrid_search_config
|
737
|
+
|
635
738
|
docs = await self.asimilarity_search_with_score_by_vector(
|
636
739
|
embedding=embedding, k=k, filter=filter, **kwargs
|
637
740
|
)
|
@@ -776,6 +879,41 @@ class AsyncPGVectorStore(VectorStore):
|
|
776
879
|
|
777
880
|
return [r for i, r in enumerate(documents_with_scores) if i in mmr_selected]
|
778
881
|
|
882
|
+
async def aapply_hybrid_search_index(
    self,
    concurrently: bool = False,
) -> None:
    """Create the full-text-search index described by ``hybrid_search_config``.

    The index is built on the configured TSV column when one is set;
    otherwise it is an expression index over
    ``to_tsvector(<lang>, <content column>)``.

    Args:
        concurrently (bool): When True, issue ``CREATE INDEX CONCURRENTLY``
            on a connection switched to AUTOCOMMIT. Defaults to False.

    Raises:
        ValueError: If there is no hybrid search config, or it has no
            ``index_type`` or no ``index_name``.
    """
    config = self.hybrid_search_config
    if not config or not config.index_type or not config.index_name:
        # no index needs to be created
        raise ValueError("Hybrid Search Config cannot create index.")

    lang = f"'{config.tsv_lang}'," if config.tsv_lang else ""
    # Quote the column identifiers exactly like the INSERT/SELECT statements
    # in this class do, so mixed-case or otherwise non-trivial column names
    # resolve to the same column here as everywhere else.
    if config.tsv_column:
        index_target = f'"{config.tsv_column}"'
    else:
        index_target = f'to_tsvector({lang} "{self.content_column}")'

    tsv_index_query = f'CREATE INDEX {"CONCURRENTLY" if concurrently else ""} {config.index_name} ON "{self.schema_name}"."{self.table_name}" USING {config.index_type}({index_target});'

    async with self.engine.connect() as conn:
        if concurrently:
            # CREATE INDEX CONCURRENTLY cannot run inside a transaction
            # block, hence the AUTOCOMMIT isolation level.
            autocommit_conn = await conn.execution_options(
                isolation_level="AUTOCOMMIT"
            )
            await autocommit_conn.execute(text(tsv_index_query))
        else:
            await conn.execute(text(tsv_index_query))
            await conn.commit()
|
916
|
+
|
779
917
|
async def aapply_vector_index(
|
780
918
|
self,
|
781
919
|
index: BaseIndex,
|
@@ -800,10 +938,11 @@ class AsyncPGVectorStore(VectorStore):
|
|
800
938
|
filter = f"WHERE ({index.partial_indexes})" if index.partial_indexes else ""
|
801
939
|
params = "WITH " + index.index_options()
|
802
940
|
if name is None:
|
803
|
-
if index.name
|
941
|
+
if index.name is None:
|
804
942
|
index.name = self.table_name + DEFAULT_INDEX_NAME_SUFFIX
|
805
943
|
name = index.name
|
806
944
|
stmt = f'CREATE INDEX {"CONCURRENTLY" if concurrently else ""} "{name}" ON "{self.schema_name}"."{self.table_name}" USING {index.index_type} ({self.embedding_column} {function}) {params} {filter};'
|
945
|
+
|
807
946
|
if concurrently:
|
808
947
|
async with self.engine.connect() as conn:
|
809
948
|
autocommit_conn = await conn.execution_options(
|
@@ -954,46 +1093,48 @@ class AsyncPGVectorStore(VectorStore):
|
|
954
1093
|
operator = "$eq"
|
955
1094
|
filter_value = value
|
956
1095
|
|
1096
|
+
suffix_id = str(uuid.uuid4()).split("-")[0]
|
957
1097
|
if operator in COMPARISONS_TO_NATIVE:
|
958
1098
|
# Then we implement an equality filter
|
959
1099
|
# native is trusted input
|
960
1100
|
native = COMPARISONS_TO_NATIVE[operator]
|
961
|
-
|
962
|
-
return f"{field} {native} :{
|
1101
|
+
param_name = f"{field}_{suffix_id}"
|
1102
|
+
return f"{field} {native} :{param_name}", {f"{param_name}": filter_value}
|
963
1103
|
elif operator == "$between":
|
964
1104
|
# Use AND with two comparisons
|
965
1105
|
low, high = filter_value
|
966
|
-
|
967
|
-
|
968
|
-
|
969
|
-
f"{
|
1106
|
+
low_param_name = f"{field}_low_{suffix_id}"
|
1107
|
+
high_param_name = f"{field}_high_{suffix_id}"
|
1108
|
+
return f"({field} BETWEEN :{low_param_name} AND :{high_param_name})", {
|
1109
|
+
f"{low_param_name}": low,
|
1110
|
+
f"{high_param_name}": high,
|
970
1111
|
}
|
971
|
-
elif operator in {"$in", "$nin"
|
1112
|
+
elif operator in {"$in", "$nin"}:
|
972
1113
|
# We'll do force coercion to text
|
973
|
-
|
974
|
-
|
975
|
-
|
976
|
-
|
977
|
-
|
978
|
-
|
979
|
-
|
980
|
-
|
981
|
-
|
982
|
-
|
983
|
-
|
984
|
-
|
985
|
-
|
986
|
-
|
987
|
-
|
988
|
-
|
989
|
-
elif operator in {"$like"}:
|
990
|
-
return f"({field} LIKE :{field}_like)", {f"{field}_like": filter_value}
|
991
|
-
elif operator in {"$ilike"}:
|
992
|
-
return f"({field} ILIKE :{field}_ilike)", {
|
993
|
-
f"{field}_ilike": filter_value
|
1114
|
+
for val in filter_value:
|
1115
|
+
if not isinstance(val, (str, int, float)):
|
1116
|
+
raise NotImplementedError(
|
1117
|
+
f"Unsupported type: {type(val)} for value: {val}"
|
1118
|
+
)
|
1119
|
+
|
1120
|
+
if isinstance(val, bool): # b/c bool is an instance of int
|
1121
|
+
raise NotImplementedError(
|
1122
|
+
f"Unsupported type: {type(val)} for value: {val}"
|
1123
|
+
)
|
1124
|
+
param_name = f"{field}_{operator.replace('$', '')}_{suffix_id}"
|
1125
|
+
if operator == "$in":
|
1126
|
+
return f"{field} = ANY(:{param_name})", {f"{param_name}": filter_value}
|
1127
|
+
else: # i.e. $nin
|
1128
|
+
return f"{field} <> ALL (:{param_name})", {
|
1129
|
+
f"{param_name}": filter_value
|
994
1130
|
}
|
995
|
-
|
996
|
-
|
1131
|
+
|
1132
|
+
elif operator in {"$like", "$ilike"}:
|
1133
|
+
param_name = f"{field}_{operator.replace('$', '')}_{suffix_id}"
|
1134
|
+
if operator == "$like":
|
1135
|
+
return f"({field} LIKE :{param_name})", {f"{param_name}": filter_value}
|
1136
|
+
else: # i.e. $ilike
|
1137
|
+
return f"({field} ILIKE :{param_name})", {f"{param_name}": filter_value}
|
997
1138
|
elif operator == "$exists":
|
998
1139
|
if not isinstance(filter_value, bool):
|
999
1140
|
raise ValueError(
|
langchain_postgres/v2/engine.py
CHANGED
@@ -3,14 +3,13 @@ from __future__ import annotations
|
|
3
3
|
import asyncio
|
4
4
|
from dataclasses import dataclass
|
5
5
|
from threading import Thread
|
6
|
-
from typing import
|
6
|
+
from typing import Any, Awaitable, Optional, TypedDict, TypeVar, Union
|
7
7
|
|
8
8
|
from sqlalchemy import text
|
9
9
|
from sqlalchemy.engine import URL
|
10
10
|
from sqlalchemy.ext.asyncio import AsyncEngine, create_async_engine
|
11
11
|
|
12
|
-
|
13
|
-
import asyncpg # type: ignore
|
12
|
+
from .hybrid_search_config import HybridSearchConfig
|
14
13
|
|
15
14
|
T = TypeVar("T")
|
16
15
|
|
@@ -120,7 +119,7 @@ class PGEngine:
|
|
120
119
|
return await coro
|
121
120
|
# Otherwise, run in the background thread
|
122
121
|
return await asyncio.wrap_future(
|
123
|
-
asyncio.run_coroutine_threadsafe(coro, self._loop)
|
122
|
+
asyncio.run_coroutine_threadsafe(coro, self._loop) # type: ignore[arg-type]
|
124
123
|
)
|
125
124
|
|
126
125
|
def _run_as_sync(self, coro: Awaitable[T]) -> T:
|
@@ -129,7 +128,7 @@ class PGEngine:
|
|
129
128
|
raise Exception(
|
130
129
|
"Engine was initialized without a background loop and cannot call sync methods."
|
131
130
|
)
|
132
|
-
return asyncio.run_coroutine_threadsafe(coro, self._loop).result()
|
131
|
+
return asyncio.run_coroutine_threadsafe(coro, self._loop).result() # type: ignore[arg-type]
|
133
132
|
|
134
133
|
async def close(self) -> None:
|
135
134
|
"""Dispose of connection pool"""
|
@@ -159,6 +158,7 @@ class PGEngine:
|
|
159
158
|
id_column: Union[str, Column, ColumnDict] = "langchain_id",
|
160
159
|
overwrite_existing: bool = False,
|
161
160
|
store_metadata: bool = True,
|
161
|
+
hybrid_search_config: Optional[HybridSearchConfig] = None,
|
162
162
|
) -> None:
|
163
163
|
"""
|
164
164
|
Create a table for saving of vectors to be used with PGVectorStore.
|
@@ -181,6 +181,8 @@ class PGEngine:
|
|
181
181
|
overwrite_existing (bool): Whether to drop existing table. Default: False.
|
182
182
|
store_metadata (bool): Whether to store metadata in the table.
|
183
183
|
Default: True.
|
184
|
+
hybrid_search_config (HybridSearchConfig): Hybrid search configuration.
|
185
|
+
Default: None.
|
184
186
|
|
185
187
|
Raises:
|
186
188
|
:class:`DuplicateTableError <asyncpg.exceptions.DuplicateTableError>`: if table already exists.
|
@@ -189,6 +191,7 @@ class PGEngine:
|
|
189
191
|
|
190
192
|
schema_name = self._escape_postgres_identifier(schema_name)
|
191
193
|
table_name = self._escape_postgres_identifier(table_name)
|
194
|
+
hybrid_search_default_column_name = content_column + "_tsv"
|
192
195
|
content_column = self._escape_postgres_identifier(content_column)
|
193
196
|
embedding_column = self._escape_postgres_identifier(embedding_column)
|
194
197
|
if metadata_columns is None:
|
@@ -229,10 +232,22 @@ class PGEngine:
|
|
229
232
|
id_data_type = id_column["data_type"]
|
230
233
|
id_column_name = id_column["name"]
|
231
234
|
|
235
|
+
hybrid_search_column = "" # Default is no TSV column for hybrid search
|
236
|
+
if hybrid_search_config:
|
237
|
+
hybrid_search_column_name = (
|
238
|
+
hybrid_search_config.tsv_column or hybrid_search_default_column_name
|
239
|
+
)
|
240
|
+
hybrid_search_column_name = self._escape_postgres_identifier(
|
241
|
+
hybrid_search_column_name
|
242
|
+
)
|
243
|
+
hybrid_search_config.tsv_column = hybrid_search_column_name
|
244
|
+
hybrid_search_column = f',"{self._escape_postgres_identifier(hybrid_search_column_name)}" TSVECTOR NOT NULL'
|
245
|
+
|
232
246
|
query = f"""CREATE TABLE "{schema_name}"."{table_name}"(
|
233
247
|
"{id_column_name}" {id_data_type} PRIMARY KEY,
|
234
248
|
"{content_column}" TEXT NOT NULL,
|
235
|
-
"{embedding_column}" vector({vector_size}) NOT NULL
|
249
|
+
"{embedding_column}" vector({vector_size}) NOT NULL
|
250
|
+
{hybrid_search_column}"""
|
236
251
|
for column in metadata_columns:
|
237
252
|
if isinstance(column, Column):
|
238
253
|
nullable = "NOT NULL" if not column.nullable else ""
|
@@ -261,6 +276,7 @@ class PGEngine:
|
|
261
276
|
id_column: Union[str, Column, ColumnDict] = "langchain_id",
|
262
277
|
overwrite_existing: bool = False,
|
263
278
|
store_metadata: bool = True,
|
279
|
+
hybrid_search_config: Optional[HybridSearchConfig] = None,
|
264
280
|
) -> None:
|
265
281
|
"""
|
266
282
|
Create a table for saving of vectors to be used with PGVectorStore.
|
@@ -283,6 +299,10 @@ class PGEngine:
|
|
283
299
|
overwrite_existing (bool): Whether to drop existing table. Default: False.
|
284
300
|
store_metadata (bool): Whether to store metadata in the table.
|
285
301
|
Default: True.
|
302
|
+
hybrid_search_config (HybridSearchConfig): Hybrid search configuration.
|
303
|
+
Note that queries might be slow if the hybrid search column does not exist.
|
304
|
+
For best hybrid search performance, consider creating a TSV column and adding GIN index.
|
305
|
+
Default: None.
|
286
306
|
"""
|
287
307
|
await self._run_as_async(
|
288
308
|
self._ainit_vectorstore_table(
|
@@ -296,6 +316,7 @@ class PGEngine:
|
|
296
316
|
id_column=id_column,
|
297
317
|
overwrite_existing=overwrite_existing,
|
298
318
|
store_metadata=store_metadata,
|
319
|
+
hybrid_search_config=hybrid_search_config,
|
299
320
|
)
|
300
321
|
)
|
301
322
|
|
@@ -312,6 +333,7 @@ class PGEngine:
|
|
312
333
|
id_column: Union[str, Column, ColumnDict] = "langchain_id",
|
313
334
|
overwrite_existing: bool = False,
|
314
335
|
store_metadata: bool = True,
|
336
|
+
hybrid_search_config: Optional[HybridSearchConfig] = None,
|
315
337
|
) -> None:
|
316
338
|
"""
|
317
339
|
Create a table for saving of vectors to be used with PGVectorStore.
|
@@ -334,6 +356,10 @@ class PGEngine:
|
|
334
356
|
overwrite_existing (bool): Whether to drop existing table. Default: False.
|
335
357
|
store_metadata (bool): Whether to store metadata in the table.
|
336
358
|
Default: True.
|
359
|
+
hybrid_search_config (HybridSearchConfig): Hybrid search configuration.
|
360
|
+
Note that queries might be slow if the hybrid search column does not exist.
|
361
|
+
For best hybrid search performance, consider creating a TSV column and adding GIN index.
|
362
|
+
Default: None.
|
337
363
|
"""
|
338
364
|
self._run_as_sync(
|
339
365
|
self._ainit_vectorstore_table(
|
@@ -347,5 +373,38 @@ class PGEngine:
|
|
347
373
|
id_column=id_column,
|
348
374
|
overwrite_existing=overwrite_existing,
|
349
375
|
store_metadata=store_metadata,
|
376
|
+
hybrid_search_config=hybrid_search_config,
|
350
377
|
)
|
351
378
|
)
|
379
|
+
|
380
|
+
async def _adrop_table(
    self,
    table_name: str,
    *,
    schema_name: str = "public",
) -> None:
    """Drop the vector store table if it exists.

    Args:
        table_name (str): Name of the table to drop.
        schema_name (str): Schema that holds the table. Default: "public".
    """
    # Escape both identifiers before embedding them in double quotes, the
    # same way the table-creation path does, so a name containing a quote
    # character cannot terminate the identifier early.
    schema_name = self._escape_postgres_identifier(schema_name)
    table_name = self._escape_postgres_identifier(table_name)
    query = f'DROP TABLE IF EXISTS "{schema_name}"."{table_name}";'
    async with self._pool.connect() as conn:
        await conn.execute(text(query))
        await conn.commit()
|
391
|
+
|
392
|
+
async def adrop_table(
    self,
    table_name: str,
    *,
    schema_name: str = "public",
) -> None:
    """Drop the vector store table (async entry point).

    Delegates to ``_adrop_table`` through ``_run_as_async``.

    Args:
        table_name (str): Name of the table to drop.
        schema_name (str): Schema that holds the table. Default: "public".
    """
    drop_coro = self._adrop_table(table_name=table_name, schema_name=schema_name)
    await self._run_as_async(drop_coro)
|
401
|
+
|
402
|
+
def drop_table(
    self,
    table_name: str,
    *,
    schema_name: str = "public",
) -> None:
    """Drop the vector store table (sync entry point).

    Delegates to ``_adrop_table`` through ``_run_as_sync``.

    Args:
        table_name (str): Name of the table to drop.
        schema_name (str): Schema that holds the table. Default: "public".
    """
    drop_coro = self._adrop_table(table_name=table_name, schema_name=schema_name)
    self._run_as_sync(drop_coro)
|
@@ -0,0 +1,212 @@
|
|
1
|
+
from abc import ABC
|
2
|
+
from dataclasses import dataclass, field
|
3
|
+
from typing import Any, Callable, Optional, Sequence
|
4
|
+
|
5
|
+
from sqlalchemy import RowMapping
|
6
|
+
|
7
|
+
from .indexes import DistanceStrategy
|
8
|
+
|
9
|
+
|
10
|
+
def _normalize_scores(
|
11
|
+
results: Sequence[dict[str, Any]], is_distance_metric: bool
|
12
|
+
) -> Sequence[dict[str, Any]]:
|
13
|
+
"""Normalizes scores to a 0-1 scale, where 1 is best."""
|
14
|
+
if not results:
|
15
|
+
return []
|
16
|
+
|
17
|
+
# Get scores from the last column of each result
|
18
|
+
scores = [float(list(item.values())[-1]) for item in results]
|
19
|
+
min_score, max_score = min(scores), max(scores)
|
20
|
+
score_range = max_score - min_score
|
21
|
+
|
22
|
+
if score_range == 0:
|
23
|
+
# All documents are of the highest quality (1.0)
|
24
|
+
for item in results:
|
25
|
+
item["normalized_score"] = 1.0
|
26
|
+
return list(results)
|
27
|
+
|
28
|
+
for item in results:
|
29
|
+
# Access the score again from the last column for calculation
|
30
|
+
score = list(item.values())[-1]
|
31
|
+
normalized = (score - min_score) / score_range
|
32
|
+
if is_distance_metric:
|
33
|
+
# For distance, a lower score is better, so we invert the result.
|
34
|
+
item["normalized_score"] = 1.0 - normalized
|
35
|
+
else:
|
36
|
+
# For similarity (like keyword search), a higher score is better.
|
37
|
+
item["normalized_score"] = normalized
|
38
|
+
|
39
|
+
return list(results)
|
40
|
+
|
41
|
+
|
42
|
+
def weighted_sum_ranking(
    primary_search_results: Sequence[RowMapping],
    secondary_search_results: Sequence[RowMapping],
    primary_results_weight: float = 0.5,
    secondary_results_weight: float = 0.5,
    fetch_top_k: int = 4,
    **kwargs: Any,
) -> Sequence[dict[str, Any]]:
    """
    Ranks documents using a weighted sum of scores from two sources.

    Each result set is normalized to a 0-1 scale, multiplied by its weight,
    and merged by document id (the first column of each row). A document
    present in both sets gets the sum of both weighted scores.

    Args:
        primary_search_results: Rows from the primary search; the first
            column is used as the document id and the last as the score.
        secondary_search_results: Rows from the secondary search, in the
            same layout.
        primary_results_weight: The weight for the primary source's scores.
            Defaults to 0.5.
        secondary_results_weight: The weight for the secondary source's scores.
            Defaults to 0.5.
        fetch_top_k: The number of documents to fetch after merging the results.
            Defaults to 4.
        **kwargs: May carry ``distance_strategy``; when absent,
            ``DistanceStrategy.COSINE_DISTANCE`` is used.

    Returns:
        Up to ``fetch_top_k`` row dicts whose ``distance`` key holds the
        combined weighted score, sorted by that score in descending order.
    """
    strategy = kwargs.get("distance_strategy", DistanceStrategy.COSINE_DISTANCE)

    # Normalize both sets of results onto a 0-1 scale. Everything except
    # INNER_PRODUCT is treated as a distance (lower is better).
    primary_rows = _normalize_scores(
        [dict(row) for row in primary_search_results],
        is_distance_metric=strategy != DistanceStrategy.INNER_PRODUCT,
    )
    # Keyword search relevance is a similarity score (higher is better)
    secondary_rows = _normalize_scores(
        [dict(row) for row in secondary_search_results], is_distance_metric=False
    )

    # Rows keyed by document id; 'distance' is repurposed as the weighted score.
    merged: dict[str, dict[str, Any]] = {}

    for row in primary_rows:
        key = str(list(row.values())[0])
        row["distance"] = row["normalized_score"] * primary_results_weight
        merged[key] = row

    for row in secondary_rows:
        key = str(list(row.values())[0])
        contribution = row["normalized_score"] * secondary_results_weight
        existing = merged.get(key)
        if existing is None:
            # Document only matched the secondary search
            row["distance"] = contribution
            merged[key] = row
        else:
            # Document matched both searches: accumulate
            existing["distance"] += contribution

    ranked = sorted(merged.values(), key=lambda row: row["distance"], reverse=True)

    # The helper key is internal and must not leak to callers
    for row in ranked:
        row.pop("normalized_score", None)

    return ranked[:fetch_top_k]
|
117
|
+
|
118
|
+
|
119
|
+
def reciprocal_rank_fusion(
    primary_search_results: Sequence[RowMapping],
    secondary_search_results: Sequence[RowMapping],
    rrf_k: float = 60,
    fetch_top_k: int = 4,
    **kwargs: Any,
) -> Sequence[dict[str, Any]]:
    """
    Merge two result sets with Reciprocal Rank Fusion (RRF).

    Each result set is ranked independently by the ``"distance"`` value of
    its rows. A row at zero-based position ``rank`` contributes
    ``1.0 / (rank + rrf_k)`` to the fused score of its document, and a
    document present in both result sets receives the sum of both
    contributions.

    Args:
        primary_search_results: Rows from the primary search. The first
            value of every row is used as the document key, and every row
            must provide a ``"distance"`` entry.
        secondary_search_results: Rows from the secondary search, laid out
            like the primary rows.
        rrf_k: The RRF parameter k.
            Defaults to 60.
        fetch_top_k: The number of documents to fetch after merging the
            results. Defaults to 4.
        **kwargs: ``distance_strategy`` is read from here to decide how the
            primary rows are ranked; it defaults to
            ``DistanceStrategy.COSINE_DISTANCE``.

    Returns:
        At most ``fetch_top_k`` dicts, one per document, ordered from the
        highest fused score to the lowest. Each dict is a copy of the first
        row seen for that document with ``"distance"`` replaced by the fused
        RRF score.
    """
    strategy = kwargs.get("distance_strategy", DistanceStrategy.COSINE_DISTANCE)

    def by_distance(row: Any) -> Any:
        return row["distance"]

    # Each source is paired with the sort direction that puts its best row
    # first. INNER_PRODUCT is a similarity (higher is better) and is sorted
    # descending; the other strategies are distances and are sorted
    # ascending. Keyword search relevance is always "higher is better".
    sources = (
        (primary_search_results, strategy == DistanceStrategy.INNER_PRODUCT),
        (secondary_search_results, True),
    )

    fused: dict[str, dict[str, Any]] = {}
    for results, descending in sources:
        ranking = sorted(results, key=by_distance, reverse=descending)
        for position, row in enumerate(ranking):
            # The first column of the row identifies the document.
            key = str(list(row.values())[0])
            entry = fused.get(key)
            if entry is None:
                # First sighting: keep the row's columns, reset the score.
                entry = dict(row)
                entry["distance"] = 0.0
                fused[key] = entry
            entry["distance"] += 1.0 / (position + rrf_k)

    # Highest fused score first, then trim to the requested size.
    best_first = sorted(fused.values(), key=by_distance, reverse=True)
    return best_first[:fetch_top_k]
|
190
|
+
|
191
|
+
|
192
|
+
@dataclass
class HybridSearchConfig(ABC):
    """
    Hybrid search configuration accepted by PGVectorStore.

    Bundles the text-search settings, the function used to fuse the primary
    and secondary result sets, and the settings of the hybrid search index.

    Queries might be slow if the hybrid search column does not exist.
    For best hybrid search performance, consider creating a TSV column
    and adding GIN index.
    """

    # Presumably the name of the TSV column to search; empty by default.
    # NOTE(review): confirm against the query builder.
    tsv_column: Optional[str] = ""
    # Presumably the text-search language/configuration name -- TODO confirm.
    tsv_lang: Optional[str] = "pg_catalog.english"
    # Presumably the full-text-search query string; empty by default.
    # NOTE(review): confirm how an empty value is handled by the caller.
    fts_query: Optional[str] = ""
    # Callable that combines the primary and secondary search rows into one
    # ranked sequence. Defaults to weighted_sum_ranking.
    fusion_function: Callable[
        [Sequence[RowMapping], Sequence[RowMapping], Any], Sequence[Any]
    ] = weighted_sum_ranking
    # Presumably forwarded to fusion_function as keyword arguments -- TODO
    # confirm. A default_factory gives every instance its own dict.
    fusion_function_parameters: dict[str, Any] = field(default_factory=dict)
    # Presumably the row limits of the primary and secondary searches --
    # TODO confirm.
    primary_top_k: int = 4
    secondary_top_k: int = 4
    # Name and type of the hybrid search index; presumably consumed when the
    # index is created -- verify against apply_hybrid_search_index.
    index_name: str = "langchain_tsv_index"
    index_type: str = "GIN"
|
@@ -9,6 +9,7 @@ from langchain_core.vectorstores import VectorStore
|
|
9
9
|
|
10
10
|
from .async_vectorstore import AsyncPGVectorStore
|
11
11
|
from .engine import PGEngine
|
12
|
+
from .hybrid_search_config import HybridSearchConfig
|
12
13
|
from .indexes import (
|
13
14
|
DEFAULT_DISTANCE_STRATEGY,
|
14
15
|
BaseIndex,
|
@@ -59,6 +60,7 @@ class PGVectorStore(VectorStore):
|
|
59
60
|
fetch_k: int = 20,
|
60
61
|
lambda_mult: float = 0.5,
|
61
62
|
index_query_options: Optional[QueryOptions] = None,
|
63
|
+
hybrid_search_config: Optional[HybridSearchConfig] = None,
|
62
64
|
) -> PGVectorStore:
|
63
65
|
"""Create an PGVectorStore instance.
|
64
66
|
|
@@ -78,6 +80,7 @@ class PGVectorStore(VectorStore):
|
|
78
80
|
fetch_k (int): Number of Documents to fetch to pass to MMR algorithm.
|
79
81
|
lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5.
|
80
82
|
index_query_options (QueryOptions): Index query option.
|
83
|
+
hybrid_search_config (HybridSearchConfig): Hybrid search configuration. Defaults to None.
|
81
84
|
|
82
85
|
Returns:
|
83
86
|
PGVectorStore
|
@@ -98,6 +101,7 @@ class PGVectorStore(VectorStore):
|
|
98
101
|
fetch_k=fetch_k,
|
99
102
|
lambda_mult=lambda_mult,
|
100
103
|
index_query_options=index_query_options,
|
104
|
+
hybrid_search_config=hybrid_search_config,
|
101
105
|
)
|
102
106
|
vs = await engine._run_as_async(coro)
|
103
107
|
return cls(cls.__create_key, engine, vs)
|
@@ -120,6 +124,7 @@ class PGVectorStore(VectorStore):
|
|
120
124
|
fetch_k: int = 20,
|
121
125
|
lambda_mult: float = 0.5,
|
122
126
|
index_query_options: Optional[QueryOptions] = None,
|
127
|
+
hybrid_search_config: Optional[HybridSearchConfig] = None,
|
123
128
|
) -> PGVectorStore:
|
124
129
|
"""Create an PGVectorStore instance.
|
125
130
|
|
@@ -140,6 +145,7 @@ class PGVectorStore(VectorStore):
|
|
140
145
|
fetch_k (int, optional): Number of Documents to fetch to pass to MMR algorithm. Defaults to 20.
|
141
146
|
lambda_mult (float, optional): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5.
|
142
147
|
index_query_options (Optional[QueryOptions], optional): Index query option. Defaults to None.
|
148
|
+
hybrid_search_config (HybridSearchConfig): Hybrid search configuration. Defaults to None.
|
143
149
|
|
144
150
|
Returns:
|
145
151
|
PGVectorStore
|
@@ -160,6 +166,7 @@ class PGVectorStore(VectorStore):
|
|
160
166
|
fetch_k=fetch_k,
|
161
167
|
lambda_mult=lambda_mult,
|
162
168
|
index_query_options=index_query_options,
|
169
|
+
hybrid_search_config=hybrid_search_config,
|
163
170
|
)
|
164
171
|
vs = engine._run_as_sync(coro)
|
165
172
|
return cls(cls.__create_key, engine, vs)
|
@@ -301,6 +308,7 @@ class PGVectorStore(VectorStore):
|
|
301
308
|
fetch_k: int = 20,
|
302
309
|
lambda_mult: float = 0.5,
|
303
310
|
index_query_options: Optional[QueryOptions] = None,
|
311
|
+
hybrid_search_config: Optional[HybridSearchConfig] = None,
|
304
312
|
**kwargs: Any,
|
305
313
|
) -> PGVectorStore:
|
306
314
|
"""Create an PGVectorStore instance from texts.
|
@@ -324,6 +332,7 @@ class PGVectorStore(VectorStore):
|
|
324
332
|
fetch_k (int): Number of Documents to fetch to pass to MMR algorithm.
|
325
333
|
lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5.
|
326
334
|
index_query_options (QueryOptions): Index query option.
|
335
|
+
hybrid_search_config (HybridSearchConfig): Hybrid search configuration. Defaults to None.
|
327
336
|
|
328
337
|
Raises:
|
329
338
|
:class:`InvalidTextRepresentationError <asyncpg.exceptions.InvalidTextRepresentationError>`: if the `ids` data type does not match that of the `id_column`.
|
@@ -347,6 +356,7 @@ class PGVectorStore(VectorStore):
|
|
347
356
|
fetch_k=fetch_k,
|
348
357
|
lambda_mult=lambda_mult,
|
349
358
|
index_query_options=index_query_options,
|
359
|
+
hybrid_search_config=hybrid_search_config,
|
350
360
|
)
|
351
361
|
await vs.aadd_texts(texts, metadatas=metadatas, ids=ids)
|
352
362
|
return vs
|
@@ -371,6 +381,7 @@ class PGVectorStore(VectorStore):
|
|
371
381
|
fetch_k: int = 20,
|
372
382
|
lambda_mult: float = 0.5,
|
373
383
|
index_query_options: Optional[QueryOptions] = None,
|
384
|
+
hybrid_search_config: Optional[HybridSearchConfig] = None,
|
374
385
|
**kwargs: Any,
|
375
386
|
) -> PGVectorStore:
|
376
387
|
"""Create an PGVectorStore instance from documents.
|
@@ -393,6 +404,7 @@ class PGVectorStore(VectorStore):
|
|
393
404
|
fetch_k (int): Number of Documents to fetch to pass to MMR algorithm.
|
394
405
|
lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5.
|
395
406
|
index_query_options (QueryOptions): Index query option.
|
407
|
+
hybrid_search_config (HybridSearchConfig): Hybrid search configuration. Defaults to None.
|
396
408
|
|
397
409
|
Raises:
|
398
410
|
:class:`InvalidTextRepresentationError <asyncpg.exceptions.InvalidTextRepresentationError>`: if the `ids` data type does not match that of the `id_column`.
|
@@ -417,6 +429,7 @@ class PGVectorStore(VectorStore):
|
|
417
429
|
fetch_k=fetch_k,
|
418
430
|
lambda_mult=lambda_mult,
|
419
431
|
index_query_options=index_query_options,
|
432
|
+
hybrid_search_config=hybrid_search_config,
|
420
433
|
)
|
421
434
|
await vs.aadd_documents(documents, ids=ids)
|
422
435
|
return vs
|
@@ -442,6 +455,7 @@ class PGVectorStore(VectorStore):
|
|
442
455
|
fetch_k: int = 20,
|
443
456
|
lambda_mult: float = 0.5,
|
444
457
|
index_query_options: Optional[QueryOptions] = None,
|
458
|
+
hybrid_search_config: Optional[HybridSearchConfig] = None,
|
445
459
|
**kwargs: Any,
|
446
460
|
) -> PGVectorStore:
|
447
461
|
"""Create an PGVectorStore instance from texts.
|
@@ -465,6 +479,7 @@ class PGVectorStore(VectorStore):
|
|
465
479
|
fetch_k (int): Number of Documents to fetch to pass to MMR algorithm.
|
466
480
|
lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5.
|
467
481
|
index_query_options (QueryOptions): Index query option.
|
482
|
+
hybrid_search_config (HybridSearchConfig): Hybrid search configuration. Defaults to None.
|
468
483
|
|
469
484
|
Raises:
|
470
485
|
:class:`InvalidTextRepresentationError <asyncpg.exceptions.InvalidTextRepresentationError>`: if the `ids` data type does not match that of the `id_column`.
|
@@ -488,6 +503,7 @@ class PGVectorStore(VectorStore):
|
|
488
503
|
fetch_k=fetch_k,
|
489
504
|
lambda_mult=lambda_mult,
|
490
505
|
index_query_options=index_query_options,
|
506
|
+
hybrid_search_config=hybrid_search_config,
|
491
507
|
**kwargs,
|
492
508
|
)
|
493
509
|
vs.add_texts(texts, metadatas=metadatas, ids=ids)
|
@@ -513,6 +529,7 @@ class PGVectorStore(VectorStore):
|
|
513
529
|
fetch_k: int = 20,
|
514
530
|
lambda_mult: float = 0.5,
|
515
531
|
index_query_options: Optional[QueryOptions] = None,
|
532
|
+
hybrid_search_config: Optional[HybridSearchConfig] = None,
|
516
533
|
**kwargs: Any,
|
517
534
|
) -> PGVectorStore:
|
518
535
|
"""Create an PGVectorStore instance from documents.
|
@@ -535,6 +552,7 @@ class PGVectorStore(VectorStore):
|
|
535
552
|
fetch_k (int): Number of Documents to fetch to pass to MMR algorithm.
|
536
553
|
lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5.
|
537
554
|
index_query_options (QueryOptions): Index query option.
|
555
|
+
hybrid_search_config (HybridSearchConfig): Hybrid search configuration. Defaults to None.
|
538
556
|
|
539
557
|
Raises:
|
540
558
|
:class:`InvalidTextRepresentationError <asyncpg.exceptions.InvalidTextRepresentationError>`: if the `ids` data type does not match that of the `id_column`.
|
@@ -558,6 +576,7 @@ class PGVectorStore(VectorStore):
|
|
558
576
|
fetch_k=fetch_k,
|
559
577
|
lambda_mult=lambda_mult,
|
560
578
|
index_query_options=index_query_options,
|
579
|
+
hybrid_search_config=hybrid_search_config,
|
561
580
|
**kwargs,
|
562
581
|
)
|
563
582
|
vs.add_documents(documents, ids=ids)
|
@@ -770,6 +789,24 @@ class PGVectorStore(VectorStore):
|
|
770
789
|
)
|
771
790
|
)
|
772
791
|
|
792
|
+
async def aapply_hybrid_search_index(
    self,
    concurrently: bool = False,
) -> None:
    """Creates a TSV index in the vector store table if possible.

    Args:
        concurrently (bool): Forwarded unchanged to the wrapped async
            vector store. Defaults to False.
    """
    # Build the coroutine on the wrapped async store, then hand it to the
    # engine, which awaits it on our behalf.
    pending = self.__vs.aapply_hybrid_search_index(concurrently=concurrently)
    return await self._engine._run_as_async(pending)
|
800
|
+
|
801
|
+
def apply_hybrid_search_index(
    self,
    concurrently: bool = False,
) -> None:
    """Creates a TSV index in the vector store table if possible.

    Args:
        concurrently (bool): Forwarded unchanged to the wrapped async
            vector store. Defaults to False.
    """
    # Build the coroutine on the wrapped async store, then let the engine
    # run it to completion for this synchronous caller.
    pending = self.__vs.aapply_hybrid_search_index(concurrently=concurrently)
    return self._engine._run_as_sync(pending)
|
809
|
+
|
773
810
|
async def aapply_vector_index(
|
774
811
|
self,
|
775
812
|
index: BaseIndex,
|
@@ -5,6 +5,7 @@ import contextlib
|
|
5
5
|
import enum
|
6
6
|
import logging
|
7
7
|
import uuid
|
8
|
+
import warnings
|
8
9
|
from typing import (
|
9
10
|
Any,
|
10
11
|
AsyncGenerator,
|
@@ -19,7 +20,6 @@ from typing import (
|
|
19
20
|
Type,
|
20
21
|
Union,
|
21
22
|
)
|
22
|
-
import warnings
|
23
23
|
from typing import (
|
24
24
|
cast as typing_cast,
|
25
25
|
)
|
@@ -429,13 +429,6 @@ class PGVector(VectorStore):
|
|
429
429
|
self._async_engine: Optional[AsyncEngine] = None
|
430
430
|
self._async_init = False
|
431
431
|
|
432
|
-
warnings.warn(
|
433
|
-
"PGVector is being deprecated and will be removed in the future. "
|
434
|
-
"Please migrate to PGVectorStore. "
|
435
|
-
"Refer to the migration guide at [https://github.com/langchain-ai/langchain-postgres/blob/main/examples/migrate_pgvector_to_pgvectorstore.md] for details.",
|
436
|
-
PendingDeprecationWarning,
|
437
|
-
)
|
438
|
-
|
439
432
|
if isinstance(connection, str):
|
440
433
|
if async_mode:
|
441
434
|
self._async_engine = create_async_engine(
|
@@ -1,25 +1,17 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.4
|
2
2
|
Name: langchain-postgres
|
3
|
-
Version: 0.0.
|
3
|
+
Version: 0.0.16
|
4
4
|
Summary: An integration package connecting Postgres and LangChain
|
5
|
-
|
6
|
-
License:
|
7
|
-
Requires-Python: >=3.9
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
Requires-Dist:
|
15
|
-
Requires-Dist: langchain-core (>=0.2.13,<0.4.0)
|
16
|
-
Requires-Dist: numpy (>=1.21,<2.0)
|
17
|
-
Requires-Dist: pgvector (>=0.2.5,<0.4)
|
18
|
-
Requires-Dist: psycopg (>=3,<4)
|
19
|
-
Requires-Dist: psycopg-pool (>=3.2.1,<4.0.0)
|
20
|
-
Requires-Dist: sqlalchemy (>=2,<3)
|
21
|
-
Project-URL: Repository, https://github.com/langchain-ai/langchain-postgres
|
22
|
-
Project-URL: Source Code, https://github.com/langchain-ai/langchain-postgres/tree/master/langchain_postgres
|
5
|
+
License-Expression: MIT
|
6
|
+
License-File: LICENSE
|
7
|
+
Requires-Python: >=3.9
|
8
|
+
Requires-Dist: asyncpg>=0.30.0
|
9
|
+
Requires-Dist: langchain-core<2.0,>=0.2.13
|
10
|
+
Requires-Dist: numpy<3,>=1.21
|
11
|
+
Requires-Dist: pgvector<0.4,>=0.2.5
|
12
|
+
Requires-Dist: psycopg-pool<4,>=3.2.1
|
13
|
+
Requires-Dist: psycopg[binary]<4,>=3
|
14
|
+
Requires-Dist: sqlalchemy[asyncio]<3,>=2
|
23
15
|
Description-Content-Type: text/markdown
|
24
16
|
|
25
17
|
# langchain-postgres
|
@@ -39,7 +31,7 @@ Feel free to use the abstraction as provided or else modify them / extend them a
|
|
39
31
|
|
40
32
|
## Requirements
|
41
33
|
|
42
|
-
The package supports the [asyncpg](https://github.com/MagicStack/asyncpg) and [
|
34
|
+
The package supports the [asyncpg](https://github.com/MagicStack/asyncpg) and [psycopg3](https://www.psycopg.org/psycopg3/) drivers.
|
43
35
|
|
44
36
|
## Installation
|
45
37
|
|
@@ -47,17 +39,19 @@ The package supports the [asyncpg](https://github.com/MagicStack/asyncpg) and [p
|
|
47
39
|
pip install -U langchain-postgres
|
48
40
|
```
|
49
41
|
|
50
|
-
##
|
51
|
-
|
52
|
-
### Vectorstore
|
42
|
+
## Vectorstore
|
53
43
|
|
54
44
|
> [!WARNING]
|
55
45
|
> In v0.0.14+, `PGVector` is deprecated. Please migrate to `PGVectorStore`
|
56
|
-
> Version 0.0.14+ has not been released yet, but you can test version of the vectorstore on the main branch. Until official release do not use in production.
|
57
46
|
> for improved performance and manageability.
|
58
|
-
> See the [migration guide](https://github.com/langchain-ai/langchain-postgres/blob/main/examples/migrate_pgvector_to_pgvectorstore.
|
47
|
+
> See the [migration guide](https://github.com/langchain-ai/langchain-postgres/blob/main/examples/migrate_pgvector_to_pgvectorstore.ipynb) for details on how to migrate from `PGVector` to `PGVectorStore`.
|
48
|
+
|
49
|
+
### Documentation
|
59
50
|
|
60
|
-
|
51
|
+
* [Quickstart](https://github.com/langchain-ai/langchain-postgres/blob/main/examples/pg_vectorstore.ipynb)
|
52
|
+
* [How-to](https://github.com/langchain-ai/langchain-postgres/blob/main/examples/pg_vectorstore_how_to.ipynb)
|
53
|
+
|
54
|
+
### Example
|
61
55
|
|
62
56
|
```python
|
63
57
|
from langchain_core.documents import Document
|
@@ -101,7 +95,25 @@ print(docs)
|
|
101
95
|
> [!TIP]
|
102
96
|
> All synchronous functions have corresponding asynchronous functions
|
103
97
|
|
104
|
-
###
|
98
|
+
### Hybrid Search with PGVectorStore
|
99
|
+
|
100
|
+
With PGVectorStore you can use hybrid search for more comprehensive and relevant search results.
|
101
|
+
|
102
|
+
```python
|
103
|
+
vs = PGVectorStore.create_sync(
|
104
|
+
engine=engine,
|
105
|
+
table_name=TABLE_NAME,
|
106
|
+
embedding_service=embedding,
|
107
|
+
hybrid_search_config=HybridSearchConfig(
|
108
|
+
fusion_function=reciprocal_rank_fusion
|
109
|
+
),
|
110
|
+
)
|
111
|
+
hybrid_docs = vs.similarity_search("products", k=5)
|
112
|
+
```
|
113
|
+
|
114
|
+
For a detailed guide on how to use hybrid search, see the [documentation](https://github.com/langchain-ai/langchain-postgres/blob/main/examples/pg_vectorstore_how_to.ipynb#hybrid-search-with-pgvectorstore).
|
115
|
+
|
116
|
+
## ChatMessageHistory
|
105
117
|
|
106
118
|
The chat message history abstraction helps to persist chat message history
|
107
119
|
in a postgres table.
|
@@ -167,4 +179,3 @@ Using the Google Cloud integrations provides the following benefits:
|
|
167
179
|
| Google AlloyDB | ✓ | ✓ | ✓ | ✓ | ✗ |
|
168
180
|
| Google Cloud SQL Postgres| ✓ | ✓ | ✓ | ✓ | ✗ |
|
169
181
|
|
170
|
-
|
@@ -0,0 +1,17 @@
|
|
1
|
+
langchain_postgres/__init__.py,sha256=-ovoLrNuzL-kMUV-RrIxoEI8wmgOAg4vfE8xevYSA3Q,702
|
2
|
+
langchain_postgres/_utils.py,sha256=N_OBzYFCb_bsHOnZ-YRg6izhmuudorQhupgeG-rSKUc,2848
|
3
|
+
langchain_postgres/chat_message_histories.py,sha256=Hq_0nGX1BoBxq5jg0LwfQg7iXm6B4izYVr6iLkMGoEY,14214
|
4
|
+
langchain_postgres/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
+
langchain_postgres/translator.py,sha256=6cTS2RJUodMUdsurJM-f-vgPXl6Ad6bfMo8ECuh5Jr4,1524
|
6
|
+
langchain_postgres/vectorstores.py,sha256=vzRbPwU1Rn-pOsnTsz1u72cSYD7H8jMlW4N7A58QIt4,83826
|
7
|
+
langchain_postgres/utils/pgvector_migrator.py,sha256=OxW2_FxaomZw5kqPAz-3lmZ5t2hSXU4ZW3xK6O62MH4,11771
|
8
|
+
langchain_postgres/v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
9
|
+
langchain_postgres/v2/async_vectorstore.py,sha256=MuRjlRcANOnxrXRGcyGEzIZYr4v75tk8jbMZZCexSAc,58711
|
10
|
+
langchain_postgres/v2/engine.py,sha256=UC3upYnqmgKBw4E6t62CbjUEdVO67t1j0rCbdFmoQnI,16902
|
11
|
+
langchain_postgres/v2/hybrid_search_config.py,sha256=dhBeedqpVXv2VP2_RLs_jNHLLLrukJ-UXytxRD3zVts,7658
|
12
|
+
langchain_postgres/v2/indexes.py,sha256=aLCFGYiIbLBUr88drMLD6l41MPRI7lv0ALMVRWfqdq4,4888
|
13
|
+
langchain_postgres/v2/vectorstores.py,sha256=Iq5z3KU0Ne_djMLlhJNL43zprii0O1JdUN2uEuvvKNI,39213
|
14
|
+
langchain_postgres-0.0.16.dist-info/METADATA,sha256=fLsfXjrnlW412RDvPW5nv4uFJqaujUQkIBujCCsERWc,7143
|
15
|
+
langchain_postgres-0.0.16.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
16
|
+
langchain_postgres-0.0.16.dist-info/licenses/LICENSE,sha256=2btS8uNUDWD_UNjw9ba6ZJt_00aUjEw9CGyK-xIHY8c,1072
|
17
|
+
langchain_postgres-0.0.16.dist-info/RECORD,,
|
@@ -1,16 +0,0 @@
|
|
1
|
-
langchain_postgres/__init__.py,sha256=UxIanyWPeUVtWFKCT-sWGXbWUO5I76akABXhXolY9bM,702
|
2
|
-
langchain_postgres/_utils.py,sha256=N_OBzYFCb_bsHOnZ-YRg6izhmuudorQhupgeG-rSKUc,2848
|
3
|
-
langchain_postgres/chat_message_histories.py,sha256=Et5AgXSRBCghLC5sn6EEUDd1xupaiPv-A5IyNBjpaTc,14213
|
4
|
-
langchain_postgres/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
-
langchain_postgres/translator.py,sha256=6cTS2RJUodMUdsurJM-f-vgPXl6Ad6bfMo8ECuh5Jr4,1524
|
6
|
-
langchain_postgres/utils/pgvector_migrator.py,sha256=OIclFsCKWQAtJ1JyFQsVQoWZSrEJg67GVnY84aBlucE,11776
|
7
|
-
langchain_postgres/v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
8
|
-
langchain_postgres/v2/async_vectorstore.py,sha256=FMV-IwH7cJ2VuxnrMCy0X0WWG65oHNXfKAwsdf0Tp20,51786
|
9
|
-
langchain_postgres/v2/engine.py,sha256=8XD6ta2HzuYtHnxhvY-I_vMYqZd33yj2y9ZqQFbEz1g,14266
|
10
|
-
langchain_postgres/v2/indexes.py,sha256=aLCFGYiIbLBUr88drMLD6l41MPRI7lv0ALMVRWfqdq4,4888
|
11
|
-
langchain_postgres/v2/vectorstores.py,sha256=R17q1KIEZPBwEHgE6JYiRSiN8rZXzVPCmBoJobiyjM8,37198
|
12
|
-
langchain_postgres/vectorstores.py,sha256=Xjyqxa_nL7Xvq6dwqWUu4VdNZ5z6ypjFoSU9wj6Ad5c,84195
|
13
|
-
langchain_postgres-0.0.14rc1.dist-info/LICENSE,sha256=2btS8uNUDWD_UNjw9ba6ZJt_00aUjEw9CGyK-xIHY8c,1072
|
14
|
-
langchain_postgres-0.0.14rc1.dist-info/METADATA,sha256=ZOG0qTuKUt4_uz2VUAy4Cj4A-DtsfouNBB6ITk7bihk,7179
|
15
|
-
langchain_postgres-0.0.14rc1.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
|
16
|
-
langchain_postgres-0.0.14rc1.dist-info/RECORD,,
|
{langchain_postgres-0.0.14rc1.dist-info → langchain_postgres-0.0.16.dist-info/licenses}/LICENSE
RENAMED
File without changes
|