langchain-postgres 0.0.15__py3-none-any.whl → 0.0.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langchain_postgres/v2/async_vectorstore.py +15 -17
- langchain_postgres/v2/engine.py +2 -2
- langchain_postgres/v2/hybrid_search_config.py +109 -46
- langchain_postgres/v2/vectorstores.py +18 -0
- {langchain_postgres-0.0.15.dist-info → langchain_postgres-0.0.16.dist-info}/METADATA +22 -4
- {langchain_postgres-0.0.15.dist-info → langchain_postgres-0.0.16.dist-info}/RECORD +8 -8
- {langchain_postgres-0.0.15.dist-info → langchain_postgres-0.0.16.dist-info}/WHEEL +0 -0
- {langchain_postgres-0.0.15.dist-info → langchain_postgres-0.0.16.dist-info}/licenses/LICENSE +0 -0
@@ -210,7 +210,7 @@ class AsyncPGVectorStore(VectorStore):
|
|
210
210
|
hybrid_search_config.tsv_column = ""
|
211
211
|
if embedding_column not in columns:
|
212
212
|
raise ValueError(f"Embedding column, {embedding_column}, does not exist.")
|
213
|
-
if columns[embedding_column]
|
213
|
+
if columns[embedding_column] not in ["USER-DEFINED", "vector"]:
|
214
214
|
raise ValueError(
|
215
215
|
f"Embedding column, {embedding_column}, is not type Vector."
|
216
216
|
)
|
@@ -580,16 +580,16 @@ class AsyncPGVectorStore(VectorStore):
|
|
580
580
|
For best hybrid search performance, consider creating a TSV column
|
581
581
|
and adding GIN index.
|
582
582
|
"""
|
583
|
-
|
584
|
-
|
585
|
-
|
586
|
-
|
587
|
-
|
588
|
-
|
589
|
-
|
590
|
-
|
591
|
-
|
592
|
-
|
583
|
+
hybrid_search_config = kwargs.get(
|
584
|
+
"hybrid_search_config", self.hybrid_search_config
|
585
|
+
)
|
586
|
+
|
587
|
+
final_k = k if k is not None else self.k
|
588
|
+
|
589
|
+
dense_limit = final_k
|
590
|
+
if hybrid_search_config:
|
591
|
+
dense_limit = hybrid_search_config.primary_top_k
|
592
|
+
|
593
593
|
operator = self.distance_strategy.operator
|
594
594
|
search_function = self.distance_strategy.search_function
|
595
595
|
|
@@ -617,9 +617,9 @@ class AsyncPGVectorStore(VectorStore):
|
|
617
617
|
embedding_data_string = ":query_embedding"
|
618
618
|
where_filters = f"WHERE {safe_filter}" if safe_filter else ""
|
619
619
|
dense_query_stmt = f"""SELECT {column_names}, {search_function}("{self.embedding_column}", {embedding_data_string}) as distance
|
620
|
-
FROM "{self.schema_name}"."{self.table_name}" {where_filters} ORDER BY "{self.embedding_column}" {operator} {embedding_data_string} LIMIT :
|
620
|
+
FROM "{self.schema_name}"."{self.table_name}" {where_filters} ORDER BY "{self.embedding_column}" {operator} {embedding_data_string} LIMIT :dense_limit;
|
621
621
|
"""
|
622
|
-
param_dict = {"query_embedding": query_embedding, "
|
622
|
+
param_dict = {"query_embedding": query_embedding, "dense_limit": dense_limit}
|
623
623
|
if filter_dict:
|
624
624
|
param_dict.update(filter_dict)
|
625
625
|
if self.index_query_options:
|
@@ -637,16 +637,13 @@ class AsyncPGVectorStore(VectorStore):
|
|
637
637
|
result_map = result.mappings()
|
638
638
|
dense_results = result_map.fetchall()
|
639
639
|
|
640
|
-
hybrid_search_config = kwargs.get(
|
641
|
-
"hybrid_search_config", self.hybrid_search_config
|
642
|
-
)
|
643
640
|
fts_query = (
|
644
641
|
hybrid_search_config.fts_query
|
645
642
|
if hybrid_search_config and hybrid_search_config.fts_query
|
646
643
|
else kwargs.get("fts_query", "")
|
647
644
|
)
|
648
645
|
if hybrid_search_config and fts_query:
|
649
|
-
hybrid_search_config.fusion_function_parameters["fetch_top_k"] =
|
646
|
+
hybrid_search_config.fusion_function_parameters["fetch_top_k"] = final_k
|
650
647
|
# do the sparse query
|
651
648
|
lang = (
|
652
649
|
f"'{hybrid_search_config.tsv_lang}',"
|
@@ -670,6 +667,7 @@ class AsyncPGVectorStore(VectorStore):
|
|
670
667
|
dense_results,
|
671
668
|
sparse_results,
|
672
669
|
**hybrid_search_config.fusion_function_parameters,
|
670
|
+
distance_strategy=self.distance_strategy,
|
673
671
|
)
|
674
672
|
return combined_results
|
675
673
|
return dense_results
|
langchain_postgres/v2/engine.py
CHANGED
@@ -119,7 +119,7 @@ class PGEngine:
|
|
119
119
|
return await coro
|
120
120
|
# Otherwise, run in the background thread
|
121
121
|
return await asyncio.wrap_future(
|
122
|
-
asyncio.run_coroutine_threadsafe(coro, self._loop)
|
122
|
+
asyncio.run_coroutine_threadsafe(coro, self._loop) # type: ignore[arg-type]
|
123
123
|
)
|
124
124
|
|
125
125
|
def _run_as_sync(self, coro: Awaitable[T]) -> T:
|
@@ -128,7 +128,7 @@ class PGEngine:
|
|
128
128
|
raise Exception(
|
129
129
|
"Engine was initialized without a background loop and cannot call sync methods."
|
130
130
|
)
|
131
|
-
return asyncio.run_coroutine_threadsafe(coro, self._loop).result()
|
131
|
+
return asyncio.run_coroutine_threadsafe(coro, self._loop).result() # type: ignore[arg-type]
|
132
132
|
|
133
133
|
async def close(self) -> None:
|
134
134
|
"""Dispose of connection pool"""
|
@@ -4,6 +4,40 @@ from typing import Any, Callable, Optional, Sequence
|
|
4
4
|
|
5
5
|
from sqlalchemy import RowMapping
|
6
6
|
|
7
|
+
from .indexes import DistanceStrategy
|
8
|
+
|
9
|
+
|
10
|
+
def _normalize_scores(
|
11
|
+
results: Sequence[dict[str, Any]], is_distance_metric: bool
|
12
|
+
) -> Sequence[dict[str, Any]]:
|
13
|
+
"""Normalizes scores to a 0-1 scale, where 1 is best."""
|
14
|
+
if not results:
|
15
|
+
return []
|
16
|
+
|
17
|
+
# Get scores from the last column of each result
|
18
|
+
scores = [float(list(item.values())[-1]) for item in results]
|
19
|
+
min_score, max_score = min(scores), max(scores)
|
20
|
+
score_range = max_score - min_score
|
21
|
+
|
22
|
+
if score_range == 0:
|
23
|
+
# All documents are of the highest quality (1.0)
|
24
|
+
for item in results:
|
25
|
+
item["normalized_score"] = 1.0
|
26
|
+
return list(results)
|
27
|
+
|
28
|
+
for item in results:
|
29
|
+
# Access the score again from the last column for calculation
|
30
|
+
score = list(item.values())[-1]
|
31
|
+
normalized = (score - min_score) / score_range
|
32
|
+
if is_distance_metric:
|
33
|
+
# For distance, a lower score is better, so we invert the result.
|
34
|
+
item["normalized_score"] = 1.0 - normalized
|
35
|
+
else:
|
36
|
+
# For similarity (like keyword search), a higher score is better.
|
37
|
+
item["normalized_score"] = normalized
|
38
|
+
|
39
|
+
return list(results)
|
40
|
+
|
7
41
|
|
8
42
|
def weighted_sum_ranking(
|
9
43
|
primary_search_results: Sequence[RowMapping],
|
@@ -11,6 +45,7 @@ def weighted_sum_ranking(
|
|
11
45
|
primary_results_weight: float = 0.5,
|
12
46
|
secondary_results_weight: float = 0.5,
|
13
47
|
fetch_top_k: int = 4,
|
48
|
+
**kwargs: Any,
|
14
49
|
) -> Sequence[dict[str, Any]]:
|
15
50
|
"""
|
16
51
|
Ranks documents using a weighted sum of scores from two sources.
|
@@ -32,35 +67,52 @@ def weighted_sum_ranking(
|
|
32
67
|
descending order.
|
33
68
|
"""
|
34
69
|
|
70
|
+
distance_strategy = kwargs.get(
|
71
|
+
"distance_strategy", DistanceStrategy.COSINE_DISTANCE
|
72
|
+
)
|
73
|
+
is_primary_distance = distance_strategy != DistanceStrategy.INNER_PRODUCT
|
74
|
+
|
75
|
+
# Normalize both sets of results onto a 0-1 scale
|
76
|
+
normalized_primary = _normalize_scores(
|
77
|
+
[dict(row) for row in primary_search_results],
|
78
|
+
is_distance_metric=is_primary_distance,
|
79
|
+
)
|
80
|
+
|
81
|
+
# Keyword search relevance is a similarity score (higher is better)
|
82
|
+
normalized_secondary = _normalize_scores(
|
83
|
+
[dict(row) for row in secondary_search_results], is_distance_metric=False
|
84
|
+
)
|
85
|
+
|
35
86
|
# stores computed metric with provided distance metric and weights
|
36
87
|
weighted_scores: dict[str, dict[str, Any]] = {}
|
37
88
|
|
38
|
-
# Process
|
39
|
-
for
|
40
|
-
|
41
|
-
|
42
|
-
distance =
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
row_values["distance"] = distance * secondary_results_weight + primary_score
|
58
|
-
weighted_scores[doc_id] = row_values
|
89
|
+
# Process primary results
|
90
|
+
for item in normalized_primary:
|
91
|
+
doc_id = str(list(item.values())[0])
|
92
|
+
# Set the 'distance' key with the weighted primary score
|
93
|
+
item["distance"] = item["normalized_score"] * primary_results_weight
|
94
|
+
weighted_scores[doc_id] = item
|
95
|
+
|
96
|
+
# Process secondary results
|
97
|
+
for item in normalized_secondary:
|
98
|
+
doc_id = str(list(item.values())[0])
|
99
|
+
secondary_weighted_score = item["normalized_score"] * secondary_results_weight
|
100
|
+
|
101
|
+
if doc_id in weighted_scores:
|
102
|
+
# Add to the existing 'distance' score
|
103
|
+
weighted_scores[doc_id]["distance"] += secondary_weighted_score
|
104
|
+
else:
|
105
|
+
# Set the 'distance' key for the new item
|
106
|
+
item["distance"] = secondary_weighted_score
|
107
|
+
weighted_scores[doc_id] = item
|
59
108
|
|
60
|
-
# Sort the results by weighted score in descending order
|
61
109
|
ranked_results = sorted(
|
62
110
|
weighted_scores.values(), key=lambda item: item["distance"], reverse=True
|
63
111
|
)
|
112
|
+
|
113
|
+
for result in ranked_results:
|
114
|
+
result.pop("normalized_score", None)
|
115
|
+
|
64
116
|
return ranked_results[:fetch_top_k]
|
65
117
|
|
66
118
|
|
@@ -69,6 +121,7 @@ def reciprocal_rank_fusion(
|
|
69
121
|
secondary_search_results: Sequence[RowMapping],
|
70
122
|
rrf_k: float = 60,
|
71
123
|
fetch_top_k: int = 4,
|
124
|
+
**kwargs: Any,
|
72
125
|
) -> Sequence[dict[str, Any]]:
|
73
126
|
"""
|
74
127
|
Ranks documents using Reciprocal Rank Fusion (RRF) of scores from two sources.
|
@@ -87,35 +140,45 @@ def reciprocal_rank_fusion(
|
|
87
140
|
A list of (document_id, rrf_score) tuples, sorted by rrf_score
|
88
141
|
in descending order.
|
89
142
|
"""
|
143
|
+
distance_strategy = kwargs.get(
|
144
|
+
"distance_strategy", DistanceStrategy.COSINE_DISTANCE
|
145
|
+
)
|
90
146
|
rrf_scores: dict[str, dict[str, Any]] = {}
|
91
147
|
|
92
148
|
# Process results from primary source
|
93
|
-
|
94
|
-
|
95
|
-
)
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
149
|
+
# Determine sorting order based on the vector distance strategy.
|
150
|
+
# For COSINE & EUCLIDEAN(distance), we sort ascending (reverse=False).
|
151
|
+
# For INNER_PRODUCT (similarity), we sort descending (reverse=True).
|
152
|
+
is_similarity_metric = distance_strategy == DistanceStrategy.INNER_PRODUCT
|
153
|
+
sorted_primary = sorted(
|
154
|
+
primary_search_results,
|
155
|
+
key=lambda item: item["distance"],
|
156
|
+
reverse=is_similarity_metric,
|
157
|
+
)
|
158
|
+
|
159
|
+
for rank, row in enumerate(sorted_primary):
|
160
|
+
doc_id = str(list(row.values())[0])
|
161
|
+
if doc_id not in rrf_scores:
|
162
|
+
rrf_scores[doc_id] = dict(row)
|
163
|
+
rrf_scores[doc_id]["distance"] = 0.0
|
164
|
+
# Add the "normalized" rank score
|
165
|
+
rrf_scores[doc_id]["distance"] += 1.0 / (rank + rrf_k)
|
103
166
|
|
104
167
|
# Process results from secondary source
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
rrf_scores[doc_id]
|
168
|
+
# Keyword search relevance is always "higher is better" -> sort descending
|
169
|
+
sorted_secondary = sorted(
|
170
|
+
secondary_search_results,
|
171
|
+
key=lambda item: item["distance"],
|
172
|
+
reverse=True,
|
173
|
+
)
|
174
|
+
|
175
|
+
for rank, row in enumerate(sorted_secondary):
|
176
|
+
doc_id = str(list(row.values())[0])
|
177
|
+
if doc_id not in rrf_scores:
|
178
|
+
rrf_scores[doc_id] = dict(row)
|
179
|
+
rrf_scores[doc_id]["distance"] = 0.0
|
180
|
+
# Add the rank score from this list to the existing score
|
181
|
+
rrf_scores[doc_id]["distance"] += 1.0 / (rank + rrf_k)
|
119
182
|
|
120
183
|
# Sort the results by rrf score in descending order
|
121
184
|
# Sort the results by weighted score in descending order
|
@@ -789,6 +789,24 @@ class PGVectorStore(VectorStore):
|
|
789
789
|
)
|
790
790
|
)
|
791
791
|
|
792
|
+
async def aapply_hybrid_search_index(
|
793
|
+
self,
|
794
|
+
concurrently: bool = False,
|
795
|
+
) -> None:
|
796
|
+
"""Creates a TSV index in the vector store table if possible."""
|
797
|
+
return await self._engine._run_as_async(
|
798
|
+
self.__vs.aapply_hybrid_search_index(concurrently=concurrently)
|
799
|
+
)
|
800
|
+
|
801
|
+
def apply_hybrid_search_index(
|
802
|
+
self,
|
803
|
+
concurrently: bool = False,
|
804
|
+
) -> None:
|
805
|
+
"""Creates a TSV index in the vector store table if possible."""
|
806
|
+
return self._engine._run_as_sync(
|
807
|
+
self.__vs.aapply_hybrid_search_index(concurrently=concurrently)
|
808
|
+
)
|
809
|
+
|
792
810
|
async def aapply_vector_index(
|
793
811
|
self,
|
794
812
|
index: BaseIndex,
|
@@ -1,17 +1,17 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: langchain-postgres
|
3
|
-
Version: 0.0.
|
3
|
+
Version: 0.0.16
|
4
4
|
Summary: An integration package connecting Postgres and LangChain
|
5
5
|
License-Expression: MIT
|
6
6
|
License-File: LICENSE
|
7
7
|
Requires-Python: >=3.9
|
8
8
|
Requires-Dist: asyncpg>=0.30.0
|
9
|
-
Requires-Dist: langchain-core<
|
9
|
+
Requires-Dist: langchain-core<2.0,>=0.2.13
|
10
10
|
Requires-Dist: numpy<3,>=1.21
|
11
11
|
Requires-Dist: pgvector<0.4,>=0.2.5
|
12
12
|
Requires-Dist: psycopg-pool<4,>=3.2.1
|
13
|
-
Requires-Dist: psycopg<4,>=3
|
14
|
-
Requires-Dist: sqlalchemy<3,>=2
|
13
|
+
Requires-Dist: psycopg[binary]<4,>=3
|
14
|
+
Requires-Dist: sqlalchemy[asyncio]<3,>=2
|
15
15
|
Description-Content-Type: text/markdown
|
16
16
|
|
17
17
|
# langchain-postgres
|
@@ -95,6 +95,24 @@ print(docs)
|
|
95
95
|
> [!TIP]
|
96
96
|
> All synchronous functions have corresponding asynchronous functions
|
97
97
|
|
98
|
+
### Hybrid Search with PGVectorStore
|
99
|
+
|
100
|
+
With PGVectorStore you can use hybrid search for more comprehensive and relevant search results.
|
101
|
+
|
102
|
+
```python
|
103
|
+
vs = PGVectorStore.create_sync(
|
104
|
+
engine=engine,
|
105
|
+
table_name=TABLE_NAME,
|
106
|
+
embedding_service=embedding,
|
107
|
+
hybrid_search_config=HybridSearchConfig(
|
108
|
+
fusion_function=reciprocal_rank_fusion
|
109
|
+
),
|
110
|
+
)
|
111
|
+
hybrid_docs = vs.similarity_search("products", k=5)
|
112
|
+
```
|
113
|
+
|
114
|
+
For a detailed guide on how to use hybrid search, see the [documentation](/examples/pg_vectorstore_how_to.ipynb#hybrid-search-with-pgvectorstore).
|
115
|
+
|
98
116
|
## ChatMessageHistory
|
99
117
|
|
100
118
|
The chat message history abstraction helps to persist chat message history
|
@@ -6,12 +6,12 @@ langchain_postgres/translator.py,sha256=6cTS2RJUodMUdsurJM-f-vgPXl6Ad6bfMo8ECuh5
|
|
6
6
|
langchain_postgres/vectorstores.py,sha256=vzRbPwU1Rn-pOsnTsz1u72cSYD7H8jMlW4N7A58QIt4,83826
|
7
7
|
langchain_postgres/utils/pgvector_migrator.py,sha256=OxW2_FxaomZw5kqPAz-3lmZ5t2hSXU4ZW3xK6O62MH4,11771
|
8
8
|
langchain_postgres/v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
9
|
-
langchain_postgres/v2/async_vectorstore.py,sha256=
|
10
|
-
langchain_postgres/v2/engine.py,sha256=
|
11
|
-
langchain_postgres/v2/hybrid_search_config.py,sha256=
|
9
|
+
langchain_postgres/v2/async_vectorstore.py,sha256=MuRjlRcANOnxrXRGcyGEzIZYr4v75tk8jbMZZCexSAc,58711
|
10
|
+
langchain_postgres/v2/engine.py,sha256=UC3upYnqmgKBw4E6t62CbjUEdVO67t1j0rCbdFmoQnI,16902
|
11
|
+
langchain_postgres/v2/hybrid_search_config.py,sha256=dhBeedqpVXv2VP2_RLs_jNHLLLrukJ-UXytxRD3zVts,7658
|
12
12
|
langchain_postgres/v2/indexes.py,sha256=aLCFGYiIbLBUr88drMLD6l41MPRI7lv0ALMVRWfqdq4,4888
|
13
|
-
langchain_postgres/v2/vectorstores.py,sha256=
|
14
|
-
langchain_postgres-0.0.
|
15
|
-
langchain_postgres-0.0.
|
16
|
-
langchain_postgres-0.0.
|
17
|
-
langchain_postgres-0.0.
|
13
|
+
langchain_postgres/v2/vectorstores.py,sha256=Iq5z3KU0Ne_djMLlhJNL43zprii0O1JdUN2uEuvvKNI,39213
|
14
|
+
langchain_postgres-0.0.16.dist-info/METADATA,sha256=fLsfXjrnlW412RDvPW5nv4uFJqaujUQkIBujCCsERWc,7143
|
15
|
+
langchain_postgres-0.0.16.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
16
|
+
langchain_postgres-0.0.16.dist-info/licenses/LICENSE,sha256=2btS8uNUDWD_UNjw9ba6ZJt_00aUjEw9CGyK-xIHY8c,1072
|
17
|
+
langchain_postgres-0.0.16.dist-info/RECORD,,
|
File without changes
|
{langchain_postgres-0.0.15.dist-info → langchain_postgres-0.0.16.dist-info}/licenses/LICENSE
RENAMED
File without changes
|