langchain-postgres 0.0.15__py3-none-any.whl → 0.0.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -210,7 +210,7 @@ class AsyncPGVectorStore(VectorStore):
210
210
  hybrid_search_config.tsv_column = ""
211
211
  if embedding_column not in columns:
212
212
  raise ValueError(f"Embedding column, {embedding_column}, does not exist.")
213
- if columns[embedding_column] != "USER-DEFINED":
213
+ if columns[embedding_column] not in ["USER-DEFINED", "vector"]:
214
214
  raise ValueError(
215
215
  f"Embedding column, {embedding_column}, is not type Vector."
216
216
  )
@@ -580,16 +580,16 @@ class AsyncPGVectorStore(VectorStore):
580
580
  For best hybrid search performance, consider creating a TSV column
581
581
  and adding GIN index.
582
582
  """
583
- if not k:
584
- k = (
585
- max(
586
- self.k,
587
- self.hybrid_search_config.primary_top_k,
588
- self.hybrid_search_config.secondary_top_k,
589
- )
590
- if self.hybrid_search_config
591
- else self.k
592
- )
583
+ hybrid_search_config = kwargs.get(
584
+ "hybrid_search_config", self.hybrid_search_config
585
+ )
586
+
587
+ final_k = k if k is not None else self.k
588
+
589
+ dense_limit = final_k
590
+ if hybrid_search_config:
591
+ dense_limit = hybrid_search_config.primary_top_k
592
+
593
593
  operator = self.distance_strategy.operator
594
594
  search_function = self.distance_strategy.search_function
595
595
 
@@ -617,9 +617,9 @@ class AsyncPGVectorStore(VectorStore):
617
617
  embedding_data_string = ":query_embedding"
618
618
  where_filters = f"WHERE {safe_filter}" if safe_filter else ""
619
619
  dense_query_stmt = f"""SELECT {column_names}, {search_function}("{self.embedding_column}", {embedding_data_string}) as distance
620
- FROM "{self.schema_name}"."{self.table_name}" {where_filters} ORDER BY "{self.embedding_column}" {operator} {embedding_data_string} LIMIT :k;
620
+ FROM "{self.schema_name}"."{self.table_name}" {where_filters} ORDER BY "{self.embedding_column}" {operator} {embedding_data_string} LIMIT :dense_limit;
621
621
  """
622
- param_dict = {"query_embedding": query_embedding, "k": k}
622
+ param_dict = {"query_embedding": query_embedding, "dense_limit": dense_limit}
623
623
  if filter_dict:
624
624
  param_dict.update(filter_dict)
625
625
  if self.index_query_options:
@@ -637,16 +637,13 @@ class AsyncPGVectorStore(VectorStore):
637
637
  result_map = result.mappings()
638
638
  dense_results = result_map.fetchall()
639
639
 
640
- hybrid_search_config = kwargs.get(
641
- "hybrid_search_config", self.hybrid_search_config
642
- )
643
640
  fts_query = (
644
641
  hybrid_search_config.fts_query
645
642
  if hybrid_search_config and hybrid_search_config.fts_query
646
643
  else kwargs.get("fts_query", "")
647
644
  )
648
645
  if hybrid_search_config and fts_query:
649
- hybrid_search_config.fusion_function_parameters["fetch_top_k"] = k
646
+ hybrid_search_config.fusion_function_parameters["fetch_top_k"] = final_k
650
647
  # do the sparse query
651
648
  lang = (
652
649
  f"'{hybrid_search_config.tsv_lang}',"
@@ -670,6 +667,7 @@ class AsyncPGVectorStore(VectorStore):
670
667
  dense_results,
671
668
  sparse_results,
672
669
  **hybrid_search_config.fusion_function_parameters,
670
+ distance_strategy=self.distance_strategy,
673
671
  )
674
672
  return combined_results
675
673
  return dense_results
@@ -119,7 +119,7 @@ class PGEngine:
119
119
  return await coro
120
120
  # Otherwise, run in the background thread
121
121
  return await asyncio.wrap_future(
122
- asyncio.run_coroutine_threadsafe(coro, self._loop)
122
+ asyncio.run_coroutine_threadsafe(coro, self._loop) # type: ignore[arg-type]
123
123
  )
124
124
 
125
125
  def _run_as_sync(self, coro: Awaitable[T]) -> T:
@@ -128,7 +128,7 @@ class PGEngine:
128
128
  raise Exception(
129
129
  "Engine was initialized without a background loop and cannot call sync methods."
130
130
  )
131
- return asyncio.run_coroutine_threadsafe(coro, self._loop).result()
131
+ return asyncio.run_coroutine_threadsafe(coro, self._loop).result() # type: ignore[arg-type]
132
132
 
133
133
  async def close(self) -> None:
134
134
  """Dispose of connection pool"""
@@ -4,6 +4,40 @@ from typing import Any, Callable, Optional, Sequence
4
4
 
5
5
  from sqlalchemy import RowMapping
6
6
 
7
+ from .indexes import DistanceStrategy
8
+
9
+
10
+ def _normalize_scores(
11
+ results: Sequence[dict[str, Any]], is_distance_metric: bool
12
+ ) -> Sequence[dict[str, Any]]:
13
+ """Normalizes scores to a 0-1 scale, where 1 is best."""
14
+ if not results:
15
+ return []
16
+
17
+ # Get scores from the last column of each result
18
+ scores = [float(list(item.values())[-1]) for item in results]
19
+ min_score, max_score = min(scores), max(scores)
20
+ score_range = max_score - min_score
21
+
22
+ if score_range == 0:
23
+ # All documents are of the highest quality (1.0)
24
+ for item in results:
25
+ item["normalized_score"] = 1.0
26
+ return list(results)
27
+
28
+ for item in results:
29
+ # Access the score again from the last column for calculation
30
+ score = list(item.values())[-1]
31
+ normalized = (score - min_score) / score_range
32
+ if is_distance_metric:
33
+ # For distance, a lower score is better, so we invert the result.
34
+ item["normalized_score"] = 1.0 - normalized
35
+ else:
36
+ # For similarity (like keyword search), a higher score is better.
37
+ item["normalized_score"] = normalized
38
+
39
+ return list(results)
40
+
7
41
 
8
42
  def weighted_sum_ranking(
9
43
  primary_search_results: Sequence[RowMapping],
@@ -11,6 +45,7 @@ def weighted_sum_ranking(
11
45
  primary_results_weight: float = 0.5,
12
46
  secondary_results_weight: float = 0.5,
13
47
  fetch_top_k: int = 4,
48
+ **kwargs: Any,
14
49
  ) -> Sequence[dict[str, Any]]:
15
50
  """
16
51
  Ranks documents using a weighted sum of scores from two sources.
@@ -32,35 +67,52 @@ def weighted_sum_ranking(
32
67
  descending order.
33
68
  """
34
69
 
70
+ distance_strategy = kwargs.get(
71
+ "distance_strategy", DistanceStrategy.COSINE_DISTANCE
72
+ )
73
+ is_primary_distance = distance_strategy != DistanceStrategy.INNER_PRODUCT
74
+
75
+ # Normalize both sets of results onto a 0-1 scale
76
+ normalized_primary = _normalize_scores(
77
+ [dict(row) for row in primary_search_results],
78
+ is_distance_metric=is_primary_distance,
79
+ )
80
+
81
+ # Keyword search relevance is a similarity score (higher is better)
82
+ normalized_secondary = _normalize_scores(
83
+ [dict(row) for row in secondary_search_results], is_distance_metric=False
84
+ )
85
+
35
86
  # stores computed metric with provided distance metric and weights
36
87
  weighted_scores: dict[str, dict[str, Any]] = {}
37
88
 
38
- # Process results from primary source
39
- for row in primary_search_results:
40
- values = list(row.values())
41
- doc_id = str(values[0]) # first value is doc_id
42
- distance = float(values[-1]) # type: ignore # last value is distance
43
- row_values = dict(row)
44
- row_values["distance"] = primary_results_weight * distance
45
- weighted_scores[doc_id] = row_values
46
-
47
- # Process results from secondary source,
48
- # adding to existing scores or creating new ones
49
- for row in secondary_search_results:
50
- values = list(row.values())
51
- doc_id = str(values[0]) # first value is doc_id
52
- distance = float(values[-1]) # type: ignore # last value is distance
53
- primary_score = (
54
- weighted_scores[doc_id]["distance"] if doc_id in weighted_scores else 0.0
55
- )
56
- row_values = dict(row)
57
- row_values["distance"] = distance * secondary_results_weight + primary_score
58
- weighted_scores[doc_id] = row_values
89
+ # Process primary results
90
+ for item in normalized_primary:
91
+ doc_id = str(list(item.values())[0])
92
+ # Set the 'distance' key with the weighted primary score
93
+ item["distance"] = item["normalized_score"] * primary_results_weight
94
+ weighted_scores[doc_id] = item
95
+
96
+ # Process secondary results
97
+ for item in normalized_secondary:
98
+ doc_id = str(list(item.values())[0])
99
+ secondary_weighted_score = item["normalized_score"] * secondary_results_weight
100
+
101
+ if doc_id in weighted_scores:
102
+ # Add to the existing 'distance' score
103
+ weighted_scores[doc_id]["distance"] += secondary_weighted_score
104
+ else:
105
+ # Set the 'distance' key for the new item
106
+ item["distance"] = secondary_weighted_score
107
+ weighted_scores[doc_id] = item
59
108
 
60
- # Sort the results by weighted score in descending order
61
109
  ranked_results = sorted(
62
110
  weighted_scores.values(), key=lambda item: item["distance"], reverse=True
63
111
  )
112
+
113
+ for result in ranked_results:
114
+ result.pop("normalized_score", None)
115
+
64
116
  return ranked_results[:fetch_top_k]
65
117
 
66
118
 
@@ -69,6 +121,7 @@ def reciprocal_rank_fusion(
69
121
  secondary_search_results: Sequence[RowMapping],
70
122
  rrf_k: float = 60,
71
123
  fetch_top_k: int = 4,
124
+ **kwargs: Any,
72
125
  ) -> Sequence[dict[str, Any]]:
73
126
  """
74
127
  Ranks documents using Reciprocal Rank Fusion (RRF) of scores from two sources.
@@ -87,35 +140,45 @@ def reciprocal_rank_fusion(
87
140
  A list of result rows (dicts) whose "distance" value is the rrf_score, sorted by rrf_score
88
141
  in descending order.
89
142
  """
143
+ distance_strategy = kwargs.get(
144
+ "distance_strategy", DistanceStrategy.COSINE_DISTANCE
145
+ )
90
146
  rrf_scores: dict[str, dict[str, Any]] = {}
91
147
 
92
148
  # Process results from primary source
93
- for rank, row in enumerate(
94
- sorted(primary_search_results, key=lambda item: item["distance"], reverse=True)
95
- ):
96
- values = list(row.values())
97
- doc_id = str(values[0])
98
- row_values = dict(row)
99
- primary_score = rrf_scores[doc_id]["distance"] if doc_id in rrf_scores else 0.0
100
- primary_score += 1.0 / (rank + rrf_k)
101
- row_values["distance"] = primary_score
102
- rrf_scores[doc_id] = row_values
149
+ # Determine sorting order based on the vector distance strategy.
150
+ # For COSINE & EUCLIDEAN(distance), we sort ascending (reverse=False).
151
+ # For INNER_PRODUCT (similarity), we sort descending (reverse=True).
152
+ is_similarity_metric = distance_strategy == DistanceStrategy.INNER_PRODUCT
153
+ sorted_primary = sorted(
154
+ primary_search_results,
155
+ key=lambda item: item["distance"],
156
+ reverse=is_similarity_metric,
157
+ )
158
+
159
+ for rank, row in enumerate(sorted_primary):
160
+ doc_id = str(list(row.values())[0])
161
+ if doc_id not in rrf_scores:
162
+ rrf_scores[doc_id] = dict(row)
163
+ rrf_scores[doc_id]["distance"] = 0.0
164
+ # Add the "normalized" rank score
165
+ rrf_scores[doc_id]["distance"] += 1.0 / (rank + rrf_k)
103
166
 
104
167
  # Process results from secondary source
105
- for rank, row in enumerate(
106
- sorted(
107
- secondary_search_results, key=lambda item: item["distance"], reverse=True
108
- )
109
- ):
110
- values = list(row.values())
111
- doc_id = str(values[0])
112
- row_values = dict(row)
113
- secondary_score = (
114
- rrf_scores[doc_id]["distance"] if doc_id in rrf_scores else 0.0
115
- )
116
- secondary_score += 1.0 / (rank + rrf_k)
117
- row_values["distance"] = secondary_score
118
- rrf_scores[doc_id] = row_values
168
+ # Keyword search relevance is always "higher is better" -> sort descending
169
+ sorted_secondary = sorted(
170
+ secondary_search_results,
171
+ key=lambda item: item["distance"],
172
+ reverse=True,
173
+ )
174
+
175
+ for rank, row in enumerate(sorted_secondary):
176
+ doc_id = str(list(row.values())[0])
177
+ if doc_id not in rrf_scores:
178
+ rrf_scores[doc_id] = dict(row)
179
+ rrf_scores[doc_id]["distance"] = 0.0
180
+ # Add the rank score from this list to the existing score
181
+ rrf_scores[doc_id]["distance"] += 1.0 / (rank + rrf_k)
119
182
 
120
183
  # Sort the results by rrf score in descending order
121
184
  # Sort the results by weighted score in descending order
@@ -789,6 +789,24 @@ class PGVectorStore(VectorStore):
789
789
  )
790
790
  )
791
791
 
792
+ async def aapply_hybrid_search_index(
793
+ self,
794
+ concurrently: bool = False,
795
+ ) -> None:
796
+ """Creates a TSV index in the vector store table if possible."""
797
+ return await self._engine._run_as_async(
798
+ self.__vs.aapply_hybrid_search_index(concurrently=concurrently)
799
+ )
800
+
801
+ def apply_hybrid_search_index(
802
+ self,
803
+ concurrently: bool = False,
804
+ ) -> None:
805
+ """Creates a TSV index in the vector store table if possible."""
806
+ return self._engine._run_as_sync(
807
+ self.__vs.aapply_hybrid_search_index(concurrently=concurrently)
808
+ )
809
+
792
810
  async def aapply_vector_index(
793
811
  self,
794
812
  index: BaseIndex,
@@ -1,17 +1,17 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: langchain-postgres
3
- Version: 0.0.15
3
+ Version: 0.0.16
4
4
  Summary: An integration package connecting Postgres and LangChain
5
5
  License-Expression: MIT
6
6
  License-File: LICENSE
7
7
  Requires-Python: >=3.9
8
8
  Requires-Dist: asyncpg>=0.30.0
9
- Requires-Dist: langchain-core<0.4.0,>=0.2.13
9
+ Requires-Dist: langchain-core<2.0,>=0.2.13
10
10
  Requires-Dist: numpy<3,>=1.21
11
11
  Requires-Dist: pgvector<0.4,>=0.2.5
12
12
  Requires-Dist: psycopg-pool<4,>=3.2.1
13
- Requires-Dist: psycopg<4,>=3
14
- Requires-Dist: sqlalchemy<3,>=2
13
+ Requires-Dist: psycopg[binary]<4,>=3
14
+ Requires-Dist: sqlalchemy[asyncio]<3,>=2
15
15
  Description-Content-Type: text/markdown
16
16
 
17
17
  # langchain-postgres
@@ -95,6 +95,24 @@ print(docs)
95
95
  > [!TIP]
96
96
  > All synchronous functions have corresponding asynchronous functions
97
97
 
98
+ ### Hybrid Search with PGVectorStore
99
+
100
+ With PGVectorStore, you can use hybrid search, which combines vector similarity search with keyword (full-text) search, for more comprehensive and relevant search results.
101
+
102
+ ```python
103
+ vs = PGVectorStore.create_sync(
104
+ engine=engine,
105
+ table_name=TABLE_NAME,
106
+ embedding_service=embedding,
107
+ hybrid_search_config=HybridSearchConfig(
108
+ fusion_function=reciprocal_rank_fusion
109
+ ),
110
+ )
111
+ hybrid_docs = vs.similarity_search("products", k=5)
112
+ ```
113
+
114
+ For a detailed guide on how to use hybrid search, see the [documentation](/examples/pg_vectorstore_how_to.ipynb#hybrid-search-with-pgvectorstore).
115
+
98
116
  ## ChatMessageHistory
99
117
 
100
118
  The chat message history abstraction helps to persist chat message history
@@ -6,12 +6,12 @@ langchain_postgres/translator.py,sha256=6cTS2RJUodMUdsurJM-f-vgPXl6Ad6bfMo8ECuh5
6
6
  langchain_postgres/vectorstores.py,sha256=vzRbPwU1Rn-pOsnTsz1u72cSYD7H8jMlW4N7A58QIt4,83826
7
7
  langchain_postgres/utils/pgvector_migrator.py,sha256=OxW2_FxaomZw5kqPAz-3lmZ5t2hSXU4ZW3xK6O62MH4,11771
8
8
  langchain_postgres/v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
- langchain_postgres/v2/async_vectorstore.py,sha256=WJaFs38fZiNJ6ZM2fhz7u6oJZhfig4fP-OKFuyB7MIQ,58739
10
- langchain_postgres/v2/engine.py,sha256=BZJHWzS7SqMWs1-7ZHKkRAIu5PuO98zqg5aWf0EXkDM,16850
11
- langchain_postgres/v2/hybrid_search_config.py,sha256=zDVMscaV0n92BkgGd2J77Y675z9xWS-U6jTmkqHJtGI,5490
9
+ langchain_postgres/v2/async_vectorstore.py,sha256=MuRjlRcANOnxrXRGcyGEzIZYr4v75tk8jbMZZCexSAc,58711
10
+ langchain_postgres/v2/engine.py,sha256=UC3upYnqmgKBw4E6t62CbjUEdVO67t1j0rCbdFmoQnI,16902
11
+ langchain_postgres/v2/hybrid_search_config.py,sha256=dhBeedqpVXv2VP2_RLs_jNHLLLrukJ-UXytxRD3zVts,7658
12
12
  langchain_postgres/v2/indexes.py,sha256=aLCFGYiIbLBUr88drMLD6l41MPRI7lv0ALMVRWfqdq4,4888
13
- langchain_postgres/v2/vectorstores.py,sha256=Lo3IQKjQ6AQlyNP8ILGeyCk6ZyKANcvebpRT5tHCT78,38595
14
- langchain_postgres-0.0.15.dist-info/METADATA,sha256=RzNeUX4gFCBEQ7u7qQHWOC6LsVPxl6xMoOQMLAXtkkU,6556
15
- langchain_postgres-0.0.15.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
16
- langchain_postgres-0.0.15.dist-info/licenses/LICENSE,sha256=2btS8uNUDWD_UNjw9ba6ZJt_00aUjEw9CGyK-xIHY8c,1072
17
- langchain_postgres-0.0.15.dist-info/RECORD,,
13
+ langchain_postgres/v2/vectorstores.py,sha256=Iq5z3KU0Ne_djMLlhJNL43zprii0O1JdUN2uEuvvKNI,39213
14
+ langchain_postgres-0.0.16.dist-info/METADATA,sha256=fLsfXjrnlW412RDvPW5nv4uFJqaujUQkIBujCCsERWc,7143
15
+ langchain_postgres-0.0.16.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
16
+ langchain_postgres-0.0.16.dist-info/licenses/LICENSE,sha256=2btS8uNUDWD_UNjw9ba6ZJt_00aUjEw9CGyK-xIHY8c,1072
17
+ langchain_postgres-0.0.16.dist-info/RECORD,,