PyPI - langchain-postgres - Versions diffs - 0.0.15__py3-none-any.whl → 0.0.16__py3-none-any.whl - Mend

langchain-postgres 0.0.15__py3-none-any.whl → 0.0.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

langchain_postgres/v2/async_vectorstore.py CHANGED Viewed

@@ -210,7 +210,7 @@ class AsyncPGVectorStore(VectorStore):
                 hybrid_search_config.tsv_column = ""
         if embedding_column not in columns:
             raise ValueError(f"Embedding column, {embedding_column}, does not exist.")
-        if columns[embedding_column] != "USER-DEFINED":
+        if columns[embedding_column] not in ["USER-DEFINED", "vector"]:
             raise ValueError(
                 f"Embedding column, {embedding_column}, is not type Vector."
             )
@@ -580,16 +580,16 @@ class AsyncPGVectorStore(VectorStore):
         For best hybrid search performance, consider creating a TSV column
         and adding GIN index.
         """
-        if not k:
-            k = (
-                max(
-                    self.k,
-                    self.hybrid_search_config.primary_top_k,
-                    self.hybrid_search_config.secondary_top_k,
-                )
-                if self.hybrid_search_config
-                else self.k
-            )
+        hybrid_search_config = kwargs.get(
+            "hybrid_search_config", self.hybrid_search_config
+        )
+        final_k = k if k is not None else self.k
+        dense_limit = final_k
+        if hybrid_search_config:
+            dense_limit = hybrid_search_config.primary_top_k
         operator = self.distance_strategy.operator
         search_function = self.distance_strategy.search_function
@@ -617,9 +617,9 @@ class AsyncPGVectorStore(VectorStore):
             embedding_data_string = ":query_embedding"
         where_filters = f"WHERE {safe_filter}" if safe_filter else ""
         dense_query_stmt = f"""SELECT {column_names}, {search_function}("{self.embedding_column}", {embedding_data_string}) as distance
-        FROM "{self.schema_name}"."{self.table_name}" {where_filters} ORDER BY "{self.embedding_column}" {operator} {embedding_data_string} LIMIT :k;
+        FROM "{self.schema_name}"."{self.table_name}" {where_filters} ORDER BY "{self.embedding_column}" {operator} {embedding_data_string} LIMIT :dense_limit;
         """
-        param_dict = {"query_embedding": query_embedding, "k": k}
+        param_dict = {"query_embedding": query_embedding, "dense_limit": dense_limit}
         if filter_dict:
             param_dict.update(filter_dict)
         if self.index_query_options:
@@ -637,16 +637,13 @@ class AsyncPGVectorStore(VectorStore):
                 result_map = result.mappings()
                 dense_results = result_map.fetchall()
-        hybrid_search_config = kwargs.get(
-            "hybrid_search_config", self.hybrid_search_config
-        )
         fts_query = (
             hybrid_search_config.fts_query
             if hybrid_search_config and hybrid_search_config.fts_query
             else kwargs.get("fts_query", "")
         )
         if hybrid_search_config and fts_query:
-            hybrid_search_config.fusion_function_parameters["fetch_top_k"] = k
+            hybrid_search_config.fusion_function_parameters["fetch_top_k"] = final_k
             # do the sparse query
             lang = (
                 f"'{hybrid_search_config.tsv_lang}',"
@@ -670,6 +667,7 @@ class AsyncPGVectorStore(VectorStore):
                 dense_results,
                 sparse_results,
                 **hybrid_search_config.fusion_function_parameters,
+                distance_strategy=self.distance_strategy,
             )
             return combined_results
         return dense_results

langchain_postgres/v2/engine.py CHANGED Viewed

@@ -119,7 +119,7 @@ class PGEngine:
             return await coro
         # Otherwise, run in the background thread
         return await asyncio.wrap_future(
-            asyncio.run_coroutine_threadsafe(coro, self._loop)
+            asyncio.run_coroutine_threadsafe(coro, self._loop)  # type: ignore[arg-type]
         )
     def _run_as_sync(self, coro: Awaitable[T]) -> T:
@@ -128,7 +128,7 @@ class PGEngine:
             raise Exception(
                 "Engine was initialized without a background loop and cannot call sync methods."
             )
-        return asyncio.run_coroutine_threadsafe(coro, self._loop).result()
+        return asyncio.run_coroutine_threadsafe(coro, self._loop).result()  # type: ignore[arg-type]
     async def close(self) -> None:
         """Dispose of connection pool"""

langchain_postgres/v2/hybrid_search_config.py CHANGED Viewed

@@ -4,6 +4,40 @@ from typing import Any, Callable, Optional, Sequence
 from sqlalchemy import RowMapping
+from .indexes import DistanceStrategy
+def _normalize_scores(
+    results: Sequence[dict[str, Any]], is_distance_metric: bool
+) -> Sequence[dict[str, Any]]:
+    """Normalizes scores to a 0-1 scale, where 1 is best."""
+    if not results:
+        return []
+    # Get scores from the last column of each result
+    scores = [float(list(item.values())[-1]) for item in results]
+    min_score, max_score = min(scores), max(scores)
+    score_range = max_score - min_score
+    if score_range == 0:
+        # All documents are of the highest quality (1.0)
+        for item in results:
+            item["normalized_score"] = 1.0
+        return list(results)
+    for item in results:
+        # Access the score again from the last column for calculation
+        score = list(item.values())[-1]
+        normalized = (score - min_score) / score_range
+        if is_distance_metric:
+            # For distance, a lower score is better, so we invert the result.
+            item["normalized_score"] = 1.0 - normalized
+        else:
+            # For similarity (like keyword search), a higher score is better.
+            item["normalized_score"] = normalized
+    return list(results)
 def weighted_sum_ranking(
     primary_search_results: Sequence[RowMapping],
@@ -11,6 +45,7 @@ def weighted_sum_ranking(
     primary_results_weight: float = 0.5,
     secondary_results_weight: float = 0.5,
     fetch_top_k: int = 4,
+    **kwargs: Any,
 ) -> Sequence[dict[str, Any]]:
     """
     Ranks documents using a weighted sum of scores from two sources.
@@ -32,35 +67,52 @@ def weighted_sum_ranking(
         descending order.
     """
+    distance_strategy = kwargs.get(
+        "distance_strategy", DistanceStrategy.COSINE_DISTANCE
+    )
+    is_primary_distance = distance_strategy != DistanceStrategy.INNER_PRODUCT
+    # Normalize both sets of results onto a 0-1 scale
+    normalized_primary = _normalize_scores(
+        [dict(row) for row in primary_search_results],
+        is_distance_metric=is_primary_distance,
+    )
+    # Keyword search relevance is a similarity score (higher is better)
+    normalized_secondary = _normalize_scores(
+        [dict(row) for row in secondary_search_results], is_distance_metric=False
+    )
     # stores computed metric with provided distance metric and weights
     weighted_scores: dict[str, dict[str, Any]] = {}
-    # Process results from primary source
-    for row in primary_search_results:
-        values = list(row.values())
-        doc_id = str(values[0])  # first value is doc_id
-        distance = float(values[-1])  # type: ignore # last value is distance
-        row_values = dict(row)
-        row_values["distance"] = primary_results_weight * distance
-        weighted_scores[doc_id] = row_values
-    # Process results from secondary source,
-    # adding to existing scores or creating new ones
-    for row in secondary_search_results:
-        values = list(row.values())
-        doc_id = str(values[0])  # first value is doc_id
-        distance = float(values[-1])  # type: ignore # last value is distance
-        primary_score = (
-            weighted_scores[doc_id]["distance"] if doc_id in weighted_scores else 0.0
-        )
-        row_values = dict(row)
-        row_values["distance"] = distance * secondary_results_weight + primary_score
-        weighted_scores[doc_id] = row_values
+    # Process primary results
+    for item in normalized_primary:
+        doc_id = str(list(item.values())[0])
+        # Set the 'distance' key with the weighted primary score
+        item["distance"] = item["normalized_score"] * primary_results_weight
+        weighted_scores[doc_id] = item
+    # Process secondary results
+    for item in normalized_secondary:
+        doc_id = str(list(item.values())[0])
+        secondary_weighted_score = item["normalized_score"] * secondary_results_weight
+        if doc_id in weighted_scores:
+            # Add to the existing 'distance' score
+            weighted_scores[doc_id]["distance"] += secondary_weighted_score
+        else:
+            # Set the 'distance' key for the new item
+            item["distance"] = secondary_weighted_score
+            weighted_scores[doc_id] = item
-    # Sort the results by weighted score in descending order
     ranked_results = sorted(
         weighted_scores.values(), key=lambda item: item["distance"], reverse=True
     )
+    for result in ranked_results:
+        result.pop("normalized_score", None)
     return ranked_results[:fetch_top_k]
@@ -69,6 +121,7 @@ def reciprocal_rank_fusion(
     secondary_search_results: Sequence[RowMapping],
     rrf_k: float = 60,
     fetch_top_k: int = 4,
+    **kwargs: Any,
 ) -> Sequence[dict[str, Any]]:
     """
     Ranks documents using Reciprocal Rank Fusion (RRF) of scores from two sources.
@@ -87,35 +140,45 @@ def reciprocal_rank_fusion(
         A list of (document_id, rrf_score) tuples, sorted by rrf_score
         in descending order.
     """
+    distance_strategy = kwargs.get(
+        "distance_strategy", DistanceStrategy.COSINE_DISTANCE
+    )
     rrf_scores: dict[str, dict[str, Any]] = {}
     # Process results from primary source
-    for rank, row in enumerate(
-        sorted(primary_search_results, key=lambda item: item["distance"], reverse=True)
-    ):
-        values = list(row.values())
-        doc_id = str(values[0])
-        row_values = dict(row)
-        primary_score = rrf_scores[doc_id]["distance"] if doc_id in rrf_scores else 0.0
-        primary_score += 1.0 / (rank + rrf_k)
-        row_values["distance"] = primary_score
-        rrf_scores[doc_id] = row_values
+    # Determine sorting order based on the vector distance strategy.
+    # For COSINE & EUCLIDEAN(distance), we sort ascending (reverse=False).
+    # For INNER_PRODUCT (similarity), we sort descending (reverse=True).
+    is_similarity_metric = distance_strategy == DistanceStrategy.INNER_PRODUCT
+    sorted_primary = sorted(
+        primary_search_results,
+        key=lambda item: item["distance"],
+        reverse=is_similarity_metric,
+    )
+    for rank, row in enumerate(sorted_primary):
+        doc_id = str(list(row.values())[0])
+        if doc_id not in rrf_scores:
+            rrf_scores[doc_id] = dict(row)
+            rrf_scores[doc_id]["distance"] = 0.0
+        # Add the "normalized" rank score
+        rrf_scores[doc_id]["distance"] += 1.0 / (rank + rrf_k)
     # Process results from secondary source
-    for rank, row in enumerate(
-        sorted(
-            secondary_search_results, key=lambda item: item["distance"], reverse=True
-        )
-    ):
-        values = list(row.values())
-        doc_id = str(values[0])
-        row_values = dict(row)
-        secondary_score = (
-            rrf_scores[doc_id]["distance"] if doc_id in rrf_scores else 0.0
-        )
-        secondary_score += 1.0 / (rank + rrf_k)
-        row_values["distance"] = secondary_score
-        rrf_scores[doc_id] = row_values
+    # Keyword search relevance is always "higher is better" -> sort descending
+    sorted_secondary = sorted(
+        secondary_search_results,
+        key=lambda item: item["distance"],
+        reverse=True,
+    )
+    for rank, row in enumerate(sorted_secondary):
+        doc_id = str(list(row.values())[0])
+        if doc_id not in rrf_scores:
+            rrf_scores[doc_id] = dict(row)
+            rrf_scores[doc_id]["distance"] = 0.0
+        # Add the rank score from this list to the existing score
+        rrf_scores[doc_id]["distance"] += 1.0 / (rank + rrf_k)
     # Sort the results by rrf score in descending order
     # Sort the results by weighted score in descending order

langchain_postgres/v2/vectorstores.py CHANGED Viewed

@@ -789,6 +789,24 @@ class PGVectorStore(VectorStore):
             )
         )
+    async def aapply_hybrid_search_index(
+        self,
+        concurrently: bool = False,
+    ) -> None:
+        """Creates a TSV index in the vector store table if possible."""
+        return await self._engine._run_as_async(
+            self.__vs.aapply_hybrid_search_index(concurrently=concurrently)
+        )
+    def apply_hybrid_search_index(
+        self,
+        concurrently: bool = False,
+    ) -> None:
+        """Creates a TSV index in the vector store table if possible."""
+        return self._engine._run_as_sync(
+            self.__vs.aapply_hybrid_search_index(concurrently=concurrently)
+        )
     async def aapply_vector_index(
         self,
         index: BaseIndex,

{langchain_postgres-0.0.15.dist-info → langchain_postgres-0.0.16.dist-info}/METADATA RENAMED Viewed

@@ -1,17 +1,17 @@
 Metadata-Version: 2.4
 Name: langchain-postgres
-Version: 0.0.15
+Version: 0.0.16
 Summary: An integration package connecting Postgres and LangChain
 License-Expression: MIT
 License-File: LICENSE
 Requires-Python: >=3.9
 Requires-Dist: asyncpg>=0.30.0
-Requires-Dist: langchain-core<0.4.0,>=0.2.13
+Requires-Dist: langchain-core<2.0,>=0.2.13
 Requires-Dist: numpy<3,>=1.21
 Requires-Dist: pgvector<0.4,>=0.2.5
 Requires-Dist: psycopg-pool<4,>=3.2.1
-Requires-Dist: psycopg<4,>=3
-Requires-Dist: sqlalchemy<3,>=2
+Requires-Dist: psycopg[binary]<4,>=3
+Requires-Dist: sqlalchemy[asyncio]<3,>=2
 Description-Content-Type: text/markdown
 # langchain-postgres
@@ -95,6 +95,24 @@ print(docs)
 > [!TIP]
 > All synchronous functions have corresponding asynchronous functions
+### Hybrid Search with PGVectorStore
+With PGVectorStore you can use hybrid search for more comprehensive and relevant search results.
+```python
+vs = PGVectorStore.create_sync(
+    engine=engine,
+    table_name=TABLE_NAME,
+    embedding_service=embedding,
+    hybrid_search_config=HybridSearchConfig(
+      fusion_function=reciprocal_rank_fusion
+    ),
+)
+hybrid_docs = vector_store.similarity_search("products", k=5)
+```
+For a detailed guide on how to use hybrid search, see the [documentation](/examples/pg_vectorstore_how_to.ipynb#hybrid-search-with-pgvectorstore ).
 ## ChatMessageHistory
 The chat message history abstraction helps to persist chat message history

{langchain_postgres-0.0.15.dist-info → langchain_postgres-0.0.16.dist-info}/RECORD RENAMED Viewed

@@ -6,12 +6,12 @@ langchain_postgres/translator.py,sha256=6cTS2RJUodMUdsurJM-f-vgPXl6Ad6bfMo8ECuh5
 langchain_postgres/vectorstores.py,sha256=vzRbPwU1Rn-pOsnTsz1u72cSYD7H8jMlW4N7A58QIt4,83826
 langchain_postgres/utils/pgvector_migrator.py,sha256=OxW2_FxaomZw5kqPAz-3lmZ5t2hSXU4ZW3xK6O62MH4,11771
 langchain_postgres/v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-langchain_postgres/v2/async_vectorstore.py,sha256=WJaFs38fZiNJ6ZM2fhz7u6oJZhfig4fP-OKFuyB7MIQ,58739
-langchain_postgres/v2/engine.py,sha256=BZJHWzS7SqMWs1-7ZHKkRAIu5PuO98zqg5aWf0EXkDM,16850
-langchain_postgres/v2/hybrid_search_config.py,sha256=zDVMscaV0n92BkgGd2J77Y675z9xWS-U6jTmkqHJtGI,5490
+langchain_postgres/v2/async_vectorstore.py,sha256=MuRjlRcANOnxrXRGcyGEzIZYr4v75tk8jbMZZCexSAc,58711
+langchain_postgres/v2/engine.py,sha256=UC3upYnqmgKBw4E6t62CbjUEdVO67t1j0rCbdFmoQnI,16902
+langchain_postgres/v2/hybrid_search_config.py,sha256=dhBeedqpVXv2VP2_RLs_jNHLLLrukJ-UXytxRD3zVts,7658
 langchain_postgres/v2/indexes.py,sha256=aLCFGYiIbLBUr88drMLD6l41MPRI7lv0ALMVRWfqdq4,4888
-langchain_postgres/v2/vectorstores.py,sha256=Lo3IQKjQ6AQlyNP8ILGeyCk6ZyKANcvebpRT5tHCT78,38595
-langchain_postgres-0.0.15.dist-info/METADATA,sha256=RzNeUX4gFCBEQ7u7qQHWOC6LsVPxl6xMoOQMLAXtkkU,6556
-langchain_postgres-0.0.15.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-langchain_postgres-0.0.15.dist-info/licenses/LICENSE,sha256=2btS8uNUDWD_UNjw9ba6ZJt_00aUjEw9CGyK-xIHY8c,1072
-langchain_postgres-0.0.15.dist-info/RECORD,,
+langchain_postgres/v2/vectorstores.py,sha256=Iq5z3KU0Ne_djMLlhJNL43zprii0O1JdUN2uEuvvKNI,39213
+langchain_postgres-0.0.16.dist-info/METADATA,sha256=fLsfXjrnlW412RDvPW5nv4uFJqaujUQkIBujCCsERWc,7143
+langchain_postgres-0.0.16.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+langchain_postgres-0.0.16.dist-info/licenses/LICENSE,sha256=2btS8uNUDWD_UNjw9ba6ZJt_00aUjEw9CGyK-xIHY8c,1072
+langchain_postgres-0.0.16.dist-info/RECORD,,

{langchain_postgres-0.0.15.dist-info → langchain_postgres-0.0.16.dist-info}/WHEEL RENAMED Viewed

File without changes

{langchain_postgres-0.0.15.dist-info → langchain_postgres-0.0.16.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

langchain-postgres 0.0.15__py3-none-any.whl → 0.0.16__py3-none-any.whl

langchain-postgres 0.0.15__py3-none-any.whl → 0.0.16__py3-none-any.whl