langchain-postgres 0.0.15__tar.gz → 0.0.16__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/DEVELOPMENT.md +1 -1
  2. langchain_postgres-0.0.15/README.md → langchain_postgres-0.0.16/PKG-INFO +34 -0
  3. langchain_postgres-0.0.15/PKG-INFO → langchain_postgres-0.0.16/README.md +18 -16
  4. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/examples/pg_vectorstore_how_to.ipynb +254 -0
  5. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/langchain_postgres/v2/async_vectorstore.py +15 -17
  6. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/langchain_postgres/v2/engine.py +2 -2
  7. langchain_postgres-0.0.16/langchain_postgres/v2/hybrid_search_config.py +212 -0
  8. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/langchain_postgres/v2/vectorstores.py +18 -0
  9. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/pyproject.toml +7 -4
  10. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/v2/test_async_pg_vectorstore_search.py +3 -3
  11. langchain_postgres-0.0.16/tests/unit_tests/v2/test_hybrid_search_config.py +314 -0
  12. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/v2/test_pg_vectorstore_index.py +116 -3
  13. langchain_postgres-0.0.16/uv.lock +1819 -0
  14. langchain_postgres-0.0.15/langchain_postgres/v2/hybrid_search_config.py +0 -149
  15. langchain_postgres-0.0.15/tests/unit_tests/v2/test_hybrid_search_config.py +0 -229
  16. langchain_postgres-0.0.15/uv.lock +0 -1456
  17. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/.github/actions/uv_setup/action.yml +0 -0
  18. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/.github/workflows/_lint.yml +0 -0
  19. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/.github/workflows/_release.yml +0 -0
  20. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/.github/workflows/_test.yml +0 -0
  21. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/.github/workflows/ci.yml +0 -0
  22. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/.gitignore +0 -0
  23. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/CONTRIBUTING.md +0 -0
  24. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/LICENSE +0 -0
  25. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/Makefile +0 -0
  26. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/docker-compose.yml +0 -0
  27. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/docs/v2_design_overview.md +0 -0
  28. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/examples/migrate_pgvector_to_pgvectorstore.ipynb +0 -0
  29. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/examples/migrate_pgvector_to_pgvectorstore.md +0 -0
  30. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/examples/pg_vectorstore.ipynb +0 -0
  31. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/examples/vectorstore.ipynb +0 -0
  32. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/langchain_postgres/__init__.py +0 -0
  33. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/langchain_postgres/_utils.py +0 -0
  34. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/langchain_postgres/chat_message_histories.py +0 -0
  35. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/langchain_postgres/py.typed +0 -0
  36. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/langchain_postgres/translator.py +0 -0
  37. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/langchain_postgres/utils/pgvector_migrator.py +0 -0
  38. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/langchain_postgres/v2/__init__.py +0 -0
  39. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/langchain_postgres/v2/indexes.py +0 -0
  40. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/langchain_postgres/vectorstores.py +0 -0
  41. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/security.md +0 -0
  42. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/__init__.py +0 -0
  43. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/__init__.py +0 -0
  44. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/fake_embeddings.py +0 -0
  45. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/fixtures/__init__.py +0 -0
  46. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/fixtures/filtering_test_cases.py +0 -0
  47. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/fixtures/metadata_filtering_data.py +0 -0
  48. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/query_constructors/__init__.py +0 -0
  49. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/query_constructors/test_pgvector.py +0 -0
  50. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/test_imports.py +0 -0
  51. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/v1/__init__.py +0 -0
  52. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/v1/test_chat_histories.py +0 -0
  53. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/v1/test_vectorstore.py +0 -0
  54. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/v1/test_vectorstore_standard_tests.py +0 -0
  55. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/v2/__init__.py +0 -0
  56. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/v2/test_async_pg_vectorstore.py +0 -0
  57. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/v2/test_async_pg_vectorstore_from_methods.py +0 -0
  58. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/v2/test_async_pg_vectorstore_index.py +0 -0
  59. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/v2/test_engine.py +0 -0
  60. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/v2/test_indexes.py +0 -0
  61. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/v2/test_pg_vectorstore.py +0 -0
  62. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/v2/test_pg_vectorstore_from_methods.py +0 -0
  63. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/v2/test_pg_vectorstore_search.py +0 -0
  64. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/v2/test_pg_vectorstore_standard_suite.py +0 -0
  65. {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/utils.py +0 -0
@@ -21,7 +21,7 @@ Start PostgreSQL/PGVector.
21
21
  docker run --rm -it --name pgvector-container \
22
22
  -e POSTGRES_USER=langchain \
23
23
  -e POSTGRES_PASSWORD=langchain \
24
- -e POSTGRES_DB=langchain \
24
+ -e POSTGRES_DB=langchain_test \
25
25
  -p 6024:5432 pgvector/pgvector:pg16 \
26
26
  postgres -c log_statement=all
27
27
  ```
@@ -1,3 +1,19 @@
1
+ Metadata-Version: 2.4
2
+ Name: langchain-postgres
3
+ Version: 0.0.16
4
+ Summary: An integration package connecting Postgres and LangChain
5
+ License-Expression: MIT
6
+ License-File: LICENSE
7
+ Requires-Python: >=3.9
8
+ Requires-Dist: asyncpg>=0.30.0
9
+ Requires-Dist: langchain-core<2.0,>=0.2.13
10
+ Requires-Dist: numpy<3,>=1.21
11
+ Requires-Dist: pgvector<0.4,>=0.2.5
12
+ Requires-Dist: psycopg-pool<4,>=3.2.1
13
+ Requires-Dist: psycopg[binary]<4,>=3
14
+ Requires-Dist: sqlalchemy[asyncio]<3,>=2
15
+ Description-Content-Type: text/markdown
16
+
1
17
  # langchain-postgres
2
18
 
3
19
  [![Release Notes](https://img.shields.io/github/release/langchain-ai/langchain-postgres)](https://github.com/langchain-ai/langchain-postgres/releases)
@@ -79,6 +95,24 @@ print(docs)
79
95
  > [!TIP]
80
96
  > All synchronous functions have corresponding asynchronous functions
81
97
 
98
+ ### Hybrid Search with PGVectorStore
99
+
100
+ With PGVectorStore you can use hybrid search for more comprehensive and relevant search results.
101
+
102
+ ```python
103
+ vs = PGVectorStore.create_sync(
104
+ engine=engine,
105
+ table_name=TABLE_NAME,
106
+ embedding_service=embedding,
107
+ hybrid_search_config=HybridSearchConfig(
108
+ fusion_function=reciprocal_rank_fusion
109
+ ),
110
+ )
111
+ hybrid_docs = vs.similarity_search("products", k=5)
112
+ ```
113
+
114
+ For a detailed guide on how to use hybrid search, see the [documentation](/examples/pg_vectorstore_how_to.ipynb#hybrid-search-with-pgvectorstore).
115
+
82
116
  ## ChatMessageHistory
83
117
 
84
118
  The chat message history abstraction helps to persist chat message history
@@ -1,19 +1,3 @@
1
- Metadata-Version: 2.4
2
- Name: langchain-postgres
3
- Version: 0.0.15
4
- Summary: An integration package connecting Postgres and LangChain
5
- License-Expression: MIT
6
- License-File: LICENSE
7
- Requires-Python: >=3.9
8
- Requires-Dist: asyncpg>=0.30.0
9
- Requires-Dist: langchain-core<0.4.0,>=0.2.13
10
- Requires-Dist: numpy<3,>=1.21
11
- Requires-Dist: pgvector<0.4,>=0.2.5
12
- Requires-Dist: psycopg-pool<4,>=3.2.1
13
- Requires-Dist: psycopg<4,>=3
14
- Requires-Dist: sqlalchemy<3,>=2
15
- Description-Content-Type: text/markdown
16
-
17
1
  # langchain-postgres
18
2
 
19
3
  [![Release Notes](https://img.shields.io/github/release/langchain-ai/langchain-postgres)](https://github.com/langchain-ai/langchain-postgres/releases)
@@ -95,6 +79,24 @@ print(docs)
95
79
  > [!TIP]
96
80
  > All synchronous functions have corresponding asynchronous functions
97
81
 
82
+ ### Hybrid Search with PGVectorStore
83
+
84
+ With PGVectorStore you can use hybrid search for more comprehensive and relevant search results.
85
+
86
+ ```python
87
+ vs = PGVectorStore.create_sync(
88
+ engine=engine,
89
+ table_name=TABLE_NAME,
90
+ embedding_service=embedding,
91
+ hybrid_search_config=HybridSearchConfig(
92
+ fusion_function=reciprocal_rank_fusion
93
+ ),
94
+ )
95
+ hybrid_docs = vs.similarity_search("products", k=5)
96
+ ```
97
+
98
+ For a detailed guide on how to use hybrid search, see the [documentation](/examples/pg_vectorstore_how_to.ipynb#hybrid-search-with-pgvectorstore).
99
+
98
100
  ## ChatMessageHistory
99
101
 
100
102
  The chat message history abstraction helps to persist chat message history
@@ -686,6 +686,260 @@
686
686
  "1. For new records, added via `VectorStore` embeddings are automatically generated."
687
687
  ]
688
688
  },
689
+ {
690
+ "cell_type": "markdown",
691
+ "metadata": {},
692
+ "source": [
693
+ "# Hybrid Search with PGVectorStore\n",
694
+ "\n",
695
+ "A Hybrid Search combines multiple lookup strategies to provide more comprehensive and relevant search results. Specifically, it leverages both dense embedding vector search as the primary search (for semantic similarity) and TSV (Text Search Vector) based keyword search as the secondary search (for lexical matching). This approach is particularly powerful for applications requiring efficient searching through customized text and metadata, especially when a specialized embedding model isn't feasible or necessary.\n",
696
+ "\n",
697
+ "By integrating both semantic and lexical capabilities, hybrid search helps overcome the limitations of each individual method:\n",
698
+ "* **Semantic Search**: Excellent for understanding the meaning of a query, even if the exact keywords aren't present. However, it can sometimes miss highly relevant documents that contain the precise keywords but have a slightly different semantic context.\n",
699
+ "* **Keyword Search**: Highly effective for finding documents with exact keyword matches and is generally fast. Its weakness lies in its inability to understand synonyms, misspellings, or conceptual relationships."
700
+ ]
701
+ },
702
+ {
703
+ "cell_type": "markdown",
704
+ "metadata": {},
705
+ "source": [
706
+ "## Hybrid Search Config\n",
707
+ "\n",
708
+ "You can take advantage of hybrid search with PGVectorStore using the `HybridSearchConfig`.\n",
709
+ "\n",
710
+ "With a `HybridSearchConfig` provided, the `PGVectorStore` class can efficiently manage a hybrid search vector store using PostgreSQL as the backend, automatically handling the creation and population of the necessary TSV columns when possible."
711
+ ]
712
+ },
713
+ {
714
+ "cell_type": "markdown",
715
+ "metadata": {},
716
+ "source": [
717
+ "### Building the config\n",
718
+ "\n",
719
+ "Here are the parameters to the hybrid search config:\n",
720
+ "* **tsv_column:** The column name for TSV column. Default: `<content_column>_tsv`\n",
721
+ "* **tsv_lang:** Value representing a supported language. Default: `pg_catalog.english`\n",
722
+ "* **fts_query:** If provided, this would be used for secondary retrieval instead of user provided query.\n",
723
+ "* **fusion_function:** Determines how the results are to be merged, default is equal weighted sum ranking.\n",
724
+ "* **fusion_function_parameters:** Parameters for the fusion function\n",
725
+ "* **primary_top_k:** Max results fetched for primary retrieval. Default: `4`\n",
726
+ "* **secondary_top_k:** Max results fetched for secondary retrieval. Default: `4`\n",
727
+ "* **index_name:** Name of the index built on the `tsv_column`\n",
728
+ "* **index_type:** GIN or GIST. Default: `GIN`"
729
+ ]
730
+ },
731
+ {
732
+ "cell_type": "markdown",
733
+ "metadata": {},
734
+ "source": [
735
+ "Here is an example `HybridSearchConfig`"
736
+ ]
737
+ },
738
+ {
739
+ "cell_type": "code",
740
+ "execution_count": null,
741
+ "metadata": {},
742
+ "outputs": [],
743
+ "source": [
744
+ "from langchain_postgres.v2.hybrid_search_config import (\n",
745
+ " HybridSearchConfig,\n",
746
+ " reciprocal_rank_fusion,\n",
747
+ ")\n",
748
+ "\n",
749
+ "hybrid_search_config = HybridSearchConfig(\n",
750
+ " tsv_column=\"hybrid_description\",\n",
751
+ " tsv_lang=\"pg_catalog.english\",\n",
752
+ " fusion_function=reciprocal_rank_fusion,\n",
753
+ " fusion_function_parameters={\n",
754
+ " \"rrf_k\": 60,\n",
755
+ " \"fetch_top_k\": 10,\n",
756
+ " },\n",
757
+ ")"
758
+ ]
759
+ },
760
+ {
761
+ "cell_type": "markdown",
762
+ "metadata": {},
763
+ "source": [
764
+ "**Note:** In this case, we have mentioned the fusion function to be a `reciprocal rank fusion` but you can also use the `weighted_sum_ranking`.\n",
765
+ "\n",
766
+ "Make sure to use the right fusion function parameters\n",
767
+ "\n",
768
+ "`reciprocal_rank_fusion`:\n",
769
+ "* rrf_k: The RRF parameter k. Defaults to 60\n",
770
+ "* fetch_top_k: The number of documents to fetch after merging the results. Defaults to 4\n",
771
+ "\n",
772
+ "`weighted_sum_ranking`:\n",
773
+ "* primary_results_weight: The weight for the primary source's scores. Defaults to 0.5\n",
774
+ "* secondary_results_weight: The weight for the secondary source's scores. Defaults to 0.5\n",
775
+ "* fetch_top_k: The number of documents to fetch after merging the results. Defaults to 4\n"
776
+ ]
777
+ },
778
+ {
779
+ "cell_type": "markdown",
780
+ "metadata": {},
781
+ "source": [
782
+ "## Usage\n",
783
+ "\n",
784
+ "Let's assume we are using the previously mentioned table [`products`](#create-a-vector-store-using-existing-table), which stores product details for an eComm venture.\n"
785
+ ]
786
+ },
787
+ {
788
+ "cell_type": "markdown",
789
+ "metadata": {},
790
+ "source": [
791
+ "### With a new hybrid search table\n",
792
+ "To create a new postgres table with the tsv column, specify the hybrid search config during the initialization of the vector store.\n",
793
+ "\n",
794
+ "In this case, all the similarity searches will make use of hybrid search."
795
+ ]
796
+ },
797
+ {
798
+ "cell_type": "code",
799
+ "execution_count": null,
800
+ "metadata": {},
801
+ "outputs": [],
802
+ "source": [
803
+ "from langchain_postgres import PGVectorStore\n",
804
+ "\n",
805
+ "TABLE_NAME = \"hybrid_search_products\"\n",
806
+ "\n",
807
+ "await pg_engine.ainit_vectorstore_table(\n",
808
+ " table_name=TABLE_NAME,\n",
809
+ " # schema_name=SCHEMA_NAME,\n",
810
+ " vector_size=VECTOR_SIZE,\n",
811
+ " id_column=\"product_id\",\n",
812
+ " content_column=\"description\",\n",
813
+ " embedding_column=\"embed\",\n",
814
+ " metadata_columns=[\"name\", \"category\", \"price_usd\", \"quantity\", \"sku\", \"image_url\"],\n",
815
+ " metadata_json_column=\"metadata\",\n",
816
+ " hybrid_search_config=hybrid_search_config,\n",
817
+ " store_metadata=True,\n",
818
+ ")\n",
819
+ "\n",
820
+ "vs_hybrid = await PGVectorStore.create(\n",
821
+ " pg_engine,\n",
822
+ " table_name=TABLE_NAME,\n",
823
+ " # schema_name=SCHEMA_NAME,\n",
824
+ " embedding_service=embedding,\n",
825
+ " # Connect to existing VectorStore by customizing below column names\n",
826
+ " id_column=\"product_id\",\n",
827
+ " content_column=\"description\",\n",
828
+ " embedding_column=\"embed\",\n",
829
+ " metadata_columns=[\"name\", \"category\", \"price_usd\", \"quantity\", \"sku\", \"image_url\"],\n",
830
+ " metadata_json_column=\"metadata\",\n",
831
+ " hybrid_search_config=hybrid_search_config,\n",
832
+ ")\n",
833
+ "\n",
834
+ "# Fetch documents from the previously created store to fetch product documents\n",
835
+ "docs = await custom_store.asimilarity_search(\"products\", k=5)\n",
836
+ "# Add data normally to the hybrid search vector store, which will also add the tsv values in tsv_column\n",
837
+ "await vs_hybrid.aadd_documents(docs)\n",
838
+ "\n",
839
+ "# Use hybrid search\n",
840
+ "hybrid_docs = await vs_hybrid.asimilarity_search(\"products\", k=5)\n",
841
+ "print(hybrid_docs)"
842
+ ]
843
+ },
844
+ {
845
+ "cell_type": "markdown",
846
+ "metadata": {},
847
+ "source": [
848
+ "### With a pre-existing table\n",
849
+ "\n",
850
+ "If a hybrid search config is **NOT** provided during `init_vectorstore_table` while creating a table, the table will not contain a tsv_column. In this case you can still take advantage of hybrid search using the `HybridSearchConfig`.\n",
851
+ "\n",
852
+ "When the specified TSV column is not present in the table, the TSV vectors are computed dynamically at query time for hybrid search."
853
+ ]
854
+ },
855
+ {
856
+ "cell_type": "code",
857
+ "execution_count": null,
858
+ "metadata": {},
859
+ "outputs": [],
860
+ "source": [
861
+ "from langchain_postgres import PGVectorStore\n",
862
+ "\n",
863
+ "# Set the existing table name\n",
864
+ "TABLE_NAME = \"products\"\n",
865
+ "# SCHEMA_NAME = \"my_schema\"\n",
866
+ "\n",
867
+ "hybrid_search_config = HybridSearchConfig(\n",
868
+ " tsv_lang=\"pg_catalog.english\",\n",
869
+ " fusion_function=reciprocal_rank_fusion,\n",
870
+ " fusion_function_parameters={\n",
871
+ " \"rrf_k\": 60,\n",
872
+ " \"fetch_top_k\": 10,\n",
873
+ " },\n",
874
+ ")\n",
875
+ "\n",
876
+ "# Initialize PGVectorStore with the hybrid search config\n",
877
+ "custom_hybrid_store = await PGVectorStore.create(\n",
878
+ " pg_engine,\n",
879
+ " table_name=TABLE_NAME,\n",
880
+ " # schema_name=SCHEMA_NAME,\n",
881
+ " embedding_service=embedding,\n",
882
+ " # Connect to existing VectorStore by customizing below column names\n",
883
+ " id_column=\"product_id\",\n",
884
+ " content_column=\"description\",\n",
885
+ " embedding_column=\"embed\",\n",
886
+ " metadata_columns=[\"name\", \"category\", \"price_usd\", \"quantity\", \"sku\", \"image_url\"],\n",
887
+ " metadata_json_column=\"metadata\",\n",
888
+ " hybrid_search_config=hybrid_search_config,\n",
889
+ ")\n",
890
+ "\n",
891
+ "# Use hybrid search\n",
892
+ "hybrid_docs = await custom_hybrid_store.asimilarity_search(\"products\", k=5)\n",
893
+ "print(hybrid_docs)"
894
+ ]
895
+ },
896
+ {
897
+ "cell_type": "markdown",
898
+ "metadata": {},
899
+ "source": [
900
+ "In this case, all the similarity searches will make use of hybrid search."
901
+ ]
902
+ },
903
+ {
904
+ "cell_type": "markdown",
905
+ "metadata": {},
906
+ "source": [
907
+ "### Applying Hybrid Search to Specific Queries\n",
908
+ "\n",
909
+ "To use hybrid search only for certain queries, omit the configuration during initialization and pass it directly to the search method when needed."
910
+ ]
911
+ },
912
+ {
913
+ "cell_type": "code",
914
+ "execution_count": null,
915
+ "metadata": {},
916
+ "outputs": [],
917
+ "source": [
918
+ "# Use hybrid search\n",
919
+ "hybrid_docs = await custom_store.asimilarity_search(\n",
920
+ " \"products\", k=5, hybrid_search_config=hybrid_search_config\n",
921
+ ")\n",
922
+ "print(hybrid_docs)"
923
+ ]
924
+ },
925
+ {
926
+ "cell_type": "markdown",
927
+ "metadata": {},
928
+ "source": [
929
+ "## Hybrid Search Index\n",
930
+ "\n",
931
+ "Optionally, if you have created a Postgres table with a tsv_column, you can create an index."
932
+ ]
933
+ },
934
+ {
935
+ "cell_type": "code",
936
+ "execution_count": null,
937
+ "metadata": {},
938
+ "outputs": [],
939
+ "source": [
940
+ "await vs_hybrid.aapply_hybrid_search_index()"
941
+ ]
942
+ },
689
943
  {
690
944
  "cell_type": "markdown",
691
945
  "metadata": {},
@@ -210,7 +210,7 @@ class AsyncPGVectorStore(VectorStore):
210
210
  hybrid_search_config.tsv_column = ""
211
211
  if embedding_column not in columns:
212
212
  raise ValueError(f"Embedding column, {embedding_column}, does not exist.")
213
- if columns[embedding_column] != "USER-DEFINED":
213
+ if columns[embedding_column] not in ["USER-DEFINED", "vector"]:
214
214
  raise ValueError(
215
215
  f"Embedding column, {embedding_column}, is not type Vector."
216
216
  )
@@ -580,16 +580,16 @@ class AsyncPGVectorStore(VectorStore):
580
580
  For best hybrid search performance, consider creating a TSV column
581
581
  and adding GIN index.
582
582
  """
583
- if not k:
584
- k = (
585
- max(
586
- self.k,
587
- self.hybrid_search_config.primary_top_k,
588
- self.hybrid_search_config.secondary_top_k,
589
- )
590
- if self.hybrid_search_config
591
- else self.k
592
- )
583
+ hybrid_search_config = kwargs.get(
584
+ "hybrid_search_config", self.hybrid_search_config
585
+ )
586
+
587
+ final_k = k if k is not None else self.k
588
+
589
+ dense_limit = final_k
590
+ if hybrid_search_config:
591
+ dense_limit = hybrid_search_config.primary_top_k
592
+
593
593
  operator = self.distance_strategy.operator
594
594
  search_function = self.distance_strategy.search_function
595
595
 
@@ -617,9 +617,9 @@ class AsyncPGVectorStore(VectorStore):
617
617
  embedding_data_string = ":query_embedding"
618
618
  where_filters = f"WHERE {safe_filter}" if safe_filter else ""
619
619
  dense_query_stmt = f"""SELECT {column_names}, {search_function}("{self.embedding_column}", {embedding_data_string}) as distance
620
- FROM "{self.schema_name}"."{self.table_name}" {where_filters} ORDER BY "{self.embedding_column}" {operator} {embedding_data_string} LIMIT :k;
620
+ FROM "{self.schema_name}"."{self.table_name}" {where_filters} ORDER BY "{self.embedding_column}" {operator} {embedding_data_string} LIMIT :dense_limit;
621
621
  """
622
- param_dict = {"query_embedding": query_embedding, "k": k}
622
+ param_dict = {"query_embedding": query_embedding, "dense_limit": dense_limit}
623
623
  if filter_dict:
624
624
  param_dict.update(filter_dict)
625
625
  if self.index_query_options:
@@ -637,16 +637,13 @@ class AsyncPGVectorStore(VectorStore):
637
637
  result_map = result.mappings()
638
638
  dense_results = result_map.fetchall()
639
639
 
640
- hybrid_search_config = kwargs.get(
641
- "hybrid_search_config", self.hybrid_search_config
642
- )
643
640
  fts_query = (
644
641
  hybrid_search_config.fts_query
645
642
  if hybrid_search_config and hybrid_search_config.fts_query
646
643
  else kwargs.get("fts_query", "")
647
644
  )
648
645
  if hybrid_search_config and fts_query:
649
- hybrid_search_config.fusion_function_parameters["fetch_top_k"] = k
646
+ hybrid_search_config.fusion_function_parameters["fetch_top_k"] = final_k
650
647
  # do the sparse query
651
648
  lang = (
652
649
  f"'{hybrid_search_config.tsv_lang}',"
@@ -670,6 +667,7 @@ class AsyncPGVectorStore(VectorStore):
670
667
  dense_results,
671
668
  sparse_results,
672
669
  **hybrid_search_config.fusion_function_parameters,
670
+ distance_strategy=self.distance_strategy,
673
671
  )
674
672
  return combined_results
675
673
  return dense_results
@@ -119,7 +119,7 @@ class PGEngine:
119
119
  return await coro
120
120
  # Otherwise, run in the background thread
121
121
  return await asyncio.wrap_future(
122
- asyncio.run_coroutine_threadsafe(coro, self._loop)
122
+ asyncio.run_coroutine_threadsafe(coro, self._loop) # type: ignore[arg-type]
123
123
  )
124
124
 
125
125
  def _run_as_sync(self, coro: Awaitable[T]) -> T:
@@ -128,7 +128,7 @@ class PGEngine:
128
128
  raise Exception(
129
129
  "Engine was initialized without a background loop and cannot call sync methods."
130
130
  )
131
- return asyncio.run_coroutine_threadsafe(coro, self._loop).result()
131
+ return asyncio.run_coroutine_threadsafe(coro, self._loop).result() # type: ignore[arg-type]
132
132
 
133
133
  async def close(self) -> None:
134
134
  """Dispose of connection pool"""
@@ -0,0 +1,212 @@
1
+ from abc import ABC
2
+ from dataclasses import dataclass, field
3
+ from typing import Any, Callable, Optional, Sequence
4
+
5
+ from sqlalchemy import RowMapping
6
+
7
+ from .indexes import DistanceStrategy
8
+
9
+
10
def _normalize_scores(
    results: Sequence[dict[str, Any]], is_distance_metric: bool
) -> Sequence[dict[str, Any]]:
    """Min-max normalize result scores onto a 0-1 scale, where 1 is best.

    The score of each result is read from the last value of its dict
    (insertion order). A ``"normalized_score"`` key is written into every
    dict in place.

    Args:
        results: Result rows as dicts; the last value of each must be
            convertible with ``float()``.
        is_distance_metric: True when a lower raw score is better (the
            normalized value is inverted), False when higher is better.

    Returns:
        A new list holding the same (mutated) dicts, or an empty list when
        ``results`` is empty.
    """
    if not results:
        return []

    # Convert each raw score exactly once and reuse the float below, so that
    # non-float numerics (e.g. Decimal) cannot pass the min/max scan and then
    # fail in the arithmetic.
    scores = [float(list(item.values())[-1]) for item in results]
    min_score, max_score = min(scores), max(scores)
    score_range = max_score - min_score

    for item, score in zip(results, scores):
        if score_range == 0:
            # All scores are identical: treat every row as best quality and
            # avoid dividing by zero.
            item["normalized_score"] = 1.0
            continue

        normalized = (score - min_score) / score_range
        # For a distance, lower is better, so the scaled value is inverted.
        item["normalized_score"] = (
            1.0 - normalized if is_distance_metric else normalized
        )

    return list(results)
40
+
41
+
42
def weighted_sum_ranking(
    primary_search_results: Sequence[RowMapping],
    secondary_search_results: Sequence[RowMapping],
    primary_results_weight: float = 0.5,
    secondary_results_weight: float = 0.5,
    fetch_top_k: int = 4,
    **kwargs: Any,
) -> Sequence[dict[str, Any]]:
    """
    Merge two result sets by a weighted sum of their normalized scores.

    Each row is copied into a dict, its score is normalized to 0-1 (1 is
    best), and rows sharing the same first-column value are combined by
    adding their weighted scores. The combined score is stored under the
    ``"distance"`` key of the returned dicts.

    Args:
        primary_search_results: Rows from the primary search.
        secondary_search_results: Rows from the secondary search; their
            scores are treated as "higher is better".
        primary_results_weight: The weight for the primary source's scores.
            Defaults to 0.5.
        secondary_results_weight: The weight for the secondary source's
            scores. Defaults to 0.5.
        fetch_top_k: The number of rows to keep after merging.
            Defaults to 4.
        **kwargs: May carry ``distance_strategy``; primary scores are treated
            as distances (lower is better) unless it is
            ``DistanceStrategy.INNER_PRODUCT``. Defaults to
            ``DistanceStrategy.COSINE_DISTANCE``.

    Returns:
        Up to ``fetch_top_k`` row dicts, sorted by combined weighted score
        in descending order.
    """
    strategy = kwargs.get("distance_strategy", DistanceStrategy.COSINE_DISTANCE)
    lower_is_better = strategy != DistanceStrategy.INNER_PRODUCT

    # Bring both result sets onto the same 0-1 scale before weighting.
    primary_rows = _normalize_scores(
        [dict(row) for row in primary_search_results],
        is_distance_metric=lower_is_better,
    )
    secondary_rows = _normalize_scores(
        [dict(row) for row in secondary_search_results],
        is_distance_metric=False,
    )

    # Rows keyed by the string form of their first column value.
    merged: dict[str, dict[str, Any]] = {}

    for row in primary_rows:
        key = str(next(iter(row.values())))
        row["distance"] = row["normalized_score"] * primary_results_weight
        merged[key] = row

    for row in secondary_rows:
        key = str(next(iter(row.values())))
        contribution = row["normalized_score"] * secondary_results_weight
        existing = merged.get(key)
        if existing is None:
            # Only the secondary search found this row.
            row["distance"] = contribution
            merged[key] = row
        else:
            # Found by both searches: accumulate onto the primary score.
            existing["distance"] += contribution

    ranked = list(merged.values())
    ranked.sort(key=lambda row: row["distance"], reverse=True)

    # The helper key is internal and is stripped before rows are returned.
    for row in ranked:
        row.pop("normalized_score", None)

    return ranked[:fetch_top_k]
117
+
118
+
119
def reciprocal_rank_fusion(
    primary_search_results: Sequence[RowMapping],
    secondary_search_results: Sequence[RowMapping],
    rrf_k: float = 60,
    fetch_top_k: int = 4,
    **kwargs: Any,
) -> Sequence[dict[str, Any]]:
    """
    Merge two result sets using Reciprocal Rank Fusion (RRF).

    Each source is ordered best-first, and every row contributes
    ``1.0 / (rank + rrf_k)`` (rank counted from 0) to the entry sharing its
    first-column value. The fused score is stored under the ``"distance"``
    key of the returned dicts.

    Args:
        primary_search_results: Rows from the primary search, each with a
            ``"distance"`` value.
        secondary_search_results: Rows from the secondary search, each with
            a ``"distance"`` value treated as "higher is better".
        rrf_k: The RRF parameter k.
            Defaults to 60.
        fetch_top_k: The number of rows to keep after merging.
            Defaults to 4.
        **kwargs: May carry ``distance_strategy``; primary rows are ordered
            ascending unless it is ``DistanceStrategy.INNER_PRODUCT``, in
            which case they are ordered descending. Defaults to
            ``DistanceStrategy.COSINE_DISTANCE``.

    Returns:
        Up to ``fetch_top_k`` row dicts, sorted by fused RRF score in
        descending order.
    """
    strategy = kwargs.get("distance_strategy", DistanceStrategy.COSINE_DISTANCE)
    fused: dict[str, dict[str, Any]] = {}

    # Inner product is a similarity (higher is better); the other strategies
    # are distances (lower is better). Keyword relevance is always a
    # similarity, so the secondary source is always ordered descending.
    primary_descending = strategy == DistanceStrategy.INNER_PRODUCT
    sources = (
        (primary_search_results, primary_descending),
        (secondary_search_results, True),
    )

    for rows, descending in sources:
        best_first = sorted(
            rows, key=lambda row: row["distance"], reverse=descending
        )
        for position, row in enumerate(best_first):
            key = str(list(row.values())[0])
            entry = fused.get(key)
            if entry is None:
                # First sighting: copy the row and start its score at zero.
                entry = dict(row)
                entry["distance"] = 0.0
                fused[key] = entry
            entry["distance"] += 1.0 / (position + rrf_k)

    ranked = list(fused.values())
    ranked.sort(key=lambda entry: entry["distance"], reverse=True)
    return ranked[:fetch_top_k]
190
+
191
+
192
@dataclass
class HybridSearchConfig(ABC):
    """
    Hybrid search configuration for the Postgres vector store.

    Combines the primary (dense vector) search with a secondary
    (TSV keyword) search and merges both result sets with
    ``fusion_function``.

    Queries might be slow if the hybrid search column does not exist.
    For best hybrid search performance, consider creating a TSV column
    and adding GIN index.
    """

    # Name of the TSV column; empty by default.
    tsv_column: Optional[str] = ""
    # Text search language used for the keyword search.
    tsv_lang: Optional[str] = "pg_catalog.english"
    # When set, used for the secondary retrieval instead of the user query.
    fts_query: Optional[str] = ""
    # Callable that merges primary and secondary results; defaults to the
    # weighted sum ranking.
    fusion_function: Callable[
        [Sequence[RowMapping], Sequence[RowMapping], Any], Sequence[Any]
    ] = weighted_sum_ranking
    # Keyword arguments for the fusion function; a fresh dict per instance.
    fusion_function_parameters: dict[str, Any] = field(default_factory=dict)
    # Max results fetched for the primary retrieval.
    primary_top_k: int = 4
    # Max results fetched for the secondary retrieval.
    secondary_top_k: int = 4
    # Name of the index built on the TSV column.
    index_name: str = "langchain_tsv_index"
    # Index type for the TSV column: GIN or GIST.
    index_type: str = "GIN"