langchain-postgres 0.0.15__tar.gz → 0.0.16__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/DEVELOPMENT.md +1 -1
- langchain_postgres-0.0.15/README.md → langchain_postgres-0.0.16/PKG-INFO +34 -0
- langchain_postgres-0.0.15/PKG-INFO → langchain_postgres-0.0.16/README.md +18 -16
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/examples/pg_vectorstore_how_to.ipynb +254 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/langchain_postgres/v2/async_vectorstore.py +15 -17
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/langchain_postgres/v2/engine.py +2 -2
- langchain_postgres-0.0.16/langchain_postgres/v2/hybrid_search_config.py +212 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/langchain_postgres/v2/vectorstores.py +18 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/pyproject.toml +7 -4
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/v2/test_async_pg_vectorstore_search.py +3 -3
- langchain_postgres-0.0.16/tests/unit_tests/v2/test_hybrid_search_config.py +314 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/v2/test_pg_vectorstore_index.py +116 -3
- langchain_postgres-0.0.16/uv.lock +1819 -0
- langchain_postgres-0.0.15/langchain_postgres/v2/hybrid_search_config.py +0 -149
- langchain_postgres-0.0.15/tests/unit_tests/v2/test_hybrid_search_config.py +0 -229
- langchain_postgres-0.0.15/uv.lock +0 -1456
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/.github/actions/uv_setup/action.yml +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/.github/workflows/_lint.yml +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/.github/workflows/_release.yml +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/.github/workflows/_test.yml +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/.github/workflows/ci.yml +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/.gitignore +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/CONTRIBUTING.md +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/LICENSE +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/Makefile +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/docker-compose.yml +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/docs/v2_design_overview.md +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/examples/migrate_pgvector_to_pgvectorstore.ipynb +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/examples/migrate_pgvector_to_pgvectorstore.md +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/examples/pg_vectorstore.ipynb +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/examples/vectorstore.ipynb +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/langchain_postgres/__init__.py +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/langchain_postgres/_utils.py +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/langchain_postgres/chat_message_histories.py +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/langchain_postgres/py.typed +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/langchain_postgres/translator.py +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/langchain_postgres/utils/pgvector_migrator.py +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/langchain_postgres/v2/__init__.py +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/langchain_postgres/v2/indexes.py +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/langchain_postgres/vectorstores.py +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/security.md +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/__init__.py +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/__init__.py +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/fake_embeddings.py +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/fixtures/__init__.py +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/fixtures/filtering_test_cases.py +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/fixtures/metadata_filtering_data.py +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/query_constructors/__init__.py +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/query_constructors/test_pgvector.py +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/test_imports.py +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/v1/__init__.py +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/v1/test_chat_histories.py +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/v1/test_vectorstore.py +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/v1/test_vectorstore_standard_tests.py +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/v2/__init__.py +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/v2/test_async_pg_vectorstore.py +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/v2/test_async_pg_vectorstore_from_methods.py +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/v2/test_async_pg_vectorstore_index.py +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/v2/test_engine.py +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/v2/test_indexes.py +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/v2/test_pg_vectorstore.py +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/v2/test_pg_vectorstore_from_methods.py +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/v2/test_pg_vectorstore_search.py +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/unit_tests/v2/test_pg_vectorstore_standard_suite.py +0 -0
- {langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/tests/utils.py +0 -0
{langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/DEVELOPMENT.md
RENAMED
@@ -21,7 +21,7 @@ Start PostgreSQL/PGVector.
 docker run --rm -it --name pgvector-container \
   -e POSTGRES_USER=langchain \
   -e POSTGRES_PASSWORD=langchain \
-  -e POSTGRES_DB=
+  -e POSTGRES_DB=langchain_test \
   -p 6024:5432 pgvector/pgvector:pg16 \
   postgres -c log_statement=all
 ```
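As a quick check that the renamed database is reachable, the container flags above translate directly into a connection string. A minimal sketch using this package's v2 `PGEngine` helper (the user, password, `langchain_test` database name, and host port 6024 all come from the `docker run` flags; nothing else here is from the diff):

```python
from langchain_postgres import PGEngine

# User, password, database, and port mirror the docker run flags above.
CONNECTION_STRING = "postgresql+asyncpg://langchain:langchain@localhost:6024/langchain_test"
engine = PGEngine.from_connection_string(url=CONNECTION_STRING)
```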
langchain_postgres-0.0.15/README.md → langchain_postgres-0.0.16/PKG-INFO
RENAMED
@@ -1,3 +1,19 @@
+Metadata-Version: 2.4
+Name: langchain-postgres
+Version: 0.0.16
+Summary: An integration package connecting Postgres and LangChain
+License-Expression: MIT
+License-File: LICENSE
+Requires-Python: >=3.9
+Requires-Dist: asyncpg>=0.30.0
+Requires-Dist: langchain-core<2.0,>=0.2.13
+Requires-Dist: numpy<3,>=1.21
+Requires-Dist: pgvector<0.4,>=0.2.5
+Requires-Dist: psycopg-pool<4,>=3.2.1
+Requires-Dist: psycopg[binary]<4,>=3
+Requires-Dist: sqlalchemy[asyncio]<3,>=2
+Description-Content-Type: text/markdown
+
 # langchain-postgres
 
 [](https://github.com/langchain-ai/langchain-postgres/releases)
@@ -79,6 +95,24 @@ print(docs)
 > [!TIP]
 > All synchronous functions have corresponding asynchronous functions
 
+### Hybrid Search with PGVectorStore
+
+With PGVectorStore you can use hybrid search for more comprehensive and relevant search results.
+
+```python
+vs = PGVectorStore.create_sync(
+    engine=engine,
+    table_name=TABLE_NAME,
+    embedding_service=embedding,
+    hybrid_search_config=HybridSearchConfig(
+        fusion_function=reciprocal_rank_fusion
+    ),
+)
+hybrid_docs = vector_store.similarity_search("products", k=5)
+```
+
+For a detailed guide on how to use hybrid search, see the [documentation](/examples/pg_vectorstore_how_to.ipynb#hybrid-search-with-pgvectorstore).
+
 ## ChatMessageHistory
 
 The chat message history abstraction helps to persist chat message history
langchain_postgres-0.0.15/PKG-INFO → langchain_postgres-0.0.16/README.md
RENAMED
@@ -1,19 +1,3 @@
-Metadata-Version: 2.4
-Name: langchain-postgres
-Version: 0.0.15
-Summary: An integration package connecting Postgres and LangChain
-License-Expression: MIT
-License-File: LICENSE
-Requires-Python: >=3.9
-Requires-Dist: asyncpg>=0.30.0
-Requires-Dist: langchain-core<0.4.0,>=0.2.13
-Requires-Dist: numpy<3,>=1.21
-Requires-Dist: pgvector<0.4,>=0.2.5
-Requires-Dist: psycopg-pool<4,>=3.2.1
-Requires-Dist: psycopg<4,>=3
-Requires-Dist: sqlalchemy<3,>=2
-Description-Content-Type: text/markdown
-
 # langchain-postgres
 
 [](https://github.com/langchain-ai/langchain-postgres/releases)
@@ -95,6 +79,24 @@ print(docs)
 > [!TIP]
 > All synchronous functions have corresponding asynchronous functions
 
+### Hybrid Search with PGVectorStore
+
+With PGVectorStore you can use hybrid search for more comprehensive and relevant search results.
+
+```python
+vs = PGVectorStore.create_sync(
+    engine=engine,
+    table_name=TABLE_NAME,
+    embedding_service=embedding,
+    hybrid_search_config=HybridSearchConfig(
+        fusion_function=reciprocal_rank_fusion
+    ),
+)
+hybrid_docs = vector_store.similarity_search("products", k=5)
+```
+
+For a detailed guide on how to use hybrid search, see the [documentation](/examples/pg_vectorstore_how_to.ipynb#hybrid-search-with-pgvectorstore).
+
 ## ChatMessageHistory
 
 The chat message history abstraction helps to persist chat message history
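Note that the README snippet shipped in both renditions above assigns the store to `vs` but then searches via `vector_store`, which would raise a `NameError` if run verbatim. A consistent version of the same example (a sketch; `engine`, `TABLE_NAME`, and `embedding` are assumed to be defined as in the surrounding README):

```python
from langchain_postgres import PGVectorStore
from langchain_postgres.v2.hybrid_search_config import (
    HybridSearchConfig,
    reciprocal_rank_fusion,
)

# One name for the store, used for both creation and search.
vector_store = PGVectorStore.create_sync(
    engine=engine,
    table_name=TABLE_NAME,
    embedding_service=embedding,
    hybrid_search_config=HybridSearchConfig(fusion_function=reciprocal_rank_fusion),
)
hybrid_docs = vector_store.similarity_search("products", k=5)
```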
{langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/examples/pg_vectorstore_how_to.ipynb
RENAMED
@@ -686,6 +686,260 @@
  "1. For new records, added via `VectorStore` embeddings are automatically generated."
  ]
 },
+{
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+  "# Hybrid Search with PGVectorStore\n",
+  "\n",
+  "A Hybrid Search combines multiple lookup strategies to provide more comprehensive and relevant search results. Specifically, it leverages both dense embedding vector search as the primary search (for semantic similarity) and TSV (Text Search Vector) based keyword search as the secondary search (for lexical matching). This approach is particularly powerful for applications requiring efficient searching through customized text and metadata, especially when a specialized embedding model isn't feasible or necessary.\n",
+  "\n",
+  "By integrating both semantic and lexical capabilities, hybrid search helps overcome the limitations of each individual method:\n",
+  "* **Semantic Search**: Excellent for understanding the meaning of a query, even if the exact keywords aren't present. However, it can sometimes miss highly relevant documents that contain the precise keywords but have a slightly different semantic context.\n",
+  "* **Keyword Search**: Highly effective for finding documents with exact keyword matches and is generally fast. Its weakness lies in its inability to understand synonyms, misspellings, or conceptual relationships."
+ ]
+},
+{
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+  "## Hybrid Search Config\n",
+  "\n",
+  "You can take advantage of hybrid search with PGVectorStore using the `HybridSearchConfig`.\n",
+  "\n",
+  "With a `HybridSearchConfig` provided, the `PGVectorStore` class can efficiently manage a hybrid search vector store using PostgreSQL as the backend, automatically handling the creation and population of the necessary TSV columns when possible."
+ ]
+},
+{
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+  "### Building the config\n",
+  "\n",
+  "Here are the parameters to the hybrid search config:\n",
+  "* **tsv_column:** The column name for the TSV column. Default: `<content_column>_tsv`\n",
+  "* **tsv_lang:** Value representing a supported language. Default: `pg_catalog.english`\n",
+  "* **fts_query:** If provided, this is used for secondary retrieval instead of the user-provided query.\n",
+  "* **fusion_function:** Determines how the results are merged; the default is an equally weighted sum ranking.\n",
+  "* **fusion_function_parameters:** Parameters for the fusion function\n",
+  "* **primary_top_k:** Max results fetched for primary retrieval. Default: `4`\n",
+  "* **secondary_top_k:** Max results fetched for secondary retrieval. Default: `4`\n",
+  "* **index_name:** Name of the index built on the `tsv_column`\n",
+  "* **index_type:** GIN or GIST. Default: `GIN`"
+ ]
+},
+{
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+  "Here is an example `HybridSearchConfig`:"
+ ]
+},
+{
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+  "from langchain_postgres.v2.hybrid_search_config import (\n",
+  "    HybridSearchConfig,\n",
+  "    reciprocal_rank_fusion,\n",
+  ")\n",
+  "\n",
+  "hybrid_search_config = HybridSearchConfig(\n",
+  "    tsv_column=\"hybrid_description\",\n",
+  "    tsv_lang=\"pg_catalog.english\",\n",
+  "    fusion_function=reciprocal_rank_fusion,\n",
+  "    fusion_function_parameters={\n",
+  "        \"rrf_k\": 60,\n",
+  "        \"fetch_top_k\": 10,\n",
+  "    },\n",
+  ")"
+ ]
+},
+{
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+  "**Note:** In this case we have set the fusion function to `reciprocal_rank_fusion`, but you can also use `weighted_sum_ranking`.\n",
+  "\n",
+  "Make sure to use the right fusion function parameters:\n",
+  "\n",
+  "`reciprocal_rank_fusion`:\n",
+  "* rrf_k: The RRF parameter k. Defaults to 60\n",
+  "* fetch_top_k: The number of documents to fetch after merging the results. Defaults to 4\n",
+  "\n",
+  "`weighted_sum_ranking`:\n",
+  "* primary_results_weight: The weight for the primary source's scores. Defaults to 0.5\n",
+  "* secondary_results_weight: The weight for the secondary source's scores. Defaults to 0.5\n",
+  "* fetch_top_k: The number of documents to fetch after merging the results. Defaults to 4\n"
+ ]
+},
+{
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+  "## Usage\n",
+  "\n",
+  "Let's assume we are using the previously mentioned table [`products`](#create-a-vector-store-using-existing-table), which stores product details for an e-commerce venture.\n"
+ ]
+},
+{
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+  "### With a new hybrid search table\n",
+  "To create a new Postgres table with the TSV column, specify the hybrid search config during the initialization of the vector store.\n",
+  "\n",
+  "In this case, all the similarity searches will make use of hybrid search."
+ ]
+},
+{
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+  "from langchain_postgres import PGVectorStore\n",
+  "\n",
+  "TABLE_NAME = \"hybrid_search_products\"\n",
+  "\n",
+  "await pg_engine.ainit_vectorstore_table(\n",
+  "    table_name=TABLE_NAME,\n",
+  "    # schema_name=SCHEMA_NAME,\n",
+  "    vector_size=VECTOR_SIZE,\n",
+  "    id_column=\"product_id\",\n",
+  "    content_column=\"description\",\n",
+  "    embedding_column=\"embed\",\n",
+  "    metadata_columns=[\"name\", \"category\", \"price_usd\", \"quantity\", \"sku\", \"image_url\"],\n",
+  "    metadata_json_column=\"metadata\",\n",
+  "    hybrid_search_config=hybrid_search_config,\n",
+  "    store_metadata=True,\n",
+  ")\n",
+  "\n",
+  "vs_hybrid = await PGVectorStore.create(\n",
+  "    pg_engine,\n",
+  "    table_name=TABLE_NAME,\n",
+  "    # schema_name=SCHEMA_NAME,\n",
+  "    embedding_service=embedding,\n",
+  "    # Connect to an existing vector store by customizing the column names below\n",
+  "    id_column=\"product_id\",\n",
+  "    content_column=\"description\",\n",
+  "    embedding_column=\"embed\",\n",
+  "    metadata_columns=[\"name\", \"category\", \"price_usd\", \"quantity\", \"sku\", \"image_url\"],\n",
+  "    metadata_json_column=\"metadata\",\n",
+  "    hybrid_search_config=hybrid_search_config,\n",
+  ")\n",
+  "\n",
+  "# Fetch product documents from the previously created store\n",
+  "docs = await custom_store.asimilarity_search(\"products\", k=5)\n",
+  "# Add data normally to the hybrid search vector store, which also populates the tsv_column\n",
+  "await vs_hybrid.aadd_documents(docs)\n",
+  "\n",
+  "# Use hybrid search\n",
+  "hybrid_docs = await vs_hybrid.asimilarity_search(\"products\", k=5)\n",
+  "print(hybrid_docs)"
+ ]
+},
+{
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+  "### With a pre-existing table\n",
+  "\n",
+  "If a hybrid search config is **NOT** provided during `init_vectorstore_table` while creating a table, the table will not contain a tsv_column. In this case you can still take advantage of hybrid search using the `HybridSearchConfig`.\n",
+  "\n",
+  "The specified TSV column is not present, but the TSV vectors are created on the fly for hybrid search."
+ ]
+},
+{
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+  "from langchain_postgres import PGVectorStore\n",
+  "\n",
+  "# Set the existing table name\n",
+  "TABLE_NAME = \"products\"\n",
+  "# SCHEMA_NAME = \"my_schema\"\n",
+  "\n",
+  "hybrid_search_config = HybridSearchConfig(\n",
+  "    tsv_lang=\"pg_catalog.english\",\n",
+  "    fusion_function=reciprocal_rank_fusion,\n",
+  "    fusion_function_parameters={\n",
+  "        \"rrf_k\": 60,\n",
+  "        \"fetch_top_k\": 10,\n",
+  "    },\n",
+  ")\n",
+  "\n",
+  "# Initialize PGVectorStore with the hybrid search config\n",
+  "custom_hybrid_store = await PGVectorStore.create(\n",
+  "    pg_engine,\n",
+  "    table_name=TABLE_NAME,\n",
+  "    # schema_name=SCHEMA_NAME,\n",
+  "    embedding_service=embedding,\n",
+  "    # Connect to an existing vector store by customizing the column names below\n",
+  "    id_column=\"product_id\",\n",
+  "    content_column=\"description\",\n",
+  "    embedding_column=\"embed\",\n",
+  "    metadata_columns=[\"name\", \"category\", \"price_usd\", \"quantity\", \"sku\", \"image_url\"],\n",
+  "    metadata_json_column=\"metadata\",\n",
+  "    hybrid_search_config=hybrid_search_config,\n",
+  ")\n",
+  "\n",
+  "# Use hybrid search\n",
+  "hybrid_docs = await custom_hybrid_store.asimilarity_search(\"products\", k=5)\n",
+  "print(hybrid_docs)"
+ ]
+},
+{
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+  "In this case, all the similarity searches will make use of hybrid search."
+ ]
+},
+{
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+  "### Applying Hybrid Search to Specific Queries\n",
+  "\n",
+  "To use hybrid search only for certain queries, omit the configuration during initialization and pass it directly to the search method when needed."
+ ]
+},
+{
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+  "# Use hybrid search\n",
+  "hybrid_docs = await custom_store.asimilarity_search(\n",
+  "    \"products\", k=5, hybrid_search_config=hybrid_search_config\n",
+  ")\n",
+  "print(hybrid_docs)"
+ ]
+},
+{
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+  "## Hybrid Search Index\n",
+  "\n",
+  "Optionally, if you have created a Postgres table with a tsv_column, you can create an index."
+ ]
+},
+{
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+  "await vs_hybrid.aapply_hybrid_search_index()"
+ ]
+},
 {
  "cell_type": "markdown",
  "metadata": {},
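To make the fusion step concrete, here is a toy, self-contained illustration of the `reciprocal_rank_fusion` function added in this release. Plain dicts stand in for SQLAlchemy row mappings, which works because the function only relies on `.values()` ordering (document id first) and a `distance` key; the product data is invented for the example:

```python
from langchain_postgres.v2.hybrid_search_config import reciprocal_rank_fusion

# Dense (vector) results: cosine distance, lower is better.
dense = [
    {"product_id": "a", "description": "red shoe", "distance": 0.10},
    {"product_id": "b", "description": "blue shoe", "distance": 0.25},
]
# Sparse (full-text) results: ts_rank-style relevance, higher is better.
sparse = [
    {"product_id": "b", "description": "blue shoe", "distance": 0.90},
    {"product_id": "c", "description": "shoe rack", "distance": 0.40},
]

# "b" appears in both lists, so it accumulates two 1 / (rank + rrf_k)
# contributions and is ranked first; "a" and "c" get one each.
for row in reciprocal_rank_fusion(dense, sparse, rrf_k=60, fetch_top_k=3):
    print(row["product_id"], round(row["distance"], 4))
```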
{langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/langchain_postgres/v2/async_vectorstore.py
RENAMED
@@ -210,7 +210,7 @@ class AsyncPGVectorStore(VectorStore):
             hybrid_search_config.tsv_column = ""
         if embedding_column not in columns:
             raise ValueError(f"Embedding column, {embedding_column}, does not exist.")
-        if columns[embedding_column]
+        if columns[embedding_column] not in ["USER-DEFINED", "vector"]:
             raise ValueError(
                 f"Embedding column, {embedding_column}, is not type Vector."
             )
@@ -580,16 +580,16 @@ class AsyncPGVectorStore(VectorStore):
         For best hybrid search performance, consider creating a TSV column
         and adding GIN index.
         """
-
-
-
-
-
-
-
-
-
-
+        hybrid_search_config = kwargs.get(
+            "hybrid_search_config", self.hybrid_search_config
+        )
+
+        final_k = k if k is not None else self.k
+
+        dense_limit = final_k
+        if hybrid_search_config:
+            dense_limit = hybrid_search_config.primary_top_k
+
         operator = self.distance_strategy.operator
         search_function = self.distance_strategy.search_function
 
@@ -617,9 +617,9 @@ class AsyncPGVectorStore(VectorStore):
         embedding_data_string = ":query_embedding"
         where_filters = f"WHERE {safe_filter}" if safe_filter else ""
         dense_query_stmt = f"""SELECT {column_names}, {search_function}("{self.embedding_column}", {embedding_data_string}) as distance
-        FROM "{self.schema_name}"."{self.table_name}" {where_filters} ORDER BY "{self.embedding_column}" {operator} {embedding_data_string} LIMIT :
+        FROM "{self.schema_name}"."{self.table_name}" {where_filters} ORDER BY "{self.embedding_column}" {operator} {embedding_data_string} LIMIT :dense_limit;
         """
-        param_dict = {"query_embedding": query_embedding, "
+        param_dict = {"query_embedding": query_embedding, "dense_limit": dense_limit}
         if filter_dict:
             param_dict.update(filter_dict)
         if self.index_query_options:
@@ -637,16 +637,13 @@ class AsyncPGVectorStore(VectorStore):
         result_map = result.mappings()
         dense_results = result_map.fetchall()
 
-        hybrid_search_config = kwargs.get(
-            "hybrid_search_config", self.hybrid_search_config
-        )
         fts_query = (
             hybrid_search_config.fts_query
             if hybrid_search_config and hybrid_search_config.fts_query
             else kwargs.get("fts_query", "")
         )
         if hybrid_search_config and fts_query:
-            hybrid_search_config.fusion_function_parameters["fetch_top_k"] =
+            hybrid_search_config.fusion_function_parameters["fetch_top_k"] = final_k
             # do the sparse query
             lang = (
                 f"'{hybrid_search_config.tsv_lang}',"
@@ -670,6 +667,7 @@ class AsyncPGVectorStore(VectorStore):
                 dense_results,
                 sparse_results,
                 **hybrid_search_config.fusion_function_parameters,
+                distance_strategy=self.distance_strategy,
             )
             return combined_results
         return dense_results
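The net effect of these changes is that the caller's `k` now controls only the size of the fused result: the dense leg fetches `primary_top_k` rows via `LIMIT :dense_limit`, the sparse leg fetches up to `secondary_top_k`, and `fetch_top_k` is overwritten with `k` before fusion. A sketch under those semantics (`store` is a hypothetical PGVectorStore; the per-call `hybrid_search_config` kwarg is the one demonstrated in the notebook above):

```python
from langchain_postgres.v2.hybrid_search_config import (
    HybridSearchConfig,
    reciprocal_rank_fusion,
)

config = HybridSearchConfig(
    primary_top_k=20,    # dense candidates fetched via LIMIT :dense_limit
    secondary_top_k=20,  # sparse candidates fetched by the full-text query
    fusion_function=reciprocal_rank_fusion,
)
# fetch_top_k is overwritten with k=5 internally, so up to 40 candidates
# are fused and the merged ranking is trimmed to five documents.
docs = await store.asimilarity_search(
    "waterproof shoes", k=5, hybrid_search_config=config
)
```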
{langchain_postgres-0.0.15 → langchain_postgres-0.0.16}/langchain_postgres/v2/engine.py
RENAMED
@@ -119,7 +119,7 @@ class PGEngine:
             return await coro
         # Otherwise, run in the background thread
         return await asyncio.wrap_future(
-            asyncio.run_coroutine_threadsafe(coro, self._loop)
+            asyncio.run_coroutine_threadsafe(coro, self._loop)  # type: ignore[arg-type]
         )
 
     def _run_as_sync(self, coro: Awaitable[T]) -> T:
@@ -128,7 +128,7 @@
             raise Exception(
                 "Engine was initialized without a background loop and cannot call sync methods."
             )
-        return asyncio.run_coroutine_threadsafe(coro, self._loop).result()
+        return asyncio.run_coroutine_threadsafe(coro, self._loop).result()  # type: ignore[arg-type]
 
     async def close(self) -> None:
         """Dispose of connection pool"""
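The two `# type: ignore[arg-type]` additions silence mypy, which expects a `Coroutine` where `PGEngine` passes the broader `Awaitable`. For context, a minimal standalone sketch of the pattern these lines sit inside (stdlib only, not PGEngine's actual code): a dedicated background event loop thread, with sync callers blocking on the `concurrent.futures.Future` and async callers awaiting it via `asyncio.wrap_future`:

```python
import asyncio
import threading

# Background loop on its own thread, as PGEngine maintains internally.
loop = asyncio.new_event_loop()
threading.Thread(target=loop.run_forever, daemon=True).start()

async def work() -> str:
    await asyncio.sleep(0.1)
    return "done"

# Sync bridge: block on the concurrent.futures.Future.
print(asyncio.run_coroutine_threadsafe(work(), loop).result())

# Async bridge: wrap the future so another event loop can await it.
async def caller() -> None:
    print(await asyncio.wrap_future(asyncio.run_coroutine_threadsafe(work(), loop)))

asyncio.run(caller())
```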
langchain_postgres-0.0.16/langchain_postgres/v2/hybrid_search_config.py
ADDED
@@ -0,0 +1,212 @@
+from abc import ABC
+from dataclasses import dataclass, field
+from typing import Any, Callable, Optional, Sequence
+
+from sqlalchemy import RowMapping
+
+from .indexes import DistanceStrategy
+
+
+def _normalize_scores(
+    results: Sequence[dict[str, Any]], is_distance_metric: bool
+) -> Sequence[dict[str, Any]]:
+    """Normalizes scores to a 0-1 scale, where 1 is best."""
+    if not results:
+        return []
+
+    # Get scores from the last column of each result
+    scores = [float(list(item.values())[-1]) for item in results]
+    min_score, max_score = min(scores), max(scores)
+    score_range = max_score - min_score
+
+    if score_range == 0:
+        # All documents are of the highest quality (1.0)
+        for item in results:
+            item["normalized_score"] = 1.0
+        return list(results)
+
+    for item in results:
+        # Access the score again from the last column for calculation
+        score = list(item.values())[-1]
+        normalized = (score - min_score) / score_range
+        if is_distance_metric:
+            # For distance, a lower score is better, so we invert the result.
+            item["normalized_score"] = 1.0 - normalized
+        else:
+            # For similarity (like keyword search), a higher score is better.
+            item["normalized_score"] = normalized
+
+    return list(results)
+
+
+def weighted_sum_ranking(
+    primary_search_results: Sequence[RowMapping],
+    secondary_search_results: Sequence[RowMapping],
+    primary_results_weight: float = 0.5,
+    secondary_results_weight: float = 0.5,
+    fetch_top_k: int = 4,
+    **kwargs: Any,
+) -> Sequence[dict[str, Any]]:
+    """
+    Ranks documents using a weighted sum of scores from two sources.
+
+    Args:
+        primary_search_results: A list of (document, distance) tuples from
+            the primary search.
+        secondary_search_results: A list of (document, distance) tuples from
+            the secondary search.
+        primary_results_weight: The weight for the primary source's scores.
+            Defaults to 0.5.
+        secondary_results_weight: The weight for the secondary source's scores.
+            Defaults to 0.5.
+        fetch_top_k: The number of documents to fetch after merging the results.
+            Defaults to 4.
+
+    Returns:
+        A list of (document, distance) tuples, sorted by weighted_score in
+        descending order.
+    """
+
+    distance_strategy = kwargs.get(
+        "distance_strategy", DistanceStrategy.COSINE_DISTANCE
+    )
+    is_primary_distance = distance_strategy != DistanceStrategy.INNER_PRODUCT
+
+    # Normalize both sets of results onto a 0-1 scale
+    normalized_primary = _normalize_scores(
+        [dict(row) for row in primary_search_results],
+        is_distance_metric=is_primary_distance,
+    )
+
+    # Keyword search relevance is a similarity score (higher is better)
+    normalized_secondary = _normalize_scores(
+        [dict(row) for row in secondary_search_results], is_distance_metric=False
+    )
+
+    # stores computed metric with provided distance metric and weights
+    weighted_scores: dict[str, dict[str, Any]] = {}
+
+    # Process primary results
+    for item in normalized_primary:
+        doc_id = str(list(item.values())[0])
+        # Set the 'distance' key with the weighted primary score
+        item["distance"] = item["normalized_score"] * primary_results_weight
+        weighted_scores[doc_id] = item
+
+    # Process secondary results
+    for item in normalized_secondary:
+        doc_id = str(list(item.values())[0])
+        secondary_weighted_score = item["normalized_score"] * secondary_results_weight
+
+        if doc_id in weighted_scores:
+            # Add to the existing 'distance' score
+            weighted_scores[doc_id]["distance"] += secondary_weighted_score
+        else:
+            # Set the 'distance' key for the new item
+            item["distance"] = secondary_weighted_score
+            weighted_scores[doc_id] = item
+
+    ranked_results = sorted(
+        weighted_scores.values(), key=lambda item: item["distance"], reverse=True
+    )
+
+    for result in ranked_results:
+        result.pop("normalized_score", None)
+
+    return ranked_results[:fetch_top_k]
+
+
+def reciprocal_rank_fusion(
+    primary_search_results: Sequence[RowMapping],
+    secondary_search_results: Sequence[RowMapping],
+    rrf_k: float = 60,
+    fetch_top_k: int = 4,
+    **kwargs: Any,
+) -> Sequence[dict[str, Any]]:
+    """
+    Ranks documents using Reciprocal Rank Fusion (RRF) of scores from two sources.
+
+    Args:
+        primary_search_results: A list of (document, distance) tuples from
+            the primary search.
+        secondary_search_results: A list of (document, distance) tuples from
+            the secondary search.
+        rrf_k: The RRF parameter k.
+            Defaults to 60.
+        fetch_top_k: The number of documents to fetch after merging the results.
+            Defaults to 4.
+
+    Returns:
+        A list of (document_id, rrf_score) tuples, sorted by rrf_score
+        in descending order.
+    """
+    distance_strategy = kwargs.get(
+        "distance_strategy", DistanceStrategy.COSINE_DISTANCE
+    )
+    rrf_scores: dict[str, dict[str, Any]] = {}
+
+    # Process results from primary source
+    # Determine sorting order based on the vector distance strategy.
+    # For COSINE & EUCLIDEAN (distance), we sort ascending (reverse=False).
+    # For INNER_PRODUCT (similarity), we sort descending (reverse=True).
+    is_similarity_metric = distance_strategy == DistanceStrategy.INNER_PRODUCT
+    sorted_primary = sorted(
+        primary_search_results,
+        key=lambda item: item["distance"],
+        reverse=is_similarity_metric,
+    )
+
+    for rank, row in enumerate(sorted_primary):
+        doc_id = str(list(row.values())[0])
+        if doc_id not in rrf_scores:
+            rrf_scores[doc_id] = dict(row)
+            rrf_scores[doc_id]["distance"] = 0.0
+        # Add the "normalized" rank score
+        rrf_scores[doc_id]["distance"] += 1.0 / (rank + rrf_k)
+
+    # Process results from secondary source
+    # Keyword search relevance is always "higher is better" -> sort descending
+    sorted_secondary = sorted(
+        secondary_search_results,
+        key=lambda item: item["distance"],
+        reverse=True,
+    )
+
+    for rank, row in enumerate(sorted_secondary):
+        doc_id = str(list(row.values())[0])
+        if doc_id not in rrf_scores:
+            rrf_scores[doc_id] = dict(row)
+            rrf_scores[doc_id]["distance"] = 0.0
+        # Add the rank score from this list to the existing score
+        rrf_scores[doc_id]["distance"] += 1.0 / (rank + rrf_k)
+
+    # Sort the results by RRF score in descending order
+    ranked_results = sorted(
+        rrf_scores.values(), key=lambda item: item["distance"], reverse=True
+    )
+    # Return only the top results
+    return ranked_results[:fetch_top_k]
+
+
+@dataclass
+class HybridSearchConfig(ABC):
+    """
+    AlloyDB Vector Store Hybrid Search Config.
+
+    Queries might be slow if the hybrid search column does not exist.
+    For best hybrid search performance, consider creating a TSV column
+    and adding a GIN index.
+    """
+
+    tsv_column: Optional[str] = ""
+    tsv_lang: Optional[str] = "pg_catalog.english"
+    fts_query: Optional[str] = ""
+    fusion_function: Callable[
+        [Sequence[RowMapping], Sequence[RowMapping], Any], Sequence[Any]
+    ] = weighted_sum_ranking  # Updated default
+    fusion_function_parameters: dict[str, Any] = field(default_factory=dict)
+    primary_top_k: int = 4
+    secondary_top_k: int = 4
+    index_name: str = "langchain_tsv_index"
+    index_type: str = "GIN"