cmem-plugin-pgvector 0.6.3__tar.gz → 0.7.1__tar.gz

This diff shows the differences between the contents of two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: cmem-plugin-pgvector
3
- Version: 0.6.3
3
+ Version: 0.7.1
4
4
  Summary: Store and search for embedding vectors in a Postgres vector store.
5
5
  License: Apache-2.0
6
6
  Keywords: eccenca Corporate Memory,plugin
@@ -60,23 +60,15 @@ class PGVectorCollection(StringParameterType):
60
60
  password = depend_on_parameter_values[4]
61
61
  password = password if isinstance(password, str) else password.decrypt()
62
62
  result = []
63
- try:
64
- collections = get_collection_names(
65
- host=host, port=port, dbname=dbname, user=user, password=password
66
- )
67
- filtered_models = set()
68
- if query_terms:
69
- for term in query_terms:
70
- for collection in collections:
71
- if term in collection:
72
- filtered_models.add(collection)
73
- else:
74
- filtered_models = set(collections)
75
- result = [Autocompletion(value=f"{_}", label=f"{_}") for _ in filtered_models]
76
- except Exception as error:
77
- raise ValueError(
78
- "Failed to authenticate with OpenAI API, Please check URL and API key."
79
- ) from error
63
+ collections = get_collection_names(
64
+ host=host, port=port, dbname=dbname, user=user, password=password
65
+ )
66
+ filtered_collections = set()
67
+ for term in query_terms:
68
+ for collection in collections:
69
+ if term in collection:
70
+ filtered_collections.add(collection)
71
+ result = [Autocompletion(value=f"{_}", label=f"{_}") for _ in filtered_collections]
80
72
  result.sort(key=lambda x: x.label)
81
73
  return result
82
74
 
@@ -13,10 +13,11 @@ from cmem_plugin_base.dataintegration.ports import (
13
13
  FixedNumberOfInputs,
14
14
  FixedSchemaPort,
15
15
  )
16
- from cmem_plugin_base.dataintegration.types import IntParameterType
16
+ from cmem_plugin_base.dataintegration.types import EnumParameterType, IntParameterType
17
17
  from langchain_core.documents import Document
18
18
  from langchain_core.embeddings import Embeddings
19
19
  from langchain_postgres import PGVector
20
+ from langchain_postgres.vectorstores import DistanceStrategy
20
21
 
21
22
  from cmem_plugin_pgvector.commons import DatabaseParams
22
23
 
@@ -86,6 +87,14 @@ The results in this output are structured like this:
86
87
  default_value=10,
87
88
  param_type=IntParameterType(),
88
89
  ),
90
+ PluginParameter(
91
+ name="distance_strategy",
92
+ label="Distance Strategy",
93
+ description="The distance strategy to use. (default: COSINE)",
94
+ param_type=EnumParameterType(enum_type=DistanceStrategy),
95
+ default_value=DistanceStrategy.COSINE,
96
+ advanced=True,
97
+ ),
89
98
  ],
90
99
  )
91
100
  class PGVectorSearchPlugin(WorkflowPlugin):
@@ -105,6 +114,7 @@ class PGVectorSearchPlugin(WorkflowPlugin):
105
114
  report: ExecutionReport
106
115
  search_result_path: str
107
116
  top_k: int
117
+ distance_strategy: DistanceStrategy
108
118
 
109
119
  def __init__( # noqa: PLR0913
110
120
  self,
@@ -117,6 +127,7 @@ class PGVectorSearchPlugin(WorkflowPlugin):
117
127
  search_result_path: str = "_search_result",
118
128
  embedding_query_path: str = "_embedding",
119
129
  top_k: int = 10,
130
+ distance_strategy: DistanceStrategy = DistanceStrategy.COSINE,
120
131
  ) -> None:
121
132
  self.collection_name = collection_name
122
133
  self.user = user
@@ -126,6 +137,7 @@ class PGVectorSearchPlugin(WorkflowPlugin):
126
137
  self.embedding_query_path = embedding_query_path
127
138
  self.search_result_path = search_result_path
128
139
  self.top_k = top_k
140
+ self.distance_strategy = distance_strategy
129
141
 
130
142
  str_password = self.password = password if isinstance(password, str) else password.decrypt()
131
143
  self.connection_string = (
@@ -135,14 +147,6 @@ class PGVectorSearchPlugin(WorkflowPlugin):
135
147
  self.report = ExecutionReport()
136
148
  self.report.operation = "search"
137
149
  self.report.operation_desc = "searches"
138
-
139
- self.db = PGVector(
140
- collection_name=self.collection_name,
141
- connection=self.connection_string,
142
- embeddings=DummyEmbeddings(),
143
- use_jsonb=True,
144
- pre_delete_collection=False,
145
- )
146
150
  self._setup_ports()
147
151
 
148
152
  def _setup_ports(self) -> None:
@@ -224,6 +228,14 @@ class PGVectorSearchPlugin(WorkflowPlugin):
224
228
  ) -> Entities:
225
229
  """Run the workflow operator."""
226
230
  self.log.info("Start searching collection.")
231
+ self.db = PGVector(
232
+ collection_name=self.collection_name,
233
+ connection=self.connection_string,
234
+ embeddings=DummyEmbeddings(),
235
+ use_jsonb=True,
236
+ pre_delete_collection=False,
237
+ distance_strategy=self.distance_strategy,
238
+ )
227
239
  self.inputs = inputs
228
240
  self.execution_context = context
229
241
  try:
@@ -137,6 +137,7 @@ class PGVectorStorePlugin(WorkflowPlugin):
137
137
  self.embedding_path = embedding_path
138
138
  self.metadata_paths = metadata_paths
139
139
  self.source_path = source_path
140
+ self.pre_delete_collection = pre_delete_collection
140
141
 
141
142
  self.output_port = None
142
143
  self.input_ports = FixedNumberOfInputs([UnknownSchemaPort()])
@@ -149,14 +150,6 @@ class PGVectorStorePlugin(WorkflowPlugin):
149
150
  self.report.operation = "store"
150
151
  self.report.operation_desc = "vectors stored"
151
152
 
152
- self.db = PGVector(
153
- collection_name=self.collection_name,
154
- connection=self.connection_string,
155
- embeddings=None, # type: ignore # noqa: PGH003
156
- use_jsonb=True,
157
- pre_delete_collection=pre_delete_collection,
158
- )
159
-
160
153
  def _update_report(self, count: int) -> None:
161
154
  self.report.entity_count = count
162
155
  self.execution_context.report.update(self.report)
@@ -228,6 +221,13 @@ class PGVectorStorePlugin(WorkflowPlugin):
228
221
  ) -> None:
229
222
  """Run the workflow operator."""
230
223
  self.log.info("Start storing vectors.")
224
+ self.db = PGVector(
225
+ collection_name=self.collection_name,
226
+ connection=self.connection_string,
227
+ embeddings=None, # type: ignore # noqa: PGH003
228
+ use_jsonb=True,
229
+ pre_delete_collection=self.pre_delete_collection,
230
+ )
231
231
  self.inputs = inputs
232
232
  self.execution_context = context
233
233
  try:
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "cmem-plugin-pgvector"
3
- version = "0.6.3"
3
+ version = "0.7.1"
4
4
  license = "Apache-2.0"
5
5
  description = "Store and search for embedding vectors in a Postgres vector store."
6
6
  authors = ["eccenca GmbH <cmempy-developer@eccenca.com>"]
@@ -40,6 +40,7 @@ pytest-memray = { version = "^1.7.0", markers = "platform_system != 'Windows'"
40
40
  ruff = "^0.12.0"
41
41
  safety = "^1.10.3"
42
42
  aiohttp = "^3.10.11"
43
+ testcontainers = "^4.12.0"
43
44
 
44
45
  [build-system]
45
46
  requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning"]