nucliadb 6.9.0.post5086__py3-none-any.whl → 6.9.0.post5100__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nucliadb might be problematic. Click here for more details.

@@ -23,13 +23,9 @@ import async_lru
23
23
 
24
24
  from nucliadb.common import datamanagers
25
25
  from nucliadb.common.external_index_providers.base import ExternalIndexManager
26
- from nucliadb.common.external_index_providers.pinecone import PineconeIndexManager
27
- from nucliadb.common.external_index_providers.settings import settings
28
26
  from nucliadb_protos.knowledgebox_pb2 import (
29
- ExternalIndexProviderType,
30
27
  StoredExternalIndexProviderMetadata,
31
28
  )
32
- from nucliadb_utils.utilities import get_endecryptor
33
29
 
34
30
 
35
31
  async def get_external_index_manager(
@@ -39,31 +35,7 @@ async def get_external_index_manager(
39
35
  Returns an ExternalIndexManager for the given kbid.
40
36
  If for_rollover is True, the ExternalIndexManager returned will include the rollover indexes (if any).
41
37
  """
42
- metadata = await get_external_index_metadata(kbid)
43
- if metadata is None or metadata.type != ExternalIndexProviderType.PINECONE:
44
- # Only Pinecone is supported for now
45
- return None
46
-
47
- api_key = get_endecryptor().decrypt(metadata.pinecone_config.encrypted_api_key)
48
- default_vectorset = await get_default_vectorset_id(kbid)
49
-
50
- rollover_indexes = None
51
- if for_rollover:
52
- rollover_metadata = await get_rollover_external_index_metadata(kbid)
53
- if rollover_metadata is not None:
54
- rollover_indexes = dict(rollover_metadata.pinecone_config.indexes)
55
-
56
- return PineconeIndexManager(
57
- kbid=kbid,
58
- api_key=api_key,
59
- indexes=dict(metadata.pinecone_config.indexes),
60
- upsert_parallelism=settings.pinecone_upsert_parallelism,
61
- delete_parallelism=settings.pinecone_delete_parallelism,
62
- upsert_timeout=settings.pinecone_upsert_timeout,
63
- delete_timeout=settings.pinecone_delete_timeout,
64
- default_vectorset=default_vectorset,
65
- rollover_indexes=rollover_indexes,
66
- )
38
+ return None
67
39
 
68
40
 
69
41
  @async_lru.alru_cache(maxsize=None)
@@ -17,36 +17,10 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from pydantic import Field
21
20
  from pydantic_settings import BaseSettings
22
21
 
23
22
 
24
- class ExternalIndexProvidersSettings(BaseSettings):
25
- pinecone_upsert_parallelism: int = Field(
26
- default=3,
27
- title="Pinecone upsert parallelism",
28
- description="Number of parallel upserts to Pinecone on each set resource operation",
29
- )
30
- pinecone_delete_parallelism: int = Field(
31
- default=2,
32
- title="Pinecone delete parallelism",
33
- description="Number of parallel deletes to Pinecone on each delete resource operation",
34
- )
35
- pinecone_upsert_timeout: float = Field(
36
- default=10.0,
37
- title="Pinecone upsert timeout",
38
- description="Timeout in seconds for each upsert operation to Pinecone",
39
- )
40
- pinecone_delete_timeout: float = Field(
41
- default=10.0,
42
- title="Pinecone delete timeout",
43
- description="Timeout in seconds for each delete operation to Pinecone",
44
- )
45
- pinecone_query_timeout: float = Field(
46
- default=10.0,
47
- title="Pinecone query timeout",
48
- description="Timeout in seconds for each query operation to Pinecone",
49
- )
23
+ class ExternalIndexProvidersSettings(BaseSettings): ...
50
24
 
51
25
 
52
26
  settings = ExternalIndexProvidersSettings()
@@ -36,7 +36,6 @@ from nucliadb.common.datamanagers.resources import (
36
36
  KB_RESOURCE_SLUG_BASE,
37
37
  )
38
38
  from nucliadb.common.external_index_providers.base import VectorsetExternalIndex
39
- from nucliadb.common.external_index_providers.pinecone import PineconeIndexManager
40
39
  from nucliadb.common.maindb.driver import Driver, Transaction
41
40
  from nucliadb.common.maindb.pg import PGTransaction
42
41
  from nucliadb.common.nidx import get_nidx_api_client
@@ -53,7 +52,6 @@ from nucliadb.migrator.utils import get_latest_version
53
52
  from nucliadb_protos import knowledgebox_pb2, writer_pb2
54
53
  from nucliadb_protos.knowledgebox_pb2 import (
55
54
  CreateExternalIndexProviderMetadata,
56
- ExternalIndexProviderType,
57
55
  KnowledgeBoxConfig,
58
56
  SemanticModelMetadata,
59
57
  StoredExternalIndexProviderMetadata,
@@ -535,10 +533,7 @@ class KnowledgeBox:
535
533
  request: CreateExternalIndexProviderMetadata,
536
534
  indexes: list[VectorsetExternalIndex],
537
535
  ) -> StoredExternalIndexProviderMetadata:
538
- if request.type != ExternalIndexProviderType.PINECONE:
539
- return StoredExternalIndexProviderMetadata(type=request.type)
540
- # Only pinecone is supported for now
541
- return await PineconeIndexManager.create_indexes(kbid, request, indexes)
536
+ return StoredExternalIndexProviderMetadata(type=request.type)
542
537
 
543
538
  @classmethod
544
539
  async def _maybe_delete_external_indexes(
@@ -546,10 +541,7 @@ class KnowledgeBox:
546
541
  kbid: str,
547
542
  stored: StoredExternalIndexProviderMetadata,
548
543
  ) -> None:
549
- if stored.type != ExternalIndexProviderType.PINECONE:
550
- return
551
- # Only pinecone is supported for now
552
- await PineconeIndexManager.delete_indexes(kbid, stored)
544
+ return
553
545
 
554
546
 
555
547
  def chunker(seq: Sequence, size: int):
@@ -36,10 +36,6 @@ from nucliadb.writer import logger
36
36
  from nucliadb.writer.api.utils import only_for_onprem
37
37
  from nucliadb.writer.api.v1.router import KB_PREFIX, KBS_PREFIX, api
38
38
  from nucliadb.writer.utilities import get_processing
39
- from nucliadb_models.external_index_providers import (
40
- ExternalIndexProviderType,
41
- PineconeServerlessCloud,
42
- )
43
39
  from nucliadb_models.resource import (
44
40
  KnowledgeBoxConfig,
45
41
  KnowledgeBoxObj,
@@ -118,20 +114,6 @@ async def create_kb(item: KnowledgeBoxConfig) -> tuple[str, str]:
118
114
  external_index_provider = knowledgebox_pb2.CreateExternalIndexProviderMetadata(
119
115
  type=knowledgebox_pb2.ExternalIndexProviderType.UNSET,
120
116
  )
121
- if (
122
- item.external_index_provider
123
- and item.external_index_provider.type == ExternalIndexProviderType.PINECONE
124
- ):
125
- pinecone_api_key = item.external_index_provider.api_key
126
- serverless_pb = to_pinecone_serverless_cloud_pb(item.external_index_provider.serverless_cloud)
127
- external_index_provider = knowledgebox_pb2.CreateExternalIndexProviderMetadata(
128
- type=knowledgebox_pb2.ExternalIndexProviderType.PINECONE,
129
- pinecone_config=knowledgebox_pb2.CreatePineconeConfig(
130
- api_key=pinecone_api_key,
131
- serverless_cloud=serverless_pb,
132
- ),
133
- )
134
-
135
117
  try:
136
118
  (kbid, slug) = await KnowledgeBox.create(
137
119
  driver,
@@ -236,15 +218,3 @@ async def delete_kb(request: Request, kbid: str) -> KnowledgeBoxObj:
236
218
  asyncio.create_task(processing.delete_from_processing(kbid=kbid))
237
219
 
238
220
  return KnowledgeBoxObj(uuid=kbid)
239
-
240
-
241
- def to_pinecone_serverless_cloud_pb(
242
- serverless: PineconeServerlessCloud,
243
- ) -> knowledgebox_pb2.PineconeServerlessCloud.ValueType:
244
- return {
245
- PineconeServerlessCloud.AWS_EU_WEST_1: knowledgebox_pb2.PineconeServerlessCloud.AWS_EU_WEST_1,
246
- PineconeServerlessCloud.AWS_US_EAST_1: knowledgebox_pb2.PineconeServerlessCloud.AWS_US_EAST_1,
247
- PineconeServerlessCloud.AWS_US_WEST_2: knowledgebox_pb2.PineconeServerlessCloud.AWS_US_WEST_2,
248
- PineconeServerlessCloud.AZURE_EASTUS2: knowledgebox_pb2.PineconeServerlessCloud.AZURE_EASTUS2,
249
- PineconeServerlessCloud.GCP_US_CENTRAL1: knowledgebox_pb2.PineconeServerlessCloud.GCP_US_CENTRAL1,
250
- }[serverless]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nucliadb
3
- Version: 6.9.0.post5086
3
+ Version: 6.9.0.post5100
4
4
  Summary: NucliaDB
5
5
  Author-email: Nuclia <nucliadb@nuclia.com>
6
6
  License-Expression: AGPL-3.0-or-later
@@ -19,11 +19,11 @@ Classifier: Programming Language :: Python :: 3.12
19
19
  Classifier: Programming Language :: Python :: 3 :: Only
20
20
  Requires-Python: <4,>=3.9
21
21
  Description-Content-Type: text/markdown
22
- Requires-Dist: nucliadb-telemetry[all]>=6.9.0.post5086
23
- Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.9.0.post5086
24
- Requires-Dist: nucliadb-protos>=6.9.0.post5086
25
- Requires-Dist: nucliadb-models>=6.9.0.post5086
26
- Requires-Dist: nidx-protos>=6.9.0.post5086
22
+ Requires-Dist: nucliadb-telemetry[all]>=6.9.0.post5100
23
+ Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.9.0.post5100
24
+ Requires-Dist: nucliadb-protos>=6.9.0.post5100
25
+ Requires-Dist: nucliadb-models>=6.9.0.post5100
26
+ Requires-Dist: nidx-protos>=6.9.0.post5100
27
27
  Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
28
28
  Requires-Dist: nuclia-models>=0.50.0
29
29
  Requires-Dist: uvicorn[standard]
@@ -110,9 +110,8 @@ nucliadb/common/datamanagers/vectorsets.py,sha256=ciYb5uD435Zo8ZbqgPUAszFW9Svp_-
110
110
  nucliadb/common/external_index_providers/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
111
111
  nucliadb/common/external_index_providers/base.py,sha256=BL3DuYbnp-KCmGUiN-FGRtgjWj3SmtgMsGdjGq_7cX4,8905
112
112
  nucliadb/common/external_index_providers/exceptions.py,sha256=nDhhOIkb66hjCrBk4Spvl2vN1SuW5gbwrMCDmrdjHHE,1209
113
- nucliadb/common/external_index_providers/manager.py,sha256=aFSrrKKYG1ydpTSyq4zYD0LOxFS7P-CO6rcKC0hiF4I,4267
114
- nucliadb/common/external_index_providers/pinecone.py,sha256=PB0lUBBZyI9qcyRxtoi9zNp68TbN5FycfIM29gEIvqw,38096
115
- nucliadb/common/external_index_providers/settings.py,sha256=EGHnIkwxqe6aypwKegXTlKO3AgUxNa-6GeAZG25Njis,2002
113
+ nucliadb/common/external_index_providers/manager.py,sha256=tX-WjVZ2Oi7AHu3oWLBw731zFymCYFqNfoo1xEU4IQs,2979
114
+ nucliadb/common/external_index_providers/settings.py,sha256=8_eohahfIqqaNeeo7w4Dkb3_LmjjoWLuS0eqBvczBNA,982
116
115
  nucliadb/common/http_clients/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
117
116
  nucliadb/common/http_clients/auth.py,sha256=srfpgAbs2wmqA9u_l-HxsV4YoO77Tse4y3gm3q2YvYM,2112
118
117
  nucliadb/common/http_clients/exceptions.py,sha256=HniqLZEZN9BNfVv-AaBLpRyb8wpXzMpZNP5oANJYE6M,1208
@@ -167,7 +166,7 @@ nucliadb/ingest/orm/broker_message.py,sha256=XWaiZgDOz94NPOPT-hqbRr5ZkpVimUw6PjU
167
166
  nucliadb/ingest/orm/entities.py,sha256=kXyeF6XOpFKhEsGLcY-GLIk21Exp0cJst4XQQ9jJoug,14791
168
167
  nucliadb/ingest/orm/exceptions.py,sha256=gsp7TtVNQPiIEh-zf_UEJClwuFU0iu-5vzj0OrKMScg,1550
169
168
  nucliadb/ingest/orm/index_message.py,sha256=mWlpQ0-KChSVIbHewVE8sXCe-7LiPIIh0cBqr3axU8o,16554
170
- nucliadb/ingest/orm/knowledgebox.py,sha256=OG9dmfklYf1PgTHwQd_iFZOociLEvUSMMv1ZKeUgecE,23910
169
+ nucliadb/ingest/orm/knowledgebox.py,sha256=n72wuqordGbKhUcFd1jEbf3c7E3IQPnyBXq9rIEDtvM,23428
171
170
  nucliadb/ingest/orm/metrics.py,sha256=OiuggTh-n3kZHA2G73NEUdIlh8c3yFrbusI88DK-Mko,1273
172
171
  nucliadb/ingest/orm/resource.py,sha256=zQeZyZ-tCxr-DhonLobfZRkz_iEew0Y-cGfXeNNIHG0,40432
173
172
  nucliadb/ingest/orm/utils.py,sha256=fCQRuyecgqhaY7mcBG93oaXMkzkKb9BFjOcy4-ZiSNw,2693
@@ -362,7 +361,7 @@ nucliadb/writer/api/utils.py,sha256=wIQHlU8RQiIGVLI72suvyVIKlCU44Unh0Ae0IiN6Qwo,
362
361
  nucliadb/writer/api/v1/__init__.py,sha256=akI9A_jloNLb0dU4T5zjfdyvmSAiDeIdjAlzNx74FlU,1128
363
362
  nucliadb/writer/api/v1/export_import.py,sha256=v0sU55TtRSqDzwkDgcwv2uSaqKCuQTtGcMpYoHQYBQA,8192
364
363
  nucliadb/writer/api/v1/field.py,sha256=nO3IEV6v5hokdIo5HoaecdwDqvr1PzCJlh5DafzcNTw,19130
365
- nucliadb/writer/api/v1/knowledgebox.py,sha256=kioqjD3yN-y1cDTgmXAAOwivXHX9NXxwblcSzGqJup0,9533
364
+ nucliadb/writer/api/v1/knowledgebox.py,sha256=oSeM0HkOhvDIvjwrBeqbd9ubeN91x2-PiuhJgNuBn-w,8084
366
365
  nucliadb/writer/api/v1/learning_config.py,sha256=DTLEzKJ3dHvi8pbZscjElUqCH_ZvLc6WZgvalFqHo10,4450
367
366
  nucliadb/writer/api/v1/resource.py,sha256=IfcT6HXnR5sC5wSnQSuKmFzEWcLTh7OzZEAV4hYmXnA,20442
368
367
  nucliadb/writer/api/v1/router.py,sha256=RjuoWLpZer6Kl2BW_wznpNo6XL3BOpdTGqXZCn3QrrQ,1034
@@ -385,8 +384,8 @@ nucliadb/writer/tus/local.py,sha256=7jYa_w9b-N90jWgN2sQKkNcomqn6JMVBOVeDOVYJHto,
385
384
  nucliadb/writer/tus/s3.py,sha256=vu1BGg4VqJ_x2P1u2BxqPKlSfw5orT_a3R-Ln5oPUpU,8483
386
385
  nucliadb/writer/tus/storage.py,sha256=ToqwjoYnjI4oIcwzkhha_MPxi-k4Jk3Lt55zRwaC1SM,2903
387
386
  nucliadb/writer/tus/utils.py,sha256=MSdVbRsRSZVdkaum69_0wku7X3p5wlZf4nr6E0GMKbw,2556
388
- nucliadb-6.9.0.post5086.dist-info/METADATA,sha256=kaY1HSGwZVkCNZzUpp-_XgKI32QB48_6dRb-kK9GUSA,4158
389
- nucliadb-6.9.0.post5086.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
390
- nucliadb-6.9.0.post5086.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
391
- nucliadb-6.9.0.post5086.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
392
- nucliadb-6.9.0.post5086.dist-info/RECORD,,
387
+ nucliadb-6.9.0.post5100.dist-info/METADATA,sha256=0sGsM87JOiyKio3xqNtZEcVvuiXIuhMFywyEPrXoIeQ,4158
388
+ nucliadb-6.9.0.post5100.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
389
+ nucliadb-6.9.0.post5100.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
390
+ nucliadb-6.9.0.post5100.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
391
+ nucliadb-6.9.0.post5100.dist-info/RECORD,,
@@ -1,894 +0,0 @@
1
- # Copyright (C) 2021 Bosutech XXI S.L.
2
- #
3
- # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
- # For commercial licensing, contact us at info@nuclia.com.
5
- #
6
- # AGPL:
7
- # This program is free software: you can redistribute it and/or modify
8
- # it under the terms of the GNU Affero General Public License as
9
- # published by the Free Software Foundation, either version 3 of the
10
- # License, or (at your option) any later version.
11
- #
12
- # This program is distributed in the hope that it will be useful,
13
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- # GNU Affero General Public License for more details.
16
- #
17
- # You should have received a copy of the GNU Affero General Public License
18
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
- #
20
- import asyncio
21
- import logging
22
- from copy import deepcopy
23
- from typing import Any, Iterator, Optional
24
- from uuid import uuid4
25
-
26
- import backoff
27
- from cachetools import TTLCache
28
- from nidx_protos.nodereader_pb2 import FilterExpression, SearchRequest
29
- from nidx_protos.noderesources_pb2 import IndexParagraph, Resource, VectorSentence
30
- from pydantic import BaseModel
31
-
32
- from nucliadb.common.counters import IndexCounts
33
- from nucliadb.common.external_index_providers.base import (
34
- ExternalIndexManager,
35
- ExternalIndexProviderType,
36
- QueryResults,
37
- TextBlockMatch,
38
- VectorsetExternalIndex,
39
- )
40
- from nucliadb.common.external_index_providers.exceptions import ExternalIndexCreationError
41
- from nucliadb.common.ids import ParagraphId, VectorId
42
- from nucliadb_models.search import SCORE_TYPE, TextPosition
43
- from nucliadb_protos import knowledgebox_pb2 as kb_pb2
44
- from nucliadb_protos import utils_pb2
45
- from nucliadb_telemetry.metrics import Observer
46
- from nucliadb_utils.aiopynecone.client import DataPlane, FilterOperator, LogicalOperator
47
- from nucliadb_utils.aiopynecone.exceptions import (
48
- MetadataTooLargeError,
49
- PineconeAPIError,
50
- )
51
- from nucliadb_utils.aiopynecone.models import QueryResponse
52
- from nucliadb_utils.aiopynecone.models import Vector as PineconeVector
53
- from nucliadb_utils.utilities import get_endecryptor, get_pinecone
54
-
55
- logger = logging.getLogger(__name__)
56
-
57
- manager_observer = Observer("pinecone_index_manager", labels={"operation": ""})
58
-
59
-
60
- DISCARDED_LABEL_PREFIXES = [
61
- # NER-related labels are not supported in the Pinecone integration because right now
62
- # the number of detected entities is unbounded and may exceed the vector metadata size limit.
63
- "/e/",
64
- # Processing status labels are only needed for the catalog endpoint.
65
- "/n/s",
66
- ]
67
-
68
- # To avoid querying the Pinecone API for the same index stats multiple times in a short period of time
69
- COUNTERS_CACHE = TTLCache(maxsize=1024, ttl=60) # type: ignore
70
-
71
-
72
- class PineconeQueryResults(QueryResults):
73
- type: ExternalIndexProviderType = ExternalIndexProviderType.PINECONE
74
- results: QueryResponse
75
-
76
- def iter_matching_text_blocks(self) -> Iterator[TextBlockMatch]:
77
- for order, matching_vector in enumerate(self.results.matches):
78
- try:
79
- vector_id = VectorId.from_string(matching_vector.id)
80
- paragraph_id = ParagraphId.from_vector_id(vector_id)
81
- except ValueError: # pragma: no cover
82
- logger.error(f"Invalid Pinecone vector id: {matching_vector.id}")
83
- continue
84
- vector_metadata = VectorMetadata.model_validate(matching_vector.metadata) # noqa
85
- yield TextBlockMatch(
86
- paragraph_id=paragraph_id,
87
- text=None, # To be filled by the results hydrator
88
- score=matching_vector.score,
89
- score_type=SCORE_TYPE.VECTOR,
90
- order=order,
91
- fuzzy_search=False, # semantic search doesn't use fuzziness
92
- is_a_table=vector_metadata.is_a_table or False,
93
- page_with_visual=vector_metadata.page_with_visual or False,
94
- representation_file=vector_metadata.representation_file,
95
- paragraph_labels=vector_metadata.paragraph_labels or [],
96
- field_labels=vector_metadata.field_labels or [],
97
- position=TextPosition(
98
- page_number=vector_metadata.page_number,
99
- index=vector_id.index or 0,
100
- start=paragraph_id.paragraph_start,
101
- end=paragraph_id.paragraph_end,
102
- start_seconds=list(map(int, vector_metadata.position_start_seconds or [])),
103
- end_seconds=list(map(int, vector_metadata.position_end_seconds or [])),
104
- ),
105
- )
106
-
107
-
108
- class IndexHostNotFound(Exception): ...
109
-
110
-
111
- class VectorMetadata(BaseModel):
112
- """
113
- This class models what we index at Pinecone's metadata attribute for each vector.
114
- https://docs.pinecone.io/guides/data/filter-with-metadata
115
- """
116
-
117
- # Id filtering
118
- rid: str
119
- field_type: str
120
- field_id: str
121
-
122
- # Date range filtering
123
- date_created: Optional[int] = None
124
- date_modified: Optional[int] = None
125
-
126
- # Label filtering
127
- paragraph_labels: Optional[list[str]] = None
128
- field_labels: Optional[list[str]] = None
129
-
130
- # Security
131
- security_public: bool = True
132
- security_ids_with_access: Optional[list[str]] = None
133
-
134
- # Position
135
- position_start_seconds: Optional[list[str]] = None
136
- position_end_seconds: Optional[list[str]] = None
137
- page_number: Optional[int] = None
138
-
139
- # AI-tables metadata
140
- page_with_visual: Optional[bool] = None
141
- is_a_table: Optional[bool] = None
142
- representation_file: Optional[str] = None
143
-
144
-
145
- class PineconeIndexManager(ExternalIndexManager):
146
- type = ExternalIndexProviderType.PINECONE
147
- supports_rollover = True
148
-
149
- def __init__(
150
- self,
151
- kbid: str,
152
- api_key: str,
153
- indexes: dict[str, kb_pb2.PineconeIndexMetadata],
154
- upsert_parallelism: int = 3,
155
- delete_parallelism: int = 2,
156
- upsert_timeout: float = 10.0,
157
- delete_timeout: float = 10.0,
158
- query_timeout: float = 10.0,
159
- default_vectorset: Optional[str] = None,
160
- rollover_indexes: Optional[dict[str, kb_pb2.PineconeIndexMetadata]] = None,
161
- ):
162
- super().__init__(kbid=kbid)
163
- assert api_key != ""
164
- self.api_key = api_key
165
- self.indexes = indexes
166
- self.rollover_indexes = rollover_indexes or {}
167
- self.pinecone = get_pinecone()
168
- self.upsert_parallelism = upsert_parallelism
169
- self.delete_parallelism = delete_parallelism
170
- self.upsert_timeout = upsert_timeout
171
- self.delete_timeout = delete_timeout
172
- self.query_timeout = query_timeout
173
- self.default_vectorset = default_vectorset
174
-
175
- def get_data_plane(self, index_host: str) -> DataPlane:
176
- return self.pinecone.data_plane(api_key=self.api_key, index_host=index_host)
177
-
178
- @classmethod
179
- async def create_indexes(
180
- cls,
181
- kbid: str,
182
- request: kb_pb2.CreateExternalIndexProviderMetadata,
183
- indexes: list[VectorsetExternalIndex],
184
- ) -> kb_pb2.StoredExternalIndexProviderMetadata:
185
- created_indexes = []
186
- metadata = kb_pb2.StoredExternalIndexProviderMetadata(
187
- type=kb_pb2.ExternalIndexProviderType.PINECONE
188
- )
189
- api_key = request.pinecone_config.api_key
190
- metadata.pinecone_config.encrypted_api_key = get_endecryptor().encrypt(api_key)
191
- metadata.pinecone_config.serverless_cloud = request.pinecone_config.serverless_cloud
192
- pinecone = get_pinecone().control_plane(api_key=api_key)
193
- serverless_cloud = to_pinecone_serverless_cloud_payload(request.pinecone_config.serverless_cloud)
194
- for index in indexes:
195
- vectorset_id = index.vectorset_id
196
- index_name = PineconeIndexManager.get_index_name()
197
- index_dimension = index.dimension
198
- similarity_metric = to_pinecone_index_metric(index.similarity)
199
- logger.info(
200
- "Creating pincone index",
201
- extra={
202
- "kbid": kbid,
203
- "index_name": index_name,
204
- "similarity": similarity_metric,
205
- "vector_dimension": index_dimension,
206
- "vectorset_id": vectorset_id,
207
- "cloud": serverless_cloud,
208
- },
209
- )
210
- try:
211
- index_host = await pinecone.create_index(
212
- name=index_name,
213
- dimension=index_dimension,
214
- metric=similarity_metric,
215
- serverless_cloud=serverless_cloud,
216
- )
217
- created_indexes.append(index_name)
218
- except PineconeAPIError as exc:
219
- # Try index creation rollback
220
- for index_name in created_indexes:
221
- try:
222
- await cls._delete_index(api_key, index_name)
223
- except Exception:
224
- logger.exception("Could not rollback created pinecone indexes")
225
- raise ExternalIndexCreationError("pinecone", exc.message) from exc
226
- metadata.pinecone_config.indexes[vectorset_id].CopyFrom(
227
- kb_pb2.PineconeIndexMetadata(
228
- index_name=index_name,
229
- index_host=index_host,
230
- vector_dimension=index.dimension,
231
- similarity=index.similarity,
232
- )
233
- )
234
- return metadata
235
-
236
- @classmethod
237
- async def delete_indexes(
238
- cls,
239
- kbid: str,
240
- stored: kb_pb2.StoredExternalIndexProviderMetadata,
241
- ) -> None:
242
- api_key = get_endecryptor().decrypt(stored.pinecone_config.encrypted_api_key)
243
- # Delete all indexes stored in the config and passed as parameters
244
- for index_metadata in stored.pinecone_config.indexes.values():
245
- index_name = index_metadata.index_name
246
- try:
247
- logger.info("Deleting pincone index", extra={"kbid": kbid, "index_name": index_name})
248
- await cls._delete_index(api_key, index_name)
249
- except Exception:
250
- logger.exception(
251
- "Error deleting pinecone index", extra={"kbid": kbid, "index_name": index_name}
252
- )
253
-
254
- @classmethod
255
- @backoff.on_exception(
256
- backoff.expo,
257
- (PineconeAPIError,),
258
- jitter=backoff.random_jitter,
259
- max_tries=3,
260
- )
261
- async def _delete_index(cls, api_key: str, index_name: str) -> None:
262
- control_plane = get_pinecone().control_plane(api_key=api_key)
263
- await control_plane.delete_index(index_name)
264
-
265
- async def rollover_create_indexes(
266
- self, stored: kb_pb2.StoredExternalIndexProviderMetadata
267
- ) -> kb_pb2.StoredExternalIndexProviderMetadata:
268
- result = kb_pb2.StoredExternalIndexProviderMetadata()
269
- result.CopyFrom(stored)
270
- control_plane = get_pinecone().control_plane(api_key=self.api_key)
271
- created_indexes = []
272
- cloud = to_pinecone_serverless_cloud_payload(stored.pinecone_config.serverless_cloud)
273
- try:
274
- for vectorset_id, index in stored.pinecone_config.indexes.items():
275
- rollover_index_name = PineconeIndexManager.get_index_name()
276
- index_dimension = index.vector_dimension
277
- similarity_metric = to_pinecone_index_metric(index.similarity)
278
- logger.info(
279
- "Creating pincone rollover index",
280
- extra={
281
- "kbid": self.kbid,
282
- "index_name": index.index_name,
283
- "rollover_index_name": rollover_index_name,
284
- "similarity": similarity_metric,
285
- "vector_dimension": index_dimension,
286
- "vectorset_id": vectorset_id,
287
- },
288
- )
289
- try:
290
- index_host = await control_plane.create_index(
291
- name=rollover_index_name,
292
- dimension=index_dimension,
293
- metric=similarity_metric,
294
- serverless_cloud=cloud,
295
- )
296
- result.pinecone_config.indexes[vectorset_id].MergeFrom(
297
- kb_pb2.PineconeIndexMetadata(
298
- index_name=rollover_index_name,
299
- index_host=index_host,
300
- vector_dimension=index_dimension,
301
- similarity=index.similarity,
302
- )
303
- )
304
- created_indexes.append(rollover_index_name)
305
- except PineconeAPIError as exc:
306
- raise ExternalIndexCreationError("pinecone", exc.message) from exc
307
- except Exception:
308
- # Rollback any created indexes
309
- for index_name in created_indexes:
310
- try:
311
- await self.__class__._delete_index(self.api_key, index_name)
312
- except Exception:
313
- logger.exception(
314
- f"Could not rollback created pinecone index",
315
- extra={
316
- "kbid": self.kbid,
317
- "index_name": index_name,
318
- },
319
- )
320
- raise
321
-
322
- # Wait for all indexes to be in the ready state
323
- wait_tasks = []
324
- for index_name in created_indexes:
325
- wait_tasks.append(
326
- asyncio.create_task(self.wait_for_index_ready(index_name, max_wait_seconds=60))
327
- )
328
- if len(wait_tasks) > 0:
329
- try:
330
- await asyncio.gather(*wait_tasks)
331
- except asyncio.TimeoutError:
332
- logger.warning(
333
- "Timeout waiting for pinecone indexes to be ready",
334
- extra={"kbid": self.kbid, "indexes": created_indexes},
335
- )
336
-
337
- # Clear the rollover indexes and update the stored metadata
338
- self.rollover_indexes.clear()
339
- self.rollover_indexes = dict(result.pinecone_config.indexes)
340
- return result
341
-
342
- async def wait_for_index_ready(self, index_name: str, max_wait_seconds: int = 10) -> None:
343
- """
344
- Wait for an index to be ready.
345
- Params:
346
- - `name`: The name of the index to wait for.
347
- - `max_wait_seconds`: The maximum number of seconds to wait.
348
- """
349
- control_plane = self.pinecone.control_plane(api_key=self.api_key)
350
- for _ in range(max_wait_seconds):
351
- try:
352
- index = await control_plane.describe_index(index_name)
353
- if index.status.ready:
354
- return
355
- except PineconeAPIError:
356
- logger.exception(
357
- "Failed to describe index while waiting for it to become ready.",
358
- extra={"kbid": self.kbid, "index_name": index_name},
359
- )
360
- await asyncio.sleep(1)
361
-
362
- raise TimeoutError(f"Index {index_name} did not become ready after {max_wait_seconds} seconds.")
363
-
364
- async def rollover_cutover_indexes(self) -> None:
365
- assert len(self.rollover_indexes) > 0, "No rollover indexes to cutover to"
366
- control_plane = self.pinecone.control_plane(api_key=self.api_key)
367
- for index in self.indexes.values():
368
- index_name = index.index_name
369
- try:
370
- await control_plane.delete_index(index.index_name)
371
- except Exception:
372
- logger.exception(
373
- "Error deleting pinecone index on cutover",
374
- extra={"kbid": self.kbid, "index_name": index_name},
375
- )
376
- self.indexes.clear()
377
- self.indexes.update(self.rollover_indexes)
378
-
379
- @classmethod
380
- def get_index_name(cls) -> str:
381
- """
382
- Index names can't be longer than 45 characters and can only contain
383
- alphanumeric lowercase characters: https://docs.pinecone.io/troubleshooting/restrictions-on-index-names
384
-
385
- We generate a unique id for each pinecone index created.
386
- `nuclia-` is prepended to easily identify which indexes are created by Nuclia.
387
-
388
- Example:
389
- >>> get_index_name()
390
- 'nuclia-2d899e8a0af54ac9a5addbd483d02ec9'
391
- """
392
- return f"nuclia-{uuid4().hex}"
393
-
394
- async def _delete_resource_to_index(self, index_host: str, resource_uuid: str) -> None:
395
- data_plane = self.get_data_plane(index_host=index_host)
396
- with manager_observer({"operation": "delete_by_resource_prefix"}):
397
- await data_plane.delete_by_id_prefix(
398
- id_prefix=resource_uuid,
399
- max_parallel_batches=self.delete_parallelism,
400
- batch_timeout=self.delete_timeout,
401
- )
402
-
403
- async def _delete_resource(self, resource_uuid: str) -> None:
404
- """
405
- Deletes by resource uuid on all indexes in parallel.
406
- """
407
- delete_tasks = []
408
- for index in self.indexes.values():
409
- index_host = index.index_host
410
- delete_tasks.append(
411
- asyncio.create_task(
412
- self._delete_resource_to_index(
413
- index_host=index_host,
414
- resource_uuid=resource_uuid,
415
- )
416
- )
417
- )
418
- if len(delete_tasks) > 0:
419
- await asyncio.gather(*delete_tasks)
420
-
421
- def get_index_host(self, vectorset_id: str, rollover: bool = False) -> str:
422
- if rollover:
423
- return self.rollover_indexes[vectorset_id].index_host
424
- else:
425
- return self.indexes[vectorset_id].index_host
426
-
427
- def get_prefixes_to_delete(self, index_data: Resource) -> dict[str, set[str]]:
428
- return {
429
- vectorset_id: set(prefixes_list.items)
430
- for vectorset_id, prefixes_list in index_data.vector_prefixes_to_delete.items()
431
- }
432
-
433
async def _index_resource(
    self, resource_uuid: str, index_data: Resource, to_rollover_indexes: bool = False
) -> None:
    """
    Push a NucliaDB resource into Pinecone, one index per vectorset.

    Steps, in order:
    - Delete, on every vectorset that has prefixes to delete, the vectors stored under
      those field prefixes. All vectorsets are processed concurrently.
    - Compute the base metadata shared by the vectors of each paragraph.
    - Compute, per vectorset, the list of vectors to upsert (base metadata extended with
      sentence-specific metadata).
    - Upsert the vectors of every vectorset concurrently.

    When `to_rollover_indexes` is True the index hosts are resolved with `rollover=True`.
    """
    # Deletions are awaited before anything is computed or upserted.
    pending_deletes = [
        asyncio.create_task(
            self._delete_by_prefix_to_index(
                index_host=self.get_index_host(
                    vectorset_id=vectorset, rollover=to_rollover_indexes
                ),
                prefixes_to_delete=prefixes,
            )
        )
        for vectorset, prefixes in self.get_prefixes_to_delete(index_data).items()
    ]
    if pending_deletes:
        await asyncio.gather(*pending_deletes)

    with manager_observer({"operation": "compute_base_vector_metadatas"}):
        base_vector_metadatas: dict[str, VectorMetadata] = await self.compute_base_vector_metadatas(
            index_data, resource_uuid
        )

    with manager_observer({"operation": "compute_vectorset_vectors"}):
        vectorset_vectors: dict[str, list[PineconeVector]] = await self.compute_vectorset_vectors(
            index_data, base_vector_metadatas
        )

    pending_upserts = [
        asyncio.create_task(
            self._upsert_to_index(
                index_host=self.get_index_host(
                    vectorset_id=vectorset_id, rollover=to_rollover_indexes
                ),
                vectors=vectors,
            )
        )
        for vectorset_id, vectors in vectorset_vectors.items()
    ]
    if pending_upserts:
        await asyncio.gather(*pending_upserts)
485
-
486
async def _upsert_to_index(self, index_host: str, vectors: list[PineconeVector]) -> None:
    """
    Upsert `vectors` into the index served at `index_host`.

    Does nothing when there are no vectors. The upsert is delegated to the data plane
    client, using this manager's configured upsert parallelism and timeout.
    """
    if not vectors:  # pragma: no cover
        return
    client = self.get_data_plane(index_host=index_host)
    with manager_observer({"operation": "upsert_in_batches"}):
        await client.upsert_in_batches(
            vectors=vectors,
            max_parallel_batches=self.upsert_parallelism,
            batch_timeout=self.upsert_timeout,
        )
496
-
497
async def _delete_by_prefix_to_index(self, index_host: str, prefixes_to_delete: set[str]) -> None:
    """
    Delete from the index served at `index_host` every vector whose id starts with one of
    `prefixes_to_delete`.

    Does nothing when the set is empty. Prefixes are processed one after another; each
    deletion uses this manager's configured delete parallelism and timeout.
    """
    if not prefixes_to_delete:  # pragma: no cover
        return
    client = self.get_data_plane(index_host=index_host)
    with manager_observer({"operation": "delete_by_prefix"}):
        for id_prefix in prefixes_to_delete:
            await client.delete_by_id_prefix(
                id_prefix=id_prefix,
                max_parallel_batches=self.delete_parallelism,
                batch_timeout=self.delete_timeout,
            )
508
-
509
async def compute_base_vector_metadatas(
    self, index_data: Resource, resource_uuid: str
) -> dict[str, VectorMetadata]:
    """
    Async wrapper that runs `_compute_base_vector_metadatas` in a worker thread.
    """
    # The computation is CPU bound and, for resources with many vectors, can take around a
    # second. A ProcessPoolExecutor would be the ideal way to parallelize it, but the Resource
    # protobuf is not pickleable, so a thread is used instead. This will matter less once
    # pinecone indexing is moved to its own consumer.
    worker = self._compute_base_vector_metadatas
    return await asyncio.to_thread(worker, index_data, resource_uuid)
518
-
519
def _compute_base_vector_metadatas(
    self, index_data: Resource, resource_uuid: str
) -> dict[str, VectorMetadata]:
    """
    Build the base metadata of every paragraph of the resource, keyed by paragraph id.

    This metadata is shared by all the vectors of a paragraph, whatever their vectorset.
    Fields that have texts but no paragraphs are logged and skipped.
    """
    # A resource without security information is public.
    has_security = index_data.HasField("security")
    security_public = not has_security
    security_ids_with_access = (
        list(set(index_data.security.access_groups)) if has_security else None
    )

    resource_labels = set(index_data.labels)
    date_created = index_data.metadata.created.ToSeconds()
    date_modified = index_data.metadata.modified.ToSeconds()

    metadatas: dict[str, VectorMetadata] = {}
    # Walk fields and their paragraphs to collect the labels later used for filtering.
    for field_id, text_info in index_data.texts.items():
        field_labels = set(text_info.labels)
        field_paragraphs = index_data.paragraphs.get(field_id)
        if field_paragraphs is None:
            logger.info(
                "Paragraphs not found for field",
                extra={"kbid": self.kbid, "rid": resource_uuid, "field_id": field_id},
            )
            continue

        # Resource labels and field labels are both stored as field labels.
        inherited_labels = resource_labels | field_labels

        paragraph: IndexParagraph
        for paragraph_id, paragraph in field_paragraphs.paragraphs.items():
            fid = ParagraphId.from_string(paragraph_id).field_id
            vector_metadata = VectorMetadata(
                rid=resource_uuid,
                field_type=fid.type,
                field_id=fid.key,
                date_created=date_created,
                date_modified=date_modified,
                security_public=security_public,
                security_ids_with_access=security_ids_with_access,
            )
            if inherited_labels:
                vector_metadata.field_labels = unique(discard_labels(list(inherited_labels)))
            paragraph_labels = paragraph.labels
            if paragraph_labels:
                vector_metadata.paragraph_labels = unique(
                    discard_labels(list(paragraph_labels))
                )
            metadatas[paragraph_id] = vector_metadata
    return metadatas
571
-
572
async def compute_vectorset_vectors(
    self, index_data: Resource, base_vector_metadatas: dict[str, VectorMetadata]
) -> dict[str, list[PineconeVector]]:
    """
    Async wrapper that runs `_compute_vectorset_vectors` in a worker thread.
    """
    # The computation is CPU bound and, for resources with many vectors, can take around a
    # second. A ProcessPoolExecutor would be the ideal way to parallelize it, but the Resource
    # protobuf is not pickleable, so a thread is used instead. This will matter less once
    # pinecone indexing is moved to its own consumer.
    worker = self._compute_vectorset_vectors
    return await asyncio.to_thread(worker, index_data, base_vector_metadatas)
583
-
584
def _compute_vectorset_vectors(
    self, index_data: Resource, base_vector_metadatas: dict[str, VectorMetadata]
) -> dict[str, list[PineconeVector]]:
    """
    Build the Pinecone vectors to upsert, grouped by vectorset id.

    Each vector takes a copy of its paragraph's base metadata, extended with the metadata
    of its own sentence. Sentences whose paragraph has no base metadata, and vectors
    rejected because their metadata is too large, are logged and skipped.
    """
    vectorset_vectors: dict[str, list[PineconeVector]] = {}
    for index_paragraph_id, index_paragraph in iter_paragraphs(index_data):
        # Gather the sentences of every vectorset present in this paragraph. The
        # paragraph-level sentences are attributed to the default vectorset; an explicit
        # entry for the same vectorset id, if any, replaces them.
        sentences_by_vectorset = {}
        if index_paragraph.sentences and self.default_vectorset:
            sentences_by_vectorset[self.default_vectorset] = index_paragraph.sentences.items()
        for vectorset_id, vectorset_sentences in index_paragraph.vectorsets_sentences.items():
            if vectorset_sentences.sentences:
                sentences_by_vectorset[vectorset_id] = vectorset_sentences.sentences.items()

        vector_sentence: VectorSentence
        for vectorset_id, sentences in sentences_by_vectorset.items():
            for sentence_id, vector_sentence in sentences:
                base_metadata = base_vector_metadatas.get(index_paragraph_id)
                if base_metadata is None:
                    logger.warning(
                        f"Metadata not found for sentences of paragraph {index_paragraph_id}"
                    )
                    continue
                # Work on a copy: the metadata may end up different for each vectorset.
                vector_metadata = deepcopy(base_metadata)
                sentence_metadata = vector_sentence.metadata
                representation = sentence_metadata.representation
                position = sentence_metadata.position

                # AI-tables metadata
                if sentence_metadata.page_with_visual:
                    vector_metadata.page_with_visual = True
                if representation.is_a_table:
                    vector_metadata.is_a_table = True
                if representation.file:
                    vector_metadata.representation_file = representation.file

                # Video positions
                if position.start_seconds:
                    vector_metadata.position_start_seconds = [
                        str(second) for second in position.start_seconds
                    ]
                if position.end_seconds:
                    vector_metadata.position_end_seconds = [
                        str(second) for second in position.end_seconds
                    ]
                vector_metadata.page_number = position.page_number

                try:
                    pc_vector = PineconeVector(
                        id=sentence_id,
                        values=list(vector_sentence.vector),
                        metadata=vector_metadata.model_dump(exclude_none=True),
                    )
                except MetadataTooLargeError as exc:  # pragma: no cover
                    logger.error(f"Invalid Pinecone vector. Metadata is too large. Skipping: {exc}")
                    continue

                vectorset_vectors.setdefault(vectorset_id, []).append(pc_vector)
    return vectorset_vectors
643
-
644
async def _query(self, request: SearchRequest) -> PineconeQueryResults:
    """
    Run a vector query against the index of the requested vectorset.

    The vectorset is `request.vectorset`, falling back to the default vectorset and then to
    "__default__". A request without a query vector yields an empty result set without
    contacting Pinecone.
    """
    if not request.vector:
        return PineconeQueryResults(results=QueryResponse(matches=[]))

    vectorset_id = request.vectorset or self.default_vectorset or "__default__"
    host = self.get_index_host(vectorset_id=vectorset_id)
    client = self.get_data_plane(index_host=host)
    pinecone_filter = convert_to_pinecone_filter(request)
    response = await client.query(
        vector=list(request.vector),
        top_k=request.result_per_page,
        include_values=False,
        include_metadata=True,
        filter=pinecone_filter,
        timeout=self.query_timeout,
    )
    # filter by min score manually, as Pinecone don't implement this feature
    min_score = request.min_score_semantic
    kept_matches = [match for match in response.matches if match.score >= min_score]
    return PineconeQueryResults(results=QueryResponse(matches=kept_matches))
667
-
668
async def _get_index_counts(self) -> IndexCounts:
    """
    Return the aggregated counters of all the indexes of this knowledge box.

    Results are cached in COUNTERS_CACHE per kbid. On a cache miss, the stats of every
    vectorset index are fetched concurrently and their paragraph and sentence counts are
    summed; `fields` and `size_bytes` are always reported as 0.
    """
    # Single lookup instead of a membership test followed by a subscript: if the cache
    # evicts or expires entries, the key could vanish between the two operations.
    try:
        # Cache hit
        return COUNTERS_CACHE[self.kbid]
    except KeyError:
        pass

    total = IndexCounts(fields=0, paragraphs=0, sentences=0, size_bytes=0)
    vectorset_results: dict[str, IndexCounts] = {}
    tasks = [
        asyncio.create_task(self._get_vectorset_index_counts(vectorset_id, vectorset_results))
        for vectorset_id in self.indexes
    ]
    if tasks:
        await asyncio.gather(*tasks)

    # NOTE(review): vectorsets whose stats could not be fetched are missing from
    # `vectorset_results`, so a partial total may be cached here -- confirm this is intended.
    for counts in vectorset_results.values():
        total.paragraphs += counts.paragraphs
        total.sentences += counts.sentences
    COUNTERS_CACHE[self.kbid] = total
    return total
688
-
689
async def _get_vectorset_index_counts(
    self, vectorset_id: str, results: dict[str, IndexCounts]
) -> None:
    """
    Fetch the stats of the index of `vectorset_id` and store its counters in `results`.

    The total vector count is reported both as paragraphs and as sentences. Any error is
    logged and swallowed, in which case `results` is left without an entry for the vectorset.
    """
    host = self.get_index_host(vectorset_id=vectorset_id)
    client = self.get_data_plane(index_host=host)
    try:
        stats = await client.stats()
        vector_count = stats.totalVectorCount
        results[vectorset_id] = IndexCounts(
            fields=0,
            paragraphs=vector_count,
            sentences=vector_count,
            size_bytes=0,
        )
    except Exception:
        # Best effort: a failing index must not break the aggregated counters.
        logger.exception(
            "Error getting index stats",
            extra={"kbid": self.kbid, "provider": self.type.value, "index_host": host},
        )
707
-
708
-
709
def discard_labels(labels: list[str]) -> list[str]:
    """
    Return `labels`, in order, without those starting with any of DISCARDED_LABEL_PREFIXES.
    """
    # str.startswith accepts a tuple of candidate prefixes.
    discarded_prefixes = tuple(DISCARDED_LABEL_PREFIXES)
    kept = []
    for label in labels:
        if label.startswith(discarded_prefixes):
            continue
        kept.append(label)
    return kept
715
-
716
-
717
def unique(labels: list[str]) -> list[str]:
    """
    Return the distinct values of `labels`. The order of the result is not defined.
    """
    return [*set(labels)]
719
-
720
-
721
def convert_to_pinecone_filter(request: SearchRequest) -> Optional[dict[str, Any]]:
    """
    Returns a Pinecone filter from a SearchRequest so that RAG features supported by Nuclia
    can be used on Pinecone indexes.

    The filter is the conjunction of, when present in the request:
    - the field filter, applied to the `field_labels` metadata,
    - the paragraph filter, applied to the `paragraph_labels` metadata,
    - the security filter: the vector is public or one of the requested access groups has
      access to it.

    Returns None when the request carries none of them, and the bare term (without an
    enclosing AND) when it carries exactly one.
    """
    and_terms = []
    if request.HasField("field_filter"):
        and_terms.append(convert_filter_expression("field_labels", request.field_filter))
    if request.HasField("paragraph_filter"):
        # The paragraph filter must be built from its own expression, not the field filter.
        and_terms.append(convert_filter_expression("paragraph_labels", request.paragraph_filter))

    if len(request.security.access_groups):
        # Security filtering
        security_term = {
            LogicalOperator.OR: [
                {"security_public": {"$eq": True}},
                {
                    "security_ids_with_access": {
                        FilterOperator.IN: list(set(request.security.access_groups))
                    }
                },
            ]
        }
        and_terms.append(security_term)

    if not and_terms:
        return None
    if len(and_terms) == 1:
        return and_terms[0]
    return {LogicalOperator.AND: and_terms}
751
-
752
-
753
def convert_filter_expression(
    field: str, expression: FilterExpression, negative: bool = False
) -> dict[str, Any]:
    """
    Translate an internal label filter expression into Pinecone's metadata query language.

    `field` is the metadata key that facet filters are applied to. `negative` tells whether
    the expression is being evaluated under a negation.

    Note: Pinecone has no operator to negate an expression, so negations are pushed down to
    the leaves with De Morgan's laws, where the opposite comparison operator is used.

    Raises ValueError for keyword filters, for date filters with neither bound set and for
    unknown kinds of expression.
    """
    kind = expression.WhichOneof("expr")

    if kind == "bool_and":
        # not (a and b) == (not a) or (not b)
        combinator = LogicalOperator.OR if negative else LogicalOperator.AND
        return {
            combinator: [
                convert_filter_expression(field, operand, negative=bool(negative))
                for operand in expression.bool_and.operands
            ]
        }

    if kind == "bool_or":
        # not (a or b) == (not a) and (not b)
        combinator = LogicalOperator.AND if negative else LogicalOperator.OR
        return {
            combinator: [
                convert_filter_expression(field, operand, negative=bool(negative))
                for operand in expression.bool_or.operands
            ]
        }

    if kind == "bool_not":
        return convert_filter_expression(field, expression.bool_not, negative=not negative)

    if kind == "resource":
        operator = FilterOperator.NOT_EQUALS if negative else FilterOperator.EQUALS
        return {"rid": {operator: expression.resource.resource_id}}

    if kind == "field":
        # NOTE(review): this compares the `field_id` metadata against "<type>[/<id>]", while
        # the indexing code in this file stores field type and field key as separate
        # metadata values -- confirm both sides agree.
        field_id = expression.field.field_type
        if expression.field.HasField("field_id"):
            field_id += f"/{expression.field.field_id}"
        operator = FilterOperator.NOT_EQUALS if negative else FilterOperator.EQUALS
        return {"field_id": {operator: field_id}}

    if kind == "keyword":
        raise ValueError("Cannot filter by keywords")

    if kind == "date":
        date_filter = expression.date
        if date_filter.field == FilterExpression.DateRangeFilter.DateField.CREATED:
            date_field = "date_created"
        else:
            date_field = "date_modified"

        if negative:
            # Outside of the range: before `since` or after `until`.
            since_operator = FilterOperator.LESS_THAN
            until_operator = FilterOperator.GREATER_THAN
            combinator = LogicalOperator.OR
        else:
            # Inside of the range: from `since` and up to `until`.
            since_operator = FilterOperator.GREATER_THAN_OR_EQUAL
            until_operator = FilterOperator.LESS_THAN_OR_EQUAL
            combinator = LogicalOperator.AND

        terms = []
        if date_filter.HasField("since"):
            terms.append({date_field: {since_operator: date_filter.since.ToSeconds()}})
        if date_filter.HasField("until"):
            terms.append({date_field: {until_operator: date_filter.until.ToSeconds()}})

        if not terms:
            raise ValueError(f"Invalid filter expression: {expression}")
        if len(terms) == 1:
            return terms[0]
        return {combinator: terms}

    if kind == "facet":
        operator = FilterOperator.NOT_IN if negative else FilterOperator.IN
        return {field: {operator: [expression.facet.facet]}}

    raise ValueError(f"Invalid filter expression: {expression}")
855
-
856
-
857
def iter_paragraphs(resource: Resource) -> Iterator[tuple[str, IndexParagraph]]:
    """
    Yield a (paragraph id, paragraph) pair for every paragraph of every field of `resource`.
    """
    for field_paragraphs in resource.paragraphs.values():
        yield from field_paragraphs.paragraphs.items()
861
-
862
-
863
def to_pinecone_index_metric(similarity: utils_pb2.VectorSimilarity.ValueType) -> str:
    """
    Map a NucliaDB vector similarity to the name of the equivalent Pinecone index metric.

    Raises KeyError for a similarity that has no entry in the mapping.
    """
    metric_by_similarity = {
        utils_pb2.VectorSimilarity.COSINE: "cosine",
        utils_pb2.VectorSimilarity.DOT: "dotproduct",
    }
    return metric_by_similarity[similarity]
868
-
869
-
870
def to_pinecone_serverless_cloud_payload(
    serverless: kb_pb2.PineconeServerlessCloud.ValueType,
) -> dict[str, str]:
    """
    Map a PineconeServerlessCloud value to a payload holding its "cloud" and "region".

    Raises KeyError for a value that has no entry in the mapping.
    """
    clouds = kb_pb2.PineconeServerlessCloud
    cloud_and_region = {
        clouds.AWS_EU_WEST_1: ("aws", "eu-west-1"),
        clouds.AWS_US_EAST_1: ("aws", "us-east-1"),
        clouds.AWS_US_WEST_2: ("aws", "us-west-2"),
        clouds.AZURE_EASTUS2: ("azure", "eastus2"),
        clouds.GCP_US_CENTRAL1: ("gcp", "us-central1"),
    }
    cloud, region = cloud_and_region[serverless]
    return {"cloud": cloud, "region": region}