nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0023_backfill_pg_catalog.py +8 -4
- migrations/0028_extracted_vectors_reference.py +1 -1
- migrations/0029_backfill_field_status.py +3 -4
- migrations/0032_remove_old_relations.py +2 -3
- migrations/0038_backfill_catalog_field_labels.py +8 -4
- migrations/0039_backfill_converation_splits_metadata.py +106 -0
- migrations/0040_migrate_search_configurations.py +79 -0
- migrations/0041_reindex_conversations.py +137 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
- migrations/pg/0012_catalog_statistics_undo.py +26 -0
- nucliadb/backups/create.py +2 -15
- nucliadb/backups/restore.py +4 -15
- nucliadb/backups/tasks.py +4 -1
- nucliadb/common/back_pressure/cache.py +2 -3
- nucliadb/common/back_pressure/materializer.py +7 -13
- nucliadb/common/back_pressure/settings.py +6 -6
- nucliadb/common/back_pressure/utils.py +1 -0
- nucliadb/common/cache.py +9 -9
- nucliadb/common/catalog/__init__.py +79 -0
- nucliadb/common/catalog/dummy.py +36 -0
- nucliadb/common/catalog/interface.py +85 -0
- nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
- nucliadb/common/catalog/utils.py +56 -0
- nucliadb/common/cluster/manager.py +8 -23
- nucliadb/common/cluster/rebalance.py +484 -112
- nucliadb/common/cluster/rollover.py +36 -9
- nucliadb/common/cluster/settings.py +4 -9
- nucliadb/common/cluster/utils.py +34 -8
- nucliadb/common/context/__init__.py +7 -8
- nucliadb/common/context/fastapi.py +1 -2
- nucliadb/common/datamanagers/__init__.py +2 -4
- nucliadb/common/datamanagers/atomic.py +9 -2
- nucliadb/common/datamanagers/cluster.py +1 -2
- nucliadb/common/datamanagers/fields.py +3 -4
- nucliadb/common/datamanagers/kb.py +6 -6
- nucliadb/common/datamanagers/labels.py +2 -3
- nucliadb/common/datamanagers/resources.py +10 -33
- nucliadb/common/datamanagers/rollover.py +5 -7
- nucliadb/common/datamanagers/search_configurations.py +1 -2
- nucliadb/common/datamanagers/synonyms.py +1 -2
- nucliadb/common/datamanagers/utils.py +4 -4
- nucliadb/common/datamanagers/vectorsets.py +4 -4
- nucliadb/common/external_index_providers/base.py +32 -5
- nucliadb/common/external_index_providers/manager.py +5 -34
- nucliadb/common/external_index_providers/settings.py +1 -27
- nucliadb/common/filter_expression.py +129 -41
- nucliadb/common/http_clients/exceptions.py +8 -0
- nucliadb/common/http_clients/processing.py +16 -23
- nucliadb/common/http_clients/utils.py +3 -0
- nucliadb/common/ids.py +82 -58
- nucliadb/common/locking.py +1 -2
- nucliadb/common/maindb/driver.py +9 -8
- nucliadb/common/maindb/local.py +5 -5
- nucliadb/common/maindb/pg.py +9 -8
- nucliadb/common/nidx.py +22 -5
- nucliadb/common/vector_index_config.py +1 -1
- nucliadb/export_import/datamanager.py +4 -3
- nucliadb/export_import/exporter.py +11 -19
- nucliadb/export_import/importer.py +13 -6
- nucliadb/export_import/tasks.py +2 -0
- nucliadb/export_import/utils.py +6 -18
- nucliadb/health.py +2 -2
- nucliadb/ingest/app.py +8 -8
- nucliadb/ingest/consumer/consumer.py +8 -10
- nucliadb/ingest/consumer/pull.py +10 -8
- nucliadb/ingest/consumer/service.py +5 -30
- nucliadb/ingest/consumer/shard_creator.py +16 -5
- nucliadb/ingest/consumer/utils.py +1 -1
- nucliadb/ingest/fields/base.py +37 -49
- nucliadb/ingest/fields/conversation.py +55 -9
- nucliadb/ingest/fields/exceptions.py +1 -2
- nucliadb/ingest/fields/file.py +22 -8
- nucliadb/ingest/fields/link.py +7 -7
- nucliadb/ingest/fields/text.py +2 -3
- nucliadb/ingest/orm/brain_v2.py +89 -57
- nucliadb/ingest/orm/broker_message.py +2 -4
- nucliadb/ingest/orm/entities.py +10 -209
- nucliadb/ingest/orm/index_message.py +128 -113
- nucliadb/ingest/orm/knowledgebox.py +91 -59
- nucliadb/ingest/orm/processor/auditing.py +1 -3
- nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
- nucliadb/ingest/orm/processor/processor.py +98 -153
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
- nucliadb/ingest/orm/resource.py +82 -71
- nucliadb/ingest/orm/utils.py +1 -1
- nucliadb/ingest/partitions.py +12 -1
- nucliadb/ingest/processing.py +17 -17
- nucliadb/ingest/serialize.py +202 -145
- nucliadb/ingest/service/writer.py +15 -114
- nucliadb/ingest/settings.py +36 -15
- nucliadb/ingest/utils.py +1 -2
- nucliadb/learning_proxy.py +23 -26
- nucliadb/metrics_exporter.py +20 -6
- nucliadb/middleware/__init__.py +82 -1
- nucliadb/migrator/datamanager.py +4 -11
- nucliadb/migrator/migrator.py +1 -2
- nucliadb/migrator/models.py +1 -2
- nucliadb/migrator/settings.py +1 -2
- nucliadb/models/internal/augment.py +614 -0
- nucliadb/models/internal/processing.py +19 -19
- nucliadb/openapi.py +2 -2
- nucliadb/purge/__init__.py +3 -8
- nucliadb/purge/orphan_shards.py +1 -2
- nucliadb/reader/__init__.py +5 -0
- nucliadb/reader/api/models.py +6 -13
- nucliadb/reader/api/v1/download.py +59 -38
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/knowledgebox.py +37 -9
- nucliadb/reader/api/v1/learning_config.py +33 -14
- nucliadb/reader/api/v1/resource.py +61 -9
- nucliadb/reader/api/v1/services.py +18 -14
- nucliadb/reader/app.py +3 -1
- nucliadb/reader/reader/notifications.py +1 -2
- nucliadb/search/api/v1/__init__.py +3 -0
- nucliadb/search/api/v1/ask.py +3 -4
- nucliadb/search/api/v1/augment.py +585 -0
- nucliadb/search/api/v1/catalog.py +15 -19
- nucliadb/search/api/v1/find.py +16 -22
- nucliadb/search/api/v1/hydrate.py +328 -0
- nucliadb/search/api/v1/knowledgebox.py +1 -2
- nucliadb/search/api/v1/predict_proxy.py +1 -2
- nucliadb/search/api/v1/resource/ask.py +28 -8
- nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
- nucliadb/search/api/v1/resource/search.py +9 -11
- nucliadb/search/api/v1/retrieve.py +130 -0
- nucliadb/search/api/v1/search.py +28 -32
- nucliadb/search/api/v1/suggest.py +11 -14
- nucliadb/search/api/v1/summarize.py +1 -2
- nucliadb/search/api/v1/utils.py +2 -2
- nucliadb/search/app.py +3 -2
- nucliadb/search/augmentor/__init__.py +21 -0
- nucliadb/search/augmentor/augmentor.py +232 -0
- nucliadb/search/augmentor/fields.py +704 -0
- nucliadb/search/augmentor/metrics.py +24 -0
- nucliadb/search/augmentor/paragraphs.py +334 -0
- nucliadb/search/augmentor/resources.py +238 -0
- nucliadb/search/augmentor/utils.py +33 -0
- nucliadb/search/lifecycle.py +3 -1
- nucliadb/search/predict.py +33 -19
- nucliadb/search/predict_models.py +8 -9
- nucliadb/search/requesters/utils.py +11 -10
- nucliadb/search/search/cache.py +19 -42
- nucliadb/search/search/chat/ask.py +131 -59
- nucliadb/search/search/chat/exceptions.py +3 -5
- nucliadb/search/search/chat/fetcher.py +201 -0
- nucliadb/search/search/chat/images.py +6 -4
- nucliadb/search/search/chat/old_prompt.py +1375 -0
- nucliadb/search/search/chat/parser.py +510 -0
- nucliadb/search/search/chat/prompt.py +563 -615
- nucliadb/search/search/chat/query.py +453 -32
- nucliadb/search/search/chat/rpc.py +85 -0
- nucliadb/search/search/fetch.py +3 -4
- nucliadb/search/search/filters.py +8 -11
- nucliadb/search/search/find.py +33 -31
- nucliadb/search/search/find_merge.py +124 -331
- nucliadb/search/search/graph_strategy.py +14 -12
- nucliadb/search/search/hydrator/__init__.py +49 -0
- nucliadb/search/search/hydrator/fields.py +217 -0
- nucliadb/search/search/hydrator/images.py +130 -0
- nucliadb/search/search/hydrator/paragraphs.py +323 -0
- nucliadb/search/search/hydrator/resources.py +60 -0
- nucliadb/search/search/ingestion_agents.py +5 -5
- nucliadb/search/search/merge.py +90 -94
- nucliadb/search/search/metrics.py +24 -7
- nucliadb/search/search/paragraphs.py +7 -9
- nucliadb/search/search/predict_proxy.py +44 -18
- nucliadb/search/search/query.py +14 -86
- nucliadb/search/search/query_parser/fetcher.py +51 -82
- nucliadb/search/search/query_parser/models.py +19 -48
- nucliadb/search/search/query_parser/old_filters.py +20 -19
- nucliadb/search/search/query_parser/parsers/ask.py +5 -6
- nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
- nucliadb/search/search/query_parser/parsers/common.py +21 -13
- nucliadb/search/search/query_parser/parsers/find.py +6 -29
- nucliadb/search/search/query_parser/parsers/graph.py +18 -28
- nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
- nucliadb/search/search/query_parser/parsers/search.py +15 -56
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
- nucliadb/search/search/rank_fusion.py +18 -13
- nucliadb/search/search/rerankers.py +6 -7
- nucliadb/search/search/retrieval.py +300 -0
- nucliadb/search/search/summarize.py +5 -6
- nucliadb/search/search/utils.py +3 -4
- nucliadb/search/settings.py +1 -2
- nucliadb/standalone/api_router.py +1 -1
- nucliadb/standalone/app.py +4 -3
- nucliadb/standalone/auth.py +5 -6
- nucliadb/standalone/lifecycle.py +2 -2
- nucliadb/standalone/run.py +5 -4
- nucliadb/standalone/settings.py +5 -6
- nucliadb/standalone/versions.py +3 -4
- nucliadb/tasks/consumer.py +13 -8
- nucliadb/tasks/models.py +2 -1
- nucliadb/tasks/producer.py +3 -3
- nucliadb/tasks/retries.py +8 -7
- nucliadb/train/api/utils.py +1 -3
- nucliadb/train/api/v1/shards.py +1 -2
- nucliadb/train/api/v1/trainset.py +1 -2
- nucliadb/train/app.py +1 -1
- nucliadb/train/generator.py +4 -4
- nucliadb/train/generators/field_classifier.py +2 -2
- nucliadb/train/generators/field_streaming.py +6 -6
- nucliadb/train/generators/image_classifier.py +2 -2
- nucliadb/train/generators/paragraph_classifier.py +2 -2
- nucliadb/train/generators/paragraph_streaming.py +2 -2
- nucliadb/train/generators/question_answer_streaming.py +2 -2
- nucliadb/train/generators/sentence_classifier.py +4 -10
- nucliadb/train/generators/token_classifier.py +3 -2
- nucliadb/train/generators/utils.py +6 -5
- nucliadb/train/nodes.py +3 -3
- nucliadb/train/resource.py +6 -8
- nucliadb/train/settings.py +3 -4
- nucliadb/train/types.py +11 -11
- nucliadb/train/upload.py +3 -2
- nucliadb/train/uploader.py +1 -2
- nucliadb/train/utils.py +1 -2
- nucliadb/writer/api/v1/export_import.py +4 -1
- nucliadb/writer/api/v1/field.py +15 -14
- nucliadb/writer/api/v1/knowledgebox.py +18 -56
- nucliadb/writer/api/v1/learning_config.py +5 -4
- nucliadb/writer/api/v1/resource.py +9 -20
- nucliadb/writer/api/v1/services.py +10 -132
- nucliadb/writer/api/v1/upload.py +73 -72
- nucliadb/writer/app.py +8 -2
- nucliadb/writer/resource/basic.py +12 -15
- nucliadb/writer/resource/field.py +43 -5
- nucliadb/writer/resource/origin.py +7 -0
- nucliadb/writer/settings.py +2 -3
- nucliadb/writer/tus/__init__.py +2 -3
- nucliadb/writer/tus/azure.py +5 -7
- nucliadb/writer/tus/dm.py +3 -3
- nucliadb/writer/tus/exceptions.py +3 -4
- nucliadb/writer/tus/gcs.py +15 -22
- nucliadb/writer/tus/s3.py +2 -3
- nucliadb/writer/tus/storage.py +3 -3
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
- nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
- nucliadb/common/datamanagers/entities.py +0 -139
- nucliadb/common/external_index_providers/pinecone.py +0 -894
- nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
- nucliadb/search/search/hydrator.py +0 -197
- nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
```diff
--- a/nucliadb/common/datamanagers/synonyms.py
+++ b/nucliadb/common/datamanagers/synonyms.py
@@ -18,7 +18,6 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 
-from typing import Optional
 
 from nucliadb.common.datamanagers.utils import get_kv_pb
 from nucliadb.common.maindb.driver import Transaction
@@ -27,7 +26,7 @@ from nucliadb_protos import knowledgebox_pb2
 KB_SYNONYMS = "/kbs/{kbid}/synonyms"
 
 
-async def get(txn: Transaction, *, kbid: str) -> Optional[knowledgebox_pb2.Synonyms]:
+async def get(txn: Transaction, *, kbid: str) -> knowledgebox_pb2.Synonyms | None:
     key = KB_SYNONYMS.format(kbid=kbid)
     return await get_kv_pb(txn, key, knowledgebox_pb2.Synonyms, for_update=False)
 
```
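Most of the hunks in this release follow the same mechanical modernization: `typing.Optional[X]` annotations become the PEP 604 union syntax `X | None` (built in since Python 3.10, no `typing` import needed), and ABC imports such as `AsyncIterator`/`Iterator` move from `typing` to `collections.abc`. A minimal illustration of the equivalence (function names here are ours, not from the diff):

```python
# PEP 604 unions: `X | None` is equivalent to `typing.Optional[X]`
# but needs no import. Usable in annotations at runtime since Python 3.10.
from typing import Optional


def old_style(value: Optional[str]) -> Optional[int]:
    return len(value) if value is not None else None


def new_style(value: str | None) -> int | None:
    return len(value) if value is not None else None


assert old_style("abc") == new_style("abc") == 3
assert old_style(None) is new_style(None) is None
```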
```diff
--- a/nucliadb/common/datamanagers/utils.py
+++ b/nucliadb/common/datamanagers/utils.py
@@ -18,7 +18,7 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 import contextlib
-from typing import Optional, Type, TypeVar
+from typing import TypeVar
 
 from google.protobuf.message import Message
 
@@ -29,9 +29,9 @@ PB_TYPE = TypeVar("PB_TYPE", bound=Message)
 
 
 async def get_kv_pb(
-    txn: Transaction, key: str, pb_type: Type[PB_TYPE], for_update: bool = True
-) -> Optional[PB_TYPE]:
-    serialized: Optional[bytes] = await txn.get(key, for_update=for_update)
+    txn: Transaction, key: str, pb_type: type[PB_TYPE], for_update: bool = True
+) -> PB_TYPE | None:
+    serialized: bytes | None = await txn.get(key, for_update=for_update)
     if serialized is None:
         return None
     pb = pb_type()
```
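The signature change keeps `get_kv_pb` generic: passing `pb_type: type[PB_TYPE]` with a `PB_TYPE | None` return means callers get back the same protobuf class they pass in. A sketch of the call pattern, reusing the `Synonyms` lookup from the hunk above (the `txn` wiring is assumed, not shown in the diff):

```python
# Sketch of how the generic helper is consumed; `txn` is assumed to be an
# open maindb Transaction (see synonyms.py above for the real caller).
from nucliadb.common.datamanagers.utils import get_kv_pb
from nucliadb_protos import knowledgebox_pb2


async def get_synonyms(txn, kbid: str) -> knowledgebox_pb2.Synonyms | None:
    key = f"/kbs/{kbid}/synonyms"
    # pb_type drives both deserialization and the inferred return type:
    # mypy sees this call as returning `Synonyms | None`.
    return await get_kv_pb(txn, key, knowledgebox_pb2.Synonyms, for_update=False)
```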
```diff
--- a/nucliadb/common/datamanagers/vectorsets.py
+++ b/nucliadb/common/datamanagers/vectorsets.py
@@ -17,7 +17,7 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
-from typing import AsyncIterator, Optional
+from collections.abc import AsyncIterator
 
 from nucliadb.common.datamanagers.utils import get_kv_pb
 from nucliadb.common.maindb.driver import Transaction
@@ -37,7 +37,7 @@ async def initialize(txn: Transaction, *, kbid: str):
 
 async def get(
     txn: Transaction, *, kbid: str, vectorset_id: str
-) -> Optional[knowledgebox_pb2.VectorSetConfig]:
+) -> knowledgebox_pb2.VectorSetConfig | None:
     kb_vectorsets = await _get_or_default(txn, kbid=kbid, for_update=False)
     index = _find_vectorset(kb_vectorsets, vectorset_id)
     if index is None:
@@ -80,7 +80,7 @@ async def set(txn: Transaction, *, kbid: str, config: knowledgebox_pb2.VectorSet
 
 async def delete(
     txn: Transaction, *, kbid: str, vectorset_id: str
-) -> Optional[knowledgebox_pb2.VectorSetConfig]:
+) -> knowledgebox_pb2.VectorSetConfig | None:
     kb_vectorsets = await _get_or_default(txn, kbid=kbid, for_update=True)
     index = _find_vectorset(kb_vectorsets, vectorset_id)
     if index is None:
@@ -111,7 +111,7 @@ async def _get_or_default(
 
 def _find_vectorset(
     kb_vectorsets: knowledgebox_pb2.KnowledgeBoxVectorSetsConfig, vectorset_id: str
-) -> Optional[int]:
+) -> int | None:
     """Return the position of the vectorset in `vectorsets` or `None` if not found."""
     for idx, vectorset in enumerate(kb_vectorsets.vectorsets):
         if vectorset.vectorset_id == vectorset_id:
```
```diff
--- a/nucliadb/common/external_index_providers/base.py
+++ b/nucliadb/common/external_index_providers/base.py
@@ -19,8 +19,9 @@
 #
 import abc
 import logging
+from collections.abc import Iterator
 from dataclasses import dataclass
-from typing import Any, Iterator, Optional
+from typing import Any
 
 from nidx_protos.nodereader_pb2 import SearchRequest
 from nidx_protos.noderesources_pb2 import Resource
@@ -30,7 +31,9 @@ from nucliadb.common.counters import IndexCounts
 from nucliadb.common.external_index_providers.exceptions import ExternalIndexingError
 from nucliadb.common.ids import ParagraphId
 from nucliadb_models.external_index_providers import ExternalIndexProviderType
+from nucliadb_models.retrieval import Score
 from nucliadb_models.search import SCORE_TYPE, Relations, TextPosition
+from nucliadb_protos import resources_pb2
 from nucliadb_protos.knowledgebox_pb2 import (
     CreateExternalIndexProviderMetadata,
     StoredExternalIndexProviderMetadata,
@@ -43,6 +46,16 @@ logger = logging.getLogger(__name__)
 manager_observer = Observer("external_index_manager", labels={"operation": "", "provider": ""})
 
 
+# /k/ocr
+_OCR_LABEL = (
+    f"/k/{resources_pb2.Paragraph.TypeParagraph.Name(resources_pb2.Paragraph.TypeParagraph.OCR).lower()}"
+)
+# /k/inception
+_INCEPTION_LABEL = (
+    f"/k/{resources_pb2.Paragraph.TypeParagraph.Name(resources_pb2.Paragraph.TypeParagraph.INCEPTION).lower()}"
+)
+
+
 @dataclass
 class VectorsetExternalIndex:
     """
@@ -57,9 +70,19 @@ class VectorsetExternalIndex:
 
 class ScoredTextBlock(BaseModel):
     paragraph_id: ParagraphId
-    score: float
     score_type: SCORE_TYPE
 
+    scores: list[Score]
+
+    @property
+    def score(self) -> float:
+        return self.current_score.score
+
+    @property
+    def current_score(self) -> Score:
+        assert len(self.scores) > 0, "text block matches must be scored"
+        return self.scores[-1]
+
 
 class TextBlockMatch(ScoredTextBlock):
     """
@@ -72,11 +95,15 @@ class TextBlockMatch(ScoredTextBlock):
     page_with_visual: bool = False
     fuzzy_search: bool
     is_a_table: bool = False
-    representation_file: Optional[str] = None
+    representation_file: str | None = None
     paragraph_labels: list[str] = []
     field_labels: list[str] = []
-    text: Optional[str] = None
-    relevant_relations: Optional[Relations] = None
+    text: str | None = None
+    relevant_relations: Relations | None = None
+
+    @property
+    def is_an_image(self) -> bool:
+        return _OCR_LABEL in self.paragraph_labels or _INCEPTION_LABEL in self.paragraph_labels
 
 
 class QueryResults(BaseModel):
```
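`ScoredTextBlock` moves from a single `score: float` to a `scores: list[Score]` history, with the `score`/`current_score` properties reading the last entry; presumably each retrieval stage (rank fusion, reranking) can append its score without discarding earlier ones. A minimal sketch of that semantics, using a stand-in `Score` model (the real one lives in `nucliadb_models.retrieval`):

```python
from pydantic import BaseModel


class Score(BaseModel):  # hypothetical stand-in for nucliadb_models.retrieval.Score
    score: float
    score_type: str


class ScoredBlock(BaseModel):
    scores: list[Score]

    @property
    def current_score(self) -> Score:
        assert len(self.scores) > 0, "text block matches must be scored"
        return self.scores[-1]  # the last appended score "wins"

    @property
    def score(self) -> float:
        return self.current_score.score


block = ScoredBlock(scores=[Score(score=0.42, score_type="BM25")])
block.scores.append(Score(score=0.91, score_type="RERANKER"))
assert block.score == 0.91  # reranker score shadows the BM25 one
```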
```diff
--- a/nucliadb/common/external_index_providers/manager.py
+++ b/nucliadb/common/external_index_providers/manager.py
@@ -17,62 +17,33 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
-from typing import Optional
 
 import async_lru
 
 from nucliadb.common import datamanagers
 from nucliadb.common.external_index_providers.base import ExternalIndexManager
-from nucliadb.common.external_index_providers.pinecone import PineconeIndexManager
-from nucliadb.common.external_index_providers.settings import settings
 from nucliadb_protos.knowledgebox_pb2 import (
-    ExternalIndexProviderType,
     StoredExternalIndexProviderMetadata,
 )
-from nucliadb_utils.utilities import get_endecryptor
 
 
 async def get_external_index_manager(
     kbid: str, for_rollover: bool = False
-) -> Optional[ExternalIndexManager]:
+) -> ExternalIndexManager | None:
     """
     Returns an ExternalIndexManager for the given kbid.
     If for_rollover is True, the ExternalIndexManager returned will include the rollover indexes (if any).
     """
-    metadata = await get_external_index_metadata(kbid)
-    if metadata is None or metadata.type != ExternalIndexProviderType.PINECONE:
-        # Only Pinecone is supported for now
-        return None
-
-    api_key = get_endecryptor().decrypt(metadata.pinecone_config.encrypted_api_key)
-    default_vectorset = await get_default_vectorset_id(kbid)
-
-    rollover_indexes = None
-    if for_rollover:
-        rollover_metadata = await get_rollover_external_index_metadata(kbid)
-        if rollover_metadata is not None:
-            rollover_indexes = dict(rollover_metadata.pinecone_config.indexes)
-
-    return PineconeIndexManager(
-        kbid=kbid,
-        api_key=api_key,
-        indexes=dict(metadata.pinecone_config.indexes),
-        upsert_parallelism=settings.pinecone_upsert_parallelism,
-        delete_parallelism=settings.pinecone_delete_parallelism,
-        upsert_timeout=settings.pinecone_upsert_timeout,
-        delete_timeout=settings.pinecone_delete_timeout,
-        default_vectorset=default_vectorset,
-        rollover_indexes=rollover_indexes,
-    )
+    return None
 
 
 @async_lru.alru_cache(maxsize=None)
-async def get_external_index_metadata(kbid: str) -> Optional[StoredExternalIndexProviderMetadata]:
+async def get_external_index_metadata(kbid: str) -> StoredExternalIndexProviderMetadata | None:
     return await datamanagers.atomic.kb.get_external_index_provider_metadata(kbid=kbid)
 
 
 @async_lru.alru_cache(maxsize=None)
-async def get_default_vectorset_id(kbid: str) -> Optional[str]:
+async def get_default_vectorset_id(kbid: str) -> str | None:
     """
     While we are transitioning to the new vectorset system, we need to take into account
     that KBs that have only one semantic model will have the `vectorset_id` field on BrokerMessage.field_vectors
@@ -96,6 +67,6 @@ async def get_default_vectorset_id(kbid: str) -> Optional[str]:
 
 async def get_rollover_external_index_metadata(
     kbid: str,
-) -> Optional[StoredExternalIndexProviderMetadata]:
+) -> StoredExternalIndexProviderMetadata | None:
     async with datamanagers.with_ro_transaction() as txn:
         return await datamanagers.rollover.get_kb_rollover_external_index_metadata(txn, kbid=kbid)
```
```diff
--- a/nucliadb/common/external_index_providers/settings.py
+++ b/nucliadb/common/external_index_providers/settings.py
@@ -17,36 +17,10 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
-from pydantic import Field
 from pydantic_settings import BaseSettings
 
 
-class ExternalIndexProvidersSettings(BaseSettings):
-    pinecone_upsert_parallelism: int = Field(
-        default=3,
-        title="Pinecone upsert parallelism",
-        description="Number of parallel upserts to Pinecone on each set resource operation",
-    )
-    pinecone_delete_parallelism: int = Field(
-        default=2,
-        title="Pinecone delete parallelism",
-        description="Number of parallel deletes to Pinecone on each delete resource operation",
-    )
-    pinecone_upsert_timeout: float = Field(
-        default=10.0,
-        title="Pinecone upsert timeout",
-        description="Timeout in seconds for each upsert operation to Pinecone",
-    )
-    pinecone_delete_timeout: float = Field(
-        default=10.0,
-        title="Pinecone delete timeout",
-        description="Timeout in seconds for each delete operation to Pinecone",
-    )
-    pinecone_query_timeout: float = Field(
-        default=10.0,
-        title="Pinecone query timeout",
-        description="Timeout in seconds for each query operation to Pinecone",
-    )
+class ExternalIndexProvidersSettings(BaseSettings): ...
 
 
 settings = ExternalIndexProvidersSettings()
```
```diff
--- a/nucliadb/common/filter_expression.py
+++ b/nucliadb/common/filter_expression.py
@@ -18,13 +18,14 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 
-from typing import Union
 
 from nidx_protos.nodereader_pb2 import FilterExpression as PBFilterExpression
+from typing_extensions import assert_never
 
 from nucliadb.common import datamanagers
 from nucliadb.common.exceptions import InvalidQueryError
 from nucliadb.common.ids import FIELD_TYPE_NAME_TO_STR
+from nucliadb_models.common import Paragraph
 from nucliadb_models.filters import (
     And,
     DateCreated,
@@ -50,44 +51,28 @@ from nucliadb_models.filters import (
     ResourceMimetype,
     Status,
 )
+from nucliadb_models.metadata import ResourceProcessingStatus
 
 # Filters that end up as a facet
-FacetFilter = Union[
-    OriginTag,
-    Label,
-    ResourceMimetype,
-    FieldMimetype,
-    Entity,
-    Language,
-    OriginMetadata,
-    OriginPath,
-    Generated,
-    Kind,
-    OriginCollaborator,
-    OriginSource,
-    Status,
-]
-# In Python 3.9 we cannot do isinstance against an union
-# Once we support only 3.10+, we can remove this
-FacetFilterTypes = (
-    OriginTag,
-    Label,
-    ResourceMimetype,
-    FieldMimetype,
-    Entity,
-    Language,
-    OriginMetadata,
-    OriginPath,
-    Generated,
-    Kind,
-    OriginCollaborator,
-    OriginSource,
-    Status,
+FacetFilter = (
+    OriginTag
+    | Label
+    | ResourceMimetype
+    | FieldMimetype
+    | Entity
+    | Language
+    | OriginMetadata
+    | OriginPath
+    | Generated
+    | Kind
+    | OriginCollaborator
+    | OriginSource
+    | Status
 )
 
 
 async def parse_expression(
-    expr: Union[FieldFilterExpression, ParagraphFilterExpression],
+    expr: FieldFilterExpression | ParagraphFilterExpression,
     kbid: str,
 ) -> PBFilterExpression:
     f = PBFilterExpression()
@@ -110,7 +95,7 @@ async def parse_expression(
             if rid is None:
                 raise InvalidQueryError("slug", f"Cannot find slug {expr.slug}")
             f.resource.resource_id = rid
-        else:  # pragma:
+        else:  # pragma: no cover
             # Cannot happen due to model validation
             raise ValueError("Resource needs id or slug")
     elif isinstance(expr, Field):
@@ -131,12 +116,10 @@
         f.date.since.FromDatetime(expr.since)
         if expr.until:
             f.date.until.FromDatetime(expr.until)
-    elif isinstance(expr, FacetFilterTypes):
+    elif isinstance(expr, FacetFilter):
         f.facet.facet = facet_from_filter(expr)
     else:
-
-        # that is, if we are missing some ifs
-        _a: int = "a"
+        assert_never(expr)
 
     return f
 
@@ -190,13 +173,118 @@ def facet_from_filter(expr: FacetFilter) -> str:
     elif isinstance(expr, Status):
         facet = f"/n/s/{expr.status.value}"
     else:
-
-        # that is, if we are missing some ifs
-        _a: int = "a"
+        assert_never(expr)
 
     return facet
 
 
+def filter_from_facet(facet: str) -> FacetFilter:
+    expr: FacetFilter
+
+    if facet.startswith("/t/"):
+        value = facet.removeprefix("/t/")
+        expr = OriginTag(tag=value)
+
+    elif facet.startswith("/l/"):
+        value = facet.removeprefix("/l/")
+        parts = value.split("/", maxsplit=1)
+        if len(parts) == 1:
+            type = parts[0]
+            expr = Label(labelset=type)
+        else:
+            type, subtype = parts
+            expr = Label(labelset=type, label=subtype)
+
+    elif facet.startswith("/n/i/"):
+        value = facet.removeprefix("/n/i/")
+        parts = value.split("/", maxsplit=1)
+        if len(parts) == 1:
+            type = parts[0]
+            expr = ResourceMimetype(type=type)
+        else:
+            type, subtype = parts
+            expr = ResourceMimetype(type=type, subtype=subtype)
+
+    elif facet.startswith("/mt/"):
+        value = facet.removeprefix("/mt/")
+        parts = value.split("/", maxsplit=1)
+        if len(parts) == 1:
+            type = parts[0]
+            expr = FieldMimetype(type=type)
+        else:
+            type, subtype = parts
+            expr = FieldMimetype(type=type, subtype=subtype)
+
+    elif facet.startswith("/e/"):
+        value = facet.removeprefix("/e/")
+        parts = value.split("/", maxsplit=1)
+        if len(parts) == 1:
+            subtype = parts[0]
+            expr = Entity(subtype=subtype)
+        else:
+            subtype, value = parts
+            expr = Entity(subtype=subtype, value=value)
+
+    elif facet.startswith("/s/p"):
+        value = facet.removeprefix("/s/p/")
+        expr = Language(language=value, only_primary=True)
+
+    elif facet.startswith("/s/s"):
+        value = facet.removeprefix("/s/s/")
+        expr = Language(language=value, only_primary=False)
+
+    elif facet.startswith("/m/"):
+        value = facet.removeprefix("/m/")
+        parts = value.split("/", maxsplit=1)
+        if len(parts) == 1:
+            field = parts[0]
+            expr = OriginMetadata(field=field)
+        else:
+            field, value = parts
+            expr = OriginMetadata(field=field, value=value)
+
+    elif facet.startswith("/p/"):
+        value = facet.removeprefix("/p/")
+        expr = OriginPath(prefix=value)
+
+    elif facet.startswith("/g/da"):
+        value = facet.removeprefix("/g/da")
+        expr = Generated(by="data-augmentation")
+        if value.removeprefix("/"):
+            expr.da_task = value.removeprefix("/")
+
+    elif facet.startswith("/k/"):
+        value = facet.removeprefix("/k/")
+        try:
+            kind = Paragraph.TypeParagraph(value.upper())
+        except ValueError:
+            raise InvalidQueryError("filters", f"invalid paragraph kind: {value}")
+        expr = Kind(kind=kind)
+
+    elif facet.startswith("/u/o/"):
+        value = facet.removeprefix("/u/o/")
+        expr = OriginCollaborator(collaborator=value)
+
+    elif facet.startswith("/u/s"):
+        value = facet.removeprefix("/u/s")
+        expr = OriginSource()
+        if value.removeprefix("/"):
+            expr.id = value.removeprefix("/")
+
+    elif facet.startswith("/n/s/"):
+        value = facet.removeprefix("/n/s/")
+        try:
+            status = ResourceProcessingStatus(value.upper())
+        except ValueError:
+            raise InvalidQueryError("filters", f"invalid resource processing status: {value}")
+        expr = Status(status=status)
+
+    else:
+        raise InvalidQueryError("filters", f"invalid filter: {facet}")
+
+    return expr
+
+
 def add_and_expression(dest: PBFilterExpression, add: PBFilterExpression):
     dest_expr_type = dest.WhichOneof("expr")
     if dest_expr_type is None:
```
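Both `parse_expression` and `facet_from_filter` replace the old `_a: int = "a"` mypy trick with `typing_extensions.assert_never`, the idiomatic way to enforce exhaustive handling of a union: the type checker reports an error at the call site if any union member is unhandled, and at runtime an unexpected value raises. A small self-contained sketch of the pattern:

```python
from typing_extensions import assert_never


def describe(value: int | str) -> str:
    if isinstance(value, int):
        return "int"
    elif isinstance(value, str):
        return "str"
    else:
        # mypy narrows `value` to Never here; if a new member is later added
        # to the union and not handled above, type checking fails on this line.
        assert_never(value)
```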
```diff
--- a/nucliadb/common/http_clients/exceptions.py
+++ b/nucliadb/common/http_clients/exceptions.py
@@ -21,6 +21,10 @@ class ClientException(Exception):
     pass
 
 
+class ServerException(Exception):
+    pass
+
+
 class NotFoundException(ClientException):
     pass
 
@@ -35,3 +39,7 @@ class RateLimitException(ClientException):
 
 class AccountLimitException(ClientException):
     pass
+
+
+class ServiceUnavailableException(ServerException):
+    pass
```
```diff
--- a/nucliadb/common/http_clients/processing.py
+++ b/nucliadb/common/http_clients/processing.py
@@ -19,10 +19,8 @@
 #
 import logging
 from datetime import datetime
-from typing import Optional
 
 import aiohttp
-import jwt
 import pydantic
 
 from nucliadb_utils.helpers import MessageProgressUpdater
@@ -33,15 +31,6 @@ from .utils import check_status
 logger = logging.getLogger(__name__)
 
 
-def get_nua_api_id() -> str:
-    assert nuclia_settings.nuclia_service_account is not None
-    claimset = jwt.decode(
-        nuclia_settings.nuclia_service_account,
-        options={"verify_signature": False},
-    )
-    return claimset.get("sub")
-
-
 def get_processing_api_url() -> str:
     if nuclia_settings.nuclia_service_account:
         return (
@@ -64,10 +53,10 @@ def get_processing_api_v2_url() -> str:
 
 class PullResponse(pydantic.BaseModel):
     status: str
-    payload: Optional[str] = None
+    payload: str | None = None
     payloads: list[bytes] = []
-    msgid: Optional[str] = None
-    cursor: Optional[int] = None
+    msgid: str | None = None
+    cursor: int | None = None
 
 
 class PullPosition(pydantic.BaseModel):
@@ -86,7 +75,7 @@ class RequestsResult(pydantic.BaseModel):
         description="Resource ID.",
     )
     kbid: str = pydantic.Field(..., title="KnowledgeBox ID")
-    title: Optional[str] = pydantic.Field(
+    title: str | None = pydantic.Field(
         None,
         title="Title",
         description="Title of the resource.",
@@ -111,12 +100,12 @@ class RequestsResult(pydantic.BaseModel):
         title="Timestamp",
         description="Timestamp of when the resource was first scheduled.",
     )
-    completed_at: Optional[datetime] = pydantic.Field(
+    completed_at: datetime | None = pydantic.Field(
         None,
         title="Completed At",
         description="Timestamp of when the resource was completed",
     )
-    scheduled_at: Optional[datetime] = pydantic.Field(
+    scheduled_at: datetime | None = pydantic.Field(
         None,
         title="Scheduled At",
         description="Timestamp of when the resource was first scheduled.",
@@ -149,7 +138,7 @@ class RequestsResults(pydantic.BaseModel):
         title="Results",
         description="List of results.",
     )
-    cursor: Optional[str] = pydantic.Field(
+    cursor: str | None = pydantic.Field(
         None,
         title="Cursor",
         description="Cursor to use for the next page of results.",
@@ -209,6 +198,10 @@ class ProcessingHTTPClient:
     async def close(self):
         await self.session.close()
 
+    async def reset_session(self):
+        await self.close()
+        self.session = aiohttp.ClientSession()
+
     async def in_progress(self, ack_token: str):
         url = self.base_url_v2 + "/pull/in_progress"
         request = InProgressRequest(ack=[ack_token])
@@ -220,7 +213,7 @@
 
     async def pull_v2(
         self, ack_tokens: list[str], limit: int = 1, timeout: float = 5
-    ) -> Optional[PullResponseV2]:
+    ) -> PullResponseV2 | None:
        url = self.base_url_v2 + "/pull"
        request = PullRequestV2(limit=limit, timeout=timeout, ack=ack_tokens)
        async with self.session.post(
@@ -244,9 +237,9 @@
 
     async def requests(
         self,
-        cursor: Optional[str] = None,
-        scheduled: Optional[bool] = None,
-        kbid: Optional[str] = None,
+        cursor: str | None = None,
+        scheduled: bool | None = None,
+        kbid: str | None = None,
         limit: int = 20,
     ) -> RequestsResults:
         url = self.base_url + "/requests"
@@ -263,7 +256,7 @@
         check_status(resp, resp_text)
         return RequestsResults.model_validate_json(resp_text)
 
-    async def stats(self, kbid: str, timeout: Optional[float] = 1.0) -> StatsResponse:
+    async def stats(self, kbid: str, timeout: float | None = 1.0) -> StatsResponse:
         url = self.base_url + "/stats"
         async with self.session.get(
             url,
```
```diff
--- a/nucliadb/common/http_clients/utils.py
+++ b/nucliadb/common/http_clients/utils.py
@@ -33,5 +33,8 @@ def check_status(resp: aiohttp.ClientResponse, resp_text: str) -> None:
         raise exceptions.AuthorizationException(f"Unauthorized to access: {resp.status}")
     elif resp.status == 429:
         raise exceptions.RateLimitException("Rate limited")
+    elif resp.status in (502, 503):
+        # Service unavailable, can be retried
+        raise exceptions.ServiceUnavailableException(f"Service unavailable: {resp.status} - {resp_text}")
     else:
         raise exceptions.ClientException(f"Unknown error: {resp.status} - {resp_text}")
```