nucliadb 6.7.2.post4862__py3-none-any.whl → 6.9.2.post5282__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This version of nucliadb has been flagged as a potentially problematic release.
- migrations/0016_upgrade_to_paragraphs_v2.py +1 -1
- migrations/0017_multiple_writable_shards.py +1 -1
- migrations/0018_purge_orphan_kbslugs.py +1 -1
- migrations/0019_upgrade_to_paragraphs_v3.py +1 -1
- migrations/0021_overwrite_vectorsets_key.py +1 -1
- migrations/0023_backfill_pg_catalog.py +7 -3
- migrations/0025_assign_models_to_kbs_v2.py +3 -3
- migrations/0027_rollover_texts3.py +1 -1
- migrations/0028_extracted_vectors_reference.py +1 -1
- migrations/0029_backfill_field_status.py +1 -1
- migrations/0032_remove_old_relations.py +1 -1
- migrations/0036_backfill_catalog_slug.py +1 -1
- migrations/0037_backfill_catalog_facets.py +1 -1
- migrations/0038_backfill_catalog_field_labels.py +7 -3
- migrations/0039_backfill_converation_splits_metadata.py +106 -0
- migrations/0040_migrate_search_configurations.py +79 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/backups/create.py +3 -3
- nucliadb/backups/restore.py +3 -3
- nucliadb/common/cache.py +1 -1
- nucliadb/common/catalog/__init__.py +79 -0
- nucliadb/common/catalog/dummy.py +36 -0
- nucliadb/common/catalog/interface.py +85 -0
- nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +294 -208
- nucliadb/common/catalog/utils.py +56 -0
- nucliadb/common/cluster/manager.py +3 -19
- nucliadb/common/cluster/rebalance.py +484 -110
- nucliadb/common/cluster/rollover.py +29 -0
- nucliadb/common/cluster/settings.py +1 -1
- nucliadb/common/cluster/utils.py +26 -0
- nucliadb/common/datamanagers/atomic.py +6 -0
- nucliadb/common/datamanagers/utils.py +2 -2
- nucliadb/common/external_index_providers/manager.py +1 -29
- nucliadb/common/external_index_providers/settings.py +1 -27
- nucliadb/common/filter_expression.py +16 -33
- nucliadb/common/http_clients/exceptions.py +8 -0
- nucliadb/common/http_clients/processing.py +4 -0
- nucliadb/common/http_clients/utils.py +3 -0
- nucliadb/common/ids.py +77 -55
- nucliadb/common/locking.py +4 -4
- nucliadb/common/maindb/driver.py +11 -1
- nucliadb/common/maindb/local.py +1 -1
- nucliadb/common/maindb/pg.py +1 -1
- nucliadb/common/nidx.py +19 -1
- nucliadb/common/vector_index_config.py +1 -1
- nucliadb/export_import/datamanager.py +3 -3
- nucliadb/ingest/consumer/pull.py +7 -0
- nucliadb/ingest/consumer/service.py +2 -27
- nucliadb/ingest/consumer/shard_creator.py +17 -6
- nucliadb/ingest/fields/base.py +9 -17
- nucliadb/ingest/fields/conversation.py +47 -1
- nucliadb/ingest/orm/brain_v2.py +21 -3
- nucliadb/ingest/orm/index_message.py +126 -111
- nucliadb/ingest/orm/knowledgebox.py +84 -43
- nucliadb/ingest/orm/processor/auditing.py +1 -1
- nucliadb/ingest/orm/processor/processor.py +95 -149
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +10 -1
- nucliadb/ingest/partitions.py +12 -1
- nucliadb/ingest/serialize.py +2 -2
- nucliadb/ingest/service/writer.py +26 -19
- nucliadb/ingest/settings.py +33 -11
- nucliadb/learning_proxy.py +12 -15
- nucliadb/metrics_exporter.py +17 -4
- nucliadb/migrator/datamanager.py +11 -17
- nucliadb/migrator/migrator.py +2 -2
- nucliadb/purge/__init__.py +12 -17
- nucliadb/purge/orphan_shards.py +2 -2
- nucliadb/reader/api/v1/knowledgebox.py +40 -12
- nucliadb/reader/api/v1/learning_config.py +30 -10
- nucliadb/reader/api/v1/resource.py +2 -2
- nucliadb/reader/api/v1/services.py +1 -1
- nucliadb/reader/reader/notifications.py +1 -1
- nucliadb/search/api/v1/__init__.py +1 -0
- nucliadb/search/api/v1/catalog.py +4 -4
- nucliadb/search/api/v1/find.py +1 -4
- nucliadb/search/api/v1/hydrate.py +328 -0
- nucliadb/search/api/v1/resource/ask.py +21 -1
- nucliadb/search/api/v1/search.py +1 -4
- nucliadb/search/predict.py +9 -2
- nucliadb/search/search/cache.py +1 -20
- nucliadb/search/search/chat/ask.py +50 -8
- nucliadb/search/search/chat/prompt.py +47 -15
- nucliadb/search/search/chat/query.py +8 -1
- nucliadb/search/search/fetch.py +1 -1
- nucliadb/search/search/find.py +1 -6
- nucliadb/search/search/{hydrator.py → hydrator/__init__.py} +5 -4
- nucliadb/search/search/hydrator/fields.py +175 -0
- nucliadb/search/search/hydrator/images.py +130 -0
- nucliadb/search/search/hydrator/paragraphs.py +307 -0
- nucliadb/search/search/hydrator/resources.py +56 -0
- nucliadb/search/search/metrics.py +16 -0
- nucliadb/search/search/predict_proxy.py +33 -11
- nucliadb/search/search/query.py +0 -23
- nucliadb/search/search/query_parser/fetcher.py +5 -5
- nucliadb/search/search/query_parser/models.py +1 -30
- nucliadb/search/search/query_parser/parsers/ask.py +1 -1
- nucliadb/search/search/query_parser/parsers/catalog.py +4 -7
- nucliadb/search/search/query_parser/parsers/common.py +16 -7
- nucliadb/search/search/query_parser/parsers/find.py +0 -11
- nucliadb/search/search/query_parser/parsers/graph.py +5 -5
- nucliadb/search/search/query_parser/parsers/search.py +0 -11
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +4 -11
- nucliadb/search/search/rerankers.py +1 -1
- nucliadb/search/search/summarize.py +1 -1
- nucliadb/standalone/run.py +3 -0
- nucliadb/tasks/retries.py +4 -4
- nucliadb/train/generators/sentence_classifier.py +2 -8
- nucliadb/train/generators/utils.py +1 -1
- nucliadb/train/nodes.py +4 -4
- nucliadb/train/servicer.py +1 -1
- nucliadb/train/uploader.py +1 -1
- nucliadb/writer/api/v1/field.py +14 -9
- nucliadb/writer/api/v1/knowledgebox.py +15 -52
- nucliadb/writer/api/v1/learning_config.py +5 -4
- nucliadb/writer/api/v1/resource.py +2 -2
- nucliadb/writer/resource/field.py +38 -2
- nucliadb/writer/tus/azure.py +4 -4
- nucliadb/writer/tus/gcs.py +11 -17
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/METADATA +9 -10
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/RECORD +124 -114
- nucliadb/common/external_index_providers/pinecone.py +0 -894
- nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/WHEEL +0 -0
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/top_level.txt +0 -0
@@ -34,8 +34,10 @@ from nucliadb.common.external_index_providers.base import ExternalIndexManager
 from nucliadb.common.external_index_providers.manager import (
     get_external_index_manager,
 )
+from nucliadb.common.maindb.utils import get_driver
 from nucliadb.common.nidx import get_nidx_api_client
 from nucliadb.common.vector_index_config import nucliadb_index_config_to_nidx
+from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
 from nucliadb.migrator.settings import settings
 from nucliadb_protos import utils_pb2, writer_pb2
 from nucliadb_telemetry import errors
@@ -45,6 +47,7 @@ from .utils import (
     get_resource,
     get_rollover_resource_index_message,
     index_resource_to_shard,
+    wait_for_nidx,
 )

 logger = logging.getLogger(__name__)
@@ -254,6 +257,7 @@ async def index_to_rollover_index(
             for rid in resource_ids
         ]
         await asyncio.gather(*batch)
+        await wait_for_indexing_to_catch_up(app_context)

    async with datamanagers.with_transaction() as txn:
        state.resources_indexed = True
@@ -262,6 +266,22 @@ async def index_to_rollover_index(
        await txn.commit()


+async def wait_for_indexing_to_catch_up(app_context: ApplicationContext):
+    try:
+        app_context.nats_manager
+    except AssertionError:
+        logger.warning("Nats manager not initialized. Cannot wait for indexing to catch up")
+        return
+    max_pending = 1000
+    while True:
+        try:
+            await wait_for_nidx(app_context.nats_manager, max_wait_seconds=60, max_pending=max_pending)
+            return
+        except asyncio.TimeoutError:
+            logger.warning(f"Nidx is behind more than {max_pending} messages. Throttling rollover.")
+            await asyncio.sleep(30)
+
+
 async def _index_resource_to_rollover_index(
     app_context: ApplicationContext,
     rollover_shards: writer_pb2.Shards,
@@ -415,6 +435,15 @@ async def cutover_shards(app_context: ApplicationContext, kbid: str) -> None:

        await txn.commit()

+    # For KBs with pre-warm enabled, we must configure the new shards. There may
+    # be some small delay between this call and the shards being actually
+    # prewarmed, but rollovers are quite unusual and we prefer this rather than
+    # prewarming old and new shards at the same time
+    kb_config = await datamanagers.atomic.kb.get_config(kbid=kbid)
+    if kb_config is not None and kb_config.prewarm_enabled:
+        driver = get_driver()
+        await KnowledgeBox.configure_shards(driver, kbid, prewarm=True)
+

 async def validate_indexed_data(
     app_context: ApplicationContext, kbid: str, external: Optional[ExternalIndexManager] = None
@@ -42,7 +42,7 @@ class Settings(BaseSettings):
         description="Maximum number of paragraphs to target per shard",
     )
     max_resource_paragraphs: int = Field(
-        default=
+        default=300_000,
         title="Max paragraphs per resource",
         description="Maximum number of paragraphs allowed on a single resource",
     )
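The `max_resource_paragraphs` value above is a plain pydantic-settings field, so the new 300_000 default can still be overridden per deployment through the environment. A minimal standalone sketch of that mechanism (a hypothetical `ShardSettings` class, not the actual nucliadb `Settings`, which may configure prefixes or other model options):

```python
import os

from pydantic import Field
from pydantic_settings import BaseSettings


class ShardSettings(BaseSettings):
    # Stand-in mirroring the field from the hunk above.
    max_resource_paragraphs: int = Field(
        default=300_000,
        title="Max paragraphs per resource",
        description="Maximum number of paragraphs allowed on a single resource",
    )


print(ShardSettings().max_resource_paragraphs)   # 300000 (the new default)

# With no env_prefix configured, pydantic-settings matches the field name
# case-insensitively against environment variables.
os.environ["MAX_RESOURCE_PARAGRAPHS"] = "50000"
print(ShardSettings().max_resource_paragraphs)   # 50000 (deployment override)
```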
nucliadb/common/cluster/utils.py CHANGED
@@ -32,6 +32,7 @@ from nucliadb.common.cluster.settings import settings
 from nucliadb.ingest.orm import index_message
 from nucliadb.ingest.orm.resource import Resource
 from nucliadb_protos import writer_pb2
+from nucliadb_utils.nats import NatsConnectionManager
 from nucliadb_utils.utilities import Utility, clean_utility, get_utility, set_utility

 if TYPE_CHECKING: # pragma: no cover
@@ -125,3 +126,28 @@ async def delete_resource_from_shard(
     partition = partitioning.generate_partition(kbid, resource_id)

     await sm.delete_resource(shard, resource_id, 0, str(partition), kbid)
+
+
+async def get_nats_consumer_pending_messages(
+    nats_manager: NatsConnectionManager, *, stream: str, consumer: str
+) -> int:
+    # get raw js client
+    js = nats_manager.js
+    consumer_info = await js.consumer_info(stream, consumer)
+    return consumer_info.num_pending
+
+
+async def wait_for_nidx(
+    nats_manager: NatsConnectionManager,
+    max_pending: int,
+    poll_interval_seconds: int = 5,
+    max_wait_seconds: int = 60,
+):
+    async with asyncio.timeout(max_wait_seconds): # type: ignore
+        while True:
+            pending = await get_nats_consumer_pending_messages(
+                nats_manager, stream="nidx", consumer="nidx"
+            )
+            if pending < max_pending:
+                return
+            await asyncio.sleep(poll_interval_seconds)
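The two helpers added above give rollovers back-pressure: poll the pending-message count of the `nidx` NATS consumer and bound the wait with `asyncio.timeout` (Python 3.11+), so the caller throttles whenever the indexer falls behind. A runnable sketch of the same control flow, with a stubbed pending counter instead of a real `NatsConnectionManager` (the names below are illustrative, not the nucliadb API):

```python
import asyncio
from typing import Awaitable, Callable


async def wait_until_caught_up(
    get_pending: Callable[[], Awaitable[int]],
    *,
    max_pending: int,
    poll_interval_seconds: float = 5,
    max_wait_seconds: float = 60,
) -> None:
    # Same shape as wait_for_nidx: poll the consumer lag until it drops below
    # the threshold, or let asyncio.timeout raise TimeoutError.
    async with asyncio.timeout(max_wait_seconds):
        while True:
            if await get_pending() < max_pending:
                return
            await asyncio.sleep(poll_interval_seconds)


async def main() -> None:
    pending = 3000

    async def fake_pending() -> int:
        # Stub that drains 1500 messages per poll (illustrative only).
        nonlocal pending
        pending = max(0, pending - 1500)
        return pending

    try:
        await wait_until_caught_up(fake_pending, max_pending=1000, poll_interval_seconds=0.01)
        print("indexer caught up")
    except TimeoutError:
        print("still behind, throttle the producer")


asyncio.run(main())
```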
@@ -42,6 +42,7 @@ from typing_extensions import Concatenate, ParamSpec

 from nucliadb.common.maindb.driver import Transaction

+from . import cluster as cluster_dm
 from . import kb as kb_dm
 from . import labels as labels_dm
 from . import resources as resources_dm
@@ -73,6 +74,10 @@ def rw_txn_wrap(fun: Callable[Concatenate[Transaction, P], Awaitable[T]]) -> Cal
     return wrapper


+class cluster:
+    get_kb_shards = ro_txn_wrap(cluster_dm.get_kb_shards)
+
+
 class kb:
     exists_kb = ro_txn_wrap(kb_dm.exists_kb)
     get_config = ro_txn_wrap(kb_dm.get_config)
@@ -83,6 +88,7 @@ class resources:
     get_resource_uuid_from_slug = ro_txn_wrap(resources_dm.get_resource_uuid_from_slug)
     resource_exists = ro_txn_wrap(resources_dm.resource_exists)
     slug_exists = ro_txn_wrap(resources_dm.slug_exists)
+    get_all_field_ids = ro_txn_wrap(resources_dm.get_all_field_ids)


 class labelset:
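The new `cluster` wrapper above follows the existing `datamanagers.atomic` pattern: `ro_txn_wrap` turns a function that expects a `Transaction` as its first argument into one that opens (and closes) a read-only transaction itself. `ro_txn_wrap` is not shown in this diff, so the sketch below uses a simplified stand-in to illustrate why `atomic.cluster.get_kb_shards(kbid=...)` can be called without managing a transaction:

```python
import asyncio
import contextlib
from functools import wraps
from typing import AsyncIterator, Awaitable, Callable, TypeVar

T = TypeVar("T")


class Transaction:
    """Stand-in for nucliadb's maindb Transaction."""

    async def get(self, key: str) -> str:
        return f"<value of {key}>"


@contextlib.asynccontextmanager
async def ro_transaction() -> AsyncIterator[Transaction]:
    # Stand-in for `driver.ro_transaction()`.
    yield Transaction()


def ro_txn_wrap(fun: Callable[..., Awaitable[T]]) -> Callable[..., Awaitable[T]]:
    # Open a read-only transaction and pass it as the first argument,
    # so callers never manage transactions themselves.
    @wraps(fun)
    async def wrapper(*args, **kwargs) -> T:
        async with ro_transaction() as txn:
            return await fun(txn, *args, **kwargs)

    return wrapper


async def get_kb_shards(txn: Transaction, *, kbid: str) -> str:
    return await txn.get(f"/kbs/{kbid}/shards")


class cluster:
    get_kb_shards = staticmethod(ro_txn_wrap(get_kb_shards))


print(asyncio.run(cluster.get_kb_shards(kbid="my-kb")))
```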
@@ -42,7 +42,7 @@ async def get_kv_pb(
 @contextlib.asynccontextmanager
 async def with_rw_transaction():
     driver = get_driver()
-    async with driver.
+    async with driver.rw_transaction() as txn:
         yield txn


@@ -53,5 +53,5 @@ with_transaction = with_rw_transaction
 @contextlib.asynccontextmanager
 async def with_ro_transaction():
     driver = get_driver()
-    async with driver.
+    async with driver.ro_transaction() as ro_txn:
         yield ro_txn
@@ -23,13 +23,9 @@ import async_lru

 from nucliadb.common import datamanagers
 from nucliadb.common.external_index_providers.base import ExternalIndexManager
-from nucliadb.common.external_index_providers.pinecone import PineconeIndexManager
-from nucliadb.common.external_index_providers.settings import settings
 from nucliadb_protos.knowledgebox_pb2 import (
-    ExternalIndexProviderType,
     StoredExternalIndexProviderMetadata,
 )
-from nucliadb_utils.utilities import get_endecryptor


 async def get_external_index_manager(
@@ -39,31 +35,7 @@ async def get_external_index_manager(
     Returns an ExternalIndexManager for the given kbid.
     If for_rollover is True, the ExternalIndexManager returned will include the rollover indexes (if any).
     """
-
-    if metadata is None or metadata.type != ExternalIndexProviderType.PINECONE:
-        # Only Pinecone is supported for now
-        return None
-
-    api_key = get_endecryptor().decrypt(metadata.pinecone_config.encrypted_api_key)
-    default_vectorset = await get_default_vectorset_id(kbid)
-
-    rollover_indexes = None
-    if for_rollover:
-        rollover_metadata = await get_rollover_external_index_metadata(kbid)
-        if rollover_metadata is not None:
-            rollover_indexes = dict(rollover_metadata.pinecone_config.indexes)
-
-    return PineconeIndexManager(
-        kbid=kbid,
-        api_key=api_key,
-        indexes=dict(metadata.pinecone_config.indexes),
-        upsert_parallelism=settings.pinecone_upsert_parallelism,
-        delete_parallelism=settings.pinecone_delete_parallelism,
-        upsert_timeout=settings.pinecone_upsert_timeout,
-        delete_timeout=settings.pinecone_delete_timeout,
-        default_vectorset=default_vectorset,
-        rollover_indexes=rollover_indexes,
-    )
+    return None


 @async_lru.alru_cache(maxsize=None)
@@ -17,36 +17,10 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
-from pydantic import Field
 from pydantic_settings import BaseSettings


-class ExternalIndexProvidersSettings(BaseSettings):
-    pinecone_upsert_parallelism: int = Field(
-        default=3,
-        title="Pinecone upsert parallelism",
-        description="Number of parallel upserts to Pinecone on each set resource operation",
-    )
-    pinecone_delete_parallelism: int = Field(
-        default=2,
-        title="Pinecone delete parallelism",
-        description="Number of parallel deletes to Pinecone on each delete resource operation",
-    )
-    pinecone_upsert_timeout: float = Field(
-        default=10.0,
-        title="Pinecone upsert timeout",
-        description="Timeout in seconds for each upsert operation to Pinecone",
-    )
-    pinecone_delete_timeout: float = Field(
-        default=10.0,
-        title="Pinecone delete timeout",
-        description="Timeout in seconds for each delete operation to Pinecone",
-    )
-    pinecone_query_timeout: float = Field(
-        default=10.0,
-        title="Pinecone query timeout",
-        description="Timeout in seconds for each query operation to Pinecone",
-    )
+class ExternalIndexProvidersSettings(BaseSettings): ...


 settings = ExternalIndexProvidersSettings()
@@ -52,37 +52,20 @@ from nucliadb_models.filters import (
 )

 # Filters that end up as a facet
-FacetFilter =
-    OriginTag
-    Label
-    ResourceMimetype
-    FieldMimetype
-    Entity
-    Language
-    OriginMetadata
-    OriginPath
-    Generated
-    Kind
-    OriginCollaborator
-    OriginSource
-    Status
-]
-# In Python 3.9 we cannot do isinstance against an union
-# Once we support only 3.10+, we can remove this
-FacetFilterTypes = (
-    OriginTag,
-    Label,
-    ResourceMimetype,
-    FieldMimetype,
-    Entity,
-    Language,
-    OriginMetadata,
-    OriginPath,
-    Generated,
-    Kind,
-    OriginCollaborator,
-    OriginSource,
-    Status,
+FacetFilter = (
+    OriginTag
+    | Label
+    | ResourceMimetype
+    | FieldMimetype
+    | Entity
+    | Language
+    | OriginMetadata
+    | OriginPath
+    | Generated
+    | Kind
+    | OriginCollaborator
+    | OriginSource
+    | Status
 )

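The rewrite above replaces a `Union[...]` alias plus a parallel tuple of types with a single PEP 604 union, which `isinstance` accepts directly on Python 3.10+, so the separate `FacetFilterTypes` tuple is no longer needed. A small sketch with placeholder classes (not the real `nucliadb_models.filters` models):

```python
from dataclasses import dataclass


@dataclass
class Label:
    labelset: str
    label: str


@dataclass
class Entity:
    subtype: str
    value: str


@dataclass
class DateCreated:
    since: str


# Union alias in the same style as the new FacetFilter definition.
Facet = Label | Entity


def is_facet(expr: object) -> bool:
    # On Python 3.10+ a `X | Y` union alias can be passed straight to isinstance.
    return isinstance(expr, Facet)


print(is_facet(Label(labelset="topic", label="sports")))  # True
print(is_facet(DateCreated(since="2024-01-01")))          # False
```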
@@ -110,7 +93,7 @@ async def parse_expression(
             if rid is None:
                 raise InvalidQueryError("slug", f"Cannot find slug {expr.slug}")
             f.resource.resource_id = rid
-        else: # pragma:
+        else: # pragma: no cover
             # Cannot happen due to model validation
             raise ValueError("Resource needs id or slug")
     elif isinstance(expr, Field):
@@ -131,7 +114,7 @@ async def parse_expression(
             f.date.since.FromDatetime(expr.since)
         if expr.until:
             f.date.until.FromDatetime(expr.until)
-    elif isinstance(expr,
+    elif isinstance(expr, FacetFilter):
         f.facet.facet = facet_from_filter(expr)
     else:
         # This is a trick so mypy generates an error if this branch can be reached,
@@ -21,6 +21,10 @@ class ClientException(Exception):
     pass


+class ServerException(Exception):
+    pass
+
+
 class NotFoundException(ClientException):
     pass

@@ -35,3 +39,7 @@ class RateLimitException(ClientException):

 class AccountLimitException(ClientException):
     pass
+
+
+class ServiceUnavailableException(ServerException):
+    pass
@@ -209,6 +209,10 @@ class ProcessingHTTPClient:
     async def close(self):
         await self.session.close()

+    async def reset_session(self):
+        await self.close()
+        self.session = aiohttp.ClientSession()
+
     async def in_progress(self, ack_token: str):
         url = self.base_url_v2 + "/pull/in_progress"
         request = InProgressRequest(ack=[ack_token])
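`reset_session` above drops the client's aiohttp connection pool and starts a fresh one. A hedged sketch of how a caller might lean on that to recover from a wedged connection, using a hypothetical `SimpleHTTPClient` rather than the real `ProcessingHTTPClient` (which carries auth headers and more configuration):

```python
import aiohttp


class SimpleHTTPClient:
    """Hypothetical client mirroring the reset_session pattern from the diff."""

    def __init__(self, base_url: str):
        # Instantiate inside a running event loop so the session binds to it.
        self.base_url = base_url
        self.session = aiohttp.ClientSession()

    async def close(self) -> None:
        await self.session.close()

    async def reset_session(self) -> None:
        # Drop the old connection pool and start from a clean one.
        await self.close()
        self.session = aiohttp.ClientSession()

    async def pull(self) -> str:
        try:
            async with self.session.get(self.base_url + "/pull") as resp:
                return await resp.text()
        except aiohttp.ServerDisconnectedError:
            # One way to use reset_session: rebuild the pool and retry once.
            await self.reset_session()
            async with self.session.get(self.base_url + "/pull") as resp:
                return await resp.text()
```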
@@ -33,5 +33,8 @@ def check_status(resp: aiohttp.ClientResponse, resp_text: str) -> None:
         raise exceptions.AuthorizationException(f"Unauthorized to access: {resp.status}")
     elif resp.status == 429:
         raise exceptions.RateLimitException("Rate limited")
+    elif resp.status in (502, 503):
+        # Service unavailable, can be retried
+        raise exceptions.ServiceUnavailableException(f"Service unavailable: {resp.status} - {resp_text}")
     else:
         raise exceptions.ClientException(f"Unknown error: {resp.status} - {resp_text}")
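With 502/503 now mapped to `ServiceUnavailableException` (a `ServerException`), callers can retry transient upstream failures while still failing fast on genuine client errors. A standalone sketch of that retry split, using local stand-in exception classes that mirror the hierarchy above rather than importing the real module:

```python
import asyncio


# Stand-ins mirroring nucliadb.common.http_clients.exceptions from this diff.
class ClientException(Exception): ...
class ServerException(Exception): ...
class ServiceUnavailableException(ServerException): ...


_attempts_seen = 0


async def pull_once() -> str:
    # Deterministic stub: the first two calls fail with a 503 (illustrative only).
    global _attempts_seen
    _attempts_seen += 1
    if _attempts_seen <= 2:
        raise ServiceUnavailableException("Service unavailable: 503")
    return "payload"


async def pull_with_retries(max_attempts: int = 5, backoff_seconds: float = 0.1) -> str:
    for attempt in range(1, max_attempts + 1):
        try:
            return await pull_once()
        except ServiceUnavailableException:
            # Transient upstream failure: back off and retry.
            if attempt == max_attempts:
                raise
            await asyncio.sleep(backoff_seconds * attempt)
        # A ClientException (4xx) would propagate immediately, with no retry.
    raise RuntimeError("unreachable")


print(asyncio.run(pull_with_retries()))
```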
nucliadb/common/ids.py CHANGED
@@ -47,6 +47,8 @@ FIELD_TYPE_NAME_TO_STR = {
     FieldTypeName.CONVERSATION: "c",
 }

+FIELD_TYPE_STR_TO_NAME = {v: k for k, v in FIELD_TYPE_NAME_TO_STR.items()}
+

 @dataclass
 class FieldId:
@@ -65,7 +67,7 @@ class FieldId:

     Examples:

-    >>> FieldId(rid="rid", type="u", key="
+    >>> FieldId(rid="rid", type="u", key="my-link")
     FieldID("rid/u/my-link")
     >>> FieldId.from_string("rid/u/my-link")
     FieldID("rid/u/my-link")
@@ -77,31 +79,6 @@ class FieldId:
     # also knwon as `split`, this indicates a part of a field in, for example, conversations
     subfield_id: Optional[str] = None

-    def __repr__(self) -> str:
-        return f"FieldId({self.full()})"
-
-    def short_without_subfield(self) -> str:
-        return f"/{self.type}/{self.key}"
-
-    def full(self) -> str:
-        if self.subfield_id is None:
-            return f"{self.rid}/{self.type}/{self.key}"
-        else:
-            return f"{self.rid}/{self.type}/{self.key}/{self.subfield_id}"
-
-    def __hash__(self) -> int:
-        return hash(self.full())
-
-    @property
-    def pb_type(self) -> FieldType.ValueType:
-        return FIELD_TYPE_STR_TO_PB[self.type]
-
-    @classmethod
-    def from_pb(
-        cls, rid: str, field_type: FieldType.ValueType, key: str, subfield_id: Optional[str] = None
-    ) -> "FieldId":
-        return cls(rid=rid, type=FIELD_TYPE_PB_TO_STR[field_type], key=key, subfield_id=subfield_id)
-
     @classmethod
     def from_string(cls, value: str) -> "FieldId":
         """
@@ -120,11 +97,11 @@ class FieldId:
         parts = value.split("/")
         if len(parts) == 3:
             rid, _type, key = parts
-            _type = cls.
+            _type = cls._parse_field_type(_type)
             return cls(rid=rid, type=_type, key=key)
         elif len(parts) == 4:
             rid, _type, key, subfield_id = parts
-            _type = cls.
+            _type = cls._parse_field_type(_type)
             return cls(
                 rid=rid,
                 type=_type,
@@ -135,7 +112,46 @@ class FieldId:
             raise ValueError(f"Invalid FieldId: {value}")

     @classmethod
-    def
+    def from_pb(
+        cls, rid: str, field_type: FieldType.ValueType, key: str, subfield_id: Optional[str] = None
+    ) -> "FieldId":
+        return cls(rid=rid, type=FIELD_TYPE_PB_TO_STR[field_type], key=key, subfield_id=subfield_id)
+
+    @property
+    def pb_type(self) -> FieldType.ValueType:
+        return FIELD_TYPE_STR_TO_PB[self.type]
+
+    def full(self) -> str:
+        if self.subfield_id is None:
+            return f"{self.rid}/{self.type}/{self.key}"
+        else:
+            return f"{self.rid}/{self.type}/{self.key}/{self.subfield_id}"
+
+    def short_without_subfield(self) -> str:
+        return f"/{self.type}/{self.key}"
+
+    def paragraph_id(self, paragraph_start: int, paragraph_end: int) -> "ParagraphId":
+        """Generate a ParagraphId from the current field given its start and
+        end.
+
+        """
+        return ParagraphId(
+            field_id=self,
+            paragraph_start=paragraph_start,
+            paragraph_end=paragraph_end,
+        )
+
+    def __str__(self) -> str:
+        return self.full()
+
+    def __repr__(self) -> str:
+        return f"FieldId({self.full()})"
+
+    def __hash__(self) -> int:
+        return hash(self.full())
+
+    @staticmethod
+    def _parse_field_type(_type: str) -> str:
         if _type not in FIELD_TYPE_STR_TO_PB:
             # Try to parse the enum value
             # XXX: This is to support field types that are integer values of FieldType
@@ -157,19 +173,6 @@ class ParagraphId:
     paragraph_start: int
     paragraph_end: int

-    def __repr__(self) -> str:
-        return f"ParagraphId({self.full()})"
-
-    def full(self) -> str:
-        return f"{self.field_id.full()}/{self.paragraph_start}-{self.paragraph_end}"
-
-    def __hash__(self) -> int:
-        return hash(self.full())
-
-    @property
-    def rid(self) -> str:
-        return self.field_id.rid
-
     @classmethod
     def from_string(cls, value: str) -> "ParagraphId":
         parts = value.split("/")
@@ -192,6 +195,22 @@ class ParagraphId:
             paragraph_end=vid.vector_end,
         )

+    @property
+    def rid(self) -> str:
+        return self.field_id.rid
+
+    def full(self) -> str:
+        return f"{self.field_id.full()}/{self.paragraph_start}-{self.paragraph_end}"
+
+    def __str__(self) -> str:
+        return self.full()
+
+    def __repr__(self) -> str:
+        return f"ParagraphId({self.full()})"
+
+    def __hash__(self) -> int:
+        return hash(self.full())
+

 @dataclass
 class VectorId:
@@ -217,19 +236,6 @@ class VectorId:
     vector_start: int
     vector_end: int

-    def __repr__(self) -> str:
-        return f"VectorId({self.full()})"
-
-    def full(self) -> str:
-        return f"{self.field_id.full()}/{self.index}/{self.vector_start}-{self.vector_end}"
-
-    def __hash__(self) -> int:
-        return hash(self.full())
-
-    @property
-    def rid(self) -> str:
-        return self.field_id.rid
-
     @classmethod
     def from_string(cls, value: str) -> "VectorId":
         parts = value.split("/")
@@ -239,6 +245,22 @@ class VectorId:
         field_id = FieldId.from_string("/".join(parts[:-2]))
         return cls(field_id=field_id, index=index, vector_start=start, vector_end=end)

+    @property
+    def rid(self) -> str:
+        return self.field_id.rid
+
+    def full(self) -> str:
+        return f"{self.field_id.full()}/{self.index}/{self.vector_start}-{self.vector_end}"
+
+    def __str__(self) -> str:
+        return self.full()
+
+    def __repr__(self) -> str:
+        return f"VectorId({self.full()})"
+
+    def __hash__(self) -> int:
+        return hash(self.full())
+

 def extract_data_augmentation_id(generated_field_id: str) -> Optional[str]:
     """Data augmentation generated fields have a strict id with the following
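The `FieldId`/`ParagraphId`/`VectorId` changes above are largely a reordering of methods plus new `__str__` implementations and the `FieldId.paragraph_id()` helper; the string formats themselves stay the same. A short usage sketch based only on the formats shown in this diff (verify against a matching nucliadb version):

```python
from nucliadb.common.ids import FieldId

# rid/type/key, with an optional subfield (e.g. a conversation split) as a 4th part.
field = FieldId.from_string("0b8b7bb9/u/my-link")
print(field.full())      # "0b8b7bb9/u/my-link"
print(str(field))        # same as full(), via the new __str__

# New helper: build a ParagraphId for a character range within the field.
paragraph = field.paragraph_id(paragraph_start=0, paragraph_end=120)
print(paragraph.full())  # "0b8b7bb9/u/my-link/0-120"
print(paragraph.rid)     # "0b8b7bb9"
```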
nucliadb/common/locking.py CHANGED
@@ -75,7 +75,7 @@ class _Lock:
         start = time.time()
         while True:
             try:
-                async with self.driver.
+                async with self.driver.rw_transaction() as txn:
                     lock_data = await self.get_lock_data(txn)
                     if lock_data is None:
                         await self._set_lock_value(txn)
@@ -128,7 +128,7 @@ class _Lock:
         while True:
             try:
                 await asyncio.sleep(self.refresh_timeout)
-                async with self.driver.
+                async with self.driver.rw_transaction() as txn:
                     await self._update_lock_value(txn)
                     await txn.commit()
             except (asyncio.CancelledError, RuntimeError):
@@ -138,12 +138,12 @@ class _Lock:

     async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
         self.task.cancel()
-        async with self.driver.
+        async with self.driver.rw_transaction() as txn:
             await txn.delete(self.key)
             await txn.commit()

     async def is_locked(self) -> bool:
-        async with get_driver().
+        async with get_driver().ro_transaction() as txn:
             lock_data = await self.get_lock_data(txn)
             return lock_data is not None and time.time() < lock_data.expires_at

nucliadb/common/maindb/driver.py CHANGED
@@ -81,5 +81,15 @@ class Driver:
         pass

     @asynccontextmanager
-    async def
+    async def _transaction(self, *, read_only: bool) -> AsyncGenerator[Transaction, None]:
         yield Transaction()
+
+    @asynccontextmanager
+    async def ro_transaction(self) -> AsyncGenerator[Transaction, None]:
+        async with self._transaction(read_only=True) as txn:
+            yield txn
+
+    @asynccontextmanager
+    async def rw_transaction(self) -> AsyncGenerator[Transaction, None]:
+        async with self._transaction(read_only=False) as txn:
+            yield txn
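The `Driver` base class above replaces its single transaction entry point with an explicit `ro_transaction()` / `rw_transaction()` pair, both delegating to a private `_transaction(read_only=...)` that concrete drivers (local, PG) override, as seen in the next two files. A self-contained sketch of that calling convention using stub classes instead of a real driver (the stub transaction API is illustrative; the real `Transaction` has more methods):

```python
import asyncio
from contextlib import asynccontextmanager
from typing import AsyncIterator, Optional


class StubTransaction:
    """Illustrative stand-in for nucliadb's maindb Transaction."""

    def __init__(self, store: dict, read_only: bool):
        self.store = store
        self.read_only = read_only

    async def get(self, key: str) -> Optional[bytes]:
        return self.store.get(key)

    async def set(self, key: str, value: bytes) -> None:
        assert not self.read_only, "writes need rw_transaction()"
        self.store[key] = value

    async def commit(self) -> None: ...


class StubDriver:
    # Same shape as the new Driver API: one private _transaction(read_only=...)
    # plus two public, explicitly named context managers.
    def __init__(self) -> None:
        self.store: dict = {}

    @asynccontextmanager
    async def _transaction(self, *, read_only: bool) -> AsyncIterator[StubTransaction]:
        yield StubTransaction(self.store, read_only)

    @asynccontextmanager
    async def ro_transaction(self) -> AsyncIterator[StubTransaction]:
        async with self._transaction(read_only=True) as txn:
            yield txn

    @asynccontextmanager
    async def rw_transaction(self) -> AsyncIterator[StubTransaction]:
        async with self._transaction(read_only=False) as txn:
            yield txn


async def main() -> None:
    driver = StubDriver()
    async with driver.rw_transaction() as txn:
        await txn.set("/kb/config", b"{}")
        await txn.commit()
    async with driver.ro_transaction() as txn:
        print(await txn.get("/kb/config"))


asyncio.run(main())
```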
nucliadb/common/maindb/local.py CHANGED
@@ -222,7 +222,7 @@ class LocalDriver(Driver):
         pass

     @asynccontextmanager
-    async def
+    async def _transaction(self, *, read_only: bool) -> AsyncGenerator[Transaction, None]:
         if self.url is None:
             raise AttributeError("Invalid url")
         txn = LocalTransaction(self.url, self)
nucliadb/common/maindb/pg.py CHANGED
@@ -330,7 +330,7 @@ class PGDriver(Driver):
             metric.set(value)

     @asynccontextmanager
-    async def
+    async def _transaction(self, *, read_only: bool) -> AsyncGenerator[Transaction, None]:
         if read_only:
             yield ReadOnlyPGTransaction(self)
         else:
nucliadb/common/nidx.py CHANGED
@@ -82,6 +82,24 @@ def _storage_config(prefix: str, bucket: Optional[str]) -> dict[str, str]:
         config[f"{prefix}__REGION_NAME"] = storage_settings.s3_region_name or ""
         if storage_settings.s3_endpoint:
             config[f"{prefix}__ENDPOINT"] = storage_settings.s3_endpoint
+    elif storage_settings.file_backend == FileBackendConfig.AZURE:
+        if storage_settings.azure_account_url is None:
+            raise ValueError("Azure account is required")
+        config[f"{prefix}__OBJECT_STORE"] = "azure"
+        url = storage_settings.azure_account_url
+        container = bucket or extended_storage_settings.azure_indexing_bucket
+        if container:
+            url += f"/{container}"
+        config[f"{prefix}__CONTAINER_URL"] = url
+        if storage_settings.azure_connection_string:
+            params = {
+                p.split("=", 1)[0]: p.split("=", 1)[1]
+                for p in storage_settings.azure_connection_string.split(";")
+            }
+            if "AccountKey" in params:
+                config[f"{prefix}__ACCOUNT_KEY"] = params["AccountKey"]
+            if "BlobEndpoint" in params:
+                config[f"{prefix}__ENDPOINT"] = params["BlobEndpoint"]

     return config

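The Azure branch above derives the nidx object-store settings by splitting the connection string on `;` and each entry on the first `=` only, so base64 account keys that themselves end in `=` survive intact. A standalone sketch of just that parsing step (the connection-string values are made up):

```python
def parse_azure_connection_string(connection_string: str) -> dict:
    # Split "Key=Value;Key2=Value2;..." into a dict, splitting each entry only
    # on the first "=" so base64 values containing "=" stay intact.
    return {
        part.split("=", 1)[0]: part.split("=", 1)[1]
        for part in connection_string.split(";")
        if part
    }


conn = (
    "DefaultEndpointsProtocol=https;"
    "AccountName=devaccount;"
    "AccountKey=bm90LWEtcmVhbC1rZXk=;"
    "BlobEndpoint=http://127.0.0.1:10000/devaccount"
)
params = parse_azure_connection_string(conn)
print(params["AccountKey"])    # bm90LWEtcmVhbC1rZXk=
print(params["BlobEndpoint"])  # http://127.0.0.1:10000/devaccount
```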
@@ -198,7 +216,7 @@ class NidxServiceUtility(NidxUtility):
         return await self.indexer.index(writer)


-async def start_nidx_utility(service_name: str = "nucliadb.nidx") ->
+async def start_nidx_utility(service_name: str = "nucliadb.nidx") -> NidxUtility:
     nidx = get_utility(Utility.NIDX)
     if nidx:
         return nidx
@@ -26,7 +26,7 @@ from nucliadb_protos import knowledgebox_pb2 as Nucliadb
 def nucliadb_vector_type_to_nidx(nucliadb: Nucliadb.VectorType.ValueType) -> Nidx.VectorType.ValueType:
     if nucliadb == Nucliadb.DENSE_F32:
         return Nidx.DENSE_F32
-    else: # pragma:
+    else: # pragma: no cover
         raise Exception("Unknown vector type")
