nucliadb 6.4.0.post4127__py3-none-any.whl → 6.4.0.post4132__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nucliadb/common/cluster/grpc_node_dummy.py +1 -18
- nucliadb/common/cluster/manager.py +26 -21
- nucliadb/common/cluster/rebalance.py +7 -7
- nucliadb/common/cluster/rollover.py +12 -5
- nucliadb/common/nidx.py +0 -44
- nucliadb/ingest/consumer/auditing.py +5 -5
- nucliadb/ingest/consumer/shard_creator.py +5 -4
- nucliadb/ingest/orm/entities.py +4 -5
- nucliadb/metrics_exporter.py +0 -19
- nucliadb/purge/orphan_shards.py +17 -14
- nucliadb/search/api/v1/knowledgebox.py +6 -14
- nucliadb/search/api/v1/resource/search.py +2 -5
- nucliadb/search/api/v1/search.py +2 -6
- nucliadb/search/api/v1/suggest.py +1 -2
- nucliadb/search/requesters/utils.py +14 -33
- nucliadb/search/search/find.py +2 -8
- nucliadb/search/search/shards.py +9 -25
- nucliadb/train/generator.py +9 -11
- nucliadb/train/generators/field_classifier.py +3 -5
- nucliadb/train/generators/field_streaming.py +3 -5
- nucliadb/train/generators/image_classifier.py +1 -4
- nucliadb/train/generators/paragraph_classifier.py +3 -5
- nucliadb/train/generators/paragraph_streaming.py +3 -5
- nucliadb/train/generators/question_answer_streaming.py +3 -5
- nucliadb/train/generators/sentence_classifier.py +3 -5
- nucliadb/train/generators/token_classifier.py +3 -5
- nucliadb/train/nodes.py +2 -4
- {nucliadb-6.4.0.post4127.dist-info → nucliadb-6.4.0.post4132.dist-info}/METADATA +6 -6
- {nucliadb-6.4.0.post4127.dist-info → nucliadb-6.4.0.post4132.dist-info}/RECORD +32 -33
- nucliadb/common/cluster/base.py +0 -146
- {nucliadb-6.4.0.post4127.dist-info → nucliadb-6.4.0.post4132.dist-info}/WHEEL +0 -0
- {nucliadb-6.4.0.post4127.dist-info → nucliadb-6.4.0.post4132.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.4.0.post4127.dist-info → nucliadb-6.4.0.post4132.dist-info}/top_level.txt +0 -0
nucliadb/common/cluster/grpc_node_dummy.py
CHANGED
@@ -19,22 +19,15 @@
 #
 from typing import Any

-from nidx_protos.nodereader_pb2 import (
-    EdgeList,
-    RelationEdge,
-)
+from nidx_protos.noderesources_pb2 import Shard as NodeResourcesShard
 from nidx_protos.noderesources_pb2 import (
-    EmptyResponse,
     ShardCreated,
     ShardId,
     ShardIds,
     VectorSetList,
 )
-from nidx_protos.noderesources_pb2 import Shard as NodeResourcesShard
 from nidx_protos.nodewriter_pb2 import OpStatus

-from nucliadb_protos.utils_pb2 import Relation
-

 class DummyWriterStub:  # pragma: no cover
     def __init__(self: "DummyWriterStub"):
@@ -77,10 +70,6 @@ class DummyWriterStub:  # pragma: no cover
         result.vectorsets.append("base")
         return result

-    async def GC(self, request: ShardId) -> EmptyResponse:  # pragma: no cover
-        self.calls.setdefault("GC", []).append(request)
-        return EmptyResponse()
-

 class DummyReaderStub:  # pragma: no cover
     def __init__(self: "DummyReaderStub"):
@@ -89,9 +78,3 @@ class DummyReaderStub:  # pragma: no cover
     async def GetShard(self, data):  # pragma: no cover
         self.calls.setdefault("GetShard", []).append(data)
         return NodeResourcesShard(shard_id="shard", fields=2, paragraphs=2, sentences=2)
-
-    async def RelationEdges(self, data):  # pragma: no cover
-        self.calls.setdefault("RelationEdges", []).append(data)
-        result = EdgeList()
-        result.list.append(RelationEdge(edge_type=Relation.RelationType.ENTITY, property="dummy"))
-        return result
nucliadb/common/cluster/manager.py
CHANGED
@@ -23,17 +23,21 @@ import uuid
 from typing import Any, Awaitable, Callable, Optional

 from nidx_protos import noderesources_pb2, nodewriter_pb2
-from nidx_protos.nodewriter_pb2 import IndexMessage, IndexMessageSource, NewShardRequest, TypeMessage
+from nidx_protos.nodewriter_pb2 import (
+    IndexMessage,
+    IndexMessageSource,
+    NewShardRequest,
+    NewVectorSetRequest,
+    TypeMessage,
+)

 from nucliadb.common import datamanagers
-from nucliadb.common.cluster.base import AbstractIndexNode
 from nucliadb.common.cluster.exceptions import (
     NodeError,
-    ShardNotFound,
     ShardsNotFound,
 )
 from nucliadb.common.maindb.driver import Transaction
-from nucliadb.common.nidx import get_nidx, get_nidx_api_client, get_nidx_fake_node
+from nucliadb.common.nidx import get_nidx, get_nidx_api_client
 from nucliadb.common.vector_index_config import nucliadb_index_config_to_nidx
 from nucliadb_protos import knowledgebox_pb2, writer_pb2
 from nucliadb_telemetry import errors
@@ -63,18 +67,14 @@ class KBShardManager:
     async def apply_for_all_shards(
         self,
         kbid: str,
-        aw: Callable[[AbstractIndexNode, str], Awaitable[Any]],
+        aw: Callable[[str], Awaitable[Any]],
         timeout: float,
     ) -> list[Any]:
         shards = await self.get_shards_by_kbid(kbid)
         ops = []

         for shard_obj in shards:
-            node, shard_id = choose_node(shard_obj)
-            if shard_id is None:
-                raise ShardNotFound("Found a node but not a shard")
-
-            ops.append(aw(node, shard_id))
+            ops.append(aw(shard_obj.nidx_shard_id))

         try:
             results = await asyncio.wait_for(
@@ -252,10 +252,18 @@ class KBShardManager:
     async def create_vectorset(self, kbid: str, config: knowledgebox_pb2.VectorSetConfig):
         """Create a new vectorset in all KB shards."""

-        async def _create_vectorset(node: AbstractIndexNode, shard_id: str):
+        async def _create_vectorset(shard_id: str):
             vectorset_id = config.vectorset_id
             index_config = nucliadb_index_config_to_nidx(config.vectorset_index_config)
-            result = await node.add_vectorset(shard_id, vectorset_id, index_config)
+
+            req = NewVectorSetRequest(
+                id=noderesources_pb2.VectorSetID(
+                    shard=noderesources_pb2.ShardId(id=shard_id), vectorset=vectorset_id
+                ),
+                config=index_config,
+            )
+
+            result = await get_nidx_api_client().AddVectorSet(req)
             if result.status != result.Status.OK:
                 raise NodeError(
                     f"Unable to create vectorset {vectorset_id} in kb {kbid} shard {shard_id}"
@@ -266,8 +274,12 @@ class KBShardManager:
     async def delete_vectorset(self, kbid: str, vectorset_id: str):
         """Delete a vectorset from all KB shards"""

-        async def _delete_vectorset(node: AbstractIndexNode, shard_id: str):
-            result = await node.remove_vectorset(shard_id, vectorset_id)
+        async def _delete_vectorset(shard_id: str):
+            req = noderesources_pb2.VectorSetID()
+            req.shard.id = shard_id
+            req.vectorset = vectorset_id
+
+            result = await get_nidx_api_client().RemoveVectorSet(req)
             if result.status != result.Status.OK:
                 raise NodeError(
                     f"Unable to delete vectorset {vectorset_id} in kb {kbid} shard {shard_id}"
@@ -341,10 +353,3 @@ class StandaloneKBShardManager(KBShardManager):
                 await storage.delete_upload(storage_key, storage.indexing_bucket)
             except Exception:
                 pass
-
-
-def choose_node(
-    shard: writer_pb2.ShardObject,
-) -> tuple[AbstractIndexNode, str]:
-    fake_node = get_nidx_fake_node()
-    return fake_node, shard.nidx_shard_id
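Net effect for manager.py: per-shard callbacks lose the index-node argument and vectorset operations hit the nidx API client directly. A minimal sketch of the new callback shape (hypothetical count_paragraphs helper; assumes an initialized nidx utility):

    from nidx_protos import nodereader_pb2, noderesources_pb2

    from nucliadb.common.cluster.utils import get_shard_manager
    from nucliadb.common.nidx import get_nidx_api_client


    async def count_paragraphs(shard_id: str) -> int:
        # Callbacks now receive only the nidx shard id, not (node, shard_id)
        shard = await get_nidx_api_client().GetShard(
            nodereader_pb2.GetShardRequest(shard_id=noderesources_pb2.ShardId(id=shard_id))
        )
        return shard.paragraphs


    async def total_paragraphs(kbid: str) -> int:
        sm = get_shard_manager()
        counts = await sm.apply_for_all_shards(kbid, count_paragraphs, timeout=10.0)
        return sum(counts)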
nucliadb/common/cluster/rebalance.py
CHANGED
@@ -23,9 +23,9 @@ import logging
 from nidx_protos import nodereader_pb2, noderesources_pb2

 from nucliadb.common import datamanagers, locking
-from nucliadb.common.cluster.manager import choose_node
 from nucliadb.common.cluster.utils import get_shard_manager
 from nucliadb.common.context import ApplicationContext
+from nucliadb.common.nidx import get_nidx_api_client, get_nidx_searcher_client
 from nucliadb_telemetry import errors
 from nucliadb_telemetry.logs import setup_logging
 from nucliadb_telemetry.utils import setup_telemetry
@@ -51,9 +51,10 @@ async def get_shards_paragraphs(kbid: str) -> list[tuple[str, int]]:
     results = {}
     for shard_meta in kb_shards.shards:
         # Rebalance using node as source of truth. But it will rebalance nidx
-        node, shard_id = choose_node(shard_meta)
-        shard_data: nodereader_pb2.Shard = await node.reader.GetShard(
-            nodereader_pb2.GetShardRequest(shard_id=noderesources_pb2.ShardId(id=shard_id))  # type: ignore
+        shard_data: nodereader_pb2.Shard = await get_nidx_api_client().GetShard(
+            nodereader_pb2.GetShardRequest(
+                shard_id=noderesources_pb2.ShardId(id=shard_meta.nidx_shard_id)
+            )  # type: ignore
         )
         results[shard_meta.shard] = shard_data.paragraphs

@@ -101,16 +102,15 @@ async def move_set_of_kb_resources(
     from_shard = [s for s in kb_shards.shards if s.shard == from_shard_id][0]
     to_shard = [s for s in kb_shards.shards if s.shard == to_shard_id][0]

-    from_node, from_shard_replica_id = choose_node(from_shard)
     request = nodereader_pb2.SearchRequest(
-        shard=from_shard_replica_id,
+        shard=from_shard.nidx_shard_id,
         paragraph=False,
         document=True,
         result_per_page=count,
     )
     request.field_filter.field.field_type = "a"
     request.field_filter.field.field_id = "title"
-    search_response: nodereader_pb2.SearchResponse = await from_node.reader.Search(request)
+    search_response: nodereader_pb2.SearchResponse = await get_nidx_searcher_client().Search(request)

     for result in search_response.document.results:
         resource_id = result.uuid
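The read path splits the same way: shard metadata goes through the nidx API client, queries through the searcher client. A sketch of the new single-shard search (assumes nidx is initialized; nidx_shard_id comes from the KB's shard objects):

    from nidx_protos import nodereader_pb2

    from nucliadb.common.nidx import get_nidx_searcher_client


    async def first_resource_ids(nidx_shard_id: str, count: int) -> list[str]:
        # Document-only search against one nidx shard, as move_set_of_kb_resources does
        request = nodereader_pb2.SearchRequest(
            shard=nidx_shard_id,
            paragraph=False,
            document=True,
            result_per_page=count,
        )
        response = await get_nidx_searcher_client().Search(request)
        return [result.uuid for result in response.document.results]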
nucliadb/common/cluster/rollover.py
CHANGED
@@ -23,6 +23,10 @@ import logging
 from datetime import datetime
 from typing import Optional

+from nidx_protos.nodewriter_pb2 import (
+    NewShardRequest,
+)
+
 from nucliadb.common import datamanagers, locking
 from nucliadb.common.context import ApplicationContext
 from nucliadb.common.datamanagers.rollover import RolloverState, RolloverStateNotFoundError
@@ -30,10 +34,10 @@ from nucliadb.common.external_index_providers.base import ExternalIndexManager
 from nucliadb.common.external_index_providers.manager import (
     get_external_index_manager,
 )
-from nucliadb.common.nidx import get_nidx_fake_node
+from nucliadb.common.nidx import get_nidx_api_client
 from nucliadb.common.vector_index_config import nucliadb_index_config_to_nidx
 from nucliadb.migrator.settings import settings
-from nucliadb_protos import writer_pb2
+from nucliadb_protos import utils_pb2, writer_pb2
 from nucliadb_telemetry import errors

 from .utils import (
@@ -109,7 +113,6 @@ async def create_rollover_shards(

     logger.info("Creating rollover shards", extra={"kbid": kbid})
     sm = app_context.shard_manager
-    nidx_node = get_nidx_fake_node()

     async with datamanagers.with_ro_transaction() as txn:
         try:
@@ -143,10 +146,14 @@
                 async for vectorset_id, vectorset_config in datamanagers.vectorsets.iter(txn, kbid=kbid)
             }

-            nidx_shard = await nidx_node.new_shard(
-                kbid,
+            req = NewShardRequest(
+                kbid=kbid,
+                release_channel=utils_pb2.ReleaseChannel.STABLE,
                 vectorsets_configs=vectorsets,
             )
+
+            nidx_shard = await get_nidx_api_client().NewShard(req)
+
             shard.nidx_shard_id = nidx_shard.id
             created_shards.append(shard)

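Rollover shard creation now builds the NewShardRequest explicitly instead of going through the fake node. A sketch (assumes vectorsets maps vectorset ids to their index configs, as built in create_rollover_shards):

    from nidx_protos.nodewriter_pb2 import NewShardRequest

    from nucliadb.common.nidx import get_nidx_api_client
    from nucliadb_protos import utils_pb2


    async def new_nidx_shard(kbid: str, vectorsets: dict) -> str:
        req = NewShardRequest(
            kbid=kbid,
            release_channel=utils_pb2.ReleaseChannel.STABLE,
            vectorsets_configs=vectorsets,
        )
        # The returned shard's id becomes ShardObject.nidx_shard_id
        shard = await get_nidx_api_client().NewShard(req)
        return shard.id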
nucliadb/common/nidx.py
CHANGED
@@ -26,7 +26,6 @@ from nidx_protos.nodewriter_pb2 import (
     IndexMessage,
 )

-from nucliadb.common.cluster.base import AbstractIndexNode
 from nucliadb.common.cluster.settings import settings
 from nucliadb.ingest.settings import DriverConfig
 from nucliadb.ingest.settings import settings as ingest_settings
@@ -244,46 +243,3 @@ def get_nidx_searcher_client() -> "NidxSearcherStub":
         return nidx.searcher_client
     else:
         raise Exception("nidx not initialized")
-
-
-# TODO: Remove the index node abstraction
-class NodeNidxAdapter:
-    def __init__(self, api_client, searcher_client):
-        # API methods
-        self.GetShard = api_client.GetShard
-        self.NewShard = api_client.NewShard
-        self.DeleteShard = api_client.DeleteShard
-        self.ListShards = api_client.ListShards
-        self.AddVectorSet = api_client.AddVectorSet
-        self.RemoveVectorSet = api_client.RemoveVectorSet
-        self.ListVectorSets = api_client.ListVectorSets
-        self.GetMetadata = api_client.GetMetadata
-
-        # Searcher methods
-        self.Search = searcher_client.Search
-        self.Suggest = searcher_client.Suggest
-        self.GraphSearch = searcher_client.GraphSearch
-        self.Paragraphs = searcher_client.Paragraphs
-        self.Documents = searcher_client.Documents
-
-
-class FakeNode(AbstractIndexNode):
-    def __init__(self, api_client, searcher_client):
-        self.client = NodeNidxAdapter(api_client, searcher_client)
-
-    @property
-    def reader(self):
-        return self.client
-
-    @property
-    def writer(self):
-        return self.client
-
-    @property
-    def id(self):
-        return "nidx"
-
-
-def get_nidx_fake_node() -> FakeNode:
-    nidx = get_nidx()
-    return FakeNode(nidx.api_client, nidx.searcher_client)
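With NodeNidxAdapter, FakeNode, and get_nidx_fake_node gone, callers use the two clients directly. A migration sketch (the old names come from the removed adapter above):

    from nucliadb.common.nidx import get_nidx_api_client, get_nidx_searcher_client

    # Before (removed):
    #     node = get_nidx_fake_node()
    #     shard = await node.reader.GetShard(request)  # reader and writer were the same adapter
    #     hits = await node.reader.Search(search_request)
    #
    # After:
    #     shard = await get_nidx_api_client().GetShard(request)
    #     hits = await get_nidx_searcher_client().Search(search_request)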
nucliadb/ingest/consumer/auditing.py
CHANGED
@@ -27,9 +27,9 @@ from nidx_protos import nodereader_pb2, noderesources_pb2

 from nucliadb.common import datamanagers
 from nucliadb.common.cluster.exceptions import ShardsNotFound
-from nucliadb.common.cluster.manager import choose_node
 from nucliadb.common.cluster.utils import get_shard_manager
 from nucliadb.common.constants import AVG_PARAGRAPH_SIZE_BYTES
+from nucliadb.common.nidx import get_nidx_api_client
 from nucliadb_protos import audit_pb2, writer_pb2
 from nucliadb_utils import const
 from nucliadb_utils.audit.audit import AuditStorage
@@ -114,10 +114,10 @@ class IndexAuditHandler:
         total_paragraphs = 0

         for shard_obj in shard_groups:
-            node, shard_id = choose_node(shard_obj)
-
-            shard: nodereader_pb2.Shard = await node.reader.GetShard(
-                nodereader_pb2.GetShardRequest(shard_id=noderesources_pb2.ShardId(id=shard_id))  # type: ignore
+            shard: nodereader_pb2.Shard = await get_nidx_api_client().GetShard(
+                nodereader_pb2.GetShardRequest(
+                    shard_id=noderesources_pb2.ShardId(id=shard_obj.nidx_shard_id)
+                )
             )

             total_fields += shard.fields
nucliadb/ingest/consumer/shard_creator.py
CHANGED
@@ -25,9 +25,9 @@ from functools import partial
 from nidx_protos import nodereader_pb2, noderesources_pb2

 from nucliadb.common import locking
-from nucliadb.common.cluster.manager import choose_node
 from nucliadb.common.cluster.utils import get_shard_manager
 from nucliadb.common.maindb.driver import Driver
+from nucliadb.common.nidx import get_nidx_api_client
 from nucliadb_protos import writer_pb2
 from nucliadb_utils import const
 from nucliadb_utils.cache.pubsub import PubSubDriver
@@ -105,8 +105,9 @@ class ShardCreatorHandler:
         async with locking.distributed_lock(locking.NEW_SHARD_LOCK.format(kbid=kbid)):
             # remember, a lock will do at least 1+ reads and 1 write.
             # with heavy writes, this adds some simple k/v pressure
-            node, shard_id = choose_node(current_shard)
-            shard: nodereader_pb2.Shard = await node.reader.GetShard(
-                nodereader_pb2.GetShardRequest(shard_id=noderesources_pb2.ShardId(id=shard_id))  # type: ignore
+            shard: nodereader_pb2.Shard = await get_nidx_api_client().GetShard(
+                nodereader_pb2.GetShardRequest(
+                    shard_id=noderesources_pb2.ShardId(id=current_shard.nidx_shard_id)
+                )  # type: ignore
             )
             await self.shard_manager.maybe_create_new_shard(kbid, shard.paragraphs)
nucliadb/ingest/orm/entities.py
CHANGED
@@ -30,7 +30,6 @@ from nidx_protos.nodereader_pb2 import (
 )

 from nucliadb.common import datamanagers
-from nucliadb.common.cluster.base import AbstractIndexNode
 from nucliadb.common.cluster.exceptions import (
     AlreadyExists,
     EntitiesGroupNotFound,
@@ -203,7 +202,7 @@ class EntitiesManager:
     async def get_indexed_entities_group(self, group: str) -> Optional[EntitiesGroup]:
         shard_manager = get_shard_manager()

-        async def do_entities_search(node: AbstractIndexNode, shard_id: str) -> GraphSearchResponse:
+        async def do_entities_search(shard_id: str) -> GraphSearchResponse:
             request = GraphSearchRequest()
             # XXX: this is a wild guess. Are those enough or too many?
             request.top_k = 500
@@ -211,7 +210,7 @@ class EntitiesManager:
             request.query.path.path.source.node_type = RelationNode.NodeType.ENTITY
             request.query.path.path.source.node_subtype = group
             request.query.path.path.undirected = True
-            response = await graph_search_shard(node, shard_id, request)
+            response = await graph_search_shard(shard_id, request)
             return response

         results = await shard_manager.apply_for_all_shards(
@@ -293,7 +292,7 @@ class EntitiesManager:
     ) -> set[str]:
         shard_manager = get_shard_manager()

-        async def query_indexed_entities_group_names(node: AbstractIndexNode, shard_id: str) -> set[str]:
+        async def query_indexed_entities_group_names(shard_id: str) -> set[str]:
             """Search all relation types"""
             request = SearchRequest(
                 shard=shard_id,
@@ -303,7 +302,7 @@ class EntitiesManager:
                 paragraph=False,
                 faceted=Faceted(labels=["/e"]),
             )
-            response: SearchResponse = await query_shard(node, shard_id, request)
+            response: SearchResponse = await query_shard(shard_id, request)
             try:
                 facetresults = response.document.facets["/e"].facetresults
             except KeyError:
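EntitiesManager keeps its per-shard callbacks, but they now take only the shard id, matching the new apply_for_all_shards signature. For reference, a sketch of the graph query those callbacks build (import paths for RelationNode assumed from nucliadb_protos.utils_pb2):

    from nidx_protos.nodereader_pb2 import GraphSearchRequest
    from nucliadb_protos.utils_pb2 import RelationNode


    def build_entities_query(group: str) -> GraphSearchRequest:
        # Mirrors do_entities_search: find entity nodes of a given subtype
        request = GraphSearchRequest()
        request.top_k = 500
        request.query.path.path.source.node_type = RelationNode.NodeType.ENTITY
        request.query.path.path.source.node_subtype = group
        request.query.path.path.undirected = True
        return request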
nucliadb/metrics_exporter.py
CHANGED
@@ -22,40 +22,22 @@ from __future__ import annotations
 import asyncio
 from typing import AsyncGenerator, Callable, Tuple, cast

-from nidx_protos.noderesources_pb2 import EmptyQuery, NodeMetadata
-
 from nucliadb import logger
 from nucliadb.common import datamanagers
 from nucliadb.common.context import ApplicationContext
 from nucliadb.common.maindb.pg import PGDriver
 from nucliadb.common.maindb.utils import get_driver
-from nucliadb.common.nidx import get_nidx_api_client
 from nucliadb.migrator.datamanager import MigrationsDataManager
 from nucliadb_telemetry import metrics
 from nucliadb_telemetry.logs import setup_logging
 from nucliadb_telemetry.utils import setup_telemetry
 from nucliadb_utils.fastapi.run import serve_metrics

-SHARD_COUNT = metrics.Gauge("nucliadb_node_shard_count", labels={"node": ""})
-
 MIGRATION_COUNT = metrics.Gauge("nucliadb_migration", labels={"type": "", "version": ""})

 PENDING_RESOURCE_COUNT = metrics.Gauge("nucliadb_pending_resources_count")


-async def update_node_metrics(context: ApplicationContext):
-    """
-    Report the number of shards in each node.
-    """
-    # Clear previoulsy set values so that we report only the current state
-    SHARD_COUNT.gauge.clear()
-
-    nidx_api = get_nidx_api_client()
-    metadata: NodeMetadata = await nidx_api.GetMetadata(EmptyQuery())
-
-    SHARD_COUNT.set(metadata.shard_count, labels={"node": "nidx"})
-
-
 async def iter_kbids(context: ApplicationContext) -> AsyncGenerator[str, None]:
     """
     Return a list of all KB ids.
@@ -127,7 +109,6 @@ async def run_exporter(context: ApplicationContext):
     # Schedule exporter tasks
     tasks = []
     for export_task, interval in [
-        (update_node_metrics, 10),
         (update_migration_metrics, 60 * 3),
         (update_resource_metrics, 60 * 5),
     ]:
nucliadb/purge/orphan_shards.py
CHANGED
@@ -23,14 +23,17 @@ import importlib.metadata
 from typing import Optional

 from grpc.aio import AioRpcError
+from nidx_protos import nodereader_pb2, noderesources_pb2

 from nucliadb.common import datamanagers
-from nucliadb.common.cluster import manager
-from nucliadb.common.cluster.base import AbstractIndexNode
 from nucliadb.common.cluster.utils import setup_cluster, teardown_cluster
 from nucliadb.common.maindb.driver import Driver
 from nucliadb.common.maindb.utils import setup_driver, teardown_driver
-from nucliadb.common.nidx import start_nidx_utility, stop_nidx_utility
+from nucliadb.common.nidx import (
+    get_nidx_api_client,
+    start_nidx_utility,
+    stop_nidx_utility,
+)
 from nucliadb.ingest import logger
 from nucliadb_telemetry import errors
 from nucliadb_telemetry.logs import setup_logging
@@ -69,9 +72,8 @@ async def detect_orphan_shards(driver: Driver) -> dict[str, ShardKb]:

     orphan_shard_ids = indexed_shards.keys() - stored_shards.keys()
     orphan_shards: dict[str, ShardKb] = {}
-    node = manager.get_nidx_fake_node()
     for shard_id in orphan_shard_ids:
-        kbid = await _get_kbid(node, shard_id) or UNKNOWN_KB
+        kbid = await _get_kbid(shard_id) or UNKNOWN_KB
         # Shards with knwon KB ids can be checked and ignore those comming from
         # an ongoing migration/rollover (ongoing or finished)
         if kbid != UNKNOWN_KB:
@@ -84,15 +86,15 @@ async def detect_orphan_shards(driver: Driver) -> dict[str, ShardKb]:
             orphan_shards[shard_id] = kbid

     for shard_id in orphan_shard_ids:
-        kbid = await _get_kbid(node, shard_id) or UNKNOWN_KB
+        kbid = await _get_kbid(shard_id) or UNKNOWN_KB
         orphan_shards[shard_id] = kbid
     return orphan_shards


 async def _get_indexed_shards() -> dict[str, ShardKb]:
-    node = manager.get_nidx_fake_node()
-    shards = await node.writer.ListShards(noderesources_pb2.EmptyQuery())
-    return {shard.id: UNKNOWN_KB for shard in shards.ids}
+    shards = await get_nidx_api_client().ListShards(noderesources_pb2.EmptyQuery())
+
+    return {shard.id: UNKNOWN_KB for shard in shards.ids}


 async def _get_stored_shards(driver: Driver) -> dict[str, ShardKb]:
@@ -111,16 +113,17 @@ async def _get_stored_shards(driver: Driver) -> dict[str, ShardKb]:
     return stored_shards


-async def _get_kbid(node: AbstractIndexNode, shard_id: str) -> Optional[str]:
+async def _get_kbid(shard_id: str) -> Optional[str]:
     kbid = None
     try:
-        shard_pb = await node.reader.GetShard(nodereader_pb2.GetShardRequest(shard_id=noderesources_pb2.ShardId(id=shard_id)))
+        req = nodereader_pb2.GetShardRequest()
+        req.shard_id.id = shard_id
+        shard_pb = await get_nidx_api_client().GetShard(req)
     except AioRpcError as grpc_error:
         logger.error(
             "Can't get shard while looking for orphans in nidx, is there something broken?",
             exc_info=grpc_error,
             extra={
-                "node_id": node.id,
                 "shard_id": shard_id,
             },
         )
@@ -156,7 +159,6 @@ async def purge_orphan_shards(driver: Driver):
     orphan_shards = await detect_orphan_shards(driver)
     logger.info(f"Found {len(orphan_shards)} orphan shards. Purge starts...")

-    node = manager.get_nidx_fake_node()
     for shard_id, kbid in orphan_shards.items():
         logger.info(
             "Deleting orphan shard from index node",
@@ -165,7 +167,8 @@ async def purge_orphan_shards(driver: Driver):
                 "kbid": kbid,
             },
         )
-        await node.writer.DeleteShard(noderesources_pb2.ShardId(id=shard_id))
+        req = noderesources_pb2.ShardId(id=shard_id)
+        await get_nidx_api_client().DeleteShard(req)


 def parse_arguments():
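Orphan purging now lists and deletes shards straight on the nidx API. A condensed sketch of the detect-and-delete loop (assumes stored_shard_ids was collected from maindb, as _get_stored_shards does):

    from nidx_protos import noderesources_pb2

    from nucliadb.common.nidx import get_nidx_api_client


    async def purge_orphans(stored_shard_ids: set[str]) -> None:
        api = get_nidx_api_client()
        indexed = await api.ListShards(noderesources_pb2.EmptyQuery())
        for shard in indexed.ids:
            if shard.id not in stored_shard_ids:
                # Present in nidx but unknown to maindb: an orphan
                await api.DeleteShard(noderesources_pb2.ShardId(id=shard.id))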
nucliadb/search/api/v1/knowledgebox.py
CHANGED
@@ -28,7 +28,6 @@ from nidx_protos.noderesources_pb2 import Shard

 from nucliadb.common import datamanagers
 from nucliadb.common.cluster.exceptions import ShardsNotFound
-from nucliadb.common.cluster.manager import choose_node
 from nucliadb.common.cluster.utils import get_shard_manager
 from nucliadb.common.constants import AVG_PARAGRAPH_SIZE_BYTES
 from nucliadb.common.counters import IndexCounts
@@ -164,19 +163,12 @@ async def get_node_index_counts(kbid: str) -> tuple[IndexCounts, list[str]]:
     ops = []
     queried_shards = []
     for shard_object in shard_groups:
-        try:
-            node, shard_id = choose_node(shard_object)
-        except KeyError:
-            logger.warning(
-                "Error getting node for shard",
-                extra={"kbid": kbid},
-            )
-        else:
-            if shard_id is not None:
-                # At least one node is alive for this shard group
-                # let's add it ot the query list if has a valid value
-                ops.append(get_shard(node, shard_id))
-                queried_shards.append(shard_id)
+        shard_id = shard_object.nidx_shard_id
+        if shard_id is not None:
+            # At least one node is alive for this shard group
+            # let's add it ot the query list if has a valid value
+            ops.append(get_shard(shard_id))
+            queried_shards.append(shard_id)

     if not ops:
         logger.info(f"No node found for any of this resources shards {kbid}")
nucliadb/search/api/v1/resource/search.py
CHANGED
@@ -27,7 +27,7 @@ from pydantic import ValidationError
 from nucliadb.models.responses import HTTPClientError
 from nucliadb.search.api.v1.router import KB_PREFIX, RESOURCE_PREFIX, api
 from nucliadb.search.api.v1.utils import fastapi_query
-from nucliadb.search.requesters.utils import Method, debug_nodes_info, node_query
+from nucliadb.search.requesters.utils import Method, node_query
 from nucliadb.search.search import cache
 from nucliadb.search.search.exceptions import InvalidQueryError
 from nucliadb.search.search.merge import merge_paragraphs_results
@@ -110,7 +110,7 @@ async def resource_search(
         detail = json.loads(exc.json())
         return HTTPClientError(status_code=422, detail=detail)

-    results, incomplete_results, queried_nodes = await node_query(kbid, Method.SEARCH, pb_query)
+    results, incomplete_results, queried_shards = await node_query(kbid, Method.SEARCH, pb_query)

     # We need to merge
     search_results = await merge_paragraphs_results(
@@ -122,9 +122,6 @@ async def resource_search(
     )

     response.status_code = 206 if incomplete_results else 200
-    if debug:
-        search_results.nodes = debug_nodes_info(queried_nodes)

-    queried_shards = [shard_id for _, shard_id in queried_nodes]
     search_results.shards = queried_shards
     return search_results
nucliadb/search/api/v1/search.py
CHANGED
@@ -32,7 +32,7 @@ from nucliadb.models.responses import HTTPClientError
 from nucliadb.search import predict
 from nucliadb.search.api.v1.router import KB_PREFIX, api
 from nucliadb.search.api.v1.utils import fastapi_query
-from nucliadb.search.requesters.utils import Method, debug_nodes_info, node_query
+from nucliadb.search.requesters.utils import Method, node_query
 from nucliadb.search.search import cache
 from nucliadb.search.search.exceptions import InvalidQueryError
 from nucliadb.search.search.merge import merge_results
@@ -265,7 +265,7 @@ async def search(
     pb_query, incomplete_results, autofilters, _ = await legacy_convert_retrieval_to_proto(parsed)

     # We need to query all nodes
-    results, query_incomplete_results, queried_nodes = await node_query(kbid, Method.SEARCH, pb_query)
+    results, query_incomplete_results, queried_shards = await node_query(kbid, Method.SEARCH, pb_query)
     incomplete_results = incomplete_results or query_incomplete_results

     # We need to merge
@@ -290,10 +290,6 @@ async def search(
         len(search_results.resources),
     )

-    if item.debug:
-        search_results.nodes = debug_nodes_info(queried_nodes)
-
-    queried_shards = [shard_id for _, shard_id in queried_nodes]
     search_results.shards = queried_shards
     search_results.autofilters = autofilters
     return search_results, incomplete_results
nucliadb/search/api/v1/suggest.py
CHANGED
@@ -160,7 +160,7 @@ async def suggest(
         range_modification_end,
         hidden,
     )
-    results, incomplete_results, queried_nodes = await node_query(kbid, Method.SUGGEST, pb_query)
+    results, incomplete_results, queried_shards = await node_query(kbid, Method.SUGGEST, pb_query)

     # We need to merge
     search_results = await merge_suggest_results(
@@ -171,7 +171,6 @@ async def suggest(

     response.status_code = 206 if incomplete_results else 200

-    queried_shards = [shard_id for _, shard_id in queried_nodes]
     if debug and queried_shards:
         search_results.shards = queried_shards

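Across the search endpoints the change is uniform: node_query now returns the queried nidx shard ids directly, so the (node, shard_id) unzip and the debug_nodes_info step disappear. A sketch of a call site after the change (hypothetical run_search wrapper):

    from nucliadb.search.requesters.utils import Method, node_query


    async def run_search(kbid: str, pb_query):
        # The third return value is now the list of queried nidx shard ids
        results, incomplete, queried_shards = await node_query(kbid, Method.SEARCH, pb_query)
        return results, incomplete, queried_shards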