nucliadb 6.4.0.post4200__py3-none-any.whl → 6.4.0.post4210__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their respective public registries, and is provided for informational purposes only.
- nucliadb/common/back_pressure/__init__.py +20 -0
- nucliadb/common/back_pressure/cache.py +86 -0
- nucliadb/common/back_pressure/materializer.py +315 -0
- nucliadb/common/back_pressure/settings.py +72 -0
- nucliadb/common/back_pressure/utils.py +59 -0
- nucliadb/common/external_index_providers/base.py +7 -4
- nucliadb/search/search/chat/ask.py +43 -32
- nucliadb/search/search/find_merge.py +8 -2
- nucliadb/search/search/rank_fusion.py +136 -58
- nucliadb/writer/api/v1/export_import.py +2 -2
- nucliadb/writer/api/v1/field.py +3 -3
- nucliadb/writer/api/v1/resource.py +5 -5
- nucliadb/writer/api/v1/upload.py +3 -3
- nucliadb/writer/lifecycle.py +2 -2
- nucliadb/writer/settings.py +0 -51
- {nucliadb-6.4.0.post4200.dist-info → nucliadb-6.4.0.post4210.dist-info}/METADATA +6 -6
- {nucliadb-6.4.0.post4200.dist-info → nucliadb-6.4.0.post4210.dist-info}/RECORD +20 -16
- nucliadb/writer/back_pressure.py +0 -485
- {nucliadb-6.4.0.post4200.dist-info → nucliadb-6.4.0.post4210.dist-info}/WHEEL +0 -0
- {nucliadb-6.4.0.post4200.dist-info → nucliadb-6.4.0.post4210.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.4.0.post4200.dist-info → nucliadb-6.4.0.post4210.dist-info}/top_level.txt +0 -0
nucliadb/search/search/find_merge.py CHANGED

@@ -42,7 +42,7 @@ from nucliadb.search.search.hydrator import (
 )
 from nucliadb.search.search.merge import merge_relations_results
 from nucliadb.search.search.query_parser.models import UnitRetrieval
-from nucliadb.search.search.rank_fusion import RankFusionAlgorithm
+from nucliadb.search.search.rank_fusion import IndexSource, RankFusionAlgorithm
 from nucliadb.search.search.rerankers import (
     RerankableItem,
     Reranker,
@@ -108,7 +108,13 @@ async def build_find_response(
     )
     graph_results = graph_results_to_text_block_matches(search_response.graph)

-    merged_text_blocks = rank_fusion_algorithm.fuse(
+    merged_text_blocks = rank_fusion_algorithm.fuse(
+        {
+            IndexSource.KEYWORD: keyword_results,
+            IndexSource.SEMANTIC: semantic_results,
+            IndexSource.GRAPH: graph_results,
+        }
+    )

     # cut
     # we assume pagination + predict reranker is forbidden and has been already
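The call-site change above switches `fuse` from positional lists to a single mapping keyed by `IndexSource`. A minimal sketch of the new call shape, not taken from the source: the empty lists are placeholders for the hydrated `TextBlockMatch` lists that `build_find_response` produces, so the call simply returns an empty list.

```python
# Sketch of the new fuse() call shape introduced in this release.
from nucliadb.search.search.rank_fusion import IndexSource, ReciprocalRankFusion

rank_fusion_algorithm = ReciprocalRankFusion(window=20)  # window value is illustrative

merged_text_blocks = rank_fusion_algorithm.fuse(
    {
        IndexSource.KEYWORD: [],   # placeholder for keyword_results
        IndexSource.SEMANTIC: [],  # placeholder for semantic_results
        IndexSource.GRAPH: [],     # placeholder for graph_results
    }
)
assert merged_text_blocks == []  # nothing to fuse in this toy example
```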
nucliadb/search/search/rank_fusion.py CHANGED

@@ -19,8 +19,10 @@
 #
 import logging
 from abc import ABC, abstractmethod
+from enum import Enum, auto
+from typing import Optional, TypeVar

-from nucliadb.common.external_index_providers.base import
+from nucliadb.common.external_index_providers.base import ScoredTextBlock
 from nucliadb.common.ids import ParagraphId
 from nucliadb.search.search.query_parser import models as parser_models
 from nucliadb_models.search import SCORE_TYPE
@@ -45,6 +47,14 @@ rank_fusion_observer = Observer(
     ],
 )

+ScoredItem = TypeVar("ScoredItem", bound=ScoredTextBlock)
+
+
+class IndexSource(str, Enum):
+    KEYWORD = auto()
+    SEMANTIC = auto()
+    GRAPH = auto()
+

 class RankFusionAlgorithm(ABC):
     def __init__(self, window: int):
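`ScoredItem` is a `TypeVar` bound to `ScoredTextBlock`, so code written against it preserves the concrete subtype of the lists it receives, and `IndexSource` subclasses `str`, so its members satisfy the `dict[str, ...]` signatures used below. A hypothetical helper (the `top_k` name and function are not part of this release) illustrating the pattern:

```python
# Hypothetical helper, only to illustrate the ScoredItem TypeVar pattern:
# callers get back the same concrete ScoredTextBlock subtype they passed in.
from typing import TypeVar

from nucliadb.common.external_index_providers.base import ScoredTextBlock

ScoredItem = TypeVar("ScoredItem", bound=ScoredTextBlock)


def top_k(items: list[ScoredItem], k: int) -> list[ScoredItem]:
    # Sort by the score attribute shared by all scored text blocks.
    return sorted(items, key=lambda item: item.score, reverse=True)[:k]
```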
@@ -60,46 +70,44 @@ class RankFusionAlgorithm(ABC):
         """
         return self._window

-    def fuse(
-
-        keyword: list[TextBlockMatch],
-        semantic: list[TextBlockMatch],
-        graph: list[TextBlockMatch],
-    ) -> list[TextBlockMatch]:
-        """Fuse keyword and semantic results and return a list with the merged
+    def fuse(self, sources: dict[str, list[ScoredItem]]) -> list[ScoredItem]:
+        """Fuse elements from multiple sources and return a list of merged
         results.

-        If only one
+        If only one source is provided, rank fusion will be skipped.

         """
-
-
-
-
-
-        retrievals_with_results = [x for x in (keyword, semantic, graph) if len(x) > 0]
-        if len(retrievals_with_results) == 1:
-            return retrievals_with_results[0]
+        sources_with_results = [x for x in sources.values() if len(x) > 0]
+        if len(sources_with_results) == 1:
+            # skip rank fusion, we only have a source
+            merged = sources_with_results[0]
         else:
-            merged = self._fuse(
+            merged = self._fuse(sources)
+
+        # sort and return the unordered results from the implementation
+        merged.sort(key=lambda r: r.score, reverse=True)
         return merged

     @abstractmethod
-    def _fuse(
-
-
-
-
-
-
-        by
+    def _fuse(self, sources: dict[str, list[ScoredItem]]) -> list[ScoredItem]:
+        """Rank fusion implementation.
+
+        Each concrete subclass must provide an implementation that merges
+        `sources`, a group of unordered matches, into a list of unordered
+        results with the new rank fusion score.
+
+        Results can be deduplicated or changed by the rank fusion algorithm.
+
+        """
         ...


 class ReciprocalRankFusion(RankFusionAlgorithm):
     """Rank-based rank fusion algorithm. Discounts the weight of documents
-    occurring deep in retrieved lists using a reciprocal distribution.
-
+    occurring deep in retrieved lists using a reciprocal distribution.
+
+    This implementation can be further parametrized with a weight (boost) per
+    retriever that will be applied to all documents ranked by it.

     RRF = Σ(r ∈ R) (1 / (k + r(d)) · w(r))

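To make the RRF formula above concrete, a small self-contained computation with k = 60 and zero-based ranks (matching the `enumerate` ranks used in `_fuse` below); the ranks and weights are illustrative, not taken from the source:

```python
# Worked example of RRF = Σ(r ∈ R) 1 / (k + r(d)) · w(r), with k = 60.
k = 60.0

# A document ranked 1st (rank 0) by one retriever and 3rd (rank 2) by another,
# both with an illustrative weight of 1.0.
contributions = [
    (0, 1.0),  # (rank in first ranking, weight of that retriever)
    (2, 1.0),  # (rank in second ranking, weight of that retriever)
]

rrf_score = sum(1 / (k + rank) * weight for rank, weight in contributions)
print(round(rrf_score, 6))  # 1/60 + 1/62 ≈ 0.032796
```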
@@ -119,9 +127,8 @@ class ReciprocalRankFusion(RankFusionAlgorithm):
         k: float = 60.0,
         *,
         window: int,
-
-
-        graph_weight: float = 1.0,
+        weights: Optional[dict[str, float]] = None,
+        default_weight: float = 1.0,
     ):
         super().__init__(window)
         # Constant used in RRF, studies agree on 60 as a good default value
@@ -129,49 +136,118 @@ class ReciprocalRankFusion(RankFusionAlgorithm):
         # difference among the best results and a smaller score difference among
         # bad results
         self._k = k
-        self.
-        self.
-        self._graph_boost = graph_weight
+        self._weights = weights or {}
+        self._default_weight = default_weight

     @rank_fusion_observer.wrap({"type": "reciprocal_rank_fusion"})
     def _fuse(
         self,
-
-
-
-
+        sources: dict[str, list[ScoredItem]],
+    ) -> list[ScoredItem]:
+        # accumulated scores per paragraph
+        scores: dict[ParagraphId, tuple[float, SCORE_TYPE]] = {}
+        # pointers from paragraph to the original source
+        match_positions: dict[ParagraphId, list[tuple[int, int]]] = {}
+
+        # sort results by it's score before fusing them, as we need the rank
+        sources = {
+            retriever: sorted(values, key=lambda r: r.score, reverse=True)
+            for retriever, values in sources.items()
+        }
+        rankings = [
+            (values, self._weights.get(source, self._default_weight))
+            for source, values in sources.items()
+        ]
+        for i, (ranking, weight) in enumerate(rankings):
+            for rank, item in enumerate(ranking):
+                id = item.paragraph_id
+                score, score_type = scores.setdefault(id, (0, item.score_type))
+                score += 1 / (self._k + rank) * weight
+                if {score_type, item.score_type} == {SCORE_TYPE.BM25, SCORE_TYPE.VECTOR}:
+                    score_type = SCORE_TYPE.BOTH
+                scores[id] = (score, score_type)
+
+                position = (i, rank)
+                match_positions.setdefault(item.paragraph_id, []).append(position)
+
+        merged = []
+        for paragraph_id, positions in match_positions.items():
+            # we are getting only one position, effectively deduplicating
+            # multiple matches for the same text block
+            i, j = match_positions[paragraph_id][0]
+            score, score_type = scores[paragraph_id]
+            item = rankings[i][0][j]
+            item.score = score
+            item.score_type = score_type
+            merged.append(item)
+
+        return merged
+
+
+class WeightedCombSum(RankFusionAlgorithm):
+    """Score-based rank fusion algorithm. Multiply each score by a list-specific
+    weight (boost). Then adds the retrieval score of documents contained in more
+    than one list and sort by score.
+
+    wCombSUM = Σ(r ∈ R) (w(r) · S(r, d))
+
+    where:
+    - d is a document
+    - R is the set of retrievers
+    - w(r) weight (boost) for retriever r
+    - S(r, d) is the score of document d given by retriever r
+
+    wCombSUM boosts matches from multiple retrievers and deduplicate them. As a
+    score ranking algorithm, comparison of different scores may lead to bad
+    results.
+
+    """
+
+    def __init__(
+        self,
+        *,
+        window: int,
+        weights: Optional[dict[str, float]] = None,
+        default_weight: float = 1.0,
+    ):
+        super().__init__(window)
+        self._weights = weights or {}
+        self._default_weight = default_weight
+
+    @rank_fusion_observer.wrap({"type": "weighted_comb_sum"})
+    def _fuse(self, sources: dict[str, list[ScoredItem]]) -> list[ScoredItem]:
+        # accumulated scores per paragraph
         scores: dict[ParagraphId, tuple[float, SCORE_TYPE]] = {}
+        # pointers from paragraph to the original source
         match_positions: dict[ParagraphId, list[tuple[int, int]]] = {}

         rankings = [
-            (
-
-            (graph, self._graph_boost),
+            (values, self._weights.get(source, self._default_weight))
+            for source, values in sources.items()
         ]
-        for
-            for
-                id =
-                score, score_type = scores.setdefault(id, (0,
-                score +=
-                if {score_type,
+        for i, (ranking, weight) in enumerate(rankings):
+            for j, item in enumerate(ranking):
+                id = item.paragraph_id
+                score, score_type = scores.setdefault(id, (0, item.score_type))
+                score += item.score * weight
+                if {score_type, item.score_type} == {SCORE_TYPE.BM25, SCORE_TYPE.VECTOR}:
                     score_type = SCORE_TYPE.BOTH
                 scores[id] = (score, score_type)

-                position = (
-                match_positions.setdefault(
+                position = (i, j)
+                match_positions.setdefault(item.paragraph_id, []).append(position)

         merged = []
         for paragraph_id, positions in match_positions.items():
             # we are getting only one position, effectively deduplicating
             # multiple matches for the same text block
-
+            i, j = match_positions[paragraph_id][0]
             score, score_type = scores[paragraph_id]
-
-
-
-            merged.append(
+            item = rankings[i][0][j]
+            item.score = score
+            item.score_type = score_type
+            merged.append(item)

-        merged.sort(key=lambda x: x.score, reverse=True)
         return merged

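A matching worked example for wCombSUM, which sums raw retriever scores scaled by per-retriever weights (all numbers illustrative). It also shows why the docstring warns about comparing different score kinds: the unnormalized BM25-style value dominates the cosine-style one.

```python
# Worked example of wCombSUM = Σ(r ∈ R) w(r) · S(r, d): a document scored by
# two retrievers, each score scaled by that retriever's weight and summed.
scores_by_retriever = {
    "keyword": 12.5,   # e.g. a BM25-style score (illustrative)
    "semantic": 0.83,  # e.g. a cosine-similarity-style score (illustrative)
}
weights = {"keyword": 1.0, "semantic": 2.0}

wcombsum = sum(weights[r] * s for r, s in scores_by_retriever.items())
print(round(wcombsum, 2))  # 12.5 * 1.0 + 0.83 * 2.0 = 14.16
```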
@@ -184,9 +260,11 @@ def get_rank_fusion(rank_fusion: parser_models.RankFusion) -> RankFusionAlgorithm:
         algorithm = ReciprocalRankFusion(
             k=rank_fusion.k,
             window=window,
-
-
-
+            weights={
+                IndexSource.KEYWORD: rank_fusion.boosting.keyword,
+                IndexSource.SEMANTIC: rank_fusion.boosting.semantic,
+                IndexSource.GRAPH: rank_fusion.boosting.graph,
+            },
         )

     else:
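Outside of `get_rank_fusion`, both algorithms can be constructed directly with the same per-source weights mapping; sources missing from the dict fall back to `default_weight`. A minimal sketch (the weight and window values are illustrative):

```python
# Direct construction with per-source weights, as the new __init__ signatures
# in this diff allow; WeightedCombSum is the new score-based option.
from nucliadb.search.search.rank_fusion import (
    IndexSource,
    ReciprocalRankFusion,
    WeightedCombSum,
)

weights = {
    IndexSource.KEYWORD: 1.0,
    IndexSource.SEMANTIC: 2.0,
    IndexSource.GRAPH: 0.5,
}

rrf = ReciprocalRankFusion(k=60.0, window=20, weights=weights)
wcs = WeightedCombSum(window=20, weights=weights, default_weight=1.0)
```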
nucliadb/writer/api/v1/export_import.py CHANGED

@@ -25,6 +25,7 @@ from fastapi_versioning import version
 from starlette.requests import Request

 from nucliadb.common import datamanagers
+from nucliadb.common.back_pressure import maybe_back_pressure
 from nucliadb.common.cluster.settings import in_standalone_mode
 from nucliadb.common.context import ApplicationContext
 from nucliadb.common.context.fastapi import get_app_context

@@ -45,7 +46,6 @@ from nucliadb.writer import logger
 from nucliadb.writer.api.utils import only_for_onprem
 from nucliadb.writer.api.v1.knowledgebox import create_kb
 from nucliadb.writer.api.v1.router import KB_PREFIX, KBS_PREFIX, api
-from nucliadb.writer.back_pressure import maybe_back_pressure
 from nucliadb_models.export_import import (
     CreateExportResponse,
     CreateImportResponse,

@@ -148,7 +148,7 @@ async def start_kb_import_endpoint(request: Request, kbid: str):
     if not await datamanagers.atomic.kb.exists_kb(kbid=kbid):
         return HTTPClientError(status_code=404, detail="Knowledge Box not found")

-    await maybe_back_pressure(
+    await maybe_back_pressure(kbid)

     stream = stream_compatible_with_kb(kbid, request.stream())
     try:
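As the writer call sites in this diff show, `maybe_back_pressure` now comes from `nucliadb.common.back_pressure` and takes the kbid plus an optional `resource_uuid`. A hypothetical guard (the `check_back_pressure` wrapper is not part of this release) illustrating the two call shapes:

```python
# Hypothetical wrapper, only to show the new call signature used by the writer
# endpoints in this diff; the policy itself lives in the new
# nucliadb/common/back_pressure package listed in the RECORD below.
from typing import Optional

from nucliadb.common.back_pressure import maybe_back_pressure


async def check_back_pressure(kbid: str, rid: Optional[str] = None) -> None:
    if rid is None:
        await maybe_back_pressure(kbid)
    else:
        await maybe_back_pressure(kbid, resource_uuid=rid)
```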
nucliadb/writer/api/v1/field.py CHANGED

@@ -25,6 +25,7 @@ from fastapi_versioning import version
 from starlette.requests import Request

 import nucliadb_models as models
+from nucliadb.common.back_pressure import maybe_back_pressure
 from nucliadb.common.maindb.utils import get_driver
 from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
 from nucliadb.models.internal.processing import PushPayload, Source

@@ -40,7 +41,6 @@ from nucliadb.writer.api.v1.resource import (
     validate_rid_exists_or_raise_error,
 )
 from nucliadb.writer.api.v1.router import KB_PREFIX, RESOURCE_PREFIX, RSLUG_PREFIX, api
-from nucliadb.writer.back_pressure import maybe_back_pressure
 from nucliadb.writer.resource.audit import parse_audit
 from nucliadb.writer.resource.field import (
     ResourceClassifications,

@@ -96,7 +96,7 @@ async def add_field_to_resource(
     **parser_kwargs,
 ):
     await validate_rid_exists_or_raise_error(kbid, rid)
-    await maybe_back_pressure(
+    await maybe_back_pressure(kbid, resource_uuid=rid)

     partitioning = get_partitioning()
     partition = partitioning.generate_partition(kbid, rid)

@@ -542,7 +542,7 @@ async def reprocess_file_field(
     x_nucliadb_user: Annotated[str, X_NUCLIADB_USER] = "",
     x_file_password: Annotated[Optional[str], X_FILE_PASSWORD] = None,
 ) -> ResourceUpdated:
-    await maybe_back_pressure(
+    await maybe_back_pressure(kbid, resource_uuid=rid)

     processing = get_processing()
     partitioning = get_partitioning()
nucliadb/writer/api/v1/resource.py CHANGED

@@ -28,6 +28,7 @@ from fastapi_versioning import version
 from starlette.requests import Request

 from nucliadb.common import datamanagers
+from nucliadb.common.back_pressure import maybe_back_pressure
 from nucliadb.common.context.fastapi import get_app_context
 from nucliadb.common.maindb.driver import Driver
 from nucliadb.common.maindb.exceptions import ConflictError, NotFoundError

@@ -45,7 +46,6 @@ from nucliadb.writer.api.v1.router import (
     api,
 )
 from nucliadb.writer.api.v1.slug import ensure_slug_uniqueness, noop_context_manager
-from nucliadb.writer.back_pressure import maybe_back_pressure
 from nucliadb.writer.resource.audit import parse_audit
 from nucliadb.writer.resource.basic import (
     parse_basic_creation,

@@ -106,7 +106,7 @@ async def create_resource(
             detail="Cannot hide a resource: the KB does not have hidden resources enabled",
         )

-    await maybe_back_pressure(
+    await maybe_back_pressure(kbid)

     partitioning = get_partitioning()

@@ -243,7 +243,7 @@ async def modify_resource_endpoint(
 ):
     await validate_rid_exists_or_raise_error(kbid, rid)

-    await maybe_back_pressure(
+    await maybe_back_pressure(kbid, resource_uuid=rid)

     if item.slug is None:
         return await modify_resource(

@@ -424,7 +424,7 @@ async def _reprocess_resource(
     x_nucliadb_user: str,
 ):
     await validate_rid_exists_or_raise_error(kbid, rid)
-    await maybe_back_pressure(
+    await maybe_back_pressure(kbid, resource_uuid=rid)

     partitioning = get_partitioning()

@@ -571,7 +571,7 @@ async def _reindex_resource(
     reindex_vectors: bool,
 ):
     await validate_rid_exists_or_raise_error(kbid, rid)
-    await maybe_back_pressure(
+    await maybe_back_pressure(kbid, resource_uuid=rid)

     ingest = get_ingest()
     index_req = IndexResource()
nucliadb/writer/api/v1/upload.py CHANGED

@@ -32,6 +32,7 @@ from fastapi_versioning import version
 from starlette.requests import Request as StarletteRequest

 from nucliadb.common import datamanagers
+from nucliadb.common.back_pressure import maybe_back_pressure
 from nucliadb.ingest.orm.utils import set_title
 from nucliadb.models.internal.processing import PushPayload, Source
 from nucliadb.models.responses import HTTPClientError

@@ -43,7 +44,6 @@ from nucliadb.writer.api.v1.resource import (
     validate_rid_exists_or_raise_error,
 )
 from nucliadb.writer.api.v1.slug import ensure_slug_uniqueness, noop_context_manager
-from nucliadb.writer.back_pressure import maybe_back_pressure
 from nucliadb.writer.resource.audit import parse_audit
 from nucliadb.writer.resource.basic import parse_basic_creation, parse_user_classifications
 from nucliadb.writer.resource.field import (

@@ -215,7 +215,7 @@ async def _tus_post(
             detail="Cannot hide a resource: the KB does not have hidden resources enabled",
         )

-    await maybe_back_pressure(
+    await maybe_back_pressure(kbid, resource_uuid=path_rid)

     dm = get_dm()
     storage_manager = get_storage_manager()

@@ -713,7 +713,7 @@ async def _upload(
     if path_rid is not None:
         await validate_rid_exists_or_raise_error(kbid, path_rid)

-    await maybe_back_pressure(
+    await maybe_back_pressure(kbid, resource_uuid=path_rid)

     md5_user = x_md5
     path, rid, valid_field = await validate_field_upload(kbid, path_rid, field, md5_user)
nucliadb/writer/lifecycle.py CHANGED

@@ -21,12 +21,12 @@ from contextlib import asynccontextmanager

 from fastapi import FastAPI

+from nucliadb.common.back_pressure import start_materializer, stop_materializer
+from nucliadb.common.back_pressure.settings import settings as back_pressure_settings
 from nucliadb.common.context.fastapi import inject_app_context
 from nucliadb.ingest.processing import start_processing_engine, stop_processing_engine
 from nucliadb.ingest.utils import start_ingest, stop_ingest
 from nucliadb.writer import SERVICE_NAME
-from nucliadb.writer.back_pressure import start_materializer, stop_materializer
-from nucliadb.writer.settings import back_pressure_settings
 from nucliadb.writer.tus import finalize as storage_finalize
 from nucliadb.writer.tus import initialize as storage_initialize
 from nucliadb_telemetry.utils import clean_telemetry, setup_telemetry
nucliadb/writer/settings.py CHANGED

@@ -19,7 +19,6 @@
 #
 from typing import Optional

-from pydantic import Field
 from pydantic_settings import BaseSettings


@@ -29,54 +28,4 @@ class Settings(BaseSettings):
     dm_redis_port: Optional[int] = None


-class BackPressureSettings(BaseSettings):
-    enabled: bool = Field(
-        default=False,
-        description="Enable or disable back pressure.",
-        alias="back_pressure_enabled",
-    )
-    indexing_rate: float = Field(
-        default=10,
-        description="Estimation of the indexing rate in messages per second. This is used to calculate the try again in time",  # noqa
-    )
-    ingest_rate: float = Field(
-        default=4,
-        description="Estimation of the ingest processed consumer rate in messages per second. This is used to calculate the try again in time",  # noqa
-    )
-    processing_rate: float = Field(
-        default=1,
-        description="Estimation of the processing rate in messages per second. This is used to calculate the try again in time",  # noqa
-    )
-    max_indexing_pending: int = Field(
-        default=1000,
-        description="Max number of messages pending to index in a node queue before rate limiting writes. Set to 0 to disable indexing back pressure checks",  # noqa
-        alias="back_pressure_max_indexing_pending",
-    )
-    max_ingest_pending: int = Field(
-        # Disabled by default
-        default=0,
-        description="Max number of messages pending to be ingested by processed consumers before rate limiting writes. Set to 0 to disable ingest back pressure checks",  # noqa
-        alias="back_pressure_max_ingest_pending",
-    )
-    max_processing_pending: int = Field(
-        default=1000,
-        description="Max number of messages pending to process per Knowledge Box before rate limiting writes. Set to 0 to disable processing back pressure checks",  # noqa
-        alias="back_pressure_max_processing_pending",
-    )
-    indexing_check_interval: int = Field(
-        default=30,
-        description="Interval in seconds to check the indexing pending messages",
-    )
-    ingest_check_interval: int = Field(
-        default=30,
-        description="Interval in seconds to check the ingest pending messages",
-    )
-    max_wait_time: int = Field(
-        default=60,
-        description="Max time in seconds to wait before trying again after back pressure",
-    )
-
-
 settings = Settings()
-
-back_pressure_settings = BackPressureSettings()
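The `BackPressureSettings` class and the `back_pressure_settings` singleton removed here move to the new `nucliadb/common/back_pressure/settings.py` listed in the RECORD below (it is what `writer/lifecycle.py` now imports). Judging by the aliases on the removed fields, configuration presumably stays environment-variable driven; a hedged sketch, assuming the field names and aliases survive the move unchanged:

```python
# Assumption: the moved settings keep the aliases shown above
# (back_pressure_enabled, back_pressure_max_indexing_pending, ...), so the
# same environment variables keep working. Set them before the first import.
import os

os.environ["BACK_PRESSURE_ENABLED"] = "true"
os.environ["BACK_PRESSURE_MAX_INDEXING_PENDING"] = "1000"

from nucliadb.common.back_pressure.settings import settings as back_pressure_settings

print(back_pressure_settings.enabled)  # assumption: field name unchanged by the move
```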
{nucliadb-6.4.0.post4200.dist-info → nucliadb-6.4.0.post4210.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nucliadb
-Version: 6.4.0.post4200
+Version: 6.4.0.post4210
 Summary: NucliaDB
 Author-email: Nuclia <nucliadb@nuclia.com>
 License: AGPL

@@ -20,11 +20,11 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3 :: Only
 Requires-Python: <4,>=3.9
 Description-Content-Type: text/markdown
-Requires-Dist: nucliadb-telemetry[all]>=6.4.0.
-Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.4.0.
-Requires-Dist: nucliadb-protos>=6.4.0.
-Requires-Dist: nucliadb-models>=6.4.0.
-Requires-Dist: nidx-protos>=6.4.0.
+Requires-Dist: nucliadb-telemetry[all]>=6.4.0.post4210
+Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.4.0.post4210
+Requires-Dist: nucliadb-protos>=6.4.0.post4210
+Requires-Dist: nucliadb-models>=6.4.0.post4210
+Requires-Dist: nidx-protos>=6.4.0.post4210
 Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
 Requires-Dist: nuclia-models>=0.24.2
 Requires-Dist: uvicorn[standard]
{nucliadb-6.4.0.post4200.dist-info → nucliadb-6.4.0.post4210.dist-info}/RECORD CHANGED

@@ -60,6 +60,11 @@ nucliadb/common/ids.py,sha256=4QjoIofes_vtKj2HsFWZf8VVIVWXxdkYtLpx1n618Us,8239
 nucliadb/common/locking.py,sha256=RL0CabZVPzxHZyUjYeUyLvsJTm7W3J9o4fEgsY_ufNc,5896
 nucliadb/common/nidx.py,sha256=3EeQGjM_gxK0l_Rb54fspFWVNnzUiKF-_GMxTiiDC8Q,9116
 nucliadb/common/vector_index_config.py,sha256=LqGwhrDCp1q1vBow3scd1Chhr4GLYjYnGL72FKvOYYc,1552
+nucliadb/common/back_pressure/__init__.py,sha256=paAcAZcfGRTyURF9lnn3vX0vcwakTEVswG_xcdGBH-U,928
+nucliadb/common/back_pressure/cache.py,sha256=ANvXglWzI5naAD6N4E_fNi17qS6KNyAhjLeh6WlZZ84,2931
+nucliadb/common/back_pressure/materializer.py,sha256=YzYfN7xI5nlmSowbdLktWIkrJJb3Q2vEmoyz9O3eb2s,11667
+nucliadb/common/back_pressure/settings.py,sha256=3qNOzbI0KC6LMy-wMilXRSBfZu6CCpGHod26MTgAZ2o,3082
+nucliadb/common/back_pressure/utils.py,sha256=aZeP1XSkdgaRgZC76yR9Kje3511ZUCp7KB-XzcvhMYY,2018
 nucliadb/common/cluster/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
 nucliadb/common/cluster/exceptions.py,sha256=t7v_l93t44l2tQpdQXgO_w-c4YZRcaayOz1A2i0w4RQ,1258
 nucliadb/common/cluster/grpc_node_dummy.py,sha256=JkufazWzMA4KFEU8EBkMbiiDW4C8lLcRhiiCxP7aCQY,2949

@@ -88,7 +93,7 @@ nucliadb/common/datamanagers/synonyms.py,sha256=zk3GEH38KF5vV_VcuL6DCg-2JwgXJfQl
 nucliadb/common/datamanagers/utils.py,sha256=McHlXvE4P3x-bBY3pr0n8djbTDQvI1G5WusJrnRdhLA,1827
 nucliadb/common/datamanagers/vectorsets.py,sha256=ciYb5uD435Zo8ZbqgPUAszFW9Svp_-R2hY2FEhQ411Y,4304
 nucliadb/common/external_index_providers/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
-nucliadb/common/external_index_providers/base.py,sha256=
+nucliadb/common/external_index_providers/base.py,sha256=BL3DuYbnp-KCmGUiN-FGRtgjWj3SmtgMsGdjGq_7cX4,8905
 nucliadb/common/external_index_providers/exceptions.py,sha256=nDhhOIkb66hjCrBk4Spvl2vN1SuW5gbwrMCDmrdjHHE,1209
 nucliadb/common/external_index_providers/manager.py,sha256=aFSrrKKYG1ydpTSyq4zYD0LOxFS7P-CO6rcKC0hiF4I,4267
 nucliadb/common/external_index_providers/pinecone.py,sha256=sCvMgmXrLebnFLkpLnimDGrm4lE9ydb_ywotOMUSMrI,38124

@@ -234,7 +239,7 @@ nucliadb/search/search/exceptions.py,sha256=klGLgAGGrXcSGix_W6418ZBMqDchAIGjN77o
 nucliadb/search/search/fetch.py,sha256=eiljOKim-4OOEZn-3fyVZSYxztCH156BXYdqlIwVdN4,6181
 nucliadb/search/search/filters.py,sha256=1MkHlJjAQqoRCj7e5cEzK2HvBxGLE17I_omsjiklbtw,6476
 nucliadb/search/search/find.py,sha256=WTc_0HdbaU0Mwkmlf9s4pCmTuU0hz1jetBjIpNLXEEM,7982
-nucliadb/search/search/find_merge.py,sha256=
+nucliadb/search/search/find_merge.py,sha256=c-7IlfjfdmWAvQOyM7IO3bKS1EQpnR4oi6pN6mwrQKw,19815
 nucliadb/search/search/graph_merge.py,sha256=y5V7X-BhjHsKDXE69tzQLIIKGm4XuaFrZXw0odcHVNM,3402
 nucliadb/search/search/graph_strategy.py,sha256=JsV-i9PGhekCAzmGpqeueQIitJb7fWCihIwUf76Q3pU,32912
 nucliadb/search/search/hydrator.py,sha256=-R37gCrGxkyaiHQalnTWHNG_FCx11Zucd7qA1vQCxuw,6985

@@ -245,13 +250,13 @@ nucliadb/search/search/paragraphs.py,sha256=pNAEiYqJGGUVcEf7xf-PFMVqz0PX4Qb-WNG-
 nucliadb/search/search/pgcatalog.py,sha256=s_J98fsX_RuFXwpejpkGqG-tD9ELuzz4YQ6U3ew5h2g,9313
 nucliadb/search/search/predict_proxy.py,sha256=IFI3v_ODz2_UU1XZnyaD391fE7-2C0npSmj_HmDvzS4,3123
 nucliadb/search/search/query.py,sha256=-gvKsyGmKYpsoEVzKkq3HJUMcs_3LD3TYUueOcJsTec,11511
-nucliadb/search/search/rank_fusion.py,sha256=
+nucliadb/search/search/rank_fusion.py,sha256=xZtXhbmKb_56gs73u6KkFm2efvTATOSMmpOV2wrAIqE,9613
 nucliadb/search/search/rerankers.py,sha256=PvhExUb8zZYghiFHRgGotw6h6bU--Rft09wE8arvtAw,7424
 nucliadb/search/search/shards.py,sha256=mc5DK-MoCv9AFhlXlOFHbPvetcyNDzTFOJ5rimK8PC8,2636
 nucliadb/search/search/summarize.py,sha256=ksmYPubEQvAQgfPdZHfzB_rR19B2ci4IYZ6jLdHxZo8,4996
 nucliadb/search/search/utils.py,sha256=ajRIXfdTF67dBVahQCXW-rSv6gJpUMPt3QhJrWqArTQ,2175
 nucliadb/search/search/chat/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
-nucliadb/search/search/chat/ask.py,sha256=
+nucliadb/search/search/chat/ask.py,sha256=jYOGh2rySV4aFx_D2KlNVbPXHBsbkcy0Ve-eBS7CSYc,37611
 nucliadb/search/search/chat/exceptions.py,sha256=Siy4GXW2L7oPhIR86H3WHBhE9lkV4A4YaAszuGGUf54,1356
 nucliadb/search/search/chat/images.py,sha256=PA8VWxT5_HUGfW1ULhKTK46UBsVyINtWWqEM1ulzX1E,3095
 nucliadb/search/search/chat/prompt.py,sha256=Jnja-Ss7skgnnDY8BymVfdeYsFPnIQFL8tEvcRXTKUE,47356

@@ -328,28 +333,27 @@ nucliadb/train/generators/token_classifier.py,sha256=DdyMbrpxIVGWdTcz3SEN_3HwxKf
 nucliadb/train/generators/utils.py,sha256=ZNwvEVPZr-eP0MW3ABN7a11hPQKaa0NdVaRcgBcTp5w,3601
 nucliadb/writer/__init__.py,sha256=S298mrZL3vr62OrBqi97mdLxgR5cReMlRJgnaQHZV7s,1304
 nucliadb/writer/app.py,sha256=ABBO8-u4pDAa61b3mCdD0TFhuHAYcxMkgpZSGgWARuE,2736
-nucliadb/writer/back_pressure.py,sha256=4OwFGq9pvAbChB3WBZAY36lclfD-gD2ouC6YsKA4bIo,16892
 nucliadb/writer/exceptions.py,sha256=-Z7LW--eid7PNeKFuzo9kAlbLEBMUosxE-UVIgGD3SA,929
-nucliadb/writer/lifecycle.py,sha256=
+nucliadb/writer/lifecycle.py,sha256=P1b_KoNkMTeF1IbyDCh_zhexWbeYe5LH6p2iFSJPiN4,2576
 nucliadb/writer/openapi.py,sha256=thqCO1ht_RJgOkXs-aIsv8aXJrU5z8wo2n05l2_LqMs,1032
 nucliadb/writer/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nucliadb/writer/run.py,sha256=euVZ_rtHDXs-O1kB-Pt1Id8eft9CYVpWH3zJzEoEqls,1448
-nucliadb/writer/settings.py,sha256=
+nucliadb/writer/settings.py,sha256=gKtCTDF2E1m6lYL0Iv4WwY4VZuvw1Dsa-uIBZxCHTdU,1071
 nucliadb/writer/utilities.py,sha256=AZ5qEny1Xm0IDsFtH13oJa2usvJZK8f0FdgF1LrnLCw,1036
 nucliadb/writer/api/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
 nucliadb/writer/api/constants.py,sha256=qWEDjFUycrEZnSJyLnNK4PQNodU2oVmkO4NycaEZtio,1738
 nucliadb/writer/api/utils.py,sha256=wIQHlU8RQiIGVLI72suvyVIKlCU44Unh0Ae0IiN6Qwo,1313
 nucliadb/writer/api/v1/__init__.py,sha256=akI9A_jloNLb0dU4T5zjfdyvmSAiDeIdjAlzNx74FlU,1128
-nucliadb/writer/api/v1/export_import.py,sha256=
-nucliadb/writer/api/v1/field.py,sha256=
+nucliadb/writer/api/v1/export_import.py,sha256=v0sU55TtRSqDzwkDgcwv2uSaqKCuQTtGcMpYoHQYBQA,8192
+nucliadb/writer/api/v1/field.py,sha256=OicvLF1bnkJj1ixALFLuhvFX6NCMFpORROcFcS9nKpk,18505
 nucliadb/writer/api/v1/knowledgebox.py,sha256=PHEYDFa-sN5JrI8-EiVVg5FDOsRuCLT43kyAB4xt-xA,9530
 nucliadb/writer/api/v1/learning_config.py,sha256=CKBjqcbewkfPwGUPLDWzZSpro6XkmCaVppe5Qtpu5Go,3117
-nucliadb/writer/api/v1/resource.py,sha256=
+nucliadb/writer/api/v1/resource.py,sha256=IaKHwP4M4Pm3xXj_xcnQCnTzKtXj_xj-r7YOHdH-89I,19750
 nucliadb/writer/api/v1/router.py,sha256=RjuoWLpZer6Kl2BW_wznpNo6XL3BOpdTGqXZCn3QrrQ,1034
 nucliadb/writer/api/v1/services.py,sha256=3AUjk-SmvqJx76v7y89DZx6oyasojPliGYeniRQjpcU,13337
 nucliadb/writer/api/v1/slug.py,sha256=xlVBDBpRi9bNulpBHZwhyftVvulfE0zFm1XZIWl-AKY,2389
 nucliadb/writer/api/v1/transaction.py,sha256=d2Vbgnkk_-FLGSTt3vfldwiJIUf0XoyD0wP1jQNz_DY,2430
-nucliadb/writer/api/v1/upload.py,sha256=
+nucliadb/writer/api/v1/upload.py,sha256=vdKurdxRU7vYlcQIXf5RNTuX-G0waBSak2HnNRmAbLk,33791
 nucliadb/writer/api/v1/vectorsets.py,sha256=F3iMViL5G95_Tns4aO2SOA0DwAzxK2_P8MXxtd_XLRE,6973
 nucliadb/writer/resource/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
 nucliadb/writer/resource/audit.py,sha256=FvxMZPzrNHtd31HgpZEvxzwAkbxJTZRhPLqRYYJi3tA,1426

@@ -365,8 +369,8 @@ nucliadb/writer/tus/local.py,sha256=7jYa_w9b-N90jWgN2sQKkNcomqn6JMVBOVeDOVYJHto,
 nucliadb/writer/tus/s3.py,sha256=vF0NkFTXiXhXq3bCVXXVV-ED38ECVoUeeYViP8uMqcU,8357
 nucliadb/writer/tus/storage.py,sha256=ToqwjoYnjI4oIcwzkhha_MPxi-k4Jk3Lt55zRwaC1SM,2903
 nucliadb/writer/tus/utils.py,sha256=MSdVbRsRSZVdkaum69_0wku7X3p5wlZf4nr6E0GMKbw,2556
-nucliadb-6.4.0.
-nucliadb-6.4.0.
-nucliadb-6.4.0.
-nucliadb-6.4.0.
-nucliadb-6.4.0.
+nucliadb-6.4.0.post4210.dist-info/METADATA,sha256=SB9gIMgWxoWNtUEexRLH85E0PL-MnroGhJ6aOambTT4,4223
+nucliadb-6.4.0.post4210.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
+nucliadb-6.4.0.post4210.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
+nucliadb-6.4.0.post4210.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
+nucliadb-6.4.0.post4210.dist-info/RECORD,,