nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0023_backfill_pg_catalog.py +8 -4
- migrations/0028_extracted_vectors_reference.py +1 -1
- migrations/0029_backfill_field_status.py +3 -4
- migrations/0032_remove_old_relations.py +2 -3
- migrations/0038_backfill_catalog_field_labels.py +8 -4
- migrations/0039_backfill_converation_splits_metadata.py +106 -0
- migrations/0040_migrate_search_configurations.py +79 -0
- migrations/0041_reindex_conversations.py +137 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
- migrations/pg/0012_catalog_statistics_undo.py +26 -0
- nucliadb/backups/create.py +2 -15
- nucliadb/backups/restore.py +4 -15
- nucliadb/backups/tasks.py +4 -1
- nucliadb/common/back_pressure/cache.py +2 -3
- nucliadb/common/back_pressure/materializer.py +7 -13
- nucliadb/common/back_pressure/settings.py +6 -6
- nucliadb/common/back_pressure/utils.py +1 -0
- nucliadb/common/cache.py +9 -9
- nucliadb/common/catalog/__init__.py +79 -0
- nucliadb/common/catalog/dummy.py +36 -0
- nucliadb/common/catalog/interface.py +85 -0
- nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
- nucliadb/common/catalog/utils.py +56 -0
- nucliadb/common/cluster/manager.py +8 -23
- nucliadb/common/cluster/rebalance.py +484 -112
- nucliadb/common/cluster/rollover.py +36 -9
- nucliadb/common/cluster/settings.py +4 -9
- nucliadb/common/cluster/utils.py +34 -8
- nucliadb/common/context/__init__.py +7 -8
- nucliadb/common/context/fastapi.py +1 -2
- nucliadb/common/datamanagers/__init__.py +2 -4
- nucliadb/common/datamanagers/atomic.py +9 -2
- nucliadb/common/datamanagers/cluster.py +1 -2
- nucliadb/common/datamanagers/fields.py +3 -4
- nucliadb/common/datamanagers/kb.py +6 -6
- nucliadb/common/datamanagers/labels.py +2 -3
- nucliadb/common/datamanagers/resources.py +10 -33
- nucliadb/common/datamanagers/rollover.py +5 -7
- nucliadb/common/datamanagers/search_configurations.py +1 -2
- nucliadb/common/datamanagers/synonyms.py +1 -2
- nucliadb/common/datamanagers/utils.py +4 -4
- nucliadb/common/datamanagers/vectorsets.py +4 -4
- nucliadb/common/external_index_providers/base.py +32 -5
- nucliadb/common/external_index_providers/manager.py +5 -34
- nucliadb/common/external_index_providers/settings.py +1 -27
- nucliadb/common/filter_expression.py +129 -41
- nucliadb/common/http_clients/exceptions.py +8 -0
- nucliadb/common/http_clients/processing.py +16 -23
- nucliadb/common/http_clients/utils.py +3 -0
- nucliadb/common/ids.py +82 -58
- nucliadb/common/locking.py +1 -2
- nucliadb/common/maindb/driver.py +9 -8
- nucliadb/common/maindb/local.py +5 -5
- nucliadb/common/maindb/pg.py +9 -8
- nucliadb/common/nidx.py +22 -5
- nucliadb/common/vector_index_config.py +1 -1
- nucliadb/export_import/datamanager.py +4 -3
- nucliadb/export_import/exporter.py +11 -19
- nucliadb/export_import/importer.py +13 -6
- nucliadb/export_import/tasks.py +2 -0
- nucliadb/export_import/utils.py +6 -18
- nucliadb/health.py +2 -2
- nucliadb/ingest/app.py +8 -8
- nucliadb/ingest/consumer/consumer.py +8 -10
- nucliadb/ingest/consumer/pull.py +10 -8
- nucliadb/ingest/consumer/service.py +5 -30
- nucliadb/ingest/consumer/shard_creator.py +16 -5
- nucliadb/ingest/consumer/utils.py +1 -1
- nucliadb/ingest/fields/base.py +37 -49
- nucliadb/ingest/fields/conversation.py +55 -9
- nucliadb/ingest/fields/exceptions.py +1 -2
- nucliadb/ingest/fields/file.py +22 -8
- nucliadb/ingest/fields/link.py +7 -7
- nucliadb/ingest/fields/text.py +2 -3
- nucliadb/ingest/orm/brain_v2.py +89 -57
- nucliadb/ingest/orm/broker_message.py +2 -4
- nucliadb/ingest/orm/entities.py +10 -209
- nucliadb/ingest/orm/index_message.py +128 -113
- nucliadb/ingest/orm/knowledgebox.py +91 -59
- nucliadb/ingest/orm/processor/auditing.py +1 -3
- nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
- nucliadb/ingest/orm/processor/processor.py +98 -153
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
- nucliadb/ingest/orm/resource.py +82 -71
- nucliadb/ingest/orm/utils.py +1 -1
- nucliadb/ingest/partitions.py +12 -1
- nucliadb/ingest/processing.py +17 -17
- nucliadb/ingest/serialize.py +202 -145
- nucliadb/ingest/service/writer.py +15 -114
- nucliadb/ingest/settings.py +36 -15
- nucliadb/ingest/utils.py +1 -2
- nucliadb/learning_proxy.py +23 -26
- nucliadb/metrics_exporter.py +20 -6
- nucliadb/middleware/__init__.py +82 -1
- nucliadb/migrator/datamanager.py +4 -11
- nucliadb/migrator/migrator.py +1 -2
- nucliadb/migrator/models.py +1 -2
- nucliadb/migrator/settings.py +1 -2
- nucliadb/models/internal/augment.py +614 -0
- nucliadb/models/internal/processing.py +19 -19
- nucliadb/openapi.py +2 -2
- nucliadb/purge/__init__.py +3 -8
- nucliadb/purge/orphan_shards.py +1 -2
- nucliadb/reader/__init__.py +5 -0
- nucliadb/reader/api/models.py +6 -13
- nucliadb/reader/api/v1/download.py +59 -38
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/knowledgebox.py +37 -9
- nucliadb/reader/api/v1/learning_config.py +33 -14
- nucliadb/reader/api/v1/resource.py +61 -9
- nucliadb/reader/api/v1/services.py +18 -14
- nucliadb/reader/app.py +3 -1
- nucliadb/reader/reader/notifications.py +1 -2
- nucliadb/search/api/v1/__init__.py +3 -0
- nucliadb/search/api/v1/ask.py +3 -4
- nucliadb/search/api/v1/augment.py +585 -0
- nucliadb/search/api/v1/catalog.py +15 -19
- nucliadb/search/api/v1/find.py +16 -22
- nucliadb/search/api/v1/hydrate.py +328 -0
- nucliadb/search/api/v1/knowledgebox.py +1 -2
- nucliadb/search/api/v1/predict_proxy.py +1 -2
- nucliadb/search/api/v1/resource/ask.py +28 -8
- nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
- nucliadb/search/api/v1/resource/search.py +9 -11
- nucliadb/search/api/v1/retrieve.py +130 -0
- nucliadb/search/api/v1/search.py +28 -32
- nucliadb/search/api/v1/suggest.py +11 -14
- nucliadb/search/api/v1/summarize.py +1 -2
- nucliadb/search/api/v1/utils.py +2 -2
- nucliadb/search/app.py +3 -2
- nucliadb/search/augmentor/__init__.py +21 -0
- nucliadb/search/augmentor/augmentor.py +232 -0
- nucliadb/search/augmentor/fields.py +704 -0
- nucliadb/search/augmentor/metrics.py +24 -0
- nucliadb/search/augmentor/paragraphs.py +334 -0
- nucliadb/search/augmentor/resources.py +238 -0
- nucliadb/search/augmentor/utils.py +33 -0
- nucliadb/search/lifecycle.py +3 -1
- nucliadb/search/predict.py +33 -19
- nucliadb/search/predict_models.py +8 -9
- nucliadb/search/requesters/utils.py +11 -10
- nucliadb/search/search/cache.py +19 -42
- nucliadb/search/search/chat/ask.py +131 -59
- nucliadb/search/search/chat/exceptions.py +3 -5
- nucliadb/search/search/chat/fetcher.py +201 -0
- nucliadb/search/search/chat/images.py +6 -4
- nucliadb/search/search/chat/old_prompt.py +1375 -0
- nucliadb/search/search/chat/parser.py +510 -0
- nucliadb/search/search/chat/prompt.py +563 -615
- nucliadb/search/search/chat/query.py +453 -32
- nucliadb/search/search/chat/rpc.py +85 -0
- nucliadb/search/search/fetch.py +3 -4
- nucliadb/search/search/filters.py +8 -11
- nucliadb/search/search/find.py +33 -31
- nucliadb/search/search/find_merge.py +124 -331
- nucliadb/search/search/graph_strategy.py +14 -12
- nucliadb/search/search/hydrator/__init__.py +49 -0
- nucliadb/search/search/hydrator/fields.py +217 -0
- nucliadb/search/search/hydrator/images.py +130 -0
- nucliadb/search/search/hydrator/paragraphs.py +323 -0
- nucliadb/search/search/hydrator/resources.py +60 -0
- nucliadb/search/search/ingestion_agents.py +5 -5
- nucliadb/search/search/merge.py +90 -94
- nucliadb/search/search/metrics.py +24 -7
- nucliadb/search/search/paragraphs.py +7 -9
- nucliadb/search/search/predict_proxy.py +44 -18
- nucliadb/search/search/query.py +14 -86
- nucliadb/search/search/query_parser/fetcher.py +51 -82
- nucliadb/search/search/query_parser/models.py +19 -48
- nucliadb/search/search/query_parser/old_filters.py +20 -19
- nucliadb/search/search/query_parser/parsers/ask.py +5 -6
- nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
- nucliadb/search/search/query_parser/parsers/common.py +21 -13
- nucliadb/search/search/query_parser/parsers/find.py +6 -29
- nucliadb/search/search/query_parser/parsers/graph.py +18 -28
- nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
- nucliadb/search/search/query_parser/parsers/search.py +15 -56
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
- nucliadb/search/search/rank_fusion.py +18 -13
- nucliadb/search/search/rerankers.py +6 -7
- nucliadb/search/search/retrieval.py +300 -0
- nucliadb/search/search/summarize.py +5 -6
- nucliadb/search/search/utils.py +3 -4
- nucliadb/search/settings.py +1 -2
- nucliadb/standalone/api_router.py +1 -1
- nucliadb/standalone/app.py +4 -3
- nucliadb/standalone/auth.py +5 -6
- nucliadb/standalone/lifecycle.py +2 -2
- nucliadb/standalone/run.py +5 -4
- nucliadb/standalone/settings.py +5 -6
- nucliadb/standalone/versions.py +3 -4
- nucliadb/tasks/consumer.py +13 -8
- nucliadb/tasks/models.py +2 -1
- nucliadb/tasks/producer.py +3 -3
- nucliadb/tasks/retries.py +8 -7
- nucliadb/train/api/utils.py +1 -3
- nucliadb/train/api/v1/shards.py +1 -2
- nucliadb/train/api/v1/trainset.py +1 -2
- nucliadb/train/app.py +1 -1
- nucliadb/train/generator.py +4 -4
- nucliadb/train/generators/field_classifier.py +2 -2
- nucliadb/train/generators/field_streaming.py +6 -6
- nucliadb/train/generators/image_classifier.py +2 -2
- nucliadb/train/generators/paragraph_classifier.py +2 -2
- nucliadb/train/generators/paragraph_streaming.py +2 -2
- nucliadb/train/generators/question_answer_streaming.py +2 -2
- nucliadb/train/generators/sentence_classifier.py +4 -10
- nucliadb/train/generators/token_classifier.py +3 -2
- nucliadb/train/generators/utils.py +6 -5
- nucliadb/train/nodes.py +3 -3
- nucliadb/train/resource.py +6 -8
- nucliadb/train/settings.py +3 -4
- nucliadb/train/types.py +11 -11
- nucliadb/train/upload.py +3 -2
- nucliadb/train/uploader.py +1 -2
- nucliadb/train/utils.py +1 -2
- nucliadb/writer/api/v1/export_import.py +4 -1
- nucliadb/writer/api/v1/field.py +15 -14
- nucliadb/writer/api/v1/knowledgebox.py +18 -56
- nucliadb/writer/api/v1/learning_config.py +5 -4
- nucliadb/writer/api/v1/resource.py +9 -20
- nucliadb/writer/api/v1/services.py +10 -132
- nucliadb/writer/api/v1/upload.py +73 -72
- nucliadb/writer/app.py +8 -2
- nucliadb/writer/resource/basic.py +12 -15
- nucliadb/writer/resource/field.py +43 -5
- nucliadb/writer/resource/origin.py +7 -0
- nucliadb/writer/settings.py +2 -3
- nucliadb/writer/tus/__init__.py +2 -3
- nucliadb/writer/tus/azure.py +5 -7
- nucliadb/writer/tus/dm.py +3 -3
- nucliadb/writer/tus/exceptions.py +3 -4
- nucliadb/writer/tus/gcs.py +15 -22
- nucliadb/writer/tus/s3.py +2 -3
- nucliadb/writer/tus/storage.py +3 -3
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
- nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
- nucliadb/common/datamanagers/entities.py +0 -139
- nucliadb/common/external_index_providers/pinecone.py +0 -894
- nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
- nucliadb/search/search/hydrator.py +0 -197
- nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
nucliadb/backups/tasks.py
CHANGED
|
@@ -17,7 +17,7 @@
|
|
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
19
|
#
|
|
20
|
-
from
|
|
20
|
+
from collections.abc import Awaitable, Callable
|
|
21
21
|
|
|
22
22
|
from nucliadb.backups.const import BackupsNatsConfig
|
|
23
23
|
from nucliadb.backups.create import backup_kb_task
|
|
@@ -38,6 +38,7 @@ def creator_consumer() -> NatsTaskConsumer[CreateBackupRequest]:
|
|
|
38
38
|
callback=backup_kb_task,
|
|
39
39
|
msg_type=CreateBackupRequest,
|
|
40
40
|
max_concurrent_messages=10,
|
|
41
|
+
max_retries=100,
|
|
41
42
|
)
|
|
42
43
|
return consumer
|
|
43
44
|
|
|
@@ -64,6 +65,7 @@ def restorer_consumer() -> NatsTaskConsumer[RestoreBackupRequest]:
|
|
|
64
65
|
callback=restore_kb_task,
|
|
65
66
|
msg_type=RestoreBackupRequest,
|
|
66
67
|
max_concurrent_messages=10,
|
|
68
|
+
max_retries=100,
|
|
67
69
|
)
|
|
68
70
|
return consumer
|
|
69
71
|
|
|
@@ -90,6 +92,7 @@ def deleter_consumer() -> NatsTaskConsumer[DeleteBackupRequest]:
|
|
|
90
92
|
callback=delete_backup_task,
|
|
91
93
|
msg_type=DeleteBackupRequest,
|
|
92
94
|
max_concurrent_messages=2,
|
|
95
|
+
max_retries=100,
|
|
93
96
|
)
|
|
94
97
|
return consumer
|
|
95
98
|
|
|
@@ -21,7 +21,6 @@ import contextlib
|
|
|
21
21
|
import logging
|
|
22
22
|
import threading
|
|
23
23
|
from datetime import datetime, timezone
|
|
24
|
-
from typing import Optional
|
|
25
24
|
|
|
26
25
|
from cachetools import TTLCache
|
|
27
26
|
|
|
@@ -47,7 +46,7 @@ class BackPressureCache:
|
|
|
47
46
|
self._cache = TTLCache(maxsize=1024, ttl=5 * 60)
|
|
48
47
|
self._lock = threading.Lock()
|
|
49
48
|
|
|
50
|
-
def get(self, key: str) ->
|
|
49
|
+
def get(self, key: str) -> BackPressureData | None:
|
|
51
50
|
with self._lock:
|
|
52
51
|
data = self._cache.get(key, None)
|
|
53
52
|
if data is None:
|
|
@@ -72,7 +71,7 @@ def cached_back_pressure(cache_key: str):
|
|
|
72
71
|
Context manager that handles the caching of the try again in time so that
|
|
73
72
|
we don't recompute try again times if we have already applied back pressure.
|
|
74
73
|
"""
|
|
75
|
-
data:
|
|
74
|
+
data: BackPressureData | None = _cache.get(cache_key)
|
|
76
75
|
if data is not None:
|
|
77
76
|
back_pressure_type = data.type
|
|
78
77
|
RATE_LIMITED_REQUESTS_COUNTER.inc({"type": back_pressure_type, "cached": "true"})
|
|
@@ -20,7 +20,6 @@
|
|
|
20
20
|
import asyncio
|
|
21
21
|
import logging
|
|
22
22
|
import threading
|
|
23
|
-
from typing import Optional
|
|
24
23
|
|
|
25
24
|
from cachetools import TTLCache
|
|
26
25
|
from fastapi import HTTPException
|
|
@@ -118,12 +117,6 @@ class BackPressureMaterializer:
|
|
|
118
117
|
extra={"kbid": kbid},
|
|
119
118
|
)
|
|
120
119
|
return 0
|
|
121
|
-
|
|
122
|
-
if pending > 0:
|
|
123
|
-
logger.info(
|
|
124
|
-
f"Processing returned {pending} pending messages for KB",
|
|
125
|
-
extra={"kbid": kbid},
|
|
126
|
-
)
|
|
127
120
|
self.processing_pending_cache[kbid] = pending
|
|
128
121
|
return pending
|
|
129
122
|
|
|
@@ -184,7 +177,7 @@ class BackPressureMaterializer:
|
|
|
184
177
|
pending=pending,
|
|
185
178
|
max_wait=settings.max_wait_time,
|
|
186
179
|
)
|
|
187
|
-
data = BackPressureData(type="indexing", try_after=try_after)
|
|
180
|
+
data = BackPressureData(type="indexing", try_after=try_after, pending=pending)
|
|
188
181
|
raise BackPressureException(data)
|
|
189
182
|
|
|
190
183
|
def check_ingest(self):
|
|
@@ -199,7 +192,7 @@ class BackPressureMaterializer:
|
|
|
199
192
|
pending=ingest_pending,
|
|
200
193
|
max_wait=settings.max_wait_time,
|
|
201
194
|
)
|
|
202
|
-
data = BackPressureData(type="ingest", try_after=try_after)
|
|
195
|
+
data = BackPressureData(type="ingest", try_after=try_after, pending=ingest_pending)
|
|
203
196
|
raise BackPressureException(data)
|
|
204
197
|
|
|
205
198
|
async def check_processing(self, kbid: str):
|
|
@@ -215,11 +208,11 @@ class BackPressureMaterializer:
|
|
|
215
208
|
pending=kb_pending,
|
|
216
209
|
max_wait=settings.max_wait_time,
|
|
217
210
|
)
|
|
218
|
-
data = BackPressureData(type="processing", try_after=try_after)
|
|
211
|
+
data = BackPressureData(type="processing", try_after=try_after, pending=kb_pending)
|
|
219
212
|
raise BackPressureException(data)
|
|
220
213
|
|
|
221
214
|
|
|
222
|
-
MATERIALIZER:
|
|
215
|
+
MATERIALIZER: BackPressureMaterializer | None = None
|
|
223
216
|
materializer_lock = threading.Lock()
|
|
224
217
|
|
|
225
218
|
|
|
@@ -268,7 +261,7 @@ def get_materializer() -> BackPressureMaterializer:
|
|
|
268
261
|
return MATERIALIZER
|
|
269
262
|
|
|
270
263
|
|
|
271
|
-
async def maybe_back_pressure(kbid: str, resource_uuid:
|
|
264
|
+
async def maybe_back_pressure(kbid: str, resource_uuid: str | None = None) -> None:
|
|
272
265
|
"""
|
|
273
266
|
This function does system checks to see if we need to put back pressure on writes.
|
|
274
267
|
In that case, a HTTP 429 will be raised with the estimated time to try again.
|
|
@@ -278,7 +271,7 @@ async def maybe_back_pressure(kbid: str, resource_uuid: Optional[str] = None) ->
|
|
|
278
271
|
await back_pressure_checks(kbid, resource_uuid)
|
|
279
272
|
|
|
280
273
|
|
|
281
|
-
async def back_pressure_checks(kbid: str, resource_uuid:
|
|
274
|
+
async def back_pressure_checks(kbid: str, resource_uuid: str | None = None):
|
|
282
275
|
"""
|
|
283
276
|
Will raise a 429 if back pressure is needed:
|
|
284
277
|
- If the processing engine is behind.
|
|
@@ -299,6 +292,7 @@ async def back_pressure_checks(kbid: str, resource_uuid: Optional[str] = None):
|
|
|
299
292
|
"resource_uuid": resource_uuid,
|
|
300
293
|
"try_after": exc.data.try_after,
|
|
301
294
|
"back_pressure_type": exc.data.type,
|
|
295
|
+
"pending": exc.data.pending,
|
|
302
296
|
},
|
|
303
297
|
)
|
|
304
298
|
raise HTTPException(
|
|
@@ -29,30 +29,30 @@ class BackPressureSettings(BaseSettings):
|
|
|
29
29
|
)
|
|
30
30
|
indexing_rate: float = Field(
|
|
31
31
|
default=10,
|
|
32
|
-
description="Estimation of the indexing rate in messages per second. This is used to calculate the try again in time",
|
|
32
|
+
description="Estimation of the indexing rate in messages per second. This is used to calculate the try again in time",
|
|
33
33
|
)
|
|
34
34
|
ingest_rate: float = Field(
|
|
35
35
|
default=4,
|
|
36
|
-
description="Estimation of the ingest processed consumer rate in messages per second. This is used to calculate the try again in time",
|
|
36
|
+
description="Estimation of the ingest processed consumer rate in messages per second. This is used to calculate the try again in time",
|
|
37
37
|
)
|
|
38
38
|
processing_rate: float = Field(
|
|
39
39
|
default=1,
|
|
40
|
-
description="Estimation of the processing rate in messages per second. This is used to calculate the try again in time",
|
|
40
|
+
description="Estimation of the processing rate in messages per second. This is used to calculate the try again in time",
|
|
41
41
|
)
|
|
42
42
|
max_indexing_pending: int = Field(
|
|
43
43
|
default=1000,
|
|
44
|
-
description="Max number of messages pending to index in a node queue before rate limiting writes. Set to 0 to disable indexing back pressure checks",
|
|
44
|
+
description="Max number of messages pending to index in a node queue before rate limiting writes. Set to 0 to disable indexing back pressure checks",
|
|
45
45
|
alias="back_pressure_max_indexing_pending",
|
|
46
46
|
)
|
|
47
47
|
max_ingest_pending: int = Field(
|
|
48
48
|
# Disabled by default
|
|
49
49
|
default=0,
|
|
50
|
-
description="Max number of messages pending to be ingested by processed consumers before rate limiting writes. Set to 0 to disable ingest back pressure checks",
|
|
50
|
+
description="Max number of messages pending to be ingested by processed consumers before rate limiting writes. Set to 0 to disable ingest back pressure checks",
|
|
51
51
|
alias="back_pressure_max_ingest_pending",
|
|
52
52
|
)
|
|
53
53
|
max_processing_pending: int = Field(
|
|
54
54
|
default=1000,
|
|
55
|
-
description="Max number of messages pending to process per Knowledge Box before rate limiting writes. Set to 0 to disable processing back pressure checks",
|
|
55
|
+
description="Max number of messages pending to process per Knowledge Box before rate limiting writes. Set to 0 to disable processing back pressure checks",
|
|
56
56
|
alias="back_pressure_max_processing_pending",
|
|
57
57
|
)
|
|
58
58
|
indexing_check_interval: int = Field(
|
nucliadb/common/cache.py
CHANGED
|
@@ -24,7 +24,7 @@ from abc import ABC, abstractmethod
|
|
|
24
24
|
from contextvars import ContextVar
|
|
25
25
|
from dataclasses import dataclass
|
|
26
26
|
from functools import cached_property
|
|
27
|
-
from typing import Generic,
|
|
27
|
+
from typing import Generic, TypeVar
|
|
28
28
|
|
|
29
29
|
import backoff
|
|
30
30
|
from async_lru import _LRUCacheWrapper, alru_cache
|
|
@@ -66,9 +66,9 @@ class Cache(Generic[K, T], ABC):
|
|
|
66
66
|
|
|
67
67
|
"""
|
|
68
68
|
|
|
69
|
-
cache: _LRUCacheWrapper[
|
|
69
|
+
cache: _LRUCacheWrapper[T | None]
|
|
70
70
|
|
|
71
|
-
async def get(self, *args: K.args, **kwargs: K.kwargs) ->
|
|
71
|
+
async def get(self, *args: K.args, **kwargs: K.kwargs) -> T | None:
|
|
72
72
|
result = await self.cache(*args)
|
|
73
73
|
# Do not cache None
|
|
74
74
|
if result is None:
|
|
@@ -88,7 +88,7 @@ class Cache(Generic[K, T], ABC):
|
|
|
88
88
|
class ResourceCache(Cache[[str, str], ResourceORM]):
|
|
89
89
|
def __init__(self, cache_size: int) -> None:
|
|
90
90
|
@alru_cache(maxsize=cache_size)
|
|
91
|
-
async def _get_resource(kbid: str, rid: str) ->
|
|
91
|
+
async def _get_resource(kbid: str, rid: str) -> ResourceORM | None:
|
|
92
92
|
storage = await get_storage()
|
|
93
93
|
async with get_driver().ro_transaction() as txn:
|
|
94
94
|
kb = KnowledgeBoxORM(txn, storage, kbid)
|
|
@@ -115,7 +115,7 @@ class ExtractedTextCache(Cache[[str, FieldId], ExtractedText]):
|
|
|
115
115
|
def __init__(self, cache_size: int) -> None:
|
|
116
116
|
@alru_cache(maxsize=cache_size)
|
|
117
117
|
@backoff.on_exception(backoff.expo, (Exception,), jitter=backoff.random_jitter, max_tries=3)
|
|
118
|
-
async def _get_extracted_text(kbid: str, field_id: FieldId) ->
|
|
118
|
+
async def _get_extracted_text(kbid: str, field_id: FieldId) -> ExtractedText | None:
|
|
119
119
|
storage = await get_storage()
|
|
120
120
|
try:
|
|
121
121
|
sf = storage.file_extracted(
|
|
@@ -144,18 +144,18 @@ class ExtractedTextCache(Cache[[str, FieldId], ExtractedText]):
|
|
|
144
144
|
|
|
145
145
|
# Global caches (per asyncio task)
|
|
146
146
|
|
|
147
|
-
rcache: ContextVar[
|
|
148
|
-
etcache: ContextVar[
|
|
147
|
+
rcache: ContextVar[ResourceCache | None] = ContextVar("rcache", default=None)
|
|
148
|
+
etcache: ContextVar[ExtractedTextCache | None] = ContextVar("etcache", default=None)
|
|
149
149
|
|
|
150
150
|
|
|
151
151
|
# Cache management
|
|
152
152
|
|
|
153
153
|
|
|
154
|
-
def get_resource_cache() ->
|
|
154
|
+
def get_resource_cache() -> ResourceCache | None:
|
|
155
155
|
return rcache.get()
|
|
156
156
|
|
|
157
157
|
|
|
158
|
-
def get_extracted_text_cache() ->
|
|
158
|
+
def get_extracted_text_cache() -> ExtractedTextCache | None:
|
|
159
159
|
return etcache.get()
|
|
160
160
|
|
|
161
161
|
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
|
2
|
+
#
|
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
|
5
|
+
#
|
|
6
|
+
# AGPL:
|
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
|
10
|
+
# License, or (at your option) any later version.
|
|
11
|
+
#
|
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15
|
+
# GNU Affero General Public License for more details.
|
|
16
|
+
#
|
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
|
+
#
|
|
20
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
|
21
|
+
#
|
|
22
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
|
23
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
|
24
|
+
#
|
|
25
|
+
# AGPL:
|
|
26
|
+
# This program is free software: you can redistribute it and/or modify
|
|
27
|
+
# it under the terms of the GNU Affero General Public License as
|
|
28
|
+
# published by the Free Software Foundation, either version 3 of the
|
|
29
|
+
# License, or (at your option) any later version.
|
|
30
|
+
#
|
|
31
|
+
# This program is distributed in the hope that it will be useful,
|
|
32
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
33
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
34
|
+
# GNU Affero General Public License for more details.
|
|
35
|
+
#
|
|
36
|
+
# You should have received a copy of the GNU Affero General Public License
|
|
37
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
38
|
+
|
|
39
|
+
from nidx_protos.noderesources_pb2 import Resource as IndexMessage
|
|
40
|
+
|
|
41
|
+
from nucliadb.common.catalog.dummy import DummyCatalog
|
|
42
|
+
from nucliadb.common.catalog.interface import Catalog, CatalogQuery
|
|
43
|
+
from nucliadb.common.catalog.pg import PGCatalog
|
|
44
|
+
from nucliadb.common.catalog.utils import build_catalog_resource_data
|
|
45
|
+
from nucliadb.common.maindb.driver import Transaction
|
|
46
|
+
from nucliadb.ingest.orm.resource import Resource
|
|
47
|
+
from nucliadb.ingest.settings import CatalogConfig, settings
|
|
48
|
+
from nucliadb_models.search import CatalogFacetsRequest, Resources
|
|
49
|
+
from nucliadb_utils.exceptions import ConfigurationError
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def get_catalog() -> Catalog:
|
|
53
|
+
if settings.catalog == CatalogConfig.UNSET:
|
|
54
|
+
return DummyCatalog()
|
|
55
|
+
elif settings.catalog == CatalogConfig.PG:
|
|
56
|
+
return PGCatalog()
|
|
57
|
+
else:
|
|
58
|
+
raise ConfigurationError(f"Unknown catalog configuration: {settings.catalog}")
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
async def catalog_update(txn: Transaction, kbid: str, resource: Resource, index_message: IndexMessage):
|
|
62
|
+
catalog = get_catalog()
|
|
63
|
+
resource_data = build_catalog_resource_data(resource, index_message)
|
|
64
|
+
await catalog.update(txn, kbid, resource.uuid, resource_data)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
async def catalog_delete(txn: Transaction, kbid: str, rid: str):
|
|
68
|
+
catalog = get_catalog()
|
|
69
|
+
await catalog.delete(txn, kbid, rid)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
async def catalog_search(query: CatalogQuery) -> Resources:
|
|
73
|
+
catalog = get_catalog()
|
|
74
|
+
return await catalog.search(query)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
async def catalog_facets(kbid: str, request: CatalogFacetsRequest) -> dict[str, int]:
|
|
78
|
+
catalog = get_catalog()
|
|
79
|
+
return await catalog.facets(kbid, request)
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
|
2
|
+
#
|
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
|
5
|
+
#
|
|
6
|
+
# AGPL:
|
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
|
10
|
+
# License, or (at your option) any later version.
|
|
11
|
+
#
|
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15
|
+
# GNU Affero General Public License for more details.
|
|
16
|
+
#
|
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
|
+
#
|
|
20
|
+
from nucliadb.common.catalog.interface import Catalog, CatalogQuery, CatalogResourceData
|
|
21
|
+
from nucliadb.common.maindb.driver import Transaction
|
|
22
|
+
from nucliadb_models.search import CatalogFacetsRequest, Resources
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class DummyCatalog(Catalog):
|
|
26
|
+
async def update(self, txn: Transaction, kbid: str, rid: str, data: CatalogResourceData):
|
|
27
|
+
return
|
|
28
|
+
|
|
29
|
+
async def delete(self, txn: Transaction, kbid: str, rid: str):
|
|
30
|
+
return
|
|
31
|
+
|
|
32
|
+
async def search(self, query: CatalogQuery) -> Resources:
|
|
33
|
+
return Resources(results=[], min_score=0.0)
|
|
34
|
+
|
|
35
|
+
async def facets(self, kbid: str, request: CatalogFacetsRequest) -> dict[str, int]:
|
|
36
|
+
return {}
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
|
2
|
+
#
|
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
|
5
|
+
#
|
|
6
|
+
# AGPL:
|
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
|
10
|
+
# License, or (at your option) any later version.
|
|
11
|
+
#
|
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15
|
+
# GNU Affero General Public License for more details.
|
|
16
|
+
#
|
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
|
+
#
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import abc
|
|
23
|
+
import datetime
|
|
24
|
+
from dataclasses import dataclass
|
|
25
|
+
from typing import Literal
|
|
26
|
+
|
|
27
|
+
from pydantic import BaseModel, Field
|
|
28
|
+
|
|
29
|
+
from nucliadb.common.maindb.driver import Transaction
|
|
30
|
+
from nucliadb_models import search as search_models
|
|
31
|
+
from nucliadb_models.search import CatalogFacetsRequest, Resources
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class CatalogResourceData(BaseModel):
|
|
35
|
+
"""
|
|
36
|
+
Data extracted from a resource to be indexed in the catalog
|
|
37
|
+
"""
|
|
38
|
+
|
|
39
|
+
title: str = Field(description="Resource title")
|
|
40
|
+
created_at: datetime.datetime = Field(description="Resource creation date")
|
|
41
|
+
modified_at: datetime.datetime = Field(description="Resource last modification date")
|
|
42
|
+
labels: list[str] = Field(
|
|
43
|
+
description="Resource labels. This includes labels at the resource level and all classification labels of its fields"
|
|
44
|
+
)
|
|
45
|
+
slug: str = Field(description="Resource slug")
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
|
|
49
|
+
class CatalogExpression:
|
|
50
|
+
@dataclass
|
|
51
|
+
class Date:
|
|
52
|
+
field: Literal["created_at"] | Literal["modified_at"]
|
|
53
|
+
since: datetime.datetime | None
|
|
54
|
+
until: datetime.datetime | None
|
|
55
|
+
|
|
56
|
+
bool_and: list[CatalogExpression] | None = None
|
|
57
|
+
bool_or: list[CatalogExpression] | None = None
|
|
58
|
+
bool_not: CatalogExpression | None = None
|
|
59
|
+
date: Date | None = None
|
|
60
|
+
facet: str | None = None
|
|
61
|
+
resource_id: str | None = None
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class CatalogQuery(BaseModel):
|
|
65
|
+
kbid: str
|
|
66
|
+
query: search_models.CatalogQuery | None = Field(description="Full-text search query")
|
|
67
|
+
filters: CatalogExpression | None = Field(description="Filters to apply to the search")
|
|
68
|
+
sort: search_models.SortOptions = Field(description="Sorting option")
|
|
69
|
+
faceted: list[str] = Field(description="List of facets to compute during the search")
|
|
70
|
+
page_size: int = Field(description="Used for pagination. Maximum page size is 100")
|
|
71
|
+
page_number: int = Field(description="Used for pagination. First page is 0")
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class Catalog(abc.ABC, metaclass=abc.ABCMeta):
|
|
75
|
+
@abc.abstractmethod
|
|
76
|
+
async def update(self, txn: Transaction, kbid: str, rid: str, data: CatalogResourceData): ...
|
|
77
|
+
|
|
78
|
+
@abc.abstractmethod
|
|
79
|
+
async def delete(self, txn: Transaction, kbid: str, rid: str): ...
|
|
80
|
+
|
|
81
|
+
@abc.abstractmethod
|
|
82
|
+
async def search(self, query: CatalogQuery) -> Resources: ...
|
|
83
|
+
|
|
84
|
+
@abc.abstractmethod
|
|
85
|
+
async def facets(self, kbid: str, request: CatalogFacetsRequest) -> dict[str, int]: ...
|