nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0023_backfill_pg_catalog.py +2 -2
- migrations/0029_backfill_field_status.py +3 -4
- migrations/0032_remove_old_relations.py +2 -3
- migrations/0038_backfill_catalog_field_labels.py +2 -2
- migrations/0039_backfill_converation_splits_metadata.py +2 -2
- migrations/0041_reindex_conversations.py +137 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
- migrations/pg/0012_catalog_statistics_undo.py +26 -0
- nucliadb/backups/create.py +2 -15
- nucliadb/backups/restore.py +4 -15
- nucliadb/backups/tasks.py +4 -1
- nucliadb/common/back_pressure/cache.py +2 -3
- nucliadb/common/back_pressure/materializer.py +7 -13
- nucliadb/common/back_pressure/settings.py +6 -6
- nucliadb/common/back_pressure/utils.py +1 -0
- nucliadb/common/cache.py +9 -9
- nucliadb/common/catalog/interface.py +12 -12
- nucliadb/common/catalog/pg.py +41 -29
- nucliadb/common/catalog/utils.py +3 -3
- nucliadb/common/cluster/manager.py +5 -4
- nucliadb/common/cluster/rebalance.py +483 -114
- nucliadb/common/cluster/rollover.py +25 -9
- nucliadb/common/cluster/settings.py +3 -8
- nucliadb/common/cluster/utils.py +34 -8
- nucliadb/common/context/__init__.py +7 -8
- nucliadb/common/context/fastapi.py +1 -2
- nucliadb/common/datamanagers/__init__.py +2 -4
- nucliadb/common/datamanagers/atomic.py +4 -2
- nucliadb/common/datamanagers/cluster.py +1 -2
- nucliadb/common/datamanagers/fields.py +3 -4
- nucliadb/common/datamanagers/kb.py +6 -6
- nucliadb/common/datamanagers/labels.py +2 -3
- nucliadb/common/datamanagers/resources.py +10 -33
- nucliadb/common/datamanagers/rollover.py +5 -7
- nucliadb/common/datamanagers/search_configurations.py +1 -2
- nucliadb/common/datamanagers/synonyms.py +1 -2
- nucliadb/common/datamanagers/utils.py +4 -4
- nucliadb/common/datamanagers/vectorsets.py +4 -4
- nucliadb/common/external_index_providers/base.py +32 -5
- nucliadb/common/external_index_providers/manager.py +4 -5
- nucliadb/common/filter_expression.py +128 -40
- nucliadb/common/http_clients/processing.py +12 -23
- nucliadb/common/ids.py +6 -4
- nucliadb/common/locking.py +1 -2
- nucliadb/common/maindb/driver.py +9 -8
- nucliadb/common/maindb/local.py +5 -5
- nucliadb/common/maindb/pg.py +9 -8
- nucliadb/common/nidx.py +3 -4
- nucliadb/export_import/datamanager.py +4 -3
- nucliadb/export_import/exporter.py +11 -19
- nucliadb/export_import/importer.py +13 -6
- nucliadb/export_import/tasks.py +2 -0
- nucliadb/export_import/utils.py +6 -18
- nucliadb/health.py +2 -2
- nucliadb/ingest/app.py +8 -8
- nucliadb/ingest/consumer/consumer.py +8 -10
- nucliadb/ingest/consumer/pull.py +3 -8
- nucliadb/ingest/consumer/service.py +3 -3
- nucliadb/ingest/consumer/utils.py +1 -1
- nucliadb/ingest/fields/base.py +28 -49
- nucliadb/ingest/fields/conversation.py +12 -12
- nucliadb/ingest/fields/exceptions.py +1 -2
- nucliadb/ingest/fields/file.py +22 -8
- nucliadb/ingest/fields/link.py +7 -7
- nucliadb/ingest/fields/text.py +2 -3
- nucliadb/ingest/orm/brain_v2.py +78 -64
- nucliadb/ingest/orm/broker_message.py +2 -4
- nucliadb/ingest/orm/entities.py +10 -209
- nucliadb/ingest/orm/index_message.py +4 -4
- nucliadb/ingest/orm/knowledgebox.py +18 -27
- nucliadb/ingest/orm/processor/auditing.py +1 -3
- nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
- nucliadb/ingest/orm/processor/processor.py +27 -27
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
- nucliadb/ingest/orm/resource.py +72 -70
- nucliadb/ingest/orm/utils.py +1 -1
- nucliadb/ingest/processing.py +17 -17
- nucliadb/ingest/serialize.py +202 -145
- nucliadb/ingest/service/writer.py +3 -109
- nucliadb/ingest/settings.py +3 -4
- nucliadb/ingest/utils.py +1 -2
- nucliadb/learning_proxy.py +11 -11
- nucliadb/metrics_exporter.py +5 -4
- nucliadb/middleware/__init__.py +82 -1
- nucliadb/migrator/datamanager.py +3 -4
- nucliadb/migrator/migrator.py +1 -2
- nucliadb/migrator/models.py +1 -2
- nucliadb/migrator/settings.py +1 -2
- nucliadb/models/internal/augment.py +614 -0
- nucliadb/models/internal/processing.py +19 -19
- nucliadb/openapi.py +2 -2
- nucliadb/purge/__init__.py +3 -8
- nucliadb/purge/orphan_shards.py +1 -2
- nucliadb/reader/__init__.py +5 -0
- nucliadb/reader/api/models.py +6 -13
- nucliadb/reader/api/v1/download.py +59 -38
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/learning_config.py +24 -4
- nucliadb/reader/api/v1/resource.py +61 -9
- nucliadb/reader/api/v1/services.py +18 -14
- nucliadb/reader/app.py +3 -1
- nucliadb/reader/reader/notifications.py +1 -2
- nucliadb/search/api/v1/__init__.py +2 -0
- nucliadb/search/api/v1/ask.py +3 -4
- nucliadb/search/api/v1/augment.py +585 -0
- nucliadb/search/api/v1/catalog.py +11 -15
- nucliadb/search/api/v1/find.py +16 -22
- nucliadb/search/api/v1/hydrate.py +25 -25
- nucliadb/search/api/v1/knowledgebox.py +1 -2
- nucliadb/search/api/v1/predict_proxy.py +1 -2
- nucliadb/search/api/v1/resource/ask.py +7 -7
- nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
- nucliadb/search/api/v1/resource/search.py +9 -11
- nucliadb/search/api/v1/retrieve.py +130 -0
- nucliadb/search/api/v1/search.py +28 -32
- nucliadb/search/api/v1/suggest.py +11 -14
- nucliadb/search/api/v1/summarize.py +1 -2
- nucliadb/search/api/v1/utils.py +2 -2
- nucliadb/search/app.py +3 -2
- nucliadb/search/augmentor/__init__.py +21 -0
- nucliadb/search/augmentor/augmentor.py +232 -0
- nucliadb/search/augmentor/fields.py +704 -0
- nucliadb/search/augmentor/metrics.py +24 -0
- nucliadb/search/augmentor/paragraphs.py +334 -0
- nucliadb/search/augmentor/resources.py +238 -0
- nucliadb/search/augmentor/utils.py +33 -0
- nucliadb/search/lifecycle.py +3 -1
- nucliadb/search/predict.py +24 -17
- nucliadb/search/predict_models.py +8 -9
- nucliadb/search/requesters/utils.py +11 -10
- nucliadb/search/search/cache.py +19 -23
- nucliadb/search/search/chat/ask.py +88 -59
- nucliadb/search/search/chat/exceptions.py +3 -5
- nucliadb/search/search/chat/fetcher.py +201 -0
- nucliadb/search/search/chat/images.py +6 -4
- nucliadb/search/search/chat/old_prompt.py +1375 -0
- nucliadb/search/search/chat/parser.py +510 -0
- nucliadb/search/search/chat/prompt.py +563 -615
- nucliadb/search/search/chat/query.py +449 -36
- nucliadb/search/search/chat/rpc.py +85 -0
- nucliadb/search/search/fetch.py +3 -4
- nucliadb/search/search/filters.py +8 -11
- nucliadb/search/search/find.py +33 -31
- nucliadb/search/search/find_merge.py +124 -331
- nucliadb/search/search/graph_strategy.py +14 -12
- nucliadb/search/search/hydrator/__init__.py +3 -152
- nucliadb/search/search/hydrator/fields.py +92 -50
- nucliadb/search/search/hydrator/images.py +7 -7
- nucliadb/search/search/hydrator/paragraphs.py +42 -26
- nucliadb/search/search/hydrator/resources.py +20 -16
- nucliadb/search/search/ingestion_agents.py +5 -5
- nucliadb/search/search/merge.py +90 -94
- nucliadb/search/search/metrics.py +10 -9
- nucliadb/search/search/paragraphs.py +7 -9
- nucliadb/search/search/predict_proxy.py +13 -9
- nucliadb/search/search/query.py +14 -86
- nucliadb/search/search/query_parser/fetcher.py +51 -82
- nucliadb/search/search/query_parser/models.py +19 -20
- nucliadb/search/search/query_parser/old_filters.py +20 -19
- nucliadb/search/search/query_parser/parsers/ask.py +4 -5
- nucliadb/search/search/query_parser/parsers/catalog.py +5 -6
- nucliadb/search/search/query_parser/parsers/common.py +5 -6
- nucliadb/search/search/query_parser/parsers/find.py +6 -26
- nucliadb/search/search/query_parser/parsers/graph.py +13 -23
- nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
- nucliadb/search/search/query_parser/parsers/search.py +15 -53
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
- nucliadb/search/search/rank_fusion.py +18 -13
- nucliadb/search/search/rerankers.py +5 -6
- nucliadb/search/search/retrieval.py +300 -0
- nucliadb/search/search/summarize.py +5 -6
- nucliadb/search/search/utils.py +3 -4
- nucliadb/search/settings.py +1 -2
- nucliadb/standalone/api_router.py +1 -1
- nucliadb/standalone/app.py +4 -3
- nucliadb/standalone/auth.py +5 -6
- nucliadb/standalone/lifecycle.py +2 -2
- nucliadb/standalone/run.py +2 -4
- nucliadb/standalone/settings.py +5 -6
- nucliadb/standalone/versions.py +3 -4
- nucliadb/tasks/consumer.py +13 -8
- nucliadb/tasks/models.py +2 -1
- nucliadb/tasks/producer.py +3 -3
- nucliadb/tasks/retries.py +8 -7
- nucliadb/train/api/utils.py +1 -3
- nucliadb/train/api/v1/shards.py +1 -2
- nucliadb/train/api/v1/trainset.py +1 -2
- nucliadb/train/app.py +1 -1
- nucliadb/train/generator.py +4 -4
- nucliadb/train/generators/field_classifier.py +2 -2
- nucliadb/train/generators/field_streaming.py +6 -6
- nucliadb/train/generators/image_classifier.py +2 -2
- nucliadb/train/generators/paragraph_classifier.py +2 -2
- nucliadb/train/generators/paragraph_streaming.py +2 -2
- nucliadb/train/generators/question_answer_streaming.py +2 -2
- nucliadb/train/generators/sentence_classifier.py +2 -2
- nucliadb/train/generators/token_classifier.py +3 -2
- nucliadb/train/generators/utils.py +6 -5
- nucliadb/train/nodes.py +3 -3
- nucliadb/train/resource.py +6 -8
- nucliadb/train/settings.py +3 -4
- nucliadb/train/types.py +11 -11
- nucliadb/train/upload.py +3 -2
- nucliadb/train/uploader.py +1 -2
- nucliadb/train/utils.py +1 -2
- nucliadb/writer/api/v1/export_import.py +4 -1
- nucliadb/writer/api/v1/field.py +7 -11
- nucliadb/writer/api/v1/knowledgebox.py +3 -4
- nucliadb/writer/api/v1/resource.py +9 -20
- nucliadb/writer/api/v1/services.py +10 -132
- nucliadb/writer/api/v1/upload.py +73 -72
- nucliadb/writer/app.py +8 -2
- nucliadb/writer/resource/basic.py +12 -15
- nucliadb/writer/resource/field.py +7 -5
- nucliadb/writer/resource/origin.py +7 -0
- nucliadb/writer/settings.py +2 -3
- nucliadb/writer/tus/__init__.py +2 -3
- nucliadb/writer/tus/azure.py +1 -3
- nucliadb/writer/tus/dm.py +3 -3
- nucliadb/writer/tus/exceptions.py +3 -4
- nucliadb/writer/tus/gcs.py +5 -6
- nucliadb/writer/tus/s3.py +2 -3
- nucliadb/writer/tus/storage.py +3 -3
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +9 -10
- nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
- nucliadb/common/datamanagers/entities.py +0 -139
- nucliadb-6.9.1.post5192.dist-info/RECORD +0 -392
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
nucliadb/search/search/hydrator/resources.py CHANGED

@@ -20,11 +20,16 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 
-
-from nucliadb.common.models_utils import from_proto
 from nucliadb.ingest.orm.resource import Resource
+from nucliadb.models.internal.augment import (
+    ResourceOrigin,
+    ResourceProp,
+    ResourceSecurity,
+    ResourceSummary,
+    ResourceTitle,
+)
+from nucliadb.search.augmentor.resources import db_augment_resource
 from nucliadb_models import hydration as hydration_models
-from nucliadb_models.security import ResourceSecurity
 
 
 async def hydrate_resource(
@@ -35,22 +40,21 @@ async def hydrate_resource(
     slug = basic.slug
     hydrated = hydration_models.HydratedResource(id=rid, slug=slug)
 
+    select: list[ResourceProp] = []
     if config.title:
-
+        select.append(ResourceTitle())
     if config.summary:
-
-
+        select.append(ResourceSummary())
+    if config.origin:
+        select.append(ResourceOrigin())
     if config.security:
-
-        hydrated.security = ResourceSecurity(access_groups=[])
-        if security is not None:
-            for group_id in security.access_groups:
-                hydrated.security.access_groups.append(group_id)
+        select.append(ResourceSecurity())
 
-
-
-
-
-
+    augmented = await db_augment_resource(resource, select)
+
+    hydrated.title = augmented.title
+    hydrated.summary = augmented.summary
+    hydrated.origin = augmented.origin
+    hydrated.security = augmented.security
 
     return hydrated
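The hydrate_resource rewrite above replaces per-property copying with a declarative selection: callers accumulate ResourceProp selectors and resolve them through a single db_augment_resource call. A minimal sketch of the calling pattern, assuming db_augment_resource returns an object exposing the selected properties (the selector classes come from the diff; the config object and return shape are assumptions):

# Sketch only: selector classes are real, the config shape is assumed.
from nucliadb.models.internal.augment import (
    ResourceOrigin,
    ResourceProp,
    ResourceSecurity,
    ResourceSummary,
    ResourceTitle,
)
from nucliadb.search.augmentor.resources import db_augment_resource


async def hydrate(resource, config):
    # Build the list of properties to fetch instead of fetching each inline
    select: list[ResourceProp] = []
    if config.title:
        select.append(ResourceTitle())
    if config.summary:
        select.append(ResourceSummary())
    if config.origin:
        select.append(ResourceOrigin())
    if config.security:
        select.append(ResourceSecurity())
    # One augment call resolves everything that was selected
    return await db_augment_resource(resource, select)

Collecting the selection first means a new property only needs a new selector class, not another hydration branch.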
nucliadb/search/search/ingestion_agents.py CHANGED

@@ -19,10 +19,10 @@
 #
 import asyncio
 from base64 import b64encode
-from typing import Optional
 
 from nucliadb.common import datamanagers
 from nucliadb.ingest.fields.base import Field
+from nucliadb.ingest.orm.resource import Resource
 from nucliadb.search.predict_models import (
     FieldInfo,
     NameOperationFilter,
@@ -40,8 +40,8 @@ async def run_agents(
     kbid: str,
     rid: str,
     user_id: str,
-    filters:
-    agent_ids:
+    filters: list[AgentsFilter] | None = None,
+    agent_ids: list[str] | None = None,
 ) -> RunAgentsResponse:
     fields = await fetch_resource_fields(kbid, rid)
 
@@ -56,7 +56,7 @@ async def run_agents(
     return await predict.run_agents(kbid, item)
 
 
-def _parse_filters(filters:
+def _parse_filters(filters: list[AgentsFilter] | None) -> list[NameOperationFilter] | None:
     if filters is None:
         return None
     return [
@@ -69,7 +69,7 @@ def _parse_filters(filters: Optional[list[AgentsFilter]]) -> Optional[list[NameO
 
 async def fetch_resource_fields(kbid: str, rid: str) -> list[FieldInfo]:
     async with datamanagers.with_ro_transaction() as txn:
-        resource = await
+        resource = await Resource.get(txn, kbid=kbid, rid=rid)
     if resource is None:
         raise ResourceNotFoundError()
     fields = await resource.get_fields(force=True)
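A change that recurs across nearly every file in this release (visible here and in the merge.py, metrics.py, paragraphs.py, and predict_proxy.py hunks below) is dropping typing.Optional in favor of PEP 604 union syntax. The two spellings are interchangeable at runtime on Python 3.10+:

from typing import Optional

# Old style, as removed throughout this diff
def run_old(agent_ids: Optional[list[str]] = None) -> Optional[str]: ...

# New style, as added (PEP 604, Python 3.10+)
def run_new(agent_ids: list[str] | None = None) -> str | None: ...

# The annotations compare equal, so type checkers treat them identically
assert Optional[str] == (str | None)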
nucliadb/search/search/merge.py CHANGED
@@ -20,7 +20,8 @@
 import asyncio
 import datetime
 import math
-from
+from collections.abc import Iterable
+from typing import Any
 
 from nidx_protos.nodereader_pb2 import (
     DocumentResult,
@@ -37,7 +38,6 @@ from nidx_protos.nodereader_pb2 import (
 from nucliadb.common.ids import FieldId, ParagraphId
 from nucliadb.common.models_utils import from_proto
 from nucliadb.common.models_utils.from_proto import RelationTypePbMap
-from nucliadb.search.search import cache
 from nucliadb.search.search.cut import cut_page
 from nucliadb.search.search.fetch import (
     fetch_resources,
@@ -80,7 +80,7 @@ from .paragraphs import get_paragraph_text, get_text_sentence
 Bm25Score = tuple[float, float]
 TimestampScore = datetime.datetime
 TitleScore = str
-SortValue =
+SortValue = Bm25Score | TimestampScore | TitleScore
 
 
 def relation_node_type_to_entity_type(node_type: RelationNode.NodeType.ValueType) -> EntityType:
@@ -101,47 +101,17 @@ def entity_type_to_relation_node_type(node_type: EntityType) -> RelationNode.Nod
 }[node_type]
 
 
-def sort_results_by_score(results:
+def sort_results_by_score(results: list[ParagraphResult] | list[DocumentResult]):
     results.sort(key=lambda x: (x.score.bm25, x.score.booster), reverse=True)
 
 
-async def get_sort_value(
-    item: Union[DocumentResult, ParagraphResult],
-    sort_field: SortField,
-    kbid: str,
-) -> Optional[SortValue]:
-    """Returns the score for given `item` and `sort_field`. If the resource is being
-    deleted, it might appear on search results but not in maindb. In this
-    specific case, return None.
-    """
-    if sort_field == SortField.SCORE:
-        return (item.score.bm25, item.score.booster)
-
-    score: Any = None
-    resource = await cache.get_resource(kbid, item.uuid)
-    if resource is None:
-        return score
-
-    basic = await resource.get_basic()
-    if basic is None:
-        return score
-
-    if sort_field == SortField.CREATED:
-        score = basic.created.ToDatetime()
-    elif sort_field == SortField.MODIFIED:
-        score = basic.modified.ToDatetime()
-    elif sort_field == SortField.TITLE:
-        score = basic.title
-
-    return score
-
-
 async def merge_documents_results(
     kbid: str,
     responses: list[DocumentSearchResponse],
     *,
     query: FulltextQuery,
     top_k: int,
+    offset: int,
 ) -> tuple[Resources, list[str]]:
     raw_resource_list: list[tuple[DocumentResult, SortValue]] = []
     facets: dict[str, Any] = {}
@@ -159,14 +129,22 @@ async def merge_documents_results(
         if document_response.next_page:
             next_page = True
         for result in document_response.results:
-            sort_value
+            sort_value: SortValue
+            if query.order_by == SortField.SCORE:
+                sort_value = (result.score.bm25, result.score.booster)
+            else:
+                sort_value = result.date.ToDatetime()
             if sort_value is not None:
                 raw_resource_list.append((result, sort_value))
+
         total += document_response.total
 
     # We need to cut first and then sort, otherwise the page will be wrong if the order is DESC
-    raw_resource_list, has_more = cut_page(raw_resource_list, top_k)
+    raw_resource_list, has_more = cut_page(raw_resource_list[offset:], top_k)
     next_page = next_page or has_more
+
+    # Sort the list by score. It's important that this sort is stable, so the
+    # ordering of results with same scores accross multiple shards doesn't change
     raw_resource_list.sort(key=lambda x: x[1], reverse=(query.sort == SortOrder.DESC))
 
     result_resource_ids = []
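The merge above now takes an offset: entries already served on previous pages are sliced off before cut_page keeps the next top_k, and the final ordering relies on Python's stable list.sort so equal scores keep a deterministic cross-shard order. A self-contained illustration of the slice-cut-sort behavior (cut_page here is a stand-in for nucliadb's helper):

# Stand-in for nucliadb's cut_page: keep the first top_k, report if more remain
def cut_page(items: list, top_k: int) -> tuple[list, bool]:
    return items[:top_k], len(items) > top_k

shard_a = [("a1", 0.9), ("a2", 0.5)]
shard_b = [("b1", 0.9), ("b2", 0.7)]
merged = shard_a + shard_b

# Skip one previously served result (offset=1), then keep a page of two
page, has_more = cut_page(merged[1:], top_k=2)

# list.sort is stable: equal scores keep their shard-merge order,
# so repeated queries paginate consistently across shards
page.sort(key=lambda x: x[1], reverse=True)
print(page, has_more)  # [('b1', 0.9), ('a2', 0.5)] True

Per the comment retained in the diff, cutting before the descending sort is what keeps page boundaries correct.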
@@ -270,7 +248,7 @@ async def merge_vectors_results(
     resources: list[str],
     kbid: str,
     top_k: int,
-    min_score:
+    min_score: float | None = None,
 ) -> Sentences:
     facets: dict[str, Any] = {}
     raw_vectors_list: list[DocumentScored] = []
@@ -350,12 +328,13 @@ async def merge_paragraph_results(
     highlight: bool,
     sort: SortOptions,
     min_score: float,
+    offset: int,
 ) -> tuple[Paragraphs, list[str]]:
     raw_paragraph_list: list[tuple[ParagraphResult, SortValue]] = []
     facets: dict[str, Any] = {}
     query = None
     next_page = False
-    ematches:
+    ematches: list[str] | None = None
     total = 0
     for paragraph_response in paragraph_responses:
         if ematches is None:
@@ -373,66 +352,31 @@ async def merge_paragraph_results(
         if paragraph_response.next_page:
             next_page = True
         for result in paragraph_response.results:
-
-            if
-
+            sort_value: SortValue
+            if sort.field == SortField.SCORE:
+                sort_value = (result.score.bm25, result.score.booster)
+            else:
+                sort_value = result.date.ToDatetime()
+            if sort_value is not None:
+                raw_paragraph_list.append((result, sort_value))
+
         total += paragraph_response.total
 
+    # Sort the list by score. It's important that this sort is stable, so the
+    # ordering of results with same scores accross multiple shards doesn't change
     raw_paragraph_list.sort(key=lambda x: x[1], reverse=(sort.order == SortOrder.DESC))
 
-    raw_paragraph_list, has_more = cut_page(raw_paragraph_list, top_k)
+    raw_paragraph_list, has_more = cut_page(raw_paragraph_list[offset:], top_k)
     next_page = next_page or has_more
 
     result_resource_ids = []
-    result_paragraph_list: list[Paragraph] =
-
-
-
-
-
-                field_id=FieldId(
-                    rid=result.uuid,
-                    type=field_type,
-                    key=field,
-                    subfield_id=result.split,
-                ),
-                paragraph_start=result.start,
-                paragraph_end=result.end,
-            ),
-            highlight=highlight,
-            ematches=ematches,
-            matches=result.matches,  # type: ignore
-        )
-        labels = await get_labels_paragraph(result, kbid)
-        fuzzy_result = len(result.matches) > 0
-        new_paragraph = Paragraph(
-            score=result.score.bm25,
-            rid=result.uuid,
-            field_type=field_type,
-            field=field,
-            text=text,
-            labels=labels,
-            position=TextPosition(
-                index=result.metadata.position.index,
-                start=result.metadata.position.start,
-                end=result.metadata.position.end,
-                page_number=result.metadata.position.page_number,
-            ),
-            fuzzy_result=fuzzy_result,
-        )
-        if len(result.metadata.position.start_seconds) or len(result.metadata.position.end_seconds):
-            new_paragraph.start_seconds = list(result.metadata.position.start_seconds)
-            new_paragraph.end_seconds = list(result.metadata.position.end_seconds)
-        else:
-            # TODO: Remove once we are sure all data has been migrated!
-            seconds_positions = await get_seconds_paragraph(result, kbid)
-            if seconds_positions is not None:
-                new_paragraph.start_seconds = seconds_positions[0]
-                new_paragraph.end_seconds = seconds_positions[1]
+    result_paragraph_list: list[Paragraph] = await asyncio.gather(
+        *(load_paragraph(result, kbid, highlight, ematches) for result, _ in raw_paragraph_list)
+    )
+    for paragraph in result_paragraph_list:
+        if paragraph.rid not in result_resource_ids:
+            result_resource_ids.append(paragraph.rid)
 
-        result_paragraph_list.append(new_paragraph)
-        if new_paragraph.rid not in result_resource_ids:
-            result_resource_ids.append(new_paragraph.rid)
     return Paragraphs(
         results=result_paragraph_list,
         facets=facets,
@@ -445,6 +389,56 @@ async def merge_paragraph_results(
     ), result_resource_ids
 
 
+async def load_paragraph(
+    result: ParagraphResult, kbid: str, highlight: bool, ematches: list[str] | None
+) -> Paragraph:
+    _, field_type, field = result.field.split("/")
+    text = await get_paragraph_text(
+        kbid=kbid,
+        paragraph_id=ParagraphId(
+            field_id=FieldId(
+                rid=result.uuid,
+                type=field_type,
+                key=field,
+                subfield_id=result.split,
+            ),
+            paragraph_start=result.start,
+            paragraph_end=result.end,
+        ),
+        highlight=highlight,
+        ematches=ematches,
+        matches=result.matches,  # type: ignore
+    )
+    labels = await get_labels_paragraph(result, kbid)
+    fuzzy_result = len(result.matches) > 0
+    new_paragraph = Paragraph(
+        score=result.score.bm25,
+        rid=result.uuid,
+        field_type=field_type,
+        field=field,
+        text=text,
+        labels=labels,
+        position=TextPosition(
+            index=result.metadata.position.index,
+            start=result.metadata.position.start,
+            end=result.metadata.position.end,
+            page_number=result.metadata.position.page_number,
+        ),
+        fuzzy_result=fuzzy_result,
+    )
+    if len(result.metadata.position.start_seconds) or len(result.metadata.position.end_seconds):
+        new_paragraph.start_seconds = list(result.metadata.position.start_seconds)
+        new_paragraph.end_seconds = list(result.metadata.position.end_seconds)
+    else:
+        # TODO: Remove once we are sure all data has been migrated!
+        seconds_positions = await get_seconds_paragraph(result, kbid)
+        if seconds_positions is not None:
+            new_paragraph.start_seconds = seconds_positions[0]
+            new_paragraph.end_seconds = seconds_positions[1]
+
+    return new_paragraph
+
+
 @merge_observer.wrap({"type": "merge_relations"})
 async def merge_relations_results(
     graph_responses: list[GraphSearchResponse],
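Factoring the loop body into load_paragraph is what lets merge_paragraph_results above hydrate every paragraph concurrently with asyncio.gather instead of awaiting each text fetch in sequence. The shape of that change in isolation (load_one stands in for load_paragraph):

import asyncio

async def load_one(item: int) -> str:
    await asyncio.sleep(0.01)  # stands in for get_paragraph_text and friends
    return f"paragraph-{item}"

async def main() -> None:
    items = [1, 2, 3]
    # Sequential: total latency is the sum of each await
    sequential = [await load_one(i) for i in items]
    # Concurrent: the awaits overlap, and results keep input order
    concurrent = await asyncio.gather(*(load_one(i) for i in items))
    assert sequential == concurrent

asyncio.run(main())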
@@ -520,6 +514,7 @@ async def merge_results(
     show: list[ResourceProperties],
     field_type_filter: list[FieldTypeName],
     extracted: list[ExtractedDataTypeName],
+    offset: int,
     highlight: bool = False,
 ) -> KnowledgeboxSearchResults:
     paragraphs = []
@@ -543,6 +538,7 @@ async def merge_results(
             documents,
             query=retrieval.query.fulltext,
             top_k=retrieval.top_k,
+            offset=offset,
         )
         resources.extend(matched_resources)
 
@@ -550,7 +546,6 @@ async def merge_results(
         sort = SortOptions(
             field=retrieval.query.keyword.order_by,
             order=retrieval.query.keyword.sort,
-            limit=None,  # unused
         )
         api_results.paragraphs, matched_resources = await merge_paragraph_results(
             kbid,
@@ -559,6 +554,7 @@ async def merge_results(
             highlight,
             sort,
             min_score=retrieval.query.keyword.min_score,
+            offset=offset,
         )
         resources.extend(matched_resources)
 
@@ -601,9 +597,9 @@ async def merge_paragraphs_results(
         sort=SortOptions(
             field=SortField.SCORE,
             order=SortOrder.DESC,
-            limit=None,
         ),
         min_score=min_score,
+        offset=0,
     )
     return api_results
 
@@ -611,7 +607,7 @@ async def merge_paragraphs_results(
 async def merge_suggest_entities_results(
     suggest_responses: list[SuggestResponse],
 ) -> RelatedEntities:
-    unique_entities:
+    unique_entities: set[RelatedEntity] = set()
     for response in suggest_responses:
         response_entities = (
             RelatedEntity(family=e.subtype, value=e.value) for e in response.entity_results.nodes
nucliadb/search/search/metrics.py CHANGED

@@ -19,7 +19,7 @@
 #
 import contextlib
 import time
-from typing import Any
+from typing import Any
 
 from nucliadb_telemetry import metrics
 
@@ -27,6 +27,7 @@ merge_observer = metrics.Observer("merge_results", labels={"type": ""})
 node_features = metrics.Counter("nucliadb_node_features", labels={"type": ""})
 query_parse_dependency_observer = metrics.Observer("query_parse_dependency", labels={"type": ""})
 query_parser_observer = metrics.Observer("nucliadb_query_parser", labels={"type": ""})
+search_observer = metrics.Observer("nucliadb_search", labels={"type": ""})
 
 buckets = [
     0.005,
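The new search_observer follows this module's existing pattern: a labeled nucliadb_telemetry Observer whose wrap decorator times a coroutine, exactly as merge_observer.wrap({"type": "merge_relations"}) is used elsewhere in this diff. A usage sketch (the decorated endpoint name is hypothetical):

# Usage sketch mirroring the .wrap decorator visible in this diff.
from nucliadb_telemetry import metrics

search_observer = metrics.Observer("nucliadb_search", labels={"type": ""})

@search_observer.wrap({"type": "find"})
async def find(kbid: str) -> None:
    ...  # the observer records this coroutine's duration under type="find"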
@@ -62,7 +63,7 @@ rag_histogram = metrics.Histogram(
     buckets=buckets,
 )
 
-MetricsData = dict[str,
+MetricsData = dict[str, int | float]
 
 
 class Metrics:
@@ -86,10 +87,10 @@ class Metrics:
         self.child_spans.append(child_span)
         return child_span
 
-    def set(self, key: str, value:
+    def set(self, key: str, value: int | float):
         self._metrics[key] = value
 
-    def get(self, key: str) ->
+    def get(self, key: str) -> int | float | None:
         return self._metrics.get(key)
 
     def to_dict(self) -> MetricsData:
@@ -102,7 +103,7 @@ class Metrics:
         result[self.id] = self.to_dict()
         return result
 
-    def __getitem__(self, key: str) ->
+    def __getitem__(self, key: str) -> int | float:
         return self._metrics[key]
 
 
@@ -110,8 +111,8 @@ class AskMetrics(Metrics):
     def __init__(self: "AskMetrics"):
         super().__init__(id="ask")
         self.global_start = time.monotonic()
-        self.first_chunk_yielded_at:
-        self.first_reasoning_chunk_yielded_at:
+        self.first_chunk_yielded_at: float | None = None
+        self.first_reasoning_chunk_yielded_at: float | None = None
 
     def record_first_chunk_yielded(self):
         self.first_chunk_yielded_at = time.monotonic()
@@ -123,12 +124,12 @@ class AskMetrics(Metrics):
             self.first_reasoning_chunk_yielded_at - self.global_start
         )
 
-    def get_first_chunk_time(self) ->
+    def get_first_chunk_time(self) -> float | None:
         if self.first_chunk_yielded_at is None:
             return None
         return self.first_chunk_yielded_at - self.global_start
 
-    def get_first_reasoning_chunk_time(self) ->
+    def get_first_reasoning_chunk_time(self) -> float | None:
         if self.first_reasoning_chunk_yielded_at is None:
             return None
         return self.first_reasoning_chunk_yielded_at - self.global_start
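AskMetrics measures time-to-first-chunk by stamping time.monotonic() at construction and again when the first chunk is yielded; the getters return the delta, or None when no chunk ever arrived. The same logic, reduced to a standalone class:

import time

class FirstChunkTimer:
    """Minimal reduction of the AskMetrics timing logic above."""

    def __init__(self) -> None:
        self.start = time.monotonic()
        self.first_chunk_at: float | None = None

    def record_first_chunk(self) -> None:
        if self.first_chunk_at is None:  # only the first chunk counts
            self.first_chunk_at = time.monotonic()

    def time_to_first_chunk(self) -> float | None:
        if self.first_chunk_at is None:
            return None  # the stream never produced a chunk
        return self.first_chunk_at - self.start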
nucliadb/search/search/paragraphs.py CHANGED

@@ -20,7 +20,6 @@
 import logging
 import re
 import string
-from typing import Optional
 
 from nucliadb.common.ids import FIELD_TYPE_STR_TO_PB, ParagraphId
 from nucliadb.ingest.fields.base import Field
@@ -58,7 +57,7 @@ async def get_paragraph_from_full_text(
     field: Field,
     start: int,
     end: int,
-    split:
+    split: str | None = None,
     log_on_missing_field: bool = True,
 ) -> str:
     """
@@ -90,11 +89,10 @@ async def get_paragraph_text(
     kbid: str,
     paragraph_id: ParagraphId,
     highlight: bool = False,
-    ematches:
-    matches:
-    orm_resource:
-
-    ] = None,  # allow passing in orm_resource to avoid extra DB calls or txn issues
+    ematches: list[str] | None = None,
+    matches: list[str] | None = None,
+    orm_resource: None
+    | (ResourceORM) = None,  # allow passing in orm_resource to avoid extra DB calls or txn issues
     log_on_missing_field: bool = True,
 ) -> str:
     rid = paragraph_id.rid
@@ -139,7 +137,7 @@ async def get_text_sentence(
     index: int,
     start: int,
     end: int,
-    split:
+    split: str | None = None,
 ) -> str:
     """
     Leave separated from get paragraph for now until we understand the differences
@@ -169,7 +167,7 @@ async def get_text_sentence(
 
 
 def highlight_paragraph(
-    text: str, words:
+    text: str, words: list[str] | None = None, ematches: list[str] | None = None
 ) -> str:
     """
     Highlight `text` with <mark></mark> tags around the words in `words` and `ematches`.
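highlight_paragraph's docstring promises <mark></mark> tags around words from `words` and `ematches`. A rough sketch of that contract, not the module's actual implementation (which also distinguishes exact matches and handles punctuation via the re and string imports above):

import re

def highlight(text: str, words: list[str] | None = None) -> str:
    # Illustrative only: wrap whole-word, case-insensitive matches in <mark> tags
    for word in words or []:
        pattern = re.compile(rf"\b({re.escape(word)})\b", re.IGNORECASE)
        text = pattern.sub(r"<mark>\1</mark>", text)
    return text

print(highlight("The quick brown fox", words=["quick", "fox"]))
# The <mark>quick</mark> brown <mark>fox</mark>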
nucliadb/search/search/predict_proxy.py CHANGED

@@ -19,7 +19,7 @@
 #
 import json
 from enum import Enum
-from typing import Any
+from typing import Any
 
 import aiohttp
 from fastapi.datastructures import QueryParams
@@ -78,9 +78,9 @@ async def predict_proxy(
     user_id: str,
     client_type: NucliaDBClientType,
     origin: str,
-    json:
+    json: Any | None = None,
     headers: dict[str, str] = {},
-) ->
+) -> Response | StreamingResponse:
     if not await exists_kb(kbid=kbid):
         raise datamanagers.exceptions.KnowledgeBoxNotFound()
 
@@ -99,11 +99,15 @@ async def predict_proxy(
     )
 
     status_code = predict_response.status
+
+    # Only audit /predict/chat successful responses
+    should_audit = endpoint == PredictProxiedEndpoints.CHAT and 200 <= status_code < 300
+
     media_type = predict_response.headers.get("Content-Type")
-    response:
+    response: Response | StreamingResponse
     user_query = json.get("question") if json is not None else ""
     if predict_response.headers.get("Transfer-Encoding") == "chunked":
-        if
+        if should_audit:
             streaming_generator = chat_streaming_generator(
                 predict_response=predict_response,
                 kbid=kbid,
@@ -126,7 +130,7 @@ async def predict_proxy(
     with metrics.time(PREDICT_ANSWER_METRIC):
         content = await predict_response.read()
 
-    if
+    if should_audit:
         try:
             llm_status_code = int(content[-1:].decode())  # Decode just the last char
             if llm_status_code != 0:
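Hoisting should_audit into one predicate (chat endpoint and a 2xx status) lets the streaming and buffered branches above share the same audit decision instead of duplicating the condition inline. The predicate in isolation, with a reduced stand-in for the endpoint enum:

from enum import Enum

class PredictProxiedEndpoints(str, Enum):  # reduced stand-in for the real enum
    CHAT = "chat"
    REPHRASE = "rephrase"

def should_audit(endpoint: PredictProxiedEndpoints, status_code: int) -> bool:
    # Only audit /predict/chat successful responses
    return endpoint == PredictProxiedEndpoints.CHAT and 200 <= status_code < 300

assert should_audit(PredictProxiedEndpoints.CHAT, 200)
assert not should_audit(PredictProxiedEndpoints.CHAT, 500)
assert not should_audit(PredictProxiedEndpoints.REPHRASE, 200)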
@@ -250,10 +254,10 @@ def audit_predict_proxy_endpoint(
     client_type: NucliaDBClientType,
     origin: str,
     text_answer: bytes,
-    text_reasoning:
+    text_reasoning: str | None,
     generative_answer_time: float,
-    generative_answer_first_chunk_time:
-    generative_reasoning_first_chunk_time:
+    generative_answer_first_chunk_time: float | None,
+    generative_reasoning_first_chunk_time: float | None,
     status_code: AnswerStatusCode,
 ):
     maybe_audit_chat(
|