nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
Files changed (231)
  1. migrations/0023_backfill_pg_catalog.py +2 -2
  2. migrations/0029_backfill_field_status.py +3 -4
  3. migrations/0032_remove_old_relations.py +2 -3
  4. migrations/0038_backfill_catalog_field_labels.py +2 -2
  5. migrations/0039_backfill_converation_splits_metadata.py +2 -2
  6. migrations/0041_reindex_conversations.py +137 -0
  7. migrations/pg/0010_shards_index.py +34 -0
  8. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  9. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  10. nucliadb/backups/create.py +2 -15
  11. nucliadb/backups/restore.py +4 -15
  12. nucliadb/backups/tasks.py +4 -1
  13. nucliadb/common/back_pressure/cache.py +2 -3
  14. nucliadb/common/back_pressure/materializer.py +7 -13
  15. nucliadb/common/back_pressure/settings.py +6 -6
  16. nucliadb/common/back_pressure/utils.py +1 -0
  17. nucliadb/common/cache.py +9 -9
  18. nucliadb/common/catalog/interface.py +12 -12
  19. nucliadb/common/catalog/pg.py +41 -29
  20. nucliadb/common/catalog/utils.py +3 -3
  21. nucliadb/common/cluster/manager.py +5 -4
  22. nucliadb/common/cluster/rebalance.py +483 -114
  23. nucliadb/common/cluster/rollover.py +25 -9
  24. nucliadb/common/cluster/settings.py +3 -8
  25. nucliadb/common/cluster/utils.py +34 -8
  26. nucliadb/common/context/__init__.py +7 -8
  27. nucliadb/common/context/fastapi.py +1 -2
  28. nucliadb/common/datamanagers/__init__.py +2 -4
  29. nucliadb/common/datamanagers/atomic.py +4 -2
  30. nucliadb/common/datamanagers/cluster.py +1 -2
  31. nucliadb/common/datamanagers/fields.py +3 -4
  32. nucliadb/common/datamanagers/kb.py +6 -6
  33. nucliadb/common/datamanagers/labels.py +2 -3
  34. nucliadb/common/datamanagers/resources.py +10 -33
  35. nucliadb/common/datamanagers/rollover.py +5 -7
  36. nucliadb/common/datamanagers/search_configurations.py +1 -2
  37. nucliadb/common/datamanagers/synonyms.py +1 -2
  38. nucliadb/common/datamanagers/utils.py +4 -4
  39. nucliadb/common/datamanagers/vectorsets.py +4 -4
  40. nucliadb/common/external_index_providers/base.py +32 -5
  41. nucliadb/common/external_index_providers/manager.py +4 -5
  42. nucliadb/common/filter_expression.py +128 -40
  43. nucliadb/common/http_clients/processing.py +12 -23
  44. nucliadb/common/ids.py +6 -4
  45. nucliadb/common/locking.py +1 -2
  46. nucliadb/common/maindb/driver.py +9 -8
  47. nucliadb/common/maindb/local.py +5 -5
  48. nucliadb/common/maindb/pg.py +9 -8
  49. nucliadb/common/nidx.py +3 -4
  50. nucliadb/export_import/datamanager.py +4 -3
  51. nucliadb/export_import/exporter.py +11 -19
  52. nucliadb/export_import/importer.py +13 -6
  53. nucliadb/export_import/tasks.py +2 -0
  54. nucliadb/export_import/utils.py +6 -18
  55. nucliadb/health.py +2 -2
  56. nucliadb/ingest/app.py +8 -8
  57. nucliadb/ingest/consumer/consumer.py +8 -10
  58. nucliadb/ingest/consumer/pull.py +3 -8
  59. nucliadb/ingest/consumer/service.py +3 -3
  60. nucliadb/ingest/consumer/utils.py +1 -1
  61. nucliadb/ingest/fields/base.py +28 -49
  62. nucliadb/ingest/fields/conversation.py +12 -12
  63. nucliadb/ingest/fields/exceptions.py +1 -2
  64. nucliadb/ingest/fields/file.py +22 -8
  65. nucliadb/ingest/fields/link.py +7 -7
  66. nucliadb/ingest/fields/text.py +2 -3
  67. nucliadb/ingest/orm/brain_v2.py +78 -64
  68. nucliadb/ingest/orm/broker_message.py +2 -4
  69. nucliadb/ingest/orm/entities.py +10 -209
  70. nucliadb/ingest/orm/index_message.py +4 -4
  71. nucliadb/ingest/orm/knowledgebox.py +18 -27
  72. nucliadb/ingest/orm/processor/auditing.py +1 -3
  73. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  74. nucliadb/ingest/orm/processor/processor.py +27 -27
  75. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  76. nucliadb/ingest/orm/resource.py +72 -70
  77. nucliadb/ingest/orm/utils.py +1 -1
  78. nucliadb/ingest/processing.py +17 -17
  79. nucliadb/ingest/serialize.py +202 -145
  80. nucliadb/ingest/service/writer.py +3 -109
  81. nucliadb/ingest/settings.py +3 -4
  82. nucliadb/ingest/utils.py +1 -2
  83. nucliadb/learning_proxy.py +11 -11
  84. nucliadb/metrics_exporter.py +5 -4
  85. nucliadb/middleware/__init__.py +82 -1
  86. nucliadb/migrator/datamanager.py +3 -4
  87. nucliadb/migrator/migrator.py +1 -2
  88. nucliadb/migrator/models.py +1 -2
  89. nucliadb/migrator/settings.py +1 -2
  90. nucliadb/models/internal/augment.py +614 -0
  91. nucliadb/models/internal/processing.py +19 -19
  92. nucliadb/openapi.py +2 -2
  93. nucliadb/purge/__init__.py +3 -8
  94. nucliadb/purge/orphan_shards.py +1 -2
  95. nucliadb/reader/__init__.py +5 -0
  96. nucliadb/reader/api/models.py +6 -13
  97. nucliadb/reader/api/v1/download.py +59 -38
  98. nucliadb/reader/api/v1/export_import.py +4 -4
  99. nucliadb/reader/api/v1/learning_config.py +24 -4
  100. nucliadb/reader/api/v1/resource.py +61 -9
  101. nucliadb/reader/api/v1/services.py +18 -14
  102. nucliadb/reader/app.py +3 -1
  103. nucliadb/reader/reader/notifications.py +1 -2
  104. nucliadb/search/api/v1/__init__.py +2 -0
  105. nucliadb/search/api/v1/ask.py +3 -4
  106. nucliadb/search/api/v1/augment.py +585 -0
  107. nucliadb/search/api/v1/catalog.py +11 -15
  108. nucliadb/search/api/v1/find.py +16 -22
  109. nucliadb/search/api/v1/hydrate.py +25 -25
  110. nucliadb/search/api/v1/knowledgebox.py +1 -2
  111. nucliadb/search/api/v1/predict_proxy.py +1 -2
  112. nucliadb/search/api/v1/resource/ask.py +7 -7
  113. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  114. nucliadb/search/api/v1/resource/search.py +9 -11
  115. nucliadb/search/api/v1/retrieve.py +130 -0
  116. nucliadb/search/api/v1/search.py +28 -32
  117. nucliadb/search/api/v1/suggest.py +11 -14
  118. nucliadb/search/api/v1/summarize.py +1 -2
  119. nucliadb/search/api/v1/utils.py +2 -2
  120. nucliadb/search/app.py +3 -2
  121. nucliadb/search/augmentor/__init__.py +21 -0
  122. nucliadb/search/augmentor/augmentor.py +232 -0
  123. nucliadb/search/augmentor/fields.py +704 -0
  124. nucliadb/search/augmentor/metrics.py +24 -0
  125. nucliadb/search/augmentor/paragraphs.py +334 -0
  126. nucliadb/search/augmentor/resources.py +238 -0
  127. nucliadb/search/augmentor/utils.py +33 -0
  128. nucliadb/search/lifecycle.py +3 -1
  129. nucliadb/search/predict.py +24 -17
  130. nucliadb/search/predict_models.py +8 -9
  131. nucliadb/search/requesters/utils.py +11 -10
  132. nucliadb/search/search/cache.py +19 -23
  133. nucliadb/search/search/chat/ask.py +88 -59
  134. nucliadb/search/search/chat/exceptions.py +3 -5
  135. nucliadb/search/search/chat/fetcher.py +201 -0
  136. nucliadb/search/search/chat/images.py +6 -4
  137. nucliadb/search/search/chat/old_prompt.py +1375 -0
  138. nucliadb/search/search/chat/parser.py +510 -0
  139. nucliadb/search/search/chat/prompt.py +563 -615
  140. nucliadb/search/search/chat/query.py +449 -36
  141. nucliadb/search/search/chat/rpc.py +85 -0
  142. nucliadb/search/search/fetch.py +3 -4
  143. nucliadb/search/search/filters.py +8 -11
  144. nucliadb/search/search/find.py +33 -31
  145. nucliadb/search/search/find_merge.py +124 -331
  146. nucliadb/search/search/graph_strategy.py +14 -12
  147. nucliadb/search/search/hydrator/__init__.py +3 -152
  148. nucliadb/search/search/hydrator/fields.py +92 -50
  149. nucliadb/search/search/hydrator/images.py +7 -7
  150. nucliadb/search/search/hydrator/paragraphs.py +42 -26
  151. nucliadb/search/search/hydrator/resources.py +20 -16
  152. nucliadb/search/search/ingestion_agents.py +5 -5
  153. nucliadb/search/search/merge.py +90 -94
  154. nucliadb/search/search/metrics.py +10 -9
  155. nucliadb/search/search/paragraphs.py +7 -9
  156. nucliadb/search/search/predict_proxy.py +13 -9
  157. nucliadb/search/search/query.py +14 -86
  158. nucliadb/search/search/query_parser/fetcher.py +51 -82
  159. nucliadb/search/search/query_parser/models.py +19 -20
  160. nucliadb/search/search/query_parser/old_filters.py +20 -19
  161. nucliadb/search/search/query_parser/parsers/ask.py +4 -5
  162. nucliadb/search/search/query_parser/parsers/catalog.py +5 -6
  163. nucliadb/search/search/query_parser/parsers/common.py +5 -6
  164. nucliadb/search/search/query_parser/parsers/find.py +6 -26
  165. nucliadb/search/search/query_parser/parsers/graph.py +13 -23
  166. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  167. nucliadb/search/search/query_parser/parsers/search.py +15 -53
  168. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  169. nucliadb/search/search/rank_fusion.py +18 -13
  170. nucliadb/search/search/rerankers.py +5 -6
  171. nucliadb/search/search/retrieval.py +300 -0
  172. nucliadb/search/search/summarize.py +5 -6
  173. nucliadb/search/search/utils.py +3 -4
  174. nucliadb/search/settings.py +1 -2
  175. nucliadb/standalone/api_router.py +1 -1
  176. nucliadb/standalone/app.py +4 -3
  177. nucliadb/standalone/auth.py +5 -6
  178. nucliadb/standalone/lifecycle.py +2 -2
  179. nucliadb/standalone/run.py +2 -4
  180. nucliadb/standalone/settings.py +5 -6
  181. nucliadb/standalone/versions.py +3 -4
  182. nucliadb/tasks/consumer.py +13 -8
  183. nucliadb/tasks/models.py +2 -1
  184. nucliadb/tasks/producer.py +3 -3
  185. nucliadb/tasks/retries.py +8 -7
  186. nucliadb/train/api/utils.py +1 -3
  187. nucliadb/train/api/v1/shards.py +1 -2
  188. nucliadb/train/api/v1/trainset.py +1 -2
  189. nucliadb/train/app.py +1 -1
  190. nucliadb/train/generator.py +4 -4
  191. nucliadb/train/generators/field_classifier.py +2 -2
  192. nucliadb/train/generators/field_streaming.py +6 -6
  193. nucliadb/train/generators/image_classifier.py +2 -2
  194. nucliadb/train/generators/paragraph_classifier.py +2 -2
  195. nucliadb/train/generators/paragraph_streaming.py +2 -2
  196. nucliadb/train/generators/question_answer_streaming.py +2 -2
  197. nucliadb/train/generators/sentence_classifier.py +2 -2
  198. nucliadb/train/generators/token_classifier.py +3 -2
  199. nucliadb/train/generators/utils.py +6 -5
  200. nucliadb/train/nodes.py +3 -3
  201. nucliadb/train/resource.py +6 -8
  202. nucliadb/train/settings.py +3 -4
  203. nucliadb/train/types.py +11 -11
  204. nucliadb/train/upload.py +3 -2
  205. nucliadb/train/uploader.py +1 -2
  206. nucliadb/train/utils.py +1 -2
  207. nucliadb/writer/api/v1/export_import.py +4 -1
  208. nucliadb/writer/api/v1/field.py +7 -11
  209. nucliadb/writer/api/v1/knowledgebox.py +3 -4
  210. nucliadb/writer/api/v1/resource.py +9 -20
  211. nucliadb/writer/api/v1/services.py +10 -132
  212. nucliadb/writer/api/v1/upload.py +73 -72
  213. nucliadb/writer/app.py +8 -2
  214. nucliadb/writer/resource/basic.py +12 -15
  215. nucliadb/writer/resource/field.py +7 -5
  216. nucliadb/writer/resource/origin.py +7 -0
  217. nucliadb/writer/settings.py +2 -3
  218. nucliadb/writer/tus/__init__.py +2 -3
  219. nucliadb/writer/tus/azure.py +1 -3
  220. nucliadb/writer/tus/dm.py +3 -3
  221. nucliadb/writer/tus/exceptions.py +3 -4
  222. nucliadb/writer/tus/gcs.py +5 -6
  223. nucliadb/writer/tus/s3.py +2 -3
  224. nucliadb/writer/tus/storage.py +3 -3
  225. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +9 -10
  226. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  227. nucliadb/common/datamanagers/entities.py +0 -139
  228. nucliadb-6.9.1.post5192.dist-info/RECORD +0 -392
  229. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  230. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  231. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0

nucliadb/search/search/hydrator/resources.py
@@ -20,11 +20,16 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 
-
-from nucliadb.common.models_utils import from_proto
 from nucliadb.ingest.orm.resource import Resource
+from nucliadb.models.internal.augment import (
+    ResourceOrigin,
+    ResourceProp,
+    ResourceSecurity,
+    ResourceSummary,
+    ResourceTitle,
+)
+from nucliadb.search.augmentor.resources import db_augment_resource
 from nucliadb_models import hydration as hydration_models
-from nucliadb_models.security import ResourceSecurity
 
 
 async def hydrate_resource(
@@ -35,22 +40,21 @@ async def hydrate_resource(
     slug = basic.slug
     hydrated = hydration_models.HydratedResource(id=rid, slug=slug)
 
+    select: list[ResourceProp] = []
     if config.title:
-        hydrated.title = basic.title
+        select.append(ResourceTitle())
     if config.summary:
-        hydrated.summary = basic.summary
-
+        select.append(ResourceSummary())
+    if config.origin:
+        select.append(ResourceOrigin())
     if config.security:
-        security = await resource.get_security()
-        hydrated.security = ResourceSecurity(access_groups=[])
-        if security is not None:
-            for group_id in security.access_groups:
-                hydrated.security.access_groups.append(group_id)
+        select.append(ResourceSecurity())
 
-    if config.origin:
-        origin = await resource.get_origin()
-        if origin is not None:
-            # TODO: we want a better hydration than proto to JSON
-            hydrated.origin = from_proto.origin(origin)
+    augmented = await db_augment_resource(resource, select)
+
+    hydrated.title = augmented.title
+    hydrated.summary = augmented.summary
+    hydrated.origin = augmented.origin
+    hydrated.security = augmented.security
 
     return hydrated
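
The hunks above replace per-property fetches with a declarative list of ResourceProp selectors resolved in a single db_augment_resource call. A minimal sketch of the resulting call pattern, using only names visible in the diff; the exact signature and return model of db_augment_resource are assumptions:

from nucliadb.models.internal.augment import ResourceProp, ResourceSecurity, ResourceTitle
from nucliadb.search.augmentor.resources import db_augment_resource

async def title_and_security(resource):
    # Select only the properties the caller needs; the augmentor resolves
    # them in one pass instead of one ad-hoc fetch per property.
    select: list[ResourceProp] = [ResourceTitle(), ResourceSecurity()]
    augmented = await db_augment_resource(resource, select)
    return augmented.title, augmented.security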

nucliadb/search/search/ingestion_agents.py
@@ -19,10 +19,10 @@
 #
 import asyncio
 from base64 import b64encode
-from typing import Optional
 
 from nucliadb.common import datamanagers
 from nucliadb.ingest.fields.base import Field
+from nucliadb.ingest.orm.resource import Resource
 from nucliadb.search.predict_models import (
     FieldInfo,
     NameOperationFilter,
@@ -40,8 +40,8 @@ async def run_agents(
     kbid: str,
     rid: str,
     user_id: str,
-    filters: Optional[list[AgentsFilter]] = None,
-    agent_ids: Optional[list[str]] = None,
+    filters: list[AgentsFilter] | None = None,
+    agent_ids: list[str] | None = None,
 ) -> RunAgentsResponse:
     fields = await fetch_resource_fields(kbid, rid)
 
@@ -56,7 +56,7 @@ async def run_agents(
     return await predict.run_agents(kbid, item)
 
 
-def _parse_filters(filters: Optional[list[AgentsFilter]]) -> Optional[list[NameOperationFilter]]:
+def _parse_filters(filters: list[AgentsFilter] | None) -> list[NameOperationFilter] | None:
     if filters is None:
         return None
     return [
@@ -69,7 +69,7 @@ def _parse_filters(filters: Optional[list[AgentsFilter]]) -> Optional[list[NameO
 
 async def fetch_resource_fields(kbid: str, rid: str) -> list[FieldInfo]:
     async with datamanagers.with_ro_transaction() as txn:
-        resource = await datamanagers.resources.get_resource(txn, kbid=kbid, rid=rid)
+        resource = await Resource.get(txn, kbid=kbid, rid=rid)
        if resource is None:
            raise ResourceNotFoundError()
        fields = await resource.get_fields(force=True)
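
Most of the annotation churn in this file, and throughout the rest of the diff, is a mechanical migration from typing.Optional and typing.Union to the PEP 604 union syntax available since Python 3.10. The two spellings are interchangeable at runtime; a quick self-contained check:

import typing

def old_style(filters: typing.Optional[list[str]] = None) -> typing.Union[int, float]:
    return 0

def new_style(filters: list[str] | None = None) -> int | float:
    return 0

# PEP 604 unions compare equal to their typing equivalents on 3.10+.
assert (int | None) == typing.Optional[int]
assert (int | float) == typing.Union[int, float]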

nucliadb/search/search/merge.py
@@ -20,7 +20,8 @@
 import asyncio
 import datetime
 import math
-from typing import Any, Iterable, Optional, Set, Union
+from collections.abc import Iterable
+from typing import Any
 
 from nidx_protos.nodereader_pb2 import (
     DocumentResult,
@@ -37,7 +38,6 @@ from nidx_protos.nodereader_pb2 import (
 from nucliadb.common.ids import FieldId, ParagraphId
 from nucliadb.common.models_utils import from_proto
 from nucliadb.common.models_utils.from_proto import RelationTypePbMap
-from nucliadb.search.search import cache
 from nucliadb.search.search.cut import cut_page
 from nucliadb.search.search.fetch import (
     fetch_resources,
@@ -80,7 +80,7 @@ from .paragraphs import get_paragraph_text, get_text_sentence
 Bm25Score = tuple[float, float]
 TimestampScore = datetime.datetime
 TitleScore = str
-SortValue = Union[Bm25Score, TimestampScore, TitleScore]
+SortValue = Bm25Score | TimestampScore | TitleScore
 
 
 def relation_node_type_to_entity_type(node_type: RelationNode.NodeType.ValueType) -> EntityType:
@@ -101,47 +101,17 @@ def entity_type_to_relation_node_type(node_type: EntityType) -> RelationNode.Nod
     }[node_type]
 
 
-def sort_results_by_score(results: Union[list[ParagraphResult], list[DocumentResult]]):
+def sort_results_by_score(results: list[ParagraphResult] | list[DocumentResult]):
     results.sort(key=lambda x: (x.score.bm25, x.score.booster), reverse=True)
 
 
-async def get_sort_value(
-    item: Union[DocumentResult, ParagraphResult],
-    sort_field: SortField,
-    kbid: str,
-) -> Optional[SortValue]:
-    """Returns the score for given `item` and `sort_field`. If the resource is being
-    deleted, it might appear on search results but not in maindb. In this
-    specific case, return None.
-    """
-    if sort_field == SortField.SCORE:
-        return (item.score.bm25, item.score.booster)
-
-    score: Any = None
-    resource = await cache.get_resource(kbid, item.uuid)
-    if resource is None:
-        return score
-
-    basic = await resource.get_basic()
-    if basic is None:
-        return score
-
-    if sort_field == SortField.CREATED:
-        score = basic.created.ToDatetime()
-    elif sort_field == SortField.MODIFIED:
-        score = basic.modified.ToDatetime()
-    elif sort_field == SortField.TITLE:
-        score = basic.title
-
-    return score
-
-
 async def merge_documents_results(
     kbid: str,
     responses: list[DocumentSearchResponse],
     *,
     query: FulltextQuery,
     top_k: int,
+    offset: int,
 ) -> tuple[Resources, list[str]]:
     raw_resource_list: list[tuple[DocumentResult, SortValue]] = []
     facets: dict[str, Any] = {}
@@ -159,14 +129,22 @@ async def merge_documents_results(
         if document_response.next_page:
             next_page = True
         for result in document_response.results:
-            sort_value = await get_sort_value(result, query.order_by, kbid)
+            sort_value: SortValue
+            if query.order_by == SortField.SCORE:
+                sort_value = (result.score.bm25, result.score.booster)
+            else:
+                sort_value = result.date.ToDatetime()
             if sort_value is not None:
                 raw_resource_list.append((result, sort_value))
+
         total += document_response.total
 
     # We need to cut first and then sort, otherwise the page will be wrong if the order is DESC
-    raw_resource_list, has_more = cut_page(raw_resource_list, top_k)
+    raw_resource_list, has_more = cut_page(raw_resource_list[offset:], top_k)
     next_page = next_page or has_more
+
+    # Sort the list by score. It's important that this sort is stable, so the
+    # ordering of results with same scores accross multiple shards doesn't change
     raw_resource_list.sort(key=lambda x: x[1], reverse=(query.sort == SortOrder.DESC))
 
     result_resource_ids = []
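
Two behavioral points in the merge.py hunk above are worth noting: get_sort_value, with its per-result maindb lookup through the resource cache, is gone because non-score orderings now read the timestamp directly from the index result's date field, and pagination is handled by slicing the merged candidate list at offset before cut_page keeps top_k entries. The new comments also lean on list.sort() being stable so that equal scores keep a deterministic cross-shard order. A toy illustration of both properties:

# Candidates from two shards, appended in shard order; the stable sort
# keeps the relative order of equal scores deterministic.
merged = [("shard1-a", 0.9), ("shard2-a", 0.9), ("shard1-b", 0.5)]
merged.sort(key=lambda r: r[1], reverse=True)
assert [r[0] for r in merged] == ["shard1-a", "shard2-a", "shard1-b"]

# Offset pagination, as in cut_page(raw_resource_list[offset:], top_k):
offset, top_k = 1, 2
page = merged[offset:][:top_k]
assert [r[0] for r in page] == ["shard2-a", "shard1-b"]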

@@ -270,7 +248,7 @@ async def merge_vectors_results(
     resources: list[str],
     kbid: str,
     top_k: int,
-    min_score: Optional[float] = None,
+    min_score: float | None = None,
 ) -> Sentences:
     facets: dict[str, Any] = {}
     raw_vectors_list: list[DocumentScored] = []
@@ -350,12 +328,13 @@ async def merge_paragraph_results(
     highlight: bool,
     sort: SortOptions,
     min_score: float,
+    offset: int,
 ) -> tuple[Paragraphs, list[str]]:
     raw_paragraph_list: list[tuple[ParagraphResult, SortValue]] = []
     facets: dict[str, Any] = {}
     query = None
     next_page = False
-    ematches: Optional[list[str]] = None
+    ematches: list[str] | None = None
     total = 0
     for paragraph_response in paragraph_responses:
         if ematches is None:
@@ -373,66 +352,31 @@ async def merge_paragraph_results(
         if paragraph_response.next_page:
             next_page = True
         for result in paragraph_response.results:
-            score = await get_sort_value(result, sort.field, kbid)
-            if score is not None:
-                raw_paragraph_list.append((result, score))
+            sort_value: SortValue
+            if sort.field == SortField.SCORE:
+                sort_value = (result.score.bm25, result.score.booster)
+            else:
+                sort_value = result.date.ToDatetime()
+            if sort_value is not None:
+                raw_paragraph_list.append((result, sort_value))
+
         total += paragraph_response.total
 
+    # Sort the list by score. It's important that this sort is stable, so the
+    # ordering of results with same scores accross multiple shards doesn't change
     raw_paragraph_list.sort(key=lambda x: x[1], reverse=(sort.order == SortOrder.DESC))
 
-    raw_paragraph_list, has_more = cut_page(raw_paragraph_list, top_k)
+    raw_paragraph_list, has_more = cut_page(raw_paragraph_list[offset:], top_k)
     next_page = next_page or has_more
 
     result_resource_ids = []
-    result_paragraph_list: list[Paragraph] = []
-    for result, _ in raw_paragraph_list:
-        _, field_type, field = result.field.split("/")
-        text = await get_paragraph_text(
-            kbid=kbid,
-            paragraph_id=ParagraphId(
-                field_id=FieldId(
-                    rid=result.uuid,
-                    type=field_type,
-                    key=field,
-                    subfield_id=result.split,
-                ),
-                paragraph_start=result.start,
-                paragraph_end=result.end,
-            ),
-            highlight=highlight,
-            ematches=ematches,
-            matches=result.matches,  # type: ignore
-        )
-        labels = await get_labels_paragraph(result, kbid)
-        fuzzy_result = len(result.matches) > 0
-        new_paragraph = Paragraph(
-            score=result.score.bm25,
-            rid=result.uuid,
-            field_type=field_type,
-            field=field,
-            text=text,
-            labels=labels,
-            position=TextPosition(
-                index=result.metadata.position.index,
-                start=result.metadata.position.start,
-                end=result.metadata.position.end,
-                page_number=result.metadata.position.page_number,
-            ),
-            fuzzy_result=fuzzy_result,
-        )
-        if len(result.metadata.position.start_seconds) or len(result.metadata.position.end_seconds):
-            new_paragraph.start_seconds = list(result.metadata.position.start_seconds)
-            new_paragraph.end_seconds = list(result.metadata.position.end_seconds)
-        else:
-            # TODO: Remove once we are sure all data has been migrated!
-            seconds_positions = await get_seconds_paragraph(result, kbid)
-            if seconds_positions is not None:
-                new_paragraph.start_seconds = seconds_positions[0]
-                new_paragraph.end_seconds = seconds_positions[1]
+    result_paragraph_list: list[Paragraph] = await asyncio.gather(
+        *(load_paragraph(result, kbid, highlight, ematches) for result, _ in raw_paragraph_list)
+    )
+    for paragraph in result_paragraph_list:
+        if paragraph.rid not in result_resource_ids:
+            result_resource_ids.append(paragraph.rid)
 
-        result_paragraph_list.append(new_paragraph)
-        if new_paragraph.rid not in result_resource_ids:
-            result_resource_ids.append(new_paragraph.rid)
     return Paragraphs(
         results=result_paragraph_list,
         facets=facets,
@@ -445,6 +389,56 @@ async def merge_paragraph_results(
     ), result_resource_ids
 
 
+async def load_paragraph(
+    result: ParagraphResult, kbid: str, highlight: bool, ematches: list[str] | None
+) -> Paragraph:
+    _, field_type, field = result.field.split("/")
+    text = await get_paragraph_text(
+        kbid=kbid,
+        paragraph_id=ParagraphId(
+            field_id=FieldId(
+                rid=result.uuid,
+                type=field_type,
+                key=field,
+                subfield_id=result.split,
+            ),
+            paragraph_start=result.start,
+            paragraph_end=result.end,
+        ),
+        highlight=highlight,
+        ematches=ematches,
+        matches=result.matches,  # type: ignore
+    )
+    labels = await get_labels_paragraph(result, kbid)
+    fuzzy_result = len(result.matches) > 0
+    new_paragraph = Paragraph(
+        score=result.score.bm25,
+        rid=result.uuid,
+        field_type=field_type,
+        field=field,
+        text=text,
+        labels=labels,
+        position=TextPosition(
+            index=result.metadata.position.index,
+            start=result.metadata.position.start,
+            end=result.metadata.position.end,
+            page_number=result.metadata.position.page_number,
+        ),
+        fuzzy_result=fuzzy_result,
+    )
+    if len(result.metadata.position.start_seconds) or len(result.metadata.position.end_seconds):
+        new_paragraph.start_seconds = list(result.metadata.position.start_seconds)
+        new_paragraph.end_seconds = list(result.metadata.position.end_seconds)
+    else:
+        # TODO: Remove once we are sure all data has been migrated!
+        seconds_positions = await get_seconds_paragraph(result, kbid)
+        if seconds_positions is not None:
+            new_paragraph.start_seconds = seconds_positions[0]
+            new_paragraph.end_seconds = seconds_positions[1]
+
+    return new_paragraph
+
+
 @merge_observer.wrap({"type": "merge_relations"})
 async def merge_relations_results(
     graph_responses: list[GraphSearchResponse],
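
Still in merge.py: hoisting the old loop body into the new load_paragraph coroutine lets merge_paragraph_results hydrate all paragraph texts concurrently via asyncio.gather, rather than awaiting each get_paragraph_text in sequence. A self-contained sketch of that transformation, with a hypothetical fetch standing in for the per-paragraph I/O:

import asyncio

async def fetch(item: int) -> int:
    await asyncio.sleep(0.01)  # stands in for a storage/maindb read
    return item * 2

async def sequential(items: list[int]) -> list[int]:
    return [await fetch(i) for i in items]  # one read at a time

async def concurrent(items: list[int]) -> list[int]:
    # Reads overlap; gather preserves input order in its result list.
    return await asyncio.gather(*(fetch(i) for i in items))

assert asyncio.run(concurrent([1, 2, 3])) == [2, 4, 6]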

@@ -520,6 +514,7 @@ async def merge_results(
     show: list[ResourceProperties],
     field_type_filter: list[FieldTypeName],
     extracted: list[ExtractedDataTypeName],
+    offset: int,
     highlight: bool = False,
 ) -> KnowledgeboxSearchResults:
     paragraphs = []
@@ -543,6 +538,7 @@ async def merge_results(
            documents,
            query=retrieval.query.fulltext,
            top_k=retrieval.top_k,
+           offset=offset,
        )
        resources.extend(matched_resources)
 
@@ -550,7 +546,6 @@ async def merge_results(
        sort = SortOptions(
            field=retrieval.query.keyword.order_by,
            order=retrieval.query.keyword.sort,
-           limit=None,  # unused
        )
        api_results.paragraphs, matched_resources = await merge_paragraph_results(
            kbid,
@@ -559,6 +554,7 @@ async def merge_results(
            highlight,
            sort,
            min_score=retrieval.query.keyword.min_score,
+           offset=offset,
        )
        resources.extend(matched_resources)
 
@@ -601,9 +597,9 @@ async def merge_paragraphs_results(
        sort=SortOptions(
            field=SortField.SCORE,
            order=SortOrder.DESC,
-           limit=None,
        ),
        min_score=min_score,
+       offset=0,
    )
    return api_results
 
@@ -611,7 +607,7 @@ async def merge_paragraphs_results(
 async def merge_suggest_entities_results(
     suggest_responses: list[SuggestResponse],
 ) -> RelatedEntities:
-    unique_entities: Set[RelatedEntity] = set()
+    unique_entities: set[RelatedEntity] = set()
     for response in suggest_responses:
         response_entities = (
             RelatedEntity(family=e.subtype, value=e.value) for e in response.entity_results.nodes

nucliadb/search/search/metrics.py
@@ -19,7 +19,7 @@
 #
 import contextlib
 import time
-from typing import Any, Optional, Union
+from typing import Any
 
 from nucliadb_telemetry import metrics
 
@@ -27,6 +27,7 @@ merge_observer = metrics.Observer("merge_results", labels={"type": ""})
 node_features = metrics.Counter("nucliadb_node_features", labels={"type": ""})
 query_parse_dependency_observer = metrics.Observer("query_parse_dependency", labels={"type": ""})
 query_parser_observer = metrics.Observer("nucliadb_query_parser", labels={"type": ""})
+search_observer = metrics.Observer("nucliadb_search", labels={"type": ""})
 
 buckets = [
     0.005,
@@ -62,7 +63,7 @@ rag_histogram = metrics.Histogram(
     buckets=buckets,
 )
 
-MetricsData = dict[str, Union[int, float]]
+MetricsData = dict[str, int | float]
 
 
 class Metrics:
@@ -86,10 +87,10 @@ class Metrics:
         self.child_spans.append(child_span)
         return child_span
 
-    def set(self, key: str, value: Union[int, float]):
+    def set(self, key: str, value: int | float):
         self._metrics[key] = value
 
-    def get(self, key: str) -> Optional[Union[int, float]]:
+    def get(self, key: str) -> int | float | None:
         return self._metrics.get(key)
 
     def to_dict(self) -> MetricsData:
@@ -102,7 +103,7 @@ class Metrics:
         result[self.id] = self.to_dict()
         return result
 
-    def __getitem__(self, key: str) -> Union[int, float]:
+    def __getitem__(self, key: str) -> int | float:
         return self._metrics[key]
 
 
@@ -110,8 +111,8 @@ class AskMetrics(Metrics):
     def __init__(self: "AskMetrics"):
         super().__init__(id="ask")
         self.global_start = time.monotonic()
-        self.first_chunk_yielded_at: Optional[float] = None
-        self.first_reasoning_chunk_yielded_at: Optional[float] = None
+        self.first_chunk_yielded_at: float | None = None
+        self.first_reasoning_chunk_yielded_at: float | None = None
 
     def record_first_chunk_yielded(self):
         self.first_chunk_yielded_at = time.monotonic()
@@ -123,12 +124,12 @@
             self.first_reasoning_chunk_yielded_at - self.global_start
         )
 
-    def get_first_chunk_time(self) -> Optional[float]:
+    def get_first_chunk_time(self) -> float | None:
         if self.first_chunk_yielded_at is None:
             return None
         return self.first_chunk_yielded_at - self.global_start
 
-    def get_first_reasoning_chunk_time(self) -> Optional[float]:
+    def get_first_reasoning_chunk_time(self) -> float | None:
         if self.first_reasoning_chunk_yielded_at is None:
             return None
         return self.first_reasoning_chunk_yielded_at - self.global_start
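
The only functional addition in metrics.py is search_observer. Judging from the @merge_observer.wrap({"type": "merge_relations"}) usage visible earlier in this diff, it would be applied the same way; a sketch, where the "find" label value is an assumption:

from nucliadb_telemetry import metrics

search_observer = metrics.Observer("nucliadb_search", labels={"type": ""})

@search_observer.wrap({"type": "find"})  # hypothetical label value
async def timed_search(kbid: str) -> None:
    ...  # execution time is recorded under the given type label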

nucliadb/search/search/paragraphs.py
@@ -20,7 +20,6 @@
 import logging
 import re
 import string
-from typing import Optional
 
 from nucliadb.common.ids import FIELD_TYPE_STR_TO_PB, ParagraphId
 from nucliadb.ingest.fields.base import Field
@@ -58,7 +57,7 @@ async def get_paragraph_from_full_text(
     field: Field,
     start: int,
     end: int,
-    split: Optional[str] = None,
+    split: str | None = None,
     log_on_missing_field: bool = True,
 ) -> str:
     """
@@ -90,11 +89,10 @@ async def get_paragraph_text(
     kbid: str,
     paragraph_id: ParagraphId,
     highlight: bool = False,
-    ematches: Optional[list[str]] = None,
-    matches: Optional[list[str]] = None,
-    orm_resource: Optional[
-        ResourceORM
-    ] = None,  # allow passing in orm_resource to avoid extra DB calls or txn issues
+    ematches: list[str] | None = None,
+    matches: list[str] | None = None,
+    orm_resource: None
+    | (ResourceORM) = None,  # allow passing in orm_resource to avoid extra DB calls or txn issues
     log_on_missing_field: bool = True,
 ) -> str:
     rid = paragraph_id.rid
@@ -139,7 +137,7 @@ async def get_text_sentence(
     index: int,
     start: int,
     end: int,
-    split: Optional[str] = None,
+    split: str | None = None,
 ) -> str:
     """
     Leave separated from get paragraph for now until we understand the differences
@@ -169,7 +167,7 @@ async def get_text_sentence(
 
 
 def highlight_paragraph(
-    text: str, words: Optional[list[str]] = None, ematches: Optional[list[str]] = None
+    text: str, words: list[str] | None = None, ematches: list[str] | None = None
 ) -> str:
     """
     Highlight `text` with <mark></mark> tags around the words in `words` and `ematches`.

nucliadb/search/search/predict_proxy.py
@@ -19,7 +19,7 @@
 #
 import json
 from enum import Enum
-from typing import Any, Optional, Union
+from typing import Any
 
 import aiohttp
 from fastapi.datastructures import QueryParams
@@ -78,9 +78,9 @@ async def predict_proxy(
     user_id: str,
     client_type: NucliaDBClientType,
     origin: str,
-    json: Optional[Any] = None,
+    json: Any | None = None,
     headers: dict[str, str] = {},
-) -> Union[Response, StreamingResponse]:
+) -> Response | StreamingResponse:
     if not await exists_kb(kbid=kbid):
         raise datamanagers.exceptions.KnowledgeBoxNotFound()
 
@@ -99,11 +99,15 @@ async def predict_proxy(
     )
 
     status_code = predict_response.status
+
+    # Only audit /predict/chat successful responses
+    should_audit = endpoint == PredictProxiedEndpoints.CHAT and 200 <= status_code < 300
+
     media_type = predict_response.headers.get("Content-Type")
-    response: Union[Response, StreamingResponse]
+    response: Response | StreamingResponse
     user_query = json.get("question") if json is not None else ""
     if predict_response.headers.get("Transfer-Encoding") == "chunked":
-        if endpoint == PredictProxiedEndpoints.CHAT:
+        if should_audit:
            streaming_generator = chat_streaming_generator(
                predict_response=predict_response,
                kbid=kbid,
@@ -126,7 +130,7 @@ async def predict_proxy(
        with metrics.time(PREDICT_ANSWER_METRIC):
            content = await predict_response.read()
 
-        if endpoint == PredictProxiedEndpoints.CHAT:
+        if should_audit:
            try:
                llm_status_code = int(content[-1:].decode())  # Decode just the last char
                if llm_status_code != 0:
@@ -250,10 +254,10 @@ def audit_predict_proxy_endpoint(
     client_type: NucliaDBClientType,
     origin: str,
     text_answer: bytes,
-    text_reasoning: Optional[str],
+    text_reasoning: str | None,
     generative_answer_time: float,
-    generative_answer_first_chunk_time: Optional[float],
-    generative_reasoning_first_chunk_time: Optional[float],
+    generative_answer_first_chunk_time: float | None,
+    generative_reasoning_first_chunk_time: float | None,
     status_code: AnswerStatusCode,
 ):
     maybe_audit_chat(
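
The should_audit flag introduced above tightens audit logging twice over: the proxied endpoint must be the chat endpoint and the upstream status must be 2xx, so failed predict calls no longer emit audit entries. The predicate is easy to verify in isolation; a sketch using a hypothetical standalone helper with plain-string endpoints:

def should_audit(endpoint: str, status_code: int) -> bool:
    # Mirrors the gating added in predict_proxy: only successful chat
    # responses are audited.
    return endpoint == "chat" and 200 <= status_code < 300

assert should_audit("chat", 200)
assert not should_audit("chat", 502)      # upstream failure: no audit
assert not should_audit("rephrase", 200)  # other endpoints: no audit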