nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (246)
  1. migrations/0023_backfill_pg_catalog.py +8 -4
  2. migrations/0028_extracted_vectors_reference.py +1 -1
  3. migrations/0029_backfill_field_status.py +3 -4
  4. migrations/0032_remove_old_relations.py +2 -3
  5. migrations/0038_backfill_catalog_field_labels.py +8 -4
  6. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  7. migrations/0040_migrate_search_configurations.py +79 -0
  8. migrations/0041_reindex_conversations.py +137 -0
  9. migrations/pg/0010_shards_index.py +34 -0
  10. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  11. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  12. nucliadb/backups/create.py +2 -15
  13. nucliadb/backups/restore.py +4 -15
  14. nucliadb/backups/tasks.py +4 -1
  15. nucliadb/common/back_pressure/cache.py +2 -3
  16. nucliadb/common/back_pressure/materializer.py +7 -13
  17. nucliadb/common/back_pressure/settings.py +6 -6
  18. nucliadb/common/back_pressure/utils.py +1 -0
  19. nucliadb/common/cache.py +9 -9
  20. nucliadb/common/catalog/__init__.py +79 -0
  21. nucliadb/common/catalog/dummy.py +36 -0
  22. nucliadb/common/catalog/interface.py +85 -0
  23. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
  24. nucliadb/common/catalog/utils.py +56 -0
  25. nucliadb/common/cluster/manager.py +8 -23
  26. nucliadb/common/cluster/rebalance.py +484 -112
  27. nucliadb/common/cluster/rollover.py +36 -9
  28. nucliadb/common/cluster/settings.py +4 -9
  29. nucliadb/common/cluster/utils.py +34 -8
  30. nucliadb/common/context/__init__.py +7 -8
  31. nucliadb/common/context/fastapi.py +1 -2
  32. nucliadb/common/datamanagers/__init__.py +2 -4
  33. nucliadb/common/datamanagers/atomic.py +9 -2
  34. nucliadb/common/datamanagers/cluster.py +1 -2
  35. nucliadb/common/datamanagers/fields.py +3 -4
  36. nucliadb/common/datamanagers/kb.py +6 -6
  37. nucliadb/common/datamanagers/labels.py +2 -3
  38. nucliadb/common/datamanagers/resources.py +10 -33
  39. nucliadb/common/datamanagers/rollover.py +5 -7
  40. nucliadb/common/datamanagers/search_configurations.py +1 -2
  41. nucliadb/common/datamanagers/synonyms.py +1 -2
  42. nucliadb/common/datamanagers/utils.py +4 -4
  43. nucliadb/common/datamanagers/vectorsets.py +4 -4
  44. nucliadb/common/external_index_providers/base.py +32 -5
  45. nucliadb/common/external_index_providers/manager.py +5 -34
  46. nucliadb/common/external_index_providers/settings.py +1 -27
  47. nucliadb/common/filter_expression.py +129 -41
  48. nucliadb/common/http_clients/exceptions.py +8 -0
  49. nucliadb/common/http_clients/processing.py +16 -23
  50. nucliadb/common/http_clients/utils.py +3 -0
  51. nucliadb/common/ids.py +82 -58
  52. nucliadb/common/locking.py +1 -2
  53. nucliadb/common/maindb/driver.py +9 -8
  54. nucliadb/common/maindb/local.py +5 -5
  55. nucliadb/common/maindb/pg.py +9 -8
  56. nucliadb/common/nidx.py +22 -5
  57. nucliadb/common/vector_index_config.py +1 -1
  58. nucliadb/export_import/datamanager.py +4 -3
  59. nucliadb/export_import/exporter.py +11 -19
  60. nucliadb/export_import/importer.py +13 -6
  61. nucliadb/export_import/tasks.py +2 -0
  62. nucliadb/export_import/utils.py +6 -18
  63. nucliadb/health.py +2 -2
  64. nucliadb/ingest/app.py +8 -8
  65. nucliadb/ingest/consumer/consumer.py +8 -10
  66. nucliadb/ingest/consumer/pull.py +10 -8
  67. nucliadb/ingest/consumer/service.py +5 -30
  68. nucliadb/ingest/consumer/shard_creator.py +16 -5
  69. nucliadb/ingest/consumer/utils.py +1 -1
  70. nucliadb/ingest/fields/base.py +37 -49
  71. nucliadb/ingest/fields/conversation.py +55 -9
  72. nucliadb/ingest/fields/exceptions.py +1 -2
  73. nucliadb/ingest/fields/file.py +22 -8
  74. nucliadb/ingest/fields/link.py +7 -7
  75. nucliadb/ingest/fields/text.py +2 -3
  76. nucliadb/ingest/orm/brain_v2.py +89 -57
  77. nucliadb/ingest/orm/broker_message.py +2 -4
  78. nucliadb/ingest/orm/entities.py +10 -209
  79. nucliadb/ingest/orm/index_message.py +128 -113
  80. nucliadb/ingest/orm/knowledgebox.py +91 -59
  81. nucliadb/ingest/orm/processor/auditing.py +1 -3
  82. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  83. nucliadb/ingest/orm/processor/processor.py +98 -153
  84. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  85. nucliadb/ingest/orm/resource.py +82 -71
  86. nucliadb/ingest/orm/utils.py +1 -1
  87. nucliadb/ingest/partitions.py +12 -1
  88. nucliadb/ingest/processing.py +17 -17
  89. nucliadb/ingest/serialize.py +202 -145
  90. nucliadb/ingest/service/writer.py +15 -114
  91. nucliadb/ingest/settings.py +36 -15
  92. nucliadb/ingest/utils.py +1 -2
  93. nucliadb/learning_proxy.py +23 -26
  94. nucliadb/metrics_exporter.py +20 -6
  95. nucliadb/middleware/__init__.py +82 -1
  96. nucliadb/migrator/datamanager.py +4 -11
  97. nucliadb/migrator/migrator.py +1 -2
  98. nucliadb/migrator/models.py +1 -2
  99. nucliadb/migrator/settings.py +1 -2
  100. nucliadb/models/internal/augment.py +614 -0
  101. nucliadb/models/internal/processing.py +19 -19
  102. nucliadb/openapi.py +2 -2
  103. nucliadb/purge/__init__.py +3 -8
  104. nucliadb/purge/orphan_shards.py +1 -2
  105. nucliadb/reader/__init__.py +5 -0
  106. nucliadb/reader/api/models.py +6 -13
  107. nucliadb/reader/api/v1/download.py +59 -38
  108. nucliadb/reader/api/v1/export_import.py +4 -4
  109. nucliadb/reader/api/v1/knowledgebox.py +37 -9
  110. nucliadb/reader/api/v1/learning_config.py +33 -14
  111. nucliadb/reader/api/v1/resource.py +61 -9
  112. nucliadb/reader/api/v1/services.py +18 -14
  113. nucliadb/reader/app.py +3 -1
  114. nucliadb/reader/reader/notifications.py +1 -2
  115. nucliadb/search/api/v1/__init__.py +3 -0
  116. nucliadb/search/api/v1/ask.py +3 -4
  117. nucliadb/search/api/v1/augment.py +585 -0
  118. nucliadb/search/api/v1/catalog.py +15 -19
  119. nucliadb/search/api/v1/find.py +16 -22
  120. nucliadb/search/api/v1/hydrate.py +328 -0
  121. nucliadb/search/api/v1/knowledgebox.py +1 -2
  122. nucliadb/search/api/v1/predict_proxy.py +1 -2
  123. nucliadb/search/api/v1/resource/ask.py +28 -8
  124. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  125. nucliadb/search/api/v1/resource/search.py +9 -11
  126. nucliadb/search/api/v1/retrieve.py +130 -0
  127. nucliadb/search/api/v1/search.py +28 -32
  128. nucliadb/search/api/v1/suggest.py +11 -14
  129. nucliadb/search/api/v1/summarize.py +1 -2
  130. nucliadb/search/api/v1/utils.py +2 -2
  131. nucliadb/search/app.py +3 -2
  132. nucliadb/search/augmentor/__init__.py +21 -0
  133. nucliadb/search/augmentor/augmentor.py +232 -0
  134. nucliadb/search/augmentor/fields.py +704 -0
  135. nucliadb/search/augmentor/metrics.py +24 -0
  136. nucliadb/search/augmentor/paragraphs.py +334 -0
  137. nucliadb/search/augmentor/resources.py +238 -0
  138. nucliadb/search/augmentor/utils.py +33 -0
  139. nucliadb/search/lifecycle.py +3 -1
  140. nucliadb/search/predict.py +33 -19
  141. nucliadb/search/predict_models.py +8 -9
  142. nucliadb/search/requesters/utils.py +11 -10
  143. nucliadb/search/search/cache.py +19 -42
  144. nucliadb/search/search/chat/ask.py +131 -59
  145. nucliadb/search/search/chat/exceptions.py +3 -5
  146. nucliadb/search/search/chat/fetcher.py +201 -0
  147. nucliadb/search/search/chat/images.py +6 -4
  148. nucliadb/search/search/chat/old_prompt.py +1375 -0
  149. nucliadb/search/search/chat/parser.py +510 -0
  150. nucliadb/search/search/chat/prompt.py +563 -615
  151. nucliadb/search/search/chat/query.py +453 -32
  152. nucliadb/search/search/chat/rpc.py +85 -0
  153. nucliadb/search/search/fetch.py +3 -4
  154. nucliadb/search/search/filters.py +8 -11
  155. nucliadb/search/search/find.py +33 -31
  156. nucliadb/search/search/find_merge.py +124 -331
  157. nucliadb/search/search/graph_strategy.py +14 -12
  158. nucliadb/search/search/hydrator/__init__.py +49 -0
  159. nucliadb/search/search/hydrator/fields.py +217 -0
  160. nucliadb/search/search/hydrator/images.py +130 -0
  161. nucliadb/search/search/hydrator/paragraphs.py +323 -0
  162. nucliadb/search/search/hydrator/resources.py +60 -0
  163. nucliadb/search/search/ingestion_agents.py +5 -5
  164. nucliadb/search/search/merge.py +90 -94
  165. nucliadb/search/search/metrics.py +24 -7
  166. nucliadb/search/search/paragraphs.py +7 -9
  167. nucliadb/search/search/predict_proxy.py +44 -18
  168. nucliadb/search/search/query.py +14 -86
  169. nucliadb/search/search/query_parser/fetcher.py +51 -82
  170. nucliadb/search/search/query_parser/models.py +19 -48
  171. nucliadb/search/search/query_parser/old_filters.py +20 -19
  172. nucliadb/search/search/query_parser/parsers/ask.py +5 -6
  173. nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
  174. nucliadb/search/search/query_parser/parsers/common.py +21 -13
  175. nucliadb/search/search/query_parser/parsers/find.py +6 -29
  176. nucliadb/search/search/query_parser/parsers/graph.py +18 -28
  177. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  178. nucliadb/search/search/query_parser/parsers/search.py +15 -56
  179. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  180. nucliadb/search/search/rank_fusion.py +18 -13
  181. nucliadb/search/search/rerankers.py +6 -7
  182. nucliadb/search/search/retrieval.py +300 -0
  183. nucliadb/search/search/summarize.py +5 -6
  184. nucliadb/search/search/utils.py +3 -4
  185. nucliadb/search/settings.py +1 -2
  186. nucliadb/standalone/api_router.py +1 -1
  187. nucliadb/standalone/app.py +4 -3
  188. nucliadb/standalone/auth.py +5 -6
  189. nucliadb/standalone/lifecycle.py +2 -2
  190. nucliadb/standalone/run.py +5 -4
  191. nucliadb/standalone/settings.py +5 -6
  192. nucliadb/standalone/versions.py +3 -4
  193. nucliadb/tasks/consumer.py +13 -8
  194. nucliadb/tasks/models.py +2 -1
  195. nucliadb/tasks/producer.py +3 -3
  196. nucliadb/tasks/retries.py +8 -7
  197. nucliadb/train/api/utils.py +1 -3
  198. nucliadb/train/api/v1/shards.py +1 -2
  199. nucliadb/train/api/v1/trainset.py +1 -2
  200. nucliadb/train/app.py +1 -1
  201. nucliadb/train/generator.py +4 -4
  202. nucliadb/train/generators/field_classifier.py +2 -2
  203. nucliadb/train/generators/field_streaming.py +6 -6
  204. nucliadb/train/generators/image_classifier.py +2 -2
  205. nucliadb/train/generators/paragraph_classifier.py +2 -2
  206. nucliadb/train/generators/paragraph_streaming.py +2 -2
  207. nucliadb/train/generators/question_answer_streaming.py +2 -2
  208. nucliadb/train/generators/sentence_classifier.py +4 -10
  209. nucliadb/train/generators/token_classifier.py +3 -2
  210. nucliadb/train/generators/utils.py +6 -5
  211. nucliadb/train/nodes.py +3 -3
  212. nucliadb/train/resource.py +6 -8
  213. nucliadb/train/settings.py +3 -4
  214. nucliadb/train/types.py +11 -11
  215. nucliadb/train/upload.py +3 -2
  216. nucliadb/train/uploader.py +1 -2
  217. nucliadb/train/utils.py +1 -2
  218. nucliadb/writer/api/v1/export_import.py +4 -1
  219. nucliadb/writer/api/v1/field.py +15 -14
  220. nucliadb/writer/api/v1/knowledgebox.py +18 -56
  221. nucliadb/writer/api/v1/learning_config.py +5 -4
  222. nucliadb/writer/api/v1/resource.py +9 -20
  223. nucliadb/writer/api/v1/services.py +10 -132
  224. nucliadb/writer/api/v1/upload.py +73 -72
  225. nucliadb/writer/app.py +8 -2
  226. nucliadb/writer/resource/basic.py +12 -15
  227. nucliadb/writer/resource/field.py +43 -5
  228. nucliadb/writer/resource/origin.py +7 -0
  229. nucliadb/writer/settings.py +2 -3
  230. nucliadb/writer/tus/__init__.py +2 -3
  231. nucliadb/writer/tus/azure.py +5 -7
  232. nucliadb/writer/tus/dm.py +3 -3
  233. nucliadb/writer/tus/exceptions.py +3 -4
  234. nucliadb/writer/tus/gcs.py +15 -22
  235. nucliadb/writer/tus/s3.py +2 -3
  236. nucliadb/writer/tus/storage.py +3 -3
  237. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
  238. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  239. nucliadb/common/datamanagers/entities.py +0 -139
  240. nucliadb/common/external_index_providers/pinecone.py +0 -894
  241. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  242. nucliadb/search/search/hydrator.py +0 -197
  243. nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
  244. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  245. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  246. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
nucliadb/search/search/merge.py

@@ -20,7 +20,8 @@
 import asyncio
 import datetime
 import math
-from typing import Any, Iterable, Optional, Set, Union
+from collections.abc import Iterable
+from typing import Any
 
 from nidx_protos.nodereader_pb2 import (
     DocumentResult,
@@ -37,7 +38,6 @@ from nidx_protos.nodereader_pb2 import (
 from nucliadb.common.ids import FieldId, ParagraphId
 from nucliadb.common.models_utils import from_proto
 from nucliadb.common.models_utils.from_proto import RelationTypePbMap
-from nucliadb.search.search import cache
 from nucliadb.search.search.cut import cut_page
 from nucliadb.search.search.fetch import (
     fetch_resources,
@@ -80,7 +80,7 @@ from .paragraphs import get_paragraph_text, get_text_sentence
 Bm25Score = tuple[float, float]
 TimestampScore = datetime.datetime
 TitleScore = str
-SortValue = Union[Bm25Score, TimestampScore, TitleScore]
+SortValue = Bm25Score | TimestampScore | TitleScore
 
 
 def relation_node_type_to_entity_type(node_type: RelationNode.NodeType.ValueType) -> EntityType:
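Most of the churn in this file's type annotations (and in metrics.py, paragraphs.py, and predict_proxy.py further down) is a mechanical migration from typing.Union/Optional to PEP 604 syntax. A minimal sketch, not from the diff, showing the two spellings are interchangeable at runtime on Python 3.10+:

    from typing import Optional, Union

    # PEP 604 unions compare equal to their typing-module counterparts, so
    # rewrites like `SortValue = Bm25Score | TimestampScore | TitleScore`
    # are behavior-preserving.
    assert int | None == Optional[int]
    assert int | float == Union[int, float]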
@@ -101,47 +101,17 @@ def entity_type_to_relation_node_type(node_type: EntityType) -> RelationNode.Nod
     }[node_type]
 
 
-def sort_results_by_score(results: Union[list[ParagraphResult], list[DocumentResult]]):
+def sort_results_by_score(results: list[ParagraphResult] | list[DocumentResult]):
     results.sort(key=lambda x: (x.score.bm25, x.score.booster), reverse=True)
 
 
-async def get_sort_value(
-    item: Union[DocumentResult, ParagraphResult],
-    sort_field: SortField,
-    kbid: str,
-) -> Optional[SortValue]:
-    """Returns the score for given `item` and `sort_field`. If the resource is being
-    deleted, it might appear on search results but not in maindb. In this
-    specific case, return None.
-    """
-    if sort_field == SortField.SCORE:
-        return (item.score.bm25, item.score.booster)
-
-    score: Any = None
-    resource = await cache.get_resource(kbid, item.uuid)
-    if resource is None:
-        return score
-
-    basic = await resource.get_basic()
-    if basic is None:
-        return score
-
-    if sort_field == SortField.CREATED:
-        score = basic.created.ToDatetime()
-    elif sort_field == SortField.MODIFIED:
-        score = basic.modified.ToDatetime()
-    elif sort_field == SortField.TITLE:
-        score = basic.title
-
-    return score
-
-
 async def merge_documents_results(
     kbid: str,
     responses: list[DocumentSearchResponse],
     *,
     query: FulltextQuery,
     top_k: int,
+    offset: int,
 ) -> tuple[Resources, list[str]]:
     raw_resource_list: list[tuple[DocumentResult, SortValue]] = []
     facets: dict[str, Any] = {}
@@ -159,14 +129,22 @@ async def merge_documents_results(
         if document_response.next_page:
             next_page = True
         for result in document_response.results:
-            sort_value = await get_sort_value(result, query.order_by, kbid)
+            sort_value: SortValue
+            if query.order_by == SortField.SCORE:
+                sort_value = (result.score.bm25, result.score.booster)
+            else:
+                sort_value = result.date.ToDatetime()
             if sort_value is not None:
                 raw_resource_list.append((result, sort_value))
+
         total += document_response.total
 
     # We need to cut first and then sort, otherwise the page will be wrong if the order is DESC
-    raw_resource_list, has_more = cut_page(raw_resource_list, top_k)
+    raw_resource_list, has_more = cut_page(raw_resource_list[offset:], top_k)
     next_page = next_page or has_more
+
+    # Sort the list by score. It's important that this sort is stable, so the
+    # ordering of results with same scores across multiple shards doesn't change
     raw_resource_list.sort(key=lambda x: x[1], reverse=(query.sort == SortOrder.DESC))
 
     result_resource_ids = []
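The stable-sort comment introduced above leans on a documented guarantee: Python's list.sort is stable (Timsort), so entries that compare equal keep the order in which the shard responses were concatenated, making tie-breaks deterministic across requests. A small illustration with made-up shard data:

    # Each shard returns (resource_id, bm25_score) pairs; responses are
    # concatenated in a fixed shard order before sorting.
    shard_a = [("doc1", 0.9), ("doc2", 0.5)]
    shard_b = [("doc3", 0.5), ("doc4", 0.1)]
    merged = shard_a + shard_b

    merged.sort(key=lambda pair: pair[1], reverse=True)
    # doc2 and doc3 tie at 0.5; stability keeps doc2 (from shard_a) first
    # on every request, so pagination boundaries don't jitter.
    assert [rid for rid, _ in merged] == ["doc1", "doc2", "doc3", "doc4"]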
@@ -270,7 +248,7 @@ async def merge_vectors_results(
     resources: list[str],
     kbid: str,
     top_k: int,
-    min_score: Optional[float] = None,
+    min_score: float | None = None,
 ) -> Sentences:
     facets: dict[str, Any] = {}
     raw_vectors_list: list[DocumentScored] = []
@@ -350,12 +328,13 @@ async def merge_paragraph_results(
     highlight: bool,
     sort: SortOptions,
     min_score: float,
+    offset: int,
 ) -> tuple[Paragraphs, list[str]]:
     raw_paragraph_list: list[tuple[ParagraphResult, SortValue]] = []
     facets: dict[str, Any] = {}
     query = None
     next_page = False
-    ematches: Optional[list[str]] = None
+    ematches: list[str] | None = None
     total = 0
     for paragraph_response in paragraph_responses:
         if ematches is None:
@@ -373,66 +352,31 @@
         if paragraph_response.next_page:
             next_page = True
         for result in paragraph_response.results:
-            score = await get_sort_value(result, sort.field, kbid)
-            if score is not None:
-                raw_paragraph_list.append((result, score))
+            sort_value: SortValue
+            if sort.field == SortField.SCORE:
+                sort_value = (result.score.bm25, result.score.booster)
+            else:
+                sort_value = result.date.ToDatetime()
+            if sort_value is not None:
+                raw_paragraph_list.append((result, sort_value))
+
         total += paragraph_response.total
 
+    # Sort the list by score. It's important that this sort is stable, so the
+    # ordering of results with same scores across multiple shards doesn't change
     raw_paragraph_list.sort(key=lambda x: x[1], reverse=(sort.order == SortOrder.DESC))
 
-    raw_paragraph_list, has_more = cut_page(raw_paragraph_list, top_k)
+    raw_paragraph_list, has_more = cut_page(raw_paragraph_list[offset:], top_k)
     next_page = next_page or has_more
 
     result_resource_ids = []
-    result_paragraph_list: list[Paragraph] = []
-    for result, _ in raw_paragraph_list:
-        _, field_type, field = result.field.split("/")
-        text = await get_paragraph_text(
-            kbid=kbid,
-            paragraph_id=ParagraphId(
-                field_id=FieldId(
-                    rid=result.uuid,
-                    type=field_type,
-                    key=field,
-                    subfield_id=result.split,
-                ),
-                paragraph_start=result.start,
-                paragraph_end=result.end,
-            ),
-            highlight=highlight,
-            ematches=ematches,
-            matches=result.matches,  # type: ignore
-        )
-        labels = await get_labels_paragraph(result, kbid)
-        fuzzy_result = len(result.matches) > 0
-        new_paragraph = Paragraph(
-            score=result.score.bm25,
-            rid=result.uuid,
-            field_type=field_type,
-            field=field,
-            text=text,
-            labels=labels,
-            position=TextPosition(
-                index=result.metadata.position.index,
-                start=result.metadata.position.start,
-                end=result.metadata.position.end,
-                page_number=result.metadata.position.page_number,
-            ),
-            fuzzy_result=fuzzy_result,
-        )
-        if len(result.metadata.position.start_seconds) or len(result.metadata.position.end_seconds):
-            new_paragraph.start_seconds = list(result.metadata.position.start_seconds)
-            new_paragraph.end_seconds = list(result.metadata.position.end_seconds)
-        else:
-            # TODO: Remove once we are sure all data has been migrated!
-            seconds_positions = await get_seconds_paragraph(result, kbid)
-            if seconds_positions is not None:
-                new_paragraph.start_seconds = seconds_positions[0]
-                new_paragraph.end_seconds = seconds_positions[1]
+    result_paragraph_list: list[Paragraph] = await asyncio.gather(
+        *(load_paragraph(result, kbid, highlight, ematches) for result, _ in raw_paragraph_list)
+    )
+    for paragraph in result_paragraph_list:
+        if paragraph.rid not in result_resource_ids:
+            result_resource_ids.append(paragraph.rid)
 
-        result_paragraph_list.append(new_paragraph)
-        if new_paragraph.rid not in result_resource_ids:
-            result_resource_ids.append(new_paragraph.rid)
     return Paragraphs(
         results=result_paragraph_list,
         facets=facets,
@@ -445,6 +389,56 @@
     ), result_resource_ids
 
 
+async def load_paragraph(
+    result: ParagraphResult, kbid: str, highlight: bool, ematches: list[str] | None
+) -> Paragraph:
+    _, field_type, field = result.field.split("/")
+    text = await get_paragraph_text(
+        kbid=kbid,
+        paragraph_id=ParagraphId(
+            field_id=FieldId(
+                rid=result.uuid,
+                type=field_type,
+                key=field,
+                subfield_id=result.split,
+            ),
+            paragraph_start=result.start,
+            paragraph_end=result.end,
+        ),
+        highlight=highlight,
+        ematches=ematches,
+        matches=result.matches,  # type: ignore
+    )
+    labels = await get_labels_paragraph(result, kbid)
+    fuzzy_result = len(result.matches) > 0
+    new_paragraph = Paragraph(
+        score=result.score.bm25,
+        rid=result.uuid,
+        field_type=field_type,
+        field=field,
+        text=text,
+        labels=labels,
+        position=TextPosition(
+            index=result.metadata.position.index,
+            start=result.metadata.position.start,
+            end=result.metadata.position.end,
+            page_number=result.metadata.position.page_number,
+        ),
+        fuzzy_result=fuzzy_result,
+    )
+    if len(result.metadata.position.start_seconds) or len(result.metadata.position.end_seconds):
+        new_paragraph.start_seconds = list(result.metadata.position.start_seconds)
+        new_paragraph.end_seconds = list(result.metadata.position.end_seconds)
+    else:
+        # TODO: Remove once we are sure all data has been migrated!
+        seconds_positions = await get_seconds_paragraph(result, kbid)
+        if seconds_positions is not None:
+            new_paragraph.start_seconds = seconds_positions[0]
+            new_paragraph.end_seconds = seconds_positions[1]
+
+    return new_paragraph
+
+
 @merge_observer.wrap({"type": "merge_relations"})
 async def merge_relations_results(
     graph_responses: list[GraphSearchResponse],
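The new load_paragraph helper lets merge_paragraph_results hydrate all paragraphs of a page concurrently with asyncio.gather instead of awaiting each one in turn; gather also returns results in input order, so the previously sorted list stays sorted. A self-contained sketch of that sequential-to-concurrent move (hydrate is a hypothetical stand-in for load_paragraph):

    import asyncio

    async def hydrate(item: int) -> int:
        await asyncio.sleep(0.01)  # stands in for per-paragraph text/label fetches
        return item * 2

    async def main() -> None:
        items = [3, 1, 2]
        # Before: one await per item; total latency is the sum of all fetches.
        sequential = [await hydrate(i) for i in items]
        # After: all fetches in flight at once; latency is roughly the slowest
        # fetch, and results come back in input order.
        concurrent = await asyncio.gather(*(hydrate(i) for i in items))
        assert sequential == list(concurrent)

    asyncio.run(main())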
@@ -520,6 +514,7 @@ async def merge_results(
     show: list[ResourceProperties],
     field_type_filter: list[FieldTypeName],
     extracted: list[ExtractedDataTypeName],
+    offset: int,
     highlight: bool = False,
 ) -> KnowledgeboxSearchResults:
     paragraphs = []
@@ -543,6 +538,7 @@
             documents,
             query=retrieval.query.fulltext,
             top_k=retrieval.top_k,
+            offset=offset,
         )
         resources.extend(matched_resources)
 
@@ -550,7 +546,6 @@
         sort = SortOptions(
             field=retrieval.query.keyword.order_by,
             order=retrieval.query.keyword.sort,
-            limit=None,  # unused
         )
         api_results.paragraphs, matched_resources = await merge_paragraph_results(
             kbid,
@@ -559,6 +554,7 @@
             highlight,
             sort,
             min_score=retrieval.query.keyword.min_score,
+            offset=offset,
         )
         resources.extend(matched_resources)
 
@@ -601,9 +597,9 @@ async def merge_paragraphs_results(
         sort=SortOptions(
             field=SortField.SCORE,
             order=SortOrder.DESC,
-            limit=None,
         ),
         min_score=min_score,
+        offset=0,
     )
     return api_results
 
@@ -611,7 +607,7 @@
 async def merge_suggest_entities_results(
     suggest_responses: list[SuggestResponse],
 ) -> RelatedEntities:
-    unique_entities: Set[RelatedEntity] = set()
+    unique_entities: set[RelatedEntity] = set()
     for response in suggest_responses:
         response_entities = (
             RelatedEntity(family=e.subtype, value=e.value) for e in response.entity_results.nodes
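The offset parameter threaded through merge_results, merge_documents_results, and merge_paragraph_results applies pagination at merge time: the merged candidate list is sliced from offset before being cut to top_k. A toy illustration, where cut_page is a simplified stand-in for nucliadb.search.search.cut.cut_page:

    def cut_page(items: list[str], top_k: int) -> tuple[list[str], bool]:
        # Keep the first top_k items and report whether any were left over.
        return items[:top_k], len(items) > top_k

    merged = ["r1", "r2", "r3", "r4", "r5"]  # results merged from all shards
    offset, top_k = 2, 2
    page, has_more = cut_page(merged[offset:], top_k)
    assert page == ["r3", "r4"] and has_more  # "r5" is left for the next page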
nucliadb/search/search/metrics.py

@@ -19,7 +19,7 @@
 #
 import contextlib
 import time
-from typing import Any, Optional, Union
+from typing import Any
 
 from nucliadb_telemetry import metrics
 
@@ -27,6 +27,7 @@ merge_observer = metrics.Observer("merge_results", labels={"type": ""})
 node_features = metrics.Counter("nucliadb_node_features", labels={"type": ""})
 query_parse_dependency_observer = metrics.Observer("query_parse_dependency", labels={"type": ""})
 query_parser_observer = metrics.Observer("nucliadb_query_parser", labels={"type": ""})
+search_observer = metrics.Observer("nucliadb_search", labels={"type": ""})
 
 buckets = [
     0.005,
@@ -49,6 +50,10 @@ buckets = [
 ]
 
 generative_first_chunk_histogram = metrics.Histogram(
+    name="generative_reasoning_first_chunk",
+    buckets=buckets,
+)
+reasoning_first_chunk_histogram = metrics.Histogram(
     name="generative_first_chunk",
     buckets=buckets,
 )
@@ -58,7 +63,7 @@ rag_histogram = metrics.Histogram(
     buckets=buckets,
 )
 
-MetricsData = dict[str, Union[int, float]]
+MetricsData = dict[str, int | float]
 
 
 class Metrics:
@@ -82,10 +87,10 @@ class Metrics:
         self.child_spans.append(child_span)
         return child_span
 
-    def set(self, key: str, value: Union[int, float]):
+    def set(self, key: str, value: int | float):
         self._metrics[key] = value
 
-    def get(self, key: str) -> Optional[Union[int, float]]:
+    def get(self, key: str) -> int | float | None:
         return self._metrics.get(key)
 
     def to_dict(self) -> MetricsData:
@@ -98,7 +103,7 @@ class Metrics:
         result[self.id] = self.to_dict()
         return result
 
-    def __getitem__(self, key: str) -> Union[int, float]:
+    def __getitem__(self, key: str) -> int | float:
         return self._metrics[key]
 
 
@@ -106,13 +111,25 @@ class AskMetrics(Metrics):
     def __init__(self: "AskMetrics"):
         super().__init__(id="ask")
         self.global_start = time.monotonic()
-        self.first_chunk_yielded_at: Optional[float] = None
+        self.first_chunk_yielded_at: float | None = None
+        self.first_reasoning_chunk_yielded_at: float | None = None
 
     def record_first_chunk_yielded(self):
         self.first_chunk_yielded_at = time.monotonic()
         generative_first_chunk_histogram.observe(self.first_chunk_yielded_at - self.global_start)
 
-    def get_first_chunk_time(self) -> Optional[float]:
+    def record_first_reasoning_chunk_yielded(self):
+        self.first_reasoning_chunk_yielded_at = time.monotonic()
+        reasoning_first_chunk_histogram.observe(
+            self.first_reasoning_chunk_yielded_at - self.global_start
+        )
+
+    def get_first_chunk_time(self) -> float | None:
         if self.first_chunk_yielded_at is None:
             return None
         return self.first_chunk_yielded_at - self.global_start
+
+    def get_first_reasoning_chunk_time(self) -> float | None:
+        if self.first_reasoning_chunk_yielded_at is None:
+            return None
+        return self.first_reasoning_chunk_yielded_at - self.global_start
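AskMetrics now records a second timestamp for the first reasoning chunk, mirroring the existing answer-chunk logic: capture time.monotonic() once per stream kind and report the delta from global_start. A reduced sketch of the pattern (an illustrative class, not the package's API):

    import time

    class FirstChunkTimer:
        def __init__(self) -> None:
            self.start = time.monotonic()
            self.first_chunk_at: float | None = None

        def record_first_chunk(self) -> None:
            # monotonic() is unaffected by wall-clock adjustments, so the
            # delta is a safe latency measurement.
            if self.first_chunk_at is None:
                self.first_chunk_at = time.monotonic()

        def first_chunk_time(self) -> float | None:
            if self.first_chunk_at is None:
                return None
            return self.first_chunk_at - self.start

    timer = FirstChunkTimer()
    timer.record_first_chunk()
    elapsed = timer.first_chunk_time()
    assert elapsed is not None and elapsed >= 0.0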
nucliadb/search/search/paragraphs.py

@@ -20,7 +20,6 @@
 import logging
 import re
 import string
-from typing import Optional
 
 from nucliadb.common.ids import FIELD_TYPE_STR_TO_PB, ParagraphId
 from nucliadb.ingest.fields.base import Field
@@ -58,7 +57,7 @@ async def get_paragraph_from_full_text(
     field: Field,
     start: int,
     end: int,
-    split: Optional[str] = None,
+    split: str | None = None,
     log_on_missing_field: bool = True,
 ) -> str:
     """
@@ -90,11 +89,10 @@ async def get_paragraph_text(
     kbid: str,
     paragraph_id: ParagraphId,
     highlight: bool = False,
-    ematches: Optional[list[str]] = None,
-    matches: Optional[list[str]] = None,
-    orm_resource: Optional[
-        ResourceORM
-    ] = None,  # allow passing in orm_resource to avoid extra DB calls or txn issues
+    ematches: list[str] | None = None,
+    matches: list[str] | None = None,
+    orm_resource: None
+    | (ResourceORM) = None,  # allow passing in orm_resource to avoid extra DB calls or txn issues
     log_on_missing_field: bool = True,
 ) -> str:
     rid = paragraph_id.rid
@@ -139,7 +137,7 @@ async def get_text_sentence(
     index: int,
     start: int,
     end: int,
-    split: Optional[str] = None,
+    split: str | None = None,
 ) -> str:
     """
     Leave separated from get paragraph for now until we understand the differences
@@ -169,7 +167,7 @@
 
 
 def highlight_paragraph(
-    text: str, words: Optional[list[str]] = None, ematches: Optional[list[str]] = None
+    text: str, words: list[str] | None = None, ematches: list[str] | None = None
 ) -> str:
     """
     Highlight `text` with <mark></mark> tags around the words in `words` and `ematches`.
nucliadb/search/search/predict_proxy.py

@@ -19,7 +19,7 @@
 #
 import json
 from enum import Enum
-from typing import Any, Optional, Union
+from typing import Any
 
 import aiohttp
 from fastapi.datastructures import QueryParams
@@ -28,6 +28,7 @@ from multidict import CIMultiDictProxy
 from nuclia_models.predict.generative_responses import (
     GenerativeChunk,
     JSONGenerativeResponse,
+    ReasoningGenerativeResponse,
     StatusGenerativeResponse,
     TextGenerativeResponse,
 )
@@ -77,9 +78,9 @@ async def predict_proxy(
     user_id: str,
     client_type: NucliaDBClientType,
     origin: str,
-    json: Optional[Any] = None,
+    json: Any | None = None,
     headers: dict[str, str] = {},
-) -> Union[Response, StreamingResponse]:
+) -> Response | StreamingResponse:
     if not await exists_kb(kbid=kbid):
         raise datamanagers.exceptions.KnowledgeBoxNotFound()
 
@@ -87,6 +88,7 @@
     predict_headers = predict.get_predict_headers(kbid)
     user_headers = {k: v for k, v in headers.items() if k.capitalize() in ALLOWED_HEADERS}
 
+    metrics = AskMetrics()
     # Proxy the request to predict API
     predict_response = await predict.make_request(
         method=method,
@@ -97,11 +99,15 @@
     )
 
     status_code = predict_response.status
+
+    # Only audit /predict/chat successful responses
+    should_audit = endpoint == PredictProxiedEndpoints.CHAT and 200 <= status_code < 300
+
     media_type = predict_response.headers.get("Content-Type")
-    response: Union[Response, StreamingResponse]
+    response: Response | StreamingResponse
     user_query = json.get("question") if json is not None else ""
     if predict_response.headers.get("Transfer-Encoding") == "chunked":
-        if endpoint == PredictProxiedEndpoints.CHAT:
+        if should_audit:
             streaming_generator = chat_streaming_generator(
                 predict_response=predict_response,
                 kbid=kbid,
@@ -109,7 +115,8 @@
                 client_type=client_type,
                 origin=origin,
                 user_query=user_query,
-                is_json="json" in (media_type or ""),
+                is_ndjson_stream="json" in (media_type or ""),
+                metrics=metrics,
             )
         else:
             streaming_generator = predict_response.content.iter_any()
@@ -120,11 +127,10 @@
             media_type=media_type,
         )
     else:
-        metrics = AskMetrics()
        with metrics.time(PREDICT_ANSWER_METRIC):
             content = await predict_response.read()
 
-        if endpoint == PredictProxiedEndpoints.CHAT:
+        if should_audit:
             try:
                 llm_status_code = int(content[-1:].decode())  # Decode just the last char
                 if llm_status_code != 0:
@@ -140,8 +146,10 @@
                 client_type=client_type,
                 origin=origin,
                 text_answer=content,
+                text_reasoning=None,
                 generative_answer_time=metrics[PREDICT_ANSWER_METRIC],
                 generative_answer_first_chunk_time=None,
+                generative_reasoning_first_chunk_time=None,
                 status_code=AnswerStatusCode(str(llm_status_code)),
             )
 
@@ -170,26 +178,35 @@ async def chat_streaming_generator(
     client_type: NucliaDBClientType,
     origin: str,
     user_query: str,
-    is_json: bool,
+    is_ndjson_stream: bool,
+    metrics: AskMetrics,
 ):
     first = True
+    first_reasoning = True
     status_code = AnswerStatusCode.ERROR.value
     text_answer = ""
+    text_reasoning = ""
     json_object = None
-    metrics = AskMetrics()
     with metrics.time(PREDICT_ANSWER_METRIC):
         async for chunk in predict_response.content:
-            if first:
-                metrics.record_first_chunk_yielded()
-                first = False
-
             yield chunk
-
-            if is_json:
+            if is_ndjson_stream:
                 try:
                     parsed_chunk = GenerativeChunk.model_validate_json(chunk).chunk
+                    if first and isinstance(
+                        parsed_chunk,
+                        (TextGenerativeResponse, JSONGenerativeResponse, StatusGenerativeResponse),
+                    ):
+                        metrics.record_first_chunk_yielded()
+                        first = False
+
                     if isinstance(parsed_chunk, TextGenerativeResponse):
                         text_answer += parsed_chunk.text
+                    elif isinstance(parsed_chunk, ReasoningGenerativeResponse):
+                        if first_reasoning:
+                            metrics.record_first_reasoning_chunk_yielded()
+                            first_reasoning = False
+                        text_reasoning += parsed_chunk.text
                     elif isinstance(parsed_chunk, JSONGenerativeResponse):
                         json_object = parsed_chunk.object
                     elif isinstance(parsed_chunk, StatusGenerativeResponse):
@@ -201,8 +218,11 @@
                 )
             else:
                 text_answer += chunk.decode()
+                if first:
+                    metrics.record_first_chunk_yielded()
+                    first = False
 
-    if is_json is False and chunk:  # Ensure chunk is not empty before decoding
+    if is_ndjson_stream is False and chunk:  # Ensure chunk is not empty before decoding
         # If response is text the status_code comes at the last chunk of data
         last_chunk = chunk.decode()
         if last_chunk[-1] == "0":
@@ -218,8 +238,10 @@
         client_type=client_type,
         origin=origin,
         text_answer=text_answer.encode() if json_object is None else json.dumps(json_object).encode(),
+        text_reasoning=text_reasoning if text_reasoning else None,
         generative_answer_time=metrics[PREDICT_ANSWER_METRIC],
         generative_answer_first_chunk_time=metrics.get_first_chunk_time(),
+        generative_reasoning_first_chunk_time=metrics.get_first_reasoning_chunk_time(),
         status_code=AnswerStatusCode(status_code),
     )
 
@@ -232,8 +254,10 @@ def audit_predict_proxy_endpoint(
     client_type: NucliaDBClientType,
     origin: str,
     text_answer: bytes,
+    text_reasoning: str | None,
     generative_answer_time: float,
-    generative_answer_first_chunk_time: Optional[float],
+    generative_answer_first_chunk_time: float | None,
+    generative_reasoning_first_chunk_time: float | None,
     status_code: AnswerStatusCode,
 ):
     maybe_audit_chat(
@@ -250,8 +274,10 @@
         query_context_order={},
         model=headers.get(NUCLIA_LEARNING_MODEL_HEADER),
         text_answer=text_answer,
+        text_reasoning=text_reasoning,
         generative_answer_time=generative_answer_time,
         generative_answer_first_chunk_time=generative_answer_first_chunk_time or 0,
+        generative_reasoning_first_chunk_time=generative_reasoning_first_chunk_time,
         rephrase_time=None,
         status_code=status_code,
     )
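Taken together, the predict_proxy changes reduce the streaming audit path to: parse each NDJSON chunk into a typed model, accumulate answer and reasoning text separately, and record one first-chunk time per stream kind. A self-contained approximation using a plain pydantic model (the real code dispatches on nuclia_models' GenerativeChunk payload types):

    from pydantic import BaseModel

    class Chunk(BaseModel):  # simplified stand-in for the GenerativeChunk payloads
        kind: str  # "text" or "reasoning"
        text: str

    stream = [
        b'{"kind": "reasoning", "text": "thinking..."}',
        b'{"kind": "text", "text": "Hello"}',
        b'{"kind": "text", "text": " world"}',
    ]

    text_answer, text_reasoning = "", ""
    for raw in stream:
        chunk = Chunk.model_validate_json(raw)  # one JSON document per chunk
        if chunk.kind == "text":
            text_answer += chunk.text
        else:
            text_reasoning += chunk.text

    assert text_answer == "Hello world"
    assert text_reasoning == "thinking..."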