nucliadb 6.7.2.post4862__py3-none-any.whl → 6.9.2.post5282__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nucliadb might be problematic. Click here for more details.

Files changed (126) hide show
  1. migrations/0016_upgrade_to_paragraphs_v2.py +1 -1
  2. migrations/0017_multiple_writable_shards.py +1 -1
  3. migrations/0018_purge_orphan_kbslugs.py +1 -1
  4. migrations/0019_upgrade_to_paragraphs_v3.py +1 -1
  5. migrations/0021_overwrite_vectorsets_key.py +1 -1
  6. migrations/0023_backfill_pg_catalog.py +7 -3
  7. migrations/0025_assign_models_to_kbs_v2.py +3 -3
  8. migrations/0027_rollover_texts3.py +1 -1
  9. migrations/0028_extracted_vectors_reference.py +1 -1
  10. migrations/0029_backfill_field_status.py +1 -1
  11. migrations/0032_remove_old_relations.py +1 -1
  12. migrations/0036_backfill_catalog_slug.py +1 -1
  13. migrations/0037_backfill_catalog_facets.py +1 -1
  14. migrations/0038_backfill_catalog_field_labels.py +7 -3
  15. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  16. migrations/0040_migrate_search_configurations.py +79 -0
  17. migrations/pg/0010_shards_index.py +34 -0
  18. nucliadb/backups/create.py +3 -3
  19. nucliadb/backups/restore.py +3 -3
  20. nucliadb/common/cache.py +1 -1
  21. nucliadb/common/catalog/__init__.py +79 -0
  22. nucliadb/common/catalog/dummy.py +36 -0
  23. nucliadb/common/catalog/interface.py +85 -0
  24. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +294 -208
  25. nucliadb/common/catalog/utils.py +56 -0
  26. nucliadb/common/cluster/manager.py +3 -19
  27. nucliadb/common/cluster/rebalance.py +484 -110
  28. nucliadb/common/cluster/rollover.py +29 -0
  29. nucliadb/common/cluster/settings.py +1 -1
  30. nucliadb/common/cluster/utils.py +26 -0
  31. nucliadb/common/datamanagers/atomic.py +6 -0
  32. nucliadb/common/datamanagers/utils.py +2 -2
  33. nucliadb/common/external_index_providers/manager.py +1 -29
  34. nucliadb/common/external_index_providers/settings.py +1 -27
  35. nucliadb/common/filter_expression.py +16 -33
  36. nucliadb/common/http_clients/exceptions.py +8 -0
  37. nucliadb/common/http_clients/processing.py +4 -0
  38. nucliadb/common/http_clients/utils.py +3 -0
  39. nucliadb/common/ids.py +77 -55
  40. nucliadb/common/locking.py +4 -4
  41. nucliadb/common/maindb/driver.py +11 -1
  42. nucliadb/common/maindb/local.py +1 -1
  43. nucliadb/common/maindb/pg.py +1 -1
  44. nucliadb/common/nidx.py +19 -1
  45. nucliadb/common/vector_index_config.py +1 -1
  46. nucliadb/export_import/datamanager.py +3 -3
  47. nucliadb/ingest/consumer/pull.py +7 -0
  48. nucliadb/ingest/consumer/service.py +2 -27
  49. nucliadb/ingest/consumer/shard_creator.py +17 -6
  50. nucliadb/ingest/fields/base.py +9 -17
  51. nucliadb/ingest/fields/conversation.py +47 -1
  52. nucliadb/ingest/orm/brain_v2.py +21 -3
  53. nucliadb/ingest/orm/index_message.py +126 -111
  54. nucliadb/ingest/orm/knowledgebox.py +84 -43
  55. nucliadb/ingest/orm/processor/auditing.py +1 -1
  56. nucliadb/ingest/orm/processor/processor.py +95 -149
  57. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  58. nucliadb/ingest/orm/resource.py +10 -1
  59. nucliadb/ingest/partitions.py +12 -1
  60. nucliadb/ingest/serialize.py +2 -2
  61. nucliadb/ingest/service/writer.py +26 -19
  62. nucliadb/ingest/settings.py +33 -11
  63. nucliadb/learning_proxy.py +12 -15
  64. nucliadb/metrics_exporter.py +17 -4
  65. nucliadb/migrator/datamanager.py +11 -17
  66. nucliadb/migrator/migrator.py +2 -2
  67. nucliadb/purge/__init__.py +12 -17
  68. nucliadb/purge/orphan_shards.py +2 -2
  69. nucliadb/reader/api/v1/knowledgebox.py +40 -12
  70. nucliadb/reader/api/v1/learning_config.py +30 -10
  71. nucliadb/reader/api/v1/resource.py +2 -2
  72. nucliadb/reader/api/v1/services.py +1 -1
  73. nucliadb/reader/reader/notifications.py +1 -1
  74. nucliadb/search/api/v1/__init__.py +1 -0
  75. nucliadb/search/api/v1/catalog.py +4 -4
  76. nucliadb/search/api/v1/find.py +1 -4
  77. nucliadb/search/api/v1/hydrate.py +328 -0
  78. nucliadb/search/api/v1/resource/ask.py +21 -1
  79. nucliadb/search/api/v1/search.py +1 -4
  80. nucliadb/search/predict.py +9 -2
  81. nucliadb/search/search/cache.py +1 -20
  82. nucliadb/search/search/chat/ask.py +50 -8
  83. nucliadb/search/search/chat/prompt.py +47 -15
  84. nucliadb/search/search/chat/query.py +8 -1
  85. nucliadb/search/search/fetch.py +1 -1
  86. nucliadb/search/search/find.py +1 -6
  87. nucliadb/search/search/{hydrator.py → hydrator/__init__.py} +5 -4
  88. nucliadb/search/search/hydrator/fields.py +175 -0
  89. nucliadb/search/search/hydrator/images.py +130 -0
  90. nucliadb/search/search/hydrator/paragraphs.py +307 -0
  91. nucliadb/search/search/hydrator/resources.py +56 -0
  92. nucliadb/search/search/metrics.py +16 -0
  93. nucliadb/search/search/predict_proxy.py +33 -11
  94. nucliadb/search/search/query.py +0 -23
  95. nucliadb/search/search/query_parser/fetcher.py +5 -5
  96. nucliadb/search/search/query_parser/models.py +1 -30
  97. nucliadb/search/search/query_parser/parsers/ask.py +1 -1
  98. nucliadb/search/search/query_parser/parsers/catalog.py +4 -7
  99. nucliadb/search/search/query_parser/parsers/common.py +16 -7
  100. nucliadb/search/search/query_parser/parsers/find.py +0 -11
  101. nucliadb/search/search/query_parser/parsers/graph.py +5 -5
  102. nucliadb/search/search/query_parser/parsers/search.py +0 -11
  103. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +4 -11
  104. nucliadb/search/search/rerankers.py +1 -1
  105. nucliadb/search/search/summarize.py +1 -1
  106. nucliadb/standalone/run.py +3 -0
  107. nucliadb/tasks/retries.py +4 -4
  108. nucliadb/train/generators/sentence_classifier.py +2 -8
  109. nucliadb/train/generators/utils.py +1 -1
  110. nucliadb/train/nodes.py +4 -4
  111. nucliadb/train/servicer.py +1 -1
  112. nucliadb/train/uploader.py +1 -1
  113. nucliadb/writer/api/v1/field.py +14 -9
  114. nucliadb/writer/api/v1/knowledgebox.py +15 -52
  115. nucliadb/writer/api/v1/learning_config.py +5 -4
  116. nucliadb/writer/api/v1/resource.py +2 -2
  117. nucliadb/writer/resource/field.py +38 -2
  118. nucliadb/writer/tus/azure.py +4 -4
  119. nucliadb/writer/tus/gcs.py +11 -17
  120. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/METADATA +9 -10
  121. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/RECORD +124 -114
  122. nucliadb/common/external_index_providers/pinecone.py +0 -894
  123. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  124. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/WHEEL +0 -0
  125. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/entry_points.txt +0 -0
  126. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/top_level.txt +0 -0
@@ -20,14 +20,16 @@
20
20
  import dataclasses
21
21
  import functools
22
22
  import json
23
- from typing import AsyncGenerator, Optional, cast
23
+ from typing import AsyncGenerator, Optional, Union, cast
24
24
 
25
25
  from nuclia_models.common.consumption import Consumption
26
26
  from nuclia_models.predict.generative_responses import (
27
27
  CitationsGenerativeResponse,
28
+ FootnoteCitationsGenerativeResponse,
28
29
  GenerativeChunk,
29
30
  JSONGenerativeResponse,
30
31
  MetaGenerativeResponse,
32
+ ReasoningGenerativeResponse,
31
33
  StatusGenerativeResponse,
32
34
  TextGenerativeResponse,
33
35
  )
@@ -90,6 +92,7 @@ from nucliadb_models.search import (
90
92
  FindOptions,
91
93
  FindParagraph,
92
94
  FindRequest,
95
+ FootnoteCitationsAskResponseItem,
93
96
  GraphStrategy,
94
97
  JSONAskResponseItem,
95
98
  KnowledgeboxFindResults,
@@ -102,6 +105,7 @@ from nucliadb_models.search import (
102
105
  PromptContext,
103
106
  PromptContextOrder,
104
107
  RagStrategyName,
108
+ ReasoningAskResponseItem,
105
109
  Relations,
106
110
  RelationsAskResponseItem,
107
111
  RetrievalAskResponseItem,
@@ -167,9 +171,11 @@ class AskResult:
167
171
 
168
172
  # Computed from the predict chat answer stream
169
173
  self._answer_text = ""
174
+ self._reasoning_text: Optional[str] = None
170
175
  self._object: Optional[JSONGenerativeResponse] = None
171
176
  self._status: Optional[StatusGenerativeResponse] = None
172
177
  self._citations: Optional[CitationsGenerativeResponse] = None
178
+ self._footnote_citations: Optional[FootnoteCitationsGenerativeResponse] = None
173
179
  self._metadata: Optional[MetaGenerativeResponse] = None
174
180
  self._relations: Optional[Relations] = None
175
181
  self._consumption: Optional[Consumption] = None
@@ -220,12 +226,23 @@ class AskResult:
220
226
  async def _stream(self) -> AsyncGenerator[AskResponseItemType, None]:
221
227
  # First, stream out the predict answer
222
228
  first_chunk_yielded = False
229
+ first_reasoning_chunk_yielded = False
223
230
  with self.metrics.time("stream_predict_answer"):
224
231
  async for answer_chunk in self._stream_predict_answer_text():
225
- yield AnswerAskResponseItem(text=answer_chunk)
226
- if not first_chunk_yielded:
227
- self.metrics.record_first_chunk_yielded()
228
- first_chunk_yielded = True
232
+ if isinstance(answer_chunk, TextGenerativeResponse):
233
+ yield AnswerAskResponseItem(text=answer_chunk.text)
234
+ if not first_chunk_yielded:
235
+ self.metrics.record_first_chunk_yielded()
236
+ first_chunk_yielded = True
237
+ elif isinstance(answer_chunk, ReasoningGenerativeResponse):
238
+ yield ReasoningAskResponseItem(text=answer_chunk.text)
239
+ if not first_reasoning_chunk_yielded:
240
+ self.metrics.record_first_reasoning_chunk_yielded()
241
+ first_reasoning_chunk_yielded = True
242
+ else:
243
+ # This is a trick so mypy generates an error if this branch can be reached,
244
+ # that is, if we are missing some ifs
245
+ _a: int = "a"
229
246
 
230
247
  if self._object is not None:
231
248
  yield JSONAskResponseItem(object=self._object.object)
@@ -274,8 +291,10 @@ class AskResult:
274
291
  audit_answer = json.dumps(self._object.object).encode("utf-8")
275
292
  self.auditor.audit(
276
293
  text_answer=audit_answer,
294
+ text_reasoning=self._reasoning_text,
277
295
  generative_answer_time=self.metrics["stream_predict_answer"],
278
296
  generative_answer_first_chunk_time=self.metrics.get_first_chunk_time() or 0,
297
+ generative_reasoning_first_chunk_time=self.metrics.get_first_reasoning_chunk_time(),
279
298
  rephrase_time=self.metrics.get("rephrase"),
280
299
  status_code=self.status_code,
281
300
  )
@@ -287,6 +306,11 @@ class AskResult:
287
306
  yield CitationsAskResponseItem(
288
307
  citations=self._citations.citations,
289
308
  )
309
+ # Stream out the footnote citations mapping
310
+ if self._footnote_citations is not None:
311
+ yield FootnoteCitationsAskResponseItem(
312
+ footnote_to_context=self._footnote_citations.footnote_to_context,
313
+ )
290
314
 
291
315
  # Stream out generic metadata about the answer
292
316
  if self._metadata is not None:
@@ -364,6 +388,10 @@ class AskResult:
364
388
  if self._citations is not None:
365
389
  citations = self._citations.citations
366
390
 
391
+ footnote_citations = {}
392
+ if self._footnote_citations is not None:
393
+ footnote_citations = self._footnote_citations.footnote_to_context
394
+
367
395
  answer_json = None
368
396
  if self._object is not None:
369
397
  answer_json = self._object.object
@@ -384,6 +412,7 @@ class AskResult:
384
412
 
385
413
  response = SyncAskResponse(
386
414
  answer=self._answer_text,
415
+ reasoning=self._reasoning_text,
387
416
  answer_json=answer_json,
388
417
  status=self.status_code.prettify(),
389
418
  relations=self._relations,
@@ -391,6 +420,7 @@ class AskResult:
391
420
  retrieval_best_matches=best_matches,
392
421
  prequeries=prequeries_results,
393
422
  citations=citations,
423
+ citation_footnote_to_context=footnote_citations,
394
424
  metadata=metadata,
395
425
  consumption=self._consumption,
396
426
  learning_id=self.nuclia_learning_id or "",
@@ -420,7 +450,9 @@ class AskResult:
420
450
  )
421
451
  return self._relations
422
452
 
423
- async def _stream_predict_answer_text(self) -> AsyncGenerator[str, None]:
453
+ async def _stream_predict_answer_text(
454
+ self,
455
+ ) -> AsyncGenerator[Union[TextGenerativeResponse, ReasoningGenerativeResponse], None]:
424
456
  """
425
457
  Reads the stream of the generative model, yielding the answer text but also parsing
426
458
  other items like status codes, citations and miscellaneous metadata.
@@ -435,13 +467,21 @@ class AskResult:
435
467
  item = generative_chunk.chunk
436
468
  if isinstance(item, TextGenerativeResponse):
437
469
  self._answer_text += item.text
438
- yield item.text
470
+ yield item
471
+ elif isinstance(item, ReasoningGenerativeResponse):
472
+ if self._reasoning_text is None:
473
+ self._reasoning_text = item.text
474
+ else:
475
+ self._reasoning_text += item.text
476
+ yield item
439
477
  elif isinstance(item, JSONGenerativeResponse):
440
478
  self._object = item
441
479
  elif isinstance(item, StatusGenerativeResponse):
442
480
  self._status = item
443
481
  elif isinstance(item, CitationsGenerativeResponse):
444
482
  self._citations = item
483
+ elif isinstance(item, FootnoteCitationsGenerativeResponse):
484
+ self._footnote_citations = item
445
485
  elif isinstance(item, MetaGenerativeResponse):
446
486
  self._metadata = item
447
487
  elif isinstance(item, Consumption):
@@ -559,11 +599,13 @@ async def ask(
559
599
  origin=origin,
560
600
  generative_answer_time=0,
561
601
  generative_answer_first_chunk_time=0,
602
+ generative_reasoning_first_chunk_time=None,
562
603
  rephrase_time=metrics.get("rephrase"),
563
604
  user_query=user_query,
564
605
  rephrased_query=rephrased_query,
565
606
  retrieval_rephrase_query=err.main_query.rephrased_query if err.main_query else None,
566
607
  text_answer=b"",
608
+ text_reasoning=None,
567
609
  status_code=AnswerStatusCode.NO_RETRIEVAL_DATA,
568
610
  chat_history=chat_history,
569
611
  query_context={},
@@ -625,6 +667,7 @@ async def ask(
625
667
  json_schema=ask_request.answer_json_schema,
626
668
  rerank_context=False,
627
669
  top_k=ask_request.top_k,
670
+ reasoning=ask_request.reasoning,
628
671
  )
629
672
 
630
673
  nuclia_learning_id = None
@@ -1034,7 +1077,6 @@ def calculate_prequeries_for_json_schema(
1034
1077
  rephrase=ask_request.rephrase,
1035
1078
  rephrase_prompt=parse_rephrase_prompt(ask_request),
1036
1079
  security=ask_request.security,
1037
- autofilter=False,
1038
1080
  )
1039
1081
  prequery = PreQuery(
1040
1082
  request=req,
@@ -26,6 +26,7 @@ from typing import Deque, Dict, List, Optional, Sequence, Tuple, Union, cast
26
26
  import yaml
27
27
  from pydantic import BaseModel
28
28
 
29
+ from nucliadb.common import datamanagers
29
30
  from nucliadb.common.ids import FIELD_TYPE_STR_TO_PB, FieldId, ParagraphId
30
31
  from nucliadb.common.maindb.utils import get_driver
31
32
  from nucliadb.common.models_utils import from_proto
@@ -246,7 +247,7 @@ async def default_prompt_context(
246
247
  - Using a dict prevents duplicates pulled in through conversation expansion.
247
248
  """
248
249
  # Sort retrieved paragraphs by decreasing order (most relevant first)
249
- async with get_driver().transaction(read_only=True) as txn:
250
+ async with get_driver().ro_transaction() as txn:
250
251
  storage = await get_storage()
251
252
  kb = KnowledgeBoxORM(txn, storage, kbid)
252
253
  for paragraph in ordered_paragraphs:
@@ -589,18 +590,7 @@ async def field_extension_prompt_context(
589
590
  if resource_uuid not in ordered_resources:
590
591
  ordered_resources.append(resource_uuid)
591
592
 
592
- # Fetch the extracted texts of the specified fields for each resource
593
- extend_fields = strategy.fields
594
- extend_field_ids = []
595
- for resource_uuid in ordered_resources:
596
- for field_id in extend_fields:
597
- try:
598
- fid = FieldId.from_string(f"{resource_uuid}/{field_id.strip('/')}")
599
- extend_field_ids.append(fid)
600
- except ValueError: # pragma: no cover
601
- # Invalid field id, skiping
602
- continue
603
-
593
+ extend_field_ids = await get_matching_field_ids(kbid, ordered_resources, strategy)
604
594
  tasks = [hydrate_field_text(kbid, fid) for fid in extend_field_ids]
605
595
  field_extracted_texts = await run_concurrently(tasks)
606
596
 
@@ -630,6 +620,43 @@ async def field_extension_prompt_context(
630
620
  context[paragraph.id] = _clean_paragraph_text(paragraph)
631
621
 
632
622
 
623
+ async def get_matching_field_ids(
624
+ kbid: str, ordered_resources: list[str], strategy: FieldExtensionStrategy
625
+ ) -> list[FieldId]:
626
+ extend_field_ids: list[FieldId] = []
627
+ # Fetch the extracted texts of the specified fields for each resource
628
+ for resource_uuid in ordered_resources:
629
+ for field_id in strategy.fields:
630
+ try:
631
+ fid = FieldId.from_string(f"{resource_uuid}/{field_id.strip('/')}")
632
+ extend_field_ids.append(fid)
633
+ except ValueError: # pragma: no cover
634
+ # Invalid field id, skipping
635
+ continue
636
+ if len(strategy.data_augmentation_field_prefixes) > 0:
637
+ for resource_uuid in ordered_resources:
638
+ all_field_ids = await datamanagers.atomic.resources.get_all_field_ids(
639
+ kbid=kbid, rid=resource_uuid, for_update=False
640
+ )
641
+ if all_field_ids is None:
642
+ continue
643
+ for fieldid in all_field_ids.fields:
644
+ # Generated fields are always text fields starting with "da-"
645
+ if any(
646
+ (
647
+ fieldid.field_type == resources_pb2.FieldType.TEXT
648
+ and fieldid.field.startswith(f"da-{prefix}-")
649
+ )
650
+ for prefix in strategy.data_augmentation_field_prefixes
651
+ ):
652
+ extend_field_ids.append(
653
+ FieldId.from_pb(
654
+ rid=resource_uuid, field_type=fieldid.field_type, key=fieldid.field
655
+ )
656
+ )
657
+ return extend_field_ids
658
+
659
+
633
660
  async def get_orm_field(kbid: str, field_id: FieldId) -> Optional[Field]:
634
661
  resource = await cache.get_resource(kbid, field_id.rid)
635
662
  if resource is None: # pragma: no cover
@@ -779,7 +806,7 @@ async def conversation_prompt_context(
779
806
  ):
780
807
  analyzed_fields: List[str] = []
781
808
  ops = 0
782
- async with get_driver().transaction(read_only=True) as txn:
809
+ async with get_driver().ro_transaction() as txn:
783
810
  storage = await get_storage()
784
811
  kb = KnowledgeBoxORM(txn, storage, kbid)
785
812
  for paragraph in ordered_paragraphs:
@@ -946,9 +973,14 @@ async def hierarchy_prompt_context(
946
973
  paragraph_id = ParagraphId.from_string(paragraph.id)
947
974
  extended_paragraph_text = paragraph.text
948
975
  if paragraphs_extra_characters > 0:
976
+ extended_paragraph_id = ParagraphId(
977
+ field_id=paragraph_id.field_id,
978
+ paragraph_start=paragraph_id.paragraph_start,
979
+ paragraph_end=paragraph_id.paragraph_end + paragraphs_extra_characters,
980
+ )
949
981
  extended_paragraph_text = await get_paragraph_text(
950
982
  kbid=kbid,
951
- paragraph_id=paragraph_id,
983
+ paragraph_id=extended_paragraph_id,
952
984
  log_on_missing_field=True,
953
985
  )
954
986
  rid = paragraph_id.rid
@@ -200,7 +200,6 @@ def find_request_from_ask_request(item: AskRequest, query: str) -> FindRequest:
200
200
  find_request.range_modification_end = item.range_modification_end
201
201
  find_request.show = item.show
202
202
  find_request.extracted = item.extracted
203
- find_request.autofilter = item.autofilter
204
203
  find_request.highlight = item.highlight
205
204
  find_request.security = item.security
206
205
  find_request.debug = item.debug
@@ -308,11 +307,13 @@ def maybe_audit_chat(
308
307
  origin: str,
309
308
  generative_answer_time: float,
310
309
  generative_answer_first_chunk_time: float,
310
+ generative_reasoning_first_chunk_time: Optional[float],
311
311
  rephrase_time: Optional[float],
312
312
  user_query: str,
313
313
  rephrased_query: Optional[str],
314
314
  retrieval_rephrase_query: Optional[str],
315
315
  text_answer: bytes,
316
+ text_reasoning: Optional[str],
316
317
  status_code: AnswerStatusCode,
317
318
  chat_history: list[ChatContextMessage],
318
319
  query_context: PromptContext,
@@ -344,12 +345,14 @@ def maybe_audit_chat(
344
345
  question=user_query,
345
346
  generative_answer_time=generative_answer_time,
346
347
  generative_answer_first_chunk_time=generative_answer_first_chunk_time,
348
+ generative_reasoning_first_chunk_time=generative_reasoning_first_chunk_time,
347
349
  rephrase_time=rephrase_time,
348
350
  rephrased_question=rephrased_query,
349
351
  retrieval_rephrased_question=retrieval_rephrase_query,
350
352
  chat_context=chat_history_context,
351
353
  retrieved_context=chat_retrieved_context,
352
354
  answer=audit_answer,
355
+ reasoning=text_reasoning,
353
356
  learning_id=learning_id,
354
357
  status_code=int(status_code.value),
355
358
  model=model,
@@ -401,8 +404,10 @@ class ChatAuditor:
401
404
  def audit(
402
405
  self,
403
406
  text_answer: bytes,
407
+ text_reasoning: Optional[str],
404
408
  generative_answer_time: float,
405
409
  generative_answer_first_chunk_time: float,
410
+ generative_reasoning_first_chunk_time: Optional[float],
406
411
  rephrase_time: Optional[float],
407
412
  status_code: AnswerStatusCode,
408
413
  ):
@@ -416,8 +421,10 @@ class ChatAuditor:
416
421
  retrieval_rephrase_query=self.retrieval_rephrased_query,
417
422
  generative_answer_time=generative_answer_time,
418
423
  generative_answer_first_chunk_time=generative_answer_first_chunk_time,
424
+ generative_reasoning_first_chunk_time=generative_reasoning_first_chunk_time,
419
425
  rephrase_time=rephrase_time,
420
426
  text_answer=text_answer,
427
+ text_reasoning=text_reasoning,
421
428
  status_code=status_code,
422
429
  chat_history=self.chat_history,
423
430
  query_context=self.query_context,
@@ -55,7 +55,7 @@ async def fetch_resources(
55
55
  extracted = []
56
56
 
57
57
  result = {}
58
- async with get_driver().transaction(read_only=True) as txn:
58
+ async with get_driver().ro_transaction() as txn:
59
59
  tasks = []
60
60
  for resource in resources:
61
61
  tasks.append(
@@ -100,7 +100,6 @@ async def _index_node_retrieval(
100
100
  (
101
101
  pb_query,
102
102
  incomplete_results,
103
- autofilters,
104
103
  rephrased_query,
105
104
  ) = await legacy_convert_retrieval_to_proto(parsed)
106
105
 
@@ -137,7 +136,6 @@ async def _index_node_retrieval(
137
136
  )
138
137
 
139
138
  search_results.shards = queried_shards
140
- search_results.autofilters = autofilters
141
139
 
142
140
  ndb_time = metrics["index_search"] + metrics["results_merge"]
143
141
  if metrics["index_search"] > settings.slow_node_query_log_threshold:
@@ -180,9 +178,7 @@ async def _external_index_retrieval(
180
178
  parsed = await parse_find(kbid, item)
181
179
  assert parsed.retrieval.reranker is not None, "find parser must provide a reranking algorithm"
182
180
  reranker = get_reranker(parsed.retrieval.reranker)
183
- search_request, incomplete_results, _, rephrased_query = await legacy_convert_retrieval_to_proto(
184
- parsed
185
- )
181
+ search_request, incomplete_results, rephrased_query = await legacy_convert_retrieval_to_proto(parsed)
186
182
 
187
183
  # Query index
188
184
  query_results = await external_index_manager.query(search_request) # noqa
@@ -220,7 +216,6 @@ async def _external_index_retrieval(
220
216
  page_number=0,
221
217
  page_size=item.top_k,
222
218
  relations=None, # Not implemented for external indexes yet
223
- autofilters=[], # Not implemented for external indexes yet
224
219
  min_score=results_min_score,
225
220
  best_matches=best_matches,
226
221
  # These are not used for external indexes
@@ -28,7 +28,8 @@ from nucliadb.common.external_index_providers.base import TextBlockMatch
28
28
  from nucliadb.common.ids import FieldId
29
29
  from nucliadb.common.maindb.utils import get_driver
30
30
  from nucliadb.ingest.serialize import managed_serialize
31
- from nucliadb.search.search import cache, paragraphs
31
+ from nucliadb.search.search import cache
32
+ from nucliadb.search.search.paragraphs import get_paragraph_text
32
33
  from nucliadb_models.common import FieldTypeName
33
34
  from nucliadb_models.resource import ExtractedDataTypeName, Resource
34
35
  from nucliadb_models.search import (
@@ -79,7 +80,7 @@ async def hydrate_resource_text(
79
80
  return []
80
81
 
81
82
  # Schedule the extraction of the text of each field in the resource
82
- async with get_driver().transaction(read_only=True) as txn:
83
+ async with get_driver().ro_transaction() as txn:
83
84
  resource.txn = txn
84
85
  runner = ConcurrentRunner(max_tasks=max_concurrent_tasks)
85
86
  for field_type, field_key in await resource.get_fields(force=True):
@@ -120,7 +121,7 @@ async def hydrate_resource_metadata(
120
121
  if concurrency_control is not None:
121
122
  await stack.enter_async_context(concurrency_control)
122
123
 
123
- async with get_driver().transaction(read_only=True) as ro_txn:
124
+ async with get_driver().ro_transaction() as ro_txn:
124
125
  serialized_resource = await managed_serialize(
125
126
  txn=ro_txn,
126
127
  kbid=kbid,
@@ -170,7 +171,7 @@ async def hydrate_text_block(
170
171
  if concurrency_control is not None:
171
172
  await stack.enter_async_context(concurrency_control)
172
173
 
173
- text_block.text = await paragraphs.get_paragraph_text(
174
+ text_block.text = await get_paragraph_text(
174
175
  kbid=kbid,
175
176
  paragraph_id=text_block.paragraph_id,
176
177
  highlight=options.highlight,
@@ -0,0 +1,175 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+
21
+
22
+ from nucliadb.common.ids import FIELD_TYPE_STR_TO_NAME, FieldId
23
+ from nucliadb.common.models_utils import from_proto
24
+ from nucliadb.ingest.orm.resource import Resource
25
+ from nucliadb.search.search.hydrator import hydrate_field_text
26
+ from nucliadb_models import hydration as hydration_models
27
+ from nucliadb_models.common import FieldTypeName
28
+
29
+
30
+ def page_preview_id(page_number: int) -> str:
31
+ """Return the string page number for an specific page"""
32
+ return f"{page_number}"
33
+
34
+
35
+ async def hydrate_field(resource: Resource, field_id: FieldId, config: hydration_models.FieldHydration):
36
+ field_type = FIELD_TYPE_STR_TO_NAME[field_id.type]
37
+
38
+ if field_type == FieldTypeName.TEXT:
39
+ if not config.text is not None:
40
+ return
41
+ return await hydrate_text_field(resource, field_id, config.text)
42
+
43
+ elif field_type == FieldTypeName.FILE is not None:
44
+ if not config.file:
45
+ return
46
+ return await hydrate_file_field(resource, field_id, config.file)
47
+
48
+ elif field_type == FieldTypeName.LINK is not None:
49
+ if not config.link:
50
+ return
51
+ return await hydrate_link_field(resource, field_id, config.link)
52
+
53
+ elif field_type == FieldTypeName.CONVERSATION is not None:
54
+ if not config.conversation:
55
+ return
56
+ return await hydrate_conversation_field(resource, field_id, config.conversation)
57
+
58
+ elif field_type == FieldTypeName.GENERIC is not None:
59
+ if not config.generic:
60
+ return
61
+ return await hydrate_generic_field(resource, field_id, config.generic)
62
+
63
+ else: # pragma: no cover
64
+ # This is a trick so mypy generates an error if this branch can be reached,
65
+ # that is, if we are missing some ifs
66
+ _a: int = "a"
67
+
68
+
69
+ async def hydrate_text_field(
70
+ resource: Resource,
71
+ field_id: FieldId,
72
+ config: hydration_models.TextFieldHydration,
73
+ ) -> hydration_models.HydratedTextField:
74
+ hydrated = hydration_models.HydratedTextField(
75
+ id=field_id.full(),
76
+ resource=field_id.rid,
77
+ field_type=FieldTypeName.TEXT,
78
+ )
79
+
80
+ if config.extracted_text:
81
+ field_text = await hydrate_field_text(resource.kb.kbid, field_id)
82
+ if field_text is not None:
83
+ (_, text) = field_text
84
+ hydrated.extracted = hydration_models.FieldExtractedData(text=text)
85
+
86
+ return hydrated
87
+
88
+
89
+ async def hydrate_file_field(
90
+ resource: Resource,
91
+ field_id: FieldId,
92
+ config: hydration_models.FileFieldHydration,
93
+ ) -> hydration_models.HydratedFileField:
94
+ hydrated = hydration_models.HydratedFileField(
95
+ id=field_id.full(),
96
+ resource=field_id.rid,
97
+ field_type=FieldTypeName.FILE,
98
+ )
99
+
100
+ if config.value:
101
+ field = await resource.get_field(field_id.key, field_id.pb_type)
102
+ value = await field.get_value()
103
+ hydrated.value = from_proto.field_file(value)
104
+
105
+ if config.extracted_text:
106
+ field_text = await hydrate_field_text(resource.kb.kbid, field_id)
107
+ if field_text is not None:
108
+ (_, text) = field_text
109
+ hydrated.extracted = hydration_models.FieldExtractedData(text=text)
110
+
111
+ return hydrated
112
+
113
+
114
+ async def hydrate_link_field(
115
+ resource: Resource,
116
+ field_id: FieldId,
117
+ config: hydration_models.LinkFieldHydration,
118
+ ) -> hydration_models.HydratedLinkField:
119
+ hydrated = hydration_models.HydratedLinkField(
120
+ id=field_id.full(),
121
+ resource=field_id.rid,
122
+ field_type=FieldTypeName.LINK,
123
+ )
124
+
125
+ if config.value:
126
+ field = await resource.get_field(field_id.key, field_id.pb_type)
127
+ value = await field.get_value()
128
+ hydrated.value = from_proto.field_link(value)
129
+
130
+ if config.extracted_text:
131
+ field_text = await hydrate_field_text(resource.kb.kbid, field_id)
132
+ if field_text is not None:
133
+ (_, text) = field_text
134
+ hydrated.extracted = hydration_models.FieldExtractedData(text=text)
135
+
136
+ return hydrated
137
+
138
+
139
+ async def hydrate_conversation_field(
140
+ resource: Resource,
141
+ field_id: FieldId,
142
+ config: hydration_models.ConversationFieldHydration,
143
+ ) -> hydration_models.HydratedConversationField:
144
+ hydrated = hydration_models.HydratedConversationField(
145
+ id=field_id.full(),
146
+ resource=field_id.rid,
147
+ field_type=FieldTypeName.CONVERSATION,
148
+ )
149
+ # TODO: implement conversation fields
150
+ return hydrated
151
+
152
+
153
+ async def hydrate_generic_field(
154
+ resource: Resource,
155
+ field_id: FieldId,
156
+ config: hydration_models.GenericFieldHydration,
157
+ ) -> hydration_models.HydratedGenericField:
158
+ hydrated = hydration_models.HydratedGenericField(
159
+ id=field_id.full(),
160
+ resource=field_id.rid,
161
+ field_type=FieldTypeName.GENERIC,
162
+ )
163
+
164
+ if config.value:
165
+ field = await resource.get_field(field_id.key, field_id.pb_type)
166
+ value = await field.get_value()
167
+ hydrated.value = value
168
+
169
+ if config.extracted_text:
170
+ field_text = await hydrate_field_text(resource.kb.kbid, field_id)
171
+ if field_text is not None:
172
+ (_, text) = field_text
173
+ hydrated.extracted = hydration_models.FieldExtractedData(text=text)
174
+
175
+ return hydrated