nucliadb 6.7.2.post4862__py3-none-any.whl → 6.9.2.post5282__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nucliadb might be problematic. Click here for more details.
- migrations/0016_upgrade_to_paragraphs_v2.py +1 -1
- migrations/0017_multiple_writable_shards.py +1 -1
- migrations/0018_purge_orphan_kbslugs.py +1 -1
- migrations/0019_upgrade_to_paragraphs_v3.py +1 -1
- migrations/0021_overwrite_vectorsets_key.py +1 -1
- migrations/0023_backfill_pg_catalog.py +7 -3
- migrations/0025_assign_models_to_kbs_v2.py +3 -3
- migrations/0027_rollover_texts3.py +1 -1
- migrations/0028_extracted_vectors_reference.py +1 -1
- migrations/0029_backfill_field_status.py +1 -1
- migrations/0032_remove_old_relations.py +1 -1
- migrations/0036_backfill_catalog_slug.py +1 -1
- migrations/0037_backfill_catalog_facets.py +1 -1
- migrations/0038_backfill_catalog_field_labels.py +7 -3
- migrations/0039_backfill_converation_splits_metadata.py +106 -0
- migrations/0040_migrate_search_configurations.py +79 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/backups/create.py +3 -3
- nucliadb/backups/restore.py +3 -3
- nucliadb/common/cache.py +1 -1
- nucliadb/common/catalog/__init__.py +79 -0
- nucliadb/common/catalog/dummy.py +36 -0
- nucliadb/common/catalog/interface.py +85 -0
- nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +294 -208
- nucliadb/common/catalog/utils.py +56 -0
- nucliadb/common/cluster/manager.py +3 -19
- nucliadb/common/cluster/rebalance.py +484 -110
- nucliadb/common/cluster/rollover.py +29 -0
- nucliadb/common/cluster/settings.py +1 -1
- nucliadb/common/cluster/utils.py +26 -0
- nucliadb/common/datamanagers/atomic.py +6 -0
- nucliadb/common/datamanagers/utils.py +2 -2
- nucliadb/common/external_index_providers/manager.py +1 -29
- nucliadb/common/external_index_providers/settings.py +1 -27
- nucliadb/common/filter_expression.py +16 -33
- nucliadb/common/http_clients/exceptions.py +8 -0
- nucliadb/common/http_clients/processing.py +4 -0
- nucliadb/common/http_clients/utils.py +3 -0
- nucliadb/common/ids.py +77 -55
- nucliadb/common/locking.py +4 -4
- nucliadb/common/maindb/driver.py +11 -1
- nucliadb/common/maindb/local.py +1 -1
- nucliadb/common/maindb/pg.py +1 -1
- nucliadb/common/nidx.py +19 -1
- nucliadb/common/vector_index_config.py +1 -1
- nucliadb/export_import/datamanager.py +3 -3
- nucliadb/ingest/consumer/pull.py +7 -0
- nucliadb/ingest/consumer/service.py +2 -27
- nucliadb/ingest/consumer/shard_creator.py +17 -6
- nucliadb/ingest/fields/base.py +9 -17
- nucliadb/ingest/fields/conversation.py +47 -1
- nucliadb/ingest/orm/brain_v2.py +21 -3
- nucliadb/ingest/orm/index_message.py +126 -111
- nucliadb/ingest/orm/knowledgebox.py +84 -43
- nucliadb/ingest/orm/processor/auditing.py +1 -1
- nucliadb/ingest/orm/processor/processor.py +95 -149
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +10 -1
- nucliadb/ingest/partitions.py +12 -1
- nucliadb/ingest/serialize.py +2 -2
- nucliadb/ingest/service/writer.py +26 -19
- nucliadb/ingest/settings.py +33 -11
- nucliadb/learning_proxy.py +12 -15
- nucliadb/metrics_exporter.py +17 -4
- nucliadb/migrator/datamanager.py +11 -17
- nucliadb/migrator/migrator.py +2 -2
- nucliadb/purge/__init__.py +12 -17
- nucliadb/purge/orphan_shards.py +2 -2
- nucliadb/reader/api/v1/knowledgebox.py +40 -12
- nucliadb/reader/api/v1/learning_config.py +30 -10
- nucliadb/reader/api/v1/resource.py +2 -2
- nucliadb/reader/api/v1/services.py +1 -1
- nucliadb/reader/reader/notifications.py +1 -1
- nucliadb/search/api/v1/__init__.py +1 -0
- nucliadb/search/api/v1/catalog.py +4 -4
- nucliadb/search/api/v1/find.py +1 -4
- nucliadb/search/api/v1/hydrate.py +328 -0
- nucliadb/search/api/v1/resource/ask.py +21 -1
- nucliadb/search/api/v1/search.py +1 -4
- nucliadb/search/predict.py +9 -2
- nucliadb/search/search/cache.py +1 -20
- nucliadb/search/search/chat/ask.py +50 -8
- nucliadb/search/search/chat/prompt.py +47 -15
- nucliadb/search/search/chat/query.py +8 -1
- nucliadb/search/search/fetch.py +1 -1
- nucliadb/search/search/find.py +1 -6
- nucliadb/search/search/{hydrator.py → hydrator/__init__.py} +5 -4
- nucliadb/search/search/hydrator/fields.py +175 -0
- nucliadb/search/search/hydrator/images.py +130 -0
- nucliadb/search/search/hydrator/paragraphs.py +307 -0
- nucliadb/search/search/hydrator/resources.py +56 -0
- nucliadb/search/search/metrics.py +16 -0
- nucliadb/search/search/predict_proxy.py +33 -11
- nucliadb/search/search/query.py +0 -23
- nucliadb/search/search/query_parser/fetcher.py +5 -5
- nucliadb/search/search/query_parser/models.py +1 -30
- nucliadb/search/search/query_parser/parsers/ask.py +1 -1
- nucliadb/search/search/query_parser/parsers/catalog.py +4 -7
- nucliadb/search/search/query_parser/parsers/common.py +16 -7
- nucliadb/search/search/query_parser/parsers/find.py +0 -11
- nucliadb/search/search/query_parser/parsers/graph.py +5 -5
- nucliadb/search/search/query_parser/parsers/search.py +0 -11
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +4 -11
- nucliadb/search/search/rerankers.py +1 -1
- nucliadb/search/search/summarize.py +1 -1
- nucliadb/standalone/run.py +3 -0
- nucliadb/tasks/retries.py +4 -4
- nucliadb/train/generators/sentence_classifier.py +2 -8
- nucliadb/train/generators/utils.py +1 -1
- nucliadb/train/nodes.py +4 -4
- nucliadb/train/servicer.py +1 -1
- nucliadb/train/uploader.py +1 -1
- nucliadb/writer/api/v1/field.py +14 -9
- nucliadb/writer/api/v1/knowledgebox.py +15 -52
- nucliadb/writer/api/v1/learning_config.py +5 -4
- nucliadb/writer/api/v1/resource.py +2 -2
- nucliadb/writer/resource/field.py +38 -2
- nucliadb/writer/tus/azure.py +4 -4
- nucliadb/writer/tus/gcs.py +11 -17
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/METADATA +9 -10
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/RECORD +124 -114
- nucliadb/common/external_index_providers/pinecone.py +0 -894
- nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/WHEEL +0 -0
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/top_level.txt +0 -0
|
@@ -20,14 +20,16 @@
|
|
|
20
20
|
import dataclasses
|
|
21
21
|
import functools
|
|
22
22
|
import json
|
|
23
|
-
from typing import AsyncGenerator, Optional, cast
|
|
23
|
+
from typing import AsyncGenerator, Optional, Union, cast
|
|
24
24
|
|
|
25
25
|
from nuclia_models.common.consumption import Consumption
|
|
26
26
|
from nuclia_models.predict.generative_responses import (
|
|
27
27
|
CitationsGenerativeResponse,
|
|
28
|
+
FootnoteCitationsGenerativeResponse,
|
|
28
29
|
GenerativeChunk,
|
|
29
30
|
JSONGenerativeResponse,
|
|
30
31
|
MetaGenerativeResponse,
|
|
32
|
+
ReasoningGenerativeResponse,
|
|
31
33
|
StatusGenerativeResponse,
|
|
32
34
|
TextGenerativeResponse,
|
|
33
35
|
)
|
|
@@ -90,6 +92,7 @@ from nucliadb_models.search import (
|
|
|
90
92
|
FindOptions,
|
|
91
93
|
FindParagraph,
|
|
92
94
|
FindRequest,
|
|
95
|
+
FootnoteCitationsAskResponseItem,
|
|
93
96
|
GraphStrategy,
|
|
94
97
|
JSONAskResponseItem,
|
|
95
98
|
KnowledgeboxFindResults,
|
|
@@ -102,6 +105,7 @@ from nucliadb_models.search import (
|
|
|
102
105
|
PromptContext,
|
|
103
106
|
PromptContextOrder,
|
|
104
107
|
RagStrategyName,
|
|
108
|
+
ReasoningAskResponseItem,
|
|
105
109
|
Relations,
|
|
106
110
|
RelationsAskResponseItem,
|
|
107
111
|
RetrievalAskResponseItem,
|
|
@@ -167,9 +171,11 @@ class AskResult:
|
|
|
167
171
|
|
|
168
172
|
# Computed from the predict chat answer stream
|
|
169
173
|
self._answer_text = ""
|
|
174
|
+
self._reasoning_text: Optional[str] = None
|
|
170
175
|
self._object: Optional[JSONGenerativeResponse] = None
|
|
171
176
|
self._status: Optional[StatusGenerativeResponse] = None
|
|
172
177
|
self._citations: Optional[CitationsGenerativeResponse] = None
|
|
178
|
+
self._footnote_citations: Optional[FootnoteCitationsGenerativeResponse] = None
|
|
173
179
|
self._metadata: Optional[MetaGenerativeResponse] = None
|
|
174
180
|
self._relations: Optional[Relations] = None
|
|
175
181
|
self._consumption: Optional[Consumption] = None
|
|
@@ -220,12 +226,23 @@ class AskResult:
|
|
|
220
226
|
async def _stream(self) -> AsyncGenerator[AskResponseItemType, None]:
|
|
221
227
|
# First, stream out the predict answer
|
|
222
228
|
first_chunk_yielded = False
|
|
229
|
+
first_reasoning_chunk_yielded = False
|
|
223
230
|
with self.metrics.time("stream_predict_answer"):
|
|
224
231
|
async for answer_chunk in self._stream_predict_answer_text():
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
232
|
+
if isinstance(answer_chunk, TextGenerativeResponse):
|
|
233
|
+
yield AnswerAskResponseItem(text=answer_chunk.text)
|
|
234
|
+
if not first_chunk_yielded:
|
|
235
|
+
self.metrics.record_first_chunk_yielded()
|
|
236
|
+
first_chunk_yielded = True
|
|
237
|
+
elif isinstance(answer_chunk, ReasoningGenerativeResponse):
|
|
238
|
+
yield ReasoningAskResponseItem(text=answer_chunk.text)
|
|
239
|
+
if not first_reasoning_chunk_yielded:
|
|
240
|
+
self.metrics.record_first_reasoning_chunk_yielded()
|
|
241
|
+
first_reasoning_chunk_yielded = True
|
|
242
|
+
else:
|
|
243
|
+
# This is a trick so mypy generates an error if this branch can be reached,
|
|
244
|
+
# that is, if we are missing some ifs
|
|
245
|
+
_a: int = "a"
|
|
229
246
|
|
|
230
247
|
if self._object is not None:
|
|
231
248
|
yield JSONAskResponseItem(object=self._object.object)
|
|
@@ -274,8 +291,10 @@ class AskResult:
|
|
|
274
291
|
audit_answer = json.dumps(self._object.object).encode("utf-8")
|
|
275
292
|
self.auditor.audit(
|
|
276
293
|
text_answer=audit_answer,
|
|
294
|
+
text_reasoning=self._reasoning_text,
|
|
277
295
|
generative_answer_time=self.metrics["stream_predict_answer"],
|
|
278
296
|
generative_answer_first_chunk_time=self.metrics.get_first_chunk_time() or 0,
|
|
297
|
+
generative_reasoning_first_chunk_time=self.metrics.get_first_reasoning_chunk_time(),
|
|
279
298
|
rephrase_time=self.metrics.get("rephrase"),
|
|
280
299
|
status_code=self.status_code,
|
|
281
300
|
)
|
|
@@ -287,6 +306,11 @@ class AskResult:
|
|
|
287
306
|
yield CitationsAskResponseItem(
|
|
288
307
|
citations=self._citations.citations,
|
|
289
308
|
)
|
|
309
|
+
# Stream out the footnote citations mapping
|
|
310
|
+
if self._footnote_citations is not None:
|
|
311
|
+
yield FootnoteCitationsAskResponseItem(
|
|
312
|
+
footnote_to_context=self._footnote_citations.footnote_to_context,
|
|
313
|
+
)
|
|
290
314
|
|
|
291
315
|
# Stream out generic metadata about the answer
|
|
292
316
|
if self._metadata is not None:
|
|
@@ -364,6 +388,10 @@ class AskResult:
|
|
|
364
388
|
if self._citations is not None:
|
|
365
389
|
citations = self._citations.citations
|
|
366
390
|
|
|
391
|
+
footnote_citations = {}
|
|
392
|
+
if self._footnote_citations is not None:
|
|
393
|
+
footnote_citations = self._footnote_citations.footnote_to_context
|
|
394
|
+
|
|
367
395
|
answer_json = None
|
|
368
396
|
if self._object is not None:
|
|
369
397
|
answer_json = self._object.object
|
|
@@ -384,6 +412,7 @@ class AskResult:
|
|
|
384
412
|
|
|
385
413
|
response = SyncAskResponse(
|
|
386
414
|
answer=self._answer_text,
|
|
415
|
+
reasoning=self._reasoning_text,
|
|
387
416
|
answer_json=answer_json,
|
|
388
417
|
status=self.status_code.prettify(),
|
|
389
418
|
relations=self._relations,
|
|
@@ -391,6 +420,7 @@ class AskResult:
|
|
|
391
420
|
retrieval_best_matches=best_matches,
|
|
392
421
|
prequeries=prequeries_results,
|
|
393
422
|
citations=citations,
|
|
423
|
+
citation_footnote_to_context=footnote_citations,
|
|
394
424
|
metadata=metadata,
|
|
395
425
|
consumption=self._consumption,
|
|
396
426
|
learning_id=self.nuclia_learning_id or "",
|
|
@@ -420,7 +450,9 @@ class AskResult:
|
|
|
420
450
|
)
|
|
421
451
|
return self._relations
|
|
422
452
|
|
|
423
|
-
async def _stream_predict_answer_text(
|
|
453
|
+
async def _stream_predict_answer_text(
|
|
454
|
+
self,
|
|
455
|
+
) -> AsyncGenerator[Union[TextGenerativeResponse, ReasoningGenerativeResponse], None]:
|
|
424
456
|
"""
|
|
425
457
|
Reads the stream of the generative model, yielding the answer text but also parsing
|
|
426
458
|
other items like status codes, citations and miscellaneous metadata.
|
|
@@ -435,13 +467,21 @@ class AskResult:
|
|
|
435
467
|
item = generative_chunk.chunk
|
|
436
468
|
if isinstance(item, TextGenerativeResponse):
|
|
437
469
|
self._answer_text += item.text
|
|
438
|
-
yield item
|
|
470
|
+
yield item
|
|
471
|
+
elif isinstance(item, ReasoningGenerativeResponse):
|
|
472
|
+
if self._reasoning_text is None:
|
|
473
|
+
self._reasoning_text = item.text
|
|
474
|
+
else:
|
|
475
|
+
self._reasoning_text += item.text
|
|
476
|
+
yield item
|
|
439
477
|
elif isinstance(item, JSONGenerativeResponse):
|
|
440
478
|
self._object = item
|
|
441
479
|
elif isinstance(item, StatusGenerativeResponse):
|
|
442
480
|
self._status = item
|
|
443
481
|
elif isinstance(item, CitationsGenerativeResponse):
|
|
444
482
|
self._citations = item
|
|
483
|
+
elif isinstance(item, FootnoteCitationsGenerativeResponse):
|
|
484
|
+
self._footnote_citations = item
|
|
445
485
|
elif isinstance(item, MetaGenerativeResponse):
|
|
446
486
|
self._metadata = item
|
|
447
487
|
elif isinstance(item, Consumption):
|
|
@@ -559,11 +599,13 @@ async def ask(
|
|
|
559
599
|
origin=origin,
|
|
560
600
|
generative_answer_time=0,
|
|
561
601
|
generative_answer_first_chunk_time=0,
|
|
602
|
+
generative_reasoning_first_chunk_time=None,
|
|
562
603
|
rephrase_time=metrics.get("rephrase"),
|
|
563
604
|
user_query=user_query,
|
|
564
605
|
rephrased_query=rephrased_query,
|
|
565
606
|
retrieval_rephrase_query=err.main_query.rephrased_query if err.main_query else None,
|
|
566
607
|
text_answer=b"",
|
|
608
|
+
text_reasoning=None,
|
|
567
609
|
status_code=AnswerStatusCode.NO_RETRIEVAL_DATA,
|
|
568
610
|
chat_history=chat_history,
|
|
569
611
|
query_context={},
|
|
@@ -625,6 +667,7 @@ async def ask(
|
|
|
625
667
|
json_schema=ask_request.answer_json_schema,
|
|
626
668
|
rerank_context=False,
|
|
627
669
|
top_k=ask_request.top_k,
|
|
670
|
+
reasoning=ask_request.reasoning,
|
|
628
671
|
)
|
|
629
672
|
|
|
630
673
|
nuclia_learning_id = None
|
|
@@ -1034,7 +1077,6 @@ def calculate_prequeries_for_json_schema(
|
|
|
1034
1077
|
rephrase=ask_request.rephrase,
|
|
1035
1078
|
rephrase_prompt=parse_rephrase_prompt(ask_request),
|
|
1036
1079
|
security=ask_request.security,
|
|
1037
|
-
autofilter=False,
|
|
1038
1080
|
)
|
|
1039
1081
|
prequery = PreQuery(
|
|
1040
1082
|
request=req,
|
|
@@ -26,6 +26,7 @@ from typing import Deque, Dict, List, Optional, Sequence, Tuple, Union, cast
|
|
|
26
26
|
import yaml
|
|
27
27
|
from pydantic import BaseModel
|
|
28
28
|
|
|
29
|
+
from nucliadb.common import datamanagers
|
|
29
30
|
from nucliadb.common.ids import FIELD_TYPE_STR_TO_PB, FieldId, ParagraphId
|
|
30
31
|
from nucliadb.common.maindb.utils import get_driver
|
|
31
32
|
from nucliadb.common.models_utils import from_proto
|
|
@@ -246,7 +247,7 @@ async def default_prompt_context(
|
|
|
246
247
|
- Using an dict prevents from duplicates pulled in through conversation expansion.
|
|
247
248
|
"""
|
|
248
249
|
# Sort retrieved paragraphs by decreasing order (most relevant first)
|
|
249
|
-
async with get_driver().
|
|
250
|
+
async with get_driver().ro_transaction() as txn:
|
|
250
251
|
storage = await get_storage()
|
|
251
252
|
kb = KnowledgeBoxORM(txn, storage, kbid)
|
|
252
253
|
for paragraph in ordered_paragraphs:
|
|
@@ -589,18 +590,7 @@ async def field_extension_prompt_context(
|
|
|
589
590
|
if resource_uuid not in ordered_resources:
|
|
590
591
|
ordered_resources.append(resource_uuid)
|
|
591
592
|
|
|
592
|
-
|
|
593
|
-
extend_fields = strategy.fields
|
|
594
|
-
extend_field_ids = []
|
|
595
|
-
for resource_uuid in ordered_resources:
|
|
596
|
-
for field_id in extend_fields:
|
|
597
|
-
try:
|
|
598
|
-
fid = FieldId.from_string(f"{resource_uuid}/{field_id.strip('/')}")
|
|
599
|
-
extend_field_ids.append(fid)
|
|
600
|
-
except ValueError: # pragma: no cover
|
|
601
|
-
# Invalid field id, skiping
|
|
602
|
-
continue
|
|
603
|
-
|
|
593
|
+
extend_field_ids = await get_matching_field_ids(kbid, ordered_resources, strategy)
|
|
604
594
|
tasks = [hydrate_field_text(kbid, fid) for fid in extend_field_ids]
|
|
605
595
|
field_extracted_texts = await run_concurrently(tasks)
|
|
606
596
|
|
|
@@ -630,6 +620,43 @@ async def field_extension_prompt_context(
|
|
|
630
620
|
context[paragraph.id] = _clean_paragraph_text(paragraph)
|
|
631
621
|
|
|
632
622
|
|
|
623
|
+
async def get_matching_field_ids(
|
|
624
|
+
kbid: str, ordered_resources: list[str], strategy: FieldExtensionStrategy
|
|
625
|
+
) -> list[FieldId]:
|
|
626
|
+
extend_field_ids: list[FieldId] = []
|
|
627
|
+
# Fetch the extracted texts of the specified fields for each resource
|
|
628
|
+
for resource_uuid in ordered_resources:
|
|
629
|
+
for field_id in strategy.fields:
|
|
630
|
+
try:
|
|
631
|
+
fid = FieldId.from_string(f"{resource_uuid}/{field_id.strip('/')}")
|
|
632
|
+
extend_field_ids.append(fid)
|
|
633
|
+
except ValueError: # pragma: no cover
|
|
634
|
+
# Invalid field id, skiping
|
|
635
|
+
continue
|
|
636
|
+
if len(strategy.data_augmentation_field_prefixes) > 0:
|
|
637
|
+
for resource_uuid in ordered_resources:
|
|
638
|
+
all_field_ids = await datamanagers.atomic.resources.get_all_field_ids(
|
|
639
|
+
kbid=kbid, rid=resource_uuid, for_update=False
|
|
640
|
+
)
|
|
641
|
+
if all_field_ids is None:
|
|
642
|
+
continue
|
|
643
|
+
for fieldid in all_field_ids.fields:
|
|
644
|
+
# Generated fields are always text fields starting with "da-"
|
|
645
|
+
if any(
|
|
646
|
+
(
|
|
647
|
+
fieldid.field_type == resources_pb2.FieldType.TEXT
|
|
648
|
+
and fieldid.field.startswith(f"da-{prefix}-")
|
|
649
|
+
)
|
|
650
|
+
for prefix in strategy.data_augmentation_field_prefixes
|
|
651
|
+
):
|
|
652
|
+
extend_field_ids.append(
|
|
653
|
+
FieldId.from_pb(
|
|
654
|
+
rid=resource_uuid, field_type=fieldid.field_type, key=fieldid.field
|
|
655
|
+
)
|
|
656
|
+
)
|
|
657
|
+
return extend_field_ids
|
|
658
|
+
|
|
659
|
+
|
|
633
660
|
async def get_orm_field(kbid: str, field_id: FieldId) -> Optional[Field]:
|
|
634
661
|
resource = await cache.get_resource(kbid, field_id.rid)
|
|
635
662
|
if resource is None: # pragma: no cover
|
|
@@ -779,7 +806,7 @@ async def conversation_prompt_context(
|
|
|
779
806
|
):
|
|
780
807
|
analyzed_fields: List[str] = []
|
|
781
808
|
ops = 0
|
|
782
|
-
async with get_driver().
|
|
809
|
+
async with get_driver().ro_transaction() as txn:
|
|
783
810
|
storage = await get_storage()
|
|
784
811
|
kb = KnowledgeBoxORM(txn, storage, kbid)
|
|
785
812
|
for paragraph in ordered_paragraphs:
|
|
@@ -946,9 +973,14 @@ async def hierarchy_prompt_context(
|
|
|
946
973
|
paragraph_id = ParagraphId.from_string(paragraph.id)
|
|
947
974
|
extended_paragraph_text = paragraph.text
|
|
948
975
|
if paragraphs_extra_characters > 0:
|
|
976
|
+
extended_paragraph_id = ParagraphId(
|
|
977
|
+
field_id=paragraph_id.field_id,
|
|
978
|
+
paragraph_start=paragraph_id.paragraph_start,
|
|
979
|
+
paragraph_end=paragraph_id.paragraph_end + paragraphs_extra_characters,
|
|
980
|
+
)
|
|
949
981
|
extended_paragraph_text = await get_paragraph_text(
|
|
950
982
|
kbid=kbid,
|
|
951
|
-
paragraph_id=
|
|
983
|
+
paragraph_id=extended_paragraph_id,
|
|
952
984
|
log_on_missing_field=True,
|
|
953
985
|
)
|
|
954
986
|
rid = paragraph_id.rid
|
|
@@ -200,7 +200,6 @@ def find_request_from_ask_request(item: AskRequest, query: str) -> FindRequest:
|
|
|
200
200
|
find_request.range_modification_end = item.range_modification_end
|
|
201
201
|
find_request.show = item.show
|
|
202
202
|
find_request.extracted = item.extracted
|
|
203
|
-
find_request.autofilter = item.autofilter
|
|
204
203
|
find_request.highlight = item.highlight
|
|
205
204
|
find_request.security = item.security
|
|
206
205
|
find_request.debug = item.debug
|
|
@@ -308,11 +307,13 @@ def maybe_audit_chat(
|
|
|
308
307
|
origin: str,
|
|
309
308
|
generative_answer_time: float,
|
|
310
309
|
generative_answer_first_chunk_time: float,
|
|
310
|
+
generative_reasoning_first_chunk_time: Optional[float],
|
|
311
311
|
rephrase_time: Optional[float],
|
|
312
312
|
user_query: str,
|
|
313
313
|
rephrased_query: Optional[str],
|
|
314
314
|
retrieval_rephrase_query: Optional[str],
|
|
315
315
|
text_answer: bytes,
|
|
316
|
+
text_reasoning: Optional[str],
|
|
316
317
|
status_code: AnswerStatusCode,
|
|
317
318
|
chat_history: list[ChatContextMessage],
|
|
318
319
|
query_context: PromptContext,
|
|
@@ -344,12 +345,14 @@ def maybe_audit_chat(
|
|
|
344
345
|
question=user_query,
|
|
345
346
|
generative_answer_time=generative_answer_time,
|
|
346
347
|
generative_answer_first_chunk_time=generative_answer_first_chunk_time,
|
|
348
|
+
generative_reasoning_first_chunk_time=generative_reasoning_first_chunk_time,
|
|
347
349
|
rephrase_time=rephrase_time,
|
|
348
350
|
rephrased_question=rephrased_query,
|
|
349
351
|
retrieval_rephrased_question=retrieval_rephrase_query,
|
|
350
352
|
chat_context=chat_history_context,
|
|
351
353
|
retrieved_context=chat_retrieved_context,
|
|
352
354
|
answer=audit_answer,
|
|
355
|
+
reasoning=text_reasoning,
|
|
353
356
|
learning_id=learning_id,
|
|
354
357
|
status_code=int(status_code.value),
|
|
355
358
|
model=model,
|
|
@@ -401,8 +404,10 @@ class ChatAuditor:
|
|
|
401
404
|
def audit(
|
|
402
405
|
self,
|
|
403
406
|
text_answer: bytes,
|
|
407
|
+
text_reasoning: Optional[str],
|
|
404
408
|
generative_answer_time: float,
|
|
405
409
|
generative_answer_first_chunk_time: float,
|
|
410
|
+
generative_reasoning_first_chunk_time: Optional[float],
|
|
406
411
|
rephrase_time: Optional[float],
|
|
407
412
|
status_code: AnswerStatusCode,
|
|
408
413
|
):
|
|
@@ -416,8 +421,10 @@ class ChatAuditor:
|
|
|
416
421
|
retrieval_rephrase_query=self.retrieval_rephrased_query,
|
|
417
422
|
generative_answer_time=generative_answer_time,
|
|
418
423
|
generative_answer_first_chunk_time=generative_answer_first_chunk_time,
|
|
424
|
+
generative_reasoning_first_chunk_time=generative_reasoning_first_chunk_time,
|
|
419
425
|
rephrase_time=rephrase_time,
|
|
420
426
|
text_answer=text_answer,
|
|
427
|
+
text_reasoning=text_reasoning,
|
|
421
428
|
status_code=status_code,
|
|
422
429
|
chat_history=self.chat_history,
|
|
423
430
|
query_context=self.query_context,
|
nucliadb/search/search/fetch.py
CHANGED
nucliadb/search/search/find.py
CHANGED
|
@@ -100,7 +100,6 @@ async def _index_node_retrieval(
|
|
|
100
100
|
(
|
|
101
101
|
pb_query,
|
|
102
102
|
incomplete_results,
|
|
103
|
-
autofilters,
|
|
104
103
|
rephrased_query,
|
|
105
104
|
) = await legacy_convert_retrieval_to_proto(parsed)
|
|
106
105
|
|
|
@@ -137,7 +136,6 @@ async def _index_node_retrieval(
|
|
|
137
136
|
)
|
|
138
137
|
|
|
139
138
|
search_results.shards = queried_shards
|
|
140
|
-
search_results.autofilters = autofilters
|
|
141
139
|
|
|
142
140
|
ndb_time = metrics["index_search"] + metrics["results_merge"]
|
|
143
141
|
if metrics["index_search"] > settings.slow_node_query_log_threshold:
|
|
@@ -180,9 +178,7 @@ async def _external_index_retrieval(
|
|
|
180
178
|
parsed = await parse_find(kbid, item)
|
|
181
179
|
assert parsed.retrieval.reranker is not None, "find parser must provide a reranking algorithm"
|
|
182
180
|
reranker = get_reranker(parsed.retrieval.reranker)
|
|
183
|
-
search_request, incomplete_results,
|
|
184
|
-
parsed
|
|
185
|
-
)
|
|
181
|
+
search_request, incomplete_results, rephrased_query = await legacy_convert_retrieval_to_proto(parsed)
|
|
186
182
|
|
|
187
183
|
# Query index
|
|
188
184
|
query_results = await external_index_manager.query(search_request) # noqa
|
|
@@ -220,7 +216,6 @@ async def _external_index_retrieval(
|
|
|
220
216
|
page_number=0,
|
|
221
217
|
page_size=item.top_k,
|
|
222
218
|
relations=None, # Not implemented for external indexes yet
|
|
223
|
-
autofilters=[], # Not implemented for external indexes yet
|
|
224
219
|
min_score=results_min_score,
|
|
225
220
|
best_matches=best_matches,
|
|
226
221
|
# These are not used for external indexes
|
|
@@ -28,7 +28,8 @@ from nucliadb.common.external_index_providers.base import TextBlockMatch
|
|
|
28
28
|
from nucliadb.common.ids import FieldId
|
|
29
29
|
from nucliadb.common.maindb.utils import get_driver
|
|
30
30
|
from nucliadb.ingest.serialize import managed_serialize
|
|
31
|
-
from nucliadb.search.search import cache
|
|
31
|
+
from nucliadb.search.search import cache
|
|
32
|
+
from nucliadb.search.search.paragraphs import get_paragraph_text
|
|
32
33
|
from nucliadb_models.common import FieldTypeName
|
|
33
34
|
from nucliadb_models.resource import ExtractedDataTypeName, Resource
|
|
34
35
|
from nucliadb_models.search import (
|
|
@@ -79,7 +80,7 @@ async def hydrate_resource_text(
|
|
|
79
80
|
return []
|
|
80
81
|
|
|
81
82
|
# Schedule the extraction of the text of each field in the resource
|
|
82
|
-
async with get_driver().
|
|
83
|
+
async with get_driver().ro_transaction() as txn:
|
|
83
84
|
resource.txn = txn
|
|
84
85
|
runner = ConcurrentRunner(max_tasks=max_concurrent_tasks)
|
|
85
86
|
for field_type, field_key in await resource.get_fields(force=True):
|
|
@@ -120,7 +121,7 @@ async def hydrate_resource_metadata(
|
|
|
120
121
|
if concurrency_control is not None:
|
|
121
122
|
await stack.enter_async_context(concurrency_control)
|
|
122
123
|
|
|
123
|
-
async with get_driver().
|
|
124
|
+
async with get_driver().ro_transaction() as ro_txn:
|
|
124
125
|
serialized_resource = await managed_serialize(
|
|
125
126
|
txn=ro_txn,
|
|
126
127
|
kbid=kbid,
|
|
@@ -170,7 +171,7 @@ async def hydrate_text_block(
|
|
|
170
171
|
if concurrency_control is not None:
|
|
171
172
|
await stack.enter_async_context(concurrency_control)
|
|
172
173
|
|
|
173
|
-
text_block.text = await
|
|
174
|
+
text_block.text = await get_paragraph_text(
|
|
174
175
|
kbid=kbid,
|
|
175
176
|
paragraph_id=text_block.paragraph_id,
|
|
176
177
|
highlight=options.highlight,
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
|
2
|
+
#
|
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
|
5
|
+
#
|
|
6
|
+
# AGPL:
|
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
|
10
|
+
# License, or (at your option) any later version.
|
|
11
|
+
#
|
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15
|
+
# GNU Affero General Public License for more details.
|
|
16
|
+
#
|
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
|
+
#
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
from nucliadb.common.ids import FIELD_TYPE_STR_TO_NAME, FieldId
|
|
23
|
+
from nucliadb.common.models_utils import from_proto
|
|
24
|
+
from nucliadb.ingest.orm.resource import Resource
|
|
25
|
+
from nucliadb.search.search.hydrator import hydrate_field_text
|
|
26
|
+
from nucliadb_models import hydration as hydration_models
|
|
27
|
+
from nucliadb_models.common import FieldTypeName
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def page_preview_id(page_number: int) -> str:
|
|
31
|
+
"""Return the string page number for an specific page"""
|
|
32
|
+
return f"{page_number}"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
async def hydrate_field(resource: Resource, field_id: FieldId, config: hydration_models.FieldHydration):
|
|
36
|
+
field_type = FIELD_TYPE_STR_TO_NAME[field_id.type]
|
|
37
|
+
|
|
38
|
+
if field_type == FieldTypeName.TEXT:
|
|
39
|
+
if not config.text is not None:
|
|
40
|
+
return
|
|
41
|
+
return await hydrate_text_field(resource, field_id, config.text)
|
|
42
|
+
|
|
43
|
+
elif field_type == FieldTypeName.FILE is not None:
|
|
44
|
+
if not config.file:
|
|
45
|
+
return
|
|
46
|
+
return await hydrate_file_field(resource, field_id, config.file)
|
|
47
|
+
|
|
48
|
+
elif field_type == FieldTypeName.LINK is not None:
|
|
49
|
+
if not config.link:
|
|
50
|
+
return
|
|
51
|
+
return await hydrate_link_field(resource, field_id, config.link)
|
|
52
|
+
|
|
53
|
+
elif field_type == FieldTypeName.CONVERSATION is not None:
|
|
54
|
+
if not config.conversation:
|
|
55
|
+
return
|
|
56
|
+
return await hydrate_conversation_field(resource, field_id, config.conversation)
|
|
57
|
+
|
|
58
|
+
elif field_type == FieldTypeName.GENERIC is not None:
|
|
59
|
+
if not config.generic:
|
|
60
|
+
return
|
|
61
|
+
return await hydrate_generic_field(resource, field_id, config.generic)
|
|
62
|
+
|
|
63
|
+
else: # pragma: no cover
|
|
64
|
+
# This is a trick so mypy generates an error if this branch can be reached,
|
|
65
|
+
# that is, if we are missing some ifs
|
|
66
|
+
_a: int = "a"
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
async def hydrate_text_field(
|
|
70
|
+
resource: Resource,
|
|
71
|
+
field_id: FieldId,
|
|
72
|
+
config: hydration_models.TextFieldHydration,
|
|
73
|
+
) -> hydration_models.HydratedTextField:
|
|
74
|
+
hydrated = hydration_models.HydratedTextField(
|
|
75
|
+
id=field_id.full(),
|
|
76
|
+
resource=field_id.rid,
|
|
77
|
+
field_type=FieldTypeName.TEXT,
|
|
78
|
+
)
|
|
79
|
+
|
|
80
|
+
if config.extracted_text:
|
|
81
|
+
field_text = await hydrate_field_text(resource.kb.kbid, field_id)
|
|
82
|
+
if field_text is not None:
|
|
83
|
+
(_, text) = field_text
|
|
84
|
+
hydrated.extracted = hydration_models.FieldExtractedData(text=text)
|
|
85
|
+
|
|
86
|
+
return hydrated
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
async def hydrate_file_field(
|
|
90
|
+
resource: Resource,
|
|
91
|
+
field_id: FieldId,
|
|
92
|
+
config: hydration_models.FileFieldHydration,
|
|
93
|
+
) -> hydration_models.HydratedFileField:
|
|
94
|
+
hydrated = hydration_models.HydratedFileField(
|
|
95
|
+
id=field_id.full(),
|
|
96
|
+
resource=field_id.rid,
|
|
97
|
+
field_type=FieldTypeName.FILE,
|
|
98
|
+
)
|
|
99
|
+
|
|
100
|
+
if config.value:
|
|
101
|
+
field = await resource.get_field(field_id.key, field_id.pb_type)
|
|
102
|
+
value = await field.get_value()
|
|
103
|
+
hydrated.value = from_proto.field_file(value)
|
|
104
|
+
|
|
105
|
+
if config.extracted_text:
|
|
106
|
+
field_text = await hydrate_field_text(resource.kb.kbid, field_id)
|
|
107
|
+
if field_text is not None:
|
|
108
|
+
(_, text) = field_text
|
|
109
|
+
hydrated.extracted = hydration_models.FieldExtractedData(text=text)
|
|
110
|
+
|
|
111
|
+
return hydrated
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
async def hydrate_link_field(
|
|
115
|
+
resource: Resource,
|
|
116
|
+
field_id: FieldId,
|
|
117
|
+
config: hydration_models.LinkFieldHydration,
|
|
118
|
+
) -> hydration_models.HydratedLinkField:
|
|
119
|
+
hydrated = hydration_models.HydratedLinkField(
|
|
120
|
+
id=field_id.full(),
|
|
121
|
+
resource=field_id.rid,
|
|
122
|
+
field_type=FieldTypeName.LINK,
|
|
123
|
+
)
|
|
124
|
+
|
|
125
|
+
if config.value:
|
|
126
|
+
field = await resource.get_field(field_id.key, field_id.pb_type)
|
|
127
|
+
value = await field.get_value()
|
|
128
|
+
hydrated.value = from_proto.field_link(value)
|
|
129
|
+
|
|
130
|
+
if config.extracted_text:
|
|
131
|
+
field_text = await hydrate_field_text(resource.kb.kbid, field_id)
|
|
132
|
+
if field_text is not None:
|
|
133
|
+
(_, text) = field_text
|
|
134
|
+
hydrated.extracted = hydration_models.FieldExtractedData(text=text)
|
|
135
|
+
|
|
136
|
+
return hydrated
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
async def hydrate_conversation_field(
|
|
140
|
+
resource: Resource,
|
|
141
|
+
field_id: FieldId,
|
|
142
|
+
config: hydration_models.ConversationFieldHydration,
|
|
143
|
+
) -> hydration_models.HydratedConversationField:
|
|
144
|
+
hydrated = hydration_models.HydratedConversationField(
|
|
145
|
+
id=field_id.full(),
|
|
146
|
+
resource=field_id.rid,
|
|
147
|
+
field_type=FieldTypeName.CONVERSATION,
|
|
148
|
+
)
|
|
149
|
+
# TODO: implement conversation fields
|
|
150
|
+
return hydrated
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
async def hydrate_generic_field(
|
|
154
|
+
resource: Resource,
|
|
155
|
+
field_id: FieldId,
|
|
156
|
+
config: hydration_models.GenericFieldHydration,
|
|
157
|
+
) -> hydration_models.HydratedGenericField:
|
|
158
|
+
hydrated = hydration_models.HydratedGenericField(
|
|
159
|
+
id=field_id.full(),
|
|
160
|
+
resource=field_id.rid,
|
|
161
|
+
field_type=FieldTypeName.GENERIC,
|
|
162
|
+
)
|
|
163
|
+
|
|
164
|
+
if config.value:
|
|
165
|
+
field = await resource.get_field(field_id.key, field_id.pb_type)
|
|
166
|
+
value = await field.get_value()
|
|
167
|
+
hydrated.value = value
|
|
168
|
+
|
|
169
|
+
if config.extracted_text:
|
|
170
|
+
field_text = await hydrate_field_text(resource.kb.kbid, field_id)
|
|
171
|
+
if field_text is not None:
|
|
172
|
+
(_, text) = field_text
|
|
173
|
+
hydrated.extracted = hydration_models.FieldExtractedData(text=text)
|
|
174
|
+
|
|
175
|
+
return hydrated
|