nucliadb 6.6.1.post4558__py3-none-any.whl → 6.6.1.post4569__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nucliadb/search/search/chat/prompt.py +37 -21
- {nucliadb-6.6.1.post4558.dist-info → nucliadb-6.6.1.post4569.dist-info}/METADATA +6 -6
- {nucliadb-6.6.1.post4558.dist-info → nucliadb-6.6.1.post4569.dist-info}/RECORD +6 -6
- {nucliadb-6.6.1.post4558.dist-info → nucliadb-6.6.1.post4569.dist-info}/WHEEL +0 -0
- {nucliadb-6.6.1.post4558.dist-info → nucliadb-6.6.1.post4569.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.6.1.post4558.dist-info → nucliadb-6.6.1.post4569.dist-info}/top_level.txt +0 -0
@@ -107,6 +107,9 @@ class CappedPromptContext:
|
|
107
107
|
def __getitem__(self, key: str) -> str:
|
108
108
|
return self.output.__getitem__(key)
|
109
109
|
|
110
|
+
def __contains__(self, key: str) -> bool:
|
111
|
+
return key in self.output
|
112
|
+
|
110
113
|
def __delitem__(self, key: str) -> None:
|
111
114
|
try:
|
112
115
|
self.output.__delitem__(key)
|
@@ -395,7 +398,10 @@ def parse_text_block_id(text_block_id: str) -> TextBlockId:
|
|
395
398
|
|
396
399
|
|
397
400
|
async def extend_prompt_context_with_origin_metadata(
|
398
|
-
context
|
401
|
+
context: CappedPromptContext,
|
402
|
+
kbid,
|
403
|
+
text_block_ids: list[TextBlockId],
|
404
|
+
augmented_context: AugmentedContext,
|
399
405
|
):
|
400
406
|
async def _get_origin(kbid: str, rid: str) -> tuple[str, Optional[Origin]]:
|
401
407
|
origin = None
|
@@ -411,7 +417,7 @@ async def extend_prompt_context_with_origin_metadata(
|
|
411
417
|
rid_to_origin = {rid: origin for rid, origin in origins if origin is not None}
|
412
418
|
for tb_id in text_block_ids:
|
413
419
|
origin = rid_to_origin.get(tb_id.rid)
|
414
|
-
if origin is not None and tb_id.full() in context
|
420
|
+
if origin is not None and tb_id.full() in context:
|
415
421
|
text = context.output.pop(tb_id.full())
|
416
422
|
extended_text = text + f"\n\nDOCUMENT METADATA AT ORIGIN:\n{to_yaml(origin)}"
|
417
423
|
context[tb_id.full()] = extended_text
|
@@ -424,7 +430,10 @@ async def extend_prompt_context_with_origin_metadata(
|
|
424
430
|
|
425
431
|
|
426
432
|
async def extend_prompt_context_with_classification_labels(
|
427
|
-
context
|
433
|
+
context: CappedPromptContext,
|
434
|
+
kbid: str,
|
435
|
+
text_block_ids: list[TextBlockId],
|
436
|
+
augmented_context: AugmentedContext,
|
428
437
|
):
|
429
438
|
async def _get_labels(kbid: str, _id: TextBlockId) -> tuple[TextBlockId, list[tuple[str, str]]]:
|
430
439
|
fid = _id if isinstance(_id, FieldId) else _id.field_id
|
@@ -449,7 +458,7 @@ async def extend_prompt_context_with_classification_labels(
|
|
449
458
|
tb_id_to_labels = {tb_id: labels for tb_id, labels in classif_labels if len(labels) > 0}
|
450
459
|
for tb_id in text_block_ids:
|
451
460
|
labels = tb_id_to_labels.get(tb_id)
|
452
|
-
if labels is not None and tb_id.full() in context
|
461
|
+
if labels is not None and tb_id.full() in context:
|
453
462
|
text = context.output.pop(tb_id.full())
|
454
463
|
|
455
464
|
labels_text = "DOCUMENT CLASSIFICATION LABELS:"
|
@@ -467,7 +476,10 @@ async def extend_prompt_context_with_classification_labels(
|
|
467
476
|
|
468
477
|
|
469
478
|
async def extend_prompt_context_with_ner(
|
470
|
-
context
|
479
|
+
context: CappedPromptContext,
|
480
|
+
kbid: str,
|
481
|
+
text_block_ids: list[TextBlockId],
|
482
|
+
augmented_context: AugmentedContext,
|
471
483
|
):
|
472
484
|
async def _get_ners(kbid: str, _id: TextBlockId) -> tuple[TextBlockId, dict[str, set[str]]]:
|
473
485
|
fid = _id if isinstance(_id, FieldId) else _id.field_id
|
@@ -494,7 +506,7 @@ async def extend_prompt_context_with_ner(
|
|
494
506
|
tb_id_to_ners = {tb_id: ners for tb_id, ners in nerss if len(ners) > 0}
|
495
507
|
for tb_id in text_block_ids:
|
496
508
|
ners = tb_id_to_ners.get(tb_id)
|
497
|
-
if ners is not None and tb_id.full() in context
|
509
|
+
if ners is not None and tb_id.full() in context:
|
498
510
|
text = context.output.pop(tb_id.full())
|
499
511
|
|
500
512
|
ners_text = "DOCUMENT NAMED ENTITIES (NERs):"
|
@@ -515,7 +527,10 @@ async def extend_prompt_context_with_ner(
|
|
515
527
|
|
516
528
|
|
517
529
|
async def extend_prompt_context_with_extra_metadata(
|
518
|
-
context
|
530
|
+
context: CappedPromptContext,
|
531
|
+
kbid: str,
|
532
|
+
text_block_ids: list[TextBlockId],
|
533
|
+
augmented_context: AugmentedContext,
|
519
534
|
):
|
520
535
|
async def _get_extra(kbid: str, rid: str) -> tuple[str, Optional[Extra]]:
|
521
536
|
extra = None
|
@@ -531,7 +546,7 @@ async def extend_prompt_context_with_extra_metadata(
|
|
531
546
|
rid_to_extra = {rid: extra for rid, extra in extras if extra is not None}
|
532
547
|
for tb_id in text_block_ids:
|
533
548
|
extra = rid_to_extra.get(tb_id.rid)
|
534
|
-
if extra is not None and tb_id.full() in context
|
549
|
+
if extra is not None and tb_id.full() in context:
|
535
550
|
text = context.output.pop(tb_id.full())
|
536
551
|
extended_text = text + f"\n\nDOCUMENT EXTRA METADATA:\n{to_yaml(extra)}"
|
537
552
|
context[tb_id.full()] = extended_text
|
@@ -600,7 +615,7 @@ async def field_extension_prompt_context(
|
|
600
615
|
if tb_id.startswith(field.full()):
|
601
616
|
del context[tb_id]
|
602
617
|
# Add the extracted text of each field to the beginning of the context.
|
603
|
-
if field.full() not in context
|
618
|
+
if field.full() not in context:
|
604
619
|
context[field.full()] = extracted_text
|
605
620
|
augmented_context.fields[field.full()] = AugmentedTextBlock(
|
606
621
|
id=field.full(),
|
@@ -610,7 +625,7 @@ async def field_extension_prompt_context(
|
|
610
625
|
|
611
626
|
# Add the extracted text of each paragraph to the end of the context.
|
612
627
|
for paragraph in ordered_paragraphs:
|
613
|
-
if paragraph.id not in context
|
628
|
+
if paragraph.id not in context:
|
614
629
|
context[paragraph.id] = _clean_paragraph_text(paragraph)
|
615
630
|
|
616
631
|
|
@@ -668,7 +683,7 @@ async def neighbouring_paragraphs_prompt_context(
|
|
668
683
|
if field_extracted_text is None:
|
669
684
|
continue
|
670
685
|
ptext = _get_paragraph_text(field_extracted_text, pid)
|
671
|
-
if ptext:
|
686
|
+
if ptext and pid.full() not in context:
|
672
687
|
context[pid.full()] = ptext
|
673
688
|
|
674
689
|
# Now add the neighbouring paragraphs
|
@@ -702,8 +717,8 @@ async def neighbouring_paragraphs_prompt_context(
|
|
702
717
|
npid = field_pids[neighbour_index]
|
703
718
|
except IndexError:
|
704
719
|
continue
|
705
|
-
if npid in retrieved_paragraphs_ids or npid.full() in context
|
706
|
-
# Already added
|
720
|
+
if npid in retrieved_paragraphs_ids or npid.full() in context:
|
721
|
+
# Already added
|
707
722
|
continue
|
708
723
|
ptext = _get_paragraph_text(field_extracted_text, npid)
|
709
724
|
if not ptext:
|
@@ -742,7 +757,8 @@ async def conversation_prompt_context(
|
|
742
757
|
storage = await get_storage()
|
743
758
|
kb = KnowledgeBoxORM(txn, storage, kbid)
|
744
759
|
for paragraph in ordered_paragraphs:
|
745
|
-
|
760
|
+
if paragraph.id not in context:
|
761
|
+
context[paragraph.id] = _clean_paragraph_text(paragraph)
|
746
762
|
|
747
763
|
# If the paragraph is a conversation and it matches semantically, we assume we
|
748
764
|
# have matched with the question, therefore try to include the answer to the
|
@@ -780,7 +796,7 @@ async def conversation_prompt_context(
|
|
780
796
|
text = message.content.text.strip()
|
781
797
|
pid = f"{rid}/{field_type}/{field_id}/{ident}/0-{len(text) + 1}"
|
782
798
|
attachments.extend(message.content.attachments_fields)
|
783
|
-
if pid in context
|
799
|
+
if pid in context:
|
784
800
|
continue
|
785
801
|
context[pid] = text
|
786
802
|
augmented_context.paragraphs[pid] = AugmentedTextBlock(
|
@@ -802,7 +818,7 @@ async def conversation_prompt_context(
|
|
802
818
|
text = message.content.text.strip()
|
803
819
|
attachments.extend(message.content.attachments_fields)
|
804
820
|
pid = f"{rid}/{field_type}/{field_id}/{ident}/0-{len(text) + 1}"
|
805
|
-
if pid in context
|
821
|
+
if pid in context:
|
806
822
|
continue
|
807
823
|
context[pid] = text
|
808
824
|
augmented_context.paragraphs[pid] = AugmentedTextBlock(
|
@@ -834,7 +850,7 @@ async def conversation_prompt_context(
|
|
834
850
|
text = message.content.text.strip()
|
835
851
|
attachments.extend(message.content.attachments_fields)
|
836
852
|
pid = f"{rid}/{field_type}/{field_id}/{message.ident}/0-{len(message.content.text) + 1}"
|
837
|
-
if pid in context
|
853
|
+
if pid in context:
|
838
854
|
continue
|
839
855
|
context[pid] = text
|
840
856
|
augmented_context.paragraphs[pid] = AugmentedTextBlock(
|
@@ -854,7 +870,7 @@ async def conversation_prompt_context(
|
|
854
870
|
extracted_text = await field.get_extracted_text()
|
855
871
|
if extracted_text is not None:
|
856
872
|
pid = f"{rid}/{field_type}/{attachment.field_id}/0-{len(extracted_text.text) + 1}"
|
857
|
-
if pid in context
|
873
|
+
if pid in context:
|
858
874
|
continue
|
859
875
|
text = f"Attachment {attachment.field_id}: {extracted_text.text}\n\n"
|
860
876
|
context[pid] = text
|
@@ -977,9 +993,9 @@ async def hierarchy_prompt_context(
|
|
977
993
|
paragraph_text = _clean_paragraph_text(paragraph)
|
978
994
|
context[paragraph.id] = paragraph_text
|
979
995
|
if paragraph.id in augmented_paragraphs:
|
980
|
-
|
981
|
-
augmented_context.
|
982
|
-
id=
|
996
|
+
pid = ParagraphId.from_string(paragraph.id)
|
997
|
+
augmented_context.paragraphs[pid.full()] = AugmentedTextBlock(
|
998
|
+
id=pid.full(), text=paragraph_text, augmentation_type=TextBlockAugmentationType.HIERARCHY
|
983
999
|
)
|
984
1000
|
return
|
985
1001
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: nucliadb
|
3
|
-
Version: 6.6.1.post4558
|
3
|
+
Version: 6.6.1.post4569
|
4
4
|
Summary: NucliaDB
|
5
5
|
Author-email: Nuclia <nucliadb@nuclia.com>
|
6
6
|
License-Expression: AGPL-3.0-or-later
|
@@ -19,11 +19,11 @@ Classifier: Programming Language :: Python :: 3.12
|
|
19
19
|
Classifier: Programming Language :: Python :: 3 :: Only
|
20
20
|
Requires-Python: <4,>=3.9
|
21
21
|
Description-Content-Type: text/markdown
|
22
|
-
Requires-Dist: nucliadb-telemetry[all]>=6.6.1.post4558
|
23
|
-
Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.6.1.post4558
|
24
|
-
Requires-Dist: nucliadb-protos>=6.6.1.post4558
|
25
|
-
Requires-Dist: nucliadb-models>=6.6.1.post4558
|
26
|
-
Requires-Dist: nidx-protos>=6.6.1.post4558
|
22
|
+
Requires-Dist: nucliadb-telemetry[all]>=6.6.1.post4569
|
23
|
+
Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.6.1.post4569
|
24
|
+
Requires-Dist: nucliadb-protos>=6.6.1.post4569
|
25
|
+
Requires-Dist: nucliadb-models>=6.6.1.post4569
|
26
|
+
Requires-Dist: nidx-protos>=6.6.1.post4569
|
27
27
|
Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
|
28
28
|
Requires-Dist: nuclia-models>=0.24.2
|
29
29
|
Requires-Dist: uvicorn[standard]
|
@@ -266,7 +266,7 @@ nucliadb/search/search/chat/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn
|
|
266
266
|
nucliadb/search/search/chat/ask.py,sha256=0sgfiCbNaCZrTvYaRGtf5xL6VnzRgzofINiEP4IvhWs,38278
|
267
267
|
nucliadb/search/search/chat/exceptions.py,sha256=Siy4GXW2L7oPhIR86H3WHBhE9lkV4A4YaAszuGGUf54,1356
|
268
268
|
nucliadb/search/search/chat/images.py,sha256=PA8VWxT5_HUGfW1ULhKTK46UBsVyINtWWqEM1ulzX1E,3095
|
269
|
-
nucliadb/search/search/chat/prompt.py,sha256=
|
269
|
+
nucliadb/search/search/chat/prompt.py,sha256=QwHULUDqe_pS2HZvQH1vzqpYEHQG_-UagXCNtLLtJEI,52997
|
270
270
|
nucliadb/search/search/chat/query.py,sha256=3jMPNbiFEOoS0ydMOPYkSx1qVlvAv51npzadWXDwkMs,16650
|
271
271
|
nucliadb/search/search/query_parser/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
|
272
272
|
nucliadb/search/search/query_parser/exceptions.py,sha256=sVl9gRNzhE-s480LBBVkiXzNRbKhYRQN5F3it5tNNp8,939
|
@@ -375,8 +375,8 @@ nucliadb/writer/tus/local.py,sha256=7jYa_w9b-N90jWgN2sQKkNcomqn6JMVBOVeDOVYJHto,
|
|
375
375
|
nucliadb/writer/tus/s3.py,sha256=vF0NkFTXiXhXq3bCVXXVV-ED38ECVoUeeYViP8uMqcU,8357
|
376
376
|
nucliadb/writer/tus/storage.py,sha256=ToqwjoYnjI4oIcwzkhha_MPxi-k4Jk3Lt55zRwaC1SM,2903
|
377
377
|
nucliadb/writer/tus/utils.py,sha256=MSdVbRsRSZVdkaum69_0wku7X3p5wlZf4nr6E0GMKbw,2556
|
378
|
-
nucliadb-6.6.1.
|
379
|
-
nucliadb-6.6.1.
|
380
|
-
nucliadb-6.6.1.
|
381
|
-
nucliadb-6.6.1.
|
382
|
-
nucliadb-6.6.1.
|
378
|
+
nucliadb-6.6.1.post4569.dist-info/METADATA,sha256=T15E0MzfZzXM-4jw758DQyl620svAuHBBov-_d8ucMI,4158
|
379
|
+
nucliadb-6.6.1.post4569.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
380
|
+
nucliadb-6.6.1.post4569.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
|
381
|
+
nucliadb-6.6.1.post4569.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
|
382
|
+
nucliadb-6.6.1.post4569.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|