nucliadb 6.7.2.post4862__py3-none-any.whl → 6.9.2.post5282__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of nucliadb has been flagged as possibly problematic in its registry.
- migrations/0016_upgrade_to_paragraphs_v2.py +1 -1
- migrations/0017_multiple_writable_shards.py +1 -1
- migrations/0018_purge_orphan_kbslugs.py +1 -1
- migrations/0019_upgrade_to_paragraphs_v3.py +1 -1
- migrations/0021_overwrite_vectorsets_key.py +1 -1
- migrations/0023_backfill_pg_catalog.py +7 -3
- migrations/0025_assign_models_to_kbs_v2.py +3 -3
- migrations/0027_rollover_texts3.py +1 -1
- migrations/0028_extracted_vectors_reference.py +1 -1
- migrations/0029_backfill_field_status.py +1 -1
- migrations/0032_remove_old_relations.py +1 -1
- migrations/0036_backfill_catalog_slug.py +1 -1
- migrations/0037_backfill_catalog_facets.py +1 -1
- migrations/0038_backfill_catalog_field_labels.py +7 -3
- migrations/0039_backfill_converation_splits_metadata.py +106 -0
- migrations/0040_migrate_search_configurations.py +79 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/backups/create.py +3 -3
- nucliadb/backups/restore.py +3 -3
- nucliadb/common/cache.py +1 -1
- nucliadb/common/catalog/__init__.py +79 -0
- nucliadb/common/catalog/dummy.py +36 -0
- nucliadb/common/catalog/interface.py +85 -0
- nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +294 -208
- nucliadb/common/catalog/utils.py +56 -0
- nucliadb/common/cluster/manager.py +3 -19
- nucliadb/common/cluster/rebalance.py +484 -110
- nucliadb/common/cluster/rollover.py +29 -0
- nucliadb/common/cluster/settings.py +1 -1
- nucliadb/common/cluster/utils.py +26 -0
- nucliadb/common/datamanagers/atomic.py +6 -0
- nucliadb/common/datamanagers/utils.py +2 -2
- nucliadb/common/external_index_providers/manager.py +1 -29
- nucliadb/common/external_index_providers/settings.py +1 -27
- nucliadb/common/filter_expression.py +16 -33
- nucliadb/common/http_clients/exceptions.py +8 -0
- nucliadb/common/http_clients/processing.py +4 -0
- nucliadb/common/http_clients/utils.py +3 -0
- nucliadb/common/ids.py +77 -55
- nucliadb/common/locking.py +4 -4
- nucliadb/common/maindb/driver.py +11 -1
- nucliadb/common/maindb/local.py +1 -1
- nucliadb/common/maindb/pg.py +1 -1
- nucliadb/common/nidx.py +19 -1
- nucliadb/common/vector_index_config.py +1 -1
- nucliadb/export_import/datamanager.py +3 -3
- nucliadb/ingest/consumer/pull.py +7 -0
- nucliadb/ingest/consumer/service.py +2 -27
- nucliadb/ingest/consumer/shard_creator.py +17 -6
- nucliadb/ingest/fields/base.py +9 -17
- nucliadb/ingest/fields/conversation.py +47 -1
- nucliadb/ingest/orm/brain_v2.py +21 -3
- nucliadb/ingest/orm/index_message.py +126 -111
- nucliadb/ingest/orm/knowledgebox.py +84 -43
- nucliadb/ingest/orm/processor/auditing.py +1 -1
- nucliadb/ingest/orm/processor/processor.py +95 -149
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +10 -1
- nucliadb/ingest/partitions.py +12 -1
- nucliadb/ingest/serialize.py +2 -2
- nucliadb/ingest/service/writer.py +26 -19
- nucliadb/ingest/settings.py +33 -11
- nucliadb/learning_proxy.py +12 -15
- nucliadb/metrics_exporter.py +17 -4
- nucliadb/migrator/datamanager.py +11 -17
- nucliadb/migrator/migrator.py +2 -2
- nucliadb/purge/__init__.py +12 -17
- nucliadb/purge/orphan_shards.py +2 -2
- nucliadb/reader/api/v1/knowledgebox.py +40 -12
- nucliadb/reader/api/v1/learning_config.py +30 -10
- nucliadb/reader/api/v1/resource.py +2 -2
- nucliadb/reader/api/v1/services.py +1 -1
- nucliadb/reader/reader/notifications.py +1 -1
- nucliadb/search/api/v1/__init__.py +1 -0
- nucliadb/search/api/v1/catalog.py +4 -4
- nucliadb/search/api/v1/find.py +1 -4
- nucliadb/search/api/v1/hydrate.py +328 -0
- nucliadb/search/api/v1/resource/ask.py +21 -1
- nucliadb/search/api/v1/search.py +1 -4
- nucliadb/search/predict.py +9 -2
- nucliadb/search/search/cache.py +1 -20
- nucliadb/search/search/chat/ask.py +50 -8
- nucliadb/search/search/chat/prompt.py +47 -15
- nucliadb/search/search/chat/query.py +8 -1
- nucliadb/search/search/fetch.py +1 -1
- nucliadb/search/search/find.py +1 -6
- nucliadb/search/search/{hydrator.py → hydrator/__init__.py} +5 -4
- nucliadb/search/search/hydrator/fields.py +175 -0
- nucliadb/search/search/hydrator/images.py +130 -0
- nucliadb/search/search/hydrator/paragraphs.py +307 -0
- nucliadb/search/search/hydrator/resources.py +56 -0
- nucliadb/search/search/metrics.py +16 -0
- nucliadb/search/search/predict_proxy.py +33 -11
- nucliadb/search/search/query.py +0 -23
- nucliadb/search/search/query_parser/fetcher.py +5 -5
- nucliadb/search/search/query_parser/models.py +1 -30
- nucliadb/search/search/query_parser/parsers/ask.py +1 -1
- nucliadb/search/search/query_parser/parsers/catalog.py +4 -7
- nucliadb/search/search/query_parser/parsers/common.py +16 -7
- nucliadb/search/search/query_parser/parsers/find.py +0 -11
- nucliadb/search/search/query_parser/parsers/graph.py +5 -5
- nucliadb/search/search/query_parser/parsers/search.py +0 -11
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +4 -11
- nucliadb/search/search/rerankers.py +1 -1
- nucliadb/search/search/summarize.py +1 -1
- nucliadb/standalone/run.py +3 -0
- nucliadb/tasks/retries.py +4 -4
- nucliadb/train/generators/sentence_classifier.py +2 -8
- nucliadb/train/generators/utils.py +1 -1
- nucliadb/train/nodes.py +4 -4
- nucliadb/train/servicer.py +1 -1
- nucliadb/train/uploader.py +1 -1
- nucliadb/writer/api/v1/field.py +14 -9
- nucliadb/writer/api/v1/knowledgebox.py +15 -52
- nucliadb/writer/api/v1/learning_config.py +5 -4
- nucliadb/writer/api/v1/resource.py +2 -2
- nucliadb/writer/resource/field.py +38 -2
- nucliadb/writer/tus/azure.py +4 -4
- nucliadb/writer/tus/gcs.py +11 -17
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/METADATA +9 -10
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/RECORD +124 -114
- nucliadb/common/external_index_providers/pinecone.py +0 -894
- nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/WHEEL +0 -0
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/top_level.txt +0 -0
nucliadb/search/search/hydrator/images.py (new file)

@@ -0,0 +1,130 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+import base64
+from typing import Optional, cast
+
+from nucliadb.common.ids import FIELD_TYPE_STR_TO_NAME, FieldId, ParagraphId
+from nucliadb.ingest.fields.base import Field
+from nucliadb.ingest.fields.file import File
+from nucliadb.search import SERVICE_NAME
+from nucliadb_models.common import FieldTypeName
+from nucliadb_models.search import Image
+from nucliadb_protos import resources_pb2
+from nucliadb_utils.utilities import get_storage
+
+
+async def paragraph_source_image(
+    kbid: str, paragraph_id: ParagraphId, paragraph: resources_pb2.Paragraph
+) -> Optional[Image]:
+    """Certain paragraphs are extracted from images using techniques like OCR or
+    inception. If that's the case, return the original image for this paragraph.
+
+    """
+    source_image = paragraph.representation.reference_file
+    if not source_image:
+        return None
+
+    if paragraph.kind not in (
+        resources_pb2.Paragraph.TypeParagraph.OCR,
+        resources_pb2.Paragraph.TypeParagraph.INCEPTION,
+    ):
+        return None
+
+    field_id = paragraph_id.field_id
+
+    # Paragraphs extracted from an image store its original image representation
+    # in the reference file. The path is incomplete though, as it's stored in
+    # the `generated` folder
+    image = await download_image(
+        kbid,
+        field_id,
+        f"generated/{source_image}",
+        # XXX: we assume all reference files are PNG images, but this actually
+        # depends on learning so it's a dangerous assumption. We should check it
+        # by ourselves
+        mime_type="image/png",
+    )
+    return image
+
+
+async def download_image(
+    kbid: str, field_id: FieldId, image_path: str, *, mime_type: str
+) -> Optional[Image]:
+    storage = await get_storage(service_name=SERVICE_NAME)
+    sf = storage.file_extracted(
+        kbid,
+        field_id.rid,
+        field_id.type,
+        field_id.key,
+        image_path,
+    )
+    raw_image = (await storage.downloadbytes(sf.bucket, sf.key)).getvalue()
+    if not raw_image:
+        return None
+    return Image(content_type=mime_type, b64encoded=base64.b64encode(raw_image).decode())
+
+
+async def download_page_preview(field: Field, page: int) -> Optional[Image]:
+    """Download a specific page preview for a field and return it as an Image.
+    As not all fields have previews, this function can return None.
+
+    Page previews are uploaded by learning and shared through a known path with
+    nucliadb.
+
+    """
+    field_type = FIELD_TYPE_STR_TO_NAME[field.type]
+
+    if field_type == FieldTypeName.FILE:
+        field = cast(File, field)
+        metadata = await field.get_file_extracted_data()
+
+        if metadata is None:
+            return None
+
+        assert page <= len(metadata.file_pages_previews.positions), (
+            f"paragraph page number {page} should be less or equal to the total file pages previews {len(metadata.file_pages_previews.positions)}"
+        )
+        image = await download_image(
+            field.kbid,
+            field.field_id,
+            f"generated/extracted_images_{page}.png",
+            mime_type="image/png",
+        )
+
+    elif field_type == FieldTypeName.LINK:
+        # TODO: in case of links, we want to return the link preview, that is a
+        # link converted to PDF and screenshotted
+        # REVIEW: link preview is an image or a PDF?
+        image = None
+
+    elif (
+        field_type == FieldTypeName.TEXT
+        or field_type == FieldTypeName.CONVERSATION
+        or field_type == FieldTypeName.GENERIC
+    ):
+        # these fields don't have previews
+        image = None
+
+    else:  # pragma: no cover
+        # This is a trick so mypy generates an error if this branch can be reached,
+        # that is, if we are missing some ifs
+        _a: int = "a"
+
+    return image
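Note: a minimal usage sketch of the new helper, not part of the diff. It assumes a running nucliadb deployment with configured storage; the knowledge box id, the paragraph id format and the reference file name below are made up for illustration.

import asyncio

from nucliadb.common.ids import ParagraphId
from nucliadb.search.search.hydrator.images import paragraph_source_image
from nucliadb_protos import resources_pb2


async def main() -> None:
    kbid = "my-kb"  # hypothetical knowledge box id
    # Illustrative id format: resource id / field type / field key / start-end
    pid = ParagraphId.from_string("rid/f/myfile/0-120")
    # Only OCR/INCEPTION paragraphs carrying a reference_file yield an image
    paragraph = resources_pb2.Paragraph(kind=resources_pb2.Paragraph.TypeParagraph.OCR)
    paragraph.representation.reference_file = "image.png"
    image = await paragraph_source_image(kbid, pid, paragraph)
    if image is not None:
        print(image.content_type, len(image.b64encoded))


asyncio.run(main())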
nucliadb/search/search/hydrator/paragraphs.py (new file)

@@ -0,0 +1,307 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+import asyncio
+from dataclasses import dataclass
+from typing import Optional, Union
+
+from nucliadb.common.ids import FieldId, ParagraphId
+from nucliadb.ingest.fields.base import Field
+from nucliadb.ingest.orm.resource import Resource
+from nucliadb.search.search import paragraphs
+from nucliadb.search.search.hydrator.fields import page_preview_id
+from nucliadb.search.search.hydrator.images import paragraph_source_image
+from nucliadb_models import hydration as hydration_models
+from nucliadb_protos import resources_pb2
+from nucliadb_protos.resources_pb2 import FieldComputedMetadata
+
+
+class ParagraphIndex:
+    """Small helper class to cache field paragraphs and their relations and be
+    used as an index.
+
+    """
+
+    NEXT = "next"
+    PREVIOUS = "previous"
+    PARENTS = "parents"
+    SIBLINGS = "siblings"
+    REPLACEMENTS = "replacements"
+
+    def __init__(self, field_id: FieldId) -> None:
+        self.field_id = field_id
+        self.paragraphs: dict[str, resources_pb2.Paragraph] = {}
+        self.neighbours: dict[tuple[str, str], str] = {}
+        self.related: dict[tuple[str, str], list[str]] = {}
+        self._lock = asyncio.Lock()
+        self._built = False
+
+    async def build(self, field: Field):
+        """Build the index if it hasn't been built yet.
+
+        This function is async-safe: multiple concurrent tasks can ask for a
+        build and it'll only be done once
+        """
+        if self._built:
+            return
+
+        async with self._lock:
+            # double check we haven't built the index while we waited for the
+            # lock
+            if self._built:
+                return
+
+            field_metadata = await field.get_field_metadata()
+
+            if field_metadata is None:
+                # field metadata may be still processing. As we want to provide a
+                # consistent view, even if it can appear while we hydrate, we
+                # consider we don't have it. We mark the index as built and no
+                # paragraph will be found for this field
+                self._built = True
+                return None
+
+            # REVIEW: this is CPU-bound code, we may consider running this in an
+            # executor to not block the loop
+            self._build(field_metadata)
+            self._built = True
+
+    def _build(self, field_metadata: FieldComputedMetadata):
+        self.paragraphs.clear()
+        self.neighbours.clear()
+        self.related.clear()
+
+        if self.field_id.subfield_id is None:
+            field_paragraphs = field_metadata.metadata.paragraphs
+        else:
+            field_paragraphs = field_metadata.split_metadata[self.field_id.subfield_id].paragraphs
+
+        previous = None
+        for paragraph in field_paragraphs:
+            paragraph_id = self.field_id.paragraph_id(paragraph.start, paragraph.end).full()
+            self.paragraphs[paragraph_id] = paragraph
+
+            if previous is not None:
+                self.neighbours[(previous, ParagraphIndex.NEXT)] = paragraph_id
+                self.neighbours[(paragraph_id, ParagraphIndex.PREVIOUS)] = previous
+            previous = paragraph_id
+
+            self.related[(paragraph_id, ParagraphIndex.PARENTS)] = [
+                parent for parent in paragraph.relations.parents
+            ]
+            self.related[(paragraph_id, ParagraphIndex.SIBLINGS)] = [
+                sibling for sibling in paragraph.relations.siblings
+            ]
+            self.related[(paragraph_id, ParagraphIndex.REPLACEMENTS)] = [
+                replacement for replacement in paragraph.relations.replacements
+            ]
+
+    def get(self, paragraph_id: Union[str, ParagraphId]) -> Optional[resources_pb2.Paragraph]:
+        paragraph_id = str(paragraph_id)
+        return self.paragraphs.get(paragraph_id)
+
+    def previous(self, paragraph_id: Union[str, ParagraphId]) -> Optional[str]:
+        paragraph_id = str(paragraph_id)
+        return self.neighbours.get((paragraph_id, ParagraphIndex.PREVIOUS))
+
+    def next(self, paragraph_id: Union[str, ParagraphId]) -> Optional[str]:
+        paragraph_id = str(paragraph_id)
+        return self.neighbours.get((paragraph_id, ParagraphIndex.NEXT))
+
+    def n_previous(self, paragraph_id: Union[str, ParagraphId], count: int = 1) -> list[str]:
+        assert count >= 1, f"can't find negative previous {count}"
+        paragraph_id = str(paragraph_id)
+        previous: list[str] = []
+        current_id = paragraph_id
+        for _ in range(count):
+            previous_id = self.previous(current_id)
+            if previous_id is None:
+                # we've reached the first paragraph
+                break
+            previous.insert(0, previous_id)
+            current_id = previous_id
+        return previous
+
+    def n_next(self, paragraph_id: Union[str, ParagraphId], count: int = 1) -> list[str]:
+        assert count >= 1, f"can't find negative nexts {count}"
+        paragraph_id = str(paragraph_id)
+        nexts = []
+        current_id = paragraph_id
+        for _ in range(count):
+            next_id = self.next(current_id)
+            if next_id is None:
+                # we've reached the last paragraph
+                break
+            current_id = next_id
+            nexts.append(next_id)
+        return nexts
+
+    def parents(self, paragraph_id: Union[str, ParagraphId]) -> list[str]:
+        paragraph_id = str(paragraph_id)
+        return self.related.get((paragraph_id, ParagraphIndex.PARENTS), [])
+
+    def siblings(self, paragraph_id: Union[str, ParagraphId]) -> list[str]:
+        paragraph_id = str(paragraph_id)
+        return self.related.get((paragraph_id, ParagraphIndex.SIBLINGS), [])
+
+    def replacements(self, paragraph_id: Union[str, ParagraphId]) -> list[str]:
+        paragraph_id = str(paragraph_id)
+        return self.related.get((paragraph_id, ParagraphIndex.REPLACEMENTS), [])
+
+
+@dataclass
+class ExtraParagraphHydration:
+    field_page: Optional[int]
+    field_table_page: Optional[int]
+    related_paragraph_ids: list[ParagraphId]
+
+
+async def hydrate_paragraph(
+    resource: Resource,
+    field: Field,
+    paragraph_id: ParagraphId,
+    config: hydration_models.ParagraphHydration,
+    field_paragraphs_index: ParagraphIndex,
+) -> tuple[hydration_models.HydratedParagraph, ExtraParagraphHydration]:
+    """Hydrate a paragraph and return the extra hydration to build a coherent
+    hydration around this paragraph.
+
+    Although the resource and field exist, the paragraph doesn't necessarily
+    need to be a real one in the paragraph metadata, it can be made-up to
+    include more or less text than originally extracted.
+
+    """
+    kbid = resource.kb.kbid
+
+    hydrated = hydration_models.HydratedParagraph(
+        id=paragraph_id.full(),
+        field=paragraph_id.field_id.full(),
+        resource=paragraph_id.rid,
+    )
+    extra_hydration = ExtraParagraphHydration(
+        field_page=None, field_table_page=None, related_paragraph_ids=[]
+    )
+
+    if config.text:
+        text = await paragraphs.get_paragraph_text(kbid=kbid, paragraph_id=paragraph_id)
+        hydrated.text = text
+
+    requires_paragraph_metadata = config.image or config.table or config.page or config.related
+    if requires_paragraph_metadata:
+        await field_paragraphs_index.build(field)
+        paragraph = field_paragraphs_index.get(paragraph_id)
+        if paragraph is not None:
+            # otherwise, this is a fake paragraph. We can't hydrate anything else here
+
+            if config.related:
+                hydrated.related, related_ids = await related_paragraphs_refs(
+                    paragraph_id, field_paragraphs_index, config.related
+                )
+                extra_hydration.related_paragraph_ids = related_ids
+
+            if config.image:
+                hydrated.image = hydration_models.HydratedParagraphImage()
+
+                if config.image.source_image:
+                    hydrated.image.source_image = await paragraph_source_image(
+                        kbid, paragraph_id, paragraph
+                    )
+
+            if config.page:
+                if hydrated.page is None:
+                    hydrated.page = hydration_models.HydratedParagraphPage()
+
+                if config.page.page_with_visual:
+                    if paragraph.page.page_with_visual:
+                        # Paragraphs can be found on pages with visual content. In this
+                        # case, we want to return the preview of the paragraph page as
+                        # an image
+                        page_number = paragraph.page.page
+                        # TODO: what should I do if I later find there's no page in the DB?
+                        hydrated.page.page_preview_ref = page_preview_id(page_number)
+                        extra_hydration.field_page = page_number
+
+            if config.table:
+                if hydrated.table is None:
+                    hydrated.table = hydration_models.HydratedParagraphTable()
+
+                if config.table.table_page_preview:
+                    if paragraph.representation.is_a_table:
+                        # When a paragraph comes with a table and table hydration is
+                        # enabled, we want to return the image representing that table.
+                        # Ideally we should hydrate the paragraph reference_file, but
+                        # table screenshots are not always perfect so we prefer to use
+                        # the page preview. If at some point the table images are good
+                        # enough, it'd be better to use those
+                        page_number = paragraph.page.page
+                        hydrated.table.page_preview_ref = page_preview_id(page_number)
+                        extra_hydration.field_table_page = page_number
+
+    return hydrated, extra_hydration
+
+
+async def related_paragraphs_refs(
+    paragraph_id: ParagraphId,
+    index: ParagraphIndex,
+    config: hydration_models.RelatedParagraphHydration,
+) -> tuple[hydration_models.RelatedParagraphRefs, list[ParagraphId]]:
+    """Compute the related paragraph references for a specific `paragraph_id`
+    and return them with the plain list of unique related paragraphs (to
+    facilitate work for the caller).
+
+    """
+    hydrated = hydration_models.RelatedParagraphRefs()
+    related = set()
+
+    if config.neighbours:
+        hydrated.neighbours = hydration_models.RelatedNeighbourParagraphRefs()
+
+        if config.neighbours.before is not None:
+            hydrated.neighbours.before = []
+            if config.neighbours.before > 0:
+                for previous_id in index.n_previous(paragraph_id, config.neighbours.before):
+                    hydrated.neighbours.before.insert(0, previous_id)
+                    related.add(ParagraphId.from_string(previous_id))
+
+        if config.neighbours.after is not None:
+            hydrated.neighbours.after = []
+            if config.neighbours.after > 0:
+                for next_id in index.n_next(paragraph_id, config.neighbours.after):
+                    hydrated.neighbours.after.append(next_id)
+                    related.add(ParagraphId.from_string(next_id))
+
+    if config.parents:
+        hydrated.parents = []
+        for parent_id in index.parents(paragraph_id):
+            hydrated.parents.append(parent_id)
+            related.add(ParagraphId.from_string(parent_id))
+
+    if config.siblings:
+        hydrated.siblings = []
+        for sibling_id in index.siblings(paragraph_id):
+            hydrated.siblings.append(sibling_id)
+            related.add(ParagraphId.from_string(sibling_id))
+
+    if config.replacements:
+        hydrated.replacements = []
+        for replacement_id in index.replacements(paragraph_id):
+            hydrated.replacements.append(replacement_id)
+            related.add(ParagraphId.from_string(replacement_id))
+
+    return hydrated, list(related)
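Note: the neighbour bookkeeping above is a flat dict keyed by (paragraph_id, direction) tuples rather than a linked structure, built in a single pass over the field's paragraphs. A self-contained toy sketch of that pattern (ids are made up, no nucliadb imports needed):

# Toy reimplementation of the (id, direction) neighbour index used above.
NEXT, PREVIOUS = "next", "previous"

paragraph_ids = ["p/0-10", "p/10-25", "p/25-40", "p/40-60"]

neighbours: dict[tuple[str, str], str] = {}
previous = None
for pid in paragraph_ids:
    if previous is not None:
        neighbours[(previous, NEXT)] = pid
        neighbours[(pid, PREVIOUS)] = previous
    previous = pid


def n_previous(pid: str, count: int) -> list[str]:
    # Walk backwards, keeping document order, and stop at the first paragraph.
    out: list[str] = []
    while count > 0 and (pid, PREVIOUS) in neighbours:
        pid = neighbours[(pid, PREVIOUS)]
        out.insert(0, pid)
        count -= 1
    return out


assert n_previous("p/40-60", 2) == ["p/10-25", "p/25-40"]
assert n_previous("p/0-10", 3) == []
print("neighbour index ok")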
nucliadb/search/search/hydrator/resources.py (new file)

@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+
+from nucliadb.common.models_utils import from_proto
+from nucliadb.ingest.orm.resource import Resource
+from nucliadb_models import hydration as hydration_models
+from nucliadb_models.security import ResourceSecurity
+
+
+async def hydrate_resource(
+    resource: Resource, rid: str, config: hydration_models.ResourceHydration
+) -> hydration_models.HydratedResource:
+    basic = await resource.get_basic()
+
+    slug = basic.slug
+    hydrated = hydration_models.HydratedResource(id=rid, slug=slug)
+
+    if config.title:
+        hydrated.title = basic.title
+    if config.summary:
+        hydrated.summary = basic.summary
+
+    if config.security:
+        security = await resource.get_security()
+        hydrated.security = ResourceSecurity(access_groups=[])
+        if security is not None:
+            for group_id in security.access_groups:
+                hydrated.security.access_groups.append(group_id)
+
+    if config.origin:
+        origin = await resource.get_origin()
+        if origin is not None:
+            # TODO: we want a better hydration than proto to JSON
+            hydrated.origin = from_proto.origin(origin)
+
+    return hydrated
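Note: hydrate_resource follows an opt-in pattern where each config flag gates exactly one fetch. A standalone sketch of the idea, with stand-in types instead of the real nucliadb_models classes:

from dataclasses import dataclass
from typing import Optional


@dataclass
class ResourceHydrationConfig:
    """Stand-in for hydration_models.ResourceHydration."""

    title: bool = False
    summary: bool = False


@dataclass
class HydratedResource:
    """Stand-in for the real hydration model."""

    id: str
    title: Optional[str] = None
    summary: Optional[str] = None


def hydrate(rid: str, stored: dict[str, str], config: ResourceHydrationConfig) -> HydratedResource:
    hydrated = HydratedResource(id=rid)
    # Each config flag gates exactly one (potentially expensive) lookup.
    if config.title:
        hydrated.title = stored.get("title")
    if config.summary:
        hydrated.summary = stored.get("summary")
    return hydrated


res = hydrate("r1", {"title": "T", "summary": "S"}, ResourceHydrationConfig(title=True))
assert res.title == "T" and res.summary is None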
nucliadb/search/search/metrics.py

@@ -49,6 +49,10 @@ buckets = [
 ]
 
 generative_first_chunk_histogram = metrics.Histogram(
+    name="generative_reasoning_first_chunk",
+    buckets=buckets,
+)
+reasoning_first_chunk_histogram = metrics.Histogram(
     name="generative_first_chunk",
     buckets=buckets,
 )
@@ -107,12 +111,24 @@ class AskMetrics(Metrics):
         super().__init__(id="ask")
         self.global_start = time.monotonic()
         self.first_chunk_yielded_at: Optional[float] = None
+        self.first_reasoning_chunk_yielded_at: Optional[float] = None
 
     def record_first_chunk_yielded(self):
         self.first_chunk_yielded_at = time.monotonic()
         generative_first_chunk_histogram.observe(self.first_chunk_yielded_at - self.global_start)
 
+    def record_first_reasoning_chunk_yielded(self):
+        self.first_reasoning_chunk_yielded_at = time.monotonic()
+        reasoning_first_chunk_histogram.observe(
+            self.first_reasoning_chunk_yielded_at - self.global_start
+        )
+
     def get_first_chunk_time(self) -> Optional[float]:
         if self.first_chunk_yielded_at is None:
             return None
         return self.first_chunk_yielded_at - self.global_start
+
+    def get_first_reasoning_chunk_time(self) -> Optional[float]:
+        if self.first_reasoning_chunk_yielded_at is None:
+            return None
+        return self.first_reasoning_chunk_yielded_at - self.global_start
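Note: the new reasoning timings mirror the existing first-chunk measurement: record one monotonic timestamp per chunk kind and derive latencies from a shared start. A minimal standalone sketch of that pattern:

import time
from typing import Optional


class FirstChunkTimer:
    """Toy version of the AskMetrics first-chunk bookkeeping."""

    def __init__(self) -> None:
        self.global_start = time.monotonic()
        self.first_chunk_at: Optional[float] = None
        self.first_reasoning_chunk_at: Optional[float] = None

    def record_first_chunk(self) -> None:
        if self.first_chunk_at is None:  # only the first observation counts
            self.first_chunk_at = time.monotonic()

    def record_first_reasoning_chunk(self) -> None:
        if self.first_reasoning_chunk_at is None:
            self.first_reasoning_chunk_at = time.monotonic()

    def first_chunk_time(self) -> Optional[float]:
        if self.first_chunk_at is None:
            return None
        return self.first_chunk_at - self.global_start


timer = FirstChunkTimer()
timer.record_first_reasoning_chunk()  # reasoning often streams before the answer
timer.record_first_chunk()
assert timer.first_chunk_time() is not None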
nucliadb/search/search/predict_proxy.py

@@ -28,6 +28,7 @@ from multidict import CIMultiDictProxy
 from nuclia_models.predict.generative_responses import (
     GenerativeChunk,
     JSONGenerativeResponse,
+    ReasoningGenerativeResponse,
     StatusGenerativeResponse,
     TextGenerativeResponse,
 )

@@ -87,6 +88,7 @@ async def predict_proxy(
     predict_headers = predict.get_predict_headers(kbid)
     user_headers = {k: v for k, v in headers.items() if k.capitalize() in ALLOWED_HEADERS}
 
+    metrics = AskMetrics()
     # Proxy the request to predict API
     predict_response = await predict.make_request(
         method=method,

@@ -109,7 +111,8 @@ async def predict_proxy(
             client_type=client_type,
             origin=origin,
             user_query=user_query,
-            is_json=media_type == "application/json",
+            is_ndjson_stream="json" in (media_type or ""),
+            metrics=metrics,
         )
     else:
         streaming_generator = predict_response.content.iter_any()

@@ -120,7 +123,6 @@ async def predict_proxy(
             media_type=media_type,
         )
     else:
-        metrics = AskMetrics()
         with metrics.time(PREDICT_ANSWER_METRIC):
             content = await predict_response.read()
 

@@ -140,8 +142,10 @@ async def predict_proxy(
             client_type=client_type,
             origin=origin,
             text_answer=content,
+            text_reasoning=None,
             generative_answer_time=metrics[PREDICT_ANSWER_METRIC],
             generative_answer_first_chunk_time=None,
+            generative_reasoning_first_chunk_time=None,
             status_code=AnswerStatusCode(str(llm_status_code)),
         )
 

@@ -170,26 +174,35 @@ async def chat_streaming_generator(
     client_type: NucliaDBClientType,
     origin: str,
     user_query: str,
-    is_json: bool,
+    is_ndjson_stream: bool,
+    metrics: AskMetrics,
 ):
     first = True
+    first_reasoning = True
     status_code = AnswerStatusCode.ERROR.value
     text_answer = ""
+    text_reasoning = ""
     json_object = None
-    metrics = AskMetrics()
     with metrics.time(PREDICT_ANSWER_METRIC):
         async for chunk in predict_response.content:
-            if first:
-                metrics.record_first_chunk_yielded()
-                first = False
-
             yield chunk
-
-            if is_json:
+            if is_ndjson_stream:
                 try:
                     parsed_chunk = GenerativeChunk.model_validate_json(chunk).chunk
+                    if first and isinstance(
+                        parsed_chunk,
+                        (TextGenerativeResponse, JSONGenerativeResponse, StatusGenerativeResponse),
+                    ):
+                        metrics.record_first_chunk_yielded()
+                        first = False
+
                     if isinstance(parsed_chunk, TextGenerativeResponse):
                         text_answer += parsed_chunk.text
+                    elif isinstance(parsed_chunk, ReasoningGenerativeResponse):
+                        if first_reasoning:
+                            metrics.record_first_reasoning_chunk_yielded()
+                            first_reasoning = False
+                        text_reasoning += parsed_chunk.text
                     elif isinstance(parsed_chunk, JSONGenerativeResponse):
                         json_object = parsed_chunk.object
                     elif isinstance(parsed_chunk, StatusGenerativeResponse):

@@ -201,8 +214,11 @@ async def chat_streaming_generator(
                 )
             else:
                 text_answer += chunk.decode()
+                if first:
+                    metrics.record_first_chunk_yielded()
+                    first = False
 
-        if is_json is False and chunk:  # Ensure chunk is not empty before decoding
+        if is_ndjson_stream is False and chunk:  # Ensure chunk is not empty before decoding
             # If response is text the status_code comes at the last chunk of data
             last_chunk = chunk.decode()
             if last_chunk[-1] == "0":

@@ -218,8 +234,10 @@ async def chat_streaming_generator(
         client_type=client_type,
         origin=origin,
         text_answer=text_answer.encode() if json_object is None else json.dumps(json_object).encode(),
+        text_reasoning=text_reasoning if text_reasoning else None,
         generative_answer_time=metrics[PREDICT_ANSWER_METRIC],
         generative_answer_first_chunk_time=metrics.get_first_chunk_time(),
+        generative_reasoning_first_chunk_time=metrics.get_first_reasoning_chunk_time(),
         status_code=AnswerStatusCode(status_code),
     )
 

@@ -232,8 +250,10 @@ def audit_predict_proxy_endpoint(
     client_type: NucliaDBClientType,
     origin: str,
     text_answer: bytes,
+    text_reasoning: Optional[str],
     generative_answer_time: float,
     generative_answer_first_chunk_time: Optional[float],
+    generative_reasoning_first_chunk_time: Optional[float],
     status_code: AnswerStatusCode,
 ):
     maybe_audit_chat(

@@ -250,8 +270,10 @@ def audit_predict_proxy_endpoint(
         query_context_order={},
         model=headers.get(NUCLIA_LEARNING_MODEL_HEADER),
         text_answer=text_answer,
+        text_reasoning=text_reasoning,
         generative_answer_time=generative_answer_time,
         generative_answer_first_chunk_time=generative_answer_first_chunk_time or 0,
+        generative_reasoning_first_chunk_time=generative_reasoning_first_chunk_time,
         rephrase_time=None,
         status_code=status_code,
     )
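Note: the reworked generator classifies each streamed chunk and accumulates answer and reasoning text separately. A self-contained sketch of that dispatch loop, using plain dicts and a simplified, hypothetical wire format instead of the nuclia_models response classes:

import json

# Hypothetical NDJSON stream: each line wraps one typed chunk.
stream = [
    b'{"chunk": {"type": "reasoning", "text": "thinking... "}}',
    b'{"chunk": {"type": "text", "text": "Hello"}}',
    b'{"chunk": {"type": "status", "code": "0"}}',
]

text_answer = ""
text_reasoning = ""
status_code = None
for raw in stream:
    chunk = json.loads(raw)["chunk"]
    if chunk["type"] == "text":
        text_answer += chunk["text"]
    elif chunk["type"] == "reasoning":  # reasoning chunks are new in this release
        text_reasoning += chunk["text"]
    elif chunk["type"] == "status":
        status_code = chunk["code"]

assert text_answer == "Hello"
assert text_reasoning.startswith("thinking")
assert status_code == "0"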