nucliadb 6.7.2.post4862__py3-none-any.whl → 6.9.2.post5282__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nucliadb might be problematic.

Files changed (126)
  1. migrations/0016_upgrade_to_paragraphs_v2.py +1 -1
  2. migrations/0017_multiple_writable_shards.py +1 -1
  3. migrations/0018_purge_orphan_kbslugs.py +1 -1
  4. migrations/0019_upgrade_to_paragraphs_v3.py +1 -1
  5. migrations/0021_overwrite_vectorsets_key.py +1 -1
  6. migrations/0023_backfill_pg_catalog.py +7 -3
  7. migrations/0025_assign_models_to_kbs_v2.py +3 -3
  8. migrations/0027_rollover_texts3.py +1 -1
  9. migrations/0028_extracted_vectors_reference.py +1 -1
  10. migrations/0029_backfill_field_status.py +1 -1
  11. migrations/0032_remove_old_relations.py +1 -1
  12. migrations/0036_backfill_catalog_slug.py +1 -1
  13. migrations/0037_backfill_catalog_facets.py +1 -1
  14. migrations/0038_backfill_catalog_field_labels.py +7 -3
  15. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  16. migrations/0040_migrate_search_configurations.py +79 -0
  17. migrations/pg/0010_shards_index.py +34 -0
  18. nucliadb/backups/create.py +3 -3
  19. nucliadb/backups/restore.py +3 -3
  20. nucliadb/common/cache.py +1 -1
  21. nucliadb/common/catalog/__init__.py +79 -0
  22. nucliadb/common/catalog/dummy.py +36 -0
  23. nucliadb/common/catalog/interface.py +85 -0
  24. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +294 -208
  25. nucliadb/common/catalog/utils.py +56 -0
  26. nucliadb/common/cluster/manager.py +3 -19
  27. nucliadb/common/cluster/rebalance.py +484 -110
  28. nucliadb/common/cluster/rollover.py +29 -0
  29. nucliadb/common/cluster/settings.py +1 -1
  30. nucliadb/common/cluster/utils.py +26 -0
  31. nucliadb/common/datamanagers/atomic.py +6 -0
  32. nucliadb/common/datamanagers/utils.py +2 -2
  33. nucliadb/common/external_index_providers/manager.py +1 -29
  34. nucliadb/common/external_index_providers/settings.py +1 -27
  35. nucliadb/common/filter_expression.py +16 -33
  36. nucliadb/common/http_clients/exceptions.py +8 -0
  37. nucliadb/common/http_clients/processing.py +4 -0
  38. nucliadb/common/http_clients/utils.py +3 -0
  39. nucliadb/common/ids.py +77 -55
  40. nucliadb/common/locking.py +4 -4
  41. nucliadb/common/maindb/driver.py +11 -1
  42. nucliadb/common/maindb/local.py +1 -1
  43. nucliadb/common/maindb/pg.py +1 -1
  44. nucliadb/common/nidx.py +19 -1
  45. nucliadb/common/vector_index_config.py +1 -1
  46. nucliadb/export_import/datamanager.py +3 -3
  47. nucliadb/ingest/consumer/pull.py +7 -0
  48. nucliadb/ingest/consumer/service.py +2 -27
  49. nucliadb/ingest/consumer/shard_creator.py +17 -6
  50. nucliadb/ingest/fields/base.py +9 -17
  51. nucliadb/ingest/fields/conversation.py +47 -1
  52. nucliadb/ingest/orm/brain_v2.py +21 -3
  53. nucliadb/ingest/orm/index_message.py +126 -111
  54. nucliadb/ingest/orm/knowledgebox.py +84 -43
  55. nucliadb/ingest/orm/processor/auditing.py +1 -1
  56. nucliadb/ingest/orm/processor/processor.py +95 -149
  57. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  58. nucliadb/ingest/orm/resource.py +10 -1
  59. nucliadb/ingest/partitions.py +12 -1
  60. nucliadb/ingest/serialize.py +2 -2
  61. nucliadb/ingest/service/writer.py +26 -19
  62. nucliadb/ingest/settings.py +33 -11
  63. nucliadb/learning_proxy.py +12 -15
  64. nucliadb/metrics_exporter.py +17 -4
  65. nucliadb/migrator/datamanager.py +11 -17
  66. nucliadb/migrator/migrator.py +2 -2
  67. nucliadb/purge/__init__.py +12 -17
  68. nucliadb/purge/orphan_shards.py +2 -2
  69. nucliadb/reader/api/v1/knowledgebox.py +40 -12
  70. nucliadb/reader/api/v1/learning_config.py +30 -10
  71. nucliadb/reader/api/v1/resource.py +2 -2
  72. nucliadb/reader/api/v1/services.py +1 -1
  73. nucliadb/reader/reader/notifications.py +1 -1
  74. nucliadb/search/api/v1/__init__.py +1 -0
  75. nucliadb/search/api/v1/catalog.py +4 -4
  76. nucliadb/search/api/v1/find.py +1 -4
  77. nucliadb/search/api/v1/hydrate.py +328 -0
  78. nucliadb/search/api/v1/resource/ask.py +21 -1
  79. nucliadb/search/api/v1/search.py +1 -4
  80. nucliadb/search/predict.py +9 -2
  81. nucliadb/search/search/cache.py +1 -20
  82. nucliadb/search/search/chat/ask.py +50 -8
  83. nucliadb/search/search/chat/prompt.py +47 -15
  84. nucliadb/search/search/chat/query.py +8 -1
  85. nucliadb/search/search/fetch.py +1 -1
  86. nucliadb/search/search/find.py +1 -6
  87. nucliadb/search/search/{hydrator.py → hydrator/__init__.py} +5 -4
  88. nucliadb/search/search/hydrator/fields.py +175 -0
  89. nucliadb/search/search/hydrator/images.py +130 -0
  90. nucliadb/search/search/hydrator/paragraphs.py +307 -0
  91. nucliadb/search/search/hydrator/resources.py +56 -0
  92. nucliadb/search/search/metrics.py +16 -0
  93. nucliadb/search/search/predict_proxy.py +33 -11
  94. nucliadb/search/search/query.py +0 -23
  95. nucliadb/search/search/query_parser/fetcher.py +5 -5
  96. nucliadb/search/search/query_parser/models.py +1 -30
  97. nucliadb/search/search/query_parser/parsers/ask.py +1 -1
  98. nucliadb/search/search/query_parser/parsers/catalog.py +4 -7
  99. nucliadb/search/search/query_parser/parsers/common.py +16 -7
  100. nucliadb/search/search/query_parser/parsers/find.py +0 -11
  101. nucliadb/search/search/query_parser/parsers/graph.py +5 -5
  102. nucliadb/search/search/query_parser/parsers/search.py +0 -11
  103. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +4 -11
  104. nucliadb/search/search/rerankers.py +1 -1
  105. nucliadb/search/search/summarize.py +1 -1
  106. nucliadb/standalone/run.py +3 -0
  107. nucliadb/tasks/retries.py +4 -4
  108. nucliadb/train/generators/sentence_classifier.py +2 -8
  109. nucliadb/train/generators/utils.py +1 -1
  110. nucliadb/train/nodes.py +4 -4
  111. nucliadb/train/servicer.py +1 -1
  112. nucliadb/train/uploader.py +1 -1
  113. nucliadb/writer/api/v1/field.py +14 -9
  114. nucliadb/writer/api/v1/knowledgebox.py +15 -52
  115. nucliadb/writer/api/v1/learning_config.py +5 -4
  116. nucliadb/writer/api/v1/resource.py +2 -2
  117. nucliadb/writer/resource/field.py +38 -2
  118. nucliadb/writer/tus/azure.py +4 -4
  119. nucliadb/writer/tus/gcs.py +11 -17
  120. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/METADATA +9 -10
  121. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/RECORD +124 -114
  122. nucliadb/common/external_index_providers/pinecone.py +0 -894
  123. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  124. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/WHEEL +0 -0
  125. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/entry_points.txt +0 -0
  126. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/top_level.txt +0 -0
nucliadb/search/search/hydrator/images.py (new file)
@@ -0,0 +1,130 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+import base64
+from typing import Optional, cast
+
+from nucliadb.common.ids import FIELD_TYPE_STR_TO_NAME, FieldId, ParagraphId
+from nucliadb.ingest.fields.base import Field
+from nucliadb.ingest.fields.file import File
+from nucliadb.search import SERVICE_NAME
+from nucliadb_models.common import FieldTypeName
+from nucliadb_models.search import Image
+from nucliadb_protos import resources_pb2
+from nucliadb_utils.utilities import get_storage
+
+
+async def paragraph_source_image(
+    kbid: str, paragraph_id: ParagraphId, paragraph: resources_pb2.Paragraph
+) -> Optional[Image]:
+    """Certain paragraphs are extracted from images using techniques like OCR or
+    inception. If that's the case, return the original image for this paragraph.
+
+    """
+    source_image = paragraph.representation.reference_file
+    if not source_image:
+        return None
+
+    if paragraph.kind not in (
+        resources_pb2.Paragraph.TypeParagraph.OCR,
+        resources_pb2.Paragraph.TypeParagraph.INCEPTION,
+    ):
+        return None
+
+    field_id = paragraph_id.field_id
+
+    # Paragraphs extracted from an image store their original image representation
+    # in the reference file. The path is incomplete though, as it's stored in
+    # the `generated` folder
+    image = await download_image(
+        kbid,
+        field_id,
+        f"generated/{source_image}",
+        # XXX: we assume all reference files are PNG images, but this actually
+        # depends on learning so it's a dangerous assumption. We should check it
+        # by ourselves
+        mime_type="image/png",
+    )
+    return image
+
+
+async def download_image(
+    kbid: str, field_id: FieldId, image_path: str, *, mime_type: str
+) -> Optional[Image]:
+    storage = await get_storage(service_name=SERVICE_NAME)
+    sf = storage.file_extracted(
+        kbid,
+        field_id.rid,
+        field_id.type,
+        field_id.key,
+        image_path,
+    )
+    raw_image = (await storage.downloadbytes(sf.bucket, sf.key)).getvalue()
+    if not raw_image:
+        return None
+    return Image(content_type=mime_type, b64encoded=base64.b64encode(raw_image).decode())
+
+
+async def download_page_preview(field: Field, page: int) -> Optional[Image]:
+    """Download a specific page preview for a field and return it as an Image.
+    As not all fields have previews, this function can return None.
+
+    Page previews are uploaded by learning and shared with nucliadb through a
+    known path.
+
+    """
+    field_type = FIELD_TYPE_STR_TO_NAME[field.type]
+
+    if field_type == FieldTypeName.FILE:
+        field = cast(File, field)
+        metadata = await field.get_file_extracted_data()
+
+        if metadata is None:
+            return None
+
+        assert page <= len(metadata.file_pages_previews.positions), (
+            f"paragraph page number {page} should be less or equal to the total file pages previews {len(metadata.file_pages_previews.positions)}"
+        )
+        image = await download_image(
+            field.kbid,
+            field.field_id,
+            f"generated/extracted_images_{page}.png",
+            mime_type="image/png",
+        )
+
+    elif field_type == FieldTypeName.LINK:
+        # TODO: in case of links, we want to return the link preview, that is a
+        # link converted to PDF and screenshotted
+        # REVIEW: is the link preview an image or a PDF?
+        image = None
+
+    elif (
+        field_type == FieldTypeName.TEXT
+        or field_type == FieldTypeName.CONVERSATION
+        or field_type == FieldTypeName.GENERIC
+    ):
+        # these fields don't have previews
+        image = None
+
+    else:  # pragma: no cover
+        # This is a trick so mypy generates an error if this branch can be reached,
+        # that is, if we are missing some ifs
+        _a: int = "a"
+
+    return image
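
The storage lookup above is nucliadb plumbing, but the final byte-to-Image step of download_image is easy to isolate. A minimal, runnable sketch of just that step, using a stand-in Image dataclass rather than the real nucliadb_models.search.Image:

    import base64
    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class Image:  # stand-in for nucliadb_models.search.Image
        content_type: str
        b64encoded: str

    def wrap_image(raw_image: bytes, mime_type: str = "image/png") -> Optional[Image]:
        # empty downloads are treated as "no image", mirroring download_image
        if not raw_image:
            return None
        return Image(content_type=mime_type, b64encoded=base64.b64encode(raw_image).decode())

    assert wrap_image(b"") is None
    assert wrap_image(b"\x89PNG").content_type == "image/png"
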
nucliadb/search/search/hydrator/paragraphs.py (new file)
@@ -0,0 +1,307 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+import asyncio
+from dataclasses import dataclass
+from typing import Optional, Union
+
+from nucliadb.common.ids import FieldId, ParagraphId
+from nucliadb.ingest.fields.base import Field
+from nucliadb.ingest.orm.resource import Resource
+from nucliadb.search.search import paragraphs
+from nucliadb.search.search.hydrator.fields import page_preview_id
+from nucliadb.search.search.hydrator.images import paragraph_source_image
+from nucliadb_models import hydration as hydration_models
+from nucliadb_protos import resources_pb2
+from nucliadb_protos.resources_pb2 import FieldComputedMetadata
+
+
+class ParagraphIndex:
+    """Small helper class to cache field paragraphs and their relations, to be
+    used as an index.
+
+    """
+
+    NEXT = "next"
+    PREVIOUS = "previous"
+    PARENTS = "parents"
+    SIBLINGS = "siblings"
+    REPLACEMENTS = "replacements"
+
+    def __init__(self, field_id: FieldId) -> None:
+        self.field_id = field_id
+        self.paragraphs: dict[str, resources_pb2.Paragraph] = {}
+        self.neighbours: dict[tuple[str, str], str] = {}
+        self.related: dict[tuple[str, str], list[str]] = {}
+        self._lock = asyncio.Lock()
+        self._built = False
+
+    async def build(self, field: Field):
+        """Build the index if it hasn't been built yet.
+
+        This function is async-safe: multiple concurrent tasks can ask for a
+        build and it'll only be done once
+        """
+        if self._built:
+            return
+
+        async with self._lock:
+            # double check we haven't built the index while we waited for the
+            # lock
+            if self._built:
+                return
+
+            field_metadata = await field.get_field_metadata()
+
+            if field_metadata is None:
+                # field metadata may still be processing. As we want to provide a
+                # consistent view, even if it can appear while we hydrate, we
+                # consider we don't have it. We mark the index as built and no
+                # paragraph will be found for this field
+                self._built = True
+                return None
+
+            # REVIEW: this is CPU-bound code, we may consider running it in an
+            # executor to not block the loop
+            self._build(field_metadata)
+            self._built = True
+
+    def _build(self, field_metadata: FieldComputedMetadata):
+        self.paragraphs.clear()
+        self.neighbours.clear()
+        self.related.clear()
+
+        if self.field_id.subfield_id is None:
+            field_paragraphs = field_metadata.metadata.paragraphs
+        else:
+            field_paragraphs = field_metadata.split_metadata[self.field_id.subfield_id].paragraphs
+
+        previous = None
+        for paragraph in field_paragraphs:
+            paragraph_id = self.field_id.paragraph_id(paragraph.start, paragraph.end).full()
+            self.paragraphs[paragraph_id] = paragraph
+
+            if previous is not None:
+                self.neighbours[(previous, ParagraphIndex.NEXT)] = paragraph_id
+                self.neighbours[(paragraph_id, ParagraphIndex.PREVIOUS)] = previous
+            previous = paragraph_id
+
+            self.related[(paragraph_id, ParagraphIndex.PARENTS)] = [
+                parent for parent in paragraph.relations.parents
+            ]
+            self.related[(paragraph_id, ParagraphIndex.SIBLINGS)] = [
+                sibling for sibling in paragraph.relations.siblings
+            ]
+            self.related[(paragraph_id, ParagraphIndex.REPLACEMENTS)] = [
+                replacement for replacement in paragraph.relations.replacements
+            ]
+
+    def get(self, paragraph_id: Union[str, ParagraphId]) -> Optional[resources_pb2.Paragraph]:
+        paragraph_id = str(paragraph_id)
+        return self.paragraphs.get(paragraph_id)
+
+    def previous(self, paragraph_id: Union[str, ParagraphId]) -> Optional[str]:
+        paragraph_id = str(paragraph_id)
+        return self.neighbours.get((paragraph_id, ParagraphIndex.PREVIOUS))
+
+    def next(self, paragraph_id: Union[str, ParagraphId]) -> Optional[str]:
+        paragraph_id = str(paragraph_id)
+        return self.neighbours.get((paragraph_id, ParagraphIndex.NEXT))
+
+    def n_previous(self, paragraph_id: Union[str, ParagraphId], count: int = 1) -> list[str]:
+        assert count >= 1, f"can't find negative previous {count}"
+        paragraph_id = str(paragraph_id)
+        previous: list[str] = []
+        current_id = paragraph_id
+        for _ in range(count):
+            previous_id = self.previous(current_id)
+            if previous_id is None:
+                # we've reached the first paragraph
+                break
+            previous.insert(0, previous_id)
+            current_id = previous_id
+        return previous
+
+    def n_next(self, paragraph_id: Union[str, ParagraphId], count: int = 1) -> list[str]:
+        assert count >= 1, f"can't find negative nexts {count}"
+        paragraph_id = str(paragraph_id)
+        nexts = []
+        current_id = paragraph_id
+        for _ in range(count):
+            next_id = self.next(current_id)
+            if next_id is None:
+                # we've reached the last paragraph
+                break
+            current_id = next_id
+            nexts.append(next_id)
+        return nexts
+
+    def parents(self, paragraph_id: Union[str, ParagraphId]) -> list[str]:
+        paragraph_id = str(paragraph_id)
+        return self.related.get((paragraph_id, ParagraphIndex.PARENTS), [])
+
+    def siblings(self, paragraph_id: Union[str, ParagraphId]) -> list[str]:
+        paragraph_id = str(paragraph_id)
+        return self.related.get((paragraph_id, ParagraphIndex.SIBLINGS), [])
+
+    def replacements(self, paragraph_id: Union[str, ParagraphId]) -> list[str]:
+        paragraph_id = str(paragraph_id)
+        return self.related.get((paragraph_id, ParagraphIndex.REPLACEMENTS), [])
+
+
+@dataclass
+class ExtraParagraphHydration:
+    field_page: Optional[int]
+    field_table_page: Optional[int]
+    related_paragraph_ids: list[ParagraphId]
+
+
+async def hydrate_paragraph(
+    resource: Resource,
+    field: Field,
+    paragraph_id: ParagraphId,
+    config: hydration_models.ParagraphHydration,
+    field_paragraphs_index: ParagraphIndex,
+) -> tuple[hydration_models.HydratedParagraph, ExtraParagraphHydration]:
+    """Hydrate a paragraph and return the extra hydration needed to build a
+    coherent hydration around this paragraph.
+
+    Although the resource and field exist, the paragraph doesn't necessarily
+    need to be a real one in the paragraph metadata; it can be made up to
+    include more or less text than originally extracted.
+
+    """
+    kbid = resource.kb.kbid
+
+    hydrated = hydration_models.HydratedParagraph(
+        id=paragraph_id.full(),
+        field=paragraph_id.field_id.full(),
+        resource=paragraph_id.rid,
+    )
+    extra_hydration = ExtraParagraphHydration(
+        field_page=None, field_table_page=None, related_paragraph_ids=[]
+    )
+
+    if config.text:
+        text = await paragraphs.get_paragraph_text(kbid=kbid, paragraph_id=paragraph_id)
+        hydrated.text = text
+
+    requires_paragraph_metadata = config.image or config.table or config.page or config.related
+    if requires_paragraph_metadata:
+        await field_paragraphs_index.build(field)
+        paragraph = field_paragraphs_index.get(paragraph_id)
+        if paragraph is not None:
+            # otherwise, this is a fake paragraph. We can't hydrate anything else here
+
+            if config.related:
+                hydrated.related, related_ids = await related_paragraphs_refs(
+                    paragraph_id, field_paragraphs_index, config.related
+                )
+                extra_hydration.related_paragraph_ids = related_ids
+
+            if config.image:
+                hydrated.image = hydration_models.HydratedParagraphImage()
+
+                if config.image.source_image:
+                    hydrated.image.source_image = await paragraph_source_image(
+                        kbid, paragraph_id, paragraph
+                    )
+
+            if config.page:
+                if hydrated.page is None:
+                    hydrated.page = hydration_models.HydratedParagraphPage()
+
+                if config.page.page_with_visual:
+                    if paragraph.page.page_with_visual:
+                        # Paragraphs can be found on pages with visual content. In this
+                        # case, we want to return the preview of the paragraph page as
+                        # an image
+                        page_number = paragraph.page.page
+                        # TODO: what should I do if I later find there's no page in the DB?
+                        hydrated.page.page_preview_ref = page_preview_id(page_number)
+                        extra_hydration.field_page = page_number
+
+            if config.table:
+                if hydrated.table is None:
+                    hydrated.table = hydration_models.HydratedParagraphTable()
+
+                if config.table.table_page_preview:
+                    if paragraph.representation.is_a_table:
+                        # When a paragraph comes with a table and table hydration is
+                        # enabled, we want to return the image representing that table.
+                        # Ideally we should hydrate the paragraph reference_file, but
+                        # table screenshots are not always perfect so we prefer to use
+                        # the page preview. If at some point the table images are good
+                        # enough, it'd be better to use those
+                        page_number = paragraph.page.page
+                        hydrated.table.page_preview_ref = page_preview_id(page_number)
+                        extra_hydration.field_table_page = page_number
+
+    return hydrated, extra_hydration
+
+
+async def related_paragraphs_refs(
+    paragraph_id: ParagraphId,
+    index: ParagraphIndex,
+    config: hydration_models.RelatedParagraphHydration,
+) -> tuple[hydration_models.RelatedParagraphRefs, list[ParagraphId]]:
+    """Compute the related paragraph references for a specific `paragraph_id`
+    and return them with the plain list of unique related paragraphs (to make
+    the caller's work easier).
+
+    """
+    hydrated = hydration_models.RelatedParagraphRefs()
+    related = set()
+
+    if config.neighbours:
+        hydrated.neighbours = hydration_models.RelatedNeighbourParagraphRefs()
+
+        if config.neighbours.before is not None:
+            hydrated.neighbours.before = []
+            if config.neighbours.before > 0:
+                for previous_id in index.n_previous(paragraph_id, config.neighbours.before):
+                    hydrated.neighbours.before.insert(0, previous_id)
+                    related.add(ParagraphId.from_string(previous_id))
+
+        if config.neighbours.after is not None:
+            hydrated.neighbours.after = []
+            if config.neighbours.after > 0:
+                for next_id in index.n_next(paragraph_id, config.neighbours.after):
+                    hydrated.neighbours.after.append(next_id)
+                    related.add(ParagraphId.from_string(next_id))
+
+    if config.parents:
+        hydrated.parents = []
+        for parent_id in index.parents(paragraph_id):
+            hydrated.parents.append(parent_id)
+            related.add(ParagraphId.from_string(parent_id))
+
+    if config.siblings:
+        hydrated.siblings = []
+        for sibling_id in index.siblings(paragraph_id):
+            hydrated.siblings.append(sibling_id)
+            related.add(ParagraphId.from_string(sibling_id))
+
+    if config.replacements:
+        hydrated.replacements = []
+        for replacement_id in index.replacements(paragraph_id):
+            hydrated.replacements.append(replacement_id)
+            related.add(ParagraphId.from_string(replacement_id))
+
+    return hydrated, list(related)
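
ParagraphIndex.build is a textbook double-checked lock: a cheap flag test on the fast path, then a re-check under the lock so concurrent hydration tasks trigger at most one build. A self-contained sketch of the pattern (LazyIndex and its timings are illustrative, not nucliadb code):

    import asyncio

    class LazyIndex:
        def __init__(self) -> None:
            self._lock = asyncio.Lock()
            self._built = False
            self.builds = 0

        async def build(self) -> None:
            if self._built:  # fast path: no lock taken once built
                return
            async with self._lock:
                if self._built:  # re-check: another task may have built it meanwhile
                    return
                await asyncio.sleep(0.01)  # simulate the expensive metadata fetch
                self.builds += 1
                self._built = True

    async def main() -> None:
        index = LazyIndex()
        await asyncio.gather(*(index.build() for _ in range(100)))
        assert index.builds == 1  # 100 concurrent callers, exactly one build

    asyncio.run(main())
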
nucliadb/search/search/hydrator/resources.py (new file)
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+
+from nucliadb.common.models_utils import from_proto
+from nucliadb.ingest.orm.resource import Resource
+from nucliadb_models import hydration as hydration_models
+from nucliadb_models.security import ResourceSecurity
+
+
+async def hydrate_resource(
+    resource: Resource, rid: str, config: hydration_models.ResourceHydration
+) -> hydration_models.HydratedResource:
+    basic = await resource.get_basic()
+
+    slug = basic.slug
+    hydrated = hydration_models.HydratedResource(id=rid, slug=slug)
+
+    if config.title:
+        hydrated.title = basic.title
+    if config.summary:
+        hydrated.summary = basic.summary
+
+    if config.security:
+        security = await resource.get_security()
+        hydrated.security = ResourceSecurity(access_groups=[])
+        if security is not None:
+            for group_id in security.access_groups:
+                hydrated.security.access_groups.append(group_id)
+
+    if config.origin:
+        origin = await resource.get_origin()
+        if origin is not None:
+            # TODO: we want a better hydration than proto to JSON
+            hydrated.origin = from_proto.origin(origin)
+
+    return hydrated
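
hydrate_resource follows the same config-driven pattern as the paragraph hydrator: each boolean in the hydration config gates one fetch, so callers only pay for the fields they request. A minimal sketch of that shape with stand-in models (not the real nucliadb_models classes):

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class ResourceHydration:  # stand-in config
        title: bool = False
        summary: bool = False

    @dataclass
    class HydratedResource:  # stand-in result
        id: str
        title: Optional[str] = None
        summary: Optional[str] = None

    def hydrate(rid: str, basic: dict, config: ResourceHydration) -> HydratedResource:
        hydrated = HydratedResource(id=rid)
        if config.title:  # fetch/copy only what the config enables
            hydrated.title = basic["title"]
        if config.summary:
            hydrated.summary = basic["summary"]
        return hydrated

    print(hydrate("r1", {"title": "T", "summary": "S"}, ResourceHydration(title=True)))
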
nucliadb/search/search/metrics.py
@@ -49,6 +49,10 @@ buckets = [
 ]
 
+reasoning_first_chunk_histogram = metrics.Histogram(
+    name="generative_reasoning_first_chunk",
+    buckets=buckets,
+)
 generative_first_chunk_histogram = metrics.Histogram(
     name="generative_first_chunk",
     buckets=buckets,
 )
@@ -107,12 +111,24 @@ class AskMetrics(Metrics):
         super().__init__(id="ask")
         self.global_start = time.monotonic()
         self.first_chunk_yielded_at: Optional[float] = None
+        self.first_reasoning_chunk_yielded_at: Optional[float] = None
 
     def record_first_chunk_yielded(self):
         self.first_chunk_yielded_at = time.monotonic()
         generative_first_chunk_histogram.observe(self.first_chunk_yielded_at - self.global_start)
 
+    def record_first_reasoning_chunk_yielded(self):
+        self.first_reasoning_chunk_yielded_at = time.monotonic()
+        reasoning_first_chunk_histogram.observe(
+            self.first_reasoning_chunk_yielded_at - self.global_start
+        )
+
     def get_first_chunk_time(self) -> Optional[float]:
         if self.first_chunk_yielded_at is None:
             return None
         return self.first_chunk_yielded_at - self.global_start
+
+    def get_first_reasoning_chunk_time(self) -> Optional[float]:
+        if self.first_reasoning_chunk_yielded_at is None:
+            return None
+        return self.first_reasoning_chunk_yielded_at - self.global_start
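
The new reasoning methods mirror the existing answer timing: stamp a monotonic start in __init__, then observe the delta into a histogram when the first chunk of that kind arrives. A self-contained sketch of the mechanism, with a list-backed Histogram standing in for the nucliadb_telemetry one:

    import time
    from typing import Optional

    class Histogram:  # stand-in; the real one exports to the metrics backend
        def __init__(self, name: str) -> None:
            self.name = name
            self.samples: list[float] = []

        def observe(self, value: float) -> None:
            self.samples.append(value)

    first_chunk_histogram = Histogram("generative_first_chunk")

    class StreamMetrics:
        def __init__(self) -> None:
            self.global_start = time.monotonic()
            self.first_chunk_yielded_at: Optional[float] = None

        def record_first_chunk_yielded(self) -> None:
            self.first_chunk_yielded_at = time.monotonic()
            first_chunk_histogram.observe(self.first_chunk_yielded_at - self.global_start)

        def get_first_chunk_time(self) -> Optional[float]:
            if self.first_chunk_yielded_at is None:
                return None
            return self.first_chunk_yielded_at - self.global_start

    metrics = StreamMetrics()
    metrics.record_first_chunk_yielded()
    assert len(first_chunk_histogram.samples) == 1
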
nucliadb/search/search/predict_proxy.py
@@ -28,6 +28,7 @@ from multidict import CIMultiDictProxy
 from nuclia_models.predict.generative_responses import (
     GenerativeChunk,
     JSONGenerativeResponse,
+    ReasoningGenerativeResponse,
     StatusGenerativeResponse,
     TextGenerativeResponse,
 )
@@ -87,6 +88,7 @@ async def predict_proxy(
     predict_headers = predict.get_predict_headers(kbid)
     user_headers = {k: v for k, v in headers.items() if k.capitalize() in ALLOWED_HEADERS}
 
+    metrics = AskMetrics()
     # Proxy the request to predict API
     predict_response = await predict.make_request(
         method=method,
@@ -109,7 +111,8 @@ async def predict_proxy(
             client_type=client_type,
             origin=origin,
             user_query=user_query,
-            is_json="json" in (media_type or ""),
+            is_ndjson_stream="json" in (media_type or ""),
+            metrics=metrics,
         )
     else:
         streaming_generator = predict_response.content.iter_any()
@@ -120,7 +123,6 @@ async def predict_proxy(
             media_type=media_type,
         )
     else:
-        metrics = AskMetrics()
         with metrics.time(PREDICT_ANSWER_METRIC):
             content = await predict_response.read()
@@ -140,8 +142,10 @@ async def predict_proxy(
             client_type=client_type,
             origin=origin,
             text_answer=content,
+            text_reasoning=None,
             generative_answer_time=metrics[PREDICT_ANSWER_METRIC],
             generative_answer_first_chunk_time=None,
+            generative_reasoning_first_chunk_time=None,
             status_code=AnswerStatusCode(str(llm_status_code)),
         )
@@ -170,26 +174,35 @@ async def chat_streaming_generator(
     client_type: NucliaDBClientType,
     origin: str,
     user_query: str,
-    is_json: bool,
+    is_ndjson_stream: bool,
+    metrics: AskMetrics,
 ):
     first = True
+    first_reasoning = True
     status_code = AnswerStatusCode.ERROR.value
     text_answer = ""
+    text_reasoning = ""
     json_object = None
-    metrics = AskMetrics()
     with metrics.time(PREDICT_ANSWER_METRIC):
         async for chunk in predict_response.content:
-            if first:
-                metrics.record_first_chunk_yielded()
-                first = False
-
             yield chunk
-
-            if is_json:
+            if is_ndjson_stream:
                 try:
                     parsed_chunk = GenerativeChunk.model_validate_json(chunk).chunk
+                    if first and isinstance(
+                        parsed_chunk,
+                        (TextGenerativeResponse, JSONGenerativeResponse, StatusGenerativeResponse),
+                    ):
+                        metrics.record_first_chunk_yielded()
+                        first = False
+
                     if isinstance(parsed_chunk, TextGenerativeResponse):
                         text_answer += parsed_chunk.text
+                    elif isinstance(parsed_chunk, ReasoningGenerativeResponse):
+                        if first_reasoning:
+                            metrics.record_first_reasoning_chunk_yielded()
+                            first_reasoning = False
+                        text_reasoning += parsed_chunk.text
                     elif isinstance(parsed_chunk, JSONGenerativeResponse):
                         json_object = parsed_chunk.object
                     elif isinstance(parsed_chunk, StatusGenerativeResponse):
@@ -201,8 +214,11 @@ async def chat_streaming_generator(
                     )
             else:
                 text_answer += chunk.decode()
+                if first:
+                    metrics.record_first_chunk_yielded()
+                    first = False
 
-    if is_json is False and chunk:  # Ensure chunk is not empty before decoding
+    if is_ndjson_stream is False and chunk:  # Ensure chunk is not empty before decoding
         # If response is text the status_code comes at the last chunk of data
         last_chunk = chunk.decode()
         if last_chunk[-1] == "0":
@@ -218,8 +234,10 @@ async def chat_streaming_generator(
         client_type=client_type,
         origin=origin,
         text_answer=text_answer.encode() if json_object is None else json.dumps(json_object).encode(),
+        text_reasoning=text_reasoning if text_reasoning else None,
         generative_answer_time=metrics[PREDICT_ANSWER_METRIC],
         generative_answer_first_chunk_time=metrics.get_first_chunk_time(),
+        generative_reasoning_first_chunk_time=metrics.get_first_reasoning_chunk_time(),
         status_code=AnswerStatusCode(status_code),
     )
 
@@ -232,8 +250,10 @@ def audit_predict_proxy_endpoint(
     client_type: NucliaDBClientType,
     origin: str,
     text_answer: bytes,
+    text_reasoning: Optional[str],
     generative_answer_time: float,
     generative_answer_first_chunk_time: Optional[float],
+    generative_reasoning_first_chunk_time: Optional[float],
     status_code: AnswerStatusCode,
 ):
     maybe_audit_chat(
  maybe_audit_chat(
@@ -250,8 +270,10 @@ def audit_predict_proxy_endpoint(
250
270
  query_context_order={},
251
271
  model=headers.get(NUCLIA_LEARNING_MODEL_HEADER),
252
272
  text_answer=text_answer,
273
+ text_reasoning=text_reasoning,
253
274
  generative_answer_time=generative_answer_time,
254
275
  generative_answer_first_chunk_time=generative_answer_first_chunk_time or 0,
276
+ generative_reasoning_first_chunk_time=generative_reasoning_first_chunk_time,
255
277
  rephrase_time=None,
256
278
  status_code=status_code,
257
279
  )
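
Taken together, these changes split stream accounting in two: answer text and reasoning text accumulate separately, and each records its own first-chunk latency before being passed to the audit call. A hedged, self-contained sketch of that NDJSON consumption loop, with dict-based chunks standing in for the typed generative responses from nuclia_models:

    import json

    def consume_ndjson_stream(lines: list[bytes]) -> tuple[str, str]:
        text_answer = ""
        text_reasoning = ""
        first = True
        first_reasoning = True
        for raw in lines:
            chunk = json.loads(raw)  # one JSON object per streamed line
            if chunk["type"] == "text":
                if first:
                    ...  # here the real code records first answer chunk latency
                    first = False
                text_answer += chunk["text"]
            elif chunk["type"] == "reasoning":
                if first_reasoning:
                    ...  # here the real code records first reasoning chunk latency
                    first_reasoning = False
                text_reasoning += chunk["text"]
        return text_answer, text_reasoning

    answer, reasoning = consume_ndjson_stream(
        [b'{"type": "reasoning", "text": "thinking..."}', b'{"type": "text", "text": "42"}']
    )
    assert (answer, reasoning) == ("42", "thinking...")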