nucliadb 6.5.0.post4426__py3-none-any.whl → 6.5.0.post4484__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0037_backfill_catalog_facets.py +74 -0
- migrations/pg/0001_bootstrap.py +1 -1
- migrations/pg/0008_catalog_facets.py +43 -0
- migrations/pg/0009_extract_facets_safety.py +26 -0
- nucliadb/ingest/fields/base.py +18 -8
- nucliadb/ingest/orm/processor/pgcatalog.py +26 -0
- nucliadb/ingest/orm/resource.py +9 -4
- nucliadb/search/api/v1/catalog.py +14 -8
- nucliadb/search/search/chat/ask.py +12 -1
- nucliadb/search/search/chat/prompt.py +260 -201
- nucliadb/search/search/pgcatalog.py +174 -63
- {nucliadb-6.5.0.post4426.dist-info → nucliadb-6.5.0.post4484.dist-info}/METADATA +8 -8
- {nucliadb-6.5.0.post4426.dist-info → nucliadb-6.5.0.post4484.dist-info}/RECORD +16 -13
- {nucliadb-6.5.0.post4426.dist-info → nucliadb-6.5.0.post4484.dist-info}/WHEEL +0 -0
- {nucliadb-6.5.0.post4426.dist-info → nucliadb-6.5.0.post4484.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.5.0.post4426.dist-info → nucliadb-6.5.0.post4484.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,74 @@
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
+
#
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
+
#
|
6
|
+
# AGPL:
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
10
|
+
# License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU Affero General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
#
|
20
|
+
|
21
|
+
"""Migration #37
|
22
|
+
|
23
|
+
Backfill catalog facets
|
24
|
+
|
25
|
+
"""
|
26
|
+
|
27
|
+
import logging
|
28
|
+
from typing import cast
|
29
|
+
|
30
|
+
from nucliadb.common.maindb.pg import PGDriver, PGTransaction
|
31
|
+
from nucliadb.migrator.context import ExecutionContext
|
32
|
+
|
33
|
+
logger = logging.getLogger(__name__)
|
34
|
+
|
35
|
+
|
36
|
+
async def migrate(context: ExecutionContext) -> None:
    """Backfill ``catalog_facets`` from the labels already stored in ``catalog``.

    The catalog is walked in ``(kbid, rid)`` order with keyset pagination,
    ``BATCH_SIZE`` resources per statement. For every page, the SQL function
    ``extract_facets(labels)`` is unnested into one ``catalog_facets`` row per
    facet.

    The key for the next page is taken from the page of catalog rows itself,
    not from the rows that were inserted: a page whose resources produce no
    facets inserts nothing, and deriving the cursor from the inserted rows
    would end the backfill early and leave every later resource without
    facets.
    """
    driver = cast(PGDriver, context.kv_driver)

    BATCH_SIZE = 1_000
    async with driver.transaction() as txn:
        txn = cast(PGTransaction, txn)
        # All-zero keys sort before any other kbid/rid, so the first page starts at the beginning
        start_kbid = "00000000000000000000000000000000"
        start_rid = "00000000000000000000000000000000"
        while True:
            async with txn.connection.cursor() as cur:
                logger.info(f"Filling {BATCH_SIZE} catalog facets from {start_kbid}, {start_rid}")
                # `page` is the batch of catalog rows; `ins` is a data-modifying CTE,
                # which PostgreSQL runs to completion even though the final SELECT
                # never reads from it.
                await cur.execute(
                    """
                    WITH page AS (
                        SELECT kbid, rid, labels FROM catalog
                        WHERE (kbid = %(kbid)s AND rid > %(rid)s) OR kbid > %(kbid)s
                        ORDER BY kbid, rid
                        LIMIT %(batch)s
                    ),
                    ins AS (
                        INSERT INTO catalog_facets (kbid, rid, facet)
                        SELECT kbid, rid, unnest(extract_facets(labels)) FROM page
                    )
                    SELECT kbid, rid FROM page ORDER BY kbid DESC, rid DESC LIMIT 1;
                    """,
                    {"kbid": start_kbid, "rid": start_rid, "batch": BATCH_SIZE},
                )

                # Set the key for next iteration; an empty page means the catalog is exhausted
                results = await cur.fetchone()  # type: ignore
                if results is None:
                    break
                (start_kbid, start_rid) = results

            # NOTE(review): indentation was not recoverable from the diff this was
            # reviewed in; the commit is placed once per batch here. Confirm against
            # the original file whether it should instead run once after the loop.
            await txn.commit()
|
72
|
+
|
73
|
+
|
74
|
+
# Intentionally a no-op: this migration has no per-knowledge-box step. The
# backfill in migrate() walks the whole catalog table in one pass.
async def migrate_kb(context: ExecutionContext, kbid: str) -> None: ...
|
migrations/pg/0001_bootstrap.py
CHANGED
@@ -26,7 +26,7 @@ async def migrate(txn: PGTransaction) -> None:
|
|
26
26
|
# IF NOT EXISTS just for compatibility with older install predating the migration system
|
27
27
|
await cur.execute("""
|
28
28
|
CREATE TABLE IF NOT EXISTS resources (
|
29
|
-
key TEXT PRIMARY KEY,
|
29
|
+
key TEXT COLLATE ucs_basic PRIMARY KEY,
|
30
30
|
value BYTEA
|
31
31
|
);
|
32
32
|
""")
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
+
#
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
+
#
|
6
|
+
# AGPL:
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
10
|
+
# License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU Affero General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
#
|
20
|
+
|
21
|
+
from nucliadb.common.maindb.pg import PGTransaction
|
22
|
+
|
23
|
+
|
24
|
+
async def migrate(txn: PGTransaction) -> None:
    # Creates the catalog_facets table (one row per resource facet, removed
    # together with its catalog row through the cascading foreign key) and the
    # two indexes that support it. Everything is sent as a single statement batch.
    create_catalog_facets = """
            CREATE TABLE catalog_facets (
                id BIGINT PRIMARY KEY GENERATED ALWAYS AS IDENTITY,
                kbid UUID,
                rid UUID,
                facet TEXT COLLATE ucs_basic,

                FOREIGN KEY (kbid, rid) REFERENCES catalog (kbid, rid) ON DELETE CASCADE
            );

            -- For FK checks
            CREATE INDEX ON catalog_facets(kbid, rid);

            -- Best for per-facet aggregation, also used by search with facet filter
            CREATE INDEX ON catalog_facets(kbid, facet);
            """
    async with txn.connection.cursor() as cursor:
        await cursor.execute(create_catalog_facets)
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
+
#
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
+
#
|
6
|
+
# AGPL:
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
10
|
+
# License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU Affero General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
#
|
20
|
+
|
21
|
+
from nucliadb.common.maindb.pg import PGTransaction
|
22
|
+
|
23
|
+
|
24
|
+
async def migrate(txn: PGTransaction) -> None:
    # Flags the extract_facets(text[]) SQL function as PARALLEL SAFE.
    mark_parallel_safe = "ALTER FUNCTION extract_facets(text[]) PARALLEL SAFE;"
    async with txn.connection.cursor() as cursor:
        await cursor.execute(mark_parallel_safe)
|
nucliadb/ingest/fields/base.py
CHANGED
@@ -19,8 +19,10 @@
|
|
19
19
|
#
|
20
20
|
from __future__ import annotations
|
21
21
|
|
22
|
+
import asyncio
|
22
23
|
import enum
|
23
24
|
import logging
|
25
|
+
from collections import defaultdict
|
24
26
|
from datetime import datetime
|
25
27
|
from typing import TYPE_CHECKING, Any, Generic, Optional, Type, TypeVar
|
26
28
|
|
@@ -113,6 +115,8 @@ class Field(Generic[PbType]):
|
|
113
115
|
raise InvalidPBClass(self.__class__, pb.__class__)
|
114
116
|
self.value = pb
|
115
117
|
|
118
|
+
self.locks: dict[str, asyncio.Lock] = defaultdict(asyncio.Lock)
|
119
|
+
|
116
120
|
@property
|
117
121
|
def kbid(self) -> str:
|
118
122
|
return self.resource.kb.kbid
|
@@ -364,10 +368,13 @@ class Field(Generic[PbType]):
|
|
364
368
|
|
365
369
|
async def get_extracted_text(self, force=False) -> Optional[ExtractedText]:
    """Return the extracted text of this field, downloading it from storage if needed.

    The value cached in ``self.extracted_text`` is returned as is when it is
    already set, unless ``force`` is true, in which case it is downloaded again.
    The cached value is only replaced when the download yields a payload, so
    the result may be ``None`` when nothing is stored.
    """
    # Unlocked check first, so callers that can use the cached value never wait on the lock
    if self.extracted_text is None or force:
        async with self.locks["extracted_text"]:
            # Value could have been fetched while waiting for the lock
            if self.extracted_text is None or force:
                sf = self.get_storage_field(FieldTypes.FIELD_TEXT)
                payload = await self.storage.download_pb(sf, ExtractedText)
                if payload is not None:
                    self.extracted_text = payload
    return self.extracted_text
|
372
379
|
|
373
380
|
async def set_vectors(
|
@@ -499,10 +506,13 @@ class Field(Generic[PbType]):
|
|
499
506
|
|
500
507
|
async def get_field_metadata(self, force: bool = False) -> Optional[FieldComputedMetadata]:
    """Return the computed metadata of this field, downloading it from storage if needed.

    The value cached in ``self.computed_metadata`` is returned as is when it is
    already set, unless ``force`` is true, in which case it is downloaded again.
    The cached value is only replaced when the download yields a payload, so
    the result may be ``None`` when nothing is stored.
    """
    # Unlocked check first, so callers that can use the cached value never wait on the lock
    if self.computed_metadata is None or force:
        async with self.locks["field_metadata"]:
            # Value could have been fetched while waiting for the lock
            if self.computed_metadata is None or force:
                sf = self.get_storage_field(FieldTypes.FIELD_METADATA)
                payload = await self.storage.download_pb(sf, FieldComputedMetadata)
                if payload is not None:
                    self.computed_metadata = payload
    return self.computed_metadata
|
507
517
|
|
508
518
|
async def set_large_field_metadata(self, payload: LargeComputedMetadataWrapper):
|
@@ -40,6 +40,17 @@ def pgcatalog_enabled(kbid):
|
|
40
40
|
return isinstance(get_driver(), PGDriver)
|
41
41
|
|
42
42
|
|
43
|
+
def extract_facets(labels):
    """Expand labels into the set of all their hierarchical prefixes.

    Each label is split on "/" and its first segment is discarded (for a label
    such as "/a/b/c" that is the empty string before the leading slash). Every
    cumulative prefix of the remaining segments becomes a facet, so "/a/b/c"
    contributes "/a", "/a/b" and "/a/b/c".
    """
    prefixes = set()
    for label in labels:
        segments = label.split("/")[1:]
        for depth in range(1, len(segments) + 1):
            prefixes.add("/" + "/".join(segments[:depth]))
    return prefixes
|
52
|
+
|
53
|
+
|
43
54
|
@observer.wrap({"type": "update"})
|
44
55
|
async def pgcatalog_update(txn: Transaction, kbid: str, resource: Resource, index_message: IndexMessage):
|
45
56
|
if not pgcatalog_enabled(kbid):
|
@@ -76,6 +87,21 @@ async def pgcatalog_update(txn: Transaction, kbid: str, resource: Resource, inde
|
|
76
87
|
"slug": resource.basic.slug,
|
77
88
|
},
|
78
89
|
)
|
90
|
+
await cur.execute(
|
91
|
+
"DELETE FROM catalog_facets WHERE kbid = %(kbid)s AND rid = %(rid)s",
|
92
|
+
{
|
93
|
+
"kbid": resource.kb.kbid,
|
94
|
+
"rid": resource.uuid,
|
95
|
+
},
|
96
|
+
)
|
97
|
+
await cur.execute(
|
98
|
+
"INSERT INTO catalog_facets (kbid, rid, facet) SELECT %(kbid)s AS kbid, %(rid)s AS rid, unnest(%(facets)s::text[]) AS facet",
|
99
|
+
{
|
100
|
+
"kbid": resource.kb.kbid,
|
101
|
+
"rid": resource.uuid,
|
102
|
+
"facets": list(extract_facets(index_message.labels)),
|
103
|
+
},
|
104
|
+
)
|
79
105
|
|
80
106
|
|
81
107
|
@observer.wrap({"type": "delete"})
|
nucliadb/ingest/orm/resource.py
CHANGED
@@ -19,6 +19,7 @@
|
|
19
19
|
#
|
20
20
|
from __future__ import annotations
|
21
21
|
|
22
|
+
import asyncio
|
22
23
|
import logging
|
23
24
|
from collections import defaultdict
|
24
25
|
from concurrent.futures import ThreadPoolExecutor
|
@@ -126,6 +127,7 @@ class Resource:
|
|
126
127
|
self.disable_vectors = disable_vectors
|
127
128
|
self._previous_status: Optional[Metadata.Status.ValueType] = None
|
128
129
|
self.user_relations: Optional[PBRelations] = None
|
130
|
+
self.locks: dict[str, asyncio.Lock] = defaultdict(asyncio.Lock)
|
129
131
|
|
130
132
|
async def set_slug(self):
|
131
133
|
basic = await self.get_basic()
|
@@ -306,10 +308,13 @@ class Resource:
|
|
306
308
|
async def get_field(self, key: str, type: FieldType.ValueType, load: bool = True):
    """Return the field object for ``(type, key)``, creating and caching it on first use.

    Field objects are cached in ``self.fields``. When the field is not cached
    yet, it is built from ``KB_FIELDS[type]`` and, if ``load`` is true, its
    value is fetched before the object is stored in the cache.
    """
    field = (type, key)
    # Unlocked check first, so lookups of already-cached fields never wait on the lock
    if field not in self.fields:
        async with self.locks["field"]:
            # Field could have been fetched while waiting for the lock
            if field not in self.fields:
                field_obj: Field = KB_FIELDS[type](id=key, resource=self)
                if load:
                    await field_obj.get_value()
                # Stored only after the optional load has completed
                self.fields[field] = field_obj
    return self.fields[field]
|
314
319
|
|
315
320
|
async def set_field(self, type: FieldType.ValueType, key: str, payload: Any):
|
@@ -27,15 +27,13 @@ from pydantic import ValidationError
|
|
27
27
|
|
28
28
|
from nucliadb.common.datamanagers.exceptions import KnowledgeBoxNotFound
|
29
29
|
from nucliadb.common.exceptions import InvalidQueryError
|
30
|
-
from nucliadb.common.maindb.pg import PGDriver
|
31
|
-
from nucliadb.common.maindb.utils import get_driver
|
32
30
|
from nucliadb.models.responses import HTTPClientError
|
33
31
|
from nucliadb.search import logger
|
34
32
|
from nucliadb.search.api.v1.router import KB_PREFIX, api
|
35
33
|
from nucliadb.search.api.v1.utils import fastapi_query
|
36
34
|
from nucliadb.search.search import cache
|
37
35
|
from nucliadb.search.search.merge import fetch_resources
|
38
|
-
from nucliadb.search.search.pgcatalog import pgcatalog_search
|
36
|
+
from nucliadb.search.search.pgcatalog import pgcatalog_facets, pgcatalog_search
|
39
37
|
from nucliadb.search.search.query_parser.parsers import parse_catalog
|
40
38
|
from nucliadb.search.search.utils import (
|
41
39
|
maybe_log_request_payload,
|
@@ -45,6 +43,7 @@ from nucliadb_models.filters import CatalogFilterExpression
|
|
45
43
|
from nucliadb_models.metadata import ResourceProcessingStatus
|
46
44
|
from nucliadb_models.resource import NucliaDBRoles
|
47
45
|
from nucliadb_models.search import (
|
46
|
+
CatalogFacetsRequest,
|
48
47
|
CatalogRequest,
|
49
48
|
CatalogResponse,
|
50
49
|
KnowledgeboxSearchResults,
|
@@ -157,9 +156,6 @@ async def catalog(
|
|
157
156
|
returns bm25 results on titles and it does not support vector search.
|
158
157
|
It is useful for listing resources in a knowledge box.
|
159
158
|
"""
|
160
|
-
if not pgcatalog_enabled(): # pragma: no cover
|
161
|
-
return HTTPClientError(status_code=501, detail="PG driver is needed for catalog search")
|
162
|
-
|
163
159
|
maybe_log_request_payload(kbid, "/catalog", item)
|
164
160
|
start_time = time()
|
165
161
|
try:
|
@@ -196,5 +192,15 @@ async def catalog(
|
|
196
192
|
)
|
197
193
|
|
198
194
|
|
199
|
-
|
200
|
-
|
195
|
+
@api.post(
    f"/{KB_PREFIX}/{{kbid}}/catalog/facets",
    status_code=200,
    response_model=dict[str, int],
    response_model_exclude_unset=True,
    tags=["Search"],
    include_in_schema=False,
)
@requires(NucliaDBRoles.READER)
@version(1)
async def catalog_facets(request: Request, kbid: str, item: CatalogFacetsRequest) -> dict[str, int]:
    # Thin HTTP wrapper: the request body is handed to pgcatalog_facets for the
    # given knowledge box and its str -> int mapping is returned unchanged
    # (presumably facet -> number of resources; confirm in pgcatalog_facets).
    facet_counts = await pgcatalog_facets(kbid, item)
    return facet_counts
|
@@ -78,6 +78,8 @@ from nucliadb_models.search import (
|
|
78
78
|
AskRetrievalMatch,
|
79
79
|
AskTimings,
|
80
80
|
AskTokens,
|
81
|
+
AugmentedContext,
|
82
|
+
AugmentedContextResponseItem,
|
81
83
|
ChatModel,
|
82
84
|
ChatOptions,
|
83
85
|
CitationsAskResponseItem,
|
@@ -143,6 +145,7 @@ class AskResult:
|
|
143
145
|
metrics: AskMetrics,
|
144
146
|
best_matches: list[RetrievalMatch],
|
145
147
|
debug_chat_model: Optional[ChatModel],
|
148
|
+
augmented_context: AugmentedContext,
|
146
149
|
):
|
147
150
|
# Initial attributes
|
148
151
|
self.kbid = kbid
|
@@ -157,6 +160,7 @@ class AskResult:
|
|
157
160
|
self.auditor: ChatAuditor = auditor
|
158
161
|
self.metrics: AskMetrics = metrics
|
159
162
|
self.best_matches: list[RetrievalMatch] = best_matches
|
163
|
+
self.augmented_context = augmented_context
|
160
164
|
|
161
165
|
# Computed from the predict chat answer stream
|
162
166
|
self._answer_text = ""
|
@@ -272,9 +276,13 @@ class AskResult:
|
|
272
276
|
status_code=self.status_code,
|
273
277
|
)
|
274
278
|
|
279
|
+
yield AugmentedContextResponseItem(augmented=self.augmented_context)
|
280
|
+
|
275
281
|
# Stream out the citations
|
276
282
|
if self._citations is not None:
|
277
|
-
yield CitationsAskResponseItem(
|
283
|
+
yield CitationsAskResponseItem(
|
284
|
+
citations=self._citations.citations,
|
285
|
+
)
|
278
286
|
|
279
287
|
# Stream out generic metadata about the answer
|
280
288
|
if self._metadata is not None:
|
@@ -366,6 +374,7 @@ class AskResult:
|
|
366
374
|
citations=citations,
|
367
375
|
metadata=metadata,
|
368
376
|
learning_id=self.nuclia_learning_id or "",
|
377
|
+
augmented_context=self.augmented_context,
|
369
378
|
)
|
370
379
|
if self.status_code == AnswerStatusCode.ERROR and self.status_error_details:
|
371
380
|
response.error_details = self.status_error_details
|
@@ -569,6 +578,7 @@ async def ask(
|
|
569
578
|
prompt_context,
|
570
579
|
prompt_context_order,
|
571
580
|
prompt_context_images,
|
581
|
+
augmented_context,
|
572
582
|
) = await prompt_context_builder.build()
|
573
583
|
|
574
584
|
# Make the chat request to the predict API
|
@@ -631,6 +641,7 @@ async def ask(
|
|
631
641
|
metrics=metrics,
|
632
642
|
best_matches=retrieval_results.best_matches,
|
633
643
|
debug_chat_model=chat_model,
|
644
|
+
augmented_context=augmented_context,
|
634
645
|
)
|
635
646
|
|
636
647
|
|