nucliadb 6.5.0.post4426__py3-none-any.whl → 6.5.0.post4484__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,74 @@
+ # Copyright (C) 2021 Bosutech XXI S.L.
+ #
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
+ # For commercial licensing, contact us at info@nuclia.com.
+ #
+ # AGPL:
+ # This program is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU Affero General Public License as
+ # published by the Free Software Foundation, either version 3 of the
+ # License, or (at your option) any later version.
+ #
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU Affero General Public License for more details.
+ #
+ # You should have received a copy of the GNU Affero General Public License
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
+ #
+
+ """Migration #37
+
+ Backfill catalog facets
+
+ """
+
+ import logging
+ from typing import cast
+
+ from nucliadb.common.maindb.pg import PGDriver, PGTransaction
+ from nucliadb.migrator.context import ExecutionContext
+
+ logger = logging.getLogger(__name__)
+
+
+ async def migrate(context: ExecutionContext) -> None:
+     driver = cast(PGDriver, context.kv_driver)
+
+     BATCH_SIZE = 1_000
+     async with driver.transaction() as txn:
+         txn = cast(PGTransaction, txn)
+         start_kbid = "00000000000000000000000000000000"
+         start_rid = "00000000000000000000000000000000"
+         while True:
+             async with txn.connection.cursor() as cur:
+                 logger.info(f"Filling {BATCH_SIZE} catalog facets from {start_kbid}, {start_rid}")
+                 # Get a batch of facets from the catalog table
+                 await cur.execute(
+                     """
+                     WITH i AS (
+                         INSERT INTO catalog_facets (kbid, rid, facet)
+                         SELECT kbid, rid, unnest(extract_facets(labels)) FROM (
+                             SELECT * FROM catalog
+                             WHERE (kbid = %(kbid)s AND rid > %(rid)s) OR kbid > %(kbid)s
+                             ORDER BY kbid, rid
+                             LIMIT %(batch)s
+                         ) rs
+                         RETURNING kbid, rid
+                     )
+                     SELECT kbid, rid FROM i ORDER BY kbid DESC, rid DESC LIMIT 1;
+                     """,
+                     {"kbid": start_kbid, "rid": start_rid, "batch": BATCH_SIZE},
+                 )
+
+                 # Set the key for next iteration
+                 results = await cur.fetchone()  # type: ignore
+                 if results is None:
+                     break
+                 (start_kbid, start_rid) = results
+
+             await txn.commit()
+
+
+ async def migrate_kb(context: ExecutionContext, kbid: str) -> None: ...
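
Note on the batched backfill above: the predicate `(kbid = %(kbid)s AND rid > %(rid)s) OR kbid > %(kbid)s` is the expanded form of the tuple comparison `(kbid, rid) > (start_kbid, start_rid)`, so each iteration resumes right after the last row it inserted and stops once a batch comes back empty. A minimal, illustrative Python sketch of the same keyset-pagination loop (not part of the package):

    def paginate(rows, batch_size):
        """rows: iterable of (kbid, rid) pairs; yields batches in keyset order."""
        ordered = sorted(rows)
        cursor = ("0" * 32, "0" * 32)  # same all-zeros sentinel the migration starts from
        while True:
            batch = [r for r in ordered if r > cursor][:batch_size]  # tuple comparison == the SQL predicate
            if not batch:
                break  # mirrors fetchone() returning None
            yield batch
            cursor = batch[-1]  # resume after the last processed (kbid, rid)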
@@ -26,7 +26,7 @@ async def migrate(txn: PGTransaction) -> None:
          # IF NOT EXISTS just for compatibility with older install predating the migration system
          await cur.execute("""
              CREATE TABLE IF NOT EXISTS resources (
-                 key TEXT PRIMARY KEY,
+                 key TEXT COLLATE ucs_basic PRIMARY KEY,
                  value BYTEA
              );
          """)
@@ -0,0 +1,43 @@
+ # Copyright (C) 2021 Bosutech XXI S.L.
+ #
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
+ # For commercial licensing, contact us at info@nuclia.com.
+ #
+ # AGPL:
+ # This program is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU Affero General Public License as
+ # published by the Free Software Foundation, either version 3 of the
+ # License, or (at your option) any later version.
+ #
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU Affero General Public License for more details.
+ #
+ # You should have received a copy of the GNU Affero General Public License
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
+ #
+
+ from nucliadb.common.maindb.pg import PGTransaction
+
+
+ async def migrate(txn: PGTransaction) -> None:
+     async with txn.connection.cursor() as cur:
+         await cur.execute(
+             """
+             CREATE TABLE catalog_facets (
+                 id BIGINT PRIMARY KEY GENERATED ALWAYS AS IDENTITY,
+                 kbid UUID,
+                 rid UUID,
+                 facet TEXT COLLATE ucs_basic,
+
+                 FOREIGN KEY (kbid, rid) REFERENCES catalog (kbid, rid) ON DELETE CASCADE
+             );
+
+             -- For FK checks
+             CREATE INDEX ON catalog_facets(kbid, rid);
+
+             -- Best for per-facet aggregation, also used by search with facet filter
+             CREATE INDEX ON catalog_facets(kbid, facet);
+             """
+         )
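
The second index, on `(kbid, facet)`, is the one the comment singles out for per-facet aggregation. A sketch of the kind of query it is meant to serve, written as a hypothetical helper over a psycopg-style async cursor (not part of the package):

    async def facet_counts(cur, kbid: str) -> dict[str, int]:
        # Count rows per facet within one knowledge box; the (kbid, facet)
        # index lets Postgres satisfy both the filter and the grouping.
        await cur.execute(
            """
            SELECT facet, COUNT(*)
            FROM catalog_facets
            WHERE kbid = %(kbid)s
            GROUP BY facet
            """,
            {"kbid": kbid},
        )
        return {facet: count for facet, count in await cur.fetchall()}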
@@ -0,0 +1,26 @@
+ # Copyright (C) 2021 Bosutech XXI S.L.
+ #
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
+ # For commercial licensing, contact us at info@nuclia.com.
+ #
+ # AGPL:
+ # This program is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU Affero General Public License as
+ # published by the Free Software Foundation, either version 3 of the
+ # License, or (at your option) any later version.
+ #
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU Affero General Public License for more details.
+ #
+ # You should have received a copy of the GNU Affero General Public License
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
+ #
+
+ from nucliadb.common.maindb.pg import PGTransaction
+
+
+ async def migrate(txn: PGTransaction) -> None:
+     async with txn.connection.cursor() as cur:
+         await cur.execute("ALTER FUNCTION extract_facets(text[]) PARALLEL SAFE;")
@@ -19,8 +19,10 @@
  #
  from __future__ import annotations

+ import asyncio
  import enum
  import logging
+ from collections import defaultdict
  from datetime import datetime
  from typing import TYPE_CHECKING, Any, Generic, Optional, Type, TypeVar

@@ -113,6 +115,8 @@ class Field(Generic[PbType]):
                  raise InvalidPBClass(self.__class__, pb.__class__)
              self.value = pb

+         self.locks: dict[str, asyncio.Lock] = defaultdict(asyncio.Lock)
+
      @property
      def kbid(self) -> str:
          return self.resource.kb.kbid
@@ -364,10 +368,13 @@

      async def get_extracted_text(self, force=False) -> Optional[ExtractedText]:
          if self.extracted_text is None or force:
-             sf = self.get_storage_field(FieldTypes.FIELD_TEXT)
-             payload = await self.storage.download_pb(sf, ExtractedText)
-             if payload is not None:
-                 self.extracted_text = payload
+             async with self.locks["extracted_text"]:
+                 # Value could have been fetched while waiting for the lock
+                 if self.extracted_text is None or force:
+                     sf = self.get_storage_field(FieldTypes.FIELD_TEXT)
+                     payload = await self.storage.download_pb(sf, ExtractedText)
+                     if payload is not None:
+                         self.extracted_text = payload
          return self.extracted_text

      async def set_vectors(
@@ -499,10 +506,13 @@

      async def get_field_metadata(self, force: bool = False) -> Optional[FieldComputedMetadata]:
          if self.computed_metadata is None or force:
-             sf = self.get_storage_field(FieldTypes.FIELD_METADATA)
-             payload = await self.storage.download_pb(sf, FieldComputedMetadata)
-             if payload is not None:
-                 self.computed_metadata = payload
+             async with self.locks["field_metadata"]:
+                 # Value could have been fetched while waiting for the lock
+                 if self.computed_metadata is None or force:
+                     sf = self.get_storage_field(FieldTypes.FIELD_METADATA)
+                     payload = await self.storage.download_pb(sf, FieldComputedMetadata)
+                     if payload is not None:
+                         self.computed_metadata = payload
          return self.computed_metadata

      async def set_large_field_metadata(self, payload: LargeComputedMetadataWrapper):
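
Both getters above now follow the same per-key, double-checked locking pattern: check the cached value, take the key's lock, re-check, and only then download. A generic, self-contained sketch of that pattern (hypothetical names, not nucliadb code):

    import asyncio
    from collections import defaultdict

    class PerKeyCache:
        def __init__(self):
            self._values: dict[str, object] = {}
            self._locks: dict[str, asyncio.Lock] = defaultdict(asyncio.Lock)

        async def get(self, key, fetch):
            if key not in self._values:          # fast path: no lock once cached
                async with self._locks[key]:     # serialize concurrent fetches of the same key
                    if key not in self._values:  # re-check: a waiter may have filled it
                        self._values[key] = await fetch()
            return self._values[key]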
@@ -40,6 +40,17 @@ def pgcatalog_enabled(kbid):
      return isinstance(get_driver(), PGDriver)


+ def extract_facets(labels):
+     facets = set()
+     for label in labels:
+         parts = label.split("/")
+         facet = ""
+         for part in parts[1:]:
+             facet += f"/{part}"
+             facets.add(facet)
+     return facets
+
+
  @observer.wrap({"type": "update"})
  async def pgcatalog_update(txn: Transaction, kbid: str, resource: Resource, index_message: IndexMessage):
      if not pgcatalog_enabled(kbid):
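
For reference, `extract_facets` expands each hierarchical label into the set of all its prefixes (the empty segment before the leading slash is skipped). With made-up labels:

    >>> sorted(extract_facets(["/l/colors/red", "/n/s/PROCESSED"]))
    ['/l', '/l/colors', '/l/colors/red', '/n', '/n/s', '/n/s/PROCESSED']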
@@ -76,6 +87,21 @@ async def pgcatalog_update(txn: Transaction, kbid: str, resource: Resource, index_message: IndexMessage):
                  "slug": resource.basic.slug,
              },
          )
+         await cur.execute(
+             "DELETE FROM catalog_facets WHERE kbid = %(kbid)s AND rid = %(rid)s",
+             {
+                 "kbid": resource.kb.kbid,
+                 "rid": resource.uuid,
+             },
+         )
+         await cur.execute(
+             "INSERT INTO catalog_facets (kbid, rid, facet) SELECT %(kbid)s AS kbid, %(rid)s AS rid, unnest(%(facets)s::text[]) AS facet",
+             {
+                 "kbid": resource.kb.kbid,
+                 "rid": resource.uuid,
+                 "facets": list(extract_facets(index_message.labels)),
+             },
+         )


  @observer.wrap({"type": "delete"})
@@ -19,6 +19,7 @@
  #
  from __future__ import annotations

+ import asyncio
  import logging
  from collections import defaultdict
  from concurrent.futures import ThreadPoolExecutor
@@ -126,6 +127,7 @@ class Resource:
          self.disable_vectors = disable_vectors
          self._previous_status: Optional[Metadata.Status.ValueType] = None
          self.user_relations: Optional[PBRelations] = None
+         self.locks: dict[str, asyncio.Lock] = defaultdict(asyncio.Lock)

      async def set_slug(self):
          basic = await self.get_basic()
@@ -306,10 +308,13 @@
      async def get_field(self, key: str, type: FieldType.ValueType, load: bool = True):
          field = (type, key)
          if field not in self.fields:
-             field_obj: Field = KB_FIELDS[type](id=key, resource=self)
-             if load:
-                 await field_obj.get_value()
-             self.fields[field] = field_obj
+             async with self.locks["field"]:
+                 # Field could have been fetched while waiting for the lock
+                 if field not in self.fields:
+                     field_obj: Field = KB_FIELDS[type](id=key, resource=self)
+                     if load:
+                         await field_obj.get_value()
+                     self.fields[field] = field_obj
          return self.fields[field]

      async def set_field(self, type: FieldType.ValueType, key: str, payload: Any):
@@ -27,15 +27,13 @@ from pydantic import ValidationError

  from nucliadb.common.datamanagers.exceptions import KnowledgeBoxNotFound
  from nucliadb.common.exceptions import InvalidQueryError
- from nucliadb.common.maindb.pg import PGDriver
- from nucliadb.common.maindb.utils import get_driver
  from nucliadb.models.responses import HTTPClientError
  from nucliadb.search import logger
  from nucliadb.search.api.v1.router import KB_PREFIX, api
  from nucliadb.search.api.v1.utils import fastapi_query
  from nucliadb.search.search import cache
  from nucliadb.search.search.merge import fetch_resources
- from nucliadb.search.search.pgcatalog import pgcatalog_search
+ from nucliadb.search.search.pgcatalog import pgcatalog_facets, pgcatalog_search
  from nucliadb.search.search.query_parser.parsers import parse_catalog
  from nucliadb.search.search.utils import (
      maybe_log_request_payload,
@@ -45,6 +43,7 @@ from nucliadb_models.filters import CatalogFilterExpression
  from nucliadb_models.metadata import ResourceProcessingStatus
  from nucliadb_models.resource import NucliaDBRoles
  from nucliadb_models.search import (
+     CatalogFacetsRequest,
      CatalogRequest,
      CatalogResponse,
      KnowledgeboxSearchResults,
@@ -157,9 +156,6 @@ async def catalog(
      returns bm25 results on titles and it does not support vector search.
      It is useful for listing resources in a knowledge box.
      """
-     if not pgcatalog_enabled():  # pragma: no cover
-         return HTTPClientError(status_code=501, detail="PG driver is needed for catalog search")
-
      maybe_log_request_payload(kbid, "/catalog", item)
      start_time = time()
      try:
@@ -196,5 +192,15 @@
          )


- def pgcatalog_enabled():
-     return isinstance(get_driver(), PGDriver)
+ @api.post(
+     f"/{KB_PREFIX}/{{kbid}}/catalog/facets",
+     status_code=200,
+     response_model=dict[str, int],
+     response_model_exclude_unset=True,
+     tags=["Search"],
+     include_in_schema=False,
+ )
+ @requires(NucliaDBRoles.READER)
+ @version(1)
+ async def catalog_facets(request: Request, kbid: str, item: CatalogFacetsRequest) -> dict[str, int]:
+     return await pgcatalog_facets(kbid, item)
@@ -78,6 +78,8 @@ from nucliadb_models.search import (
      AskRetrievalMatch,
      AskTimings,
      AskTokens,
+     AugmentedContext,
+     AugmentedContextResponseItem,
      ChatModel,
      ChatOptions,
      CitationsAskResponseItem,
@@ -143,6 +145,7 @@ class AskResult:
          metrics: AskMetrics,
          best_matches: list[RetrievalMatch],
          debug_chat_model: Optional[ChatModel],
+         augmented_context: AugmentedContext,
      ):
          # Initial attributes
          self.kbid = kbid
@@ -157,6 +160,7 @@
          self.auditor: ChatAuditor = auditor
          self.metrics: AskMetrics = metrics
          self.best_matches: list[RetrievalMatch] = best_matches
+         self.augmented_context = augmented_context

          # Computed from the predict chat answer stream
          self._answer_text = ""
@@ -272,9 +276,13 @@
              status_code=self.status_code,
          )

+         yield AugmentedContextResponseItem(augmented=self.augmented_context)
+
          # Stream out the citations
          if self._citations is not None:
-             yield CitationsAskResponseItem(citations=self._citations.citations)
+             yield CitationsAskResponseItem(
+                 citations=self._citations.citations,
+             )

          # Stream out generic metadata about the answer
          if self._metadata is not None:
@@ -366,6 +374,7 @@
              citations=citations,
              metadata=metadata,
              learning_id=self.nuclia_learning_id or "",
+             augmented_context=self.augmented_context,
          )
          if self.status_code == AnswerStatusCode.ERROR and self.status_error_details:
              response.error_details = self.status_error_details
@@ -569,6 +578,7 @@ async def ask(
          prompt_context,
          prompt_context_order,
          prompt_context_images,
+         augmented_context,
      ) = await prompt_context_builder.build()

      # Make the chat request to the predict API
@@ -631,6 +641,7 @@
          metrics=metrics,
          best_matches=retrieval_results.best_matches,
          debug_chat_model=chat_model,
+         augmented_context=augmented_context,
      )
