nucliadb 6.7.2.post4886__py3-none-any.whl → 6.7.2.post4908__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nucliadb might be problematic.

Files changed (29):
  1. migrations/0023_backfill_pg_catalog.py +6 -2
  2. migrations/0028_extracted_vectors_reference.py +1 -1
  3. migrations/0038_backfill_catalog_field_labels.py +6 -2
  4. nucliadb/common/catalog/__init__.py +79 -0
  5. nucliadb/common/catalog/dummy.py +36 -0
  6. nucliadb/common/catalog/interface.py +85 -0
  7. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +294 -208
  8. nucliadb/common/catalog/utils.py +56 -0
  9. nucliadb/common/filter_expression.py +1 -1
  10. nucliadb/common/http_clients/exceptions.py +8 -0
  11. nucliadb/common/http_clients/processing.py +4 -0
  12. nucliadb/common/http_clients/utils.py +3 -0
  13. nucliadb/common/vector_index_config.py +1 -1
  14. nucliadb/ingest/consumer/pull.py +7 -0
  15. nucliadb/ingest/orm/brain_v2.py +1 -1
  16. nucliadb/ingest/orm/processor/processor.py +4 -5
  17. nucliadb/ingest/settings.py +8 -0
  18. nucliadb/search/api/v1/catalog.py +4 -4
  19. nucliadb/search/search/query_parser/models.py +1 -29
  20. nucliadb/search/search/query_parser/parsers/ask.py +1 -1
  21. nucliadb/search/search/query_parser/parsers/catalog.py +2 -5
  22. nucliadb/search/search/query_parser/parsers/graph.py +5 -5
  23. nucliadb/search/search/rerankers.py +1 -1
  24. {nucliadb-6.7.2.post4886.dist-info → nucliadb-6.7.2.post4908.dist-info}/METADATA +6 -6
  25. {nucliadb-6.7.2.post4886.dist-info → nucliadb-6.7.2.post4908.dist-info}/RECORD +28 -25
  26. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  27. {nucliadb-6.7.2.post4886.dist-info → nucliadb-6.7.2.post4908.dist-info}/WHEEL +0 -0
  28. {nucliadb-6.7.2.post4886.dist-info → nucliadb-6.7.2.post4908.dist-info}/entry_points.txt +0 -0
  29. {nucliadb-6.7.2.post4886.dist-info → nucliadb-6.7.2.post4908.dist-info}/top_level.txt +0 -0
@@ -28,9 +28,10 @@ import logging
28
28
  from typing import cast
29
29
 
30
30
  from nucliadb.common import datamanagers
31
+ from nucliadb.common.catalog import catalog_update, get_catalog
32
+ from nucliadb.common.catalog.pg import PGCatalog
31
33
  from nucliadb.common.maindb.pg import PGDriver, PGTransaction
32
34
  from nucliadb.ingest.orm.index_message import get_resource_index_message
33
- from nucliadb.ingest.orm.processor.pgcatalog import pgcatalog_update
34
35
  from nucliadb.migrator.context import ExecutionContext
35
36
 
36
37
  logger = logging.getLogger(__name__)
@@ -43,6 +44,9 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
43
44
  if not isinstance(context.kv_driver, PGDriver):
44
45
  return
45
46
 
47
+ if not isinstance(get_catalog(), PGCatalog):
48
+ return
49
+
46
50
  BATCH_SIZE = 100
47
51
  async with context.kv_driver.rw_transaction() as txn:
48
52
  txn = cast(PGTransaction, txn)
@@ -75,7 +79,7 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
75
79
  continue
76
80
 
77
81
  index_message = await get_resource_index_message(resource, reindex=False)
78
- await pgcatalog_update(txn, kbid, resource, index_message)
82
+ await catalog_update(txn, kbid, resource, index_message)
79
83
 
80
84
  await txn.commit()
81
85
  continue_sql = f"AND key > '/kbs/{kbid}/r/{rid}'"
@@ -39,7 +39,7 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
39
39
  async with datamanagers.with_rw_transaction() as txn:
40
40
  vectorsets = [vs async for (_vid, vs) in datamanagers.vectorsets.iter(txn, kbid=kbid)]
41
41
 
42
- if len(vectorsets) == 0: # pragma: nocover
42
+ if len(vectorsets) == 0: # pragma: no cover
43
43
  # should never happen, everyone should have at least one
44
44
  logger.warning(f"KB has no vectorsets!", extra={"kbid": kbid})
45
45
  return
@@ -28,9 +28,10 @@ import logging
28
28
  from typing import cast
29
29
 
30
30
  from nucliadb.common import datamanagers
31
+ from nucliadb.common.catalog import catalog_update, get_catalog
32
+ from nucliadb.common.catalog.pg import PGCatalog
31
33
  from nucliadb.common.maindb.pg import PGDriver, PGTransaction
32
34
  from nucliadb.ingest.orm.index_message import get_resource_index_message
33
- from nucliadb.ingest.orm.processor.pgcatalog import pgcatalog_update
34
35
  from nucliadb.migrator.context import ExecutionContext
35
36
  from nucliadb_protos import resources_pb2
36
37
 
@@ -44,6 +45,9 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
44
45
  if not isinstance(context.kv_driver, PGDriver):
45
46
  return
46
47
 
48
+ if not isinstance(get_catalog(), PGCatalog):
49
+ return
50
+
47
51
  BATCH_SIZE = 100
48
52
  async with context.kv_driver.rw_transaction() as txn:
49
53
  txn = cast(PGTransaction, txn)
@@ -84,7 +88,7 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
84
88
  continue
85
89
 
86
90
  index_message = await get_resource_index_message(resource, reindex=False)
87
- await pgcatalog_update(txn, kbid, resource, index_message)
91
+ await catalog_update(txn, kbid, resource, index_message)
88
92
 
89
93
  if to_index:
90
94
  await txn.commit()
@@ -0,0 +1,79 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ # Copyright (C) 2021 Bosutech XXI S.L.
21
+ #
22
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
23
+ # For commercial licensing, contact us at info@nuclia.com.
24
+ #
25
+ # AGPL:
26
+ # This program is free software: you can redistribute it and/or modify
27
+ # it under the terms of the GNU Affero General Public License as
28
+ # published by the Free Software Foundation, either version 3 of the
29
+ # License, or (at your option) any later version.
30
+ #
31
+ # This program is distributed in the hope that it will be useful,
32
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
33
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
34
+ # GNU Affero General Public License for more details.
35
+ #
36
+ # You should have received a copy of the GNU Affero General Public License
37
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
38
+
39
+ from nidx_protos.noderesources_pb2 import Resource as IndexMessage
40
+
41
+ from nucliadb.common.catalog.dummy import DummyCatalog
42
+ from nucliadb.common.catalog.interface import Catalog, CatalogQuery
43
+ from nucliadb.common.catalog.pg import PGCatalog
44
+ from nucliadb.common.catalog.utils import build_catalog_resource_data
45
+ from nucliadb.common.maindb.driver import Transaction
46
+ from nucliadb.ingest.orm.resource import Resource
47
+ from nucliadb.ingest.settings import CatalogConfig, settings
48
+ from nucliadb_models.search import CatalogFacetsRequest, Resources
49
+ from nucliadb_utils.exceptions import ConfigurationError
50
+
51
+
52
def get_catalog() -> Catalog:
    """Build the catalog implementation selected by ``settings.catalog``.

    A new instance is created on every call.

    Returns:
        A ``DummyCatalog`` when the setting is ``CatalogConfig.UNSET``, or a
        ``PGCatalog`` when it is ``CatalogConfig.PG``.

    Raises:
        ConfigurationError: if the setting holds any other value.
    """
    if settings.catalog == CatalogConfig.UNSET:
        return DummyCatalog()
    if settings.catalog == CatalogConfig.PG:
        return PGCatalog()
    raise ConfigurationError(f"Unknown catalog configuration: {settings.catalog}")
59
+
60
+
61
async def catalog_update(txn: Transaction, kbid: str, resource: Resource, index_message: IndexMessage):
    """Update the configured catalog with the data of one resource.

    The catalog payload is built from ``resource`` and ``index_message`` with
    ``build_catalog_resource_data`` and passed to the catalog's ``update``
    together with ``txn``, ``kbid`` and the resource uuid.
    """
    backend = get_catalog()
    payload = build_catalog_resource_data(resource, index_message)
    await backend.update(txn, kbid, resource.uuid, payload)
65
+
66
+
67
async def catalog_delete(txn: Transaction, kbid: str, rid: str):
    """Forward a delete of resource ``rid`` in ``kbid`` to the configured catalog, using ``txn``."""
    await get_catalog().delete(txn, kbid, rid)
70
+
71
+
72
async def catalog_search(query: CatalogQuery) -> Resources:
    """Run ``query`` against the configured catalog and return its ``Resources`` result."""
    return await get_catalog().search(query)
75
+
76
+
77
async def catalog_facets(kbid: str, request: CatalogFacetsRequest) -> dict[str, int]:
    """Ask the configured catalog for the facets of ``kbid`` described by ``request``.

    Returns the mapping produced by the catalog's ``facets`` method unchanged.
    """
    return await get_catalog().facets(kbid, request)
@@ -0,0 +1,36 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ from nucliadb.common.catalog.interface import Catalog, CatalogQuery, CatalogResourceData
21
+ from nucliadb.common.maindb.driver import Transaction
22
+ from nucliadb_models.search import CatalogFacetsRequest, Resources
23
+
24
+
25
class DummyCatalog(Catalog):
    """Catalog implementation that stores nothing and always answers with empty results.

    ``get_catalog`` is not visible from this module; NOTE(review): presumably
    this is the backend used when no catalog is configured -- confirm against
    the factory.
    """

    async def update(self, txn: Transaction, kbid: str, rid: str, data: CatalogResourceData):
        """Accept the update and do nothing."""

    async def delete(self, txn: Transaction, kbid: str, rid: str):
        """Accept the delete and do nothing."""

    async def search(self, query: CatalogQuery) -> Resources:
        """Return an empty ``Resources`` (no results, ``min_score`` of 0.0) for any query."""
        empty = Resources(results=[], min_score=0.0)
        return empty

    async def facets(self, kbid: str, request: CatalogFacetsRequest) -> dict[str, int]:
        """Return an empty facet mapping for any request."""
        return {}
@@ -0,0 +1,85 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ from __future__ import annotations
21
+
22
+ import abc
23
+ import datetime
24
+ from dataclasses import dataclass
25
+ from typing import Literal, Optional, Union
26
+
27
+ from pydantic import BaseModel, Field
28
+
29
+ from nucliadb.common.maindb.driver import Transaction
30
+ from nucliadb_models import search as search_models
31
+ from nucliadb_models.search import CatalogFacetsRequest, Resources
32
+
33
+
34
class CatalogResourceData(BaseModel):
    """
    Data extracted from a resource to be indexed in the catalog
    """

    # Every field is declared without a default value.
    title: str = Field(description="Resource title")
    created_at: datetime.datetime = Field(description="Resource creation date")
    modified_at: datetime.datetime = Field(description="Resource last modification date")
    labels: list[str] = Field(
        description=(
            "Resource labels. This includes labels at the resource level "
            "and all classification labels of its fields"
        )
    )
    slug: str = Field(description="Resource slug")
46
+
47
+
48
@dataclass
class CatalogExpression:
    """Node of a catalog filter expression tree.

    Every attribute defaults to ``None``. The ``bool_*`` attributes hold nested
    expressions; ``date``, ``facet`` and ``resource_id`` hold leaf conditions.
    NOTE(review): presumably exactly one attribute is set per node -- this is
    not enforced here, confirm against the code that builds and consumes it.
    """

    @dataclass
    class Date:
        """Date condition on one of the resource timestamps.

        All three attributes must be supplied; ``since`` and ``until`` accept
        ``None``.
        """

        field: Union[Literal["created_at"], Literal["modified_at"]]
        since: Optional[datetime.datetime]
        until: Optional[datetime.datetime]

    # Boolean combinators over nested expressions.
    bool_and: Optional[list["CatalogExpression"]] = None
    bool_or: Optional[list["CatalogExpression"]] = None
    bool_not: Optional["CatalogExpression"] = None

    # Leaf conditions.
    date: Optional[Date] = None
    facet: Optional[str] = None
    resource_id: Optional[str] = None
62
+
63
+
64
class CatalogQuery(BaseModel):
    """Parameters of a catalog search, as passed to ``Catalog.search``."""

    kbid: str
    # NOTE(review): ``query`` and ``filters`` accept None but declare no default;
    # under pydantic v2 that makes them required arguments -- confirm intended.
    query: Optional[search_models.CatalogQuery] = Field(description="Full-text search query")
    filters: Optional[CatalogExpression] = Field(description="Filters to apply to the search")
    sort: search_models.SortOptions = Field(description="Sorting option")
    faceted: list[str] = Field(description="List of facets to compute during the search")
    # The page size limit is only stated in the description; no constraint is declared on the field.
    page_size: int = Field(description="Used for pagination. Maximum page size is 100")
    page_number: int = Field(description="Used for pagination. First page is 0")
72
+
73
+
74
class Catalog(abc.ABC):
    """Abstract interface that every catalog backend implements.

    ``abc.ABC`` already uses ``abc.ABCMeta`` as its metaclass, so no explicit
    ``metaclass=`` argument is needed. Subclasses must override all four
    coroutines before they can be instantiated.
    """

    @abc.abstractmethod
    async def update(self, txn: Transaction, kbid: str, rid: str, data: CatalogResourceData):
        """Update the catalog entry of resource ``rid`` in ``kbid`` with ``data``, using ``txn``."""
        ...

    @abc.abstractmethod
    async def delete(self, txn: Transaction, kbid: str, rid: str):
        """Delete resource ``rid`` of ``kbid`` from the catalog, using ``txn``."""
        ...

    @abc.abstractmethod
    async def search(self, query: CatalogQuery) -> Resources:
        """Run ``query`` against the catalog and return the matching ``Resources``."""
        ...

    @abc.abstractmethod
    async def facets(self, kbid: str, request: CatalogFacetsRequest) -> dict[str, int]:
        """Return a string-to-int mapping for ``request`` on ``kbid``.

        NOTE(review): presumably facet name -> number of resources; confirm
        against the implementations.
        """
        ...