nucliadb 6.7.2.post4889__py3-none-any.whl → 6.7.2.post4908__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nucliadb might be problematic. Click here for more details.
- migrations/0023_backfill_pg_catalog.py +6 -2
- migrations/0028_extracted_vectors_reference.py +1 -1
- migrations/0038_backfill_catalog_field_labels.py +6 -2
- nucliadb/common/catalog/__init__.py +79 -0
- nucliadb/common/catalog/dummy.py +36 -0
- nucliadb/common/catalog/interface.py +85 -0
- nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +294 -208
- nucliadb/common/catalog/utils.py +56 -0
- nucliadb/common/filter_expression.py +1 -1
- nucliadb/common/vector_index_config.py +1 -1
- nucliadb/ingest/orm/brain_v2.py +1 -1
- nucliadb/ingest/orm/processor/processor.py +4 -5
- nucliadb/ingest/settings.py +8 -0
- nucliadb/search/api/v1/catalog.py +4 -4
- nucliadb/search/search/query_parser/models.py +1 -29
- nucliadb/search/search/query_parser/parsers/ask.py +1 -1
- nucliadb/search/search/query_parser/parsers/catalog.py +2 -5
- nucliadb/search/search/query_parser/parsers/graph.py +5 -5
- nucliadb/search/search/rerankers.py +1 -1
- {nucliadb-6.7.2.post4889.dist-info → nucliadb-6.7.2.post4908.dist-info}/METADATA +6 -6
- {nucliadb-6.7.2.post4889.dist-info → nucliadb-6.7.2.post4908.dist-info}/RECORD +24 -21
- nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
- {nucliadb-6.7.2.post4889.dist-info → nucliadb-6.7.2.post4908.dist-info}/WHEEL +0 -0
- {nucliadb-6.7.2.post4889.dist-info → nucliadb-6.7.2.post4908.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.7.2.post4889.dist-info → nucliadb-6.7.2.post4908.dist-info}/top_level.txt +0 -0
|
@@ -28,9 +28,10 @@ import logging
|
|
|
28
28
|
from typing import cast
|
|
29
29
|
|
|
30
30
|
from nucliadb.common import datamanagers
|
|
31
|
+
from nucliadb.common.catalog import catalog_update, get_catalog
|
|
32
|
+
from nucliadb.common.catalog.pg import PGCatalog
|
|
31
33
|
from nucliadb.common.maindb.pg import PGDriver, PGTransaction
|
|
32
34
|
from nucliadb.ingest.orm.index_message import get_resource_index_message
|
|
33
|
-
from nucliadb.ingest.orm.processor.pgcatalog import pgcatalog_update
|
|
34
35
|
from nucliadb.migrator.context import ExecutionContext
|
|
35
36
|
|
|
36
37
|
logger = logging.getLogger(__name__)
|
|
@@ -43,6 +44,9 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
|
|
|
43
44
|
if not isinstance(context.kv_driver, PGDriver):
|
|
44
45
|
return
|
|
45
46
|
|
|
47
|
+
if not isinstance(get_catalog(), PGCatalog):
|
|
48
|
+
return
|
|
49
|
+
|
|
46
50
|
BATCH_SIZE = 100
|
|
47
51
|
async with context.kv_driver.rw_transaction() as txn:
|
|
48
52
|
txn = cast(PGTransaction, txn)
|
|
@@ -75,7 +79,7 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
|
|
|
75
79
|
continue
|
|
76
80
|
|
|
77
81
|
index_message = await get_resource_index_message(resource, reindex=False)
|
|
78
|
-
await pgcatalog_update(txn, kbid, resource, index_message)
|
|
82
|
+
await catalog_update(txn, kbid, resource, index_message)
|
|
79
83
|
|
|
80
84
|
await txn.commit()
|
|
81
85
|
continue_sql = f"AND key > '/kbs/{kbid}/r/{rid}'"
|
|
@@ -39,7 +39,7 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
|
|
|
39
39
|
async with datamanagers.with_rw_transaction() as txn:
|
|
40
40
|
vectorsets = [vs async for (_vid, vs) in datamanagers.vectorsets.iter(txn, kbid=kbid)]
|
|
41
41
|
|
|
42
|
-
if len(vectorsets) == 0: # pragma:
|
|
42
|
+
if len(vectorsets) == 0: # pragma: no cover
|
|
43
43
|
# should never happen, everyone should have at least one
|
|
44
44
|
logger.warning(f"KB has no vectorsets!", extra={"kbid": kbid})
|
|
45
45
|
return
|
|
@@ -28,9 +28,10 @@ import logging
|
|
|
28
28
|
from typing import cast
|
|
29
29
|
|
|
30
30
|
from nucliadb.common import datamanagers
|
|
31
|
+
from nucliadb.common.catalog import catalog_update, get_catalog
|
|
32
|
+
from nucliadb.common.catalog.pg import PGCatalog
|
|
31
33
|
from nucliadb.common.maindb.pg import PGDriver, PGTransaction
|
|
32
34
|
from nucliadb.ingest.orm.index_message import get_resource_index_message
|
|
33
|
-
from nucliadb.ingest.orm.processor.pgcatalog import pgcatalog_update
|
|
34
35
|
from nucliadb.migrator.context import ExecutionContext
|
|
35
36
|
from nucliadb_protos import resources_pb2
|
|
36
37
|
|
|
@@ -44,6 +45,9 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
|
|
|
44
45
|
if not isinstance(context.kv_driver, PGDriver):
|
|
45
46
|
return
|
|
46
47
|
|
|
48
|
+
if not isinstance(get_catalog(), PGCatalog):
|
|
49
|
+
return
|
|
50
|
+
|
|
47
51
|
BATCH_SIZE = 100
|
|
48
52
|
async with context.kv_driver.rw_transaction() as txn:
|
|
49
53
|
txn = cast(PGTransaction, txn)
|
|
@@ -84,7 +88,7 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
|
|
|
84
88
|
continue
|
|
85
89
|
|
|
86
90
|
index_message = await get_resource_index_message(resource, reindex=False)
|
|
87
|
-
await pgcatalog_update(txn, kbid, resource, index_message)
|
|
91
|
+
await catalog_update(txn, kbid, resource, index_message)
|
|
88
92
|
|
|
89
93
|
if to_index:
|
|
90
94
|
await txn.commit()
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
|
2
|
+
#
|
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
|
5
|
+
#
|
|
6
|
+
# AGPL:
|
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
|
10
|
+
# License, or (at your option) any later version.
|
|
11
|
+
#
|
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15
|
+
# GNU Affero General Public License for more details.
|
|
16
|
+
#
|
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
|
+
#
|
|
20
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
|
21
|
+
#
|
|
22
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
|
23
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
|
24
|
+
#
|
|
25
|
+
# AGPL:
|
|
26
|
+
# This program is free software: you can redistribute it and/or modify
|
|
27
|
+
# it under the terms of the GNU Affero General Public License as
|
|
28
|
+
# published by the Free Software Foundation, either version 3 of the
|
|
29
|
+
# License, or (at your option) any later version.
|
|
30
|
+
#
|
|
31
|
+
# This program is distributed in the hope that it will be useful,
|
|
32
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
33
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
34
|
+
# GNU Affero General Public License for more details.
|
|
35
|
+
#
|
|
36
|
+
# You should have received a copy of the GNU Affero General Public License
|
|
37
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
38
|
+
|
|
39
|
+
from nidx_protos.noderesources_pb2 import Resource as IndexMessage
|
|
40
|
+
|
|
41
|
+
from nucliadb.common.catalog.dummy import DummyCatalog
|
|
42
|
+
from nucliadb.common.catalog.interface import Catalog, CatalogQuery
|
|
43
|
+
from nucliadb.common.catalog.pg import PGCatalog
|
|
44
|
+
from nucliadb.common.catalog.utils import build_catalog_resource_data
|
|
45
|
+
from nucliadb.common.maindb.driver import Transaction
|
|
46
|
+
from nucliadb.ingest.orm.resource import Resource
|
|
47
|
+
from nucliadb.ingest.settings import CatalogConfig, settings
|
|
48
|
+
from nucliadb_models.search import CatalogFacetsRequest, Resources
|
|
49
|
+
from nucliadb_utils.exceptions import ConfigurationError
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def get_catalog() -> Catalog:
    """
    Build the catalog backend selected by ``settings.catalog``.

    A new backend object is created on every call: ``DummyCatalog`` when the
    setting equals ``CatalogConfig.UNSET`` and ``PGCatalog`` when it equals
    ``CatalogConfig.PG``.

    Raises:
        ConfigurationError: if the setting holds any other value.
    """
    # Catalog not configured: fall back to the no-op implementation
    if settings.catalog == CatalogConfig.UNSET:
        return DummyCatalog()

    if settings.catalog == CatalogConfig.PG:
        return PGCatalog()

    raise ConfigurationError(f"Unknown catalog configuration: {settings.catalog}")
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
async def catalog_update(txn: Transaction, kbid: str, resource: Resource, index_message: IndexMessage):
    """
    Update the catalog entry of a resource through the configured backend.

    The catalog data is built from ``resource`` and ``index_message`` with
    ``build_catalog_resource_data`` and handed to the backend's ``update``
    together with ``txn``, ``kbid`` and ``resource.uuid``.
    """
    # The backend is resolved first, so a misconfigured catalog fails before
    # any resource data is built
    backend = get_catalog()
    payload = build_catalog_resource_data(resource, index_message)
    await backend.update(txn, kbid, resource.uuid, payload)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
async def catalog_delete(txn: Transaction, kbid: str, rid: str):
    """
    Ask the configured catalog backend to delete resource ``rid`` of ``kbid``.

    The call is forwarded to the backend's ``delete`` with the given
    transaction.
    """
    await get_catalog().delete(txn, kbid, rid)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
async def catalog_search(query: CatalogQuery) -> Resources:
    """
    Run ``query`` against the configured catalog backend and return its result.
    """
    return await get_catalog().search(query)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
async def catalog_facets(kbid: str, request: CatalogFacetsRequest) -> dict[str, int]:
    """
    Forward a facets request for ``kbid`` to the configured catalog backend.

    Returns whatever mapping of ``str`` to ``int`` the backend's ``facets``
    produces.
    """
    return await get_catalog().facets(kbid, request)
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
|
2
|
+
#
|
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
|
5
|
+
#
|
|
6
|
+
# AGPL:
|
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
|
10
|
+
# License, or (at your option) any later version.
|
|
11
|
+
#
|
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15
|
+
# GNU Affero General Public License for more details.
|
|
16
|
+
#
|
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
|
+
#
|
|
20
|
+
from nucliadb.common.catalog.interface import Catalog, CatalogQuery, CatalogResourceData
|
|
21
|
+
from nucliadb.common.maindb.driver import Transaction
|
|
22
|
+
from nucliadb_models.search import CatalogFacetsRequest, Resources
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class DummyCatalog(Catalog):
    """
    Catalog implementation that performs no work.

    Writes are ignored and reads always answer with empty results.
    """

    async def update(self, txn: Transaction, kbid: str, rid: str, data: CatalogResourceData):
        # Nothing is stored
        return None

    async def delete(self, txn: Transaction, kbid: str, rid: str):
        # Nothing was stored, so there is nothing to remove
        return None

    async def search(self, query: CatalogQuery) -> Resources:
        # Always an empty result set, whatever the query is
        empty_results = Resources(results=[], min_score=0.0)
        return empty_results

    async def facets(self, kbid: str, request: CatalogFacetsRequest) -> dict[str, int]:
        # No facet counts are ever available
        return dict()
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
|
2
|
+
#
|
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
|
5
|
+
#
|
|
6
|
+
# AGPL:
|
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
|
10
|
+
# License, or (at your option) any later version.
|
|
11
|
+
#
|
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15
|
+
# GNU Affero General Public License for more details.
|
|
16
|
+
#
|
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
|
+
#
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import abc
|
|
23
|
+
import datetime
|
|
24
|
+
from dataclasses import dataclass
|
|
25
|
+
from typing import Literal, Optional, Union
|
|
26
|
+
|
|
27
|
+
from pydantic import BaseModel, Field
|
|
28
|
+
|
|
29
|
+
from nucliadb.common.maindb.driver import Transaction
|
|
30
|
+
from nucliadb_models import search as search_models
|
|
31
|
+
from nucliadb_models.search import CatalogFacetsRequest, Resources
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class CatalogResourceData(BaseModel):
    """
    Data extracted from a resource to be indexed in the catalog.

    This is the payload type accepted by ``Catalog.update``. Every field is
    declared without a default value.
    """

    # Descriptive attributes
    title: str = Field(description="Resource title")

    # Timestamps
    created_at: datetime.datetime = Field(description="Resource creation date")
    modified_at: datetime.datetime = Field(description="Resource last modification date")

    # Resource-level labels and field classification labels, in a single list
    labels: list[str] = Field(
        description="Resource labels. This includes labels at the resource level and all classification labels of its fields"
    )

    slug: str = Field(description="Resource slug")
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
class CatalogExpression:
    """
    Node of a catalog filter expression.

    A node can hold boolean combinators over other expressions (``bool_and``,
    ``bool_or``, ``bool_not``) and/or leaf conditions (``date``, ``facet``,
    ``resource_id``). Every attribute defaults to ``None``.

    NOTE(review): presumably exactly one attribute is set per node; that is
    not enforced here, confirm against the code that builds and consumes
    these expressions.
    """

    @dataclass
    class Date:
        """
        Date condition over one of the two resource date fields.

        ``since`` and ``until`` are both mandatory constructor arguments but
        either may be ``None``.
        """

        # Which resource date the condition applies to
        field: Literal["created_at", "modified_at"]
        # Range bounds; None presumably means "unbounded" on that side (confirm with consumer)
        since: Optional[datetime.datetime]
        until: Optional[datetime.datetime]

    # Boolean combinators over nested expressions
    bool_and: Optional[list["CatalogExpression"]] = None
    bool_or: Optional[list["CatalogExpression"]] = None
    bool_not: Optional["CatalogExpression"] = None

    # Leaf conditions
    date: Optional[Date] = None
    facet: Optional[str] = None
    resource_id: Optional[str] = None
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class CatalogQuery(BaseModel):
    """
    Parameters of a catalog search, as accepted by ``Catalog.search``.
    """

    kbid: str

    # What to look for
    query: Optional[search_models.CatalogQuery] = Field(description="Full-text search query")
    filters: Optional[CatalogExpression] = Field(description="Filters to apply to the search")

    # How results are ordered and which facets are computed
    sort: search_models.SortOptions = Field(description="Sorting option")
    faceted: list[str] = Field(description="List of facets to compute during the search")

    # Pagination. NOTE(review): the page size limit mentioned in the
    # description is not validated by this model
    page_size: int = Field(description="Used for pagination. Maximum page size is 100")
    page_number: int = Field(description="Used for pagination. First page is 0")
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class Catalog(abc.ABC):
    """
    Interface that every catalog backend implements.

    All four operations are abstract coroutines; a subclass must override each
    of them before it can be instantiated.
    """

    @abc.abstractmethod
    async def update(self, txn: Transaction, kbid: str, rid: str, data: CatalogResourceData):
        """
        Update the catalog entry of resource ``rid`` in ``kbid`` with ``data``,
        using the given transaction.
        """

    @abc.abstractmethod
    async def delete(self, txn: Transaction, kbid: str, rid: str):
        """
        Delete resource ``rid`` of ``kbid`` from the catalog, using the given
        transaction.
        """

    @abc.abstractmethod
    async def search(self, query: CatalogQuery) -> Resources:
        """
        Run a catalog search and return the matching resources.
        """

    @abc.abstractmethod
    async def facets(self, kbid: str, request: CatalogFacetsRequest) -> dict[str, int]:
        """
        Answer a facets request for ``kbid`` as a mapping of ``str`` to ``int``.
        """
|