nucliadb 6.7.2.post4862__py3-none-any.whl → 6.9.2.post5282__py3-none-any.whl
This diff shows the changes between publicly released package versions as they appear in their public registries; it is provided for informational purposes only.
Potentially problematic release.
This version of nucliadb might be problematic.
- migrations/0016_upgrade_to_paragraphs_v2.py +1 -1
- migrations/0017_multiple_writable_shards.py +1 -1
- migrations/0018_purge_orphan_kbslugs.py +1 -1
- migrations/0019_upgrade_to_paragraphs_v3.py +1 -1
- migrations/0021_overwrite_vectorsets_key.py +1 -1
- migrations/0023_backfill_pg_catalog.py +7 -3
- migrations/0025_assign_models_to_kbs_v2.py +3 -3
- migrations/0027_rollover_texts3.py +1 -1
- migrations/0028_extracted_vectors_reference.py +1 -1
- migrations/0029_backfill_field_status.py +1 -1
- migrations/0032_remove_old_relations.py +1 -1
- migrations/0036_backfill_catalog_slug.py +1 -1
- migrations/0037_backfill_catalog_facets.py +1 -1
- migrations/0038_backfill_catalog_field_labels.py +7 -3
- migrations/0039_backfill_converation_splits_metadata.py +106 -0
- migrations/0040_migrate_search_configurations.py +79 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/backups/create.py +3 -3
- nucliadb/backups/restore.py +3 -3
- nucliadb/common/cache.py +1 -1
- nucliadb/common/catalog/__init__.py +79 -0
- nucliadb/common/catalog/dummy.py +36 -0
- nucliadb/common/catalog/interface.py +85 -0
- nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +294 -208
- nucliadb/common/catalog/utils.py +56 -0
- nucliadb/common/cluster/manager.py +3 -19
- nucliadb/common/cluster/rebalance.py +484 -110
- nucliadb/common/cluster/rollover.py +29 -0
- nucliadb/common/cluster/settings.py +1 -1
- nucliadb/common/cluster/utils.py +26 -0
- nucliadb/common/datamanagers/atomic.py +6 -0
- nucliadb/common/datamanagers/utils.py +2 -2
- nucliadb/common/external_index_providers/manager.py +1 -29
- nucliadb/common/external_index_providers/settings.py +1 -27
- nucliadb/common/filter_expression.py +16 -33
- nucliadb/common/http_clients/exceptions.py +8 -0
- nucliadb/common/http_clients/processing.py +4 -0
- nucliadb/common/http_clients/utils.py +3 -0
- nucliadb/common/ids.py +77 -55
- nucliadb/common/locking.py +4 -4
- nucliadb/common/maindb/driver.py +11 -1
- nucliadb/common/maindb/local.py +1 -1
- nucliadb/common/maindb/pg.py +1 -1
- nucliadb/common/nidx.py +19 -1
- nucliadb/common/vector_index_config.py +1 -1
- nucliadb/export_import/datamanager.py +3 -3
- nucliadb/ingest/consumer/pull.py +7 -0
- nucliadb/ingest/consumer/service.py +2 -27
- nucliadb/ingest/consumer/shard_creator.py +17 -6
- nucliadb/ingest/fields/base.py +9 -17
- nucliadb/ingest/fields/conversation.py +47 -1
- nucliadb/ingest/orm/brain_v2.py +21 -3
- nucliadb/ingest/orm/index_message.py +126 -111
- nucliadb/ingest/orm/knowledgebox.py +84 -43
- nucliadb/ingest/orm/processor/auditing.py +1 -1
- nucliadb/ingest/orm/processor/processor.py +95 -149
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +10 -1
- nucliadb/ingest/partitions.py +12 -1
- nucliadb/ingest/serialize.py +2 -2
- nucliadb/ingest/service/writer.py +26 -19
- nucliadb/ingest/settings.py +33 -11
- nucliadb/learning_proxy.py +12 -15
- nucliadb/metrics_exporter.py +17 -4
- nucliadb/migrator/datamanager.py +11 -17
- nucliadb/migrator/migrator.py +2 -2
- nucliadb/purge/__init__.py +12 -17
- nucliadb/purge/orphan_shards.py +2 -2
- nucliadb/reader/api/v1/knowledgebox.py +40 -12
- nucliadb/reader/api/v1/learning_config.py +30 -10
- nucliadb/reader/api/v1/resource.py +2 -2
- nucliadb/reader/api/v1/services.py +1 -1
- nucliadb/reader/reader/notifications.py +1 -1
- nucliadb/search/api/v1/__init__.py +1 -0
- nucliadb/search/api/v1/catalog.py +4 -4
- nucliadb/search/api/v1/find.py +1 -4
- nucliadb/search/api/v1/hydrate.py +328 -0
- nucliadb/search/api/v1/resource/ask.py +21 -1
- nucliadb/search/api/v1/search.py +1 -4
- nucliadb/search/predict.py +9 -2
- nucliadb/search/search/cache.py +1 -20
- nucliadb/search/search/chat/ask.py +50 -8
- nucliadb/search/search/chat/prompt.py +47 -15
- nucliadb/search/search/chat/query.py +8 -1
- nucliadb/search/search/fetch.py +1 -1
- nucliadb/search/search/find.py +1 -6
- nucliadb/search/search/{hydrator.py → hydrator/__init__.py} +5 -4
- nucliadb/search/search/hydrator/fields.py +175 -0
- nucliadb/search/search/hydrator/images.py +130 -0
- nucliadb/search/search/hydrator/paragraphs.py +307 -0
- nucliadb/search/search/hydrator/resources.py +56 -0
- nucliadb/search/search/metrics.py +16 -0
- nucliadb/search/search/predict_proxy.py +33 -11
- nucliadb/search/search/query.py +0 -23
- nucliadb/search/search/query_parser/fetcher.py +5 -5
- nucliadb/search/search/query_parser/models.py +1 -30
- nucliadb/search/search/query_parser/parsers/ask.py +1 -1
- nucliadb/search/search/query_parser/parsers/catalog.py +4 -7
- nucliadb/search/search/query_parser/parsers/common.py +16 -7
- nucliadb/search/search/query_parser/parsers/find.py +0 -11
- nucliadb/search/search/query_parser/parsers/graph.py +5 -5
- nucliadb/search/search/query_parser/parsers/search.py +0 -11
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +4 -11
- nucliadb/search/search/rerankers.py +1 -1
- nucliadb/search/search/summarize.py +1 -1
- nucliadb/standalone/run.py +3 -0
- nucliadb/tasks/retries.py +4 -4
- nucliadb/train/generators/sentence_classifier.py +2 -8
- nucliadb/train/generators/utils.py +1 -1
- nucliadb/train/nodes.py +4 -4
- nucliadb/train/servicer.py +1 -1
- nucliadb/train/uploader.py +1 -1
- nucliadb/writer/api/v1/field.py +14 -9
- nucliadb/writer/api/v1/knowledgebox.py +15 -52
- nucliadb/writer/api/v1/learning_config.py +5 -4
- nucliadb/writer/api/v1/resource.py +2 -2
- nucliadb/writer/resource/field.py +38 -2
- nucliadb/writer/tus/azure.py +4 -4
- nucliadb/writer/tus/gcs.py +11 -17
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/METADATA +9 -10
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/RECORD +124 -114
- nucliadb/common/external_index_providers/pinecone.py +0 -894
- nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/WHEEL +0 -0
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/top_level.txt +0 -0
nucliadb/common/catalog/interface.py +85 -0

@@ -0,0 +1,85 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+from __future__ import annotations
+
+import abc
+import datetime
+from dataclasses import dataclass
+from typing import Literal, Optional, Union
+
+from pydantic import BaseModel, Field
+
+from nucliadb.common.maindb.driver import Transaction
+from nucliadb_models import search as search_models
+from nucliadb_models.search import CatalogFacetsRequest, Resources
+
+
+class CatalogResourceData(BaseModel):
+    """
+    Data extracted from a resource to be indexed in the catalog
+    """
+
+    title: str = Field(description="Resource title")
+    created_at: datetime.datetime = Field(description="Resource creation date")
+    modified_at: datetime.datetime = Field(description="Resource last modification date")
+    labels: list[str] = Field(
+        description="Resource labels. This includes labels at the resource level and all classification labels of its fields"
+    )
+    slug: str = Field(description="Resource slug")
+
+
+@dataclass
+class CatalogExpression:
+    @dataclass
+    class Date:
+        field: Union[Literal["created_at"], Literal["modified_at"]]
+        since: Optional[datetime.datetime]
+        until: Optional[datetime.datetime]
+
+    bool_and: Optional[list["CatalogExpression"]] = None
+    bool_or: Optional[list["CatalogExpression"]] = None
+    bool_not: Optional["CatalogExpression"] = None
+    date: Optional[Date] = None
+    facet: Optional[str] = None
+    resource_id: Optional[str] = None
+
+
+class CatalogQuery(BaseModel):
+    kbid: str
+    query: Optional[search_models.CatalogQuery] = Field(description="Full-text search query")
+    filters: Optional[CatalogExpression] = Field(description="Filters to apply to the search")
+    sort: search_models.SortOptions = Field(description="Sorting option")
+    faceted: list[str] = Field(description="List of facets to compute during the search")
+    page_size: int = Field(description="Used for pagination. Maximum page size is 100")
+    page_number: int = Field(description="Used for pagination. First page is 0")
+
+
+class Catalog(abc.ABC, metaclass=abc.ABCMeta):
+    @abc.abstractmethod
+    async def update(self, txn: Transaction, kbid: str, rid: str, data: CatalogResourceData): ...
+
+    @abc.abstractmethod
+    async def delete(self, txn: Transaction, kbid: str, rid: str): ...
+
+    @abc.abstractmethod
+    async def search(self, query: CatalogQuery) -> Resources: ...
+
+    @abc.abstractmethod
+    async def facets(self, kbid: str, request: CatalogFacetsRequest) -> dict[str, int]: ...
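The new `Catalog` ABC decouples catalog consumers from any particular backend; per the file list above, `common/catalog/pg.py` carries the Postgres implementation and `common/catalog/dummy.py` a stub one. As an illustration of the contract only (a hedged sketch, not the actual contents of dummy.py), a no-op backend could satisfy the interface like this:

    # Sketch: a no-op Catalog backend. The class name and behavior are
    # illustrative assumptions, not the real nucliadb/common/catalog/dummy.py.
    from nucliadb.common.catalog.interface import (
        Catalog,
        CatalogQuery,
        CatalogResourceData,
    )
    from nucliadb.common.maindb.driver import Transaction
    from nucliadb_models.search import CatalogFacetsRequest, Resources


    class NoopCatalog(Catalog):
        async def update(self, txn: Transaction, kbid: str, rid: str, data: CatalogResourceData):
            pass  # nothing to index

        async def delete(self, txn: Transaction, kbid: str, rid: str):
            pass  # nothing to delete

        async def search(self, query: CatalogQuery) -> Resources:
            # Empty result page that still honors the request's pagination
            return Resources(
                facets={},
                results=[],
                query=query.query.query if query.query else "",
                total=0,
                page_number=query.page_number,
                page_size=query.page_size,
                next_page=False,
                min_score=0,
            )

        async def facets(self, kbid: str, request: CatalogFacetsRequest) -> dict[str, int]:
            return {}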
nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +294 -208

@@ -26,111 +26,180 @@ from typing import Any, Literal, Union, cast
 from psycopg import AsyncCursor, sql
 from psycopg.rows import DictRow, dict_row
 
-from nucliadb.common.
+from nucliadb.common.catalog.interface import (
+    Catalog,
+    CatalogExpression,
+    CatalogQuery,
+    CatalogResourceData,
+)
+from nucliadb.common.exceptions import InvalidQueryError
+from nucliadb.common.maindb.driver import Transaction
+from nucliadb.common.maindb.pg import PGDriver, PGTransaction
 from nucliadb.common.maindb.utils import get_driver
-from nucliadb.search.search.query_parser.models import CatalogExpression, CatalogQuery
 from nucliadb_models import search as search_models
-from nucliadb_models.labels import translate_system_to_alias_label
-from nucliadb_models.search import
+from nucliadb_models.labels import translate_alias_to_system_label, translate_system_to_alias_label
+from nucliadb_models.search import (
+    CatalogFacetsRequest,
+    ResourceResult,
+    Resources,
+    SortField,
+    SortOrder,
+)
 from nucliadb_telemetry import metrics
 
-
+write_observer = metrics.Observer("pg_catalog_write", labels={"type": ""})
+search_observer = metrics.Observer("pg_catalog_search", labels={"op": ""})
 
-observer = metrics.Observer("pg_catalog_search", labels={"op": ""})
 logger = logging.getLogger(__name__)
 
 SPLIT_REGEX = re.compile(r"\W")
 
 
-def
-
-    nonfacets = []
-    for op in operands:
-        if op.facet:
-            facets.append(op.facet)
-        else:
-            nonfacets.append(op)
+def _pg_transaction(txn: Transaction) -> PGTransaction:
+    return cast(PGTransaction, txn)
 
-    return facets, nonfacets
 
-
-
-    if expr.bool_and:
-        return _convert_boolean_op(expr.bool_and, "and", filter_params)
-    elif expr.bool_or:
-        return _convert_boolean_op(expr.bool_or, "or", filter_params)
-    elif expr.bool_not:
-        return sql.SQL("(NOT {})").format(_convert_filter(expr.bool_not, filter_params))
-    elif expr.date:
-        return _convert_date_filter(expr.date, filter_params)
-    elif expr.facet:
-        param_name = f"param{len(filter_params)}"
-        filter_params[param_name] = [expr.facet]
-        if expr.facet == "/n/s/PROCESSED":
-            # Optimization for the most common case, we know PROCESSED is a full label and can use the smaller labels index
-            # This is needed because PROCESSED is present in most catalog entries and PG is unlikely to use any index
-            # for it, falling back to executing the extract_facets function which can be slow
-            return sql.SQL("labels @> {}").format(sql.Placeholder(param_name))
-        else:
-            return sql.SQL("extract_facets(labels) @> {}").format(sql.Placeholder(param_name))
-    elif expr.resource_id:
-        param_name = f"param{len(filter_params)}"
-        filter_params[param_name] = [expr.resource_id]
-        return sql.SQL("rid = {}").format(sql.Placeholder(param_name))
-    else:
-        return sql.SQL("")
+def _pg_driver() -> PGDriver:
+    return cast(PGDriver, get_driver())
 
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+class PGCatalog(Catalog):
+    @write_observer.wrap({"type": "update"})
+    async def update(self, txn: Transaction, kbid: str, rid: str, data: CatalogResourceData):
+        async with _pg_transaction(txn).connection.cursor() as cur:
+            await cur.execute(
+                """
+                INSERT INTO catalog
+                (kbid, rid, title, created_at, modified_at, labels, slug)
+                VALUES
+                (%(kbid)s, %(rid)s, %(title)s, %(created_at)s, %(modified_at)s, %(labels)s, %(slug)s)
+                ON CONFLICT (kbid, rid) DO UPDATE SET
+                title = excluded.title,
+                created_at = excluded.created_at,
+                modified_at = excluded.modified_at,
+                labels = excluded.labels,
+                slug = excluded.slug""",
+                {
+                    "kbid": kbid,
+                    "rid": rid,
+                    "title": data.title,
+                    "created_at": data.created_at,
+                    "modified_at": data.modified_at,
+                    "labels": data.labels,
+                    "slug": data.slug,
+                },
+            )
+            await cur.execute(
+                "DELETE FROM catalog_facets WHERE kbid = %(kbid)s AND rid = %(rid)s",
+                {
+                    "kbid": kbid,
+                    "rid": rid,
+                },
+            )
+            await cur.execute(
+                "INSERT INTO catalog_facets (kbid, rid, facet) SELECT %(kbid)s AS kbid, %(rid)s AS rid, unnest(%(facets)s::text[]) AS facet",
+                {
+                    "kbid": kbid,
+                    "rid": rid,
+                    "facets": list(extract_facets(data.labels)),
+                },
             )
-        filter_params[param_name] = facets
-    for nonfacet in nonfacets:
-        operands_sql.append(_convert_filter(nonfacet, filter_params))
-    return sql.SQL("({})").format(sql.SQL(f" {op.upper()} ").join(operands_sql))
 
+    @write_observer.wrap({"type": "delete"})
+    async def delete(self, txn: Transaction, kbid: str, rid: str):
+        async with _pg_transaction(txn).connection.cursor() as cur:
+            await cur.execute(
+                "DELETE FROM catalog where kbid = %(kbid)s AND rid = %(rid)s", {"kbid": kbid, "rid": rid}
+            )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    @search_observer.wrap({"op": "search"})
+    async def search(self, catalog_query: CatalogQuery) -> Resources:
+        # Prepare SQL query
+        query, query_params = _prepare_query_filters(catalog_query)
+
+        async with _pg_driver()._get_connection() as conn, conn.cursor(row_factory=dict_row) as cur:
+            facets = {}
+
+            # Faceted search
+            if catalog_query.faceted:
+                with search_observer({"op": "facets"}):
+                    tmp_facets: dict[str, dict[str, int]] = {
+                        translate_label(f): defaultdict(int) for f in catalog_query.faceted
+                    }
+
+                    if catalog_query.filters is None:
+                        await _faceted_search_unfiltered(cur, catalog_query, tmp_facets)
+                    else:
+                        await _faceted_search_filtered(
+                            cur, catalog_query, tmp_facets, query, query_params
+                        )
+
+                    facets = {translate_system_to_alias_label(k): v for k, v in tmp_facets.items()}
+
+            # Totals
+            with search_observer({"op": "totals"}):
+                await cur.execute(
+                    sql.SQL("SELECT COUNT(*) FROM ({}) fc").format(query),
+                    query_params,
+                )
+                total = (await cur.fetchone())["count"]  # type: ignore
+
+            # Query
+            with search_observer({"op": "query"}):
+                query, query_params = _prepare_query(catalog_query)
+                await cur.execute(query, query_params)
+                data = await cur.fetchall()
+
+            return Resources(
+                facets=facets,
+                results=[
+                    ResourceResult(
+                        rid=str(r["rid"]).replace("-", ""),
+                        field="title",
+                        field_type="a",
+                        labels=[label for label in r["labels"] if label.startswith("/l/")],
+                        score=0,
+                    )
+                    for r in data
+                ],
+                query=catalog_query.query.query if catalog_query.query else "",
+                total=total,
+                page_number=catalog_query.page_number,
+                page_size=catalog_query.page_size,
+                next_page=(catalog_query.page_size * catalog_query.page_number + len(data) < total),
+                min_score=0,
             )
-
-
+
+    @search_observer.wrap({"op": "catalog_facets"})
+    async def facets(self, kbid: str, request: CatalogFacetsRequest) -> dict[str, int]:
+        async with _pg_driver()._get_connection() as conn, conn.cursor() as cur:
+            prefix_filters: list[sql.Composable] = []
+            prefix_params: dict[str, Any] = {}
+            for cnt, prefix in enumerate(request.prefixes):
+                prefix_sql = sql.SQL("facet LIKE {}").format(sql.Placeholder(f"prefix{cnt}"))
+                prefix_params[f"prefix{cnt}"] = f"{prefix.prefix}%"
+                if prefix.depth is not None:
+                    prefix_parts = len(prefix.prefix.split("/"))
+                    depth_sql = sql.SQL("SPLIT_PART(facet, '/', {}) = ''").format(
+                        sql.Placeholder(f"depth{cnt}")
+                    )
+                    prefix_params[f"depth{cnt}"] = prefix_parts + prefix.depth + 1
+                    prefix_sql = sql.SQL("({} AND {})").format(prefix_sql, depth_sql)
+                prefix_filters.append(prefix_sql)
+
+            filter_sql: sql.Composable
+            if prefix_filters:
+                filter_sql = sql.SQL("AND {}").format(sql.SQL(" OR ").join(prefix_filters))
+            else:
+                filter_sql = sql.SQL("")
+
+            await cur.execute(
+                sql.SQL(
+                    "SELECT facet, COUNT(*) FROM catalog_facets WHERE kbid = %(kbid)s {} GROUP BY facet"
+                ).format(filter_sql),
+                {"kbid": kbid, **prefix_params},
+            )
+            return {k: v for k, v in await cur.fetchall()}
 
 
 def _prepare_query_filters(catalog_query: CatalogQuery) -> tuple[sql.Composable, dict[str, Any]]:
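A note on the depth bound used by `facets()` above: each requested prefix becomes `facet LIKE '<prefix>%'`, optionally AND-ed with `SPLIT_PART(facet, '/', prefix_parts + depth + 1) = ''`, which holds exactly when the facet has no path component that deep (Postgres's SPLIT_PART returns '' for a missing field). A small Python sketch of the same check, illustrative only:

    # Mirrors the SQL depth filter: admit facets at most `depth` levels
    # below `prefix`. The function name is an assumption for illustration.
    def within_depth(facet: str, prefix: str, depth: int) -> bool:
        prefix_parts = len(prefix.split("/"))  # "/l/topic" -> 3 parts
        n = prefix_parts + depth + 1           # depth=1 -> field 5 must be empty
        fields = facet.split("/")
        nth = fields[n - 1] if n <= len(fields) else ""  # SPLIT_PART semantics
        return facet.startswith(prefix) and nth == ""

    assert within_depth("/l/topic/sports", "/l/topic", 1)
    assert not within_depth("/l/topic/sports/football", "/l/topic", 1)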
@@ -149,42 +218,16 @@ def _prepare_query_filters(catalog_query: CatalogQuery) -> tuple[sql.Composable,
     )
 
 
-def
-
-
-
-
-
-        if query.field == search_models.CatalogQueryField.Title:
-            # Insensitive search supported by pg_trgm for title
-            return sql.SQL("{} ILIKE %(query)s").format(sql.Identifier(query.field.value))
+def _filter_operands(operands: list[CatalogExpression]) -> tuple[list[str], list[CatalogExpression]]:
+    facets = []
+    nonfacets = []
+    for op in operands:
+        if op.facet:
+            facets.append(op.facet)
         else:
-
-            return sql.SQL("{} LIKE %(query)s").format(sql.Identifier(query.field.value))
-    # The rest of operators only supported by title
-    elif query.match == search_models.CatalogQueryMatch.Words:
-        # This is doing tokenization inside the SQL server (to keep the index updated). We could move it to
-        # the python code at update/query time if it ever becomes a problem but for now, a single regex
-        # executed per query is not a problem.
+            nonfacets.append(op)
 
-
-        params["query"] = [word.lower() for word in SPLIT_REGEX.split(query.query) if word]
-        return sql.SQL("regexp_split_to_array(lower(title), '\\W') @> %(query)s")
-    elif query.match == search_models.CatalogQueryMatch.Fuzzy:
-        params["query"] = query.query
-        # Note: the operator is %>, We use %%> for psycopg escaping
-        return sql.SQL("title %%> %(query)s")
-    elif query.match == search_models.CatalogQueryMatch.EndsWith:
-        params["query"] = "%" + query.query
-        return sql.SQL("title ILIKE %(query)s")
-    elif query.match == search_models.CatalogQueryMatch.Contains:
-        params["query"] = "%" + query.query + "%"
-        return sql.SQL("title ILIKE %(query)s")
-    else:  # pragma: nocover
-        # This is a trick so mypy generates an error if this branch can be reached,
-        # that is, if we are missing some ifs
-        _a: int = "a"
-        return sql.SQL("")
+    return facets, nonfacets
 
 
 def _prepare_query(catalog_query: CatalogQuery) -> tuple[sql.Composed, dict[str, Any]]:
@@ -219,67 +262,6 @@ def _prepare_query(catalog_query: CatalogQuery) -> tuple[sql.Composed, dict[str,
     return query, filter_params
 
 
-def _pg_driver() -> PGDriver:
-    return cast(PGDriver, get_driver())
-
-
-@observer.wrap({"op": "search"})
-async def pgcatalog_search(catalog_query: CatalogQuery) -> Resources:
-    # Prepare SQL query
-    query, query_params = _prepare_query_filters(catalog_query)
-
-    async with _pg_driver()._get_connection() as conn, conn.cursor(row_factory=dict_row) as cur:
-        facets = {}
-
-        # Faceted search
-        if catalog_query.faceted:
-            with observer({"op": "facets"}):
-                tmp_facets: dict[str, dict[str, int]] = {
-                    translate_label(f): defaultdict(int) for f in catalog_query.faceted
-                }
-
-                if catalog_query.filters is None:
-                    await _faceted_search_unfiltered(cur, catalog_query, tmp_facets)
-                else:
-                    await _faceted_search_filtered(cur, catalog_query, tmp_facets, query, query_params)
-
-                facets = {translate_system_to_alias_label(k): v for k, v in tmp_facets.items()}
-
-        # Totals
-        with observer({"op": "totals"}):
-            await cur.execute(
-                sql.SQL("SELECT COUNT(*) FROM ({}) fc").format(query),
-                query_params,
-            )
-            total = (await cur.fetchone())["count"]  # type: ignore
-
-        # Query
-        with observer({"op": "query"}):
-            query, query_params = _prepare_query(catalog_query)
-            await cur.execute(query, query_params)
-            data = await cur.fetchall()
-
-        return Resources(
-            facets=facets,
-            results=[
-                ResourceResult(
-                    rid=str(r["rid"]).replace("-", ""),
-                    field="title",
-                    field_type="a",
-                    labels=[label for label in r["labels"] if label.startswith("/l/")],
-                    score=0,
-                )
-                for r in data
-            ],
-            query=catalog_query.query.query if catalog_query.query else "",
-            total=total,
-            page_number=catalog_query.page_number,
-            page_size=catalog_query.page_size,
-            next_page=(catalog_query.page_size * catalog_query.page_number + len(data) < total),
-            min_score=0,
-        )
-
-
 async def _faceted_search_unfiltered(
     cur: AsyncCursor[DictRow], catalog_query: CatalogQuery, tmp_facets: dict[str, dict[str, int]]
 ):
@@ -360,33 +342,137 @@ async def _faceted_search_filtered(
             tmp_facets[grandparent][translate_system_to_alias_label(parent)] += count
 
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+def _prepare_query_search(query: search_models.CatalogQuery, params: dict[str, Any]) -> sql.Composable:
+    if query.match == search_models.CatalogQueryMatch.Exact:
+        params["query"] = query.query
+        return sql.SQL("{} = %(query)s").format(sql.Identifier(query.field.value))
+    elif query.match == search_models.CatalogQueryMatch.StartsWith:
+        params["query"] = query.query + "%"
+        if query.field == search_models.CatalogQueryField.Title:
+            # Insensitive search supported by pg_trgm for title
+            return sql.SQL("{} ILIKE %(query)s").format(sql.Identifier(query.field.value))
+        else:
+            # Sensitive search for slug (btree does not support ILIKE and slugs are all lowercase anyway)
+            return sql.SQL("{} LIKE %(query)s").format(sql.Identifier(query.field.value))
+    # The rest of operators only supported by title
+    elif query.match == search_models.CatalogQueryMatch.Words:
+        # This is doing tokenization inside the SQL server (to keep the index updated). We could move it to
+        # the python code at update/query time if it ever becomes a problem but for now, a single regex
+        # executed per query is not a problem.
 
-
-if
-
+        # Remove zero-length words from the split
+        params["query"] = [word.lower() for word in SPLIT_REGEX.split(query.query) if word]
+        return sql.SQL("regexp_split_to_array(lower(title), '\\W') @> %(query)s")
+    elif query.match == search_models.CatalogQueryMatch.Fuzzy:
+        params["query"] = query.query
+        # Note: the operator is %>, We use %%> for psycopg escaping
+        return sql.SQL("title %%> %(query)s")
+    elif query.match == search_models.CatalogQueryMatch.EndsWith:
+        params["query"] = "%" + query.query
+        return sql.SQL("title ILIKE %(query)s")
+    elif query.match == search_models.CatalogQueryMatch.Contains:
+        params["query"] = "%" + query.query + "%"
+        return sql.SQL("title ILIKE %(query)s")
+    else:  # pragma: no cover
+        # This is a trick so mypy generates an error if this branch can be reached,
+        # that is, if we are missing some ifs
+        _a: int = "a"
+        return sql.SQL("")
+
+
+def _convert_filter(expr: CatalogExpression, filter_params: dict[str, Any]) -> sql.Composable:
+    if expr.bool_and:
+        return _convert_boolean_op(expr.bool_and, "and", filter_params)
+    elif expr.bool_or:
+        return _convert_boolean_op(expr.bool_or, "or", filter_params)
+    elif expr.bool_not:
+        return sql.SQL("(NOT {})").format(_convert_filter(expr.bool_not, filter_params))
+    elif expr.date:
+        return _convert_date_filter(expr.date, filter_params)
+    elif expr.facet:
+        param_name = f"param{len(filter_params)}"
+        filter_params[param_name] = [expr.facet]
+        if expr.facet == "/n/s/PROCESSED":
+            # Optimization for the most common case, we know PROCESSED is a full label and can use the smaller labels index
+            # This is needed because PROCESSED is present in most catalog entries and PG is unlikely to use any index
+            # for it, falling back to executing the extract_facets function which can be slow
+            return sql.SQL("labels @> {}").format(sql.Placeholder(param_name))
         else:
-
+            return sql.SQL("extract_facets(labels) @> {}").format(sql.Placeholder(param_name))
+    elif expr.resource_id:
+        param_name = f"param{len(filter_params)}"
+        filter_params[param_name] = [expr.resource_id]
+        return sql.SQL("rid = {}").format(sql.Placeholder(param_name))
+    else:
+        return sql.SQL("")
+
+
+def _convert_boolean_op(
+    operands: list[CatalogExpression],
+    op: Union[Literal["and"], Literal["or"]],
+    filter_params: dict[str, Any],
+) -> sql.Composable:
+    array_op = sql.SQL("@>" if op == "and" else "&&")
+    operands_sql: list[sql.Composable] = []
+    facets, nonfacets = _filter_operands(operands)
+    if facets:
+        param_name = f"param{len(filter_params)}"
+        if facets == ["/n/s/PROCESSED"]:
+            # Optimization for the most common case, we know PROCESSED is a full label and can use the smaller labels index
+            # This is needed because PROCESSED is present in most catalog entries and PG is unlikely to use any index
+            # for it, falling back to executing the extract_facets function which can be slow
+            operands_sql.append(sql.SQL("labels @> {}").format(sql.Placeholder(param_name)))
+        else:
+            operands_sql.append(
+                sql.SQL("extract_facets(labels) {} {}").format(array_op, sql.Placeholder(param_name))
+            )
+        filter_params[param_name] = facets
+    for nonfacet in nonfacets:
+        operands_sql.append(_convert_filter(nonfacet, filter_params))
+    return sql.SQL("({})").format(sql.SQL(f" {op.upper()} ").join(operands_sql))
+
 
-
-
-
-
-
+def _convert_date_filter(date: CatalogExpression.Date, filter_params: dict[str, Any]) -> sql.Composable:
+    if date.since and date.until:
+        since_name = f"param{len(filter_params)}"
+        filter_params[since_name] = date.since
+        until_name = f"param{len(filter_params)}"
+        filter_params[until_name] = date.until
+        return sql.SQL("{field} BETWEEN {since} AND {until}").format(
+            field=sql.Identifier(date.field),
+            since=sql.Placeholder(since_name),
+            until=sql.Placeholder(until_name),
         )
-
+    elif date.since:
+        since_name = f"param{len(filter_params)}"
+        filter_params[since_name] = date.since
+        return sql.SQL("{field} > {since}").format(
+            field=sql.Identifier(date.field), since=sql.Placeholder(since_name)
+        )
+    elif date.until:
+        until_name = f"param{len(filter_params)}"
+        filter_params[until_name] = date.until
+        return sql.SQL("{field} < {until}").format(
+            field=sql.Identifier(date.field), until=sql.Placeholder(until_name)
+        )
+    else:
+        raise ValueError(f"Invalid date operator")
+
+
+def translate_label(literal: str) -> str:
+    if len(literal) == 0:
+        raise InvalidQueryError("filters", "Invalid empty label")
+    if literal[0] != "/":
+        raise InvalidQueryError("filters", f"Invalid label. It must start with a `/`: {literal}")
+    return translate_alias_to_system_label(literal)
+
+
+def extract_facets(labels: list[str]) -> set[str]:
+    facets = set()
+    for label in labels:
+        parts = label.split("/")
+        facet = ""
+        for part in parts[1:]:
+            facet += f"/{part}"
+            facets.add(facet)
+    return facets