nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions exactly as they appear in their respective public registries.
- migrations/0023_backfill_pg_catalog.py +8 -4
- migrations/0028_extracted_vectors_reference.py +1 -1
- migrations/0029_backfill_field_status.py +3 -4
- migrations/0032_remove_old_relations.py +2 -3
- migrations/0038_backfill_catalog_field_labels.py +8 -4
- migrations/0039_backfill_converation_splits_metadata.py +106 -0
- migrations/0040_migrate_search_configurations.py +79 -0
- migrations/0041_reindex_conversations.py +137 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
- migrations/pg/0012_catalog_statistics_undo.py +26 -0
- nucliadb/backups/create.py +2 -15
- nucliadb/backups/restore.py +4 -15
- nucliadb/backups/tasks.py +4 -1
- nucliadb/common/back_pressure/cache.py +2 -3
- nucliadb/common/back_pressure/materializer.py +7 -13
- nucliadb/common/back_pressure/settings.py +6 -6
- nucliadb/common/back_pressure/utils.py +1 -0
- nucliadb/common/cache.py +9 -9
- nucliadb/common/catalog/__init__.py +79 -0
- nucliadb/common/catalog/dummy.py +36 -0
- nucliadb/common/catalog/interface.py +85 -0
- nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
- nucliadb/common/catalog/utils.py +56 -0
- nucliadb/common/cluster/manager.py +8 -23
- nucliadb/common/cluster/rebalance.py +484 -112
- nucliadb/common/cluster/rollover.py +36 -9
- nucliadb/common/cluster/settings.py +4 -9
- nucliadb/common/cluster/utils.py +34 -8
- nucliadb/common/context/__init__.py +7 -8
- nucliadb/common/context/fastapi.py +1 -2
- nucliadb/common/datamanagers/__init__.py +2 -4
- nucliadb/common/datamanagers/atomic.py +9 -2
- nucliadb/common/datamanagers/cluster.py +1 -2
- nucliadb/common/datamanagers/fields.py +3 -4
- nucliadb/common/datamanagers/kb.py +6 -6
- nucliadb/common/datamanagers/labels.py +2 -3
- nucliadb/common/datamanagers/resources.py +10 -33
- nucliadb/common/datamanagers/rollover.py +5 -7
- nucliadb/common/datamanagers/search_configurations.py +1 -2
- nucliadb/common/datamanagers/synonyms.py +1 -2
- nucliadb/common/datamanagers/utils.py +4 -4
- nucliadb/common/datamanagers/vectorsets.py +4 -4
- nucliadb/common/external_index_providers/base.py +32 -5
- nucliadb/common/external_index_providers/manager.py +5 -34
- nucliadb/common/external_index_providers/settings.py +1 -27
- nucliadb/common/filter_expression.py +129 -41
- nucliadb/common/http_clients/exceptions.py +8 -0
- nucliadb/common/http_clients/processing.py +16 -23
- nucliadb/common/http_clients/utils.py +3 -0
- nucliadb/common/ids.py +82 -58
- nucliadb/common/locking.py +1 -2
- nucliadb/common/maindb/driver.py +9 -8
- nucliadb/common/maindb/local.py +5 -5
- nucliadb/common/maindb/pg.py +9 -8
- nucliadb/common/nidx.py +22 -5
- nucliadb/common/vector_index_config.py +1 -1
- nucliadb/export_import/datamanager.py +4 -3
- nucliadb/export_import/exporter.py +11 -19
- nucliadb/export_import/importer.py +13 -6
- nucliadb/export_import/tasks.py +2 -0
- nucliadb/export_import/utils.py +6 -18
- nucliadb/health.py +2 -2
- nucliadb/ingest/app.py +8 -8
- nucliadb/ingest/consumer/consumer.py +8 -10
- nucliadb/ingest/consumer/pull.py +10 -8
- nucliadb/ingest/consumer/service.py +5 -30
- nucliadb/ingest/consumer/shard_creator.py +16 -5
- nucliadb/ingest/consumer/utils.py +1 -1
- nucliadb/ingest/fields/base.py +37 -49
- nucliadb/ingest/fields/conversation.py +55 -9
- nucliadb/ingest/fields/exceptions.py +1 -2
- nucliadb/ingest/fields/file.py +22 -8
- nucliadb/ingest/fields/link.py +7 -7
- nucliadb/ingest/fields/text.py +2 -3
- nucliadb/ingest/orm/brain_v2.py +89 -57
- nucliadb/ingest/orm/broker_message.py +2 -4
- nucliadb/ingest/orm/entities.py +10 -209
- nucliadb/ingest/orm/index_message.py +128 -113
- nucliadb/ingest/orm/knowledgebox.py +91 -59
- nucliadb/ingest/orm/processor/auditing.py +1 -3
- nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
- nucliadb/ingest/orm/processor/processor.py +98 -153
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
- nucliadb/ingest/orm/resource.py +82 -71
- nucliadb/ingest/orm/utils.py +1 -1
- nucliadb/ingest/partitions.py +12 -1
- nucliadb/ingest/processing.py +17 -17
- nucliadb/ingest/serialize.py +202 -145
- nucliadb/ingest/service/writer.py +15 -114
- nucliadb/ingest/settings.py +36 -15
- nucliadb/ingest/utils.py +1 -2
- nucliadb/learning_proxy.py +23 -26
- nucliadb/metrics_exporter.py +20 -6
- nucliadb/middleware/__init__.py +82 -1
- nucliadb/migrator/datamanager.py +4 -11
- nucliadb/migrator/migrator.py +1 -2
- nucliadb/migrator/models.py +1 -2
- nucliadb/migrator/settings.py +1 -2
- nucliadb/models/internal/augment.py +614 -0
- nucliadb/models/internal/processing.py +19 -19
- nucliadb/openapi.py +2 -2
- nucliadb/purge/__init__.py +3 -8
- nucliadb/purge/orphan_shards.py +1 -2
- nucliadb/reader/__init__.py +5 -0
- nucliadb/reader/api/models.py +6 -13
- nucliadb/reader/api/v1/download.py +59 -38
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/knowledgebox.py +37 -9
- nucliadb/reader/api/v1/learning_config.py +33 -14
- nucliadb/reader/api/v1/resource.py +61 -9
- nucliadb/reader/api/v1/services.py +18 -14
- nucliadb/reader/app.py +3 -1
- nucliadb/reader/reader/notifications.py +1 -2
- nucliadb/search/api/v1/__init__.py +3 -0
- nucliadb/search/api/v1/ask.py +3 -4
- nucliadb/search/api/v1/augment.py +585 -0
- nucliadb/search/api/v1/catalog.py +15 -19
- nucliadb/search/api/v1/find.py +16 -22
- nucliadb/search/api/v1/hydrate.py +328 -0
- nucliadb/search/api/v1/knowledgebox.py +1 -2
- nucliadb/search/api/v1/predict_proxy.py +1 -2
- nucliadb/search/api/v1/resource/ask.py +28 -8
- nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
- nucliadb/search/api/v1/resource/search.py +9 -11
- nucliadb/search/api/v1/retrieve.py +130 -0
- nucliadb/search/api/v1/search.py +28 -32
- nucliadb/search/api/v1/suggest.py +11 -14
- nucliadb/search/api/v1/summarize.py +1 -2
- nucliadb/search/api/v1/utils.py +2 -2
- nucliadb/search/app.py +3 -2
- nucliadb/search/augmentor/__init__.py +21 -0
- nucliadb/search/augmentor/augmentor.py +232 -0
- nucliadb/search/augmentor/fields.py +704 -0
- nucliadb/search/augmentor/metrics.py +24 -0
- nucliadb/search/augmentor/paragraphs.py +334 -0
- nucliadb/search/augmentor/resources.py +238 -0
- nucliadb/search/augmentor/utils.py +33 -0
- nucliadb/search/lifecycle.py +3 -1
- nucliadb/search/predict.py +33 -19
- nucliadb/search/predict_models.py +8 -9
- nucliadb/search/requesters/utils.py +11 -10
- nucliadb/search/search/cache.py +19 -42
- nucliadb/search/search/chat/ask.py +131 -59
- nucliadb/search/search/chat/exceptions.py +3 -5
- nucliadb/search/search/chat/fetcher.py +201 -0
- nucliadb/search/search/chat/images.py +6 -4
- nucliadb/search/search/chat/old_prompt.py +1375 -0
- nucliadb/search/search/chat/parser.py +510 -0
- nucliadb/search/search/chat/prompt.py +563 -615
- nucliadb/search/search/chat/query.py +453 -32
- nucliadb/search/search/chat/rpc.py +85 -0
- nucliadb/search/search/fetch.py +3 -4
- nucliadb/search/search/filters.py +8 -11
- nucliadb/search/search/find.py +33 -31
- nucliadb/search/search/find_merge.py +124 -331
- nucliadb/search/search/graph_strategy.py +14 -12
- nucliadb/search/search/hydrator/__init__.py +49 -0
- nucliadb/search/search/hydrator/fields.py +217 -0
- nucliadb/search/search/hydrator/images.py +130 -0
- nucliadb/search/search/hydrator/paragraphs.py +323 -0
- nucliadb/search/search/hydrator/resources.py +60 -0
- nucliadb/search/search/ingestion_agents.py +5 -5
- nucliadb/search/search/merge.py +90 -94
- nucliadb/search/search/metrics.py +24 -7
- nucliadb/search/search/paragraphs.py +7 -9
- nucliadb/search/search/predict_proxy.py +44 -18
- nucliadb/search/search/query.py +14 -86
- nucliadb/search/search/query_parser/fetcher.py +51 -82
- nucliadb/search/search/query_parser/models.py +19 -48
- nucliadb/search/search/query_parser/old_filters.py +20 -19
- nucliadb/search/search/query_parser/parsers/ask.py +5 -6
- nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
- nucliadb/search/search/query_parser/parsers/common.py +21 -13
- nucliadb/search/search/query_parser/parsers/find.py +6 -29
- nucliadb/search/search/query_parser/parsers/graph.py +18 -28
- nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
- nucliadb/search/search/query_parser/parsers/search.py +15 -56
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
- nucliadb/search/search/rank_fusion.py +18 -13
- nucliadb/search/search/rerankers.py +6 -7
- nucliadb/search/search/retrieval.py +300 -0
- nucliadb/search/search/summarize.py +5 -6
- nucliadb/search/search/utils.py +3 -4
- nucliadb/search/settings.py +1 -2
- nucliadb/standalone/api_router.py +1 -1
- nucliadb/standalone/app.py +4 -3
- nucliadb/standalone/auth.py +5 -6
- nucliadb/standalone/lifecycle.py +2 -2
- nucliadb/standalone/run.py +5 -4
- nucliadb/standalone/settings.py +5 -6
- nucliadb/standalone/versions.py +3 -4
- nucliadb/tasks/consumer.py +13 -8
- nucliadb/tasks/models.py +2 -1
- nucliadb/tasks/producer.py +3 -3
- nucliadb/tasks/retries.py +8 -7
- nucliadb/train/api/utils.py +1 -3
- nucliadb/train/api/v1/shards.py +1 -2
- nucliadb/train/api/v1/trainset.py +1 -2
- nucliadb/train/app.py +1 -1
- nucliadb/train/generator.py +4 -4
- nucliadb/train/generators/field_classifier.py +2 -2
- nucliadb/train/generators/field_streaming.py +6 -6
- nucliadb/train/generators/image_classifier.py +2 -2
- nucliadb/train/generators/paragraph_classifier.py +2 -2
- nucliadb/train/generators/paragraph_streaming.py +2 -2
- nucliadb/train/generators/question_answer_streaming.py +2 -2
- nucliadb/train/generators/sentence_classifier.py +4 -10
- nucliadb/train/generators/token_classifier.py +3 -2
- nucliadb/train/generators/utils.py +6 -5
- nucliadb/train/nodes.py +3 -3
- nucliadb/train/resource.py +6 -8
- nucliadb/train/settings.py +3 -4
- nucliadb/train/types.py +11 -11
- nucliadb/train/upload.py +3 -2
- nucliadb/train/uploader.py +1 -2
- nucliadb/train/utils.py +1 -2
- nucliadb/writer/api/v1/export_import.py +4 -1
- nucliadb/writer/api/v1/field.py +15 -14
- nucliadb/writer/api/v1/knowledgebox.py +18 -56
- nucliadb/writer/api/v1/learning_config.py +5 -4
- nucliadb/writer/api/v1/resource.py +9 -20
- nucliadb/writer/api/v1/services.py +10 -132
- nucliadb/writer/api/v1/upload.py +73 -72
- nucliadb/writer/app.py +8 -2
- nucliadb/writer/resource/basic.py +12 -15
- nucliadb/writer/resource/field.py +43 -5
- nucliadb/writer/resource/origin.py +7 -0
- nucliadb/writer/settings.py +2 -3
- nucliadb/writer/tus/__init__.py +2 -3
- nucliadb/writer/tus/azure.py +5 -7
- nucliadb/writer/tus/dm.py +3 -3
- nucliadb/writer/tus/exceptions.py +3 -4
- nucliadb/writer/tus/gcs.py +15 -22
- nucliadb/writer/tus/s3.py +2 -3
- nucliadb/writer/tus/storage.py +3 -3
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
- nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
- nucliadb/common/datamanagers/entities.py +0 -139
- nucliadb/common/external_index_providers/pinecone.py +0 -894
- nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
- nucliadb/search/search/hydrator.py +0 -197
- nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0

nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py}  +330 -232

@@ -21,116 +21,186 @@
 import logging
 import re
 from collections import defaultdict
-from typing import Any, Literal,
+from typing import Any, Literal, cast
 
 from psycopg import AsyncCursor, sql
 from psycopg.rows import DictRow, dict_row
-
-
+from typing_extensions import assert_never
+
+from nucliadb.common.catalog.interface import (
+    Catalog,
+    CatalogExpression,
+    CatalogQuery,
+    CatalogResourceData,
+)
+from nucliadb.common.exceptions import InvalidQueryError
+from nucliadb.common.maindb.driver import Transaction
+from nucliadb.common.maindb.pg import PGDriver, PGTransaction
 from nucliadb.common.maindb.utils import get_driver
-from nucliadb.search.search.query_parser.models import CatalogExpression, CatalogQuery
 from nucliadb_models import search as search_models
-from nucliadb_models.labels import translate_system_to_alias_label
-from nucliadb_models.search import
+from nucliadb_models.labels import translate_alias_to_system_label, translate_system_to_alias_label
+from nucliadb_models.search import (
+    CatalogFacetsRequest,
+    ResourceResult,
+    Resources,
+    SortField,
+    SortOrder,
+)
 from nucliadb_telemetry import metrics
 
-
+write_observer = metrics.Observer("pg_catalog_write", labels={"type": ""})
+search_observer = metrics.Observer("pg_catalog_search", labels={"op": ""})
 
-observer = metrics.Observer("pg_catalog_search", labels={"op": ""})
 logger = logging.getLogger(__name__)
 
 SPLIT_REGEX = re.compile(r"\W")
 
 
-def
-
-    nonfacets = []
-    for op in operands:
-        if op.facet:
-            facets.append(op.facet)
-        else:
-            nonfacets.append(op)
-
-    return facets, nonfacets
+def _pg_transaction(txn: Transaction) -> PGTransaction:
+    return cast(PGTransaction, txn)
 
 
-def
-
-        return _convert_boolean_op(expr.bool_and, "and", filter_params)
-    elif expr.bool_or:
-        return _convert_boolean_op(expr.bool_or, "or", filter_params)
-    elif expr.bool_not:
-        return sql.SQL("(NOT {})").format(_convert_filter(expr.bool_not, filter_params))
-    elif expr.date:
-        return _convert_date_filter(expr.date, filter_params)
-    elif expr.facet:
-        param_name = f"param{len(filter_params)}"
-        filter_params[param_name] = [expr.facet]
-        if expr.facet == "/n/s/PROCESSED":
-            # Optimization for the most common case, we know PROCESSED is a full label and can use the smaller labels index
-            # This is needed because PROCESSED is present in most catalog entries and PG is unlikely to use any index
-            # for it, falling back to executing the extract_facets function which can be slow
-            return sql.SQL("labels @> {}").format(sql.Placeholder(param_name))
-        else:
-            return sql.SQL("extract_facets(labels) @> {}").format(sql.Placeholder(param_name))
-    elif expr.resource_id:
-        param_name = f"param{len(filter_params)}"
-        filter_params[param_name] = [expr.resource_id]
-        return sql.SQL("rid = {}").format(sql.Placeholder(param_name))
-    else:
-        return sql.SQL("")
+def _pg_driver() -> PGDriver:
+    return cast(PGDriver, get_driver())
 
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+class PGCatalog(Catalog):
+    @write_observer.wrap({"type": "update"})
+    async def update(self, txn: Transaction, kbid: str, rid: str, data: CatalogResourceData):
+        async with _pg_transaction(txn).connection.cursor() as cur:
+            await cur.execute(
+                """
+                INSERT INTO catalog
+                (kbid, rid, title, created_at, modified_at, labels, slug)
+                VALUES
+                (%(kbid)s, %(rid)s, %(title)s, %(created_at)s, %(modified_at)s, %(labels)s, %(slug)s)
+                ON CONFLICT (kbid, rid) DO UPDATE SET
+                title = excluded.title,
+                created_at = excluded.created_at,
+                modified_at = excluded.modified_at,
+                labels = excluded.labels,
+                slug = excluded.slug""",
+                {
+                    "kbid": kbid,
+                    "rid": rid,
+                    "title": data.title,
+                    "created_at": data.created_at,
+                    "modified_at": data.modified_at,
+                    "labels": data.labels,
+                    "slug": data.slug,
+                },
+            )
+            await cur.execute(
+                "DELETE FROM catalog_facets WHERE kbid = %(kbid)s AND rid = %(rid)s",
+                {
+                    "kbid": kbid,
+                    "rid": rid,
+                },
+            )
+            await cur.execute(
+                "INSERT INTO catalog_facets (kbid, rid, facet) SELECT %(kbid)s AS kbid, %(rid)s AS rid, unnest(%(facets)s::text[]) AS facet",
+                {
+                    "kbid": kbid,
+                    "rid": rid,
+                    "facets": list(extract_facets(data.labels)),
+                },
             )
-        filter_params[param_name] = facets
-        for nonfacet in nonfacets:
-            operands_sql.append(_convert_filter(nonfacet, filter_params))
-    return sql.SQL("({})").format(sql.SQL(f" {op.upper()} ").join(operands_sql))
 
+    @write_observer.wrap({"type": "delete"})
+    async def delete(self, txn: Transaction, kbid: str, rid: str):
+        async with _pg_transaction(txn).connection.cursor() as cur:
+            await cur.execute(
+                "DELETE FROM catalog where kbid = %(kbid)s AND rid = %(rid)s", {"kbid": kbid, "rid": rid}
+            )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    @search_observer.wrap({"op": "search"})
+    async def search(self, catalog_query: CatalogQuery) -> Resources:
+        # Prepare SQL query
+        query, query_params = _prepare_query_filters(catalog_query)
+
+        async with _pg_driver()._get_connection() as conn, conn.cursor(row_factory=dict_row) as cur:
+            facets = {}
+
+            # Faceted search
+            if catalog_query.faceted:
+                with search_observer({"op": "facets"}):
+                    tmp_facets: dict[str, dict[str, int]] = {
+                        translate_label(f): defaultdict(int) for f in catalog_query.faceted
+                    }
+
+                    if catalog_query.filters is None:
+                        await _faceted_search_unfiltered(cur, catalog_query, tmp_facets)
+                    else:
+                        await _faceted_search_filtered(
+                            cur, catalog_query, tmp_facets, query, query_params
+                        )
+
+                    facets = {translate_system_to_alias_label(k): v for k, v in tmp_facets.items()}
+
+            # Totals
+            with search_observer({"op": "totals"}):
+                await cur.execute(
+                    sql.SQL("SELECT COUNT(*) FROM ({}) fc").format(query),
+                    query_params,
+                )
+                total = (await cur.fetchone())["count"]  # type: ignore
+
+            # Query
+            with search_observer({"op": "query"}):
+                query, query_params = _prepare_query(catalog_query)
+                await cur.execute(query, query_params)
+                data = await cur.fetchall()
+
+            return Resources(
+                facets=facets,
+                results=[
+                    ResourceResult(
+                        rid=str(r["rid"]).replace("-", ""),
+                        field="title",
+                        field_type="a",
+                        labels=[label for label in r["labels"] if label.startswith("/l/")],
+                        score=0,
+                    )
+                    for r in data
+                ],
+                query=catalog_query.query.query if catalog_query.query else "",
+                total=total,
+                page_number=catalog_query.page_number,
+                page_size=catalog_query.page_size,
+                next_page=(catalog_query.page_size * catalog_query.page_number + len(data) < total),
+                min_score=0,
             )
-
-
+
+    @search_observer.wrap({"op": "catalog_facets"})
+    async def facets(self, kbid: str, request: CatalogFacetsRequest) -> dict[str, int]:
+        async with _pg_driver()._get_connection() as conn, conn.cursor() as cur:
+            prefix_filters: list[sql.Composable] = []
+            prefix_params: dict[str, Any] = {}
+            for cnt, prefix in enumerate(request.prefixes):
+                prefix_sql = sql.SQL("facet LIKE {}").format(sql.Placeholder(f"prefix{cnt}"))
+                prefix_params[f"prefix{cnt}"] = f"{prefix.prefix}%"
+                if prefix.depth is not None:
+                    prefix_parts = len(prefix.prefix.split("/"))
+                    depth_sql = sql.SQL("SPLIT_PART(facet, '/', {}) = ''").format(
+                        sql.Placeholder(f"depth{cnt}")
+                    )
+                    prefix_params[f"depth{cnt}"] = prefix_parts + prefix.depth + 1
+                    prefix_sql = sql.SQL("({} AND {})").format(prefix_sql, depth_sql)
+                prefix_filters.append(prefix_sql)
+
+            filter_sql: sql.Composable
+            if prefix_filters:
+                filter_sql = sql.SQL("AND {}").format(sql.SQL(" OR ").join(prefix_filters))
+            else:
+                filter_sql = sql.SQL("")
+
+            await cur.execute(
+                sql.SQL(
+                    "SELECT facet, COUNT(*) FROM catalog_facets WHERE kbid = %(kbid)s {} GROUP BY facet"
+                ).format(filter_sql),
+                {"kbid": kbid, **prefix_params},
+            )
+            return {k: v for k, v in await cur.fetchall()}
 
 
 def _prepare_query_filters(catalog_query: CatalogQuery) -> tuple[sql.Composable, dict[str, Any]]:
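
The depth filter that the new `PGCatalog.facets()` builds is easier to follow with concrete values: a facet within `depth` levels of a prefix has at most `len(prefix.split("/")) + depth` slash-separated fields, so asking PostgreSQL for the field one position past that and requiring it to be empty excludes anything deeper. A minimal sketch of the same arithmetic in plain Python (the `split_part` helper is a hand-written stand-in for PostgreSQL's `SPLIT_PART`, not code from the wheel):

```python
def split_part(s: str, delim: str, n: int) -> str:
    # Emulates PostgreSQL's SPLIT_PART: 1-indexed, returns '' past the end.
    parts = s.split(delim)
    return parts[n - 1] if 0 < n <= len(parts) else ""

prefix, depth = "/l/colors", 1
cutoff = len(prefix.split("/")) + depth + 1  # 3 prefix fields + depth 1 + 1 = 5

for facet in ["/l/colors/red", "/l/colors/red/dark"]:
    keep = facet.startswith(prefix) and split_part(facet, "/", cutoff) == ""
    print(facet, keep)
# /l/colors/red True         (within depth 1 of the prefix)
# /l/colors/red/dark False   (field 5 is 'dark', so it is filtered out)
```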
@@ -149,42 +219,16 @@ def _prepare_query_filters(catalog_query: CatalogQuery) -> tuple[sql.Composable, dict[str, Any]]:
     )
 
 
-def
-
-
-
-
-
-        if query.field == search_models.CatalogQueryField.Title:
-            # Insensitive search supported by pg_trgm for title
-            return sql.SQL("{} ILIKE %(query)s").format(sql.Identifier(query.field.value))
+def _filter_operands(operands: list[CatalogExpression]) -> tuple[list[str], list[CatalogExpression]]:
+    facets = []
+    nonfacets = []
+    for op in operands:
+        if op.facet:
+            facets.append(op.facet)
         else:
-
-            return sql.SQL("{} LIKE %(query)s").format(sql.Identifier(query.field.value))
-    # The rest of operators only supported by title
-    elif query.match == search_models.CatalogQueryMatch.Words:
-        # This is doing tokenization inside the SQL server (to keep the index updated). We could move it to
-        # the python code at update/query time if it ever becomes a problem but for now, a single regex
-        # executed per query is not a problem.
+            nonfacets.append(op)
 
-
-        params["query"] = [word.lower() for word in SPLIT_REGEX.split(query.query) if word]
-        return sql.SQL("regexp_split_to_array(lower(title), '\\W') @> %(query)s")
-    elif query.match == search_models.CatalogQueryMatch.Fuzzy:
-        params["query"] = query.query
-        # Note: the operator is %>, We use %%> for psycopg escaping
-        return sql.SQL("title %%> %(query)s")
-    elif query.match == search_models.CatalogQueryMatch.EndsWith:
-        params["query"] = "%" + query.query
-        return sql.SQL("title ILIKE %(query)s")
-    elif query.match == search_models.CatalogQueryMatch.Contains:
-        params["query"] = "%" + query.query + "%"
-        return sql.SQL("title ILIKE %(query)s")
-    else:  # pragma: nocover
-        # This is a trick so mypy generates an error if this branch can be reached,
-        # that is, if we are missing some ifs
-        _a: int = "a"
-        return sql.SQL("")
+    return facets, nonfacets
 
 
 def _prepare_query(catalog_query: CatalogQuery) -> tuple[sql.Composed, dict[str, Any]]:
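
`_filter_operands` (moved here from earlier in the old module) is what lets `_convert_boolean_op`, further down in this diff, collapse all facet operands of a boolean expression into a single array comparison (`@>` for AND, `&&` for OR) instead of emitting one SQL condition per facet. A rough illustration, using a hypothetical `Expr` dataclass as a stand-in for `CatalogExpression`:

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class Expr:
    # Hypothetical stand-in: the real CatalogExpression carries more operand kinds.
    facet: Optional[str] = None
    resource_id: Optional[str] = None

def filter_operands(operands: list[Expr]) -> tuple[list[str], list[Expr]]:
    facets: list[str] = []
    nonfacets: list[Expr] = []
    for op in operands:
        if op.facet:
            facets.append(op.facet)
        else:
            nonfacets.append(op)
    return facets, nonfacets

ops = [Expr(facet="/l/colors/red"), Expr(facet="/n/i/image"), Expr(resource_id="abc")]
facets, nonfacets = filter_operands(ops)
print(facets)  # ['/l/colors/red', '/n/i/image']
# For op == "and" both facets become ONE condition:
#   extract_facets(labels) @> %(param0)s   with param0 = ['/l/colors/red', '/n/i/image']
# while the resource_id operand is still converted individually by _convert_filter.
```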
@@ -219,98 +263,51 @@ def _prepare_query(catalog_query: CatalogQuery) -> tuple[sql.Composed, dict[str, Any]]:
     return query, filter_params
 
 
-def _pg_driver() -> PGDriver:
-    return cast(PGDriver, get_driver())
-
-
-@observer.wrap({"op": "search"})
-async def pgcatalog_search(catalog_query: CatalogQuery) -> Resources:
-    # Prepare SQL query
-    query, query_params = _prepare_query_filters(catalog_query)
-
-    async with _pg_driver()._get_connection() as conn, conn.cursor(row_factory=dict_row) as cur:
-        facets = {}
-
-        # Faceted search
-        if catalog_query.faceted:
-            with observer({"op": "facets"}):
-                tmp_facets: dict[str, dict[str, int]] = {
-                    translate_label(f): defaultdict(int) for f in catalog_query.faceted
-                }
-
-                if catalog_query.filters is None:
-                    await _faceted_search_unfiltered(cur, catalog_query, tmp_facets)
-                else:
-                    await _faceted_search_filtered(cur, catalog_query, tmp_facets, query, query_params)
-
-                facets = {translate_system_to_alias_label(k): v for k, v in tmp_facets.items()}
-
-        # Totals
-        with observer({"op": "totals"}):
-            await cur.execute(
-                sql.SQL("SELECT COUNT(*) FROM ({}) fc").format(query),
-                query_params,
-            )
-            total = (await cur.fetchone())["count"]  # type: ignore
-
-        # Query
-        with observer({"op": "query"}):
-            query, query_params = _prepare_query(catalog_query)
-            await cur.execute(query, query_params)
-            data = await cur.fetchall()
-
-        return Resources(
-            facets=facets,
-            results=[
-                ResourceResult(
-                    rid=str(r["rid"]).replace("-", ""),
-                    field="title",
-                    field_type="a",
-                    labels=[label for label in r["labels"] if label.startswith("/l/")],
-                    score=0,
-                )
-                for r in data
-            ],
-            query=catalog_query.query.query if catalog_query.query else "",
-            total=total,
-            page_number=catalog_query.page_number,
-            page_size=catalog_query.page_size,
-            next_page=(catalog_query.page_size * catalog_query.page_number + len(data) < total),
-            min_score=0,
-        )
-
-
 async def _faceted_search_unfiltered(
     cur: AsyncCursor[DictRow], catalog_query: CatalogQuery, tmp_facets: dict[str, dict[str, int]]
 ):
     facet_params: dict[str, Any] = {}
     facet_sql: sql.Composable
-    if
-        #
-
-
-
-
-
+    if list(tmp_facets.keys()) == ["/n/s"]:
+        # Special case when querying only for status. We know the list of possible facets and optimize
+        # by asking for each facet separately which makes better use of the index
+        sqls = []
+        for status in ["PENDING", "PROCESSED", "ERROR", "EMPTY"]:
+            sqls.append(
+                sql.SQL(
+                    "SELECT facet, COUNT(*) FROM catalog_facets WHERE kbid = %(kbid)s AND facet = '/n/s/{}' GROUP BY facet".format(
+                        status
+                    )
                 )
             )
-
-            facet_params[f"facet_len_{cnt}"] = -(len(prefix) + 1)
-        facet_sql = sql.SQL("AND {}").format(sql.SQL(" OR ").join(prefixes_sql))
-    elif all((facet.startswith("/l") or facet.startswith("/n/i") for facet in tmp_facets.keys())):
-        # Special case for the catalog query, which can have many facets asked for
-        # Filter for the categories (icon and labels) in the query, filter the rest in the code below
-        facet_sql = sql.SQL("AND (facet LIKE '/l/%%' OR facet like '/n/i/%%')")
+        await cur.execute(sql.SQL(" UNION ").join(sqls), {"kbid": catalog_query.kbid})
     else:
-
-
+        if len(tmp_facets) <= 5:
+            # Asking for few facets, strictly filter to what we need in the query
+            prefixes_sql = []
+            for cnt, prefix in enumerate(tmp_facets.keys()):
+                prefixes_sql.append(
+                    sql.SQL("(facet LIKE {} AND POSITION('/' IN RIGHT(facet, {})) = 0)").format(
+                        sql.Placeholder(f"facet_{cnt}"), sql.Placeholder(f"facet_len_{cnt}")
+                    )
+                )
+                facet_params[f"facet_{cnt}"] = f"{prefix}/%"
+                facet_params[f"facet_len_{cnt}"] = -(len(prefix) + 1)
+            facet_sql = sql.SQL("AND {}").format(sql.SQL(" OR ").join(prefixes_sql))
+        elif all(facet.startswith("/l") or facet.startswith("/n/i") for facet in tmp_facets.keys()):
+            # Special case for the catalog query, which can have many facets asked for
+            # Filter for the categories (icon and labels) in the query, filter the rest in the code below
+            facet_sql = sql.SQL("AND (facet LIKE '/l/%%' OR facet like '/n/i/%%')")
+        else:
+            # Worst case: ask for all facets and filter here. This is faster than applying lots of filters
+            facet_sql = sql.SQL("")
 
-
-
-
-
-
-
+        await cur.execute(
+            sql.SQL(
+                "SELECT facet, COUNT(*) FROM catalog_facets WHERE kbid = %(kbid)s {} GROUP BY facet"
+            ).format(facet_sql),
+            {"kbid": catalog_query.kbid, **facet_params},
+        )
 
     # Only keep the facets we asked for
     for row in await cur.fetchall():
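
The "few facets" branch above leans on two PostgreSQL behaviors: `RIGHT(s, -n)` with a negative length drops the first `n` characters, and `POSITION('/' IN ...)` returns 0 when the substring is absent. Together, `facet LIKE '<prefix>/%' AND POSITION('/' IN RIGHT(facet, -(len(prefix) + 1))) = 0` keeps only direct children of the prefix. The same test, traced in Python as a sketch:

```python
def is_direct_child(facet: str, prefix: str) -> bool:
    if not facet.startswith(prefix + "/"):  # facet LIKE '<prefix>/%'
        return False
    # RIGHT(facet, -(len(prefix) + 1)) strips the prefix plus its trailing '/';
    # POSITION('/' IN remainder) = 0 means no deeper levels remain.
    remainder = facet[len(prefix) + 1:]
    return "/" not in remainder

print(is_direct_child("/l/colors/red", "/l/colors"))       # True
print(is_direct_child("/l/colors/red/dark", "/l/colors"))  # False
```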
@@ -360,33 +357,134 @@ async def _faceted_search_filtered(
             tmp_facets[grandparent][translate_system_to_alias_label(parent)] += count
 
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+def _prepare_query_search(query: search_models.CatalogQuery, params: dict[str, Any]) -> sql.Composable:
+    if query.match == search_models.CatalogQueryMatch.Exact:
+        params["query"] = query.query
+        return sql.SQL("{} = %(query)s").format(sql.Identifier(query.field.value))
+    elif query.match == search_models.CatalogQueryMatch.StartsWith:
+        params["query"] = query.query + "%"
+        if query.field == search_models.CatalogQueryField.Title:
+            # Insensitive search supported by pg_trgm for title
+            return sql.SQL("{} ILIKE %(query)s").format(sql.Identifier(query.field.value))
+        else:
+            # Sensitive search for slug (btree does not support ILIKE and slugs are all lowercase anyway)
+            return sql.SQL("{} LIKE %(query)s").format(sql.Identifier(query.field.value))
+    # The rest of operators only supported by title
+    elif query.match == search_models.CatalogQueryMatch.Words:
+        # This is doing tokenization inside the SQL server (to keep the index updated). We could move it to
+        # the python code at update/query time if it ever becomes a problem but for now, a single regex
+        # executed per query is not a problem.
+
+        # Remove zero-length words from the split
+        params["query"] = [word.lower() for word in SPLIT_REGEX.split(query.query) if word]
+        return sql.SQL("regexp_split_to_array(lower(title), '\\W') @> %(query)s")
+    elif query.match == search_models.CatalogQueryMatch.Fuzzy:
+        params["query"] = query.query
+        # Note: the operator is %>, We use %%> for psycopg escaping
+        return sql.SQL("title %%> %(query)s")
+    elif query.match == search_models.CatalogQueryMatch.EndsWith:
+        params["query"] = "%" + query.query
+        return sql.SQL("title ILIKE %(query)s")
+    elif query.match == search_models.CatalogQueryMatch.Contains:
+        params["query"] = "%" + query.query + "%"
+        return sql.SQL("title ILIKE %(query)s")
+    else:  # pragma: no cover
+        assert_never(query.match)
 
-
-
-
+
+def _convert_filter(expr: CatalogExpression, filter_params: dict[str, Any]) -> sql.Composable:
+    if expr.bool_and:
+        return _convert_boolean_op(expr.bool_and, "and", filter_params)
+    elif expr.bool_or:
+        return _convert_boolean_op(expr.bool_or, "or", filter_params)
+    elif expr.bool_not:
+        return sql.SQL("(NOT {})").format(_convert_filter(expr.bool_not, filter_params))
+    elif expr.date:
+        return _convert_date_filter(expr.date, filter_params)
+    elif expr.facet:
+        param_name = f"param{len(filter_params)}"
+        filter_params[param_name] = [expr.facet]
+        if expr.facet == "/n/s/PROCESSED":
+            # Optimization for the most common case, we know PROCESSED is a full label and can use the smaller labels index
+            # This is needed because PROCESSED is present in most catalog entries and PG is unlikely to use any index
+            # for it, falling back to executing the extract_facets function which can be slow
+            return sql.SQL("labels @> {}").format(sql.Placeholder(param_name))
         else:
-
+            return sql.SQL("extract_facets(labels) @> {}").format(sql.Placeholder(param_name))
+    elif expr.resource_id:
+        param_name = f"param{len(filter_params)}"
+        filter_params[param_name] = [expr.resource_id]
+        return sql.SQL("rid = {}").format(sql.Placeholder(param_name))
+    else:
+        return sql.SQL("")
 
-
-
-
-
-
+
+def _convert_boolean_op(
+    operands: list[CatalogExpression],
+    op: Literal["and"] | Literal["or"],
+    filter_params: dict[str, Any],
+) -> sql.Composable:
+    array_op = sql.SQL("@>" if op == "and" else "&&")
+    operands_sql: list[sql.Composable] = []
+    facets, nonfacets = _filter_operands(operands)
+    if facets:
+        param_name = f"param{len(filter_params)}"
+        if facets == ["/n/s/PROCESSED"]:
+            # Optimization for the most common case, we know PROCESSED is a full label and can use the smaller labels index
+            # This is needed because PROCESSED is present in most catalog entries and PG is unlikely to use any index
+            # for it, falling back to executing the extract_facets function which can be slow
+            operands_sql.append(sql.SQL("labels @> {}").format(sql.Placeholder(param_name)))
+        else:
+            operands_sql.append(
+                sql.SQL("extract_facets(labels) {} {}").format(array_op, sql.Placeholder(param_name))
+            )
+        filter_params[param_name] = facets
+    for nonfacet in nonfacets:
+        operands_sql.append(_convert_filter(nonfacet, filter_params))
+    return sql.SQL("({})").format(sql.SQL(f" {op.upper()} ").join(operands_sql))
+
+
+def _convert_date_filter(date: CatalogExpression.Date, filter_params: dict[str, Any]) -> sql.Composable:
+    if date.since and date.until:
+        since_name = f"param{len(filter_params)}"
+        filter_params[since_name] = date.since
+        until_name = f"param{len(filter_params)}"
+        filter_params[until_name] = date.until
+        return sql.SQL("{field} BETWEEN {since} AND {until}").format(
+            field=sql.Identifier(date.field),
+            since=sql.Placeholder(since_name),
+            until=sql.Placeholder(until_name),
         )
-
+    elif date.since:
+        since_name = f"param{len(filter_params)}"
+        filter_params[since_name] = date.since
+        return sql.SQL("{field} > {since}").format(
+            field=sql.Identifier(date.field), since=sql.Placeholder(since_name)
+        )
+    elif date.until:
+        until_name = f"param{len(filter_params)}"
+        filter_params[until_name] = date.until
+        return sql.SQL("{field} < {until}").format(
+            field=sql.Identifier(date.field), until=sql.Placeholder(until_name)
+        )
+    else:
+        raise ValueError(f"Invalid date operator")
+
+
+def translate_label(literal: str) -> str:
+    if len(literal) == 0:
+        raise InvalidQueryError("filters", "Invalid empty label")
+    if literal[0] != "/":
+        raise InvalidQueryError("filters", f"Invalid label. It must start with a `/`: {literal}")
+    return translate_alias_to_system_label(literal)
+
+
+def extract_facets(labels: list[str]) -> set[str]:
+    facets = set()
+    for label in labels:
+        parts = label.split("/")
+        facet = ""
+        for part in parts[1:]:
+            facet += f"/{part}"
+            facets.add(facet)
+    return facets
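
For reference, `extract_facets` is the expansion that feeds the `catalog_facets` table: every prefix of every label becomes its own row, which the `INSERT ... unnest(%(facets)s::text[])` statement in `PGCatalog.update()` then writes. A short usage sketch, assuming the module path implied by the new file layout:

```python
from nucliadb.common.catalog.pg import extract_facets

print(sorted(extract_facets(["/l/colors/red", "/n/s/PROCESSED"])))
# ['/l', '/l/colors', '/l/colors/red', '/n', '/n/s', '/n/s/PROCESSED']
```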