PyPI - nucliadb - Versions diffs - 6.5.0.post4426__py3-none-any.whl → 6.5.0.post4476__py3-none-any.whl - Mend

nucliadb 6.5.0.post4426__py3-none-any.whl → 6.5.0.post4476__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

migrations/0037_backfill_catalog_facets.py ADDED Viewed

@@ -0,0 +1,74 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+"""Migration #37
+Backfill catalog facets
+"""
+import logging
+from typing import cast
+from nucliadb.common.maindb.pg import PGDriver, PGTransaction
+from nucliadb.migrator.context import ExecutionContext
+logger = logging.getLogger(__name__)
+async def migrate(context: ExecutionContext) -> None:
+    """Backfill the `catalog_facets` table from the labels stored in `catalog`.
+    The catalog is walked in (kbid, rid) order in batches of BATCH_SIZE rows.
+    For every batch, one row per facet returned by the SQL function
+    `extract_facets(labels)` is inserted into `catalog_facets`, and the
+    transaction is committed before moving on to the next batch.
+    The pagination key is the last catalog row *scanned*, not the last row
+    that produced a facet, so a batch whose resources yield no facets does
+    not end the migration prematurely.
+    NOTE(review): `catalog_facets` has no unique constraint in the visible
+    schema, so re-running this after a partial run would presumably insert
+    duplicate facet rows for already-committed batches -- confirm with the
+    migrator's retry semantics.
+    """
+    driver = cast(PGDriver, context.kv_driver)
+    BATCH_SIZE = 1_000
+    async with driver.transaction() as txn:
+        txn = cast(PGTransaction, txn)
+        # Keyset pagination start: compares lower than any real (kbid, rid) pair
+        start_kbid = "00000000000000000000000000000000"
+        start_rid = "00000000000000000000000000000000"
+        while True:
+            async with txn.connection.cursor() as cur:
+                logger.info(f"Filling {BATCH_SIZE} catalog facets from {start_kbid}, {start_rid}")
+                # `rs` is the batch of catalog rows, `i` inserts their facets.
+                # PostgreSQL always runs data-modifying CTEs to completion, even
+                # though the final SELECT only reads `rs` to get the next key.
+                await cur.execute(
+                    """
+                        WITH rs AS (
+                            SELECT kbid, rid, labels FROM catalog
+                            WHERE (kbid = %(kbid)s AND rid > %(rid)s) OR kbid > %(kbid)s
+                            ORDER BY kbid, rid
+                            LIMIT %(batch)s
+                        ), i AS (
+                            INSERT INTO catalog_facets (kbid, rid, facet)
+                            SELECT kbid, rid, unnest(extract_facets(labels)) FROM rs
+                        )
+                        SELECT kbid, rid FROM rs ORDER BY kbid DESC, rid DESC LIMIT 1;
+                    """,
+                    {"kbid": start_kbid, "rid": start_rid, "batch": BATCH_SIZE},
+                )
+                # Set the key for next iteration; no row means the catalog is exhausted
+                results = await cur.fetchone()  # type: ignore
+                if results is None:
+                    break
+                (start_kbid, start_rid) = results
+                # Commit per batch so progress is persisted incrementally
+                await txn.commit()
+# Per-KB hook: intentionally a no-op stub, this migration does all of its work in `migrate`.
+async def migrate_kb(context: ExecutionContext, kbid: str) -> None: ...

migrations/pg/0001_bootstrap.py CHANGED Viewed

@@ -26,7 +26,7 @@ async def migrate(txn: PGTransaction) -> None:
         # IF NOT EXISTS just for compatibility with older install predating the migration system
         await cur.execute("""
             CREATE TABLE IF NOT EXISTS resources (
-                key TEXT PRIMARY KEY,
+                key TEXT COLLATE ucs_basic PRIMARY KEY,
                 value BYTEA
             );
         """)

migrations/pg/0008_catalog_facets.py ADDED Viewed

@@ -0,0 +1,43 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+from nucliadb.common.maindb.pg import PGTransaction
+async def migrate(txn: PGTransaction) -> None:
+    """Create the `catalog_facets` table and its two indexes.
+    Each row holds one facet of one catalog entry, identified by (kbid, rid).
+    Rows reference `catalog (kbid, rid)` with ON DELETE CASCADE, so deleting a
+    catalog entry also removes its facet rows. The statements run on a cursor
+    of the connection owned by `txn`.
+    """
+    async with txn.connection.cursor() as cur:
+        await cur.execute(
+            """
+            CREATE TABLE catalog_facets (
+                id BIGINT PRIMARY KEY GENERATED ALWAYS AS IDENTITY,
+                kbid UUID,
+                rid UUID,
+                facet TEXT COLLATE ucs_basic,
+                FOREIGN KEY (kbid, rid) REFERENCES catalog (kbid, rid) ON DELETE CASCADE
+            );
+            -- For FK checks
+            CREATE INDEX ON catalog_facets(kbid, rid);
+            -- Best for per-facet aggregation, also used by search with facet filter
+            CREATE INDEX ON catalog_facets(kbid, facet);
+            """
+        )

migrations/pg/0009_extract_facets_safety.py ADDED Viewed

@@ -0,0 +1,26 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+from nucliadb.common.maindb.pg import PGTransaction
+async def migrate(txn: PGTransaction) -> None:
+    """Mark the SQL function `extract_facets(text[])` as PARALLEL SAFE.
+    NOTE(review): presumably done so PostgreSQL may use parallel plans for
+    queries calling the function -- the motivation is not stated in this file.
+    """
+    async with txn.connection.cursor() as cur:
+        await cur.execute("ALTER FUNCTION extract_facets(text[]) PARALLEL SAFE;")

nucliadb/ingest/orm/processor/pgcatalog.py CHANGED Viewed

@@ -40,6 +40,17 @@ def pgcatalog_enabled(kbid):
     return isinstance(get_driver(), PGDriver)
+def extract_facets(labels):
+    """Return the set of every hierarchical prefix of the given labels.
+    Each label is split on "/" and the first segment (empty for labels that
+    start with "/") is dropped. For example "/l/set/label" contributes
+    "/l", "/l/set" and "/l/set/label".
+    """
+    prefixes = set()
+    for label in labels:
+        segments = label.split("/")
+        # `end` walks over the segments after the first one, growing the prefix by one each time
+        for end in range(2, len(segments) + 1):
+            prefixes.add("/" + "/".join(segments[1:end]))
+    return prefixes
 @observer.wrap({"type": "update"})
 async def pgcatalog_update(txn: Transaction, kbid: str, resource: Resource, index_message: IndexMessage):
     if not pgcatalog_enabled(kbid):
@@ -76,6 +87,21 @@ async def pgcatalog_update(txn: Transaction, kbid: str, resource: Resource, inde
                 "slug": resource.basic.slug,
             },
         )
+        await cur.execute(
+            "DELETE FROM catalog_facets WHERE kbid = %(kbid)s AND rid = %(rid)s",
+            {
+                "kbid": resource.kb.kbid,
+                "rid": resource.uuid,
+            },
+        )
+        await cur.execute(
+            "INSERT INTO catalog_facets (kbid, rid, facet) SELECT %(kbid)s AS kbid, %(rid)s AS rid, unnest(%(facets)s::text[]) AS facet",
+            {
+                "kbid": resource.kb.kbid,
+                "rid": resource.uuid,
+                "facets": list(extract_facets(index_message.labels)),
+            },
+        )
 @observer.wrap({"type": "delete"})

nucliadb/search/api/v1/catalog.py CHANGED Viewed

@@ -27,15 +27,13 @@ from pydantic import ValidationError
 from nucliadb.common.datamanagers.exceptions import KnowledgeBoxNotFound
 from nucliadb.common.exceptions import InvalidQueryError
-from nucliadb.common.maindb.pg import PGDriver
-from nucliadb.common.maindb.utils import get_driver
 from nucliadb.models.responses import HTTPClientError
 from nucliadb.search import logger
 from nucliadb.search.api.v1.router import KB_PREFIX, api
 from nucliadb.search.api.v1.utils import fastapi_query
 from nucliadb.search.search import cache
 from nucliadb.search.search.merge import fetch_resources
-from nucliadb.search.search.pgcatalog import pgcatalog_search
+from nucliadb.search.search.pgcatalog import pgcatalog_facets, pgcatalog_search
 from nucliadb.search.search.query_parser.parsers import parse_catalog
 from nucliadb.search.search.utils import (
     maybe_log_request_payload,
@@ -45,6 +43,7 @@ from nucliadb_models.filters import CatalogFilterExpression
 from nucliadb_models.metadata import ResourceProcessingStatus
 from nucliadb_models.resource import NucliaDBRoles
 from nucliadb_models.search import (
+    CatalogFacetsRequest,
     CatalogRequest,
     CatalogResponse,
     KnowledgeboxSearchResults,
@@ -157,9 +156,6 @@ async def catalog(
     returns bm25 results on titles and it does not support vector search.
     It is useful for listing resources in a knowledge box.
     """
-    if not pgcatalog_enabled():  # pragma: no cover
-        return HTTPClientError(status_code=501, detail="PG driver is needed for catalog search")
     maybe_log_request_payload(kbid, "/catalog", item)
     start_time = time()
     try:
@@ -196,5 +192,15 @@ async def catalog(
             )
-def pgcatalog_enabled():
-    return isinstance(get_driver(), PGDriver)
+# POST /{KB_PREFIX}/{kbid}/catalog/facets
+# Returns a mapping of facet -> count for the knowledge box, as computed by
+# `pgcatalog_facets`. Requires the READER role; the route is registered with
+# include_in_schema=False, so it is left out of the generated API schema.
+@api.post(
+    f"/{KB_PREFIX}/{{kbid}}/catalog/facets",
+    status_code=200,
+    response_model=dict[str, int],
+    response_model_exclude_unset=True,
+    tags=["Search"],
+    include_in_schema=False,
+)
+@requires(NucliaDBRoles.READER)
+@version(1)
+async def catalog_facets(request: Request, kbid: str, item: CatalogFacetsRequest) -> dict[str, int]:
+    # `request` is not used in the body; the work is delegated entirely to the PG catalog layer
+    return await pgcatalog_facets(kbid, item)

nucliadb/search/search/pgcatalog.py CHANGED Viewed

@@ -22,19 +22,15 @@ import logging
 from collections import defaultdict
 from typing import Any, Literal, Union, cast
-from psycopg.rows import dict_row
+from psycopg import AsyncCursor, sql
+from psycopg.rows import DictRow, dict_row
 from nucliadb.common.maindb.pg import PGDriver
 from nucliadb.common.maindb.utils import get_driver
 from nucliadb.search.search.query_parser.models import CatalogExpression, CatalogQuery
 from nucliadb_models import search as search_models
 from nucliadb_models.labels import translate_system_to_alias_label
-from nucliadb_models.search import (
-    ResourceResult,
-    Resources,
-    SortField,
-    SortOrder,
-)
+from nucliadb_models.search import CatalogFacetsRequest, ResourceResult, Resources, SortField, SortOrder
 from nucliadb_telemetry import metrics
 from .filters import translate_label
@@ -55,65 +51,87 @@ def _filter_operands(operands: list[CatalogExpression]) -> tuple[list[str], list
     return facets, nonfacets
-def _convert_filter(expr: CatalogExpression, filter_params: dict[str, Any]) -> str:
+def _convert_filter(expr: CatalogExpression, filter_params: dict[str, Any]) -> sql.Composable:
     if expr.bool_and:
         return _convert_boolean_op(expr.bool_and, "and", filter_params)
     elif expr.bool_or:
         return _convert_boolean_op(expr.bool_or, "or", filter_params)
     elif expr.bool_not:
-        return f"(NOT {_convert_filter(expr.bool_not, filter_params)})"
+        return sql.SQL("(NOT {})").format(_convert_filter(expr.bool_not, filter_params))
     elif expr.date:
         return _convert_date_filter(expr.date, filter_params)
     elif expr.facet:
         param_name = f"param{len(filter_params)}"
         filter_params[param_name] = [expr.facet]
-        return f"extract_facets(labels) @> %({param_name})s"
+        if expr.facet == "/n/s/PROCESSED":
+            # Optimization for the most common case, we know PROCESSED is a full label and can use the smaller labels index
+            # This is needed because PROCESSED is present in most catalog entries and PG is unlikely to use any index
+            # for it, falling back to executing the extract_facets function which can be slow
+            return sql.SQL("labels @> {}").format(sql.Placeholder(param_name))
+        else:
+            return sql.SQL("extract_facets(labels) @> {}").format(sql.Placeholder(param_name))
     elif expr.resource_id:
         param_name = f"param{len(filter_params)}"
         filter_params[param_name] = [expr.resource_id]
-        return f"rid = %({param_name})s"
+        return sql.SQL("rid = {}").format(sql.Placeholder(param_name))
     else:
-        return ""
+        return sql.SQL("")
 def _convert_boolean_op(
     operands: list[CatalogExpression],
     op: Union[Literal["and"], Literal["or"]],
     filter_params: dict[str, Any],
-) -> str:
-    array_op = "@>" if op == "and" else "&&"
-    sql = []
+) -> sql.Composable:
+    array_op = sql.SQL("@>" if op == "and" else "&&")
+    operands_sql: list[sql.Composable] = []
     facets, nonfacets = _filter_operands(operands)
     if facets:
         param_name = f"param{len(filter_params)}"
+        if facets == ["/n/s/PROCESSED"]:
+            # Optimization for the most common case, we know PROCESSED is a full label and can use the smaller labels index
+            # This is needed because PROCESSED is present in most catalog entries and PG is unlikely to use any index
+            # for it, falling back to executing the extract_facets function which can be slow
+            operands_sql.append(sql.SQL("labels @> {}").format(sql.Placeholder(param_name)))
+        else:
+            operands_sql.append(
+                sql.SQL("extract_facets(labels) {} {}").format(array_op, sql.Placeholder(param_name))
+            )
         filter_params[param_name] = facets
-        sql.append(f"extract_facets(labels) {array_op} %({param_name})s")
     for nonfacet in nonfacets:
-        sql.append(_convert_filter(nonfacet, filter_params))
-    return "(" + f" {op.upper()} ".join(sql) + ")"
+        operands_sql.append(_convert_filter(nonfacet, filter_params))
+    return sql.SQL("({})").format(sql.SQL(f" {op.upper()} ").join(operands_sql))
-def _convert_date_filter(date: CatalogExpression.Date, filter_params: dict[str, Any]) -> str:
+def _convert_date_filter(date: CatalogExpression.Date, filter_params: dict[str, Any]) -> sql.Composable:
     if date.since and date.until:
         since_name = f"param{len(filter_params)}"
         filter_params[since_name] = date.since
         until_name = f"param{len(filter_params)}"
         filter_params[until_name] = date.until
-        return f"{date.field} BETWEEN %({since_name})s AND %({until_name})s"
+        return sql.SQL("{field} BETWEEN {since} AND {until}").format(
+            field=sql.Identifier(date.field),
+            since=sql.Placeholder(since_name),
+            until=sql.Placeholder(until_name),
+        )
     elif date.since:
         since_name = f"param{len(filter_params)}"
         filter_params[since_name] = date.since
-        return f"{date.field} > %({since_name})s"
+        return sql.SQL("{field} > {since}").format(
+            field=sql.Identifier(date.field), since=sql.Placeholder(since_name)
+        )
     elif date.until:
         until_name = f"param{len(filter_params)}"
         filter_params[until_name] = date.until
-        return f"{date.field} < %({until_name})s"
+        return sql.SQL("{field} < {until}").format(
+            field=sql.Identifier(date.field), until=sql.Placeholder(until_name)
+        )
     else:
         raise ValueError(f"Invalid date operator")
-def _prepare_query_filters(catalog_query: CatalogQuery) -> tuple[str, dict[str, Any]]:
-    filter_sql = ["kbid = %(kbid)s"]
+def _prepare_query_filters(catalog_query: CatalogQuery) -> tuple[sql.Composable, dict[str, Any]]:
+    filter_sql: list[sql.Composable] = [sql.SQL("kbid = %(kbid)s")]
     filter_params: dict[str, Any] = {"kbid": catalog_query.kbid}
     if catalog_query.query and catalog_query.query.query:
@@ -123,47 +141,50 @@ def _prepare_query_filters(catalog_query: CatalogQuery) -> tuple[str, dict[str,
         filter_sql.append(_convert_filter(catalog_query.filters, filter_params))
     return (
-        f"SELECT * FROM catalog WHERE {' AND '.join(filter_sql)}",
+        sql.SQL("SELECT * FROM catalog WHERE {}").format(sql.SQL(" AND ").join(filter_sql)),
         filter_params,
     )
-def _prepare_query_search(query: search_models.CatalogQuery, params: dict[str, Any]) -> str:
+def _prepare_query_search(query: search_models.CatalogQuery, params: dict[str, Any]) -> sql.Composable:
     if query.match == search_models.CatalogQueryMatch.Exact:
         params["query"] = query.query
-        return f"{query.field.value} = %(query)s"
+        return sql.SQL("{} = %(query)s").format(sql.Identifier(query.field.value))
     elif query.match == search_models.CatalogQueryMatch.StartsWith:
         params["query"] = query.query + "%"
         if query.field == search_models.CatalogQueryField.Title:
             # Insensitive search supported by pg_trgm for title
-            return f"{query.field.value} ILIKE %(query)s"
+            return sql.SQL("{} ILIKE %(query)s").format(sql.Identifier(query.field.value))
         else:
             # Sensitive search for slug (btree does not support ILIKE and slugs are all lowercase anyway)
-            return f"{query.field.value} LIKE %(query)s"
+            return sql.SQL("{} LIKE %(query)s").format(sql.Identifier(query.field.value))
     # The rest of operators only supported by title
     elif query.match == search_models.CatalogQueryMatch.Words:
         # This is doing tokenization inside the SQL server (to keep the index updated). We could move it to
         # the python code at update/query time if it ever becomes a problem but for now, a single regex
         # executed per query is not a problem.
         params["query"] = query.query
-        return "regexp_split_to_array(lower(title), '\\W') @> regexp_split_to_array(lower(%(query)s), '\\W')"
+        return sql.SQL(
+            "regexp_split_to_array(lower(title), '\\W') @> regexp_split_to_array(lower(%(query)s), '\\W')"
+        )
     elif query.match == search_models.CatalogQueryMatch.Fuzzy:
         params["query"] = query.query
         # Note: the operator is %>, We use %%> for psycopg escaping
-        return "title %%> %(query)s"
+        return sql.SQL("title %%> %(query)s")
     elif query.match == search_models.CatalogQueryMatch.EndsWith:
         params["query"] = "%" + query.query
-        return "title ILIKE %(query)s"
+        return sql.SQL("title ILIKE %(query)s")
     elif query.match == search_models.CatalogQueryMatch.Contains:
         params["query"] = "%" + query.query + "%"
-        return "title ILIKE %(query)s"
+        return sql.SQL("title ILIKE %(query)s")
     else:  # pragma: nocover
         # This is a trick so mypy generates an error if this branch can be reached,
         # that is, if we are missing some ifs
         _a: int = "a"
+        return sql.SQL("")
-def _prepare_query(catalog_query: CatalogQuery) -> tuple[str, dict[str, Any]]:
+def _prepare_query(catalog_query: CatalogQuery) -> tuple[sql.Composed, dict[str, Any]]:
     # Base query with all the filters
     query, filter_params = _prepare_query_filters(catalog_query)
@@ -184,11 +205,11 @@ def _prepare_query(catalog_query: CatalogQuery) -> tuple[str, dict[str, Any]]:
         else:
             order_dir = "DESC"
-        query += f" ORDER BY {order_field} {order_dir}"
+        query += sql.SQL(" ORDER BY {} {}").format(sql.Identifier(order_field), sql.SQL(order_dir))
     # Pagination
     offset = catalog_query.page_size * catalog_query.page_number
-    query += f" LIMIT %(page_size)s OFFSET %(offset)s"
+    query += sql.SQL(" LIMIT %(page_size)s OFFSET %(offset)s")
     filter_params["page_size"] = catalog_query.page_size
     filter_params["offset"] = offset
@@ -213,40 +234,18 @@ async def pgcatalog_search(catalog_query: CatalogQuery) -> Resources:
                 tmp_facets: dict[str, dict[str, int]] = {
                     translate_label(f): defaultdict(int) for f in catalog_query.faceted
                 }
-                facet_filters = " OR ".join(f"label LIKE '{f}/%%'" for f in tmp_facets.keys())
-                for facet in tmp_facets.keys():
-                    if not (
-                        facet.startswith("/n/s") or facet.startswith("/n/i") or facet.startswith("/l")
-                    ):
-                        logger.warning(
-                            f"Unexpected facet used at catalog: {facet}, kbid={catalog_query.kbid}"
-                        )
-                await cur.execute(
-                    f"SELECT label, COUNT(*) FROM (SELECT unnest(labels) AS label FROM ({query}) fc) nl WHERE ({facet_filters}) GROUP BY 1 ORDER BY 1",
-                    query_params,
-                )
-                for row in await cur.fetchall():
-                    label = row["label"]
-                    label_parts = label.split("/")
-                    parent = "/".join(label_parts[:-1])
-                    count = row["count"]
-                    if parent in tmp_facets:
-                        tmp_facets[parent][translate_system_to_alias_label(label)] = count
-                    # No need to get recursive because our facets are at most 3 levels deep (e.g: /l/set/label)
-                    if len(label_parts) >= 3:
-                        grandparent = "/".join(label_parts[:-2])
-                        if grandparent in tmp_facets:
-                            tmp_facets[grandparent][translate_system_to_alias_label(parent)] += count
+                if catalog_query.filters is None:
+                    await _faceted_search_unfiltered(cur, catalog_query, tmp_facets)
+                else:
+                    await _faceted_search_filtered(cur, catalog_query, tmp_facets, query, query_params)
                 facets = {translate_system_to_alias_label(k): v for k, v in tmp_facets.items()}
         # Totals
         with observer({"op": "totals"}):
             await cur.execute(
-                f"SELECT COUNT(*) FROM ({query}) fc",
+                sql.SQL("SELECT COUNT(*) FROM ({}) fc").format(query),
                 query_params,
             )
             total = (await cur.fetchone())["count"]  # type: ignore
@@ -276,3 +275,115 @@ async def pgcatalog_search(catalog_query: CatalogQuery) -> Resources:
         next_page=(catalog_query.page_size * catalog_query.page_number + len(data) < total),
         min_score=0,
     )
+async def _faceted_search_unfiltered(
+    cur: AsyncCursor[DictRow], catalog_query: CatalogQuery, tmp_facets: dict[str, dict[str, int]]
+):
+    """Fill `tmp_facets` with facet counts read from the `catalog_facets` table.
+    Used when the catalog query has no filters, so counts can be aggregated
+    for the whole knowledge box (`catalog_query.kbid`) straight from
+    `catalog_facets`. `tmp_facets` maps each requested parent facet to a dict
+    that is updated in place with `alias label -> count` for the direct
+    children of that parent.
+    Three strategies are used to narrow the SQL query:
+    - up to 5 requested facets: one LIKE condition per requested prefix;
+    - only `/l` and `/n/i` facets requested: a fixed two-prefix condition;
+    - otherwise: fetch every facet of the KB and filter in Python.
+    """
+    if not tmp_facets:
+        # Nothing was requested: there is nothing to fill, and an empty list of
+        # prefix conditions would otherwise produce a dangling "AND" in the SQL
+        return
+    facet_params: dict[str, Any] = {}
+    facet_sql: sql.Composable
+    if len(tmp_facets) <= 5:
+        # Asking for few facets, strictly filter to what we need in the query
+        prefixes_sql = []
+        for cnt, prefix in enumerate(tmp_facets.keys()):
+            # RIGHT(facet, -n) drops the first n characters ("<prefix>/"); requiring no
+            # further "/" in the remainder keeps only direct children of the prefix
+            prefixes_sql.append(
+                sql.SQL("(facet LIKE {} AND POSITION('/' IN RIGHT(facet, {})) = 0)").format(
+                    sql.Placeholder(f"facet_{cnt}"), sql.Placeholder(f"facet_len_{cnt}")
+                )
+            )
+            facet_params[f"facet_{cnt}"] = f"{prefix}/%"
+            facet_params[f"facet_len_{cnt}"] = -(len(prefix) + 1)
+        # The OR group must be parenthesized: AND binds tighter than OR, so without
+        # the parentheses only the first prefix would be restricted to this kbid
+        facet_sql = sql.SQL("AND ({})").format(sql.SQL(" OR ").join(prefixes_sql))
+    elif all((facet.startswith("/l") or facet.startswith("/n/i") for facet in tmp_facets.keys())):
+        # Special case for the catalog query, which can have many facets asked for
+        # Filter for the categories (icon and labels) in the query, filter the rest in the code below
+        facet_sql = sql.SQL("AND (facet LIKE '/l/%%' OR facet like '/n/i/%%')")
+    else:
+        # Worst case: ask for all facets and filter here. This is faster than applying lots of filters
+        facet_sql = sql.SQL("")
+    await cur.execute(
+        sql.SQL(
+            "SELECT facet, COUNT(*) FROM catalog_facets WHERE kbid = %(kbid)s {} GROUP BY facet"
+        ).format(facet_sql),
+        {"kbid": catalog_query.kbid, **facet_params},
+    )
+    # Only keep the facets we asked for
+    for row in await cur.fetchall():
+        facet = row["facet"]
+        facet_parts = facet.split("/")
+        parent = "/".join(facet_parts[:-1])
+        if parent in tmp_facets:
+            tmp_facets[parent][translate_system_to_alias_label(facet)] = row["count"]
+async def _faceted_search_filtered(
+    cur: AsyncCursor[DictRow],
+    catalog_query: CatalogQuery,
+    tmp_facets: dict[str, dict[str, int]],
+    query: sql.Composable,
+    query_params: dict[str, Any],
+):
+    """Fill `tmp_facets` with label counts computed over the filtered catalog query.
+    `query` (with `query_params`) selects the matching catalog rows; their
+    labels are unnested, restricted to the requested facet prefixes and
+    counted. Counts are written in place into `tmp_facets`, both for the
+    direct parent of each label and, accumulated, for its grandparent.
+    """
+    like_params = {}
+    like_clauses = []
+    for idx, requested in enumerate(tmp_facets):
+        placeholder = f"facet_{idx}"
+        like_clauses.append(sql.SQL("label LIKE {}").format(sql.Placeholder(placeholder)))
+        like_params[placeholder] = f"{requested}/%"
+    # Facets outside the known families are still queried, but reported in the logs
+    for facet in tmp_facets:
+        if not facet.startswith(("/n/s", "/n/i", "/l")):
+            logger.warning(f"Unexpected facet used at catalog: {facet}, kbid={catalog_query.kbid}")
+    counting_query = sql.SQL(
+        "SELECT label, COUNT(*) FROM (SELECT unnest(labels) AS label FROM ({query}) fc) nl WHERE ({facet_filters}) GROUP BY 1 ORDER BY 1"
+    ).format(query=query, facet_filters=sql.SQL(" OR ").join(like_clauses))
+    await cur.execute(counting_query, {**query_params, **like_params})
+    for row in await cur.fetchall():
+        label = row["label"]
+        count = row["count"]
+        segments = label.split("/")
+        parent = "/".join(segments[:-1])
+        if parent in tmp_facets:
+            tmp_facets[parent][translate_system_to_alias_label(label)] = count
+        # No need to get recursive because our facets are at most 3 levels deep (e.g: /l/set/label)
+        if len(segments) < 3:
+            continue
+        grandparent = "/".join(segments[:-2])
+        if grandparent in tmp_facets:
+            tmp_facets[grandparent][translate_system_to_alias_label(parent)] += count
+@observer.wrap({"op": "catalog_facets"})
+async def pgcatalog_facets(kbid: str, request: CatalogFacetsRequest) -> dict[str, int]:
+    """Return `facet -> count` for a knowledge box, read from `catalog_facets`.
+    When `request.prefixes` is non-empty, only facets starting with one of the
+    given prefixes are counted. A prefix with a `depth` additionally requires
+    the facet's "/"-separated segment number
+    `len(prefix.split("/")) + depth + 1` to be empty, which bounds how many
+    levels below the prefix are returned. With no prefixes, every facet of the
+    knowledge box is counted.
+    """
+    async with _pg_driver()._get_connection() as conn, conn.cursor() as cur:
+        prefix_filters: list[sql.Composable] = []
+        prefix_params: dict[str, Any] = {}
+        for cnt, prefix in enumerate(request.prefixes):
+            prefix_sql = sql.SQL("facet LIKE {}").format(sql.Placeholder(f"prefix{cnt}"))
+            prefix_params[f"prefix{cnt}"] = f"{prefix.prefix}%"
+            if prefix.depth is not None:
+                prefix_parts = len(prefix.prefix.split("/"))
+                # SPLIT_PART returns '' for a segment index past the end of the facet
+                depth_sql = sql.SQL("SPLIT_PART(facet, '/', {}) = ''").format(
+                    sql.Placeholder(f"depth{cnt}")
+                )
+                prefix_params[f"depth{cnt}"] = prefix_parts + prefix.depth + 1
+                prefix_sql = sql.SQL("({} AND {})").format(prefix_sql, depth_sql)
+            prefix_filters.append(prefix_sql)
+        filter_sql: sql.Composable
+        if prefix_filters:
+            # Parenthesize the OR group: AND binds tighter than OR, so without the
+            # parentheses only the first prefix would be restricted to this kbid
+            filter_sql = sql.SQL("AND ({})").format(sql.SQL(" OR ").join(prefix_filters))
+        else:
+            filter_sql = sql.SQL("")
+        await cur.execute(
+            sql.SQL(
+                "SELECT facet, COUNT(*) FROM catalog_facets WHERE kbid = %(kbid)s {} GROUP BY facet"
+            ).format(filter_sql),
+            {"kbid": kbid, **prefix_params},
+        )
+        return {k: v for k, v in await cur.fetchall()}

{nucliadb-6.5.0.post4426.dist-info → nucliadb-6.5.0.post4476.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nucliadb
-Version: 6.5.0.post4426
+Version: 6.5.0.post4476
 Summary: NucliaDB
 Author-email: Nuclia <nucliadb@nuclia.com>
 License-Expression: AGPL-3.0-or-later
@@ -19,11 +19,11 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3 :: Only
 Requires-Python: <4,>=3.9
 Description-Content-Type: text/markdown
-Requires-Dist: nucliadb-telemetry[all]>=6.5.0.post4426
-Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.5.0.post4426
-Requires-Dist: nucliadb-protos>=6.5.0.post4426
-Requires-Dist: nucliadb-models>=6.5.0.post4426
-Requires-Dist: nidx-protos>=6.5.0.post4426
+Requires-Dist: nucliadb-telemetry[all]>=6.5.0.post4476
+Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.5.0.post4476
+Requires-Dist: nucliadb-protos>=6.5.0.post4476
+Requires-Dist: nucliadb-models>=6.5.0.post4476
+Requires-Dist: nidx-protos>=6.5.0.post4476
 Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
 Requires-Dist: nuclia-models>=0.24.2
 Requires-Dist: uvicorn[standard]
@@ -35,8 +35,8 @@ Requires-Dist: aiofiles>=0.8.0
 Requires-Dist: psutil>=5.9.7
 Requires-Dist: types-psutil>=5.9.5.17
 Requires-Dist: types-aiofiles>=0.8.3
-Requires-Dist: protobuf>=5
-Requires-Dist: types-protobuf>=5
+Requires-Dist: protobuf<6,>=5
+Requires-Dist: types-protobuf<6,>=5
 Requires-Dist: grpcio>=1.71.0
 Requires-Dist: grpcio-health-checking>=1.71.0
 Requires-Dist: grpcio-channelz>=1.71.0

{nucliadb-6.5.0.post4426.dist-info → nucliadb-6.5.0.post4476.dist-info}/RECORD RENAMED Viewed

@@ -32,14 +32,17 @@ migrations/0033_rollover_nidx_relation_2.py,sha256=9etpqNLVS3PA14qIdsdhorReZxenD
 migrations/0034_rollover_nidx_texts_3.py,sha256=t19QtWUgHxmTaBPoR1DooAby2IYmkLTQj8qu1z2XkFc,1452
 migrations/0035_rollover_nidx_texts_4.py,sha256=W0_AUd01pjMpYMDC3yqF6HzDLgcnnPprL80kfyb1WZI,1187
 migrations/0036_backfill_catalog_slug.py,sha256=mizRM-HfPswKq4iEmqofu4kIT6Gd97ruT3qhb257vZk,2954
+migrations/0037_backfill_catalog_facets.py,sha256=KAf3VKbKePw7ykDnJi47LyJ7pK1JwYkwMxrsXUnbt9g,2788
 migrations/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
-migrations/pg/0001_bootstrap.py,sha256=Fsqkeof50m7fKiJN05kmNEMwiKDlOrAgcAS5sLLkutA,1256
+migrations/pg/0001_bootstrap.py,sha256=3O_P17l0d0h48nebN6VQLXzM_B7S7zvDpaLR0koVgWE,1274
 migrations/pg/0002_catalog.py,sha256=Rsleecu351Ty19kYZgOpqX5G3MEAY8nMxCJrAeuS2Mw,1690
 migrations/pg/0003_catalog_kbid_index.py,sha256=uKq_vtnuf73GVf0mtl2rhzdk_czAoEU1UdiVKVZpA0M,1044
 migrations/pg/0004_catalog_facets.py,sha256=FJFASHjfEHG3sNve9BP2HnnLO4xr7dnR6Qpctnmt4LE,2180
 migrations/pg/0005_purge_tasks_index.py,sha256=3mtyFgpcK0QQ_NONYay7V9xICijCLNkyTPuoc0PBjRg,1139
 migrations/pg/0006_catalog_title_indexes.py,sha256=n2OGxwE4oeCwHAYaxBkja4t10BmwTjZ2IoCyOdjEBSc,1710
 migrations/pg/0007_catalog_slug.py,sha256=mArzZCBO-RD5DkWxRIyDKgEzrnAcis1TOGvSNUe7Kgg,1150
+migrations/pg/0008_catalog_facets.py,sha256=dxIUdHJHtI_Gyk2dpP7tjHEnL2iPzAufi6ajYm2FVMI,1595
+migrations/pg/0009_extract_facets_safety.py,sha256=k9Appx7ipp3wDyLy70qgw9oLjN7N6BEadE-N5Fhan-4,1066
 migrations/pg/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
 nucliadb/__init__.py,sha256=_abCmDJ_0ku483Os4UAjPX7Nywm39cQgAV_DiyjsKeQ,891
 nucliadb/health.py,sha256=UIxxA4oms4HIsCRZM_SZsdkIZIlgzmOxw-qSHLlWuak,3465
@@ -164,7 +167,7 @@ nucliadb/ingest/orm/utils.py,sha256=fCQRuyecgqhaY7mcBG93oaXMkzkKb9BFjOcy4-ZiSNw,
 nucliadb/ingest/orm/processor/__init__.py,sha256=Aqd9wCNTvggkMkCY3WvoI8spdr94Jnqk-0iq9XpLs18,922
 nucliadb/ingest/orm/processor/auditing.py,sha256=TeYhXGJRyQ7ROytbb2u8R0fIh_FYi3HgTu3S1ribY3U,4623
 nucliadb/ingest/orm/processor/data_augmentation.py,sha256=v-pj4GbBWSuO8dQyahs5UDr5ghsyfhCZDS0ftKd6ZYc,5179
-nucliadb/ingest/orm/processor/pgcatalog.py,sha256=Zh6s0gj_bwDKPBXSs61jlMKJ6XP-dLnPGbrMGD6RHcM,3195
+nucliadb/ingest/orm/processor/pgcatalog.py,sha256=GpzQv0_iWTHbM90J0rAz_QIh_TMv1XbghyDgs8tk_8M,4014
 nucliadb/ingest/orm/processor/processor.py,sha256=jaEBwbv--WyoC8zcdxWAyF0dAzVA5crVDJl56Bqv1eI,31444
 nucliadb/ingest/orm/processor/sequence_manager.py,sha256=uqEphtI1Ir_yk9jRl2gPf7BlzzXWovbARY5MNZSBI_8,1704
 nucliadb/ingest/service/__init__.py,sha256=LHQFUkdmNBOWqBG0Md9sMMI7g5TQZ-hLAnhw6ZblrJg,2002
@@ -218,7 +221,7 @@ nucliadb/search/utilities.py,sha256=9SsRDw0rJVXVoLBfF7rBb6q080h-thZc7u8uRcTiBeY,
 nucliadb/search/api/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
 nucliadb/search/api/v1/__init__.py,sha256=DH16OYnw9jQ38OpKlmdXeoq2j40ZPXZRtGvClKOkMhw,1239
 nucliadb/search/api/v1/ask.py,sha256=b4tz33HNsfT5DXv_2DMc_jirnFsHuobreWkbAKkzj5o,5337
-nucliadb/search/api/v1/catalog.py,sha256=3SqLgwFkFFY8x-xBruHQaZ0EGpf7oKbSj-_PnobV68E,7747
+nucliadb/search/api/v1/catalog.py,sha256=7yyG46Zsaqvuut9Da-LTl0KcWgo7n5lbEhiTXslyvwM,7865
 nucliadb/search/api/v1/feedback.py,sha256=kNLc4dHz2SXHzV0PwC1WiRAwY88fDptPcP-kO0q-FrQ,2620
 nucliadb/search/api/v1/find.py,sha256=iMjyq4y0JOMC_x1B8kUfVdkCoc9G9Ark58kPLLY4HDw,10824
 nucliadb/search/api/v1/graph.py,sha256=gthqxCOn9biE6D6s93jRGLglk0ono8U7OyS390kWiI8,4178
@@ -251,7 +254,7 @@ nucliadb/search/search/ingestion_agents.py,sha256=IK6yOPEF9rST_uoqspdVdPk0pldjDh
 nucliadb/search/search/merge.py,sha256=XiRBsxhYPshPV7lZXD-9E259KZOPIf4I2tKosY0lPo4,22470
 nucliadb/search/search/metrics.py,sha256=3I6IN0qDSmqIvUaWJmT3rt-Jyjs6LcvnKI8ZqCiuJPY,3501
 nucliadb/search/search/paragraphs.py,sha256=pNAEiYqJGGUVcEf7xf-PFMVqz0PX4Qb-WNG-_zPGN2o,7799
-nucliadb/search/search/pgcatalog.py,sha256=QtgArjoM-dW_B1oO0aXqp5au7GlLG8jAct9jevUHatw,10997
+nucliadb/search/search/pgcatalog.py,sha256=O_nRjSJf1Qc-XorVwcNlsDOftzy_zQLLfagkjU4YmSA,16718
 nucliadb/search/search/predict_proxy.py,sha256=cuD_sfM3RLdEoQaanRz0CflO6nKVGGKPzoFA17shb_w,8647
 nucliadb/search/search/query.py,sha256=0qIQdt548L3jtKOyKo06aGJ73SLBxAW3N38_Hc1M3Uw,11528
 nucliadb/search/search/rank_fusion.py,sha256=xZtXhbmKb_56gs73u6KkFm2efvTATOSMmpOV2wrAIqE,9613
@@ -372,8 +375,8 @@ nucliadb/writer/tus/local.py,sha256=7jYa_w9b-N90jWgN2sQKkNcomqn6JMVBOVeDOVYJHto,
 nucliadb/writer/tus/s3.py,sha256=vF0NkFTXiXhXq3bCVXXVV-ED38ECVoUeeYViP8uMqcU,8357
 nucliadb/writer/tus/storage.py,sha256=ToqwjoYnjI4oIcwzkhha_MPxi-k4Jk3Lt55zRwaC1SM,2903
 nucliadb/writer/tus/utils.py,sha256=MSdVbRsRSZVdkaum69_0wku7X3p5wlZf4nr6E0GMKbw,2556
-nucliadb-6.5.0.post4426.dist-info/METADATA,sha256=fRo_rQ3D5zAGctuqOfk22MzKACI4nZ8mijFy-JSGaT0,4152
-nucliadb-6.5.0.post4426.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-nucliadb-6.5.0.post4426.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
-nucliadb-6.5.0.post4426.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
-nucliadb-6.5.0.post4426.dist-info/RECORD,,
+nucliadb-6.5.0.post4476.dist-info/METADATA,sha256=ysG9rsv_jshf_4lJLNHXGBHLm8Br-jWbUKDgRymc9jY,4158
+nucliadb-6.5.0.post4476.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+nucliadb-6.5.0.post4476.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
+nucliadb-6.5.0.post4476.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
+nucliadb-6.5.0.post4476.dist-info/RECORD,,

{nucliadb-6.5.0.post4426.dist-info → nucliadb-6.5.0.post4476.dist-info}/WHEEL RENAMED Viewed

File without changes

{nucliadb-6.5.0.post4426.dist-info → nucliadb-6.5.0.post4476.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{nucliadb-6.5.0.post4426.dist-info → nucliadb-6.5.0.post4476.dist-info}/top_level.txt RENAMED Viewed

File without changes

nucliadb 6.5.0.post4426__py3-none-any.whl → 6.5.0.post4476__py3-none-any.whl

nucliadb 6.5.0.post4426__py3-none-any.whl → 6.5.0.post4476__py3-none-any.whl