nucliadb 6.7.2.post4862__py3-none-any.whl → 6.9.2.post5282__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nucliadb might be problematic; see the package's registry page for more details.

Files changed (126)
  1. migrations/0016_upgrade_to_paragraphs_v2.py +1 -1
  2. migrations/0017_multiple_writable_shards.py +1 -1
  3. migrations/0018_purge_orphan_kbslugs.py +1 -1
  4. migrations/0019_upgrade_to_paragraphs_v3.py +1 -1
  5. migrations/0021_overwrite_vectorsets_key.py +1 -1
  6. migrations/0023_backfill_pg_catalog.py +7 -3
  7. migrations/0025_assign_models_to_kbs_v2.py +3 -3
  8. migrations/0027_rollover_texts3.py +1 -1
  9. migrations/0028_extracted_vectors_reference.py +1 -1
  10. migrations/0029_backfill_field_status.py +1 -1
  11. migrations/0032_remove_old_relations.py +1 -1
  12. migrations/0036_backfill_catalog_slug.py +1 -1
  13. migrations/0037_backfill_catalog_facets.py +1 -1
  14. migrations/0038_backfill_catalog_field_labels.py +7 -3
  15. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  16. migrations/0040_migrate_search_configurations.py +79 -0
  17. migrations/pg/0010_shards_index.py +34 -0
  18. nucliadb/backups/create.py +3 -3
  19. nucliadb/backups/restore.py +3 -3
  20. nucliadb/common/cache.py +1 -1
  21. nucliadb/common/catalog/__init__.py +79 -0
  22. nucliadb/common/catalog/dummy.py +36 -0
  23. nucliadb/common/catalog/interface.py +85 -0
  24. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +294 -208
  25. nucliadb/common/catalog/utils.py +56 -0
  26. nucliadb/common/cluster/manager.py +3 -19
  27. nucliadb/common/cluster/rebalance.py +484 -110
  28. nucliadb/common/cluster/rollover.py +29 -0
  29. nucliadb/common/cluster/settings.py +1 -1
  30. nucliadb/common/cluster/utils.py +26 -0
  31. nucliadb/common/datamanagers/atomic.py +6 -0
  32. nucliadb/common/datamanagers/utils.py +2 -2
  33. nucliadb/common/external_index_providers/manager.py +1 -29
  34. nucliadb/common/external_index_providers/settings.py +1 -27
  35. nucliadb/common/filter_expression.py +16 -33
  36. nucliadb/common/http_clients/exceptions.py +8 -0
  37. nucliadb/common/http_clients/processing.py +4 -0
  38. nucliadb/common/http_clients/utils.py +3 -0
  39. nucliadb/common/ids.py +77 -55
  40. nucliadb/common/locking.py +4 -4
  41. nucliadb/common/maindb/driver.py +11 -1
  42. nucliadb/common/maindb/local.py +1 -1
  43. nucliadb/common/maindb/pg.py +1 -1
  44. nucliadb/common/nidx.py +19 -1
  45. nucliadb/common/vector_index_config.py +1 -1
  46. nucliadb/export_import/datamanager.py +3 -3
  47. nucliadb/ingest/consumer/pull.py +7 -0
  48. nucliadb/ingest/consumer/service.py +2 -27
  49. nucliadb/ingest/consumer/shard_creator.py +17 -6
  50. nucliadb/ingest/fields/base.py +9 -17
  51. nucliadb/ingest/fields/conversation.py +47 -1
  52. nucliadb/ingest/orm/brain_v2.py +21 -3
  53. nucliadb/ingest/orm/index_message.py +126 -111
  54. nucliadb/ingest/orm/knowledgebox.py +84 -43
  55. nucliadb/ingest/orm/processor/auditing.py +1 -1
  56. nucliadb/ingest/orm/processor/processor.py +95 -149
  57. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  58. nucliadb/ingest/orm/resource.py +10 -1
  59. nucliadb/ingest/partitions.py +12 -1
  60. nucliadb/ingest/serialize.py +2 -2
  61. nucliadb/ingest/service/writer.py +26 -19
  62. nucliadb/ingest/settings.py +33 -11
  63. nucliadb/learning_proxy.py +12 -15
  64. nucliadb/metrics_exporter.py +17 -4
  65. nucliadb/migrator/datamanager.py +11 -17
  66. nucliadb/migrator/migrator.py +2 -2
  67. nucliadb/purge/__init__.py +12 -17
  68. nucliadb/purge/orphan_shards.py +2 -2
  69. nucliadb/reader/api/v1/knowledgebox.py +40 -12
  70. nucliadb/reader/api/v1/learning_config.py +30 -10
  71. nucliadb/reader/api/v1/resource.py +2 -2
  72. nucliadb/reader/api/v1/services.py +1 -1
  73. nucliadb/reader/reader/notifications.py +1 -1
  74. nucliadb/search/api/v1/__init__.py +1 -0
  75. nucliadb/search/api/v1/catalog.py +4 -4
  76. nucliadb/search/api/v1/find.py +1 -4
  77. nucliadb/search/api/v1/hydrate.py +328 -0
  78. nucliadb/search/api/v1/resource/ask.py +21 -1
  79. nucliadb/search/api/v1/search.py +1 -4
  80. nucliadb/search/predict.py +9 -2
  81. nucliadb/search/search/cache.py +1 -20
  82. nucliadb/search/search/chat/ask.py +50 -8
  83. nucliadb/search/search/chat/prompt.py +47 -15
  84. nucliadb/search/search/chat/query.py +8 -1
  85. nucliadb/search/search/fetch.py +1 -1
  86. nucliadb/search/search/find.py +1 -6
  87. nucliadb/search/search/{hydrator.py → hydrator/__init__.py} +5 -4
  88. nucliadb/search/search/hydrator/fields.py +175 -0
  89. nucliadb/search/search/hydrator/images.py +130 -0
  90. nucliadb/search/search/hydrator/paragraphs.py +307 -0
  91. nucliadb/search/search/hydrator/resources.py +56 -0
  92. nucliadb/search/search/metrics.py +16 -0
  93. nucliadb/search/search/predict_proxy.py +33 -11
  94. nucliadb/search/search/query.py +0 -23
  95. nucliadb/search/search/query_parser/fetcher.py +5 -5
  96. nucliadb/search/search/query_parser/models.py +1 -30
  97. nucliadb/search/search/query_parser/parsers/ask.py +1 -1
  98. nucliadb/search/search/query_parser/parsers/catalog.py +4 -7
  99. nucliadb/search/search/query_parser/parsers/common.py +16 -7
  100. nucliadb/search/search/query_parser/parsers/find.py +0 -11
  101. nucliadb/search/search/query_parser/parsers/graph.py +5 -5
  102. nucliadb/search/search/query_parser/parsers/search.py +0 -11
  103. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +4 -11
  104. nucliadb/search/search/rerankers.py +1 -1
  105. nucliadb/search/search/summarize.py +1 -1
  106. nucliadb/standalone/run.py +3 -0
  107. nucliadb/tasks/retries.py +4 -4
  108. nucliadb/train/generators/sentence_classifier.py +2 -8
  109. nucliadb/train/generators/utils.py +1 -1
  110. nucliadb/train/nodes.py +4 -4
  111. nucliadb/train/servicer.py +1 -1
  112. nucliadb/train/uploader.py +1 -1
  113. nucliadb/writer/api/v1/field.py +14 -9
  114. nucliadb/writer/api/v1/knowledgebox.py +15 -52
  115. nucliadb/writer/api/v1/learning_config.py +5 -4
  116. nucliadb/writer/api/v1/resource.py +2 -2
  117. nucliadb/writer/resource/field.py +38 -2
  118. nucliadb/writer/tus/azure.py +4 -4
  119. nucliadb/writer/tus/gcs.py +11 -17
  120. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/METADATA +9 -10
  121. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/RECORD +124 -114
  122. nucliadb/common/external_index_providers/pinecone.py +0 -894
  123. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  124. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/WHEEL +0 -0
  125. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/entry_points.txt +0 -0
  126. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,85 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ from __future__ import annotations
21
+
22
+ import abc
23
+ import datetime
24
+ from dataclasses import dataclass
25
+ from typing import Literal, Optional, Union
26
+
27
+ from pydantic import BaseModel, Field
28
+
29
+ from nucliadb.common.maindb.driver import Transaction
30
+ from nucliadb_models import search as search_models
31
+ from nucliadb_models.search import CatalogFacetsRequest, Resources
32
+
33
+
34
+ class CatalogResourceData(BaseModel):
35
+ """
36
+ Data extracted from a resource to be indexed in the catalog
37
+ """
38
+
39
+ title: str = Field(description="Resource title")
40
+ created_at: datetime.datetime = Field(description="Resource creation date")
41
+ modified_at: datetime.datetime = Field(description="Resource last modification date")
42
+ labels: list[str] = Field(
43
+ description="Resource labels. This includes labels at the resource level and all classification labels of its fields"
44
+ )
45
+ slug: str = Field(description="Resource slug")
46
+
47
+
48
+ @dataclass
49
+ class CatalogExpression:
50
+ @dataclass
51
+ class Date:
52
+ field: Union[Literal["created_at"], Literal["modified_at"]]
53
+ since: Optional[datetime.datetime]
54
+ until: Optional[datetime.datetime]
55
+
56
+ bool_and: Optional[list["CatalogExpression"]] = None
57
+ bool_or: Optional[list["CatalogExpression"]] = None
58
+ bool_not: Optional["CatalogExpression"] = None
59
+ date: Optional[Date] = None
60
+ facet: Optional[str] = None
61
+ resource_id: Optional[str] = None
62
+
63
+
64
+ class CatalogQuery(BaseModel):
65
+ kbid: str
66
+ query: Optional[search_models.CatalogQuery] = Field(description="Full-text search query")
67
+ filters: Optional[CatalogExpression] = Field(description="Filters to apply to the search")
68
+ sort: search_models.SortOptions = Field(description="Sorting option")
69
+ faceted: list[str] = Field(description="List of facets to compute during the search")
70
+ page_size: int = Field(description="Used for pagination. Maximum page size is 100")
71
+ page_number: int = Field(description="Used for pagination. First page is 0")
72
+
73
+
74
+ class Catalog(abc.ABC, metaclass=abc.ABCMeta):
75
+ @abc.abstractmethod
76
+ async def update(self, txn: Transaction, kbid: str, rid: str, data: CatalogResourceData): ...
77
+
78
+ @abc.abstractmethod
79
+ async def delete(self, txn: Transaction, kbid: str, rid: str): ...
80
+
81
+ @abc.abstractmethod
82
+ async def search(self, query: CatalogQuery) -> Resources: ...
83
+
84
+ @abc.abstractmethod
85
+ async def facets(self, kbid: str, request: CatalogFacetsRequest) -> dict[str, int]: ...
@@ -26,111 +26,180 @@ from typing import Any, Literal, Union, cast
26
26
  from psycopg import AsyncCursor, sql
27
27
  from psycopg.rows import DictRow, dict_row
28
28
 
29
- from nucliadb.common.maindb.pg import PGDriver
29
+ from nucliadb.common.catalog.interface import (
30
+ Catalog,
31
+ CatalogExpression,
32
+ CatalogQuery,
33
+ CatalogResourceData,
34
+ )
35
+ from nucliadb.common.exceptions import InvalidQueryError
36
+ from nucliadb.common.maindb.driver import Transaction
37
+ from nucliadb.common.maindb.pg import PGDriver, PGTransaction
30
38
  from nucliadb.common.maindb.utils import get_driver
31
- from nucliadb.search.search.query_parser.models import CatalogExpression, CatalogQuery
32
39
  from nucliadb_models import search as search_models
33
- from nucliadb_models.labels import translate_system_to_alias_label
34
- from nucliadb_models.search import CatalogFacetsRequest, ResourceResult, Resources, SortField, SortOrder
40
+ from nucliadb_models.labels import translate_alias_to_system_label, translate_system_to_alias_label
41
+ from nucliadb_models.search import (
42
+ CatalogFacetsRequest,
43
+ ResourceResult,
44
+ Resources,
45
+ SortField,
46
+ SortOrder,
47
+ )
35
48
  from nucliadb_telemetry import metrics
36
49
 
37
- from .filters import translate_label
50
+ write_observer = metrics.Observer("pg_catalog_write", labels={"type": ""})
51
+ search_observer = metrics.Observer("pg_catalog_search", labels={"op": ""})
38
52
 
39
- observer = metrics.Observer("pg_catalog_search", labels={"op": ""})
40
53
  logger = logging.getLogger(__name__)
41
54
 
42
55
  SPLIT_REGEX = re.compile(r"\W")
43
56
 
44
57
 
45
- def _filter_operands(operands: list[CatalogExpression]) -> tuple[list[str], list[CatalogExpression]]:
46
- facets = []
47
- nonfacets = []
48
- for op in operands:
49
- if op.facet:
50
- facets.append(op.facet)
51
- else:
52
- nonfacets.append(op)
58
+ def _pg_transaction(txn: Transaction) -> PGTransaction:
59
+ return cast(PGTransaction, txn)
53
60
 
54
- return facets, nonfacets
55
61
 
56
-
57
- def _convert_filter(expr: CatalogExpression, filter_params: dict[str, Any]) -> sql.Composable:
58
- if expr.bool_and:
59
- return _convert_boolean_op(expr.bool_and, "and", filter_params)
60
- elif expr.bool_or:
61
- return _convert_boolean_op(expr.bool_or, "or", filter_params)
62
- elif expr.bool_not:
63
- return sql.SQL("(NOT {})").format(_convert_filter(expr.bool_not, filter_params))
64
- elif expr.date:
65
- return _convert_date_filter(expr.date, filter_params)
66
- elif expr.facet:
67
- param_name = f"param{len(filter_params)}"
68
- filter_params[param_name] = [expr.facet]
69
- if expr.facet == "/n/s/PROCESSED":
70
- # Optimization for the most common case, we know PROCESSED is a full label and can use the smaller labels index
71
- # This is needed because PROCESSED is present in most catalog entries and PG is unlikely to use any index
72
- # for it, falling back to executing the extract_facets function which can be slow
73
- return sql.SQL("labels @> {}").format(sql.Placeholder(param_name))
74
- else:
75
- return sql.SQL("extract_facets(labels) @> {}").format(sql.Placeholder(param_name))
76
- elif expr.resource_id:
77
- param_name = f"param{len(filter_params)}"
78
- filter_params[param_name] = [expr.resource_id]
79
- return sql.SQL("rid = {}").format(sql.Placeholder(param_name))
80
- else:
81
- return sql.SQL("")
62
+ def _pg_driver() -> PGDriver:
63
+ return cast(PGDriver, get_driver())
82
64
 
83
65
 
84
- def _convert_boolean_op(
85
- operands: list[CatalogExpression],
86
- op: Union[Literal["and"], Literal["or"]],
87
- filter_params: dict[str, Any],
88
- ) -> sql.Composable:
89
- array_op = sql.SQL("@>" if op == "and" else "&&")
90
- operands_sql: list[sql.Composable] = []
91
- facets, nonfacets = _filter_operands(operands)
92
- if facets:
93
- param_name = f"param{len(filter_params)}"
94
- if facets == ["/n/s/PROCESSED"]:
95
- # Optimization for the most common case, we know PROCESSED is a full label and can use the smaller labels index
96
- # This is needed because PROCESSED is present in most catalog entries and PG is unlikely to use any index
97
- # for it, falling back to executing the extract_facets function which can be slow
98
- operands_sql.append(sql.SQL("labels @> {}").format(sql.Placeholder(param_name)))
99
- else:
100
- operands_sql.append(
101
- sql.SQL("extract_facets(labels) {} {}").format(array_op, sql.Placeholder(param_name))
66
+ class PGCatalog(Catalog):
67
+ @write_observer.wrap({"type": "update"})
68
+ async def update(self, txn: Transaction, kbid: str, rid: str, data: CatalogResourceData):
69
+ async with _pg_transaction(txn).connection.cursor() as cur:
70
+ await cur.execute(
71
+ """
72
+ INSERT INTO catalog
73
+ (kbid, rid, title, created_at, modified_at, labels, slug)
74
+ VALUES
75
+ (%(kbid)s, %(rid)s, %(title)s, %(created_at)s, %(modified_at)s, %(labels)s, %(slug)s)
76
+ ON CONFLICT (kbid, rid) DO UPDATE SET
77
+ title = excluded.title,
78
+ created_at = excluded.created_at,
79
+ modified_at = excluded.modified_at,
80
+ labels = excluded.labels,
81
+ slug = excluded.slug""",
82
+ {
83
+ "kbid": kbid,
84
+ "rid": rid,
85
+ "title": data.title,
86
+ "created_at": data.created_at,
87
+ "modified_at": data.modified_at,
88
+ "labels": data.labels,
89
+ "slug": data.slug,
90
+ },
91
+ )
92
+ await cur.execute(
93
+ "DELETE FROM catalog_facets WHERE kbid = %(kbid)s AND rid = %(rid)s",
94
+ {
95
+ "kbid": kbid,
96
+ "rid": rid,
97
+ },
98
+ )
99
+ await cur.execute(
100
+ "INSERT INTO catalog_facets (kbid, rid, facet) SELECT %(kbid)s AS kbid, %(rid)s AS rid, unnest(%(facets)s::text[]) AS facet",
101
+ {
102
+ "kbid": kbid,
103
+ "rid": rid,
104
+ "facets": list(extract_facets(data.labels)),
105
+ },
102
106
  )
103
- filter_params[param_name] = facets
104
- for nonfacet in nonfacets:
105
- operands_sql.append(_convert_filter(nonfacet, filter_params))
106
- return sql.SQL("({})").format(sql.SQL(f" {op.upper()} ").join(operands_sql))
107
107
 
108
+ @write_observer.wrap({"type": "delete"})
109
+ async def delete(self, txn: Transaction, kbid: str, rid: str):
110
+ async with _pg_transaction(txn).connection.cursor() as cur:
111
+ await cur.execute(
112
+ "DELETE FROM catalog where kbid = %(kbid)s AND rid = %(rid)s", {"kbid": kbid, "rid": rid}
113
+ )
108
114
 
109
- def _convert_date_filter(date: CatalogExpression.Date, filter_params: dict[str, Any]) -> sql.Composable:
110
- if date.since and date.until:
111
- since_name = f"param{len(filter_params)}"
112
- filter_params[since_name] = date.since
113
- until_name = f"param{len(filter_params)}"
114
- filter_params[until_name] = date.until
115
- return sql.SQL("{field} BETWEEN {since} AND {until}").format(
116
- field=sql.Identifier(date.field),
117
- since=sql.Placeholder(since_name),
118
- until=sql.Placeholder(until_name),
119
- )
120
- elif date.since:
121
- since_name = f"param{len(filter_params)}"
122
- filter_params[since_name] = date.since
123
- return sql.SQL("{field} > {since}").format(
124
- field=sql.Identifier(date.field), since=sql.Placeholder(since_name)
125
- )
126
- elif date.until:
127
- until_name = f"param{len(filter_params)}"
128
- filter_params[until_name] = date.until
129
- return sql.SQL("{field} < {until}").format(
130
- field=sql.Identifier(date.field), until=sql.Placeholder(until_name)
115
+ @search_observer.wrap({"op": "search"})
116
+ async def search(self, catalog_query: CatalogQuery) -> Resources:
117
+ # Prepare SQL query
118
+ query, query_params = _prepare_query_filters(catalog_query)
119
+
120
+ async with _pg_driver()._get_connection() as conn, conn.cursor(row_factory=dict_row) as cur:
121
+ facets = {}
122
+
123
+ # Faceted search
124
+ if catalog_query.faceted:
125
+ with search_observer({"op": "facets"}):
126
+ tmp_facets: dict[str, dict[str, int]] = {
127
+ translate_label(f): defaultdict(int) for f in catalog_query.faceted
128
+ }
129
+
130
+ if catalog_query.filters is None:
131
+ await _faceted_search_unfiltered(cur, catalog_query, tmp_facets)
132
+ else:
133
+ await _faceted_search_filtered(
134
+ cur, catalog_query, tmp_facets, query, query_params
135
+ )
136
+
137
+ facets = {translate_system_to_alias_label(k): v for k, v in tmp_facets.items()}
138
+
139
+ # Totals
140
+ with search_observer({"op": "totals"}):
141
+ await cur.execute(
142
+ sql.SQL("SELECT COUNT(*) FROM ({}) fc").format(query),
143
+ query_params,
144
+ )
145
+ total = (await cur.fetchone())["count"] # type: ignore
146
+
147
+ # Query
148
+ with search_observer({"op": "query"}):
149
+ query, query_params = _prepare_query(catalog_query)
150
+ await cur.execute(query, query_params)
151
+ data = await cur.fetchall()
152
+
153
+ return Resources(
154
+ facets=facets,
155
+ results=[
156
+ ResourceResult(
157
+ rid=str(r["rid"]).replace("-", ""),
158
+ field="title",
159
+ field_type="a",
160
+ labels=[label for label in r["labels"] if label.startswith("/l/")],
161
+ score=0,
162
+ )
163
+ for r in data
164
+ ],
165
+ query=catalog_query.query.query if catalog_query.query else "",
166
+ total=total,
167
+ page_number=catalog_query.page_number,
168
+ page_size=catalog_query.page_size,
169
+ next_page=(catalog_query.page_size * catalog_query.page_number + len(data) < total),
170
+ min_score=0,
131
171
  )
132
- else:
133
- raise ValueError(f"Invalid date operator")
172
+
173
+ @search_observer.wrap({"op": "catalog_facets"})
174
+ async def facets(self, kbid: str, request: CatalogFacetsRequest) -> dict[str, int]:
175
+ async with _pg_driver()._get_connection() as conn, conn.cursor() as cur:
176
+ prefix_filters: list[sql.Composable] = []
177
+ prefix_params: dict[str, Any] = {}
178
+ for cnt, prefix in enumerate(request.prefixes):
179
+ prefix_sql = sql.SQL("facet LIKE {}").format(sql.Placeholder(f"prefix{cnt}"))
180
+ prefix_params[f"prefix{cnt}"] = f"{prefix.prefix}%"
181
+ if prefix.depth is not None:
182
+ prefix_parts = len(prefix.prefix.split("/"))
183
+ depth_sql = sql.SQL("SPLIT_PART(facet, '/', {}) = ''").format(
184
+ sql.Placeholder(f"depth{cnt}")
185
+ )
186
+ prefix_params[f"depth{cnt}"] = prefix_parts + prefix.depth + 1
187
+ prefix_sql = sql.SQL("({} AND {})").format(prefix_sql, depth_sql)
188
+ prefix_filters.append(prefix_sql)
189
+
190
+ filter_sql: sql.Composable
191
+ if prefix_filters:
192
+ filter_sql = sql.SQL("AND {}").format(sql.SQL(" OR ").join(prefix_filters))
193
+ else:
194
+ filter_sql = sql.SQL("")
195
+
196
+ await cur.execute(
197
+ sql.SQL(
198
+ "SELECT facet, COUNT(*) FROM catalog_facets WHERE kbid = %(kbid)s {} GROUP BY facet"
199
+ ).format(filter_sql),
200
+ {"kbid": kbid, **prefix_params},
201
+ )
202
+ return {k: v for k, v in await cur.fetchall()}
134
203
 
135
204
 
136
205
  def _prepare_query_filters(catalog_query: CatalogQuery) -> tuple[sql.Composable, dict[str, Any]]:
@@ -149,42 +218,16 @@ def _prepare_query_filters(catalog_query: CatalogQuery) -> tuple[sql.Composable,
149
218
  )
150
219
 
151
220
 
152
- def _prepare_query_search(query: search_models.CatalogQuery, params: dict[str, Any]) -> sql.Composable:
153
- if query.match == search_models.CatalogQueryMatch.Exact:
154
- params["query"] = query.query
155
- return sql.SQL("{} = %(query)s").format(sql.Identifier(query.field.value))
156
- elif query.match == search_models.CatalogQueryMatch.StartsWith:
157
- params["query"] = query.query + "%"
158
- if query.field == search_models.CatalogQueryField.Title:
159
- # Insensitive search supported by pg_trgm for title
160
- return sql.SQL("{} ILIKE %(query)s").format(sql.Identifier(query.field.value))
221
+ def _filter_operands(operands: list[CatalogExpression]) -> tuple[list[str], list[CatalogExpression]]:
222
+ facets = []
223
+ nonfacets = []
224
+ for op in operands:
225
+ if op.facet:
226
+ facets.append(op.facet)
161
227
  else:
162
- # Sensitive search for slug (btree does not support ILIKE and slugs are all lowercase anyway)
163
- return sql.SQL("{} LIKE %(query)s").format(sql.Identifier(query.field.value))
164
- # The rest of operators only supported by title
165
- elif query.match == search_models.CatalogQueryMatch.Words:
166
- # This is doing tokenization inside the SQL server (to keep the index updated). We could move it to
167
- # the python code at update/query time if it ever becomes a problem but for now, a single regex
168
- # executed per query is not a problem.
228
+ nonfacets.append(op)
169
229
 
170
- # Remove zero-length words from the split
171
- params["query"] = [word.lower() for word in SPLIT_REGEX.split(query.query) if word]
172
- return sql.SQL("regexp_split_to_array(lower(title), '\\W') @> %(query)s")
173
- elif query.match == search_models.CatalogQueryMatch.Fuzzy:
174
- params["query"] = query.query
175
- # Note: the operator is %>, We use %%> for psycopg escaping
176
- return sql.SQL("title %%> %(query)s")
177
- elif query.match == search_models.CatalogQueryMatch.EndsWith:
178
- params["query"] = "%" + query.query
179
- return sql.SQL("title ILIKE %(query)s")
180
- elif query.match == search_models.CatalogQueryMatch.Contains:
181
- params["query"] = "%" + query.query + "%"
182
- return sql.SQL("title ILIKE %(query)s")
183
- else: # pragma: nocover
184
- # This is a trick so mypy generates an error if this branch can be reached,
185
- # that is, if we are missing some ifs
186
- _a: int = "a"
187
- return sql.SQL("")
230
+ return facets, nonfacets
188
231
 
189
232
 
190
233
  def _prepare_query(catalog_query: CatalogQuery) -> tuple[sql.Composed, dict[str, Any]]:
@@ -219,67 +262,6 @@ def _prepare_query(catalog_query: CatalogQuery) -> tuple[sql.Composed, dict[str,
219
262
  return query, filter_params
220
263
 
221
264
 
222
- def _pg_driver() -> PGDriver:
223
- return cast(PGDriver, get_driver())
224
-
225
-
226
- @observer.wrap({"op": "search"})
227
- async def pgcatalog_search(catalog_query: CatalogQuery) -> Resources:
228
- # Prepare SQL query
229
- query, query_params = _prepare_query_filters(catalog_query)
230
-
231
- async with _pg_driver()._get_connection() as conn, conn.cursor(row_factory=dict_row) as cur:
232
- facets = {}
233
-
234
- # Faceted search
235
- if catalog_query.faceted:
236
- with observer({"op": "facets"}):
237
- tmp_facets: dict[str, dict[str, int]] = {
238
- translate_label(f): defaultdict(int) for f in catalog_query.faceted
239
- }
240
-
241
- if catalog_query.filters is None:
242
- await _faceted_search_unfiltered(cur, catalog_query, tmp_facets)
243
- else:
244
- await _faceted_search_filtered(cur, catalog_query, tmp_facets, query, query_params)
245
-
246
- facets = {translate_system_to_alias_label(k): v for k, v in tmp_facets.items()}
247
-
248
- # Totals
249
- with observer({"op": "totals"}):
250
- await cur.execute(
251
- sql.SQL("SELECT COUNT(*) FROM ({}) fc").format(query),
252
- query_params,
253
- )
254
- total = (await cur.fetchone())["count"] # type: ignore
255
-
256
- # Query
257
- with observer({"op": "query"}):
258
- query, query_params = _prepare_query(catalog_query)
259
- await cur.execute(query, query_params)
260
- data = await cur.fetchall()
261
-
262
- return Resources(
263
- facets=facets,
264
- results=[
265
- ResourceResult(
266
- rid=str(r["rid"]).replace("-", ""),
267
- field="title",
268
- field_type="a",
269
- labels=[label for label in r["labels"] if label.startswith("/l/")],
270
- score=0,
271
- )
272
- for r in data
273
- ],
274
- query=catalog_query.query.query if catalog_query.query else "",
275
- total=total,
276
- page_number=catalog_query.page_number,
277
- page_size=catalog_query.page_size,
278
- next_page=(catalog_query.page_size * catalog_query.page_number + len(data) < total),
279
- min_score=0,
280
- )
281
-
282
-
283
265
  async def _faceted_search_unfiltered(
284
266
  cur: AsyncCursor[DictRow], catalog_query: CatalogQuery, tmp_facets: dict[str, dict[str, int]]
285
267
  ):
@@ -360,33 +342,137 @@ async def _faceted_search_filtered(
360
342
  tmp_facets[grandparent][translate_system_to_alias_label(parent)] += count
361
343
 
362
344
 
363
- @observer.wrap({"op": "catalog_facets"})
364
- async def pgcatalog_facets(kbid: str, request: CatalogFacetsRequest) -> dict[str, int]:
365
- async with _pg_driver()._get_connection() as conn, conn.cursor() as cur:
366
- prefix_filters: list[sql.Composable] = []
367
- prefix_params: dict[str, Any] = {}
368
- for cnt, prefix in enumerate(request.prefixes):
369
- prefix_sql = sql.SQL("facet LIKE {}").format(sql.Placeholder(f"prefix{cnt}"))
370
- prefix_params[f"prefix{cnt}"] = f"{prefix.prefix}%"
371
- if prefix.depth is not None:
372
- prefix_parts = len(prefix.prefix.split("/"))
373
- depth_sql = sql.SQL("SPLIT_PART(facet, '/', {}) = ''").format(
374
- sql.Placeholder(f"depth{cnt}")
375
- )
376
- prefix_params[f"depth{cnt}"] = prefix_parts + prefix.depth + 1
377
- prefix_sql = sql.SQL("({} AND {})").format(prefix_sql, depth_sql)
378
- prefix_filters.append(prefix_sql)
345
+ def _prepare_query_search(query: search_models.CatalogQuery, params: dict[str, Any]) -> sql.Composable:
346
+ if query.match == search_models.CatalogQueryMatch.Exact:
347
+ params["query"] = query.query
348
+ return sql.SQL("{} = %(query)s").format(sql.Identifier(query.field.value))
349
+ elif query.match == search_models.CatalogQueryMatch.StartsWith:
350
+ params["query"] = query.query + "%"
351
+ if query.field == search_models.CatalogQueryField.Title:
352
+ # Insensitive search supported by pg_trgm for title
353
+ return sql.SQL("{} ILIKE %(query)s").format(sql.Identifier(query.field.value))
354
+ else:
355
+ # Sensitive search for slug (btree does not support ILIKE and slugs are all lowercase anyway)
356
+ return sql.SQL("{} LIKE %(query)s").format(sql.Identifier(query.field.value))
357
+ # The rest of operators only supported by title
358
+ elif query.match == search_models.CatalogQueryMatch.Words:
359
+ # This is doing tokenization inside the SQL server (to keep the index updated). We could move it to
360
+ # the python code at update/query time if it ever becomes a problem but for now, a single regex
361
+ # executed per query is not a problem.
379
362
 
380
- filter_sql: sql.Composable
381
- if prefix_filters:
382
- filter_sql = sql.SQL("AND {}").format(sql.SQL(" OR ").join(prefix_filters))
363
+ # Remove zero-length words from the split
364
+ params["query"] = [word.lower() for word in SPLIT_REGEX.split(query.query) if word]
365
+ return sql.SQL("regexp_split_to_array(lower(title), '\\W') @> %(query)s")
366
+ elif query.match == search_models.CatalogQueryMatch.Fuzzy:
367
+ params["query"] = query.query
368
+ # Note: the operator is %>, We use %%> for psycopg escaping
369
+ return sql.SQL("title %%> %(query)s")
370
+ elif query.match == search_models.CatalogQueryMatch.EndsWith:
371
+ params["query"] = "%" + query.query
372
+ return sql.SQL("title ILIKE %(query)s")
373
+ elif query.match == search_models.CatalogQueryMatch.Contains:
374
+ params["query"] = "%" + query.query + "%"
375
+ return sql.SQL("title ILIKE %(query)s")
376
+ else: # pragma: no cover
377
+ # This is a trick so mypy generates an error if this branch can be reached,
378
+ # that is, if we are missing some ifs
379
+ _a: int = "a"
380
+ return sql.SQL("")
381
+
382
+
383
+ def _convert_filter(expr: CatalogExpression, filter_params: dict[str, Any]) -> sql.Composable:
384
+ if expr.bool_and:
385
+ return _convert_boolean_op(expr.bool_and, "and", filter_params)
386
+ elif expr.bool_or:
387
+ return _convert_boolean_op(expr.bool_or, "or", filter_params)
388
+ elif expr.bool_not:
389
+ return sql.SQL("(NOT {})").format(_convert_filter(expr.bool_not, filter_params))
390
+ elif expr.date:
391
+ return _convert_date_filter(expr.date, filter_params)
392
+ elif expr.facet:
393
+ param_name = f"param{len(filter_params)}"
394
+ filter_params[param_name] = [expr.facet]
395
+ if expr.facet == "/n/s/PROCESSED":
396
+ # Optimization for the most common case, we know PROCESSED is a full label and can use the smaller labels index
397
+ # This is needed because PROCESSED is present in most catalog entries and PG is unlikely to use any index
398
+ # for it, falling back to executing the extract_facets function which can be slow
399
+ return sql.SQL("labels @> {}").format(sql.Placeholder(param_name))
383
400
  else:
384
- filter_sql = sql.SQL("")
401
+ return sql.SQL("extract_facets(labels) @> {}").format(sql.Placeholder(param_name))
402
+ elif expr.resource_id:
403
+ param_name = f"param{len(filter_params)}"
404
+ filter_params[param_name] = [expr.resource_id]
405
+ return sql.SQL("rid = {}").format(sql.Placeholder(param_name))
406
+ else:
407
+ return sql.SQL("")
408
+
409
+
410
+ def _convert_boolean_op(
411
+ operands: list[CatalogExpression],
412
+ op: Union[Literal["and"], Literal["or"]],
413
+ filter_params: dict[str, Any],
414
+ ) -> sql.Composable:
415
+ array_op = sql.SQL("@>" if op == "and" else "&&")
416
+ operands_sql: list[sql.Composable] = []
417
+ facets, nonfacets = _filter_operands(operands)
418
+ if facets:
419
+ param_name = f"param{len(filter_params)}"
420
+ if facets == ["/n/s/PROCESSED"]:
421
+ # Optimization for the most common case, we know PROCESSED is a full label and can use the smaller labels index
422
+ # This is needed because PROCESSED is present in most catalog entries and PG is unlikely to use any index
423
+ # for it, falling back to executing the extract_facets function which can be slow
424
+ operands_sql.append(sql.SQL("labels @> {}").format(sql.Placeholder(param_name)))
425
+ else:
426
+ operands_sql.append(
427
+ sql.SQL("extract_facets(labels) {} {}").format(array_op, sql.Placeholder(param_name))
428
+ )
429
+ filter_params[param_name] = facets
430
+ for nonfacet in nonfacets:
431
+ operands_sql.append(_convert_filter(nonfacet, filter_params))
432
+ return sql.SQL("({})").format(sql.SQL(f" {op.upper()} ").join(operands_sql))
433
+
385
434
 
386
- await cur.execute(
387
- sql.SQL(
388
- "SELECT facet, COUNT(*) FROM catalog_facets WHERE kbid = %(kbid)s {} GROUP BY facet"
389
- ).format(filter_sql),
390
- {"kbid": kbid, **prefix_params},
435
+ def _convert_date_filter(date: CatalogExpression.Date, filter_params: dict[str, Any]) -> sql.Composable:
436
+ if date.since and date.until:
437
+ since_name = f"param{len(filter_params)}"
438
+ filter_params[since_name] = date.since
439
+ until_name = f"param{len(filter_params)}"
440
+ filter_params[until_name] = date.until
441
+ return sql.SQL("{field} BETWEEN {since} AND {until}").format(
442
+ field=sql.Identifier(date.field),
443
+ since=sql.Placeholder(since_name),
444
+ until=sql.Placeholder(until_name),
391
445
  )
392
- return {k: v for k, v in await cur.fetchall()}
446
+ elif date.since:
447
+ since_name = f"param{len(filter_params)}"
448
+ filter_params[since_name] = date.since
449
+ return sql.SQL("{field} > {since}").format(
450
+ field=sql.Identifier(date.field), since=sql.Placeholder(since_name)
451
+ )
452
+ elif date.until:
453
+ until_name = f"param{len(filter_params)}"
454
+ filter_params[until_name] = date.until
455
+ return sql.SQL("{field} < {until}").format(
456
+ field=sql.Identifier(date.field), until=sql.Placeholder(until_name)
457
+ )
458
+ else:
459
+ raise ValueError(f"Invalid date operator")
460
+
461
+
462
+ def translate_label(literal: str) -> str:
463
+ if len(literal) == 0:
464
+ raise InvalidQueryError("filters", "Invalid empty label")
465
+ if literal[0] != "/":
466
+ raise InvalidQueryError("filters", f"Invalid label. It must start with a `/`: {literal}")
467
+ return translate_alias_to_system_label(literal)
468
+
469
+
470
+ def extract_facets(labels: list[str]) -> set[str]:
471
+ facets = set()
472
+ for label in labels:
473
+ parts = label.split("/")
474
+ facet = ""
475
+ for part in parts[1:]:
476
+ facet += f"/{part}"
477
+ facets.add(facet)
478
+ return facets