nucliadb 6.5.0.post4426__py3-none-any.whl → 6.5.0.post4476__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,74 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+
21
+ """Migration #37
22
+
23
+ Backfill catalog facets
24
+
25
+ """
26
+
27
+ import logging
28
+ from typing import cast
29
+
30
+ from nucliadb.common.maindb.pg import PGDriver, PGTransaction
31
+ from nucliadb.migrator.context import ExecutionContext
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+
36
async def migrate(context: ExecutionContext) -> None:
    """Backfill ``catalog_facets`` from the labels stored in ``catalog``.

    Walks the ``catalog`` table in ``(kbid, rid)`` order, ``BATCH_SIZE`` rows
    at a time (keyset pagination), and inserts one ``catalog_facets`` row per
    value returned by the SQL function ``extract_facets(labels)``.

    Args:
        context: Migration execution context. ``context.kv_driver`` is used
            as a ``PGDriver``.
    """
    driver = cast(PGDriver, context.kv_driver)

    BATCH_SIZE = 1_000
    async with driver.transaction() as txn:
        txn = cast(PGTransaction, txn)
        # Pagination cursor: the last (kbid, rid) pair already processed.
        start_kbid = "00000000000000000000000000000000"
        start_rid = "00000000000000000000000000000000"
        while True:
            async with txn.connection.cursor() as cur:
                logger.info(f"Filling {BATCH_SIZE} catalog facets from {start_kbid}, {start_rid}")
                # Insert the facets of one batch of catalog rows.
                #
                # The key for the next iteration is read from the batch of
                # catalog rows, not from the rows that were inserted: a batch
                # whose resources produce no facets inserts nothing, and
                # paginating on the inserted rows would end the backfill
                # before the whole catalog has been visited.
                # The INSERT lives in a WITH clause, so PostgreSQL runs it to
                # completion even though the main query does not read from it.
                await cur.execute(
                    """
                    WITH batch AS (
                        SELECT kbid, rid, labels FROM catalog
                        WHERE (kbid = %(kbid)s AND rid > %(rid)s) OR kbid > %(kbid)s
                        ORDER BY kbid, rid
                        LIMIT %(batch)s
                    ), i AS (
                        INSERT INTO catalog_facets (kbid, rid, facet)
                        SELECT kbid, rid, unnest(extract_facets(labels)) FROM batch
                    )
                    SELECT kbid, rid FROM batch ORDER BY kbid DESC, rid DESC LIMIT 1;
                    """,
                    {"kbid": start_kbid, "rid": start_rid, "batch": BATCH_SIZE},
                )

                # Set the key for next iteration; no row means the catalog is exhausted
                results = await cur.fetchone()  # type: ignore
                if results is None:
                    break
                (start_kbid, start_rid) = results

            # Persist each batch as soon as it has been written
            await txn.commit()
72
+
73
+
74
async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
    """No-op: this migration has no per-knowledge-box step."""
    return None
@@ -26,7 +26,7 @@ async def migrate(txn: PGTransaction) -> None:
26
26
  # IF NOT EXISTS just for compatibility with older install predating the migration system
27
27
  await cur.execute("""
28
28
  CREATE TABLE IF NOT EXISTS resources (
29
- key TEXT PRIMARY KEY,
29
+ key TEXT COLLATE ucs_basic PRIMARY KEY,
30
30
  value BYTEA
31
31
  );
32
32
  """)
@@ -0,0 +1,43 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+
21
+ from nucliadb.common.maindb.pg import PGTransaction
22
+
23
+
24
async def migrate(txn: PGTransaction) -> None:
    """Create the ``catalog_facets`` table together with its two indexes.

    Args:
        txn: Transaction whose connection is used to run the DDL.
    """
    # One row per (resource, facet); rows are removed with their catalog entry
    # through the ON DELETE CASCADE foreign key.
    create_catalog_facets = """
            CREATE TABLE catalog_facets (
                id BIGINT PRIMARY KEY GENERATED ALWAYS AS IDENTITY,
                kbid UUID,
                rid UUID,
                facet TEXT COLLATE ucs_basic,

                FOREIGN KEY (kbid, rid) REFERENCES catalog (kbid, rid) ON DELETE CASCADE
            );

            -- For FK checks
            CREATE INDEX ON catalog_facets(kbid, rid);

            -- Best for per-facet aggregation, also used by search with facet filter
            CREATE INDEX ON catalog_facets(kbid, facet);
            """
    async with txn.connection.cursor() as cur:
        await cur.execute(create_catalog_facets)
@@ -0,0 +1,26 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+
21
+ from nucliadb.common.maindb.pg import PGTransaction
22
+
23
+
24
async def migrate(txn: PGTransaction) -> None:
    """Mark the SQL function ``extract_facets(text[])`` as ``PARALLEL SAFE``.

    Args:
        txn: Transaction whose connection is used to run the statement.
    """
    statement = "ALTER FUNCTION extract_facets(text[]) PARALLEL SAFE;"
    async with txn.connection.cursor() as cur:
        await cur.execute(statement)
@@ -40,6 +40,17 @@ def pgcatalog_enabled(kbid):
40
40
  return isinstance(get_driver(), PGDriver)
41
41
 
42
42
 
43
def extract_facets(labels):
    """Return the set of all path prefixes (facets) of the given labels.

    Every label is split on ``/``. The text before the first ``/`` is ignored
    and each following segment extends a path that is added to the result, so
    ``/l/set/label`` contributes ``/l``, ``/l/set`` and ``/l/set/label``.

    Args:
        labels: Iterable of label strings.

    Returns:
        A ``set`` with one entry per distinct prefix.
    """
    facets = set()
    for label in labels:
        # str.split always yields at least one item, so the unpacking is safe
        _, *segments = label.split("/")
        facets.update(
            "/" + "/".join(segments[:depth]) for depth in range(1, len(segments) + 1)
        )
    return facets
52
+
53
+
43
54
  @observer.wrap({"type": "update"})
44
55
  async def pgcatalog_update(txn: Transaction, kbid: str, resource: Resource, index_message: IndexMessage):
45
56
  if not pgcatalog_enabled(kbid):
@@ -76,6 +87,21 @@ async def pgcatalog_update(txn: Transaction, kbid: str, resource: Resource, inde
76
87
  "slug": resource.basic.slug,
77
88
  },
78
89
  )
90
+ await cur.execute(
91
+ "DELETE FROM catalog_facets WHERE kbid = %(kbid)s AND rid = %(rid)s",
92
+ {
93
+ "kbid": resource.kb.kbid,
94
+ "rid": resource.uuid,
95
+ },
96
+ )
97
+ await cur.execute(
98
+ "INSERT INTO catalog_facets (kbid, rid, facet) SELECT %(kbid)s AS kbid, %(rid)s AS rid, unnest(%(facets)s::text[]) AS facet",
99
+ {
100
+ "kbid": resource.kb.kbid,
101
+ "rid": resource.uuid,
102
+ "facets": list(extract_facets(index_message.labels)),
103
+ },
104
+ )
79
105
 
80
106
 
81
107
  @observer.wrap({"type": "delete"})
@@ -27,15 +27,13 @@ from pydantic import ValidationError
27
27
 
28
28
  from nucliadb.common.datamanagers.exceptions import KnowledgeBoxNotFound
29
29
  from nucliadb.common.exceptions import InvalidQueryError
30
- from nucliadb.common.maindb.pg import PGDriver
31
- from nucliadb.common.maindb.utils import get_driver
32
30
  from nucliadb.models.responses import HTTPClientError
33
31
  from nucliadb.search import logger
34
32
  from nucliadb.search.api.v1.router import KB_PREFIX, api
35
33
  from nucliadb.search.api.v1.utils import fastapi_query
36
34
  from nucliadb.search.search import cache
37
35
  from nucliadb.search.search.merge import fetch_resources
38
- from nucliadb.search.search.pgcatalog import pgcatalog_search
36
+ from nucliadb.search.search.pgcatalog import pgcatalog_facets, pgcatalog_search
39
37
  from nucliadb.search.search.query_parser.parsers import parse_catalog
40
38
  from nucliadb.search.search.utils import (
41
39
  maybe_log_request_payload,
@@ -45,6 +43,7 @@ from nucliadb_models.filters import CatalogFilterExpression
45
43
  from nucliadb_models.metadata import ResourceProcessingStatus
46
44
  from nucliadb_models.resource import NucliaDBRoles
47
45
  from nucliadb_models.search import (
46
+ CatalogFacetsRequest,
48
47
  CatalogRequest,
49
48
  CatalogResponse,
50
49
  KnowledgeboxSearchResults,
@@ -157,9 +156,6 @@ async def catalog(
157
156
  returns bm25 results on titles and it does not support vector search.
158
157
  It is useful for listing resources in a knowledge box.
159
158
  """
160
- if not pgcatalog_enabled(): # pragma: no cover
161
- return HTTPClientError(status_code=501, detail="PG driver is needed for catalog search")
162
-
163
159
  maybe_log_request_payload(kbid, "/catalog", item)
164
160
  start_time = time()
165
161
  try:
@@ -196,5 +192,15 @@ async def catalog(
196
192
  )
197
193
 
198
194
 
199
- def pgcatalog_enabled():
200
- return isinstance(get_driver(), PGDriver)
195
@api.post(
    f"/{KB_PREFIX}/{{kbid}}/catalog/facets",
    status_code=200,
    response_model=dict[str, int],
    response_model_exclude_unset=True,
    tags=["Search"],
    include_in_schema=False,
)
@requires(NucliaDBRoles.READER)
@version(1)
async def catalog_facets(request: Request, kbid: str, item: CatalogFacetsRequest) -> dict[str, int]:
    # Thin HTTP wrapper: all the work happens in pgcatalog_facets, whose
    # facet -> count mapping is returned unchanged as the response body.
    facet_counts = await pgcatalog_facets(kbid, item)
    return facet_counts
@@ -22,19 +22,15 @@ import logging
22
22
  from collections import defaultdict
23
23
  from typing import Any, Literal, Union, cast
24
24
 
25
- from psycopg.rows import dict_row
25
+ from psycopg import AsyncCursor, sql
26
+ from psycopg.rows import DictRow, dict_row
26
27
 
27
28
  from nucliadb.common.maindb.pg import PGDriver
28
29
  from nucliadb.common.maindb.utils import get_driver
29
30
  from nucliadb.search.search.query_parser.models import CatalogExpression, CatalogQuery
30
31
  from nucliadb_models import search as search_models
31
32
  from nucliadb_models.labels import translate_system_to_alias_label
32
- from nucliadb_models.search import (
33
- ResourceResult,
34
- Resources,
35
- SortField,
36
- SortOrder,
37
- )
33
+ from nucliadb_models.search import CatalogFacetsRequest, ResourceResult, Resources, SortField, SortOrder
38
34
  from nucliadb_telemetry import metrics
39
35
 
40
36
  from .filters import translate_label
@@ -55,65 +51,87 @@ def _filter_operands(operands: list[CatalogExpression]) -> tuple[list[str], list
55
51
  return facets, nonfacets
56
52
 
57
53
 
58
- def _convert_filter(expr: CatalogExpression, filter_params: dict[str, Any]) -> str:
54
+ def _convert_filter(expr: CatalogExpression, filter_params: dict[str, Any]) -> sql.Composable:
59
55
  if expr.bool_and:
60
56
  return _convert_boolean_op(expr.bool_and, "and", filter_params)
61
57
  elif expr.bool_or:
62
58
  return _convert_boolean_op(expr.bool_or, "or", filter_params)
63
59
  elif expr.bool_not:
64
- return f"(NOT {_convert_filter(expr.bool_not, filter_params)})"
60
+ return sql.SQL("(NOT {})").format(_convert_filter(expr.bool_not, filter_params))
65
61
  elif expr.date:
66
62
  return _convert_date_filter(expr.date, filter_params)
67
63
  elif expr.facet:
68
64
  param_name = f"param{len(filter_params)}"
69
65
  filter_params[param_name] = [expr.facet]
70
- return f"extract_facets(labels) @> %({param_name})s"
66
+ if expr.facet == "/n/s/PROCESSED":
67
+ # Optimization for the most common case, we know PROCESSED is a full label and can use the smaller labels index
68
+ # This is needed because PROCESSED is present in most catalog entries and PG is unlikely to use any index
69
+ # for it, falling back to executing the extract_facets function which can be slow
70
+ return sql.SQL("labels @> {}").format(sql.Placeholder(param_name))
71
+ else:
72
+ return sql.SQL("extract_facets(labels) @> {}").format(sql.Placeholder(param_name))
71
73
  elif expr.resource_id:
72
74
  param_name = f"param{len(filter_params)}"
73
75
  filter_params[param_name] = [expr.resource_id]
74
- return f"rid = %({param_name})s"
76
+ return sql.SQL("rid = {}").format(sql.Placeholder(param_name))
75
77
  else:
76
- return ""
78
+ return sql.SQL("")
77
79
 
78
80
 
79
81
  def _convert_boolean_op(
80
82
  operands: list[CatalogExpression],
81
83
  op: Union[Literal["and"], Literal["or"]],
82
84
  filter_params: dict[str, Any],
83
- ) -> str:
84
- array_op = "@>" if op == "and" else "&&"
85
- sql = []
85
+ ) -> sql.Composable:
86
+ array_op = sql.SQL("@>" if op == "and" else "&&")
87
+ operands_sql: list[sql.Composable] = []
86
88
  facets, nonfacets = _filter_operands(operands)
87
89
  if facets:
88
90
  param_name = f"param{len(filter_params)}"
91
+ if facets == ["/n/s/PROCESSED"]:
92
+ # Optimization for the most common case, we know PROCESSED is a full label and can use the smaller labels index
93
+ # This is needed because PROCESSED is present in most catalog entries and PG is unlikely to use any index
94
+ # for it, falling back to executing the extract_facets function which can be slow
95
+ operands_sql.append(sql.SQL("labels @> {}").format(sql.Placeholder(param_name)))
96
+ else:
97
+ operands_sql.append(
98
+ sql.SQL("extract_facets(labels) {} {}").format(array_op, sql.Placeholder(param_name))
99
+ )
89
100
  filter_params[param_name] = facets
90
- sql.append(f"extract_facets(labels) {array_op} %({param_name})s")
91
101
  for nonfacet in nonfacets:
92
- sql.append(_convert_filter(nonfacet, filter_params))
93
- return "(" + f" {op.upper()} ".join(sql) + ")"
102
+ operands_sql.append(_convert_filter(nonfacet, filter_params))
103
+ return sql.SQL("({})").format(sql.SQL(f" {op.upper()} ").join(operands_sql))
94
104
 
95
105
 
96
- def _convert_date_filter(date: CatalogExpression.Date, filter_params: dict[str, Any]) -> str:
106
+ def _convert_date_filter(date: CatalogExpression.Date, filter_params: dict[str, Any]) -> sql.Composable:
97
107
  if date.since and date.until:
98
108
  since_name = f"param{len(filter_params)}"
99
109
  filter_params[since_name] = date.since
100
110
  until_name = f"param{len(filter_params)}"
101
111
  filter_params[until_name] = date.until
102
- return f"{date.field} BETWEEN %({since_name})s AND %({until_name})s"
112
+ return sql.SQL("{field} BETWEEN {since} AND {until}").format(
113
+ field=sql.Identifier(date.field),
114
+ since=sql.Placeholder(since_name),
115
+ until=sql.Placeholder(until_name),
116
+ )
103
117
  elif date.since:
104
118
  since_name = f"param{len(filter_params)}"
105
119
  filter_params[since_name] = date.since
106
- return f"{date.field} > %({since_name})s"
120
+ return sql.SQL("{field} > {since}").format(
121
+ field=sql.Identifier(date.field), since=sql.Placeholder(since_name)
122
+ )
107
123
  elif date.until:
108
124
  until_name = f"param{len(filter_params)}"
109
125
  filter_params[until_name] = date.until
110
- return f"{date.field} < %({until_name})s"
126
+ return sql.SQL("{field} < {until}").format(
127
+ field=sql.Identifier(date.field), until=sql.Placeholder(until_name)
128
+ )
111
129
  else:
112
130
  raise ValueError(f"Invalid date operator")
113
131
 
114
132
 
115
- def _prepare_query_filters(catalog_query: CatalogQuery) -> tuple[str, dict[str, Any]]:
116
- filter_sql = ["kbid = %(kbid)s"]
133
+ def _prepare_query_filters(catalog_query: CatalogQuery) -> tuple[sql.Composable, dict[str, Any]]:
134
+ filter_sql: list[sql.Composable] = [sql.SQL("kbid = %(kbid)s")]
117
135
  filter_params: dict[str, Any] = {"kbid": catalog_query.kbid}
118
136
 
119
137
  if catalog_query.query and catalog_query.query.query:
@@ -123,47 +141,50 @@ def _prepare_query_filters(catalog_query: CatalogQuery) -> tuple[str, dict[str,
123
141
  filter_sql.append(_convert_filter(catalog_query.filters, filter_params))
124
142
 
125
143
  return (
126
- f"SELECT * FROM catalog WHERE {' AND '.join(filter_sql)}",
144
+ sql.SQL("SELECT * FROM catalog WHERE {}").format(sql.SQL(" AND ").join(filter_sql)),
127
145
  filter_params,
128
146
  )
129
147
 
130
148
 
131
- def _prepare_query_search(query: search_models.CatalogQuery, params: dict[str, Any]) -> str:
149
+ def _prepare_query_search(query: search_models.CatalogQuery, params: dict[str, Any]) -> sql.Composable:
132
150
  if query.match == search_models.CatalogQueryMatch.Exact:
133
151
  params["query"] = query.query
134
- return f"{query.field.value} = %(query)s"
152
+ return sql.SQL("{} = %(query)s").format(sql.Identifier(query.field.value))
135
153
  elif query.match == search_models.CatalogQueryMatch.StartsWith:
136
154
  params["query"] = query.query + "%"
137
155
  if query.field == search_models.CatalogQueryField.Title:
138
156
  # Insensitive search supported by pg_trgm for title
139
- return f"{query.field.value} ILIKE %(query)s"
157
+ return sql.SQL("{} ILIKE %(query)s").format(sql.Identifier(query.field.value))
140
158
  else:
141
159
  # Sensitive search for slug (btree does not support ILIKE and slugs are all lowercase anyway)
142
- return f"{query.field.value} LIKE %(query)s"
160
+ return sql.SQL("{} LIKE %(query)s").format(sql.Identifier(query.field.value))
143
161
  # The rest of operators only supported by title
144
162
  elif query.match == search_models.CatalogQueryMatch.Words:
145
163
  # This is doing tokenization inside the SQL server (to keep the index updated). We could move it to
146
164
  # the python code at update/query time if it ever becomes a problem but for now, a single regex
147
165
  # executed per query is not a problem.
148
166
  params["query"] = query.query
149
- return "regexp_split_to_array(lower(title), '\\W') @> regexp_split_to_array(lower(%(query)s), '\\W')"
167
+ return sql.SQL(
168
+ "regexp_split_to_array(lower(title), '\\W') @> regexp_split_to_array(lower(%(query)s), '\\W')"
169
+ )
150
170
  elif query.match == search_models.CatalogQueryMatch.Fuzzy:
151
171
  params["query"] = query.query
152
172
  # Note: the operator is %>, We use %%> for psycopg escaping
153
- return "title %%> %(query)s"
173
+ return sql.SQL("title %%> %(query)s")
154
174
  elif query.match == search_models.CatalogQueryMatch.EndsWith:
155
175
  params["query"] = "%" + query.query
156
- return "title ILIKE %(query)s"
176
+ return sql.SQL("title ILIKE %(query)s")
157
177
  elif query.match == search_models.CatalogQueryMatch.Contains:
158
178
  params["query"] = "%" + query.query + "%"
159
- return "title ILIKE %(query)s"
179
+ return sql.SQL("title ILIKE %(query)s")
160
180
  else: # pragma: nocover
161
181
  # This is a trick so mypy generates an error if this branch can be reached,
162
182
  # that is, if we are missing some ifs
163
183
  _a: int = "a"
184
+ return sql.SQL("")
164
185
 
165
186
 
166
- def _prepare_query(catalog_query: CatalogQuery) -> tuple[str, dict[str, Any]]:
187
+ def _prepare_query(catalog_query: CatalogQuery) -> tuple[sql.Composed, dict[str, Any]]:
167
188
  # Base query with all the filters
168
189
  query, filter_params = _prepare_query_filters(catalog_query)
169
190
 
@@ -184,11 +205,11 @@ def _prepare_query(catalog_query: CatalogQuery) -> tuple[str, dict[str, Any]]:
184
205
  else:
185
206
  order_dir = "DESC"
186
207
 
187
- query += f" ORDER BY {order_field} {order_dir}"
208
+ query += sql.SQL(" ORDER BY {} {}").format(sql.Identifier(order_field), sql.SQL(order_dir))
188
209
 
189
210
  # Pagination
190
211
  offset = catalog_query.page_size * catalog_query.page_number
191
- query += f" LIMIT %(page_size)s OFFSET %(offset)s"
212
+ query += sql.SQL(" LIMIT %(page_size)s OFFSET %(offset)s")
192
213
  filter_params["page_size"] = catalog_query.page_size
193
214
  filter_params["offset"] = offset
194
215
 
@@ -213,40 +234,18 @@ async def pgcatalog_search(catalog_query: CatalogQuery) -> Resources:
213
234
  tmp_facets: dict[str, dict[str, int]] = {
214
235
  translate_label(f): defaultdict(int) for f in catalog_query.faceted
215
236
  }
216
- facet_filters = " OR ".join(f"label LIKE '{f}/%%'" for f in tmp_facets.keys())
217
- for facet in tmp_facets.keys():
218
- if not (
219
- facet.startswith("/n/s") or facet.startswith("/n/i") or facet.startswith("/l")
220
- ):
221
- logger.warning(
222
- f"Unexpected facet used at catalog: {facet}, kbid={catalog_query.kbid}"
223
- )
224
-
225
- await cur.execute(
226
- f"SELECT label, COUNT(*) FROM (SELECT unnest(labels) AS label FROM ({query}) fc) nl WHERE ({facet_filters}) GROUP BY 1 ORDER BY 1",
227
- query_params,
228
- )
229
-
230
- for row in await cur.fetchall():
231
- label = row["label"]
232
- label_parts = label.split("/")
233
- parent = "/".join(label_parts[:-1])
234
- count = row["count"]
235
- if parent in tmp_facets:
236
- tmp_facets[parent][translate_system_to_alias_label(label)] = count
237
237
 
238
- # No need to get recursive because our facets are at most 3 levels deep (e.g: /l/set/label)
239
- if len(label_parts) >= 3:
240
- grandparent = "/".join(label_parts[:-2])
241
- if grandparent in tmp_facets:
242
- tmp_facets[grandparent][translate_system_to_alias_label(parent)] += count
238
+ if catalog_query.filters is None:
239
+ await _faceted_search_unfiltered(cur, catalog_query, tmp_facets)
240
+ else:
241
+ await _faceted_search_filtered(cur, catalog_query, tmp_facets, query, query_params)
243
242
 
244
243
  facets = {translate_system_to_alias_label(k): v for k, v in tmp_facets.items()}
245
244
 
246
245
  # Totals
247
246
  with observer({"op": "totals"}):
248
247
  await cur.execute(
249
- f"SELECT COUNT(*) FROM ({query}) fc",
248
+ sql.SQL("SELECT COUNT(*) FROM ({}) fc").format(query),
250
249
  query_params,
251
250
  )
252
251
  total = (await cur.fetchone())["count"] # type: ignore
@@ -276,3 +275,115 @@ async def pgcatalog_search(catalog_query: CatalogQuery) -> Resources:
276
275
  next_page=(catalog_query.page_size * catalog_query.page_number + len(data) < total),
277
276
  min_score=0,
278
277
  )
278
+
279
+
280
async def _faceted_search_unfiltered(
    cur: AsyncCursor[DictRow], catalog_query: CatalogQuery, tmp_facets: dict[str, dict[str, int]]
):
    """Fill ``tmp_facets`` with per-facet counts read from ``catalog_facets``.

    Counts are computed for the knowledge box ``catalog_query.kbid`` only. For
    every returned facet whose parent path is a key of ``tmp_facets``, the
    count is stored under the facet's alias label in that parent's mapping.

    Args:
        cur: Cursor returning rows as dicts (``facet`` and ``count`` keys are read).
        catalog_query: Only ``kbid`` is used.
        tmp_facets: Requested facet prefixes mapped to their result dicts;
            updated in place.
    """
    if not tmp_facets:
        # Nothing was asked for; building the prefix filter from an empty list
        # would produce invalid SQL ("AND" followed by nothing).
        return

    facet_params: dict[str, Any] = {}
    facet_sql: sql.Composable
    if len(tmp_facets) <= 5:
        # Asking for few facets, strictly filter to what we need in the query
        prefixes_sql = []
        for cnt, prefix in enumerate(tmp_facets.keys()):
            # RIGHT(facet, -n) drops the first n characters ("<prefix>/"); the
            # remainder must not contain another "/", i.e. direct children only.
            prefixes_sql.append(
                sql.SQL("(facet LIKE {} AND POSITION('/' IN RIGHT(facet, {})) = 0)").format(
                    sql.Placeholder(f"facet_{cnt}"), sql.Placeholder(f"facet_len_{cnt}")
                )
            )
            facet_params[f"facet_{cnt}"] = f"{prefix}/%"
            facet_params[f"facet_len_{cnt}"] = -(len(prefix) + 1)
        # The OR group must be parenthesized: AND binds tighter than OR, so
        # without the parentheses every prefix after the first one would be
        # evaluated without the kbid condition and count other knowledge boxes.
        facet_sql = sql.SQL("AND ({})").format(sql.SQL(" OR ").join(prefixes_sql))
    elif all(facet.startswith(("/l", "/n/i")) for facet in tmp_facets):
        # Special case for the catalog query, which can have many facets asked for
        # Filter for the categories (icon and labels) in the query, filter the rest in the code below
        facet_sql = sql.SQL("AND (facet LIKE '/l/%%' OR facet like '/n/i/%%')")
    else:
        # Worst case: ask for all facets and filter here. This is faster than applying lots of filters
        facet_sql = sql.SQL("")

    await cur.execute(
        sql.SQL(
            "SELECT facet, COUNT(*) FROM catalog_facets WHERE kbid = %(kbid)s {} GROUP BY facet"
        ).format(facet_sql),
        {"kbid": catalog_query.kbid, **facet_params},
    )

    # Only keep the facets we asked for
    for row in await cur.fetchall():
        facet = row["facet"]
        facet_parts = facet.split("/")
        parent = "/".join(facet_parts[:-1])
        if parent in tmp_facets:
            tmp_facets[parent][translate_system_to_alias_label(facet)] = row["count"]
319
+
320
+
321
async def _faceted_search_filtered(
    cur: AsyncCursor[DictRow],
    catalog_query: CatalogQuery,
    tmp_facets: dict[str, dict[str, int]],
    query: sql.Composable,
    query_params: dict[str, Any],
):
    """Fill ``tmp_facets`` with label counts computed over the rows of ``query``.

    The labels of every row selected by ``query`` are unnested, restricted to
    those below one of the requested facets, and counted. Each count is stored
    under the label's parent when that parent was requested, and added to the
    grandparent's entry for the parent when the grandparent was requested.

    Args:
        cur: Cursor returning rows as dicts (``label`` and ``count`` keys are read).
        catalog_query: Only ``kbid`` is used, for the warning message.
        tmp_facets: Requested facet prefixes mapped to their result dicts;
            updated in place.
        query: The already filtered catalog query, used as a subquery.
        query_params: Parameters referenced by ``query``.
    """
    known_roots = ("/n/s", "/n/i", "/l")
    facet_params = {}
    facet_filters = []
    for idx, requested in enumerate(tmp_facets):
        placeholder = f"facet_{idx}"
        facet_filters.append(sql.SQL("label LIKE {}").format(sql.Placeholder(placeholder)))
        facet_params[placeholder] = f"{requested}/%"
        if not requested.startswith(known_roots):
            logger.warning(f"Unexpected facet used at catalog: {requested}, kbid={catalog_query.kbid}")

    statement = sql.SQL(
        "SELECT label, COUNT(*) FROM (SELECT unnest(labels) AS label FROM ({query}) fc) nl WHERE ({facet_filters}) GROUP BY 1 ORDER BY 1"
    ).format(query=query, facet_filters=sql.SQL(" OR ").join(facet_filters))
    await cur.execute(statement, {**query_params, **facet_params})

    for row in await cur.fetchall():
        label = row["label"]
        count = row["count"]
        # Everything before the last "/" is the parent path
        parent = label.rpartition("/")[0]
        if parent in tmp_facets:
            tmp_facets[parent][translate_system_to_alias_label(label)] = count

        # No need to get recursive because our facets are at most 3 levels deep (e.g: /l/set/label)
        if label.count("/") >= 2:
            grandparent = parent.rpartition("/")[0]
            if grandparent in tmp_facets:
                tmp_facets[grandparent][translate_system_to_alias_label(parent)] += count
358
+
359
+
360
@observer.wrap({"op": "catalog_facets"})
async def pgcatalog_facets(kbid: str, request: CatalogFacetsRequest) -> dict[str, int]:
    """Return the number of ``catalog_facets`` rows per facet for one knowledge box.

    Args:
        kbid: Knowledge box whose facets are counted.
        request: Its ``prefixes`` restrict the result. Each entry matches
            facets starting with ``prefix``; when ``depth`` is set, the facet
            must additionally have an empty ``/``-separated field at position
            ``len(prefix.split("/")) + depth + 1``. With no prefixes, every
            facet of the knowledge box is counted.

    Returns:
        Mapping of facet to its row count.
    """
    prefix_filters: list[sql.Composable] = []
    prefix_params: dict[str, Any] = {}
    for cnt, prefix in enumerate(request.prefixes):
        prefix_sql = sql.SQL("facet LIKE {}").format(sql.Placeholder(f"prefix{cnt}"))
        prefix_params[f"prefix{cnt}"] = f"{prefix.prefix}%"
        if prefix.depth is not None:
            prefix_parts = len(prefix.prefix.split("/"))
            depth_sql = sql.SQL("SPLIT_PART(facet, '/', {}) = ''").format(
                sql.Placeholder(f"depth{cnt}")
            )
            prefix_params[f"depth{cnt}"] = prefix_parts + prefix.depth + 1
            prefix_sql = sql.SQL("({} AND {})").format(prefix_sql, depth_sql)
        prefix_filters.append(prefix_sql)

    filter_sql: sql.Composable
    if prefix_filters:
        # The OR group must be parenthesized: AND binds tighter than OR, so
        # without the parentheses every prefix after the first one would be
        # evaluated without the kbid condition and count other knowledge boxes.
        filter_sql = sql.SQL("AND ({})").format(sql.SQL(" OR ").join(prefix_filters))
    else:
        filter_sql = sql.SQL("")

    async with _pg_driver()._get_connection() as conn, conn.cursor() as cur:
        await cur.execute(
            sql.SQL(
                "SELECT facet, COUNT(*) FROM catalog_facets WHERE kbid = %(kbid)s {} GROUP BY facet"
            ).format(filter_sql),
            {"kbid": kbid, **prefix_params},
        )
        return {k: v for k, v in await cur.fetchall()}
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nucliadb
3
- Version: 6.5.0.post4426
3
+ Version: 6.5.0.post4476
4
4
  Summary: NucliaDB
5
5
  Author-email: Nuclia <nucliadb@nuclia.com>
6
6
  License-Expression: AGPL-3.0-or-later
@@ -19,11 +19,11 @@ Classifier: Programming Language :: Python :: 3.12
19
19
  Classifier: Programming Language :: Python :: 3 :: Only
20
20
  Requires-Python: <4,>=3.9
21
21
  Description-Content-Type: text/markdown
22
- Requires-Dist: nucliadb-telemetry[all]>=6.5.0.post4426
23
- Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.5.0.post4426
24
- Requires-Dist: nucliadb-protos>=6.5.0.post4426
25
- Requires-Dist: nucliadb-models>=6.5.0.post4426
26
- Requires-Dist: nidx-protos>=6.5.0.post4426
22
+ Requires-Dist: nucliadb-telemetry[all]>=6.5.0.post4476
23
+ Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.5.0.post4476
24
+ Requires-Dist: nucliadb-protos>=6.5.0.post4476
25
+ Requires-Dist: nucliadb-models>=6.5.0.post4476
26
+ Requires-Dist: nidx-protos>=6.5.0.post4476
27
27
  Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
28
28
  Requires-Dist: nuclia-models>=0.24.2
29
29
  Requires-Dist: uvicorn[standard]
@@ -35,8 +35,8 @@ Requires-Dist: aiofiles>=0.8.0
35
35
  Requires-Dist: psutil>=5.9.7
36
36
  Requires-Dist: types-psutil>=5.9.5.17
37
37
  Requires-Dist: types-aiofiles>=0.8.3
38
- Requires-Dist: protobuf>=5
39
- Requires-Dist: types-protobuf>=5
38
+ Requires-Dist: protobuf<6,>=5
39
+ Requires-Dist: types-protobuf<6,>=5
40
40
  Requires-Dist: grpcio>=1.71.0
41
41
  Requires-Dist: grpcio-health-checking>=1.71.0
42
42
  Requires-Dist: grpcio-channelz>=1.71.0
@@ -32,14 +32,17 @@ migrations/0033_rollover_nidx_relation_2.py,sha256=9etpqNLVS3PA14qIdsdhorReZxenD
32
32
  migrations/0034_rollover_nidx_texts_3.py,sha256=t19QtWUgHxmTaBPoR1DooAby2IYmkLTQj8qu1z2XkFc,1452
33
33
  migrations/0035_rollover_nidx_texts_4.py,sha256=W0_AUd01pjMpYMDC3yqF6HzDLgcnnPprL80kfyb1WZI,1187
34
34
  migrations/0036_backfill_catalog_slug.py,sha256=mizRM-HfPswKq4iEmqofu4kIT6Gd97ruT3qhb257vZk,2954
35
+ migrations/0037_backfill_catalog_facets.py,sha256=KAf3VKbKePw7ykDnJi47LyJ7pK1JwYkwMxrsXUnbt9g,2788
35
36
  migrations/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
36
- migrations/pg/0001_bootstrap.py,sha256=Fsqkeof50m7fKiJN05kmNEMwiKDlOrAgcAS5sLLkutA,1256
37
+ migrations/pg/0001_bootstrap.py,sha256=3O_P17l0d0h48nebN6VQLXzM_B7S7zvDpaLR0koVgWE,1274
37
38
  migrations/pg/0002_catalog.py,sha256=Rsleecu351Ty19kYZgOpqX5G3MEAY8nMxCJrAeuS2Mw,1690
38
39
  migrations/pg/0003_catalog_kbid_index.py,sha256=uKq_vtnuf73GVf0mtl2rhzdk_czAoEU1UdiVKVZpA0M,1044
39
40
  migrations/pg/0004_catalog_facets.py,sha256=FJFASHjfEHG3sNve9BP2HnnLO4xr7dnR6Qpctnmt4LE,2180
40
41
  migrations/pg/0005_purge_tasks_index.py,sha256=3mtyFgpcK0QQ_NONYay7V9xICijCLNkyTPuoc0PBjRg,1139
41
42
  migrations/pg/0006_catalog_title_indexes.py,sha256=n2OGxwE4oeCwHAYaxBkja4t10BmwTjZ2IoCyOdjEBSc,1710
42
43
  migrations/pg/0007_catalog_slug.py,sha256=mArzZCBO-RD5DkWxRIyDKgEzrnAcis1TOGvSNUe7Kgg,1150
44
+ migrations/pg/0008_catalog_facets.py,sha256=dxIUdHJHtI_Gyk2dpP7tjHEnL2iPzAufi6ajYm2FVMI,1595
45
+ migrations/pg/0009_extract_facets_safety.py,sha256=k9Appx7ipp3wDyLy70qgw9oLjN7N6BEadE-N5Fhan-4,1066
43
46
  migrations/pg/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
44
47
  nucliadb/__init__.py,sha256=_abCmDJ_0ku483Os4UAjPX7Nywm39cQgAV_DiyjsKeQ,891
45
48
  nucliadb/health.py,sha256=UIxxA4oms4HIsCRZM_SZsdkIZIlgzmOxw-qSHLlWuak,3465
@@ -164,7 +167,7 @@ nucliadb/ingest/orm/utils.py,sha256=fCQRuyecgqhaY7mcBG93oaXMkzkKb9BFjOcy4-ZiSNw,
164
167
  nucliadb/ingest/orm/processor/__init__.py,sha256=Aqd9wCNTvggkMkCY3WvoI8spdr94Jnqk-0iq9XpLs18,922
165
168
  nucliadb/ingest/orm/processor/auditing.py,sha256=TeYhXGJRyQ7ROytbb2u8R0fIh_FYi3HgTu3S1ribY3U,4623
166
169
  nucliadb/ingest/orm/processor/data_augmentation.py,sha256=v-pj4GbBWSuO8dQyahs5UDr5ghsyfhCZDS0ftKd6ZYc,5179
167
- nucliadb/ingest/orm/processor/pgcatalog.py,sha256=Zh6s0gj_bwDKPBXSs61jlMKJ6XP-dLnPGbrMGD6RHcM,3195
170
+ nucliadb/ingest/orm/processor/pgcatalog.py,sha256=GpzQv0_iWTHbM90J0rAz_QIh_TMv1XbghyDgs8tk_8M,4014
168
171
  nucliadb/ingest/orm/processor/processor.py,sha256=jaEBwbv--WyoC8zcdxWAyF0dAzVA5crVDJl56Bqv1eI,31444
169
172
  nucliadb/ingest/orm/processor/sequence_manager.py,sha256=uqEphtI1Ir_yk9jRl2gPf7BlzzXWovbARY5MNZSBI_8,1704
170
173
  nucliadb/ingest/service/__init__.py,sha256=LHQFUkdmNBOWqBG0Md9sMMI7g5TQZ-hLAnhw6ZblrJg,2002
@@ -218,7 +221,7 @@ nucliadb/search/utilities.py,sha256=9SsRDw0rJVXVoLBfF7rBb6q080h-thZc7u8uRcTiBeY,
218
221
  nucliadb/search/api/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
219
222
  nucliadb/search/api/v1/__init__.py,sha256=DH16OYnw9jQ38OpKlmdXeoq2j40ZPXZRtGvClKOkMhw,1239
220
223
  nucliadb/search/api/v1/ask.py,sha256=b4tz33HNsfT5DXv_2DMc_jirnFsHuobreWkbAKkzj5o,5337
221
- nucliadb/search/api/v1/catalog.py,sha256=3SqLgwFkFFY8x-xBruHQaZ0EGpf7oKbSj-_PnobV68E,7747
224
+ nucliadb/search/api/v1/catalog.py,sha256=7yyG46Zsaqvuut9Da-LTl0KcWgo7n5lbEhiTXslyvwM,7865
222
225
  nucliadb/search/api/v1/feedback.py,sha256=kNLc4dHz2SXHzV0PwC1WiRAwY88fDptPcP-kO0q-FrQ,2620
223
226
  nucliadb/search/api/v1/find.py,sha256=iMjyq4y0JOMC_x1B8kUfVdkCoc9G9Ark58kPLLY4HDw,10824
224
227
  nucliadb/search/api/v1/graph.py,sha256=gthqxCOn9biE6D6s93jRGLglk0ono8U7OyS390kWiI8,4178
@@ -251,7 +254,7 @@ nucliadb/search/search/ingestion_agents.py,sha256=IK6yOPEF9rST_uoqspdVdPk0pldjDh
251
254
  nucliadb/search/search/merge.py,sha256=XiRBsxhYPshPV7lZXD-9E259KZOPIf4I2tKosY0lPo4,22470
252
255
  nucliadb/search/search/metrics.py,sha256=3I6IN0qDSmqIvUaWJmT3rt-Jyjs6LcvnKI8ZqCiuJPY,3501
253
256
  nucliadb/search/search/paragraphs.py,sha256=pNAEiYqJGGUVcEf7xf-PFMVqz0PX4Qb-WNG-_zPGN2o,7799
254
- nucliadb/search/search/pgcatalog.py,sha256=QtgArjoM-dW_B1oO0aXqp5au7GlLG8jAct9jevUHatw,10997
257
+ nucliadb/search/search/pgcatalog.py,sha256=O_nRjSJf1Qc-XorVwcNlsDOftzy_zQLLfagkjU4YmSA,16718
255
258
  nucliadb/search/search/predict_proxy.py,sha256=cuD_sfM3RLdEoQaanRz0CflO6nKVGGKPzoFA17shb_w,8647
256
259
  nucliadb/search/search/query.py,sha256=0qIQdt548L3jtKOyKo06aGJ73SLBxAW3N38_Hc1M3Uw,11528
257
260
  nucliadb/search/search/rank_fusion.py,sha256=xZtXhbmKb_56gs73u6KkFm2efvTATOSMmpOV2wrAIqE,9613
@@ -372,8 +375,8 @@ nucliadb/writer/tus/local.py,sha256=7jYa_w9b-N90jWgN2sQKkNcomqn6JMVBOVeDOVYJHto,
372
375
  nucliadb/writer/tus/s3.py,sha256=vF0NkFTXiXhXq3bCVXXVV-ED38ECVoUeeYViP8uMqcU,8357
373
376
  nucliadb/writer/tus/storage.py,sha256=ToqwjoYnjI4oIcwzkhha_MPxi-k4Jk3Lt55zRwaC1SM,2903
374
377
  nucliadb/writer/tus/utils.py,sha256=MSdVbRsRSZVdkaum69_0wku7X3p5wlZf4nr6E0GMKbw,2556
375
- nucliadb-6.5.0.post4426.dist-info/METADATA,sha256=fRo_rQ3D5zAGctuqOfk22MzKACI4nZ8mijFy-JSGaT0,4152
376
- nucliadb-6.5.0.post4426.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
377
- nucliadb-6.5.0.post4426.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
378
- nucliadb-6.5.0.post4426.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
379
- nucliadb-6.5.0.post4426.dist-info/RECORD,,
378
+ nucliadb-6.5.0.post4476.dist-info/METADATA,sha256=ysG9rsv_jshf_4lJLNHXGBHLm8Br-jWbUKDgRymc9jY,4158
379
+ nucliadb-6.5.0.post4476.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
380
+ nucliadb-6.5.0.post4476.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
381
+ nucliadb-6.5.0.post4476.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
382
+ nucliadb-6.5.0.post4476.dist-info/RECORD,,