nucliadb 6.7.2.post4862__py3-none-any.whl → 6.9.2.post5282__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nucliadb might be problematic; see the package's registry page for more details.

Files changed (126)
  1. migrations/0016_upgrade_to_paragraphs_v2.py +1 -1
  2. migrations/0017_multiple_writable_shards.py +1 -1
  3. migrations/0018_purge_orphan_kbslugs.py +1 -1
  4. migrations/0019_upgrade_to_paragraphs_v3.py +1 -1
  5. migrations/0021_overwrite_vectorsets_key.py +1 -1
  6. migrations/0023_backfill_pg_catalog.py +7 -3
  7. migrations/0025_assign_models_to_kbs_v2.py +3 -3
  8. migrations/0027_rollover_texts3.py +1 -1
  9. migrations/0028_extracted_vectors_reference.py +1 -1
  10. migrations/0029_backfill_field_status.py +1 -1
  11. migrations/0032_remove_old_relations.py +1 -1
  12. migrations/0036_backfill_catalog_slug.py +1 -1
  13. migrations/0037_backfill_catalog_facets.py +1 -1
  14. migrations/0038_backfill_catalog_field_labels.py +7 -3
  15. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  16. migrations/0040_migrate_search_configurations.py +79 -0
  17. migrations/pg/0010_shards_index.py +34 -0
  18. nucliadb/backups/create.py +3 -3
  19. nucliadb/backups/restore.py +3 -3
  20. nucliadb/common/cache.py +1 -1
  21. nucliadb/common/catalog/__init__.py +79 -0
  22. nucliadb/common/catalog/dummy.py +36 -0
  23. nucliadb/common/catalog/interface.py +85 -0
  24. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +294 -208
  25. nucliadb/common/catalog/utils.py +56 -0
  26. nucliadb/common/cluster/manager.py +3 -19
  27. nucliadb/common/cluster/rebalance.py +484 -110
  28. nucliadb/common/cluster/rollover.py +29 -0
  29. nucliadb/common/cluster/settings.py +1 -1
  30. nucliadb/common/cluster/utils.py +26 -0
  31. nucliadb/common/datamanagers/atomic.py +6 -0
  32. nucliadb/common/datamanagers/utils.py +2 -2
  33. nucliadb/common/external_index_providers/manager.py +1 -29
  34. nucliadb/common/external_index_providers/settings.py +1 -27
  35. nucliadb/common/filter_expression.py +16 -33
  36. nucliadb/common/http_clients/exceptions.py +8 -0
  37. nucliadb/common/http_clients/processing.py +4 -0
  38. nucliadb/common/http_clients/utils.py +3 -0
  39. nucliadb/common/ids.py +77 -55
  40. nucliadb/common/locking.py +4 -4
  41. nucliadb/common/maindb/driver.py +11 -1
  42. nucliadb/common/maindb/local.py +1 -1
  43. nucliadb/common/maindb/pg.py +1 -1
  44. nucliadb/common/nidx.py +19 -1
  45. nucliadb/common/vector_index_config.py +1 -1
  46. nucliadb/export_import/datamanager.py +3 -3
  47. nucliadb/ingest/consumer/pull.py +7 -0
  48. nucliadb/ingest/consumer/service.py +2 -27
  49. nucliadb/ingest/consumer/shard_creator.py +17 -6
  50. nucliadb/ingest/fields/base.py +9 -17
  51. nucliadb/ingest/fields/conversation.py +47 -1
  52. nucliadb/ingest/orm/brain_v2.py +21 -3
  53. nucliadb/ingest/orm/index_message.py +126 -111
  54. nucliadb/ingest/orm/knowledgebox.py +84 -43
  55. nucliadb/ingest/orm/processor/auditing.py +1 -1
  56. nucliadb/ingest/orm/processor/processor.py +95 -149
  57. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  58. nucliadb/ingest/orm/resource.py +10 -1
  59. nucliadb/ingest/partitions.py +12 -1
  60. nucliadb/ingest/serialize.py +2 -2
  61. nucliadb/ingest/service/writer.py +26 -19
  62. nucliadb/ingest/settings.py +33 -11
  63. nucliadb/learning_proxy.py +12 -15
  64. nucliadb/metrics_exporter.py +17 -4
  65. nucliadb/migrator/datamanager.py +11 -17
  66. nucliadb/migrator/migrator.py +2 -2
  67. nucliadb/purge/__init__.py +12 -17
  68. nucliadb/purge/orphan_shards.py +2 -2
  69. nucliadb/reader/api/v1/knowledgebox.py +40 -12
  70. nucliadb/reader/api/v1/learning_config.py +30 -10
  71. nucliadb/reader/api/v1/resource.py +2 -2
  72. nucliadb/reader/api/v1/services.py +1 -1
  73. nucliadb/reader/reader/notifications.py +1 -1
  74. nucliadb/search/api/v1/__init__.py +1 -0
  75. nucliadb/search/api/v1/catalog.py +4 -4
  76. nucliadb/search/api/v1/find.py +1 -4
  77. nucliadb/search/api/v1/hydrate.py +328 -0
  78. nucliadb/search/api/v1/resource/ask.py +21 -1
  79. nucliadb/search/api/v1/search.py +1 -4
  80. nucliadb/search/predict.py +9 -2
  81. nucliadb/search/search/cache.py +1 -20
  82. nucliadb/search/search/chat/ask.py +50 -8
  83. nucliadb/search/search/chat/prompt.py +47 -15
  84. nucliadb/search/search/chat/query.py +8 -1
  85. nucliadb/search/search/fetch.py +1 -1
  86. nucliadb/search/search/find.py +1 -6
  87. nucliadb/search/search/{hydrator.py → hydrator/__init__.py} +5 -4
  88. nucliadb/search/search/hydrator/fields.py +175 -0
  89. nucliadb/search/search/hydrator/images.py +130 -0
  90. nucliadb/search/search/hydrator/paragraphs.py +307 -0
  91. nucliadb/search/search/hydrator/resources.py +56 -0
  92. nucliadb/search/search/metrics.py +16 -0
  93. nucliadb/search/search/predict_proxy.py +33 -11
  94. nucliadb/search/search/query.py +0 -23
  95. nucliadb/search/search/query_parser/fetcher.py +5 -5
  96. nucliadb/search/search/query_parser/models.py +1 -30
  97. nucliadb/search/search/query_parser/parsers/ask.py +1 -1
  98. nucliadb/search/search/query_parser/parsers/catalog.py +4 -7
  99. nucliadb/search/search/query_parser/parsers/common.py +16 -7
  100. nucliadb/search/search/query_parser/parsers/find.py +0 -11
  101. nucliadb/search/search/query_parser/parsers/graph.py +5 -5
  102. nucliadb/search/search/query_parser/parsers/search.py +0 -11
  103. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +4 -11
  104. nucliadb/search/search/rerankers.py +1 -1
  105. nucliadb/search/search/summarize.py +1 -1
  106. nucliadb/standalone/run.py +3 -0
  107. nucliadb/tasks/retries.py +4 -4
  108. nucliadb/train/generators/sentence_classifier.py +2 -8
  109. nucliadb/train/generators/utils.py +1 -1
  110. nucliadb/train/nodes.py +4 -4
  111. nucliadb/train/servicer.py +1 -1
  112. nucliadb/train/uploader.py +1 -1
  113. nucliadb/writer/api/v1/field.py +14 -9
  114. nucliadb/writer/api/v1/knowledgebox.py +15 -52
  115. nucliadb/writer/api/v1/learning_config.py +5 -4
  116. nucliadb/writer/api/v1/resource.py +2 -2
  117. nucliadb/writer/resource/field.py +38 -2
  118. nucliadb/writer/tus/azure.py +4 -4
  119. nucliadb/writer/tus/gcs.py +11 -17
  120. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/METADATA +9 -10
  121. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/RECORD +124 -114
  122. nucliadb/common/external_index_providers/pinecone.py +0 -894
  123. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  124. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/WHEEL +0 -0
  125. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/entry_points.txt +0 -0
  126. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,85 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ from __future__ import annotations
21
+
22
+ import abc
23
+ import datetime
24
+ from dataclasses import dataclass
25
+ from typing import Literal, Optional, Union
26
+
27
+ from pydantic import BaseModel, Field
28
+
29
+ from nucliadb.common.maindb.driver import Transaction
30
+ from nucliadb_models import search as search_models
31
+ from nucliadb_models.search import CatalogFacetsRequest, Resources
32
+
33
+
34
+ class CatalogResourceData(BaseModel):
35
+ """
36
+ Data extracted from a resource to be indexed in the catalog
37
+ """
38
+
39
+ title: str = Field(description="Resource title")
40
+ created_at: datetime.datetime = Field(description="Resource creation date")
41
+ modified_at: datetime.datetime = Field(description="Resource last modification date")
42
+ labels: list[str] = Field(
43
+ description="Resource labels. This includes labels at the resource level and all classification labels of its fields"
44
+ )
45
+ slug: str = Field(description="Resource slug")
46
+
47
+
48
+ @dataclass
49
+ class CatalogExpression:
50
+ @dataclass
51
+ class Date:
52
+ field: Union[Literal["created_at"], Literal["modified_at"]]
53
+ since: Optional[datetime.datetime]
54
+ until: Optional[datetime.datetime]
55
+
56
+ bool_and: Optional[list["CatalogExpression"]] = None
57
+ bool_or: Optional[list["CatalogExpression"]] = None
58
+ bool_not: Optional["CatalogExpression"] = None
59
+ date: Optional[Date] = None
60
+ facet: Optional[str] = None
61
+ resource_id: Optional[str] = None
62
+
63
+
64
+ class CatalogQuery(BaseModel):
65
+ kbid: str
66
+ query: Optional[search_models.CatalogQuery] = Field(description="Full-text search query")
67
+ filters: Optional[CatalogExpression] = Field(description="Filters to apply to the search")
68
+ sort: search_models.SortOptions = Field(description="Sorting option")
69
+ faceted: list[str] = Field(description="List of facets to compute during the search")
70
+ page_size: int = Field(description="Used for pagination. Maximum page size is 100")
71
+ page_number: int = Field(description="Used for pagination. First page is 0")
72
+
73
+
74
+ class Catalog(abc.ABC, metaclass=abc.ABCMeta):
75
+ @abc.abstractmethod
76
+ async def update(self, txn: Transaction, kbid: str, rid: str, data: CatalogResourceData): ...
77
+
78
+ @abc.abstractmethod
79
+ async def delete(self, txn: Transaction, kbid: str, rid: str): ...
80
+
81
+ @abc.abstractmethod
82
+ async def search(self, query: CatalogQuery) -> Resources: ...
83
+
84
+ @abc.abstractmethod
85
+ async def facets(self, kbid: str, request: CatalogFacetsRequest) -> dict[str, int]: ...
@@ -26,111 +26,180 @@ from typing import Any, Literal, Union, cast
26
26
  from psycopg import AsyncCursor, sql
27
27
  from psycopg.rows import DictRow, dict_row
28
28
 
29
- from nucliadb.common.maindb.pg import PGDriver
29
+ from nucliadb.common.catalog.interface import (
30
+ Catalog,
31
+ CatalogExpression,
32
+ CatalogQuery,
33
+ CatalogResourceData,
34
+ )
35
+ from nucliadb.common.exceptions import InvalidQueryError
36
+ from nucliadb.common.maindb.driver import Transaction
37
+ from nucliadb.common.maindb.pg import PGDriver, PGTransaction
30
38
  from nucliadb.common.maindb.utils import get_driver
31
- from nucliadb.search.search.query_parser.models import CatalogExpression, CatalogQuery
32
39
  from nucliadb_models import search as search_models
33
- from nucliadb_models.labels import translate_system_to_alias_label
34
- from nucliadb_models.search import CatalogFacetsRequest, ResourceResult, Resources, SortField, SortOrder
40
+ from nucliadb_models.labels import translate_alias_to_system_label, translate_system_to_alias_label
41
+ from nucliadb_models.search import (
42
+ CatalogFacetsRequest,
43
+ ResourceResult,
44
+ Resources,
45
+ SortField,
46
+ SortOrder,
47
+ )
35
48
  from nucliadb_telemetry import metrics
36
49
 
37
- from .filters import translate_label
50
+ write_observer = metrics.Observer("pg_catalog_write", labels={"type": ""})
51
+ search_observer = metrics.Observer("pg_catalog_search", labels={"op": ""})
38
52
 
39
- observer = metrics.Observer("pg_catalog_search", labels={"op": ""})
40
53
  logger = logging.getLogger(__name__)
41
54
 
42
55
  SPLIT_REGEX = re.compile(r"\W")
43
56
 
44
57
 
45
- def _filter_operands(operands: list[CatalogExpression]) -> tuple[list[str], list[CatalogExpression]]:
46
- facets = []
47
- nonfacets = []
48
- for op in operands:
49
- if op.facet:
50
- facets.append(op.facet)
51
- else:
52
- nonfacets.append(op)
58
+ def _pg_transaction(txn: Transaction) -> PGTransaction:
59
+ return cast(PGTransaction, txn)
53
60
 
54
- return facets, nonfacets
55
61
 
56
-
57
- def _convert_filter(expr: CatalogExpression, filter_params: dict[str, Any]) -> sql.Composable:
58
- if expr.bool_and:
59
- return _convert_boolean_op(expr.bool_and, "and", filter_params)
60
- elif expr.bool_or:
61
- return _convert_boolean_op(expr.bool_or, "or", filter_params)
62
- elif expr.bool_not:
63
- return sql.SQL("(NOT {})").format(_convert_filter(expr.bool_not, filter_params))
64
- elif expr.date:
65
- return _convert_date_filter(expr.date, filter_params)
66
- elif expr.facet:
67
- param_name = f"param{len(filter_params)}"
68
- filter_params[param_name] = [expr.facet]
69
- if expr.facet == "/n/s/PROCESSED":
70
- # Optimization for the most common case, we know PROCESSED is a full label and can use the smaller labels index
71
- # This is needed because PROCESSED is present in most catalog entries and PG is unlikely to use any index
72
- # for it, falling back to executing the extract_facets function which can be slow
73
- return sql.SQL("labels @> {}").format(sql.Placeholder(param_name))
74
- else:
75
- return sql.SQL("extract_facets(labels) @> {}").format(sql.Placeholder(param_name))
76
- elif expr.resource_id:
77
- param_name = f"param{len(filter_params)}"
78
- filter_params[param_name] = [expr.resource_id]
79
- return sql.SQL("rid = {}").format(sql.Placeholder(param_name))
80
- else:
81
- return sql.SQL("")
62
+ def _pg_driver() -> PGDriver:
63
+ return cast(PGDriver, get_driver())
82
64
 
83
65
 
84
- def _convert_boolean_op(
85
- operands: list[CatalogExpression],
86
- op: Union[Literal["and"], Literal["or"]],
87
- filter_params: dict[str, Any],
88
- ) -> sql.Composable:
89
- array_op = sql.SQL("@>" if op == "and" else "&&")
90
- operands_sql: list[sql.Composable] = []
91
- facets, nonfacets = _filter_operands(operands)
92
- if facets:
93
- param_name = f"param{len(filter_params)}"
94
- if facets == ["/n/s/PROCESSED"]:
95
- # Optimization for the most common case, we know PROCESSED is a full label and can use the smaller labels index
96
- # This is needed because PROCESSED is present in most catalog entries and PG is unlikely to use any index
97
- # for it, falling back to executing the extract_facets function which can be slow
98
- operands_sql.append(sql.SQL("labels @> {}").format(sql.Placeholder(param_name)))
99
- else:
100
- operands_sql.append(
101
- sql.SQL("extract_facets(labels) {} {}").format(array_op, sql.Placeholder(param_name))
66
+ class PGCatalog(Catalog):
67
+ @write_observer.wrap({"type": "update"})
68
+ async def update(self, txn: Transaction, kbid: str, rid: str, data: CatalogResourceData):
69
+ async with _pg_transaction(txn).connection.cursor() as cur:
70
+ await cur.execute(
71
+ """
72
+ INSERT INTO catalog
73
+ (kbid, rid, title, created_at, modified_at, labels, slug)
74
+ VALUES
75
+ (%(kbid)s, %(rid)s, %(title)s, %(created_at)s, %(modified_at)s, %(labels)s, %(slug)s)
76
+ ON CONFLICT (kbid, rid) DO UPDATE SET
77
+ title = excluded.title,
78
+ created_at = excluded.created_at,
79
+ modified_at = excluded.modified_at,
80
+ labels = excluded.labels,
81
+ slug = excluded.slug""",
82
+ {
83
+ "kbid": kbid,
84
+ "rid": rid,
85
+ "title": data.title,
86
+ "created_at": data.created_at,
87
+ "modified_at": data.modified_at,
88
+ "labels": data.labels,
89
+ "slug": data.slug,
90
+ },
91
+ )
92
+ await cur.execute(
93
+ "DELETE FROM catalog_facets WHERE kbid = %(kbid)s AND rid = %(rid)s",
94
+ {
95
+ "kbid": kbid,
96
+ "rid": rid,
97
+ },
98
+ )
99
+ await cur.execute(
100
+ "INSERT INTO catalog_facets (kbid, rid, facet) SELECT %(kbid)s AS kbid, %(rid)s AS rid, unnest(%(facets)s::text[]) AS facet",
101
+ {
102
+ "kbid": kbid,
103
+ "rid": rid,
104
+ "facets": list(extract_facets(data.labels)),
105
+ },
102
106
  )
103
- filter_params[param_name] = facets
104
- for nonfacet in nonfacets:
105
- operands_sql.append(_convert_filter(nonfacet, filter_params))
106
- return sql.SQL("({})").format(sql.SQL(f" {op.upper()} ").join(operands_sql))
107
107
 
108
+ @write_observer.wrap({"type": "delete"})
109
+ async def delete(self, txn: Transaction, kbid: str, rid: str):
110
+ async with _pg_transaction(txn).connection.cursor() as cur:
111
+ await cur.execute(
112
+ "DELETE FROM catalog where kbid = %(kbid)s AND rid = %(rid)s", {"kbid": kbid, "rid": rid}
113
+ )
108
114
 
109
- def _convert_date_filter(date: CatalogExpression.Date, filter_params: dict[str, Any]) -> sql.Composable:
110
- if date.since and date.until:
111
- since_name = f"param{len(filter_params)}"
112
- filter_params[since_name] = date.since
113
- until_name = f"param{len(filter_params)}"
114
- filter_params[until_name] = date.until
115
- return sql.SQL("{field} BETWEEN {since} AND {until}").format(
116
- field=sql.Identifier(date.field),
117
- since=sql.Placeholder(since_name),
118
- until=sql.Placeholder(until_name),
119
- )
120
- elif date.since:
121
- since_name = f"param{len(filter_params)}"
122
- filter_params[since_name] = date.since
123
- return sql.SQL("{field} > {since}").format(
124
- field=sql.Identifier(date.field), since=sql.Placeholder(since_name)
125
- )
126
- elif date.until:
127
- until_name = f"param{len(filter_params)}"
128
- filter_params[until_name] = date.until
129
- return sql.SQL("{field} < {until}").format(
130
- field=sql.Identifier(date.field), until=sql.Placeholder(until_name)
115
+ @search_observer.wrap({"op": "search"})
116
+ async def search(self, catalog_query: CatalogQuery) -> Resources:
117
+ # Prepare SQL query
118
+ query, query_params = _prepare_query_filters(catalog_query)
119
+
120
+ async with _pg_driver()._get_connection() as conn, conn.cursor(row_factory=dict_row) as cur:
121
+ facets = {}
122
+
123
+ # Faceted search
124
+ if catalog_query.faceted:
125
+ with search_observer({"op": "facets"}):
126
+ tmp_facets: dict[str, dict[str, int]] = {
127
+ translate_label(f): defaultdict(int) for f in catalog_query.faceted
128
+ }
129
+
130
+ if catalog_query.filters is None:
131
+ await _faceted_search_unfiltered(cur, catalog_query, tmp_facets)
132
+ else:
133
+ await _faceted_search_filtered(
134
+ cur, catalog_query, tmp_facets, query, query_params
135
+ )
136
+
137
+ facets = {translate_system_to_alias_label(k): v for k, v in tmp_facets.items()}
138
+
139
+ # Totals
140
+ with search_observer({"op": "totals"}):
141
+ await cur.execute(
142
+ sql.SQL("SELECT COUNT(*) FROM ({}) fc").format(query),
143
+ query_params,
144
+ )
145
+ total = (await cur.fetchone())["count"] # type: ignore
146
+
147
+ # Query
148
+ with search_observer({"op": "query"}):
149
+ query, query_params = _prepare_query(catalog_query)
150
+ await cur.execute(query, query_params)
151
+ data = await cur.fetchall()
152
+
153
+ return Resources(
154
+ facets=facets,
155
+ results=[
156
+ ResourceResult(
157
+ rid=str(r["rid"]).replace("-", ""),
158
+ field="title",
159
+ field_type="a",
160
+ labels=[label for label in r["labels"] if label.startswith("/l/")],
161
+ score=0,
162
+ )
163
+ for r in data
164
+ ],
165
+ query=catalog_query.query.query if catalog_query.query else "",
166
+ total=total,
167
+ page_number=catalog_query.page_number,
168
+ page_size=catalog_query.page_size,
169
+ next_page=(catalog_query.page_size * catalog_query.page_number + len(data) < total),
170
+ min_score=0,
131
171
  )
132
- else:
133
- raise ValueError(f"Invalid date operator")
172
+
173
+ @search_observer.wrap({"op": "catalog_facets"})
174
+ async def facets(self, kbid: str, request: CatalogFacetsRequest) -> dict[str, int]:
175
+ async with _pg_driver()._get_connection() as conn, conn.cursor() as cur:
176
+ prefix_filters: list[sql.Composable] = []
177
+ prefix_params: dict[str, Any] = {}
178
+ for cnt, prefix in enumerate(request.prefixes):
179
+ prefix_sql = sql.SQL("facet LIKE {}").format(sql.Placeholder(f"prefix{cnt}"))
180
+ prefix_params[f"prefix{cnt}"] = f"{prefix.prefix}%"
181
+ if prefix.depth is not None:
182
+ prefix_parts = len(prefix.prefix.split("/"))
183
+ depth_sql = sql.SQL("SPLIT_PART(facet, '/', {}) = ''").format(
184
+ sql.Placeholder(f"depth{cnt}")
185
+ )
186
+ prefix_params[f"depth{cnt}"] = prefix_parts + prefix.depth + 1
187
+ prefix_sql = sql.SQL("({} AND {})").format(prefix_sql, depth_sql)
188
+ prefix_filters.append(prefix_sql)
189
+
190
+ filter_sql: sql.Composable
191
+ if prefix_filters:
192
+ filter_sql = sql.SQL("AND {}").format(sql.SQL(" OR ").join(prefix_filters))
193
+ else:
194
+ filter_sql = sql.SQL("")
195
+
196
+ await cur.execute(
197
+ sql.SQL(
198
+ "SELECT facet, COUNT(*) FROM catalog_facets WHERE kbid = %(kbid)s {} GROUP BY facet"
199
+ ).format(filter_sql),
200
+ {"kbid": kbid, **prefix_params},
201
+ )
202
+ return {k: v for k, v in await cur.fetchall()}
134
203
 
135
204
 
136
205
  def _prepare_query_filters(catalog_query: CatalogQuery) -> tuple[sql.Composable, dict[str, Any]]:
@@ -149,42 +218,16 @@ def _prepare_query_filters(catalog_query: CatalogQuery) -> tuple[sql.Composable,
149
218
  )
150
219
 
151
220
 
152
- def _prepare_query_search(query: search_models.CatalogQuery, params: dict[str, Any]) -> sql.Composable:
153
- if query.match == search_models.CatalogQueryMatch.Exact:
154
- params["query"] = query.query
155
- return sql.SQL("{} = %(query)s").format(sql.Identifier(query.field.value))
156
- elif query.match == search_models.CatalogQueryMatch.StartsWith:
157
- params["query"] = query.query + "%"
158
- if query.field == search_models.CatalogQueryField.Title:
159
- # Insensitive search supported by pg_trgm for title
160
- return sql.SQL("{} ILIKE %(query)s").format(sql.Identifier(query.field.value))
221
+ def _filter_operands(operands: list[CatalogExpression]) -> tuple[list[str], list[CatalogExpression]]:
222
+ facets = []
223
+ nonfacets = []
224
+ for op in operands:
225
+ if op.facet:
226
+ facets.append(op.facet)
161
227
  else:
162
- # Sensitive search for slug (btree does not support ILIKE and slugs are all lowercase anyway)
163
- return sql.SQL("{} LIKE %(query)s").format(sql.Identifier(query.field.value))
164
- # The rest of operators only supported by title
165
- elif query.match == search_models.CatalogQueryMatch.Words:
166
- # This is doing tokenization inside the SQL server (to keep the index updated). We could move it to
167
- # the python code at update/query time if it ever becomes a problem but for now, a single regex
168
- # executed per query is not a problem.
228
+ nonfacets.append(op)
169
229
 
170
- # Remove zero-length words from the split
171
- params["query"] = [word.lower() for word in SPLIT_REGEX.split(query.query) if word]
172
- return sql.SQL("regexp_split_to_array(lower(title), '\\W') @> %(query)s")
173
- elif query.match == search_models.CatalogQueryMatch.Fuzzy:
174
- params["query"] = query.query
175
- # Note: the operator is %>, We use %%> for psycopg escaping
176
- return sql.SQL("title %%> %(query)s")
177
- elif query.match == search_models.CatalogQueryMatch.EndsWith:
178
- params["query"] = "%" + query.query
179
- return sql.SQL("title ILIKE %(query)s")
180
- elif query.match == search_models.CatalogQueryMatch.Contains:
181
- params["query"] = "%" + query.query + "%"
182
- return sql.SQL("title ILIKE %(query)s")
183
- else: # pragma: nocover
184
- # This is a trick so mypy generates an error if this branch can be reached,
185
- # that is, if we are missing some ifs
186
- _a: int = "a"
187
- return sql.SQL("")
230
+ return facets, nonfacets
188
231
 
189
232
 
190
233
  def _prepare_query(catalog_query: CatalogQuery) -> tuple[sql.Composed, dict[str, Any]]:
@@ -219,67 +262,6 @@ def _prepare_query(catalog_query: CatalogQuery) -> tuple[sql.Composed, dict[str,
219
262
  return query, filter_params
220
263
 
221
264
 
222
- def _pg_driver() -> PGDriver:
223
- return cast(PGDriver, get_driver())
224
-
225
-
226
- @observer.wrap({"op": "search"})
227
- async def pgcatalog_search(catalog_query: CatalogQuery) -> Resources:
228
- # Prepare SQL query
229
- query, query_params = _prepare_query_filters(catalog_query)
230
-
231
- async with _pg_driver()._get_connection() as conn, conn.cursor(row_factory=dict_row) as cur:
232
- facets = {}
233
-
234
- # Faceted search
235
- if catalog_query.faceted:
236
- with observer({"op": "facets"}):
237
- tmp_facets: dict[str, dict[str, int]] = {
238
- translate_label(f): defaultdict(int) for f in catalog_query.faceted
239
- }
240
-
241
- if catalog_query.filters is None:
242
- await _faceted_search_unfiltered(cur, catalog_query, tmp_facets)
243
- else:
244
- await _faceted_search_filtered(cur, catalog_query, tmp_facets, query, query_params)
245
-
246
- facets = {translate_system_to_alias_label(k): v for k, v in tmp_facets.items()}
247
-
248
- # Totals
249
- with observer({"op": "totals"}):
250
- await cur.execute(
251
- sql.SQL("SELECT COUNT(*) FROM ({}) fc").format(query),
252
- query_params,
253
- )
254
- total = (await cur.fetchone())["count"] # type: ignore
255
-
256
- # Query
257
- with observer({"op": "query"}):
258
- query, query_params = _prepare_query(catalog_query)
259
- await cur.execute(query, query_params)
260
- data = await cur.fetchall()
261
-
262
- return Resources(
263
- facets=facets,
264
- results=[
265
- ResourceResult(
266
- rid=str(r["rid"]).replace("-", ""),
267
- field="title",
268
- field_type="a",
269
- labels=[label for label in r["labels"] if label.startswith("/l/")],
270
- score=0,
271
- )
272
- for r in data
273
- ],
274
- query=catalog_query.query.query if catalog_query.query else "",
275
- total=total,
276
- page_number=catalog_query.page_number,
277
- page_size=catalog_query.page_size,
278
- next_page=(catalog_query.page_size * catalog_query.page_number + len(data) < total),
279
- min_score=0,
280
- )
281
-
282
-
283
265
  async def _faceted_search_unfiltered(
284
266
  cur: AsyncCursor[DictRow], catalog_query: CatalogQuery, tmp_facets: dict[str, dict[str, int]]
285
267
  ):
@@ -360,33 +342,137 @@ async def _faceted_search_filtered(
360
342
  tmp_facets[grandparent][translate_system_to_alias_label(parent)] += count
361
343
 
362
344
 
363
- @observer.wrap({"op": "catalog_facets"})
364
- async def pgcatalog_facets(kbid: str, request: CatalogFacetsRequest) -> dict[str, int]:
365
- async with _pg_driver()._get_connection() as conn, conn.cursor() as cur:
366
- prefix_filters: list[sql.Composable] = []
367
- prefix_params: dict[str, Any] = {}
368
- for cnt, prefix in enumerate(request.prefixes):
369
- prefix_sql = sql.SQL("facet LIKE {}").format(sql.Placeholder(f"prefix{cnt}"))
370
- prefix_params[f"prefix{cnt}"] = f"{prefix.prefix}%"
371
- if prefix.depth is not None:
372
- prefix_parts = len(prefix.prefix.split("/"))
373
- depth_sql = sql.SQL("SPLIT_PART(facet, '/', {}) = ''").format(
374
- sql.Placeholder(f"depth{cnt}")
375
- )
376
- prefix_params[f"depth{cnt}"] = prefix_parts + prefix.depth + 1
377
- prefix_sql = sql.SQL("({} AND {})").format(prefix_sql, depth_sql)
378
- prefix_filters.append(prefix_sql)
345
+ def _prepare_query_search(query: search_models.CatalogQuery, params: dict[str, Any]) -> sql.Composable:
346
+ if query.match == search_models.CatalogQueryMatch.Exact:
347
+ params["query"] = query.query
348
+ return sql.SQL("{} = %(query)s").format(sql.Identifier(query.field.value))
349
+ elif query.match == search_models.CatalogQueryMatch.StartsWith:
350
+ params["query"] = query.query + "%"
351
+ if query.field == search_models.CatalogQueryField.Title:
352
+ # Insensitive search supported by pg_trgm for title
353
+ return sql.SQL("{} ILIKE %(query)s").format(sql.Identifier(query.field.value))
354
+ else:
355
+ # Sensitive search for slug (btree does not support ILIKE and slugs are all lowercase anyway)
356
+ return sql.SQL("{} LIKE %(query)s").format(sql.Identifier(query.field.value))
357
+ # The rest of operators only supported by title
358
+ elif query.match == search_models.CatalogQueryMatch.Words:
359
+ # This is doing tokenization inside the SQL server (to keep the index updated). We could move it to
360
+ # the python code at update/query time if it ever becomes a problem but for now, a single regex
361
+ # executed per query is not a problem.
379
362
 
380
- filter_sql: sql.Composable
381
- if prefix_filters:
382
- filter_sql = sql.SQL("AND {}").format(sql.SQL(" OR ").join(prefix_filters))
363
+ # Remove zero-length words from the split
364
+ params["query"] = [word.lower() for word in SPLIT_REGEX.split(query.query) if word]
365
+ return sql.SQL("regexp_split_to_array(lower(title), '\\W') @> %(query)s")
366
+ elif query.match == search_models.CatalogQueryMatch.Fuzzy:
367
+ params["query"] = query.query
368
+ # Note: the operator is %>, We use %%> for psycopg escaping
369
+ return sql.SQL("title %%> %(query)s")
370
+ elif query.match == search_models.CatalogQueryMatch.EndsWith:
371
+ params["query"] = "%" + query.query
372
+ return sql.SQL("title ILIKE %(query)s")
373
+ elif query.match == search_models.CatalogQueryMatch.Contains:
374
+ params["query"] = "%" + query.query + "%"
375
+ return sql.SQL("title ILIKE %(query)s")
376
+ else: # pragma: no cover
377
+ # This is a trick so mypy generates an error if this branch can be reached,
378
+ # that is, if we are missing some ifs
379
+ _a: int = "a"
380
+ return sql.SQL("")
381
+
382
+
383
+ def _convert_filter(expr: CatalogExpression, filter_params: dict[str, Any]) -> sql.Composable:
384
+ if expr.bool_and:
385
+ return _convert_boolean_op(expr.bool_and, "and", filter_params)
386
+ elif expr.bool_or:
387
+ return _convert_boolean_op(expr.bool_or, "or", filter_params)
388
+ elif expr.bool_not:
389
+ return sql.SQL("(NOT {})").format(_convert_filter(expr.bool_not, filter_params))
390
+ elif expr.date:
391
+ return _convert_date_filter(expr.date, filter_params)
392
+ elif expr.facet:
393
+ param_name = f"param{len(filter_params)}"
394
+ filter_params[param_name] = [expr.facet]
395
+ if expr.facet == "/n/s/PROCESSED":
396
+ # Optimization for the most common case, we know PROCESSED is a full label and can use the smaller labels index
397
+ # This is needed because PROCESSED is present in most catalog entries and PG is unlikely to use any index
398
+ # for it, falling back to executing the extract_facets function which can be slow
399
+ return sql.SQL("labels @> {}").format(sql.Placeholder(param_name))
383
400
  else:
384
- filter_sql = sql.SQL("")
401
+ return sql.SQL("extract_facets(labels) @> {}").format(sql.Placeholder(param_name))
402
+ elif expr.resource_id:
403
+ param_name = f"param{len(filter_params)}"
404
+ filter_params[param_name] = [expr.resource_id]
405
+ return sql.SQL("rid = {}").format(sql.Placeholder(param_name))
406
+ else:
407
+ return sql.SQL("")
408
+
409
+
410
+ def _convert_boolean_op(
411
+ operands: list[CatalogExpression],
412
+ op: Union[Literal["and"], Literal["or"]],
413
+ filter_params: dict[str, Any],
414
+ ) -> sql.Composable:
415
+ array_op = sql.SQL("@>" if op == "and" else "&&")
416
+ operands_sql: list[sql.Composable] = []
417
+ facets, nonfacets = _filter_operands(operands)
418
+ if facets:
419
+ param_name = f"param{len(filter_params)}"
420
+ if facets == ["/n/s/PROCESSED"]:
421
+ # Optimization for the most common case, we know PROCESSED is a full label and can use the smaller labels index
422
+ # This is needed because PROCESSED is present in most catalog entries and PG is unlikely to use any index
423
+ # for it, falling back to executing the extract_facets function which can be slow
424
+ operands_sql.append(sql.SQL("labels @> {}").format(sql.Placeholder(param_name)))
425
+ else:
426
+ operands_sql.append(
427
+ sql.SQL("extract_facets(labels) {} {}").format(array_op, sql.Placeholder(param_name))
428
+ )
429
+ filter_params[param_name] = facets
430
+ for nonfacet in nonfacets:
431
+ operands_sql.append(_convert_filter(nonfacet, filter_params))
432
+ return sql.SQL("({})").format(sql.SQL(f" {op.upper()} ").join(operands_sql))
433
+
385
434
 
386
- await cur.execute(
387
- sql.SQL(
388
- "SELECT facet, COUNT(*) FROM catalog_facets WHERE kbid = %(kbid)s {} GROUP BY facet"
389
- ).format(filter_sql),
390
- {"kbid": kbid, **prefix_params},
435
+ def _convert_date_filter(date: CatalogExpression.Date, filter_params: dict[str, Any]) -> sql.Composable:
436
+ if date.since and date.until:
437
+ since_name = f"param{len(filter_params)}"
438
+ filter_params[since_name] = date.since
439
+ until_name = f"param{len(filter_params)}"
440
+ filter_params[until_name] = date.until
441
+ return sql.SQL("{field} BETWEEN {since} AND {until}").format(
442
+ field=sql.Identifier(date.field),
443
+ since=sql.Placeholder(since_name),
444
+ until=sql.Placeholder(until_name),
391
445
  )
392
- return {k: v for k, v in await cur.fetchall()}
446
+ elif date.since:
447
+ since_name = f"param{len(filter_params)}"
448
+ filter_params[since_name] = date.since
449
+ return sql.SQL("{field} > {since}").format(
450
+ field=sql.Identifier(date.field), since=sql.Placeholder(since_name)
451
+ )
452
+ elif date.until:
453
+ until_name = f"param{len(filter_params)}"
454
+ filter_params[until_name] = date.until
455
+ return sql.SQL("{field} < {until}").format(
456
+ field=sql.Identifier(date.field), until=sql.Placeholder(until_name)
457
+ )
458
+ else:
459
+ raise ValueError(f"Invalid date operator")
460
+
461
+
462
+ def translate_label(literal: str) -> str:
463
+ if len(literal) == 0:
464
+ raise InvalidQueryError("filters", "Invalid empty label")
465
+ if literal[0] != "/":
466
+ raise InvalidQueryError("filters", f"Invalid label. It must start with a `/`: {literal}")
467
+ return translate_alias_to_system_label(literal)
468
+
469
+
470
+ def extract_facets(labels: list[str]) -> set[str]:
471
+ facets = set()
472
+ for label in labels:
473
+ parts = label.split("/")
474
+ facet = ""
475
+ for part in parts[1:]:
476
+ facet += f"/{part}"
477
+ facets.add(facet)
478
+ return facets