nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (246) hide show
  1. migrations/0023_backfill_pg_catalog.py +8 -4
  2. migrations/0028_extracted_vectors_reference.py +1 -1
  3. migrations/0029_backfill_field_status.py +3 -4
  4. migrations/0032_remove_old_relations.py +2 -3
  5. migrations/0038_backfill_catalog_field_labels.py +8 -4
  6. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  7. migrations/0040_migrate_search_configurations.py +79 -0
  8. migrations/0041_reindex_conversations.py +137 -0
  9. migrations/pg/0010_shards_index.py +34 -0
  10. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  11. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  12. nucliadb/backups/create.py +2 -15
  13. nucliadb/backups/restore.py +4 -15
  14. nucliadb/backups/tasks.py +4 -1
  15. nucliadb/common/back_pressure/cache.py +2 -3
  16. nucliadb/common/back_pressure/materializer.py +7 -13
  17. nucliadb/common/back_pressure/settings.py +6 -6
  18. nucliadb/common/back_pressure/utils.py +1 -0
  19. nucliadb/common/cache.py +9 -9
  20. nucliadb/common/catalog/__init__.py +79 -0
  21. nucliadb/common/catalog/dummy.py +36 -0
  22. nucliadb/common/catalog/interface.py +85 -0
  23. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
  24. nucliadb/common/catalog/utils.py +56 -0
  25. nucliadb/common/cluster/manager.py +8 -23
  26. nucliadb/common/cluster/rebalance.py +484 -112
  27. nucliadb/common/cluster/rollover.py +36 -9
  28. nucliadb/common/cluster/settings.py +4 -9
  29. nucliadb/common/cluster/utils.py +34 -8
  30. nucliadb/common/context/__init__.py +7 -8
  31. nucliadb/common/context/fastapi.py +1 -2
  32. nucliadb/common/datamanagers/__init__.py +2 -4
  33. nucliadb/common/datamanagers/atomic.py +9 -2
  34. nucliadb/common/datamanagers/cluster.py +1 -2
  35. nucliadb/common/datamanagers/fields.py +3 -4
  36. nucliadb/common/datamanagers/kb.py +6 -6
  37. nucliadb/common/datamanagers/labels.py +2 -3
  38. nucliadb/common/datamanagers/resources.py +10 -33
  39. nucliadb/common/datamanagers/rollover.py +5 -7
  40. nucliadb/common/datamanagers/search_configurations.py +1 -2
  41. nucliadb/common/datamanagers/synonyms.py +1 -2
  42. nucliadb/common/datamanagers/utils.py +4 -4
  43. nucliadb/common/datamanagers/vectorsets.py +4 -4
  44. nucliadb/common/external_index_providers/base.py +32 -5
  45. nucliadb/common/external_index_providers/manager.py +5 -34
  46. nucliadb/common/external_index_providers/settings.py +1 -27
  47. nucliadb/common/filter_expression.py +129 -41
  48. nucliadb/common/http_clients/exceptions.py +8 -0
  49. nucliadb/common/http_clients/processing.py +16 -23
  50. nucliadb/common/http_clients/utils.py +3 -0
  51. nucliadb/common/ids.py +82 -58
  52. nucliadb/common/locking.py +1 -2
  53. nucliadb/common/maindb/driver.py +9 -8
  54. nucliadb/common/maindb/local.py +5 -5
  55. nucliadb/common/maindb/pg.py +9 -8
  56. nucliadb/common/nidx.py +22 -5
  57. nucliadb/common/vector_index_config.py +1 -1
  58. nucliadb/export_import/datamanager.py +4 -3
  59. nucliadb/export_import/exporter.py +11 -19
  60. nucliadb/export_import/importer.py +13 -6
  61. nucliadb/export_import/tasks.py +2 -0
  62. nucliadb/export_import/utils.py +6 -18
  63. nucliadb/health.py +2 -2
  64. nucliadb/ingest/app.py +8 -8
  65. nucliadb/ingest/consumer/consumer.py +8 -10
  66. nucliadb/ingest/consumer/pull.py +10 -8
  67. nucliadb/ingest/consumer/service.py +5 -30
  68. nucliadb/ingest/consumer/shard_creator.py +16 -5
  69. nucliadb/ingest/consumer/utils.py +1 -1
  70. nucliadb/ingest/fields/base.py +37 -49
  71. nucliadb/ingest/fields/conversation.py +55 -9
  72. nucliadb/ingest/fields/exceptions.py +1 -2
  73. nucliadb/ingest/fields/file.py +22 -8
  74. nucliadb/ingest/fields/link.py +7 -7
  75. nucliadb/ingest/fields/text.py +2 -3
  76. nucliadb/ingest/orm/brain_v2.py +89 -57
  77. nucliadb/ingest/orm/broker_message.py +2 -4
  78. nucliadb/ingest/orm/entities.py +10 -209
  79. nucliadb/ingest/orm/index_message.py +128 -113
  80. nucliadb/ingest/orm/knowledgebox.py +91 -59
  81. nucliadb/ingest/orm/processor/auditing.py +1 -3
  82. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  83. nucliadb/ingest/orm/processor/processor.py +98 -153
  84. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  85. nucliadb/ingest/orm/resource.py +82 -71
  86. nucliadb/ingest/orm/utils.py +1 -1
  87. nucliadb/ingest/partitions.py +12 -1
  88. nucliadb/ingest/processing.py +17 -17
  89. nucliadb/ingest/serialize.py +202 -145
  90. nucliadb/ingest/service/writer.py +15 -114
  91. nucliadb/ingest/settings.py +36 -15
  92. nucliadb/ingest/utils.py +1 -2
  93. nucliadb/learning_proxy.py +23 -26
  94. nucliadb/metrics_exporter.py +20 -6
  95. nucliadb/middleware/__init__.py +82 -1
  96. nucliadb/migrator/datamanager.py +4 -11
  97. nucliadb/migrator/migrator.py +1 -2
  98. nucliadb/migrator/models.py +1 -2
  99. nucliadb/migrator/settings.py +1 -2
  100. nucliadb/models/internal/augment.py +614 -0
  101. nucliadb/models/internal/processing.py +19 -19
  102. nucliadb/openapi.py +2 -2
  103. nucliadb/purge/__init__.py +3 -8
  104. nucliadb/purge/orphan_shards.py +1 -2
  105. nucliadb/reader/__init__.py +5 -0
  106. nucliadb/reader/api/models.py +6 -13
  107. nucliadb/reader/api/v1/download.py +59 -38
  108. nucliadb/reader/api/v1/export_import.py +4 -4
  109. nucliadb/reader/api/v1/knowledgebox.py +37 -9
  110. nucliadb/reader/api/v1/learning_config.py +33 -14
  111. nucliadb/reader/api/v1/resource.py +61 -9
  112. nucliadb/reader/api/v1/services.py +18 -14
  113. nucliadb/reader/app.py +3 -1
  114. nucliadb/reader/reader/notifications.py +1 -2
  115. nucliadb/search/api/v1/__init__.py +3 -0
  116. nucliadb/search/api/v1/ask.py +3 -4
  117. nucliadb/search/api/v1/augment.py +585 -0
  118. nucliadb/search/api/v1/catalog.py +15 -19
  119. nucliadb/search/api/v1/find.py +16 -22
  120. nucliadb/search/api/v1/hydrate.py +328 -0
  121. nucliadb/search/api/v1/knowledgebox.py +1 -2
  122. nucliadb/search/api/v1/predict_proxy.py +1 -2
  123. nucliadb/search/api/v1/resource/ask.py +28 -8
  124. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  125. nucliadb/search/api/v1/resource/search.py +9 -11
  126. nucliadb/search/api/v1/retrieve.py +130 -0
  127. nucliadb/search/api/v1/search.py +28 -32
  128. nucliadb/search/api/v1/suggest.py +11 -14
  129. nucliadb/search/api/v1/summarize.py +1 -2
  130. nucliadb/search/api/v1/utils.py +2 -2
  131. nucliadb/search/app.py +3 -2
  132. nucliadb/search/augmentor/__init__.py +21 -0
  133. nucliadb/search/augmentor/augmentor.py +232 -0
  134. nucliadb/search/augmentor/fields.py +704 -0
  135. nucliadb/search/augmentor/metrics.py +24 -0
  136. nucliadb/search/augmentor/paragraphs.py +334 -0
  137. nucliadb/search/augmentor/resources.py +238 -0
  138. nucliadb/search/augmentor/utils.py +33 -0
  139. nucliadb/search/lifecycle.py +3 -1
  140. nucliadb/search/predict.py +33 -19
  141. nucliadb/search/predict_models.py +8 -9
  142. nucliadb/search/requesters/utils.py +11 -10
  143. nucliadb/search/search/cache.py +19 -42
  144. nucliadb/search/search/chat/ask.py +131 -59
  145. nucliadb/search/search/chat/exceptions.py +3 -5
  146. nucliadb/search/search/chat/fetcher.py +201 -0
  147. nucliadb/search/search/chat/images.py +6 -4
  148. nucliadb/search/search/chat/old_prompt.py +1375 -0
  149. nucliadb/search/search/chat/parser.py +510 -0
  150. nucliadb/search/search/chat/prompt.py +563 -615
  151. nucliadb/search/search/chat/query.py +453 -32
  152. nucliadb/search/search/chat/rpc.py +85 -0
  153. nucliadb/search/search/fetch.py +3 -4
  154. nucliadb/search/search/filters.py +8 -11
  155. nucliadb/search/search/find.py +33 -31
  156. nucliadb/search/search/find_merge.py +124 -331
  157. nucliadb/search/search/graph_strategy.py +14 -12
  158. nucliadb/search/search/hydrator/__init__.py +49 -0
  159. nucliadb/search/search/hydrator/fields.py +217 -0
  160. nucliadb/search/search/hydrator/images.py +130 -0
  161. nucliadb/search/search/hydrator/paragraphs.py +323 -0
  162. nucliadb/search/search/hydrator/resources.py +60 -0
  163. nucliadb/search/search/ingestion_agents.py +5 -5
  164. nucliadb/search/search/merge.py +90 -94
  165. nucliadb/search/search/metrics.py +24 -7
  166. nucliadb/search/search/paragraphs.py +7 -9
  167. nucliadb/search/search/predict_proxy.py +44 -18
  168. nucliadb/search/search/query.py +14 -86
  169. nucliadb/search/search/query_parser/fetcher.py +51 -82
  170. nucliadb/search/search/query_parser/models.py +19 -48
  171. nucliadb/search/search/query_parser/old_filters.py +20 -19
  172. nucliadb/search/search/query_parser/parsers/ask.py +5 -6
  173. nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
  174. nucliadb/search/search/query_parser/parsers/common.py +21 -13
  175. nucliadb/search/search/query_parser/parsers/find.py +6 -29
  176. nucliadb/search/search/query_parser/parsers/graph.py +18 -28
  177. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  178. nucliadb/search/search/query_parser/parsers/search.py +15 -56
  179. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  180. nucliadb/search/search/rank_fusion.py +18 -13
  181. nucliadb/search/search/rerankers.py +6 -7
  182. nucliadb/search/search/retrieval.py +300 -0
  183. nucliadb/search/search/summarize.py +5 -6
  184. nucliadb/search/search/utils.py +3 -4
  185. nucliadb/search/settings.py +1 -2
  186. nucliadb/standalone/api_router.py +1 -1
  187. nucliadb/standalone/app.py +4 -3
  188. nucliadb/standalone/auth.py +5 -6
  189. nucliadb/standalone/lifecycle.py +2 -2
  190. nucliadb/standalone/run.py +5 -4
  191. nucliadb/standalone/settings.py +5 -6
  192. nucliadb/standalone/versions.py +3 -4
  193. nucliadb/tasks/consumer.py +13 -8
  194. nucliadb/tasks/models.py +2 -1
  195. nucliadb/tasks/producer.py +3 -3
  196. nucliadb/tasks/retries.py +8 -7
  197. nucliadb/train/api/utils.py +1 -3
  198. nucliadb/train/api/v1/shards.py +1 -2
  199. nucliadb/train/api/v1/trainset.py +1 -2
  200. nucliadb/train/app.py +1 -1
  201. nucliadb/train/generator.py +4 -4
  202. nucliadb/train/generators/field_classifier.py +2 -2
  203. nucliadb/train/generators/field_streaming.py +6 -6
  204. nucliadb/train/generators/image_classifier.py +2 -2
  205. nucliadb/train/generators/paragraph_classifier.py +2 -2
  206. nucliadb/train/generators/paragraph_streaming.py +2 -2
  207. nucliadb/train/generators/question_answer_streaming.py +2 -2
  208. nucliadb/train/generators/sentence_classifier.py +4 -10
  209. nucliadb/train/generators/token_classifier.py +3 -2
  210. nucliadb/train/generators/utils.py +6 -5
  211. nucliadb/train/nodes.py +3 -3
  212. nucliadb/train/resource.py +6 -8
  213. nucliadb/train/settings.py +3 -4
  214. nucliadb/train/types.py +11 -11
  215. nucliadb/train/upload.py +3 -2
  216. nucliadb/train/uploader.py +1 -2
  217. nucliadb/train/utils.py +1 -2
  218. nucliadb/writer/api/v1/export_import.py +4 -1
  219. nucliadb/writer/api/v1/field.py +15 -14
  220. nucliadb/writer/api/v1/knowledgebox.py +18 -56
  221. nucliadb/writer/api/v1/learning_config.py +5 -4
  222. nucliadb/writer/api/v1/resource.py +9 -20
  223. nucliadb/writer/api/v1/services.py +10 -132
  224. nucliadb/writer/api/v1/upload.py +73 -72
  225. nucliadb/writer/app.py +8 -2
  226. nucliadb/writer/resource/basic.py +12 -15
  227. nucliadb/writer/resource/field.py +43 -5
  228. nucliadb/writer/resource/origin.py +7 -0
  229. nucliadb/writer/settings.py +2 -3
  230. nucliadb/writer/tus/__init__.py +2 -3
  231. nucliadb/writer/tus/azure.py +5 -7
  232. nucliadb/writer/tus/dm.py +3 -3
  233. nucliadb/writer/tus/exceptions.py +3 -4
  234. nucliadb/writer/tus/gcs.py +15 -22
  235. nucliadb/writer/tus/s3.py +2 -3
  236. nucliadb/writer/tus/storage.py +3 -3
  237. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
  238. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  239. nucliadb/common/datamanagers/entities.py +0 -139
  240. nucliadb/common/external_index_providers/pinecone.py +0 -894
  241. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  242. nucliadb/search/search/hydrator.py +0 -197
  243. nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
  244. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  245. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  246. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
@@ -21,116 +21,186 @@
21
21
  import logging
22
22
  import re
23
23
  from collections import defaultdict
24
- from typing import Any, Literal, Union, cast
24
+ from typing import Any, Literal, cast
25
25
 
26
26
  from psycopg import AsyncCursor, sql
27
27
  from psycopg.rows import DictRow, dict_row
28
-
29
- from nucliadb.common.maindb.pg import PGDriver
28
+ from typing_extensions import assert_never
29
+
30
+ from nucliadb.common.catalog.interface import (
31
+ Catalog,
32
+ CatalogExpression,
33
+ CatalogQuery,
34
+ CatalogResourceData,
35
+ )
36
+ from nucliadb.common.exceptions import InvalidQueryError
37
+ from nucliadb.common.maindb.driver import Transaction
38
+ from nucliadb.common.maindb.pg import PGDriver, PGTransaction
30
39
  from nucliadb.common.maindb.utils import get_driver
31
- from nucliadb.search.search.query_parser.models import CatalogExpression, CatalogQuery
32
40
  from nucliadb_models import search as search_models
33
- from nucliadb_models.labels import translate_system_to_alias_label
34
- from nucliadb_models.search import CatalogFacetsRequest, ResourceResult, Resources, SortField, SortOrder
41
+ from nucliadb_models.labels import translate_alias_to_system_label, translate_system_to_alias_label
42
+ from nucliadb_models.search import (
43
+ CatalogFacetsRequest,
44
+ ResourceResult,
45
+ Resources,
46
+ SortField,
47
+ SortOrder,
48
+ )
35
49
  from nucliadb_telemetry import metrics
36
50
 
37
- from .filters import translate_label
51
+ write_observer = metrics.Observer("pg_catalog_write", labels={"type": ""})
52
+ search_observer = metrics.Observer("pg_catalog_search", labels={"op": ""})
38
53
 
39
- observer = metrics.Observer("pg_catalog_search", labels={"op": ""})
40
54
  logger = logging.getLogger(__name__)
41
55
 
42
56
  SPLIT_REGEX = re.compile(r"\W")
43
57
 
44
58
 
45
- def _filter_operands(operands: list[CatalogExpression]) -> tuple[list[str], list[CatalogExpression]]:
46
- facets = []
47
- nonfacets = []
48
- for op in operands:
49
- if op.facet:
50
- facets.append(op.facet)
51
- else:
52
- nonfacets.append(op)
53
-
54
- return facets, nonfacets
59
+ def _pg_transaction(txn: Transaction) -> PGTransaction:
60
+ return cast(PGTransaction, txn)
55
61
 
56
62
 
57
- def _convert_filter(expr: CatalogExpression, filter_params: dict[str, Any]) -> sql.Composable:
58
- if expr.bool_and:
59
- return _convert_boolean_op(expr.bool_and, "and", filter_params)
60
- elif expr.bool_or:
61
- return _convert_boolean_op(expr.bool_or, "or", filter_params)
62
- elif expr.bool_not:
63
- return sql.SQL("(NOT {})").format(_convert_filter(expr.bool_not, filter_params))
64
- elif expr.date:
65
- return _convert_date_filter(expr.date, filter_params)
66
- elif expr.facet:
67
- param_name = f"param{len(filter_params)}"
68
- filter_params[param_name] = [expr.facet]
69
- if expr.facet == "/n/s/PROCESSED":
70
- # Optimization for the most common case, we know PROCESSED is a full label and can use the smaller labels index
71
- # This is needed because PROCESSED is present in most catalog entries and PG is unlikely to use any index
72
- # for it, falling back to executing the extract_facets function which can be slow
73
- return sql.SQL("labels @> {}").format(sql.Placeholder(param_name))
74
- else:
75
- return sql.SQL("extract_facets(labels) @> {}").format(sql.Placeholder(param_name))
76
- elif expr.resource_id:
77
- param_name = f"param{len(filter_params)}"
78
- filter_params[param_name] = [expr.resource_id]
79
- return sql.SQL("rid = {}").format(sql.Placeholder(param_name))
80
- else:
81
- return sql.SQL("")
63
+ def _pg_driver() -> PGDriver:
64
+ return cast(PGDriver, get_driver())
82
65
 
83
66
 
84
- def _convert_boolean_op(
85
- operands: list[CatalogExpression],
86
- op: Union[Literal["and"], Literal["or"]],
87
- filter_params: dict[str, Any],
88
- ) -> sql.Composable:
89
- array_op = sql.SQL("@>" if op == "and" else "&&")
90
- operands_sql: list[sql.Composable] = []
91
- facets, nonfacets = _filter_operands(operands)
92
- if facets:
93
- param_name = f"param{len(filter_params)}"
94
- if facets == ["/n/s/PROCESSED"]:
95
- # Optimization for the most common case, we know PROCESSED is a full label and can use the smaller labels index
96
- # This is needed because PROCESSED is present in most catalog entries and PG is unlikely to use any index
97
- # for it, falling back to executing the extract_facets function which can be slow
98
- operands_sql.append(sql.SQL("labels @> {}").format(sql.Placeholder(param_name)))
99
- else:
100
- operands_sql.append(
101
- sql.SQL("extract_facets(labels) {} {}").format(array_op, sql.Placeholder(param_name))
67
+ class PGCatalog(Catalog):
68
+ @write_observer.wrap({"type": "update"})
69
+ async def update(self, txn: Transaction, kbid: str, rid: str, data: CatalogResourceData):
70
+ async with _pg_transaction(txn).connection.cursor() as cur:
71
+ await cur.execute(
72
+ """
73
+ INSERT INTO catalog
74
+ (kbid, rid, title, created_at, modified_at, labels, slug)
75
+ VALUES
76
+ (%(kbid)s, %(rid)s, %(title)s, %(created_at)s, %(modified_at)s, %(labels)s, %(slug)s)
77
+ ON CONFLICT (kbid, rid) DO UPDATE SET
78
+ title = excluded.title,
79
+ created_at = excluded.created_at,
80
+ modified_at = excluded.modified_at,
81
+ labels = excluded.labels,
82
+ slug = excluded.slug""",
83
+ {
84
+ "kbid": kbid,
85
+ "rid": rid,
86
+ "title": data.title,
87
+ "created_at": data.created_at,
88
+ "modified_at": data.modified_at,
89
+ "labels": data.labels,
90
+ "slug": data.slug,
91
+ },
92
+ )
93
+ await cur.execute(
94
+ "DELETE FROM catalog_facets WHERE kbid = %(kbid)s AND rid = %(rid)s",
95
+ {
96
+ "kbid": kbid,
97
+ "rid": rid,
98
+ },
99
+ )
100
+ await cur.execute(
101
+ "INSERT INTO catalog_facets (kbid, rid, facet) SELECT %(kbid)s AS kbid, %(rid)s AS rid, unnest(%(facets)s::text[]) AS facet",
102
+ {
103
+ "kbid": kbid,
104
+ "rid": rid,
105
+ "facets": list(extract_facets(data.labels)),
106
+ },
102
107
  )
103
- filter_params[param_name] = facets
104
- for nonfacet in nonfacets:
105
- operands_sql.append(_convert_filter(nonfacet, filter_params))
106
- return sql.SQL("({})").format(sql.SQL(f" {op.upper()} ").join(operands_sql))
107
108
 
109
+ @write_observer.wrap({"type": "delete"})
110
+ async def delete(self, txn: Transaction, kbid: str, rid: str):
111
+ async with _pg_transaction(txn).connection.cursor() as cur:
112
+ await cur.execute(
113
+ "DELETE FROM catalog where kbid = %(kbid)s AND rid = %(rid)s", {"kbid": kbid, "rid": rid}
114
+ )
108
115
 
109
- def _convert_date_filter(date: CatalogExpression.Date, filter_params: dict[str, Any]) -> sql.Composable:
110
- if date.since and date.until:
111
- since_name = f"param{len(filter_params)}"
112
- filter_params[since_name] = date.since
113
- until_name = f"param{len(filter_params)}"
114
- filter_params[until_name] = date.until
115
- return sql.SQL("{field} BETWEEN {since} AND {until}").format(
116
- field=sql.Identifier(date.field),
117
- since=sql.Placeholder(since_name),
118
- until=sql.Placeholder(until_name),
119
- )
120
- elif date.since:
121
- since_name = f"param{len(filter_params)}"
122
- filter_params[since_name] = date.since
123
- return sql.SQL("{field} > {since}").format(
124
- field=sql.Identifier(date.field), since=sql.Placeholder(since_name)
125
- )
126
- elif date.until:
127
- until_name = f"param{len(filter_params)}"
128
- filter_params[until_name] = date.until
129
- return sql.SQL("{field} < {until}").format(
130
- field=sql.Identifier(date.field), until=sql.Placeholder(until_name)
116
+ @search_observer.wrap({"op": "search"})
117
+ async def search(self, catalog_query: CatalogQuery) -> Resources:
118
+ # Prepare SQL query
119
+ query, query_params = _prepare_query_filters(catalog_query)
120
+
121
+ async with _pg_driver()._get_connection() as conn, conn.cursor(row_factory=dict_row) as cur:
122
+ facets = {}
123
+
124
+ # Faceted search
125
+ if catalog_query.faceted:
126
+ with search_observer({"op": "facets"}):
127
+ tmp_facets: dict[str, dict[str, int]] = {
128
+ translate_label(f): defaultdict(int) for f in catalog_query.faceted
129
+ }
130
+
131
+ if catalog_query.filters is None:
132
+ await _faceted_search_unfiltered(cur, catalog_query, tmp_facets)
133
+ else:
134
+ await _faceted_search_filtered(
135
+ cur, catalog_query, tmp_facets, query, query_params
136
+ )
137
+
138
+ facets = {translate_system_to_alias_label(k): v for k, v in tmp_facets.items()}
139
+
140
+ # Totals
141
+ with search_observer({"op": "totals"}):
142
+ await cur.execute(
143
+ sql.SQL("SELECT COUNT(*) FROM ({}) fc").format(query),
144
+ query_params,
145
+ )
146
+ total = (await cur.fetchone())["count"] # type: ignore
147
+
148
+ # Query
149
+ with search_observer({"op": "query"}):
150
+ query, query_params = _prepare_query(catalog_query)
151
+ await cur.execute(query, query_params)
152
+ data = await cur.fetchall()
153
+
154
+ return Resources(
155
+ facets=facets,
156
+ results=[
157
+ ResourceResult(
158
+ rid=str(r["rid"]).replace("-", ""),
159
+ field="title",
160
+ field_type="a",
161
+ labels=[label for label in r["labels"] if label.startswith("/l/")],
162
+ score=0,
163
+ )
164
+ for r in data
165
+ ],
166
+ query=catalog_query.query.query if catalog_query.query else "",
167
+ total=total,
168
+ page_number=catalog_query.page_number,
169
+ page_size=catalog_query.page_size,
170
+ next_page=(catalog_query.page_size * catalog_query.page_number + len(data) < total),
171
+ min_score=0,
131
172
  )
132
- else:
133
- raise ValueError(f"Invalid date operator")
173
+
174
+ @search_observer.wrap({"op": "catalog_facets"})
175
+ async def facets(self, kbid: str, request: CatalogFacetsRequest) -> dict[str, int]:
176
+ async with _pg_driver()._get_connection() as conn, conn.cursor() as cur:
177
+ prefix_filters: list[sql.Composable] = []
178
+ prefix_params: dict[str, Any] = {}
179
+ for cnt, prefix in enumerate(request.prefixes):
180
+ prefix_sql = sql.SQL("facet LIKE {}").format(sql.Placeholder(f"prefix{cnt}"))
181
+ prefix_params[f"prefix{cnt}"] = f"{prefix.prefix}%"
182
+ if prefix.depth is not None:
183
+ prefix_parts = len(prefix.prefix.split("/"))
184
+ depth_sql = sql.SQL("SPLIT_PART(facet, '/', {}) = ''").format(
185
+ sql.Placeholder(f"depth{cnt}")
186
+ )
187
+ prefix_params[f"depth{cnt}"] = prefix_parts + prefix.depth + 1
188
+ prefix_sql = sql.SQL("({} AND {})").format(prefix_sql, depth_sql)
189
+ prefix_filters.append(prefix_sql)
190
+
191
+ filter_sql: sql.Composable
192
+ if prefix_filters:
193
+ filter_sql = sql.SQL("AND {}").format(sql.SQL(" OR ").join(prefix_filters))
194
+ else:
195
+ filter_sql = sql.SQL("")
196
+
197
+ await cur.execute(
198
+ sql.SQL(
199
+ "SELECT facet, COUNT(*) FROM catalog_facets WHERE kbid = %(kbid)s {} GROUP BY facet"
200
+ ).format(filter_sql),
201
+ {"kbid": kbid, **prefix_params},
202
+ )
203
+ return {k: v for k, v in await cur.fetchall()}
134
204
 
135
205
 
136
206
  def _prepare_query_filters(catalog_query: CatalogQuery) -> tuple[sql.Composable, dict[str, Any]]:
@@ -149,42 +219,16 @@ def _prepare_query_filters(catalog_query: CatalogQuery) -> tuple[sql.Composable,
149
219
  )
150
220
 
151
221
 
152
- def _prepare_query_search(query: search_models.CatalogQuery, params: dict[str, Any]) -> sql.Composable:
153
- if query.match == search_models.CatalogQueryMatch.Exact:
154
- params["query"] = query.query
155
- return sql.SQL("{} = %(query)s").format(sql.Identifier(query.field.value))
156
- elif query.match == search_models.CatalogQueryMatch.StartsWith:
157
- params["query"] = query.query + "%"
158
- if query.field == search_models.CatalogQueryField.Title:
159
- # Insensitive search supported by pg_trgm for title
160
- return sql.SQL("{} ILIKE %(query)s").format(sql.Identifier(query.field.value))
222
+ def _filter_operands(operands: list[CatalogExpression]) -> tuple[list[str], list[CatalogExpression]]:
223
+ facets = []
224
+ nonfacets = []
225
+ for op in operands:
226
+ if op.facet:
227
+ facets.append(op.facet)
161
228
  else:
162
- # Sensitive search for slug (btree does not support ILIKE and slugs are all lowercase anyway)
163
- return sql.SQL("{} LIKE %(query)s").format(sql.Identifier(query.field.value))
164
- # The rest of operators only supported by title
165
- elif query.match == search_models.CatalogQueryMatch.Words:
166
- # This is doing tokenization inside the SQL server (to keep the index updated). We could move it to
167
- # the python code at update/query time if it ever becomes a problem but for now, a single regex
168
- # executed per query is not a problem.
229
+ nonfacets.append(op)
169
230
 
170
- # Remove zero-length words from the split
171
- params["query"] = [word.lower() for word in SPLIT_REGEX.split(query.query) if word]
172
- return sql.SQL("regexp_split_to_array(lower(title), '\\W') @> %(query)s")
173
- elif query.match == search_models.CatalogQueryMatch.Fuzzy:
174
- params["query"] = query.query
175
- # Note: the operator is %>, We use %%> for psycopg escaping
176
- return sql.SQL("title %%> %(query)s")
177
- elif query.match == search_models.CatalogQueryMatch.EndsWith:
178
- params["query"] = "%" + query.query
179
- return sql.SQL("title ILIKE %(query)s")
180
- elif query.match == search_models.CatalogQueryMatch.Contains:
181
- params["query"] = "%" + query.query + "%"
182
- return sql.SQL("title ILIKE %(query)s")
183
- else: # pragma: nocover
184
- # This is a trick so mypy generates an error if this branch can be reached,
185
- # that is, if we are missing some ifs
186
- _a: int = "a"
187
- return sql.SQL("")
231
+ return facets, nonfacets
188
232
 
189
233
 
190
234
  def _prepare_query(catalog_query: CatalogQuery) -> tuple[sql.Composed, dict[str, Any]]:
@@ -219,98 +263,51 @@ def _prepare_query(catalog_query: CatalogQuery) -> tuple[sql.Composed, dict[str,
219
263
  return query, filter_params
220
264
 
221
265
 
222
- def _pg_driver() -> PGDriver:
223
- return cast(PGDriver, get_driver())
224
-
225
-
226
- @observer.wrap({"op": "search"})
227
- async def pgcatalog_search(catalog_query: CatalogQuery) -> Resources:
228
- # Prepare SQL query
229
- query, query_params = _prepare_query_filters(catalog_query)
230
-
231
- async with _pg_driver()._get_connection() as conn, conn.cursor(row_factory=dict_row) as cur:
232
- facets = {}
233
-
234
- # Faceted search
235
- if catalog_query.faceted:
236
- with observer({"op": "facets"}):
237
- tmp_facets: dict[str, dict[str, int]] = {
238
- translate_label(f): defaultdict(int) for f in catalog_query.faceted
239
- }
240
-
241
- if catalog_query.filters is None:
242
- await _faceted_search_unfiltered(cur, catalog_query, tmp_facets)
243
- else:
244
- await _faceted_search_filtered(cur, catalog_query, tmp_facets, query, query_params)
245
-
246
- facets = {translate_system_to_alias_label(k): v for k, v in tmp_facets.items()}
247
-
248
- # Totals
249
- with observer({"op": "totals"}):
250
- await cur.execute(
251
- sql.SQL("SELECT COUNT(*) FROM ({}) fc").format(query),
252
- query_params,
253
- )
254
- total = (await cur.fetchone())["count"] # type: ignore
255
-
256
- # Query
257
- with observer({"op": "query"}):
258
- query, query_params = _prepare_query(catalog_query)
259
- await cur.execute(query, query_params)
260
- data = await cur.fetchall()
261
-
262
- return Resources(
263
- facets=facets,
264
- results=[
265
- ResourceResult(
266
- rid=str(r["rid"]).replace("-", ""),
267
- field="title",
268
- field_type="a",
269
- labels=[label for label in r["labels"] if label.startswith("/l/")],
270
- score=0,
271
- )
272
- for r in data
273
- ],
274
- query=catalog_query.query.query if catalog_query.query else "",
275
- total=total,
276
- page_number=catalog_query.page_number,
277
- page_size=catalog_query.page_size,
278
- next_page=(catalog_query.page_size * catalog_query.page_number + len(data) < total),
279
- min_score=0,
280
- )
281
-
282
-
283
266
  async def _faceted_search_unfiltered(
284
267
  cur: AsyncCursor[DictRow], catalog_query: CatalogQuery, tmp_facets: dict[str, dict[str, int]]
285
268
  ):
286
269
  facet_params: dict[str, Any] = {}
287
270
  facet_sql: sql.Composable
288
- if len(tmp_facets) <= 5:
289
- # Asking for few facets, strictly filter to what we need in the query
290
- prefixes_sql = []
291
- for cnt, prefix in enumerate(tmp_facets.keys()):
292
- prefixes_sql.append(
293
- sql.SQL("(facet LIKE {} AND POSITION('/' IN RIGHT(facet, {})) = 0)").format(
294
- sql.Placeholder(f"facet_{cnt}"), sql.Placeholder(f"facet_len_{cnt}")
271
+ if list(tmp_facets.keys()) == ["/n/s"]:
272
+ # Special case when querying only for status. We know the list of possible facets and optimize
273
+ # by asking for each facet separately which makes better use of the index
274
+ sqls = []
275
+ for status in ["PENDING", "PROCESSED", "ERROR", "EMPTY"]:
276
+ sqls.append(
277
+ sql.SQL(
278
+ "SELECT facet, COUNT(*) FROM catalog_facets WHERE kbid = %(kbid)s AND facet = '/n/s/{}' GROUP BY facet".format(
279
+ status
280
+ )
295
281
  )
296
282
  )
297
- facet_params[f"facet_{cnt}"] = f"{prefix}/%"
298
- facet_params[f"facet_len_{cnt}"] = -(len(prefix) + 1)
299
- facet_sql = sql.SQL("AND {}").format(sql.SQL(" OR ").join(prefixes_sql))
300
- elif all((facet.startswith("/l") or facet.startswith("/n/i") for facet in tmp_facets.keys())):
301
- # Special case for the catalog query, which can have many facets asked for
302
- # Filter for the categories (icon and labels) in the query, filter the rest in the code below
303
- facet_sql = sql.SQL("AND (facet LIKE '/l/%%' OR facet like '/n/i/%%')")
283
+ await cur.execute(sql.SQL(" UNION ").join(sqls), {"kbid": catalog_query.kbid})
304
284
  else:
305
- # Worst case: ask for all facets and filter here. This is faster than applying lots of filters
306
- facet_sql = sql.SQL("")
285
+ if len(tmp_facets) <= 5:
286
+ # Asking for few facets, strictly filter to what we need in the query
287
+ prefixes_sql = []
288
+ for cnt, prefix in enumerate(tmp_facets.keys()):
289
+ prefixes_sql.append(
290
+ sql.SQL("(facet LIKE {} AND POSITION('/' IN RIGHT(facet, {})) = 0)").format(
291
+ sql.Placeholder(f"facet_{cnt}"), sql.Placeholder(f"facet_len_{cnt}")
292
+ )
293
+ )
294
+ facet_params[f"facet_{cnt}"] = f"{prefix}/%"
295
+ facet_params[f"facet_len_{cnt}"] = -(len(prefix) + 1)
296
+ facet_sql = sql.SQL("AND {}").format(sql.SQL(" OR ").join(prefixes_sql))
297
+ elif all(facet.startswith("/l") or facet.startswith("/n/i") for facet in tmp_facets.keys()):
298
+ # Special case for the catalog query, which can have many facets asked for
299
+ # Filter for the categories (icon and labels) in the query, filter the rest in the code below
300
+ facet_sql = sql.SQL("AND (facet LIKE '/l/%%' OR facet like '/n/i/%%')")
301
+ else:
302
+ # Worst case: ask for all facets and filter here. This is faster than applying lots of filters
303
+ facet_sql = sql.SQL("")
307
304
 
308
- await cur.execute(
309
- sql.SQL(
310
- "SELECT facet, COUNT(*) FROM catalog_facets WHERE kbid = %(kbid)s {} GROUP BY facet"
311
- ).format(facet_sql),
312
- {"kbid": catalog_query.kbid, **facet_params},
313
- )
305
+ await cur.execute(
306
+ sql.SQL(
307
+ "SELECT facet, COUNT(*) FROM catalog_facets WHERE kbid = %(kbid)s {} GROUP BY facet"
308
+ ).format(facet_sql),
309
+ {"kbid": catalog_query.kbid, **facet_params},
310
+ )
314
311
 
315
312
  # Only keep the facets we asked for
316
313
  for row in await cur.fetchall():
@@ -360,33 +357,134 @@ async def _faceted_search_filtered(
360
357
  tmp_facets[grandparent][translate_system_to_alias_label(parent)] += count
361
358
 
362
359
 
363
- @observer.wrap({"op": "catalog_facets"})
364
- async def pgcatalog_facets(kbid: str, request: CatalogFacetsRequest) -> dict[str, int]:
365
- async with _pg_driver()._get_connection() as conn, conn.cursor() as cur:
366
- prefix_filters: list[sql.Composable] = []
367
- prefix_params: dict[str, Any] = {}
368
- for cnt, prefix in enumerate(request.prefixes):
369
- prefix_sql = sql.SQL("facet LIKE {}").format(sql.Placeholder(f"prefix{cnt}"))
370
- prefix_params[f"prefix{cnt}"] = f"{prefix.prefix}%"
371
- if prefix.depth is not None:
372
- prefix_parts = len(prefix.prefix.split("/"))
373
- depth_sql = sql.SQL("SPLIT_PART(facet, '/', {}) = ''").format(
374
- sql.Placeholder(f"depth{cnt}")
375
- )
376
- prefix_params[f"depth{cnt}"] = prefix_parts + prefix.depth + 1
377
- prefix_sql = sql.SQL("({} AND {})").format(prefix_sql, depth_sql)
378
- prefix_filters.append(prefix_sql)
360
def _prepare_query_search(query: search_models.CatalogQuery, params: dict[str, Any]) -> sql.Composable:
    """Build the SQL condition for the text part of a catalog query.

    The value to compare against is stored in ``params["query"]`` (mutating
    ``params``) and referenced from the returned SQL as ``%(query)s``.

    ``Exact`` and ``StartsWith`` compare against the column named by
    ``query.field``; every other match mode always targets ``title``.
    """
    mode = query.match
    kinds = search_models.CatalogQueryMatch

    if mode == kinds.Exact:
        params["query"] = query.query
        return sql.SQL("{} = %(query)s").format(sql.Identifier(query.field.value))

    if mode == kinds.StartsWith:
        params["query"] = query.query + "%"
        on_title = query.field == search_models.CatalogQueryField.Title
        # Title: case-insensitive search, supported by pg_trgm.
        # Slug: case-sensitive search (btree does not support ILIKE and slugs
        # are all lowercase anyway).
        template = sql.SQL("{} ILIKE %(query)s") if on_title else sql.SQL("{} LIKE %(query)s")
        return template.format(sql.Identifier(query.field.value))

    # The remaining operators are only supported on the title.
    if mode == kinds.Words:
        # Tokenization of the title happens inside the SQL server (to keep the
        # index updated). It could be moved to the python code at update/query
        # time if it ever becomes a problem, but for now a single regex
        # executed per query is not a problem.
        #
        # Zero-length words produced by the split are dropped.
        tokens = filter(None, SPLIT_REGEX.split(query.query))
        params["query"] = [token.lower() for token in tokens]
        return sql.SQL("regexp_split_to_array(lower(title), '\\W') @> %(query)s")

    if mode == kinds.Fuzzy:
        params["query"] = query.query
        # Note: the operator is %>, written as %%> for psycopg escaping.
        return sql.SQL("title %%> %(query)s")

    if mode == kinds.EndsWith:
        params["query"] = "%" + query.query
        return sql.SQL("title ILIKE %(query)s")

    if mode == kinds.Contains:
        params["query"] = "%" + query.query + "%"
        return sql.SQL("title ILIKE %(query)s")

    assert_never(mode)  # pragma: no cover
379
393
 
380
- filter_sql: sql.Composable
381
- if prefix_filters:
382
- filter_sql = sql.SQL("AND {}").format(sql.SQL(" OR ").join(prefix_filters))
394
+
395
def _convert_filter(expr: CatalogExpression, filter_params: dict[str, Any]) -> sql.Composable:
    """Translate a catalog filter expression into a SQL condition.

    The first populated attribute of ``expr`` is used, checked in this order:
    ``bool_and``, ``bool_or``, ``bool_not``, ``date``, ``facet``,
    ``resource_id``. Values are never inlined: each one is stored in
    ``filter_params`` under a fresh ``param<N>`` key (``N`` being the current
    size of the dict) and referenced through a named placeholder.

    An expression with none of those attributes set yields an empty SQL
    fragment.
    """
    if expr.bool_and:
        return _convert_boolean_op(expr.bool_and, "and", filter_params)

    if expr.bool_or:
        return _convert_boolean_op(expr.bool_or, "or", filter_params)

    if expr.bool_not:
        negated = _convert_filter(expr.bool_not, filter_params)
        return sql.SQL("(NOT {})").format(negated)

    if expr.date:
        return _convert_date_filter(expr.date, filter_params)

    if expr.facet:
        key = f"param{len(filter_params)}"
        filter_params[key] = [expr.facet]
        # Optimization for the most common case: PROCESSED is known to be a
        # full label, so the smaller labels index can be used. This is needed
        # because PROCESSED is present in most catalog entries and PG is
        # unlikely to use any index for it, falling back to executing the
        # extract_facets function, which can be slow.
        condition = (
            sql.SQL("labels @> {}")
            if expr.facet == "/n/s/PROCESSED"
            else sql.SQL("extract_facets(labels) @> {}")
        )
        return condition.format(sql.Placeholder(key))

    if expr.resource_id:
        key = f"param{len(filter_params)}"
        # The id is bound wrapped in a single-element list.
        filter_params[key] = [expr.resource_id]
        return sql.SQL("rid = {}").format(sql.Placeholder(key))

    return sql.SQL("")
385
420
 
386
- await cur.execute(
387
- sql.SQL(
388
- "SELECT facet, COUNT(*) FROM catalog_facets WHERE kbid = %(kbid)s {} GROUP BY facet"
389
- ).format(filter_sql),
390
- {"kbid": kbid, **prefix_params},
421
+
422
def _convert_boolean_op(
    operands: list[CatalogExpression],
    op: Literal["and"] | Literal["or"],
    filter_params: dict[str, Any],
) -> sql.Composable:
    """Translate an AND/OR over several filter expressions into SQL.

    ``_filter_operands`` separates the operands into ``facets`` and
    ``nonfacets``. All facets are checked with a single array comparison
    against one bound parameter (``@>`` for "and", ``&&`` for "or"); each
    non-facet operand is converted on its own with ``_convert_filter``. The
    resulting conditions are joined with the upper-cased operator and wrapped
    in parentheses.

    New values are added to ``filter_params`` under ``param<N>`` keys.
    """
    facets, nonfacets = _filter_operands(operands)
    clauses: list[sql.Composable] = []

    if facets:
        key = f"param{len(filter_params)}"
        placeholder = sql.Placeholder(key)
        if facets == ["/n/s/PROCESSED"]:
            # Optimization for the most common case: PROCESSED is known to be
            # a full label, so the smaller labels index can be used. This is
            # needed because PROCESSED is present in most catalog entries and
            # PG is unlikely to use any index for it, falling back to
            # executing the extract_facets function, which can be slow.
            clauses.append(sql.SQL("labels @> {}").format(placeholder))
        else:
            array_operator = sql.SQL("@>" if op == "and" else "&&")
            clauses.append(
                sql.SQL("extract_facets(labels) {} {}").format(array_operator, placeholder)
            )
        # Register the facets before converting the remaining operands so the
        # parameter names they generate do not collide with this one.
        filter_params[key] = facets

    clauses.extend(_convert_filter(operand, filter_params) for operand in nonfacets)

    joiner = sql.SQL(f" {op.upper()} ")
    return sql.SQL("({})").format(joiner.join(clauses))
445
+
446
+
447
def _convert_date_filter(date: CatalogExpression.Date, filter_params: dict[str, Any]) -> sql.Composable:
    """Translate a date range filter into a SQL condition on ``date.field``.

    With both bounds set the condition uses ``BETWEEN``; with only one bound
    it uses a strict ``>`` (since) or ``<`` (until) comparison. Bound values
    are stored in ``filter_params`` under fresh ``param<N>`` keys.

    Raises:
        ValueError: if neither ``since`` nor ``until`` is set.
    """

    def bind(value: Any) -> sql.Placeholder:
        # Store the value under the next free parameter name and hand back
        # the placeholder that references it.
        key = f"param{len(filter_params)}"
        filter_params[key] = value
        return sql.Placeholder(key)

    if not date.since and not date.until:
        raise ValueError("Invalid date operator")

    if date.since and date.until:
        lower = bind(date.since)
        upper = bind(date.until)
        return sql.SQL("{field} BETWEEN {since} AND {until}").format(
            field=sql.Identifier(date.field),
            since=lower,
            until=upper,
        )

    if date.since:
        lower = bind(date.since)
        return sql.SQL("{field} > {since}").format(field=sql.Identifier(date.field), since=lower)

    upper = bind(date.until)
    return sql.SQL("{field} < {until}").format(field=sql.Identifier(date.field), until=upper)
472
+
473
+
474
def translate_label(literal: str) -> str:
    """Validate a label given in a filter and translate it to its system form.

    Raises:
        InvalidQueryError: if the label is empty or does not start with ``/``.
    """
    if not literal:
        raise InvalidQueryError("filters", "Invalid empty label")
    if not literal.startswith("/"):
        raise InvalidQueryError("filters", f"Invalid label. It must start with a `/`: {literal}")
    return translate_alias_to_system_label(literal)
480
+
481
+
482
def extract_facets(labels: list[str]) -> set[str]:
    """Return every ancestor path (and the full path) of the given labels.

    Each label is split on ``/`` and the text before the first slash is
    discarded. Every prefix of the remaining segments is emitted as a
    ``/``-joined path, e.g. ``/a/b/c`` yields ``/a``, ``/a/b`` and ``/a/b/c``.
    """
    facets: set[str] = set()
    for label in labels:
        segments = label.split("/")[1:]
        facets.update(
            "/" + "/".join(segments[:depth]) for depth in range(1, len(segments) + 1)
        )
    return facets