nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231) hide show
  1. migrations/0023_backfill_pg_catalog.py +2 -2
  2. migrations/0029_backfill_field_status.py +3 -4
  3. migrations/0032_remove_old_relations.py +2 -3
  4. migrations/0038_backfill_catalog_field_labels.py +2 -2
  5. migrations/0039_backfill_converation_splits_metadata.py +2 -2
  6. migrations/0041_reindex_conversations.py +137 -0
  7. migrations/pg/0010_shards_index.py +34 -0
  8. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  9. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  10. nucliadb/backups/create.py +2 -15
  11. nucliadb/backups/restore.py +4 -15
  12. nucliadb/backups/tasks.py +4 -1
  13. nucliadb/common/back_pressure/cache.py +2 -3
  14. nucliadb/common/back_pressure/materializer.py +7 -13
  15. nucliadb/common/back_pressure/settings.py +6 -6
  16. nucliadb/common/back_pressure/utils.py +1 -0
  17. nucliadb/common/cache.py +9 -9
  18. nucliadb/common/catalog/interface.py +12 -12
  19. nucliadb/common/catalog/pg.py +41 -29
  20. nucliadb/common/catalog/utils.py +3 -3
  21. nucliadb/common/cluster/manager.py +5 -4
  22. nucliadb/common/cluster/rebalance.py +483 -114
  23. nucliadb/common/cluster/rollover.py +25 -9
  24. nucliadb/common/cluster/settings.py +3 -8
  25. nucliadb/common/cluster/utils.py +34 -8
  26. nucliadb/common/context/__init__.py +7 -8
  27. nucliadb/common/context/fastapi.py +1 -2
  28. nucliadb/common/datamanagers/__init__.py +2 -4
  29. nucliadb/common/datamanagers/atomic.py +4 -2
  30. nucliadb/common/datamanagers/cluster.py +1 -2
  31. nucliadb/common/datamanagers/fields.py +3 -4
  32. nucliadb/common/datamanagers/kb.py +6 -6
  33. nucliadb/common/datamanagers/labels.py +2 -3
  34. nucliadb/common/datamanagers/resources.py +10 -33
  35. nucliadb/common/datamanagers/rollover.py +5 -7
  36. nucliadb/common/datamanagers/search_configurations.py +1 -2
  37. nucliadb/common/datamanagers/synonyms.py +1 -2
  38. nucliadb/common/datamanagers/utils.py +4 -4
  39. nucliadb/common/datamanagers/vectorsets.py +4 -4
  40. nucliadb/common/external_index_providers/base.py +32 -5
  41. nucliadb/common/external_index_providers/manager.py +4 -5
  42. nucliadb/common/filter_expression.py +128 -40
  43. nucliadb/common/http_clients/processing.py +12 -23
  44. nucliadb/common/ids.py +6 -4
  45. nucliadb/common/locking.py +1 -2
  46. nucliadb/common/maindb/driver.py +9 -8
  47. nucliadb/common/maindb/local.py +5 -5
  48. nucliadb/common/maindb/pg.py +9 -8
  49. nucliadb/common/nidx.py +3 -4
  50. nucliadb/export_import/datamanager.py +4 -3
  51. nucliadb/export_import/exporter.py +11 -19
  52. nucliadb/export_import/importer.py +13 -6
  53. nucliadb/export_import/tasks.py +2 -0
  54. nucliadb/export_import/utils.py +6 -18
  55. nucliadb/health.py +2 -2
  56. nucliadb/ingest/app.py +8 -8
  57. nucliadb/ingest/consumer/consumer.py +8 -10
  58. nucliadb/ingest/consumer/pull.py +3 -8
  59. nucliadb/ingest/consumer/service.py +3 -3
  60. nucliadb/ingest/consumer/utils.py +1 -1
  61. nucliadb/ingest/fields/base.py +28 -49
  62. nucliadb/ingest/fields/conversation.py +12 -12
  63. nucliadb/ingest/fields/exceptions.py +1 -2
  64. nucliadb/ingest/fields/file.py +22 -8
  65. nucliadb/ingest/fields/link.py +7 -7
  66. nucliadb/ingest/fields/text.py +2 -3
  67. nucliadb/ingest/orm/brain_v2.py +78 -64
  68. nucliadb/ingest/orm/broker_message.py +2 -4
  69. nucliadb/ingest/orm/entities.py +10 -209
  70. nucliadb/ingest/orm/index_message.py +4 -4
  71. nucliadb/ingest/orm/knowledgebox.py +18 -27
  72. nucliadb/ingest/orm/processor/auditing.py +1 -3
  73. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  74. nucliadb/ingest/orm/processor/processor.py +27 -27
  75. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  76. nucliadb/ingest/orm/resource.py +72 -70
  77. nucliadb/ingest/orm/utils.py +1 -1
  78. nucliadb/ingest/processing.py +17 -17
  79. nucliadb/ingest/serialize.py +202 -145
  80. nucliadb/ingest/service/writer.py +3 -109
  81. nucliadb/ingest/settings.py +3 -4
  82. nucliadb/ingest/utils.py +1 -2
  83. nucliadb/learning_proxy.py +11 -11
  84. nucliadb/metrics_exporter.py +5 -4
  85. nucliadb/middleware/__init__.py +82 -1
  86. nucliadb/migrator/datamanager.py +3 -4
  87. nucliadb/migrator/migrator.py +1 -2
  88. nucliadb/migrator/models.py +1 -2
  89. nucliadb/migrator/settings.py +1 -2
  90. nucliadb/models/internal/augment.py +614 -0
  91. nucliadb/models/internal/processing.py +19 -19
  92. nucliadb/openapi.py +2 -2
  93. nucliadb/purge/__init__.py +3 -8
  94. nucliadb/purge/orphan_shards.py +1 -2
  95. nucliadb/reader/__init__.py +5 -0
  96. nucliadb/reader/api/models.py +6 -13
  97. nucliadb/reader/api/v1/download.py +59 -38
  98. nucliadb/reader/api/v1/export_import.py +4 -4
  99. nucliadb/reader/api/v1/learning_config.py +24 -4
  100. nucliadb/reader/api/v1/resource.py +61 -9
  101. nucliadb/reader/api/v1/services.py +18 -14
  102. nucliadb/reader/app.py +3 -1
  103. nucliadb/reader/reader/notifications.py +1 -2
  104. nucliadb/search/api/v1/__init__.py +2 -0
  105. nucliadb/search/api/v1/ask.py +3 -4
  106. nucliadb/search/api/v1/augment.py +585 -0
  107. nucliadb/search/api/v1/catalog.py +11 -15
  108. nucliadb/search/api/v1/find.py +16 -22
  109. nucliadb/search/api/v1/hydrate.py +25 -25
  110. nucliadb/search/api/v1/knowledgebox.py +1 -2
  111. nucliadb/search/api/v1/predict_proxy.py +1 -2
  112. nucliadb/search/api/v1/resource/ask.py +7 -7
  113. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  114. nucliadb/search/api/v1/resource/search.py +9 -11
  115. nucliadb/search/api/v1/retrieve.py +130 -0
  116. nucliadb/search/api/v1/search.py +28 -32
  117. nucliadb/search/api/v1/suggest.py +11 -14
  118. nucliadb/search/api/v1/summarize.py +1 -2
  119. nucliadb/search/api/v1/utils.py +2 -2
  120. nucliadb/search/app.py +3 -2
  121. nucliadb/search/augmentor/__init__.py +21 -0
  122. nucliadb/search/augmentor/augmentor.py +232 -0
  123. nucliadb/search/augmentor/fields.py +704 -0
  124. nucliadb/search/augmentor/metrics.py +24 -0
  125. nucliadb/search/augmentor/paragraphs.py +334 -0
  126. nucliadb/search/augmentor/resources.py +238 -0
  127. nucliadb/search/augmentor/utils.py +33 -0
  128. nucliadb/search/lifecycle.py +3 -1
  129. nucliadb/search/predict.py +24 -17
  130. nucliadb/search/predict_models.py +8 -9
  131. nucliadb/search/requesters/utils.py +11 -10
  132. nucliadb/search/search/cache.py +19 -23
  133. nucliadb/search/search/chat/ask.py +88 -59
  134. nucliadb/search/search/chat/exceptions.py +3 -5
  135. nucliadb/search/search/chat/fetcher.py +201 -0
  136. nucliadb/search/search/chat/images.py +6 -4
  137. nucliadb/search/search/chat/old_prompt.py +1375 -0
  138. nucliadb/search/search/chat/parser.py +510 -0
  139. nucliadb/search/search/chat/prompt.py +563 -615
  140. nucliadb/search/search/chat/query.py +449 -36
  141. nucliadb/search/search/chat/rpc.py +85 -0
  142. nucliadb/search/search/fetch.py +3 -4
  143. nucliadb/search/search/filters.py +8 -11
  144. nucliadb/search/search/find.py +33 -31
  145. nucliadb/search/search/find_merge.py +124 -331
  146. nucliadb/search/search/graph_strategy.py +14 -12
  147. nucliadb/search/search/hydrator/__init__.py +3 -152
  148. nucliadb/search/search/hydrator/fields.py +92 -50
  149. nucliadb/search/search/hydrator/images.py +7 -7
  150. nucliadb/search/search/hydrator/paragraphs.py +42 -26
  151. nucliadb/search/search/hydrator/resources.py +20 -16
  152. nucliadb/search/search/ingestion_agents.py +5 -5
  153. nucliadb/search/search/merge.py +90 -94
  154. nucliadb/search/search/metrics.py +10 -9
  155. nucliadb/search/search/paragraphs.py +7 -9
  156. nucliadb/search/search/predict_proxy.py +13 -9
  157. nucliadb/search/search/query.py +14 -86
  158. nucliadb/search/search/query_parser/fetcher.py +51 -82
  159. nucliadb/search/search/query_parser/models.py +19 -20
  160. nucliadb/search/search/query_parser/old_filters.py +20 -19
  161. nucliadb/search/search/query_parser/parsers/ask.py +4 -5
  162. nucliadb/search/search/query_parser/parsers/catalog.py +5 -6
  163. nucliadb/search/search/query_parser/parsers/common.py +5 -6
  164. nucliadb/search/search/query_parser/parsers/find.py +6 -26
  165. nucliadb/search/search/query_parser/parsers/graph.py +13 -23
  166. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  167. nucliadb/search/search/query_parser/parsers/search.py +15 -53
  168. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  169. nucliadb/search/search/rank_fusion.py +18 -13
  170. nucliadb/search/search/rerankers.py +5 -6
  171. nucliadb/search/search/retrieval.py +300 -0
  172. nucliadb/search/search/summarize.py +5 -6
  173. nucliadb/search/search/utils.py +3 -4
  174. nucliadb/search/settings.py +1 -2
  175. nucliadb/standalone/api_router.py +1 -1
  176. nucliadb/standalone/app.py +4 -3
  177. nucliadb/standalone/auth.py +5 -6
  178. nucliadb/standalone/lifecycle.py +2 -2
  179. nucliadb/standalone/run.py +2 -4
  180. nucliadb/standalone/settings.py +5 -6
  181. nucliadb/standalone/versions.py +3 -4
  182. nucliadb/tasks/consumer.py +13 -8
  183. nucliadb/tasks/models.py +2 -1
  184. nucliadb/tasks/producer.py +3 -3
  185. nucliadb/tasks/retries.py +8 -7
  186. nucliadb/train/api/utils.py +1 -3
  187. nucliadb/train/api/v1/shards.py +1 -2
  188. nucliadb/train/api/v1/trainset.py +1 -2
  189. nucliadb/train/app.py +1 -1
  190. nucliadb/train/generator.py +4 -4
  191. nucliadb/train/generators/field_classifier.py +2 -2
  192. nucliadb/train/generators/field_streaming.py +6 -6
  193. nucliadb/train/generators/image_classifier.py +2 -2
  194. nucliadb/train/generators/paragraph_classifier.py +2 -2
  195. nucliadb/train/generators/paragraph_streaming.py +2 -2
  196. nucliadb/train/generators/question_answer_streaming.py +2 -2
  197. nucliadb/train/generators/sentence_classifier.py +2 -2
  198. nucliadb/train/generators/token_classifier.py +3 -2
  199. nucliadb/train/generators/utils.py +6 -5
  200. nucliadb/train/nodes.py +3 -3
  201. nucliadb/train/resource.py +6 -8
  202. nucliadb/train/settings.py +3 -4
  203. nucliadb/train/types.py +11 -11
  204. nucliadb/train/upload.py +3 -2
  205. nucliadb/train/uploader.py +1 -2
  206. nucliadb/train/utils.py +1 -2
  207. nucliadb/writer/api/v1/export_import.py +4 -1
  208. nucliadb/writer/api/v1/field.py +7 -11
  209. nucliadb/writer/api/v1/knowledgebox.py +3 -4
  210. nucliadb/writer/api/v1/resource.py +9 -20
  211. nucliadb/writer/api/v1/services.py +10 -132
  212. nucliadb/writer/api/v1/upload.py +73 -72
  213. nucliadb/writer/app.py +8 -2
  214. nucliadb/writer/resource/basic.py +12 -15
  215. nucliadb/writer/resource/field.py +7 -5
  216. nucliadb/writer/resource/origin.py +7 -0
  217. nucliadb/writer/settings.py +2 -3
  218. nucliadb/writer/tus/__init__.py +2 -3
  219. nucliadb/writer/tus/azure.py +1 -3
  220. nucliadb/writer/tus/dm.py +3 -3
  221. nucliadb/writer/tus/exceptions.py +3 -4
  222. nucliadb/writer/tus/gcs.py +5 -6
  223. nucliadb/writer/tus/s3.py +2 -3
  224. nucliadb/writer/tus/storage.py +3 -3
  225. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +9 -10
  226. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  227. nucliadb/common/datamanagers/entities.py +0 -139
  228. nucliadb-6.9.1.post5192.dist-info/RECORD +0 -392
  229. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  230. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  231. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,510 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+
21
+ import logging
22
+
23
+ from pydantic import ValidationError
24
+
25
+ from nucliadb.common.exceptions import InvalidQueryError
26
+ from nucliadb.common.filter_expression import filter_from_facet
27
+ from nucliadb.common.models_utils.from_proto import RelationNodeTypeMap
28
+ from nucliadb.search.search.chat.fetcher import RAOFetcher
29
+ from nucliadb.search.search.query_parser.exceptions import InternalParserError
30
+ from nucliadb.search.search.query_parser.models import (
31
+ RelationQuery,
32
+ )
33
+ from nucliadb.search.search.query_parser.old_filters import is_paragraph_label, translate_label
34
+ from nucliadb.search.search.query_parser.parsers.common import (
35
+ parse_keyword_min_score,
36
+ should_disable_vector_search,
37
+ )
38
+ from nucliadb.search.search.rerankers import NoopReranker, PredictReranker, Reranker
39
+ from nucliadb_models import retrieval as retrieval_models
40
+ from nucliadb_models import search as search_models
41
+ from nucliadb_models.common import FieldTypeName
42
+ from nucliadb_models.filters import (
43
+ And,
44
+ DateCreated,
45
+ DateModified,
46
+ Field,
47
+ FieldFilterExpression,
48
+ FilterExpression,
49
+ Keyword,
50
+ Not,
51
+ Or,
52
+ ParagraphFilterExpression,
53
+ Resource,
54
+ )
55
+ from nucliadb_models.retrieval import RetrievalRequest
56
+ from nucliadb_models.search import Filter, FindRequest
57
+ from nucliadb_protos import knowledgebox_pb2, utils_pb2
58
+
59
# Module-level logger used for query-parsing warnings.
logger = logging.getLogger(__name__)

# Fallback semantic similarity threshold used when neither the request nor the
# Predict API provides one (see parse_semantic_min_score below).
DEFAULT_GENERIC_SEMANTIC_THRESHOLD = 0.7
62
+
63
+
64
async def rao_parse_find(
    kbid: str, find_request: FindRequest
) -> tuple[RAOFetcher, RetrievalRequest, Reranker]:
    """Convert a legacy ``FindRequest`` into a ``RetrievalRequest``.

    Thin bw/c layer needed while refactoring and decoupling code: builds the
    fetcher that lazily resolves query data (embeddings, entities, labels...),
    parses the request and returns the (fetcher, retrieval, reranker) triple.
    """
    rao_fetcher = RAOFetcher(
        kbid,
        query=find_request.query,
        user_vector=find_request.vector,
        vectorset=find_request.vectorset,
        rephrase=find_request.rephrase,
        rephrase_prompt=find_request.rephrase_prompt,
        generative_model=find_request.generative_model,
        query_image=find_request.query_image,
    )
    request_parser = RAOFindParser(kbid, find_request, rao_fetcher)
    retrieval, reranker = await request_parser.parse()
    return rao_fetcher, retrieval, reranker
83
+
84
+
85
class RAOFindParser:
    """Parse a legacy ``FindRequest`` into a ``RetrievalRequest`` + reranker.

    Backwards-compatibility layer: /find keeps its old request shape
    (features, old-style filters, reranker names...) and this class converts
    it into the model consumed by /retrieve. Data requiring external calls is
    resolved lazily through the provided ``RAOFetcher``.
    """

    def __init__(self, kbid: str, item: FindRequest, fetcher: RAOFetcher):
        self.kbid = kbid
        self.item = item
        self.fetcher = fetcher

        # cached data while parsing (filters parsing requires the query to
        # have been built first, see _parse_filters)
        self._query: retrieval_models.Query | None = None

    async def parse(self) -> tuple[RetrievalRequest, Reranker]:
        """Validate the request and build the retrieval request and reranker.

        Raises ``InvalidQueryError`` on invalid feature/filter combinations
        and ``InternalParserError`` if the reranker spec can't be parsed.
        """
        self._validate_request()

        top_k = self.item.top_k

        # parse search types (features)

        self._query = retrieval_models.Query()

        if search_models.FindOptions.KEYWORD in self.item.features:
            self._query.keyword = await parse_keyword_query(self.item, fetcher=self.fetcher)  # type: ignore

        if search_models.FindOptions.SEMANTIC in self.item.features:
            self._query.semantic = await parse_semantic_query(self.item, fetcher=self.fetcher)  # type: ignore

        if search_models.FindOptions.RELATIONS in self.item.features:
            # skip, we'll do something about this later on
            pass

        if search_models.FindOptions.GRAPH in self.item.features:
            self._query.graph = await self._parse_graph_query()

        filters = await self._parse_filters()

        # rank fusion is just forwarded to /retrieve
        rank_fusion = self.item.rank_fusion

        try:
            reranker = self._parse_reranker()
        except ValidationError as exc:
            raise InternalParserError(f"Parsing error in reranker: {exc!s}") from exc

        # As we'll call /retrieve, that has rank fusion integrated, we have to
        # make sure we ask for enough results to rerank.
        if isinstance(reranker, PredictReranker):
            top_k = max(top_k, reranker.window)

        retrieval = RetrievalRequest(
            query=self._query,
            top_k=top_k,
            filters=filters,
            rank_fusion=rank_fusion,
        )
        return retrieval, reranker

    def _validate_request(self) -> None:
        """Reject unsupported feature combinations before parsing.

        May mutate the request: the SEMANTIC feature is silently dropped when
        vector search should be disabled (e.g. empty query, see
        should_disable_vector_search).
        """
        # synonyms are not compatible with vector/graph search
        if (
            self.item.with_synonyms
            and self.item.query
            and (
                search_models.FindOptions.SEMANTIC in self.item.features
                or search_models.FindOptions.RELATIONS in self.item.features
                or search_models.FindOptions.GRAPH in self.item.features
            )
        ):
            raise InvalidQueryError(
                "synonyms",
                "Search with custom synonyms is only supported on paragraph and document search",
            )

        if search_models.FindOptions.SEMANTIC in self.item.features:
            if should_disable_vector_search(self.item):
                self.item.features.remove(search_models.FindOptions.SEMANTIC)

        if self.item.graph_query and search_models.FindOptions.GRAPH not in self.item.features:
            raise InvalidQueryError("graph_query", "Using a graph query requires enabling graph feature")

    async def _parse_relation_query(self) -> RelationQuery:
        # NOTE(review): currently unused within this class -- parse() skips
        # the RELATIONS feature. Presumably kept for upcoming relations
        # support; confirm before removing.
        detected_entities = await self._get_detected_entities()

        return RelationQuery(
            entry_points=detected_entities, deleted_entity_groups=[], deleted_entities={}
        )

    async def _parse_graph_query(self) -> retrieval_models.GraphQuery:
        """Wrap the user-provided graph query; it must be set at this point."""
        if self.item.graph_query is None:
            raise InvalidQueryError(
                "graph_query", "Graph query must be provided when using graph search"
            )
        return retrieval_models.GraphQuery(query=self.item.graph_query)

    async def _get_detected_entities(self) -> list[utils_pb2.RelationNode]:
        """Get entities from request, either automatically detected or
        explicitly set by the user."""

        if self.item.query_entities:
            # user explicitly provided the entry-point entities
            detected_entities = []
            for entity in self.item.query_entities:
                relation_node = utils_pb2.RelationNode()
                relation_node.value = entity.name
                if entity.type is not None:
                    relation_node.ntype = RelationNodeTypeMap[entity.type]
                if entity.subtype is not None:
                    relation_node.subtype = entity.subtype
                detected_entities.append(relation_node)
        else:
            # fall back to entities detected by the fetcher (Predict API)
            detected_entities = await self.fetcher.get_detected_entities()

        return detected_entities

    async def _parse_filters(self) -> retrieval_models.Filters:
        """Build the retrieval filters from either the new filter_expression
        or the deprecated individual filter fields (mutually exclusive).

        Old filters are ANDed together into a single FilterExpression.
        """
        assert self._query is not None, "query must be parsed before filters"

        # this is a conversion between /find filters to /retrieve filters. As
        # /find keeps maintaining old filter style, we must convert from one to
        # another

        has_old_filters = (
            len(self.item.filters) > 0
            or len(self.item.resource_filters) > 0
            or len(self.item.fields) > 0
            or len(self.item.keyword_filters) > 0
            or self.item.range_creation_start is not None
            or self.item.range_creation_end is not None
            or self.item.range_modification_start is not None
            or self.item.range_modification_end is not None
        )
        if self.item.filter_expression is not None and has_old_filters:
            raise InvalidQueryError("filter_expression", "Cannot mix old filters with filter_expression")

        filter_expression = None

        if has_old_filters:
            # convert old filters into a filter expression

            operator = FilterExpression.Operator.AND
            field_expression: list[FieldFilterExpression] = []
            paragraph_expression: list[ParagraphFilterExpression] = []

            # creation/modification date ranges
            if self.item.range_creation_start or self.item.range_creation_end:
                field_expression.append(
                    DateCreated(
                        since=self.item.range_creation_start,
                        until=self.item.range_creation_end,
                    )
                )

            if self.item.range_modification_start or self.item.range_modification_end:
                field_expression.append(
                    DateModified(
                        since=self.item.range_modification_start,
                        until=self.item.range_modification_end,
                    )
                )

            # classification label filters (may target fields or paragraphs)
            if self.item.filters:
                classification_labels = await self.fetcher.get_classification_labels()
                field_exprs, paragraph_expr = convert_labels_to_filter_expressions(
                    self.item.filters, classification_labels
                )
                if field_exprs:
                    field_expression.extend(field_exprs)
                if paragraph_expr:
                    paragraph_expression.append(paragraph_expr)

            if self.item.keyword_filters:
                # keyword filters
                for keyword_filter in self.item.keyword_filters:
                    if isinstance(keyword_filter, str):
                        field_expression.append(Keyword(word=keyword_filter))
                    else:
                        # model validates that one and only one of these match
                        if keyword_filter.all:
                            field_expression.append(
                                And(operands=[Keyword(word=word) for word in keyword_filter.all])
                            )
                        elif keyword_filter.any:
                            field_expression.append(
                                Or(operands=[Keyword(word=word) for word in keyword_filter.any])
                            )
                        elif keyword_filter.none:
                            field_expression.append(
                                Not(
                                    operand=Or(
                                        operands=[Keyword(word=word) for word in keyword_filter.none]
                                    )
                                )
                            )
                        elif keyword_filter.not_all:
                            field_expression.append(
                                Not(
                                    operand=And(
                                        operands=[Keyword(word=word) for word in keyword_filter.not_all]
                                    )
                                )
                            )

            # field filters: "{type_abbreviation}[/{field_id}]"
            if self.item.fields:
                operands: list[FieldFilterExpression] = []
                for key in self.item.fields:
                    parts = key.split("/")
                    try:
                        field_type = FieldTypeName.from_abbreviation(parts[0])
                    except KeyError:  # pragma: no cover
                        raise InvalidQueryError(
                            "fields", f"field filter {key} has an invalid field type: {parts[0]}"
                        )
                    field_id = parts[1] if len(parts) > 1 else None
                    operands.append(Field(type=field_type, name=field_id))

                if len(operands) == 1:
                    field_expression.append(operands[0])
                elif len(operands) > 1:
                    field_expression.append(Or(operands=operands))

            # resource filters: "{rid}[/{type_abbreviation}[/{field_id}]]"
            if self.item.resource_filters:
                operands = []
                for key in self.item.resource_filters:
                    parts = key.split("/")
                    if len(parts) == 1:
                        operands.append(Resource(id=parts[0]))
                    else:
                        rid = parts[0]
                        field_type = FieldTypeName.from_abbreviation(parts[1])
                        field_id = parts[2] if len(parts) > 2 else None
                        operands.append(
                            And(operands=[Resource(id=rid), Field(type=field_type, name=field_id)])
                        )

                if len(operands) == 1:
                    field_expression.append(operands[0])
                elif len(operands) > 1:
                    field_expression.append(Or(operands=operands))

            # collapse accumulated expressions into a single AND (or the bare
            # expression when there's only one)
            field = None
            if len(field_expression) == 1:
                field = field_expression[0]
            elif len(field_expression) > 1:
                field = And(operands=field_expression)

            paragraph = None
            if len(paragraph_expression) == 1:
                paragraph = paragraph_expression[0]
            elif len(paragraph_expression) > 1:
                paragraph = And(operands=paragraph_expression)

            if field or paragraph:
                filter_expression = FilterExpression(field=field, paragraph=paragraph, operator=operator)

        if self.item.filter_expression is not None:
            filter_expression = self.item.filter_expression

        return retrieval_models.Filters(
            filter_expression=filter_expression,
            show_hidden=self.item.show_hidden,
            security=self.item.security,
            with_duplicates=self.item.with_duplicates,
        )

    def _parse_reranker(self) -> Reranker:
        """Resolve the reranker implementation from the request.

        Windows are always capped at 200 results.
        """
        reranker: Reranker
        top_k = self.item.top_k

        if isinstance(self.item.reranker, search_models.RerankerName):
            if self.item.reranker == search_models.RerankerName.NOOP:
                reranker = NoopReranker()

            elif self.item.reranker == search_models.RerankerName.PREDICT_RERANKER:
                # for predict reranker, by default, we want a x2 factor with a
                # top of 200 results
                reranker = PredictReranker(window=min(top_k * 2, 200))

            else:
                raise InternalParserError(f"Unknown reranker algorithm: {self.item.reranker}")

        elif isinstance(self.item.reranker, search_models.PredictReranker):
            # user-provided window, but never smaller than top_k
            user_window = self.item.reranker.window
            reranker = PredictReranker(window=min(max(user_window or 0, top_k), 200))

        else:
            raise InternalParserError(f"Unknown reranker {self.item.reranker}")

        return reranker
368
+
369
+
370
async def parse_keyword_query(
    item: search_models.BaseSearchRequest,
    *,
    fetcher: RAOFetcher,
) -> retrieval_models.KeywordQuery:
    """Build the keyword (full-text) part of a retrieval query."""
    # If there was a rephrase with image, we should use the rephrased query
    # for keyword search
    rephrased = await fetcher.get_rephrased_query()
    if item.query_image is not None and rephrased is not None:
        text_query = rephrased
    else:
        text_query = item.query

    return retrieval_models.KeywordQuery(
        query=text_query,
        # Synonym checks are done at the retrieval endpoint already
        with_synonyms=item.with_synonyms,
        min_score=parse_keyword_min_score(item.min_score),
    )
390
+
391
+
392
async def parse_semantic_query(
    item: search_models.SearchRequest | search_models.FindRequest,
    *,
    fetcher: RAOFetcher,
) -> retrieval_models.SemanticQuery:
    """Build the semantic (vector) part of a retrieval query.

    Vectorset, query embedding and min score are all lazily resolved through
    the fetcher (awaited in that order, as the fetcher caches results).
    """
    chosen_vectorset = await fetcher.get_vectorset()
    query_vector = await fetcher.get_query_vector()
    semantic_min_score = await parse_semantic_min_score(item.min_score, fetcher=fetcher)

    return retrieval_models.SemanticQuery(
        query=query_vector, vectorset=chosen_vectorset, min_score=semantic_min_score
    )
403
+
404
+
405
async def parse_semantic_min_score(
    min_score: float | search_models.MinScore | None,
    *,
    fetcher: RAOFetcher,
) -> float:
    """Resolve the semantic minimum score threshold for a query.

    Resolution order:
    1. value explicitly set by the user (a bare float, or the ``semantic``
       field of a ``MinScore`` model),
    2. the default reported by the Predict API (via the fetcher),
    3. ``DEFAULT_GENERIC_SEMANTIC_THRESHOLD`` as a last resort (logged).
    """
    # Unwrap the MinScore model; bare floats and None pass through unchanged.
    if isinstance(min_score, search_models.MinScore):
        min_score = min_score.semantic

    if min_score is None:
        # min score not defined by the user, we'll try to get the default
        # from Predict API
        min_score = await fetcher.get_semantic_min_score()
        if min_score is None:
            logger.warning(
                "Semantic threshold not found in query information, using default",
                extra={"kbid": fetcher.kbid},
            )
            min_score = DEFAULT_GENERIC_SEMANTIC_THRESHOLD

    return min_score
428
+
429
+
430
def convert_labels_to_filter_expressions(
    label_filters: list[str] | list[Filter], classification_labels: knowledgebox_pb2.Labels
) -> tuple[list[FieldFilterExpression], ParagraphFilterExpression | None]:
    """Convert old-style label filters into filter expressions.

    Each filter may be a raw facet string (e.g. "/classification.labels/...")
    or a ``Filter`` model with all/any/none/not_all semantics. Labels are
    split between field-level and paragraph-level expressions depending on
    the KB's classification label definitions.

    Returns a list of field expressions (to be ANDed by the caller) and at
    most one combined paragraph expression.

    Raises ``InvalidQueryError`` for malformed labels or when paragraph
    labels are used with anything but the 'all' combinator.
    """
    field_expressions: list[FieldFilterExpression] = []
    paragraph_expressions: list[ParagraphFilterExpression] = []

    for label_filter in label_filters:
        if isinstance(label_filter, str):
            # translate_label
            if len(label_filter) == 0:
                raise InvalidQueryError("filters", "Invalid empty label")
            if label_filter[0] != "/":
                raise InvalidQueryError(
                    "filters", f"Invalid label. It must start with a `/`: {label_filter}"
                )

            label = translate_label(label_filter)
            facet_filter = filter_from_facet(label)

            if is_paragraph_label(label, classification_labels):
                paragraph_expressions.append(facet_filter)  # type: ignore[arg-type]
            else:
                field_expressions.append(facet_filter)  # type: ignore[arg-type]

        else:
            # Filter model: map the all/any/none/not_all variants to a
            # combinator (And/Or) plus an optional per-label negation:
            #   all -> AND, any -> OR, none -> AND of NOTs, not_all -> OR of NOTs
            combinator: type[And[FieldFilterExpression]] | type[Or[FieldFilterExpression]]
            if label_filter.all:
                labels = label_filter.all
                combinator, negate = And, False
            elif label_filter.any:
                labels = label_filter.any
                combinator, negate = Or, False
            elif label_filter.none:
                labels = label_filter.none
                combinator, negate = And, True
            elif label_filter.not_all:
                labels = label_filter.not_all
                combinator, negate = Or, True
            else:
                # Empty filter, should not happen due to validation, but skip just in case
                continue

            # equivalent to split_labels
            field = []
            paragraph = []
            for label in labels:
                label = translate_label(label)
                expr = filter_from_facet(label)

                if negate:
                    expr = Not(operand=expr)  # type: ignore

                if is_paragraph_label(label, classification_labels):
                    paragraph.append(expr)
                else:
                    field.append(expr)

            # only the plain 'all' combination supports paragraph labels
            if len(paragraph) > 0 and not (combinator == And and negate is False):
                raise InvalidQueryError(
                    "filters",
                    "Paragraph labels can only be used with 'all' filter",
                )

            if len(field) == 1:
                field_expressions.append(field[0])  # type: ignore
            elif len(field) > 1:
                field_expressions.append(combinator(operands=field))  # type: ignore

            if len(paragraph) == 1:
                paragraph_expressions.append(paragraph[0])  # type: ignore
            elif len(paragraph) > 1:
                paragraph_expressions.append(combinator(operands=paragraph))  # type: ignore

    # collapse all paragraph expressions into a single AND (or None)
    if len(paragraph_expressions) == 1:
        paragraph_expression = paragraph_expressions[0]  # type: ignore
    elif len(paragraph_expressions) > 1:
        paragraph_expression = And(operands=paragraph_expressions)  # type: ignore
    else:
        paragraph_expression = None

    return field_expressions, paragraph_expression