nucliadb 6.1.0.post2602__py3-none-any.whl → 6.1.0.post2610__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nucliadb/search/api/v1/__init__.py +1 -0
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/resource/search.py +1 -3
- nucliadb/search/api/v1/search.py +4 -167
- nucliadb/search/search/chat/ask.py +1 -2
- nucliadb/search/search/cut.py +3 -5
- nucliadb/search/search/find.py +3 -5
- nucliadb/search/search/find_merge.py +6 -7
- nucliadb/search/search/merge.py +24 -42
- nucliadb/search/search/pgcatalog.py +32 -32
- nucliadb/search/search/query.py +5 -15
- nucliadb/search/search/query_parser/models.py +34 -0
- nucliadb/search/search/query_parser/parser.py +56 -4
- {nucliadb-6.1.0.post2602.dist-info → nucliadb-6.1.0.post2610.dist-info}/METADATA +5 -5
- {nucliadb-6.1.0.post2602.dist-info → nucliadb-6.1.0.post2610.dist-info}/RECORD +19 -18
- {nucliadb-6.1.0.post2602.dist-info → nucliadb-6.1.0.post2610.dist-info}/WHEEL +0 -0
- {nucliadb-6.1.0.post2602.dist-info → nucliadb-6.1.0.post2610.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.1.0.post2602.dist-info → nucliadb-6.1.0.post2610.dist-info}/top_level.txt +0 -0
- {nucliadb-6.1.0.post2602.dist-info → nucliadb-6.1.0.post2610.dist-info}/zip-safe +0 -0
nucliadb/search/api/v1/catalog.py ADDED
@@ -0,0 +1,184 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+from time import time
+from typing import Optional, Union
+
+from fastapi import Request, Response
+from fastapi_versioning import version
+
+from nucliadb.common.datamanagers.exceptions import KnowledgeBoxNotFound
+from nucliadb.common.maindb.pg import PGDriver
+from nucliadb.common.maindb.utils import get_driver
+from nucliadb.models.responses import HTTPClientError
+from nucliadb.search import logger
+from nucliadb.search.api.v1.router import KB_PREFIX, api
+from nucliadb.search.api.v1.utils import fastapi_query
+from nucliadb.search.search import cache
+from nucliadb.search.search.exceptions import InvalidQueryError
+from nucliadb.search.search.merge import fetch_resources
+from nucliadb.search.search.pgcatalog import pgcatalog_search
+from nucliadb.search.search.query_parser.parser import parse_catalog
+from nucliadb.search.search.utils import (
+    maybe_log_request_payload,
+)
+from nucliadb_models.common import FieldTypeName
+from nucliadb_models.metadata import ResourceProcessingStatus
+from nucliadb_models.resource import NucliaDBRoles
+from nucliadb_models.search import (
+    CatalogRequest,
+    CatalogResponse,
+    KnowledgeboxSearchResults,
+    ResourceProperties,
+    SearchParamDefaults,
+    SortField,
+    SortOptions,
+    SortOrder,
+)
+from nucliadb_models.utils import DateTime
+from nucliadb_utils.authentication import requires
+from nucliadb_utils.exceptions import LimitsExceededError
+
+
+@api.get(
+    f"/{KB_PREFIX}/{{kbid}}/catalog",
+    status_code=200,
+    summary="List resources of a Knowledge Box",
+    description="List resources of a Knowledge Box",
+    response_model=KnowledgeboxSearchResults,
+    response_model_exclude_unset=True,
+    tags=["Search"],
+)
+@requires(NucliaDBRoles.READER)
+@version(1)
+async def catalog_get(
+    request: Request,
+    response: Response,
+    kbid: str,
+    query: str = fastapi_query(SearchParamDefaults.query),
+    filters: list[str] = fastapi_query(SearchParamDefaults.filters),
+    faceted: list[str] = fastapi_query(SearchParamDefaults.faceted),
+    sort_field: SortField = fastapi_query(SearchParamDefaults.sort_field),
+    sort_limit: Optional[int] = fastapi_query(SearchParamDefaults.sort_limit),
+    sort_order: SortOrder = fastapi_query(SearchParamDefaults.sort_order),
+    page_number: int = fastapi_query(SearchParamDefaults.catalog_page_number),
+    page_size: int = fastapi_query(SearchParamDefaults.catalog_page_size),
+    shards: list[str] = fastapi_query(SearchParamDefaults.shards, deprecated=True),
+    with_status: Optional[ResourceProcessingStatus] = fastapi_query(
+        SearchParamDefaults.with_status, deprecated="Use filters instead"
+    ),
+    debug: bool = fastapi_query(SearchParamDefaults.debug, include_in_schema=False),
+    range_creation_start: Optional[DateTime] = fastapi_query(SearchParamDefaults.range_creation_start),
+    range_creation_end: Optional[DateTime] = fastapi_query(SearchParamDefaults.range_creation_end),
+    range_modification_start: Optional[DateTime] = fastapi_query(
+        SearchParamDefaults.range_modification_start
+    ),
+    range_modification_end: Optional[DateTime] = fastapi_query(
+        SearchParamDefaults.range_modification_end
+    ),
+    hidden: Optional[bool] = fastapi_query(SearchParamDefaults.hidden),
+) -> Union[KnowledgeboxSearchResults, HTTPClientError]:
+    item = CatalogRequest(
+        query=query,
+        filters=filters,
+        faceted=faceted,
+        page_number=page_number,
+        page_size=page_size,
+        shards=shards,
+        debug=debug,
+        with_status=with_status,
+        range_creation_start=range_creation_start,
+        range_creation_end=range_creation_end,
+        range_modification_start=range_modification_start,
+        range_modification_end=range_modification_end,
+        hidden=hidden,
+    )
+    if sort_field:
+        item.sort = SortOptions(field=sort_field, limit=sort_limit, order=sort_order)
+    return await catalog(kbid, item)
+
+
+@api.post(
+    f"/{KB_PREFIX}/{{kbid}}/catalog",
+    status_code=200,
+    summary="List resources of a Knowledge Box",
+    description="List resources of a Knowledge Box",
+    response_model=KnowledgeboxSearchResults,
+    response_model_exclude_unset=True,
+    tags=["Search"],
+)
+@requires(NucliaDBRoles.READER)
+@version(1)
+async def catalog_post(
+    request: Request,
+    kbid: str,
+    item: CatalogRequest,
+) -> Union[CatalogResponse, HTTPClientError]:
+    return await catalog(kbid, item)
+
+
+async def catalog(
+    kbid: str,
+    item: CatalogRequest,
+):
+    """
+    Catalog endpoint is a simplified version of the search endpoint, it only
+    returns bm25 results on titles and it does not support vector search.
+    It is useful for listing resources in a knowledge box.
+    """
+    if not pgcatalog_enabled():  # pragma: no cover
+        return HTTPClientError(status_code=501, detail="PG driver is needed for catalog search")
+
+    maybe_log_request_payload(kbid, "/catalog", item)
+    start_time = time()
+    try:
+        with cache.request_caches():
+            query_parser = parse_catalog(kbid, item)
+
+            catalog_results = CatalogResponse()
+            catalog_results.fulltext = await pgcatalog_search(query_parser)
+            catalog_results.resources = await fetch_resources(
+                resources=[r.rid for r in catalog_results.fulltext.results],
+                kbid=kbid,
+                show=[ResourceProperties.BASIC, ResourceProperties.ERRORS],
+                field_type_filter=list(FieldTypeName),
+                extracted=[],
+            )
+            return catalog_results
+    except InvalidQueryError as exc:
+        return HTTPClientError(status_code=412, detail=str(exc))
+    except KnowledgeBoxNotFound:
+        return HTTPClientError(status_code=404, detail="Knowledge Box not found")
+    except LimitsExceededError as exc:
+        return HTTPClientError(status_code=exc.status_code, detail=exc.detail)
+    finally:
+        duration = time() - start_time
+        if duration > 2:  # pragma: no cover
+            logger.warning(
+                "Slow catalog request",
+                extra={
+                    "kbid": kbid,
+                    "duration": duration,
+                    "query": item.model_dump_json(),
+                },
+            )
+
+
+def pgcatalog_enabled():
+    return isinstance(get_driver(), PGDriver)
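For reference, a minimal sketch of exercising the new endpoint from a client. Only the `/kb/{kbid}/catalog` route and the request fields come from this diff; the host, API prefix, and auth header are assumptions for illustration (standalone deployments commonly use a roles header, but check your setup).

import httpx

KBID = "my-kb"  # hypothetical knowledge box id
response = httpx.post(
    f"http://localhost:8080/api/v1/kb/{KBID}/catalog",  # host and prefix are assumptions
    headers={"X-NUCLIADB-ROLES": "READER"},  # illustrative auth; real deployments differ
    json={"query": "report", "page_number": 0, "page_size": 20},
)
response.raise_for_status()
body = response.json()
# fulltext holds the bm25-on-titles matches; resources holds the serialized resources
print(body["fulltext"]["total"], list(body.get("resources", {})))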
nucliadb/search/api/v1/resource/search.py CHANGED
@@ -90,7 +90,6 @@ async def resource_search(
         fields,
         filters,
         faceted,
-        0,
         top_k,
         range_creation_start,
         range_creation_end,
@@ -109,8 +108,7 @@ async def resource_search(
     # We need to merge
     search_results = await merge_paragraphs_results(
         results,
-        …
-        page=0,
+        top_k=top_k,
         kbid=kbid,
         highlight_split=highlight,
         min_score=0.0,
nucliadb/search/api/v1/search.py CHANGED
@@ -27,21 +27,17 @@ from fastapi_versioning import version
 from pydantic import ValidationError
 
 from nucliadb.common.datamanagers.exceptions import KnowledgeBoxNotFound
-from nucliadb.common.maindb.pg import PGDriver
-from nucliadb.common.maindb.utils import get_driver
 from nucliadb.models.responses import HTTPClientError
-from nucliadb.search import logger, predict
+from nucliadb.search import predict
 from nucliadb.search.api.v1.router import KB_PREFIX, api
 from nucliadb.search.api.v1.utils import fastapi_query
 from nucliadb.search.requesters.utils import Method, debug_nodes_info, node_query
 from nucliadb.search.search import cache
 from nucliadb.search.search.exceptions import InvalidQueryError
-from nucliadb.search.search.merge import fetch_resources, merge_results
-from nucliadb.search.search.pgcatalog import pgcatalog_search
+from nucliadb.search.search.merge import merge_results
 from nucliadb.search.search.query import QueryParser
 from nucliadb.search.search.utils import (
     filter_hidden_resources,
-    maybe_log_request_payload,
     min_score_from_payload,
     min_score_from_query_params,
     should_disable_vector_search,
@@ -50,10 +46,7 @@ from nucliadb_models.common import FieldTypeName
 from nucliadb_models.metadata import ResourceProcessingStatus
 from nucliadb_models.resource import ExtractedDataTypeName, NucliaDBRoles
 from nucliadb_models.search import (
-    CatalogRequest,
-    CatalogResponse,
     KnowledgeboxSearchResults,
-    MinScore,
     NucliaDBClientType,
     ResourceProperties,
     SearchOptions,
@@ -202,156 +195,6 @@ async def search_knowledgebox(
     return await _search_endpoint(response, kbid, item, x_ndb_client, x_nucliadb_user, x_forwarded_for)
 
 
-@api.get(
-    f"/{KB_PREFIX}/{{kbid}}/catalog",
-    status_code=200,
-    summary="List resources of a Knowledge Box",
-    description="List resources of a Knowledge Box",
-    response_model=KnowledgeboxSearchResults,
-    response_model_exclude_unset=True,
-    tags=["Search"],
-)
-@requires(NucliaDBRoles.READER)
-@version(1)
-async def catalog_get(
-    request: Request,
-    response: Response,
-    kbid: str,
-    query: str = fastapi_query(SearchParamDefaults.query),
-    filters: list[str] = fastapi_query(SearchParamDefaults.filters),
-    faceted: list[str] = fastapi_query(SearchParamDefaults.faceted),
-    sort_field: SortField = fastapi_query(SearchParamDefaults.sort_field),
-    sort_limit: Optional[int] = fastapi_query(SearchParamDefaults.sort_limit),
-    sort_order: SortOrder = fastapi_query(SearchParamDefaults.sort_order),
-    page_number: int = fastapi_query(SearchParamDefaults.catalog_page_number),
-    page_size: int = fastapi_query(SearchParamDefaults.catalog_page_size),
-    shards: list[str] = fastapi_query(SearchParamDefaults.shards, deprecated=True),
-    with_status: Optional[ResourceProcessingStatus] = fastapi_query(
-        SearchParamDefaults.with_status, deprecated="Use filters instead"
-    ),
-    debug: bool = fastapi_query(SearchParamDefaults.debug, include_in_schema=False),
-    range_creation_start: Optional[DateTime] = fastapi_query(SearchParamDefaults.range_creation_start),
-    range_creation_end: Optional[DateTime] = fastapi_query(SearchParamDefaults.range_creation_end),
-    range_modification_start: Optional[DateTime] = fastapi_query(
-        SearchParamDefaults.range_modification_start
-    ),
-    range_modification_end: Optional[DateTime] = fastapi_query(
-        SearchParamDefaults.range_modification_end
-    ),
-    hidden: Optional[bool] = fastapi_query(SearchParamDefaults.hidden),
-) -> Union[KnowledgeboxSearchResults, HTTPClientError]:
-    item = CatalogRequest(
-        query=query,
-        filters=filters,
-        faceted=faceted,
-        page_number=page_number,
-        page_size=page_size,
-        shards=shards,
-        debug=debug,
-        with_status=with_status,
-        range_creation_start=range_creation_start,
-        range_creation_end=range_creation_end,
-        range_modification_start=range_modification_start,
-        range_modification_end=range_modification_end,
-        hidden=hidden,
-    )
-    if sort_field:
-        item.sort = SortOptions(field=sort_field, limit=sort_limit, order=sort_order)
-    return await catalog(kbid, item)
-
-
-@api.post(
-    f"/{KB_PREFIX}/{{kbid}}/catalog",
-    status_code=200,
-    summary="List resources of a Knowledge Box",
-    description="List resources of a Knowledge Box",
-    response_model=KnowledgeboxSearchResults,
-    response_model_exclude_unset=True,
-    tags=["Search"],
-)
-@requires(NucliaDBRoles.READER)
-@version(1)
-async def catalog_post(
-    request: Request,
-    kbid: str,
-    item: CatalogRequest,
-) -> Union[CatalogResponse, HTTPClientError]:
-    return await catalog(kbid, item)
-
-
-async def catalog(
-    kbid: str,
-    item: CatalogRequest,
-):
-    """
-    Catalog endpoint is a simplified version of the search endpoint, it only
-    returns bm25 results on titles and it does not support vector search.
-    It is useful for listing resources in a knowledge box.
-    """
-    if not pgcatalog_enabled():  # pragma: no cover
-        return HTTPClientError(status_code=501, detail="PG driver is needed for catalog search")
-
-    maybe_log_request_payload(kbid, "/catalog", item)
-    start_time = time()
-    try:
-        with cache.request_caches():
-            sort = item.sort
-            if sort is None:
-                # By default we sort by creation date (most recent first)
-                sort = SortOptions(
-                    field=SortField.CREATED,
-                    order=SortOrder.DESC,
-                    limit=None,
-                )
-
-            query_parser = QueryParser(
-                kbid=kbid,
-                features=[SearchOptions.FULLTEXT],
-                query=item.query,
-                label_filters=item.filters,
-                keyword_filters=[],
-                faceted=item.faceted,
-                sort=sort,
-                page_number=item.page_number,
-                page_size=item.page_size,
-                min_score=MinScore(bm25=0, semantic=0),
-                fields=["a/title"],
-                with_status=item.with_status,
-                range_creation_start=item.range_creation_start,
-                range_creation_end=item.range_creation_end,
-                range_modification_start=item.range_modification_start,
-                range_modification_end=item.range_modification_end,
-                hidden=item.hidden,
-            )
-            catalog_results = CatalogResponse()
-            catalog_results.fulltext = await pgcatalog_search(query_parser)
-            catalog_results.resources = await fetch_resources(
-                resources=[r.rid for r in catalog_results.fulltext.results],
-                kbid=kbid,
-                show=[ResourceProperties.BASIC, ResourceProperties.ERRORS],
-                field_type_filter=list(FieldTypeName),
-                extracted=[],
-            )
-            return catalog_results
-    except InvalidQueryError as exc:
-        return HTTPClientError(status_code=412, detail=str(exc))
-    except KnowledgeBoxNotFound:
-        return HTTPClientError(status_code=404, detail="Knowledge Box not found")
-    except LimitsExceededError as exc:
-        return HTTPClientError(status_code=exc.status_code, detail=exc.detail)
-    finally:
-        duration = time() - start_time
-        if duration > 2:  # pragma: no cover
-            logger.warning(
-                "Slow catalog request",
-                extra={
-                    "kbid": kbid,
-                    "duration": duration,
-                    "query": item.model_dump_json(),
-                },
-            )
-
-
 @api.post(
     f"/{KB_PREFIX}/{{kbid}}/search",
     status_code=200,
@@ -431,8 +274,7 @@ async def search(
         keyword_filters=[],
         faceted=item.faceted,
         sort=item.sort,
-        …
-        page_size=item.top_k,
+        top_k=item.top_k,
         min_score=item.min_score,
         range_creation_start=item.range_creation_start,
         range_creation_end=item.range_creation_end,
@@ -461,8 +303,7 @@ async def search(
     # We need to merge
     search_results = await merge_results(
         results,
-        …
-        page=0,
+        top_k=item.top_k,
         kbid=kbid,
         show=item.show,
        field_type_filter=item.field_type_filter,
@@ -491,7 +332,3 @@ async def search(
     search_results.shards = queried_shards
     search_results.autofilters = autofilters
     return search_results, incomplete_results
-
-
-def pgcatalog_enabled():
-    return isinstance(get_driver(), PGDriver)
nucliadb/search/search/cut.py CHANGED
@@ -23,10 +23,8 @@ from typing import TypeVar
 T = TypeVar("T")
 
 
-def cut_page(items: list[T], …
+def cut_page(items: list[T], top_k: int) -> tuple[list[T], bool]:
     """Return a slice of `items` representing the specified page and a boolean
     indicating whether there is a next page or not"""
-    …
-    …
-    next_page = len(items) > end
-    return items[start:end], next_page
+    next_page = len(items) > top_k
+    return items[:top_k], next_page
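The new cut_page drops page arithmetic entirely: it keeps the first top_k items and flags whether anything was cut off. A quick worked example of the new semantics:

from nucliadb.search.search.cut import cut_page

items = ["a", "b", "c", "d", "e"]
page, next_page = cut_page(items, top_k=3)
assert page == ["a", "b", "c"]  # only the first top_k items survive
assert next_page is True        # len(items) > top_k, so more results existed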
nucliadb/search/search/find.py CHANGED
@@ -122,8 +122,7 @@ async def _index_node_retrieval(
         relation_subgraph_query=pb_query.relations.subgraph,
         min_score_bm25=pb_query.min_score_bm25,
         min_score_semantic=pb_query.min_score_semantic,
-        …
-        page_number=0,
+        top_k=item.top_k,
         show=item.show,
         extracted=item.extracted,
         field_type_filter=item.field_type_filter,
@@ -214,7 +213,7 @@ async def _external_index_retrieval(
             kbid=kbid,
             query=search_request.body,
         ),
-        top_k=query_parser.…
+        top_k=query_parser.top_k,
     )
     find_resources = compose_find_resources(text_blocks, resources)
 
@@ -273,8 +272,7 @@ async def query_parser_from_find_request(
         keyword_filters=item.keyword_filters,
         faceted=None,
         sort=None,
-        …
-        page_size=item.top_k,
+        top_k=item.top_k,
         min_score=item.min_score,
         range_creation_start=item.range_creation_start,
         range_creation_end=item.range_creation_end,
nucliadb/search/search/find_merge.py CHANGED
@@ -75,8 +75,7 @@ async def build_find_response(
     kbid: str,
     query: str,
     relation_subgraph_query: EntitiesSubgraphRequest,
-    …
-    page_number: int,
+    top_k: int,
     min_score_bm25: float,
     min_score_semantic: float,
     rank_fusion_algorithm: RankFusionAlgorithm,
@@ -106,9 +105,9 @@ async def build_find_response(
     # enforced/validated by the query parsing.
     if reranker.needs_extra_results:
         assert reranker.window is not None, "Reranker definition must enforce this condition"
-        text_blocks_page, next_page = cut_page(merged_text_blocks, reranker.window…
+        text_blocks_page, next_page = cut_page(merged_text_blocks, reranker.window)
     else:
-        text_blocks_page, next_page = cut_page(merged_text_blocks, …
+        text_blocks_page, next_page = cut_page(merged_text_blocks, top_k)
 
     # hydrate and rerank
     resource_hydration_options = ResourceHydrationOptions(
@@ -126,7 +125,7 @@ async def build_find_response(
         text_block_hydration_options=text_block_hydration_options,
         reranker=reranker,
         reranking_options=reranking_options,
-        top_k=…
+        top_k=top_k,
     )
 
     # build relations graph
@@ -144,8 +143,8 @@ async def build_find_response(
         best_matches=best_matches,
         relations=relations,
         total=total_paragraphs,
-        page_number=…
-        page_size=…
+        page_number=0,  # Bw/c with pagination
+        page_size=top_k,
         next_page=next_page,
         min_score=MinScore(bm25=_round(min_score_bm25), semantic=_round(min_score_semantic)),
     )
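To make the control flow above concrete: when the reranker needs extra candidates, the merged list is first cut at reranker.window, and reranking itself narrows the page down to top_k. A rough sketch under those assumptions; the sorting stand-in below is not the real reranker:

from nucliadb.search.search.cut import cut_page

merged_text_blocks = list(range(100))  # stand-in for merged text blocks
reranker_window = 50                   # hypothetical reranker window
top_k = 20

candidates, next_page = cut_page(merged_text_blocks, reranker_window)
reranked = sorted(candidates, reverse=True)[:top_k]  # placeholder for the reranker
assert len(reranked) == top_k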
nucliadb/search/search/merge.py CHANGED
@@ -24,6 +24,7 @@ from typing import Any, Optional, Set, Union
 
 from nucliadb.common.ids import FieldId, ParagraphId
 from nucliadb.search.search import cache
+from nucliadb.search.search.cut import cut_page
 from nucliadb.search.search.fetch import (
     fetch_resources,
     get_labels_paragraph,
@@ -118,8 +119,7 @@ async def get_sort_value(
 async def merge_documents_results(
     document_responses: list[DocumentSearchResponse],
     resources: list[str],
-    …
-    page: int,
+    top_k: int,
     kbid: str,
     sort: SortOptions,
     min_score: float,
@@ -148,15 +148,9 @@ async def merge_documents_results(
             raw_resource_list.append((result, sort_value))
         total += document_response.total
 
-    …
-    …
-    …
-    …
-    if length > end:
-        next_page = True
-
-    # We need to cut first and then sort, otherwise pagination will be wrong if the order is DESC
-    raw_resource_list = raw_resource_list[min(skip, length) : min(end, length)]
+    # We need to cut first and then sort, otherwise the page will be wrong if the order is DESC
+    raw_resource_list, has_more = cut_page(raw_resource_list, top_k)
+    next_page = next_page or has_more
     raw_resource_list.sort(key=lambda x: x[1], reverse=(sort.order == SortOrder.DESC))
 
     result_resource_list: list[ResourceResult] = []
@@ -181,8 +175,8 @@ async def merge_documents_results(
         results=result_resource_list,
         query=query,
         total=total,
-        page_number=…
-        page_size=…
+        page_number=0,  # Bw/c with pagination
+        page_size=top_k,
         next_page=next_page,
         min_score=min_score,
     )
@@ -258,8 +252,7 @@ async def merge_vectors_results(
     vector_responses: list[VectorSearchResponse],
     resources: list[str],
     kbid: str,
-    …
-    page: int,
+    top_k: int,
     min_score: Optional[float] = None,
 ):
     facets: dict[str, Any] = {}
@@ -276,12 +269,10 @@ async def merge_vectors_results(
     if len(vector_responses) > 1:
         raw_vectors_list.sort(key=lambda x: x.score, reverse=True)
 
-    …
-    end_element = skip + count
-    length = len(raw_vectors_list)
+    raw_vectors_list, _ = cut_page(raw_vectors_list, top_k)
 
     result_sentence_list: list[Sentence] = []
-    for result in raw_vectors_list…
+    for result in raw_vectors_list:
         id_count = result.doc_id.id.count("/")
         if id_count == 4:
             rid, field_type, field, index, position = result.doc_id.id.split("/")
@@ -329,8 +320,8 @@ async def merge_vectors_results(
     return Sentences(
         results=result_sentence_list,
         facets=facets,
-        page_number=…
-        page_size=…
+        page_number=0,  # Bw/c with pagination
+        page_size=top_k,
         min_score=round(min_score or 0, ndigits=3),
     )
 
@@ -339,8 +330,7 @@ async def merge_paragraph_results(
     paragraph_responses: list[ParagraphSearchResponse],
     resources: list[str],
     kbid: str,
-    …
-    page: int,
+    top_k: int,
     highlight: bool,
     sort: SortOptions,
     min_score: float,
@@ -374,15 +364,11 @@ async def merge_paragraph_results(
 
     raw_paragraph_list.sort(key=lambda x: x[1], reverse=(sort.order == SortOrder.DESC))
 
-    …
-    …
-    length = len(raw_paragraph_list)
-    …
-    if length > end:
-        next_page = True
+    raw_paragraph_list, has_more = cut_page(raw_paragraph_list, top_k)
+    next_page = next_page or has_more
 
     result_paragraph_list: list[Paragraph] = []
-    for result, _ in raw_paragraph_list…
+    for result, _ in raw_paragraph_list:
         _, field_type, field = result.field.split("/")
         text = await get_paragraph_text(
             kbid=kbid,
@@ -435,8 +421,8 @@ async def merge_paragraph_results(
         facets=facets,
         query=query,
         total=total,
-        page_number=…
-        page_size=…
+        page_number=0,  # Bw/c with pagination
+        page_size=top_k,
         next_page=next_page,
         min_score=min_score,
     )
@@ -494,8 +480,7 @@ def _merge_relations_results(
 @merge_observer.wrap({"type": "merge"})
 async def merge_results(
     search_responses: list[SearchResponse],
-    …
-    page: int,
+    top_k: int,
     kbid: str,
     show: list[ResourceProperties],
     field_type_filter: list[FieldTypeName],
@@ -520,22 +505,21 @@ async def merge_results(
 
     resources: list[str] = list()
     api_results.fulltext = await merge_documents_results(
-        documents, resources, …
+        documents, resources, top_k, kbid, sort, min_score=min_score.bm25
     )
 
     api_results.paragraphs = await merge_paragraph_results(
         paragraphs,
         resources,
         kbid,
-        …
-        page,
+        top_k,
         highlight,
         sort,
         min_score=min_score.bm25,
     )
 
     api_results.sentences = await merge_vectors_results(
-        vectors, resources, kbid, …
+        vectors, resources, kbid, top_k, min_score=min_score.semantic
    )
 
     api_results.relations = await merge_relations_results(relations, requested_relations)
@@ -546,8 +530,7 @@ async def merge_results(
 
 async def merge_paragraphs_results(
     responses: list[SearchResponse],
-    …
-    page: int,
+    top_k: int,
     kbid: str,
     highlight_split: bool,
     min_score: float,
@@ -563,8 +546,7 @@ async def merge_paragraphs_results(
         paragraphs,
         resources,
         kbid,
-        …
-        page,
+        top_k,
         highlight=highlight_split,
         sort=SortOptions(
             field=SortField.SCORE,
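The repeated pattern above ("cut first and then sort") now funnels through cut_page, with next_page OR-ed with the cut result. A small sketch of that sequence, using hypothetical (rid, sort_value) tuples:

from nucliadb.search.search.cut import cut_page

raw_resource_list = [("r1", 3), ("r2", 1), ("r3", 4), ("r4", 2)]  # hypothetical hits
next_page = False

raw_resource_list, has_more = cut_page(raw_resource_list, 2)   # cut to top_k first
next_page = next_page or has_more
raw_resource_list.sort(key=lambda x: x[1], reverse=True)       # then sort (DESC)
print(raw_resource_list, next_page)  # [('r1', 3), ('r2', 1)] True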
nucliadb/search/search/pgcatalog.py CHANGED
@@ -26,6 +26,7 @@ from psycopg.rows import dict_row
 
 from nucliadb.common.maindb.pg import PGDriver
 from nucliadb.common.maindb.utils import get_driver
+from nucliadb.search.search.query_parser.models import CatalogQuery
 from nucliadb_models.labels import translate_system_to_alias_label
 from nucliadb_models.metadata import ResourceProcessingStatus
 from nucliadb_models.search import (
@@ -37,7 +38,6 @@ from nucliadb_models.search import (
 from nucliadb_telemetry import metrics
 
 from .filters import translate_label
-from .query import QueryParser
 
 observer = metrics.Observer("pg_catalog_search", labels={"op": ""})
 logger = logging.getLogger(__name__)
@@ -79,60 +79,60 @@ def _convert_filter(filter, filter_params):
         raise ValueError(f"Invalid operator {op}")
 
 
-def _prepare_query(query_parser: QueryParser):
+def _prepare_query(catalog_query: CatalogQuery):
     filter_sql = ["kbid = %(kbid)s"]
-    filter_params: dict[str, Any] = {"kbid": query_parser.kbid}
+    filter_params: dict[str, Any] = {"kbid": catalog_query.kbid}
 
-    if query_parser.query:
+    if catalog_query.query:
         # This is doing tokenization inside the SQL server (to keep the index updated). We could move it to
         # the python code at update/query time if it ever becomes a problem but for now, a single regex
         # executed per query is not a problem.
         filter_sql.append(
             "regexp_split_to_array(lower(title), '\\W') @> regexp_split_to_array(lower(%(query)s), '\\W')"
         )
-        filter_params["query"] = query_parser.query
+        filter_params["query"] = catalog_query.query
 
-    if query_parser.range_creation_start:
+    if catalog_query.filters.creation.after:
         filter_sql.append("created_at > %(created_at_start)s")
-        filter_params["created_at_start"] = query_parser.range_creation_start
+        filter_params["created_at_start"] = catalog_query.filters.creation.after
 
-    if query_parser.range_creation_end:
+    if catalog_query.filters.creation.before:
         filter_sql.append("created_at < %(created_at_end)s")
-        filter_params["created_at_end"] = query_parser.range_creation_end
+        filter_params["created_at_end"] = catalog_query.filters.creation.before
 
-    if query_parser.range_modification_start:
+    if catalog_query.filters.modification.after:
         filter_sql.append("modified_at > %(modified_at_start)s")
-        filter_params["modified_at_start"] = query_parser.range_modification_start
+        filter_params["modified_at_start"] = catalog_query.filters.modification.after
 
-    if query_parser.range_modification_end:
+    if catalog_query.filters.modification.before:
         filter_sql.append("modified_at < %(modified_at_end)s")
-        filter_params["modified_at_end"] = query_parser.range_modification_end
+        filter_params["modified_at_end"] = catalog_query.filters.modification.before
 
-    if query_parser.label_filters:
-        filter_sql.append(_convert_filter(query_parser.label_filters, filter_params))
+    if catalog_query.filters.labels:
+        filter_sql.append(_convert_filter(catalog_query.filters.labels, filter_params))
 
     order_sql = ""
-    if query_parser.sort:
-        if query_parser.sort.field == SortField.CREATED:
+    if catalog_query.sort:
+        if catalog_query.sort.field == SortField.CREATED:
             order_field = "created_at"
-        elif query_parser.sort.field == SortField.MODIFIED:
+        elif catalog_query.sort.field == SortField.MODIFIED:
             order_field = "modified_at"
-        elif query_parser.sort.field == SortField.TITLE:
+        elif catalog_query.sort.field == SortField.TITLE:
             order_field = "title"
         else:
             # Deprecated order by score, use created_at instead
             order_field = "created_at"
 
-        if query_parser.sort.order == SortOrder.ASC:
+        if catalog_query.sort.order == SortOrder.ASC:
             order_dir = "ASC"
         else:
             order_dir = "DESC"
 
         order_sql = f" ORDER BY {order_field} {order_dir}"
 
-    if query_parser.with_status:
+    if catalog_query.filters.with_status:
         filter_sql.append("labels && %(status)s")
-        if query_parser.with_status == ResourceProcessingStatus.PROCESSED:
+        if catalog_query.filters.with_status == ResourceProcessingStatus.PROCESSED:
             filter_params["status"] = ["/n/s/PROCESSED", "/n/s/ERROR"]
         else:
             filter_params["status"] = ["/n/s/PENDING"]
@@ -148,18 +148,18 @@ def _pg_driver() -> PGDriver:
 
 
 @observer.wrap({"op": "search"})
-async def pgcatalog_search(query_parser: QueryParser) -> Resources:
+async def pgcatalog_search(catalog_query: CatalogQuery) -> Resources:
     # Prepare SQL query
-    query, query_params = _prepare_query(query_parser)
+    query, query_params = _prepare_query(catalog_query)
 
     async with _pg_driver()._get_connection() as conn, conn.cursor(row_factory=dict_row) as cur:
         facets = {}
 
         # Faceted search
-        if query_parser.faceted:
+        if catalog_query.faceted:
             with observer({"op": "facets"}):
                 tmp_facets: dict[str, dict[str, int]] = {
-                    translate_label(f): defaultdict(int) for f in query_parser.faceted
+                    translate_label(f): defaultdict(int) for f in catalog_query.faceted
                 }
                 facet_filters = " OR ".join(f"label LIKE '{f}/%%'" for f in tmp_facets.keys())
                 for facet in tmp_facets.keys():
@@ -167,7 +167,7 @@ async def pgcatalog_search(query_parser: QueryParser) -> Resources:
                         facet.startswith("/n/s") or facet.startswith("/n/i") or facet.startswith("/l")
                     ):
                         logger.warn(
-                            f"Unexpected facet used at catalog: {facet}, kbid={query_parser.kbid}"
+                            f"Unexpected facet used at catalog: {facet}, kbid={catalog_query.kbid}"
                         )
 
         await cur.execute(
@@ -201,12 +201,12 @@ async def pgcatalog_search(query_parser: QueryParser) -> Resources:
 
         # Query
         with observer({"op": "query"}):
-            offset = query_parser.page_size * query_parser.page_number
+            offset = catalog_query.page_size * catalog_query.page_number
            await cur.execute(
                f"{query} LIMIT %(page_size)s OFFSET %(offset)s",
                {
                    **query_params,
-                    "page_size": query_parser.page_size,
+                    "page_size": catalog_query.page_size,
                    "offset": offset,
                },
            )
@@ -224,10 +224,10 @@ async def pgcatalog_search(query_parser: QueryParser) -> Resources:
                 )
                 for r in data
             ],
-            query=query_parser.query,
+            query=catalog_query.query,
             total=total,
-            page_number=query_parser.page_number,
-            page_size=query_parser.page_size,
+            page_number=catalog_query.page_number,
+            page_size=catalog_query.page_size,
             next_page=(offset + len(data) < total),
             min_score=0,
         )
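The title matching above relies on Postgres array containment rather than full-text ranking: both title and query are lowercased and split on non-word characters, and `@>` requires every query token to appear among the title tokens. A standalone sketch of the resulting statement, shown for shape only (the `catalog` table name is hypothetical):

sql = """
SELECT rid FROM catalog
WHERE kbid = %(kbid)s
  AND regexp_split_to_array(lower(title), '\\W')
      @> regexp_split_to_array(lower(%(query)s), '\\W')
ORDER BY created_at DESC
LIMIT %(page_size)s OFFSET %(offset)s
"""
params = {"kbid": "my-kb", "query": "Quarterly Report", "page_size": 20, "offset": 0}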
nucliadb/search/search/query.py CHANGED
@@ -74,7 +74,6 @@ INDEX_SORTABLE_FIELDS = [
     SortField.MODIFIED,
 ]
 
-MAX_VECTOR_RESULTS_ALLOWED = 2000
 DEFAULT_GENERIC_SEMANTIC_THRESHOLD = 0.7
 
 
@@ -105,8 +104,7 @@ class QueryParser:
         query: str,
         label_filters: Union[list[str], list[Filter]],
         keyword_filters: Union[list[str], list[Filter]],
-        …
-        page_size: int,
+        top_k: int,
         min_score: MinScore,
         faceted: Optional[list[str]] = None,
         sort: Optional[SortOptions] = None,
@@ -145,8 +143,7 @@ class QueryParser:
         self.flat_label_filters: list[str] = []
         self.keyword_filters: dict[str, Any] = convert_to_node_filters(keyword_filters)
         self.faceted = faceted or []
-        self.…
-        self.page_size = page_size
+        self.top_k = top_k
         self.min_score = min_score
         self.sort = sort
         self.range_creation_start = range_creation_start
@@ -389,19 +386,13 @@ class QueryParser:
             # have consistent results, we must limit them
             request.result_per_page = self.sort.limit
         else:
-            request.result_per_page = self.…
+            request.result_per_page = self.top_k
 
         sort_field = SortFieldMap[self.sort.field] if self.sort else None
         if sort_field is not None:
             request.order.sort_by = sort_field
             request.order.type = SortOrderMap[self.sort.order]  # type: ignore
 
-        if self.has_vector_search and request.result_per_page > MAX_VECTOR_RESULTS_ALLOWED:
-            raise InvalidQueryError(
-                "page_size",
-                f"Pagination of semantic results limit reached: {MAX_VECTOR_RESULTS_ALLOWED}. If you want to paginate through all results, please disable the vector search feature.",  # noqa: E501
-            )
-
     async def parse_min_score(self, request: nodereader_pb2.SearchRequest, incomplete: bool) -> None:
         semantic_min_score = DEFAULT_GENERIC_SEMANTIC_THRESHOLD
         if self.min_score.semantic is not None:
@@ -635,8 +626,7 @@ async def paragraph_query_to_pb(
     fields: list[str],
     filters: list[str],
     faceted: list[str],
-    …
-    page_size: int,
+    top_k: int,
     range_creation_start: Optional[datetime] = None,
     range_creation_end: Optional[datetime] = None,
     range_modification_start: Optional[datetime] = None,
@@ -650,7 +640,7 @@ async def paragraph_query_to_pb(
 
     # We need to ask for all and cut later
     request.page_number = 0
-    request.result_per_page = …
+    request.result_per_page = top_k
 
     request.body = query
nucliadb/search/search/query_parser/models.py CHANGED
@@ -19,6 +19,8 @@
 #
 
 from dataclasses import dataclass
+from datetime import datetime
+from typing import Any, Optional
 
 from pydantic import (
     BaseModel,
@@ -27,6 +29,16 @@ from pydantic import (
 
 from nucliadb_models import search as search_models
 
+### Retrieval
+
+# filters
+
+
+class DateTimeFilter(BaseModel):
+    after: Optional[datetime] = None  # aka, start
+    before: Optional[datetime] = None  # aka, end
+
+
 # rank fusion
 
 
@@ -65,3 +77,25 @@ class UnitRetrieval:
     top_k: int
     rank_fusion: RankFusion
     reranker: Reranker
+
+
+### Catalog
+
+
+class CatalogFilters(BaseModel):
+    labels: dict[str, Any] = Field(
+        default_factory=dict, description="Labels filter expression, like, `{and: {not: ...}, ...}`"
+    )
+    creation: DateTimeFilter
+    modification: DateTimeFilter
+    with_status: Optional[search_models.ResourceProcessingStatus] = None
+
+
+class CatalogQuery(BaseModel):
+    kbid: str
+    query: str
+    filters: CatalogFilters
+    sort: search_models.SortOptions
+    faceted: list[str]
+    page_size: int
+    page_number: int
nucliadb/search/search/query_parser/parser.py CHANGED
@@ -18,11 +18,19 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 
+from typing import Any
 
 from pydantic import ValidationError
 
+from nucliadb.search.search.filters import (
+    convert_to_node_filters,
+    translate_label_filters,
+)
 from nucliadb.search.search.query_parser.exceptions import ParserError
 from nucliadb.search.search.query_parser.models import (
+    CatalogFilters,
+    CatalogQuery,
+    DateTimeFilter,
     MultiMatchBoosterReranker,
     NoopReranker,
     PredictReranker,
@@ -32,7 +40,14 @@ from nucliadb.search.search.query_parser.models import (
     UnitRetrieval,
 )
 from nucliadb_models import search as search_models
-from nucliadb_models.…
+from nucliadb_models.labels import LABEL_HIDDEN
+from nucliadb_models.search import (
+    Filter,
+    FindRequest,
+    SortField,
+    SortOptions,
+    SortOrder,
+)
 
 
 def parse_find(item: FindRequest) -> UnitRetrieval:
@@ -69,9 +84,6 @@ class _FindParser:
     )
 
     def _parse_top_k(self) -> int:
-        # while pagination is still there, FindRequest has a validator that converts
-        # top_k to page_number and page_size. To get top_k, we can compute it from
-        # those
         assert self.item.top_k is not None, "top_k must have an int value"
         top_k = self.item.top_k
         return top_k
@@ -129,3 +141,43 @@ class _FindParser:
             raise ParserError(f"Unknown reranker {self.item.reranker}")
 
         return reranking
+
+
+def parse_catalog(kbid: str, item: search_models.CatalogRequest) -> CatalogQuery:
+    if item.hidden:
+        hidden_filter = Filter(all=[LABEL_HIDDEN])
+    else:
+        hidden_filter = Filter(none=[LABEL_HIDDEN])
+    label_filters: dict[str, Any] = convert_to_node_filters(item.filters + [hidden_filter])  # type: ignore
+    if len(label_filters) > 0:
+        label_filters = translate_label_filters(label_filters)
+
+    sort = item.sort
+    if sort is None:
+        # By default we sort by creation date (most recent first)
+        sort = SortOptions(
+            field=SortField.CREATED,
+            order=SortOrder.DESC,
+            limit=None,
+        )
+
+    return CatalogQuery(
+        kbid=kbid,
+        query=item.query,
+        filters=CatalogFilters(
+            labels=label_filters,
+            creation=DateTimeFilter(
+                after=item.range_creation_start,
+                before=item.range_creation_end,
+            ),
+            modification=DateTimeFilter(
+                after=item.range_modification_start,
+                before=item.range_modification_end,
+            ),
+            with_status=item.with_status,
+        ),
+        sort=sort,
+        faceted=item.faceted,
+        page_number=item.page_number,
+        page_size=item.page_size,
+    )
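End to end, a CatalogRequest is parsed into the CatalogQuery that pgcatalog_search consumes. A minimal sketch, assuming CatalogRequest defaults for the fields not set here:

from nucliadb.search.search.query_parser.parser import parse_catalog
from nucliadb_models.search import CatalogRequest, SortField

item = CatalogRequest(query="report", page_number=0, page_size=20)
catalog_query = parse_catalog("my-kb", item)
assert catalog_query.sort.field == SortField.CREATED  # default: newest first
assert catalog_query.filters.labels  # the hidden-resources filter is always injected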
{nucliadb-6.1.0.post2602.dist-info → nucliadb-6.1.0.post2610.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: nucliadb
-Version: 6.1.0.post2602
+Version: 6.1.0.post2610
 Home-page: https://docs.nuclia.dev/docs/management/nucliadb/intro
 Author: NucliaDB Community
 Author-email: nucliadb@nuclia.com
@@ -22,10 +22,10 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3 :: Only
 Requires-Python: >=3.9, <4
 Description-Content-Type: text/markdown
-Requires-Dist: nucliadb-telemetry[all]>=6.1.0.post2602
-Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.1.0.post2602
-Requires-Dist: nucliadb-protos>=6.1.0.post2602
-Requires-Dist: nucliadb-models>=6.1.0.post2602
+Requires-Dist: nucliadb-telemetry[all]>=6.1.0.post2610
+Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.1.0.post2610
+Requires-Dist: nucliadb-protos>=6.1.0.post2610
+Requires-Dist: nucliadb-models>=6.1.0.post2610
 Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
 Requires-Dist: nucliadb-node-binding>=2.26.0
 Requires-Dist: uvicorn
{nucliadb-6.1.0.post2602.dist-info → nucliadb-6.1.0.post2610.dist-info}/RECORD CHANGED
@@ -190,52 +190,53 @@ nucliadb/search/run.py,sha256=aFb-CXRi_C8YMpP_ivNj8KW1BYhADj88y8K9Lr_nUPI,1402
 nucliadb/search/settings.py,sha256=vem3EcyYlTPSim0kEK-xe-erF4BZg0CT_LAb8ZRQAE8,1684
 nucliadb/search/utilities.py,sha256=9SsRDw0rJVXVoLBfF7rBb6q080h-thZc7u8uRcTiBeY,1037
 nucliadb/search/api/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
-nucliadb/search/api/v1/__init__.py,sha256=…
+nucliadb/search/api/v1/__init__.py,sha256=NSbOVF6toiHX9WMpGgqpFrjJKT50EcHzOILp-2iHk5I,1249
 nucliadb/search/api/v1/ask.py,sha256=Od2U_gaOZK6dJZ1eDGQQJ3xUVnbBih58VPYVAsQErOw,3902
+nucliadb/search/api/v1/catalog.py,sha256=ubYPS1wmPHzOgH9LR0qJmmV-9ELZPtHRSs5TYJ1pA9A,7117
 nucliadb/search/api/v1/feedback.py,sha256=yrOZeElw6XLu6j_6m3QGHKjEMwZPWa9vtdCud4dNilU,2547
 nucliadb/search/api/v1/find.py,sha256=DsnWkySu_cFajDWJIxN8DYvLL_Rm2yiCjHD8TsqPfRk,9304
 nucliadb/search/api/v1/knowledgebox.py,sha256=PKT1V3vZUnBkGfkxnFGjWPuHwQarVxREDY7lAT_9k1w,8764
 nucliadb/search/api/v1/predict_proxy.py,sha256=QrGzo0hKjtmyGZ6pjlJHYAh4hxwVUIOTcVcerRCw7eE,3047
 nucliadb/search/api/v1/router.py,sha256=mtT07rBZcVfpa49doaw9b1tj3sdi3qLH0gn9Io6NYM0,988
-nucliadb/search/api/v1/search.py,sha256=…
+nucliadb/search/api/v1/search.py,sha256=_5J8lIzLjfFW3j-XeaebaJqcO1vxm0W2oaX4unFJ5e8,13577
 nucliadb/search/api/v1/suggest.py,sha256=SXxRVKT5hDSHNKlBYo8XozHHq9bGyvJOlo286lEruLE,5979
 nucliadb/search/api/v1/summarize.py,sha256=VAHJvE6V3xUgEBfqNKhgoxmDqCvh30RnrEIBVhMcNLU,2499
 nucliadb/search/api/v1/utils.py,sha256=5Ve-frn7LAE2jqAgB85F8RSeqxDlyA08--gS-AdOLS4,1434
 nucliadb/search/api/v1/resource/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
 nucliadb/search/api/v1/resource/ask.py,sha256=XMEP9_Uwy37yaXLcIYKMXGiZYNASD8RTByzQGjd9LPQ,3847
-nucliadb/search/api/v1/resource/search.py,sha256=…
+nucliadb/search/api/v1/resource/search.py,sha256=X0rQU14r_s4_CPpoE2sc84AJPX68gvCftcP4bosWHhA,4812
 nucliadb/search/requesters/__init__.py,sha256=itSI7dtTwFP55YMX4iK7JzdMHS5CQVUiB1XzQu4UBh8,833
 nucliadb/search/requesters/utils.py,sha256=7ovWSGzhLpZGTMi9x9nMOi7QNCgt2qah-7Kam-cIvUg,8468
 nucliadb/search/search/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
 nucliadb/search/search/cache.py,sha256=n9vkN6Y6Xnr2RBJyoH0WzjzGTJOMfKekU9tfPTWWCPc,6810
-nucliadb/search/search/cut.py,sha256=…
+nucliadb/search/search/cut.py,sha256=ytY0_GY7ocNjfxTb4aosxEp4ZfhQNDP--JkhEMGD298,1153
 nucliadb/search/search/exceptions.py,sha256=mbToQ-ghrv8ukLEv8S_-EZrgweWaIZZ5SIpoeuGDk6s,1154
 nucliadb/search/search/fetch.py,sha256=XJHIFnZmXM_8Kb37lb4lg1GYG7cZ1plT-qAIb_QziX4,6184
 nucliadb/search/search/filters.py,sha256=1MkHlJjAQqoRCj7e5cEzK2HvBxGLE17I_omsjiklbtw,6476
-nucliadb/search/search/find.py,sha256=…
-nucliadb/search/search/find_merge.py,sha256=…
+nucliadb/search/search/find.py,sha256=EPtnb3jJcj6_4Brzk0mRkd9meY26OVGfW5dj-RKk2yU,9829
+nucliadb/search/search/find_merge.py,sha256=_R_YpHAZv5BHh3XABQ8MRd1Ci0seclGYf26yJHJ7H0I,17178
 nucliadb/search/search/hydrator.py,sha256=7Zi44uf2m9b2X_b1aOV2lrWu1Vmbo9lXYgPVUGK0RGI,6728
-nucliadb/search/search/merge.py,sha256=…
+nucliadb/search/search/merge.py,sha256=TATahN22AX23gJ-2hxGiIZLjj6H1AtnIeADN6jC11HY,20079
 nucliadb/search/search/metrics.py,sha256=81X-tahGW4n2CLvUzCPdNxNClmZqUWZjcVOGCUHoiUM,2872
 nucliadb/search/search/paragraphs.py,sha256=pNAEiYqJGGUVcEf7xf-PFMVqz0PX4Qb-WNG-_zPGN2o,7799
-nucliadb/search/search/pgcatalog.py,sha256=…
+nucliadb/search/search/pgcatalog.py,sha256=IaNK4dAxdXs38PoIkTdgqMDuZDjeiOtcXn3LeaT-OMw,8855
 nucliadb/search/search/predict_proxy.py,sha256=xBlh6kjuQpWRq7KsBx4pEl2PtnwljjQIiYMaTWpcCSA,3015
-nucliadb/search/search/query.py,sha256=…
+nucliadb/search/search/query.py,sha256=1g_kek5mbxXPrFbs2ptTrFfr-WwZjsjMUHOw9Le2vZ4,37732
 nucliadb/search/search/rank_fusion.py,sha256=tRGo_KlsFsVx1CQEy1iqQ6f0T1Dq1kf0axDXHuuzvvM,6946
 nucliadb/search/search/rerankers.py,sha256=0kAHES9X_FKkP7KSN9NRETFmRPKzwrFAo_54MbyvM7Q,9051
 nucliadb/search/search/shards.py,sha256=mM2aCHWhl_gwkCENXDShPukS-_qnB5tFS3UAJuzM9qA,2182
 nucliadb/search/search/summarize.py,sha256=ksmYPubEQvAQgfPdZHfzB_rR19B2ci4IYZ6jLdHxZo8,4996
 nucliadb/search/search/utils.py,sha256=iF2tbBA56gRMJH1TlE2hMrqeXqjoeOPt4KgRdp2m9Ek,3313
 nucliadb/search/search/chat/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
-nucliadb/search/search/chat/ask.py,sha256=…
+nucliadb/search/search/chat/ask.py,sha256=7yUPEMluZ553O4FdcghyQI3Hw042P3QL06T0AwDctJI,33799
 nucliadb/search/search/chat/exceptions.py,sha256=Siy4GXW2L7oPhIR86H3WHBhE9lkV4A4YaAszuGGUf54,1356
 nucliadb/search/search/chat/images.py,sha256=PA8VWxT5_HUGfW1ULhKTK46UBsVyINtWWqEM1ulzX1E,3095
 nucliadb/search/search/chat/prompt.py,sha256=TIzjI_882hJ--KLKCY8rJomtJ_CMJ-MHYtHqivgG8Lk,46819
 nucliadb/search/search/chat/query.py,sha256=gKtlj2ms81m417Id29-DtHFxE3M4TtJvYNB03gAgpYo,14402
 nucliadb/search/search/query_parser/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
 nucliadb/search/search/query_parser/exceptions.py,sha256=tuzl7ZyvVsRz6u0_3zMe60vx39nd3pi641prs-5nC0E,872
-nucliadb/search/search/query_parser/models.py,sha256=…
-nucliadb/search/search/query_parser/parser.py,sha256=…
+nucliadb/search/search/query_parser/models.py,sha256=-VlCDXUCgOroAZw1Leqhj2VMgRv_CD2w40PXXOBLaUM,2332
+nucliadb/search/search/query_parser/parser.py,sha256=ElKx9JboJCSqBiFiEAVVH-JM0_7ykc_cdY4TbKfAxUg,6296
 nucliadb/standalone/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
 nucliadb/standalone/api_router.py,sha256=zR03TQ-Pd2kXx1jeV83Puw19112Z8Jhln7p1cAn69kg,6699
 nucliadb/standalone/app.py,sha256=mAApNK_iVsQgJyd-mtwCeZq5csSimwnXmlQGH9a70pE,5586
@@ -331,9 +332,9 @@ nucliadb/writer/tus/local.py,sha256=7jYa_w9b-N90jWgN2sQKkNcomqn6JMVBOVeDOVYJHto,
 nucliadb/writer/tus/s3.py,sha256=vF0NkFTXiXhXq3bCVXXVV-ED38ECVoUeeYViP8uMqcU,8357
 nucliadb/writer/tus/storage.py,sha256=ToqwjoYnjI4oIcwzkhha_MPxi-k4Jk3Lt55zRwaC1SM,2903
 nucliadb/writer/tus/utils.py,sha256=MSdVbRsRSZVdkaum69_0wku7X3p5wlZf4nr6E0GMKbw,2556
-nucliadb-6.1.0.post2602.dist-info/METADATA,sha256=…
-nucliadb-6.1.0.post2602.dist-info/WHEEL,sha256=…
-nucliadb-6.1.0.post2602.dist-info/entry_points.txt,sha256=…
-nucliadb-6.1.0.post2602.dist-info/top_level.txt,sha256=…
-nucliadb-6.1.0.post2602.dist-info/zip-safe,sha256=…
-nucliadb-6.1.0.post2602.dist-info/RECORD,,
+nucliadb-6.1.0.post2610.dist-info/METADATA,sha256=hq7ECRkXdli7ZqMNQKUI1ZjJyi0ww7yCXEC7asizcCk,4390
+nucliadb-6.1.0.post2610.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+nucliadb-6.1.0.post2610.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
+nucliadb-6.1.0.post2610.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
+nucliadb-6.1.0.post2610.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
+nucliadb-6.1.0.post2610.dist-info/RECORD,,
{nucliadb-6.1.0.post2602.dist-info → nucliadb-6.1.0.post2610.dist-info}/WHEEL: file without changes
{nucliadb-6.1.0.post2602.dist-info → nucliadb-6.1.0.post2610.dist-info}/entry_points.txt: file without changes
{nucliadb-6.1.0.post2602.dist-info → nucliadb-6.1.0.post2610.dist-info}/top_level.txt: file without changes
{nucliadb-6.1.0.post2602.dist-info → nucliadb-6.1.0.post2610.dist-info}/zip-safe: file without changes