nucliadb 6.5.0.post4404__py3-none-any.whl → 6.5.0.post4408__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/pg/0006_catalog_title_indexes.py +39 -0
- nucliadb/search/api/v1/catalog.py +2 -2
- nucliadb/search/search/pgcatalog.py +40 -10
- nucliadb/search/search/query_parser/models.py +1 -1
- nucliadb/search/search/query_parser/parsers/catalog.py +12 -1
- {nucliadb-6.5.0.post4404.dist-info → nucliadb-6.5.0.post4408.dist-info}/METADATA +6 -6
- {nucliadb-6.5.0.post4404.dist-info → nucliadb-6.5.0.post4408.dist-info}/RECORD +10 -9
- {nucliadb-6.5.0.post4404.dist-info → nucliadb-6.5.0.post4408.dist-info}/WHEEL +0 -0
- {nucliadb-6.5.0.post4404.dist-info → nucliadb-6.5.0.post4408.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.5.0.post4404.dist-info → nucliadb-6.5.0.post4408.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,39 @@
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
+
#
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
+
#
|
6
|
+
# AGPL:
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
10
|
+
# License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU Affero General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
#
|
20
|
+
|
21
|
+
from nucliadb.common.maindb.pg import PGTransaction
|
22
|
+
|
23
|
+
|
24
|
+
async def migrate(txn: PGTransaction) -> None:
    """Create the indexes on ``catalog.title`` and refresh the table statistics.

    The statements are run on the connection of ``txn`` with autocommit
    enabled; autocommit is switched back off afterwards, even when one of the
    statements fails.
    """
    conn = txn.connection
    statements = (
        # btree index on title for sorting
        "CREATE INDEX CONCURRENTLY ON catalog (title);",
        # trigram index on title for fuzzy search (the extension provides gist_trgm_ops)
        "CREATE EXTENSION IF NOT EXISTS pg_trgm;",
        "CREATE INDEX CONCURRENTLY ON catalog USING gist (title gist_trgm_ops);",
        "ANALYZE catalog",
    )

    # A concurrent index build cannot run inside a transaction, but psycopg opens
    # transactions automatically. Close the current one and switch to autocommit
    # only for the duration of the index creation.
    await conn.commit()
    try:
        await conn.set_autocommit(True)
        for sql in statements:
            await conn.execute(sql)
    finally:
        await conn.set_autocommit(False)
|
@@ -99,7 +99,7 @@ async def catalog_get(
|
|
99
99
|
show: list[ResourceProperties] = fastapi_query(
|
100
100
|
SearchParamDefaults.show, default=[ResourceProperties.BASIC, ResourceProperties.ERRORS]
|
101
101
|
),
|
102
|
-
) -> Union[
|
102
|
+
) -> Union[CatalogResponse, HTTPClientError]:
|
103
103
|
try:
|
104
104
|
expr = (
|
105
105
|
CatalogFilterExpression.model_validate_json(filter_expression) if filter_expression else None
|
@@ -151,7 +151,7 @@ async def catalog_post(
|
|
151
151
|
async def catalog(
|
152
152
|
kbid: str,
|
153
153
|
item: CatalogRequest,
|
154
|
-
):
|
154
|
+
) -> Union[HTTPClientError, CatalogResponse]:
|
155
155
|
"""
|
156
156
|
Catalog endpoint is a simplified version of the search endpoint, it only
|
157
157
|
returns bm25 results on titles and it does not support vector search.
|
@@ -27,6 +27,7 @@ from psycopg.rows import dict_row
|
|
27
27
|
from nucliadb.common.maindb.pg import PGDriver
|
28
28
|
from nucliadb.common.maindb.utils import get_driver
|
29
29
|
from nucliadb.search.search.query_parser.models import CatalogExpression, CatalogQuery
|
30
|
+
from nucliadb_models import search as search_models
|
30
31
|
from nucliadb_models.labels import translate_system_to_alias_label
|
31
32
|
from nucliadb_models.search import (
|
32
33
|
ResourceResult,
|
@@ -115,14 +116,8 @@ def _prepare_query_filters(catalog_query: CatalogQuery) -> tuple[str, dict[str,
|
|
115
116
|
filter_sql = ["kbid = %(kbid)s"]
|
116
117
|
filter_params: dict[str, Any] = {"kbid": catalog_query.kbid}
|
117
118
|
|
118
|
-
if catalog_query.query:
|
119
|
-
|
120
|
-
# the python code at update/query time if it ever becomes a problem but for now, a single regex
|
121
|
-
# executed per query is not a problem.
|
122
|
-
filter_sql.append(
|
123
|
-
"regexp_split_to_array(lower(title), '\\W') @> regexp_split_to_array(lower(%(query)s), '\\W')"
|
124
|
-
)
|
125
|
-
filter_params["query"] = catalog_query.query
|
119
|
+
if catalog_query.query and catalog_query.query.query:
|
120
|
+
filter_sql.append(_prepare_query_search(catalog_query.query, filter_params))
|
126
121
|
|
127
122
|
if catalog_query.filters:
|
128
123
|
filter_sql.append(_convert_filter(catalog_query.filters, filter_params))
|
@@ -133,6 +128,41 @@ def _prepare_query_filters(catalog_query: CatalogQuery) -> tuple[str, dict[str,
|
|
133
128
|
)
|
134
129
|
|
135
130
|
|
131
|
+
def _prepare_query_search(query: search_models.CatalogQuery, params: dict[str, Any]) -> str:
    """Translate the catalog text query into a SQL condition.

    The value to bind is stored in ``params["query"]`` and the returned
    fragment refers to it through the ``%(query)s`` placeholder.

    For the LIKE/ILIKE based operators (starts with, ends with, contains) the
    user text is escaped before the wildcards are added, so that ``%``, ``_``
    and backslashes typed by the user are matched literally instead of acting
    as pattern metacharacters.

    Args:
        query: Catalog query carrying the text, the field and the match operator.
        params: Query parameters; updated in place with the ``query`` key.

    Returns:
        SQL boolean expression to be combined with the rest of the filters.

    Raises:
        ValueError: If the match operator is not handled by any branch.
    """
    if query.match == search_models.CatalogQueryMatch.Exact:
        params["query"] = query.query
        return f"{query.field.value} = %(query)s"
    elif query.match == search_models.CatalogQueryMatch.StartsWith:
        params["query"] = _escape_like_pattern(query.query) + "%"
        if query.field == search_models.CatalogQueryField.Title:
            # Insensitive search supported by pg_trgm for title
            return f"{query.field.value} ILIKE %(query)s"
        else:
            # Sensitive search for slug (btree does not support ILIKE and slugs are all lowercase anyway)
            return f"{query.field.value} LIKE %(query)s"
    # The rest of operators only supported by title
    elif query.match == search_models.CatalogQueryMatch.Words:
        # This is doing tokenization inside the SQL server (to keep the index updated). We could move it to
        # the python code at update/query time if it ever becomes a problem but for now, a single regex
        # executed per query is not a problem.
        params["query"] = query.query
        return "regexp_split_to_array(lower(title), '\\W') @> regexp_split_to_array(lower(%(query)s), '\\W')"
    elif query.match == search_models.CatalogQueryMatch.Fuzzy:
        params["query"] = query.query
        # Note: the operator is %>, We use %%> for psycopg escaping
        return "title %%> %(query)s"
    elif query.match == search_models.CatalogQueryMatch.EndsWith:
        params["query"] = "%" + _escape_like_pattern(query.query)
        return "title ILIKE %(query)s"
    elif query.match == search_models.CatalogQueryMatch.Contains:
        params["query"] = "%" + _escape_like_pattern(query.query) + "%"
        return "title ILIKE %(query)s"
    else:  # pragma: nocover
        # This is a trick so mypy generates an error if this branch can be reached,
        # that is, if we are missing some ifs
        _a: int = "a"
        # Fail loudly at runtime instead of silently returning None as a SQL fragment
        raise ValueError(f"Unsupported catalog query match: {query.match}")


def _escape_like_pattern(text: str) -> str:
    """Escape LIKE/ILIKE metacharacters so that *text* is matched literally."""
    # Backslash (the default LIKE escape character) goes first, otherwise the
    # escapes added for % and _ would themselves get escaped again.
    return text.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
|
164
|
+
|
165
|
+
|
136
166
|
def _prepare_query(catalog_query: CatalogQuery) -> tuple[str, dict[str, Any]]:
|
137
167
|
# Base query with all the filters
|
138
168
|
query, filter_params = _prepare_query_filters(catalog_query)
|
@@ -188,7 +218,7 @@ async def pgcatalog_search(catalog_query: CatalogQuery) -> Resources:
|
|
188
218
|
if not (
|
189
219
|
facet.startswith("/n/s") or facet.startswith("/n/i") or facet.startswith("/l")
|
190
220
|
):
|
191
|
-
logger.
|
221
|
+
logger.warning(
|
192
222
|
f"Unexpected facet used at catalog: {facet}, kbid={catalog_query.kbid}"
|
193
223
|
)
|
194
224
|
|
@@ -239,7 +269,7 @@ async def pgcatalog_search(catalog_query: CatalogQuery) -> Resources:
|
|
239
269
|
)
|
240
270
|
for r in data
|
241
271
|
],
|
242
|
-
query=catalog_query.query,
|
272
|
+
query=catalog_query.query.query if catalog_query.query else "",
|
243
273
|
total=total,
|
244
274
|
page_number=catalog_query.page_number,
|
245
275
|
page_size=catalog_query.page_size,
|
@@ -84,9 +84,20 @@ async def parse_catalog(kbid: str, item: search_models.CatalogRequest) -> Catalo
|
|
84
84
|
limit=None,
|
85
85
|
)
|
86
86
|
|
87
|
+
if isinstance(item.query, search_models.CatalogQuery):
|
88
|
+
query = item.query
|
89
|
+
elif isinstance(item.query, str) and len(item.query) > 0:
|
90
|
+
query = search_models.CatalogQuery(
|
91
|
+
field=search_models.CatalogQueryField.Title,
|
92
|
+
match=search_models.CatalogQueryMatch.Words,
|
93
|
+
query=item.query,
|
94
|
+
)
|
95
|
+
else:
|
96
|
+
query = None
|
97
|
+
|
87
98
|
return CatalogQuery(
|
88
99
|
kbid=kbid,
|
89
|
-
query=
|
100
|
+
query=query,
|
90
101
|
filters=catalog_expr,
|
91
102
|
sort=sort,
|
92
103
|
faceted=item.faceted,
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: nucliadb
|
3
|
-
Version: 6.5.0.post4404
|
3
|
+
Version: 6.5.0.post4408
|
4
4
|
Summary: NucliaDB
|
5
5
|
Author-email: Nuclia <nucliadb@nuclia.com>
|
6
6
|
License-Expression: AGPL-3.0-or-later
|
@@ -19,11 +19,11 @@ Classifier: Programming Language :: Python :: 3.12
|
|
19
19
|
Classifier: Programming Language :: Python :: 3 :: Only
|
20
20
|
Requires-Python: <4,>=3.9
|
21
21
|
Description-Content-Type: text/markdown
|
22
|
-
Requires-Dist: nucliadb-telemetry[all]>=6.5.0.
|
23
|
-
Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.5.0.
|
24
|
-
Requires-Dist: nucliadb-protos>=6.5.0.
|
25
|
-
Requires-Dist: nucliadb-models>=6.5.0.
|
26
|
-
Requires-Dist: nidx-protos>=6.5.0.
|
22
|
+
Requires-Dist: nucliadb-telemetry[all]>=6.5.0.post4408
|
23
|
+
Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.5.0.post4408
|
24
|
+
Requires-Dist: nucliadb-protos>=6.5.0.post4408
|
25
|
+
Requires-Dist: nucliadb-models>=6.5.0.post4408
|
26
|
+
Requires-Dist: nidx-protos>=6.5.0.post4408
|
27
27
|
Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
|
28
28
|
Requires-Dist: nuclia-models>=0.24.2
|
29
29
|
Requires-Dist: uvicorn[standard]
|
@@ -37,6 +37,7 @@ migrations/pg/0002_catalog.py,sha256=Rsleecu351Ty19kYZgOpqX5G3MEAY8nMxCJrAeuS2Mw
|
|
37
37
|
migrations/pg/0003_catalog_kbid_index.py,sha256=uKq_vtnuf73GVf0mtl2rhzdk_czAoEU1UdiVKVZpA0M,1044
|
38
38
|
migrations/pg/0004_catalog_facets.py,sha256=FJFASHjfEHG3sNve9BP2HnnLO4xr7dnR6Qpctnmt4LE,2180
|
39
39
|
migrations/pg/0005_purge_tasks_index.py,sha256=3mtyFgpcK0QQ_NONYay7V9xICijCLNkyTPuoc0PBjRg,1139
|
40
|
+
migrations/pg/0006_catalog_title_indexes.py,sha256=n2OGxwE4oeCwHAYaxBkja4t10BmwTjZ2IoCyOdjEBSc,1710
|
40
41
|
migrations/pg/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
|
41
42
|
nucliadb/__init__.py,sha256=_abCmDJ_0ku483Os4UAjPX7Nywm39cQgAV_DiyjsKeQ,891
|
42
43
|
nucliadb/health.py,sha256=UIxxA4oms4HIsCRZM_SZsdkIZIlgzmOxw-qSHLlWuak,3465
|
@@ -215,7 +216,7 @@ nucliadb/search/utilities.py,sha256=9SsRDw0rJVXVoLBfF7rBb6q080h-thZc7u8uRcTiBeY,
|
|
215
216
|
nucliadb/search/api/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
|
216
217
|
nucliadb/search/api/v1/__init__.py,sha256=DH16OYnw9jQ38OpKlmdXeoq2j40ZPXZRtGvClKOkMhw,1239
|
217
218
|
nucliadb/search/api/v1/ask.py,sha256=b4tz33HNsfT5DXv_2DMc_jirnFsHuobreWkbAKkzj5o,5337
|
218
|
-
nucliadb/search/api/v1/catalog.py,sha256=
|
219
|
+
nucliadb/search/api/v1/catalog.py,sha256=3SqLgwFkFFY8x-xBruHQaZ0EGpf7oKbSj-_PnobV68E,7747
|
219
220
|
nucliadb/search/api/v1/feedback.py,sha256=kNLc4dHz2SXHzV0PwC1WiRAwY88fDptPcP-kO0q-FrQ,2620
|
220
221
|
nucliadb/search/api/v1/find.py,sha256=iMjyq4y0JOMC_x1B8kUfVdkCoc9G9Ark58kPLLY4HDw,10824
|
221
222
|
nucliadb/search/api/v1/graph.py,sha256=gthqxCOn9biE6D6s93jRGLglk0ono8U7OyS390kWiI8,4178
|
@@ -248,7 +249,7 @@ nucliadb/search/search/ingestion_agents.py,sha256=IK6yOPEF9rST_uoqspdVdPk0pldjDh
|
|
248
249
|
nucliadb/search/search/merge.py,sha256=XiRBsxhYPshPV7lZXD-9E259KZOPIf4I2tKosY0lPo4,22470
|
249
250
|
nucliadb/search/search/metrics.py,sha256=3I6IN0qDSmqIvUaWJmT3rt-Jyjs6LcvnKI8ZqCiuJPY,3501
|
250
251
|
nucliadb/search/search/paragraphs.py,sha256=pNAEiYqJGGUVcEf7xf-PFMVqz0PX4Qb-WNG-_zPGN2o,7799
|
251
|
-
nucliadb/search/search/pgcatalog.py,sha256=
|
252
|
+
nucliadb/search/search/pgcatalog.py,sha256=QtgArjoM-dW_B1oO0aXqp5au7GlLG8jAct9jevUHatw,10997
|
252
253
|
nucliadb/search/search/predict_proxy.py,sha256=JwgBeEg1j4LnCjPCvTUrnmOd9LceJAt3iAu4m9cmJBo,3390
|
253
254
|
nucliadb/search/search/query.py,sha256=0qIQdt548L3jtKOyKo06aGJ73SLBxAW3N38_Hc1M3Uw,11528
|
254
255
|
nucliadb/search/search/rank_fusion.py,sha256=xZtXhbmKb_56gs73u6KkFm2efvTATOSMmpOV2wrAIqE,9613
|
@@ -265,11 +266,11 @@ nucliadb/search/search/chat/query.py,sha256=3jMPNbiFEOoS0ydMOPYkSx1qVlvAv51npzad
|
|
265
266
|
nucliadb/search/search/query_parser/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
|
266
267
|
nucliadb/search/search/query_parser/exceptions.py,sha256=sVl9gRNzhE-s480LBBVkiXzNRbKhYRQN5F3it5tNNp8,939
|
267
268
|
nucliadb/search/search/query_parser/fetcher.py,sha256=nP4EySj2BvH10QgCvgzvp13Nf22wwfHsdLbDoPlH2cQ,16831
|
268
|
-
nucliadb/search/search/query_parser/models.py,sha256=
|
269
|
+
nucliadb/search/search/query_parser/models.py,sha256=kAslqX_-zaIdUpcpdNU2a5uQPQh7LC605qWLZ4aZ5T4,5064
|
269
270
|
nucliadb/search/search/query_parser/old_filters.py,sha256=HircRqYEac_90bNCtFIJZ2RKA90kjbpNOQcp_ArBqR0,9083
|
270
271
|
nucliadb/search/search/query_parser/parsers/__init__.py,sha256=ySCNSdbesLXGZyR88919njulA6UE10_3PhqMG_Yj1o4,1034
|
271
272
|
nucliadb/search/search/query_parser/parsers/ask.py,sha256=eTz8wS-EJHuAagR384h6TT64itymFZRpfZJGX8r6aZM,2771
|
272
|
-
nucliadb/search/search/query_parser/parsers/catalog.py,sha256=
|
273
|
+
nucliadb/search/search/query_parser/parsers/catalog.py,sha256=JuDiBL2wdjAuEFEPo0e2nQ4VqWjF3FXakT0ziZk3Oes,7495
|
273
274
|
nucliadb/search/search/query_parser/parsers/common.py,sha256=mJMPOKurBK7-A7s3oNlPLxHP_yIn4j5Uw8rh_OQtzS4,6339
|
274
275
|
nucliadb/search/search/query_parser/parsers/find.py,sha256=lHVspg-i_eWXvu7BT9WfuFVGVKYhr380y4tDX5yfTD4,12735
|
275
276
|
nucliadb/search/search/query_parser/parsers/graph.py,sha256=zyqdUg5Afmhb2_-hvj9FUCaoLh026MUP1fgY2j-lD7c,9385
|
@@ -369,8 +370,8 @@ nucliadb/writer/tus/local.py,sha256=7jYa_w9b-N90jWgN2sQKkNcomqn6JMVBOVeDOVYJHto,
|
|
369
370
|
nucliadb/writer/tus/s3.py,sha256=vF0NkFTXiXhXq3bCVXXVV-ED38ECVoUeeYViP8uMqcU,8357
|
370
371
|
nucliadb/writer/tus/storage.py,sha256=ToqwjoYnjI4oIcwzkhha_MPxi-k4Jk3Lt55zRwaC1SM,2903
|
371
372
|
nucliadb/writer/tus/utils.py,sha256=MSdVbRsRSZVdkaum69_0wku7X3p5wlZf4nr6E0GMKbw,2556
|
372
|
-
nucliadb-6.5.0.
|
373
|
-
nucliadb-6.5.0.
|
374
|
-
nucliadb-6.5.0.
|
375
|
-
nucliadb-6.5.0.
|
376
|
-
nucliadb-6.5.0.
|
373
|
+
nucliadb-6.5.0.post4408.dist-info/METADATA,sha256=5PRi3_KM32DTF2_BSWmfndESWte8q1aH9fPh0SYJ6aE,4152
|
374
|
+
nucliadb-6.5.0.post4408.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
375
|
+
nucliadb-6.5.0.post4408.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
|
376
|
+
nucliadb-6.5.0.post4408.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
|
377
|
+
nucliadb-6.5.0.post4408.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|