nucliadb 6.4.2.post4403__py3-none-any.whl → 6.5.0.post4408__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,39 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+
21
+ from nucliadb.common.maindb.pg import PGTransaction
22
+
23
+
24
async def migrate(txn: PGTransaction) -> None:
    """Create the title indexes on the ``catalog`` table.

    Adds a btree index on ``title`` (sorting) and a trigram GiST index on
    ``title`` (fuzzy search), enabling the ``pg_trgm`` extension if needed,
    and finally refreshes the planner statistics for ``catalog``.

    Concurrent index builds must run outside of a transaction, but psycopg
    opens transactions automatically. The pending transaction is therefore
    committed and the connection is switched to autocommit while the
    statements run; autocommit is always switched back off afterwards.
    """
    conn = txn.connection

    # Executed strictly in this order, each one outside of a transaction.
    statements = (
        # btree index on title for sorting
        "CREATE INDEX CONCURRENTLY ON catalog (title);",
        # trigram index on title for fuzzy search
        "CREATE EXTENSION IF NOT EXISTS pg_trgm;",
        "CREATE INDEX CONCURRENTLY ON catalog USING gist (title gist_trgm_ops);",
        "ANALYZE catalog",
    )

    # Close the implicitly opened transaction before touching autocommit.
    await conn.commit()
    try:
        await conn.set_autocommit(True)
        for statement in statements:
            await conn.execute(statement)
    finally:
        # Restore transactional behaviour whether or not the statements succeeded.
        await conn.set_autocommit(False)
@@ -99,7 +99,7 @@ async def catalog_get(
99
99
  show: list[ResourceProperties] = fastapi_query(
100
100
  SearchParamDefaults.show, default=[ResourceProperties.BASIC, ResourceProperties.ERRORS]
101
101
  ),
102
- ) -> Union[KnowledgeboxSearchResults, HTTPClientError]:
102
+ ) -> Union[CatalogResponse, HTTPClientError]:
103
103
  try:
104
104
  expr = (
105
105
  CatalogFilterExpression.model_validate_json(filter_expression) if filter_expression else None
@@ -151,7 +151,7 @@ async def catalog_post(
151
151
  async def catalog(
152
152
  kbid: str,
153
153
  item: CatalogRequest,
154
- ):
154
+ ) -> Union[HTTPClientError, CatalogResponse]:
155
155
  """
156
156
  Catalog endpoint is a simplified version of the search endpoint, it only
157
157
  returns bm25 results on titles and it does not support vector search.
@@ -27,6 +27,7 @@ from psycopg.rows import dict_row
27
27
  from nucliadb.common.maindb.pg import PGDriver
28
28
  from nucliadb.common.maindb.utils import get_driver
29
29
  from nucliadb.search.search.query_parser.models import CatalogExpression, CatalogQuery
30
+ from nucliadb_models import search as search_models
30
31
  from nucliadb_models.labels import translate_system_to_alias_label
31
32
  from nucliadb_models.search import (
32
33
  ResourceResult,
@@ -115,14 +116,8 @@ def _prepare_query_filters(catalog_query: CatalogQuery) -> tuple[str, dict[str,
115
116
  filter_sql = ["kbid = %(kbid)s"]
116
117
  filter_params: dict[str, Any] = {"kbid": catalog_query.kbid}
117
118
 
118
- if catalog_query.query:
119
- # This is doing tokenization inside the SQL server (to keep the index updated). We could move it to
120
- # the python code at update/query time if it ever becomes a problem but for now, a single regex
121
- # executed per query is not a problem.
122
- filter_sql.append(
123
- "regexp_split_to_array(lower(title), '\\W') @> regexp_split_to_array(lower(%(query)s), '\\W')"
124
- )
125
- filter_params["query"] = catalog_query.query
119
+ if catalog_query.query and catalog_query.query.query:
120
+ filter_sql.append(_prepare_query_search(catalog_query.query, filter_params))
126
121
 
127
122
  if catalog_query.filters:
128
123
  filter_sql.append(_convert_filter(catalog_query.filters, filter_params))
@@ -133,6 +128,41 @@ def _prepare_query_filters(catalog_query: CatalogQuery) -> tuple[str, dict[str,
133
128
  )
134
129
 
135
130
 
131
def _escape_like_pattern(text: str) -> str:
    """Escape LIKE/ILIKE metacharacters so ``text`` is matched literally."""
    # The backslash (default LIKE escape character) must be handled first,
    # otherwise the escapes added for % and _ would be escaped again.
    return text.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")


def _prepare_query_search(query: search_models.CatalogQuery, params: dict[str, Any]) -> str:
    """Build the SQL condition for the text query of a catalog search.

    The value to match is stored in ``params["query"]`` (the mapping is
    mutated in place) and the returned SQL fragment refers to it through the
    ``%(query)s`` placeholder.

    ``Exact`` and ``StartsWith`` use the column named by ``query.field``; the
    remaining match types are only supported on ``title``.

    For ``StartsWith``, ``EndsWith`` and ``Contains`` the user text is escaped
    before the wildcard is added, so ``%``, ``_`` and ``\\`` typed by the user
    are matched literally instead of acting as pattern metacharacters.

    Raises:
        ValueError: if ``query.match`` is not one of the handled match types.
    """
    if query.match == search_models.CatalogQueryMatch.Exact:
        params["query"] = query.query
        return f"{query.field.value} = %(query)s"
    elif query.match == search_models.CatalogQueryMatch.StartsWith:
        params["query"] = _escape_like_pattern(query.query) + "%"
        if query.field == search_models.CatalogQueryField.Title:
            # Insensitive search supported by pg_trgm for title
            return f"{query.field.value} ILIKE %(query)s"
        else:
            # Sensitive search for slug (btree does not support ILIKE and slugs are all lowercase anyway)
            return f"{query.field.value} LIKE %(query)s"
    # The rest of operators only supported by title
    elif query.match == search_models.CatalogQueryMatch.Words:
        # This is doing tokenization inside the SQL server (to keep the index updated). We could move it to
        # the python code at update/query time if it ever becomes a problem but for now, a single regex
        # executed per query is not a problem.
        params["query"] = query.query
        return "regexp_split_to_array(lower(title), '\\W') @> regexp_split_to_array(lower(%(query)s), '\\W')"
    elif query.match == search_models.CatalogQueryMatch.Fuzzy:
        params["query"] = query.query
        # Note: the SQL operator is %>; it is written %%> because psycopg
        # treats a single % as the start of a placeholder.
        return "title %%> %(query)s"
    elif query.match == search_models.CatalogQueryMatch.EndsWith:
        params["query"] = "%" + _escape_like_pattern(query.query)
        return "title ILIKE %(query)s"
    elif query.match == search_models.CatalogQueryMatch.Contains:
        params["query"] = "%" + _escape_like_pattern(query.query) + "%"
        return "title ILIKE %(query)s"
    else:  # pragma: nocover
        # This is a trick so mypy generates an error if this branch can be reached,
        # that is, if we are missing some ifs
        _a: int = "a"
        # Fail loudly instead of implicitly returning None, which is not a
        # valid SQL fragment for the caller.
        raise ValueError(f"Unsupported catalog query match: {query.match}")
164
+
165
+
136
166
  def _prepare_query(catalog_query: CatalogQuery) -> tuple[str, dict[str, Any]]:
137
167
  # Base query with all the filters
138
168
  query, filter_params = _prepare_query_filters(catalog_query)
@@ -188,7 +218,7 @@ async def pgcatalog_search(catalog_query: CatalogQuery) -> Resources:
188
218
  if not (
189
219
  facet.startswith("/n/s") or facet.startswith("/n/i") or facet.startswith("/l")
190
220
  ):
191
- logger.warn(
221
+ logger.warning(
192
222
  f"Unexpected facet used at catalog: {facet}, kbid={catalog_query.kbid}"
193
223
  )
194
224
 
@@ -239,7 +269,7 @@ async def pgcatalog_search(catalog_query: CatalogQuery) -> Resources:
239
269
  )
240
270
  for r in data
241
271
  ],
242
- query=catalog_query.query,
272
+ query=catalog_query.query.query if catalog_query.query else "",
243
273
  total=total,
244
274
  page_number=catalog_query.page_number,
245
275
  page_size=catalog_query.page_size,
@@ -172,7 +172,7 @@ class CatalogExpression:
172
172
 
173
173
  class CatalogQuery(BaseModel):
174
174
  kbid: str
175
- query: str
175
+ query: Optional[search_models.CatalogQuery]
176
176
  filters: Optional[CatalogExpression]
177
177
  sort: search_models.SortOptions
178
178
  faceted: list[str]
@@ -84,9 +84,20 @@ async def parse_catalog(kbid: str, item: search_models.CatalogRequest) -> Catalo
84
84
  limit=None,
85
85
  )
86
86
 
87
+ if isinstance(item.query, search_models.CatalogQuery):
88
+ query = item.query
89
+ elif isinstance(item.query, str) and len(item.query) > 0:
90
+ query = search_models.CatalogQuery(
91
+ field=search_models.CatalogQueryField.Title,
92
+ match=search_models.CatalogQueryMatch.Words,
93
+ query=item.query,
94
+ )
95
+ else:
96
+ query = None
97
+
87
98
  return CatalogQuery(
88
99
  kbid=kbid,
89
- query=item.query,
100
+ query=query,
90
101
  filters=catalog_expr,
91
102
  sort=sort,
92
103
  faceted=item.faceted,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nucliadb
3
- Version: 6.4.2.post4403
3
+ Version: 6.5.0.post4408
4
4
  Summary: NucliaDB
5
5
  Author-email: Nuclia <nucliadb@nuclia.com>
6
6
  License-Expression: AGPL-3.0-or-later
@@ -19,11 +19,11 @@ Classifier: Programming Language :: Python :: 3.12
19
19
  Classifier: Programming Language :: Python :: 3 :: Only
20
20
  Requires-Python: <4,>=3.9
21
21
  Description-Content-Type: text/markdown
22
- Requires-Dist: nucliadb-telemetry[all]>=6.4.2.post4403
23
- Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.4.2.post4403
24
- Requires-Dist: nucliadb-protos>=6.4.2.post4403
25
- Requires-Dist: nucliadb-models>=6.4.2.post4403
26
- Requires-Dist: nidx-protos>=6.4.2.post4403
22
+ Requires-Dist: nucliadb-telemetry[all]>=6.5.0.post4408
23
+ Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.5.0.post4408
24
+ Requires-Dist: nucliadb-protos>=6.5.0.post4408
25
+ Requires-Dist: nucliadb-models>=6.5.0.post4408
26
+ Requires-Dist: nidx-protos>=6.5.0.post4408
27
27
  Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
28
28
  Requires-Dist: nuclia-models>=0.24.2
29
29
  Requires-Dist: uvicorn[standard]
@@ -37,6 +37,7 @@ migrations/pg/0002_catalog.py,sha256=Rsleecu351Ty19kYZgOpqX5G3MEAY8nMxCJrAeuS2Mw
37
37
  migrations/pg/0003_catalog_kbid_index.py,sha256=uKq_vtnuf73GVf0mtl2rhzdk_czAoEU1UdiVKVZpA0M,1044
38
38
  migrations/pg/0004_catalog_facets.py,sha256=FJFASHjfEHG3sNve9BP2HnnLO4xr7dnR6Qpctnmt4LE,2180
39
39
  migrations/pg/0005_purge_tasks_index.py,sha256=3mtyFgpcK0QQ_NONYay7V9xICijCLNkyTPuoc0PBjRg,1139
40
+ migrations/pg/0006_catalog_title_indexes.py,sha256=n2OGxwE4oeCwHAYaxBkja4t10BmwTjZ2IoCyOdjEBSc,1710
40
41
  migrations/pg/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
41
42
  nucliadb/__init__.py,sha256=_abCmDJ_0ku483Os4UAjPX7Nywm39cQgAV_DiyjsKeQ,891
42
43
  nucliadb/health.py,sha256=UIxxA4oms4HIsCRZM_SZsdkIZIlgzmOxw-qSHLlWuak,3465
@@ -215,7 +216,7 @@ nucliadb/search/utilities.py,sha256=9SsRDw0rJVXVoLBfF7rBb6q080h-thZc7u8uRcTiBeY,
215
216
  nucliadb/search/api/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
216
217
  nucliadb/search/api/v1/__init__.py,sha256=DH16OYnw9jQ38OpKlmdXeoq2j40ZPXZRtGvClKOkMhw,1239
217
218
  nucliadb/search/api/v1/ask.py,sha256=b4tz33HNsfT5DXv_2DMc_jirnFsHuobreWkbAKkzj5o,5337
218
- nucliadb/search/api/v1/catalog.py,sha256=mVAPPf6CXimVOsBpbhPo63KXf8eXps--cifZOEQAIyk,7714
219
+ nucliadb/search/api/v1/catalog.py,sha256=3SqLgwFkFFY8x-xBruHQaZ0EGpf7oKbSj-_PnobV68E,7747
219
220
  nucliadb/search/api/v1/feedback.py,sha256=kNLc4dHz2SXHzV0PwC1WiRAwY88fDptPcP-kO0q-FrQ,2620
220
221
  nucliadb/search/api/v1/find.py,sha256=iMjyq4y0JOMC_x1B8kUfVdkCoc9G9Ark58kPLLY4HDw,10824
221
222
  nucliadb/search/api/v1/graph.py,sha256=gthqxCOn9biE6D6s93jRGLglk0ono8U7OyS390kWiI8,4178
@@ -248,7 +249,7 @@ nucliadb/search/search/ingestion_agents.py,sha256=IK6yOPEF9rST_uoqspdVdPk0pldjDh
248
249
  nucliadb/search/search/merge.py,sha256=XiRBsxhYPshPV7lZXD-9E259KZOPIf4I2tKosY0lPo4,22470
249
250
  nucliadb/search/search/metrics.py,sha256=3I6IN0qDSmqIvUaWJmT3rt-Jyjs6LcvnKI8ZqCiuJPY,3501
250
251
  nucliadb/search/search/paragraphs.py,sha256=pNAEiYqJGGUVcEf7xf-PFMVqz0PX4Qb-WNG-_zPGN2o,7799
251
- nucliadb/search/search/pgcatalog.py,sha256=s_J98fsX_RuFXwpejpkGqG-tD9ELuzz4YQ6U3ew5h2g,9313
252
+ nucliadb/search/search/pgcatalog.py,sha256=QtgArjoM-dW_B1oO0aXqp5au7GlLG8jAct9jevUHatw,10997
252
253
  nucliadb/search/search/predict_proxy.py,sha256=JwgBeEg1j4LnCjPCvTUrnmOd9LceJAt3iAu4m9cmJBo,3390
253
254
  nucliadb/search/search/query.py,sha256=0qIQdt548L3jtKOyKo06aGJ73SLBxAW3N38_Hc1M3Uw,11528
254
255
  nucliadb/search/search/rank_fusion.py,sha256=xZtXhbmKb_56gs73u6KkFm2efvTATOSMmpOV2wrAIqE,9613
@@ -265,11 +266,11 @@ nucliadb/search/search/chat/query.py,sha256=3jMPNbiFEOoS0ydMOPYkSx1qVlvAv51npzad
265
266
  nucliadb/search/search/query_parser/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
266
267
  nucliadb/search/search/query_parser/exceptions.py,sha256=sVl9gRNzhE-s480LBBVkiXzNRbKhYRQN5F3it5tNNp8,939
267
268
  nucliadb/search/search/query_parser/fetcher.py,sha256=nP4EySj2BvH10QgCvgzvp13Nf22wwfHsdLbDoPlH2cQ,16831
268
- nucliadb/search/search/query_parser/models.py,sha256=k9cCjTpndP9ynr8A9J8MBmDYmjLBKL1UM4L0GXVuJw0,5031
269
+ nucliadb/search/search/query_parser/models.py,sha256=kAslqX_-zaIdUpcpdNU2a5uQPQh7LC605qWLZ4aZ5T4,5064
269
270
  nucliadb/search/search/query_parser/old_filters.py,sha256=HircRqYEac_90bNCtFIJZ2RKA90kjbpNOQcp_ArBqR0,9083
270
271
  nucliadb/search/search/query_parser/parsers/__init__.py,sha256=ySCNSdbesLXGZyR88919njulA6UE10_3PhqMG_Yj1o4,1034
271
272
  nucliadb/search/search/query_parser/parsers/ask.py,sha256=eTz8wS-EJHuAagR384h6TT64itymFZRpfZJGX8r6aZM,2771
272
- nucliadb/search/search/query_parser/parsers/catalog.py,sha256=e89kh3nGV9JT9wjdWB8JbC2HPydn0rVk7WsKBo6q3gw,7122
273
+ nucliadb/search/search/query_parser/parsers/catalog.py,sha256=JuDiBL2wdjAuEFEPo0e2nQ4VqWjF3FXakT0ziZk3Oes,7495
273
274
  nucliadb/search/search/query_parser/parsers/common.py,sha256=mJMPOKurBK7-A7s3oNlPLxHP_yIn4j5Uw8rh_OQtzS4,6339
274
275
  nucliadb/search/search/query_parser/parsers/find.py,sha256=lHVspg-i_eWXvu7BT9WfuFVGVKYhr380y4tDX5yfTD4,12735
275
276
  nucliadb/search/search/query_parser/parsers/graph.py,sha256=zyqdUg5Afmhb2_-hvj9FUCaoLh026MUP1fgY2j-lD7c,9385
@@ -369,8 +370,8 @@ nucliadb/writer/tus/local.py,sha256=7jYa_w9b-N90jWgN2sQKkNcomqn6JMVBOVeDOVYJHto,
369
370
  nucliadb/writer/tus/s3.py,sha256=vF0NkFTXiXhXq3bCVXXVV-ED38ECVoUeeYViP8uMqcU,8357
370
371
  nucliadb/writer/tus/storage.py,sha256=ToqwjoYnjI4oIcwzkhha_MPxi-k4Jk3Lt55zRwaC1SM,2903
371
372
  nucliadb/writer/tus/utils.py,sha256=MSdVbRsRSZVdkaum69_0wku7X3p5wlZf4nr6E0GMKbw,2556
372
- nucliadb-6.4.2.post4403.dist-info/METADATA,sha256=t1OV8EADzoCOzE8RN9vyuLl6APN2AwAXYANfpl6dVsg,4152
373
- nucliadb-6.4.2.post4403.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
374
- nucliadb-6.4.2.post4403.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
375
- nucliadb-6.4.2.post4403.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
376
- nucliadb-6.4.2.post4403.dist-info/RECORD,,
373
+ nucliadb-6.5.0.post4408.dist-info/METADATA,sha256=5PRi3_KM32DTF2_BSWmfndESWte8q1aH9fPh0SYJ6aE,4152
374
+ nucliadb-6.5.0.post4408.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
375
+ nucliadb-6.5.0.post4408.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
376
+ nucliadb-6.5.0.post4408.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
377
+ nucliadb-6.5.0.post4408.dist-info/RECORD,,