nucliadb 6.7.2.post4862__py3-none-any.whl → 6.9.2.post5282__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nucliadb might be problematic; see the registry's release details page for more information.

Files changed (126)
  1. migrations/0016_upgrade_to_paragraphs_v2.py +1 -1
  2. migrations/0017_multiple_writable_shards.py +1 -1
  3. migrations/0018_purge_orphan_kbslugs.py +1 -1
  4. migrations/0019_upgrade_to_paragraphs_v3.py +1 -1
  5. migrations/0021_overwrite_vectorsets_key.py +1 -1
  6. migrations/0023_backfill_pg_catalog.py +7 -3
  7. migrations/0025_assign_models_to_kbs_v2.py +3 -3
  8. migrations/0027_rollover_texts3.py +1 -1
  9. migrations/0028_extracted_vectors_reference.py +1 -1
  10. migrations/0029_backfill_field_status.py +1 -1
  11. migrations/0032_remove_old_relations.py +1 -1
  12. migrations/0036_backfill_catalog_slug.py +1 -1
  13. migrations/0037_backfill_catalog_facets.py +1 -1
  14. migrations/0038_backfill_catalog_field_labels.py +7 -3
  15. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  16. migrations/0040_migrate_search_configurations.py +79 -0
  17. migrations/pg/0010_shards_index.py +34 -0
  18. nucliadb/backups/create.py +3 -3
  19. nucliadb/backups/restore.py +3 -3
  20. nucliadb/common/cache.py +1 -1
  21. nucliadb/common/catalog/__init__.py +79 -0
  22. nucliadb/common/catalog/dummy.py +36 -0
  23. nucliadb/common/catalog/interface.py +85 -0
  24. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +294 -208
  25. nucliadb/common/catalog/utils.py +56 -0
  26. nucliadb/common/cluster/manager.py +3 -19
  27. nucliadb/common/cluster/rebalance.py +484 -110
  28. nucliadb/common/cluster/rollover.py +29 -0
  29. nucliadb/common/cluster/settings.py +1 -1
  30. nucliadb/common/cluster/utils.py +26 -0
  31. nucliadb/common/datamanagers/atomic.py +6 -0
  32. nucliadb/common/datamanagers/utils.py +2 -2
  33. nucliadb/common/external_index_providers/manager.py +1 -29
  34. nucliadb/common/external_index_providers/settings.py +1 -27
  35. nucliadb/common/filter_expression.py +16 -33
  36. nucliadb/common/http_clients/exceptions.py +8 -0
  37. nucliadb/common/http_clients/processing.py +4 -0
  38. nucliadb/common/http_clients/utils.py +3 -0
  39. nucliadb/common/ids.py +77 -55
  40. nucliadb/common/locking.py +4 -4
  41. nucliadb/common/maindb/driver.py +11 -1
  42. nucliadb/common/maindb/local.py +1 -1
  43. nucliadb/common/maindb/pg.py +1 -1
  44. nucliadb/common/nidx.py +19 -1
  45. nucliadb/common/vector_index_config.py +1 -1
  46. nucliadb/export_import/datamanager.py +3 -3
  47. nucliadb/ingest/consumer/pull.py +7 -0
  48. nucliadb/ingest/consumer/service.py +2 -27
  49. nucliadb/ingest/consumer/shard_creator.py +17 -6
  50. nucliadb/ingest/fields/base.py +9 -17
  51. nucliadb/ingest/fields/conversation.py +47 -1
  52. nucliadb/ingest/orm/brain_v2.py +21 -3
  53. nucliadb/ingest/orm/index_message.py +126 -111
  54. nucliadb/ingest/orm/knowledgebox.py +84 -43
  55. nucliadb/ingest/orm/processor/auditing.py +1 -1
  56. nucliadb/ingest/orm/processor/processor.py +95 -149
  57. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  58. nucliadb/ingest/orm/resource.py +10 -1
  59. nucliadb/ingest/partitions.py +12 -1
  60. nucliadb/ingest/serialize.py +2 -2
  61. nucliadb/ingest/service/writer.py +26 -19
  62. nucliadb/ingest/settings.py +33 -11
  63. nucliadb/learning_proxy.py +12 -15
  64. nucliadb/metrics_exporter.py +17 -4
  65. nucliadb/migrator/datamanager.py +11 -17
  66. nucliadb/migrator/migrator.py +2 -2
  67. nucliadb/purge/__init__.py +12 -17
  68. nucliadb/purge/orphan_shards.py +2 -2
  69. nucliadb/reader/api/v1/knowledgebox.py +40 -12
  70. nucliadb/reader/api/v1/learning_config.py +30 -10
  71. nucliadb/reader/api/v1/resource.py +2 -2
  72. nucliadb/reader/api/v1/services.py +1 -1
  73. nucliadb/reader/reader/notifications.py +1 -1
  74. nucliadb/search/api/v1/__init__.py +1 -0
  75. nucliadb/search/api/v1/catalog.py +4 -4
  76. nucliadb/search/api/v1/find.py +1 -4
  77. nucliadb/search/api/v1/hydrate.py +328 -0
  78. nucliadb/search/api/v1/resource/ask.py +21 -1
  79. nucliadb/search/api/v1/search.py +1 -4
  80. nucliadb/search/predict.py +9 -2
  81. nucliadb/search/search/cache.py +1 -20
  82. nucliadb/search/search/chat/ask.py +50 -8
  83. nucliadb/search/search/chat/prompt.py +47 -15
  84. nucliadb/search/search/chat/query.py +8 -1
  85. nucliadb/search/search/fetch.py +1 -1
  86. nucliadb/search/search/find.py +1 -6
  87. nucliadb/search/search/{hydrator.py → hydrator/__init__.py} +5 -4
  88. nucliadb/search/search/hydrator/fields.py +175 -0
  89. nucliadb/search/search/hydrator/images.py +130 -0
  90. nucliadb/search/search/hydrator/paragraphs.py +307 -0
  91. nucliadb/search/search/hydrator/resources.py +56 -0
  92. nucliadb/search/search/metrics.py +16 -0
  93. nucliadb/search/search/predict_proxy.py +33 -11
  94. nucliadb/search/search/query.py +0 -23
  95. nucliadb/search/search/query_parser/fetcher.py +5 -5
  96. nucliadb/search/search/query_parser/models.py +1 -30
  97. nucliadb/search/search/query_parser/parsers/ask.py +1 -1
  98. nucliadb/search/search/query_parser/parsers/catalog.py +4 -7
  99. nucliadb/search/search/query_parser/parsers/common.py +16 -7
  100. nucliadb/search/search/query_parser/parsers/find.py +0 -11
  101. nucliadb/search/search/query_parser/parsers/graph.py +5 -5
  102. nucliadb/search/search/query_parser/parsers/search.py +0 -11
  103. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +4 -11
  104. nucliadb/search/search/rerankers.py +1 -1
  105. nucliadb/search/search/summarize.py +1 -1
  106. nucliadb/standalone/run.py +3 -0
  107. nucliadb/tasks/retries.py +4 -4
  108. nucliadb/train/generators/sentence_classifier.py +2 -8
  109. nucliadb/train/generators/utils.py +1 -1
  110. nucliadb/train/nodes.py +4 -4
  111. nucliadb/train/servicer.py +1 -1
  112. nucliadb/train/uploader.py +1 -1
  113. nucliadb/writer/api/v1/field.py +14 -9
  114. nucliadb/writer/api/v1/knowledgebox.py +15 -52
  115. nucliadb/writer/api/v1/learning_config.py +5 -4
  116. nucliadb/writer/api/v1/resource.py +2 -2
  117. nucliadb/writer/resource/field.py +38 -2
  118. nucliadb/writer/tus/azure.py +4 -4
  119. nucliadb/writer/tus/gcs.py +11 -17
  120. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/METADATA +9 -10
  121. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/RECORD +124 -114
  122. nucliadb/common/external_index_providers/pinecone.py +0 -894
  123. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  124. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/WHEEL +0 -0
  125. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/entry_points.txt +0 -0
  126. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/top_level.txt +0 -0
@@ -26,9 +26,6 @@ from nidx_protos.noderesources_pb2 import Resource
26
26
  from nucliadb.common import datamanagers
27
27
  from nucliadb.common.exceptions import InvalidQueryError
28
28
  from nucliadb.common.filter_expression import add_and_expression, parse_expression
29
- from nucliadb.search.search.filters import (
30
- translate_label,
31
- )
32
29
  from nucliadb.search.search.query_parser.fetcher import Fetcher
33
30
  from nucliadb_models.filters import FilterExpression
34
31
  from nucliadb_models.labels import LABEL_HIDDEN
@@ -166,26 +163,6 @@ def expand_entities(
166
163
  return list(result_entities.values())
167
164
 
168
165
 
169
- def apply_entities_filter(
170
- request: nodereader_pb2.SearchRequest,
171
- detected_entities: list[utils_pb2.RelationNode],
172
- ) -> list[str]:
173
- added_filters = []
174
- for entity_filter in [
175
- f"/e/{entity.subtype}/{entity.value}"
176
- for entity in detected_entities
177
- if entity.ntype == utils_pb2.RelationNode.NodeType.ENTITY
178
- ]:
179
- if entity_filter not in added_filters:
180
- added_filters.append(entity_filter)
181
- # Add the entity to the filter expression (with AND)
182
- entity_expr = nodereader_pb2.FilterExpression()
183
- entity_expr.facet.facet = translate_label(entity_filter)
184
- add_and_expression(request.field_filter, entity_expr)
185
-
186
- return added_filters
187
-
188
-
189
166
  async def suggest_query_to_pb(
190
167
  kbid: str,
191
168
  features: list[SuggestOptions],
@@ -393,7 +393,7 @@ async def get_matryoshka_dimension_cached(kbid: str, vectorset: str) -> Optional
393
393
 
394
394
  @query_parse_dependency_observer.wrap({"type": "matryoshka_dimension"})
395
395
  async def get_matryoshka_dimension(kbid: str, vectorset: Optional[str]) -> Optional[int]:
396
- async with get_driver().transaction(read_only=True) as txn:
396
+ async with get_driver().ro_transaction() as txn:
397
397
  matryoshka_dimension = None
398
398
  if not vectorset:
399
399
  # XXX this should be migrated once we remove the "default" vectorset
@@ -409,23 +409,23 @@ async def get_matryoshka_dimension(kbid: str, vectorset: Optional[str]) -> Optio
409
409
 
410
410
  @query_parse_dependency_observer.wrap({"type": "classification_labels"})
411
411
  async def get_classification_labels(kbid: str) -> knowledgebox_pb2.Labels:
412
- async with get_driver().transaction(read_only=True) as txn:
412
+ async with get_driver().ro_transaction() as txn:
413
413
  return await datamanagers.labels.get_labels(txn, kbid=kbid)
414
414
 
415
415
 
416
416
  @query_parse_dependency_observer.wrap({"type": "synonyms"})
417
417
  async def get_kb_synonyms(kbid: str) -> Optional[knowledgebox_pb2.Synonyms]:
418
- async with get_driver().transaction(read_only=True) as txn:
418
+ async with get_driver().ro_transaction() as txn:
419
419
  return await datamanagers.synonyms.get(txn, kbid=kbid)
420
420
 
421
421
 
422
422
  @query_parse_dependency_observer.wrap({"type": "entities_meta_cache"})
423
423
  async def get_entities_meta_cache(kbid: str) -> datamanagers.entities.EntitiesMetaCache:
424
- async with get_driver().transaction(read_only=True) as txn:
424
+ async with get_driver().ro_transaction() as txn:
425
425
  return await datamanagers.entities.get_entities_meta_cache(txn, kbid=kbid)
426
426
 
427
427
 
428
428
  @query_parse_dependency_observer.wrap({"type": "deleted_entities_groups"})
429
429
  async def get_deleted_entity_groups(kbid: str) -> list[str]:
430
- async with get_driver().transaction(read_only=True) as txn:
430
+ async with get_driver().ro_transaction() as txn:
431
431
  return list((await datamanagers.entities.get_deleted_groups(txn, kbid=kbid)).entities_groups)
@@ -17,9 +17,8 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from dataclasses import dataclass
21
20
  from datetime import datetime
22
- from typing import Literal, Optional, Union
21
+ from typing import Optional, Union
23
22
 
24
23
  from nidx_protos import nodereader_pb2
25
24
  from pydantic import BaseModel, ConfigDict, Field
@@ -86,7 +85,6 @@ class Filters(BaseModel):
86
85
  nodereader_pb2.FilterOperator.AND
87
86
  )
88
87
 
89
- autofilter: Optional[list[utils_pb2.RelationNode]] = None
90
88
  facets: list[str] = Field(default_factory=list)
91
89
  hidden: Optional[bool] = None
92
90
  security: Optional[search_models.RequestSecurity] = None
@@ -153,33 +151,6 @@ class ParsedQuery(BaseModel):
153
151
  generation: Optional[Generation] = None
154
152
 
155
153
 
156
- ### Catalog
157
- @dataclass
158
- class CatalogExpression:
159
- @dataclass
160
- class Date:
161
- field: Union[Literal["created_at"], Literal["modified_at"]]
162
- since: Optional[datetime]
163
- until: Optional[datetime]
164
-
165
- bool_and: Optional[list["CatalogExpression"]] = None
166
- bool_or: Optional[list["CatalogExpression"]] = None
167
- bool_not: Optional["CatalogExpression"] = None
168
- date: Optional[Date] = None
169
- facet: Optional[str] = None
170
- resource_id: Optional[str] = None
171
-
172
-
173
- class CatalogQuery(BaseModel):
174
- kbid: str
175
- query: Optional[search_models.CatalogQuery]
176
- filters: Optional[CatalogExpression]
177
- sort: search_models.SortOptions
178
- faceted: list[str]
179
- page_size: int
180
- page_number: int
181
-
182
-
183
154
  ### Graph
184
155
 
185
156
 
@@ -63,7 +63,7 @@ class _AskParser:
63
63
  )
64
64
  elif isinstance(self.item.max_tokens, MaxTokens):
65
65
  max_tokens = self.item.max_tokens
66
- else: # pragma: nocover
66
+ else: # pragma: no cover
67
67
  # This is a trick so mypy generates an error if this branch can be reached,
68
68
  # that is, if we are missing some ifs
69
69
  _a: int = "a"
@@ -19,13 +19,10 @@
19
19
  #
20
20
 
21
21
  from nucliadb.common import datamanagers
22
+ from nucliadb.common.catalog.interface import CatalogExpression, CatalogQuery
22
23
  from nucliadb.common.exceptions import InvalidQueryError
23
- from nucliadb.common.filter_expression import FacetFilterTypes, facet_from_filter
24
+ from nucliadb.common.filter_expression import FacetFilter, facet_from_filter
24
25
  from nucliadb.search.search.filters import translate_label
25
- from nucliadb.search.search.query_parser.models import (
26
- CatalogExpression,
27
- CatalogQuery,
28
- )
29
26
  from nucliadb_models import search as search_models
30
27
  from nucliadb_models.filters import (
31
28
  And,
@@ -185,14 +182,14 @@ async def parse_filter_expression(expr: ResourceFilterExpression, kbid: str) ->
185
182
  if rid is None:
186
183
  raise InvalidQueryError("slug", f"Cannot find slug {expr.slug}")
187
184
  cat.resource_id = rid
188
- else: # pragma: nocover
185
+ else: # pragma: no cover
189
186
  # Cannot happen due to model validation
190
187
  raise ValueError("Resource needs id or slug")
191
188
  elif isinstance(expr, DateCreated):
192
189
  cat.date = CatalogExpression.Date(field="created_at", since=expr.since, until=expr.until)
193
190
  elif isinstance(expr, DateModified):
194
191
  cat.date = CatalogExpression.Date(field="modified_at", since=expr.since, until=expr.until)
195
- elif isinstance(expr, FacetFilterTypes):
192
+ elif isinstance(expr, FacetFilter):
196
193
  cat.facet = facet_from_filter(expr)
197
194
  else:
198
195
  # This is a trick so mypy generates an error if this branch can be reached,
@@ -21,7 +21,6 @@ import re
21
21
  import string
22
22
  from typing import Optional, Union
23
23
 
24
- from nucliadb.common.exceptions import InvalidQueryError
25
24
  from nucliadb.search import logger
26
25
  from nucliadb.search.search.query_parser.fetcher import Fetcher
27
26
  from nucliadb.search.search.query_parser.models import (
@@ -32,15 +31,20 @@ from nucliadb_models import search as search_models
32
31
 
33
32
  DEFAULT_GENERIC_SEMANTIC_THRESHOLD = 0.7
34
33
 
35
- # -* is an invalid query in tantivy and it won't return results but if you add some whitespaces
36
- # between - and *, it will actually trigger a tantivy bug and panic
37
- INVALID_QUERY = re.compile(r"- +\*")
38
34
 
35
+ def validate_query_syntax(query: str) -> str:
36
+ """Filter some queries that panic tantivy, better than returning the 500"""
39
37
 
40
- def validate_query_syntax(query: str):
41
- # Filter some queries that panic tantivy, better than returning the 500
38
+ # -* is an invalid query in tantivy and it won't return results but if you add some whitespaces
39
+ # between - and *, it will actually trigger a tantivy bug and panic
40
+ INVALID_QUERY = re.compile(r"- *\*+")
42
41
  if INVALID_QUERY.search(query):
43
- raise InvalidQueryError("query", "Invalid query syntax")
42
+ # remove the * and extra spaces, as it's probably what doesn't have
43
+ # meaning in both cases: -* and - *
44
+ fixed = re.sub(INVALID_QUERY, "- ", query)
45
+ query = fixed
46
+
47
+ return query
44
48
 
45
49
 
46
50
  def is_empty_query(request: search_models.BaseSearchRequest) -> bool:
@@ -85,6 +89,7 @@ async def parse_keyword_query(
85
89
  fetcher: Fetcher,
86
90
  ) -> KeywordQuery:
87
91
  query = item.query
92
+
88
93
  # If there was a rephrase with image, we should use the rephrased query for keyword search
89
94
  rephrased_query = await fetcher.get_rephrased_query()
90
95
  if item.query_image is not None and rephrased_query is not None:
@@ -98,6 +103,10 @@ async def parse_keyword_query(
98
103
  query = synonyms_query
99
104
  is_synonyms_query = True
100
105
 
106
+ # after all query transformations, pass a validator that can fix some
107
+ # queries that trigger a panic on the index
108
+ query = validate_query_syntax(query)
109
+
101
110
  min_score = parse_keyword_min_score(item.min_score)
102
111
 
103
112
  return KeywordQuery(
@@ -57,7 +57,6 @@ from .common import (
57
57
  parse_semantic_query,
58
58
  parse_top_k,
59
59
  should_disable_vector_search,
60
- validate_query_syntax,
61
60
  )
62
61
 
63
62
 
@@ -146,8 +145,6 @@ class _FindParser:
146
145
  return retrieval
147
146
 
148
147
  def _validate_request(self):
149
- validate_query_syntax(self.item.query)
150
-
151
148
  # synonyms are not compatible with vector/graph search
152
149
  if (
153
150
  self.item.with_synonyms
@@ -256,17 +253,9 @@ class _FindParser:
256
253
  else:
257
254
  filter_operator = nodereader_pb2.FilterOperator.AND
258
255
 
259
- autofilter = None
260
- if self.item.autofilter:
261
- if self._query.relation is not None:
262
- autofilter = self._query.relation.entry_points
263
- else:
264
- autofilter = await self._get_detected_entities()
265
-
266
256
  hidden = await filter_hidden_resources(self.kbid, self.item.show_hidden)
267
257
 
268
258
  return Filters(
269
- autofilter=autofilter,
270
259
  facets=[],
271
260
  field_expression=field_expr,
272
261
  paragraph_expression=paragraph_expr,
@@ -153,7 +153,7 @@ def parse_path_query(expr: graph_requests.GraphPathQuery) -> nodereader_pb2.Grap
153
153
  elif isinstance(expr, graph_requests.Generated):
154
154
  _set_generated_to_pb(expr, pb)
155
155
 
156
- else: # pragma: nocover
156
+ else: # pragma: no cover
157
157
  # This is a trick so mypy generates an error if this branch can be reached,
158
158
  # that is, if we are missing some ifs
159
159
  _a: int = "a"
@@ -182,7 +182,7 @@ def _parse_node_query(expr: graph_requests.GraphNodesQuery) -> nodereader_pb2.Gr
182
182
  elif isinstance(expr, graph_requests.Generated):
183
183
  _set_generated_to_pb(expr, pb)
184
184
 
185
- else: # pragma: nocover
185
+ else: # pragma: no cover
186
186
  # This is a trick so mypy generates an error if this branch can be reached,
187
187
  # that is, if we are missing some ifs
188
188
  _a: int = "a"
@@ -212,7 +212,7 @@ def _parse_relation_query(
212
212
  elif isinstance(expr, graph_requests.Generated):
213
213
  _set_generated_to_pb(expr, pb)
214
214
 
215
- else: # pragma: nocover
215
+ else: # pragma: no cover
216
216
  # This is a trick so mypy generates an error if this branch can be reached,
217
217
  # that is, if we are missing some ifs
218
218
  _a: int = "a"
@@ -230,7 +230,7 @@ def _set_node_to_pb(node: graph_requests.GraphNode, pb: nodereader_pb2.GraphQuer
230
230
  pb.fuzzy.kind = nodereader_pb2.GraphQuery.Node.MatchLocation.PREFIX
231
231
  pb.fuzzy.distance = 1
232
232
 
233
- else: # pragma: nocover
233
+ else: # pragma: no cover
234
234
  # This is a trick so mypy generates an error if this branch can be reached,
235
235
  # that is, if we are missing some ifs
236
236
  _a: int = "a"
@@ -263,7 +263,7 @@ def _set_generated_to_pb(generated: graph_requests.Generated, pb: nodereader_pb2
263
263
 
264
264
  pb.facet.facet = facet
265
265
 
266
- else: # pragma: nocover
266
+ else: # pragma: no cover
267
267
  # This is a trick so mypy generates an error if this branch can be reached,
268
268
  # that is, if we are missing some ifs
269
269
  _a: int = "a"
@@ -51,7 +51,6 @@ from .common import (
51
51
  parse_semantic_query,
52
52
  parse_top_k,
53
53
  should_disable_vector_search,
54
- validate_query_syntax,
55
54
  )
56
55
 
57
56
  INDEX_SORTABLE_FIELDS = [
@@ -128,8 +127,6 @@ class _SearchParser:
128
127
  return retrieval
129
128
 
130
129
  def _validate_request(self):
131
- validate_query_syntax(self.item.query)
132
-
133
130
  # synonyms are not compatible with vector/graph search
134
131
  if (
135
132
  self.item.with_synonyms
@@ -254,17 +251,9 @@ class _SearchParser:
254
251
  else:
255
252
  filter_operator = nodereader_pb2.FilterOperator.AND
256
253
 
257
- autofilter = None
258
- if self.item.autofilter:
259
- if self._query.relation is not None:
260
- autofilter = self._query.relation.entry_points
261
- else:
262
- autofilter = await self._get_detected_entities()
263
-
264
254
  hidden = await filter_hidden_resources(self.kbid, self.item.show_hidden)
265
255
 
266
256
  return Filters(
267
- autofilter=autofilter,
268
257
  facets=self.item.faceted,
269
258
  field_expression=field_expr,
270
259
  paragraph_expression=paragraph_expr,
@@ -25,10 +25,10 @@ from nidx_protos.nodereader_pb2 import SearchRequest
25
25
  from nucliadb.common.filter_expression import add_and_expression
26
26
  from nucliadb.search.search.filters import translate_label
27
27
  from nucliadb.search.search.metrics import node_features, query_parser_observer
28
- from nucliadb.search.search.query import apply_entities_filter, get_sort_field_proto
28
+ from nucliadb.search.search.query import get_sort_field_proto
29
29
  from nucliadb.search.search.query_parser.models import ParsedQuery, PredictReranker, UnitRetrieval
30
30
  from nucliadb.search.search.query_parser.parsers.graph import parse_path_query
31
- from nucliadb_models.labels import LABEL_HIDDEN, translate_system_to_alias_label
31
+ from nucliadb_models.labels import LABEL_HIDDEN
32
32
  from nucliadb_models.search import SortOrderMap
33
33
  from nucliadb_protos import utils_pb2
34
34
 
@@ -36,7 +36,7 @@ from nucliadb_protos import utils_pb2
36
36
  @query_parser_observer.wrap({"type": "convert_retrieval_to_proto"})
37
37
  async def legacy_convert_retrieval_to_proto(
38
38
  parsed: ParsedQuery,
39
- ) -> tuple[SearchRequest, bool, list[str], Optional[str]]:
39
+ ) -> tuple[SearchRequest, bool, Optional[str]]:
40
40
  converter = _Converter(parsed.retrieval)
41
41
  request = converter.into_search_request()
42
42
 
@@ -44,13 +44,12 @@ async def legacy_convert_retrieval_to_proto(
44
44
  # needed. We should find a better abstraction
45
45
 
46
46
  incomplete = is_incomplete(parsed.retrieval)
47
- autofilter = converter._autofilter
48
47
 
49
48
  rephrased_query = None
50
49
  if parsed.retrieval.query.semantic:
51
50
  rephrased_query = await parsed.fetcher.get_rephrased_query()
52
51
 
53
- return request, incomplete, autofilter, rephrased_query
52
+ return request, incomplete, rephrased_query
54
53
 
55
54
 
56
55
  @query_parser_observer.wrap({"type": "convert_retrieval_to_proto"})
@@ -65,8 +64,6 @@ class _Converter:
65
64
  self.req = nodereader_pb2.SearchRequest()
66
65
  self.retrieval = retrieval
67
66
 
68
- self._autofilter: list[str] = []
69
-
70
67
  def into_search_request(self) -> nodereader_pb2.SearchRequest:
71
68
  """Generate a SearchRequest proto from a retrieval operation."""
72
69
  self._apply_text_queries()
@@ -235,10 +232,6 @@ class _Converter:
235
232
  self.req.paragraph_filter.CopyFrom(self.retrieval.filters.paragraph_expression)
236
233
  self.req.filter_operator = self.retrieval.filters.filter_expression_operator
237
234
 
238
- if self.retrieval.filters.autofilter:
239
- entity_filters = apply_entities_filter(self.req, self.retrieval.filters.autofilter)
240
- self._autofilter.extend([translate_system_to_alias_label(e) for e in entity_filters])
241
-
242
235
  if self.retrieval.filters.hidden is not None:
243
236
  expr = nodereader_pb2.FilterExpression()
244
237
  if self.retrieval.filters.hidden:
@@ -181,7 +181,7 @@ def get_reranker(reranker: parser_models.Reranker) -> Reranker:
181
181
  elif isinstance(reranker, parser_models.PredictReranker):
182
182
  algorithm = PredictReranker(reranker.window)
183
183
 
184
- else: # pragma: nocover
184
+ else: # pragma: no cover
185
185
  # This is a trick so mypy generates an error if this branch can be reached,
186
186
  # that is, if we are missing some ifs
187
187
  _a: int = "a"
@@ -77,7 +77,7 @@ async def get_extracted_texts(kbid: str, resource_uuids_or_slugs: list[str]) ->
77
77
  tasks = []
78
78
 
79
79
  # Schedule getting extracted text for each field of each resource
80
- async with driver.transaction(read_only=True) as txn:
80
+ async with driver.ro_transaction() as txn:
81
81
  if not await datamanagers.kb.exists_kb(txn, kbid=kbid):
82
82
  raise datamanagers.exceptions.KnowledgeBoxNotFound(kbid)
83
83
 
@@ -116,6 +116,9 @@ def run():
116
116
  if nuclia_settings.nuclia_service_account:
117
117
  settings_to_output["NUA API key"] = "Configured ✔"
118
118
  settings_to_output["NUA API zone"] = nuclia_settings.nuclia_zone
119
+ settings_to_output["NUA API url"] = (
120
+ nuclia_settings.nuclia_public_url.format(zone=nuclia_settings.nuclia_zone) + "/api"
121
+ )
119
122
 
120
123
  settings_to_output_fmted = "\n".join(
121
124
  [f"|| - {k}:{' ' * (27 - len(k))}{v}" for k, v in settings_to_output.items()]
nucliadb/tasks/retries.py CHANGED
@@ -151,7 +151,7 @@ class TaskRetryHandler:
151
151
 
152
152
 
153
153
  async def _get_metadata(kv_driver: Driver, metadata_key: str) -> Optional[TaskMetadata]:
154
- async with kv_driver.transaction(read_only=True) as txn:
154
+ async with kv_driver.ro_transaction() as txn:
155
155
  metadata = await txn.get(metadata_key)
156
156
  if metadata is None:
157
157
  return None
@@ -159,7 +159,7 @@ async def _get_metadata(kv_driver: Driver, metadata_key: str) -> Optional[TaskMe
159
159
 
160
160
 
161
161
  async def _set_metadata(kv_driver: Driver, metadata_key: str, metadata: TaskMetadata) -> None:
162
- async with kv_driver.transaction() as txn:
162
+ async with kv_driver.rw_transaction() as txn:
163
163
  await txn.set(metadata_key, metadata.model_dump_json().encode())
164
164
  await txn.commit()
165
165
 
@@ -188,7 +188,7 @@ async def purge_batch(
188
188
  """
189
189
  Returns the next start key and the number of purged records. If start is None, it means there are no more records to purge.
190
190
  """
191
- async with kv_driver.transaction() as txn:
191
+ async with kv_driver.rw_transaction() as txn:
192
192
  txn = cast(PGTransaction, txn)
193
193
  async with txn.connection.cursor() as cur:
194
194
  await cur.execute(
@@ -226,7 +226,7 @@ async def purge_batch(
226
226
  while len(to_delete) > 0:
227
227
  batch = to_delete[:delete_batch_size]
228
228
  to_delete = to_delete[delete_batch_size:]
229
- async with kv_driver.transaction() as txn:
229
+ async with kv_driver.rw_transaction() as txn:
230
230
  for key in batch:
231
231
  logger.info("Purging task metadata", extra={"key": key})
232
232
  await txn.delete(key)
@@ -116,10 +116,7 @@ async def get_sentences(kbid: str, result: str) -> list[str]:
116
116
  if split is not None:
117
117
  text = extracted_text.split_text[split]
118
118
  for paragraph in field_metadata.split_metadata[split].paragraphs:
119
- if paragraph.key == "":
120
- key = f"{rid}/{field_type}/{field}/{paragraph.start}-{paragraph.end}"
121
- else:
122
- key = paragraph.key
119
+ key = f"{rid}/{field_type}/{field}/{paragraph.start}-{paragraph.end}"
123
120
  if key == result:
124
121
  for sentence in paragraph.sentences:
125
122
  splitted_text = text[sentence.start : sentence.end]
@@ -127,10 +124,7 @@ async def get_sentences(kbid: str, result: str) -> list[str]:
127
124
  else:
128
125
  text = extracted_text.text
129
126
  for paragraph in field_metadata.metadata.paragraphs:
130
- if paragraph.key == "":
131
- key = f"{rid}/{field_type}/{field}/{paragraph.start}-{paragraph.end}"
132
- else:
133
- key = paragraph.key
127
+ key = f"{rid}/{field_type}/{field}/{paragraph.start}-{paragraph.end}"
134
128
  if key == result:
135
129
  for sentence in paragraph.sentences:
136
130
  splitted_text = text[sentence.start : sentence.end]
@@ -41,7 +41,7 @@ async def get_resource_from_cache_or_db(kbid: str, uuid: str) -> Optional[Resour
41
41
 
42
42
  async def _get_resource_from_db(kbid: str, uuid: str) -> Optional[ResourceORM]:
43
43
  storage = await get_storage(service_name=SERVICE_NAME)
44
- async with get_driver().transaction(read_only=True) as transaction:
44
+ async with get_driver().ro_transaction() as transaction:
45
45
  kb = KnowledgeBoxORM(transaction, storage, kbid)
46
46
  return await kb.get(uuid)
47
47
 
nucliadb/train/nodes.py CHANGED
@@ -81,7 +81,7 @@ class TrainShardManager(manager.KBShardManager):
81
81
  return manager
82
82
 
83
83
  async def kb_sentences(self, request: GetSentencesRequest) -> AsyncIterator[TrainSentence]:
84
- async with self.driver.transaction(read_only=True) as txn:
84
+ async with self.driver.ro_transaction() as txn:
85
85
  kb = KnowledgeBox(txn, self.storage, request.kb.uuid)
86
86
  if request.uuid != "":
87
87
  # Filter by uuid
@@ -95,7 +95,7 @@ class TrainShardManager(manager.KBShardManager):
95
95
  yield sentence
96
96
 
97
97
  async def kb_paragraphs(self, request: GetParagraphsRequest) -> AsyncIterator[TrainParagraph]:
98
- async with self.driver.transaction(read_only=True) as txn:
98
+ async with self.driver.ro_transaction() as txn:
99
99
  kb = KnowledgeBox(txn, self.storage, request.kb.uuid)
100
100
  if request.uuid != "":
101
101
  # Filter by uuid
@@ -109,7 +109,7 @@ class TrainShardManager(manager.KBShardManager):
109
109
  yield paragraph
110
110
 
111
111
  async def kb_fields(self, request: GetFieldsRequest) -> AsyncIterator[TrainField]:
112
- async with self.driver.transaction(read_only=True) as txn:
112
+ async with self.driver.ro_transaction() as txn:
113
113
  kb = KnowledgeBox(txn, self.storage, request.kb.uuid)
114
114
  if request.uuid != "":
115
115
  # Filter by uuid
@@ -123,7 +123,7 @@ class TrainShardManager(manager.KBShardManager):
123
123
  yield field
124
124
 
125
125
  async def kb_resources(self, request: GetResourcesRequest) -> AsyncIterator[TrainResource]:
126
- async with self.driver.transaction(read_only=True) as txn:
126
+ async with self.driver.ro_transaction() as txn:
127
127
  kb = KnowledgeBox(txn, self.storage, request.kb.uuid)
128
128
  base = KB_RESOURCE_SLUG_BASE.format(kbid=request.kb.uuid)
129
129
  async for key in txn.keys(match=base):
@@ -89,7 +89,7 @@ class TrainServicer(train_pb2_grpc.TrainServicer):
89
89
  ) -> GetEntitiesResponse:
90
90
  kbid = request.kb.uuid
91
91
  response = GetEntitiesResponse()
92
- async with self.proc.driver.transaction(read_only=True) as txn:
92
+ async with self.proc.driver.ro_transaction() as txn:
93
93
  entities_manager = await self.proc.get_kb_entities_manager(txn, kbid)
94
94
  if entities_manager is None:
95
95
  await txn.abort()
@@ -75,7 +75,7 @@ class UploadServicer:
75
75
  ) -> GetEntitiesResponse:
76
76
  kbid = request.kb.uuid
77
77
  response = GetEntitiesResponse()
78
- async with self.proc.driver.transaction(read_only=True) as txn:
78
+ async with self.proc.driver.ro_transaction() as txn:
79
79
  kbobj = await self.proc.get_kb_obj(txn, request.kb)
80
80
  if kbobj is None:
81
81
  response.status = GetEntitiesResponse.Status.NOTFOUND
@@ -18,7 +18,7 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
  from inspect import iscoroutinefunction
21
- from typing import TYPE_CHECKING, Annotated, Callable, Optional, Type, Union
21
+ from typing import TYPE_CHECKING, Annotated, Callable, List, Optional, Type, Union
22
22
 
23
23
  import pydantic
24
24
  from fastapi import HTTPException, Query, Response
@@ -249,9 +249,10 @@ async def parse_conversation_field_adapter(
249
249
  writer: BrokerMessage,
250
250
  toprocess: PushPayload,
251
251
  resource_classifications: ResourceClassifications,
252
+ replace_field: bool = False,
252
253
  ):
253
254
  return await parse_conversation_field(
254
- field_id, field_payload, writer, toprocess, kbid, rid, resource_classifications
255
+ field_id, field_payload, writer, toprocess, kbid, rid, resource_classifications, replace_field
255
256
  )
256
257
 
257
258
 
@@ -380,7 +381,9 @@ async def add_resource_field_conversation_rslug_prefix(
380
381
  field_id: FieldIdString,
381
382
  field_payload: models.InputConversationField,
382
383
  ) -> ResourceFieldAdded:
383
- return await add_field_to_resource_by_slug(request, kbid, rslug, field_id, field_payload)
384
+ return await add_field_to_resource_by_slug(
385
+ request, kbid, rslug, field_id, field_payload, replace_field=True
386
+ )
384
387
 
385
388
 
386
389
  @api.put(
@@ -399,7 +402,7 @@ async def add_resource_field_conversation_rid_prefix(
399
402
  field_id: FieldIdString,
400
403
  field_payload: models.InputConversationField,
401
404
  ) -> ResourceFieldAdded:
402
- return await add_field_to_resource(request, kbid, rid, field_id, field_payload)
405
+ return await add_field_to_resource(request, kbid, rid, field_id, field_payload, replace_field=True)
403
406
 
404
407
 
405
408
  @api.put(
@@ -460,13 +463,15 @@ async def append_messages_to_conversation_field_rslug_prefix(
460
463
  kbid: str,
461
464
  rslug: str,
462
465
  field_id: FieldIdString,
463
- messages: list[models.InputMessage],
466
+ messages: List[models.InputMessage],
464
467
  ) -> ResourceFieldAdded:
465
468
  try:
466
469
  field = models.InputConversationField(messages=messages)
467
470
  except pydantic.ValidationError as e:
468
471
  raise HTTPException(status_code=422, detail=str(e))
469
- return await add_field_to_resource_by_slug(request, kbid, rslug, field_id, field)
472
+ return await add_field_to_resource_by_slug(
473
+ request, kbid, rslug, field_id, field, replace_field=False
474
+ )
470
475
 
471
476
 
472
477
  @api.put(
@@ -483,13 +488,13 @@ async def append_messages_to_conversation_field_rid_prefix(
483
488
  kbid: str,
484
489
  rid: str,
485
490
  field_id: FieldIdString,
486
- messages: list[models.InputMessage],
491
+ messages: List[models.InputMessage],
487
492
  ) -> ResourceFieldAdded:
488
493
  try:
489
494
  field = models.InputConversationField(messages=messages)
490
495
  except pydantic.ValidationError as e:
491
496
  raise HTTPException(status_code=422, detail=str(e))
492
- return await add_field_to_resource(request, kbid, rid, field_id, field)
497
+ return await add_field_to_resource(request, kbid, rid, field_id, field, replace_field=False)
493
498
 
494
499
 
495
500
  @api.delete(
@@ -572,7 +577,7 @@ async def reprocess_file_field(
572
577
  storage = await get_storage(service_name=SERVICE_NAME)
573
578
  driver = get_driver()
574
579
 
575
- async with driver.transaction(read_only=True) as txn:
580
+ async with driver.ro_transaction() as txn:
576
581
  kb = KnowledgeBox(txn, storage, kbid)
577
582
 
578
583
  resource = await kb.get(rid)