nucliadb 6.7.2.post4862__py3-none-any.whl → 6.9.2.post5282__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of nucliadb might be problematic.
- migrations/0016_upgrade_to_paragraphs_v2.py +1 -1
- migrations/0017_multiple_writable_shards.py +1 -1
- migrations/0018_purge_orphan_kbslugs.py +1 -1
- migrations/0019_upgrade_to_paragraphs_v3.py +1 -1
- migrations/0021_overwrite_vectorsets_key.py +1 -1
- migrations/0023_backfill_pg_catalog.py +7 -3
- migrations/0025_assign_models_to_kbs_v2.py +3 -3
- migrations/0027_rollover_texts3.py +1 -1
- migrations/0028_extracted_vectors_reference.py +1 -1
- migrations/0029_backfill_field_status.py +1 -1
- migrations/0032_remove_old_relations.py +1 -1
- migrations/0036_backfill_catalog_slug.py +1 -1
- migrations/0037_backfill_catalog_facets.py +1 -1
- migrations/0038_backfill_catalog_field_labels.py +7 -3
- migrations/0039_backfill_converation_splits_metadata.py +106 -0
- migrations/0040_migrate_search_configurations.py +79 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/backups/create.py +3 -3
- nucliadb/backups/restore.py +3 -3
- nucliadb/common/cache.py +1 -1
- nucliadb/common/catalog/__init__.py +79 -0
- nucliadb/common/catalog/dummy.py +36 -0
- nucliadb/common/catalog/interface.py +85 -0
- nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +294 -208
- nucliadb/common/catalog/utils.py +56 -0
- nucliadb/common/cluster/manager.py +3 -19
- nucliadb/common/cluster/rebalance.py +484 -110
- nucliadb/common/cluster/rollover.py +29 -0
- nucliadb/common/cluster/settings.py +1 -1
- nucliadb/common/cluster/utils.py +26 -0
- nucliadb/common/datamanagers/atomic.py +6 -0
- nucliadb/common/datamanagers/utils.py +2 -2
- nucliadb/common/external_index_providers/manager.py +1 -29
- nucliadb/common/external_index_providers/settings.py +1 -27
- nucliadb/common/filter_expression.py +16 -33
- nucliadb/common/http_clients/exceptions.py +8 -0
- nucliadb/common/http_clients/processing.py +4 -0
- nucliadb/common/http_clients/utils.py +3 -0
- nucliadb/common/ids.py +77 -55
- nucliadb/common/locking.py +4 -4
- nucliadb/common/maindb/driver.py +11 -1
- nucliadb/common/maindb/local.py +1 -1
- nucliadb/common/maindb/pg.py +1 -1
- nucliadb/common/nidx.py +19 -1
- nucliadb/common/vector_index_config.py +1 -1
- nucliadb/export_import/datamanager.py +3 -3
- nucliadb/ingest/consumer/pull.py +7 -0
- nucliadb/ingest/consumer/service.py +2 -27
- nucliadb/ingest/consumer/shard_creator.py +17 -6
- nucliadb/ingest/fields/base.py +9 -17
- nucliadb/ingest/fields/conversation.py +47 -1
- nucliadb/ingest/orm/brain_v2.py +21 -3
- nucliadb/ingest/orm/index_message.py +126 -111
- nucliadb/ingest/orm/knowledgebox.py +84 -43
- nucliadb/ingest/orm/processor/auditing.py +1 -1
- nucliadb/ingest/orm/processor/processor.py +95 -149
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +10 -1
- nucliadb/ingest/partitions.py +12 -1
- nucliadb/ingest/serialize.py +2 -2
- nucliadb/ingest/service/writer.py +26 -19
- nucliadb/ingest/settings.py +33 -11
- nucliadb/learning_proxy.py +12 -15
- nucliadb/metrics_exporter.py +17 -4
- nucliadb/migrator/datamanager.py +11 -17
- nucliadb/migrator/migrator.py +2 -2
- nucliadb/purge/__init__.py +12 -17
- nucliadb/purge/orphan_shards.py +2 -2
- nucliadb/reader/api/v1/knowledgebox.py +40 -12
- nucliadb/reader/api/v1/learning_config.py +30 -10
- nucliadb/reader/api/v1/resource.py +2 -2
- nucliadb/reader/api/v1/services.py +1 -1
- nucliadb/reader/reader/notifications.py +1 -1
- nucliadb/search/api/v1/__init__.py +1 -0
- nucliadb/search/api/v1/catalog.py +4 -4
- nucliadb/search/api/v1/find.py +1 -4
- nucliadb/search/api/v1/hydrate.py +328 -0
- nucliadb/search/api/v1/resource/ask.py +21 -1
- nucliadb/search/api/v1/search.py +1 -4
- nucliadb/search/predict.py +9 -2
- nucliadb/search/search/cache.py +1 -20
- nucliadb/search/search/chat/ask.py +50 -8
- nucliadb/search/search/chat/prompt.py +47 -15
- nucliadb/search/search/chat/query.py +8 -1
- nucliadb/search/search/fetch.py +1 -1
- nucliadb/search/search/find.py +1 -6
- nucliadb/search/search/{hydrator.py → hydrator/__init__.py} +5 -4
- nucliadb/search/search/hydrator/fields.py +175 -0
- nucliadb/search/search/hydrator/images.py +130 -0
- nucliadb/search/search/hydrator/paragraphs.py +307 -0
- nucliadb/search/search/hydrator/resources.py +56 -0
- nucliadb/search/search/metrics.py +16 -0
- nucliadb/search/search/predict_proxy.py +33 -11
- nucliadb/search/search/query.py +0 -23
- nucliadb/search/search/query_parser/fetcher.py +5 -5
- nucliadb/search/search/query_parser/models.py +1 -30
- nucliadb/search/search/query_parser/parsers/ask.py +1 -1
- nucliadb/search/search/query_parser/parsers/catalog.py +4 -7
- nucliadb/search/search/query_parser/parsers/common.py +16 -7
- nucliadb/search/search/query_parser/parsers/find.py +0 -11
- nucliadb/search/search/query_parser/parsers/graph.py +5 -5
- nucliadb/search/search/query_parser/parsers/search.py +0 -11
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +4 -11
- nucliadb/search/search/rerankers.py +1 -1
- nucliadb/search/search/summarize.py +1 -1
- nucliadb/standalone/run.py +3 -0
- nucliadb/tasks/retries.py +4 -4
- nucliadb/train/generators/sentence_classifier.py +2 -8
- nucliadb/train/generators/utils.py +1 -1
- nucliadb/train/nodes.py +4 -4
- nucliadb/train/servicer.py +1 -1
- nucliadb/train/uploader.py +1 -1
- nucliadb/writer/api/v1/field.py +14 -9
- nucliadb/writer/api/v1/knowledgebox.py +15 -52
- nucliadb/writer/api/v1/learning_config.py +5 -4
- nucliadb/writer/api/v1/resource.py +2 -2
- nucliadb/writer/resource/field.py +38 -2
- nucliadb/writer/tus/azure.py +4 -4
- nucliadb/writer/tus/gcs.py +11 -17
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/METADATA +9 -10
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/RECORD +124 -114
- nucliadb/common/external_index_providers/pinecone.py +0 -894
- nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/WHEEL +0 -0
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/top_level.txt +0 -0
nucliadb/search/search/query.py
CHANGED
@@ -26,9 +26,6 @@ from nidx_protos.noderesources_pb2 import Resource
 from nucliadb.common import datamanagers
 from nucliadb.common.exceptions import InvalidQueryError
 from nucliadb.common.filter_expression import add_and_expression, parse_expression
-from nucliadb.search.search.filters import (
-    translate_label,
-)
 from nucliadb.search.search.query_parser.fetcher import Fetcher
 from nucliadb_models.filters import FilterExpression
 from nucliadb_models.labels import LABEL_HIDDEN

@@ -166,26 +163,6 @@ def expand_entities(
     return list(result_entities.values())
 
 
-def apply_entities_filter(
-    request: nodereader_pb2.SearchRequest,
-    detected_entities: list[utils_pb2.RelationNode],
-) -> list[str]:
-    added_filters = []
-    for entity_filter in [
-        f"/e/{entity.subtype}/{entity.value}"
-        for entity in detected_entities
-        if entity.ntype == utils_pb2.RelationNode.NodeType.ENTITY
-    ]:
-        if entity_filter not in added_filters:
-            added_filters.append(entity_filter)
-            # Add the entity to the filter expression (with AND)
-            entity_expr = nodereader_pb2.FilterExpression()
-            entity_expr.facet.facet = translate_label(entity_filter)
-            add_and_expression(request.field_filter, entity_expr)
-
-    return added_filters
-
-
 async def suggest_query_to_pb(
     kbid: str,
     features: list[SuggestOptions],

@@ -393,7 +393,7 @@ async def get_matryoshka_dimension_cached(kbid: str, vectorset: str) -> Optional
 
 @query_parse_dependency_observer.wrap({"type": "matryoshka_dimension"})
 async def get_matryoshka_dimension(kbid: str, vectorset: Optional[str]) -> Optional[int]:
-    async with get_driver().
+    async with get_driver().ro_transaction() as txn:
         matryoshka_dimension = None
         if not vectorset:
             # XXX this should be migrated once we remove the "default" vectorset

@@ -409,23 +409,23 @@ async def get_matryoshka_dimension(kbid: str, vectorset: Optional[str]) -> Optio
 
 @query_parse_dependency_observer.wrap({"type": "classification_labels"})
 async def get_classification_labels(kbid: str) -> knowledgebox_pb2.Labels:
-    async with get_driver().
+    async with get_driver().ro_transaction() as txn:
         return await datamanagers.labels.get_labels(txn, kbid=kbid)
 
 
 @query_parse_dependency_observer.wrap({"type": "synonyms"})
 async def get_kb_synonyms(kbid: str) -> Optional[knowledgebox_pb2.Synonyms]:
-    async with get_driver().
+    async with get_driver().ro_transaction() as txn:
         return await datamanagers.synonyms.get(txn, kbid=kbid)
 
 
 @query_parse_dependency_observer.wrap({"type": "entities_meta_cache"})
 async def get_entities_meta_cache(kbid: str) -> datamanagers.entities.EntitiesMetaCache:
-    async with get_driver().
+    async with get_driver().ro_transaction() as txn:
         return await datamanagers.entities.get_entities_meta_cache(txn, kbid=kbid)
 
 
 @query_parse_dependency_observer.wrap({"type": "deleted_entities_groups"})
 async def get_deleted_entity_groups(kbid: str) -> list[str]:
-    async with get_driver().
+    async with get_driver().ro_transaction() as txn:
         return list((await datamanagers.entities.get_deleted_groups(txn, kbid=kbid)).entities_groups)
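Note: a pattern repeated across this release is that the driver's old transaction helper (the "-" lines above are truncated in this rendering) is replaced by explicit ro_transaction() / rw_transaction() context managers. A minimal sketch of both usages, assuming the get_driver import path below (an assumption based on the package layout, not taken from this diff):

# Sketch only: assumes a Driver exposing ro_transaction()/rw_transaction()
# async context managers, as seen in the "+" lines of these diffs.
from nucliadb.common import datamanagers
from nucliadb.common.maindb.utils import get_driver  # assumed import path


async def read_labels(kbid: str):
    # Read-only transaction: nothing to commit, released on exit.
    async with get_driver().ro_transaction() as txn:
        return await datamanagers.labels.get_labels(txn, kbid=kbid)


async def write_key(key: str, value: bytes) -> None:
    # Read-write transaction: mutations are committed explicitly, mirroring
    # the rw_transaction() usage in nucliadb/tasks/retries.py further down.
    async with get_driver().rw_transaction() as txn:
        await txn.set(key, value)
        await txn.commit()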
nucliadb/search/search/query_parser/models.py
CHANGED

@@ -17,9 +17,8 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
-from dataclasses import dataclass
 from datetime import datetime
-from typing import
+from typing import Optional, Union
 
 from nidx_protos import nodereader_pb2
 from pydantic import BaseModel, ConfigDict, Field

@@ -86,7 +85,6 @@ class Filters(BaseModel):
         nodereader_pb2.FilterOperator.AND
     )
 
-    autofilter: Optional[list[utils_pb2.RelationNode]] = None
     facets: list[str] = Field(default_factory=list)
     hidden: Optional[bool] = None
     security: Optional[search_models.RequestSecurity] = None

@@ -153,33 +151,6 @@ class ParsedQuery(BaseModel):
     generation: Optional[Generation] = None
 
 
-### Catalog
-@dataclass
-class CatalogExpression:
-    @dataclass
-    class Date:
-        field: Union[Literal["created_at"], Literal["modified_at"]]
-        since: Optional[datetime]
-        until: Optional[datetime]
-
-    bool_and: Optional[list["CatalogExpression"]] = None
-    bool_or: Optional[list["CatalogExpression"]] = None
-    bool_not: Optional["CatalogExpression"] = None
-    date: Optional[Date] = None
-    facet: Optional[str] = None
-    resource_id: Optional[str] = None
-
-
-class CatalogQuery(BaseModel):
-    kbid: str
-    query: Optional[search_models.CatalogQuery]
-    filters: Optional[CatalogExpression]
-    sort: search_models.SortOptions
-    faceted: list[str]
-    page_size: int
-    page_number: int
-
-
 ### Graph
 
 

nucliadb/search/search/query_parser/parsers/ask.py
CHANGED

@@ -63,7 +63,7 @@ class _AskParser:
         )
     elif isinstance(self.item.max_tokens, MaxTokens):
         max_tokens = self.item.max_tokens
-    else:  # pragma:
+    else:  # pragma: no cover
         # This is a trick so mypy generates an error if this branch can be reached,
         # that is, if we are missing some ifs
         _a: int = "a"

nucliadb/search/search/query_parser/parsers/catalog.py
CHANGED

@@ -19,13 +19,10 @@
 #
 
 from nucliadb.common import datamanagers
+from nucliadb.common.catalog.interface import CatalogExpression, CatalogQuery
 from nucliadb.common.exceptions import InvalidQueryError
-from nucliadb.common.filter_expression import
+from nucliadb.common.filter_expression import FacetFilter, facet_from_filter
 from nucliadb.search.search.filters import translate_label
-from nucliadb.search.search.query_parser.models import (
-    CatalogExpression,
-    CatalogQuery,
-)
 from nucliadb_models import search as search_models
 from nucliadb_models.filters import (
     And,

@@ -185,14 +182,14 @@ async def parse_filter_expression(expr: ResourceFilterExpression, kbid: str) ->
             if rid is None:
                 raise InvalidQueryError("slug", f"Cannot find slug {expr.slug}")
             cat.resource_id = rid
-        else:  # pragma:
+        else:  # pragma: no cover
             # Cannot happen due to model validation
             raise ValueError("Resource needs id or slug")
     elif isinstance(expr, DateCreated):
         cat.date = CatalogExpression.Date(field="created_at", since=expr.since, until=expr.until)
     elif isinstance(expr, DateModified):
         cat.date = CatalogExpression.Date(field="modified_at", since=expr.since, until=expr.until)
-    elif isinstance(expr,
+    elif isinstance(expr, FacetFilter):
        cat.facet = facet_from_filter(expr)
    else:
        # This is a trick so mypy generates an error if this branch can be reached,

nucliadb/search/search/query_parser/parsers/common.py
CHANGED

@@ -21,7 +21,6 @@ import re
 import string
 from typing import Optional, Union
 
-from nucliadb.common.exceptions import InvalidQueryError
 from nucliadb.search import logger
 from nucliadb.search.search.query_parser.fetcher import Fetcher
 from nucliadb.search.search.query_parser.models import (

@@ -32,15 +31,20 @@ from nucliadb_models import search as search_models
 
 DEFAULT_GENERIC_SEMANTIC_THRESHOLD = 0.7
 
-# -* is an invalid query in tantivy and it won't return results but if you add some whitespaces
-# between - and *, it will actually trigger a tantivy bug and panic
-INVALID_QUERY = re.compile(r"- +\*")
 
+def validate_query_syntax(query: str) -> str:
+    """Filter some queries that panic tantivy, better than returning the 500"""
 
-
-#
+    # -* is an invalid query in tantivy and it won't return results but if you add some whitespaces
+    # between - and *, it will actually trigger a tantivy bug and panic
+    INVALID_QUERY = re.compile(r"- *\*+")
     if INVALID_QUERY.search(query):
-
+        # remove the * and extra spaces, as it's probably what doesn't have
+        # meaning in both cases: -* and - *
+        fixed = re.sub(INVALID_QUERY, "- ", query)
+        query = fixed
+
+    return query
 
 
 def is_empty_query(request: search_models.BaseSearchRequest) -> bool:

@@ -85,6 +89,7 @@ async def parse_keyword_query(
     fetcher: Fetcher,
 ) -> KeywordQuery:
     query = item.query
+
     # If there was a rephrase with image, we should use the rephrased query for keyword search
     rephrased_query = await fetcher.get_rephrased_query()
     if item.query_image is not None and rephrased_query is not None:

@@ -98,6 +103,10 @@
             query = synonyms_query
             is_synonyms_query = True
 
+    # after all query transformations, pass a validator that can fix some
+    # queries that trigger a panic on the index
+    query = validate_query_syntax(query)
+
     min_score = parse_keyword_min_score(item.min_score)
 
     return KeywordQuery(
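The find and search parsers previously called validate_query_syntax during request validation (and presumably rejected the query, given the InvalidQueryError import removed above); the new version rewrites the query after all transformations instead. A standalone sketch of the rewrite, with invented sample queries:

import re

# Same pattern as the "+" lines above: "-*" (with or without spaces in
# between) panics tantivy, so the query is rewritten instead of rejected.
INVALID_QUERY = re.compile(r"- *\*+")


def validate_query_syntax(query: str) -> str:
    """Filter some queries that panic tantivy, better than returning the 500"""
    if INVALID_QUERY.search(query):
        # remove the * and extra spaces, as it's probably what doesn't have
        # meaning in both cases: -* and - *
        query = re.sub(INVALID_QUERY, "- ", query)
    return query


# Invented examples: both spellings collapse to a plain trailing negation.
assert validate_query_syntax("foo -*") == "foo - "
assert validate_query_syntax("foo - *") == "foo - "
assert validate_query_syntax("a regular query") == "a regular query"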
nucliadb/search/search/query_parser/parsers/find.py
CHANGED

@@ -57,7 +57,6 @@ from .common import (
     parse_semantic_query,
     parse_top_k,
     should_disable_vector_search,
-    validate_query_syntax,
 )
 
 

@@ -146,8 +145,6 @@ class _FindParser:
         return retrieval
 
     def _validate_request(self):
-        validate_query_syntax(self.item.query)
-
         # synonyms are not compatible with vector/graph search
         if (
             self.item.with_synonyms

@@ -256,17 +253,9 @@ class _FindParser:
         else:
             filter_operator = nodereader_pb2.FilterOperator.AND
 
-        autofilter = None
-        if self.item.autofilter:
-            if self._query.relation is not None:
-                autofilter = self._query.relation.entry_points
-            else:
-                autofilter = await self._get_detected_entities()
-
         hidden = await filter_hidden_resources(self.kbid, self.item.show_hidden)
 
         return Filters(
-            autofilter=autofilter,
             facets=[],
             field_expression=field_expr,
             paragraph_expression=paragraph_expr,

nucliadb/search/search/query_parser/parsers/graph.py
CHANGED

@@ -153,7 +153,7 @@ def parse_path_query(expr: graph_requests.GraphPathQuery) -> nodereader_pb2.Grap
     elif isinstance(expr, graph_requests.Generated):
         _set_generated_to_pb(expr, pb)
 
-    else:  # pragma:
+    else:  # pragma: no cover
         # This is a trick so mypy generates an error if this branch can be reached,
         # that is, if we are missing some ifs
         _a: int = "a"

@@ -182,7 +182,7 @@ def _parse_node_query(expr: graph_requests.GraphNodesQuery) -> nodereader_pb2.Gr
     elif isinstance(expr, graph_requests.Generated):
         _set_generated_to_pb(expr, pb)
 
-    else:  # pragma:
+    else:  # pragma: no cover
         # This is a trick so mypy generates an error if this branch can be reached,
         # that is, if we are missing some ifs
         _a: int = "a"

@@ -212,7 +212,7 @@ def _parse_relation_query(
     elif isinstance(expr, graph_requests.Generated):
         _set_generated_to_pb(expr, pb)
 
-    else:  # pragma:
+    else:  # pragma: no cover
         # This is a trick so mypy generates an error if this branch can be reached,
         # that is, if we are missing some ifs
         _a: int = "a"

@@ -230,7 +230,7 @@ def _set_node_to_pb(node: graph_requests.GraphNode, pb: nodereader_pb2.GraphQuer
         pb.fuzzy.kind = nodereader_pb2.GraphQuery.Node.MatchLocation.PREFIX
         pb.fuzzy.distance = 1
 
-    else:  # pragma:
+    else:  # pragma: no cover
         # This is a trick so mypy generates an error if this branch can be reached,
         # that is, if we are missing some ifs
         _a: int = "a"

@@ -263,7 +263,7 @@ def _set_generated_to_pb(generated: graph_requests.Generated, pb: nodereader_pb2
 
         pb.facet.facet = facet
 
-    else:  # pragma:
+    else:  # pragma: no cover
         # This is a trick so mypy generates an error if this branch can be reached,
         # that is, if we are missing some ifs
         _a: int = "a"

nucliadb/search/search/query_parser/parsers/search.py
CHANGED

@@ -51,7 +51,6 @@ from .common import (
     parse_semantic_query,
     parse_top_k,
     should_disable_vector_search,
-    validate_query_syntax,
 )
 
 INDEX_SORTABLE_FIELDS = [

@@ -128,8 +127,6 @@ class _SearchParser:
         return retrieval
 
     def _validate_request(self):
-        validate_query_syntax(self.item.query)
-
         # synonyms are not compatible with vector/graph search
         if (
             self.item.with_synonyms

@@ -254,17 +251,9 @@ class _SearchParser:
         else:
             filter_operator = nodereader_pb2.FilterOperator.AND
 
-        autofilter = None
-        if self.item.autofilter:
-            if self._query.relation is not None:
-                autofilter = self._query.relation.entry_points
-            else:
-                autofilter = await self._get_detected_entities()
-
         hidden = await filter_hidden_resources(self.kbid, self.item.show_hidden)
 
         return Filters(
-            autofilter=autofilter,
             facets=self.item.faceted,
             field_expression=field_expr,
             paragraph_expression=paragraph_expr,

nucliadb/search/search/query_parser/parsers/unit_retrieval.py
CHANGED

@@ -25,10 +25,10 @@ from nidx_protos.nodereader_pb2 import SearchRequest
 from nucliadb.common.filter_expression import add_and_expression
 from nucliadb.search.search.filters import translate_label
 from nucliadb.search.search.metrics import node_features, query_parser_observer
-from nucliadb.search.search.query import
+from nucliadb.search.search.query import get_sort_field_proto
 from nucliadb.search.search.query_parser.models import ParsedQuery, PredictReranker, UnitRetrieval
 from nucliadb.search.search.query_parser.parsers.graph import parse_path_query
-from nucliadb_models.labels import LABEL_HIDDEN
+from nucliadb_models.labels import LABEL_HIDDEN
 from nucliadb_models.search import SortOrderMap
 from nucliadb_protos import utils_pb2

@@ -36,7 +36,7 @@ from nucliadb_protos import utils_pb2
 @query_parser_observer.wrap({"type": "convert_retrieval_to_proto"})
 async def legacy_convert_retrieval_to_proto(
     parsed: ParsedQuery,
-) -> tuple[SearchRequest, bool,
+) -> tuple[SearchRequest, bool, Optional[str]]:
     converter = _Converter(parsed.retrieval)
     request = converter.into_search_request()
 

@@ -44,13 +44,12 @@ async def legacy_convert_retrieval_to_proto(
     # needed. We should find a better abstraction
 
     incomplete = is_incomplete(parsed.retrieval)
-    autofilter = converter._autofilter
 
     rephrased_query = None
     if parsed.retrieval.query.semantic:
         rephrased_query = await parsed.fetcher.get_rephrased_query()
 
-    return request, incomplete,
+    return request, incomplete, rephrased_query
 
 
 @query_parser_observer.wrap({"type": "convert_retrieval_to_proto"})

@@ -65,8 +64,6 @@ class _Converter:
         self.req = nodereader_pb2.SearchRequest()
         self.retrieval = retrieval
 
-        self._autofilter: list[str] = []
-
     def into_search_request(self) -> nodereader_pb2.SearchRequest:
         """Generate a SearchRequest proto from a retrieval operation."""
         self._apply_text_queries()

@@ -235,10 +232,6 @@ class _Converter:
             self.req.paragraph_filter.CopyFrom(self.retrieval.filters.paragraph_expression)
         self.req.filter_operator = self.retrieval.filters.filter_expression_operator
 
-        if self.retrieval.filters.autofilter:
-            entity_filters = apply_entities_filter(self.req, self.retrieval.filters.autofilter)
-            self._autofilter.extend([translate_system_to_alias_label(e) for e in entity_filters])
-
         if self.retrieval.filters.hidden is not None:
             expr = nodereader_pb2.FilterExpression()
             if self.retrieval.filters.hidden:

nucliadb/search/search/rerankers.py
CHANGED

@@ -181,7 +181,7 @@ def get_reranker(reranker: parser_models.Reranker) -> Reranker:
     elif isinstance(reranker, parser_models.PredictReranker):
         algorithm = PredictReranker(reranker.window)
 
-    else:  # pragma:
+    else:  # pragma: no cover
         # This is a trick so mypy generates an error if this branch can be reached,
         # that is, if we are missing some ifs
         _a: int = "a"

nucliadb/search/search/summarize.py
CHANGED

@@ -77,7 +77,7 @@ async def get_extracted_texts(kbid: str, resource_uuids_or_slugs: list[str]) ->
     tasks = []
 
     # Schedule getting extracted text for each field of each resource
-    async with driver.
+    async with driver.ro_transaction() as txn:
         if not await datamanagers.kb.exists_kb(txn, kbid=kbid):
             raise datamanagers.exceptions.KnowledgeBoxNotFound(kbid)
 
nucliadb/standalone/run.py
CHANGED
@@ -116,6 +116,9 @@ def run():
     if nuclia_settings.nuclia_service_account:
         settings_to_output["NUA API key"] = "Configured ✔"
         settings_to_output["NUA API zone"] = nuclia_settings.nuclia_zone
+        settings_to_output["NUA API url"] = (
+            nuclia_settings.nuclia_public_url.format(zone=nuclia_settings.nuclia_zone) + "/api"
+        )
 
     settings_to_output_fmted = "\n".join(
         [f"|| - {k}:{' ' * (27 - len(k))}{v}" for k, v in settings_to_output.items()]
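A worked example of the added "NUA API url" line; the template value for nuclia_public_url is an assumption for illustration, not read from this diff:

# Hypothetical settings values, only to show the .format() + "/api" result.
nuclia_public_url = "https://{zone}.nuclia.cloud"  # assumed template
nuclia_zone = "europe-1"  # assumed zone

nua_api_url = nuclia_public_url.format(zone=nuclia_zone) + "/api"
assert nua_api_url == "https://europe-1.nuclia.cloud/api"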
nucliadb/tasks/retries.py
CHANGED
@@ -151,7 +151,7 @@ class TaskRetryHandler:
 
 
 async def _get_metadata(kv_driver: Driver, metadata_key: str) -> Optional[TaskMetadata]:
-    async with kv_driver.
+    async with kv_driver.ro_transaction() as txn:
         metadata = await txn.get(metadata_key)
         if metadata is None:
             return None

@@ -159,7 +159,7 @@ async def _get_metadata(kv_driver: Driver, metadata_key: str) -> Optional[TaskMe
 
 
 async def _set_metadata(kv_driver: Driver, metadata_key: str, metadata: TaskMetadata) -> None:
-    async with kv_driver.
+    async with kv_driver.rw_transaction() as txn:
         await txn.set(metadata_key, metadata.model_dump_json().encode())
         await txn.commit()
 

@@ -188,7 +188,7 @@ async def purge_batch(
     """
     Returns the next start key and the number of purged records. If start is None, it means there are no more records to purge.
     """
-    async with kv_driver.
+    async with kv_driver.rw_transaction() as txn:
         txn = cast(PGTransaction, txn)
         async with txn.connection.cursor() as cur:
             await cur.execute(

@@ -226,7 +226,7 @@ async def purge_batch(
     while len(to_delete) > 0:
         batch = to_delete[:delete_batch_size]
         to_delete = to_delete[delete_batch_size:]
-        async with kv_driver.
+        async with kv_driver.rw_transaction() as txn:
             for key in batch:
                 logger.info("Purging task metadata", extra={"key": key})
                 await txn.delete(key)

nucliadb/train/generators/sentence_classifier.py
CHANGED

@@ -116,10 +116,7 @@ async def get_sentences(kbid: str, result: str) -> list[str]:
         if split is not None:
             text = extracted_text.split_text[split]
             for paragraph in field_metadata.split_metadata[split].paragraphs:
-
-                key = f"{rid}/{field_type}/{field}/{paragraph.start}-{paragraph.end}"
-                else:
-                    key = paragraph.key
+                key = f"{rid}/{field_type}/{field}/{paragraph.start}-{paragraph.end}"
                 if key == result:
                     for sentence in paragraph.sentences:
                         splitted_text = text[sentence.start : sentence.end]

@@ -127,10 +124,7 @@ async def get_sentences(kbid: str, result: str) -> list[str]:
         else:
             text = extracted_text.text
             for paragraph in field_metadata.metadata.paragraphs:
-
-                key = f"{rid}/{field_type}/{field}/{paragraph.start}-{paragraph.end}"
-                else:
-                    key = paragraph.key
+                key = f"{rid}/{field_type}/{field}/{paragraph.start}-{paragraph.end}"
                 if key == result:
                     for sentence in paragraph.sentences:
                         splitted_text = text[sentence.start : sentence.end]

nucliadb/train/generators/utils.py
CHANGED

@@ -41,7 +41,7 @@ async def get_resource_from_cache_or_db(kbid: str, uuid: str) -> Optional[Resour
 
 async def _get_resource_from_db(kbid: str, uuid: str) -> Optional[ResourceORM]:
     storage = await get_storage(service_name=SERVICE_NAME)
-    async with get_driver().
+    async with get_driver().ro_transaction() as transaction:
         kb = KnowledgeBoxORM(transaction, storage, kbid)
         return await kb.get(uuid)
 
nucliadb/train/nodes.py
CHANGED
@@ -81,7 +81,7 @@ class TrainShardManager(manager.KBShardManager):
         return manager
 
     async def kb_sentences(self, request: GetSentencesRequest) -> AsyncIterator[TrainSentence]:
-        async with self.driver.
+        async with self.driver.ro_transaction() as txn:
             kb = KnowledgeBox(txn, self.storage, request.kb.uuid)
             if request.uuid != "":
                 # Filter by uuid

@@ -95,7 +95,7 @@ class TrainShardManager(manager.KBShardManager):
             yield sentence
 
     async def kb_paragraphs(self, request: GetParagraphsRequest) -> AsyncIterator[TrainParagraph]:
-        async with self.driver.
+        async with self.driver.ro_transaction() as txn:
             kb = KnowledgeBox(txn, self.storage, request.kb.uuid)
             if request.uuid != "":
                 # Filter by uuid

@@ -109,7 +109,7 @@ class TrainShardManager(manager.KBShardManager):
             yield paragraph
 
     async def kb_fields(self, request: GetFieldsRequest) -> AsyncIterator[TrainField]:
-        async with self.driver.
+        async with self.driver.ro_transaction() as txn:
             kb = KnowledgeBox(txn, self.storage, request.kb.uuid)
             if request.uuid != "":
                 # Filter by uuid

@@ -123,7 +123,7 @@ class TrainShardManager(manager.KBShardManager):
             yield field
 
     async def kb_resources(self, request: GetResourcesRequest) -> AsyncIterator[TrainResource]:
-        async with self.driver.
+        async with self.driver.ro_transaction() as txn:
             kb = KnowledgeBox(txn, self.storage, request.kb.uuid)
             base = KB_RESOURCE_SLUG_BASE.format(kbid=request.kb.uuid)
             async for key in txn.keys(match=base):
nucliadb/train/servicer.py
CHANGED
@@ -89,7 +89,7 @@ class TrainServicer(train_pb2_grpc.TrainServicer):
     ) -> GetEntitiesResponse:
         kbid = request.kb.uuid
         response = GetEntitiesResponse()
-        async with self.proc.driver.
+        async with self.proc.driver.ro_transaction() as txn:
             entities_manager = await self.proc.get_kb_entities_manager(txn, kbid)
             if entities_manager is None:
                 await txn.abort()
nucliadb/train/uploader.py
CHANGED
@@ -75,7 +75,7 @@ class UploadServicer:
     ) -> GetEntitiesResponse:
         kbid = request.kb.uuid
         response = GetEntitiesResponse()
-        async with self.proc.driver.
+        async with self.proc.driver.ro_transaction() as txn:
             kbobj = await self.proc.get_kb_obj(txn, request.kb)
             if kbobj is None:
                 response.status = GetEntitiesResponse.Status.NOTFOUND
nucliadb/writer/api/v1/field.py
CHANGED
@@ -18,7 +18,7 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 from inspect import iscoroutinefunction
-from typing import TYPE_CHECKING, Annotated, Callable, Optional, Type, Union
+from typing import TYPE_CHECKING, Annotated, Callable, List, Optional, Type, Union
 
 import pydantic
 from fastapi import HTTPException, Query, Response

@@ -249,9 +249,10 @@ async def parse_conversation_field_adapter(
     writer: BrokerMessage,
     toprocess: PushPayload,
     resource_classifications: ResourceClassifications,
+    replace_field: bool = False,
 ):
     return await parse_conversation_field(
-        field_id, field_payload, writer, toprocess, kbid, rid, resource_classifications
+        field_id, field_payload, writer, toprocess, kbid, rid, resource_classifications, replace_field
     )
 
 

@@ -380,7 +381,9 @@ async def add_resource_field_conversation_rslug_prefix(
     field_id: FieldIdString,
     field_payload: models.InputConversationField,
 ) -> ResourceFieldAdded:
-    return await add_field_to_resource_by_slug(
+    return await add_field_to_resource_by_slug(
+        request, kbid, rslug, field_id, field_payload, replace_field=True
+    )
 
 
 @api.put(

@@ -399,7 +402,7 @@ async def add_resource_field_conversation_rid_prefix(
     field_id: FieldIdString,
     field_payload: models.InputConversationField,
 ) -> ResourceFieldAdded:
-    return await add_field_to_resource(request, kbid, rid, field_id, field_payload)
+    return await add_field_to_resource(request, kbid, rid, field_id, field_payload, replace_field=True)
 
 
 @api.put(

@@ -460,13 +463,15 @@ async def append_messages_to_conversation_field_rslug_prefix(
     kbid: str,
     rslug: str,
     field_id: FieldIdString,
-    messages:
+    messages: List[models.InputMessage],
 ) -> ResourceFieldAdded:
     try:
         field = models.InputConversationField(messages=messages)
     except pydantic.ValidationError as e:
         raise HTTPException(status_code=422, detail=str(e))
-    return await add_field_to_resource_by_slug(
+    return await add_field_to_resource_by_slug(
+        request, kbid, rslug, field_id, field, replace_field=False
+    )
 
 
 @api.put(

@@ -483,13 +488,13 @@ async def append_messages_to_conversation_field_rid_prefix(
     kbid: str,
     rid: str,
     field_id: FieldIdString,
-    messages:
+    messages: List[models.InputMessage],
 ) -> ResourceFieldAdded:
     try:
         field = models.InputConversationField(messages=messages)
     except pydantic.ValidationError as e:
         raise HTTPException(status_code=422, detail=str(e))
-    return await add_field_to_resource(request, kbid, rid, field_id, field)
+    return await add_field_to_resource(request, kbid, rid, field_id, field, replace_field=False)
 
 
 @api.delete(

@@ -572,7 +577,7 @@ async def reprocess_file_field(
     storage = await get_storage(service_name=SERVICE_NAME)
     driver = get_driver()
 
-    async with driver.
+    async with driver.ro_transaction() as txn:
         kb = KnowledgeBox(txn, storage, kbid)
 
         resource = await kb.get(rid)