nucliadb 6.7.2.post4862__py3-none-any.whl → 6.9.2.post5282__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nucliadb might be problematic.

Files changed (126)
  1. migrations/0016_upgrade_to_paragraphs_v2.py +1 -1
  2. migrations/0017_multiple_writable_shards.py +1 -1
  3. migrations/0018_purge_orphan_kbslugs.py +1 -1
  4. migrations/0019_upgrade_to_paragraphs_v3.py +1 -1
  5. migrations/0021_overwrite_vectorsets_key.py +1 -1
  6. migrations/0023_backfill_pg_catalog.py +7 -3
  7. migrations/0025_assign_models_to_kbs_v2.py +3 -3
  8. migrations/0027_rollover_texts3.py +1 -1
  9. migrations/0028_extracted_vectors_reference.py +1 -1
  10. migrations/0029_backfill_field_status.py +1 -1
  11. migrations/0032_remove_old_relations.py +1 -1
  12. migrations/0036_backfill_catalog_slug.py +1 -1
  13. migrations/0037_backfill_catalog_facets.py +1 -1
  14. migrations/0038_backfill_catalog_field_labels.py +7 -3
  15. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  16. migrations/0040_migrate_search_configurations.py +79 -0
  17. migrations/pg/0010_shards_index.py +34 -0
  18. nucliadb/backups/create.py +3 -3
  19. nucliadb/backups/restore.py +3 -3
  20. nucliadb/common/cache.py +1 -1
  21. nucliadb/common/catalog/__init__.py +79 -0
  22. nucliadb/common/catalog/dummy.py +36 -0
  23. nucliadb/common/catalog/interface.py +85 -0
  24. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +294 -208
  25. nucliadb/common/catalog/utils.py +56 -0
  26. nucliadb/common/cluster/manager.py +3 -19
  27. nucliadb/common/cluster/rebalance.py +484 -110
  28. nucliadb/common/cluster/rollover.py +29 -0
  29. nucliadb/common/cluster/settings.py +1 -1
  30. nucliadb/common/cluster/utils.py +26 -0
  31. nucliadb/common/datamanagers/atomic.py +6 -0
  32. nucliadb/common/datamanagers/utils.py +2 -2
  33. nucliadb/common/external_index_providers/manager.py +1 -29
  34. nucliadb/common/external_index_providers/settings.py +1 -27
  35. nucliadb/common/filter_expression.py +16 -33
  36. nucliadb/common/http_clients/exceptions.py +8 -0
  37. nucliadb/common/http_clients/processing.py +4 -0
  38. nucliadb/common/http_clients/utils.py +3 -0
  39. nucliadb/common/ids.py +77 -55
  40. nucliadb/common/locking.py +4 -4
  41. nucliadb/common/maindb/driver.py +11 -1
  42. nucliadb/common/maindb/local.py +1 -1
  43. nucliadb/common/maindb/pg.py +1 -1
  44. nucliadb/common/nidx.py +19 -1
  45. nucliadb/common/vector_index_config.py +1 -1
  46. nucliadb/export_import/datamanager.py +3 -3
  47. nucliadb/ingest/consumer/pull.py +7 -0
  48. nucliadb/ingest/consumer/service.py +2 -27
  49. nucliadb/ingest/consumer/shard_creator.py +17 -6
  50. nucliadb/ingest/fields/base.py +9 -17
  51. nucliadb/ingest/fields/conversation.py +47 -1
  52. nucliadb/ingest/orm/brain_v2.py +21 -3
  53. nucliadb/ingest/orm/index_message.py +126 -111
  54. nucliadb/ingest/orm/knowledgebox.py +84 -43
  55. nucliadb/ingest/orm/processor/auditing.py +1 -1
  56. nucliadb/ingest/orm/processor/processor.py +95 -149
  57. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  58. nucliadb/ingest/orm/resource.py +10 -1
  59. nucliadb/ingest/partitions.py +12 -1
  60. nucliadb/ingest/serialize.py +2 -2
  61. nucliadb/ingest/service/writer.py +26 -19
  62. nucliadb/ingest/settings.py +33 -11
  63. nucliadb/learning_proxy.py +12 -15
  64. nucliadb/metrics_exporter.py +17 -4
  65. nucliadb/migrator/datamanager.py +11 -17
  66. nucliadb/migrator/migrator.py +2 -2
  67. nucliadb/purge/__init__.py +12 -17
  68. nucliadb/purge/orphan_shards.py +2 -2
  69. nucliadb/reader/api/v1/knowledgebox.py +40 -12
  70. nucliadb/reader/api/v1/learning_config.py +30 -10
  71. nucliadb/reader/api/v1/resource.py +2 -2
  72. nucliadb/reader/api/v1/services.py +1 -1
  73. nucliadb/reader/reader/notifications.py +1 -1
  74. nucliadb/search/api/v1/__init__.py +1 -0
  75. nucliadb/search/api/v1/catalog.py +4 -4
  76. nucliadb/search/api/v1/find.py +1 -4
  77. nucliadb/search/api/v1/hydrate.py +328 -0
  78. nucliadb/search/api/v1/resource/ask.py +21 -1
  79. nucliadb/search/api/v1/search.py +1 -4
  80. nucliadb/search/predict.py +9 -2
  81. nucliadb/search/search/cache.py +1 -20
  82. nucliadb/search/search/chat/ask.py +50 -8
  83. nucliadb/search/search/chat/prompt.py +47 -15
  84. nucliadb/search/search/chat/query.py +8 -1
  85. nucliadb/search/search/fetch.py +1 -1
  86. nucliadb/search/search/find.py +1 -6
  87. nucliadb/search/search/{hydrator.py → hydrator/__init__.py} +5 -4
  88. nucliadb/search/search/hydrator/fields.py +175 -0
  89. nucliadb/search/search/hydrator/images.py +130 -0
  90. nucliadb/search/search/hydrator/paragraphs.py +307 -0
  91. nucliadb/search/search/hydrator/resources.py +56 -0
  92. nucliadb/search/search/metrics.py +16 -0
  93. nucliadb/search/search/predict_proxy.py +33 -11
  94. nucliadb/search/search/query.py +0 -23
  95. nucliadb/search/search/query_parser/fetcher.py +5 -5
  96. nucliadb/search/search/query_parser/models.py +1 -30
  97. nucliadb/search/search/query_parser/parsers/ask.py +1 -1
  98. nucliadb/search/search/query_parser/parsers/catalog.py +4 -7
  99. nucliadb/search/search/query_parser/parsers/common.py +16 -7
  100. nucliadb/search/search/query_parser/parsers/find.py +0 -11
  101. nucliadb/search/search/query_parser/parsers/graph.py +5 -5
  102. nucliadb/search/search/query_parser/parsers/search.py +0 -11
  103. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +4 -11
  104. nucliadb/search/search/rerankers.py +1 -1
  105. nucliadb/search/search/summarize.py +1 -1
  106. nucliadb/standalone/run.py +3 -0
  107. nucliadb/tasks/retries.py +4 -4
  108. nucliadb/train/generators/sentence_classifier.py +2 -8
  109. nucliadb/train/generators/utils.py +1 -1
  110. nucliadb/train/nodes.py +4 -4
  111. nucliadb/train/servicer.py +1 -1
  112. nucliadb/train/uploader.py +1 -1
  113. nucliadb/writer/api/v1/field.py +14 -9
  114. nucliadb/writer/api/v1/knowledgebox.py +15 -52
  115. nucliadb/writer/api/v1/learning_config.py +5 -4
  116. nucliadb/writer/api/v1/resource.py +2 -2
  117. nucliadb/writer/resource/field.py +38 -2
  118. nucliadb/writer/tus/azure.py +4 -4
  119. nucliadb/writer/tus/gcs.py +11 -17
  120. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/METADATA +9 -10
  121. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/RECORD +124 -114
  122. nucliadb/common/external_index_providers/pinecone.py +0 -894
  123. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  124. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/WHEEL +0 -0
  125. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/entry_points.txt +0 -0
  126. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/top_level.txt +0 -0
nucliadb/common/cluster/rollover.py CHANGED
@@ -34,8 +34,10 @@ from nucliadb.common.external_index_providers.base import ExternalIndexManager
 from nucliadb.common.external_index_providers.manager import (
     get_external_index_manager,
 )
+from nucliadb.common.maindb.utils import get_driver
 from nucliadb.common.nidx import get_nidx_api_client
 from nucliadb.common.vector_index_config import nucliadb_index_config_to_nidx
+from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
 from nucliadb.migrator.settings import settings
 from nucliadb_protos import utils_pb2, writer_pb2
 from nucliadb_telemetry import errors
@@ -45,6 +47,7 @@ from .utils import (
     get_resource,
     get_rollover_resource_index_message,
     index_resource_to_shard,
+    wait_for_nidx,
 )

 logger = logging.getLogger(__name__)
@@ -254,6 +257,7 @@ async def index_to_rollover_index(
         for rid in resource_ids
     ]
     await asyncio.gather(*batch)
+    await wait_for_indexing_to_catch_up(app_context)

     async with datamanagers.with_transaction() as txn:
         state.resources_indexed = True
@@ -262,6 +266,22 @@ async def index_to_rollover_index(
         await txn.commit()


+async def wait_for_indexing_to_catch_up(app_context: ApplicationContext):
+    try:
+        app_context.nats_manager
+    except AssertionError:
+        logger.warning("Nats manager not initialized. Cannot wait for indexing to catch up")
+        return
+    max_pending = 1000
+    while True:
+        try:
+            await wait_for_nidx(app_context.nats_manager, max_wait_seconds=60, max_pending=max_pending)
+            return
+        except asyncio.TimeoutError:
+            logger.warning(f"Nidx is behind more than {max_pending} messages. Throttling rollover.")
+            await asyncio.sleep(30)
+
+
 async def _index_resource_to_rollover_index(
     app_context: ApplicationContext,
     rollover_shards: writer_pb2.Shards,
@@ -415,6 +435,15 @@ async def cutover_shards(app_context: ApplicationContext, kbid: str) -> None:

         await txn.commit()

+    # For KBs with pre-warm enabled, we must configure the new shards. There may
+    # be some small delay between this call and the shards being actually
+    # prewarmed, but rollovers are quite unusual and we prefer this rather than
+    # prewarming old and new shards at the same time
+    kb_config = await datamanagers.atomic.kb.get_config(kbid=kbid)
+    if kb_config is not None and kb_config.prewarm_enabled:
+        driver = get_driver()
+        await KnowledgeBox.configure_shards(driver, kbid, prewarm=True)
+

 async def validate_indexed_data(
     app_context: ApplicationContext, kbid: str, external: Optional[ExternalIndexManager] = None
nucliadb/common/cluster/settings.py CHANGED
@@ -42,7 +42,7 @@ class Settings(BaseSettings):
         description="Maximum number of paragraphs to target per shard",
     )
     max_resource_paragraphs: int = Field(
-        default=50_000,
+        default=300_000,
         title="Max paragraphs per resource",
         description="Maximum number of paragraphs allowed on a single resource",
     )
nucliadb/common/cluster/utils.py CHANGED
@@ -32,6 +32,7 @@ from nucliadb.common.cluster.settings import settings
 from nucliadb.ingest.orm import index_message
 from nucliadb.ingest.orm.resource import Resource
 from nucliadb_protos import writer_pb2
+from nucliadb_utils.nats import NatsConnectionManager
 from nucliadb_utils.utilities import Utility, clean_utility, get_utility, set_utility

 if TYPE_CHECKING:  # pragma: no cover
@@ -125,3 +126,28 @@ async def delete_resource_from_shard(
     partition = partitioning.generate_partition(kbid, resource_id)

     await sm.delete_resource(shard, resource_id, 0, str(partition), kbid)
+
+
+async def get_nats_consumer_pending_messages(
+    nats_manager: NatsConnectionManager, *, stream: str, consumer: str
+) -> int:
+    # get raw js client
+    js = nats_manager.js
+    consumer_info = await js.consumer_info(stream, consumer)
+    return consumer_info.num_pending
+
+
+async def wait_for_nidx(
+    nats_manager: NatsConnectionManager,
+    max_pending: int,
+    poll_interval_seconds: int = 5,
+    max_wait_seconds: int = 60,
+):
+    async with asyncio.timeout(max_wait_seconds):  # type: ignore
+        while True:
+            pending = await get_nats_consumer_pending_messages(
+                nats_manager, stream="nidx", consumer="nidx"
+            )
+            if pending < max_pending:
+                return
+            await asyncio.sleep(poll_interval_seconds)
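The new `wait_for_nidx` helper gives rollover a backpressure valve: it polls the pending-message count of the `nidx` JetStream consumer and returns only once the backlog drops below `max_pending`, while `rollover.py` catches the resulting `asyncio.TimeoutError` and throttles. A minimal standalone sketch of the same pattern with nats-py (the server URL, stream, and consumer names are assumptions for illustration; `ConsumerInfo.num_pending` is the same field the helper reads):

```python
import asyncio

import nats  # nats-py, the client library NatsConnectionManager wraps


async def main() -> None:
    # Assumed: a local NATS server with a "nidx" stream/consumer provisioned
    nc = await nats.connect("nats://localhost:4222")
    js = nc.jetstream()
    try:
        # asyncio.timeout() needs Python 3.11+, which likely explains the
        # `# type: ignore` on the nucliadb version of this loop
        async with asyncio.timeout(60):
            while True:
                info = await js.consumer_info("nidx", "nidx")
                if info.num_pending < 1000:
                    break  # the indexer has caught up
                await asyncio.sleep(5)
    except asyncio.TimeoutError:
        # Mirrors wait_for_indexing_to_catch_up: throttle and retry later
        print("nidx is still behind; throttling")
    finally:
        await nc.close()


asyncio.run(main())
```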
nucliadb/common/datamanagers/atomic.py CHANGED
@@ -42,6 +42,7 @@ from typing_extensions import Concatenate, ParamSpec

 from nucliadb.common.maindb.driver import Transaction

+from . import cluster as cluster_dm
 from . import kb as kb_dm
 from . import labels as labels_dm
 from . import resources as resources_dm
@@ -73,6 +74,10 @@ def rw_txn_wrap(fun: Callable[Concatenate[Transaction, P], Awaitable[T]]) -> Cal
     return wrapper


+class cluster:
+    get_kb_shards = ro_txn_wrap(cluster_dm.get_kb_shards)
+
+
 class kb:
     exists_kb = ro_txn_wrap(kb_dm.exists_kb)
     get_config = ro_txn_wrap(kb_dm.get_config)
@@ -83,6 +88,7 @@ class resources:
     get_resource_uuid_from_slug = ro_txn_wrap(resources_dm.get_resource_uuid_from_slug)
     resource_exists = ro_txn_wrap(resources_dm.resource_exists)
     slug_exists = ro_txn_wrap(resources_dm.slug_exists)
+    get_all_field_ids = ro_txn_wrap(resources_dm.get_all_field_ids)


 class labelset:
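The new `atomic.cluster` namespace builds on the existing `ro_txn_wrap` helper, which turns a datamanager function expecting a `Transaction` as its first argument into one that opens (and closes) its own read-only transaction. A stripped-down sketch of the pattern with a stand-in driver (all names below are illustrative, not nucliadb internals):

```python
import asyncio
import contextlib
from collections.abc import Awaitable, Callable
from typing import Any


class Transaction:
    # Stand-in for nucliadb's maindb Transaction
    async def get(self, key: str) -> str:
        return f"value-for-{key}"


@contextlib.asynccontextmanager
async def ro_transaction():
    # A real driver would open and close a database transaction here
    yield Transaction()


def ro_txn_wrap(fun: Callable[..., Awaitable[Any]]) -> Callable[..., Awaitable[Any]]:
    # Return a version of `fun` that supplies its own read-only transaction
    async def wrapper(*args, **kwargs):
        async with ro_transaction() as txn:
            return await fun(txn, *args, **kwargs)

    return wrapper


async def get_kb_shards(txn: Transaction, *, kbid: str) -> str:
    return await txn.get(f"/kbs/{kbid}/shards")


class cluster:
    # Callers use datamanagers.atomic.cluster.get_kb_shards(kbid=...)
    # without ever touching a transaction themselves
    get_kb_shards = ro_txn_wrap(get_kb_shards)


print(asyncio.run(cluster.get_kb_shards(kbid="kb1")))  # value-for-/kbs/kb1/shards
```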
nucliadb/common/datamanagers/utils.py CHANGED
@@ -42,7 +42,7 @@ async def get_kv_pb(
 @contextlib.asynccontextmanager
 async def with_rw_transaction():
     driver = get_driver()
-    async with driver.transaction(read_only=False) as txn:
+    async with driver.rw_transaction() as txn:
         yield txn


@@ -53,5 +53,5 @@ with_transaction = with_rw_transaction
 @contextlib.asynccontextmanager
 async def with_ro_transaction():
     driver = get_driver()
-    async with driver.transaction(read_only=True) as ro_txn:
+    async with driver.ro_transaction() as ro_txn:
         yield ro_txn
nucliadb/common/external_index_providers/manager.py CHANGED
@@ -23,13 +23,9 @@ import async_lru

 from nucliadb.common import datamanagers
 from nucliadb.common.external_index_providers.base import ExternalIndexManager
-from nucliadb.common.external_index_providers.pinecone import PineconeIndexManager
-from nucliadb.common.external_index_providers.settings import settings
 from nucliadb_protos.knowledgebox_pb2 import (
-    ExternalIndexProviderType,
     StoredExternalIndexProviderMetadata,
 )
-from nucliadb_utils.utilities import get_endecryptor


 async def get_external_index_manager(
@@ -39,31 +35,7 @@ async def get_external_index_manager(
     Returns an ExternalIndexManager for the given kbid.
     If for_rollover is True, the ExternalIndexManager returned will include the rollover indexes (if any).
     """
-    metadata = await get_external_index_metadata(kbid)
-    if metadata is None or metadata.type != ExternalIndexProviderType.PINECONE:
-        # Only Pinecone is supported for now
-        return None
-
-    api_key = get_endecryptor().decrypt(metadata.pinecone_config.encrypted_api_key)
-    default_vectorset = await get_default_vectorset_id(kbid)
-
-    rollover_indexes = None
-    if for_rollover:
-        rollover_metadata = await get_rollover_external_index_metadata(kbid)
-        if rollover_metadata is not None:
-            rollover_indexes = dict(rollover_metadata.pinecone_config.indexes)
-
-    return PineconeIndexManager(
-        kbid=kbid,
-        api_key=api_key,
-        indexes=dict(metadata.pinecone_config.indexes),
-        upsert_parallelism=settings.pinecone_upsert_parallelism,
-        delete_parallelism=settings.pinecone_delete_parallelism,
-        upsert_timeout=settings.pinecone_upsert_timeout,
-        delete_timeout=settings.pinecone_delete_timeout,
-        default_vectorset=default_vectorset,
-        rollover_indexes=rollover_indexes,
-    )
+    return None


 @async_lru.alru_cache(maxsize=None)
nucliadb/common/external_index_providers/settings.py CHANGED
@@ -17,36 +17,10 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
-from pydantic import Field
 from pydantic_settings import BaseSettings


-class ExternalIndexProvidersSettings(BaseSettings):
-    pinecone_upsert_parallelism: int = Field(
-        default=3,
-        title="Pinecone upsert parallelism",
-        description="Number of parallel upserts to Pinecone on each set resource operation",
-    )
-    pinecone_delete_parallelism: int = Field(
-        default=2,
-        title="Pinecone delete parallelism",
-        description="Number of parallel deletes to Pinecone on each delete resource operation",
-    )
-    pinecone_upsert_timeout: float = Field(
-        default=10.0,
-        title="Pinecone upsert timeout",
-        description="Timeout in seconds for each upsert operation to Pinecone",
-    )
-    pinecone_delete_timeout: float = Field(
-        default=10.0,
-        title="Pinecone delete timeout",
-        description="Timeout in seconds for each delete operation to Pinecone",
-    )
-    pinecone_query_timeout: float = Field(
-        default=10.0,
-        title="Pinecone query timeout",
-        description="Timeout in seconds for each query operation to Pinecone",
-    )
+class ExternalIndexProvidersSettings(BaseSettings): ...


 settings = ExternalIndexProvidersSettings()
nucliadb/common/filter_expression.py CHANGED
@@ -52,37 +52,20 @@ from nucliadb_models.filters import (
 )

 # Filters that end up as a facet
-FacetFilter = Union[
-    OriginTag,
-    Label,
-    ResourceMimetype,
-    FieldMimetype,
-    Entity,
-    Language,
-    OriginMetadata,
-    OriginPath,
-    Generated,
-    Kind,
-    OriginCollaborator,
-    OriginSource,
-    Status,
-]
-# In Python 3.9 we cannot do isinstance against an union
-# Once we support only 3.10+, we can remove this
-FacetFilterTypes = (
-    OriginTag,
-    Label,
-    ResourceMimetype,
-    FieldMimetype,
-    Entity,
-    Language,
-    OriginMetadata,
-    OriginPath,
-    Generated,
-    Kind,
-    OriginCollaborator,
-    OriginSource,
-    Status,
+FacetFilter = (
+    OriginTag
+    | Label
+    | ResourceMimetype
+    | FieldMimetype
+    | Entity
+    | Language
+    | OriginMetadata
+    | OriginPath
+    | Generated
+    | Kind
+    | OriginCollaborator
+    | OriginSource
+    | Status
 )
@@ -110,7 +93,7 @@ async def parse_expression(
             if rid is None:
                 raise InvalidQueryError("slug", f"Cannot find slug {expr.slug}")
             f.resource.resource_id = rid
-        else:  # pragma: nocover
+        else:  # pragma: no cover
             # Cannot happen due to model validation
             raise ValueError("Resource needs id or slug")
     elif isinstance(expr, Field):
@@ -131,7 +114,7 @@ async def parse_expression(
             f.date.since.FromDatetime(expr.since)
         if expr.until:
             f.date.until.FromDatetime(expr.until)
-    elif isinstance(expr, FacetFilterTypes):
+    elif isinstance(expr, FacetFilter):
         f.facet.facet = facet_from_filter(expr)
     else:
         # This is a trick so mypy generates an error if this branch can be reached,
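Collapsing `FacetFilter`/`FacetFilterTypes` into a single `|` union works because, starting with Python 3.10, `isinstance` accepts PEP 604 union types directly, so one alias now serves both annotations and the runtime check in `parse_expression`. A quick illustration:

```python
# Since Python 3.10, isinstance() accepts a PEP 604 union (types.UnionType),
# so a single alias can be used for both typing and runtime dispatch
IntOrStr = int | str

assert isinstance(3, IntOrStr)
assert isinstance("three", IntOrStr)
assert not isinstance(3.0, IntOrStr)
```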
nucliadb/common/http_clients/exceptions.py CHANGED
@@ -21,6 +21,10 @@ class ClientException(Exception):
     pass


+class ServerException(Exception):
+    pass
+
+
 class NotFoundException(ClientException):
     pass

@@ -35,3 +39,7 @@ class RateLimitException(ClientException):

 class AccountLimitException(ClientException):
     pass
+
+
+class ServiceUnavailableException(ServerException):
+    pass
nucliadb/common/http_clients/processing.py CHANGED
@@ -209,6 +209,10 @@ class ProcessingHTTPClient:
     async def close(self):
         await self.session.close()

+    async def reset_session(self):
+        await self.close()
+        self.session = aiohttp.ClientSession()
+
     async def in_progress(self, ack_token: str):
         url = self.base_url_v2 + "/pull/in_progress"
         request = InProgressRequest(ack=[ack_token])
nucliadb/common/http_clients/utils.py CHANGED
@@ -33,5 +33,8 @@ def check_status(resp: aiohttp.ClientResponse, resp_text: str) -> None:
         raise exceptions.AuthorizationException(f"Unauthorized to access: {resp.status}")
     elif resp.status == 429:
         raise exceptions.RateLimitException("Rate limited")
+    elif resp.status in (502, 503):
+        # Service unavailable, can be retried
+        raise exceptions.ServiceUnavailableException(f"Service unavailable: {resp.status} - {resp_text}")
     else:
         raise exceptions.ClientException(f"Unknown error: {resp.status} - {resp_text}")
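Classifying 502/503 under the new `ServerException` hierarchy lets callers retry transient upstream failures (and, with the new `reset_session`, rebuild the HTTP session) instead of treating them as hard errors. A sketch of the kind of retry loop this enables; the helper below is illustrative, not part of nucliadb:

```python
import asyncio


class ServiceUnavailableException(Exception):
    # Mirrors nucliadb.common.http_clients.exceptions.ServiceUnavailableException
    pass


async def call_with_retries(fn, max_attempts: int = 3, backoff_seconds: float = 2.0):
    # Retry only the transient 502/503 errors; anything else propagates
    for attempt in range(1, max_attempts + 1):
        try:
            return await fn()
        except ServiceUnavailableException:
            if attempt == max_attempts:
                raise
            # A caller could also invoke client.reset_session() here
            # to start over with a fresh aiohttp session
            await asyncio.sleep(backoff_seconds * attempt)
```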
nucliadb/common/ids.py CHANGED
@@ -47,6 +47,8 @@ FIELD_TYPE_NAME_TO_STR = {
     FieldTypeName.CONVERSATION: "c",
 }

+FIELD_TYPE_STR_TO_NAME = {v: k for k, v in FIELD_TYPE_NAME_TO_STR.items()}
+

 @dataclass
 class FieldId:
@@ -65,7 +67,7 @@ class FieldId:

     Examples:

-    >>> FieldId(rid="rid", type="u", key="/my-link")
+    >>> FieldId(rid="rid", type="u", key="my-link")
     FieldID("rid/u/my-link")
     >>> FieldId.from_string("rid/u/my-link")
     FieldID("rid/u/my-link")
@@ -77,31 +79,6 @@ class FieldId:
     # also knwon as `split`, this indicates a part of a field in, for example, conversations
     subfield_id: Optional[str] = None

-    def __repr__(self) -> str:
-        return f"FieldId({self.full()})"
-
-    def short_without_subfield(self) -> str:
-        return f"/{self.type}/{self.key}"
-
-    def full(self) -> str:
-        if self.subfield_id is None:
-            return f"{self.rid}/{self.type}/{self.key}"
-        else:
-            return f"{self.rid}/{self.type}/{self.key}/{self.subfield_id}"
-
-    def __hash__(self) -> int:
-        return hash(self.full())
-
-    @property
-    def pb_type(self) -> FieldType.ValueType:
-        return FIELD_TYPE_STR_TO_PB[self.type]
-
-    @classmethod
-    def from_pb(
-        cls, rid: str, field_type: FieldType.ValueType, key: str, subfield_id: Optional[str] = None
-    ) -> "FieldId":
-        return cls(rid=rid, type=FIELD_TYPE_PB_TO_STR[field_type], key=key, subfield_id=subfield_id)
-
     @classmethod
     def from_string(cls, value: str) -> "FieldId":
         """
@@ -120,11 +97,11 @@ class FieldId:
         parts = value.split("/")
         if len(parts) == 3:
             rid, _type, key = parts
-            _type = cls.parse_field_type(_type)
+            _type = cls._parse_field_type(_type)
             return cls(rid=rid, type=_type, key=key)
         elif len(parts) == 4:
             rid, _type, key, subfield_id = parts
-            _type = cls.parse_field_type(_type)
+            _type = cls._parse_field_type(_type)
             return cls(
                 rid=rid,
                 type=_type,
@@ -135,7 +112,46 @@ class FieldId:
             raise ValueError(f"Invalid FieldId: {value}")

     @classmethod
-    def parse_field_type(cls, _type: str) -> str:
+    def from_pb(
+        cls, rid: str, field_type: FieldType.ValueType, key: str, subfield_id: Optional[str] = None
+    ) -> "FieldId":
+        return cls(rid=rid, type=FIELD_TYPE_PB_TO_STR[field_type], key=key, subfield_id=subfield_id)
+
+    @property
+    def pb_type(self) -> FieldType.ValueType:
+        return FIELD_TYPE_STR_TO_PB[self.type]
+
+    def full(self) -> str:
+        if self.subfield_id is None:
+            return f"{self.rid}/{self.type}/{self.key}"
+        else:
+            return f"{self.rid}/{self.type}/{self.key}/{self.subfield_id}"
+
+    def short_without_subfield(self) -> str:
+        return f"/{self.type}/{self.key}"
+
+    def paragraph_id(self, paragraph_start: int, paragraph_end: int) -> "ParagraphId":
+        """Generate a ParagraphId from the current field given its start and
+        end.
+
+        """
+        return ParagraphId(
+            field_id=self,
+            paragraph_start=paragraph_start,
+            paragraph_end=paragraph_end,
+        )
+
+    def __str__(self) -> str:
+        return self.full()
+
+    def __repr__(self) -> str:
+        return f"FieldId({self.full()})"
+
+    def __hash__(self) -> int:
+        return hash(self.full())
+
+    @staticmethod
+    def _parse_field_type(_type: str) -> str:
         if _type not in FIELD_TYPE_STR_TO_PB:
             # Try to parse the enum value
             # XXX: This is to support field types that are integer values of FieldType
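Beyond the reordering, the id classes gain `__str__` (delegating to `full()`) and a `FieldId.paragraph_id()` builder; `ParagraphId`, whose hunks follow below, is what the builder returns. Based on the id formats shown in this diff, a round-trip looks like this (assuming the package is importable):

```python
from nucliadb.common.ids import FieldId, ParagraphId

fid = FieldId.from_string("rid/u/my-link")
assert fid.full() == "rid/u/my-link"
assert str(fid) == "rid/u/my-link"  # new __str__ delegates to full()

pid = fid.paragraph_id(0, 10)  # new convenience builder
assert str(pid) == "rid/u/my-link/0-10"
assert ParagraphId.from_string("rid/u/my-link/0-10") == pid  # dataclass equality
```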
@@ -157,19 +173,6 @@ class ParagraphId:
     paragraph_start: int
     paragraph_end: int

-    def __repr__(self) -> str:
-        return f"ParagraphId({self.full()})"
-
-    def full(self) -> str:
-        return f"{self.field_id.full()}/{self.paragraph_start}-{self.paragraph_end}"
-
-    def __hash__(self) -> int:
-        return hash(self.full())
-
-    @property
-    def rid(self) -> str:
-        return self.field_id.rid
-
     @classmethod
     def from_string(cls, value: str) -> "ParagraphId":
         parts = value.split("/")
@@ -192,6 +195,22 @@ class ParagraphId:
             paragraph_end=vid.vector_end,
         )

+    @property
+    def rid(self) -> str:
+        return self.field_id.rid
+
+    def full(self) -> str:
+        return f"{self.field_id.full()}/{self.paragraph_start}-{self.paragraph_end}"
+
+    def __str__(self) -> str:
+        return self.full()
+
+    def __repr__(self) -> str:
+        return f"ParagraphId({self.full()})"
+
+    def __hash__(self) -> int:
+        return hash(self.full())
+

 @dataclass
 class VectorId:
@@ -217,19 +236,6 @@ class VectorId:
     vector_start: int
     vector_end: int

-    def __repr__(self) -> str:
-        return f"VectorId({self.full()})"
-
-    def full(self) -> str:
-        return f"{self.field_id.full()}/{self.index}/{self.vector_start}-{self.vector_end}"
-
-    def __hash__(self) -> int:
-        return hash(self.full())
-
-    @property
-    def rid(self) -> str:
-        return self.field_id.rid
-
     @classmethod
     def from_string(cls, value: str) -> "VectorId":
         parts = value.split("/")
@@ -239,6 +245,22 @@ class VectorId:
         field_id = FieldId.from_string("/".join(parts[:-2]))
         return cls(field_id=field_id, index=index, vector_start=start, vector_end=end)

+    @property
+    def rid(self) -> str:
+        return self.field_id.rid
+
+    def full(self) -> str:
+        return f"{self.field_id.full()}/{self.index}/{self.vector_start}-{self.vector_end}"
+
+    def __str__(self) -> str:
+        return self.full()
+
+    def __repr__(self) -> str:
+        return f"VectorId({self.full()})"
+
+    def __hash__(self) -> int:
+        return hash(self.full())
+

 def extract_data_augmentation_id(generated_field_id: str) -> Optional[str]:
     """Data augmentation generated fields have a strict id with the following
nucliadb/common/locking.py CHANGED
@@ -75,7 +75,7 @@ class _Lock:
         start = time.time()
         while True:
             try:
-                async with self.driver.transaction() as txn:
+                async with self.driver.rw_transaction() as txn:
                     lock_data = await self.get_lock_data(txn)
                     if lock_data is None:
                         await self._set_lock_value(txn)
@@ -128,7 +128,7 @@ class _Lock:
         while True:
             try:
                 await asyncio.sleep(self.refresh_timeout)
-                async with self.driver.transaction() as txn:
+                async with self.driver.rw_transaction() as txn:
                     await self._update_lock_value(txn)
                     await txn.commit()
             except (asyncio.CancelledError, RuntimeError):
@@ -138,12 +138,12 @@ class _Lock:

     async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
         self.task.cancel()
-        async with self.driver.transaction() as txn:
+        async with self.driver.rw_transaction() as txn:
             await txn.delete(self.key)
             await txn.commit()

     async def is_locked(self) -> bool:
-        async with get_driver().transaction(read_only=True) as txn:
+        async with get_driver().ro_transaction() as txn:
             lock_data = await self.get_lock_data(txn)
             return lock_data is not None and time.time() < lock_data.expires_at
nucliadb/common/maindb/driver.py CHANGED
@@ -81,5 +81,15 @@ class Driver:
         pass

     @asynccontextmanager
-    async def transaction(self, read_only: bool = False) -> AsyncGenerator[Transaction, None]:
+    async def _transaction(self, *, read_only: bool) -> AsyncGenerator[Transaction, None]:
         yield Transaction()
+
+    @asynccontextmanager
+    async def ro_transaction(self) -> AsyncGenerator[Transaction, None]:
+        async with self._transaction(read_only=True) as txn:
+            yield txn
+
+    @asynccontextmanager
+    async def rw_transaction(self) -> AsyncGenerator[Transaction, None]:
+        async with self._transaction(read_only=False) as txn:
+            yield txn
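With this refactor the intent is explicit at every call site: code picks `ro_transaction()` or `rw_transaction()` instead of passing a `read_only` flag, and concrete drivers override only the protected `_transaction()`. A sketch of what call sites look like after the change (`txn.get`/`txn.set` are assumed to follow the maindb `Transaction` API used elsewhere in this diff):

```python
from nucliadb.common.maindb.utils import get_driver


async def read_key(key: str):
    # Read-only path: no commit required
    async with get_driver().ro_transaction() as txn:
        return await txn.get(key)


async def write_key(key: str, value: bytes) -> None:
    # Read-write path: commit explicitly, as the locking.py hunks above do
    async with get_driver().rw_transaction() as txn:
        await txn.set(key, value)
        await txn.commit()
```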
nucliadb/common/maindb/local.py CHANGED
@@ -222,7 +222,7 @@ class LocalDriver(Driver):
         pass

     @asynccontextmanager
-    async def transaction(self, read_only: bool = False) -> AsyncGenerator[Transaction, None]:
+    async def _transaction(self, *, read_only: bool) -> AsyncGenerator[Transaction, None]:
         if self.url is None:
             raise AttributeError("Invalid url")
         txn = LocalTransaction(self.url, self)
nucliadb/common/maindb/pg.py CHANGED
@@ -330,7 +330,7 @@ class PGDriver(Driver):
         metric.set(value)

     @asynccontextmanager
-    async def transaction(self, read_only: bool = False) -> AsyncGenerator[Transaction, None]:
+    async def _transaction(self, *, read_only: bool) -> AsyncGenerator[Transaction, None]:
         if read_only:
             yield ReadOnlyPGTransaction(self)
         else:
nucliadb/common/nidx.py CHANGED
@@ -82,6 +82,24 @@ def _storage_config(prefix: str, bucket: Optional[str]) -> dict[str, str]:
         config[f"{prefix}__REGION_NAME"] = storage_settings.s3_region_name or ""
         if storage_settings.s3_endpoint:
             config[f"{prefix}__ENDPOINT"] = storage_settings.s3_endpoint
+    elif storage_settings.file_backend == FileBackendConfig.AZURE:
+        if storage_settings.azure_account_url is None:
+            raise ValueError("Azure account is required")
+        config[f"{prefix}__OBJECT_STORE"] = "azure"
+        url = storage_settings.azure_account_url
+        container = bucket or extended_storage_settings.azure_indexing_bucket
+        if container:
+            url += f"/{container}"
+        config[f"{prefix}__CONTAINER_URL"] = url
+        if storage_settings.azure_connection_string:
+            params = {
+                p.split("=", 1)[0]: p.split("=", 1)[1]
+                for p in storage_settings.azure_connection_string.split(";")
+            }
+            if "AccountKey" in params:
+                config[f"{prefix}__ACCOUNT_KEY"] = params["AccountKey"]
+            if "BlobEndpoint" in params:
+                config[f"{prefix}__ENDPOINT"] = params["BlobEndpoint"]

     return config

@@ -198,7 +216,7 @@ class NidxServiceUtility(NidxUtility):
         return await self.indexer.index(writer)


-async def start_nidx_utility(service_name: str = "nucliadb.nidx") -> Optional[NidxUtility]:
+async def start_nidx_utility(service_name: str = "nucliadb.nidx") -> NidxUtility:
     nidx = get_utility(Utility.NIDX)
     if nidx:
         return nidx
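The new Azure branch derives nidx object-store settings from the standard `Key=Value;` connection-string format. A small sketch of that parsing using the public Azurite development connection string (these are the well-known Azurite defaults, not real credentials); the `if p` guard is added here by the illustration to tolerate a trailing `;`:

```python
# Public Azurite development defaults (safe to publish)
conn = (
    "DefaultEndpointsProtocol=http;"
    "AccountName=devstoreaccount1;"
    "AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;"
    "BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1"
)

params = {
    p.split("=", 1)[0]: p.split("=", 1)[1]
    for p in conn.split(";")
    if p  # tolerate a trailing ';'
}

assert params["AccountName"] == "devstoreaccount1"
assert params["BlobEndpoint"] == "http://127.0.0.1:10000/devstoreaccount1"
```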
nucliadb/common/vector_index_config.py CHANGED
@@ -26,7 +26,7 @@ from nucliadb_protos import knowledgebox_pb2 as Nucliadb
 def nucliadb_vector_type_to_nidx(nucliadb: Nucliadb.VectorType.ValueType) -> Nidx.VectorType.ValueType:
     if nucliadb == Nucliadb.DENSE_F32:
         return Nidx.DENSE_F32
-    else:  # pragma: nocover
+    else:  # pragma: no cover
         raise Exception("Unknown vector type")