nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (246) hide show
  1. migrations/0023_backfill_pg_catalog.py +8 -4
  2. migrations/0028_extracted_vectors_reference.py +1 -1
  3. migrations/0029_backfill_field_status.py +3 -4
  4. migrations/0032_remove_old_relations.py +2 -3
  5. migrations/0038_backfill_catalog_field_labels.py +8 -4
  6. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  7. migrations/0040_migrate_search_configurations.py +79 -0
  8. migrations/0041_reindex_conversations.py +137 -0
  9. migrations/pg/0010_shards_index.py +34 -0
  10. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  11. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  12. nucliadb/backups/create.py +2 -15
  13. nucliadb/backups/restore.py +4 -15
  14. nucliadb/backups/tasks.py +4 -1
  15. nucliadb/common/back_pressure/cache.py +2 -3
  16. nucliadb/common/back_pressure/materializer.py +7 -13
  17. nucliadb/common/back_pressure/settings.py +6 -6
  18. nucliadb/common/back_pressure/utils.py +1 -0
  19. nucliadb/common/cache.py +9 -9
  20. nucliadb/common/catalog/__init__.py +79 -0
  21. nucliadb/common/catalog/dummy.py +36 -0
  22. nucliadb/common/catalog/interface.py +85 -0
  23. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
  24. nucliadb/common/catalog/utils.py +56 -0
  25. nucliadb/common/cluster/manager.py +8 -23
  26. nucliadb/common/cluster/rebalance.py +484 -112
  27. nucliadb/common/cluster/rollover.py +36 -9
  28. nucliadb/common/cluster/settings.py +4 -9
  29. nucliadb/common/cluster/utils.py +34 -8
  30. nucliadb/common/context/__init__.py +7 -8
  31. nucliadb/common/context/fastapi.py +1 -2
  32. nucliadb/common/datamanagers/__init__.py +2 -4
  33. nucliadb/common/datamanagers/atomic.py +9 -2
  34. nucliadb/common/datamanagers/cluster.py +1 -2
  35. nucliadb/common/datamanagers/fields.py +3 -4
  36. nucliadb/common/datamanagers/kb.py +6 -6
  37. nucliadb/common/datamanagers/labels.py +2 -3
  38. nucliadb/common/datamanagers/resources.py +10 -33
  39. nucliadb/common/datamanagers/rollover.py +5 -7
  40. nucliadb/common/datamanagers/search_configurations.py +1 -2
  41. nucliadb/common/datamanagers/synonyms.py +1 -2
  42. nucliadb/common/datamanagers/utils.py +4 -4
  43. nucliadb/common/datamanagers/vectorsets.py +4 -4
  44. nucliadb/common/external_index_providers/base.py +32 -5
  45. nucliadb/common/external_index_providers/manager.py +5 -34
  46. nucliadb/common/external_index_providers/settings.py +1 -27
  47. nucliadb/common/filter_expression.py +129 -41
  48. nucliadb/common/http_clients/exceptions.py +8 -0
  49. nucliadb/common/http_clients/processing.py +16 -23
  50. nucliadb/common/http_clients/utils.py +3 -0
  51. nucliadb/common/ids.py +82 -58
  52. nucliadb/common/locking.py +1 -2
  53. nucliadb/common/maindb/driver.py +9 -8
  54. nucliadb/common/maindb/local.py +5 -5
  55. nucliadb/common/maindb/pg.py +9 -8
  56. nucliadb/common/nidx.py +22 -5
  57. nucliadb/common/vector_index_config.py +1 -1
  58. nucliadb/export_import/datamanager.py +4 -3
  59. nucliadb/export_import/exporter.py +11 -19
  60. nucliadb/export_import/importer.py +13 -6
  61. nucliadb/export_import/tasks.py +2 -0
  62. nucliadb/export_import/utils.py +6 -18
  63. nucliadb/health.py +2 -2
  64. nucliadb/ingest/app.py +8 -8
  65. nucliadb/ingest/consumer/consumer.py +8 -10
  66. nucliadb/ingest/consumer/pull.py +10 -8
  67. nucliadb/ingest/consumer/service.py +5 -30
  68. nucliadb/ingest/consumer/shard_creator.py +16 -5
  69. nucliadb/ingest/consumer/utils.py +1 -1
  70. nucliadb/ingest/fields/base.py +37 -49
  71. nucliadb/ingest/fields/conversation.py +55 -9
  72. nucliadb/ingest/fields/exceptions.py +1 -2
  73. nucliadb/ingest/fields/file.py +22 -8
  74. nucliadb/ingest/fields/link.py +7 -7
  75. nucliadb/ingest/fields/text.py +2 -3
  76. nucliadb/ingest/orm/brain_v2.py +89 -57
  77. nucliadb/ingest/orm/broker_message.py +2 -4
  78. nucliadb/ingest/orm/entities.py +10 -209
  79. nucliadb/ingest/orm/index_message.py +128 -113
  80. nucliadb/ingest/orm/knowledgebox.py +91 -59
  81. nucliadb/ingest/orm/processor/auditing.py +1 -3
  82. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  83. nucliadb/ingest/orm/processor/processor.py +98 -153
  84. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  85. nucliadb/ingest/orm/resource.py +82 -71
  86. nucliadb/ingest/orm/utils.py +1 -1
  87. nucliadb/ingest/partitions.py +12 -1
  88. nucliadb/ingest/processing.py +17 -17
  89. nucliadb/ingest/serialize.py +202 -145
  90. nucliadb/ingest/service/writer.py +15 -114
  91. nucliadb/ingest/settings.py +36 -15
  92. nucliadb/ingest/utils.py +1 -2
  93. nucliadb/learning_proxy.py +23 -26
  94. nucliadb/metrics_exporter.py +20 -6
  95. nucliadb/middleware/__init__.py +82 -1
  96. nucliadb/migrator/datamanager.py +4 -11
  97. nucliadb/migrator/migrator.py +1 -2
  98. nucliadb/migrator/models.py +1 -2
  99. nucliadb/migrator/settings.py +1 -2
  100. nucliadb/models/internal/augment.py +614 -0
  101. nucliadb/models/internal/processing.py +19 -19
  102. nucliadb/openapi.py +2 -2
  103. nucliadb/purge/__init__.py +3 -8
  104. nucliadb/purge/orphan_shards.py +1 -2
  105. nucliadb/reader/__init__.py +5 -0
  106. nucliadb/reader/api/models.py +6 -13
  107. nucliadb/reader/api/v1/download.py +59 -38
  108. nucliadb/reader/api/v1/export_import.py +4 -4
  109. nucliadb/reader/api/v1/knowledgebox.py +37 -9
  110. nucliadb/reader/api/v1/learning_config.py +33 -14
  111. nucliadb/reader/api/v1/resource.py +61 -9
  112. nucliadb/reader/api/v1/services.py +18 -14
  113. nucliadb/reader/app.py +3 -1
  114. nucliadb/reader/reader/notifications.py +1 -2
  115. nucliadb/search/api/v1/__init__.py +3 -0
  116. nucliadb/search/api/v1/ask.py +3 -4
  117. nucliadb/search/api/v1/augment.py +585 -0
  118. nucliadb/search/api/v1/catalog.py +15 -19
  119. nucliadb/search/api/v1/find.py +16 -22
  120. nucliadb/search/api/v1/hydrate.py +328 -0
  121. nucliadb/search/api/v1/knowledgebox.py +1 -2
  122. nucliadb/search/api/v1/predict_proxy.py +1 -2
  123. nucliadb/search/api/v1/resource/ask.py +28 -8
  124. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  125. nucliadb/search/api/v1/resource/search.py +9 -11
  126. nucliadb/search/api/v1/retrieve.py +130 -0
  127. nucliadb/search/api/v1/search.py +28 -32
  128. nucliadb/search/api/v1/suggest.py +11 -14
  129. nucliadb/search/api/v1/summarize.py +1 -2
  130. nucliadb/search/api/v1/utils.py +2 -2
  131. nucliadb/search/app.py +3 -2
  132. nucliadb/search/augmentor/__init__.py +21 -0
  133. nucliadb/search/augmentor/augmentor.py +232 -0
  134. nucliadb/search/augmentor/fields.py +704 -0
  135. nucliadb/search/augmentor/metrics.py +24 -0
  136. nucliadb/search/augmentor/paragraphs.py +334 -0
  137. nucliadb/search/augmentor/resources.py +238 -0
  138. nucliadb/search/augmentor/utils.py +33 -0
  139. nucliadb/search/lifecycle.py +3 -1
  140. nucliadb/search/predict.py +33 -19
  141. nucliadb/search/predict_models.py +8 -9
  142. nucliadb/search/requesters/utils.py +11 -10
  143. nucliadb/search/search/cache.py +19 -42
  144. nucliadb/search/search/chat/ask.py +131 -59
  145. nucliadb/search/search/chat/exceptions.py +3 -5
  146. nucliadb/search/search/chat/fetcher.py +201 -0
  147. nucliadb/search/search/chat/images.py +6 -4
  148. nucliadb/search/search/chat/old_prompt.py +1375 -0
  149. nucliadb/search/search/chat/parser.py +510 -0
  150. nucliadb/search/search/chat/prompt.py +563 -615
  151. nucliadb/search/search/chat/query.py +453 -32
  152. nucliadb/search/search/chat/rpc.py +85 -0
  153. nucliadb/search/search/fetch.py +3 -4
  154. nucliadb/search/search/filters.py +8 -11
  155. nucliadb/search/search/find.py +33 -31
  156. nucliadb/search/search/find_merge.py +124 -331
  157. nucliadb/search/search/graph_strategy.py +14 -12
  158. nucliadb/search/search/hydrator/__init__.py +49 -0
  159. nucliadb/search/search/hydrator/fields.py +217 -0
  160. nucliadb/search/search/hydrator/images.py +130 -0
  161. nucliadb/search/search/hydrator/paragraphs.py +323 -0
  162. nucliadb/search/search/hydrator/resources.py +60 -0
  163. nucliadb/search/search/ingestion_agents.py +5 -5
  164. nucliadb/search/search/merge.py +90 -94
  165. nucliadb/search/search/metrics.py +24 -7
  166. nucliadb/search/search/paragraphs.py +7 -9
  167. nucliadb/search/search/predict_proxy.py +44 -18
  168. nucliadb/search/search/query.py +14 -86
  169. nucliadb/search/search/query_parser/fetcher.py +51 -82
  170. nucliadb/search/search/query_parser/models.py +19 -48
  171. nucliadb/search/search/query_parser/old_filters.py +20 -19
  172. nucliadb/search/search/query_parser/parsers/ask.py +5 -6
  173. nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
  174. nucliadb/search/search/query_parser/parsers/common.py +21 -13
  175. nucliadb/search/search/query_parser/parsers/find.py +6 -29
  176. nucliadb/search/search/query_parser/parsers/graph.py +18 -28
  177. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  178. nucliadb/search/search/query_parser/parsers/search.py +15 -56
  179. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  180. nucliadb/search/search/rank_fusion.py +18 -13
  181. nucliadb/search/search/rerankers.py +6 -7
  182. nucliadb/search/search/retrieval.py +300 -0
  183. nucliadb/search/search/summarize.py +5 -6
  184. nucliadb/search/search/utils.py +3 -4
  185. nucliadb/search/settings.py +1 -2
  186. nucliadb/standalone/api_router.py +1 -1
  187. nucliadb/standalone/app.py +4 -3
  188. nucliadb/standalone/auth.py +5 -6
  189. nucliadb/standalone/lifecycle.py +2 -2
  190. nucliadb/standalone/run.py +5 -4
  191. nucliadb/standalone/settings.py +5 -6
  192. nucliadb/standalone/versions.py +3 -4
  193. nucliadb/tasks/consumer.py +13 -8
  194. nucliadb/tasks/models.py +2 -1
  195. nucliadb/tasks/producer.py +3 -3
  196. nucliadb/tasks/retries.py +8 -7
  197. nucliadb/train/api/utils.py +1 -3
  198. nucliadb/train/api/v1/shards.py +1 -2
  199. nucliadb/train/api/v1/trainset.py +1 -2
  200. nucliadb/train/app.py +1 -1
  201. nucliadb/train/generator.py +4 -4
  202. nucliadb/train/generators/field_classifier.py +2 -2
  203. nucliadb/train/generators/field_streaming.py +6 -6
  204. nucliadb/train/generators/image_classifier.py +2 -2
  205. nucliadb/train/generators/paragraph_classifier.py +2 -2
  206. nucliadb/train/generators/paragraph_streaming.py +2 -2
  207. nucliadb/train/generators/question_answer_streaming.py +2 -2
  208. nucliadb/train/generators/sentence_classifier.py +4 -10
  209. nucliadb/train/generators/token_classifier.py +3 -2
  210. nucliadb/train/generators/utils.py +6 -5
  211. nucliadb/train/nodes.py +3 -3
  212. nucliadb/train/resource.py +6 -8
  213. nucliadb/train/settings.py +3 -4
  214. nucliadb/train/types.py +11 -11
  215. nucliadb/train/upload.py +3 -2
  216. nucliadb/train/uploader.py +1 -2
  217. nucliadb/train/utils.py +1 -2
  218. nucliadb/writer/api/v1/export_import.py +4 -1
  219. nucliadb/writer/api/v1/field.py +15 -14
  220. nucliadb/writer/api/v1/knowledgebox.py +18 -56
  221. nucliadb/writer/api/v1/learning_config.py +5 -4
  222. nucliadb/writer/api/v1/resource.py +9 -20
  223. nucliadb/writer/api/v1/services.py +10 -132
  224. nucliadb/writer/api/v1/upload.py +73 -72
  225. nucliadb/writer/app.py +8 -2
  226. nucliadb/writer/resource/basic.py +12 -15
  227. nucliadb/writer/resource/field.py +43 -5
  228. nucliadb/writer/resource/origin.py +7 -0
  229. nucliadb/writer/settings.py +2 -3
  230. nucliadb/writer/tus/__init__.py +2 -3
  231. nucliadb/writer/tus/azure.py +5 -7
  232. nucliadb/writer/tus/dm.py +3 -3
  233. nucliadb/writer/tus/exceptions.py +3 -4
  234. nucliadb/writer/tus/gcs.py +15 -22
  235. nucliadb/writer/tus/s3.py +2 -3
  236. nucliadb/writer/tus/storage.py +3 -3
  237. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
  238. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  239. nucliadb/common/datamanagers/entities.py +0 -139
  240. nucliadb/common/external_index_providers/pinecone.py +0 -894
  241. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  242. nucliadb/search/search/hydrator.py +0 -197
  243. nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
  244. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  245. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  246. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
@@ -18,7 +18,6 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
 
21
- from typing import Optional
22
21
 
23
22
  from nucliadb.common.datamanagers.utils import get_kv_pb
24
23
  from nucliadb.common.maindb.driver import Transaction
@@ -27,7 +26,7 @@ from nucliadb_protos import knowledgebox_pb2
27
26
  KB_SYNONYMS = "/kbs/{kbid}/synonyms"
28
27
 
29
28
 
30
- async def get(txn: Transaction, *, kbid: str) -> Optional[knowledgebox_pb2.Synonyms]:
29
async def get(txn: Transaction, *, kbid: str) -> knowledgebox_pb2.Synonyms | None:
    """Fetch the synonyms protobuf stored for a knowledge box.

    The key is built from the `KB_SYNONYMS` template and read through
    `get_kv_pb` with `for_update=False`. Returns `None` when `get_kv_pb`
    finds nothing stored under that key.
    """
    return await get_kv_pb(
        txn,
        KB_SYNONYMS.format(kbid=kbid),
        knowledgebox_pb2.Synonyms,
        for_update=False,
    )
33
32
 
@@ -18,7 +18,7 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
  import contextlib
21
- from typing import Optional, Type, TypeVar
21
+ from typing import TypeVar
22
22
 
23
23
  from google.protobuf.message import Message
24
24
 
@@ -29,9 +29,9 @@ PB_TYPE = TypeVar("PB_TYPE", bound=Message)
29
29
 
30
30
 
31
31
  async def get_kv_pb(
32
- txn: Transaction, key: str, pb_type: Type[PB_TYPE], for_update: bool = True
33
- ) -> Optional[PB_TYPE]:
34
- serialized: Optional[bytes] = await txn.get(key, for_update=for_update)
32
+ txn: Transaction, key: str, pb_type: type[PB_TYPE], for_update: bool = True
33
+ ) -> PB_TYPE | None:
34
+ serialized: bytes | None = await txn.get(key, for_update=for_update)
35
35
  if serialized is None:
36
36
  return None
37
37
  pb = pb_type()
@@ -17,7 +17,7 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from typing import AsyncIterator, Optional
20
+ from collections.abc import AsyncIterator
21
21
 
22
22
  from nucliadb.common.datamanagers.utils import get_kv_pb
23
23
  from nucliadb.common.maindb.driver import Transaction
@@ -37,7 +37,7 @@ async def initialize(txn: Transaction, *, kbid: str):
37
37
 
38
38
  async def get(
39
39
  txn: Transaction, *, kbid: str, vectorset_id: str
40
- ) -> Optional[knowledgebox_pb2.VectorSetConfig]:
40
+ ) -> knowledgebox_pb2.VectorSetConfig | None:
41
41
  kb_vectorsets = await _get_or_default(txn, kbid=kbid, for_update=False)
42
42
  index = _find_vectorset(kb_vectorsets, vectorset_id)
43
43
  if index is None:
@@ -80,7 +80,7 @@ async def set(txn: Transaction, *, kbid: str, config: knowledgebox_pb2.VectorSet
80
80
 
81
81
  async def delete(
82
82
  txn: Transaction, *, kbid: str, vectorset_id: str
83
- ) -> Optional[knowledgebox_pb2.VectorSetConfig]:
83
+ ) -> knowledgebox_pb2.VectorSetConfig | None:
84
84
  kb_vectorsets = await _get_or_default(txn, kbid=kbid, for_update=True)
85
85
  index = _find_vectorset(kb_vectorsets, vectorset_id)
86
86
  if index is None:
@@ -111,7 +111,7 @@ async def _get_or_default(
111
111
 
112
112
  def _find_vectorset(
113
113
  kb_vectorsets: knowledgebox_pb2.KnowledgeBoxVectorSetsConfig, vectorset_id: str
114
- ) -> Optional[int]:
114
+ ) -> int | None:
115
115
  """Return the position of the vectorset in `vectorsets` or `None` if not found."""
116
116
  for idx, vectorset in enumerate(kb_vectorsets.vectorsets):
117
117
  if vectorset.vectorset_id == vectorset_id:
@@ -19,8 +19,9 @@
19
19
  #
20
20
  import abc
21
21
  import logging
22
+ from collections.abc import Iterator
22
23
  from dataclasses import dataclass
23
- from typing import Any, Iterator, Optional
24
+ from typing import Any
24
25
 
25
26
  from nidx_protos.nodereader_pb2 import SearchRequest
26
27
  from nidx_protos.noderesources_pb2 import Resource
@@ -30,7 +31,9 @@ from nucliadb.common.counters import IndexCounts
30
31
  from nucliadb.common.external_index_providers.exceptions import ExternalIndexingError
31
32
  from nucliadb.common.ids import ParagraphId
32
33
  from nucliadb_models.external_index_providers import ExternalIndexProviderType
34
+ from nucliadb_models.retrieval import Score
33
35
  from nucliadb_models.search import SCORE_TYPE, Relations, TextPosition
36
+ from nucliadb_protos import resources_pb2
34
37
  from nucliadb_protos.knowledgebox_pb2 import (
35
38
  CreateExternalIndexProviderMetadata,
36
39
  StoredExternalIndexProviderMetadata,
@@ -43,6 +46,16 @@ logger = logging.getLogger(__name__)
43
46
  manager_observer = Observer("external_index_manager", labels={"operation": "", "provider": ""})
44
47
 
45
48
 
49
# Paragraph-kind labels, derived from the protobuf enum names so they follow the
# enum definition instead of hard-coded strings.
# /k/ocr
_OCR_LABEL = (
    f"/k/{resources_pb2.Paragraph.TypeParagraph.Name(resources_pb2.Paragraph.TypeParagraph.OCR).lower()}"
)
# /k/inception
# Must be built from the INCEPTION enum value: building it from OCR makes it a
# duplicate of _OCR_LABEL, so inception paragraphs are never recognised.
_INCEPTION_LABEL = (
    f"/k/{resources_pb2.Paragraph.TypeParagraph.Name(resources_pb2.Paragraph.TypeParagraph.INCEPTION).lower()}"
)
57
+
58
+
46
59
  @dataclass
47
60
  class VectorsetExternalIndex:
48
61
  """
@@ -57,9 +70,19 @@ class VectorsetExternalIndex:
57
70
 
58
71
class ScoredTextBlock(BaseModel):
    """A paragraph reference together with the scores assigned to it.

    The last element of `scores` is the one exposed as the current score.
    """

    paragraph_id: ParagraphId
    score_type: SCORE_TYPE

    scores: list[Score]

    @property
    def current_score(self) -> Score:
        """Last `Score` in `scores`; asserts that at least one is present."""
        assert self.scores, "text block matches must be scored"
        return self.scores[-1]

    @property
    def score(self) -> float:
        """Numeric value of the current score."""
        return self.current_score.score
85
+
63
86
 
64
87
  class TextBlockMatch(ScoredTextBlock):
65
88
  """
@@ -72,11 +95,15 @@ class TextBlockMatch(ScoredTextBlock):
72
95
  page_with_visual: bool = False
73
96
  fuzzy_search: bool
74
97
  is_a_table: bool = False
75
- representation_file: Optional[str] = None
98
+ representation_file: str | None = None
76
99
  paragraph_labels: list[str] = []
77
100
  field_labels: list[str] = []
78
- text: Optional[str] = None
79
- relevant_relations: Optional[Relations] = None
101
+ text: str | None = None
102
+ relevant_relations: Relations | None = None
103
+
104
+ @property
105
+ def is_an_image(self) -> bool:
106
+ return _OCR_LABEL in self.paragraph_labels or _INCEPTION_LABEL in self.paragraph_labels
80
107
 
81
108
 
82
109
  class QueryResults(BaseModel):
@@ -17,62 +17,33 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from typing import Optional
21
20
 
22
21
  import async_lru
23
22
 
24
23
  from nucliadb.common import datamanagers
25
24
  from nucliadb.common.external_index_providers.base import ExternalIndexManager
26
- from nucliadb.common.external_index_providers.pinecone import PineconeIndexManager
27
- from nucliadb.common.external_index_providers.settings import settings
28
25
  from nucliadb_protos.knowledgebox_pb2 import (
29
- ExternalIndexProviderType,
30
26
  StoredExternalIndexProviderMetadata,
31
27
  )
32
- from nucliadb_utils.utilities import get_endecryptor
33
28
 
34
29
 
35
30
async def get_external_index_manager(
    kbid: str, for_rollover: bool = False
) -> ExternalIndexManager | None:
    """
    Returns the ExternalIndexManager for the given kbid, if there is one.

    No external index provider is instantiated here, so this always returns None.
    `kbid` and `for_rollover` are kept in the signature so existing callers keep
    working; neither is used.
    """
    return None
67
38
 
68
39
 
69
40
@async_lru.alru_cache(maxsize=None)
async def get_external_index_metadata(kbid: str) -> StoredExternalIndexProviderMetadata | None:
    """Return the external index provider metadata stored for `kbid`.

    Results are memoized per `kbid` by `alru_cache`, with no size bound.
    """
    metadata = await datamanagers.atomic.kb.get_external_index_provider_metadata(kbid=kbid)
    return metadata
72
43
 
73
44
 
74
45
  @async_lru.alru_cache(maxsize=None)
75
- async def get_default_vectorset_id(kbid: str) -> Optional[str]:
46
+ async def get_default_vectorset_id(kbid: str) -> str | None:
76
47
  """
77
48
  While we are transitioning to the new vectorset system, we need to take into account
78
49
  that KBs that have only one semantic model will have the `vectorset_id` field on BrokerMessage.field_vectors
@@ -96,6 +67,6 @@ async def get_default_vectorset_id(kbid: str) -> Optional[str]:
96
67
 
97
68
async def get_rollover_external_index_metadata(
    kbid: str,
) -> StoredExternalIndexProviderMetadata | None:
    """Read the rollover external index metadata for `kbid` inside a read-only transaction."""
    async with datamanagers.with_ro_transaction() as ro_txn:
        rollover_metadata = await datamanagers.rollover.get_kb_rollover_external_index_metadata(
            ro_txn, kbid=kbid
        )
        return rollover_metadata
@@ -17,36 +17,10 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from pydantic import Field
21
20
  from pydantic_settings import BaseSettings
22
21
 
23
22
 
24
- class ExternalIndexProvidersSettings(BaseSettings):
25
- pinecone_upsert_parallelism: int = Field(
26
- default=3,
27
- title="Pinecone upsert parallelism",
28
- description="Number of parallel upserts to Pinecone on each set resource operation",
29
- )
30
- pinecone_delete_parallelism: int = Field(
31
- default=2,
32
- title="Pinecone delete parallelism",
33
- description="Number of parallel deletes to Pinecone on each delete resource operation",
34
- )
35
- pinecone_upsert_timeout: float = Field(
36
- default=10.0,
37
- title="Pinecone upsert timeout",
38
- description="Timeout in seconds for each upsert operation to Pinecone",
39
- )
40
- pinecone_delete_timeout: float = Field(
41
- default=10.0,
42
- title="Pinecone delete timeout",
43
- description="Timeout in seconds for each delete operation to Pinecone",
44
- )
45
- pinecone_query_timeout: float = Field(
46
- default=10.0,
47
- title="Pinecone query timeout",
48
- description="Timeout in seconds for each query operation to Pinecone",
49
- )
23
class ExternalIndexProvidersSettings(BaseSettings):
    """Settings for external index providers; declares no fields of its own."""
50
24
 
51
25
 
52
26
  settings = ExternalIndexProvidersSettings()
@@ -18,13 +18,14 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
 
21
- from typing import Union
22
21
 
23
22
  from nidx_protos.nodereader_pb2 import FilterExpression as PBFilterExpression
23
+ from typing_extensions import assert_never
24
24
 
25
25
  from nucliadb.common import datamanagers
26
26
  from nucliadb.common.exceptions import InvalidQueryError
27
27
  from nucliadb.common.ids import FIELD_TYPE_NAME_TO_STR
28
+ from nucliadb_models.common import Paragraph
28
29
  from nucliadb_models.filters import (
29
30
  And,
30
31
  DateCreated,
@@ -50,44 +51,28 @@ from nucliadb_models.filters import (
50
51
  ResourceMimetype,
51
52
  Status,
52
53
  )
54
+ from nucliadb_models.metadata import ResourceProcessingStatus
53
55
 
54
56
  # Filters that end up as a facet
55
- FacetFilter = Union[
56
- OriginTag,
57
- Label,
58
- ResourceMimetype,
59
- FieldMimetype,
60
- Entity,
61
- Language,
62
- OriginMetadata,
63
- OriginPath,
64
- Generated,
65
- Kind,
66
- OriginCollaborator,
67
- OriginSource,
68
- Status,
69
- ]
70
- # In Python 3.9 we cannot do isinstance against an union
71
- # Once we support only 3.10+, we can remove this
72
- FacetFilterTypes = (
73
- OriginTag,
74
- Label,
75
- ResourceMimetype,
76
- FieldMimetype,
77
- Entity,
78
- Language,
79
- OriginMetadata,
80
- OriginPath,
81
- Generated,
82
- Kind,
83
- OriginCollaborator,
84
- OriginSource,
85
- Status,
57
+ FacetFilter = (
58
+ OriginTag
59
+ | Label
60
+ | ResourceMimetype
61
+ | FieldMimetype
62
+ | Entity
63
+ | Language
64
+ | OriginMetadata
65
+ | OriginPath
66
+ | Generated
67
+ | Kind
68
+ | OriginCollaborator
69
+ | OriginSource
70
+ | Status
86
71
  )
87
72
 
88
73
 
89
74
  async def parse_expression(
90
- expr: Union[FieldFilterExpression, ParagraphFilterExpression],
75
+ expr: FieldFilterExpression | ParagraphFilterExpression,
91
76
  kbid: str,
92
77
  ) -> PBFilterExpression:
93
78
  f = PBFilterExpression()
@@ -110,7 +95,7 @@ async def parse_expression(
110
95
  if rid is None:
111
96
  raise InvalidQueryError("slug", f"Cannot find slug {expr.slug}")
112
97
  f.resource.resource_id = rid
113
- else: # pragma: nocover
98
+ else: # pragma: no cover
114
99
  # Cannot happen due to model validation
115
100
  raise ValueError("Resource needs id or slug")
116
101
  elif isinstance(expr, Field):
@@ -131,12 +116,10 @@ async def parse_expression(
131
116
  f.date.since.FromDatetime(expr.since)
132
117
  if expr.until:
133
118
  f.date.until.FromDatetime(expr.until)
134
- elif isinstance(expr, FacetFilterTypes):
119
+ elif isinstance(expr, FacetFilter):
135
120
  f.facet.facet = facet_from_filter(expr)
136
121
  else:
137
- # This is a trick so mypy generates an error if this branch can be reached,
138
- # that is, if we are missing some ifs
139
- _a: int = "a"
122
+ assert_never(expr)
140
123
 
141
124
  return f
142
125
 
@@ -190,13 +173,118 @@ def facet_from_filter(expr: FacetFilter) -> str:
190
173
  elif isinstance(expr, Status):
191
174
  facet = f"/n/s/{expr.status.value}"
192
175
  else:
193
- # This is a trick so mypy generates an error if this branch can be reached,
194
- # that is, if we are missing some ifs
195
- _a: int = "a"
176
+ assert_never(expr)
196
177
 
197
178
  return facet
198
179
 
199
180
 
181
def _facet_kwargs(value: str, first: str, second: str) -> dict[str, str]:
    """Map "a/b" to {first: "a", second: "b"} and a slash-less "a" to {first: "a"}.

    Only the first "/" splits; anything after it (further slashes included)
    goes to `second`. `second` is left out entirely when `value` has no "/",
    so the model's own default applies.
    """
    head, sep, tail = value.partition("/")
    if not sep:
        return {first: head}
    return {first: head, second: tail}


def filter_from_facet(facet: str) -> FacetFilter:
    """Build the filter model that corresponds to a facet string.

    The facet is dispatched on its prefix ("/t/", "/l/", "/n/i/", ...) and the
    remainder is used to populate the matching filter model. Branch order
    matters because some prefixes are tested without a trailing slash.

    Args:
        facet: facet string, e.g. "/l/labelset/label".

    Returns:
        The filter model for the facet.

    Raises:
        InvalidQueryError: if the prefix is not recognised, or if a "/k/" or
            "/n/s/" facet carries a value that is not a member of the
            corresponding enum.
    """
    expr: FacetFilter

    if facet.startswith("/t/"):
        expr = OriginTag(tag=facet.removeprefix("/t/"))

    elif facet.startswith("/l/"):
        expr = Label(**_facet_kwargs(facet.removeprefix("/l/"), "labelset", "label"))

    elif facet.startswith("/n/i/"):
        expr = ResourceMimetype(**_facet_kwargs(facet.removeprefix("/n/i/"), "type", "subtype"))

    elif facet.startswith("/mt/"):
        expr = FieldMimetype(**_facet_kwargs(facet.removeprefix("/mt/"), "type", "subtype"))

    elif facet.startswith("/e/"):
        expr = Entity(**_facet_kwargs(facet.removeprefix("/e/"), "subtype", "value"))

    # NOTE(review): the two language branches test the prefix without the
    # trailing slash but strip it with one, so a bare "/s/p" or "/s/s" is
    # passed through unchanged as the language. Kept as-is to preserve
    # behaviour; confirm whether these should test "/s/p/" and "/s/s/".
    elif facet.startswith("/s/p"):
        expr = Language(language=facet.removeprefix("/s/p/"), only_primary=True)

    elif facet.startswith("/s/s"):
        expr = Language(language=facet.removeprefix("/s/s/"), only_primary=False)

    elif facet.startswith("/m/"):
        expr = OriginMetadata(**_facet_kwargs(facet.removeprefix("/m/"), "field", "value"))

    elif facet.startswith("/p/"):
        expr = OriginPath(prefix=facet.removeprefix("/p/"))

    elif facet.startswith("/g/da"):
        expr = Generated(by="data-augmentation")
        # The task name is optional: "/g/da" alone leaves da_task untouched.
        da_task = facet.removeprefix("/g/da").removeprefix("/")
        if da_task:
            expr.da_task = da_task

    elif facet.startswith("/k/"):
        value = facet.removeprefix("/k/")
        try:
            kind = Paragraph.TypeParagraph(value.upper())
        except ValueError as err:
            raise InvalidQueryError("filters", f"invalid paragraph kind: {value}") from err
        expr = Kind(kind=kind)

    elif facet.startswith("/u/o/"):
        expr = OriginCollaborator(collaborator=facet.removeprefix("/u/o/"))

    elif facet.startswith("/u/s"):
        expr = OriginSource()
        # The source id is optional: "/u/s" alone leaves id untouched.
        source_id = facet.removeprefix("/u/s").removeprefix("/")
        if source_id:
            expr.id = source_id

    elif facet.startswith("/n/s/"):
        value = facet.removeprefix("/n/s/")
        try:
            status = ResourceProcessingStatus(value.upper())
        except ValueError as err:
            raise InvalidQueryError("filters", f"invalid resource processing status: {value}") from err
        expr = Status(status=status)

    else:
        raise InvalidQueryError("filters", f"invalid filter: {facet}")

    return expr
286
+
287
+
200
288
  def add_and_expression(dest: PBFilterExpression, add: PBFilterExpression):
201
289
  dest_expr_type = dest.WhichOneof("expr")
202
290
  if dest_expr_type is None:
@@ -21,6 +21,10 @@ class ClientException(Exception):
21
21
  pass
22
22
 
23
23
 
24
class ServerException(Exception):
    """Base class for server-side errors, kept separate from ClientException."""
26
+
27
+
24
28
  class NotFoundException(ClientException):
25
29
  pass
26
30
 
@@ -35,3 +39,7 @@ class RateLimitException(ClientException):
35
39
 
36
40
  class AccountLimitException(ClientException):
37
41
  pass
42
+
43
+
44
class ServiceUnavailableException(ServerException):
    """Raised by `check_status` for HTTP 502 and 503 responses; the request can be retried."""
@@ -19,10 +19,8 @@
19
19
  #
20
20
  import logging
21
21
  from datetime import datetime
22
- from typing import Optional
23
22
 
24
23
  import aiohttp
25
- import jwt
26
24
  import pydantic
27
25
 
28
26
  from nucliadb_utils.helpers import MessageProgressUpdater
@@ -33,15 +31,6 @@ from .utils import check_status
33
31
  logger = logging.getLogger(__name__)
34
32
 
35
33
 
36
- def get_nua_api_id() -> str:
37
- assert nuclia_settings.nuclia_service_account is not None
38
- claimset = jwt.decode(
39
- nuclia_settings.nuclia_service_account,
40
- options={"verify_signature": False},
41
- )
42
- return claimset.get("sub")
43
-
44
-
45
34
  def get_processing_api_url() -> str:
46
35
  if nuclia_settings.nuclia_service_account:
47
36
  return (
@@ -64,10 +53,10 @@ def get_processing_api_v2_url() -> str:
64
53
 
65
54
  class PullResponse(pydantic.BaseModel):
66
55
  status: str
67
- payload: Optional[str] = None
56
+ payload: str | None = None
68
57
  payloads: list[bytes] = []
69
- msgid: Optional[str] = None
70
- cursor: Optional[int] = None
58
+ msgid: str | None = None
59
+ cursor: int | None = None
71
60
 
72
61
 
73
62
  class PullPosition(pydantic.BaseModel):
@@ -86,7 +75,7 @@ class RequestsResult(pydantic.BaseModel):
86
75
  description="Resource ID.",
87
76
  )
88
77
  kbid: str = pydantic.Field(..., title="KnowledgeBox ID")
89
- title: Optional[str] = pydantic.Field(
78
+ title: str | None = pydantic.Field(
90
79
  None,
91
80
  title="Title",
92
81
  description="Title of the resource.",
@@ -111,12 +100,12 @@ class RequestsResult(pydantic.BaseModel):
111
100
  title="Timestamp",
112
101
  description="Timestamp of when the resource was first scheduled.",
113
102
  )
114
- completed_at: Optional[datetime] = pydantic.Field(
103
+ completed_at: datetime | None = pydantic.Field(
115
104
  None,
116
105
  title="Completed At",
117
106
  description="Timestamp of when the resource was completed",
118
107
  )
119
- scheduled_at: Optional[datetime] = pydantic.Field(
108
+ scheduled_at: datetime | None = pydantic.Field(
120
109
  None,
121
110
  title="Scheduled At",
122
111
  description="Timestamp of when the resource was first scheduled.",
@@ -149,7 +138,7 @@ class RequestsResults(pydantic.BaseModel):
149
138
  title="Results",
150
139
  description="List of results.",
151
140
  )
152
- cursor: Optional[str] = pydantic.Field(
141
+ cursor: str | None = pydantic.Field(
153
142
  None,
154
143
  title="Cursor",
155
144
  description="Cursor to use for the next page of results.",
@@ -209,6 +198,10 @@ class ProcessingHTTPClient:
209
198
  async def close(self):
210
199
  await self.session.close()
211
200
 
201
+ async def reset_session(self):
202
+ await self.close()
203
+ self.session = aiohttp.ClientSession()
204
+
212
205
  async def in_progress(self, ack_token: str):
213
206
  url = self.base_url_v2 + "/pull/in_progress"
214
207
  request = InProgressRequest(ack=[ack_token])
@@ -220,7 +213,7 @@ class ProcessingHTTPClient:
220
213
 
221
214
  async def pull_v2(
222
215
  self, ack_tokens: list[str], limit: int = 1, timeout: float = 5
223
- ) -> Optional[PullResponseV2]:
216
+ ) -> PullResponseV2 | None:
224
217
  url = self.base_url_v2 + "/pull"
225
218
  request = PullRequestV2(limit=limit, timeout=timeout, ack=ack_tokens)
226
219
  async with self.session.post(
@@ -244,9 +237,9 @@ class ProcessingHTTPClient:
244
237
 
245
238
  async def requests(
246
239
  self,
247
- cursor: Optional[str] = None,
248
- scheduled: Optional[bool] = None,
249
- kbid: Optional[str] = None,
240
+ cursor: str | None = None,
241
+ scheduled: bool | None = None,
242
+ kbid: str | None = None,
250
243
  limit: int = 20,
251
244
  ) -> RequestsResults:
252
245
  url = self.base_url + "/requests"
@@ -263,7 +256,7 @@ class ProcessingHTTPClient:
263
256
  check_status(resp, resp_text)
264
257
  return RequestsResults.model_validate_json(resp_text)
265
258
 
266
- async def stats(self, kbid: str, timeout: Optional[float] = 1.0) -> StatsResponse:
259
+ async def stats(self, kbid: str, timeout: float | None = 1.0) -> StatsResponse:
267
260
  url = self.base_url + "/stats"
268
261
  async with self.session.get(
269
262
  url,
@@ -33,5 +33,8 @@ def check_status(resp: aiohttp.ClientResponse, resp_text: str) -> None:
33
33
  raise exceptions.AuthorizationException(f"Unauthorized to access: {resp.status}")
34
34
  elif resp.status == 429:
35
35
  raise exceptions.RateLimitException("Rate limited")
36
+ elif resp.status in (502, 503):
37
+ # Service unavailable, can be retried
38
+ raise exceptions.ServiceUnavailableException(f"Service unavailable: {resp.status} - {resp_text}")
36
39
  else:
37
40
  raise exceptions.ClientException(f"Unknown error: {resp.status} - {resp_text}")