nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231)
  1. migrations/0023_backfill_pg_catalog.py +2 -2
  2. migrations/0029_backfill_field_status.py +3 -4
  3. migrations/0032_remove_old_relations.py +2 -3
  4. migrations/0038_backfill_catalog_field_labels.py +2 -2
  5. migrations/0039_backfill_converation_splits_metadata.py +2 -2
  6. migrations/0041_reindex_conversations.py +137 -0
  7. migrations/pg/0010_shards_index.py +34 -0
  8. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  9. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  10. nucliadb/backups/create.py +2 -15
  11. nucliadb/backups/restore.py +4 -15
  12. nucliadb/backups/tasks.py +4 -1
  13. nucliadb/common/back_pressure/cache.py +2 -3
  14. nucliadb/common/back_pressure/materializer.py +7 -13
  15. nucliadb/common/back_pressure/settings.py +6 -6
  16. nucliadb/common/back_pressure/utils.py +1 -0
  17. nucliadb/common/cache.py +9 -9
  18. nucliadb/common/catalog/interface.py +12 -12
  19. nucliadb/common/catalog/pg.py +41 -29
  20. nucliadb/common/catalog/utils.py +3 -3
  21. nucliadb/common/cluster/manager.py +5 -4
  22. nucliadb/common/cluster/rebalance.py +483 -114
  23. nucliadb/common/cluster/rollover.py +25 -9
  24. nucliadb/common/cluster/settings.py +3 -8
  25. nucliadb/common/cluster/utils.py +34 -8
  26. nucliadb/common/context/__init__.py +7 -8
  27. nucliadb/common/context/fastapi.py +1 -2
  28. nucliadb/common/datamanagers/__init__.py +2 -4
  29. nucliadb/common/datamanagers/atomic.py +4 -2
  30. nucliadb/common/datamanagers/cluster.py +1 -2
  31. nucliadb/common/datamanagers/fields.py +3 -4
  32. nucliadb/common/datamanagers/kb.py +6 -6
  33. nucliadb/common/datamanagers/labels.py +2 -3
  34. nucliadb/common/datamanagers/resources.py +10 -33
  35. nucliadb/common/datamanagers/rollover.py +5 -7
  36. nucliadb/common/datamanagers/search_configurations.py +1 -2
  37. nucliadb/common/datamanagers/synonyms.py +1 -2
  38. nucliadb/common/datamanagers/utils.py +4 -4
  39. nucliadb/common/datamanagers/vectorsets.py +4 -4
  40. nucliadb/common/external_index_providers/base.py +32 -5
  41. nucliadb/common/external_index_providers/manager.py +4 -5
  42. nucliadb/common/filter_expression.py +128 -40
  43. nucliadb/common/http_clients/processing.py +12 -23
  44. nucliadb/common/ids.py +6 -4
  45. nucliadb/common/locking.py +1 -2
  46. nucliadb/common/maindb/driver.py +9 -8
  47. nucliadb/common/maindb/local.py +5 -5
  48. nucliadb/common/maindb/pg.py +9 -8
  49. nucliadb/common/nidx.py +3 -4
  50. nucliadb/export_import/datamanager.py +4 -3
  51. nucliadb/export_import/exporter.py +11 -19
  52. nucliadb/export_import/importer.py +13 -6
  53. nucliadb/export_import/tasks.py +2 -0
  54. nucliadb/export_import/utils.py +6 -18
  55. nucliadb/health.py +2 -2
  56. nucliadb/ingest/app.py +8 -8
  57. nucliadb/ingest/consumer/consumer.py +8 -10
  58. nucliadb/ingest/consumer/pull.py +3 -8
  59. nucliadb/ingest/consumer/service.py +3 -3
  60. nucliadb/ingest/consumer/utils.py +1 -1
  61. nucliadb/ingest/fields/base.py +28 -49
  62. nucliadb/ingest/fields/conversation.py +12 -12
  63. nucliadb/ingest/fields/exceptions.py +1 -2
  64. nucliadb/ingest/fields/file.py +22 -8
  65. nucliadb/ingest/fields/link.py +7 -7
  66. nucliadb/ingest/fields/text.py +2 -3
  67. nucliadb/ingest/orm/brain_v2.py +78 -64
  68. nucliadb/ingest/orm/broker_message.py +2 -4
  69. nucliadb/ingest/orm/entities.py +10 -209
  70. nucliadb/ingest/orm/index_message.py +4 -4
  71. nucliadb/ingest/orm/knowledgebox.py +18 -27
  72. nucliadb/ingest/orm/processor/auditing.py +1 -3
  73. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  74. nucliadb/ingest/orm/processor/processor.py +27 -27
  75. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  76. nucliadb/ingest/orm/resource.py +72 -70
  77. nucliadb/ingest/orm/utils.py +1 -1
  78. nucliadb/ingest/processing.py +17 -17
  79. nucliadb/ingest/serialize.py +202 -145
  80. nucliadb/ingest/service/writer.py +3 -109
  81. nucliadb/ingest/settings.py +3 -4
  82. nucliadb/ingest/utils.py +1 -2
  83. nucliadb/learning_proxy.py +11 -11
  84. nucliadb/metrics_exporter.py +5 -4
  85. nucliadb/middleware/__init__.py +82 -1
  86. nucliadb/migrator/datamanager.py +3 -4
  87. nucliadb/migrator/migrator.py +1 -2
  88. nucliadb/migrator/models.py +1 -2
  89. nucliadb/migrator/settings.py +1 -2
  90. nucliadb/models/internal/augment.py +614 -0
  91. nucliadb/models/internal/processing.py +19 -19
  92. nucliadb/openapi.py +2 -2
  93. nucliadb/purge/__init__.py +3 -8
  94. nucliadb/purge/orphan_shards.py +1 -2
  95. nucliadb/reader/__init__.py +5 -0
  96. nucliadb/reader/api/models.py +6 -13
  97. nucliadb/reader/api/v1/download.py +59 -38
  98. nucliadb/reader/api/v1/export_import.py +4 -4
  99. nucliadb/reader/api/v1/learning_config.py +24 -4
  100. nucliadb/reader/api/v1/resource.py +61 -9
  101. nucliadb/reader/api/v1/services.py +18 -14
  102. nucliadb/reader/app.py +3 -1
  103. nucliadb/reader/reader/notifications.py +1 -2
  104. nucliadb/search/api/v1/__init__.py +2 -0
  105. nucliadb/search/api/v1/ask.py +3 -4
  106. nucliadb/search/api/v1/augment.py +585 -0
  107. nucliadb/search/api/v1/catalog.py +11 -15
  108. nucliadb/search/api/v1/find.py +16 -22
  109. nucliadb/search/api/v1/hydrate.py +25 -25
  110. nucliadb/search/api/v1/knowledgebox.py +1 -2
  111. nucliadb/search/api/v1/predict_proxy.py +1 -2
  112. nucliadb/search/api/v1/resource/ask.py +7 -7
  113. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  114. nucliadb/search/api/v1/resource/search.py +9 -11
  115. nucliadb/search/api/v1/retrieve.py +130 -0
  116. nucliadb/search/api/v1/search.py +28 -32
  117. nucliadb/search/api/v1/suggest.py +11 -14
  118. nucliadb/search/api/v1/summarize.py +1 -2
  119. nucliadb/search/api/v1/utils.py +2 -2
  120. nucliadb/search/app.py +3 -2
  121. nucliadb/search/augmentor/__init__.py +21 -0
  122. nucliadb/search/augmentor/augmentor.py +232 -0
  123. nucliadb/search/augmentor/fields.py +704 -0
  124. nucliadb/search/augmentor/metrics.py +24 -0
  125. nucliadb/search/augmentor/paragraphs.py +334 -0
  126. nucliadb/search/augmentor/resources.py +238 -0
  127. nucliadb/search/augmentor/utils.py +33 -0
  128. nucliadb/search/lifecycle.py +3 -1
  129. nucliadb/search/predict.py +24 -17
  130. nucliadb/search/predict_models.py +8 -9
  131. nucliadb/search/requesters/utils.py +11 -10
  132. nucliadb/search/search/cache.py +19 -23
  133. nucliadb/search/search/chat/ask.py +88 -59
  134. nucliadb/search/search/chat/exceptions.py +3 -5
  135. nucliadb/search/search/chat/fetcher.py +201 -0
  136. nucliadb/search/search/chat/images.py +6 -4
  137. nucliadb/search/search/chat/old_prompt.py +1375 -0
  138. nucliadb/search/search/chat/parser.py +510 -0
  139. nucliadb/search/search/chat/prompt.py +563 -615
  140. nucliadb/search/search/chat/query.py +449 -36
  141. nucliadb/search/search/chat/rpc.py +85 -0
  142. nucliadb/search/search/fetch.py +3 -4
  143. nucliadb/search/search/filters.py +8 -11
  144. nucliadb/search/search/find.py +33 -31
  145. nucliadb/search/search/find_merge.py +124 -331
  146. nucliadb/search/search/graph_strategy.py +14 -12
  147. nucliadb/search/search/hydrator/__init__.py +3 -152
  148. nucliadb/search/search/hydrator/fields.py +92 -50
  149. nucliadb/search/search/hydrator/images.py +7 -7
  150. nucliadb/search/search/hydrator/paragraphs.py +42 -26
  151. nucliadb/search/search/hydrator/resources.py +20 -16
  152. nucliadb/search/search/ingestion_agents.py +5 -5
  153. nucliadb/search/search/merge.py +90 -94
  154. nucliadb/search/search/metrics.py +10 -9
  155. nucliadb/search/search/paragraphs.py +7 -9
  156. nucliadb/search/search/predict_proxy.py +13 -9
  157. nucliadb/search/search/query.py +14 -86
  158. nucliadb/search/search/query_parser/fetcher.py +51 -82
  159. nucliadb/search/search/query_parser/models.py +19 -20
  160. nucliadb/search/search/query_parser/old_filters.py +20 -19
  161. nucliadb/search/search/query_parser/parsers/ask.py +4 -5
  162. nucliadb/search/search/query_parser/parsers/catalog.py +5 -6
  163. nucliadb/search/search/query_parser/parsers/common.py +5 -6
  164. nucliadb/search/search/query_parser/parsers/find.py +6 -26
  165. nucliadb/search/search/query_parser/parsers/graph.py +13 -23
  166. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  167. nucliadb/search/search/query_parser/parsers/search.py +15 -53
  168. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  169. nucliadb/search/search/rank_fusion.py +18 -13
  170. nucliadb/search/search/rerankers.py +5 -6
  171. nucliadb/search/search/retrieval.py +300 -0
  172. nucliadb/search/search/summarize.py +5 -6
  173. nucliadb/search/search/utils.py +3 -4
  174. nucliadb/search/settings.py +1 -2
  175. nucliadb/standalone/api_router.py +1 -1
  176. nucliadb/standalone/app.py +4 -3
  177. nucliadb/standalone/auth.py +5 -6
  178. nucliadb/standalone/lifecycle.py +2 -2
  179. nucliadb/standalone/run.py +2 -4
  180. nucliadb/standalone/settings.py +5 -6
  181. nucliadb/standalone/versions.py +3 -4
  182. nucliadb/tasks/consumer.py +13 -8
  183. nucliadb/tasks/models.py +2 -1
  184. nucliadb/tasks/producer.py +3 -3
  185. nucliadb/tasks/retries.py +8 -7
  186. nucliadb/train/api/utils.py +1 -3
  187. nucliadb/train/api/v1/shards.py +1 -2
  188. nucliadb/train/api/v1/trainset.py +1 -2
  189. nucliadb/train/app.py +1 -1
  190. nucliadb/train/generator.py +4 -4
  191. nucliadb/train/generators/field_classifier.py +2 -2
  192. nucliadb/train/generators/field_streaming.py +6 -6
  193. nucliadb/train/generators/image_classifier.py +2 -2
  194. nucliadb/train/generators/paragraph_classifier.py +2 -2
  195. nucliadb/train/generators/paragraph_streaming.py +2 -2
  196. nucliadb/train/generators/question_answer_streaming.py +2 -2
  197. nucliadb/train/generators/sentence_classifier.py +2 -2
  198. nucliadb/train/generators/token_classifier.py +3 -2
  199. nucliadb/train/generators/utils.py +6 -5
  200. nucliadb/train/nodes.py +3 -3
  201. nucliadb/train/resource.py +6 -8
  202. nucliadb/train/settings.py +3 -4
  203. nucliadb/train/types.py +11 -11
  204. nucliadb/train/upload.py +3 -2
  205. nucliadb/train/uploader.py +1 -2
  206. nucliadb/train/utils.py +1 -2
  207. nucliadb/writer/api/v1/export_import.py +4 -1
  208. nucliadb/writer/api/v1/field.py +7 -11
  209. nucliadb/writer/api/v1/knowledgebox.py +3 -4
  210. nucliadb/writer/api/v1/resource.py +9 -20
  211. nucliadb/writer/api/v1/services.py +10 -132
  212. nucliadb/writer/api/v1/upload.py +73 -72
  213. nucliadb/writer/app.py +8 -2
  214. nucliadb/writer/resource/basic.py +12 -15
  215. nucliadb/writer/resource/field.py +7 -5
  216. nucliadb/writer/resource/origin.py +7 -0
  217. nucliadb/writer/settings.py +2 -3
  218. nucliadb/writer/tus/__init__.py +2 -3
  219. nucliadb/writer/tus/azure.py +1 -3
  220. nucliadb/writer/tus/dm.py +3 -3
  221. nucliadb/writer/tus/exceptions.py +3 -4
  222. nucliadb/writer/tus/gcs.py +5 -6
  223. nucliadb/writer/tus/s3.py +2 -3
  224. nucliadb/writer/tus/storage.py +3 -3
  225. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +9 -10
  226. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  227. nucliadb/common/datamanagers/entities.py +0 -139
  228. nucliadb-6.9.1.post5192.dist-info/RECORD +0 -392
  229. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  230. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  231. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
@@ -18,7 +18,7 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
  import contextlib
21
- from typing import Optional, Type, TypeVar
21
+ from typing import TypeVar
22
22
 
23
23
  from google.protobuf.message import Message
24
24
 
@@ -29,9 +29,9 @@ PB_TYPE = TypeVar("PB_TYPE", bound=Message)
29
29
 
30
30
 
31
31
  async def get_kv_pb(
32
- txn: Transaction, key: str, pb_type: Type[PB_TYPE], for_update: bool = True
33
- ) -> Optional[PB_TYPE]:
34
- serialized: Optional[bytes] = await txn.get(key, for_update=for_update)
32
+ txn: Transaction, key: str, pb_type: type[PB_TYPE], for_update: bool = True
33
+ ) -> PB_TYPE | None:
34
+ serialized: bytes | None = await txn.get(key, for_update=for_update)
35
35
  if serialized is None:
36
36
  return None
37
37
  pb = pb_type()
@@ -17,7 +17,7 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from typing import AsyncIterator, Optional
20
+ from collections.abc import AsyncIterator
21
21
 
22
22
  from nucliadb.common.datamanagers.utils import get_kv_pb
23
23
  from nucliadb.common.maindb.driver import Transaction
@@ -37,7 +37,7 @@ async def initialize(txn: Transaction, *, kbid: str):
37
37
 
38
38
  async def get(
39
39
  txn: Transaction, *, kbid: str, vectorset_id: str
40
- ) -> Optional[knowledgebox_pb2.VectorSetConfig]:
40
+ ) -> knowledgebox_pb2.VectorSetConfig | None:
41
41
  kb_vectorsets = await _get_or_default(txn, kbid=kbid, for_update=False)
42
42
  index = _find_vectorset(kb_vectorsets, vectorset_id)
43
43
  if index is None:
@@ -80,7 +80,7 @@ async def set(txn: Transaction, *, kbid: str, config: knowledgebox_pb2.VectorSet
80
80
 
81
81
  async def delete(
82
82
  txn: Transaction, *, kbid: str, vectorset_id: str
83
- ) -> Optional[knowledgebox_pb2.VectorSetConfig]:
83
+ ) -> knowledgebox_pb2.VectorSetConfig | None:
84
84
  kb_vectorsets = await _get_or_default(txn, kbid=kbid, for_update=True)
85
85
  index = _find_vectorset(kb_vectorsets, vectorset_id)
86
86
  if index is None:
@@ -111,7 +111,7 @@ async def _get_or_default(
111
111
 
112
112
  def _find_vectorset(
113
113
  kb_vectorsets: knowledgebox_pb2.KnowledgeBoxVectorSetsConfig, vectorset_id: str
114
- ) -> Optional[int]:
114
+ ) -> int | None:
115
115
  """Return the position of the vectorset in `vectorsets` or `None` if not found."""
116
116
  for idx, vectorset in enumerate(kb_vectorsets.vectorsets):
117
117
  if vectorset.vectorset_id == vectorset_id:
@@ -19,8 +19,9 @@
19
19
  #
20
20
  import abc
21
21
  import logging
22
+ from collections.abc import Iterator
22
23
  from dataclasses import dataclass
23
- from typing import Any, Iterator, Optional
24
+ from typing import Any
24
25
 
25
26
  from nidx_protos.nodereader_pb2 import SearchRequest
26
27
  from nidx_protos.noderesources_pb2 import Resource
@@ -30,7 +31,9 @@ from nucliadb.common.counters import IndexCounts
30
31
  from nucliadb.common.external_index_providers.exceptions import ExternalIndexingError
31
32
  from nucliadb.common.ids import ParagraphId
32
33
  from nucliadb_models.external_index_providers import ExternalIndexProviderType
34
+ from nucliadb_models.retrieval import Score
33
35
  from nucliadb_models.search import SCORE_TYPE, Relations, TextPosition
36
+ from nucliadb_protos import resources_pb2
34
37
  from nucliadb_protos.knowledgebox_pb2 import (
35
38
  CreateExternalIndexProviderMetadata,
36
39
  StoredExternalIndexProviderMetadata,
@@ -43,6 +46,16 @@ logger = logging.getLogger(__name__)
43
46
  manager_observer = Observer("external_index_manager", labels={"operation": "", "provider": ""})
44
47
 
45
48
 
49
+ # /k/ocr
50
+ _OCR_LABEL = (
51
+ f"/k/{resources_pb2.Paragraph.TypeParagraph.Name(resources_pb2.Paragraph.TypeParagraph.OCR).lower()}"
52
+ )
53
+ # /k/inception: built from the INCEPTION enum member so it is distinct from _OCR_LABEL
54
+ _INCEPTION_LABEL = (
55
+ f"/k/{resources_pb2.Paragraph.TypeParagraph.Name(resources_pb2.Paragraph.TypeParagraph.INCEPTION).lower()}"
56
+ )
57
+
58
+
46
59
  @dataclass
47
60
  class VectorsetExternalIndex:
48
61
  """
@@ -57,9 +70,19 @@ class VectorsetExternalIndex:
57
70
 
58
71
  class ScoredTextBlock(BaseModel):
59
72
  paragraph_id: ParagraphId
60
- score: float
61
73
  score_type: SCORE_TYPE
62
74
 
75
+ scores: list[Score]
76
+
77
+ @property
78
+ def score(self) -> float:
79
+ return self.current_score.score
80
+
81
+ @property
82
+ def current_score(self) -> Score:
83
+ assert len(self.scores) > 0, "text block matches must be scored"
84
+ return self.scores[-1]
85
+
63
86
 
64
87
  class TextBlockMatch(ScoredTextBlock):
65
88
  """
@@ -72,11 +95,15 @@ class TextBlockMatch(ScoredTextBlock):
72
95
  page_with_visual: bool = False
73
96
  fuzzy_search: bool
74
97
  is_a_table: bool = False
75
- representation_file: Optional[str] = None
98
+ representation_file: str | None = None
76
99
  paragraph_labels: list[str] = []
77
100
  field_labels: list[str] = []
78
- text: Optional[str] = None
79
- relevant_relations: Optional[Relations] = None
101
+ text: str | None = None
102
+ relevant_relations: Relations | None = None
103
+
104
+ @property
105
+ def is_an_image(self) -> bool:
106
+ return _OCR_LABEL in self.paragraph_labels or _INCEPTION_LABEL in self.paragraph_labels
80
107
 
81
108
 
82
109
  class QueryResults(BaseModel):
@@ -17,7 +17,6 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from typing import Optional
21
20
 
22
21
  import async_lru
23
22
 
@@ -30,7 +29,7 @@ from nucliadb_protos.knowledgebox_pb2 import (
30
29
 
31
30
  async def get_external_index_manager(
32
31
  kbid: str, for_rollover: bool = False
33
- ) -> Optional[ExternalIndexManager]:
32
+ ) -> ExternalIndexManager | None:
34
33
  """
35
34
  Returns an ExternalIndexManager for the given kbid.
36
35
  If for_rollover is True, the ExternalIndexManager returned will include the rollover indexes (if any).
@@ -39,12 +38,12 @@ async def get_external_index_manager(
39
38
 
40
39
 
41
40
  @async_lru.alru_cache(maxsize=None)
42
- async def get_external_index_metadata(kbid: str) -> Optional[StoredExternalIndexProviderMetadata]:
41
+ async def get_external_index_metadata(kbid: str) -> StoredExternalIndexProviderMetadata | None:
43
42
  return await datamanagers.atomic.kb.get_external_index_provider_metadata(kbid=kbid)
44
43
 
45
44
 
46
45
  @async_lru.alru_cache(maxsize=None)
47
- async def get_default_vectorset_id(kbid: str) -> Optional[str]:
46
+ async def get_default_vectorset_id(kbid: str) -> str | None:
48
47
  """
49
48
  While we are transitioning to the new vectorset system, we need to take into account
50
49
  that KBs that have only one semantic model will have the `vectorset_id` field on BrokerMessage.field_vectors
@@ -68,6 +67,6 @@ async def get_default_vectorset_id(kbid: str) -> Optional[str]:
68
67
 
69
68
  async def get_rollover_external_index_metadata(
70
69
  kbid: str,
71
- ) -> Optional[StoredExternalIndexProviderMetadata]:
70
+ ) -> StoredExternalIndexProviderMetadata | None:
72
71
  async with datamanagers.with_ro_transaction() as txn:
73
72
  return await datamanagers.rollover.get_kb_rollover_external_index_metadata(txn, kbid=kbid)
@@ -18,13 +18,14 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
 
21
- from typing import Union
22
21
 
23
22
  from nidx_protos.nodereader_pb2 import FilterExpression as PBFilterExpression
23
+ from typing_extensions import assert_never
24
24
 
25
25
  from nucliadb.common import datamanagers
26
26
  from nucliadb.common.exceptions import InvalidQueryError
27
27
  from nucliadb.common.ids import FIELD_TYPE_NAME_TO_STR
28
+ from nucliadb_models.common import Paragraph
28
29
  from nucliadb_models.filters import (
29
30
  And,
30
31
  DateCreated,
@@ -50,44 +51,28 @@ from nucliadb_models.filters import (
50
51
  ResourceMimetype,
51
52
  Status,
52
53
  )
54
+ from nucliadb_models.metadata import ResourceProcessingStatus
53
55
 
54
56
  # Filters that end up as a facet
55
- FacetFilter = Union[
56
- OriginTag,
57
- Label,
58
- ResourceMimetype,
59
- FieldMimetype,
60
- Entity,
61
- Language,
62
- OriginMetadata,
63
- OriginPath,
64
- Generated,
65
- Kind,
66
- OriginCollaborator,
67
- OriginSource,
68
- Status,
69
- ]
70
- # In Python 3.9 we cannot do isinstance against an union
71
- # Once we support only 3.10+, we can remove this
72
- FacetFilterTypes = (
73
- OriginTag,
74
- Label,
75
- ResourceMimetype,
76
- FieldMimetype,
77
- Entity,
78
- Language,
79
- OriginMetadata,
80
- OriginPath,
81
- Generated,
82
- Kind,
83
- OriginCollaborator,
84
- OriginSource,
85
- Status,
57
+ FacetFilter = (
58
+ OriginTag
59
+ | Label
60
+ | ResourceMimetype
61
+ | FieldMimetype
62
+ | Entity
63
+ | Language
64
+ | OriginMetadata
65
+ | OriginPath
66
+ | Generated
67
+ | Kind
68
+ | OriginCollaborator
69
+ | OriginSource
70
+ | Status
86
71
  )
87
72
 
88
73
 
89
74
  async def parse_expression(
90
- expr: Union[FieldFilterExpression, ParagraphFilterExpression],
75
+ expr: FieldFilterExpression | ParagraphFilterExpression,
91
76
  kbid: str,
92
77
  ) -> PBFilterExpression:
93
78
  f = PBFilterExpression()
@@ -131,12 +116,10 @@ async def parse_expression(
131
116
  f.date.since.FromDatetime(expr.since)
132
117
  if expr.until:
133
118
  f.date.until.FromDatetime(expr.until)
134
- elif isinstance(expr, FacetFilterTypes):
119
+ elif isinstance(expr, FacetFilter):
135
120
  f.facet.facet = facet_from_filter(expr)
136
121
  else:
137
- # This is a trick so mypy generates an error if this branch can be reached,
138
- # that is, if we are missing some ifs
139
- _a: int = "a"
122
+ assert_never(expr)
140
123
 
141
124
  return f
142
125
 
@@ -190,13 +173,118 @@ def facet_from_filter(expr: FacetFilter) -> str:
190
173
  elif isinstance(expr, Status):
191
174
  facet = f"/n/s/{expr.status.value}"
192
175
  else:
193
- # This is a trick so mypy generates an error if this branch can be reached,
194
- # that is, if we are missing some ifs
195
- _a: int = "a"
176
+ assert_never(expr)
196
177
 
197
178
  return facet
198
179
 
199
180
 
181
+ def filter_from_facet(facet: str) -> FacetFilter:
182
+ """Parse a facet string (e.g. "/l/<labelset>/<label>") into its filter model; raise InvalidQueryError if the prefix or value is not recognised."""
183
+ expr: FacetFilter
184
+ if facet.startswith("/t/"):
185
+ value = facet.removeprefix("/t/")
186
+ expr = OriginTag(tag=value)
187
+
188
+ elif facet.startswith("/l/"):
189
+ value = facet.removeprefix("/l/")
190
+ parts = value.split("/", maxsplit=1)
191
+ if len(parts) == 1:
192
+ labelset = parts[0]
193
+ expr = Label(labelset=labelset)
194
+ else:
195
+ labelset, label = parts
196
+ expr = Label(labelset=labelset, label=label)
197
+
198
+ elif facet.startswith("/n/i/"):
199
+ value = facet.removeprefix("/n/i/")
200
+ parts = value.split("/", maxsplit=1)
201
+ if len(parts) == 1:
202
+ main_type = parts[0]
203
+ expr = ResourceMimetype(type=main_type)
204
+ else:
205
+ main_type, subtype = parts
206
+ expr = ResourceMimetype(type=main_type, subtype=subtype)
207
+
208
+ elif facet.startswith("/mt/"):
209
+ value = facet.removeprefix("/mt/")
210
+ parts = value.split("/", maxsplit=1)
211
+ if len(parts) == 1:
212
+ main_type = parts[0]
213
+ expr = FieldMimetype(type=main_type)
214
+ else:
215
+ main_type, subtype = parts
216
+ expr = FieldMimetype(type=main_type, subtype=subtype)
217
+
218
+ elif facet.startswith("/e/"):
219
+ value = facet.removeprefix("/e/")
220
+ parts = value.split("/", maxsplit=1)
221
+ if len(parts) == 1:
222
+ subtype = parts[0]
223
+ expr = Entity(subtype=subtype)
224
+ else:
225
+ subtype, value = parts
226
+ expr = Entity(subtype=subtype, value=value)
227
+
228
+ elif facet.startswith("/s/p/"):  # test the same prefix that is stripped below
229
+ value = facet.removeprefix("/s/p/")
230
+ expr = Language(language=value, only_primary=True)
231
+
232
+ elif facet.startswith("/s/s/"):  # test the same prefix that is stripped below
233
+ value = facet.removeprefix("/s/s/")
234
+ expr = Language(language=value, only_primary=False)
235
+
236
+ elif facet.startswith("/m/"):
237
+ value = facet.removeprefix("/m/")
238
+ parts = value.split("/", maxsplit=1)
239
+ if len(parts) == 1:
240
+ field = parts[0]
241
+ expr = OriginMetadata(field=field)
242
+ else:
243
+ field, value = parts
244
+ expr = OriginMetadata(field=field, value=value)
245
+
246
+ elif facet.startswith("/p/"):
247
+ value = facet.removeprefix("/p/")
248
+ expr = OriginPath(prefix=value)
249
+
250
+ elif facet.startswith("/g/da"):
251
+ value = facet.removeprefix("/g/da")
252
+ expr = Generated(by="data-augmentation")
253
+ if value.removeprefix("/"):  # optional task id after "/g/da/"
254
+ expr.da_task = value.removeprefix("/")
255
+
256
+ elif facet.startswith("/k/"):
257
+ value = facet.removeprefix("/k/")
258
+ try:
259
+ kind = Paragraph.TypeParagraph(value.upper())
260
+ except ValueError as err:
261
+ raise InvalidQueryError("filters", f"invalid paragraph kind: {value}") from err
262
+ expr = Kind(kind=kind)
263
+
264
+ elif facet.startswith("/u/o/"):
265
+ value = facet.removeprefix("/u/o/")
266
+ expr = OriginCollaborator(collaborator=value)
267
+
268
+ elif facet.startswith("/u/s"):
269
+ value = facet.removeprefix("/u/s")
270
+ expr = OriginSource()
271
+ if value.removeprefix("/"):  # optional source id after "/u/s/"
272
+ expr.id = value.removeprefix("/")
273
+
274
+ elif facet.startswith("/n/s/"):
275
+ value = facet.removeprefix("/n/s/")
276
+ try:
277
+ status = ResourceProcessingStatus(value.upper())
278
+ except ValueError as err:
279
+ raise InvalidQueryError("filters", f"invalid resource processing status: {value}") from err
280
+ expr = Status(status=status)
281
+
282
+ else:
283
+ raise InvalidQueryError("filters", f"invalid filter: {facet}")
284
+
285
+ return expr
286
+
287
+
200
288
  def add_and_expression(dest: PBFilterExpression, add: PBFilterExpression):
201
289
  dest_expr_type = dest.WhichOneof("expr")
202
290
  if dest_expr_type is None:
@@ -19,10 +19,8 @@
19
19
  #
20
20
  import logging
21
21
  from datetime import datetime
22
- from typing import Optional
23
22
 
24
23
  import aiohttp
25
- import jwt
26
24
  import pydantic
27
25
 
28
26
  from nucliadb_utils.helpers import MessageProgressUpdater
@@ -33,15 +31,6 @@ from .utils import check_status
33
31
  logger = logging.getLogger(__name__)
34
32
 
35
33
 
36
- def get_nua_api_id() -> str:
37
- assert nuclia_settings.nuclia_service_account is not None
38
- claimset = jwt.decode(
39
- nuclia_settings.nuclia_service_account,
40
- options={"verify_signature": False},
41
- )
42
- return claimset.get("sub")
43
-
44
-
45
34
  def get_processing_api_url() -> str:
46
35
  if nuclia_settings.nuclia_service_account:
47
36
  return (
@@ -64,10 +53,10 @@ def get_processing_api_v2_url() -> str:
64
53
 
65
54
  class PullResponse(pydantic.BaseModel):
66
55
  status: str
67
- payload: Optional[str] = None
56
+ payload: str | None = None
68
57
  payloads: list[bytes] = []
69
- msgid: Optional[str] = None
70
- cursor: Optional[int] = None
58
+ msgid: str | None = None
59
+ cursor: int | None = None
71
60
 
72
61
 
73
62
  class PullPosition(pydantic.BaseModel):
@@ -86,7 +75,7 @@ class RequestsResult(pydantic.BaseModel):
86
75
  description="Resource ID.",
87
76
  )
88
77
  kbid: str = pydantic.Field(..., title="KnowledgeBox ID")
89
- title: Optional[str] = pydantic.Field(
78
+ title: str | None = pydantic.Field(
90
79
  None,
91
80
  title="Title",
92
81
  description="Title of the resource.",
@@ -111,12 +100,12 @@ class RequestsResult(pydantic.BaseModel):
111
100
  title="Timestamp",
112
101
  description="Timestamp of when the resource was first scheduled.",
113
102
  )
114
- completed_at: Optional[datetime] = pydantic.Field(
103
+ completed_at: datetime | None = pydantic.Field(
115
104
  None,
116
105
  title="Completed At",
117
106
  description="Timestamp of when the resource was completed",
118
107
  )
119
- scheduled_at: Optional[datetime] = pydantic.Field(
108
+ scheduled_at: datetime | None = pydantic.Field(
120
109
  None,
121
110
  title="Scheduled At",
122
111
  description="Timestamp of when the resource was first scheduled.",
@@ -149,7 +138,7 @@ class RequestsResults(pydantic.BaseModel):
149
138
  title="Results",
150
139
  description="List of results.",
151
140
  )
152
- cursor: Optional[str] = pydantic.Field(
141
+ cursor: str | None = pydantic.Field(
153
142
  None,
154
143
  title="Cursor",
155
144
  description="Cursor to use for the next page of results.",
@@ -224,7 +213,7 @@ class ProcessingHTTPClient:
224
213
 
225
214
  async def pull_v2(
226
215
  self, ack_tokens: list[str], limit: int = 1, timeout: float = 5
227
- ) -> Optional[PullResponseV2]:
216
+ ) -> PullResponseV2 | None:
228
217
  url = self.base_url_v2 + "/pull"
229
218
  request = PullRequestV2(limit=limit, timeout=timeout, ack=ack_tokens)
230
219
  async with self.session.post(
@@ -248,9 +237,9 @@ class ProcessingHTTPClient:
248
237
 
249
238
  async def requests(
250
239
  self,
251
- cursor: Optional[str] = None,
252
- scheduled: Optional[bool] = None,
253
- kbid: Optional[str] = None,
240
+ cursor: str | None = None,
241
+ scheduled: bool | None = None,
242
+ kbid: str | None = None,
254
243
  limit: int = 20,
255
244
  ) -> RequestsResults:
256
245
  url = self.base_url + "/requests"
@@ -267,7 +256,7 @@ class ProcessingHTTPClient:
267
256
  check_status(resp, resp_text)
268
257
  return RequestsResults.model_validate_json(resp_text)
269
258
 
270
- async def stats(self, kbid: str, timeout: Optional[float] = 1.0) -> StatsResponse:
259
+ async def stats(self, kbid: str, timeout: float | None = 1.0) -> StatsResponse:
271
260
  url = self.base_url + "/stats"
272
261
  async with self.session.get(
273
262
  url,
nucliadb/common/ids.py CHANGED
@@ -24,7 +24,6 @@ paragraphs... Avoiding spread of id construction and parsing everywhere
24
24
  """
25
25
 
26
26
  from dataclasses import dataclass
27
- from typing import Optional
28
27
 
29
28
  from nucliadb_models.common import FieldTypeName
30
29
  from nucliadb_protos.resources_pb2 import FieldType
@@ -77,7 +76,7 @@ class FieldId:
77
76
  type: str
78
77
  key: str
79
78
  # also known as `split`, this indicates a part of a field in, for example, conversations
80
- subfield_id: Optional[str] = None
79
+ subfield_id: str | None = None
81
80
 
82
81
  @classmethod
83
82
  def from_string(cls, value: str) -> "FieldId":
@@ -113,7 +112,7 @@ class FieldId:
113
112
 
114
113
  @classmethod
115
114
  def from_pb(
116
- cls, rid: str, field_type: FieldType.ValueType, key: str, subfield_id: Optional[str] = None
115
+ cls, rid: str, field_type: FieldType.ValueType, key: str, subfield_id: str | None = None
117
116
  ) -> "FieldId":
118
117
  return cls(rid=rid, type=FIELD_TYPE_PB_TO_STR[field_type], key=key, subfield_id=subfield_id)
119
118
 
@@ -127,6 +126,9 @@ class FieldId:
127
126
  else:
128
127
  return f"{self.rid}/{self.type}/{self.key}/{self.subfield_id}"
129
128
 
129
+ def full_without_subfield(self) -> str:
130
+ return f"{self.rid}/{self.type}/{self.key}"
131
+
130
132
  def short_without_subfield(self) -> str:
131
133
  return f"/{self.type}/{self.key}"
132
134
 
@@ -262,7 +264,7 @@ class VectorId:
262
264
  return hash(self.full())
263
265
 
264
266
 
265
- def extract_data_augmentation_id(generated_field_id: str) -> Optional[str]:
267
+ def extract_data_augmentation_id(generated_field_id: str) -> str | None:
266
268
  """Data augmentation generated fields have a strict id with the following
267
269
  format:
268
270
  `da-{task_id}-{original:field_type}-{original:field_id}[-{original:split}]`
@@ -22,7 +22,6 @@ import logging
22
22
  import time
23
23
  import uuid
24
24
  from dataclasses import dataclass
25
- from typing import Optional
26
25
 
27
26
  import orjson
28
27
 
@@ -99,7 +98,7 @@ class _Lock:
99
98
  self.task = asyncio.create_task(self._refresh_task())
100
99
  return self
101
100
 
102
- async def get_lock_data(self, txn: Transaction) -> Optional[LockValue]:
101
+ async def get_lock_data(self, txn: Transaction) -> LockValue | None:
103
102
  existing_data = await txn.get(self.key, for_update=True)
104
103
  if existing_data is None:
105
104
  return None
@@ -20,8 +20,9 @@
20
20
  from __future__ import annotations
21
21
 
22
22
  import asyncio
23
+ from collections.abc import AsyncGenerator
23
24
  from contextlib import asynccontextmanager
24
- from typing import AsyncGenerator, Optional
25
+ from typing import ClassVar
25
26
 
26
27
  DEFAULT_SCAN_LIMIT = -1
27
28
  DEFAULT_BATCH_SCAN_LIMIT = 500
@@ -37,10 +38,10 @@ class Transaction:
37
38
  async def commit(self):
38
39
  raise NotImplementedError()
39
40
 
40
- async def batch_get(self, keys: list[str], for_update: bool = False) -> list[Optional[bytes]]:
41
+ async def batch_get(self, keys: list[str], for_update: bool = False) -> list[bytes | None]:
41
42
  raise NotImplementedError()
42
43
 
43
- async def get(self, key: str, for_update: bool = False) -> Optional[bytes]:
44
+ async def get(self, key: str, for_update: bool = False) -> bytes | None:
44
45
  raise NotImplementedError()
45
46
 
46
47
  async def set(self, key: str, value: bytes):
@@ -57,7 +58,7 @@ class Transaction:
57
58
 
58
59
  def keys(
59
60
  self, match: str, count: int = DEFAULT_SCAN_LIMIT, include_start: bool = True
60
- ) -> AsyncGenerator[str, None]:
61
+ ) -> AsyncGenerator[str]:
61
62
  raise NotImplementedError()
62
63
 
63
64
  async def count(self, match: str) -> int:
@@ -66,7 +67,7 @@ class Transaction:
66
67
 
67
68
  class Driver:
68
69
  initialized = False
69
- _abort_tasks: list[asyncio.Task] = []
70
+ _abort_tasks: ClassVar[list[asyncio.Task]] = []
70
71
 
71
72
  async def initialize(self):
72
73
  raise NotImplementedError()
@@ -81,15 +82,15 @@ class Driver:
81
82
  pass
82
83
 
83
84
  @asynccontextmanager
84
- async def _transaction(self, *, read_only: bool) -> AsyncGenerator[Transaction, None]:
85
+ async def _transaction(self, *, read_only: bool) -> AsyncGenerator[Transaction]:
85
86
  yield Transaction()
86
87
 
87
88
  @asynccontextmanager
88
- async def ro_transaction(self) -> AsyncGenerator[Transaction, None]:
89
+ async def ro_transaction(self) -> AsyncGenerator[Transaction]:
89
90
  async with self._transaction(read_only=True) as txn:
90
91
  yield txn
91
92
 
92
93
  @asynccontextmanager
93
- async def rw_transaction(self) -> AsyncGenerator[Transaction, None]:
94
+ async def rw_transaction(self) -> AsyncGenerator[Transaction]:
94
95
  async with self._transaction(read_only=False) as txn:
95
96
  yield txn
@@ -19,8 +19,8 @@
19
19
  #
20
20
  import glob
21
21
  import os
22
+ from collections.abc import AsyncGenerator
22
23
  from contextlib import asynccontextmanager
23
- from typing import AsyncGenerator, Optional
24
24
 
25
25
  from nucliadb.common.maindb.driver import (
26
26
  DEFAULT_BATCH_SCAN_LIMIT,
@@ -78,7 +78,7 @@ class LocalTransaction(Transaction):
78
78
  # Deleting a key that does not exist
79
79
  pass
80
80
 
81
- async def read(self, key: str) -> Optional[bytes]:
81
+ async def read(self, key: str) -> bytes | None:
82
82
  try:
83
83
  async with aiofiles.open(self.compute_path(key), "rb") as resp:
84
84
  return await resp.read()
@@ -106,8 +106,8 @@ class LocalTransaction(Transaction):
106
106
  self.clean()
107
107
  self.open = False
108
108
 
109
- async def batch_get(self, keys: list[str], for_update: bool = False) -> list[Optional[bytes]]:
110
- results: list[Optional[bytes]] = []
109
+ async def batch_get(self, keys: list[str], for_update: bool = False) -> list[bytes | None]:
110
+ results: list[bytes | None] = []
111
111
  for key in keys:
112
112
  obj = await self.get(key)
113
113
  if obj:
@@ -125,7 +125,7 @@ class LocalTransaction(Transaction):
125
125
 
126
126
  return results
127
127
 
128
- async def get(self, key: str, for_update: bool = False) -> Optional[bytes]:
128
+ async def get(self, key: str, for_update: bool = False) -> bytes | None:
129
129
  if key in self.deleted_keys:
130
130
  raise KeyError(f"Not found {key}")
131
131