nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (246)
  1. migrations/0023_backfill_pg_catalog.py +8 -4
  2. migrations/0028_extracted_vectors_reference.py +1 -1
  3. migrations/0029_backfill_field_status.py +3 -4
  4. migrations/0032_remove_old_relations.py +2 -3
  5. migrations/0038_backfill_catalog_field_labels.py +8 -4
  6. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  7. migrations/0040_migrate_search_configurations.py +79 -0
  8. migrations/0041_reindex_conversations.py +137 -0
  9. migrations/pg/0010_shards_index.py +34 -0
  10. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  11. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  12. nucliadb/backups/create.py +2 -15
  13. nucliadb/backups/restore.py +4 -15
  14. nucliadb/backups/tasks.py +4 -1
  15. nucliadb/common/back_pressure/cache.py +2 -3
  16. nucliadb/common/back_pressure/materializer.py +7 -13
  17. nucliadb/common/back_pressure/settings.py +6 -6
  18. nucliadb/common/back_pressure/utils.py +1 -0
  19. nucliadb/common/cache.py +9 -9
  20. nucliadb/common/catalog/__init__.py +79 -0
  21. nucliadb/common/catalog/dummy.py +36 -0
  22. nucliadb/common/catalog/interface.py +85 -0
  23. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
  24. nucliadb/common/catalog/utils.py +56 -0
  25. nucliadb/common/cluster/manager.py +8 -23
  26. nucliadb/common/cluster/rebalance.py +484 -112
  27. nucliadb/common/cluster/rollover.py +36 -9
  28. nucliadb/common/cluster/settings.py +4 -9
  29. nucliadb/common/cluster/utils.py +34 -8
  30. nucliadb/common/context/__init__.py +7 -8
  31. nucliadb/common/context/fastapi.py +1 -2
  32. nucliadb/common/datamanagers/__init__.py +2 -4
  33. nucliadb/common/datamanagers/atomic.py +9 -2
  34. nucliadb/common/datamanagers/cluster.py +1 -2
  35. nucliadb/common/datamanagers/fields.py +3 -4
  36. nucliadb/common/datamanagers/kb.py +6 -6
  37. nucliadb/common/datamanagers/labels.py +2 -3
  38. nucliadb/common/datamanagers/resources.py +10 -33
  39. nucliadb/common/datamanagers/rollover.py +5 -7
  40. nucliadb/common/datamanagers/search_configurations.py +1 -2
  41. nucliadb/common/datamanagers/synonyms.py +1 -2
  42. nucliadb/common/datamanagers/utils.py +4 -4
  43. nucliadb/common/datamanagers/vectorsets.py +4 -4
  44. nucliadb/common/external_index_providers/base.py +32 -5
  45. nucliadb/common/external_index_providers/manager.py +5 -34
  46. nucliadb/common/external_index_providers/settings.py +1 -27
  47. nucliadb/common/filter_expression.py +129 -41
  48. nucliadb/common/http_clients/exceptions.py +8 -0
  49. nucliadb/common/http_clients/processing.py +16 -23
  50. nucliadb/common/http_clients/utils.py +3 -0
  51. nucliadb/common/ids.py +82 -58
  52. nucliadb/common/locking.py +1 -2
  53. nucliadb/common/maindb/driver.py +9 -8
  54. nucliadb/common/maindb/local.py +5 -5
  55. nucliadb/common/maindb/pg.py +9 -8
  56. nucliadb/common/nidx.py +22 -5
  57. nucliadb/common/vector_index_config.py +1 -1
  58. nucliadb/export_import/datamanager.py +4 -3
  59. nucliadb/export_import/exporter.py +11 -19
  60. nucliadb/export_import/importer.py +13 -6
  61. nucliadb/export_import/tasks.py +2 -0
  62. nucliadb/export_import/utils.py +6 -18
  63. nucliadb/health.py +2 -2
  64. nucliadb/ingest/app.py +8 -8
  65. nucliadb/ingest/consumer/consumer.py +8 -10
  66. nucliadb/ingest/consumer/pull.py +10 -8
  67. nucliadb/ingest/consumer/service.py +5 -30
  68. nucliadb/ingest/consumer/shard_creator.py +16 -5
  69. nucliadb/ingest/consumer/utils.py +1 -1
  70. nucliadb/ingest/fields/base.py +37 -49
  71. nucliadb/ingest/fields/conversation.py +55 -9
  72. nucliadb/ingest/fields/exceptions.py +1 -2
  73. nucliadb/ingest/fields/file.py +22 -8
  74. nucliadb/ingest/fields/link.py +7 -7
  75. nucliadb/ingest/fields/text.py +2 -3
  76. nucliadb/ingest/orm/brain_v2.py +89 -57
  77. nucliadb/ingest/orm/broker_message.py +2 -4
  78. nucliadb/ingest/orm/entities.py +10 -209
  79. nucliadb/ingest/orm/index_message.py +128 -113
  80. nucliadb/ingest/orm/knowledgebox.py +91 -59
  81. nucliadb/ingest/orm/processor/auditing.py +1 -3
  82. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  83. nucliadb/ingest/orm/processor/processor.py +98 -153
  84. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  85. nucliadb/ingest/orm/resource.py +82 -71
  86. nucliadb/ingest/orm/utils.py +1 -1
  87. nucliadb/ingest/partitions.py +12 -1
  88. nucliadb/ingest/processing.py +17 -17
  89. nucliadb/ingest/serialize.py +202 -145
  90. nucliadb/ingest/service/writer.py +15 -114
  91. nucliadb/ingest/settings.py +36 -15
  92. nucliadb/ingest/utils.py +1 -2
  93. nucliadb/learning_proxy.py +23 -26
  94. nucliadb/metrics_exporter.py +20 -6
  95. nucliadb/middleware/__init__.py +82 -1
  96. nucliadb/migrator/datamanager.py +4 -11
  97. nucliadb/migrator/migrator.py +1 -2
  98. nucliadb/migrator/models.py +1 -2
  99. nucliadb/migrator/settings.py +1 -2
  100. nucliadb/models/internal/augment.py +614 -0
  101. nucliadb/models/internal/processing.py +19 -19
  102. nucliadb/openapi.py +2 -2
  103. nucliadb/purge/__init__.py +3 -8
  104. nucliadb/purge/orphan_shards.py +1 -2
  105. nucliadb/reader/__init__.py +5 -0
  106. nucliadb/reader/api/models.py +6 -13
  107. nucliadb/reader/api/v1/download.py +59 -38
  108. nucliadb/reader/api/v1/export_import.py +4 -4
  109. nucliadb/reader/api/v1/knowledgebox.py +37 -9
  110. nucliadb/reader/api/v1/learning_config.py +33 -14
  111. nucliadb/reader/api/v1/resource.py +61 -9
  112. nucliadb/reader/api/v1/services.py +18 -14
  113. nucliadb/reader/app.py +3 -1
  114. nucliadb/reader/reader/notifications.py +1 -2
  115. nucliadb/search/api/v1/__init__.py +3 -0
  116. nucliadb/search/api/v1/ask.py +3 -4
  117. nucliadb/search/api/v1/augment.py +585 -0
  118. nucliadb/search/api/v1/catalog.py +15 -19
  119. nucliadb/search/api/v1/find.py +16 -22
  120. nucliadb/search/api/v1/hydrate.py +328 -0
  121. nucliadb/search/api/v1/knowledgebox.py +1 -2
  122. nucliadb/search/api/v1/predict_proxy.py +1 -2
  123. nucliadb/search/api/v1/resource/ask.py +28 -8
  124. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  125. nucliadb/search/api/v1/resource/search.py +9 -11
  126. nucliadb/search/api/v1/retrieve.py +130 -0
  127. nucliadb/search/api/v1/search.py +28 -32
  128. nucliadb/search/api/v1/suggest.py +11 -14
  129. nucliadb/search/api/v1/summarize.py +1 -2
  130. nucliadb/search/api/v1/utils.py +2 -2
  131. nucliadb/search/app.py +3 -2
  132. nucliadb/search/augmentor/__init__.py +21 -0
  133. nucliadb/search/augmentor/augmentor.py +232 -0
  134. nucliadb/search/augmentor/fields.py +704 -0
  135. nucliadb/search/augmentor/metrics.py +24 -0
  136. nucliadb/search/augmentor/paragraphs.py +334 -0
  137. nucliadb/search/augmentor/resources.py +238 -0
  138. nucliadb/search/augmentor/utils.py +33 -0
  139. nucliadb/search/lifecycle.py +3 -1
  140. nucliadb/search/predict.py +33 -19
  141. nucliadb/search/predict_models.py +8 -9
  142. nucliadb/search/requesters/utils.py +11 -10
  143. nucliadb/search/search/cache.py +19 -42
  144. nucliadb/search/search/chat/ask.py +131 -59
  145. nucliadb/search/search/chat/exceptions.py +3 -5
  146. nucliadb/search/search/chat/fetcher.py +201 -0
  147. nucliadb/search/search/chat/images.py +6 -4
  148. nucliadb/search/search/chat/old_prompt.py +1375 -0
  149. nucliadb/search/search/chat/parser.py +510 -0
  150. nucliadb/search/search/chat/prompt.py +563 -615
  151. nucliadb/search/search/chat/query.py +453 -32
  152. nucliadb/search/search/chat/rpc.py +85 -0
  153. nucliadb/search/search/fetch.py +3 -4
  154. nucliadb/search/search/filters.py +8 -11
  155. nucliadb/search/search/find.py +33 -31
  156. nucliadb/search/search/find_merge.py +124 -331
  157. nucliadb/search/search/graph_strategy.py +14 -12
  158. nucliadb/search/search/hydrator/__init__.py +49 -0
  159. nucliadb/search/search/hydrator/fields.py +217 -0
  160. nucliadb/search/search/hydrator/images.py +130 -0
  161. nucliadb/search/search/hydrator/paragraphs.py +323 -0
  162. nucliadb/search/search/hydrator/resources.py +60 -0
  163. nucliadb/search/search/ingestion_agents.py +5 -5
  164. nucliadb/search/search/merge.py +90 -94
  165. nucliadb/search/search/metrics.py +24 -7
  166. nucliadb/search/search/paragraphs.py +7 -9
  167. nucliadb/search/search/predict_proxy.py +44 -18
  168. nucliadb/search/search/query.py +14 -86
  169. nucliadb/search/search/query_parser/fetcher.py +51 -82
  170. nucliadb/search/search/query_parser/models.py +19 -48
  171. nucliadb/search/search/query_parser/old_filters.py +20 -19
  172. nucliadb/search/search/query_parser/parsers/ask.py +5 -6
  173. nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
  174. nucliadb/search/search/query_parser/parsers/common.py +21 -13
  175. nucliadb/search/search/query_parser/parsers/find.py +6 -29
  176. nucliadb/search/search/query_parser/parsers/graph.py +18 -28
  177. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  178. nucliadb/search/search/query_parser/parsers/search.py +15 -56
  179. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  180. nucliadb/search/search/rank_fusion.py +18 -13
  181. nucliadb/search/search/rerankers.py +6 -7
  182. nucliadb/search/search/retrieval.py +300 -0
  183. nucliadb/search/search/summarize.py +5 -6
  184. nucliadb/search/search/utils.py +3 -4
  185. nucliadb/search/settings.py +1 -2
  186. nucliadb/standalone/api_router.py +1 -1
  187. nucliadb/standalone/app.py +4 -3
  188. nucliadb/standalone/auth.py +5 -6
  189. nucliadb/standalone/lifecycle.py +2 -2
  190. nucliadb/standalone/run.py +5 -4
  191. nucliadb/standalone/settings.py +5 -6
  192. nucliadb/standalone/versions.py +3 -4
  193. nucliadb/tasks/consumer.py +13 -8
  194. nucliadb/tasks/models.py +2 -1
  195. nucliadb/tasks/producer.py +3 -3
  196. nucliadb/tasks/retries.py +8 -7
  197. nucliadb/train/api/utils.py +1 -3
  198. nucliadb/train/api/v1/shards.py +1 -2
  199. nucliadb/train/api/v1/trainset.py +1 -2
  200. nucliadb/train/app.py +1 -1
  201. nucliadb/train/generator.py +4 -4
  202. nucliadb/train/generators/field_classifier.py +2 -2
  203. nucliadb/train/generators/field_streaming.py +6 -6
  204. nucliadb/train/generators/image_classifier.py +2 -2
  205. nucliadb/train/generators/paragraph_classifier.py +2 -2
  206. nucliadb/train/generators/paragraph_streaming.py +2 -2
  207. nucliadb/train/generators/question_answer_streaming.py +2 -2
  208. nucliadb/train/generators/sentence_classifier.py +4 -10
  209. nucliadb/train/generators/token_classifier.py +3 -2
  210. nucliadb/train/generators/utils.py +6 -5
  211. nucliadb/train/nodes.py +3 -3
  212. nucliadb/train/resource.py +6 -8
  213. nucliadb/train/settings.py +3 -4
  214. nucliadb/train/types.py +11 -11
  215. nucliadb/train/upload.py +3 -2
  216. nucliadb/train/uploader.py +1 -2
  217. nucliadb/train/utils.py +1 -2
  218. nucliadb/writer/api/v1/export_import.py +4 -1
  219. nucliadb/writer/api/v1/field.py +15 -14
  220. nucliadb/writer/api/v1/knowledgebox.py +18 -56
  221. nucliadb/writer/api/v1/learning_config.py +5 -4
  222. nucliadb/writer/api/v1/resource.py +9 -20
  223. nucliadb/writer/api/v1/services.py +10 -132
  224. nucliadb/writer/api/v1/upload.py +73 -72
  225. nucliadb/writer/app.py +8 -2
  226. nucliadb/writer/resource/basic.py +12 -15
  227. nucliadb/writer/resource/field.py +43 -5
  228. nucliadb/writer/resource/origin.py +7 -0
  229. nucliadb/writer/settings.py +2 -3
  230. nucliadb/writer/tus/__init__.py +2 -3
  231. nucliadb/writer/tus/azure.py +5 -7
  232. nucliadb/writer/tus/dm.py +3 -3
  233. nucliadb/writer/tus/exceptions.py +3 -4
  234. nucliadb/writer/tus/gcs.py +15 -22
  235. nucliadb/writer/tus/s3.py +2 -3
  236. nucliadb/writer/tus/storage.py +3 -3
  237. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
  238. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  239. nucliadb/common/datamanagers/entities.py +0 -139
  240. nucliadb/common/external_index_providers/pinecone.py +0 -894
  241. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  242. nucliadb/search/search/hydrator.py +0 -197
  243. nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
  244. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  245. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  246. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
nucliadb/backups/tasks.py CHANGED
@@ -17,7 +17,7 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from typing import Awaitable, Callable
20
+ from collections.abc import Awaitable, Callable
21
21
 
22
22
  from nucliadb.backups.const import BackupsNatsConfig
23
23
  from nucliadb.backups.create import backup_kb_task
@@ -38,6 +38,7 @@ def creator_consumer() -> NatsTaskConsumer[CreateBackupRequest]:
38
38
  callback=backup_kb_task,
39
39
  msg_type=CreateBackupRequest,
40
40
  max_concurrent_messages=10,
41
+ max_retries=100,
41
42
  )
42
43
  return consumer
43
44
 
@@ -64,6 +65,7 @@ def restorer_consumer() -> NatsTaskConsumer[RestoreBackupRequest]:
64
65
  callback=restore_kb_task,
65
66
  msg_type=RestoreBackupRequest,
66
67
  max_concurrent_messages=10,
68
+ max_retries=100,
67
69
  )
68
70
  return consumer
69
71
 
@@ -90,6 +92,7 @@ def deleter_consumer() -> NatsTaskConsumer[DeleteBackupRequest]:
90
92
  callback=delete_backup_task,
91
93
  msg_type=DeleteBackupRequest,
92
94
  max_concurrent_messages=2,
95
+ max_retries=100,
93
96
  )
94
97
  return consumer
95
98
 
nucliadb/common/back_pressure/cache.py CHANGED
@@ -21,7 +21,6 @@ import contextlib
21
21
  import logging
22
22
  import threading
23
23
  from datetime import datetime, timezone
24
- from typing import Optional
25
24
 
26
25
  from cachetools import TTLCache
27
26
 
@@ -47,7 +46,7 @@ class BackPressureCache:
47
46
  self._cache = TTLCache(maxsize=1024, ttl=5 * 60)
48
47
  self._lock = threading.Lock()
49
48
 
50
- def get(self, key: str) -> Optional[BackPressureData]:
49
+ def get(self, key: str) -> BackPressureData | None:
51
50
  with self._lock:
52
51
  data = self._cache.get(key, None)
53
52
  if data is None:
@@ -72,7 +71,7 @@ def cached_back_pressure(cache_key: str):
72
71
  Context manager that handles the caching of the try again in time so that
73
72
  we don't recompute try again times if we have already applied back pressure.
74
73
  """
75
- data: Optional[BackPressureData] = _cache.get(cache_key)
74
+ data: BackPressureData | None = _cache.get(cache_key)
76
75
  if data is not None:
77
76
  back_pressure_type = data.type
78
77
  RATE_LIMITED_REQUESTS_COUNTER.inc({"type": back_pressure_type, "cached": "true"})
nucliadb/common/back_pressure/materializer.py CHANGED
@@ -20,7 +20,6 @@
20
20
  import asyncio
21
21
  import logging
22
22
  import threading
23
- from typing import Optional
24
23
 
25
24
  from cachetools import TTLCache
26
25
  from fastapi import HTTPException
@@ -118,12 +117,6 @@ class BackPressureMaterializer:
118
117
  extra={"kbid": kbid},
119
118
  )
120
119
  return 0
121
-
122
- if pending > 0:
123
- logger.info(
124
- f"Processing returned {pending} pending messages for KB",
125
- extra={"kbid": kbid},
126
- )
127
120
  self.processing_pending_cache[kbid] = pending
128
121
  return pending
129
122
 
@@ -184,7 +177,7 @@ class BackPressureMaterializer:
184
177
  pending=pending,
185
178
  max_wait=settings.max_wait_time,
186
179
  )
187
- data = BackPressureData(type="indexing", try_after=try_after)
180
+ data = BackPressureData(type="indexing", try_after=try_after, pending=pending)
188
181
  raise BackPressureException(data)
189
182
 
190
183
  def check_ingest(self):
@@ -199,7 +192,7 @@ class BackPressureMaterializer:
199
192
  pending=ingest_pending,
200
193
  max_wait=settings.max_wait_time,
201
194
  )
202
- data = BackPressureData(type="ingest", try_after=try_after)
195
+ data = BackPressureData(type="ingest", try_after=try_after, pending=ingest_pending)
203
196
  raise BackPressureException(data)
204
197
 
205
198
  async def check_processing(self, kbid: str):
@@ -215,11 +208,11 @@ class BackPressureMaterializer:
215
208
  pending=kb_pending,
216
209
  max_wait=settings.max_wait_time,
217
210
  )
218
- data = BackPressureData(type="processing", try_after=try_after)
211
+ data = BackPressureData(type="processing", try_after=try_after, pending=kb_pending)
219
212
  raise BackPressureException(data)
220
213
 
221
214
 
222
- MATERIALIZER: Optional[BackPressureMaterializer] = None
215
+ MATERIALIZER: BackPressureMaterializer | None = None
223
216
  materializer_lock = threading.Lock()
224
217
 
225
218
 
@@ -268,7 +261,7 @@ def get_materializer() -> BackPressureMaterializer:
268
261
  return MATERIALIZER
269
262
 
270
263
 
271
- async def maybe_back_pressure(kbid: str, resource_uuid: Optional[str] = None) -> None:
264
+ async def maybe_back_pressure(kbid: str, resource_uuid: str | None = None) -> None:
272
265
  """
273
266
  This function does system checks to see if we need to put back pressure on writes.
274
267
  In that case, a HTTP 429 will be raised with the estimated time to try again.
@@ -278,7 +271,7 @@ async def maybe_back_pressure(kbid: str, resource_uuid: Optional[str] = None) ->
278
271
  await back_pressure_checks(kbid, resource_uuid)
279
272
 
280
273
 
281
- async def back_pressure_checks(kbid: str, resource_uuid: Optional[str] = None):
274
+ async def back_pressure_checks(kbid: str, resource_uuid: str | None = None):
282
275
  """
283
276
  Will raise a 429 if back pressure is needed:
284
277
  - If the processing engine is behind.
@@ -299,6 +292,7 @@ async def back_pressure_checks(kbid: str, resource_uuid: Optional[str] = None):
299
292
  "resource_uuid": resource_uuid,
300
293
  "try_after": exc.data.try_after,
301
294
  "back_pressure_type": exc.data.type,
295
+ "pending": exc.data.pending,
302
296
  },
303
297
  )
304
298
  raise HTTPException(
nucliadb/common/back_pressure/settings.py CHANGED
@@ -29,30 +29,30 @@ class BackPressureSettings(BaseSettings):
29
29
  )
30
30
  indexing_rate: float = Field(
31
31
  default=10,
32
- description="Estimation of the indexing rate in messages per second. This is used to calculate the try again in time", # noqa
32
+ description="Estimation of the indexing rate in messages per second. This is used to calculate the try again in time",
33
33
  )
34
34
  ingest_rate: float = Field(
35
35
  default=4,
36
- description="Estimation of the ingest processed consumer rate in messages per second. This is used to calculate the try again in time", # noqa
36
+ description="Estimation of the ingest processed consumer rate in messages per second. This is used to calculate the try again in time",
37
37
  )
38
38
  processing_rate: float = Field(
39
39
  default=1,
40
- description="Estimation of the processing rate in messages per second. This is used to calculate the try again in time", # noqa
40
+ description="Estimation of the processing rate in messages per second. This is used to calculate the try again in time",
41
41
  )
42
42
  max_indexing_pending: int = Field(
43
43
  default=1000,
44
- description="Max number of messages pending to index in a node queue before rate limiting writes. Set to 0 to disable indexing back pressure checks", # noqa
44
+ description="Max number of messages pending to index in a node queue before rate limiting writes. Set to 0 to disable indexing back pressure checks",
45
45
  alias="back_pressure_max_indexing_pending",
46
46
  )
47
47
  max_ingest_pending: int = Field(
48
48
  # Disabled by default
49
49
  default=0,
50
- description="Max number of messages pending to be ingested by processed consumers before rate limiting writes. Set to 0 to disable ingest back pressure checks", # noqa
50
+ description="Max number of messages pending to be ingested by processed consumers before rate limiting writes. Set to 0 to disable ingest back pressure checks",
51
51
  alias="back_pressure_max_ingest_pending",
52
52
  )
53
53
  max_processing_pending: int = Field(
54
54
  default=1000,
55
- description="Max number of messages pending to process per Knowledge Box before rate limiting writes. Set to 0 to disable processing back pressure checks", # noqa
55
+ description="Max number of messages pending to process per Knowledge Box before rate limiting writes. Set to 0 to disable processing back pressure checks",
56
56
  alias="back_pressure_max_processing_pending",
57
57
  )
58
58
  indexing_check_interval: int = Field(
nucliadb/common/back_pressure/utils.py CHANGED
@@ -28,6 +28,7 @@ from nucliadb_utils.nats import NatsConnectionManager
28
28
  class BackPressureData:
29
29
  type: str
30
30
  try_after: datetime
31
+ pending: int = 0
31
32
 
32
33
 
33
34
  class BackPressureException(Exception):
nucliadb/common/cache.py CHANGED
@@ -24,7 +24,7 @@ from abc import ABC, abstractmethod
24
24
  from contextvars import ContextVar
25
25
  from dataclasses import dataclass
26
26
  from functools import cached_property
27
- from typing import Generic, Optional, TypeVar
27
+ from typing import Generic, TypeVar
28
28
 
29
29
  import backoff
30
30
  from async_lru import _LRUCacheWrapper, alru_cache
@@ -66,9 +66,9 @@ class Cache(Generic[K, T], ABC):
66
66
 
67
67
  """
68
68
 
69
- cache: _LRUCacheWrapper[Optional[T]]
69
+ cache: _LRUCacheWrapper[T | None]
70
70
 
71
- async def get(self, *args: K.args, **kwargs: K.kwargs) -> Optional[T]:
71
+ async def get(self, *args: K.args, **kwargs: K.kwargs) -> T | None:
72
72
  result = await self.cache(*args)
73
73
  # Do not cache None
74
74
  if result is None:
@@ -88,7 +88,7 @@ class Cache(Generic[K, T], ABC):
88
88
  class ResourceCache(Cache[[str, str], ResourceORM]):
89
89
  def __init__(self, cache_size: int) -> None:
90
90
  @alru_cache(maxsize=cache_size)
91
- async def _get_resource(kbid: str, rid: str) -> Optional[ResourceORM]:
91
+ async def _get_resource(kbid: str, rid: str) -> ResourceORM | None:
92
92
  storage = await get_storage()
93
93
  async with get_driver().ro_transaction() as txn:
94
94
  kb = KnowledgeBoxORM(txn, storage, kbid)
@@ -115,7 +115,7 @@ class ExtractedTextCache(Cache[[str, FieldId], ExtractedText]):
115
115
  def __init__(self, cache_size: int) -> None:
116
116
  @alru_cache(maxsize=cache_size)
117
117
  @backoff.on_exception(backoff.expo, (Exception,), jitter=backoff.random_jitter, max_tries=3)
118
- async def _get_extracted_text(kbid: str, field_id: FieldId) -> Optional[ExtractedText]:
118
+ async def _get_extracted_text(kbid: str, field_id: FieldId) -> ExtractedText | None:
119
119
  storage = await get_storage()
120
120
  try:
121
121
  sf = storage.file_extracted(
@@ -144,18 +144,18 @@ class ExtractedTextCache(Cache[[str, FieldId], ExtractedText]):
144
144
 
145
145
  # Global caches (per asyncio task)
146
146
 
147
- rcache: ContextVar[Optional[ResourceCache]] = ContextVar("rcache", default=None)
148
- etcache: ContextVar[Optional[ExtractedTextCache]] = ContextVar("etcache", default=None)
147
+ rcache: ContextVar[ResourceCache | None] = ContextVar("rcache", default=None)
148
+ etcache: ContextVar[ExtractedTextCache | None] = ContextVar("etcache", default=None)
149
149
 
150
150
 
151
151
  # Cache management
152
152
 
153
153
 
154
- def get_resource_cache() -> Optional[ResourceCache]:
154
+ def get_resource_cache() -> ResourceCache | None:
155
155
  return rcache.get()
156
156
 
157
157
 
158
- def get_extracted_text_cache() -> Optional[ExtractedTextCache]:
158
+ def get_extracted_text_cache() -> ExtractedTextCache | None:
159
159
  return etcache.get()
160
160
 
161
161
 
nucliadb/common/catalog/__init__.py ADDED
@@ -0,0 +1,79 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ # Copyright (C) 2021 Bosutech XXI S.L.
21
+ #
22
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
23
+ # For commercial licensing, contact us at info@nuclia.com.
24
+ #
25
+ # AGPL:
26
+ # This program is free software: you can redistribute it and/or modify
27
+ # it under the terms of the GNU Affero General Public License as
28
+ # published by the Free Software Foundation, either version 3 of the
29
+ # License, or (at your option) any later version.
30
+ #
31
+ # This program is distributed in the hope that it will be useful,
32
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
33
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
34
+ # GNU Affero General Public License for more details.
35
+ #
36
+ # You should have received a copy of the GNU Affero General Public License
37
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
38
+
39
+ from nidx_protos.noderesources_pb2 import Resource as IndexMessage
40
+
41
+ from nucliadb.common.catalog.dummy import DummyCatalog
42
+ from nucliadb.common.catalog.interface import Catalog, CatalogQuery
43
+ from nucliadb.common.catalog.pg import PGCatalog
44
+ from nucliadb.common.catalog.utils import build_catalog_resource_data
45
+ from nucliadb.common.maindb.driver import Transaction
46
+ from nucliadb.ingest.orm.resource import Resource
47
+ from nucliadb.ingest.settings import CatalogConfig, settings
48
+ from nucliadb_models.search import CatalogFacetsRequest, Resources
49
+ from nucliadb_utils.exceptions import ConfigurationError
50
+
51
+
52
+ def get_catalog() -> Catalog:
53
+ if settings.catalog == CatalogConfig.UNSET:
54
+ return DummyCatalog()
55
+ elif settings.catalog == CatalogConfig.PG:
56
+ return PGCatalog()
57
+ else:
58
+ raise ConfigurationError(f"Unknown catalog configuration: {settings.catalog}")
59
+
60
+
61
+ async def catalog_update(txn: Transaction, kbid: str, resource: Resource, index_message: IndexMessage):
62
+ catalog = get_catalog()
63
+ resource_data = build_catalog_resource_data(resource, index_message)
64
+ await catalog.update(txn, kbid, resource.uuid, resource_data)
65
+
66
+
67
+ async def catalog_delete(txn: Transaction, kbid: str, rid: str):
68
+ catalog = get_catalog()
69
+ await catalog.delete(txn, kbid, rid)
70
+
71
+
72
+ async def catalog_search(query: CatalogQuery) -> Resources:
73
+ catalog = get_catalog()
74
+ return await catalog.search(query)
75
+
76
+
77
+ async def catalog_facets(kbid: str, request: CatalogFacetsRequest) -> dict[str, int]:
78
+ catalog = get_catalog()
79
+ return await catalog.facets(kbid, request)
nucliadb/common/catalog/dummy.py ADDED
@@ -0,0 +1,36 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ from nucliadb.common.catalog.interface import Catalog, CatalogQuery, CatalogResourceData
21
+ from nucliadb.common.maindb.driver import Transaction
22
+ from nucliadb_models.search import CatalogFacetsRequest, Resources
23
+
24
+
25
+ class DummyCatalog(Catalog):
26
+ async def update(self, txn: Transaction, kbid: str, rid: str, data: CatalogResourceData):
27
+ return
28
+
29
+ async def delete(self, txn: Transaction, kbid: str, rid: str):
30
+ return
31
+
32
+ async def search(self, query: CatalogQuery) -> Resources:
33
+ return Resources(results=[], min_score=0.0)
34
+
35
+ async def facets(self, kbid: str, request: CatalogFacetsRequest) -> dict[str, int]:
36
+ return {}
nucliadb/common/catalog/interface.py ADDED
@@ -0,0 +1,85 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ from __future__ import annotations
21
+
22
+ import abc
23
+ import datetime
24
+ from dataclasses import dataclass
25
+ from typing import Literal
26
+
27
+ from pydantic import BaseModel, Field
28
+
29
+ from nucliadb.common.maindb.driver import Transaction
30
+ from nucliadb_models import search as search_models
31
+ from nucliadb_models.search import CatalogFacetsRequest, Resources
32
+
33
+
34
+ class CatalogResourceData(BaseModel):
35
+ """
36
+ Data extracted from a resource to be indexed in the catalog
37
+ """
38
+
39
+ title: str = Field(description="Resource title")
40
+ created_at: datetime.datetime = Field(description="Resource creation date")
41
+ modified_at: datetime.datetime = Field(description="Resource last modification date")
42
+ labels: list[str] = Field(
43
+ description="Resource labels. This includes labels at the resource level and all classification labels of its fields"
44
+ )
45
+ slug: str = Field(description="Resource slug")
46
+
47
+
48
+ @dataclass
49
+ class CatalogExpression:
50
+ @dataclass
51
+ class Date:
52
+ field: Literal["created_at"] | Literal["modified_at"]
53
+ since: datetime.datetime | None
54
+ until: datetime.datetime | None
55
+
56
+ bool_and: list[CatalogExpression] | None = None
57
+ bool_or: list[CatalogExpression] | None = None
58
+ bool_not: CatalogExpression | None = None
59
+ date: Date | None = None
60
+ facet: str | None = None
61
+ resource_id: str | None = None
62
+
63
+
64
+ class CatalogQuery(BaseModel):
65
+ kbid: str
66
+ query: search_models.CatalogQuery | None = Field(description="Full-text search query")
67
+ filters: CatalogExpression | None = Field(description="Filters to apply to the search")
68
+ sort: search_models.SortOptions = Field(description="Sorting option")
69
+ faceted: list[str] = Field(description="List of facets to compute during the search")
70
+ page_size: int = Field(description="Used for pagination. Maximum page size is 100")
71
+ page_number: int = Field(description="Used for pagination. First page is 0")
72
+
73
+
74
+ class Catalog(abc.ABC, metaclass=abc.ABCMeta):
75
+ @abc.abstractmethod
76
+ async def update(self, txn: Transaction, kbid: str, rid: str, data: CatalogResourceData): ...
77
+
78
+ @abc.abstractmethod
79
+ async def delete(self, txn: Transaction, kbid: str, rid: str): ...
80
+
81
+ @abc.abstractmethod
82
+ async def search(self, query: CatalogQuery) -> Resources: ...
83
+
84
+ @abc.abstractmethod
85
+ async def facets(self, kbid: str, request: CatalogFacetsRequest) -> dict[str, int]: ...