nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (246)
  1. migrations/0023_backfill_pg_catalog.py +8 -4
  2. migrations/0028_extracted_vectors_reference.py +1 -1
  3. migrations/0029_backfill_field_status.py +3 -4
  4. migrations/0032_remove_old_relations.py +2 -3
  5. migrations/0038_backfill_catalog_field_labels.py +8 -4
  6. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  7. migrations/0040_migrate_search_configurations.py +79 -0
  8. migrations/0041_reindex_conversations.py +137 -0
  9. migrations/pg/0010_shards_index.py +34 -0
  10. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  11. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  12. nucliadb/backups/create.py +2 -15
  13. nucliadb/backups/restore.py +4 -15
  14. nucliadb/backups/tasks.py +4 -1
  15. nucliadb/common/back_pressure/cache.py +2 -3
  16. nucliadb/common/back_pressure/materializer.py +7 -13
  17. nucliadb/common/back_pressure/settings.py +6 -6
  18. nucliadb/common/back_pressure/utils.py +1 -0
  19. nucliadb/common/cache.py +9 -9
  20. nucliadb/common/catalog/__init__.py +79 -0
  21. nucliadb/common/catalog/dummy.py +36 -0
  22. nucliadb/common/catalog/interface.py +85 -0
  23. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
  24. nucliadb/common/catalog/utils.py +56 -0
  25. nucliadb/common/cluster/manager.py +8 -23
  26. nucliadb/common/cluster/rebalance.py +484 -112
  27. nucliadb/common/cluster/rollover.py +36 -9
  28. nucliadb/common/cluster/settings.py +4 -9
  29. nucliadb/common/cluster/utils.py +34 -8
  30. nucliadb/common/context/__init__.py +7 -8
  31. nucliadb/common/context/fastapi.py +1 -2
  32. nucliadb/common/datamanagers/__init__.py +2 -4
  33. nucliadb/common/datamanagers/atomic.py +9 -2
  34. nucliadb/common/datamanagers/cluster.py +1 -2
  35. nucliadb/common/datamanagers/fields.py +3 -4
  36. nucliadb/common/datamanagers/kb.py +6 -6
  37. nucliadb/common/datamanagers/labels.py +2 -3
  38. nucliadb/common/datamanagers/resources.py +10 -33
  39. nucliadb/common/datamanagers/rollover.py +5 -7
  40. nucliadb/common/datamanagers/search_configurations.py +1 -2
  41. nucliadb/common/datamanagers/synonyms.py +1 -2
  42. nucliadb/common/datamanagers/utils.py +4 -4
  43. nucliadb/common/datamanagers/vectorsets.py +4 -4
  44. nucliadb/common/external_index_providers/base.py +32 -5
  45. nucliadb/common/external_index_providers/manager.py +5 -34
  46. nucliadb/common/external_index_providers/settings.py +1 -27
  47. nucliadb/common/filter_expression.py +129 -41
  48. nucliadb/common/http_clients/exceptions.py +8 -0
  49. nucliadb/common/http_clients/processing.py +16 -23
  50. nucliadb/common/http_clients/utils.py +3 -0
  51. nucliadb/common/ids.py +82 -58
  52. nucliadb/common/locking.py +1 -2
  53. nucliadb/common/maindb/driver.py +9 -8
  54. nucliadb/common/maindb/local.py +5 -5
  55. nucliadb/common/maindb/pg.py +9 -8
  56. nucliadb/common/nidx.py +22 -5
  57. nucliadb/common/vector_index_config.py +1 -1
  58. nucliadb/export_import/datamanager.py +4 -3
  59. nucliadb/export_import/exporter.py +11 -19
  60. nucliadb/export_import/importer.py +13 -6
  61. nucliadb/export_import/tasks.py +2 -0
  62. nucliadb/export_import/utils.py +6 -18
  63. nucliadb/health.py +2 -2
  64. nucliadb/ingest/app.py +8 -8
  65. nucliadb/ingest/consumer/consumer.py +8 -10
  66. nucliadb/ingest/consumer/pull.py +10 -8
  67. nucliadb/ingest/consumer/service.py +5 -30
  68. nucliadb/ingest/consumer/shard_creator.py +16 -5
  69. nucliadb/ingest/consumer/utils.py +1 -1
  70. nucliadb/ingest/fields/base.py +37 -49
  71. nucliadb/ingest/fields/conversation.py +55 -9
  72. nucliadb/ingest/fields/exceptions.py +1 -2
  73. nucliadb/ingest/fields/file.py +22 -8
  74. nucliadb/ingest/fields/link.py +7 -7
  75. nucliadb/ingest/fields/text.py +2 -3
  76. nucliadb/ingest/orm/brain_v2.py +89 -57
  77. nucliadb/ingest/orm/broker_message.py +2 -4
  78. nucliadb/ingest/orm/entities.py +10 -209
  79. nucliadb/ingest/orm/index_message.py +128 -113
  80. nucliadb/ingest/orm/knowledgebox.py +91 -59
  81. nucliadb/ingest/orm/processor/auditing.py +1 -3
  82. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  83. nucliadb/ingest/orm/processor/processor.py +98 -153
  84. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  85. nucliadb/ingest/orm/resource.py +82 -71
  86. nucliadb/ingest/orm/utils.py +1 -1
  87. nucliadb/ingest/partitions.py +12 -1
  88. nucliadb/ingest/processing.py +17 -17
  89. nucliadb/ingest/serialize.py +202 -145
  90. nucliadb/ingest/service/writer.py +15 -114
  91. nucliadb/ingest/settings.py +36 -15
  92. nucliadb/ingest/utils.py +1 -2
  93. nucliadb/learning_proxy.py +23 -26
  94. nucliadb/metrics_exporter.py +20 -6
  95. nucliadb/middleware/__init__.py +82 -1
  96. nucliadb/migrator/datamanager.py +4 -11
  97. nucliadb/migrator/migrator.py +1 -2
  98. nucliadb/migrator/models.py +1 -2
  99. nucliadb/migrator/settings.py +1 -2
  100. nucliadb/models/internal/augment.py +614 -0
  101. nucliadb/models/internal/processing.py +19 -19
  102. nucliadb/openapi.py +2 -2
  103. nucliadb/purge/__init__.py +3 -8
  104. nucliadb/purge/orphan_shards.py +1 -2
  105. nucliadb/reader/__init__.py +5 -0
  106. nucliadb/reader/api/models.py +6 -13
  107. nucliadb/reader/api/v1/download.py +59 -38
  108. nucliadb/reader/api/v1/export_import.py +4 -4
  109. nucliadb/reader/api/v1/knowledgebox.py +37 -9
  110. nucliadb/reader/api/v1/learning_config.py +33 -14
  111. nucliadb/reader/api/v1/resource.py +61 -9
  112. nucliadb/reader/api/v1/services.py +18 -14
  113. nucliadb/reader/app.py +3 -1
  114. nucliadb/reader/reader/notifications.py +1 -2
  115. nucliadb/search/api/v1/__init__.py +3 -0
  116. nucliadb/search/api/v1/ask.py +3 -4
  117. nucliadb/search/api/v1/augment.py +585 -0
  118. nucliadb/search/api/v1/catalog.py +15 -19
  119. nucliadb/search/api/v1/find.py +16 -22
  120. nucliadb/search/api/v1/hydrate.py +328 -0
  121. nucliadb/search/api/v1/knowledgebox.py +1 -2
  122. nucliadb/search/api/v1/predict_proxy.py +1 -2
  123. nucliadb/search/api/v1/resource/ask.py +28 -8
  124. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  125. nucliadb/search/api/v1/resource/search.py +9 -11
  126. nucliadb/search/api/v1/retrieve.py +130 -0
  127. nucliadb/search/api/v1/search.py +28 -32
  128. nucliadb/search/api/v1/suggest.py +11 -14
  129. nucliadb/search/api/v1/summarize.py +1 -2
  130. nucliadb/search/api/v1/utils.py +2 -2
  131. nucliadb/search/app.py +3 -2
  132. nucliadb/search/augmentor/__init__.py +21 -0
  133. nucliadb/search/augmentor/augmentor.py +232 -0
  134. nucliadb/search/augmentor/fields.py +704 -0
  135. nucliadb/search/augmentor/metrics.py +24 -0
  136. nucliadb/search/augmentor/paragraphs.py +334 -0
  137. nucliadb/search/augmentor/resources.py +238 -0
  138. nucliadb/search/augmentor/utils.py +33 -0
  139. nucliadb/search/lifecycle.py +3 -1
  140. nucliadb/search/predict.py +33 -19
  141. nucliadb/search/predict_models.py +8 -9
  142. nucliadb/search/requesters/utils.py +11 -10
  143. nucliadb/search/search/cache.py +19 -42
  144. nucliadb/search/search/chat/ask.py +131 -59
  145. nucliadb/search/search/chat/exceptions.py +3 -5
  146. nucliadb/search/search/chat/fetcher.py +201 -0
  147. nucliadb/search/search/chat/images.py +6 -4
  148. nucliadb/search/search/chat/old_prompt.py +1375 -0
  149. nucliadb/search/search/chat/parser.py +510 -0
  150. nucliadb/search/search/chat/prompt.py +563 -615
  151. nucliadb/search/search/chat/query.py +453 -32
  152. nucliadb/search/search/chat/rpc.py +85 -0
  153. nucliadb/search/search/fetch.py +3 -4
  154. nucliadb/search/search/filters.py +8 -11
  155. nucliadb/search/search/find.py +33 -31
  156. nucliadb/search/search/find_merge.py +124 -331
  157. nucliadb/search/search/graph_strategy.py +14 -12
  158. nucliadb/search/search/hydrator/__init__.py +49 -0
  159. nucliadb/search/search/hydrator/fields.py +217 -0
  160. nucliadb/search/search/hydrator/images.py +130 -0
  161. nucliadb/search/search/hydrator/paragraphs.py +323 -0
  162. nucliadb/search/search/hydrator/resources.py +60 -0
  163. nucliadb/search/search/ingestion_agents.py +5 -5
  164. nucliadb/search/search/merge.py +90 -94
  165. nucliadb/search/search/metrics.py +24 -7
  166. nucliadb/search/search/paragraphs.py +7 -9
  167. nucliadb/search/search/predict_proxy.py +44 -18
  168. nucliadb/search/search/query.py +14 -86
  169. nucliadb/search/search/query_parser/fetcher.py +51 -82
  170. nucliadb/search/search/query_parser/models.py +19 -48
  171. nucliadb/search/search/query_parser/old_filters.py +20 -19
  172. nucliadb/search/search/query_parser/parsers/ask.py +5 -6
  173. nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
  174. nucliadb/search/search/query_parser/parsers/common.py +21 -13
  175. nucliadb/search/search/query_parser/parsers/find.py +6 -29
  176. nucliadb/search/search/query_parser/parsers/graph.py +18 -28
  177. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  178. nucliadb/search/search/query_parser/parsers/search.py +15 -56
  179. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  180. nucliadb/search/search/rank_fusion.py +18 -13
  181. nucliadb/search/search/rerankers.py +6 -7
  182. nucliadb/search/search/retrieval.py +300 -0
  183. nucliadb/search/search/summarize.py +5 -6
  184. nucliadb/search/search/utils.py +3 -4
  185. nucliadb/search/settings.py +1 -2
  186. nucliadb/standalone/api_router.py +1 -1
  187. nucliadb/standalone/app.py +4 -3
  188. nucliadb/standalone/auth.py +5 -6
  189. nucliadb/standalone/lifecycle.py +2 -2
  190. nucliadb/standalone/run.py +5 -4
  191. nucliadb/standalone/settings.py +5 -6
  192. nucliadb/standalone/versions.py +3 -4
  193. nucliadb/tasks/consumer.py +13 -8
  194. nucliadb/tasks/models.py +2 -1
  195. nucliadb/tasks/producer.py +3 -3
  196. nucliadb/tasks/retries.py +8 -7
  197. nucliadb/train/api/utils.py +1 -3
  198. nucliadb/train/api/v1/shards.py +1 -2
  199. nucliadb/train/api/v1/trainset.py +1 -2
  200. nucliadb/train/app.py +1 -1
  201. nucliadb/train/generator.py +4 -4
  202. nucliadb/train/generators/field_classifier.py +2 -2
  203. nucliadb/train/generators/field_streaming.py +6 -6
  204. nucliadb/train/generators/image_classifier.py +2 -2
  205. nucliadb/train/generators/paragraph_classifier.py +2 -2
  206. nucliadb/train/generators/paragraph_streaming.py +2 -2
  207. nucliadb/train/generators/question_answer_streaming.py +2 -2
  208. nucliadb/train/generators/sentence_classifier.py +4 -10
  209. nucliadb/train/generators/token_classifier.py +3 -2
  210. nucliadb/train/generators/utils.py +6 -5
  211. nucliadb/train/nodes.py +3 -3
  212. nucliadb/train/resource.py +6 -8
  213. nucliadb/train/settings.py +3 -4
  214. nucliadb/train/types.py +11 -11
  215. nucliadb/train/upload.py +3 -2
  216. nucliadb/train/uploader.py +1 -2
  217. nucliadb/train/utils.py +1 -2
  218. nucliadb/writer/api/v1/export_import.py +4 -1
  219. nucliadb/writer/api/v1/field.py +15 -14
  220. nucliadb/writer/api/v1/knowledgebox.py +18 -56
  221. nucliadb/writer/api/v1/learning_config.py +5 -4
  222. nucliadb/writer/api/v1/resource.py +9 -20
  223. nucliadb/writer/api/v1/services.py +10 -132
  224. nucliadb/writer/api/v1/upload.py +73 -72
  225. nucliadb/writer/app.py +8 -2
  226. nucliadb/writer/resource/basic.py +12 -15
  227. nucliadb/writer/resource/field.py +43 -5
  228. nucliadb/writer/resource/origin.py +7 -0
  229. nucliadb/writer/settings.py +2 -3
  230. nucliadb/writer/tus/__init__.py +2 -3
  231. nucliadb/writer/tus/azure.py +5 -7
  232. nucliadb/writer/tus/dm.py +3 -3
  233. nucliadb/writer/tus/exceptions.py +3 -4
  234. nucliadb/writer/tus/gcs.py +15 -22
  235. nucliadb/writer/tus/s3.py +2 -3
  236. nucliadb/writer/tus/storage.py +3 -3
  237. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
  238. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  239. nucliadb/common/datamanagers/entities.py +0 -139
  240. nucliadb/common/external_index_providers/pinecone.py +0 -894
  241. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  242. nucliadb/search/search/hydrator.py +0 -197
  243. nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
  244. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  245. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  246. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
@@ -18,7 +18,6 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
  from enum import Enum
21
- from typing import Optional
22
21
 
23
22
  from pydantic import Field
24
23
  from pydantic_settings import BaseSettings
@@ -41,13 +40,13 @@ class DriverConfig(Enum):
41
40
 
42
41
  class DriverSettings(BaseSettings):
43
42
  driver: DriverConfig = Field(default=DriverConfig.PG, description="K/V storage driver")
44
- driver_local_url: Optional[str] = Field(
43
+ driver_local_url: str | None = Field(
45
44
  default=None,
46
45
  description="Local path to store data on file system. Example: /nucliadb/data/main",
47
46
  )
48
- driver_pg_url: Optional[str] = Field(
47
+ driver_pg_url: str | None = Field(
49
48
  default=None,
50
- description="PostgreSQL DSN. The connection string to the PG server. Example: postgres://username:password@postgres:5432/nucliadb.", # noqa
49
+ description="PostgreSQL DSN. The connection string to the PG server. Example: postgres://username:password@postgres:5432/nucliadb.",
51
50
  )
52
51
  driver_pg_connection_pool_min_size: int = Field(
53
52
  default=10,
@@ -67,6 +66,11 @@ class DriverSettings(BaseSettings):
67
66
  )
68
67
 
69
68
 
69
+ class CatalogConfig(Enum):
70
+ UNSET = "unset"
71
+ PG = "pg"
72
+
73
+
70
74
  # For use during migration from pull v1 to pull v2
71
75
  class ProcessingPullMode(Enum):
72
76
  OFF = "off"
@@ -75,26 +79,43 @@ class ProcessingPullMode(Enum):
75
79
 
76
80
 
77
81
  class Settings(DriverSettings):
78
- grpc_port: int = 8030
79
-
80
- partitions: list[str] = ["1"]
82
+ # Catalog settings
83
+ catalog: CatalogConfig = Field(default=CatalogConfig.PG, description="Catalog backend")
81
84
 
85
+ # Pull worker settings
82
86
  pull_time_error_backoff: int = 30
83
87
  pull_api_timeout: int = 60
84
- disable_pull_worker: bool = False
88
+ disable_pull_worker: bool = Field(
89
+ default=False, description="Set to true to disable the pull worker task"
90
+ )
85
91
 
86
- # ingest consumer sts replica settings
87
- replica_number: int = -1
88
- total_replicas: int = 1 # number of ingest processor replicas in the cluster
89
- nuclia_partitions: int = 50
92
+ # Ingest consumer sts replica settings
93
+ replica_number: int = Field(
94
+ default=-1,
95
+ description="The replica number of this ingest statefulset instance. Leave to -1 to auto-assign based on hostname.",
96
+ )
97
+ total_replicas: int = Field(default=1, description="Number of ingest statefulset replicas deployed")
98
+ nuclia_partitions: int = Field(
99
+ default=50, description="Total number of partitions of the nats stream."
100
+ )
101
+ partitions: list[str] = Field(
102
+ default=["1"],
103
+ description="List of partitions assigned to this ingest statefulset instance. This is automatically assigned based on the replica number and total replicas.",
104
+ )
105
+ max_concurrent_ingest_processing: int = Field(
106
+ default=5,
107
+ description="Controls the number of concurrent messages from different partitions that can be processed at the same time by ingest statefulset consumers.",
108
+ )
90
109
 
91
- max_receive_message_length: int = 500 # In MB
110
+ # Grpc server settings
111
+ grpc_port: int = 8030
112
+ max_receive_message_length: int = Field(
113
+ default=500, description="Maximum receive grpc message length in MB."
114
+ )
92
115
 
93
116
  # Search query timeouts
94
117
  relation_search_timeout: float = 10.0
95
118
  relation_types_timeout: float = 10.0
96
119
 
97
- max_concurrent_ingest_processing: int = 5
98
-
99
120
 
100
121
  settings = Settings()
nucliadb/ingest/utils.py CHANGED
@@ -17,7 +17,6 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from typing import Optional
21
20
 
22
21
  from nucliadb.common.maindb.utils import setup_driver
23
22
  from nucliadb_protos.writer_pb2_grpc import WriterStub
@@ -26,7 +25,7 @@ from nucliadb_utils.settings import nucliadb_settings
26
25
  from nucliadb_utils.utilities import Utility, clean_utility, get_utility, set_utility
27
26
 
28
27
 
29
- async def start_ingest(service_name: Optional[str] = None):
28
+ async def start_ingest(service_name: str | None = None):
30
29
  await setup_driver()
31
30
 
32
31
  actual_service = get_utility(Utility.INGEST)
@@ -24,7 +24,7 @@ import os
24
24
  from abc import ABC, abstractmethod
25
25
  from collections.abc import AsyncIterator
26
26
  from enum import Enum, IntEnum
27
- from typing import Any, Optional, Union
27
+ from typing import Any
28
28
 
29
29
  import backoff
30
30
  import httpx
@@ -43,8 +43,8 @@ logger = logging.getLogger(SERVICE_NAME)
43
43
  WHITELISTED_HEADERS = {
44
44
  "x-nucliadb-user",
45
45
  "x-nucliadb-roles",
46
- "x-stf-roles",
47
- "x-stf-user",
46
+ "x-nucliadb-account",
47
+ "x-nucliadb-account-type",
48
48
  "x-forwarded-for",
49
49
  "x-forwarded-host",
50
50
  "x-forwarded-proto",
@@ -89,12 +89,12 @@ class LearningConfiguration(BaseModel):
89
89
  # aka similarity function
90
90
  semantic_vector_similarity: str
91
91
  # aka vector_dimension
92
- semantic_vector_size: Optional[int] = None
92
+ semantic_vector_size: int | None = None
93
93
  # aka min_score
94
- semantic_threshold: Optional[float] = None
94
+ semantic_threshold: float | None = None
95
95
  # List of possible subdivisions of the matryoshka embeddings (if the model
96
96
  # supports it)
97
- semantic_matryoshka_dimensions: Optional[list[int]] = Field(
97
+ semantic_matryoshka_dimensions: list[int] | None = Field(
98
98
  default=None, alias="semantic_matryoshka_dims"
99
99
  )
100
100
 
@@ -154,7 +154,7 @@ class LearningConfiguration(BaseModel):
154
154
 
155
155
 
156
156
  class ProxiedLearningConfigError(Exception):
157
- def __init__(self, status_code: int, content: Union[str, dict[str, Any]]):
157
+ def __init__(self, status_code: int, content: str | dict[str, Any]):
158
158
  self.status_code = status_code
159
159
  self.content = content
160
160
 
@@ -176,7 +176,7 @@ def raise_for_status(response: httpx.Response) -> None:
176
176
 
177
177
  async def get_configuration(
178
178
  kbid: str,
179
- ) -> Optional[LearningConfiguration]:
179
+ ) -> LearningConfiguration | None:
180
180
  return await learning_config_service().get_configuration(kbid)
181
181
 
182
182
 
@@ -204,14 +204,14 @@ async def learning_config_proxy(
204
204
  request: Request,
205
205
  method: str,
206
206
  url: str,
207
- extra_headers: Optional[dict[str, str]] = None,
208
- ) -> Union[Response, StreamingResponse]:
207
+ headers: dict[str, str] = {},
208
+ ) -> Response | StreamingResponse:
209
209
  return await proxy(
210
210
  service=LearningService.CONFIG,
211
211
  request=request,
212
212
  method=method,
213
213
  url=url,
214
- extra_headers=extra_headers,
214
+ headers=headers,
215
215
  )
216
216
 
217
217
 
@@ -244,24 +244,21 @@ async def proxy(
244
244
  request: Request,
245
245
  method: str,
246
246
  url: str,
247
- extra_headers: Optional[dict[str, str]] = None,
248
- ) -> Union[Response, StreamingResponse]:
247
+ headers: dict[str, str] = {},
248
+ ) -> Response | StreamingResponse:
249
249
  """
250
250
  Proxy the request to a learning API.
251
251
 
252
- service: LearningService. The learning service to proxy the request to.
253
- request: Request. The incoming request.
254
- method: str. The HTTP method to use.
255
- url: str. The URL to proxy the request to.
256
- extra_headers: Optional[dict[str, str]]. Extra headers to include in the proxied request.
252
+ service: The learning service to proxy the request to.
253
+ request: The incoming request.
254
+ method: The HTTP method to use.
255
+ url: The URL to proxy the request to.
256
+ headers: Extra headers to include in the proxied request.
257
257
 
258
258
  Returns: Response. The response from the learning API. If the response is chunked, a StreamingResponse is returned.
259
259
  """
260
-
261
- proxied_headers = extra_headers or {}
262
- proxied_headers.update(
263
- {k.lower(): v for k, v in request.headers.items() if is_white_listed_header(k)}
264
- )
260
+ proxied_headers = {k.lower(): v for k, v in request.headers.items() if is_white_listed_header(k)}
261
+ proxied_headers.update(**headers)
265
262
 
266
263
  async with service_client(
267
264
  base_url=get_base_url(service=service),
@@ -429,7 +426,7 @@ class DummyClient(httpx.AsyncClient):
429
426
 
430
427
  class LearningConfigService(ABC):
431
428
  @abstractmethod
432
- async def get_configuration(self, kbid: str) -> Optional[LearningConfiguration]: ...
429
+ async def get_configuration(self, kbid: str) -> LearningConfiguration | None: ...
433
430
 
434
431
  @abstractmethod
435
432
  async def set_configuration(self, kbid: str, config: dict[str, Any]) -> LearningConfiguration: ...
@@ -442,7 +439,7 @@ class LearningConfigService(ABC):
442
439
 
443
440
 
444
441
  class ProxiedLearningConfig(LearningConfigService):
445
- async def get_configuration(self, kbid: str) -> Optional[LearningConfiguration]:
442
+ async def get_configuration(self, kbid: str) -> LearningConfiguration | None:
446
443
  async with self._client() as client:
447
444
  resp = await client.get(f"config/{kbid}")
448
445
  try:
@@ -486,7 +483,7 @@ class InMemoryLearningConfig(LearningConfigService):
486
483
  def __init__(self):
487
484
  self.in_memory_configs = {}
488
485
 
489
- async def get_configuration(self, kbid: str) -> Optional[LearningConfiguration]:
486
+ async def get_configuration(self, kbid: str) -> LearningConfiguration | None:
490
487
  return _IN_MEMORY_CONFIGS.get(kbid, None)
491
488
 
492
489
  async def set_configuration(self, kbid: str, config: dict[str, Any]) -> LearningConfiguration:
@@ -20,7 +20,8 @@
20
20
  from __future__ import annotations
21
21
 
22
22
  import asyncio
23
- from typing import AsyncGenerator, Callable, Tuple, cast
23
+ from collections.abc import AsyncGenerator, Callable
24
+ from typing import cast
24
25
 
25
26
  from nucliadb import logger
26
27
  from nucliadb.common import datamanagers
@@ -37,8 +38,11 @@ MIGRATION_COUNT = metrics.Gauge("nucliadb_migration", labels={"type": "", "versi
37
38
 
38
39
  PENDING_RESOURCE_COUNT = metrics.Gauge("nucliadb_pending_resources_count")
39
40
 
41
+ KBS_COUNT = metrics.Gauge("nucliadb_kbs_count")
42
+ RESOURCES_COUNT = metrics.Gauge("nucliadb_resources_count")
40
43
 
41
- async def iter_kbids(context: ApplicationContext) -> AsyncGenerator[str, None]:
44
+
45
+ async def iter_kbids(context: ApplicationContext) -> AsyncGenerator[str]:
42
46
  """
43
47
  Return a list of all KB ids.
44
48
  """
@@ -47,9 +51,11 @@ async def iter_kbids(context: ApplicationContext) -> AsyncGenerator[str, None]:
47
51
  yield kbid
48
52
 
49
53
 
50
- async def update_migration_metrics(context: ApplicationContext):
54
+ async def update_kb_metrics(context: ApplicationContext):
51
55
  """
52
- Report the global migration version and the number of KBs per migration version.
56
+ Report metrics at the kb level:
57
+ - total number of KBs
58
+ - the global migration version and the number of KBs per migration version.
53
59
  """
54
60
  # Clear previously set values so that we report only the current state
55
61
  MIGRATION_COUNT.gauge.clear()
@@ -60,12 +66,16 @@ async def update_migration_metrics(context: ApplicationContext):
60
66
  MIGRATION_COUNT.set(1, labels=dict(type="global", version=str(global_info.current_version)))
61
67
 
62
68
  version_count: dict[str, int] = {}
69
+ n_kbs = 0
63
70
  async for kbid in iter_kbids(context):
64
71
  kb_info = await mdm.get_kb_info(kbid)
65
72
  if kb_info is not None:
66
73
  current_version = str(kb_info.current_version)
67
74
  version_count.setdefault(current_version, 0)
68
75
  version_count[current_version] += 1
76
+ n_kbs += 1
77
+
78
+ KBS_COUNT.set(n_kbs)
69
79
 
70
80
  for version, count in version_count.items():
71
81
  MIGRATION_COUNT.set(count, labels=dict(type="kb", version=version))
@@ -85,9 +95,13 @@ async def update_resource_metrics(context: ApplicationContext):
85
95
  "WHERE labels @> '{/n/s/PENDING}' "
86
96
  "AND COALESCE(modified_at, created_at) BETWEEN NOW() - INTERVAL '1 month' AND NOW() - INTERVAL '6 hours'"
87
97
  )
88
- count = cast(Tuple[int], await cur.fetchone())[0]
98
+ count = cast(tuple[int], await cur.fetchone())[0]
89
99
  PENDING_RESOURCE_COUNT.set(count)
90
100
 
101
+ await cur.execute("SELECT COUNT(*) FROM catalog")
102
+ count = cast(tuple[int], await cur.fetchone())[0]
103
+ RESOURCES_COUNT.set(count)
104
+
91
105
 
92
106
  async def run_exporter_task(context: ApplicationContext, exporter_task: Callable, interval: int):
93
107
  """
@@ -109,7 +123,7 @@ async def run_exporter(context: ApplicationContext):
109
123
  # Schedule exporter tasks
110
124
  tasks = []
111
125
  for export_task, interval in [
112
- (update_migration_metrics, 60 * 3),
126
+ (update_kb_metrics, 60 * 3),
113
127
  (update_resource_metrics, 60 * 5),
114
128
  ]:
115
129
  tasks.append(asyncio.create_task(run_exporter_task(context, export_task, interval=interval)))
@@ -17,7 +17,10 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
 
20
+ import logging
20
21
  import time
22
+ from collections import deque
23
+ from typing import ClassVar
21
24
 
22
25
  from starlette.middleware.base import BaseHTTPMiddleware, RequestResponseEndpoint
23
26
  from starlette.requests import Request
@@ -27,6 +30,9 @@ PROCESS_TIME_HEADER = "X-PROCESS-TIME"
27
30
  ACCESS_CONTROL_EXPOSE_HEADER = "Access-Control-Expose-Headers"
28
31
 
29
32
 
33
+ logger = logging.getLogger("nucliadb.middleware")
34
+
35
+
30
36
  class ProcessTimeHeaderMiddleware(BaseHTTPMiddleware):
31
37
  def capture_process_time(self, response, duration: float):
32
38
  response.headers[PROCESS_TIME_HEADER] = str(duration)
@@ -44,9 +50,84 @@ class ProcessTimeHeaderMiddleware(BaseHTTPMiddleware):
44
50
  start = time.perf_counter()
45
51
  try:
46
52
  response = await call_next(request)
53
+ return response
47
54
  finally:
48
55
  if response is not None:
49
56
  duration = time.perf_counter() - start
50
57
  self.capture_process_time(response, duration)
51
58
  self.expose_process_time_header(response)
52
- return response
59
+
60
+
61
+ class ClientErrorPayloadLoggerMiddleware(BaseHTTPMiddleware):
62
+ """
63
+ Middleware that logs the payload of client error responses (HTTP 412 and 422).
64
+ This helps support clients by providing more context about the errors they
65
+ encounter which otherwise we don't have much visibility on.
66
+
67
+ There is a limit of logs per IP to avoid flooding the logs in case of
68
+ misbehaving clients.
69
+ """
70
+
71
+ log_counters: ClassVar[dict[str, "HourlyLogCounter"]] = {}
72
+ max_logs: int = 200
73
+
74
+ def get_request_host(self, request: Request) -> str:
75
+ return request.client.host if request.client else "unknown"
76
+
77
+ async def dispatch(self, request: Request, call_next: RequestResponseEndpoint) -> Response:
78
+ response = await call_next(request)
79
+
80
+ host = self.get_request_host(request)
81
+ counter = self.log_counters.setdefault(host, HourlyLogCounter())
82
+ if response.status_code in (412, 422) and counter.get_count() < self.max_logs:
83
+ counter.log_event()
84
+
85
+ response_body = b""
86
+ chunk: bytes
87
+ async for chunk in response.body_iterator: # type: ignore
88
+ response_body += chunk
89
+
90
+ logger.info(
91
+ f"Client payload validation error",
92
+ extra={
93
+ "request_method": request.method,
94
+ "request_path": request.url.path,
95
+ "response_status_code": response.status_code,
96
+ "response_payload": response_body.decode("utf-8", errors="replace"),
97
+ },
98
+ )
99
+ # Recreate the response body iterator since it has been consumed
100
+ response = Response(
101
+ content=response_body,
102
+ status_code=response.status_code,
103
+ headers=dict(response.headers),
104
+ media_type=response.media_type,
105
+ background=response.background,
106
+ )
107
+ return response
108
+
109
+
110
+ class EventCounter:
111
+ def __init__(self, window_seconds: int = 3600):
112
+ self.window_seconds = window_seconds
113
+ self.events: deque[float] = deque()
114
+
115
+ def log_event(self):
116
+ current_time = time.time()
117
+ # Remove events older than the window
118
+ while self.events and self.events[0] < current_time - self.window_seconds:
119
+ self.events.popleft()
120
+ # Add current event
121
+ self.events.append(current_time)
122
+
123
+ def get_count(self) -> int:
124
+ current_time = time.time()
125
+ # Remove old events and return count
126
+ while self.events and self.events[0] < current_time - self.window_seconds:
127
+ self.events.popleft()
128
+ return len(self.events)
129
+
130
+
131
+ class HourlyLogCounter(EventCounter):
132
+ def __init__(self):
133
+ super().__init__(window_seconds=3600)
@@ -17,7 +17,6 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from typing import Optional, Union
21
20
 
22
21
  from nucliadb.common import datamanagers
23
22
  from nucliadb.common.maindb.driver import Driver
@@ -69,7 +68,7 @@ class MigrationsDataManager:
69
68
  await txn.delete(MIGRATIONS_KEY.format(kbid=kbid))
70
69
  await txn.commit()
71
70
 
72
- async def get_kb_info(self, kbid: str) -> Optional[KnowledgeBoxInfo]:
71
+ async def get_kb_info(self, kbid: str) -> KnowledgeBoxInfo | None:
73
72
  async with self.driver.ro_transaction() as txn:
74
73
  kb_config = await datamanagers.kb.get_config(txn, kbid=kbid)
75
74
  if kb_config is None:
@@ -77,13 +76,7 @@ class MigrationsDataManager:
77
76
  return KnowledgeBoxInfo(current_version=kb_config.migration_version)
78
77
 
79
78
  async def update_kb_info(self, *, kbid: str, current_version: int) -> None:
80
- async with self.driver.rw_transaction() as txn:
81
- kb_config = await datamanagers.kb.get_config(txn, kbid=kbid, for_update=True)
82
- if kb_config is None:
83
- raise Exception(f"KB {kbid} does not exist")
84
- kb_config.migration_version = current_version
85
- await KnowledgeBoxORM.update(txn, kbid, config=kb_config)
86
- await txn.commit()
79
+ await KnowledgeBoxORM.update(self.driver, kbid, migration_version=current_version)
87
80
 
88
81
  async def get_global_info(self) -> GlobalInfo:
89
82
  async with self.driver.ro_transaction() as txn:
@@ -97,8 +90,8 @@ class MigrationsDataManager:
97
90
  async def update_global_info(
98
91
  self,
99
92
  *,
100
- current_version: Union[int, _Unset] = _UNSET,
101
- target_version: Union[int, None, _Unset] = _UNSET,
93
+ current_version: int | _Unset = _UNSET,
94
+ target_version: int | None | _Unset = _UNSET,
102
95
  ) -> None:
103
96
  async with self.driver.rw_transaction() as txn:
104
97
  raw_pb = await txn.get(MIGRATION_INFO_KEY, for_update=True)
@@ -19,7 +19,6 @@
19
19
  #
20
20
  import asyncio
21
21
  import logging
22
- from typing import Optional
23
22
 
24
23
  from nucliadb.common import locking
25
24
  from nucliadb.common.cluster.rollover import rollover_kb_index
@@ -233,7 +232,7 @@ async def run_pg_schema_migrations(driver: PGDriver):
233
232
  await tx.commit()
234
233
 
235
234
 
236
- async def run(context: ExecutionContext, target_version: Optional[int] = None) -> None:
235
+ async def run(context: ExecutionContext, target_version: int | None = None) -> None:
237
236
  # Run schema migrations first, since they create the `resources` table needed for the lock below
238
237
  # Schema migrations use their own locking system
239
238
  if isinstance(context.kv_driver, PGDriver):
@@ -19,7 +19,6 @@
19
19
  #
20
20
  import types
21
21
  from dataclasses import dataclass
22
- from typing import Optional
23
22
 
24
23
 
25
24
  @dataclass
@@ -36,4 +35,4 @@ class KnowledgeBoxInfo:
36
35
  @dataclass
37
36
  class GlobalInfo:
38
37
  current_version: int
39
- target_version: Optional[int] = None
38
+ target_version: int | None = None
@@ -17,14 +17,13 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from typing import Optional
21
20
 
22
21
  import pydantic
23
22
  import pydantic_settings
24
23
 
25
24
 
26
25
  class Settings(pydantic_settings.BaseSettings):
27
- redis_url: Optional[str] = None
26
+ redis_url: str | None = None
28
27
  max_concurrent_migrations: int = pydantic.Field(
29
28
  default=5,
30
29
  description="Maximum number of concurrent KB migrations allowed.",