nucliadb 6.2.0.post2675__py3-none-any.whl → 6.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. migrations/0028_extracted_vectors_reference.py +61 -0
  2. migrations/0029_backfill_field_status.py +149 -0
  3. migrations/0030_label_deduplication.py +60 -0
  4. nucliadb/common/cluster/manager.py +41 -331
  5. nucliadb/common/cluster/rebalance.py +2 -2
  6. nucliadb/common/cluster/rollover.py +12 -71
  7. nucliadb/common/cluster/settings.py +3 -0
  8. nucliadb/common/cluster/standalone/utils.py +0 -43
  9. nucliadb/common/cluster/utils.py +0 -16
  10. nucliadb/common/counters.py +1 -0
  11. nucliadb/common/datamanagers/fields.py +48 -7
  12. nucliadb/common/datamanagers/vectorsets.py +11 -2
  13. nucliadb/common/external_index_providers/base.py +2 -1
  14. nucliadb/common/external_index_providers/pinecone.py +3 -5
  15. nucliadb/common/ids.py +18 -4
  16. nucliadb/common/models_utils/from_proto.py +479 -0
  17. nucliadb/common/models_utils/to_proto.py +60 -0
  18. nucliadb/common/nidx.py +76 -37
  19. nucliadb/export_import/models.py +3 -3
  20. nucliadb/health.py +0 -7
  21. nucliadb/ingest/app.py +0 -8
  22. nucliadb/ingest/consumer/auditing.py +1 -1
  23. nucliadb/ingest/consumer/shard_creator.py +1 -1
  24. nucliadb/ingest/fields/base.py +83 -21
  25. nucliadb/ingest/orm/brain.py +55 -56
  26. nucliadb/ingest/orm/broker_message.py +12 -2
  27. nucliadb/ingest/orm/entities.py +6 -17
  28. nucliadb/ingest/orm/knowledgebox.py +44 -22
  29. nucliadb/ingest/orm/processor/data_augmentation.py +7 -29
  30. nucliadb/ingest/orm/processor/processor.py +5 -2
  31. nucliadb/ingest/orm/resource.py +222 -413
  32. nucliadb/ingest/processing.py +8 -2
  33. nucliadb/ingest/serialize.py +77 -46
  34. nucliadb/ingest/service/writer.py +2 -56
  35. nucliadb/ingest/settings.py +1 -4
  36. nucliadb/learning_proxy.py +6 -4
  37. nucliadb/purge/__init__.py +102 -12
  38. nucliadb/purge/orphan_shards.py +6 -4
  39. nucliadb/reader/api/models.py +3 -3
  40. nucliadb/reader/api/v1/__init__.py +1 -0
  41. nucliadb/reader/api/v1/download.py +2 -2
  42. nucliadb/reader/api/v1/knowledgebox.py +3 -3
  43. nucliadb/reader/api/v1/resource.py +23 -12
  44. nucliadb/reader/api/v1/services.py +4 -4
  45. nucliadb/reader/api/v1/vectorsets.py +48 -0
  46. nucliadb/search/api/v1/ask.py +11 -1
  47. nucliadb/search/api/v1/feedback.py +3 -3
  48. nucliadb/search/api/v1/knowledgebox.py +8 -13
  49. nucliadb/search/api/v1/search.py +3 -2
  50. nucliadb/search/api/v1/suggest.py +0 -2
  51. nucliadb/search/predict.py +6 -4
  52. nucliadb/search/requesters/utils.py +1 -2
  53. nucliadb/search/search/chat/ask.py +77 -13
  54. nucliadb/search/search/chat/prompt.py +16 -5
  55. nucliadb/search/search/chat/query.py +74 -34
  56. nucliadb/search/search/exceptions.py +2 -7
  57. nucliadb/search/search/find.py +9 -5
  58. nucliadb/search/search/find_merge.py +10 -4
  59. nucliadb/search/search/graph_strategy.py +884 -0
  60. nucliadb/search/search/hydrator.py +6 -0
  61. nucliadb/search/search/merge.py +79 -24
  62. nucliadb/search/search/query.py +74 -245
  63. nucliadb/search/search/query_parser/exceptions.py +11 -1
  64. nucliadb/search/search/query_parser/fetcher.py +405 -0
  65. nucliadb/search/search/query_parser/models.py +0 -3
  66. nucliadb/search/search/query_parser/parser.py +22 -21
  67. nucliadb/search/search/rerankers.py +1 -42
  68. nucliadb/search/search/shards.py +19 -0
  69. nucliadb/standalone/api_router.py +2 -14
  70. nucliadb/standalone/settings.py +4 -0
  71. nucliadb/train/generators/field_streaming.py +7 -3
  72. nucliadb/train/lifecycle.py +3 -6
  73. nucliadb/train/nodes.py +14 -12
  74. nucliadb/train/resource.py +380 -0
  75. nucliadb/writer/api/constants.py +20 -16
  76. nucliadb/writer/api/v1/__init__.py +1 -0
  77. nucliadb/writer/api/v1/export_import.py +1 -1
  78. nucliadb/writer/api/v1/field.py +13 -7
  79. nucliadb/writer/api/v1/knowledgebox.py +3 -46
  80. nucliadb/writer/api/v1/resource.py +20 -13
  81. nucliadb/writer/api/v1/services.py +10 -1
  82. nucliadb/writer/api/v1/upload.py +61 -34
  83. nucliadb/writer/{vectorsets.py → api/v1/vectorsets.py} +99 -47
  84. nucliadb/writer/back_pressure.py +17 -46
  85. nucliadb/writer/resource/basic.py +9 -7
  86. nucliadb/writer/resource/field.py +42 -9
  87. nucliadb/writer/settings.py +2 -2
  88. nucliadb/writer/tus/gcs.py +11 -10
  89. {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/METADATA +11 -14
  90. {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/RECORD +94 -96
  91. {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/WHEEL +1 -1
  92. nucliadb/common/cluster/discovery/base.py +0 -178
  93. nucliadb/common/cluster/discovery/k8s.py +0 -301
  94. nucliadb/common/cluster/discovery/manual.py +0 -57
  95. nucliadb/common/cluster/discovery/single.py +0 -51
  96. nucliadb/common/cluster/discovery/types.py +0 -32
  97. nucliadb/common/cluster/discovery/utils.py +0 -67
  98. nucliadb/common/cluster/standalone/grpc_node_binding.py +0 -349
  99. nucliadb/common/cluster/standalone/index_node.py +0 -123
  100. nucliadb/common/cluster/standalone/service.py +0 -84
  101. nucliadb/standalone/introspect.py +0 -208
  102. nucliadb-6.2.0.post2675.dist-info/zip-safe +0 -1
  103. /nucliadb/common/{cluster/discovery → models_utils}/__init__.py +0 -0
  104. {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/entry_points.txt +0 -0
  105. {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/top_level.txt +0 -0
@@ -1,208 +0,0 @@
1
- # Copyright (C) 2021 Bosutech XXI S.L.
2
- #
3
- # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
- # For commercial licensing, contact us at info@nuclia.com.
5
- #
6
- # AGPL:
7
- # This program is free software: you can redistribute it and/or modify
8
- # it under the terms of the GNU Affero General Public License as
9
- # published by the Free Software Foundation, either version 3 of the
10
- # License, or (at your option) any later version.
11
- #
12
- # This program is distributed in the hope that it will be useful,
13
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- # GNU Affero General Public License for more details.
16
- #
17
- # You should have received a copy of the GNU Affero General Public License
18
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
- #
20
-
21
- import asyncio
22
- import os
23
- import platform
24
- import sys
25
- import tarfile
26
- import tempfile
27
- from collections.abc import AsyncGenerator
28
- from typing import Optional
29
-
30
- import pkg_resources
31
- import psutil
32
- from fastapi import FastAPI
33
- from pydantic import BaseModel
34
-
35
- from nucliadb.common.cluster import manager as cluster_manager
36
- from nucliadb.standalone.settings import Settings
37
- from nucliadb_telemetry.settings import LogOutputType, LogSettings
38
-
39
- MB = 1024 * 1024
40
- CHUNK_SIZE = 2 * MB
41
- SYSTEM_INFO_TEMPLATE = """System info
42
- ===========
43
-
44
- Python
45
- ------
46
- - Version: {python_version}
47
-
48
- Operative system
49
- ----------------
50
- - Name: {os_name}
51
- - Release: {os_release}
52
- - Version: {os_version}
53
- - Machine: {os_machine}
54
- - File System Encoding: {os_file_system_encoding}
55
-
56
- CPU information
57
- ---------------
58
- - Number of CPUs: {cpu_count}
59
-
60
- Memory information
61
- ------------------
62
- - Total: {memory_total:.2f} MB
63
- - Available: {memory_available:.2f} MB
64
- - Used: {memory_used:.2f} MB
65
- - Used %: {memory_used_percent:.2f}%
66
- """
67
-
68
-
69
- class NodeInfo(BaseModel):
70
- id: str
71
- address: str
72
- shard_count: int
73
- primary_id: Optional[str] = None
74
-
75
-
76
- class ClusterInfo(BaseModel):
77
- nodes: list[NodeInfo]
78
-
79
-
80
- async def stream_tar(app: FastAPI) -> AsyncGenerator[bytes, None]:
81
- with tempfile.TemporaryDirectory() as temp_dir:
82
- tar_file = os.path.join(temp_dir, "introspect.tar.gz")
83
- with tarfile.open(tar_file, mode="w:gz") as tar:
84
- await add_system_info(temp_dir, tar)
85
- await add_dependencies(temp_dir, tar)
86
- await add_cluster_info(temp_dir, tar)
87
- settings: Settings = app.settings.copy() # type: ignore
88
- await add_settings(temp_dir, tar, settings)
89
- if settings.log_output_type == LogOutputType.FILE:
90
- await add_logs(tar)
91
-
92
- async for chunk in stream_out_tar(tar_file):
93
- yield chunk
94
-
95
-
96
- async def stream_out_tar(tar_file: str) -> AsyncGenerator[bytes, None]:
97
- loop = asyncio.get_event_loop()
98
- with open(tar_file, "rb") as f:
99
- chunk = await loop.run_in_executor(None, f.read, CHUNK_SIZE)
100
- while chunk:
101
- yield chunk
102
- chunk = await loop.run_in_executor(None, f.read, CHUNK_SIZE)
103
-
104
-
105
- async def add_system_info(temp_dir: str, tar: tarfile.TarFile):
106
- loop = asyncio.get_event_loop()
107
- await loop.run_in_executor(None, _add_system_info_to_tar, temp_dir, tar)
108
-
109
-
110
- def _add_system_info_to_tar(temp_dir: str, tar: tarfile.TarFile):
111
- system_info_file = os.path.join(temp_dir, "system_info.txt")
112
- with open(system_info_file, "w") as f:
113
- memory = psutil.virtual_memory()
114
- f.write(
115
- SYSTEM_INFO_TEMPLATE.format(
116
- python_version=sys.version,
117
- os_name=os.uname().sysname,
118
- os_release=platform.release(),
119
- os_version=platform.version(),
120
- os_machine=platform.machine(),
121
- os_file_system_encoding=os.sys.getfilesystemencoding(), # type: ignore
122
- cpu_count=psutil.cpu_count(),
123
- memory_total=memory.total / MB,
124
- memory_available=memory.available / MB,
125
- memory_used=memory.used / MB,
126
- memory_used_percent=memory.percent,
127
- )
128
- )
129
- tar.add(system_info_file, arcname="system_info.txt")
130
-
131
-
132
- async def add_dependencies(temp_dir: str, tar: tarfile.TarFile):
133
- loop = asyncio.get_event_loop()
134
- await loop.run_in_executor(None, _add_dependencies_to_tar, temp_dir, tar)
135
-
136
-
137
- def _add_dependencies_to_tar(temp_dir: str, tar: tarfile.TarFile):
138
- dependendies_file = os.path.join(temp_dir, "dependencies.txt")
139
- with open(dependendies_file, "w") as f:
140
- installed_packages = [pkg for pkg in pkg_resources.working_set]
141
- lines = []
142
- for pkg in sorted(installed_packages, key=lambda p: p.key):
143
- lines.append(f"{pkg.key}=={pkg.version}\n")
144
- f.writelines(lines)
145
- tar.add(dependendies_file, arcname="dependencies.txt")
146
-
147
-
148
- async def add_cluster_info(temp_dir: str, tar: tarfile.TarFile):
149
- loop = asyncio.get_event_loop()
150
- await loop.run_in_executor(None, _add_cluster_info_to_tar, temp_dir, tar)
151
-
152
-
153
- def _add_cluster_info_to_tar(temp_dir: str, tar: tarfile.TarFile):
154
- cluster_info = ClusterInfo(
155
- nodes=[
156
- NodeInfo(
157
- id=node.id,
158
- address=node.address,
159
- shard_count=node.shard_count,
160
- primary_id=node.primary_id,
161
- )
162
- for node in cluster_manager.get_index_nodes()
163
- ]
164
- )
165
- cluster_info_file = os.path.join(temp_dir, "cluster_info.txt")
166
- with open(cluster_info_file, "w") as f:
167
- f.write(cluster_info.model_dump_json(indent=4))
168
- tar.add(cluster_info_file, arcname="cluster_info.txt")
169
-
170
-
171
- async def add_settings(temp_dir: str, tar: tarfile.TarFile, settings: Settings):
172
- loop = asyncio.get_event_loop()
173
- await loop.run_in_executor(None, _add_settings_to_tar, temp_dir, tar, settings)
174
-
175
-
176
- def _add_settings_to_tar(temp_dir: str, tar: tarfile.TarFile, settings: Settings):
177
- remove_sensitive_settings(settings)
178
- settings_file = os.path.join(temp_dir, "settings.json")
179
- with open(settings_file, "w") as f:
180
- f.write(settings.model_dump_json(indent=4))
181
- tar.add(settings_file, arcname="settings.json")
182
-
183
-
184
- def remove_sensitive_settings(settings: Settings):
185
- for sensitive_setting in [
186
- "nua_api_key",
187
- "jwk_key",
188
- "gcs_base64_creds",
189
- "s3_client_secret",
190
- "driver_pg_url",
191
- ]:
192
- if hasattr(settings, sensitive_setting):
193
- setattr(settings, sensitive_setting, "********")
194
-
195
-
196
- async def add_logs(tar):
197
- loop = asyncio.get_event_loop()
198
- await loop.run_in_executor(None, _add_logs_to_tar, tar)
199
-
200
-
201
- def _add_logs_to_tar(tar: tarfile.TarFile):
202
- log_settings = LogSettings()
203
- access_log = os.path.realpath(log_settings.access_log)
204
- tar.add(access_log, arcname="logs/access.log")
205
- error_log = os.path.realpath(log_settings.error_log)
206
- tar.add(error_log, arcname="logs/error.log")
207
- info_log = os.path.realpath(log_settings.info_log)
208
- tar.add(info_log, arcname="logs/info.log")
@@ -1 +0,0 @@
1
-