nucliadb 6.2.0.post2675__py3-none-any.whl → 6.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. migrations/0028_extracted_vectors_reference.py +61 -0
  2. migrations/0029_backfill_field_status.py +149 -0
  3. migrations/0030_label_deduplication.py +60 -0
  4. nucliadb/common/cluster/manager.py +41 -331
  5. nucliadb/common/cluster/rebalance.py +2 -2
  6. nucliadb/common/cluster/rollover.py +12 -71
  7. nucliadb/common/cluster/settings.py +3 -0
  8. nucliadb/common/cluster/standalone/utils.py +0 -43
  9. nucliadb/common/cluster/utils.py +0 -16
  10. nucliadb/common/counters.py +1 -0
  11. nucliadb/common/datamanagers/fields.py +48 -7
  12. nucliadb/common/datamanagers/vectorsets.py +11 -2
  13. nucliadb/common/external_index_providers/base.py +2 -1
  14. nucliadb/common/external_index_providers/pinecone.py +3 -5
  15. nucliadb/common/ids.py +18 -4
  16. nucliadb/common/models_utils/from_proto.py +479 -0
  17. nucliadb/common/models_utils/to_proto.py +60 -0
  18. nucliadb/common/nidx.py +76 -37
  19. nucliadb/export_import/models.py +3 -3
  20. nucliadb/health.py +0 -7
  21. nucliadb/ingest/app.py +0 -8
  22. nucliadb/ingest/consumer/auditing.py +1 -1
  23. nucliadb/ingest/consumer/shard_creator.py +1 -1
  24. nucliadb/ingest/fields/base.py +83 -21
  25. nucliadb/ingest/orm/brain.py +55 -56
  26. nucliadb/ingest/orm/broker_message.py +12 -2
  27. nucliadb/ingest/orm/entities.py +6 -17
  28. nucliadb/ingest/orm/knowledgebox.py +44 -22
  29. nucliadb/ingest/orm/processor/data_augmentation.py +7 -29
  30. nucliadb/ingest/orm/processor/processor.py +5 -2
  31. nucliadb/ingest/orm/resource.py +222 -413
  32. nucliadb/ingest/processing.py +8 -2
  33. nucliadb/ingest/serialize.py +77 -46
  34. nucliadb/ingest/service/writer.py +2 -56
  35. nucliadb/ingest/settings.py +1 -4
  36. nucliadb/learning_proxy.py +6 -4
  37. nucliadb/purge/__init__.py +102 -12
  38. nucliadb/purge/orphan_shards.py +6 -4
  39. nucliadb/reader/api/models.py +3 -3
  40. nucliadb/reader/api/v1/__init__.py +1 -0
  41. nucliadb/reader/api/v1/download.py +2 -2
  42. nucliadb/reader/api/v1/knowledgebox.py +3 -3
  43. nucliadb/reader/api/v1/resource.py +23 -12
  44. nucliadb/reader/api/v1/services.py +4 -4
  45. nucliadb/reader/api/v1/vectorsets.py +48 -0
  46. nucliadb/search/api/v1/ask.py +11 -1
  47. nucliadb/search/api/v1/feedback.py +3 -3
  48. nucliadb/search/api/v1/knowledgebox.py +8 -13
  49. nucliadb/search/api/v1/search.py +3 -2
  50. nucliadb/search/api/v1/suggest.py +0 -2
  51. nucliadb/search/predict.py +6 -4
  52. nucliadb/search/requesters/utils.py +1 -2
  53. nucliadb/search/search/chat/ask.py +77 -13
  54. nucliadb/search/search/chat/prompt.py +16 -5
  55. nucliadb/search/search/chat/query.py +74 -34
  56. nucliadb/search/search/exceptions.py +2 -7
  57. nucliadb/search/search/find.py +9 -5
  58. nucliadb/search/search/find_merge.py +10 -4
  59. nucliadb/search/search/graph_strategy.py +884 -0
  60. nucliadb/search/search/hydrator.py +6 -0
  61. nucliadb/search/search/merge.py +79 -24
  62. nucliadb/search/search/query.py +74 -245
  63. nucliadb/search/search/query_parser/exceptions.py +11 -1
  64. nucliadb/search/search/query_parser/fetcher.py +405 -0
  65. nucliadb/search/search/query_parser/models.py +0 -3
  66. nucliadb/search/search/query_parser/parser.py +22 -21
  67. nucliadb/search/search/rerankers.py +1 -42
  68. nucliadb/search/search/shards.py +19 -0
  69. nucliadb/standalone/api_router.py +2 -14
  70. nucliadb/standalone/settings.py +4 -0
  71. nucliadb/train/generators/field_streaming.py +7 -3
  72. nucliadb/train/lifecycle.py +3 -6
  73. nucliadb/train/nodes.py +14 -12
  74. nucliadb/train/resource.py +380 -0
  75. nucliadb/writer/api/constants.py +20 -16
  76. nucliadb/writer/api/v1/__init__.py +1 -0
  77. nucliadb/writer/api/v1/export_import.py +1 -1
  78. nucliadb/writer/api/v1/field.py +13 -7
  79. nucliadb/writer/api/v1/knowledgebox.py +3 -46
  80. nucliadb/writer/api/v1/resource.py +20 -13
  81. nucliadb/writer/api/v1/services.py +10 -1
  82. nucliadb/writer/api/v1/upload.py +61 -34
  83. nucliadb/writer/{vectorsets.py → api/v1/vectorsets.py} +99 -47
  84. nucliadb/writer/back_pressure.py +17 -46
  85. nucliadb/writer/resource/basic.py +9 -7
  86. nucliadb/writer/resource/field.py +42 -9
  87. nucliadb/writer/settings.py +2 -2
  88. nucliadb/writer/tus/gcs.py +11 -10
  89. {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/METADATA +11 -14
  90. {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/RECORD +94 -96
  91. {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/WHEEL +1 -1
  92. nucliadb/common/cluster/discovery/base.py +0 -178
  93. nucliadb/common/cluster/discovery/k8s.py +0 -301
  94. nucliadb/common/cluster/discovery/manual.py +0 -57
  95. nucliadb/common/cluster/discovery/single.py +0 -51
  96. nucliadb/common/cluster/discovery/types.py +0 -32
  97. nucliadb/common/cluster/discovery/utils.py +0 -67
  98. nucliadb/common/cluster/standalone/grpc_node_binding.py +0 -349
  99. nucliadb/common/cluster/standalone/index_node.py +0 -123
  100. nucliadb/common/cluster/standalone/service.py +0 -84
  101. nucliadb/standalone/introspect.py +0 -208
  102. nucliadb-6.2.0.post2675.dist-info/zip-safe +0 -1
  103. /nucliadb/common/{cluster/discovery → models_utils}/__init__.py +0 -0
  104. {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/entry_points.txt +0 -0
  105. {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,61 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+
21
+ """Migration #28
22
+
23
+ Add a key to each vectorset to know how to build the storage key for extracted vectors
24
+ """
25
+
26
+ import logging
27
+
28
+ from nucliadb.common import datamanagers
29
+ from nucliadb.migrator.context import ExecutionContext
30
+ from nucliadb_protos import knowledgebox_pb2
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
async def migrate(context: ExecutionContext) -> None:
    """Global migration step: intentionally a no-op.

    All the work of this migration is done per knowledge box in
    `migrate_kb`.
    """
    return None
36
+
37
+
38
async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
    """Set `storage_key_kind` on every vectorset of the knowledge box `kbid`.

    - A KB with exactly one vectorset gets `LEGACY`.
    - A KB with several vectorsets gets `VECTORSET_PREFIX` on each of them.
    - A KB with no vectorsets only logs a warning and returns without
      committing anything.

    `context` is not used by this function.
    """
    async with datamanagers.with_rw_transaction() as txn:
        # Materialise the configs first: the number of vectorsets decides
        # which storage key kind applies to all of them.
        vectorsets = []
        async for _vid, vs in datamanagers.vectorsets.iter(txn, kbid=kbid):
            vectorsets.append(vs)
        total = len(vectorsets)

        if total == 0:  # pragma: nocover
            # should never happen, everyone should have at least one
            logger.warning(f"KB has no vectorsets!", extra={"kbid": kbid})
            return

        if total == 1:
            logger.info(f"Migrating KB with a single vectorset", extra={"kbid": kbid})
            kind = knowledgebox_pb2.VectorSetConfig.StorageKeyKind.LEGACY
        else:
            logger.info(f"Migrating KB with {len(vectorsets)} vectorsets", extra={"kbid": kbid})
            kind = knowledgebox_pb2.VectorSetConfig.StorageKeyKind.VECTORSET_PREFIX

        # Persist the updated config of each vectorset within the same transaction.
        for vectorset in vectorsets:
            vectorset.storage_key_kind = kind
            await datamanagers.vectorsets.set(txn, kbid=kbid, config=vectorset)

        await txn.commit()
@@ -0,0 +1,149 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+
21
+ """Migration #29
22
+
23
+ Backfill field status (from error)
24
+ """
25
+
26
+ import logging
27
+ from typing import Optional
28
+
29
+ from nucliadb.migrator.context import ExecutionContext
30
+ from nucliadb_protos import resources_pb2, writer_pb2
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
async def migrate(context: ExecutionContext) -> None:
    """Run the backfill batch by batch until `do_batch` reports completion.

    Each call to `do_batch` receives the cursor returned by the previous
    call (the empty string for the first one) and the loop stops as soon as
    `do_batch` returns `None`.
    """
    cursor: Optional[str] = ""
    while cursor is not None:
        cursor = await do_batch(context, cursor)
41
+
42
+
43
async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
    """Per-KB migration step: intentionally a no-op.

    This migration does all of its work globally, in `migrate`.
    """
    return None
44
+
45
+
46
def _resource_key(field_key: str) -> str:
    """Return the `/kbs/<kb>/r/<resource>` prefix of a field key."""
    return "/".join(field_key.split("/")[:5])


async def do_batch(context: ExecutionContext, start: str) -> Optional[str]:
    """Backfill the `/status` key for one batch of up to 500 field keys.

    Field keys strictly greater than `start` are read in key order. For each
    one that has no `<field>/status` key yet, a `FieldStatus` is written:
    `PROCESSED` by default, or `ERROR` (carrying the stored error) when the
    field has an `<field>/error` key and either that error comes from data
    augmentation or the owning resource is itself in error status. Fields
    whose resource has no basic record are skipped with a warning.

    Args:
        context: migration context; only `context.kv_driver` is used.
        start: exclusive lower bound for the field keys of this batch
            (the empty string for the first batch).

    Returns:
        The last field key processed, to be passed as `start` of the next
        batch, or `None` when there are no more field keys.
    """
    logger.info(f"Running batch from {start}")
    async with context.kv_driver.transaction(read_only=False) as txn:
        async with txn.connection.cursor() as cur:  # type: ignore
            # Retrieve a batch of fields
            await cur.execute(
                """
                SELECT key FROM resources
                WHERE key ~ '^/kbs/[^/]*/r/[^/]*/f/[^/]*/[^/]*$'
                AND key > %s
                ORDER BY key
                LIMIT 500""",
                (start,),
            )
            records = await cur.fetchall()
            if len(records) == 0:
                return None

            field_keys = [r[0] for r in records]

            # Retrieve resources basic (to check status)
            resource_keys = {_resource_key(f) for f in field_keys}
            await cur.execute(
                """
                SELECT key, value FROM resources
                WHERE key = ANY (%s)
                ORDER BY key
                """,
                (list(resource_keys),),
            )
            records = await cur.fetchall()
            resources_basic = {}
            for k, v in records:
                row_basic = resources_pb2.Basic()
                row_basic.ParseFromString(v)
                resources_basic[k] = row_basic

            # Retrieve field errors.
            # The sub-keys are looked up by exact key instead of by a
            # `start < key <= last + suffix` range: with a range, a field id
            # that is a prefix of the last field id followed by a character
            # sorting before "/" (e.g. "a" vs "a-b") has its sub-key sort
            # *after* the upper bound and would be silently missed.
            await cur.execute(
                """
                SELECT key, value FROM resources
                WHERE key = ANY (%s)
                """,
                ([field_key + "/error" for field_key in field_keys],),
            )
            records = await cur.fetchall()
            errors = {}
            for k, v in records:
                row_error = writer_pb2.Error()
                row_error.ParseFromString(v)
                errors[k] = row_error

            # Retrieve existing status keys (exact look-up, same reason as
            # above). Kept in a set: it is probed once per field below.
            await cur.execute(
                """
                SELECT key FROM resources
                WHERE key = ANY (%s)
                """,
                ([field_key + "/status" for field_key in field_keys],),
            )
            records = await cur.fetchall()
            has_status = {r[0] for r in records}

            set_batch = []
            for field_key in field_keys:
                if field_key + "/status" in has_status:
                    # Already has status, skip
                    continue

                basic = resources_basic.get(_resource_key(field_key), None)
                if basic is None:
                    logger.warning(f"{field_key} resource has no basic, skipped")
                    continue

                status = writer_pb2.FieldStatus()
                status.status = writer_pb2.FieldStatus.Status.PROCESSED
                error = errors.get(field_key + "/error", None)
                # We only copy errors if they come from data augmentation or if the resource is in error
                # This way we ensure we do not set an error for resources that were previously not in error
                # There is no way to do this 100% accurate since the /error key is only cleared on field deletion
                if error and (
                    error.code == writer_pb2.Error.ErrorCode.DATAAUGMENTATION
                    or basic.metadata.status == resources_pb2.Metadata.Status.ERROR
                ):
                    field_error = writer_pb2.FieldError(
                        source_error=error,
                    )
                    status.errors.append(field_error)
                    status.status = writer_pb2.FieldStatus.Status.ERROR
                set_batch.append((field_key + "/status", status.SerializeToString()))

            # Write everything to the database in batch
            async with cur.copy("COPY resources (key, value) FROM STDIN") as copy:
                for row in set_batch:
                    await copy.write_row(row)
            await txn.commit()

            return field_keys[-1]
@@ -0,0 +1,60 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+
21
+ """Migration #30
22
+
23
+ We don't want to support labels with the same title anymore. Run a deduplication for
24
+ all labelsets
25
+
26
+ """
27
+
28
+ import logging
29
+
30
+ from nucliadb.common import datamanagers
31
+ from nucliadb.migrator.context import ExecutionContext
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+
36
async def migrate(context: ExecutionContext) -> None:
    """Global migration step: intentionally a no-op.

    The label deduplication is done per knowledge box in `migrate_kb`.
    """
    return None
37
+
38
+
39
async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
    """Deduplicate, by title, the labels of every labelset of `kbid`.

    Titles are compared case-insensitively (lowercased). Within a labelset
    the first label seen for a given title is kept and later ones are
    dropped. The labels are written back only if at least one labelset lost
    a label; the transaction is committed in either case.

    `context` is not used by this function.
    """
    async with datamanagers.with_rw_transaction() as txn:
        kb_labels = await datamanagers.labels.get_labels(txn, kbid=kbid)
        any_removed = False

        for labelset in kb_labels.labelset.values():
            # NOTE(review): `previous` is the container obtained *before* the
            # field is cleared; the loop below relies on it still yielding the
            # old labels afterwards -- confirm against the protobuf runtime in use.
            previous = labelset.labels
            labelset.ClearField("labels")
            seen_titles = set()

            for candidate in previous:
                normalized_title = candidate.title.lower()
                if normalized_title in seen_titles:
                    # A label with this title was already kept: drop this one.
                    continue
                seen_titles.add(normalized_title)
                labelset.labels.append(candidate)

            if not any_removed and len(labelset.labels) < len(previous):
                any_removed = True

        if any_removed:
            await datamanagers.labels.set_labels(txn, kbid=kbid, labels=kb_labels)

        await txn.commit()