aiverify-moonshot 0.4.0 (aiverify_moonshot-0.4.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiverify_moonshot-0.4.0.dist-info/METADATA +249 -0
- aiverify_moonshot-0.4.0.dist-info/RECORD +163 -0
- aiverify_moonshot-0.4.0.dist-info/WHEEL +4 -0
- aiverify_moonshot-0.4.0.dist-info/licenses/AUTHORS.md +5 -0
- aiverify_moonshot-0.4.0.dist-info/licenses/LICENSE.md +201 -0
- aiverify_moonshot-0.4.0.dist-info/licenses/NOTICES.md +3340 -0
- moonshot/__init__.py +0 -0
- moonshot/__main__.py +198 -0
- moonshot/api.py +155 -0
- moonshot/integrations/__init__.py +0 -0
- moonshot/integrations/cli/__init__.py +0 -0
- moonshot/integrations/cli/__main__.py +25 -0
- moonshot/integrations/cli/active_session_cfg.py +1 -0
- moonshot/integrations/cli/benchmark/__init__.py +0 -0
- moonshot/integrations/cli/benchmark/benchmark.py +186 -0
- moonshot/integrations/cli/benchmark/cookbook.py +545 -0
- moonshot/integrations/cli/benchmark/datasets.py +164 -0
- moonshot/integrations/cli/benchmark/metrics.py +141 -0
- moonshot/integrations/cli/benchmark/recipe.py +598 -0
- moonshot/integrations/cli/benchmark/result.py +216 -0
- moonshot/integrations/cli/benchmark/run.py +140 -0
- moonshot/integrations/cli/benchmark/runner.py +174 -0
- moonshot/integrations/cli/cli.py +64 -0
- moonshot/integrations/cli/common/__init__.py +0 -0
- moonshot/integrations/cli/common/common.py +72 -0
- moonshot/integrations/cli/common/connectors.py +325 -0
- moonshot/integrations/cli/common/display_helper.py +42 -0
- moonshot/integrations/cli/common/prompt_template.py +94 -0
- moonshot/integrations/cli/initialisation/__init__.py +0 -0
- moonshot/integrations/cli/initialisation/initialisation.py +14 -0
- moonshot/integrations/cli/redteam/__init__.py +0 -0
- moonshot/integrations/cli/redteam/attack_module.py +70 -0
- moonshot/integrations/cli/redteam/context_strategy.py +147 -0
- moonshot/integrations/cli/redteam/prompt_template.py +67 -0
- moonshot/integrations/cli/redteam/redteam.py +90 -0
- moonshot/integrations/cli/redteam/session.py +467 -0
- moonshot/integrations/web_api/.env.dev +7 -0
- moonshot/integrations/web_api/__init__.py +0 -0
- moonshot/integrations/web_api/__main__.py +56 -0
- moonshot/integrations/web_api/app.py +125 -0
- moonshot/integrations/web_api/container.py +146 -0
- moonshot/integrations/web_api/log/.gitkeep +0 -0
- moonshot/integrations/web_api/logging_conf.py +114 -0
- moonshot/integrations/web_api/routes/__init__.py +0 -0
- moonshot/integrations/web_api/routes/attack_modules.py +66 -0
- moonshot/integrations/web_api/routes/benchmark.py +116 -0
- moonshot/integrations/web_api/routes/benchmark_result.py +175 -0
- moonshot/integrations/web_api/routes/context_strategy.py +129 -0
- moonshot/integrations/web_api/routes/cookbook.py +225 -0
- moonshot/integrations/web_api/routes/dataset.py +120 -0
- moonshot/integrations/web_api/routes/endpoint.py +282 -0
- moonshot/integrations/web_api/routes/metric.py +78 -0
- moonshot/integrations/web_api/routes/prompt_template.py +128 -0
- moonshot/integrations/web_api/routes/recipe.py +219 -0
- moonshot/integrations/web_api/routes/redteam.py +609 -0
- moonshot/integrations/web_api/routes/runner.py +239 -0
- moonshot/integrations/web_api/schemas/__init__.py +0 -0
- moonshot/integrations/web_api/schemas/benchmark_runner_dto.py +13 -0
- moonshot/integrations/web_api/schemas/cookbook_create_dto.py +19 -0
- moonshot/integrations/web_api/schemas/cookbook_response_model.py +9 -0
- moonshot/integrations/web_api/schemas/dataset_response_dto.py +9 -0
- moonshot/integrations/web_api/schemas/endpoint_create_dto.py +21 -0
- moonshot/integrations/web_api/schemas/endpoint_response_model.py +11 -0
- moonshot/integrations/web_api/schemas/prompt_response_model.py +14 -0
- moonshot/integrations/web_api/schemas/prompt_template_response_model.py +10 -0
- moonshot/integrations/web_api/schemas/recipe_create_dto.py +32 -0
- moonshot/integrations/web_api/schemas/recipe_response_model.py +7 -0
- moonshot/integrations/web_api/schemas/session_create_dto.py +16 -0
- moonshot/integrations/web_api/schemas/session_prompt_dto.py +7 -0
- moonshot/integrations/web_api/schemas/session_response_model.py +38 -0
- moonshot/integrations/web_api/services/__init__.py +0 -0
- moonshot/integrations/web_api/services/attack_module_service.py +34 -0
- moonshot/integrations/web_api/services/auto_red_team_test_manager.py +86 -0
- moonshot/integrations/web_api/services/auto_red_team_test_state.py +57 -0
- moonshot/integrations/web_api/services/base_service.py +8 -0
- moonshot/integrations/web_api/services/benchmark_result_service.py +25 -0
- moonshot/integrations/web_api/services/benchmark_test_manager.py +106 -0
- moonshot/integrations/web_api/services/benchmark_test_state.py +56 -0
- moonshot/integrations/web_api/services/benchmarking_service.py +31 -0
- moonshot/integrations/web_api/services/context_strategy_service.py +22 -0
- moonshot/integrations/web_api/services/cookbook_service.py +194 -0
- moonshot/integrations/web_api/services/dataset_service.py +20 -0
- moonshot/integrations/web_api/services/endpoint_service.py +65 -0
- moonshot/integrations/web_api/services/metric_service.py +14 -0
- moonshot/integrations/web_api/services/prompt_template_service.py +39 -0
- moonshot/integrations/web_api/services/recipe_service.py +155 -0
- moonshot/integrations/web_api/services/runner_service.py +147 -0
- moonshot/integrations/web_api/services/session_service.py +350 -0
- moonshot/integrations/web_api/services/utils/exceptions_handler.py +41 -0
- moonshot/integrations/web_api/services/utils/results_formatter.py +47 -0
- moonshot/integrations/web_api/status_updater/interface/benchmark_progress_callback.py +14 -0
- moonshot/integrations/web_api/status_updater/interface/redteam_progress_callback.py +14 -0
- moonshot/integrations/web_api/status_updater/moonshot_ui_webhook.py +72 -0
- moonshot/integrations/web_api/types/types.py +99 -0
- moonshot/src/__init__.py +0 -0
- moonshot/src/api/__init__.py +0 -0
- moonshot/src/api/api_connector.py +58 -0
- moonshot/src/api/api_connector_endpoint.py +162 -0
- moonshot/src/api/api_context_strategy.py +57 -0
- moonshot/src/api/api_cookbook.py +160 -0
- moonshot/src/api/api_dataset.py +46 -0
- moonshot/src/api/api_environment_variables.py +17 -0
- moonshot/src/api/api_metrics.py +51 -0
- moonshot/src/api/api_prompt_template.py +43 -0
- moonshot/src/api/api_recipe.py +182 -0
- moonshot/src/api/api_red_teaming.py +59 -0
- moonshot/src/api/api_result.py +84 -0
- moonshot/src/api/api_run.py +74 -0
- moonshot/src/api/api_runner.py +132 -0
- moonshot/src/api/api_session.py +290 -0
- moonshot/src/configs/__init__.py +0 -0
- moonshot/src/configs/env_variables.py +187 -0
- moonshot/src/connectors/__init__.py +0 -0
- moonshot/src/connectors/connector.py +327 -0
- moonshot/src/connectors/connector_prompt_arguments.py +17 -0
- moonshot/src/connectors_endpoints/__init__.py +0 -0
- moonshot/src/connectors_endpoints/connector_endpoint.py +211 -0
- moonshot/src/connectors_endpoints/connector_endpoint_arguments.py +54 -0
- moonshot/src/cookbooks/__init__.py +0 -0
- moonshot/src/cookbooks/cookbook.py +225 -0
- moonshot/src/cookbooks/cookbook_arguments.py +34 -0
- moonshot/src/datasets/__init__.py +0 -0
- moonshot/src/datasets/dataset.py +255 -0
- moonshot/src/datasets/dataset_arguments.py +50 -0
- moonshot/src/metrics/__init__.py +0 -0
- moonshot/src/metrics/metric.py +192 -0
- moonshot/src/metrics/metric_interface.py +95 -0
- moonshot/src/prompt_templates/__init__.py +0 -0
- moonshot/src/prompt_templates/prompt_template.py +103 -0
- moonshot/src/recipes/__init__.py +0 -0
- moonshot/src/recipes/recipe.py +340 -0
- moonshot/src/recipes/recipe_arguments.py +111 -0
- moonshot/src/redteaming/__init__.py +0 -0
- moonshot/src/redteaming/attack/__init__.py +0 -0
- moonshot/src/redteaming/attack/attack_module.py +618 -0
- moonshot/src/redteaming/attack/attack_module_arguments.py +44 -0
- moonshot/src/redteaming/attack/context_strategy.py +131 -0
- moonshot/src/redteaming/context_strategy/__init__.py +0 -0
- moonshot/src/redteaming/context_strategy/context_strategy_interface.py +46 -0
- moonshot/src/redteaming/session/__init__.py +0 -0
- moonshot/src/redteaming/session/chat.py +209 -0
- moonshot/src/redteaming/session/red_teaming_progress.py +128 -0
- moonshot/src/redteaming/session/red_teaming_type.py +6 -0
- moonshot/src/redteaming/session/session.py +775 -0
- moonshot/src/results/__init__.py +0 -0
- moonshot/src/results/result.py +119 -0
- moonshot/src/results/result_arguments.py +44 -0
- moonshot/src/runners/__init__.py +0 -0
- moonshot/src/runners/runner.py +476 -0
- moonshot/src/runners/runner_arguments.py +46 -0
- moonshot/src/runners/runner_type.py +6 -0
- moonshot/src/runs/__init__.py +0 -0
- moonshot/src/runs/run.py +344 -0
- moonshot/src/runs/run_arguments.py +162 -0
- moonshot/src/runs/run_progress.py +145 -0
- moonshot/src/runs/run_status.py +10 -0
- moonshot/src/storage/__init__.py +0 -0
- moonshot/src/storage/db_interface.py +128 -0
- moonshot/src/storage/io_interface.py +31 -0
- moonshot/src/storage/storage.py +525 -0
- moonshot/src/utils/__init__.py +0 -0
- moonshot/src/utils/import_modules.py +96 -0
- moonshot/src/utils/timeit.py +25 -0
moonshot/src/cookbooks/cookbook.py (@@ -0,0 +1,225 @@):

```python
from __future__ import annotations

from pathlib import Path

from pydantic import validate_call
from slugify import slugify

from moonshot.src.configs.env_variables import EnvVariables
from moonshot.src.cookbooks.cookbook_arguments import CookbookArguments
from moonshot.src.storage.storage import Storage


class Cookbook:
    def __init__(self, cb_args: CookbookArguments) -> None:
        self.id = cb_args.id
        self.name = cb_args.name
        self.description = cb_args.description
        self.recipes = cb_args.recipes

    @classmethod
    def load(cls, cb_id: str) -> Cookbook:
        """
        This method loads a cookbook from a JSON file.

        It uses the cookbook ID to construct the file path for the JSON file in the designated cookbook directory.
        The method then reads the JSON file and returns the cookbook information as a Cookbook instance.

        Args:
            cb_id (str): The unique identifier of the cookbook.

        Returns:
            Cookbook: An instance of the Cookbook class populated with the loaded cookbook information.
        """
        cb_info = Storage.read_object(EnvVariables.COOKBOOKS.name, cb_id, "json")
        return cls(CookbookArguments(**cb_info))

    @staticmethod
    def create(cb_args: CookbookArguments) -> str:
        """
        This method is responsible for creating a new cookbook and storing its details in a JSON file.

        The function accepts `cb_args` parameter which contains the necessary details for creating a new cookbook.
        It generates a unique ID for the cookbook by slugifying the cookbook name. After that, it constructs a
        dictionary with the cookbook's details and writes this information to a JSON file. The JSON file is named after
        the cookbook ID and is stored in the directory specified by `EnvironmentVars.COOKBOOKS`.

        If the operation encounters any error, an exception is raised and the error message is printed.

        Args:
            cb_args (CookbookArguments): An object that holds the necessary details for creating a new cookbook.

        Returns:
            str: The unique ID of the newly created cookbook.

        Raises:
            RuntimeError: If any of the recipes specified in the cookbook does not exist.
            Exception: If there is an error during the file writing process or any other operation within the method.
        """
        try:
            cb_id = slugify(cb_args.name, lowercase=True)

            # check if the cookbook exists
            if Storage.is_object_exists(EnvVariables.COOKBOOKS.name, cb_id, "json"):
                raise RuntimeError(f"Cookbook with ID '{cb_id}' already exists.")

            # check if recipes in list exist before creating cookbook
            for recipe in cb_args.recipes:
                if not Storage.is_object_exists(
                    EnvVariables.RECIPES.name, recipe, "json"
                ):
                    raise RuntimeError(f"{recipe} recipe does not exist.")

            cb_info = {
                "id": cb_id,
                "name": cb_args.name,
                "description": cb_args.description,
                "recipes": cb_args.recipes,
            }

            # Write as json output
            Storage.create_object(EnvVariables.COOKBOOKS.name, cb_id, cb_info, "json")
            return cb_id

        except Exception as e:
            print(f"Failed to create cookbook: {str(e)}")
            raise e

    @staticmethod
    @validate_call
    def read(cb_id: str) -> CookbookArguments:
        """
        Retrieves the details of a specified cookbook.

        This method accepts a cookbook ID as an argument, locates the corresponding JSON file in the directory
        defined by `EnvironmentVars.COOKBOOKS`, and returns a CookbookArguments object that encapsulates the cookbook's
        details. If any error occurs during the process, an exception is raised and the error message is logged.

        Args:
            cb_id (str): The unique identifier of the cookbook to be retrieved.

        Returns:
            CookbookArguments: An object encapsulating the details of the retrieved cookbook.

        Raises:
            Exception: If there's an error during the file reading process or any other operation within the method.
        """
        try:
            if not cb_id:
                raise RuntimeError("Cookbook ID is empty")

            obj_results = Storage.read_object(
                EnvVariables.COOKBOOKS.name, cb_id, "json"
            )
            if obj_results:
                return CookbookArguments(**obj_results)
            else:
                raise RuntimeError(f"Unable to get results for {cb_id}.")

        except Exception as e:
            print(f"Failed to read cookbook: {str(e)}")
            raise e

    @staticmethod
    def update(cb_args: CookbookArguments) -> bool:
        """
        Updates the details of an existing cookbook.

        This method accepts a CookbookArguments object, converts it to a dictionary, and writes the updated
        information to the corresponding JSON file in the directory defined by `EnvVariables.COOKBOOKS`.

        Args:
            cb_args (CookbookArguments): An object containing the updated details of the cookbook.

        Returns:
            bool: True if the update was successful.

        Raises:
            Exception: If there's an error during the update process.
        """
        try:
            # check if recipes in list exist before creating cookbook
            for recipe in cb_args.recipes:
                if not Storage.is_object_exists(
                    EnvVariables.RECIPES.name, recipe, "json"
                ):
                    raise RuntimeError(f"{recipe} recipe does not exist.")

            # Convert the cookbook arguments to a dictionary
            cb_info = cb_args.to_dict()

            # Write the updated cookbook information to the file
            Storage.create_object(
                EnvVariables.COOKBOOKS.name, cb_args.id, cb_info, "json"
            )
            return True

        except Exception as e:
            print(f"Failed to update cookbook: {str(e)}")
            raise e

    @staticmethod
    @validate_call
    def delete(cb_id: str) -> bool:
        """
        Deletes a cookbook identified by its ID.

        This method removes the cookbook's JSON file from the storage, using the `Storage.delete_object` method.
        The `EnvVariables.COOKBOOKS` environment variable specifies the directory where the cookbook files are stored.

        Args:
            cb_id (str): The unique identifier of the cookbook to be deleted.

        Returns:
            bool: True if the deletion was successful.

        Raises:
            Exception: If there's an error during the deletion process.
        """
        try:
            Storage.delete_object(EnvVariables.COOKBOOKS.name, cb_id, "json")
            return True

        except Exception as e:
            print(f"Failed to delete cookbook: {str(e)}")
            raise e

    @staticmethod
    def get_available_items() -> tuple[list[str], list[CookbookArguments]]:
        """
        Retrieves and returns all available cookbooks.

        This method scans the directory specified by `EnvironmentVars.COOKBOOKS` and identifies all stored cookbook
        files. It excludes any files that contain "__" in their names. For each valid cookbook file, the method reads
        the file content and constructs a CookbookArguments object encapsulating the cookbook's details.
        Both the CookbookArguments object and the cookbook ID are then appended to their respective lists.

        Returns:
            tuple[list[str], list[CookbookArguments]]: A tuple where the first element is a list of cookbook IDs and
            the second element is a list of CookbookArguments objects representing the details of each cookbook.

        Raises:
            Exception: If an error occurs during the file reading process or any other operation within the method.
        """
        try:
            retn_cbs = []
            retn_cbs_ids = []

            cbs = Storage.get_objects(EnvVariables.COOKBOOKS.name, "json")
            for cb in cbs:
                if "__" in cb:
                    continue

                cb_info = CookbookArguments(
                    **Storage.read_object(
                        EnvVariables.COOKBOOKS.name, Path(cb).stem, "json"
                    )
                )
                retn_cbs.append(cb_info)
                retn_cbs_ids.append(cb_info.id)

            return retn_cbs_ids, retn_cbs

        except Exception as e:
            print(f"Failed to get available cookbooks: {str(e)}")
            raise e
```
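For orientation, here is a minimal usage sketch of the `Cookbook` helpers above. It is not part of the package: it assumes moonshot's storage directories (the `EnvVariables.COOKBOOKS` and `EnvVariables.RECIPES` locations) are already configured, and the recipe ID `example-recipe` is made up for illustration.

```python
# Hypothetical usage sketch of the Cookbook helpers (not shipped in the wheel).
# Assumes moonshot's storage directories are configured and that a recipe with the
# made-up ID "example-recipe" already exists.
from moonshot.src.cookbooks.cookbook import Cookbook
from moonshot.src.cookbooks.cookbook_arguments import CookbookArguments

# create() derives the ID by slugifying the name, so the id field can be left empty here.
cb_id = Cookbook.create(
    CookbookArguments(
        id="",
        name="My Safety Cookbook",
        description="Groups a couple of recipes for a quick safety sweep.",
        recipes=["example-recipe"],
    )
)

cb_args = Cookbook.read(cb_id)   # stored CookbookArguments (raises if missing)
cookbook = Cookbook.load(cb_id)  # Cookbook instance built from the same JSON file

ids, cookbooks = Cookbook.get_available_items()  # all stored cookbooks, "__" files skipped

cb_args.description = "Updated description."
Cookbook.update(cb_args)  # rewrites the JSON file after re-checking the recipes

Cookbook.delete(cb_id)    # removes the cookbook's JSON file again
```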
moonshot/src/cookbooks/cookbook_arguments.py (@@ -0,0 +1,34 @@):

```python
from pydantic import BaseModel, Field


class CookbookArguments(BaseModel):
    id: str  # id (str): The unique identifier for the Cookbook.

    name: str = Field(min_length=1)  # name (str): The name of the Cookbook.

    description: str  # description (str): A brief description of the Cookbook.

    recipes: list[str] = Field(
        min_length=1
    )  # recipes (list): A list of recipes included in the Cookbook.

    def to_dict(self) -> dict:
        """
        Converts the CookbookArguments instance into a dictionary.

        This method takes all the attributes of the CookbookArguments instance and constructs a dictionary
        with attribute names as keys and their corresponding values. This includes the id, name, description,
        and recipes.

        This dictionary can be used for serialization purposes, such as storing the cookbook information in a JSON file
        or sending it over a network.

        Returns:
            dict: A dictionary representation of the CookbookArguments instance.
        """
        return {
            "id": self.id,
            "name": self.name,
            "description": self.description,
            "recipes": self.recipes,
        }
```
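The `Field` constraints above are enforced by pydantic at construction time. A short sketch of that behaviour, using made-up values:

```python
# Illustrative only: invented values exercising the CookbookArguments constraints above.
from pydantic import ValidationError

from moonshot.src.cookbooks.cookbook_arguments import CookbookArguments

args = CookbookArguments(
    id="my-cookbook",
    name="My Cookbook",
    description="Demo cookbook.",
    recipes=["recipe-one"],
)
print(args.to_dict())  # plain dict, ready to be serialised to JSON by Storage

try:
    CookbookArguments(id="x", name="", description="", recipes=[])
except ValidationError as exc:
    # name must be at least one character long and recipes must contain at least one entry
    print(len(exc.errors()), "validation errors")
```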
File without changes
moonshot/src/datasets/dataset.py (@@ -0,0 +1,255 @@):

```python
from __future__ import annotations

from pathlib import Path

from pydantic import validate_call

from moonshot.src.configs.env_variables import EnvVariables
from moonshot.src.datasets.dataset_arguments import DatasetArguments
from moonshot.src.storage.storage import Storage


class Dataset:
    cache_name = "cache"
    cache_extension = "json"

    @staticmethod
    @validate_call
    def read(ds_id: str) -> DatasetArguments:
        """
        Fetches the details of a given dataset.

        This method takes a dataset ID as input, finds the corresponding JSON file in the directory
        specified by `EnvVariables.DATASETS`, and returns a DatasetArguments object
        that contains the dataset's details. If any error arises during the process, an exception is raised and the
        error message is logged.

        Args:
            ds_id (str): The unique ID of the dataset to be fetched.

        Returns:
            DatasetArguments: An object encapsulating the details of the fetched dataset.

        Raises:
            Exception: If there's an error during the file reading process or any other operation within the method.
        """
        try:
            if ds_id:
                return DatasetArguments(**Dataset._read_dataset(ds_id))
            else:
                raise RuntimeError("Dataset ID is empty")

        except Exception as e:
            print(f"Failed to read dataset: {str(e)}")
            raise e

    @staticmethod
    def _read_dataset(ds_id: str) -> dict:
        """
        Retrieves dataset information from storage and augments it with metadata.

        This method takes a dataset ID, locates the corresponding JSON file within the directory
        specified by `EnvVariables.DATASETS`, and constructs a dictionary that includes the dataset's
        core details, as well as metadata such as the creation datetime and the count of dataset prompts.

        Args:
            ds_id (str): The unique identifier of the dataset to be retrieved.

        Returns:
            dict: A dictionary with the dataset's core information, enriched with metadata like the creation datetime
            and the total number of prompts contained within the dataset.
        """
        # Read the basic dataset information
        dataset_info = Storage.read_object_with_iterator(
            obj_type=EnvVariables.DATASETS.name,
            obj_id=ds_id,
            obj_extension="json",
            json_keys=["name", "description", "license", "reference"],
            iterator_keys=["examples.item"],
        )

        # Add additional parameters - [id, num_of_dataset_prompts, creation_date]
        # Append the dataset ID to the dataset_info
        dataset_info["id"] = ds_id

        # Use Storage.count_objects to get the number of examples in a memory-efficient way
        dataset_info["num_of_dataset_prompts"] = Storage.count_objects(
            EnvVariables.DATASETS.name, ds_id, "json", "examples.item"
        )

        # Assign the creation date to the dataset_info
        creation_datetime = Storage.get_creation_datetime(
            EnvVariables.DATASETS.name, ds_id, "json"
        )
        dataset_info["created_date"] = creation_datetime.replace(
            microsecond=0
        ).isoformat(" ")

        return dataset_info

    @staticmethod
    @validate_call
    def delete(ds_id: str) -> bool:
        """
        Deletes a dataset from storage.

        This method attempts to delete the dataset with the given ID from the storage. If the deletion is successful,
        it returns True. If an exception occurs during the deletion process, it prints an error message and re-raises
        the exception.

        Args:
            ds_id (str): The unique identifier of the dataset to be deleted.

        Returns:
            bool: True if the dataset was successfully deleted.

        Raises:
            Exception: If an error occurs during the deletion process.
        """
        try:
            Storage.delete_object(EnvVariables.DATASETS.name, ds_id, "json")
            return True

        except Exception as e:
            print(f"Failed to delete dataset: {str(e)}")
            raise e

    @staticmethod
    def get_cache_information() -> dict:
        """
        Retrieves cache information from the storage.

        This method attempts to read the cache information from the storage and return it as a dictionary.
        If the cache information does not exist or an error occurs, it returns an empty dictionary.

        Returns:
            dict: A dictionary containing the cache information or an empty dictionary if an error occurs
            or if the cache information does not exist.

        Raises:
            Exception: If there's an error during the retrieval process, it is logged and an
            empty dictionary is returned.
        """
        try:
            # Retrieve cache information from the storage and return it as a dictionary
            cache_info = Storage.read_object(
                EnvVariables.DATASETS.name, Dataset.cache_name, Dataset.cache_extension
            )
            return cache_info if cache_info else {}
        except Exception as e:
            print(f"Failed to retrieve cache information: {str(e)}")
            return {}

    @staticmethod
    def write_cache_information(cache_info: dict) -> None:
        """
        Writes the updated cache information to the storage.

        Args:
            cache_info (dict): The cache information to be written.
        """
        try:
            Storage.create_object(
                obj_type=EnvVariables.DATASETS.name,
                obj_id=Dataset.cache_name,
                obj_info=cache_info,
                obj_extension=Dataset.cache_extension,
            )
        except Exception as e:
            print(f"Failed to write cache information: {str(e)}")
            raise e

    @staticmethod
    def get_available_items(
        datasets: list[str] = [],
    ) -> tuple[list[str], list[DatasetArguments]]:
        """
        Retrieves a list of available dataset IDs and their corresponding DatasetArguments objects.

        This method filters out any non-dataset files and the cache file from the list of datasets. It then
        retrieves or updates the dataset information from the cache for each dataset. If the cache is updated
        during this process, it writes the updated cache information back to the storage.

        Args:
            datasets (list[str], optional): A list of dataset file names. If not provided, it will retrieve
            the list of all dataset files from the storage. Defaults to an empty list.

        Returns:
            tuple[list[str], list[DatasetArguments]]: A tuple containing two lists:
                - The first list contains the IDs of the available datasets.
                - The second list contains the corresponding DatasetArguments objects for those IDs.
        """
        try:
            retn_datasets = []
            retn_datasets_ids = []
            ds_cache_info = Dataset.get_cache_information()
            cache_needs_update = False  # Initialize a flag to track cache updates

            if datasets:
                datasets_objects = datasets
            else:
                datasets_objects = Storage.get_objects(
                    EnvVariables.DATASETS.name, "json"
                )

            for ds in datasets_objects:
                if (
                    "__" in ds
                    or f"{Dataset.cache_name}.{Dataset.cache_extension}" in ds
                ):
                    continue

                ds_name = Path(ds).stem
                ds_info, cache_updated = Dataset._get_or_update_dataset_info(
                    ds_name, ds_cache_info
                )
                if cache_updated:
                    cache_needs_update = True  # Set the flag if any cache was updated

                retn_datasets.append(ds_info)
                retn_datasets_ids.append(ds_info.id)

            if cache_needs_update:  # Check the flag after the loop
                Dataset.write_cache_information(ds_cache_info)

            return retn_datasets_ids, retn_datasets

        except Exception as e:
            print(f"Failed to get available datasets: {str(e)}")
            raise e

    @staticmethod
    def _get_or_update_dataset_info(
        ds_name: str, ds_cache_info: dict
    ) -> tuple[DatasetArguments, bool]:
        """
        Retrieves or updates the dataset information from the cache.

        This method checks if the dataset information is already available in the cache and if the file hash matches
        the one stored in the cache. If it does, the information is retrieved from the cache. If not, the dataset
        information is read from the storage, the cache is updated with the new information and the new file hash,
        and a flag is set to indicate that the cache has been updated.

        Args:
            ds_name (str): The name of the dataset.
            ds_cache_info (dict): A dictionary containing the cached dataset information.

        Returns:
            tuple[DatasetArguments, bool]: A tuple containing the DatasetArguments object with the dataset information
            and a boolean indicating whether the cache was updated or not.
        """
        file_hash = Storage.get_file_hash(EnvVariables.DATASETS.name, ds_name, "json")
        cache_updated = False

        if ds_name in ds_cache_info and file_hash == ds_cache_info[ds_name]["hash"]:
            ds_metadata = ds_cache_info[ds_name].copy()
            ds_metadata.pop("hash", None)
            ds_info = DatasetArguments(**ds_metadata)
        else:
            ds_info = DatasetArguments(**Dataset._read_dataset(ds_name))
            ds_info.examples = None
            ds_cache_info[ds_name] = ds_info.copy().to_dict()
            ds_cache_info[ds_name]["hash"] = file_hash
            cache_updated = True

        return ds_info, cache_updated
```
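A small sketch of how the `Dataset` helpers above fit together. It is not part of the package and assumes the `EnvVariables.DATASETS` directory contains a dataset file with the made-up ID `example-dataset`.

```python
# Hypothetical sketch (not shipped in the wheel). Assumes a dataset JSON file with the
# made-up ID "example-dataset" exists under the configured DATASETS directory.
from moonshot.src.datasets.dataset import Dataset

# Listing datasets also builds/refreshes the metadata cache ("cache.json"), keyed by file hash.
ds_ids, ds_args_list = Dataset.get_available_items()

# read() returns DatasetArguments; examples is yielded through an iterator rather than
# loaded into memory in one go.
ds = Dataset.read("example-dataset")
print(ds.name, ds.num_of_dataset_prompts, ds.created_date)
for example in ds.examples or []:
    ...  # each item under "examples" in the JSON file, one at a time

# delete() removes only the dataset's JSON file from storage.
# Dataset.delete("example-dataset")
```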
moonshot/src/datasets/dataset_arguments.py (@@ -0,0 +1,50 @@):

```python
from pydantic import BaseModel
from pyparsing import Iterator


class DatasetArguments(BaseModel):
    class Config:
        arbitrary_types_allowed = True

    # id (str): Unique identifier for the dataset
    id: str

    # name (str): Name of the dataset
    name: str

    # description (str): Description of the dataset's contents and purpose
    description: str

    # examples (Iterator[dict] | None): Generator of examples from the dataset, where each example is a dictionary.
    examples: Iterator[dict] | None

    # num_of_dataset_prompts (int): The number of dataset prompts, automatically calculated
    num_of_dataset_prompts: int = 0

    # created_date (str): The creation date and time of the dataset in ISO format without 'T'. Automatically generated.
    created_date: str = ""

    # reference (str): An optional string to store a reference link or identifier for the dataset
    reference: str = ""

    # license (str): License information for the dataset. Defaults to an empty string if not provided.
    license: str = ""

    def to_dict(self) -> dict:
        """
        Converts the DatasetArguments object to a dictionary.

        Returns:
            dict: A dictionary representation of the DatasetArguments object, including the id, name, description,
            examples, number of dataset prompts, created date, reference, and license.
        """
        return {
            "id": self.id,
            "name": self.name,
            "description": self.description,
            "examples": self.examples,
            "num_of_dataset_prompts": self.num_of_dataset_prompts,
            "created_date": self.created_date,
            "reference": self.reference,
            "license": self.license,
        }
```
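As a rough sketch with invented field values, this is how the model above can carry a lazy `examples` iterator and then be flattened with `to_dict()`:

```python
# Illustrative only: invented values showing DatasetArguments with a lazy examples iterator.
from moonshot.src.datasets.dataset_arguments import DatasetArguments

examples = iter([{"input": "2 + 2", "target": "4"}])  # made-up example record

ds = DatasetArguments(
    id="tiny-math",
    name="Tiny Math",
    description="A made-up arithmetic prompt.",
    examples=examples,  # Config.arbitrary_types_allowed lets the field hold an iterator
    num_of_dataset_prompts=1,
)

info = ds.to_dict()
info["examples"] = None  # drop the non-serialisable iterator before writing JSON
```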
File without changes