aiverify-moonshot 0.4.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (163)
  1. aiverify_moonshot-0.4.0.dist-info/METADATA +249 -0
  2. aiverify_moonshot-0.4.0.dist-info/RECORD +163 -0
  3. aiverify_moonshot-0.4.0.dist-info/WHEEL +4 -0
  4. aiverify_moonshot-0.4.0.dist-info/licenses/AUTHORS.md +5 -0
  5. aiverify_moonshot-0.4.0.dist-info/licenses/LICENSE.md +201 -0
  6. aiverify_moonshot-0.4.0.dist-info/licenses/NOTICES.md +3340 -0
  7. moonshot/__init__.py +0 -0
  8. moonshot/__main__.py +198 -0
  9. moonshot/api.py +155 -0
  10. moonshot/integrations/__init__.py +0 -0
  11. moonshot/integrations/cli/__init__.py +0 -0
  12. moonshot/integrations/cli/__main__.py +25 -0
  13. moonshot/integrations/cli/active_session_cfg.py +1 -0
  14. moonshot/integrations/cli/benchmark/__init__.py +0 -0
  15. moonshot/integrations/cli/benchmark/benchmark.py +186 -0
  16. moonshot/integrations/cli/benchmark/cookbook.py +545 -0
  17. moonshot/integrations/cli/benchmark/datasets.py +164 -0
  18. moonshot/integrations/cli/benchmark/metrics.py +141 -0
  19. moonshot/integrations/cli/benchmark/recipe.py +598 -0
  20. moonshot/integrations/cli/benchmark/result.py +216 -0
  21. moonshot/integrations/cli/benchmark/run.py +140 -0
  22. moonshot/integrations/cli/benchmark/runner.py +174 -0
  23. moonshot/integrations/cli/cli.py +64 -0
  24. moonshot/integrations/cli/common/__init__.py +0 -0
  25. moonshot/integrations/cli/common/common.py +72 -0
  26. moonshot/integrations/cli/common/connectors.py +325 -0
  27. moonshot/integrations/cli/common/display_helper.py +42 -0
  28. moonshot/integrations/cli/common/prompt_template.py +94 -0
  29. moonshot/integrations/cli/initialisation/__init__.py +0 -0
  30. moonshot/integrations/cli/initialisation/initialisation.py +14 -0
  31. moonshot/integrations/cli/redteam/__init__.py +0 -0
  32. moonshot/integrations/cli/redteam/attack_module.py +70 -0
  33. moonshot/integrations/cli/redteam/context_strategy.py +147 -0
  34. moonshot/integrations/cli/redteam/prompt_template.py +67 -0
  35. moonshot/integrations/cli/redteam/redteam.py +90 -0
  36. moonshot/integrations/cli/redteam/session.py +467 -0
  37. moonshot/integrations/web_api/.env.dev +7 -0
  38. moonshot/integrations/web_api/__init__.py +0 -0
  39. moonshot/integrations/web_api/__main__.py +56 -0
  40. moonshot/integrations/web_api/app.py +125 -0
  41. moonshot/integrations/web_api/container.py +146 -0
  42. moonshot/integrations/web_api/log/.gitkeep +0 -0
  43. moonshot/integrations/web_api/logging_conf.py +114 -0
  44. moonshot/integrations/web_api/routes/__init__.py +0 -0
  45. moonshot/integrations/web_api/routes/attack_modules.py +66 -0
  46. moonshot/integrations/web_api/routes/benchmark.py +116 -0
  47. moonshot/integrations/web_api/routes/benchmark_result.py +175 -0
  48. moonshot/integrations/web_api/routes/context_strategy.py +129 -0
  49. moonshot/integrations/web_api/routes/cookbook.py +225 -0
  50. moonshot/integrations/web_api/routes/dataset.py +120 -0
  51. moonshot/integrations/web_api/routes/endpoint.py +282 -0
  52. moonshot/integrations/web_api/routes/metric.py +78 -0
  53. moonshot/integrations/web_api/routes/prompt_template.py +128 -0
  54. moonshot/integrations/web_api/routes/recipe.py +219 -0
  55. moonshot/integrations/web_api/routes/redteam.py +609 -0
  56. moonshot/integrations/web_api/routes/runner.py +239 -0
  57. moonshot/integrations/web_api/schemas/__init__.py +0 -0
  58. moonshot/integrations/web_api/schemas/benchmark_runner_dto.py +13 -0
  59. moonshot/integrations/web_api/schemas/cookbook_create_dto.py +19 -0
  60. moonshot/integrations/web_api/schemas/cookbook_response_model.py +9 -0
  61. moonshot/integrations/web_api/schemas/dataset_response_dto.py +9 -0
  62. moonshot/integrations/web_api/schemas/endpoint_create_dto.py +21 -0
  63. moonshot/integrations/web_api/schemas/endpoint_response_model.py +11 -0
  64. moonshot/integrations/web_api/schemas/prompt_response_model.py +14 -0
  65. moonshot/integrations/web_api/schemas/prompt_template_response_model.py +10 -0
  66. moonshot/integrations/web_api/schemas/recipe_create_dto.py +32 -0
  67. moonshot/integrations/web_api/schemas/recipe_response_model.py +7 -0
  68. moonshot/integrations/web_api/schemas/session_create_dto.py +16 -0
  69. moonshot/integrations/web_api/schemas/session_prompt_dto.py +7 -0
  70. moonshot/integrations/web_api/schemas/session_response_model.py +38 -0
  71. moonshot/integrations/web_api/services/__init__.py +0 -0
  72. moonshot/integrations/web_api/services/attack_module_service.py +34 -0
  73. moonshot/integrations/web_api/services/auto_red_team_test_manager.py +86 -0
  74. moonshot/integrations/web_api/services/auto_red_team_test_state.py +57 -0
  75. moonshot/integrations/web_api/services/base_service.py +8 -0
  76. moonshot/integrations/web_api/services/benchmark_result_service.py +25 -0
  77. moonshot/integrations/web_api/services/benchmark_test_manager.py +106 -0
  78. moonshot/integrations/web_api/services/benchmark_test_state.py +56 -0
  79. moonshot/integrations/web_api/services/benchmarking_service.py +31 -0
  80. moonshot/integrations/web_api/services/context_strategy_service.py +22 -0
  81. moonshot/integrations/web_api/services/cookbook_service.py +194 -0
  82. moonshot/integrations/web_api/services/dataset_service.py +20 -0
  83. moonshot/integrations/web_api/services/endpoint_service.py +65 -0
  84. moonshot/integrations/web_api/services/metric_service.py +14 -0
  85. moonshot/integrations/web_api/services/prompt_template_service.py +39 -0
  86. moonshot/integrations/web_api/services/recipe_service.py +155 -0
  87. moonshot/integrations/web_api/services/runner_service.py +147 -0
  88. moonshot/integrations/web_api/services/session_service.py +350 -0
  89. moonshot/integrations/web_api/services/utils/exceptions_handler.py +41 -0
  90. moonshot/integrations/web_api/services/utils/results_formatter.py +47 -0
  91. moonshot/integrations/web_api/status_updater/interface/benchmark_progress_callback.py +14 -0
  92. moonshot/integrations/web_api/status_updater/interface/redteam_progress_callback.py +14 -0
  93. moonshot/integrations/web_api/status_updater/moonshot_ui_webhook.py +72 -0
  94. moonshot/integrations/web_api/types/types.py +99 -0
  95. moonshot/src/__init__.py +0 -0
  96. moonshot/src/api/__init__.py +0 -0
  97. moonshot/src/api/api_connector.py +58 -0
  98. moonshot/src/api/api_connector_endpoint.py +162 -0
  99. moonshot/src/api/api_context_strategy.py +57 -0
  100. moonshot/src/api/api_cookbook.py +160 -0
  101. moonshot/src/api/api_dataset.py +46 -0
  102. moonshot/src/api/api_environment_variables.py +17 -0
  103. moonshot/src/api/api_metrics.py +51 -0
  104. moonshot/src/api/api_prompt_template.py +43 -0
  105. moonshot/src/api/api_recipe.py +182 -0
  106. moonshot/src/api/api_red_teaming.py +59 -0
  107. moonshot/src/api/api_result.py +84 -0
  108. moonshot/src/api/api_run.py +74 -0
  109. moonshot/src/api/api_runner.py +132 -0
  110. moonshot/src/api/api_session.py +290 -0
  111. moonshot/src/configs/__init__.py +0 -0
  112. moonshot/src/configs/env_variables.py +187 -0
  113. moonshot/src/connectors/__init__.py +0 -0
  114. moonshot/src/connectors/connector.py +327 -0
  115. moonshot/src/connectors/connector_prompt_arguments.py +17 -0
  116. moonshot/src/connectors_endpoints/__init__.py +0 -0
  117. moonshot/src/connectors_endpoints/connector_endpoint.py +211 -0
  118. moonshot/src/connectors_endpoints/connector_endpoint_arguments.py +54 -0
  119. moonshot/src/cookbooks/__init__.py +0 -0
  120. moonshot/src/cookbooks/cookbook.py +225 -0
  121. moonshot/src/cookbooks/cookbook_arguments.py +34 -0
  122. moonshot/src/datasets/__init__.py +0 -0
  123. moonshot/src/datasets/dataset.py +255 -0
  124. moonshot/src/datasets/dataset_arguments.py +50 -0
  125. moonshot/src/metrics/__init__.py +0 -0
  126. moonshot/src/metrics/metric.py +192 -0
  127. moonshot/src/metrics/metric_interface.py +95 -0
  128. moonshot/src/prompt_templates/__init__.py +0 -0
  129. moonshot/src/prompt_templates/prompt_template.py +103 -0
  130. moonshot/src/recipes/__init__.py +0 -0
  131. moonshot/src/recipes/recipe.py +340 -0
  132. moonshot/src/recipes/recipe_arguments.py +111 -0
  133. moonshot/src/redteaming/__init__.py +0 -0
  134. moonshot/src/redteaming/attack/__init__.py +0 -0
  135. moonshot/src/redteaming/attack/attack_module.py +618 -0
  136. moonshot/src/redteaming/attack/attack_module_arguments.py +44 -0
  137. moonshot/src/redteaming/attack/context_strategy.py +131 -0
  138. moonshot/src/redteaming/context_strategy/__init__.py +0 -0
  139. moonshot/src/redteaming/context_strategy/context_strategy_interface.py +46 -0
  140. moonshot/src/redteaming/session/__init__.py +0 -0
  141. moonshot/src/redteaming/session/chat.py +209 -0
  142. moonshot/src/redteaming/session/red_teaming_progress.py +128 -0
  143. moonshot/src/redteaming/session/red_teaming_type.py +6 -0
  144. moonshot/src/redteaming/session/session.py +775 -0
  145. moonshot/src/results/__init__.py +0 -0
  146. moonshot/src/results/result.py +119 -0
  147. moonshot/src/results/result_arguments.py +44 -0
  148. moonshot/src/runners/__init__.py +0 -0
  149. moonshot/src/runners/runner.py +476 -0
  150. moonshot/src/runners/runner_arguments.py +46 -0
  151. moonshot/src/runners/runner_type.py +6 -0
  152. moonshot/src/runs/__init__.py +0 -0
  153. moonshot/src/runs/run.py +344 -0
  154. moonshot/src/runs/run_arguments.py +162 -0
  155. moonshot/src/runs/run_progress.py +145 -0
  156. moonshot/src/runs/run_status.py +10 -0
  157. moonshot/src/storage/__init__.py +0 -0
  158. moonshot/src/storage/db_interface.py +128 -0
  159. moonshot/src/storage/io_interface.py +31 -0
  160. moonshot/src/storage/storage.py +525 -0
  161. moonshot/src/utils/__init__.py +0 -0
  162. moonshot/src/utils/import_modules.py +96 -0
  163. moonshot/src/utils/timeit.py +25 -0
moonshot/src/cookbooks/cookbook.py
@@ -0,0 +1,225 @@
+ from __future__ import annotations
+
+ from pathlib import Path
+
+ from pydantic import validate_call
+ from slugify import slugify
+
+ from moonshot.src.configs.env_variables import EnvVariables
+ from moonshot.src.cookbooks.cookbook_arguments import CookbookArguments
+ from moonshot.src.storage.storage import Storage
+
+
+ class Cookbook:
+     def __init__(self, cb_args: CookbookArguments) -> None:
+         self.id = cb_args.id
+         self.name = cb_args.name
+         self.description = cb_args.description
+         self.recipes = cb_args.recipes
+
+     @classmethod
+     def load(cls, cb_id: str) -> Cookbook:
+         """
+         This method loads a cookbook from a JSON file.
+
+         It uses the cookbook ID to construct the file path for the JSON file in the designated cookbook directory.
+         The method then reads the JSON file and returns the cookbook information as a Cookbook instance.
+
+         Args:
+             cb_id (str): The unique identifier of the cookbook.
+
+         Returns:
+             Cookbook: An instance of the Cookbook class populated with the loaded cookbook information.
+         """
+         cb_info = Storage.read_object(EnvVariables.COOKBOOKS.name, cb_id, "json")
+         return cls(CookbookArguments(**cb_info))
+
+     @staticmethod
+     def create(cb_args: CookbookArguments) -> str:
+         """
+         This method is responsible for creating a new cookbook and storing its details in a JSON file.
+
+         The function accepts `cb_args` parameter which contains the necessary details for creating a new cookbook.
+         It generates a unique ID for the cookbook by slugifying the cookbook name. After that, it constructs a
+         dictionary with the cookbook's details and writes this information to a JSON file. The JSON file is named after
+         the cookbook ID and is stored in the directory specified by `EnvironmentVars.COOKBOOKS`.
+
+         If the operation encounters any error, an exception is raised and the error message is printed.
+
+         Args:
+             cb_args (CookbookArguments): An object that holds the necessary details for creating a new cookbook.
+
+         Returns:
+             str: The unique ID of the newly created cookbook.
+
+         Raises:
+             RuntimeError: If any of the recipes specified in the cookbook does not exist.
+             Exception: If there is an error during the file writing process or any other operation within the method.
+         """
+         try:
+             cb_id = slugify(cb_args.name, lowercase=True)
+
+             # check if the cookbook exists
+             if Storage.is_object_exists(EnvVariables.COOKBOOKS.name, cb_id, "json"):
+                 raise RuntimeError(f"Cookbook with ID '{cb_id}' already exists.")
+
+             # check if recipes in list exist before creating cookbook
+             for recipe in cb_args.recipes:
+                 if not Storage.is_object_exists(
+                     EnvVariables.RECIPES.name, recipe, "json"
+                 ):
+                     raise RuntimeError(f"{recipe} recipe does not exist.")
+
+             cb_info = {
+                 "id": cb_id,
+                 "name": cb_args.name,
+                 "description": cb_args.description,
+                 "recipes": cb_args.recipes,
+             }
+
+             # Write as json output
+             Storage.create_object(EnvVariables.COOKBOOKS.name, cb_id, cb_info, "json")
+             return cb_id
+
+         except Exception as e:
+             print(f"Failed to create cookbook: {str(e)}")
+             raise e
+
+     @staticmethod
+     @validate_call
+     def read(cb_id: str) -> CookbookArguments:
+         """
+         Retrieves the details of a specified cookbook.
+
+         This method accepts a cookbook ID as an argument, locates the corresponding JSON file in the directory
+         defined by `EnvironmentVars.COOKBOOKS`, and returns a CookbookArguments object that encapsulates the cookbook's
+         details. If any error occurs during the process, an exception is raised and the error message is logged.
+
+         Args:
+             cb_id (str): The unique identifier of the cookbook to be retrieved.
+
+         Returns:
+             CookbookArguments: An object encapsulating the details of the retrieved cookbook.
+
+         Raises:
+             Exception: If there's an error during the file reading process or any other operation within the method.
+         """
+         try:
+             if not cb_id:
+                 raise RuntimeError("Cookbook ID is empty")
+
+             obj_results = Storage.read_object(
+                 EnvVariables.COOKBOOKS.name, cb_id, "json"
+             )
+             if obj_results:
+                 return CookbookArguments(**obj_results)
+             else:
+                 raise RuntimeError(f"Unable to get results for {cb_id}.")
+
+         except Exception as e:
+             print(f"Failed to read cookbook: {str(e)}")
+             raise e
+
+     @staticmethod
+     def update(cb_args: CookbookArguments) -> bool:
+         """
+         Updates the details of an existing cookbook.
+
+         This method accepts a CookbookArguments object, converts it to a dictionary, and writes the updated
+         information to the corresponding JSON file in the directory defined by `EnvVariables.COOKBOOKS`.
+
+         Args:
+             cb_args (CookbookArguments): An object containing the updated details of the cookbook.
+
+         Returns:
+             bool: True if the update was successful.
+
+         Raises:
+             Exception: If there's an error during the update process.
+         """
+         try:
+             # check if recipes in list exist before creating cookbook
+             for recipe in cb_args.recipes:
+                 if not Storage.is_object_exists(
+                     EnvVariables.RECIPES.name, recipe, "json"
+                 ):
+                     raise RuntimeError(f"{recipe} recipe does not exist.")
+
+             # Convert the cookbook arguments to a dictionary
+             cb_info = cb_args.to_dict()
+
+             # Write the updated cookbook information to the file
+             Storage.create_object(
+                 EnvVariables.COOKBOOKS.name, cb_args.id, cb_info, "json"
+             )
+             return True
+
+         except Exception as e:
+             print(f"Failed to update cookbook: {str(e)}")
+             raise e
+
+     @staticmethod
+     @validate_call
+     def delete(cb_id: str) -> bool:
+         """
+         Deletes a cookbook identified by its ID.
+
+         This method removes the cookbook's JSON file from the storage, using the `Storage.delete_object` method.
+         The `EnvVariables.COOKBOOKS` environment variable specifies the directory where the cookbook files are stored.
+
+         Args:
+             cb_id (str): The unique identifier of the cookbook to be deleted.
+
+         Returns:
+             bool: True if the deletion was successful.
+
+         Raises:
+             Exception: If there's an error during the deletion process.
+         """
+         try:
+             Storage.delete_object(EnvVariables.COOKBOOKS.name, cb_id, "json")
+             return True
+
+         except Exception as e:
+             print(f"Failed to delete cookbook: {str(e)}")
+             raise e
+
+     @staticmethod
+     def get_available_items() -> tuple[list[str], list[CookbookArguments]]:
+         """
+         Retrieves and returns all available cookbooks.
+
+         This method scans the directory specified by `EnvironmentVars.COOKBOOKS` and identifies all stored cookbook
+         files. It excludes any files that contain "__" in their names. For each valid cookbook file, the method reads
+         the file content and constructs a CookbookArguments object encapsulating the cookbook's details.
+         Both the CookbookArguments object and the cookbook ID are then appended to their respective lists.
+
+         Returns:
+             tuple[list[str], list[CookbookArguments]]: A tuple where the first element is a list of cookbook IDs and
+             the second element is a list of CookbookArguments objects representing the details of each cookbook.
+
+         Raises:
+             Exception: If an error occurs during the file reading process or any other operation within the method.
+         """
+         try:
+             retn_cbs = []
+             retn_cbs_ids = []
+
+             cbs = Storage.get_objects(EnvVariables.COOKBOOKS.name, "json")
+             for cb in cbs:
+                 if "__" in cb:
+                     continue
+
+                 cb_info = CookbookArguments(
+                     **Storage.read_object(
+                         EnvVariables.COOKBOOKS.name, Path(cb).stem, "json"
+                     )
+                 )
+                 retn_cbs.append(cb_info)
+                 retn_cbs_ids.append(cb_info.id)
+
+             return retn_cbs_ids, retn_cbs
+
+         except Exception as e:
+             print(f"Failed to get available cookbooks: {str(e)}")
+             raise e
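For orientation, a minimal usage sketch of the class above, assuming a Moonshot installation whose COOKBOOKS and RECIPES directories are already configured; the cookbook name and recipe IDs are placeholders, not items shipped with the package.

from moonshot.src.cookbooks.cookbook import Cookbook
from moonshot.src.cookbooks.cookbook_arguments import CookbookArguments

# Placeholder recipe IDs: each must already exist as <RECIPES>/<id>.json,
# otherwise create() raises RuntimeError.
args = CookbookArguments(
    id="",  # ignored by create(), which slugifies the name into the ID
    name="My Example Cookbook",
    description="Groups a couple of recipes for a quick benchmark run.",
    recipes=["example-recipe-1", "example-recipe-2"],
)

cb_id = Cookbook.create(args)            # writes <cb_id>.json under COOKBOOKS
loaded = Cookbook.read(cb_id)            # CookbookArguments for that ID
ids, cookbooks = Cookbook.get_available_items()
Cookbook.delete(cb_id)                   # removes <cb_id>.json

Cookbook.update() follows the same pattern: pass a CookbookArguments whose id matches an existing cookbook file and it is rewritten from to_dict().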
moonshot/src/cookbooks/cookbook_arguments.py
@@ -0,0 +1,34 @@
+ from pydantic import BaseModel, Field
+
+
+ class CookbookArguments(BaseModel):
+     id: str  # id (str): The unique identifier for the Cookbook.
+
+     name: str = Field(min_length=1)  # name (str): The name of the Cookbook.
+
+     description: str  # description (str): A brief description of the Cookbook.
+
+     recipes: list[str] = Field(
+         min_length=1
+     )  # recipes (list): A list of recipes included in the Cookbook.
+
+     def to_dict(self) -> dict:
+         """
+         Converts the CookbookArguments instance into a dictionary.
+
+         This method takes all the attributes of the CookbookArguments instance and constructs a dictionary
+         with attribute names as keys and their corresponding values. This includes the id, name, description,
+         and recipes.
+
+         This dictionary can be used for serialization purposes, such as storing the cookbook information in a JSON file
+         or sending it over a network.
+
+         Returns:
+             dict: A dictionary representation of the CookbookArguments instance.
+         """
+         return {
+             "id": self.id,
+             "name": self.name,
+             "description": self.description,
+             "recipes": self.recipes,
+         }
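A small sketch of the validation this model implies: with pydantic v2, the Field(min_length=1) constraints mean an empty name or an empty recipes list should be rejected at construction time (the field values below are placeholders).

from pydantic import ValidationError
from moonshot.src.cookbooks.cookbook_arguments import CookbookArguments

try:
    # Empty name and empty recipes list both violate min_length=1.
    CookbookArguments(id="demo", name="", description="demo", recipes=[])
except ValidationError as err:
    print(err)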
moonshot/src/datasets/__init__.py
File without changes
moonshot/src/datasets/dataset.py
@@ -0,0 +1,255 @@
+ from __future__ import annotations
+
+ from pathlib import Path
+
+ from pydantic import validate_call
+
+ from moonshot.src.configs.env_variables import EnvVariables
+ from moonshot.src.datasets.dataset_arguments import DatasetArguments
+ from moonshot.src.storage.storage import Storage
+
+
+ class Dataset:
+     cache_name = "cache"
+     cache_extension = "json"
+
+     @staticmethod
+     @validate_call
+     def read(ds_id: str) -> DatasetArguments:
+         """
+         Fetches the details of a given dataset.
+
+         This method takes a dataset ID as input, finds the corresponding JSON file in the directory
+         specified by `EnvVariables.DATASETS`, and returns a DatasetArguments object
+         that contains the dataset's details. If any error arises during the process, an exception is raised and the
+         error message is logged.
+
+         Args:
+             ds_id (str): The unique ID of the dataset to be fetched.
+
+         Returns:
+             DatasetArguments: An object encapsulating the details of the fetched dataset.
+
+         Raises:
+             Exception: If there's an error during the file reading process or any other operation within the method.
+         """
+         try:
+             if ds_id:
+                 return DatasetArguments(**Dataset._read_dataset(ds_id))
+             else:
+                 raise RuntimeError("Dataset ID is empty")
+
+         except Exception as e:
+             print(f"Failed to read dataset: {str(e)}")
+             raise e
+
+     @staticmethod
+     def _read_dataset(ds_id: str) -> dict:
+         """
+         Retrieves dataset information from storage and augments it with metadata.
+
+         This method takes a dataset ID, locates the corresponding JSON file within the directory
+         specified by `EnvVariables.DATASETS`, and constructs a dictionary that includes the dataset's
+         core details, as well as metadata such as the creation datetime and the count of dataset prompts.
+
+         Args:
+             ds_id (str): The unique identifier of the dataset to be retrieved.
+
+         Returns:
+             dict: A dictionary with the dataset's core information, enriched with metadata like the creation datetime
+             and the total number of prompts contained within the dataset.
+         """
+         # Read the basic dataset information
+         dataset_info = Storage.read_object_with_iterator(
+             obj_type=EnvVariables.DATASETS.name,
+             obj_id=ds_id,
+             obj_extension="json",
+             json_keys=["name", "description", "license", "reference"],
+             iterator_keys=["examples.item"],
+         )
+
+         # Add additional parameters - [id, num_of_dataset_prompts, creation_date]
+         # Append the dataset ID to the dataset_info
+         dataset_info["id"] = ds_id
+
+         # Use Storage.count_objects to get the number of examples in a memory-efficient way
+         dataset_info["num_of_dataset_prompts"] = Storage.count_objects(
+             EnvVariables.DATASETS.name, ds_id, "json", "examples.item"
+         )
+
+         # Assign the creation date to the dataset_info
+         creation_datetime = Storage.get_creation_datetime(
+             EnvVariables.DATASETS.name, ds_id, "json"
+         )
+         dataset_info["created_date"] = creation_datetime.replace(
+             microsecond=0
+         ).isoformat(" ")
+
+         return dataset_info
+
+     @staticmethod
+     @validate_call
+     def delete(ds_id: str) -> bool:
+         """
+         Deletes a dataset from storage.
+
+         This method attempts to delete the dataset with the given ID from the storage. If the deletion is successful,
+         it returns True. If an exception occurs during the deletion process, it prints an error message and re-raises
+         the exception.
+
+         Args:
+             ds_id (str): The unique identifier of the dataset to be deleted.
+
+         Returns:
+             bool: True if the dataset was successfully deleted.
+
+         Raises:
+             Exception: If an error occurs during the deletion process.
+         """
+         try:
+             Storage.delete_object(EnvVariables.DATASETS.name, ds_id, "json")
+             return True
+
+         except Exception as e:
+             print(f"Failed to delete dataset: {str(e)}")
+             raise e
+
+     @staticmethod
+     def get_cache_information() -> dict:
+         """
+         Retrieves cache information from the storage.
+
+         This method attempts to read the cache information from the storage and return it as a dictionary.
+         If the cache information does not exist or an error occurs, it returns an empty dictionary.
+
+         Returns:
+             dict: A dictionary containing the cache information or an empty dictionary if an error occurs
+             or if the cache information does not exist.
+
+         Raises:
+             Exception: If there's an error during the retrieval process, it is logged and an
+             empty dictionary is returned.
+         """
+         try:
+             # Retrieve cache information from the storage and return it as a dictionary
+             cache_info = Storage.read_object(
+                 EnvVariables.DATASETS.name, Dataset.cache_name, Dataset.cache_extension
+             )
+             return cache_info if cache_info else {}
+         except Exception as e:
+             print(f"Failed to retrieve cache information: {str(e)}")
+             return {}
+
+     @staticmethod
+     def write_cache_information(cache_info: dict) -> None:
+         """
+         Writes the updated cache information to the storage.
+
+         Args:
+             cache_info (dict): The cache information to be written.
+         """
+         try:
+             Storage.create_object(
+                 obj_type=EnvVariables.DATASETS.name,
+                 obj_id=Dataset.cache_name,
+                 obj_info=cache_info,
+                 obj_extension=Dataset.cache_extension,
+             )
+         except Exception as e:
+             print(f"Failed to write cache information: {str(e)}")
+             raise e
+
+     @staticmethod
+     def get_available_items(
+         datasets: list[str] = [],
+     ) -> tuple[list[str], list[DatasetArguments]]:
+         """
+         Retrieves a list of available dataset IDs and their corresponding DatasetArguments objects.
+
+         This method filters out any non-dataset files and the cache file from the list of datasets. It then
+         retrieves or updates the dataset information from the cache for each dataset. If the cache is updated
+         during this process, it writes the updated cache information back to the storage.
+
+         Args:
+             datasets (list[str], optional): A list of dataset file names. If not provided, it will retrieve
+             the list of all dataset files from the storage. Defaults to an empty list.
+
+         Returns:
+             tuple[list[str], list[DatasetArguments]]: A tuple containing two lists:
+                 - The first list contains the IDs of the available datasets.
+                 - The second list contains the corresponding DatasetArguments objects for those IDs.
+         """
+         try:
+             retn_datasets = []
+             retn_datasets_ids = []
+             ds_cache_info = Dataset.get_cache_information()
+             cache_needs_update = False  # Initialize a flag to track cache updates
+
+             if datasets:
+                 datasets_objects = datasets
+             else:
+                 datasets_objects = Storage.get_objects(
+                     EnvVariables.DATASETS.name, "json"
+                 )
+
+             for ds in datasets_objects:
+                 if (
+                     "__" in ds
+                     or f"{Dataset.cache_name}.{Dataset.cache_extension}" in ds
+                 ):
+                     continue
+
+                 ds_name = Path(ds).stem
+                 ds_info, cache_updated = Dataset._get_or_update_dataset_info(
+                     ds_name, ds_cache_info
+                 )
+                 if cache_updated:
+                     cache_needs_update = True  # Set the flag if any cache was updated
+
+                 retn_datasets.append(ds_info)
+                 retn_datasets_ids.append(ds_info.id)
+
+             if cache_needs_update:  # Check the flag after the loop
+                 Dataset.write_cache_information(ds_cache_info)
+
+             return retn_datasets_ids, retn_datasets
+
+         except Exception as e:
+             print(f"Failed to get available datasets: {str(e)}")
+             raise e
+
+     @staticmethod
+     def _get_or_update_dataset_info(
+         ds_name: str, ds_cache_info: dict
+     ) -> tuple[DatasetArguments, bool]:
+         """
+         Retrieves or updates the dataset information from the cache.
+
+         This method checks if the dataset information is already available in the cache and if the file hash matches
+         the one stored in the cache. If it does, the information is retrieved from the cache. If not, the dataset
+         information is read from the storage, the cache is updated with the new information and the new file hash,
+         and a flag is set to indicate that the cache has been updated.
+
+         Args:
+             ds_name (str): The name of the dataset.
+             ds_cache_info (dict): A dictionary containing the cached dataset information.
+
+         Returns:
+             tuple[DatasetArguments, bool]: A tuple containing the DatasetArguments object with the dataset information
+             and a boolean indicating whether the cache was updated or not.
+         """
+         file_hash = Storage.get_file_hash(EnvVariables.DATASETS.name, ds_name, "json")
+         cache_updated = False
+
+         if ds_name in ds_cache_info and file_hash == ds_cache_info[ds_name]["hash"]:
+             ds_metadata = ds_cache_info[ds_name].copy()
+             ds_metadata.pop("hash", None)
+             ds_info = DatasetArguments(**ds_metadata)
+         else:
+             ds_info = DatasetArguments(**Dataset._read_dataset(ds_name))
+             ds_info.examples = None
+             ds_cache_info[ds_name] = ds_info.copy().to_dict()
+             ds_cache_info[ds_name]["hash"] = file_hash
+             cache_updated = True
+
+         return ds_info, cache_updated
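As a rough usage sketch of the class above, assuming a dataset file named example-dataset.json (a placeholder) sits in the configured DATASETS directory:

from moonshot.src.datasets.dataset import Dataset

# Read one dataset; examples are exposed through an iterator so large
# files are not loaded into memory all at once.
ds = Dataset.read("example-dataset")
print(ds.name, ds.num_of_dataset_prompts, ds.created_date)

# List every dataset; metadata is served from cache.json when the stored
# file hash still matches, otherwise the cache entry is rebuilt and rewritten.
ids, datasets = Dataset.get_available_items()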
moonshot/src/datasets/dataset_arguments.py
@@ -0,0 +1,50 @@
+ from pydantic import BaseModel
+ from pyparsing import Iterator
+
+
+ class DatasetArguments(BaseModel):
+     class Config:
+         arbitrary_types_allowed = True
+
+     # id (str): Unique identifier for the dataset
+     id: str
+
+     # name (str): Name of the dataset
+     name: str
+
+     # description (str): Description of the dataset's contents and purpose
+     description: str
+
+     # examples (Iterator[dict] | None): Generator of examples from the dataset, where each example is a dictionary.
+     examples: Iterator[dict] | None
+
+     # num_of_dataset_prompts (int): The number of dataset prompts, automatically calculated
+     num_of_dataset_prompts: int = 0
+
+     # created_date (str): The creation date and time of the dataset in ISO format without 'T'. Automatically generated.
+     created_date: str = ""
+
+     # reference (str): An optional string to store a reference link or identifier for the dataset
+     reference: str = ""
+
+     # license (str): License information for the dataset. Defaults to an empty string if not provided.
+     license: str = ""
+
+     def to_dict(self) -> dict:
+         """
+         Converts the DatasetArguments object to a dictionary.
+
+         Returns:
+             dict: A dictionary representation of the DatasetArguments object, including the id, name, description,
+             examples, number of dataset prompts, created date, reference, and license.
+         """
+         return {
+             "id": self.id,
+             "name": self.name,
+             "description": self.description,
+             "examples": self.examples,
+             "num_of_dataset_prompts": self.num_of_dataset_prompts,
+             "created_date": self.created_date,
+             "reference": self.reference,
+             "license": self.license,
+         }
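A brief sketch of the model in isolation (all field values are illustrative). Because examples is an iterator, the caching path in Dataset sets it to None before writing metadata, which keeps the to_dict() output JSON-serialisable.

from moonshot.src.datasets.dataset_arguments import DatasetArguments

meta = DatasetArguments(
    id="example-dataset",
    name="Example Dataset",
    description="A handful of prompts used for a smoke test.",
    examples=None,  # iterators are dropped before caching, as in Dataset
    num_of_dataset_prompts=3,
    created_date="2024-01-01 00:00:00",
    reference="",
    license="",
)
print(meta.to_dict())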
moonshot/src/metrics/__init__.py
File without changes