nomad-ml-workflows 0.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nomad_ml_workflows/__init__.py +0 -0
- nomad_ml_workflows/actions/__init__.py +3 -0
- nomad_ml_workflows/actions/export_entries/__init__.py +53 -0
- nomad_ml_workflows/actions/export_entries/activities.py +209 -0
- nomad_ml_workflows/actions/export_entries/models.py +245 -0
- nomad_ml_workflows/actions/export_entries/utils.py +172 -0
- nomad_ml_workflows/actions/export_entries/workflows.py +169 -0
- nomad_ml_workflows-0.0.6.dist-info/METADATA +197 -0
- nomad_ml_workflows-0.0.6.dist-info/RECORD +13 -0
- nomad_ml_workflows-0.0.6.dist-info/WHEEL +5 -0
- nomad_ml_workflows-0.0.6.dist-info/entry_points.txt +2 -0
- nomad_ml_workflows-0.0.6.dist-info/licenses/LICENSE +22 -0
- nomad_ml_workflows-0.0.6.dist-info/top_level.txt +1 -0
|
File without changes
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
from nomad.actions import TaskQueue
|
|
2
|
+
from pydantic import Field
|
|
3
|
+
from temporalio import workflow
|
|
4
|
+
|
|
5
|
+
with workflow.unsafe.imports_passed_through():
|
|
6
|
+
from nomad.config.models.plugins import ActionEntryPoint
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# Entry point for the Export Entries action. Besides the settings inherited
# from ActionEntryPoint it exposes two tunables that the workflow reads:
# the per-batch search timeout and the cap on exported entries.
class ExportEntriesActionEntryPoint(ActionEntryPoint):
    search_batch_timeout: int = Field(
        default=7200,  # 2 hours
        description=(
            'Timeout (in seconds) for each search batch in the Export Entries '
            'action. Set this accordingly to time out longer searches.'
        ),
    )
    max_entries_export_limit: int = Field(
        default=100000,
        description=(
            'Maximum number of entries that can be exported in a single '
            'Export Entries action.'
        ),
    )

    def load(self):
        """Build the `Action` that binds the export workflow to its activities.

        The imports are performed inside the method, i.e. only when the entry
        point is loaded.

        Returns:
            Action: Action configured with this entry point's task queue, the
            `ExportEntriesWorkflow` and the five activities it executes.
        """
        from nomad.actions import Action

        from nomad_ml_workflows.actions.export_entries.activities import (
            cleanup_artifacts,
            create_artifact_subdirectory,
            export_dataset_to_upload,
            merge_output_files,
            search,
        )
        from nomad_ml_workflows.actions.export_entries.workflows import (
            ExportEntriesWorkflow,
        )

        # Registered in the order the workflow runs them.
        registered_activities = [
            create_artifact_subdirectory,
            search,
            merge_output_files,
            export_dataset_to_upload,
            cleanup_artifacts,
        ]
        return Action(
            task_queue=self.task_queue,
            workflow=ExportEntriesWorkflow,
            activities=registered_activities,
        )
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# Module-level entry point instance for the Export Entries action, bound to
# the CPU task queue.
export_entries = ExportEntriesActionEntryPoint(
    task_queue=TaskQueue.CPU,
    name='Export Entries Action',
    description=(
        'An action to search entries and export them as a zip file in the '
        'specified upload.'
    ),
)
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
import shutil
|
|
4
|
+
import zipfile
|
|
5
|
+
from datetime import datetime, timezone
|
|
6
|
+
|
|
7
|
+
from nomad.actions.manager import action_artifacts_dir, get_upload_files
|
|
8
|
+
from nomad.files import StagingUploadFiles
|
|
9
|
+
from nomad.search import search as nomad_search
|
|
10
|
+
from temporalio import activity
|
|
11
|
+
|
|
12
|
+
from nomad_ml_workflows.actions.export_entries.models import (
|
|
13
|
+
CleanupArtifactsInput,
|
|
14
|
+
CreateArtifactSubdirectoryInput,
|
|
15
|
+
ExportDatasetInput,
|
|
16
|
+
MergeOutputFilesInput,
|
|
17
|
+
SearchInput,
|
|
18
|
+
SearchOutput,
|
|
19
|
+
)
|
|
20
|
+
from nomad_ml_workflows.actions.export_entries.utils import (
|
|
21
|
+
merge_files,
|
|
22
|
+
write_json_file,
|
|
23
|
+
write_parquet_file,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@activity.defn
async def create_artifact_subdirectory(data: CreateArtifactSubdirectoryInput) -> str:
    """
    Creates a subdirectory within the action artifacts directory.

    Args:
        data (CreateArtifactSubdirectoryInput): Input data for creating subdirectory.
            `data.subdir_name` is joined onto the path returned by
            `action_artifacts_dir()`.

    Returns:
        str: Path to the created subdirectory.

    Raises:
        FileExistsError: If the subdirectory already exists.
    """

    subdir_path = os.path.join(action_artifacts_dir(), data.subdir_name)

    # Let `os.makedirs` perform the existence check instead of an `assert`:
    # asserts are stripped under `python -O`, and a separate exists()/makedirs()
    # pair leaves a window between the check and the creation.
    try:
        os.makedirs(subdir_path)
    except FileExistsError as e:
        raise FileExistsError(
            f'Artifact subdirectory "{subdir_path}" already exists.'
        ) from e

    return subdir_path
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@activity.defn
async def search(data: SearchInput) -> SearchOutput:
    """
    Run one NOMAD search batch and persist its entries to a batch file.

    The writer is selected from `data.batch_file_type` ('parquet' or 'json').
    At most `data.max_entries_export_limit` entries of the response are kept.
    When no entries remain, nothing is written and the returned
    `pagination_next_page_after_value` is cleared so the caller stops paging.

    Args:
        data (SearchInput): Input data for the search activity.

    Returns:
        SearchOutput: Timestamps of the search call, number of entries written,
        total reported by the response pagination, and the next page cursor.

    Raises:
        ValueError: If `data.batch_file_type` has no registered writer.
    """
    writers = {
        'parquet': write_parquet_file,
        'json': write_json_file,
    }
    writer = writers.get(data.batch_file_type)
    if writer is None:
        raise ValueError(f'Unsupported batch file type "{data.batch_file_type}". ')

    # Timestamps bracket only the search call itself.
    started_at = datetime.now(timezone.utc).isoformat()
    response = nomad_search(
        user_id=data.user_id,
        owner=data.owner,
        query=data.query,
        required=data.required,
        pagination=data.pagination,
        aggregations={},  # aggregations support can be added later
    )
    finished_at = datetime.now(timezone.utc).isoformat()

    # Limit the number of exported entries
    limit = data.max_entries_export_limit
    hits = response.data
    entries = hits[:limit] if len(hits) > limit else hits

    result = SearchOutput(
        search_start_time=started_at,
        search_end_time=finished_at,
        num_entries_exported=len(entries),
        num_entries_available=response.pagination.total,
        pagination_next_page_after_value=response.pagination.next_page_after_value,
    )

    if len(entries) != 0:
        writer(path=data.output_file_path, data=entries)
    else:
        # skip writing empty files and stop subsequent searches
        result.pagination_next_page_after_value = None

    return result
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
@activity.defn
async def merge_output_files(data: MergeOutputFilesInput) -> str | None:
    """
    Merge the generated batch files into a single `data.<output_file_type>` file.

    Args:
        data (MergeOutputFilesInput): Input data for merging files.

    Returns:
        str | None: Path of the merged output file inside
        `data.artifact_subdirectory`. This implementation always returns a path.

    Raises:
        ValueError: If `data.generated_file_paths` is empty.
    """
    batch_paths = data.generated_file_paths
    if not batch_paths:
        raise ValueError('No generated file paths provided for merging.')

    target_path = os.path.join(
        data.artifact_subdirectory, f'data.{data.output_file_type}'
    )
    merge_files(batch_paths, data.output_file_type, target_path)
    return target_path
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
@activity.defn
async def export_dataset_to_upload(data: ExportDatasetInput) -> str:
    """
    Activity to export the generated dataset files to the specified upload,
    either as a zip file or as a directory. A `metadata.json` file is written
    to the artifact subdirectory and included in the export.

    Args:
        data (ExportDatasetInput): Input data for exporting the dataset to the upload.

    Returns:
        str: Name of the zip file (when `data.zip_output` is true) or of the
        directory that was added to the upload.

    Raises:
        ValueError: If no upload files are returned for the given upload and user.
    """

    def unique_filename(filename: str, upload_files: StagingUploadFiles) -> str:
        """Return `filename`, or `name(N)ext` for the first N not yet in the upload."""
        if not upload_files.raw_path_exists(filename):
            return filename

        # The split does not depend on the counter, so do it once.
        name, ext = os.path.splitext(filename)
        count = 1
        while True:
            candidate = f'{name}({count}){ext}'
            if not upload_files.raw_path_exists(candidate):
                return candidate
            count += 1

    upload_files = get_upload_files(data.upload_id, data.user_id)
    if not upload_files:
        raise ValueError(
            f'Upload with ID {data.upload_id} for user {data.user_id} not found.'
        )

    # Create a metadata.json file in the artifact subdirectory
    metadata_dict = {
        'note': 'This metadata file contains information about the exported dataset '
        'and the conditions under which it was generated.',
        'data': data.metadata.model_dump(),
        'schema': data.metadata.model_json_schema(),
    }
    metadata_path = os.path.join(data.artifact_subdirectory, 'metadata.json')
    with open(metadata_path, 'w', encoding='utf-8') as metafile:
        json.dump(metadata_dict, metafile, indent=4)

    exportable_filepaths = data.source_paths + [metadata_path]

    # Create a zip file containing all the source paths and the metadata file
    if data.zip_output:
        # Check uniqueness on the name that is actually added to the upload
        # (`<name>.zip`). Checking the bare directory name would miss an
        # already existing zip file of the same name.
        zipname = unique_filename(data.exportable_dir_name + '.zip', upload_files)
        zippath = os.path.join(data.artifact_subdirectory, zipname)
        with zipfile.ZipFile(zippath, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
            for filepath in exportable_filepaths:
                # Store files flat, under their base name only.
                zipf.write(filepath, arcname=os.path.basename(filepath))
        # Add zip file to the NOMAD Upload
        upload_files.add_rawfiles(path=zippath, auto_decompress=False)
        return zipname

    # If not zipping, copy files to directory named exportable_dir_name
    exportable_dir_name = unique_filename(data.exportable_dir_name, upload_files)
    exportable_dir_path = os.path.join(data.artifact_subdirectory, exportable_dir_name)
    os.mkdir(exportable_dir_path)
    for filepath in exportable_filepaths:
        temp_path = os.path.join(exportable_dir_path, os.path.basename(filepath))
        shutil.copy2(filepath, temp_path)
    # Add directory to the NOMAD Upload
    upload_files.add_rawfiles(
        path=exportable_dir_path, target_dir=exportable_dir_name
    )
    return exportable_dir_name
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
@activity.defn
async def cleanup_artifacts(data: CleanupArtifactsInput) -> None:
    """
    Remove the artifact subdirectory and everything below it.

    Does nothing when the path does not exist.

    Args:
        data (CleanupArtifactsInput): Input data for cleaning up artifacts.
    """
    target = data.subdir_path
    if not os.path.exists(target):
        return
    shutil.rmtree(target)
|
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from typing import Literal
|
|
3
|
+
|
|
4
|
+
from nomad.app.v1.models.models import MetadataPagination, MetadataRequired, Query
|
|
5
|
+
from pydantic import BaseModel, Field
|
|
6
|
+
|
|
7
|
+
# Closed sets of string values used as field types by the models below.

# Accepted values for the `owner` field of a search.
OwnerLiteral = Literal[
    'public',
    'visible',
    'shared',
    'user',
    'staging',
]
# File formats in which individual search batches are written.
BatchFileTypeLiteral = Literal[
    'parquet',
    'json',
]
# File formats offered for the final merged output.
OutputFileTypeLiteral = Literal[
    'parquet',
    'csv',
    'json',
]
IndexLiteral = Literal[
    'entries',
    'datasets',
    'models',
    'spaces',
]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# User-facing search configuration: whose entries to search, the query (given
# as a string that is parsed later), and optional include/exclude field lists.
class SearchSettings(BaseModel):
    owner: OwnerLiteral = Field(
        'visible', description='Owner of the entries to be searched.'
    )
    query: str = Field(
        ...,
        description="""Query for extracting entries. Should be a valid dictionary
string. For example:
{
'entry_type': 'ELNSample'
}""",
        # TODO: add `ui:widget` though `json_schema_extra` after NOMAD UI supports it
    )
    # Both lists default to None, so the annotation has to allow None as well.
    # Otherwise an explicitly passed `None` (e.g. after the model has been
    # dumped and validated again) is rejected.
    required_include: list[str] | None = Field(
        None,
        description='List of fields to include in the search results. For example: '
        'results*, data.results*',
    )
    required_exclude: list[str] | None = Field(
        None,
        description='List of fields to exclude from the search results. For example: '
        'results.method.method_name',
    )
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# User-facing output configuration: final file format, number of entries per
# search batch, and whether the result is exported as a zip file or directory.
class OutputSettings(BaseModel):
    output_file_type: OutputFileTypeLiteral = Field(
        default='parquet',
        description='Type of the output file.',
    )
    batch_size: int = Field(
        default=1000,
        gt=0,
        description=(
            'Number of entries to be fetched and written per search batch. '
            'Use smaller batch sizes when exporting large entries to reduce memory usage.'
        ),
    )
    zip_output: bool = Field(
        default=True,
        description=(
            'Whether to create a zip file for the output file(s). Set it '
            'to true if you want download the dataset for external use. If you want to '
            'work with the exported data in NOMAD, set it to false. This will export the '
            'dataset as a directory in the specified project.'
        ),
    )
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# Top-level input of the export entries workflow: target upload, acting user,
# and the nested search and output settings. All four fields are required.
class ExportEntriesUserInput(BaseModel):
    upload_id: str = Field(
        description='Unique identifier for the upload associated with the workflow.',
    )
    user_id: str = Field(
        description='Unique identifier for the user who initiated the workflow.',
    )
    search_settings: SearchSettings
    output_settings: OutputSettings
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
# Input of the `create_artifact_subdirectory` activity.
class CreateArtifactSubdirectoryInput(BaseModel):
    subdir_name: str = Field(
        description='Name of the subdirectory to be created.',
    )
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
# Input of the `search` activity. Built from the user input through
# `from_user_input`; the workflow then updates `output_file_path`, the
# pagination cursor and the remaining entry limit between batches.
class SearchInput(BaseModel):
    user_id: str = Field(..., description='User ID performing the search.')
    owner: OwnerLiteral = Field(..., description='Owner of the entries to be searched.')
    query: Query = Field(..., description='Search query parameters.')
    required: MetadataRequired = Field(
        ..., description='Required fields for filtering the search results.'
    )
    pagination: MetadataPagination = Field(
        ..., description='Pagination settings for the search results.'
    )
    batch_file_type: BatchFileTypeLiteral = Field(
        ..., description='Type of the output file.'
    )
    output_file_path: str = Field(..., description='Path to the generated output file.')
    max_entries_export_limit: int = Field(
        ..., description='Maximum number of entries to be exported.'
    )

    @classmethod
    def from_user_input(
        cls,
        user_input: ExportEntriesUserInput,
        /,
        output_file_path: str,
        max_entries_export_limit: int,
    ) -> 'SearchInput':
        """Convert from ExportEntriesUserInput to SearchInput.

        Args:
            user_input (ExportEntriesUserInput): Settings provided by the user.
            output_file_path (str): Path of the batch file the search writes to.
            max_entries_export_limit (int): Maximum number of entries to export.

        Returns:
            SearchInput: Input for the search activity. A 'csv' output type is
            mapped to 'parquet' batches.

        Raises:
            json.JSONDecodeError: If the query string cannot be parsed.
        """

        def _clean_field(field: str) -> str:
            """
            Removes surrounding whitespaces and inverted commas
            """
            return field.strip().strip("'").strip('"')

        def _clean_fields(fields: list[str] | None) -> list[str] | None:
            """Clean every field name; map a missing or empty list to None."""
            if not fields:
                return None
            return [_clean_field(field) for field in fields]

        def _parse_query(raw_query: str):
            """Parse the query string, accepting JSON or single-quoted dict text."""
            cleaned = _clean_field(raw_query)
            try:
                # Strict JSON first, so apostrophes inside string values
                # (e.g. {"name": "O'Brien"}) are left untouched.
                return json.loads(cleaned)
            except json.JSONDecodeError:
                # Fallback for dictionary-style input with single quotes,
                # e.g. {'entry_type': 'ELNSample'}.
                return json.loads(cleaned.replace("'", '"'))

        query = _parse_query(user_input.search_settings.query)

        # Include and exclude are handled identically.
        required = MetadataRequired()
        include = _clean_fields(user_input.search_settings.required_include)
        if include is not None:
            required.include = include
        exclude = _clean_fields(user_input.search_settings.required_exclude)
        if exclude is not None:
            required.exclude = exclude

        pagination = MetadataPagination(page_size=user_input.output_settings.batch_size)

        batch_file_type = user_input.output_settings.output_file_type
        if batch_file_type == 'csv':
            batch_file_type = 'parquet'  # use parquet batches for csv

        return cls(
            user_id=user_input.user_id,
            owner=user_input.search_settings.owner,
            query=query,
            required=required,
            pagination=pagination,
            batch_file_type=batch_file_type,
            output_file_path=output_file_path,
            max_entries_export_limit=max_entries_export_limit,
        )
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
# Result of one `search` activity run: entry counts, the timestamps taken
# around the search call, and the cursor for the next page (None when there
# is nothing more to fetch).
class SearchOutput(BaseModel):
    num_entries_exported: int = Field(
        description='Number of entries exported to the output file.',
    )
    num_entries_available: int = Field(
        description='Total number of entries available for the given search query.',
    )
    search_start_time: str = Field(
        description='Timestamp when the search started.',
    )
    search_end_time: str = Field(
        description='Timestamp when the search completed.',
    )
    pagination_next_page_after_value: str | None = Field(
        default=None,
        description=(
            'The next_page_after_value from pagination, if more results are '
            'available.'
        ),
    )
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
# Input of the `merge_output_files` activity.
class MergeOutputFilesInput(BaseModel):
    artifact_subdirectory: str = Field(
        description='Subdirectory where the merged output file will be stored.',
    )
    output_file_type: OutputFileTypeLiteral = Field(
        description='Type of the output file.',
    )
    generated_file_paths: list[str] = Field(
        description='List of the generated file paths to be merged into a single file.',
    )
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
# Metadata written next to the exported dataset. Every field has a default,
# so an instance can be created up front and filled in (or annotated with
# error information) later.
class ExportDatasetMetadata(BaseModel):
    num_entries_exported: int = Field(
        default=0,
        description=(
            'Total number of entries exported in all the exported dataset '
            'batches.'
        ),
    )
    num_entries_available: int = Field(
        default=0,
        description='Total number of entries available for the given search query.',
    )
    reached_max_entries_limit: bool = Field(
        default=False,
        description=(
            'Indicates whether the export reached the maximum number of '
            'entries allowed. If true, the exported dataset contains the first N entries '
            'up to the maximum limit.'
        ),
    )
    search_start_time: str = Field(
        default='',
        description='Timestamp when the first search batch started.',
    )
    search_end_time: str = Field(
        default='',
        description='Timestamp when the last search batch completed.',
    )
    user_input: ExportEntriesUserInput | None = Field(
        default=None,
        description='Original user input for the export entries workflow.',
    )
    error_info: str | None = Field(
        default=None,
        description=(
            'Error information if any error occurred during the search and '
            'merging process.'
        ),
    )
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
# Input of the `export_dataset_to_upload` activity. All fields are required.
class ExportDatasetInput(BaseModel):
    user_id: str = Field(
        description='User ID performing the export dataset operation.',
    )
    upload_id: str = Field(
        description='Upload ID associated with the export dataset operation.',
    )
    artifact_subdirectory: str = Field(
        description='Subdirectory where the exported dataset zip file will be stored.',
    )
    zip_output: bool = Field(
        description='Whether to create a zip file for the exported dataset.',
    )
    exportable_dir_name: str = Field(
        description=(
            'Name of the directory containing the dataset that will be '
            'exported.'
        ),
    )
    source_paths: list[str] = Field(
        description='List of paths to the source files of the dataset.',
    )
    metadata: ExportDatasetMetadata = Field(
        description='Metadata associated with the exported dataset.',
    )
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
# Input of the `cleanup_artifacts` activity.
class CleanupArtifactsInput(BaseModel):
    subdir_path: str = Field(
        description='Path to the subdirectory to be cleaned up.',
    )
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
3
|
+
import json_stream
|
|
4
|
+
from nomad.utils import dict_to_dataframe
|
|
5
|
+
|
|
6
|
+
try:
|
|
7
|
+
import pyarrow as pa
|
|
8
|
+
import pyarrow.csv as pcsv
|
|
9
|
+
import pyarrow.dataset as ds
|
|
10
|
+
import pyarrow.parquet as pq
|
|
11
|
+
except ImportError as e:
|
|
12
|
+
raise ImportError(
|
|
13
|
+
'pyarrow is required. Install with: pip install nomad-ml-workflows[cpu-action]'
|
|
14
|
+
) from e
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _is_nested_type(dtype: pa.DataType) -> bool:
    """Return whether `pa.types.is_nested` reports `dtype` as a nested type."""
    nested = pa.types.is_nested(dtype)
    return nested
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _get_csv_compatible_schema(schema: pa.Schema) -> pa.Schema:
    """Return a copy of `schema` in which every nested field is typed as string.

    Non-nested fields are carried over unchanged. Nested fields keep their name
    and nullability.
    """
    return pa.schema(
        [
            pa.field(field.name, pa.string(), field.nullable)
            if _is_nested_type(field.type)
            else field
            for field in schema
        ]
    )
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _stringify_nested_columns(batch: pa.RecordBatch) -> pa.RecordBatch:
    """Convert nested columns (list, struct) in a batch to JSON strings.

    Args:
        batch (pa.RecordBatch): Batch whose nested columns should be serialized.

    Returns:
        pa.RecordBatch: Batch with the schema from `_get_csv_compatible_schema`.
        Nested columns hold one JSON string per row (nulls stay null); all
        other columns are passed through unchanged.
    """
    new_columns = []
    for field, column in zip(batch.schema, batch.columns):
        if not _is_nested_type(field.type):
            new_columns.append(column)
            continue

        # Convert the column to Python objects once, instead of calling
        # `as_py()` twice per element (once for the None check, once to dump).
        values = column.to_pylist()
        new_columns.append(
            pa.array(
                [None if value is None else json.dumps(value) for value in values],
                type=pa.string(),
            )
        )

    return pa.RecordBatch.from_arrays(
        new_columns, schema=_get_csv_compatible_schema(batch.schema)
    )
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def write_parquet_file(path: str, data: list[dict]):
    """Write a list of NOMAD entry dicts to a single parquet file.

    The entries are converted with `dict_to_dataframe` and written as one
    table using snappy compression and dictionary encoding.

    Args:
        path (str): The path where the file will be saved; must end in 'parquet'.
        data (list[dict]): The list of NOMAD entry dicts to be written to the file.

    Raises:
        ValueError: If `path` does not end in 'parquet'.
    """
    if not path.endswith('parquet'):
        raise ValueError('Unsupported file type. Please use parquet.')

    table = pa.Table.from_pandas(dict_to_dataframe(data))

    # snappy for faster write/read for individual files
    writer_options = {'compression': 'snappy', 'use_dictionary': True}
    with pq.ParquetWriter(path, table.schema, **writer_options) as parquet_writer:
        parquet_writer.write_table(table)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def write_csv_file(path: str, data: list[dict]):
    """Write a list of NOMAD entry dicts to a CSV file with a header row.

    Args:
        path (str): The path where the file will be saved; must end in 'csv'.
        data (list[dict]): The list of NOMAD entry dicts to be written to the file.

    Raises:
        ValueError: If `path` does not end in 'csv'.
    """
    if path.endswith('csv'):
        frame = dict_to_dataframe(data)
        frame.to_csv(path, index=False, mode='w', header=True)
        return

    raise ValueError('Unsupported file type. Please use csv.')
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def write_json_file(path: str, data: list[dict]):
    """Writes a list of NOMAD entry dicts to a JSON file.

    Args:
        path (str): The path where the file will be saved; must end in 'json'.
        data (list[dict]): The list of NOMAD entry dicts to be written to the file.

    Raises:
        ValueError: If `path` does not end in 'json'.
    """
    if not path.endswith('json'):
        raise ValueError('Unsupported file type. Please use json.')

    # Pin the encoding: the batch files are read back as utf-8 when they are
    # merged, so the writer must not depend on the platform default encoding.
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def merge_files(
    input_file_paths: list[str], output_file_type: str, output_file_path: str
):
    """Merge multiple batch files into one parquet, CSV or JSON file.

    For 'parquet' and 'csv' the inputs are read as parquet files; for 'json'
    they are read as JSON files and streamed entry by entry.

    Args:
        input_file_paths (list[str]): List of file paths to be merged.
        output_file_type (str): The type of the output file ('parquet', 'csv', or
            'json').
        output_file_path (str): Path of the merged output file.

    Raises:
        ValueError: If `output_file_type` is none of the supported types.
    """
    if output_file_type not in ('parquet', 'csv', 'json'):
        raise ValueError('Unsupported file type. Please use parquet, csv, or json.')

    if output_file_type == 'json':

        def _iter_entries(paths):
            """Yield one entry at a time from each input file in turn."""
            for path in paths:
                with open(path, encoding='utf-8') as source:
                    yield from json_stream.load(source)

        # Write a single JSON list, separating the streamed entries by commas.
        with open(output_file_path, 'w', encoding='utf-8') as out:
            out.write('[\n')
            for position, entry in enumerate(_iter_entries(input_file_paths)):
                if position > 0:
                    out.write(',\n')
                # Convert transient json_stream object to standard Python types
                json.dump(json_stream.to_standard_types(entry), out, indent=4)
            out.write('\n]')
        return

    # Both remaining formats read the parquet batch files (batches for `csv`
    # are written as parquet) through a logical dataset that is consumed in
    # batches.
    # NOTE(review): the original comment says this unifies the schema across
    # the files; confirm against pyarrow's schema inference for multi-file
    # datasets.
    dataset = ds.dataset(input_file_paths, format='parquet')

    if output_file_type == 'parquet':
        with pq.ParquetWriter(
            output_file_path,
            dataset.schema,
            compression='zstd',  # for better compression for merged file
            compression_level=3,
            use_dictionary=True,
        ) as parquet_writer:
            for record_batch in dataset.to_batches():
                parquet_writer.write_batch(record_batch)
        return

    # csv: nested columns (list, struct, etc.) are converted to JSON strings
    # before being handed to the CSV writer.
    flat_schema = _get_csv_compatible_schema(dataset.schema)
    with pcsv.CSVWriter(output_file_path, flat_schema) as csv_writer:
        for record_batch in dataset.to_batches():
            csv_writer.write_batch(_stringify_nested_columns(record_batch))
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
from datetime import timedelta
|
|
2
|
+
|
|
3
|
+
from temporalio import workflow
|
|
4
|
+
from temporalio.common import RetryPolicy
|
|
5
|
+
from temporalio.exceptions import ApplicationError
|
|
6
|
+
|
|
7
|
+
with workflow.unsafe.imports_passed_through():
|
|
8
|
+
from nomad.config import config as nomad_config
|
|
9
|
+
|
|
10
|
+
from nomad_ml_workflows.actions.export_entries.activities import (
|
|
11
|
+
cleanup_artifacts,
|
|
12
|
+
create_artifact_subdirectory,
|
|
13
|
+
export_dataset_to_upload,
|
|
14
|
+
merge_output_files,
|
|
15
|
+
search,
|
|
16
|
+
)
|
|
17
|
+
from nomad_ml_workflows.actions.export_entries.models import (
|
|
18
|
+
CleanupArtifactsInput,
|
|
19
|
+
CreateArtifactSubdirectoryInput,
|
|
20
|
+
ExportDatasetInput,
|
|
21
|
+
ExportDatasetMetadata,
|
|
22
|
+
ExportEntriesUserInput,
|
|
23
|
+
MergeOutputFilesInput,
|
|
24
|
+
SearchInput,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@workflow.defn
class ExportEntriesWorkflow:
    """Workflow that searches entries in batches and exports them to an upload."""

    @workflow.run
    async def run(self, data: ExportEntriesUserInput) -> str:
        """
        Workflow to search entries and export them into a datafile in the specified
        upload.

        Steps: create an artifact subdirectory named after the workflow id, run
        the search activity page by page until there are no more pages or the
        configured entry limit is used up, merge the batch files, export the
        result together with its metadata, and remove the artifact subdirectory.
        If searching or merging fails, the traceback is stored in the metadata,
        which is still exported under the name 'export_entries_error'.

        Args:
            data (ExportEntriesUserInput): Input data for the export entries workflow.

        Returns:
            str: Path to the saved dataset in the upload's `raw` folder.

        Raises:
            ApplicationError: If an error occurs during searching or merging.
        """
        retry_policy = RetryPolicy(
            maximum_attempts=1,
            initial_interval=timedelta(seconds=10),
            maximum_interval=timedelta(minutes=1),
            backoff_coefficient=2.0,
        )
        artifact_subdirectory = await workflow.execute_activity(
            create_artifact_subdirectory,
            CreateArtifactSubdirectoryInput(subdir_name=workflow.info().workflow_id),
            start_to_close_timeout=timedelta(minutes=10),
            retry_policy=retry_policy,
        )
        # Prepared up front so that the `finally` block can always export
        # something, even when the search/merge phase fails.
        export_dataset_input = ExportDatasetInput(
            user_id=data.user_id,
            upload_id=data.upload_id,
            artifact_subdirectory=artifact_subdirectory,
            exportable_dir_name='export_entries_error',  # name used in case of error
            zip_output=data.output_settings.zip_output,
            source_paths=[],
            metadata=ExportDatasetMetadata(user_input=data),
        )

        try:
            config = nomad_config.get_plugin_entry_point(
                'nomad_ml_workflows.actions:export_entries'
            )

            search_counter = 0
            num_entries_available = 0
            generated_file_paths = []
            search_start_times = []
            search_end_times = []
            total_num_entries_exported = 0
            reached_max_entries_limit = False
            search_input = SearchInput.from_user_input(
                data,
                output_file_path='',  # Placeholder, will be set in loop
                max_entries_export_limit=config.max_entries_export_limit,
            )
            while True:
                search_counter += 1
                search_input.output_file_path = (
                    f'{artifact_subdirectory}/{search_counter}.'
                    f'{search_input.batch_file_type}'
                )
                search_output = await workflow.execute_activity(
                    search,
                    search_input,
                    activity_id=f'search-activity-{search_counter}',
                    start_to_close_timeout=timedelta(
                        seconds=config.search_batch_timeout
                    ),
                    retry_policy=retry_policy,
                )
                if search_counter == 1:
                    # capture the total available entries from the first search output
                    num_entries_available = search_output.num_entries_available
                if search_output.num_entries_exported > 0:
                    # only save paths if the writing files was not skipped
                    generated_file_paths.append(search_input.output_file_path)
                search_start_times.append(search_output.search_start_time)
                search_end_times.append(search_output.search_end_time)
                total_num_entries_exported += search_output.num_entries_exported
                # Update pagination for next iteration
                search_input.pagination.page_after_value = (
                    search_output.pagination_next_page_after_value
                )
                # The limit passed to the next batch is what is left of the budget.
                search_input.max_entries_export_limit -= (
                    search_output.num_entries_exported
                )

                if search_output.pagination_next_page_after_value is None:
                    # break if there are no more pages to fetch
                    break
                if search_input.max_entries_export_limit <= 0:
                    # break early if the max entries limit has been reached
                    reached_max_entries_limit = True
                    break

            merged_file_path = await workflow.execute_activity(
                merge_output_files,
                MergeOutputFilesInput(
                    artifact_subdirectory=artifact_subdirectory,
                    output_file_type=data.output_settings.output_file_type,
                    generated_file_paths=generated_file_paths,
                ),
                start_to_close_timeout=timedelta(hours=2),
                retry_policy=retry_policy,
            )

            # Prepare export dataset input and metadata
            export_dataset_input.exportable_dir_name = (
                'export_entries_' + search_start_times[0].replace(':', '-')
            )
            export_dataset_input.source_paths = [merged_file_path]
            export_dataset_input.metadata = ExportDatasetMetadata(
                num_entries_exported=total_num_entries_exported,
                num_entries_available=num_entries_available,
                reached_max_entries_limit=reached_max_entries_limit,
                search_start_time=search_start_times[0],
                search_end_time=search_end_times[-1],
                user_input=data,
            )

        except Exception as e:
            # Capture error info to include in metadata
            import traceback

            export_dataset_input.metadata.error_info = traceback.format_exc()
            raise ApplicationError(
                'Encountered an error during export entries workflow.',
            ) from e

        finally:
            try:
                saved_dataset_path = await workflow.execute_activity(
                    export_dataset_to_upload,
                    export_dataset_input,
                    start_to_close_timeout=timedelta(hours=2),
                    retry_policy=retry_policy,
                )
            finally:
                # Run the cleanup even when the export activity fails, so the
                # artifact subdirectory is not left behind.
                await workflow.execute_activity(
                    cleanup_artifacts,
                    CleanupArtifactsInput(subdir_path=artifact_subdirectory),
                    start_to_close_timeout=timedelta(hours=2),
                    retry_policy=retry_policy,
                )

        return saved_dataset_path
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: nomad-ml-workflows
|
|
3
|
+
Version: 0.0.6
|
|
4
|
+
Summary: A NOMAD plugin for managing ML workflows.
|
|
5
|
+
Author-email: Sarthak Kapoor <sarthak.kapoor@physik.hu-berlin.de>
|
|
6
|
+
Maintainer-email: Sarthak Kapoor <sarthak.kapoor@physik.hu-berlin.de>
|
|
7
|
+
License:
|
|
8
|
+
The MIT License (MIT)
|
|
9
|
+
|
|
10
|
+
Copyright (c) 2025 Sarthak Kapoor
|
|
11
|
+
|
|
12
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
13
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
14
|
+
in the Software without restriction, including without limitation the rights
|
|
15
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
16
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
17
|
+
furnished to do so, subject to the following conditions:
|
|
18
|
+
|
|
19
|
+
The above copyright notice and this permission notice shall be included in
|
|
20
|
+
all copies or substantial portions of the Software.
|
|
21
|
+
|
|
22
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
23
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
24
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
25
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
26
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
27
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
28
|
+
THE SOFTWARE.
|
|
29
|
+
|
|
30
|
+
Project-URL: Repository, https://github.com/FAIRmat-NFDI/nomad-ml-workflows
|
|
31
|
+
Classifier: Intended Audience :: Developers
|
|
32
|
+
Classifier: Operating System :: OS Independent
|
|
33
|
+
Classifier: Programming Language :: Python
|
|
34
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
35
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
36
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
37
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
38
|
+
Requires-Python: >=3.10
|
|
39
|
+
Description-Content-Type: text/markdown
|
|
40
|
+
License-File: LICENSE
|
|
41
|
+
Requires-Dist: nomad-lab>=1.4.0
|
|
42
|
+
Requires-Dist: json-stream
|
|
43
|
+
Requires-Dist: pydantic
|
|
44
|
+
Requires-Dist: temporalio
|
|
45
|
+
Provides-Extra: dev
|
|
46
|
+
Requires-Dist: nomad-lab[infrastructure]>=1.4.0; extra == "dev"
|
|
47
|
+
Requires-Dist: ruff; extra == "dev"
|
|
48
|
+
Requires-Dist: pytest; extra == "dev"
|
|
49
|
+
Requires-Dist: structlog; extra == "dev"
|
|
50
|
+
Requires-Dist: mkdocs; extra == "dev"
|
|
51
|
+
Requires-Dist: mkdocs-material>=8.1.1; extra == "dev"
|
|
52
|
+
Requires-Dist: pymdown-extensions; extra == "dev"
|
|
53
|
+
Requires-Dist: mkdocs-click; extra == "dev"
|
|
54
|
+
Requires-Dist: pytest-asyncio; extra == "dev"
|
|
55
|
+
Provides-Extra: cpu-action
|
|
56
|
+
Requires-Dist: pyarrow; extra == "cpu-action"
|
|
57
|
+
Dynamic: license-file
|
|
58
|
+
|
|
59
|
+
# nomad-ml-workflows
|
|
60
|
+
|
|
61
|
+
A NOMAD plugin for managing ML workflows. Currently, it provides an action to export a large number of entries from the NOMAD database as tabular data files. Other ML workflow-related actions and schemas will be added in the future.
|
|
62
|
+
|
|
63
|
+
## 📦 Installation
|
|
64
|
+
You can install the plugin using pip:
|
|
65
|
+
```sh
|
|
66
|
+
pip install "nomad-ml-workflows @ git+https://github.com/FAIRmat-NFDI/nomad-ml-workflows.git"
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
However, to fully utilize the plugin, you need to add it to your NOMAD instance as described [below](#-adding-this-plugin-to-nomad).
|
|
70
|
+
|
|
71
|
+
## ✨ Features
|
|
72
|
+
|
|
73
|
+
- Export a large number of NOMAD entries as tabular data files (CSV, Parquet) using NOMAD Actions. Once the action is triggered, it will:
|
|
74
|
+
- Search entries based on user-defined criteria.
|
|
75
|
+
- Optionally include or exclude data fields from the entries.
|
|
76
|
+
- Package the entries into tabular data files like CSV or Parquet (or as JSON).
|
|
77
|
+
- Export the files to a specified Project (previously known as Upload) in NOMAD.
|
|
78
|
+
|
|
79
|
+
These can then be downloaded from the NOMAD web interface for local use.
|
|
80
|
+
|
|
81
|
+
## ⚙️ Configuration
|
|
82
|
+
The Export Entries action can be configured using the following parameters in
|
|
83
|
+
the `nomad.yaml` configuration file of your NOMAD Oasis instance:
|
|
84
|
+
|
|
85
|
+
```yaml
|
|
86
|
+
plugins:
|
|
87
|
+
entry_points:
|
|
88
|
+
options:
|
|
89
|
+
nomad_ml_workflows.actions:export_entries:
|
|
90
|
+
search_batch_timeout: 7200
|
|
91
|
+
# Timeout (in seconds) for each search batch in the Export Entries
|
|
92
|
+
# action. Set this accordingly to time out longer searches.
|
|
93
|
+
max_entries_export_limit: 100000
|
|
94
|
+
# Maximum number of entries that can be exported in a single
|
|
95
|
+
# Export Entries action.
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
## 🚀 Adding this plugin to NOMAD
|
|
100
|
+
|
|
101
|
+
Currently, NOMAD has two distinct flavors that are relevant depending on your role as a user:
|
|
102
|
+
1. [A NOMAD Oasis](#adding-this-plugin-in-your-nomad-oasis): any user with a NOMAD Oasis instance.
|
|
103
|
+
2. [Local NOMAD installation and the source code of NOMAD](#adding-this-plugin-in-your-local-nomad-installation-and-the-source-code-of-nomad): internal developers.
|
|
104
|
+
|
|
105
|
+
### Adding this plugin in your NOMAD Oasis
|
|
106
|
+
|
|
107
|
+
Read the [NOMAD plugin documentation](https://nomad-lab.eu/prod/v1/staging/docs/howto/oasis/plugins_install.html) for all details on how to deploy the plugin on your NOMAD instance.
|
|
108
|
+
|
|
109
|
+
### Adding this plugin in your local NOMAD installation and the source code of NOMAD
|
|
110
|
+
|
|
111
|
+
We now recommend using the dedicated [`nomad-distro-dev`](https://github.com/FAIRmat-NFDI/nomad-distro-dev) repository to simplify the process. Please refer to that repository for detailed instructions.
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
## 🛠️ Development
|
|
115
|
+
|
|
116
|
+
If you want to develop this plugin locally, clone the project and, in the plugin folder, create a virtual environment (you can use Python 3.10, 3.11 or 3.12):
|
|
117
|
+
```sh
|
|
118
|
+
git clone https://github.com/FAIRmat-NFDI/nomad-ml-workflows.git
|
|
119
|
+
cd nomad-ml-workflows
|
|
120
|
+
python3.11 -m venv .pyenv
|
|
121
|
+
. .pyenv/bin/activate
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
Make sure to have `pip` upgraded:
|
|
125
|
+
```sh
|
|
126
|
+
pip install --upgrade pip
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
We recommend installing `uv` for fast pip installation of the packages:
|
|
130
|
+
```sh
|
|
131
|
+
pip install uv
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
Install the plugin in editable mode together with its development dependencies (which include the `nomad-lab` package):
|
|
135
|
+
```sh
|
|
136
|
+
uv pip install -e '.[dev]'
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
### Run linting and auto-formatting
|
|
140
|
+
|
|
141
|
+
We use [Ruff](https://docs.astral.sh/ruff/) for linting and formatting the code. Ruff auto-formatting is also a part of the GitHub workflow actions. You can run it locally:
|
|
142
|
+
```sh
|
|
143
|
+
ruff check .
|
|
144
|
+
ruff format . --check
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
### Debugging
|
|
148
|
+
|
|
149
|
+
For interactive debugging of the tests, use `pytest` with the `--pdb` flag. We recommend using an IDE for debugging, e.g., _VSCode_. If that is the case, add the following snippet to your `.vscode/launch.json`:
|
|
150
|
+
```json
|
|
151
|
+
{
|
|
152
|
+
"configurations": [
|
|
153
|
+
{
|
|
154
|
+
"name": "<descriptive tag>",
|
|
155
|
+
"type": "debugpy",
|
|
156
|
+
"request": "launch",
|
|
157
|
+
"cwd": "${workspaceFolder}",
|
|
158
|
+
"program": "${workspaceFolder}/.pyenv/bin/pytest",
|
|
159
|
+
"justMyCode": true,
|
|
160
|
+
"env": {
|
|
161
|
+
"_PYTEST_RAISE": "1"
|
|
162
|
+
},
|
|
163
|
+
"args": [
|
|
164
|
+
"-sv",
|
|
165
|
+
"--pdb",
|
|
166
|
+
"<path-to-plugin-tests>",
|
|
167
|
+
]
|
|
168
|
+
}
|
|
169
|
+
]
|
|
170
|
+
}
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
where `<path-to-plugin-tests>` must be changed to the local path to the test module to be debugged.
|
|
174
|
+
|
|
175
|
+
The settings configuration file `.vscode/settings.json` automatically applies the linting and formatting upon saving the modified file.
|
|
176
|
+
|
|
177
|
+
### Documentation on GitHub Pages
|
|
178
|
+
|
|
179
|
+
To view the documentation locally, install the related packages using:
|
|
180
|
+
```sh
|
|
181
|
+
uv pip install -r requirements_docs.txt
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
Run the documentation server:
|
|
185
|
+
```sh
|
|
186
|
+
mkdocs serve
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
## 👥 Main contributors
|
|
191
|
+
| Name | E-mail |
|
|
192
|
+
|------|------------|
|
|
193
|
+
| Sarthak Kapoor | [sarthak.kapoor@physik.hu-berlin.de](mailto:sarthak.kapoor@physik.hu-berlin.de) |
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
## 📄 License
|
|
197
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
nomad_ml_workflows/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
nomad_ml_workflows/actions/__init__.py,sha256=ooUE9fX622d8JU8ckbr3tQJuZ3oQWvnJFS3CASCINqc,99
|
|
3
|
+
nomad_ml_workflows/actions/export_entries/__init__.py,sha256=bFxKQr8zSQZQcwJwSfJ2bC35orXWh9COZZVOGY5Jy7k,1681
|
|
4
|
+
nomad_ml_workflows/actions/export_entries/activities.py,sha256=ie7uyQMU6I-9u3yuSASN9v2EWU7SxC8xWO_m11uAY44,6943
|
|
5
|
+
nomad_ml_workflows/actions/export_entries/models.py,sha256=YQRrdCbxK9cY0ukrdUxtn8PKvCVYFrrVsde-bDD5glk,8521
|
|
6
|
+
nomad_ml_workflows/actions/export_entries/utils.py,sha256=g2ZbwJZgPEBOcIdXgCYMRGK1XViNYp9414h4ZvUbHRY,6193
|
|
7
|
+
nomad_ml_workflows/actions/export_entries/workflows.py,sha256=T1png3P8JDoi0Yf7X798xFHEao1ePYXQqSUjzDkXoeg,6890
|
|
8
|
+
nomad_ml_workflows-0.0.6.dist-info/licenses/LICENSE,sha256=Wji1LdkrEkb33BQ4YJUjF9EPtOiz74OQW3XO0omcZ3U,1082
|
|
9
|
+
nomad_ml_workflows-0.0.6.dist-info/METADATA,sha256=2sX3U7TM-II4YLu5ctYRRg2tOkudLNXAViKzrDFEqqo,7433
|
|
10
|
+
nomad_ml_workflows-0.0.6.dist-info/WHEEL,sha256=YCfwYGOYMi5Jhw2fU4yNgwErybb2IX5PEwBKV4ZbdBo,91
|
|
11
|
+
nomad_ml_workflows-0.0.6.dist-info/entry_points.txt,sha256=unxYXwft1X1R3FCg9NyANBqevaAZUZNBQJ7We5ez0eA,93
|
|
12
|
+
nomad_ml_workflows-0.0.6.dist-info/top_level.txt,sha256=1NCxJd5BKyy75vikv64WJSwZL9bdISeeHO5FCAeQI_E,19
|
|
13
|
+
nomad_ml_workflows-0.0.6.dist-info/RECORD,,
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
|
|
2
|
+
The MIT License (MIT)
|
|
3
|
+
|
|
4
|
+
Copyright (c) 2025 Sarthak Kapoor
|
|
5
|
+
|
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
7
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
8
|
+
in the Software without restriction, including without limitation the rights
|
|
9
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
10
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
11
|
+
furnished to do so, subject to the following conditions:
|
|
12
|
+
|
|
13
|
+
The above copyright notice and this permission notice shall be included in
|
|
14
|
+
all copies or substantial portions of the Software.
|
|
15
|
+
|
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
17
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
18
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
19
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
20
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
21
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
22
|
+
THE SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
nomad_ml_workflows
|