nomad-ml-workflows 0.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
@@ -0,0 +1,3 @@
1
+ from nomad_ml_workflows.actions.export_entries import export_entries
2
+
3
+ __all__ = ['export_entries']
@@ -0,0 +1,53 @@
1
+ from nomad.actions import TaskQueue
2
+ from pydantic import Field
3
+ from temporalio import workflow
4
+
5
+ with workflow.unsafe.imports_passed_through():
6
+ from nomad.config.models.plugins import ActionEntryPoint
7
+
8
+
9
class ExportEntriesActionEntryPoint(ActionEntryPoint):
    # Entry point for the Export Entries action. It adds two tunable limits on
    # top of the base entry point and knows how to assemble the action.

    # Upper bound (seconds) applied to every search batch.
    search_batch_timeout: int = Field(
        default=7200,  # 2 hours
        description=(
            'Timeout (in seconds) for each search batch in the Export Entries '
            'action. Set this accordingly to time out longer searches.'
        ),
    )
    # Cap on the number of entries a single run may export.
    max_entries_export_limit: int = Field(
        default=100000,
        description=(
            'Maximum number of entries that can be exported in a single '
            'Export Entries action.'
        ),
    )

    def load(self):
        """Assemble the `Action` tying the export workflow to its activities.

        All imports happen inside the method, so the workflow and activity
        modules are only loaded when the entry point is actually loaded.

        Returns:
            Action: The action bound to this entry point's task queue.
        """
        from nomad.actions import Action

        from nomad_ml_workflows.actions.export_entries.activities import (
            cleanup_artifacts,
            create_artifact_subdirectory,
            export_dataset_to_upload,
            merge_output_files,
            search,
        )
        from nomad_ml_workflows.actions.export_entries.workflows import (
            ExportEntriesWorkflow,
        )

        registered_activities = [
            create_artifact_subdirectory,
            search,
            merge_output_files,
            export_dataset_to_upload,
            cleanup_artifacts,
        ]
        return Action(
            task_queue=self.task_queue,
            workflow=ExportEntriesWorkflow,
            activities=registered_activities,
        )
46
+
47
+
48
# Module-level entry point instance for the Export Entries action. It is
# assigned to the CPU task queue.
export_entries = ExportEntriesActionEntryPoint(
    name='Export Entries Action',
    description='An action to search entries and export them as a zip file in the '
    'specified upload.',
    task_queue=TaskQueue.CPU,
)
@@ -0,0 +1,209 @@
1
+ import json
2
+ import os
3
+ import shutil
4
+ import zipfile
5
+ from datetime import datetime, timezone
6
+
7
+ from nomad.actions.manager import action_artifacts_dir, get_upload_files
8
+ from nomad.files import StagingUploadFiles
9
+ from nomad.search import search as nomad_search
10
+ from temporalio import activity
11
+
12
+ from nomad_ml_workflows.actions.export_entries.models import (
13
+ CleanupArtifactsInput,
14
+ CreateArtifactSubdirectoryInput,
15
+ ExportDatasetInput,
16
+ MergeOutputFilesInput,
17
+ SearchInput,
18
+ SearchOutput,
19
+ )
20
+ from nomad_ml_workflows.actions.export_entries.utils import (
21
+ merge_files,
22
+ write_json_file,
23
+ write_parquet_file,
24
+ )
25
+
26
+
27
@activity.defn
async def create_artifact_subdirectory(data: CreateArtifactSubdirectoryInput) -> str:
    """
    Creates a new subdirectory within the action artifacts directory.

    Args:
        data (CreateArtifactSubdirectoryInput): Input data for creating subdirectory;
            `subdir_name` is the name of the directory to create.

    Returns:
        str: Path to the created subdirectory.

    Raises:
        FileExistsError: If the subdirectory already exists.
    """

    subdir_path = os.path.join(action_artifacts_dir(), data.subdir_name)

    # An explicit exception replaces the former `assert`, which is removed when
    # Python runs with -O. `os.makedirs` refuses to reuse an existing leaf
    # directory by default, so checking and creating happen in a single step.
    try:
        os.makedirs(subdir_path)
    except FileExistsError as e:
        raise FileExistsError(
            f'Artifact subdirectory "{subdir_path}" already exists.'
        ) from e

    return subdir_path
48
+
49
+
50
@activity.defn
async def search(data: SearchInput) -> SearchOutput:
    """
    Activity that runs one NOMAD search batch and stores the hits in a batch file
    (Parquet or JSON) at `data.output_file_path`.

    At most `data.max_entries_export_limit` entries of the returned page are kept.
    When nothing is left to write, no file is produced and the returned
    `pagination_next_page_after_value` is cleared so that no further search is
    started.

    Args:
        data (SearchInput): Input data for the search activity.

    Returns:
        SearchOutput: Timestamps, counts and the pagination cursor of this batch.

    Raises:
        ValueError: If `data.batch_file_type` has no registered writer.
    """

    writers = {
        'parquet': write_parquet_file,
        'json': write_json_file,
    }
    if data.batch_file_type not in writers:
        raise ValueError(f'Unsupported batch file type "{data.batch_file_type}". ')
    write_batch = writers[data.batch_file_type]

    started_at = datetime.now(timezone.utc).isoformat()
    response = nomad_search(
        user_id=data.user_id,
        owner=data.owner,
        query=data.query,
        required=data.required,
        pagination=data.pagination,
        aggregations={},  # aggregations support can be added later
    )
    finished_at = datetime.now(timezone.utc).isoformat()

    # Keep only as many entries as the remaining export limit allows
    limit = data.max_entries_export_limit
    entries = response.data
    if len(entries) > limit:
        entries = entries[:limit]

    exported_count = len(entries)
    output = SearchOutput(
        search_start_time=started_at,
        search_end_time=finished_at,
        num_entries_exported=exported_count,
        num_entries_available=response.pagination.total,
        pagination_next_page_after_value=response.pagination.next_page_after_value,
    )

    if exported_count != 0:
        write_batch(path=data.output_file_path, data=entries)
    else:
        # nothing to write: skip the file and stop subsequent searches
        output.pagination_next_page_after_value = None

    return output
103
+
104
+
105
@activity.defn
async def merge_output_files(data: MergeOutputFilesInput) -> str | None:
    """
    Activity that combines the generated batch files into one file named
    `data.<output_file_type>` inside the artifact subdirectory.

    Args:
        data (MergeOutputFilesInput): Input data for merging files.

    Returns:
        str | None: Path of the merged output file.

    Raises:
        ValueError: If `data.generated_file_paths` is empty.
    """

    batch_paths = data.generated_file_paths
    if not batch_paths:
        raise ValueError('No generated file paths provided for merging.')

    merged_name = f'data.{data.output_file_type}'
    merged_file_path = os.path.join(data.artifact_subdirectory, merged_name)
    merge_files(batch_paths, data.output_file_type, merged_file_path)

    return merged_file_path
127
+
128
+
129
@activity.defn
async def export_dataset_to_upload(data: ExportDatasetInput) -> str:
    """
    Activity to export the generated dataset files to the specified upload, either
    as a zip file or as a directory. A `metadata.json` file is always included.

    Args:
        data (ExportDatasetInput): Input data for exporting the dataset to the upload.

    Returns:
        str: Name of the zip file or directory that was added to the upload.

    Raises:
        ValueError: If the upload cannot be found for the given user.
    """

    def unique_filename(filename: str, upload_files: StagingUploadFiles) -> str:
        """Return `filename`, or its first `name(N)ext` variant missing in the upload."""
        if not upload_files.raw_path_exists(filename):
            return filename

        # The split does not depend on the counter, so it is done only once.
        name, ext = os.path.splitext(filename)
        count = 1
        while True:
            candidate = f'{name}({count}){ext}'
            if not upload_files.raw_path_exists(candidate):
                return candidate
            count += 1

    upload_files = get_upload_files(data.upload_id, data.user_id)
    if not upload_files:
        raise ValueError(
            f'Upload with ID {data.upload_id} for user {data.user_id} not found.'
        )

    # Create a metadata.json file in the artifact subdirectory
    metadata_dict = {
        'note': 'This metadata file contains information about the exported dataset '
        'and the conditions under which it was generated.',
        'data': data.metadata.model_dump(),
        'schema': data.metadata.model_json_schema(),
    }
    metadata_path = os.path.join(data.artifact_subdirectory, 'metadata.json')
    with open(metadata_path, 'w', encoding='utf-8') as metafile:
        json.dump(metadata_dict, metafile, indent=4)

    exportable_filepaths = [*data.source_paths, metadata_path]

    # Create a zip file containing all the source paths and the metadata file
    if data.zip_output:
        # Uniqueness is checked for the name that really lands in the upload,
        # i.e. including the `.zip` suffix. Checking the bare directory name
        # would miss a zip file left by an earlier export with the same name.
        zipname = unique_filename(data.exportable_dir_name + '.zip', upload_files)
        zippath = os.path.join(data.artifact_subdirectory, zipname)
        with zipfile.ZipFile(zippath, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
            for filepath in exportable_filepaths:
                zipf.write(filepath, arcname=os.path.basename(filepath))
        # Add zip file to the NOMAD Upload
        upload_files.add_rawfiles(path=zippath, auto_decompress=False)
        return zipname

    # If not zipping, copy files to directory named exportable_dir_name
    exportable_dir_name = unique_filename(data.exportable_dir_name, upload_files)
    exportable_dir_path = os.path.join(data.artifact_subdirectory, exportable_dir_name)
    os.mkdir(exportable_dir_path)
    for filepath in exportable_filepaths:
        temp_path = os.path.join(exportable_dir_path, os.path.basename(filepath))
        shutil.copy2(filepath, temp_path)
    # Add directory to the NOMAD Upload
    upload_files.add_rawfiles(
        path=exportable_dir_path, target_dir=exportable_dir_name
    )
    return exportable_dir_name
197
+
198
+
199
@activity.defn
async def cleanup_artifacts(data: CleanupArtifactsInput) -> None:
    """
    Activity that removes an artifact subdirectory together with its contents.

    A path that does not exist is silently ignored.

    Args:
        data (CleanupArtifactsInput): Input data for cleaning up artifacts.
    """

    target = data.subdir_path
    if not os.path.exists(target):
        return

    shutil.rmtree(target)
@@ -0,0 +1,245 @@
1
+ import json
2
+ from typing import Literal
3
+
4
+ from nomad.app.v1.models.models import MetadataPagination, MetadataRequired, Query
5
+ from pydantic import BaseModel, Field
6
+
7
# Allowed values for the `owner` fields of the search models.
OwnerLiteral = Literal['public', 'visible', 'shared', 'user', 'staging']
# File formats in which an individual search batch can be written.
BatchFileTypeLiteral = Literal['parquet', 'json']
# File formats available for the final merged output.
OutputFileTypeLiteral = Literal['parquet', 'csv', 'json']
# NOTE(review): not referenced by the models visible in this module; confirm
# whether it is used elsewhere.
IndexLiteral = Literal['entries', 'datasets', 'models', 'spaces']
11
+
12
+
13
class SearchSettings(BaseModel):
    # User-facing options describing what to search for. Documented with
    # comments instead of a docstring so the generated JSON schema is untouched.

    owner: OwnerLiteral = Field(
        default='visible', description='Owner of the entries to be searched.'
    )
    # The query arrives as text and is parsed into a dictionary later on.
    query: str = Field(
        default=...,
        description="""Query for extracting entries. Should be a valid dictionary
string. For example:
{
'entry_type': 'ELNSample'
}""",
        # TODO: add `ui:widget` though `json_schema_extra` after NOMAD UI supports it
    )
    # NOTE(review): both lists default to None although they are annotated as
    # `list[str]`; confirm this is intentional before tightening the type.
    required_include: list[str] = Field(
        default=None,
        description=(
            'List of fields to include in the search results. For example: '
            'results*, data.results*'
        ),
    )
    required_exclude: list[str] = Field(
        default=None,
        description=(
            'List of fields to exclude from the search results. For example: '
            'results.method.method_name'
        ),
    )
36
+
37
+
38
class OutputSettings(BaseModel):
    # User-facing options for the format, batching and packaging of the export.
    # Documented with comments instead of a docstring so the generated JSON
    # schema is untouched.

    output_file_type: OutputFileTypeLiteral = Field(
        default='parquet', description='Type of the output file.'
    )
    # Must be strictly positive (`gt=0`).
    batch_size: int = Field(
        default=1000,
        gt=0,
        description=(
            'Number of entries to be fetched and written per search batch. '
            'Use smaller batch sizes when exporting large entries to reduce memory usage.'
        ),
    )
    zip_output: bool = Field(
        default=True,
        description=(
            'Whether to create a zip file for the output file(s). Set it '
            'to true if you want download the dataset for external use. If you want to '
            'work with the exported data in NOMAD, set it to false. This will export the '
            'dataset as a directory in the specified project.'
        ),
    )
56
+
57
+
58
class ExportEntriesUserInput(BaseModel):
    # Complete user input of the export workflow: the target upload, the acting
    # user, and the nested search and output settings.

    upload_id: str = Field(
        default=...,
        description='Unique identifier for the upload associated with the workflow.',
    )
    user_id: str = Field(
        default=...,
        description='Unique identifier for the user who initiated the workflow.',
    )
    # Both settings groups are required and have no defaults of their own.
    search_settings: SearchSettings
    output_settings: OutputSettings
68
+
69
+
70
class CreateArtifactSubdirectoryInput(BaseModel):
    # Input of the activity that creates an artifact subdirectory.

    subdir_name: str = Field(
        default=..., description='Name of the subdirectory to be created.'
    )
72
+
73
+
74
class SearchInput(BaseModel):
    # Input of a single search batch. Documented with comments instead of a
    # docstring so the generated JSON schema is untouched.

    user_id: str = Field(..., description='User ID performing the search.')
    owner: OwnerLiteral = Field(..., description='Owner of the entries to be searched.')
    query: Query = Field(..., description='Search query parameters.')
    required: MetadataRequired = Field(
        ..., description='Required fields for filtering the search results.'
    )
    pagination: MetadataPagination = Field(
        ..., description='Pagination settings for the search results.'
    )
    batch_file_type: BatchFileTypeLiteral = Field(
        ..., description='Type of the output file.'
    )
    output_file_path: str = Field(..., description='Path to the generated output file.')
    max_entries_export_limit: int = Field(
        ..., description='Maximum number of entries to be exported.'
    )

    @classmethod
    def from_user_input(
        cls,
        user_input: ExportEntriesUserInput,
        /,
        output_file_path: str,
        max_entries_export_limit: int,
    ) -> 'SearchInput':
        """Convert from ExportEntriesUserInput to SearchInput

        The query text is parsed into a dictionary, the include/exclude lists are
        cleaned, the page size is taken from the batch size, and `csv` output is
        mapped to `parquet` batches.

        Args:
            user_input (ExportEntriesUserInput): The user input to convert.
            output_file_path (str): Path of the batch file to be written.
            max_entries_export_limit (int): Maximum number of entries to export.

        Returns:
            SearchInput: The search input derived from the user input.

        Raises:
            json.JSONDecodeError: If the query text cannot be parsed.
        """
        import ast

        def _clean_field(field: str) -> str:
            """
            Removes trailing whitespaces and inverted commas
            """
            return field.strip().strip("'").strip('"')

        def _parse_query(raw_query: str):
            """
            Parses the query text, accepting JSON as well as Python dict syntax
            """
            cleaned = _clean_field(raw_query)

            # 1. Strict JSON keeps apostrophes inside values intact.
            try:
                return json.loads(cleaned)
            except json.JSONDecodeError:
                pass

            # 2. A Python dict literal, e.g. {'name': "O'Brien"}. `literal_eval`
            #    only evaluates literals, so it is safe on user-provided text.
            try:
                literal = ast.literal_eval(cleaned)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                literal = None
            if isinstance(literal, dict):
                return literal

            # 3. Previous behavior: swap the quote style and parse as JSON. A
            #    failure here surfaces as JSONDecodeError, as it did before.
            return json.loads(cleaned.replace("'", '"'))

        query = _parse_query(user_input.search_settings.query)

        required = MetadataRequired()
        if user_input.search_settings.required_include is not None:
            include = [
                _clean_field(field)
                for field in user_input.search_settings.required_include
            ]
            required.include = include if include else None
        if user_input.search_settings.required_exclude:
            exclude = [
                _clean_field(field)
                for field in user_input.search_settings.required_exclude
            ]
            required.exclude = exclude if exclude else None

        pagination = MetadataPagination(page_size=user_input.output_settings.batch_size)

        batch_file_type = user_input.output_settings.output_file_type
        if batch_file_type == 'csv':
            batch_file_type = 'parquet'  # use parquet batches for csv

        return cls(
            user_id=user_input.user_id,
            owner=user_input.search_settings.owner,
            query=query,
            required=required,
            pagination=pagination,
            batch_file_type=batch_file_type,
            output_file_path=output_file_path,
            max_entries_export_limit=max_entries_export_limit,
        )
142
+
143
+
144
class SearchOutput(BaseModel):
    # Result summary of a single search batch.

    num_entries_exported: int = Field(
        default=..., description='Number of entries exported to the output file.'
    )
    num_entries_available: int = Field(
        default=...,
        description='Total number of entries available for the given search query.',
    )
    # Both timestamps are carried as strings.
    search_start_time: str = Field(
        default=..., description='Timestamp when the search started.'
    )
    search_end_time: str = Field(
        default=..., description='Timestamp when the search completed.'
    )
    pagination_next_page_after_value: str | None = Field(
        default=None,
        description=(
            'The next_page_after_value from pagination, if more results are '
            'available.'
        ),
    )
163
+
164
+
165
class MergeOutputFilesInput(BaseModel):
    # Input of the activity that merges the batch files into one output file.

    artifact_subdirectory: str = Field(
        default=...,
        description='Subdirectory where the merged output file will be stored.',
    )
    output_file_type: OutputFileTypeLiteral = Field(
        default=..., description='Type of the output file.'
    )
    generated_file_paths: list[str] = Field(
        default=...,
        description='List of the generated file paths to be merged into a single file.',
    )
178
+
179
+
180
class ExportDatasetMetadata(BaseModel):
    # Information about an export run. Every field has a default, so the model
    # can be created before any search has been executed. Documented with
    # comments instead of a docstring so the generated JSON schema is untouched.

    num_entries_exported: int = Field(
        default=0,
        description=(
            'Total number of entries exported in all the exported dataset '
            'batches.'
        ),
    )
    num_entries_available: int = Field(
        default=0,
        description='Total number of entries available for the given search query.',
    )
    reached_max_entries_limit: bool = Field(
        default=False,
        description=(
            'Indicates whether the export reached the maximum number of '
            'entries allowed. If true, the exported dataset contains the first N entries '
            'up to the maximum limit.'
        ),
    )
    search_start_time: str = Field(
        default='',
        description='Timestamp when the first search batch started.',
    )
    search_end_time: str = Field(
        default='',
        description='Timestamp when the last search batch completed.',
    )
    user_input: ExportEntriesUserInput | None = Field(
        default=None,
        description='Original user input for the export entries workflow.',
    )
    error_info: str | None = Field(
        default=None,
        description=(
            'Error information if any error occurred during the search and '
            'merging process.'
        ),
    )
212
+
213
+
214
class ExportDatasetInput(BaseModel):
    # Input of the activity that adds the exported dataset to an upload.

    user_id: str = Field(
        default=..., description='User ID performing the export dataset operation.'
    )
    upload_id: str = Field(
        default=...,
        description='Upload ID associated with the export dataset operation.',
    )
    artifact_subdirectory: str = Field(
        default=...,
        description='Subdirectory where the exported dataset zip file will be stored.',
    )
    zip_output: bool = Field(
        default=...,
        description='Whether to create a zip file for the exported dataset.',
    )
    exportable_dir_name: str = Field(
        default=...,
        description=(
            'Name of the directory containing the dataset that will be '
            'exported.'
        ),
    )
    source_paths: list[str] = Field(
        default=..., description='List of paths to the source files of the dataset.'
    )
    metadata: ExportDatasetMetadata = Field(
        default=..., description='Metadata associated with the exported dataset.'
    )
240
+
241
+
242
class CleanupArtifactsInput(BaseModel):
    # Input of the activity that removes an artifact subdirectory.

    subdir_path: str = Field(
        default=..., description='Path to the subdirectory to be cleaned up.'
    )
+ )
@@ -0,0 +1,172 @@
1
+ import json
2
+
3
+ import json_stream
4
+ from nomad.utils import dict_to_dataframe
5
+
6
+ try:
7
+ import pyarrow as pa
8
+ import pyarrow.csv as pcsv
9
+ import pyarrow.dataset as ds
10
+ import pyarrow.parquet as pq
11
+ except ImportError as e:
12
+ raise ImportError(
13
+ 'pyarrow is required. Install with: pip install nomad-ml-workflows[cpu-action]'
14
+ ) from e
15
+
16
+
17
def _is_nested_type(dtype: pa.DataType) -> bool:
    """Report whether PyArrow classifies `dtype` as a nested type."""
    nested = pa.types.is_nested(dtype)
    return nested
20
+
21
+
22
def _get_csv_compatible_schema(schema: pa.Schema) -> pa.Schema:
    """Build a schema for CSV output in which nested fields become string fields.

    Non-nested fields are passed through unchanged; nested ones keep their name
    and nullability.
    """
    return pa.schema(
        [
            pa.field(column.name, pa.string(), column.nullable)
            if _is_nested_type(column.type)
            else column
            for column in schema
        ]
    )
31
+
32
+
33
def _stringify_nested_columns(batch: pa.RecordBatch) -> pa.RecordBatch:
    """Convert nested columns (list, struct) in a batch to JSON strings.

    Args:
        batch (pa.RecordBatch): The batch whose nested columns are converted.

    Returns:
        pa.RecordBatch: A batch with the CSV-compatible schema, in which every
            nested column holds JSON strings (nulls stay null) and all other
            columns are reused as they are.
    """
    new_columns = []
    for field, column in zip(batch.schema, batch.columns):
        if not _is_nested_type(field.type):
            new_columns.append(column)
            continue

        # Convert the column to Python objects once, instead of calling
        # `as_py()` twice on every value.
        values = column.to_pylist()
        new_columns.append(
            pa.array(
                [None if value is None else json.dumps(value) for value in values],
                type=pa.string(),
            )
        )

    return pa.RecordBatch.from_arrays(
        new_columns, schema=_get_csv_compatible_schema(batch.schema)
    )
53
+
54
+
55
def write_parquet_file(path: str, data: list[dict]):
    """Store a list of NOMAD entry dicts as one parquet file.

    Args:
        path (str): Destination path; it has to end with `parquet`.
        data (list[dict]): The NOMAD entry dicts to be stored.

    Raises:
        ValueError: If `path` does not end with `parquet`.
    """
    if not path.endswith('parquet'):
        raise ValueError('Unsupported file type. Please use parquet.')

    table = pa.Table.from_pandas(dict_to_dataframe(data))

    writer_options = {
        'compression': 'snappy',  # snappy for faster write/read for individual files
        'use_dictionary': True,
    }
    with pq.ParquetWriter(path, table.schema, **writer_options) as parquet_writer:
        parquet_writer.write_table(table)
75
+
76
+
77
def write_csv_file(path: str, data: list[dict]):
    """Store a list of NOMAD entry dicts as one CSV file with a header row.

    Args:
        path (str): Destination path; it has to end with `csv`.
        data (list[dict]): The NOMAD entry dicts to be stored.

    Raises:
        ValueError: If `path` does not end with `csv`.
    """
    if path.endswith('csv'):
        frame = dict_to_dataframe(data)
        frame.to_csv(path, index=False, mode='w', header=True)
        return

    raise ValueError('Unsupported file type. Please use csv.')
90
+
91
+
92
def write_json_file(path: str, data: list[dict]):
    """Writes a list of NOMAD entry dicts to a JSON file.

    Args:
        path (str): The path where the file will be saved. Must end with `json`.
        data (list[dict]): The list of NOMAD entry dicts to be written to the file.

    Raises:
        ValueError: If `path` does not end with `json`.
    """
    if not path.endswith('json'):
        raise ValueError('Unsupported file type. Please use json.')

    # The encoding is stated explicitly so that it does not depend on the
    # platform default and matches the UTF-8 reader used when merging batches.
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4)
104
+
105
+
106
def merge_files(
    input_file_paths: list[str], output_file_type: str, output_file_path: str
):
    """Combine several batch files into one output file.

    For `parquet` and `csv` output the inputs are read as Parquet files; for
    `json` output the inputs are read as JSON files.

    Args:
        input_file_paths (list[str]): Paths of the batch files to combine.
        output_file_type (str): Format of the result ('parquet', 'csv', or
            'json').
        output_file_path (str): Where the combined file is written.

    Raises:
        ValueError: If `output_file_type` is not one of the supported formats.
    """
    if output_file_type not in ('parquet', 'csv', 'json'):
        raise ValueError('Unsupported file type. Please use parquet, csv, or json.')

    if output_file_type == 'json':
        # Stream entry dicts one at a time and wrap them in a single JSON list
        with open(output_file_path, 'w', encoding='utf-8') as merged:
            merged.write('[\n')
            separator = ''
            for source_path in input_file_paths:
                with open(source_path, encoding='utf-8') as source:
                    for entry in json_stream.load(source):
                        merged.write(separator)
                        # Convert transient json_stream object to standard types
                        json.dump(
                            json_stream.to_standard_types(entry), merged, indent=4
                        )
                        separator = ',\n'
            merged.write('\n]')
        return

    # The batch files for both `parquet` and `csv` output are Parquet files. They
    # are opened as one dataset and processed batch by batch.
    dataset = ds.dataset(input_file_paths, format='parquet')

    if output_file_type == 'parquet':
        with pq.ParquetWriter(
            output_file_path,
            dataset.schema,
            compression='zstd',  # for better compression for merged file
            compression_level=3,
            use_dictionary=True,
        ) as parquet_writer:
            for record_batch in dataset.to_batches():
                parquet_writer.write_batch(record_batch)
        return

    # CSV: nested columns (list, struct, etc.) are written as JSON strings,
    # because the PyArrow CSV writer does not support nested types.
    csv_schema = _get_csv_compatible_schema(dataset.schema)
    with pcsv.CSVWriter(output_file_path, csv_schema) as csv_writer:
        for record_batch in dataset.to_batches():
            csv_writer.write_batch(_stringify_nested_columns(record_batch))
@@ -0,0 +1,169 @@
1
+ from datetime import timedelta
2
+
3
+ from temporalio import workflow
4
+ from temporalio.common import RetryPolicy
5
+ from temporalio.exceptions import ApplicationError
6
+
7
+ with workflow.unsafe.imports_passed_through():
8
+ from nomad.config import config as nomad_config
9
+
10
+ from nomad_ml_workflows.actions.export_entries.activities import (
11
+ cleanup_artifacts,
12
+ create_artifact_subdirectory,
13
+ export_dataset_to_upload,
14
+ merge_output_files,
15
+ search,
16
+ )
17
+ from nomad_ml_workflows.actions.export_entries.models import (
18
+ CleanupArtifactsInput,
19
+ CreateArtifactSubdirectoryInput,
20
+ ExportDatasetInput,
21
+ ExportDatasetMetadata,
22
+ ExportEntriesUserInput,
23
+ MergeOutputFilesInput,
24
+ SearchInput,
25
+ )
26
+
27
+
28
@workflow.defn
class ExportEntriesWorkflow:
    @workflow.run
    async def run(self, data: ExportEntriesUserInput) -> str:
        """
        Workflow to search entries and export them into a datafile in the specified
        upload.

        The search runs in batches until there are no more pages or the configured
        export limit is reached. The batch files are then merged and exported to
        the upload. If searching or merging fails, the error is recorded in the
        metadata and an error export is still written before the workflow fails.
        The artifact subdirectory is cleaned up in every case.

        Args:
            data (ExportEntriesUserInput): Input data for the export entries workflow.

        Returns:
            str: Path to the saved dataset in the upload's `raw` folder.

        Raises:
            ApplicationError: If an error occurs while searching or merging.
        """
        retry_policy = RetryPolicy(
            maximum_attempts=1,
            initial_interval=timedelta(seconds=10),
            maximum_interval=timedelta(minutes=1),
            backoff_coefficient=2.0,
        )
        artifact_subdirectory = await workflow.execute_activity(
            create_artifact_subdirectory,
            CreateArtifactSubdirectoryInput(subdir_name=workflow.info().workflow_id),
            start_to_close_timeout=timedelta(minutes=10),
            retry_policy=retry_policy,
        )
        # Prepared up front so that the `finally` block can export the metadata
        # (including error information) even when the search fails early.
        export_dataset_input = ExportDatasetInput(
            user_id=data.user_id,
            upload_id=data.upload_id,
            artifact_subdirectory=artifact_subdirectory,
            exportable_dir_name='export_entries_error',  # name used in case of error
            zip_output=data.output_settings.zip_output,
            source_paths=[],
            metadata=ExportDatasetMetadata(user_input=data),
        )

        try:
            config = nomad_config.get_plugin_entry_point(
                'nomad_ml_workflows.actions:export_entries'
            )

            search_counter = 0
            num_entries_available = 0
            generated_file_paths = []
            search_start_times = []
            search_end_times = []
            total_num_entries_exported = 0
            reached_max_entries_limit = False
            search_input = SearchInput.from_user_input(
                data,
                output_file_path='',  # Placeholder, will be set in loop
                max_entries_export_limit=config.max_entries_export_limit,
            )
            while True:
                search_counter += 1
                search_input.output_file_path = (
                    f'{artifact_subdirectory}/{search_counter}.'
                    f'{search_input.batch_file_type}'
                )
                search_output = await workflow.execute_activity(
                    search,
                    search_input,
                    activity_id=f'search-activity-{search_counter}',
                    start_to_close_timeout=timedelta(
                        seconds=config.search_batch_timeout
                    ),
                    retry_policy=retry_policy,
                )
                if search_counter == 1:
                    # capture the total available entries from the first search output
                    num_entries_available = search_output.num_entries_available
                if search_output.num_entries_exported > 0:
                    # only save paths if the writing files was not skipped
                    generated_file_paths.append(search_input.output_file_path)
                # NOTE(review): timestamps and counts are assumed to be recorded
                # for every batch, including an empty one; confirm.
                search_start_times.append(search_output.search_start_time)
                search_end_times.append(search_output.search_end_time)
                total_num_entries_exported += search_output.num_entries_exported
                # Update pagination for next iteration
                search_input.pagination.page_after_value = (
                    search_output.pagination_next_page_after_value
                )
                # The remaining budget shrinks by what this batch exported
                search_input.max_entries_export_limit -= (
                    search_output.num_entries_exported
                )

                if search_output.pagination_next_page_after_value is None:
                    # break if there are no more pages to fetch
                    break
                if search_input.max_entries_export_limit <= 0:
                    # break early if the max entries limit has been reached
                    reached_max_entries_limit = True
                    break

            merged_file_path = await workflow.execute_activity(
                merge_output_files,
                MergeOutputFilesInput(
                    artifact_subdirectory=artifact_subdirectory,
                    output_file_type=data.output_settings.output_file_type,
                    generated_file_paths=generated_file_paths,
                ),
                start_to_close_timeout=timedelta(hours=2),
                retry_policy=retry_policy,
            )

            # Prepare export dataset input and metadata
            export_dataset_input.exportable_dir_name = (
                'export_entries_' + search_start_times[0].replace(':', '-')
            )
            export_dataset_input.source_paths = [merged_file_path]
            export_dataset_input.metadata = ExportDatasetMetadata(
                num_entries_exported=total_num_entries_exported,
                num_entries_available=num_entries_available,
                reached_max_entries_limit=reached_max_entries_limit,
                search_start_time=search_start_times[0],
                search_end_time=search_end_times[-1],
                user_input=data,
            )

        except Exception as e:
            # Capture error info to include in metadata
            import traceback

            export_dataset_input.metadata.error_info = traceback.format_exc()
            raise ApplicationError(
                'Encountered an error during export entries workflow.',
            ) from e

        finally:
            # The cleanup sits in its own `finally` so that the artifact
            # subdirectory is removed even when the export activity fails.
            try:
                saved_dataset_path = await workflow.execute_activity(
                    export_dataset_to_upload,
                    export_dataset_input,
                    start_to_close_timeout=timedelta(hours=2),
                    retry_policy=retry_policy,
                )
            finally:
                await workflow.execute_activity(
                    cleanup_artifacts,
                    CleanupArtifactsInput(subdir_path=artifact_subdirectory),
                    start_to_close_timeout=timedelta(hours=2),
                    retry_policy=retry_policy,
                )

        return saved_dataset_path
@@ -0,0 +1,197 @@
1
+ Metadata-Version: 2.4
2
+ Name: nomad-ml-workflows
3
+ Version: 0.0.6
4
+ Summary: A NOMAD plugin for managing ML workflows.
5
+ Author-email: Sarthak Kapoor <sarthak.kapoor@physik.hu-berlin.de>
6
+ Maintainer-email: Sarthak Kapoor <sarthak.kapoor@physik.hu-berlin.de>
7
+ License:
8
+ The MIT License (MIT)
9
+
10
+ Copyright (c) 2025 Sarthak Kapoor
11
+
12
+ Permission is hereby granted, free of charge, to any person obtaining a copy
13
+ of this software and associated documentation files (the "Software"), to deal
14
+ in the Software without restriction, including without limitation the rights
15
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16
+ copies of the Software, and to permit persons to whom the Software is
17
+ furnished to do so, subject to the following conditions:
18
+
19
+ The above copyright notice and this permission notice shall be included in
20
+ all copies or substantial portions of the Software.
21
+
22
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
28
+ THE SOFTWARE.
29
+
30
+ Project-URL: Repository, https://github.com/FAIRmat-NFDI/nomad-ml-workflows
31
+ Classifier: Intended Audience :: Developers
32
+ Classifier: Operating System :: OS Independent
33
+ Classifier: Programming Language :: Python
34
+ Classifier: Programming Language :: Python :: 3.10
35
+ Classifier: Programming Language :: Python :: 3.11
36
+ Classifier: Programming Language :: Python :: 3.12
37
+ Classifier: License :: OSI Approved :: MIT License
38
+ Requires-Python: >=3.10
39
+ Description-Content-Type: text/markdown
40
+ License-File: LICENSE
41
+ Requires-Dist: nomad-lab>=1.4.0
42
+ Requires-Dist: json-stream
43
+ Requires-Dist: pydantic
44
+ Requires-Dist: temporalio
45
+ Provides-Extra: dev
46
+ Requires-Dist: nomad-lab[infrastructure]>=1.4.0; extra == "dev"
47
+ Requires-Dist: ruff; extra == "dev"
48
+ Requires-Dist: pytest; extra == "dev"
49
+ Requires-Dist: structlog; extra == "dev"
50
+ Requires-Dist: mkdocs; extra == "dev"
51
+ Requires-Dist: mkdocs-material>=8.1.1; extra == "dev"
52
+ Requires-Dist: pymdown-extensions; extra == "dev"
53
+ Requires-Dist: mkdocs-click; extra == "dev"
54
+ Requires-Dist: pytest-asyncio; extra == "dev"
55
+ Provides-Extra: cpu-action
56
+ Requires-Dist: pyarrow; extra == "cpu-action"
57
+ Dynamic: license-file
58
+
59
+ # nomad-ml-workflows
60
+
61
+ A NOMAD plugin for managing ML workflows. Currently, it provides an action to export a large number of entries from the NOMAD database as tabular data files. Other ML-workflow-related actions and schemas will be added in the future.
62
+
63
+ ## 📦 Installation
64
+ You can install the plugin using pip:
65
+ ```sh
66
+ pip install "nomad-ml-workflows @ git+https://github.com/FAIRmat-NFDI/nomad-ml-workflows.git"
67
+ ```
68
+
69
+ However, to fully utilize the plugin, you need to add it to your NOMAD instance as described [below](#-adding-this-plugin-to-nomad).
70
+
71
+ ## ✨ Features
72
+
73
+ - Export a large number of NOMAD entries as tabular data files (CSV, Parquet) using NOMAD Actions. Once the action is triggered, it will:
74
+ - Search entries based on user-defined criteria.
75
+ - Optionally include or exclude data fields from the entries.
76
+ - Package the entries into tabular data files like CSV or Parquet (or as JSON)
77
+ - Export the files to a specified Project (previously known as an Upload) in NOMAD.
78
+
79
+ These can then be downloaded from the NOMAD web interface for local use.
80
+
81
+ ## ⚙️ Configuration
82
+ The Export Entries action can be configured using the following parameters in
83
+ the `nomad.yaml` configuration file of your NOMAD Oasis instance:
84
+
85
+ ```yaml
86
+ plugins:
87
+ entry_points:
88
+ options:
89
+ nomad_ml_workflows.actions:export_entries:
90
+ search_batch_timeout: 7200
91
+ # Timeout (in seconds) for each search batch in the Export Entries
92
+ # action. Set this accordingly to time out longer searches.
93
+ max_entries_export_limit: 100000
94
+ # Maximum number of entries that can be exported in a single
95
+ # Export Entries action.
96
+ ```
97
+
98
+
99
+ ## 🚀 Adding this plugin to NOMAD
100
+
101
+ Currently, NOMAD has two distinct flavors that are relevant depending on your role as a user:
102
+ 1. [A NOMAD Oasis](#adding-this-plugin-in-your-nomad-oasis): any user with a NOMAD Oasis instance.
103
+ 2. [Local NOMAD installation and the source code of NOMAD](#adding-this-plugin-in-your-local-nomad-installation-and-the-source-code-of-nomad): internal developers.
104
+
105
+ ### Adding this plugin in your NOMAD Oasis
106
+
107
+ Read the [NOMAD plugin documentation](https://nomad-lab.eu/prod/v1/staging/docs/howto/oasis/plugins_install.html) for all details on how to deploy the plugin on your NOMAD instance.
108
+
109
+ ### Adding this plugin in your local NOMAD installation and the source code of NOMAD
110
+
111
+ We now recommend using the dedicated [`nomad-distro-dev`](https://github.com/FAIRmat-NFDI/nomad-distro-dev) repository to simplify the process. Please refer to that repository for detailed instructions.
112
+
113
+
114
+ ## 🛠️ Development
115
+
116
+ If you want to develop this plugin locally, clone the project and, in the plugin folder, create a virtual environment (you can use Python 3.10, 3.11 or 3.12):
117
+ ```sh
118
+ git clone https://github.com/FAIRmat-NFDI/nomad-ml-workflows.git
119
+ cd nomad-ml-workflows
120
+ python3.11 -m venv .pyenv
121
+ . .pyenv/bin/activate
122
+ ```
123
+
124
+ Make sure to have `pip` upgraded:
125
+ ```sh
126
+ pip install --upgrade pip
127
+ ```
128
+
129
+ We recommend installing `uv` for fast pip installation of the packages:
130
+ ```sh
131
+ pip install uv
132
+ ```
133
+
134
+ Install the plugin in editable mode together with its development dependencies (which include the `nomad-lab` package):
135
+ ```sh
136
+ uv pip install -e '.[dev]'
137
+ ```
138
+
139
+ ### Run linting and auto-formatting
140
+
141
+ We use [Ruff](https://docs.astral.sh/ruff/) for linting and formatting the code. Ruff auto-formatting is also a part of the GitHub workflow actions. You can run it locally:
142
+ ```sh
143
+ ruff check .
144
+ ruff format . --check
145
+ ```
146
+
147
+ ### Debugging
148
+
149
+ For interactive debugging of the tests, use `pytest` with the `--pdb` flag. We recommend using an IDE for debugging, e.g., _VSCode_. If that is the case, add the following snippet to your `.vscode/launch.json`:
150
+ ```json
151
+ {
152
+ "configurations": [
153
+ {
154
+ "name": "<descriptive tag>",
155
+ "type": "debugpy",
156
+ "request": "launch",
157
+ "cwd": "${workspaceFolder}",
158
+ "program": "${workspaceFolder}/.pyenv/bin/pytest",
159
+ "justMyCode": true,
160
+ "env": {
161
+ "_PYTEST_RAISE": "1"
162
+ },
163
+ "args": [
164
+ "-sv",
165
+ "--pdb",
166
+ "<path-to-plugin-tests>",
167
+ ]
168
+ }
169
+ ]
170
+ }
171
+ ```
172
+
173
+ where `<path-to-plugin-tests>` must be changed to the local path to the test module to be debugged.
174
+
175
+ The settings configuration file `.vscode/settings.json` automatically applies the linting and formatting upon saving the modified file.
176
+
177
+ ### Documentation on GitHub Pages
178
+
179
+ To view the documentation locally, install the related packages using:
180
+ ```sh
181
+ uv pip install -r requirements_docs.txt
182
+ ```
183
+
184
+ Run the documentation server:
185
+ ```sh
186
+ mkdocs serve
187
+ ```
188
+
189
+
190
+ ## 👥 Main contributors
191
+ | Name | E-mail |
192
+ |------|------------|
193
+ | Sarthak Kapoor | [sarthak.kapoor@physik.hu-berlin.de](mailto:sarthak.kapoor@physik.hu-berlin.de)
194
+
195
+
196
+ ## 📄 License
197
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
@@ -0,0 +1,13 @@
1
+ nomad_ml_workflows/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ nomad_ml_workflows/actions/__init__.py,sha256=ooUE9fX622d8JU8ckbr3tQJuZ3oQWvnJFS3CASCINqc,99
3
+ nomad_ml_workflows/actions/export_entries/__init__.py,sha256=bFxKQr8zSQZQcwJwSfJ2bC35orXWh9COZZVOGY5Jy7k,1681
4
+ nomad_ml_workflows/actions/export_entries/activities.py,sha256=ie7uyQMU6I-9u3yuSASN9v2EWU7SxC8xWO_m11uAY44,6943
5
+ nomad_ml_workflows/actions/export_entries/models.py,sha256=YQRrdCbxK9cY0ukrdUxtn8PKvCVYFrrVsde-bDD5glk,8521
6
+ nomad_ml_workflows/actions/export_entries/utils.py,sha256=g2ZbwJZgPEBOcIdXgCYMRGK1XViNYp9414h4ZvUbHRY,6193
7
+ nomad_ml_workflows/actions/export_entries/workflows.py,sha256=T1png3P8JDoi0Yf7X798xFHEao1ePYXQqSUjzDkXoeg,6890
8
+ nomad_ml_workflows-0.0.6.dist-info/licenses/LICENSE,sha256=Wji1LdkrEkb33BQ4YJUjF9EPtOiz74OQW3XO0omcZ3U,1082
9
+ nomad_ml_workflows-0.0.6.dist-info/METADATA,sha256=2sX3U7TM-II4YLu5ctYRRg2tOkudLNXAViKzrDFEqqo,7433
10
+ nomad_ml_workflows-0.0.6.dist-info/WHEEL,sha256=YCfwYGOYMi5Jhw2fU4yNgwErybb2IX5PEwBKV4ZbdBo,91
11
+ nomad_ml_workflows-0.0.6.dist-info/entry_points.txt,sha256=unxYXwft1X1R3FCg9NyANBqevaAZUZNBQJ7We5ez0eA,93
12
+ nomad_ml_workflows-0.0.6.dist-info/top_level.txt,sha256=1NCxJd5BKyy75vikv64WJSwZL9bdISeeHO5FCAeQI_E,19
13
+ nomad_ml_workflows-0.0.6.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [nomad.plugin]
2
+ export_entries_action_entry_point = nomad_ml_workflows.actions:export_entries
@@ -0,0 +1,22 @@
1
+
2
+ The MIT License (MIT)
3
+
4
+ Copyright (c) 2025 Sarthak Kapoor
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in
14
+ all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ THE SOFTWARE.
@@ -0,0 +1 @@
1
+ nomad_ml_workflows