runnable-0.50.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- extensions/README.md +0 -0
- extensions/__init__.py +0 -0
- extensions/catalog/README.md +0 -0
- extensions/catalog/any_path.py +214 -0
- extensions/catalog/file_system.py +52 -0
- extensions/catalog/minio.py +72 -0
- extensions/catalog/pyproject.toml +14 -0
- extensions/catalog/s3.py +11 -0
- extensions/job_executor/README.md +0 -0
- extensions/job_executor/__init__.py +236 -0
- extensions/job_executor/emulate.py +70 -0
- extensions/job_executor/k8s.py +553 -0
- extensions/job_executor/k8s_job_spec.yaml +37 -0
- extensions/job_executor/local.py +35 -0
- extensions/job_executor/local_container.py +161 -0
- extensions/job_executor/pyproject.toml +16 -0
- extensions/nodes/README.md +0 -0
- extensions/nodes/__init__.py +0 -0
- extensions/nodes/conditional.py +301 -0
- extensions/nodes/fail.py +78 -0
- extensions/nodes/loop.py +394 -0
- extensions/nodes/map.py +477 -0
- extensions/nodes/parallel.py +281 -0
- extensions/nodes/pyproject.toml +15 -0
- extensions/nodes/stub.py +93 -0
- extensions/nodes/success.py +78 -0
- extensions/nodes/task.py +156 -0
- extensions/pipeline_executor/README.md +0 -0
- extensions/pipeline_executor/__init__.py +871 -0
- extensions/pipeline_executor/argo.py +1266 -0
- extensions/pipeline_executor/emulate.py +119 -0
- extensions/pipeline_executor/local.py +226 -0
- extensions/pipeline_executor/local_container.py +369 -0
- extensions/pipeline_executor/mocked.py +159 -0
- extensions/pipeline_executor/pyproject.toml +16 -0
- extensions/run_log_store/README.md +0 -0
- extensions/run_log_store/__init__.py +0 -0
- extensions/run_log_store/any_path.py +100 -0
- extensions/run_log_store/chunked_fs.py +122 -0
- extensions/run_log_store/chunked_minio.py +141 -0
- extensions/run_log_store/file_system.py +91 -0
- extensions/run_log_store/generic_chunked.py +549 -0
- extensions/run_log_store/minio.py +114 -0
- extensions/run_log_store/pyproject.toml +15 -0
- extensions/secrets/README.md +0 -0
- extensions/secrets/dotenv.py +62 -0
- extensions/secrets/pyproject.toml +15 -0
- runnable/__init__.py +108 -0
- runnable/catalog.py +141 -0
- runnable/cli.py +484 -0
- runnable/context.py +730 -0
- runnable/datastore.py +1058 -0
- runnable/defaults.py +159 -0
- runnable/entrypoints.py +390 -0
- runnable/exceptions.py +137 -0
- runnable/executor.py +561 -0
- runnable/gantt.py +1646 -0
- runnable/graph.py +501 -0
- runnable/names.py +546 -0
- runnable/nodes.py +593 -0
- runnable/parameters.py +217 -0
- runnable/pickler.py +96 -0
- runnable/sdk.py +1277 -0
- runnable/secrets.py +92 -0
- runnable/tasks.py +1268 -0
- runnable/telemetry.py +142 -0
- runnable/utils.py +423 -0
- runnable-0.50.0.dist-info/METADATA +189 -0
- runnable-0.50.0.dist-info/RECORD +72 -0
- runnable-0.50.0.dist-info/WHEEL +4 -0
- runnable-0.50.0.dist-info/entry_points.txt +53 -0
- runnable-0.50.0.dist-info/licenses/LICENSE +201 -0
runnable/telemetry.py
ADDED
@@ -0,0 +1,142 @@
"""
Telemetry support for runnable pipelines.

Uses logfire-api for zero-dependency instrumentation.
If logfire is installed, spans are emitted. If not, all calls are no-ops.

For real-time streaming (e.g., FastAPI SSE), use StreamingSpanProcessor.
"""

import json
from contextvars import ContextVar
from queue import Queue
from typing import Any, Optional

import logfire_api as logfire  # noqa: F401 - re-exported for convenience

# Context var for active stream queue (set by FastAPI when SSE is active)
_stream_queue: ContextVar[Optional[Queue]] = ContextVar("stream_queue", default=None)


def truncate_value(value: Any, max_bytes: int = 256) -> Any:
    """
    Truncate a single serialized value to max_bytes.

    Args:
        value: Any JSON-serializable value
        max_bytes: Maximum length for string representation

    Returns:
        The serialized value (truncated if its string representation exceeds max_bytes)
    """
    try:
        serialized = json.dumps(value, default=str)
        if len(serialized) > max_bytes:
            # Return truncated string representation
            return serialized[: max_bytes - 3] + "..."
        return serialized
    except Exception:
        return f"<unserializable: {type(value).__name__}>"


def set_stream_queue(q: Optional[Queue]) -> None:
    """
    Set the queue for streaming spans.

    Called by the FastAPI endpoint to enable real-time span streaming.

    Args:
        q: Queue to push span data to, or None to disable streaming
    """
    _stream_queue.set(q)


def get_stream_queue() -> Optional[Queue]:
    """
    Get the current stream queue.

    Returns:
        The active Queue if SSE streaming is enabled, None otherwise
    """
    return _stream_queue.get()


# Optional OTEL imports for streaming processor
try:
    from opentelemetry.sdk.trace import ReadableSpan, SpanProcessor

    OTEL_AVAILABLE = True
except ImportError:
    OTEL_AVAILABLE = False
    SpanProcessor = object  # type: ignore
    ReadableSpan = object  # type: ignore


if OTEL_AVAILABLE:

    class StreamingSpanProcessor(SpanProcessor):
        """
        SpanProcessor that:
        1. Always forwards to the base processor (collector export) if provided
        2. Also pushes to the stream queue if SSE is active

        This enables dual output: persistent collector storage AND
        real-time streaming to the UI.
        """

        def __init__(self, base_processor: Optional[SpanProcessor] = None):
            """
            Initialize the streaming processor.

            Args:
                base_processor: Optional underlying processor for collector export
            """
            self.base_processor = base_processor

        def on_start(self, span, parent_context=None):
            """Called when a span starts."""
            if self.base_processor:
                self.base_processor.on_start(span, parent_context)

            q = _stream_queue.get()
            if q is not None:
                q.put_nowait(
                    {
                        "type": "span_start",
                        "name": span.name,
                        "span_id": format(span.context.span_id, "016x"),
                    }
                )

        def on_end(self, span: ReadableSpan):
            """Called when a span ends."""
            if self.base_processor:
                self.base_processor.on_end(span)

            q = _stream_queue.get()
            if q is not None:
                q.put_nowait(
                    {
                        "type": "span_end",
                        "name": span.name,
                        "span_id": format(span.context.span_id, "016x"),
                        "status": span.status.status_code.name,
                        "duration_ms": (span.end_time - span.start_time)  # type: ignore
                        / 1_000_000,  # ty: ignore
                        "attributes": dict(span.attributes) if span.attributes else {},
                    }
                )

        def shutdown(self):
            """Shutdown the processor."""
            if self.base_processor:
                self.base_processor.shutdown()

        def force_flush(self, timeout_millis=None):
            """Force flush any pending spans."""
            if self.base_processor:
                self.base_processor.force_flush(timeout_millis)  # ty: ignore

else:
    # Placeholder when OTEL is not installed
    StreamingSpanProcessor = None  # type: ignore
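
The processor above fans every span out to two sinks: the wrapped base processor and, when one has been registered for the current context, the SSE stream queue. The sketch below is not part of the package; it is a minimal wiring example assuming opentelemetry-sdk is installed, and the ConsoleSpanExporter, span name, and drain loop are illustrative stand-ins for a real collector exporter and real pipeline spans.

# Illustrative wiring of StreamingSpanProcessor; exporter and span names are
# assumptions, not taken from the package.
from queue import Empty, Queue

from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter

from runnable.telemetry import StreamingSpanProcessor, set_stream_queue

# Dual output: spans go to the (console) exporter AND to the stream queue.
provider = TracerProvider()
provider.add_span_processor(
    StreamingSpanProcessor(base_processor=BatchSpanProcessor(ConsoleSpanExporter()))
)
trace.set_tracer_provider(provider)

events: Queue = Queue()
set_stream_queue(events)  # enable streaming for the current context

with trace.get_tracer("demo").start_as_current_span("pipeline.step"):
    pass  # pipeline work would happen here

# An SSE endpoint would drain the queue roughly like this:
try:
    while True:
        print(events.get_nowait())  # {"type": "span_start"/"span_end", ...}
except Empty:
    pass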
runnable/utils.py
ADDED
@@ -0,0 +1,423 @@
from __future__ import annotations

import hashlib
import json
import logging
import os
import random
import string
import subprocess
import time
from collections import OrderedDict
from pathlib import Path
from string import Template as str_template
from typing import Any, Dict, Optional, Tuple, Union

from ruamel.yaml import YAML

import runnable.context as context
from runnable import console, defaults
from runnable.defaults import IterableParameterModel

logger = logging.getLogger(defaults.LOGGER_NAME)
logging.getLogger("stevedore").setLevel(logging.CRITICAL)


def does_file_exist(file_path: str) -> bool:
    """Check if a file exists.
    Implemented here to avoid repetition of logic.

    Args:
        file_path (str): The file path to check

    Returns:
        bool: True if the file exists, False otherwise
    """
    my_file = Path(file_path)
    return my_file.is_file()


def does_dir_exist(file_path: Union[str, Path]) -> bool:
    """Check if a directory exists.
    Implemented here to avoid repetition of logic.

    Args:
        file_path (str or Path): The directory path to check

    Returns:
        bool: True if the directory exists, False otherwise
    """
    my_file = Path(file_path)
    return my_file.is_dir()


def safe_make_dir(directory: Union[str, Path]):
    """Safely make the directory.
    Ignore if it exists and create the parents if necessary.

    Args:
        directory (str): The directory path to create
    """
    Path(directory).mkdir(parents=True, exist_ok=True)


def apply_variables(
    apply_to: Dict[str, Any], variables: Dict[str, str]
) -> Dict[str, Any]:
    """Safely applies the variables to a config.

    For example, for the config {'a': '${b}'}, the placeholder ${b} is
    replaced by the value of b from the variables.

    If ${b} does not exist in the variables, it is left untouched in the config.

    Args:
        apply_to (dict): The config to apply variables to
        variables (dict): The variables as key-value pairs

    Raises:
        Exception: If variables is not a dict

    Returns:
        dict: A transformed dict with variables applied
    """
    if not isinstance(variables, dict):
        raise Exception("Argument Variables should be dict")

    json_d = json.dumps(apply_to)
    string_template = str_template(json_d)

    template = string_template.safe_substitute(variables)

    if "$" in template:
        logger.warning(
            "Not all placeholders found in the config are present in the variables"
        )

    return json.loads(template)


def get_module_and_attr_names(command: str) -> Tuple[str, str]:
    """Given a string of the form module.function, returns the module and function names.

    It also checks that the string is of the expected 'module.func' format.

    Args:
        command (str): String of format module.function_name

    Raises:
        Exception: If the string is not of the expected format

    Returns:
        Tuple[str, str]: (module_name, function_name) extracted from the input string
    """
    mods = command.split(".")
    if len(mods) <= 1:
        raise Exception("The command should be a function to call")
    func = mods[-1]
    module = ".".join(mods[:-1])
    return module, func


def load_yaml(file_path: str, load_type: str = "safe") -> Dict[str, Any]:
    """Loads a YAML file and returns the dictionary.

    Args:
        file_path (str): The path of the YAML file
        load_type (str, optional): The load type as understood by ruamel. Defaults to 'safe'.

    Returns:
        dict: The mapping as defined in the YAML file
    """
    with open(file_path, encoding="utf-8") as f:
        yaml = YAML(typ=load_type, pure=True)
        yaml_config = yaml.load(f)
    return yaml_config


def is_a_git_repo() -> bool:
    """Runs a git command to see if the project is git versioned.

    Returns:
        bool: True if it is git versioned, False otherwise
    """
    command = "git rev-parse --is-inside-work-tree"
    try:
        subprocess.check_output(command.split()).strip().decode("utf-8")
        logger.info("Found the code to be git versioned")
        return True
    except BaseException:  # pylint: disable=W0702
        console.print("Not a git repo", style="bold red")

    return False


def get_current_code_commit() -> Union[str, None]:
    """Gets the git SHA id if the project is version controlled.

    Returns:
        Union[str, None]: SHA id if the code is versioned, None otherwise
    """
    command = "git rev-parse HEAD"
    if not is_a_git_repo():
        return None
    try:
        label = subprocess.check_output(command.split()).strip().decode("utf-8")
        logger.info("Found the git commit to be: %s", label)
        return label
    except BaseException:  # pylint: disable=W0702
        console.print("Not a git repo, error getting hash", style="bold red")
        raise


def is_git_clean() -> Tuple[bool, Union[None, str]]:
    """Checks if the git tree is clean and there are no modified tracked files.

    Returns:
        Tuple[bool, Union[None, str]]: (True, None) if the tree is clean,
        (False, newline-separated file names) if tracked files have changed
    """
    command = "git diff --name-only"
    if not is_a_git_repo():
        return False, None
    try:
        label = subprocess.check_output(command.split()).strip().decode("utf-8")
        if not label:
            return True, None
        return False, label
    except BaseException:  # pylint: disable=W0702
        console.print("Not a git repo, not clean", style="bold red")

    return False, None


def get_git_remote() -> Union[str, None]:
    """Gets the remote URL of git.

    Returns:
        Union[str, None]: Remote URL if the code is version controlled, None otherwise
    """
    command = "git config --get remote.origin.url"
    if not is_a_git_repo():
        return None
    try:
        label = subprocess.check_output(command.split()).strip().decode("utf-8")
        logger.info("Found the git remote to be: %s", label)
        return label
    except BaseException:  # pylint: disable=W0702
        console.print("Not a git repo, no remote", style="bold red")
        raise


def get_local_docker_image_id(image_name: str) -> str:
    """If we are running in local settings, return the docker image id.

    Args:
        image_name (str): The image name we need the digest for

    Returns:
        str: The docker image digest, or an empty string on failure
    """
    try:
        import docker

        client = docker.from_env()
        image = client.images.get(image_name)
        return image.attrs["Id"]
    except ImportError:  # pragma: no cover
        logger.warning(
            "Did not find docker installed, some functionality might be affected"
        )
    except BaseException:
        logger.exception(f"Could not find the image by name {image_name}")

    return ""


def get_git_code_identity():
    """Returns a code identity object for version controlled code.

    Returns:
        runnable.datastore.CodeIdentity: The code identity used by the run log store.
    """
    current_context = context.get_run_context()
    if current_context is None:
        raise RuntimeError("No run context available")
    code_identity = current_context.run_log_store.create_code_identity()
    try:
        code_identity.code_identifier = get_current_code_commit()
        code_identity.code_identifier_type = "git"
        code_identity.code_identifier_dependable, changed = is_git_clean()
        code_identity.code_identifier_url = get_git_remote()
        if changed:
            code_identity.code_identifier_message = "changes found in " + ", ".join(
                changed.split("\n")
            )
    except BaseException:
        logger.exception("Git code versioning problems")

    return code_identity


def remove_prefix(text: str, prefix: str) -> str:
    """Removes a prefix if one is present in the input text.

    Args:
        text (str): The input text to remove the prefix from
        prefix (str): The prefix that has to be removed

    Returns:
        str: The original string if no prefix is found, or the string with the prefix chomped
    """
    if text.startswith(prefix):
        return text[len(prefix) :]
    return text  # or whatever is given


def diff_dict(d1: Dict[str, Any], d2: Dict[str, Any]) -> Dict[str, Any]:
    """
    Given two dicts d1 and d2, return a new dict with the upsert items of d2.

    Args:
        d1 (reference): The reference dict.
        d2 (compare): Any new or modified items compared to d1 are returned

    Returns:
        dict: Any new or modified items in d2 in comparison to d1
    """
    diff = {}

    for k2, v2 in d2.items():
        # Keep only keys that are new or whose value differs from d1
        if k2 not in d1 or d1[k2] != v2:
            diff[k2] = v2

    return diff


def get_data_hash(file_name: str) -> str:
    """Returns the hash of the data file.

    For small files (<1GB): returns the full SHA256 hash.
    For large files (>=1GB): returns a fingerprint hash of the first chunk + last chunk + file size.

    Args:
        file_name (str): The file name to generate the hash for

    Raises:
        FileNotFoundError: If the file does not exist
        PermissionError: If the file cannot be read due to permissions
        OSError: If there are other I/O errors

    Returns:
        str: The SHA256 hash or fingerprint of the file contents
    """
    start_time = time.time()

    try:
        file_path = Path(file_name)
        file_size = file_path.stat().st_size

        # Use the appropriate algorithm based on file size
        if file_size < defaults.LARGE_FILE_THRESHOLD_BYTES:
            result = _compute_full_file_hash(file_name)
            logger.debug(
                f"Full hash computed for {file_name} ({file_size} bytes) in {time.time() - start_time:.3f}s"
            )
        else:
            result = _compute_large_file_fingerprint(file_name, file_size)
            logger.info(
                f"Fingerprint hash computed for {file_name} ({file_size} bytes) in {time.time() - start_time:.3f}s"
            )

        return result
    except FileNotFoundError:
        logger.error(f"File not found: {file_name}")
        raise
    except PermissionError:
        logger.error(f"Permission denied accessing file: {file_name}")
        raise
    except OSError as e:
        logger.error(f"I/O error accessing file {file_name}: {e}")
        raise


def _compute_full_file_hash(file_name: str) -> str:
    """Compute the SHA256 hash of the entire file using a streaming approach."""
    with open(file_name, "rb") as f:
        file_hash = hashlib.sha256()
        for chunk in iter(lambda: f.read(4096), b""):
            file_hash.update(chunk)
    return file_hash.hexdigest()


def _compute_large_file_fingerprint(file_name: str, file_size: int) -> str:
    """Compute a fingerprint hash for large files using first/last chunks + metadata."""
    with open(file_name, "rb") as f:
        file_hash = hashlib.sha256()

        # Include the file size in the hash for uniqueness
        file_hash.update(str(file_size).encode())

        # Read the first chunk
        first_chunk = f.read(defaults.HASH_CHUNK_SIZE)
        file_hash.update(first_chunk)

        # Read the last chunk if the file extends past the first chunk
        if file_size > defaults.HASH_CHUNK_SIZE:
            f.seek(-min(defaults.HASH_CHUNK_SIZE, file_size - len(first_chunk)), 2)
            last_chunk = f.read(defaults.HASH_CHUNK_SIZE)
            file_hash.update(last_chunk)

    return file_hash.hexdigest()


def json_to_ordered_dict(json_str: str) -> OrderedDict:
    """Decode a JSON string into an OrderedDict.

    Args:
        json_str (str): The JSON string to decode

    Returns:
        OrderedDict: The decoded OrderedDict
    """
    if json_str and json_str != "{}":
        return json.loads(json_str, object_pairs_hook=OrderedDict)

    return OrderedDict()


def gather_variables() -> Dict[str, str]:
    """Gather all the environment variables used by runnable. All such variables start with runnable_VAR_.

    Returns:
        dict: All the matching environment variables, with the prefix stripped from the keys.
    """
    variables = {}

    for env_var, value in os.environ.items():
        if env_var.startswith(defaults.VARIABLE_PREFIX):
            key = remove_prefix(env_var, defaults.VARIABLE_PREFIX)
            variables[key] = value

    return variables


def make_log_file_name(
    name: str,
    iter_variable: Optional[IterableParameterModel] = None,
) -> str:
    random_tag = "".join(random.choices(string.ascii_uppercase + string.digits, k=3))
    log_file_name = name

    if iter_variable and iter_variable.map_variable:
        for _, value in iter_variable.map_variable.items():
            log_file_name += "_" + str(value)

    log_file_name += "_" + random_tag
    log_file_name = "".join(x for x in log_file_name if x.isalnum()) + ".execution.log"

    return log_file_name