gluekit 1.0.1.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gluekit/__init__.py +7 -0
- gluekit/app.py +0 -0
- gluekit/cli.py +64 -0
- gluekit/commands/__init__.py +1 -0
- gluekit/commands/add.py +455 -0
- gluekit/commands/build.py +816 -0
- gluekit/commands/checkout.py +114 -0
- gluekit/commands/clone.py +516 -0
- gluekit/commands/config_commands.py +180 -0
- gluekit/commands/constants.py +47 -0
- gluekit/commands/convert.py +336 -0
- gluekit/commands/edit.py +1104 -0
- gluekit/commands/helpers.py +1068 -0
- gluekit/commands/init.py +798 -0
- gluekit/commands/list.py +16 -0
- gluekit/commands/local_commands.py +680 -0
- gluekit/commands/pull.py +374 -0
- gluekit/commands/push.py +251 -0
- gluekit/commands/remove.py +161 -0
- gluekit/commands/run.py +126 -0
- gluekit/commands/status.py +97 -0
- gluekit/commands/sync.py +97 -0
- gluekit/commands/update.py +104 -0
- gluekit/job_mgmt/__init__.py +0 -0
- gluekit/job_mgmt/glue_jobs.py +1323 -0
- gluekit/job_mgmt/magics.py +122 -0
- gluekit/job_mgmt/resources/__init__.py +0 -0
- gluekit/job_mgmt/resources/glue_job_schema.json +40341 -0
- gluekit/job_mgmt/resources/magic_map.json +83 -0
- gluekit/job_mgmt/schema.py +165 -0
- gluekit/local/__init__.py +6 -0
- gluekit/local/awsglue/__init__.py +1 -0
- gluekit/local/awsglue/context.py +30 -0
- gluekit/local/awsglue/job.py +9 -0
- gluekit/local/awsglue/utils.py +17 -0
- gluekit/local/local.py +434 -0
- gluekit/local/local_fixtures.py +337 -0
- gluekit/local/pyspark/__init__.py +7 -0
- gluekit/local/pyspark/context.py +31 -0
- gluekit/local/pyspark/sql/__init__.py +6 -0
- gluekit/local/pyspark/sql/session.py +29 -0
- gluekit-1.0.1.dev1.dist-info/METADATA +1176 -0
- gluekit-1.0.1.dev1.dist-info/RECORD +46 -0
- gluekit-1.0.1.dev1.dist-info/WHEEL +5 -0
- gluekit-1.0.1.dev1.dist-info/entry_points.txt +2 -0
- gluekit-1.0.1.dev1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1323 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Glue job sync tooling for real AWS Glue resources.
|
|
3
|
+
|
|
4
|
+
This module pulls AWS Glue job configs and scripts into the repo so they can be
|
|
5
|
+
tracked alongside migration artifacts. The main flow:
|
|
6
|
+
|
|
7
|
+
- Fetch Glue jobs via boto3 using an AWS CLI credential profile when provided.
|
|
8
|
+
- Download the job config JSON, the script (.py), and (when present) the
|
|
9
|
+
notebook (.ipynb) from the job's ScriptLocation bucket.
|
|
10
|
+
- If the job is a NOTEBOOK, remove outputs to keep diffs clean.
|
|
11
|
+
|
|
12
|
+
These helpers are different from the local fixture commands in
|
|
13
|
+
``gluekit.commands.local_commands``: they use real AWS Glue and S3 APIs, and
|
|
14
|
+
``profile_name`` means an AWS CLI/boto3 credential profile.
|
|
15
|
+
|
|
16
|
+
Preferred usage:
|
|
17
|
+
- Use the Typer CLI for a git-like flow:
|
|
18
|
+
gluekit pull <job_name> or gluekit pull "*"
|
|
19
|
+
|
|
20
|
+
Outputs:
|
|
21
|
+
- glue/glue_full_job_list.csv: complete job list for auditing.
|
|
22
|
+
- glue/configs/<slug>.json: job config with SourceControlDetails metadata.
|
|
23
|
+
- glue/scripts/<slug>.py and glue/notebooks/<slug>.ipynb (when available).
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
import csv
|
|
27
|
+
import json
|
|
28
|
+
import re
|
|
29
|
+
import subprocess
|
|
30
|
+
import tempfile
|
|
31
|
+
import uuid
|
|
32
|
+
import zipfile
|
|
33
|
+
from copy import deepcopy
|
|
34
|
+
from datetime import datetime
|
|
35
|
+
from pathlib import Path
|
|
36
|
+
|
|
37
|
+
import boto3
|
|
38
|
+
from botocore.exceptions import BotoCoreError, ClientError
|
|
39
|
+
from slugify import slugify
|
|
40
|
+
|
|
41
|
+
from dotenv import load_dotenv
|
|
42
|
+
from .magics import build_magic_cell_sources
|
|
43
|
+
from .schema import DEPRECATED_JOB_FIELDS, get_updateable_job_fields
|
|
44
|
+
|
|
45
|
+
load_dotenv()
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def normalize_glue_config_data(config_data):
    """Flatten a ``{"Job": {...}, ...}`` wrapper into a single job mapping.

    Input that is not a dict, or whose ``"Job"`` entry is not a dict, is
    returned unchanged (the same object). Otherwise a new dict is built from
    a deep copy of ``config_data["Job"]``:

    - a top-level ``SourceControlDetails`` dict is merged into the job's own
      ``SourceControlDetails``, with the job's values winning on conflicts;
    - every other top-level key is added only when the job does not already
      define it.

    All values placed in the result are deep-copied so that mutating the
    result never reaches back into ``config_data``.
    """
    if not isinstance(config_data, dict):
        return config_data

    job_data = config_data.get("Job")
    if not isinstance(job_data, dict):
        return config_data

    normalized = deepcopy(job_data)
    for key, value in config_data.items():
        if key == "Job":
            continue
        if key == "SourceControlDetails" and isinstance(value, dict):
            existing = normalized.get("SourceControlDetails")
            if isinstance(existing, dict):
                # Start from the wrapper-level details, then let the values
                # nested under "Job" override them.
                merged = deepcopy(value)
                merged.update(existing)
                normalized["SourceControlDetails"] = merged
            else:
                normalized["SourceControlDetails"] = deepcopy(value)
            continue
        if key not in normalized:
            # Copy instead of aliasing: the "Job" payload is already copied,
            # so the remaining keys must not share state with the input.
            normalized[key] = deepcopy(value)

    return normalized
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _new_boto3_session(profile_name=None):
    """Create a boto3 session, bound to ``profile_name`` when one is given."""
    if not profile_name:
        return boto3.Session()
    return boto3.Session(profile_name=profile_name)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def ensure_aws_login(profile_name=None, auto_login=True):
    """
    Return a boto3 session whose credentials pass an STS identity check.

    The check is ``sts.get_caller_identity()``. When it fails with a
    botocore error and ``auto_login`` is false, the error is re-raised.
    Otherwise ``aws sso login`` is run (with ``--profile`` when a profile
    name was given) and a newly created session is returned.

    Raises:
        RuntimeError: the ``aws`` executable could not be found, or the
            login command exited with a non-zero status.
    """
    session = _new_boto3_session(profile_name)
    sts_client = session.client("sts")
    try:
        sts_client.get_caller_identity()
    except (BotoCoreError, ClientError):
        if not auto_login:
            raise
    else:
        return session

    shown_profile = profile_name if profile_name else "default"
    print(
        f"AWS auth missing/expired for profile '{shown_profile}'. Running aws sso login..."
    )

    command = ["aws", "sso", "login"]
    if profile_name:
        command += ["--profile", profile_name]

    try:
        subprocess.run(command, check=True)
    except FileNotFoundError as exc:
        raise RuntimeError(
            "AWS CLI is not installed or not on PATH. It is required for SSO login."
        ) from exc
    except subprocess.CalledProcessError as exc:
        raise RuntimeError("aws sso login failed or was canceled.") from exc

    return _new_boto3_session(profile_name)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def custom_serializer(obj):
    """``json.dumps`` ``default`` hook that renders ``datetime`` as ISO 8601.

    Raises:
        TypeError: ``obj`` is not a ``datetime``.
    """
    if not isinstance(obj, datetime):
        raise TypeError(f"Type {type(obj)} not serializable")
    return obj.isoformat()
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def list_glue_jobs(profile_name=None, auto_login=True):
    """Return all jobs reported by ``glue.get_jobs``, following ``NextToken``."""
    session = ensure_aws_login(profile_name=profile_name, auto_login=auto_login)
    glue_client = session.client("glue")

    collected = []
    request_kwargs = {}
    while True:
        page = glue_client.get_jobs(**request_kwargs)
        collected.extend(page["Jobs"])
        token = page.get("NextToken")
        if not token:
            return collected
        # Only the follow-up requests carry a continuation token.
        request_kwargs = {"NextToken": token}
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def get_glue_job_config(name, profile_name=None, auto_login=True):
    """Return the normalized ``Job`` payload of ``glue.get_job``; writes nothing locally."""
    session = ensure_aws_login(profile_name=profile_name, auto_login=auto_login)
    glue_client = session.client("glue")
    response = glue_client.get_job(JobName=name)
    return normalize_glue_config_data(response.get("Job"))
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def parse_s3_bucket_and_prefix(s3_path):
    """Split ``s3://bucket/key`` (or ``bucket/key``) into ``(bucket, key)``.

    Only a leading ``s3://`` is removed, so a key that itself contains the
    text ``s3://`` is returned intact.

    Raises:
        ValueError: the path has no ``/`` after the bucket name. Callers in
            this module rely on this to detect bucket-only paths.
    """
    scheme = "s3://"
    if s3_path.startswith(scheme):
        s3_path = s3_path[len(scheme):]
    # Tuple unpacking raises ValueError when there is no key component.
    bucket_name, prefix = s3_path.split("/", 1)
    return bucket_name, prefix
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def _strip_deprecated_fields(job_data):
    """Delete every ``DEPRECATED_JOB_FIELDS`` key from ``job_data`` in place and return it."""
    for deprecated_key in DEPRECATED_JOB_FIELDS:
        if deprecated_key in job_data:
            del job_data[deprecated_key]
    return job_data
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def download_glue_job_files(
    name,
    config_path,
    local_path=None,
    remove_nb_outputs=True,
    include_components=None,
    existing_sc=None,
    profile_name=None,
    auto_login=True,
):
    """Pull one Glue job's config, script, notebook and extra files to disk.

    Args:
        name: Glue job name passed to ``glue.get_job``.
        config_path: Destination of the job config JSON.
        local_path: Destination of the script; defaults to
            ``glue/scripts/<slug>.py``.
        remove_nb_outputs: When true, the downloaded notebook is passed
            through ``clear_notebook_outputs`` before it is written.
        include_components: Iterable of component names to fetch, drawn from
            ``"config"``, ``"script"``, ``"notebook"``, ``"extra-py-files"``
            and ``"extra-files"``. ``None`` selects config, script and
            notebook.
        existing_sc: Previously stored ``SourceControlDetails``; its entries
            fill keys that the fetched job does not define.
        profile_name: AWS CLI/boto3 credential profile.
        auto_login: Forwarded to ``ensure_aws_login``.

    Returns:
        ``None``. Progress is reported with ``print``.
    """
    # Accept None, a single name, or any iterable; the code below needs
    # membership tests and set intersection.
    if include_components is None:
        include_components = {"config", "script", "notebook"}
    elif isinstance(include_components, str):
        include_components = {include_components}
    else:
        include_components = set(include_components)

    session = ensure_aws_login(profile_name=profile_name, auto_login=auto_login)
    glue = session.client("glue")
    job = normalize_glue_config_data(glue.get_job(JobName=name).get("Job"))
    if not job:
        print(f"Job {name} not found")
        return None
    s3 = session.client("s3")

    bucket_name, script_location = parse_s3_bucket_and_prefix(
        job["Command"]["ScriptLocation"]
    )

    # Define paths
    config_file_name = str(config_path)
    if local_path:
        # Stored in the JSON config below, so it must be a plain string.
        script_file_name = str(local_path)
    else:
        script_file_name = f"glue/scripts/{slugify(job['Name'])}.py"

    script_path = Path(script_file_name)
    notebook_file_name = _resolve_notebook_path(script_path)

    # Get notebook
    notebook_location = _script_location_to_notebook_key(script_location)
    print(f"Notebook location: {notebook_location}")

    notebook_dict = None
    if "notebook" in include_components:
        response = s3.list_objects_v2(Bucket=bucket_name, Prefix=notebook_location)
        # A prefix listing also matches longer keys; require the exact key
        # so get_object is only called for an object that exists.
        notebook_exists = any(
            entry.get("Key") == notebook_location
            for entry in response.get("Contents", [])
        )
        if notebook_exists:
            response = s3.get_object(Bucket=bucket_name, Key=notebook_location)
            notebook_content = response["Body"].read().decode("utf-8")
            notebook_dict = json.loads(notebook_content)
            if remove_nb_outputs:
                notebook_dict = clear_notebook_outputs(notebook_dict)
                print(f"Cleared outputs in notebook {notebook_file_name}")
        else:
            print(
                f"No notebook found at {notebook_location}, skipping notebook download."
            )

    job.setdefault("SourceControlDetails", {})
    if existing_sc:
        for key, value in existing_sc.items():
            job["SourceControlDetails"].setdefault(key, value)
    job["SourceControlDetails"]["NotebookLocation"] = notebook_file_name
    job["SourceControlDetails"]["ScriptLocation"] = script_file_name
    job["SourceControlDetails"].pop("LocalPath", None)
    job["SourceControlDetails"]["RemoveNbOutputs"] = remove_nb_outputs
    _strip_deprecated_fields(job)

    # Write files
    if "config" in include_components:
        config_file = Path(config_file_name)
        # Same treatment as the script and notebook destinations.
        config_file.parent.mkdir(parents=True, exist_ok=True)
        config_file.write_text(json.dumps(job, indent=4, default=custom_serializer))
        print(f"Configuration file created: {config_file_name}")

    if "script" in include_components:
        script_path.parent.mkdir(parents=True, exist_ok=True)
        s3.download_file(bucket_name, script_location, script_file_name)
        print(f"Script downloaded: {script_file_name}")

    if notebook_dict and "notebook" in include_components:
        notebook_path = Path(notebook_file_name)
        notebook_path.parent.mkdir(parents=True, exist_ok=True)
        notebook_path.write_text(
            json.dumps(notebook_dict, indent=4, default=custom_serializer)
        )
        print(f"Notebook downloaded: {notebook_file_name}")

    if "script" in include_components and "notebook" in include_components:
        _ensure_local_script_from_notebook(notebook_file_name, script_file_name)
        _ensure_local_notebook_from_script(
            script_file_name,
            notebook_file_name,
            config_data=job,
            include_magics=True,
        )

    if include_components.intersection({"extra-py-files", "extra-files"}):
        sc_for_extras = job.get("SourceControlDetails", {})
        additional_mappings, additional_errors = _collect_additional_file_mappings(
            sc_for_extras
        )
        for error in additional_errors:
            print(f"Additional files error: {error}")

        include_sources: set[str] = set()
        if "extra-py-files" in include_components:
            include_sources.update({"AdditionalPythonFiles", "ExtraPyFiles"})
        if "extra-files" in include_components:
            include_sources.update({"AdditionalFiles", "ExtraFiles"})

        additional_mappings = [
            mapping
            for mapping in additional_mappings
            if mapping["source"] in include_sources
        ]

        if not additional_mappings:
            print("No additional files configured in SourceControlDetails; skipping.")
        for mapping in additional_mappings:
            local = Path(mapping["local_path"])
            s3_path = mapping["s3_path"]
            if not s3_path.startswith("s3://"):
                print(f"Additional file has invalid S3 path: {s3_path}")
                continue
            bucket, key = parse_s3_bucket_and_prefix(s3_path)
            is_zip = s3_path.endswith(".zip")
            # A remote .zip is unpacked unless the local target is itself a
            # .zip file path (an existing directory always gets unpacked into).
            treat_as_dir = is_zip and (local.is_dir() or local.suffix != ".zip")
            if treat_as_dir:
                local.mkdir(parents=True, exist_ok=True)
                with tempfile.TemporaryDirectory() as temp_dir:
                    zip_path = Path(temp_dir) / "additional_files.zip"
                    s3.download_file(bucket, key, str(zip_path))
                    with zipfile.ZipFile(zip_path, "r") as archive:
                        archive.extractall(local)
                print(f"Extracted additional archive: {s3_path} -> {local}")
            else:
                local.parent.mkdir(parents=True, exist_ok=True)
                s3.download_file(bucket, key, str(local))
                print(f"Downloaded additional file: {s3_path} -> {local}")

    print("S3 sync complete.")
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def _resolve_notebook_path(script_path):
|
|
313
|
+
if "scripts" in script_path.parts:
|
|
314
|
+
return str(
|
|
315
|
+
Path(
|
|
316
|
+
*(
|
|
317
|
+
"notebooks" if part == "scripts" else part
|
|
318
|
+
for part in script_path.parts
|
|
319
|
+
)
|
|
320
|
+
).with_suffix(".ipynb")
|
|
321
|
+
)
|
|
322
|
+
return str(Path("glue/notebooks") / script_path.with_suffix(".ipynb").name)
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
def _script_location_to_notebook_key(script_key):
|
|
326
|
+
if "scripts/" in script_key and script_key.endswith(".py"):
|
|
327
|
+
return script_key.replace("scripts/", "notebooks/").replace(".py", ".ipynb")
|
|
328
|
+
return script_key.replace(".py", ".ipynb")
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
def _load_config_data(config):
|
|
332
|
+
if isinstance(config, (str, Path)):
|
|
333
|
+
config_path = Path(config)
|
|
334
|
+
try:
|
|
335
|
+
config_data = normalize_glue_config_data(
|
|
336
|
+
json.loads(config_path.read_text())
|
|
337
|
+
)
|
|
338
|
+
except json.JSONDecodeError as exc:
|
|
339
|
+
raise ValueError(f"Invalid JSON in {config_path}") from exc
|
|
340
|
+
return config_data, config_path
|
|
341
|
+
if isinstance(config, dict):
|
|
342
|
+
return normalize_glue_config_data(config), None
|
|
343
|
+
raise TypeError("config must be a dict or a path-like string")
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
def _collect_additional_file_mappings(sc):
|
|
347
|
+
mappings: list[dict[str, str]] = []
|
|
348
|
+
errors: list[str] = []
|
|
349
|
+
seen: set[tuple[str, str]] = set()
|
|
350
|
+
sources = [
|
|
351
|
+
("AdditionalPythonFiles", sc.get("AdditionalPythonFiles")),
|
|
352
|
+
("ExtraPyFiles", sc.get("ExtraPyFiles")),
|
|
353
|
+
("AdditionalFiles", sc.get("AdditionalFiles")),
|
|
354
|
+
("ExtraFiles", sc.get("ExtraFiles")),
|
|
355
|
+
]
|
|
356
|
+
|
|
357
|
+
for label, items in sources:
|
|
358
|
+
if not items:
|
|
359
|
+
continue
|
|
360
|
+
if isinstance(items, dict):
|
|
361
|
+
items = [items]
|
|
362
|
+
if not isinstance(items, list):
|
|
363
|
+
errors.append(f"{label} must be a list or dict.")
|
|
364
|
+
continue
|
|
365
|
+
for item in items:
|
|
366
|
+
local_path = None
|
|
367
|
+
s3_path = None
|
|
368
|
+
if isinstance(item, dict):
|
|
369
|
+
local_path = (
|
|
370
|
+
item.get("LocalPath")
|
|
371
|
+
or item.get("local_path")
|
|
372
|
+
or item.get("localPath")
|
|
373
|
+
)
|
|
374
|
+
s3_path = (
|
|
375
|
+
item.get("S3Path") or item.get("s3_path") or item.get("s3Path")
|
|
376
|
+
)
|
|
377
|
+
elif isinstance(item, (list, tuple)) and len(item) == 2:
|
|
378
|
+
local_path, s3_path = item
|
|
379
|
+
else:
|
|
380
|
+
errors.append(f"{label} contains an unsupported entry: {item!r}")
|
|
381
|
+
continue
|
|
382
|
+
|
|
383
|
+
if not local_path or not s3_path:
|
|
384
|
+
errors.append(f"{label} entry missing LocalPath/S3Path: {item!r}")
|
|
385
|
+
continue
|
|
386
|
+
|
|
387
|
+
key = (str(local_path), str(s3_path))
|
|
388
|
+
if key in seen:
|
|
389
|
+
continue
|
|
390
|
+
seen.add(key)
|
|
391
|
+
mappings.append(
|
|
392
|
+
{
|
|
393
|
+
"local_path": str(local_path),
|
|
394
|
+
"s3_path": str(s3_path),
|
|
395
|
+
"source": label,
|
|
396
|
+
}
|
|
397
|
+
)
|
|
398
|
+
|
|
399
|
+
return mappings, errors
|
|
400
|
+
|
|
401
|
+
|
|
402
|
+
def _parse_csv_list(value):
|
|
403
|
+
if not value:
|
|
404
|
+
return []
|
|
405
|
+
return [item.strip() for item in next(csv.reader([value])) if item.strip()]
|
|
406
|
+
|
|
407
|
+
|
|
408
|
+
def _infer_local_dependency_path(s3_path, script_location, local_script_path):
|
|
409
|
+
if not isinstance(s3_path, str) or not s3_path.startswith("s3://"):
|
|
410
|
+
return None
|
|
411
|
+
|
|
412
|
+
try:
|
|
413
|
+
_bucket, key = parse_s3_bucket_and_prefix(s3_path)
|
|
414
|
+
except ValueError:
|
|
415
|
+
return None
|
|
416
|
+
|
|
417
|
+
key_path = Path(key)
|
|
418
|
+
candidates: list[Path] = [key_path]
|
|
419
|
+
if key_path.parts and key_path.parts[0] != "glue":
|
|
420
|
+
candidates.append(Path("glue") / key_path)
|
|
421
|
+
|
|
422
|
+
if isinstance(script_location, str) and script_location.startswith("s3://"):
|
|
423
|
+
try:
|
|
424
|
+
_script_bucket, script_key = parse_s3_bucket_and_prefix(script_location)
|
|
425
|
+
except ValueError:
|
|
426
|
+
script_key = None
|
|
427
|
+
if script_key is not None:
|
|
428
|
+
script_dir = Path(script_key).parent
|
|
429
|
+
key_parts = key_path.parts
|
|
430
|
+
script_dir_parts = script_dir.parts
|
|
431
|
+
if (
|
|
432
|
+
script_dir_parts
|
|
433
|
+
and key_parts[: len(script_dir_parts)] == script_dir_parts
|
|
434
|
+
):
|
|
435
|
+
rel = key_path.relative_to(script_dir)
|
|
436
|
+
candidates.append(local_script_path.parent / rel)
|
|
437
|
+
|
|
438
|
+
seen: set[str] = set()
|
|
439
|
+
for candidate in candidates:
|
|
440
|
+
candidate_key = candidate.as_posix()
|
|
441
|
+
if candidate_key in seen:
|
|
442
|
+
continue
|
|
443
|
+
seen.add(candidate_key)
|
|
444
|
+
if candidate.exists():
|
|
445
|
+
return candidate
|
|
446
|
+
|
|
447
|
+
return None
|
|
448
|
+
|
|
449
|
+
|
|
450
|
+
def _collect_configured_dependency_mappings(config_data, explicit_mappings):
|
|
451
|
+
default_args = config_data.get("DefaultArguments", {})
|
|
452
|
+
if not isinstance(default_args, dict):
|
|
453
|
+
return [], []
|
|
454
|
+
|
|
455
|
+
command = config_data.get("Command", {})
|
|
456
|
+
if not isinstance(command, dict):
|
|
457
|
+
command = {}
|
|
458
|
+
sc = config_data.get("SourceControlDetails", {})
|
|
459
|
+
if not isinstance(sc, dict):
|
|
460
|
+
sc = {}
|
|
461
|
+
|
|
462
|
+
job_name = config_data.get("Name") or "unknown"
|
|
463
|
+
local_script_path = Path(
|
|
464
|
+
sc.get("ScriptLocation")
|
|
465
|
+
or sc.get("LocalPath")
|
|
466
|
+
or f"glue/scripts/{slugify(job_name)}.py"
|
|
467
|
+
)
|
|
468
|
+
script_location = command.get("ScriptLocation")
|
|
469
|
+
|
|
470
|
+
explicit_s3_paths = {mapping["s3_path"] for mapping in explicit_mappings}
|
|
471
|
+
seen: set[tuple[str, str]] = set()
|
|
472
|
+
mappings: list[dict[str, str]] = []
|
|
473
|
+
errors: list[str] = []
|
|
474
|
+
dependency_args = (
|
|
475
|
+
("DefaultArguments.--additional-python-modules", "--additional-python-modules"),
|
|
476
|
+
("DefaultArguments.--extra-py-files", "--extra-py-files"),
|
|
477
|
+
)
|
|
478
|
+
|
|
479
|
+
for source_label, arg_name in dependency_args:
|
|
480
|
+
for item in _parse_csv_list(default_args.get(arg_name)):
|
|
481
|
+
if not item.startswith("s3://"):
|
|
482
|
+
continue
|
|
483
|
+
if item in explicit_s3_paths:
|
|
484
|
+
continue
|
|
485
|
+
|
|
486
|
+
local_path = _infer_local_dependency_path(
|
|
487
|
+
item, script_location, local_script_path
|
|
488
|
+
)
|
|
489
|
+
if local_path is None:
|
|
490
|
+
errors.append(
|
|
491
|
+
f"{arg_name} entry has no matching local path for upload: {item}"
|
|
492
|
+
)
|
|
493
|
+
continue
|
|
494
|
+
|
|
495
|
+
key = (local_path.as_posix(), item)
|
|
496
|
+
if key in seen:
|
|
497
|
+
continue
|
|
498
|
+
seen.add(key)
|
|
499
|
+
mappings.append(
|
|
500
|
+
{
|
|
501
|
+
"local_path": local_path.as_posix(),
|
|
502
|
+
"s3_path": item,
|
|
503
|
+
"source": source_label,
|
|
504
|
+
}
|
|
505
|
+
)
|
|
506
|
+
|
|
507
|
+
return mappings, errors
|
|
508
|
+
|
|
509
|
+
|
|
510
|
+
def _zip_directory(source_dir, zip_path):
|
|
511
|
+
source_dir = Path(source_dir)
|
|
512
|
+
zip_path = Path(zip_path)
|
|
513
|
+
zip_path.parent.mkdir(parents=True, exist_ok=True)
|
|
514
|
+
|
|
515
|
+
base_dir = source_dir.parent
|
|
516
|
+
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
|
|
517
|
+
for path in source_dir.rglob("*"):
|
|
518
|
+
if path.is_dir():
|
|
519
|
+
continue
|
|
520
|
+
archive.write(path, path.relative_to(base_dir).as_posix())
|
|
521
|
+
|
|
522
|
+
|
|
523
|
+
def _build_job_update(config_data):
|
|
524
|
+
allowed_keys = {
|
|
525
|
+
"Description",
|
|
526
|
+
"LogUri",
|
|
527
|
+
"Role",
|
|
528
|
+
"ExecutionProperty",
|
|
529
|
+
"Command",
|
|
530
|
+
"DefaultArguments",
|
|
531
|
+
"NonOverridableArguments",
|
|
532
|
+
"Connections",
|
|
533
|
+
"MaxRetries",
|
|
534
|
+
"Timeout",
|
|
535
|
+
"MaxCapacity",
|
|
536
|
+
"WorkerType",
|
|
537
|
+
"NumberOfWorkers",
|
|
538
|
+
"SecurityConfiguration",
|
|
539
|
+
"NotificationProperty",
|
|
540
|
+
"GlueVersion",
|
|
541
|
+
"CodeGenConfigurationNodes",
|
|
542
|
+
"ExecutionClass",
|
|
543
|
+
"JobMode",
|
|
544
|
+
"ExecutionClass",
|
|
545
|
+
"JobRunQueuingEnabled",
|
|
546
|
+
}
|
|
547
|
+
return {
|
|
548
|
+
key: config_data[key]
|
|
549
|
+
for key in allowed_keys
|
|
550
|
+
if key in config_data and key not in DEPRECATED_JOB_FIELDS
|
|
551
|
+
}
|
|
552
|
+
|
|
553
|
+
|
|
554
|
+
def _get_job_update_allowed_fields(profile_name=None):
    """Return ``get_updateable_job_fields(prefer_live=True)``.

    ``profile_name`` is accepted but not used.
    """
    _ = profile_name  # deliberately ignored
    return get_updateable_job_fields(prefer_live=True)
|
|
557
|
+
|
|
558
|
+
|
|
559
|
+
def _filter_job_update_fields(job_update, allowed_fields):
|
|
560
|
+
if not allowed_fields:
|
|
561
|
+
return job_update, []
|
|
562
|
+
filtered = {
|
|
563
|
+
key: value for key, value in job_update.items() if key in allowed_fields
|
|
564
|
+
}
|
|
565
|
+
dropped = [key for key in job_update if key not in allowed_fields]
|
|
566
|
+
return filtered, dropped
|
|
567
|
+
|
|
568
|
+
|
|
569
|
+
def update_glue_job_from_config(
    config_data, dry_run=False, profile_name=None, auto_login=True
):
    """
    Push the updatable fields of ``config_data`` to the Glue job it names.

    The candidate ``JobUpdate`` comes from ``_build_job_update`` and is then
    filtered through ``_get_job_update_allowed_fields``; deprecated and
    dropped fields are reported with ``print``. With ``dry_run`` the update
    is only described and AWS is not contacted by this function.

    Returns:
        ``True`` when the job was updated (or would be, in a dry run);
        ``False`` when ``Name`` is missing or no updatable field remains.

    TODO: Consider fetching the current Glue job definition and merging with
    local config values before update, so unchanged fields stay in sync.
    """
    job_name = config_data.get("Name")
    if not job_name:
        print("Missing Name in config data, skipping Glue job update.")
        return False

    deprecated_present = sorted(
        field for field in DEPRECATED_JOB_FIELDS if field in config_data
    )
    if deprecated_present:
        print(
            f"Skipping deprecated fields for {job_name}: {', '.join(deprecated_present)}"
        )

    allowed_fields = _get_job_update_allowed_fields(profile_name=profile_name)
    job_update, dropped = _filter_job_update_fields(
        _build_job_update(config_data), allowed_fields
    )
    if dropped:
        print(
            f"Dropping unsupported JobUpdate fields for {job_name}: {', '.join(sorted(dropped))}"
        )
    if dry_run and not allowed_fields:
        print("Model-based validation unavailable; using local allowlist only.")

    if not job_update:
        print(f"No updatable fields found for {job_name}, skipping Glue job update.")
        return False

    if dry_run:
        planned = ", ".join(sorted(job_update.keys()))
        print(f"Would update Glue job {job_name} with: {planned}")
        return True

    glue_client = ensure_aws_login(
        profile_name=profile_name, auto_login=auto_login
    ).client("glue")
    glue_client.update_job(JobName=job_name, JobUpdate=job_update)
    print(f"Updated Glue job configuration: {job_name}")
    return True
|
|
613
|
+
|
|
614
|
+
|
|
615
|
+
def _job_supports_notebook_artifacts(config_data):
|
|
616
|
+
job_mode = str(config_data.get("JobMode") or "").strip().upper()
|
|
617
|
+
if not job_mode:
|
|
618
|
+
return True
|
|
619
|
+
return job_mode == "NOTEBOOK"
|
|
620
|
+
|
|
621
|
+
|
|
622
|
+
def resolve_glue_artifact_mappings(config_data, include_components=None):
    """Work out which local files pair with which S3 objects for one job.

    Args:
        config_data: Job config; it is normalized first.
        include_components: Component names to resolve. A falsy value selects
            ``script``, ``notebook``, ``additional-python-modules`` and
            ``extra-files``.

    Returns:
        A dict with these keys:

        - ``mappings``: de-duplicated dicts holding ``local_path``,
          ``s3_path``, ``source`` and ``artifact_type``;
        - ``warnings``: every problem found;
        - ``dependency_warnings``: the subset that concerns extra files and
          dependencies;
        - ``script_path`` / ``script_s3_path``: local ``Path`` and
          ``Command.ScriptLocation`` of the script;
        - ``notebook_enabled``, ``notebook_path``, ``notebook_s3_path``:
          notebook counterparts (the last two stay ``None`` when notebooks
          are not enabled or the S3 path cannot be derived).
    """
    config_data = normalize_glue_config_data(config_data)
    default_components = {
        "script",
        "notebook",
        "additional-python-modules",
        "extra-files",
    }
    include_components = set(include_components or default_components)

    warnings: list[str] = []
    dependency_warnings: list[str] = []

    command = config_data.get("Command", {})
    if not isinstance(command, dict):
        warnings.append("Command must be an object.")
        command = {}

    sc = config_data.get("SourceControlDetails", {})
    if not isinstance(sc, dict):
        warnings.append("SourceControlDetails must be an object.")
        sc = {}

    name = config_data.get("Name") or "unknown"
    script_path = Path(
        sc.get("ScriptLocation")
        or sc.get("LocalPath")
        or f"glue/scripts/{slugify(name)}.py"
    )
    script_s3_path = command.get("ScriptLocation")
    script_on_s3 = isinstance(script_s3_path, str) and script_s3_path.startswith(
        "s3://"
    )
    notebook_enabled = (
        "notebook" in include_components
        and _job_supports_notebook_artifacts(config_data)
    )

    notebook_path = None
    notebook_s3_path = None
    mappings: list[dict[str, str]] = []

    # --- script -----------------------------------------------------------
    if "script" in include_components:
        if script_on_s3:
            mappings.append(
                {
                    "artifact_type": "script",
                    "local_path": script_path.as_posix(),
                    "s3_path": script_s3_path,
                    "source": "Command.ScriptLocation",
                }
            )
        else:
            warnings.append("Command.ScriptLocation must be a valid S3 path.")

    # --- notebook ---------------------------------------------------------
    if notebook_enabled:
        notebook_value = sc.get("NotebookPath") or sc.get("NotebookLocation")
        if notebook_value:
            notebook_path = Path(notebook_value)
        else:
            notebook_path = Path(_resolve_notebook_path(script_path))

        if script_on_s3:
            try:
                bucket_name, script_key = parse_s3_bucket_and_prefix(script_s3_path)
            except ValueError:
                warnings.append(
                    f"Cannot derive notebook S3 path from Command.ScriptLocation: {script_s3_path}"
                )
            else:
                notebook_key = _script_location_to_notebook_key(script_key)
                notebook_s3_path = f"s3://{bucket_name}/{notebook_key}"
                mappings.append(
                    {
                        "artifact_type": "notebook",
                        "local_path": notebook_path.as_posix(),
                        "s3_path": notebook_s3_path,
                        "source": "Command.ScriptLocation",
                    }
                )

    # --- extra files and dependencies --------------------------------------
    dependency_components = {"additional-python-modules", "extra-py-files"}
    wants_dependencies = bool(include_components.intersection(dependency_components))

    include_sources: set[str] = set()
    if wants_dependencies:
        include_sources.update({"AdditionalPythonFiles", "ExtraPyFiles"})
    if "extra-files" in include_components:
        include_sources.update({"AdditionalFiles", "ExtraFiles"})

    additional_mappings: list[dict[str, str]] = []
    if include_sources:
        collected, additional_errors = _collect_additional_file_mappings(sc)
        dependency_warnings.extend(additional_errors)
        warnings.extend(additional_errors)
        additional_mappings = [
            entry for entry in collected if entry["source"] in include_sources
        ]

    if wants_dependencies:
        configured_mappings, configured_errors = (
            _collect_configured_dependency_mappings(
                config_data,
                additional_mappings,
            )
        )
        additional_mappings.extend(configured_mappings)
        dependency_warnings.extend(configured_errors)
        warnings.extend(configured_errors)

    artifact_types = {
        "AdditionalPythonFiles": "additional-python-file",
        "ExtraPyFiles": "extra-py-file",
        "AdditionalFiles": "extra-file",
        "ExtraFiles": "extra-file",
        "DefaultArguments.--additional-python-modules": "additional-python-module",
        "DefaultArguments.--extra-py-files": "extra-py-file",
    }
    mappings.extend(
        {
            **entry,
            "artifact_type": artifact_types.get(entry["source"], "dependency"),
        }
        for entry in additional_mappings
    )

    # Keep the first mapping seen for each (local, remote) pair.
    deduplicated: list[dict[str, str]] = []
    seen: set[tuple[str, str]] = set()
    for entry in mappings:
        pair = (entry["local_path"], entry["s3_path"])
        if pair not in seen:
            seen.add(pair)
            deduplicated.append(entry)

    return {
        "mappings": deduplicated,
        "warnings": warnings,
        "dependency_warnings": dependency_warnings,
        "script_path": script_path,
        "script_s3_path": script_s3_path,
        "notebook_enabled": notebook_enabled,
        "notebook_path": notebook_path,
        "notebook_s3_path": notebook_s3_path,
    }
|
|
765
|
+
|
|
766
|
+
|
|
767
|
+
def _additional_mapping_problem(local, s3_path):
    """Return the message explaining why a mapping cannot be uploaded.

    Args:
        local: ``Path`` of the local file or directory named by the mapping.
        s3_path: Destination string taken from the mapping.

    Returns:
        The user-facing message for the first failed check, or ``None`` when
        the mapping passes every check.
    """
    if not local.exists():
        return f"Additional file not found locally: {local}"
    if not s3_path.startswith("s3://"):
        return f"Additional file has invalid S3 path: {s3_path}"
    # Directories are zipped before upload, so the target must be a .zip key.
    if local.is_dir() and not s3_path.endswith(".zip"):
        return f"Additional directory must target a .zip S3 path: {s3_path}"
    return None


def upload_glue_job_files_from_config(
    config,
    dry_run=False,
    update_job_config=True,
    include_components=None,
    profile_name=None,
    auto_login=True,
):
    """Upload the local artifacts described by a Glue job config to S3.

    The script, the optional notebook and the additional dependency mappings
    resolved by ``resolve_glue_artifact_mappings`` are uploaded, and the job
    definition is optionally updated afterwards.

    Args:
        config: Passed to ``_load_config_data`` to obtain the config data and
            its path.
        dry_run: When true, only print what would be done and return ``True``
            once the script location and local script checks have passed.
        update_job_config: When true and ``"job-config"`` is selected, call
            ``update_glue_job_from_config`` after the uploads.
        include_components: Iterable of component names to process. A falsy
            value selects ``script``, ``notebook``,
            ``additional-python-modules``, ``extra-files`` and ``job-config``.
        profile_name: Forwarded to ``ensure_aws_login`` and
            ``update_glue_job_from_config``.
        auto_login: Forwarded to ``ensure_aws_login`` and
            ``update_glue_job_from_config``.

    Returns:
        ``False`` when the script location is required but missing, when the
        local script is required but unavailable, when uploads are pending but
        no S3 client could be created, when any additional mapping or
        dependency error was reported, or when the job update reports failure;
        ``True`` otherwise.
    """
    config_data, config_path = _load_config_data(config)
    include_components = set(
        include_components
        or {
            "script",
            "notebook",
            "additional-python-modules",
            "extra-files",
            "job-config",
        }
    )
    artifact_resolution = resolve_glue_artifact_mappings(
        config_data,
        include_components=include_components,
    )
    notebook_enabled = artifact_resolution["notebook_enabled"]
    script_location = artifact_resolution["script_s3_path"]

    # The notebook key is derived from the script key, so a notebook upload
    # needs the script location as much as a script upload does.
    needs_script_location = "script" in include_components or notebook_enabled
    if needs_script_location and not script_location:
        print(f"Missing Command.ScriptLocation in {config_path or 'config data'}")
        return False

    if script_location:
        bucket_name, script_key = parse_s3_bucket_and_prefix(script_location)
        notebook_key = _script_location_to_notebook_key(script_key)
    else:
        bucket_name = None
        script_key = None
        notebook_key = None
    script_path = artifact_resolution["script_path"]
    notebook_path = artifact_resolution["notebook_path"]

    script_available = script_path.exists()
    notebook_available = bool(notebook_path and Path(notebook_path).exists())

    # Generate whichever of script/notebook is missing from the other one.
    # The helpers return True when they generated (or, in dry-run, would
    # generate) the file.
    if "script" in include_components and notebook_path:
        script_available = (
            _ensure_local_script_from_notebook(
                notebook_path,
                script_path,
                dry_run=dry_run,
            )
            or script_available
        )
    if notebook_enabled and notebook_path:
        notebook_available = (
            _ensure_local_notebook_from_script(
                script_path,
                notebook_path,
                config_data=config_data,
                include_magics=True,
                dry_run=dry_run,
            )
            or notebook_available
        )

    if "script" in include_components and not script_available:
        print(f"Local script not found: {script_path}")
        return False

    additional_mappings = [
        mapping
        for mapping in artifact_resolution["mappings"]
        if mapping["artifact_type"] not in {"script", "notebook"}
    ]
    additional_errors = artifact_resolution["dependency_warnings"]
    job_update_requested = update_job_config and "job-config" in include_components

    if dry_run:
        if "script" in include_components:
            print(
                f"Would upload script: {script_path} -> s3://{bucket_name}/{script_key}"
            )
        if notebook_enabled:
            if notebook_available:
                print(
                    f"Would upload notebook: {notebook_path} -> s3://{bucket_name}/{notebook_key}"
                )
            else:
                print(f"Notebook not found locally: {notebook_path}")
        for error in additional_errors:
            print(f"Additional files error: {error}")
        for mapping in additional_mappings:
            local = Path(mapping["local_path"])
            s3_path = mapping["s3_path"]
            problem = _additional_mapping_problem(local, s3_path)
            if problem:
                print(problem)
                continue
            if local.is_dir():
                print(f"Would zip directory: {local} -> {s3_path}")
                continue
            bucket, key = parse_s3_bucket_and_prefix(s3_path)
            print(f"Would upload additional file: {local} -> s3://{bucket}/{key}")
        if job_update_requested:
            update_glue_job_from_config(
                config_data,
                dry_run=True,
                profile_name=profile_name,
                auto_login=auto_login,
            )
        return True

    ok = True
    file_components = include_components.intersection(
        {
            "script",
            "notebook",
            "additional-python-modules",
            "extra-py-files",
            "extra-files",
        }
    )
    # Every branch below that calls s3.upload_file must be reflected here so
    # that a client is requested for it and its absence is detected up front.
    uploads_pending = bool(
        "script" in include_components
        or (notebook_enabled and notebook_available)
        or additional_mappings
    )
    needs_s3 = bool(file_components) or uploads_pending
    needs_aws = bool(needs_s3 or job_update_requested)
    session = (
        ensure_aws_login(profile_name=profile_name, auto_login=auto_login)
        if needs_aws
        else None
    )
    s3 = session.client("s3") if needs_s3 and session else None
    if uploads_pending and s3 is None:
        # Without this guard the first upload_file call fails with an
        # AttributeError on None instead of a clean failure result.
        print("AWS session unavailable; cannot upload files to S3")
        return False

    if "script" in include_components:
        s3.upload_file(str(script_path), bucket_name, script_key)
        print(f"Uploaded script: {script_path} -> s3://{bucket_name}/{script_key}")

    if notebook_enabled:
        if notebook_available:
            s3.upload_file(str(notebook_path), bucket_name, notebook_key)
            print(
                f"Uploaded notebook: {notebook_path} -> s3://{bucket_name}/{notebook_key}"
            )
        else:
            # A missing notebook is reported but does not fail the upload.
            print(f"Notebook not found locally: {notebook_path}")

    for error in additional_errors:
        print(f"Additional files error: {error}")
        ok = False

    for mapping in additional_mappings:
        local = Path(mapping["local_path"])
        s3_path = mapping["s3_path"]
        problem = _additional_mapping_problem(local, s3_path)
        if problem:
            print(problem)
            ok = False
            continue
        bucket, key = parse_s3_bucket_and_prefix(s3_path)
        if local.is_dir():
            # The archive only has to live until the upload finishes.
            with tempfile.TemporaryDirectory() as temp_dir:
                zip_path = Path(temp_dir) / f"{local.name}.zip"
                _zip_directory(local, zip_path)
                s3.upload_file(str(zip_path), bucket, key)
            print(f"Uploaded additional directory: {local} -> s3://{bucket}/{key}")
            continue
        s3.upload_file(str(local), bucket, key)
        print(f"Uploaded additional file: {local} -> s3://{bucket}/{key}")

    if job_update_requested:
        updated = update_glue_job_from_config(
            config_data,
            dry_run=False,
            profile_name=profile_name,
            auto_login=auto_login,
        )
        ok = ok and updated

    return ok
|
|
956
|
+
|
|
957
|
+
|
|
958
|
+
def clear_notebook_outputs(notebook):
    """Clear the outputs of every code cell in a Jupyter notebook.

    Args:
        notebook (str | Path | dict): The notebook to process, given as:
            - a path to an existing notebook file,
            - a JSON string holding the notebook content, or
            - a dictionary holding the notebook content (modified in place).

    Raises:
        ValueError: If a string/path argument is neither an existing file
            containing valid JSON nor a valid JSON string.

    Returns:
        dict: The notebook dictionary with ``outputs`` set to ``[]`` on each
        code cell. Nothing is written back to disk.

    Example:
        cleaned = clear_notebook_outputs("path/to/notebook.ipynb")
    """

    if isinstance(notebook, (str, Path)):
        try:
            is_existing_file = Path(notebook).exists()
        except (OSError, ValueError):
            # A JSON document is rarely a legal file name; the existence
            # check can fail (e.g. "File name too long") instead of returning
            # False, which must not prevent parsing it as JSON.
            is_existing_file = False
        try:
            if is_existing_file:
                notebook = json.loads(Path(notebook).read_text(encoding="utf-8"))
            else:
                # str() so that a non-existent Path is reported through the
                # documented ValueError rather than a TypeError from json.
                notebook = json.loads(str(notebook))
        except json.JSONDecodeError as exc:
            raise ValueError(
                "The provided string is neither a valid path nor a valid JSON string."
            ) from exc

    for cell in notebook.get("cells", []):
        if cell.get("cell_type") == "code":
            cell["outputs"] = []

    return notebook
|
|
996
|
+
|
|
997
|
+
|
|
998
|
+
def _new_code_cell(source_lines):
|
|
999
|
+
return {
|
|
1000
|
+
"cell_type": "code",
|
|
1001
|
+
"id": str(uuid.uuid4()),
|
|
1002
|
+
"metadata": {
|
|
1003
|
+
"tags": [],
|
|
1004
|
+
"trusted": True,
|
|
1005
|
+
"vscode": {"languageId": "python_glue_session"},
|
|
1006
|
+
},
|
|
1007
|
+
"source": source_lines,
|
|
1008
|
+
"execution_count": None,
|
|
1009
|
+
"outputs": [],
|
|
1010
|
+
}
|
|
1011
|
+
|
|
1012
|
+
|
|
1013
|
+
def _new_markdown_cell(source_lines):
|
|
1014
|
+
return {
|
|
1015
|
+
"cell_type": "markdown",
|
|
1016
|
+
"id": str(uuid.uuid4()),
|
|
1017
|
+
"metadata": {},
|
|
1018
|
+
"source": source_lines,
|
|
1019
|
+
}
|
|
1020
|
+
|
|
1021
|
+
|
|
1022
|
+
def _parse_percent_cell_marker(raw_line):
|
|
1023
|
+
match = re.match(r"^\s*#\s*%%(.*)$", raw_line)
|
|
1024
|
+
if not match:
|
|
1025
|
+
return None
|
|
1026
|
+
|
|
1027
|
+
marker_text = match.group(1).strip()
|
|
1028
|
+
return {
|
|
1029
|
+
"marker_text": marker_text,
|
|
1030
|
+
"is_markdown": "markdown" in marker_text.casefold(),
|
|
1031
|
+
}
|
|
1032
|
+
|
|
1033
|
+
|
|
1034
|
+
def _canonical_code_cell_marker():
|
|
1035
|
+
return "#%%\n"
|
|
1036
|
+
|
|
1037
|
+
|
|
1038
|
+
def _code_cell_has_payload(source_lines):
    """Tell whether any line is something other than a cell marker or blank.

    Returns ``True`` as soon as a line is found that is neither a ``# %%``
    marker nor whitespace-only, ``False`` when there is none.
    """
    return any(
        not _parse_percent_cell_marker(text) and bool(text.strip())
        for text in source_lines
    )
|
|
1045
|
+
|
|
1046
|
+
|
|
1047
|
+
def _code_cell_starts_with_percent_marker(source_lines):
    """Tell whether the first non-blank line is a ``# %%`` cell marker.

    Returns ``False`` when every line is blank or there are no lines.
    """
    first_content = next((text for text in source_lines if text.strip()), None)
    if first_content is None:
        return False
    return _parse_percent_cell_marker(first_content) is not None
|
|
1053
|
+
|
|
1054
|
+
|
|
1055
|
+
def _ensure_local_script_from_notebook(notebook_path, script_path, *, dry_run=False):
|
|
1056
|
+
notebook_path = Path(notebook_path)
|
|
1057
|
+
script_path = Path(script_path)
|
|
1058
|
+
if script_path.exists() or not notebook_path.exists():
|
|
1059
|
+
return False
|
|
1060
|
+
if dry_run:
|
|
1061
|
+
print(f"Would generate script from notebook: {notebook_path} -> {script_path}")
|
|
1062
|
+
return True
|
|
1063
|
+
convert_notebook_to_script(notebook_path, script_path)
|
|
1064
|
+
print(f"Generated script from notebook: {notebook_path} -> {script_path}")
|
|
1065
|
+
return True
|
|
1066
|
+
|
|
1067
|
+
|
|
1068
|
+
def _ensure_local_notebook_from_script(
|
|
1069
|
+
script_path,
|
|
1070
|
+
notebook_path,
|
|
1071
|
+
*,
|
|
1072
|
+
config_data=None,
|
|
1073
|
+
include_magics=True,
|
|
1074
|
+
dry_run=False,
|
|
1075
|
+
):
|
|
1076
|
+
script_path = Path(script_path)
|
|
1077
|
+
notebook_path = Path(notebook_path)
|
|
1078
|
+
if notebook_path.exists() or not script_path.exists():
|
|
1079
|
+
return False
|
|
1080
|
+
if dry_run:
|
|
1081
|
+
print(f"Would generate notebook from script: {script_path} -> {notebook_path}")
|
|
1082
|
+
return True
|
|
1083
|
+
convert_script_to_notebook(
|
|
1084
|
+
script_path,
|
|
1085
|
+
notebook_path,
|
|
1086
|
+
config_data=config_data,
|
|
1087
|
+
include_magics=include_magics,
|
|
1088
|
+
)
|
|
1089
|
+
print(f"Generated notebook from script: {script_path} -> {notebook_path}")
|
|
1090
|
+
return True
|
|
1091
|
+
|
|
1092
|
+
|
|
1093
|
+
def _script_to_cells(script_text):
    """Split percent-format script text into notebook cell dictionaries.

    Lines are collected into a buffer that starts as a code cell; every
    ``# %%`` marker closes the buffer and opens a new one whose type follows
    the marker. Code cells begin with the canonical marker line, markdown
    cells keep their lines verbatim and do not include the marker line.
    Buffers without content (no payload for code, only blank lines for
    markdown) produce no cell.
    """
    notebook_cells = []

    def emit(kind, body):
        # Turn a finished buffer into a cell, unless it carries no content.
        if not body:
            return
        if kind == "code":
            if _code_cell_has_payload(body):
                notebook_cells.append(_new_code_cell(body))
            return
        if any(text.strip() for text in body):
            notebook_cells.append(_new_markdown_cell(body))

    pending = []
    pending_kind = "code"

    for raw_line in script_text.splitlines(keepends=True):
        marker = _parse_percent_cell_marker(raw_line)
        if not marker:
            pending.append(raw_line)
            continue

        opens_markdown = marker["is_markdown"]
        if pending_kind == "code" and not _code_cell_has_payload(pending):
            # The code buffer holds only markers/blank lines so far: drop it
            # for a markdown cell, otherwise carry its marker lines over.
            if opens_markdown:
                pending = []
            else:
                pending = [
                    earlier for earlier in pending if _parse_percent_cell_marker(earlier)
                ]
        else:
            emit(pending_kind, pending)
            pending = []

        if opens_markdown:
            pending_kind = "markdown"
        else:
            pending_kind = "code"
            pending.append(_canonical_code_cell_marker())

    emit(pending_kind, pending)
    return notebook_cells
|
|
1136
|
+
|
|
1137
|
+
|
|
1138
|
+
def _strip_generated_script_config_cells(script_text):
    """Remove cells holding Glue magic/config lines from script text.

    The text is cut into cells at every ``# %%`` marker. A cell containing a
    magic-looking line is reduced to the part starting at its first real code
    line (not blank, not a marker, not a comment), or dropped entirely when
    it has no such line. All other cells are kept unchanged.
    """

    def looks_like_glue_magic(text):
        # Cell markers also contain "%%" but are never magics.
        if _parse_percent_cell_marker(text):
            return False
        body = text.lstrip()
        if body.startswith("#"):
            # Magics may be stored commented out; look behind one "#".
            body = body[1:].lstrip()
        if body.startswith("%"):
            return True
        return "%%configure" in body or "AWS Glue configs" in body

    def chunk_by_marker(lines):
        cut_points = [
            pos for pos, text in enumerate(lines) if _parse_percent_cell_marker(text)
        ]
        if not cut_points:
            return [lines]
        if cut_points[0] != 0:
            # Lines ahead of the first marker form a cell of their own.
            cut_points = [0] + cut_points
        chunks = []
        for n, begin in enumerate(cut_points):
            end = cut_points[n + 1] if n + 1 < len(cut_points) else len(lines)
            chunks.append(lines[begin:end])
        return chunks

    def is_real_code(text):
        bare = text.strip()
        if not bare:
            return False
        if _parse_percent_cell_marker(text):
            return False
        return not bare.startswith("#")

    survivors = []
    for chunk in chunk_by_marker(script_text.splitlines(keepends=True)):
        if not any(looks_like_glue_magic(text) for text in chunk):
            survivors.extend(chunk)
            continue
        code_start = next(
            (pos for pos, text in enumerate(chunk) if is_real_code(text)),
            None,
        )
        if code_start is not None:
            survivors.extend(chunk[code_start:])

    return "".join(survivors)
|
|
1184
|
+
|
|
1185
|
+
|
|
1186
|
+
def _build_magic_cells(config_data):
|
|
1187
|
+
if not config_data:
|
|
1188
|
+
return []
|
|
1189
|
+
|
|
1190
|
+
cells = [_new_markdown_cell(["# AWS Glue configs\n"])]
|
|
1191
|
+
cells.extend(
|
|
1192
|
+
_new_code_cell(source_lines)
|
|
1193
|
+
for source_lines in build_magic_cell_sources(config_data)
|
|
1194
|
+
)
|
|
1195
|
+
|
|
1196
|
+
cells.append(_new_markdown_cell(["# AWS Glue Script\n"]))
|
|
1197
|
+
|
|
1198
|
+
return cells
|
|
1199
|
+
|
|
1200
|
+
|
|
1201
|
+
def _cell_source_to_lines(source):
|
|
1202
|
+
if source is None:
|
|
1203
|
+
return []
|
|
1204
|
+
if isinstance(source, str):
|
|
1205
|
+
return source.splitlines(keepends=True)
|
|
1206
|
+
lines: list[str] = []
|
|
1207
|
+
for part in source:
|
|
1208
|
+
if part.endswith("\n"):
|
|
1209
|
+
lines.append(part)
|
|
1210
|
+
else:
|
|
1211
|
+
lines.append(f"{part}\n")
|
|
1212
|
+
return lines
|
|
1213
|
+
|
|
1214
|
+
|
|
1215
|
+
def _format_markdown_lines(source_lines):
|
|
1216
|
+
formatted: list[str] = []
|
|
1217
|
+
for line in source_lines:
|
|
1218
|
+
stripped = line.rstrip("\n")
|
|
1219
|
+
if stripped.strip():
|
|
1220
|
+
formatted.append(f"# {stripped}\n")
|
|
1221
|
+
else:
|
|
1222
|
+
formatted.append("#\n")
|
|
1223
|
+
return formatted
|
|
1224
|
+
|
|
1225
|
+
|
|
1226
|
+
def _is_magic_cell(source_lines):
|
|
1227
|
+
for line in source_lines:
|
|
1228
|
+
stripped = line.lstrip()
|
|
1229
|
+
if not stripped:
|
|
1230
|
+
continue
|
|
1231
|
+
if stripped.startswith("%"):
|
|
1232
|
+
return True
|
|
1233
|
+
return False
|
|
1234
|
+
|
|
1235
|
+
|
|
1236
|
+
def _default_notebook_metadata():
|
|
1237
|
+
return {
|
|
1238
|
+
"kernelspec": {
|
|
1239
|
+
"name": "glue_pyspark",
|
|
1240
|
+
"display_name": "Glue PySpark",
|
|
1241
|
+
"language": "python",
|
|
1242
|
+
},
|
|
1243
|
+
"language_info": {
|
|
1244
|
+
"name": "Python_Glue_Session",
|
|
1245
|
+
"mimetype": "text/x-python",
|
|
1246
|
+
"codemirror_mode": {"name": "python", "version": 3},
|
|
1247
|
+
"pygments_lexer": "python3",
|
|
1248
|
+
"file_extension": ".py",
|
|
1249
|
+
},
|
|
1250
|
+
}
|
|
1251
|
+
|
|
1252
|
+
|
|
1253
|
+
def convert_notebook_to_script(
    notebook_path,
    script_path,
):
    """Write a percent-format Python script generated from a notebook.

    Markdown cells and code cells that contain a ``%`` magic line are written
    as ``#%% [markdown]`` cells with every line commented out. Other code
    cells are written verbatim, preceded by the canonical ``#%%`` marker
    unless they already start with a marker. Cells of any other type are
    skipped. Each written cell is followed by a blank line.

    Args:
        notebook_path: Path of the notebook to read (UTF-8 JSON).
        script_path: Path of the script to write; missing parent directories
            are created and an existing file is overwritten.

    Returns:
        Path: ``script_path`` as a ``Path`` object.
    """
    notebook_path = Path(notebook_path)
    script_path = Path(script_path)

    # Read and write as UTF-8 explicitly: the platform default encoding is
    # locale dependent and would corrupt or reject non-ASCII content.
    notebook = json.loads(notebook_path.read_text(encoding="utf-8"))
    lines: list[str] = []

    for cell in notebook.get("cells", []):
        cell_type = cell.get("cell_type")
        source_lines = _cell_source_to_lines(cell.get("source"))
        if cell_type == "markdown":
            lines.append("#%% [markdown]\n")
            lines.extend(_format_markdown_lines(source_lines))
        elif cell_type == "code":
            if _is_magic_cell(source_lines):
                # Magics are not valid Python, so they are kept as comments.
                lines.append("#%% [markdown]\n")
                lines.extend(_format_markdown_lines(source_lines))
            else:
                if not _code_cell_starts_with_percent_marker(source_lines):
                    lines.append(_canonical_code_cell_marker())
                lines.extend(source_lines)
        else:
            continue
        # Terminate the cell's last line before adding the separator line.
        if lines and not lines[-1].endswith("\n"):
            lines[-1] = f"{lines[-1]}\n"
        lines.append("\n")

    script_path.parent.mkdir(parents=True, exist_ok=True)
    script_path.write_text("".join(lines), encoding="utf-8")
    return script_path
|
|
1286
|
+
|
|
1287
|
+
|
|
1288
|
+
def convert_script_to_notebook(
    script_path,
    notebook_path,
    config_data=None,
    include_magics=True,
):
    """Write a notebook generated from a percent-format Python script.

    ``job.commit()`` lines and generated Glue config cells are removed from
    the script text before it is split into cells. When ``include_magics``
    and ``config_data`` are both truthy, the cells from
    ``_build_magic_cells(config_data)`` are placed in front. If the target
    notebook already exists, its ``metadata``, ``nbformat`` and
    ``nbformat_minor`` are reused; its cells are replaced.

    Args:
        script_path: Path of the script to read (UTF-8).
        notebook_path: Path of the notebook to write; missing parent
            directories are created.
        config_data: Job configuration used to build the magic cells.
        include_magics: Whether to prepend the magic cells.

    Returns:
        Path: ``notebook_path`` as a ``Path`` object.
    """
    script_path = Path(script_path)
    notebook_path = Path(notebook_path)

    # Read as UTF-8 explicitly: the platform default encoding is locale
    # dependent and would corrupt or reject non-ASCII content.
    script_text = script_path.read_text(encoding="utf-8")
    # Only horizontal whitespace may sit between the call and a trailing
    # comment. With "\s*" there, the match crossed the line break and also
    # deleted a comment (or "#%%" cell marker) on the following line. Blank
    # lines after the call are still consumed, as before.
    script_text = re.sub(
        r"(?m)^\s*job\.commit\(\)[ \t]*(?:#.*)?$(?:\n[ \t]*$)*\n?",
        "",
        script_text,
    )
    script_text = _strip_generated_script_config_cells(script_text)
    cells = _script_to_cells(script_text)
    if include_magics and config_data:
        cells = _build_magic_cells(config_data) + cells

    if notebook_path.exists():
        template = json.loads(notebook_path.read_text(encoding="utf-8"))
        metadata = template.get("metadata") or _default_notebook_metadata()
        nbformat = template.get("nbformat", 4)
        nbformat_minor = template.get("nbformat_minor", 5)
    else:
        metadata = _default_notebook_metadata()
        nbformat = 4
        nbformat_minor = 5

    notebook = {
        "metadata": metadata,
        "nbformat": nbformat,
        "nbformat_minor": nbformat_minor,
        "cells": cells,
    }

    notebook_path.parent.mkdir(parents=True, exist_ok=True)
    notebook_path.write_text(json.dumps(notebook, indent=4), encoding="utf-8")
    return notebook_path
|