gluekit 1.0.1.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. gluekit/__init__.py +7 -0
  2. gluekit/app.py +0 -0
  3. gluekit/cli.py +64 -0
  4. gluekit/commands/__init__.py +1 -0
  5. gluekit/commands/add.py +455 -0
  6. gluekit/commands/build.py +816 -0
  7. gluekit/commands/checkout.py +114 -0
  8. gluekit/commands/clone.py +516 -0
  9. gluekit/commands/config_commands.py +180 -0
  10. gluekit/commands/constants.py +47 -0
  11. gluekit/commands/convert.py +336 -0
  12. gluekit/commands/edit.py +1104 -0
  13. gluekit/commands/helpers.py +1068 -0
  14. gluekit/commands/init.py +798 -0
  15. gluekit/commands/list.py +16 -0
  16. gluekit/commands/local_commands.py +680 -0
  17. gluekit/commands/pull.py +374 -0
  18. gluekit/commands/push.py +251 -0
  19. gluekit/commands/remove.py +161 -0
  20. gluekit/commands/run.py +126 -0
  21. gluekit/commands/status.py +97 -0
  22. gluekit/commands/sync.py +97 -0
  23. gluekit/commands/update.py +104 -0
  24. gluekit/job_mgmt/__init__.py +0 -0
  25. gluekit/job_mgmt/glue_jobs.py +1323 -0
  26. gluekit/job_mgmt/magics.py +122 -0
  27. gluekit/job_mgmt/resources/__init__.py +0 -0
  28. gluekit/job_mgmt/resources/glue_job_schema.json +40341 -0
  29. gluekit/job_mgmt/resources/magic_map.json +83 -0
  30. gluekit/job_mgmt/schema.py +165 -0
  31. gluekit/local/__init__.py +6 -0
  32. gluekit/local/awsglue/__init__.py +1 -0
  33. gluekit/local/awsglue/context.py +30 -0
  34. gluekit/local/awsglue/job.py +9 -0
  35. gluekit/local/awsglue/utils.py +17 -0
  36. gluekit/local/local.py +434 -0
  37. gluekit/local/local_fixtures.py +337 -0
  38. gluekit/local/pyspark/__init__.py +7 -0
  39. gluekit/local/pyspark/context.py +31 -0
  40. gluekit/local/pyspark/sql/__init__.py +6 -0
  41. gluekit/local/pyspark/sql/session.py +29 -0
  42. gluekit-1.0.1.dev1.dist-info/METADATA +1176 -0
  43. gluekit-1.0.1.dev1.dist-info/RECORD +46 -0
  44. gluekit-1.0.1.dev1.dist-info/WHEEL +5 -0
  45. gluekit-1.0.1.dev1.dist-info/entry_points.txt +2 -0
  46. gluekit-1.0.1.dev1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1323 @@
1
+ """
2
+ Glue job sync tooling for real AWS Glue resources.
3
+
4
+ This module pulls AWS Glue job configs and scripts into the repo so they can be
5
+ tracked alongside migration artifacts. The main flow:
6
+
7
+ - Fetch Glue jobs via boto3 using an AWS CLI credential profile when provided.
8
+ - Download the job config JSON, the script (.py), and (when present) the
9
+ notebook (.ipynb) from the job's ScriptLocation bucket.
10
+ - If the job is a NOTEBOOK, remove outputs to keep diffs clean.
11
+
12
+ These helpers are different from the local fixture commands in
13
+ ``gluekit.commands.local_commands``: they use real AWS Glue and S3 APIs, and
14
+ ``profile_name`` means an AWS CLI/boto3 credential profile.
15
+
16
+ Preferred usage:
17
+ - Use the Typer CLI for a git-like flow:
18
+ gluekit pull <job_name> or gluekit pull "*"
19
+
20
+ Outputs:
21
+ - glue/glue_full_job_list.csv: complete job list for auditing.
22
+ - glue/configs/<slug>.json: job config with SourceControlDetails metadata.
23
+ - glue/scripts/<slug>.py and glue/notebooks/<slug>.ipynb (when available).
24
+ """
25
+
26
+ import csv
27
+ import json
28
+ import re
29
+ import subprocess
30
+ import tempfile
31
+ import uuid
32
+ import zipfile
33
+ from copy import deepcopy
34
+ from datetime import datetime
35
+ from pathlib import Path
36
+
37
+ import boto3
38
+ from botocore.exceptions import BotoCoreError, ClientError
39
+ from slugify import slugify
40
+
41
+ from dotenv import load_dotenv
42
+ from .magics import build_magic_cell_sources
43
+ from .schema import DEPRECATED_JOB_FIELDS, get_updateable_job_fields
44
+
45
+ load_dotenv()
46
+
47
+
48
def normalize_glue_config_data(config_data):
    """Flatten a ``{"Job": {...}, ...}`` wrapper into a single job dict.

    Input that is not a dict, or that has no dict under ``"Job"``, is returned
    unchanged (same object). Otherwise the result is a deep copy of the inner
    job dict, extended with the wrapper's other top-level keys:

    - ``SourceControlDetails`` dicts are merged, with values already present
      on the inner job winning over the wrapper's values.
    - Any other wrapper key is added only when the inner job lacks it.
    """
    inner = config_data.get("Job") if isinstance(config_data, dict) else None
    if not isinstance(inner, dict):
        return config_data

    flattened = deepcopy(inner)
    for field, outer_value in config_data.items():
        if field == "Job":
            continue
        if field != "SourceControlDetails" or not isinstance(outer_value, dict):
            # Plain fields: the inner job's value takes precedence.
            flattened.setdefault(field, outer_value)
            continue

        # Start from the wrapper's details, then let the inner job override.
        combined = deepcopy(outer_value)
        nested = flattened.get("SourceControlDetails")
        if isinstance(nested, dict):
            combined.update(nested)
        flattened["SourceControlDetails"] = combined

    return flattened
72
+
73
+
74
def _new_boto3_session(profile_name=None):
    """Create a boto3 session, bound to ``profile_name`` when one is given."""
    session_kwargs = {"profile_name": profile_name} if profile_name else {}
    return boto3.Session(**session_kwargs)
78
+
79
+
80
def ensure_aws_login(profile_name=None, auto_login=True):
    """
    Return a boto3 session whose credentials pass an STS identity check.

    The session for ``profile_name`` is probed with ``sts:GetCallerIdentity``.
    If the probe fails and ``auto_login`` is false, the boto error propagates.
    Otherwise ``aws sso login`` is run (with ``--profile`` when a profile was
    given) and a freshly created session is returned.

    Raises:
        RuntimeError: the AWS CLI is not on PATH, or the login command exited
            with a non-zero status.
    """
    current = _new_boto3_session(profile_name)
    identity_client = current.client("sts")
    try:
        identity_client.get_caller_identity()
    except (BotoCoreError, ClientError):
        if not auto_login:
            raise
    else:
        return current

    shown_profile = profile_name if profile_name else "default"
    print(
        f"AWS auth missing/expired for profile '{shown_profile}'. Running aws sso login..."
    )
    command = ["aws", "sso", "login"]
    if profile_name:
        command += ["--profile", profile_name]
    try:
        subprocess.run(command, check=True)
    except FileNotFoundError as missing_cli:
        raise RuntimeError(
            "AWS CLI is not installed or not on PATH. It is required for SSO login."
        ) from missing_cli
    except subprocess.CalledProcessError as failed_login:
        raise RuntimeError("aws sso login failed or was canceled.") from failed_login

    # A new session is needed so the refreshed SSO token is picked up.
    return _new_boto3_session(profile_name)
112
+
113
+
114
def custom_serializer(obj):
    """``json.dumps`` ``default`` hook: datetimes become ISO 8601 strings.

    Raises:
        TypeError: for any value that is not a ``datetime``.
    """
    if not isinstance(obj, datetime):
        raise TypeError(f"Type {type(obj)} not serializable")
    return obj.isoformat()
118
+
119
+
120
def list_glue_jobs(profile_name=None, auto_login=True):
    """Return every Glue job definition, following ``NextToken`` pagination."""
    glue_client = ensure_aws_login(
        profile_name=profile_name, auto_login=auto_login
    ).client("glue")

    collected = []
    page_kwargs = {}
    while True:
        page = glue_client.get_jobs(**page_kwargs)
        collected.extend(page["Jobs"])
        token = page.get("NextToken")
        if not token:
            return collected
        # Ask for the page that follows the one just consumed.
        page_kwargs = {"NextToken": token}
134
+
135
+
136
def get_glue_job_config(name, profile_name=None, auto_login=True):
    """Fetch and normalize a single Glue job's config; nothing is written locally."""
    glue_client = ensure_aws_login(
        profile_name=profile_name, auto_login=auto_login
    ).client("glue")
    response = glue_client.get_job(JobName=name)
    return normalize_glue_config_data(response.get("Job"))
141
+
142
+
143
def parse_s3_bucket_and_prefix(s3_path):
    """Split ``s3://bucket/key`` (or ``bucket/key``) into ``(bucket, key)``.

    Only a leading ``s3://`` scheme is removed; the same text appearing later
    in the key is preserved. A path ending in ``/`` right after the bucket
    yields an empty key.

    Raises:
        ValueError: the path has no ``/`` separating bucket from key
            (for example ``s3://bucket``).
    """
    scheme = "s3://"
    if s3_path.startswith(scheme):
        s3_path = s3_path[len(scheme):]
    bucket_name, separator, prefix = s3_path.partition("/")
    if not separator:
        # Callers rely on ValueError to detect bucket-only paths.
        raise ValueError(f"S3 path has no key after the bucket name: {s3_path!r}")
    return bucket_name, prefix
147
+
148
+
149
def _strip_deprecated_fields(job_data):
    """Remove every ``DEPRECATED_JOB_FIELDS`` key from ``job_data`` in place and return it."""
    for deprecated_key in DEPRECATED_JOB_FIELDS:
        if deprecated_key in job_data:
            del job_data[deprecated_key]
    return job_data
153
+
154
+
155
def download_glue_job_files(
    name,
    config_path,
    local_path=None,
    remove_nb_outputs=True,
    include_components=None,
    existing_sc=None,
    profile_name=None,
    auto_login=True,
):
    """Pull one Glue job's config, script, notebook and extra files from AWS.

    Args:
        name: Glue job name passed to ``glue.get_job``.
        config_path: Local path the job config JSON is written to.
        local_path: Local script path; defaults to
            ``glue/scripts/<slug-of-job-name>.py``.
        remove_nb_outputs: Clear notebook outputs before writing the notebook.
        include_components: Iterable of component names to download
            (``"config"``, ``"script"``, ``"notebook"``, ``"extra-py-files"``,
            ``"extra-files"``). ``None`` selects config, script and notebook.
        existing_sc: Previously stored ``SourceControlDetails``; its keys are
            kept unless the freshly fetched job already defines them.
        profile_name: AWS CLI/boto3 credential profile.
        auto_login: Run ``aws sso login`` when credentials are invalid.

    Returns:
        Always ``None``; results are reported through printed messages and
        the files written to disk.
    """
    # Accept None or any iterable: later code uses both ``in`` and
    # ``.intersection``, which a None default or a list would break.
    if include_components is None:
        include_components = {"config", "script", "notebook"}
    include_components = set(include_components)

    session = ensure_aws_login(profile_name=profile_name, auto_login=auto_login)
    glue = session.client("glue")
    job = normalize_glue_config_data(glue.get_job(JobName=name).get("Job"))
    if not job:
        print(f"Job {name} not found")
        return None
    s3 = session.client("s3")

    bucket_name, script_location = parse_s3_bucket_and_prefix(
        job["Command"]["ScriptLocation"]
    )

    # Local paths. ``local_path`` is stringified so a Path argument can be
    # stored in the JSON config and handed to boto3 unchanged.
    config_file_name = str(config_path)
    if local_path:
        script_file_name = str(local_path)
    else:
        script_file_name = f"glue/scripts/{slugify(job['Name'])}.py"
    script_path = Path(script_file_name)
    notebook_file_name = _resolve_notebook_path(script_path)

    # Remote notebook key, derived from the script key.
    notebook_location = _script_location_to_notebook_key(script_location)
    print(f"Notebook location: {notebook_location}")

    notebook_dict = None
    if "notebook" in include_components:
        response = s3.list_objects_v2(Bucket=bucket_name, Prefix=notebook_location)
        # A prefix listing also matches longer keys (e.g. "job.ipynb.bak"),
        # so require an exact key match before calling get_object.
        notebook_exists = any(
            entry.get("Key") == notebook_location
            for entry in response.get("Contents", [])
        )
        if notebook_exists:
            response = s3.get_object(Bucket=bucket_name, Key=notebook_location)
            notebook_content = response["Body"].read().decode("utf-8")
            notebook_dict = json.loads(notebook_content)
            if remove_nb_outputs:
                notebook_dict = clear_notebook_outputs(notebook_dict)
                print(f"Cleared outputs in notebook {notebook_file_name}")
        else:
            print(
                f"No notebook found at {notebook_location}, skipping notebook download."
            )

    # Record where the artifacts live locally alongside the job config.
    job.setdefault("SourceControlDetails", {})
    if existing_sc:
        for key, value in existing_sc.items():
            job["SourceControlDetails"].setdefault(key, value)
    job["SourceControlDetails"]["NotebookLocation"] = notebook_file_name
    job["SourceControlDetails"]["ScriptLocation"] = script_file_name
    job["SourceControlDetails"].pop("LocalPath", None)
    job["SourceControlDetails"]["RemoveNbOutputs"] = remove_nb_outputs
    _strip_deprecated_fields(job)

    # Write files
    if "config" in include_components:
        config_file = Path(config_file_name)
        config_file.parent.mkdir(parents=True, exist_ok=True)
        config_file.write_text(json.dumps(job, indent=4, default=custom_serializer))
        print(f"Configuration file created: {config_file_name}")

    if "script" in include_components:
        script_path.parent.mkdir(parents=True, exist_ok=True)
        s3.download_file(bucket_name, script_location, script_file_name)
        print(f"Script downloaded: {script_file_name}")

    if notebook_dict and "notebook" in include_components:
        notebook_path = Path(notebook_file_name)
        notebook_path.parent.mkdir(parents=True, exist_ok=True)
        notebook_path.write_text(
            json.dumps(notebook_dict, indent=4, default=custom_serializer)
        )
        print(f"Notebook downloaded: {notebook_file_name}")

    if "script" in include_components and "notebook" in include_components:
        # Fill in whichever of the script/notebook pair is missing locally.
        _ensure_local_script_from_notebook(notebook_file_name, script_file_name)
        _ensure_local_notebook_from_script(
            script_file_name,
            notebook_file_name,
            config_data=job,
            include_magics=True,
        )

    if include_components.intersection({"extra-py-files", "extra-files"}):
        sc_for_extras = job.get("SourceControlDetails", {})
        additional_mappings, additional_errors = _collect_additional_file_mappings(
            sc_for_extras
        )
        for error in additional_errors:
            print(f"Additional files error: {error}")

        include_sources = set()
        if "extra-py-files" in include_components:
            include_sources.update({"AdditionalPythonFiles", "ExtraPyFiles"})
        if "extra-files" in include_components:
            include_sources.update({"AdditionalFiles", "ExtraFiles"})

        additional_mappings = [
            mapping
            for mapping in additional_mappings
            if mapping["source"] in include_sources
        ]

        if not additional_mappings:
            print("No additional files configured in SourceControlDetails; skipping.")
        for mapping in additional_mappings:
            local = Path(mapping["local_path"])
            s3_path = mapping["s3_path"]
            if not s3_path.startswith("s3://"):
                print(f"Additional file has invalid S3 path: {s3_path}")
                continue
            bucket, key = parse_s3_bucket_and_prefix(s3_path)
            is_zip = s3_path.endswith(".zip")
            # A remote .zip is unpacked when the local target is an existing
            # directory or is not itself named *.zip.
            treat_as_dir = is_zip and (local.is_dir() or local.suffix != ".zip")
            if treat_as_dir:
                local.mkdir(parents=True, exist_ok=True)
                with tempfile.TemporaryDirectory() as temp_dir:
                    zip_path = Path(temp_dir) / "additional_files.zip"
                    s3.download_file(bucket, key, str(zip_path))
                    with zipfile.ZipFile(zip_path, "r") as archive:
                        archive.extractall(local)
                print(f"Extracted additional archive: {s3_path} -> {local}")
            else:
                local.parent.mkdir(parents=True, exist_ok=True)
                s3.download_file(bucket, key, str(local))
                print(f"Downloaded additional file: {s3_path} -> {local}")

    print("S3 sync complete.")
310
+
311
+
312
+ def _resolve_notebook_path(script_path):
313
+ if "scripts" in script_path.parts:
314
+ return str(
315
+ Path(
316
+ *(
317
+ "notebooks" if part == "scripts" else part
318
+ for part in script_path.parts
319
+ )
320
+ ).with_suffix(".ipynb")
321
+ )
322
+ return str(Path("glue/notebooks") / script_path.with_suffix(".ipynb").name)
323
+
324
+
325
+ def _script_location_to_notebook_key(script_key):
326
+ if "scripts/" in script_key and script_key.endswith(".py"):
327
+ return script_key.replace("scripts/", "notebooks/").replace(".py", ".ipynb")
328
+ return script_key.replace(".py", ".ipynb")
329
+
330
+
331
def _load_config_data(config):
    """Load and normalize a job config given as a dict or a JSON file path.

    Returns:
        ``(config_data, config_path)``; ``config_path`` is ``None`` when a
        dict was supplied.

    Raises:
        ValueError: the file does not contain valid JSON.
        TypeError: ``config`` is neither a dict nor a ``str``/``Path``.
    """
    if isinstance(config, dict):
        return normalize_glue_config_data(config), None
    if not isinstance(config, (str, Path)):
        raise TypeError("config must be a dict or a path-like string")

    config_path = Path(config)
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        raw_config = json.loads(config_path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as exc:
        raise ValueError(f"Invalid JSON in {config_path}") from exc
    return normalize_glue_config_data(raw_config), config_path
344
+
345
+
346
+ def _collect_additional_file_mappings(sc):
347
+ mappings: list[dict[str, str]] = []
348
+ errors: list[str] = []
349
+ seen: set[tuple[str, str]] = set()
350
+ sources = [
351
+ ("AdditionalPythonFiles", sc.get("AdditionalPythonFiles")),
352
+ ("ExtraPyFiles", sc.get("ExtraPyFiles")),
353
+ ("AdditionalFiles", sc.get("AdditionalFiles")),
354
+ ("ExtraFiles", sc.get("ExtraFiles")),
355
+ ]
356
+
357
+ for label, items in sources:
358
+ if not items:
359
+ continue
360
+ if isinstance(items, dict):
361
+ items = [items]
362
+ if not isinstance(items, list):
363
+ errors.append(f"{label} must be a list or dict.")
364
+ continue
365
+ for item in items:
366
+ local_path = None
367
+ s3_path = None
368
+ if isinstance(item, dict):
369
+ local_path = (
370
+ item.get("LocalPath")
371
+ or item.get("local_path")
372
+ or item.get("localPath")
373
+ )
374
+ s3_path = (
375
+ item.get("S3Path") or item.get("s3_path") or item.get("s3Path")
376
+ )
377
+ elif isinstance(item, (list, tuple)) and len(item) == 2:
378
+ local_path, s3_path = item
379
+ else:
380
+ errors.append(f"{label} contains an unsupported entry: {item!r}")
381
+ continue
382
+
383
+ if not local_path or not s3_path:
384
+ errors.append(f"{label} entry missing LocalPath/S3Path: {item!r}")
385
+ continue
386
+
387
+ key = (str(local_path), str(s3_path))
388
+ if key in seen:
389
+ continue
390
+ seen.add(key)
391
+ mappings.append(
392
+ {
393
+ "local_path": str(local_path),
394
+ "s3_path": str(s3_path),
395
+ "source": label,
396
+ }
397
+ )
398
+
399
+ return mappings, errors
400
+
401
+
402
+ def _parse_csv_list(value):
403
+ if not value:
404
+ return []
405
+ return [item.strip() for item in next(csv.reader([value])) if item.strip()]
406
+
407
+
408
def _infer_local_dependency_path(s3_path, script_location, local_script_path):
    """Guess which existing local path corresponds to an S3 dependency.

    Candidates are tried in this order and the first that exists is returned:

    1. The S3 key used as a relative path.
    2. The same key under ``glue/`` (unless the key already starts with it).
    3. When the key lies under the remote script's directory, the matching
       path relative to the local script's directory.

    Returns ``None`` when ``s3_path`` is not an ``s3://`` string, cannot be
    parsed, or no candidate exists on disk.
    """
    if not (isinstance(s3_path, str) and s3_path.startswith("s3://")):
        return None

    try:
        object_key = parse_s3_bucket_and_prefix(s3_path)[1]
    except ValueError:
        return None

    key_path = Path(object_key)
    guesses = [key_path]
    if key_path.parts and key_path.parts[0] != "glue":
        guesses.append(Path("glue") / key_path)

    script_key = None
    if isinstance(script_location, str) and script_location.startswith("s3://"):
        try:
            script_key = parse_s3_bucket_and_prefix(script_location)[1]
        except ValueError:
            script_key = None

    if script_key is not None:
        remote_dir = Path(script_key).parent
        depth = len(remote_dir.parts)
        # Only keys stored beneath the script's remote directory can be
        # mirrored relative to the local script.
        if depth and key_path.parts[:depth] == remote_dir.parts:
            guesses.append(
                local_script_path.parent / key_path.relative_to(remote_dir)
            )

    checked = set()
    for guess in guesses:
        posix_form = guess.as_posix()
        if posix_form in checked:
            continue
        checked.add(posix_form)
        if guess.exists():
            return guess

    return None
448
+
449
+
450
def _collect_configured_dependency_mappings(config_data, explicit_mappings):
    """Map S3 dependencies named in ``DefaultArguments`` to local paths.

    Scans ``--additional-python-modules`` and ``--extra-py-files`` for
    ``s3://`` entries that are not already covered by ``explicit_mappings``
    and resolves each one with ``_infer_local_dependency_path``.

    Returns:
        ``(mappings, errors)``: ``mappings`` holds dicts with ``local_path``,
        ``s3_path`` and ``source``; ``errors`` names the entries for which no
        local path could be found. Both are empty when ``DefaultArguments``
        is not a dict.
    """
    default_args = config_data.get("DefaultArguments", {})
    if not isinstance(default_args, dict):
        return [], []

    command = config_data.get("Command", {})
    script_location = (
        command.get("ScriptLocation") if isinstance(command, dict) else None
    )

    sc = config_data.get("SourceControlDetails", {})
    if not isinstance(sc, dict):
        sc = {}

    job_name = config_data.get("Name") or "unknown"
    local_script_path = Path(
        sc.get("ScriptLocation")
        or sc.get("LocalPath")
        or f"glue/scripts/{slugify(job_name)}.py"
    )

    already_mapped = {entry["s3_path"] for entry in explicit_mappings}
    emitted = set()
    resolved = []
    unresolved = []

    argument_sources = {
        "--additional-python-modules": "DefaultArguments.--additional-python-modules",
        "--extra-py-files": "DefaultArguments.--extra-py-files",
    }
    for arg_name, source_label in argument_sources.items():
        for item in _parse_csv_list(default_args.get(arg_name)):
            # Non-S3 entries and explicitly mapped paths are not handled here.
            if not item.startswith("s3://") or item in already_mapped:
                continue

            local_path = _infer_local_dependency_path(
                item, script_location, local_script_path
            )
            if local_path is None:
                unresolved.append(
                    f"{arg_name} entry has no matching local path for upload: {item}"
                )
                continue

            local_posix = local_path.as_posix()
            pair = (local_posix, item)
            if pair in emitted:
                continue
            emitted.add(pair)
            resolved.append(
                {
                    "local_path": local_posix,
                    "s3_path": item,
                    "source": source_label,
                }
            )

    return resolved, unresolved
508
+
509
+
510
+ def _zip_directory(source_dir, zip_path):
511
+ source_dir = Path(source_dir)
512
+ zip_path = Path(zip_path)
513
+ zip_path.parent.mkdir(parents=True, exist_ok=True)
514
+
515
+ base_dir = source_dir.parent
516
+ with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
517
+ for path in source_dir.rglob("*"):
518
+ if path.is_dir():
519
+ continue
520
+ archive.write(path, path.relative_to(base_dir).as_posix())
521
+
522
+
523
def _build_job_update(config_data):
    """Pick the fields from ``config_data`` that may be sent as a ``JobUpdate``.

    Only keys on the local allowlist below are copied, and any key listed in
    ``DEPRECATED_JOB_FIELDS`` is left out.
    """
    candidate_fields = (
        "Description",
        "LogUri",
        "Role",
        "ExecutionProperty",
        "Command",
        "DefaultArguments",
        "NonOverridableArguments",
        "Connections",
        "MaxRetries",
        "Timeout",
        "MaxCapacity",
        "WorkerType",
        "NumberOfWorkers",
        "SecurityConfiguration",
        "NotificationProperty",
        "GlueVersion",
        "CodeGenConfigurationNodes",
        "ExecutionClass",
        "JobMode",
        "JobRunQueuingEnabled",
    )
    update = {}
    for field in candidate_fields:
        if field in DEPRECATED_JOB_FIELDS:
            continue
        if field in config_data:
            update[field] = config_data[field]
    return update
552
+
553
+
554
def _get_job_update_allowed_fields(profile_name=None):
    """Return the updateable job fields from ``get_updateable_job_fields(prefer_live=True)``.

    ``profile_name`` is accepted but not used by this function.
    """
    _ = profile_name  # intentionally unused
    return get_updateable_job_fields(prefer_live=True)
557
+
558
+
559
+ def _filter_job_update_fields(job_update, allowed_fields):
560
+ if not allowed_fields:
561
+ return job_update, []
562
+ filtered = {
563
+ key: value for key, value in job_update.items() if key in allowed_fields
564
+ }
565
+ dropped = [key for key in job_update if key not in allowed_fields]
566
+ return filtered, dropped
567
+
568
+
569
def update_glue_job_from_config(
    config_data, dry_run=False, profile_name=None, auto_login=True
):
    """
    Push the updatable fields of a local job config to AWS Glue.

    The local allowlist is applied first, then the result is filtered by the
    fields reported by ``_get_job_update_allowed_fields``; anything dropped is
    printed. With ``dry_run`` the fields that would be sent are printed and
    AWS is not contacted by this function.

    Returns:
        ``True`` when an update was sent (or would be, in a dry run);
        ``False`` when the config has no ``Name`` or no updatable fields.

    TODO: Consider fetching the current Glue job definition and merging with
    local config values before update, so unchanged fields stay in sync.
    """
    name = config_data.get("Name")
    if not name:
        print("Missing Name in config data, skipping Glue job update.")
        return False

    stale_fields = sorted(
        field for field in DEPRECATED_JOB_FIELDS if field in config_data
    )
    if stale_fields:
        print(f"Skipping deprecated fields for {name}: {', '.join(stale_fields)}")

    candidate_update = _build_job_update(config_data)
    allowed_fields = _get_job_update_allowed_fields(profile_name=profile_name)
    job_update, dropped = _filter_job_update_fields(candidate_update, allowed_fields)
    if dropped:
        print(
            f"Dropping unsupported JobUpdate fields for {name}: {', '.join(sorted(dropped))}"
        )
    if dry_run and not allowed_fields:
        print("Model-based validation unavailable; using local allowlist only.")
    if not job_update:
        print(f"No updatable fields found for {name}, skipping Glue job update.")
        return False

    if dry_run:
        field_names = ", ".join(sorted(job_update))
        print(f"Would update Glue job {name} with: {field_names}")
        return True

    glue_client = ensure_aws_login(
        profile_name=profile_name, auto_login=auto_login
    ).client("glue")
    glue_client.update_job(JobName=name, JobUpdate=job_update)
    print(f"Updated Glue job configuration: {name}")
    return True
613
+
614
+
615
+ def _job_supports_notebook_artifacts(config_data):
616
+ job_mode = str(config_data.get("JobMode") or "").strip().upper()
617
+ if not job_mode:
618
+ return True
619
+ return job_mode == "NOTEBOOK"
620
+
621
+
622
def resolve_glue_artifact_mappings(config_data, include_components=None):
    """Resolve local-to-S3 artifact mappings without contacting AWS.

    Args:
        config_data: Job config dict (optionally wrapped in ``{"Job": ...}``).
        include_components: Component names to resolve. A falsy value selects
            ``script``, ``notebook``, ``additional-python-modules`` and
            ``extra-files``.

    Returns:
        A dict with:

        - ``mappings``: de-duplicated dicts carrying ``artifact_type``,
          ``local_path``, ``s3_path`` and ``source``.
        - ``warnings``: every problem found, dependency problems included.
        - ``dependency_warnings``: only the dependency-related problems.
        - ``script_path`` / ``script_s3_path``: local and remote script
          locations.
        - ``notebook_enabled``, ``notebook_path``, ``notebook_s3_path``:
          notebook handling state; the paths stay ``None`` when not resolved.
    """
    config_data = normalize_glue_config_data(config_data)
    default_components = {
        "script",
        "notebook",
        "additional-python-modules",
        "extra-files",
    }
    include_components = set(include_components or default_components)
    warnings = []
    dependency_warnings = []

    command = config_data.get("Command", {})
    if not isinstance(command, dict):
        warnings.append("Command must be an object.")
        command = {}

    sc = config_data.get("SourceControlDetails", {})
    if not isinstance(sc, dict):
        warnings.append("SourceControlDetails must be an object.")
        sc = {}

    name = config_data.get("Name") or "unknown"
    script_path = Path(
        sc.get("ScriptLocation")
        or sc.get("LocalPath")
        or f"glue/scripts/{slugify(name)}.py"
    )
    script_s3_path = command.get("ScriptLocation")
    has_s3_script = isinstance(script_s3_path, str) and script_s3_path.startswith(
        "s3://"
    )
    notebook_enabled = (
        "notebook" in include_components
        and _job_supports_notebook_artifacts(config_data)
    )
    mappings = []

    # --- script -----------------------------------------------------------
    if "script" in include_components:
        if has_s3_script:
            mappings.append(
                {
                    "artifact_type": "script",
                    "local_path": script_path.as_posix(),
                    "s3_path": script_s3_path,
                    "source": "Command.ScriptLocation",
                }
            )
        else:
            warnings.append("Command.ScriptLocation must be a valid S3 path.")

    # --- notebook ---------------------------------------------------------
    notebook_path = None
    notebook_s3_path = None
    if notebook_enabled:
        notebook_value = sc.get("NotebookPath") or sc.get("NotebookLocation")
        if notebook_value:
            notebook_path = Path(notebook_value)
        else:
            notebook_path = Path(_resolve_notebook_path(script_path))

        if has_s3_script:
            try:
                bucket_name, script_key = parse_s3_bucket_and_prefix(script_s3_path)
            except ValueError:
                warnings.append(
                    f"Cannot derive notebook S3 path from Command.ScriptLocation: {script_s3_path}"
                )
            else:
                notebook_key = _script_location_to_notebook_key(script_key)
                notebook_s3_path = f"s3://{bucket_name}/{notebook_key}"
                mappings.append(
                    {
                        "artifact_type": "notebook",
                        "local_path": notebook_path.as_posix(),
                        "s3_path": notebook_s3_path,
                        "source": "Command.ScriptLocation",
                    }
                )

    # --- additional files and dependencies --------------------------------
    dependency_components = {"additional-python-modules", "extra-py-files"}
    wants_dependencies = bool(include_components.intersection(dependency_components))

    include_sources = set()
    if wants_dependencies:
        include_sources.update({"AdditionalPythonFiles", "ExtraPyFiles"})
    if "extra-files" in include_components:
        include_sources.update({"AdditionalFiles", "ExtraFiles"})

    additional_mappings = []
    if include_sources:
        declared, additional_errors = _collect_additional_file_mappings(sc)
        dependency_warnings.extend(additional_errors)
        warnings.extend(additional_errors)
        additional_mappings = [
            entry for entry in declared if entry["source"] in include_sources
        ]

    if wants_dependencies:
        configured_mappings, configured_errors = (
            _collect_configured_dependency_mappings(config_data, additional_mappings)
        )
        additional_mappings.extend(configured_mappings)
        dependency_warnings.extend(configured_errors)
        warnings.extend(configured_errors)

    artifact_types = {
        "AdditionalPythonFiles": "additional-python-file",
        "ExtraPyFiles": "extra-py-file",
        "AdditionalFiles": "extra-file",
        "ExtraFiles": "extra-file",
        "DefaultArguments.--additional-python-modules": "additional-python-module",
        "DefaultArguments.--extra-py-files": "extra-py-file",
    }
    mappings.extend(
        {
            **entry,
            "artifact_type": artifact_types.get(entry["source"], "dependency"),
        }
        for entry in additional_mappings
    )

    # Keep the first mapping seen for each (local, s3) pair.
    unique_mappings = []
    seen_pairs = set()
    for entry in mappings:
        pair = (entry["local_path"], entry["s3_path"])
        if pair not in seen_pairs:
            seen_pairs.add(pair)
            unique_mappings.append(entry)

    return {
        "mappings": unique_mappings,
        "warnings": warnings,
        "dependency_warnings": dependency_warnings,
        "script_path": script_path,
        "script_s3_path": script_s3_path,
        "notebook_enabled": notebook_enabled,
        "notebook_path": notebook_path,
        "notebook_s3_path": notebook_s3_path,
    }
765
+
766
+
767
+ def upload_glue_job_files_from_config(
768
+ config,
769
+ dry_run=False,
770
+ update_job_config=True,
771
+ include_components=None,
772
+ profile_name=None,
773
+ auto_login=True,
774
+ ):
775
+ config_data, config_path = _load_config_data(config)
776
+ include_components = set(
777
+ include_components
778
+ or {
779
+ "script",
780
+ "notebook",
781
+ "additional-python-modules",
782
+ "extra-files",
783
+ "job-config",
784
+ }
785
+ )
786
+ artifact_resolution = resolve_glue_artifact_mappings(
787
+ config_data,
788
+ include_components=include_components,
789
+ )
790
+ notebook_enabled = artifact_resolution["notebook_enabled"]
791
+ script_location = artifact_resolution["script_s3_path"]
792
+ needs_script_location = "script" in include_components or notebook_enabled
793
+ if needs_script_location and not script_location:
794
+ print(f"Missing Command.ScriptLocation in {config_path or 'config data'}")
795
+ return False
796
+
797
+ if script_location:
798
+ bucket_name, script_key = parse_s3_bucket_and_prefix(script_location)
799
+ notebook_key = _script_location_to_notebook_key(script_key)
800
+ else:
801
+ bucket_name = None
802
+ script_key = None
803
+ notebook_key = None
804
+ script_path = artifact_resolution["script_path"]
805
+ notebook_path = artifact_resolution["notebook_path"]
806
+
807
+ script_available = script_path.exists()
808
+ notebook_available = bool(notebook_path and Path(notebook_path).exists())
809
+
810
+ if "script" in include_components and notebook_path:
811
+ script_available = (
812
+ _ensure_local_script_from_notebook(
813
+ notebook_path,
814
+ script_path,
815
+ dry_run=dry_run,
816
+ )
817
+ or script_available
818
+ )
819
+ if notebook_enabled and notebook_path:
820
+ notebook_available = (
821
+ _ensure_local_notebook_from_script(
822
+ script_path,
823
+ notebook_path,
824
+ config_data=config_data,
825
+ include_magics=True,
826
+ dry_run=dry_run,
827
+ )
828
+ or notebook_available
829
+ )
830
+
831
+ if "script" in include_components and not script_available:
832
+ print(f"Local script not found: {script_path}")
833
+ return False
834
+
835
+ additional_mappings = [
836
+ mapping
837
+ for mapping in artifact_resolution["mappings"]
838
+ if mapping["artifact_type"] not in {"script", "notebook"}
839
+ ]
840
+ additional_errors = artifact_resolution["dependency_warnings"]
841
+
842
+ if dry_run:
843
+ if "script" in include_components:
844
+ print(
845
+ f"Would upload script: {script_path} -> s3://{bucket_name}/{script_key}"
846
+ )
847
+ if notebook_enabled:
848
+ if notebook_available:
849
+ print(
850
+ f"Would upload notebook: {notebook_path} -> s3://{bucket_name}/{notebook_key}"
851
+ )
852
+ else:
853
+ print(f"Notebook not found locally: {notebook_path}")
854
+ for error in additional_errors:
855
+ print(f"Additional files error: {error}")
856
+ for mapping in additional_mappings:
857
+ local = Path(mapping["local_path"])
858
+ s3_path = mapping["s3_path"]
859
+ if not local.exists():
860
+ print(f"Additional file not found locally: {local}")
861
+ continue
862
+ if not s3_path.startswith("s3://"):
863
+ print(f"Additional file has invalid S3 path: {s3_path}")
864
+ continue
865
+ if local.is_dir():
866
+ if not s3_path.endswith(".zip"):
867
+ print(f"Additional directory must target a .zip S3 path: {s3_path}")
868
+ continue
869
+ print(f"Would zip directory: {local} -> {s3_path}")
870
+ continue
871
+ bucket, key = parse_s3_bucket_and_prefix(s3_path)
872
+ print(f"Would upload additional file: {local} -> s3://{bucket}/{key}")
873
+ if update_job_config and "job-config" in include_components:
874
+ update_glue_job_from_config(
875
+ config_data,
876
+ dry_run=True,
877
+ profile_name=profile_name,
878
+ auto_login=auto_login,
879
+ )
880
+ return True
881
+
882
+ ok = True
883
+ file_components = include_components.intersection(
884
+ {
885
+ "script",
886
+ "notebook",
887
+ "additional-python-modules",
888
+ "extra-py-files",
889
+ "extra-files",
890
+ }
891
+ )
892
+ needs_aws = bool(
893
+ file_components or (update_job_config and "job-config" in include_components)
894
+ )
895
+ session = (
896
+ ensure_aws_login(profile_name=profile_name, auto_login=auto_login)
897
+ if needs_aws
898
+ else None
899
+ )
900
+ s3 = session.client("s3") if file_components and session else None
901
+
902
+ if "script" in include_components:
903
+ s3.upload_file(str(script_path), bucket_name, script_key)
904
+ print(f"Uploaded script: {script_path} -> s3://{bucket_name}/{script_key}")
905
+
906
+ if notebook_enabled:
907
+ if notebook_available:
908
+ s3.upload_file(str(notebook_path), bucket_name, notebook_key)
909
+ print(
910
+ f"Uploaded notebook: {notebook_path} -> s3://{bucket_name}/{notebook_key}"
911
+ )
912
+ else:
913
+ print(f"Notebook not found locally: {notebook_path}")
914
+
915
+ for error in additional_errors:
916
+ print(f"Additional files error: {error}")
917
+ ok = False
918
+
919
+ for mapping in additional_mappings:
920
+ local = Path(mapping["local_path"])
921
+ s3_path = mapping["s3_path"]
922
+ if not local.exists():
923
+ print(f"Additional file not found locally: {local}")
924
+ ok = False
925
+ continue
926
+ if not s3_path.startswith("s3://"):
927
+ print(f"Additional file has invalid S3 path: {s3_path}")
928
+ ok = False
929
+ continue
930
+ if local.is_dir():
931
+ if not s3_path.endswith(".zip"):
932
+ print(f"Additional directory must target a .zip S3 path: {s3_path}")
933
+ ok = False
934
+ continue
935
+ bucket, key = parse_s3_bucket_and_prefix(s3_path)
936
+ with tempfile.TemporaryDirectory() as temp_dir:
937
+ zip_path = Path(temp_dir) / f"{local.name}.zip"
938
+ _zip_directory(local, zip_path)
939
+ s3.upload_file(str(zip_path), bucket, key)
940
+ print(f"Uploaded additional directory: {local} -> s3://{bucket}/{key}")
941
+ continue
942
+ bucket, key = parse_s3_bucket_and_prefix(s3_path)
943
+ s3.upload_file(str(local), bucket, key)
944
+ print(f"Uploaded additional file: {local} -> s3://{bucket}/{key}")
945
+
946
+ if update_job_config and "job-config" in include_components:
947
+ updated = update_glue_job_from_config(
948
+ config_data,
949
+ dry_run=False,
950
+ profile_name=profile_name,
951
+ auto_login=auto_login,
952
+ )
953
+ ok = ok and updated
954
+
955
+ return ok
956
+
957
+
958
def clear_notebook_outputs(notebook):
    """
    Clear the outputs of every code cell in a Jupyter notebook.

    Args:
        notebook (str, Path or dict): The notebook to process. One of:
            - A path (``str`` or ``Path``) to an existing notebook file.
            - A JSON string holding the notebook content.
            - A dictionary holding the notebook content.

    Raises:
        ValueError: If a string/path argument is neither an existing path
            nor a valid JSON string.

    Returns:
        dict: The parsed notebook with ``outputs`` of each code cell reset to
        an empty list. A dictionary argument is modified in place and the
        same object is returned.

    Example:
        cleaned = clear_notebook_outputs("path/to/notebook.ipynb")
    """
    error_message = (
        "The provided string is neither a valid path nor a valid JSON string."
    )

    if isinstance(notebook, (str, Path)):
        try:
            path_exists = Path(notebook).exists()
        except (OSError, ValueError):
            # A JSON document is usually far too long (or contains characters
            # that are illegal) for a file name; some Python versions surface
            # that as an error instead of returning False.
            path_exists = False

        if path_exists:
            raw_text = Path(notebook).read_text(encoding="utf-8")
        elif isinstance(notebook, Path):
            # A Path object can only ever mean "file"; json.loads would
            # reject it with a TypeError, so report it consistently.
            raise ValueError(error_message)
        else:
            raw_text = notebook

        try:
            notebook = json.loads(raw_text)
        except json.JSONDecodeError as err:
            raise ValueError(error_message) from err

    for cell in notebook.get("cells", []):
        if cell.get("cell_type") == "code":
            cell["outputs"] = []

    return notebook
996
+
997
+
998
def _new_code_cell(source_lines):
    """Build a new, output-free notebook code cell around *source_lines*.

    The cell gets a random UUID as its ``id`` and the metadata block used for
    Glue-session cells. ``source_lines`` is stored as-is (not copied).
    """
    cell_metadata = {
        "tags": [],
        "trusted": True,
        "vscode": {"languageId": "python_glue_session"},
    }
    # Keys are inserted in the same order they should appear when serialized.
    cell = {"cell_type": "code", "id": str(uuid.uuid4())}
    cell["metadata"] = cell_metadata
    cell["source"] = source_lines
    cell["execution_count"] = None
    cell["outputs"] = []
    return cell
1011
+
1012
+
1013
def _new_markdown_cell(source_lines):
    """Build a new notebook markdown cell around *source_lines*.

    The cell gets a random UUID as its ``id`` and empty metadata.
    ``source_lines`` is stored as-is (not copied).
    """
    cell = {"cell_type": "markdown"}
    cell["id"] = str(uuid.uuid4())
    cell["metadata"] = {}
    cell["source"] = source_lines
    return cell
1020
+
1021
+
1022
def _parse_percent_cell_marker(raw_line):
    """Parse a ``# %%`` cell-marker line.

    Returns ``None`` when *raw_line* is not a marker. Otherwise returns a dict
    with ``marker_text`` (whatever follows ``%%``, stripped) and
    ``is_markdown`` (True when that text contains "markdown",
    case-insensitively).
    """
    found = re.match(r"^\s*#\s*%%(.*)$", raw_line)
    if found is None:
        return None

    text_after_marker = found.group(1).strip()
    mentions_markdown = "markdown" in text_after_marker.casefold()
    return dict(marker_text=text_after_marker, is_markdown=mentions_markdown)
1032
+
1033
+
1034
def _canonical_code_cell_marker():
    """Return the normalized marker line that opens a code cell in a script."""
    code_cell_marker = "#%%\n"
    return code_cell_marker
1036
+
1037
+
1038
def _code_cell_has_payload(source_lines):
    """Report whether a code cell holds anything besides markers and blank lines.

    Returns True as soon as one line is non-blank and is not a ``# %%`` cell
    marker; returns False otherwise (including for an empty list).
    """
    return any(
        bool(candidate.strip()) and not _parse_percent_cell_marker(candidate)
        for candidate in source_lines
    )
1045
+
1046
+
1047
def _code_cell_starts_with_percent_marker(source_lines):
    """Report whether the first non-blank line of a cell is a ``# %%`` marker.

    Returns False when the cell has no non-blank line at all.
    """
    first_meaningful = next(
        (entry for entry in source_lines if entry.strip()),
        None,
    )
    if first_meaningful is None:
        return False
    return _parse_percent_cell_marker(first_meaningful) is not None
1053
+
1054
+
1055
def _ensure_local_script_from_notebook(notebook_path, script_path, *, dry_run=False):
    """Generate the local script from the notebook when only the notebook exists.

    Returns False (doing nothing) when the script already exists or the
    notebook is missing. Otherwise returns True after either announcing the
    conversion (``dry_run``) or performing it.
    """
    notebook_path = Path(notebook_path)
    script_path = Path(script_path)

    script_missing = not script_path.exists()
    if not (script_missing and notebook_path.exists()):
        return False

    if not dry_run:
        convert_notebook_to_script(notebook_path, script_path)
        print(f"Generated script from notebook: {notebook_path} -> {script_path}")
    else:
        print(f"Would generate script from notebook: {notebook_path} -> {script_path}")
    return True
1066
+
1067
+
1068
def _ensure_local_notebook_from_script(
    script_path,
    notebook_path,
    *,
    config_data=None,
    include_magics=True,
    dry_run=False,
):
    """Generate the local notebook from the script when only the script exists.

    Returns False (doing nothing) when the notebook already exists or the
    script is missing. Otherwise returns True after either announcing the
    conversion (``dry_run``) or performing it; ``config_data`` and
    ``include_magics`` are forwarded to ``convert_script_to_notebook``.
    """
    script_path = Path(script_path)
    notebook_path = Path(notebook_path)

    notebook_missing = not notebook_path.exists()
    if not (notebook_missing and script_path.exists()):
        return False

    if not dry_run:
        convert_script_to_notebook(
            script_path,
            notebook_path,
            config_data=config_data,
            include_magics=include_magics,
        )
        print(f"Generated notebook from script: {script_path} -> {notebook_path}")
    else:
        print(f"Would generate notebook from script: {script_path} -> {notebook_path}")
    return True
1091
+
1092
+
1093
def _script_to_cells(script_text):
    """Split a percent-format script into a list of notebook cell dicts.

    Lines are accumulated into a pending buffer that starts out as a code
    cell. Each ``# %%`` marker closes the buffer and opens a new one whose
    type follows the marker. Code buffers without payload and markdown
    buffers that are entirely blank never become cells. Code cells always
    begin with the canonical marker line; markdown marker lines are dropped.
    """
    notebook_cells = []

    def emit(kind, body):
        # Turn a finished buffer into a cell unless it holds nothing useful.
        if not body:
            return
        if kind == "code":
            if _code_cell_has_payload(body):
                notebook_cells.append(_new_code_cell(body))
            return
        if any(text.strip() for text in body):
            notebook_cells.append(_new_markdown_cell(body))

    pending = []
    pending_kind = "code"

    for raw_line in script_text.splitlines(keepends=True):
        marker = _parse_percent_cell_marker(raw_line)
        if not marker:
            pending.append(raw_line)
            continue

        opens_markdown = marker["is_markdown"]
        payload_free_code = pending_kind == "code" and not _code_cell_has_payload(
            pending
        )
        if not payload_free_code:
            emit(pending_kind, pending)
            pending = []
        elif opens_markdown:
            # Nothing worth keeping before a markdown cell starts.
            pending = []
        else:
            # Consecutive code markers: keep only the marker lines seen so far.
            pending = [kept for kept in pending if _parse_percent_cell_marker(kept)]

        if opens_markdown:
            pending_kind = "markdown"
        else:
            pending_kind = "code"
            pending.append(_canonical_code_cell_marker())

    emit(pending_kind, pending)
    return notebook_cells
1136
+
1137
+
1138
def _strip_generated_script_config_cells(script_text):
    """Remove generated Glue magic/config content from a percent-format script.

    The script is split into cells at ``# %%`` marker lines (text before the
    first marker forms its own cell). A cell containing a Glue magic line is
    reduced to everything from its first real code line onward, or dropped
    entirely when it has no such line. All other cells are kept untouched.
    Returns the remaining text.
    """

    def looks_like_glue_magic(text):
        # Marker lines themselves never count as magics.
        if _parse_percent_cell_marker(text):
            return False
        body = text.lstrip()
        if body.startswith("#"):
            body = body[1:].lstrip()
        if body.startswith("%"):
            return True
        return "%%configure" in body or "AWS Glue configs" in body

    def is_real_code(text):
        # Real code: not blank, not a cell marker, not a comment.
        bare = text.strip()
        if not bare or _parse_percent_cell_marker(text):
            return False
        return not bare.startswith("#")

    script_lines = script_text.splitlines(keepends=True)
    cell_starts = [
        pos
        for pos, text in enumerate(script_lines)
        if _parse_percent_cell_marker(text)
    ]
    if not cell_starts:
        chunks = [script_lines]
    else:
        if cell_starts[0] != 0:
            cell_starts.insert(0, 0)
        cell_ends = cell_starts[1:] + [len(script_lines)]
        chunks = [script_lines[begin:end] for begin, end in zip(cell_starts, cell_ends)]

    surviving = []
    for chunk in chunks:
        if not any(looks_like_glue_magic(text) for text in chunk):
            surviving.extend(chunk)
            continue
        code_start = next(
            (pos for pos, text in enumerate(chunk) if is_real_code(text)),
            None,
        )
        if code_start is not None:
            surviving.extend(chunk[code_start:])

    return "".join(surviving)
1184
+
1185
+
1186
def _build_magic_cells(config_data):
    """Build the notebook cells that precede the script body.

    Returns an empty list when *config_data* is falsy. Otherwise returns an
    "AWS Glue configs" markdown heading, one code cell per source produced by
    ``build_magic_cell_sources(config_data)``, and a closing
    "AWS Glue Script" markdown heading.
    """
    if not config_data:
        return []

    heading = _new_markdown_cell(["# AWS Glue configs\n"])
    magic_cells = [
        _new_code_cell(cell_source)
        for cell_source in build_magic_cell_sources(config_data)
    ]
    script_heading = _new_markdown_cell(["# AWS Glue Script\n"])
    return [heading, *magic_cells, script_heading]
1199
+
1200
+
1201
def _cell_source_to_lines(source):
    """Normalize a notebook cell ``source`` value into a list of lines.

    ``None`` yields an empty list. A string is split on line boundaries with
    the line endings kept. Any other iterable is treated as a sequence of
    strings, each of which gets a trailing newline if it lacks one.
    """
    if source is None:
        return []
    if isinstance(source, str):
        return source.splitlines(keepends=True)
    return [piece if piece.endswith("\n") else f"{piece}\n" for piece in source]
1213
+
1214
+
1215
def _format_markdown_lines(source_lines):
    """Render markdown lines as Python comment lines.

    Each line loses its trailing newline(s); non-blank content becomes
    ``# <content>`` and blank content becomes a bare ``#``. Every returned
    line ends with a newline.
    """

    def as_comment(text):
        content = text.rstrip("\n")
        return f"# {content}\n" if content.strip() else "#\n"

    return [as_comment(text) for text in source_lines]
1224
+
1225
+
1226
def _is_magic_cell(source_lines):
    """Report whether a cell contains a line magic.

    Returns True when some line, ignoring leading whitespace, starts with
    ``%``; returns False otherwise (including for an empty cell).
    """
    return any(entry.lstrip().startswith("%") for entry in source_lines)
1234
+
1235
+
1236
def _default_notebook_metadata():
    """Return a fresh copy of the default notebook-level metadata.

    The metadata names the ``glue_pyspark`` kernel and describes the language
    as a Python Glue session.
    """
    kernel = {
        "name": "glue_pyspark",
        "display_name": "Glue PySpark",
        "language": "python",
    }
    language = {
        "name": "Python_Glue_Session",
        "mimetype": "text/x-python",
        "codemirror_mode": {"name": "python", "version": 3},
        "pygments_lexer": "python3",
        "file_extension": ".py",
    }
    return {"kernelspec": kernel, "language_info": language}
1251
+
1252
+
1253
def convert_notebook_to_script(
    notebook_path,
    script_path,
):
    """Convert a Jupyter notebook into a percent-format Python script.

    Markdown cells, and code cells that contain line magics, are written as
    ``#%% [markdown]`` sections with every line turned into a comment. Other
    code cells are written verbatim, preceded by the canonical ``#%%`` marker
    unless they already start with one. Cells of any other type are skipped.
    Each emitted cell is followed by a blank line.

    Args:
        notebook_path: Path of the notebook to read (UTF-8 JSON).
        script_path: Path of the script to write; parent directories are
            created as needed.

    Returns:
        Path: ``script_path`` as a ``Path`` object.
    """
    notebook_path = Path(notebook_path)
    script_path = Path(script_path)

    # Notebooks are UTF-8 JSON; do not depend on the platform default
    # encoding, which breaks non-ASCII content on some systems.
    notebook = json.loads(notebook_path.read_text(encoding="utf-8"))
    lines: list[str] = []

    for cell in notebook.get("cells", []):
        cell_type = cell.get("cell_type")
        source_lines = _cell_source_to_lines(cell.get("source"))
        if cell_type == "markdown":
            lines.append("#%% [markdown]\n")
            lines.extend(_format_markdown_lines(source_lines))
        elif cell_type == "code":
            if _is_magic_cell(source_lines):
                # Magics are not valid Python, so keep them as comments.
                lines.append("#%% [markdown]\n")
                lines.extend(_format_markdown_lines(source_lines))
            else:
                if not _code_cell_starts_with_percent_marker(source_lines):
                    lines.append(_canonical_code_cell_marker())
                lines.extend(source_lines)
        else:
            continue
        # Terminate the cell's last line, then separate cells with a blank line.
        if lines and not lines[-1].endswith("\n"):
            lines[-1] = f"{lines[-1]}\n"
        lines.append("\n")

    script_path.parent.mkdir(parents=True, exist_ok=True)
    script_path.write_text("".join(lines), encoding="utf-8")
    return script_path
1286
+
1287
+
1288
def convert_script_to_notebook(
    script_path,
    notebook_path,
    config_data=None,
    include_magics=True,
):
    """Convert a percent-format Python script into a Jupyter notebook.

    ``job.commit()`` lines and previously generated Glue config cells are
    removed from the script text before it is split into cells. When
    ``include_magics`` is true and ``config_data`` is truthy, magic cells
    built from ``config_data`` are placed in front of the script cells. If a
    notebook already exists at ``notebook_path`` its metadata and nbformat
    version are reused; otherwise defaults are applied.

    Args:
        script_path: Path of the script to read (UTF-8).
        notebook_path: Path of the notebook to write; parent directories are
            created as needed and an existing file is overwritten.
        config_data: Optional job configuration used to build magic cells.
        include_magics: Whether to prepend the magic cells.

    Returns:
        Path: ``notebook_path`` as a ``Path`` object.
    """
    script_path = Path(script_path)
    notebook_path = Path(notebook_path)

    # Read and write explicitly as UTF-8 so results do not depend on the
    # platform default encoding.
    script_text = script_path.read_text(encoding="utf-8")
    script_text = re.sub(r"(?m)^\s*job\.commit\(\)\s*(?:#.*)?$\n?", "", script_text)
    script_text = _strip_generated_script_config_cells(script_text)
    cells = _script_to_cells(script_text)
    if include_magics and config_data:
        cells = _build_magic_cells(config_data) + cells

    if notebook_path.exists():
        # Reuse the existing notebook as a template for its metadata.
        template = json.loads(notebook_path.read_text(encoding="utf-8"))
        metadata = template.get("metadata") or _default_notebook_metadata()
        nbformat = template.get("nbformat", 4)
        nbformat_minor = template.get("nbformat_minor", 5)
    else:
        metadata = _default_notebook_metadata()
        nbformat = 4
        nbformat_minor = 5

    notebook = {
        "metadata": metadata,
        "nbformat": nbformat,
        "nbformat_minor": nbformat_minor,
        "cells": cells,
    }

    notebook_path.parent.mkdir(parents=True, exist_ok=True)
    notebook_path.write_text(json.dumps(notebook, indent=4), encoding="utf-8")
    return notebook_path