gluekit 1.0.1.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. gluekit/__init__.py +7 -0
  2. gluekit/app.py +0 -0
  3. gluekit/cli.py +64 -0
  4. gluekit/commands/__init__.py +1 -0
  5. gluekit/commands/add.py +455 -0
  6. gluekit/commands/build.py +816 -0
  7. gluekit/commands/checkout.py +114 -0
  8. gluekit/commands/clone.py +516 -0
  9. gluekit/commands/config_commands.py +180 -0
  10. gluekit/commands/constants.py +47 -0
  11. gluekit/commands/convert.py +336 -0
  12. gluekit/commands/edit.py +1104 -0
  13. gluekit/commands/helpers.py +1068 -0
  14. gluekit/commands/init.py +798 -0
  15. gluekit/commands/list.py +16 -0
  16. gluekit/commands/local_commands.py +680 -0
  17. gluekit/commands/pull.py +374 -0
  18. gluekit/commands/push.py +251 -0
  19. gluekit/commands/remove.py +161 -0
  20. gluekit/commands/run.py +126 -0
  21. gluekit/commands/status.py +97 -0
  22. gluekit/commands/sync.py +97 -0
  23. gluekit/commands/update.py +104 -0
  24. gluekit/job_mgmt/__init__.py +0 -0
  25. gluekit/job_mgmt/glue_jobs.py +1323 -0
  26. gluekit/job_mgmt/magics.py +122 -0
  27. gluekit/job_mgmt/resources/__init__.py +0 -0
  28. gluekit/job_mgmt/resources/glue_job_schema.json +40341 -0
  29. gluekit/job_mgmt/resources/magic_map.json +83 -0
  30. gluekit/job_mgmt/schema.py +165 -0
  31. gluekit/local/__init__.py +6 -0
  32. gluekit/local/awsglue/__init__.py +1 -0
  33. gluekit/local/awsglue/context.py +30 -0
  34. gluekit/local/awsglue/job.py +9 -0
  35. gluekit/local/awsglue/utils.py +17 -0
  36. gluekit/local/local.py +434 -0
  37. gluekit/local/local_fixtures.py +337 -0
  38. gluekit/local/pyspark/__init__.py +7 -0
  39. gluekit/local/pyspark/context.py +31 -0
  40. gluekit/local/pyspark/sql/__init__.py +6 -0
  41. gluekit/local/pyspark/sql/session.py +29 -0
  42. gluekit-1.0.1.dev1.dist-info/METADATA +1176 -0
  43. gluekit-1.0.1.dev1.dist-info/RECORD +46 -0
  44. gluekit-1.0.1.dev1.dist-info/WHEEL +5 -0
  45. gluekit-1.0.1.dev1.dist-info/entry_points.txt +2 -0
  46. gluekit-1.0.1.dev1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,180 @@
1
+ from __future__ import annotations
2
+
3
+ import csv
4
+ import copy
5
+ import json
6
+ import re
7
+ import shutil
8
+ import subprocess
9
+ import tarfile
10
+ import uuid
11
+ import zipfile
12
+ from collections.abc import Mapping
13
+ from datetime import datetime, timezone
14
+ from email.parser import Parser
15
+ from fnmatch import fnmatch
16
+ from pathlib import Path
17
+ from tempfile import TemporaryDirectory
18
+ from typing import Any, Optional
19
+
20
+ import typer
21
+ from slugify import slugify
22
+
23
+ from ..job_mgmt.glue_jobs import (
24
+ download_glue_job_files,
25
+ list_glue_jobs,
26
+ normalize_glue_config_data,
27
+ convert_script_to_notebook,
28
+ convert_notebook_to_script,
29
+ _resolve_notebook_path,
30
+ upload_glue_job_files_from_config,
31
+ )
32
+ from ..job_mgmt.magics import build_magic_cell_sources as _build_magic_cell_sources
33
+
34
+ from .constants import *
35
+ from .helpers import *
36
+ from ..cli import app, glue_config_app
37
+
38
+
39
def _parse_set_args(args: list[str]) -> dict[str, Any]:
    """Turn raw ``--key value`` / ``--key=value`` tokens into a mapping.

    Each value is passed through ``_coerce_set_value`` before being stored.
    When the same key appears more than once, the last occurrence wins.

    Args:
        args: Extra command-line tokens, in the order they were given.

    Returns:
        Mapping of parameter name (without the leading ``--``) to its value.

    Raises:
        typer.BadParameter: On a token that does not start with ``--``, an
            empty parameter name, a key with no value, or no pairs at all.
    """
    collected: dict[str, Any] = {}
    remaining = iter(args)

    for token in remaining:
        if not token.startswith("--"):
            raise typer.BadParameter(
                f"Invalid token '{token}'. Use --key value or --key=value."
            )

        body = token[2:]
        if not body:
            raise typer.BadParameter("Invalid empty parameter name.")

        # Inline form: everything after the first "=" is the value.
        name, separator, inline_value = body.partition("=")
        if separator:
            if not name:
                raise typer.BadParameter("Invalid empty parameter name.")
            collected[name] = _coerce_set_value(inline_value)
            continue

        # Spaced form: the value is the next token, which must not itself
        # look like another option.
        following = next(remaining, None)
        if following is None or following.startswith("--"):
            raise typer.BadParameter(f"Missing value for --{name}.")
        collected[name] = _coerce_set_value(following)

    if not collected:
        raise typer.BadParameter("Provide one or more --key value pairs.")
    return collected
73
+
74
+
75
@app.command(
    "set",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
    epilog=_examples_epilog(
        "gluekit set my-job --script_location glue/scripts/my-job.py --extra_files s3://my-bucket/shared/config.json",
        "gluekit pull my-job --profile my-sso-profile",
        'gluekit set --global --additional_python_modules "pydantic==2.11.7"',
        'gluekit push "my-job-\\*" --profile my-sso-profile',
        "gluekit config set my-job --default_arguments.--TempDir s3://my-bucket/tmp/",
        "gluekit config get my-job",
        "gluekit config get",
        "gluekit set my-job --extra_py_files s3://my-bucket/dist/gluekit-0.0.1-py3-none-any.whl",
        "gluekit push my-job --build --build-tool auto --profile my-sso-profile",
    ),
)
def glue_set(
    ctx: typer.Context,
    job_name: Optional[str] = typer.Argument(
        None,
        help="Glue job name to save params for.",
    ),
    global_scope: bool = typer.Option(
        False,
        "--global",
        help="Save params as global defaults for all jobs.",
    ),
    profile: Optional[str] = typer.Option(
        None,
        "--profile",
        "-p",
        help=(
            "Save params under this gluekit profile scope, usually matching an "
            "AWS CLI profile name; does not contact AWS."
        ),
    ),
) -> None:
    """Set reusable local key/value parameters for a Glue job."""
    # The command accepts unknown options (see context_settings), so the
    # free-form ``--key value`` pairs arrive untouched in ``ctx.args``.
    overrides = _parse_set_args(list(ctx.args))

    # ``_set_saved_scope`` returns the label of the scope it handled; that
    # label is only used here for the confirmation message.
    scope_label = _set_saved_scope(overrides, job_name, global_scope, profile=profile)

    saved_count = len(overrides)
    typer.echo(
        f"Saved {saved_count} parameter(s) for '{scope_label}' in {GLUE_SET_FILE}."
    )
115
+
116
+
117
@glue_config_app.command(
    "set",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
    epilog=_examples_epilog(
        "gluekit config set my-job --default_arguments.--TempDir s3://my-bucket/tmp/",
        'gluekit config set --global --additional_python_modules "pydantic==2.11.7"',
    ),
)
def glue_config_set(
    ctx: typer.Context,
    job_name: Optional[str] = typer.Argument(
        None,
        help="Glue job name to save params for.",
    ),
    global_scope: bool = typer.Option(
        False,
        "--global",
        help="Save params as global defaults for all jobs.",
    ),
    profile: Optional[str] = typer.Option(
        None,
        "--profile",
        "-p",
        help=(
            "Save params under this gluekit profile scope, usually matching an "
            "AWS CLI profile name; does not contact AWS."
        ),
    ),
) -> None:
    """Set reusable local Glue config params by job or globally."""
    # Unknown options are allowed on this command, so the raw
    # ``--key value`` pairs are read from ``ctx.args`` rather than from
    # declared parameters.
    overrides = _parse_set_args(list(ctx.args))

    # The returned scope label is echoed back so the user can see where the
    # values were stored.
    scope_label = _set_saved_scope(overrides, job_name, global_scope, profile=profile)

    saved_count = len(overrides)
    typer.echo(
        f"Saved {saved_count} parameter(s) for '{scope_label}' in {GLUE_SET_FILE}."
    )
150
+
151
+
152
@glue_config_app.command(
    "get",
    epilog=_examples_epilog(
        "gluekit config get my-job",
        "gluekit config get",
    ),
)
def glue_config_get(
    job_name: Optional[str] = typer.Argument(
        None,
        help="Optional Glue job name to view merged params (global + job).",
    ),
    profile: Optional[str] = typer.Option(
        None,
        "--profile",
        "-p",
        help=(
            "Include params from this gluekit profile scope, usually matching an "
            "AWS CLI profile name; does not contact AWS."
        ),
    ),
) -> None:
    """Show stored local Glue config params."""
    # The store is always loaded first, even when a job name is given and
    # only the per-job view ends up being printed.
    full_store = _load_glue_set_store()

    # Without a job name the whole store is dumped; ``profile`` is only
    # consulted for the per-job lookup.
    payload = (
        _get_saved_params_for_job(job_name, profile=profile)
        if job_name
        else full_store
    )
    typer.echo(json.dumps(payload, indent=4))
@@ -0,0 +1,47 @@
1
+ from pathlib import Path
2
+
3
# Canonical component names for the "pull" direction.
PULL_COMPONENTS = {
    "config",
    "script",
    "notebook",
    "additional-python-modules",
    "extra-files",
}

# Canonical component names for the "push" direction.  Differs from the pull
# set by using "job-config" where pull uses "config".
PUSH_COMPONENTS = {
    "script",
    "notebook",
    "additional-python-modules",
    "extra-files",
    "job-config",
}

# Alternate spellings mapped onto the canonical pull component names,
# grouped by the canonical name they resolve to.
PULL_COMPONENT_ALIASES = {
    **dict.fromkeys(
        (
            "extra-py-files",
            "extra_py_files",
            "extra-py",
            "additional-python-files",
            "additional-python",
            "additional-py-files",
        ),
        "additional-python-modules",
    ),
    **dict.fromkeys(
        (
            "extra_files",
            "additional-files",
            "extra",
        ),
        "extra-files",
    ),
}

# Push accepts every pull alias plus several spellings of "job-config".
PUSH_COMPONENT_ALIASES = {
    **PULL_COMPONENT_ALIASES,
    **dict.fromkeys(
        (
            "config",
            "job_config",
            "update-config",
            "update_config",
        ),
        "job-config",
    ),
}

# Relative path of the JSON file named in the "Saved ... in <file>" messages.
GLUE_SET_FILE = Path(".gluekit/glue_set.json")

# Names exported by ``from .constants import *``.
__all__ = [
    "PULL_COMPONENTS",
    "PUSH_COMPONENTS",
    "PULL_COMPONENT_ALIASES",
    "PUSH_COMPONENT_ALIASES",
    "GLUE_SET_FILE",
]
@@ -0,0 +1,336 @@
1
+ from __future__ import annotations
2
+
3
+ import csv
4
+ import copy
5
+ import json
6
+ import re
7
+ import shutil
8
+ import subprocess
9
+ import tarfile
10
+ import uuid
11
+ import zipfile
12
+ from collections.abc import Mapping
13
+ from datetime import datetime, timezone
14
+ from email.parser import Parser
15
+ from fnmatch import fnmatch
16
+ from pathlib import Path
17
+ from tempfile import TemporaryDirectory
18
+ from typing import Any, Optional
19
+
20
+ import typer
21
+ from slugify import slugify
22
+
23
+ from ..job_mgmt.glue_jobs import (
24
+ download_glue_job_files,
25
+ list_glue_jobs,
26
+ normalize_glue_config_data,
27
+ convert_script_to_notebook,
28
+ convert_notebook_to_script,
29
+ _resolve_notebook_path,
30
+ upload_glue_job_files_from_config,
31
+ )
32
+ from ..job_mgmt.magics import build_magic_cell_sources as _build_magic_cell_sources
33
+
34
+ from .constants import *
35
+ from .helpers import *
36
+ from ..cli import app, glue_config_app
37
+
38
+
39
+ def _notebook_source_to_lines(source: Any) -> list[str]:
40
+ if source is None:
41
+ return []
42
+ if isinstance(source, str):
43
+ return source.splitlines(keepends=True)
44
+ lines: list[str] = []
45
+ for part in source:
46
+ if part.endswith("\n"):
47
+ lines.append(part)
48
+ else:
49
+ lines.append(f"{part}\n")
50
+ return lines
51
+
52
+
53
def _build_script_magic_cell_lines(config_data: dict[str, Any]) -> list[str]:
    """Render the Glue magics for ``config_data`` as a commented script header.

    The header starts with a ``#%% [markdown]`` marker and an
    "AWS Glue configs" title, followed by every line produced by
    ``_build_magic_cell_sources`` turned into a ``#`` comment, then a blank
    line and a bare ``#%%`` marker that opens the next cell.

    Args:
        config_data: Job configuration handed to ``_build_magic_cell_sources``.

    Returns:
        Newline-terminated lines ready to be prepended to a script.
    """
    commented: list[str] = []
    for cell_source in _build_magic_cell_sources(config_data):
        for raw in cell_source:
            text = raw.rstrip("\n")
            # Empty lines become a bare "#" so no trailing space is emitted.
            commented.append(f"# {text}\n" if text else "#\n")

    return [
        "#%% [markdown]\n",
        "# AWS Glue configs\n",
        *commented,
        "\n",
        "#%%\n",
    ]
64
+
65
+
66
def _update_script_config_cell(
    script_path: Path,
    config_data: dict[str, Any],
    dry_run: bool,
) -> bool:
    """Regenerate the Glue config header at the top of a script file.

    The script is split into cells on ``#%%`` marker lines.  Every cell that
    contains a Glue magic line is reduced to the part starting at its first
    non-blank, non-comment line (or dropped if it has none), and a fresh
    header built by ``_build_script_magic_cell_lines`` is prepended.

    Args:
        script_path: Script to update.
        config_data: Job configuration used to build the new header.
        dry_run: When true, report the pending change without writing.

    Returns:
        ``True`` if the file was changed (or would be, under ``dry_run``);
        ``False`` if the script does not exist or is already up to date.
    """
    if not script_path.exists():
        typer.echo(f"Script not found: {script_path}")
        return False

    # Python source is UTF-8 by default; read and write it explicitly as such
    # so non-ASCII text survives on platforms whose locale encoding differs.
    script_text = script_path.read_text(encoding="utf-8")
    lines = script_text.splitlines(keepends=True)
    magic_lines = _build_script_magic_cell_lines(config_data)

    marker_re = re.compile(r"^#\s*%%")

    def is_glue_magic_line(raw_line: str) -> bool:
        """Return True for lines that look like (commented) Glue magics."""
        candidate = raw_line.lstrip()
        if candidate.startswith("#"):
            candidate = candidate[1:].lstrip()
        # NOTE(review): a bare "#%%" cell marker also satisfies the first
        # test, so any cell that begins with a marker is treated as a magic
        # cell and loses its marker and leading comments.  Kept as-is because
        # the generated header relies on it to avoid duplicate markers;
        # confirm this is intended for multi-cell scripts.
        return (
            candidate.startswith("%")
            or "%%configure" in candidate
            or "AWS Glue configs" in candidate
        )

    def split_into_cells(all_lines: list[str]) -> list[list[str]]:
        """Group lines into cells, each starting at a ``#%%`` marker."""
        starts = [idx for idx, line in enumerate(all_lines) if marker_re.match(line)]
        if not starts:
            return [all_lines]
        if starts[0] != 0:
            # Lines before the first marker form their own leading cell.
            starts = [0] + starts
        boundaries = starts[1:] + [len(all_lines)]
        return [all_lines[start:end] for start, end in zip(starts, boundaries)]

    kept: list[str] = []
    for cell_lines in split_into_cells(lines):
        if not any(is_glue_magic_line(line) for line in cell_lines):
            kept.extend(cell_lines)
            continue

        # Skip blank lines and comments (which includes cell markers and
        # commented magics) to find where real code begins in this cell.
        first_code_idx = next(
            (
                idx
                for idx, line in enumerate(cell_lines)
                if line.strip() and not line.strip().startswith("#")
            ),
            None,
        )
        if first_code_idx is not None:
            kept.extend(cell_lines[first_code_idx:])

    new_lines = magic_lines + kept

    if new_lines == lines:
        return False

    if dry_run:
        typer.echo(f"Would update script config cell: {script_path}")
        return True

    script_path.write_text("".join(new_lines), encoding="utf-8")
    typer.echo(f"Updated script config cell: {script_path}")
    return True
131
+
132
+
133
+ def _extract_config_from_cell(lines: list[str]) -> tuple[int, dict[str, Any]]:
134
+ for idx, line in enumerate(lines):
135
+ if "%%configure" in line:
136
+ raw = "".join(lines[idx + 1 :]).strip()
137
+ if not raw:
138
+ return idx, {}
139
+ start = raw.find("{")
140
+ end = raw.rfind("}")
141
+ if start == -1 or end == -1 or end < start:
142
+ raise ValueError("Unable to locate JSON block after %%configure.")
143
+ payload = raw[start : end + 1]
144
+ return idx, json.loads(payload)
145
+ raise ValueError("No %%configure cell found.")
146
+
147
+
148
+ def _new_notebook_code_cell(source_lines: list[str]) -> dict[str, Any]:
149
+ return {
150
+ "cell_type": "code",
151
+ "id": str(uuid.uuid4()),
152
+ "metadata": {
153
+ "tags": [],
154
+ "trusted": True,
155
+ "vscode": {"languageId": "python_glue_session"},
156
+ },
157
+ "source": source_lines,
158
+ "execution_count": None,
159
+ "outputs": [],
160
+ }
161
+
162
+
163
+ def _new_notebook_markdown_cell(source_lines: list[str]) -> dict[str, Any]:
164
+ return {
165
+ "cell_type": "markdown",
166
+ "id": str(uuid.uuid4()),
167
+ "metadata": {},
168
+ "source": source_lines,
169
+ }
170
+
171
+
172
+ def _is_notebook_magic_cell(cell: dict[str, Any]) -> bool:
173
+ if cell.get("cell_type") != "code":
174
+ return False
175
+ lines = _notebook_source_to_lines(cell.get("source"))
176
+ for line in lines:
177
+ if line.lstrip().startswith("%"):
178
+ return True
179
+ return False
180
+
181
+
182
+ def _is_generated_glue_heading_cell(cell: dict[str, Any]) -> bool:
183
+ if cell.get("cell_type") != "markdown":
184
+ return False
185
+ lines = _notebook_source_to_lines(cell.get("source"))
186
+ text = "".join(lines).strip()
187
+ return text in {"# AWS Glue configs", "# AWS Glue Script"}
188
+
189
+
190
def _build_notebook_magic_cells(config_data: dict[str, Any]) -> list[dict[str, Any]]:
    """Build the generated leading cells for a Glue notebook.

    Args:
        config_data: Job configuration handed to ``_build_magic_cell_sources``.

    Returns:
        A "# AWS Glue configs" markdown cell, one code cell per entry yielded
        by ``_build_magic_cell_sources``, then a "# AWS Glue Script" markdown
        cell, in that order.
    """
    heading = _new_notebook_markdown_cell(["# AWS Glue configs\n"])
    magic_cells = [
        _new_notebook_code_cell(cell_source)
        for cell_source in _build_magic_cell_sources(config_data)
    ]
    # The trailing heading is created last, after all magic cells exist.
    script_heading = _new_notebook_markdown_cell(["# AWS Glue Script\n"])
    return [heading, *magic_cells, script_heading]
201
+
202
+
203
def _update_notebook_config_cell(
    notebook_path: Path,
    config_data: dict[str, Any],
    keys_to_update: set[str],
    dry_run: bool,
) -> bool:
    """Replace the generated Glue magic cells at the top of a notebook.

    Every existing magic code cell and generated heading cell is removed and
    a fresh set built from ``config_data`` is placed before the remaining
    cells.

    Args:
        notebook_path: Notebook (``.ipynb``) file to update.
        config_data: Job configuration used to build the new magic cells.
        keys_to_update: Accepted for interface compatibility; not used.
        dry_run: When true, report the pending change without writing.

    Returns:
        ``False`` if the notebook does not exist; otherwise ``True``.  No
        comparison with the previous content is made, so an existing notebook
        always counts as updated.
    """
    if not notebook_path.exists():
        typer.echo(f"Notebook not found: {notebook_path}")
        return False

    del keys_to_update

    # Notebook files are UTF-8 JSON; decode explicitly instead of relying on
    # the platform's locale encoding, which would garble non-ASCII cell text.
    notebook = json.loads(notebook_path.read_text(encoding="utf-8"))
    existing_cells = notebook.get("cells", [])
    cleaned_cells = [
        cell
        for cell in existing_cells
        if not _is_notebook_magic_cell(cell)
        and not _is_generated_glue_heading_cell(cell)
    ]

    new_magic_cells = _build_notebook_magic_cells(config_data)
    notebook["cells"] = new_magic_cells + cleaned_cells

    if dry_run:
        typer.echo(f"Would update notebook config cell: {notebook_path}")
        return True

    notebook_path.write_text(json.dumps(notebook, indent=4), encoding="utf-8")
    typer.echo(f"Updated notebook config cell: {notebook_path}")
    return True
234
+
235
+
236
@app.command(
    "convert",
    epilog=_examples_epilog(
        "gluekit convert my-job --from script --to notebook --use-config",
        "gluekit convert my-job --from notebook --to script --script-only",
    ),
)
def glue_convert(
    job_name: Optional[str] = typer.Argument(
        None,
        help="Glue job name to convert. Defaults to the active checkout selection.",
    ),
    from_format: str = typer.Option(
        "script",
        "--from",
        help="Source format (script or notebook).",
    ),
    to_format: str = typer.Option(
        "notebook",
        "--to",
        help="Target format (script or notebook).",
    ),
    use_config: bool = typer.Option(
        True,
        "--use-config/--script-only",
        help="Use config metadata (magics and paths) when available.",
    ),
    dry_run: bool = typer.Option(
        False,
        "--dry-run",
        help="Show what would be converted without writing files.",
    ),
    config_dir: Path = typer.Option(
        Path("glue/configs"),
        "--config-dir",
        help="Directory containing Glue job config files.",
    ),
) -> None:
    """Convert Glue jobs between script and notebook formats."""
    job_name = _resolve_single_job_name(job_name, "glue convert")
    from_format = from_format.lower()
    to_format = to_format.lower()

    # Identical formats are rejected first, so an unsupported value given
    # twice reports "must be different" rather than "must be either ...".
    if from_format == to_format:
        raise typer.BadParameter("--from and --to must be different values.")

    supported_formats = {"script", "notebook"}
    if not (from_format in supported_formats and to_format in supported_formats):
        raise typer.BadParameter("--from/--to must be either script or notebook.")

    # A config entry is only looked up when config usage is enabled; a
    # missing entry falls back to the same defaults as --script-only.
    config_entry = _load_config_index(config_dir).get(job_name) if use_config else None

    if config_entry:
        config_data = config_entry.get("config", {})
        source_control = config_data.get("SourceControlDetails", {})
        script_path = Path(
            source_control.get("ScriptLocation")
            or source_control.get("LocalPath")
            or f"glue/scripts/{slugify(job_name)}.py"
        )
        configured_notebook = source_control.get("NotebookPath") or source_control.get(
            "NotebookLocation"
        )
        # Without an explicit notebook path, derive one from the script path.
        notebook_path = (
            Path(configured_notebook)
            if configured_notebook
            else Path(_resolve_notebook_path(script_path))
        )
    else:
        config_data = None
        script_path = Path(f"glue/scripts/{slugify(job_name)}.py")
        notebook_path = Path(f"glue/notebooks/{slugify(job_name)}.ipynb")

    # Only the file being converted *from* has to exist.
    if from_format == "script" and not script_path.exists():
        raise typer.BadParameter(f"Script not found: {script_path}")
    if from_format == "notebook" and not notebook_path.exists():
        raise typer.BadParameter(f"Notebook not found: {notebook_path}")

    if dry_run:
        if from_format == "notebook":
            source_path, target_path = notebook_path, script_path
        else:
            source_path, target_path = script_path, notebook_path
        typer.echo(f"Would convert: {source_path} -> {target_path}")
        return

    direction = (from_format, to_format)
    if direction == ("script", "notebook"):
        converted_path = convert_script_to_notebook(
            script_path,
            notebook_path,
            config_data=config_data,
            include_magics=use_config,
        )
        typer.echo(f"Converted: {script_path} -> {converted_path}")
    elif direction == ("notebook", "script"):
        converted_path = convert_notebook_to_script(notebook_path, script_path)
        typer.echo(f"Converted: {notebook_path} -> {converted_path}")
    else:
        # Not reachable after the validation above; kept as a safety net.
        raise typer.BadParameter("Only script <-> notebook conversions are supported.")