gluekit 1.0.1.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. gluekit/__init__.py +7 -0
  2. gluekit/app.py +0 -0
  3. gluekit/cli.py +64 -0
  4. gluekit/commands/__init__.py +1 -0
  5. gluekit/commands/add.py +455 -0
  6. gluekit/commands/build.py +816 -0
  7. gluekit/commands/checkout.py +114 -0
  8. gluekit/commands/clone.py +516 -0
  9. gluekit/commands/config_commands.py +180 -0
  10. gluekit/commands/constants.py +47 -0
  11. gluekit/commands/convert.py +336 -0
  12. gluekit/commands/edit.py +1104 -0
  13. gluekit/commands/helpers.py +1068 -0
  14. gluekit/commands/init.py +798 -0
  15. gluekit/commands/list.py +16 -0
  16. gluekit/commands/local_commands.py +680 -0
  17. gluekit/commands/pull.py +374 -0
  18. gluekit/commands/push.py +251 -0
  19. gluekit/commands/remove.py +161 -0
  20. gluekit/commands/run.py +126 -0
  21. gluekit/commands/status.py +97 -0
  22. gluekit/commands/sync.py +97 -0
  23. gluekit/commands/update.py +104 -0
  24. gluekit/job_mgmt/__init__.py +0 -0
  25. gluekit/job_mgmt/glue_jobs.py +1323 -0
  26. gluekit/job_mgmt/magics.py +122 -0
  27. gluekit/job_mgmt/resources/__init__.py +0 -0
  28. gluekit/job_mgmt/resources/glue_job_schema.json +40341 -0
  29. gluekit/job_mgmt/resources/magic_map.json +83 -0
  30. gluekit/job_mgmt/schema.py +165 -0
  31. gluekit/local/__init__.py +6 -0
  32. gluekit/local/awsglue/__init__.py +1 -0
  33. gluekit/local/awsglue/context.py +30 -0
  34. gluekit/local/awsglue/job.py +9 -0
  35. gluekit/local/awsglue/utils.py +17 -0
  36. gluekit/local/local.py +434 -0
  37. gluekit/local/local_fixtures.py +337 -0
  38. gluekit/local/pyspark/__init__.py +7 -0
  39. gluekit/local/pyspark/context.py +31 -0
  40. gluekit/local/pyspark/sql/__init__.py +6 -0
  41. gluekit/local/pyspark/sql/session.py +29 -0
  42. gluekit-1.0.1.dev1.dist-info/METADATA +1176 -0
  43. gluekit-1.0.1.dev1.dist-info/RECORD +46 -0
  44. gluekit-1.0.1.dev1.dist-info/WHEEL +5 -0
  45. gluekit-1.0.1.dev1.dist-info/entry_points.txt +2 -0
  46. gluekit-1.0.1.dev1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,337 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import posixpath
5
+ import shutil
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ LOCAL_RUN_CONFIG_FILE = Path(".gluekit/local.json")
10
+
11
+
12
def empty_local_run_config() -> dict[str, Any]:
    """Return a fresh, empty version-1 local run configuration.

    Every call builds new dictionaries and lists, so the result can be
    mutated by the caller without affecting later calls.
    """
    s3_section: dict[str, Any] = {"buckets": {}, "objects": {}, "directories": []}
    ssm_section: dict[str, Any] = {"parameters": {}}
    return {"version": 1, "s3": s3_section, "ssm": ssm_section}
18
+
19
+
20
def parse_s3_uri(raw: str) -> tuple[str, str]:
    """Split an ``s3://bucket/key`` URI into a ``(bucket, key)`` pair.

    The bucket is everything between the scheme and the first ``/``, with
    surrounding whitespace removed. The key is whatever follows that first
    ``/`` and is returned untouched (it may be empty).

    Raises:
        ValueError: If *raw* does not start with ``s3://`` or the bucket
            part is empty after stripping.
    """
    scheme = "s3://"
    if not raw.startswith(scheme):
        raise ValueError(f"S3 URI must start with s3://. Received: {raw!r}")

    remainder = raw[len(scheme):]
    slash_at = remainder.find("/")
    if slash_at < 0:
        bucket_part, key = remainder, ""
    else:
        bucket_part, key = remainder[:slash_at], remainder[slash_at + 1:]

    bucket = bucket_part.strip()
    if bucket:
        return bucket, key
    raise ValueError(f"S3 URI must include a bucket name. Received: {raw!r}")
29
+
30
+
31
def is_s3_uri(raw: str) -> bool:
    """Report whether *raw* begins with the ``s3://`` scheme.

    Only the prefix is inspected; the rest of the string is not validated.
    """
    scheme_prefix = "s3://"
    return raw.startswith(scheme_prefix)
33
+
34
+
35
def normalize_s3_uri(raw: str) -> str:
    """Return *raw* in canonical ``s3://bucket[/key]`` form.

    Leading and trailing slashes are removed from the key, and the rest is
    passed through ``posixpath.normpath``. When nothing is left of the key
    (empty, or it normalizes to ``.``) only ``s3://bucket`` is returned.

    Raises:
        ValueError: Propagated from ``parse_s3_uri`` for a malformed URI.
    """
    bucket, key = parse_s3_uri(raw)
    trimmed = key.strip("/")
    collapsed = posixpath.normpath(trimmed) if trimmed else ""
    if collapsed in ("", "."):
        return f"s3://{bucket}"
    return f"s3://{bucket}/{collapsed}"
43
+
44
+
45
def normalize_s3_object_uri(raw: str) -> str:
    """Return the canonical form of an S3 URI that must name an object.

    The URI is normalized with ``normalize_s3_uri`` and then required to
    still carry a key. The check runs on the normalized result rather than
    the raw input: keys such as ``.`` or ``a/..`` collapse to nothing during
    normalization and would otherwise come back as a bare ``s3://bucket``.

    Raises:
        ValueError: If the URI is malformed or has no key once normalized.
    """
    normalized = normalize_s3_uri(raw)
    _, key = parse_s3_uri(normalized)
    if not key:
        raise ValueError(f"S3 object URI must include a key. Received: {raw!r}")
    return normalized
51
+
52
+
53
def s3_destination_uri(destination: str, source_name: str) -> str:
    """Resolve the object URI a source file should be stored under.

    When *destination* has no key or ends with ``/`` it is treated as a
    prefix and *source_name* is joined onto it. Otherwise *destination* is
    taken as the object URI itself. The result is normalized either way.

    Raises:
        ValueError: Propagated from the URI parsing/normalizing helpers.
    """
    bucket, key = parse_s3_uri(destination)
    treat_as_prefix = not key or destination.endswith("/")
    if not treat_as_prefix:
        return normalize_s3_object_uri(destination)
    object_key = posixpath.join(key, source_name)
    return normalize_s3_object_uri(f"s3://{bucket}/{object_key}")
60
+
61
+
62
def s3_prefix(raw: str) -> str:
    """Return *raw* as an S3 prefix that always ends with ``/``.

    Leading and trailing slashes are stripped from the key first, so the
    result is ``s3://bucket/key/`` or, without a key, ``s3://bucket/``.

    Raises:
        ValueError: Propagated from ``parse_s3_uri`` for a malformed URI.
    """
    bucket, key = parse_s3_uri(raw)
    segments = [f"s3://{bucket}"]
    trimmed_key = key.strip("/")
    if trimmed_key:
        segments.append(trimmed_key)
    return "/".join(segments) + "/"
68
+
69
+
70
def s3_directory_destination(raw: str) -> str:
    """Return the normalized, slash-terminated prefix for a directory mapping.

    The URI is first canonicalized with ``normalize_s3_uri`` and then turned
    into a prefix with ``s3_prefix``.
    """
    canonical_uri = normalize_s3_uri(raw)
    return s3_prefix(canonical_uri)
72
+
73
+
74
def s3_uri_matches_prefix(uri: str, prefix: str) -> bool:
    """Report whether *uri* starts with *prefix*.

    This is a plain string comparison with no normalization. A prefix
    without a trailing ``/`` therefore also matches sibling keys that merely
    share the same leading characters.
    """
    is_match = uri.startswith(prefix)
    return is_match
76
+
77
+
78
def normalize_s3_bucket_name(raw: str) -> str:
    """Return the bare bucket name for a bucket mapping entry.

    Accepts either a plain bucket name or an ``s3://bucket`` URI. A URI may
    carry nothing but slashes after the bucket; a plain name is stripped of
    surrounding whitespace and must not contain ``/``.

    Raises:
        ValueError: If the input is empty, contains a key, or is otherwise
            not a bucket name.
    """
    if not raw.startswith("s3://"):
        name = raw.strip()
        if name == "":
            raise ValueError("S3 bucket mapping must include a bucket name.")
        if name.find("/") != -1:
            raise ValueError(f"S3 bucket mapping must be a bucket name: {raw!r}")
        return name

    name, key = parse_s3_uri(raw)
    if key.strip("/") != "":
        raise ValueError(f"S3 bucket mapping must not include a key: {raw!r}")
    return name
90
+
91
+
92
def normalize_s3_root_path(raw: str) -> str:
    """Return the S3 root mapping path with surrounding whitespace removed.

    Raises:
        ValueError: If nothing remains after stripping.
    """
    trimmed = raw.strip()
    if trimmed:
        return trimmed
    raise ValueError("S3 root mapping must include a local directory.")
97
+
98
+
99
def load_local_fixture_config(
    config_file: str | Path | None = None,
) -> dict[str, Any]:
    """Load and normalize the local run configuration from disk.

    Args:
        config_file: Path of the JSON config; ``LOCAL_RUN_CONFIG_FILE`` is
            used when omitted.

    Returns:
        The normalized configuration, or an empty configuration when the
        file does not exist.

    Raises:
        ValueError: If the file is not valid JSON, its top level is not a
            JSON object, or normalization rejects its content.
    """
    if config_file is None:
        path = LOCAL_RUN_CONFIG_FILE
    else:
        path = Path(config_file)

    if not path.exists():
        return empty_local_run_config()

    text = path.read_text(encoding="utf-8")
    try:
        data = json.loads(text)
    except json.JSONDecodeError as exc:
        raise ValueError(f"Invalid JSON in local run config {path}: {exc}") from exc

    if isinstance(data, dict):
        return normalize_local_fixture_config(data, path=path)
    raise ValueError(f"Local run config {path} must contain a JSON object.")
112
+
113
+
114
def normalize_local_fixture_config(
    data: dict[str, Any], *, path: Path | None = None
) -> dict[str, Any]:
    """Validate a local run configuration and return it in canonical form.

    Sections are checked in a fixed order (version, profile, s3, ssm) and
    the first problem found is reported. The result always contains
    ``version``, ``s3`` (``buckets``, ``objects``, ``directories`` and,
    when configured, ``root``) and ``ssm.parameters``; mappings are sorted
    by key and directory entries by ``(destination, local_path)``. A
    ``profile`` key is included only when a non-blank profile was given.

    Args:
        data: Parsed configuration object.
        path: Optional file the data came from; only used to make error
            messages point at that file.

    Raises:
        ValueError: If any section has the wrong type or an invalid value.
    """
    location = f" in {path}" if path else ""
    if data.get("version") != 1:
        raise ValueError(f"Local run config{location} must use version 1.")

    profile = _normalize_profile(data.get("profile"), location)
    s3_config = _normalize_s3_section(data.get("s3", {}), location)
    parameters = _normalize_ssm_parameters(data.get("ssm", {}), location)

    normalized: dict[str, Any] = {
        "version": 1,
        "s3": s3_config,
        "ssm": {"parameters": parameters},
    }
    if profile:
        normalized["profile"] = profile
    return normalized


def _normalize_profile(raw_profile: Any, location: str) -> str | None:
    """Return the stripped profile name, or None when no profile is set."""
    if raw_profile is None:
        return None
    if not isinstance(raw_profile, str):
        raise ValueError(f"Local run config{location} has invalid profile.")
    return raw_profile.strip()


def _is_str_to_str_mapping(candidate: Any) -> bool:
    """Report whether *candidate* is a dict whose keys and values are all str."""
    return isinstance(candidate, dict) and all(
        isinstance(key, str) and isinstance(value, str)
        for key, value in candidate.items()
    )


def _normalize_s3_section(s3: Any, location: str) -> dict[str, Any]:
    """Validate the ``s3`` section and return its canonical, sorted form."""
    if not isinstance(s3, dict):
        raise ValueError(f"Local run config{location} has invalid s3 section.")
    raw_buckets = s3.get("buckets", {})
    raw_root = s3.get("root")
    raw_objects = s3.get("objects", {})
    raw_directories = s3.get("directories", [])

    # All container shapes are checked before any entry is normalized, so a
    # structurally broken section is reported ahead of a bad individual value.
    if raw_root is not None and not isinstance(raw_root, str):
        raise ValueError(f"Local run config{location} has invalid s3.root.")
    if not _is_str_to_str_mapping(raw_buckets):
        raise ValueError(f"Local run config{location} has invalid s3.buckets.")
    if not _is_str_to_str_mapping(raw_objects):
        raise ValueError(f"Local run config{location} has invalid s3.objects.")
    if not isinstance(raw_directories, list):
        raise ValueError(f"Local run config{location} has invalid s3.directories.")

    buckets = _normalize_s3_buckets(raw_buckets, location)
    root = normalize_s3_root_path(raw_root) if raw_root is not None else None
    objects = _normalize_s3_objects(raw_objects, location)
    directories = _normalize_s3_directories(raw_directories, location)

    s3_config: dict[str, Any] = {
        "buckets": dict(sorted(buckets.items())),
        "objects": dict(sorted(objects.items())),
        "directories": sorted(
            directories,
            key=lambda entry: (entry["destination"], entry["local_path"]),
        ),
    }
    if root is not None:
        s3_config["root"] = root
    return s3_config


def _normalize_s3_buckets(raw_buckets: dict[str, str], location: str) -> dict[str, str]:
    """Map normalized bucket names to their (non-blank) local directories."""
    buckets: dict[str, str] = {}
    for bucket, local_path in raw_buckets.items():
        normalized_bucket = normalize_s3_bucket_name(bucket)
        if not local_path.strip():
            raise ValueError(
                f"Local run config{location} has invalid local path for S3 bucket {bucket!r}."
            )
        buckets[normalized_bucket] = local_path
    return buckets


def _normalize_s3_objects(raw_objects: dict[str, str], location: str) -> dict[str, str]:
    """Map normalized object URIs to their (non-blank) local file paths."""
    objects: dict[str, str] = {}
    for uri, local_path in raw_objects.items():
        # A blank path can never name a fixture file; reject it here, the
        # same way bucket and directory mappings reject blank paths.
        if not local_path.strip():
            raise ValueError(
                f"Local run config{location} has invalid local path for S3 object {uri!r}."
            )
        objects[normalize_s3_object_uri(uri)] = local_path
    return objects


def _normalize_s3_directories(
    raw_directories: list[Any], location: str
) -> list[dict[str, str]]:
    """Validate directory mappings and drop exact duplicates, keeping order."""
    directories: list[dict[str, str]] = []
    seen_directories: set[tuple[str, str]] = set()
    for entry in raw_directories:
        if not isinstance(entry, dict):
            raise ValueError(
                f"Local run config{location} has invalid s3.directories entry."
            )
        raw_local_path = entry.get("local_path")
        raw_destination = entry.get("destination")
        if not isinstance(raw_local_path, str) or not raw_local_path.strip():
            raise ValueError(
                f"Local run config{location} has invalid local_path in s3.directories."
            )
        if not isinstance(raw_destination, str) or not raw_destination.strip():
            raise ValueError(
                f"Local run config{location} has invalid destination in s3.directories."
            )
        normalized_destination = s3_directory_destination(raw_destination)
        directory_key = (raw_local_path, normalized_destination)
        if directory_key in seen_directories:
            continue
        seen_directories.add(directory_key)
        directories.append(
            {
                "local_path": raw_local_path,
                "destination": normalized_destination,
            }
        )
    return directories


def _normalize_ssm_parameters(ssm: Any, location: str) -> dict[str, str]:
    """Validate the ``ssm`` section and return its parameters sorted by name."""
    if not isinstance(ssm, dict):
        raise ValueError(f"Local run config{location} has invalid ssm section.")
    raw_parameters = ssm.get("parameters", {})
    if not isinstance(raw_parameters, dict):
        raise ValueError(f"Local run config{location} has invalid ssm.parameters.")

    parameters: dict[str, str] = {}
    for name, value in raw_parameters.items():
        if not isinstance(name, str) or not name.strip() or " " in name:
            raise ValueError(
                f"Local run config{location} has invalid SSM parameter name."
            )
        if not isinstance(value, str):
            raise ValueError(
                f"Local run config{location} has invalid value for SSM parameter {name!r}."
            )
        parameters[name.strip()] = value
    return dict(sorted(parameters.items()))
229
+
230
+
231
def save_local_fixture_config(
    config: dict[str, Any], config_file: str | Path | None = None
) -> None:
    """Normalize *config* and write it to disk as indented JSON.

    The configuration is validated before anything is written. Missing
    parent directories are created, and the file ends with a newline.

    Args:
        config: Configuration to persist.
        config_file: Target path; ``LOCAL_RUN_CONFIG_FILE`` when omitted.

    Raises:
        ValueError: If normalization rejects *config*.
    """
    target = LOCAL_RUN_CONFIG_FILE if config_file is None else Path(config_file)
    validated = normalize_local_fixture_config(config, path=target)
    target.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(validated, indent=4)
    target.write_text(f"{serialized}\n", encoding="utf-8")
238
+
239
+
240
def config_to_runtime_kwargs(config: dict[str, Any]) -> dict[str, Any]:
    """Expand a normalized config into keyword arguments for the local runtime.

    Object mappings are merged in this order, with later layers overriding
    earlier ones for the same URI: root mapping, bucket mappings, explicit
    ``s3.objects``, directory mappings.

    Returns:
        A dict with ``create_buckets`` (sorted names of every bucket that
        appears in the merged objects), ``s3_objects`` (URI to local path)
        and ``ssm_parameters`` (a copy of the configured parameters).

    Raises:
        ValueError: Propagated from the expansion helpers.
    """
    layers = (
        expand_s3_root_mapping(config),
        expand_bucket_mappings(config),
        config["s3"]["objects"],
        expand_directory_mappings(config),
    )
    s3_objects: dict[str, str] = {}
    for layer in layers:
        s3_objects.update(layer)

    create_buckets = sorted({parse_s3_uri(uri)[0] for uri in s3_objects})
    ssm_parameters = dict(config["ssm"]["parameters"])
    return {
        "create_buckets": create_buckets,
        "s3_objects": s3_objects,
        "ssm_parameters": ssm_parameters,
    }
251
+
252
+
253
def load_runtime_config(config_file: str | Path | None = None) -> dict[str, Any]:
    """Load the local run config and return it as runtime keyword arguments.

    Args:
        config_file: Path of the JSON config; ``LOCAL_RUN_CONFIG_FILE`` is
            used when omitted.

    Returns:
        An empty dict when the file does not exist, otherwise the result of
        ``config_to_runtime_kwargs`` for the loaded configuration.
    """
    path = LOCAL_RUN_CONFIG_FILE if config_file is None else Path(config_file)
    if path.exists():
        return config_to_runtime_kwargs(load_local_fixture_config(path))
    return {}
258
+
259
+
260
def expand_s3_root_mapping(config: dict[str, Any]) -> dict[str, str]:
    """Expand the ``s3.root`` directory into object URI to local file pairs.

    Each immediate subdirectory of the root is treated as a bucket, and
    every file found recursively below it becomes an object whose key is
    the file's path relative to that bucket directory.

    Returns:
        A mapping of normalized object URI to local file path, or an empty
        dict when no string ``s3.root`` is configured.

    Raises:
        ValueError: If the root does not exist or is not a directory, or a
            bucket name or object URI is rejected by the normalizers.
    """
    root = config.get("s3", {}).get("root")
    if not isinstance(root, str):
        return {}

    root_dir = Path(root)
    if not root_dir.exists():
        raise ValueError(f"Mapped local S3 root directory does not exist: {root_dir}")
    if not root_dir.is_dir():
        raise ValueError(f"Mapped local S3 root path is not a directory: {root_dir}")

    mapping: dict[str, str] = {}
    bucket_dirs = sorted(entry for entry in root_dir.iterdir() if entry.is_dir())
    for bucket_dir in bucket_dirs:
        bucket = normalize_s3_bucket_name(bucket_dir.name)
        files = sorted(item for item in bucket_dir.rglob("*") if item.is_file())
        for file_path in files:
            key = file_path.relative_to(bucket_dir).as_posix()
            object_uri = normalize_s3_object_uri(f"s3://{bucket}/{key}")
            mapping[object_uri] = str(file_path)
    return mapping
278
+
279
+
280
def expand_bucket_mappings(config: dict[str, Any]) -> dict[str, str]:
    """Expand ``s3.buckets`` into object URI to local file pairs.

    Every file found recursively below a bucket's local directory becomes
    an object in that bucket, keyed by its path relative to the directory.
    A non-dict ``buckets`` value yields an empty result, and entries whose
    bucket or path is not a string are skipped.

    Raises:
        ValueError: If a mapped directory does not exist or is not a
            directory, or an object URI is rejected by the normalizer.
    """
    mapping: dict[str, str] = {}
    buckets = config.get("s3", {}).get("buckets", {})
    if not isinstance(buckets, dict):
        return mapping

    for bucket, local_path in buckets.items():
        if not (isinstance(bucket, str) and isinstance(local_path, str)):
            continue
        bucket_dir = Path(local_path)
        if not bucket_dir.exists():
            raise ValueError(
                f"Mapped local S3 bucket directory does not exist: {bucket_dir}"
            )
        if not bucket_dir.is_dir():
            raise ValueError(
                f"Mapped local S3 bucket path is not a directory: {bucket_dir}"
            )
        files = sorted(item for item in bucket_dir.rglob("*") if item.is_file())
        for file_path in files:
            key = file_path.relative_to(bucket_dir).as_posix()
            object_uri = normalize_s3_object_uri(f"s3://{bucket}/{key}")
            mapping[object_uri] = str(file_path)
    return mapping
302
+
303
+
304
def expand_directory_mappings(config: dict[str, Any]) -> dict[str, str]:
    """Expand ``s3.directories`` into object URI to local file pairs.

    Every file found recursively below an entry's ``local_path`` becomes an
    object under that entry's ``destination`` prefix, keyed by its path
    relative to the local directory. A non-list ``directories`` value
    yields an empty result, and malformed entries are skipped.

    Raises:
        ValueError: If a mapped directory does not exist or is not a
            directory, or an object URI is rejected by the normalizer.
    """
    expanded: dict[str, str] = {}
    directories = config.get("s3", {}).get("directories", [])
    if not isinstance(directories, list):
        return expanded

    for entry in directories:
        if not isinstance(entry, dict):
            continue
        local_path = entry.get("local_path")
        destination = entry.get("destination")
        if not isinstance(local_path, str) or not isinstance(destination, str):
            continue

        source_dir = Path(local_path)
        if not source_dir.exists():
            raise ValueError(f"Mapped local directory does not exist: {source_dir}")
        if not source_dir.is_dir():
            raise ValueError(f"Mapped local path is not a directory: {source_dir}")

        # This function also accepts configs that were not normalized, so
        # the destination may lack its trailing "/". Without the separator
        # the prefix and file name would fuse ("s3://b/data" + "f.csv" ->
        # "s3://b/dataf.csv"), or even alter the bucket name.
        prefix = destination if destination.endswith("/") else f"{destination}/"

        for candidate in sorted(filter(Path.is_file, source_dir.rglob("*"))):
            relative_path = candidate.relative_to(source_dir).as_posix()
            target_uri = normalize_s3_object_uri(f"{prefix}{relative_path}")
            expanded[target_uri] = str(candidate)
    return expanded
326
+
327
+
328
def copy_mapped_file(source_path: str, destination: str | Path) -> Path:
    """Copy a mapped local file to *destination* and return the written path.

    When *destination* ends with ``/`` or ``\\``, or is an existing
    directory, the file is placed inside it under the source file's name.
    Missing parent directories are created.

    Raises:
        ValueError: If *source_path* is not an existing regular file.
    """
    source = Path(source_path)
    if not source.is_file():
        raise ValueError(f"Mapped local file does not exist: {source}")

    target = Path(destination)
    names_a_directory = str(destination).endswith(("/", "\\"))
    if names_a_directory or target.is_dir():
        target = target / source.name

    target.parent.mkdir(parents=True, exist_ok=True)
    shutil.copyfile(source, target)
    return target
@@ -0,0 +1,7 @@
1
"""pyspark top-level module"""

# NOTE(review): `sql` is imported before `context`, as in the original
# single-statement import; keep this order unless it is confirmed that the
# import order of the two submodules does not matter.
from . import sql
from . import context

__all__ = ["context", "sql"]
@@ -0,0 +1,31 @@
1
+ from sqlframe.duckdb import DuckDBSession
2
+
3
+
4
class SparkContext:
    """Minimal local stand-in for ``pyspark.context.SparkContext``.

    It exists so the usual Glue bootstrap code can run locally; the only
    state it keeps is a reference to the sqlframe DuckDB session builder.

    Example usage in a Glue job or notebook::

        import sys

        from awsglue.context import GlueContext
        from awsglue.job import Job
        from awsglue.utils import getResolvedOptions
        from pyspark.context import SparkContext

        sc = SparkContext()
        glueContext = GlueContext(sc)
        spark = glueContext.spark_session
        args = getResolvedOptions(sys.argv, ['JOB_NAME'])
        job = Job(glueContext)
        job.init(args['JOB_NAME'], args)
    """

    def __init__(self) -> None:
        # Keep the builder itself; no session is created at this point.
        self._builder = DuckDBSession.builder

    @classmethod
    def getOrCreate(cls) -> "SparkContext":
        """Return a new context; instances are not cached between calls."""
        return cls()
@@ -0,0 +1,6 @@
1
from sqlframe import activate

# Switch sqlframe to its DuckDB engine as soon as this package is imported.
# NOTE(review): this call is deliberately left ahead of the `.session` import
# below, as in the original; confirm whether that ordering is required.
activate(engine="duckdb")


from .session import SparkSession
@@ -0,0 +1,29 @@
1
+ from sqlframe.duckdb import DuckDBSession
2
+
3
+
4
class SparkSession(DuckDBSession):
    """Minimal local stand-in for ``pyspark.sql.SparkSession``.

    Kept for Glue bootstrap compatibility. It is probably redundant with
    what sqlframe already provides, but is kept here just in case.

    Example usage in a Glue job or notebook::

        from pyspark.sql import SparkSession

        spark = (
            SparkSession.builder.appName("DataMigrationCrossTrackProfiling")
            .config("spark.sql.caseSensitive", "true")
            .getOrCreate()
        )
    """

    # NOTE(review): this initializer does not call ``super().__init__()``, so
    # whatever setup ``DuckDBSession`` performs there is skipped for instances
    # created directly. The docstring example goes through the class-level
    # ``builder`` (presumably inherited from ``DuckDBSession``) rather than
    # this instance attribute. Confirm both are intended.
    def __init__(self):
        self._builder = DuckDBSession.builder

    def getOrCreate(self):
        """Return the session produced by the stored builder."""
        return self._builder.getOrCreate()