gluekit 1.0.1.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gluekit/__init__.py +7 -0
- gluekit/app.py +0 -0
- gluekit/cli.py +64 -0
- gluekit/commands/__init__.py +1 -0
- gluekit/commands/add.py +455 -0
- gluekit/commands/build.py +816 -0
- gluekit/commands/checkout.py +114 -0
- gluekit/commands/clone.py +516 -0
- gluekit/commands/config_commands.py +180 -0
- gluekit/commands/constants.py +47 -0
- gluekit/commands/convert.py +336 -0
- gluekit/commands/edit.py +1104 -0
- gluekit/commands/helpers.py +1068 -0
- gluekit/commands/init.py +798 -0
- gluekit/commands/list.py +16 -0
- gluekit/commands/local_commands.py +680 -0
- gluekit/commands/pull.py +374 -0
- gluekit/commands/push.py +251 -0
- gluekit/commands/remove.py +161 -0
- gluekit/commands/run.py +126 -0
- gluekit/commands/status.py +97 -0
- gluekit/commands/sync.py +97 -0
- gluekit/commands/update.py +104 -0
- gluekit/job_mgmt/__init__.py +0 -0
- gluekit/job_mgmt/glue_jobs.py +1323 -0
- gluekit/job_mgmt/magics.py +122 -0
- gluekit/job_mgmt/resources/__init__.py +0 -0
- gluekit/job_mgmt/resources/glue_job_schema.json +40341 -0
- gluekit/job_mgmt/resources/magic_map.json +83 -0
- gluekit/job_mgmt/schema.py +165 -0
- gluekit/local/__init__.py +6 -0
- gluekit/local/awsglue/__init__.py +1 -0
- gluekit/local/awsglue/context.py +30 -0
- gluekit/local/awsglue/job.py +9 -0
- gluekit/local/awsglue/utils.py +17 -0
- gluekit/local/local.py +434 -0
- gluekit/local/local_fixtures.py +337 -0
- gluekit/local/pyspark/__init__.py +7 -0
- gluekit/local/pyspark/context.py +31 -0
- gluekit/local/pyspark/sql/__init__.py +6 -0
- gluekit/local/pyspark/sql/session.py +29 -0
- gluekit-1.0.1.dev1.dist-info/METADATA +1176 -0
- gluekit-1.0.1.dev1.dist-info/RECORD +46 -0
- gluekit-1.0.1.dev1.dist-info/WHEEL +5 -0
- gluekit-1.0.1.dev1.dist-info/entry_points.txt +2 -0
- gluekit-1.0.1.dev1.dist-info/top_level.txt +1 -0
gluekit/commands/init.py
ADDED
|
@@ -0,0 +1,798 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import configparser
|
|
4
|
+
import csv
|
|
5
|
+
import os
|
|
6
|
+
import re
|
|
7
|
+
import shlex
|
|
8
|
+
from collections.abc import Mapping
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from datetime import datetime, timezone
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any, Optional
|
|
13
|
+
|
|
14
|
+
import typer
|
|
15
|
+
from rapidfuzz import fuzz
|
|
16
|
+
from slugify import slugify
|
|
17
|
+
|
|
18
|
+
from ..job_mgmt.glue_jobs import (
|
|
19
|
+
download_glue_job_files,
|
|
20
|
+
get_glue_job_config,
|
|
21
|
+
list_glue_jobs,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
from .constants import GLUE_SET_FILE
|
|
25
|
+
from .helpers import (
|
|
26
|
+
_examples_epilog,
|
|
27
|
+
_get_checked_out_jobs,
|
|
28
|
+
_load_glue_set_store,
|
|
29
|
+
_resolve_checkout_job_name,
|
|
30
|
+
_save_checked_out_jobs,
|
|
31
|
+
_save_glue_set_store,
|
|
32
|
+
_set_saved_scope,
|
|
33
|
+
)
|
|
34
|
+
from ..cli import app
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# Matches a whole ``s3://bucket[/key]`` string.  Whitespace and commas are
# excluded from both parts, so a comma-separated list never matches as one URI.
S3_URI_PATTERN = re.compile(r"^s3://(?P<bucket>[^/\s,]+)(?:/(?P<key>[^\s,]*))?$")

# Top-level job-config entries that are walked recursively for S3 URIs.
S3_DETECTION_ROOTS = {"Command", "DefaultArguments", "NonOverridableArguments"}

# Entries under ``SourceControlDetails`` that are walked for S3 URIs.
S3_SOURCE_CONTROL_KEYS = {
    "AdditionalPythonFiles", "ExtraPyFiles",
    "AdditionalFiles", "ExtraFiles",
}
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@dataclass(frozen=True)
class S3UriRecord:
    """A single S3 URI located inside a Glue job config value.

    One config string can contain several comma-separated URIs, so a record
    is addressed by the field it was read from plus its token position.
    """

    # Dotted / indexed path of the config field the URI was read from.
    field_path: str
    # Position of the URI among the comma-separated tokens of that field.
    token_index: int
    # The URI token itself.
    uri: str
    # Bucket part of the URI.
    bucket: str
    # Key part of the URI ("" when the URI has no key).
    key: str
    # The complete value of the field the token was taken from.
    field_value: Any

    @property
    def identity(self) -> tuple[str, int]:
        """Return the ``(field_path, token_index)`` pair addressing this record."""
        return (self.field_path, self.token_index)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
@dataclass(frozen=True)
class S3DetectionResult:
    """Outcome of comparing the S3 URIs of a baseline and a target job config."""

    # Target field path -> full target field value to store for the profile.
    params: dict[str, Any]
    # Human-readable reasons for baseline URIs that could not be mapped.
    skipped: list[str]
    # Baseline URI -> best-scoring ``(target URI, score)`` pairs considered.
    candidates: dict[str, list[tuple[str, int]]]
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _aws_profiles_from_config(config_path: Path) -> list[str]:
|
|
73
|
+
parser = configparser.ConfigParser()
|
|
74
|
+
if not config_path.exists():
|
|
75
|
+
return []
|
|
76
|
+
parser.read(config_path)
|
|
77
|
+
profiles: list[str] = []
|
|
78
|
+
for section in parser.sections():
|
|
79
|
+
if section == "default":
|
|
80
|
+
profiles.append("default")
|
|
81
|
+
elif section.startswith("profile "):
|
|
82
|
+
profiles.append(section.removeprefix("profile ").strip())
|
|
83
|
+
return sorted({profile for profile in profiles if profile})
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _save_profile_param(*, profile: str, job_name: str, key: str, value: Any) -> None:
    """Store one ``key``/``value`` param for ``job_name`` under ``profile``.

    Thin keyword-only wrapper around ``_set_saved_scope``.
    """
    single_param = {key: value}
    # NOTE(review): the positional ``False`` is forwarded unchanged; its meaning
    # is defined by ``_set_saved_scope`` in the helpers module - confirm there.
    _set_saved_scope(single_param, job_name, False, profile=profile)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _parse_s3_uri(value: str) -> tuple[str, str] | None:
    """Split an ``s3://`` URI into ``(bucket, key)``.

    Surrounding whitespace is ignored.  Returns ``None`` when the text is not
    a single S3 URI; a URI without a key yields an empty key string.
    """
    found = S3_URI_PATTERN.match(value.strip())
    if found is None:
        return None
    bucket_name = found.group("bucket")
    object_key = found.group("key")
    return bucket_name, (object_key or "")
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _parse_csv_tokens(value: str) -> list[str]:
|
|
104
|
+
try:
|
|
105
|
+
return [item.strip() for item in next(csv.reader([value]))]
|
|
106
|
+
except csv.Error:
|
|
107
|
+
return [value.strip()]
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _append_s3_records_from_string(
    records: list[S3UriRecord],
    *,
    field_path: str,
    field_value: str,
) -> None:
    """Append an ``S3UriRecord`` for every S3 URI token in ``field_value``.

    The string is treated as a comma-separated list; tokens that are not S3
    URIs are skipped but still occupy an index, so ``token_index`` is the
    position of the URI within the full token list.
    """
    # An empty token list falls back to the stripped value as the only token.
    parts = _parse_csv_tokens(field_value) or [field_value.strip()]
    for position, part in enumerate(parts):
        location = _parse_s3_uri(part)
        if location:
            bucket_name, object_key = location
            records.append(
                S3UriRecord(
                    field_path=field_path,
                    token_index=position,
                    uri=part,
                    bucket=bucket_name,
                    key=object_key,
                    field_value=field_value,
                )
            )
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _collect_s3_uri_records_from_value(
    value: Any,
    *,
    field_path: str,
    records: list[S3UriRecord],
) -> None:
    """Walk ``value`` recursively and append every S3 URI found to ``records``.

    Strings are scanned for URIs; dicts and lists are descended into, with the
    field path extended by ``.key`` or ``[index]``.  Other types are ignored.
    """
    if isinstance(value, str):
        _append_s3_records_from_string(
            records, field_path=field_path, field_value=value
        )
    elif isinstance(value, dict):
        for name, nested in value.items():
            _collect_s3_uri_records_from_value(
                nested,
                field_path=f"{field_path}.{name}",
                records=records,
            )
    elif isinstance(value, list):
        for position, nested in enumerate(value):
            _collect_s3_uri_records_from_value(
                nested,
                field_path=f"{field_path}[{position}]",
                records=records,
            )
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def collect_config_s3_uri_records(config_data: dict[str, Any]) -> list[S3UriRecord]:
    """Collect every S3 URI referenced by a Glue job config.

    Scans the entries named in ``S3_DETECTION_ROOTS`` and, when
    ``SourceControlDetails`` is a dict, the entries named in
    ``S3_SOURCE_CONTROL_KEYS``.

    Args:
        config_data: Job config mapping to scan.

    Returns:
        The records found, in a deterministic order: roots and source-control
        keys are visited alphabetically.
    """
    records: list[S3UriRecord] = []

    # Both name collections are sets of strings, whose iteration order changes
    # between interpreter runs (hash randomization).  Sorting keeps the record
    # order - and everything derived from it downstream - reproducible.
    for root in sorted(S3_DETECTION_ROOTS):
        value = config_data.get(root)
        if value is not None:
            _collect_s3_uri_records_from_value(
                value,
                field_path=root,
                records=records,
            )

    source_control = config_data.get("SourceControlDetails", {})
    if isinstance(source_control, dict):
        for key in sorted(S3_SOURCE_CONTROL_KEYS):
            value = source_control.get(key)
            if value is not None:
                _collect_s3_uri_records_from_value(
                    value,
                    field_path=f"SourceControlDetails.{key}",
                    records=records,
                )
    return records
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def detect_profile_s3_params(
    *,
    baseline_config: dict[str, Any],
    target_config: dict[str, Any],
    match_threshold: int,
) -> S3DetectionResult:
    """Derive per-field S3 overrides for a target config relative to a baseline.

    Each baseline URI is paired with the target URI at the same field path and
    token position, or failing that with the best fuzzy key match in the same
    field.  Identical URIs produce nothing.  If only the bucket differs, the
    target field value is recorded.  If the keys differ, the field's target
    URIs are fuzzy-matched again and the winner's field value is recorded.
    URIs that cannot be paired confidently are reported in ``skipped``.
    """
    baseline_uris = collect_config_s3_uri_records(baseline_config)
    target_uris = collect_config_s3_uri_records(target_config)

    # Index the target records by exact position and by field.
    exact_lookup: dict[tuple[str, int], S3UriRecord] = {}
    per_field: dict[str, list[S3UriRecord]] = {}
    for entry in target_uris:
        exact_lookup[entry.identity] = entry
        per_field.setdefault(entry.field_path, []).append(entry)

    overrides: dict[str, Any] = {}
    unmatched: list[str] = []
    scores: dict[str, list[tuple[str, int]]] = {}

    for source in baseline_uris:
        counterpart = exact_lookup.get(source.identity)
        if counterpart is None:
            # No URI at the same position: try the other URIs of that field.
            same_field = per_field.get(source.field_path, [])
            if same_field:
                counterpart, ranked = _best_s3_record_match(
                    source, same_field, match_threshold
                )
                if ranked:
                    scores[source.uri] = ranked
        if counterpart is None:
            unmatched.append(
                f"{source.field_path}: no target URI matched {source.uri}"
            )
            continue

        if counterpart.uri == source.uri:
            continue
        if counterpart.key == source.key:
            # Same key: only a different bucket is worth recording.
            if counterpart.bucket != source.bucket:
                overrides[counterpart.field_path] = counterpart.field_value
            continue

        # Keys differ: require a confident fuzzy match within the field.
        pool = per_field.get(source.field_path, [counterpart])
        confident, ranked = _best_s3_record_match(source, pool, match_threshold)
        if ranked:
            scores[source.uri] = ranked
        if confident is None:
            unmatched.append(
                f"{source.field_path}: no confident target URI matched {source.uri}"
            )
        else:
            overrides[confident.field_path] = confident.field_value

    return S3DetectionResult(
        params=overrides,
        skipped=unmatched,
        candidates=scores,
    )
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def _best_s3_record_match(
    baseline_record: S3UriRecord,
    choices: list[S3UriRecord],
    match_threshold: int,
) -> tuple[S3UriRecord | None, list[tuple[str, int]]]:
    """Pick the choice whose key is most similar to the baseline record's key.

    Similarity is ``fuzz.ratio`` truncated to an int.  Returns the winning
    record, or ``None`` when its score is below ``match_threshold``, together
    with the ``(uri, score)`` pairs of the two best-scoring choices.
    """
    if not choices:
        return None, []

    ranked = [
        (option, int(fuzz.ratio(baseline_record.key, option.key)))
        for option in choices
    ]
    # Stable sort: equally scored choices keep their original relative order.
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    top_two = [(option.uri, score) for option, score in ranked[:2]]
    winner, winner_score = ranked[0]
    return (winner if winner_score >= match_threshold else None), top_two
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def _fetch_profile_job_configs(
    *,
    profile_jobs: Mapping[str, str],
    auto_login: bool,
) -> tuple[dict[str, dict[str, Any]], dict[str, str]]:
    """Fetch the Glue job config for each ``profile -> job name`` pair.

    Returns ``(configs, errors)``: configs holds every non-empty dict result
    keyed by profile; errors holds a message for each profile whose fetch
    raised or returned something empty or not a dict.
    """
    configs: dict[str, dict[str, Any]] = {}
    errors: dict[str, str] = {}
    for profile, job_name in profile_jobs.items():
        try:
            fetched = get_glue_job_config(
                job_name,
                profile_name=profile,
                auto_login=auto_login,
            )
        except Exception as exc:  # noqa: BLE001 - preserve interactive fallback.
            errors[profile] = str(exc)
        else:
            if isinstance(fetched, dict) and fetched:
                configs[profile] = fetched
            else:
                errors[profile] = "Glue returned an empty job config."
    return configs, errors
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def _print_missing_checkout_job_recommendation(
    *,
    profile: str,
    job_name: str,
) -> None:
    """Print recovery advice to stderr when a checked-out job cannot be fetched.

    Args:
        profile: AWS profile that failed to fetch the job.
        job_name: Name of the checked-out job that could not be fetched.
    """
    # Both values end up inside copy-pasteable shell commands, so quote them in
    # every command (a profile name may contain spaces or shell metacharacters).
    quoted_profile = shlex.quote(profile)
    quoted_job = shlex.quote(job_name)

    lines = (
        f"Recommendation: AWS profile '{profile}' could not fetch the checked-out "
        f"job '{job_name}'. If that checkout is stale, clear it and rerun init:",
        " gluekit checkout --clear",
        "Then run `gluekit init` again; with no job checked out, pressing Enter at "
        "the job prompt will list Glue jobs from the selected profile.",
        "You can also inspect that profile directly with:",
        f" aws glue get-jobs --profile {quoted_profile} --query 'Jobs[].Name' --output table",
        "If the job exists in that profile, reselect it with the profile:",
        f" gluekit checkout {quoted_job} --profile {quoted_profile}",
    )
    for line in lines:
        typer.echo(line, err=True)
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
def _parse_profile_job_options(profile_job: list[str]) -> dict[str, str]:
|
|
329
|
+
profile_jobs: dict[str, str] = {}
|
|
330
|
+
for raw_value in profile_job:
|
|
331
|
+
profile, separator, job_name = raw_value.partition("=")
|
|
332
|
+
if not separator or not profile.strip() or not job_name.strip():
|
|
333
|
+
raise typer.BadParameter(
|
|
334
|
+
"--profile-job must be provided as PROFILE=GLUE_JOB_NAME."
|
|
335
|
+
)
|
|
336
|
+
profile_jobs[profile.strip()] = job_name.strip()
|
|
337
|
+
return profile_jobs
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
def _job_modified_sort_value(job: dict[str, Any]) -> datetime:
|
|
341
|
+
value = job.get("LastModifiedOn") or job.get("CreatedOn")
|
|
342
|
+
if isinstance(value, datetime):
|
|
343
|
+
if value.tzinfo is None:
|
|
344
|
+
return value.replace(tzinfo=timezone.utc)
|
|
345
|
+
return value
|
|
346
|
+
return datetime.min.replace(tzinfo=timezone.utc)
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
def _render_job_modified(job: dict[str, Any]) -> str:
|
|
350
|
+
value = job.get("LastModifiedOn") or job.get("CreatedOn")
|
|
351
|
+
if isinstance(value, datetime):
|
|
352
|
+
return value.isoformat()
|
|
353
|
+
if value:
|
|
354
|
+
return str(value)
|
|
355
|
+
return "unknown modified time"
|
|
356
|
+
|
|
357
|
+
|
|
358
|
+
def _select_job_from_profile_listing(
    *,
    profile: str,
    auto_login: bool,
) -> str | None:
    """Let the user choose a Glue job from those listed for ``profile``.

    Jobs are shown most recently modified first.  If listing fails or returns
    no named jobs, the user is asked to type a job name instead.

    Args:
        profile: AWS profile whose jobs are listed.
        auto_login: Forwarded to ``list_glue_jobs``.

    Returns:
        The chosen job name, or ``None`` when the user just presses Enter.
    """
    try:
        jobs = list_glue_jobs(profile_name=profile, auto_login=auto_login)
    except Exception as exc:  # noqa: BLE001 - keep init interactive.
        typer.echo(f"Could not list Glue jobs for {profile}: {exc}", err=True)
        jobs = []

    named_jobs = [
        job for job in jobs if isinstance(job, dict) and isinstance(job.get("Name"), str)
    ]
    named_jobs.sort(key=_job_modified_sort_value, reverse=True)
    if not named_jobs:
        return typer.prompt(
            f"Glue job name for profile '{profile}'",
            default="",
            show_default=False,
        ).strip() or None

    typer.echo(f"Glue jobs for profile '{profile}':")
    for index, job in enumerate(named_jobs, start=1):
        typer.echo(f"{index}. {job['Name']} ({_render_job_modified(job)})")

    names = {job["Name"] for job in named_jobs}
    while True:
        selection = typer.prompt(
            f"Select Glue job for profile '{profile}' by number or name",
            default="",
            show_default=False,
        ).strip()
        if not selection:
            return None
        # isdecimal(), not isdigit(): isdigit() also accepts characters such as
        # superscript digits that int() rejects with ValueError.
        if selection.isdecimal():
            index = int(selection)
            if 1 <= index <= len(named_jobs):
                return named_jobs[index - 1]["Name"]
        # An out-of-range number may still be the literal name of a job.
        if selection in names:
            return selection
        typer.echo("Enter a listed number, an exact job name, or press Enter to skip.")
|
|
400
|
+
|
|
401
|
+
|
|
402
|
+
def _prompt_source_job_for_profile(
    *,
    profile: str,
    default_job: str,
    profile_job_overrides: Mapping[str, str],
    auto_login: bool,
) -> str | None:
    """Decide which Glue job to inspect for ``profile``.

    Order of precedence: an override for the profile (no prompt), a prompt
    pre-filled with ``default_job``, a free-text prompt, and finally a pick
    from the profile's job listing.  Returns ``None`` when nothing is chosen.
    """
    forced_job = profile_job_overrides.get(profile)
    if forced_job:
        typer.echo(f"Using Glue job '{forced_job}' for profile '{profile}'.")
        return forced_job

    question = f"Glue job to inspect for profile '{profile}'"
    if default_job:
        answer = typer.prompt(question, default=default_job)
        return answer.strip() or None

    typed_name = typer.prompt(question, default="", show_default=False).strip()
    # An empty answer falls through to the interactive job listing.
    return typed_name or _select_job_from_profile_listing(
        profile=profile,
        auto_login=auto_login,
    )
|
|
426
|
+
|
|
427
|
+
|
|
428
|
+
def _add_detected_param(
|
|
429
|
+
detected_params: dict[str, dict[str, Any]],
|
|
430
|
+
*,
|
|
431
|
+
profile: str,
|
|
432
|
+
key: str,
|
|
433
|
+
value: Any,
|
|
434
|
+
) -> None:
|
|
435
|
+
detected_params.setdefault(profile, {})[key] = value
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
def _prompt_manual_param_for_skip(
    *,
    profile: str,
    reason: str,
    detected_params: dict[str, dict[str, Any]],
) -> None:
    """Report a skipped S3 detection and offer to enter a param by hand.

    If the user accepts, a key and a value are prompted for; the pair is
    recorded for ``profile`` only when the stripped key is non-empty.
    """
    typer.echo(f"Skipped S3 detection for {profile}: {reason}")
    wants_manual = typer.confirm(
        f"Enter a manual profile param for {profile}?", default=False
    )
    if wants_manual:
        raw_key = typer.prompt("Param key")
        raw_value = typer.prompt("Param value")
        clean_key = raw_key.strip()
        if clean_key:
            _add_detected_param(
                detected_params,
                profile=profile,
                key=clean_key,
                value=raw_value,
            )
|
|
456
|
+
|
|
457
|
+
|
|
458
|
+
def _print_detection_summary(
    *,
    local_job_name: str,
    profile_source_jobs: Mapping[str, str],
    detected_params: dict[str, dict[str, Any]],
    skipped: dict[str, list[str]],
    candidates: dict[str, dict[str, list[tuple[str, int]]]],
) -> None:
    """Print the detection outcome, with profiles listed alphabetically.

    Sections: source job per profile, detected params per profile, and - only
    when non-empty - the S3 match candidates and the skipped S3 detections.
    """
    out = typer.echo

    out("Profile source jobs:")
    for name in sorted(profile_source_jobs):
        out(f"- {name}: {profile_source_jobs[name]} -> {local_job_name}")

    out("Detected profile params:")
    if not detected_params:
        out("- (none)")
    for name in sorted(detected_params):
        out(f"- {name}:")
        for param_key, param_value in sorted(detected_params[name].items()):
            out(f" {param_key}: {param_value}")

    if candidates:
        out("S3 match candidates:")
        for name in sorted(candidates):
            for source_uri, matches in sorted(candidates[name].items()):
                rendered = ", ".join(f"{uri} ({score})" for uri, score in matches)
                out(f"- {name}: {source_uri} -> {rendered}")

    if skipped:
        out("Skipped S3 detections:")
        for name in sorted(skipped):
            for reason in skipped[name]:
                out(f"- {name}: {reason}")
|
|
490
|
+
|
|
491
|
+
|
|
492
|
+
def _glue_init(
    *,
    aws_config: Path,
    config_dir: Path,
    auto_login: bool,
    detect_roles: bool,
    detect_buckets: bool,
    match_threshold: int,
    baseline_profile: Optional[str],
    profile_job: list[str],
) -> None:
    """Run the interactive ``gluekit init`` workflow.

    Steps, in order: pick AWS profiles and a source Glue job for each, create
    the per-profile sections in the settings store, check out the local job
    with a baseline profile, optionally detect per-profile roles and S3
    mappings and save them, optionally pull the baseline job, and finally
    record the automation defaults.

    Raises:
        typer.BadParameter: for an out-of-range ``match_threshold``, malformed
            or unused ``--profile-job`` values, a baseline profile that was
            not selected, or a pull whose source job does not exist or does
            not match the checked-out job.
    """
    if not 0 <= match_threshold <= 100:
        raise typer.BadParameter("--match-threshold must be between 0 and 100.")

    job_overrides = _parse_profile_job_options(profile_job)
    available_profiles = _aws_profiles_from_config(aws_config)
    if not available_profiles:
        typer.echo(f"No AWS profiles found in {aws_config}.")
        return

    # 1. Choose the profiles to use and the Glue job to inspect in each.
    selected_profiles: list[str] = []
    profile_source_jobs: dict[str, str] = {}
    checked_out = _get_checked_out_jobs()
    # Only a single checked-out job is unambiguous enough to act as default.
    default_job = checked_out[0] if len(checked_out) == 1 else ""
    for candidate in available_profiles:
        wanted = typer.confirm(
            f"Use AWS profile '{candidate}' in this repository?", default=False
        )
        if not wanted:
            continue
        chosen_job = _prompt_source_job_for_profile(
            profile=candidate,
            default_job=default_job,
            profile_job_overrides=job_overrides,
            auto_login=auto_login,
        )
        if not chosen_job:
            typer.echo(
                f"Skipping profile '{candidate}' because no Glue job was selected."
            )
            continue
        selected_profiles.append(candidate)
        profile_source_jobs[candidate] = chosen_job

    if not selected_profiles:
        typer.echo("No profiles selected.")
        return

    unused_overrides = sorted(set(job_overrides) - set(selected_profiles))
    if unused_overrides:
        raise typer.BadParameter(
            "--profile-job was provided for profiles that were not selected: "
            + ", ".join(unused_overrides)
        )

    # 2. Make sure every selected profile has its sections in the store.
    store = _load_glue_set_store()
    profiles_section = store.setdefault("profiles", {})
    for name in selected_profiles:
        entry = profiles_section.setdefault(name, {})
        if isinstance(entry, dict):
            entry.setdefault("global", {})
            entry.setdefault("jobs", {})
        else:
            # A malformed entry is replaced by a fresh, empty one.
            profiles_section[name] = {"global": {}, "jobs": {}}
    _save_glue_set_store(store)

    # 3. Check out the local job with the baseline profile.
    local_job_name = default_job or profile_source_jobs[selected_profiles[0]]
    resolved_job_name, source = _resolve_checkout_job_name(
        local_job_name, config_dir=config_dir
    )
    first_profile = baseline_profile or typer.prompt(
        "Profile to checkout first / use as S3 baseline",
        default=selected_profiles[0],
    )
    if first_profile not in selected_profiles:
        raise typer.BadParameter(
            f"Profile '{first_profile}' was not selected for this repository."
        )
    _save_checked_out_jobs(
        [resolved_job_name],
        resolved_job_name,
        source=source,
        profile=first_profile,
    )
    typer.echo(f"Checked out {resolved_job_name} with profile {first_profile}.")

    # 4. Optional detection of per-profile roles and S3 mappings.
    role_detection_enabled = detect_roles and typer.confirm(
        "Auto-detect profile-specific Glue IAM roles?", default=True
    )
    bucket_detection_enabled = detect_buckets and typer.confirm(
        "Auto-detect profile-specific S3 bucket mappings?", default=True
    )
    any_detection = role_detection_enabled or bucket_detection_enabled

    detected_params: dict[str, dict[str, Any]] = {}
    skipped_s3: dict[str, list[str]] = {}
    s3_candidates: dict[str, dict[str, list[tuple[str, int]]]] = {}

    profile_configs: dict[str, dict[str, Any]] = {}
    profile_errors: dict[str, str] = {}
    if any_detection:
        profile_configs, profile_errors = _fetch_profile_job_configs(
            profile_jobs=profile_source_jobs,
            auto_login=auto_login,
        )
    for failed_profile, message in sorted(profile_errors.items()):
        failed_job = profile_source_jobs[failed_profile]
        typer.echo(
            f"Could not fetch Glue job config for {failed_profile} "
            f"({failed_job}): {message}",
            err=True,
        )
        if default_job and failed_job == default_job:
            _print_missing_checkout_job_recommendation(
                profile=failed_profile,
                job_name=default_job,
            )

    if role_detection_enabled:
        for name in selected_profiles:
            role = profile_configs.get(name, {}).get("Role")
            if not (isinstance(role, str) and role.strip()):
                # Nothing usable was fetched: ask the user instead.
                role = typer.prompt(f"Role for {name}", default="")
            if role.strip():
                _add_detected_param(
                    detected_params,
                    profile=name,
                    key="Role",
                    value=role.strip(),
                )

    if bucket_detection_enabled:
        baseline_config = profile_configs.get(first_profile)
        if not baseline_config:
            typer.echo(
                f"Cannot detect S3 bucket mappings without baseline config for {first_profile}.",
                err=True,
            )
        else:
            for name in selected_profiles:
                if name == first_profile:
                    continue
                target_config = profile_configs.get(name)
                if not target_config:
                    continue
                result = detect_profile_s3_params(
                    baseline_config=baseline_config,
                    target_config=target_config,
                    match_threshold=match_threshold,
                )
                for param_key, param_value in result.params.items():
                    _add_detected_param(
                        detected_params,
                        profile=name,
                        key=param_key,
                        value=param_value,
                    )
                if result.skipped:
                    skipped_s3[name] = list(result.skipped)
                if result.candidates:
                    s3_candidates[name] = dict(result.candidates)

        # Offer a manual entry for every URI that could not be mapped.
        for name, reasons in skipped_s3.items():
            for reason in reasons:
                _prompt_manual_param_for_skip(
                    profile=name,
                    reason=reason,
                    detected_params=detected_params,
                )

    if any_detection:
        _print_detection_summary(
            local_job_name=resolved_job_name,
            profile_source_jobs=profile_source_jobs,
            detected_params=detected_params,
            skipped=skipped_s3,
            candidates=s3_candidates,
        )
        for name, params in detected_params.items():
            for param_key, param_value in params.items():
                _save_profile_param(
                    profile=name,
                    job_name=resolved_job_name,
                    key=param_key,
                    value=param_value,
                )

    # 5. Optional pull of the baseline job.
    if typer.confirm(
        "Pull the baseline Glue job now using the checkout profile?", default=False
    ):
        source_job_name = profile_source_jobs[first_profile]
        if source_job_name != resolved_job_name:
            raise typer.BadParameter(
                "Pulling during init requires the baseline source job to match the "
                f"local checked-out job ({resolved_job_name}); selected "
                f"{source_job_name} for {first_profile}."
            )
        config_dir.mkdir(parents=True, exist_ok=True)
        jobs = list_glue_jobs(profile_name=first_profile, auto_login=auto_login)
        matches = [job for job in jobs if job.get("Name") == source_job_name]
        if not matches:
            raise typer.BadParameter(
                f'No Glue job named "{source_job_name}" found for profile {first_profile}.'
            )
        job_slug = slugify(resolved_job_name)
        download_glue_job_files(
            name=source_job_name,
            local_path=f"glue/scripts/{job_slug}.py",
            config_path=config_dir / f"{job_slug}.json",
            include_components={"config", "script", "notebook"},
            existing_sc={},
            profile_name=first_profile,
            auto_login=auto_login,
        )

    # 6. Record automation defaults.  The store is loaded again here rather
    # than reusing the copy from step 2, after the param saves above.
    store = _load_glue_set_store()
    automation = store.setdefault("automation", {})
    if isinstance(automation, dict):
        automation["profiles"] = selected_profiles
        automation["source_jobs"] = profile_source_jobs
        automation["default_job"] = resolved_job_name
        refresh_wheel = typer.confirm(
            "Default build flow should refresh dist/*.whl in Glue config?",
            default=True,
        )
        push_after_build = typer.confirm(
            "Default build flow should push after build?",
            default=False,
        )
        automation["build"] = {"package_whl": refresh_wheel, "push": push_after_build}
        automation["local_dev_file_type"] = typer.prompt(
            "Local development file type",
            default="notebook",
        )
    _save_glue_set_store(store)
    typer.echo(f"Saved setup in {GLUE_SET_FILE}.")
|
|
735
|
+
|
|
736
|
+
|
|
737
|
+
@app.command(
    "init",
    epilog=_examples_epilog(
        "gluekit init",
    ),
)
def glue_init(
    aws_config: Optional[Path] = typer.Option(
        None,
        "--aws-config",
        help="Local AWS CLI config file to scan for credential profile names; defaults to AWS_CONFIG_FILE or ~/.aws/config.",
    ),
    config_dir: Path = typer.Option(
        Path("glue/configs"),
        "--config-dir",
        help="Directory containing Glue job config files.",
    ),
    auto_login: bool = typer.Option(
        True,
        "--auto-login/--no-auto-login",
        help="For selected real AWS profiles, automatically run 'aws sso login' when credentials are missing or expired.",
    ),
    detect_roles: bool = typer.Option(
        True,
        "--detect-roles/--no-detect-roles",
        help="Fetch the selected real AWS Glue job in each profile and save its Role as local gluekit params.",
    ),
    detect_buckets: bool = typer.Option(
        True,
        "--detect-buckets/--no-detect-buckets",
        help="Compare S3 URIs in real AWS Glue job configs across profiles and save local gluekit profile mappings.",
    ),
    match_threshold: int = typer.Option(
        85,
        "--match-threshold",
        help="Minimum rapidfuzz score for non-exact S3 key matches.",
    ),
    baseline_profile: Optional[str] = typer.Option(
        None,
        "--baseline-profile",
        help="Selected AWS CLI profile to store as the first gluekit checkout scope and S3 comparison baseline.",
    ),
    profile_job: Optional[list[str]] = typer.Option(
        None,
        "--profile-job",
        help="Real AWS Glue job to inspect for an AWS CLI profile, as PROFILE=GLUE_JOB_NAME. May be repeated.",
    ),
) -> None:
    """Initialize local repo-level Gluekit settings from AWS CLI profiles."""
    # NOTE: the docstring above doubles as the command's help text; keep
    # implementation notes in comments.
    #
    # Config file resolution: --aws-config, then a non-empty AWS_CONFIG_FILE,
    # then ~/.aws/config.  An empty AWS_CONFIG_FILE counts as unset, because
    # Path("") would otherwise point at the current directory.
    if aws_config is None:
        env_config = os.environ.get("AWS_CONFIG_FILE")
        aws_config = Path(env_config) if env_config else Path.home() / ".aws" / "config"
    # Expand a leading "~" so values that did not go through shell expansion
    # (for example one set in the environment) still resolve to the home dir.
    resolved_aws_config = aws_config.expanduser()

    _glue_init(
        aws_config=resolved_aws_config,
        config_dir=config_dir,
        auto_login=auto_login,
        detect_roles=detect_roles,
        detect_buckets=detect_buckets,
        match_threshold=match_threshold,
        baseline_profile=baseline_profile,
        # Typer passes None when the option is never given.
        profile_job=profile_job or [],
    )
|