pyfabric-dev 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyfabric_dev/__init__.py +3 -0
- pyfabric_dev/cli/__init__.py +0 -0
- pyfabric_dev/cli/generate_notebooks.py +1677 -0
- pyfabric_dev/cli/run_notebook.py +44 -0
- pyfabric_dev/cli/run_pipeline.py +72 -0
- pyfabric_dev/cli/run_tests_parallel.py +346 -0
- pyfabric_dev/defs.py +23 -0
- pyfabric_dev/fabric.py +41 -0
- pyfabric_dev/functions.py +757 -0
- pyfabric_dev/local_config.py +76 -0
- pyfabric_dev/local_env.py +43 -0
- pyfabric_dev/mock_notebookutils.py +119 -0
- pyfabric_dev/runners/__init__.py +6 -0
- pyfabric_dev/runners/hooks.py +40 -0
- pyfabric_dev/runners/notebook.py +402 -0
- pyfabric_dev/runners/pipeline.py +469 -0
- pyfabric_dev/spark.py +249 -0
- pyfabric_dev-0.3.0.dist-info/METADATA +118 -0
- pyfabric_dev-0.3.0.dist-info/RECORD +24 -0
- pyfabric_dev-0.3.0.dist-info/WHEEL +5 -0
- pyfabric_dev-0.3.0.dist-info/entry_points.txt +5 -0
- pyfabric_dev-0.3.0.dist-info/licenses/LICENSE +202 -0
- pyfabric_dev-0.3.0.dist-info/licenses/NOTICE +16 -0
- pyfabric_dev-0.3.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1677 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Generate Microsoft Fabric notebooks from Python modules.
|
|
4
|
+
|
|
5
|
+
This script generates Fabric-compatible .Notebook directories from the
|
|
6
|
+
Python modules in src/. Generated notebooks import and execute the
|
|
7
|
+
corresponding module functions.
|
|
8
|
+
|
|
9
|
+
Usage:
|
|
10
|
+
python dev/generate_notebooks.py
|
|
11
|
+
|
|
12
|
+
# Generate only specific notebooks
|
|
13
|
+
python dev/generate_notebooks.py --only bronze
|
|
14
|
+
|
|
15
|
+
# Regenerate all notebooks (including common)
|
|
16
|
+
python dev/generate_notebooks.py --all
|
|
17
|
+
|
|
18
|
+
# Dry run (show what would be generated)
|
|
19
|
+
python dev/generate_notebooks.py --dry-run
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
import argparse
|
|
23
|
+
import ast
|
|
24
|
+
import json
|
|
25
|
+
import os
|
|
26
|
+
import sys
|
|
27
|
+
from dataclasses import dataclass
|
|
28
|
+
from pathlib import Path
|
|
29
|
+
from typing import Literal
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# The project root is fixed when the CLI is invoked (--project-root, with
# Path.cwd() as the fallback). Every source/output path is derived from this
# global, so the generator can serve any consuming project instead of only
# the directory where the package happens to be installed.
PROJECT_ROOT: Path = Path.cwd()

# Filled in lazily: the lakehouse config is loaded once main() has set PROJECT_ROOT.
LAKEHOUSE_CONFIGS: dict = {}
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def load_lakehouse_config(project_root: Path | None = None) -> dict:
|
|
43
|
+
"""Load lakehouse configuration from <project_root>/config/lakehouse_config.json."""
|
|
44
|
+
root = project_root if project_root is not None else PROJECT_ROOT
|
|
45
|
+
config_path = root / "config" / "lakehouse_config.json"
|
|
46
|
+
|
|
47
|
+
if not config_path.exists():
|
|
48
|
+
print(f"Warning: Lakehouse config not found at {config_path}")
|
|
49
|
+
print("Using empty config. Create config/lakehouse_config.json from template.")
|
|
50
|
+
return {"bronze": {}, "silver": {}, "gold": {}, "tests": {}, "common": {}}
|
|
51
|
+
|
|
52
|
+
with open(config_path) as f:
|
|
53
|
+
return json.load(f)
|
|
54
|
+
|
|
55
|
+
# Notebook name prefix for each layer (the backup layer has no prefix).
LAYER_PREFIXES = dict(
    bronze="10_bronze_",
    silver="20_silver_",
    gold="30_gold_",
    backup="",
)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
@dataclass
|
|
65
|
+
class NotebookConfig:
|
|
66
|
+
"""Configuration for generating a notebook."""
|
|
67
|
+
module_path: Path
|
|
68
|
+
notebook_name: str
|
|
69
|
+
layer: Literal["bronze", "silver", "gold", "backup", "common", "tests"]
|
|
70
|
+
has_run_function: bool = True
|
|
71
|
+
parameters: dict = None
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def generate_metadata_block(lakehouse_config: dict = None) -> str:
|
|
75
|
+
"""Generate the notebook METADATA block."""
|
|
76
|
+
meta = {
|
|
77
|
+
"kernel_info": {
|
|
78
|
+
"name": "synapse_pyspark"
|
|
79
|
+
},
|
|
80
|
+
"dependencies": {}
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
if lakehouse_config:
|
|
84
|
+
meta["dependencies"]["lakehouse"] = lakehouse_config
|
|
85
|
+
|
|
86
|
+
# Format dependencies as indented JSON within META comments
|
|
87
|
+
deps_json = json.dumps(
|
|
88
|
+
{"lakehouse": lakehouse_config} if lakehouse_config else {},
|
|
89
|
+
indent=2
|
|
90
|
+
).replace("\n", "\n# META ")
|
|
91
|
+
|
|
92
|
+
return f"""# METADATA ********************
|
|
93
|
+
|
|
94
|
+
# META {{
|
|
95
|
+
# META "kernel_info": {{
|
|
96
|
+
# META "name": "synapse_pyspark"
|
|
97
|
+
# META }},
|
|
98
|
+
# META "dependencies": {deps_json}
|
|
99
|
+
# META }}"""
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def generate_cell_metadata() -> str:
    """Return the METADATA trailer that follows a Python code cell.

    The result has no trailing newline.
    """
    trailer_lines = (
        "# METADATA ********************",
        "",
        "# META {",
        '# META "language": "python",',
        '# META "language_group": "synapse_pyspark"',
        "# META }",
    )
    return "\n".join(trailer_lines)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def generate_path_setup_cell() -> str:
    """Generate the cell that sets up the Python path for src imports.

    Returns:
        A complete code cell (CELL marker, code, METADATA trailer) ending
        with a newline. The code inserts the project root at the front of
        ``sys.path`` when it is not already present; the project root is the
        parent of the working directory if that directory's name ends in
        ``.Notebook``, otherwise the working directory itself.
    """
    cell_lines = [
        "# CELL ********************",
        "",
        "# Setup Python path for src imports",
        "# This ensures src modules can be imported in both Fabric and local environments",
        "import sys",
        "from pathlib import Path",
        "",
        "# In Fabric, notebooks run from the workspace root where src/ is located",
        "# Locally, we need to ensure the project root is in the path",
        "_notebook_dir = Path.cwd()",
        "_project_root = _notebook_dir.parent if _notebook_dir.name.endswith('.Notebook') else _notebook_dir",
        "",
        "if str(_project_root) not in sys.path:",
        # The body of the generated `if` must be indented, otherwise the
        # emitted cell is not valid Python.
        "    sys.path.insert(0, str(_project_root))",
        "",
        "# METADATA ********************",
        "",
        "# META {",
        '# META "language": "python",',
        '# META "language_group": "synapse_pyspark"',
        "# META }",
        "",  # produces the trailing newline
    ]
    return "\n".join(cell_lines)
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def extract_imports_from_module(module_path: Path, skip_imports: list[str] = None) -> list[str]:
|
|
139
|
+
"""
|
|
140
|
+
Extract import statements from a Python module.
|
|
141
|
+
|
|
142
|
+
Args:
|
|
143
|
+
module_path: Path to the Python module
|
|
144
|
+
skip_imports: List of import patterns to skip (e.g., ['from src.common'])
|
|
145
|
+
|
|
146
|
+
Returns:
|
|
147
|
+
List of import statement strings
|
|
148
|
+
"""
|
|
149
|
+
skip_imports = skip_imports or []
|
|
150
|
+
|
|
151
|
+
with open(module_path) as f:
|
|
152
|
+
lines = f.readlines()
|
|
153
|
+
|
|
154
|
+
import_lines = []
|
|
155
|
+
in_multiline_import = False
|
|
156
|
+
in_docstring = False
|
|
157
|
+
docstring_char = None
|
|
158
|
+
current_import = []
|
|
159
|
+
|
|
160
|
+
for line in lines:
|
|
161
|
+
stripped = line.strip()
|
|
162
|
+
|
|
163
|
+
# Handle docstrings
|
|
164
|
+
if not in_docstring:
|
|
165
|
+
if stripped.startswith('"""') or stripped.startswith("'''"):
|
|
166
|
+
docstring_char = stripped[:3]
|
|
167
|
+
# Check if it's a single-line docstring
|
|
168
|
+
if stripped.count(docstring_char) >= 2:
|
|
169
|
+
continue # Single line docstring, skip it
|
|
170
|
+
in_docstring = True
|
|
171
|
+
continue
|
|
172
|
+
else:
|
|
173
|
+
# We're inside a docstring, check if this line ends it
|
|
174
|
+
if docstring_char in stripped:
|
|
175
|
+
in_docstring = False
|
|
176
|
+
continue
|
|
177
|
+
|
|
178
|
+
# Skip comments
|
|
179
|
+
if stripped.startswith("#"):
|
|
180
|
+
continue
|
|
181
|
+
|
|
182
|
+
# Check if this is an import line
|
|
183
|
+
is_import = stripped.startswith("import ") or stripped.startswith("from ")
|
|
184
|
+
|
|
185
|
+
if in_multiline_import:
|
|
186
|
+
current_import.append(line.rstrip())
|
|
187
|
+
# Check if this line ends the multiline import
|
|
188
|
+
if stripped.endswith(")") or (not stripped.endswith("\\") and not stripped.endswith(",")):
|
|
189
|
+
in_multiline_import = False
|
|
190
|
+
# Check if we should skip this import
|
|
191
|
+
full_import = "\n".join(current_import)
|
|
192
|
+
should_skip = any(pattern in full_import for pattern in skip_imports)
|
|
193
|
+
if not should_skip:
|
|
194
|
+
import_lines.append(full_import)
|
|
195
|
+
current_import = []
|
|
196
|
+
elif is_import:
|
|
197
|
+
# Check if this is a multiline import
|
|
198
|
+
if ("(" in stripped and ")" not in stripped) or stripped.endswith("\\"):
|
|
199
|
+
in_multiline_import = True
|
|
200
|
+
current_import = [line.rstrip()]
|
|
201
|
+
else:
|
|
202
|
+
# Single line import - check if we should skip it
|
|
203
|
+
should_skip = any(pattern in line for pattern in skip_imports)
|
|
204
|
+
if not should_skip:
|
|
205
|
+
import_lines.append(line.rstrip())
|
|
206
|
+
elif stripped and not is_import:
|
|
207
|
+
# Stop at first non-import, non-comment line (start of actual code)
|
|
208
|
+
break
|
|
209
|
+
|
|
210
|
+
return import_lines
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def read_module_code(module_path: Path, skip_imports: list[str] = None, skip_top_level_only: bool = False) -> str:
|
|
214
|
+
"""
|
|
215
|
+
Read Python module and extract code suitable for notebook inlining.
|
|
216
|
+
|
|
217
|
+
Args:
|
|
218
|
+
module_path: Path to the Python module
|
|
219
|
+
skip_imports: List of import patterns to skip (e.g., ['from src.common'])
|
|
220
|
+
skip_top_level_only: If True, only skip imports at indentation level 0 (preserve imports inside functions)
|
|
221
|
+
|
|
222
|
+
Returns:
|
|
223
|
+
Code string with imports filtered
|
|
224
|
+
"""
|
|
225
|
+
with open(module_path) as f:
|
|
226
|
+
lines = f.readlines()
|
|
227
|
+
|
|
228
|
+
skip_imports = skip_imports or []
|
|
229
|
+
result_lines = []
|
|
230
|
+
in_docstring = False
|
|
231
|
+
docstring_char = None
|
|
232
|
+
in_multiline_import = False
|
|
233
|
+
|
|
234
|
+
for line in lines:
|
|
235
|
+
stripped = line.strip()
|
|
236
|
+
|
|
237
|
+
# Track multiline imports (lines ending with \ or inside parentheses)
|
|
238
|
+
if in_multiline_import:
|
|
239
|
+
# Check if this line ends the multiline import
|
|
240
|
+
if stripped.endswith(")") or (not stripped.endswith("\\") and not stripped.endswith(",")):
|
|
241
|
+
in_multiline_import = False
|
|
242
|
+
continue # Skip all lines of the multiline import
|
|
243
|
+
|
|
244
|
+
# Track docstrings
|
|
245
|
+
if not in_docstring:
|
|
246
|
+
if stripped.startswith('"""') or stripped.startswith("'''"):
|
|
247
|
+
docstring_char = stripped[:3]
|
|
248
|
+
if stripped.count(docstring_char) >= 2:
|
|
249
|
+
# Single line docstring
|
|
250
|
+
result_lines.append(line)
|
|
251
|
+
continue
|
|
252
|
+
in_docstring = True
|
|
253
|
+
else:
|
|
254
|
+
if docstring_char in stripped:
|
|
255
|
+
in_docstring = False
|
|
256
|
+
result_lines.append(line)
|
|
257
|
+
continue
|
|
258
|
+
|
|
259
|
+
# Skip specified imports
|
|
260
|
+
should_skip = False
|
|
261
|
+
for skip_pattern in skip_imports:
|
|
262
|
+
if stripped.startswith(skip_pattern):
|
|
263
|
+
# If skip_top_level_only is True, only skip if this is a top-level import (no indentation)
|
|
264
|
+
if skip_top_level_only:
|
|
265
|
+
# Check if line starts with the pattern (no leading whitespace)
|
|
266
|
+
if line.startswith(skip_pattern):
|
|
267
|
+
should_skip = True
|
|
268
|
+
else:
|
|
269
|
+
should_skip = True
|
|
270
|
+
|
|
271
|
+
if should_skip:
|
|
272
|
+
# Check if this is a multiline import
|
|
273
|
+
if "(" in stripped and ")" not in stripped:
|
|
274
|
+
in_multiline_import = True
|
|
275
|
+
elif stripped.endswith("\\"):
|
|
276
|
+
in_multiline_import = True
|
|
277
|
+
break
|
|
278
|
+
|
|
279
|
+
if not should_skip:
|
|
280
|
+
result_lines.append(line)
|
|
281
|
+
|
|
282
|
+
return "".join(result_lines)
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def generate_common_defs_notebook() -> str:
    """Build the source text of the common_defs notebook from inlined modules.

    The single code cell contains src/common/framework_defs.py (when that
    file exists) followed by src/common/defs.py with its ``src.common``
    imports removed. When defs.py is missing, a two-line warning comment is
    emitted in its place.

    Returns:
        The notebook source as one newline-joined string.
    """
    common_dir = PROJECT_ROOT / "src" / "common"
    framework_defs_file = common_dir / "framework_defs.py"
    defs_file = common_dir / "defs.py"

    parts = [
        "# Fabric notebook source",
        "",
        generate_metadata_block(),
        "",
        # Cell 1: inlined framework_defs.py + defs.py
        "# CELL ********************",
        "",
    ]

    if framework_defs_file.exists():
        parts += [read_module_code(framework_defs_file).rstrip(), ""]

    if defs_file.exists():
        inlined = read_module_code(
            defs_file,
            skip_imports=["from src.common", "import src.common"],
        )
        parts.append(inlined.rstrip())
    else:
        parts += [
            "# WARNING: src/common/defs.py not found",
            "# Please create the module first",
        ]

    parts += ["", generate_cell_metadata(), ""]
    return "\n".join(parts)
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
def generate_common_functions_notebook() -> str:
    """Generate the common_functions notebook content with inlined code.

    The notebook consists of five cells:

    1. ``%run common_defs``.
    2. The imports found at the top of src/common/functions.py and
       src/common/cashhero_org.py (de-duplicated, ``src.common`` imports
       excluded), followed by the Fabric-only ``notebookutils`` import.
    3. src/common/fabric.py with every import line removed.
    4. src/common/env.py with every import line removed.
    5. src/common/functions.py and, when present, src/common/cashhero_org.py,
       both with every import line removed.

    A warning comment is emitted in place of functions.py, fabric.py or
    env.py when the file is missing. The notebook metadata uses the "bronze"
    lakehouse config.

    Returns:
        The notebook source as one newline-joined string.
    """
    common_dir = PROJECT_ROOT / "src" / "common"
    functions_path = common_dir / "functions.py"
    cashhero_org_path = common_dir / "cashhero_org.py"
    fabric_path = common_dir / "fabric.py"
    env_path = common_dir / "env.py"

    # src.common symbols are provided by `%run common_defs`, so those imports
    # are never re-emitted.
    src_common_imports = ["from src.common", "import src.common"]
    # Patterns matching every import line; used when inlining module bodies.
    all_imports = ["import ", "from "]

    cell_marker = "# CELL ********************"
    lakehouse_config = LAKEHOUSE_CONFIGS.get("bronze")  # Default to bronze

    content = ["# Fabric notebook source", "", generate_metadata_block(lakehouse_config), ""]

    # Cell 1: Run common_defs
    content += [cell_marker, "", "%run common_defs", "", generate_cell_metadata(), ""]

    # Cell 2: Imports from source files
    content += [cell_marker, ""]
    if functions_path.exists():
        imports = extract_imports_from_module(functions_path, skip_imports=src_common_imports)
        if cashhero_org_path.exists():
            # Deduplicate so imports common to both files appear once.
            for imp in extract_imports_from_module(cashhero_org_path, skip_imports=src_common_imports):
                if imp not in imports:
                    imports.append(imp)

        content.extend(imports)
        if imports:
            content.append("")

        # Fabric-specific import that isn't in the source file
        # (notebookutils is only available in Fabric, not in local dev)
        content.append("from notebookutils import mssparkutils")
    else:
        content.append("# WARNING: src/common/functions.py not found")
    content += ["", generate_cell_metadata(), ""]

    # Cell 3: Fabric-specific utilities (extracted from src/common/fabric.py)
    content += [cell_marker, ""]
    if fabric_path.exists():
        content.append(read_module_code(fabric_path, skip_imports=all_imports).rstrip())
    else:
        content += [
            "# WARNING: src/common/fabric.py not found",
            "# Fabric-specific utilities should be defined here",
        ]
    content += ["", generate_cell_metadata(), ""]

    # Cell 4: Environment config loader (extracted from src/common/env.py)
    content += [cell_marker, ""]
    if env_path.exists():
        content.append(read_module_code(env_path, skip_imports=all_imports).rstrip())
    else:
        content += [
            "# WARNING: src/common/env.py not found",
            "# Environment config loader should be defined here",
        ]
    content += ["", generate_cell_metadata(), ""]

    # Cell 5: Inlined functions from functions.py + cashhero_org.py. Both go
    # into the same cell so callers see all symbols in a single namespace
    # after %run. Imports are skipped since they're already in Cell 2.
    content += [cell_marker, ""]
    if functions_path.exists():
        content.append(read_module_code(functions_path, skip_imports=all_imports).rstrip())
    else:
        content.append("# WARNING: src/common/functions.py not found")
    if cashhero_org_path.exists():
        content += ["", read_module_code(cashhero_org_path, skip_imports=all_imports).rstrip()]
    content += ["", generate_cell_metadata(), ""]

    return "\n".join(content)
|
|
456
|
+
|
|
457
|
+
|
|
458
|
+
def generate_quickbooks_auth_notebook() -> str:
    """Build the 10_bronze_quickbooks_auth notebook from src/common/quickbooks_auth.py.

    Cells, in order: ``%run common_defs``; the module's leading imports
    (``src.common`` imports excluded; empty when the module is missing); the
    module's code with top-level imports stripped, or a warning comment when
    the module is missing.

    Returns:
        The notebook source as one newline-joined string.
    """
    source_file = PROJECT_ROOT / "src" / "common" / "quickbooks_auth.py"
    marker = "# CELL ********************"

    def close_cell() -> list:
        # Blank line, METADATA trailer, blank line: the tail of every cell.
        return ["", generate_cell_metadata(), ""]

    parts = ["# Fabric notebook source", "", generate_metadata_block(), ""]

    # Cell 1: Run common_defs (needed for QUICKBOOKS_TOKEN_URL)
    parts += [marker, "", "%run common_defs"]
    parts += close_cell()

    # Cell 2: Imports
    parts += [marker, ""]
    if source_file.exists():
        parts += extract_imports_from_module(
            source_file,
            skip_imports=["from src.common", "import src.common"],
        )
    parts += close_cell()

    # Cell 3: Inlined code (functions only, top-level imports stripped)
    parts += [marker, ""]
    if source_file.exists():
        inlined = read_module_code(
            source_file,
            skip_imports=["import ", "from "],
            skip_top_level_only=True,
        )
        parts.append(inlined.rstrip())
    else:
        parts.append("# WARNING: src/common/quickbooks_auth.py not found")
    parts += close_cell()

    return "\n".join(parts)
|
|
505
|
+
|
|
506
|
+
|
|
507
|
+
def generate_quickbooks_client_notebook() -> str:
    """Build the 10_bronze_quickbooks_client notebook from src/common/quickbooks_client.py.

    Cells, in order: ``%run common_defs``; the module's leading imports
    (``src.common`` imports excluded; empty when the module is missing); the
    module's code with top-level imports stripped, or a warning comment when
    the module is missing.

    Returns:
        The notebook source as one newline-joined string.
    """
    source_file = PROJECT_ROOT / "src" / "common" / "quickbooks_client.py"
    marker = "# CELL ********************"

    notebook = ["# Fabric notebook source", "", generate_metadata_block(), ""]

    # Cell 1: Run common_defs (needed for QUICKBOOKS_API_BASE_URL etc.)
    notebook.extend([marker, "", "%run common_defs", "", generate_cell_metadata(), ""])

    # Cell 2: Imports
    notebook.extend([marker, ""])
    if source_file.exists():
        excluded = ["from src.common", "import src.common"]
        notebook.extend(extract_imports_from_module(source_file, skip_imports=excluded))
    notebook.extend(["", generate_cell_metadata(), ""])

    # Cell 3: Inlined code (functions only, top-level imports stripped)
    notebook.extend([marker, ""])
    if not source_file.exists():
        notebook.append("# WARNING: src/common/quickbooks_client.py not found")
    else:
        body = read_module_code(
            source_file,
            skip_imports=["import ", "from "],
            skip_top_level_only=True,
        )
        notebook.append(body.rstrip())
    notebook.extend(["", generate_cell_metadata(), ""])

    return "\n".join(notebook)
|
|
554
|
+
|
|
555
|
+
|
|
556
|
+
def generate_pipeline_notebook(config: NotebookConfig) -> str:
    """Generate a pipeline notebook (bronze/silver/gold) with inlined code.

    The notebook is assembled from these cells, in order:

    * ``%run common_functions``.
    * Extra ``%run`` cells selected by substrings of ``config.notebook_name``
      (watermark table helpers, QuickBooks auth/client helpers, and
      ``30_gold_finalize_accounts`` wrapped in a save/restore of ``RUN_MAIN``).
    * A PARAMETERS cell, only when ``config.parameters`` is non-empty.
    * A setup cell creating ``_logger`` and ``_spark`` unless already defined.
    * The module's leading imports, minus those already provided by
      common_functions.
    * The module's code with top-level imports stripped.
    * A cell that calls ``run(_spark, _logger, ...)`` when ``RUN_MAIN`` is true.

    Args:
        config: Notebook description; ``layer`` selects the lakehouse config,
            ``module_path`` is the module to inline.

    Returns:
        The notebook source as one newline-joined string.
    """
    # Imports entirely provided by common_functions (matched as prefixes).
    provided_prefixes = (
        "from logging import Logger",
        "from functools import reduce",
        "from pyspark.sql import functions as F",
    )
    # Imports provided by common_functions that must match exactly.
    provided_exact = {
        "import logging", "import os", "import random", "import re",
        "import shutil", "import uuid", "import zipfile",
        "from datetime import datetime, timedelta, timezone",
    }

    def cell(*body: str) -> list:
        # One code cell: marker, blank, body lines, blank, METADATA trailer, blank.
        return ["# CELL ********************", "", *body, "", generate_cell_metadata(), ""]

    lakehouse_config = LAKEHOUSE_CONFIGS.get(config.layer)
    name = config.notebook_name

    content = ["# Fabric notebook source", "", generate_metadata_block(lakehouse_config), ""]

    # Cell 1: Run common_functions
    content += cell("%run common_functions")

    # Special case: notebooks that need update_watermark_table functions
    if "ingest_from_priority" in name or "post_process_priority_extract" in name:
        content += cell("%run 10_bronze_update_watermark_table")

    # Special case: QuickBooks ingestion needs auth and client helpers
    if "ingest_from_quickbooks" in name:
        content += cell("%run 10_bronze_quickbooks_auth")
        content += cell("%run 10_bronze_quickbooks_client")

    # Special case: onboard_org needs finalize_accounts for globals().
    # Set RUN_MAIN = False so %run loads functions without executing their run() entry points.
    if "onboard_org" in name:
        content += cell("_SAVED_RUN_MAIN = globals().get('RUN_MAIN', True)", "RUN_MAIN = False")
        content += cell("%run 30_gold_finalize_accounts")
        content += cell("RUN_MAIN = _SAVED_RUN_MAIN")

    # Cell 2: Parameters (if any)
    if config.parameters:
        content += ["# PARAMETERS CELL ********************", ""]
        for key, value in config.parameters.items():
            # value is emitted verbatim, so it must already be source text.
            # Special handling: get_lakehouse_path_func should default to cf_get_lakehouse_path
            # (available from %run common_functions) instead of None
            if key == "get_lakehouse_path_func" and value == "None":
                content.append(f"{key} = cf_get_lakehouse_path")
            else:
                content.append(f"{key} = {value}")
        content += ["", generate_cell_metadata(), ""]

    # Cell 3: Setup spark and logger
    content += cell(
        "# Allow callers/tests to inject an existing Spark session / logger before executing this",
        "# notebook (e.g., setting globals()['_spark'] or globals()['_logger']).",
        'if "_logger" not in globals():',
        f' _logger = cf_create_logger("{name}")',
        "",
        'if "_spark" not in globals():',
        " _spark = cf_create_spark_session()",
    )

    # Cell 4: Pipeline-specific imports (not provided by common_functions)
    if config.module_path.exists():
        pipeline_imports = extract_imports_from_module(
            config.module_path,
            skip_imports=["from src.common", "import src.common"],
        )
        import_lines = [
            imp
            for imp in pipeline_imports
            if not imp.startswith(provided_prefixes) and imp not in provided_exact
        ]
    else:
        import_lines = ["# No pipeline-specific imports"]
    content += cell(*import_lines)

    # Cell 5: Inlined pipeline code
    if config.module_path.exists():
        # Skip top-level imports (already in Cell 4), but keep imports inside functions
        inlined = read_module_code(
            config.module_path,
            skip_imports=["import ", "from "],
            skip_top_level_only=True,
        )
        code_lines = [inlined.rstrip()]
    else:
        code_lines = [f"# WARNING: {config.module_path} not found"]
    content += cell(*code_lines)

    # Cell 6: Run the pipeline
    run_args = ["_spark", "_logger"]
    if config.parameters:
        # Pass optional parameters as keyword arguments so notebook-level
        # variable renames or signature reorderings don't silently misbind.
        run_args += [f"{param_name}={param_name}" for param_name in config.parameters]
    content += cell(
        "# Run the pipeline",
        'if "RUN_MAIN" not in globals():',
        " RUN_MAIN = True",
        "",
        "if RUN_MAIN:",
        f" run({', '.join(run_args)})",
    )

    return "\n".join(content)
|
|
736
|
+
|
|
737
|
+
|
|
738
|
+
def module_imports_fabric_harness(module_path: Path) -> bool:
    """
    Return True if the test module imports ``tests.fabric_test_tables``.

    Generated Fabric notebooks cannot use ``from tests.*``; when this is True,
    ``generate_test_notebook`` inlines ``tests/fabric_test_tables.py`` so
    ``run_tests`` and helpers resolve in Synapse.

    Both ``from tests.fabric_test_tables import ...`` and
    ``import tests.fabric_test_tables`` are recognised, at any nesting depth.

    Args:
        module_path: Path of the test module to inspect.

    Returns:
        True when such an import is present. False when it is not, or when
        the file is missing, unreadable, not valid UTF-8, or not parseable.
    """
    if not module_path.exists():
        return False
    try:
        tree = ast.parse(module_path.read_text(encoding="utf-8"))
    except (SyntaxError, OSError, ValueError):
        # ValueError covers UnicodeDecodeError (file is not UTF-8) and the
        # "null bytes" error raised by ast.parse on older Python versions.
        return False

    harness_module = "tests.fabric_test_tables"
    for node in ast.walk(tree):
        if isinstance(node, ast.ImportFrom):
            if node.module == harness_module:
                return True
        elif isinstance(node, ast.Import):
            if any(alias.name == harness_module for alias in node.names):
                return True
    return False
|
|
760
|
+
|
|
761
|
+
|
|
762
|
+
def generate_test_notebook(config: NotebookConfig) -> str:
    """
    Build the source text of a test notebook with the test code inlined.

    The notebook is emitted as a sequence of cells: disable ``RUN_MAIN``,
    ``%run common_functions``, ``%run`` the notebook under test, create the
    logger/Spark session if absent, the test module's imports, optionally the
    inlined ``tests/fabric_test_tables.py`` harness, the inlined test code,
    and finally the ``run_tests`` call.

    Args:
        config: Notebook configuration; ``module_path`` is the test module and
            ``notebook_name`` is the test notebook name (``test_X`` tests ``X``).

    Returns:
        The notebook content as a single newline-joined string.
    """
    tests_lakehouse = LAKEHOUSE_CONFIGS.get("tests")
    harness_source = PROJECT_ROOT.resolve() / "tests" / "fabric_test_tables.py"
    test_module = config.module_path

    parts = [
        "# Fabric notebook source",
        "",
        generate_metadata_block(tests_lakehouse),
        "",
    ]

    def add_cell(*body):
        # One cell = marker, blank line, body lines, blank line, metadata, blank line.
        parts.extend(
            ("# CELL ********************", "", *body, "", generate_cell_metadata(), "")
        )

    # Set before the %run cells so the guarded run() call in the notebook
    # under test does not execute.
    add_cell("RUN_MAIN = False")

    add_cell("%run common_functions")

    # The notebook under test shares the test notebook's name minus "test_".
    own_name = config.notebook_name
    target_notebook = own_name[5:] if own_name.startswith("test_") else own_name
    add_cell(f"%run {target_notebook}")

    # Logger and Spark session are only created when not already present.
    add_cell(
        'if "_logger" not in globals():',
        f' _logger = cf_create_logger("{config.notebook_name}")',
        "",
        'if "_spark" not in globals():',
        " _spark = cf_create_spark_session()",
    )

    # Imports of the test module, minus src.* / tests.* imports, which the
    # %run cells (and the optional harness cell) provide instead.
    if test_module.exists():
        import_lines = extract_imports_from_module(test_module, skip_imports=[
            "from src.common",
            "import src.common",
            "from src.silver",
            "from src.bronze",
            "from src.gold",
            "import src.",
            "from tests.",
        ])
        add_cell(*import_lines)
    else:
        add_cell("# WARNING: Test imports could not be extracted")

    # Optional cell: inline the Fabric test harness when the tests import it.
    if (
        test_module.exists()
        and module_imports_fabric_harness(test_module)
        and harness_source.is_file()
    ):
        add_cell(
            "# Inlined from tests/fabric_test_tables.py (Fabric cannot import from tests.*)",
            read_module_code(harness_source, skip_imports=[]).rstrip(),
        )

    # The test code itself; top-level imports were already emitted above,
    # imports inside functions are kept.
    if test_module.exists():
        inlined_tests = read_module_code(
            test_module,
            skip_imports=["import ", "from "],
            skip_top_level_only=True,
        )
        add_cell(inlined_tests.rstrip())
    else:
        add_cell(f"# WARNING: {test_module} not found")

    add_cell("# Run the tests", "run_tests(_spark, _logger)")

    return "\n".join(parts)
|
|
884
|
+
|
|
885
|
+
|
|
886
|
+
def extract_run_function_parameters(module_path: Path) -> dict:
    """
    Extract all parameters (except spark/logger) from the run() function signature.

    Excludes 'spark' and 'logger' parameters as they are always provided by the notebook.
    Required parameters (without defaults) get an empty string default for the parameters cell.
    Both positional-or-keyword and keyword-only parameters are returned; the
    generated notebook passes them to run() by keyword. Positional-only
    parameters, ``*args`` and ``**kwargs`` are not returned.

    Args:
        module_path: Path of the module whose ``run`` function is inspected.

    Returns:
        Dictionary mapping parameter names to their default values (as strings for
        code generation). Empty when the module cannot be read or parsed, or has
        no ``run`` function.
    """
    try:
        # Read as UTF-8 (Python's default source encoding) instead of the
        # platform locale encoding.
        with open(module_path, encoding="utf-8") as f:
            tree = ast.parse(f.read())
    except (SyntaxError, OSError, ValueError) as e:
        # OSError includes FileNotFoundError; ValueError includes UnicodeDecodeError.
        print(f"Warning: Could not parse {module_path} for parameters: {e}")
        return {}

    # First function named "run" in ast.walk order.
    run_func = next(
        (
            node for node in ast.walk(tree)
            if isinstance(node, ast.FunctionDef) and node.name == "run"
        ),
        None,
    )
    if run_func is None:
        return {}

    skip_params = {"spark", "logger"}
    params = {}
    args = run_func.args

    # Positional defaults are aligned with the END of the parameter list, so
    # the parameter at index i uses defaults[i - first_default_index].
    first_default_index = len(args.args) - len(args.defaults)
    for i, arg in enumerate(args.args):
        if arg.arg in skip_params:
            continue
        default_index = i - first_default_index
        if default_index >= 0:
            params[arg.arg] = ast_unparse_default(args.defaults[default_index])
        else:
            # Required parameter without default - use empty string
            params[arg.arg] = '""'

    # Keyword-only parameters: kw_defaults runs parallel to kwonlyargs, with
    # None marking a required parameter.
    for arg, default in zip(args.kwonlyargs, args.kw_defaults):
        if arg.arg in skip_params:
            continue
        params[arg.arg] = '""' if default is None else ast_unparse_default(default)

    return params
|
|
942
|
+
|
|
943
|
+
|
|
944
|
+
def ast_unparse_default(node: ast.AST) -> str:
    """
    Convert an AST node representing a default value to source text.

    Uses ``ast.unparse`` (Python 3.9+) whenever it is available and succeeds.
    If it is unavailable or raises, constants are rendered with ``repr`` and
    the bare names ``None``/``True``/``False`` are returned as-is.

    Args:
        node: AST expression node taken from a function's defaults.

    Returns:
        A Python expression as a string. ``"None"`` is returned as a last
        resort so the generated notebook always contains valid code.
    """
    unparse = getattr(ast, "unparse", None)
    if unparse is not None:
        try:
            return unparse(node)
        except Exception:
            # Deliberate best effort: fall through to the manual rendering
            # below rather than aborting notebook generation.
            pass

    # The deprecated ast.NameConstant / ast.Str / ast.Num aliases are not
    # consulted: they were removed in Python 3.14 and ast.Constant covers
    # every literal on all versions that have ast.unparse.
    if isinstance(node, ast.Constant):
        return repr(node.value)
    if isinstance(node, ast.Name) and getattr(node, "id", None) in ("None", "True", "False"):
        return node.id

    # repr(node) would yield "<ast.X object at 0x...>", which is not valid
    # source text, so fall back to a harmless literal instead.
    return "None"
|
|
990
|
+
|
|
991
|
+
|
|
992
|
+
def find_modules_with_run_function(src_dir: Path) -> list[NotebookConfig]:
    """
    Find all Python modules that have a run() function and extract their parameters.

    Looks at ``*.py`` files directly inside the ``bronze``, ``silver``, ``gold``
    and ``backup`` sub-directories of ``src_dir``; files whose name starts with
    an underscore are ignored. Files are visited in sorted order so the result
    is the same on every run and platform.

    Args:
        src_dir: Directory containing the layer sub-directories.

    Returns:
        One ``NotebookConfig`` per module that defines a function named ``run``.
        Modules that cannot be decoded or parsed are reported with a warning
        and skipped.
    """
    configs = []

    for layer in ["bronze", "silver", "gold", "backup"]:
        layer_dir = src_dir / layer
        if not layer_dir.exists():
            continue

        # glob() yields entries in filesystem-dependent order; sort for
        # deterministic output.
        for py_file in sorted(layer_dir.glob("*.py")):
            if py_file.name.startswith("_"):
                continue

            # Check if module has a run() function
            try:
                tree = ast.parse(py_file.read_text(encoding="utf-8"))
            except (SyntaxError, ValueError) as e:
                # ValueError includes UnicodeDecodeError for non-UTF-8 files.
                print(f"Warning: Could not parse {py_file}: {e}")
                continue

            has_run = any(
                isinstance(node, ast.FunctionDef) and node.name == "run"
                for node in ast.walk(tree)
            )
            if not has_run:
                continue

            # Notebook name = layer prefix (if any) + module file stem.
            prefix = LAYER_PREFIXES.get(layer, "")
            notebook_name = f"{prefix}{py_file.stem}"

            # Extract optional parameters from run() function signature
            parameters = extract_run_function_parameters(py_file)

            configs.append(NotebookConfig(
                module_path=py_file,
                notebook_name=notebook_name,
                layer=layer,
                has_run_function=True,
                parameters=parameters if parameters else None,
            ))

    return configs
|
|
1036
|
+
|
|
1037
|
+
|
|
1038
|
+
def validate_no_src_imports(notebook_name: str, content: str, warn_only: bool = False):
    """
    Ensure no src.* imports leaked into generated notebook content.

    Scans the content line by line, ignoring comment lines and lines inside
    triple-quoted strings, and looks for lines starting with ``from src.`` or
    ``import src.``.

    Args:
        notebook_name: Notebook name, used in the message only.
        content: Generated notebook source text.
        warn_only: When True, print a warning for the first offending line and
            return; when False, raise instead.

    Raises:
        ValueError: If an offending import is found and ``warn_only`` is False.
    """
    # Delimiter of the triple-quoted string we are currently inside, or None.
    # Tracking the delimiter means a ''' inside a """...""" block (or the
    # reverse) does not end the block prematurely.
    open_quote = None

    for i, line in enumerate(content.split("\n"), 1):
        stripped = line.strip()

        if open_quote is not None:
            if open_quote in stripped:
                open_quote = None
            continue

        for quote in ('"""', "'''"):
            if quote in stripped:
                # An odd number of delimiters leaves a string open at the end
                # of the line; an even number opens and closes on the same line.
                if stripped.count(quote) % 2 == 1:
                    open_quote = quote
                break
        if open_quote is not None:
            continue

        # Skip comments
        if stripped.startswith("#"):
            continue

        if stripped.startswith(("from src.", "import src.")):
            msg = (
                f"Notebook '{notebook_name}' line {i} contains src import: {stripped}\n"
                f" src.* imports don't work in Fabric. Use try/except (ImportError, FileNotFoundError) "
                f"or move the functionality to common_functions."
            )
            if warn_only:
                print(f" WARNING: {msg}")
                return  # One warning per notebook is enough
            raise ValueError(msg)
|
|
1071
|
+
|
|
1072
|
+
|
|
1073
|
+
def write_notebook(output_dir: Path, notebook_name: str, content: str, dry_run: bool = False):
    """
    Write a notebook to ``<output_dir>/<notebook_name>.Notebook/notebook-content.py``.

    The content is first checked for leaked ``src.*`` imports; for notebooks
    whose name starts with ``test_`` that check only warns.

    Args:
        output_dir: Directory that receives the ``.Notebook`` folder.
        notebook_name: Notebook name without the ``.Notebook`` suffix.
        content: Notebook source text.
        dry_run: When True, only print what would be created.

    Raises:
        ValueError: Propagated from ``validate_no_src_imports`` for non-test
            notebooks containing a ``src.*`` import.
    """
    is_test = notebook_name.startswith("test_")
    validate_no_src_imports(notebook_name, content, warn_only=is_test)
    notebook_dir = output_dir / f"{notebook_name}.Notebook"

    if dry_run:
        print(f" Would create: {notebook_dir}")
        return

    notebook_dir.mkdir(parents=True, exist_ok=True)
    notebook_file = notebook_dir / "notebook-content.py"

    # Write explicitly as UTF-8: the inlined source is read as UTF-8, and the
    # locale default encoding may be unable to represent it on some platforms.
    with open(notebook_file, "w", encoding="utf-8") as f:
        f.write(content)

    print(f" Created: {notebook_dir}")
|
|
1090
|
+
|
|
1091
|
+
|
|
1092
|
+
##############################################################################
|
|
1093
|
+
# AST-based notebook validation
|
|
1094
|
+
##############################################################################
|
|
1095
|
+
|
|
1096
|
+
import builtins as _builtins_module
|
|
1097
|
+
|
|
1098
|
+
# Names provided by the Fabric runtime (not defined in any notebook).
# _find_undefined_names always treats these as known.
FABRIC_RUNTIME_NAMES = {
    "_spark", "_logger", "spark", "notebookutils", "mssparkutils",
    "display", "RUN_MAIN",
}

# Names that are provided at runtime for specific pipeline notebooks (e.g. by optional
# %run or by common modules not inlined into common_functions). Validation treats these as known.
# Keyed by notebook name; validate_all_notebooks looks entries up per pipeline notebook.
PIPELINE_VALIDATION_ALLOWLIST: dict[str, set[str]] = {}

# Names from standard library / PySpark that are imported in common_functions.
# These are available because common_functions has an imports cell.
# validate_all_notebooks adds them to the known names of every notebook it checks.
COMMON_IMPORTS_NAMES = {
    # Standard library modules imported at top of common_functions
    "json", "logging", "os", "random", "re", "shutil", "uuid", "zipfile", "builtins",
    "datetime", "timedelta", "timezone", "reduce", "Logger",
    # PySpark
    "DataFrame", "SparkSession", "F", "col", "lit", "udf",
    "BooleanType", "LongType", "StringType", "StructField", "StructType",
    "DeltaTable",
    # Fabric-specific
    "mssparkutils",
}
|
|
1121
|
+
|
|
1122
|
+
|
|
1123
|
+
def _extract_code_cells(content: str) -> list[tuple[int, str]]:
    """Split notebook content into its code cells.

    A cell starts after a ``# CELL **`` or ``# PARAMETERS CELL **`` marker line
    and ends at the next cell marker, the next ``# METADATA **`` marker, or the
    end of the content. Lines starting with ``# META `` are dropped. Cells with
    no lines at all are omitted. ``%run`` lines are kept in the returned code.

    Returns:
        List of ``(first_line, cell_code)`` tuples, where ``first_line`` is the
        1-based line number following the cell marker.
    """
    cell_markers = ("# CELL **", "# PARAMETERS CELL **")

    extracted: list[tuple[int, str]] = []
    buffer: list[str] = []
    first_line = 0
    collecting = False

    for lineno, text in enumerate(content.split("\n"), start=1):
        opens_cell = text.startswith(cell_markers)
        if opens_cell or text.startswith("# METADATA **"):
            # Either marker terminates the cell being collected, if any.
            if collecting and buffer:
                extracted.append((first_line, "\n".join(buffer)))
            buffer = []
            collecting = opens_cell
            if opens_cell:
                first_line = lineno + 1  # code starts on the line after the marker
            continue

        if collecting and not text.startswith("# META "):
            buffer.append(text)

    # The last cell may run to the end of the content without a closing marker.
    if collecting and buffer:
        extracted.append((first_line, "\n".join(buffer)))

    return extracted
|
|
1158
|
+
|
|
1159
|
+
|
|
1160
|
+
def _collect_defined_names_from_stmts(nodes: list[ast.AST], names: set[str]) -> None:
    """Add the names bound directly by each statement in ``nodes`` to ``names``.

    Handles function/class definitions, plain/annotated/augmented assignments
    to simple names (including one level of tuple/list unpacking), imports,
    and ``for``/``with`` targets. Statement bodies are not descended into.
    """
    for stmt in nodes:
        if isinstance(stmt, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
            names.add(stmt.name)
        elif isinstance(stmt, ast.Assign):
            for tgt in stmt.targets:
                if isinstance(tgt, ast.Name):
                    names.add(tgt.id)
                elif isinstance(tgt, (ast.Tuple, ast.List)):
                    names.update(
                        elem.id for elem in tgt.elts if isinstance(elem, ast.Name)
                    )
        elif isinstance(stmt, (ast.AnnAssign, ast.AugAssign)):
            if isinstance(stmt.target, ast.Name):
                names.add(stmt.target.id)
        elif isinstance(stmt, ast.Import):
            # "import a.b" binds "a"; "import a.b as c" binds "c".
            names.update(
                imported.asname or imported.name.split(".")[0]
                for imported in stmt.names
            )
        elif isinstance(stmt, ast.ImportFrom):
            names.update(imported.asname or imported.name for imported in stmt.names)
        elif isinstance(stmt, (ast.For, ast.With)):
            _collect_targets(stmt, names)
|
|
1187
|
+
|
|
1188
|
+
|
|
1189
|
+
def _collect_defined_names_from_code(code: str) -> set[str]:
    """Return the names bound at module level by ``code``.

    Covers everything ``_collect_defined_names_from_stmts`` recognises among
    the top-level statements, plus names bound directly inside the ``except``
    bodies of top-level ``try`` statements. Returns an empty set if ``code``
    does not parse.
    """
    try:
        module = ast.parse(code)
    except SyntaxError:
        return set()

    found: set[str] = set()
    top_level = list(ast.iter_child_nodes(module))

    # Definitions, assignments, imports and for/with targets.
    _collect_defined_names_from_stmts(top_level, found)

    # Names defined in except blocks (e.g. Fabric fallback helpers) are
    # visible module-wide; only the handler bodies are inspected.
    for stmt in top_level:
        if isinstance(stmt, ast.Try):
            for handler in stmt.handlers:
                _collect_defined_names_from_stmts(handler.body, found)

    return found
|
|
1228
|
+
|
|
1229
|
+
|
|
1230
|
+
def _collect_targets(node: ast.AST, names: set[str]):
    """
    Add the names bound by a ``for`` or ``with`` statement header to ``names``.

    Every simple name in the target is collected, including names inside
    nested tuple/list unpacking, starred targets, and tuple targets of
    ``with ... as (a, b)``. Attribute and subscript targets bind no name and
    contribute nothing. Nodes of any other type are ignored.

    Args:
        node: An ``ast.For``/``ast.AsyncFor`` or ``ast.With``/``ast.AsyncWith`` node.
        names: Set that receives the bound names (mutated in place).
    """
    if isinstance(node, (ast.For, ast.AsyncFor)):
        targets = [node.target]
    elif isinstance(node, (ast.With, ast.AsyncWith)):
        targets = [
            item.optional_vars for item in node.items
            if item.optional_vars is not None
        ]
    else:
        return

    for target in targets:
        # Only Store-context names are bindings; e.g. in "for obj.x in ..."
        # the name "obj" is merely read.
        for sub in ast.walk(target):
            if isinstance(sub, ast.Name) and isinstance(sub.ctx, ast.Store):
                names.add(sub.id)
|
|
1243
|
+
|
|
1244
|
+
|
|
1245
|
+
def _collect_notebook_exports(content: str) -> set[str]:
    """Return the union of module-level names defined by every code cell of a notebook.

    ``%run`` lines are removed from each cell before it is analysed.
    """
    exported: set[str] = set()
    for _first_line, cell in _extract_code_cells(content):
        python_only = "\n".join(
            row for row in cell.split("\n")
            if not row.strip().startswith("%run")
        )
        exported.update(_collect_defined_names_from_code(python_only))
    return exported
|
|
1254
|
+
|
|
1255
|
+
|
|
1256
|
+
def _collect_local_names(func_node: ast.FunctionDef) -> set[str]:
    """
    Collect all locally defined names within a function.

    The result is a flat set covering the function's own parameters and, for
    everything found anywhere inside it: assignment targets (any name in Store
    context, which includes loop, ``with``, comprehension and walrus targets),
    nested function and class names, parameters of nested functions and
    lambdas, imported names, and ``except ... as`` names. Positional-only,
    regular, keyword-only, ``*args`` and ``**kwargs`` parameters are all
    included. Nested scopes are not kept separate.

    Args:
        func_node: The function definition to inspect.

    Returns:
        Set of names bound somewhere inside the function.
    """
    local_names: set[str] = set()

    def add_parameters(arguments: ast.arguments) -> None:
        # posonlyargs is a separate list from args; omitting it would make
        # "def f(a, /)" report "a" as undefined.
        for arg in (*arguments.posonlyargs, *arguments.args, *arguments.kwonlyargs):
            local_names.add(arg.arg)
        if arguments.vararg:
            local_names.add(arguments.vararg.arg)
        if arguments.kwarg:
            local_names.add(arguments.kwarg.arg)

    add_parameters(func_node.args)

    # Walk the whole function (nested scopes included).
    for node in ast.walk(func_node):
        if isinstance(node, ast.Name) and isinstance(node.ctx, ast.Store):
            # Comprehension targets are Store-context names too, so they need
            # no separate handling.
            local_names.add(node.id)
        elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            if node is not func_node:  # Skip the function itself
                local_names.add(node.name)
                add_parameters(node.args)
        elif isinstance(node, ast.Lambda):
            add_parameters(node.args)
        elif isinstance(node, ast.ClassDef):
            local_names.add(node.name)
        elif isinstance(node, ast.Import):
            for alias in node.names:
                local_names.add(alias.asname if alias.asname else alias.name.split(".")[0])
        elif isinstance(node, ast.ImportFrom):
            for alias in node.names:
                local_names.add(alias.asname if alias.asname else alias.name)
        elif isinstance(node, ast.ExceptHandler):
            # The handler name is a plain string, not a Name node.
            if node.name:
                local_names.add(node.name)

    return local_names
|
|
1316
|
+
|
|
1317
|
+
|
|
1318
|
+
@dataclass
class UnresolvedName:
    """An unresolved name reference found during validation.

    One instance is created per Load-context name that is not in the known-name
    set; ``_validate_single_notebook`` de-duplicates them by ``(name, line)``.
    """
    # The identifier that could not be resolved.
    name: str
    # Line number in the notebook file (cell line offset + line within the cell - 1).
    line: int
    # Function where it was found, or None for module level.
    func_name: str | None
|
|
1324
|
+
|
|
1325
|
+
|
|
1326
|
+
def _find_undefined_names(code: str, line_offset: int, known_names: set[str]) -> list[UnresolvedName]:
    """Report names that ``code`` reads but that are not known to be defined.

    Only three kinds of top-level statement are inspected: function
    definitions (checked against the global scope plus the function's local
    names), expression statements, and ``if`` statements. Names starting with
    an underscore are never reported. Code that does not parse yields no issues.

    Args:
        code: Python source of one notebook cell.
        line_offset: Notebook line number of the first line of ``code``.
        known_names: Names available from outside this cell.

    Returns:
        Unresolved references in the order they were encountered.
    """
    try:
        tree = ast.parse(code)
    except SyntaxError:
        return []

    # Everything visible at module level: caller-supplied names, builtins,
    # Fabric runtime names and whatever this cell defines itself.
    global_scope = (
        known_names
        | set(dir(_builtins_module))
        | FABRIC_RUNTIME_NAMES
        | _collect_defined_names_from_code(code)
    )

    def unresolved_loads(root, scope, owner):
        # Yield an UnresolvedName for each Load-context name under `root`
        # that is neither underscore-prefixed nor in `scope`.
        for ref in ast.walk(root):
            if not (isinstance(ref, ast.Name) and isinstance(ref.ctx, ast.Load)):
                continue
            if ref.id.startswith("_") or ref.id in scope:
                continue
            yield UnresolvedName(
                name=ref.id,
                line=line_offset + ref.lineno - 1,
                func_name=owner,
            )

    found: list[UnresolvedName] = []
    for stmt in ast.iter_child_nodes(tree):
        if isinstance(stmt, (ast.FunctionDef, ast.AsyncFunctionDef)):
            function_scope = global_scope | _collect_local_names(stmt)
            found.extend(unresolved_loads(stmt, function_scope, stmt.name))
        elif isinstance(stmt, ast.Expr):
            found.extend(unresolved_loads(stmt, global_scope, None))
        elif isinstance(stmt, ast.If):
            # Module-level if statements (like the RUN_MAIN guard)
            _check_node_for_undefined(stmt, line_offset, global_scope, found)

    return found
|
|
1377
|
+
|
|
1378
|
+
|
|
1379
|
+
def _check_node_for_undefined(
|
|
1380
|
+
node: ast.AST, line_offset: int, known: set[str], issues: list[UnresolvedName]
|
|
1381
|
+
):
|
|
1382
|
+
"""Check an AST node tree for undefined name references."""
|
|
1383
|
+
# First collect all Store targets within this block
|
|
1384
|
+
block_locals: set[str] = set()
|
|
1385
|
+
for child in ast.walk(node):
|
|
1386
|
+
if isinstance(child, ast.Name) and isinstance(child.ctx, ast.Store):
|
|
1387
|
+
block_locals.add(child.id)
|
|
1388
|
+
elif isinstance(child, (ast.FunctionDef, ast.AsyncFunctionDef)):
|
|
1389
|
+
block_locals.add(child.name)
|
|
1390
|
+
elif isinstance(child, ast.Import):
|
|
1391
|
+
for alias in child.names:
|
|
1392
|
+
block_locals.add(alias.asname if alias.asname else alias.name.split(".")[0])
|
|
1393
|
+
elif isinstance(child, ast.ImportFrom):
|
|
1394
|
+
for alias in child.names:
|
|
1395
|
+
block_locals.add(alias.asname if alias.asname else alias.name)
|
|
1396
|
+
elif isinstance(child, ast.comprehension):
|
|
1397
|
+
if isinstance(child.target, ast.Name):
|
|
1398
|
+
block_locals.add(child.target.id)
|
|
1399
|
+
|
|
1400
|
+
effective_known = known | block_locals
|
|
1401
|
+
|
|
1402
|
+
for child in ast.walk(node):
|
|
1403
|
+
if isinstance(child, ast.Name) and isinstance(child.ctx, ast.Load):
|
|
1404
|
+
if child.id.startswith("_") or child.id in effective_known:
|
|
1405
|
+
continue
|
|
1406
|
+
issues.append(UnresolvedName(
|
|
1407
|
+
name=child.id,
|
|
1408
|
+
line=line_offset + child.lineno - 1,
|
|
1409
|
+
func_name=None,
|
|
1410
|
+
))
|
|
1411
|
+
|
|
1412
|
+
|
|
1413
|
+
def validate_all_notebooks(generated: dict[str, str]) -> int:
    """
    Check every generated notebook for references to undefined names.

    ``common_functions`` is validated against the exports of ``common_defs``;
    pipeline notebooks against the exports of both common notebooks, their
    per-notebook allowlist and the exports of any pipeline notebook they
    ``%run``; test notebooks additionally get the test-runner and conftest
    names. Findings in test notebooks are reported but not counted.

    Args:
        generated: Dict mapping notebook_name -> content

    Returns:
        Number of errors found in production notebooks
    """
    banner = "=" * 50
    print("\n" + banner)
    print("Validating notebooks for undefined names...")
    print(banner)

    shared_notebooks = ("common_defs", "common_functions")

    def exports_of(notebook: str) -> set[str]:
        # Exports of a shared notebook, or nothing when it was not generated.
        if notebook in generated:
            return _collect_notebook_exports(generated[notebook])
        return set()

    defs_names = exports_of("common_defs")
    functions_names = exports_of("common_functions")

    # Names every pipeline/test notebook can rely on.
    shared_scope = defs_names | functions_names | COMMON_IMPORTS_NAMES

    total_errors = 0

    # common_functions itself only sees common_defs and the common imports.
    if "common_functions" in generated:
        total_errors += _validate_single_notebook(
            "common_functions", generated["common_functions"],
            defs_names | COMMON_IMPORTS_NAMES,
            is_test=False,
        )

    # Exports of each pipeline notebook, for resolving %run dependencies.
    pipeline_exports: dict[str, set[str]] = {
        nb: _collect_notebook_exports(text)
        for nb, text in generated.items()
        if nb not in shared_notebooks and not nb.startswith("test_")
    }

    def add_run_dependencies(scope: set[str], text: str) -> None:
        # Extend `scope` with the exports of every pipeline notebook that
        # `text` pulls in through a %run line.
        for raw in text.split("\n"):
            directive = raw.strip()
            if not directive.startswith("%run "):
                continue
            if directive in ("%run common_functions", "%run common_defs"):
                continue
            scope |= pipeline_exports.get(directive[5:].strip(), set())

    # Pipeline notebooks: issues count as errors.
    for nb, text in generated.items():
        if nb not in pipeline_exports:
            continue
        scope = set(shared_scope)
        # Per-notebook allowlist (names provided at runtime, e.g. optional %run)
        scope |= PIPELINE_VALIDATION_ALLOWLIST.get(nb, set())
        add_run_dependencies(scope, text)
        total_errors += _validate_single_notebook(nb, text, scope, is_test=False)

    # Test notebooks: always allow the names used by the generated "run tests"
    # cell and the helpers provided by tests.conftest under pytest (in Fabric
    # they may come from try/except ImportError fallbacks).
    test_scope = (
        shared_scope
        | {"unittest", "sys", "run_tests"}
        | {"clean_default_db_table", "force_create_default_db_table"}
    )
    for nb, text in generated.items():
        if not nb.startswith("test_"):
            continue
        scope = set(test_scope)
        add_run_dependencies(scope, text)
        # Test notebook issues are warnings, don't count as errors
        _validate_single_notebook(nb, text, scope, is_test=True)

    return total_errors
|
|
1500
|
+
|
|
1501
|
+
|
|
1502
|
+
def _validate_single_notebook(
    notebook_name: str, content: str, known_names: set[str], is_test: bool
) -> int:
    """Report undefined names found in the code cells of one notebook.

    Names exported by any cell of the notebook itself are merged into
    ``known_names`` before checking, so a reference to something defined in
    a different cell is not flagged. Findings are de-duplicated on
    ``(name, line)`` and printed with a ``WARNING`` prefix for test notebooks
    or an ``ERROR`` prefix otherwise.

    Returns:
        The number of distinct findings, or 0 when ``is_test`` is true
        (test notebook findings are reported but never counted).
    """
    # Everything the notebook defines itself is resolvable from every cell.
    resolvable = known_names | _collect_notebook_exports(content)

    findings: list[UnresolvedName] = []
    for offset, cell_code in _extract_code_cells(content):
        # Drop %run lines before analysing the cell.
        python_only = "\n".join(
            row for row in cell_code.split("\n") if not row.strip().startswith("%run")
        )
        if python_only.strip():
            findings.extend(_find_undefined_names(python_only, offset, resolvable))

    # Keep the first finding for each (name, line) pair, in discovery order.
    distinct: dict[tuple[str, int], UnresolvedName] = {}
    for finding in findings:
        distinct.setdefault((finding.name, finding.line), finding)
    reported = list(distinct.values())

    if not reported:
        return 0

    label = "WARNING" if is_test else "ERROR"
    print(f"\n {label}: {notebook_name} has {len(reported)} undefined name(s):")
    for finding in reported:
        where = f"line {finding.line}"
        if finding.func_name:
            where = f"{where} in {finding.func_name}()"
        print(f" - '{finding.name}' at {where}")

    return 0 if is_test else len(reported)
|
|
1542
|
+
|
|
1543
|
+
|
|
1544
|
+
def main():
    """Command-line entry point: generate Fabric notebooks, then validate them.

    Parses the CLI arguments, sets the module-level ``PROJECT_ROOT`` and
    ``LAKEHOUSE_CONFIGS``, and generates the notebook groups selected by
    ``--only`` / ``--all``:

    * common notebooks (``common_defs``, ``common_functions``),
    * QuickBooks helper notebooks, only when the matching source module
      exists under ``src/common/``,
    * one pipeline notebook per module returned by
      ``find_modules_with_run_function``,
    * one test notebook per such module that has a matching file in
      ``tests/``.

    ``--all`` takes precedence over ``--only``: every group is generated and
    the pipeline list is not restricted to a single layer.

    Unless ``--dry-run`` is given, the generated contents are passed to
    ``validate_all_notebooks``. With ``--strict-validate`` a non-zero error
    count ends the process with exit status 1.
    """
    global PROJECT_ROOT, LAKEHOUSE_CONFIGS

    parser = argparse.ArgumentParser(description="Generate Fabric notebooks from Python modules")
    parser.add_argument("--project-root", type=Path, default=Path.cwd(),
                        help="Consumer project root containing src/, config/, etc. (default: CWD)")
    parser.add_argument("--only", choices=["bronze", "silver", "gold", "backup", "common", "tests"],
                        help="Generate only notebooks for specified layer")
    parser.add_argument("--all", action="store_true",
                        help="Regenerate all notebooks including common")
    parser.add_argument("--dry-run", action="store_true",
                        help="Show what would be generated without writing files")
    parser.add_argument("--output", type=Path, default=None,
                        help="Output directory (default: project root)")
    parser.add_argument("--strict-validate", action="store_true",
                        help="Exit with non-zero status if validation finds issues in production notebooks")
    args = parser.parse_args()

    PROJECT_ROOT = args.project_root.resolve()
    LAKEHOUSE_CONFIGS = load_lakehouse_config(PROJECT_ROOT)

    project_root = PROJECT_ROOT
    src_dir = project_root / "src"
    output_dir = args.output or project_root

    print("Fabric Notebook Generator")
    print("=" * 50)

    # Track all generated notebook contents for validation
    generated_notebooks: dict[str, str] = {}

    # Generate common notebooks
    if args.only in [None, "common"] or args.all:
        print("\nGenerating common notebooks...")

        # common_defs
        content = generate_common_defs_notebook()
        write_notebook(output_dir / "common", "common_defs", content, args.dry_run)
        generated_notebooks["common_defs"] = content

        # common_functions
        content = generate_common_functions_notebook()
        write_notebook(output_dir / "common", "common_functions", content, args.dry_run)
        generated_notebooks["common_functions"] = content

    # Generate QuickBooks helper notebooks (placed in bronze/) only when the
    # consumer ships the corresponding source modules. They're CashHero-flavored
    # and won't be needed by most consumers.
    if args.only in [None, "bronze"] or args.all:
        qb_auth_src = project_root / "src" / "common" / "quickbooks_auth.py"
        qb_client_src = project_root / "src" / "common" / "quickbooks_client.py"
        if qb_auth_src.exists():
            content = generate_quickbooks_auth_notebook()
            write_notebook(output_dir / "bronze", "10_bronze_quickbooks_auth", content, args.dry_run)
            generated_notebooks["10_bronze_quickbooks_auth"] = content
        if qb_client_src.exists():
            content = generate_quickbooks_client_notebook()
            write_notebook(output_dir / "bronze", "10_bronze_quickbooks_client", content, args.dry_run)
            generated_notebooks["10_bronze_quickbooks_client"] = content

    # Generate pipeline notebooks
    if args.only in [None, "bronze", "silver", "gold", "backup"] or args.all:
        configs = find_modules_with_run_function(src_dir)

        # Restrict to one layer only when --all is not set. Without this
        # guard, "--all --only common" (or "tests") filtered out every
        # pipeline and printed the misleading "No pipeline modules found"
        # message, and "--all --only bronze" dropped the other layers.
        if args.only and not args.all:
            configs = [c for c in configs if c.layer == args.only]

        if configs:
            print(f"\nGenerating {len(configs)} pipeline notebooks...")
            for config in configs:
                content = generate_pipeline_notebook(config)
                # Write production notebooks under layer dir (bronze/, silver/, gold/)
                layer_output_dir = output_dir / config.layer
                write_notebook(layer_output_dir, config.notebook_name, content, args.dry_run)
                generated_notebooks[config.notebook_name] = content
        else:
            print("\nNo pipeline modules found with run() functions.")
            print("Create modules in src/bronze/, src/silver/, or src/gold/ with a run() function.")

    # Generate test notebooks for migrated modules
    if args.only in [None, "tests"] or args.all:
        tests_dir = project_root / "tests"
        if tests_dir.exists():
            # Find test files that correspond to migrated modules
            migrated_configs = find_modules_with_run_function(src_dir)
            test_notebooks_generated = 0

            for config in migrated_configs:
                # Map module name to test file
                # e.g., src/silver/transform_accounts.py -> tests/test_silver_transform_accounts.py
                module_name = config.module_path.stem  # transform_accounts
                layer = config.layer  # silver
                test_file = tests_dir / f"test_{layer}_{module_name}.py"

                # Also try without layer prefix
                if not test_file.exists():
                    test_file = tests_dir / f"test_{module_name}.py"

                if test_file.exists():
                    # Generate test notebook with the notebook naming convention
                    # e.g., test_20_silver_transform_accounts
                    test_notebook_name = f"test_{config.notebook_name}"

                    test_config = NotebookConfig(
                        module_path=test_file,
                        notebook_name=test_notebook_name,
                        layer="tests",
                        has_run_function=False,
                    )
                    content = generate_test_notebook(test_config)
                    # Write test notebooks under layer/tests/ (e.g. bronze/tests/, silver/tests/, gold/tests/)
                    test_output_dir = output_dir / config.layer / "tests"
                    write_notebook(test_output_dir, test_config.notebook_name, content, args.dry_run)
                    generated_notebooks[test_config.notebook_name] = content
                    test_notebooks_generated += 1

            if test_notebooks_generated > 0:
                print(f"\nGenerated {test_notebooks_generated} test notebooks for migrated modules")

    # Validate all generated notebooks (skipped for dry runs)
    if not args.dry_run and generated_notebooks:
        error_count = validate_all_notebooks(generated_notebooks)
        if error_count > 0:
            print(f"\nValidation: {error_count} undefined name(s) found in production notebooks.")
            if args.strict_validate:
                print("Fix the source modules and regenerate (--strict-validate is on).")
                sys.exit(1)
        else:
            print("\nValidation passed: no undefined names found.")

    print("\nDone!")
|
|
1674
|
+
|
|
1675
|
+
|
|
1676
|
+
# Run the generator only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
|
|
1677
|
+
main()
|