pyfabric-dev 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1677 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Generate Microsoft Fabric notebooks from Python modules.
4
+
5
+ This script generates Fabric-compatible .Notebook directories from the
6
+ Python modules in src/. Generated notebooks import and execute the
7
+ corresponding module functions.
8
+
9
+ Usage:
10
+ python dev/generate_notebooks.py
11
+
12
+ # Generate only specific notebooks
13
+ python dev/generate_notebooks.py --only bronze
14
+
15
+ # Regenerate all notebooks (including common)
16
+ python dev/generate_notebooks.py --all
17
+
18
+ # Dry run (show what would be generated)
19
+ python dev/generate_notebooks.py --dry-run
20
+ """
21
+
22
+ import argparse
23
+ import ast
24
+ import json
25
+ import os
26
+ import sys
27
+ from dataclasses import dataclass
28
+ from pathlib import Path
29
+ from typing import Literal
30
+
31
+
32
+ # Project root is resolved at CLI invocation (via --project-root, or
33
+ # Path.cwd() as a fallback). All source/output path computations route
34
+ # through this global so the generator works for any consumer, not just
35
+ # the package install location.
36
+ PROJECT_ROOT: Path = Path.cwd()
37
+
38
+ # Lakehouse config is loaded lazily after PROJECT_ROOT is set in main().
39
+ LAKEHOUSE_CONFIGS: dict = {}
40
+
41
+
42
+ def load_lakehouse_config(project_root: Path | None = None) -> dict:
43
+ """Load lakehouse configuration from <project_root>/config/lakehouse_config.json."""
44
+ root = project_root if project_root is not None else PROJECT_ROOT
45
+ config_path = root / "config" / "lakehouse_config.json"
46
+
47
+ if not config_path.exists():
48
+ print(f"Warning: Lakehouse config not found at {config_path}")
49
+ print("Using empty config. Create config/lakehouse_config.json from template.")
50
+ return {"bronze": {}, "silver": {}, "gold": {}, "tests": {}, "common": {}}
51
+
52
+ with open(config_path) as f:
53
+ return json.load(f)
54
+
55
# Notebook name prefix for each layer that has one.
# The "backup" layer maps to an empty prefix.
LAYER_PREFIXES = dict(
    bronze="10_bronze_",
    silver="20_silver_",
    gold="30_gold_",
    backup="",
)
62
+
63
+
64
+ @dataclass
65
+ class NotebookConfig:
66
+ """Configuration for generating a notebook."""
67
+ module_path: Path
68
+ notebook_name: str
69
+ layer: Literal["bronze", "silver", "gold", "backup", "common", "tests"]
70
+ has_run_function: bool = True
71
+ parameters: dict = None
72
+
73
+
74
+ def generate_metadata_block(lakehouse_config: dict = None) -> str:
75
+ """Generate the notebook METADATA block."""
76
+ meta = {
77
+ "kernel_info": {
78
+ "name": "synapse_pyspark"
79
+ },
80
+ "dependencies": {}
81
+ }
82
+
83
+ if lakehouse_config:
84
+ meta["dependencies"]["lakehouse"] = lakehouse_config
85
+
86
+ # Format dependencies as indented JSON within META comments
87
+ deps_json = json.dumps(
88
+ {"lakehouse": lakehouse_config} if lakehouse_config else {},
89
+ indent=2
90
+ ).replace("\n", "\n# META ")
91
+
92
+ return f"""# METADATA ********************
93
+
94
+ # META {{
95
+ # META "kernel_info": {{
96
+ # META "name": "synapse_pyspark"
97
+ # META }},
98
+ # META "dependencies": {deps_json}
99
+ # META }}"""
100
+
101
+
102
def generate_cell_metadata() -> str:
    """Return the fixed METADATA trailer that follows a code cell.

    The text declares the cell language as ``python`` in the
    ``synapse_pyspark`` language group and has no trailing newline.
    """
    meta_lines = (
        "# METADATA ********************",
        "",
        "# META {",
        '# META "language": "python",',
        '# META "language_group": "synapse_pyspark"',
        "# META }",
    )
    return "\n".join(meta_lines)
110
+
111
+
112
def generate_path_setup_cell() -> str:
    """Generate the cell that sets up the Python path for src imports.

    Returns:
        A complete cell (``# CELL`` marker, code, ``# METADATA`` trailer)
        ending with a newline. The emitted code inserts the project root
        at the front of ``sys.path`` when it is not already present; the
        project root is the parent of the working directory when that
        directory's name ends with ``.Notebook``, else the working
        directory itself.
    """
    # The body of the emitted ``if`` must be indented, otherwise the
    # generated cell is not valid Python.
    return """# CELL ********************

# Setup Python path for src imports
# This ensures src modules can be imported in both Fabric and local environments
import sys
from pathlib import Path

# In Fabric, notebooks run from the workspace root where src/ is located
# Locally, we need to ensure the project root is in the path
_notebook_dir = Path.cwd()
_project_root = _notebook_dir.parent if _notebook_dir.name.endswith('.Notebook') else _notebook_dir

if str(_project_root) not in sys.path:
    sys.path.insert(0, str(_project_root))

# METADATA ********************

# META {
# META "language": "python",
# META "language_group": "synapse_pyspark"
# META }
"""
136
+
137
+
138
+ def extract_imports_from_module(module_path: Path, skip_imports: list[str] = None) -> list[str]:
139
+ """
140
+ Extract import statements from a Python module.
141
+
142
+ Args:
143
+ module_path: Path to the Python module
144
+ skip_imports: List of import patterns to skip (e.g., ['from src.common'])
145
+
146
+ Returns:
147
+ List of import statement strings
148
+ """
149
+ skip_imports = skip_imports or []
150
+
151
+ with open(module_path) as f:
152
+ lines = f.readlines()
153
+
154
+ import_lines = []
155
+ in_multiline_import = False
156
+ in_docstring = False
157
+ docstring_char = None
158
+ current_import = []
159
+
160
+ for line in lines:
161
+ stripped = line.strip()
162
+
163
+ # Handle docstrings
164
+ if not in_docstring:
165
+ if stripped.startswith('"""') or stripped.startswith("'''"):
166
+ docstring_char = stripped[:3]
167
+ # Check if it's a single-line docstring
168
+ if stripped.count(docstring_char) >= 2:
169
+ continue # Single line docstring, skip it
170
+ in_docstring = True
171
+ continue
172
+ else:
173
+ # We're inside a docstring, check if this line ends it
174
+ if docstring_char in stripped:
175
+ in_docstring = False
176
+ continue
177
+
178
+ # Skip comments
179
+ if stripped.startswith("#"):
180
+ continue
181
+
182
+ # Check if this is an import line
183
+ is_import = stripped.startswith("import ") or stripped.startswith("from ")
184
+
185
+ if in_multiline_import:
186
+ current_import.append(line.rstrip())
187
+ # Check if this line ends the multiline import
188
+ if stripped.endswith(")") or (not stripped.endswith("\\") and not stripped.endswith(",")):
189
+ in_multiline_import = False
190
+ # Check if we should skip this import
191
+ full_import = "\n".join(current_import)
192
+ should_skip = any(pattern in full_import for pattern in skip_imports)
193
+ if not should_skip:
194
+ import_lines.append(full_import)
195
+ current_import = []
196
+ elif is_import:
197
+ # Check if this is a multiline import
198
+ if ("(" in stripped and ")" not in stripped) or stripped.endswith("\\"):
199
+ in_multiline_import = True
200
+ current_import = [line.rstrip()]
201
+ else:
202
+ # Single line import - check if we should skip it
203
+ should_skip = any(pattern in line for pattern in skip_imports)
204
+ if not should_skip:
205
+ import_lines.append(line.rstrip())
206
+ elif stripped and not is_import:
207
+ # Stop at first non-import, non-comment line (start of actual code)
208
+ break
209
+
210
+ return import_lines
211
+
212
+
213
+ def read_module_code(module_path: Path, skip_imports: list[str] = None, skip_top_level_only: bool = False) -> str:
214
+ """
215
+ Read Python module and extract code suitable for notebook inlining.
216
+
217
+ Args:
218
+ module_path: Path to the Python module
219
+ skip_imports: List of import patterns to skip (e.g., ['from src.common'])
220
+ skip_top_level_only: If True, only skip imports at indentation level 0 (preserve imports inside functions)
221
+
222
+ Returns:
223
+ Code string with imports filtered
224
+ """
225
+ with open(module_path) as f:
226
+ lines = f.readlines()
227
+
228
+ skip_imports = skip_imports or []
229
+ result_lines = []
230
+ in_docstring = False
231
+ docstring_char = None
232
+ in_multiline_import = False
233
+
234
+ for line in lines:
235
+ stripped = line.strip()
236
+
237
+ # Track multiline imports (lines ending with \ or inside parentheses)
238
+ if in_multiline_import:
239
+ # Check if this line ends the multiline import
240
+ if stripped.endswith(")") or (not stripped.endswith("\\") and not stripped.endswith(",")):
241
+ in_multiline_import = False
242
+ continue # Skip all lines of the multiline import
243
+
244
+ # Track docstrings
245
+ if not in_docstring:
246
+ if stripped.startswith('"""') or stripped.startswith("'''"):
247
+ docstring_char = stripped[:3]
248
+ if stripped.count(docstring_char) >= 2:
249
+ # Single line docstring
250
+ result_lines.append(line)
251
+ continue
252
+ in_docstring = True
253
+ else:
254
+ if docstring_char in stripped:
255
+ in_docstring = False
256
+ result_lines.append(line)
257
+ continue
258
+
259
+ # Skip specified imports
260
+ should_skip = False
261
+ for skip_pattern in skip_imports:
262
+ if stripped.startswith(skip_pattern):
263
+ # If skip_top_level_only is True, only skip if this is a top-level import (no indentation)
264
+ if skip_top_level_only:
265
+ # Check if line starts with the pattern (no leading whitespace)
266
+ if line.startswith(skip_pattern):
267
+ should_skip = True
268
+ else:
269
+ should_skip = True
270
+
271
+ if should_skip:
272
+ # Check if this is a multiline import
273
+ if "(" in stripped and ")" not in stripped:
274
+ in_multiline_import = True
275
+ elif stripped.endswith("\\"):
276
+ in_multiline_import = True
277
+ break
278
+
279
+ if not should_skip:
280
+ result_lines.append(line)
281
+
282
+ return "".join(result_lines)
283
+
284
+
285
def generate_common_defs_notebook() -> str:
    """Build the text of the common_defs notebook from inlined module code.

    The notebook has a single code cell. It contains the code of
    src/common/framework_defs.py (when that file exists) followed by the
    code of src/common/defs.py with its ``src.common`` imports removed.
    framework_defs is the framework-extractable subset; defs.py holds the
    CashHero specifics. A warning comment is emitted instead when defs.py
    is missing.
    """
    common_dir = PROJECT_ROOT / "src" / "common"
    framework_defs_path = common_dir / "framework_defs.py"
    defs_path = common_dir / "defs.py"

    # Notebook header, followed by the opening of the only code cell.
    content = [
        "# Fabric notebook source",
        "",
        generate_metadata_block(),
        "",
        "# CELL ********************",
        "",
    ]

    if framework_defs_path.exists():
        content += [read_module_code(framework_defs_path).rstrip(), ""]

    if not defs_path.exists():
        content += [
            "# WARNING: src/common/defs.py not found",
            "# Please create the module first",
        ]
    else:
        # framework_defs is already inlined above, so the cross-imports
        # from src.common must not survive in the notebook.
        inlined_defs = read_module_code(
            defs_path,
            skip_imports=["from src.common", "import src.common"],
        )
        content.append(inlined_defs.rstrip())

    content += ["", generate_cell_metadata(), ""]
    return "\n".join(content)
323
+
324
+
325
def generate_common_functions_notebook() -> str:
    """Generate the common_functions notebook content with inlined code.

    Cells, in order:
      1. ``%run common_defs``
      2. Imports collected from src/common/functions.py and
         src/common/cashhero_org.py (``src.common`` imports excluded,
         duplicates removed) plus the Fabric-only notebookutils import.
      3. Code of src/common/fabric.py with all imports removed.
      4. Code of src/common/env.py with all imports removed.
      5. Code of functions.py followed by cashhero_org.py, both with all
         imports removed, so callers see every symbol in one namespace
         after ``%run``.

    A warning comment replaces the content of a cell whose source file is
    missing (cashhero_org.py is optional and produces no warning).

    Returns:
        The notebook source as a single string.
    """
    common_dir = PROJECT_ROOT / "src" / "common"
    functions_path = common_dir / "functions.py"
    cashhero_org_path = common_dir / "cashhero_org.py"
    fabric_path = common_dir / "fabric.py"
    env_path = common_dir / "env.py"

    content = [
        "# Fabric notebook source",
        "",
        generate_metadata_block(LAKEHOUSE_CONFIGS.get("bronze")),  # Default to bronze
        "",
    ]

    def add_cell(body_lines: list[str]) -> None:
        """Append one code cell (marker, body, metadata trailer)."""
        content.append("# CELL ********************")
        content.append("")
        content.extend(body_lines)
        content.append("")
        content.append(generate_cell_metadata())
        content.append("")

    def external_imports(path: Path) -> list[str]:
        """Imports of *path*, minus src.common (provided by %run common_defs)."""
        return extract_imports_from_module(
            path, skip_imports=["from src.common", "import src.common"]
        )

    def code_without_imports(path: Path) -> str:
        """Code of *path* with every import line removed."""
        return read_module_code(path, skip_imports=["import ", "from "]).rstrip()

    # Cell 1: Run common_defs
    add_cell(["%run common_defs"])

    # Cell 2: Imports from source file
    if functions_path.exists():
        imports = external_imports(functions_path)
        if cashhero_org_path.exists():
            # Deduplicate so imports common to both files appear once.
            for imp in external_imports(cashhero_org_path):
                if imp not in imports:
                    imports.append(imp)

        import_cell = list(imports)
        if imports:
            import_cell.append("")
        # Fabric-specific import that isn't in the source file
        # (notebookutils is only available in Fabric, not in local dev)
        import_cell.append("from notebookutils import mssparkutils")
    else:
        import_cell = ["# WARNING: src/common/functions.py not found"]
    add_cell(import_cell)

    # Cell 3: Fabric-specific utilities (extracted from src/common/fabric.py)
    if fabric_path.exists():
        add_cell([code_without_imports(fabric_path)])
    else:
        add_cell([
            "# WARNING: src/common/fabric.py not found",
            "# Fabric-specific utilities should be defined here",
        ])

    # Cell 4: Environment config loader (extracted from src/common/env.py)
    if env_path.exists():
        add_cell([code_without_imports(env_path)])
    else:
        add_cell([
            "# WARNING: src/common/env.py not found",
            "# Environment config loader should be defined here",
        ])

    # Cell 5: Inlined functions from functions.py + cashhero_org.py
    # (imports are skipped since they're already in Cell 2)
    if functions_path.exists():
        functions_cell = [code_without_imports(functions_path)]
    else:
        functions_cell = ["# WARNING: src/common/functions.py not found"]
    if cashhero_org_path.exists():
        functions_cell.append("")
        functions_cell.append(code_without_imports(cashhero_org_path))
    add_cell(functions_cell)

    return "\n".join(content)
456
+
457
+
458
def generate_quickbooks_auth_notebook() -> str:
    """Build the 10_bronze_quickbooks_auth notebook from quickbooks_auth.py.

    The notebook runs common_defs, then holds one cell with the module's
    imports (``src.common`` imports excluded) and one cell with the
    module's code with top-level imports stripped. If the module file is
    missing, the import cell is empty and the code cell holds a warning
    comment.
    """
    module_path = PROJECT_ROOT / "src" / "common" / "quickbooks_auth.py"
    cell_marker = "# CELL ********************"

    lines = ["# Fabric notebook source", "", generate_metadata_block(), ""]

    # Cell 1: Run common_defs (needed for QUICKBOOKS_TOKEN_URL)
    lines += [cell_marker, "", "%run common_defs", "", generate_cell_metadata(), ""]

    # Cell 2: Imports
    lines += [cell_marker, ""]
    if module_path.exists():
        lines.extend(
            extract_imports_from_module(
                module_path,
                skip_imports=["from src.common", "import src.common"],
            )
        )
    lines += ["", generate_cell_metadata(), ""]

    # Cell 3: Inlined code (functions only, top-level imports stripped)
    lines += [cell_marker, ""]
    if module_path.exists():
        inlined = read_module_code(
            module_path,
            skip_imports=["import ", "from "],
            skip_top_level_only=True,
        )
        lines.append(inlined.rstrip())
    else:
        lines.append("# WARNING: src/common/quickbooks_auth.py not found")
    lines += ["", generate_cell_metadata(), ""]

    return "\n".join(lines)
505
+
506
+
507
def generate_quickbooks_client_notebook() -> str:
    """Build the 10_bronze_quickbooks_client notebook from quickbooks_client.py.

    Layout: a ``%run common_defs`` cell, a cell with the module's imports
    (``src.common`` imports excluded), and a cell with the module's code
    with top-level imports stripped. When the module file does not exist
    the import cell stays empty and the code cell carries a warning
    comment.
    """
    module_path = PROJECT_ROOT / "src" / "common" / "quickbooks_client.py"
    content = ["# Fabric notebook source", "", generate_metadata_block(), ""]

    def emit_cell(*body: str) -> None:
        # One cell = marker, blank, body lines, blank, metadata, blank.
        content.extend(
            ("# CELL ********************", "", *body, "", generate_cell_metadata(), "")
        )

    # Cell 1: Run common_defs (needed for QUICKBOOKS_API_BASE_URL etc.)
    emit_cell("%run common_defs")

    # Cell 2: Imports
    import_lines = []
    if module_path.exists():
        import_lines = extract_imports_from_module(module_path, skip_imports=[
            "from src.common",
            "import src.common",
        ])
    emit_cell(*import_lines)

    # Cell 3: Inlined code (functions only, top-level imports stripped)
    if module_path.exists():
        code = read_module_code(module_path, skip_imports=[
            "import ",
            "from ",
        ], skip_top_level_only=True)
        emit_cell(code.rstrip())
    else:
        emit_cell("# WARNING: src/common/quickbooks_client.py not found")

    return "\n".join(content)
554
+
555
+
556
def generate_pipeline_notebook(config: NotebookConfig) -> str:
    """Generate a pipeline notebook (bronze/silver/gold) with inlined code.

    Cells, in order:
      * ``%run common_functions``
      * extra ``%run`` cells selected by substrings of the notebook name
        (watermark table helpers, QuickBooks auth/client, and for
        onboard_org a guarded ``%run 30_gold_finalize_accounts``)
      * a PARAMETERS cell, only when ``config.parameters`` is non-empty
      * creation of ``_logger`` / ``_spark`` unless already in globals()
      * the module's own imports that common_functions does not provide
      * the module code with top-level imports stripped
      * the ``run(...)`` call, guarded by ``RUN_MAIN``

    Args:
        config: Notebook to generate; ``layer`` selects the lakehouse
            config, ``module_path`` the code to inline.

    Returns:
        The notebook source as a single string.
    """
    cell_marker = "# CELL ********************"
    name = config.notebook_name

    content = [
        "# Fabric notebook source",
        "",
        generate_metadata_block(LAKEHOUSE_CONFIGS.get(config.layer)),
        "",
    ]

    def add_cell(body_lines: list[str], marker: str = cell_marker) -> None:
        """Append one cell (marker, body, metadata trailer)."""
        content.append(marker)
        content.append("")
        content.extend(body_lines)
        content.append("")
        content.append(generate_cell_metadata())
        content.append("")

    # Cell 1: Run common_functions
    add_cell(["%run common_functions"])

    # Special case: notebooks that need update_watermark_table functions
    if "ingest_from_priority" in name or "post_process_priority_extract" in name:
        add_cell(["%run 10_bronze_update_watermark_table"])

    # Special case: QuickBooks ingestion needs auth and client helpers
    if "ingest_from_quickbooks" in name:
        add_cell(["%run 10_bronze_quickbooks_auth"])
        add_cell(["%run 10_bronze_quickbooks_client"])

    # Special case: onboard_org needs finalize_accounts for globals().
    # Set RUN_MAIN = False so %run loads functions without executing their
    # run() entry points, then restore the caller's value.
    if "onboard_org" in name:
        add_cell([
            "_SAVED_RUN_MAIN = globals().get('RUN_MAIN', True)",
            "RUN_MAIN = False",
        ])
        add_cell(["%run 30_gold_finalize_accounts"])
        add_cell(["RUN_MAIN = _SAVED_RUN_MAIN"])

    # Cell 2: Parameters (if any)
    if config.parameters:
        assignments = []
        for key, value in config.parameters.items():
            # value is already the source text of the default.
            # get_lakehouse_path_func defaults to cf_get_lakehouse_path
            # (available from %run common_functions) instead of None.
            if key == "get_lakehouse_path_func" and value == "None":
                assignments.append(f"{key} = cf_get_lakehouse_path")
            else:
                assignments.append(f"{key} = {value}")
        add_cell(assignments, marker="# PARAMETERS CELL ********************")

    # Cell 3: Setup spark and logger
    add_cell([
        "# Allow callers/tests to inject an existing Spark session / logger before executing this",
        "# notebook (e.g., setting globals()['_spark'] or globals()['_logger']).",
        'if "_logger" not in globals():',
        f' _logger = cf_create_logger("{config.notebook_name}")',
        "",
        'if "_spark" not in globals():',
        " _spark = cf_create_spark_session()",
    ])

    # Cell 4: Pipeline-specific imports (not provided by common_functions)
    if config.module_path.exists():
        # Import statements starting with one of these are dropped.
        provided_prefixes = (
            "from logging import Logger",
            "from functools import reduce",
            "from pyspark.sql import functions as F",
        )
        # Import statements exactly equal to one of these are dropped.
        provided_exact = frozenset((
            "import logging", "import os", "import random", "import re",
            "import shutil", "import uuid", "import zipfile",
            "from datetime import datetime, timedelta, timezone",
        ))

        pipeline_imports = extract_imports_from_module(config.module_path, skip_imports=[
            "from src.common",
            "import src.common",
        ])
        import_cell = [
            imp for imp in pipeline_imports
            if not imp.startswith(provided_prefixes) and imp not in provided_exact
        ]
    else:
        import_cell = ["# No pipeline-specific imports"]
    add_cell(import_cell)

    # Cell 5: Inlined pipeline code
    if config.module_path.exists():
        # Skip top-level imports (already in Cell 4), but keep imports inside functions
        code = read_module_code(config.module_path, skip_imports=[
            "import ",
            "from ",
        ], skip_top_level_only=True)
        add_cell([code.rstrip()])
    else:
        add_cell([f"# WARNING: {config.module_path} not found"])

    # Cell 6: Run the pipeline
    run_args = ["_spark", "_logger"]
    if config.parameters:
        # Pass optional parameters as keyword arguments so notebook-level
        # variable renames or signature reorderings don't silently misbind.
        run_args.extend(f"{param_name}={param_name}" for param_name in config.parameters)
    add_cell([
        "# Run the pipeline",
        'if "RUN_MAIN" not in globals():',
        " RUN_MAIN = True",
        "",
        "if RUN_MAIN:",
        f" run({', '.join(run_args)})",
    ])

    return "\n".join(content)
736
+
737
+
738
def module_imports_fabric_harness(module_path: Path) -> bool:
    """
    True if the test module imports tests.fabric_test_tables.

    Generated Fabric notebooks cannot use ``from tests.*``; when this is True,
    ``generate_test_notebook`` inlines ``tests/fabric_test_tables.py`` so
    ``run_tests`` and helpers resolve in Synapse.

    Both ``from tests.fabric_test_tables import ...`` and
    ``import tests.fabric_test_tables`` are recognised, at any nesting
    level. Returns False when the file does not exist or cannot be read,
    decoded or parsed.
    """
    if not module_path.exists():
        return False
    try:
        tree = ast.parse(module_path.read_text(encoding="utf-8"))
    except (SyntaxError, OSError, ValueError):
        # ValueError covers UnicodeDecodeError (file is not valid UTF-8)
        # and the error ast.parse raises for null bytes on some Python
        # versions; an unreadable module is treated as "no harness import".
        return False

    target = "tests.fabric_test_tables"
    for node in ast.walk(tree):
        if isinstance(node, ast.ImportFrom) and node.module == target:
            return True
        if isinstance(node, ast.Import) and any(alias.name == target for alias in node.names):
            return True
    return False
760
+
761
+
762
def generate_test_notebook(config: NotebookConfig) -> str:
    """Build a test notebook whose cells inline the test module's code.

    Cell order: disable RUN_MAIN, ``%run common_functions``, ``%run`` of
    the production notebook (the test notebook name without its ``test_``
    prefix), logger/Spark setup, the test module's imports, optionally the
    inlined tests/fabric_test_tables.py harness, the test code itself and
    finally the ``run_tests(_spark, _logger)`` call.
    """
    module_path = config.module_path
    fabric_harness_path = PROJECT_ROOT.resolve() / "tests" / "fabric_test_tables.py"

    notebook = [
        "# Fabric notebook source",
        "",
        generate_metadata_block(LAKEHOUSE_CONFIGS.get("tests")),
        "",
    ]

    def emit_cell(*body: str) -> None:
        # One cell = marker, blank, body lines, blank, metadata, blank.
        notebook.extend(
            ("# CELL ********************", "", *body, "", generate_cell_metadata(), "")
        )

    # Cell 1: keep the production notebook from executing its run() on %run
    emit_cell("RUN_MAIN = False")

    # Cell 2: shared helpers
    emit_cell("%run common_functions")

    # Cell 3: production notebook (test_X -> X) for its constants and functions
    emit_cell(f"%run {config.notebook_name.removeprefix('test_')}")

    # Cell 4: Spark session and logger, unless the caller injected them
    emit_cell(
        'if "_logger" not in globals():',
        f' _logger = cf_create_logger("{config.notebook_name}")',
        "",
        'if "_spark" not in globals():',
        " _spark = cf_create_spark_session()",
    )

    # Cell 5: imports of the test module; src.* and tests.* imports are
    # excluded because the %run cells above provide those names
    if module_path.exists():
        test_imports = extract_imports_from_module(module_path, skip_imports=[
            "from src.common",
            "import src.common",
            "from src.silver",
            "from src.bronze",
            "from src.gold",
            "import src.",
            "from tests.",
        ])
    else:
        test_imports = ["# WARNING: Test imports could not be extracted"]
    emit_cell(*test_imports)

    # Cell 6 (optional): Fabric test harness, only when the tests import it
    needs_harness = (
        module_path.exists()
        and module_imports_fabric_harness(module_path)
        and fabric_harness_path.is_file()
    )
    if needs_harness:
        emit_cell(
            "# Inlined from tests/fabric_test_tables.py (Fabric cannot import from tests.*)",
            read_module_code(fabric_harness_path, skip_imports=[]).rstrip(),
        )

    # Cell 7: test code; top-level imports are already in Cell 5, imports
    # nested inside functions are kept
    if module_path.exists():
        test_code = read_module_code(
            module_path,
            skip_imports=["import ", "from "],
            skip_top_level_only=True,
        )
        emit_cell(test_code.rstrip())
    else:
        emit_cell(f"# WARNING: {config.module_path} not found")

    # Cell 8: Run the tests
    emit_cell("# Run the tests", "run_tests(_spark, _logger)")

    return "\n".join(notebook)
884
+
885
+
886
def extract_run_function_parameters(module_path: Path) -> dict:
    """
    Extract all parameters (except spark/logger) from the run() function signature.

    Excludes 'spark' and 'logger' parameters as they are always provided by the notebook.
    Required parameters (without defaults) get an empty string default for the parameters cell.
    Keyword-only parameters (those after ``*``) are included as well, after
    the regular ones.

    Args:
        module_path: Path to the Python module to inspect.

    Returns:
        Dictionary mapping parameter names to their default values (as strings for code generation).
        Empty when the module has no ``run`` function; also empty (with a
        printed warning) when the file is missing or has a syntax error.
    """
    try:
        # Python source files are UTF-8 by default; do not rely on the locale.
        with open(module_path, encoding="utf-8") as f:
            tree = ast.parse(f.read())
    except (SyntaxError, FileNotFoundError) as e:
        print(f"Warning: Could not parse {module_path} for parameters: {e}")
        return {}

    # Find the run() function (first match in ast.walk order)
    run_func = next(
        (
            node for node in ast.walk(tree)
            if isinstance(node, ast.FunctionDef) and node.name == "run"
        ),
        None,
    )
    if run_func is None:
        return {}

    # Skip spark and logger (typically first two positional args)
    skip_params = {"spark", "logger"}
    required_placeholder = '""'

    params = {}
    args = run_func.args

    # Parameters are in args.args, defaults are in args.defaults.
    # Defaults align with the last N parameters.
    first_default = len(args.args) - len(args.defaults)
    for i, arg in enumerate(args.args):
        if arg.arg in skip_params:
            continue
        default_index = i - first_default
        if default_index >= 0:
            # Convert AST default value to string representation
            params[arg.arg] = ast_unparse_default(args.defaults[default_index])
        else:
            # Required parameter without default - use empty string
            params[arg.arg] = required_placeholder

    # Keyword-only parameters: kw_defaults runs parallel to kwonlyargs and
    # holds None for a parameter that has no default.
    for arg, default in zip(args.kwonlyargs, args.kw_defaults):
        if arg.arg in skip_params:
            continue
        if default is None:
            params[arg.arg] = required_placeholder
        else:
            params[arg.arg] = ast_unparse_default(default)

    return params
942
+
943
+
944
def ast_unparse_default(node: ast.AST) -> str:
    """
    Convert an AST node representing a default value to source text.

    ``ast.unparse`` is used for every node it can render. If it raises, plain
    constants are rendered with ``repr`` (so strings are quoted and escaped
    the same way ``ast.unparse`` would), and anything else falls back to the
    literal text ``None`` so that the generated code stays syntactically valid.

    Args:
        node: The AST node taken from a function's ``defaults`` list.

    Returns:
        Source text for the default value.
    """
    try:
        return ast.unparse(node)
    except Exception:
        # Deliberate best-effort: a malformed node must not abort notebook
        # generation, so fall through to the manual rendering below.
        pass

    if isinstance(node, ast.Constant):
        # repr() yields valid source for None, bool, numbers, str and bytes.
        return repr(node.value)

    # Nothing usable could be produced; emit a valid placeholder rather than
    # an object repr such as "<ast.Name object at 0x...>".
    return "None"
990
+
991
+
992
def find_modules_with_run_function(src_dir: Path) -> list[NotebookConfig]:
    """
    Find all Python modules that have a run() function and extract their parameters.

    The ``bronze``, ``silver``, ``gold`` and ``backup`` sub-directories of
    ``src_dir`` are scanned (in that order) for ``*.py`` files whose name does
    not start with an underscore. Files within a layer are visited in sorted
    order so the result is deterministic across file systems.

    Args:
        src_dir: Directory containing the layer sub-directories.

    Returns:
        One NotebookConfig per module that defines a function named ``run``.
        Files that cannot be read or parsed are reported with a warning and
        skipped.
    """
    configs = []

    for layer in ["bronze", "silver", "gold", "backup"]:
        layer_dir = src_dir / layer
        if not layer_dir.exists():
            continue

        # glob() yields entries in arbitrary order; sort for reproducible output.
        for py_file in sorted(layer_dir.glob("*.py")):
            if py_file.name.startswith("_"):
                continue

            try:
                # Parse bytes so decoding follows the file's coding cookie
                # (UTF-8 by default) instead of the platform locale.
                with open(py_file, "rb") as f:
                    tree = ast.parse(f.read(), filename=str(py_file))
            except (SyntaxError, ValueError, OSError) as e:
                print(f"Warning: Could not parse {py_file}: {e}")
                continue

            has_run = any(
                isinstance(node, ast.FunctionDef) and node.name == "run"
                for node in ast.walk(tree)
            )
            if not has_run:
                continue

            # Generate notebook name from layer prefix + module name.
            prefix = LAYER_PREFIXES.get(layer, "")
            notebook_name = f"{prefix}{py_file.stem}"

            # Extract parameters from the run() function signature.
            parameters = extract_run_function_parameters(py_file)

            configs.append(NotebookConfig(
                module_path=py_file,
                notebook_name=notebook_name,
                layer=layer,
                has_run_function=True,
                parameters=parameters if parameters else None,
            ))

    return configs
1036
+
1037
+
1038
def validate_no_src_imports(notebook_name: str, content: str, warn_only: bool = False):
    """
    Ensure no ``src`` package imports leaked into generated notebook content.

    The content is scanned line by line. Lines inside a multi-line
    triple-quoted string and comment lines are ignored. Any remaining line
    whose stripped text imports ``src`` or a ``src.*`` sub-module
    (``from src.x import y``, ``from src import y``, ``import src.x``,
    ``import src``) is reported.

    Args:
        notebook_name: Name used in the diagnostic message.
        content: Full generated notebook text.
        warn_only: When true, print a single warning and return instead of raising.

    Raises:
        ValueError: On the first offending line when ``warn_only`` is false.
    """
    # Delimiter of the triple-quoted string currently open, or None.
    open_quote = None

    for i, line in enumerate(content.split("\n"), 1):
        stripped = line.strip()

        if open_quote is not None:
            # Only the delimiter that opened the string can close it.
            if open_quote in stripped:
                open_quote = None
            continue

        for quote in ('"""', "'''"):
            if quote in stripped:
                # A single occurrence means the string continues on later
                # lines; two occurrences open and close on the same line.
                if stripped.count(quote) == 1:
                    open_quote = quote
                break
        if open_quote is not None:
            continue

        # Skip comments
        if stripped.startswith("#"):
            continue

        tokens = stripped.split()
        is_src_import = (
            len(tokens) >= 2
            and tokens[0] in ("from", "import")
            and (tokens[1].rstrip(",") == "src" or tokens[1].startswith("src."))
        )
        if not is_src_import:
            continue

        msg = (
            f"Notebook '{notebook_name}' line {i} contains src import: {stripped}\n"
            f" src.* imports don't work in Fabric. Use try/except (ImportError, FileNotFoundError) "
            f"or move the functionality to common_functions."
        )
        if warn_only:
            print(f" WARNING: {msg}")
            return  # One warning per notebook is enough
        raise ValueError(msg)
1071
+
1072
+
1073
def write_notebook(output_dir: Path, notebook_name: str, content: str, dry_run: bool = False):
    """
    Write a notebook to ``<output_dir>/<notebook_name>.Notebook/notebook-content.py``.

    The content is first checked with validate_no_src_imports; for notebooks
    whose name starts with ``test_`` that check only warns.

    Args:
        output_dir: Directory that will contain the ``.Notebook`` directory.
        notebook_name: Notebook name (without the ``.Notebook`` suffix).
        content: Full notebook text to write.
        dry_run: When true, only print the target directory; nothing is written.
    """
    is_test = notebook_name.startswith("test_")
    validate_no_src_imports(notebook_name, content, warn_only=is_test)
    notebook_dir = output_dir / f"{notebook_name}.Notebook"

    if dry_run:
        print(f" Would create: {notebook_dir}")
        return

    notebook_dir.mkdir(parents=True, exist_ok=True)
    notebook_file = notebook_dir / "notebook-content.py"

    # Explicit UTF-8 so inlined source containing non-ASCII characters is
    # written identically regardless of the platform's locale encoding.
    with open(notebook_file, "w", encoding="utf-8") as f:
        f.write(content)

    print(f" Created: {notebook_dir}")
1090
+
1091
+
1092
##############################################################################
# AST-based notebook validation
##############################################################################

import builtins as _builtins_module

# Names supplied by the Fabric runtime itself; no notebook defines them.
FABRIC_RUNTIME_NAMES = {
    "_spark",
    "_logger",
    "spark",
    "notebookutils",
    "mssparkutils",
    "display",
    "RUN_MAIN",
}

# Per-notebook extra names that validation should treat as known, keyed by
# pipeline notebook name (e.g. names provided by an optional %run or by common
# modules that are not inlined into common_functions).
PIPELINE_VALIDATION_ALLOWLIST: dict[str, set[str]] = {}

# Standard-library / PySpark names treated as imported by common_functions.
# They are considered available because common_functions has an imports cell.
COMMON_IMPORTS_NAMES = {
    # Standard library modules imported at top of common_functions
    "json",
    "logging",
    "os",
    "random",
    "re",
    "shutil",
    "uuid",
    "zipfile",
    "builtins",
    "datetime",
    "timedelta",
    "timezone",
    "reduce",
    "Logger",
    # PySpark
    "DataFrame",
    "SparkSession",
    "F",
    "col",
    "lit",
    "udf",
    "BooleanType",
    "LongType",
    "StringType",
    "StructField",
    "StructType",
    "DeltaTable",
    # Fabric-specific
    "mssparkutils",
}
1121
+
1122
+
1123
def _extract_code_cells(content: str) -> list[tuple[int, str]]:
    """Split notebook content into its code cells.

    A cell starts after a ``# CELL **`` / ``# PARAMETERS CELL **`` marker line
    and ends at the next marker, at a ``# METADATA **`` line, or at the end of
    the content. Lines starting with ``# META `` are dropped. ``%run`` lines
    are left in place; callers filter them.

    Returns a list of ``(first_line_number, cell_text)`` tuples where the line
    number is 1-based and refers to the line following the cell marker. Cells
    with no lines are omitted.
    """
    cell_markers = ("# CELL **", "# PARAMETERS CELL **")

    found: list[tuple[int, str]] = []
    pending: list[str] = []
    first_line = 0
    collecting = False

    for lineno, text in enumerate(content.split("\n"), 1):
        opens_cell = text.startswith(cell_markers)

        if opens_cell or text.startswith("# METADATA **"):
            # Either boundary closes whatever cell was being collected.
            if collecting and pending:
                found.append((first_line, "\n".join(pending)))
            pending = []
            collecting = opens_cell
            if opens_cell:
                first_line = lineno + 1  # code starts on the next line
        elif text.startswith("# META "):
            pass
        elif collecting:
            pending.append(text)

    # Flush a cell that runs to the end of the content.
    if collecting and pending:
        found.append((first_line, "\n".join(pending)))

    return found
1158
+
1159
+
1160
def _collect_defined_names_from_stmts(nodes: list[ast.AST], names: set[str]) -> None:
    """Collect names defined by a list of statements (e.g. a try/except body).

    Only the statements themselves are inspected; their nested bodies are not
    descended into. Recognised definitions are function and class definitions,
    imports, plain/annotated/augmented assignments, and the targets of
    ``for`` and ``with`` statements. Unpacking targets are walked fully, so
    starred and nested names (``a, (b, *c) = ...``) are included, while
    attribute and subscript targets (``obj.x = ...``, ``d[k] = ...``) are not.

    Args:
        nodes: Statements to inspect.
        names: Set that receives the collected names (mutated in place).
    """
    for node in nodes:
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
            names.add(node.name)
            continue
        if isinstance(node, ast.Import):
            for alias in node.names:
                # "import a.b" binds "a"; "import a.b as c" binds "c".
                names.add(alias.asname if alias.asname else alias.name.split(".")[0])
            continue
        if isinstance(node, ast.ImportFrom):
            for alias in node.names:
                names.add(alias.asname if alias.asname else alias.name)
            continue

        if isinstance(node, ast.Assign):
            targets = node.targets
        elif isinstance(node, (ast.AnnAssign, ast.AugAssign)):
            targets = [node.target]
        elif isinstance(node, (ast.For, ast.AsyncFor)):
            targets = [node.target]
        elif isinstance(node, (ast.With, ast.AsyncWith)):
            targets = [
                item.optional_vars
                for item in node.items
                if item.optional_vars is not None
            ]
        else:
            continue

        for target in targets:
            # Names being bound carry a Store context; names merely read
            # inside the target (e.g. "obj" in "obj.attr = 1") carry Load.
            for sub in ast.walk(target):
                if isinstance(sub, ast.Name) and isinstance(sub.ctx, ast.Store):
                    names.add(sub.id)
1187
+
1188
+
1189
def _collect_defined_names_from_code(code: str) -> set[str]:
    """Collect all names defined at module level in the given code.

    Top-level function/class definitions, imports, assignments (plain,
    annotated, augmented) and ``for``/``with`` targets are collected.
    Unpacking targets are walked fully so starred and nested names are
    included. For a top-level ``try`` statement, only names defined directly
    in its ``except`` handler bodies are collected.

    Args:
        code: Python source text.

    Returns:
        The set of collected names; empty if the code does not parse.
    """
    try:
        tree = ast.parse(code)
    except SyntaxError:
        return set()

    names: set[str] = set()

    def add_bound_names(targets) -> None:
        # Names being bound carry a Store context; attribute/subscript
        # targets only contain Load names and therefore add nothing.
        for target in targets:
            for sub in ast.walk(target):
                if isinstance(sub, ast.Name) and isinstance(sub.ctx, ast.Store):
                    names.add(sub.id)

    for node in tree.body:
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
            names.add(node.name)
        elif isinstance(node, ast.Assign):
            add_bound_names(node.targets)
        elif isinstance(node, (ast.AnnAssign, ast.AugAssign)):
            add_bound_names([node.target])
        elif isinstance(node, ast.Import):
            for alias in node.names:
                names.add(alias.asname if alias.asname else alias.name.split(".")[0])
        elif isinstance(node, ast.ImportFrom):
            for alias in node.names:
                names.add(alias.asname if alias.asname else alias.name)
        elif isinstance(node, (ast.For, ast.AsyncFor)):
            # Top-level for loop targets
            add_bound_names([node.target])
        elif isinstance(node, (ast.With, ast.AsyncWith)):
            add_bound_names(
                item.optional_vars
                for item in node.items
                if item.optional_vars is not None
            )
        elif isinstance(node, ast.Try):
            # Names defined in except blocks (e.g. Fabric fallback helpers) are visible module-wide
            for handler in node.handlers:
                _collect_defined_names_from_stmts(handler.body, names)
    return names
1228
+
1229
+
1230
def _collect_targets(node: ast.AST, names: set[str]):
    """Collect the names bound by a ``for`` or ``with`` statement header.

    For a ``for`` loop the loop target is inspected; for a ``with`` statement
    every ``as`` target is inspected. Targets are walked fully, so nested and
    starred unpacking (``for a, (b, *c) in ...``, ``with f() as (x, y)``) is
    covered. Nodes of any other type are ignored.

    Args:
        node: The statement node to inspect.
        names: Set that receives the bound names (mutated in place).
    """
    if isinstance(node, (ast.For, ast.AsyncFor)):
        targets = [node.target]
    elif isinstance(node, (ast.With, ast.AsyncWith)):
        targets = [
            item.optional_vars
            for item in node.items
            if item.optional_vars is not None
        ]
    else:
        return

    for target in targets:
        # Only Store-context names are bound; e.g. "obj" in
        # "for obj.attr in ..." is read, not defined.
        for sub in ast.walk(target):
            if isinstance(sub, ast.Name) and isinstance(sub.ctx, ast.Store):
                names.add(sub.id)
1243
+
1244
+
1245
def _collect_notebook_exports(content: str) -> set[str]:
    """Return the union of module-level names defined by every code cell of a notebook."""
    exported: set[str] = set()
    for _start, cell_source in _extract_code_cells(content):
        # %run lines are notebook magics, not Python; drop them before parsing.
        python_only = "\n".join(
            row
            for row in cell_source.split("\n")
            if not row.strip().startswith("%run")
        )
        exported.update(_collect_defined_names_from_code(python_only))
    return exported
1254
+
1255
+
1256
def _collect_local_names(func_node: ast.FunctionDef) -> set[str]:
    """Collect all locally defined names within a function.

    The result contains the function's own parameters plus, from anywhere in
    its body (including nested scopes): assigned names, nested function and
    class names, parameters of nested functions and lambdas, imported names,
    and ``except ... as`` names. The function's own name is not included.

    Args:
        func_node: The function definition node to inspect.

    Returns:
        Set of names considered local to the function.
    """
    local_names = _argument_names(func_node.args)

    for node in ast.walk(func_node):
        if isinstance(node, ast.Name):
            # Store-context names cover assignments, loop targets, with
            # targets, walrus targets and comprehension variables.
            if isinstance(node.ctx, ast.Store):
                local_names.add(node.id)
        elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            if node is not func_node:  # Skip the function itself
                local_names.add(node.name)
                local_names |= _argument_names(node.args)
        elif isinstance(node, ast.Lambda):
            local_names |= _argument_names(node.args)
        elif isinstance(node, ast.ClassDef):
            local_names.add(node.name)
        elif isinstance(node, ast.Import):
            for alias in node.names:
                local_names.add(alias.asname if alias.asname else alias.name.split(".")[0])
        elif isinstance(node, ast.ImportFrom):
            for alias in node.names:
                local_names.add(alias.asname if alias.asname else alias.name)
        elif isinstance(node, ast.ExceptHandler):
            # The "as" name of a handler is a plain string, not a Name node.
            if node.name:
                local_names.add(node.name)

    return local_names


def _argument_names(arguments: ast.arguments) -> set[str]:
    """Return every parameter name declared by an ``ast.arguments`` node."""
    declared = {
        arg.arg
        for arg in (*arguments.posonlyargs, *arguments.args, *arguments.kwonlyargs)
    }
    if arguments.vararg:
        declared.add(arguments.vararg.arg)
    if arguments.kwarg:
        declared.add(arguments.kwarg.arg)
    return declared
1316
+
1317
+
1318
@dataclass
class UnresolvedName:
    """An unresolved name reference found during validation."""
    # Identifier that was read but could not be resolved.
    name: str
    line: int  # Line number in the notebook file
    func_name: str | None  # Function where it was found, or None for module level
1324
+
1325
+
1326
def _find_undefined_names(code: str, line_offset: int, known_names: set[str]) -> list[UnresolvedName]:
    """Report names that a cell's code reads without any known definition.

    Only three kinds of top-level statement are examined: function
    definitions (checked against module-level names plus the function's own
    locals), bare expression statements, and ``if`` statements (delegated to
    _check_node_for_undefined). Names starting with an underscore are never
    reported. Code that does not parse yields no issues.

    Args:
        code: Source text of one cell.
        line_offset: Notebook line number of the cell's first line.
        known_names: Names already known from outside the cell.

    Returns:
        One UnresolvedName per unresolved read, in traversal order.
    """
    try:
        tree = ast.parse(code)
    except SyntaxError:
        return []

    # Everything resolvable at module level for this cell.
    resolvable = (
        known_names
        | set(dir(_builtins_module))
        | FABRIC_RUNTIME_NAMES
        | _collect_defined_names_from_code(code)
    )

    issues: list[UnresolvedName] = []

    for stmt in ast.iter_child_nodes(tree):
        if isinstance(stmt, ast.If):
            # Module-level if statements (like RUN_MAIN guard)
            _check_node_for_undefined(stmt, line_offset, resolvable, issues)
            continue

        if isinstance(stmt, (ast.FunctionDef, ast.AsyncFunctionDef)):
            scope = resolvable | _collect_local_names(stmt)
            owner = stmt.name
        elif isinstance(stmt, ast.Expr):
            scope = resolvable
            owner = None
        else:
            continue

        for ref in ast.walk(stmt):
            if not (isinstance(ref, ast.Name) and isinstance(ref.ctx, ast.Load)):
                continue
            if ref.id.startswith("_") or ref.id in scope:
                continue
            issues.append(UnresolvedName(
                name=ref.id,
                line=line_offset + ref.lineno - 1,
                func_name=owner,
            ))

    return issues
1377
+
1378
+
1379
def _check_node_for_undefined(
    node: ast.AST, line_offset: int, known: set[str], issues: list[UnresolvedName]
):
    """Check an AST node tree for undefined name references.

    Every name bound anywhere inside ``node`` is treated as known for the
    whole block: assignment/loop/comprehension targets, function and class
    names, function and lambda parameters, imported names and
    ``except ... as`` names. Any other name read inside the block that is not
    in ``known`` and does not start with an underscore is appended to
    ``issues`` with ``func_name=None``.

    Args:
        node: Root of the subtree to check (e.g. a module-level ``if``).
        line_offset: Notebook line number of the enclosing cell's first line.
        known: Names already resolvable outside this block.
        issues: List that receives the findings (mutated in place).
    """
    # First collect everything the block itself binds.
    block_locals: set[str] = set()
    for child in ast.walk(node):
        if isinstance(child, ast.Name):
            if isinstance(child.ctx, ast.Store):
                block_locals.add(child.id)
        elif isinstance(child, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
            block_locals.add(child.name)
        elif isinstance(child, ast.arg):
            # Parameters of nested functions and lambdas are ast.arg nodes,
            # not Name nodes, so they need explicit handling.
            block_locals.add(child.arg)
        elif isinstance(child, ast.Import):
            for alias in child.names:
                block_locals.add(alias.asname if alias.asname else alias.name.split(".")[0])
        elif isinstance(child, ast.ImportFrom):
            for alias in child.names:
                block_locals.add(alias.asname if alias.asname else alias.name)
        elif isinstance(child, ast.ExceptHandler):
            # "except X as e" stores the name as a plain string.
            if child.name:
                block_locals.add(child.name)

    effective_known = known | block_locals

    for child in ast.walk(node):
        if isinstance(child, ast.Name) and isinstance(child.ctx, ast.Load):
            if child.id.startswith("_") or child.id in effective_known:
                continue
            issues.append(UnresolvedName(
                name=child.id,
                line=line_offset + child.lineno - 1,
                func_name=None,
            ))
1411
+
1412
+
1413
def validate_all_notebooks(generated: dict[str, str]) -> int:
    """
    Validate all generated notebooks for undefined name references.

    Validation order is: ``common_functions`` (if present), then every
    pipeline notebook, then every notebook whose name starts with ``test_``.
    Names exported by ``common_defs`` / ``common_functions`` and by notebooks
    referenced through ``%run`` lines are treated as known.

    Args:
        generated: Dict mapping notebook_name -> content

    Returns:
        Number of errors found in production notebooks
    """
    banner = "=" * 50
    print("\n" + banner)
    print("Validating notebooks for undefined names...")
    print(banner)

    shared_notebooks = ("common_defs", "common_functions")

    # Exports of the two shared notebooks (empty when they were not generated).
    common_defs_exports: set[str] = (
        _collect_notebook_exports(generated["common_defs"])
        if "common_defs" in generated else set()
    )
    common_functions_exports: set[str] = (
        _collect_notebook_exports(generated["common_functions"])
        if "common_functions" in generated else set()
    )

    # Base known names for pipeline notebooks
    base_known = common_defs_exports | common_functions_exports | COMMON_IMPORTS_NAMES

    errors_count = 0

    # common_functions is validated against common_defs only.
    if "common_functions" in generated:
        errors_count += _validate_single_notebook(
            "common_functions", generated["common_functions"],
            common_defs_exports | COMMON_IMPORTS_NAMES,
            is_test=False,
        )

    # Pipeline notebooks: everything that is neither shared nor a test.
    pipeline_names = [
        name for name in generated
        if name not in shared_notebooks and not name.startswith("test_")
    ]
    # Collect exports from each pipeline notebook for cross-referencing
    pipeline_exports: dict[str, set[str]] = {
        name: _collect_notebook_exports(generated[name]) for name in pipeline_names
    }

    for name in pipeline_names:
        content = generated[name]
        known = set(base_known)
        # Add per-notebook allowlist (names provided at runtime, e.g. optional %run)
        known |= PIPELINE_VALIDATION_ALLOWLIST.get(name, set())
        known |= _exports_of_run_dependencies(content, pipeline_exports)
        errors_count += _validate_single_notebook(name, content, known, is_test=False)

    # Names used in the generated "run tests" cell that are always expected
    test_runner_names = {"unittest", "sys", "run_tests"}
    # Names provided by tests.conftest when run with pytest; in Fabric they may be
    # defined in try/except ImportError fallback (validator collects those from Try handlers)
    test_conftest_names = {"clean_default_db_table", "force_create_default_db_table"}

    for name, content in generated.items():
        if not name.startswith("test_"):
            continue
        known = set(base_known) | test_runner_names | test_conftest_names
        # Add exports from %run dependencies (production notebook + any others)
        known |= _exports_of_run_dependencies(content, pipeline_exports)
        # Test notebook issues are warnings, don't count as errors
        _validate_single_notebook(name, content, known, is_test=True)

    return errors_count


def _exports_of_run_dependencies(
    content: str, pipeline_exports: dict[str, set[str]]
) -> set[str]:
    """Return the exports of every pipeline notebook pulled in via a ``%run`` line."""
    shared_directives = ("%run common_functions", "%run common_defs")
    collected: set[str] = set()
    for raw in content.split("\n"):
        directive = raw.strip()
        if not directive.startswith("%run ") or directive in shared_directives:
            continue
        dep_name = directive[5:].strip()
        if dep_name in pipeline_exports:
            collected |= pipeline_exports[dep_name]
    return collected
1500
+
1501
+
1502
def _validate_single_notebook(
    notebook_name: str, content: str, known_names: set[str], is_test: bool
) -> int:
    """Validate a single notebook and print any undefined names found.

    Args:
        notebook_name: Name used in the printed report.
        content: Full notebook text.
        known_names: Names resolvable from outside the notebook.
        is_test: When true, findings are printed as warnings and not counted.

    Returns:
        Number of unique issues found, or 0 when ``is_test`` is true.
    """
    # Names defined in ANY cell are known everywhere, so cross-cell
    # references resolve correctly.
    effective_known = known_names | _collect_notebook_exports(content)

    seen: set[tuple[str, int]] = set()
    unique_issues: list[UnresolvedName] = []

    for offset, cell_code in _extract_code_cells(content):
        # Blank out %run lines instead of removing them: removal would shift
        # every following line up and make the reported line numbers wrong.
        code = "\n".join(
            "" if row.strip().startswith("%run") else row
            for row in cell_code.split("\n")
        )
        if not code.strip():
            continue

        for issue in _find_undefined_names(code, offset, effective_known):
            # Deduplicate by (name, line)
            key = (issue.name, issue.line)
            if key not in seen:
                seen.add(key)
                unique_issues.append(issue)

    if unique_issues:
        prefix = "WARNING" if is_test else "ERROR"
        print(f"\n {prefix}: {notebook_name} has {len(unique_issues)} undefined name(s):")
        for issue in unique_issues:
            loc = f"line {issue.line}"
            if issue.func_name:
                loc += f" in {issue.func_name}()"
            print(f" - '{issue.name}' at {loc}")

    return 0 if is_test else len(unique_issues)
1542
+
1543
+
1544
def main():
    """Command-line entry point: generate notebooks, then validate them.

    Parses the CLI arguments, sets the module-level PROJECT_ROOT and
    LAKEHOUSE_CONFIGS, generates the requested groups of notebooks (common,
    QuickBooks helpers, pipeline, tests) and finally validates everything that
    was generated unless ``--dry-run`` was given. Exits with status 1 when
    ``--strict-validate`` is set and production notebooks have undefined names.
    """
    parser = argparse.ArgumentParser(description="Generate Fabric notebooks from Python modules")
    parser.add_argument("--project-root", type=Path, default=Path.cwd(),
                        help="Consumer project root containing src/, config/, etc. (default: CWD)")
    parser.add_argument("--only", choices=["bronze", "silver", "gold", "backup", "common", "tests"],
                        help="Generate only notebooks for specified layer")
    parser.add_argument("--all", action="store_true",
                        help="Regenerate all notebooks including common")
    parser.add_argument("--dry-run", action="store_true",
                        help="Show what would be generated without writing files")
    parser.add_argument("--output", type=Path, default=None,
                        help="Output directory (default: project root)")
    parser.add_argument("--strict-validate", action="store_true",
                        help="Exit with non-zero status if validation finds issues in production notebooks")
    args = parser.parse_args()

    global PROJECT_ROOT, LAKEHOUSE_CONFIGS
    PROJECT_ROOT = args.project_root.resolve()
    LAKEHOUSE_CONFIGS = load_lakehouse_config(PROJECT_ROOT)

    project_root = PROJECT_ROOT
    src_dir = project_root / "src"
    output_dir = args.output or project_root

    pipeline_layers = ("bronze", "silver", "gold", "backup")

    print("Fabric Notebook Generator")
    print("=" * 50)

    # Track all generated notebook contents for validation
    generated_notebooks: dict[str, str] = {}

    # Module discovery result, filled on first use so src/ is scanned (and
    # parse warnings are printed) at most once per run.
    module_configs = None

    # Generate common notebooks
    if args.only in [None, "common"] or args.all:
        print("\nGenerating common notebooks...")

        # common_defs
        content = generate_common_defs_notebook()
        write_notebook(output_dir / "common", "common_defs", content, args.dry_run)
        generated_notebooks["common_defs"] = content

        # common_functions
        content = generate_common_functions_notebook()
        write_notebook(output_dir / "common", "common_functions", content, args.dry_run)
        generated_notebooks["common_functions"] = content

    # Generate QuickBooks helper notebooks (placed in bronze/) only when the
    # consumer ships the corresponding source modules. They're CashHero-flavored
    # and won't be needed by most consumers.
    if args.only in [None, "bronze"] or args.all:
        qb_auth_src = project_root / "src" / "common" / "quickbooks_auth.py"
        qb_client_src = project_root / "src" / "common" / "quickbooks_client.py"
        if qb_auth_src.exists():
            content = generate_quickbooks_auth_notebook()
            write_notebook(output_dir / "bronze", "10_bronze_quickbooks_auth", content, args.dry_run)
            generated_notebooks["10_bronze_quickbooks_auth"] = content
        if qb_client_src.exists():
            content = generate_quickbooks_client_notebook()
            write_notebook(output_dir / "bronze", "10_bronze_quickbooks_client", content, args.dry_run)
            generated_notebooks["10_bronze_quickbooks_client"] = content

    # Generate pipeline notebooks
    if args.only in [None, *pipeline_layers] or args.all:
        if module_configs is None:
            module_configs = find_modules_with_run_function(src_dir)
        configs = module_configs

        # Restrict to one layer only when --only actually names a layer.
        # With "--all --only common|tests" no module has that layer, so
        # filtering would drop every pipeline notebook and print a misleading
        # "no modules found" message.
        if args.only in pipeline_layers:
            configs = [c for c in configs if c.layer == args.only]

        if configs:
            print(f"\nGenerating {len(configs)} pipeline notebooks...")
            for config in configs:
                content = generate_pipeline_notebook(config)
                # Write production notebooks under layer dir (bronze/, silver/, gold/)
                layer_output_dir = output_dir / config.layer
                write_notebook(layer_output_dir, config.notebook_name, content, args.dry_run)
                generated_notebooks[config.notebook_name] = content
        else:
            print("\nNo pipeline modules found with run() functions.")
            print("Create modules in src/bronze/, src/silver/, or src/gold/ with a run() function.")

    # Generate test notebooks for migrated modules
    if args.only in [None, "tests"] or args.all:
        tests_dir = project_root / "tests"
        if tests_dir.exists():
            # Find test files that correspond to migrated modules
            if module_configs is None:
                module_configs = find_modules_with_run_function(src_dir)
            test_notebooks_generated = 0

            for config in module_configs:
                # Map module name to test file
                # e.g., src/silver/transform_accounts.py -> tests/test_silver_transform_accounts.py
                module_name = config.module_path.stem  # transform_accounts
                layer = config.layer  # silver
                test_file = tests_dir / f"test_{layer}_{module_name}.py"

                # Also try without layer prefix
                if not test_file.exists():
                    test_file = tests_dir / f"test_{module_name}.py"

                if not test_file.exists():
                    continue

                # Generate test notebook with the notebook naming convention
                # e.g., test_20_silver_transform_accounts
                test_notebook_name = f"test_{config.notebook_name}"

                test_config = NotebookConfig(
                    module_path=test_file,
                    notebook_name=test_notebook_name,
                    layer="tests",
                    has_run_function=False,
                )
                content = generate_test_notebook(test_config)
                # Write test notebooks under layer/tests/ (e.g. bronze/tests/, silver/tests/, gold/tests/)
                test_output_dir = output_dir / config.layer / "tests"
                write_notebook(test_output_dir, test_config.notebook_name, content, args.dry_run)
                generated_notebooks[test_config.notebook_name] = content
                test_notebooks_generated += 1

            if test_notebooks_generated > 0:
                print(f"\nGenerated {test_notebooks_generated} test notebooks for migrated modules")

    # Validate all generated notebooks
    if not args.dry_run and generated_notebooks:
        error_count = validate_all_notebooks(generated_notebooks)
        if error_count > 0:
            print(f"\nValidation: {error_count} undefined name(s) found in production notebooks.")
            if args.strict_validate:
                print("Fix the source modules and regenerate (--strict-validate is on).")
                sys.exit(1)
        else:
            print("\nValidation passed: no undefined names found.")

    print("\nDone!")
1674
+
1675
+
1676
# Run the generator only when this file is executed as a script.
if __name__ == "__main__":
    main()