algomancy-quickstart 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. algomancy_quickstart/__init__.py +2 -0
  2. algomancy_quickstart/asset_manager.py +202 -0
  3. algomancy_quickstart/data_inference.py +517 -0
  4. algomancy_quickstart/main.py +62 -0
  5. algomancy_quickstart/quickstart.py +683 -0
  6. algomancy_quickstart/styling_wizard.py +347 -0
  7. algomancy_quickstart/templates/__init__.py +0 -0
  8. algomancy_quickstart/templates/algorithm.py.jinja +104 -0
  9. algomancy_quickstart/templates/assets/CQM-logo-white.png +0 -0
  10. algomancy_quickstart/templates/assets/cqm-button-white.png +0 -0
  11. algomancy_quickstart/templates/assets/cqm-button.png +0 -0
  12. algomancy_quickstart/templates/assets/cqm-logo.png +0 -0
  13. algomancy_quickstart/templates/assets/css/button_colors.css +285 -0
  14. algomancy_quickstart/templates/assets/css/cqm_loader.css +47 -0
  15. algomancy_quickstart/templates/assets/css/sidebar_layout.css +189 -0
  16. algomancy_quickstart/templates/assets/css/theme_colors.css +90 -0
  17. algomancy_quickstart/templates/assets/letter-c.svg +4 -0
  18. algomancy_quickstart/templates/assets/letter-m.svg +4 -0
  19. algomancy_quickstart/templates/assets/letter-q.svg +4 -0
  20. algomancy_quickstart/templates/assets/letters/letter-c.png +0 -0
  21. algomancy_quickstart/templates/assets/letters/letter-m.png +0 -0
  22. algomancy_quickstart/templates/assets/letters/letter-q.png +0 -0
  23. algomancy_quickstart/templates/assets/pepsi_girl.jpeg +0 -0
  24. algomancy_quickstart/templates/assets/style.css +421 -0
  25. algomancy_quickstart/templates/compare_page.py.jinja +133 -0
  26. algomancy_quickstart/templates/data_page.py.jinja +94 -0
  27. algomancy_quickstart/templates/etl_factory.py.jinja +108 -0
  28. algomancy_quickstart/templates/etl_factory_generated.py.jinja +82 -0
  29. algomancy_quickstart/templates/generated_schemas.py.jinja +55 -0
  30. algomancy_quickstart/templates/home_page.py.jinja +65 -0
  31. algomancy_quickstart/templates/kpi.py.jinja +76 -0
  32. algomancy_quickstart/templates/main.py.jinja +42 -0
  33. algomancy_quickstart/templates/main_custom.py.jinja +55 -0
  34. algomancy_quickstart/templates/main_generated_etl.py.jinja +72 -0
  35. algomancy_quickstart/templates/main_with_styling.py.jinja +83 -0
  36. algomancy_quickstart/templates/overview_page.py.jinja +98 -0
  37. algomancy_quickstart/templates/scenario_page.py.jinja +77 -0
  38. algomancy_quickstart/templates/schema.py.jinja +58 -0
  39. algomancy_quickstart/templates/styling_config.py.jinja +53 -0
  40. algomancy_quickstart-0.7.0.dist-info/METADATA +29 -0
  41. algomancy_quickstart-0.7.0.dist-info/RECORD +42 -0
  42. algomancy_quickstart-0.7.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,517 @@
1
+ """
2
+ Data inference utilities for detecting file types and inferring schemas.
3
+ """
4
+
5
+ import pandas as pd
6
+ from pathlib import Path
7
+ from typing import Dict, List
8
+ import warnings
9
+ import click
10
+
11
+ from algomancy_data import DataType, FileExtension
12
+
13
+
14
+ class DataFileInfo:
15
+ """Information about a detected data file."""
16
+
17
+ def __init__(
18
+ self,
19
+ file_path: Path,
20
+ file_name: str,
21
+ extension: FileExtension,
22
+ sheet_names: List[str] = None,
23
+ ):
24
+ self.file_path = file_path
25
+ self.file_name = file_name
26
+ self.extension = extension
27
+ self.sheet_names = sheet_names or []
28
+ self.inferred_schemas: Dict[str, Dict[str, DataType]] = {}
29
+ # Per-schema (per-sheet for MULTI, "default" for SINGLE) primary-key
30
+ # candidates inferred from sample data. Consumed by the schema
31
+ # template to emit ``primary_key=True`` on the right columns.
32
+ self.primary_key_columns: Dict[str, List[str]] = {}
33
+
34
+ # User configuration
35
+ self.csv_separator: str = "," # Default separator
36
+ self.selected_sheets: List[str] = [] # For Excel files
37
+ self.skip_file: bool = False # User can choose to skip a file
38
+
39
+ @property
40
+ def is_multi_sheet(self) -> bool:
41
+ """Check if this file contains multiple sheets to be extracted."""
42
+ return len(self.selected_sheets) > 1
43
+
44
+ @property
45
+ def sheets_to_extract(self) -> List[str]:
46
+ """Get list of sheets that should be extracted (for Excel files)."""
47
+ return self.selected_sheets if self.selected_sheets else self.sheet_names
48
+
49
+
50
+ class SchemaInferenceEngine:
51
+ """Engine for inferring data schemas from files."""
52
+
53
+ # Mapping from file extensions to FileExtension enum
54
+ EXTENSION_MAP = {
55
+ ".csv": FileExtension.CSV,
56
+ ".xlsx": FileExtension.XLSX,
57
+ ".json": FileExtension.JSON,
58
+ }
59
+
60
+ def __init__(self, sample_rows: int = 100):
61
+ self.sample_rows = sample_rows
62
+
63
def scan_directory(self, directory: Path) -> List[DataFileInfo]:
    """
    Scan a directory for supported data files.

    Only direct children are inspected; sub-directories are not descended
    into. Entries whose suffix is not mapped by ``_get_file_extension`` are
    ignored.

    Args:
        directory: Path to scan for data files.

    Returns:
        List of DataFileInfo objects for detected files, ordered by path.
        Empty when ``directory`` is missing or is not a directory.
    """
    # ``is_dir()`` is False for missing paths and for regular files, so a
    # file path passed by mistake no longer makes ``iterdir()`` raise.
    if not directory.is_dir():
        return []

    files: List[DataFileInfo] = []

    # Sort so the result does not depend on filesystem enumeration order.
    for file_path in sorted(directory.iterdir()):
        if not file_path.is_file():
            continue

        extension = self._get_file_extension(file_path)
        if extension is None:
            continue

        file_info = DataFileInfo(
            file_path=file_path, file_name=file_path.stem, extension=extension
        )

        # For Excel files, detect sheet names
        if extension == FileExtension.XLSX:
            file_info.sheet_names = self._get_excel_sheets(file_path)

        files.append(file_info)

    return files
97
+
98
def infer_schema_interactive(self, file_info: DataFileInfo) -> bool:
    """
    Walk the user through configuring one file, then infer its schema.

    Prints a banner for the file, asks whether it should be included, runs
    the CSV or Excel configuration prompts when applicable, and finally
    delegates to ``_infer_schema_with_config``.

    Args:
        file_info: DataFileInfo object to infer schema for.

    Returns:
        True if schema was successfully inferred, False if skipped or failed.
    """
    click.echo()
    banner = f"━━━ Processing: {file_info.file_name}{file_info.file_path.suffix} ━━━"
    click.echo(click.style(banner, fg="cyan", bold=True))
    click.echo()

    # Give the user the chance to leave this file out entirely.
    wanted = click.confirm(
        "Do you want to include this file in your ETL pipeline?", default=True
    )
    if not wanted:
        file_info.skip_file = True
        click.echo(click.style(" ⊘ Skipping file", fg="yellow"))
        return False

    # Only CSV and Excel have format-specific questions to ask.
    kind = file_info.extension
    if kind == FileExtension.CSV:
        self._configure_csv(file_info)
    elif kind == FileExtension.XLSX:
        self._configure_excel(file_info)

    return self._infer_schema_with_config(file_info)
137
+
138
def _configure_csv(self, file_info: DataFileInfo):
    """
    Prompt for the CSV separator and store it on ``file_info``.

    The prompt's default is the separator detected from the file, or a
    comma when detection fails.

    Args:
        file_info: DataFileInfo for a CSV file.
    """
    click.echo("CSV Configuration:")

    # Peek at the file to offer a sensible default.
    guess = self._detect_csv_separator(file_info.file_path)

    if not guess:
        click.echo(" Could not auto-detect separator")
        fallback = ","
    else:
        click.echo(f" Detected separator: '{guess}'")
        fallback = guess

    file_info.csv_separator = click.prompt(
        " Enter CSV separator", default=fallback, type=str
    )
    click.echo()
161
+
162
def _configure_excel(self, file_info: DataFileInfo):
    """
    Ask user which Excel sheets to extract.

    A single-sheet workbook is selected automatically. For several sheets
    the user chooses between all sheets and a numbered selection; an
    unusable selection falls back to all sheets. The result is stored in
    ``file_info.selected_sheets`` as its own list, never as an alias of
    ``file_info.sheet_names``.

    Args:
        file_info: DataFileInfo for an Excel file.
    """
    available = file_info.sheet_names
    if not available:
        click.echo(click.style(" No sheets detected in Excel file", fg="yellow"))
        return

    click.echo(f"Excel file contains {len(available)} sheet(s):")
    for i, sheet in enumerate(available, 1):
        click.echo(f" {i}. {sheet}")
    click.echo()

    if len(available) == 1:
        # Only one sheet, use it by default. Copy so that mutating the
        # selection later cannot alter the list of detected sheets.
        file_info.selected_sheets = list(available)
        click.echo(f" → Using single sheet: {available[0]}")
    else:
        # Multiple sheets, ask user
        choice = click.prompt(
            "Extract all sheets or select specific ones?",
            type=click.Choice(["all", "select"], case_sensitive=False),
            default="all",
        )

        # Compare case-insensitively rather than relying on click to
        # normalise the returned value.
        if choice.lower() == "all":
            file_info.selected_sheets = list(available)
            click.echo(f" → Extracting all {len(available)} sheets")
        else:
            # Let user select sheets
            click.echo()
            click.echo(
                "Enter sheet numbers to extract (comma-separated, e.g., '1,3' or '1-3'):"
            )
            selected = click.prompt(" Sheets", type=str, default="1")

            file_info.selected_sheets = self._parse_sheet_selection(
                selected, available
            )

            if file_info.selected_sheets:
                click.echo(
                    f" → Selected sheets: {', '.join(file_info.selected_sheets)}"
                )
            else:
                click.echo(
                    click.style(
                        " No valid sheets selected, using all", fg="yellow"
                    )
                )
                file_info.selected_sheets = list(available)

    click.echo()
218
+
219
+ def _parse_sheet_selection(
220
+ self, selection: str, sheet_names: List[str]
221
+ ) -> List[str]:
222
+ """
223
+ Parse user sheet selection string into list of sheet names.
224
+
225
+ Supports:
226
+ - Individual numbers: "1,3,5"
227
+ - Ranges: "1-3"
228
+ - Mixed: "1,3-5,7"
229
+
230
+ Args:
231
+ selection: User input string
232
+ sheet_names: List of available sheet names
233
+
234
+ Returns:
235
+ List of selected sheet names
236
+ """
237
+ selected_sheets = []
238
+
239
+ try:
240
+ # Split by comma
241
+ parts = selection.split(",")
242
+
243
+ for part in parts:
244
+ part = part.strip()
245
+
246
+ if "-" in part:
247
+ # Range
248
+ start, end = part.split("-")
249
+ start_idx = int(start.strip()) - 1
250
+ end_idx = int(end.strip()) - 1
251
+
252
+ for i in range(start_idx, end_idx + 1):
253
+ if 0 <= i < len(sheet_names):
254
+ selected_sheets.append(sheet_names[i])
255
+ else:
256
+ # Single number
257
+ idx = int(part) - 1
258
+ if 0 <= idx < len(sheet_names):
259
+ selected_sheets.append(sheet_names[idx])
260
+
261
+ # Remove duplicates while preserving order
262
+ seen = set()
263
+ unique_sheets = []
264
+ for sheet in selected_sheets:
265
+ if sheet not in seen:
266
+ seen.add(sheet)
267
+ unique_sheets.append(sheet)
268
+
269
+ return unique_sheets
270
+
271
+ except (ValueError, IndexError):
272
+ return []
273
+
274
+ def _detect_csv_separator(self, file_path: Path) -> str | None:
275
+ """
276
+ Try to detect CSV separator by reading first few lines.
277
+
278
+ Args:
279
+ file_path: Path to CSV file.
280
+
281
+ Returns:
282
+ Detected separator or None if detection failed.
283
+ """
284
+ try:
285
+ with open(file_path, "r", encoding="utf-8") as f:
286
+ # Read first 5 lines
287
+ lines = [f.readline() for _ in range(5)]
288
+
289
+ # Count common separators
290
+ separators = [",", ";", "\t", "|"]
291
+ counts = {}
292
+
293
+ for sep in separators:
294
+ # Count occurrences in each line
295
+ line_counts = [line.count(sep) for line in lines if line.strip()]
296
+
297
+ # If separator appears consistently across lines, it's likely correct
298
+ if line_counts and len(set(line_counts)) == 1 and line_counts[0] > 0:
299
+ counts[sep] = line_counts[0]
300
+
301
+ # Return separator with highest count
302
+ if counts:
303
+ return max(counts, key=counts.get)
304
+
305
+ return None
306
+
307
+ except Exception:
308
+ return None
309
+
310
def _infer_schema_with_config(self, file_info: DataFileInfo) -> bool:
    """
    Infer schema using user-provided configuration.

    Reads a sample of the file, then stores the inferred column types and
    primary-key candidates on ``file_info``. CSV and JSON results are
    stored under the key ``"default"``; Excel results are stored once per
    extracted sheet, keyed by sheet name.

    Args:
        file_info: DataFileInfo with configuration set.

    Returns:
        True if successful, False otherwise. Any error while reading or
        inferring is reported to the user and yields False. An Excel file
        with no sheets to extract also yields False, since no schema can
        be produced for it.
    """

    def record(key: str, frame: pd.DataFrame) -> None:
        # Keep the type map and the primary-key guess under the same key.
        file_info.inferred_schemas[key] = self._infer_from_dataframe(frame)
        file_info.primary_key_columns[key] = self._infer_primary_keys(frame)

    try:
        if file_info.extension == FileExtension.CSV:
            df = pd.read_csv(
                file_info.file_path,
                nrows=self.sample_rows,
                sep=file_info.csv_separator,
            )
            record("default", df)
            click.echo(f" ✓ Inferred schema with {len(df.columns)} columns")

        elif file_info.extension == FileExtension.XLSX:
            sheets = file_info.sheets_to_extract
            if not sheets:
                # Previously this fell through and reported success
                # although nothing had been inferred.
                click.echo(
                    click.style(
                        " No sheets available to infer a schema from",
                        fg="yellow",
                    )
                )
                return False

            # Read selected sheets only
            for sheet_name in sheets:
                df = pd.read_excel(
                    file_info.file_path,
                    sheet_name=sheet_name,
                    nrows=self.sample_rows,
                )
                record(sheet_name, df)
                click.echo(f" ✓ Sheet '{sheet_name}': {len(df.columns)} columns")

        elif file_info.extension == FileExtension.JSON:
            import json

            # Decode explicitly instead of using the platform default;
            # "utf-8-sig" also accepts files that start with a BOM.
            with open(file_info.file_path, "r", encoding="utf-8-sig") as f:
                json_data = json.load(f)

            # Normalize JSON to flatten nested objects, then keep the sample.
            df = pd.json_normalize(json_data)
            if len(df) > self.sample_rows:
                df = df.head(self.sample_rows)

            record("default", df)
            click.echo(f" ✓ Inferred schema with {len(df.columns)} columns")

        return True

    except Exception as e:
        # Best-effort boundary: report the problem and let the wizard
        # carry on with the next file.
        click.echo(click.style(f" Error inferring schema: {e}", fg="red"))
        return False
369
+
370
def _infer_from_dataframe(self, df: pd.DataFrame) -> Dict[str, DataType]:
    """
    Map every column of a DataFrame to a DataType.

    The pandas dtype decides the result: integer, float, boolean and
    datetime dtypes map directly. String/object columns become DATETIME
    when they hold no lists or dicts and their sampled values parse as
    datetimes, and STRING otherwise. Any other dtype is treated as STRING.

    Args:
        df: DataFrame to infer types from.

    Returns:
        Dictionary mapping column names to DataType enum values.
    """
    checks = pd.api.types
    result = {}

    for name in df.columns:
        values = df[name]
        kind = values.dtype

        if checks.is_integer_dtype(kind):
            inferred = DataType.INTEGER
        elif checks.is_float_dtype(kind):
            inferred = DataType.FLOAT
        elif checks.is_bool_dtype(kind):
            inferred = DataType.BOOLEAN
        elif checks.is_datetime64_any_dtype(kind):
            inferred = DataType.DATETIME
        elif checks.is_string_dtype(kind) or checks.is_object_dtype(kind):
            # Nested values (lists/dicts) stay STRING and are never probed
            # as dates; the short-circuit keeps that order of checks.
            is_date = not self._contains_nested_structures(
                values
            ) and self._looks_like_datetime(values)
            inferred = DataType.DATETIME if is_date else DataType.STRING
        else:
            # Unknown dtypes fall back to STRING.
            inferred = DataType.STRING

        result[name] = inferred

    return result
412
+
413
+ def _infer_primary_keys(self, df: pd.DataFrame) -> List[str]:
414
+ """Heuristically pick primary-key columns from sample data.
415
+
416
+ A column is treated as PK-like when it is all-unique, all-non-null,
417
+ and either named ``id`` / ``*_id`` or is the single best unique
418
+ column in the sample. Returns at most one column to avoid emitting
419
+ compound PKs that the user didn't actually ask for.
420
+ """
421
+ if df.empty:
422
+ return []
423
+
424
+ candidates: List[str] = []
425
+ for column in df.columns:
426
+ series = df[column]
427
+ if series.isna().any():
428
+ continue
429
+ if series.nunique() != len(series):
430
+ continue
431
+ lowered = str(column).lower()
432
+ if lowered == "id" or lowered.endswith("_id") or lowered.endswith("id"):
433
+ candidates.append(column)
434
+
435
+ return candidates[:1]
436
+
437
+ def _contains_nested_structures(self, series: pd.Series) -> bool:
438
+ """
439
+ Check if a series contains nested structures (lists or dicts).
440
+
441
+ Args:
442
+ series: Pandas series to check.
443
+
444
+ Returns:
445
+ True if the series contains lists or dictionaries.
446
+ """
447
+ if len(series) == 0:
448
+ return False
449
+
450
+ # Sample a few non-null values
451
+ sample = series.dropna().head(5)
452
+ if len(sample) == 0:
453
+ return False
454
+
455
+ # Check if any sampled value is a list or dict
456
+ for value in sample:
457
+ if isinstance(value, (list, dict)):
458
+ return True
459
+
460
+ return False
461
+
462
+ def _looks_like_datetime(self, series: pd.Series) -> bool:
463
+ """
464
+ Check if a series looks like it contains datetime values.
465
+
466
+ Args:
467
+ series: Pandas series to check.
468
+
469
+ Returns:
470
+ True if the series appears to contain datetime values.
471
+ """
472
+ if len(series) == 0:
473
+ return False
474
+
475
+ # Sample a few non-null values
476
+ sample = series.dropna().head(5)
477
+ if len(sample) == 0:
478
+ return False
479
+
480
+ # Try to parse as datetime
481
+ try:
482
+ # Suppress the UserWarning about format inference
483
+ with warnings.catch_warnings():
484
+ warnings.simplefilter("ignore", UserWarning)
485
+ pd.to_datetime(sample)
486
+ return True
487
+ except (ValueError, TypeError):
488
+ return False
489
+
490
def _get_file_extension(self, file_path: Path) -> FileExtension | None:
    """
    Look up the FileExtension for a path by its suffix.

    The suffix is lower-cased before the lookup in ``EXTENSION_MAP``.

    Args:
        file_path: Path to check.

    Returns:
        FileExtension enum value or None if not supported.
    """
    return self.EXTENSION_MAP.get(file_path.suffix.lower())
502
+
503
+ def _get_excel_sheets(self, file_path: Path) -> List[str]:
504
+ """
505
+ Get list of sheet names from an Excel file.
506
+
507
+ Args:
508
+ file_path: Path to Excel file.
509
+
510
+ Returns:
511
+ List of sheet names.
512
+ """
513
+ try:
514
+ excel_file = pd.ExcelFile(file_path)
515
+ return excel_file.sheet_names
516
+ except Exception:
517
+ return []
@@ -0,0 +1,62 @@
1
+ import click
2
+ import sys
3
+ import os
4
+
5
+
6
+ def _ensure_dev_path():
7
+ """Ensure the dev path is available for imports during development."""
8
+ here = os.path.abspath(os.path.dirname(__file__))
9
+ project_root = os.path.abspath(os.path.join(here, "..", "..", "..", ".."))
10
+ if project_root not in sys.path:
11
+ sys.path.insert(0, project_root)
12
+ try:
13
+ from algomancy_quickstart.quickstart import run_quickstart # noqa: F401
14
+
15
+ return
16
+ except Exception:
17
+ pass
18
+
19
+
20
+ _ensure_dev_path()
21
+
22
+ from algomancy_quickstart.quickstart import run_quickstart # type: ignore # noqa: E402
23
+
24
+
25
@click.command()
@click.option(
    "--skip-confirmation",
    is_flag=True,
    help="Skip confirmation prompts and use defaults where possible.",
)
@click.option(
    "--title", default=None, help="Project title (will be prompted if not provided)."
)
def main(skip_confirmation: bool, title: str | None):
    """
    Algomancy Quickstart - Interactive setup wizard for Algomancy applications.

    This tool will guide you through creating a new Algomancy application with:
    - Folder structure
    - Basic main.py with placeholders
    - Custom implementation shells
    - ETL pipeline generation
    - Asset imports
    - Styling configuration
    """
    # NOTE: the docstring above doubles as the command's --help text, so
    # implementation notes live in comments instead.
    click.echo(click.style("Algomancy Quickstart Wizard", fg="cyan", bold=True))
    click.echo()

    try:
        run_quickstart(skip_confirmation=skip_confirmation, title=title)
    except (KeyboardInterrupt, click.Abort):
        # click's prompt/confirm turn Ctrl+C into click.Abort, which is a
        # RuntimeError; without catching it here it would reach the generic
        # handler below and print an empty "Error: " message.
        click.echo()
        click.echo(click.style("Setup cancelled by user.", fg="red"))
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: show the failure and exit non-zero.
        click.echo()
        click.echo(click.style(f"Error: {e}", fg="red"))
        sys.exit(1)
59
+
60
+
61
+ if __name__ == "__main__":
62
+ main()