sibi-dst 2025.9.8__py3-none-any.whl → 2025.9.10__py3-none-any.whl

This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
@@ -57,13 +57,13 @@ class ParquetArtifact(ManagedResource):
57
57
  raise ValueError("Required argument 'parquet_filename' is missing.")
58
58
 
59
59
  self._storage_path: str = self.all_kwargs["parquet_storage_path"]
60
- self._parquet_filename: str = self.all_kwargs["parquet_filename"]
60
+ #self._parquet_filename: str = self.all_kwargs["parquet_filename"]
61
61
  self._data_wrapper_class: Optional[Type] = self.all_kwargs.get("data_wrapper_class")
62
62
 
63
63
  # Update logger extra with specific context
64
64
  self.logger_extra.update({
65
65
  "artifact_storage_path": self._storage_path,
66
- "artifact_filename": self._parquet_filename
66
+ #"artifact_filename": self._parquet_filename
67
67
  })
68
68
 
69
69
  # --------------------- Helpers ---------------------
@@ -124,6 +124,8 @@ class ParquetArtifact(ManagedResource):
124
124
  "description": getattr(self._data_wrapper_class, "__name__", "DataWrapper"),
125
125
  "skipped": list(skipped_files),
126
126
  "mmanifest": self.mmanifest, # Pass the instance
127
+ "partition_on": self.all_kwargs.get("partition_on", ["partition_date"]),
128
+ "hive_style": self.all_kwargs.get("hive_style", True),
127
129
  }
128
130
  return UpdatePlanner(**cfg)
129
131
 
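The planner configuration above now defaults to hive-style partitioning on a partition_date column unless the caller overrides it. A minimal standalone sketch of that defaulting behaviour, using the same keyword names as the diff (the helper function itself is illustrative, not part of sibi_dst):

    from typing import Any, Dict

    def planner_partition_defaults(all_kwargs: Dict[str, Any]) -> Dict[str, Any]:
        # Mirrors the .get(...) defaults added to the UpdatePlanner config above.
        return {
            "partition_on": all_kwargs.get("partition_on", ["partition_date"]),
            "hive_style": all_kwargs.get("hive_style", True),
        }

    print(planner_partition_defaults({}))
    # {'partition_on': ['partition_date'], 'hive_style': True}
    print(planner_partition_defaults({"partition_on": ["region"], "hive_style": False}))
    # {'partition_on': ['region'], 'hive_style': False}
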
@@ -147,7 +149,6 @@ class ParquetArtifact(ManagedResource):
147
149
  # Prepare configuration for the DataWrapper
148
150
  cfg = {
149
151
  "data_path": self._storage_path,
150
- "parquet_filename": self._parquet_filename,
151
152
  "fs": self.fs,
152
153
  "debug": self.debug,
153
154
  "logger": self.logger,
@@ -189,9 +190,9 @@ class ParquetArtifact(ManagedResource):
189
190
  "logger": self.logger,
190
191
  "debug": self.debug,
191
192
  "parquet_storage_path": self._storage_path,
192
- "parquet_filename": self._parquet_filename,
193
193
  "parquet_start_date": self.all_kwargs.get("parquet_start_date"),
194
194
  "parquet_end_date": self.all_kwargs.get("parquet_end_date"),
195
+ "partition_on": self.all_kwargs.get("partition_on", ["partition_date"]),
195
196
  **(self.all_kwargs.get("class_params") or {}),
196
197
  }
197
198
 
@@ -218,7 +219,7 @@ class ParquetArtifact(ManagedResource):
218
219
  self._invalidate_cached("mmanifest") # Overwrite affects manifest creation
219
220
 
220
221
  # --- 3. Global concurrency control ---
221
- key = (self._storage_path, self._parquet_filename)
222
+ key = self._storage_path
222
223
  with ParquetArtifact._global_lock:
223
224
  if key in ParquetArtifact._active_runs:
224
225
  self.logger.info(
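With parquet_filename gone, the duplicate-run guard above keys on the storage path alone, so two artifacts targeting the same partitioned dataset root cannot run concurrently. A self-contained sketch of that guard pattern, assuming a hypothetical storage path (the class below is a stand-in, not sibi_dst code):

    import threading

    class RunGuard:
        # Class-level lock and registry, mirroring ParquetArtifact._global_lock
        # and _active_runs in the diff.
        _global_lock = threading.RLock()
        _active_runs: set[str] = set()

        @classmethod
        def try_acquire(cls, key: str) -> bool:
            with cls._global_lock:
                if key in cls._active_runs:
                    return False
                cls._active_runs.add(key)
                return True

        @classmethod
        def release(cls, key: str) -> None:
            with cls._global_lock:
                cls._active_runs.discard(key)

    key = "s3://bucket/datasets/orders"  # hypothetical storage path
    if RunGuard.try_acquire(key):
        try:
            ...  # the parquet generation work would run here
        finally:
            RunGuard.release(key)
    else:
        print(f"Run already in progress for {key}; skipping.")
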
@@ -418,324 +419,3 @@ class ParquetArtifact(ManagedResource):
418
419
  self.logger.warning(f"Error during ParquetArtifact resource cleanup: {e}", extra=self.logger_extra)
419
420
 
420
421
 
421
- # from __future__ import annotations
422
- #
423
- # import datetime as dt
424
- # import threading
425
- # from functools import cached_property
426
- # from typing import Any, Dict, Type, TypeVar
427
- #
428
- # from sibi_dst.utils import DataWrapper, DateUtils, UpdatePlanner, ManagedResource
429
- # from sibi_dst.utils import MissingManifestManager, Logger
430
- #
431
- # T = TypeVar("T")
432
- #
433
- #
434
- # class ParquetArtifact(ManagedResource):
435
- # """
436
- # Orchestrates a single dataset:
437
- # - Builds/uses MissingManifestManager
438
- # - Plans work with UpdatePlanner
439
- # - Executes with DataWrapper (threaded) saving Dask → Parquet
440
- # - Prevents duplicate concurrent runs per (storage_path, filename)
441
- # - Forwards retry/backoff knobs to DataWrapper.process()
442
- # """
443
- #
444
- # _global_lock = threading.RLock()
445
- # _active_runs: set[tuple[str, str]] = set()
446
- # logger_extra = {"sibi_dst_component": __name__}
447
- #
448
- # def __init__(self, **kwargs: Any):
449
- # # Merge defaults from ManagedResource and caller kwargs
450
- # self.all_kwargs: Dict[str, Any] = {**kwargs}
451
- # super().__init__(**self.all_kwargs)
452
- #
453
- # # Persist the minimal config we depend on frequently
454
- # self._lock = threading.RLock()
455
- #
456
- # # Required knobs
457
- # self._storage_path: str = self.all_kwargs["parquet_storage_path"]
458
- # self._parquet_filename: str = self.all_kwargs["parquet_filename"]
459
- # self._data_wrapper_class = self.all_kwargs.get("data_wrapper_class")
460
- #
461
- # # ---------- helpers ----------
462
- # def _invalidate_cached(self, *names: str) -> None:
463
- # for n in names:
464
- # self.__dict__.pop(n, None)
465
- #
466
- # def _build_manifest_path(self) -> str:
467
- # base = f"{self._storage_path}".rstrip("/") + "/"
468
- # return f"{base}_manifests/missing.parquet"
469
- #
470
- # # ---------- lazy members ----------
471
- # @cached_property
472
- # def mmanifest(self) -> MissingManifestManager:
473
- # self.logger.info("Initializing MissingManifestManager...", extra=self.logger_extra)
474
- # manifest_path = self._build_manifest_path()
475
- #
476
- # # ensure manifest directory exists
477
- # manifest_dir = manifest_path.rsplit("/", 1)[0] if "/" in manifest_path else manifest_path
478
- # self.ensure_directory_exists(manifest_dir)
479
- #
480
- # mgr = MissingManifestManager(
481
- # fs=self.fs,
482
- # manifest_path=manifest_path,
483
- # clear_existing=self.all_kwargs.get("overwrite", False),
484
- # debug=self.debug,
485
- # logger=self.logger,
486
- # overwrite=self.all_kwargs.get("overwrite", False),
487
- # )
488
- #
489
- # if not mgr._safe_exists(mgr.manifest_path):
490
- # self.logger.info(f"Creating new manifest at {mgr.manifest_path}", extra=self.logger_extra)
491
- # mgr.save()
492
- # else:
493
- # self.logger.info(f"Manifest already exists at {mgr.manifest_path}", extra=self.logger_extra)
494
- #
495
- # return mgr
496
- #
497
- # @cached_property
498
- # def update_planner(self) -> UpdatePlanner:
499
- # self.logger.info("Initializing UpdatePlanner...", extra=self.logger_extra)
500
- # skipped_files = self.mmanifest.load_existing() or []
501
- #
502
- # cfg = {
503
- # **self.all_kwargs,
504
- # "fs": self.fs,
505
- # "debug": self.debug,
506
- # "logger": self.logger,
507
- # "description": getattr(self._data_wrapper_class, "__name__", "DataWrapper"),
508
- # "skipped": list(skipped_files),
509
- # "mmanifest": self.mmanifest,
510
- # }
511
- # return UpdatePlanner(**cfg)
512
- #
513
- # @cached_property
514
- # def data_wrapper(self) -> DataWrapper:
515
- # self.logger.info("Initializing DataWrapper...", extra=self.logger_extra)
516
- #
517
- # # Ensure the planner has a plan
518
- # if getattr(self.update_planner, "plan", None) is None:
519
- # self.update_planner.generate_plan()
520
- #
521
- # class_params = {
522
- # "debug": self.debug,
523
- # "logger": self.logger,
524
- # "fs": self.fs,
525
- # "verbose": self.verbose,
526
- # }
527
- #
528
- # cfg = {
529
- # "data_path": self._storage_path,
530
- # "parquet_filename": self._parquet_filename,
531
- # "fs": self.fs,
532
- # "debug": self.debug,
533
- # "logger": self.logger,
534
- # "verbose": self.verbose,
535
- # "dataclass": self._data_wrapper_class,
536
- # "class_params": class_params,
537
- # "load_params": self.all_kwargs.get("load_params", {}) or {},
538
- # "mmanifest": self.mmanifest,
539
- # "update_planner": self.update_planner,
540
- # "date_field": self.all_kwargs.get("date_field"),
541
- # # pipeline execution knobs
542
- # "show_progress": bool(self.all_kwargs.get("show_progress", False)),
543
- # "timeout": float(self.all_kwargs.get("timeout", 30.0)),
544
- # "max_threads": int(self.all_kwargs.get("max_threads", 3)),
545
- # }
546
- # return DataWrapper(**cfg)
547
- #
548
- # # ---------- public API ----------
549
- # def load(self, **kwargs: Any):
550
- # """
551
- # Direct load using the configured data_wrapper_class (no planner/manifest round-trip).
552
- # Expected to return a Dask DataFrame from the loader.
553
- # """
554
- # self.logger.info(f"Loading data from {self._storage_path}")
555
- #
556
- # if not self._data_wrapper_class:
557
- # raise ValueError("data_wrapper_class is not configured.")
558
- #
559
- # params = {
560
- # "backend": "parquet",
561
- # "fs": self.fs,
562
- # "logger": self.logger,
563
- # "debug": self.debug,
564
- # "parquet_storage_path": self._storage_path,
565
- # "parquet_filename": self._parquet_filename,
566
- # "parquet_start_date": self.all_kwargs.get("parquet_start_date"),
567
- # "parquet_end_date": self.all_kwargs.get("parquet_end_date"),
568
- # **(self.all_kwargs.get("class_params") or {}),
569
- # }
570
- #
571
- # cls = self._data_wrapper_class
572
- # with cls(**params) as instance:
573
- # return instance.load(**kwargs)
574
- #
575
- # def generate_parquet(self, **kwargs: Any) -> None:
576
- # """
577
- # Generate or update Parquet according to the plan.
578
- # - Merges runtime kwargs
579
- # - Invalidates dependent caches
580
- # - Guards against duplicate concurrent runs
581
- # - Forwards retry/backoff to DataWrapper.process()
582
- # """
583
- # # Merge and invalidate caches that depend on runtime changes
584
- # self.all_kwargs.update(kwargs)
585
- # self._invalidate_cached("update_planner", "data_wrapper")
586
- # if "overwrite" in kwargs:
587
- # self._invalidate_cached("mmanifest")
588
- #
589
- # # Global de-dupe guard
590
- # key = (self._storage_path, self._parquet_filename)
591
- # with ParquetArtifact._global_lock:
592
- # if key in ParquetArtifact._active_runs:
593
- # self.logger.info(
594
- # f"Run already in progress for {key}; skipping this invocation.", extra=self.logger_extra
595
- # )
596
- # return
597
- # ParquetArtifact._active_runs.add(key)
598
- #
599
- # try:
600
- # self.ensure_directory_exists(self._storage_path)
601
- #
602
- # self.update_planner.generate_plan()
603
- # plan = getattr(self.update_planner, "plan", None)
604
- # if plan is None or (hasattr(plan, "empty") and plan.empty):
605
- # # Planning uses Pandas; this is safe to check.
606
- # self.logger.info("No updates needed. Skipping Parquet generation.", extra=self.logger_extra)
607
- # return
608
- #
609
- # # Print plan once per run
610
- # if (
611
- # getattr(self.update_planner, "show_progress", False)
612
- # and not getattr(self.update_planner, "_printed_this_run", False)
613
- # ):
614
- # self.update_planner.show_update_plan()
615
- # setattr(self.update_planner, "_printed_this_run", True)
616
- #
617
- # # ---- forward retry/backoff knobs to DataWrapper.process() ----
618
- # dw_retry_kwargs = {
619
- # k: self.all_kwargs[k]
620
- # for k in ("max_retries", "backoff_base", "backoff_jitter", "backoff_max")
621
- # if k in self.all_kwargs
622
- # }
623
- #
624
- # with self._lock:
625
- # dw = self.data_wrapper # single cached_property access
626
- # if hasattr(dw, "process"):
627
- # dw.process(**dw_retry_kwargs)
628
- # if getattr(self.update_planner, "show_progress", False) and hasattr(
629
- # dw, "show_benchmark_summary"
630
- # ):
631
- # dw.show_benchmark_summary()
632
- #
633
- # finally:
634
- # with ParquetArtifact._global_lock:
635
- # ParquetArtifact._active_runs.discard(key)
636
- #
637
- # def update_parquet(self, period: str = "today", **kwargs: Any) -> None:
638
- # """
639
- # High-level entry point to update Parquet for a given period:
640
- # - 'today', 'yesterday', 'last_7_days', etc. via DateUtils.parse_period
641
- # - 'ytd'
642
- # - 'itd' (requires history_begins_on)
643
- # - 'custom' (requires start_on / end_on)
644
- # Also accepts retry/backoff knobs which flow to DataWrapper.process().
645
- # """
646
- # final_kwargs = {**self.all_kwargs, **kwargs}
647
- #
648
- # def itd_config():
649
- # start_date = final_kwargs.get("history_begins_on")
650
- # if not start_date:
651
- # raise ValueError(
652
- # "For period 'itd', 'history_begins_on' must be configured."
653
- # )
654
- # return {
655
- # "parquet_start_date": start_date,
656
- # "parquet_end_date": dt.date.today(),
657
- # }
658
- #
659
- # def ytd_config():
660
- # return {
661
- # "parquet_start_date": dt.date(dt.date.today().year, 1, 1),
662
- # "parquet_end_date": dt.date.today(),
663
- # }
664
- #
665
- # def custom_config():
666
- # """
667
- # Prepare parameters for 'custom' period execution, ensuring `start_on` and `end_on`
668
- # are provided (with backward compatibility for `start_date`/`end_date` aliases).
669
- # """
670
- # # Backward compatibility: normalize aliases
671
- # alias_map = {
672
- # "start_on": ("start_date", "start"),
673
- # "end_on": ("end_date", "end"),
674
- # }
675
- # normalized_kwargs = dict(kwargs) # shallow copy so we don't mutate original
676
- # for target, aliases in alias_map.items():
677
- # if target not in normalized_kwargs:
678
- # for alias in aliases:
679
- # if alias in normalized_kwargs:
680
- # normalized_kwargs[target] = normalized_kwargs[alias]
681
- # break
682
- #
683
- # # Validation
684
- # missing = [k for k in ("start_on", "end_on") if k not in normalized_kwargs]
685
- # if missing:
686
- # raise ValueError(
687
- # f"For period 'custom', the following required parameters are missing: {', '.join(missing)}"
688
- # )
689
- #
690
- # return {
691
- # "parquet_start_date": normalized_kwargs["start_on"],
692
- # "parquet_end_date": normalized_kwargs["end_on"],
693
- # }
694
- #
695
- # if period == "itd":
696
- # period_params = itd_config()
697
- # elif period == "ytd":
698
- # period_params = ytd_config()
699
- # elif period == "custom":
700
- # period_params = custom_config()
701
- # else:
702
- # start_date, end_date = DateUtils.parse_period(period=period)
703
- # period_params = {
704
- # "parquet_start_date": start_date,
705
- # "parquet_end_date": end_date,
706
- # }
707
- #
708
- # final_kwargs.update(period_params)
709
- # self.logger.debug(
710
- # f"kwargs passed to update_parquet/generate_parquet: {final_kwargs}", extra=self.logger_extra
711
- # )
712
- #
713
- # # Delegate to generator (handles cache invalidation + forwarding knobs)
714
- # self.generate_parquet(**final_kwargs)
715
- #
716
- # # ---------- utils ----------
717
- # def ensure_directory_exists(self, path: str) -> None:
718
- # """Ensure the directory exists across fsspec backends."""
719
- # with self._lock:
720
- # if not self.fs.exists(path):
721
- # self.logger.info(f"Creating directory: {path}", extra=self.logger_extra)
722
- # try:
723
- # self.fs.makedirs(path, exist_ok=True)
724
- # except TypeError:
725
- # try:
726
- # self.fs.makedirs(path)
727
- # except FileExistsError:
728
- # pass
729
- #
730
- # def _cleanup(self):
731
- # """Clean up resources upon exit."""
732
- # try:
733
- # if "mmanifest" in self.__dict__ and getattr(
734
- # self.mmanifest, "_new_records", None
735
- # ):
736
- # if self.mmanifest._new_records:
737
- # self.mmanifest.save()
738
- # if "data_wrapper" in self.__dict__ and hasattr(self.data_wrapper, "close"):
739
- # self.data_wrapper.close()
740
- # except Exception as e:
741
- # self.logger.warning(f"Error during resource cleanup: {e}", extra=self.logger_extra)
@@ -39,7 +39,8 @@ class ParquetReader(DfHelper):
39
39
  :type fs: fsspec.AbstractFileSystem
40
40
  """
41
41
  DEFAULT_CONFIG: ClassVar[Dict[str, Any]] = {
42
- 'backend': 'parquet'
42
+ 'backend': 'parquet',
43
+ 'partition_on': ['partition_date']
43
44
  }
44
45
  df: Optional[Union[dd.DataFrame, pd.DataFrame]] = None
45
46
 
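Because partition_on now ships in DEFAULT_CONFIG, a ParquetReader treats its target as a dataset partitioned on partition_date unless the caller overrides that key. A rough sketch of the effect, assuming the usual pattern of merging defaults under user kwargs (the merge shown here is illustrative, not the reader's actual code path):

    DEFAULT_CONFIG = {"backend": "parquet", "partition_on": ["partition_date"]}
    user_kwargs = {"parquet_storage_path": "s3://bucket/datasets/orders"}  # hypothetical

    effective = {**DEFAULT_CONFIG, **user_kwargs}
    print(effective["partition_on"])   # ['partition_date']

    override = {**DEFAULT_CONFIG, "partition_on": ["region"]}
    print(override["partition_on"])    # ['region']
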
@@ -30,6 +30,7 @@ class ParquetConfig(BaseModel):
30
30
  fs: Optional[fsspec.spec.AbstractFileSystem] = None
31
31
  logger: Optional[Logger] = None
32
32
  debug: bool = False
33
+ partition_on: Optional[list[str]] = None # column name for partitioned datasets
33
34
 
34
35
  # ---- Derived / runtime fields (lazy) ----
35
36
  parquet_full_path: Optional[str] = None # file or directory
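The new partition_on field drives the path-resolution change shown in the next hunk: a filename is only joined onto the storage path for non-partitioned outputs, while a partitioned dataset is read and written at the directory root. A standalone sketch of that decision (function and variable names are illustrative, not the ParquetConfig API):

    import posixpath
    from typing import List, Optional

    def resolve_target(storage_path: str,
                       parquet_filename: Optional[str],
                       partition_on: Optional[List[str]]) -> str:
        if parquet_filename and partition_on is None:
            name = parquet_filename if parquet_filename.endswith(".parquet") else f"{parquet_filename}.parquet"
            return posixpath.join(storage_path, name)
        return storage_path  # partitioned dataset: use the directory root

    print(resolve_target("s3://bucket/orders", "orders", None))
    # s3://bucket/orders/orders.parquet
    print(resolve_target("s3://bucket/orders", "orders", ["partition_date"]))
    # s3://bucket/orders
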
@@ -80,7 +81,7 @@ class ParquetConfig(BaseModel):
80
81
  )
81
82
 
82
83
  # file vs dataset-at-root
83
- if self.parquet_filename:
84
+ if self.parquet_filename and self.partition_on is None:
84
85
  self.parquet_full_path = self.ensure_file_extension(
85
86
  posixpath.join(str(self.parquet_storage_path), str(self.parquet_filename)),
86
87
  "parquet",
@@ -224,11 +225,34 @@ class ParquetConfig(BaseModel):
224
225
 
225
226
  # ------------------------- internals -------------------------
226
227
 
228
+
227
229
  def _resolve_paths_for_read(self) -> List[str]:
228
230
  """
229
231
  Builds a list of path patterns for dask.read_parquet.
232
+ Respects partition_on + start/end date if given.
230
233
  """
231
- # Date-ranged folders
234
+ print(f"_resolve_paths_for_read: {self.partition_on}")
235
+ # Partitioned dataset by column
236
+ if self.partition_on and self.parquet_start_date and self.parquet_end_date:
237
+ if not isinstance(self.partition_on, (list, tuple)):
238
+ parts = [self.partition_on]
239
+ else:
240
+ parts = self.partition_on
241
+
242
+ start = dt.datetime.strptime(self.parquet_start_date, "%Y-%m-%d").date()
243
+ end = dt.datetime.strptime(self.parquet_end_date, "%Y-%m-%d").date()
244
+ days = pd.date_range(start=start, end=end, freq="D").date
245
+
246
+ base = self.parquet_storage_path.rstrip("/")
247
+ print("base:",base)
248
+ result= [
249
+ f"{base}/{parts[0]}={d.isoformat()}/*.parquet"
250
+ for d in days
251
+ ]
252
+ print("result:",result)
253
+ return result
254
+
255
+ # Date-ranged folders (non-partitioned, using FilePathGenerator)
232
256
  if self.parquet_folder_list:
233
257
  dirs = {self._dirname(p) for p in self.parquet_folder_list}
234
258
  return [d.rstrip("/") + "/*.parquet" for d in sorted(dirs)]
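The new branch in _resolve_paths_for_read expands the requested date range into one glob per hive partition directory, using only the first partition column. A standalone helper that mirrors that logic (a sketch for illustration, not the actual method):

    import datetime as dt
    from typing import List, Sequence

    import pandas as pd

    def partition_globs(base: str, partition_on: Sequence[str],
                        start: str, end: str) -> List[str]:
        col = partition_on[0]  # only the first partition column is used, as in the diff
        start_d = dt.datetime.strptime(start, "%Y-%m-%d").date()
        end_d = dt.datetime.strptime(end, "%Y-%m-%d").date()
        days = pd.date_range(start=start_d, end=end_d, freq="D").date
        root = base.rstrip("/")
        return [f"{root}/{col}={d.isoformat()}/*.parquet" for d in days]

    print(partition_globs("s3://bucket/orders", ["partition_date"],
                          "2025-09-01", "2025-09-03"))
    # ['s3://bucket/orders/partition_date=2025-09-01/*.parquet',
    #  's3://bucket/orders/partition_date=2025-09-02/*.parquet',
    #  's3://bucket/orders/partition_date=2025-09-03/*.parquet']
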
@@ -1,16 +1,20 @@
1
- from .base_parquet_artifact import BaseParquetArtifact
1
+ from __future__ import annotations
2
+ from .base_attacher import make_attacher, AttachmentMaker
2
3
  from .base_data_cube import BaseDataCube
3
- from .base_attacher import make_attacher
4
+ from .base_parquet_artifact import BaseParquetArtifact
4
5
  from .base_parquet_reader import BaseParquetReader
5
- from .hybrid_data_loader import HybridDataLoader
6
6
  from .base_pipeline import BasePipeline
7
+ from .base_pipeline_template import PipelineTemplate
8
+ from .hybrid_data_loader import HybridDataLoader
7
9
 
8
10
  __all__ = [
9
11
  "BaseDataCube",
10
12
  "BaseParquetArtifact",
13
+ "AttachmentMaker",
11
14
  "make_attacher",
12
15
  "BaseParquetReader",
13
16
  "HybridDataLoader",
14
17
  "BasePipeline",
18
+ "PipelineTemplate",
15
19
  ]
16
20
 
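With these re-exports, the new names resolve directly from the boilerplate package. The package path is inferred from the PipelineTemplate import later in this diff; the snippet is a usage sketch only:

    from sibi_dst.utils.boilerplate import (
        AttachmentMaker,
        BasePipeline,
        PipelineTemplate,
        make_attacher,
    )
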
@@ -1,25 +1,70 @@
1
1
  from typing import Any, Awaitable, Callable, Sequence, Type
2
2
 
3
- def make_attacher(
4
- cube_cls: Type,
5
- fieldnames: Sequence[str],
6
- column_names: Sequence[str],
7
- ) -> Callable[..., Awaitable[Any]]:
3
+
4
+ class AttachmentMaker:
8
5
  """
9
- Factory for async attachers.
6
+ Async attacher class.
10
7
  Skips work if any param value is falsy ([], None, {}, etc.).
11
8
  """
12
9
 
13
- async def attach(*, logger=None, debug: bool = False, **params: Any):
10
+ def __init__(
11
+ self,
12
+ cube_cls: Type,
13
+ fieldnames: Sequence[str],
14
+ column_names: Sequence[str],
15
+ ):
16
+ self.cube_cls = cube_cls
17
+ self.fieldnames = tuple(fieldnames)
18
+ self.column_names = list(column_names)
19
+
20
+ async def attach(self, *, logger=None, debug: bool = False, **params: Any):
14
21
  if any(not v for v in params.values()):
15
22
  return None
16
23
  call_params = {
17
- "fieldnames": tuple(fieldnames),
18
- "column_names": list(column_names),
24
+ "fieldnames": self.fieldnames,
25
+ "column_names": self.column_names,
19
26
  **params,
20
27
  }
21
- return await cube_cls(logger=logger, debug=debug).aload(**call_params)
28
+ return await self.cube_cls(logger=logger, debug=debug).aload(**call_params)
29
+
30
+
31
+ # Factory function for backward compatibility
32
+ def make_attacher(
33
+ cube_cls: Type,
34
+ fieldnames: Sequence[str],
35
+ column_names: Sequence[str],
36
+ ) -> Callable[..., Awaitable[Any]]:
37
+ """
38
+ Factory for async attachers.
39
+ Skips work if any param value is falsy ([], None, {}, etc.).
40
+ """
41
+ attacher = AttachmentMaker(cube_cls, fieldnames, column_names)
42
+ return attacher.attach
22
43
 
23
- return attach
24
44
 
25
- __all__ = ['make_attacher']
45
+ __all__ = ['AttachmentMaker', 'make_attacher']
46
+ # from typing import Any, Awaitable, Callable, Sequence, Type
47
+ #
48
+ # def make_attacher(
49
+ # cube_cls: Type,
50
+ # fieldnames: Sequence[str],
51
+ # column_names: Sequence[str],
52
+ # ) -> Callable[..., Awaitable[Any]]:
53
+ # """
54
+ # Factory for async attachers.
55
+ # Skips work if any param value is falsy ([], None, {}, etc.).
56
+ # """
57
+ #
58
+ # async def attach(*, logger=None, debug: bool = False, **params: Any):
59
+ # if any(not v for v in params.values()):
60
+ # return None
61
+ # call_params = {
62
+ # "fieldnames": tuple(fieldnames),
63
+ # "column_names": list(column_names),
64
+ # **params,
65
+ # }
66
+ # return await cube_cls(logger=logger, debug=debug).aload(**call_params)
67
+ #
68
+ # return attach
69
+ #
70
+ # __all__ = ['make_attacher']
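A short usage sketch of the refactor above: the class form and the backward-compatible factory behave identically, and a falsy parameter still short-circuits to None. DummyCube is a stand-in; only the aload(**params) contract is taken from the diff:

    import asyncio

    from sibi_dst.utils.boilerplate import AttachmentMaker, make_attacher

    class DummyCube:
        def __init__(self, logger=None, debug: bool = False):
            self.debug = debug

        async def aload(self, **params):
            return params  # a real cube would load and return a dataframe

    async def main():
        attacher = AttachmentMaker(DummyCube, fieldnames=("id",), column_names=["order_id"])
        print(await attacher.attach(ids=[1, 2, 3]))   # params are truthy -> cube is loaded
        print(await attacher.attach(ids=[]))          # falsy param -> None, no cube call

        legacy = make_attacher(DummyCube, ("id",), ["order_id"])
        print(await legacy(ids=[4, 5]))               # same behaviour via the factory

    asyncio.run(main())
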
@@ -91,35 +91,19 @@ class BasePipeline(ManagedResource):
91
91
  return
92
92
 
93
93
  df[self.date_field] = dd.to_datetime(df[self.date_field], errors="coerce")
94
- dates = DateRangeHelper.generate_daily_ranges(self.start_date, self.end_date)
95
-
96
- tasks = []
97
- with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
98
- for date_str in dates:
99
- date_obj = pd.to_datetime(date_str).date()
100
- df_day = df[df[self.date_field].dt.date == date_obj]
101
- if dask_is_empty(df_day):
102
- self.logger.info(f"No data for {date_obj}, skipping.")
103
- continue
104
-
105
- path = self._get_storage_path_for_date(pd.Timestamp(date_obj))
106
- await self.emit("status", message=f"Saving data for {date_obj}")
107
-
108
- saver = ParquetSaver(
109
- df_result=df_day,
110
- parquet_storage_path=path,
111
- fs=self.fs,
112
- debug=self.debug,
113
- logger=self.logger,
114
- )
115
-
116
- tasks.append(
117
- asyncio.get_running_loop().run_in_executor(
118
- executor, saver.save_to_parquet, self._get_output_filename()
119
- )
120
- )
121
-
122
- await asyncio.gather(*tasks)
94
+ df["partition_date"] = df[self.date_field].dt.date.astype(str)
95
+
96
+ out_path = self.storage_path.rstrip("/")+"/"+self._get_output_filename(fmt="parquet")
97
+ self.logger.info("Saving dataset to %s", out_path)
98
+ ps = ParquetSaver(
99
+ df_result=df,
100
+ parquet_storage_path=out_path,
101
+ engine="pyarrow",
102
+ fs=self.fs,
103
+ partition_on=["partition_date"],
104
+ write_index=False,
105
+ )
106
+ ps.save_to_parquet()
123
107
  await self.emit("complete", message="All partitions written.")
124
108
 
125
109
  async def from_parquet(self, **kwargs) -> dd.DataFrame:
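to_parquet now derives a partition_date column and writes the whole frame once as a hive-partitioned dataset instead of saving one file per day. Sketched directly with dask below, assuming ParquetSaver ultimately delegates to dd.to_parquet with pyarrow (that delegation is an assumption; the partition layout is taken from the diff):

    import dask.dataframe as dd
    import pandas as pd

    pdf = pd.DataFrame({
        "last_activity_dt": pd.to_datetime(["2025-09-01", "2025-09-01", "2025-09-02"]),
        "value": [1, 2, 3],
    })
    df = dd.from_pandas(pdf, npartitions=1)
    df["partition_date"] = df["last_activity_dt"].dt.date.astype(str)

    df.to_parquet(
        "out/orders.parquet",        # hypothetical local stand-in for storage_path + filename
        engine="pyarrow",
        partition_on=["partition_date"],
        write_index=False,
    )
    # Produces out/orders.parquet/partition_date=2025-09-01/part.0.parquet, etc.
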
@@ -153,6 +137,7 @@ class BasePipeline(ManagedResource):
153
137
  self.logger.warning("No valid dates found for partitioning.")
154
138
  return
155
139
 
140
+ clk_conf['table'] = self.filename
156
141
  clk = ClickHouseWriter(**clk_conf)
157
142
  loop = asyncio.get_running_loop()
158
143
  tasks = []
@@ -0,0 +1,54 @@
1
+ from __future__ import annotations
2
+
3
+ import pandas as pd
4
+
5
+ from sibi_dst.utils.boilerplate import BasePipeline
6
+
7
+
8
+ class PipelineTemplate:
9
+ """
10
+ A reusable base class for executing product-related pipelines end-to-end.
11
+ """
12
+
13
+ def __init__(
14
+ self,
15
+ start_date: str,
16
+ end_date: str,
17
+ fs_instance,
18
+ storage_path: str,
19
+ dataset_cls,
20
+ filename: str,
21
+ date_field: str = "last_activity_dt",
22
+ **kwargs
23
+ ):
24
+ self.start_date = start_date
25
+ self.end_date = end_date
26
+ self.max_workers = kwargs.pop('max_workers', 4)
27
+ self.fs = fs_instance
28
+ self.storage_path = storage_path
29
+
30
+ self.pipeline = BasePipeline(
31
+ start_date=self.start_date,
32
+ end_date=self.end_date,
33
+ dataset_cls=dataset_cls,
34
+ parquet_storage_path=self.storage_path,
35
+ fs=self.fs,
36
+ filename=filename,
37
+ date_field=date_field,
38
+ max_workers=self.max_workers,
39
+ )
40
+
41
+ async def to_parquet(self, **kwargs) -> pd.DataFrame:
42
+ await self.pipeline.to_parquet(**kwargs)
43
+ df = await self.pipeline.from_parquet(**kwargs)
44
+ return df
45
+
46
+ async def from_parquet(self, **kwargs) -> pd.DataFrame:
47
+ df = await self.pipeline.from_parquet(**kwargs)
48
+ return df
49
+
50
+ async def to_clickhouse(self, clickhouse_conf, **kwargs) -> None:
51
+ cnf = clickhouse_conf.copy()
52
+ cnf["table"] = self.pipeline.filename
53
+ cnf["overwrite"] = True
54
+ await self.pipeline.to_clickhouse(cnf, **kwargs)
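A usage sketch for the new PipelineTemplate. Only the constructor signature and the three async methods come from the file added above; the filesystem, dataset class, and paths are placeholders:

    import asyncio

    import fsspec

    from sibi_dst.utils.boilerplate import PipelineTemplate

    OrdersDataset = object  # placeholder: supply a real dataset class here

    async def main():
        fs = fsspec.filesystem("file")  # any fsspec filesystem instance
        tpl = PipelineTemplate(
            start_date="2025-09-01",
            end_date="2025-09-07",
            fs_instance=fs,
            storage_path="/data/orders",         # hypothetical dataset root
            dataset_cls=OrdersDataset,
            filename="orders",
            date_field="last_activity_dt",
        )
        df = await tpl.to_parquet()              # write, then re-read as a dataframe
        await tpl.to_clickhouse({"host": "localhost", "port": 8123})  # hypothetical conf keys

    # asyncio.run(main())  # requires a real dataset class and storage backend
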
@@ -224,7 +224,7 @@ class ClickHouseWriter(ManagedResource):
224
224
  def _default_engine_sql(self) -> str:
225
225
  # minimal MergeTree clause; quote order_by safely
226
226
  ob = self.order_by if self.order_by.startswith("(") else f"(`{self.order_by}`)"
227
- return f"ENGINE = MergeTree ORDER BY {ob}"
227
+ return f"ENGINE = MergeTree ORDER BY {ob} SETTINGS allow_nullable_key = 1"
228
228
 
229
229
  # ------------- partition write -------------
230
230
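The appended SETTINGS clause lets MergeTree accept nullable ORDER BY columns. A standalone sketch that mirrors _default_engine_sql and shows the generated DDL fragment (illustrative, not the writer itself):

    def default_engine_sql(order_by: str) -> str:
        # Quote a bare column name; pass tuples like "(a, b)" through unchanged.
        ob = order_by if order_by.startswith("(") else f"(`{order_by}`)"
        return f"ENGINE = MergeTree ORDER BY {ob} SETTINGS allow_nullable_key = 1"

    print(default_engine_sql("id"))
    # ENGINE = MergeTree ORDER BY (`id`) SETTINGS allow_nullable_key = 1
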