sibi-dst 2025.9.9__py3-none-any.whl → 2025.9.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_artifact_updater_async.py +191 -137
- sibi_dst/df_helper/_parquet_artifact.py +6 -326
- sibi_dst/df_helper/_parquet_reader.py +2 -1
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +24 -2
- sibi_dst/utils/boilerplate/__init__.py +5 -3
- sibi_dst/utils/boilerplate/base_pipeline.py +14 -29
- sibi_dst/utils/business_days.py +19 -51
- sibi_dst/utils/clickhouse_writer.py +1 -1
- sibi_dst/utils/data_wrapper.py +46 -312
- sibi_dst/utils/filepath_generator.py +1 -154
- sibi_dst/utils/parquet_saver.py +29 -16
- sibi_dst/utils/progress/sse_runner.py +39 -11
- sibi_dst/utils/update_planner.py +161 -805
- {sibi_dst-2025.9.9.dist-info → sibi_dst-2025.9.11.dist-info}/METADATA +2 -1
- {sibi_dst-2025.9.9.dist-info → sibi_dst-2025.9.11.dist-info}/RECORD +16 -16
- {sibi_dst-2025.9.9.dist-info → sibi_dst-2025.9.11.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/_parquet_artifact.py

@@ -57,13 +57,13 @@ class ParquetArtifact(ManagedResource):
             raise ValueError("Required argument 'parquet_filename' is missing.")

         self._storage_path: str = self.all_kwargs["parquet_storage_path"]
-        self._parquet_filename: str = self.all_kwargs["parquet_filename"]
+        #self._parquet_filename: str = self.all_kwargs["parquet_filename"]
         self._data_wrapper_class: Optional[Type] = self.all_kwargs.get("data_wrapper_class")

         # Update logger extra with specific context
         self.logger_extra.update({
             "artifact_storage_path": self._storage_path,
-            "artifact_filename": self._parquet_filename
+            #"artifact_filename": self._parquet_filename
         })

         # --------------------- Helpers ---------------------
@@ -124,6 +124,8 @@ class ParquetArtifact(ManagedResource):
             "description": getattr(self._data_wrapper_class, "__name__", "DataWrapper"),
             "skipped": list(skipped_files),
             "mmanifest": self.mmanifest,  # Pass the instance
+            "partition_on": self.all_kwargs.get("partition_on", ["partition_date"]),
+            "hive_style": self.all_kwargs.get("hive_style", True),
         }
         return UpdatePlanner(**cfg)

@@ -147,7 +149,6 @@ class ParquetArtifact(ManagedResource):
         # Prepare configuration for the DataWrapper
         cfg = {
             "data_path": self._storage_path,
-            "parquet_filename": self._parquet_filename,
             "fs": self.fs,
             "debug": self.debug,
             "logger": self.logger,
@@ -189,9 +190,9 @@ class ParquetArtifact(ManagedResource):
             "logger": self.logger,
             "debug": self.debug,
             "parquet_storage_path": self._storage_path,
-            "parquet_filename": self._parquet_filename,
             "parquet_start_date": self.all_kwargs.get("parquet_start_date"),
             "parquet_end_date": self.all_kwargs.get("parquet_end_date"),
+            "partition_on": self.all_kwargs.get("partition_on", ["partition_date"]),
             **(self.all_kwargs.get("class_params") or {}),
         }

@@ -218,7 +219,7 @@ class ParquetArtifact(ManagedResource):
             self._invalidate_cached("mmanifest")  # Overwrite affects manifest creation

         # --- 3. Global concurrency control ---
-        key = (self._storage_path, self._parquet_filename)
+        key = self._storage_path
         with ParquetArtifact._global_lock:
             if key in ParquetArtifact._active_runs:
                 self.logger.info(
@@ -418,324 +419,3 @@ class ParquetArtifact(ManagedResource):
             self.logger.warning(f"Error during ParquetArtifact resource cleanup: {e}", extra=self.logger_extra)


-# from __future__ import annotations
-#
-# import datetime as dt
-# import threading
-# from functools import cached_property
-# from typing import Any, Dict, Type, TypeVar
-#
-# from sibi_dst.utils import DataWrapper, DateUtils, UpdatePlanner, ManagedResource
-# from sibi_dst.utils import MissingManifestManager, Logger
-#
-# T = TypeVar("T")
-#
-#
-# class ParquetArtifact(ManagedResource):
-#     """
-#     Orchestrates a single dataset:
-#       - Builds/uses MissingManifestManager
-#       - Plans work with UpdatePlanner
-#       - Executes with DataWrapper (threaded) saving Dask → Parquet
-#       - Prevents duplicate concurrent runs per (storage_path, filename)
-#       - Forwards retry/backoff knobs to DataWrapper.process()
-#     """
-#
-#     _global_lock = threading.RLock()
-#     _active_runs: set[tuple[str, str]] = set()
-#     logger_extra = {"sibi_dst_component": __name__}
-#
-#     def __init__(self, **kwargs: Any):
-#         # Merge defaults from ManagedResource and caller kwargs
-#         self.all_kwargs: Dict[str, Any] = {**kwargs}
-#         super().__init__(**self.all_kwargs)
-#
-#         # Persist the minimal config we depend on frequently
-#         self._lock = threading.RLock()
-#
-#         # Required knobs
-#         self._storage_path: str = self.all_kwargs["parquet_storage_path"]
-#         self._parquet_filename: str = self.all_kwargs["parquet_filename"]
-#         self._data_wrapper_class = self.all_kwargs.get("data_wrapper_class")
-#
-#     # ---------- helpers ----------
-#     def _invalidate_cached(self, *names: str) -> None:
-#         for n in names:
-#             self.__dict__.pop(n, None)
-#
-#     def _build_manifest_path(self) -> str:
-#         base = f"{self._storage_path}".rstrip("/") + "/"
-#         return f"{base}_manifests/missing.parquet"
-#
-#     # ---------- lazy members ----------
-#     @cached_property
-#     def mmanifest(self) -> MissingManifestManager:
-#         self.logger.info("Initializing MissingManifestManager...", extra=self.logger_extra)
-#         manifest_path = self._build_manifest_path()
-#
-#         # ensure manifest directory exists
-#         manifest_dir = manifest_path.rsplit("/", 1)[0] if "/" in manifest_path else manifest_path
-#         self.ensure_directory_exists(manifest_dir)
-#
-#         mgr = MissingManifestManager(
-#             fs=self.fs,
-#             manifest_path=manifest_path,
-#             clear_existing=self.all_kwargs.get("overwrite", False),
-#             debug=self.debug,
-#             logger=self.logger,
-#             overwrite=self.all_kwargs.get("overwrite", False),
-#         )
-#
-#         if not mgr._safe_exists(mgr.manifest_path):
-#             self.logger.info(f"Creating new manifest at {mgr.manifest_path}", extra=self.logger_extra)
-#             mgr.save()
-#         else:
-#             self.logger.info(f"Manifest already exists at {mgr.manifest_path}", extra=self.logger_extra)
-#
-#         return mgr
-#
-#     @cached_property
-#     def update_planner(self) -> UpdatePlanner:
-#         self.logger.info("Initializing UpdatePlanner...", extra=self.logger_extra)
-#         skipped_files = self.mmanifest.load_existing() or []
-#
-#         cfg = {
-#             **self.all_kwargs,
-#             "fs": self.fs,
-#             "debug": self.debug,
-#             "logger": self.logger,
-#             "description": getattr(self._data_wrapper_class, "__name__", "DataWrapper"),
-#             "skipped": list(skipped_files),
-#             "mmanifest": self.mmanifest,
-#         }
-#         return UpdatePlanner(**cfg)
-#
-#     @cached_property
-#     def data_wrapper(self) -> DataWrapper:
-#         self.logger.info("Initializing DataWrapper...", extra=self.logger_extra)
-#
-#         # Ensure the planner has a plan
-#         if getattr(self.update_planner, "plan", None) is None:
-#             self.update_planner.generate_plan()
-#
-#         class_params = {
-#             "debug": self.debug,
-#             "logger": self.logger,
-#             "fs": self.fs,
-#             "verbose": self.verbose,
-#         }
-#
-#         cfg = {
-#             "data_path": self._storage_path,
-#             "parquet_filename": self._parquet_filename,
-#             "fs": self.fs,
-#             "debug": self.debug,
-#             "logger": self.logger,
-#             "verbose": self.verbose,
-#             "dataclass": self._data_wrapper_class,
-#             "class_params": class_params,
-#             "load_params": self.all_kwargs.get("load_params", {}) or {},
-#             "mmanifest": self.mmanifest,
-#             "update_planner": self.update_planner,
-#             "date_field": self.all_kwargs.get("date_field"),
-#             # pipeline execution knobs
-#             "show_progress": bool(self.all_kwargs.get("show_progress", False)),
-#             "timeout": float(self.all_kwargs.get("timeout", 30.0)),
-#             "max_threads": int(self.all_kwargs.get("max_threads", 3)),
-#         }
-#         return DataWrapper(**cfg)
-#
-#     # ---------- public API ----------
-#     def load(self, **kwargs: Any):
-#         """
-#         Direct load using the configured data_wrapper_class (no planner/manifest round-trip).
-#         Expected to return a Dask DataFrame from the loader.
-#         """
-#         self.logger.info(f"Loading data from {self._storage_path}")
-#
-#         if not self._data_wrapper_class:
-#             raise ValueError("data_wrapper_class is not configured.")
-#
-#         params = {
-#             "backend": "parquet",
-#             "fs": self.fs,
-#             "logger": self.logger,
-#             "debug": self.debug,
-#             "parquet_storage_path": self._storage_path,
-#             "parquet_filename": self._parquet_filename,
-#             "parquet_start_date": self.all_kwargs.get("parquet_start_date"),
-#             "parquet_end_date": self.all_kwargs.get("parquet_end_date"),
-#             **(self.all_kwargs.get("class_params") or {}),
-#         }
-#
-#         cls = self._data_wrapper_class
-#         with cls(**params) as instance:
-#             return instance.load(**kwargs)
-#
-#     def generate_parquet(self, **kwargs: Any) -> None:
-#         """
-#         Generate or update Parquet according to the plan.
-#         - Merges runtime kwargs
-#         - Invalidates dependent caches
-#         - Guards against duplicate concurrent runs
-#         - Forwards retry/backoff to DataWrapper.process()
-#         """
-#         # Merge and invalidate caches that depend on runtime changes
-#         self.all_kwargs.update(kwargs)
-#         self._invalidate_cached("update_planner", "data_wrapper")
-#         if "overwrite" in kwargs:
-#             self._invalidate_cached("mmanifest")
-#
-#         # Global de-dupe guard
-#         key = (self._storage_path, self._parquet_filename)
-#         with ParquetArtifact._global_lock:
-#             if key in ParquetArtifact._active_runs:
-#                 self.logger.info(
-#                     f"Run already in progress for {key}; skipping this invocation.", extra=self.logger_extra
-#                 )
-#                 return
-#             ParquetArtifact._active_runs.add(key)
-#
-#         try:
-#             self.ensure_directory_exists(self._storage_path)
-#
-#             self.update_planner.generate_plan()
-#             plan = getattr(self.update_planner, "plan", None)
-#             if plan is None or (hasattr(plan, "empty") and plan.empty):
-#                 # Planning uses Pandas; this is safe to check.
-#                 self.logger.info("No updates needed. Skipping Parquet generation.", extra=self.logger_extra)
-#                 return
-#
-#             # Print plan once per run
-#             if (
-#                 getattr(self.update_planner, "show_progress", False)
-#                 and not getattr(self.update_planner, "_printed_this_run", False)
-#             ):
-#                 self.update_planner.show_update_plan()
-#                 setattr(self.update_planner, "_printed_this_run", True)
-#
-#             # ---- forward retry/backoff knobs to DataWrapper.process() ----
-#             dw_retry_kwargs = {
-#                 k: self.all_kwargs[k]
-#                 for k in ("max_retries", "backoff_base", "backoff_jitter", "backoff_max")
-#                 if k in self.all_kwargs
-#             }
-#
-#             with self._lock:
-#                 dw = self.data_wrapper  # single cached_property access
-#                 if hasattr(dw, "process"):
-#                     dw.process(**dw_retry_kwargs)
-#                 if getattr(self.update_planner, "show_progress", False) and hasattr(
-#                     dw, "show_benchmark_summary"
-#                 ):
-#                     dw.show_benchmark_summary()
-#
-#         finally:
-#             with ParquetArtifact._global_lock:
-#                 ParquetArtifact._active_runs.discard(key)
-#
-#     def update_parquet(self, period: str = "today", **kwargs: Any) -> None:
-#         """
-#         High-level entry point to update Parquet for a given period:
-#           - 'today', 'yesterday', 'last_7_days', etc. via DateUtils.parse_period
-#           - 'ytd'
-#           - 'itd' (requires history_begins_on)
-#           - 'custom' (requires start_on / end_on)
-#         Also accepts retry/backoff knobs which flow to DataWrapper.process().
-#         """
-#         final_kwargs = {**self.all_kwargs, **kwargs}
-#
-#         def itd_config():
-#             start_date = final_kwargs.get("history_begins_on")
-#             if not start_date:
-#                 raise ValueError(
-#                     "For period 'itd', 'history_begins_on' must be configured."
-#                 )
-#             return {
-#                 "parquet_start_date": start_date,
-#                 "parquet_end_date": dt.date.today(),
-#             }
-#
-#         def ytd_config():
-#             return {
-#                 "parquet_start_date": dt.date(dt.date.today().year, 1, 1),
-#                 "parquet_end_date": dt.date.today(),
-#             }
-#
-#         def custom_config():
-#             """
-#             Prepare parameters for 'custom' period execution, ensuring `start_on` and `end_on`
-#             are provided (with backward compatibility for `start_date`/`end_date` aliases).
-#             """
-#             # Backward compatibility: normalize aliases
-#             alias_map = {
-#                 "start_on": ("start_date", "start"),
-#                 "end_on": ("end_date", "end"),
-#             }
-#             normalized_kwargs = dict(kwargs)  # shallow copy so we don't mutate original
-#             for target, aliases in alias_map.items():
-#                 if target not in normalized_kwargs:
-#                     for alias in aliases:
-#                         if alias in normalized_kwargs:
-#                             normalized_kwargs[target] = normalized_kwargs[alias]
-#                             break
-#
-#             # Validation
-#             missing = [k for k in ("start_on", "end_on") if k not in normalized_kwargs]
-#             if missing:
-#                 raise ValueError(
-#                     f"For period 'custom', the following required parameters are missing: {', '.join(missing)}"
-#                 )
-#
-#             return {
-#                 "parquet_start_date": normalized_kwargs["start_on"],
-#                 "parquet_end_date": normalized_kwargs["end_on"],
-#             }
-#
-#         if period == "itd":
-#             period_params = itd_config()
-#         elif period == "ytd":
-#             period_params = ytd_config()
-#         elif period == "custom":
-#             period_params = custom_config()
-#         else:
-#             start_date, end_date = DateUtils.parse_period(period=period)
-#             period_params = {
-#                 "parquet_start_date": start_date,
-#                 "parquet_end_date": end_date,
-#             }
-#
-#         final_kwargs.update(period_params)
-#         self.logger.debug(
-#             f"kwargs passed to update_parquet/generate_parquet: {final_kwargs}", extra=self.logger_extra
-#         )
-#
-#         # Delegate to generator (handles cache invalidation + forwarding knobs)
-#         self.generate_parquet(**final_kwargs)
-#
-#     # ---------- utils ----------
-#     def ensure_directory_exists(self, path: str) -> None:
-#         """Ensure the directory exists across fsspec backends."""
-#         with self._lock:
-#             if not self.fs.exists(path):
-#                 self.logger.info(f"Creating directory: {path}", extra=self.logger_extra)
-#                 try:
-#                     self.fs.makedirs(path, exist_ok=True)
-#                 except TypeError:
-#                     try:
-#                         self.fs.makedirs(path)
-#                     except FileExistsError:
-#                         pass
-#
-#     def _cleanup(self):
-#         """Clean up resources upon exit."""
-#         try:
-#             if "mmanifest" in self.__dict__ and getattr(
-#                 self.mmanifest, "_new_records", None
-#             ):
-#                 if self.mmanifest._new_records:
-#                     self.mmanifest.save()
-#             if "data_wrapper" in self.__dict__ and hasattr(self.data_wrapper, "close"):
-#                 self.data_wrapper.close()
-#         except Exception as e:
-#             self.logger.warning(f"Error during resource cleanup: {e}", extra=self.logger_extra)
sibi_dst/df_helper/_parquet_reader.py

@@ -39,7 +39,8 @@ class ParquetReader(DfHelper):
     :type fs: fsspec.AbstractFileSystem
     """
     DEFAULT_CONFIG: ClassVar[Dict[str, Any]] = {
-        'backend': 'parquet'
+        'backend': 'parquet',
+        'partition_on': ['partition_date']
     }
     df: Optional[Union[dd.DataFrame, pd.DataFrame]] = None

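With `partition_on` in the reader's default config, reads target a hive-partitioned dataset rather than a single Parquet file. A generic Dask sketch of the two read styles this enables (bucket paths and filter values are illustrative, not taken from the package):

```python
import dask.dataframe as dd

# Read one day-level directory directly via a glob pattern...
df = dd.read_parquet("s3://bucket/orders/partition_date=2025-09-10/*.parquet")

# ...or read the dataset root and let the engine prune partitions with filters.
df = dd.read_parquet(
    "s3://bucket/orders/",
    filters=[("partition_date", ">=", "2025-09-09"), ("partition_date", "<=", "2025-09-11")],
)
```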
sibi_dst/df_helper/backends/parquet/_parquet_options.py

@@ -30,6 +30,7 @@ class ParquetConfig(BaseModel):
     fs: Optional[fsspec.spec.AbstractFileSystem] = None
     logger: Optional[Logger] = None
     debug: bool = False
+    partition_on: Optional[list[str]] = None  # column name for partitioned datasets

     # ---- Derived / runtime fields (lazy) ----
     parquet_full_path: Optional[str] = None  # file or directory
@@ -80,7 +81,7 @@
         )

         # file vs dataset-at-root
-        if self.parquet_filename:
+        if self.parquet_filename and self.partition_on is None:
             self.parquet_full_path = self.ensure_file_extension(
                 posixpath.join(str(self.parquet_storage_path), str(self.parquet_filename)),
                 "parquet",
@@ -224,11 +225,32 @@

     # ------------------------- internals -------------------------

+
     def _resolve_paths_for_read(self) -> List[str]:
         """
         Builds a list of path patterns for dask.read_parquet.
+        Respects partition_on + start/end date if given.
         """
-
+        self.logger.debug(f"_resolve_paths_for_read: {self.partition_on}")
+        # Partitioned dataset by column
+        if self.partition_on and self.parquet_start_date and self.parquet_end_date:
+            if not isinstance(self.partition_on, (list, tuple)):
+                parts = [self.partition_on]
+            else:
+                parts = self.partition_on
+
+            start = dt.datetime.strptime(self.parquet_start_date, "%Y-%m-%d").date()
+            end = dt.datetime.strptime(self.parquet_end_date, "%Y-%m-%d").date()
+            days = pd.date_range(start=start, end=end, freq="D").date
+
+            base = self.parquet_storage_path.rstrip("/")
+            result= [
+                f"{base}/{parts[0]}={d.isoformat()}/*.parquet"
+                for d in days
+            ]
+            return result
+
+        # Date-ranged folders (non-partitioned, using FilePathGenerator)
         if self.parquet_folder_list:
             dirs = {self._dirname(p) for p in self.parquet_folder_list}
             return [d.rstrip("/") + "/*.parquet" for d in sorted(dirs)]
sibi_dst/utils/boilerplate/__init__.py

@@ -1,14 +1,16 @@
-from
+from __future__ import annotations
+from .base_attacher import make_attacher, AttachmentMaker
 from .base_data_cube import BaseDataCube
-from .
+from .base_parquet_artifact import BaseParquetArtifact
 from .base_parquet_reader import BaseParquetReader
-from .hybrid_data_loader import HybridDataLoader
 from .base_pipeline import BasePipeline
 from .base_pipeline_template import PipelineTemplate
+from .hybrid_data_loader import HybridDataLoader

 __all__ = [
     "BaseDataCube",
     "BaseParquetArtifact",
+    "AttachmentMaker",
     "make_attacher",
     "BaseParquetReader",
     "HybridDataLoader",
sibi_dst/utils/boilerplate/base_pipeline.py

@@ -91,35 +91,19 @@ class BasePipeline(ManagedResource):
             return

         df[self.date_field] = dd.to_datetime(df[self.date_field], errors="coerce")
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            saver = ParquetSaver(
-                df_result=df_day,
-                parquet_storage_path=path,
-                fs=self.fs,
-                debug=self.debug,
-                logger=self.logger,
-            )
-
-            tasks.append(
-                asyncio.get_running_loop().run_in_executor(
-                    executor, saver.save_to_parquet, self._get_output_filename()
-                )
-            )
-
-        await asyncio.gather(*tasks)
+        df["partition_date"] = df[self.date_field].dt.date.astype(str)
+
+        out_path = self.storage_path.rstrip("/")+"/"+self._get_output_filename(fmt="parquet")
+        self.logger.info("Saving dataset to %s", out_path)
+        ps = ParquetSaver(
+            df_result=df,
+            parquet_storage_path=out_path,
+            engine="pyarrow",
+            fs=self.fs,
+            partition_on=["partition_date"],
+            write_index=False,
+        )
+        ps.save_to_parquet()
         await self.emit("complete", message="All partitions written.")

     async def from_parquet(self, **kwargs) -> dd.DataFrame:
@@ -153,6 +137,7 @@ class BasePipeline(ManagedResource):
             self.logger.warning("No valid dates found for partitioning.")
             return

+        clk_conf['table'] = self.filename
         clk = ClickHouseWriter(**clk_conf)
         loop = asyncio.get_running_loop()
         tasks = []