duckrun-0.1.6.2-py3-none-any.whl → duckrun-0.1.6.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
duckrun/core.py CHANGED
@@ -506,6 +506,246 @@ class Duckrun:
          print('='*60)
          return True

+     def copy(self, local_folder: str, remote_folder: str,
+              file_extensions: Optional[List[str]] = None,
+              overwrite: bool = False) -> bool:
+         """
+         Copy files from a local folder to OneLake Files section.
+
+         Args:
+             local_folder: Path to local folder containing files to upload
+             remote_folder: Target subfolder path in OneLake Files (e.g., "reports/daily") - REQUIRED
+             file_extensions: Optional list of file extensions to filter (e.g., ['.csv', '.parquet'])
+             overwrite: Whether to overwrite existing files (default: False)
+
+         Returns:
+             True if all files uploaded successfully, False otherwise
+
+         Examples:
+             # Upload all files from local folder to a target folder
+             dr.copy("./local_data", "uploaded_data")
+
+             # Upload only CSV files to a specific subfolder
+             dr.copy("./reports", "daily_reports", ['.csv'])
+
+             # Upload with overwrite enabled
+             dr.copy("./backup", "backups", overwrite=True)
+         """
+         if not os.path.exists(local_folder):
+             print(f"❌ Local folder not found: {local_folder}")
+             return False
+
+         if not os.path.isdir(local_folder):
+             print(f"❌ Path is not a directory: {local_folder}")
+             return False
+
+         # Get Azure token
+         token = self._get_storage_token()
+         if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
+             print("Getting Azure token for file upload...")
+             from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
+             credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
+             token_obj = credential.get_token("https://storage.azure.com/.default")
+             token = token_obj.token
+             os.environ["AZURE_STORAGE_TOKEN"] = token
+
+         # Setup OneLake Files URL (not Tables)
+         files_base_url = f'abfss://{self.workspace}@onelake.dfs.fabric.microsoft.com/{self.lakehouse_name}.Lakehouse/Files/'
+         store = AzureStore.from_url(files_base_url, bearer_token=token)
+
+         # Collect files to upload
+         files_to_upload = []
+         for root, dirs, files in os.walk(local_folder):
+             for file in files:
+                 local_file_path = os.path.join(root, file)
+
+                 # Filter by extensions if specified
+                 if file_extensions:
+                     _, ext = os.path.splitext(file)
+                     if ext.lower() not in [e.lower() for e in file_extensions]:
+                         continue
+
+                 # Calculate relative path from local_folder
+                 rel_path = os.path.relpath(local_file_path, local_folder)
+
+                 # Build remote path in OneLake Files (remote_folder is now mandatory)
+                 remote_path = f"{remote_folder.strip('/')}/{rel_path}".replace("\\", "/")
+
+                 files_to_upload.append((local_file_path, remote_path))
+
+         if not files_to_upload:
+             print(f"No files found to upload in {local_folder}")
+             if file_extensions:
+                 print(f" (filtered by extensions: {file_extensions})")
+             return True
+
+         print(f"📁 Uploading {len(files_to_upload)} files from '{local_folder}' to OneLake Files...")
+         print(f" Target folder: {remote_folder}")
+
+         uploaded_count = 0
+         failed_count = 0
+
+         for local_path, remote_path in files_to_upload:
+             try:
+                 # Check if file exists (if not overwriting)
+                 if not overwrite:
+                     try:
+                         obs.head(store, remote_path)
+                         print(f" ⏭ Skipped (exists): {remote_path}")
+                         continue
+                     except Exception:
+                         # File doesn't exist, proceed with upload
+                         pass
+
+                 # Read local file
+                 with open(local_path, 'rb') as f:
+                     file_data = f.read()
+
+                 # Upload to OneLake Files
+                 obs.put(store, remote_path, file_data)
+
+                 file_size = len(file_data)
+                 size_mb = file_size / (1024 * 1024) if file_size > 1024*1024 else file_size / 1024
+                 size_unit = "MB" if file_size > 1024*1024 else "KB"
+
+                 print(f" ✓ Uploaded: {local_path} → {remote_path} ({size_mb:.1f} {size_unit})")
+                 uploaded_count += 1
+
+             except Exception as e:
+                 print(f" ❌ Failed: {local_path} → {remote_path} | Error: {str(e)[:100]}")
+                 failed_count += 1
+
+         print(f"\n{'='*60}")
+         if failed_count == 0:
+             print(f"✅ Successfully uploaded all {uploaded_count} files to OneLake Files")
+         else:
+             print(f"⚠ Uploaded {uploaded_count} files, {failed_count} failed")
+         print(f"{'='*60}")
+
+         return failed_count == 0
+
+     def download(self, remote_folder: str = "", local_folder: str = "./downloaded_files",
+                  file_extensions: Optional[List[str]] = None,
+                  overwrite: bool = False) -> bool:
+         """
+         Download files from OneLake Files section to a local folder.
+
+         Args:
+             remote_folder: Optional subfolder path in OneLake Files to download from
+             local_folder: Local folder path to download files to (default: "./downloaded_files")
+             file_extensions: Optional list of file extensions to filter (e.g., ['.csv', '.parquet'])
+             overwrite: Whether to overwrite existing local files (default: False)
+
+         Returns:
+             True if all files downloaded successfully, False otherwise
+
+         Examples:
+             # Download all files from OneLake Files root
+             dr.download()
+
+             # Download only CSV files from a specific subfolder
+             dr.download("daily_reports", "./reports", ['.csv'])
+         """
+         # Get Azure token
+         token = self._get_storage_token()
+         if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
+             print("Getting Azure token for file download...")
+             from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
+             credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
+             token_obj = credential.get_token("https://storage.azure.com/.default")
+             token = token_obj.token
+             os.environ["AZURE_STORAGE_TOKEN"] = token
+
+         # Setup OneLake Files URL (not Tables)
+         files_base_url = f'abfss://{self.workspace}@onelake.dfs.fabric.microsoft.com/{self.lakehouse_name}.Lakehouse/Files/'
+         store = AzureStore.from_url(files_base_url, bearer_token=token)
+
+         # Create local directory
+         os.makedirs(local_folder, exist_ok=True)
+
+         # List files in OneLake Files
+         print(f"📁 Discovering files in OneLake Files...")
+         if remote_folder:
+             print(f" Source folder: {remote_folder}")
+             prefix = f"{remote_folder.strip('/')}/"
+         else:
+             prefix = ""
+
+         try:
+             list_stream = obs.list(store, prefix=prefix)
+             files_to_download = []
+
+             for batch in list_stream:
+                 for obj in batch:
+                     remote_path = obj["path"]
+
+                     # Filter by extensions if specified
+                     if file_extensions:
+                         _, ext = os.path.splitext(remote_path)
+                         if ext.lower() not in [e.lower() for e in file_extensions]:
+                             continue
+
+                     # Calculate local path
+                     if remote_folder:
+                         rel_path = os.path.relpath(remote_path, remote_folder.strip('/'))
+                     else:
+                         rel_path = remote_path
+
+                     local_path = os.path.join(local_folder, rel_path).replace('/', os.sep)
+                     files_to_download.append((remote_path, local_path))
+
+             if not files_to_download:
+                 print(f"No files found to download")
+                 if file_extensions:
+                     print(f" (filtered by extensions: {file_extensions})")
+                 return True
+
+             print(f"📥 Downloading {len(files_to_download)} files to '{local_folder}'...")
+
+             downloaded_count = 0
+             failed_count = 0
+
+             for remote_path, local_path in files_to_download:
+                 try:
+                     # Check if local file exists (if not overwriting)
+                     if not overwrite and os.path.exists(local_path):
+                         print(f" ⏭ Skipped (exists): {local_path}")
+                         continue
+
+                     # Ensure local directory exists
+                     os.makedirs(os.path.dirname(local_path), exist_ok=True)
+
+                     # Download file
+                     data = obs.get(store, remote_path).bytes()
+
+                     # Write to local file
+                     with open(local_path, 'wb') as f:
+                         f.write(data)
+
+                     file_size = len(data)
+                     size_mb = file_size / (1024 * 1024) if file_size > 1024*1024 else file_size / 1024
+                     size_unit = "MB" if file_size > 1024*1024 else "KB"
+
+                     print(f" ✓ Downloaded: {remote_path} → {local_path} ({size_mb:.1f} {size_unit})")
+                     downloaded_count += 1
+
+                 except Exception as e:
+                     print(f" ❌ Failed: {remote_path} → {local_path} | Error: {str(e)[:100]}")
+                     failed_count += 1
+
+             print(f"\n{'='*60}")
+             if failed_count == 0:
+                 print(f"✅ Successfully downloaded all {downloaded_count} files from OneLake Files")
+             else:
+                 print(f"⚠ Downloaded {downloaded_count} files, {failed_count} failed")
+             print(f"{'='*60}")
+
+             return failed_count == 0
+
+         except Exception as e:
+             print(f"❌ Error listing files from OneLake: {e}")
+             return False
+
      def sql(self, query: str):
          """
          Execute raw SQL query with Spark-style write API.
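For orientation, here is a minimal usage sketch of the two methods added in this release, pieced together from the docstrings above and the README changes below; the workspace, lakehouse, and folder names are hypothetical placeholders.

```python
import duckrun

# Hypothetical workspace/lakehouse/schema path - substitute your own.
con = duckrun.connect("MyWorkspace/MyLakehouse.lakehouse/dbo")

# Upload every CSV under ./exports into the "exports" folder of OneLake Files.
# remote_folder is required; overwrite defaults to False, so existing files are skipped.
ok = con.copy("./exports", "exports", ['.csv'])

# Pull the same folder back down, replacing any stale local copies.
if ok:
    con.download("exports", "./exports_backup", ['.csv'], overwrite=True)
```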
duckrun-0.1.6.3.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: duckrun
- Version: 0.1.6.2
+ Version: 0.1.6.3
  Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
  Author: mim
  License: MIT
@@ -58,6 +58,10 @@ con.sql("SELECT * FROM my_table LIMIT 10").show()

  # Write to Delta tables (Spark-style API)
  con.sql("SELECT * FROM source").write.mode("overwrite").saveAsTable("target")
+
+ # Upload/download files to/from OneLake Files
+ con.copy("./local_folder", "target_folder")  # Upload files
+ con.download("target_folder", "./downloaded")  # Download files
  ```

  That's it! No `sql_folder` needed for data exploration.
@@ -127,7 +131,38 @@ con.sql("SELECT * FROM new_orders").write.mode("append").saveAsTable("orders")

  **Note:** `.format("delta")` is optional - Delta is the default format!

- ### 2. Pipeline Orchestration
+ ### 2. File Management (OneLake Files)
+
+ Upload and download files to/from OneLake Files section (not Delta tables):
+
+ ```python
+ con = duckrun.connect("workspace/lakehouse.lakehouse/dbo")
+
+ # Upload files to OneLake Files (remote_folder is required)
+ con.copy("./local_data", "uploaded_data")
+
+ # Upload only specific file types
+ con.copy("./reports", "daily_reports", ['.csv', '.parquet'])
+
+ # Upload with overwrite enabled (default is False for safety)
+ con.copy("./backup", "backups", overwrite=True)
+
+ # Download files from OneLake Files
+ con.download("uploaded_data", "./downloaded")
+
+ # Download only CSV files from a specific folder
+ con.download("daily_reports", "./reports", ['.csv'])
+ ```
+
+ **Key Features:**
+ - ✅ **Files go to OneLake Files section** (not Delta Tables)
+ - ✅ **`remote_folder` parameter is required** for uploads (prevents accidental uploads)
+ - ✅ **`overwrite=False` by default** (safer - prevents accidental overwrites)
+ - ✅ **File extension filtering** (e.g., only `.csv` or `.parquet` files)
+ - ✅ **Preserves folder structure** during upload/download
+ - ✅ **Progress reporting** with file sizes and upload status
+
+ ### 3. Pipeline Orchestration

  For production workflows with reusable SQL and Python tasks:

@@ -286,6 +321,63 @@ con = duckrun.connect(
  )
  ```

+ ## File Management API Reference
+
+ ### `copy(local_folder, remote_folder, file_extensions=None, overwrite=False)`
+
+ Upload files from a local folder to OneLake Files section.
+
+ **Parameters:**
+ - `local_folder` (str): Path to local folder containing files to upload
+ - `remote_folder` (str): **Required** target folder path in OneLake Files
+ - `file_extensions` (list, optional): Filter by file extensions (e.g., `['.csv', '.parquet']`)
+ - `overwrite` (bool, optional): Whether to overwrite existing files (default: False)
+
+ **Returns:** `True` if all files uploaded successfully, `False` otherwise
+
+ **Examples:**
+ ```python
+ # Upload all files to a target folder
+ con.copy("./data", "processed_data")
+
+ # Upload only CSV and Parquet files
+ con.copy("./reports", "monthly_reports", ['.csv', '.parquet'])
+
+ # Upload with overwrite enabled
+ con.copy("./backup", "daily_backup", overwrite=True)
+ ```
+
+ ### `download(remote_folder="", local_folder="./downloaded_files", file_extensions=None, overwrite=False)`
+
+ Download files from OneLake Files section to a local folder.
+
+ **Parameters:**
+ - `remote_folder` (str, optional): Source folder path in OneLake Files (default: root)
+ - `local_folder` (str, optional): Local destination folder (default: "./downloaded_files")
+ - `file_extensions` (list, optional): Filter by file extensions (e.g., `['.csv', '.json']`)
+ - `overwrite` (bool, optional): Whether to overwrite existing local files (default: False)
+
+ **Returns:** `True` if all files downloaded successfully, `False` otherwise
+
+ **Examples:**
+ ```python
+ # Download all files from OneLake Files root
+ con.download()
+
+ # Download from specific folder
+ con.download("processed_data", "./local_data")
+
+ # Download only JSON files
+ con.download("config", "./configs", ['.json'])
+ ```
+
+ **Important Notes:**
+ - Files are uploaded/downloaded to/from the **OneLake Files section**, not Delta Tables
+ - The `remote_folder` parameter is **required** for uploads to prevent accidental uploads
+ - Both methods default to `overwrite=False` for safety
+ - Folder structure is preserved during upload/download operations
+ - Progress is reported with file names, sizes, and upload/download status
+
  ## Complete Example

  ```python
@@ -294,7 +386,10 @@ import duckrun
  # Connect (specify schema for best performance)
  con = duckrun.connect("Analytics/Sales.lakehouse/dbo", sql_folder="./sql")

- # Pipeline with mixed tasks
+ # 1. Upload raw data files to OneLake Files
+ con.copy("./raw_data", "raw_uploads", ['.csv', '.json'])
+
+ # 2. Pipeline with mixed tasks
  pipeline = [
      # Download raw data (Python)
      ('fetch_api_data', ('https://api.example.com/sales', 'raw')),
@@ -309,20 +404,30 @@ pipeline = [
      ('sales_history', 'append')
  ]

- # Run
+ # Run pipeline
  success = con.run(pipeline)

- # Explore results
+ # 3. Explore results using DuckDB
  con.sql("SELECT * FROM regional_summary").show()

- # Export to new table
+ # 4. Export to new Delta table
  con.sql("""
      SELECT region, SUM(total) as grand_total
      FROM regional_summary
      GROUP BY region
  """).write.mode("overwrite").saveAsTable("region_totals")
+
+ # 5. Download processed files for external systems
+ con.download("processed_reports", "./exports", ['.csv'])
  ```

+ **This example demonstrates:**
+ - 📁 **File uploads** to OneLake Files section
+ - 🔄 **Pipeline orchestration** with SQL and Python tasks
+ - ⚡ **Fast data exploration** with DuckDB
+ - 💾 **Delta table creation** with Spark-style API
+ - 📤 **File downloads** from OneLake Files
+

  ## How It Works
  1. **Connection**: Duckrun connects to your Fabric lakehouse using OneLake and Azure authentication
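The "How It Works" note above mentions Azure authentication; the new `copy`/`download` code in core.py resolves a storage token through an azure-identity credential chain when one is not already cached. Below is a minimal standalone sketch of that pattern, using the same credential classes and scope URL as the diff above.

```python
from azure.identity import AzureCliCredential, ChainedTokenCredential, InteractiveBrowserCredential

# Try the Azure CLI login first; fall back to an interactive browser prompt.
credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())

# OneLake is reached through the Azure Storage endpoint, so request a storage-scoped token.
access = credential.get_token("https://storage.azure.com/.default")
bearer = access.token  # the bearer string passed to AzureStore.from_url(..., bearer_token=...)
```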
duckrun-0.1.6.3.dist-info/RECORD ADDED
@@ -0,0 +1,7 @@
+ duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
+ duckrun/core.py,sha256=CT2NH5hCLsv4uB5zH3VxTuCVQy0nWkPBG-cICLPhG_8,34245
+ duckrun-0.1.6.3.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
+ duckrun-0.1.6.3.dist-info/METADATA,sha256=ny5DcRSU1B4SdHdJqHCYk0-hNo9-zqFABqMY9ulAVNk,13595
+ duckrun-0.1.6.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ duckrun-0.1.6.3.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
+ duckrun-0.1.6.3.dist-info/RECORD,,
duckrun-0.1.6.2.dist-info/RECORD DELETED
@@ -1,7 +0,0 @@
- duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
- duckrun/core.py,sha256=_18GjaaT_CqhtivyDQuLIQx5UUuUIZNBMK9nBQgavXc,23180
- duckrun-0.1.6.2.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
- duckrun-0.1.6.2.dist-info/METADATA,sha256=dYy1d8V2yq2JwqkLXwJC8iBLMP6UbbFm9ZGHsBJLGuY,9497
- duckrun-0.1.6.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- duckrun-0.1.6.2.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
- duckrun-0.1.6.2.dist-info/RECORD,,